This backport contains 309 patches from the upstream GCC tree. The commit ID of each patch is listed below in chronological order.

0001-re-PR-target-89261-ix86_data_alignment-has-wrong-arg.patch f8b906a2de3044f1dea753b182c244a1a560d40e 0002-Fix-Wenum-compare-switch-warning-in-i386.c.patch 791536baadc9f469ec8eef2d7213c6f6091c5fa9 0003-Prefer-to-use-strlen-call-instead-of-inline-expansio.patch 786e0e5239529de9a4254fe8411a0e8f843e721a 0004-Enhance-target-and-target_clone-error-messages.patch cc2a672a60ff7476b3e4751ba41cb77c7fc85b09 0005-re-PR-middle-end-88963-gcc-generates-terrible-code-f.patch a7eb97ad269b6509bd7b31ca373daea98e4d7e85 0006-Split-i386.c.patch 2bf6d93547e516b6b2b2051c0fb1b47ea4acc8a4 0007-Split-part-of-functionality-from-lto.c-to-lto-common.patch a79420f995764129dc40d1abcbf8ce75a0b0f906 0008-Error-only-when-a-non-default-mabi-is-used-with-sani.patch 080629d32eca5ea202479022f0bd429a813be7c4 0009-This-patch-adds-support-to-vectorize-sum-of-abslolut.patch a9fad8fe6c84de272f2a56d462e67d53c9f4a73d 0010-cfgexpand.c-asm_clobber_reg_is_valid-Reject-clobbers.patch 0a59215131c02dee4c8829f93d1ee678647614da 0011-re-PR-tree-optimization-90395-ICE-verify_flow_info-f.patch 362e280d10c61bec13c1d02c11a1c4ac0846db7e 0012-re-PR-c-59813-tail-call-elimination-didn-t-fire-for-.patch b5b9147d35ee509714c34d813c7723bf18bb7b7a 0013-Accept-code-attributes-as-rtx-codes-in-.md-files.patch 75df257b38bd4cdcb750fc893c5023363230cfe8 0014-x86-fix-pr82920.patch 0f8768f73440b040707deafd254d189c2887d00d 0015-2019-05-14-Przemyslaw-Wirkus-przemyslaw.wirkus-arm.c.patch a52cf5cf278e4a9e58bfa2bb67a93244766a122f 0016-re-PR-tree-optimization-88828-Inefficient-update-of-.patch 962372f9f853c582c879f11c0db14973cc8687e0 0017-re-PR-tree-optimization-88828-Inefficient-update-of-.patch 595ffc073bf5b1753e3a18dfa704391ad5fad626 0018-gcc-move-assemble_start_function-assemble_end_functi.patch f7430263c07b4a1bcf3deb708c8c691f233fcb40 0019-trans.c-check_inlining_for_nested_subprog-Quote-rese.patch a9c697b88395a0f2b175ac30c59bd8c0c22d0db1 0020-gcc-aarch64-move-assemble_start_function-assemble_en.patch 6b5777c6c7059b6b8e372e567a74bdccb59a02c3 0021-gimple-match-head.c-Include-vec-perm-indices.h.patch ebd733a78ccf5792067e94852c6c81a5f9aa0020 0022-i386-Fold-__builtin_ia32_shufpd-to-VEC_PERM_EXPR.patch 4d508751f421491052bc1d83150344e6cba30b3b 0023-aarch64-Introduce-flags-for-SVE2.patch 28108a5341653568e9ebc49ea755ff93cc1e1711 0024-aarch64-Change-two-function-declaration-types.patch 1ec77eedd529f81b1dc99cda9818f1ef9e952b96 0025-PATCH-3-3-GCC-AARCH64-Add-support-for-pointer-authen.patch 8fc16d725206f2c40bae423d7d0d93bd1baf6da2 0026-This-patch-implements-the-u-avgM3_floor-and-u-avgM3_.patch 0617e23c9531373d3b232152c0d81a2c707858d9 0027-tree-ssa-alias-access-spath-1.c-new-testcase.patch 987c9fc581ffb04d5ab7a782bb7aee6205c45663 0028-PATCH-GCC-AARCH64-Fix-libstdc-build-failure-after-r2.patch 0e2e15abd0765c1866f36f0312f77c9595e7fdec 0029-aarch64-add-support-for-fabd-in-sve.patch 3db85990dbde7f9c8212fe0fb8a241c5d2993198 0030-New-.md-construct-define_insn_and_rewrite.patch f4fde1b378ad68fb2dec6719ed26c1b901488e03 0031-re-PR-target-88837-SVE-Poor-vector-construction-code.patch 3a0afad0d212b3ff213b393728e018caf2daa526 0032-AArch64-Emit-TARGET_DOTPROD-specific-sequence-for-us.patch 72215009a9f9827397a4eb74e9341b2b7dc658df 0033-AARCH64-ILP32-Fix-aarch64_asan_shadow_offset.patch 10078f3e1d0cbebc5e6f7f4821d3ad41421ef1e0 0034-Make-SRA-re-construct-orginal-memory-accesses-when-e.patch 3b47da42de621c6c3bf7d2f9245df989aa7eb5a1 
0035-Fix-fwprop-call-to-call-to-paradoxical_subreg_p.patch 6c202d9dc65833e04e35f566c645fde8278c1a24 0036-init_1.c-Remove-options-O2-fno-schedule-insns-and-in.patch 3a9debbd7660bafbd7658c9e843eddbac8980188 0037-iterators.md-ADDSUB-Fix-typo-in-comment.patch dd550c996578ea7e94f3a59e57f24636186fbb95 0038-re-PR-target-88834-SVE-Poor-addressing-mode-choices-.patch fa9863e7d34ecd011ae75083be2ae124e5831b64 0039-Darwin-The-need-for-picsym-stubs-is-dependent-on-lin.patch ce3a201593d0ed5b606360c064778de34b5b04ef 0040-netbsd-aarch64-add-netbsd-aarch64-target.patch f32f75858a14e7b304df7a71dae15d75081b0deb 0041-Vectorizer-Support-masking-fold-left-reductions.patch bce29d65ebe1316d15ec7582a1d257ef1be163f7 0042-Darwin-The-need-for-FDE-symbols-is-dependent-on-link.patch dbe89f49da468fbd42a27bdb7b8f06de76a871b4 0043-AArch64-Simplify-SVE-IFN_COND-patterns.patch 32cf949cec180799d3fb14d405772ea35b5aafd3 0044-AArch64-Factor-out-ptrue-predicate-creation.patch 16de3637c4df37e0203b3ad52b238887e6ca38fc 0045-AArch64-Factor-out-pfalse-predicate-creation.patch e7053b0c7cf3f1cd8a23cc71e7e36ec29c46b217 0046-AArch64-Tabify-aarch64-sve.md.patch ea403d8bb5129632aac4d2f270566d2d0073a8ae 0047-AArch64-Add-a-new-CC-mode-for-SVE-conditions.patch 57d6f4d04d438522dc03488ca31f71b4b7b904c8 0048-aarch64-Refactor-common-errata-work-around-specs.patch 91bed1a15a6dfb891b9658532b49f9488b5537f4 0049-objective-c-c-testsuite-Fix-stubify-tests-for-fnext-.patch b7a0332ccd21c04a37535c97f04abc4bc28fb321 0050-builtins.c-get_memory_rtx-Fix-comment.patch 76715c3216cf6ccd071fc852920af55d6b0054ae 0051-Use-alternative_mask-for-add_insn_allocno_copies.patch 73bb8fe9e915cf3219f16afdc61c308c08aa7659 0052-Simplify-ira_setup_alts.patch 06a65e803ed06f3ad1fd8e5f90db03aa0a7e5414 0053-Make-ira_get_dup_out_num-handle-more-cases.patch ed680e2cc18c73f90e6bfbd3f346a8820476371b 0054-Allow-earlyclobbers-in-ira_get_dup_out_num.patch ae5569fa33c9f3286e0b747f8b6607d21a4b9827 0055-Use-ira_setup_alts-for-conflict-detection.patch 6de20b9d7a1af863fb51b4a783c153ea0092810a 0056-aarch64-force-frame-pointer-setup-before-tlsdesc-cal.patch 0e510d1824241953c67b38f7a894de7238c23c61 0057-AArch64-Remove-constraint-strings-from-define_expand.patch 1bbffb87a9ecc3e27a4074145e55e3315df57b7d 0058-re-PR-target-88833-SVE-Redundant-moves-for-WHILELO-b.patch 75da268e1a563a1a52389cd2ecee12d07c45a655 0059-PATCH-GCC-AARCH64-PR-target-90712-Fix-gcc.dg-rtl-aar.patch 2bdc7dcbbd2eee4f114c09443933cc37a546dbff 0060-aarch64-redefine-aes-patterns.patch 5169fa77322e36dd4783bc5126185159c35a3584 0061-simplify-rtx.c-simplify_unary_operation_1-Use-GET_MO.patch 4faba5c3bc37c0bfceec6b254d76c5d0b3e2fe8b 0062-Support-multiple-operand-counts-for-.md-patterns.patch d281492de84960b5885f88fffeeb226650f5141d 0063-arch64-Fix-ambiguous-.md-attribute-uses.patch e7ba492a04d0bfef9752cbb16fcce3ffc31bf99f 0064-Relax-vector_builder-elt-sanity-check.patch 72ab1c51b607dd5446ee24ff9fce9178d6b811cb 0065-re-PR-target-90723-pr88598-2.c-segfaults-with-msve-v.patch f2b29269c407f10718bc935b3dd5c7e8641b6847 0066-AArch64-Rename-bitperm-to-sve2-bitperm.patch c10abf530e52972ef708f6e72cf20dd920cd22a2 0067-aarch64-add-usra-and-ssra-combine-patterns.patch 462e6f9a932a44ca73715dc5c2960e5b332f63f7 0068-config-i386-x86-tune.def-X86_TUNE_AVOID_256FMA_CHAIN.patch ef893a2a769b18c61953d80670b1db8c27bc44e0 0069-i386-options.c-ix86_option_override_internal-Default.patch 105c2795b0d63b2cc5cb224ba066fa8b9a0ad0ff 0070-Come-up-with-function_decl_type-and-use-it-in-tree_f.patch cb50701ec2c7abdc48db278802022f7e94675d07 
0071-cif-code.def-NEVER_CALL-New-code.patch 5ab2422adf894bdf84deed8c7c0557c16d6dca2b 0072-AArch64-Make-processing-less-fragile-in-config.gcc.patch 3644cadf6a9d5a5cd8e83b0123316cf184fa4e3e 0073-Implement-more-rtx-vector-folds-on-variable-length-v.patch 4ce6ab6889446984fd7017e2150962eb4550a7ee 0074-Generalise-VEC_DUPLICATE-folding-for-variable-length.patch 708cc6132bb374e2c5bd1c4f43f9fe7306d20970 0075-Add-dg-test-for-matching-function-bodies.patch 4d706ff86ea86868615558e92407674a4f4b4af9 0076-Prevent-Og-from-deleting-stores-to-write-only-variab.patch ec8ac265ff21fb379ac072848561a91e4990c47f 0077-Don-t-run-DSE-at-Og.patch c0fe6bce2a8c35e997f45b0a674ab2058ba50ae0 0078-Prevent-tree-ssa-dce.c-from-deleting-stores-at-Og.patch f33b9c40b97f6f8a72ee370068ad81e33d71434e 0079-re-PR-target-91150-wrong-code-with-O-mavx512vbmi-due.patch fa2987ed8db073b9d59688363e2dfb6c60f47d70 0080-Handle-IFN_COND_MUL-in-tree-ssa-math-opts.c.patch c1b3d827832f883e0634b18c88eb2bbde335aa42 0081-Make-lra-use-per-alternative-earlyclobber-info.patch a25f3e8efbbc7182fa58c445574848a73856e9b4 0082-GCC-AArch64-Enable-Transactional-Memory-Extension.patch 89626179b6fe42cbd58c715808f7c6401879757f 0083-Add-a-gimple_move_vops-helper-function.patch 779724a5913b4e6a7ccccc0b8b415a772144a067 0084-Make-function_code-a-32-bit-field.patch 55f863c4d694deafb968dbf44d08ba49bb7c0766 0085-AArch64-Remove-unused-commutative-attribute.patch 871b49afafe043d57f717e70532d66c5a56ca173 0086-AArch64-Reorganise-aarch64-sve.md.patch 915d28fe74dbb30352702ab07ea5bf30747043bb 0087-AArch64-Make-SVE-UNSPEC_COND_-s-match-the-insn-mnemo.patch cb18e86dd005fe009c536a8bb0aec7aa88ca66df 0088-AArch64-Remove-redundant-SVE-FADDA-pattern.patch 8ad84de26e1032d80225905c611a47b64a385e8a 0089-AArch64-Merge-SVE-FP-unary-patterns.patch d45b20a5539b6f306a559470c3a7e9f84a058bfb 0090-AArch64-Merge-SVE-FMAXNM-FMINNM-patterns.patch 214c42faa06a9eb1aa7f0296399f28df4fb068ec 0091-AArch64-Merge-SVE-ternary-FP-operations.patch 0d80d083a2e1d368fcb11eb7ea5490c274f0ea15 0092-AArch64-Merge-SVE-reduction-patterns.patch b0760a40bef3ca690691bf5d214da95b5dc25266 0093-AArch64-Prefer-FPRs-over-GPRs-for-CLASTB.patch 801790b37ca817089ecbae214340162e6d94ea6a 0094-AArch64-Prefer-FPRs-over-GPRs-for-INSR.patch 61ee25b9e7d84fbb18218887d1fecfb10f72993a 0095-AArch64-Fix-INSR-for-zero-floats.patch 9b6fb97c99abe64147f82a3ea6e6ed598e387482 0096-C-Fix-bogus-nested-enum-error-message.patch 99769e7fb6ed153a53174b7f08415eee347655f0 0097-AArch64-Make-perm_insn-the-complete-mnemonic.patch 3e2751ce5591dc8f3b5f4ffd3dacf0fb8f789395 0098-AArch64-Add-a-y-constraint-for-V0-V7.patch 163b1f6ab2950553e1cc1b39a6b49293b3390e46 0099-AArch64-Make-aarch64_classify_vector_mode-use-a-swit.patch 806f69cd68c18399e8e54b1a0913ae57beabbe69 0100-AArch64-Make-simd_immediate_info-INDEX-explicit.patch 1da83ccee8e7b61e7777abb63eb0e5a0ff1f1e93 0101-AArch64-Use-simd_immediate_info-for-SVE-predicate-co.patch 1044fa32e2b456b59b3cdc31b4f261145f1589cc 0102-AArch64-Increase-default-function-alignment.patch 4e55aefa3ee19167a41892e4920a3e8c520aee42 0103-AArch64-Improve-SVE-constant-moves.patch 4aeb1ba7f62c1d680c819ae3e137c3bad6f520ca 0104-Darwin-There-is-no-need-to-distinguish-PIC-non-PIC-s.patch d308419c64c52c2d48bdf53a65e1790a2c897e83 0105-Optimise-constant-IFN_WHILE_ULTs.patch 0b1fe8cf6f1dde656c505dde6d27279dff388962 0106-Protect-some-checks-of-DECL_FUNCTION_CODE.patch cb1180d547e3b28547134a06ee020163afa59cc3 0107-Use-checking-forms-of-DECL_FUNCTION_CODE-PR-91421.patch 4d732405bd91b54c196fdc38191f838bb01f23a6 
0108-AArch64-Rework-SVE-PTEST-patterns.patch 34467289631e29545e14148515ab5f5d0d9e4fa7 0109-AArch64-Canonicalise-SVE-predicate-constants.patch 678faefcab01f9e9eeb222852675b5a042aaf900 0110-AArch64-Don-t-rely-on-REG_EQUAL-notes-to-combine-SVE.patch 35d6c5913d2209eb50f48b589b29f0dce13cb9b7 0111-AArch64-Use-unspecs-for-remaining-SVE-FP-binary-ops.patch 6fe679cc6be7a55832f9b88a8cf0751e8d5eff6e 0112-AArch64-Add-a-GP-strictness-operand-to-SVE-FP-unspec.patch c9c5a8090c58b84c1eb45e39e77eee223f992009 0113-AArch64-Commonise-some-SVE-FP-patterns.patch 0254ed7970e64abd82f21aedf9373720a73671c7 0114-AArch64-Add-support-for-SVE-HF-vconds.patch a70965b114281553fa46cac9b8abab543f36793f 0115-AArch64-Rework-SVE-FP-comparisons.patch 4a942af61c16f38f7fe51ed72a7ac23f73f62f2a 0116-AArch64-Use-unspecs-for-SVE-conversions-involving-fl.patch 99361551624427aebe7a856a4327e083aa33733a 0117-AArch64-Rearrange-SVE-conversion-patterns.patch 95eb5537d8bb23b952105b46250ed4fba8766b84 0118-AArch64-Use-x-predication-for-SVE-integer-arithmetic.patch 063082768aab23d26e42954eb115b76318f0176d 0119-AArch64-Rework-SVE-integer-comparisons.patch 00fa90d975bfacfd91a615fbee24e3e6a100100f 0120-AArch64-Handle-more-SVE-predicate-constants.patch 2803bc3bbca332f53801770715a5b592b2467492 0121-AArch64-Use-SVE-ADR-to-optimise-shift-add-sequences.patch a229966c9c76afe0cf18c566a3c13ddde3878288 0122-AArch64-Add-support-for-SVE-CLS-and-CLZ.patch bca5a9971f47cf5fe79e6595beb762539f200f46 0123-AArch64-Add-support-for-SVE-CNOT.patch e0a0be93d7c2b760779c3085c5abfd0496e3458b 0124-AArch64-Add-support-for-SVE-SU-MAX-MIN-immediate.patch f8c22a8bbaf3ef4260f7d8beea22ed151ca4b726 0125-AArch64-Add-support-for-SVE-F-MAX-MIN-NM-immediate.patch 75079ddf9cb867576bbef66f3e8370d9fdeea3b8 0126-AArch64-Make-more-use-of-SVE-conditional-constant-mo.patch d29f7dd50de9e8e46f7e247c53f3b0405a3dadd9 0127-AArch64-Use-SVE-MOV-M-of-scalars.patch 88a37c4d72899c5a3f5a7b2bca0ae0096f3270a3 0128-AArch64-Add-support-for-SVE-absolute-comparisons.patch 42b4e87d317377d6dcbb25ee2523da4a0c42478a 0129-AArch64-Add-SVE-conditional-integer-unary-patterns.patch 3c9f496337f754f7c22afb46b017871db5844a97 0130-AArch64-Add-SVE-conditional-floating-point-unary-pat.patch b21f7d53095b253753c5622f99809e9c82fd3009 0131-AArch64-Add-SVE-conditional-conversion-patterns.patch c5e16983cd1bd6dd6eca1b939c3c8859f0c6c866 0132-AArch64-Use-SVE-UXT-BHW-as-a-form-of-predicated-AND.patch d113ece60450b2efb07e9057b6d2732b08fee2c4 0133-AArch64-Use-SVE-BIC-for-conditional-arithmetic.patch 1b187f36ec16d43d0227805955d8fae51af26970 0134-Add-support-for-conditional-shifts.patch 20103c0ea9336d2b5286eb7f2605ace3fd49a431 0135-AArch64-Use-SVE-SU-ABD-in-conditional-arithmetic.patch 9730c5ccd522cd955bcb6e65295023621cade8b6 0136-AArch64-Use-SVE-FABD-in-conditional-arithmetic.patch bf30864e4c241e50585745af504b09db55f7f08b 0137-AArch64-Use-SVE-binary-immediate-instructions-for-co.patch a19ba9e1b15d248e5a13ee773f4acd4ae29fdeaa 0138-AArch64-Use-SVE-MLA-MLS-MAD-and-MSB-for-conditional-.patch b6c3aea1892c148c21f8b87668f344b2397f4aa5 0139-AArch64-Add-a-commutativity-marker-to-the-SVE-SU-ABD.patch 9a8d9b3f2422d4885e5c846dee66acf6336e6ccf 0140-aarch64-Use-neoversen1-tuning-struct-for-mcpu-cortex.patch 42418c1f7f5cb3b2f466f88053acc818ddc5cd4d 0141-AArch64-Use-SVE-reversed-shifts-in-preference-to-MOV.patch 7d1f24018b04c13134bc47619fb8aaa390b01754 0142-AArch64-Add-more-unpredicated-MOVPRFX-alternatives.patch 5e176a613ef2eda92aa65736763a562dc42a50fe 0143-AArch64-Remove-unneeded-FSUB-alternatives-and-add-a-.patch 
2ae21bd133c357fcd7b6e06dc7d7d9e0660abe2c 0144-AArch64-Add-MOVPRFX-alternatives-for-SVE-EXT-pattern.patch 06b3ba23eb6ff965a92cd99d2835d4c29316a447 0145-AArch64-Add-more-SVE-FMLA-and-FMAD-z-alternatives.patch 432b29c189a6d26ed701c7518402708b2fcb794f 0146-AArch64-Rework-SVE-REV-BHW-patterns.patch d7a09c445a475a95559e8b9f29eb06ad92effa91 0147-AArch64-Rework-SVE-INC-DEC-handling.patch 0fdc30bcf56d7b46122d7e67d61b56c0a198f3b3 0148-AArch64-Optimise-aarch64_add_offset-for-SVE-VL-const.patch 7d8bdfa7e409821c50f6d8a7b557bd7dc760c4ce 0149-AArch64-Pass-a-pattern-to-aarch64_output_sve_cnt_imm.patch 139df05a29eb71075e42f502978dea4d00a99708 0150-AArch64-Tweak-operand-choice-for-SVE-predicate-AND.patch 2d2388f82f2e7f2fd1da063192ba98be45f099d2 0151-AArch64-Fix-predicate-alignment-for-fixed-length-SVE.patch 07108a9ebe4776610bb23f684b3a346d28511bed 0152-AArch64-Add-a-aarch64_sve_mode_p-query.patch 5c38705dbde776f68bf1f99a71657d0e21b772a5 0153-Remove-TARGET_SETUP_INCOMING_VARARG_BOUNDS.patch 06b5889c434b941804d5592cd4fc8946b25c1c4b 0154-As-discussed-below.patch 1f2a3ac34620ab4669f9f32417a7a4496c8f603a 0155-AArch64-Use-scvtf-fbits-option-where-appropriate.patch 188d00796f5bd338b9b8ab1cc8ba4b43af8ab8fd 0156-Add-pass_va_arg_by_reference.patch fde65a89fad742c2dca8ad50452e482d22f3c1b2 0157-Add-must_pass_va_arg_in_stack.patch 4f53599cb5b822cd7f95997861c2e064977ecb6a 0158-Use-function_arg_info-for-TARGET_ARG_PARTIAL_BYTES.patch a7c81bc1fb43366ca1b4332d8a6042b648a84cdc 0159-Use-function_arg_info-for-TARGET_PASS_BY_REFERENCE.patch 52090e4dbd064f486af606e3f8a283dbddc7c18a 0160-Use-function_arg_info-for-TARGET_SETUP_INCOMING_ARGS.patch e7056ca417326a70eca05defb6a8b20b737d3417 0161-Use-function_arg_info-for-TARGET_FUNCTION_-INCOMING_.patch 6783fdb7057d559aa1da8afa2c15a702c532a03e 0162-Use-function_arg_info-for-TARGET_FUNCTION_ARG_ADVANC.patch 6930c98c69ad695469ee7daa74b3b6d578afdd0d 0163-Use-function_arg_info-for-TARGET_CALLEE_COPIES.patch 7256c7194e186fce6ff866a124a77b08196c2a5f 0164-Use-function_arg_info-for-TARGET_MUST_PASS_IN_STACK.patch 0ffef2005fd7536efbc9c3a572701998c8a8080c 0165-Add-a-apply_pass_by_reference_rules-helper.patch b12cdd6e8e8dd1f39a941b731ba1056d656a094f 0166-re-PR-target-88839-SVE-Poor-implementation-of-blend-.patch 9556ef20164e69d094f5a3e1af262dbb45ed8e3a 0167-aarch64-sve.md-vcond_mask-Add.patch b1c9ec725da365165ce4c2fdf63daa33b7d86649 0168-aarch64-add-intrinsics-for-vld1-q-_x4-and-vst1-q-_x4.patch 391625888d4d97f9016ab9ac04acc55d81f0c26f 0169-arm-aarch64-Add-comments-warning-that-stack-protecto.patch a7e73b4158f528600ef97aca29201ddc92b3439f 0170-AArch64-Add-Linux-hwcap-strings-for-some-extensions.patch 75f935365dba3eb5e9cbd11bc0d75009cad3d019 0171-AArch64-Add-support-for-missing-CPUs.patch e0664b7a63ed8305e9f8539309df7fb3eb13babe 0172-AArch64-Implement-ACLE-intrinsics-for-FRINT-32-64-Z-.patch 10bd1d964ef12daa9f92ff0b8d1e5f600aa63f7b 0173-AArch64-Add-support-for-__jcvt-intrinsic.patch e1d5d19ec4f84b67ac693fef5b2add7dc9cf056d 0174-Remove-bt-load.c.patch f78f73cbd284abe4f1718fd7803f5f98800de225 0175-Simplify-the-implementation-of-HARD_REG_SET.patch 504279ae0a0ce28ad37f820dcdb7f6557aabef7c 0176-Make-note_stores-take-an-rtx_insn.patch e8448ba5300e32917fb12f877ae40711c2b452a3 0177-Remove-COPY_HARD_REG_SET.patch 6576d245386e2ce52df274ef8f2ffed81cfaa1c3 0178-Remove-COMPL_HARD_REG_SET.patch 50b3f54d551787e0a066451ef60ef3b055a893e6 0179-Remove-AND_HARD_REG_SET.patch dc333d8ff60909dbed89126443e3024f1592f8a4 0180-Remove-IOR_HARD_REG_SET.patch 44942965f4eae141bd1f8300e7f77d0c9a3936e4 
0181-Remove-AND_COMPL_HARD_REG_SET.patch d15e5131845e2a68513230a624839ef5abcda690 0182-Remove-IOR_COMPL_HARD_REG_SET.patch 4897c5aaa7a5db4c1ece28ef66acb3d5e41787b3 0183-Remove-hard_reg_set_equal_p.patch a85796511b2b7985f79331c996761f7a87cb8116 0184-Tweak-interface-to-ira-build.c-ior_hard_reg_conflict.patch 75f4e3a1b322e16a1aca28bd0ced9af57cb0a683 0185-Add-fast-conversions-from-arrays-to-bitmaps.patch 148909bc700e4f52aa582346a29abc5bc51a9bda 0186-Remove-global-REG_SETs.patch 0b0310e9a0e0d553bbe9f961c52e0851328aa8b0 0187-Remove-call_fixed_reg_set.patch df1f0eef67939274e9ddd3df426e8dfc5184086b 0188-Remove-no_caller_save_reg_set.patch 026116ce2a4dedad81518b0ca89dd8243b545778 0189-Replace-call_used_reg_set-with-call_used_or_fixed_re.patch a5647ae846f6765f12a359acba6a71fc12254fa8 0190-Add-call_used_or_fixed_reg_p.patch a365fa0636886aeda83e57b84d837cfba13597fe 0191-Hide-call_used_regs-in-target-independent-code.patch 53bee79caba4fb88acbcd9bad7891ea45b5511e3 0192-Remove-call_really_used_regs.patch d7fb4c3162307590c0babddcea4fb60c07a7c033 0193-Vectorise-multiply-high-with-scaling-operations-PR-8.patch 58cc98767aa1d8136d36467b892dc4adaf427acc 0194-arm-aarch64-Make-no_insn-issue-to-nothing.patch f62281dc1b3d751977266d8c30b4488833fcb9dd 0195-Two-more-POLY_INT-cases-for-dwarf2out.c.patch ef20d2215067b1bfa8b3f9549ca0baed636a94a0 0196-Handle-variable-length-vectors-in-compute_record_mod.patch defc6f266c1dd625cc64ad1ecfbd1eacbcd66e4f 0197-Don-t-treat-variable-length-vectors-as-VLAs-during-g.patch 22b6299199da4efd3944cdaabca1d095d19ff901 0198-Make-get_value_for_expr-check-for-INTEGER_CSTs.patch 01b57ebf58b8cc0d16db827d1d9aa5f10da23cce 0199-aarch64-Extend-R-for-integer-registers.patch e3f15286d1129de2cceee6acd5d5584cb5422db6 0200-aarch64-Implement-TImode-compare-and-swap.patch 4a2095ebace8534038ce2adf4ae94bfc854066c4 0201-aarch64-Tidy-aarch64_split_compare_and_swap.patch b7e560deb37e38fb224a0cf108e15df4a717167a 0202-aarch64-Implement-moutline-atomics.patch 3950b229a5ed6710f30241c2ddc3c74909bf4740 0203-Rework-constant-subreg-folds-and-handle-more-variabl.patch f24f4c15884bf1ee65a10e2f959842eec4198876 0204-Extend-neg_const_int-simplifications-to-other-const-.patch 681fc0fa40cc4f018cb691d796aa819a24257774 0205-Avoid-adding-impossible-copies-in-ira-conflicts.c-pr.patch 9f635bd13fe9e85872e441b6f3618947f989909a 0206-AArch64-Fix-memmodel-index-in-aarch64_store_exclusiv.patch 3a30d2558b3a199fe346479e6140cddae7fba5ed 0207-AArch64-Use-implementation-namespace-consistently-in.patch 9a3afc3564b36fb34826899a345a9c35b1c53e39 0208-C-C-Allow-targets-to-check-calls-to-BUILT_IN_MD-func.patch c6447c2014b76b5c077a07712a7f0b0aaa2e14d4 0209-AArch64-Split-built-in-function-codes-into-major-and.patch 6d4d616a782d5be693ea9575f69d5ebf450be090 0210-AArch64-Strengthen-aarch64_hard_regno_call_part_clob.patch 51051f474a768d285714d713f1b7535d6a139350 0211-Add-function_abi.-h-cc.patch bd785b44932274f7067105de417938597289962c 0212-Add-a-target-hook-for-getting-an-ABI-from-a-function.patch 002ffd3caa684c3eb30f8f53206439b7aa34b370 0213-Add-a-function-for-getting-the-ABI-of-a-call-insn-ta.patch 5a5a3bc5fa14664be26748c11325021b6b6f8e74 0214-Pass-an-ABI-identifier-to-hard_regno_call_part_clobb.patch 6ee2cc70024253d2670a4a317158b2a65251a1d1 0215-Remove-global-call-sets-DF-entry-exit-defs.patch 559c1ae100489da76a0283750361ace146fdeb77 0216-Remove-global-call-sets-IRA.patch 6c47622219d6386807b26890dcdc84f192499d33 0217-Remove-global-call-sets-LRA.patch a1e6ee38e708ef2bdef4dfbb99473344bd56fa2f 0218-Remove-global-call-sets-regrename.c.patch 
0ce77f463d1d150e70a91807502d628492ca7ae5 0219-Make-ira-call-df_set_regs_ever_live-for-extra-call-c.patch 6d1e98dfd2bfce30640d71df355bedf114229744 0220-AArch64-Allow-shrink-wrapping-of-non-leaf-vector-PCS.patch ce9d2a37f2db20328286f5d3d5a13a4e765c59f7 0221-AArch64-Make-more-use-of-function_abi.patch dcdd0f055731a8c960a15e5de8715d041d9a7876 0222-AArch64-SVE-Utilize-ASRD-instruction-for-division-an.patch c0c2f013906a695b8a02226f119649a370d9e083 0223-AArch64-Make-call-insns-record-the-callee-s-arm_pcs.patch 08cc4d925f640c3cd0336bae4dc6004244a5c80a 0224-AArch64-Use-calls-for-SVE-TLSDESC.patch bb6ce448fc194cca8e51aea274a1b2408c7746c3 0225-Remove-clobber_high.patch 17d184e5c4896264c27c27d125a6c1f8462d9d37 0226-C-Improve-diagnostics-for-vector-types.patch 8209db250f305cc79fd751c3ed056fb9ff551a83 0227-invoke.texi-early-inlining-insns-O2-Document.patch 0b92cf305dcf34387a8e2564e55ca8948df3b47a 0228-cif-code.def-MAX_INLINE_INSNS_SINGLE_O2_LIMIT-.-New.patch 562d1e9556777988ae46c5d1357af2636bc272ea 0229-Fix-EXECUTE_IF_SET_IN_HARD_REG_SET-use.patch 1c8264003ab1d6932d874bd1a9af4ac498d4b4a4 0230-Use-CONSTEXPR-in-machmode.h.patch ad00d6c1746fdcbfd86b2d50f2500d7ccb0d1691 0231-pretty-print-support-URL-escape-sequences-PR-87488.patch d26082357676a3c3843595dfe88a6c682b56e334 0232-Relax-store_bit_field-call-in-store_expr.patch 8b27c9052b8d191c98686e77d2fa610390c78f32 0233-Darwin-machopic-8-n-Back-out-part-of-PR71767-fix.patch f922d945244558904be6868dc036c31fd05750dd 0234-Add-expr_callee_abi.patch 63d25773e166e2e3babe626a5800e70939844754 0235-AArch64-Use-frame-reference-in-aarch64_layout_frame.patch ab43763e519ed8efbbfdac801d008c338fbcb187 0236-AArch64-Add-an-assert-to-aarch64_layout_frame.patch 8e66b377a93e3fc371d0836768740d68ef8fffc5 0237-AArch64-Improve-poly_int-handling-in-aarch64_layout_.patch 9b17a646d90ad0cc30daf8432aa60ad0d751d914 0238-AArch64-Add-partial-SVE-vector-modes.patch 550a338052c374cb1f6c07ffd883c4046565fdd4 0239-AArch64-Fix-symbol-offset-limit.patch 7d3b27ff12610fde9d6c4b56abc70c6ee9b6b3db 0240-AArch64-SVE2-Support-for-EOR3-and-variants-of-BSL.patch 2d57b12e2acd52b843adbcd6d5909cb0b9f7196b 0241-re-PR-target-86753-gcc.target-aarch64-sve-vcond_-45-.patch cc1facefe3b4e3b067d95291a7dba834b830ff18 0242-Pass-a-vec_info-to-get_vectype_for_scalar_type.patch 7ed54790da87bbb4a134020a9fb8bd1b72fd0acb 0243-AArch64-Implement-__rndr-__rndrrs-intrinsics.patch c5dc215df17071281c21450fa2d584e1161e4bc2 0244-re-PR-debug-90231-ivopts-causes-optimized-away-itera.patch d9eabacb0483ac1f730112d551551c258365f02e 0245-Add-a-simulate_builin_function_decl-langhook.patch 740785381ec9944c861dcc29b420c96aa933f040 0246-Add-a-simulate_enum_decl-langhook.patch ac2cfa6cc35175311f92c25acbdd244f0f3bbb87 0247-AArch64-Handle-scalars-in-cmp-and-shift-immediate-qu.patch 6bc67182b6500b942674d6031c1bf0f02c779cbd 0248-AArch64-Add-FFR-and-FFRT-registers.patch 183bfdafc6f1f98711c5400498a7268cc1441096 0249-AArch64-Extend-SVE-reverse-permutes-to-predicates.patch 28350fd1bee1e238e9c57b04c0796e1e17b659e4 0250-AArch64-Add-support-for-arm_sve.h.patch 624d0f07d51b7fa8bc99142bd0e8380fb9e7badc 0251-AArch64-Add-support-for-the-SVE-PCS.patch c600df9a4060da3c6121ff4d0b93f179eafd69d1 0252-AArch64-Add-main-SVE-ACLE-tests.patch bc73c4c24daec96ad3e7ff904645c3095a4febe9 0253-Remove-cgraph_global_info.patch a62bfab5d2a332925fcf10c45b4c5d8ca499439d 0254-AArch64-Remove-unused-mode-iterators.patch ffc111637291037e5546428275e39d8ca16d1fac 0255-AArch64-Use-aarch64_sve_int_mode-in-SVE-ACLE-code.patch 86194087ce338c8d0073d905eb60dca654d6bba3 
0256-Add-build_truth_vector_type_for_mode.patch 0a0ef2387cc1561d537d8d949aef9479ef17ba35 0257-AArch64-Add-FULL-to-SVE-mode-iterator-names.patch f75cdd2c4e5282985a6fbdb2e72e17cb77782044 0258-LRA-handle-memory-constraints-that-accept-more-than-.patch 1aeffdce2dfe718e1337d75eb4f22c3c300df9bb 0259-Handle-VIEW_CONVERT_EXPR-for-variable-length-vectors.patch 13c247d6f2a75b7e7a11546e897489716bc31506 0260-re-PR-target-90867-Multiplication-or-typecast-of-int.patch 94cdd3b7ceff688d039a9f134013ac9069df2e8c 0261-re-PR-inline-asm-92615-ICE-in-extract_insn.patch 8d0d7a63019a7d67943d1867348673e3ca3dc824 0262-re-PR-tree-optimization-92645-Hand-written-vector-co.patch 1fa715db5490fb44668e0a37f9a5927d9030a50e 0263-re-PR-tree-optimization-92690-vector-CTOR-optimizati.patch 88feafba3cb5b186d53080c4958474065c4bd5d2 0264-target.def-TARGET_VECTORIZE_BUILTIN_CONVERSION-Remov.patch 477daf831aea18923733772d686eb1ed448d96e7 0265-re-PR-tree-optimization-92645-Hand-written-vector-co.patch 78307657cf9675bc4aa2e77561c823834714b4c8 0266-re-PR-tree-optimization-92715-error-position-plus-si.patch 438d9c4afa635c7a1475feebbc220fe8d335c664 0267-re-PR-target-92758-r278833-breaks-gcc.target-powerpc.patch 577f4a0e5e7f7ef9b5729a3eed79e523cba9dfa9 0268-re-PR-tree-optimization-92803-error-type-mismatch-in.patch a3408fa3fbf20455eb3b17b5c78397f9d66065c7 0269-Add-ARM-specific-Bfloat-format-support-to-middle-end.patch d5ffd47e9a739770aa7ef5ad06c07fe9f16a3260 0270-re-PR-target-92904-varargs-for-__int128-is-placed-at.patch 46f3e52e834ab0c06902e7424e57513ee6a8aacd 0271-AArch64-Enable-CLI-for-Armv8.6-a-armv8.6-a-i8mm-and-.patch a93e1d5c70abe9fba3522318131a352fad0a4f48 0272-gcc-testsuite-ChangeLog.patch 9260fb066b7ed0b237a3300e05fca9bffe018c6b 0273-Add-a-compatible_vector_types_p-target-hook.patch 482b2b43e5101921ad94e51e052a18b353f8a3f5 0274-AArch64-Specify-some-SVE-ACLE-functions-in-a-more-ge.patch 99a3b91535cb41807d62478cd769bc1bed0db5df 0275-AArch64-Rename-SVE-shape-unary_count-to-unary_to_uin.patch 5b052959dcd2e9c390c7de34f806c4b22a66d8f7 0276-AArch64-Rename-UNSPEC_WHILE-to-match-instruction-mne.patch 6ad9571b172cd98099b477cba4efdd92c85bd222 0277-AArch64-Add-support-for-the-SVE2-ACLE.patch 0a09a9483825233f16e5b26bb0ffee76752339fc 0278-config.gcc-Add-arm_bf16.h.patch abbe1ed27355178223cd099fb73227f392416ea6 0279-aarch64.c-aarch64_invalid_conversion-New-function-fo.patch 9869896730f3055850034c05c596828d517fa9a2 0280-GCC-PATCH-AArch64-Add-ACLE-intrinsics-for-dot-produc.patch 8c197c851e7528baba7cb837f34c05ba2242f705 0281-GCC-PATCH-AArch64-Add-ACLE-intrinsics-for-bfdot-for-.patch f275d73a57f1e5a07fbd4978f4b4457a5eaa1e39 0282-AArch64-Fix-shrinkwrapping-interactions-with-atomics.patch e5e07b68187b9aa334519746c45b8cffc5eb7e5c 0283-AArch64-Enable-CLI-for-Armv8.6-A-f64mm.patch 336e1b950db8b91027cdf0ab33bd905930d7f363 0284-AArch64-SVE-Implement-svld1ro-intrinsic.patch 9ceec73fc0e5033049704becef5d79001e31a245 0285-AArch64-Obvious-Correct-pattern-target-requirement.patch 568f0f355f259f58688dd73f749f4d80adc10e40 0286-AArch64-effective_target-for-aarch64-f64mm-asm.patch 3c9e580511e713068c0ea0d7b34f6e50ebf85447 0287-testsuite-Add-target-xfail-argument-to-check-functio.patch 4c33b2daeb5a87aedef77993971db1a1a1c291e6 0288-aarch64-Skip-some-SVE-ACLE-function-body-tests-for-I.patch b02fbed15a36a86dda6a09a8dc237a8d288f6c09 0289-i386-Fix-ix86_fold_builtin-shift-folding-PR93418.patch bff948aa337807260344c83ac9079d6386410094 0290-forwprop-Tweak-choice-of-VEC_PERM_EXPR-filler-PR9282.patch 1ee3b380dfb479b335f3b50039ce26abcbffe59a 
0291-SRA-Add-verification-of-accesses.patch 5b9e89c922dc2e7e8b8da644bd3a8917c16b22ac 0292-SRA-Total-scalarization-after-access-propagation-PR9.patch 636e80eea24b780f1d5f4c14c58fc00001df8508 0293-aarch64-Fix-SVE-PCS-failures-for-BE-ILP32.patch 2171a9207f51bc486ed9c502cb4da706f594615e 0294-aarch64-Add-Armv8.6-SVE-matrix-multiply-support.patch 3669677425f249c163201c4760d05abb3cf4e6bc 0295-aarch64-Add-svbfloat16_t-support-to-arm_sve.h.patch 02fcd8ac408be56d2a6e67e2e09b26532862f233 0296-aarch64-Add-Armv8.6-SVE-bfloat16-support.patch 896dff99e18d67afdbe4d1effec20a3da474b22b 0297-aarch64-ACLE-intrinsics-bfmmla-and-bfmlal-b-t.patch f78335df69993a900512f92324cab6a20b1bde0c 0298-aarch64-Add-an-extra-sbfiz-pattern-PR87763.patch b65a1eb3fae53f2e1ea1ef8c1164f490d55855a1 0299-x86-64-Pass-aggregates-with-only-float-double-in-GPR.patch ea5ca698dca15dc86b823661ac357a30b49dd0f6 0300-aarch64-ACLE-I8MM-multiply-accumulate-intrinsics.patch 40f648378061c170cf6a9ab680af01b3a3a83569 0301-i386-Skip-ENDBR32-at-the-target-function-entry.patch 1d69147af203d4dcd2270429f90c93f1a37ddfff 0302-testsuite-Fix-recently-added-ipa-testcases-PR93763.patch 103bc4db7665a03bf2390ccc8ceca0dc5a7a81b7 0303-aarch64-Add-bfloat16-vdup-and-vreinterpret-ACLE-intr.patch 8ea6c1b89a20ef7c675535ba1994355361dac977 0304-aarch64-Add-bfloat16-vldn-vstn-intrinsics.patch e603cd43b145c426468c95cf85b3c12c94daedaa 0305-aarch64-ACLE-intrinsics-for-BFCVTN-BFCVTN2-and-BFCVT.patch 1f520d3412962e22b0338461d82f41abba8a4f12 0306-testsuite-Fix-misquoted-string-in-bfcvt-nosimd.c.patch db3fa3476e9e922ca3e283df03ebd14be7220b6e 0307-aarch64-Fix-bf16_v-ld-st-n.c-failures-for-big-endian.patch cf9c3bff39cf973c5c8621ff44199dcb831193a7 0308-testsuite-Fix-gcc.target-aarch64-advsimd-intrinsics-.patch 58a703f0726b3bb6c5ac8b600369106985906590 0309-cleanup-graphite-results.patch 1acde74cf611f560172c74324610c29ca81edf94 diff --git a/gcc/Makefile.in b/gcc/Makefile.in index bc188bbed..46ba89598 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1239,7 +1239,6 @@ OBJS = \ auto-profile.o \ bb-reorder.o \ bitmap.o \ - bt-load.o \ builtins.o \ caller-save.o \ calls.o \ @@ -1305,6 +1304,7 @@ OBJS = \ fold-const.o \ fold-const-call.o \ function.o \ + function-abi.o \ function-tests.o \ fwprop.o \ gcc-rich-location.o \ @@ -2522,6 +2522,7 @@ GTFILES = $(CPPLIB_H) $(srcdir)/input.h $(srcdir)/coretypes.h \ $(srcdir)/libfuncs.h $(SYMTAB_H) \ $(srcdir)/real.h $(srcdir)/function.h $(srcdir)/insn-addr.h $(srcdir)/hwint.h \ $(srcdir)/fixed-value.h \ + $(srcdir)/function-abi.h \ $(srcdir)/output.h $(srcdir)/cfgloop.h $(srcdir)/cfg.h $(srcdir)/profile-count.h \ $(srcdir)/cselib.h $(srcdir)/basic-block.h $(srcdir)/ipa-ref.h $(srcdir)/cgraph.h \ $(srcdir)/reload.h $(srcdir)/caller-save.c $(srcdir)/symtab.c \ diff --git a/gcc/alias.c b/gcc/alias.c index 053c3494e..1a60f905a 100644 --- a/gcc/alias.c +++ b/gcc/alias.c @@ -1572,16 +1572,6 @@ record_set (rtx dest, const_rtx set, void *data ATTRIBUTE_UNUSED) new_reg_base_value[regno] = 0; return; } - /* A CLOBBER_HIGH only wipes out the old value if the mode of the old - value is greater than that of the clobber. 
*/ - else if (GET_CODE (set) == CLOBBER_HIGH) - { - if (new_reg_base_value[regno] != 0 - && reg_is_clobbered_by_clobber_high ( - regno, GET_MODE (new_reg_base_value[regno]), XEXP (set, 0))) - new_reg_base_value[regno] = 0; - return; - } src = SET_SRC (set); } @@ -3284,7 +3274,8 @@ memory_modified_in_insn_p (const_rtx mem, const_rtx insn) if (CALL_P (insn)) return true; memory_modified = false; - note_stores (PATTERN (insn), memory_modified_1, CONST_CAST_RTX(mem)); + note_stores (as_a (insn), memory_modified_1, + CONST_CAST_RTX(mem)); return memory_modified; } @@ -3412,7 +3403,7 @@ init_alias_analysis (void) && find_reg_note (insn, REG_NOALIAS, NULL_RTX)) record_set (SET_DEST (PATTERN (insn)), NULL_RTX, NULL); else - note_stores (PATTERN (insn), record_set, NULL); + note_stores (insn, record_set, NULL); set = single_set (insn); diff --git a/gcc/array-traits.h b/gcc/array-traits.h new file mode 100644 index 000000000..eb65ede94 --- /dev/null +++ b/gcc/array-traits.h @@ -0,0 +1,48 @@ +/* Descriptions of array-like objects. + Copyright (C) 2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_ARRAY_TRAITS_H +#define GCC_ARRAY_TRAITS_H + +/* Implementation for single integers (and similar types). */ +template +struct scalar_array_traits +{ + typedef T element_type; + static const bool has_constant_size = true; + static const size_t constant_size = 1; + static const T *base (const T &x) { return &x; } + static size_t size (const T &) { return 1; } +}; + +template +struct array_traits : scalar_array_traits {}; + +/* Implementation for arrays with a static size. */ +template +struct array_traits +{ + typedef T element_type; + static const bool has_constant_size = true; + static const size_t constant_size = N; + static const T *base (const T (&x)[N]) { return x; } + static size_t size (const T (&x)[N]) { return N; } +}; + +#endif diff --git a/gcc/attribs.c b/gcc/attribs.c index 444192254..d447ea0e4 100644 --- a/gcc/attribs.c +++ b/gcc/attribs.c @@ -691,6 +691,7 @@ decl_attributes (tree *node, tree attributes, int flags, if (!built_in || !DECL_P (*anode) + || DECL_BUILT_IN_CLASS (*anode) != BUILT_IN_NORMAL || (DECL_FUNCTION_CODE (*anode) != BUILT_IN_UNREACHABLE && (DECL_FUNCTION_CODE (*anode) != BUILT_IN_UBSAN_HANDLE_BUILTIN_UNREACHABLE))) diff --git a/gcc/bitmap.c b/gcc/bitmap.c index 5a8236de7..911d506f3 100644 --- a/gcc/bitmap.c +++ b/gcc/bitmap.c @@ -958,17 +958,17 @@ bitmap_set_bit (bitmap head, int bit) /* Return whether a bit is set within a bitmap. 
*/ int -bitmap_bit_p (bitmap head, int bit) +bitmap_bit_p (const_bitmap head, int bit) { unsigned int indx = bit / BITMAP_ELEMENT_ALL_BITS; - bitmap_element *ptr; + const bitmap_element *ptr; unsigned bit_num; unsigned word_num; if (!head->tree_form) - ptr = bitmap_list_find_element (head, indx); + ptr = bitmap_list_find_element (const_cast (head), indx); else - ptr = bitmap_tree_find_element (head, indx); + ptr = bitmap_tree_find_element (const_cast (head), indx); if (ptr == 0) return 0; diff --git a/gcc/bitmap.h b/gcc/bitmap.h index ed25c1ee5..7217f9e0a 100644 --- a/gcc/bitmap.h +++ b/gcc/bitmap.h @@ -210,6 +210,7 @@ along with GCC; see the file COPYING3. If not see on which many random-access membership tests will happen. */ #include "obstack.h" +#include "array-traits.h" /* Bitmap memory usage. */ struct bitmap_usage: public mem_usage @@ -418,7 +419,7 @@ extern bool bitmap_clear_bit (bitmap, int); extern bool bitmap_set_bit (bitmap, int); /* Return true if a bit is set in a bitmap. */ -extern int bitmap_bit_p (bitmap, int); +extern int bitmap_bit_p (const_bitmap, int); /* Debug functions to print a bitmap. */ extern void debug_bitmap (const_bitmap); @@ -937,4 +938,123 @@ class auto_bitmap bitmap_head m_bits; }; +/* Base class for bitmap_view; see there for details. */ +template > +class base_bitmap_view +{ +public: + typedef typename Traits::element_type array_element_type; + + base_bitmap_view (const T &, bitmap_element *); + operator const_bitmap () const { return &m_head; } + +private: + base_bitmap_view (const base_bitmap_view &); + + bitmap_head m_head; +}; + +/* Provides a read-only bitmap view of a single integer bitmask or a + constant-sized array of integer bitmasks, or of a wrapper around such + bitmasks. */ +template +class bitmap_view : public base_bitmap_view +{ +public: + bitmap_view (const T &array) + : base_bitmap_view (array, m_bitmap_elements) {} + +private: + /* How many bitmap_elements we need to hold a full T. */ + static const size_t num_bitmap_elements + = CEIL (CHAR_BIT + * sizeof (typename Traits::element_type) + * Traits::constant_size, + BITMAP_ELEMENT_ALL_BITS); + bitmap_element m_bitmap_elements[num_bitmap_elements]; +}; + +/* Initialize the view for array ARRAY, using the array of bitmap + elements in BITMAP_ELEMENTS (which is known to contain enough + entries). */ +template +base_bitmap_view::base_bitmap_view (const T &array, + bitmap_element *bitmap_elements) +{ + m_head.obstack = NULL; + + /* The code currently assumes that each element of ARRAY corresponds + to exactly one bitmap_element. */ + const size_t array_element_bits = CHAR_BIT * sizeof (array_element_type); + STATIC_ASSERT (BITMAP_ELEMENT_ALL_BITS % array_element_bits == 0); + size_t array_step = BITMAP_ELEMENT_ALL_BITS / array_element_bits; + size_t array_size = Traits::size (array); + + /* Process each potential bitmap_element in turn. The loop is written + this way rather than per array element because usually there are + only a small number of array elements per bitmap element (typically + two or four). The inner loops should therefore unroll completely. */ + const array_element_type *array_elements = Traits::base (array); + unsigned int indx = 0; + for (size_t array_base = 0; + array_base < array_size; + array_base += array_step, indx += 1) + { + /* How many array elements are in this particular bitmap_element. */ + unsigned int array_count + = (STATIC_CONSTANT_P (array_size % array_step == 0) + ? 
array_step : MIN (array_step, array_size - array_base)); + + /* See whether we need this bitmap element. */ + array_element_type ior = array_elements[array_base]; + for (size_t i = 1; i < array_count; ++i) + ior |= array_elements[array_base + i]; + if (ior == 0) + continue; + + /* Grab the next bitmap element and chain it. */ + bitmap_element *bitmap_element = bitmap_elements++; + if (m_head.current) + m_head.current->next = bitmap_element; + else + m_head.first = bitmap_element; + bitmap_element->prev = m_head.current; + bitmap_element->next = NULL; + bitmap_element->indx = indx; + m_head.current = bitmap_element; + m_head.indx = indx; + + /* Fill in the bits of the bitmap element. */ + if (array_element_bits < BITMAP_WORD_BITS) + { + /* Multiple array elements fit in one element of + bitmap_element->bits. */ + size_t array_i = array_base; + for (unsigned int word_i = 0; word_i < BITMAP_ELEMENT_WORDS; + ++word_i) + { + BITMAP_WORD word = 0; + for (unsigned int shift = 0; + shift < BITMAP_WORD_BITS && array_i < array_size; + shift += array_element_bits) + word |= array_elements[array_i++] << shift; + bitmap_element->bits[word_i] = word; + } + } + else + { + /* Array elements are the same size as elements of + bitmap_element->bits, or are an exact multiple of that size. */ + unsigned int word_i = 0; + for (unsigned int i = 0; i < array_count; ++i) + for (unsigned int shift = 0; shift < array_element_bits; + shift += BITMAP_WORD_BITS) + bitmap_element->bits[word_i++] + = array_elements[array_base + i] >> shift; + while (word_i < BITMAP_ELEMENT_WORDS) + bitmap_element->bits[word_i++] = 0; + } + } +} + #endif /* GCC_BITMAP_H */ diff --git a/gcc/bt-load.c b/gcc/bt-load.c deleted file mode 100644 index f68879ca4..000000000 --- a/gcc/bt-load.c +++ /dev/null @@ -1,1577 +0,0 @@ -/* Perform branch target register load optimizations. - Copyright (C) 2001-2019 Free Software Foundation, Inc. - -This file is part of GCC. - -GCC is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 3, or (at your option) any later -version. - -GCC is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -for more details. - -You should have received a copy of the GNU General Public License -along with GCC; see the file COPYING3. If not see -. */ - -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "target.h" -#include "rtl.h" -#include "tree.h" -#include "df.h" -#include "insn-config.h" -#include "regs.h" -#include "memmodel.h" -#include "emit-rtl.h" -#include "recog.h" -#include "diagnostic-core.h" -#include "expr.h" -#include "insn-attr.h" -#include "tree-pass.h" -#include "cfgrtl.h" -#include "cfganal.h" -#include "cfgcleanup.h" -#include "cfgloop.h" -#include "rtl-iter.h" -#include "fibonacci_heap.h" - -struct btr_def; - -/* Target register optimizations - these are performed after reload. */ - -struct btr_def_group -{ - btr_def_group *next; - rtx src; - btr_def *members; -}; - -struct btr_user -{ - btr_user *next; - basic_block bb; - int luid; - rtx_insn *insn; - /* If INSN has a single use of a single branch register, then - USE points to it within INSN. If there is more than - one branch register use, or the use is in some way ambiguous, - then USE is NULL. 
*/ - rtx use; - int n_reaching_defs; - int first_reaching_def; - char other_use_this_block; -}; - -/* btr_def structs appear on three lists: - 1. A list of all btr_def structures (head is - ALL_BTR_DEFS, linked by the NEXT field). - 2. A list of branch reg definitions per basic block (head is - BB_BTR_DEFS[i], linked by the NEXT_THIS_BB field). - 3. A list of all branch reg definitions belonging to the same - group (head is in a BTR_DEF_GROUP struct, linked by - NEXT_THIS_GROUP field). */ - -struct btr_def -{ - btr_def *next_this_bb; - btr_def *next_this_group; - basic_block bb; - int luid; - rtx_insn *insn; - int btr; - int cost; - /* For a branch register setting insn that has a constant - source (i.e. a label), group links together all the - insns with the same source. For other branch register - setting insns, group is NULL. */ - btr_def_group *group; - btr_user *uses; - /* If this def has a reaching use which is not a simple use - in a branch instruction, then has_ambiguous_use will be true, - and we will not attempt to migrate this definition. */ - char has_ambiguous_use; - /* live_range is an approximation to the true live range for this - def/use web, because it records the set of blocks that contain - the live range. There could be other live ranges for the same - branch register in that set of blocks, either in the block - containing the def (before the def), or in a block containing - a use (after the use). If there are such other live ranges, then - other_btr_uses_before_def or other_btr_uses_after_use must be set true - as appropriate. */ - char other_btr_uses_before_def; - char other_btr_uses_after_use; - /* We set own_end when we have moved a definition into a dominator. - Thus, when a later combination removes this definition again, we know - to clear out trs_live_at_end again. 
*/ - char own_end; - bitmap live_range; -}; - -typedef fibonacci_heap btr_heap_t; -typedef fibonacci_node btr_heap_node_t; - -static int issue_rate; - -static int basic_block_freq (const_basic_block); -static int insn_sets_btr_p (const rtx_insn *, int, int *); -static void find_btr_def_group (btr_def_group **, btr_def *); -static btr_def *add_btr_def (btr_heap_t *, basic_block, int, rtx_insn *, - unsigned int, int, btr_def_group **); -static btr_user *new_btr_user (basic_block, int, rtx_insn *); -static void dump_hard_reg_set (HARD_REG_SET); -static void dump_btrs_live (int); -static void note_other_use_this_block (unsigned int, btr_user *); -static void compute_defs_uses_and_gen (btr_heap_t *, btr_def **, btr_user **, - sbitmap *, sbitmap *, HARD_REG_SET *); -static void compute_kill (sbitmap *, sbitmap *, HARD_REG_SET *); -static void compute_out (sbitmap *bb_out, sbitmap *, sbitmap *, int); -static void link_btr_uses (btr_def **, btr_user **, sbitmap *, sbitmap *, int); -static void build_btr_def_use_webs (btr_heap_t *); -static int block_at_edge_of_live_range_p (int, btr_def *); -static void clear_btr_from_live_range (btr_def *def); -static void add_btr_to_live_range (btr_def *, int); -static void augment_live_range (bitmap, HARD_REG_SET *, basic_block, - basic_block, int); -static int choose_btr (HARD_REG_SET); -static void combine_btr_defs (btr_def *, HARD_REG_SET *); -static void btr_def_live_range (btr_def *, HARD_REG_SET *); -static void move_btr_def (basic_block, int, btr_def *, bitmap, HARD_REG_SET *); -static int migrate_btr_def (btr_def *, int); -static void migrate_btr_defs (enum reg_class, int); -static int can_move_up (const_basic_block, const rtx_insn *, int); -static void note_btr_set (rtx, const_rtx, void *); - -/* The following code performs code motion of target load instructions - (instructions that set branch target registers), to move them - forward away from the branch instructions and out of loops (or, - more generally, from a more frequently executed place to a less - frequently executed place). - Moving target load instructions further in front of the branch - instruction that uses the target register value means that the hardware - has a better chance of preloading the instructions at the branch - target by the time the branch is reached. This avoids bubbles - when a taken branch needs to flush out the pipeline. - Moving target load instructions out of loops means they are executed - less frequently. */ - -/* An obstack to hold the def-use web data structures built up for - migrating branch target load instructions. */ -static struct obstack migrate_btrl_obstack; - -/* Array indexed by basic block number, giving the set of registers - live in that block. */ -static HARD_REG_SET *btrs_live; - -/* Array indexed by basic block number, giving the set of registers live at - the end of that block, including any uses by a final jump insn, if any. */ -static HARD_REG_SET *btrs_live_at_end; - -/* Set of all target registers that we are willing to allocate. */ -static HARD_REG_SET all_btrs; - -/* Provide lower and upper bounds for target register numbers, so that - we don't need to search through all the hard registers all the time. */ -static int first_btr, last_btr; - - - -/* Return an estimate of the frequency of execution of block bb. */ -static int -basic_block_freq (const_basic_block bb) -{ - return bb->count.to_frequency (cfun); -} - -/* If the rtx at *XP references (sets or reads) any branch target - register, return one such register. 
If EXCLUDEP is set, disregard - any references within that location. */ -static rtx * -find_btr_use (rtx *xp, rtx *excludep = 0) -{ - subrtx_ptr_iterator::array_type array; - FOR_EACH_SUBRTX_PTR (iter, array, xp, NONCONST) - { - rtx *loc = *iter; - if (loc == excludep) - iter.skip_subrtxes (); - else - { - const_rtx x = *loc; - if (REG_P (x) - && overlaps_hard_reg_set_p (all_btrs, GET_MODE (x), REGNO (x))) - return loc; - } - } - return 0; -} - -/* Return true if insn is an instruction that sets a target register. - if CHECK_CONST is true, only return true if the source is constant. - If such a set is found and REGNO is nonzero, assign the register number - of the destination register to *REGNO. */ -static int -insn_sets_btr_p (const rtx_insn *insn, int check_const, int *regno) -{ - rtx set; - - if (NONJUMP_INSN_P (insn) - && (set = single_set (insn))) - { - rtx dest = SET_DEST (set); - rtx src = SET_SRC (set); - - if (GET_CODE (dest) == SUBREG) - dest = XEXP (dest, 0); - - if (REG_P (dest) - && TEST_HARD_REG_BIT (all_btrs, REGNO (dest))) - { - gcc_assert (!find_btr_use (&src)); - - if (!check_const || CONSTANT_P (src)) - { - if (regno) - *regno = REGNO (dest); - return 1; - } - } - } - return 0; -} - -/* Find the group that the target register definition DEF belongs - to in the list starting with *ALL_BTR_DEF_GROUPS. If no such - group exists, create one. Add def to the group. */ -static void -find_btr_def_group (btr_def_group **all_btr_def_groups, btr_def *def) -{ - if (insn_sets_btr_p (def->insn, 1, NULL)) - { - btr_def_group *this_group; - rtx def_src = SET_SRC (single_set (def->insn)); - - /* ?? This linear search is an efficiency concern, particularly - as the search will almost always fail to find a match. */ - for (this_group = *all_btr_def_groups; - this_group != NULL; - this_group = this_group->next) - if (rtx_equal_p (def_src, this_group->src)) - break; - - if (!this_group) - { - this_group = XOBNEW (&migrate_btrl_obstack, btr_def_group); - this_group->src = def_src; - this_group->members = NULL; - this_group->next = *all_btr_def_groups; - *all_btr_def_groups = this_group; - } - def->group = this_group; - def->next_this_group = this_group->members; - this_group->members = def; - } - else - def->group = NULL; -} - -/* Create a new target register definition structure, for a definition in - block BB, instruction INSN, and insert it into ALL_BTR_DEFS. Return - the new definition. */ -static btr_def * -add_btr_def (btr_heap_t *all_btr_defs, basic_block bb, int insn_luid, - rtx_insn *insn, - unsigned int dest_reg, int other_btr_uses_before_def, - btr_def_group **all_btr_def_groups) -{ - btr_def *this_def = XOBNEW (&migrate_btrl_obstack, btr_def); - this_def->bb = bb; - this_def->luid = insn_luid; - this_def->insn = insn; - this_def->btr = dest_reg; - this_def->cost = basic_block_freq (bb); - this_def->has_ambiguous_use = 0; - this_def->other_btr_uses_before_def = other_btr_uses_before_def; - this_def->other_btr_uses_after_use = 0; - this_def->next_this_bb = NULL; - this_def->next_this_group = NULL; - this_def->uses = NULL; - this_def->live_range = NULL; - find_btr_def_group (all_btr_def_groups, this_def); - - all_btr_defs->insert (-this_def->cost, this_def); - - if (dump_file) - fprintf (dump_file, - "Found target reg definition: sets %u { bb %d, insn %d }%s priority %d\n", - dest_reg, bb->index, INSN_UID (insn), - (this_def->group ? 
"" : ":not const"), this_def->cost); - - return this_def; -} - -/* Create a new target register user structure, for a use in block BB, - instruction INSN. Return the new user. */ -static btr_user * -new_btr_user (basic_block bb, int insn_luid, rtx_insn *insn) -{ - /* This instruction reads target registers. We need - to decide whether we can replace all target register - uses easily. - */ - rtx *usep = find_btr_use (&PATTERN (insn)); - rtx use; - btr_user *user = NULL; - - if (usep) - { - int unambiguous_single_use; - - /* We want to ensure that USE is the only use of a target - register in INSN, so that we know that to rewrite INSN to use - a different target register, all we have to do is replace USE. */ - unambiguous_single_use = !find_btr_use (&PATTERN (insn), usep); - if (!unambiguous_single_use) - usep = NULL; - } - use = usep ? *usep : NULL_RTX; - user = XOBNEW (&migrate_btrl_obstack, btr_user); - user->bb = bb; - user->luid = insn_luid; - user->insn = insn; - user->use = use; - user->other_use_this_block = 0; - user->next = NULL; - user->n_reaching_defs = 0; - user->first_reaching_def = -1; - - if (dump_file) - { - fprintf (dump_file, "Uses target reg: { bb %d, insn %d }", - bb->index, INSN_UID (insn)); - - if (user->use) - fprintf (dump_file, ": unambiguous use of reg %d\n", - REGNO (user->use)); - } - - return user; -} - -/* Write the contents of S to the dump file. */ -static void -dump_hard_reg_set (HARD_REG_SET s) -{ - int reg; - for (reg = 0; reg < FIRST_PSEUDO_REGISTER; reg++) - if (TEST_HARD_REG_BIT (s, reg)) - fprintf (dump_file, " %d", reg); -} - -/* Write the set of target regs live in block BB to the dump file. */ -static void -dump_btrs_live (int bb) -{ - fprintf (dump_file, "BB%d live:", bb); - dump_hard_reg_set (btrs_live[bb]); - fprintf (dump_file, "\n"); -} - -/* REGNO is the number of a branch target register that is being used or - set. USERS_THIS_BB is a list of preceding branch target register users; - If any of them use the same register, set their other_use_this_block - flag. */ -static void -note_other_use_this_block (unsigned int regno, btr_user *users_this_bb) -{ - btr_user *user; - - for (user = users_this_bb; user != NULL; user = user->next) - if (user->use && REGNO (user->use) == regno) - user->other_use_this_block = 1; -} - -struct defs_uses_info { - btr_user *users_this_bb; - HARD_REG_SET btrs_written_in_block; - HARD_REG_SET btrs_live_in_block; - sbitmap bb_gen; - sbitmap *btr_defset; -}; - -/* Called via note_stores or directly to register stores into / - clobbers of a branch target register DEST that are not recognized as - straightforward definitions. DATA points to information about the - current basic block that needs updating. 
*/ -static void -note_btr_set (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data) -{ - defs_uses_info *info = (defs_uses_info *) data; - int regno, end_regno; - - if (!REG_P (dest)) - return; - regno = REGNO (dest); - end_regno = END_REGNO (dest); - for (; regno < end_regno; regno++) - if (TEST_HARD_REG_BIT (all_btrs, regno)) - { - note_other_use_this_block (regno, info->users_this_bb); - SET_HARD_REG_BIT (info->btrs_written_in_block, regno); - SET_HARD_REG_BIT (info->btrs_live_in_block, regno); - bitmap_and_compl (info->bb_gen, info->bb_gen, - info->btr_defset[regno - first_btr]); - } -} - -static void -compute_defs_uses_and_gen (btr_heap_t *all_btr_defs, btr_def **def_array, - btr_user **use_array, sbitmap *btr_defset, - sbitmap *bb_gen, HARD_REG_SET *btrs_written) -{ - /* Scan the code building up the set of all defs and all uses. - For each target register, build the set of defs of that register. - For each block, calculate the set of target registers - written in that block. - Also calculate the set of btrs ever live in that block. - */ - int i; - int insn_luid = 0; - btr_def_group *all_btr_def_groups = NULL; - defs_uses_info info; - - bitmap_vector_clear (bb_gen, last_basic_block_for_fn (cfun)); - for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) - { - basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i); - int reg; - btr_def *defs_this_bb = NULL; - rtx_insn *insn; - rtx_insn *last; - int can_throw = 0; - - info.users_this_bb = NULL; - info.bb_gen = bb_gen[i]; - info.btr_defset = btr_defset; - - CLEAR_HARD_REG_SET (info.btrs_live_in_block); - CLEAR_HARD_REG_SET (info.btrs_written_in_block); - for (reg = first_btr; reg <= last_btr; reg++) - if (TEST_HARD_REG_BIT (all_btrs, reg) - && REGNO_REG_SET_P (df_get_live_in (bb), reg)) - SET_HARD_REG_BIT (info.btrs_live_in_block, reg); - - for (insn = BB_HEAD (bb), last = NEXT_INSN (BB_END (bb)); - insn != last; - insn = NEXT_INSN (insn), insn_luid++) - { - if (INSN_P (insn)) - { - int regno; - int insn_uid = INSN_UID (insn); - - if (insn_sets_btr_p (insn, 0, ®no)) - { - btr_def *def = add_btr_def ( - all_btr_defs, bb, insn_luid, insn, regno, - TEST_HARD_REG_BIT (info.btrs_live_in_block, regno), - &all_btr_def_groups); - - def_array[insn_uid] = def; - SET_HARD_REG_BIT (info.btrs_written_in_block, regno); - SET_HARD_REG_BIT (info.btrs_live_in_block, regno); - bitmap_and_compl (bb_gen[i], bb_gen[i], - btr_defset[regno - first_btr]); - bitmap_set_bit (bb_gen[i], insn_uid); - def->next_this_bb = defs_this_bb; - defs_this_bb = def; - bitmap_set_bit (btr_defset[regno - first_btr], insn_uid); - note_other_use_this_block (regno, info.users_this_bb); - } - /* Check for the blockage emitted by expand_nl_goto_receiver. */ - else if (cfun->has_nonlocal_label - && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE) - { - btr_user *user; - - /* Do the equivalent of calling note_other_use_this_block - for every target register. 
*/ - for (user = info.users_this_bb; user != NULL; - user = user->next) - if (user->use) - user->other_use_this_block = 1; - IOR_HARD_REG_SET (info.btrs_written_in_block, all_btrs); - IOR_HARD_REG_SET (info.btrs_live_in_block, all_btrs); - bitmap_clear (info.bb_gen); - } - else - { - if (find_btr_use (&PATTERN (insn))) - { - btr_user *user = new_btr_user (bb, insn_luid, insn); - - use_array[insn_uid] = user; - if (user->use) - SET_HARD_REG_BIT (info.btrs_live_in_block, - REGNO (user->use)); - else - { - int reg; - for (reg = first_btr; reg <= last_btr; reg++) - if (TEST_HARD_REG_BIT (all_btrs, reg) - && refers_to_regno_p (reg, user->insn)) - { - note_other_use_this_block (reg, - info.users_this_bb); - SET_HARD_REG_BIT (info.btrs_live_in_block, reg); - } - note_stores (PATTERN (insn), note_btr_set, &info); - } - user->next = info.users_this_bb; - info.users_this_bb = user; - } - if (CALL_P (insn)) - { - HARD_REG_SET *clobbered = &call_used_reg_set; - HARD_REG_SET call_saved; - rtx pat = PATTERN (insn); - int i; - - /* Check for sibcall. */ - if (GET_CODE (pat) == PARALLEL) - for (i = XVECLEN (pat, 0) - 1; i >= 0; i--) - if (ANY_RETURN_P (XVECEXP (pat, 0, i))) - { - COMPL_HARD_REG_SET (call_saved, - call_used_reg_set); - clobbered = &call_saved; - } - - for (regno = first_btr; regno <= last_btr; regno++) - if (TEST_HARD_REG_BIT (*clobbered, regno)) - note_btr_set (regno_reg_rtx[regno], NULL_RTX, &info); - } - } - } - } - - COPY_HARD_REG_SET (btrs_live[i], info.btrs_live_in_block); - COPY_HARD_REG_SET (btrs_written[i], info.btrs_written_in_block); - - REG_SET_TO_HARD_REG_SET (btrs_live_at_end[i], df_get_live_out (bb)); - /* If this block ends in a jump insn, add any uses or even clobbers - of branch target registers that it might have. */ - for (insn = BB_END (bb); insn != BB_HEAD (bb) && ! INSN_P (insn); ) - insn = PREV_INSN (insn); - /* ??? for the fall-through edge, it would make sense to insert the - btr set on the edge, but that would require to split the block - early on so that we can distinguish between dominance from the fall - through edge - which can use the call-clobbered registers - from - dominance by the throw edge. */ - if (can_throw_internal (insn)) - { - HARD_REG_SET tmp; - - COPY_HARD_REG_SET (tmp, call_used_reg_set); - AND_HARD_REG_SET (tmp, all_btrs); - IOR_HARD_REG_SET (btrs_live_at_end[i], tmp); - can_throw = 1; - } - if (can_throw || JUMP_P (insn)) - { - int regno; - - for (regno = first_btr; regno <= last_btr; regno++) - if (refers_to_regno_p (regno, insn)) - SET_HARD_REG_BIT (btrs_live_at_end[i], regno); - } - - if (dump_file) - dump_btrs_live (i); - } -} - -static void -compute_kill (sbitmap *bb_kill, sbitmap *btr_defset, - HARD_REG_SET *btrs_written) -{ - int i; - int regno; - - /* For each basic block, form the set BB_KILL - the set - of definitions that the block kills. */ - bitmap_vector_clear (bb_kill, last_basic_block_for_fn (cfun)); - for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) - { - for (regno = first_btr; regno <= last_btr; regno++) - if (TEST_HARD_REG_BIT (all_btrs, regno) - && TEST_HARD_REG_BIT (btrs_written[i], regno)) - bitmap_ior (bb_kill[i], bb_kill[i], - btr_defset[regno - first_btr]); - } -} - -static void -compute_out (sbitmap *bb_out, sbitmap *bb_gen, sbitmap *bb_kill, int max_uid) -{ - /* Perform iterative dataflow: - Initially, for all blocks, BB_OUT = BB_GEN. - For each block, - BB_IN = union over predecessors of BB_OUT(pred) - BB_OUT = (BB_IN - BB_KILL) + BB_GEN - Iterate until the bb_out sets stop growing. 
*/ - int i; - int changed; - auto_sbitmap bb_in (max_uid); - - for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) - bitmap_copy (bb_out[i], bb_gen[i]); - - changed = 1; - while (changed) - { - changed = 0; - for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) - { - bitmap_union_of_preds (bb_in, bb_out, BASIC_BLOCK_FOR_FN (cfun, i)); - changed |= bitmap_ior_and_compl (bb_out[i], bb_gen[i], - bb_in, bb_kill[i]); - } - } -} - -static void -link_btr_uses (btr_def **def_array, btr_user **use_array, sbitmap *bb_out, - sbitmap *btr_defset, int max_uid) -{ - int i; - auto_sbitmap reaching_defs (max_uid); - - /* Link uses to the uses lists of all of their reaching defs. - Count up the number of reaching defs of each use. */ - for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) - { - basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i); - rtx_insn *insn; - rtx_insn *last; - - bitmap_union_of_preds (reaching_defs, bb_out, BASIC_BLOCK_FOR_FN (cfun, i)); - for (insn = BB_HEAD (bb), last = NEXT_INSN (BB_END (bb)); - insn != last; - insn = NEXT_INSN (insn)) - { - if (INSN_P (insn)) - { - int insn_uid = INSN_UID (insn); - - btr_def *def = def_array[insn_uid]; - btr_user *user = use_array[insn_uid]; - if (def != NULL) - { - /* Remove all reaching defs of regno except - for this one. */ - bitmap_and_compl (reaching_defs, reaching_defs, - btr_defset[def->btr - first_btr]); - bitmap_set_bit (reaching_defs, insn_uid); - } - - if (user != NULL) - { - /* Find all the reaching defs for this use. */ - auto_sbitmap reaching_defs_of_reg (max_uid); - unsigned int uid = 0; - sbitmap_iterator sbi; - - if (user->use) - bitmap_and ( - reaching_defs_of_reg, - reaching_defs, - btr_defset[REGNO (user->use) - first_btr]); - else - { - int reg; - - bitmap_clear (reaching_defs_of_reg); - for (reg = first_btr; reg <= last_btr; reg++) - if (TEST_HARD_REG_BIT (all_btrs, reg) - && refers_to_regno_p (reg, user->insn)) - bitmap_or_and (reaching_defs_of_reg, - reaching_defs_of_reg, - reaching_defs, - btr_defset[reg - first_btr]); - } - EXECUTE_IF_SET_IN_BITMAP (reaching_defs_of_reg, 0, uid, sbi) - { - btr_def *def = def_array[uid]; - - /* We now know that def reaches user. */ - - if (dump_file) - fprintf (dump_file, - "Def in insn %d reaches use in insn %d\n", - uid, insn_uid); - - user->n_reaching_defs++; - if (!user->use) - def->has_ambiguous_use = 1; - if (user->first_reaching_def != -1) - { /* There is more than one reaching def. This is - a rare case, so just give up on this def/use - web when it occurs. 
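
As an aside on the compute_out hunk above: its deleted comment describes a standard forward dataflow fixed point over reaching target-register definitions. A minimal self-contained sketch of the same recurrence, using plain 64-bit masks instead of GCC's sbitmap vectors (all names below are illustrative, not GCC API), is:

#include <cstdint>
#include <vector>

// out[b] = gen[b] | (in[b] & ~kill[b]), where in[b] is the union of
// out[p] over all predecessors p of b.  Iterate until nothing changes.
// OUT must be pre-sized to the same number of blocks as GEN/KILL.
void
compute_out_sketch (std::vector<uint64_t> &out,
                    const std::vector<uint64_t> &gen,
                    const std::vector<uint64_t> &kill,
                    const std::vector<std::vector<int> > &preds)
{
  size_t nblocks = gen.size ();
  for (size_t b = 0; b < nblocks; ++b)
    out[b] = gen[b];                     // initially OUT = GEN

  bool changed = true;
  while (changed)
    {
      changed = false;
      for (size_t b = 0; b < nblocks; ++b)
        {
          uint64_t in = 0;
          for (int p : preds[b])
            in |= out[p];                // IN = union of predecessors' OUT
          uint64_t new_out = gen[b] | (in & ~kill[b]);
          if (new_out != out[b])
            {
              out[b] = new_out;
              changed = true;
            }
        }
    }
}

In the hunk itself, bitmap_ior_and_compl performs the gen | (in & ~kill) step and reports whether the destination set grew, which is what drives the changed flag.
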
*/ - def->has_ambiguous_use = 1; - def_array[user->first_reaching_def] - ->has_ambiguous_use = 1; - if (dump_file) - fprintf (dump_file, - "(use %d has multiple reaching defs)\n", - insn_uid); - } - else - user->first_reaching_def = uid; - if (user->other_use_this_block) - def->other_btr_uses_after_use = 1; - user->next = def->uses; - def->uses = user; - } - } - - if (CALL_P (insn)) - { - int regno; - - for (regno = first_btr; regno <= last_btr; regno++) - if (TEST_HARD_REG_BIT (all_btrs, regno) - && TEST_HARD_REG_BIT (call_used_reg_set, regno)) - bitmap_and_compl (reaching_defs, reaching_defs, - btr_defset[regno - first_btr]); - } - } - } - } -} - -static void -build_btr_def_use_webs (btr_heap_t *all_btr_defs) -{ - const int max_uid = get_max_uid (); - btr_def **def_array = XCNEWVEC (btr_def *, max_uid); - btr_user **use_array = XCNEWVEC (btr_user *, max_uid); - sbitmap *btr_defset = sbitmap_vector_alloc ( - (last_btr - first_btr) + 1, max_uid); - sbitmap *bb_gen = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), - max_uid); - HARD_REG_SET *btrs_written = XCNEWVEC (HARD_REG_SET, - last_basic_block_for_fn (cfun)); - sbitmap *bb_kill; - sbitmap *bb_out; - - bitmap_vector_clear (btr_defset, (last_btr - first_btr) + 1); - - compute_defs_uses_and_gen (all_btr_defs, def_array, use_array, btr_defset, - bb_gen, btrs_written); - - bb_kill = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), max_uid); - compute_kill (bb_kill, btr_defset, btrs_written); - free (btrs_written); - - bb_out = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), max_uid); - compute_out (bb_out, bb_gen, bb_kill, max_uid); - - sbitmap_vector_free (bb_gen); - sbitmap_vector_free (bb_kill); - - link_btr_uses (def_array, use_array, bb_out, btr_defset, max_uid); - - sbitmap_vector_free (bb_out); - sbitmap_vector_free (btr_defset); - free (use_array); - free (def_array); -} - -/* Return true if basic block BB contains the start or end of the - live range of the definition DEF, AND there are other live - ranges of the same target register that include BB. */ -static int -block_at_edge_of_live_range_p (int bb, btr_def *def) -{ - if (def->other_btr_uses_before_def - && BASIC_BLOCK_FOR_FN (cfun, bb) == def->bb) - return 1; - else if (def->other_btr_uses_after_use) - { - btr_user *user; - for (user = def->uses; user != NULL; user = user->next) - if (BASIC_BLOCK_FOR_FN (cfun, bb) == user->bb) - return 1; - } - return 0; -} - -/* We are removing the def/use web DEF. The target register - used in this web is therefore no longer live in the live range - of this web, so remove it from the live set of all basic blocks - in the live range of the web. - Blocks at the boundary of the live range may contain other live - ranges for the same target register, so we have to be careful - to remove the target register from the live set of these blocks - only if they do not contain other live ranges for the same register. */ -static void -clear_btr_from_live_range (btr_def *def) -{ - unsigned bb; - bitmap_iterator bi; - - EXECUTE_IF_SET_IN_BITMAP (def->live_range, 0, bb, bi) - { - if ((!def->other_btr_uses_before_def - && !def->other_btr_uses_after_use) - || !block_at_edge_of_live_range_p (bb, def)) - { - CLEAR_HARD_REG_BIT (btrs_live[bb], def->btr); - CLEAR_HARD_REG_BIT (btrs_live_at_end[bb], def->btr); - if (dump_file) - dump_btrs_live (bb); - } - } - if (def->own_end) - CLEAR_HARD_REG_BIT (btrs_live_at_end[def->bb->index], def->btr); -} - - -/* We are adding the def/use web DEF. 
Add the target register used - in this web to the live set of all of the basic blocks that contain - the live range of the web. - If OWN_END is set, also show that the register is live from our - definitions at the end of the basic block where it is defined. */ -static void -add_btr_to_live_range (btr_def *def, int own_end) -{ - unsigned bb; - bitmap_iterator bi; - - EXECUTE_IF_SET_IN_BITMAP (def->live_range, 0, bb, bi) - { - SET_HARD_REG_BIT (btrs_live[bb], def->btr); - SET_HARD_REG_BIT (btrs_live_at_end[bb], def->btr); - if (dump_file) - dump_btrs_live (bb); - } - if (own_end) - { - SET_HARD_REG_BIT (btrs_live_at_end[def->bb->index], def->btr); - def->own_end = 1; - } -} - -/* Update a live range to contain the basic block NEW_BLOCK, and all - blocks on paths between the existing live range and NEW_BLOCK. - HEAD is a block contained in the existing live range that dominates - all other blocks in the existing live range. - Also add to the set BTRS_LIVE_IN_RANGE all target registers that - are live in the blocks that we add to the live range. - If FULL_RANGE is set, include the full live range of NEW_BB; - otherwise, if NEW_BB dominates HEAD_BB, only add registers that - are life at the end of NEW_BB for NEW_BB itself. - It is a precondition that either NEW_BLOCK dominates HEAD,or - HEAD dom NEW_BLOCK. This is used to speed up the - implementation of this function. */ -static void -augment_live_range (bitmap live_range, HARD_REG_SET *btrs_live_in_range, - basic_block head_bb, basic_block new_bb, int full_range) -{ - basic_block *worklist, *tos; - - tos = worklist = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun) + 1); - - if (dominated_by_p (CDI_DOMINATORS, new_bb, head_bb)) - { - if (new_bb == head_bb) - { - if (full_range) - IOR_HARD_REG_SET (*btrs_live_in_range, btrs_live[new_bb->index]); - free (tos); - return; - } - *tos++ = new_bb; - } - else - { - edge e; - edge_iterator ei; - int new_block = new_bb->index; - - gcc_assert (dominated_by_p (CDI_DOMINATORS, head_bb, new_bb)); - - IOR_HARD_REG_SET (*btrs_live_in_range, btrs_live[head_bb->index]); - bitmap_set_bit (live_range, new_block); - /* A previous btr migration could have caused a register to be - live just at the end of new_block which we need in full, so - use trs_live_at_end even if full_range is set. */ - IOR_HARD_REG_SET (*btrs_live_in_range, btrs_live_at_end[new_block]); - if (full_range) - IOR_HARD_REG_SET (*btrs_live_in_range, btrs_live[new_block]); - if (dump_file) - { - fprintf (dump_file, - "Adding end of block %d and rest of %d to live range\n", - new_block, head_bb->index); - fprintf (dump_file,"Now live btrs are "); - dump_hard_reg_set (*btrs_live_in_range); - fprintf (dump_file, "\n"); - } - FOR_EACH_EDGE (e, ei, head_bb->preds) - *tos++ = e->src; - } - - while (tos != worklist) - { - basic_block bb = *--tos; - if (!bitmap_bit_p (live_range, bb->index)) - { - edge e; - edge_iterator ei; - - bitmap_set_bit (live_range, bb->index); - IOR_HARD_REG_SET (*btrs_live_in_range, - btrs_live[bb->index]); - /* A previous btr migration could have caused a register to be - live just at the end of a block which we need in full. 
*/ - IOR_HARD_REG_SET (*btrs_live_in_range, - btrs_live_at_end[bb->index]); - if (dump_file) - { - fprintf (dump_file, - "Adding block %d to live range\n", bb->index); - fprintf (dump_file,"Now live btrs are "); - dump_hard_reg_set (*btrs_live_in_range); - fprintf (dump_file, "\n"); - } - - FOR_EACH_EDGE (e, ei, bb->preds) - { - basic_block pred = e->src; - if (!bitmap_bit_p (live_range, pred->index)) - *tos++ = pred; - } - } - } - - free (worklist); -} - -/* Return the most desirable target register that is not in - the set USED_BTRS. */ -static int -choose_btr (HARD_REG_SET used_btrs) -{ - int i; - - if (!hard_reg_set_subset_p (all_btrs, used_btrs)) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - { -#ifdef REG_ALLOC_ORDER - int regno = reg_alloc_order[i]; -#else - int regno = i; -#endif - if (TEST_HARD_REG_BIT (all_btrs, regno) - && !TEST_HARD_REG_BIT (used_btrs, regno)) - return regno; - } - return -1; -} - -/* Calculate the set of basic blocks that contain the live range of - the def/use web DEF. - Also calculate the set of target registers that are live at time - in this live range, but ignore the live range represented by DEF - when calculating this set. */ -static void -btr_def_live_range (btr_def *def, HARD_REG_SET *btrs_live_in_range) -{ - if (!def->live_range) - { - btr_user *user; - - def->live_range = BITMAP_ALLOC (NULL); - - bitmap_set_bit (def->live_range, def->bb->index); - COPY_HARD_REG_SET (*btrs_live_in_range, - (flag_btr_bb_exclusive - ? btrs_live : btrs_live_at_end)[def->bb->index]); - - for (user = def->uses; user != NULL; user = user->next) - augment_live_range (def->live_range, btrs_live_in_range, - def->bb, user->bb, - (flag_btr_bb_exclusive - || user->insn != BB_END (def->bb) - || !JUMP_P (user->insn))); - } - else - { - /* def->live_range is accurate, but we need to recompute - the set of target registers live over it, because migration - of other PT instructions may have affected it. - */ - unsigned bb; - unsigned def_bb = flag_btr_bb_exclusive ? -1 : def->bb->index; - bitmap_iterator bi; - - CLEAR_HARD_REG_SET (*btrs_live_in_range); - EXECUTE_IF_SET_IN_BITMAP (def->live_range, 0, bb, bi) - { - IOR_HARD_REG_SET (*btrs_live_in_range, - (def_bb == bb - ? btrs_live_at_end : btrs_live) [bb]); - } - } - if (!def->other_btr_uses_before_def && - !def->other_btr_uses_after_use) - CLEAR_HARD_REG_BIT (*btrs_live_in_range, def->btr); -} - -/* Merge into the def/use web DEF any other def/use webs in the same - group that are dominated by DEF, provided that there is a target - register available to allocate to the merged web. */ -static void -combine_btr_defs (btr_def *def, HARD_REG_SET *btrs_live_in_range) -{ - btr_def *other_def; - - for (other_def = def->group->members; - other_def != NULL; - other_def = other_def->next_this_group) - { - if (other_def != def - && other_def->uses != NULL - && ! other_def->has_ambiguous_use - && dominated_by_p (CDI_DOMINATORS, other_def->bb, def->bb)) - { - /* def->bb dominates the other def, so def and other_def could - be combined. */ - /* Merge their live ranges, and get the set of - target registers live over the merged range. 
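
A side note on the choose_btr code above: it walks the target's preferred allocation order and returns the first branch-target register that is not already live over the candidate range. A hedged, self-contained sketch of that selection idiom, with toy 64-bit masks standing in for GCC's HARD_REG_SET (it assumes at most 64 hard registers):

#include <cstdint>

// Pick the most desirable free register: scan candidates in allocation-order
// preference and return the first one that is a branch-target register
// (all_btrs) and not already used over the live range (used_btrs).
int
choose_btr_sketch (uint64_t all_btrs, uint64_t used_btrs,
                   const int *alloc_order, int nregs)
{
  if ((all_btrs & ~used_btrs) == 0)
    return -1;                          // every target register is taken
  for (int i = 0; i < nregs; ++i)
    {
      int regno = alloc_order[i];       // REG_ALLOC_ORDER analogue
      if ((all_btrs & (UINT64_C (1) << regno))
          && !(used_btrs & (UINT64_C (1) << regno)))
        return regno;
    }
  return -1;
}

When REG_ALLOC_ORDER is not defined, the deleted code simply scans register numbers in ascending order, which the alloc_order array above can model with an identity mapping.
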
*/ - int btr; - HARD_REG_SET combined_btrs_live; - auto_bitmap combined_live_range; - btr_user *user; - - if (other_def->live_range == NULL) - { - HARD_REG_SET dummy_btrs_live_in_range; - btr_def_live_range (other_def, &dummy_btrs_live_in_range); - } - COPY_HARD_REG_SET (combined_btrs_live, *btrs_live_in_range); - bitmap_copy (combined_live_range, def->live_range); - - for (user = other_def->uses; user != NULL; user = user->next) - augment_live_range (combined_live_range, &combined_btrs_live, - def->bb, user->bb, - (flag_btr_bb_exclusive - || user->insn != BB_END (def->bb) - || !JUMP_P (user->insn))); - - btr = choose_btr (combined_btrs_live); - if (btr != -1) - { - /* We can combine them. */ - if (dump_file) - fprintf (dump_file, - "Combining def in insn %d with def in insn %d\n", - INSN_UID (other_def->insn), INSN_UID (def->insn)); - - def->btr = btr; - user = other_def->uses; - while (user != NULL) - { - btr_user *next = user->next; - - user->next = def->uses; - def->uses = user; - user = next; - } - /* Combining def/use webs can make target registers live - after uses where they previously were not. This means - some REG_DEAD notes may no longer be correct. We could - be more precise about this if we looked at the combined - live range, but here I just delete any REG_DEAD notes - in case they are no longer correct. */ - for (user = def->uses; user != NULL; user = user->next) - remove_note (user->insn, - find_regno_note (user->insn, REG_DEAD, - REGNO (user->use))); - clear_btr_from_live_range (other_def); - other_def->uses = NULL; - bitmap_copy (def->live_range, combined_live_range); - if (other_def->btr == btr && other_def->other_btr_uses_after_use) - def->other_btr_uses_after_use = 1; - COPY_HARD_REG_SET (*btrs_live_in_range, combined_btrs_live); - - /* Delete the old target register initialization. */ - delete_insn (other_def->insn); - - } - } - } -} - -/* Move the definition DEF from its current position to basic - block NEW_DEF_BB, and modify it to use branch target register BTR. - Delete the old defining insn, and insert a new one in NEW_DEF_BB. - Update all reaching uses of DEF in the RTL to use BTR. - If this new position means that other defs in the - same group can be combined with DEF then combine them. */ -static void -move_btr_def (basic_block new_def_bb, int btr, btr_def *def, bitmap live_range, - HARD_REG_SET *btrs_live_in_range) -{ - /* We can move the instruction. - Set a target register in block NEW_DEF_BB to the value - needed for this target register definition. - Replace all uses of the old target register definition by - uses of the new definition. Delete the old definition. */ - basic_block b = new_def_bb; - rtx_insn *insp = BB_HEAD (b); - rtx_insn *old_insn = def->insn; - rtx src; - rtx btr_rtx; - rtx_insn *new_insn; - machine_mode btr_mode; - btr_user *user; - rtx set; - - if (dump_file) - fprintf(dump_file, "migrating to basic block %d, using reg %d\n", - new_def_bb->index, btr); - - clear_btr_from_live_range (def); - def->btr = btr; - def->bb = new_def_bb; - def->luid = 0; - def->cost = basic_block_freq (new_def_bb); - bitmap_copy (def->live_range, live_range); - combine_btr_defs (def, btrs_live_in_range); - btr = def->btr; - def->other_btr_uses_before_def - = TEST_HARD_REG_BIT (btrs_live[b->index], btr) ? 1 : 0; - add_btr_to_live_range (def, 1); - if (LABEL_P (insp)) - insp = NEXT_INSN (insp); - /* N.B.: insp is expected to be NOTE_INSN_BASIC_BLOCK now. Some - optimizations can result in insp being both first and last insn of - its basic block. */ - /* ?? 
some assertions to check that insp is sensible? */ - - if (def->other_btr_uses_before_def) - { - for (insp = BB_END (b); ! INSN_P (insp); insp = PREV_INSN (insp)) - gcc_assert (insp != BB_HEAD (b)); - - if (JUMP_P (insp) || can_throw_internal (insp)) - insp = PREV_INSN (insp); - } - - set = single_set (old_insn); - src = SET_SRC (set); - btr_mode = GET_MODE (SET_DEST (set)); - btr_rtx = gen_rtx_REG (btr_mode, btr); - - new_insn = gen_move_insn (btr_rtx, src); - - /* Insert target register initialization at head of basic block. */ - def->insn = emit_insn_after (new_insn, insp); - - df_set_regs_ever_live (btr, true); - - if (dump_file) - fprintf (dump_file, "New pt is insn %d, inserted after insn %d\n", - INSN_UID (def->insn), INSN_UID (insp)); - - /* Delete the old target register initialization. */ - delete_insn (old_insn); - - /* Replace each use of the old target register by a use of the new target - register. */ - for (user = def->uses; user != NULL; user = user->next) - { - /* Some extra work here to ensure consistent modes, because - it seems that a target register REG rtx can be given a different - mode depending on the context (surely that should not be - the case?). */ - rtx replacement_rtx; - if (GET_MODE (user->use) == GET_MODE (btr_rtx) - || GET_MODE (user->use) == VOIDmode) - replacement_rtx = btr_rtx; - else - replacement_rtx = gen_rtx_REG (GET_MODE (user->use), btr); - validate_replace_rtx (user->use, replacement_rtx, user->insn); - user->use = replacement_rtx; - } -} - -/* We anticipate intra-block scheduling to be done. See if INSN could move - up within BB by N_INSNS. */ -static int -can_move_up (const_basic_block bb, const rtx_insn *insn, int n_insns) -{ - while (insn != BB_HEAD (bb) && n_insns > 0) - { - insn = PREV_INSN (insn); - /* ??? What if we have an anti-dependency that actually prevents the - scheduler from doing the move? We'd like to re-allocate the register, - but not necessarily put the load into another basic block. */ - if (INSN_P (insn)) - n_insns--; - } - return n_insns <= 0; -} - -/* Attempt to migrate the target register definition DEF to an - earlier point in the flowgraph. - - It is a precondition of this function that DEF is migratable: - i.e. it has a constant source, and all uses are unambiguous. - - Only migrations that reduce the cost of DEF will be made. - MIN_COST is the lower bound on the cost of the DEF after migration. - If we migrate DEF so that its cost falls below MIN_COST, - then we do not attempt to migrate further. The idea is that - we migrate definitions in a priority order based on their cost, - when the cost of this definition falls below MIN_COST, then - there is another definition with cost == MIN_COST which now - has a higher priority than this definition. - - Return nonzero if there may be benefit from attempting to - migrate this DEF further (i.e. we have reduced the cost below - MIN_COST, but we may be able to reduce it further). - Return zero if no further migration is possible. */ -static int -migrate_btr_def (btr_def *def, int min_cost) -{ - HARD_REG_SET btrs_live_in_range; - int btr_used_near_def = 0; - int def_basic_block_freq; - basic_block attempt; - int give_up = 0; - int def_moved = 0; - btr_user *user; - int def_latency; - - if (dump_file) - fprintf (dump_file, - "Attempting to migrate pt from insn %d (cost = %d, min_cost = %d) ... ", - INSN_UID (def->insn), def->cost, min_cost); - - if (!def->group || def->has_ambiguous_use) - /* These defs are not migratable. 
*/ - { - if (dump_file) - fprintf (dump_file, "it's not migratable\n"); - return 0; - } - - if (!def->uses) - /* We have combined this def with another in the same group, so - no need to consider it further. - */ - { - if (dump_file) - fprintf (dump_file, "it's already combined with another pt\n"); - return 0; - } - - btr_def_live_range (def, &btrs_live_in_range); - auto_bitmap live_range; - bitmap_copy (live_range, def->live_range); - -#ifdef INSN_SCHEDULING - def_latency = insn_default_latency (def->insn) * issue_rate; -#else - def_latency = issue_rate; -#endif - - for (user = def->uses; user != NULL; user = user->next) - { - if (user->bb == def->bb - && user->luid > def->luid - && (def->luid + def_latency) > user->luid - && ! can_move_up (def->bb, def->insn, - (def->luid + def_latency) - user->luid)) - { - btr_used_near_def = 1; - break; - } - } - - def_basic_block_freq = basic_block_freq (def->bb); - - for (attempt = get_immediate_dominator (CDI_DOMINATORS, def->bb); - !give_up && attempt && attempt != ENTRY_BLOCK_PTR_FOR_FN (cfun) - && def->cost >= min_cost; - attempt = get_immediate_dominator (CDI_DOMINATORS, attempt)) - { - /* Try to move the instruction that sets the target register into - basic block ATTEMPT. */ - int try_freq = basic_block_freq (attempt); - edge_iterator ei; - edge e; - - /* If ATTEMPT has abnormal edges, skip it. */ - FOR_EACH_EDGE (e, ei, attempt->succs) - if (e->flags & EDGE_COMPLEX) - break; - if (e) - continue; - - if (dump_file) - fprintf (dump_file, "trying block %d ...", attempt->index); - - if (try_freq < def_basic_block_freq - || (try_freq == def_basic_block_freq && btr_used_near_def)) - { - int btr; - augment_live_range (live_range, &btrs_live_in_range, def->bb, attempt, - flag_btr_bb_exclusive); - if (dump_file) - { - fprintf (dump_file, "Now btrs live in range are: "); - dump_hard_reg_set (btrs_live_in_range); - fprintf (dump_file, "\n"); - } - btr = choose_btr (btrs_live_in_range); - if (btr != -1) - { - move_btr_def (attempt, btr, def, live_range, &btrs_live_in_range); - bitmap_copy (live_range, def->live_range); - btr_used_near_def = 0; - def_moved = 1; - def_basic_block_freq = basic_block_freq (def->bb); - } - else - { - /* There are no free target registers available to move - this far forward, so give up */ - give_up = 1; - if (dump_file) - fprintf (dump_file, - "giving up because there are no free target registers\n"); - } - - } - } - if (!def_moved) - { - give_up = 1; - if (dump_file) - fprintf (dump_file, "failed to move\n"); - } - - return !give_up; -} - -/* Attempt to move instructions that set target registers earlier - in the flowgraph, away from their corresponding uses. 
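
For orientation on the priority scheme spelled out in the migrate_btr_def comment above: definitions are pulled off a heap costliest first (the heap is keyed on negative cost), and migrate_btr_def reports whether another attempt might still pay off, in which case the definition is reinserted with its new cost. A minimal sketch of that control loop using a standard priority queue rather than GCC's fibonacci heap (types and the placeholder migration policy are illustrative only):

#include <queue>
#include <utility>
#include <vector>

struct def_record
{
  int cost;   // execution-frequency based cost of the definition
  int id;     // stand-in for the btr_def itself
};

// Placeholder for migrate_btr_def: pretend each attempt halves the cost.
// Return true when the cost dropped below MIN_COST but is not yet zero,
// i.e. requeueing might pay off.
static bool
try_migrate (def_record &def, int min_cost)
{
  def.cost /= 2;
  return def.cost < min_cost && def.cost > 0;
}

static void
migrate_all (std::vector<def_record> defs)
{
  // Costliest definition first, mirroring the -cost keying of the real heap.
  auto cmp = [] (const def_record &a, const def_record &b)
    { return a.cost < b.cost; };
  std::priority_queue<def_record, std::vector<def_record>, decltype (cmp)>
    queue (cmp, std::move (defs));

  while (!queue.empty ())
    {
      def_record def = queue.top ();
      int min_cost = def.cost;   // bound in effect when DEF was extracted;
                                 // the real pass derives it from the heap key
      queue.pop ();
      if (try_migrate (def, min_cost))
        queue.push (def);        // still migratable: retry later
    }
}
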
*/ -static void -migrate_btr_defs (enum reg_class btr_class, int allow_callee_save) -{ - btr_heap_t all_btr_defs (LONG_MIN); - int reg; - - gcc_obstack_init (&migrate_btrl_obstack); - if (dump_file) - { - int i; - - for (i = NUM_FIXED_BLOCKS; i < last_basic_block_for_fn (cfun); i++) - { - basic_block bb = BASIC_BLOCK_FOR_FN (cfun, i); - fprintf (dump_file, "Basic block %d: count = ", i); - bb->count.dump (dump_file); - fprintf (dump_file, " loop-depth = %d idom = %d\n", - bb_loop_depth (bb), - get_immediate_dominator (CDI_DOMINATORS, bb)->index); - } - } - - CLEAR_HARD_REG_SET (all_btrs); - for (first_btr = -1, reg = 0; reg < FIRST_PSEUDO_REGISTER; reg++) - if (TEST_HARD_REG_BIT (reg_class_contents[(int) btr_class], reg) - && (allow_callee_save || call_used_regs[reg] - || df_regs_ever_live_p (reg))) - { - SET_HARD_REG_BIT (all_btrs, reg); - last_btr = reg; - if (first_btr < 0) - first_btr = reg; - } - - btrs_live = XCNEWVEC (HARD_REG_SET, last_basic_block_for_fn (cfun)); - btrs_live_at_end = XCNEWVEC (HARD_REG_SET, last_basic_block_for_fn (cfun)); - - build_btr_def_use_webs (&all_btr_defs); - - while (!all_btr_defs.empty ()) - { - int min_cost = -all_btr_defs.min_key (); - btr_def *def = all_btr_defs.extract_min (); - if (migrate_btr_def (def, min_cost)) - { - all_btr_defs.insert (-def->cost, def); - if (dump_file) - { - fprintf (dump_file, - "Putting insn %d back on queue with priority %d\n", - INSN_UID (def->insn), def->cost); - } - } - else - BITMAP_FREE (def->live_range); - } - - free (btrs_live); - free (btrs_live_at_end); - obstack_free (&migrate_btrl_obstack, NULL); -} - -static void -branch_target_load_optimize (bool after_prologue_epilogue_gen) -{ - enum reg_class klass - = (enum reg_class) targetm.branch_target_register_class (); - if (klass != NO_REGS) - { - /* Initialize issue_rate. */ - if (targetm.sched.issue_rate) - issue_rate = targetm.sched.issue_rate (); - else - issue_rate = 1; - - if (!after_prologue_epilogue_gen) - { - /* Build the CFG for migrate_btr_defs. */ -#if 1 - /* This may or may not be needed, depending on where we - run this phase. */ - cleanup_cfg (optimize ? CLEANUP_EXPENSIVE : 0); -#endif - } - df_analyze (); - - - /* Dominator info is also needed for migrate_btr_def. 
*/ - calculate_dominance_info (CDI_DOMINATORS); - migrate_btr_defs (klass, - (targetm.branch_target_register_callee_saved - (after_prologue_epilogue_gen))); - - free_dominance_info (CDI_DOMINATORS); - } -} - -namespace { - -const pass_data pass_data_branch_target_load_optimize1 = -{ - RTL_PASS, /* type */ - "btl1", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_NONE, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - 0, /* todo_flags_finish */ -}; - -class pass_branch_target_load_optimize1 : public rtl_opt_pass -{ -public: - pass_branch_target_load_optimize1 (gcc::context *ctxt) - : rtl_opt_pass (pass_data_branch_target_load_optimize1, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) { return flag_branch_target_load_optimize; } - virtual unsigned int execute (function *) - { - branch_target_load_optimize (epilogue_completed); - return 0; - } - -}; // class pass_branch_target_load_optimize1 - -} // anon namespace - -rtl_opt_pass * -make_pass_branch_target_load_optimize1 (gcc::context *ctxt) -{ - return new pass_branch_target_load_optimize1 (ctxt); -} - - -namespace { - -const pass_data pass_data_branch_target_load_optimize2 = -{ - RTL_PASS, /* type */ - "btl2", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_NONE, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - 0, /* todo_flags_finish */ -}; - -class pass_branch_target_load_optimize2 : public rtl_opt_pass -{ -public: - pass_branch_target_load_optimize2 (gcc::context *ctxt) - : rtl_opt_pass (pass_data_branch_target_load_optimize2, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return (optimize > 0 && flag_branch_target_load_optimize2); - } - - virtual unsigned int execute (function *); - -}; // class pass_branch_target_load_optimize2 - -unsigned int -pass_branch_target_load_optimize2::execute (function *) -{ - static int warned = 0; - - /* Leave this a warning for now so that it is possible to experiment - with running this pass twice. In 3.6, we should either make this - an error, or use separate dump files. */ - if (flag_branch_target_load_optimize - && flag_branch_target_load_optimize2 - && !warned) - { - warning (0, "branch target register load optimization is not intended " - "to be run twice"); - - warned = 1; - } - - branch_target_load_optimize (epilogue_completed); - return 0; -} - -} // anon namespace - -rtl_opt_pass * -make_pass_branch_target_load_optimize2 (gcc::context *ctxt) -{ - return new pass_branch_target_load_optimize2 (ctxt); -} diff --git a/gcc/builtins.c b/gcc/builtins.c index 910e614a4..945205c1d 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -1431,7 +1431,7 @@ expand_builtin_prefetch (tree exp) } /* Get a MEM rtx for expression EXP which is the address of an operand - to be used in a string instruction (cmpstrsi, movmemsi, ..). LEN is + to be used in a string instruction (cmpstrsi, cpymemsi, ..). LEN is the maximum length of the block of memory that might be accessed or NULL if unknown. 
*/ @@ -7224,7 +7224,6 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, int ignore) { tree fndecl = get_callee_fndecl (exp); - enum built_in_function fcode = DECL_FUNCTION_CODE (fndecl); machine_mode target_mode = TYPE_MODE (TREE_TYPE (exp)); int flags; @@ -7236,6 +7235,7 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, redundant checks and be sure, that possible overflow will be detected by ASan. */ + enum built_in_function fcode = DECL_FUNCTION_CODE (fndecl); if ((flag_sanitize & SANITIZE_ADDRESS) && asan_intercepted_p (fcode)) return expand_call (exp, target, ignore); diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c index d220e8135..bf3db074a 100644 --- a/gcc/c-family/c-common.c +++ b/gcc/c-family/c-common.c @@ -5835,15 +5835,27 @@ builtin_function_validate_nargs (location_t loc, tree fndecl, int nargs, /* Verifies the NARGS arguments ARGS to the builtin function FNDECL. Returns false if there was an error, otherwise true. LOC is the location of the function; ARG_LOC is a vector of locations of the - arguments. */ + arguments. If FNDECL is the result of resolving an overloaded + target built-in, ORIG_FNDECL is the original function decl, + otherwise it is null. */ bool check_builtin_function_arguments (location_t loc, vec arg_loc, - tree fndecl, int nargs, tree *args) + tree fndecl, tree orig_fndecl, + int nargs, tree *args) { - if (!fndecl_built_in_p (fndecl, BUILT_IN_NORMAL)) + if (!fndecl_built_in_p (fndecl)) return true; + if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) + return (!targetm.check_builtin_call + || targetm.check_builtin_call (loc, arg_loc, fndecl, + orig_fndecl, nargs, args)); + + if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_FRONTEND) + return true; + + gcc_assert (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL); switch (DECL_FUNCTION_CODE (fndecl)) { case BUILT_IN_ALLOCA_WITH_ALIGN_AND_MAX: @@ -7317,8 +7329,6 @@ tree resolve_overloaded_builtin (location_t loc, tree function, vec *params) { - enum built_in_function orig_code = DECL_FUNCTION_CODE (function); - /* Is function one of the _FETCH_OP_ or _OP_FETCH_ built-ins? Those are not valid to call with a pointer to _Bool (or C++ bool) and so must be rejected. */ @@ -7340,6 +7350,7 @@ resolve_overloaded_builtin (location_t loc, tree function, } /* Handle BUILT_IN_NORMAL here. 
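
A quick illustration of the dispatch that the reworked check_builtin_function_arguments above performs: machine-specific built-ins (BUILT_IN_MD) are routed to an optional target hook, front-end built-ins are accepted as-is, and only normal built-ins fall through to the generic per-function-code switch. A self-contained mock of that shape (the enum, hook pointer and checker below are stand-ins, not the GCC declarations):

enum built_in_class_mock { NORMAL_BUILTIN, MD_BUILTIN, FRONTEND_BUILTIN, NOT_BUILTIN };

struct decl_mock
{
  built_in_class_mock klass;
  int fcode;                    // function code, used for NORMAL_BUILTIN only
};

// Optional target hook; a null pointer means "no extra checking".
typedef bool (*check_builtin_call_hook) (const decl_mock &, int nargs);
static check_builtin_call_hook target_check_builtin_call = nullptr;

static bool
check_normal_builtin (const decl_mock &decl, int nargs)
{
  switch (decl.fcode)
    {
    case 1:                     // e.g. an alloca-like built-in: one argument
      return nargs == 1;
    default:
      return true;
    }
}

bool
check_builtin_arguments_mock (const decl_mock &decl, int nargs)
{
  if (decl.klass == NOT_BUILTIN)
    return true;
  if (decl.klass == MD_BUILTIN)
    // Machine built-ins: defer to the target hook if one is registered.
    return !target_check_builtin_call
           || target_check_builtin_call (decl, nargs);
  if (decl.klass == FRONTEND_BUILTIN)
    return true;
  return check_normal_builtin (decl, nargs);
}
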
*/ + enum built_in_function orig_code = DECL_FUNCTION_CODE (function); switch (orig_code) { case BUILT_IN_SPECULATION_SAFE_VALUE_N: diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h index 683764267..46b8d265a 100644 --- a/gcc/c-family/c-common.h +++ b/gcc/c-family/c-common.h @@ -818,7 +818,7 @@ extern void check_function_arguments_recurse (void (*) void *, tree, unsigned HOST_WIDE_INT); extern bool check_builtin_function_arguments (location_t, vec, - tree, int, tree *); + tree, tree, int, tree *); extern void check_function_format (const_tree, tree, int, tree *, vec *); extern bool attribute_fallthrough_p (tree); @@ -995,7 +995,8 @@ extern bool c_switch_covers_all_cases_p (splay_tree, tree); extern tree build_function_call (location_t, tree, tree); extern tree build_function_call_vec (location_t, vec, tree, - vec *, vec *); + vec *, vec *, + tree = NULL_TREE); extern tree resolve_overloaded_builtin (location_t, tree, vec *); diff --git a/gcc/c-family/c-pretty-print.c b/gcc/c-family/c-pretty-print.c index 3e25624d3..1e14658c0 100644 --- a/gcc/c-family/c-pretty-print.c +++ b/gcc/c-family/c-pretty-print.c @@ -470,6 +470,16 @@ pp_c_specifier_qualifier_list (c_pretty_printer *pp, tree t) ? "_Complex" : "__complex__")); else if (code == VECTOR_TYPE) { + /* The syntax we print for vector types isn't real C or C++ syntax, + so it's better to print the type name if we have one. */ + tree name = TYPE_NAME (t); + if (!(pp->flags & pp_c_flag_gnu_v3) + && name + && TREE_CODE (name) == TYPE_DECL) + { + pp->id_expression (name); + break; + } pp_c_ws_string (pp, "__vector"); pp_c_left_paren (pp); pp_wide_integer (pp, TYPE_VECTOR_SUBPARTS (t)); diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c index 859a62412..288dbe9d9 100644 --- a/gcc/c/c-decl.c +++ b/gcc/c/c-decl.c @@ -604,7 +604,7 @@ static tree grokparms (struct c_arg_info *, bool); static void layout_array_type (tree); static void warn_defaults_to (location_t, int, const char *, ...) ATTRIBUTE_GCC_DIAG(3,4); -static const char *header_for_builtin_fn (enum built_in_function); +static const char *header_for_builtin_fn (tree); /* T is a statement. Add it to the statement-tree. This is the C/ObjC version--C++ has a slightly different version of this @@ -1951,7 +1951,8 @@ diagnose_mismatched_decls (tree newdecl, tree olddecl, if (!comptypes (oldtype, newtype)) { if (TREE_CODE (olddecl) == FUNCTION_DECL - && fndecl_built_in_p (olddecl) && !C_DECL_DECLARED_BUILTIN (olddecl)) + && fndecl_built_in_p (olddecl, BUILT_IN_NORMAL) + && !C_DECL_DECLARED_BUILTIN (olddecl)) { /* Accept "harmless" mismatches in function types such as missing qualifiers or pointer vs same size integer @@ -1973,8 +1974,7 @@ diagnose_mismatched_decls (tree newdecl, tree olddecl, /* If types don't match for a built-in, throw away the built-in. No point in calling locate_old_decl here, it won't print anything. 
*/ - const char *header - = header_for_builtin_fn (DECL_FUNCTION_CODE (olddecl)); + const char *header = header_for_builtin_fn (olddecl); location_t loc = DECL_SOURCE_LOCATION (newdecl); if (warning_at (loc, OPT_Wbuiltin_declaration_mismatch, "conflicting types for built-in function %q+D; " @@ -2637,7 +2637,8 @@ merge_decls (tree newdecl, tree olddecl, tree newtype, tree oldtype) |= DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (olddecl); TREE_THIS_VOLATILE (newdecl) |= TREE_THIS_VOLATILE (olddecl); DECL_IS_MALLOC (newdecl) |= DECL_IS_MALLOC (olddecl); - DECL_IS_OPERATOR_NEW (newdecl) |= DECL_IS_OPERATOR_NEW (olddecl); + if (DECL_IS_OPERATOR_NEW_P (olddecl)) + DECL_SET_IS_OPERATOR_NEW (newdecl, true); TREE_READONLY (newdecl) |= TREE_READONLY (olddecl); DECL_PURE_P (newdecl) |= DECL_PURE_P (olddecl); DECL_IS_NOVOPS (newdecl) |= DECL_IS_NOVOPS (olddecl); @@ -2731,8 +2732,7 @@ merge_decls (tree newdecl, tree olddecl, tree newtype, tree oldtype) { /* If redeclaring a builtin function, it stays built in. But it gets tagged as having been declared. */ - DECL_BUILT_IN_CLASS (newdecl) = DECL_BUILT_IN_CLASS (olddecl); - DECL_FUNCTION_CODE (newdecl) = DECL_FUNCTION_CODE (olddecl); + copy_decl_built_in_function (newdecl, olddecl); C_DECL_DECLARED_BUILTIN (newdecl) = 1; if (new_is_prototype) { @@ -3334,13 +3334,17 @@ implicit_decl_warning (location_t loc, tree id, tree olddecl) hint.suppress (); } -/* This function represents mapping of a function code FCODE - to its respective header. */ +/* Return the name of the header file that declares built-in function + FNDECL, or null if either we don't know or don't expect to see an + explicit declaration. */ static const char * -header_for_builtin_fn (enum built_in_function fcode) +header_for_builtin_fn (tree fndecl) { - switch (fcode) + if (DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL) + return NULL; + + switch (DECL_FUNCTION_CODE (fndecl)) { CASE_FLT_FN (BUILT_IN_ACOS): CASE_FLT_FN (BUILT_IN_ACOSH): @@ -3590,8 +3594,7 @@ implicitly_declare (location_t loc, tree functionid) "declaration of built-in " "function %qD", decl); /* See if we can hint which header to include. */ - const char *header - = header_for_builtin_fn (DECL_FUNCTION_CODE (decl)); + const char *header = header_for_builtin_fn (decl); if (header != NULL && warned) { rich_location richloc (line_table, loc); @@ -4471,6 +4474,16 @@ c_builtin_function_ext_scope (tree decl) return decl; } + +/* Implement LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL. */ + +tree +c_simulate_builtin_function_decl (tree decl) +{ + tree type = TREE_TYPE (decl); + C_DECL_BUILTIN_PROTOTYPE (decl) = prototype_p (type); + return pushdecl (decl); +} /* Called when a declaration is seen that contains no names to declare. If its type is a reference to a structure, union or enum inherited @@ -8746,6 +8759,8 @@ finish_enum (tree enumtype, tree values, tree attributes) && !in_sizeof && !in_typeof && !in_alignof) struct_parse_info->struct_types.safe_push (enumtype); + C_TYPE_BEING_DEFINED (enumtype) = 0; + return enumtype; } @@ -8851,6 +8866,36 @@ build_enumerator (location_t decl_loc, location_t loc, return tree_cons (decl, value, NULL_TREE); } +/* Implement LANG_HOOKS_SIMULATE_ENUM_DECL. 
*/ + +tree +c_simulate_enum_decl (location_t loc, const char *name, + vec values) +{ + location_t saved_loc = input_location; + input_location = loc; + + struct c_enum_contents the_enum; + tree enumtype = start_enum (loc, &the_enum, get_identifier (name)); + + tree value_chain = NULL_TREE; + string_int_pair *value; + unsigned int i; + FOR_EACH_VEC_ELT (values, i, value) + { + tree decl = build_enumerator (loc, loc, &the_enum, + get_identifier (value->first), + build_int_cst (integer_type_node, + value->second)); + TREE_CHAIN (decl) = value_chain; + value_chain = decl; + } + + finish_enum (enumtype, nreverse (value_chain), NULL_TREE); + + input_location = saved_loc; + return enumtype; +} /* Create the FUNCTION_DECL for a function definition. DECLSPECS, DECLARATOR and ATTRIBUTES are the parts of diff --git a/gcc/c/c-objc-common.h b/gcc/c/c-objc-common.h index f5e820420..c8739e0b8 100644 --- a/gcc/c/c-objc-common.h +++ b/gcc/c/c-objc-common.h @@ -60,6 +60,9 @@ along with GCC; see the file COPYING3. If not see #define LANG_HOOKS_BUILTIN_FUNCTION c_builtin_function #undef LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE #define LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE c_builtin_function_ext_scope +#undef LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL +#define LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL \ + c_simulate_builtin_function_decl #undef LANG_HOOKS_EMITS_BEGIN_STMT #define LANG_HOOKS_EMITS_BEGIN_STMT true @@ -72,6 +75,8 @@ along with GCC; see the file COPYING3. If not see #undef LANG_HOOKS_TREE_DUMP_DUMP_TREE_FN #define LANG_HOOKS_TREE_DUMP_DUMP_TREE_FN c_dump_tree +#undef LANG_HOOKS_SIMULATE_ENUM_DECL +#define LANG_HOOKS_SIMULATE_ENUM_DECL c_simulate_enum_decl #undef LANG_HOOKS_TYPE_FOR_MODE #define LANG_HOOKS_TYPE_FOR_MODE c_common_type_for_mode #undef LANG_HOOKS_TYPE_FOR_SIZE diff --git a/gcc/c/c-tree.h b/gcc/c/c-tree.h index 7e35ab1f0..19925e793 100644 --- a/gcc/c/c-tree.h +++ b/gcc/c/c-tree.h @@ -561,6 +561,8 @@ extern tree finish_enum (tree, tree, tree); extern void finish_function (void); extern tree finish_struct (location_t, tree, tree, tree, struct c_struct_parse_info *); +extern tree c_simulate_enum_decl (location_t, const char *, + vec); extern struct c_arg_info *build_arg_info (void); extern struct c_arg_info *get_parm_info (bool, tree); extern tree grokfield (location_t, struct c_declarator *, @@ -577,6 +579,7 @@ extern struct c_declarator *set_array_declarator_inner (struct c_declarator *, struct c_declarator *); extern tree c_builtin_function (tree); extern tree c_builtin_function_ext_scope (tree); +extern tree c_simulate_builtin_function_decl (tree); extern void shadow_tag (const struct c_declspecs *); extern void shadow_tag_warned (const struct c_declspecs *, int); extern tree start_enum (location_t, struct c_enum_contents *, tree); diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index cb999cbf8..87f4178ec 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -3002,6 +3002,8 @@ inform_declaration (tree decl) } /* Build a function call to function FUNCTION with parameters PARAMS. + If FUNCTION is the result of resolving an overloaded target built-in, + ORIG_FUNDECL is the original function decl, otherwise it is null. ORIGTYPES, if not NULL, is a vector of types; each element is either NULL or the original type of the corresponding element in PARAMS. 
The original type may differ from TREE_TYPE of the @@ -3012,7 +3014,7 @@ inform_declaration (tree decl) tree build_function_call_vec (location_t loc, vec arg_loc, tree function, vec *params, - vec *origtypes) + vec *origtypes, tree orig_fundecl) { tree fntype, fundecl = NULL_TREE; tree name = NULL_TREE, result; @@ -3032,6 +3034,8 @@ build_function_call_vec (location_t loc, vec arg_loc, if (flag_tm) tm_malloc_replacement (function); fundecl = function; + if (!orig_fundecl) + orig_fundecl = fundecl; /* Atomic functions have type checking/casting already done. They are often rewritten and don't match the original parameter list. */ if (name && !strncmp (IDENTIFIER_POINTER (name), "__atomic_", 9)) @@ -3109,9 +3113,10 @@ build_function_call_vec (location_t loc, vec arg_loc, argarray = vec_safe_address (params); /* Check that arguments to builtin functions match the expectations. */ - if (fundecl && fndecl_built_in_p (fundecl, BUILT_IN_NORMAL) - && !check_builtin_function_arguments (loc, arg_loc, fundecl, nargs, - argarray)) + if (fundecl + && fndecl_built_in_p (fundecl) + && !check_builtin_function_arguments (loc, arg_loc, fundecl, + orig_fundecl, nargs, argarray)) return error_mark_node; /* Check that the arguments to the function are valid. */ diff --git a/gcc/caller-save.c b/gcc/caller-save.c index 9ff470c33..0d66e0ce5 100644 --- a/gcc/caller-save.c +++ b/gcc/caller-save.c @@ -37,6 +37,7 @@ along with GCC; see the file COPYING3. If not see #include "dumpfile.h" #include "rtl-iter.h" #include "target.h" +#include "function-abi.h" #define MOVE_MAX_WORDS (MOVE_MAX / UNITS_PER_WORD) @@ -192,29 +193,17 @@ init_caller_save (void) caller_save_initialized_p = true; - CLEAR_HARD_REG_SET (no_caller_save_reg_set); /* First find all the registers that we need to deal with and all the modes that they can have. If we can't find a mode to use, we can't have the register live over calls. */ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - { - if (call_used_regs[i] - && !TEST_HARD_REG_BIT (call_fixed_reg_set, i)) - { - for (j = 1; j <= MOVE_MAX_WORDS; j++) - { - regno_save_mode[i][j] = HARD_REGNO_CALLER_SAVE_MODE (i, j, - VOIDmode); - if (regno_save_mode[i][j] == VOIDmode && j == 1) - { - SET_HARD_REG_BIT (call_fixed_reg_set, i); - } - } - } - else - regno_save_mode[i][1] = VOIDmode; - } + for (j = 1; j <= MOVE_MAX_WORDS; j++) + { + regno_save_mode[i][j] = HARD_REGNO_CALLER_SAVE_MODE (i, j, VOIDmode); + if (regno_save_mode[i][j] == VOIDmode && j == 1) + CLEAR_HARD_REG_BIT (savable_regs, i); + } /* The following code tries to approximate the conditions under which we can easily save and restore a register without scratch registers or @@ -275,11 +264,7 @@ init_caller_save (void) { regno_save_mode[i][j] = VOIDmode; if (j == 1) - { - SET_HARD_REG_BIT (call_fixed_reg_set, i); - if (call_used_regs[i]) - SET_HARD_REG_BIT (no_caller_save_reg_set, i); - } + CLEAR_HARD_REG_BIT (savable_regs, i); } } @@ -442,7 +427,9 @@ setup_save_areas (void) freq = REG_FREQ_FROM_BB (BLOCK_FOR_INSN (insn)); REG_SET_TO_HARD_REG_SET (hard_regs_to_save, &chain->live_throughout); - get_call_reg_set_usage (insn, &used_regs, call_used_reg_set); + used_regs = insn_callee_abi (insn).full_reg_clobbers (); + /* ??? This preserves traditional behavior; it might not be needed. */ + used_regs |= fixed_reg_set; /* Record all registers set in this call insn. These don't need to be saved. N.B. the call insn might set a subreg @@ -450,14 +437,13 @@ setup_save_areas (void) live during the call, but the subreg that is set isn't. 
*/ CLEAR_HARD_REG_SET (this_insn_sets); - note_stores (PATTERN (insn), mark_set_regs, &this_insn_sets); + note_stores (insn, mark_set_regs, &this_insn_sets); /* Sibcalls are considered to set the return value. */ if (SIBLING_CALL_P (insn) && crtl->return_rtx) mark_set_regs (crtl->return_rtx, NULL_RTX, &this_insn_sets); - AND_COMPL_HARD_REG_SET (used_regs, call_fixed_reg_set); - AND_COMPL_HARD_REG_SET (used_regs, this_insn_sets); - AND_HARD_REG_SET (hard_regs_to_save, used_regs); + used_regs &= ~(fixed_reg_set | this_insn_sets); + hard_regs_to_save &= used_regs & savable_regs; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (TEST_HARD_REG_BIT (hard_regs_to_save, regno)) { @@ -526,7 +512,10 @@ setup_save_areas (void) REG_SET_TO_HARD_REG_SET (hard_regs_to_save, &chain->live_throughout); - get_call_reg_set_usage (insn, &used_regs, call_used_reg_set); + used_regs = insn_callee_abi (insn).full_reg_clobbers (); + /* ??? This preserves traditional behavior; it might not + be needed. */ + used_regs |= fixed_reg_set; /* Record all registers set in this call insn. These don't need to be saved. N.B. the call insn might set a subreg @@ -534,15 +523,14 @@ setup_save_areas (void) live during the call, but the subreg that is set isn't. */ CLEAR_HARD_REG_SET (this_insn_sets); - note_stores (PATTERN (insn), mark_set_regs, &this_insn_sets); + note_stores (insn, mark_set_regs, &this_insn_sets); /* Sibcalls are considered to set the return value, compare df-scan.c:df_get_call_refs. */ if (SIBLING_CALL_P (insn) && crtl->return_rtx) mark_set_regs (crtl->return_rtx, NULL_RTX, &this_insn_sets); - AND_COMPL_HARD_REG_SET (used_regs, call_fixed_reg_set); - AND_COMPL_HARD_REG_SET (used_regs, this_insn_sets); - AND_HARD_REG_SET (hard_regs_to_save, used_regs); + used_regs &= ~(fixed_reg_set | this_insn_sets); + hard_regs_to_save &= used_regs & savable_regs; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (TEST_HARD_REG_BIT (hard_regs_to_save, regno)) { @@ -775,13 +763,13 @@ save_call_clobbered_regs (void) if (code == JUMP_INSN) /* Restore all registers if this is a JUMP_INSN. */ - COPY_HARD_REG_SET (referenced_regs, hard_regs_saved); + referenced_regs = hard_regs_saved; else { CLEAR_HARD_REG_SET (referenced_regs); mark_referenced_regs (&PATTERN (insn), mark_reg_as_referenced, NULL); - AND_HARD_REG_SET (referenced_regs, hard_regs_saved); + referenced_regs &= hard_regs_saved; } for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) @@ -795,8 +783,8 @@ save_call_clobbered_regs (void) be live across the call, while the other is set afterwards. */ CLEAR_HARD_REG_SET (this_insn_sets); - note_stores (PATTERN (insn), mark_set_regs, &this_insn_sets); - AND_COMPL_HARD_REG_SET (hard_regs_saved, this_insn_sets); + note_stores (insn, mark_set_regs, &this_insn_sets); + hard_regs_saved &= ~this_insn_sets; } if (code == CALL_INSN @@ -849,15 +837,18 @@ save_call_clobbered_regs (void) multi-hard-reg pseudo; then the pseudo is considered live during the call, but the subreg that is set isn't. */ CLEAR_HARD_REG_SET (this_insn_sets); - note_stores (PATTERN (insn), mark_set_regs, &this_insn_sets); + note_stores (insn, mark_set_regs, &this_insn_sets); /* Compute which hard regs must be saved before this call. 
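
The caller-save.c hunks above rely on hard register sets being combinable with ordinary C++ operators instead of the old AND_HARD_REG_SET-style macros. A tiny self-contained illustration of that set algebra for the "which registers must be saved around this call" computation, with a toy 64-bit mask standing in for HARD_REG_SET and operand names that merely echo the ones in the hunk:

#include <cstdint>

typedef uint64_t reg_set;   // toy stand-in for HARD_REG_SET

// Registers that must be saved around a call: those live across the call
// and clobbered by the callee, minus fixed registers, registers the call
// insn itself sets, registers already saved, and registers we cannot save.
reg_set
regs_to_save_sketch (reg_set live_across_call, reg_set callee_clobbers,
                     reg_set fixed_regs, reg_set this_insn_sets,
                     reg_set already_saved, reg_set savable_regs)
{
  reg_set to_save = live_across_call;
  to_save &= ~(fixed_regs | this_insn_sets | already_saved);
  to_save &= savable_regs;
  to_save &= callee_clobbers;   // insn_callee_abi (insn).full_reg_clobbers ()
  return to_save;
}
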
*/ - AND_COMPL_HARD_REG_SET (hard_regs_to_save, call_fixed_reg_set); - AND_COMPL_HARD_REG_SET (hard_regs_to_save, this_insn_sets); - AND_COMPL_HARD_REG_SET (hard_regs_to_save, hard_regs_saved); - get_call_reg_set_usage (insn, &call_def_reg_set, - call_used_reg_set); - AND_HARD_REG_SET (hard_regs_to_save, call_def_reg_set); + hard_regs_to_save &= ~(fixed_reg_set + | this_insn_sets + | hard_regs_saved); + hard_regs_to_save &= savable_regs; + call_def_reg_set = insn_callee_abi (insn).full_reg_clobbers (); + /* ??? This preserves traditional behavior; it might not + be needed. */ + call_def_reg_set |= fixed_reg_set; + hard_regs_to_save &= call_def_reg_set; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (TEST_HARD_REG_BIT (hard_regs_to_save, regno)) @@ -872,7 +863,8 @@ save_call_clobbered_regs (void) if (cheap && HARD_REGISTER_P (cheap) - && TEST_HARD_REG_BIT (call_used_reg_set, REGNO (cheap))) + && TEST_HARD_REG_BIT (call_used_or_fixed_regs, + REGNO (cheap))) { rtx dest, newpat; rtx pat = PATTERN (insn); @@ -1414,8 +1406,7 @@ insert_one_insn (struct insn_chain *chain, int before_p, int code, rtx pat) /* Registers that are set in CHAIN->INSN live in the new insn. (Unless there is a REG_UNUSED note for them, but we don't look for them here.) */ - note_stores (PATTERN (chain->insn), add_stored_regs, - &new_chain->live_throughout); + note_stores (chain->insn, add_stored_regs, &new_chain->live_throughout); CLEAR_REG_SET (&new_chain->dead_or_set); if (chain->insn == BB_END (BASIC_BLOCK_FOR_FN (cfun, chain->block))) BB_END (BASIC_BLOCK_FOR_FN (cfun, chain->block)) = new_chain->insn; diff --git a/gcc/calls.c b/gcc/calls.c index 567959956..2638752ad 100644 --- a/gcc/calls.c +++ b/gcc/calls.c @@ -346,7 +346,8 @@ prepare_call_address (tree fndecl_or_type, rtx funexp, rtx static_chain_value, It is zero if this call doesn't want a structure value. NEXT_ARG_REG is the rtx that results from executing - targetm.calls.function_arg (&args_so_far, VOIDmode, void_type_node, true) + targetm.calls.function_arg (&args_so_far, + function_arg_info::end_marker ()); just after all the args have had their registers assigned. This could be whatever you like, but normally it is the first arg-register beyond those used for args in this call, @@ -897,13 +898,12 @@ call_expr_flags (const_tree t) return flags; } -/* Return true if TYPE should be passed by invisible reference. */ +/* Return true if ARG should be passed by invisible reference. */ bool -pass_by_reference (CUMULATIVE_ARGS *ca, machine_mode mode, - tree type, bool named_arg) +pass_by_reference (CUMULATIVE_ARGS *ca, function_arg_info arg) { - if (type) + if (tree type = arg.type) { /* If this type contains non-trivial constructors, then it is forbidden for the middle-end to create any new copies. */ @@ -911,33 +911,55 @@ pass_by_reference (CUMULATIVE_ARGS *ca, machine_mode mode, return true; /* GCC post 3.4 passes *all* variable sized types by reference. */ - if (!TYPE_SIZE (type) || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) + if (!TYPE_SIZE (type) || !poly_int_tree_p (TYPE_SIZE (type))) return true; /* If a record type should be passed the same as its first (and only) member, use the type and mode of that member. 
*/ if (TREE_CODE (type) == RECORD_TYPE && TYPE_TRANSPARENT_AGGR (type)) { - type = TREE_TYPE (first_field (type)); - mode = TYPE_MODE (type); + arg.type = TREE_TYPE (first_field (type)); + arg.mode = TYPE_MODE (arg.type); } } - return targetm.calls.pass_by_reference (pack_cumulative_args (ca), mode, - type, named_arg); + return targetm.calls.pass_by_reference (pack_cumulative_args (ca), arg); } -/* Return true if TYPE, which is passed by reference, should be callee +/* Return true if TYPE should be passed by reference when passed to + the "..." arguments of a function. */ + +bool +pass_va_arg_by_reference (tree type) +{ + return pass_by_reference (NULL, function_arg_info (type, /*named=*/false)); +} + +/* Decide whether ARG, which occurs in the state described by CA, + should be passed by reference. Return true if so and update + ARG accordingly. */ + +bool +apply_pass_by_reference_rules (CUMULATIVE_ARGS *ca, function_arg_info &arg) +{ + if (pass_by_reference (ca, arg)) + { + arg.type = build_pointer_type (arg.type); + arg.mode = TYPE_MODE (arg.type); + return true; + } + return false; +} + +/* Return true if ARG, which is passed by reference, should be callee copied instead of caller copied. */ bool -reference_callee_copied (CUMULATIVE_ARGS *ca, machine_mode mode, - tree type, bool named_arg) +reference_callee_copied (CUMULATIVE_ARGS *ca, const function_arg_info &arg) { - if (type && TREE_ADDRESSABLE (type)) + if (arg.type && TREE_ADDRESSABLE (arg.type)) return false; - return targetm.calls.callee_copies (pack_cumulative_args (ca), mode, type, - named_arg); + return targetm.calls.callee_copies (pack_cumulative_args (ca), arg); } @@ -1350,7 +1372,6 @@ maybe_warn_alloc_args_overflow (tree fn, tree exp, tree args[2], int idx[2]) location_t loc = EXPR_LOCATION (exp); tree fntype = fn ? TREE_TYPE (fn) : TREE_TYPE (TREE_TYPE (exp)); - built_in_function fncode = fn ? DECL_FUNCTION_CODE (fn) : BUILT_IN_NONE; bool warned = false; /* Validate each argument individually. */ @@ -1376,11 +1397,10 @@ maybe_warn_alloc_args_overflow (tree fn, tree exp, tree args[2], int idx[2]) friends. Also avoid issuing the warning for calls to function named "alloca". */ - if ((fncode == BUILT_IN_ALLOCA - && IDENTIFIER_LENGTH (DECL_NAME (fn)) != 6) - || (fncode != BUILT_IN_ALLOCA - && !lookup_attribute ("returns_nonnull", - TYPE_ATTRIBUTES (fntype)))) + if (fn && fndecl_built_in_p (fn, BUILT_IN_ALLOCA) + ? IDENTIFIER_LENGTH (DECL_NAME (fn)) != 6 + : !lookup_attribute ("returns_nonnull", + TYPE_ATTRIBUTES (fntype))) warned = warning_at (loc, OPT_Walloc_zero, "%Kargument %i value is zero", exp, idx[i] + 1); @@ -1395,7 +1415,7 @@ maybe_warn_alloc_args_overflow (tree fn, tree exp, tree args[2], int idx[2]) && fn && !args[1] && lang_GNU_CXX () - && DECL_IS_OPERATOR_NEW (fn) + && DECL_IS_OPERATOR_NEW_P (fn) && integer_all_onesp (args[i])) continue; @@ -1989,15 +2009,13 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, with those made by function.c. */ /* See if this argument should be passed by invisible reference. 
*/ - if (pass_by_reference (args_so_far_pnt, TYPE_MODE (type), - type, argpos < n_named_args)) + function_arg_info orig_arg (type, argpos < n_named_args); + if (pass_by_reference (args_so_far_pnt, orig_arg)) { bool callee_copies; tree base = NULL_TREE; - callee_copies - = reference_callee_copied (args_so_far_pnt, TYPE_MODE (type), - type, argpos < n_named_args); + callee_copies = reference_callee_copied (args_so_far_pnt, orig_arg); /* If we're compiling a thunk, pass through invisible references instead of making a copy. */ @@ -2118,8 +2136,8 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, targetm.calls.warn_parameter_passing_abi (args_so_far, type); - args[i].reg = targetm.calls.function_arg (args_so_far, mode, type, - argpos < n_named_args); + function_arg_info arg (type, mode, argpos < n_named_args); + args[i].reg = targetm.calls.function_arg (args_so_far, arg); if (args[i].reg && CONST_INT_P (args[i].reg)) args[i].reg = NULL; @@ -2129,17 +2147,14 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, arguments have to go into the incoming registers. */ if (targetm.calls.function_incoming_arg != targetm.calls.function_arg) args[i].tail_call_reg - = targetm.calls.function_incoming_arg (args_so_far, mode, type, - argpos < n_named_args); + = targetm.calls.function_incoming_arg (args_so_far, arg); else args[i].tail_call_reg = args[i].reg; if (args[i].reg) - args[i].partial - = targetm.calls.arg_partial_bytes (args_so_far, mode, type, - argpos < n_named_args); + args[i].partial = targetm.calls.arg_partial_bytes (args_so_far, arg); - args[i].pass_on_stack = targetm.calls.must_pass_in_stack (mode, type); + args[i].pass_on_stack = targetm.calls.must_pass_in_stack (arg); /* If FUNCTION_ARG returned a (parallel [(expr_list (nil) ...) ...]), it means that we are to pass this arg in the register(s) designated @@ -2188,8 +2203,13 @@ initialize_argument_information (int num_actuals ATTRIBUTE_UNUSED, /* Increment ARGS_SO_FAR, which has info about which arg-registers have been used, etc. */ - targetm.calls.function_arg_advance (args_so_far, TYPE_MODE (type), - type, argpos < n_named_args); + /* ??? Traditionally we've passed TYPE_MODE here, instead of the + promoted_mode used for function_arg above. However, the + corresponding handling of incoming arguments in function.c + does pass the promoted mode. */ + function_arg_info arg_to_skip (type, TYPE_MODE (type), + argpos < n_named_args); + targetm.calls.function_arg_advance (args_so_far, arg_to_skip); /* Store argument values for functions decorated with attribute alloc_size. */ @@ -4222,14 +4242,11 @@ expand_call (tree exp, rtx target, int ignore) /* Set up next argument register. For sibling calls on machines with register windows this should be the incoming register. 
*/ if (pass == 0) - next_arg_reg = targetm.calls.function_incoming_arg (args_so_far, - VOIDmode, - void_type_node, - true); + next_arg_reg = targetm.calls.function_incoming_arg + (args_so_far, function_arg_info::end_marker ()); else - next_arg_reg = targetm.calls.function_arg (args_so_far, - VOIDmode, void_type_node, - true); + next_arg_reg = targetm.calls.function_arg + (args_so_far, function_arg_info::end_marker ()); if (pass == 1 && (return_flags & ERF_RETURNS_ARG)) { @@ -4846,10 +4863,9 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, argvec[count].mode = Pmode; argvec[count].partial = 0; - argvec[count].reg = targetm.calls.function_arg (args_so_far, - Pmode, NULL_TREE, true); - gcc_assert (targetm.calls.arg_partial_bytes (args_so_far, Pmode, - NULL_TREE, 1) == 0); + function_arg_info ptr_arg (Pmode, /*named=*/true); + argvec[count].reg = targetm.calls.function_arg (args_so_far, ptr_arg); + gcc_assert (targetm.calls.arg_partial_bytes (args_so_far, ptr_arg) == 0); locate_and_pad_parm (Pmode, NULL_TREE, #ifdef STACK_PARMS_IN_REG_PARM_AREA @@ -4864,7 +4880,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, || reg_parm_stack_space > 0) args_size.constant += argvec[count].locate.size.constant; - targetm.calls.function_arg_advance (args_so_far, Pmode, (tree) 0, true); + targetm.calls.function_arg_advance (args_so_far, ptr_arg); count++; } @@ -4885,11 +4901,11 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, && !(CONSTANT_P (val) && targetm.legitimate_constant_p (mode, val))) val = force_operand (val, NULL_RTX); - if (pass_by_reference (&args_so_far_v, mode, NULL_TREE, 1)) + function_arg_info orig_arg (mode, /*named=*/true); + if (pass_by_reference (&args_so_far_v, orig_arg)) { rtx slot; - int must_copy - = !reference_callee_copied (&args_so_far_v, mode, NULL_TREE, 1); + int must_copy = !reference_callee_copied (&args_so_far_v, orig_arg); /* If this was a CONST function, it is now PURE since it now reads memory. */ @@ -4927,13 +4943,13 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, } mode = promote_function_mode (NULL_TREE, mode, &unsigned_p, NULL_TREE, 0); + function_arg_info arg (mode, /*named=*/true); argvec[count].mode = mode; argvec[count].value = convert_modes (mode, GET_MODE (val), val, unsigned_p); - argvec[count].reg = targetm.calls.function_arg (args_so_far, mode, - NULL_TREE, true); + argvec[count].reg = targetm.calls.function_arg (args_so_far, arg); argvec[count].partial - = targetm.calls.arg_partial_bytes (args_so_far, mode, NULL_TREE, 1); + = targetm.calls.arg_partial_bytes (args_so_far, arg); if (argvec[count].reg == 0 || argvec[count].partial != 0 @@ -4959,7 +4975,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, known_le (GET_MODE_SIZE (mode), UNITS_PER_WORD)); #endif - targetm.calls.function_arg_advance (args_so_far, mode, (tree) 0, true); + targetm.calls.function_arg_advance (args_so_far, arg); } /* If this machine requires an external definition for library @@ -5302,7 +5318,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value, original_args_size.constant, args_size.constant, struct_value_size, targetm.calls.function_arg (args_so_far, - VOIDmode, void_type_node, true), + function_arg_info::end_marker ()), valreg, old_inhibit_defer_pop + 1, call_fusage, flags, args_so_far); @@ -5815,22 +5831,21 @@ store_one_arg (struct arg_data *arg, rtx argblock, int flags, return sibcall_failure; } -/* Nonzero if we do not know how to pass TYPE solely in registers. 
*/ +/* Nonzero if we do not know how to pass ARG solely in registers. */ bool -must_pass_in_stack_var_size (machine_mode mode ATTRIBUTE_UNUSED, - const_tree type) +must_pass_in_stack_var_size (const function_arg_info &arg) { - if (!type) + if (!arg.type) return false; /* If the type has variable size... */ - if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) + if (!poly_int_tree_p (TYPE_SIZE (arg.type))) return true; /* If the type is marked as addressable (it is required to be constructed into the stack)... */ - if (TREE_ADDRESSABLE (type)) + if (TREE_ADDRESSABLE (arg.type)) return true; return false; @@ -5841,33 +5856,43 @@ must_pass_in_stack_var_size (machine_mode mode ATTRIBUTE_UNUSED, /* ??? Should be able to merge these two by examining BLOCK_REG_PADDING. */ bool -must_pass_in_stack_var_size_or_pad (machine_mode mode, const_tree type) +must_pass_in_stack_var_size_or_pad (const function_arg_info &arg) { - if (!type) + if (!arg.type) return false; /* If the type has variable size... */ - if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) + if (TREE_CODE (TYPE_SIZE (arg.type)) != INTEGER_CST) return true; /* If the type is marked as addressable (it is required to be constructed into the stack)... */ - if (TREE_ADDRESSABLE (type)) + if (TREE_ADDRESSABLE (arg.type)) return true; - if (TYPE_EMPTY_P (type)) + if (TYPE_EMPTY_P (arg.type)) return false; /* If the padding and mode of the type is such that a copy into a register would put it into the wrong part of the register. */ - if (mode == BLKmode - && int_size_in_bytes (type) % (PARM_BOUNDARY / BITS_PER_UNIT) - && (targetm.calls.function_arg_padding (mode, type) + if (arg.mode == BLKmode + && int_size_in_bytes (arg.type) % (PARM_BOUNDARY / BITS_PER_UNIT) + && (targetm.calls.function_arg_padding (arg.mode, arg.type) == (BYTES_BIG_ENDIAN ? PAD_UPWARD : PAD_DOWNWARD))) return true; return false; } +/* Return true if TYPE must be passed on the stack when passed to + the "..." arguments of a function. */ + +bool +must_pass_va_arg_in_stack (tree type) +{ + function_arg_info arg (type, /*named=*/false); + return targetm.calls.must_pass_in_stack (arg); +} + /* Tell the garbage collector about GTY markers in this source file. */ #include "gt-calls.h" diff --git a/gcc/calls.h b/gcc/calls.h index 128bb5130..01ab3905a 100644 --- a/gcc/calls.h +++ b/gcc/calls.h @@ -20,23 +20,108 @@ along with GCC; see the file COPYING3. If not see #ifndef GCC_CALLS_H #define GCC_CALLS_H +/* Describes a function argument. + + Each argument conceptually has a gimple-level type. Usually this type + is available directly as a tree via the TYPE field, but when calling + libgcc support functions it might instead be inferred from a mode, + in which case the type isn't available directly. + + This gimple-level type might go through promotion before being passed to + the target function. Depending on the context, the MODE field is either + the mode of the gimple-level type (whether explicitly given or not) + or the mode after promotion has been performed. */ +class function_arg_info +{ +public: + function_arg_info () : type (NULL_TREE), mode (VOIDmode), named (false) {} + + /* Initialize an argument of mode MODE, either before or after promotion. */ + function_arg_info (machine_mode mode, bool named) + : type (NULL_TREE), mode (mode), named (named) + {} + + /* Initialize an unpromoted argument of type TYPE. */ + function_arg_info (tree type, bool named) + : type (type), mode (TYPE_MODE (type)), named (named) + {} + + /* Initialize an argument with explicit properties. 
*/ + function_arg_info (tree type, machine_mode mode, bool named) + : type (type), mode (mode), named (named) + {} + + /* Return true if the gimple-level type is an aggregate. */ + bool aggregate_type_p () const { return type && AGGREGATE_TYPE_P (type); } + + /* Return the size of the gimple-level type, or -1 if the size is + variable or otherwise not representable as a poly_int64. + + Use this function when MODE is the mode of the type before promotion, + or in any context if the target never promotes function arguments. */ + poly_int64 type_size_in_bytes () const + { + if (type) + return int_size_in_bytes (type); + return GET_MODE_SIZE (mode); + } + + /* Return the size of the argument after promotion, or -1 if the size + is variable or otherwise not representable as a poly_int64. + + Use this function when MODE is the mode of the type after promotion. */ + poly_int64 promoted_size_in_bytes () const + { + if (mode == BLKmode) + return int_size_in_bytes (type); + return GET_MODE_SIZE (mode); + } + + /* True if the argument represents the end of the argument list, + as returned by end_marker (). */ + bool end_marker_p () const { return mode == VOIDmode; } + + /* Return a function_arg_info that represents the end of the + argument list. */ + static function_arg_info end_marker () + { + return function_arg_info (void_type_node, /*named=*/true); + } + + /* The type of the argument, or null if not known (which is true for + libgcc support functions). */ + tree type; + + /* The mode of the argument. Depending on context, this might be + the mode of the argument type or the mode after promotion. */ + machine_mode mode; + + /* True if the argument is treated as a named argument, false if it is + treated as an unnamed variadic argument (i.e. one passed through + "..."). See also TARGET_STRICT_ARGUMENT_NAMING. */ + unsigned int named : 1; +}; + extern int flags_from_decl_or_type (const_tree); extern int call_expr_flags (const_tree); extern int setjmp_call_p (const_tree); extern bool gimple_maybe_alloca_call_p (const gimple *); extern bool gimple_alloca_call_p (const gimple *); extern bool alloca_call_p (const_tree); -extern bool must_pass_in_stack_var_size (machine_mode, const_tree); -extern bool must_pass_in_stack_var_size_or_pad (machine_mode, const_tree); +extern bool must_pass_in_stack_var_size (const function_arg_info &); +extern bool must_pass_in_stack_var_size_or_pad (const function_arg_info &); +extern bool must_pass_va_arg_in_stack (tree); extern rtx prepare_call_address (tree, rtx, rtx, rtx *, int, int); extern bool shift_return_value (machine_mode, bool, rtx); extern rtx expand_call (tree, rtx, int); extern void fixup_tail_calls (void); -extern bool pass_by_reference (CUMULATIVE_ARGS *, machine_mode, - tree, bool); -extern bool reference_callee_copied (CUMULATIVE_ARGS *, machine_mode, - tree, bool); +extern bool pass_by_reference (CUMULATIVE_ARGS *, function_arg_info); +extern bool pass_va_arg_by_reference (tree); +extern bool apply_pass_by_reference_rules (CUMULATIVE_ARGS *, + function_arg_info &); +extern bool reference_callee_copied (CUMULATIVE_ARGS *, + const function_arg_info &); extern void maybe_warn_alloc_args_overflow (tree, tree, tree[2], int[2]); extern tree get_attr_nonstring_decl (tree, tree * = NULL); extern void maybe_warn_nonstring_arg (tree, tree); diff --git a/gcc/cfgcleanup.c b/gcc/cfgcleanup.c index 8c464ec79..ff7f014da 100644 --- a/gcc/cfgcleanup.c +++ b/gcc/cfgcleanup.c @@ -54,6 +54,7 @@ along with GCC; see the file COPYING3. 
If not see #include "dbgcnt.h" #include "rtl-iter.h" #include "regs.h" +#include "function-abi.h" #define FORWARDER_BLOCK_P(BB) ((BB)->flags & BB_FORWARDER_BLOCK) @@ -1230,12 +1231,13 @@ old_insns_match_p (int mode ATTRIBUTE_UNUSED, rtx_insn *i1, rtx_insn *i2) } } - HARD_REG_SET i1_used, i2_used; + HARD_REG_SET i1_used = insn_callee_abi (i1).full_reg_clobbers (); + HARD_REG_SET i2_used = insn_callee_abi (i2).full_reg_clobbers (); + /* ??? This preserves traditional behavior; it might not be needed. */ + i1_used |= fixed_reg_set; + i2_used |= fixed_reg_set; - get_call_reg_set_usage (i1, &i1_used, call_used_reg_set); - get_call_reg_set_usage (i2, &i2_used, call_used_reg_set); - - if (!hard_reg_set_equal_p (i1_used, i2_used)) + if (i1_used != i2_used) return dir_none; } @@ -1269,7 +1271,7 @@ old_insns_match_p (int mode ATTRIBUTE_UNUSED, rtx_insn *i1, rtx_insn *i2) if (REG_NOTE_KIND (note) == REG_DEAD && STACK_REG_P (XEXP (note, 0))) SET_HARD_REG_BIT (i2_regset, REGNO (XEXP (note, 0))); - if (!hard_reg_set_equal_p (i1_regset, i2_regset)) + if (i1_regset != i2_regset) return dir_none; } #endif diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c index 4ae8e3b32..218414b39 100644 --- a/gcc/cfgexpand.c +++ b/gcc/cfgexpand.c @@ -2874,6 +2874,15 @@ asm_clobber_reg_is_valid (int regno, int nregs, const char *regname) error ("PIC register clobbered by %qs in %", regname); is_valid = false; } + else if (!in_hard_reg_set_p + (accessible_reg_set, reg_raw_mode[regno], regno)) + { + /* ??? Diagnose during gimplification? */ + error ("the register %qs cannot be clobbered in %" + " for the current target", regname); + is_valid = false; + } + /* Clobbering the stack pointer register is deprecated. GCC expects the value of the stack pointer after an asm statement to be the same as it was before, so no asm can validly clobber the stack pointer in @@ -3865,7 +3874,6 @@ expand_gimple_stmt (gimple *stmt) /* If we want exceptions for non-call insns, any may_trap_p instruction may throw. */ && GET_CODE (PATTERN (insn)) != CLOBBER - && GET_CODE (PATTERN (insn)) != CLOBBER_HIGH && GET_CODE (PATTERN (insn)) != USE && insn_could_throw_p (insn)) make_reg_eh_region_note (insn, 0, lp_nr); diff --git a/gcc/cfgloopanal.c b/gcc/cfgloopanal.c index 6dbe96f9d..3388da7dd 100644 --- a/gcc/cfgloopanal.c +++ b/gcc/cfgloopanal.c @@ -353,7 +353,7 @@ init_set_costs (void) && !fixed_regs[i]) { target_avail_regs++; - if (call_used_regs[i]) + if (call_used_or_fixed_reg_p (i)) target_clobbered_regs++; } diff --git a/gcc/cgraph.c b/gcc/cgraph.c index 62f1afa2a..9dca43031 100644 --- a/gcc/cgraph.c +++ b/gcc/cgraph.c @@ -1883,7 +1883,7 @@ cgraph_node::local_info (tree decl) /* Return local info for the compiled function. */ cgraph_rtl_info * -cgraph_node::rtl_info (tree decl) +cgraph_node::rtl_info (const_tree decl) { gcc_assert (TREE_CODE (decl) == FUNCTION_DECL); cgraph_node *node = get (decl); @@ -1898,7 +1898,10 @@ cgraph_node::rtl_info (tree decl) return NULL; /* Allocate if it doesn't exist. */ if (node->rtl == NULL) - node->rtl = ggc_cleared_alloc (); + { + node->rtl = ggc_cleared_alloc (); + node->rtl->function_used_regs = reg_class_contents[ALL_REGS]; + } return node->rtl; } diff --git a/gcc/cgraph.h b/gcc/cgraph.h index 10d1a2c6f..ad6720a4b 100644 --- a/gcc/cgraph.h +++ b/gcc/cgraph.h @@ -1347,7 +1347,7 @@ public: static cgraph_local_info *local_info (tree decl); /* Return local info for the compiled function. 
*/ - static struct cgraph_rtl_info *rtl_info (tree); + static struct cgraph_rtl_info *rtl_info (const_tree); /* Return the cgraph node that has ASMNAME for its DECL_ASSEMBLER_NAME. Return NULL if there's no such node. */ diff --git a/gcc/cgraphclones.c b/gcc/cgraphclones.c index cd3f585bd..43423234b 100644 --- a/gcc/cgraphclones.c +++ b/gcc/cgraphclones.c @@ -225,10 +225,7 @@ build_function_decl_skip_args (tree orig_decl, bitmap args_to_skip, if (fndecl_built_in_p (new_decl) && args_to_skip && !bitmap_empty_p (args_to_skip)) - { - DECL_BUILT_IN_CLASS (new_decl) = NOT_BUILT_IN; - DECL_FUNCTION_CODE (new_decl) = (enum built_in_function) 0; - } + set_decl_built_in_function (new_decl, NOT_BUILT_IN, 0); /* The FE might have information and assumptions about the other arguments. */ DECL_LANG_SPECIFIC (new_decl) = NULL; @@ -415,7 +412,7 @@ dump_callgraph_transformation (const cgraph_node *original, If the new node is being inlined into another one, NEW_INLINED_TO should be the outline function the new one is (even indirectly) inlined to. All hooks - will see this in node's global.inlined_to, when invoked. Can be NULL if the + will see this in node's inlined_to, when invoked. Can be NULL if the node is not inlined. */ cgraph_node * @@ -1056,7 +1053,7 @@ cgraph_node::create_version_clone_with_body location_t saved_loc = input_location; tree v = TREE_VALUE (target_attributes); input_location = DECL_SOURCE_LOCATION (new_decl); - bool r = targetm.target_option.valid_attribute_p (new_decl, NULL, v, 0); + bool r = targetm.target_option.valid_attribute_p (new_decl, NULL, v, 1); input_location = saved_loc; if (!r) return NULL; diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c index dee6becc7..ddf298583 100644 --- a/gcc/cgraphunit.c +++ b/gcc/cgraphunit.c @@ -1793,7 +1793,6 @@ cgraph_node::expand_thunk (bool output_asm_thunks, bool force_gimple_thunk) && targetm.asm_out.can_output_mi_thunk (thunk_fndecl, fixed_offset, virtual_value, alias)) { - const char *fnname; tree fn_block; tree restype = TREE_TYPE (TREE_TYPE (thunk_fndecl)); @@ -1817,7 +1816,6 @@ cgraph_node::expand_thunk (bool output_asm_thunks, bool force_gimple_thunk) = build_decl (DECL_SOURCE_LOCATION (thunk_fndecl), RESULT_DECL, 0, restype); DECL_CONTEXT (DECL_RESULT (thunk_fndecl)) = thunk_fndecl; - fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk_fndecl)); /* The back end expects DECL_INITIAL to contain a BLOCK, so we create one. 
*/ @@ -1831,12 +1829,10 @@ cgraph_node::expand_thunk (bool output_asm_thunks, bool force_gimple_thunk) insn_locations_init (); set_curr_insn_location (DECL_SOURCE_LOCATION (thunk_fndecl)); prologue_location = curr_insn_location (); - assemble_start_function (thunk_fndecl, fnname); targetm.asm_out.output_mi_thunk (asm_out_file, thunk_fndecl, fixed_offset, virtual_value, alias); - assemble_end_function (thunk_fndecl, fnname); insn_locations_finalize (); init_insn_lengths (); free_after_compilation (cfun); diff --git a/gcc/cif-code.def b/gcc/cif-code.def index 3356377a1..a154f24f1 100644 --- a/gcc/cif-code.def +++ b/gcc/cif-code.def @@ -70,8 +70,12 @@ DEFCIFCODE(LARGE_STACK_FRAME_GROWTH_LIMIT, CIF_FINAL_NORMAL, N_("--param large-stack-frame-growth limit reached")) DEFCIFCODE(MAX_INLINE_INSNS_SINGLE_LIMIT, CIF_FINAL_NORMAL, N_("--param max-inline-insns-single limit reached")) +DEFCIFCODE(MAX_INLINE_INSNS_SINGLE_O2_LIMIT, CIF_FINAL_NORMAL, + N_("--param max-inline-insns-single-O2 limit reached")) DEFCIFCODE(MAX_INLINE_INSNS_AUTO_LIMIT, CIF_FINAL_NORMAL, N_("--param max-inline-insns-auto limit reached")) +DEFCIFCODE(MAX_INLINE_INSNS_AUTO_O2_LIMIT, CIF_FINAL_NORMAL, + N_("--param max-inline-insns-auto-O2 limit reached")) DEFCIFCODE(INLINE_UNIT_GROWTH_LIMIT, CIF_FINAL_NORMAL, N_("--param inline-unit-growth limit reached")) @@ -83,6 +87,10 @@ DEFCIFCODE(RECURSIVE_INLINING, CIF_FINAL_NORMAL, DEFCIFCODE(UNLIKELY_CALL, CIF_FINAL_NORMAL, N_("call is unlikely and code size would grow")) +/* Call is considered never executed. */ +DEFCIFCODE(NEVER_CALL, CIF_FINAL_NORMAL, + N_("call is considered never executed and code size would grow")) + /* Function is not declared as inline. */ DEFCIFCODE(NOT_DECLARED_INLINED, CIF_FINAL_NORMAL, N_("function not declared inline and code size would grow")) diff --git a/gcc/combine-stack-adj.c b/gcc/combine-stack-adj.c index 3638a1b10..d14d59abc 100644 --- a/gcc/combine-stack-adj.c +++ b/gcc/combine-stack-adj.c @@ -133,7 +133,6 @@ single_set_for_csa (rtx_insn *insn) && SET_SRC (this_rtx) == SET_DEST (this_rtx)) ; else if (GET_CODE (this_rtx) != CLOBBER - && GET_CODE (this_rtx) != CLOBBER_HIGH && GET_CODE (this_rtx) != USE) return NULL_RTX; } diff --git a/gcc/combine.c b/gcc/combine.c index b9d674c96..a425f0ca6 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -571,7 +571,6 @@ find_single_use_1 (rtx dest, rtx *loc) case SYMBOL_REF: CASE_CONST_ANY: case CLOBBER: - case CLOBBER_HIGH: return 0; case SET: @@ -1224,8 +1223,7 @@ combine_instructions (rtx_insn *f, unsigned int nregs) subst_low_luid = DF_INSN_LUID (insn); subst_insn = insn; - note_stores (PATTERN (insn), set_nonzero_bits_and_sign_copies, - insn); + note_stores (insn, set_nonzero_bits_and_sign_copies, insn); record_dead_and_set_regs (insn); if (AUTO_INC_DEC) @@ -1763,9 +1761,6 @@ set_nonzero_bits_and_sign_copies (rtx x, const_rtx set, void *data) return; } - /* Should not happen as we only using pseduo registers. */ - gcc_assert (GET_CODE (set) != CLOBBER_HIGH); - /* If this register is being initialized using itself, and the register is uninitialized in this basic block, and there are no LOG_LINKS which set the register, then part of the @@ -1924,7 +1919,6 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3, rtx_insn *pred ATTRIBUTE_UNUSED, /* We can ignore CLOBBERs. 
*/ case CLOBBER: - case CLOBBER_HIGH: break; case SET: @@ -2439,7 +2433,7 @@ likely_spilled_retval_p (rtx_insn *insn) info.mask = mask; for (p = PREV_INSN (use); info.mask && p != insn; p = PREV_INSN (p)) if (INSN_P (p)) - note_stores (PATTERN (p), likely_spilled_retval_1, &info); + note_stores (p, likely_spilled_retval_1, &info); mask = info.mask; /* Check if any of the (probably) live return value registers is @@ -2595,8 +2589,6 @@ is_parallel_of_n_reg_sets (rtx pat, int n) if (XEXP (XVECEXP (pat, 0, i), 0) == const0_rtx) return false; break; - case CLOBBER_HIGH: - break; default: return false; } @@ -2897,8 +2889,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, for (i = 0; ok && i < XVECLEN (p2, 0); i++) { if ((GET_CODE (XVECEXP (p2, 0, i)) == SET - || GET_CODE (XVECEXP (p2, 0, i)) == CLOBBER - || GET_CODE (XVECEXP (p2, 0, i)) == CLOBBER_HIGH) + || GET_CODE (XVECEXP (p2, 0, i)) == CLOBBER) && reg_overlap_mentioned_p (SET_DEST (PATTERN (i3)), SET_DEST (XVECEXP (p2, 0, i)))) ok = false; @@ -4741,8 +4732,8 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, been made to this insn. The order is important, because newi2pat can affect nonzero_bits of newpat. */ if (newi2pat) - note_stores (newi2pat, set_nonzero_bits_and_sign_copies, NULL); - note_stores (newpat, set_nonzero_bits_and_sign_copies, NULL); + note_pattern_stores (newi2pat, set_nonzero_bits_and_sign_copies, NULL); + note_pattern_stores (newpat, set_nonzero_bits_and_sign_copies, NULL); } if (undobuf.other_insn != NULL_RTX) @@ -13409,15 +13400,6 @@ record_dead_and_set_regs_1 (rtx dest, const_rtx setter, void *data) ? SET_SRC (setter) : gen_lowpart (GET_MODE (dest), SET_SRC (setter))); - else if (GET_CODE (setter) == CLOBBER_HIGH) - { - reg_stat_type *rsp = ®_stat[REGNO (dest)]; - if (rsp->last_set_value - && reg_is_clobbered_by_clobber_high - (REGNO (dest), GET_MODE (rsp->last_set_value), - XEXP (setter, 0))) - record_value_for_reg (dest, NULL, NULL_RTX); - } else record_value_for_reg (dest, record_dead_insn, NULL_RTX); } @@ -13487,10 +13469,10 @@ record_dead_and_set_regs (rtx_insn *insn) the return value register is set at this LUID. We could still replace a register with the return value from the wrong subroutine call! */ - note_stores (PATTERN (insn), record_dead_and_set_regs_1, NULL_RTX); + note_stores (insn, record_dead_and_set_regs_1, NULL_RTX); } else - note_stores (PATTERN (insn), record_dead_and_set_regs_1, insn); + note_stores (insn, record_dead_and_set_regs_1, insn); } /* If a SUBREG has the promoted bit set, it is in fact a property of the @@ -13853,10 +13835,6 @@ reg_dead_at_p_1 (rtx dest, const_rtx x, void *data ATTRIBUTE_UNUSED) if (!REG_P (dest)) return; - if (GET_CODE (x) == CLOBBER_HIGH - && !reg_is_clobbered_by_clobber_high (reg_dead_reg, XEXP (x, 0))) - return; - regno = REGNO (dest); endregno = END_REGNO (dest); if (reg_dead_endregno > regno && reg_dead_regno < endregno) @@ -13904,7 +13882,7 @@ reg_dead_at_p (rtx reg, rtx_insn *insn) if (find_regno_note (insn, REG_UNUSED, reg_dead_regno)) return 1; - note_stores (PATTERN (insn), reg_dead_at_p_1, NULL); + note_stores (insn, reg_dead_at_p_1, NULL); if (reg_dead_flag) return reg_dead_flag == 1 ? 1 : 0; diff --git a/gcc/common.opt b/gcc/common.opt index 0bdf51dd8..7dee534b8 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1080,16 +1080,16 @@ Common Report Var(flag_branch_probabilities) Optimization Use profiling information for branch probabilities. 
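The combine.c hunks above follow an rtlanal.c interface change carried by this series: note_stores now takes the insn itself, while note_pattern_stores keeps the pattern-based walk for callers such as the newpat/newi2pat updates. As a rough sketch of the new calling convention (the callback and helper names below are invented for illustration, assume the usual GCC internal headers such as rtl.h and hard-reg-set.h, and are not part of any patch here):

  /* Record the hard registers stored by an insn.  */
  static void
  record_stored_hard_regs (rtx dest, const_rtx setter ATTRIBUTE_UNUSED,
                           void *data)
  {
    HARD_REG_SET *stored = (HARD_REG_SET *) data;
    if (REG_P (dest) && HARD_REGISTER_P (dest))
      add_to_hard_reg_set (stored, GET_MODE (dest), REGNO (dest));
  }

  static void
  collect_stores (rtx_insn *insn, HARD_REG_SET *stored)
  {
    /* Insn-based interface, replacing note_stores (PATTERN (insn), ...).  */
    note_stores (insn, record_stored_hard_regs, stored);
  }

Callers that only have a bare pattern in hand, as in the newi2pat/newpat case above, use note_pattern_stores with the same callback signature.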
fbranch-target-load-optimize -Common Report Var(flag_branch_target_load_optimize) Optimization -Perform branch target load optimization before prologue / epilogue threading. +Common Ignore +Does nothing. Preserved for backward compatibility. fbranch-target-load-optimize2 -Common Report Var(flag_branch_target_load_optimize2) Optimization -Perform branch target load optimization after prologue / epilogue threading. +Common Ignore +Does nothing. Preserved for backward compatibility. fbtr-bb-exclusive -Common Report Var(flag_btr_bb_exclusive) Optimization -Restrict target load migration not to re-use registers in any basic block. +Common Ignore +Does nothing. Preserved for backward compatibility. fcall-saved- Common Joined RejectNegative Var(common_deferred_options) Defer @@ -1289,6 +1289,26 @@ Enum(diagnostic_color_rule) String(always) Value(DIAGNOSTICS_COLOR_YES) EnumValue Enum(diagnostic_color_rule) String(auto) Value(DIAGNOSTICS_COLOR_AUTO) +fdiagnostics-urls= +Driver Common Joined RejectNegative Var(flag_diagnostics_show_urls) Enum(diagnostic_url_rule) Init(DIAGNOSTICS_URL_AUTO) +-fdiagnostics-urls=[never|always|auto] Embed URLs in diagnostics. + +; Required for these enum values. +SourceInclude +diagnostic-url.h + +Enum +Name(diagnostic_url_rule) Type(int) + +EnumValue +Enum(diagnostic_url_rule) String(never) Value(DIAGNOSTICS_URL_NO) + +EnumValue +Enum(diagnostic_url_rule) String(always) Value(DIAGNOSTICS_URL_YES) + +EnumValue +Enum(diagnostic_url_rule) String(auto) Value(DIAGNOSTICS_URL_AUTO) + fdiagnostics-format= Common Joined RejectNegative Enum(diagnostics_output_format) -fdiagnostics-format=[text|json] Select output format. @@ -1963,7 +1983,7 @@ Common Var(flag_dce) Init(1) Optimization Use the RTL dead code elimination pass. fdse -Common Var(flag_dse) Init(1) Optimization +Common Var(flag_dse) Init(0) Optimization Use the RTL dead store elimination pass. freschedule-modulo-scheduled-loops diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c index bab3ab3fa..07c032539 100644 --- a/gcc/common/config/aarch64/aarch64-common.c +++ b/gcc/common/config/aarch64/aarch64-common.c @@ -170,9 +170,9 @@ aarch64_handle_option (struct gcc_options *opts, struct aarch64_option_extension { const char *const name; - const unsigned long flag_canonical; - const unsigned long flags_on; - const unsigned long flags_off; + const uint64_t flag_canonical; + const uint64_t flags_on; + const uint64_t flags_off; const bool is_synthetic; }; @@ -201,14 +201,14 @@ struct processor_name_to_arch { const std::string processor_name; const enum aarch64_arch arch; - const unsigned long flags; + const uint64_t flags; }; struct arch_to_arch_name { const enum aarch64_arch arch; const std::string arch_name; - const unsigned long flags; + const uint64_t flags; }; /* Map processor names to the architecture revision they implement and @@ -238,7 +238,7 @@ static const struct arch_to_arch_name all_architectures[] = a copy of the string is created and stored to INVALID_EXTENSION. */ enum aarch64_parse_opt_result -aarch64_parse_extension (const char *str, unsigned long *isa_flags, +aarch64_parse_extension (const char *str, uint64_t *isa_flags, std::string *invalid_extension) { /* The extension string is parsed left to right. */ @@ -326,18 +326,21 @@ int opt_ext_cmp (const void* a, const void* b) turns on as a dependency. As an example +dotprod turns on FL_DOTPROD and FL_SIMD. As such the set of bits represented by this option is {FL_DOTPROD, FL_SIMD}. 
*/ - unsigned long total_flags_a = opt_a->flag_canonical & opt_a->flags_on; - unsigned long total_flags_b = opt_b->flag_canonical & opt_b->flags_on; + uint64_t total_flags_a = opt_a->flag_canonical & opt_a->flags_on; + uint64_t total_flags_b = opt_b->flag_canonical & opt_b->flags_on; int popcnt_a = popcount_hwi ((HOST_WIDE_INT)total_flags_a); int popcnt_b = popcount_hwi ((HOST_WIDE_INT)total_flags_b); int order = popcnt_b - popcnt_a; /* If they have the same amount of bits set, give it a more deterministic ordering by using the value of the bits themselves. */ - if (order == 0) - return total_flags_b - total_flags_a; + if (order != 0) + return order; - return order; + if (total_flags_a != total_flags_b) + return total_flags_a < total_flags_b ? 1 : -1; + + return 0; } /* Implement TARGET_OPTION_INIT_STRUCT. */ @@ -373,9 +376,9 @@ aarch64_option_init_struct (struct gcc_options *opts ATTRIBUTE_UNUSED) */ static bool -aarch64_contains_opt (unsigned long isa_flag_bits, opt_ext *opt) +aarch64_contains_opt (uint64_t isa_flag_bits, opt_ext *opt) { - unsigned long flags_check + uint64_t flags_check = opt->is_synthetic ? opt->flags_on : opt->flag_canonical; return (isa_flag_bits & flags_check) == flags_check; @@ -388,13 +391,13 @@ aarch64_contains_opt (unsigned long isa_flag_bits, opt_ext *opt) that all the "+" flags come before the "+no" flags. */ std::string -aarch64_get_extension_string_for_isa_flags (unsigned long isa_flags, - unsigned long default_arch_flags) +aarch64_get_extension_string_for_isa_flags (uint64_t isa_flags, + uint64_t default_arch_flags) { const struct aarch64_option_extension *opt = NULL; std::string outstr = ""; - unsigned long isa_flag_bits = isa_flags; + uint64_t isa_flag_bits = isa_flags; /* Pass one: Minimize the search space by reducing the set of options to the smallest set that still turns on the same features as before in @@ -538,7 +541,7 @@ aarch64_rewrite_selected_cpu (const char *name) || a_to_an->arch == aarch64_no_arch) fatal_error (input_location, "unknown value %qs for %<-mcpu%>", name); - unsigned long extensions = p_to_a->flags; + uint64_t extensions = p_to_a->flags; aarch64_parse_extension (extension_str.c_str (), &extensions, NULL); std::string outstr = a_to_an->arch_name diff --git a/gcc/config.gcc b/gcc/config.gcc index b2282ecdf..506a918ed 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -315,12 +315,12 @@ m32c*-*-*) ;; aarch64*-*-*) cpu_type=aarch64 - extra_headers="arm_fp16.h arm_neon.h arm_acle.h" + extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h" c_target_objs="aarch64-c.o" cxx_target_objs="aarch64-c.o" d_target_objs="aarch64-d.o" - extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o" - target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c" + extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o" + target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc" target_has_targetm_common=yes ;; alpha*-*-*) @@ -382,7 +382,8 @@ i[34567]86-*-*) c_target_objs="i386-c.o" cxx_target_objs="i386-c.o" d_target_objs="i386-d.o" - extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o" + extra_objs="x86-tune-sched.o x86-tune-sched-bd.o 
x86-tune-sched-atom.o x86-tune-sched-core.o i386-options.o i386-builtins.o i386-expand.o i386-features.o" + target_gtfiles="\$(srcdir)/config/i386/i386-builtins.c \$(srcdir)/config/i386/i386-expand.c \$(srcdir)/config/i386/i386-options.c" extra_options="${extra_options} fused-madd.opt" extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h @@ -414,7 +415,8 @@ x86_64-*-*) cxx_target_objs="i386-c.o" d_target_objs="i386-d.o" extra_options="${extra_options} fused-madd.opt" - extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o" + extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o i386-options.o i386-builtins.o i386-expand.o i386-features.o" + target_gtfiles="\$(srcdir)/config/i386/i386-builtins.c \$(srcdir)/config/i386/i386-expand.c \$(srcdir)/config/i386/i386-options.c" extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h @@ -980,7 +982,7 @@ esac case ${target} in aarch64*-*-elf | aarch64*-*-fuchsia* | aarch64*-*-rtems*) tm_file="${tm_file} dbxelf.h elfos.h newlib-stdint.h" - tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-elf-raw.h" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-errata.h aarch64/aarch64-elf-raw.h" tmake_file="${tmake_file} aarch64/t-aarch64" case $target in aarch64-*-elf*) @@ -1017,13 +1019,19 @@ aarch64*-*-elf | aarch64*-*-fuchsia* | aarch64*-*-rtems*) ;; aarch64*-*-freebsd*) tm_file="${tm_file} dbxelf.h elfos.h ${fbsd_tm_file}" - tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-freebsd.h" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-errata.h aarch64/aarch64-freebsd.h" tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-freebsd" tm_defines="${tm_defines} TARGET_DEFAULT_ASYNC_UNWIND_TABLES=1" ;; +aarch64*-*-netbsd*) + tm_file="${tm_file} dbxelf.h elfos.h ${nbsd_tm_file}" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-errata.h aarch64/aarch64-netbsd.h" + tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-netbsd" + extra_options="${extra_options} netbsd.opt netbsd-elf.opt" + ;; aarch64*-*-linux*) tm_file="${tm_file} dbxelf.h elfos.h gnu-user.h linux.h glibc-stdint.h" - tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-linux.h" + tm_file="${tm_file} aarch64/aarch64-elf.h aarch64/aarch64-errata.h aarch64/aarch64-linux.h" tmake_file="${tmake_file} aarch64/t-aarch64 aarch64/t-aarch64-linux" tm_defines="${tm_defines} TARGET_DEFAULT_ASYNC_UNWIND_TABLES=1" case $target in @@ -3847,32 +3855,40 @@ case "${target}" in sed -e 's/,.*$//'` fi + # Use the pre-processor to strip flatten the options. + # This makes the format less rigid than if we use + # grep and sed directly here. + opt_macro="AARCH64_OPT_EXTENSION(A, B, C, D, E, F)=A, B, C, D, E, F" + options_parsed="`$ac_cv_prog_CPP -D"$opt_macro" -x c \ + ${srcdir}/config/aarch64/aarch64-option-extensions.def`" + + # Match one element inside AARCH64_OPT_EXTENSION, we + # consume anything that's not a ,. + elem="[ ]*\([^,]\+\)[ ]*" + + # Repeat the pattern for the number of entries in the + # AARCH64_OPT_EXTENSION, currently 6 times. 
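For illustration, an aarch64-option-extensions.def entry of the form

  AARCH64_OPT_EXTENSION("lse", AARCH64_FL_LSE, 0, 0, false, "atomics")

is flattened by the preprocessor invocation above into a single line

  "lse", AARCH64_FL_LSE, 0, 0, false, "atomics"

so that the sed pattern built just below can pull out the canonical flag, the flags switched on and the flags switched off as back-references \2, \3 and \4. The entry quoted here is only an example of the six-field format; the script works the same way for any entry in the file.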
+ sed_patt="^$elem,$elem,$elem,$elem,$elem,$elem" + while [ x"$ext_val" != x ] do ext_val=`echo $ext_val | sed -e 's/\+//'` ext=`echo $ext_val | sed -e 's/\+.*//'` base_ext=`echo $ext | sed -e 's/^no//'` + opt_line=`echo -e "$options_parsed" | \ + grep "^\"$base_ext\""` if [ x"$base_ext" = x ] \ - || grep "^AARCH64_OPT_EXTENSION(\"$base_ext\"," \ - ${srcdir}/config/aarch64/aarch64-option-extensions.def \ - > /dev/null; then - - ext_canon=`grep "^AARCH64_OPT_EXTENSION(\"$base_ext\"," \ - ${srcdir}/config/aarch64/aarch64-option-extensions.def | \ - sed -e 's/^[^,]*,[ ]*//' | \ - sed -e 's/,.*$//'` - ext_on=`grep "^AARCH64_OPT_EXTENSION(\"$base_ext\"," \ - ${srcdir}/config/aarch64/aarch64-option-extensions.def | \ - sed -e 's/^[^,]*,[ ]*[^,]*,[ ]*//' | \ - sed -e 's/,.*$//' | \ - sed -e 's/).*$//'` - ext_off=`grep "^AARCH64_OPT_EXTENSION(\"$base_ext\"," \ - ${srcdir}/config/aarch64/aarch64-option-extensions.def | \ - sed -e 's/^[^,]*,[ ]*[^,]*,[ ]*[^,]*,[ ]*//' | \ - sed -e 's/,.*$//' | \ - sed -e 's/).*$//'` - + || [[ -n $opt_line ]]; then + + # These regexp extract the elements based on + # their group match index in the regexp. + ext_canon=`echo -e "$opt_line" | \ + sed -e "s/$sed_patt/\2/"` + ext_on=`echo -e "$opt_line" | \ + sed -e "s/$sed_patt/\3/"` + ext_off=`echo -e "$opt_line" | \ + sed -e "s/$sed_patt/\4/"` if [ $ext = $base_ext ]; then # Adding extension diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def index d258bd492..e464d329c 100644 --- a/gcc/config/aarch64/aarch64-arches.def +++ b/gcc/config/aarch64/aarch64-arches.def @@ -36,5 +36,6 @@ AARCH64_ARCH("armv8.2-a", generic, 8_2A, 8, AARCH64_FL_FOR_ARCH8_2) AARCH64_ARCH("armv8.3-a", generic, 8_3A, 8, AARCH64_FL_FOR_ARCH8_3) AARCH64_ARCH("armv8.4-a", generic, 8_4A, 8, AARCH64_FL_FOR_ARCH8_4) AARCH64_ARCH("armv8.5-a", generic, 8_5A, 8, AARCH64_FL_FOR_ARCH8_5) +AARCH64_ARCH("armv8.6-a", generic, 8_6A, 8, AARCH64_FL_FOR_ARCH8_6) #undef AARCH64_ARCH diff --git a/gcc/config/aarch64/aarch64-bti-insert.c b/gcc/config/aarch64/aarch64-bti-insert.c index e519a0f0a..db8ebb1ba 100644 --- a/gcc/config/aarch64/aarch64-bti-insert.c +++ b/gcc/config/aarch64/aarch64-bti-insert.c @@ -106,7 +106,9 @@ aarch64_pac_insn_p (rtx x) int unspec_val = XINT (sub, 1); switch (unspec_val) { - case UNSPEC_PACISP: + case UNSPEC_PACIASP: + /* fall-through. */ + case UNSPEC_PACIBSP: return true; default: diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index d7b1b7bd6..c890fcc37 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -68,6 +68,9 @@ #define hi_UP E_HImode #define hf_UP E_HFmode #define qi_UP E_QImode +#define bf_UP E_BFmode +#define v4bf_UP E_V4BFmode +#define v8bf_UP E_V8BFmode #define UP(X) X##_UP #define SIMD_MAX_BUILTIN_ARGS 5 @@ -107,6 +110,9 @@ enum aarch64_type_qualifiers /* Lane indices selected in pairs. - must be in range, and flipped for bigendian. */ qualifier_lane_pair_index = 0x800, + /* Lane indices selected in quadtuplets. - must be in range, and flipped for + bigendian. 
*/ + qualifier_lane_quadtup_index = 0x1000, }; typedef struct @@ -173,6 +179,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, qualifier_immediate }; #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none }; +#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers) static enum aarch64_type_qualifiers @@ -191,6 +201,19 @@ aarch64_types_quadopu_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] qualifier_unsigned, qualifier_lane_index }; #define TYPES_QUADOPU_LANE (aarch64_types_quadopu_lane_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_quadopssus_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_unsigned, + qualifier_none, qualifier_lane_quadtup_index }; +#define TYPES_QUADOPSSUS_LANE_QUADTUP \ + (aarch64_types_quadopssus_lane_quadtup_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_quadopsssu_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, + qualifier_unsigned, qualifier_lane_quadtup_index }; +#define TYPES_QUADOPSSSU_LANE_QUADTUP \ + (aarch64_types_quadopsssu_lane_quadtup_qualifiers) + static enum aarch64_type_qualifiers aarch64_types_quadopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, @@ -347,6 +370,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \ VAR1 (T, X, MAP, N) +#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ + VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ + VAR1 (T, X, MAP, O) +#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ + VAR1 (T, X, MAP, P) #include "aarch64-builtin-iterators.h" @@ -432,10 +461,22 @@ enum aarch64_builtins /* ARMv8.3-A Pointer Authentication Builtins. */ AARCH64_PAUTH_BUILTIN_AUTIA1716, AARCH64_PAUTH_BUILTIN_PACIA1716, + AARCH64_PAUTH_BUILTIN_AUTIB1716, + AARCH64_PAUTH_BUILTIN_PACIB1716, AARCH64_PAUTH_BUILTIN_XPACLRI, /* Special cased Armv8.3-A Complex FMA by Lane quad Builtins. */ AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE, AARCH64_SIMD_FCMLA_LANEQ_BUILTINS + /* Builtin for Arm8.3-a Javascript conversion instruction. */ + AARCH64_JSCVT, + /* TME builtins. */ + AARCH64_TME_BUILTIN_TSTART, + AARCH64_TME_BUILTIN_TCOMMIT, + AARCH64_TME_BUILTIN_TTEST, + AARCH64_TME_BUILTIN_TCANCEL, + /* Armv8.5-a RNG instruction builtins. */ + AARCH64_BUILTIN_RNG_RNDR, + AARCH64_BUILTIN_RNG_RNDRRS, AARCH64_BUILTIN_MAX }; @@ -490,6 +531,7 @@ const char *aarch64_scalar_builtin_types[] = { "__builtin_aarch64_simd_oi", "__builtin_aarch64_simd_ci", "__builtin_aarch64_simd_xi", + "__builtin_aarch64_simd_bf", NULL }; @@ -547,6 +589,21 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE; tree aarch64_fp16_type_node = NULL_TREE; tree aarch64_fp16_ptr_type_node = NULL_TREE; +/* Back-end node type for brain float (bfloat) types. */ +tree aarch64_bf16_type_node = NULL_TREE; +tree aarch64_bf16_ptr_type_node = NULL_TREE; + +/* Wrapper around add_builtin_function. 
NAME is the name of the built-in + function, TYPE is the function type, and CODE is the function subcode + (relative to AARCH64_BUILTIN_GENERAL). */ +static tree +aarch64_general_add_builtin (const char *name, tree type, unsigned int code) +{ + code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL; + return add_builtin_function (name, type, code, BUILT_IN_MD, + NULL, NULL_TREE); +} + static const char * aarch64_mangle_builtin_scalar_type (const_tree type) { @@ -585,7 +642,7 @@ aarch64_mangle_builtin_vector_type (const_tree type) } const char * -aarch64_mangle_builtin_type (const_tree type) +aarch64_general_mangle_builtin_type (const_tree type) { const char *mangle; /* Walk through all the AArch64 builtins types tables to filter out the @@ -627,6 +684,8 @@ aarch64_simd_builtin_std_type (machine_mode mode, return float_type_node; case E_DFmode: return double_type_node; + case E_BFmode: + return aarch64_bf16_type_node; default: gcc_unreachable (); } @@ -718,6 +777,10 @@ aarch64_init_simd_builtin_types (void) aarch64_simd_types[Float64x1_t].eltype = double_type_node; aarch64_simd_types[Float64x2_t].eltype = double_type_node; + /* Init Bfloat vector types with underlying __bf16 type. */ + aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node; + aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node; + for (i = 0; i < nelts; i++) { tree eltype = aarch64_simd_types[i].eltype; @@ -782,6 +845,8 @@ aarch64_init_simd_builtin_scalar_types (void) "__builtin_aarch64_simd_poly128"); (*lang_hooks.types.register_builtin_type) (intTI_type_node, "__builtin_aarch64_simd_ti"); + (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node, + "__builtin_aarch64_simd_bf"); /* Unsigned integer types for various mode sizes. */ (*lang_hooks.types.register_builtin_type) (unsigned_intQI_type_node, "__builtin_aarch64_simd_uqi"); @@ -816,8 +881,7 @@ aarch64_init_fcmla_laneq_builtins (void) = aarch64_simd_builtin_std_type (SImode, qualifier_lane_pair_index); tree ftype = build_function_type_list (argtype, argtype, argtype, quadtype, lanetype, NULL_TREE); - tree fndecl = add_builtin_function (d->name, ftype, d->fcode, - BUILT_IN_MD, NULL, NULL_TREE); + tree fndecl = aarch64_general_add_builtin (d->name, ftype, d->fcode); aarch64_builtin_decls[d->fcode] = fndecl; } @@ -846,10 +910,10 @@ aarch64_init_simd_builtins (void) size_type_node, intSI_type_node, NULL); - aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_LANE_CHECK] = - add_builtin_function ("__builtin_aarch64_im_lane_boundsi", lane_check_fpr, - AARCH64_SIMD_BUILTIN_LANE_CHECK, BUILT_IN_MD, - NULL, NULL_TREE); + aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_LANE_CHECK] + = aarch64_general_add_builtin ("__builtin_aarch64_im_lane_boundsi", + lane_check_fpr, + AARCH64_SIMD_BUILTIN_LANE_CHECK); for (i = 0; i < ARRAY_SIZE (aarch64_simd_builtin_data); i++, fcode++) { @@ -947,8 +1011,7 @@ aarch64_init_simd_builtins (void) snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s", d->name); - fndecl = add_builtin_function (namebuf, ftype, fcode, BUILT_IN_MD, - NULL, NULL_TREE); + fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode); aarch64_builtin_decls[fcode] = fndecl; } @@ -968,8 +1031,7 @@ aarch64_init_crc32_builtins () tree argtype = aarch64_simd_builtin_std_type (d->mode, qualifier_unsigned); tree ftype = build_function_type_list (usi_type, usi_type, argtype, NULL_TREE); - tree fndecl = add_builtin_function (d->name, ftype, d->fcode, - BUILT_IN_MD, NULL, NULL_TREE); + tree fndecl = aarch64_general_add_builtin (d->name, ftype, 
d->fcode); aarch64_builtin_decls[d->fcode] = fndecl; } @@ -1009,8 +1071,8 @@ aarch64_init_builtin_rsqrt (void) for (; bdd < bdd_end; bdd++) { ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE); - fndecl = add_builtin_function (bdd->builtin_name, - ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE); + fndecl = aarch64_general_add_builtin (bdd->builtin_name, + ftype, bdd->function_code); aarch64_builtin_decls[bdd->function_code] = fndecl; } } @@ -1030,6 +1092,19 @@ aarch64_init_fp16_types (void) aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node); } +/* Initialize the backend REAL_TYPE type supporting bfloat types. */ +static void +aarch64_init_bf16_types (void) +{ + aarch64_bf16_type_node = make_node (REAL_TYPE); + TYPE_PRECISION (aarch64_bf16_type_node) = 16; + SET_TYPE_MODE (aarch64_bf16_type_node, BFmode); + layout_type (aarch64_bf16_type_node); + + lang_hooks.types.register_builtin_type (aarch64_bf16_type_node, "__bf16"); + aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node); +} + /* Pointer authentication builtins that will become NOP on legacy platform. Currently, these builtins are for internal use only (libgcc EH unwinder). */ @@ -1044,21 +1119,77 @@ aarch64_init_pauth_hint_builtins (void) = build_function_type_list (ptr_type_node, ptr_type_node, NULL_TREE); aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_AUTIA1716] - = add_builtin_function ("__builtin_aarch64_autia1716", ftype_pointer_auth, - AARCH64_PAUTH_BUILTIN_AUTIA1716, BUILT_IN_MD, NULL, - NULL_TREE); + = aarch64_general_add_builtin ("__builtin_aarch64_autia1716", + ftype_pointer_auth, + AARCH64_PAUTH_BUILTIN_AUTIA1716); aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_PACIA1716] - = add_builtin_function ("__builtin_aarch64_pacia1716", ftype_pointer_auth, - AARCH64_PAUTH_BUILTIN_PACIA1716, BUILT_IN_MD, NULL, - NULL_TREE); + = aarch64_general_add_builtin ("__builtin_aarch64_pacia1716", + ftype_pointer_auth, + AARCH64_PAUTH_BUILTIN_PACIA1716); + aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_AUTIB1716] + = aarch64_general_add_builtin ("__builtin_aarch64_autib1716", + ftype_pointer_auth, + AARCH64_PAUTH_BUILTIN_AUTIB1716); + aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_PACIB1716] + = aarch64_general_add_builtin ("__builtin_aarch64_pacib1716", + ftype_pointer_auth, + AARCH64_PAUTH_BUILTIN_PACIB1716); aarch64_builtin_decls[AARCH64_PAUTH_BUILTIN_XPACLRI] - = add_builtin_function ("__builtin_aarch64_xpaclri", ftype_pointer_strip, - AARCH64_PAUTH_BUILTIN_XPACLRI, BUILT_IN_MD, NULL, - NULL_TREE); + = aarch64_general_add_builtin ("__builtin_aarch64_xpaclri", + ftype_pointer_strip, + AARCH64_PAUTH_BUILTIN_XPACLRI); +} + +/* Initialize the transactional memory extension (TME) builtins. 
*/ +static void +aarch64_init_tme_builtins (void) +{ + tree ftype_uint64_void + = build_function_type_list (uint64_type_node, NULL); + tree ftype_void_void + = build_function_type_list (void_type_node, NULL); + tree ftype_void_uint64 + = build_function_type_list (void_type_node, uint64_type_node, NULL); + + aarch64_builtin_decls[AARCH64_TME_BUILTIN_TSTART] + = aarch64_general_add_builtin ("__builtin_aarch64_tstart", + ftype_uint64_void, + AARCH64_TME_BUILTIN_TSTART); + aarch64_builtin_decls[AARCH64_TME_BUILTIN_TTEST] + = aarch64_general_add_builtin ("__builtin_aarch64_ttest", + ftype_uint64_void, + AARCH64_TME_BUILTIN_TTEST); + aarch64_builtin_decls[AARCH64_TME_BUILTIN_TCOMMIT] + = aarch64_general_add_builtin ("__builtin_aarch64_tcommit", + ftype_void_void, + AARCH64_TME_BUILTIN_TCOMMIT); + aarch64_builtin_decls[AARCH64_TME_BUILTIN_TCANCEL] + = aarch64_general_add_builtin ("__builtin_aarch64_tcancel", + ftype_void_uint64, + AARCH64_TME_BUILTIN_TCANCEL); +} + +/* Add builtins for Random Number instructions. */ + +static void +aarch64_init_rng_builtins (void) +{ + tree unsigned_ptr_type = build_pointer_type (unsigned_intDI_type_node); + tree ftype + = build_function_type_list (integer_type_node, unsigned_ptr_type, NULL); + aarch64_builtin_decls[AARCH64_BUILTIN_RNG_RNDR] + = aarch64_general_add_builtin ("__builtin_aarch64_rndr", ftype, + AARCH64_BUILTIN_RNG_RNDR); + aarch64_builtin_decls[AARCH64_BUILTIN_RNG_RNDRRS] + = aarch64_general_add_builtin ("__builtin_aarch64_rndrrs", ftype, + AARCH64_BUILTIN_RNG_RNDRRS); } + +/* Initialize all builtins in the AARCH64_BUILTIN_GENERAL group. */ + void -aarch64_init_builtins (void) +aarch64_general_init_builtins (void) { tree ftype_set_fpr = build_function_type_list (void_type_node, unsigned_type_node, NULL); @@ -1066,25 +1197,38 @@ aarch64_init_builtins (void) = build_function_type_list (unsigned_type_node, NULL); aarch64_builtin_decls[AARCH64_BUILTIN_GET_FPCR] - = add_builtin_function ("__builtin_aarch64_get_fpcr", ftype_get_fpr, - AARCH64_BUILTIN_GET_FPCR, BUILT_IN_MD, NULL, NULL_TREE); + = aarch64_general_add_builtin ("__builtin_aarch64_get_fpcr", + ftype_get_fpr, + AARCH64_BUILTIN_GET_FPCR); aarch64_builtin_decls[AARCH64_BUILTIN_SET_FPCR] - = add_builtin_function ("__builtin_aarch64_set_fpcr", ftype_set_fpr, - AARCH64_BUILTIN_SET_FPCR, BUILT_IN_MD, NULL, NULL_TREE); + = aarch64_general_add_builtin ("__builtin_aarch64_set_fpcr", + ftype_set_fpr, + AARCH64_BUILTIN_SET_FPCR); aarch64_builtin_decls[AARCH64_BUILTIN_GET_FPSR] - = add_builtin_function ("__builtin_aarch64_get_fpsr", ftype_get_fpr, - AARCH64_BUILTIN_GET_FPSR, BUILT_IN_MD, NULL, NULL_TREE); + = aarch64_general_add_builtin ("__builtin_aarch64_get_fpsr", + ftype_get_fpr, + AARCH64_BUILTIN_GET_FPSR); aarch64_builtin_decls[AARCH64_BUILTIN_SET_FPSR] - = add_builtin_function ("__builtin_aarch64_set_fpsr", ftype_set_fpr, - AARCH64_BUILTIN_SET_FPSR, BUILT_IN_MD, NULL, NULL_TREE); + = aarch64_general_add_builtin ("__builtin_aarch64_set_fpsr", + ftype_set_fpr, + AARCH64_BUILTIN_SET_FPSR); aarch64_init_fp16_types (); + aarch64_init_bf16_types (); + if (TARGET_SIMD) aarch64_init_simd_builtins (); aarch64_init_crc32_builtins (); aarch64_init_builtin_rsqrt (); + aarch64_init_rng_builtins (); + + tree ftype_jcvt + = build_function_type_list (intSI_type_node, double_type_node, NULL); + aarch64_builtin_decls[AARCH64_JSCVT] + = aarch64_general_add_builtin ("__builtin_aarch64_jcvtzs", ftype_jcvt, + AARCH64_JSCVT); /* Initialize pointer authentication builtins which are backed by instructions in NOP encoding 
space. @@ -1094,10 +1238,14 @@ aarch64_init_builtins (void) register them. */ if (!TARGET_ILP32) aarch64_init_pauth_hint_builtins (); + + if (TARGET_TME) + aarch64_init_tme_builtins (); } +/* Implement TARGET_BUILTIN_DECL for the AARCH64_BUILTIN_GENERAL group. */ tree -aarch64_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED) +aarch64_general_builtin_decl (unsigned code, bool) { if (code >= AARCH64_BUILTIN_MAX) return error_mark_node; @@ -1112,6 +1260,7 @@ typedef enum SIMD_ARG_LANE_INDEX, SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX, SIMD_ARG_LANE_PAIR_INDEX, + SIMD_ARG_LANE_QUADTUP_INDEX, SIMD_ARG_STOP } builtin_simd_arg; @@ -1201,9 +1350,25 @@ aarch64_simd_expand_args (rtx target, int icode, int have_retval, op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); } - /* Fall through - if the lane index isn't a constant then - the next case will error. */ - /* FALLTHRU */ + /* If the lane index isn't a constant then error out. */ + goto constant_arg; + case SIMD_ARG_LANE_QUADTUP_INDEX: + /* Must be a previous operand into which this is an index and + index is restricted to nunits / 4. */ + gcc_assert (opc > 0); + if (CONST_INT_P (op[opc])) + { + machine_mode vmode = insn_data[icode].operand[opc - 1].mode; + unsigned int nunits + = GET_MODE_NUNITS (vmode).to_constant (); + aarch64_simd_lane_bounds (op[opc], 0, nunits / 4, exp); + /* Keep to GCC-vector-extension lane indices in the RTL. */ + int lane = INTVAL (op[opc]); + op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), + SImode); + } + /* If the lane index isn't a constant then error out. */ + goto constant_arg; case SIMD_ARG_CONSTANT: constant_arg: if (!(*insn_data[icode].operand[opc].predicate) @@ -1316,6 +1481,8 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target) args[k] = SIMD_ARG_LANE_INDEX; else if (d->qualifiers[qualifiers_k] & qualifier_lane_pair_index) args[k] = SIMD_ARG_LANE_PAIR_INDEX; + else if (d->qualifiers[qualifiers_k] & qualifier_lane_quadtup_index) + args[k] = SIMD_ARG_LANE_QUADTUP_INDEX; else if (d->qualifiers[qualifiers_k] & qualifier_struct_load_store_lane_index) args[k] = SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX; else if (d->qualifiers[qualifiers_k] & qualifier_immediate) @@ -1497,17 +1664,90 @@ aarch64_expand_fcmla_builtin (tree exp, rtx target, int fcode) return target; } -/* Expand an expression EXP that calls a built-in function, - with result going to TARGET if that's convenient. */ +/* Function to expand an expression EXP which calls one of the Transactional + Memory Extension (TME) builtins FCODE with the result going to TARGET. */ +static rtx +aarch64_expand_builtin_tme (int fcode, tree exp, rtx target) +{ + switch (fcode) + { + case AARCH64_TME_BUILTIN_TSTART: + target = gen_reg_rtx (DImode); + emit_insn (GEN_FCN (CODE_FOR_tstart) (target)); + break; + + case AARCH64_TME_BUILTIN_TTEST: + target = gen_reg_rtx (DImode); + emit_insn (GEN_FCN (CODE_FOR_ttest) (target)); + break; + + case AARCH64_TME_BUILTIN_TCOMMIT: + emit_insn (GEN_FCN (CODE_FOR_tcommit) ()); + break; + + case AARCH64_TME_BUILTIN_TCANCEL: + { + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op0 = expand_normal (arg0); + if (CONST_INT_P (op0) && UINTVAL (op0) <= 65536) + emit_insn (GEN_FCN (CODE_FOR_tcancel) (op0)); + else + { + error ("%Kargument must be a 16-bit constant immediate", exp); + return const0_rtx; + } + } + break; + + default : + gcc_unreachable (); + } + return target; +} + +/* Expand a random number builtin EXP with code FCODE, putting the result + int TARGET. 
If IGNORE is true the return value is ignored. */ + rtx -aarch64_expand_builtin (tree exp, - rtx target, - rtx subtarget ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - int ignore ATTRIBUTE_UNUSED) +aarch64_expand_rng_builtin (tree exp, rtx target, int fcode, int ignore) +{ + rtx pat; + enum insn_code icode; + if (fcode == AARCH64_BUILTIN_RNG_RNDR) + icode = CODE_FOR_aarch64_rndr; + else if (fcode == AARCH64_BUILTIN_RNG_RNDRRS) + icode = CODE_FOR_aarch64_rndrrs; + else + gcc_unreachable (); + + rtx rand = gen_reg_rtx (DImode); + pat = GEN_FCN (icode) (rand); + if (!pat) + return NULL_RTX; + + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx res_addr = expand_normal (arg0); + res_addr = convert_memory_address (Pmode, res_addr); + rtx res_mem = gen_rtx_MEM (DImode, res_addr); + emit_insn (pat); + emit_move_insn (res_mem, rand); + /* If the status result is unused don't generate the CSET code. */ + if (ignore) + return target; + + rtx cc_reg = gen_rtx_REG (CC_Zmode, CC_REGNUM); + rtx cmp_rtx = gen_rtx_fmt_ee (NE, SImode, cc_reg, const0_rtx); + emit_insn (gen_aarch64_cstoresi (target, cmp_rtx, cc_reg)); + return target; +} + +/* Expand an expression EXP that calls built-in function FCODE, + with result going to TARGET if that's convenient. IGNORE is true + if the result of the builtin is ignored. */ +rtx +aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, + int ignore) { - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - int fcode = DECL_FUNCTION_CODE (fndecl); int icode; rtx pat, op0; tree arg0; @@ -1540,6 +1780,8 @@ aarch64_expand_builtin (tree exp, case AARCH64_PAUTH_BUILTIN_AUTIA1716: case AARCH64_PAUTH_BUILTIN_PACIA1716: + case AARCH64_PAUTH_BUILTIN_AUTIB1716: + case AARCH64_PAUTH_BUILTIN_PACIB1716: case AARCH64_PAUTH_BUILTIN_XPACLRI: arg0 = CALL_EXPR_ARG (exp, 0); op0 = force_reg (Pmode, expand_normal (arg0)); @@ -1563,8 +1805,24 @@ aarch64_expand_builtin (tree exp, { tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op1 = force_reg (Pmode, expand_normal (arg1)); - icode = (fcode == AARCH64_PAUTH_BUILTIN_PACIA1716 - ? 
CODE_FOR_paci1716 : CODE_FOR_auti1716); + switch (fcode) + { + case AARCH64_PAUTH_BUILTIN_AUTIA1716: + icode = CODE_FOR_autia1716; + break; + case AARCH64_PAUTH_BUILTIN_AUTIB1716: + icode = CODE_FOR_autib1716; + break; + case AARCH64_PAUTH_BUILTIN_PACIA1716: + icode = CODE_FOR_pacia1716; + break; + case AARCH64_PAUTH_BUILTIN_PACIB1716: + icode = CODE_FOR_pacib1716; + break; + default: + icode = 0; + gcc_unreachable (); + } rtx x16_reg = gen_rtx_REG (Pmode, R16_REGNUM); rtx x17_reg = gen_rtx_REG (Pmode, R17_REGNUM); @@ -1576,6 +1834,16 @@ aarch64_expand_builtin (tree exp, return target; + case AARCH64_JSCVT: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = force_reg (DFmode, expand_normal (arg0)); + if (!target) + target = gen_reg_rtx (SImode); + else + target = force_reg (SImode, target); + emit_insn (GEN_FCN (CODE_FOR_aarch64_fjcvtzs) (target, op0)); + return target; + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF: case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF: case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V2SF: @@ -1585,6 +1853,9 @@ aarch64_expand_builtin (tree exp, case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V4HF: case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V4HF: return aarch64_expand_fcmla_builtin (exp, target, fcode); + case AARCH64_BUILTIN_RNG_RNDR: + case AARCH64_BUILTIN_RNG_RNDRRS: + return aarch64_expand_rng_builtin (exp, target, fcode, ignore); } if (fcode >= AARCH64_SIMD_BUILTIN_BASE && fcode <= AARCH64_SIMD_BUILTIN_MAX) @@ -1599,6 +1870,12 @@ aarch64_expand_builtin (tree exp, || fcode == AARCH64_BUILTIN_RSQRT_V4SF) return aarch64_expand_builtin_rsqrt (fcode, exp, target); + if (fcode == AARCH64_TME_BUILTIN_TSTART + || fcode == AARCH64_TME_BUILTIN_TCOMMIT + || fcode == AARCH64_TME_BUILTIN_TTEST + || fcode == AARCH64_TME_BUILTIN_TCANCEL) + return aarch64_expand_builtin_tme (fcode, exp, target); + gcc_unreachable (); } @@ -1750,7 +2027,7 @@ aarch64_builtin_vectorized_function (unsigned int fn, tree type_out, /* Return builtin for reciprocal square root. */ tree -aarch64_builtin_rsqrt (unsigned int fn) +aarch64_general_builtin_rsqrt (unsigned int fn) { if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df) return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF]; @@ -1765,13 +2042,14 @@ aarch64_builtin_rsqrt (unsigned int fn) #define VAR1(T, N, MAP, A) \ case AARCH64_SIMD_BUILTIN_##T##_##N##A: +/* Try to fold a call to the built-in function with subcode FCODE. The + function is passed the N_ARGS arguments in ARGS and it returns a value + of type TYPE. Return the new expression on success and NULL_TREE on + failure. */ tree -aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args, - bool ignore ATTRIBUTE_UNUSED) +aarch64_general_fold_builtin (unsigned int fcode, tree type, + unsigned int n_args ATTRIBUTE_UNUSED, tree *args) { - int fcode = DECL_FUNCTION_CODE (fndecl); - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - switch (fcode) { BUILTIN_VDQF (UNOP, abs, 2) @@ -1787,109 +2065,90 @@ aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args, return NULL_TREE; } -bool -aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi) +/* Try to fold STMT, given that it's a call to the built-in function with + subcode FCODE. Return the new statement on success and null on + failure. */ +gimple * +aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt) { - bool changed = false; - gimple *stmt = gsi_stmt (*gsi); - tree call = gimple_call_fn (stmt); - tree fndecl; gimple *new_stmt = NULL; - - if (call) + unsigned nargs = gimple_call_num_args (stmt); + tree *args = (nargs > 0 + ? 
gimple_call_arg_ptr (stmt, 0) + : &error_mark_node); + + /* We use gimple's IFN_REDUC_(PLUS|MIN|MAX)s for float, signed int + and unsigned int; it will distinguish according to the types of + the arguments to the __builtin. */ + switch (fcode) { - fndecl = gimple_call_fndecl (stmt); - if (fndecl) + BUILTIN_VALL (UNOP, reduc_plus_scal_, 10) + new_stmt = gimple_build_call_internal (IFN_REDUC_PLUS, + 1, args[0]); + gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); + break; + BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10) + BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10) + new_stmt = gimple_build_call_internal (IFN_REDUC_MAX, + 1, args[0]); + gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); + break; + BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10) + BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10) + new_stmt = gimple_build_call_internal (IFN_REDUC_MIN, + 1, args[0]); + gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); + break; + BUILTIN_GPF (BINOP, fmulx, 0) { - int fcode = DECL_FUNCTION_CODE (fndecl); - unsigned nargs = gimple_call_num_args (stmt); - tree *args = (nargs > 0 - ? gimple_call_arg_ptr (stmt, 0) - : &error_mark_node); - - /* We use gimple's IFN_REDUC_(PLUS|MIN|MAX)s for float, signed int - and unsigned int; it will distinguish according to the types of - the arguments to the __builtin. */ - switch (fcode) + gcc_assert (nargs == 2); + bool a0_cst_p = TREE_CODE (args[0]) == REAL_CST; + bool a1_cst_p = TREE_CODE (args[1]) == REAL_CST; + if (a0_cst_p || a1_cst_p) { - BUILTIN_VALL (UNOP, reduc_plus_scal_, 10) - new_stmt = gimple_build_call_internal (IFN_REDUC_PLUS, - 1, args[0]); - gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); - break; - BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10) - BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10) - new_stmt = gimple_build_call_internal (IFN_REDUC_MAX, - 1, args[0]); - gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); - break; - BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10) - BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10) - new_stmt = gimple_build_call_internal (IFN_REDUC_MIN, - 1, args[0]); - gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); - break; - BUILTIN_GPF (BINOP, fmulx, 0) + if (a0_cst_p && a1_cst_p) { - gcc_assert (nargs == 2); - bool a0_cst_p = TREE_CODE (args[0]) == REAL_CST; - bool a1_cst_p = TREE_CODE (args[1]) == REAL_CST; - if (a0_cst_p || a1_cst_p) + tree t0 = TREE_TYPE (args[0]); + real_value a0 = (TREE_REAL_CST (args[0])); + real_value a1 = (TREE_REAL_CST (args[1])); + if (real_equal (&a1, &dconst0)) + std::swap (a0, a1); + /* According to real_equal (), +0 equals -0. */ + if (real_equal (&a0, &dconst0) && real_isinf (&a1)) { - if (a0_cst_p && a1_cst_p) - { - tree t0 = TREE_TYPE (args[0]); - real_value a0 = (TREE_REAL_CST (args[0])); - real_value a1 = (TREE_REAL_CST (args[1])); - if (real_equal (&a1, &dconst0)) - std::swap (a0, a1); - /* According to real_equal (), +0 equals -0. */ - if (real_equal (&a0, &dconst0) && real_isinf (&a1)) - { - real_value res = dconst2; - res.sign = a0.sign ^ a1.sign; - new_stmt = - gimple_build_assign (gimple_call_lhs (stmt), - REAL_CST, - build_real (t0, res)); - } - else - new_stmt = - gimple_build_assign (gimple_call_lhs (stmt), - MULT_EXPR, - args[0], args[1]); - } - else /* a0_cst_p ^ a1_cst_p. */ - { - real_value const_part = a0_cst_p - ? 
TREE_REAL_CST (args[0]) : TREE_REAL_CST (args[1]); - if (!real_equal (&const_part, &dconst0) - && !real_isinf (&const_part)) - new_stmt = - gimple_build_assign (gimple_call_lhs (stmt), - MULT_EXPR, args[0], args[1]); - } + real_value res = dconst2; + res.sign = a0.sign ^ a1.sign; + new_stmt = gimple_build_assign (gimple_call_lhs (stmt), + REAL_CST, + build_real (t0, res)); } - if (new_stmt) - { - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); - } - break; + else + new_stmt = gimple_build_assign (gimple_call_lhs (stmt), + MULT_EXPR, + args[0], args[1]); } - default: - break; + else /* a0_cst_p ^ a1_cst_p. */ + { + real_value const_part = a0_cst_p + ? TREE_REAL_CST (args[0]) : TREE_REAL_CST (args[1]); + if (!real_equal (&const_part, &dconst0) + && !real_isinf (&const_part)) + new_stmt = gimple_build_assign (gimple_call_lhs (stmt), + MULT_EXPR, args[0], + args[1]); + } + } + if (new_stmt) + { + gimple_set_vuse (new_stmt, gimple_vuse (stmt)); + gimple_set_vdef (new_stmt, gimple_vdef (stmt)); } + break; } + default: + break; } - - if (new_stmt) - { - gsi_replace (gsi, new_stmt, true); - changed = true; - } - - return changed; + return new_stmt; } void diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c index 6d5acb02f..da78f6fe3 100644 --- a/gcc/config/aarch64/aarch64-c.c +++ b/gcc/config/aarch64/aarch64-c.c @@ -110,6 +110,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_CRC32, "__ARM_FEATURE_CRC32", pfile); aarch64_def_or_undef (TARGET_DOTPROD, "__ARM_FEATURE_DOTPROD", pfile); aarch64_def_or_undef (TARGET_COMPLEX, "__ARM_FEATURE_COMPLEX", pfile); + aarch64_def_or_undef (TARGET_JSCVT, "__ARM_FEATURE_JCVT", pfile); cpp_undef (pfile, "__AARCH64_CMODEL_TINY__"); cpp_undef (pfile, "__AARCH64_CMODEL_SMALL__"); @@ -146,6 +147,13 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) bits = 0; builtin_define_with_int_value ("__ARM_FEATURE_SVE_BITS", bits); } + aarch64_def_or_undef (TARGET_SVE_I8MM, + "__ARM_FEATURE_SVE_MATMUL_INT8", pfile); + aarch64_def_or_undef (TARGET_SVE_F32MM, + "__ARM_FEATURE_SVE_MATMUL_FP32", pfile); + aarch64_def_or_undef (TARGET_SVE_F64MM, + "__ARM_FEATURE_SVE_MATMUL_FP64", pfile); + aarch64_def_or_undef (TARGET_SVE2, "__ARM_FEATURE_SVE2", pfile); aarch64_def_or_undef (TARGET_LSE, "__ARM_FEATURE_ATOMICS", pfile); aarch64_def_or_undef (TARGET_AES, "__ARM_FEATURE_AES", pfile); @@ -156,6 +164,16 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_SM4, "__ARM_FEATURE_SM4", pfile); aarch64_def_or_undef (TARGET_F16FML, "__ARM_FEATURE_FP16_FML", pfile); + aarch64_def_or_undef (TARGET_FRINT, "__ARM_FEATURE_FRINT", pfile); + aarch64_def_or_undef (TARGET_TME, "__ARM_FEATURE_TME", pfile); + aarch64_def_or_undef (TARGET_RNG, "__ARM_FEATURE_RNG", pfile); + + aarch64_def_or_undef (TARGET_I8MM, "__ARM_FEATURE_MATMUL_INT8", pfile); + aarch64_def_or_undef (TARGET_BF16_SIMD, + "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC", pfile); + aarch64_def_or_undef (TARGET_BF16_FP, + "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", pfile); + /* Not for ACLE, but required to keep "float.h" correct if we switch target between implementations that do or do not support ARMv8.2-A 16-bit floating-point extensions. */ @@ -237,6 +255,73 @@ aarch64_pragma_target_parse (tree args, tree pop_target) return true; } +/* Implement "#pragma GCC aarch64". 
*/ +static void +aarch64_pragma_aarch64 (cpp_reader *) +{ + tree x; + if (pragma_lex (&x) != CPP_STRING) + { + error ("%<#pragma GCC aarch64%> requires a string parameter"); + return; + } + + const char *name = TREE_STRING_POINTER (x); + if (strcmp (name, "arm_sve.h") == 0) + aarch64_sve::handle_arm_sve_h (); + else + error ("unknown %<#pragma GCC aarch64%> option %qs", name); +} + +/* Implement TARGET_RESOLVE_OVERLOADED_BUILTIN. */ +static tree +aarch64_resolve_overloaded_builtin (unsigned int uncast_location, + tree fndecl, void *uncast_arglist) +{ + vec empty = {}; + location_t location = (location_t) uncast_location; + vec *arglist = (uncast_arglist + ? (vec *) uncast_arglist + : &empty); + unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); + unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; + tree new_fndecl; + switch (code & AARCH64_BUILTIN_CLASS) + { + case AARCH64_BUILTIN_GENERAL: + return NULL_TREE; + + case AARCH64_BUILTIN_SVE: + new_fndecl = aarch64_sve::resolve_overloaded_builtin (location, subcode, + arglist); + break; + } + if (new_fndecl == NULL_TREE || new_fndecl == error_mark_node) + return new_fndecl; + return build_function_call_vec (location, vNULL, new_fndecl, arglist, + NULL, fndecl); +} + +/* Implement TARGET_CHECK_BUILTIN_CALL. */ +static bool +aarch64_check_builtin_call (location_t loc, vec arg_loc, + tree fndecl, tree orig_fndecl, + unsigned int nargs, tree *args) +{ + unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); + unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; + switch (code & AARCH64_BUILTIN_CLASS) + { + case AARCH64_BUILTIN_GENERAL: + return true; + + case AARCH64_BUILTIN_SVE: + return aarch64_sve::check_builtin_call (loc, arg_loc, subcode, + orig_fndecl, nargs, args); + } + gcc_unreachable (); +} + /* Implement REGISTER_TARGET_PRAGMAS. */ void @@ -244,4 +329,9 @@ aarch64_register_pragmas (void) { /* Update pragma hook to allow parsing #pragma GCC target. */ targetm.target_option.pragma_parse = aarch64_pragma_target_parse; + + targetm.resolve_overloaded_builtin = aarch64_resolve_overloaded_builtin; + targetm.check_builtin_call = aarch64_check_builtin_call; + + c_register_pragma ("GCC", "aarch64", aarch64_pragma_aarch64); } diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index 82d91d625..053c6390e 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -46,6 +46,7 @@ /* ARMv8-A Architecture Processors. */ /* ARM ('A') cores. */ +AARCH64_CORE("cortex-a34", cortexa34, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa35, 0x41, 0xd02, -1) AARCH64_CORE("cortex-a35", cortexa35, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa35, 0x41, 0xd04, -1) AARCH64_CORE("cortex-a53", cortexa53, cortexa53, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, 0x41, 0xd03, -1) AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8A, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, 0x41, 0xd07, -1) @@ -99,7 +100,11 @@ AARCH64_CORE("thunderx2t99", thunderx2t99, thunderx2t99, 8_1A, AARCH64_FL_FOR /* ARM ('A') cores. 
*/ AARCH64_CORE("cortex-a55", cortexa55, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, cortexa53, 0x41, 0xd05, -1) AARCH64_CORE("cortex-a75", cortexa75, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, cortexa73, 0x41, 0xd0a, -1) -AARCH64_CORE("cortex-a76", cortexa76, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, cortexa72, 0x41, 0xd0b, -1) +AARCH64_CORE("cortex-a76", cortexa76, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD, neoversen1, 0x41, 0xd0b, -1) +AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa72, 0x41, 0xd0e, -1) +AARCH64_CORE("cortex-a77", cortexa77, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa72, 0x41, 0xd0d, -1) +AARCH64_CORE("cortex-a65", cortexa65, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd06, -1) +AARCH64_CORE("cortex-a65ae", cortexa65ae, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa73, 0x41, 0xd43, -1) AARCH64_CORE("ares", ares, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1) AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_PROFILE, neoversen1, 0x41, 0xd0c, -1) AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_RCPC | AARCH64_FL_DOTPROD | AARCH64_FL_SSBS, cortexa53, 0x41, 0xd4a, -1) diff --git a/gcc/config/aarch64/aarch64-elf-raw.h b/gcc/config/aarch64/aarch64-elf-raw.h index bbebd0ef0..8fe7b3783 100644 --- a/gcc/config/aarch64/aarch64-elf-raw.h +++ b/gcc/config/aarch64/aarch64-elf-raw.h @@ -27,22 +27,6 @@ " crtend%O%s crtn%O%s " \ "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" -#if TARGET_FIX_ERR_A53_835769_DEFAULT -#define CA53_ERR_835769_SPEC \ - " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}" -#else -#define CA53_ERR_835769_SPEC \ - " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}" -#endif - -#if TARGET_FIX_ERR_A53_843419_DEFAULT -#define CA53_ERR_843419_SPEC \ - " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}" -#else -#define CA53_ERR_843419_SPEC \ - " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}" -#endif - #ifndef LINK_SPEC #define LINK_SPEC "%{h*} \ %{static:-Bstatic} \ @@ -51,8 +35,7 @@ %{!static:%{rdynamic:-export-dynamic}} \ %{mbig-endian:-EB} %{mlittle-endian:-EL} -X \ -maarch64elf%{mabi=ilp32*:32}%{mbig-endian:b}" \ - CA53_ERR_835769_SPEC \ - CA53_ERR_843419_SPEC + AARCH64_ERRATA_LINK_SPEC #endif #endif /* GCC_AARCH64_ELF_RAW_H */ diff --git a/gcc/config/aarch64/aarch64-errata.h b/gcc/config/aarch64/aarch64-errata.h new file mode 100644 index 000000000..8f062536e --- /dev/null +++ b/gcc/config/aarch64/aarch64-errata.h @@ -0,0 +1,44 @@ +/* Machine description for AArch64 architecture. + Copyright (C) 2009-2019 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_AARCH64_ERRATA_H +#define GCC_AARCH64_ERRATA_H + +#if TARGET_FIX_ERR_A53_835769_DEFAULT +#define CA53_ERR_835769_SPEC \ + " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}" +#else +#define CA53_ERR_835769_SPEC \ + " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}" +#endif + +#if TARGET_FIX_ERR_A53_843419_DEFAULT +#define CA53_ERR_843419_SPEC \ + " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}" +#else +#define CA53_ERR_843419_SPEC \ + " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}" +#endif + +#define AARCH64_ERRATA_LINK_SPEC \ + CA53_ERR_835769_SPEC \ + CA53_ERR_843419_SPEC + +#endif /* GCC_AARCH64_ERRATA_H */ diff --git a/gcc/config/aarch64/aarch64-freebsd.h b/gcc/config/aarch64/aarch64-freebsd.h index 899e6f95e..7a3e89b1b 100644 --- a/gcc/config/aarch64/aarch64-freebsd.h +++ b/gcc/config/aarch64/aarch64-freebsd.h @@ -46,26 +46,8 @@ -X" SUBTARGET_EXTRA_LINK_SPEC " \ %{mbig-endian:-EB} %{mlittle-endian:-EL}" -#if TARGET_FIX_ERR_A53_835769_DEFAULT -#define CA53_ERR_835769_SPEC \ - " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}" -#else -#define CA53_ERR_835769_SPEC \ - " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}" -#endif - -#ifdef TARGET_FIX_ERR_A53_843419_DEFAULT -#define CA53_ERR_843419_SPEC \ - " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}" -#else -#define CA53_ERR_843419_SPEC \ - " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}" -#endif - #undef LINK_SPEC -#define LINK_SPEC FBSD_TARGET_LINK_SPEC \ - CA53_ERR_835769_SPEC \ - CA53_ERR_843419_SPEC +#define LINK_SPEC FBSD_TARGET_LINK_SPEC AARCH64_ERRATA_LINK_SPEC #define GNU_USER_TARGET_MATHFILE_SPEC \ "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h index 5e8b34ded..6ff2163b6 100644 --- a/gcc/config/aarch64/aarch64-linux.h +++ b/gcc/config/aarch64/aarch64-linux.h @@ -46,25 +46,8 @@ %{mbig-endian:-EB} %{mlittle-endian:-EL} \ -maarch64linux%{mabi=ilp32:32}%{mbig-endian:b}" -#if TARGET_FIX_ERR_A53_835769_DEFAULT -#define CA53_ERR_835769_SPEC \ - " %{!mno-fix-cortex-a53-835769:--fix-cortex-a53-835769}" -#else -#define CA53_ERR_835769_SPEC \ - " %{mfix-cortex-a53-835769:--fix-cortex-a53-835769}" -#endif - -#if TARGET_FIX_ERR_A53_843419_DEFAULT -#define CA53_ERR_843419_SPEC \ - " %{!mno-fix-cortex-a53-843419:--fix-cortex-a53-843419}" -#else -#define CA53_ERR_843419_SPEC \ - " %{mfix-cortex-a53-843419:--fix-cortex-a53-843419}" -#endif - -#define LINK_SPEC LINUX_TARGET_LINK_SPEC \ - CA53_ERR_835769_SPEC \ - CA53_ERR_843419_SPEC + +#define LINK_SPEC LINUX_TARGET_LINK_SPEC AARCH64_ERRATA_LINK_SPEC #define GNU_USER_TARGET_MATHFILE_SPEC \ "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}" diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def index 14c1a43fe..3640540b3 100644 --- a/gcc/config/aarch64/aarch64-modes.def +++ b/gcc/config/aarch64/aarch64-modes.def @@ 
-33,6 +33,8 @@ CC_MODE (CCFP); CC_MODE (CCFPE); CC_MODE (CC_SWP); +CC_MODE (CC_NZC); /* Only N, Z and C bits of condition flags are valid. + (Used with SVE predicate tests.) */ CC_MODE (CC_NZ); /* Only N and Z bits of condition flags are valid. */ CC_MODE (CC_Z); /* Only Z bit of condition flags is valid. */ CC_MODE (CC_C); /* C represents unsigned overflow of a simple addition. */ @@ -60,6 +62,10 @@ ADJUST_ALIGNMENT (VNx8BI, 2); ADJUST_ALIGNMENT (VNx4BI, 2); ADJUST_ALIGNMENT (VNx2BI, 2); +/* Bfloat16 modes. */ +FLOAT_MODE (BF, 2, 0); +ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format); + VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI. */ VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI. */ VECTOR_MODES (FLOAT, 8); /* V2SF. */ @@ -80,13 +86,14 @@ INT_MODE (XI, 64); strictly necessary to set the alignment here, since the default would be clamped to BIGGEST_ALIGNMENT anyhow, but it seems clearer. */ #define SVE_MODES(NVECS, VB, VH, VS, VD) \ - VECTOR_MODES_WITH_PREFIX (VNx, INT, 16 * NVECS); \ - VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 16 * NVECS); \ + VECTOR_MODES_WITH_PREFIX (VNx, INT, 16 * NVECS, 0); \ + VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 16 * NVECS, 0); \ \ ADJUST_NUNITS (VB##QI, aarch64_sve_vg * NVECS * 8); \ ADJUST_NUNITS (VH##HI, aarch64_sve_vg * NVECS * 4); \ ADJUST_NUNITS (VS##SI, aarch64_sve_vg * NVECS * 2); \ ADJUST_NUNITS (VD##DI, aarch64_sve_vg * NVECS); \ + ADJUST_NUNITS (VH##BF, aarch64_sve_vg * NVECS * 4); \ ADJUST_NUNITS (VH##HF, aarch64_sve_vg * NVECS * 4); \ ADJUST_NUNITS (VS##SF, aarch64_sve_vg * NVECS * 2); \ ADJUST_NUNITS (VD##DF, aarch64_sve_vg * NVECS); \ @@ -95,6 +102,7 @@ INT_MODE (XI, 64); ADJUST_ALIGNMENT (VH##HI, 16); \ ADJUST_ALIGNMENT (VS##SI, 16); \ ADJUST_ALIGNMENT (VD##DI, 16); \ + ADJUST_ALIGNMENT (VH##BF, 16); \ ADJUST_ALIGNMENT (VH##HF, 16); \ ADJUST_ALIGNMENT (VS##SF, 16); \ ADJUST_ALIGNMENT (VD##DF, 16); @@ -106,6 +114,40 @@ SVE_MODES (2, VNx32, VNx16, VNx8, VNx4) SVE_MODES (3, VNx48, VNx24, VNx12, VNx6) SVE_MODES (4, VNx64, VNx32, VNx16, VNx8) +/* Partial SVE vectors: + + VNx2QI VNx4QI VNx8QI + VNx2HI VNx4HI + VNx2SI + + In memory they occupy contiguous locations, in the same way as fixed-length + vectors. E.g. VNx8QImode is half the size of VNx16QImode. + + Passing 1 as the final argument ensures that the modes come after all + other modes in the GET_MODE_WIDER chain, so that we never pick them + in preference to a full vector mode. */ +VECTOR_MODES_WITH_PREFIX (VNx, INT, 2, 1); +VECTOR_MODES_WITH_PREFIX (VNx, INT, 4, 1); +VECTOR_MODES_WITH_PREFIX (VNx, INT, 8, 1); + +ADJUST_NUNITS (VNx2QI, aarch64_sve_vg); +ADJUST_NUNITS (VNx2HI, aarch64_sve_vg); +ADJUST_NUNITS (VNx2SI, aarch64_sve_vg); + +ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2); +ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2); + +ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4); + +ADJUST_ALIGNMENT (VNx2QI, 1); +ADJUST_ALIGNMENT (VNx4QI, 1); +ADJUST_ALIGNMENT (VNx8QI, 1); + +ADJUST_ALIGNMENT (VNx2HI, 2); +ADJUST_ALIGNMENT (VNx4HI, 2); + +ADJUST_ALIGNMENT (VNx2SI, 4); + /* Quad float: 128-bit floating mode for long doubles. */ FLOAT_MODE (TF, 16, ieee_quad_format); diff --git a/gcc/config/aarch64/aarch64-netbsd.h b/gcc/config/aarch64/aarch64-netbsd.h new file mode 100644 index 000000000..e6c9264bd --- /dev/null +++ b/gcc/config/aarch64/aarch64-netbsd.h @@ -0,0 +1,63 @@ +/* Definitions for AArch64 running NetBSD + Copyright (C) 2016-2019 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_AARCH64_NETBSD_H +#define GCC_AARCH64_NETBSD_H + +#define TARGET_LINKER_BIG_EMULATION "aarch64nbsdb" +#define TARGET_LINKER_LITTLE_EMULATION "aarch64nbsd" + +#if TARGET_BIG_ENDIAN_DEFAULT +#define TARGET_LINKER_EMULATION TARGET_LINKER_BIG_EMULATION +#else +#define TARGET_LINKER_EMULATION TARGET_LINKER_LITTLE_EMULATION +#endif + +#undef SUBTARGET_EXTRA_LINK_SPEC +#define SUBTARGET_EXTRA_LINK_SPEC " -m" TARGET_LINKER_EMULATION + +#define NETBSD_ENTRY_POINT "__start" + +#define NETBSD_TARGET_LINK_SPEC "%{h*} " \ + "-X %{mbig-endian:-EB -m " TARGET_LINKER_BIG_EMULATION "} " \ + "%{mlittle-endian:-EL -m " TARGET_LINKER_LITTLE_EMULATION "} " \ + "%(netbsd_link_spec)" + +#undef LINK_SPEC +#define LINK_SPEC NETBSD_LINK_SPEC_ELF \ + NETBSD_TARGET_LINK_SPEC \ + AARCH64_ERRATA_LINK_SPEC + +#undef TARGET_OS_CPP_BUILTINS +#define TARGET_OS_CPP_BUILTINS() \ + do \ + { \ + NETBSD_OS_CPP_BUILTINS_ELF(); \ + } \ + while (0) + +#undef SUBTARGET_CPP_SPEC +#define SUBTARGET_CPP_SPEC NETBSD_CPP_SPEC + +#undef EXTRA_SPECS +#define EXTRA_SPECS \ + { "asm_cpu_spec", ASM_CPU_SPEC }, \ + NETBSD_SUBTARGET_EXTRA_SPECS + +#endif /* GCC_AARCH64_NETBSD_H */ diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 010fd3ccf..345cdc4da 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -45,29 +45,46 @@ entries: aes, pmull, sha1, sha2 being present). In that case this field should contain a space (" ") separated list of the strings in 'Features' that are required. Their order is not important. An empty string means - do not detect this feature during auto detection. */ + do not detect this feature during auto detection. -/* NOTE: This file is being parsed by config.gcc and so the - AARCH64_OPT_EXTENSION must adhere to a strict format: - 1) No space between the AARCH64_OPT_EXTENSION and the opening (. - 2) No space between the opening ( and the extension name. - 3) No space after the extension name before the ,. - 4) Spaces are only allowed after a , and around |. - 5) Everything must be on one line. */ + NOTE: Any changes to the AARCH64_OPT_EXTENSION macro need to be mirrored in + config.gcc. */ /* Enabling "fp" just enables "fp". Disabling "fp" also disables "simd", "crypto", "fp16", "aes", "sha2", - "sha3", sm3/sm4 and "sve". */ -AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | AARCH64_FL_SM4 | AARCH64_FL_SVE, false, "fp") + "sha3", sm3/sm4, "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", + "sve2-bitperm", "i8mm", "f32mm", "f64mm", and "bf16". 
*/ +AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | \ + AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | \ + AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | AARCH64_FL_SM4 | \ + AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_AES | \ + AARCH64_FL_SVE2_SHA3 | AARCH64_FL_SVE2_SM4 | \ + AARCH64_FL_SVE2_BITPERM | AARCH64_FL_I8MM | \ + AARCH64_FL_F32MM | AARCH64_FL_F64MM | AARCH64_FL_BF16, + false, "fp") /* Enabling "simd" also enables "fp". Disabling "simd" also disables "crypto", "dotprod", "aes", "sha2", "sha3", - "sm3/sm4" and "sve". */ -AARCH64_OPT_EXTENSION("simd", AARCH64_FL_SIMD, AARCH64_FL_FP, AARCH64_FL_CRYPTO | AARCH64_FL_DOTPROD | AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | AARCH64_FL_SM4 | AARCH64_FL_SVE, false, "asimd") + "sm3/sm4", "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", + "sve2-bitperm", "i8mm", "f32mm" and "f64mm". */ +AARCH64_OPT_EXTENSION("simd", AARCH64_FL_SIMD, AARCH64_FL_FP, \ + AARCH64_FL_CRYPTO | AARCH64_FL_DOTPROD | \ + AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | \ + AARCH64_FL_SM4 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | \ + AARCH64_FL_SVE2_AES | AARCH64_FL_SVE2_SHA3 | \ + AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM | \ + AARCH64_FL_I8MM | AARCH64_FL_F32MM | AARCH64_FL_F64MM, \ + false, "asimd") /* Enabling "crypto" also enables "fp", "simd", "aes" and "sha2". - Disabling "crypto" disables "crypto", "aes", "sha2", "sha3" and "sm3/sm4". */ -AARCH64_OPT_EXTENSION("crypto", AARCH64_FL_CRYPTO, AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_AES | AARCH64_FL_SHA2, AARCH64_FL_AES | AARCH64_FL_SHA2 |AARCH64_FL_SHA3 | AARCH64_FL_SM4, true, "aes pmull sha1 sha2") + Disabling "crypto" disables "crypto", "aes", "sha2", "sha3" and "sm3/sm4", + "sve2-aes", "sve2-sha3", "sve2-sm4". */ +AARCH64_OPT_EXTENSION("crypto", AARCH64_FL_CRYPTO, AARCH64_FL_FP | \ + AARCH64_FL_SIMD | AARCH64_FL_AES | AARCH64_FL_SHA2, \ + AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | \ + AARCH64_FL_SM4 | AARCH64_FL_SVE2_AES | \ + AARCH64_FL_SVE2_SHA3 | AARCH64_FL_SVE2_SM4, true, \ + "aes pmull sha1 sha2") /* Enabling or disabling "crc" only changes "crc". */ AARCH64_OPT_EXTENSION("crc", AARCH64_FL_CRC, 0, 0, false, "crc32") @@ -76,43 +93,63 @@ AARCH64_OPT_EXTENSION("crc", AARCH64_FL_CRC, 0, 0, false, "crc32") AARCH64_OPT_EXTENSION("lse", AARCH64_FL_LSE, 0, 0, false, "atomics") /* Enabling "fp16" also enables "fp". - Disabling "fp16" disables "fp16", "fp16fml" and "sve". */ -AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, AARCH64_FL_F16FML | AARCH64_FL_SVE, false, "fphp asimdhp") + Disabling "fp16" disables "fp16", "fp16fml", "sve", "sve2", + "sve2-aes", "sve2-sha3", "sve2-sm4", "sve2-bitperm", "f32mm" and + "f64mm". */ +AARCH64_OPT_EXTENSION("fp16", AARCH64_FL_F16, AARCH64_FL_FP, \ + AARCH64_FL_F16FML | AARCH64_FL_SVE | AARCH64_FL_F32MM | \ + AARCH64_FL_F64MM | AARCH64_FL_SVE2 | \ + AARCH64_FL_SVE2_AES | AARCH64_FL_SVE2_SHA3 | \ + AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM, false, \ + "fphp asimdhp") /* Enabling or disabling "rcpc" only changes "rcpc". */ AARCH64_OPT_EXTENSION("rcpc", AARCH64_FL_RCPC, 0, 0, false, "lrcpc") /* Enabling "rdma" also enables "fp", "simd". Disabling "rdma" just disables "rdma". */ -AARCH64_OPT_EXTENSION("rdma", AARCH64_FL_RDMA, AARCH64_FL_FP | AARCH64_FL_SIMD, 0, false, "asimdrdm") +AARCH64_OPT_EXTENSION("rdma", AARCH64_FL_RDMA, \ + AARCH64_FL_FP | AARCH64_FL_SIMD, 0, false, "asimdrdm") /* Enabling "dotprod" also enables "simd". Disabling "dotprod" only disables "dotprod". 
*/ -AARCH64_OPT_EXTENSION("dotprod", AARCH64_FL_DOTPROD, AARCH64_FL_SIMD, 0, false, "asimddp") +AARCH64_OPT_EXTENSION("dotprod", AARCH64_FL_DOTPROD, AARCH64_FL_SIMD, 0, \ + false, "asimddp") /* Enabling "aes" also enables "simd". - Disabling "aes" just disables "aes". */ -AARCH64_OPT_EXTENSION("aes", AARCH64_FL_AES, AARCH64_FL_SIMD, 0, false, "aes") + Disabling "aes" disables "aes" and "sve2-aes'. */ +AARCH64_OPT_EXTENSION("aes", AARCH64_FL_AES, AARCH64_FL_SIMD, \ + AARCH64_FL_SVE2_AES, false, "aes") /* Enabling "sha2" also enables "simd". Disabling "sha2" just disables "sha2". */ -AARCH64_OPT_EXTENSION("sha2", AARCH64_FL_SHA2, AARCH64_FL_SIMD, 0, false, "sha1 sha2") +AARCH64_OPT_EXTENSION("sha2", AARCH64_FL_SHA2, AARCH64_FL_SIMD, 0, false, \ + "sha1 sha2") /* Enabling "sha3" enables "simd" and "sha2". - Disabling "sha3" just disables "sha3". */ -AARCH64_OPT_EXTENSION("sha3", AARCH64_FL_SHA3, AARCH64_FL_SIMD | AARCH64_FL_SHA2, 0, false, "sha3 sha512") + Disabling "sha3" disables "sha3" and "sve2-sha3". */ +AARCH64_OPT_EXTENSION("sha3", AARCH64_FL_SHA3, AARCH64_FL_SIMD | \ + AARCH64_FL_SHA2, AARCH64_FL_SVE2_SHA3, false, \ + "sha3 sha512") /* Enabling "sm4" also enables "simd". - Disabling "sm4" just disables "sm4". */ -AARCH64_OPT_EXTENSION("sm4", AARCH64_FL_SM4, AARCH64_FL_SIMD, 0, false, "sm3 sm4") + Disabling "sm4" disables "sm4" and "sve2-sm4". */ +AARCH64_OPT_EXTENSION("sm4", AARCH64_FL_SM4, AARCH64_FL_SIMD, \ + AARCH64_FL_SVE2_SM4, false, "sm3 sm4") /* Enabling "fp16fml" also enables "fp" and "fp16". Disabling "fp16fml" just disables "fp16fml". */ -AARCH64_OPT_EXTENSION("fp16fml", AARCH64_FL_F16FML, AARCH64_FL_FP | AARCH64_FL_F16, 0, false, "asimdfhm") +AARCH64_OPT_EXTENSION("fp16fml", AARCH64_FL_F16FML, \ + AARCH64_FL_FP | AARCH64_FL_F16, 0, false, "asimdfhm") /* Enabling "sve" also enables "fp16", "fp" and "simd". - Disabling "sve" just disables "sve". */ -AARCH64_OPT_EXTENSION("sve", AARCH64_FL_SVE, AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_F16, 0, false, "sve") + Disabling "sve" disables "sve", "f32mm", "f64mm", "sve2", "sve2-aes", + "sve2-sha3", "sve2-sm4" and "sve2-bitperm". */ +AARCH64_OPT_EXTENSION("sve", AARCH64_FL_SVE, AARCH64_FL_FP | AARCH64_FL_SIMD | \ + AARCH64_FL_F16, AARCH64_FL_F32MM | AARCH64_FL_F64MM | \ + AARCH64_FL_SVE2 | AARCH64_FL_SVE2_AES | \ + AARCH64_FL_SVE2_SHA3 | AARCH64_FL_SVE2_SM4 | \ + AARCH64_FL_SVE2_BITPERM, false, "sve") /* Enabling/Disabling "profile" does not enable/disable any other feature. */ AARCH64_OPT_EXTENSION("profile", AARCH64_FL_PROFILE, 0, 0, false, "") @@ -124,12 +161,69 @@ AARCH64_OPT_EXTENSION("rng", AARCH64_FL_RNG, 0, 0, false, "") AARCH64_OPT_EXTENSION("memtag", AARCH64_FL_MEMTAG, 0, 0, false, "") /* Enabling/Disabling "sb" only changes "sb". */ -AARCH64_OPT_EXTENSION("sb", AARCH64_FL_SB, 0, 0, false, "") +AARCH64_OPT_EXTENSION("sb", AARCH64_FL_SB, 0, 0, false, "sb") /* Enabling/Disabling "ssbs" only changes "ssbs". */ -AARCH64_OPT_EXTENSION("ssbs", AARCH64_FL_SSBS, 0, 0, false, "") +AARCH64_OPT_EXTENSION("ssbs", AARCH64_FL_SSBS, 0, 0, false, "ssbs") /* Enabling/Disabling "predres" only changes "predres". */ AARCH64_OPT_EXTENSION("predres", AARCH64_FL_PREDRES, 0, 0, false, "") +/* Enabling "sve2" also enables "sve", "fp16", "fp", and "simd". + Disabling "sve2" disables "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", and + "sve2-bitperm". 
*/ +AARCH64_OPT_EXTENSION("sve2", AARCH64_FL_SVE2, AARCH64_FL_SVE | \ + AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_F16, \ + AARCH64_FL_SVE2_AES | AARCH64_FL_SVE2_SHA3 | \ + AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM, false, "sve2") + +/* Enabling "sve2-sm4" also enables "sm4", "simd", "fp16", "fp", "sve", and + "sve2". Disabling "sve2-sm4" just disables "sve2-sm4". */ +AARCH64_OPT_EXTENSION("sve2-sm4", AARCH64_FL_SVE2_SM4, AARCH64_FL_SM4 | \ + AARCH64_FL_SIMD | AARCH64_FL_F16 | AARCH64_FL_FP | \ + AARCH64_FL_SVE | AARCH64_FL_SVE2, 0, false, "svesm4") + +/* Enabling "sve2-aes" also enables "aes", "simd", "fp16", "fp", "sve", and + "sve2". Disabling "sve2-aes" just disables "sve2-aes". */ +AARCH64_OPT_EXTENSION("sve2-aes", AARCH64_FL_SVE2_AES, AARCH64_FL_AES | \ + AARCH64_FL_SIMD | AARCH64_FL_F16 | AARCH64_FL_FP | \ + AARCH64_FL_SVE | AARCH64_FL_SVE2, 0, false, "sveaes") + +/* Enabling "sve2-sha3" also enables "sha3", "simd", "fp16", "fp", "sve", and + "sve2". Disabling "sve2-sha3" just disables "sve2-sha3". */ +AARCH64_OPT_EXTENSION("sve2-sha3", AARCH64_FL_SVE2_SHA3, AARCH64_FL_SHA3 | \ + AARCH64_FL_SIMD | AARCH64_FL_F16 | AARCH64_FL_FP | \ + AARCH64_FL_SVE | AARCH64_FL_SVE2, 0, false, "svesha3") + +/* Enabling "sve2-bitperm" also enables "simd", "fp16", "fp", "sve", and + "sve2". Disabling "sve2-bitperm" just disables "sve2-bitperm". */ +AARCH64_OPT_EXTENSION("sve2-bitperm", AARCH64_FL_SVE2_BITPERM, AARCH64_FL_SIMD | \ + AARCH64_FL_F16 | AARCH64_FL_FP | AARCH64_FL_SVE | \ + AARCH64_FL_SVE2, 0, false, "svebitperm") + +/* Enabling or disabling "tme" only changes "tme". */ +AARCH64_OPT_EXTENSION("tme", AARCH64_FL_TME, 0, 0, false, "") + +/* Enabling "i8mm" also enables "simd" and "fp". + Disabling "i8mm" only disables "i8mm". */ +AARCH64_OPT_EXTENSION("i8mm", AARCH64_FL_I8MM, \ + AARCH64_FL_SIMD | AARCH64_FL_FP, 0, false, "i8mm") + +/* Enabling "f32mm" also enables "sve", "fp16", "fp", and "simd". + Disabling "f32mm" only disables "f32mm". */ +AARCH64_OPT_EXTENSION("f32mm", AARCH64_FL_F32MM, \ + AARCH64_FL_SVE | AARCH64_FL_F16 | AARCH64_FL_FP | \ + AARCH64_FL_SIMD, 0, false, "f32mm") + +/* Enabling "f64mm" also enables "sve", "fp16", "fp", and "simd". + Disabling "f64mm" only disables "f64mm". */ +AARCH64_OPT_EXTENSION("f64mm", AARCH64_FL_F64MM, \ + AARCH64_FL_SVE | AARCH64_FL_F16 | AARCH64_FL_FP | \ + AARCH64_FL_SIMD, 0, false, "f64mm") + +/* Enabling "bf16" also enables "simd" and "fp". + Disabling "bf16" only disables "bf16". */ +AARCH64_OPT_EXTENSION("bf16", AARCH64_FL_BF16, \ + AARCH64_FL_SIMD | AARCH64_FL_FP, 0, false, "bf16") + #undef AARCH64_OPT_EXTENSION diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 994bcfc7e..5e0a499e8 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -396,8 +396,81 @@ enum simd_immediate_check { AARCH64_CHECK_MOV = AARCH64_CHECK_ORR | AARCH64_CHECK_BIC }; +/* The key type that -msign-return-address should use. */ +enum aarch64_key_type { + AARCH64_KEY_A, + AARCH64_KEY_B +}; + +extern enum aarch64_key_type aarch64_ra_sign_key; + extern struct tune_params aarch64_tune_params; +/* The available SVE predicate patterns, known in the ACLE as "svpattern". 
*/ +#define AARCH64_FOR_SVPATTERN(T) \ + T (POW2, pow2, 0) \ + T (VL1, vl1, 1) \ + T (VL2, vl2, 2) \ + T (VL3, vl3, 3) \ + T (VL4, vl4, 4) \ + T (VL5, vl5, 5) \ + T (VL6, vl6, 6) \ + T (VL7, vl7, 7) \ + T (VL8, vl8, 8) \ + T (VL16, vl16, 9) \ + T (VL32, vl32, 10) \ + T (VL64, vl64, 11) \ + T (VL128, vl128, 12) \ + T (VL256, vl256, 13) \ + T (MUL4, mul4, 29) \ + T (MUL3, mul3, 30) \ + T (ALL, all, 31) + +/* The available SVE prefetch operations, known in the ACLE as "svprfop". */ +#define AARCH64_FOR_SVPRFOP(T) \ + T (PLDL1KEEP, pldl1keep, 0) \ + T (PLDL1STRM, pldl1strm, 1) \ + T (PLDL2KEEP, pldl2keep, 2) \ + T (PLDL2STRM, pldl2strm, 3) \ + T (PLDL3KEEP, pldl3keep, 4) \ + T (PLDL3STRM, pldl3strm, 5) \ + T (PSTL1KEEP, pstl1keep, 8) \ + T (PSTL1STRM, pstl1strm, 9) \ + T (PSTL2KEEP, pstl2keep, 10) \ + T (PSTL2STRM, pstl2strm, 11) \ + T (PSTL3KEEP, pstl3keep, 12) \ + T (PSTL3STRM, pstl3strm, 13) + +#define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE, +enum aarch64_svpattern { + AARCH64_FOR_SVPATTERN (AARCH64_SVENUM) + AARCH64_NUM_SVPATTERNS +}; + +enum aarch64_svprfop { + AARCH64_FOR_SVPRFOP (AARCH64_SVENUM) + AARCH64_NUM_SVPRFOPS +}; +#undef AARCH64_SVENUM + +/* It's convenient to divide the built-in function codes into groups, + rather than having everything in a single enum. This type enumerates + those groups. */ +enum aarch64_builtin_class +{ + AARCH64_BUILTIN_GENERAL, + AARCH64_BUILTIN_SVE +}; + +/* Built-in function codes are structured so that the low + AARCH64_BUILTIN_SHIFT bits contain the aarch64_builtin_class + and the upper bits contain a group-specific subcode. */ +const unsigned int AARCH64_BUILTIN_SHIFT = 1; + +/* Mask that selects the aarch64_builtin_class part of a function code. */ +const unsigned int AARCH64_BUILTIN_CLASS = (1 << AARCH64_BUILTIN_SHIFT) - 1; + +void aarch64_post_cfi_startproc (void); poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned); int aarch64_get_condition_code (rtx); bool aarch64_address_valid_for_prefetch_p (rtx, bool); @@ -407,6 +480,8 @@ unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in); bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode); int aarch64_branch_cost (bool, bool); enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx); +opt_machine_mode aarch64_vq_mode (scalar_mode); +opt_machine_mode aarch64_full_sve_mode (scalar_mode); bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode); bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT); bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT, @@ -414,14 +489,13 @@ bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT, bool aarch64_constant_address_p (rtx); bool aarch64_emit_approx_div (rtx, rtx, rtx); bool aarch64_emit_approx_sqrt (rtx, rtx, bool); -void aarch64_expand_call (rtx, rtx, bool); -bool aarch64_expand_movmem (rtx *); +void aarch64_expand_call (rtx, rtx, rtx, bool); +bool aarch64_expand_cpymem (rtx *); bool aarch64_float_const_zero_rtx_p (rtx); bool aarch64_float_const_rtx_p (rtx); bool aarch64_function_arg_regno_p (unsigned); bool aarch64_fusion_enabled_p (enum aarch64_fusion_pairs); -bool aarch64_gen_movmemqi (rtx *); -bool aarch64_gimple_fold_builtin (gimple_stmt_iterator *); +bool aarch64_gen_cpymemqi (rtx *); bool aarch64_is_extend_from_extract (scalar_int_mode, rtx, rtx); bool aarch64_is_long_call_p (rtx); bool aarch64_is_noplt_call_p (rtx); @@ -436,24 +510,32 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT, unsigned 
HOST_WIDE_INT); bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx); bool aarch64_move_imm (HOST_WIDE_INT, machine_mode); +machine_mode aarch64_sve_int_mode (machine_mode); opt_machine_mode aarch64_sve_pred_mode (unsigned int); +opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64); +bool aarch64_sve_mode_p (machine_mode); +HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int); bool aarch64_sve_cnt_immediate_p (rtx); +bool aarch64_sve_scalar_inc_dec_immediate_p (rtx); bool aarch64_sve_addvl_addpl_immediate_p (rtx); -bool aarch64_sve_inc_dec_immediate_p (rtx); +bool aarch64_sve_vector_inc_dec_immediate_p (rtx); int aarch64_add_offset_temporaries (rtx); void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx); bool aarch64_mov_operand_p (rtx, machine_mode); rtx aarch64_reverse_mask (machine_mode, unsigned int); bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64); bool aarch64_offset_9bit_signed_unscaled_p (machine_mode, poly_int64); +char *aarch64_output_sve_prefetch (const char *, rtx, const char *); char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx); -char *aarch64_output_sve_addvl_addpl (rtx, rtx, rtx); -char *aarch64_output_sve_inc_dec_immediate (const char *, rtx); +char *aarch64_output_sve_cnt_pat_immediate (const char *, const char *, rtx *); +char *aarch64_output_sve_scalar_inc_dec (rtx); +char *aarch64_output_sve_addvl_addpl (rtx); +char *aarch64_output_sve_vector_inc_dec (const char *, rtx); char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode); char *aarch64_output_simd_mov_immediate (rtx, unsigned, enum simd_immediate_check w = AARCH64_CHECK_MOV); char *aarch64_output_sve_mov_immediate (rtx); -char *aarch64_output_ptrue (machine_mode, char); +char *aarch64_output_sve_ptrues (rtx); bool aarch64_pad_reg_upward (machine_mode, const_tree, bool); bool aarch64_regno_ok_for_base_p (int, bool); bool aarch64_regno_ok_for_index_p (int, bool); @@ -462,11 +544,13 @@ bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode, bool high); bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode); bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool); +bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *); bool aarch64_simd_valid_immediate (rtx, struct simd_immediate_info *, enum simd_immediate_check w = AARCH64_CHECK_MOV); rtx aarch64_check_zero_based_sve_index_immediate (rtx); bool aarch64_sve_index_immediate_p (rtx); bool aarch64_sve_arith_immediate_p (rtx, bool); +bool aarch64_sve_sqadd_sqsub_immediate_p (rtx, bool); bool aarch64_sve_bitmask_immediate_p (rtx); bool aarch64_sve_dup_immediate_p (rtx); bool aarch64_sve_cmp_immediate_p (rtx, bool); @@ -476,15 +560,15 @@ bool aarch64_split_dimode_const_store (rtx, rtx); bool aarch64_symbolic_address_p (rtx); bool aarch64_uimm12_shift (HOST_WIDE_INT); bool aarch64_use_return_insn_p (void); -bool aarch64_use_simple_return_insn_p (void); -const char *aarch64_mangle_builtin_type (const_tree); const char *aarch64_output_casesi (rtx *); +unsigned int aarch64_tlsdesc_abi_id (); enum aarch64_symbol_type aarch64_classify_symbol (rtx, HOST_WIDE_INT); enum aarch64_symbol_type aarch64_classify_tls_symbol (rtx); enum reg_class aarch64_regno_regclass (unsigned); int aarch64_asm_preferred_eh_data_format (int, int); int aarch64_fpconst_pow_of_2 (rtx); +int aarch64_fpconst_pow2_recip (rtx); machine_mode aarch64_hard_regno_caller_save_mode (unsigned, unsigned, machine_mode); int aarch64_uxt_size 
(int, HOST_WIDE_INT); @@ -496,13 +580,17 @@ rtx aarch64_return_addr (int, rtx); rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT); bool aarch64_simd_mem_operand_p (rtx); bool aarch64_sve_ld1r_operand_p (rtx); +bool aarch64_sve_ld1rq_operand_p (rtx); +bool aarch64_sve_ld1ro_operand_p (rtx, scalar_mode); +bool aarch64_sve_ldff1_operand_p (rtx); +bool aarch64_sve_ldnf1_operand_p (rtx); bool aarch64_sve_ldr_operand_p (rtx); +bool aarch64_sve_prefetch_operand_p (rtx, machine_mode); bool aarch64_sve_struct_memory_operand_p (rtx); rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool); rtx aarch64_gen_stepped_int_parallel (unsigned int, int, int); bool aarch64_stepped_int_parallel_p (rtx, int); rtx aarch64_tls_get_addr (void); -tree aarch64_fold_builtin (tree, int, tree *, bool); unsigned aarch64_dbx_register_number (unsigned); unsigned aarch64_trampoline_size (void); void aarch64_asm_output_labelref (FILE *, const char *); @@ -512,7 +600,15 @@ const char * aarch64_output_probe_stack_range (rtx, rtx); const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx); void aarch64_err_no_fpadvsimd (machine_mode); void aarch64_expand_epilogue (bool); -void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0); +rtx aarch64_ptrue_all (unsigned int); +opt_machine_mode aarch64_ptrue_all_mode (rtx); +rtx aarch64_convert_sve_data_to_pred (rtx, machine_mode, rtx); +rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx); +void aarch64_expand_mov_immediate (rtx, rtx); +rtx aarch64_ptrue_reg (machine_mode); +rtx aarch64_pfalse_reg (machine_mode); +bool aarch64_sve_pred_dominates_p (rtx *, rtx); +bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *); void aarch64_emit_sve_pred_move (rtx, rtx, rtx); void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode); bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx); @@ -520,8 +616,9 @@ rtx aarch64_replace_reg_mode (rtx, machine_mode); void aarch64_split_sve_subreg_move (rtx, rtx, rtx); void aarch64_expand_prologue (void); void aarch64_expand_vector_init (rtx, rtx); +void aarch64_sve_expand_vector_init (rtx, rtx); void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, - const_tree, unsigned); + const_tree, unsigned, bool = false); void aarch64_init_expanders (void); void aarch64_init_simd_builtins (void); void aarch64_emit_call_insn (rtx); @@ -587,22 +684,39 @@ bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE); void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx); bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool); void aarch64_expand_sve_vcond (machine_mode, machine_mode, rtx *); -#endif /* RTX_CODE */ -void aarch64_init_builtins (void); +bool aarch64_prepare_sve_int_fma (rtx *, rtx_code); +bool aarch64_prepare_sve_cond_int_fma (rtx *, rtx_code); +#endif /* RTX_CODE */ bool aarch64_process_target_attr (tree); void aarch64_override_options_internal (struct gcc_options *); -rtx aarch64_expand_builtin (tree exp, - rtx target, - rtx subtarget ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - int ignore ATTRIBUTE_UNUSED); -tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED); -tree aarch64_builtin_rsqrt (unsigned int); +const char *aarch64_general_mangle_builtin_type (const_tree); +void aarch64_general_init_builtins (void); +tree aarch64_general_fold_builtin (unsigned int, tree, unsigned int, tree *); +gimple *aarch64_general_gimple_fold_builtin (unsigned int, gcall *); +rtx aarch64_general_expand_builtin (unsigned int, tree, rtx, int); +tree 
aarch64_general_builtin_decl (unsigned, bool); +tree aarch64_general_builtin_rsqrt (unsigned int); tree aarch64_builtin_vectorized_function (unsigned int, tree, tree); +namespace aarch64_sve { + void init_builtins (); + void handle_arm_sve_h (); + tree builtin_decl (unsigned, bool); + bool builtin_type_p (const_tree); + bool svbool_type_p (const_tree); + unsigned int nvectors_if_data_type (const_tree); + const char *mangle_builtin_type (const_tree); + tree resolve_overloaded_builtin (location_t, unsigned int, + vec *); + bool check_builtin_call (location_t, vec, unsigned int, + tree, unsigned int, tree *); + gimple *gimple_fold_builtin (unsigned int, gimple_stmt_iterator *, gcall *); + rtx expand_builtin (unsigned int, tree, rtx); +} + extern void aarch64_split_combinev16qi (rtx operands[3]); extern void aarch64_expand_vec_perm (rtx, rtx, rtx, rtx, unsigned int); extern void aarch64_expand_sve_vec_perm (rtx, rtx, rtx, rtx); @@ -629,11 +743,10 @@ bool aarch64_handle_option (struct gcc_options *, struct gcc_options *, const struct cl_decoded_option *, location_t); const char *aarch64_rewrite_selected_cpu (const char *name); enum aarch64_parse_opt_result aarch64_parse_extension (const char *, - unsigned long *, + uint64_t *, std::string *); void aarch64_get_all_extension_candidates (auto_vec *candidates); -std::string aarch64_get_extension_string_for_isa_flags (unsigned long, - unsigned long); +std::string aarch64_get_extension_string_for_isa_flags (uint64_t, uint64_t); /* Defined in aarch64-d.c */ extern void aarch64_d_target_versions (void); @@ -647,4 +760,17 @@ poly_uint64 aarch64_regmode_natural_size (machine_mode); bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT); +struct atomic_ool_names +{ + const char *str[5][4]; +}; + +rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, + const atomic_ool_names *names); +extern const atomic_ool_names aarch64_ool_swp_names; +extern const atomic_ool_names aarch64_ool_ldadd_names; +extern const atomic_ool_names aarch64_ool_ldset_names; +extern const atomic_ool_names aarch64_ool_ldclr_names; +extern const atomic_ool_names aarch64_ool_ldeor_names; + #endif /* GCC_AARCH64_PROTOS_H */ diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def index b01569429..2be0ce824 100644 --- a/gcc/config/aarch64/aarch64-simd-builtin-types.def +++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def @@ -50,3 +50,5 @@ ENTRY (Float32x4_t, V4SF, none, 13) ENTRY (Float64x1_t, V1DF, none, 13) ENTRY (Float64x2_t, V2DF, none, 13) + ENTRY (Bfloat16x4_t, V4BF, none, 14) + ENTRY (Bfloat16x8_t, V8BF, none, 14) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 17bb0c486..d0fe4e7c8 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -212,10 +212,15 @@ /* Implemented by aarch64_{_lane}{q}. */ BUILTIN_VB (TERNOP, sdot, 0) BUILTIN_VB (TERNOPU, udot, 0) + BUILTIN_VB (TERNOP_SSUS, usdot, 0) BUILTIN_VB (QUADOP_LANE, sdot_lane, 0) BUILTIN_VB (QUADOPU_LANE, udot_lane, 0) BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0) BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0) + BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0) + BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0) + BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0) + BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0) /* Implemented by aarch64_fcadd. 
*/ BUILTIN_VHSDF (BINOP, fcadd90, 0) @@ -424,7 +429,7 @@ BUILTIN_VB (UNOP, rbit, 0) /* Implemented by - aarch64_. */ + aarch64_. */ BUILTIN_VALL (BINOP, zip1, 0) BUILTIN_VALL (BINOP, zip2, 0) BUILTIN_VALL (BINOP, uzp1, 0) @@ -465,12 +470,18 @@ /* Implemented by aarch64_ld1x3. */ BUILTIN_VALLDIF (LOADSTRUCT, ld1x3, 0) + /* Implemented by aarch64_ld1x4. */ + BUILTIN_VALLDIF (LOADSTRUCT, ld1x4, 0) + /* Implemented by aarch64_st1x2. */ BUILTIN_VALLDIF (STORESTRUCT, st1x2, 0) /* Implemented by aarch64_st1x3. */ BUILTIN_VALLDIF (STORESTRUCT, st1x3, 0) + /* Implemented by aarch64_st1x4. */ + BUILTIN_VALLDIF (STORESTRUCT, st1x4, 0) + /* Implemented by fma4. */ BUILTIN_VHSDF (TERNOP, fma, 4) VAR1 (TERNOP, fma, 4, hf) @@ -670,3 +681,36 @@ /* Implemented by aarch64_fmllq_laneq_highv4sf. */ VAR1 (QUADOP_LANE, fmlalq_laneq_high, 0, v4sf) VAR1 (QUADOP_LANE, fmlslq_laneq_high, 0, v4sf) + + /* Implemented by aarch64_. */ + BUILTIN_VSFDF (UNOP, frint32z, 0) + BUILTIN_VSFDF (UNOP, frint32x, 0) + BUILTIN_VSFDF (UNOP, frint64z, 0) + BUILTIN_VSFDF (UNOP, frint64x, 0) + + /* Implemented by aarch64_bfdot{_lane}{q}. */ + VAR2 (TERNOP, bfdot, 0, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf) + VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf) + + /* Implemented by aarch64_bfmmlaqv4sf */ + VAR1 (TERNOP, bfmmlaq, 0, v4sf) + + /* Implemented by aarch64_bfmlal{_lane{q}}v4sf */ + VAR1 (TERNOP, bfmlalb, 0, v4sf) + VAR1 (TERNOP, bfmlalt, 0, v4sf) + VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf) + VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf) + VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf) + VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf) + + /* Implemented by aarch64_simd_mmlav16qi. */ + VAR1 (TERNOP, simd_smmla, 0, v16qi) + VAR1 (TERNOPU, simd_ummla, 0, v16qi) + VAR1 (TERNOP_SSUS, simd_usmmla, 0, v16qi) + + /* Implemented by aarch64_bfcvtn{q}{2} */ + VAR1 (UNOP, bfcvtn, 0, v4bf) + VAR1 (UNOP, bfcvtn_q, 0, v8bf) + VAR1 (BINOP, bfcvtn2, 0, v8bf) + VAR1 (UNOP, bfcvt, 0, bf) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 29ca37c65..137c88da1 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -19,8 +19,8 @@ ;; . 
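[Editorial aside, not part of the patch: the .def hunk above declares the new simd_smmla/simd_ummla/simd_usmmla builtins. A minimal scalar sketch of the 2x2-by-8 matrix multiply-accumulate semantics these are understood to expose may help when reading the aarch64-simd.md pattern below; mmla_ref is a hypothetical helper name, and the exact operand-signedness pairing for usmmla follows the Arm ISA rather than anything shown in the patch.]

// Scalar reference: treat the 4 x int32 accumulator as a 2x2 matrix C and
// the two 16-byte inputs as 2x8 matrices A and B; C[i][j] += dot(A row i,
// B row j) over 8 bytes.  Illustrative only.
#include <cstdint>
#include <cstdio>

template <typename TA, typename TB>
static void mmla_ref (int32_t c[4], const TA a[16], const TB b[16])
{
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      {
        int32_t sum = 0;
        for (int k = 0; k < 8; k++)
          sum += (int32_t) a[i * 8 + k] * (int32_t) b[j * 8 + k];
        c[i * 2 + j] += sum;
      }
}

int main ()
{
  int8_t a[16], b[16];
  int32_t c[4] = {0, 0, 0, 0};
  for (int i = 0; i < 16; i++)
    {
      a[i] = (int8_t) (i - 8);
      b[i] = (int8_t) (3 * i - 20);
    }
  mmla_ref<int8_t, int8_t> (c, a, b);   // smmla-style: both inputs signed
  printf ("%d %d %d %d\n", (int) c[0], (int) c[1], (int) c[2], (int) c[3]);
  return 0;
}
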
(define_expand "mov" - [(set (match_operand:VALL_F16 0 "nonimmediate_operand" "") - (match_operand:VALL_F16 1 "general_operand" ""))] + [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand") + (match_operand:VALL_F16MOV 1 "general_operand"))] "TARGET_SIMD" " /* Force the operand into a register if it is not an @@ -39,8 +39,8 @@ ) (define_expand "movmisalign" - [(set (match_operand:VALL 0 "nonimmediate_operand" "") - (match_operand:VALL 1 "general_operand" ""))] + [(set (match_operand:VALL 0 "nonimmediate_operand") + (match_operand:VALL 1 "general_operand"))] "TARGET_SIMD" { /* This pattern is not permitted to fail during expansion: if both arguments @@ -101,10 +101,10 @@ [(set_attr "type" "neon_dup")] ) -(define_insn "*aarch64_simd_mov" - [(set (match_operand:VD 0 "nonimmediate_operand" +(define_insn "*aarch64_simd_mov" + [(set (match_operand:VDMOV 0 "nonimmediate_operand" "=w, m, m, w, ?r, ?w, ?r, w") - (match_operand:VD 1 "general_operand" + (match_operand:VDMOV 1 "general_operand" "m, Dz, w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], mode) @@ -129,10 +129,10 @@ mov_reg, neon_move")] ) -(define_insn "*aarch64_simd_mov" - [(set (match_operand:VQ 0 "nonimmediate_operand" +(define_insn "*aarch64_simd_mov" + [(set (match_operand:VQMOV 0 "nonimmediate_operand" "=w, Umn, m, w, ?r, ?w, ?r, w") - (match_operand:VQ 1 "general_operand" + (match_operand:VQMOV 1 "general_operand" "m, Dz, w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], mode) @@ -234,8 +234,8 @@ (define_split - [(set (match_operand:VQ 0 "register_operand" "") - (match_operand:VQ 1 "register_operand" ""))] + [(set (match_operand:VQMOV 0 "register_operand" "") + (match_operand:VQMOV 1 "register_operand" ""))] "TARGET_SIMD && reload_completed && GP_REGNUM_P (REGNO (operands[0])) && GP_REGNUM_P (REGNO (operands[1]))" @@ -246,8 +246,8 @@ }) (define_split - [(set (match_operand:VQ 0 "register_operand" "") - (match_operand:VQ 1 "register_operand" ""))] + [(set (match_operand:VQMOV 0 "register_operand" "") + (match_operand:VQMOV 1 "register_operand" ""))] "TARGET_SIMD && reload_completed && ((FP_REGNUM_P (REGNO (operands[0])) && GP_REGNUM_P (REGNO (operands[1]))) || (GP_REGNUM_P (REGNO (operands[0])) && FP_REGNUM_P (REGNO (operands[1]))))" @@ -258,8 +258,8 @@ }) (define_expand "@aarch64_split_simd_mov" - [(set (match_operand:VQ 0) - (match_operand:VQ 1))] + [(set (match_operand:VQMOV 0) + (match_operand:VQMOV 1))] "TARGET_SIMD" { rtx dst = operands[0]; @@ -520,6 +520,20 @@ [(set_attr "type" "neon_dot")] ) +;; These instructions map to the __builtins for the armv8.6a I8MM usdot +;; (vector) Dot Product operation. +(define_insn "aarch64_usdot" + [(set (match_operand:VS 0 "register_operand" "=w") + (plus:VS + (unspec:VS [(match_operand: 2 "register_operand" "w") + (match_operand: 3 "register_operand" "w")] + UNSPEC_USDOT) + (match_operand:VS 1 "register_operand" "0")))] + "TARGET_I8MM" + "usdot\\t%0., %2., %3." + [(set_attr "type" "neon_dot")] +) + ;; These expands map to the Dot Product optab the vectorizer checks for. ;; The auto-vectorizer expects a dot product builtin that also does an ;; accumulation into the provided register. @@ -587,6 +601,26 @@ [(set_attr "type" "neon_dot")] ) +;; These instructions map to the __builtins for the armv8.6a I8MM usdot, sudot +;; (by element) Dot Product operations. 
+(define_insn "aarch64_dot_lane" + [(set (match_operand:VS 0 "register_operand" "=w") + (plus:VS + (unspec:VS [(match_operand: 2 "register_operand" "w") + (match_operand:VB 3 "register_operand" "w") + (match_operand:SI 4 "immediate_operand" "i")] + DOTPROD_I8MM) + (match_operand:VS 1 "register_operand" "0")))] + "TARGET_I8MM" + { + int nunits = GET_MODE_NUNITS (mode).to_constant (); + int lane = INTVAL (operands[4]); + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), SImode); + return "dot\\t%0., %2., %3.4b[%4]"; + } + [(set_attr "type" "neon_dot")] +) + (define_expand "copysign3" [(match_operand:VHSDF 0 "register_operand") (match_operand:VHSDF 1 "register_operand") @@ -666,8 +700,8 @@ [(set_attr "type" "neon_fp_rsqrts_")]) (define_expand "rsqrt2" - [(set (match_operand:VALLF 0 "register_operand" "=w") - (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] + [(set (match_operand:VALLF 0 "register_operand") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand")] UNSPEC_RSQRT))] "TARGET_SIMD" { @@ -724,15 +758,15 @@ ;; So (ABS:QI (minus:QI 64 -128)) == (ABS:QI (192 or -64 signed)) == 64. ;; Whereas SABD would return 192 (-64 signed) on the above example. ;; Use MINUS ([us]max (op1, op2), [us]min (op1, op2)) instead. -(define_insn "*aarch64_abd_3" +(define_insn "aarch64_abd_3" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (minus:VDQ_BHSI (USMAX:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w") (match_operand:VDQ_BHSI 2 "register_operand" "w")) - (match_operator 3 "aarch64_" - [(match_dup 1) - (match_dup 2)])))] + (:VDQ_BHSI + (match_dup 1) + (match_dup 2))))] "TARGET_SIMD" "abd\t%0., %1., %2." [(set_attr "type" "neon_abd")] @@ -778,7 +812,16 @@ ;; UABAL tmp.8h, op1.16b, op2.16b ;; UADALP op3.4s, tmp.8h ;; MOV op0, op3 // should be eliminated in later passes. -;; The signed version just uses the signed variants of the above instructions. +;; +;; For TARGET_DOTPROD we do: +;; MOV tmp1.16b, #1 // Can be CSE'd and hoisted out of loops. +;; UABD tmp2.16b, op1.16b, op2.16b +;; UDOT op3.4s, tmp2.16b, tmp1.16b +;; MOV op0, op3 // RA will tie the operands of UDOT appropriately. +;; +;; The signed version just uses the signed variants of the above instructions +;; but for TARGET_DOTPROD still emits a UDOT as the absolute difference is +;; unsigned. 
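[Editorial aside, not part of the patch: before the sadv16qi expander itself, a scalar check of the equivalence the comment above relies on, namely that a sum of absolute differences can be computed as UABD followed by UDOT against an all-ones vector. Plain C++, illustrative only.]

// SAD(a, b) == reduce(udot(uabd(a, b), {1, 1, ..., 1})) on 16 bytes.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main ()
{
  uint8_t a[16], b[16];
  for (int i = 0; i < 16; i++)
    {
      a[i] = (uint8_t) (17 * i + 3);
      b[i] = (uint8_t) (29 * i + 250);
    }

  // Direct sum of absolute differences.
  uint32_t sad = 0;
  for (int i = 0; i < 16; i++)
    sad += (uint32_t) abs ((int) a[i] - (int) b[i]);

  // UABD, then UDOT against an all-ones vector, then reduce the 4 lanes.
  uint8_t abd[16];
  for (int i = 0; i < 16; i++)
    abd[i] = (uint8_t) abs ((int) a[i] - (int) b[i]);
  uint32_t acc[4] = {0, 0, 0, 0};
  for (int lane = 0; lane < 4; lane++)
    for (int j = 0; j < 4; j++)
      acc[lane] += (uint32_t) abd[4 * lane + j] * 1u;  // 4-way dot with #1
  uint32_t dot_sad = acc[0] + acc[1] + acc[2] + acc[3];

  printf ("direct=%u via-udot=%u\n", sad, dot_sad);
  return sad == dot_sad ? 0 : 1;
}
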
(define_expand "sadv16qi" [(use (match_operand:V4SI 0 "register_operand")) @@ -787,6 +830,15 @@ (use (match_operand:V4SI 3 "register_operand"))] "TARGET_SIMD" { + if (TARGET_DOTPROD) + { + rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); + rtx abd = gen_reg_rtx (V16QImode); + emit_insn (gen_aarch64_abdv16qi_3 (abd, operands[1], operands[2])); + emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3], + abd, ones)); + DONE; + } rtx reduc = gen_reg_rtx (V8HImode); emit_insn (gen_aarch64_abdl2v16qi_3 (reduc, operands[1], operands[2])); @@ -949,6 +1001,21 @@ [(set_attr "type" "neon_ins")] ) +(define_expand "signbit2" + [(use (match_operand: 0 "register_operand")) + (use (match_operand:VDQSF 1 "register_operand"))] + "TARGET_SIMD" +{ + int shift_amount = GET_MODE_UNIT_BITSIZE (mode) - 1; + rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, + shift_amount); + operands[1] = lowpart_subreg (mode, operands[1], mode); + + emit_insn (gen_aarch64_simd_lshr (operands[0], operands[1], + shift_vector)); + DONE; +}) + (define_insn "aarch64_simd_lshr" [(set (match_operand:VDQ_I 0 "register_operand" "=w") (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") @@ -967,6 +1034,18 @@ [(set_attr "type" "neon_shift_imm")] ) +(define_insn "*aarch64_simd_sra" + [(set (match_operand:VDQ_I 0 "register_operand" "=w") + (plus:VDQ_I + (SHIFTRT:VDQ_I + (match_operand:VDQ_I 1 "register_operand" "w") + (match_operand:VDQ_I 2 "aarch64_simd_rshift_imm" "Dr")) + (match_operand:VDQ_I 3 "register_operand" "0")))] + "TARGET_SIMD" + "sra\t%0., %1., %2" + [(set_attr "type" "neon_shift_acc")] +) + (define_insn "aarch64_simd_imm_shl" [(set (match_operand:VDQ_I 0 "register_operand" "=w") (ashift:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") @@ -1006,9 +1085,9 @@ ) (define_expand "ashl3" - [(match_operand:VDQ_I 0 "register_operand" "") - (match_operand:VDQ_I 1 "register_operand" "") - (match_operand:SI 2 "general_operand" "")] + [(match_operand:VDQ_I 0 "register_operand") + (match_operand:VDQ_I 1 "register_operand") + (match_operand:SI 2 "general_operand")] "TARGET_SIMD" { int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; @@ -1053,9 +1132,9 @@ ) (define_expand "lshr3" - [(match_operand:VDQ_I 0 "register_operand" "") - (match_operand:VDQ_I 1 "register_operand" "") - (match_operand:SI 2 "general_operand" "")] + [(match_operand:VDQ_I 0 "register_operand") + (match_operand:VDQ_I 1 "register_operand") + (match_operand:SI 2 "general_operand")] "TARGET_SIMD" { int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; @@ -1100,9 +1179,9 @@ ) (define_expand "ashr3" - [(match_operand:VDQ_I 0 "register_operand" "") - (match_operand:VDQ_I 1 "register_operand" "") - (match_operand:SI 2 "general_operand" "")] + [(match_operand:VDQ_I 0 "register_operand") + (match_operand:VDQ_I 1 "register_operand") + (match_operand:SI 2 "general_operand")] "TARGET_SIMD" { int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; @@ -1147,9 +1226,9 @@ ) (define_expand "vashl3" - [(match_operand:VDQ_I 0 "register_operand" "") - (match_operand:VDQ_I 1 "register_operand" "") - (match_operand:VDQ_I 2 "register_operand" "")] + [(match_operand:VDQ_I 0 "register_operand") + (match_operand:VDQ_I 1 "register_operand") + (match_operand:VDQ_I 2 "register_operand")] "TARGET_SIMD" { emit_insn (gen_aarch64_simd_reg_sshl (operands[0], operands[1], @@ -1161,9 +1240,9 @@ ;; Negating individual lanes most certainly offsets the ;; gain from vectorization. 
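[Editorial aside, not part of the patch: the signbit<mode>2 expander added above computes the sign bit with a logical shift right by GET_MODE_UNIT_BITSIZE - 1 on the integer view of the vector, avoiding any floating-point compare. A scalar model of the same trick; signbit_via_lshr is a hypothetical name.]

// Bit-cast the float, shift the sign bit down: 1 for any negative input
// (including -0.0 and -inf), 0 otherwise.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cmath>

static uint32_t signbit_via_lshr (float x)
{
  uint32_t bits;
  memcpy (&bits, &x, sizeof bits);   // like the lowpart subreg to <V_INT_EQUIV>
  return bits >> 31;                 // GET_MODE_UNIT_BITSIZE (SFmode) - 1
}

int main ()
{
  const float tests[] = { 1.5f, -1.5f, 0.0f, -0.0f, INFINITY, -INFINITY };
  for (float t : tests)
    printf ("% f -> %u (std::signbit says %d)\n",
            t, signbit_via_lshr (t), (int) std::signbit (t));
  return 0;
}
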
(define_expand "vashr3" - [(match_operand:VDQ_BHSI 0 "register_operand" "") - (match_operand:VDQ_BHSI 1 "register_operand" "") - (match_operand:VDQ_BHSI 2 "register_operand" "")] + [(match_operand:VDQ_BHSI 0 "register_operand") + (match_operand:VDQ_BHSI 1 "register_operand") + (match_operand:VDQ_BHSI 2 "register_operand")] "TARGET_SIMD" { rtx neg = gen_reg_rtx (mode); @@ -1175,9 +1254,9 @@ ;; DI vector shift (define_expand "aarch64_ashr_simddi" - [(match_operand:DI 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "w") - (match_operand:SI 2 "aarch64_shift_imm64_di" "")] + [(match_operand:DI 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand:SI 2 "aarch64_shift_imm64_di")] "TARGET_SIMD" { /* An arithmetic shift right by 64 fills the result with copies of the sign @@ -1191,9 +1270,9 @@ ) (define_expand "vlshr3" - [(match_operand:VDQ_BHSI 0 "register_operand" "") - (match_operand:VDQ_BHSI 1 "register_operand" "") - (match_operand:VDQ_BHSI 2 "register_operand" "")] + [(match_operand:VDQ_BHSI 0 "register_operand") + (match_operand:VDQ_BHSI 1 "register_operand") + (match_operand:VDQ_BHSI 2 "register_operand")] "TARGET_SIMD" { rtx neg = gen_reg_rtx (mode); @@ -1204,9 +1283,9 @@ }) (define_expand "aarch64_lshr_simddi" - [(match_operand:DI 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "w") - (match_operand:SI 2 "aarch64_shift_imm64_di" "")] + [(match_operand:DI 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand:SI 2 "aarch64_shift_imm64_di")] "TARGET_SIMD" { if (INTVAL (operands[2]) == 64) @@ -1234,9 +1313,9 @@ ) (define_expand "vec_set" - [(match_operand:VALL_F16 0 "register_operand" "+w") - (match_operand: 1 "register_operand" "w") - (match_operand:SI 2 "immediate_operand" "")] + [(match_operand:VALL_F16 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" { HOST_WIDE_INT elem = (HOST_WIDE_INT) 1 << INTVAL (operands[2]); @@ -1375,9 +1454,9 @@ ) (define_expand "v2di3" - [(set (match_operand:V2DI 0 "register_operand" "") - (MAXMIN:V2DI (match_operand:V2DI 1 "register_operand" "") - (match_operand:V2DI 2 "register_operand" "")))] + [(set (match_operand:V2DI 0 "register_operand") + (MAXMIN:V2DI (match_operand:V2DI 1 "register_operand") + (match_operand:V2DI 2 "register_operand")))] "TARGET_SIMD" { enum rtx_code cmp_operator; @@ -1440,8 +1519,8 @@ ;; On big-endian this is { zeroes, operand } (define_insn "move_lo_quad_internal_" - [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w") - (vec_concat:VQ_NO2E + [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w") + (vec_concat:VQMOV_NO2E (match_operand: 1 "register_operand" "w,r,r") (vec_duplicate: (const_int 0))))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" @@ -1470,8 +1549,8 @@ ) (define_insn "move_lo_quad_internal_be_" - [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w") - (vec_concat:VQ_NO2E + [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w") + (vec_concat:VQMOV_NO2E (vec_duplicate: (const_int 0)) (match_operand: 1 "register_operand" "w,r,r")))] "TARGET_SIMD && BYTES_BIG_ENDIAN" @@ -1500,8 +1579,8 @@ ) (define_expand "move_lo_quad_" - [(match_operand:VQ 0 "register_operand") - (match_operand:VQ 1 "register_operand")] + [(match_operand:VQMOV 0 "register_operand") + (match_operand:VQMOV 1 "register_operand")] "TARGET_SIMD" { if (BYTES_BIG_ENDIAN) @@ -1518,11 +1597,11 @@ ;; For big-endian this is { operand1, operand2 } (define_insn "aarch64_simd_move_hi_quad_" 
- [(set (match_operand:VQ 0 "register_operand" "+w,w") - (vec_concat:VQ + [(set (match_operand:VQMOV 0 "register_operand" "+w,w") + (vec_concat:VQMOV (vec_select: (match_dup 0) - (match_operand:VQ 2 "vect_par_cnst_lo_half" "")) + (match_operand:VQMOV 2 "vect_par_cnst_lo_half" "")) (match_operand: 1 "register_operand" "w,r")))] "TARGET_SIMD && !BYTES_BIG_ENDIAN" "@ @@ -1532,12 +1611,12 @@ ) (define_insn "aarch64_simd_move_hi_quad_be_" - [(set (match_operand:VQ 0 "register_operand" "+w,w") - (vec_concat:VQ + [(set (match_operand:VQMOV 0 "register_operand" "+w,w") + (vec_concat:VQMOV (match_operand: 1 "register_operand" "w,r") (vec_select: (match_dup 0) - (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))))] + (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))))] "TARGET_SIMD && BYTES_BIG_ENDIAN" "@ ins\\t%0.d[1], %1.d[0] @@ -1546,8 +1625,8 @@ ) (define_expand "move_hi_quad_" - [(match_operand:VQ 0 "register_operand" "") - (match_operand: 1 "register_operand" "")] + [(match_operand:VQMOV 0 "register_operand") + (match_operand: 1 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); @@ -1571,10 +1650,122 @@ [(set_attr "type" "neon_shift_imm_narrow_q")] ) +(define_insn "aarch64_bfdot" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (plus:VDQSF + (unspec:VDQSF + [(match_operand: 2 "register_operand" "w") + (match_operand: 3 "register_operand" "w")] + UNSPEC_BFDOT) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_BF16_SIMD" + "bfdot\t%0., %2., %3." + [(set_attr "type" "neon_dot")] +) + +(define_insn "aarch64_bfdot_lane" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (plus:VDQSF + (unspec:VDQSF + [(match_operand: 2 "register_operand" "w") + (match_operand:VBF 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + UNSPEC_BFDOT) + (match_operand:VDQSF 1 "register_operand" "0")))] + "TARGET_BF16_SIMD" +{ + int nunits = GET_MODE_NUNITS (mode).to_constant (); + int lane = INTVAL (operands[4]); + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode); + return "bfdot\t%0., %2., %3.2h[%4]"; +} + [(set_attr "type" "neon_dot")] +) + +;; bfmmla +(define_insn "aarch64_bfmmlaqv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus:V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "w")] + UNSPEC_BFMMLA)))] + "TARGET_BF16_SIMD" + "bfmmla\\t%0.4s, %2.8h, %3.8h" + [(set_attr "type" "neon_fp_mla_s_q")] +) + +;; bfmlal +(define_insn "aarch64_bfmlalv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "w")] + BF_MLA)))] + "TARGET_BF16_SIMD" + "bfmlal\\t%0.4s, %2.8h, %3.8h" + [(set_attr "type" "neon_fp_mla_s_q")] +) + +(define_insn "aarch64_bfmlal_lanev4sf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (plus: V4SF (match_operand:V4SF 1 "register_operand" "0") + (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w") + (match_operand:VBF 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + BF_MLA)))] + "TARGET_BF16_SIMD" +{ + operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); + return "bfmlal\\t%0.4s, %2.8h, %3.h[%4]"; +} + [(set_attr "type" "neon_fp_mla_s_scalar_q")] +) + +;; 8-bit integer matrix multiply-accumulate +(define_insn "aarch64_simd_mmlav16qi" + [(set 
(match_operand:V4SI 0 "register_operand" "=w") + (plus:V4SI + (unspec:V4SI [(match_operand:V16QI 2 "register_operand" "w") + (match_operand:V16QI 3 "register_operand" "w")] MATMUL) + (match_operand:V4SI 1 "register_operand" "0")))] + "TARGET_I8MM" + "mmla\\t%0.4s, %2.16b, %3.16b" + [(set_attr "type" "neon_mla_s_q")] +) + +;; bfcvtn +(define_insn "aarch64_bfcvtn" + [(set (match_operand:V4SF_TO_BF 0 "register_operand" "=w") + (unspec:V4SF_TO_BF [(match_operand:V4SF 1 "register_operand" "w")] + UNSPEC_BFCVTN))] + "TARGET_BF16_SIMD" + "bfcvtn\\t%0.4h, %1.4s" + [(set_attr "type" "neon_fp_cvt_narrow_s_q")] +) + +(define_insn "aarch64_bfcvtn2v8bf" + [(set (match_operand:V8BF 0 "register_operand" "=w") + (unspec:V8BF [(match_operand:V8BF 1 "register_operand" "0") + (match_operand:V4SF 2 "register_operand" "w")] + UNSPEC_BFCVTN2))] + "TARGET_BF16_SIMD" + "bfcvtn2\\t%0.8h, %2.4s" + [(set_attr "type" "neon_fp_cvt_narrow_s_q")] +) + +(define_insn "aarch64_bfcvtbf" + [(set (match_operand:BF 0 "register_operand" "=w") + (unspec:BF [(match_operand:SF 1 "register_operand" "w")] + UNSPEC_BFCVT))] + "TARGET_BF16_FP" + "bfcvt\\t%h0, %s1" + [(set_attr "type" "f_cvt")] +) + (define_expand "vec_pack_trunc_" - [(match_operand: 0 "register_operand" "") - (match_operand:VDN 1 "register_operand" "") - (match_operand:VDN 2 "register_operand" "")] + [(match_operand: 0 "register_operand") + (match_operand:VDN 1 "register_operand") + (match_operand:VDN 2 "register_operand")] "TARGET_SIMD" { rtx tempreg = gen_reg_rtx (mode); @@ -1630,7 +1821,7 @@ ) (define_expand "vec_unpack_hi_" - [(match_operand: 0 "register_operand" "") + [(match_operand: 0 "register_operand") (ANY_EXTEND: (match_operand:VQW 1 "register_operand"))] "TARGET_SIMD" { @@ -1642,8 +1833,8 @@ ) (define_expand "vec_unpack_lo_" - [(match_operand: 0 "register_operand" "") - (ANY_EXTEND: (match_operand:VQW 1 "register_operand" ""))] + [(match_operand: 0 "register_operand") + (ANY_EXTEND: (match_operand:VQW 1 "register_operand"))] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); @@ -1761,9 +1952,9 @@ ) (define_expand "vec_widen_mult_lo_" - [(match_operand: 0 "register_operand" "") - (ANY_EXTEND: (match_operand:VQW 1 "register_operand" "")) - (ANY_EXTEND: (match_operand:VQW 2 "register_operand" ""))] + [(match_operand: 0 "register_operand") + (ANY_EXTEND: (match_operand:VQW 1 "register_operand")) + (ANY_EXTEND: (match_operand:VQW 2 "register_operand"))] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); @@ -1788,9 +1979,9 @@ ) (define_expand "vec_widen_mult_hi_" - [(match_operand: 0 "register_operand" "") - (ANY_EXTEND: (match_operand:VQW 1 "register_operand" "")) - (ANY_EXTEND: (match_operand:VQW 2 "register_operand" ""))] + [(match_operand: 0 "register_operand") + (ANY_EXTEND: (match_operand:VQW 1 "register_operand")) + (ANY_EXTEND: (match_operand:VQW 2 "register_operand"))] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -1855,9 +2046,9 @@ ) (define_expand "div3" - [(set (match_operand:VHSDF 0 "register_operand" "=w") - (div:VHSDF (match_operand:VHSDF 1 "register_operand" "w") - (match_operand:VHSDF 2 "register_operand" "w")))] + [(set (match_operand:VHSDF 0 "register_operand") + (div:VHSDF (match_operand:VHSDF 1 "register_operand") + (match_operand:VHSDF 2 "register_operand")))] "TARGET_SIMD" { if (aarch64_emit_approx_div (operands[0], operands[1], operands[2])) @@ -2192,8 +2383,8 @@ ;; other big-endian patterns their behavior is as required. 
(define_expand "vec_unpacks_lo_" - [(match_operand: 0 "register_operand" "") - (match_operand:VQ_HSF 1 "register_operand" "")] + [(match_operand: 0 "register_operand") + (match_operand:VQ_HSF 1 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); @@ -2215,8 +2406,8 @@ ) (define_expand "vec_unpacks_hi_" - [(match_operand: 0 "register_operand" "") - (match_operand:VQ_HSF 1 "register_operand" "")] + [(match_operand: 0 "register_operand") + (match_operand:VQ_HSF 1 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -2268,9 +2459,9 @@ ) (define_expand "aarch64_float_truncate_hi_" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VDF 1 "register_operand" "0") - (match_operand: 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand:VDF 1 "register_operand") + (match_operand: 2 "register_operand")] "TARGET_SIMD" { rtx (*gen) (rtx, rtx, rtx) = BYTES_BIG_ENDIAN @@ -2363,8 +2554,8 @@ ;; 'across lanes' add. (define_expand "reduc_plus_scal_" - [(match_operand: 0 "register_operand" "=w") - (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand")] UNSPEC_ADDV)] "TARGET_SIMD" { @@ -3116,30 +3307,31 @@ (define_insn "*aarch64_get_lane_extend" [(set (match_operand:GPI 0 "register_operand" "=r") (sign_extend:GPI - (vec_select: + (vec_select: (match_operand:VDQQH 1 "register_operand" "w") (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] "TARGET_SIMD" { - operands[2] = aarch64_endian_lane_rtx (mode, INTVAL (operands[2])); + operands[2] = aarch64_endian_lane_rtx (mode, + INTVAL (operands[2])); return "smov\\t%0, %1.[%2]"; } - [(set_attr "type" "neon_to_gp")] -) - -(define_insn "*aarch64_get_lane_zero_extend" - [(set (match_operand:GPI 0 "register_operand" "=r") - (zero_extend:GPI - (vec_select: - (match_operand:VDQQH 1 "register_operand" "w") - (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] - "TARGET_SIMD" - { - operands[2] = aarch64_endian_lane_rtx (mode, - INTVAL (operands[2])); - return "umov\\t%w0, %1.[%2]"; - } - [(set_attr "type" "neon_to_gp")] + [(set_attr "type" "neon_to_gp")] +) + +(define_insn "*aarch64_get_lane_zero_extend" + [(set (match_operand:GPI 0 "register_operand" "=r") + (zero_extend:GPI + (vec_select: + (match_operand:VDQQH 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] + "TARGET_SIMD" + { + operands[2] = aarch64_endian_lane_rtx (mode, + INTVAL (operands[2])); + return "umov\\t%w0, %1.[%2]"; + } + [(set_attr "type" "neon_to_gp")] ) ;; Lane extraction of a value, neither sign nor zero extension @@ -3280,9 +3472,9 @@ (define_expand "aarch64_saddl2" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VQW 1 "register_operand" "w") - (match_operand:VQW 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand:VQW 1 "register_operand") + (match_operand:VQW 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -3292,9 +3484,9 @@ }) (define_expand "aarch64_uaddl2" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VQW 1 "register_operand" "w") - (match_operand:VQW 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand:VQW 1 "register_operand") + (match_operand:VQW 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -3304,9 +3496,9 @@ 
}) (define_expand "aarch64_ssubl2" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VQW 1 "register_operand" "w") - (match_operand:VQW 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand:VQW 1 "register_operand") + (match_operand:VQW 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -3316,9 +3508,9 @@ }) (define_expand "aarch64_usubl2" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VQW 1 "register_operand" "w") - (match_operand:VQW 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand:VQW 1 "register_operand") + (match_operand:VQW 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -3341,10 +3533,10 @@ ;; w. (define_expand "widen_ssum3" - [(set (match_operand: 0 "register_operand" "") + [(set (match_operand: 0 "register_operand") (plus: (sign_extend: - (match_operand:VQW 1 "register_operand" "")) - (match_operand: 2 "register_operand" "")))] + (match_operand:VQW 1 "register_operand")) + (match_operand: 2 "register_operand")))] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); @@ -3358,10 +3550,10 @@ ) (define_expand "widen_ssum3" - [(set (match_operand: 0 "register_operand" "") + [(set (match_operand: 0 "register_operand") (plus: (sign_extend: - (match_operand:VD_BHSI 1 "register_operand" "")) - (match_operand: 2 "register_operand" "")))] + (match_operand:VD_BHSI 1 "register_operand")) + (match_operand: 2 "register_operand")))] "TARGET_SIMD" { emit_insn (gen_aarch64_saddw (operands[0], operands[2], operands[1])); @@ -3369,10 +3561,10 @@ }) (define_expand "widen_usum3" - [(set (match_operand: 0 "register_operand" "") + [(set (match_operand: 0 "register_operand") (plus: (zero_extend: - (match_operand:VQW 1 "register_operand" "")) - (match_operand: 2 "register_operand" "")))] + (match_operand:VQW 1 "register_operand")) + (match_operand: 2 "register_operand")))] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , false); @@ -3386,10 +3578,10 @@ ) (define_expand "widen_usum3" - [(set (match_operand: 0 "register_operand" "") + [(set (match_operand: 0 "register_operand") (plus: (zero_extend: - (match_operand:VD_BHSI 1 "register_operand" "")) - (match_operand: 2 "register_operand" "")))] + (match_operand:VD_BHSI 1 "register_operand")) + (match_operand: 2 "register_operand")))] "TARGET_SIMD" { emit_insn (gen_aarch64_uaddw (operands[0], operands[2], operands[1])); @@ -3467,9 +3659,9 @@ ) (define_expand "aarch64_saddw2" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQW 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQW 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -3479,9 +3671,9 @@ }) (define_expand "aarch64_uaddw2" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQW 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQW 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -3492,9 +3684,9 @@ (define_expand "aarch64_ssubw2" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQW 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand: 1 
"register_operand") + (match_operand:VQW 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -3504,9 +3696,9 @@ }) (define_expand "aarch64_usubw2" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQW 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQW 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4039,10 +4231,10 @@ ) (define_expand "aarch64_sqdmlal2" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 3 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand") + (match_operand:VQ_HSI 3 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4052,10 +4244,10 @@ }) (define_expand "aarch64_sqdmlsl2" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand:VQ_HSI 3 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand") + (match_operand:VQ_HSI 3 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4119,11 +4311,11 @@ ) (define_expand "aarch64_sqdmlal2_lane" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand: 3 "register_operand" "") - (match_operand:SI 4 "immediate_operand" "i")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand") + (match_operand: 3 "register_operand") + (match_operand:SI 4 "immediate_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4134,11 +4326,11 @@ }) (define_expand "aarch64_sqdmlal2_laneq" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand: 3 "register_operand" "") - (match_operand:SI 4 "immediate_operand" "i")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand") + (match_operand: 3 "register_operand") + (match_operand:SI 4 "immediate_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4149,11 +4341,11 @@ }) (define_expand "aarch64_sqdmlsl2_lane" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand: 3 "register_operand" "") - (match_operand:SI 4 "immediate_operand" "i")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand") + (match_operand: 3 "register_operand") + (match_operand:SI 4 "immediate_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4164,11 +4356,11 @@ }) (define_expand "aarch64_sqdmlsl2_laneq" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand: 3 "register_operand" "") - (match_operand:SI 4 "immediate_operand" 
"i")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand") + (match_operand: 3 "register_operand") + (match_operand:SI 4 "immediate_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4198,10 +4390,10 @@ ) (define_expand "aarch64_sqdmlal2_n" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand: 3 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand") + (match_operand: 3 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4212,10 +4404,10 @@ }) (define_expand "aarch64_sqdmlsl2_n" - [(match_operand: 0 "register_operand" "=w") - (match_operand: 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w") - (match_operand: 3 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand") + (match_operand: 3 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4367,9 +4559,9 @@ ) (define_expand "aarch64_sqdmull2" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand:VQ_HSI 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand:VQ_HSI 1 "register_operand") + (match_operand:VQ_HSI 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4427,10 +4619,10 @@ ) (define_expand "aarch64_sqdmull2_lane" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand: 2 "register_operand" "") - (match_operand:SI 3 "immediate_operand" "i")] + [(match_operand: 0 "register_operand") + (match_operand:VQ_HSI 1 "register_operand") + (match_operand: 2 "register_operand") + (match_operand:SI 3 "immediate_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4441,10 +4633,10 @@ }) (define_expand "aarch64_sqdmull2_laneq" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand: 2 "register_operand" "") - (match_operand:SI 3 "immediate_operand" "i")] + [(match_operand: 0 "register_operand") + (match_operand:VQ_HSI 1 "register_operand") + (match_operand: 2 "register_operand") + (match_operand:SI 3 "immediate_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4475,9 +4667,9 @@ ) (define_expand "aarch64_sqdmull2_n" - [(match_operand: 0 "register_operand" "=w") - (match_operand:VQ_HSI 1 "register_operand" "w") - (match_operand: 2 "register_operand" "w")] + [(match_operand: 0 "register_operand") + (match_operand:VQ_HSI 1 "register_operand") + (match_operand: 2 "register_operand")] "TARGET_SIMD" { rtx p = aarch64_simd_vect_par_cnst_half (mode, , true); @@ -4879,8 +5071,8 @@ ;; sqrt (define_expand "sqrt2" - [(set (match_operand:VHSDF 0 "register_operand" "=w") - (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))] + [(set (match_operand:VHSDF 0 "register_operand") + (sqrt:VHSDF (match_operand:VHSDF 1 "register_operand")))] "TARGET_SIMD" { if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) @@ -4933,8 +5125,8 @@ ) (define_expand "vec_load_lanesoi" - [(set (match_operand:OI 0 "register_operand" "=w") - 
(unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand" "Utv") + [(set (match_operand:OI 0 "register_operand") + (unspec:OI [(match_operand:OI 1 "aarch64_simd_struct_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD2))] "TARGET_SIMD" @@ -4977,8 +5169,8 @@ ) (define_expand "vec_store_lanesoi" - [(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv") - (unspec:OI [(match_operand:OI 1 "register_operand" "w") + [(set (match_operand:OI 0 "aarch64_simd_struct_operand") + (unspec:OI [(match_operand:OI 1 "register_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST2))] "TARGET_SIMD" @@ -5031,8 +5223,8 @@ ) (define_expand "vec_load_lanesci" - [(set (match_operand:CI 0 "register_operand" "=w") - (unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv") + [(set (match_operand:CI 0 "register_operand") + (unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD3))] "TARGET_SIMD" @@ -5075,8 +5267,8 @@ ) (define_expand "vec_store_lanesci" - [(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv") - (unspec:CI [(match_operand:CI 1 "register_operand" "w") + [(set (match_operand:CI 0 "aarch64_simd_struct_operand") + (unspec:CI [(match_operand:CI 1 "register_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST3))] "TARGET_SIMD" @@ -5129,8 +5321,8 @@ ) (define_expand "vec_load_lanesxi" - [(set (match_operand:XI 0 "register_operand" "=w") - (unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv") + [(set (match_operand:XI 0 "register_operand") + (unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_LD4))] "TARGET_SIMD" @@ -5173,8 +5365,8 @@ ) (define_expand "vec_store_lanesxi" - [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv") - (unspec:XI [(match_operand:XI 1 "register_operand" "w") + [(set (match_operand:XI 0 "aarch64_simd_struct_operand") + (unspec:XI [(match_operand:XI 1 "register_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_ST4))] "TARGET_SIMD" @@ -5219,8 +5411,8 @@ ;; Reload patterns for AdvSIMD register list operands. (define_expand "mov" - [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "") - (match_operand:VSTRUCT 1 "general_operand" ""))] + [(set (match_operand:VSTRUCT 0 "nonimmediate_operand") + (match_operand:VSTRUCT 1 "general_operand"))] "TARGET_SIMD" { if (can_create_pseudo_p ()) @@ -5232,8 +5424,8 @@ (define_expand "aarch64_ld1x3" - [(match_operand:CI 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "r") + [(match_operand:CI 0 "register_operand") + (match_operand:DI 1 "register_operand") (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5252,9 +5444,31 @@ [(set_attr "type" "neon_load1_3reg")] ) +(define_expand "aarch64_ld1x4" + [(match_operand:XI 0 "register_operand" "=w") + (match_operand:DI 1 "register_operand" "r") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + rtx mem = gen_rtx_MEM (XImode, operands[1]); + emit_insn (gen_aarch64_ld1_x4_ (operands[0], mem)); + DONE; +}) + +(define_insn "aarch64_ld1_x4_" + [(set (match_operand:XI 0 "register_operand" "=w") + (unspec:XI + [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv") + (unspec:VALLDIF [(const_int 4)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_LD1))] + "TARGET_SIMD" + "ld1\\t{%S0. 
- %V0.}, %1" + [(set_attr "type" "neon_load1_4reg")] +) + (define_expand "aarch64_st1x2" - [(match_operand:DI 0 "register_operand" "") - (match_operand:OI 1 "register_operand" "") + [(match_operand:DI 0 "register_operand") + (match_operand:OI 1 "register_operand") (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5274,8 +5488,8 @@ ) (define_expand "aarch64_st1x3" - [(match_operand:DI 0 "register_operand" "") - (match_operand:CI 1 "register_operand" "") + [(match_operand:DI 0 "register_operand") + (match_operand:CI 1 "register_operand") (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5294,6 +5508,28 @@ [(set_attr "type" "neon_store1_3reg")] ) +(define_expand "aarch64_st1x4" + [(match_operand:DI 0 "register_operand" "") + (match_operand:XI 1 "register_operand" "") + (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + "TARGET_SIMD" +{ + rtx mem = gen_rtx_MEM (XImode, operands[0]); + emit_insn (gen_aarch64_st1_x4_ (mem, operands[1])); + DONE; +}) + +(define_insn "aarch64_st1_x4_" + [(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv") + (unspec:XI + [(match_operand:XI 1 "register_operand" "w") + (unspec:VALLDIF [(const_int 4)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_ST1))] + "TARGET_SIMD" + "st1\\t{%S1. - %V1.}, %0" + [(set_attr "type" "neon_store1_4reg")] +) + (define_insn "*aarch64_mov" [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand" "=w,Utv,w") (match_operand:VSTRUCT 1 "aarch64_simd_general_operand" " w,w,Utv"))] @@ -5427,8 +5663,8 @@ }) (define_expand "aarch64_ldr" - [(match_operand:VSTRUCT 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "w") + [(match_operand:VSTRUCT 0 "register_operand") + (match_operand:DI 1 "register_operand") (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5502,8 +5738,8 @@ ) (define_expand "aarch64_ld" - [(match_operand:VSTRUCT 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "r") + [(match_operand:VSTRUCT 0 "register_operand") + (match_operand:DI 1 "register_operand") (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5530,8 +5766,8 @@ }) (define_expand "aarch64_ld" - [(match_operand:VSTRUCT 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "r") + [(match_operand:VSTRUCT 0 "register_operand") + (match_operand:DI 1 "register_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5543,8 +5779,8 @@ }) (define_expand "aarch64_ld1x2" - [(match_operand:OI 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "r") + [(match_operand:OI 0 "register_operand") + (match_operand:DI 1 "register_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5556,8 +5792,8 @@ }) (define_expand "aarch64_ld1x2" - [(match_operand:OI 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "r") + [(match_operand:OI 0 "register_operand") + (match_operand:DI 1 "register_operand") (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5570,10 +5806,10 @@ (define_expand "aarch64_ld_lane" - [(match_operand:VSTRUCT 0 "register_operand" "=w") - (match_operand:DI 1 "register_operand" "w") - (match_operand:VSTRUCT 2 "register_operand" "0") - (match_operand:SI 3 "immediate_operand" "i") + [(match_operand:VSTRUCT 0 "register_operand") + (match_operand:DI 1 "register_operand") + (match_operand:VSTRUCT 2 "register_operand") + (match_operand:SI 3 "immediate_operand") (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5593,9 
+5829,9 @@ ;; D-register list. (define_expand "aarch64_get_dreg" - [(match_operand:VDC 0 "register_operand" "=w") - (match_operand:VSTRUCT 1 "register_operand" "w") - (match_operand:SI 2 "immediate_operand" "i")] + [(match_operand:VDC 0 "register_operand") + (match_operand:VSTRUCT 1 "register_operand") + (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" { int part = INTVAL (operands[2]); @@ -5610,9 +5846,9 @@ ;; Q-register list. (define_expand "aarch64_get_qreg" - [(match_operand:VQ 0 "register_operand" "=w") - (match_operand:VSTRUCT 1 "register_operand" "w") - (match_operand:SI 2 "immediate_operand" "i")] + [(match_operand:VQ 0 "register_operand") + (match_operand:VSTRUCT 1 "register_operand") + (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" { int part = INTVAL (operands[2]); @@ -5749,13 +5985,13 @@ ;; This instruction's pattern is generated directly by ;; aarch64_expand_vec_perm_const, so any changes to the pattern would ;; need corresponding changes there. -(define_insn "aarch64_" +(define_insn "aarch64_" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w") (match_operand:VALL_F16 2 "register_operand" "w")] PERMUTE))] "TARGET_SIMD" - "\\t%0., %1., %2." + "\\t%0., %1., %2." [(set_attr "type" "neon_permute")] ) @@ -5851,8 +6087,8 @@ ) (define_expand "aarch64_st" - [(match_operand:DI 0 "register_operand" "r") - (match_operand:VSTRUCT 1 "register_operand" "w") + [(match_operand:DI 0 "register_operand") + (match_operand:VSTRUCT 1 "register_operand") (unspec:VDC [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5864,8 +6100,8 @@ }) (define_expand "aarch64_st" - [(match_operand:DI 0 "register_operand" "r") - (match_operand:VSTRUCT 1 "register_operand" "w") + [(match_operand:DI 0 "register_operand") + (match_operand:VSTRUCT 1 "register_operand") (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_SIMD" { @@ -5877,8 +6113,8 @@ }) (define_expand "aarch64_st_lane" - [(match_operand:DI 0 "register_operand" "r") - (match_operand:VSTRUCT 1 "register_operand" "w") + [(match_operand:DI 0 "register_operand") + (match_operand:VSTRUCT 1 "register_operand") (unspec:VALLDIF [(const_int 0)] UNSPEC_VSTRUCTDUMMY) (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" @@ -5914,10 +6150,10 @@ ;; extend them in arm_neon.h and insert the resulting Q-regs. (define_expand "aarch64_set_qreg" - [(match_operand:VSTRUCT 0 "register_operand" "+w") - (match_operand:VSTRUCT 1 "register_operand" "0") - (match_operand:VQ 2 "register_operand" "w") - (match_operand:SI 3 "immediate_operand" "i")] + [(match_operand:VSTRUCT 0 "register_operand") + (match_operand:VSTRUCT 1 "register_operand") + (match_operand:VQ 2 "register_operand") + (match_operand:SI 3 "immediate_operand")] "TARGET_SIMD" { int part = INTVAL (operands[3]); @@ -5932,7 +6168,7 @@ ;; Standard pattern name vec_init. (define_expand "vec_init" - [(match_operand:VALL_F16 0 "register_operand" "") + [(match_operand:VALL_F16 0 "register_operand") (match_operand 1 "" "")] "TARGET_SIMD" { @@ -5941,7 +6177,7 @@ }) (define_expand "vec_init" - [(match_operand:VQ_NO2E 0 "register_operand" "") + [(match_operand:VQ_NO2E 0 "register_operand") (match_operand 1 "" "")] "TARGET_SIMD" { @@ -6020,9 +6256,9 @@ ;; Standard pattern name vec_extract. 
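;; Editorial note (not part of the upstream patch): standard-named expanders
;; such as vec_extract, vec_init and vec_set are reached through the optab
;; machinery rather than by constraint matching, which is also why the many
;; hunks in this file that merely delete constraint strings ("=w", "w", "i")
;; from define_expand operands are behaviour-preserving: constraints matter
;; on define_insn but are ignored on define_expand.  A rough sketch, with
;; illustrative modes and placeholder names only:
;;
;;   /* Sketch: how the middle end might reach a vec_extract expander.  */
;;   insn_code icode = convert_optab_handler (vec_extract_optab,
;;                                            V4SImode, SImode);
;;   if (icode != CODE_FOR_nothing)
;;     emit_insn (GEN_FCN (icode) (dest, src, GEN_INT (lane)));
;;
;; dest, src and lane above are placeholders, not names from the patch.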
(define_expand "vec_extract" - [(match_operand: 0 "aarch64_simd_nonimmediate_operand" "") - (match_operand:VALL_F16 1 "register_operand" "") - (match_operand:SI 2 "immediate_operand" "")] + [(match_operand: 0 "aarch64_simd_nonimmediate_operand") + (match_operand:VALL_F16 1 "register_operand") + (match_operand:SI 2 "immediate_operand")] "TARGET_SIMD" { emit_insn @@ -6063,56 +6299,23 @@ (define_insn "aarch64_crypto_aesv16qi" [(set (match_operand:V16QI 0 "register_operand" "=w") - (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "%0") - (match_operand:V16QI 2 "register_operand" "w")] + (unspec:V16QI + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] CRYPTO_AES))] "TARGET_SIMD && TARGET_AES" "aes\\t%0.16b, %2.16b" [(set_attr "type" "crypto_aese")] ) -(define_insn "*aarch64_crypto_aesv16qi_xor_combine" - [(set (match_operand:V16QI 0 "register_operand" "=w") - (unspec:V16QI [(xor:V16QI - (match_operand:V16QI 1 "register_operand" "%0") - (match_operand:V16QI 2 "register_operand" "w")) - (match_operand:V16QI 3 "aarch64_simd_imm_zero" "")] - CRYPTO_AES))] - "TARGET_SIMD && TARGET_AES" - "aes\\t%0.16b, %2.16b" - [(set_attr "type" "crypto_aese")] -) - -(define_insn "*aarch64_crypto_aesv16qi_xor_combine" - [(set (match_operand:V16QI 0 "register_operand" "=w") - (unspec:V16QI [(match_operand:V16QI 3 "aarch64_simd_imm_zero" "") - (xor:V16QI (match_operand:V16QI 1 "register_operand" "%0") - (match_operand:V16QI 2 "register_operand" "w"))] - CRYPTO_AES))] - "TARGET_SIMD && TARGET_AES" - "aes\\t%0.16b, %2.16b" - [(set_attr "type" "crypto_aese")] -) - -;; When AES/AESMC fusion is enabled we want the register allocation to -;; look like: -;; AESE Vn, _ -;; AESMC Vn, Vn -;; So prefer to tie operand 1 to operand 0 when fusing. - (define_insn "aarch64_crypto_aesv16qi" - [(set (match_operand:V16QI 0 "register_operand" "=w,w") - (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,w")] + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "w")] CRYPTO_AESMC))] "TARGET_SIMD && TARGET_AES" "aes\\t%0.16b, %1.16b" - [(set_attr "type" "crypto_aesmc") - (set_attr_alternative "enabled" - [(if_then_else (match_test - "aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)") - (const_string "yes" ) - (const_string "no")) - (const_string "yes")])] + [(set_attr "type" "crypto_aesmc")] ) ;; When AESE/AESMC fusion is enabled we really want to keep the two together @@ -6121,12 +6324,14 @@ ;; Mash the two together during combine. (define_insn "*aarch64_crypto_aese_fused" - [(set (match_operand:V16QI 0 "register_operand" "=&w") + [(set (match_operand:V16QI 0 "register_operand" "=w") (unspec:V16QI [(unspec:V16QI - [(match_operand:V16QI 1 "register_operand" "0") - (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESE) - ] UNSPEC_AESMC))] + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + UNSPEC_AESE)] + UNSPEC_AESMC))] "TARGET_SIMD && TARGET_AES && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)" "aese\\t%0.16b, %2.16b\;aesmc\\t%0.16b, %0.16b" @@ -6140,12 +6345,14 @@ ;; Mash the two together during combine. 
(define_insn "*aarch64_crypto_aesd_fused" - [(set (match_operand:V16QI 0 "register_operand" "=&w") + [(set (match_operand:V16QI 0 "register_operand" "=w") (unspec:V16QI [(unspec:V16QI - [(match_operand:V16QI 1 "register_operand" "0") - (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESD) - ] UNSPEC_AESIMC))] + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + UNSPEC_AESD)] + UNSPEC_AESIMC))] "TARGET_SIMD && TARGET_AES && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)" "aesd\\t%0.16b, %2.16b\;aesimc\\t%0.16b, %0.16b" @@ -6397,11 +6604,11 @@ ;; fp16fml (define_expand "aarch64_fmll_low" - [(set (match_operand:VDQSF 0 "register_operand" "=w") + [(set (match_operand:VDQSF 0 "register_operand") (unspec:VDQSF - [(match_operand:VDQSF 1 "register_operand" "0") - (match_operand: 2 "register_operand" "w") - (match_operand: 3 "register_operand" "w")] + [(match_operand:VDQSF 1 "register_operand") + (match_operand: 2 "register_operand") + (match_operand: 3 "register_operand")] VFMLA16_LOW))] "TARGET_F16FML" { @@ -6420,11 +6627,11 @@ }) (define_expand "aarch64_fmll_high" - [(set (match_operand:VDQSF 0 "register_operand" "=w") + [(set (match_operand:VDQSF 0 "register_operand") (unspec:VDQSF - [(match_operand:VDQSF 1 "register_operand" "0") - (match_operand: 2 "register_operand" "w") - (match_operand: 3 "register_operand" "w")] + [(match_operand:VDQSF 1 "register_operand") + (match_operand: 2 "register_operand") + (match_operand: 3 "register_operand")] VFMLA16_HIGH))] "TARGET_F16FML" { @@ -6510,11 +6717,11 @@ ) (define_expand "aarch64_fmll_lane_lowv2sf" - [(set (match_operand:V2SF 0 "register_operand" "") - (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") - (match_operand:V4HF 2 "register_operand" "") - (match_operand:V4HF 3 "register_operand" "") - (match_operand:SI 4 "aarch64_imm2" "")] + [(set (match_operand:V2SF 0 "register_operand") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand") + (match_operand:V4HF 2 "register_operand") + (match_operand:V4HF 3 "register_operand") + (match_operand:SI 4 "aarch64_imm2")] VFMLA16_LOW))] "TARGET_F16FML" { @@ -6531,11 +6738,11 @@ ) (define_expand "aarch64_fmll_lane_highv2sf" - [(set (match_operand:V2SF 0 "register_operand" "") - (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") - (match_operand:V4HF 2 "register_operand" "") - (match_operand:V4HF 3 "register_operand" "") - (match_operand:SI 4 "aarch64_imm2" "")] + [(set (match_operand:V2SF 0 "register_operand") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand") + (match_operand:V4HF 2 "register_operand") + (match_operand:V4HF 3 "register_operand") + (match_operand:SI 4 "aarch64_imm2")] VFMLA16_HIGH))] "TARGET_F16FML" { @@ -6625,11 +6832,11 @@ ) (define_expand "aarch64_fmllq_laneq_lowv4sf" - [(set (match_operand:V4SF 0 "register_operand" "") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") - (match_operand:V8HF 2 "register_operand" "") - (match_operand:V8HF 3 "register_operand" "") - (match_operand:SI 4 "aarch64_lane_imm3" "")] + [(set (match_operand:V4SF 0 "register_operand") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand") + (match_operand:V8HF 2 "register_operand") + (match_operand:V8HF 3 "register_operand") + (match_operand:SI 4 "aarch64_lane_imm3")] VFMLA16_LOW))] "TARGET_F16FML" { @@ -6645,11 +6852,11 @@ }) (define_expand "aarch64_fmllq_laneq_highv4sf" - [(set (match_operand:V4SF 0 "register_operand" "") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") - (match_operand:V8HF 
2 "register_operand" "") - (match_operand:V8HF 3 "register_operand" "") - (match_operand:SI 4 "aarch64_lane_imm3" "")] + [(set (match_operand:V4SF 0 "register_operand") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand") + (match_operand:V8HF 2 "register_operand") + (match_operand:V8HF 3 "register_operand") + (match_operand:SI 4 "aarch64_lane_imm3")] VFMLA16_HIGH))] "TARGET_F16FML" { @@ -6739,11 +6946,11 @@ ) (define_expand "aarch64_fmll_laneq_lowv2sf" - [(set (match_operand:V2SF 0 "register_operand" "") - (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") - (match_operand:V4HF 2 "register_operand" "") - (match_operand:V8HF 3 "register_operand" "") - (match_operand:SI 4 "aarch64_lane_imm3" "")] + [(set (match_operand:V2SF 0 "register_operand") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand") + (match_operand:V4HF 2 "register_operand") + (match_operand:V8HF 3 "register_operand") + (match_operand:SI 4 "aarch64_lane_imm3")] VFMLA16_LOW))] "TARGET_F16FML" { @@ -6760,11 +6967,11 @@ }) (define_expand "aarch64_fmll_laneq_highv2sf" - [(set (match_operand:V2SF 0 "register_operand" "") - (unspec:V2SF [(match_operand:V2SF 1 "register_operand" "") - (match_operand:V4HF 2 "register_operand" "") - (match_operand:V8HF 3 "register_operand" "") - (match_operand:SI 4 "aarch64_lane_imm3" "")] + [(set (match_operand:V2SF 0 "register_operand") + (unspec:V2SF [(match_operand:V2SF 1 "register_operand") + (match_operand:V4HF 2 "register_operand") + (match_operand:V8HF 3 "register_operand") + (match_operand:SI 4 "aarch64_lane_imm3")] VFMLA16_HIGH))] "TARGET_F16FML" { @@ -6855,11 +7062,11 @@ ) (define_expand "aarch64_fmllq_lane_lowv4sf" - [(set (match_operand:V4SF 0 "register_operand" "") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") - (match_operand:V8HF 2 "register_operand" "") - (match_operand:V4HF 3 "register_operand" "") - (match_operand:SI 4 "aarch64_imm2" "")] + [(set (match_operand:V4SF 0 "register_operand") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand") + (match_operand:V8HF 2 "register_operand") + (match_operand:V4HF 3 "register_operand") + (match_operand:SI 4 "aarch64_imm2")] VFMLA16_LOW))] "TARGET_F16FML" { @@ -6875,11 +7082,11 @@ }) (define_expand "aarch64_fmllq_lane_highv4sf" - [(set (match_operand:V4SF 0 "register_operand" "") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "") - (match_operand:V8HF 2 "register_operand" "") - (match_operand:V4HF 3 "register_operand" "") - (match_operand:SI 4 "aarch64_imm2" "")] + [(set (match_operand:V4SF 0 "register_operand") + (unspec:V4SF [(match_operand:V4SF 1 "register_operand") + (match_operand:V8HF 2 "register_operand") + (match_operand:V4HF 3 "register_operand") + (match_operand:SI 4 "aarch64_imm2")] VFMLA16_HIGH))] "TARGET_F16FML" { diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc new file mode 100644 index 000000000..b28ded0f5 --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -0,0 +1,2760 @@ +/* ACLE support for AArch64 SVE (__ARM_FEATURE_SVE intrinsics) + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "rtl.h" +#include "tm_p.h" +#include "memmodel.h" +#include "insn-codes.h" +#include "optabs.h" +#include "recog.h" +#include "expr.h" +#include "basic-block.h" +#include "function.h" +#include "fold-const.h" +#include "gimple.h" +#include "gimple-iterator.h" +#include "gimplify.h" +#include "explow.h" +#include "emit-rtl.h" +#include "tree-vector-builder.h" +#include "rtx-vector-builder.h" +#include "vec-perm-indices.h" +#include "aarch64-sve-builtins.h" +#include "aarch64-sve-builtins-shapes.h" +#include "aarch64-sve-builtins-base.h" +#include "aarch64-sve-builtins-functions.h" + +using namespace aarch64_sve; + +namespace { + +/* Expand a call to svmad, or svmla after reordering its operands. + Make _m forms merge with argument MERGE_ARGNO. */ +static rtx +expand_mad (function_expander &e, + unsigned int merge_argno = DEFAULT_MERGE_ARGNO) +{ + if (e.pred == PRED_x) + { + insn_code icode; + if (e.type_suffix (0).integer_p) + icode = code_for_aarch64_pred_fma (e.vector_mode (0)); + else + icode = code_for_aarch64_pred (UNSPEC_COND_FMLA, e.vector_mode (0)); + return e.use_pred_x_insn (icode); + } + + insn_code icode = e.direct_optab_handler (cond_fma_optab); + return e.use_cond_insn (icode, merge_argno); +} + +/* Expand a call to svmsb, or svmls after reordering its operands. + Make _m forms merge with argument MERGE_ARGNO. */ +static rtx +expand_msb (function_expander &e, + unsigned int merge_argno = DEFAULT_MERGE_ARGNO) +{ + if (e.pred == PRED_x) + { + insn_code icode; + if (e.type_suffix (0).integer_p) + icode = code_for_aarch64_pred_fnma (e.vector_mode (0)); + else + icode = code_for_aarch64_pred (UNSPEC_COND_FMLS, e.vector_mode (0)); + return e.use_pred_x_insn (icode); + } + + insn_code icode = e.direct_optab_handler (cond_fnma_optab); + return e.use_cond_insn (icode, merge_argno); +} + +class svabd_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* The integer operations are represented as the subtraction of the + minimum from the maximum, with the signedness of the instruction + keyed off the signedness of the maximum operation. */ + rtx_code max_code = e.type_suffix (0).unsigned_p ? UMAX : SMAX; + insn_code icode; + if (e.pred == PRED_x) + { + if (e.type_suffix (0).integer_p) + icode = code_for_aarch64_pred_abd (max_code, e.vector_mode (0)); + else + icode = code_for_aarch64_pred_abd (e.vector_mode (0)); + return e.use_pred_x_insn (icode); + } + + if (e.type_suffix (0).integer_p) + icode = code_for_aarch64_cond_abd (max_code, e.vector_mode (0)); + else + icode = code_for_aarch64_cond_abd (e.vector_mode (0)); + return e.use_cond_insn (icode); + } +}; + +/* Implements svacge, svacgt, svacle and svaclt. 
*/ +class svac_impl : public function_base +{ +public: + CONSTEXPR svac_impl (int unspec) : m_unspec (unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + e.add_ptrue_hint (0, e.gp_mode (0)); + insn_code icode = code_for_aarch64_pred_fac (m_unspec, e.vector_mode (0)); + return e.use_exact_insn (icode); + } + + /* The unspec code for the underlying comparison. */ + int m_unspec; +}; + +class svadda_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* Put the predicate last, as required by mask_fold_left_plus_optab. */ + e.rotate_inputs_left (0, 3); + machine_mode mode = e.vector_mode (0); + insn_code icode = direct_optab_handler (mask_fold_left_plus_optab, mode); + return e.use_exact_insn (icode); + } +}; + +/* Implements svadr[bhwd]. */ +class svadr_bhwd_impl : public function_base +{ +public: + CONSTEXPR svadr_bhwd_impl (unsigned int shift) : m_shift (shift) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = GET_MODE (e.args[0]); + if (m_shift == 0) + return e.use_exact_insn (code_for_aarch64_adr (mode)); + + /* Turn the access size into an extra shift argument. */ + rtx shift = gen_int_mode (m_shift, GET_MODE_INNER (mode)); + e.args.quick_push (expand_vector_broadcast (mode, shift)); + return e.use_exact_insn (code_for_aarch64_adr_shift (mode)); + } + + /* How many bits left to shift the vector displacement. */ + unsigned int m_shift; +}; + +class svasrd_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_cond_insn (code_for_cond_asrd (e.vector_mode (0))); + } +}; + +class svbic_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* Convert svbic of a constant into svand of its inverse. */ + if (CONST_INT_P (e.args[2])) + { + machine_mode mode = GET_MODE_INNER (e.vector_mode (0)); + e.args[2] = simplify_unary_operation (NOT, mode, e.args[2], mode); + return e.map_to_rtx_codes (AND, AND, -1); + } + + if (e.type_suffix_ids[0] == TYPE_SUFFIX_b) + { + gcc_assert (e.pred == PRED_z); + return e.use_exact_insn (CODE_FOR_aarch64_pred_bicvnx16bi_z); + } + + if (e.pred == PRED_x) + return e.use_unpred_insn (code_for_aarch64_bic (e.vector_mode (0))); + + return e.use_cond_insn (code_for_cond_bic (e.vector_mode (0))); + } +}; + +/* Implements svbrkn, svbrkpa and svbrkpb. */ +class svbrk_binary_impl : public function_base +{ +public: + CONSTEXPR svbrk_binary_impl (int unspec) : m_unspec (unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (code_for_aarch64_brk (m_unspec)); + } + + /* The unspec code associated with the operation. */ + int m_unspec; +}; + +/* Implements svbrka and svbrkb. */ +class svbrk_unary_impl : public function_base +{ +public: + CONSTEXPR svbrk_unary_impl (int unspec) : m_unspec (unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_cond_insn (code_for_aarch64_brk (m_unspec)); + } + + /* The unspec code associated with the operation. */ + int m_unspec; +}; + +class svcadd_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* Convert the rotation amount into a specific unspec. */ + int rot = INTVAL (e.args[3]); + e.args.ordered_remove (3); + int unspec = (rot == 90 ? UNSPEC_COND_FCADD90 + : rot == 270 ? UNSPEC_COND_FCADD270 + : (gcc_unreachable (), 0)); + return e.map_to_unspecs (-1, -1, unspec); + } +}; + +/* Implements svclasta and svclastb. 
*/ +class svclast_impl : public quiet +{ +public: + CONSTEXPR svclast_impl (int unspec) : m_unspec (unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Match the fold_extract_optab order. */ + std::swap (e.args[0], e.args[1]); + machine_mode mode = e.vector_mode (0); + insn_code icode; + if (e.mode_suffix_id == MODE_n) + icode = code_for_fold_extract (m_unspec, mode); + else + icode = code_for_aarch64_fold_extract_vector (m_unspec, mode); + return e.use_exact_insn (icode); + } + + /* The unspec code associated with the operation. */ + int m_unspec; +}; + +class svcmla_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* Convert the rotation amount into a specific unspec. */ + int rot = INTVAL (e.args[4]); + e.args.ordered_remove (4); + int unspec = (rot == 0 ? UNSPEC_COND_FCMLA + : rot == 90 ? UNSPEC_COND_FCMLA90 + : rot == 180 ? UNSPEC_COND_FCMLA180 + : rot == 270 ? UNSPEC_COND_FCMLA270 + : (gcc_unreachable (), 0)); + + /* Make the operand order the same as the one used by the fma optabs, + with the accumulator last. */ + e.rotate_inputs_left (1, 4); + return e.map_to_unspecs (-1, -1, unspec, 3); + } +}; + +class svcmla_lane_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* Convert the rotation amount into a specific unspec. */ + int rot = INTVAL (e.args[4]); + e.args.ordered_remove (4); + int unspec = (rot == 0 ? UNSPEC_FCMLA + : rot == 90 ? UNSPEC_FCMLA90 + : rot == 180 ? UNSPEC_FCMLA180 + : rot == 270 ? UNSPEC_FCMLA270 + : (gcc_unreachable (), 0)); + + /* Make the operand order the same as the one used by the fma optabs, + with the accumulator last. */ + e.rotate_inputs_left (0, 4); + insn_code icode = code_for_aarch64_lane (unspec, e.vector_mode (0)); + return e.use_exact_insn (icode); + } +}; + +/* Implements svcmp (except svcmpuo, which is handled separately). */ +class svcmp_impl : public function_base +{ +public: + CONSTEXPR svcmp_impl (tree_code code, int unspec_for_fp) + : m_code (code), m_unspec_for_fp (unspec_for_fp) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + + /* Comparisons are UNSPEC_PRED_Z operations and so need a hint + operand. */ + e.add_ptrue_hint (0, e.gp_mode (0)); + + if (e.type_suffix (0).integer_p) + { + bool unsigned_p = e.type_suffix (0).unsigned_p; + rtx_code code = get_rtx_code (m_code, unsigned_p); + return e.use_exact_insn (code_for_aarch64_pred_cmp (code, mode)); + } + + insn_code icode = code_for_aarch64_pred_fcm (m_unspec_for_fp, mode); + return e.use_exact_insn (icode); + } + + /* The tree code associated with the comparison. */ + tree_code m_code; + + /* The unspec code to use for floating-point comparisons. */ + int m_unspec_for_fp; +}; + +/* Implements svcmp_wide. */ +class svcmp_wide_impl : public function_base +{ +public: + CONSTEXPR svcmp_wide_impl (tree_code code, int unspec_for_sint, + int unspec_for_uint) + : m_code (code), m_unspec_for_sint (unspec_for_sint), + m_unspec_for_uint (unspec_for_uint) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + bool unsigned_p = e.type_suffix (0).unsigned_p; + rtx_code code = get_rtx_code (m_code, unsigned_p); + + /* Comparisons are UNSPEC_PRED_Z operations and so need a hint + operand. */ + e.add_ptrue_hint (0, e.gp_mode (0)); + + /* If the argument is a constant that the unwidened comparisons + can handle directly, use them instead. 
*/ + insn_code icode = code_for_aarch64_pred_cmp (code, mode); + rtx op2 = unwrap_const_vec_duplicate (e.args[3]); + if (CONSTANT_P (op2) + && insn_data[icode].operand[4].predicate (op2, DImode)) + { + e.args[3] = op2; + return e.use_exact_insn (icode); + } + + int unspec = (unsigned_p ? m_unspec_for_uint : m_unspec_for_sint); + return e.use_exact_insn (code_for_aarch64_pred_cmp_wide (unspec, mode)); + } + + /* The tree code associated with the comparison. */ + tree_code m_code; + + /* The unspec codes for signed and unsigned wide comparisons + respectively. */ + int m_unspec_for_sint; + int m_unspec_for_uint; +}; + +class svcmpuo_impl : public quiet +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + e.add_ptrue_hint (0, e.gp_mode (0)); + return e.use_exact_insn (code_for_aarch64_pred_fcmuo (e.vector_mode (0))); + } +}; + +class svcnot_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + if (e.pred == PRED_x) + { + /* The pattern for CNOT includes an UNSPEC_PRED_Z, so needs + a ptrue hint. */ + e.add_ptrue_hint (0, e.gp_mode (0)); + return e.use_pred_x_insn (code_for_aarch64_pred_cnot (mode)); + } + + return e.use_cond_insn (code_for_cond_cnot (mode), 0); + } +}; + +/* Implements svcnt[bhwd], which count the number of elements + in a particular vector mode. */ +class svcnt_bhwd_impl : public function_base +{ +public: + CONSTEXPR svcnt_bhwd_impl (machine_mode ref_mode) : m_ref_mode (ref_mode) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree count = build_int_cstu (TREE_TYPE (f.lhs), + GET_MODE_NUNITS (m_ref_mode)); + return gimple_build_assign (f.lhs, count); + } + + rtx + expand (function_expander &) const OVERRIDE + { + return gen_int_mode (GET_MODE_NUNITS (m_ref_mode), DImode); + } + + /* The mode of the vector associated with the [bhwd] suffix. */ + machine_mode m_ref_mode; +}; + +/* Implements svcnt[bhwd]_pat. */ +class svcnt_bhwd_pat_impl : public svcnt_bhwd_impl +{ +public: + CONSTEXPR svcnt_bhwd_pat_impl (machine_mode ref_mode) + : svcnt_bhwd_impl (ref_mode) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree pattern_arg = gimple_call_arg (f.call, 0); + aarch64_svpattern pattern = (aarch64_svpattern) tree_to_shwi (pattern_arg); + + if (pattern == AARCH64_SV_ALL) + /* svcvnt[bwhd]_pat (SV_ALL) == svcnt[bwhd] (). */ + return svcnt_bhwd_impl::fold (f); + + /* See whether we can count the number of elements in the pattern + at compile time. */ + unsigned int elements_per_vq = 128 / GET_MODE_UNIT_BITSIZE (m_ref_mode); + HOST_WIDE_INT value = aarch64_fold_sve_cnt_pat (pattern, elements_per_vq); + if (value >= 0) + { + tree count = build_int_cstu (TREE_TYPE (f.lhs), value); + return gimple_build_assign (f.lhs, count); + } + + return NULL; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + unsigned int elements_per_vq = 128 / GET_MODE_UNIT_BITSIZE (m_ref_mode); + e.args.quick_push (gen_int_mode (elements_per_vq, DImode)); + e.args.quick_push (const1_rtx); + return e.use_exact_insn (CODE_FOR_aarch64_sve_cnt_pat); + } +}; + +class svcntp_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + e.add_ptrue_hint (0, mode); + return e.use_exact_insn (code_for_aarch64_pred_cntp (mode)); + } +}; + +/* Implements svcreate2, svcreate3 and svcreate4. 
*/ +class svcreate_impl : public quiet +{ +public: + CONSTEXPR svcreate_impl (unsigned int vectors_per_tuple) + : quiet (vectors_per_tuple) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + unsigned int nargs = gimple_call_num_args (f.call); + tree lhs_type = TREE_TYPE (f.lhs); + + /* Replace the call with a clobber of the result (to prevent it from + becoming upwards exposed) followed by stores into each individual + vector of tuple. + + The fold routines expect the replacement statement to have the + same lhs as the original call, so return the clobber statement + rather than the final vector store. */ + gassign *clobber = gimple_build_assign (f.lhs, build_clobber (lhs_type)); + + for (unsigned int i = nargs; i-- > 0; ) + { + tree rhs_vector = gimple_call_arg (f.call, i); + tree field = tuple_type_field (TREE_TYPE (f.lhs)); + tree lhs_array = build3 (COMPONENT_REF, TREE_TYPE (field), + unshare_expr (f.lhs), field, NULL_TREE); + tree lhs_vector = build4 (ARRAY_REF, TREE_TYPE (rhs_vector), + lhs_array, size_int (i), + NULL_TREE, NULL_TREE); + gassign *assign = gimple_build_assign (lhs_vector, rhs_vector); + gsi_insert_after (f.gsi, assign, GSI_SAME_STMT); + } + return clobber; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + rtx lhs_tuple = e.get_nonoverlapping_reg_target (); + + /* Record that LHS_TUPLE is dead before the first store. */ + emit_clobber (lhs_tuple); + for (unsigned int i = 0; i < e.args.length (); ++i) + { + /* Use an lvalue subreg to refer to vector I in LHS_TUPLE. */ + rtx lhs_vector = simplify_gen_subreg (GET_MODE (e.args[i]), + lhs_tuple, GET_MODE (lhs_tuple), + i * BYTES_PER_SVE_VECTOR); + emit_move_insn (lhs_vector, e.args[i]); + } + return lhs_tuple; + } +}; + +class svcvt_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode0 = e.vector_mode (0); + machine_mode mode1 = e.vector_mode (1); + insn_code icode; + /* All this complication comes from the need to select four things + simultaneously: + + (1) the kind of conversion (int<-float, float<-int, float<-float) + (2) signed vs. unsigned integers, where relevant + (3) the predication mode, which must be the wider of the predication + modes for MODE0 and MODE1 + (4) the predication type (m, x or z) + + The only supported int<->float conversions for which the integer is + narrower than the float are SI<->DF. It's therefore more convenient + to handle (3) by defining two patterns for int<->float conversions: + one in which the integer is at least as wide as the float and so + determines the predication mode, and another single SI<->DF pattern + in which the float's mode determines the predication mode (which is + always VNx2BI in that case). + + The names of the patterns follow the optab convention of giving + the source mode before the destination mode. */ + if (e.type_suffix (1).integer_p) + { + int unspec = (e.type_suffix (1).unsigned_p + ? UNSPEC_COND_UCVTF + : UNSPEC_COND_SCVTF); + if (e.type_suffix (0).element_bytes <= e.type_suffix (1).element_bytes) + icode = (e.pred == PRED_x + ? code_for_aarch64_sve_nonextend (unspec, mode1, mode0) + : code_for_cond_nonextend (unspec, mode1, mode0)); + else + icode = (e.pred == PRED_x + ? code_for_aarch64_sve_extend (unspec, mode1, mode0) + : code_for_cond_extend (unspec, mode1, mode0)); + } + else + { + int unspec = (!e.type_suffix (0).integer_p ? UNSPEC_COND_FCVT + : e.type_suffix (0).unsigned_p ? 
UNSPEC_COND_FCVTZU + : UNSPEC_COND_FCVTZS); + if (e.type_suffix (0).element_bytes >= e.type_suffix (1).element_bytes) + icode = (e.pred == PRED_x + ? code_for_aarch64_sve_nontrunc (unspec, mode1, mode0) + : code_for_cond_nontrunc (unspec, mode1, mode0)); + else + icode = (e.pred == PRED_x + ? code_for_aarch64_sve_trunc (unspec, mode1, mode0) + : code_for_cond_trunc (unspec, mode1, mode0)); + } + + if (e.pred == PRED_x) + return e.use_pred_x_insn (icode); + return e.use_cond_insn (icode); + } +}; + +class svdot_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* In the optab, the multiplication operands come before the accumulator + operand. The optab is keyed off the multiplication mode. */ + e.rotate_inputs_left (0, 3); + insn_code icode + = e.direct_optab_handler_for_sign (sdot_prod_optab, udot_prod_optab, + 0, GET_MODE (e.args[0])); + return e.use_unpred_insn (icode); + } +}; + +class svdotprod_lane_impl : public unspec_based_function_base +{ +public: + CONSTEXPR svdotprod_lane_impl (int unspec_for_sint, + int unspec_for_uint, + int unspec_for_float) + : unspec_based_function_base (unspec_for_sint, + unspec_for_uint, + unspec_for_float) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Use the same ordering as the dot_prod_optab, with the + accumulator last. */ + e.rotate_inputs_left (0, 4); + int unspec = unspec_for (e); + machine_mode mode = e.vector_mode (0); + return e.use_exact_insn (code_for_aarch64_dot_prod_lane (unspec, mode)); + } +}; + +class svdup_impl : public quiet +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree vec_type = TREE_TYPE (f.lhs); + tree rhs = gimple_call_arg (f.call, f.pred == PRED_none ? 0 : 1); + + if (f.pred == PRED_none || f.pred == PRED_x) + { + if (CONSTANT_CLASS_P (rhs)) + { + if (f.type_suffix (0).bool_p) + return (tree_to_shwi (rhs) + ? f.fold_to_ptrue () + : f.fold_to_pfalse ()); + + tree rhs_vector = build_vector_from_val (vec_type, rhs); + return gimple_build_assign (f.lhs, rhs_vector); + } + + /* Avoid folding _b to a VEC_DUPLICATE_EXPR, since to do that we + would need to introduce an extra and unwanted conversion to + the truth vector element type. */ + if (!f.type_suffix (0).bool_p) + return gimple_build_assign (f.lhs, VEC_DUPLICATE_EXPR, rhs); + } + + return NULL; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + if (e.pred == PRED_none || e.pred == PRED_x) + /* There's no benefit to using predicated instructions for _x here. */ + return e.use_unpred_insn (e.direct_optab_handler (vec_duplicate_optab)); + + /* Model predicated svdups as a SEL in which the "true" value is + the duplicate of the function argument and the "false" value + is the value of inactive lanes. */ + insn_code icode; + machine_mode mode = e.vector_mode (0); + if (valid_for_const_vector_p (GET_MODE_INNER (mode), e.args.last ())) + /* Duplicate the constant to fill a vector. The pattern optimizes + various cases involving constant operands, falling back to SEL + if necessary. */ + icode = code_for_vcond_mask (mode, mode); + else + /* Use the pattern for selecting between a duplicated scalar + variable and a vector fallback. */ + icode = code_for_aarch64_sel_dup (mode); + return e.use_vcond_mask_insn (icode); + } +}; + +class svdup_lane_impl : public quiet +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* The native DUP lane has an index range of 64 bytes. 
*/ + machine_mode mode = e.vector_mode (0); + if (CONST_INT_P (e.args[1]) + && IN_RANGE (INTVAL (e.args[1]) * GET_MODE_UNIT_SIZE (mode), 0, 63)) + return e.use_exact_insn (code_for_aarch64_sve_dup_lane (mode)); + + /* Treat svdup_lane as if it were svtbl_n. */ + return e.use_exact_insn (code_for_aarch64_sve_tbl (e.vector_mode (0))); + } +}; + +class svdupq_impl : public quiet +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree vec_type = TREE_TYPE (f.lhs); + unsigned int nargs = gimple_call_num_args (f.call); + /* For predicates, pad out each argument so that we have one element + per bit. */ + unsigned int factor = (f.type_suffix (0).bool_p + ? f.type_suffix (0).element_bytes : 1); + tree_vector_builder builder (vec_type, nargs * factor, 1); + for (unsigned int i = 0; i < nargs; ++i) + { + tree elt = gimple_call_arg (f.call, i); + if (!CONSTANT_CLASS_P (elt)) + return NULL; + builder.quick_push (elt); + for (unsigned int j = 1; j < factor; ++j) + builder.quick_push (build_zero_cst (TREE_TYPE (vec_type))); + } + return gimple_build_assign (f.lhs, builder.build ()); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + unsigned int elements_per_vq = e.args.length (); + if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + { + /* Construct a vector of integers so that we can compare them against + zero below. Zero vs. nonzero is the only distinction that + matters. */ + mode = aarch64_sve_int_mode (mode); + for (unsigned int i = 0; i < elements_per_vq; ++i) + e.args[i] = simplify_gen_unary (ZERO_EXTEND, GET_MODE_INNER (mode), + e.args[i], QImode); + } + + /* Get the 128-bit Advanced SIMD vector for this data size. */ + scalar_mode element_mode = GET_MODE_INNER (mode); + machine_mode vq_mode = aarch64_vq_mode (element_mode).require (); + gcc_assert (known_eq (elements_per_vq, GET_MODE_NUNITS (vq_mode))); + + /* Put the arguments into a 128-bit Advanced SIMD vector. We want + argument N to go into architectural lane N, whereas Advanced SIMD + vectors are loaded memory lsb to register lsb. We therefore need + to reverse the elements for big-endian targets. */ + rtx vq_reg = gen_reg_rtx (vq_mode); + rtvec vec = rtvec_alloc (elements_per_vq); + for (unsigned int i = 0; i < elements_per_vq; ++i) + { + unsigned int argno = BYTES_BIG_ENDIAN ? elements_per_vq - i - 1 : i; + RTVEC_ELT (vec, i) = e.args[argno]; + } + aarch64_expand_vector_init (vq_reg, gen_rtx_PARALLEL (vq_mode, vec)); + + /* If the result is a boolean, compare the data vector against zero. */ + if (mode != e.vector_mode (0)) + { + rtx data_dupq = aarch64_expand_sve_dupq (NULL, mode, vq_reg); + return aarch64_convert_sve_data_to_pred (e.possible_target, + e.vector_mode (0), data_dupq); + } + + return aarch64_expand_sve_dupq (e.possible_target, mode, vq_reg); + } +}; + +class svdupq_lane_impl : public quiet +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + rtx index = e.args[1]; + if (CONST_INT_P (index) && IN_RANGE (INTVAL (index), 0, 3)) + { + /* Use the .Q form of DUP, which is the native instruction for + this function. 
*/ + insn_code icode = code_for_aarch64_sve_dupq_lane (mode); + unsigned int num_indices = e.elements_per_vq (0); + rtx indices = aarch64_gen_stepped_int_parallel + (num_indices, INTVAL (index) * num_indices, 1); + + e.add_output_operand (icode); + e.add_input_operand (icode, e.args[0]); + e.add_fixed_operand (indices); + return e.generate_insn (icode); + } + + /* Build a .D TBL index for the pairs of doublewords that we want to + duplicate. */ + if (CONST_INT_P (index)) + { + /* The index vector is a constant. */ + rtx_vector_builder builder (VNx2DImode, 2, 1); + builder.quick_push (gen_int_mode (INTVAL (index) * 2, DImode)); + builder.quick_push (gen_int_mode (INTVAL (index) * 2 + 1, DImode)); + index = builder.build (); + } + else + { + /* Duplicate INDEX * 2 to fill a DImode vector. The ACLE spec + explicitly allows the top of the index to be dropped. */ + index = force_reg (DImode, simplify_gen_binary (ASHIFT, DImode, + index, const1_rtx)); + index = expand_vector_broadcast (VNx2DImode, index); + + /* Get an alternating 0, 1 predicate. */ + rtx_vector_builder builder (VNx2BImode, 2, 1); + builder.quick_push (const0_rtx); + builder.quick_push (constm1_rtx); + rtx pg = force_reg (VNx2BImode, builder.build ()); + + /* Add one to the odd elements of the index. */ + rtx one = force_reg (VNx2DImode, CONST1_RTX (VNx2DImode)); + rtx target = gen_reg_rtx (VNx2DImode); + emit_insn (gen_cond_addvnx2di (target, pg, index, one, index)); + index = target; + } + + e.args[0] = gen_lowpart (VNx2DImode, e.args[0]); + e.args[1] = index; + return e.use_exact_insn (CODE_FOR_aarch64_sve_tblvnx2di); + } +}; + +/* Implements svextb, svexth and svextw. */ +class svext_bhw_impl : public function_base +{ +public: + CONSTEXPR svext_bhw_impl (scalar_int_mode from_mode) + : m_from_mode (from_mode) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + if (e.type_suffix (0).unsigned_p) + { + /* Convert to an AND. The widest we go is 0xffffffff, which fits + in a CONST_INT. */ + e.args.quick_push (GEN_INT (GET_MODE_MASK (m_from_mode))); + if (e.pred == PRED_m) + /* We now have arguments "(inactive, pg, op, mask)". Convert this + to "(pg, op, mask, inactive)" so that the order matches svand_m + with an extra argument on the end. Take the inactive elements + from this extra argument. */ + e.rotate_inputs_left (0, 4); + return e.map_to_rtx_codes (AND, AND, -1, 3); + } + + machine_mode wide_mode = e.vector_mode (0); + poly_uint64 nunits = GET_MODE_NUNITS (wide_mode); + machine_mode narrow_mode + = aarch64_sve_data_mode (m_from_mode, nunits).require (); + if (e.pred == PRED_x) + { + insn_code icode = code_for_aarch64_pred_sxt (wide_mode, narrow_mode); + return e.use_pred_x_insn (icode); + } + + insn_code icode = code_for_aarch64_cond_sxt (wide_mode, narrow_mode); + return e.use_cond_insn (icode); + } + + /* The element mode that we're extending from. */ + scalar_int_mode m_from_mode; +}; + +/* Implements svget2, svget3 and svget4. */ +class svget_impl : public quiet +{ +public: + CONSTEXPR svget_impl (unsigned int vectors_per_tuple) + : quiet (vectors_per_tuple) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* Fold into a normal gimple component access. 
*/ + tree rhs_tuple = gimple_call_arg (f.call, 0); + tree index = gimple_call_arg (f.call, 1); + tree field = tuple_type_field (TREE_TYPE (rhs_tuple)); + tree rhs_array = build3 (COMPONENT_REF, TREE_TYPE (field), + rhs_tuple, field, NULL_TREE); + tree rhs_vector = build4 (ARRAY_REF, TREE_TYPE (f.lhs), + rhs_array, index, NULL_TREE, NULL_TREE); + return gimple_build_assign (f.lhs, rhs_vector); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Fold the access into a subreg rvalue. */ + return simplify_gen_subreg (e.vector_mode (0), e.args[0], + GET_MODE (e.args[0]), + INTVAL (e.args[1]) * BYTES_PER_SVE_VECTOR); + } +}; + +class svindex_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (e.direct_optab_handler (vec_series_optab)); + } +}; + +class svinsr_impl : public quiet +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + gcall *new_call = gimple_build_call_internal (IFN_VEC_SHL_INSERT, 2, + gimple_call_arg (f.call, 0), + gimple_call_arg (f.call, 1)); + gimple_call_set_lhs (new_call, f.lhs); + return new_call; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = direct_optab_handler (vec_shl_insert_optab, + e.vector_mode (0)); + return e.use_exact_insn (icode); + } +}; + +/* Implements svlasta and svlastb. */ +class svlast_impl : public quiet +{ +public: + CONSTEXPR svlast_impl (int unspec) : m_unspec (unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (code_for_extract (m_unspec, e.vector_mode (0))); + } + + /* The unspec code associated with the operation. */ + int m_unspec; +}; + +class svld1_impl : public full_width_access +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY; + } + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree vectype = f.vector_type (0); + + /* Get the predicate and base pointer. */ + gimple_seq stmts = NULL; + tree pred = f.convert_pred (stmts, vectype, 0); + tree base = f.fold_contiguous_base (stmts, vectype); + gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); + + tree cookie = f.load_store_cookie (TREE_TYPE (vectype)); + gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 3, + base, cookie, pred); + gimple_call_set_lhs (new_call, f.lhs); + return new_call; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = convert_optab_handler (maskload_optab, + e.vector_mode (0), e.gp_mode (0)); + return e.use_contiguous_load_insn (icode); + } +}; + +/* Implements extending contiguous forms of svld1. */ +class svld1_extend_impl : public extending_load +{ +public: + CONSTEXPR svld1_extend_impl (type_suffix_index memory_type) + : extending_load (memory_type) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = code_for_aarch64_load (extend_rtx_code (), + e.vector_mode (0), + e.memory_vector_mode ()); + return e.use_contiguous_load_insn (icode); + } +}; + +class svld1_gather_impl : public full_width_access +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + e.prepare_gather_address_operands (1); + /* Put the predicate last, as required by mask_gather_load_optab. 
*/ + e.rotate_inputs_left (0, 5); + machine_mode mem_mode = e.memory_vector_mode (); + insn_code icode = direct_optab_handler (mask_gather_load_optab, mem_mode); + return e.use_exact_insn (icode); + } +}; + +/* Implements extending forms of svld1_gather. */ +class svld1_gather_extend_impl : public extending_load +{ +public: + CONSTEXPR svld1_gather_extend_impl (type_suffix_index memory_type) + : extending_load (memory_type) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + e.prepare_gather_address_operands (1); + /* Put the predicate last, since the extending gathers use the same + operand order as mask_gather_load_optab. */ + e.rotate_inputs_left (0, 5); + insn_code icode = code_for_aarch64_gather_load (extend_rtx_code (), + e.vector_mode (0), + e.memory_vector_mode ()); + return e.use_exact_insn (icode); + } +}; + +class load_replicate : public function_base +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY; + } + + tree + memory_scalar_type (const function_instance &fi) const OVERRIDE + { + return fi.scalar_type (0); + } +}; + +class svld1rq_impl : public load_replicate +{ +public: + machine_mode + memory_vector_mode (const function_instance &fi) const OVERRIDE + { + return aarch64_vq_mode (GET_MODE_INNER (fi.vector_mode (0))).require (); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); + return e.use_contiguous_load_insn (icode); + } +}; + +class svld1ro_impl : public load_replicate +{ +public: + machine_mode + memory_vector_mode (const function_instance &fi) const OVERRIDE + { + return OImode; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = code_for_aarch64_sve_ld1ro (e.vector_mode (0)); + return e.use_contiguous_load_insn (icode); + } +}; + +/* Implements svld2, svld3 and svld4. */ +class svld234_impl : public full_width_access +{ +public: + CONSTEXPR svld234_impl (unsigned int vectors_per_tuple) + : full_width_access (vectors_per_tuple) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY; + } + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree tuple_type = TREE_TYPE (f.lhs); + tree vectype = f.vector_type (0); + + /* Get the predicate and base pointer. */ + gimple_seq stmts = NULL; + tree pred = f.convert_pred (stmts, vectype, 0); + tree base = f.fold_contiguous_base (stmts, vectype); + gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); + + /* Emit two statements: a clobber of the lhs, so that it isn't + upwards exposed, and then the load itself. + + The fold routines expect the replacement statement to have the + same lhs as the original call, so return the clobber statement + rather than the load. */ + gimple *clobber = gimple_build_assign (f.lhs, build_clobber (tuple_type)); + + /* View the loaded data as an array of vectors. */ + tree field = tuple_type_field (tuple_type); + tree lhs_array = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (field), + unshare_expr (f.lhs)); + + /* Emit the load itself. 
*/ + tree cookie = f.load_store_cookie (TREE_TYPE (vectype)); + gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3, + base, cookie, pred); + gimple_call_set_lhs (new_call, lhs_array); + gsi_insert_after (f.gsi, new_call, GSI_SAME_STMT); + + return clobber; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode tuple_mode = TYPE_MODE (TREE_TYPE (e.call_expr)); + insn_code icode = convert_optab_handler (vec_mask_load_lanes_optab, + tuple_mode, e.vector_mode (0)); + return e.use_contiguous_load_insn (icode); + } +}; + +class svldff1_gather_impl : public full_width_access +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* See the block comment in aarch64-sve.md for details about the + FFR handling. */ + emit_insn (gen_aarch64_update_ffr_for_load ()); + + e.prepare_gather_address_operands (1); + /* Put the predicate last, since ldff1_gather uses the same operand + order as mask_gather_load_optab. */ + e.rotate_inputs_left (0, 5); + machine_mode mem_mode = e.memory_vector_mode (); + return e.use_exact_insn (code_for_aarch64_ldff1_gather (mem_mode)); + } +}; + +/* Implements extending forms of svldff1_gather. */ +class svldff1_gather_extend : public extending_load +{ +public: + CONSTEXPR svldff1_gather_extend (type_suffix_index memory_type) + : extending_load (memory_type) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* See the block comment in aarch64-sve.md for details about the + FFR handling. */ + emit_insn (gen_aarch64_update_ffr_for_load ()); + + e.prepare_gather_address_operands (1); + /* Put the predicate last, since ldff1_gather uses the same operand + order as mask_gather_load_optab. */ + e.rotate_inputs_left (0, 5); + insn_code icode = code_for_aarch64_ldff1_gather (extend_rtx_code (), + e.vector_mode (0), + e.memory_vector_mode ()); + return e.use_exact_insn (icode); + } +}; + +class svldnt1_impl : public full_width_access +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = code_for_aarch64_ldnt1 (e.vector_mode (0)); + return e.use_contiguous_load_insn (icode); + } +}; + +/* Implements svldff1 and svldnf1. */ +class svldxf1_impl : public full_width_access +{ +public: + CONSTEXPR svldxf1_impl (int unspec) : m_unspec (unspec) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* See the block comment in aarch64-sve.md for details about the + FFR handling. */ + emit_insn (gen_aarch64_update_ffr_for_load ()); + + machine_mode mode = e.vector_mode (0); + return e.use_contiguous_load_insn (code_for_aarch64_ldf1 (m_unspec, mode)); + } + + /* The unspec associated with the load. */ + int m_unspec; +}; + +/* Implements extending contiguous forms of svldff1 and svldnf1. 
*/ +class svldxf1_extend_impl : public extending_load +{ +public: + CONSTEXPR svldxf1_extend_impl (type_suffix_index memory_type, int unspec) + : extending_load (memory_type), m_unspec (unspec) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY | CP_READ_FFR | CP_WRITE_FFR; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* See the block comment in aarch64-sve.md for details about the + FFR handling. */ + emit_insn (gen_aarch64_update_ffr_for_load ()); + + insn_code icode = code_for_aarch64_ldf1 (m_unspec, extend_rtx_code (), + e.vector_mode (0), + e.memory_vector_mode ()); + return e.use_contiguous_load_insn (icode); + } + + /* The unspec associated with the load. */ + int m_unspec; +}; + +class svlen_impl : public quiet +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* The argument only exists for its type. */ + tree rhs_type = TREE_TYPE (gimple_call_arg (f.call, 0)); + tree count = build_int_cstu (TREE_TYPE (f.lhs), + TYPE_VECTOR_SUBPARTS (rhs_type)); + return gimple_build_assign (f.lhs, count); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* The argument only exists for its type. */ + return gen_int_mode (GET_MODE_NUNITS (e.vector_mode (0)), DImode); + } +}; + +class svmad_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + return expand_mad (e); + } +}; + +class svmla_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* Put the accumulator at the end (argument 3), but keep it as the + merge input for _m functions. */ + e.rotate_inputs_left (1, 4); + return expand_mad (e, 3); + } +}; + +/* Base class for svmla_lane and svmls_lane. */ +class svmla_svmls_lane_impl : public function_base +{ +public: + CONSTEXPR svmla_svmls_lane_impl (int unspec) + : m_unspec (unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Put the operands in the normal (fma ...) order, with the accumulator + last. This fits naturally since that's also the unprinted operand + in the asm output. */ + e.rotate_inputs_left (0, 4); + insn_code icode = code_for_aarch64_lane (m_unspec, e.vector_mode (0)); + return e.use_exact_insn (icode); + } + + /* The unspec code associated with the operation. */ + int m_unspec; +}; + +class svmls_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + /* Put the accumulator at the end (argument 3), but keep it as the + merge input for _m functions. */ + e.rotate_inputs_left (1, 4); + return expand_msb (e, 3); + } +}; + +class svmov_impl : public function_base +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + return gimple_build_assign (f.lhs, BIT_AND_EXPR, + gimple_call_arg (f.call, 0), + gimple_call_arg (f.call, 1)); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* The canonical form for the assembler alias "MOV Pa.B, Pb/Z, Pc.B" + is "AND Pa.B, Pb/Z, Pc.B, Pc.B". 
*/ + gcc_assert (e.pred == PRED_z); + e.args.quick_push (e.args[1]); + return e.use_exact_insn (CODE_FOR_aarch64_pred_andvnx16bi_z); + } +}; + +class svmmla_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode; + if (e.type_suffix (0).integer_p) + { + if (e.type_suffix (0).unsigned_p) + icode = code_for_aarch64_sve_add (UNSPEC_UMATMUL, e.vector_mode (0)); + else + icode = code_for_aarch64_sve_add (UNSPEC_SMATMUL, e.vector_mode (0)); + } + else + icode = code_for_aarch64_sve (UNSPEC_FMMLA, e.vector_mode (0)); + return e.use_exact_insn (icode); + } +}; + +class svmsb_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + return expand_msb (e); + } +}; + +class svnand_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + gcc_assert (e.pred == PRED_z); + return e.use_exact_insn (CODE_FOR_aarch64_pred_nandvnx16bi_z); + } +}; + +class svnor_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + gcc_assert (e.pred == PRED_z); + return e.use_exact_insn (CODE_FOR_aarch64_pred_norvnx16bi_z); + } +}; + +class svnot_impl : public rtx_code_function +{ +public: + CONSTEXPR svnot_impl () : rtx_code_function (NOT, NOT, -1) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + if (e.type_suffix_ids[0] == TYPE_SUFFIX_b) + { + /* The canonical form for the assembler alias "NOT Pa.B, Pb/Z, Pc.B" + is "EOR Pa.B, Pb/Z, Pb.B, Pc.B". */ + gcc_assert (e.pred == PRED_z); + e.args.quick_insert (1, e.args[0]); + return e.use_exact_insn (CODE_FOR_aarch64_pred_xorvnx16bi_z); + } + return rtx_code_function::expand (e); + } +}; + +class svorn_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + gcc_assert (e.pred == PRED_z); + return e.use_exact_insn (CODE_FOR_aarch64_pred_ornvnx16bi_z); + } +}; + +class svpfalse_impl : public function_base +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + return f.fold_to_pfalse (); + } + + rtx + expand (function_expander &) const OVERRIDE + { + return CONST0_RTX (VNx16BImode); + } +}; + +/* Implements svpfirst and svpnext, which share the same .md patterns. */ +class svpfirst_svpnext_impl : public function_base +{ +public: + CONSTEXPR svpfirst_svpnext_impl (int unspec) : m_unspec (unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + e.add_ptrue_hint (0, mode); + return e.use_exact_insn (code_for_aarch64_sve (m_unspec, mode)); + } + + /* The unspec associated with the operation. */ + int m_unspec; +}; + +/* Implements contiguous forms of svprf[bhwd]. */ +class svprf_bhwd_impl : public function_base +{ +public: + CONSTEXPR svprf_bhwd_impl (machine_mode mode) : m_mode (mode) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_PREFETCH_MEMORY; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + e.prepare_prefetch_operands (); + insn_code icode = code_for_aarch64_sve_prefetch (m_mode); + return e.use_contiguous_prefetch_insn (icode); + } + + /* The mode that we'd use to hold one vector of prefetched data. */ + machine_mode m_mode; +}; + +/* Implements svprf[bhwd]_gather. 
*/ +class svprf_bhwd_gather_impl : public function_base +{ +public: + CONSTEXPR svprf_bhwd_gather_impl (machine_mode mode) : m_mode (mode) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_PREFETCH_MEMORY; + } + + machine_mode + memory_vector_mode (const function_instance &) const OVERRIDE + { + return m_mode; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + e.prepare_prefetch_operands (); + e.prepare_gather_address_operands (1); + + /* Insert a zero operand to identify the mode of the memory being + accessed. This goes between the gather operands and prefetch + operands created above. */ + e.args.quick_insert (5, CONST0_RTX (m_mode)); + + machine_mode reg_mode = GET_MODE (e.args[2]); + insn_code icode = code_for_aarch64_sve_gather_prefetch (m_mode, reg_mode); + return e.use_exact_insn (icode); + } + + /* The mode that we'd use to hold one vector of prefetched data. */ + machine_mode m_mode; +}; + +/* Implements svptest_any, svptest_first and svptest_last. */ +class svptest_impl : public function_base +{ +public: + CONSTEXPR svptest_impl (rtx_code compare) : m_compare (compare) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* See whether GP is an exact ptrue for some predicate mode; + i.e. whether converting the GP to that mode will not drop + set bits and will leave all significant bits set. */ + machine_mode wide_mode; + int hint; + if (aarch64_ptrue_all_mode (e.args[0]).exists (&wide_mode)) + hint = SVE_KNOWN_PTRUE; + else + { + hint = SVE_MAYBE_NOT_PTRUE; + wide_mode = VNx16BImode; + } + + /* Generate the PTEST itself. */ + rtx pg = force_reg (VNx16BImode, e.args[0]); + rtx wide_pg = gen_lowpart (wide_mode, pg); + rtx hint_rtx = gen_int_mode (hint, DImode); + rtx op = force_reg (wide_mode, gen_lowpart (wide_mode, e.args[1])); + emit_insn (gen_aarch64_ptestvnx16bi (pg, wide_pg, hint_rtx, op)); + + /* Get the location of the boolean result. We can provide SImode and + DImode values directly; rely on generic code to convert others. */ + rtx target = e.possible_target; + if (!target + || !REG_P (target) + || (GET_MODE (target) != SImode && GET_MODE (target) != DImode)) + target = gen_reg_rtx (DImode); + + /* Generate a CSET to convert the CC result of the PTEST to a boolean. */ + rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM); + rtx compare = gen_rtx_fmt_ee (m_compare, GET_MODE (target), + cc_reg, const0_rtx); + emit_insn (gen_rtx_SET (target, compare)); + return target; + } + + /* The comparison code associated with ptest condition. */ + rtx_code m_compare; +}; + +class svptrue_impl : public function_base +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + return f.fold_to_ptrue (); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + return aarch64_ptrue_all (e.type_suffix (0).element_bytes); + } +}; + +class svptrue_pat_impl : public function_base +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree pattern_arg = gimple_call_arg (f.call, 0); + aarch64_svpattern pattern = (aarch64_svpattern) tree_to_shwi (pattern_arg); + + if (pattern == AARCH64_SV_ALL) + /* svptrue_pat_bN (SV_ALL) == svptrue_bN (). */ + return f.fold_to_ptrue (); + + /* See whether we can count the number of elements in the pattern + at compile time. If so, construct a predicate with that number + of 1s followed by all 0s. 
*/ + int nelts_per_vq = f.elements_per_vq (0); + HOST_WIDE_INT value = aarch64_fold_sve_cnt_pat (pattern, nelts_per_vq); + if (value >= 0) + return f.fold_to_vl_pred (value); + + return NULL; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* In rtl, the predicate is represented as the constant: + + (const:V16BI (unspec:V16BI [(const_int PATTERN) + (const_vector:VnnBI [zeros])] + UNSPEC_PTRUE)) + + where nn determines the element size. */ + rtvec vec = gen_rtvec (2, e.args[0], CONST0_RTX (e.vector_mode (0))); + return gen_rtx_CONST (VNx16BImode, + gen_rtx_UNSPEC (VNx16BImode, vec, UNSPEC_PTRUE)); + } +}; + +class svqadd_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + return e.expand_signed_unpred_op (SS_PLUS, US_PLUS); + } +}; + +/* Implements svqdec[bhwd]{,_pat} and svqinc[bhwd]{,_pat}. */ +class svqdec_svqinc_bhwd_impl : public function_base +{ +public: + CONSTEXPR svqdec_svqinc_bhwd_impl (rtx_code code_for_sint, + rtx_code code_for_uint, + scalar_int_mode elem_mode) + : m_code_for_sint (code_for_sint), + m_code_for_uint (code_for_uint), + m_elem_mode (elem_mode) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Treat non-_pat functions in the same way as _pat functions with + an SV_ALL argument. */ + if (e.args.length () == 2) + e.args.quick_insert (1, gen_int_mode (AARCH64_SV_ALL, DImode)); + + /* Insert the number of elements per 128-bit block as a fake argument, + between the pattern and the multiplier. Arguments 1, 2 and 3 then + correspond exactly with the 3 UNSPEC_SVE_CNT_PAT operands; see + aarch64_sve_cnt_pat for details. */ + unsigned int elements_per_vq = 128 / GET_MODE_BITSIZE (m_elem_mode); + e.args.quick_insert (2, gen_int_mode (elements_per_vq, DImode)); + + rtx_code code = (e.type_suffix (0).unsigned_p + ? m_code_for_uint + : m_code_for_sint); + + /* Choose between operating on integer scalars or integer vectors. */ + machine_mode mode = e.vector_mode (0); + if (e.mode_suffix_id == MODE_n) + mode = GET_MODE_INNER (mode); + return e.use_exact_insn (code_for_aarch64_sve_pat (code, mode)); + } + + /* The saturating addition or subtraction codes to use for signed and + unsigned values respectively. */ + rtx_code m_code_for_sint; + rtx_code m_code_for_uint; + + /* The integer mode associated with the [bhwd] suffix. */ + scalar_int_mode m_elem_mode; +}; + +/* Implements svqdec[bhwd]{,_pat}. */ +class svqdec_bhwd_impl : public svqdec_svqinc_bhwd_impl +{ +public: + CONSTEXPR svqdec_bhwd_impl (scalar_int_mode elem_mode) + : svqdec_svqinc_bhwd_impl (SS_MINUS, US_MINUS, elem_mode) {} +}; + +/* Implements svqinc[bhwd]{,_pat}. */ +class svqinc_bhwd_impl : public svqdec_svqinc_bhwd_impl +{ +public: + CONSTEXPR svqinc_bhwd_impl (scalar_int_mode elem_mode) + : svqdec_svqinc_bhwd_impl (SS_PLUS, US_PLUS, elem_mode) {} +}; + +/* Implements svqdecp and svqincp. */ +class svqdecp_svqincp_impl : public function_base +{ +public: + CONSTEXPR svqdecp_svqincp_impl (rtx_code code_for_sint, + rtx_code code_for_uint) + : m_code_for_sint (code_for_sint), + m_code_for_uint (code_for_uint) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + rtx_code code = (e.type_suffix (0).unsigned_p + ? m_code_for_uint + : m_code_for_sint); + insn_code icode; + if (e.mode_suffix_id == MODE_n) + { + /* Increment or decrement a scalar (whose mode is given by the first + type suffix) by the number of active elements in a predicate + (whose mode is given by the second type suffix). 
*/ + machine_mode mode = GET_MODE_INNER (e.vector_mode (0)); + icode = code_for_aarch64_sve_cntp (code, mode, e.vector_mode (1)); + } + else + /* Increment a vector by the number of active elements in a predicate, + with the vector mode determining the predicate mode. */ + icode = code_for_aarch64_sve_cntp (code, e.vector_mode (0)); + return e.use_exact_insn (icode); + } + + /* The saturating addition or subtraction codes to use for signed and + unsigned values respectively. */ + rtx_code m_code_for_sint; + rtx_code m_code_for_uint; +}; + +class svqsub_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + return e.expand_signed_unpred_op (SS_MINUS, US_MINUS); + } +}; + +class svrdffr_impl : public function_base +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_FFR; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* See the block comment in aarch64-sve.md for details about the + FFR handling. */ + emit_insn (gen_aarch64_copy_ffr_to_ffrt ()); + rtx result = e.use_exact_insn (e.pred == PRED_z + ? CODE_FOR_aarch64_rdffr_z + : CODE_FOR_aarch64_rdffr); + emit_insn (gen_aarch64_update_ffrt ()); + return result; + } +}; + +class svreinterpret_impl : public quiet +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* Punt to rtl if the effect of the reinterpret on registers does not + conform to GCC's endianness model. */ + if (!targetm.can_change_mode_class (f.vector_mode (0), + f.vector_mode (1), FP_REGS)) + return NULL; + + /* Otherwise svreinterpret corresponds directly to a VIEW_CONVERT_EXPR + reinterpretation. */ + tree rhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (f.lhs), + gimple_call_arg (f.call, 0)); + return gimple_build_assign (f.lhs, VIEW_CONVERT_EXPR, rhs); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + return e.use_exact_insn (code_for_aarch64_sve_reinterpret (mode)); + } +}; + +class svrev_impl : public permute +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* Punt for now on _b16 and wider; we'd need more complex evpc logic + to rerecognize the result. */ + if (f.type_suffix (0).bool_p && f.type_suffix (0).element_bits > 8) + return NULL; + + /* Permute as { nelts - 1, nelts - 2, nelts - 3, ... }. */ + poly_int64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); + vec_perm_builder builder (nelts, 1, 3); + for (int i = 0; i < 3; ++i) + builder.quick_push (nelts - i - 1); + return fold_permute (f, builder); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (code_for_aarch64_sve_rev (e.vector_mode (0))); + } +}; + +class svsel_impl : public quiet +{ +public: + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* svsel corresponds exactly to VEC_COND_EXPR. */ + gimple_seq stmts = NULL; + tree pred = f.convert_pred (stmts, f.vector_type (0), 0); + gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); + return gimple_build_assign (f.lhs, VEC_COND_EXPR, pred, + gimple_call_arg (f.call, 1), + gimple_call_arg (f.call, 2)); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* svsel (cond, truev, falsev) is vcond_mask (truev, falsev, cond). */ + e.rotate_inputs_left (0, 3); + insn_code icode = convert_optab_handler (vcond_mask_optab, + e.vector_mode (0), + e.gp_mode (0)); + return e.use_exact_insn (icode); + } +}; + +/* Implements svset2, svset3 and svset4. 
*/ +class svset_impl : public quiet +{ +public: + CONSTEXPR svset_impl (unsigned int vectors_per_tuple) + : quiet (vectors_per_tuple) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree rhs_tuple = gimple_call_arg (f.call, 0); + tree index = gimple_call_arg (f.call, 1); + tree rhs_vector = gimple_call_arg (f.call, 2); + + /* Replace the call with two statements: a copy of the full tuple + to the call result, followed by an update of the individual vector. + + The fold routines expect the replacement statement to have the + same lhs as the original call, so return the copy statement + rather than the field update. */ + gassign *copy = gimple_build_assign (unshare_expr (f.lhs), rhs_tuple); + + /* Get a reference to the individual vector. */ + tree field = tuple_type_field (TREE_TYPE (f.lhs)); + tree lhs_array = build3 (COMPONENT_REF, TREE_TYPE (field), + f.lhs, field, NULL_TREE); + tree lhs_vector = build4 (ARRAY_REF, TREE_TYPE (rhs_vector), + lhs_array, index, NULL_TREE, NULL_TREE); + gassign *update = gimple_build_assign (lhs_vector, rhs_vector); + gsi_insert_after (f.gsi, update, GSI_SAME_STMT); + + return copy; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + rtx rhs_tuple = e.args[0]; + unsigned int index = INTVAL (e.args[1]); + rtx rhs_vector = e.args[2]; + + /* First copy the full tuple to the target register. */ + rtx lhs_tuple = e.get_nonoverlapping_reg_target (); + emit_move_insn (lhs_tuple, rhs_tuple); + + /* ...then update the individual vector. */ + rtx lhs_vector = simplify_gen_subreg (GET_MODE (rhs_vector), + lhs_tuple, GET_MODE (lhs_tuple), + index * BYTES_PER_SVE_VECTOR); + emit_move_insn (lhs_vector, rhs_vector); + return lhs_vector; + } +}; + +class svsetffr_impl : public function_base +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_WRITE_FFR; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + e.args.quick_push (CONSTM1_RTX (VNx16BImode)); + return e.use_exact_insn (CODE_FOR_aarch64_wrffr); + } +}; + +class svst1_impl : public full_width_access +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_WRITE_MEMORY; + } + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree vectype = f.vector_type (0); + + /* Get the predicate and base pointer. */ + gimple_seq stmts = NULL; + tree pred = f.convert_pred (stmts, vectype, 0); + tree base = f.fold_contiguous_base (stmts, vectype); + gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); + + tree cookie = f.load_store_cookie (TREE_TYPE (vectype)); + tree rhs = gimple_call_arg (f.call, gimple_call_num_args (f.call) - 1); + return gimple_build_call_internal (IFN_MASK_STORE, 4, + base, cookie, pred, rhs); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = convert_optab_handler (maskstore_optab, + e.vector_mode (0), e.gp_mode (0)); + return e.use_contiguous_store_insn (icode); + } +}; + +class svst1_scatter_impl : public full_width_access +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_WRITE_MEMORY; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + e.prepare_gather_address_operands (1); + /* Put the predicate last, as required by mask_scatter_store_optab. */ + e.rotate_inputs_left (0, 6); + insn_code icode = direct_optab_handler (mask_scatter_store_optab, + e.memory_vector_mode ()); + return e.use_exact_insn (icode); + } +}; + +/* Implements truncating forms of svst1_scatter. 
*/ +class svst1_scatter_truncate_impl : public truncating_store +{ +public: + CONSTEXPR svst1_scatter_truncate_impl (scalar_int_mode to_mode) + : truncating_store (to_mode) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + e.prepare_gather_address_operands (1); + /* Put the predicate last, since the truncating scatters use the same + operand order as mask_scatter_store_optab. */ + e.rotate_inputs_left (0, 6); + insn_code icode = code_for_aarch64_scatter_store_trunc + (e.memory_vector_mode (), e.vector_mode (0)); + return e.use_exact_insn (icode); + } +}; + +/* Implements truncating contiguous forms of svst1. */ +class svst1_truncate_impl : public truncating_store +{ +public: + CONSTEXPR svst1_truncate_impl (scalar_int_mode to_mode) + : truncating_store (to_mode) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = code_for_aarch64_store_trunc (e.memory_vector_mode (), + e.vector_mode (0)); + return e.use_contiguous_store_insn (icode); + } +}; + +/* Implements svst2, svst3 and svst4. */ +class svst234_impl : public full_width_access +{ +public: + CONSTEXPR svst234_impl (unsigned int vectors_per_tuple) + : full_width_access (vectors_per_tuple) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_WRITE_MEMORY; + } + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + tree vectype = f.vector_type (0); + + /* Get the predicate and base pointer. */ + gimple_seq stmts = NULL; + tree pred = f.convert_pred (stmts, vectype, 0); + tree base = f.fold_contiguous_base (stmts, vectype); + gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT); + + /* View the stored data as an array of vectors. */ + unsigned int num_args = gimple_call_num_args (f.call); + tree rhs_tuple = gimple_call_arg (f.call, num_args - 1); + tree field = tuple_type_field (TREE_TYPE (rhs_tuple)); + tree rhs_array = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (field), rhs_tuple); + + tree cookie = f.load_store_cookie (TREE_TYPE (vectype)); + return gimple_build_call_internal (IFN_MASK_STORE_LANES, 4, + base, cookie, pred, rhs_array); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode tuple_mode = GET_MODE (e.args.last ()); + insn_code icode = convert_optab_handler (vec_mask_store_lanes_optab, + tuple_mode, e.vector_mode (0)); + return e.use_contiguous_store_insn (icode); + } +}; + +class svstnt1_impl : public full_width_access +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_WRITE_MEMORY; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = code_for_aarch64_stnt1 (e.vector_mode (0)); + return e.use_contiguous_store_insn (icode); + } +}; + +class svsub_impl : public rtx_code_function +{ +public: + CONSTEXPR svsub_impl () + : rtx_code_function (MINUS, MINUS, UNSPEC_COND_FSUB) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Canonicalize subtractions of constants to additions. */ + machine_mode mode = e.vector_mode (0); + if (e.try_negating_argument (2, mode)) + return e.map_to_rtx_codes (PLUS, PLUS, UNSPEC_COND_FADD); + + return rtx_code_function::expand (e); + } +}; + +class svtbl_impl : public permute +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (code_for_aarch64_sve_tbl (e.vector_mode (0))); + } +}; + +/* Implements svtrn1 and svtrn2. */ +class svtrn_impl : public binary_permute +{ +public: + CONSTEXPR svtrn_impl (int base) + : binary_permute (base ? 
UNSPEC_TRN2 : UNSPEC_TRN1), m_base (base) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* svtrn1: { 0, nelts, 2, nelts + 2, 4, nelts + 4, ... } + svtrn2: as for svtrn1, but with 1 added to each index. */ + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); + vec_perm_builder builder (nelts, 2, 3); + for (unsigned int i = 0; i < 3; ++i) + { + builder.quick_push (m_base + i * 2); + builder.quick_push (m_base + i * 2 + nelts); + } + return fold_permute (f, builder); + } + + /* 0 for svtrn1, 1 for svtrn2. */ + unsigned int m_base; +}; + +/* Base class for svundef{,2,3,4}. */ +class svundef_impl : public quiet +{ +public: + CONSTEXPR svundef_impl (unsigned int vectors_per_tuple) + : quiet (vectors_per_tuple) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* Don't fold svundef at the gimple level. There's no exact + correspondence for SSA_NAMEs, and we explicitly don't want + to generate a specific value (like an all-zeros vector). */ + if (vectors_per_tuple () == 1) + return NULL; + return gimple_build_assign (f.lhs, build_clobber (TREE_TYPE (f.lhs))); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + rtx target = e.get_reg_target (); + emit_clobber (copy_rtx (target)); + return target; + } +}; + +/* Implements svunpklo and svunpkhi. */ +class svunpk_impl : public quiet +{ +public: + CONSTEXPR svunpk_impl (bool high_p) : m_high_p (high_p) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* Don't fold the predicate ops, since every bit of the svbool_t + result is significant. */ + if (f.type_suffix_ids[0] == TYPE_SUFFIX_b) + return NULL; + + /* The first half in memory is VEC_UNPACK_LO_EXPR for little-endian + and VEC_UNPACK_HI_EXPR for big-endian. */ + bool high_p = BYTES_BIG_ENDIAN ? !m_high_p : m_high_p; + tree_code code = high_p ? VEC_UNPACK_HI_EXPR : VEC_UNPACK_LO_EXPR; + return gimple_build_assign (f.lhs, code, gimple_call_arg (f.call, 0)); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = GET_MODE (e.args[0]); + unsigned int unpacku = m_high_p ? UNSPEC_UNPACKUHI : UNSPEC_UNPACKULO; + unsigned int unpacks = m_high_p ? UNSPEC_UNPACKSHI : UNSPEC_UNPACKSLO; + insn_code icode; + if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + icode = code_for_aarch64_sve_punpk (unpacku, mode); + else + { + int unspec = e.type_suffix (0).unsigned_p ? unpacku : unpacks; + icode = code_for_aarch64_sve_unpk (unspec, unspec, mode); + } + return e.use_exact_insn (icode); + } + + /* True for svunpkhi, false for svunpklo. */ + bool m_high_p; +}; + +/* Also implements svsudot. */ +class svusdot_impl : public function_base +{ +public: + CONSTEXPR svusdot_impl (bool su) : m_su (su) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* The implementation of the ACLE function svsudot (for the non-lane + version) is through the USDOT instruction but with the second and third + inputs swapped. */ + if (m_su) + e.rotate_inputs_left (1, 2); + /* The ACLE function has the same order requirements as for svdot. + While there's no requirement for the RTL pattern to have the same sort + of order as that for dot_prod, it's easier to read. + Hence we do the same rotation on arguments as svdot_impl does. */ + e.rotate_inputs_left (0, 3); + machine_mode mode = e.vector_mode (0); + insn_code icode = code_for_aarch64_dot_prod (UNSPEC_USDOT, mode); + return e.use_exact_insn (icode); + } + +private: + bool m_su; +}; + +/* Implements svuzp1 and svuzp2. 
*/ +class svuzp_impl : public binary_permute +{ +public: + CONSTEXPR svuzp_impl (unsigned int base) + : binary_permute (base ? UNSPEC_UZP2 : UNSPEC_UZP1), m_base (base) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* svuzp1: { 0, 2, 4, 6, ... } + svuzp2: { 1, 3, 5, 7, ... }. */ + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); + vec_perm_builder builder (nelts, 1, 3); + for (unsigned int i = 0; i < 3; ++i) + builder.quick_push (m_base + i * 2); + return fold_permute (f, builder); + } + + /* 0 for svuzp1, 1 for svuzp2. */ + unsigned int m_base; +}; + +/* A function_base for svwhilele and svwhilelt functions. */ +class svwhile_impl : public function_base +{ +public: + CONSTEXPR svwhile_impl (int unspec_for_sint, int unspec_for_uint, bool eq_p) + : m_unspec_for_sint (unspec_for_sint), + m_unspec_for_uint (unspec_for_uint), m_eq_p (eq_p) + {} + + /* Try to fold a call by treating its arguments as constants of type T. */ + template + gimple * + fold_type (gimple_folder &f) const + { + /* Only handle cases in which both operands are constant. */ + T arg0, arg1; + if (!poly_int_tree_p (gimple_call_arg (f.call, 0), &arg0) + || !poly_int_tree_p (gimple_call_arg (f.call, 1), &arg1)) + return NULL; + + /* Check whether the result is known to be all-false. */ + if (m_eq_p ? known_gt (arg0, arg1) : known_ge (arg0, arg1)) + return f.fold_to_pfalse (); + + /* Punt if we can't tell at compile time whether the result + is all-false. */ + if (m_eq_p ? maybe_gt (arg0, arg1) : maybe_ge (arg0, arg1)) + return NULL; + + /* At this point we know the result has at least one set element. */ + poly_uint64 diff = arg1 - arg0; + poly_uint64 nelts = GET_MODE_NUNITS (f.vector_mode (0)); + + /* Canonicalize the svwhilele form to the svwhilelt form. Subtract + from NELTS rather than adding to DIFF, to prevent overflow. */ + if (m_eq_p) + nelts -= 1; + + /* Check whether the result is known to be all-true. */ + if (known_ge (diff, nelts)) + return f.fold_to_ptrue (); + + /* Punt if DIFF might not be the actual number of set elements + in the result. Conditional equality is fine. */ + if (maybe_gt (diff, nelts)) + return NULL; + + /* At this point we know that the predicate will have DIFF set elements + for svwhilelt and DIFF + 1 set elements for svwhilele (which stops + after rather than before ARG1 is reached). See if we can create + the predicate at compile time. */ + unsigned HOST_WIDE_INT vl; + if (diff.is_constant (&vl)) + /* Overflow is no longer possible after the checks above. */ + return f.fold_to_vl_pred (m_eq_p ? vl + 1 : vl); + + return NULL; + } + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + if (f.type_suffix (1).unsigned_p) + return fold_type (f); + else + return fold_type (f); + } + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Suffix 0 determines the predicate mode, suffix 1 determines the + scalar mode and signedness. */ + int unspec = (e.type_suffix (1).unsigned_p + ? m_unspec_for_uint + : m_unspec_for_sint); + machine_mode pred_mode = e.vector_mode (0); + scalar_mode reg_mode = GET_MODE_INNER (e.vector_mode (1)); + return e.use_exact_insn (code_for_while (unspec, reg_mode, pred_mode)); + } + + /* The unspec codes associated with signed and unsigned operations + respectively. */ + int m_unspec_for_sint; + int m_unspec_for_uint; + + /* True svwhilele, false for svwhilelt. 
*/ + bool m_eq_p; +}; + +class svwrffr_impl : public function_base +{ +public: + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_WRITE_FFR; + } + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (CODE_FOR_aarch64_wrffr); + } +}; + +/* Implements svzip1 and svzip2. */ +class svzip_impl : public binary_permute +{ +public: + CONSTEXPR svzip_impl (unsigned int base) + : binary_permute (base ? UNSPEC_ZIP2 : UNSPEC_ZIP1), m_base (base) {} + + gimple * + fold (gimple_folder &f) const OVERRIDE + { + /* svzip1: { 0, nelts, 1, nelts + 1, 2, nelts + 2, ... } + svzip2: as for svzip1, but with nelts / 2 added to each index. */ + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); + poly_uint64 base = m_base * exact_div (nelts, 2); + vec_perm_builder builder (nelts, 2, 3); + for (unsigned int i = 0; i < 3; ++i) + { + builder.quick_push (base + i); + builder.quick_push (base + i + nelts); + } + return fold_permute (f, builder); + } + + /* 0 for svzip1, 1 for svzip2. */ + unsigned int m_base; +}; + +} /* end anonymous namespace */ + +namespace aarch64_sve { + +FUNCTION (svabd, svabd_impl,) +FUNCTION (svabs, quiet, (ABS, ABS, UNSPEC_COND_FABS)) +FUNCTION (svacge, svac_impl, (UNSPEC_COND_FCMGE)) +FUNCTION (svacgt, svac_impl, (UNSPEC_COND_FCMGT)) +FUNCTION (svacle, svac_impl, (UNSPEC_COND_FCMLE)) +FUNCTION (svaclt, svac_impl, (UNSPEC_COND_FCMLT)) +FUNCTION (svadd, rtx_code_function, (PLUS, PLUS, UNSPEC_COND_FADD)) +FUNCTION (svadda, svadda_impl,) +FUNCTION (svaddv, reduction, (UNSPEC_SADDV, UNSPEC_UADDV, UNSPEC_FADDV)) +FUNCTION (svadrb, svadr_bhwd_impl, (0)) +FUNCTION (svadrd, svadr_bhwd_impl, (3)) +FUNCTION (svadrh, svadr_bhwd_impl, (1)) +FUNCTION (svadrw, svadr_bhwd_impl, (2)) +FUNCTION (svand, rtx_code_function, (AND, AND)) +FUNCTION (svandv, reduction, (UNSPEC_ANDV)) +FUNCTION (svasr, rtx_code_function, (ASHIFTRT, ASHIFTRT)) +FUNCTION (svasr_wide, shift_wide, (ASHIFTRT, UNSPEC_ASHIFTRT_WIDE)) +FUNCTION (svasrd, svasrd_impl,) +FUNCTION (svbfdot, fixed_insn_function, (CODE_FOR_aarch64_sve_bfdotvnx4sf)) +FUNCTION (svbfdot_lane, fixed_insn_function, + (CODE_FOR_aarch64_sve_bfdot_lanevnx4sf)) +FUNCTION (svbfmlalb, fixed_insn_function, (CODE_FOR_aarch64_sve_bfmlalbvnx4sf)) +FUNCTION (svbfmlalb_lane, fixed_insn_function, + (CODE_FOR_aarch64_sve_bfmlalb_lanevnx4sf)) +FUNCTION (svbfmlalt, fixed_insn_function, (CODE_FOR_aarch64_sve_bfmlaltvnx4sf)) +FUNCTION (svbfmlalt_lane, fixed_insn_function, + (CODE_FOR_aarch64_sve_bfmlalt_lanevnx4sf)) +FUNCTION (svbfmmla, fixed_insn_function, (CODE_FOR_aarch64_sve_bfmmlavnx4sf)) +FUNCTION (svbic, svbic_impl,) +FUNCTION (svbrka, svbrk_unary_impl, (UNSPEC_BRKA)) +FUNCTION (svbrkb, svbrk_unary_impl, (UNSPEC_BRKB)) +FUNCTION (svbrkn, svbrk_binary_impl, (UNSPEC_BRKN)) +FUNCTION (svbrkpa, svbrk_binary_impl, (UNSPEC_BRKPA)) +FUNCTION (svbrkpb, svbrk_binary_impl, (UNSPEC_BRKPB)) +FUNCTION (svcadd, svcadd_impl,) +FUNCTION (svclasta, svclast_impl, (UNSPEC_CLASTA)) +FUNCTION (svclastb, svclast_impl, (UNSPEC_CLASTB)) +FUNCTION (svcls, unary_count, (CLRSB)) +FUNCTION (svclz, unary_count, (CLZ)) +FUNCTION (svcmla, svcmla_impl,) +FUNCTION (svcmla_lane, svcmla_lane_impl,) +FUNCTION (svcmpeq, svcmp_impl, (EQ_EXPR, UNSPEC_COND_FCMEQ)) +FUNCTION (svcmpeq_wide, svcmp_wide_impl, (EQ_EXPR, UNSPEC_COND_CMPEQ_WIDE, + UNSPEC_COND_CMPEQ_WIDE)) +FUNCTION (svcmpge, svcmp_impl, (GE_EXPR, UNSPEC_COND_FCMGE)) +FUNCTION (svcmpge_wide, svcmp_wide_impl, (GE_EXPR, UNSPEC_COND_CMPGE_WIDE, + UNSPEC_COND_CMPHS_WIDE)) +FUNCTION 
(svcmpgt, svcmp_impl, (GT_EXPR, UNSPEC_COND_FCMGT)) +FUNCTION (svcmpgt_wide, svcmp_wide_impl, (GT_EXPR, UNSPEC_COND_CMPGT_WIDE, + UNSPEC_COND_CMPHI_WIDE)) +FUNCTION (svcmple, svcmp_impl, (LE_EXPR, UNSPEC_COND_FCMLE)) +FUNCTION (svcmple_wide, svcmp_wide_impl, (LE_EXPR, UNSPEC_COND_CMPLE_WIDE, + UNSPEC_COND_CMPLS_WIDE)) +FUNCTION (svcmplt, svcmp_impl, (LT_EXPR, UNSPEC_COND_FCMLT)) +FUNCTION (svcmplt_wide, svcmp_wide_impl, (LT_EXPR, UNSPEC_COND_CMPLT_WIDE, + UNSPEC_COND_CMPLO_WIDE)) +FUNCTION (svcmpne, svcmp_impl, (NE_EXPR, UNSPEC_COND_FCMNE)) +FUNCTION (svcmpne_wide, svcmp_wide_impl, (NE_EXPR, UNSPEC_COND_CMPNE_WIDE, + UNSPEC_COND_CMPNE_WIDE)) +FUNCTION (svcmpuo, svcmpuo_impl,) +FUNCTION (svcnot, svcnot_impl,) +FUNCTION (svcnt, unary_count, (POPCOUNT)) +FUNCTION (svcntb, svcnt_bhwd_impl, (VNx16QImode)) +FUNCTION (svcntb_pat, svcnt_bhwd_pat_impl, (VNx16QImode)) +FUNCTION (svcntd, svcnt_bhwd_impl, (VNx2DImode)) +FUNCTION (svcntd_pat, svcnt_bhwd_pat_impl, (VNx2DImode)) +FUNCTION (svcnth, svcnt_bhwd_impl, (VNx8HImode)) +FUNCTION (svcnth_pat, svcnt_bhwd_pat_impl, (VNx8HImode)) +FUNCTION (svcntp, svcntp_impl,) +FUNCTION (svcntw, svcnt_bhwd_impl, (VNx4SImode)) +FUNCTION (svcntw_pat, svcnt_bhwd_pat_impl, (VNx4SImode)) +FUNCTION (svcompact, QUIET_CODE_FOR_MODE0 (aarch64_sve_compact),) +FUNCTION (svcreate2, svcreate_impl, (2)) +FUNCTION (svcreate3, svcreate_impl, (3)) +FUNCTION (svcreate4, svcreate_impl, (4)) +FUNCTION (svcvt, svcvt_impl,) +FUNCTION (svcvtnt, CODE_FOR_MODE0 (aarch64_sve_cvtnt),) +FUNCTION (svdiv, rtx_code_function, (DIV, UDIV, UNSPEC_COND_FDIV)) +FUNCTION (svdivr, rtx_code_function_rotated, (DIV, UDIV, UNSPEC_COND_FDIV)) +FUNCTION (svdot, svdot_impl,) +FUNCTION (svdot_lane, svdotprod_lane_impl, (UNSPEC_SDOT, UNSPEC_UDOT, -1)) +FUNCTION (svdup, svdup_impl,) +FUNCTION (svdup_lane, svdup_lane_impl,) +FUNCTION (svdupq, svdupq_impl,) +FUNCTION (svdupq_lane, svdupq_lane_impl,) +FUNCTION (sveor, rtx_code_function, (XOR, XOR, -1)) +FUNCTION (sveorv, reduction, (UNSPEC_XORV)) +FUNCTION (svexpa, unspec_based_function, (-1, -1, UNSPEC_FEXPA)) +FUNCTION (svext, QUIET_CODE_FOR_MODE0 (aarch64_sve_ext),) +FUNCTION (svextb, svext_bhw_impl, (QImode)) +FUNCTION (svexth, svext_bhw_impl, (HImode)) +FUNCTION (svextw, svext_bhw_impl, (SImode)) +FUNCTION (svget2, svget_impl, (2)) +FUNCTION (svget3, svget_impl, (3)) +FUNCTION (svget4, svget_impl, (4)) +FUNCTION (svindex, svindex_impl,) +FUNCTION (svinsr, svinsr_impl,) +FUNCTION (svlasta, svlast_impl, (UNSPEC_LASTA)) +FUNCTION (svlastb, svlast_impl, (UNSPEC_LASTB)) +FUNCTION (svld1, svld1_impl,) +FUNCTION (svld1_gather, svld1_gather_impl,) +FUNCTION (svld1ro, svld1ro_impl,) +FUNCTION (svld1rq, svld1rq_impl,) +FUNCTION (svld1sb, svld1_extend_impl, (TYPE_SUFFIX_s8)) +FUNCTION (svld1sb_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_s8)) +FUNCTION (svld1sh, svld1_extend_impl, (TYPE_SUFFIX_s16)) +FUNCTION (svld1sh_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_s16)) +FUNCTION (svld1sw, svld1_extend_impl, (TYPE_SUFFIX_s32)) +FUNCTION (svld1sw_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_s32)) +FUNCTION (svld1ub, svld1_extend_impl, (TYPE_SUFFIX_u8)) +FUNCTION (svld1ub_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_u8)) +FUNCTION (svld1uh, svld1_extend_impl, (TYPE_SUFFIX_u16)) +FUNCTION (svld1uh_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_u16)) +FUNCTION (svld1uw, svld1_extend_impl, (TYPE_SUFFIX_u32)) +FUNCTION (svld1uw_gather, svld1_gather_extend_impl, (TYPE_SUFFIX_u32)) +FUNCTION (svld2, svld234_impl, (2)) +FUNCTION (svld3, svld234_impl, (3)) +FUNCTION (svld4, 
svld234_impl, (4)) +FUNCTION (svldff1, svldxf1_impl, (UNSPEC_LDFF1)) +FUNCTION (svldff1_gather, svldff1_gather_impl,) +FUNCTION (svldff1sb, svldxf1_extend_impl, (TYPE_SUFFIX_s8, UNSPEC_LDFF1)) +FUNCTION (svldff1sb_gather, svldff1_gather_extend, (TYPE_SUFFIX_s8)) +FUNCTION (svldff1sh, svldxf1_extend_impl, (TYPE_SUFFIX_s16, UNSPEC_LDFF1)) +FUNCTION (svldff1sh_gather, svldff1_gather_extend, (TYPE_SUFFIX_s16)) +FUNCTION (svldff1sw, svldxf1_extend_impl, (TYPE_SUFFIX_s32, UNSPEC_LDFF1)) +FUNCTION (svldff1sw_gather, svldff1_gather_extend, (TYPE_SUFFIX_s32)) +FUNCTION (svldff1ub, svldxf1_extend_impl, (TYPE_SUFFIX_u8, UNSPEC_LDFF1)) +FUNCTION (svldff1ub_gather, svldff1_gather_extend, (TYPE_SUFFIX_u8)) +FUNCTION (svldff1uh, svldxf1_extend_impl, (TYPE_SUFFIX_u16, UNSPEC_LDFF1)) +FUNCTION (svldff1uh_gather, svldff1_gather_extend, (TYPE_SUFFIX_u16)) +FUNCTION (svldff1uw, svldxf1_extend_impl, (TYPE_SUFFIX_u32, UNSPEC_LDFF1)) +FUNCTION (svldff1uw_gather, svldff1_gather_extend, (TYPE_SUFFIX_u32)) +FUNCTION (svldnf1, svldxf1_impl, (UNSPEC_LDNF1)) +FUNCTION (svldnf1sb, svldxf1_extend_impl, (TYPE_SUFFIX_s8, UNSPEC_LDNF1)) +FUNCTION (svldnf1sh, svldxf1_extend_impl, (TYPE_SUFFIX_s16, UNSPEC_LDNF1)) +FUNCTION (svldnf1sw, svldxf1_extend_impl, (TYPE_SUFFIX_s32, UNSPEC_LDNF1)) +FUNCTION (svldnf1ub, svldxf1_extend_impl, (TYPE_SUFFIX_u8, UNSPEC_LDNF1)) +FUNCTION (svldnf1uh, svldxf1_extend_impl, (TYPE_SUFFIX_u16, UNSPEC_LDNF1)) +FUNCTION (svldnf1uw, svldxf1_extend_impl, (TYPE_SUFFIX_u32, UNSPEC_LDNF1)) +FUNCTION (svldnt1, svldnt1_impl,) +FUNCTION (svlen, svlen_impl,) +FUNCTION (svlsl, rtx_code_function, (ASHIFT, ASHIFT)) +FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE)) +FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT)) +FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE)) +FUNCTION (svmad, svmad_impl,) +FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX)) +FUNCTION (svmaxnm, unspec_based_function, (-1, -1, UNSPEC_COND_FMAXNM)) +FUNCTION (svmaxnmv, reduction, (UNSPEC_FMAXNMV)) +FUNCTION (svmaxv, reduction, (UNSPEC_SMAXV, UNSPEC_UMAXV, UNSPEC_FMAXV)) +FUNCTION (svmin, rtx_code_function, (SMIN, UMIN, UNSPEC_COND_FMIN)) +FUNCTION (svminnm, unspec_based_function, (-1, -1, UNSPEC_COND_FMINNM)) +FUNCTION (svminnmv, reduction, (UNSPEC_FMINNMV)) +FUNCTION (svminv, reduction, (UNSPEC_SMINV, UNSPEC_UMINV, UNSPEC_FMINV)) +FUNCTION (svmla, svmla_impl,) +FUNCTION (svmla_lane, svmla_svmls_lane_impl, (UNSPEC_FMLA)) +FUNCTION (svmls, svmls_impl,) +FUNCTION (svmls_lane, svmla_svmls_lane_impl, (UNSPEC_FMLS)) +FUNCTION (svmmla, svmmla_impl,) +FUNCTION (svmov, svmov_impl,) +FUNCTION (svmsb, svmsb_impl,) +FUNCTION (svmul, rtx_code_function, (MULT, MULT, UNSPEC_COND_FMUL)) +FUNCTION (svmul_lane, CODE_FOR_MODE0 (aarch64_mul_lane),) +FUNCTION (svmulh, unspec_based_function, (UNSPEC_SMUL_HIGHPART, + UNSPEC_UMUL_HIGHPART, -1)) +FUNCTION (svmulx, unspec_based_function, (-1, -1, UNSPEC_COND_FMULX)) +FUNCTION (svnand, svnand_impl,) +FUNCTION (svneg, quiet, (NEG, NEG, UNSPEC_COND_FNEG)) +FUNCTION (svnmad, unspec_based_function, (-1, -1, UNSPEC_COND_FNMLA)) +FUNCTION (svnmla, unspec_based_function_rotated, (-1, -1, UNSPEC_COND_FNMLA)) +FUNCTION (svnmls, unspec_based_function_rotated, (-1, -1, UNSPEC_COND_FNMLS)) +FUNCTION (svnmsb, unspec_based_function, (-1, -1, UNSPEC_COND_FNMLS)) +FUNCTION (svnor, svnor_impl,) +FUNCTION (svnot, svnot_impl,) +FUNCTION (svorn, svorn_impl,) +FUNCTION (svorr, rtx_code_function, (IOR, IOR)) +FUNCTION (svorv, reduction, (UNSPEC_IORV)) +FUNCTION (svpfalse, 
svpfalse_impl,) +FUNCTION (svpfirst, svpfirst_svpnext_impl, (UNSPEC_PFIRST)) +FUNCTION (svpnext, svpfirst_svpnext_impl, (UNSPEC_PNEXT)) +FUNCTION (svprfb, svprf_bhwd_impl, (VNx16QImode)) +FUNCTION (svprfb_gather, svprf_bhwd_gather_impl, (VNx16QImode)) +FUNCTION (svprfd, svprf_bhwd_impl, (VNx2DImode)) +FUNCTION (svprfd_gather, svprf_bhwd_gather_impl, (VNx2DImode)) +FUNCTION (svprfh, svprf_bhwd_impl, (VNx8HImode)) +FUNCTION (svprfh_gather, svprf_bhwd_gather_impl, (VNx8HImode)) +FUNCTION (svprfw, svprf_bhwd_impl, (VNx4SImode)) +FUNCTION (svprfw_gather, svprf_bhwd_gather_impl, (VNx4SImode)) +FUNCTION (svptest_any, svptest_impl, (NE)) +FUNCTION (svptest_first, svptest_impl, (LT)) +FUNCTION (svptest_last, svptest_impl, (LTU)) +FUNCTION (svptrue, svptrue_impl,) +FUNCTION (svptrue_pat, svptrue_pat_impl,) +FUNCTION (svqadd, svqadd_impl,) +FUNCTION (svqdecb, svqdec_bhwd_impl, (QImode)) +FUNCTION (svqdecb_pat, svqdec_bhwd_impl, (QImode)) +FUNCTION (svqdecd, svqdec_bhwd_impl, (DImode)) +FUNCTION (svqdecd_pat, svqdec_bhwd_impl, (DImode)) +FUNCTION (svqdech, svqdec_bhwd_impl, (HImode)) +FUNCTION (svqdech_pat, svqdec_bhwd_impl, (HImode)) +FUNCTION (svqdecp, svqdecp_svqincp_impl, (SS_MINUS, US_MINUS)) +FUNCTION (svqdecw, svqdec_bhwd_impl, (SImode)) +FUNCTION (svqdecw_pat, svqdec_bhwd_impl, (SImode)) +FUNCTION (svqincb, svqinc_bhwd_impl, (QImode)) +FUNCTION (svqincb_pat, svqinc_bhwd_impl, (QImode)) +FUNCTION (svqincd, svqinc_bhwd_impl, (DImode)) +FUNCTION (svqincd_pat, svqinc_bhwd_impl, (DImode)) +FUNCTION (svqinch, svqinc_bhwd_impl, (HImode)) +FUNCTION (svqinch_pat, svqinc_bhwd_impl, (HImode)) +FUNCTION (svqincp, svqdecp_svqincp_impl, (SS_PLUS, US_PLUS)) +FUNCTION (svqincw, svqinc_bhwd_impl, (SImode)) +FUNCTION (svqincw_pat, svqinc_bhwd_impl, (SImode)) +FUNCTION (svqsub, svqsub_impl,) +FUNCTION (svrbit, unspec_based_function, (UNSPEC_RBIT, UNSPEC_RBIT, -1)) +FUNCTION (svrdffr, svrdffr_impl,) +FUNCTION (svrecpe, unspec_based_function, (-1, -1, UNSPEC_FRECPE)) +FUNCTION (svrecps, unspec_based_function, (-1, -1, UNSPEC_FRECPS)) +FUNCTION (svrecpx, unspec_based_function, (-1, -1, UNSPEC_COND_FRECPX)) +FUNCTION (svreinterpret, svreinterpret_impl,) +FUNCTION (svrev, svrev_impl,) +FUNCTION (svrevb, unspec_based_function, (UNSPEC_REVB, UNSPEC_REVB, -1)) +FUNCTION (svrevh, unspec_based_function, (UNSPEC_REVH, UNSPEC_REVH, -1)) +FUNCTION (svrevw, unspec_based_function, (UNSPEC_REVW, UNSPEC_REVW, -1)) +FUNCTION (svrinta, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTA)) +FUNCTION (svrinti, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTI)) +FUNCTION (svrintm, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTM)) +FUNCTION (svrintn, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTN)) +FUNCTION (svrintp, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTP)) +FUNCTION (svrintx, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTX)) +FUNCTION (svrintz, unspec_based_function, (-1, -1, UNSPEC_COND_FRINTZ)) +FUNCTION (svrsqrte, unspec_based_function, (-1, -1, UNSPEC_RSQRTE)) +FUNCTION (svrsqrts, unspec_based_function, (-1, -1, UNSPEC_RSQRTS)) +FUNCTION (svscale, unspec_based_function, (-1, -1, UNSPEC_COND_FSCALE)) +FUNCTION (svsel, svsel_impl,) +FUNCTION (svset2, svset_impl, (2)) +FUNCTION (svset3, svset_impl, (3)) +FUNCTION (svset4, svset_impl, (4)) +FUNCTION (svsetffr, svsetffr_impl,) +FUNCTION (svsplice, QUIET_CODE_FOR_MODE0 (aarch64_sve_splice),) +FUNCTION (svsqrt, rtx_code_function, (SQRT, SQRT, UNSPEC_COND_FSQRT)) +FUNCTION (svst1, svst1_impl,) +FUNCTION (svst1_scatter, svst1_scatter_impl,) +FUNCTION (svst1b, 
svst1_truncate_impl, (QImode)) +FUNCTION (svst1b_scatter, svst1_scatter_truncate_impl, (QImode)) +FUNCTION (svst1h, svst1_truncate_impl, (HImode)) +FUNCTION (svst1h_scatter, svst1_scatter_truncate_impl, (HImode)) +FUNCTION (svst1w, svst1_truncate_impl, (SImode)) +FUNCTION (svst1w_scatter, svst1_scatter_truncate_impl, (SImode)) +FUNCTION (svst2, svst234_impl, (2)) +FUNCTION (svst3, svst234_impl, (3)) +FUNCTION (svst4, svst234_impl, (4)) +FUNCTION (svstnt1, svstnt1_impl,) +FUNCTION (svsub, svsub_impl,) +FUNCTION (svsubr, rtx_code_function_rotated, (MINUS, MINUS, UNSPEC_COND_FSUB)) +FUNCTION (svsudot, svusdot_impl, (true)) +FUNCTION (svsudot_lane, svdotprod_lane_impl, (UNSPEC_SUDOT, -1, -1)) +FUNCTION (svtbl, svtbl_impl,) +FUNCTION (svtmad, CODE_FOR_MODE0 (aarch64_sve_tmad),) +FUNCTION (svtrn1, svtrn_impl, (0)) +FUNCTION (svtrn1q, unspec_based_function, (UNSPEC_TRN1Q, UNSPEC_TRN1Q, + UNSPEC_TRN1Q)) +FUNCTION (svtrn2, svtrn_impl, (1)) +FUNCTION (svtrn2q, unspec_based_function, (UNSPEC_TRN2Q, UNSPEC_TRN2Q, + UNSPEC_TRN2Q)) +FUNCTION (svtsmul, unspec_based_function, (-1, -1, UNSPEC_FTSMUL)) +FUNCTION (svtssel, unspec_based_function, (-1, -1, UNSPEC_FTSSEL)) +FUNCTION (svundef, svundef_impl, (1)) +FUNCTION (svundef2, svundef_impl, (2)) +FUNCTION (svundef3, svundef_impl, (3)) +FUNCTION (svundef4, svundef_impl, (4)) +FUNCTION (svunpkhi, svunpk_impl, (true)) +FUNCTION (svunpklo, svunpk_impl, (false)) +FUNCTION (svusdot, svusdot_impl, (false)) +FUNCTION (svusdot_lane, svdotprod_lane_impl, (UNSPEC_USDOT, -1, -1)) +FUNCTION (svusmmla, unspec_based_add_function, (UNSPEC_USMATMUL, -1, -1)) +FUNCTION (svuzp1, svuzp_impl, (0)) +FUNCTION (svuzp1q, unspec_based_function, (UNSPEC_UZP1Q, UNSPEC_UZP1Q, + UNSPEC_UZP1Q)) +FUNCTION (svuzp2, svuzp_impl, (1)) +FUNCTION (svuzp2q, unspec_based_function, (UNSPEC_UZP2Q, UNSPEC_UZP2Q, + UNSPEC_UZP2Q)) +FUNCTION (svwhilele, svwhile_impl, (UNSPEC_WHILELE, UNSPEC_WHILELS, true)) +FUNCTION (svwhilelt, svwhile_impl, (UNSPEC_WHILELT, UNSPEC_WHILELO, false)) +FUNCTION (svwrffr, svwrffr_impl,) +FUNCTION (svzip1, svzip_impl, (0)) +FUNCTION (svzip1q, unspec_based_function, (UNSPEC_ZIP1Q, UNSPEC_ZIP1Q, + UNSPEC_ZIP1Q)) +FUNCTION (svzip2, svzip_impl, (1)) +FUNCTION (svzip2q, unspec_based_function, (UNSPEC_ZIP2Q, UNSPEC_ZIP2Q, + UNSPEC_ZIP2Q)) + +} /* end namespace aarch64_sve */ diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def new file mode 100644 index 000000000..795a5fd90 --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.def @@ -0,0 +1,355 @@ +/* ACLE support for AArch64 SVE (__ARM_FEATURE_SVE intrinsics) + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . 
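For orientation, the FUNCTION table that ends above binds each ACLE intrinsic name to one expander object; for example the svcntb/svcnth/svcntw/svcntd entries tie the counting intrinsics to the four element widths (VNx16QImode, VNx8HImode, VNx4SImode, VNx2DImode). At the user level these become the vector-length queries that SVE loops are strip-mined with. A minimal sketch, not taken from the patch, assuming an SVE-enabled toolchain and target:

#include <arm_sve.h>
#include <stdio.h>

int
main (void)
{
  /* Elements per SVE vector at 8-, 16-, 32- and 64-bit widths;
     for example 32, 16, 8 and 4 on a 256-bit implementation.  */
  printf ("%llu %llu %llu %llu\n",
          (unsigned long long) svcntb (),
          (unsigned long long) svcnth (),
          (unsigned long long) svcntw (),
          (unsigned long long) svcntd ());
  return 0;
}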
*/ + +#define REQUIRED_EXTENSIONS 0 +DEF_SVE_FUNCTION (svabd, binary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svabs, unary, all_float_and_signed, mxz) +DEF_SVE_FUNCTION (svacge, compare_opt_n, all_float, implicit) +DEF_SVE_FUNCTION (svacgt, compare_opt_n, all_float, implicit) +DEF_SVE_FUNCTION (svacle, compare_opt_n, all_float, implicit) +DEF_SVE_FUNCTION (svaclt, compare_opt_n, all_float, implicit) +DEF_SVE_FUNCTION (svadd, binary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svadda, fold_left, all_float, implicit) +DEF_SVE_FUNCTION (svaddv, reduction_wide, all_arith, implicit) +DEF_SVE_FUNCTION (svadrb, adr_offset, none, none) +DEF_SVE_FUNCTION (svadrd, adr_index, none, none) +DEF_SVE_FUNCTION (svadrh, adr_index, none, none) +DEF_SVE_FUNCTION (svadrw, adr_index, none, none) +DEF_SVE_FUNCTION (svand, binary_opt_n, all_integer, mxz) +DEF_SVE_FUNCTION (svand, binary_opt_n, b, z) +DEF_SVE_FUNCTION (svandv, reduction, all_integer, implicit) +DEF_SVE_FUNCTION (svasr, binary_uint_opt_n, all_signed, mxz) +DEF_SVE_FUNCTION (svasr_wide, binary_uint64_opt_n, bhs_signed, mxz) +DEF_SVE_FUNCTION (svasrd, shift_right_imm, all_signed, mxz) +DEF_SVE_FUNCTION (svbic, binary_opt_n, all_integer, mxz) +DEF_SVE_FUNCTION (svbic, binary_opt_n, b, z) +DEF_SVE_FUNCTION (svbrka, unary, b, mz) +DEF_SVE_FUNCTION (svbrkb, unary, b, mz) +DEF_SVE_FUNCTION (svbrkn, binary, b, z) +DEF_SVE_FUNCTION (svbrkpa, binary, b, z) +DEF_SVE_FUNCTION (svbrkpb, binary, b, z) +DEF_SVE_FUNCTION (svcadd, binary_rotate, all_float, mxz) +DEF_SVE_FUNCTION (svclasta, clast, all_data, implicit) +DEF_SVE_FUNCTION (svclastb, clast, all_data, implicit) +DEF_SVE_FUNCTION (svcls, unary_to_uint, all_signed, mxz) +DEF_SVE_FUNCTION (svclz, unary_to_uint, all_integer, mxz) +DEF_SVE_FUNCTION (svcmla, ternary_rotate, all_float, mxz) +DEF_SVE_FUNCTION (svcmla_lane, ternary_lane_rotate, hs_float, none) +DEF_SVE_FUNCTION (svcmpeq, compare_opt_n, all_arith, implicit) +DEF_SVE_FUNCTION (svcmpeq_wide, compare_wide_opt_n, bhs_signed, implicit) +DEF_SVE_FUNCTION (svcmpge, compare_opt_n, all_arith, implicit) +DEF_SVE_FUNCTION (svcmpge_wide, compare_wide_opt_n, bhs_integer, implicit) +DEF_SVE_FUNCTION (svcmpgt, compare_opt_n, all_arith, implicit) +DEF_SVE_FUNCTION (svcmpgt_wide, compare_wide_opt_n, bhs_integer, implicit) +DEF_SVE_FUNCTION (svcmple, compare_opt_n, all_arith, implicit) +DEF_SVE_FUNCTION (svcmple_wide, compare_wide_opt_n, bhs_integer, implicit) +DEF_SVE_FUNCTION (svcmplt, compare_opt_n, all_arith, implicit) +DEF_SVE_FUNCTION (svcmplt_wide, compare_wide_opt_n, bhs_integer, implicit) +DEF_SVE_FUNCTION (svcmpne, compare_opt_n, all_arith, implicit) +DEF_SVE_FUNCTION (svcmpne_wide, compare_wide_opt_n, bhs_signed, implicit) +DEF_SVE_FUNCTION (svcmpuo, compare_opt_n, all_float, implicit) +DEF_SVE_FUNCTION (svcnot, unary, all_integer, mxz) +DEF_SVE_FUNCTION (svcnt, unary_to_uint, all_data, mxz) +DEF_SVE_FUNCTION (svcntb, count_inherent, none, none) +DEF_SVE_FUNCTION (svcntb_pat, count_pat, none, none) +DEF_SVE_FUNCTION (svcntd, count_inherent, none, none) +DEF_SVE_FUNCTION (svcntd_pat, count_pat, none, none) +DEF_SVE_FUNCTION (svcnth, count_inherent, none, none) +DEF_SVE_FUNCTION (svcnth_pat, count_pat, none, none) +DEF_SVE_FUNCTION (svcntp, count_pred, all_pred, implicit) +DEF_SVE_FUNCTION (svcntw, count_inherent, none, none) +DEF_SVE_FUNCTION (svcntw_pat, count_pat, none, none) +DEF_SVE_FUNCTION (svcompact, unary, sd_data, implicit) +DEF_SVE_FUNCTION (svcreate2, create, all_data, none) +DEF_SVE_FUNCTION (svcreate3, create, all_data, none) +DEF_SVE_FUNCTION 
(svcreate4, create, all_data, none) +DEF_SVE_FUNCTION (svcvt, unary_convert, cvt, mxz) +DEF_SVE_FUNCTION (svdiv, binary_opt_n, all_float_and_sd_integer, mxz) +DEF_SVE_FUNCTION (svdivr, binary_opt_n, all_float_and_sd_integer, mxz) +DEF_SVE_FUNCTION (svdot, ternary_qq_opt_n, sd_integer, none) +DEF_SVE_FUNCTION (svdot_lane, ternary_qq_lane, sd_integer, none) +DEF_SVE_FUNCTION (svdup, unary_n, all_data, mxz_or_none) +DEF_SVE_FUNCTION (svdup, unary_n, all_pred, none) +DEF_SVE_FUNCTION (svdup_lane, binary_uint_n, all_data, none) +DEF_SVE_FUNCTION (svdupq, dupq, all_data, none) +DEF_SVE_FUNCTION (svdupq, dupq, all_pred, none) +DEF_SVE_FUNCTION (svdupq_lane, binary_uint64_n, all_data, none) +DEF_SVE_FUNCTION (sveor, binary_opt_n, all_integer, mxz) +DEF_SVE_FUNCTION (sveor, binary_opt_n, b, z) +DEF_SVE_FUNCTION (sveorv, reduction, all_integer, implicit) +DEF_SVE_FUNCTION (svexpa, unary_uint, all_float, none) +DEF_SVE_FUNCTION (svext, ext, all_data, none) +DEF_SVE_FUNCTION (svextb, unary, hsd_integer, mxz) +DEF_SVE_FUNCTION (svexth, unary, sd_integer, mxz) +DEF_SVE_FUNCTION (svextw, unary, d_integer, mxz) +DEF_SVE_FUNCTION (svget2, get, all_data, none) +DEF_SVE_FUNCTION (svget3, get, all_data, none) +DEF_SVE_FUNCTION (svget4, get, all_data, none) +DEF_SVE_FUNCTION (svindex, binary_scalar, all_integer, none) +DEF_SVE_FUNCTION (svinsr, binary_n, all_data, none) +DEF_SVE_FUNCTION (svlasta, reduction, all_data, implicit) +DEF_SVE_FUNCTION (svlastb, reduction, all_data, implicit) +DEF_SVE_FUNCTION (svld1, load, all_data, implicit) +DEF_SVE_FUNCTION (svld1_gather, load_gather_sv, sd_data, implicit) +DEF_SVE_FUNCTION (svld1_gather, load_gather_vs, sd_data, implicit) +DEF_SVE_FUNCTION (svld1rq, load_replicate, all_data, implicit) +DEF_SVE_FUNCTION (svld1sb, load_ext, hsd_integer, implicit) +DEF_SVE_FUNCTION (svld1sb_gather, load_ext_gather_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svld1sh, load_ext, sd_integer, implicit) +DEF_SVE_FUNCTION (svld1sh_gather, load_ext_gather_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svld1sh_gather, load_ext_gather_index, sd_integer, implicit) +DEF_SVE_FUNCTION (svld1sw, load_ext, d_integer, implicit) +DEF_SVE_FUNCTION (svld1sw_gather, load_ext_gather_offset, d_integer, implicit) +DEF_SVE_FUNCTION (svld1sw_gather, load_ext_gather_index, d_integer, implicit) +DEF_SVE_FUNCTION (svld1ub, load_ext, hsd_integer, implicit) +DEF_SVE_FUNCTION (svld1ub_gather, load_ext_gather_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svld1uh, load_ext, sd_integer, implicit) +DEF_SVE_FUNCTION (svld1uh_gather, load_ext_gather_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svld1uh_gather, load_ext_gather_index, sd_integer, implicit) +DEF_SVE_FUNCTION (svld1uw, load_ext, d_integer, implicit) +DEF_SVE_FUNCTION (svld1uw_gather, load_ext_gather_offset, d_integer, implicit) +DEF_SVE_FUNCTION (svld1uw_gather, load_ext_gather_index, d_integer, implicit) +DEF_SVE_FUNCTION (svldff1, load, all_data, implicit) +DEF_SVE_FUNCTION (svldff1_gather, load_gather_sv, sd_data, implicit) +DEF_SVE_FUNCTION (svldff1_gather, load_gather_vs, sd_data, implicit) +DEF_SVE_FUNCTION (svldff1sb, load_ext, hsd_integer, implicit) +DEF_SVE_FUNCTION (svldff1sb_gather, load_ext_gather_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svldff1sh, load_ext, sd_integer, implicit) +DEF_SVE_FUNCTION (svldff1sh_gather, load_ext_gather_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svldff1sh_gather, load_ext_gather_index, sd_integer, implicit) +DEF_SVE_FUNCTION (svldff1sw, load_ext, d_integer, implicit) +DEF_SVE_FUNCTION 
(svldff1sw_gather, load_ext_gather_offset, d_integer, implicit) +DEF_SVE_FUNCTION (svldff1sw_gather, load_ext_gather_index, d_integer, implicit) +DEF_SVE_FUNCTION (svldff1ub, load_ext, hsd_integer, implicit) +DEF_SVE_FUNCTION (svldff1ub_gather, load_ext_gather_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svldff1uh, load_ext, sd_integer, implicit) +DEF_SVE_FUNCTION (svldff1uh_gather, load_ext_gather_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svldff1uh_gather, load_ext_gather_index, sd_integer, implicit) +DEF_SVE_FUNCTION (svldff1uw, load_ext, d_integer, implicit) +DEF_SVE_FUNCTION (svldff1uw_gather, load_ext_gather_offset, d_integer, implicit) +DEF_SVE_FUNCTION (svldff1uw_gather, load_ext_gather_index, d_integer, implicit) +DEF_SVE_FUNCTION (svldnf1, load, all_data, implicit) +DEF_SVE_FUNCTION (svldnf1sb, load_ext, hsd_integer, implicit) +DEF_SVE_FUNCTION (svldnf1sh, load_ext, sd_integer, implicit) +DEF_SVE_FUNCTION (svldnf1sw, load_ext, d_integer, implicit) +DEF_SVE_FUNCTION (svldnf1ub, load_ext, hsd_integer, implicit) +DEF_SVE_FUNCTION (svldnf1uh, load_ext, sd_integer, implicit) +DEF_SVE_FUNCTION (svldnf1uw, load_ext, d_integer, implicit) +DEF_SVE_FUNCTION (svldnt1, load, all_data, implicit) +DEF_SVE_FUNCTION (svld2, load, all_data, implicit) +DEF_SVE_FUNCTION (svld3, load, all_data, implicit) +DEF_SVE_FUNCTION (svld4, load, all_data, implicit) +DEF_SVE_FUNCTION (svlen, count_vector, all_data, none) +DEF_SVE_FUNCTION (svlsl, binary_uint_opt_n, all_integer, mxz) +DEF_SVE_FUNCTION (svlsl_wide, binary_uint64_opt_n, bhs_integer, mxz) +DEF_SVE_FUNCTION (svlsr, binary_uint_opt_n, all_unsigned, mxz) +DEF_SVE_FUNCTION (svlsr_wide, binary_uint64_opt_n, bhs_unsigned, mxz) +DEF_SVE_FUNCTION (svmad, ternary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svmax, binary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svmaxnm, binary_opt_n, all_float, mxz) +DEF_SVE_FUNCTION (svmaxnmv, reduction, all_float, implicit) +DEF_SVE_FUNCTION (svmaxv, reduction, all_arith, implicit) +DEF_SVE_FUNCTION (svmin, binary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svminnm, binary_opt_n, all_float, mxz) +DEF_SVE_FUNCTION (svminnmv, reduction, all_float, implicit) +DEF_SVE_FUNCTION (svminv, reduction, all_arith, implicit) +DEF_SVE_FUNCTION (svmla, ternary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svmla_lane, ternary_lane, all_float, none) +DEF_SVE_FUNCTION (svmls, ternary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svmls_lane, ternary_lane, all_float, none) +DEF_SVE_FUNCTION (svmmla, mmla, none, none) +DEF_SVE_FUNCTION (svmov, unary, b, z) +DEF_SVE_FUNCTION (svmsb, ternary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svmul, binary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svmul_lane, binary_lane, all_float, none) +DEF_SVE_FUNCTION (svmulh, binary_opt_n, all_integer, mxz) +DEF_SVE_FUNCTION (svmulx, binary_opt_n, all_float, mxz) +DEF_SVE_FUNCTION (svnand, binary_opt_n, b, z) +DEF_SVE_FUNCTION (svneg, unary, all_float_and_signed, mxz) +DEF_SVE_FUNCTION (svnmad, ternary_opt_n, all_float, mxz) +DEF_SVE_FUNCTION (svnmla, ternary_opt_n, all_float, mxz) +DEF_SVE_FUNCTION (svnmls, ternary_opt_n, all_float, mxz) +DEF_SVE_FUNCTION (svnmsb, ternary_opt_n, all_float, mxz) +DEF_SVE_FUNCTION (svnor, binary_opt_n, b, z) +DEF_SVE_FUNCTION (svnot, unary, all_integer, mxz) +DEF_SVE_FUNCTION (svnot, unary, b, z) +DEF_SVE_FUNCTION (svorn, binary_opt_n, b, z) +DEF_SVE_FUNCTION (svorr, binary_opt_n, all_integer, mxz) +DEF_SVE_FUNCTION (svorr, binary_opt_n, b, z) +DEF_SVE_FUNCTION (svorv, reduction, all_integer, implicit) +DEF_SVE_FUNCTION (svpfalse, 
inherent_b, b, none) +DEF_SVE_FUNCTION (svpfirst, unary, b, implicit) +DEF_SVE_FUNCTION (svpnext, unary_pred, all_pred, implicit) +DEF_SVE_FUNCTION (svprfb, prefetch, none, implicit) +DEF_SVE_FUNCTION (svprfb_gather, prefetch_gather_offset, none, implicit) +DEF_SVE_FUNCTION (svprfd, prefetch, none, implicit) +DEF_SVE_FUNCTION (svprfd_gather, prefetch_gather_index, none, implicit) +DEF_SVE_FUNCTION (svprfh, prefetch, none, implicit) +DEF_SVE_FUNCTION (svprfh_gather, prefetch_gather_index, none, implicit) +DEF_SVE_FUNCTION (svprfw, prefetch, none, implicit) +DEF_SVE_FUNCTION (svprfw_gather, prefetch_gather_index, none, implicit) +DEF_SVE_FUNCTION (svptest_any, ptest, none, implicit) +DEF_SVE_FUNCTION (svptest_first, ptest, none, implicit) +DEF_SVE_FUNCTION (svptest_last, ptest, none, implicit) +DEF_SVE_FUNCTION (svptrue, inherent, all_pred, none) +DEF_SVE_FUNCTION (svptrue_pat, pattern_pred, all_pred, none) +DEF_SVE_FUNCTION (svqadd, binary_opt_n, all_integer, none) +DEF_SVE_FUNCTION (svqdecb, inc_dec, sd_integer, none) +DEF_SVE_FUNCTION (svqdecb_pat, inc_dec_pat, sd_integer, none) +DEF_SVE_FUNCTION (svqdecd, inc_dec, d_integer, none) +DEF_SVE_FUNCTION (svqdecd, inc_dec, sd_integer, none) +DEF_SVE_FUNCTION (svqdecd_pat, inc_dec_pat, d_integer, none) +DEF_SVE_FUNCTION (svqdecd_pat, inc_dec_pat, sd_integer, none) +DEF_SVE_FUNCTION (svqdech, inc_dec, h_integer, none) +DEF_SVE_FUNCTION (svqdech, inc_dec, sd_integer, none) +DEF_SVE_FUNCTION (svqdech_pat, inc_dec_pat, h_integer, none) +DEF_SVE_FUNCTION (svqdech_pat, inc_dec_pat, sd_integer, none) +DEF_SVE_FUNCTION (svqdecp, inc_dec_pred, hsd_integer, none) +DEF_SVE_FUNCTION (svqdecp, inc_dec_pred_scalar, inc_dec_n, none) +DEF_SVE_FUNCTION (svqdecw, inc_dec, s_integer, none) +DEF_SVE_FUNCTION (svqdecw, inc_dec, sd_integer, none) +DEF_SVE_FUNCTION (svqdecw_pat, inc_dec_pat, s_integer, none) +DEF_SVE_FUNCTION (svqdecw_pat, inc_dec_pat, sd_integer, none) +DEF_SVE_FUNCTION (svqincb, inc_dec, sd_integer, none) +DEF_SVE_FUNCTION (svqincb_pat, inc_dec_pat, sd_integer, none) +DEF_SVE_FUNCTION (svqincd, inc_dec, d_integer, none) +DEF_SVE_FUNCTION (svqincd, inc_dec, sd_integer, none) +DEF_SVE_FUNCTION (svqincd_pat, inc_dec_pat, d_integer, none) +DEF_SVE_FUNCTION (svqincd_pat, inc_dec_pat, sd_integer, none) +DEF_SVE_FUNCTION (svqinch, inc_dec, h_integer, none) +DEF_SVE_FUNCTION (svqinch, inc_dec, sd_integer, none) +DEF_SVE_FUNCTION (svqinch_pat, inc_dec_pat, h_integer, none) +DEF_SVE_FUNCTION (svqinch_pat, inc_dec_pat, sd_integer, none) +DEF_SVE_FUNCTION (svqincp, inc_dec_pred, hsd_integer, none) +DEF_SVE_FUNCTION (svqincp, inc_dec_pred_scalar, inc_dec_n, none) +DEF_SVE_FUNCTION (svqincw, inc_dec, s_integer, none) +DEF_SVE_FUNCTION (svqincw, inc_dec, sd_integer, none) +DEF_SVE_FUNCTION (svqincw_pat, inc_dec_pat, s_integer, none) +DEF_SVE_FUNCTION (svqincw_pat, inc_dec_pat, sd_integer, none) +DEF_SVE_FUNCTION (svqsub, binary_opt_n, all_integer, none) +DEF_SVE_FUNCTION (svrbit, unary, all_integer, mxz) +DEF_SVE_FUNCTION (svrdffr, rdffr, none, z_or_none) +DEF_SVE_FUNCTION (svrecpe, unary, all_float, none) +DEF_SVE_FUNCTION (svrecps, binary, all_float, none) +DEF_SVE_FUNCTION (svrecpx, unary, all_float, mxz) +DEF_SVE_FUNCTION (svreinterpret, unary_convert, reinterpret, none) +DEF_SVE_FUNCTION (svrev, unary, all_data, none) +DEF_SVE_FUNCTION (svrev, unary_pred, all_pred, none) +DEF_SVE_FUNCTION (svrevb, unary, hsd_integer, mxz) +DEF_SVE_FUNCTION (svrevh, unary, sd_integer, mxz) +DEF_SVE_FUNCTION (svrevw, unary, d_integer, mxz) +DEF_SVE_FUNCTION (svrinta, unary, 
all_float, mxz) +DEF_SVE_FUNCTION (svrinti, unary, all_float, mxz) +DEF_SVE_FUNCTION (svrintm, unary, all_float, mxz) +DEF_SVE_FUNCTION (svrintn, unary, all_float, mxz) +DEF_SVE_FUNCTION (svrintp, unary, all_float, mxz) +DEF_SVE_FUNCTION (svrintx, unary, all_float, mxz) +DEF_SVE_FUNCTION (svrintz, unary, all_float, mxz) +DEF_SVE_FUNCTION (svrsqrte, unary, all_float, none) +DEF_SVE_FUNCTION (svrsqrts, binary, all_float, none) +DEF_SVE_FUNCTION (svscale, binary_int_opt_n, all_float, mxz) +DEF_SVE_FUNCTION (svsel, binary, all_data, implicit) +DEF_SVE_FUNCTION (svsel, binary, b, implicit) +DEF_SVE_FUNCTION (svset2, set, all_data, none) +DEF_SVE_FUNCTION (svset3, set, all_data, none) +DEF_SVE_FUNCTION (svset4, set, all_data, none) +DEF_SVE_FUNCTION (svsetffr, setffr, none, none) +DEF_SVE_FUNCTION (svsplice, binary, all_data, implicit) +DEF_SVE_FUNCTION (svsqrt, unary, all_float, mxz) +DEF_SVE_FUNCTION (svst1, store, all_data, implicit) +DEF_SVE_FUNCTION (svst1_scatter, store_scatter_index, sd_data, implicit) +DEF_SVE_FUNCTION (svst1_scatter, store_scatter_offset, sd_data, implicit) +DEF_SVE_FUNCTION (svst1b, store, hsd_integer, implicit) +DEF_SVE_FUNCTION (svst1b_scatter, store_scatter_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svst1h, store, sd_integer, implicit) +DEF_SVE_FUNCTION (svst1h_scatter, store_scatter_index, sd_integer, implicit) +DEF_SVE_FUNCTION (svst1h_scatter, store_scatter_offset, sd_integer, implicit) +DEF_SVE_FUNCTION (svst1w, store, d_integer, implicit) +DEF_SVE_FUNCTION (svst1w_scatter, store_scatter_index, d_integer, implicit) +DEF_SVE_FUNCTION (svst1w_scatter, store_scatter_offset, d_integer, implicit) +DEF_SVE_FUNCTION (svst2, store, all_data, implicit) +DEF_SVE_FUNCTION (svst3, store, all_data, implicit) +DEF_SVE_FUNCTION (svst4, store, all_data, implicit) +DEF_SVE_FUNCTION (svstnt1, store, all_data, implicit) +DEF_SVE_FUNCTION (svsub, binary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svsubr, binary_opt_n, all_arith, mxz) +DEF_SVE_FUNCTION (svtbl, binary_uint, all_data, none) +DEF_SVE_FUNCTION (svtmad, tmad, all_float, none) +DEF_SVE_FUNCTION (svtrn1, binary, all_data, none) +DEF_SVE_FUNCTION (svtrn1, binary_pred, all_pred, none) +DEF_SVE_FUNCTION (svtrn2, binary, all_data, none) +DEF_SVE_FUNCTION (svtrn2, binary_pred, all_pred, none) +DEF_SVE_FUNCTION (svtsmul, binary_uint, all_float, none) +DEF_SVE_FUNCTION (svtssel, binary_uint, all_float, none) +DEF_SVE_FUNCTION (svundef, inherent, all_data, none) +DEF_SVE_FUNCTION (svundef2, inherent, all_data, none) +DEF_SVE_FUNCTION (svundef3, inherent, all_data, none) +DEF_SVE_FUNCTION (svundef4, inherent, all_data, none) +DEF_SVE_FUNCTION (svunpkhi, unary_widen, hsd_integer, none) +DEF_SVE_FUNCTION (svunpkhi, unary_widen, b, none) +DEF_SVE_FUNCTION (svunpklo, unary_widen, hsd_integer, none) +DEF_SVE_FUNCTION (svunpklo, unary_widen, b, none) +DEF_SVE_FUNCTION (svuzp1, binary, all_data, none) +DEF_SVE_FUNCTION (svuzp1, binary_pred, all_pred, none) +DEF_SVE_FUNCTION (svuzp2, binary, all_data, none) +DEF_SVE_FUNCTION (svuzp2, binary_pred, all_pred, none) +DEF_SVE_FUNCTION (svwhilele, compare_scalar, while, none) +DEF_SVE_FUNCTION (svwhilelt, compare_scalar, while, none) +DEF_SVE_FUNCTION (svwrffr, setffr, none, implicit) +DEF_SVE_FUNCTION (svzip1, binary, all_data, none) +DEF_SVE_FUNCTION (svzip1, binary_pred, all_pred, none) +DEF_SVE_FUNCTION (svzip2, binary, all_data, none) +DEF_SVE_FUNCTION (svzip2, binary_pred, all_pred, none) +#undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_BF16 +DEF_SVE_FUNCTION 
(svbfdot, ternary_bfloat_opt_n, s_float, none) +DEF_SVE_FUNCTION (svbfdot_lane, ternary_bfloat_lanex2, s_float, none) +DEF_SVE_FUNCTION (svbfmlalb, ternary_bfloat_opt_n, s_float, none) +DEF_SVE_FUNCTION (svbfmlalb_lane, ternary_bfloat_lane, s_float, none) +DEF_SVE_FUNCTION (svbfmlalt, ternary_bfloat_opt_n, s_float, none) +DEF_SVE_FUNCTION (svbfmlalt_lane, ternary_bfloat_lane, s_float, none) +DEF_SVE_FUNCTION (svbfmmla, ternary_bfloat, s_float, none) +DEF_SVE_FUNCTION (svcvt, unary_convert, cvt_bfloat, mxz) +DEF_SVE_FUNCTION (svcvtnt, unary_convert_narrowt, cvt_bfloat, mx) +#undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_I8MM +DEF_SVE_FUNCTION (svmmla, mmla, s_integer, none) +DEF_SVE_FUNCTION (svusmmla, ternary_uintq_intq, s_signed, none) +DEF_SVE_FUNCTION (svsudot, ternary_intq_uintq_opt_n, s_signed, none) +DEF_SVE_FUNCTION (svsudot_lane, ternary_intq_uintq_lane, s_signed, none) +DEF_SVE_FUNCTION (svusdot, ternary_uintq_intq_opt_n, s_signed, none) +DEF_SVE_FUNCTION (svusdot_lane, ternary_uintq_intq_lane, s_signed, none) +#undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_F32MM +DEF_SVE_FUNCTION (svmmla, mmla, s_float, none) +#undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS AARCH64_FL_F64MM +DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) +DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) +DEF_SVE_FUNCTION (svtrn1q, binary, all_data, none) +DEF_SVE_FUNCTION (svtrn2q, binary, all_data, none) +DEF_SVE_FUNCTION (svuzp1q, binary, all_data, none) +DEF_SVE_FUNCTION (svuzp2q, binary, all_data, none) +DEF_SVE_FUNCTION (svzip1q, binary, all_data, none) +DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h new file mode 100644 index 000000000..2467e729e --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h @@ -0,0 +1,304 @@ +/* ACLE support for AArch64 SVE (__ARM_FEATURE_SVE intrinsics) + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . 
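The DEF_SVE_FUNCTION table that ends above is what makes these intrinsics callable through arm_sve.h once the listed extensions are enabled. As a user-level illustration of a few of the registered functions (svwhilelt, svld1, svadd, svst1, svcntw) working together, here is a minimal predicated loop; it is not part of the patch and assumes an SVE-enabled toolchain (e.g. -march=armv8.2-a+sve):

#include <arm_sve.h>

/* dst[i] = a[i] + b[i] for 0 <= i < n, with no scalar tail: the
   svwhilelt predicate masks off the out-of-range lanes in the
   final iteration.  */
void
vec_add (float *dst, const float *a, const float *b, int n)
{
  for (int i = 0; i < n; i += (int) svcntw ())
    {
      svbool_t pg = svwhilelt_b32 (i, n);
      svfloat32_t va = svld1_f32 (pg, a + i);
      svfloat32_t vb = svld1_f32 (pg, b + i);
      svst1_f32 (pg, dst + i, svadd_f32_x (pg, va, vb));
    }
}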
*/ + +#ifndef GCC_AARCH64_SVE_BUILTINS_BASE_H +#define GCC_AARCH64_SVE_BUILTINS_BASE_H + +namespace aarch64_sve +{ + namespace functions + { + extern const function_base *const svabd; + extern const function_base *const svabs; + extern const function_base *const svacge; + extern const function_base *const svacgt; + extern const function_base *const svacle; + extern const function_base *const svaclt; + extern const function_base *const svadd; + extern const function_base *const svadda; + extern const function_base *const svaddv; + extern const function_base *const svadrb; + extern const function_base *const svadrd; + extern const function_base *const svadrh; + extern const function_base *const svadrw; + extern const function_base *const svand; + extern const function_base *const svandv; + extern const function_base *const svasr; + extern const function_base *const svasr_wide; + extern const function_base *const svasrd; + extern const function_base *const svbfdot; + extern const function_base *const svbfdot_lane; + extern const function_base *const svbfmlalb; + extern const function_base *const svbfmlalb_lane; + extern const function_base *const svbfmlalt; + extern const function_base *const svbfmlalt_lane; + extern const function_base *const svbfmmla; + extern const function_base *const svbic; + extern const function_base *const svbrka; + extern const function_base *const svbrkb; + extern const function_base *const svbrkn; + extern const function_base *const svbrkpa; + extern const function_base *const svbrkpb; + extern const function_base *const svcadd; + extern const function_base *const svclasta; + extern const function_base *const svclastb; + extern const function_base *const svcls; + extern const function_base *const svclz; + extern const function_base *const svcmla; + extern const function_base *const svcmla_lane; + extern const function_base *const svcmpeq; + extern const function_base *const svcmpeq_wide; + extern const function_base *const svcmpge; + extern const function_base *const svcmpge_wide; + extern const function_base *const svcmpgt; + extern const function_base *const svcmpgt_wide; + extern const function_base *const svcmple; + extern const function_base *const svcmple_wide; + extern const function_base *const svcmplt; + extern const function_base *const svcmplt_wide; + extern const function_base *const svcmpne; + extern const function_base *const svcmpne_wide; + extern const function_base *const svcmpuo; + extern const function_base *const svcnot; + extern const function_base *const svcnt; + extern const function_base *const svcntb; + extern const function_base *const svcntb_pat; + extern const function_base *const svcntd; + extern const function_base *const svcntd_pat; + extern const function_base *const svcnth; + extern const function_base *const svcnth_pat; + extern const function_base *const svcntp; + extern const function_base *const svcntw; + extern const function_base *const svcntw_pat; + extern const function_base *const svcompact; + extern const function_base *const svcreate2; + extern const function_base *const svcreate3; + extern const function_base *const svcreate4; + extern const function_base *const svcvt; + extern const function_base *const svcvtnt; + extern const function_base *const svdiv; + extern const function_base *const svdivr; + extern const function_base *const svdot; + extern const function_base *const svdot_lane; + extern const function_base *const svdup; + extern const function_base *const svdup_lane; + extern const function_base *const svdupq; 
+ extern const function_base *const svdupq_lane; + extern const function_base *const sveor; + extern const function_base *const sveorv; + extern const function_base *const svexpa; + extern const function_base *const svext; + extern const function_base *const svextb; + extern const function_base *const svexth; + extern const function_base *const svextw; + extern const function_base *const svget2; + extern const function_base *const svget3; + extern const function_base *const svget4; + extern const function_base *const svindex; + extern const function_base *const svinsr; + extern const function_base *const svlasta; + extern const function_base *const svlastb; + extern const function_base *const svld1; + extern const function_base *const svld1_gather; + extern const function_base *const svld1ro; + extern const function_base *const svld1rq; + extern const function_base *const svld1sb; + extern const function_base *const svld1sb_gather; + extern const function_base *const svld1sh; + extern const function_base *const svld1sh_gather; + extern const function_base *const svld1sw; + extern const function_base *const svld1sw_gather; + extern const function_base *const svld1ub; + extern const function_base *const svld1ub_gather; + extern const function_base *const svld1uh; + extern const function_base *const svld1uh_gather; + extern const function_base *const svld1uw; + extern const function_base *const svld1uw_gather; + extern const function_base *const svld2; + extern const function_base *const svld3; + extern const function_base *const svld4; + extern const function_base *const svldff1; + extern const function_base *const svldff1_gather; + extern const function_base *const svldff1sb; + extern const function_base *const svldff1sb_gather; + extern const function_base *const svldff1sh; + extern const function_base *const svldff1sh_gather; + extern const function_base *const svldff1sw; + extern const function_base *const svldff1sw_gather; + extern const function_base *const svldff1ub; + extern const function_base *const svldff1ub_gather; + extern const function_base *const svldff1uh; + extern const function_base *const svldff1uh_gather; + extern const function_base *const svldff1uw; + extern const function_base *const svldff1uw_gather; + extern const function_base *const svldnf1; + extern const function_base *const svldnf1sb; + extern const function_base *const svldnf1sh; + extern const function_base *const svldnf1sw; + extern const function_base *const svldnf1ub; + extern const function_base *const svldnf1uh; + extern const function_base *const svldnf1uw; + extern const function_base *const svldnt1; + extern const function_base *const svlen; + extern const function_base *const svlsl; + extern const function_base *const svlsl_wide; + extern const function_base *const svlsr; + extern const function_base *const svlsr_wide; + extern const function_base *const svmad; + extern const function_base *const svmax; + extern const function_base *const svmaxnm; + extern const function_base *const svmaxnmv; + extern const function_base *const svmaxv; + extern const function_base *const svmin; + extern const function_base *const svminnm; + extern const function_base *const svminnmv; + extern const function_base *const svminv; + extern const function_base *const svmla; + extern const function_base *const svmla_lane; + extern const function_base *const svmls; + extern const function_base *const svmls_lane; + extern const function_base *const svmmla; + extern const function_base *const svmov; + extern const 
function_base *const svmsb; + extern const function_base *const svmul; + extern const function_base *const svmul_lane; + extern const function_base *const svmulh; + extern const function_base *const svmulx; + extern const function_base *const svnand; + extern const function_base *const svneg; + extern const function_base *const svnmad; + extern const function_base *const svnmla; + extern const function_base *const svnmls; + extern const function_base *const svnmsb; + extern const function_base *const svnor; + extern const function_base *const svnot; + extern const function_base *const svorn; + extern const function_base *const svorr; + extern const function_base *const svorv; + extern const function_base *const svpfalse; + extern const function_base *const svpfirst; + extern const function_base *const svpnext; + extern const function_base *const svprfb; + extern const function_base *const svprfb_gather; + extern const function_base *const svprfd; + extern const function_base *const svprfd_gather; + extern const function_base *const svprfh; + extern const function_base *const svprfh_gather; + extern const function_base *const svprfw; + extern const function_base *const svprfw_gather; + extern const function_base *const svptest_any; + extern const function_base *const svptest_first; + extern const function_base *const svptest_last; + extern const function_base *const svptrue; + extern const function_base *const svptrue_pat; + extern const function_base *const svqadd; + extern const function_base *const svqdecb; + extern const function_base *const svqdecb_pat; + extern const function_base *const svqdecd; + extern const function_base *const svqdecd_pat; + extern const function_base *const svqdech; + extern const function_base *const svqdech_pat; + extern const function_base *const svqdecp; + extern const function_base *const svqdecw; + extern const function_base *const svqdecw_pat; + extern const function_base *const svqincb; + extern const function_base *const svqincb_pat; + extern const function_base *const svqincd; + extern const function_base *const svqincd_pat; + extern const function_base *const svqinch; + extern const function_base *const svqinch_pat; + extern const function_base *const svqincp; + extern const function_base *const svqincw; + extern const function_base *const svqincw_pat; + extern const function_base *const svqsub; + extern const function_base *const svrbit; + extern const function_base *const svrdffr; + extern const function_base *const svrecpe; + extern const function_base *const svrecps; + extern const function_base *const svrecpx; + extern const function_base *const svreinterpret; + extern const function_base *const svrev; + extern const function_base *const svrevb; + extern const function_base *const svrevh; + extern const function_base *const svrevw; + extern const function_base *const svrinta; + extern const function_base *const svrinti; + extern const function_base *const svrintm; + extern const function_base *const svrintn; + extern const function_base *const svrintp; + extern const function_base *const svrintx; + extern const function_base *const svrintz; + extern const function_base *const svrsqrte; + extern const function_base *const svrsqrts; + extern const function_base *const svscale; + extern const function_base *const svsel; + extern const function_base *const svset2; + extern const function_base *const svset3; + extern const function_base *const svset4; + extern const function_base *const svsetffr; + extern const function_base *const svsplice; + extern 
const function_base *const svsqrt; + extern const function_base *const svst1; + extern const function_base *const svst1_scatter; + extern const function_base *const svst1b; + extern const function_base *const svst1b_scatter; + extern const function_base *const svst1h; + extern const function_base *const svst1h_scatter; + extern const function_base *const svst1w; + extern const function_base *const svst1w_scatter; + extern const function_base *const svst2; + extern const function_base *const svst3; + extern const function_base *const svst4; + extern const function_base *const svstnt1; + extern const function_base *const svsub; + extern const function_base *const svsubr; + extern const function_base *const svsudot; + extern const function_base *const svsudot_lane; + extern const function_base *const svtbl; + extern const function_base *const svtmad; + extern const function_base *const svtrn1; + extern const function_base *const svtrn1q; + extern const function_base *const svtrn2; + extern const function_base *const svtrn2q; + extern const function_base *const svtsmul; + extern const function_base *const svtssel; + extern const function_base *const svundef; + extern const function_base *const svundef2; + extern const function_base *const svundef3; + extern const function_base *const svundef4; + extern const function_base *const svunpkhi; + extern const function_base *const svunpklo; + extern const function_base *const svusdot; + extern const function_base *const svusdot_lane; + extern const function_base *const svusmmla; + extern const function_base *const svuzp1; + extern const function_base *const svuzp1q; + extern const function_base *const svuzp2; + extern const function_base *const svuzp2q; + extern const function_base *const svwhilele; + extern const function_base *const svwhilelt; + extern const function_base *const svwrffr; + extern const function_base *const svzip1; + extern const function_base *const svzip1q; + extern const function_base *const svzip2; + extern const function_base *const svzip2q; + } +} + +#endif diff --git a/gcc/config/aarch64/aarch64-sve-builtins-functions.h b/gcc/config/aarch64/aarch64-sve-builtins-functions.h new file mode 100644 index 000000000..ee1760668 --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins-functions.h @@ -0,0 +1,630 @@ +/* ACLE support for AArch64 SVE (function_base classes) + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_AARCH64_SVE_BUILTINS_FUNCTIONS_H +#define GCC_AARCH64_SVE_BUILTINS_FUNCTIONS_H + +namespace aarch64_sve { + +/* Wrap T, which is derived from function_base, and indicate that the + function never has side effects. It is only necessary to use this + wrapper on functions that might have floating-point suffixes, since + otherwise we assume by default that the function has no side effects. 
*/ +template +class quiet : public T +{ +public: + CONSTEXPR quiet () : T () {} + + /* Unfortunately we can't use parameter packs yet. */ + template + CONSTEXPR quiet (const T1 &t1) : T (t1) {} + + template + CONSTEXPR quiet (const T1 &t1, const T2 &t2) : T (t1, t2) {} + + template + CONSTEXPR quiet (const T1 &t1, const T2 &t2, const T3 &t3) + : T (t1, t2, t3) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return 0; + } +}; + +/* A function_base that sometimes or always operates on tuples of + vectors. */ +class multi_vector_function : public function_base +{ +public: + CONSTEXPR multi_vector_function (unsigned int vectors_per_tuple) + : m_vectors_per_tuple (vectors_per_tuple) {} + + unsigned int + vectors_per_tuple () const OVERRIDE + { + return m_vectors_per_tuple; + } + + /* The number of vectors in a tuple, or 1 if the function only operates + on single vectors. */ + unsigned int m_vectors_per_tuple; +}; + +/* A function_base that loads or stores contiguous memory elements + without extending or truncating them. */ +class full_width_access : public multi_vector_function +{ +public: + CONSTEXPR full_width_access (unsigned int vectors_per_tuple = 1) + : multi_vector_function (vectors_per_tuple) {} + + tree + memory_scalar_type (const function_instance &fi) const OVERRIDE + { + return fi.scalar_type (0); + } + + machine_mode + memory_vector_mode (const function_instance &fi) const OVERRIDE + { + machine_mode mode = fi.vector_mode (0); + if (m_vectors_per_tuple != 1) + mode = targetm.array_mode (mode, m_vectors_per_tuple).require (); + return mode; + } +}; + +/* A function_base that loads elements from memory and extends them + to a wider element. The memory element type is a fixed part of + the function base name. */ +class extending_load : public function_base +{ +public: + CONSTEXPR extending_load (type_suffix_index memory_type) + : m_memory_type (memory_type) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_READ_MEMORY; + } + + tree + memory_scalar_type (const function_instance &) const OVERRIDE + { + return scalar_types[type_suffixes[m_memory_type].vector_type]; + } + + machine_mode + memory_vector_mode (const function_instance &fi) const OVERRIDE + { + machine_mode mem_mode = type_suffixes[m_memory_type].vector_mode; + machine_mode reg_mode = fi.vector_mode (0); + return aarch64_sve_data_mode (GET_MODE_INNER (mem_mode), + GET_MODE_NUNITS (reg_mode)).require (); + } + + /* Return the rtx code associated with the kind of extension that + the load performs. */ + rtx_code + extend_rtx_code () const + { + return (type_suffixes[m_memory_type].unsigned_p + ? ZERO_EXTEND : SIGN_EXTEND); + } + + /* The type of the memory elements. This is part of the function base + name rather than a true type suffix. */ + type_suffix_index m_memory_type; +}; + +/* A function_base that truncates vector elements and stores them to memory. + The memory element width is a fixed part of the function base name. */ +class truncating_store : public function_base +{ +public: + CONSTEXPR truncating_store (scalar_int_mode to_mode) : m_to_mode (to_mode) {} + + unsigned int + call_properties (const function_instance &) const OVERRIDE + { + return CP_WRITE_MEMORY; + } + + tree + memory_scalar_type (const function_instance &fi) const OVERRIDE + { + /* In truncating stores, the signedness of the memory element is defined + to be the same as the signedness of the vector element. 
The signedness + doesn't make any difference to the behavior of the function. */ + type_class_index tclass = fi.type_suffix (0).tclass; + unsigned int element_bits = GET_MODE_BITSIZE (m_to_mode); + type_suffix_index suffix = find_type_suffix (tclass, element_bits); + return scalar_types[type_suffixes[suffix].vector_type]; + } + + machine_mode + memory_vector_mode (const function_instance &fi) const OVERRIDE + { + poly_uint64 nunits = GET_MODE_NUNITS (fi.vector_mode (0)); + return aarch64_sve_data_mode (m_to_mode, nunits).require (); + } + + /* The mode of a single memory element. */ + scalar_int_mode m_to_mode; +}; + +/* An incomplete function_base for functions that have an associated rtx code. + It simply records information about the mapping for derived classes + to use. */ +class rtx_code_function_base : public function_base +{ +public: + CONSTEXPR rtx_code_function_base (rtx_code code_for_sint, + rtx_code code_for_uint, + int unspec_for_fp = -1) + : m_code_for_sint (code_for_sint), m_code_for_uint (code_for_uint), + m_unspec_for_fp (unspec_for_fp) {} + + /* The rtx code to use for signed and unsigned integers respectively. + Can be UNKNOWN for functions that don't have integer forms. */ + rtx_code m_code_for_sint; + rtx_code m_code_for_uint; + + /* The UNSPEC_COND_* to use for floating-point operations. Can be -1 + for functions that only operate on integers. */ + int m_unspec_for_fp; +}; + +/* A function_base for functions that have an associated rtx code. + It supports all forms of predication except PRED_implicit. */ +class rtx_code_function : public rtx_code_function_base +{ +public: + CONSTEXPR rtx_code_function (rtx_code code_for_sint, rtx_code code_for_uint, + int unspec_for_fp = -1) + : rtx_code_function_base (code_for_sint, code_for_uint, unspec_for_fp) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.map_to_rtx_codes (m_code_for_sint, m_code_for_uint, + m_unspec_for_fp); + } +}; + +/* Like rtx_code_function, but for functions that take what is normally + the final argument first. One use of this class is to handle binary + reversed operations; another is to handle MLA-style operations that + are normally expressed in GCC as MAD-style operations. */ +class rtx_code_function_rotated : public rtx_code_function_base +{ +public: + CONSTEXPR rtx_code_function_rotated (rtx_code code_for_sint, + rtx_code code_for_uint, + int unspec_for_fp = -1) + : rtx_code_function_base (code_for_sint, code_for_uint, unspec_for_fp) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Rotate the inputs into their normal order, but continue to make _m + functions merge with what was originally the first vector argument. */ + unsigned int nargs = e.args.length (); + e.rotate_inputs_left (e.pred != PRED_none ? 1 : 0, nargs); + return e.map_to_rtx_codes (m_code_for_sint, m_code_for_uint, + m_unspec_for_fp, nargs - 1); + } +}; + +/* An incomplete function_base for functions that have an associated + unspec code, with separate codes for signed integers, unsigned + integers and floating-point values. The class simply records + information about the mapping for derived classes to use. */ +class unspec_based_function_base : public function_base +{ +public: + CONSTEXPR unspec_based_function_base (int unspec_for_sint, + int unspec_for_uint, + int unspec_for_fp) + : m_unspec_for_sint (unspec_for_sint), + m_unspec_for_uint (unspec_for_uint), + m_unspec_for_fp (unspec_for_fp) + {} + + /* Return the unspec code to use for INSTANCE, based on type suffix 0. 
*/ + int + unspec_for (const function_instance &instance) const + { + return (!instance.type_suffix (0).integer_p ? m_unspec_for_fp + : instance.type_suffix (0).unsigned_p ? m_unspec_for_uint + : m_unspec_for_sint); + } + + /* The unspec code associated with signed-integer, unsigned-integer + and floating-point operations respectively. */ + int m_unspec_for_sint; + int m_unspec_for_uint; + int m_unspec_for_fp; +}; + +/* A function_base for functions that have an associated unspec code. + It supports all forms of predication except PRED_implicit. */ +class unspec_based_function : public unspec_based_function_base +{ +public: + CONSTEXPR unspec_based_function (int unspec_for_sint, int unspec_for_uint, + int unspec_for_fp) + : unspec_based_function_base (unspec_for_sint, unspec_for_uint, + unspec_for_fp) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.map_to_unspecs (m_unspec_for_sint, m_unspec_for_uint, + m_unspec_for_fp); + } +}; + +/* Like unspec_based_function, but for functions that take what is normally + the final argument first. One use of this class is to handle binary + reversed operations; another is to handle MLA-style operations that + are normally expressed in GCC as MAD-style operations. */ +class unspec_based_function_rotated : public unspec_based_function_base +{ +public: + CONSTEXPR unspec_based_function_rotated (int unspec_for_sint, + int unspec_for_uint, + int unspec_for_fp) + : unspec_based_function_base (unspec_for_sint, unspec_for_uint, + unspec_for_fp) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Rotate the inputs into their normal order, but continue to make _m + functions merge with what was originally the first vector argument. */ + unsigned int nargs = e.args.length (); + e.rotate_inputs_left (e.pred != PRED_none ? 1 : 0, nargs); + return e.map_to_unspecs (m_unspec_for_sint, m_unspec_for_uint, + m_unspec_for_fp, nargs - 1); + } +}; + +/* Like unspec_based_function, but map the function directly to + CODE (UNSPEC, M) instead of using the generic predication-based + expansion. where M is the vector mode associated with type suffix 0. + This is useful if the unspec doesn't describe the full operation or + if the usual predication rules don't apply for some reason. */ +template +class unspec_based_function_exact_insn : public unspec_based_function_base +{ +public: + CONSTEXPR unspec_based_function_exact_insn (int unspec_for_sint, + int unspec_for_uint, + int unspec_for_fp) + : unspec_based_function_base (unspec_for_sint, unspec_for_uint, + unspec_for_fp) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (CODE (unspec_for (e), e.vector_mode (0))); + } +}; + +/* A function that performs an unspec and then adds it to another value. */ +typedef unspec_based_function_exact_insn + unspec_based_add_function; + +/* A functon that uses aarch64_pred* patterns regardless of the + predication type. */ +typedef unspec_based_function_exact_insn + unspec_based_pred_function; + +/* A function that acts like unspec_based_function_exact_insn + when operating on integers, but that expands to an (fma ...)-style + aarch64_sve* operation when applied to floats. 
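The three-way choice in unspec_for above (mirrored by rtx_code_function's map_to_rtx_codes) is what lets a single registration serve the signed, unsigned and floating-point type suffixes. Below is a standalone sketch of that dispatch, using svmax's registration (SMAX, UMAX, UNSPEC_COND_FMAX) from aarch64-sve-builtins-base.cc as the concrete case; the enum values are stand-ins, not GCC's real rtl or unspec codes:

#include <cassert>

enum code { SMAX = 100, UMAX = 101, UNSPEC_COND_FMAX = 102 };
enum suffix { SUFFIX_SIGNED, SUFFIX_UNSIGNED, SUFFIX_FLOAT };

/* Same selection rule as unspec_for: float suffixes take the FP code,
   unsigned integer suffixes the unsigned code, everything else the
   signed code.  */
static int
select_code (suffix s, int for_sint, int for_uint, int for_fp)
{
  return s == SUFFIX_FLOAT ? for_fp
         : s == SUFFIX_UNSIGNED ? for_uint
         : for_sint;
}

int
main ()
{
  /* svmax_s32 -> SMAX, svmax_u8 -> UMAX, svmax_f64 -> UNSPEC_COND_FMAX.  */
  assert (select_code (SUFFIX_SIGNED, SMAX, UMAX, UNSPEC_COND_FMAX) == SMAX);
  assert (select_code (SUFFIX_UNSIGNED, SMAX, UMAX, UNSPEC_COND_FMAX) == UMAX);
  assert (select_code (SUFFIX_FLOAT, SMAX, UMAX, UNSPEC_COND_FMAX) == UNSPEC_COND_FMAX);
  return 0;
}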
*/ +template +class unspec_based_fused_function : public unspec_based_function_base +{ +public: + CONSTEXPR unspec_based_fused_function (int unspec_for_sint, + int unspec_for_uint, + int unspec_for_fp) + : unspec_based_function_base (unspec_for_sint, unspec_for_uint, + unspec_for_fp) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + int unspec = unspec_for (e); + insn_code icode; + if (e.type_suffix (0).float_p) + { + /* Put the operands in the normal (fma ...) order, with the accumulator + last. This fits naturally since that's also the unprinted operand + in the asm output. */ + e.rotate_inputs_left (0, e.pred != PRED_none ? 4 : 3); + icode = code_for_aarch64_sve (unspec, e.vector_mode (0)); + } + else + icode = INT_CODE (unspec, e.vector_mode (0)); + return e.use_exact_insn (icode); + } +}; + +/* Like unspec_based_fused_function, but for _lane functions. */ +template +class unspec_based_fused_lane_function : public unspec_based_function_base +{ +public: + CONSTEXPR unspec_based_fused_lane_function (int unspec_for_sint, + int unspec_for_uint, + int unspec_for_fp) + : unspec_based_function_base (unspec_for_sint, unspec_for_uint, + unspec_for_fp) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + int unspec = unspec_for (e); + insn_code icode; + if (e.type_suffix (0).float_p) + { + /* Put the operands in the normal (fma ...) order, with the accumulator + last. This fits naturally since that's also the unprinted operand + in the asm output. */ + e.rotate_inputs_left (0, e.pred != PRED_none ? 5 : 4); + icode = code_for_aarch64_lane (unspec, e.vector_mode (0)); + } + else + icode = INT_CODE (unspec, e.vector_mode (0)); + return e.use_exact_insn (icode); + } +}; + +/* A function_base that uses CODE_FOR_MODE (M) to get the associated + instruction code, where M is the vector mode associated with type + suffix N. */ +template +class code_for_mode_function : public function_base +{ +public: + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (CODE_FOR_MODE (e.vector_mode (N))); + } +}; + +/* A function that uses code_for_ (M), where M is the vector + mode associated with the first type suffix. */ +#define CODE_FOR_MODE0(PATTERN) code_for_mode_function + +/* Likewise for the second type suffix. */ +#define CODE_FOR_MODE1(PATTERN) code_for_mode_function + +/* Like CODE_FOR_MODE0, but the function doesn't raise exceptions when + operating on floating-point data. */ +#define QUIET_CODE_FOR_MODE0(PATTERN) \ + quiet< code_for_mode_function > + +/* A function_base for functions that always expand to a fixed insn pattern, + regardless of what the suffixes are. */ +class fixed_insn_function : public function_base +{ +public: + CONSTEXPR fixed_insn_function (insn_code code) : m_code (code) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + return e.use_exact_insn (m_code); + } + + /* The instruction to use. */ + insn_code m_code; +}; + +/* A function_base for functions that permute their arguments. */ +class permute : public quiet +{ +public: + /* Fold a unary or binary permute with the permute vector given by + BUILDER. */ + gimple * + fold_permute (const gimple_folder &f, const vec_perm_builder &builder) const + { + /* Punt for now on _b16 and wider; we'd need more complex evpc logic + to rerecognize the result. 
*/ + if (f.type_suffix (0).bool_p && f.type_suffix (0).element_bits > 8) + return NULL; + + unsigned int nargs = gimple_call_num_args (f.call); + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (f.lhs)); + vec_perm_indices indices (builder, nargs, nelts); + tree perm_type = build_vector_type (ssizetype, nelts); + return gimple_build_assign (f.lhs, VEC_PERM_EXPR, + gimple_call_arg (f.call, 0), + gimple_call_arg (f.call, nargs - 1), + vec_perm_indices_to_tree (perm_type, indices)); + } +}; + +/* A function_base for functions that permute two vectors using a fixed + choice of indices. */ +class binary_permute : public permute +{ +public: + CONSTEXPR binary_permute (int unspec) : m_unspec (unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + insn_code icode = code_for_aarch64_sve (m_unspec, e.vector_mode (0)); + return e.use_exact_insn (icode); + } + + /* The unspec code associated with the operation. */ + int m_unspec; +}; + +/* A function_base for functions that reduce a vector to a scalar. */ +class reduction : public function_base +{ +public: + CONSTEXPR reduction (int unspec) + : m_unspec_for_sint (unspec), + m_unspec_for_uint (unspec), + m_unspec_for_fp (unspec) + {} + + CONSTEXPR reduction (int unspec_for_sint, int unspec_for_uint, + int unspec_for_fp) + : m_unspec_for_sint (unspec_for_sint), + m_unspec_for_uint (unspec_for_uint), + m_unspec_for_fp (unspec_for_fp) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + int unspec = (!e.type_suffix (0).integer_p ? m_unspec_for_fp + : e.type_suffix (0).unsigned_p ? m_unspec_for_uint + : m_unspec_for_sint); + /* There's no distinction between SADDV and UADDV for 64-bit elements; + the signed versions only exist for narrower elements. */ + if (GET_MODE_UNIT_BITSIZE (mode) == 64 && unspec == UNSPEC_SADDV) + unspec = UNSPEC_UADDV; + return e.use_exact_insn (code_for_aarch64_pred_reduc (unspec, mode)); + } + + /* The unspec code associated with signed-integer, unsigned-integer + and floating-point operations respectively. */ + int m_unspec_for_sint; + int m_unspec_for_uint; + int m_unspec_for_fp; +}; + +/* A function_base for functions that shift narrower-than-64-bit values + by 64-bit amounts. */ +class shift_wide : public function_base +{ +public: + CONSTEXPR shift_wide (rtx_code code, int wide_unspec) + : m_code (code), m_wide_unspec (wide_unspec) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + machine_mode mode = e.vector_mode (0); + machine_mode elem_mode = GET_MODE_INNER (mode); + + /* If the argument is a constant that the normal shifts can handle + directly, use them instead. */ + rtx shift = unwrap_const_vec_duplicate (e.args.last ()); + if (aarch64_simd_shift_imm_p (shift, elem_mode, m_code == ASHIFT)) + { + e.args.last () = shift; + return e.map_to_rtx_codes (m_code, m_code, -1); + } + + if (e.pred == PRED_x) + return e.use_unpred_insn (code_for_aarch64_sve (m_wide_unspec, mode)); + + return e.use_cond_insn (code_for_cond (m_wide_unspec, mode)); + } + + /* The rtx code associated with a "normal" shift. */ + rtx_code m_code; + + /* The unspec code associated with the wide shift. */ + int m_wide_unspec; +}; + +/* A function_base for unary functions that count bits. */ +class unary_count : public quiet +{ +public: + CONSTEXPR unary_count (rtx_code code) : m_code (code) {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* The md patterns treat the operand as an integer. 
*/ + machine_mode mode = aarch64_sve_int_mode (e.vector_mode (0)); + e.args.last () = gen_lowpart (mode, e.args.last ()); + + if (e.pred == PRED_x) + return e.use_pred_x_insn (code_for_aarch64_pred (m_code, mode)); + + return e.use_cond_insn (code_for_cond (m_code, mode)); + } + + /* The rtx code associated with the operation. */ + rtx_code m_code; +}; + +/* A function_base for svwhile* functions. */ +class while_comparison : public function_base +{ +public: + CONSTEXPR while_comparison (int unspec_for_sint, int unspec_for_uint) + : m_unspec_for_sint (unspec_for_sint), + m_unspec_for_uint (unspec_for_uint) + {} + + rtx + expand (function_expander &e) const OVERRIDE + { + /* Suffix 0 determines the predicate mode, suffix 1 determines the + scalar mode and signedness. */ + int unspec = (e.type_suffix (1).unsigned_p + ? m_unspec_for_uint + : m_unspec_for_sint); + machine_mode pred_mode = e.vector_mode (0); + scalar_mode reg_mode = GET_MODE_INNER (e.vector_mode (1)); + return e.use_exact_insn (code_for_while (unspec, reg_mode, pred_mode)); + } + + /* The unspec codes associated with signed and unsigned operations + respectively. */ + int m_unspec_for_sint; + int m_unspec_for_uint; +}; + +} + +/* Declare the global function base NAME, creating it from an instance + of class CLASS with constructor arguments ARGS. */ +#define FUNCTION(NAME, CLASS, ARGS) \ + namespace { static CONSTEXPR const CLASS NAME##_obj ARGS; } \ + namespace functions { const function_base *const NAME = &NAME##_obj; } + +#endif diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc new file mode 100644 index 000000000..c6f6ce170 --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.cc @@ -0,0 +1,3451 @@ +/* ACLE support for AArch64 SVE (function shapes) + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "rtl.h" +#include "tm_p.h" +#include "memmodel.h" +#include "insn-codes.h" +#include "optabs.h" +#include "aarch64-sve-builtins.h" +#include "aarch64-sve-builtins-shapes.h" + +/* In the comments below, _t0 represents the first type suffix and _t1 + represents the second. Square brackets enclose characters that are + present in only the full name, not the overloaded name. Governing + predicate arguments and predicate suffixes are not shown, since they + depend on the predication type, which is a separate piece of + information from the shape. + + Non-overloaded functions may have additional suffixes beyond the + ones shown, if those suffixes don't affect the types in the type + signature. E.g. the predicate form of svtrn1 has a _b suffix, + but this does not affect the prototype, which is always + "svbool_t(svbool_t, svbool_t)". */ + +namespace aarch64_sve { + +/* Return a representation of "const T *". 
*/ +static tree +build_const_pointer (tree t) +{ + return build_pointer_type (build_qualified_type (t, TYPE_QUAL_CONST)); +} + +/* If INSTANCE has a governing predicate, add it to the list of argument + types in ARGUMENT_TYPES. RETURN_TYPE is the type returned by the + function. */ +static void +apply_predication (const function_instance &instance, tree return_type, + vec &argument_types) +{ + if (instance.pred != PRED_none) + { + argument_types.quick_insert (0, get_svbool_t ()); + /* For unary merge operations, the first argument is a vector with + the same type as the result. For unary_convert_narrowt it also + provides the "bottom" half of active elements, and is present + for all types of predication. */ + if ((argument_types.length () == 2 && instance.pred == PRED_m) + || instance.shape == shapes::unary_convert_narrowt) + argument_types.quick_insert (0, return_type); + } +} + +/* Parse and move past an element type in FORMAT and return it as a type + suffix. The format is: + + [01] - the element type in type suffix 0 or 1 of INSTANCE + f - a floating-point type with the given number of bits + f[01] - a floating-point type with the same width as type suffix 0 or 1 + B - bfloat16_t + h - a half-sized version of + p - a predicate (represented as TYPE_SUFFIX_b) + q - a quarter-sized version of + s - a signed type with the given number of bits + s[01] - a signed type with the same width as type suffix 0 or 1 + u - an unsigned type with the given number of bits + u[01] - an unsigned type with the same width as type suffix 0 or 1 + w - a 64-bit version of if is integral, otherwise + + where is another element type. */ +static type_suffix_index +parse_element_type (const function_instance &instance, const char *&format) +{ + int ch = *format++; + + if (ch == 'f' || ch == 's' || ch == 'u') + { + type_class_index tclass = (ch == 'f' ? TYPE_float + : ch == 's' ? TYPE_signed + : TYPE_unsigned); + char *end; + unsigned int bits = strtol (format, &end, 10); + format = end; + if (bits == 0 || bits == 1) + bits = instance.type_suffix (bits).element_bits; + return find_type_suffix (tclass, bits); + } + + if (ch == 'w') + { + type_suffix_index suffix = parse_element_type (instance, format); + if (type_suffixes[suffix].integer_p) + return find_type_suffix (type_suffixes[suffix].tclass, 64); + return suffix; + } + + if (ch == 'p') + return TYPE_SUFFIX_b; + + if (ch == 'B') + return TYPE_SUFFIX_bf16; + + if (ch == 'q') + { + type_suffix_index suffix = parse_element_type (instance, format); + return find_type_suffix (type_suffixes[suffix].tclass, + type_suffixes[suffix].element_bits / 4); + } + + if (ch == 'h') + { + type_suffix_index suffix = parse_element_type (instance, format); + /* Widening and narrowing doesn't change the type for predicates; + everything's still an svbool_t. */ + if (suffix == TYPE_SUFFIX_b) + return suffix; + return find_type_suffix (type_suffixes[suffix].tclass, + type_suffixes[suffix].element_bits / 2); + } + + if (ch == '0' || ch == '1') + return instance.type_suffix_ids[ch - '0']; + + gcc_unreachable (); +} + +/* Read and return a type from FORMAT for function INSTANCE. Advance + FORMAT beyond the type string. 
The format is: + + _ - void + al - array pointer for loads + ap - array pointer for prefetches + as - array pointer for stores + b - base vector type (from a _base suffix) + d - displacement vector type (from a _index or _offset suffix) + e - an enum with the given name + s - a scalar type with the given element suffix + t - a vector or tuple type with given element suffix [*1] + v - a vector with the given element suffix + + where has the format described above parse_element_type + + [*1] the vectors_per_tuple function indicates whether the type should + be a tuple, and if so, how many vectors it should contain. */ +static tree +parse_type (const function_instance &instance, const char *&format) +{ + int ch = *format++; + + if (ch == '_') + return void_type_node; + + if (ch == 'a') + { + ch = *format++; + if (ch == 'l') + return build_const_pointer (instance.memory_scalar_type ()); + if (ch == 'p') + return const_ptr_type_node; + if (ch == 's') + return build_pointer_type (instance.memory_scalar_type ()); + gcc_unreachable (); + } + + if (ch == 'b') + return instance.base_vector_type (); + + if (ch == 'd') + return instance.displacement_vector_type (); + + if (ch == 'e') + { + if (strncmp (format, "pattern", 7) == 0) + { + format += 7; + return acle_svpattern; + } + if (strncmp (format, "prfop", 5) == 0) + { + format += 5; + return acle_svprfop; + } + gcc_unreachable (); + } + + if (ch == 's') + { + type_suffix_index suffix = parse_element_type (instance, format); + return scalar_types[type_suffixes[suffix].vector_type]; + } + + if (ch == 't') + { + type_suffix_index suffix = parse_element_type (instance, format); + vector_type_index vector_type = type_suffixes[suffix].vector_type; + unsigned int num_vectors = instance.vectors_per_tuple (); + return acle_vector_types[num_vectors - 1][vector_type]; + } + + if (ch == 'v') + { + type_suffix_index suffix = parse_element_type (instance, format); + return acle_vector_types[0][type_suffixes[suffix].vector_type]; + } + + gcc_unreachable (); +} + +/* Read and move past any argument count at FORMAT for the function + signature of INSTANCE. The counts are: + + *q: one argument per element in a 128-bit quadword (as for svdupq) + *t: one argument per vector in a tuple (as for svcreate) + + Otherwise the count is 1. */ +static unsigned int +parse_count (const function_instance &instance, const char *&format) +{ + if (format[0] == '*' && format[1] == 'q') + { + format += 2; + return instance.elements_per_vq (0); + } + if (format[0] == '*' && format[1] == 't') + { + format += 2; + return instance.vectors_per_tuple (); + } + return 1; +} + +/* Read a type signature for INSTANCE from FORMAT. Add the argument types + to ARGUMENT_TYPES and return the return type. + + The format is a comma-separated list of types (as for parse_type), + with the first type being the return type and the rest being the + argument types. Each argument type can be followed by an optional + count (as for parse_count). 
*/ +static tree +parse_signature (const function_instance &instance, const char *format, + vec &argument_types) +{ + tree return_type = parse_type (instance, format); + while (format[0] == ',') + { + format += 1; + tree argument_type = parse_type (instance, format); + unsigned int count = parse_count (instance, format); + for (unsigned int i = 0; i < count; ++i) + argument_types.quick_push (argument_type); + } + gcc_assert (format[0] == 0); + return return_type; +} + +/* Add one function instance for GROUP, using mode suffix MODE_SUFFIX_ID, + the type suffixes at index TI and the predication suffix at index PI. + The other arguments are as for build_all. */ +static void +build_one (function_builder &b, const char *signature, + const function_group_info &group, mode_suffix_index mode_suffix_id, + unsigned int ti, unsigned int pi, bool force_direct_overloads) +{ + /* Byte forms of svdupq take 16 arguments. */ + auto_vec argument_types; + function_instance instance (group.base_name, *group.base, *group.shape, + mode_suffix_id, group.types[ti], + group.preds[pi]); + tree return_type = parse_signature (instance, signature, argument_types); + apply_predication (instance, return_type, argument_types); + b.add_unique_function (instance, return_type, argument_types, + group.required_extensions, force_direct_overloads); +} + +/* GROUP describes some sort of gather or scatter operation. There are + two cases: + + - If the function has any type suffixes (as for loads and stores), the + first function type suffix specifies either a 32-bit or a 64-bit type, + which in turn selects either MODE32 or MODE64 as the addressing mode. + Add a function instance for every type and predicate combination + in GROUP for which the associated addressing mode is not MODE_none. + + - If the function has no type suffixes (as for prefetches), add one + MODE32 form and one MODE64 form for each predication type. + + The other arguments are as for build_all. */ +static void +build_32_64 (function_builder &b, const char *signature, + const function_group_info &group, mode_suffix_index mode32, + mode_suffix_index mode64, bool force_direct_overloads = false) +{ + for (unsigned int pi = 0; group.preds[pi] != NUM_PREDS; ++pi) + if (group.types[0][0] == NUM_TYPE_SUFFIXES) + { + gcc_assert (mode32 != MODE_none && mode64 != MODE_none); + build_one (b, signature, group, mode32, 0, pi, + force_direct_overloads); + build_one (b, signature, group, mode64, 0, pi, + force_direct_overloads); + } + else + for (unsigned int ti = 0; group.types[ti][0] != NUM_TYPE_SUFFIXES; ++ti) + { + unsigned int bits = type_suffixes[group.types[ti][0]].element_bits; + gcc_assert (bits == 32 || bits == 64); + mode_suffix_index mode = bits == 32 ? mode32 : mode64; + if (mode != MODE_none) + build_one (b, signature, group, mode, ti, pi, + force_direct_overloads); + } +} + +/* For every type and predicate combination in GROUP, add one function + that takes a scalar (pointer) base and a signed vector array index, + and another that instead takes an unsigned vector array index. + The vector array index has the same element size as the first + function type suffix. SIGNATURE is as for build_all. */ +static void +build_sv_index (function_builder &b, const char *signature, + const function_group_info &group) +{ + build_32_64 (b, signature, group, MODE_s32index, MODE_s64index); + build_32_64 (b, signature, group, MODE_u32index, MODE_u64index); +} + +/* Like build_sv_index, but only handle 64-bit types. 
*/ +static void +build_sv_index64 (function_builder &b, const char *signature, + const function_group_info &group) +{ + build_32_64 (b, signature, group, MODE_none, MODE_s64index); + build_32_64 (b, signature, group, MODE_none, MODE_u64index); +} + +/* Like build_sv_index, but taking vector byte offsets instead of vector + array indices. */ +static void +build_sv_offset (function_builder &b, const char *signature, + const function_group_info &group) +{ + build_32_64 (b, signature, group, MODE_s32offset, MODE_s64offset); + build_32_64 (b, signature, group, MODE_u32offset, MODE_u64offset); +} + +/* Like build_sv_offset, but exclude offsets that must be interpreted + as signed (i.e. s32offset). */ +static void +build_sv_uint_offset (function_builder &b, const char *signature, + const function_group_info &group) +{ + build_32_64 (b, signature, group, MODE_none, MODE_s64offset); + build_32_64 (b, signature, group, MODE_u32offset, MODE_u64offset); +} + +/* For every type and predicate combination in GROUP, add a function + that takes a vector base address and no displacement. The vector + base has the same element size as the first type suffix. + + The other arguments are as for build_all. */ +static void +build_v_base (function_builder &b, const char *signature, + const function_group_info &group, + bool force_direct_overloads = false) +{ + build_32_64 (b, signature, group, MODE_u32base, MODE_u64base, + force_direct_overloads); +} + +/* Like build_v_base, but for functions that also take a scalar array + index. */ +static void +build_vs_index (function_builder &b, const char *signature, + const function_group_info &group, + bool force_direct_overloads = false) +{ + build_32_64 (b, signature, group, MODE_u32base_index, MODE_u64base_index, + force_direct_overloads); +} + +/* Like build_v_base, but for functions that also take a scalar byte + offset. */ +static void +build_vs_offset (function_builder &b, const char *signature, + const function_group_info &group, + bool force_direct_overloads = false) +{ + build_32_64 (b, signature, group, MODE_u32base_offset, MODE_u64base_offset, + force_direct_overloads); +} + +/* Add a function instance for every type and predicate combination + in GROUP. Take the function base name from GROUP and the mode suffix + from MODE_SUFFIX_ID. Use SIGNATURE to construct the function signature + without a governing predicate, then use apply_predication to add in the + predicate. FORCE_DIRECT_OVERLOADS is true if there is a one-to-one + mapping between "short" and "full" names, and if standard overload + resolution therefore isn't necessary. */ +static void +build_all (function_builder &b, const char *signature, + const function_group_info &group, mode_suffix_index mode_suffix_id, + bool force_direct_overloads = false) +{ + for (unsigned int pi = 0; group.preds[pi] != NUM_PREDS; ++pi) + for (unsigned int ti = 0; + ti == 0 || group.types[ti][0] != NUM_TYPE_SUFFIXES; ++ti) + build_one (b, signature, group, mode_suffix_id, ti, pi, + force_direct_overloads); +} + +/* TYPE is the largest type suffix associated with the arguments of R, + but the result is twice as wide. Return the associated type suffix + if it exists, otherwise report an appropriate error and return + NUM_TYPE_SUFFIXES. 
*/ +static type_suffix_index +long_type_suffix (function_resolver &r, type_suffix_index type) +{ + unsigned int element_bits = type_suffixes[type].element_bits; + if (type_suffixes[type].integer_p && element_bits < 64) + return find_type_suffix (type_suffixes[type].tclass, element_bits * 2); + + r.report_no_such_form (type); + return NUM_TYPE_SUFFIXES; +} + +/* Declare the function shape NAME, pointing it to an instance + of class _def. */ +#define SHAPE(NAME) \ + static CONSTEXPR const NAME##_def NAME##_obj; \ + namespace shapes { const function_shape *const NAME = &NAME##_obj; } + +/* Base class for functions that are not overloaded. */ +struct nonoverloaded_base : public function_shape +{ + bool + explicit_type_suffix_p (unsigned int) const OVERRIDE + { + return true; + } + + tree + resolve (function_resolver &) const OVERRIDE + { + gcc_unreachable (); + } +}; + +/* Base class for overloaded functions. Bit N of EXPLICIT_MASK is true + if type suffix N appears in the overloaded name. */ +template +struct overloaded_base : public function_shape +{ + bool + explicit_type_suffix_p (unsigned int i) const OVERRIDE + { + return (EXPLICIT_MASK >> i) & 1; + } +}; + +/* Base class for adr_index and adr_offset. */ +struct adr_base : public overloaded_base<0> +{ + /* The function takes two arguments: a vector base and a vector displacement + (either an index or an offset). Resolve based on them both. */ + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + mode_suffix_index mode; + if (!r.check_gp_argument (2, i, nargs) + || (mode = r.resolve_adr_address (0)) == MODE_none) + return error_mark_node; + + return r.resolve_to (mode); + }; +}; + +/* Base class for narrowing bottom binary functions that take an + immediate second operand. The result is half the size of input + and has class CLASS. */ +template +struct binary_imm_narrowb_base : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS + || CLASS == TYPE_unsigned); + if (CLASS == TYPE_unsigned) + build_all (b, "vhu0,v0,su64", group, MODE_n); + else + build_all (b, "vh0,v0,su64", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (1, 1); + } +}; + +/* The top equivalent of binary_imm_narrowb_base. It takes three arguments, + with the first being the values of the even elements, which are typically + the result of the narrowb operation. */ +template +struct binary_imm_narrowt_base : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS + || CLASS == TYPE_unsigned); + if (CLASS == TYPE_unsigned) + build_all (b, "vhu0,vhu0,v0,su64", group, MODE_n); + else + build_all (b, "vh0,vh0,v0,su64", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i, i + 1, type, CLASS, r.HALF_SIZE) + || !r.require_integer_immediate (i + 2)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; + +/* Base class for long (i.e. 
narrow op narrow -> wide) binary functions + that take an immediate second operand. The type suffix specifies + the wider type. */ +struct binary_imm_long_base : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + build_all (b, "v0,vh0,su64", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type, result_type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_integer_immediate (i + 1) + || (result_type = long_type_suffix (r, type)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + if (tree res = r.lookup_form (r.mode_suffix_id, result_type)) + return res; + + return r.report_no_such_form (type); + } +}; + +/* Base class for inc_dec and inc_dec_pat. */ +struct inc_dec_base : public overloaded_base<0> +{ + CONSTEXPR inc_dec_base (bool pat_p) : m_pat_p (pat_p) {} + + /* Resolve based on the first argument only, which must be either a + scalar or a vector. If it's a scalar, it must be a 32-bit or + 64-bit integer. */ + tree + resolve (function_resolver &r) const + { + unsigned int i, nargs; + if (!r.check_gp_argument (m_pat_p ? 3 : 2, i, nargs) + || !r.require_vector_or_scalar_type (i)) + return error_mark_node; + + mode_suffix_index mode; + type_suffix_index type; + if (r.scalar_argument_p (i)) + { + mode = MODE_n; + type = r.infer_integer_scalar_type (i); + } + else + { + mode = MODE_none; + type = r.infer_vector_type (i); + } + if (type == NUM_TYPE_SUFFIXES) + return error_mark_node; + + for (++i; i < nargs; ++i) + if (!r.require_integer_immediate (i)) + return error_mark_node; + + return r.resolve_to (mode, type); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_range (m_pat_p ? 2 : 1, 1, 16); + } + + bool m_pat_p; +}; + +/* Base class for load and load_replicate. */ +struct load_contiguous_base : public overloaded_base<0> +{ + /* Resolve a call based purely on a pointer argument. The other arguments + are a governing predicate and (for MODE_vnum) a vnum offset. */ + tree + resolve (function_resolver &r) const OVERRIDE + { + bool vnum_p = r.mode_suffix_id == MODE_vnum; + gcc_assert (r.mode_suffix_id == MODE_none || vnum_p); + + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (vnum_p ? 2 : 1, i, nargs) + || (type = r.infer_pointer_type (i)) == NUM_TYPE_SUFFIXES + || (vnum_p && !r.require_scalar_type (i + 1, "int64_t"))) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; + +/* Base class for gather loads that take a scalar base and a vector + displacement (either an offset or an index). */ +struct load_gather_sv_base : public overloaded_base<0> +{ + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + mode_suffix_index mode; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_pointer_type (i, true)) == NUM_TYPE_SUFFIXES + || (mode = r.resolve_sv_displacement (i + 1, type, true), + mode == MODE_none)) + return error_mark_node; + + return r.resolve_to (mode, type); + } +}; + +/* Base class for load_ext_gather_index and load_ext_gather_offset, + which differ only in the units of the displacement. 
*/ +struct load_ext_gather_base : public overloaded_base<1> +{ + /* Resolve a gather load that takes one of: + + - a scalar pointer base and a vector displacement + - a vector base with no displacement or + - a vector base and a scalar displacement + + The function has an explicit type suffix that determines the type + of the loaded data. */ + tree + resolve (function_resolver &r) const OVERRIDE + { + /* No resolution is needed for a vector base with no displacement; + there's a one-to-one mapping between short and long names. */ + gcc_assert (r.displacement_units () != UNITS_none); + + type_suffix_index type = r.type_suffix_ids[0]; + + unsigned int i, nargs; + mode_suffix_index mode; + if (!r.check_gp_argument (2, i, nargs) + || (mode = r.resolve_gather_address (i, type, true)) == MODE_none) + return error_mark_node; + + return r.resolve_to (mode, type); + } +}; + +/* sv_t svfoo[_t0](sv_t, sv_t, + sv_t) (for integer t0) + sv_t svmmla[_t0](sv_t, sv_t, sv_t) (for floating-point t0) + + The functions act like the equivalent of "ternary_qq" for integer elements + and normal vector-only ternary functions for floating-point elements. */ +struct mmla_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + /* svmmla is distributed over several extensions. Allow the common + denominator to define the overloaded svmmla function without + defining any specific versions. */ + if (group.types[0][0] != NUM_TYPE_SUFFIXES) + { + if (type_suffixes[group.types[0][0]].float_p) + build_all (b, "v0,v0,v0,v0", group, MODE_none); + else + build_all (b, "v0,v0,vq0,vq0", group, MODE_none); + } + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + /* Make sure that the function exists now, since not all forms + follow a set pattern after this point. */ + tree res = r.resolve_to (r.mode_suffix_id, type); + if (res == error_mark_node) + return res; + + bool float_p = type_suffixes[type].float_p; + unsigned int modifier = float_p ? r.SAME_SIZE : r.QUARTER_SIZE; + if (!r.require_derived_vector_type (i + 1, i, type, r.SAME_TYPE_CLASS, + modifier) + || !r.require_derived_vector_type (i + 2, i, type, r.SAME_TYPE_CLASS, + modifier)) + return error_mark_node; + + return res; + } +}; +SHAPE (mmla) + +/* Base class for prefetch_gather_index and prefetch_gather_offset, + which differ only in the units of the displacement. */ +struct prefetch_gather_base : public overloaded_base<0> +{ + /* Resolve a gather prefetch that takes one of: + + - a scalar pointer base (const void *) and a vector displacement + - a vector base with no displacement or + - a vector base and a scalar displacement + + The prefetch operation is the final argument. This is purely a + mode-based resolution; there are no type suffixes. */ + tree + resolve (function_resolver &r) const OVERRIDE + { + bool has_displacement_p = r.displacement_units () != UNITS_none; + + unsigned int i, nargs; + mode_suffix_index mode; + if (!r.check_gp_argument (has_displacement_p ? 3 : 2, i, nargs) + || (mode = r.resolve_gather_address (i, NUM_TYPE_SUFFIXES, + false)) == MODE_none + || !r.require_integer_immediate (nargs - 1)) + return error_mark_node; + + return r.resolve_to (mode); + } +}; + +/* Wraps BASE to provide a narrowing shift right function. 
Argument N + is an immediate shift amount in the range [1, sizeof(_t) * 4]. */ +template +struct shift_right_imm_narrow_wrapper : public BASE +{ + bool + check (function_checker &c) const OVERRIDE + { + unsigned int bits = c.type_suffix (0).element_bits / 2; + return c.require_immediate_range (N, 1, bits); + } +}; + +/* Base class for store_scatter_index and store_scatter_offset, + which differ only in the units of the displacement. */ +struct store_scatter_base : public overloaded_base<0> +{ + /* Resolve a scatter store that takes one of: + + - a scalar pointer base and a vector displacement + - a vector base with no displacement or + - a vector base and a scalar displacement + + The stored data is the final argument, and it determines the + type suffix. */ + tree + resolve (function_resolver &r) const OVERRIDE + { + bool has_displacement_p = r.displacement_units () != UNITS_none; + + unsigned int i, nargs; + mode_suffix_index mode; + type_suffix_index type; + if (!r.check_gp_argument (has_displacement_p ? 3 : 2, i, nargs) + || (type = r.infer_sd_vector_type (nargs - 1)) == NUM_TYPE_SUFFIXES + || (mode = r.resolve_gather_address (i, type, false)) == MODE_none) + return error_mark_node; + + return r.resolve_to (mode, type); + } +}; + +/* Base class for ternary operations in which the final argument is an + immediate shift amount. The derived class should check the range. */ +struct ternary_shift_imm_base : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + build_all (b, "v0,v0,v0,su64", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (2, 1); + } +}; + +/* Base class for ternary operations in which the first argument has the + same element type as the result, and in which the second and third + arguments have an element type that is derived the first. + + MODIFIER is the number of element bits in the second and third + arguments, or a function_resolver modifier that says how this + precision is derived from the first argument's elements. + + TYPE_CLASS2 and TYPE_CLASS3 are the type classes of the second and + third arguments, or function_resolver::SAME_TYPE_CLASS if the type + class is the same as the first argument. */ +template +struct ternary_resize2_opt_n_base : public overloaded_base<0> +{ + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i + 1, i, type, TYPE_CLASS2, + MODIFIER)) + return error_mark_node; + + return r.finish_opt_n_resolution (i + 2, i, type, TYPE_CLASS3, MODIFIER); + } +}; + +/* Like ternary_resize2_opt_n_base, but for functions that don't take + a final scalar argument. */ +template +struct ternary_resize2_base : public overloaded_base<0> +{ + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i + 1, i, type, TYPE_CLASS2, + MODIFIER) + || !r.require_derived_vector_type (i + 2, i, type, TYPE_CLASS3, + MODIFIER)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; + +/* Like ternary_resize2_opt_n_base, but for functions that take a final + lane argument. 
*/ +template +struct ternary_resize2_lane_base : public overloaded_base<0> +{ + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (4, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i + 1, i, type, TYPE_CLASS2, + MODIFIER) + || !r.require_derived_vector_type (i + 2, i, type, TYPE_CLASS3, + MODIFIER) + || !r.require_integer_immediate (i + 3)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; + +/* A specialization of ternary_resize2_lane_base for bfloat16 elements, + indexed in groups of N elements. */ +template +struct ternary_bfloat_lane_base + : public ternary_resize2_lane_base<16, TYPE_bfloat, TYPE_bfloat> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vB,vB,su64", group, MODE_none); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_lane_index (3, N); + } +}; + +/* A specialization of ternary_resize2_lane_base for quarter-sized + elements. */ +template +struct ternary_qq_lane_base + : public ternary_resize2_lane_base +{ + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_lane_index (3, 4); + } +}; + +/* Base class for narrowing bottom unary functions. The result is half + the size of input and has class CLASS. */ +template +struct unary_narrowb_base : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS + || CLASS == TYPE_unsigned); + if (CLASS == TYPE_unsigned) + build_all (b, "vhu0,v0", group, MODE_none); + else + build_all (b, "vh0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_unary (CLASS, r.HALF_SIZE); + } +}; + +/* The top equivalent of unary_imm_narrowb_base. All forms take the values + of the even elements as an extra argument, before any governing predicate. + These even elements are typically the result of the narrowb operation. */ +template +struct unary_narrowt_base : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + STATIC_ASSERT (CLASS == function_resolver::SAME_TYPE_CLASS + || CLASS == TYPE_unsigned); + if (CLASS == TYPE_unsigned) + build_all (b, "vhu0,vhu0,v0", group, MODE_none); + else + build_all (b, "vh0,vh0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i, i + 1, type, CLASS, r.HALF_SIZE)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; + +/* sv_t svfoo[_m0base]_[m1]index(sv_t, sv_t) + + for all valid combinations of vector base type and vector + displacement type . 
*/ +struct adr_index_def : public adr_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_index); + build_all (b, "b,b,d", group, MODE_u32base_s32index); + build_all (b, "b,b,d", group, MODE_u32base_u32index); + build_all (b, "b,b,d", group, MODE_u64base_s64index); + build_all (b, "b,b,d", group, MODE_u64base_u64index); + } +}; +SHAPE (adr_index) + +/* sv_t svfoo[_m0base]_[m1]offset(sv_t, sv_t). + + for all valid combinations of vector base type and vector + displacement type . */ +struct adr_offset_def : public adr_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_offset); + build_all (b, "b,b,d", group, MODE_u32base_s32offset); + build_all (b, "b,b,d", group, MODE_u32base_u32offset); + build_all (b, "b,b,d", group, MODE_u64base_s64offset); + build_all (b, "b,b,d", group, MODE_u64base_u64offset); + } +}; +SHAPE (adr_offset) + +/* sv_t svfoo[_t0](sv_t, sv_t) + + i.e. a binary operation with uniform types, but with no scalar form. */ +struct binary_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (2); + } +}; +SHAPE (binary) + +/* sv_t svfoo[_t0](sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, _t). + + i.e. a version of the standard binary shape binary_opt_n in which + the final argument is always a signed integer. */ +struct binary_int_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vs0", group, MODE_none); + build_all (b, "v0,v0,ss0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + return r.finish_opt_n_resolution (i + 1, i, type, TYPE_signed); + } +}; +SHAPE (binary_int_opt_n) + +/* sv_t svfoo_(sv_t, sv_t, uint64_t) + + where the final argument is an integer constant expression in the + range [0, 16 / sizeof (_t) - 1]. */ +struct binary_lane_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (2, 1); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_lane_index (2); + } +}; +SHAPE (binary_lane) + +/* sv_t svfoo[_t0](sv_t, sv_t, uint64_t). + + where the final argument is an integer constant expression in the + range [0, 32 / sizeof (_t) - 1]. 
*/ +struct binary_long_lane_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,vh0,vh0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type, result_type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_matching_vector_type (i + 1, type) + || !r.require_integer_immediate (i + 2) + || (result_type = long_type_suffix (r, type)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + if (tree res = r.lookup_form (r.mode_suffix_id, result_type)) + return res; + + return r.report_no_such_form (type); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_lane_index (2); + } +}; +SHAPE (binary_long_lane) + +/* sv_t svfoo[_t0](sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, _t). */ +struct binary_long_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,vh0,vh0", group, MODE_none); + build_all (b, "v0,vh0,sh0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type, result_type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || (result_type = long_type_suffix (r, type)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + return r.finish_opt_n_resolution (i + 1, i, type, r.SAME_TYPE_CLASS, + r.SAME_SIZE, result_type); + } +}; +SHAPE (binary_long_opt_n) + +/* sv_t svfoo[_n_t0](sv_t, _t). + + i.e. a binary operation in which the final argument is always a scalar + rather than a vector. */ +struct binary_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + build_all (b, "v0,v0,s0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_scalar_type (i + 1, r.SAME_TYPE_CLASS)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (binary_n) + +/* sv_t svfoo[_t0](sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, _t) + + i.e. a version of binary_opt_n in which the output elements are half the + width of the input elements. */ +struct binary_narrowb_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vh0,v0,v0", group, MODE_none); + build_all (b, "vh0,v0,s0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform_opt_n (2); + } +}; +SHAPE (binary_narrowb_opt_n) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, sv_t, _t) + + This is the "top" counterpart to binary_narrowb_opt_n. 
*/ +struct binary_narrowt_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vh0,vh0,v0,v0", group, MODE_none); + build_all (b, "vh0,vh0,v0,s0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i, i + 1, type, r.SAME_TYPE_CLASS, + r.HALF_SIZE)) + return error_mark_node; + + return r.finish_opt_n_resolution (i + 2, i + 1, type); + } +}; +SHAPE (binary_narrowt_opt_n) + +/* sv_t svfoo[_t0](sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, _t) + + i.e. the standard shape for binary operations that operate on + uniform types. */ +struct binary_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0", group, MODE_none); + /* _b functions do not have an _n form, but are classified as + binary_opt_n so that they can be overloaded with vector + functions. */ + if (group.types[0][0] == TYPE_SUFFIX_b) + gcc_assert (group.types[0][1] == NUM_TYPE_SUFFIXES); + else + build_all (b, "v0,v0,s0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform_opt_n (2); + } +}; +SHAPE (binary_opt_n) + +/* svbool_t svfoo(svbool_t, svbool_t). */ +struct binary_pred_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "v0,v0,v0", group, MODE_none); + } +}; +SHAPE (binary_pred) + +/* sv_t svfoo[_](sv_t, sv_t, uint64_t) + + where the final argument must be 90 or 270. */ +struct binary_rotate_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (2, 1); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_either_or (2, 90, 270); + } +}; +SHAPE (binary_rotate) + +/* sv_t svfoo_t0(_t, _t) + + i.e. a binary function that takes two scalars and returns a vector. + An explicit type suffix is required. */ +struct binary_scalar_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "v0,s0,s0", group, MODE_none); + } +}; +SHAPE (binary_scalar) + +/* sv_t svfoo[_t0](sv_t, sv_t). + + i.e. a version of "binary" that returns unsigned integers. */ +struct binary_to_uint_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vu0,v0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (2); + } +}; +SHAPE (binary_to_uint) + +/* sv_t svfoo[_t0](sv_t, sv_t) + + i.e. a version of "binary" in which the final argument is always an + unsigned integer. 
*/ +struct binary_uint_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vu0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i + 1, i, type, TYPE_unsigned)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (binary_uint) + +/* sv_t svfoo[_t0](sv_t, _t) + + i.e. a version of binary_n in which the final argument is always an + unsigned integer. */ +struct binary_uint_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,su0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_scalar_type (i + 1, TYPE_unsigned)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (binary_uint_n) + +/* sv_t svfoo[_t0](sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, _t) + + i.e. a version of the standard binary shape binary_opt_n in which + the final argument is always an unsigned integer. */ +struct binary_uint_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vu0", group, MODE_none); + build_all (b, "v0,v0,su0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + return r.finish_opt_n_resolution (i + 1, i, type, TYPE_unsigned); + } +}; +SHAPE (binary_uint_opt_n) + +/* sv_t svfoo[_t0](sv_t, uint64_t). + + i.e. a version of binary_n in which the final argument is always + a 64-bit unsigned integer. */ +struct binary_uint64_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_scalar_type (i + 1, "uint64_t")) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (binary_uint64_n) + +/* sv_t svfoo[_t0](sv_t, svuint64_t) + sv_t svfoo[_n_t0](sv_t, uint64_t) + + i.e. a version of the standard binary shape binary_opt_n in which + the final argument is always a uint64_t. 
*/ +struct binary_uint64_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vu64", group, MODE_none); + build_all (b, "v0,v0,su64", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + return r.finish_opt_n_resolution (i + 1, i, type, TYPE_unsigned, 64); + } +}; +SHAPE (binary_uint64_opt_n) + +/* sv_t svfoo[_t0](sv_t, sv_t). */ +struct binary_wide_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vh0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i + 1, i, type, r.SAME_TYPE_CLASS, + r.HALF_SIZE)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (binary_wide) + +/* sv_t svfoo[_t0](sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, _t). */ +struct binary_wide_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vh0", group, MODE_none); + build_all (b, "v0,v0,sh0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + return r.finish_opt_n_resolution (i + 1, i, type, r.SAME_TYPE_CLASS, + r.HALF_SIZE); + } +}; +SHAPE (binary_wide_opt_n) + +/* sv_t svfoo[_t0](sv_t, sv_t) + _t svfoo[_n_t0](_t, sv_t). */ +struct clast_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0", group, MODE_none); + build_all (b, "s0,s0,v0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + if (!r.check_gp_argument (2, i, nargs) + || !r.require_vector_or_scalar_type (i)) + return error_mark_node; + + if (r.scalar_argument_p (i)) + { + type_suffix_index type; + if (!r.require_derived_scalar_type (i, r.SAME_TYPE_CLASS) + || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + return r.resolve_to (MODE_n, type); + } + else + { + type_suffix_index type; + if ((type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_matching_vector_type (i + 1, type)) + return error_mark_node; + return r.resolve_to (MODE_none, type); + } + } +}; +SHAPE (clast) + +/* svbool_t svfoo[_t0](sv_t, sv_t). 
*/ +struct compare_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vp,v0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (2); + } +}; +SHAPE (compare) + +/* svbool_t svfoo[_t0](sv_t, sv_t) + svbool_t svfoo[_n_t0](sv_t, _t) + + i.e. a comparison between two vectors, or between a vector and a scalar. */ +struct compare_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vp,v0,v0", group, MODE_none); + build_all (b, "vp,v0,s0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform_opt_n (2); + } +}; +SHAPE (compare_opt_n) + +/* svbool_t svfoo[_t0](const _t *, const _t *). */ +struct compare_ptr_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vp,al,al", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_pointer_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_matching_pointer_type (i + 1, i, type)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (compare_ptr) + +/* svbool_t svfoo_t0[_t1](_t, _t) + + where _t0 is a _b suffix that describes the predicate result. + There is no direct relationship between the element sizes of _t0 + and _t1. */ +struct compare_scalar_def : public overloaded_base<1> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vp,s1,s1", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_integer_scalar_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_matching_integer_scalar_type (i + 1, i, type)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, r.type_suffix_ids[0], type); + } +}; +SHAPE (compare_scalar) + +/* svbool_t svfoo[_t0](sv_t, svint64_t) (for signed t0) + svbool_t svfoo[_n_t0](sv_t, int64_t) (for signed t0) + svbool_t svfoo[_t0](sv_t, svuint64_t) (for unsigned t0) + svbool_t svfoo[_n_t0](sv_t, uint64_t) (for unsigned t0) + + i.e. a comparison in which the second argument is 64 bits. */ +struct compare_wide_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vp,v0,vw0", group, MODE_none); + build_all (b, "vp,v0,sw0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + return r.finish_opt_n_resolution (i + 1, i, type, r.SAME_TYPE_CLASS, 64); + } +}; +SHAPE (compare_wide_opt_n) + +/* uint64_t svfoo(). 
*/ +struct count_inherent_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "su64", group, MODE_none); + } +}; +SHAPE (count_inherent) + +/* uint64_t svfoo(enum svpattern). */ +struct count_pat_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "su64,epattern", group, MODE_none); + } +}; +SHAPE (count_pat) + +/* uint64_t svfoo(svbool_t). */ +struct count_pred_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "su64,vp", group, MODE_none); + } +}; +SHAPE (count_pred) + +/* uint64_t svfoo[_t0](sv_t). */ +struct count_vector_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "su64,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (1); + } +}; +SHAPE (count_vector) + +/* svxN_t svfoo[_t0](sv_t, ..., sv_t) + + where there are N arguments in total. */ +struct create_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "t0,v0*t", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (r.vectors_per_tuple ()); + } +}; +SHAPE (create) + +/* sv_t svfoo[_n]_t0(_t, ..., _t) + + where there are enough arguments to fill 128 bits of data (or to + control 128 bits of data in the case of predicates). */ +struct dupq_def : public overloaded_base<1> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + /* The "_n" suffix is optional; the full name has it, but the short + name doesn't. */ + build_all (b, "v0,s0*q", group, MODE_n, true); + } + + tree + resolve (function_resolver &) const OVERRIDE + { + /* The short forms just make "_n" implicit, so no resolution is needed. */ + gcc_unreachable (); + } +}; +SHAPE (dupq) + +/* sv_t svfoo[_t0](sv_t, sv_t, uint64_t) + + where the final argument is an integer constant expression that when + multiplied by the number of bytes in t0 is in the range [0, 255]. */ +struct ext_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (2, 1); + } + + bool + check (function_checker &c) const OVERRIDE + { + unsigned int bytes = c.type_suffix (0).element_bytes; + return c.require_immediate_range (2, 0, 256 / bytes - 1); + } +}; +SHAPE (ext) + +/* _t svfoo[_t0](_t, sv_t). 
*/ +struct fold_left_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "s0,s0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || !r.require_derived_scalar_type (i, r.SAME_TYPE_CLASS) + || (type = r.infer_vector_type (i + 1)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (fold_left) + +/* sv_t svfoo[_t0](svxN_t, uint64_t) + + where the final argument is an integer constant expression in + the range [0, N - 1]. */ +struct get_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,t0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_tuple_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_integer_immediate (i + 1)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } + + bool + check (function_checker &c) const OVERRIDE + { + unsigned int nvectors = c.vectors_per_tuple (); + return c.require_immediate_range (1, 0, nvectors - 1); + } +}; +SHAPE (get) + +/* sv_t svfoo[_t0](sv_t, uint64_t) + _t svfoo[_n_t0](_t, uint64_t) + + where the t0 in the vector form is a signed or unsigned integer + whose size is tied to the [bhwd] suffix of "svfoo". */ +struct inc_dec_def : public inc_dec_base +{ + CONSTEXPR inc_dec_def () : inc_dec_base (false) {} + + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + /* These functions are unusual in that the type suffixes for + the scalar and vector forms are not related. The vector + form always has exactly two potential suffixes while the + scalar form always has four. */ + if (group.types[2][0] == NUM_TYPE_SUFFIXES) + build_all (b, "v0,v0,su64", group, MODE_none); + else + build_all (b, "s0,s0,su64", group, MODE_n); + } +}; +SHAPE (inc_dec) + +/* sv_t svfoo[_t0](sv_t, enum svpattern, uint64_t) + _t svfoo[_n_t0](_t, enum svpattern, uint64_t) + + where the t0 in the vector form is a signed or unsigned integer + whose size is tied to the [bhwd] suffix of "svfoo". */ +struct inc_dec_pat_def : public inc_dec_base +{ + CONSTEXPR inc_dec_pat_def () : inc_dec_base (true) {} + + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + /* These functions are unusual in that the type suffixes for + the scalar and vector forms are not related. The vector + form always has exactly two potential suffixes while the + scalar form always has four. */ + if (group.types[2][0] == NUM_TYPE_SUFFIXES) + build_all (b, "v0,v0,epattern,su64", group, MODE_none); + else + build_all (b, "s0,s0,epattern,su64", group, MODE_n); + } +}; +SHAPE (inc_dec_pat) + +/* sv_t svfoo[_t0](sv_t, svbool_t). 
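+
+   The saturating increment-by-predicate intrinsics are representative
+   (a sketch, assuming <arm_sve.h>); the svbool_t argument supplies a
+   lane count rather than acting as a governing predicate:
+
+     svint32_t
+     bump_by_active_lanes (svint32_t counters, svbool_t active)
+     {
+       return svqincp_s32 (counters, active);
+     }
+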
*/ +struct inc_dec_pred_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vp", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_vector_type (i + 1, VECTOR_TYPE_svbool_t)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (inc_dec_pred) + +/* _t svfoo[_n_t0]_t1(_t, svbool_t) + + where _t1 is a _b suffix that describes the svbool_t argument. */ +struct inc_dec_pred_scalar_def : public overloaded_base<2> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + build_all (b, "s0,s0,vp", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_integer_scalar_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_vector_type (i + 1, VECTOR_TYPE_svbool_t)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type, r.type_suffix_ids[1]); + } +}; +SHAPE (inc_dec_pred_scalar) + +/* sv[xN]_t svfoo_t0(). */ +struct inherent_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "t0", group, MODE_none); + } +}; +SHAPE (inherent) + +/* svbool_t svfoo[_b](). */ +struct inherent_b_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + /* The "_b" suffix is optional; the full name has it, but the short + name doesn't. */ + build_all (b, "v0", group, MODE_none, true); + } + + tree + resolve (function_resolver &) const OVERRIDE + { + /* The short forms just make "_b" implicit, so no resolution is needed. */ + gcc_unreachable (); + } +}; +SHAPE (inherent_b) + +/* sv[xN]_t svfoo[_t0](const _t *) + sv[xN]_t svfoo_vnum[_t0](const _t *, int64_t). */ +struct load_def : public load_contiguous_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + b.add_overloaded_functions (group, MODE_vnum); + build_all (b, "t0,al", group, MODE_none); + build_all (b, "t0,al,ss64", group, MODE_vnum); + } +}; +SHAPE (load) + +/* sv_t svfoo_t0(const _t *) + sv_t svfoo_vnum_t0(const _t *, int64_t) + + where is determined by the function base name. */ +struct load_ext_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "t0,al", group, MODE_none); + build_all (b, "t0,al,ss64", group, MODE_vnum); + } +}; +SHAPE (load_ext) + +/* sv_t svfoo_[s32]index_t0(const _t *, svint32_t) + sv_t svfoo_[s64]index_t0(const _t *, svint64_t) + sv_t svfoo_[u32]index_t0(const _t *, svuint32_t) + sv_t svfoo_[u64]index_t0(const _t *, svuint64_t) + + sv_t svfoo[_u32base]_index_t0(svuint32_t, int64_t) + sv_t svfoo[_u64base]_index_t0(svuint64_t, int64_t) + + where is determined by the function base name. 
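+
+   One plausible instance (a sketch, assuming <arm_sve.h>) is an
+   extending gather load with a vector of signed 32-bit indices:
+
+     svint32_t
+     gather_extend_s16 (svbool_t pg, const int16_t *base, svint32_t idx)
+     {
+       return svld1sh_gather_s32index_s32 (pg, base, idx);
+     }
+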
*/ +struct load_ext_gather_index_def : public load_ext_gather_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_index); + build_sv_index (b, "t0,al,d", group); + build_vs_index (b, "t0,b,ss64", group); + } +}; +SHAPE (load_ext_gather_index) + +/* sv_t svfoo_[s64]index_t0(const _t *, svint64_t) + sv_t svfoo_[u64]index_t0(const _t *, svuint64_t) + + sv_t svfoo[_u32base]_index_t0(svuint32_t, int64_t) + sv_t svfoo[_u64base]_index_t0(svuint64_t, int64_t) + + where is determined by the function base name. This is + load_ext_gather_index that doesn't support 32-bit vector indices. */ +struct load_ext_gather_index_restricted_def : public load_ext_gather_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_index); + build_sv_index64 (b, "t0,al,d", group); + build_vs_index (b, "t0,b,ss64", group); + } +}; +SHAPE (load_ext_gather_index_restricted) + +/* sv_t svfoo_[s32]offset_t0(const _t *, svint32_t) + sv_t svfoo_[s64]offset_t0(const _t *, svint64_t) + sv_t svfoo_[u32]offset_t0(const _t *, svuint32_t) + sv_t svfoo_[u64]offset_t0(const _t *, svuint64_t) + + sv_t svfoo[_u32base]_t0(svuint32_t) + sv_t svfoo[_u64base]_t0(svuint64_t) + + sv_t svfoo[_u32base]_offset_t0(svuint32_t, int64_t) + sv_t svfoo[_u64base]_offset_t0(svuint64_t, int64_t) + + where is determined by the function base name. */ +struct load_ext_gather_offset_def : public load_ext_gather_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_offset); + build_sv_offset (b, "t0,al,d", group); + build_v_base (b, "t0,b", group, true); + build_vs_offset (b, "t0,b,ss64", group); + } +}; +SHAPE (load_ext_gather_offset) + +/* sv_t svfoo_[s64]offset_t0(const _t *, svint64_t) + sv_t svfoo_[u32]offset_t0(const _t *, svuint32_t) + sv_t svfoo_[u64]offset_t0(const _t *, svuint64_t) + + sv_t svfoo[_u32base]_t0(svuint32_t) + sv_t svfoo[_u64base]_t0(svuint64_t) + + sv_t svfoo[_u32base]_offset_t0(svuint32_t, int64_t) + sv_t svfoo[_u64base]_offset_t0(svuint64_t, int64_t) + + where is determined by the function base name. This is + load_ext_gather_offset without the s32 vector offset form. */ +struct load_ext_gather_offset_restricted_def : public load_ext_gather_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_offset); + build_sv_uint_offset (b, "t0,al,d", group); + build_v_base (b, "t0,b", group, true); + build_vs_offset (b, "t0,b,ss64", group); + } +}; +SHAPE (load_ext_gather_offset_restricted) + +/* sv_t svfoo_[s32]index[_t0](const _t *, svint32_t) + sv_t svfoo_[s64]index[_t0](const _t *, svint64_t) + sv_t svfoo_[u32]index[_t0](const _t *, svuint32_t) + sv_t svfoo_[u64]index[_t0](const _t *, svuint64_t) + + sv_t svfoo_[s32]offset[_t0](const _t *, svint32_t) + sv_t svfoo_[s64]offset[_t0](const _t *, svint64_t) + sv_t svfoo_[u32]offset[_t0](const _t *, svuint32_t) + sv_t svfoo_[u64]offset[_t0](const _t *, svuint64_t). 
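+
+   For concreteness (a sketch, assuming <arm_sve.h>), the index and
+   offset forms of a plain gather load look like:
+
+     svint32_t
+     gather_by_index (svbool_t pg, const int32_t *base, svint32_t idx)
+     {
+       return svld1_gather_s32index_s32 (pg, base, idx);
+     }
+
+     svint32_t
+     gather_by_offset (svbool_t pg, const int32_t *base, svint32_t off)
+     {
+       return svld1_gather_s32offset_s32 (pg, base, off);
+     }
+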
*/ +struct load_gather_sv_def : public load_gather_sv_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_index); + b.add_overloaded_functions (group, MODE_offset); + build_sv_index (b, "t0,al,d", group); + build_sv_offset (b, "t0,al,d", group); + } +}; +SHAPE (load_gather_sv) + +/* sv_t svfoo_[u32]index[_t0](const _t *, svuint32_t) + sv_t svfoo_[u64]index[_t0](const _t *, svuint64_t) + + sv_t svfoo_[s64]offset[_t0](const _t *, svint64_t) + sv_t svfoo_[u32]offset[_t0](const _t *, svuint32_t) + sv_t svfoo_[u64]offset[_t0](const _t *, svuint64_t) + + This is load_gather_sv without the 32-bit vector index forms and + without the s32 vector offset form. */ +struct load_gather_sv_restricted_def : public load_gather_sv_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_index); + b.add_overloaded_functions (group, MODE_offset); + build_sv_index64 (b, "t0,al,d", group); + build_sv_uint_offset (b, "t0,al,d", group); + } +}; +SHAPE (load_gather_sv_restricted) + +/* sv_t svfoo[_u32base]_t0(svuint32_t) + sv_t svfoo[_u64base]_t0(svuint64_t) + + sv_t svfoo[_u32base]_index_t0(svuint32_t, int64_t) + sv_t svfoo[_u64base]_index_t0(svuint64_t, int64_t) + + sv_t svfoo[_u32base]_offset_t0(svuint32_t, int64_t) + sv_t svfoo[_u64base]_offset_t0(svuint64_t, int64_t). */ +struct load_gather_vs_def : public overloaded_base<1> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + /* The base vector mode is optional; the full name has it but the + short name doesn't. There is no ambiguity with SHAPE_load_gather_sv + because the latter uses an implicit type suffix. */ + build_v_base (b, "t0,b", group, true); + build_vs_index (b, "t0,b,ss64", group, true); + build_vs_offset (b, "t0,b,ss64", group, true); + } + + tree + resolve (function_resolver &) const OVERRIDE + { + /* The short name just makes the base vector mode implicit; + no resolution is needed. */ + gcc_unreachable (); + } +}; +SHAPE (load_gather_vs) + +/* sv_t svfoo[_t0](const _t *) + + The only difference from "load" is that this shape has no vnum form. */ +struct load_replicate_def : public load_contiguous_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "t0,al", group, MODE_none); + } +}; +SHAPE (load_replicate) + +/* svbool_t svfoo(enum svpattern). */ +struct pattern_pred_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "vp,epattern", group, MODE_none); + } +}; +SHAPE (pattern_pred) + +/* void svfoo(const void *, svprfop) + void svfoo_vnum(const void *, int64_t, svprfop). 
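+
+   svprfb is a concrete instance (a sketch, assuming <arm_sve.h>); the
+   svprfop argument is an enum constant such as SV_PLDL1KEEP:
+
+     void
+     warm_cache (svbool_t pg, const void *base)
+     {
+       svprfb (pg, base, SV_PLDL1KEEP);
+       svprfb_vnum (pg, base, 1, SV_PLDL1KEEP);
+     }
+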
*/ +struct prefetch_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "_,ap,eprfop", group, MODE_none); + build_all (b, "_,ap,ss64,eprfop", group, MODE_vnum); + } +}; +SHAPE (prefetch) + +/* void svfoo_[s32]index(const void *, svint32_t, svprfop) + void svfoo_[s64]index(const void *, svint64_t, svprfop) + void svfoo_[u32]index(const void *, svuint32_t, svprfop) + void svfoo_[u64]index(const void *, svuint64_t, svprfop) + + void svfoo[_u32base](svuint32_t, svprfop) + void svfoo[_u64base](svuint64_t, svprfop) + + void svfoo[_u32base]_index(svuint32_t, int64_t, svprfop) + void svfoo[_u64base]_index(svuint64_t, int64_t, svprfop). */ +struct prefetch_gather_index_def : public prefetch_gather_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + b.add_overloaded_functions (group, MODE_index); + build_sv_index (b, "_,ap,d,eprfop", group); + build_v_base (b, "_,b,eprfop", group); + build_vs_index (b, "_,b,ss64,eprfop", group); + } +}; +SHAPE (prefetch_gather_index) + +/* void svfoo_[s32]offset(const void *, svint32_t, svprfop) + void svfoo_[s64]offset(const void *, svint64_t, svprfop) + void svfoo_[u32]offset(const void *, svuint32_t, svprfop) + void svfoo_[u64]offset(const void *, svuint64_t, svprfop) + + void svfoo[_u32base](svuint32_t, svprfop) + void svfoo[_u64base](svuint64_t, svprfop) + + void svfoo[_u32base]_offset(svuint32_t, int64_t, svprfop) + void svfoo[_u64base]_offset(svuint64_t, int64_t, svprfop). */ +struct prefetch_gather_offset_def : public prefetch_gather_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + b.add_overloaded_functions (group, MODE_offset); + build_sv_offset (b, "_,ap,d,eprfop", group); + build_v_base (b, "_,b,eprfop", group); + build_vs_offset (b, "_,b,ss64,eprfop", group); + } +}; +SHAPE (prefetch_gather_offset) + +/* bool svfoo(svbool_t). */ +struct ptest_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "sp,vp", group, MODE_none); + } +}; +SHAPE (ptest) + +/* svbool_t svfoo(). */ +struct rdffr_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "vp", group, MODE_none); + } +}; +SHAPE (rdffr) + +/* _t svfoo[_t0](sv_t). */ +struct reduction_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "s0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (1); + } +}; +SHAPE (reduction) + +/* int64_t svfoo[_t0](sv_t) (for signed t0) + uint64_t svfoo[_t0](sv_t) (for unsigned t0) + _t svfoo[_t0](sv_t) (for floating-point t0) + + i.e. a version of "reduction" in which the return type for integers + always has 64 bits. 
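+
+   svaddv is a representative instance (a sketch, assuming <arm_sve.h>):
+   integer reductions return a 64-bit scalar while floating-point
+   reductions keep the element type:
+
+     int64_t
+     sum_s32 (svbool_t pg, svint32_t x)
+     {
+       return svaddv_s32 (pg, x);
+     }
+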
*/ +struct reduction_wide_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "sw0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (1); + } +}; +SHAPE (reduction_wide) + +/* svxN_t svfoo[_t0](svxN_t, uint64_t, sv_t) + + where the second argument is an integer constant expression in the + range [0, N - 1]. */ +struct set_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "t0,t0,su64,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_tuple_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_integer_immediate (i + 1) + || !r.require_derived_vector_type (i + 2, i, type)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } + + bool + check (function_checker &c) const OVERRIDE + { + unsigned int nvectors = c.vectors_per_tuple (); + return c.require_immediate_range (1, 0, nvectors - 1); + } +}; +SHAPE (set) + +/* void svfoo(). */ +struct setffr_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "_", group, MODE_none); + } +}; +SHAPE (setffr) + +/* sv_t svfoo[_n_t0])(sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [0, sizeof (_t) * 8 - 1]. */ +struct shift_left_imm_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + build_all (b, "v0,v0,su64", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (1, 1); + } + + bool + check (function_checker &c) const OVERRIDE + { + unsigned int bits = c.type_suffix (0).element_bits; + return c.require_immediate_range (1, 0, bits - 1); + } +}; +SHAPE (shift_left_imm) + +/* sv_t svfoo[_n_t0])(sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [0, sizeof (_t) * 4 - 1]. */ +struct shift_left_imm_long_def : public binary_imm_long_base +{ + bool + check (function_checker &c) const OVERRIDE + { + unsigned int bits = c.type_suffix (0).element_bits / 2; + return c.require_immediate_range (1, 0, bits - 1); + } +}; +SHAPE (shift_left_imm_long) + +/* sv_t svfoo[_n_t0])(sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [0, sizeof (_t) * 8 - 1]. */ +struct shift_left_imm_to_uint_def : public shift_left_imm_def +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + build_all (b, "vu0,v0,su64", group, MODE_n); + } +}; +SHAPE (shift_left_imm_to_uint) + +/* sv_t svfoo[_n_t0])(sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [1, sizeof (_t) * 8]. 
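+
+   The signed shift-right-for-divide intrinsic is representative (a
+   sketch, assuming <arm_sve.h>); for 32-bit elements the immediate
+   must be in [1, 32]:
+
+     svint32_t
+     divide_by_four (svbool_t pg, svint32_t x)
+     {
+       return svasrd_n_s32_x (pg, x, 2);
+     }
+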
*/ +struct shift_right_imm_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_n); + build_all (b, "v0,v0,su64", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (1, 1); + } + + bool + check (function_checker &c) const OVERRIDE + { + unsigned int bits = c.type_suffix (0).element_bits; + return c.require_immediate_range (1, 1, bits); + } +}; +SHAPE (shift_right_imm) + +/* sv_t svfoo[_n_t0])(sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [1, sizeof (_t) * 4]. */ +typedef shift_right_imm_narrow_wrapper, 1> + shift_right_imm_narrowb_def; +SHAPE (shift_right_imm_narrowb) + +/* sv_t svfoo[_n_t0])(sv_t, sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [1, sizeof (_t) * 4]. */ +typedef shift_right_imm_narrow_wrapper, 2> + shift_right_imm_narrowt_def; +SHAPE (shift_right_imm_narrowt) + +/* sv_t svfoo[_n_t0])(sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [1, sizeof (_t) * 4]. */ +typedef binary_imm_narrowb_base + binary_imm_narrowb_base_unsigned; +typedef shift_right_imm_narrow_wrapper + shift_right_imm_narrowb_to_uint_def; +SHAPE (shift_right_imm_narrowb_to_uint) + +/* sv_t svfoo[_n_t0])(sv_t, sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [1, sizeof (_t) * 4]. */ +typedef binary_imm_narrowt_base + binary_imm_narrowt_base_unsigned; +typedef shift_right_imm_narrow_wrapper + shift_right_imm_narrowt_to_uint_def; +SHAPE (shift_right_imm_narrowt_to_uint) + +/* void svfoo[_t0](_t *, sv[xN]_t) + void svfoo_vnum[_t0](_t *, int64_t, sv[xN]_t) + + where might be tied to (for non-truncating stores) or might + depend on the function base name (for truncating stores). */ +struct store_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + b.add_overloaded_functions (group, MODE_vnum); + build_all (b, "_,as,t0", group, MODE_none); + build_all (b, "_,as,ss64,t0", group, MODE_vnum); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + bool vnum_p = r.mode_suffix_id == MODE_vnum; + gcc_assert (r.mode_suffix_id == MODE_none || vnum_p); + + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (vnum_p ? 3 : 2, i, nargs) + || !r.require_pointer_type (i) + || (vnum_p && !r.require_scalar_type (i + 1, "int64_t")) + || ((type = r.infer_tuple_type (nargs - 1)) == NUM_TYPE_SUFFIXES)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (store) + +/* void svfoo_[s32]index[_t0](_t *, svint32_t, sv_t) + void svfoo_[s64]index[_t0](_t *, svint64_t, sv_t) + void svfoo_[u32]index[_t0](_t *, svuint32_t, sv_t) + void svfoo_[u64]index[_t0](_t *, svuint64_t, sv_t) + + void svfoo[_u32base]_index[_t0](svuint32_t, int64_t, sv_t) + void svfoo[_u64base]_index[_t0](svuint64_t, int64_t, sv_t) + + where might be tied to (for non-truncating stores) or might + depend on the function base name (for truncating stores). 
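+
+   For concreteness (a sketch, assuming <arm_sve.h>), a non-truncating
+   scatter store with a vector of signed 32-bit indices looks like:
+
+     void
+     scatter_by_index (svbool_t pg, int32_t *base, svint32_t idx,
+                       svint32_t data)
+     {
+       svst1_scatter_s32index_s32 (pg, base, idx, data);
+     }
+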
*/ +struct store_scatter_index_def : public store_scatter_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_index); + build_sv_index (b, "_,as,d,t0", group); + build_vs_index (b, "_,b,ss64,t0", group); + } +}; +SHAPE (store_scatter_index) + +/* void svfoo_[s64]index[_t0](_t *, svint64_t, sv_t) + void svfoo_[u64]index[_t0](_t *, svuint64_t, sv_t) + + void svfoo[_u32base]_index[_t0](svuint32_t, int64_t, sv_t) + void svfoo[_u64base]_index[_t0](svuint64_t, int64_t, sv_t) + + i.e. a version of store_scatter_index that doesn't support 32-bit + vector indices. */ +struct store_scatter_index_restricted_def : public store_scatter_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_index); + build_sv_index64 (b, "_,as,d,t0", group); + build_vs_index (b, "_,b,ss64,t0", group); + } +}; +SHAPE (store_scatter_index_restricted) + +/* void svfoo_[s32]offset[_t0](_t *, svint32_t, sv_t) + void svfoo_[s64]offset[_t0](_t *, svint64_t, sv_t) + void svfoo_[u32]offset[_t0](_t *, svuint32_t, sv_t) + void svfoo_[u64]offset[_t0](_t *, svuint64_t, sv_t) + + void svfoo[_u32base_t0](svuint32_t, sv_t) + void svfoo[_u64base_t0](svuint64_t, sv_t) + + void svfoo[_u32base]_offset[_t0](svuint32_t, int64_t, sv_t) + void svfoo[_u64base]_offset[_t0](svuint64_t, int64_t, sv_t) + + where might be tied to (for non-truncating stores) or might + depend on the function base name (for truncating stores). */ +struct store_scatter_offset_def : public store_scatter_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + b.add_overloaded_functions (group, MODE_offset); + build_sv_offset (b, "_,as,d,t0", group); + build_v_base (b, "_,b,t0", group); + build_vs_offset (b, "_,b,ss64,t0", group); + } +}; +SHAPE (store_scatter_offset) + +/* void svfoo_[s64]offset[_t0](_t *, svint64_t, sv_t) + void svfoo_[u32]offset[_t0](_t *, svuint32_t, sv_t) + void svfoo_[u64]offset[_t0](_t *, svuint64_t, sv_t) + + void svfoo[_u32base_t0](svuint32_t, sv_t) + void svfoo[_u64base_t0](svuint64_t, sv_t) + + void svfoo[_u32base]_offset[_t0](svuint32_t, int64_t, sv_t) + void svfoo[_u64base]_offset[_t0](svuint64_t, int64_t, sv_t) + + i.e. a version of store_scatter_offset that doesn't support svint32_t + offsets. */ +struct store_scatter_offset_restricted_def : public store_scatter_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + b.add_overloaded_functions (group, MODE_offset); + build_sv_uint_offset (b, "_,as,d,t0", group); + build_v_base (b, "_,b,t0", group); + build_vs_offset (b, "_,b,ss64,t0", group); + } +}; +SHAPE (store_scatter_offset_restricted) + +/* sv_t svfoo[_t0](svxN_t, sv_t). 
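+
+   The SVE2 two-vector table lookup is a plausible instance (a sketch,
+   assuming <arm_sve.h> and an SVE2 target):
+
+     svint32_t
+     lookup_pair (svint32x2_t table, svuint32_t idx)
+     {
+       return svtbl2_s32 (table, idx);
+     }
+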
*/ +struct tbl_tuple_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,t0,vu0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (2, i, nargs) + || (type = r.infer_tuple_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i + 1, i, type, TYPE_unsigned)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (tbl_tuple) + +/* sv_t svfoo[_t0](sv_t, svbfloatt16_t, svbfloat16_t). */ +struct ternary_bfloat_def + : public ternary_resize2_base<16, TYPE_bfloat, TYPE_bfloat> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vB,vB", group, MODE_none); + } +}; +SHAPE (ternary_bfloat) + +/* sv_t svfoo[_t0](sv_t, svbfloat16_t, svbfloat16_t, uint64_t) + + where the final argument is an integer constant expression in the range + [0, 7]. */ +typedef ternary_bfloat_lane_base<1> ternary_bfloat_lane_def; +SHAPE (ternary_bfloat_lane) + +/* sv_t svfoo[_t0](sv_t, svbfloat16_t, svbfloat16_t, uint64_t) + + where the final argument is an integer constant expression in the range + [0, 3]. */ +typedef ternary_bfloat_lane_base<2> ternary_bfloat_lanex2_def; +SHAPE (ternary_bfloat_lanex2) + +/* sv_t svfoo[_t0](sv_t, svbfloatt16_t, svbfloat16_t) + sv_t svfoo[_n_t0](sv_t, svbfloat16_t, bfloat16_t). */ +struct ternary_bfloat_opt_n_def + : public ternary_resize2_opt_n_base<16, TYPE_bfloat, TYPE_bfloat> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vB,vB", group, MODE_none); + build_all (b, "v0,v0,vB,sB", group, MODE_n); + } +}; +SHAPE (ternary_bfloat_opt_n) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t, + uint64_t) + + where the final argument is an integer constant expression in the range + [0, 16 / sizeof (_t) - 1]. */ +struct ternary_intq_uintq_lane_def + : public ternary_qq_lane_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vqs0,vqu0,su64", group, MODE_none); + } +}; +SHAPE (ternary_intq_uintq_lane) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, sv_t, + _t). */ +struct ternary_intq_uintq_opt_n_def + : public ternary_resize2_opt_n_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vqs0,vqu0", group, MODE_none); + build_all (b, "v0,v0,vqs0,squ0", group, MODE_n); + } +}; +SHAPE (ternary_intq_uintq_opt_n) + +/* svbool_t svfoo[_](sv_t, sv_t, sv_t, uint64_t) + + where the final argument is an integer constant expression in the + range [0, 16 / sizeof (_t) - 1]. 
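+
+   svmla_lane is a representative instance (a sketch, assuming
+   <arm_sve.h>); for 32-bit elements the lane index must be in [0, 3]:
+
+     svfloat32_t
+     fma_lane_1 (svfloat32_t acc, svfloat32_t a, svfloat32_t b)
+     {
+       return svmla_lane_f32 (acc, a, b, 1);
+     }
+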
*/ +struct ternary_lane_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,v0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (3, 1); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_lane_index (3); + } +}; +SHAPE (ternary_lane) + +/* svbool_t svfoo[_](sv_t, sv_t, sv_t, uint64_t, uint64_t) + + where the penultimate argument is an integer constant expression in + the range [0, 8 / sizeof (_t) - 1] and where the final argument + is an integer constant expression in {0, 90, 180, 270}. */ +struct ternary_lane_rotate_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,v0,su64,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (3, 2); + } + + bool + check (function_checker &c) const OVERRIDE + { + return (c.require_immediate_lane_index (3, 2) + && c.require_immediate_one_of (4, 0, 90, 180, 270)); + } +}; +SHAPE (ternary_lane_rotate) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t, uint64_t) + + where the final argument is an integer constant expression in the range + [0, 32 / sizeof (_t) - 1]. */ +struct ternary_long_lane_def + : public ternary_resize2_lane_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vh0,vh0,su64", group, MODE_none); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_lane_index (3); + } +}; +SHAPE (ternary_long_lane) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, sv_t, _t) + + i.e. a version of the standard ternary shape ternary_opt_n in which + the element type of the last two arguments is the half-sized + equivalent of . */ +struct ternary_long_opt_n_def + : public ternary_resize2_opt_n_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vh0,vh0", group, MODE_none); + build_all (b, "v0,v0,vh0,sh0", group, MODE_n); + } +}; +SHAPE (ternary_long_opt_n) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, sv_t, _t) + + i.e. the standard shape for ternary operations that operate on + uniform types. */ +struct ternary_opt_n_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,v0", group, MODE_none); + build_all (b, "v0,v0,v0,s0", group, MODE_n); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform_opt_n (3); + } +}; +SHAPE (ternary_opt_n) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t, uint64_t) + + where the final argument is an integer constant expression in the range + [0, 16 / sizeof (_t) - 1]. 
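+
+   The dot-product intrinsics are representative (a sketch, assuming
+   <arm_sve.h>): the last two vectors use the quarter-width element
+   type and the lane index selects a group of four elements:
+
+     svint32_t
+     dot_lane_0 (svint32_t acc, svint8_t a, svint8_t b)
+     {
+       return svdot_lane_s32 (acc, a, b, 0);
+     }
+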
*/ +struct ternary_qq_lane_def : public ternary_qq_lane_base<> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vq0,vq0,su64", group, MODE_none); + } +}; +SHAPE (ternary_qq_lane) + +/* svbool_t svfoo[_](sv_t, sv_t, sv_t, + uint64_t) + + where the final argument is an integer constant expression in + {0, 90, 180, 270}. */ +struct ternary_qq_lane_rotate_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vq0,vq0,su64,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (5, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i + 1, i, type, r.SAME_TYPE_CLASS, + r.QUARTER_SIZE) + || !r.require_derived_vector_type (i + 2, i, type, r.SAME_TYPE_CLASS, + r.QUARTER_SIZE) + || !r.require_integer_immediate (i + 3) + || !r.require_integer_immediate (i + 4)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } + + bool + check (function_checker &c) const OVERRIDE + { + return (c.require_immediate_lane_index (3, 4) + && c.require_immediate_one_of (4, 0, 90, 180, 270)); + } +}; +SHAPE (ternary_qq_lane_rotate) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, sv_t, _t) + + i.e. a version of the standard ternary shape ternary_opt_n in which + the element type of the last two arguments is the quarter-sized + equivalent of . */ +struct ternary_qq_opt_n_def + : public ternary_resize2_opt_n_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vq0,vq0", group, MODE_none); + build_all (b, "v0,v0,vq0,sq0", group, MODE_n); + } +}; +SHAPE (ternary_qq_opt_n) + +/* svbool_t svfoo[_](sv_t, sv_t, sv_t, + uint64_t) + + where the final argument is an integer constant expression in + {0, 90, 180, 270}. */ +struct ternary_qq_rotate_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vq0,vq0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (4, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_derived_vector_type (i + 1, i, type, r.SAME_TYPE_CLASS, + r.QUARTER_SIZE) + || !r.require_derived_vector_type (i + 2, i, type, r.SAME_TYPE_CLASS, + r.QUARTER_SIZE) + || !r.require_integer_immediate (i + 3)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_one_of (3, 0, 90, 180, 270); + } +}; +SHAPE (ternary_qq_rotate) + +/* svbool_t svfoo[_](sv_t, sv_t, sv_t, uint64_t) + + where the final argument is an integer constant expression in + {0, 90, 180, 270}. 
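+
+   svcmla is a representative instance (a sketch, assuming <arm_sve.h>);
+   the final argument selects the complex rotation:
+
+     svfloat32_t
+     cmla_rot90 (svbool_t pg, svfloat32_t acc, svfloat32_t a,
+                 svfloat32_t b)
+     {
+       return svcmla_f32_x (pg, acc, a, b, 90);
+     }
+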
*/ +struct ternary_rotate_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,v0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (3, 1); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_one_of (3, 0, 90, 180, 270); + } +}; +SHAPE (ternary_rotate) + +/* sv_t svfoo[_n_t0])(sv_t, sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [0, sizeof (_t) * 8 - 1]. */ +struct ternary_shift_left_imm_def : public ternary_shift_imm_base +{ + bool + check (function_checker &c) const OVERRIDE + { + unsigned int bits = c.type_suffix (0).element_bits; + return c.require_immediate_range (2, 0, bits - 1); + } +}; +SHAPE (ternary_shift_left_imm) + +/* sv_t svfoo[_n_t0])(sv_t, sv_t, uint64_t) + + where the final argument must be an integer constant expression in the + range [1, sizeof (_t) * 8]. */ +struct ternary_shift_right_imm_def : public ternary_shift_imm_base +{ + bool + check (function_checker &c) const OVERRIDE + { + unsigned int bits = c.type_suffix (0).element_bits; + return c.require_immediate_range (2, 1, bits); + } +}; +SHAPE (ternary_shift_right_imm) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t). */ +struct ternary_uint_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,vu0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || !r.require_matching_vector_type (i + 1, type) + || !r.require_derived_vector_type (i + 2, i, type, TYPE_unsigned)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (ternary_uint) + +/* sv_t svfoo[_t0](sv_t, svu_t, + sv_t). */ +struct ternary_uintq_intq_def + : public ternary_resize2_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vqu0,vqs0", group, MODE_none); + } +}; +SHAPE (ternary_uintq_intq) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t, + uint64_t) + + where the final argument is an integer constant expression in the range + [0, 16 / sizeof (_t) - 1]. */ +struct ternary_uintq_intq_lane_def + : public ternary_qq_lane_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vqu0,vqs0,su64", group, MODE_none); + } +}; +SHAPE (ternary_uintq_intq_lane) + +/* sv_t svfoo[_t0](sv_t, sv_t, sv_t) + sv_t svfoo[_n_t0](sv_t, sv_t, + _t). */ +struct ternary_uintq_intq_opt_n_def + : public ternary_resize2_opt_n_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,vqu0,vqs0", group, MODE_none); + build_all (b, "v0,v0,vqu0,sqs0", group, MODE_n); + } +}; +SHAPE (ternary_uintq_intq_opt_n) + +/* svbool_t svfoo[_](sv_t, sv_t, uint64_t) + + where the final argument is an integer constant expression in the + range [0, 7]. 
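+
+   svtmad is the corresponding intrinsic (a sketch, assuming
+   <arm_sve.h>); the immediate selects the coefficient index and must
+   be in [0, 7]:
+
+     svfloat32_t
+     tmad_step_3 (svfloat32_t a, svfloat32_t b)
+     {
+       return svtmad_f32 (a, b, 3);
+     }
+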
*/ +struct tmad_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0,v0,su64", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_uniform (2, 1); + } + + bool + check (function_checker &c) const OVERRIDE + { + return c.require_immediate_range (2, 0, 7); + } +}; +SHAPE (tmad) + +/* sv_t svfoo[_t0](sv_t) + + i.e. the standard shape for unary operations that operate on + uniform types. */ +struct unary_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_unary (); + } +}; +SHAPE (unary) + +/* sv_t svfoo_t0[_t1](sv_t) + + where the target type must be specified explicitly but the source + type can be inferred. */ +struct unary_convert_def : public overloaded_base<1> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v1", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_unary (r.type_suffix (0).tclass, + r.type_suffix (0).element_bits); + } +}; +SHAPE (unary_convert) + +/* sv_t svfoo_t0[_t1](sv_t, sv_t) + + This is a version of unary_convert in which the even-indexed + elements are passed in as a first parameter, before any governing + predicate. */ +struct unary_convert_narrowt_def : public overloaded_base<1> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,v1", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_unary (r.type_suffix (0).tclass, + r.type_suffix (0).element_bits, true); + } +}; +SHAPE (unary_convert_narrowt) + +/* sv_t svfoo[_t0](sv_t). */ +struct unary_long_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,vh0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type, result_type; + if (!r.check_gp_argument (1, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || (result_type = long_type_suffix (r, type)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + if (tree res = r.lookup_form (r.mode_suffix_id, result_type)) + return res; + + return r.report_no_such_form (type); + } +}; +SHAPE (unary_long) + +/* sv_t svfoo[_n]_t0(_t). */ +struct unary_n_def : public overloaded_base<1> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + /* The "_n" suffix is optional; the full name has it, but the short + name doesn't. */ + build_all (b, "v0,s0", group, MODE_n, true); + } + + tree + resolve (function_resolver &) const OVERRIDE + { + /* The short forms just make "_n" implicit, so no resolution is needed. */ + gcc_unreachable (); + } +}; +SHAPE (unary_n) + +/* sv_t svfoo[_t0](sv_t). */ +typedef unary_narrowb_base<> unary_narrowb_def; +SHAPE (unary_narrowb) + +/* sv_t svfoo[_t0](sv_t, sv_t). 
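+
+   A plausible SVE2 instance (a sketch, assuming <arm_sve.h> and an
+   SVE2 target) is the saturating "narrow top" extract, which writes
+   narrowed results into the odd elements and keeps the even elements
+   of the first argument:
+
+     svint8_t
+     narrow_top (svint8_t even, svint16_t wide)
+     {
+       return svqxtnt_s16 (even, wide);
+     }
+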
*/ +typedef unary_narrowt_base<> unary_narrowt_def; +SHAPE (unary_narrowt) + +/* sv_t svfoo[_t0](sv_t). */ +typedef unary_narrowb_base unary_narrowb_to_uint_def; +SHAPE (unary_narrowb_to_uint) + +/* sv_t svfoo[_t0](sv_t, sv_t). */ +typedef unary_narrowt_base unary_narrowt_to_uint_def; +SHAPE (unary_narrowt_to_uint) + +/* svbool_t svfoo(svbool_t). */ +struct unary_pred_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + build_all (b, "v0,v0", group, MODE_none); + } +}; +SHAPE (unary_pred) + +/* sv_t svfoo[_t0](sv_t) + + i.e. a version of "unary" in which the returned vector contains + signed integers. */ +struct unary_to_int_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vs0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_unary (TYPE_signed); + } +}; +SHAPE (unary_to_int) + +/* sv_t svfoo[_t0](sv_t) + + i.e. a version of "unary" in which the returned vector contains + unsigned integers. */ +struct unary_to_uint_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "vu0,v0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + return r.resolve_unary (TYPE_unsigned); + } +}; +SHAPE (unary_to_uint) + +/* sv_t svfoo[_t0](sv_t) + + where always belongs a certain type class, and where + therefore uniquely determines . */ +struct unary_uint_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,vu0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (1, i, nargs) + || (type = r.infer_unsigned_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + /* Search for a valid suffix with the same number of bits as TYPE. */ + unsigned int element_bits = type_suffixes[type].element_bits; + if (type_suffixes[type].unsigned_p) + for (unsigned int j = 0; j < NUM_TYPE_SUFFIXES; ++j) + if (type_suffixes[j].element_bits == element_bits) + if (tree res = r.lookup_form (r.mode_suffix_id, + type_suffix_index (j))) + return res; + + return r.report_no_such_form (type); + } +}; +SHAPE (unary_uint) + +/* sv_t svfoo[_](sv_t) + + i.e. a version of "unary" in which the source elements are half the + size of the destination elements, but have the same type class. */ +struct unary_widen_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group) const OVERRIDE + { + b.add_overloaded_functions (group, MODE_none); + build_all (b, "v0,vh0", group, MODE_none); + } + + tree + resolve (function_resolver &r) const OVERRIDE + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (1, i, nargs) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + /* There is only a single form for predicates. 
*/ + if (type == TYPE_SUFFIX_b) + return r.resolve_to (r.mode_suffix_id, type); + + if (type_suffixes[type].integer_p + && type_suffixes[type].element_bits < 64) + { + type_suffix_index wide_suffix + = find_type_suffix (type_suffixes[type].tclass, + type_suffixes[type].element_bits * 2); + if (tree res = r.lookup_form (r.mode_suffix_id, wide_suffix)) + return res; + } + + return r.report_no_such_form (type); + } +}; +SHAPE (unary_widen) + +} diff --git a/gcc/config/aarch64/aarch64-sve-builtins-shapes.h b/gcc/config/aarch64/aarch64-sve-builtins-shapes.h new file mode 100644 index 000000000..b36f50acd --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins-shapes.h @@ -0,0 +1,191 @@ +/* ACLE support for AArch64 SVE (function shapes) + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_AARCH64_SVE_BUILTINS_SHAPES_H +#define GCC_AARCH64_SVE_BUILTINS_SHAPES_H + +namespace aarch64_sve +{ + /* The naming convention is: + + - to use the name of the function if the rules are very specific to + a particular function (e.g. svext, for which the range of the + final immediate value is in no way generic). + + - to use names like "unary" etc. if the rules are somewhat generic, + especially if there are no ranges involved. + + When using generic names, the handling of the final vector argument + can be modified as follows: + + - an "_n" suffix changes the argument from a vector to a scalar. + + - an "_opt_n" suffix says that there are two forms of each function: + one in which the argument is the usual vector, and one in which it + is replaced by a scalar. + + - "_int" and "_uint" replace the argument's element type with a + signed or unsigned integer of the same width. The suffixes above + then indicate whether this final argument is or might be a scalar. + + - "_int64" and "_uint64" similarly replace the argument's element type + with int64_t or uint64_t. + + - "_wide" replaces the argument's element type with a 64-bit integer + of the same signedness. This only makes sense for integer elements. + + - "_lane" indicates that the argument is indexed by a constant lane + number, provided as an immediately-following argument of type uint64_t. + + Also: + + - "inherent" means that the function takes no arguments. + + - "_rotate" means that the final argument is a rotation amount + (0, 90, 180 or 270). + + - "_scalar" indicates that all data arguments are scalars rather + than vectors. + + - in gather/scatter addresses, "sv" stands for "scalar base, + vector displacement" while "vs" stands for "vector base, + scalar displacement". + + - "_pred" indicates that the function takes an svbool_t argument + that does not act as a governing predicate.. 
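+
+     As a point of reference (illustrative rather than exhaustive):
+     "binary_opt_n" covers pairs such as svadd_s32_x, which takes two
+     vectors, and svadd_n_s32_x, which replaces the second vector with
+     a scalar.
+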
*/ + namespace shapes + { + extern const function_shape *const adr_index; + extern const function_shape *const adr_offset; + extern const function_shape *const binary; + extern const function_shape *const binary_int_opt_n; + extern const function_shape *const binary_lane; + extern const function_shape *const binary_long_lane; + extern const function_shape *const binary_long_opt_n; + extern const function_shape *const binary_n; + extern const function_shape *const binary_narrowb_opt_n; + extern const function_shape *const binary_narrowt_opt_n; + extern const function_shape *const binary_opt_n; + extern const function_shape *const binary_pred; + extern const function_shape *const binary_rotate; + extern const function_shape *const binary_scalar; + extern const function_shape *const binary_to_uint; + extern const function_shape *const binary_uint; + extern const function_shape *const binary_uint_n; + extern const function_shape *const binary_uint_opt_n; + extern const function_shape *const binary_uint64_n; + extern const function_shape *const binary_uint64_opt_n; + extern const function_shape *const binary_wide; + extern const function_shape *const binary_wide_opt_n; + extern const function_shape *const clast; + extern const function_shape *const compare; + extern const function_shape *const compare_opt_n; + extern const function_shape *const compare_ptr; + extern const function_shape *const compare_scalar; + extern const function_shape *const compare_wide_opt_n; + extern const function_shape *const count_inherent; + extern const function_shape *const count_pat; + extern const function_shape *const count_pred; + extern const function_shape *const count_vector; + extern const function_shape *const create; + extern const function_shape *const dupq; + extern const function_shape *const ext; + extern const function_shape *const fold_left; + extern const function_shape *const get; + extern const function_shape *const inc_dec; + extern const function_shape *const inc_dec_pat; + extern const function_shape *const inc_dec_pred; + extern const function_shape *const inc_dec_pred_scalar; + extern const function_shape *const inherent; + extern const function_shape *const inherent_b; + extern const function_shape *const load; + extern const function_shape *const load_ext; + extern const function_shape *const load_ext_gather_index; + extern const function_shape *const load_ext_gather_index_restricted; + extern const function_shape *const load_ext_gather_offset; + extern const function_shape *const load_ext_gather_offset_restricted; + extern const function_shape *const load_gather_sv; + extern const function_shape *const load_gather_sv_restricted; + extern const function_shape *const load_gather_vs; + extern const function_shape *const load_replicate; + extern const function_shape *const mmla; + extern const function_shape *const pattern_pred; + extern const function_shape *const prefetch; + extern const function_shape *const prefetch_gather_index; + extern const function_shape *const prefetch_gather_offset; + extern const function_shape *const ptest; + extern const function_shape *const rdffr; + extern const function_shape *const reduction; + extern const function_shape *const reduction_wide; + extern const function_shape *const set; + extern const function_shape *const setffr; + extern const function_shape *const shift_left_imm_long; + extern const function_shape *const shift_left_imm_to_uint; + extern const function_shape *const shift_right_imm; + extern const function_shape *const 
shift_right_imm_narrowb; + extern const function_shape *const shift_right_imm_narrowt; + extern const function_shape *const shift_right_imm_narrowb_to_uint; + extern const function_shape *const shift_right_imm_narrowt_to_uint; + extern const function_shape *const store; + extern const function_shape *const store_scatter_index; + extern const function_shape *const store_scatter_index_restricted; + extern const function_shape *const store_scatter_offset; + extern const function_shape *const store_scatter_offset_restricted; + extern const function_shape *const tbl_tuple; + extern const function_shape *const ternary_bfloat; + extern const function_shape *const ternary_bfloat_lane; + extern const function_shape *const ternary_bfloat_lanex2; + extern const function_shape *const ternary_bfloat_opt_n; + extern const function_shape *const ternary_intq_uintq_lane; + extern const function_shape *const ternary_intq_uintq_opt_n; + extern const function_shape *const ternary_lane; + extern const function_shape *const ternary_lane_rotate; + extern const function_shape *const ternary_long_lane; + extern const function_shape *const ternary_long_opt_n; + extern const function_shape *const ternary_opt_n; + extern const function_shape *const ternary_qq_lane; + extern const function_shape *const ternary_qq_lane_rotate; + extern const function_shape *const ternary_qq_opt_n; + extern const function_shape *const ternary_qq_rotate; + extern const function_shape *const ternary_rotate; + extern const function_shape *const ternary_shift_left_imm; + extern const function_shape *const ternary_shift_right_imm; + extern const function_shape *const ternary_uint; + extern const function_shape *const ternary_uintq_intq; + extern const function_shape *const ternary_uintq_intq_lane; + extern const function_shape *const ternary_uintq_intq_opt_n; + extern const function_shape *const tmad; + extern const function_shape *const unary; + extern const function_shape *const unary_convert; + extern const function_shape *const unary_convert_narrowt; + extern const function_shape *const unary_long; + extern const function_shape *const unary_n; + extern const function_shape *const unary_narrowb; + extern const function_shape *const unary_narrowt; + extern const function_shape *const unary_narrowb_to_uint; + extern const function_shape *const unary_narrowt_to_uint; + extern const function_shape *const unary_pred; + extern const function_shape *const unary_to_int; + extern const function_shape *const unary_to_uint; + extern const function_shape *const unary_uint; + extern const function_shape *const unary_widen; + } +} + +#endif diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc new file mode 100644 index 000000000..f830d9294 --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -0,0 +1,3568 @@ +/* ACLE support for AArch64 SVE + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. 
If not see + . */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "rtl.h" +#include "tm_p.h" +#include "memmodel.h" +#include "insn-codes.h" +#include "optabs.h" +#include "recog.h" +#include "diagnostic.h" +#include "expr.h" +#include "basic-block.h" +#include "function.h" +#include "fold-const.h" +#include "gimple.h" +#include "gimple-iterator.h" +#include "gimplify.h" +#include "explow.h" +#include "emit-rtl.h" +#include "tree-vector-builder.h" +#include "stor-layout.h" +#include "regs.h" +#include "alias.h" +#include "gimple-fold.h" +#include "langhooks.h" +#include "stringpool.h" +#include "aarch64-sve-builtins.h" +#include "aarch64-sve-builtins-base.h" +#include "aarch64-sve-builtins-shapes.h" + +namespace aarch64_sve { + +/* Static information about each single-predicate or single-vector + ABI and ACLE type. */ +struct vector_type_info +{ + /* The name of the type as declared by arm_sve.h. */ + const char *acle_name; + + /* The name of the type specified in AAPCS64. The type is always + available under this name, even when arm_sve.h isn't included. */ + const char *abi_name; + + /* The C++ mangling of ABI_NAME. */ + const char *mangled_name; +}; + +/* Describes a function decl. */ +class GTY(()) registered_function +{ +public: + /* The ACLE function that the decl represents. */ + function_instance instance GTY ((skip)); + + /* The decl itself. */ + tree decl; + + /* The architecture extensions that the function requires, as a set of + AARCH64_FL_* flags. */ + uint64_t required_extensions; + + /* True if the decl represents an overloaded function that needs to be + resolved by function_resolver. */ + bool overloaded_p; +}; + +/* Hash traits for registered_function. */ +struct registered_function_hasher : nofree_ptr_hash +{ + typedef function_instance compare_type; + + static hashval_t hash (value_type); + static bool equal (value_type, const compare_type &); +}; + +/* Information about each single-predicate or single-vector type. */ +static CONSTEXPR const vector_type_info vector_types[] = { +#define DEF_SVE_TYPE(ACLE_NAME, NCHARS, ABI_NAME, SCALAR_TYPE) \ + { #ACLE_NAME, #ABI_NAME, #NCHARS #ABI_NAME }, +#include "aarch64-sve-builtins.def" +}; + +/* The function name suffix associated with each predication type. */ +static const char *const pred_suffixes[NUM_PREDS + 1] = { + "", + "", + "_m", + "_x", + "_z", + "" +}; + +/* Static information about each mode_suffix_index. */ +CONSTEXPR const mode_suffix_info mode_suffixes[] = { +#define VECTOR_TYPE_none NUM_VECTOR_TYPES +#define DEF_SVE_MODE(NAME, BASE, DISPLACEMENT, UNITS) \ + { "_" #NAME, VECTOR_TYPE_##BASE, VECTOR_TYPE_##DISPLACEMENT, UNITS_##UNITS }, +#include "aarch64-sve-builtins.def" +#undef VECTOR_TYPE_none + { "", NUM_VECTOR_TYPES, NUM_VECTOR_TYPES, UNITS_none } +}; + +/* Static information about each type_suffix_index. 
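+
+   Each entry comes from a DEF_SVE_TYPE_SUFFIX record in
+   aarch64-sve-builtins.def; a record along the lines of
+
+     DEF_SVE_TYPE_SUFFIX (s32, svint32_t, signed, 32, VNx4SImode)
+
+   (shown only for orientation) produces the "_s32" suffix with 32-bit
+   signed integer elements.
+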
*/ +CONSTEXPR const type_suffix_info type_suffixes[NUM_TYPE_SUFFIXES + 1] = { +#define DEF_SVE_TYPE_SUFFIX(NAME, ACLE_TYPE, CLASS, BITS, MODE) \ + { "_" #NAME, \ + VECTOR_TYPE_##ACLE_TYPE, \ + TYPE_##CLASS, \ + BITS, \ + BITS / BITS_PER_UNIT, \ + TYPE_##CLASS == TYPE_signed || TYPE_##CLASS == TYPE_unsigned, \ + TYPE_##CLASS == TYPE_unsigned, \ + TYPE_##CLASS == TYPE_float, \ + TYPE_##CLASS == TYPE_bool, \ + 0, \ + MODE }, +#include "aarch64-sve-builtins.def" + { "", NUM_VECTOR_TYPES, TYPE_bool, 0, 0, false, false, false, false, + 0, VOIDmode } +}; + +/* Define a TYPES_ macro for each combination of type + suffixes that an ACLE function can have, where is the + name used in DEF_SVE_FUNCTION entries. + + Use S (T) for single type suffix T and D (T1, T2) for a pair of type + suffixes T1 and T2. Use commas to separate the suffixes. + + Although the order shouldn't matter, the convention is to sort the + suffixes lexicographically after dividing suffixes into a type + class ("b", "f", etc.) and a numerical bit count. */ + +/* _b8 _b16 _b32 _b64. */ +#define TYPES_all_pred(S, D) \ + S (b8), S (b16), S (b32), S (b64) + +/* _f16 _f32 _f64. */ +#define TYPES_all_float(S, D) \ + S (f16), S (f32), S (f64) + +/* _s8 _s16 _s32 _s64. */ +#define TYPES_all_signed(S, D) \ + S (s8), S (s16), S (s32), S (s64) + +/* _f16 _f32 _f64 + _s8 _s16 _s32 _s64. */ +#define TYPES_all_float_and_signed(S, D) \ + TYPES_all_float (S, D), TYPES_all_signed (S, D) + +/* _u8 _u16 _u32 _u64. */ +#define TYPES_all_unsigned(S, D) \ + S (u8), S (u16), S (u32), S (u64) + +/* _s8 _s16 _s32 _s64 + _u8 _u16 _u32 _u64. */ +#define TYPES_all_integer(S, D) \ + TYPES_all_signed (S, D), TYPES_all_unsigned (S, D) + +/* _f16 _f32 _f64 + _s8 _s16 _s32 _s64 + _u8 _u16 _u32 _u64. */ +#define TYPES_all_arith(S, D) \ + TYPES_all_float (S, D), TYPES_all_integer (S, D) + +/* _bf16 + _f16 _f32 _f64 + _s8 _s16 _s32 _s64 + _u8 _u16 _u32 _u64. */ +#define TYPES_all_data(S, D) \ + S (bf16), TYPES_all_arith (S, D) + +/* _b only. */ +#define TYPES_b(S, D) \ + S (b) + +/* _u8. */ +#define TYPES_b_unsigned(S, D) \ + S (u8) + +/* _s8 + _u8. */ +#define TYPES_b_integer(S, D) \ + S (s8), TYPES_b_unsigned (S, D) + +/* _s8 _s16 + _u8 _u16. */ +#define TYPES_bh_integer(S, D) \ + S (s8), S (s16), S (u8), S (u16) + +/* _u8 _u32. */ +#define TYPES_bs_unsigned(S, D) \ + S (u8), S (u32) + +/* _s8 _s16 _s32. */ +#define TYPES_bhs_signed(S, D) \ + S (s8), S (s16), S (s32) + +/* _u8 _u16 _u32. */ +#define TYPES_bhs_unsigned(S, D) \ + S (u8), S (u16), S (u32) + +/* _s8 _s16 _s32 + _u8 _u16 _u32. */ +#define TYPES_bhs_integer(S, D) \ + TYPES_bhs_signed (S, D), TYPES_bhs_unsigned (S, D) + +/* _s16 + _u16. */ +#define TYPES_h_integer(S, D) \ + S (s16), S (u16) + +/* _s16 _s32. */ +#define TYPES_hs_signed(S, D) \ + S (s16), S (s32) + +/* _s16 _s32 + _u16 _u32. */ +#define TYPES_hs_integer(S, D) \ + TYPES_hs_signed (S, D), S (u16), S (u32) + +/* _f16 _f32. */ +#define TYPES_hs_float(S, D) \ + S (f16), S (f32) + +/* _u16 _u64. */ +#define TYPES_hd_unsigned(S, D) \ + S (u16), S (u64) + +/* _s16 _s32 _s64. */ +#define TYPES_hsd_signed(S, D) \ + S (s16), S (s32), S (s64) + +/* _s16 _s32 _s64 + _u16 _u32 _u64. */ +#define TYPES_hsd_integer(S, D) \ + TYPES_hsd_signed (S, D), S (u16), S (u32), S (u64) + +/* _f32. */ +#define TYPES_s_float(S, D) \ + S (f32) + +/* _f32 + _s16 _s32 _s64 + _u16 _u32 _u64. */ +#define TYPES_s_float_hsd_integer(S, D) \ + TYPES_s_float (S, D), TYPES_hsd_integer (S, D) + +/* _f32 + _s32 _s64 + _u32 _u64. 
*/ +#define TYPES_s_float_sd_integer(S, D) \ + TYPES_s_float (S, D), TYPES_sd_integer (S, D) + +/* _s32. */ +#define TYPES_s_signed(S, D) \ + S (s32) + +/* _u32. */ +#define TYPES_s_unsigned(S, D) \ + S (u32) + +/* _s32 _u32. */ +#define TYPES_s_integer(S, D) \ + TYPES_s_signed (S, D), TYPES_s_unsigned (S, D) + +/* _s32 _s64. */ +#define TYPES_sd_signed(S, D) \ + S (s32), S (s64) + +/* _u32 _u64. */ +#define TYPES_sd_unsigned(S, D) \ + S (u32), S (u64) + +/* _s32 _s64 + _u32 _u64. */ +#define TYPES_sd_integer(S, D) \ + TYPES_sd_signed (S, D), TYPES_sd_unsigned (S, D) + +/* _f32 _f64 + _s32 _s64 + _u32 _u64. */ +#define TYPES_sd_data(S, D) \ + S (f32), S (f64), TYPES_sd_integer (S, D) + +/* _f16 _f32 _f64 + _s32 _s64 + _u32 _u64. */ +#define TYPES_all_float_and_sd_integer(S, D) \ + TYPES_all_float (S, D), TYPES_sd_integer (S, D) + +/* _f64. */ +#define TYPES_d_float(S, D) \ + S (f64) + +/* _u64. */ +#define TYPES_d_unsigned(S, D) \ + S (u64) + +/* _s64 + _u64. */ +#define TYPES_d_integer(S, D) \ + S (s64), TYPES_d_unsigned (S, D) + +/* _f64 + _s64 + _u64. */ +#define TYPES_d_data(S, D) \ + TYPES_d_float (S, D), TYPES_d_integer (S, D) + +/* All the type combinations allowed by svcvt. */ +#define TYPES_cvt(S, D) \ + D (f16, f32), D (f16, f64), \ + D (f16, s16), D (f16, s32), D (f16, s64), \ + D (f16, u16), D (f16, u32), D (f16, u64), \ + \ + D (f32, f16), D (f32, f64), \ + D (f32, s32), D (f32, s64), \ + D (f32, u32), D (f32, u64), \ + \ + D (f64, f16), D (f64, f32), \ + D (f64, s32), D (f64, s64), \ + D (f64, u32), D (f64, u64), \ + \ + D (s16, f16), \ + D (s32, f16), D (s32, f32), D (s32, f64), \ + D (s64, f16), D (s64, f32), D (s64, f64), \ + \ + D (u16, f16), \ + D (u32, f16), D (u32, f32), D (u32, f64), \ + D (u64, f16), D (u64, f32), D (u64, f64) + +/* _bf16_f32. */ +#define TYPES_cvt_bfloat(S, D) \ + D (bf16, f32) + +/* _f32_f16 + _f64_f32. */ +#define TYPES_cvt_long(S, D) \ + D (f32, f16), D (f64, f32) + +/* _f16_f32. */ +#define TYPES_cvt_narrow_s(S, D) \ + D (f32, f64) + +/* _f16_f32 + _f32_f64. */ +#define TYPES_cvt_narrow(S, D) \ + D (f16, f32), TYPES_cvt_narrow_s (S, D) + +/* { _s32 _s64 } x { _b8 _b16 _b32 _b64 } + { _u32 _u64 }. */ +#define TYPES_inc_dec_n1(D, A) \ + D (A, b8), D (A, b16), D (A, b32), D (A, b64) +#define TYPES_inc_dec_n(S, D) \ + TYPES_inc_dec_n1 (D, s32), \ + TYPES_inc_dec_n1 (D, s64), \ + TYPES_inc_dec_n1 (D, u32), \ + TYPES_inc_dec_n1 (D, u64) + +/* { _bf16 } { _bf16 } + { _f16 _f32 _f64 } { _f16 _f32 _f64 } + { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 } + { _u8 _u16 _u32 _u64 } { _u8 _u16 _u32 _u64 }. 
*/ +#define TYPES_reinterpret1(D, A) \ + D (A, bf16), \ + D (A, f16), D (A, f32), D (A, f64), \ + D (A, s8), D (A, s16), D (A, s32), D (A, s64), \ + D (A, u8), D (A, u16), D (A, u32), D (A, u64) +#define TYPES_reinterpret(S, D) \ + TYPES_reinterpret1 (D, bf16), \ + TYPES_reinterpret1 (D, f16), \ + TYPES_reinterpret1 (D, f32), \ + TYPES_reinterpret1 (D, f64), \ + TYPES_reinterpret1 (D, s8), \ + TYPES_reinterpret1 (D, s16), \ + TYPES_reinterpret1 (D, s32), \ + TYPES_reinterpret1 (D, s64), \ + TYPES_reinterpret1 (D, u8), \ + TYPES_reinterpret1 (D, u16), \ + TYPES_reinterpret1 (D, u32), \ + TYPES_reinterpret1 (D, u64) + +/* { _b8 _b16 _b32 _b64 } x { _s32 _s64 } + { _u32 _u64 } */ +#define TYPES_while1(D, bn) \ + D (bn, s32), D (bn, s64), D (bn, u32), D (bn, u64) +#define TYPES_while(S, D) \ + TYPES_while1 (D, b8), \ + TYPES_while1 (D, b16), \ + TYPES_while1 (D, b32), \ + TYPES_while1 (D, b64) + +/* Describe a pair of type suffixes in which only the first is used. */ +#define DEF_VECTOR_TYPE(X) { TYPE_SUFFIX_ ## X, NUM_TYPE_SUFFIXES } + +/* Describe a pair of type suffixes in which both are used. */ +#define DEF_DOUBLE_TYPE(X, Y) { TYPE_SUFFIX_ ## X, TYPE_SUFFIX_ ## Y } + +/* Create an array that can be used in aarch64-sve-builtins.def to + select the type suffixes in TYPES_. */ +#define DEF_SVE_TYPES_ARRAY(NAME) \ + static const type_suffix_pair types_##NAME[] = { \ + TYPES_##NAME (DEF_VECTOR_TYPE, DEF_DOUBLE_TYPE), \ + { NUM_TYPE_SUFFIXES, NUM_TYPE_SUFFIXES } \ + } + +/* For functions that don't take any type suffixes. */ +static const type_suffix_pair types_none[] = { + { NUM_TYPE_SUFFIXES, NUM_TYPE_SUFFIXES }, + { NUM_TYPE_SUFFIXES, NUM_TYPE_SUFFIXES } +}; + +/* Create an array for each TYPES_ macro above. */ +DEF_SVE_TYPES_ARRAY (all_pred); +DEF_SVE_TYPES_ARRAY (all_float); +DEF_SVE_TYPES_ARRAY (all_signed); +DEF_SVE_TYPES_ARRAY (all_float_and_signed); +DEF_SVE_TYPES_ARRAY (all_unsigned); +DEF_SVE_TYPES_ARRAY (all_integer); +DEF_SVE_TYPES_ARRAY (all_arith); +DEF_SVE_TYPES_ARRAY (all_data); +DEF_SVE_TYPES_ARRAY (b); +DEF_SVE_TYPES_ARRAY (b_unsigned); +DEF_SVE_TYPES_ARRAY (b_integer); +DEF_SVE_TYPES_ARRAY (bh_integer); +DEF_SVE_TYPES_ARRAY (bs_unsigned); +DEF_SVE_TYPES_ARRAY (bhs_signed); +DEF_SVE_TYPES_ARRAY (bhs_unsigned); +DEF_SVE_TYPES_ARRAY (bhs_integer); +DEF_SVE_TYPES_ARRAY (h_integer); +DEF_SVE_TYPES_ARRAY (hs_signed); +DEF_SVE_TYPES_ARRAY (hs_integer); +DEF_SVE_TYPES_ARRAY (hs_float); +DEF_SVE_TYPES_ARRAY (hd_unsigned); +DEF_SVE_TYPES_ARRAY (hsd_signed); +DEF_SVE_TYPES_ARRAY (hsd_integer); +DEF_SVE_TYPES_ARRAY (s_float); +DEF_SVE_TYPES_ARRAY (s_float_hsd_integer); +DEF_SVE_TYPES_ARRAY (s_float_sd_integer); +DEF_SVE_TYPES_ARRAY (s_signed); +DEF_SVE_TYPES_ARRAY (s_unsigned); +DEF_SVE_TYPES_ARRAY (s_integer); +DEF_SVE_TYPES_ARRAY (sd_signed); +DEF_SVE_TYPES_ARRAY (sd_unsigned); +DEF_SVE_TYPES_ARRAY (sd_integer); +DEF_SVE_TYPES_ARRAY (sd_data); +DEF_SVE_TYPES_ARRAY (all_float_and_sd_integer); +DEF_SVE_TYPES_ARRAY (d_float); +DEF_SVE_TYPES_ARRAY (d_unsigned); +DEF_SVE_TYPES_ARRAY (d_integer); +DEF_SVE_TYPES_ARRAY (d_data); +DEF_SVE_TYPES_ARRAY (cvt); +DEF_SVE_TYPES_ARRAY (cvt_bfloat); +DEF_SVE_TYPES_ARRAY (cvt_long); +DEF_SVE_TYPES_ARRAY (cvt_narrow_s); +DEF_SVE_TYPES_ARRAY (cvt_narrow); +DEF_SVE_TYPES_ARRAY (inc_dec_n); +DEF_SVE_TYPES_ARRAY (reinterpret); +DEF_SVE_TYPES_ARRAY (while); + +/* Used by functions that have no governing predicate. 
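As a concrete check on the S/D macro machinery, this is what two of the DEF_SVE_TYPES_ARRAY invocations above expand to when written out by hand (a sketch of the preprocessor output, not generated text):

   /* DEF_SVE_TYPES_ARRAY (all_pred):
        static const type_suffix_pair types_all_pred[] = {
          { TYPE_SUFFIX_b8, NUM_TYPE_SUFFIXES },
          { TYPE_SUFFIX_b16, NUM_TYPE_SUFFIXES },
          { TYPE_SUFFIX_b32, NUM_TYPE_SUFFIXES },
          { TYPE_SUFFIX_b64, NUM_TYPE_SUFFIXES },
          { NUM_TYPE_SUFFIXES, NUM_TYPE_SUFFIXES }
        };

      DEF_SVE_TYPES_ARRAY (cvt_bfloat):
        static const type_suffix_pair types_cvt_bfloat[] = {
          { TYPE_SUFFIX_bf16, TYPE_SUFFIX_f32 },
          { NUM_TYPE_SUFFIXES, NUM_TYPE_SUFFIXES }
        };

      so S entries leave the second suffix unused, D entries fill in
      both, and a NUM_TYPE_SUFFIXES pair acts as the terminator.  */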
*/ +static const predication_index preds_none[] = { PRED_none, NUM_PREDS }; + +/* Used by functions that have a governing predicate but do not have an + explicit suffix. */ +static const predication_index preds_implicit[] = { PRED_implicit, NUM_PREDS }; + +/* Used by functions that allow merging and "don't care" predication, + but are not suitable for predicated MOVPRFX. */ +static const predication_index preds_mx[] = { + PRED_m, PRED_x, NUM_PREDS +}; + +/* Used by functions that allow merging, zeroing and "don't care" + predication. */ +static const predication_index preds_mxz[] = { + PRED_m, PRED_x, PRED_z, NUM_PREDS +}; + +/* Used by functions that have the mxz predicated forms above, and in addition + have an unpredicated form. */ +static const predication_index preds_mxz_or_none[] = { + PRED_m, PRED_x, PRED_z, PRED_none, NUM_PREDS +}; + +/* Used by functions that allow merging and zeroing predication but have + no "_x" form. */ +static const predication_index preds_mz[] = { PRED_m, PRED_z, NUM_PREDS }; + +/* Used by functions that have an unpredicated form and a _z predicated + form. */ +static const predication_index preds_z_or_none[] = { + PRED_z, PRED_none, NUM_PREDS +}; + +/* Used by (mostly predicate) functions that only support "_z" predication. */ +static const predication_index preds_z[] = { PRED_z, NUM_PREDS }; + +/* A list of all SVE ACLE functions. */ +static CONSTEXPR const function_group_info function_groups[] = { +#define DEF_SVE_FUNCTION(NAME, SHAPE, TYPES, PREDS) \ + { #NAME, &functions::NAME, &shapes::SHAPE, types_##TYPES, preds_##PREDS, \ + REQUIRED_EXTENSIONS | AARCH64_FL_SVE }, +#include "aarch64-sve-builtins.def" +}; + +/* The scalar type associated with each vector type. */ +GTY(()) tree scalar_types[NUM_VECTOR_TYPES]; + +/* The single-predicate and single-vector types, with their built-in + "__SV..._t" name. Allow an index of NUM_VECTOR_TYPES, which always + yields a null tree. */ +static GTY(()) tree abi_vector_types[NUM_VECTOR_TYPES + 1]; + +/* Same, but with the arm_sve.h "sv..._t" name. */ +GTY(()) tree acle_vector_types[MAX_TUPLE_SIZE][NUM_VECTOR_TYPES + 1]; + +/* The svpattern enum type. */ +GTY(()) tree acle_svpattern; + +/* The svprfop enum type. */ +GTY(()) tree acle_svprfop; + +/* The list of all registered function decls, indexed by code. */ +static GTY(()) vec *registered_functions; + +/* All registered function decls, hashed on the function_instance + that they implement. This is used for looking up implementations of + overloaded functions. */ +static hash_table *function_table; + +/* True if we've already complained about attempts to use functions + when the required extension is disabled. */ +static bool reported_missing_extension_p; + +/* If TYPE is an ACLE vector type, return the associated vector_type, + otherwise return NUM_VECTOR_TYPES. */ +static vector_type_index +find_vector_type (const_tree type) +{ + /* A linear search should be OK here, since the code isn't hot and + the number of types is only small. */ + type = TYPE_MAIN_VARIANT (type); + for (unsigned int i = 0; i < NUM_VECTOR_TYPES; ++i) + if (type == abi_vector_types[i]) + return vector_type_index (i); + return NUM_VECTOR_TYPES; +} + +/* If TYPE is a valid SVE element type, return the corresponding type + suffix, otherwise return NUM_TYPE_SUFFIXES. */ +static type_suffix_index +find_type_suffix_for_scalar_type (const_tree type) +{ + /* A linear search should be OK here, since the code isn't hot and + the number of types is only small. 
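To make the table wiring concrete, a .def entry along the lines of the one below (svadd is only an assumed example; the real entries live in the aarch64-sve-builtins*.def files) becomes a single function_group_info record:

   /* Assumed .def entry (sketch):
        DEF_SVE_FUNCTION (svadd, binary_opt_n, all_arith, mxz)
      which the DEF_SVE_FUNCTION macro above expands to:
        { "svadd", &functions::svadd, &shapes::binary_opt_n,
          types_all_arith, preds_mxz,
          REQUIRED_EXTENSIONS | AARCH64_FL_SVE },
      so one group record covers every type suffix listed in
      types_all_arith and every predication form in preds_mxz
      (_m, _x and _z).  */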
*/ + type = TYPE_MAIN_VARIANT (type); + for (unsigned int suffix_i = 0; suffix_i < NUM_TYPE_SUFFIXES; ++suffix_i) + if (!type_suffixes[suffix_i].bool_p) + { + vector_type_index vector_i = type_suffixes[suffix_i].vector_type; + if (type == TYPE_MAIN_VARIANT (scalar_types[vector_i])) + return type_suffix_index (suffix_i); + } + return NUM_TYPE_SUFFIXES; +} + +/* Report an error against LOCATION that the user has tried to use + function FNDECL when extension EXTENSION is disabled. */ +static void +report_missing_extension (location_t location, tree fndecl, + const char *extension) +{ + /* Avoid reporting a slew of messages for a single oversight. */ + if (reported_missing_extension_p) + return; + + error_at (location, "ACLE function %qD requires ISA extension %qs", + fndecl, extension); + inform (location, "you can enable %qs using the command-line" + " option %<-march%>, or by using the %" + " attribute or pragma", extension); + reported_missing_extension_p = true; +} + +/* Check whether all the AARCH64_FL_* values in REQUIRED_EXTENSIONS are + enabled, given that those extensions are required for function FNDECL. + Report an error against LOCATION if not. */ +static bool +check_required_extensions (location_t location, tree fndecl, + uint64_t required_extensions) +{ + uint64_t missing_extensions = required_extensions & ~aarch64_isa_flags; + if (missing_extensions == 0) + return true; + + static const struct { uint64_t flag; const char *name; } extensions[] = { +#define AARCH64_OPT_EXTENSION(EXT_NAME, FLAG_CANONICAL, FLAGS_ON, FLAGS_OFF, \ + SYNTHETIC, FEATURE_STRING) \ + { FLAG_CANONICAL, EXT_NAME }, +#include "aarch64-option-extensions.def" + }; + + for (unsigned int i = 0; i < ARRAY_SIZE (extensions); ++i) + if (missing_extensions & extensions[i].flag) + { + report_missing_extension (location, fndecl, extensions[i].name); + return false; + } + gcc_unreachable (); +} + +/* Report that LOCATION has a call to FNDECL in which argument ARGNO + was not an integer constant expression. ARGNO counts from zero. */ +static void +report_non_ice (location_t location, tree fndecl, unsigned int argno) +{ + error_at (location, "argument %d of %qE must be an integer constant" + " expression", argno + 1, fndecl); +} + +/* Report that LOCATION has a call to FNDECL in which argument ARGNO has + the value ACTUAL, whereas the function requires a value in the range + [MIN, MAX]. ARGNO counts from zero. */ +static void +report_out_of_range (location_t location, tree fndecl, unsigned int argno, + HOST_WIDE_INT actual, HOST_WIDE_INT min, + HOST_WIDE_INT max) +{ + error_at (location, "passing %wd to argument %d of %qE, which expects" + " a value in the range [%wd, %wd]", actual, argno + 1, fndecl, + min, max); +} + +/* Report that LOCATION has a call to FNDECL in which argument ARGNO has + the value ACTUAL, whereas the function requires either VALUE0 or + VALUE1. ARGNO counts from zero. */ +static void +report_neither_nor (location_t location, tree fndecl, unsigned int argno, + HOST_WIDE_INT actual, HOST_WIDE_INT value0, + HOST_WIDE_INT value1) +{ + error_at (location, "passing %wd to argument %d of %qE, which expects" + " either %wd or %wd", actual, argno + 1, fndecl, value0, value1); +} + +/* Report that LOCATION has a call to FNDECL in which argument ARGNO has + the value ACTUAL, whereas the function requires one of VALUE0..3. + ARGNO counts from zero. 
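For reference, the missing-extension diagnostic above reads roughly as follows; the intrinsic and architecture strings are placeholders chosen for illustration:

   /* Assumed compilation with SVE but not SVE2 enabled, calling an
      SVE2-only intrinsic (placeholder name):
        error: ACLE function 'svfoo_s32' requires ISA extension 'sve2'
        note: you can enable 'sve2' using the command-line option
              '-march', or by using the 'target' attribute or pragma
      Only the first such error is reported, because
      reported_missing_extension_p latches after the first report.  */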
*/ +static void +report_not_one_of (location_t location, tree fndecl, unsigned int argno, + HOST_WIDE_INT actual, HOST_WIDE_INT value0, + HOST_WIDE_INT value1, HOST_WIDE_INT value2, + HOST_WIDE_INT value3) +{ + error_at (location, "passing %wd to argument %d of %qE, which expects" + " %wd, %wd, %wd or %wd", actual, argno + 1, fndecl, value0, value1, + value2, value3); +} + +/* Report that LOCATION has a call to FNDECL in which argument ARGNO has + the value ACTUAL, whereas the function requires a valid value of + enum type ENUMTYPE. ARGNO counts from zero. */ +static void +report_not_enum (location_t location, tree fndecl, unsigned int argno, + HOST_WIDE_INT actual, tree enumtype) +{ + error_at (location, "passing %wd to argument %d of %qE, which expects" + " a valid %qT value", actual, argno + 1, fndecl, enumtype); +} + +/* Return a hash code for a function_instance. */ +hashval_t +function_instance::hash () const +{ + inchash::hash h; + /* BASE uniquely determines BASE_NAME, so we don't need to hash both. */ + h.add_ptr (base); + h.add_ptr (shape); + h.add_int (mode_suffix_id); + h.add_int (type_suffix_ids[0]); + h.add_int (type_suffix_ids[1]); + h.add_int (pred); + return h.end (); +} + +/* Return a set of CP_* flags that describe what the function could do, + taking the command-line flags into account. */ +unsigned int +function_instance::call_properties () const +{ + unsigned int flags = base->call_properties (*this); + + /* -fno-trapping-math means that we can assume any FP exceptions + are not user-visible. */ + if (!flag_trapping_math) + flags &= ~CP_RAISE_FP_EXCEPTIONS; + + return flags; +} + +/* Return true if calls to the function could read some form of + global state. */ +bool +function_instance::reads_global_state_p () const +{ + unsigned int flags = call_properties (); + + /* Preserve any dependence on rounding mode, flush to zero mode, etc. + There is currently no way of turning this off; in particular, + -fno-rounding-math (which is the default) means that we should make + the usual assumptions about rounding mode, which for intrinsics means + acting as the instructions do. */ + if (flags & CP_READ_FPCR) + return true; + + /* Handle direct reads of global state. */ + return flags & (CP_READ_MEMORY | CP_READ_FFR); +} + +/* Return true if calls to the function could modify some form of + global state. */ +bool +function_instance::modifies_global_state_p () const +{ + unsigned int flags = call_properties (); + + /* Preserve any exception state written back to the FPCR, + unless -fno-trapping-math says this is unnecessary. */ + if (flags & CP_RAISE_FP_EXCEPTIONS) + return true; + + /* Treat prefetches as modifying global state, since that's the + only means we have of keeping them in their correct position. */ + if (flags & CP_PREFETCH_MEMORY) + return true; + + /* Handle direct modifications of global state. */ + return flags & (CP_WRITE_MEMORY | CP_WRITE_FFR); +} + +/* Return true if calls to the function could raise a signal. */ +bool +function_instance::could_trap_p () const +{ + unsigned int flags = call_properties (); + + /* Handle functions that could raise SIGFPE. */ + if (flags & CP_RAISE_FP_EXCEPTIONS) + return true; + + /* Handle functions that could raise SIGBUS or SIGSEGV. 
*/ + if (flags & (CP_READ_MEMORY | CP_WRITE_MEMORY)) + return true; + + return false; +} + +inline hashval_t +registered_function_hasher::hash (value_type value) +{ + return value->instance.hash (); +} + +inline bool +registered_function_hasher::equal (value_type value, const compare_type &key) +{ + return value->instance == key; +} + +sve_switcher::sve_switcher () + : m_old_isa_flags (aarch64_isa_flags) +{ + /* Changing the ISA flags and have_regs_of_mode should be enough here. + We shouldn't need to pay the compile-time cost of a full target + switch. */ + aarch64_isa_flags = (AARCH64_FL_FP | AARCH64_FL_SIMD | AARCH64_FL_F16 + | AARCH64_FL_SVE); + + memcpy (m_old_have_regs_of_mode, have_regs_of_mode, + sizeof (have_regs_of_mode)); + for (int i = 0; i < NUM_MACHINE_MODES; ++i) + if (aarch64_sve_mode_p ((machine_mode) i)) + have_regs_of_mode[i] = true; +} + +sve_switcher::~sve_switcher () +{ + memcpy (have_regs_of_mode, m_old_have_regs_of_mode, + sizeof (have_regs_of_mode)); + aarch64_isa_flags = m_old_isa_flags; +} + +function_builder::function_builder () +{ + m_overload_type = build_function_type (void_type_node, void_list_node); + m_direct_overloads = lang_GNU_CXX (); + gcc_obstack_init (&m_string_obstack); +} + +function_builder::~function_builder () +{ + obstack_free (&m_string_obstack, NULL); +} + +/* Add NAME to the end of the function name being built. */ +void +function_builder::append_name (const char *name) +{ + obstack_grow (&m_string_obstack, name, strlen (name)); +} + +/* Zero-terminate and complete the function name being built. */ +char * +function_builder::finish_name () +{ + obstack_1grow (&m_string_obstack, 0); + return (char *) obstack_finish (&m_string_obstack); +} + +/* Return the overloaded or full function name for INSTANCE; OVERLOADED_P + selects which. Allocate the string on m_string_obstack; the caller + must use obstack_free to free it after use. */ +char * +function_builder::get_name (const function_instance &instance, + bool overloaded_p) +{ + append_name (instance.base_name); + if (overloaded_p) + switch (instance.displacement_units ()) + { + case UNITS_none: + break; + + case UNITS_bytes: + append_name ("_offset"); + break; + + case UNITS_elements: + append_name ("_index"); + break; + + case UNITS_vectors: + append_name ("_vnum"); + break; + } + else + append_name (instance.mode_suffix ().string); + for (unsigned int i = 0; i < 2; ++i) + if (!overloaded_p || instance.shape->explicit_type_suffix_p (i)) + append_name (instance.type_suffix (i).string); + append_name (pred_suffixes[instance.pred]); + return finish_name (); +} + +/* Add attribute NAME to ATTRS. */ +static tree +add_attribute (const char *name, tree attrs) +{ + return tree_cons (get_identifier (name), NULL_TREE, attrs); +} + +/* Return the appropriate function attributes for INSTANCE. */ +tree +function_builder::get_attributes (const function_instance &instance) +{ + tree attrs = NULL_TREE; + + if (!instance.modifies_global_state_p ()) + { + if (instance.reads_global_state_p ()) + attrs = add_attribute ("pure", attrs); + else + attrs = add_attribute ("const", attrs); + } + + if (!flag_non_call_exceptions || !instance.could_trap_p ()) + attrs = add_attribute ("nothrow", attrs); + + return add_attribute ("leaf", attrs); +} + +/* Add a function called NAME with type FNTYPE and attributes ATTRS. + INSTANCE describes what the function does and OVERLOADED_P indicates + whether it is overloaded. REQUIRED_EXTENSIONS are the set of + architecture extensions that the function requires. 
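Putting get_name and get_attributes together, a single function_instance produces names along the following lines; the svadd instance is an assumed example, chosen because its shape leaves the type suffix to be inferred:

   /* Assumed instance: base name "svadd", mode suffix "_n",
      type suffixes ("_f32", none), predication PRED_m.

        get_name (instance, false)  ->  "svadd_n_f32_m"  (full name)
        get_name (instance, true)   ->  "svadd_m"        (overloaded name:
          the mode suffix has no displacement units and the type suffix
          is not explicit for this shape, so both are dropped)

      get_attributes always adds "leaf", adds "nothrow" when the call
      cannot trap (or -fnon-call-exceptions is off), and adds "const"
      or "pure" only when the call does not modify global state such as
      memory, the FFR or user-visible FP exception state.  */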
*/ +registered_function & +function_builder::add_function (const function_instance &instance, + const char *name, tree fntype, tree attrs, + uint64_t required_extensions, + bool overloaded_p) +{ + unsigned int code = vec_safe_length (registered_functions); + code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_SVE; + tree decl = simulate_builtin_function_decl (input_location, name, fntype, + code, NULL, attrs); + + registered_function &rfn = *ggc_alloc (); + rfn.instance = instance; + rfn.decl = decl; + rfn.required_extensions = required_extensions; + rfn.overloaded_p = overloaded_p; + vec_safe_push (registered_functions, &rfn); + + return rfn; +} + +/* Add a built-in function for INSTANCE, with the argument types given + by ARGUMENT_TYPES and the return type given by RETURN_TYPE. + REQUIRED_EXTENSIONS are the set of architecture extensions that the + function requires. FORCE_DIRECT_OVERLOADS is true if there is a + one-to-one mapping between "short" and "full" names, and if standard + overload resolution therefore isn't necessary. */ +void +function_builder::add_unique_function (const function_instance &instance, + tree return_type, + vec &argument_types, + uint64_t required_extensions, + bool force_direct_overloads) +{ + /* Add the function under its full (unique) name. */ + char *name = get_name (instance, false); + tree fntype = build_function_type_array (return_type, + argument_types.length (), + argument_types.address ()); + tree attrs = get_attributes (instance); + registered_function &rfn = add_function (instance, name, fntype, attrs, + required_extensions, false); + + /* Enter the function into the hash table. */ + hashval_t hash = instance.hash (); + registered_function **rfn_slot + = function_table->find_slot_with_hash (instance, hash, INSERT); + gcc_assert (!*rfn_slot); + *rfn_slot = &rfn; + + /* Also add the function under its overloaded alias, if we want + a separate decl for each instance of an overloaded function. */ + if (m_direct_overloads || force_direct_overloads) + { + char *overload_name = get_name (instance, true); + if (strcmp (name, overload_name) != 0) + { + /* Attribute lists shouldn't be shared. */ + tree attrs = get_attributes (instance); + add_function (instance, overload_name, fntype, attrs, + required_extensions, false); + } + } + + obstack_free (&m_string_obstack, name); +} + +/* Add one function decl for INSTANCE, to be used with manual overload + resolution. REQUIRED_EXTENSIONS are the set of architecture extensions + that the function requires. + + For simplicity, deal with duplicate attempts to add the same function, + including cases in which the new function requires more features than + the original one did. In that case we'll check whether the required + features are available as part of resolving the function to the + relevant unique function. 
*/ +void +function_builder::add_overloaded_function (const function_instance &instance, + uint64_t required_extensions) +{ + char *name = get_name (instance, true); + if (registered_function **map_value = m_overload_names.get (name)) + gcc_assert ((*map_value)->instance == instance + && ((*map_value)->required_extensions + & ~required_extensions) == 0); + else + { + registered_function &rfn + = add_function (instance, name, m_overload_type, NULL_TREE, + required_extensions, true); + const char *permanent_name = IDENTIFIER_POINTER (DECL_NAME (rfn.decl)); + m_overload_names.put (permanent_name, &rfn); + } + obstack_free (&m_string_obstack, name); +} + +/* If we are using manual overload resolution, add one function decl + for each overloaded function in GROUP. Take the function base name + from GROUP and the mode from MODE. */ +void +function_builder::add_overloaded_functions (const function_group_info &group, + mode_suffix_index mode) +{ + if (m_direct_overloads) + return; + + unsigned int explicit_type0 = (*group.shape)->explicit_type_suffix_p (0); + unsigned int explicit_type1 = (*group.shape)->explicit_type_suffix_p (1); + for (unsigned int pi = 0; group.preds[pi] != NUM_PREDS; ++pi) + { + if (!explicit_type0 && !explicit_type1) + { + /* Deal with the common case in which there is one overloaded + function for all type combinations. */ + function_instance instance (group.base_name, *group.base, + *group.shape, mode, types_none[0], + group.preds[pi]); + add_overloaded_function (instance, group.required_extensions); + } + else + for (unsigned int ti = 0; group.types[ti][0] != NUM_TYPE_SUFFIXES; + ++ti) + { + /* Stub out the types that are determined by overload + resolution. */ + type_suffix_pair types = { + explicit_type0 ? group.types[ti][0] : NUM_TYPE_SUFFIXES, + explicit_type1 ? group.types[ti][1] : NUM_TYPE_SUFFIXES + }; + function_instance instance (group.base_name, *group.base, + *group.shape, mode, types, + group.preds[pi]); + add_overloaded_function (instance, group.required_extensions); + } + } +} + +/* Register all the functions in GROUP. */ +void +function_builder::register_function_group (const function_group_info &group) +{ + (*group.shape)->build (*this, group); +} + +function_call_info::function_call_info (location_t location_in, + const function_instance &instance_in, + tree fndecl_in) + : function_instance (instance_in), location (location_in), fndecl (fndecl_in) +{ +} + +function_resolver::function_resolver (location_t location, + const function_instance &instance, + tree fndecl, vec &arglist) + : function_call_info (location, instance, fndecl), m_arglist (arglist) +{ +} + +/* Return the vector type associated with type suffix TYPE. */ +tree +function_resolver::get_vector_type (type_suffix_index type) +{ + return acle_vector_types[0][type_suffixes[type].vector_type]; +} + +/* Return the name associated with TYPE. Using the + name should be more user-friendly than the underlying canonical type, + since it makes the signedness and bitwidth explicit. */ +const char * +function_resolver::get_scalar_type_name (type_suffix_index type) +{ + return vector_types[type_suffixes[type].vector_type].acle_name + 2; +} + +/* Return the type of argument I, or error_mark_node if it isn't + well-formed. */ +tree +function_resolver::get_argument_type (unsigned int i) +{ + tree arg = m_arglist[i]; + return arg == error_mark_node ? arg : TREE_TYPE (arg); +} + +/* Return true if argument I is some form of scalar value. 
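A small worked example of the helpers above, obtained just by tracing the code with the tables earlier in this file:

   /* For TYPE_SUFFIX_s32:
        get_vector_type (TYPE_SUFFIX_s32)
          -> acle_vector_types[0][VECTOR_TYPE_svint32_t]  (i.e. svint32_t)
        get_scalar_type_name (TYPE_SUFFIX_s32)
          -> "svint32_t" + 2  ->  "int32_t"
      so diagnostics can say "int32_t" rather than "int" or "long int",
      keeping the signedness and bit width explicit.  */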
*/ +bool +function_resolver::scalar_argument_p (unsigned int i) +{ + tree type = get_argument_type (i); + return (INTEGRAL_TYPE_P (type) + /* Allow pointer types, leaving the frontend to warn where + necessary. */ + || POINTER_TYPE_P (type) + || SCALAR_FLOAT_TYPE_P (type)); +} + +/* Report that the function has no form that takes type suffix TYPE. + Return error_mark_node. */ +tree +function_resolver::report_no_such_form (type_suffix_index type) +{ + error_at (location, "%qE has no form that takes %qT arguments", + fndecl, get_vector_type (type)); + return error_mark_node; +} + +/* Silently check whether there is an instance of the function with the + mode suffix given by MODE and the type suffixes given by TYPE0 and TYPE1. + Return its function decl if so, otherwise return null. */ +tree +function_resolver::lookup_form (mode_suffix_index mode, + type_suffix_index type0, + type_suffix_index type1) +{ + type_suffix_pair types = { type0, type1 }; + function_instance instance (base_name, base, shape, mode, types, pred); + registered_function *rfn + = function_table->find_with_hash (instance, instance.hash ()); + return rfn ? rfn->decl : NULL_TREE; +} + +/* Resolve the function to one with the mode suffix given by MODE and the + type suffixes given by TYPE0 and TYPE1. Return its function decl on + success, otherwise report an error and return error_mark_node. */ +tree +function_resolver::resolve_to (mode_suffix_index mode, + type_suffix_index type0, + type_suffix_index type1) +{ + tree res = lookup_form (mode, type0, type1); + if (!res) + { + if (type1 == NUM_TYPE_SUFFIXES) + return report_no_such_form (type0); + if (type0 == type_suffix_ids[0]) + return report_no_such_form (type1); + /* To be filled in when we have other cases. */ + gcc_unreachable (); + } + return res; +} + +/* Require argument ARGNO to be a 32-bit or 64-bit scalar integer type. + Return the associated type suffix on success, otherwise report an + error and return NUM_TYPE_SUFFIXES. */ +type_suffix_index +function_resolver::infer_integer_scalar_type (unsigned int argno) +{ + tree actual = get_argument_type (argno); + if (actual == error_mark_node) + return NUM_TYPE_SUFFIXES; + + /* Allow enums and booleans to decay to integers, for compatibility + with C++ overloading rules. */ + if (INTEGRAL_TYPE_P (actual)) + { + bool uns_p = TYPE_UNSIGNED (actual); + /* Honor the usual integer promotions, so that resolution works + in the same way as for C++. */ + if (TYPE_PRECISION (actual) < 32) + return TYPE_SUFFIX_s32; + if (TYPE_PRECISION (actual) == 32) + return uns_p ? TYPE_SUFFIX_u32 : TYPE_SUFFIX_s32; + if (TYPE_PRECISION (actual) == 64) + return uns_p ? TYPE_SUFFIX_u64 : TYPE_SUFFIX_s64; + } + + error_at (location, "passing %qT to argument %d of %qE, which expects" + " a 32-bit or 64-bit integer type", actual, argno + 1, fndecl); + return NUM_TYPE_SUFFIXES; +} + +/* Require argument ARGNO to be a pointer to a scalar type that has a + corresponding type suffix. Return that type suffix on success, + otherwise report an error and return NUM_TYPE_SUFFIXES. + GATHER_SCATTER_P is true if the function is a gather/scatter + operation, and so requires a pointer to 32-bit or 64-bit data. 
*/ +type_suffix_index +function_resolver::infer_pointer_type (unsigned int argno, + bool gather_scatter_p) +{ + tree actual = get_argument_type (argno); + if (actual == error_mark_node) + return NUM_TYPE_SUFFIXES; + + if (TREE_CODE (actual) != POINTER_TYPE) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a pointer type", actual, argno + 1, fndecl); + if (VECTOR_TYPE_P (actual) && gather_scatter_p) + inform (location, "an explicit type suffix is needed" + " when using a vector of base addresses"); + return NUM_TYPE_SUFFIXES; + } + + tree target = TREE_TYPE (actual); + type_suffix_index type = find_type_suffix_for_scalar_type (target); + if (type == NUM_TYPE_SUFFIXES) + { + error_at (location, "passing %qT to argument %d of %qE, but %qT is not" + " a valid SVE element type", actual, argno + 1, fndecl, + build_qualified_type (target, 0)); + return NUM_TYPE_SUFFIXES; + } + unsigned int bits = type_suffixes[type].element_bits; + if (gather_scatter_p && bits != 32 && bits != 64) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a pointer to 32-bit or 64-bit elements", + actual, argno + 1, fndecl); + return NUM_TYPE_SUFFIXES; + } + + return type; +} + +/* Require argument ARGNO to be a single vector or a tuple of NUM_VECTORS + vectors; NUM_VECTORS is 1 for the former. Return the associated type + suffix on success, using TYPE_SUFFIX_b for predicates. Report an error + and return NUM_TYPE_SUFFIXES on failure. */ +type_suffix_index +function_resolver::infer_vector_or_tuple_type (unsigned int argno, + unsigned int num_vectors) +{ + tree actual = get_argument_type (argno); + if (actual == error_mark_node) + return NUM_TYPE_SUFFIXES; + + /* A linear search should be OK here, since the code isn't hot and + the number of types is only small. */ + for (unsigned int size_i = 0; size_i < MAX_TUPLE_SIZE; ++size_i) + for (unsigned int suffix_i = 0; suffix_i < NUM_TYPE_SUFFIXES; ++suffix_i) + { + vector_type_index type_i = type_suffixes[suffix_i].vector_type; + tree type = acle_vector_types[size_i][type_i]; + if (type && TYPE_MAIN_VARIANT (actual) == TYPE_MAIN_VARIANT (type)) + { + if (size_i + 1 == num_vectors) + return type_suffix_index (suffix_i); + + if (num_vectors == 1) + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a single SVE vector rather than a tuple", + actual, argno + 1, fndecl); + else if (size_i == 0 && type_i != VECTOR_TYPE_svbool_t) + error_at (location, "passing single vector %qT to argument %d" + " of %qE, which expects a tuple of %d vectors", + actual, argno + 1, fndecl, num_vectors); + else + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a tuple of %d vectors", actual, argno + 1, + fndecl, num_vectors); + return NUM_TYPE_SUFFIXES; + } + } + + if (num_vectors == 1) + error_at (location, "passing %qT to argument %d of %qE, which" + " expects an SVE vector type", actual, argno + 1, fndecl); + else + error_at (location, "passing %qT to argument %d of %qE, which" + " expects an SVE tuple type", actual, argno + 1, fndecl); + return NUM_TYPE_SUFFIXES; +} + +/* Require argument ARGNO to have some form of vector type. Return the + associated type suffix on success, using TYPE_SUFFIX_b for predicates. + Report an error and return NUM_TYPE_SUFFIXES on failure. */ +type_suffix_index +function_resolver::infer_vector_type (unsigned int argno) +{ + return infer_vector_or_tuple_type (argno, 1); +} + +/* Like infer_vector_type, but also require the type to be integral. 
*/ +type_suffix_index +function_resolver::infer_integer_vector_type (unsigned int argno) +{ + type_suffix_index type = infer_vector_type (argno); + if (type == NUM_TYPE_SUFFIXES) + return type; + + if (!type_suffixes[type].integer_p) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector of integers", get_argument_type (argno), + argno + 1, fndecl); + return NUM_TYPE_SUFFIXES; + } + + return type; +} + +/* Like infer_vector_type, but also require the type to be an unsigned + integer. */ +type_suffix_index +function_resolver::infer_unsigned_vector_type (unsigned int argno) +{ + type_suffix_index type = infer_vector_type (argno); + if (type == NUM_TYPE_SUFFIXES) + return type; + + if (!type_suffixes[type].unsigned_p) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector of unsigned integers", + get_argument_type (argno), argno + 1, fndecl); + return NUM_TYPE_SUFFIXES; + } + + return type; +} + +/* Like infer_vector_type, but also require the element size to be + 32 or 64 bits. */ +type_suffix_index +function_resolver::infer_sd_vector_type (unsigned int argno) +{ + type_suffix_index type = infer_vector_type (argno); + if (type == NUM_TYPE_SUFFIXES) + return type; + + unsigned int bits = type_suffixes[type].element_bits; + if (bits != 32 && bits != 64) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector of 32-bit or 64-bit elements", + get_argument_type (argno), argno + 1, fndecl); + return NUM_TYPE_SUFFIXES; + } + + return type; +} + +/* If the function operates on tuples of vectors, require argument ARGNO to be + a tuple with the appropriate number of vectors, otherwise require it to be + a single vector. Return the associated type suffix on success, using + TYPE_SUFFIX_b for predicates. Report an error and return NUM_TYPE_SUFFIXES + on failure. */ +type_suffix_index +function_resolver::infer_tuple_type (unsigned int argno) +{ + return infer_vector_or_tuple_type (argno, vectors_per_tuple ()); +} + +/* Require argument ARGNO to be a vector or scalar argument. Return true + if it is, otherwise report an appropriate error. */ +bool +function_resolver::require_vector_or_scalar_type (unsigned int argno) +{ + tree actual = get_argument_type (argno); + if (actual == error_mark_node) + return false; + + if (!scalar_argument_p (argno) && !VECTOR_TYPE_P (actual)) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector or scalar type", actual, argno + 1, fndecl); + return false; + } + + return true; +} + +/* Require argument ARGNO to have vector type TYPE, in cases where this + requirement holds for all uses of the function. Return true if the + argument has the right form, otherwise report an appropriate error. */ +bool +function_resolver::require_vector_type (unsigned int argno, + vector_type_index type) +{ + tree expected = acle_vector_types[0][type]; + tree actual = get_argument_type (argno); + if (actual != error_mark_node + && TYPE_MAIN_VARIANT (expected) != TYPE_MAIN_VARIANT (actual)) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects %qT", actual, argno + 1, fndecl, expected); + return false; + } + return true; +} + +/* Like require_vector_type, but TYPE is inferred from previous arguments + rather than being a fixed part of the function signature. This changes + the nature of the error messages. 
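To see the difference in practice, consider an assumed overloaded call in user code (svadd_m is used purely as an example of a shape in which all vector arguments must match):

   /* Assumed user code:
        svint32_t f (svbool_t pg, svint32_t a, svint64_t b)
        {
          return svadd_m (pg, a, b);   // mismatched element types
        }
      require_vector_type would say that an argument "expects svbool_t"
      (a fixed part of the signature), whereas
      require_matching_vector_type phrases the mismatch relative to what
      was already inferred, roughly:
        passing 'svint64_t' to argument 3 of 'svadd_m', but previous
        arguments had type 'svint32_t'  */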
*/ +bool +function_resolver::require_matching_vector_type (unsigned int argno, + type_suffix_index type) +{ + type_suffix_index new_type = infer_vector_type (argno); + if (new_type == NUM_TYPE_SUFFIXES) + return false; + + if (type != new_type) + { + error_at (location, "passing %qT to argument %d of %qE, but" + " previous arguments had type %qT", + get_vector_type (new_type), argno + 1, fndecl, + get_vector_type (type)); + return false; + } + return true; +} + +/* Require argument ARGNO to be a vector type with the following properties: + + - the type class must be the same as FIRST_TYPE's if EXPECTED_TCLASS + is SAME_TYPE_CLASS, otherwise it must be EXPECTED_TCLASS itself. + + - the element size must be: + + - the same as FIRST_TYPE's if EXPECTED_BITS == SAME_SIZE + - half of FIRST_TYPE's if EXPECTED_BITS == HALF_SIZE + - a quarter of FIRST_TYPE's if EXPECTED_BITS == QUARTER_SIZE + - EXPECTED_BITS itself otherwise + + Return true if the argument has the required type, otherwise report + an appropriate error. + + FIRST_ARGNO is the first argument that is known to have type FIRST_TYPE. + Usually it comes before ARGNO, but sometimes it is more natural to resolve + arguments out of order. + + If the required properties depend on FIRST_TYPE then both FIRST_ARGNO and + ARGNO contribute to the resolution process. If the required properties + are fixed, only FIRST_ARGNO contributes to the resolution process. + + This function is a bit of a Swiss army knife. The complication comes + from trying to give good error messages when FIRST_ARGNO and ARGNO are + inconsistent, since either of them might be wrong. */ +bool function_resolver:: +require_derived_vector_type (unsigned int argno, + unsigned int first_argno, + type_suffix_index first_type, + type_class_index expected_tclass, + unsigned int expected_bits) +{ + /* If the type needs to match FIRST_ARGNO exactly, use the preferred + error message for that case. The VECTOR_TYPE_P test excludes tuple + types, which we handle below instead. */ + bool both_vectors_p = VECTOR_TYPE_P (get_argument_type (first_argno)); + if (both_vectors_p + && expected_tclass == SAME_TYPE_CLASS + && expected_bits == SAME_SIZE) + { + /* There's no need to resolve this case out of order. */ + gcc_assert (argno > first_argno); + return require_matching_vector_type (argno, first_type); + } + + /* Use FIRST_TYPE to get the expected type class and element size. */ + type_class_index orig_expected_tclass = expected_tclass; + if (expected_tclass == NUM_TYPE_CLASSES) + expected_tclass = type_suffixes[first_type].tclass; + + unsigned int orig_expected_bits = expected_bits; + if (expected_bits == SAME_SIZE) + expected_bits = type_suffixes[first_type].element_bits; + else if (expected_bits == HALF_SIZE) + expected_bits = type_suffixes[first_type].element_bits / 2; + else if (expected_bits == QUARTER_SIZE) + expected_bits = type_suffixes[first_type].element_bits / 4; + + /* If the expected type doesn't depend on FIRST_TYPE at all, + just check for the fixed choice of vector type. */ + if (expected_tclass == orig_expected_tclass + && expected_bits == orig_expected_bits) + { + const type_suffix_info &expected_suffix + = type_suffixes[find_type_suffix (expected_tclass, expected_bits)]; + return require_vector_type (argno, expected_suffix.vector_type); + } + + /* Require the argument to be some form of SVE vector type, + without being specific about the type of vector we want. 
*/ + type_suffix_index actual_type = infer_vector_type (argno); + if (actual_type == NUM_TYPE_SUFFIXES) + return false; + + /* Exit now if we got the right type. */ + bool tclass_ok_p = (type_suffixes[actual_type].tclass == expected_tclass); + bool size_ok_p = (type_suffixes[actual_type].element_bits == expected_bits); + if (tclass_ok_p && size_ok_p) + return true; + + /* First look for cases in which the actual type contravenes a fixed + size requirement, without having to refer to FIRST_TYPE. */ + if (!size_ok_p && expected_bits == orig_expected_bits) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector of %d-bit elements", + get_vector_type (actual_type), argno + 1, fndecl, + expected_bits); + return false; + } + + /* Likewise for a fixed type class requirement. This is only ever + needed for signed and unsigned types, so don't create unnecessary + translation work for other type classes. */ + if (!tclass_ok_p && orig_expected_tclass == TYPE_signed) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector of signed integers", + get_vector_type (actual_type), argno + 1, fndecl); + return false; + } + if (!tclass_ok_p && orig_expected_tclass == TYPE_unsigned) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector of unsigned integers", + get_vector_type (actual_type), argno + 1, fndecl); + return false; + } + + /* Make sure that FIRST_TYPE itself is sensible before using it + as a basis for an error message. */ + if (resolve_to (mode_suffix_id, first_type) == error_mark_node) + return false; + + /* If the arguments have consistent type classes, but a link between + the sizes has been broken, try to describe the error in those terms. */ + if (both_vectors_p && tclass_ok_p && orig_expected_bits == SAME_SIZE) + { + if (argno < first_argno) + { + std::swap (argno, first_argno); + std::swap (actual_type, first_type); + } + error_at (location, "arguments %d and %d of %qE must have the" + " same element size, but the values passed here have type" + " %qT and %qT respectively", first_argno + 1, argno + 1, + fndecl, get_vector_type (first_type), + get_vector_type (actual_type)); + return false; + } + + /* Likewise in reverse: look for cases in which the sizes are consistent + but a link between the type classes has been broken. */ + if (both_vectors_p + && size_ok_p + && orig_expected_tclass == SAME_TYPE_CLASS + && type_suffixes[first_type].integer_p + && type_suffixes[actual_type].integer_p) + { + if (argno < first_argno) + { + std::swap (argno, first_argno); + std::swap (actual_type, first_type); + } + error_at (location, "arguments %d and %d of %qE must have the" + " same signedness, but the values passed here have type" + " %qT and %qT respectively", first_argno + 1, argno + 1, + fndecl, get_vector_type (first_type), + get_vector_type (actual_type)); + return false; + } + + /* The two arguments are wildly inconsistent. */ + type_suffix_index expected_type + = find_type_suffix (expected_tclass, expected_bits); + error_at (location, "passing %qT instead of the expected %qT to argument" + " %d of %qE, after passing %qT to argument %d", + get_vector_type (actual_type), get_vector_type (expected_type), + argno + 1, fndecl, get_argument_type (first_argno), + first_argno + 1); + return false; +} + +/* Require argument ARGNO to match argument FIRST_ARGNO, which was inferred + to be a pointer to a scalar element of type TYPE. 
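A sketch of how the HALF_SIZE and QUARTER_SIZE relations are intended to be used by a shape; the dot-product-style numbers below are an assumed example rather than a quote from the shape code:

   /* Assumed call from a shape whose accumulator argument 0 was
      inferred to have type suffix s32 and whose argument 2 must be a
      signed vector with elements a quarter of the accumulator's size:
        require_derived_vector_type (2, 0, TYPE_SUFFIX_s32,
                                     SAME_TYPE_CLASS, QUARTER_SIZE);
      expected_bits becomes 32 / 4 = 8, so an svint16_t argument is
      rejected, and because the requirement depends on FIRST_TYPE the
      error names both the expected svint8_t and the type that
      argument 1 actually had.  */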
*/ +bool +function_resolver::require_matching_pointer_type (unsigned int argno, + unsigned int first_argno, + type_suffix_index type) +{ + type_suffix_index new_type = infer_pointer_type (argno); + if (new_type == NUM_TYPE_SUFFIXES) + return false; + + if (type != new_type) + { + error_at (location, "passing %qT to argument %d of %qE, but" + " argument %d had type %qT", get_argument_type (argno), + argno + 1, fndecl, first_argno + 1, + get_argument_type (first_argno)); + return false; + } + return true; +} + +/* Require argument ARGNO to be a (possibly variable) scalar, using EXPECTED + as the name of its expected type. Return true if the argument has the + right form, otherwise report an appropriate error. */ +bool +function_resolver::require_scalar_type (unsigned int argno, + const char *expected) +{ + if (!scalar_argument_p (argno)) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects %qs", get_argument_type (argno), argno + 1, + fndecl, expected); + return false; + } + return true; +} + +/* Require argument ARGNO to be some form of pointer, without being specific + about its target type. Return true if the argument has the right form, + otherwise report an appropriate error. */ +bool +function_resolver::require_pointer_type (unsigned int argno) +{ + if (!scalar_argument_p (argno)) + { + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a scalar pointer", get_argument_type (argno), + argno + 1, fndecl); + return false; + } + return true; +} + +/* Argument FIRST_ARGNO is a scalar with type EXPECTED_TYPE, and argument + ARGNO should be consistent with it. Return true if it is, otherwise + report an appropriate error. */ +bool function_resolver:: +require_matching_integer_scalar_type (unsigned int argno, + unsigned int first_argno, + type_suffix_index expected_type) +{ + type_suffix_index actual_type = infer_integer_scalar_type (argno); + if (actual_type == NUM_TYPE_SUFFIXES) + return false; + + if (actual_type == expected_type) + return true; + + error_at (location, "call to %qE is ambiguous; argument %d has type" + " %qs but argument %d has type %qs", fndecl, + first_argno + 1, get_scalar_type_name (expected_type), + argno + 1, get_scalar_type_name (actual_type)); + return false; +} + +/* Require argument ARGNO to be a (possibly variable) scalar, expecting it + to have the following properties: + + - the type class must be the same as for type suffix 0 if EXPECTED_TCLASS + is SAME_TYPE_CLASS, otherwise it must be EXPECTED_TCLASS itself. + + - the element size must be the same as for type suffix 0 if EXPECTED_BITS + is SAME_TYPE_SIZE, otherwise it must be EXPECTED_BITS itself. + + Return true if the argument is valid, otherwise report an appropriate error. + + Note that we don't check whether the scalar type actually has the required + properties, since that's subject to implicit promotions and conversions. + Instead we just use the expected properties to tune the error message. */ +bool function_resolver:: +require_derived_scalar_type (unsigned int argno, + type_class_index expected_tclass, + unsigned int expected_bits) +{ + gcc_assert (expected_tclass == SAME_TYPE_CLASS + || expected_tclass == TYPE_signed + || expected_tclass == TYPE_unsigned); + + /* If the expected type doesn't depend on the type suffix at all, + just check for the fixed choice of scalar type. 
*/ + if (expected_tclass != SAME_TYPE_CLASS && expected_bits != SAME_SIZE) + { + type_suffix_index expected_type + = find_type_suffix (expected_tclass, expected_bits); + return require_scalar_type (argno, get_scalar_type_name (expected_type)); + } + + if (scalar_argument_p (argno)) + return true; + + if (expected_tclass == SAME_TYPE_CLASS) + /* It doesn't really matter whether the element is expected to be + the same size as type suffix 0. */ + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a scalar element", get_argument_type (argno), + argno + 1, fndecl); + else + /* It doesn't seem useful to distinguish between signed and unsigned + scalars here. */ + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a scalar integer", get_argument_type (argno), + argno + 1, fndecl); + return false; +} + +/* Require argument ARGNO to be suitable for an integer constant expression. + Return true if it is, otherwise report an appropriate error. + + function_checker checks whether the argument is actually constant and + has a suitable range. The reason for distinguishing immediate arguments + here is because it provides more consistent error messages than + require_scalar_type would. */ +bool +function_resolver::require_integer_immediate (unsigned int argno) +{ + if (!scalar_argument_p (argno)) + { + report_non_ice (location, fndecl, argno); + return false; + } + return true; +} + +/* Require argument ARGNO to be a vector base in a gather-style address. + Return its type on success, otherwise return NUM_VECTOR_TYPES. */ +vector_type_index +function_resolver::infer_vector_base_type (unsigned int argno) +{ + type_suffix_index type = infer_vector_type (argno); + if (type == NUM_TYPE_SUFFIXES) + return NUM_VECTOR_TYPES; + + if (type == TYPE_SUFFIX_u32 || type == TYPE_SUFFIX_u64) + return type_suffixes[type].vector_type; + + error_at (location, "passing %qT to argument %d of %qE, which" + " expects %qs or %qs", get_argument_type (argno), + argno + 1, fndecl, "svuint32_t", "svuint64_t"); + return NUM_VECTOR_TYPES; +} + +/* Require argument ARGNO to be a vector displacement in a gather-style + address. Return its type on success, otherwise return NUM_VECTOR_TYPES. */ +vector_type_index +function_resolver::infer_vector_displacement_type (unsigned int argno) +{ + type_suffix_index type = infer_integer_vector_type (argno); + if (type == NUM_TYPE_SUFFIXES) + return NUM_VECTOR_TYPES; + + if (type_suffixes[type].integer_p + && (type_suffixes[type].element_bits == 32 + || type_suffixes[type].element_bits == 64)) + return type_suffixes[type].vector_type; + + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector of 32-bit or 64-bit integers", + get_argument_type (argno), argno + 1, fndecl); + return NUM_VECTOR_TYPES; +} + +/* Require argument ARGNO to be a vector displacement in a gather-style + address. There are three possible uses: + + - for loading into elements of type TYPE (when LOAD_P is true) + - for storing from elements of type TYPE (when LOAD_P is false) + - for prefetching data (when TYPE is NUM_TYPE_SUFFIXES) + + The overloaded function's mode suffix determines the units of the + displacement (bytes for "_offset", elements for "_index"). + + Return the associated mode on success, otherwise report an error + and return MODE_none. 
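As a rough illustration of the address forms handled by the gather resolvers below; the intrinsic spellings follow the usual ACLE naming scheme but should be treated as assumed examples:

   /* Assumed user-level forms for a 32-bit gather load:
        svld1_gather_u32base_u32 (pg, bases);           vector base,
                                                        no displacement
        svld1_gather_u32base_offset_u32 (pg, bases, o); vector base,
                                                        scalar byte offset
        svld1_gather_s32offset_u32 (pg, ptr, offsets);  scalar base,
                                                        vector byte offsets
      The overloaded svld1_gather entry points resolve to one of these
      by inferring the base and displacement types and then searching
      mode_suffixes for an entry with the matching base vector type,
      displacement vector type and displacement units.  */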
*/ +mode_suffix_index +function_resolver::resolve_sv_displacement (unsigned int argno, + type_suffix_index type, + bool load_p) +{ + if (type == NUM_TYPE_SUFFIXES) + { + /* For prefetches, the base is a void pointer and the displacement + can be any valid offset or index type. */ + vector_type_index displacement_vector_type + = infer_vector_displacement_type (argno); + if (displacement_vector_type == NUM_VECTOR_TYPES) + return MODE_none; + + mode_suffix_index mode = find_mode_suffix (NUM_VECTOR_TYPES, + displacement_vector_type, + displacement_units ()); + gcc_assert (mode != MODE_none); + return mode; + } + + unsigned int required_bits = type_suffixes[type].element_bits; + if (required_bits == 32 + && displacement_units () == UNITS_elements + && !lookup_form (MODE_s32index, type) + && !lookup_form (MODE_u32index, type)) + { + if (lookup_form (MODE_u32base_index, type)) + { + if (type_suffix_ids[0] == NUM_TYPE_SUFFIXES) + { + gcc_assert (!load_p); + error_at (location, "when storing %qT, %qE requires a vector" + " base and a scalar index", get_vector_type (type), + fndecl); + } + else + error_at (location, "%qE requires a vector base and a scalar" + " index", fndecl); + } + else + error_at (location, "%qE does not support 32-bit vector type %qT", + fndecl, get_vector_type (type)); + return MODE_none; + } + + /* Check for some form of vector type, without naming any in particular + as being expected. */ + type_suffix_index displacement_type = infer_vector_type (argno); + if (displacement_type == NUM_TYPE_SUFFIXES) + return MODE_none; + + /* If the displacement type is consistent with the data vector type, + try to find the associated mode suffix. This will fall through + for non-integral displacement types. */ + if (type_suffixes[displacement_type].element_bits == required_bits) + { + vector_type_index displacement_vector_type + = type_suffixes[displacement_type].vector_type; + mode_suffix_index mode = find_mode_suffix (NUM_VECTOR_TYPES, + displacement_vector_type, + displacement_units ()); + if (mode != MODE_none) + { + if (mode == MODE_s32offset + && !lookup_form (mode, type) + && lookup_form (MODE_u32offset, type)) + { + if (type_suffix_ids[0] == NUM_TYPE_SUFFIXES) + error_at (location, "%qE does not support 32-bit sign-extended" + " offsets", fndecl); + else + error_at (location, "%qE does not support sign-extended" + " offsets", fndecl); + return MODE_none; + } + return mode; + } + } + + if (type_suffix_ids[0] == NUM_TYPE_SUFFIXES) + { + /* TYPE has been inferred rather than specified by the user, + so mention it in the error messages. */ + if (load_p) + error_at (location, "passing %qT to argument %d of %qE, which when" + " loading %qT expects a vector of %d-bit integers", + get_argument_type (argno), argno + 1, fndecl, + get_vector_type (type), required_bits); + else + error_at (location, "passing %qT to argument %d of %qE, which when" + " storing %qT expects a vector of %d-bit integers", + get_argument_type (argno), argno + 1, fndecl, + get_vector_type (type), required_bits); + } + else + /* TYPE is part of the function name. */ + error_at (location, "passing %qT to argument %d of %qE, which" + " expects a vector of %d-bit integers", + get_argument_type (argno), argno + 1, fndecl, required_bits); + return MODE_none; +} + +/* Require the arguments starting at ARGNO to form a gather-style address. 
+ There are three possible uses: + + - for loading into elements of type TYPE (when LOAD_P is true) + - for storing from elements of type TYPE (when LOAD_P is false) + - for prefetching data (when TYPE is NUM_TYPE_SUFFIXES) + + The three possible addresses are: + + - a vector base with no displacement + - a vector base and a scalar displacement + - a scalar (pointer) base and a vector displacement + + The overloaded function's mode suffix determines whether there is + a displacement, and if so, what units it uses: + + - MODE_none: no displacement + - MODE_offset: the displacement is measured in bytes + - MODE_index: the displacement is measured in elements + + Return the mode of the non-overloaded function on success, otherwise + report an error and return MODE_none. */ +mode_suffix_index +function_resolver::resolve_gather_address (unsigned int argno, + type_suffix_index type, + bool load_p) +{ + tree actual = get_argument_type (argno); + if (actual == error_mark_node) + return MODE_none; + + if (displacement_units () != UNITS_none) + { + /* Some form of displacement is needed. First handle a scalar + pointer base and a vector displacement. */ + if (scalar_argument_p (argno)) + /* Don't check the pointer type here, since there's only one valid + choice. Leave that to the frontend. */ + return resolve_sv_displacement (argno + 1, type, load_p); + + if (!VECTOR_TYPE_P (actual)) + { + error_at (location, "passing %qT to argument %d of %qE," + " which expects a vector or pointer base address", + actual, argno + 1, fndecl); + return MODE_none; + } + } + + /* Check for the correct choice of vector base type. */ + vector_type_index base_vector_type; + if (type == NUM_TYPE_SUFFIXES) + { + /* Since prefetches have no type suffix, there is a free choice + between 32-bit and 64-bit base addresses. */ + base_vector_type = infer_vector_base_type (argno); + if (base_vector_type == NUM_VECTOR_TYPES) + return MODE_none; + } + else + { + /* Check for some form of vector type, without saying which type + we expect. */ + type_suffix_index base_type = infer_vector_type (argno); + if (base_type == NUM_TYPE_SUFFIXES) + return MODE_none; + + /* Check whether the type is the right one. */ + unsigned int required_bits = type_suffixes[type].element_bits; + gcc_assert (required_bits == 32 || required_bits == 64); + type_suffix_index required_type = (required_bits == 32 + ? TYPE_SUFFIX_u32 + : TYPE_SUFFIX_u64); + if (required_type != base_type) + { + error_at (location, "passing %qT to argument %d of %qE," + " which expects %qT", actual, argno + 1, fndecl, + get_vector_type (required_type)); + return MODE_none; + } + base_vector_type = type_suffixes[base_type].vector_type; + } + + /* Check the scalar displacement, if any. */ + if (displacement_units () != UNITS_none + && !require_scalar_type (argno + 1, "int64_t")) + return MODE_none; + + /* Find the appropriate mode suffix. The checks above should have + weeded out all erroneous cases. */ + for (unsigned int mode_i = 0; mode_i < ARRAY_SIZE (mode_suffixes); ++mode_i) + { + const mode_suffix_info &mode = mode_suffixes[mode_i]; + if (mode.base_vector_type == base_vector_type + && mode.displacement_vector_type == NUM_VECTOR_TYPES + && mode.displacement_units == displacement_units ()) + return mode_suffix_index (mode_i); + } + + gcc_unreachable (); +} + +/* Require arguments ARGNO and ARGNO + 1 to form an ADR-style address, + i.e. one with a vector of base addresses and a vector of displacements. 
+ The overloaded function's mode suffix determines the units of the + displacement (bytes for "_offset", elements for "_index"). + + Return the associated mode suffix on success, otherwise report + an error and return MODE_none. */ +mode_suffix_index +function_resolver::resolve_adr_address (unsigned int argno) +{ + vector_type_index base_type = infer_vector_base_type (argno); + if (base_type == NUM_VECTOR_TYPES) + return MODE_none; + + vector_type_index displacement_type + = infer_vector_displacement_type (argno + 1); + if (displacement_type == NUM_VECTOR_TYPES) + return MODE_none; + + mode_suffix_index mode = find_mode_suffix (base_type, displacement_type, + displacement_units ()); + if (mode == MODE_none) + { + if (mode_suffix_id == MODE_offset) + error_at (location, "cannot combine a base of type %qT with" + " an offset of type %qT", + get_argument_type (argno), get_argument_type (argno + 1)); + else + error_at (location, "cannot combine a base of type %qT with" + " an index of type %qT", + get_argument_type (argno), get_argument_type (argno + 1)); + } + return mode; +} + +/* Require the function to have exactly EXPECTED arguments. Return true + if it does, otherwise report an appropriate error. */ +bool +function_resolver::check_num_arguments (unsigned int expected) +{ + if (m_arglist.length () < expected) + error_at (location, "too few arguments to function %qE", fndecl); + else if (m_arglist.length () > expected) + error_at (location, "too many arguments to function %qE", fndecl); + return m_arglist.length () == expected; +} + +/* If the function is predicated, check that the first argument is a + suitable governing predicate. Also check that there are NOPS further + arguments after any governing predicate, but don't check what they are. + + Return true on success, otherwise report a suitable error. + When returning true: + + - set I to the number of the first unchecked argument. + - set NARGS to the total number of arguments. */ +bool +function_resolver::check_gp_argument (unsigned int nops, + unsigned int &i, unsigned int &nargs) +{ + i = 0; + if (pred != PRED_none) + { + /* Unary merge operations should use resolve_unary instead. */ + gcc_assert (nops != 1 || pred != PRED_m); + nargs = nops + 1; + if (!check_num_arguments (nargs) + || !require_vector_type (i, VECTOR_TYPE_svbool_t)) + return false; + i += 1; + } + else + { + nargs = nops; + if (!check_num_arguments (nargs)) + return false; + } + + return true; +} + +/* Finish resolving a function whose final argument can be a vector + or a scalar, with the function having an implicit "_n" suffix + in the latter case. This "_n" form might only exist for certain + type suffixes. + + ARGNO is the index of the final argument. The inferred type suffix + was obtained from argument FIRST_ARGNO, which has type FIRST_TYPE. + EXPECTED_TCLASS and EXPECTED_BITS describe the expected properties + of the final vector or scalar argument, in the same way as for + require_derived_vector_type. INFERRED_TYPE is the inferred type + suffix itself, or NUM_TYPE_SUFFIXES if it's the same as FIRST_TYPE. + + Return the function decl of the resolved function on success, + otherwise report a suitable error and return error_mark_node. 
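Concretely, this optional "_n" handling is what lets a single overloaded name accept either a vector or a scalar final operand. A sketch using documented ACLE spellings (svmul_m and its full forms svmul_s32_m and svmul_n_s32_m are taken from the ACLE specification and assumed available via arm_sve.h):

#include <arm_sve.h>

/* Resolves to svmul_s32_m: the final argument is a vector.  */
svint32_t
scale_by_vector (svbool_t pg, svint32_t a, svint32_t b)
{
  return svmul_m (pg, a, b);
}

/* Resolves to the "_n" form svmul_n_s32_m: the final argument is a scalar.  */
svint32_t
scale_by_two (svbool_t pg, svint32_t a)
{
  return svmul_m (pg, a, 2);
}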
*/ +tree function_resolver:: +finish_opt_n_resolution (unsigned int argno, unsigned int first_argno, + type_suffix_index first_type, + type_class_index expected_tclass, + unsigned int expected_bits, + type_suffix_index inferred_type) +{ + if (inferred_type == NUM_TYPE_SUFFIXES) + inferred_type = first_type; + tree scalar_form = lookup_form (MODE_n, inferred_type); + + /* Allow the final argument to be scalar, if an _n form exists. */ + if (scalar_argument_p (argno)) + { + if (scalar_form) + return scalar_form; + + /* Check the vector form normally. If that succeeds, raise an + error about having no corresponding _n form. */ + tree res = resolve_to (mode_suffix_id, inferred_type); + if (res != error_mark_node) + error_at (location, "passing %qT to argument %d of %qE, but its" + " %qT form does not accept scalars", + get_argument_type (argno), argno + 1, fndecl, + get_vector_type (first_type)); + return error_mark_node; + } + + /* If an _n form does exist, provide a more accurate message than + require_derived_vector_type would for arguments that are neither + vectors nor scalars. */ + if (scalar_form && !require_vector_or_scalar_type (argno)) + return error_mark_node; + + /* Check for the correct vector type. */ + if (!require_derived_vector_type (argno, first_argno, first_type, + expected_tclass, expected_bits)) + return error_mark_node; + + return resolve_to (mode_suffix_id, inferred_type); +} + +/* Resolve a (possibly predicated) unary function. If the function uses + merge predication or if TREAT_AS_MERGE_P is true, there is an extra + vector argument before the governing predicate that specifies the + values of inactive elements. This argument has the following + properties: + + - the type class must be the same as for active elements if MERGE_TCLASS + is SAME_TYPE_CLASS, otherwise it must be MERGE_TCLASS itself. + + - the element size must be the same as for active elements if MERGE_BITS + is SAME_TYPE_SIZE, otherwise it must be MERGE_BITS itself. + + Return the function decl of the resolved function on success, + otherwise report a suitable error and return error_mark_node. */ +tree +function_resolver::resolve_unary (type_class_index merge_tclass, + unsigned int merge_bits, + bool treat_as_merge_p) +{ + type_suffix_index type; + if (pred == PRED_m || treat_as_merge_p) + { + if (!check_num_arguments (3)) + return error_mark_node; + if (merge_tclass == SAME_TYPE_CLASS && merge_bits == SAME_SIZE) + { + /* The inactive elements are the same as the active elements, + so we can use normal left-to-right resolution. */ + if ((type = infer_vector_type (0)) == NUM_TYPE_SUFFIXES + || !require_vector_type (1, VECTOR_TYPE_svbool_t) + || !require_matching_vector_type (2, type)) + return error_mark_node; + } + else + { + /* The inactive element type is a function of the active one, + so resolve the active one first. */ + if (!require_vector_type (1, VECTOR_TYPE_svbool_t) + || (type = infer_vector_type (2)) == NUM_TYPE_SUFFIXES + || !require_derived_vector_type (0, 2, type, merge_tclass, + merge_bits)) + return error_mark_node; + } + } + else + { + /* We just need to check the predicate (if any) and the single + vector argument. */ + unsigned int i, nargs; + if (!check_gp_argument (1, i, nargs) + || (type = infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + } + + /* Handle convert-like functions in which the first type suffix is + explicit. 
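To make the convert-like case concrete: for a conversion, the first type suffix stays explicit in the overloaded name, the second is inferred from the input, and the merge argument takes the result type rather than the input type. A sketch assuming the documented ACLE names (svcvt_f32_m resolving to svcvt_f32_s32_m):

#include <arm_sve.h>

/* The inactive (merge) argument is an svfloat32_t, matching the result,
   while the second type suffix (_s32) is inferred from X.  */
svfloat32_t
convert_with_merge (svfloat32_t inactive, svbool_t pg, svint32_t x)
{
  return svcvt_f32_m (inactive, pg, x);
}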
*/ + if (type_suffix_ids[0] != NUM_TYPE_SUFFIXES) + return resolve_to (mode_suffix_id, type_suffix_ids[0], type); + + return resolve_to (mode_suffix_id, type); +} + +/* Resolve a (possibly predicated) function that takes NOPS like-typed + vector arguments followed by NIMM integer immediates. Return the + function decl of the resolved function on success, otherwise report + a suitable error and return error_mark_node. */ +tree +function_resolver::resolve_uniform (unsigned int nops, unsigned int nimm) +{ + unsigned int i, nargs; + type_suffix_index type; + if (!check_gp_argument (nops + nimm, i, nargs) + || (type = infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + i += 1; + for (; i < nargs - nimm; ++i) + if (!require_matching_vector_type (i, type)) + return error_mark_node; + + for (; i < nargs; ++i) + if (!require_integer_immediate (i)) + return error_mark_node; + + return resolve_to (mode_suffix_id, type); +} + +/* Resolve a (possibly predicated) function that offers a choice between + taking: + + - NOPS like-typed vector arguments or + - NOPS - 1 like-typed vector arguments followed by a scalar argument + + Return the function decl of the resolved function on success, + otherwise report a suitable error and return error_mark_node. */ +tree +function_resolver::resolve_uniform_opt_n (unsigned int nops) +{ + unsigned int i, nargs; + type_suffix_index type; + if (!check_gp_argument (nops, i, nargs) + || (type = infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + unsigned int first_arg = i++; + for (; i < nargs - 1; ++i) + if (!require_matching_vector_type (i, type)) + return error_mark_node; + + return finish_opt_n_resolution (i, first_arg, type); +} + +/* If the call is erroneous, report an appropriate error and return + error_mark_node. Otherwise, if the function is overloaded, return + the decl of the non-overloaded function. Return NULL_TREE otherwise, + indicating that the call should be processed in the normal way. */ +tree +function_resolver::resolve () +{ + return shape->resolve (*this); +} + +function_checker::function_checker (location_t location, + const function_instance &instance, + tree fndecl, tree fntype, + unsigned int nargs, tree *args) + : function_call_info (location, instance, fndecl), + m_fntype (fntype), m_nargs (nargs), m_args (args), + /* We don't have to worry about unary _m operations here, since they + never have arguments that need checking. */ + m_base_arg (pred != PRED_none ? 1 : 0) +{ +} + +/* Return true if argument ARGNO exists. which it might not for + erroneous calls. It is safe to wave through checks if this + function returns false. */ +bool +function_checker::argument_exists_p (unsigned int argno) +{ + gcc_assert (argno < (unsigned int) type_num_arguments (m_fntype)); + return argno < m_nargs; +} + +/* Check that argument ARGNO is an integer constant expression and + store its value in VALUE_OUT if so. The caller should first + check that argument ARGNO exists. */ +bool +function_checker::require_immediate (unsigned int argno, + HOST_WIDE_INT &value_out) +{ + gcc_assert (argno < m_nargs); + tree arg = m_args[argno]; + + /* The type and range are unsigned, so read the argument as an + unsigned rather than signed HWI. */ + if (!tree_fits_uhwi_p (arg)) + { + report_non_ice (location, fndecl, argno); + return false; + } + + /* ...but treat VALUE_OUT as signed for error reporting, since printing + -1 is more user-friendly than the maximum uint64_t value. 
*/ + value_out = tree_to_uhwi (arg); + return true; +} + +/* Check that argument REL_ARGNO is an integer constant expression that + has the value VALUE0 or VALUE1. REL_ARGNO counts from the end of the + predication arguments. */ +bool +function_checker::require_immediate_either_or (unsigned int rel_argno, + HOST_WIDE_INT value0, + HOST_WIDE_INT value1) +{ + unsigned int argno = m_base_arg + rel_argno; + if (!argument_exists_p (argno)) + return true; + + HOST_WIDE_INT actual; + if (!require_immediate (argno, actual)) + return false; + + if (actual != value0 && actual != value1) + { + report_neither_nor (location, fndecl, argno, actual, 90, 270); + return false; + } + + return true; +} + +/* Check that argument REL_ARGNO is an integer constant expression that has + a valid value for enumeration type TYPE. REL_ARGNO counts from the end + of the predication arguments. */ +bool +function_checker::require_immediate_enum (unsigned int rel_argno, tree type) +{ + unsigned int argno = m_base_arg + rel_argno; + if (!argument_exists_p (argno)) + return true; + + HOST_WIDE_INT actual; + if (!require_immediate (argno, actual)) + return false; + + for (tree entry = TYPE_VALUES (type); entry; entry = TREE_CHAIN (entry)) + { + /* The value is an INTEGER_CST for C and a CONST_DECL wrapper + around an INTEGER_CST for C++. */ + tree value = TREE_VALUE (entry); + if (TREE_CODE (value) == CONST_DECL) + value = DECL_INITIAL (value); + if (wi::to_widest (value) == actual) + return true; + } + + report_not_enum (location, fndecl, argno, actual, type); + return false; +} + +/* Check that argument REL_ARGNO is suitable for indexing argument + REL_ARGNO - 1, in groups of GROUP_SIZE elements. REL_ARGNO counts + from the end of the predication arguments. */ +bool +function_checker::require_immediate_lane_index (unsigned int rel_argno, + unsigned int group_size) +{ + unsigned int argno = m_base_arg + rel_argno; + if (!argument_exists_p (argno)) + return true; + + /* Get the type of the previous argument. tree_argument_type wants a + 1-based number, whereas ARGNO is 0-based. */ + machine_mode mode = TYPE_MODE (type_argument_type (m_fntype, argno)); + gcc_assert (VECTOR_MODE_P (mode)); + unsigned int nlanes = 128 / (group_size * GET_MODE_UNIT_BITSIZE (mode)); + return require_immediate_range (rel_argno, 0, nlanes - 1); +} + +/* Check that argument REL_ARGNO is an integer constant expression that + has one of the given values. */ +bool +function_checker::require_immediate_one_of (unsigned int rel_argno, + HOST_WIDE_INT value0, + HOST_WIDE_INT value1, + HOST_WIDE_INT value2, + HOST_WIDE_INT value3) +{ + unsigned int argno = m_base_arg + rel_argno; + if (!argument_exists_p (argno)) + return true; + + HOST_WIDE_INT actual; + if (!require_immediate (argno, actual)) + return false; + + if (actual != value0 + && actual != value1 + && actual != value2 + && actual != value3) + { + report_not_one_of (location, fndecl, argno, actual, + value0, value1, value2, value3); + return false; + } + + return true; +} + +/* Check that argument REL_ARGNO is an integer constant expression in the + range [MIN, MAX]. REL_ARGNO counts from the end of the predication + arguments. */ +bool +function_checker::require_immediate_range (unsigned int rel_argno, + HOST_WIDE_INT min, + HOST_WIDE_INT max) +{ + unsigned int argno = m_base_arg + rel_argno; + if (!argument_exists_p (argno)) + return true; + + /* Required because of the tree_to_uhwi -> HOST_WIDE_INT conversion + in require_immediate. 
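The lane-index bound used by require_immediate_lane_index above is simply the number of element groups that fit in a 128-bit quadword. A standalone restatement of that arithmetic (the helper name is illustrative, and svmla_lane_f32 in the comment is an ACLE name assumed for the example):

#include <cassert>

/* Number of valid lane indices: a 128-bit quadword divided into groups
   of GROUP_SIZE elements of ELEMENT_BITS bits each.  */
static unsigned int
num_lanes (unsigned int element_bits, unsigned int group_size)
{
  return 128 / (group_size * element_bits);
}

int
main ()
{
  assert (num_lanes (32, 1) == 4);  /* e.g. svmla_lane_f32: lanes 0-3.  */
  assert (num_lanes (16, 1) == 8);  /* 16-bit elements: lanes 0-7.  */
  assert (num_lanes (16, 2) == 4);  /* pairs of 16-bit elements: lanes 0-3.  */
  return 0;
}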
*/ + gcc_assert (min >= 0 && min <= max); + HOST_WIDE_INT actual; + if (!require_immediate (argno, actual)) + return false; + + if (!IN_RANGE (actual, min, max)) + { + report_out_of_range (location, fndecl, argno, actual, min, max); + return false; + } + + return true; +} + +/* Perform semantic checks on the call. Return true if the call is valid, + otherwise report a suitable error. */ +bool +function_checker::check () +{ + function_args_iterator iter; + tree type; + unsigned int i = 0; + FOREACH_FUNCTION_ARGS (m_fntype, type, iter) + { + if (type == void_type_node || i >= m_nargs) + break; + + if (i >= m_base_arg + && TREE_CODE (type) == ENUMERAL_TYPE + && !require_immediate_enum (i - m_base_arg, type)) + return false; + + i += 1; + } + + return shape->check (*this); +} + +gimple_folder::gimple_folder (const function_instance &instance, tree fndecl, + gimple_stmt_iterator *gsi_in, gcall *call_in) + : function_call_info (gimple_location (call_in), instance, fndecl), + gsi (gsi_in), call (call_in), lhs (gimple_call_lhs (call_in)) +{ +} + +/* Convert predicate argument ARGNO so that it has the type appropriate for + an operation on VECTYPE. Add any new statements to STMTS. */ +tree +gimple_folder::convert_pred (gimple_seq &stmts, tree vectype, + unsigned int argno) +{ + tree pred = gimple_call_arg (call, argno); + if (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (pred)), + TYPE_VECTOR_SUBPARTS (vectype))) + return pred; + + return gimple_build (&stmts, VIEW_CONVERT_EXPR, + truth_type_for (vectype), pred); +} + +/* Return a pointer to the address in a contiguous load or store, + given that each memory vector has type VECTYPE. Add any new + statements to STMTS. */ +tree +gimple_folder::fold_contiguous_base (gimple_seq &stmts, tree vectype) +{ + tree base = gimple_call_arg (call, 1); + if (mode_suffix_id == MODE_vnum) + { + tree offset = gimple_call_arg (call, 2); + offset = gimple_convert (&stmts, sizetype, offset); + offset = gimple_build (&stmts, MULT_EXPR, sizetype, offset, + TYPE_SIZE_UNIT (vectype)); + base = gimple_build (&stmts, POINTER_PLUS_EXPR, TREE_TYPE (base), + base, offset); + } + return base; +} + +/* Return the alignment and TBAA argument to an internal load or store + function like IFN_MASK_LOAD or IFN_MASK_STORE, given that it accesses + memory elements of type TYPE. */ +tree +gimple_folder::load_store_cookie (tree type) +{ + return build_int_cst (build_pointer_type (type), TYPE_ALIGN_UNIT (type)); +} + +/* Fold the call to a call to INSTANCE, with the same arguments. */ +gimple * +gimple_folder::redirect_call (const function_instance &instance) +{ + registered_function *rfn + = function_table->find_with_hash (instance, instance.hash ()); + if (!rfn) + return NULL; + + gimple_call_set_fndecl (call, rfn->decl); + return call; +} + +/* Fold the call to a PTRUE, taking the element size from type suffix 0. */ +gimple * +gimple_folder::fold_to_ptrue () +{ + tree svbool_type = TREE_TYPE (lhs); + tree bool_type = TREE_TYPE (svbool_type); + unsigned int element_bytes = type_suffix (0).element_bytes; + + /* The return type is svbool_t for all type suffixes, thus for b8 we + want { 1, 1, 1, 1, ... }, for b16 we want { 1, 0, 1, 0, ... }, etc. */ + tree_vector_builder builder (svbool_type, element_bytes, 1); + builder.quick_push (build_all_ones_cst (bool_type)); + for (unsigned int i = 1; i < element_bytes; ++i) + builder.quick_push (build_zero_cst (bool_type)); + return gimple_build_assign (lhs, builder.build ()); +} + +/* Fold the call to a PFALSE. 
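The MODE_vnum scaling performed by fold_contiguous_base above is what gives the user-visible "_vnum" forms their semantics: the displacement counts whole vectors rather than bytes or elements. A sketch assuming the documented ACLE names svld1_vnum_s32 and svcntw:

#include <arm_sve.h>
#include <stdint.h>

/* Load the vector that starts one full SVE vector's worth of int32_t
   past BASE, i.e. BASE + 1 * svcntw () elements.  */
svint32_t
load_second_vector (svbool_t pg, const int32_t *base)
{
  return svld1_vnum_s32 (pg, base, 1);
}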
*/ +gimple * +gimple_folder::fold_to_pfalse () +{ + return gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs))); +} + +/* Fold an operation to a constant predicate in which the first VL + elements are set and the rest are clear. Take the element size + from type suffix 0. */ +gimple * +gimple_folder::fold_to_vl_pred (unsigned int vl) +{ + tree vectype = TREE_TYPE (lhs); + tree element_type = TREE_TYPE (vectype); + tree minus_one = build_all_ones_cst (element_type); + tree zero = build_zero_cst (element_type); + unsigned int element_bytes = type_suffix (0).element_bytes; + + /* Construct COUNT elements that contain the ptrue followed by + a repeating sequence of COUNT elements. */ + unsigned int count = constant_lower_bound (TYPE_VECTOR_SUBPARTS (vectype)); + gcc_assert (vl * element_bytes <= count); + tree_vector_builder builder (vectype, count, 2); + for (unsigned int i = 0; i < count * 2; ++i) + { + bool bit = (i & (element_bytes - 1)) == 0 && i < vl * element_bytes; + builder.quick_push (bit ? minus_one : zero); + } + return gimple_build_assign (lhs, builder.build ()); +} + +/* Try to fold the call. Return the new statement on success and null + on failure. */ +gimple * +gimple_folder::fold () +{ + /* Don't fold anything when SVE is disabled; emit an error during + expansion instead. */ + if (!TARGET_SVE) + return NULL; + + /* Punt if the function has a return type and no result location is + provided. The attributes should allow target-independent code to + remove the calls if appropriate. */ + if (!lhs && TREE_TYPE (gimple_call_fntype (call)) != void_type_node) + return NULL; + + return base->fold (*this); +} + +function_expander::function_expander (const function_instance &instance, + tree fndecl, tree call_expr_in, + rtx possible_target_in) + : function_call_info (EXPR_LOCATION (call_expr_in), instance, fndecl), + call_expr (call_expr_in), possible_target (possible_target_in) +{ +} + +/* Return the handler of direct optab OP for type suffix SUFFIX_I. */ +insn_code +function_expander::direct_optab_handler (optab op, unsigned int suffix_i) +{ + return ::direct_optab_handler (op, vector_mode (suffix_i)); +} + +/* Choose between signed and unsigned direct optabs SIGNED_OP and + UNSIGNED_OP based on the signedness of type suffix SUFFIX_I, then + pick the appropriate optab handler for the mode. Use MODE as the + mode if given, otherwise use the mode of type suffix SUFFIX_I. */ +insn_code +function_expander::direct_optab_handler_for_sign (optab signed_op, + optab unsigned_op, + unsigned int suffix_i, + machine_mode mode) +{ + if (mode == VOIDmode) + mode = vector_mode (suffix_i); + optab op = type_suffix (suffix_i).unsigned_p ? unsigned_op : signed_op; + return ::direct_optab_handler (op, mode); +} + +/* Return true if X overlaps any input. */ +bool +function_expander::overlaps_input_p (rtx x) +{ + for (unsigned int i = 0; i < args.length (); ++i) + if (reg_overlap_mentioned_p (x, args[i])) + return true; + return false; +} + +/* Return the base address for a contiguous load or store function. + MEM_MODE is the mode of the addressed memory. */ +rtx +function_expander::get_contiguous_base (machine_mode mem_mode) +{ + rtx base = args[1]; + if (mode_suffix_id == MODE_vnum) + { + /* Use the size of the memory mode for extending loads and truncating + stores. Use the size of a full vector for non-extending loads + and non-truncating stores (including svld[234] and svst[234]). 
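The predicate constants built by fold_to_ptrue and fold_to_vl_pred above follow a simple byte pattern: one predicate byte per data byte, with only the first byte of each active element set. A standalone sketch that prints that pattern, mirroring the logic rather than reusing any GCC internals:

#include <cstdio>

/* Print NBYTES predicate bytes for ELEMENT_BYTES-byte data elements,
   with the first VL elements active.  A full ptrue corresponds to
   VL >= NBYTES / ELEMENT_BYTES.  */
static void
print_pred_bytes (unsigned int nbytes, unsigned int element_bytes,
                  unsigned int vl)
{
  for (unsigned int i = 0; i < nbytes; ++i)
    {
      bool active = i % element_bytes == 0 && i / element_bytes < vl;
      printf ("%d", active ? 1 : 0);
    }
  printf ("\n");
}

int
main ()
{
  print_pred_bytes (16, 1, 16); /* b8 ptrue:  1111111111111111 */
  print_pred_bytes (16, 2, 8);  /* b16 ptrue: 1010101010101010 */
  print_pred_bytes (16, 4, 2);  /* b32, first two elements: 1000100000000000 */
  return 0;
}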
*/ + poly_int64 size = ordered_min (GET_MODE_SIZE (mem_mode), + BYTES_PER_SVE_VECTOR); + rtx offset = gen_int_mode (size, Pmode); + offset = simplify_gen_binary (MULT, Pmode, args[2], offset); + base = simplify_gen_binary (PLUS, Pmode, base, offset); + } + return base; +} + +/* For a function that does the equivalent of: + + OUTPUT = COND ? FN (INPUTS) : FALLBACK; + + return the value of FALLBACK. + + MODE is the mode of OUTPUT. NOPS is the number of operands in INPUTS. + MERGE_ARGNO is the argument that provides FALLBACK for _m functions, + or DEFAULT_MERGE_ARGNO if we should apply the usual rules. + + ARGNO is the caller's index into args. If the returned value is + argument 0 (as for unary _m operations), increment ARGNO past the + returned argument. */ +rtx +function_expander::get_fallback_value (machine_mode mode, unsigned int nops, + unsigned int merge_argno, + unsigned int &argno) +{ + if (pred == PRED_z) + return CONST0_RTX (mode); + + gcc_assert (pred == PRED_m || pred == PRED_x); + if (merge_argno == DEFAULT_MERGE_ARGNO) + merge_argno = nops == 1 && pred == PRED_m ? 0 : 1; + + if (merge_argno == 0) + return args[argno++]; + + return args[merge_argno]; +} + +/* Return a REG rtx that can be used for the result of the function, + using the preferred target if suitable. */ +rtx +function_expander::get_reg_target () +{ + machine_mode target_mode = TYPE_MODE (TREE_TYPE (TREE_TYPE (fndecl))); + if (!possible_target || GET_MODE (possible_target) != target_mode) + possible_target = gen_reg_rtx (target_mode); + return possible_target; +} + +/* As for get_reg_target, but make sure that the returned REG does not + overlap any inputs. */ +rtx +function_expander::get_nonoverlapping_reg_target () +{ + if (possible_target && overlaps_input_p (possible_target)) + possible_target = NULL_RTX; + return get_reg_target (); +} + +/* Add an output operand to the instruction we're building, which has + code ICODE. Bind the output to the preferred target rtx if possible. */ +void +function_expander::add_output_operand (insn_code icode) +{ + unsigned int opno = m_ops.length (); + machine_mode mode = insn_data[icode].operand[opno].mode; + m_ops.safe_grow (opno + 1); + create_output_operand (&m_ops.last (), possible_target, mode); +} + +/* Add an input operand to the instruction we're building, which has + code ICODE. Calculate the value of the operand as follows: + + - If the operand is a vector and X is not, broadcast X to fill a + vector of the appropriate mode. + + - Otherwise, if the operand is a predicate, coerce X to have the + mode that the instruction expects. In this case X is known to be + VNx16BImode (the mode of svbool_t). + + - Otherwise use X directly. The expand machinery checks that X has + the right mode for the instruction. */ +void +function_expander::add_input_operand (insn_code icode, rtx x) +{ + unsigned int opno = m_ops.length (); + const insn_operand_data &operand = insn_data[icode].operand[opno]; + machine_mode mode = operand.mode; + if (mode == VOIDmode) + { + /* The only allowable use of VOIDmode is the wildcard + aarch64_any_register_operand, which is used to avoid + combinatorial explosion in the reinterpret patterns. 
*/ + gcc_assert (operand.predicate == aarch64_any_register_operand); + mode = GET_MODE (x); + } + else if (!VECTOR_MODE_P (GET_MODE (x)) && VECTOR_MODE_P (mode)) + x = expand_vector_broadcast (mode, x); + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + { + gcc_assert (GET_MODE (x) == VNx16BImode); + x = gen_lowpart (mode, x); + } + m_ops.safe_grow (m_ops.length () + 1); + create_input_operand (&m_ops.last (), x, mode); +} + +/* Add an integer operand with value X to the instruction. */ +void +function_expander::add_integer_operand (HOST_WIDE_INT x) +{ + m_ops.safe_grow (m_ops.length () + 1); + create_integer_operand (&m_ops.last (), x); +} + +/* Add a memory operand with mode MODE and address ADDR. */ +void +function_expander::add_mem_operand (machine_mode mode, rtx addr) +{ + /* Exception for OImode for the ld1ro intrinsics. + They act on 256 bit octaword data, and it's just easier to use a scalar + mode to represent that than add a new vector mode solely for the purpose + of this intrinsic. */ + gcc_assert (VECTOR_MODE_P (mode) || mode == OImode); + rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr)); + /* The memory is only guaranteed to be element-aligned. */ + set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode))); + add_fixed_operand (mem); +} + +/* Add an address operand with value X. The static operand data says + what mode and form the address must have. */ +void +function_expander::add_address_operand (rtx x) +{ + m_ops.safe_grow (m_ops.length () + 1); + create_address_operand (&m_ops.last (), x); +} + +/* Add an operand that must be X. The only way of legitimizing an + invalid X is to reload the address of a MEM. */ +void +function_expander::add_fixed_operand (rtx x) +{ + m_ops.safe_grow (m_ops.length () + 1); + create_fixed_operand (&m_ops.last (), x); +} + +/* Generate instruction ICODE, given that its operands have already + been added to M_OPS. Return the value of the first operand. */ +rtx +function_expander::generate_insn (insn_code icode) +{ + expand_insn (icode, m_ops.length (), m_ops.address ()); + return function_returns_void_p () ? const0_rtx : m_ops[0].value; +} + +/* Convert the arguments to a gather/scatter function into the + associated md operands. Argument ARGNO is the scalar or vector base and + argument ARGNO + 1 is the scalar or vector displacement (if applicable). + The md pattern expects: + + - a scalar base + - a vector displacement + + If SCALED_P is true, it also expects: + + - a const_int that is 1 if the displacement is zero-extended from 32 bits + - a scaling multiplier (1 for bytes, 2 for .h indices, etc.). + + If SCALED_P is false, the displacement is implicitly zero-extended + and the scaling multiplier is implicitly 1. */ +void +function_expander::prepare_gather_address_operands (unsigned int argno, + bool scaled_p) +{ + machine_mode mem_mode = memory_vector_mode (); + tree vector_type = base_vector_type (); + units_index units = displacement_units (); + int shift_idx = -1; + if (units == UNITS_none) + { + /* Vector base, no displacement. Convert to an integer zero base + and a vector byte offset. */ + args.quick_insert (argno, const0_rtx); + units = UNITS_bytes; + } + else if (vector_type) + { + /* Vector base, scalar displacement. Convert to a scalar base and + a vector byte offset. */ + std::swap (args[argno], args[argno + 1]); + if (units == UNITS_elements) + shift_idx = argno; + } + else + { + /* Scalar base, vector displacement. This is the order that the md + pattern wants. 
*/ + if (Pmode == SImode) + args[argno] = simplify_gen_unary (ZERO_EXTEND, DImode, + args[argno], SImode); + vector_type = displacement_vector_type (); + if (units == UNITS_elements && !scaled_p) + shift_idx = argno + 1; + } + tree scalar_displacement_type = TREE_TYPE (vector_type); + + if (shift_idx >= 0) + { + machine_mode arg_mode = GET_MODE (args[shift_idx]); + if (arg_mode == VOIDmode) + arg_mode = DImode; + unsigned int elt_bytes = GET_MODE_UNIT_SIZE (mem_mode); + rtx shift = gen_int_mode (exact_log2 (elt_bytes), DImode); + args[shift_idx] = simplify_gen_binary (ASHIFT, arg_mode, + args[shift_idx], shift); + units = UNITS_bytes; + } + + bool uxtw_p = (TYPE_PRECISION (scalar_displacement_type) == 64 + || TYPE_UNSIGNED (scalar_displacement_type)); + unsigned int scale = (units == UNITS_bytes + ? 1 : GET_MODE_UNIT_SIZE (mem_mode)); + + if (scaled_p) + { + args.quick_insert (argno + 2, GEN_INT (uxtw_p)); + args.quick_insert (argno + 3, GEN_INT (scale)); + } + else + gcc_assert (uxtw_p && scale == 1); +} + +/* The final argument is an immediate svprfop value. Add two fake arguments + to represent the rw and locality operands of a PREFETCH rtx. */ +void +function_expander::prepare_prefetch_operands () +{ + unsigned int prfop = INTVAL (args.last ()); + /* Bit 3 of the prfop selects stores over loads. */ + args.quick_push (GEN_INT ((prfop & 8) != 0)); + /* Bits 1 and 2 specify the locality; 0-based for svprfop but + 1-based for PREFETCH. */ + args.quick_push (GEN_INT (((prfop >> 1) & 3) + 1)); +} + +/* Add a dummy argument to indicate whether predicate argument ARGNO + is all-true when interpreted in mode PRED_MODE. The hint goes + immediately after ARGNO. */ +void +function_expander::add_ptrue_hint (unsigned int argno, machine_mode pred_mode) +{ + rtx pred = gen_lowpart (pred_mode, args[argno]); + int hint = (pred == CONSTM1_RTX (pred_mode) + ? SVE_KNOWN_PTRUE : SVE_MAYBE_NOT_PTRUE); + args.quick_insert (argno + 1, gen_int_mode (hint, SImode)); +} + +/* Rotate inputs args[START:END] one position to the left, so that + args[START] becomes args[END - 1]. */ +void +function_expander::rotate_inputs_left (unsigned int start, unsigned int end) +{ + rtx new_last = args[start]; + for (unsigned int i = start; i < end - 1; ++i) + args[i] = args[i + 1]; + args[end - 1] = new_last; +} + +/* Return true if the negation of argument ARGNO can be folded away, + replacing it with the negated value if so. MODE is the associated + vector mode, but the argument could be a single element. The main + case this handles is constant arguments. */ +bool +function_expander::try_negating_argument (unsigned int argno, + machine_mode mode) +{ + rtx x = args[argno]; + if (!VECTOR_MODE_P (GET_MODE (x))) + mode = GET_MODE_INNER (mode); + + x = simplify_unary_operation (NEG, mode, x, mode); + if (!x) + return false; + + args[argno] = x; + return true; +} + +/* Implement the call using instruction ICODE, with a 1:1 mapping between + arguments and input operands. */ +rtx +function_expander::use_exact_insn (insn_code icode) +{ + unsigned int nops = insn_data[icode].n_operands; + if (!function_returns_void_p ()) + { + add_output_operand (icode); + nops -= 1; + } + for (unsigned int i = 0; i < nops; ++i) + add_input_operand (icode, args[i]); + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, which does not use a + governing predicate. We must therefore drop the GP from an _x call. */ +rtx +function_expander::use_unpred_insn (insn_code icode) +{ + /* We can't drop the predicate for _z and _m. 
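The svprfop decoding done by prepare_prefetch_operands above can be restated on its own: bit 3 of the immediate selects stores over loads, and bits 1 and 2 hold the cache level, converted to the 1-based value that the PREFETCH rtx expects. A standalone sketch; the SV_PLDL1KEEP/SV_PSTL1KEEP values mentioned in the comments are assumptions based on the usual ACLE encoding:

#include <cstdio>

/* Decode an svprfop immediate the same way prepare_prefetch_operands does.  */
static void
decode_prfop (unsigned int prfop)
{
  bool is_store = (prfop & 8) != 0;
  unsigned int locality = ((prfop >> 1) & 3) + 1;
  printf ("prfop %u: %s, locality %u\n", prfop,
          is_store ? "store" : "load", locality);
}

int
main ()
{
  decode_prfop (0); /* Expected to be SV_PLDL1KEEP: load, locality 1.  */
  decode_prfop (8); /* Expected to be SV_PSTL1KEEP: store, locality 1.  */
  return 0;
}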
*/ + gcc_assert (pred == PRED_x || pred == PRED_none); + /* Discount the output operand. */ + unsigned int nops = insn_data[icode].n_operands - 1; + /* Drop the predicate argument in the case of _x predication. */ + unsigned int bias = (pred == PRED_x ? 1 : 0); + unsigned int i = 0; + + add_output_operand (icode); + for (; i < nops; ++i) + add_input_operand (icode, args[i + bias]); + + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, which is a predicated + operation that returns arbitrary values for inactive lanes. */ +rtx +function_expander::use_pred_x_insn (insn_code icode) +{ + /* At present we never need to handle PRED_none, which would involve + creating a new predicate rather than using one supplied by the user. */ + gcc_assert (pred == PRED_x); + /* Discount the output operand. */ + unsigned int nops = args.length () - 1; + + bool has_float_operand_p = FLOAT_MODE_P (insn_data[icode].operand[0].mode); + + /* Add the normal operands. */ + add_output_operand (icode); + add_input_operand (icode, args[0]); + for (unsigned int i = 0; i < nops; ++i) + { + add_input_operand (icode, args[i + 1]); + if (FLOAT_MODE_P (GET_MODE (args[i + 1]))) + has_float_operand_p = true; + } + + if (has_float_operand_p) + { + /* Add a flag that indicates whether unpredicated instructions + are allowed. */ + rtx pred = m_ops[1].value; + if (flag_trapping_math && pred != CONST1_RTX (GET_MODE (pred))) + add_integer_operand (SVE_STRICT_GP); + else + add_integer_operand (SVE_RELAXED_GP); + } + + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, which does the equivalent of: + + OUTPUT = COND ? FN (INPUTS) : FALLBACK; + + The instruction operands are in the order above: OUTPUT, COND, INPUTS + and FALLBACK. MERGE_ARGNO is the argument that provides FALLBACK for _m + functions, or DEFAULT_MERGE_ARGNO if we should apply the usual rules. */ +rtx +function_expander::use_cond_insn (insn_code icode, unsigned int merge_argno) +{ + /* At present we never need to handle PRED_none, which would involve + creating a new predicate rather than using one supplied by the user. */ + gcc_assert (pred != PRED_none); + /* Discount the output, predicate and fallback value. */ + unsigned int nops = insn_data[icode].n_operands - 3; + machine_mode mode = insn_data[icode].operand[0].mode; + + unsigned int opno = 0; + rtx fallback_arg = get_fallback_value (mode, nops, merge_argno, opno); + rtx pred = args[opno++]; + + add_output_operand (icode); + add_input_operand (icode, pred); + for (unsigned int i = 0; i < nops; ++i) + add_input_operand (icode, args[opno + i]); + add_input_operand (icode, fallback_arg); + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, which is a select-like + operation with the following operands: + + 0: output + 1: true value + 2: false value + 3: predicate + + MERGE_ARGNO is the argument that provides the "false" value for _m + functions, or DEFAULT_MERGE_ARGNO if we should apply the usual rules. 
*/ +rtx +function_expander::use_vcond_mask_insn (insn_code icode, + unsigned int merge_argno) +{ + machine_mode mode = vector_mode (0); + + unsigned int opno = 0; + rtx false_arg = get_fallback_value (mode, 1, merge_argno, opno); + rtx pred_arg = args[opno++]; + rtx true_arg = args[opno++]; + + add_output_operand (icode); + add_input_operand (icode, true_arg); + add_input_operand (icode, false_arg); + add_input_operand (icode, pred_arg); + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, which loads memory operand 1 + into register operand 0 under the control of predicate operand 2. */ +rtx +function_expander::use_contiguous_load_insn (insn_code icode) +{ + machine_mode mem_mode = memory_vector_mode (); + + add_output_operand (icode); + add_mem_operand (mem_mode, get_contiguous_base (mem_mode)); + add_input_operand (icode, args[0]); + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, which prefetches from + address operand 1 under the control of predicate operand 0. + Operands 2, 3 and 4 respectively specify the svprfop value, + the PREFETCH rw flag and the PREFETCH locality. */ +rtx +function_expander::use_contiguous_prefetch_insn (insn_code icode) +{ + add_input_operand (icode, args[0]); + add_address_operand (get_contiguous_base (VNx16QImode)); + for (unsigned int i = args.length () - 3; i < args.length (); ++i) + add_input_operand (icode, args[i]); + return generate_insn (icode); +} + +/* Implement the call using instruction ICODE, which stores register operand 1 + into memory operand 0 under the control of predicate operand 2. */ +rtx +function_expander::use_contiguous_store_insn (insn_code icode) +{ + machine_mode mem_mode = memory_vector_mode (); + + add_mem_operand (mem_mode, get_contiguous_base (mem_mode)); + add_input_operand (icode, args.last ()); + add_input_operand (icode, args[0]); + return generate_insn (icode); +} + +/* Implement the call using one of the following strategies, chosen in order: + + (1) "aarch64_pred__z" for PRED_z predicate functions + + (2) "aarch64_pred_" for PRED_x functions + + (3) a normal unpredicated optab for PRED_none and PRED_x functions, + dropping the predicate in the latter case + + (4) "cond_" otherwise + + where corresponds to: + + - CODE_FOR_SINT for signed integers + - CODE_FOR_UINT for unsigned integers + - UNSPEC_FOR_FP for floating-point values + + MERGE_ARGNO is the argument that provides the values of inactive lanes for + _m functions, or DEFAULT_MERGE_ARGNO if we should apply the usual rules. */ +rtx +function_expander::map_to_rtx_codes (rtx_code code_for_sint, + rtx_code code_for_uint, + int unspec_for_fp, + unsigned int merge_argno) +{ + machine_mode mode = vector_mode (0); + rtx_code code = (type_suffix (0).unsigned_p ? code_for_uint : code_for_sint); + insn_code icode; + + /* Handle predicate logic operations, which always use _z predication. */ + if (type_suffix (0).tclass == TYPE_bool) + { + gcc_assert (pred == PRED_z && code_for_uint == code_for_sint); + return use_exact_insn (code_for_aarch64_pred_z (code, mode)); + } + + /* First try using UNSPEC_PRED_X patterns for _x predication, + if available. */ + if (pred == PRED_x) + { + if (type_suffix (0).integer_p) + icode = maybe_code_for_aarch64_pred (code, mode); + else + icode = maybe_code_for_aarch64_pred (unspec_for_fp, mode); + if (icode != CODE_FOR_nothing) + return use_pred_x_insn (icode); + } + + /* Otherwise expand PRED_none and PRED_x operations without a predicate. 
+ Floating-point operations conventionally use the signed rtx code. */ + if (pred == PRED_none || pred == PRED_x) + return use_unpred_insn (direct_optab_handler (code_to_optab (code), 0)); + + /* Don't use cond_*_optabs here, since not all codes have one yet. */ + if (type_suffix (0).integer_p) + icode = code_for_cond (code, mode); + else + icode = code_for_cond (unspec_for_fp, mode); + return use_cond_insn (icode, merge_argno); +} + +/* Implement the call using one of the following strategies, chosen in order: + + (1) "aarch64_pred_" for PRED_x functions; this is a + predicated pattern + + (2) "aarch64_sve_" for PRED_none and PRED_x functions; + this is an unpredicated pattern + + (3) "cond_" otherwise + + where corresponds to: + + - UNSPEC_FOR_SINT for signed integers + - UNSPEC_FOR_UINT for unsigned integers + - UNSPEC_FOR_FP for floating-point values + + MERGE_ARGNO is the argument that provides the values of inactive lanes for + _m functions, or DEFAULT_MERGE_ARGNO if we should apply the usual rules. */ +rtx +function_expander::map_to_unspecs (int unspec_for_sint, int unspec_for_uint, + int unspec_for_fp, unsigned int merge_argno) +{ + machine_mode mode = vector_mode (0); + int unspec = (!type_suffix (0).integer_p ? unspec_for_fp + : type_suffix (0).unsigned_p ? unspec_for_uint + : unspec_for_sint); + + if (pred == PRED_x) + { + insn_code icode = maybe_code_for_aarch64_pred (unspec, mode); + if (icode != CODE_FOR_nothing) + return use_pred_x_insn (icode); + } + + if (pred == PRED_none || pred == PRED_x) + { + insn_code icode = maybe_code_for_aarch64_sve (unspec, mode); + if (icode != CODE_FOR_nothing) + return use_unpred_insn (icode); + } + + insn_code icode = code_for_cond (unspec, vector_mode (0)); + return use_cond_insn (icode, merge_argno); +} + +/* Implement the call using an @aarch64 instruction and the + instructions are parameterized by an rtx_code. CODE_FOR_SINT + is the rtx_code for signed integer operations, CODE_FOR_UINT + is the rtx_code for unsigned integer operations. */ +rtx +function_expander::expand_signed_unpred_op (rtx_code code_for_sint, + rtx_code code_for_uint) +{ + insn_code icode; + if (type_suffix (0).unsigned_p) + icode = code_for_aarch64 (code_for_uint, code_for_uint, vector_mode (0)); + else + icode = code_for_aarch64 (code_for_sint, code_for_sint, vector_mode (0)); + return use_unpred_insn (icode); +} + +/* Expand the call and return its lhs. */ +rtx +function_expander::expand () +{ + unsigned int nargs = call_expr_nargs (call_expr); + args.reserve (nargs); + for (unsigned int i = 0; i < nargs; ++i) + args.quick_push (expand_normal (CALL_EXPR_ARG (call_expr, i))); + + return base->expand (*this); +} + +/* Register the built-in SVE ABI types, such as __SVBool_t. 
*/ +static void +register_builtin_types () +{ +#define DEF_SVE_TYPE(ACLE_NAME, NCHARS, ABI_NAME, SCALAR_TYPE) \ + scalar_types[VECTOR_TYPE_ ## ACLE_NAME] = SCALAR_TYPE; +#include "aarch64-sve-builtins.def" + + for (unsigned int i = 0; i < NUM_VECTOR_TYPES; ++i) + { + tree eltype = scalar_types[i]; + tree vectype; + if (eltype == boolean_type_node) + { + vectype = build_truth_vector_type_for_mode (BYTES_PER_SVE_VECTOR, + VNx16BImode); + gcc_assert (TYPE_MODE (vectype) == VNx16BImode + && TYPE_MODE (vectype) == TYPE_MODE_RAW (vectype) + && TYPE_ALIGN (vectype) == 16 + && known_eq (wi::to_poly_offset (TYPE_SIZE (vectype)), + BYTES_PER_SVE_VECTOR)); + } + else + { + unsigned int elbytes = tree_to_uhwi (TYPE_SIZE_UNIT (eltype)); + poly_uint64 nunits = exact_div (BYTES_PER_SVE_VECTOR, elbytes); + vectype = build_vector_type (eltype, nunits); + gcc_assert (VECTOR_MODE_P (TYPE_MODE (vectype)) + && TYPE_MODE (vectype) == TYPE_MODE_RAW (vectype) + && TYPE_ALIGN (vectype) == 128 + && known_eq (wi::to_poly_offset (TYPE_SIZE (vectype)), + BITS_PER_SVE_VECTOR)); + } + vectype = build_distinct_type_copy (vectype); + SET_TYPE_STRUCTURAL_EQUALITY (vectype); + TYPE_ARTIFICIAL (vectype) = 1; + abi_vector_types[i] = vectype; + lang_hooks.types.register_builtin_type (vectype, + vector_types[i].abi_name); + } +} + +/* Initialize all compiler built-ins related to SVE that should be + defined at start-up. */ +void +init_builtins () +{ + sve_switcher sve; + register_builtin_types (); +} + +/* Register vector type TYPE under its arm_sve.h name. */ +static void +register_vector_type (vector_type_index type) +{ + tree vectype = abi_vector_types[type]; + tree id = get_identifier (vector_types[type].acle_name); + tree decl = build_decl (input_location, TYPE_DECL, id, vectype); + decl = lang_hooks.decls.pushdecl (decl); + + /* Record the new ACLE type if pushdecl succeeded without error. Use + the ABI type otherwise, so that the type we record at least has the + right form, even if it doesn't have the right name. This should give + better error recovery behavior than installing error_mark_node or + installing an incorrect type. */ + if (TREE_CODE (decl) == TYPE_DECL + && TYPE_MAIN_VARIANT (TREE_TYPE (decl)) == vectype) + vectype = TREE_TYPE (decl); + acle_vector_types[0][type] = vectype; +} + +/* Register the tuple type that contains NUM_VECTORS vectors of type TYPE. */ +static void +register_tuple_type (unsigned int num_vectors, vector_type_index type) +{ + tree tuple_type = lang_hooks.types.make_type (RECORD_TYPE); + + /* The contents of the type are opaque, so we can define them in any + way that maps to the correct ABI type. + + Here we choose to use the same layout as for arm_neon.h, but with + "__val" instead of "val": + + struct svfooxN_t { svfoo_t __val[N]; }; + + (It wouldn't be possible to write that directly in C or C++ for + sizeless types, but that's not a problem for this function.) + + Using arrays simplifies the handling of svget and svset for variable + arguments. 
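From the user's side, the tuple types registered here behave exactly like the opaque struct described above, and svld2, svget2 and friends are the supported way to move data in and out of them. A sketch assuming the documented ACLE names svld2_s32, svget2_s32 and svadd_s32_m:

#include <arm_sve.h>
#include <stdint.h>

/* Load a de-interleaved pair of vectors from PTR and add them.  */
svint32_t
sum_of_pair (svbool_t pg, const int32_t *ptr)
{
  svint32x2_t pair = svld2_s32 (pg, ptr);
  svint32_t lo = svget2_s32 (pair, 0);
  svint32_t hi = svget2_s32 (pair, 1);
  return svadd_s32_m (pg, lo, hi);
}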
*/ + tree vector_type = acle_vector_types[0][type]; + tree array_type = build_array_type_nelts (vector_type, num_vectors); + gcc_assert (VECTOR_MODE_P (TYPE_MODE (array_type)) + && TYPE_MODE_RAW (array_type) == TYPE_MODE (array_type) + && TYPE_ALIGN (array_type) == 128); + + tree field = build_decl (input_location, FIELD_DECL, + get_identifier ("__val"), array_type); + DECL_FIELD_CONTEXT (field) = tuple_type; + TYPE_FIELDS (tuple_type) = field; + layout_type (tuple_type); + gcc_assert (VECTOR_MODE_P (TYPE_MODE (tuple_type)) + && TYPE_MODE_RAW (tuple_type) == TYPE_MODE (tuple_type) + && TYPE_ALIGN (tuple_type) == 128); + + /* Work out the structure name. */ + char buffer[sizeof ("svbfloat16x4_t")]; + const char *vector_type_name = vector_types[type].acle_name; + snprintf (buffer, sizeof (buffer), "%.*sx%d_t", + (int) strlen (vector_type_name) - 2, vector_type_name, + num_vectors); + + tree decl = build_decl (input_location, TYPE_DECL, + get_identifier (buffer), tuple_type); + TYPE_NAME (tuple_type) = decl; + TYPE_STUB_DECL (tuple_type) = decl; + lang_hooks.decls.pushdecl (decl); + /* ??? Undo the effect of set_underlying_type for C. The C frontend + doesn't recognize DECL as a built-in because (as intended) the decl has + a real location instead of BUILTINS_LOCATION. The frontend therefore + treats the decl like a normal C "typedef struct foo foo;", expecting + the type for tag "struct foo" to have a dummy unnamed TYPE_DECL instead + of the named one we attached above. It then sets DECL_ORIGINAL_TYPE + on the supposedly unnamed decl, creating a circularity that upsets + dwarf2out. + + We don't want to follow the normal C model and create "struct foo" + tags for tuple types since (a) the types are supposed to be opaque + and (b) they couldn't be defined as a real struct anyway. Treating + the TYPE_DECLs as "typedef struct foo foo;" without creating + "struct foo" would lead to confusing error messages. */ + DECL_ORIGINAL_TYPE (decl) = NULL_TREE; + + acle_vector_types[num_vectors - 1][type] = tuple_type; +} + +/* Register the svpattern enum. */ +static void +register_svpattern () +{ + auto_vec values; +#define PUSH(UPPER, LOWER, VALUE) \ + values.quick_push (string_int_pair ("SV_" #UPPER, VALUE)); + AARCH64_FOR_SVPATTERN (PUSH) +#undef PUSH + + acle_svpattern = lang_hooks.types.simulate_enum_decl (input_location, + "svpattern", values); +} + +/* Register the svprfop enum. */ +static void +register_svprfop () +{ + auto_vec values; +#define PUSH(UPPER, LOWER, VALUE) \ + values.quick_push (string_int_pair ("SV_" #UPPER, VALUE)); + AARCH64_FOR_SVPRFOP (PUSH) +#undef PUSH + + acle_svprfop = lang_hooks.types.simulate_enum_decl (input_location, + "svprfop", values); +} + +/* Implement #pragma GCC aarch64 "arm_sve.h". */ +void +handle_arm_sve_h () +{ + if (function_table) + { + error ("duplicate definition of %qs", "arm_sve.h"); + return; + } + + sve_switcher sve; + + /* Define the vector and tuple types. */ + for (unsigned int type_i = 0; type_i < NUM_VECTOR_TYPES; ++type_i) + { + vector_type_index type = vector_type_index (type_i); + register_vector_type (type); + if (type != VECTOR_TYPE_svbool_t) + for (unsigned int count = 2; count <= MAX_TUPLE_SIZE; ++count) + register_tuple_type (count, type); + } + + /* Define the enums. */ + register_svpattern (); + register_svprfop (); + + /* Define the functions. 
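The simulated svpattern and svprfop enums registered above are what allow users to pass the SV_* constants straight to the intrinsics. A sketch assuming the documented ACLE names svptrue_pat_b32 and svprfb and the enum values SV_VL4 and SV_PLDL1KEEP:

#include <arm_sve.h>

/* A predicate covering only the first four 32-bit lanes.  */
svbool_t
first_four_lanes (void)
{
  return svptrue_pat_b32 (SV_VL4);
}

/* Predicated byte prefetch for an L1 "keep" load.  */
void
prefetch_for_load (svbool_t pg, const void *ptr)
{
  svprfb (pg, ptr, SV_PLDL1KEEP);
}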
*/ + function_table = new hash_table (1023); + function_builder builder; + for (unsigned int i = 0; i < ARRAY_SIZE (function_groups); ++i) + builder.register_function_group (function_groups[i]); +} + +/* Return the function decl with SVE function subcode CODE, or error_mark_node + if no such function exists. */ +tree +builtin_decl (unsigned int code, bool) +{ + if (code >= vec_safe_length (registered_functions)) + return error_mark_node; + return (*registered_functions)[code]->decl; +} + +/* If we're implementing manual overloading, check whether the SVE + function with subcode CODE is overloaded, and if so attempt to + determine the corresponding non-overloaded function. The call + occurs at location LOCATION and has the arguments given by ARGLIST. + + If the call is erroneous, report an appropriate error and return + error_mark_node. Otherwise, if the function is overloaded, return + the decl of the non-overloaded function. Return NULL_TREE otherwise, + indicating that the call should be processed in the normal way. */ +tree +resolve_overloaded_builtin (location_t location, unsigned int code, + vec *arglist) +{ + if (code >= vec_safe_length (registered_functions)) + return NULL_TREE; + + registered_function &rfn = *(*registered_functions)[code]; + if (rfn.overloaded_p) + return function_resolver (location, rfn.instance, rfn.decl, + *arglist).resolve (); + return NULL_TREE; +} + +/* Perform any semantic checks needed for a call to the SVE function + with subcode CODE, such as testing for integer constant expressions. + The call occurs at location LOCATION and has NARGS arguments, + given by ARGS. FNDECL is the original function decl, before + overload resolution. + + Return true if the call is valid, otherwise report a suitable error. */ +bool +check_builtin_call (location_t location, vec, unsigned int code, + tree fndecl, unsigned int nargs, tree *args) +{ + const registered_function &rfn = *(*registered_functions)[code]; + if (!check_required_extensions (location, rfn.decl, rfn.required_extensions)) + return false; + return function_checker (location, rfn.instance, fndecl, + TREE_TYPE (rfn.decl), nargs, args).check (); +} + +/* Attempt to fold STMT, given that it's a call to the SVE function + with subcode CODE. Return the new statement on success and null + on failure. Insert any other new statements at GSI. */ +gimple * +gimple_fold_builtin (unsigned int code, gimple_stmt_iterator *gsi, gcall *stmt) +{ + registered_function &rfn = *(*registered_functions)[code]; + return gimple_folder (rfn.instance, rfn.decl, gsi, stmt).fold (); +} + +/* Expand a call to the SVE function with subcode CODE. EXP is the call + expression and TARGET is the preferred location for the result. + Return the value of the lhs. */ +rtx +expand_builtin (unsigned int code, tree exp, rtx target) +{ + registered_function &rfn = *(*registered_functions)[code]; + if (!check_required_extensions (EXPR_LOCATION (exp), rfn.decl, + rfn.required_extensions)) + return target; + return function_expander (rfn.instance, rfn.decl, exp, target).expand (); +} + +/* Return true if TYPE is the ABI-defined __SVBool_t type. */ +bool +svbool_type_p (const_tree type) +{ + tree abi_type = abi_vector_types[VECTOR_TYPE_svbool_t]; + return (type != error_mark_node + && TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT (abi_type)); +} + +/* If TYPE is a built-in type defined by the SVE ABI, return the mangled name, + otherwise return NULL. 
*/ +const char * +mangle_builtin_type (const_tree type) +{ + if (type == error_mark_node) + return NULL; + + vector_type_index vtype = find_vector_type (type); + if (vtype != NUM_VECTOR_TYPES) + return vector_types[vtype].mangled_name; + + return NULL; +} + +/* If TYPE is one of the ABI-defined SVE vector types, or an ACLE-defined + tuple of them, return the number of vectors it contains. Return 0 + otherwise. */ +unsigned int +nvectors_if_data_type (const_tree type) +{ + if (type == error_mark_node) + return 0; + + type = TYPE_MAIN_VARIANT (type); + if (VECTOR_TYPE_P (type)) + { + vector_type_index type_id = find_vector_type (type); + if (type_id != VECTOR_TYPE_svbool_t && type_id != NUM_VECTOR_TYPES) + return 1; + } + else if (TREE_CODE (type) == RECORD_TYPE) + { + for (unsigned int size_i = 1; size_i < MAX_TUPLE_SIZE; ++size_i) + for (unsigned int type_i = 0; type_i < NUM_VECTOR_TYPES; ++type_i) + { + tree tuple_type = acle_vector_types[size_i][type_i]; + if (tuple_type && type == TYPE_MAIN_VARIANT (tuple_type)) + return size_i + 1; + } + } + + return 0; +} + +/* Return true if TYPE is a built-in type defined by the SVE ABI. */ +bool +builtin_type_p (const_tree type) +{ + return svbool_type_p (type) || nvectors_if_data_type (type) > 0; +} + +} + +using namespace aarch64_sve; + +inline void +gt_ggc_mx (function_instance *) +{ +} + +inline void +gt_pch_nx (function_instance *) +{ +} + +inline void +gt_pch_nx (function_instance *, void (*) (void *, void *), void *) +{ +} + +#include "gt-aarch64-sve-builtins.h" diff --git a/gcc/config/aarch64/aarch64-sve-builtins.def b/gcc/config/aarch64/aarch64-sve-builtins.def new file mode 100644 index 000000000..83fba0d41 --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins.def @@ -0,0 +1,100 @@ +/* Builtin lists for AArch64 SVE + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . 
*/ + +#ifndef DEF_SVE_MODE +#define DEF_SVE_MODE(A, B, C, D) +#endif + +#ifndef DEF_SVE_TYPE +#define DEF_SVE_TYPE(A, B, C, D) +#endif + +#ifndef DEF_SVE_TYPE_SUFFIX +#define DEF_SVE_TYPE_SUFFIX(A, B, C, D, E) +#endif + +#ifndef DEF_SVE_FUNCTION +#define DEF_SVE_FUNCTION(A, B, C, D) +#endif + +DEF_SVE_MODE (n, none, none, none) +DEF_SVE_MODE (index, none, none, elements) +DEF_SVE_MODE (offset, none, none, bytes) +DEF_SVE_MODE (s32index, none, svint32_t, elements) +DEF_SVE_MODE (s32offset, none, svint32_t, bytes) +DEF_SVE_MODE (s64index, none, svint64_t, elements) +DEF_SVE_MODE (s64offset, none, svint64_t, bytes) +DEF_SVE_MODE (u32base, svuint32_t, none, none) +DEF_SVE_MODE (u32base_index, svuint32_t, none, elements) +DEF_SVE_MODE (u32base_offset, svuint32_t, none, bytes) +DEF_SVE_MODE (u32base_s32index, svuint32_t, svint32_t, elements) +DEF_SVE_MODE (u32base_s32offset, svuint32_t, svint32_t, bytes) +DEF_SVE_MODE (u32base_u32index, svuint32_t, svuint32_t, elements) +DEF_SVE_MODE (u32base_u32offset, svuint32_t, svuint32_t, bytes) +DEF_SVE_MODE (u32index, none, svuint32_t, elements) +DEF_SVE_MODE (u32offset, none, svuint32_t, bytes) +DEF_SVE_MODE (u64base, svuint64_t, none, none) +DEF_SVE_MODE (u64base_index, svuint64_t, none, elements) +DEF_SVE_MODE (u64base_offset, svuint64_t, none, bytes) +DEF_SVE_MODE (u64base_s64index, svuint64_t, svint64_t, elements) +DEF_SVE_MODE (u64base_s64offset, svuint64_t, svint64_t, bytes) +DEF_SVE_MODE (u64base_u64index, svuint64_t, svuint64_t, elements) +DEF_SVE_MODE (u64base_u64offset, svuint64_t, svuint64_t, bytes) +DEF_SVE_MODE (u64index, none, svuint64_t, elements) +DEF_SVE_MODE (u64offset, none, svuint64_t, bytes) +DEF_SVE_MODE (vnum, none, none, vectors) + +DEF_SVE_TYPE (svbool_t, 10, __SVBool_t, boolean_type_node) +DEF_SVE_TYPE (svbfloat16_t, 14, __SVBfloat16_t, aarch64_bf16_type_node) +DEF_SVE_TYPE (svfloat16_t, 13, __SVFloat16_t, aarch64_fp16_type_node) +DEF_SVE_TYPE (svfloat32_t, 13, __SVFloat32_t, float_type_node) +DEF_SVE_TYPE (svfloat64_t, 13, __SVFloat64_t, double_type_node) +DEF_SVE_TYPE (svint8_t, 10, __SVInt8_t, intQI_type_node) +DEF_SVE_TYPE (svint16_t, 11, __SVInt16_t, intHI_type_node) +DEF_SVE_TYPE (svint32_t, 11, __SVInt32_t, intSI_type_node) +DEF_SVE_TYPE (svint64_t, 11, __SVInt64_t, intDI_type_node) +DEF_SVE_TYPE (svuint8_t, 11, __SVUint8_t, unsigned_intQI_type_node) +DEF_SVE_TYPE (svuint16_t, 12, __SVUint16_t, unsigned_intHI_type_node) +DEF_SVE_TYPE (svuint32_t, 12, __SVUint32_t, unsigned_intSI_type_node) +DEF_SVE_TYPE (svuint64_t, 12, __SVUint64_t, unsigned_intDI_type_node) + +DEF_SVE_TYPE_SUFFIX (b, svbool_t, bool, 8, VNx16BImode) +DEF_SVE_TYPE_SUFFIX (b8, svbool_t, bool, 8, VNx16BImode) +DEF_SVE_TYPE_SUFFIX (b16, svbool_t, bool, 16, VNx8BImode) +DEF_SVE_TYPE_SUFFIX (b32, svbool_t, bool, 32, VNx4BImode) +DEF_SVE_TYPE_SUFFIX (b64, svbool_t, bool, 64, VNx2BImode) +DEF_SVE_TYPE_SUFFIX (bf16, svbfloat16_t, bfloat, 16, VNx8BFmode) +DEF_SVE_TYPE_SUFFIX (f16, svfloat16_t, float, 16, VNx8HFmode) +DEF_SVE_TYPE_SUFFIX (f32, svfloat32_t, float, 32, VNx4SFmode) +DEF_SVE_TYPE_SUFFIX (f64, svfloat64_t, float, 64, VNx2DFmode) +DEF_SVE_TYPE_SUFFIX (s8, svint8_t, signed, 8, VNx16QImode) +DEF_SVE_TYPE_SUFFIX (s16, svint16_t, signed, 16, VNx8HImode) +DEF_SVE_TYPE_SUFFIX (s32, svint32_t, signed, 32, VNx4SImode) +DEF_SVE_TYPE_SUFFIX (s64, svint64_t, signed, 64, VNx2DImode) +DEF_SVE_TYPE_SUFFIX (u8, svuint8_t, unsigned, 8, VNx16QImode) +DEF_SVE_TYPE_SUFFIX (u16, svuint16_t, unsigned, 16, VNx8HImode) +DEF_SVE_TYPE_SUFFIX (u32, svuint32_t, unsigned, 32, 
VNx4SImode) +DEF_SVE_TYPE_SUFFIX (u64, svuint64_t, unsigned, 64, VNx2DImode) + +#include "aarch64-sve-builtins-base.def" + +#undef DEF_SVE_FUNCTION +#undef DEF_SVE_TYPE_SUFFIX +#undef DEF_SVE_TYPE +#undef DEF_SVE_MODE diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h new file mode 100644 index 000000000..d1aa612b9 --- /dev/null +++ b/gcc/config/aarch64/aarch64-sve-builtins.h @@ -0,0 +1,878 @@ +/* ACLE support for AArch64 SVE + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#ifndef GCC_AARCH64_SVE_BUILTINS_H +#define GCC_AARCH64_SVE_BUILTINS_H + +/* The full name of an SVE ACLE function is the concatenation of: + + - the base name ("svadd", etc.) + - the "mode" suffix ("_n", "_index", etc.) + - the type suffixes ("_s32", "_b8", etc.) + - the predication suffix ("_x", "_z", etc.) + + Each piece of information is individually useful, so we retain this + classification throughout: + + - function_base represents the base name + + - mode_suffix_index represents the mode suffix + + - type_suffix_index represents individual type suffixes, while + type_suffix_pair represents a pair of them + + - prediction_index extends the predication suffix with an additional + alternative: PRED_implicit for implicitly-predicated operations + + In addition to its unique full name, a function may have a shorter + overloaded alias. This alias removes pieces of the suffixes that + can be inferred from the arguments, such as by shortening the mode + suffix or dropping some of the type suffixes. The base name and the + predication suffix stay the same. + + The function_shape class describes what arguments a given function + takes and what its overloaded alias is called. In broad terms, + function_base describes how the underlying instruction behaves while + function_shape describes how that instruction has been presented at + the language level. + + The static list of functions uses function_group to describe a group + of related functions. The function_builder class is responsible for + expanding this static description into a list of individual functions + and registering the associated built-in functions. function_instance + describes one of these individual functions in terms of the properties + described above. + + The classes involved in compiling a function call are: + + - function_resolver, which resolves an overloaded function call to a + specific function_instance and its associated function decl + + - function_checker, which checks whether the values of the arguments + conform to the ACLE specification + + - gimple_folder, which tries to fold a function call at the gimple level + + - function_expander, which expands a function call into rtl instructions + + function_resolver and function_checker operate at the language level + and so are associated with the function_shape. 
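As a concrete instance of the naming scheme described above: the full name svadd_n_u32_m is the base name "svadd", the mode suffix "_n", the type suffix "_u32" and the predication suffix "_m", while its overloaded alias drops every piece that can be inferred from the arguments. A sketch; the exact intrinsic spellings are taken from the ACLE documentation and assumed available via arm_sve.h:

#include <arm_sve.h>

/* Full, non-overloaded name.  */
svuint32_t
add_three_full (svbool_t pg, svuint32_t a)
{
  return svadd_n_u32_m (pg, a, 3);
}

/* Overloaded alias: "_n" and "_u32" are inferred, so only the base name
   and the predication suffix remain.  */
svuint32_t
add_three_short (svbool_t pg, svuint32_t a)
{
  return svadd_m (pg, a, 3);
}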
gimple_folder and + function_expander are concerned with the behavior of the function + and so are associated with the function_base. + + Note that we've specifically chosen not to fold calls in the frontend, + since SVE intrinsics will hardly ever fold a useful language-level + constant. */ +namespace aarch64_sve +{ +/* The maximum number of vectors in an ACLE tuple type. */ +const unsigned int MAX_TUPLE_SIZE = 4; + +/* Used to represent the default merge argument index for _m functions. + The actual index depends on how many arguments the function takes. */ +const unsigned int DEFAULT_MERGE_ARGNO = ~0U; + +/* Flags that describe what a function might do, in addition to reading + its arguments and returning a result. */ +const unsigned int CP_READ_FPCR = 1U << 0; +const unsigned int CP_RAISE_FP_EXCEPTIONS = 1U << 1; +const unsigned int CP_READ_MEMORY = 1U << 2; +const unsigned int CP_PREFETCH_MEMORY = 1U << 3; +const unsigned int CP_WRITE_MEMORY = 1U << 4; +const unsigned int CP_READ_FFR = 1U << 5; +const unsigned int CP_WRITE_FFR = 1U << 6; + +/* Enumerates the SVE predicate and (data) vector types, together called + "vector types" for brevity. */ +enum vector_type_index +{ +#define DEF_SVE_TYPE(ACLE_NAME, NCHARS, ABI_NAME, SCALAR_TYPE) \ + VECTOR_TYPE_ ## ACLE_NAME, +#include "aarch64-sve-builtins.def" + NUM_VECTOR_TYPES +}; + +/* Classifies the available measurement units for an address displacement. */ +enum units_index +{ + UNITS_none, + UNITS_bytes, + UNITS_elements, + UNITS_vectors +}; + +/* Describes the various uses of a governing predicate. */ +enum predication_index +{ + /* No governing predicate is present. */ + PRED_none, + + /* A governing predicate is present but there is no predication suffix + associated with it. This is used when the result is neither a vector + nor a predicate, since the distinction between "zeroing" and "merging" + doesn't apply in that case. It is also used when a suffix would be + redundant (such as for loads and comparisons, which are inherently + zeroing operations). */ + PRED_implicit, + + /* Merging predication: copy inactive lanes from the first data argument + to the vector result. */ + PRED_m, + + /* "Don't care" predication: set inactive lanes of the vector result + to arbitrary values. */ + PRED_x, + + /* Zero predication: set inactive lanes of the vector result to zero. */ + PRED_z, + + NUM_PREDS +}; + +/* Classifies element types, based on type suffixes with the bit count + removed. */ +enum type_class_index +{ + TYPE_bool, + TYPE_bfloat, + TYPE_float, + TYPE_signed, + TYPE_unsigned, + NUM_TYPE_CLASSES +}; + +/* Classifies an operation into "modes"; for example, to distinguish + vector-scalar operations from vector-vector operations, or to + distinguish between different addressing modes. This classification + accounts for the function suffixes that occur between the base name + and the first type suffix. */ +enum mode_suffix_index +{ +#define DEF_SVE_MODE(NAME, BASE, DISPLACEMENT, UNITS) MODE_##NAME, +#include "aarch64-sve-builtins.def" + MODE_none +}; + +/* Enumerates the possible type suffixes. Each suffix is associated with + a vector type, but for predicates provides extra information about the + element size. */ +enum type_suffix_index +{ +#define DEF_SVE_TYPE_SUFFIX(NAME, ACLE_TYPE, CLASS, BITS, MODE) \ + TYPE_SUFFIX_ ## NAME, +#include "aarch64-sve-builtins.def" + NUM_TYPE_SUFFIXES +}; + +/* Combines two type suffixes. 
*/ +typedef enum type_suffix_index type_suffix_pair[2]; + +class function_base; +class function_shape; + +/* Static information about a mode suffix. */ +struct mode_suffix_info +{ + /* The suffix string itself. */ + const char *string; + + /* The type of the vector base address, or NUM_VECTOR_TYPES if the + mode does not include a vector base address. */ + vector_type_index base_vector_type; + + /* The type of the vector displacement, or NUM_VECTOR_TYPES if the + mode does not include a vector displacement. (Note that scalar + displacements are always int64_t.) */ + vector_type_index displacement_vector_type; + + /* The units in which the vector or scalar displacement is measured, + or UNITS_none if the mode doesn't take a displacement. */ + units_index displacement_units; +}; + +/* Static information about a type suffix. */ +struct type_suffix_info +{ + /* The suffix string itself. */ + const char *string; + + /* The associated ACLE vector or predicate type. */ + vector_type_index vector_type : 8; + + /* What kind of type the suffix represents. */ + type_class_index tclass : 8; + + /* The number of bits and bytes in an element. For predicates this + measures the associated data elements. */ + unsigned int element_bits : 8; + unsigned int element_bytes : 8; + + /* True if the suffix is for an integer type. */ + unsigned int integer_p : 1; + /* True if the suffix is for an unsigned type. */ + unsigned int unsigned_p : 1; + /* True if the suffix is for a floating-point type. */ + unsigned int float_p : 1; + /* True if the suffix is for a boolean type. */ + unsigned int bool_p : 1; + unsigned int spare : 12; + + /* The associated vector or predicate mode. */ + machine_mode vector_mode : 16; +}; + +/* Static information about a set of functions. */ +struct function_group_info +{ + /* The base name, as a string. */ + const char *base_name; + + /* Describes the behavior associated with the function base name. */ + const function_base *const *base; + + /* The shape of the functions, as described above the class definition. + It's possible to have entries with the same base name but different + shapes. */ + const function_shape *const *shape; + + /* A list of the available type suffixes, and of the available predication + types. The function supports every combination of the two. + + The list of type suffixes is terminated by two NUM_TYPE_SUFFIXES + while the list of predication types is terminated by NUM_PREDS. + The list of type suffixes is lexicographically ordered based + on the index value. */ + const type_suffix_pair *types; + const predication_index *preds; + + /* The architecture extensions that the functions require, as a set of + AARCH64_FL_* flags. */ + uint64_t required_extensions; +}; + +/* Describes a single fully-resolved function (i.e. one that has a + unique full name). 
*/ +class GTY((user)) function_instance +{ +public: + function_instance (const char *, const function_base *, + const function_shape *, mode_suffix_index, + const type_suffix_pair &, predication_index); + + bool operator== (const function_instance &) const; + bool operator!= (const function_instance &) const; + hashval_t hash () const; + + unsigned int call_properties () const; + bool reads_global_state_p () const; + bool modifies_global_state_p () const; + bool could_trap_p () const; + + unsigned int vectors_per_tuple () const; + tree memory_scalar_type () const; + machine_mode memory_vector_mode () const; + + const mode_suffix_info &mode_suffix () const; + tree base_vector_type () const; + tree displacement_vector_type () const; + units_index displacement_units () const; + + const type_suffix_info &type_suffix (unsigned int) const; + tree scalar_type (unsigned int) const; + tree vector_type (unsigned int) const; + tree tuple_type (unsigned int) const; + unsigned int elements_per_vq (unsigned int i) const; + machine_mode vector_mode (unsigned int) const; + machine_mode gp_mode (unsigned int) const; + + /* The properties of the function. (The explicit "enum"s are required + for gengtype.) */ + const char *base_name; + const function_base *base; + const function_shape *shape; + enum mode_suffix_index mode_suffix_id; + type_suffix_pair type_suffix_ids; + enum predication_index pred; +}; + +class registered_function; + +/* A class for building and registering function decls. */ +class function_builder +{ +public: + function_builder (); + ~function_builder (); + + void add_unique_function (const function_instance &, tree, + vec &, uint64_t, bool); + void add_overloaded_function (const function_instance &, uint64_t); + void add_overloaded_functions (const function_group_info &, + mode_suffix_index); + + void register_function_group (const function_group_info &); + +private: + void append_name (const char *); + char *finish_name (); + + char *get_name (const function_instance &, bool); + + tree get_attributes (const function_instance &); + + registered_function &add_function (const function_instance &, + const char *, tree, tree, uint64_t, bool); + + /* The function type to use for functions that are resolved by + function_resolver. */ + tree m_overload_type; + + /* True if we should create a separate decl for each instance of an + overloaded function, instead of using function_resolver. */ + bool m_direct_overloads; + + /* Used for building up function names. */ + obstack m_string_obstack; + + /* Maps all overloaded function names that we've registered so far + to their associated function_instances. */ + hash_map m_overload_names; +}; + +/* A base class for handling calls to built-in functions. */ +class function_call_info : public function_instance +{ +public: + function_call_info (location_t, const function_instance &, tree); + + bool function_returns_void_p (); + + /* The location of the call. */ + location_t location; + + /* The FUNCTION_DECL that is being called. */ + tree fndecl; +}; + +/* A class for resolving an overloaded function call. 
*/ +class function_resolver : public function_call_info +{ +public: + enum { SAME_SIZE = 256, HALF_SIZE, QUARTER_SIZE }; + static const type_class_index SAME_TYPE_CLASS = NUM_TYPE_CLASSES; + + function_resolver (location_t, const function_instance &, tree, + vec &); + + tree get_vector_type (type_suffix_index); + const char *get_scalar_type_name (type_suffix_index); + tree get_argument_type (unsigned int); + bool scalar_argument_p (unsigned int); + + tree report_no_such_form (type_suffix_index); + tree lookup_form (mode_suffix_index, + type_suffix_index = NUM_TYPE_SUFFIXES, + type_suffix_index = NUM_TYPE_SUFFIXES); + tree resolve_to (mode_suffix_index, + type_suffix_index = NUM_TYPE_SUFFIXES, + type_suffix_index = NUM_TYPE_SUFFIXES); + + type_suffix_index infer_integer_scalar_type (unsigned int); + type_suffix_index infer_pointer_type (unsigned int, bool = false); + type_suffix_index infer_vector_or_tuple_type (unsigned int, unsigned int); + type_suffix_index infer_vector_type (unsigned int); + type_suffix_index infer_integer_vector_type (unsigned int); + type_suffix_index infer_unsigned_vector_type (unsigned int); + type_suffix_index infer_sd_vector_type (unsigned int); + type_suffix_index infer_tuple_type (unsigned int); + + bool require_vector_or_scalar_type (unsigned int); + + bool require_vector_type (unsigned int, vector_type_index); + bool require_matching_vector_type (unsigned int, type_suffix_index); + bool require_derived_vector_type (unsigned int, unsigned int, + type_suffix_index, + type_class_index = SAME_TYPE_CLASS, + unsigned int = SAME_SIZE); + + bool require_scalar_type (unsigned int, const char *); + bool require_pointer_type (unsigned int); + bool require_matching_integer_scalar_type (unsigned int, unsigned int, + type_suffix_index); + bool require_derived_scalar_type (unsigned int, type_class_index, + unsigned int = SAME_SIZE); + bool require_matching_pointer_type (unsigned int, unsigned int, + type_suffix_index); + bool require_integer_immediate (unsigned int); + + vector_type_index infer_vector_base_type (unsigned int); + vector_type_index infer_vector_displacement_type (unsigned int); + + mode_suffix_index resolve_sv_displacement (unsigned int, + type_suffix_index, bool); + mode_suffix_index resolve_gather_address (unsigned int, + type_suffix_index, bool); + mode_suffix_index resolve_adr_address (unsigned int); + + bool check_num_arguments (unsigned int); + bool check_gp_argument (unsigned int, unsigned int &, unsigned int &); + tree resolve_unary (type_class_index = SAME_TYPE_CLASS, + unsigned int = SAME_SIZE, bool = false); + tree resolve_uniform (unsigned int, unsigned int = 0); + tree resolve_uniform_opt_n (unsigned int); + tree finish_opt_n_resolution (unsigned int, unsigned int, type_suffix_index, + type_class_index = SAME_TYPE_CLASS, + unsigned int = SAME_SIZE, + type_suffix_index = NUM_TYPE_SUFFIXES); + + tree resolve (); + +private: + /* The arguments to the overloaded function. */ + vec &m_arglist; +}; + +/* A class for checking that the semantic constraints on a function call are + satisfied, such as arguments being integer constant expressions with + a particular range. The parent class's FNDECL is the decl that was + called in the original source, before overload resolution. 
*/ +class function_checker : public function_call_info +{ +public: + function_checker (location_t, const function_instance &, tree, + tree, unsigned int, tree *); + + bool require_immediate_either_or (unsigned int, HOST_WIDE_INT, + HOST_WIDE_INT); + bool require_immediate_enum (unsigned int, tree); + bool require_immediate_lane_index (unsigned int, unsigned int = 1); + bool require_immediate_one_of (unsigned int, HOST_WIDE_INT, HOST_WIDE_INT, + HOST_WIDE_INT, HOST_WIDE_INT); + bool require_immediate_range (unsigned int, HOST_WIDE_INT, HOST_WIDE_INT); + + bool check (); + +private: + bool argument_exists_p (unsigned int); + + bool require_immediate (unsigned int, HOST_WIDE_INT &); + + /* The type of the resolved function. */ + tree m_fntype; + + /* The arguments to the function. */ + unsigned int m_nargs; + tree *m_args; + + /* The first argument not associated with the function's predication + type. */ + unsigned int m_base_arg; +}; + +/* A class for folding a gimple function call. */ +class gimple_folder : public function_call_info +{ +public: + gimple_folder (const function_instance &, tree, + gimple_stmt_iterator *, gcall *); + + tree convert_pred (gimple_seq &, tree, unsigned int); + tree fold_contiguous_base (gimple_seq &, tree); + tree load_store_cookie (tree); + + gimple *redirect_call (const function_instance &); + gimple *fold_to_pfalse (); + gimple *fold_to_ptrue (); + gimple *fold_to_vl_pred (unsigned int); + + gimple *fold (); + + /* Where to insert extra statements that feed the final replacement. */ + gimple_stmt_iterator *gsi; + + /* The call we're folding. */ + gcall *call; + + /* The result of the call, or null if none. */ + tree lhs; +}; + +/* A class for expanding a function call into RTL. */ +class function_expander : public function_call_info +{ +public: + function_expander (const function_instance &, tree, tree, rtx); + rtx expand (); + + insn_code direct_optab_handler (optab, unsigned int = 0); + insn_code direct_optab_handler_for_sign (optab, optab, unsigned int = 0, + machine_mode = E_VOIDmode); + + bool overlaps_input_p (rtx); + + rtx get_contiguous_base (machine_mode); + rtx get_fallback_value (machine_mode, unsigned int, + unsigned int, unsigned int &); + rtx get_reg_target (); + rtx get_nonoverlapping_reg_target (); + + void add_output_operand (insn_code); + void add_input_operand (insn_code, rtx); + void add_integer_operand (HOST_WIDE_INT); + void add_mem_operand (machine_mode, rtx); + void add_address_operand (rtx); + void add_fixed_operand (rtx); + rtx generate_insn (insn_code); + + void prepare_gather_address_operands (unsigned int, bool = true); + void prepare_prefetch_operands (); + void add_ptrue_hint (unsigned int, machine_mode); + void rotate_inputs_left (unsigned int, unsigned int); + bool try_negating_argument (unsigned int, machine_mode); + + rtx use_exact_insn (insn_code); + rtx use_unpred_insn (insn_code); + rtx use_pred_x_insn (insn_code); + rtx use_cond_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO); + rtx use_vcond_mask_insn (insn_code, unsigned int = DEFAULT_MERGE_ARGNO); + rtx use_contiguous_load_insn (insn_code); + rtx use_contiguous_prefetch_insn (insn_code); + rtx use_contiguous_store_insn (insn_code); + + rtx map_to_rtx_codes (rtx_code, rtx_code, int, + unsigned int = DEFAULT_MERGE_ARGNO); + rtx map_to_unspecs (int, int, int, unsigned int = DEFAULT_MERGE_ARGNO); + rtx expand_signed_unpred_op (rtx_code, rtx_code); + + /* The function call expression. 
*/ + tree call_expr; + + /* For functions that return a value, this is the preferred location + of that value. It could be null or could have a different mode + from the function return type. */ + rtx possible_target; + + /* The expanded arguments. */ + auto_vec args; + +private: + /* Used to build up the operands to an instruction. */ + auto_vec m_ops; +}; + +/* Provides information about a particular function base name, and handles + tasks related to the base name. */ +class function_base +{ +public: + /* Return a set of CP_* flags that describe what the function might do, + in addition to reading its arguments and returning a result. */ + virtual unsigned int call_properties (const function_instance &) const; + + /* If the function operates on tuples of vectors, return the number + of vectors in the tuples, otherwise return 1. */ + virtual unsigned int vectors_per_tuple () const { return 1; } + + /* If the function addresses memory, return the type of a single + scalar memory element. */ + virtual tree + memory_scalar_type (const function_instance &) const + { + gcc_unreachable (); + } + + /* If the function addresses memory, return a vector mode whose + GET_MODE_NUNITS is the number of elements addressed and whose + GET_MODE_INNER is the mode of a single scalar memory element. */ + virtual machine_mode + memory_vector_mode (const function_instance &) const + { + gcc_unreachable (); + } + + /* Try to fold the given gimple call. Return the new gimple statement + on success, otherwise return null. */ + virtual gimple *fold (gimple_folder &) const { return NULL; } + + /* Expand the given call into rtl. Return the result of the function, + or an arbitrary value if the function doesn't return a result. */ + virtual rtx expand (function_expander &) const = 0; +}; + +/* Classifies functions into "shapes". The idea is to take all the + type signatures for a set of functions, remove the governing predicate + (if any), and classify what's left based on: + + - the number of arguments + + - the process of determining the types in the signature from the mode + and type suffixes in the function name (including types that are not + affected by the suffixes) + + - which arguments must be integer constant expressions, and what range + those arguments have + + - the process for mapping overloaded names to "full" names. */ +class function_shape +{ +public: + virtual bool explicit_type_suffix_p (unsigned int) const = 0; + + /* Define all functions associated with the given group. */ + virtual void build (function_builder &, + const function_group_info &) const = 0; + + /* Try to resolve the overloaded call. Return the non-overloaded + function decl on success and error_mark_node on failure. */ + virtual tree resolve (function_resolver &) const = 0; + + /* Check whether the given call is semantically valid. Return true + if it is, otherwise report an error and return false. */ + virtual bool check (function_checker &) const { return true; } +}; + +/* RAII class for enabling enough SVE features to define the built-in + types and implement the arm_sve.h pragma. 
*/ +class sve_switcher +{ +public: + sve_switcher (); + ~sve_switcher (); + +private: + unsigned long m_old_isa_flags; + bool m_old_have_regs_of_mode[MAX_MACHINE_MODE]; +}; + +extern const type_suffix_info type_suffixes[NUM_TYPE_SUFFIXES + 1]; +extern const mode_suffix_info mode_suffixes[MODE_none + 1]; + +extern tree scalar_types[NUM_VECTOR_TYPES]; +extern tree acle_vector_types[MAX_TUPLE_SIZE][NUM_VECTOR_TYPES + 1]; +extern tree acle_svpattern; +extern tree acle_svprfop; + +/* Return the ACLE type svbool_t. */ +inline tree +get_svbool_t (void) +{ + return acle_vector_types[0][VECTOR_TYPE_svbool_t]; +} + +/* Try to find a mode with the given mode_suffix_info fields. Return the + mode on success or MODE_none on failure. */ +inline mode_suffix_index +find_mode_suffix (vector_type_index base_vector_type, + vector_type_index displacement_vector_type, + units_index displacement_units) +{ + for (unsigned int mode_i = 0; mode_i < ARRAY_SIZE (mode_suffixes); ++mode_i) + { + const mode_suffix_info &mode = mode_suffixes[mode_i]; + if (mode.base_vector_type == base_vector_type + && mode.displacement_vector_type == displacement_vector_type + && mode.displacement_units == displacement_units) + return mode_suffix_index (mode_i); + } + return MODE_none; +} + +/* Return the type suffix associated with ELEMENT_BITS-bit elements of type + class TCLASS. */ +inline type_suffix_index +find_type_suffix (type_class_index tclass, unsigned int element_bits) +{ + for (unsigned int i = 0; i < NUM_TYPE_SUFFIXES; ++i) + if (type_suffixes[i].tclass == tclass + && type_suffixes[i].element_bits == element_bits) + return type_suffix_index (i); + gcc_unreachable (); +} + +/* Return the single field in tuple type TYPE. */ +inline tree +tuple_type_field (tree type) +{ + for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + if (TREE_CODE (field) == FIELD_DECL) + return field; + gcc_unreachable (); +} + +inline function_instance:: +function_instance (const char *base_name_in, + const function_base *base_in, + const function_shape *shape_in, + mode_suffix_index mode_suffix_id_in, + const type_suffix_pair &type_suffix_ids_in, + predication_index pred_in) + : base_name (base_name_in), base (base_in), shape (shape_in), + mode_suffix_id (mode_suffix_id_in), pred (pred_in) +{ + memcpy (type_suffix_ids, type_suffix_ids_in, sizeof (type_suffix_ids)); +} + +inline bool +function_instance::operator== (const function_instance &other) const +{ + return (base == other.base + && shape == other.shape + && mode_suffix_id == other.mode_suffix_id + && pred == other.pred + && type_suffix_ids[0] == other.type_suffix_ids[0] + && type_suffix_ids[1] == other.type_suffix_ids[1]); +} + +inline bool +function_instance::operator!= (const function_instance &other) const +{ + return !operator== (other); +} + +/* If the function operates on tuples of vectors, return the number + of vectors in the tuples, otherwise return 1. */ +inline unsigned int +function_instance::vectors_per_tuple () const +{ + return base->vectors_per_tuple (); +} + +/* If the function addresses memory, return the type of a single + scalar memory element. */ +inline tree +function_instance::memory_scalar_type () const +{ + return base->memory_scalar_type (*this); +} + +/* If the function addresses memory, return a vector mode whose + GET_MODE_NUNITS is the number of elements addressed and whose + GET_MODE_INNER is the mode of a single scalar memory element. 
*/ +inline machine_mode +function_instance::memory_vector_mode () const +{ + return base->memory_vector_mode (*this); +} + +/* Return information about the function's mode suffix. */ +inline const mode_suffix_info & +function_instance::mode_suffix () const +{ + return mode_suffixes[mode_suffix_id]; +} + +/* Return the type of the function's vector base address argument, + or null it doesn't have a vector base address. */ +inline tree +function_instance::base_vector_type () const +{ + return acle_vector_types[0][mode_suffix ().base_vector_type]; +} + +/* Return the type of the function's vector index or offset argument, + or null if doesn't have a vector index or offset argument. */ +inline tree +function_instance::displacement_vector_type () const +{ + return acle_vector_types[0][mode_suffix ().displacement_vector_type]; +} + +/* If the function takes a vector or scalar displacement, return the units + in which the displacement is measured, otherwise return UNITS_none. */ +inline units_index +function_instance::displacement_units () const +{ + return mode_suffix ().displacement_units; +} + +/* Return information about type suffix I. */ +inline const type_suffix_info & +function_instance::type_suffix (unsigned int i) const +{ + return type_suffixes[type_suffix_ids[i]]; +} + +/* Return the scalar type associated with type suffix I. */ +inline tree +function_instance::scalar_type (unsigned int i) const +{ + return scalar_types[type_suffix (i).vector_type]; +} + +/* Return the vector type associated with type suffix I. */ +inline tree +function_instance::vector_type (unsigned int i) const +{ + return acle_vector_types[0][type_suffix (i).vector_type]; +} + +/* If the function operates on tuples of vectors, return the tuple type + associated with type suffix I, otherwise return the vector type associated + with type suffix I. */ +inline tree +function_instance::tuple_type (unsigned int i) const +{ + unsigned int num_vectors = vectors_per_tuple (); + return acle_vector_types[num_vectors - 1][type_suffix (i).vector_type]; +} + +/* Return the number of elements of type suffix I that fit within a + 128-bit block. */ +inline unsigned int +function_instance::elements_per_vq (unsigned int i) const +{ + return 128 / type_suffix (i).element_bits; +} + +/* Return the vector or predicate mode associated with type suffix I. */ +inline machine_mode +function_instance::vector_mode (unsigned int i) const +{ + return type_suffix (i).vector_mode; +} + +/* Return the mode of the governing predicate to use when operating on + type suffix I. */ +inline machine_mode +function_instance::gp_mode (unsigned int i) const +{ + return aarch64_sve_pred_mode (type_suffix (i).element_bytes).require (); +} + +/* Return true if the function has no return value. */ +inline bool +function_call_info::function_returns_void_p () +{ + return TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node; +} + +/* Default implementation of function::call_properties, with conservatively + correct behavior for floating-point instructions. 
*/ +inline unsigned int +function_base::call_properties (const function_instance &instance) const +{ + unsigned int flags = 0; + if (instance.type_suffix (0).float_p || instance.type_suffix (1).float_p) + flags |= CP_READ_FPCR | CP_RAISE_FP_EXCEPTIONS; + return flags; +} + +} + +#endif diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 02d33b727..11198e8a9 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -18,8 +18,168 @@ ;; along with GCC; see the file COPYING3. If not see ;; . -;; Note on the handling of big-endian SVE -;; -------------------------------------- +;; The file is organised into the following sections (search for the full +;; line): +;; +;; == General notes +;; ---- Note on the handling of big-endian SVE +;; ---- Description of UNSPEC_PTEST +;; ---- Description of UNSPEC_PRED_Z +;; ---- Note on predicated integer arithemtic and UNSPEC_PRED_X +;; ---- Note on predicated FP arithmetic patterns and GP "strictness" +;; ---- Note on FFR handling +;; +;; == Moves +;; ---- Moves of single vectors +;; ---- Moves of multiple vectors +;; ---- Moves of predicates +;; ---- Moves relating to the FFR +;; +;; == Loads +;; ---- Normal contiguous loads +;; ---- Extending contiguous loads +;; ---- First-faulting contiguous loads +;; ---- First-faulting extending contiguous loads +;; ---- Non-temporal contiguous loads +;; ---- Normal gather loads +;; ---- Extending gather loads +;; ---- First-faulting gather loads +;; ---- First-faulting extending gather loads +;; +;; == Prefetches +;; ---- Contiguous prefetches +;; ---- Gather prefetches +;; +;; == Stores +;; ---- Normal contiguous stores +;; ---- Truncating contiguous stores +;; ---- Non-temporal contiguous stores +;; ---- Normal scatter stores +;; ---- Truncating scatter stores +;; +;; == Vector creation +;; ---- [INT,FP] Duplicate element +;; ---- [INT,FP] Initialize from individual elements +;; ---- [INT] Linear series +;; ---- [PRED] Duplicate element +;; +;; == Vector decomposition +;; ---- [INT,FP] Extract index +;; ---- [INT,FP] Extract active element +;; ---- [PRED] Extract index +;; +;; == Unary arithmetic +;; ---- [INT] General unary arithmetic corresponding to rtx codes +;; ---- [INT] General unary arithmetic corresponding to unspecs +;; ---- [INT] Sign extension +;; ---- [INT] Zero extension +;; ---- [INT] Logical inverse +;; ---- [FP<-INT] General unary arithmetic that maps to unspecs +;; ---- [FP] General unary arithmetic corresponding to unspecs +;; ---- [PRED] Inverse + +;; == Binary arithmetic +;; ---- [INT] General binary arithmetic corresponding to rtx codes +;; ---- [INT] Addition +;; ---- [INT] Subtraction +;; ---- [INT] Take address +;; ---- [INT] Absolute difference +;; ---- [INT] Saturating addition and subtraction +;; ---- [INT] Highpart multiplication +;; ---- [INT] Division +;; ---- [INT] Binary logical operations +;; ---- [INT] Binary logical operations (inverted second input) +;; ---- [INT] Shifts (rounding towards -Inf) +;; ---- [INT] Shifts (rounding towards 0) +;; ---- [FP<-INT] General binary arithmetic corresponding to unspecs +;; ---- [FP] General binary arithmetic corresponding to rtx codes +;; ---- [FP] General binary arithmetic corresponding to unspecs +;; ---- [FP] Addition +;; ---- [FP] Complex addition +;; ---- [FP] Subtraction +;; ---- [FP] Absolute difference +;; ---- [FP] Multiplication +;; ---- [FP] Binary logical operations +;; ---- [FP] Sign copying +;; ---- [FP] Maximum and minimum +;; ---- [PRED] Binary logical 
operations +;; ---- [PRED] Binary logical operations (inverted second input) +;; ---- [PRED] Binary logical operations (inverted result) +;; +;; == Ternary arithmetic +;; ---- [INT] MLA and MAD +;; ---- [INT] MLS and MSB +;; ---- [INT] Dot product +;; ---- [INT] Sum of absolute differences +;; ---- [INT] Matrix multiply-accumulate +;; ---- [FP] General ternary arithmetic corresponding to unspecs +;; ---- [FP] Complex multiply-add +;; ---- [FP] Trigonometric multiply-add +;; ---- [FP] Bfloat16 long ternary arithmetic (SF,BF,BF) +;; ---- [FP] Matrix multiply-accumulate +;; +;; == Comparisons and selects +;; ---- [INT,FP] Select based on predicates +;; ---- [INT,FP] Compare and select +;; ---- [INT] Comparisons +;; ---- [INT] While tests +;; ---- [FP] Direct comparisons +;; ---- [FP] Absolute comparisons +;; ---- [PRED] Select +;; ---- [PRED] Test bits +;; +;; == Reductions +;; ---- [INT,FP] Conditional reductions +;; ---- [INT] Tree reductions +;; ---- [FP] Tree reductions +;; ---- [FP] Left-to-right reductions +;; +;; == Permutes +;; ---- [INT,FP] General permutes +;; ---- [INT,FP] Special-purpose unary permutes +;; ---- [INT,FP] Special-purpose binary permutes +;; ---- [PRED] Special-purpose unary permutes +;; ---- [PRED] Special-purpose binary permutes +;; +;; == Conversions +;; ---- [INT<-INT] Packs +;; ---- [INT<-INT] Unpacks +;; ---- [INT<-FP] Conversions +;; ---- [INT<-FP] Packs +;; ---- [INT<-FP] Unpacks +;; ---- [FP<-INT] Conversions +;; ---- [FP<-INT] Packs +;; ---- [FP<-INT] Unpacks +;; ---- [FP<-FP] Packs +;; ---- [FP<-FP] Packs (bfloat16) +;; ---- [FP<-FP] Unpacks +;; ---- [PRED<-PRED] Packs +;; ---- [PRED<-PRED] Unpacks +;; +;; == Vector partitioning +;; ---- [PRED] Unary partitioning +;; ---- [PRED] Binary partitioning +;; ---- [PRED] Scalarization +;; +;; == Counting elements +;; ---- [INT] Count elements in a pattern (scalar) +;; ---- [INT] Increment by the number of elements in a pattern (scalar) +;; ---- [INT] Increment by the number of elements in a pattern (vector) +;; ---- [INT] Decrement by the number of elements in a pattern (scalar) +;; ---- [INT] Decrement by the number of elements in a pattern (vector) +;; ---- [INT] Count elements in a predicate (scalar) +;; ---- [INT] Increment by the number of elements in a predicate (scalar) +;; ---- [INT] Increment by the number of elements in a predicate (vector) +;; ---- [INT] Decrement by the number of elements in a predicate (scalar) +;; ---- [INT] Decrement by the number of elements in a predicate (vector) + +;; ========================================================================= +;; == General notes +;; ========================================================================= +;; +;; ------------------------------------------------------------------------- +;; ---- Note on the handling of big-endian SVE +;; ------------------------------------------------------------------------- ;; ;; On big-endian systems, Advanced SIMD mov patterns act in the ;; same way as movdi or movti would: the first byte of memory goes @@ -59,12 +219,339 @@ ;; the order of the bytes within the elements is different. We instead ;; access spill slots via LD1 and ST1, using secondary reloads to ;; reserve a predicate register. 
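
(Editor's illustration, not part of the patch: the element-ordered LD1/ST1 semantics that the port relies on for spill slots are the same semantics the ACLE svld1/svst1 intrinsics expose, so a sketch like the one below behaves identically on little- and big-endian targets. The intrinsics are standard arm_sve.h functions; the wrapper function names are made up for the example.)

#include <arm_sve.h>
#include <stdint.h>

/* svst1/svld1 transfer whole elements, like ST1W/LD1W: element I of V
   always corresponds to DATA[I], regardless of byte order.  */
void
store_elements (int32_t *data, svint32_t v)
{
  svst1_s32 (svptrue_b32 (), data, v);       /* ST1W-style element store */
}

svint32_t
load_elements (const int32_t *data)
{
  return svld1_s32 (svptrue_b32 (), data);   /* LD1W-style element load */
}
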
+;; +;; ------------------------------------------------------------------------- +;; ---- Description of UNSPEC_PTEST +;; ------------------------------------------------------------------------- +;; +;; SVE provides a PTEST instruction for testing the active lanes of a +;; predicate and setting the flags based on the result. The associated +;; condition code tests are: +;; +;; - any (= ne): at least one active bit is set +;; - none (= eq): all active bits are clear (*) +;; - first (= mi): the first active bit is set +;; - nfrst (= pl): the first active bit is clear (*) +;; - last (= cc): the last active bit is set +;; - nlast (= cs): the last active bit is clear (*) +;; +;; where the conditions marked (*) are also true when there are no active +;; lanes (i.e. when the governing predicate is a PFALSE). The flags results +;; of a PTEST use the condition code mode CC_NZC. +;; +;; PTEST is always a .B operation (i.e. it always operates on VNx16BI). +;; This means that for other predicate modes, we need a governing predicate +;; in which all bits are defined. +;; +;; For example, most predicated .H operations ignore the odd bits of the +;; governing predicate, so that an active lane is represented by the +;; bits "1x" and an inactive lane by the bits "0x", where "x" can be +;; any value. To test a .H predicate, we instead need "10" and "00" +;; respectively, so that the condition only tests the even bits of the +;; predicate. +;; +;; Several instructions set the flags as a side-effect, in the same way +;; that a separate PTEST would. It's important for code quality that we +;; use these flags results as often as possible, particularly in the case +;; of WHILE* and RDFFR. +;; +;; Also, some of the instructions that set the flags are unpredicated +;; and instead implicitly test all .B, .H, .S or .D elements, as though +;; they were predicated on a PTRUE of that size. For example, a .S +;; WHILELO sets the flags in the same way as a PTEST with a .S PTRUE +;; would. +;; +;; We therefore need to represent PTEST operations in a way that +;; makes it easy to combine them with both predicated and unpredicated +;; operations, while using a VNx16BI governing predicate for all +;; predicate modes. We do this using: +;; +;; (unspec:CC_NZC [gp cast_gp ptrue_flag op] UNSPEC_PTEST) +;; +;; where: +;; +;; - GP is the real VNx16BI governing predicate +;; +;; - CAST_GP is GP cast to the mode of OP. All bits dropped by casting +;; GP to CAST_GP are guaranteed to be clear in GP. +;; +;; - PTRUE_FLAG is a CONST_INT (conceptually of mode SI) that has the value +;; SVE_KNOWN_PTRUE if we know that CAST_GP (rather than GP) is all-true and +;; SVE_MAYBE_NOT_PTRUE otherwise. +;; +;; - OP is the predicate we want to test, of the same mode as CAST_GP. +;; +;; ------------------------------------------------------------------------- +;; ---- Description of UNSPEC_PRED_Z +;; ------------------------------------------------------------------------- +;; +;; SVE integer comparisons are predicated and return zero for inactive +;; lanes. Sometimes we use them with predicates that are all-true and +;; sometimes we use them with general predicates. +;; +;; The integer comparisons also set the flags and so build-in the effect +;; of a PTEST. We therefore want to be able to combine integer comparison +;; patterns with PTESTs of the result. One difficulty with doing this is +;; that (as noted above) the PTEST is always a .B operation and so can place +;; stronger requirements on the governing predicate than the comparison does. 
+;; +;; For example, when applying a separate PTEST to the result of a full-vector +;; .H comparison, the PTEST must be predicated on a .H PTRUE instead of a +;; .B PTRUE. In constrast, the comparison might be predicated on either +;; a .H PTRUE or a .B PTRUE, since the values of odd-indexed predicate +;; bits don't matter for .H operations. +;; +;; We therefore can't rely on a full-vector comparison using the same +;; predicate register as a following PTEST. We instead need to remember +;; whether a comparison is known to be a full-vector comparison and use +;; this information in addition to a check for equal predicate registers. +;; At the same time, it's useful to have a common representation for all +;; integer comparisons, so that they can be handled by a single set of +;; patterns. +;; +;; We therefore take a similar approach to UNSPEC_PTEST above and use: +;; +;; (unspec: [gp ptrue_flag (code:M op0 op1)] UNSPEC_PRED_Z) +;; +;; where: +;; +;; - GP is the governing predicate, of mode +;; +;; - PTRUE_FLAG is a CONST_INT (conceptually of mode SI) that has the value +;; SVE_KNOWN_PTRUE if we know that GP is all-true and SVE_MAYBE_NOT_PTRUE +;; otherwise +;; +;; - CODE is the comparison code +;; +;; - OP0 and OP1 are the values being compared, of mode M +;; +;; The "Z" in UNSPEC_PRED_Z indicates that inactive lanes are zero. +;; +;; ------------------------------------------------------------------------- +;; ---- Note on predicated integer arithemtic and UNSPEC_PRED_X +;; ------------------------------------------------------------------------- +;; +;; Many SVE integer operations are predicated. We can generate them +;; from four sources: +;; +;; (1) Using normal unpredicated optabs. In this case we need to create +;; an all-true predicate register to act as the governing predicate +;; for the SVE instruction. There are no inactive lanes, and thus +;; the values of inactive lanes don't matter. +;; +;; (2) Using _x ACLE functions. In this case the function provides a +;; specific predicate and some lanes might be inactive. However, +;; as for (1), the values of the inactive lanes don't matter. +;; We can make extra lanes active without changing the behavior +;; (although for code-quality reasons we should avoid doing so +;; needlessly). +;; +;; (3) Using cond_* optabs that correspond to IFN_COND_* internal functions. +;; These optabs have a predicate operand that specifies which lanes are +;; active and another operand that provides the values of inactive lanes. +;; +;; (4) Using _m and _z ACLE functions. These functions map to the same +;; patterns as (3), with the _z functions setting inactive lanes to zero +;; and the _m functions setting the inactive lanes to one of the function +;; arguments. +;; +;; For (1) and (2) we need a way of attaching the predicate to a normal +;; unpredicated integer operation. We do this using: +;; +;; (unspec:M [pred (code:M (op0 op1 ...))] UNSPEC_PRED_X) +;; +;; where (code:M (op0 op1 ...)) is the normal integer operation and PRED +;; is a predicate of mode . PRED might or might not be a PTRUE; +;; it always is for (1), but might not be for (2). +;; +;; The unspec as a whole has the same value as (code:M ...) when PRED is +;; all-true. It is always semantically valid to replace PRED with a PTRUE, +;; but as noted above, we should only do so if there's a specific benefit. +;; +;; (The "_X" in the unspec is named after the ACLE functions in (2).) 
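
(Editor's illustration, not part of the patch: at the ACLE level, sources (2) and (4) above correspond to the _x, _z and _m predication suffixes, whose full names follow the base-name / type-suffix / predication-suffix scheme described in aarch64-sve-builtins.h. The intrinsics are standard arm_sve.h functions; the wrapper function names are made up for the example.)

#include <arm_sve.h>

/* svadd_s32_x = base "svadd" + type suffix "_s32" + predication "_x";
   the overloaded alias svadd_x drops the inferable type suffix.  */
svint32_t
add_x (svbool_t pg, svint32_t a, svint32_t b)
{
  return svadd_s32_x (pg, a, b);   /* inactive lanes: don't care (case (2)) */
}

svint32_t
add_z (svbool_t pg, svint32_t a, svint32_t b)
{
  return svadd_s32_z (pg, a, b);   /* inactive lanes: zero (case (4)) */
}

svint32_t
add_m (svbool_t pg, svint32_t a, svint32_t b)
{
  return svadd_s32_m (pg, a, b);   /* inactive lanes: copied from A (case (4)) */
}
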
+;; +;; For (3) and (4) we can simply use the SVE port's normal representation +;; of a predicate-based select: +;; +;; (unspec:M [pred (code:M (op0 op1 ...)) inactive] UNSPEC_SEL) +;; +;; where INACTIVE specifies the values of inactive lanes. +;; +;; We can also use the UNSPEC_PRED_X wrapper in the UNSPEC_SEL rather +;; than inserting the integer operation directly. This is mostly useful +;; if we want the combine pass to merge an integer operation with an explicit +;; vcond_mask (in other words, with a following SEL instruction). However, +;; it's generally better to merge such operations at the gimple level +;; using (3). +;; +;; ------------------------------------------------------------------------- +;; ---- Note on predicated FP arithmetic patterns and GP "strictness" +;; ------------------------------------------------------------------------- +;; +;; Most SVE floating-point operations are predicated. We can generate +;; them from four sources: +;; +;; (1) Using normal unpredicated optabs. In this case we need to create +;; an all-true predicate register to act as the governing predicate +;; for the SVE instruction. There are no inactive lanes, and thus +;; the values of inactive lanes don't matter. +;; +;; (2) Using _x ACLE functions. In this case the function provides a +;; specific predicate and some lanes might be inactive. However, +;; as for (1), the values of the inactive lanes don't matter. +;; +;; The instruction must have the same exception behavior as the +;; function call unless things like command-line flags specifically +;; allow otherwise. For example, with -ffast-math, it is OK to +;; raise exceptions for inactive lanes, but normally it isn't. +;; +;; (3) Using cond_* optabs that correspond to IFN_COND_* internal functions. +;; These optabs have a predicate operand that specifies which lanes are +;; active and another operand that provides the values of inactive lanes. +;; +;; (4) Using _m and _z ACLE functions. These functions map to the same +;; patterns as (3), with the _z functions setting inactive lanes to zero +;; and the _m functions setting the inactive lanes to one of the function +;; arguments. +;; +;; So: +;; +;; - In (1), the predicate is known to be all true and the pattern can use +;; unpredicated operations where available. +;; +;; - In (2), the predicate might or might not be all true. The pattern can +;; use unpredicated instructions if the predicate is all-true or if things +;; like command-line flags allow exceptions for inactive lanes. +;; +;; - (3) and (4) represent a native SVE predicated operation. Some lanes +;; might be inactive and inactive lanes of the result must have specific +;; values. There is no scope for using unpredicated instructions (and no +;; reason to want to), so the question about command-line flags doesn't +;; arise. +;; +;; It would be inaccurate to model (2) as an rtx code like (sqrt ...) +;; in combination with a separate predicate operand, e.g. +;; +;; (unspec [(match_operand: 1 "register_operand" "Upl") +;; (sqrt:SVE_FULL_F 2 "register_operand" "w")] +;; ....) +;; +;; because (sqrt ...) can raise an exception for any lane, including +;; inactive ones. We therefore need to use an unspec instead. +;; +;; Also, (2) requires some way of distinguishing the case in which the +;; predicate might have inactive lanes and cannot be changed from the +;; case in which the predicate has no inactive lanes or can be changed. 
+;; This information is also useful when matching combined FP patterns +;; in which the predicates might not be equal. +;; +;; We therefore model FP operations as an unspec of the form: +;; +;; (unspec [pred strictness op0 op1 ...] UNSPEC_COND_) +;; +;; where: +;; +;; - PRED is the governing predicate. +;; +;; - STRICTNESS is a CONST_INT that conceptually has mode SI. It has the +;; value SVE_STRICT_GP if PRED might have inactive lanes and if those +;; lanes must remain inactive. It has the value SVE_RELAXED_GP otherwise. +;; +;; - OP0 OP1 ... are the normal input operands to the operation. +;; +;; - MNEMONIC is the mnemonic of the associated SVE instruction. +;; +;; ------------------------------------------------------------------------- +;; ---- Note on FFR handling +;; ------------------------------------------------------------------------- +;; +;; Logically we want to divide FFR-related instructions into regions +;; that contain exactly one of: +;; +;; - a single write to the FFR +;; - any number of reads from the FFR (but only one read is likely) +;; - any number of LDFF1 and LDNF1 instructions +;; +;; However, LDFF1 and LDNF1 instructions should otherwise behave like +;; normal loads as far as possible. This means that they should be +;; schedulable within a region in the same way that LD1 would be, +;; and they should be deleted as dead if the result is unused. The loads +;; should therefore not write to the FFR, since that would both serialize +;; the loads with respect to each other and keep the loads live for any +;; later RDFFR. +;; +;; We get around this by using a fake "FFR token" (FFRT) to help describe +;; the dependencies. Writing to the FFRT starts a new "FFRT region", +;; while using the FFRT keeps the instruction within its region. +;; Specifically: +;; +;; - Writes start a new FFRT region as well as setting the FFR: +;; +;; W1: parallel (FFRT = , FFR = ) +;; +;; - Loads use an LD1-like instruction that also uses the FFRT, so that the +;; loads stay within the same FFRT region: +;; +;; L1: load data while using the FFRT +;; +;; In addition, any FFRT region that includes a load also has at least one +;; instance of: +;; +;; L2: FFR = update(FFR, FFRT) [type == no_insn] +;; +;; to make it clear that the region both reads from and writes to the FFR. +;; +;; - Reads do the following: +;; +;; R1: FFRT = FFR [type == no_insn] +;; R2: read from the FFRT +;; R3: FFRT = update(FFRT) [type == no_insn] +;; +;; R1 and R3 both create new FFRT regions, so that previous LDFF1s and +;; LDNF1s cannot move forwards across R1 and later LDFF1s and LDNF1s +;; cannot move backwards across R3. +;; +;; This way, writes are only kept alive by later loads or reads, +;; and write/read pairs fold normally. For two consecutive reads, +;; the first R3 is made dead by the second R1, which in turn becomes +;; redundant with the first R1. We then have: +;; +;; first R1: FFRT = FFR +;; first read from the FFRT +;; second read from the FFRT +;; second R3: FFRT = update(FFRT) +;; +;; i.e. the two FFRT regions collapse into a single one with two +;; independent reads. +;; +;; The model still prevents some valid optimizations though. For example, +;; if all loads in an FFRT region are deleted as dead, nothing would remove +;; the L2 instructions. 
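
(Editor's illustration, not part of the patch: a first-faulting sequence at the ACLE level, showing how the W1/L1/R1-R2 steps in the FFR note above appear to the user. All intrinsics are standard arm_sve.h functions; the wrapper function name is made up for the example.)

#include <arm_sve.h>
#include <stdint.h>

/* svsetffr writes the FFR (W1), svldff1 performs the first-faulting load
   (L1) and svrdffr_z reads the FFR back (R1/R2) to find the lanes that
   loaded successfully.  */
int64_t
sum_until_fault (const int32_t *base)
{
  svbool_t pg = svptrue_b32 ();
  svsetffr ();                               /* W1: set all FFR bits      */
  svint32_t data = svldff1_s32 (pg, base);   /* L1: first-faulting LDFF1W */
  svbool_t ok = svrdffr_z (pg);              /* R1/R2: lanes that loaded  */
  return svaddv_s32 (ok, data);              /* reduce only the valid lanes */
}
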
+ +;; ========================================================================= +;; == Moves +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- Moves of single vectors +;; ------------------------------------------------------------------------- +;; Includes: +;; - MOV (including aliases) +;; - LD1B (contiguous form) +;; - LD1D ( " " ) +;; - LD1H ( " " ) +;; - LD1W ( " " ) +;; - LDR +;; - ST1B (contiguous form) +;; - ST1D ( " " ) +;; - ST1H ( " " ) +;; - ST1W ( " " ) +;; - STR +;; ------------------------------------------------------------------------- - -;; SVE data moves. (define_expand "mov" - [(set (match_operand:SVE_ALL 0 "nonimmediate_operand") - (match_operand:SVE_ALL 1 "general_operand"))] + [(set (match_operand:SVE_FULL 0 "nonimmediate_operand") + (match_operand:SVE_FULL 1 "general_operand"))] "TARGET_SVE" { /* Use the predicated load and store patterns where possible. @@ -72,7 +559,7 @@ head of the file) and increases the addressing choices for little-endian. */ if ((MEM_P (operands[0]) || MEM_P (operands[1])) - && can_create_pseudo_p ()) + && can_create_pseudo_p ()) { aarch64_expand_sve_mem_move (operands[0], operands[1], mode); DONE; @@ -80,47 +567,37 @@ if (CONSTANT_P (operands[1])) { - aarch64_expand_mov_immediate (operands[0], operands[1], - gen_vec_duplicate); + aarch64_expand_mov_immediate (operands[0], operands[1]); DONE; } /* Optimize subregs on big-endian targets: we can use REV[BHW] instead of going through memory. */ if (BYTES_BIG_ENDIAN - && aarch64_maybe_expand_sve_subreg_move (operands[0], operands[1])) + && aarch64_maybe_expand_sve_subreg_move (operands[0], operands[1])) DONE; } ) -;; A pattern for optimizing SUBREGs that have a reinterpreting effect -;; on big-endian targets; see aarch64_maybe_expand_sve_subreg_move -;; for details. We use a special predicate for operand 2 to reduce -;; the number of patterns. -(define_insn_and_split "*aarch64_sve_mov_subreg_be" - [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w") - (unspec:SVE_ALL - [(match_operand:VNx16BI 1 "register_operand" "Upl") - (match_operand 2 "aarch64_any_register_operand" "w")] - UNSPEC_REV_SUBREG))] - "TARGET_SVE && BYTES_BIG_ENDIAN" - "#" - "&& reload_completed" - [(const_int 0)] +(define_expand "movmisalign" + [(set (match_operand:SVE_FULL 0 "nonimmediate_operand") + (match_operand:SVE_FULL 1 "general_operand"))] + "TARGET_SVE" { - aarch64_split_sve_subreg_move (operands[0], operands[1], operands[2]); + /* Equivalent to a normal move for our purpooses. */ + emit_move_insn (operands[0], operands[1]); DONE; } ) -;; Unpredicated moves (little-endian). Only allow memory operations -;; during and after RA; before RA we want the predicated load and -;; store patterns to be used instead. +;; Unpredicated moves (bytes or little-endian). Only allow memory operations +;; during and after RA; before RA we want the predicated load and store +;; patterns to be used instead. 
(define_insn "*aarch64_sve_mov_le" - [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w") - (match_operand:SVE_ALL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))] + [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w") + (match_operand:SVE_FULL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))] "TARGET_SVE - && !BYTES_BIG_ENDIAN + && (mode == VNx16QImode || !BYTES_BIG_ENDIAN) && ((lra_in_progress || reload_completed) || (register_operand (operands[0], mode) && nonmemory_operand (operands[1], mode)))" @@ -131,12 +608,12 @@ * return aarch64_output_sve_mov_immediate (operands[1]);" ) -;; Unpredicated moves (big-endian). Memory accesses require secondary +;; Unpredicated moves (non-byte big-endian). Memory accesses require secondary ;; reloads. (define_insn "*aarch64_sve_mov_be" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w") - (match_operand:SVE_ALL 1 "aarch64_nonmemory_operand" "w, Dn"))] - "TARGET_SVE && BYTES_BIG_ENDIAN" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w") + (match_operand:SVE_FULL 1 "aarch64_nonmemory_operand" "w, Dn"))] + "TARGET_SVE && BYTES_BIG_ENDIAN && mode != VNx16QImode" "@ mov\t%0.d, %1.d * return aarch64_output_sve_mov_immediate (operands[1]);" @@ -144,10 +621,11 @@ ;; Handle big-endian memory reloads. We use byte PTRUE for all modes ;; to try to encourage reuse. +;; This pattern needs constraints due to TARGET_SECONDARY_RELOAD hook. (define_expand "aarch64_sve_reload_be" [(parallel [(set (match_operand 0) - (match_operand 1)) + (match_operand 1)) (clobber (match_operand:VNx16BI 2 "register_operand" "=Upl"))])] "TARGET_SVE && BYTES_BIG_ENDIAN" { @@ -166,16 +644,15 @@ } ) -;; A predicated load or store for which the predicate is known to be -;; all-true. Note that this pattern is generated directly by -;; aarch64_emit_sve_pred_move, so changes to this pattern will -;; need changes there as well. +;; A predicated move in which the predicate is known to be all-true. +;; Note that this pattern is generated directly by aarch64_emit_sve_pred_move, +;; so changes to this pattern will need changes there as well. (define_insn_and_split "@aarch64_pred_mov" - [(set (match_operand:SVE_ALL 0 "nonimmediate_operand" "=w, w, m") - (unspec:SVE_ALL + [(set (match_operand:SVE_FULL 0 "nonimmediate_operand" "=w, w, m") + (unspec:SVE_FULL [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")] - UNSPEC_MERGE_PTRUE))] + (match_operand:SVE_FULL 2 "nonimmediate_operand" "w, m, w")] + UNSPEC_PRED_X))] "TARGET_SVE && (register_operand (operands[0], mode) || register_operand (operands[2], mode))" @@ -188,152 +665,67 @@ [(set (match_dup 0) (match_dup 2))] ) -(define_expand "movmisalign" - [(set (match_operand:SVE_ALL 0 "nonimmediate_operand") - (match_operand:SVE_ALL 1 "general_operand"))] - "TARGET_SVE" +;; A pattern for optimizing SUBREGs that have a reinterpreting effect +;; on big-endian targets; see aarch64_maybe_expand_sve_subreg_move +;; for details. We use a special predicate for operand 2 to reduce +;; the number of patterns. +(define_insn_and_split "*aarch64_sve_mov_subreg_be" + [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w") + (unspec:SVE_FULL + [(match_operand:VNx16BI 1 "register_operand" "Upl") + (match_operand 2 "aarch64_any_register_operand" "w")] + UNSPEC_REV_SUBREG))] + "TARGET_SVE && BYTES_BIG_ENDIAN" + "#" + "&& reload_completed" + [(const_int 0)] { - /* Equivalent to a normal move for our purpooses. 
*/ - emit_move_insn (operands[0], operands[1]); + aarch64_split_sve_subreg_move (operands[0], operands[1], operands[2]); DONE; } ) -(define_insn "maskload" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (unspec:SVE_ALL - [(match_operand: 2 "register_operand" "Upl") - (match_operand:SVE_ALL 1 "memory_operand" "m")] - UNSPEC_LD1_SVE))] - "TARGET_SVE" - "ld1\t%0., %2/z, %1" -) - -(define_insn "maskstore" - [(set (match_operand:SVE_ALL 0 "memory_operand" "+m") - (unspec:SVE_ALL [(match_operand: 2 "register_operand" "Upl") - (match_operand:SVE_ALL 1 "register_operand" "w") - (match_dup 0)] - UNSPEC_ST1_SVE))] - "TARGET_SVE" - "st1\t%1., %2, %0" -) - -;; Unpredicated gather loads. -(define_expand "gather_load" - [(set (match_operand:SVE_SD 0 "register_operand") - (unspec:SVE_SD - [(match_dup 5) - (match_operand:DI 1 "aarch64_reg_or_zero") - (match_operand: 2 "register_operand") - (match_operand:DI 3 "const_int_operand") - (match_operand:DI 4 "aarch64_gather_scale_operand_") - (mem:BLK (scratch))] - UNSPEC_LD1_GATHER))] +;; Reinterpret operand 1 in operand 0's mode, without changing its contents. +;; This is equivalent to a subreg on little-endian targets but not for +;; big-endian; see the comment at the head of the file for details. +(define_expand "@aarch64_sve_reinterpret" + [(set (match_operand:SVE_FULL 0 "register_operand") + (unspec:SVE_FULL + [(match_operand 1 "aarch64_any_register_operand")] + UNSPEC_REINTERPRET))] "TARGET_SVE" { - operands[5] = force_reg (mode, CONSTM1_RTX (mode)); + if (!BYTES_BIG_ENDIAN) + { + emit_move_insn (operands[0], gen_lowpart (mode, operands[1])); + DONE; + } } ) -;; Predicated gather loads for 32-bit elements. Operand 3 is true for -;; unsigned extension and false for signed extension. -(define_insn "mask_gather_load" - [(set (match_operand:SVE_S 0 "register_operand" "=w, w, w, w, w") - (unspec:SVE_S - [(match_operand: 5 "register_operand" "Upl, Upl, Upl, Upl, Upl") - (match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk") - (match_operand: 2 "register_operand" "w, w, w, w, w") - (match_operand:DI 3 "const_int_operand" "i, Z, Ui1, Z, Ui1") - (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i") - (mem:BLK (scratch))] - UNSPEC_LD1_GATHER))] - "TARGET_SVE" - "@ - ld1w\t%0.s, %5/z, [%2.s] - ld1w\t%0.s, %5/z, [%1, %2.s, sxtw] - ld1w\t%0.s, %5/z, [%1, %2.s, uxtw] - ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4] - ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" -) - -;; Predicated gather loads for 64-bit elements. The value of operand 3 -;; doesn't matter in this case. -(define_insn "mask_gather_load" - [(set (match_operand:SVE_D 0 "register_operand" "=w, w, w") - (unspec:SVE_D - [(match_operand: 5 "register_operand" "Upl, Upl, Upl") - (match_operand:DI 1 "aarch64_reg_or_zero" "Z, rk, rk") - (match_operand: 2 "register_operand" "w, w, w") - (match_operand:DI 3 "const_int_operand") - (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i") - (mem:BLK (scratch))] - UNSPEC_LD1_GATHER))] - "TARGET_SVE" - "@ - ld1d\t%0.d, %5/z, [%2.d] - ld1d\t%0.d, %5/z, [%1, %2.d] - ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]" -) - -;; Unpredicated scatter store. 
-(define_expand "scatter_store" - [(set (mem:BLK (scratch)) - (unspec:BLK - [(match_dup 5) - (match_operand:DI 0 "aarch64_reg_or_zero") - (match_operand: 1 "register_operand") - (match_operand:DI 2 "const_int_operand") - (match_operand:DI 3 "aarch64_gather_scale_operand_") - (match_operand:SVE_SD 4 "register_operand")] - UNSPEC_ST1_SCATTER))] +;; A pattern for handling type punning on big-endian targets. We use a +;; special predicate for operand 1 to reduce the number of patterns. +(define_insn_and_split "*aarch64_sve_reinterpret" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand 1 "aarch64_any_register_operand" "w")] + UNSPEC_REINTERPRET))] "TARGET_SVE" + "#" + "&& reload_completed" + [(set (match_dup 0) (match_dup 1))] { - operands[5] = force_reg (mode, CONSTM1_RTX (mode)); + operands[1] = aarch64_replace_reg_mode (operands[1], mode); } ) -;; Predicated scatter stores for 32-bit elements. Operand 2 is true for -;; unsigned extension and false for signed extension. -(define_insn "mask_scatter_store" - [(set (mem:BLK (scratch)) - (unspec:BLK - [(match_operand: 5 "register_operand" "Upl, Upl, Upl, Upl, Upl") - (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk") - (match_operand: 1 "register_operand" "w, w, w, w, w") - (match_operand:DI 2 "const_int_operand" "i, Z, Ui1, Z, Ui1") - (match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i") - (match_operand:SVE_S 4 "register_operand" "w, w, w, w, w")] - UNSPEC_ST1_SCATTER))] - "TARGET_SVE" - "@ - st1w\t%4.s, %5, [%1.s] - st1w\t%4.s, %5, [%0, %1.s, sxtw] - st1w\t%4.s, %5, [%0, %1.s, uxtw] - st1w\t%4.s, %5, [%0, %1.s, sxtw %p3] - st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]" -) - -;; Predicated scatter stores for 64-bit elements. The value of operand 2 -;; doesn't matter in this case. -(define_insn "mask_scatter_store" - [(set (mem:BLK (scratch)) - (unspec:BLK - [(match_operand: 5 "register_operand" "Upl, Upl, Upl") - (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk") - (match_operand: 1 "register_operand" "w, w, w") - (match_operand:DI 2 "const_int_operand") - (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i") - (match_operand:SVE_D 4 "register_operand" "w, w, w")] - UNSPEC_ST1_SCATTER))] - "TARGET_SVE" - "@ - st1d\t%4.d, %5, [%1.d] - st1d\t%4.d, %5, [%0, %1.d] - st1d\t%4.d, %5, [%0, %1.d, lsl %p3]" -) +;; ------------------------------------------------------------------------- +;; ---- Moves of multiple vectors +;; ------------------------------------------------------------------------- +;; All patterns in this section are synthetic and split to real +;; instructions after reload. +;; ------------------------------------------------------------------------- -;; SVE structure moves. (define_expand "mov" [(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand") (match_operand:SVE_STRUCT 1 "general_operand"))] @@ -368,7 +760,7 @@ ;; Unpredicated structure moves (big-endian). Memory accesses require ;; secondary reloads. 
-(define_insn "*aarch64_sve_mov_le" +(define_insn "*aarch64_sve_mov_be" [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w, w") (match_operand:SVE_STRUCT 1 "aarch64_nonmemory_operand" "w, Dn"))] "TARGET_SVE && BYTES_BIG_ENDIAN" @@ -409,7 +801,7 @@ (unspec:SVE_STRUCT [(match_operand: 1 "register_operand" "Upl, Upl, Upl") (match_operand:SVE_STRUCT 2 "aarch64_sve_struct_nonimmediate_operand" "w, Utx, w")] - UNSPEC_MERGE_PTRUE))] + UNSPEC_PRED_X))] "TARGET_SVE && (register_operand (operands[0], mode) || register_operand (operands[2], mode))" @@ -432,6 +824,18 @@ [(set_attr "length" "")] ) +;; ------------------------------------------------------------------------- +;; ---- Moves of predicates +;; ------------------------------------------------------------------------- +;; Includes: +;; - MOV +;; - LDR +;; - PFALSE +;; - PTRUE +;; - PTRUES +;; - STR +;; ------------------------------------------------------------------------- + (define_expand "mov" [(set (match_operand:PRED_ALL 0 "nonimmediate_operand") (match_operand:PRED_ALL 1 "general_operand"))] @@ -439,12 +843,18 @@ { if (GET_CODE (operands[0]) == MEM) operands[1] = force_reg (mode, operands[1]); + + if (CONSTANT_P (operands[1])) + { + aarch64_expand_mov_immediate (operands[0], operands[1]); + DONE; + } } ) (define_insn "*aarch64_sve_mov" - [(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa, Upa") - (match_operand:PRED_ALL 1 "general_operand" "Upa, Upa, m, Dz, Dm"))] + [(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa") + (match_operand:PRED_ALL 1 "aarch64_mov_operand" "Upa, Upa, m, Dn"))] "TARGET_SVE && (register_operand (operands[0], mode) || register_operand (operands[1], mode))" @@ -452,287 +862,296 @@ mov\t%0.b, %1.b str\t%1, %0 ldr\t%0, %1 - pfalse\t%0.b - * return aarch64_output_ptrue (mode, '');" + * return aarch64_output_sve_mov_immediate (operands[1]);" ) -;; Handle extractions from a predicate by converting to an integer vector -;; and extracting from there. -(define_expand "vec_extract" - [(match_operand: 0 "register_operand") - (match_operand: 1 "register_operand") - (match_operand:SI 2 "nonmemory_operand") - ;; Dummy operand to which we can attach the iterator. - (reg:SVE_I V0_REGNUM)] +;; Match PTRUES Pn.B when both the predicate and flags are useful. +(define_insn_and_rewrite "*aarch64_sve_ptruevnx16bi_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand 2) + (match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate" + [(unspec:VNx16BI + [(match_operand:SI 4 "const_int_operand") + (match_operand:VNx16BI 5 "aarch64_simd_imm_zero")] + UNSPEC_PTRUE)])] + UNSPEC_PTEST)) + (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (match_dup 1))] "TARGET_SVE" { - rtx tmp = gen_reg_rtx (mode); - emit_insn (gen_aarch64_sve_dup_const (tmp, operands[1], - CONST1_RTX (mode), - CONST0_RTX (mode))); - emit_insn (gen_vec_extract (operands[0], tmp, operands[2])); - DONE; + return aarch64_output_sve_ptrues (operands[1]); + } + "&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))" + { + operands[2] = operands[3] = CONSTM1_RTX (VNx16BImode); } ) -(define_expand "vec_extract" - [(set (match_operand: 0 "register_operand") - (vec_select: - (match_operand:SVE_ALL 1 "register_operand") - (parallel [(match_operand:SI 2 "nonmemory_operand")])))] +;; Match PTRUES Pn.[HSD] when both the predicate and flags are useful. 
+(define_insn_and_rewrite "*aarch64_sve_ptrue_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand 2) + (match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (subreg:PRED_HSD + (match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate" + [(unspec:VNx16BI + [(match_operand:SI 4 "const_int_operand") + (match_operand:PRED_HSD 5 "aarch64_simd_imm_zero")] + UNSPEC_PTRUE)]) 0)] + UNSPEC_PTEST)) + (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (match_dup 1))] "TARGET_SVE" { - poly_int64 val; - if (poly_int_rtx_p (operands[2], &val) - && known_eq (val, GET_MODE_NUNITS (mode) - 1)) - { - /* The last element can be extracted with a LASTB and a false - predicate. */ - rtx sel = force_reg (mode, CONST0_RTX (mode)); - emit_insn (gen_extract_last_ (operands[0], sel, operands[1])); - DONE; - } - if (!CONST_INT_P (operands[2])) - { - /* Create an index with operand[2] as the base and -1 as the step. - It will then be zero for the element we care about. */ - rtx index = gen_lowpart (mode, operands[2]); - index = force_reg (mode, index); - rtx series = gen_reg_rtx (mode); - emit_insn (gen_vec_series (series, index, constm1_rtx)); - - /* Get a predicate that is true for only that element. */ - rtx zero = CONST0_RTX (mode); - rtx cmp = gen_rtx_EQ (mode, series, zero); - rtx sel = gen_reg_rtx (mode); - emit_insn (gen_vec_cmp (sel, cmp, series, zero)); - - /* Select the element using LASTB. */ - emit_insn (gen_extract_last_ (operands[0], sel, operands[1])); - DONE; - } - } -) - -;; Extract element zero. This is a special case because we want to force -;; the registers to be the same for the second alternative, and then -;; split the instruction into nothing after RA. -(define_insn_and_split "*vec_extract_0" - [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") - (vec_select: - (match_operand:SVE_ALL 1 "register_operand" "w, 0, w") - (parallel [(const_int 0)])))] - "TARGET_SVE" - { - operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); - switch (which_alternative) - { - case 0: - return "umov\\t%0, %1.[0]"; - case 1: - return "#"; - case 2: - return "st1\\t{%1.}[0], %0"; - default: - gcc_unreachable (); - } + return aarch64_output_sve_ptrues (operands[1]); } - "&& reload_completed - && REG_P (operands[0]) - && REGNO (operands[0]) == REGNO (operands[1])" - [(const_int 0)] + "&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))" { - emit_note (NOTE_INSN_DELETED); - DONE; + operands[2] = CONSTM1_RTX (VNx16BImode); + operands[3] = CONSTM1_RTX (mode); } - [(set_attr "type" "neon_to_gp_q, untyped, neon_store1_one_lane_q")] ) -;; Extract an element from the Advanced SIMD portion of the register. -;; We don't just reuse the aarch64-simd.md pattern because we don't -;; want any change in lane number on big-endian targets. -(define_insn "*vec_extract_v128" - [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") - (vec_select: - (match_operand:SVE_ALL 1 "register_operand" "w, w, w") - (parallel [(match_operand:SI 2 "const_int_operand")])))] - "TARGET_SVE - && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 1, 15)" +;; Match PTRUES Pn.B when only the flags result is useful (which is +;; a way of testing VL). 
+(define_insn_and_rewrite "*aarch64_sve_ptruevnx16bi_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand 2) + (match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate" + [(unspec:VNx16BI + [(match_operand:SI 4 "const_int_operand") + (match_operand:VNx16BI 5 "aarch64_simd_imm_zero")] + UNSPEC_PTRUE)])] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa"))] + "TARGET_SVE" { - operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); - switch (which_alternative) - { - case 0: - return "umov\\t%0, %1.[%2]"; - case 1: - return "dup\\t%0, %1.[%2]"; - case 2: - return "st1\\t{%1.}[%2], %0"; - default: - gcc_unreachable (); - } + return aarch64_output_sve_ptrues (operands[1]); } - [(set_attr "type" "neon_to_gp_q, neon_dup_q, neon_store1_one_lane_q")] -) - -;; Extract an element in the range of DUP. This pattern allows the -;; source and destination to be different. -(define_insn "*vec_extract_dup" - [(set (match_operand: 0 "register_operand" "=w") - (vec_select: - (match_operand:SVE_ALL 1 "register_operand" "w") - (parallel [(match_operand:SI 2 "const_int_operand")])))] - "TARGET_SVE - && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 16, 63)" + "&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))" { - operands[0] = gen_rtx_REG (mode, REGNO (operands[0])); - return "dup\t%0., %1.[%2]"; + operands[2] = operands[3] = CONSTM1_RTX (VNx16BImode); } ) -;; Extract an element outside the range of DUP. This pattern requires the -;; source and destination to be the same. -(define_insn "*vec_extract_ext" - [(set (match_operand: 0 "register_operand" "=w") - (vec_select: - (match_operand:SVE_ALL 1 "register_operand" "0") - (parallel [(match_operand:SI 2 "const_int_operand")])))] - "TARGET_SVE && INTVAL (operands[2]) * GET_MODE_SIZE (mode) >= 64" +;; Match PTRUES Pn.[HWD] when only the flags result is useful (which is +;; a way of testing VL). +(define_insn_and_rewrite "*aarch64_sve_ptrue_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand 2) + (match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (subreg:PRED_HSD + (match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate" + [(unspec:VNx16BI + [(match_operand:SI 4 "const_int_operand") + (match_operand:PRED_HSD 5 "aarch64_simd_imm_zero")] + UNSPEC_PTRUE)]) 0)] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa"))] + "TARGET_SVE" { - operands[0] = gen_rtx_REG (mode, REGNO (operands[0])); - operands[2] = GEN_INT (INTVAL (operands[2]) * GET_MODE_SIZE (mode)); - return "ext\t%0.b, %0.b, %0.b, #%2"; + return aarch64_output_sve_ptrues (operands[1]); } -) - -;; Extract the last active element of operand 1 into operand 0. -;; If no elements are active, extract the last inactive element instead. -(define_insn "extract_last_" - [(set (match_operand: 0 "register_operand" "=r, w") - (unspec: - [(match_operand: 1 "register_operand" "Upl, Upl") - (match_operand:SVE_ALL 2 "register_operand" "w, w")] - UNSPEC_LASTB))] - "TARGET_SVE" - "@ - lastb\t%0, %1, %2. - lastb\t%0, %1, %2." 
-) - -(define_expand "vec_duplicate" - [(parallel - [(set (match_operand:SVE_ALL 0 "register_operand") - (vec_duplicate:SVE_ALL - (match_operand: 1 "aarch64_sve_dup_operand"))) - (clobber (scratch:))])] - "TARGET_SVE" + "&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))" { - if (MEM_P (operands[1])) - { - rtx ptrue = force_reg (mode, CONSTM1_RTX (mode)); - emit_insn (gen_sve_ld1r (operands[0], ptrue, operands[1], - CONST0_RTX (mode))); - DONE; - } + operands[2] = CONSTM1_RTX (VNx16BImode); + operands[3] = CONSTM1_RTX (mode); } ) -;; Accept memory operands for the benefit of combine, and also in case -;; the scalar input gets spilled to memory during RA. We want to split -;; the load at the first opportunity in order to allow the PTRUE to be -;; optimized with surrounding code. -(define_insn_and_split "*vec_duplicate_reg" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w") - (vec_duplicate:SVE_ALL - (match_operand: 1 "aarch64_sve_dup_operand" "r, w, Uty"))) - (clobber (match_scratch: 2 "=X, X, Upl"))] +;; ------------------------------------------------------------------------- +;; ---- Moves relating to the FFR +;; ------------------------------------------------------------------------- +;; RDFFR +;; RDFFRS +;; SETFFR +;; WRFFR +;; ------------------------------------------------------------------------- + +;; [W1 in the block comment above about FFR handling] +;; +;; Write to the FFR and start a new FFRT scheduling region. +(define_insn "aarch64_wrffr" + [(set (reg:VNx16BI FFR_REGNUM) + (match_operand:VNx16BI 0 "aarch64_simd_reg_or_minus_one" "Dm, Upa")) + (set (reg:VNx16BI FFRT_REGNUM) + (match_dup 0))] "TARGET_SVE" "@ - mov\t%0., %1 - mov\t%0., %1 - #" - "&& MEM_P (operands[1])" - [(const_int 0)] - { - if (GET_CODE (operands[2]) == SCRATCH) - operands[2] = gen_reg_rtx (mode); - emit_move_insn (operands[2], CONSTM1_RTX (mode)); - emit_insn (gen_sve_ld1r (operands[0], operands[2], operands[1], - CONST0_RTX (mode))); - DONE; - } - [(set_attr "length" "4,4,8")] + setffr + wrffr\t%0.b" ) -;; This is used for vec_duplicates from memory, but can also -;; be used by combine to optimize selects of a a vec_duplicate -;; with zero. -(define_insn "sve_ld1r" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (unspec:SVE_ALL - [(match_operand: 1 "register_operand" "Upl") - (vec_duplicate:SVE_ALL - (match_operand: 2 "aarch64_sve_ld1r_operand" "Uty")) - (match_operand:SVE_ALL 3 "aarch64_simd_imm_zero")] - UNSPEC_SEL))] +;; [L2 in the block comment above about FFR handling] +;; +;; Introduce a read from and write to the FFR in the current FFRT region, +;; so that the FFR value is live on entry to the region and so that the FFR +;; value visibly changes within the region. This is used (possibly multiple +;; times) in an FFRT region that includes LDFF1 or LDNF1 instructions. +(define_insn "aarch64_update_ffr_for_load" + [(set (reg:VNx16BI FFR_REGNUM) + (unspec:VNx16BI [(reg:VNx16BI FFRT_REGNUM) + (reg:VNx16BI FFR_REGNUM)] UNSPEC_UPDATE_FFR))] "TARGET_SVE" - "ld1r\t%0., %1/z, %2" + "" + [(set_attr "type" "no_insn")] ) -;; Load 128 bits from memory and duplicate to fill a vector. Since there -;; are so few operations on 128-bit "elements", we don't define a VNx1TI -;; and simply use vectors of bytes instead. 
-(define_insn "*sve_ld1rq" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (unspec:SVE_ALL - [(match_operand: 1 "register_operand" "Upl") - (match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")] - UNSPEC_LD1RQ))] +;; [R1 in the block comment above about FFR handling] +;; +;; Notionally copy the FFR to the FFRT, so that the current FFR value +;; can be read from there by the RDFFR instructions below. This acts +;; as a scheduling barrier for earlier LDFF1 and LDNF1 instructions and +;; creates a natural dependency with earlier writes. +(define_insn "aarch64_copy_ffr_to_ffrt" + [(set (reg:VNx16BI FFRT_REGNUM) + (reg:VNx16BI FFR_REGNUM))] "TARGET_SVE" - "ld1rq\t%0., %1/z, %2" + "" + [(set_attr "type" "no_insn")] ) -;; Implement a predicate broadcast by shifting the low bit of the scalar -;; input into the top bit and using a WHILELO. An alternative would be to -;; duplicate the input and do a compare with zero. -(define_expand "vec_duplicate" - [(set (match_operand:PRED_ALL 0 "register_operand") - (vec_duplicate:PRED_ALL (match_operand 1 "register_operand")))] +;; [R2 in the block comment above about FFR handling] +;; +;; Read the FFR via the FFRT. +(define_insn "aarch64_rdffr" + [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (reg:VNx16BI FFRT_REGNUM))] + "TARGET_SVE" + "rdffr\t%0.b" +) + +;; Likewise with zero predication. +(define_insn "aarch64_rdffr_z" + [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (and:VNx16BI + (reg:VNx16BI FFRT_REGNUM) + (match_operand:VNx16BI 1 "register_operand" "Upa")))] + "TARGET_SVE" + "rdffr\t%0.b, %1/z" +) + +;; Read the FFR to test for a fault, without using the predicate result. +(define_insn "*aarch64_rdffr_z_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_dup 1) + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (and:VNx16BI + (reg:VNx16BI FFRT_REGNUM) + (match_dup 1))] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa"))] + "TARGET_SVE" + "rdffrs\t%0.b, %1/z" +) + +;; Same for unpredicated RDFFR when tested with a known PTRUE. +(define_insn "*aarch64_rdffr_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_dup 1) + (const_int SVE_KNOWN_PTRUE) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa"))] + "TARGET_SVE" + "rdffrs\t%0.b, %1/z" +) + +;; Read the FFR with zero predication and test the result. +(define_insn "*aarch64_rdffr_z_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_dup 1) + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (and:VNx16BI + (reg:VNx16BI FFRT_REGNUM) + (match_dup 1))] + UNSPEC_PTEST)) + (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (and:VNx16BI + (reg:VNx16BI FFRT_REGNUM) + (match_dup 1)))] + "TARGET_SVE" + "rdffrs\t%0.b, %1/z" +) + +;; Same for unpredicated RDFFR when tested with a known PTRUE. +(define_insn "*aarch64_rdffr_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_dup 1) + (const_int SVE_KNOWN_PTRUE) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_PTEST)) + (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (reg:VNx16BI FFRT_REGNUM))] + "TARGET_SVE" + "rdffrs\t%0.b, %1/z" +) + +;; [R3 in the block comment above about FFR handling] +;; +;; Arbitrarily update the FFRT after a read from the FFR. This acts as +;; a scheduling barrier for later LDFF1 and LDNF1 instructions. 
+(define_insn "aarch64_update_ffrt" + [(set (reg:VNx16BI FFRT_REGNUM) + (unspec:VNx16BI [(reg:VNx16BI FFRT_REGNUM)] UNSPEC_UPDATE_FFRT))] "TARGET_SVE" - { - rtx tmp = gen_reg_rtx (DImode); - rtx op1 = gen_lowpart (DImode, operands[1]); - emit_insn (gen_ashldi3 (tmp, op1, gen_int_mode (63, DImode))); - emit_insn (gen_while_ultdi (operands[0], const0_rtx, tmp)); - DONE; - } -) - -(define_insn "vec_series" - [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w") - (vec_series:SVE_I - (match_operand: 1 "aarch64_sve_index_operand" "Usi, r, r") - (match_operand: 2 "aarch64_sve_index_operand" "r, Usi, r")))] + "" + [(set_attr "type" "no_insn")] +) + +;; ========================================================================= +;; == Loads +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- Normal contiguous loads +;; ------------------------------------------------------------------------- +;; Includes contiguous forms of: +;; - LD1B +;; - LD1D +;; - LD1H +;; - LD1W +;; - LD2B +;; - LD2D +;; - LD2H +;; - LD2W +;; - LD3B +;; - LD3D +;; - LD3H +;; - LD3W +;; - LD4B +;; - LD4D +;; - LD4H +;; - LD4W +;; ------------------------------------------------------------------------- + +;; Predicated LD1. +(define_insn "maskload" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand: 2 "register_operand" "Upl") + (match_operand:SVE_FULL 1 "memory_operand" "m")] + UNSPEC_LD1_SVE))] "TARGET_SVE" - "@ - index\t%0., #%1, %2 - index\t%0., %1, #%2 - index\t%0., %1, %2" -) - -;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range -;; of an INDEX instruction. -(define_insn "*vec_series_plus" - [(set (match_operand:SVE_I 0 "register_operand" "=w") - (plus:SVE_I - (vec_duplicate:SVE_I - (match_operand: 1 "register_operand" "r")) - (match_operand:SVE_I 2 "immediate_operand")))] - "TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])" - { - operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]); - return "index\t%0., %1, #%2"; - } + "ld1\t%0., %2/z, %1" ) ;; Unpredicated LD[234]. @@ -744,7 +1163,7 @@ UNSPEC_LDN))] "TARGET_SVE" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + operands[2] = aarch64_ptrue_reg (mode); } ) @@ -759,884 +1178,5373 @@ "ld\t%0, %2/z, %1" ) -;; Unpredicated ST[234]. This is always a full update, so the dependence -;; on the old value of the memory location (via (match_dup 0)) is redundant. -;; There doesn't seem to be any obvious benefit to treating the all-true -;; case differently though. In particular, it's very unlikely that we'll -;; only find out during RTL that a store_lanes is dead. -(define_expand "vec_store_lanes" - [(set (match_operand:SVE_STRUCT 0 "memory_operand") - (unspec:SVE_STRUCT - [(match_dup 2) - (match_operand:SVE_STRUCT 1 "register_operand") - (match_dup 0)] - UNSPEC_STN))] +;; ------------------------------------------------------------------------- +;; ---- Extending contiguous loads +;; ------------------------------------------------------------------------- +;; Includes contiguous forms of: +;; LD1B +;; LD1H +;; LD1SB +;; LD1SH +;; LD1SW +;; LD1W +;; ------------------------------------------------------------------------- + +;; Predicated load and extend, with 8 elements per 128-bit block. 
+(define_insn "@aarch64_load_" + [(set (match_operand:VNx8_WIDE 0 "register_operand" "=w") + (ANY_EXTEND:VNx8_WIDE + (unspec:VNx8_NARROW + [(match_operand:VNx8BI 2 "register_operand" "Upl") + (match_operand:VNx8_NARROW 1 "memory_operand" "m")] + UNSPEC_LD1_SVE)))] + "TARGET_SVE" + "ld1\t%0., %2/z, %1" +) + +;; Predicated load and extend, with 4 elements per 128-bit block. +(define_insn "@aarch64_load_" + [(set (match_operand:VNx4_WIDE 0 "register_operand" "=w") + (ANY_EXTEND:VNx4_WIDE + (unspec:VNx4_NARROW + [(match_operand:VNx4BI 2 "register_operand" "Upl") + (match_operand:VNx4_NARROW 1 "memory_operand" "m")] + UNSPEC_LD1_SVE)))] + "TARGET_SVE" + "ld1\t%0., %2/z, %1" +) + +;; Predicated load and extend, with 2 elements per 128-bit block. +(define_insn "@aarch64_load_" + [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w") + (ANY_EXTEND:VNx2_WIDE + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 2 "register_operand" "Upl") + (match_operand:VNx2_NARROW 1 "memory_operand" "m")] + UNSPEC_LD1_SVE)))] + "TARGET_SVE" + "ld1\t%0., %2/z, %1" +) + +;; ------------------------------------------------------------------------- +;; ---- First-faulting contiguous loads +;; ------------------------------------------------------------------------- +;; Includes contiguous forms of: +;; - LDFF1B +;; - LDFF1D +;; - LDFF1H +;; - LDFF1W +;; - LDNF1B +;; - LDNF1D +;; - LDNF1H +;; - LDNF1W +;; ------------------------------------------------------------------------- + +;; Contiguous non-extending first-faulting or non-faulting loads. +(define_insn "@aarch64_ldf1" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand: 2 "register_operand" "Upl") + (match_operand:SVE_FULL 1 "aarch64_sve_ldf1_operand" "Ut") + (reg:VNx16BI FFRT_REGNUM)] + SVE_LDFF1_LDNF1))] + "TARGET_SVE" + "ldf1\t%0., %2/z, %1" +) + +;; ------------------------------------------------------------------------- +;; ---- First-faulting extending contiguous loads +;; ------------------------------------------------------------------------- +;; Includes contiguous forms of: +;; - LDFF1B +;; - LDFF1H +;; - LDFF1SB +;; - LDFF1SH +;; - LDFF1SW +;; - LDFF1W +;; - LDNF1B +;; - LDNF1H +;; - LDNF1SB +;; - LDNF1SH +;; - LDNF1SW +;; - LDNF1W +;; ------------------------------------------------------------------------- + +;; Predicated first-faulting or non-faulting load and extend, with 8 elements +;; per 128-bit block. +(define_insn "@aarch64_ldf1_" + [(set (match_operand:VNx8_WIDE 0 "register_operand" "=w") + (ANY_EXTEND:VNx8_WIDE + (unspec:VNx8_NARROW + [(match_operand:VNx8BI 2 "register_operand" "Upl") + (match_operand:VNx8_NARROW 1 "aarch64_sve_ldf1_operand" "Ut") + (reg:VNx16BI FFRT_REGNUM)] + SVE_LDFF1_LDNF1)))] + "TARGET_SVE" + "ldf1\t%0., %2/z, %1" +) + +;; Predicated first-faulting or non-faulting load and extend, with 4 elements +;; per 128-bit block. +(define_insn "@aarch64_ldf1_" + [(set (match_operand:VNx4_WIDE 0 "register_operand" "=w") + (ANY_EXTEND:VNx4_WIDE + (unspec:VNx4_NARROW + [(match_operand:VNx4BI 2 "register_operand" "Upl") + (match_operand:VNx4_NARROW 1 "aarch64_sve_ldf1_operand" "Ut") + (reg:VNx16BI FFRT_REGNUM)] + SVE_LDFF1_LDNF1)))] + "TARGET_SVE" + "ldf1\t%0., %2/z, %1" +) + +;; Predicated first-faulting or non-faulting load and extend, with 2 elements +;; per 128-bit block. 
+(define_insn "@aarch64_ldf1_" + [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w") + (ANY_EXTEND:VNx2_WIDE + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 2 "register_operand" "Upl") + (match_operand:VNx2_NARROW 1 "aarch64_sve_ldf1_operand" "Ut") + (reg:VNx16BI FFRT_REGNUM)] + SVE_LDFF1_LDNF1)))] + "TARGET_SVE" + "ldf1\t%0., %2/z, %1" +) + +;; ------------------------------------------------------------------------- +;; ---- Non-temporal contiguous loads +;; ------------------------------------------------------------------------- +;; Includes: +;; - LDNT1B +;; - LDNT1D +;; - LDNT1H +;; - LDNT1W +;; ------------------------------------------------------------------------- + +;; Predicated contiguous non-temporal load. +(define_insn "@aarch64_ldnt1" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand: 2 "register_operand" "Upl") + (match_operand:SVE_FULL 1 "memory_operand" "m")] + UNSPEC_LDNT1_SVE))] + "TARGET_SVE" + "ldnt1\t%0., %2/z, %1" +) + +;; ------------------------------------------------------------------------- +;; ---- Normal gather loads +;; ------------------------------------------------------------------------- +;; Includes gather forms of: +;; - LD1D +;; - LD1W +;; ------------------------------------------------------------------------- + +;; Unpredicated gather loads. +(define_expand "gather_load" + [(set (match_operand:SVE_FULL_SD 0 "register_operand") + (unspec:SVE_FULL_SD + [(match_dup 5) + (match_operand:DI 1 "aarch64_sve_gather_offset_") + (match_operand: 2 "register_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER))] + "TARGET_SVE" + { + operands[5] = aarch64_ptrue_reg (mode); + } +) + +;; Predicated gather loads for 32-bit elements. Operand 3 is true for +;; unsigned extension and false for signed extension. +(define_insn "mask_gather_load" + [(set (match_operand:SVE_FULL_S 0 "register_operand" "=w, w, w, w, w, w") + (unspec:SVE_FULL_S + [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk") + (match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 3 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") + (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER))] + "TARGET_SVE" + "@ + ld1w\t%0.s, %5/z, [%2.s] + ld1w\t%0.s, %5/z, [%2.s, #%1] + ld1w\t%0.s, %5/z, [%1, %2.s, sxtw] + ld1w\t%0.s, %5/z, [%1, %2.s, uxtw] + ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4] + ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" +) + +;; Predicated gather loads for 64-bit elements. The value of operand 3 +;; doesn't matter in this case. +(define_insn "mask_gather_load" + [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w, w, w") + (unspec:SVE_FULL_D + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk") + (match_operand:VNx2DI 2 "register_operand" "w, w, w, w") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER))] "TARGET_SVE" + "@ + ld1d\t%0.d, %5/z, [%2.d] + ld1d\t%0.d, %5/z, [%2.d, #%1] + ld1d\t%0.d, %5/z, [%1, %2.d] + ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]" +) + +;; Likewise, but with the offset being sign-extended from 32 bits. 
+(define_insn "*mask_gather_load_sxtw" + [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w") + (unspec:SVE_FULL_D + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") + (unspec:VNx2DI + [(match_dup 5) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 2 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER))] + "TARGET_SVE" + "@ + ld1d\t%0.d, %5/z, [%1, %2.d, sxtw] + ld1d\t%0.d, %5/z, [%1, %2.d, sxtw %p4]" +) + +;; Likewise, but with the offset being zero-extended from 32 bits. +(define_insn "*mask_gather_load_uxtw" + [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w") + (unspec:SVE_FULL_D + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") + (and:VNx2DI + (match_operand:VNx2DI 2 "register_operand" "w, w") + (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER))] + "TARGET_SVE" + "@ + ld1d\t%0.d, %5/z, [%1, %2.d, uxtw] + ld1d\t%0.d, %5/z, [%1, %2.d, uxtw %p4]" +) + +;; ------------------------------------------------------------------------- +;; ---- Extending gather loads +;; ------------------------------------------------------------------------- +;; Includes gather forms of: +;; - LD1B +;; - LD1H +;; - LD1SB +;; - LD1SH +;; - LD1SW +;; - LD1W +;; ------------------------------------------------------------------------- + +;; Predicated extending gather loads for 32-bit elements. Operand 3 is +;; true for unsigned extension and false for signed extension. +(define_insn "@aarch64_gather_load_" + [(set (match_operand:VNx4_WIDE 0 "register_operand" "=w, w, w, w, w, w") + (ANY_EXTEND:VNx4_WIDE + (unspec:VNx4_NARROW + [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:VNx4_WIDE 2 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 3 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER)))] + "TARGET_SVE" + "@ + ld1\t%0.s, %5/z, [%2.s] + ld1\t%0.s, %5/z, [%2.s, #%1] + ld1\t%0.s, %5/z, [%1, %2.s, sxtw] + ld1\t%0.s, %5/z, [%1, %2.s, uxtw] + ld1\t%0.s, %5/z, [%1, %2.s, sxtw %p4] + ld1\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" +) + +;; Predicated extending gather loads for 64-bit elements. The value of +;; operand 3 doesn't matter in this case. +(define_insn "@aarch64_gather_load_" + [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w, w, w") + (ANY_EXTEND:VNx2_WIDE + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:VNx2_WIDE 2 "register_operand" "w, w, w, w") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER)))] + "TARGET_SVE" + "@ + ld1\t%0.d, %5/z, [%2.d] + ld1\t%0.d, %5/z, [%2.d, #%1] + ld1\t%0.d, %5/z, [%1, %2.d] + ld1\t%0.d, %5/z, [%1, %2.d, lsl %p4]" +) + +;; Likewise, but with the offset being sign-extended from 32 bits. 
+(define_insn_and_rewrite "*aarch64_gather_load__sxtw" + [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w") + (ANY_EXTEND:VNx2_WIDE + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk") + (unspec:VNx2DI + [(match_operand 6) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 2 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER)))] + "TARGET_SVE" + "@ + ld1\t%0.d, %5/z, [%1, %2.d, sxtw] + ld1\t%0.d, %5/z, [%1, %2.d, sxtw %p4]" + "&& !rtx_equal_p (operands[5], operands[6])" + { + operands[6] = copy_rtx (operands[5]); + } +) + +;; Likewise, but with the offset being zero-extended from 32 bits. +(define_insn "*aarch64_gather_load__uxtw" + [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w") + (ANY_EXTEND:VNx2_WIDE + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk") + (and:VNx2DI + (match_operand:VNx2DI 2 "register_operand" "w, w") + (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (mem:BLK (scratch))] + UNSPEC_LD1_GATHER)))] + "TARGET_SVE" + "@ + ld1\t%0.d, %5/z, [%1, %2.d, uxtw] + ld1\t%0.d, %5/z, [%1, %2.d, uxtw %p4]" +) + +;; ------------------------------------------------------------------------- +;; ---- First-faulting gather loads +;; ------------------------------------------------------------------------- +;; Includes gather forms of: +;; - LDFF1D +;; - LDFF1W +;; ------------------------------------------------------------------------- + +;; Predicated first-faulting gather loads for 32-bit elements. Operand +;; 3 is true for unsigned extension and false for signed extension. +(define_insn "@aarch64_ldff1_gather" + [(set (match_operand:SVE_FULL_S 0 "register_operand" "=w, w, w, w, w, w") + (unspec:SVE_FULL_S + [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk") + (match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") + (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i") + (mem:BLK (scratch)) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_LDFF1_GATHER))] + "TARGET_SVE" + "@ + ldff1w\t%0.s, %5/z, [%2.s] + ldff1w\t%0.s, %5/z, [%2.s, #%1] + ldff1w\t%0.s, %5/z, [%1, %2.s, sxtw] + ldff1w\t%0.s, %5/z, [%1, %2.s, uxtw] + ldff1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4] + ldff1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" +) + +;; Predicated first-faulting gather loads for 64-bit elements. The value +;; of operand 3 doesn't matter in this case. 
+(define_insn "@aarch64_ldff1_gather" + [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w, w, w") + (unspec:SVE_FULL_D + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk") + (match_operand:VNx2DI 2 "register_operand" "w, w, w, w") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i") + (mem:BLK (scratch)) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_LDFF1_GATHER))] + "TARGET_SVE" + "@ + ldff1d\t%0.d, %5/z, [%2.d] + ldff1d\t%0.d, %5/z, [%2.d, #%1] + ldff1d\t%0.d, %5/z, [%1, %2.d] + ldff1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]" +) + +;; Likewise, but with the offset being sign-extended from 32 bits. +(define_insn_and_rewrite "*aarch64_ldff1_gather_sxtw" + [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w") + (unspec:SVE_FULL_D + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") + (unspec:VNx2DI + [(match_operand 6) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 2 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i") + (mem:BLK (scratch)) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_LDFF1_GATHER))] + "TARGET_SVE" + "@ + ldff1d\t%0.d, %5/z, [%1, %2.d, sxtw] + ldff1d\t%0.d, %5/z, [%1, %2.d, sxtw %p4]" + "&& !rtx_equal_p (operands[5], operands[6])" + { + operands[6] = copy_rtx (operands[5]); + } +) + +;; Likewise, but with the offset being zero-extended from 32 bits. +(define_insn "*aarch64_ldff1_gather_uxtw" + [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w") + (unspec:SVE_FULL_D + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") + (and:VNx2DI + (match_operand:VNx2DI 2 "register_operand" "w, w") + (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i") + (mem:BLK (scratch)) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_LDFF1_GATHER))] + "TARGET_SVE" + "@ + ldff1d\t%0.d, %5/z, [%1, %2.d, uxtw] + ldff1d\t%0.d, %5/z, [%1, %2.d, uxtw %p4]" +) + +;; ------------------------------------------------------------------------- +;; ---- First-faulting extending gather loads +;; ------------------------------------------------------------------------- +;; Includes gather forms of: +;; - LDFF1B +;; - LDFF1H +;; - LDFF1SB +;; - LDFF1SH +;; - LDFF1SW +;; - LDFF1W +;; ------------------------------------------------------------------------- + +;; Predicated extending first-faulting gather loads for 32-bit elements. +;; Operand 3 is true for unsigned extension and false for signed extension. 
+(define_insn "@aarch64_ldff1_gather_" + [(set (match_operand:VNx4_WIDE 0 "register_operand" "=w, w, w, w, w, w") + (ANY_EXTEND:VNx4_WIDE + (unspec:VNx4_NARROW + [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:VNx4_WIDE 2 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") + (mem:BLK (scratch)) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_LDFF1_GATHER)))] + "TARGET_SVE" + "@ + ldff1\t%0.s, %5/z, [%2.s] + ldff1\t%0.s, %5/z, [%2.s, #%1] + ldff1\t%0.s, %5/z, [%1, %2.s, sxtw] + ldff1\t%0.s, %5/z, [%1, %2.s, uxtw] + ldff1\t%0.s, %5/z, [%1, %2.s, sxtw %p4] + ldff1\t%0.s, %5/z, [%1, %2.s, uxtw %p4]" +) + +;; Predicated extending first-faulting gather loads for 64-bit elements. +;; The value of operand 3 doesn't matter in this case. +(define_insn "@aarch64_ldff1_gather_" + [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w, w, w") + (ANY_EXTEND:VNx2_WIDE + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:VNx2_WIDE 2 "register_operand" "w, w, w, w") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") + (mem:BLK (scratch)) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_LDFF1_GATHER)))] + "TARGET_SVE" + "@ + ldff1\t%0.d, %5/z, [%2.d] + ldff1\t%0.d, %5/z, [%2.d, #%1] + ldff1\t%0.d, %5/z, [%1, %2.d] + ldff1\t%0.d, %5/z, [%1, %2.d, lsl %p4]" +) + +;; Likewise, but with the offset being sign-extended from 32 bits. +(define_insn_and_rewrite "*aarch64_ldff1_gather__sxtw" + [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w") + (ANY_EXTEND:VNx2_WIDE + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk") + (unspec:VNx2DI + [(match_operand 6) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 2 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (mem:BLK (scratch)) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_LDFF1_GATHER)))] + "TARGET_SVE" + "@ + ldff1\t%0.d, %5/z, [%1, %2.d, sxtw] + ldff1\t%0.d, %5/z, [%1, %2.d, sxtw %p4]" + "&& !rtx_equal_p (operands[5], operands[6])" + { + operands[6] = copy_rtx (operands[5]); + } +) + +;; Likewise, but with the offset being zero-extended from 32 bits. 
+(define_insn "*aarch64_ldff1_gather__uxtw" + [(set (match_operand:VNx2_WIDE 0 "register_operand" "=w, w") + (ANY_EXTEND:VNx2_WIDE + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk") + (and:VNx2DI + (match_operand:VNx2DI 2 "register_operand" "w, w") + (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (mem:BLK (scratch)) + (reg:VNx16BI FFRT_REGNUM)] + UNSPEC_LDFF1_GATHER)))] + "TARGET_SVE" + "@ + ldff1\t%0.d, %5/z, [%1, %2.d, uxtw] + ldff1\t%0.d, %5/z, [%1, %2.d, uxtw %p4]" +) + +;; ========================================================================= +;; == Prefetches +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- Contiguous prefetches +;; ------------------------------------------------------------------------- +;; Includes contiguous forms of: +;; - PRFB +;; - PRFD +;; - PRFH +;; - PRFW +;; ------------------------------------------------------------------------- + +;; Contiguous predicated prefetches. Operand 2 gives the real prefetch +;; operation (as an svprfop), with operands 3 and 4 providing distilled +;; information. +(define_insn "@aarch64_sve_prefetch" + [(prefetch (unspec:DI + [(match_operand: 0 "register_operand" "Upl") + (match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP") + (match_operand:DI 2 "const_int_operand")] + UNSPEC_SVE_PREFETCH) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand"))] + "TARGET_SVE" + { + operands[1] = gen_rtx_MEM (mode, operands[1]); + return aarch64_output_sve_prefetch ("prf", operands[2], "%0, %1"); + } +) + +;; ------------------------------------------------------------------------- +;; ---- Gather prefetches +;; ------------------------------------------------------------------------- +;; Includes gather forms of: +;; - PRFB +;; - PRFD +;; - PRFH +;; - PRFW +;; ------------------------------------------------------------------------- + +;; Predicated gather prefetches for 32-bit bases and offsets. 
The operands +;; are: +;; 0: the governing predicate +;; 1: the scalar component of the address +;; 2: the vector component of the address +;; 3: 1 for zero extension, 0 for sign extension +;; 4: the scale multiplier +;; 5: a vector zero that identifies the mode of data being accessed +;; 6: the prefetch operator (an svprfop) +;; 7: the normal RTL prefetch rw flag +;; 8: the normal RTL prefetch locality value +(define_insn "@aarch64_sve_gather_prefetch" + [(prefetch (unspec:DI + [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") + (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") + (match_operand:DI 8 "const_int_operand"))] + "TARGET_SVE" + { + static const char *const insns[][2] = { + "prf", "%0, [%2.s]", + "prf", "%0, [%2.s, #%1]", + "prfb", "%0, [%1, %2.s, sxtw]", + "prfb", "%0, [%1, %2.s, uxtw]", + "prf", "%0, [%1, %2.s, sxtw %p4]", + "prf", "%0, [%1, %2.s, uxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); + } +) + +;; Predicated gather prefetches for 64-bit elements. The value of operand 3 +;; doesn't matter in this case. +(define_insn "@aarch64_sve_gather_prefetch" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl") + (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") + (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") + (match_operand:DI 8 "const_int_operand"))] + "TARGET_SVE" + { + static const char *const insns[][2] = { + "prf", "%0, [%2.d]", + "prf", "%0, [%2.d, #%1]", + "prfb", "%0, [%1, %2.d]", + "prf", "%0, [%1, %2.d, lsl %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); + } +) + +;; Likewise, but with the offset being sign-extended from 32 bits. 
+(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") + (unspec:VNx2DI_ONLY + [(match_operand 9) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 2 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") + (match_operand:DI 8 "const_int_operand"))] + "TARGET_SVE" + { + static const char *const insns[][2] = { + "prfb", "%0, [%1, %2.d, sxtw]", + "prf", "%0, [%1, %2.d, sxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); + } + "&& !rtx_equal_p (operands[0], operands[9])" + { + operands[9] = copy_rtx (operands[0]); + } +) + +;; Likewise, but with the offset being zero-extended from 32 bits. +(define_insn "*aarch64_sve_gather_prefetch_uxtw" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") + (and:VNx2DI_ONLY + (match_operand:VNx2DI 2 "register_operand" "w, w") + (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") + (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") + (match_operand:DI 8 "const_int_operand"))] + "TARGET_SVE" + { + static const char *const insns[][2] = { + "prfb", "%0, [%1, %2.d, uxtw]", + "prf", "%0, [%1, %2.d, uxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); + } +) + +;; ========================================================================= +;; == Stores +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- Normal contiguous stores +;; ------------------------------------------------------------------------- +;; Includes contiguous forms of: +;; - ST1B +;; - ST1D +;; - ST1H +;; - ST1W +;; - ST2B +;; - ST2D +;; - ST2H +;; - ST2W +;; - ST3B +;; - ST3D +;; - ST3H +;; - ST3W +;; - ST4B +;; - ST4D +;; - ST4H +;; - ST4W +;; ------------------------------------------------------------------------- + +;; Predicated ST1. +(define_insn "maskstore" + [(set (match_operand:SVE_FULL 0 "memory_operand" "+m") + (unspec:SVE_FULL + [(match_operand: 2 "register_operand" "Upl") + (match_operand:SVE_FULL 1 "register_operand" "w") + (match_dup 0)] + UNSPEC_ST1_SVE))] + "TARGET_SVE" + "st1\t%1., %2, %0" +) + +;; Unpredicated ST[234]. This is always a full update, so the dependence +;; on the old value of the memory location (via (match_dup 0)) is redundant. +;; There doesn't seem to be any obvious benefit to treating the all-true +;; case differently though. In particular, it's very unlikely that we'll +;; only find out during RTL that a store_lanes is dead. 
+(define_expand "vec_store_lanes" + [(set (match_operand:SVE_STRUCT 0 "memory_operand") + (unspec:SVE_STRUCT + [(match_dup 2) + (match_operand:SVE_STRUCT 1 "register_operand") + (match_dup 0)] + UNSPEC_STN))] + "TARGET_SVE" + { + operands[2] = aarch64_ptrue_reg (mode); + } +) + +;; Predicated ST[234]. +(define_insn "vec_mask_store_lanes" + [(set (match_operand:SVE_STRUCT 0 "memory_operand" "+m") + (unspec:SVE_STRUCT + [(match_operand: 2 "register_operand" "Upl") + (match_operand:SVE_STRUCT 1 "register_operand" "w") + (match_dup 0)] + UNSPEC_STN))] + "TARGET_SVE" + "st\t%1, %2, %0" +) + +;; ------------------------------------------------------------------------- +;; ---- Truncating contiguous stores +;; ------------------------------------------------------------------------- +;; Includes: +;; - ST1B +;; - ST1H +;; - ST1W +;; ------------------------------------------------------------------------- + +;; Predicated truncate and store, with 8 elements per 128-bit block. +(define_insn "@aarch64_store_trunc" + [(set (match_operand:VNx8_NARROW 0 "memory_operand" "+m") + (unspec:VNx8_NARROW + [(match_operand:VNx8BI 2 "register_operand" "Upl") + (truncate:VNx8_NARROW + (match_operand:VNx8_WIDE 1 "register_operand" "w")) + (match_dup 0)] + UNSPEC_ST1_SVE))] + "TARGET_SVE" + "st1\t%1., %2, %0" +) + +;; Predicated truncate and store, with 4 elements per 128-bit block. +(define_insn "@aarch64_store_trunc" + [(set (match_operand:VNx4_NARROW 0 "memory_operand" "+m") + (unspec:VNx4_NARROW + [(match_operand:VNx4BI 2 "register_operand" "Upl") + (truncate:VNx4_NARROW + (match_operand:VNx4_WIDE 1 "register_operand" "w")) + (match_dup 0)] + UNSPEC_ST1_SVE))] + "TARGET_SVE" + "st1\t%1., %2, %0" +) + +;; Predicated truncate and store, with 2 elements per 128-bit block. +(define_insn "@aarch64_store_trunc" + [(set (match_operand:VNx2_NARROW 0 "memory_operand" "+m") + (unspec:VNx2_NARROW + [(match_operand:VNx2BI 2 "register_operand" "Upl") + (truncate:VNx2_NARROW + (match_operand:VNx2_WIDE 1 "register_operand" "w")) + (match_dup 0)] + UNSPEC_ST1_SVE))] + "TARGET_SVE" + "st1\t%1., %2, %0" +) + +;; ------------------------------------------------------------------------- +;; ---- Non-temporal contiguous stores +;; ------------------------------------------------------------------------- +;; Includes: +;; - STNT1B +;; - STNT1D +;; - STNT1H +;; - STNT1W +;; ------------------------------------------------------------------------- + +(define_insn "@aarch64_stnt1" + [(set (match_operand:SVE_FULL 0 "memory_operand" "+m") + (unspec:SVE_FULL + [(match_operand: 2 "register_operand" "Upl") + (match_operand:SVE_FULL 1 "register_operand" "w") + (match_dup 0)] + UNSPEC_STNT1_SVE))] + "TARGET_SVE" + "stnt1\t%1., %2, %0" +) + +;; ------------------------------------------------------------------------- +;; ---- Normal scatter stores +;; ------------------------------------------------------------------------- +;; Includes scatter forms of: +;; - ST1D +;; - ST1W +;; ------------------------------------------------------------------------- + +;; Unpredicated scatter stores. 
+(define_expand "scatter_store" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_dup 5) + (match_operand:DI 0 "aarch64_sve_gather_offset_") + (match_operand: 1 "register_operand") + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_") + (match_operand:SVE_FULL_SD 4 "register_operand")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + { + operands[5] = aarch64_ptrue_reg (mode); + } +) + +;; Predicated scatter stores for 32-bit elements. Operand 2 is true for +;; unsigned extension and false for signed extension. +(define_insn "mask_scatter_store" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:DI 0 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk") + (match_operand:VNx4SI 1 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 2 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") + (match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i") + (match_operand:SVE_FULL_S 4 "register_operand" "w, w, w, w, w, w")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1w\t%4.s, %5, [%1.s] + st1w\t%4.s, %5, [%1.s, #%0] + st1w\t%4.s, %5, [%0, %1.s, sxtw] + st1w\t%4.s, %5, [%0, %1.s, uxtw] + st1w\t%4.s, %5, [%0, %1.s, sxtw %p3] + st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]" +) + +;; Predicated scatter stores for 64-bit elements. The value of operand 2 +;; doesn't matter in this case. +(define_insn "mask_scatter_store" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") + (match_operand:DI 0 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk") + (match_operand:VNx2DI 1 "register_operand" "w, w, w, w") + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i") + (match_operand:SVE_FULL_D 4 "register_operand" "w, w, w, w")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1d\t%4.d, %5, [%1.d] + st1d\t%4.d, %5, [%1.d, #%0] + st1d\t%4.d, %5, [%0, %1.d] + st1d\t%4.d, %5, [%0, %1.d, lsl %p3]" +) + +;; Likewise, but with the offset being sign-extended from 32 bits. +(define_insn_and_rewrite "*mask_scatter_store_sxtw" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 0 "register_operand" "rk, rk") + (unspec:VNx2DI + [(match_operand 6) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 1 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, i") + (match_operand:SVE_FULL_D 4 "register_operand" "w, w")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1d\t%4.d, %5, [%0, %1.d, sxtw] + st1d\t%4.d, %5, [%0, %1.d, sxtw %p3]" + "&& !rtx_equal_p (operands[5], operands[6])" + { + operands[6] = copy_rtx (operands[5]); + } +) + +;; Likewise, but with the offset being zero-extended from 32 bits. 
+(define_insn "*mask_scatter_store_uxtw" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 0 "aarch64_reg_or_zero" "rk, rk") + (and:VNx2DI + (match_operand:VNx2DI 1 "register_operand" "w, w") + (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, i") + (match_operand:SVE_FULL_D 4 "register_operand" "w, w")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1d\t%4.d, %5, [%0, %1.d, uxtw] + st1d\t%4.d, %5, [%0, %1.d, uxtw %p3]" +) + +;; ------------------------------------------------------------------------- +;; ---- Truncating scatter stores +;; ------------------------------------------------------------------------- +;; Includes scatter forms of: +;; - ST1B +;; - ST1H +;; - ST1W +;; ------------------------------------------------------------------------- + +;; Predicated truncating scatter stores for 32-bit elements. Operand 2 is +;; true for unsigned extension and false for signed extension. +(define_insn "@aarch64_scatter_store_trunc" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:DI 0 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:VNx4SI 1 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 2 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1") + (match_operand:DI 3 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") + (truncate:VNx4_NARROW + (match_operand:VNx4_WIDE 4 "register_operand" "w, w, w, w, w, w"))] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1\t%4.s, %5, [%1.s] + st1\t%4.s, %5, [%1.s, #%0] + st1\t%4.s, %5, [%0, %1.s, sxtw] + st1\t%4.s, %5, [%0, %1.s, uxtw] + st1\t%4.s, %5, [%0, %1.s, sxtw %p3] + st1\t%4.s, %5, [%0, %1.s, uxtw %p3]" +) + +;; Predicated truncating scatter stores for 64-bit elements. The value of +;; operand 2 doesn't matter in this case. +(define_insn "@aarch64_scatter_store_trunc" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl") + (match_operand:DI 0 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:VNx2DI 1 "register_operand" "w, w, w, w") + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") + (truncate:VNx2_NARROW + (match_operand:VNx2_WIDE 4 "register_operand" "w, w, w, w"))] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1\t%4.d, %5, [%1.d] + st1\t%4.d, %5, [%1.d, #%0] + st1\t%4.d, %5, [%0, %1.d] + st1\t%4.d, %5, [%0, %1.d, lsl %p3]" +) + +;; Likewise, but with the offset being sign-extended from 32 bits. 
+(define_insn_and_rewrite "*aarch64_scatter_store_trunc_sxtw" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 0 "register_operand" "rk, rk") + (unspec:VNx2DI + [(match_operand 6) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 1 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_" "Ui1, i") + (truncate:VNx2_NARROW + (match_operand:VNx2_WIDE 4 "register_operand" "w, w"))] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1\t%4.d, %5, [%0, %1.d, sxtw] + st1\t%4.d, %5, [%0, %1.d, sxtw %p3]" + "&& !rtx_equal_p (operands[5], operands[6])" + { + operands[6] = copy_rtx (operands[5]); + } +) + +;; Likewise, but with the offset being zero-extended from 32 bits. +(define_insn "*aarch64_scatter_store_trunc_uxtw" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl") + (match_operand:DI 0 "aarch64_reg_or_zero" "rk, rk") + (and:VNx2DI + (match_operand:VNx2DI 1 "register_operand" "w, w") + (match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_" "Ui1, i") + (truncate:VNx2_NARROW + (match_operand:VNx2_WIDE 4 "register_operand" "w, w"))] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1\t%4.d, %5, [%0, %1.d, uxtw] + st1\t%4.d, %5, [%0, %1.d, uxtw %p3]" +) + +;; ========================================================================= +;; == Vector creation +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Duplicate element +;; ------------------------------------------------------------------------- +;; Includes: +;; - DUP +;; - MOV +;; - LD1RB +;; - LD1RD +;; - LD1RH +;; - LD1RW +;; - LD1ROB (F64MM) +;; - LD1ROD (F64MM) +;; - LD1ROH (F64MM) +;; - LD1ROW (F64MM) +;; - LD1RQB +;; - LD1RQD +;; - LD1RQH +;; - LD1RQW +;; ------------------------------------------------------------------------- + +(define_expand "vec_duplicate" + [(parallel + [(set (match_operand:SVE_FULL 0 "register_operand") + (vec_duplicate:SVE_FULL + (match_operand: 1 "aarch64_sve_dup_operand"))) + (clobber (scratch:VNx16BI))])] + "TARGET_SVE" + { + if (MEM_P (operands[1])) + { + rtx ptrue = aarch64_ptrue_reg (mode); + emit_insn (gen_sve_ld1r (operands[0], ptrue, operands[1], + CONST0_RTX (mode))); + DONE; + } + } +) + +;; Accept memory operands for the benefit of combine, and also in case +;; the scalar input gets spilled to memory during RA. We want to split +;; the load at the first opportunity in order to allow the PTRUE to be +;; optimized with surrounding code. +(define_insn_and_split "*vec_duplicate_reg" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w") + (vec_duplicate:SVE_FULL + (match_operand: 1 "aarch64_sve_dup_operand" "r, w, Uty"))) + (clobber (match_scratch:VNx16BI 2 "=X, X, Upl"))] + "TARGET_SVE" + "@ + mov\t%0., %1 + mov\t%0., %1 + #" + "&& MEM_P (operands[1])" + [(const_int 0)] + { + if (GET_CODE (operands[2]) == SCRATCH) + operands[2] = gen_reg_rtx (VNx16BImode); + emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode)); + rtx gp = gen_lowpart (mode, operands[2]); + emit_insn (gen_sve_ld1r (operands[0], gp, operands[1], + CONST0_RTX (mode))); + DONE; + } + [(set_attr "length" "4,4,8")] +) + +;; Duplicate an Advanced SIMD vector to fill an SVE vector (LE version). 
+(define_insn "@aarch64_vec_duplicate_vq_le" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (vec_duplicate:SVE_FULL + (match_operand: 1 "register_operand" "w")))] + "TARGET_SVE && !BYTES_BIG_ENDIAN" + { + operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); + return "dup\t%0.q, %1.q[0]"; + } +) + +;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version). +;; The SVE register layout puts memory lane N into (architectural) +;; register lane N, whereas the Advanced SIMD layout puts the memory +;; lsb into the register lsb. We therefore have to describe this in rtl +;; terms as a reverse of the V128 vector followed by a duplicate. +(define_insn "@aarch64_vec_duplicate_vq_be" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (vec_duplicate:SVE_FULL + (vec_select: + (match_operand: 1 "register_operand" "w") + (match_operand 2 "descending_int_parallel"))))] + "TARGET_SVE + && BYTES_BIG_ENDIAN + && known_eq (INTVAL (XVECEXP (operands[2], 0, 0)), + GET_MODE_NUNITS (mode) - 1)" + { + operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); + return "dup\t%0.q, %1.q[0]"; + } +) + +;; This is used for vec_duplicates from memory, but can also +;; be used by combine to optimize selects of a a vec_duplicate +;; with zero. +(define_insn "sve_ld1r" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand: 1 "register_operand" "Upl") + (vec_duplicate:SVE_FULL + (match_operand: 2 "aarch64_sve_ld1r_operand" "Uty")) + (match_operand:SVE_FULL 3 "aarch64_simd_imm_zero")] + UNSPEC_SEL))] + "TARGET_SVE" + "ld1r\t%0., %1/z, %2" +) + +;; Load 128 bits from memory under predicate control and duplicate to +;; fill a vector. +(define_insn "@aarch64_sve_ld1rq" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand: 2 "register_operand" "Upl") + (match_operand: 1 "aarch64_sve_ld1rq_operand" "UtQ")] + UNSPEC_LD1RQ))] + "TARGET_SVE" + { + operands[1] = gen_rtx_MEM (mode, XEXP (operands[1], 0)); + return "ld1rq\t%0., %2/z, %1"; + } +) + +(define_insn "@aarch64_sve_ld1ro" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand: 2 "register_operand" "Upl") + (match_operand:OI 1 "aarch64_sve_ld1ro_operand_" + "UO")] + UNSPEC_LD1RO))] + "TARGET_SVE_F64MM" + { + operands[1] = gen_rtx_MEM (mode, XEXP (operands[1], 0)); + return "ld1ro\t%0., %2/z, %1"; + } +) + +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Initialize from individual elements +;; ------------------------------------------------------------------------- +;; Includes: +;; - INSR +;; ------------------------------------------------------------------------- + +(define_expand "vec_init" + [(match_operand:SVE_FULL 0 "register_operand") + (match_operand 1 "")] + "TARGET_SVE" + { + aarch64_sve_expand_vector_init (operands[0], operands[1]); + DONE; + } +) + +;; Shift an SVE vector left and insert a scalar into element 0. 
+(define_insn "vec_shl_insert_" + [(set (match_operand:SVE_FULL 0 "register_operand" "=?w, w, ??&w, ?&w") + (unspec:SVE_FULL + [(match_operand:SVE_FULL 1 "register_operand" "0, 0, w, w") + (match_operand: 2 "aarch64_reg_or_zero" "rZ, w, rZ, w")] + UNSPEC_INSR))] + "TARGET_SVE" + "@ + insr\t%0., %2 + insr\t%0., %2 + movprfx\t%0, %1\;insr\t%0., %2 + movprfx\t%0, %1\;insr\t%0., %2" + [(set_attr "movprfx" "*,*,yes,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Linear series +;; ------------------------------------------------------------------------- +;; Includes: +;; - INDEX +;; ------------------------------------------------------------------------- + +(define_insn "vec_series" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w") + (vec_series:SVE_FULL_I + (match_operand: 1 "aarch64_sve_index_operand" "Usi, r, r") + (match_operand: 2 "aarch64_sve_index_operand" "r, Usi, r")))] + "TARGET_SVE" + "@ + index\t%0., #%1, %2 + index\t%0., %1, #%2 + index\t%0., %1, %2" +) + +;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range +;; of an INDEX instruction. +(define_insn "*vec_series_plus" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") + (plus:SVE_FULL_I + (vec_duplicate:SVE_FULL_I + (match_operand: 1 "register_operand" "r")) + (match_operand:SVE_FULL_I 2 "immediate_operand")))] + "TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])" + { + operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]); + return "index\t%0., %1, #%2"; + } +) + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Duplicate element +;; ------------------------------------------------------------------------- +;; The patterns in this section are synthetic. +;; ------------------------------------------------------------------------- + +;; Implement a predicate broadcast by shifting the low bit of the scalar +;; input into the top bit and using a WHILELO. An alternative would be to +;; duplicate the input and do a compare with zero. +(define_expand "vec_duplicate" + [(set (match_operand:PRED_ALL 0 "register_operand") + (vec_duplicate:PRED_ALL (match_operand:QI 1 "register_operand")))] + "TARGET_SVE" + { + rtx tmp = gen_reg_rtx (DImode); + rtx op1 = gen_lowpart (DImode, operands[1]); + emit_insn (gen_ashldi3 (tmp, op1, gen_int_mode (63, DImode))); + emit_insn (gen_while_ultdi (operands[0], const0_rtx, tmp)); + DONE; + } +) + +;; ========================================================================= +;; == Vector decomposition +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Extract index +;; ------------------------------------------------------------------------- +;; Includes: +;; - DUP (Advanced SIMD) +;; - DUP (SVE) +;; - EXT (SVE) +;; - ST1 (Advanced SIMD) +;; - UMOV (Advanced SIMD) +;; ------------------------------------------------------------------------- + +(define_expand "vec_extract" + [(set (match_operand: 0 "register_operand") + (vec_select: + (match_operand:SVE_FULL 1 "register_operand") + (parallel [(match_operand:SI 2 "nonmemory_operand")])))] + "TARGET_SVE" + { + poly_int64 val; + if (poly_int_rtx_p (operands[2], &val) + && known_eq (val, GET_MODE_NUNITS (mode) - 1)) + { + /* The last element can be extracted with a LASTB and a false + predicate. 
*/ + rtx sel = aarch64_pfalse_reg (mode); + emit_insn (gen_extract_last_ (operands[0], sel, operands[1])); + DONE; + } + if (!CONST_INT_P (operands[2])) + { + /* Create an index with operand[2] as the base and -1 as the step. + It will then be zero for the element we care about. */ + rtx index = gen_lowpart (mode, operands[2]); + index = force_reg (mode, index); + rtx series = gen_reg_rtx (mode); + emit_insn (gen_vec_series (series, index, constm1_rtx)); + + /* Get a predicate that is true for only that element. */ + rtx zero = CONST0_RTX (mode); + rtx cmp = gen_rtx_EQ (mode, series, zero); + rtx sel = gen_reg_rtx (mode); + emit_insn (gen_vec_cmp (sel, cmp, series, zero)); + + /* Select the element using LASTB. */ + emit_insn (gen_extract_last_ (operands[0], sel, operands[1])); + DONE; + } + } +) + +;; Extract element zero. This is a special case because we want to force +;; the registers to be the same for the second alternative, and then +;; split the instruction into nothing after RA. +(define_insn_and_split "*vec_extract_0" + [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") + (vec_select: + (match_operand:SVE_FULL 1 "register_operand" "w, 0, w") + (parallel [(const_int 0)])))] + "TARGET_SVE" + { + operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); + switch (which_alternative) + { + case 0: + return "umov\\t%0, %1.[0]"; + case 1: + return "#"; + case 2: + return "st1\\t{%1.}[0], %0"; + default: + gcc_unreachable (); + } + } + "&& reload_completed + && REG_P (operands[0]) + && REGNO (operands[0]) == REGNO (operands[1])" + [(const_int 0)] + { + emit_note (NOTE_INSN_DELETED); + DONE; + } + [(set_attr "type" "neon_to_gp_q, untyped, neon_store1_one_lane_q")] +) + +;; Extract an element from the Advanced SIMD portion of the register. +;; We don't just reuse the aarch64-simd.md pattern because we don't +;; want any change in lane number on big-endian targets. +(define_insn "*vec_extract_v128" + [(set (match_operand: 0 "aarch64_simd_nonimmediate_operand" "=r, w, Utv") + (vec_select: + (match_operand:SVE_FULL 1 "register_operand" "w, w, w") + (parallel [(match_operand:SI 2 "const_int_operand")])))] + "TARGET_SVE + && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 1, 15)" + { + operands[1] = gen_rtx_REG (mode, REGNO (operands[1])); + switch (which_alternative) + { + case 0: + return "umov\\t%0, %1.[%2]"; + case 1: + return "dup\\t%0, %1.[%2]"; + case 2: + return "st1\\t{%1.}[%2], %0"; + default: + gcc_unreachable (); + } + } + [(set_attr "type" "neon_to_gp_q, neon_dup_q, neon_store1_one_lane_q")] +) + +;; Extract an element in the range of DUP. This pattern allows the +;; source and destination to be different. +(define_insn "*vec_extract_dup" + [(set (match_operand: 0 "register_operand" "=w") + (vec_select: + (match_operand:SVE_FULL 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "const_int_operand")])))] + "TARGET_SVE + && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 16, 63)" + { + operands[0] = gen_rtx_REG (mode, REGNO (operands[0])); + return "dup\t%0., %1.[%2]"; + } +) + +;; Extract an element outside the range of DUP. This pattern requires the +;; source and destination to be the same. 
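+;; Worked example (illustrative, assuming the vector is long enough to
+;; have such a lane): for 32-bit elements, lane 20 lives at byte offset
+;; 20 * 4 = 80, which is outside DUP's 0-63 byte range, so the pattern
+;; below rewrites the lane number as that byte offset and emits
+;; "ext z0.b, z0.b, z0.b, #80" to rotate the element down to lane 0.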
+(define_insn "*vec_extract_ext" + [(set (match_operand: 0 "register_operand" "=w, ?&w") + (vec_select: + (match_operand:SVE_FULL 1 "register_operand" "0, w") + (parallel [(match_operand:SI 2 "const_int_operand")])))] + "TARGET_SVE && INTVAL (operands[2]) * GET_MODE_SIZE (mode) >= 64" + { + operands[0] = gen_rtx_REG (mode, REGNO (operands[0])); + operands[2] = GEN_INT (INTVAL (operands[2]) * GET_MODE_SIZE (mode)); + return (which_alternative == 0 + ? "ext\t%0.b, %0.b, %0.b, #%2" + : "movprfx\t%0, %1\;ext\t%0.b, %0.b, %1.b, #%2"); + } + [(set_attr "movprfx" "*,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Extract active element +;; ------------------------------------------------------------------------- +;; Includes: +;; - LASTA +;; - LASTB +;; ------------------------------------------------------------------------- + +;; Extract the last active element of operand 1 into operand 0. +;; If no elements are active, extract the last inactive element instead. +(define_insn "@extract__" + [(set (match_operand: 0 "register_operand" "=?r, w") + (unspec: + [(match_operand: 1 "register_operand" "Upl, Upl") + (match_operand:SVE_FULL 2 "register_operand" "w, w")] + LAST))] + "TARGET_SVE" + "@ + last\t%0, %1, %2. + last\t%0, %1, %2." +) + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Extract index +;; ------------------------------------------------------------------------- +;; The patterns in this section are synthetic. +;; ------------------------------------------------------------------------- + +;; Handle extractions from a predicate by converting to an integer vector +;; and extracting from there. +(define_expand "vec_extract" + [(match_operand: 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand:SI 2 "nonmemory_operand") + ;; Dummy operand to which we can attach the iterator. + (reg:SVE_FULL_I V0_REGNUM)] + "TARGET_SVE" + { + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_vcond_mask_ (tmp, operands[1], + CONST1_RTX (mode), + CONST0_RTX (mode))); + emit_insn (gen_vec_extract (operands[0], tmp, operands[2])); + DONE; + } +) + +;; ========================================================================= +;; == Unary arithmetic +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [INT] General unary arithmetic corresponding to rtx codes +;; ------------------------------------------------------------------------- +;; Includes: +;; - ABS +;; - CLS (= clrsb) +;; - CLZ +;; - CNT (= popcount) +;; - NEG +;; - NOT +;; ------------------------------------------------------------------------- + +;; Unpredicated integer unary arithmetic. +(define_expand "2" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_dup 2) + (SVE_INT_UNARY:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + { + operands[2] = aarch64_ptrue_reg (mode); + } +) + +;; Integer unary arithmetic predicated with a PTRUE. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl") + (SVE_INT_UNARY:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + "\t%0., %1/m, %2." +) + +;; Predicated integer unary arithmetic with merging. 
+(define_expand "@cond_" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (SVE_INT_UNARY:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand")) + (match_operand:SVE_FULL_I 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated integer unary arithmetic, merging with the first input. +(define_insn "*cond__2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (SVE_INT_UNARY:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w")) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0. + movprfx\t%0, %2\;\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer unary arithmetic, merging with an independent value. +;; +;; The earlyclobber isn't needed for the first alternative, but omitting +;; it would only help the case in which operands 2 and 3 are the same, +;; which is handled above rather than here. Marking all the alternatives +;; as earlyclobber helps to make the instruction more regular to the +;; register allocator. +(define_insn "*cond__any" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (SVE_INT_UNARY:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w")) + (match_operand:SVE_FULL_I 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[3])" + "@ + \t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %2. + movprfx\t%0, %3\;\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] General unary arithmetic corresponding to unspecs +;; ------------------------------------------------------------------------- +;; Includes +;; - RBIT +;; - REVB +;; - REVH +;; - REVW +;; ------------------------------------------------------------------------- + +;; Predicated integer unary operations. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand" "w")] + SVE_INT_UNARY)] + UNSPEC_PRED_X))] + "TARGET_SVE && >= " + "\t%0., %1/m, %2." +) + +;; Predicated integer unary operations with merging. +(define_insn "@cond_" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand" "w, w, w")] + SVE_INT_UNARY) + (match_operand:SVE_FULL_I 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && >= " + "@ + \t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %2. + movprfx\t%0, %3\;\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Sign extension +;; ------------------------------------------------------------------------- +;; Includes: +;; - SXTB +;; - SXTH +;; - SXTW +;; ------------------------------------------------------------------------- + +;; Predicated SXT[BHW]. 
+(define_insn "@aarch64_pred_sxt" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") + (unspec:SVE_FULL_HSDI + [(match_operand: 1 "register_operand" "Upl") + (sign_extend:SVE_FULL_HSDI + (truncate:SVE_PARTIAL + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")))] + UNSPEC_PRED_X))] + "TARGET_SVE && (~ & ) == 0" + "sxt\t%0., %1/m, %2." +) + +;; Predicated SXT[BHW] with merging. +(define_insn "@aarch64_cond_sxt" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w, ?&w, ?&w") + (unspec:SVE_FULL_HSDI + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (sign_extend:SVE_FULL_HSDI + (truncate:SVE_PARTIAL + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w"))) + (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && (~ & ) == 0" + "@ + sxt\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;sxt\t%0., %1/m, %2. + movprfx\t%0, %3\;sxt\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Zero extension +;; ------------------------------------------------------------------------- +;; Includes: +;; - UXTB +;; - UXTH +;; - UXTW +;; ------------------------------------------------------------------------- + +;; Match UXT[BHW] as a conditional AND of a constant, merging with the +;; first input. +(define_insn "*cond_uxt_2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (and:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "aarch64_sve_uxt_immediate")) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + uxt%e3\t%0., %1/m, %0. + movprfx\t%0, %2\;uxt%e3\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes")] +) + +;; Match UXT[BHW] as a conditional AND of a constant, merging with an +;; independent value. +;; +;; The earlyclobber isn't needed for the first alternative, but omitting +;; it would only help the case in which operands 2 and 4 are the same, +;; which is handled above rather than here. Marking all the alternatives +;; as early-clobber helps to make the instruction more regular to the +;; register allocator. +(define_insn "*cond_uxt_any" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (and:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w") + (match_operand:SVE_FULL_I 3 "aarch64_sve_uxt_immediate")) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + uxt%e3\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;uxt%e3\t%0., %1/m, %2. + movprfx\t%0, %4\;uxt%e3\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Logical inverse +;; ------------------------------------------------------------------------- +;; Includes: +;; - CNOT +;; ------------------------------------------------------------------------- + +;; Predicated logical inverse. 
+(define_expand "@aarch64_pred_cnot" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(unspec: + [(match_operand: 1 "register_operand") + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (eq: + (match_operand:SVE_FULL_I 3 "register_operand") + (match_dup 4))] + UNSPEC_PRED_Z) + (match_dup 5) + (match_dup 4)] + UNSPEC_SEL))] + "TARGET_SVE" + { + operands[4] = CONST0_RTX (mode); + operands[5] = CONST1_RTX (mode); + } +) + +(define_insn "*cnot" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") + (unspec:SVE_FULL_I + [(unspec: + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (eq: + (match_operand:SVE_FULL_I 2 "register_operand" "w") + (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + UNSPEC_PRED_Z) + (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "cnot\t%0., %1/m, %2." +) + +;; Predicated logical inverse with merging. +(define_expand "@cond_cnot" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_I + [(unspec: + [(match_dup 4) + (const_int SVE_KNOWN_PTRUE) + (eq: + (match_operand:SVE_FULL_I 2 "register_operand") + (match_dup 5))] + UNSPEC_PRED_Z) + (match_dup 6) + (match_dup 5)] + UNSPEC_SEL) + (match_operand:SVE_FULL_I 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" + { + operands[4] = CONSTM1_RTX (mode); + operands[5] = CONST0_RTX (mode); + operands[6] = CONST1_RTX (mode); + } +) + +;; Predicated logical inverse, merging with the first input. +(define_insn_and_rewrite "*cond_cnot_2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + ;; Logical inverse of operand 2 (as above). + (unspec:SVE_FULL_I + [(unspec: + [(match_operand 5) + (const_int SVE_KNOWN_PTRUE) + (eq: + (match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + UNSPEC_PRED_Z) + (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_dup 3)] + UNSPEC_SEL) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + cnot\t%0., %1/m, %0. + movprfx\t%0, %2\;cnot\t%0., %1/m, %2." + "&& !CONSTANT_P (operands[5])" + { + operands[5] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated logical inverse, merging with an independent value. +;; +;; The earlyclobber isn't needed for the first alternative, but omitting +;; it would only help the case in which operands 2 and 6 are the same, +;; which is handled above rather than here. Marking all the alternatives +;; as earlyclobber helps to make the instruction more regular to the +;; register allocator. +(define_insn_and_rewrite "*cond_cnot_any" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + ;; Logical inverse of operand 2 (as above). + (unspec:SVE_FULL_I + [(unspec: + [(match_operand 5) + (const_int SVE_KNOWN_PTRUE) + (eq: + (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w") + (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + UNSPEC_PRED_Z) + (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_dup 3)] + UNSPEC_SEL) + (match_operand:SVE_FULL_I 6 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[6])" + "@ + cnot\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;cnot\t%0., %1/m, %2. 
+ movprfx\t%0, %6\;cnot\t%0., %1/m, %2." + "&& !CONSTANT_P (operands[5])" + { + operands[5] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [FP<-INT] General unary arithmetic that maps to unspecs +;; ------------------------------------------------------------------------- +;; Includes: +;; - FEXPA +;; ------------------------------------------------------------------------- + +;; Unpredicated unary operations that take an integer and return a float. +(define_insn "@aarch64_sve_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "w")] + SVE_FP_UNARY_INT))] + "TARGET_SVE" + "\t%0., %1." +) + +;; ------------------------------------------------------------------------- +;; ---- [FP] General unary arithmetic corresponding to unspecs +;; ------------------------------------------------------------------------- +;; Includes: +;; - FABS +;; - FNEG +;; - FRECPE +;; - FRECPX +;; - FRINTA +;; - FRINTI +;; - FRINTM +;; - FRINTN +;; - FRINTP +;; - FRINTX +;; - FRINTZ +;; - FRSQRT +;; - FSQRT +;; ------------------------------------------------------------------------- + +;; Unpredicated floating-point unary operations. +(define_insn "@aarch64_sve_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 1 "register_operand" "w")] + SVE_FP_UNARY))] + "TARGET_SVE" + "\t%0., %1." +) + +;; Unpredicated floating-point unary operations. +(define_expand "2" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 2) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 1 "register_operand")] + SVE_COND_FP_UNARY))] + "TARGET_SVE" + { + operands[2] = aarch64_ptrue_reg (mode); + } +) + +;; Predicated floating-point unary operations. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 3 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + SVE_COND_FP_UNARY))] + "TARGET_SVE" + "\t%0., %1/m, %2." +) + +;; Predicated floating-point unary arithmetic with merging. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand")] + SVE_COND_FP_UNARY) + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated floating-point unary arithmetic, merging with the first input. +(define_insn_and_rewrite "*cond__2" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 3) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w")] + SVE_COND_FP_UNARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[3], operands[1])" + "@ + \t%0., %1/m, %0. + movprfx\t%0, %2\;\t%0., %1/m, %2." + "&& !rtx_equal_p (operands[1], operands[3])" + { + operands[3] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated floating-point unary arithmetic, merging with an independent +;; value. 
+;; +;; The earlyclobber isn't needed for the first alternative, but omitting +;; it would only help the case in which operands 2 and 3 are the same, +;; which is handled above rather than here. Marking all the alternatives +;; as earlyclobber helps to make the instruction more regular to the +;; register allocator. +(define_insn_and_rewrite "*cond__any" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] + SVE_COND_FP_UNARY) + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[3]) + && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + \t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %2. + movprfx\t%0, %3\;\t%0., %1/m, %2." + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Inverse +;; ------------------------------------------------------------------------- +;; Includes: +;; - NOT +;; ------------------------------------------------------------------------- + +;; Unpredicated predicate inverse. +(define_expand "one_cmpl2" + [(set (match_operand:PRED_ALL 0 "register_operand") + (and:PRED_ALL + (not:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")) + (match_dup 2)))] + "TARGET_SVE" + { + operands[2] = aarch64_ptrue_reg (mode); + } +) + +;; Predicated predicate inverse. +(define_insn "*one_cmpl3" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (and:PRED_ALL + (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (match_operand:PRED_ALL 1 "register_operand" "Upa")))] + "TARGET_SVE" + "not\t%0.b, %1/z, %2.b" +) + +;; ========================================================================= +;; == Binary arithmetic +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [INT] General binary arithmetic corresponding to rtx codes +;; ------------------------------------------------------------------------- +;; Includes: +;; - ADD (merging form only) +;; - AND (merging form only) +;; - ASR (merging form only) +;; - EOR (merging form only) +;; - LSL (merging form only) +;; - LSR (merging form only) +;; - MUL +;; - ORR (merging form only) +;; - SMAX +;; - SMIN +;; - SUB (merging form only) +;; - UMAX +;; - UMIN +;; ------------------------------------------------------------------------- + +;; Unpredicated integer binary operations that have an immediate form. +(define_expand "3" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_dup 3) + (SVE_INT_BINARY_IMM:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand") + (match_operand:SVE_FULL_I 2 "aarch64_sve__operand"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + { + operands[3] = aarch64_ptrue_reg (mode); + } +) + +;; Integer binary operations that have an immediate form, predicated +;; with a PTRUE. We don't actually need the predicate for the first +;; and third alternatives, but using Upa or X isn't likely to gain much +;; and would make the instruction seem less uniform to the register +;; allocator. 
+(define_insn_and_split "@aarch64_pred_" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") + (SVE_INT_BINARY_IMM:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "%0, 0, w, w") + (match_operand:SVE_FULL_I 3 "aarch64_sve__operand" ", w, , w"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + "@ + # + \t%0., %1/m, %0., %3. + # + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + ; Split the unpredicated form after reload, so that we don't have + ; the unnecessary PTRUE. + "&& reload_completed + && !register_operand (operands[3], mode)" + [(set (match_dup 0) + (SVE_INT_BINARY_IMM:SVE_FULL_I (match_dup 2) (match_dup 3)))] + "" + [(set_attr "movprfx" "*,*,yes,yes")] +) + +;; Unpredicated binary operations with a constant (post-RA only). +;; These are generated by splitting a predicated instruction whose +;; predicate is unused. +(define_insn "*post_ra_3" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (SVE_INT_BINARY_IMM:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand" "0, w") + (match_operand:SVE_FULL_I 2 "aarch64_sve__immediate")))] + "TARGET_SVE && reload_completed" + "@ + \t%0., %0., #%2 + movprfx\t%0, %1\;\t%0., %0., #%2" + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer operations with merging. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (SVE_INT_BINARY:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "")) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated integer operations, merging with the first input. +(define_insn "*cond__2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (SVE_INT_BINARY:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %3. + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer operations, merging with the second input. +(define_insn "*cond__3" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (SVE_INT_BINARY:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "0, w")) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %2. + movprfx\t%0, %3\;\t%0., %1/m, %0., %2." + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer operations, merging with an independent value. +(define_insn_and_rewrite "*cond__any" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (SVE_INT_BINARY:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, 0, w, w, w")) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4])" + "@ + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %2. 
+ movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., %3. + #" + "&& reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Addition +;; ------------------------------------------------------------------------- +;; Includes: +;; - ADD +;; - DECB +;; - DECD +;; - DECH +;; - DECW +;; - INCB +;; - INCD +;; - INCH +;; - INCW +;; - SUB +;; ------------------------------------------------------------------------- + +(define_insn "add3" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w, ?w, ?w, w") + (plus:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand" "%0, 0, 0, w, w, w") + (match_operand:SVE_FULL_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))] + "TARGET_SVE" + "@ + add\t%0., %0., #%D2 + sub\t%0., %0., #%N2 + * return aarch64_output_sve_vector_inc_dec (\"%0.\", operands[2]); + movprfx\t%0, %1\;add\t%0., %0., #%D2 + movprfx\t%0, %1\;sub\t%0., %0., #%N2 + add\t%0., %1., %2." + [(set_attr "movprfx" "*,*,*,yes,yes,*")] +) + +;; Merging forms are handled through SVE_INT_BINARY. + +;; ------------------------------------------------------------------------- +;; ---- [INT] Subtraction +;; ------------------------------------------------------------------------- +;; Includes: +;; - SUB +;; - SUBR +;; ------------------------------------------------------------------------- + +(define_insn "sub3" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w") + (minus:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "aarch64_sve_arith_operand" "w, vsa, vsa") + (match_operand:SVE_FULL_I 2 "register_operand" "w, 0, w")))] + "TARGET_SVE" + "@ + sub\t%0., %1., %2. + subr\t%0., %0., #%D1 + movprfx\t%0, %2\;subr\t%0., %0., #%D1" + [(set_attr "movprfx" "*,*,yes")] +) + +;; Merging forms are handled through SVE_INT_BINARY. + +;; ------------------------------------------------------------------------- +;; ---- [INT] Take address +;; ------------------------------------------------------------------------- +;; Includes: +;; - ADR +;; ------------------------------------------------------------------------- + +;; An unshifted and unscaled ADR. This is functionally equivalent to an ADD, +;; but the svadrb intrinsics should preserve the user's choice. +(define_insn "@aarch64_adr" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w") + (unspec:SVE_FULL_SDI + [(match_operand:SVE_FULL_SDI 1 "register_operand" "w") + (match_operand:SVE_FULL_SDI 2 "register_operand" "w")] + UNSPEC_ADR))] + "TARGET_SVE" + "adr\t%0., [%1., %2.]" +) + +;; Same, but with the offset being sign-extended from the low 32 bits. +(define_insn_and_rewrite "*aarch64_adr_sxtw" + [(set (match_operand:VNx2DI 0 "register_operand" "=w") + (unspec:VNx2DI + [(match_operand:VNx2DI 1 "register_operand" "w") + (unspec:VNx2DI + [(match_operand 3) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 2 "register_operand" "w")))] + UNSPEC_PRED_X)] + UNSPEC_ADR))] + "TARGET_SVE" + "adr\t%0.d, [%1.d, %2.d, sxtw]" + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (VNx2BImode); + } +) + +;; Same, but with the offset being zero-extended from the low 32 bits. 
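+;; Zero extension of the low 32 bits is represented in rtl as an AND with
+;; 0xffffffff (an aarch64_sve_uxtw_immediate), i.e. per element:
+;;
+;;   static inline unsigned long long
+;;   uxtw (unsigned long long x) { return x & 0xffffffffULL; }
+;;
+;; so the offset appears as that AND in both the UNSPEC_ADR form and the
+;; plain PLUS form below.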
+(define_insn "*aarch64_adr_uxtw_unspec" + [(set (match_operand:VNx2DI 0 "register_operand" "=w") + (unspec:VNx2DI + [(match_operand:VNx2DI 1 "register_operand" "w") + (and:VNx2DI + (match_operand:VNx2DI 2 "register_operand" "w") + (match_operand:VNx2DI 3 "aarch64_sve_uxtw_immediate"))] + UNSPEC_ADR))] + "TARGET_SVE" + "adr\t%0.d, [%1.d, %2.d, uxtw]" +) + +;; Same, matching as a PLUS rather than unspec. +(define_insn "*aarch64_adr_uxtw_and" + [(set (match_operand:VNx2DI 0 "register_operand" "=w") + (plus:VNx2DI + (and:VNx2DI + (match_operand:VNx2DI 2 "register_operand" "w") + (match_operand:VNx2DI 3 "aarch64_sve_uxtw_immediate")) + (match_operand:VNx2DI 1 "register_operand" "w")))] + "TARGET_SVE" + "adr\t%0.d, [%1.d, %2.d, uxtw]" +) + +;; ADR with a nonzero shift. +(define_expand "@aarch64_adr_shift" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand") + (plus:SVE_FULL_SDI + (unspec:SVE_FULL_SDI + [(match_dup 4) + (ashift:SVE_FULL_SDI + (match_operand:SVE_FULL_SDI 2 "register_operand") + (match_operand:SVE_FULL_SDI 3 "const_1_to_3_operand"))] + UNSPEC_PRED_X) + (match_operand:SVE_FULL_SDI 1 "register_operand")))] + "TARGET_SVE" + { + operands[4] = CONSTM1_RTX (mode); + } +) + +(define_insn_and_rewrite "*aarch64_adr_shift" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w") + (plus:SVE_FULL_SDI + (unspec:SVE_FULL_SDI + [(match_operand 4) + (ashift:SVE_FULL_SDI + (match_operand:SVE_FULL_SDI 2 "register_operand" "w") + (match_operand:SVE_FULL_SDI 3 "const_1_to_3_operand"))] + UNSPEC_PRED_X) + (match_operand:SVE_FULL_SDI 1 "register_operand" "w")))] + "TARGET_SVE" + "adr\t%0., [%1., %2., lsl %3]" + "&& !CONSTANT_P (operands[4])" + { + operands[4] = CONSTM1_RTX (mode); + } +) + +;; Same, but with the index being sign-extended from the low 32 bits. +(define_insn_and_rewrite "*aarch64_adr_shift_sxtw" + [(set (match_operand:VNx2DI 0 "register_operand" "=w") + (plus:VNx2DI + (unspec:VNx2DI + [(match_operand 4) + (ashift:VNx2DI + (unspec:VNx2DI + [(match_operand 5) + (sign_extend:VNx2DI + (truncate:VNx2SI + (match_operand:VNx2DI 2 "register_operand" "w")))] + UNSPEC_PRED_X) + (match_operand:VNx2DI 3 "const_1_to_3_operand"))] + UNSPEC_PRED_X) + (match_operand:VNx2DI 1 "register_operand" "w")))] + "TARGET_SVE" + "adr\t%0.d, [%1.d, %2.d, sxtw %3]" + "&& (!CONSTANT_P (operands[4]) || !CONSTANT_P (operands[5]))" + { + operands[5] = operands[4] = CONSTM1_RTX (VNx2BImode); + } +) + +;; Same, but with the index being zero-extended from the low 32 bits. +(define_insn_and_rewrite "*aarch64_adr_shift_uxtw" + [(set (match_operand:VNx2DI 0 "register_operand" "=w") + (plus:VNx2DI + (unspec:VNx2DI + [(match_operand 5) + (ashift:VNx2DI + (and:VNx2DI + (match_operand:VNx2DI 2 "register_operand" "w") + (match_operand:VNx2DI 4 "aarch64_sve_uxtw_immediate")) + (match_operand:VNx2DI 3 "const_1_to_3_operand"))] + UNSPEC_PRED_X) + (match_operand:VNx2DI 1 "register_operand" "w")))] + "TARGET_SVE" + "adr\t%0.d, [%1.d, %2.d, uxtw %3]" + "&& !CONSTANT_P (operands[5])" + { + operands[5] = CONSTM1_RTX (VNx2BImode); + } +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Absolute difference +;; ------------------------------------------------------------------------- +;; Includes: +;; - SABD +;; - UABD +;; ------------------------------------------------------------------------- + +;; Unpredicated integer absolute difference. 
+(define_expand "abd_3" + [(use (match_operand:SVE_FULL_I 0 "register_operand")) + (USMAX:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand") + (match_operand:SVE_FULL_I 2 "register_operand"))] + "TARGET_SVE" + { + rtx pred = aarch64_ptrue_reg (mode); + emit_insn (gen_aarch64_pred_abd (operands[0], pred, operands[1], + operands[2])); + DONE; + } +) + +;; Predicated integer absolute difference. +(define_insn "@aarch64_pred_abd" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (minus:SVE_FULL_I + (USMAX:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "%0, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) + (:SVE_FULL_I + (match_dup 2) + (match_dup 3)))] + UNSPEC_PRED_X))] + "TARGET_SVE" + "@ + abd\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;abd\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,yes")] +) + +(define_expand "@aarch64_cond_abd" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (minus:SVE_FULL_I + (unspec:SVE_FULL_I + [(match_dup 1) + (USMAX:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "register_operand"))] + UNSPEC_PRED_X) + (unspec:SVE_FULL_I + [(match_dup 1) + (:SVE_FULL_I + (match_dup 2) + (match_dup 3))] + UNSPEC_PRED_X)) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +{ + if (rtx_equal_p (operands[3], operands[4])) + std::swap (operands[2], operands[3]); +}) + +;; Predicated integer absolute difference, merging with the first input. +(define_insn_and_rewrite "*aarch64_cond_abd_2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (minus:SVE_FULL_I + (unspec:SVE_FULL_I + [(match_operand 4) + (USMAX:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w"))] + UNSPEC_PRED_X) + (unspec:SVE_FULL_I + [(match_operand 5) + (:SVE_FULL_I + (match_dup 2) + (match_dup 3))] + UNSPEC_PRED_X)) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + abd\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;abd\t%0., %1/m, %0., %3." + "&& (!CONSTANT_P (operands[4]) || !CONSTANT_P (operands[5]))" + { + operands[4] = operands[5] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer absolute difference, merging with an independent value. +(define_insn_and_rewrite "*aarch64_cond_abd_any" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (minus:SVE_FULL_I + (unspec:SVE_FULL_I + [(match_operand 5) + (USMAX:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, 0, w, w, w"))] + UNSPEC_PRED_X) + (unspec:SVE_FULL_I + [(match_operand 6) + (:SVE_FULL_I + (match_dup 2) + (match_dup 3))] + UNSPEC_PRED_X)) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4])" + "@ + movprfx\t%0., %1/z, %0.\;abd\t%0., %1/m, %0., %3. + movprfx\t%0., %1/z, %0.\;abd\t%0., %1/m, %0., %2. + movprfx\t%0., %1/z, %2.\;abd\t%0., %1/m, %0., %3. + movprfx\t%0., %1/m, %2.\;abd\t%0., %1/m, %0., %3. 
+ #" + "&& 1" + { + if (!CONSTANT_P (operands[5]) || !CONSTANT_P (operands[6])) + operands[5] = operands[6] = CONSTM1_RTX (mode); + else if (reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + else + FAIL; + } + [(set_attr "movprfx" "yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Saturating addition and subtraction +;; ------------------------------------------------------------------------- +;; - SQADD +;; - SQSUB +;; - UQADD +;; - UQSUB +;; ------------------------------------------------------------------------- + +;; Unpredicated saturating signed addition and subtraction. +(define_insn "@aarch64_" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w, ?&w, w") + (SBINQOPS:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand" "0, 0, w, w, w") + (match_operand:SVE_FULL_I 2 "aarch64_sve_sqadd_operand" "vsQ, vsS, vsQ, vsS, w")))] + "TARGET_SVE" + "@ + \t%0., %0., #%D2 + \t%0., %0., #%N2 + movprfx\t%0, %1\;\t%0., %0., #%D2 + movprfx\t%0, %1\;\t%0., %0., #%N2 + \t%0., %1., %2." + [(set_attr "movprfx" "*,*,yes,yes,*")] +) + +;; Unpredicated saturating unsigned addition and subtraction. +(define_insn "@aarch64_" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, w") + (UBINQOPS:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand" "0, w, w") + (match_operand:SVE_FULL_I 2 "aarch64_sve_arith_operand" "vsa, vsa, w")))] + "TARGET_SVE" + "@ + \t%0., %0., #%D2 + movprfx\t%0, %1\;\t%0., %0., #%D2 + \t%0., %1., %2." + [(set_attr "movprfx" "*,yes,*")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Highpart multiplication +;; ------------------------------------------------------------------------- +;; Includes: +;; - SMULH +;; - UMULH +;; ------------------------------------------------------------------------- + +;; Unpredicated highpart multiplication. +(define_expand "mul3_highpart" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_dup 3) + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 1 "register_operand") + (match_operand:SVE_FULL_I 2 "register_operand")] + MUL_HIGHPART)] + UNSPEC_PRED_X))] + "TARGET_SVE" + { + operands[3] = aarch64_ptrue_reg (mode); + } +) + +;; Predicated highpart multiplication. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand" "%0, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w")] + MUL_HIGHPART)] + UNSPEC_PRED_X))] + "TARGET_SVE" + "@ + mulh\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;mulh\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated highpart multiplications with merging. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "register_operand")] + MUL_HIGHPART) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +{ + /* Only target code is aware of these operations, so we don't need + to handle the fully-general case. 
*/ + gcc_assert (rtx_equal_p (operands[2], operands[4]) + || CONSTANT_P (operands[4])); +}) + +;; Predicated highpart multiplications, merging with the first input. +(define_insn "*cond__2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w")] + MUL_HIGHPART) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %3. + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,yes")]) + +;; Predicated highpart multiplications, merging with zero. +(define_insn "*cond__z" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand" "%0, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w")] + MUL_HIGHPART) + (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_zero")] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "yes")]) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Division +;; ------------------------------------------------------------------------- +;; Includes: +;; - SDIV +;; - SDIVR +;; - UDIV +;; - UDIVR +;; ------------------------------------------------------------------------- + +;; Unpredicated integer division. +(define_expand "3" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand") + (unspec:SVE_FULL_SDI + [(match_dup 3) + (SVE_INT_BINARY_SD:SVE_FULL_SDI + (match_operand:SVE_FULL_SDI 1 "register_operand") + (match_operand:SVE_FULL_SDI 2 "register_operand"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + { + operands[3] = aarch64_ptrue_reg (mode); + } +) + +;; Integer division predicated with a PTRUE. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, w, ?&w") + (unspec:SVE_FULL_SDI + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (SVE_INT_BINARY_SD:SVE_FULL_SDI + (match_operand:SVE_FULL_SDI 2 "register_operand" "0, w, w") + (match_operand:SVE_FULL_SDI 3 "register_operand" "w, 0, w"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %3. + r\t%0., %1/m, %0., %2. + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,*,yes")] +) + +;; Predicated integer division with merging. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand") + (unspec:SVE_FULL_SDI + [(match_operand: 1 "register_operand") + (SVE_INT_BINARY_SD:SVE_FULL_SDI + (match_operand:SVE_FULL_SDI 2 "register_operand") + (match_operand:SVE_FULL_SDI 3 "register_operand")) + (match_operand:SVE_FULL_SDI 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated integer division, merging with the first input. +(define_insn "*cond__2" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_SDI + [(match_operand: 1 "register_operand" "Upl, Upl") + (SVE_INT_BINARY_SD:SVE_FULL_SDI + (match_operand:SVE_FULL_SDI 2 "register_operand" "0, w") + (match_operand:SVE_FULL_SDI 3 "register_operand" "w, w")) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %3. + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer division, merging with the second input. 
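+;; The reversed forms (SDIVR/UDIVR) keep the divisor in the destination
+;; register and compute dividend / divisor, which is what lets the pattern
+;; below tie the destination to the second (divisor) input.  Per element
+;; the reversed operation is simply:
+;;
+;;   static inline int sdivr (int acc, int x) { return x / acc; }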
+(define_insn "*cond__3" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_SDI + [(match_operand: 1 "register_operand" "Upl, Upl") + (SVE_INT_BINARY_SD:SVE_FULL_SDI + (match_operand:SVE_FULL_SDI 2 "register_operand" "w, w") + (match_operand:SVE_FULL_SDI 3 "register_operand" "0, w")) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %2. + movprfx\t%0, %3\;\t%0., %1/m, %0., %2." + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer division, merging with an independent value. +(define_insn_and_rewrite "*cond__any" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_SDI + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (SVE_INT_BINARY_SD:SVE_FULL_SDI + (match_operand:SVE_FULL_SDI 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_SDI 3 "register_operand" "w, 0, w, w, w")) + (match_operand:SVE_FULL_SDI 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4])" + "@ + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %2. + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., %3. + #" + "&& reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Binary logical operations +;; ------------------------------------------------------------------------- +;; Includes: +;; - AND +;; - EOR +;; - ORR +;; ------------------------------------------------------------------------- + +;; Unpredicated integer binary logical operations. +(define_insn "3" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?w, w") + (LOGICAL:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand" "%0, w, w") + (match_operand:SVE_FULL_I 2 "aarch64_sve_logical_operand" "vsl, vsl, w")))] + "TARGET_SVE" + "@ + \t%0., %0., #%C2 + movprfx\t%0, %1\;\t%0., %0., #%C2 + \t%0.d, %1.d, %2.d" + [(set_attr "movprfx" "*,yes,*")] +) + +;; Merging forms are handled through SVE_INT_BINARY. + +;; ------------------------------------------------------------------------- +;; ---- [INT] Binary logical operations (inverted second input) +;; ------------------------------------------------------------------------- +;; Includes: +;; - BIC +;; ------------------------------------------------------------------------- + +;; Unpredicated BIC. +(define_expand "@aarch64_bic" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (and:SVE_FULL_I + (unspec:SVE_FULL_I + [(match_dup 3) + (not:SVE_FULL_I (match_operand:SVE_FULL_I 2 "register_operand"))] + UNSPEC_PRED_X) + (match_operand:SVE_FULL_I 1 "register_operand")))] + "TARGET_SVE" + { + operands[3] = CONSTM1_RTX (mode); + } +) + +;; Predicated BIC. 
+(define_insn_and_rewrite "*bic3" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") + (and:SVE_FULL_I + (unspec:SVE_FULL_I + [(match_operand 3) + (not:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w"))] + UNSPEC_PRED_X) + (match_operand:SVE_FULL_I 1 "register_operand" "w")))] + "TARGET_SVE" + "bic\t%0.d, %1.d, %2.d" + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (mode); + } +) + +;; Predicated BIC with merging. +(define_expand "@cond_bic" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (and:SVE_FULL_I + (not:SVE_FULL_I (match_operand:SVE_FULL_I 3 "register_operand")) + (match_operand:SVE_FULL_I 2 "register_operand")) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated integer BIC, merging with the first input. +(define_insn "*cond_bic_2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (and:SVE_FULL_I + (not:SVE_FULL_I + (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) + (match_operand:SVE_FULL_I 2 "register_operand" "0, w")) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + bic\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;bic\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer BIC, merging with an independent value. +(define_insn_and_rewrite "*cond_bic_any" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") + (and:SVE_FULL_I + (not:SVE_FULL_I + (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w, w")) + (match_operand:SVE_FULL_I 2 "register_operand" "0, w, w, w")) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0., %1/z, %0.\;bic\t%0., %1/m, %0., %3. + movprfx\t%0., %1/z, %2.\;bic\t%0., %1/m, %0., %3. + movprfx\t%0., %1/m, %2.\;bic\t%0., %1/m, %0., %3. + #" + "&& reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Shifts (rounding towards -Inf) +;; ------------------------------------------------------------------------- +;; Includes: +;; - ASR +;; - ASRR +;; - LSL +;; - LSLR +;; - LSR +;; - LSRR +;; ------------------------------------------------------------------------- + +;; Unpredicated shift by a scalar, which expands into one of the vector +;; shifts below. +(define_expand "3" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (ASHIFT:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand") + (match_operand: 2 "general_operand")))] + "TARGET_SVE" + { + rtx amount; + if (CONST_INT_P (operands[2])) + { + amount = gen_const_vec_duplicate (mode, operands[2]); + if (!aarch64_sve_shift_operand (operands[2], mode)) + amount = force_reg (mode, amount); + } + else + { + amount = gen_reg_rtx (mode); + emit_insn (gen_vec_duplicate (amount, + convert_to_mode (mode, + operands[2], 0))); + } + emit_insn (gen_v3 (operands[0], operands[1], amount)); + DONE; + } +) + +;; Unpredicated shift by a vector. 
+(define_expand "v3" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_dup 3) + (ASHIFT:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand") + (match_operand:SVE_FULL_I 2 "aarch64_sve_shift_operand"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + { + operands[3] = aarch64_ptrue_reg (mode); + } +) + +;; Shift by a vector, predicated with a PTRUE. We don't actually need +;; the predicate for the first alternative, but using Upa or X isn't +;; likely to gain much and would make the instruction seem less uniform +;; to the register allocator. +(define_insn_and_split "@aarch64_pred_" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") + (ASHIFT:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, 0, w, w") + (match_operand:SVE_FULL_I 3 "aarch64_sve_shift_operand" "D, w, 0, w"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + "@ + # + \t%0., %1/m, %0., %3. + r\t%0., %1/m, %3., %2. + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + "&& reload_completed + && !register_operand (operands[3], mode)" + [(set (match_dup 0) (ASHIFT:SVE_FULL_I (match_dup 2) (match_dup 3)))] + "" + [(set_attr "movprfx" "*,*,*,yes")] +) + +;; Unpredicated shift operations by a constant (post-RA only). +;; These are generated by splitting a predicated instruction whose +;; predicate is unused. +(define_insn "*post_ra_v3" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") + (ASHIFT:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand" "w") + (match_operand:SVE_FULL_I 2 "aarch64_simd_shift_imm")))] + "TARGET_SVE && reload_completed" + "\t%0., %1., #%2" +) + +;; Predicated integer shift, merging with the first input. +(define_insn "*cond__2_const" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (ASHIFT:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "aarch64_simd_shift_imm")) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., #%3 + movprfx\t%0, %2\;\t%0., %1/m, %0., #%3" + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated integer shift, merging with an independent value. +(define_insn_and_rewrite "*cond__any_const" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, &w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (ASHIFT:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w") + (match_operand:SVE_FULL_I 3 "aarch64_simd_shift_imm")) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" + "@ + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., #%3 + movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., #%3 + #" + "&& reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])" + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + [(set_attr "movprfx" "yes")] +) + +;; Unpredicated shifts of narrow elements by 64-bit amounts. 
+(define_insn "@aarch64_sve_" + [(set (match_operand:SVE_FULL_BHSI 0 "register_operand" "=w") + (unspec:SVE_FULL_BHSI + [(match_operand:SVE_FULL_BHSI 1 "register_operand" "w") + (match_operand:VNx2DI 2 "register_operand" "w")] + SVE_SHIFT_WIDE))] + "TARGET_SVE" + "\t%0., %1., %2.d" +) + +;; Merging predicated shifts of narrow elements by 64-bit amounts. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_BHSI 0 "register_operand") + (unspec:SVE_FULL_BHSI + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_BHSI + [(match_operand:SVE_FULL_BHSI 2 "register_operand") + (match_operand:VNx2DI 3 "register_operand")] + SVE_SHIFT_WIDE) + (match_operand:SVE_FULL_BHSI 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated shifts of narrow elements by 64-bit amounts, merging with +;; the first input. +(define_insn "*cond__m" + [(set (match_operand:SVE_FULL_BHSI 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_BHSI + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_BHSI + [(match_operand:SVE_FULL_BHSI 2 "register_operand" "0, w") + (match_operand:VNx2DI 3 "register_operand" "w, w")] + SVE_SHIFT_WIDE) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %3.d + movprfx\t%0, %2\;\t%0., %1/m, %0., %3.d" + [(set_attr "movprfx" "*, yes")]) + +;; Predicated shifts of narrow elements by 64-bit amounts, merging with zero. +(define_insn "*cond__z" + [(set (match_operand:SVE_FULL_BHSI 0 "register_operand" "=&w, &w") + (unspec:SVE_FULL_BHSI + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_BHSI + [(match_operand:SVE_FULL_BHSI 2 "register_operand" "0, w") + (match_operand:VNx2DI 3 "register_operand" "w, w")] + SVE_SHIFT_WIDE) + (match_operand:SVE_FULL_BHSI 4 "aarch64_simd_imm_zero")] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3.d + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3.d" + [(set_attr "movprfx" "yes")]) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Shifts (rounding towards 0) +;; ------------------------------------------------------------------------- +;; Includes: +;; - ASRD +;; ------------------------------------------------------------------------- + +;; Unpredicated ASRD. +(define_expand "sdiv_pow23" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_dup 3) + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 1 "register_operand") + (match_operand 2 "aarch64_simd_rshift_imm")] + UNSPEC_ASRD) + (match_dup 1)] + UNSPEC_SEL))] + "TARGET_SVE" + { + operands[3] = aarch64_ptrue_reg (mode); + } +) + +;; Predicated ASRD with merging. +(define_expand "@cond_asrd" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")] + UNSPEC_ASRD) + (match_operand:SVE_FULL_I 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated ASRD, merging with the first input. 
+(define_insn "*cond_asrd_2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")] + UNSPEC_ASRD) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE" + "@ + asrd\t%0., %1/m, %0., #%3 + movprfx\t%0, %2\;asrd\t%0., %1/m, %0., #%3" + [(set_attr "movprfx" "*,yes")]) + +;; Predicated ASRD, merging with zero. +(define_insn "*cond_asrd_z" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand" "w") + (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")] + UNSPEC_ASRD) + (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_zero")] + UNSPEC_SEL))] + "TARGET_SVE" + "movprfx\t%0., %1/z, %2.\;asrd\t%0., %1/m, %0., #%3" + [(set_attr "movprfx" "yes")]) + +;; ------------------------------------------------------------------------- +;; ---- [FP<-INT] General binary arithmetic corresponding to unspecs +;; ------------------------------------------------------------------------- +;; Includes: +;; - FSCALE +;; - FTSMUL +;; - FTSSEL +;; ------------------------------------------------------------------------- + +;; Unpredicated floating-point binary operations that take an integer as +;; their second operand. +(define_insn "@aarch64_sve_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 1 "register_operand" "w") + (match_operand: 2 "register_operand" "w")] + SVE_FP_BINARY_INT))] + "TARGET_SVE" + "\t%0., %1., %2." +) + +;; Predicated floating-point binary operations that take an integer +;; as their second operand. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand: 3 "register_operand" "w, w")] + SVE_COND_FP_BINARY_INT))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %3. + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated floating-point binary operations with merging, taking an +;; integer as their second operand. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand") + (match_operand: 3 "register_operand")] + SVE_COND_FP_BINARY_INT) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated floating-point binary operations that take an integer as their +;; second operand, with inactive lanes coming from the first operand. +(define_insn_and_rewrite "*cond__2" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand: 3 "register_operand" "w, w")] + SVE_COND_FP_BINARY_INT) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + \t%0., %1/m, %0., %3. 
+ movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated floating-point binary operations that take an integer as +;; their second operand, with the values of inactive lanes being distinct +;; from the other inputs. +(define_insn_and_rewrite "*cond__any" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w") + (match_operand: 3 "register_operand" "w, w, w, w")] + SVE_COND_FP_BINARY_INT) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., %3. + #" + "&& 1" + { + if (reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[5])) + operands[5] = copy_rtx (operands[1]); + else + FAIL; + } + [(set_attr "movprfx" "yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [FP] General binary arithmetic corresponding to rtx codes +;; ------------------------------------------------------------------------- +;; Includes post-RA forms of: +;; - FADD +;; - FMUL +;; - FSUB +;; ------------------------------------------------------------------------- + +;; Unpredicated floating-point binary operations (post-RA only). +;; These are generated by splitting a predicated instruction whose +;; predicate is unused. +(define_insn "*post_ra_3" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (SVE_UNPRED_FP_BINARY:SVE_FULL_F + (match_operand:SVE_FULL_F 1 "register_operand" "w") + (match_operand:SVE_FULL_F 2 "register_operand" "w")))] + "TARGET_SVE && reload_completed" + "\t%0., %1., %2.") + +;; ------------------------------------------------------------------------- +;; ---- [FP] General binary arithmetic corresponding to unspecs +;; ------------------------------------------------------------------------- +;; Includes merging forms of: +;; - FADD (constant forms handled in the "Addition" section) +;; - FDIV +;; - FDIVR +;; - FMAX +;; - FMAXNM (including #0.0 and #1.0) +;; - FMIN +;; - FMINNM (including #0.0 and #1.0) +;; - FMUL (including #0.5 and #2.0) +;; - FMULX +;; - FRECPS +;; - FRSQRTS +;; - FSUB (constant forms handled in the "Addition" section) +;; - FSUBR (constant forms handled in the "Subtraction" section) +;; ------------------------------------------------------------------------- + +;; Unpredicated floating-point binary operations. +(define_insn "@aarch64_sve_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 1 "register_operand" "w") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + SVE_FP_BINARY))] + "TARGET_SVE" + "\t%0., %1., %2." +) + +;; Unpredicated floating-point binary operations that need to be predicated +;; for SVE. 
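The "*cond_..._any" patterns in this file share one post-reload rewrite, visible in the C fragment above: when the fallback value (operand 4) turns out to be a register distinct from the destination, the pattern first emits a vector select that moves the fallback into the destination and then degenerates into the ordinary merging form. A rough scalar model of that two-step lowering, with invented names and not part of the patch:

  #include <stdint.h>

  typedef int32_t (*binop) (int32_t, int32_t);

  /* "dst = pred ? f (a, b) : other", with "other" in its own register,
     becomes a select followed by the merging form of the operation.  */
  static void cond_any_lowered (int32_t *dst, const int *pred,
                                const int32_t *a, const int32_t *b,
                                const int32_t *other, int lanes, binop f)
  {
    for (int i = 0; i < lanes; i++)
      dst[i] = pred[i] ? a[i] : other[i];             /* vcond_mask step */
    for (int i = 0; i < lanes; i++)
      dst[i] = pred[i] ? f (dst[i], b[i]) : dst[i];   /* merging operation */
  }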
+(define_expand "3" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 3) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 1 "") + (match_operand:SVE_FULL_F 2 "")] + SVE_COND_FP_BINARY))] + "TARGET_SVE" + { + operands[3] = aarch64_ptrue_reg (mode); + } +) + +;; Predicated floating-point binary operations that have no immediate forms. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w")] + SVE_COND_FP_BINARY_REG))] + "TARGET_SVE" + "@ + \t%0., %1/m, %0., %3. + \t%0., %1/m, %0., %2. + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,*,yes")] +) + +;; Predicated floating-point operations with merging. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "") + (match_operand:SVE_FULL_F 3 "")] + SVE_COND_FP_BINARY) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated floating-point operations, merging with the first input. +(define_insn_and_rewrite "*cond__2" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + SVE_COND_FP_BINARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + \t%0., %1/m, %0., %3. + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Same for operations that take a 1-bit constant. +(define_insn_and_rewrite "*cond__2_const" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "")] + SVE_COND_FP_BINARY_I1) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + \t%0., %1/m, %0., #%3 + movprfx\t%0, %2\;\t%0., %1/m, %0., #%3" + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated floating-point operations, merging with the second input. +(define_insn_and_rewrite "*cond__3" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + SVE_COND_FP_BINARY) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + \t%0., %1/m, %0., %2. 
+ movprfx\t%0, %3\;\t%0., %1/m, %0., %2." + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated floating-point operations, merging with an independent value. +(define_insn_and_rewrite "*cond__any" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] + SVE_COND_FP_BINARY) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4]) + && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %0., %2. + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3. + movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., %3. + #" + "&& 1" + { + if (reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[5])) + operands[5] = copy_rtx (operands[1]); + else + FAIL; + } + [(set_attr "movprfx" "yes")] +) + +;; Same for operations that take a 1-bit constant. +(define_insn_and_rewrite "*cond__any_const" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w") + (match_operand:SVE_FULL_F 3 "")] + SVE_COND_FP_BINARY_I1) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ + movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., #%3 + movprfx\t%0., %1/m, %2.\;\t%0., %1/m, %0., #%3 + #" + "&& 1" + { + if (reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[5])) + operands[5] = copy_rtx (operands[1]); + else + FAIL; + } + [(set_attr "movprfx" "yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [FP] Addition +;; ------------------------------------------------------------------------- +;; Includes: +;; - FADD +;; - FSUB +;; ------------------------------------------------------------------------- + +;; Predicated floating-point addition. 
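The SI operand matched by aarch64_sve_gp_strictness in these conditional floating-point unspecs records, as I read it, whether the governing predicate is relaxed (the operation may also be executed on inactive lanes, since those results are discarded) or strict (only the active lanes may be touched, so that trapping-math code does not see spurious exceptions). A hedged scalar illustration of why the distinction only matters for floating-point side effects, with invented names:

  /* Relaxed: compute on every lane and select afterwards; harmless as
     long as nobody observes FP exceptions raised in inactive lanes.  */
  static void fdiv_relaxed (double *dst, const int *pred,
                            const double *a, const double *b, int lanes)
  {
    for (int i = 0; i < lanes; i++)
      {
        double q = a[i] / b[i];   /* may run on lanes the predicate excludes */
        if (pred[i])
          dst[i] = q;
      }
  }

  /* Strict: only the active lanes are ever computed.  */
  static void fdiv_strict (double *dst, const int *pred,
                           const double *a, const double *b, int lanes)
  {
    for (int i = 0; i < lanes; i++)
      if (pred[i])
        dst[i] = a[i] / b[i];
  }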
+(define_insn_and_split "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?&w, ?&w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness" "i, i, Z, Ui1, i, i, Ui1") + (match_operand:SVE_FULL_F 2 "register_operand" "%0, 0, w, 0, w, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_operand" "vsA, vsN, w, w, vsA, vsN, w")] + SVE_COND_FP_ADD))] + "TARGET_SVE" + "@ + fadd\t%0., %1/m, %0., #%3 + fsub\t%0., %1/m, %0., #%N3 + # + fadd\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;fadd\t%0., %1/m, %0., #%3 + movprfx\t%0, %2\;fsub\t%0., %1/m, %0., #%N3 + movprfx\t%0, %2\;fadd\t%0., %1/m, %0., %3." + ; Split the unpredicated form after reload, so that we don't have + ; the unnecessary PTRUE. + "&& reload_completed + && register_operand (operands[3], mode) + && INTVAL (operands[4]) == SVE_RELAXED_GP" + [(set (match_dup 0) (plus:SVE_FULL_F (match_dup 2) (match_dup 3)))] + "" + [(set_attr "movprfx" "*,*,*,*,yes,yes,yes")] +) + +;; Predicated floating-point addition of a constant, merging with the +;; first input. +(define_insn_and_rewrite "*cond_add_2_const" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w, ?w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, 0, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN")] + UNSPEC_COND_FADD) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + fadd\t%0., %1/m, %0., #%3 + fsub\t%0., %1/m, %0., #%N3 + movprfx\t%0, %2\;fadd\t%0., %1/m, %0., #%3 + movprfx\t%0, %2\;fsub\t%0., %1/m, %0., #%N3" + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,*,yes,yes")] +) + +;; Predicated floating-point addition of a constant, merging with an +;; independent value. 
+(define_insn_and_rewrite "*cond_add_any_const" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?w, ?w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate" "vsA, vsN, vsA, vsN, vsA, vsN")] + UNSPEC_COND_FADD) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, 0, w, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ + movprfx\t%0., %1/z, %2.\;fadd\t%0., %1/m, %0., #%3 + movprfx\t%0., %1/z, %2.\;fsub\t%0., %1/m, %0., #%N3 + movprfx\t%0., %1/m, %2.\;fadd\t%0., %1/m, %0., #%3 + movprfx\t%0., %1/m, %2.\;fsub\t%0., %1/m, %0., #%N3 + # + #" + "&& 1" + { + if (reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[5])) + operands[5] = copy_rtx (operands[1]); + else + FAIL; + } + [(set_attr "movprfx" "yes")] +) + +;; Register merging forms are handled through SVE_COND_FP_BINARY. + +;; ------------------------------------------------------------------------- +;; ---- [FP] Complex addition +;; ------------------------------------------------------------------------- +;; Includes: +;; - FCADD +;; ------------------------------------------------------------------------- + +;; Predicated FCADD. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + SVE_COND_FCADD))] + "TARGET_SVE" + "@ + fcadd\t%0., %1/m, %0., %3., # + movprfx\t%0, %2\;fcadd\t%0., %1/m, %0., %3., #" + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated FCADD with merging. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand") + (match_operand:SVE_FULL_F 3 "register_operand")] + SVE_COND_FCADD) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" +) + +;; Predicated FCADD, merging with the first input. +(define_insn_and_rewrite "*cond__2" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + SVE_COND_FCADD) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + fcadd\t%0., %1/m, %0., %3., # + movprfx\t%0, %2\;fcadd\t%0., %1/m, %0., %3., #" + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated FCADD, merging with an independent value. 
+(define_insn_and_rewrite "*cond__any" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, 0, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w")] + SVE_COND_FCADD) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ + movprfx\t%0., %1/z, %2.\;fcadd\t%0., %1/m, %0., %3., # + movprfx\t%0., %1/z, %0.\;fcadd\t%0., %1/m, %0., %3., # + movprfx\t%0., %1/m, %2.\;fcadd\t%0., %1/m, %0., %3., # + #" + "&& 1" + { + if (reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[2], + operands[4], operands[1])); + operands[4] = operands[2] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[5])) + operands[5] = copy_rtx (operands[1]); + else + FAIL; + } + [(set_attr "movprfx" "yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [FP] Subtraction +;; ------------------------------------------------------------------------- +;; Includes: +;; - FSUB +;; - FSUBR +;; ------------------------------------------------------------------------- + +;; Predicated floating-point subtraction. +(define_insn_and_split "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, w, ?&w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness" "i, Z, Ui1, Ui1, i, Ui1") + (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_operand" "vsA, w, 0, w, vsA, w") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w, w, 0, w, w")] + SVE_COND_FP_SUB))] + "TARGET_SVE" + "@ + fsubr\t%0., %1/m, %0., #%2 + # + fsub\t%0., %1/m, %0., %3. + fsubr\t%0., %1/m, %0., %2. + movprfx\t%0, %3\;fsubr\t%0., %1/m, %0., #%2 + movprfx\t%0, %2\;fsub\t%0., %1/m, %0., %3." + ; Split the unpredicated form after reload, so that we don't have + ; the unnecessary PTRUE. + "&& reload_completed + && register_operand (operands[2], mode) + && INTVAL (operands[4]) == SVE_RELAXED_GP" + [(set (match_dup 0) (minus:SVE_FULL_F (match_dup 2) (match_dup 3)))] + "" + [(set_attr "movprfx" "*,*,*,*,yes,yes")] +) + +;; Predicated floating-point subtraction from a constant, merging with the +;; second input. +(define_insn_and_rewrite "*cond_sub_3_const" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + UNSPEC_COND_FSUB) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + fsubr\t%0., %1/m, %0., #%2 + movprfx\t%0, %3\;fsubr\t%0., %1/m, %0., #%2" + "&& !rtx_equal_p (operands[1], operands[4])" + { + operands[4] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated floating-point subtraction from a constant, merging with an +;; independent value. 
+(define_insn_and_rewrite "*cond_sub_any_const" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w")] + UNSPEC_COND_FSUB) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[3], operands[4]) + && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ + movprfx\t%0., %1/z, %3.\;fsubr\t%0., %1/m, %0., #%2 + movprfx\t%0., %1/m, %3.\;fsubr\t%0., %1/m, %0., #%2 + #" + "&& 1" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + if (reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[3], + operands[4], operands[1])); + operands[4] = operands[3] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[5])) + operands[5] = copy_rtx (operands[1]); + else + FAIL; } + [(set_attr "movprfx" "yes")] ) -;; Predicated ST[234]. -(define_insn "vec_mask_store_lanes" - [(set (match_operand:SVE_STRUCT 0 "memory_operand" "+m") - (unspec:SVE_STRUCT - [(match_operand: 2 "register_operand" "Upl") - (match_operand:SVE_STRUCT 1 "register_operand" "w") - (match_dup 0)] - UNSPEC_STN))] +;; Register merging forms are handled through SVE_COND_FP_BINARY. + +;; ------------------------------------------------------------------------- +;; ---- [FP] Absolute difference +;; ------------------------------------------------------------------------- +;; Includes: +;; - FABD +;; ------------------------------------------------------------------------- + +;; Predicated floating-point absolute difference. +(define_expand "@aarch64_pred_abd" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_dup 4) + (match_operand:SVE_FULL_F 2 "register_operand") + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS))] "TARGET_SVE" - "st\t%1, %2, %0" ) -(define_expand "vec_perm" - [(match_operand:SVE_ALL 0 "register_operand") - (match_operand:SVE_ALL 1 "register_operand") - (match_operand:SVE_ALL 2 "register_operand") - (match_operand: 3 "aarch64_sve_vec_perm_operand")] - "TARGET_SVE && GET_MODE_NUNITS (mode).is_constant ()" +;; Predicated floating-point absolute difference. +(define_insn_and_rewrite "*aarch64_pred_abd" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "%0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ + fabd\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;fabd\t%0., %1/m, %0., %3." 
+ "&& !rtx_equal_p (operands[1], operands[5])" { - aarch64_expand_sve_vec_perm (operands[0], operands[1], - operands[2], operands[3]); - DONE; + operands[5] = copy_rtx (operands[1]); } + [(set_attr "movprfx" "*,yes")] ) -(define_insn "*aarch64_sve_tbl" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (unspec:SVE_ALL - [(match_operand:SVE_ALL 1 "register_operand" "w") - (match_operand: 2 "register_operand" "w")] - UNSPEC_TBL))] +(define_expand "@aarch64_cond_abd" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand") + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] "TARGET_SVE" - "tbl\t%0., %1., %2." +{ + if (rtx_equal_p (operands[3], operands[4])) + std::swap (operands[2], operands[3]); +}) + +;; Predicated floating-point absolute difference, merging with the first +;; input. +(define_insn_and_rewrite "*aarch64_cond_abd_2" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 6) + (match_operand:SI 7 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE + && aarch64_sve_pred_dominates_p (&operands[4], operands[1]) + && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "@ + fabd\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;fabd\t%0., %1/m, %0., %3." + "&& (!rtx_equal_p (operands[1], operands[4]) + || !rtx_equal_p (operands[1], operands[6]))" + { + operands[4] = copy_rtx (operands[1]); + operands[6] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] ) -(define_insn "*aarch64_sve_" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (unspec:PRED_ALL [(match_operand:PRED_ALL 1 "register_operand" "Upa") - (match_operand:PRED_ALL 2 "register_operand" "Upa")] - PERMUTE))] - "TARGET_SVE" - "\t%0., %1., %2." +;; Predicated floating-point absolute difference, merging with the second +;; input. +(define_insn_and_rewrite "*aarch64_cond_abd_3" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 6) + (match_operand:SI 7 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "0, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_dup 3)] + UNSPEC_SEL))] + "TARGET_SVE + && aarch64_sve_pred_dominates_p (&operands[4], operands[1]) + && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "@ + fabd\t%0., %1/m, %0., %2. + movprfx\t%0, %3\;fabd\t%0., %1/m, %0., %2." 
+ "&& (!rtx_equal_p (operands[1], operands[4]) + || !rtx_equal_p (operands[1], operands[6]))" + { + operands[4] = copy_rtx (operands[1]); + operands[6] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] +) + +;; Predicated floating-point absolute difference, merging with an +;; independent value. +(define_insn_and_rewrite "*aarch64_cond_abd_any" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 7) + (match_operand:SI 8 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, 0, w, w, w")] + UNSPEC_COND_FSUB)] + UNSPEC_COND_FABS) + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[4]) + && !rtx_equal_p (operands[3], operands[4]) + && aarch64_sve_pred_dominates_p (&operands[5], operands[1]) + && aarch64_sve_pred_dominates_p (&operands[7], operands[1])" + "@ + movprfx\t%0., %1/z, %0.\;fabd\t%0., %1/m, %0., %3. + movprfx\t%0., %1/z, %0.\;fabd\t%0., %1/m, %0., %2. + movprfx\t%0., %1/z, %2.\;fabd\t%0., %1/m, %0., %3. + movprfx\t%0., %1/m, %2.\;fabd\t%0., %1/m, %0., %3. + #" + "&& 1" + { + if (reload_completed + && register_operand (operands[4], mode) + && !rtx_equal_p (operands[0], operands[4])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[3], + operands[4], operands[1])); + operands[4] = operands[3] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[5]) + || !rtx_equal_p (operands[1], operands[7])) + { + operands[5] = copy_rtx (operands[1]); + operands[7] = copy_rtx (operands[1]); + } + else + FAIL; + } + [(set_attr "movprfx" "yes")] ) -(define_insn "aarch64_sve_" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w") - (match_operand:SVE_ALL 2 "register_operand" "w")] - PERMUTE))] +;; ------------------------------------------------------------------------- +;; ---- [FP] Multiplication +;; ------------------------------------------------------------------------- +;; Includes: +;; - FMUL +;; ------------------------------------------------------------------------- + +;; Predicated floating-point multiplication. +(define_insn_and_split "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, w, ?&w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness" "i, Z, Ui1, i, Ui1") + (match_operand:SVE_FULL_F 2 "register_operand" "%0, w, 0, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_mul_operand" "vsM, w, w, vsM, w")] + SVE_COND_FP_MUL))] "TARGET_SVE" - "\t%0., %1., %2." + "@ + fmul\t%0., %1/m, %0., #%3 + # + fmul\t%0., %1/m, %0., %3. + movprfx\t%0, %2\;fmul\t%0., %1/m, %0., #%3 + movprfx\t%0, %2\;fmul\t%0., %1/m, %0., %3." + ; Split the unpredicated form after reload, so that we don't have + ; the unnecessary PTRUE. 
+ "&& reload_completed + && register_operand (operands[3], mode) + && INTVAL (operands[4]) == SVE_RELAXED_GP" + [(set (match_dup 0) (mult:SVE_FULL_F (match_dup 2) (match_dup 3)))] + "" + [(set_attr "movprfx" "*,*,*,yes,yes")] ) -(define_insn "*aarch64_sve_rev64" - [(set (match_operand:SVE_BHS 0 "register_operand" "=w") - (unspec:SVE_BHS - [(match_operand:VNx2BI 1 "register_operand" "Upl") - (unspec:SVE_BHS [(match_operand:SVE_BHS 2 "register_operand" "w")] - UNSPEC_REV64)] - UNSPEC_MERGE_PTRUE))] +;; Merging forms are handled through SVE_COND_FP_BINARY and +;; SVE_COND_FP_BINARY_I1. + +;; Unpredicated multiplication by selected lanes. +(define_insn "@aarch64_mul_lane_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (mult:SVE_FULL_F + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 2 "register_operand" "") + (match_operand:SI 3 "const_int_operand")] + UNSPEC_SVE_LANE_SELECT) + (match_operand:SVE_FULL_F 1 "register_operand" "w")))] "TARGET_SVE" - "rev\t%0.d, %1/m, %2.d" + "fmul\t%0., %1., %2.[%3]" ) -(define_insn "*aarch64_sve_rev32" - [(set (match_operand:SVE_BH 0 "register_operand" "=w") - (unspec:SVE_BH - [(match_operand:VNx4BI 1 "register_operand" "Upl") - (unspec:SVE_BH [(match_operand:SVE_BH 2 "register_operand" "w")] - UNSPEC_REV32)] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [FP] Binary logical operations +;; ------------------------------------------------------------------------- +;; Includes +;; - AND +;; - EOR +;; - ORR +;; ------------------------------------------------------------------------- + +;; Binary logical operations on floating-point modes. We avoid subregs +;; by providing this, but we need to use UNSPECs since rtx logical ops +;; aren't defined for floating-point modes. +(define_insn "*3" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 1 "register_operand" "w") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + LOGICALF))] "TARGET_SVE" - "rev\t%0.s, %1/m, %2.s" + "\t%0.d, %1.d, %2.d" ) -(define_insn "*aarch64_sve_rev16vnx16qi" - [(set (match_operand:VNx16QI 0 "register_operand" "=w") - (unspec:VNx16QI - [(match_operand:VNx8BI 1 "register_operand" "Upl") - (unspec:VNx16QI [(match_operand:VNx16QI 2 "register_operand" "w")] - UNSPEC_REV16)] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [FP] Sign copying +;; ------------------------------------------------------------------------- +;; The patterns in this section are synthetic. 
+;; ------------------------------------------------------------------------- + +(define_expand "copysign3" + [(match_operand:SVE_FULL_F 0 "register_operand") + (match_operand:SVE_FULL_F 1 "register_operand") + (match_operand:SVE_FULL_F 2 "register_operand")] "TARGET_SVE" - "revb\t%0.h, %1/m, %2.h" + { + rtx sign = gen_reg_rtx (mode); + rtx mant = gen_reg_rtx (mode); + rtx int_res = gen_reg_rtx (mode); + int bits = GET_MODE_UNIT_BITSIZE (mode) - 1; + + rtx arg1 = lowpart_subreg (mode, operands[1], mode); + rtx arg2 = lowpart_subreg (mode, operands[2], mode); + + emit_insn (gen_and3 + (sign, arg2, + aarch64_simd_gen_const_vector_dup (mode, + HOST_WIDE_INT_M1U + << bits))); + emit_insn (gen_and3 + (mant, arg1, + aarch64_simd_gen_const_vector_dup (mode, + ~(HOST_WIDE_INT_M1U + << bits)))); + emit_insn (gen_ior3 (int_res, sign, mant)); + emit_move_insn (operands[0], gen_lowpart (mode, int_res)); + DONE; + } ) -(define_insn "*aarch64_sve_rev" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w")] - UNSPEC_REV))] +(define_expand "xorsign3" + [(match_operand:SVE_FULL_F 0 "register_operand") + (match_operand:SVE_FULL_F 1 "register_operand") + (match_operand:SVE_FULL_F 2 "register_operand")] "TARGET_SVE" - "rev\t%0., %1.") + { + rtx sign = gen_reg_rtx (mode); + rtx int_res = gen_reg_rtx (mode); + int bits = GET_MODE_UNIT_BITSIZE (mode) - 1; -(define_insn "*aarch64_sve_dup_lane" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (vec_duplicate:SVE_ALL - (vec_select: - (match_operand:SVE_ALL 1 "register_operand" "w") - (parallel [(match_operand:SI 2 "const_int_operand")]))))] - "TARGET_SVE - && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 0, 63)" - "dup\t%0., %1.[%2]" + rtx arg1 = lowpart_subreg (mode, operands[1], mode); + rtx arg2 = lowpart_subreg (mode, operands[2], mode); + + emit_insn (gen_and3 + (sign, arg2, + aarch64_simd_gen_const_vector_dup (mode, + HOST_WIDE_INT_M1U + << bits))); + emit_insn (gen_xor3 (int_res, arg1, sign)); + emit_move_insn (operands[0], gen_lowpart (mode, int_res)); + DONE; + } ) -;; Note that the immediate (third) operand is the lane index not -;; the byte index. -(define_insn "*aarch64_sve_ext" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "0") - (match_operand:SVE_ALL 2 "register_operand" "w") - (match_operand:SI 3 "const_int_operand")] - UNSPEC_EXT))] - "TARGET_SVE - && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (mode), 0, 255)" +;; ------------------------------------------------------------------------- +;; ---- [FP] Maximum and minimum +;; ------------------------------------------------------------------------- +;; Includes: +;; - FMAX +;; - FMAXNM +;; - FMIN +;; - FMINNM +;; ------------------------------------------------------------------------- + +;; Unpredicated fmax/fmin (the libm functions). The optabs for the +;; smin/smax rtx codes are handled in the generic section above. 
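The sign-copying expanders above synthesize copysign and xorsign purely with integer logic on the sign bit, mirroring the emitted and/ior/xor sequence. A scalar model for single precision, assuming the IEEE 754 layout and using hypothetical helper names:

  #include <stdint.h>
  #include <string.h>

  /* copysign: magnitude of x, sign of y (bit 31).  */
  static float copysign_bits (float x, float y)
  {
    uint32_t ux, uy, r;
    memcpy (&ux, &x, sizeof ux);
    memcpy (&uy, &y, sizeof uy);
    r = (ux & 0x7fffffffu) | (uy & 0x80000000u);
    memcpy (&x, &r, sizeof x);
    return x;
  }

  /* xorsign: flip the sign of x when y is negative, i.e. x ^ (y & mask).  */
  static float xorsign_bits (float x, float y)
  {
    uint32_t ux, uy, r;
    memcpy (&ux, &x, sizeof ux);
    memcpy (&uy, &y, sizeof uy);
    r = ux ^ (uy & 0x80000000u);
    memcpy (&x, &r, sizeof x);
    return x;
  }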
+(define_expand "3" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 3) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 1 "register_operand") + (match_operand:SVE_FULL_F 2 "aarch64_sve_float_maxmin_operand")] + SVE_COND_FP_MAXMIN_PUBLIC))] + "TARGET_SVE" { - operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (mode)); - return "ext\\t%0.b, %0.b, %2.b, #%3"; + operands[3] = aarch64_ptrue_reg (mode); } ) -(define_insn "add3" - [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, w") - (plus:SVE_I - (match_operand:SVE_I 1 "register_operand" "%0, 0, 0, w") - (match_operand:SVE_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, w")))] +;; Predicated floating-point maximum/minimum. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?&w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "%0, 0, w, w") + (match_operand:SVE_FULL_F 3 "aarch64_sve_float_maxmin_operand" "vsB, w, vsB, w")] + SVE_COND_FP_MAXMIN))] "TARGET_SVE" "@ - add\t%0., %0., #%D2 - sub\t%0., %0., #%N2 - * return aarch64_output_sve_inc_dec_immediate (\"%0.\", operands[2]); - add\t%0., %1., %2." + \t%0., %1/m, %0., #%3 + \t%0., %1/m, %0., %3. + movprfx\t%0, %2\;\t%0., %1/m, %0., #%3 + movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + [(set_attr "movprfx" "*,*,yes,yes")] ) -(define_insn "sub3" - [(set (match_operand:SVE_I 0 "register_operand" "=w, w") - (minus:SVE_I - (match_operand:SVE_I 1 "aarch64_sve_arith_operand" "w, vsa") - (match_operand:SVE_I 2 "register_operand" "w, 0")))] +;; Merging forms are handled through SVE_COND_FP_BINARY and +;; SVE_COND_FP_BINARY_I1. + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Binary logical operations +;; ------------------------------------------------------------------------- +;; Includes: +;; - AND +;; - ANDS +;; - EOR +;; - EORS +;; - ORR +;; - ORRS +;; ------------------------------------------------------------------------- + +;; Predicate AND. We can reuse one of the inputs as the GP. +;; Doubling the second operand is the preferred implementation +;; of the MOV alias, so we use that instead of %1/z, %1, %2. +(define_insn "and3" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand" "Upa") + (match_operand:PRED_ALL 2 "register_operand" "Upa")))] "TARGET_SVE" - "@ - sub\t%0., %1., %2. - subr\t%0., %0., #%D1" + "and\t%0.b, %1/z, %2.b, %2.b" ) -;; Unpredicated multiplication. -(define_expand "mul3" - [(set (match_operand:SVE_I 0 "register_operand") - (unspec:SVE_I - [(match_dup 3) - (mult:SVE_I - (match_operand:SVE_I 1 "register_operand") - (match_operand:SVE_I 2 "aarch64_sve_mul_operand"))] - UNSPEC_MERGE_PTRUE))] +;; Unpredicated predicate EOR and ORR. +(define_expand "3" + [(set (match_operand:PRED_ALL 0 "register_operand") + (and:PRED_ALL + (LOGICAL_OR:PRED_ALL + (match_operand:PRED_ALL 1 "register_operand") + (match_operand:PRED_ALL 2 "register_operand")) + (match_dup 3)))] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + operands[3] = aarch64_ptrue_reg (mode); } ) -;; Multiplication predicated with a PTRUE. We don't actually need the -;; predicate for the first alternative, but using Upa or X isn't likely -;; to gain much and would make the instruction seem less uniform to the -;; register allocator. 
-(define_insn_and_split "*mul3" - [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (mult:SVE_I - (match_operand:SVE_I 2 "register_operand" "%0, 0, w") - (match_operand:SVE_I 3 "aarch64_sve_mul_operand" "vsm, w, w"))] - UNSPEC_MERGE_PTRUE))] +;; Predicated predicate AND, EOR and ORR. +(define_insn "@aarch64_pred__z" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (and:PRED_ALL + (LOGICAL:PRED_ALL + (match_operand:PRED_ALL 2 "register_operand" "Upa") + (match_operand:PRED_ALL 3 "register_operand" "Upa")) + (match_operand:PRED_ALL 1 "register_operand" "Upa")))] "TARGET_SVE" - "@ - # - mul\t%0., %1/m, %0., %3. - movprfx\t%0, %2\;mul\t%0., %1/m, %0., %3." - ; Split the unpredicated form after reload, so that we don't have - ; the unnecessary PTRUE. - "&& reload_completed - && !register_operand (operands[3], mode)" - [(set (match_dup 0) (mult:SVE_I (match_dup 2) (match_dup 3)))] - "" - [(set_attr "movprfx" "*,*,yes")] -) - -;; Unpredicated multiplications by a constant (post-RA only). -;; These are generated by splitting a predicated instruction whose -;; predicate is unused. -(define_insn "*post_ra_mul3" - [(set (match_operand:SVE_I 0 "register_operand" "=w") - (mult:SVE_I - (match_operand:SVE_I 1 "register_operand" "0") - (match_operand:SVE_I 2 "aarch64_sve_mul_immediate")))] - "TARGET_SVE && reload_completed" - "mul\t%0., %0., #%2" + "\t%0.b, %1/z, %2.b, %3.b" ) -(define_insn "*madd" - [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") - (plus:SVE_I - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "%0, w, w") - (match_operand:SVE_I 3 "register_operand" "w, w, w"))] - UNSPEC_MERGE_PTRUE) - (match_operand:SVE_I 4 "register_operand" "w, 0, w")))] +;; Perform a logical operation on operands 2 and 3, using operand 1 as +;; the GP. Store the result in operand 0 and set the flags in the same +;; way as for PTEST. +(define_insn "*3_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (and:PRED_ALL + (LOGICAL:PRED_ALL + (match_operand:PRED_ALL 2 "register_operand" "Upa") + (match_operand:PRED_ALL 3 "register_operand" "Upa")) + (match_dup 4))] + UNSPEC_PTEST)) + (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3)) + (match_dup 4)))] "TARGET_SVE" - "@ - mad\t%0., %1/m, %3., %4. - mla\t%0., %1/m, %2., %3. - movprfx\t%0, %4\;mla\t%0., %1/m, %2., %3." - [(set_attr "movprfx" "*,*,yes")] + "s\t%0.b, %1/z, %2.b, %3.b" ) -(define_insn "*msub3" - [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") - (minus:SVE_I - (match_operand:SVE_I 4 "register_operand" "w, 0, w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (mult:SVE_I (match_operand:SVE_I 2 "register_operand" "%0, w, w") - (match_operand:SVE_I 3 "register_operand" "w, w, w"))] - UNSPEC_MERGE_PTRUE)))] +;; Same with just the flags result. 
+(define_insn "*3_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (and:PRED_ALL + (LOGICAL:PRED_ALL + (match_operand:PRED_ALL 2 "register_operand" "Upa") + (match_operand:PRED_ALL 3 "register_operand" "Upa")) + (match_dup 4))] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa"))] "TARGET_SVE" - "@ - msb\t%0., %1/m, %3., %4. - mls\t%0., %1/m, %2., %3. - movprfx\t%0, %4\;mls\t%0., %1/m, %2., %3." - [(set_attr "movprfx" "*,*,yes")] + "s\t%0.b, %1/z, %2.b, %3.b" ) -;; Unpredicated highpart multiplication. -(define_expand "mul3_highpart" - [(set (match_operand:SVE_I 0 "register_operand") - (unspec:SVE_I - [(match_dup 3) - (unspec:SVE_I [(match_operand:SVE_I 1 "register_operand") - (match_operand:SVE_I 2 "register_operand")] - MUL_HIGHPART)] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [PRED] Binary logical operations (inverted second input) +;; ------------------------------------------------------------------------- +;; Includes: +;; - BIC +;; - ORN +;; ------------------------------------------------------------------------- + +;; Predicated predicate BIC and ORN. +(define_insn "aarch64_pred__z" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (and:PRED_ALL + (NLOGICAL:PRED_ALL + (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand" "Upa")) + (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (match_operand:PRED_ALL 1 "register_operand" "Upa")))] "TARGET_SVE" - { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); - } -) + "\t%0.b, %1/z, %2.b, %3.b" +) + +;; Same, but set the flags as a side-effect. +(define_insn "*3_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (and:PRED_ALL + (NLOGICAL:PRED_ALL + (not:PRED_ALL + (match_operand:PRED_ALL 3 "register_operand" "Upa")) + (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (match_dup 4))] + UNSPEC_PTEST)) + (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (and:PRED_ALL (NLOGICAL:PRED_ALL + (not:PRED_ALL (match_dup 3)) + (match_dup 2)) + (match_dup 4)))] + "TARGET_SVE" + "s\t%0.b, %1/z, %2.b, %3.b" +) + +;; Same with just the flags result. +(define_insn "*3_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (and:PRED_ALL + (NLOGICAL:PRED_ALL + (not:PRED_ALL + (match_operand:PRED_ALL 3 "register_operand" "Upa")) + (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (match_dup 4))] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa"))] + "TARGET_SVE" + "s\t%0.b, %1/z, %2.b, %3.b" +) + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Binary logical operations (inverted result) +;; ------------------------------------------------------------------------- +;; Includes: +;; - NAND +;; - NOR +;; ------------------------------------------------------------------------- -;; Predicated highpart multiplication. 
-(define_insn "*mul3_highpart" - [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl") - (unspec:SVE_I [(match_operand:SVE_I 2 "register_operand" "%0, w") - (match_operand:SVE_I 3 "register_operand" "w, w")] - MUL_HIGHPART)] - UNSPEC_MERGE_PTRUE))] +;; Predicated predicate NAND and NOR. +(define_insn "aarch64_pred__z" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (and:PRED_ALL + (NLOGICAL:PRED_ALL + (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand" "Upa"))) + (match_operand:PRED_ALL 1 "register_operand" "Upa")))] "TARGET_SVE" - "@ - mulh\t%0., %1/m, %0., %3. - movprfx\t%0, %2\;mulh\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,yes")] + "\t%0.b, %1/z, %2.b, %3.b" ) -;; Unpredicated division. -(define_expand "3" - [(set (match_operand:SVE_SDI 0 "register_operand") - (unspec:SVE_SDI - [(match_dup 3) - (SVE_INT_BINARY_SD:SVE_SDI - (match_operand:SVE_SDI 1 "register_operand") - (match_operand:SVE_SDI 2 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; Same, but set the flags as a side-effect. +(define_insn "*3_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (and:PRED_ALL + (NLOGICAL:PRED_ALL + (not:PRED_ALL + (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (not:PRED_ALL + (match_operand:PRED_ALL 3 "register_operand" "Upa"))) + (match_dup 4))] + UNSPEC_PTEST)) + (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (and:PRED_ALL (NLOGICAL:PRED_ALL + (not:PRED_ALL (match_dup 2)) + (not:PRED_ALL (match_dup 3))) + (match_dup 4)))] + "TARGET_SVE" + "s\t%0.b, %1/z, %2.b, %3.b" +) + +;; Same with just the flags result. +(define_insn "*3_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (and:PRED_ALL + (NLOGICAL:PRED_ALL + (not:PRED_ALL + (match_operand:PRED_ALL 2 "register_operand" "Upa")) + (not:PRED_ALL + (match_operand:PRED_ALL 3 "register_operand" "Upa"))) + (match_dup 4))] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa"))] + "TARGET_SVE" + "s\t%0.b, %1/z, %2.b, %3.b" +) + +;; ========================================================================= +;; == Ternary arithmetic +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [INT] MLA and MAD +;; ------------------------------------------------------------------------- +;; Includes: +;; - MAD +;; - MLA +;; ------------------------------------------------------------------------- + +;; Unpredicated integer addition of product. +(define_expand "fma4" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (plus:SVE_FULL_I + (unspec:SVE_FULL_I + [(match_dup 4) + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand") + (match_operand:SVE_FULL_I 2 "nonmemory_operand"))] + UNSPEC_PRED_X) + (match_operand:SVE_FULL_I 3 "register_operand")))] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + if (aarch64_prepare_sve_int_fma (operands, PLUS)) + DONE; + operands[4] = aarch64_ptrue_reg (mode); } ) -;; Division predicated with a PTRUE. 
-(define_insn "*3" - [(set (match_operand:SVE_SDI 0 "register_operand" "=w, w, ?&w") - (unspec:SVE_SDI - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (SVE_INT_BINARY_SD:SVE_SDI - (match_operand:SVE_SDI 2 "register_operand" "0, w, w") - (match_operand:SVE_SDI 3 "aarch64_sve_mul_operand" "w, 0, w"))] - UNSPEC_MERGE_PTRUE))] +;; Predicated integer addition of product. +(define_insn "@aarch64_pred_fma" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w") + (plus:SVE_FULL_I + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "%0, w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w"))] + UNSPEC_PRED_X) + (match_operand:SVE_FULL_I 4 "register_operand" "w, 0, w")))] "TARGET_SVE" "@ - \t%0., %1/m, %0., %3. - r\t%0., %1/m, %0., %2. - movprfx\t%0, %2\;\t%0., %1/m, %0., %3." + mad\t%0., %1/m, %3., %4. + mla\t%0., %1/m, %2., %3. + movprfx\t%0, %4\;mla\t%0., %1/m, %2., %3." [(set_attr "movprfx" "*,*,yes")] ) -;; Unpredicated NEG, NOT and POPCOUNT. -(define_expand "2" - [(set (match_operand:SVE_I 0 "register_operand") - (unspec:SVE_I - [(match_dup 2) - (SVE_INT_UNARY:SVE_I (match_operand:SVE_I 1 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; Predicated integer addition of product with merging. +(define_expand "cond_fma" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (plus:SVE_FULL_I + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "general_operand")) + (match_operand:SVE_FULL_I 4 "register_operand")) + (match_operand:SVE_FULL_I 5 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] "TARGET_SVE" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + if (aarch64_prepare_sve_cond_int_fma (operands, PLUS)) + DONE; + /* Swap the multiplication operands if the fallback value is the + second of the two. */ + if (rtx_equal_p (operands[3], operands[5])) + std::swap (operands[2], operands[3]); } ) -;; NEG, NOT and POPCOUNT predicated with a PTRUE. -(define_insn "*2" - [(set (match_operand:SVE_I 0 "register_operand" "=w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl") - (SVE_INT_UNARY:SVE_I - (match_operand:SVE_I 2 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] +;; Predicated integer addition of product, merging with the first input. +(define_insn "*cond_fma_2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (plus:SVE_FULL_I + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) + (match_operand:SVE_FULL_I 4 "register_operand" "w, w")) + (match_dup 2)] + UNSPEC_SEL))] "TARGET_SVE" - "\t%0., %1/m, %2." + "@ + mad\t%0., %1/m, %3., %4. + movprfx\t%0, %2\;mad\t%0., %1/m, %3., %4." + [(set_attr "movprfx" "*,yes")] ) -;; Vector AND, ORR and XOR. -(define_insn "3" - [(set (match_operand:SVE_I 0 "register_operand" "=w, w") - (LOGICAL:SVE_I - (match_operand:SVE_I 1 "register_operand" "%0, w") - (match_operand:SVE_I 2 "aarch64_sve_logical_operand" "vsl, w")))] +;; Predicated integer addition of product, merging with the third input. 
+(define_insn "*cond_fma_4" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (plus:SVE_FULL_I + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w")) + (match_operand:SVE_FULL_I 4 "register_operand" "0, w")) + (match_dup 4)] + UNSPEC_SEL))] "TARGET_SVE" "@ - \t%0., %0., #%C2 - \t%0.d, %1.d, %2.d" + mla\t%0., %1/m, %2., %3. + movprfx\t%0, %4\;mla\t%0., %1/m, %2., %3." + [(set_attr "movprfx" "*,yes")] ) -;; Vector AND, ORR and XOR on floating-point modes. We avoid subregs -;; by providing this, but we need to use UNSPECs since rtx logical ops -;; aren't defined for floating-point modes. -(define_insn "*3" - [(set (match_operand:SVE_F 0 "register_operand" "=w") - (unspec:SVE_F [(match_operand:SVE_F 1 "register_operand" "w") - (match_operand:SVE_F 2 "register_operand" "w")] - LOGICALF))] - "TARGET_SVE" - "\t%0.d, %1.d, %2.d" +;; Predicated integer addition of product, merging with an independent value. +(define_insn_and_rewrite "*cond_fma_any" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (plus:SVE_FULL_I + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, w, 0, w, w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w, 0, w, w")) + (match_operand:SVE_FULL_I 4 "register_operand" "w, 0, w, w, w, w")) + (match_operand:SVE_FULL_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[5]) + && !rtx_equal_p (operands[3], operands[5]) + && !rtx_equal_p (operands[4], operands[5])" + "@ + movprfx\t%0., %1/z, %4.\;mla\t%0., %1/m, %2., %3. + movprfx\t%0., %1/z, %0.\;mla\t%0., %1/m, %2., %3. + movprfx\t%0., %1/z, %0.\;mad\t%0., %1/m, %3., %4. + movprfx\t%0., %1/z, %0.\;mad\t%0., %1/m, %2., %4. + movprfx\t%0., %1/m, %4.\;mla\t%0., %1/m, %2., %3. + #" + "&& reload_completed + && register_operand (operands[5], mode) + && !rtx_equal_p (operands[0], operands[5])" + { + emit_insn (gen_vcond_mask_ (operands[0], operands[4], + operands[5], operands[1])); + operands[5] = operands[4] = operands[0]; + } + [(set_attr "movprfx" "yes")] ) -;; REG_EQUAL notes on "not3" should ensure that we can generate -;; this pattern even though the NOT instruction itself is predicated. -(define_insn "bic3" - [(set (match_operand:SVE_I 0 "register_operand" "=w") - (and:SVE_I - (not:SVE_I (match_operand:SVE_I 1 "register_operand" "w")) - (match_operand:SVE_I 2 "register_operand" "w")))] +;; ------------------------------------------------------------------------- +;; ---- [INT] MLS and MSB +;; ------------------------------------------------------------------------- +;; Includes: +;; - MLS +;; - MSB +;; ------------------------------------------------------------------------- + +;; Unpredicated integer subtraction of product. +(define_expand "fnma4" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (minus:SVE_FULL_I + (match_operand:SVE_FULL_I 3 "register_operand") + (unspec:SVE_FULL_I + [(match_dup 4) + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 1 "register_operand") + (match_operand:SVE_FULL_I 2 "general_operand"))] + UNSPEC_PRED_X)))] "TARGET_SVE" - "bic\t%0.d, %2.d, %1.d" + { + if (aarch64_prepare_sve_int_fma (operands, MINUS)) + DONE; + operands[4] = aarch64_ptrue_reg (mode); + } ) -;; Predicate AND. 
We can reuse one of the inputs as the GP. -(define_insn "and3" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand" "Upa") - (match_operand:PRED_ALL 2 "register_operand" "Upa")))] +;; Predicated integer subtraction of product. +(define_insn "@aarch64_pred_fnma" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, ?&w") + (minus:SVE_FULL_I + (match_operand:SVE_FULL_I 4 "register_operand" "w, 0, w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "%0, w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w"))] + UNSPEC_PRED_X)))] "TARGET_SVE" - "and\t%0.b, %1/z, %1.b, %2.b" + "@ + msb\t%0., %1/m, %3., %4. + mls\t%0., %1/m, %2., %3. + movprfx\t%0, %4\;mls\t%0., %1/m, %2., %3." + [(set_attr "movprfx" "*,*,yes")] ) -;; Unpredicated predicate ORR and XOR. -(define_expand "3" - [(set (match_operand:PRED_ALL 0 "register_operand") - (and:PRED_ALL - (LOGICAL_OR:PRED_ALL - (match_operand:PRED_ALL 1 "register_operand") - (match_operand:PRED_ALL 2 "register_operand")) - (match_dup 3)))] +;; Predicated integer subtraction of product with merging. +(define_expand "cond_fnma" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand") + (minus:SVE_FULL_I + (match_operand:SVE_FULL_I 4 "register_operand") + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "general_operand"))) + (match_operand:SVE_FULL_I 5 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + if (aarch64_prepare_sve_cond_int_fma (operands, MINUS)) + DONE; + /* Swap the multiplication operands if the fallback value is the + second of the two. */ + if (rtx_equal_p (operands[3], operands[5])) + std::swap (operands[2], operands[3]); } ) -;; Predicated predicate ORR and XOR. -(define_insn "pred_3" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (and:PRED_ALL - (LOGICAL:PRED_ALL - (match_operand:PRED_ALL 2 "register_operand" "Upa") - (match_operand:PRED_ALL 3 "register_operand" "Upa")) - (match_operand:PRED_ALL 1 "register_operand" "Upa")))] +;; Predicated integer subtraction of product, merging with the first input. +(define_insn "*cond_fnma_2" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (minus:SVE_FULL_I + (match_operand:SVE_FULL_I 4 "register_operand" "w, w") + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "0, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w"))) + (match_dup 2)] + UNSPEC_SEL))] "TARGET_SVE" - "\t%0.b, %1/z, %2.b, %3.b" + "@ + msb\t%0., %1/m, %3., %4. + movprfx\t%0, %2\;msb\t%0., %1/m, %3., %4." + [(set_attr "movprfx" "*,yes")] ) -;; Perform a logical operation on operands 2 and 3, using operand 1 as -;; the GP (which is known to be a PTRUE). Store the result in operand 0 -;; and set the flags in the same way as for PTEST. The (and ...) in the -;; UNSPEC_PTEST_PTRUE is logically redundant, but means that the tested -;; value is structurally equivalent to rhs of the second set. 
-(define_insn "*3_cc" - [(set (reg:CC CC_REGNUM) - (compare:CC - (unspec:SI [(match_operand:PRED_ALL 1 "register_operand" "Upa") - (and:PRED_ALL - (LOGICAL:PRED_ALL - (match_operand:PRED_ALL 2 "register_operand" "Upa") - (match_operand:PRED_ALL 3 "register_operand" "Upa")) - (match_dup 1))] - UNSPEC_PTEST_PTRUE) - (const_int 0))) - (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3)) - (match_dup 1)))] +;; Predicated integer subtraction of product, merging with the third input. +(define_insn "*cond_fnma_4" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl") + (minus:SVE_FULL_I + (match_operand:SVE_FULL_I 4 "register_operand" "0, w") + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w"))) + (match_dup 4)] + UNSPEC_SEL))] "TARGET_SVE" - "s\t%0.b, %1/z, %2.b, %3.b" + "@ + mls\t%0., %1/m, %2., %3. + movprfx\t%0, %4\;mls\t%0., %1/m, %2., %3." + [(set_attr "movprfx" "*,yes")] ) -;; Unpredicated predicate inverse. -(define_expand "one_cmpl2" - [(set (match_operand:PRED_ALL 0 "register_operand") - (and:PRED_ALL - (not:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")) - (match_dup 2)))] - "TARGET_SVE" +;; Predicated integer subtraction of product, merging with an +;; independent value. +(define_insn_and_rewrite "*cond_fnma_any" + [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_I + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (minus:SVE_FULL_I + (match_operand:SVE_FULL_I 4 "register_operand" "w, 0, w, w, w, w") + (mult:SVE_FULL_I + (match_operand:SVE_FULL_I 2 "register_operand" "w, w, 0, w, w, w") + (match_operand:SVE_FULL_I 3 "register_operand" "w, w, w, 0, w, w"))) + (match_operand:SVE_FULL_I 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[5]) + && !rtx_equal_p (operands[3], operands[5]) + && !rtx_equal_p (operands[4], operands[5])" + "@ + movprfx\t%0., %1/z, %4.\;mls\t%0., %1/m, %2., %3. + movprfx\t%0., %1/z, %0.\;mls\t%0., %1/m, %2., %3. + movprfx\t%0., %1/z, %0.\;msb\t%0., %1/m, %3., %4. + movprfx\t%0., %1/z, %0.\;msb\t%0., %1/m, %2., %4. + movprfx\t%0., %1/m, %4.\;mls\t%0., %1/m, %2., %3. + #" + "&& reload_completed + && register_operand (operands[5], mode) + && !rtx_equal_p (operands[0], operands[5])" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + emit_insn (gen_vcond_mask_ (operands[0], operands[4], + operands[5], operands[1])); + operands[5] = operands[4] = operands[0]; } + [(set_attr "movprfx" "yes")] ) -;; Predicated predicate inverse. -(define_insn "*one_cmpl3" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (and:PRED_ALL - (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) - (match_operand:PRED_ALL 1 "register_operand" "Upa")))] - "TARGET_SVE" - "not\t%0.b, %1/z, %2.b" -) - -;; Predicated predicate BIC and ORN. 
-(define_insn "*3" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (and:PRED_ALL - (NLOGICAL:PRED_ALL - (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) - (match_operand:PRED_ALL 3 "register_operand" "Upa")) - (match_operand:PRED_ALL 1 "register_operand" "Upa")))] - "TARGET_SVE" - "\t%0.b, %1/z, %3.b, %2.b" -) +;; ------------------------------------------------------------------------- +;; ---- [INT] Dot product +;; ------------------------------------------------------------------------- +;; Includes: +;; - SDOT +;; - SUDOT (I8MM) +;; - UDOT +;; - USDOT (I8MM) +;; ------------------------------------------------------------------------- -;; Predicated predicate NAND and NOR. -(define_insn "*3" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (and:PRED_ALL - (NLOGICAL:PRED_ALL - (not:PRED_ALL (match_operand:PRED_ALL 2 "register_operand" "Upa")) - (not:PRED_ALL (match_operand:PRED_ALL 3 "register_operand" "Upa"))) - (match_operand:PRED_ALL 1 "register_operand" "Upa")))] +;; Four-element integer dot-product with accumulation. +(define_insn "dot_prod" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, ?&w") + (plus:SVE_FULL_SDI + (unspec:SVE_FULL_SDI + [(match_operand: 1 "register_operand" "w, w") + (match_operand: 2 "register_operand" "w, w")] + DOTPROD) + (match_operand:SVE_FULL_SDI 3 "register_operand" "0, w")))] "TARGET_SVE" - "\t%0.b, %1/z, %2.b, %3.b" + "@ + dot\\t%0., %1., %2. + movprfx\t%0, %3\;dot\\t%0., %1., %2." + [(set_attr "movprfx" "*,yes")] ) -;; Unpredicated LSL, LSR and ASR by a vector. -(define_expand "v3" - [(set (match_operand:SVE_I 0 "register_operand") - (unspec:SVE_I - [(match_dup 3) - (ASHIFT:SVE_I - (match_operand:SVE_I 1 "register_operand") - (match_operand:SVE_I 2 "aarch64_sve_shift_operand"))] - UNSPEC_MERGE_PTRUE))] +;; Four-element integer dot-product by selected lanes with accumulation. +(define_insn "@aarch64_dot_prod_lane" + [(set (match_operand:SVE_FULL_SDI 0 "register_operand" "=w, ?&w") + (plus:SVE_FULL_SDI + (unspec:SVE_FULL_SDI + [(match_operand: 1 "register_operand" "w, w") + (unspec: + [(match_operand: 2 "register_operand" ", ") + (match_operand:SI 3 "const_int_operand")] + UNSPEC_SVE_LANE_SELECT)] + DOTPROD) + (match_operand:SVE_FULL_SDI 4 "register_operand" "0, w")))] "TARGET_SVE" - { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); - } + "@ + dot\\t%0., %1., %2.[%3] + movprfx\t%0, %4\;dot\\t%0., %1., %2.[%3]" + [(set_attr "movprfx" "*,yes")] ) -;; LSL, LSR and ASR by a vector, predicated with a PTRUE. We don't -;; actually need the predicate for the first alternative, but using Upa -;; or X isn't likely to gain much and would make the instruction seem -;; less uniform to the register allocator. -(define_insn_and_split "*v3" - [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (ASHIFT:SVE_I - (match_operand:SVE_I 2 "register_operand" "w, 0, w") - (match_operand:SVE_I 3 "aarch64_sve_shift_operand" "D, w, w"))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" +(define_insn "@aarch64_dot_prod" + [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") + (plus:VNx4SI_ONLY + (unspec:VNx4SI_ONLY + [(match_operand: 1 "register_operand" "w, w") + (match_operand: 2 "register_operand" "w, w")] + DOTPROD_US_ONLY) + (match_operand:VNx4SI_ONLY 3 "register_operand" "0, w")))] + "TARGET_SVE_I8MM" "@ - # - \t%0., %1/m, %0., %3. - movprfx\t%0, %2\;\t%0., %1/m, %0., %3." 
- "&& reload_completed - && !register_operand (operands[3], mode)" - [(set (match_dup 0) (ASHIFT:SVE_I (match_dup 2) (match_dup 3)))] - "" - [(set_attr "movprfx" "*,*,yes")] + dot\\t%0.s, %1.b, %2.b + movprfx\t%0, %3\;dot\\t%0.s, %1.b, %2.b" + [(set_attr "movprfx" "*,yes")] ) -;; Unpredicated shift operations by a constant (post-RA only). -;; These are generated by splitting a predicated instruction whose -;; predicate is unused. -(define_insn "*post_ra_v3" - [(set (match_operand:SVE_I 0 "register_operand" "=w") - (ASHIFT:SVE_I - (match_operand:SVE_I 1 "register_operand" "w") - (match_operand:SVE_I 2 "aarch64_simd_shift_imm")))] - "TARGET_SVE && reload_completed" - "\t%0., %1., #%2" +(define_insn "@aarch64_dot_prod_lane" + [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") + (plus:VNx4SI_ONLY + (unspec:VNx4SI_ONLY + [(match_operand: 1 "register_operand" "w, w") + (unspec: + [(match_operand: 2 "register_operand" "y, y") + (match_operand:SI 3 "const_int_operand")] + UNSPEC_SVE_LANE_SELECT)] + DOTPROD_I8MM) + (match_operand:VNx4SI_ONLY 4 "register_operand" "0, w")))] + "TARGET_SVE_I8MM" + "@ + dot\\t%0.s, %1.b, %2.b[%3] + movprfx\t%0, %4\;dot\\t%0.s, %1.b, %2.b[%3]" + [(set_attr "movprfx" "*,yes")] ) -;; LSL, LSR and ASR by a scalar, which expands into one of the vector -;; shifts above. -(define_expand "3" - [(set (match_operand:SVE_I 0 "register_operand") - (ASHIFT:SVE_I (match_operand:SVE_I 1 "register_operand") - (match_operand: 2 "general_operand")))] +;; ------------------------------------------------------------------------- +;; ---- [INT] Sum of absolute differences +;; ------------------------------------------------------------------------- +;; The patterns in this section are synthetic. +;; ------------------------------------------------------------------------- + +;; Emit a sequence to produce a sum-of-absolute-differences of the inputs in +;; operands 1 and 2. The sequence also has to perform a widening reduction of +;; the difference into a vector and accumulate that into operand 3 before +;; copying that into the result operand 0. +;; Perform that with a sequence of: +;; MOV ones.b, #1 +;; [SU]ABD diff.b, p0/m, op1.b, op2.b +;; MOVPRFX op0, op3 // If necessary +;; UDOT op0.s, diff.b, ones.b +(define_expand "sad" + [(use (match_operand:SVE_FULL_SDI 0 "register_operand")) + (unspec: [(use (match_operand: 1 "register_operand")) + (use (match_operand: 2 "register_operand"))] ABAL) + (use (match_operand:SVE_FULL_SDI 3 "register_operand"))] "TARGET_SVE" { - rtx amount; - if (CONST_INT_P (operands[2])) - { - amount = gen_const_vec_duplicate (mode, operands[2]); - if (!aarch64_sve_shift_operand (operands[2], mode)) - amount = force_reg (mode, amount); - } - else - { - amount = gen_reg_rtx (mode); - emit_insn (gen_vec_duplicate (amount, - convert_to_mode (mode, - operands[2], 0))); - } - emit_insn (gen_v3 (operands[0], operands[1], amount)); + rtx ones = force_reg (mode, CONST1_RTX (mode)); + rtx diff = gen_reg_rtx (mode); + emit_insn (gen_abd_3 (diff, operands[1], operands[2])); + emit_insn (gen_udot_prod (operands[0], diff, ones, operands[3])); DONE; } ) -;; Test all bits of operand 1. Operand 0 is a GP that is known to hold PTRUE. -;; -;; Using UNSPEC_PTEST_PTRUE allows combine patterns to assume that the GP -;; is a PTRUE even if the optimizers haven't yet been able to propagate -;; the constant. We would use a separate unspec code for PTESTs involving -;; GPs that might not be PTRUEs. 
-(define_insn "ptest_ptrue" - [(set (reg:CC CC_REGNUM) - (compare:CC - (unspec:SI [(match_operand:PRED_ALL 0 "register_operand" "Upa") - (match_operand:PRED_ALL 1 "register_operand" "Upa")] - UNSPEC_PTEST_PTRUE) - (const_int 0)))] - "TARGET_SVE" - "ptest\t%0, %1.b" +;; ------------------------------------------------------------------------- +;; ---- [INT] Matrix multiply-accumulate +;; ------------------------------------------------------------------------- +;; Includes: +;; - SMMLA (I8MM) +;; - UMMLA (I8MM) +;; - USMMLA (I8MM) +;; ------------------------------------------------------------------------- + +(define_insn "@aarch64_sve_add_" + [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") + (plus:VNx4SI_ONLY + (unspec:VNx4SI_ONLY + [(match_operand: 2 "register_operand" "w, w") + (match_operand: 3 "register_operand" "w, w")] + MATMUL) + (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w")))] + "TARGET_SVE_I8MM" + "@ + mmla\\t%0.s, %2.b, %3.b + movprfx\t%0, %1\;mmla\\t%0.s, %2.b, %3.b" + [(set_attr "movprfx" "*,yes")] ) -;; Set element I of the result if operand1 + J < operand2 for all J in [0, I]. -;; with the comparison being unsigned. -(define_insn "while_ult" - [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (unspec:PRED_ALL [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ") - (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")] - UNSPEC_WHILE_LO)) - (clobber (reg:CC CC_REGNUM))] - "TARGET_SVE" - "whilelo\t%0., %1, %2" -) - -;; WHILELO sets the flags in the same way as a PTEST with a PTRUE GP. -;; Handle the case in which both results are useful. The GP operand -;; to the PTEST isn't needed, so we allow it to be anything. -(define_insn_and_split "while_ult_cc" - [(set (reg:CC CC_REGNUM) - (compare:CC - (unspec:SI [(match_operand:PRED_ALL 1) - (unspec:PRED_ALL - [(match_operand:GPI 2 "aarch64_reg_or_zero" "rZ") - (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")] - UNSPEC_WHILE_LO)] - UNSPEC_PTEST_PTRUE) - (const_int 0))) - (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") - (unspec:PRED_ALL [(match_dup 2) - (match_dup 3)] - UNSPEC_WHILE_LO))] +;; ------------------------------------------------------------------------- +;; ---- [FP] General ternary arithmetic corresponding to unspecs +;; ------------------------------------------------------------------------- +;; Includes merging patterns for: +;; - FMAD +;; - FMLA +;; - FMLS +;; - FMSB +;; - FNMAD +;; - FNMLA +;; - FNMLS +;; - FNMSB +;; ------------------------------------------------------------------------- + +;; Unpredicated floating-point ternary operations. +(define_expand "4" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 4) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 1 "register_operand") + (match_operand:SVE_FULL_F 2 "register_operand") + (match_operand:SVE_FULL_F 3 "register_operand")] + SVE_COND_FP_TERNARY))] "TARGET_SVE" - "whilelo\t%0., %2, %3" - ;; Force the compiler to drop the unused predicate operand, so that we - ;; don't have an unnecessary PTRUE. - "&& !CONSTANT_P (operands[1])" - [(const_int 0)] { - emit_insn (gen_while_ult_cc - (operands[0], CONSTM1_RTX (mode), - operands[2], operands[3])); - DONE; + operands[4] = aarch64_ptrue_reg (mode); } ) -;; Integer comparisons predicated with a PTRUE. 
-(define_insn "*cmp" - [(set (match_operand: 0 "register_operand" "=Upa, Upa") - (unspec: - [(match_operand: 1 "register_operand" "Upl, Upl") - (SVE_INT_CMP: - (match_operand:SVE_I 2 "register_operand" "w, w") - (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w"))] - UNSPEC_MERGE_PTRUE)) - (clobber (reg:CC CC_REGNUM))] +;; Predicated floating-point ternary operations. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "%w, 0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "0, w, w")] + SVE_COND_FP_TERNARY))] "TARGET_SVE" "@ - cmp\t%0., %1/z, %2., #%3 - cmp\t%0., %1/z, %2., %3." + \t%0., %1/m, %2., %3. + \t%0., %1/m, %3., %4. + movprfx\t%0, %4\;\t%0., %1/m, %2., %3." + [(set_attr "movprfx" "*,*,yes")] ) -;; Integer comparisons predicated with a PTRUE in which only the flags result -;; is interesting. -(define_insn "*cmp_ptest" - [(set (reg:CC CC_REGNUM) - (compare:CC - (unspec:SI - [(match_operand: 1 "register_operand" "Upl, Upl") - (unspec: - [(match_dup 1) - (SVE_INT_CMP: - (match_operand:SVE_I 2 "register_operand" "w, w") - (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w"))] - UNSPEC_MERGE_PTRUE)] - UNSPEC_PTEST_PTRUE) - (const_int 0))) - (clobber (match_scratch: 0 "=Upa, Upa"))] +;; Predicated floating-point ternary operations with merging. +(define_expand "@cond_" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand") + (match_operand:SVE_FULL_F 3 "register_operand") + (match_operand:SVE_FULL_F 4 "register_operand")] + SVE_COND_FP_TERNARY) + (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] "TARGET_SVE" +{ + /* Swap the multiplication operands if the fallback value is the + second of the two. */ + if (rtx_equal_p (operands[3], operands[5])) + std::swap (operands[2], operands[3]); +}) + +;; Predicated floating-point ternary operations, merging with the +;; first input. +(define_insn_and_rewrite "*cond__2" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "0, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, w")] + SVE_COND_FP_TERNARY) + (match_dup 2)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" "@ - cmp\t%0., %1/z, %2., #%3 - cmp\t%0., %1/z, %2., %3." + \t%0., %1/m, %3., %4. + movprfx\t%0, %2\;\t%0., %1/m, %3., %4." + "&& !rtx_equal_p (operands[1], operands[5])" + { + operands[5] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] ) -;; Integer comparisons predicated with a PTRUE in which both the flag and -;; predicate results are interesting. 
-(define_insn "*cmp_cc" - [(set (reg:CC CC_REGNUM) - (compare:CC - (unspec:SI - [(match_operand: 1 "register_operand" "Upl, Upl") - (unspec: - [(match_dup 1) - (SVE_INT_CMP: - (match_operand:SVE_I 2 "register_operand" "w, w") - (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w"))] - UNSPEC_MERGE_PTRUE)] - UNSPEC_PTEST_PTRUE) - (const_int 0))) - (set (match_operand: 0 "register_operand" "=Upa, Upa") - (unspec: - [(match_dup 1) - (SVE_INT_CMP: - (match_dup 2) - (match_dup 3))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" +;; Predicated floating-point ternary operations, merging with the +;; third input. +(define_insn_and_rewrite "*cond__4" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] + SVE_COND_FP_TERNARY) + (match_dup 4)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" "@ - cmp\t%0., %1/z, %2., #%3 - cmp\t%0., %1/z, %2., %3." + \t%0., %1/m, %2., %3. + movprfx\t%0, %4\;\t%0., %1/m, %2., %3." + "&& !rtx_equal_p (operands[1], operands[5])" + { + operands[5] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] ) -;; Predicated integer comparisons, formed by combining a PTRUE-predicated -;; comparison with an AND. Split the instruction into its preferred form -;; (below) at the earliest opportunity, in order to get rid of the -;; redundant operand 1. -(define_insn_and_split "*pred_cmp_combine" - [(set (match_operand: 0 "register_operand" "=Upa, Upa") - (and: - (unspec: - [(match_operand: 1) - (SVE_INT_CMP: - (match_operand:SVE_I 2 "register_operand" "w, w") - (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w"))] - UNSPEC_MERGE_PTRUE) - (match_operand: 4 "register_operand" "Upl, Upl"))) - (clobber (reg:CC CC_REGNUM))] - "TARGET_SVE" - "#" +;; Predicated floating-point ternary operations, merging with an +;; independent value. +(define_insn_and_rewrite "*cond__any" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 6) + (match_operand:SI 7 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, 0, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, 0, w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w, w, w")] + SVE_COND_FP_TERNARY) + (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[2], operands[5]) + && !rtx_equal_p (operands[3], operands[5]) + && !rtx_equal_p (operands[4], operands[5]) + && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "@ + movprfx\t%0., %1/z, %4.\;\t%0., %1/m, %2., %3. + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %2., %3. + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %3., %4. + movprfx\t%0., %1/z, %0.\;\t%0., %1/m, %2., %4. + movprfx\t%0., %1/m, %4.\;\t%0., %1/m, %2., %3. 
+ #" "&& 1" - [(parallel - [(set (match_dup 0) - (and: - (SVE_INT_CMP: - (match_dup 2) - (match_dup 3)) - (match_dup 4))) - (clobber (reg:CC CC_REGNUM))])] + { + if (reload_completed + && register_operand (operands[5], mode) + && !rtx_equal_p (operands[0], operands[5])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[4], + operands[5], operands[1])); + operands[5] = operands[4] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[6])) + operands[6] = copy_rtx (operands[1]); + else + FAIL; + } + [(set_attr "movprfx" "yes")] ) -;; Predicated integer comparisons. -(define_insn "*pred_cmp" - [(set (match_operand: 0 "register_operand" "=Upa, Upa") - (and: - (SVE_INT_CMP: - (match_operand:SVE_I 2 "register_operand" "w, w") - (match_operand:SVE_I 3 "aarch64_sve_cmp__operand" ", w")) - (match_operand: 1 "register_operand" "Upl, Upl"))) - (clobber (reg:CC CC_REGNUM))] +;; Unpredicated FMLA and FMLS by selected lanes. It doesn't seem worth using +;; (fma ...) since target-independent code won't understand the indexing. +(define_insn "@aarch64__lane_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 1 "register_operand" "w, w") + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 2 "register_operand" ", ") + (match_operand:SI 3 "const_int_operand")] + UNSPEC_SVE_LANE_SELECT) + (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] + SVE_FP_TERNARY_LANE))] "TARGET_SVE" "@ - cmp\t%0., %1/z, %2., #%3 - cmp\t%0., %1/z, %2., %3." + \t%0., %1., %2.[%3] + movprfx\t%0, %4\;\t%0., %1., %2.[%3]" + [(set_attr "movprfx" "*,yes")] ) -;; Floating-point comparisons predicated with a PTRUE. -(define_insn "*fcm" - [(set (match_operand: 0 "register_operand" "=Upa, Upa") - (unspec: +;; ------------------------------------------------------------------------- +;; ---- [FP] Complex multiply-add +;; ------------------------------------------------------------------------- +;; Includes merging patterns for: +;; - FCMLA +;; ------------------------------------------------------------------------- + +;; Predicated FCMLA. +(define_insn "@aarch64_pred_" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F [(match_operand: 1 "register_operand" "Upl, Upl") - (SVE_FP_CMP: - (match_operand:SVE_F 2 "register_operand" "w, w") - (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w"))] - UNSPEC_MERGE_PTRUE))] + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] + SVE_COND_FCMLA))] "TARGET_SVE" "@ - fcm\t%0., %1/z, %2., #0.0 - fcm\t%0., %1/z, %2., %3." + fcmla\t%0., %1/m, %2., %3., # + movprfx\t%0, %4\;fcmla\t%0., %1/m, %2., %3., #" + [(set_attr "movprfx" "*,yes")] ) -(define_insn "*fcmuo" - [(set (match_operand: 0 "register_operand" "=Upa") - (unspec: - [(match_operand: 1 "register_operand" "Upl") - (unordered: - (match_operand:SVE_F 2 "register_operand" "w") - (match_operand:SVE_F 3 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] +;; Predicated FCMLA with merging. 
+(define_expand "@cond_" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand") + (match_operand:SVE_FULL_F 3 "register_operand") + (match_operand:SVE_FULL_F 4 "register_operand")] + SVE_COND_FCMLA) + (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] "TARGET_SVE" - "fcmuo\t%0., %1/z, %2., %3." ) -;; Floating-point comparisons predicated on a PTRUE, with the results ANDed -;; with another predicate P. This does not have the same trapping behavior -;; as predicating the comparison itself on P, but it's a legitimate fold, -;; since we can drop any potentially-trapping operations whose results -;; are not needed. -;; -;; Split the instruction into its preferred form (below) at the earliest -;; opportunity, in order to get rid of the redundant operand 1. -(define_insn_and_split "*fcm_and_combine" - [(set (match_operand: 0 "register_operand" "=Upa, Upa") - (and: - (unspec: - [(match_operand: 1) - (SVE_FP_CMP - (match_operand:SVE_F 2 "register_operand" "w, w") - (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w"))] - UNSPEC_MERGE_PTRUE) - (match_operand: 4 "register_operand" "Upl, Upl")))] - "TARGET_SVE" - "#" - "&& 1" - [(set (match_dup 0) - (and: - (SVE_FP_CMP: - (match_dup 2) - (match_dup 3)) - (match_dup 4)))] +;; Predicated FCMLA, merging with the third input. +(define_insn_and_rewrite "*cond__4" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "0, w")] + SVE_COND_FCMLA) + (match_dup 4)] + UNSPEC_SEL))] + "TARGET_SVE && aarch64_sve_pred_dominates_p (&operands[5], operands[1])" + "@ + fcmla\t%0., %1/m, %2., %3., # + movprfx\t%0, %4\;fcmla\t%0., %1/m, %2., %3., #" + "&& !rtx_equal_p (operands[1], operands[5])" + { + operands[5] = copy_rtx (operands[1]); + } + [(set_attr "movprfx" "*,yes")] ) -(define_insn_and_split "*fcmuo_and_combine" - [(set (match_operand: 0 "register_operand" "=Upa") - (and: - (unspec: - [(match_operand: 1) - (unordered - (match_operand:SVE_F 2 "register_operand" "w") - (match_operand:SVE_F 3 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE) - (match_operand: 4 "register_operand" "Upl")))] - "TARGET_SVE" - "#" +;; Predicated FCMLA, merging with an independent value. 
+(define_insn_and_rewrite "*cond__any" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 6) + (match_operand:SI 7 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w, w") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w, w, w") + (match_operand:SVE_FULL_F 4 "register_operand" "w, 0, w, w")] + SVE_COND_FCMLA) + (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero" "Dz, Dz, 0, w")] + UNSPEC_SEL))] + "TARGET_SVE + && !rtx_equal_p (operands[4], operands[5]) + && aarch64_sve_pred_dominates_p (&operands[6], operands[1])" + "@ + movprfx\t%0., %1/z, %4.\;fcmla\t%0., %1/m, %2., %3., # + movprfx\t%0., %1/z, %0.\;fcmla\t%0., %1/m, %2., %3., # + movprfx\t%0., %1/m, %4.\;fcmla\t%0., %1/m, %2., %3., # + #" "&& 1" - [(set (match_dup 0) - (and: - (unordered: - (match_dup 2) - (match_dup 3)) - (match_dup 4)))] + { + if (reload_completed + && register_operand (operands[5], mode) + && !rtx_equal_p (operands[0], operands[5])) + { + emit_insn (gen_vcond_mask_ (operands[0], operands[4], + operands[5], operands[1])); + operands[5] = operands[4] = operands[0]; + } + else if (!rtx_equal_p (operands[1], operands[6])) + operands[6] = copy_rtx (operands[1]); + else + FAIL; + } + [(set_attr "movprfx" "yes")] ) -;; Unpredicated floating-point comparisons, with the results ANDed -;; with another predicate. This is a valid fold for the same reasons -;; as above. -(define_insn "*fcm_and" - [(set (match_operand: 0 "register_operand" "=Upa, Upa") - (and: - (SVE_FP_CMP: - (match_operand:SVE_F 2 "register_operand" "w, w") - (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w")) - (match_operand: 1 "register_operand" "Upl, Upl")))] +;; Unpredicated FCMLA with indexing. +(define_insn "@aarch64__lane_" + [(set (match_operand:SVE_FULL_HSF 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_HSF + [(match_operand:SVE_FULL_HSF 1 "register_operand" "w, w") + (unspec:SVE_FULL_HSF + [(match_operand:SVE_FULL_HSF 2 "register_operand" ", ") + (match_operand:SI 3 "const_int_operand")] + UNSPEC_SVE_LANE_SELECT) + (match_operand:SVE_FULL_HSF 4 "register_operand" "0, w")] + FCMLA))] "TARGET_SVE" "@ - fcm\t%0., %1/z, %2., #0.0 - fcm\t%0., %1/z, %2., %3." + fcmla\t%0., %1., %2.[%3], # + movprfx\t%0, %4\;fcmla\t%0., %1., %2.[%3], #" + [(set_attr "movprfx" "*,yes")] ) -(define_insn "*fcmuo_and" - [(set (match_operand: 0 "register_operand" "=Upa") - (and: - (unordered: - (match_operand:SVE_F 2 "register_operand" "w") - (match_operand:SVE_F 3 "register_operand" "w")) - (match_operand: 1 "register_operand" "Upl")))] +;; ------------------------------------------------------------------------- +;; ---- [FP] Trigonometric multiply-add +;; ------------------------------------------------------------------------- +;; Includes: +;; - FTMAD +;; ------------------------------------------------------------------------- + +(define_insn "@aarch64_sve_tmad" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 1 "register_operand" "0, w") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:DI 3 "const_int_operand")] + UNSPEC_FTMAD))] "TARGET_SVE" - "fcmuo\t%0., %1/z, %2., %3." + "@ + ftmad\t%0., %0., %2., #%3 + movprfx\t%0, %1\;ftmad\t%0., %0., %2., #%3" + [(set_attr "movprfx" "*,yes")] ) -;; Predicated floating-point comparisons. 
We don't need a version -;; of this for unordered comparisons. -(define_insn "*pred_fcm" - [(set (match_operand: 0 "register_operand" "=Upa, Upa") - (unspec: - [(match_operand: 1 "register_operand" "Upl, Upl") - (match_operand:SVE_F 2 "register_operand" "w, w") - (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w")] - SVE_COND_FP_CMP))] +;; ------------------------------------------------------------------------- +;; ---- [FP] Bfloat16 long ternary arithmetic (SF,BF,BF) +;; ------------------------------------------------------------------------- +;; Includes: +;; - BFDOT (BF16) +;; - BFMLALB (BF16) +;; - BFMLALT (BF16) +;; - BFMMLA (BF16) +;; ------------------------------------------------------------------------- + +(define_insn "@aarch64_sve_vnx4sf" + [(set (match_operand:VNx4SF 0 "register_operand" "=w, ?&w") + (unspec:VNx4SF + [(match_operand:VNx4SF 1 "register_operand" "0, w") + (match_operand:VNx8BF 2 "register_operand" "w, w") + (match_operand:VNx8BF 3 "register_operand" "w, w")] + SVE_BFLOAT_TERNARY_LONG))] + "TARGET_SVE_BF16" + "@ + \t%0.s, %2.h, %3.h + movprfx\t%0, %1\;\t%0.s, %2.h, %3.h" + [(set_attr "movprfx" "*,yes")] +) + +;; The immediate range is enforced before generating the instruction. +(define_insn "@aarch64_sve__lanevnx4sf" + [(set (match_operand:VNx4SF 0 "register_operand" "=w, ?&w") + (unspec:VNx4SF + [(match_operand:VNx4SF 1 "register_operand" "0, w") + (match_operand:VNx8BF 2 "register_operand" "w, w") + (match_operand:VNx8BF 3 "register_operand" "y, y") + (match_operand:SI 4 "const_int_operand")] + SVE_BFLOAT_TERNARY_LONG_LANE))] + "TARGET_SVE_BF16" + "@ + \t%0.s, %2.h, %3.h[%4] + movprfx\t%0, %1\;\t%0.s, %2.h, %3.h[%4]" + [(set_attr "movprfx" "*,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [FP] Matrix multiply-accumulate +;; ------------------------------------------------------------------------- +;; Includes: +;; - FMMLA (F32MM,F64MM) +;; ------------------------------------------------------------------------- + +;; The mode iterator enforces the target requirements. +(define_insn "@aarch64_sve_" + [(set (match_operand:SVE_MATMULF 0 "register_operand" "=w, ?&w") + (unspec:SVE_MATMULF + [(match_operand:SVE_MATMULF 2 "register_operand" "w, w") + (match_operand:SVE_MATMULF 3 "register_operand" "w, w") + (match_operand:SVE_MATMULF 1 "register_operand" "0, w")] + FMMLA))] "TARGET_SVE" "@ - fcm\t%0., %1/z, %2., #0.0 - fcm\t%0., %1/z, %2., %3." + \\t%0., %2., %3. + movprfx\t%0, %1\;\\t%0., %2., %3." 
+ [(set_attr "movprfx" "*,yes")] ) +;; ========================================================================= +;; == Comparisons and selects +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Select based on predicates +;; ------------------------------------------------------------------------- +;; Includes merging patterns for: +;; - FMOV +;; - MOV +;; - SEL +;; ------------------------------------------------------------------------- + ;; vcond_mask operand order: true, false, mask ;; UNSPEC_SEL operand order: mask, true, false (as for VEC_COND_EXPR) ;; SEL operand order: mask, true, false -(define_insn "vcond_mask_" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w") - (unspec:SVE_ALL - [(match_operand: 3 "register_operand" "Upa") - (match_operand:SVE_ALL 1 "register_operand" "w") - (match_operand:SVE_ALL 2 "register_operand" "w")] +(define_expand "@vcond_mask_" + [(set (match_operand:SVE_FULL 0 "register_operand") + (unspec:SVE_FULL + [(match_operand: 3 "register_operand") + (match_operand:SVE_FULL 1 "aarch64_sve_reg_or_dup_imm") + (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE" - "sel\t%0., %3, %1., %2." + { + if (register_operand (operands[1], mode)) + operands[2] = force_reg (mode, operands[2]); + } ) -;; Selects between a duplicated immediate and zero. -(define_insn "aarch64_sve_dup_const" - [(set (match_operand:SVE_I 0 "register_operand" "=w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl") - (match_operand:SVE_I 2 "aarch64_sve_dup_immediate") - (match_operand:SVE_I 3 "aarch64_simd_imm_zero")] +;; Selects between: +;; - two registers +;; - a duplicated immediate and a register +;; - a duplicated immediate and zero +(define_insn "*vcond_mask_" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w, w, ?w, ?&w, ?&w") + (unspec:SVE_FULL + [(match_operand: 3 "register_operand" "Upa, Upa, Upa, Upa, Upl, Upl, Upl") + (match_operand:SVE_FULL 1 "aarch64_sve_reg_or_dup_imm" "w, vss, vss, Ufc, Ufc, vss, Ufc") + (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero" "w, 0, Dz, 0, Dz, w, w")] + UNSPEC_SEL))] + "TARGET_SVE + && (!register_operand (operands[1], mode) + || register_operand (operands[2], mode))" + "@ + sel\t%0., %3, %1., %2. + mov\t%0., %3/m, #%I1 + mov\t%0., %3/z, #%I1 + fmov\t%0., %3/m, #%1 + movprfx\t%0., %3/z, %0.\;fmov\t%0., %3/m, #%1 + movprfx\t%0, %2\;mov\t%0., %3/m, #%I1 + movprfx\t%0, %2\;fmov\t%0., %3/m, #%1" + [(set_attr "movprfx" "*,*,*,*,yes,yes,yes")] +) + +;; Optimize selects between a duplicated scalar variable and another vector, +;; the latter of which can be a zero constant or a variable. Treat duplicates +;; of GPRs as being more expensive than duplicates of FPRs, since they +;; involve a cross-file move. 
+(define_insn "@aarch64_sel_dup" + [(set (match_operand:SVE_FULL 0 "register_operand" "=?w, w, ??w, ?&w, ??&w, ?&w") + (unspec:SVE_FULL + [(match_operand: 3 "register_operand" "Upa, Upa, Upl, Upl, Upl, Upl") + (vec_duplicate:SVE_FULL + (match_operand: 1 "register_operand" "r, w, r, w, r, w")) + (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero" "0, 0, Dz, Dz, w, w")] UNSPEC_SEL))] "TARGET_SVE" - "mov\t%0., %1/z, #%2" + "@ + mov\t%0., %3/m, %1 + mov\t%0., %3/m, %1 + movprfx\t%0., %3/z, %0.\;mov\t%0., %3/m, %1 + movprfx\t%0., %3/z, %0.\;mov\t%0., %3/m, %1 + movprfx\t%0, %2\;mov\t%0., %3/m, %1 + movprfx\t%0, %2\;mov\t%0., %3/m, %1" + [(set_attr "movprfx" "*,*,yes,yes,yes,yes")] ) +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Compare and select +;; ------------------------------------------------------------------------- +;; The patterns in this section are synthetic. +;; ------------------------------------------------------------------------- + ;; Integer (signed) vcond. Don't enforce an immediate range here, since it ;; depends on the comparison; leave it to aarch64_expand_sve_vcond instead. (define_expand "vcond" - [(set (match_operand:SVE_ALL 0 "register_operand") - (if_then_else:SVE_ALL + [(set (match_operand:SVE_FULL 0 "register_operand") + (if_then_else:SVE_FULL (match_operator 3 "comparison_operator" [(match_operand: 4 "register_operand") (match_operand: 5 "nonmemory_operand")]) - (match_operand:SVE_ALL 1 "register_operand") - (match_operand:SVE_ALL 2 "register_operand")))] + (match_operand:SVE_FULL 1 "nonmemory_operand") + (match_operand:SVE_FULL 2 "nonmemory_operand")))] "TARGET_SVE" { aarch64_expand_sve_vcond (mode, mode, operands); @@ -1647,13 +6555,13 @@ ;; Integer vcondu. Don't enforce an immediate range here, since it ;; depends on the comparison; leave it to aarch64_expand_sve_vcond instead. (define_expand "vcondu" - [(set (match_operand:SVE_ALL 0 "register_operand") - (if_then_else:SVE_ALL + [(set (match_operand:SVE_FULL 0 "register_operand") + (if_then_else:SVE_FULL (match_operator 3 "comparison_operator" [(match_operand: 4 "register_operand") (match_operand: 5 "nonmemory_operand")]) - (match_operand:SVE_ALL 1 "register_operand") - (match_operand:SVE_ALL 2 "register_operand")))] + (match_operand:SVE_FULL 1 "nonmemory_operand") + (match_operand:SVE_FULL 2 "nonmemory_operand")))] "TARGET_SVE" { aarch64_expand_sve_vcond (mode, mode, operands); @@ -1661,17 +6569,16 @@ } ) -;; Floating-point vcond. All comparisons except FCMUO allow a zero -;; operand; aarch64_expand_sve_vcond handles the case of an FCMUO -;; with zero. +;; Floating-point vcond. All comparisons except FCMUO allow a zero operand; +;; aarch64_expand_sve_vcond handles the case of an FCMUO with zero. 
(define_expand "vcond" - [(set (match_operand:SVE_SD 0 "register_operand") - (if_then_else:SVE_SD + [(set (match_operand:SVE_FULL_HSD 0 "register_operand") + (if_then_else:SVE_FULL_HSD (match_operator 3 "comparison_operator" [(match_operand: 4 "register_operand") (match_operand: 5 "aarch64_simd_reg_or_zero")]) - (match_operand:SVE_SD 1 "register_operand") - (match_operand:SVE_SD 2 "register_operand")))] + (match_operand:SVE_FULL_HSD 1 "nonmemory_operand") + (match_operand:SVE_FULL_HSD 2 "nonmemory_operand")))] "TARGET_SVE" { aarch64_expand_sve_vcond (mode, mode, operands); @@ -1679,6 +6586,22 @@ } ) +;; ------------------------------------------------------------------------- +;; ---- [INT] Comparisons +;; ------------------------------------------------------------------------- +;; Includes: +;; - CMPEQ +;; - CMPGE +;; - CMPGT +;; - CMPHI +;; - CMPHS +;; - CMPLE +;; - CMPLO +;; - CMPLS +;; - CMPLT +;; - CMPNE +;; ------------------------------------------------------------------------- + ;; Signed integer comparisons. Don't enforce an immediate range here, since ;; it depends on the comparison; leave it to aarch64_expand_sve_vec_cmp_int ;; instead. @@ -1686,9 +6609,9 @@ [(parallel [(set (match_operand: 0 "register_operand") (match_operator: 1 "comparison_operator" - [(match_operand:SVE_I 2 "register_operand") - (match_operand:SVE_I 3 "nonmemory_operand")])) - (clobber (reg:CC CC_REGNUM))])] + [(match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "nonmemory_operand")])) + (clobber (reg:CC_NZC CC_REGNUM))])] "TARGET_SVE" { aarch64_expand_sve_vec_cmp_int (operands[0], GET_CODE (operands[1]), @@ -1704,9 +6627,9 @@ [(parallel [(set (match_operand: 0 "register_operand") (match_operator: 1 "comparison_operator" - [(match_operand:SVE_I 2 "register_operand") - (match_operand:SVE_I 3 "nonmemory_operand")])) - (clobber (reg:CC CC_REGNUM))])] + [(match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "nonmemory_operand")])) + (clobber (reg:CC_NZC CC_REGNUM))])] "TARGET_SVE" { aarch64_expand_sve_vec_cmp_int (operands[0], GET_CODE (operands[1]), @@ -1715,14 +6638,285 @@ } ) +;; Predicated integer comparisons. +(define_insn "@aarch64_pred_cmp" + [(set (match_operand: 0 "register_operand" "=Upa, Upa") + (unspec: + [(match_operand: 1 "register_operand" "Upl, Upl") + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (SVE_INT_CMP: + (match_operand:SVE_FULL_I 3 "register_operand" "w, w") + (match_operand:SVE_FULL_I 4 "aarch64_sve_cmp__operand" ", w"))] + UNSPEC_PRED_Z)) + (clobber (reg:CC_NZC CC_REGNUM))] + "TARGET_SVE" + "@ + cmp\t%0., %1/z, %3., #%4 + cmp\t%0., %1/z, %3., %4." +) + +;; Predicated integer comparisons in which both the flag and predicate +;; results are interesting. +(define_insn_and_rewrite "*cmp_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upl, Upl") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (unspec: + [(match_operand 6) + (match_operand:SI 7 "aarch64_sve_ptrue_flag") + (SVE_INT_CMP: + (match_operand:SVE_FULL_I 2 "register_operand" "w, w") + (match_operand:SVE_FULL_I 3 "aarch64_sve_cmp__operand" ", w"))] + UNSPEC_PRED_Z)] + UNSPEC_PTEST)) + (set (match_operand: 0 "register_operand" "=Upa, Upa") + (unspec: + [(match_dup 6) + (match_dup 7) + (SVE_INT_CMP: + (match_dup 2) + (match_dup 3))] + UNSPEC_PRED_Z))] + "TARGET_SVE + && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" + "@ + cmp\t%0., %1/z, %2., #%3 + cmp\t%0., %1/z, %2., %3." 
+ "&& !rtx_equal_p (operands[4], operands[6])" + { + operands[6] = copy_rtx (operands[4]); + operands[7] = operands[5]; + } +) + +;; Predicated integer comparisons in which only the flags result is +;; interesting. +(define_insn_and_rewrite "*cmp_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upl, Upl") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (unspec: + [(match_operand 6) + (match_operand:SI 7 "aarch64_sve_ptrue_flag") + (SVE_INT_CMP: + (match_operand:SVE_FULL_I 2 "register_operand" "w, w") + (match_operand:SVE_FULL_I 3 "aarch64_sve_cmp__operand" ", w"))] + UNSPEC_PRED_Z)] + UNSPEC_PTEST)) + (clobber (match_scratch: 0 "=Upa, Upa"))] + "TARGET_SVE + && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" + "@ + cmp\t%0., %1/z, %2., #%3 + cmp\t%0., %1/z, %2., %3." + "&& !rtx_equal_p (operands[4], operands[6])" + { + operands[6] = copy_rtx (operands[4]); + operands[7] = operands[5]; + } +) + +;; Predicated integer comparisons, formed by combining a PTRUE-predicated +;; comparison with an AND. Split the instruction into its preferred form +;; at the earliest opportunity, in order to get rid of the redundant +;; operand 4. +(define_insn_and_split "*cmp_and" + [(set (match_operand: 0 "register_operand" "=Upa, Upa") + (and: + (unspec: + [(match_operand 4) + (const_int SVE_KNOWN_PTRUE) + (SVE_INT_CMP: + (match_operand:SVE_FULL_I 2 "register_operand" "w, w") + (match_operand:SVE_FULL_I 3 "aarch64_sve_cmp__operand" ", w"))] + UNSPEC_PRED_Z) + (match_operand: 1 "register_operand" "Upl, Upl"))) + (clobber (reg:CC_NZC CC_REGNUM))] + "TARGET_SVE" + "#" + "&& 1" + [(parallel + [(set (match_dup 0) + (unspec: + [(match_dup 1) + (const_int SVE_MAYBE_NOT_PTRUE) + (SVE_INT_CMP: + (match_dup 2) + (match_dup 3))] + UNSPEC_PRED_Z)) + (clobber (reg:CC_NZC CC_REGNUM))])] +) + +;; Predicated integer wide comparisons. +(define_insn "@aarch64_pred_cmp_wide" + [(set (match_operand: 0 "register_operand" "=Upa") + (unspec: + [(match_operand:VNx16BI 1 "register_operand" "Upl") + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (unspec: + [(match_operand:SVE_FULL_BHSI 3 "register_operand" "w") + (match_operand:VNx2DI 4 "register_operand" "w")] + SVE_COND_INT_CMP_WIDE)] + UNSPEC_PRED_Z)) + (clobber (reg:CC_NZC CC_REGNUM))] + "TARGET_SVE" + "cmp\t%0., %1/z, %3., %4.d" +) + +;; Predicated integer wide comparisons in which both the flag and +;; predicate results are interesting. +(define_insn "*aarch64_pred_cmp_wide_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upl") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (unspec: + [(match_operand:VNx16BI 6 "register_operand" "Upl") + (match_operand:SI 7 "aarch64_sve_ptrue_flag") + (unspec: + [(match_operand:SVE_FULL_BHSI 2 "register_operand" "w") + (match_operand:VNx2DI 3 "register_operand" "w")] + SVE_COND_INT_CMP_WIDE)] + UNSPEC_PRED_Z)] + UNSPEC_PTEST)) + (set (match_operand: 0 "register_operand" "=Upa") + (unspec: + [(match_dup 6) + (match_dup 7) + (unspec: + [(match_dup 2) + (match_dup 3)] + SVE_COND_INT_CMP_WIDE)] + UNSPEC_PRED_Z))] + "TARGET_SVE + && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" + "cmp\t%0., %1/z, %2., %3.d" +) + +;; Predicated integer wide comparisons in which only the flags result +;; is interesting. 
+(define_insn "*aarch64_pred_cmp_wide_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upl") + (match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (unspec: + [(match_operand:VNx16BI 6 "register_operand" "Upl") + (match_operand:SI 7 "aarch64_sve_ptrue_flag") + (unspec: + [(match_operand:SVE_FULL_BHSI 2 "register_operand" "w") + (match_operand:VNx2DI 3 "register_operand" "w")] + SVE_COND_INT_CMP_WIDE)] + UNSPEC_PRED_Z)] + UNSPEC_PTEST)) + (clobber (match_scratch: 0 "=Upa"))] + "TARGET_SVE + && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])" + "cmp\t%0., %1/z, %2., %3.d" +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] While tests +;; ------------------------------------------------------------------------- +;; Includes: +;; - WHILELE +;; - WHILELO +;; - WHILELS +;; - WHILELT +;; ------------------------------------------------------------------------- + +;; Set element I of the result if (cmp (plus operand1 J) operand2) is +;; true for all J in [0, I]. +(define_insn "@while_" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ") + (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")] + SVE_WHILE)) + (clobber (reg:CC_NZC CC_REGNUM))] + "TARGET_SVE" + "while\t%0., %1, %2" +) + +;; The WHILE instructions set the flags in the same way as a PTEST with +;; a PTRUE GP. Handle the case in which both results are useful. The GP +;; operands to the PTEST aren't needed, so we allow them to be anything. +(define_insn_and_rewrite "*while__cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand 3) + (match_operand 4) + (const_int SVE_KNOWN_PTRUE) + (unspec:PRED_ALL + [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ") + (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")] + SVE_WHILE)] + UNSPEC_PTEST)) + (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL [(match_dup 1) + (match_dup 2)] + SVE_WHILE))] + "TARGET_SVE" + "while\t%0., %1, %2" + ;; Force the compiler to drop the unused predicate operand, so that we + ;; don't have an unnecessary PTRUE. + "&& (!CONSTANT_P (operands[3]) || !CONSTANT_P (operands[4]))" + { + operands[3] = CONSTM1_RTX (VNx16BImode); + operands[4] = CONSTM1_RTX (mode); + } +) + +;; Same, but handle the case in which only the flags result is useful. +(define_insn_and_rewrite "*while__ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand 3) + (match_operand 4) + (const_int SVE_KNOWN_PTRUE) + (unspec:PRED_ALL + [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ") + (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")] + SVE_WHILE)] + UNSPEC_PTEST)) + (clobber (match_scratch:PRED_ALL 0 "=Upa"))] + "TARGET_SVE" + "while\t%0., %1, %2" + ;; Force the compiler to drop the unused predicate operand, so that we + ;; don't have an unnecessary PTRUE. + "&& (!CONSTANT_P (operands[3]) || !CONSTANT_P (operands[4]))" + { + operands[3] = CONSTM1_RTX (VNx16BImode); + operands[4] = CONSTM1_RTX (mode); + } +) + +;; ------------------------------------------------------------------------- +;; ---- [FP] Direct comparisons +;; ------------------------------------------------------------------------- +;; Includes: +;; - FCMEQ +;; - FCMGE +;; - FCMGT +;; - FCMLE +;; - FCMLT +;; - FCMNE +;; - FCMUO +;; ------------------------------------------------------------------------- + ;; Floating-point comparisons. 
All comparisons except FCMUO allow a zero ;; operand; aarch64_expand_sve_vec_cmp_float handles the case of an FCMUO ;; with zero. (define_expand "vec_cmp" [(set (match_operand: 0 "register_operand") (match_operator: 1 "comparison_operator" - [(match_operand:SVE_F 2 "register_operand") - (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero")]))] + [(match_operand:SVE_FULL_F 2 "register_operand") + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")]))] "TARGET_SVE" { aarch64_expand_sve_vec_cmp_float (operands[0], GET_CODE (operands[1]), @@ -1731,6 +6925,172 @@ } ) +;; Predicated floating-point comparisons. +(define_insn "@aarch64_pred_fcm" + [(set (match_operand: 0 "register_operand" "=Upa, Upa") + (unspec: + [(match_operand: 1 "register_operand" "Upl, Upl") + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (match_operand:SVE_FULL_F 3 "register_operand" "w, w") + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "Dz, w")] + SVE_COND_FP_CMP_I0))] + "TARGET_SVE" + "@ + fcm\t%0., %1/z, %3., #0.0 + fcm\t%0., %1/z, %3., %4." +) + +;; Same for unordered comparisons. +(define_insn "@aarch64_pred_fcmuo" + [(set (match_operand: 0 "register_operand" "=Upa") + (unspec: + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (match_operand:SVE_FULL_F 3 "register_operand" "w") + (match_operand:SVE_FULL_F 4 "register_operand" "w")] + UNSPEC_COND_FCMUO))] + "TARGET_SVE" + "fcmuo\t%0., %1/z, %3., %4." +) + +;; Floating-point comparisons predicated on a PTRUE, with the results ANDed +;; with another predicate P. This does not have the same trapping behavior +;; as predicating the comparison itself on P, but it's a legitimate fold, +;; since we can drop any potentially-trapping operations whose results +;; are not needed. +;; +;; Split the instruction into its preferred form (below) at the earliest +;; opportunity, in order to get rid of the redundant operand 1. +(define_insn_and_split "*fcm_and_combine" + [(set (match_operand: 0 "register_operand" "=Upa, Upa") + (and: + (unspec: + [(match_operand: 1) + (const_int SVE_KNOWN_PTRUE) + (match_operand:SVE_FULL_F 2 "register_operand" "w, w") + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "Dz, w")] + SVE_COND_FP_CMP_I0) + (match_operand: 4 "register_operand" "Upl, Upl")))] + "TARGET_SVE" + "#" + "&& 1" + [(set (match_dup 0) + (unspec: + [(match_dup 4) + (const_int SVE_MAYBE_NOT_PTRUE) + (match_dup 2) + (match_dup 3)] + SVE_COND_FP_CMP_I0))] +) + +;; Same for unordered comparisons. +(define_insn_and_split "*fcmuo_and_combine" + [(set (match_operand: 0 "register_operand" "=Upa") + (and: + (unspec: + [(match_operand: 1) + (const_int SVE_KNOWN_PTRUE) + (match_operand:SVE_FULL_F 2 "register_operand" "w") + (match_operand:SVE_FULL_F 3 "register_operand" "w")] + UNSPEC_COND_FCMUO) + (match_operand: 4 "register_operand" "Upl")))] + "TARGET_SVE" + "#" + "&& 1" + [(set (match_dup 0) + (unspec: + [(match_dup 4) + (const_int SVE_MAYBE_NOT_PTRUE) + (match_dup 2) + (match_dup 3)] + UNSPEC_COND_FCMUO))] +) + +;; ------------------------------------------------------------------------- +;; ---- [FP] Absolute comparisons +;; ------------------------------------------------------------------------- +;; Includes: +;; - FACGE +;; - FACGT +;; - FACLE +;; - FACLT +;; ------------------------------------------------------------------------- + +;; Predicated floating-point absolute comparisons. 
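;; As an illustrative sketch (assumed source code, not taken from the patch),
;; a loop such as:
;;   for (int i = 0; i < n; ++i)
;;     p[i] = __builtin_fabs (a[i]) > __builtin_fabs (b[i]);
;; can be implemented with a single FACGT instead of two FABS operations
;; followed by an FCMGT.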
+(define_expand "@aarch64_pred_fac" + [(set (match_operand: 0 "register_operand") + (unspec: + [(match_operand: 1 "register_operand") + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (unspec:SVE_FULL_F + [(match_dup 1) + (match_dup 2) + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_dup 1) + (match_dup 2) + (match_operand:SVE_FULL_F 4 "register_operand")] + UNSPEC_COND_FABS)] + SVE_COND_FP_ABS_CMP))] + "TARGET_SVE" +) + +(define_insn_and_rewrite "*aarch64_pred_fac" + [(set (match_operand: 0 "register_operand" "=Upa") + (unspec: + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 4 "aarch64_sve_ptrue_flag") + (unspec:SVE_FULL_F + [(match_operand 5) + (match_operand:SI 6 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_operand 7) + (match_operand:SI 8 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 3 "register_operand" "w")] + UNSPEC_COND_FABS)] + SVE_COND_FP_ABS_CMP))] + "TARGET_SVE + && aarch64_sve_pred_dominates_p (&operands[5], operands[1]) + && aarch64_sve_pred_dominates_p (&operands[7], operands[1])" + "fac\t%0., %1/z, %2., %3." + "&& (!rtx_equal_p (operands[1], operands[5]) + || !rtx_equal_p (operands[1], operands[7]))" + { + operands[5] = copy_rtx (operands[1]); + operands[7] = copy_rtx (operands[1]); + } +) + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Select +;; ------------------------------------------------------------------------- +;; Includes: +;; - SEL +;; ------------------------------------------------------------------------- + +(define_insn "@vcond_mask_" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (ior:PRED_ALL + (and:PRED_ALL + (match_operand:PRED_ALL 3 "register_operand" "Upa") + (match_operand:PRED_ALL 1 "register_operand" "Upa")) + (and:PRED_ALL + (not (match_dup 3)) + (match_operand:PRED_ALL 2 "register_operand" "Upa"))))] + "TARGET_SVE" + "sel\t%0.b, %3, %1.b, %2.b" +) + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Test bits +;; ------------------------------------------------------------------------- +;; Includes: +;; - PTEST +;; ------------------------------------------------------------------------- + ;; Branch based on predicate equality or inequality. (define_expand "cbranch4" [(set (pc) @@ -1742,1409 +7102,2120 @@ (pc)))] "" { - rtx ptrue = force_reg (mode, CONSTM1_RTX (mode)); + rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all ()); + rtx cast_ptrue = gen_lowpart (mode, ptrue); + rtx ptrue_flag = gen_int_mode (SVE_KNOWN_PTRUE, SImode); rtx pred; if (operands[2] == CONST0_RTX (mode)) pred = operands[1]; else { pred = gen_reg_rtx (mode); - emit_insn (gen_pred_xor3 (pred, ptrue, operands[1], - operands[2])); + emit_insn (gen_aarch64_pred_xor_z (pred, cast_ptrue, operands[1], + operands[2])); } - emit_insn (gen_ptest_ptrue (ptrue, pred)); - operands[1] = gen_rtx_REG (CCmode, CC_REGNUM); + emit_insn (gen_aarch64_ptest (ptrue, cast_ptrue, ptrue_flag, pred)); + operands[1] = gen_rtx_REG (CC_NZCmode, CC_REGNUM); operands[2] = const0_rtx; } ) -;; Unpredicated integer MIN/MAX. -(define_expand "3" - [(set (match_operand:SVE_I 0 "register_operand") - (unspec:SVE_I - [(match_dup 3) - (MAXMIN:SVE_I (match_operand:SVE_I 1 "register_operand") - (match_operand:SVE_I 2 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; See "Description of UNSPEC_PTEST" above for details. 
+(define_insn "aarch64_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC [(match_operand:VNx16BI 0 "register_operand" "Upa") + (match_operand 1) + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (match_operand:PRED_ALL 3 "register_operand" "Upa")] + UNSPEC_PTEST))] + "TARGET_SVE" + "ptest\t%0, %3.b" +) + +;; ========================================================================= +;; == Reductions +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Conditional reductions +;; ------------------------------------------------------------------------- +;; Includes: +;; - CLASTA +;; - CLASTB +;; ------------------------------------------------------------------------- + +;; Set operand 0 to the last active element in operand 3, or to tied +;; operand 1 if no elements are active. +(define_insn "@fold_extract__" + [(set (match_operand: 0 "register_operand" "=?r, w") + (unspec: + [(match_operand: 1 "register_operand" "0, 0") + (match_operand: 2 "register_operand" "Upl, Upl") + (match_operand:SVE_FULL 3 "register_operand" "w, w")] + CLAST))] "TARGET_SVE" - { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); - } + "@ + clast\t%0, %2, %0, %3. + clast\t%0, %2, %0, %3." ) -;; Integer MIN/MAX predicated with a PTRUE. -(define_insn "*3" - [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl") - (MAXMIN:SVE_I (match_operand:SVE_I 2 "register_operand" "%0, w") - (match_operand:SVE_I 3 "register_operand" "w, w"))] - UNSPEC_MERGE_PTRUE))] +(define_insn "@aarch64_fold_extract_vector__" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL + [(match_operand:SVE_FULL 1 "register_operand" "0, w") + (match_operand: 2 "register_operand" "Upl, Upl") + (match_operand:SVE_FULL 3 "register_operand" "w, w")] + CLAST))] "TARGET_SVE" "@ - \t%0., %1/m, %0., %3. - movprfx\t%0, %2\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,yes")] -) + clast\t%0., %2, %0., %3. + movprfx\t%0, %1\;clast\t%0., %2, %0., %3." +) + +;; ------------------------------------------------------------------------- +;; ---- [INT] Tree reductions +;; ------------------------------------------------------------------------- +;; Includes: +;; - ANDV +;; - EORV +;; - ORV +;; - SADDV +;; - SMAXV +;; - SMINV +;; - UADDV +;; - UMAXV +;; - UMINV +;; ------------------------------------------------------------------------- -;; Unpredicated floating-point MIN/MAX. -(define_expand "3" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 3) - (FMAXMIN:SVE_F (match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; Unpredicated integer add reduction. +(define_expand "reduc_plus_scal_" + [(match_operand: 0 "register_operand") + (match_operand:SVE_FULL_I 1 "register_operand")] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + rtx pred = aarch64_ptrue_reg (mode); + rtx tmp = mode == DImode ? operands[0] : gen_reg_rtx (DImode); + emit_insn (gen_aarch64_pred_reduc_uadd_ (tmp, pred, operands[1])); + if (tmp != operands[0]) + emit_move_insn (operands[0], gen_lowpart (mode, tmp)); + DONE; } ) -;; Floating-point MIN/MAX predicated with a PTRUE. 
-(define_insn "*3" - [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl") - (FMAXMIN:SVE_F (match_operand:SVE_F 2 "register_operand" "%0, w") - (match_operand:SVE_F 3 "register_operand" "w, w"))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" - "@ - fnm\t%0., %1/m, %0., %3. - movprfx\t%0, %2\;fnm\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,yes")] +;; Predicated integer add reduction. The result is always 64-bits. +(define_insn "@aarch64_pred_reduc__" + [(set (match_operand:DI 0 "register_operand" "=w") + (unspec:DI [(match_operand: 1 "register_operand" "Upl") + (match_operand:SVE_FULL_I 2 "register_operand" "w")] + SVE_INT_ADDV))] + "TARGET_SVE && >= " + "addv\t%d0, %1, %2." ) -;; Unpredicated fmin/fmax. -(define_expand "3" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 3) - (unspec:SVE_F [(match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "register_operand")] - FMAXMIN_UNS)] - UNSPEC_MERGE_PTRUE))] +;; Unpredicated integer reductions. +(define_expand "reduc__scal_" + [(set (match_operand: 0 "register_operand") + (unspec: [(match_dup 2) + (match_operand:SVE_FULL_I 1 "register_operand")] + SVE_INT_REDUCTION))] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + operands[2] = aarch64_ptrue_reg (mode); } ) -;; fmin/fmax predicated with a PTRUE. -(define_insn "*3" - [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl") - (unspec:SVE_F [(match_operand:SVE_F 2 "register_operand" "%0, w") - (match_operand:SVE_F 3 "register_operand" "w, w")] - FMAXMIN_UNS)] - UNSPEC_MERGE_PTRUE))] +;; Predicated integer reductions. +(define_insn "@aarch64_pred_reduc__" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: [(match_operand: 1 "register_operand" "Upl") + (match_operand:SVE_FULL_I 2 "register_operand" "w")] + SVE_INT_REDUCTION))] "TARGET_SVE" - "@ - \t%0., %1/m, %0., %3. - movprfx\t%0, %2\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,yes")] + "\t%0, %1, %2." ) -;; Predicated integer operations with select. -(define_expand "cond_" - [(set (match_operand:SVE_I 0 "register_operand") - (unspec:SVE_I - [(match_operand: 1 "register_operand") - (SVE_INT_BINARY:SVE_I - (match_operand:SVE_I 2 "register_operand") - (match_operand:SVE_I 3 "register_operand")) - (match_operand:SVE_I 4 "aarch64_simd_reg_or_zero")] - UNSPEC_SEL))] - "TARGET_SVE" -) +;; ------------------------------------------------------------------------- +;; ---- [FP] Tree reductions +;; ------------------------------------------------------------------------- +;; Includes: +;; - FADDV +;; - FMAXNMV +;; - FMAXV +;; - FMINNMV +;; - FMINV +;; ------------------------------------------------------------------------- -(define_expand "cond_" - [(set (match_operand:SVE_SDI 0 "register_operand") - (unspec:SVE_SDI - [(match_operand: 1 "register_operand") - (SVE_INT_BINARY_SD:SVE_SDI - (match_operand:SVE_SDI 2 "register_operand") - (match_operand:SVE_SDI 3 "register_operand")) - (match_operand:SVE_SDI 4 "aarch64_simd_reg_or_zero")] - UNSPEC_SEL))] +;; Unpredicated floating-point tree reductions. +(define_expand "reduc__scal_" + [(set (match_operand: 0 "register_operand") + (unspec: [(match_dup 2) + (match_operand:SVE_FULL_F 1 "register_operand")] + SVE_FP_REDUCTION))] "TARGET_SVE" + { + operands[2] = aarch64_ptrue_reg (mode); + } ) -;; Predicated integer operations with select matching the output operand. 
-(define_insn "*cond__0" - [(set (match_operand:SVE_I 0 "register_operand" "+w, w, ?&w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (SVE_INT_BINARY:SVE_I - (match_operand:SVE_I 2 "register_operand" "0, w, w") - (match_operand:SVE_I 3 "register_operand" "w, 0, w")) - (match_dup 0)] - UNSPEC_SEL))] +;; Predicated floating-point tree reductions. +(define_insn "@aarch64_pred_reduc__" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: [(match_operand: 1 "register_operand" "Upl") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + SVE_FP_REDUCTION))] "TARGET_SVE" - "@ - \t%0., %1/m, %0., %3. - \t%0., %1/m, %0., %2. - movprfx\t%0, %1/m, %2\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,*,yes")] + "\t%0, %1, %2." ) -(define_insn "*cond__0" - [(set (match_operand:SVE_SDI 0 "register_operand" "+w, w, ?&w") - (unspec:SVE_SDI - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (SVE_INT_BINARY_SD:SVE_SDI - (match_operand:SVE_SDI 2 "register_operand" "0, w, w") - (match_operand:SVE_SDI 3 "register_operand" "w, 0, w")) - (match_dup 0)] - UNSPEC_SEL))] - "TARGET_SVE" - "@ - \t%0., %1/m, %0., %3. - \t%0., %1/m, %0., %2. - movprfx\t%0, %1/m, %2\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,*,yes")] -) +;; ------------------------------------------------------------------------- +;; ---- [FP] Left-to-right reductions +;; ------------------------------------------------------------------------- +;; Includes: +;; - FADDA +;; ------------------------------------------------------------------------- -;; Predicated integer operations with select matching the first operand. -(define_insn "*cond__2" - [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl") - (SVE_INT_BINARY:SVE_I - (match_operand:SVE_I 2 "register_operand" "0, w") - (match_operand:SVE_I 3 "register_operand" "w, w")) - (match_dup 2)] - UNSPEC_SEL))] +;; Unpredicated in-order FP reductions. +(define_expand "fold_left_plus_" + [(set (match_operand: 0 "register_operand") + (unspec: [(match_dup 3) + (match_operand: 1 "register_operand") + (match_operand:SVE_FULL_F 2 "register_operand")] + UNSPEC_FADDA))] "TARGET_SVE" - "@ - \t%0., %1/m, %0., %3. - movprfx\t%0, %2\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,yes")] + { + operands[3] = aarch64_ptrue_reg (mode); + } ) -(define_insn "*cond__2" - [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") - (unspec:SVE_SDI - [(match_operand: 1 "register_operand" "Upl, Upl") - (SVE_INT_BINARY_SD:SVE_SDI - (match_operand:SVE_SDI 2 "register_operand" "0, w") - (match_operand:SVE_SDI 3 "register_operand" "w, w")) - (match_dup 2)] - UNSPEC_SEL))] +;; Predicated in-order FP reductions. +(define_insn "mask_fold_left_plus_" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: [(match_operand: 3 "register_operand" "Upl") + (match_operand: 1 "register_operand" "0") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + UNSPEC_FADDA))] "TARGET_SVE" - "@ - \t%0., %1/m, %0., %3. - movprfx\t%0, %2\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,yes")] + "fadda\t%0, %3, %0, %2." ) -;; Predicated integer operations with select matching the second operand. 
-(define_insn "*cond__3" - [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl, Upl") - (SVE_INT_BINARY:SVE_I - (match_operand:SVE_I 2 "register_operand" "w, w") - (match_operand:SVE_I 3 "register_operand" "0, w")) - (match_dup 3)] - UNSPEC_SEL))] - "TARGET_SVE" - "@ - \t%0., %1/m, %0., %2. - movprfx\t%0, %3\;\t%0., %1/m, %0., %2." - [(set_attr "movprfx" "*,yes")] -) +;; ========================================================================= +;; == Permutes +;; ========================================================================= -(define_insn "*cond__3" - [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") - (unspec:SVE_SDI - [(match_operand: 1 "register_operand" "Upl, Upl") - (SVE_INT_BINARY_SD:SVE_SDI - (match_operand:SVE_SDI 2 "register_operand" "w, w") - (match_operand:SVE_SDI 3 "register_operand" "0, w")) - (match_dup 3)] - UNSPEC_SEL))] - "TARGET_SVE" - "@ - \t%0., %1/m, %0., %2. - movprfx\t%0, %3\;\t%0., %1/m, %0., %2." - [(set_attr "movprfx" "*,yes")] -) +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] General permutes +;; ------------------------------------------------------------------------- +;; Includes: +;; - TBL +;; ------------------------------------------------------------------------- -;; Predicated integer operations with select matching zero. -(define_insn "*cond__z" - [(set (match_operand:SVE_I 0 "register_operand" "=&w") - (unspec:SVE_I - [(match_operand: 1 "register_operand" "Upl") - (SVE_INT_BINARY:SVE_I - (match_operand:SVE_I 2 "register_operand" "w") - (match_operand:SVE_I 3 "register_operand" "w")) - (match_operand:SVE_I 4 "aarch64_simd_imm_zero")] - UNSPEC_SEL))] - "TARGET_SVE" - "movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "yes")] +(define_expand "vec_perm" + [(match_operand:SVE_FULL 0 "register_operand") + (match_operand:SVE_FULL 1 "register_operand") + (match_operand:SVE_FULL 2 "register_operand") + (match_operand: 3 "aarch64_sve_vec_perm_operand")] + "TARGET_SVE && GET_MODE_NUNITS (mode).is_constant ()" + { + aarch64_expand_sve_vec_perm (operands[0], operands[1], + operands[2], operands[3]); + DONE; + } ) -(define_insn "*cond__z" - [(set (match_operand:SVE_SDI 0 "register_operand" "=&w") - (unspec:SVE_SDI - [(match_operand: 1 "register_operand" "Upl") - (SVE_INT_BINARY_SD:SVE_SDI - (match_operand:SVE_SDI 2 "register_operand" "w") - (match_operand:SVE_SDI 3 "register_operand" "w")) - (match_operand:SVE_SDI 4 "aarch64_simd_imm_zero")] - UNSPEC_SEL))] +(define_insn "@aarch64_sve_tbl" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand:SVE_FULL 1 "register_operand" "w") + (match_operand: 2 "register_operand" "w")] + UNSPEC_TBL))] "TARGET_SVE" - "movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "yes")] + "tbl\t%0., %1., %2." ) -;; Synthetic predications with select unmatched. -(define_insn "*cond__any" - [(set (match_operand:SVE_I 0 "register_operand" "=&w") - (unspec:SVE_I +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Special-purpose unary permutes +;; ------------------------------------------------------------------------- +;; Includes: +;; - COMPACT +;; - DUP +;; - REV +;; ------------------------------------------------------------------------- + +;; Compact active elements and pad with zeros. 
+(define_insn "@aarch64_sve_compact" + [(set (match_operand:SVE_FULL_SD 0 "register_operand" "=w") + (unspec:SVE_FULL_SD [(match_operand: 1 "register_operand" "Upl") - (SVE_INT_BINARY:SVE_I - (match_operand:SVE_I 2 "register_operand" "w") - (match_operand:SVE_I 3 "register_operand" "w")) - (match_operand:SVE_I 4 "register_operand" "w")] - UNSPEC_SEL))] + (match_operand:SVE_FULL_SD 2 "register_operand" "w")] + UNSPEC_SVE_COMPACT))] "TARGET_SVE" - "#" + "compact\t%0., %1, %2." ) -(define_insn "*cond__any" - [(set (match_operand:SVE_SDI 0 "register_operand" "=&w") - (unspec:SVE_SDI - [(match_operand: 1 "register_operand" "Upl") - (SVE_INT_BINARY_SD:SVE_I - (match_operand:SVE_SDI 2 "register_operand" "w") - (match_operand:SVE_SDI 3 "register_operand" "w")) - (match_operand:SVE_SDI 4 "register_operand" "w")] - UNSPEC_SEL))] - "TARGET_SVE" - "#" +;; Duplicate one element of a vector. +(define_insn "@aarch64_sve_dup_lane" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (vec_duplicate:SVE_FULL + (vec_select: + (match_operand:SVE_FULL 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "const_int_operand")]))))] + "TARGET_SVE + && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (mode), 0, 63)" + "dup\t%0., %1.[%2]" ) -(define_split - [(set (match_operand:SVE_I 0 "register_operand") - (unspec:SVE_I - [(match_operand: 1 "register_operand") - (match_operator:SVE_I 5 "aarch64_sve_any_binary_operator" - [(match_operand:SVE_I 2 "register_operand") - (match_operand:SVE_I 3 "register_operand")]) - (match_operand:SVE_I 4 "register_operand")] - UNSPEC_SEL))] - "TARGET_SVE && reload_completed - && !(rtx_equal_p (operands[0], operands[4]) - || rtx_equal_p (operands[2], operands[4]) - || rtx_equal_p (operands[3], operands[4]))" - ; Not matchable by any one insn or movprfx insn. We need a separate select. - [(set (match_dup 0) - (unspec:SVE_I [(match_dup 1) (match_dup 2) (match_dup 4)] - UNSPEC_SEL)) - (set (match_dup 0) - (unspec:SVE_I - [(match_dup 1) - (match_op_dup 5 [(match_dup 0) (match_dup 3)]) - (match_dup 0)] - UNSPEC_SEL))] +;; Use DUP.Q to duplicate a 128-bit segment of a register. +;; +;; The vec_select: sets memory lane number N of the V128 to lane +;; number op2 + N of op1. (We don't need to distinguish between memory +;; and architectural register lane numbering for op1 or op0, since the +;; two numbering schemes are the same for SVE.) +;; +;; The vec_duplicate:SVE_FULL then copies memory lane number N of the +;; V128 (and thus lane number op2 + N of op1) to lane numbers N + I * STEP +;; of op0. We therefore get the correct result for both endiannesses. +;; +;; The wrinkle is that for big-endian V128 registers, memory lane numbering +;; is in the opposite order to architectural register lane numbering. +;; Thus if we were to do this operation via a V128 temporary register, +;; the vec_select and vec_duplicate would both involve a reverse operation +;; for big-endian targets. In this fused pattern the two reverses cancel +;; each other out. 
+(define_insn "@aarch64_sve_dupq_lane" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (vec_duplicate:SVE_FULL + (vec_select: + (match_operand:SVE_FULL 1 "register_operand" "w") + (match_operand 2 "ascending_int_parallel"))))] + "TARGET_SVE + && (INTVAL (XVECEXP (operands[2], 0, 0)) + * GET_MODE_SIZE (mode)) % 16 == 0 + && IN_RANGE (INTVAL (XVECEXP (operands[2], 0, 0)) + * GET_MODE_SIZE (mode), 0, 63)" + { + unsigned int byte = (INTVAL (XVECEXP (operands[2], 0, 0)) + * GET_MODE_SIZE (mode)); + operands[2] = gen_int_mode (byte / 16, DImode); + return "dup\t%0.q, %1.q[%2]"; + } ) -;; Set operand 0 to the last active element in operand 3, or to tied -;; operand 1 if no elements are active. -(define_insn "fold_extract_last_" - [(set (match_operand: 0 "register_operand" "=r, w") - (unspec: - [(match_operand: 1 "register_operand" "0, 0") - (match_operand: 2 "register_operand" "Upl, Upl") - (match_operand:SVE_ALL 3 "register_operand" "w, w")] - UNSPEC_CLASTB))] +;; Reverse the order of elements within a full vector. +(define_insn "@aarch64_sve_rev" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand:SVE_FULL 1 "register_operand" "w")] + UNSPEC_REV))] "TARGET_SVE" - "@ - clastb\t%0, %2, %0, %3. - clastb\t%0, %2, %0, %3." -) + "rev\t%0., %1.") -;; Unpredicated integer add reduction. -(define_expand "reduc_plus_scal_" - [(set (match_operand: 0 "register_operand") - (unspec: [(match_dup 2) - (match_operand:SVE_I 1 "register_operand")] - UNSPEC_ADDV))] +;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Special-purpose binary permutes +;; ------------------------------------------------------------------------- +;; Includes: +;; - SPLICE +;; - TRN1 +;; - TRN2 +;; - UZP1 +;; - UZP2 +;; - ZIP1 +;; - ZIP2 +;; ------------------------------------------------------------------------- + +;; Like EXT, but start at the first active element. +(define_insn "@aarch64_sve_splice" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL + [(match_operand: 1 "register_operand" "Upl, Upl") + (match_operand:SVE_FULL 2 "register_operand" "0, w") + (match_operand:SVE_FULL 3 "register_operand" "w, w")] + UNSPEC_SVE_SPLICE))] "TARGET_SVE" + "@ + splice\t%0., %1, %0., %3. + movprfx\t%0, %2\;splice\t%0., %1, %0., %3." + [(set_attr "movprfx" "*, yes")] +) + +;; Permutes that take half the elements from one vector and half the +;; elements from the other. +(define_insn "@aarch64_sve_" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand:SVE_FULL 1 "register_operand" "w") + (match_operand:SVE_FULL 2 "register_operand" "w")] + PERMUTE))] + "TARGET_SVE" + "\t%0., %1., %2." +) + +;; Apply PERMUTE to 128-bit sequences. The behavior of these patterns +;; doesn't depend on the mode. +(define_insn "@aarch64_sve_" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w") + (unspec:SVE_FULL + [(match_operand:SVE_FULL 1 "register_operand" "w") + (match_operand:SVE_FULL 2 "register_operand" "w")] + PERMUTEQ))] + "TARGET_SVE_F64MM" + "\t%0.q, %1.q, %2.q" +) + +;; Concatenate two vectors and extract a subvector. Note that the +;; immediate (third) operand is the lane index not the byte index. 
+(define_insn "@aarch64_sve_ext" + [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w") + (unspec:SVE_FULL + [(match_operand:SVE_FULL 1 "register_operand" "0, w") + (match_operand:SVE_FULL 2 "register_operand" "w, w") + (match_operand:SI 3 "const_int_operand")] + UNSPEC_EXT))] + "TARGET_SVE + && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (mode), 0, 255)" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (mode)); + return (which_alternative == 0 + ? "ext\\t%0.b, %0.b, %2.b, #%3" + : "movprfx\t%0, %1\;ext\\t%0.b, %0.b, %2.b, #%3"); } + [(set_attr "movprfx" "*,yes")] ) -;; Predicated integer add reduction. The result is always 64-bits. -(define_insn "*reduc_plus_scal_" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "Upl") - (match_operand:SVE_I 2 "register_operand" "w")] - UNSPEC_ADDV))] +;; ------------------------------------------------------------------------- +;; ---- [PRED] Special-purpose unary permutes +;; ------------------------------------------------------------------------- +;; Includes: +;; - REV +;; ------------------------------------------------------------------------- + +(define_insn "@aarch64_sve_rev" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL [(match_operand:PRED_ALL 1 "register_operand" "Upa")] + UNSPEC_REV))] "TARGET_SVE" - "uaddv\t%d0, %1, %2." -) + "rev\t%0., %1.") -;; Unpredicated floating-point add reduction. -(define_expand "reduc_plus_scal_" - [(set (match_operand: 0 "register_operand") - (unspec: [(match_dup 2) - (match_operand:SVE_F 1 "register_operand")] - UNSPEC_FADDV))] +;; ------------------------------------------------------------------------- +;; ---- [PRED] Special-purpose binary permutes +;; ------------------------------------------------------------------------- +;; Includes: +;; - TRN1 +;; - TRN2 +;; - UZP1 +;; - UZP2 +;; - ZIP1 +;; - ZIP2 +;; ------------------------------------------------------------------------- + +;; Permutes that take half the elements from one vector and half the +;; elements from the other. +(define_insn "@aarch64_sve_" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL [(match_operand:PRED_ALL 1 "register_operand" "Upa") + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + PERMUTE))] "TARGET_SVE" - { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); - } + "\t%0., %1., %2." ) -;; Predicated floating-point add reduction. -(define_insn "*reduc_plus_scal_" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "Upl") - (match_operand:SVE_F 2 "register_operand" "w")] - UNSPEC_FADDV))] +;; ========================================================================= +;; == Conversions +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [INT<-INT] Packs +;; ------------------------------------------------------------------------- +;; Includes: +;; - UZP1 +;; ------------------------------------------------------------------------- + +;; Integer pack. Use UZP1 on the narrower type, which discards +;; the high part of each wide element. 
+(define_insn "vec_pack_trunc_" + [(set (match_operand:SVE_FULL_BHSI 0 "register_operand" "=w") + (unspec:SVE_FULL_BHSI + [(match_operand: 1 "register_operand" "w") + (match_operand: 2 "register_operand" "w")] + UNSPEC_PACK))] "TARGET_SVE" - "faddv\t%0, %1, %2." + "uzp1\t%0., %1., %2." ) -;; Unpredicated integer MIN/MAX reduction. -(define_expand "reduc__scal_" - [(set (match_operand: 0 "register_operand") - (unspec: [(match_dup 2) - (match_operand:SVE_I 1 "register_operand")] - MAXMINV))] +;; ------------------------------------------------------------------------- +;; ---- [INT<-INT] Unpacks +;; ------------------------------------------------------------------------- +;; Includes: +;; - SUNPKHI +;; - SUNPKLO +;; - UUNPKHI +;; - UUNPKLO +;; ------------------------------------------------------------------------- + +;; Unpack the low or high half of a vector, where "high" refers to +;; the low-numbered lanes for big-endian and the high-numbered lanes +;; for little-endian. +(define_expand "vec_unpack__" + [(match_operand: 0 "register_operand") + (unspec: + [(match_operand:SVE_FULL_BHSI 1 "register_operand")] UNPACK)] "TARGET_SVE" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + emit_insn (( + ? gen_aarch64_sve_unpkhi_ + : gen_aarch64_sve_unpklo_) + (operands[0], operands[1])); + DONE; } ) -;; Predicated integer MIN/MAX reduction. -(define_insn "*reduc__scal_" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "Upl") - (match_operand:SVE_I 2 "register_operand" "w")] - MAXMINV))] +(define_insn "@aarch64_sve_unpk_" + [(set (match_operand: 0 "register_operand" "=w") + (unspec: + [(match_operand:SVE_FULL_BHSI 1 "register_operand" "w")] + UNPACK))] "TARGET_SVE" - "v\t%0, %1, %2." + "unpk\t%0., %1." ) -;; Unpredicated floating-point MIN/MAX reduction. -(define_expand "reduc__scal_" - [(set (match_operand: 0 "register_operand") - (unspec: [(match_dup 2) - (match_operand:SVE_F 1 "register_operand")] - FMAXMINV))] +;; ------------------------------------------------------------------------- +;; ---- [INT<-FP] Conversions +;; ------------------------------------------------------------------------- +;; Includes: +;; - FCVTZS +;; - FCVTZU +;; ------------------------------------------------------------------------- + +;; Unpredicated conversion of floats to integers of the same size (HF to HI, +;; SF to SI or DF to DI). +(define_expand "2" + [(set (match_operand: 0 "register_operand") + (unspec: + [(match_dup 2) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 1 "register_operand")] + SVE_COND_FCVTI))] "TARGET_SVE" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + operands[2] = aarch64_ptrue_reg (mode); } ) -;; Predicated floating-point MIN/MAX reduction. -(define_insn "*reduc__scal_" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "Upl") - (match_operand:SVE_F 2 "register_operand" "w")] - FMAXMINV))] - "TARGET_SVE" - "v\t%0, %1, %2." +;; Predicated float-to-integer conversion, either to the same width or wider. +(define_insn "@aarch64_sve__nontrunc" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") + (unspec:SVE_FULL_HSDI + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 3 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w")] + SVE_COND_FCVTI))] + "TARGET_SVE && >= " + "fcvtz\t%0., %1/m, %2." 
) -(define_expand "reduc__scal_" - [(set (match_operand: 0 "register_operand") - (unspec: [(match_dup 2) - (match_operand:SVE_I 1 "register_operand")] - BITWISEV))] - "TARGET_SVE" +;; Predicated narrowing float-to-integer conversion. +(define_insn "@aarch64_sve__trunc" + [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w") + (unspec:VNx4SI_ONLY + [(match_operand:VNx2BI 1 "register_operand" "Upl") + (match_operand:SI 3 "aarch64_sve_gp_strictness") + (match_operand:VNx2DF_ONLY 2 "register_operand" "w")] + SVE_COND_FCVTI))] + "TARGET_SVE" + "fcvtz\t%0., %1/m, %2." +) + +;; Predicated float-to-integer conversion with merging, either to the same +;; width or wider. +(define_expand "@cond__nontrunc" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand") + (unspec:SVE_FULL_HSDI + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_HSDI + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_F 2 "register_operand")] + SVE_COND_FCVTI) + (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE && >= " +) + +;; The first alternative doesn't need the earlyclobber, but the only case +;; it would help is the uninteresting one in which operands 2 and 3 are +;; the same register (despite having different modes). Making all the +;; alternatives earlyclobber makes things more consistent for the +;; register allocator. +(define_insn_and_rewrite "*cond__nontrunc" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=&w, &w, ?&w") + (unspec:SVE_FULL_HSDI + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_HSDI + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_F 2 "register_operand" "w, w, w")] + SVE_COND_FCVTI) + (match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE + && >= + && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" + "@ + fcvtz\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;fcvtz\t%0., %1/m, %2. + movprfx\t%0, %3\;fcvtz\t%0., %1/m, %2." + "&& !rtx_equal_p (operands[1], operands[4])" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + operands[4] = copy_rtx (operands[1]); } + [(set_attr "movprfx" "*,yes,yes")] +) + +;; Predicated narrowing float-to-integer conversion with merging. +(define_expand "@cond__trunc" + [(set (match_operand:VNx4SI_ONLY 0 "register_operand") + (unspec:VNx4SI_ONLY + [(match_operand:VNx2BI 1 "register_operand") + (unspec:VNx4SI_ONLY + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:VNx2DF_ONLY 2 "register_operand")] + SVE_COND_FCVTI) + (match_operand:VNx4SI_ONLY 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" ) -(define_insn "*reduc__scal_" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "Upl") - (match_operand:SVE_I 2 "register_operand" "w")] - BITWISEV))] +(define_insn "*cond__trunc" + [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=&w, &w, ?&w") + (unspec:VNx4SI_ONLY + [(match_operand:VNx2BI 1 "register_operand" "Upl, Upl, Upl") + (unspec:VNx4SI_ONLY + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:VNx2DF_ONLY 2 "register_operand" "w, w, w")] + SVE_COND_FCVTI) + (match_operand:VNx4SI_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] "TARGET_SVE" - "\t%0, %1, %2." + "@ + fcvtz\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;fcvtz\t%0., %1/m, %2. + movprfx\t%0, %3\;fcvtz\t%0., %1/m, %2." 
+ [(set_attr "movprfx" "*,yes,yes")] ) -;; Unpredicated in-order FP reductions. -(define_expand "fold_left_plus_" - [(set (match_operand: 0 "register_operand") - (unspec: [(match_dup 3) - (match_operand: 1 "register_operand") - (match_operand:SVE_F 2 "register_operand")] - UNSPEC_FADDA))] +;; ------------------------------------------------------------------------- +;; ---- [INT<-FP] Packs +;; ------------------------------------------------------------------------- +;; The patterns in this section are synthetic. +;; ------------------------------------------------------------------------- + +;; Convert two vectors of DF to SI and pack the results into a single vector. +(define_expand "vec_pack_fix_trunc_vnx2df" + [(set (match_dup 4) + (unspec:VNx4SI + [(match_dup 3) + (const_int SVE_RELAXED_GP) + (match_operand:VNx2DF 1 "register_operand")] + SVE_COND_FCVTI)) + (set (match_dup 5) + (unspec:VNx4SI + [(match_dup 3) + (const_int SVE_RELAXED_GP) + (match_operand:VNx2DF 2 "register_operand")] + SVE_COND_FCVTI)) + (set (match_operand:VNx4SI 0 "register_operand") + (unspec:VNx4SI [(match_dup 4) (match_dup 5)] UNSPEC_UZP1))] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + operands[3] = aarch64_ptrue_reg (VNx2BImode); + operands[4] = gen_reg_rtx (VNx4SImode); + operands[5] = gen_reg_rtx (VNx4SImode); } ) -;; In-order FP reductions predicated with PTRUE. -(define_insn "*fold_left_plus_" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "Upl") - (match_operand: 2 "register_operand" "0") - (match_operand:SVE_F 3 "register_operand" "w")] - UNSPEC_FADDA))] - "TARGET_SVE" - "fadda\t%0, %1, %0, %3." -) +;; ------------------------------------------------------------------------- +;; ---- [INT<-FP] Unpacks +;; ------------------------------------------------------------------------- +;; No patterns here yet! +;; ------------------------------------------------------------------------- -;; Predicated form of the above in-order reduction. -(define_insn "*pred_fold_left_plus_" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: - [(match_operand: 1 "register_operand" "0") - (unspec:SVE_F - [(match_operand: 2 "register_operand" "Upl") - (match_operand:SVE_F 3 "register_operand" "w") - (match_operand:SVE_F 4 "aarch64_simd_imm_zero")] - UNSPEC_SEL)] - UNSPEC_FADDA))] - "TARGET_SVE" - "fadda\t%0, %2, %0, %3." -) +;; ------------------------------------------------------------------------- +;; ---- [FP<-INT] Conversions +;; ------------------------------------------------------------------------- +;; Includes: +;; - SCVTF +;; - UCVTF +;; ------------------------------------------------------------------------- -;; Unpredicated floating-point addition. -(define_expand "add3" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 3) - (plus:SVE_F - (match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "aarch64_sve_float_arith_with_sub_operand"))] - UNSPEC_MERGE_PTRUE))] +;; Unpredicated conversion of integers to floats of the same size +;; (HI to HF, SI to SF or DI to DF). +(define_expand "2" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 2) + (const_int SVE_RELAXED_GP) + (match_operand: 1 "register_operand")] + SVE_COND_ICVTF))] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + operands[2] = aarch64_ptrue_reg (mode); } ) -;; Floating-point addition predicated with a PTRUE. 
-(define_insn_and_split "*add3" - [(set (match_operand:SVE_F 0 "register_operand" "=w, w, w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (plus:SVE_F - (match_operand:SVE_F 2 "register_operand" "%0, 0, w") - (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_operand" "vsA, vsN, w"))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" - "@ - fadd\t%0., %1/m, %0., #%3 - fsub\t%0., %1/m, %0., #%N3 - #" - ; Split the unpredicated form after reload, so that we don't have - ; the unnecessary PTRUE. - "&& reload_completed - && register_operand (operands[3], mode)" - [(set (match_dup 0) (plus:SVE_F (match_dup 2) (match_dup 3)))] +;; Predicated integer-to-float conversion, either to the same width or +;; narrower. +(define_insn "@aarch64_sve__nonextend" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 3 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")] + SVE_COND_ICVTF))] + "TARGET_SVE && >= " + "cvtf\t%0., %1/m, %2." ) -;; Unpredicated floating-point subtraction. -(define_expand "sub3" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 3) - (minus:SVE_F - (match_operand:SVE_F 1 "aarch64_sve_float_arith_operand") - (match_operand:SVE_F 2 "register_operand"))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" - { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); - } +;; Predicated widening integer-to-float conversion. +(define_insn "@aarch64_sve__extend" + [(set (match_operand:VNx2DF_ONLY 0 "register_operand" "=w") + (unspec:VNx2DF_ONLY + [(match_operand:VNx2BI 1 "register_operand" "Upl") + (match_operand:SI 3 "aarch64_sve_gp_strictness") + (match_operand:VNx4SI_ONLY 2 "register_operand" "w")] + SVE_COND_ICVTF))] + "TARGET_SVE" + "cvtf\t%0., %1/m, %2." +) + +;; Predicated integer-to-float conversion with merging, either to the same +;; width or narrower. +(define_expand "@cond__nonextend" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_HSDI 2 "register_operand")] + SVE_COND_ICVTF) + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE && >= " ) -;; Floating-point subtraction predicated with a PTRUE. -(define_insn_and_split "*sub3" - [(set (match_operand:SVE_F 0 "register_operand" "=w, w, w, w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl, Upl") - (minus:SVE_F - (match_operand:SVE_F 2 "aarch64_sve_float_arith_operand" "0, 0, vsA, w") - (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_operand" "vsA, vsN, 0, w"))] - UNSPEC_MERGE_PTRUE))] +;; The first alternative doesn't need the earlyclobber, but the only case +;; it would help is the uninteresting one in which operands 2 and 3 are +;; the same register (despite having different modes). Making all the +;; alternatives earlyclobber makes things more consistent for the +;; register allocator. 
+(define_insn_and_rewrite "*cond__nonextend" + [(set (match_operand:SVE_FULL_F 0 "register_operand" "=&w, &w, ?&w") + (unspec:SVE_FULL_F + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_F + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")] + SVE_COND_ICVTF) + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] "TARGET_SVE - && (register_operand (operands[2], mode) - || register_operand (operands[3], mode))" + && >= + && aarch64_sve_pred_dominates_p (&operands[4], operands[1])" "@ - fsub\t%0., %1/m, %0., #%3 - fadd\t%0., %1/m, %0., #%N3 - fsubr\t%0., %1/m, %0., #%2 - #" - ; Split the unpredicated form after reload, so that we don't have - ; the unnecessary PTRUE. - "&& reload_completed - && register_operand (operands[2], mode) - && register_operand (operands[3], mode)" - [(set (match_dup 0) (minus:SVE_F (match_dup 2) (match_dup 3)))] -) - -;; Unpredicated floating-point multiplication. -(define_expand "mul3" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 3) - (mult:SVE_F - (match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "aarch64_sve_float_mul_operand"))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" + cvtf\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;cvtf\t%0., %1/m, %2. + movprfx\t%0, %3\;cvtf\t%0., %1/m, %2." + "&& !rtx_equal_p (operands[1], operands[4])" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + operands[4] = copy_rtx (operands[1]); } + [(set_attr "movprfx" "*,yes,yes")] +) + +;; Predicated widening integer-to-float conversion with merging. +(define_expand "@cond__extend" + [(set (match_operand:VNx2DF_ONLY 0 "register_operand") + (unspec:VNx2DF_ONLY + [(match_operand:VNx2BI 1 "register_operand") + (unspec:VNx2DF_ONLY + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:VNx4SI_ONLY 2 "register_operand")] + SVE_COND_ICVTF) + (match_operand:VNx2DF_ONLY 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE" ) -;; Floating-point multiplication predicated with a PTRUE. -(define_insn_and_split "*mul3" - [(set (match_operand:SVE_F 0 "register_operand" "=w, w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl") - (mult:SVE_F - (match_operand:SVE_F 2 "register_operand" "%0, w") - (match_operand:SVE_F 3 "aarch64_sve_float_mul_operand" "vsM, w"))] - UNSPEC_MERGE_PTRUE))] +(define_insn "*cond__extend" + [(set (match_operand:VNx2DF_ONLY 0 "register_operand" "=w, ?&w, ?&w") + (unspec:VNx2DF_ONLY + [(match_operand:VNx2BI 1 "register_operand" "Upl, Upl, Upl") + (unspec:VNx2DF_ONLY + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w")] + SVE_COND_ICVTF) + (match_operand:VNx2DF_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] "TARGET_SVE" "@ - fmul\t%0., %1/m, %0., #%3 - #" - ; Split the unpredicated form after reload, so that we don't have - ; the unnecessary PTRUE. - "&& reload_completed - && register_operand (operands[3], mode)" - [(set (match_dup 0) (mult:SVE_F (match_dup 2) (match_dup 3)))] + cvtf\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;cvtf\t%0., %1/m, %2. + movprfx\t%0, %3\;cvtf\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes,yes")] ) -;; Unpredicated floating-point binary operations (post-RA only). -;; These are generated by splitting a predicated instruction whose -;; predicate is unused. 
-(define_insn "*post_ra_3" - [(set (match_operand:SVE_F 0 "register_operand" "=w") - (SVE_UNPRED_FP_BINARY:SVE_F - (match_operand:SVE_F 1 "register_operand" "w") - (match_operand:SVE_F 2 "register_operand" "w")))] - "TARGET_SVE && reload_completed" - "\t%0., %1., %2.") +;; ------------------------------------------------------------------------- +;; ---- [FP<-INT] Packs +;; ------------------------------------------------------------------------- +;; No patterns here yet! +;; ------------------------------------------------------------------------- -;; Unpredicated fma (%0 = (%1 * %2) + %3). -(define_expand "fma4" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 4) - (fma:SVE_F (match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "register_operand") - (match_operand:SVE_F 3 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [FP<-INT] Unpacks +;; ------------------------------------------------------------------------- +;; The patterns in this section are synthetic. +;; ------------------------------------------------------------------------- + +;; Unpack one half of a VNx4SI to VNx2DF. First unpack from VNx4SI +;; to VNx2DI, reinterpret the VNx2DI as a VNx4SI, then convert the +;; unpacked VNx4SI to VNx2DF. +(define_expand "vec_unpack_float__vnx4si" + [(match_operand:VNx2DF 0 "register_operand") + (FLOATUORS:VNx2DF + (unspec:VNx2DI [(match_operand:VNx4SI 1 "register_operand")] + UNPACK_UNSIGNED))] "TARGET_SVE" { - operands[4] = force_reg (mode, CONSTM1_RTX (mode)); + /* Use ZIP to do the unpack, since we don't care about the upper halves + and since it has the nice property of not needing any subregs. + If using UUNPK* turns out to be preferable, we could model it as + a ZIP whose first operand is zero. */ + rtx temp = gen_reg_rtx (VNx4SImode); + emit_insn (( + ? gen_aarch64_sve_zip2vnx4si + : gen_aarch64_sve_zip1vnx4si) + (temp, operands[1], operands[1])); + rtx ptrue = aarch64_ptrue_reg (VNx2BImode); + rtx strictness = gen_int_mode (SVE_RELAXED_GP, SImode); + emit_insn (gen_aarch64_sve__extendvnx4sivnx2df + (operands[0], ptrue, temp, strictness)); + DONE; } ) -;; fma predicated with a PTRUE. -(define_insn "*fma4" - [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (fma:SVE_F (match_operand:SVE_F 3 "register_operand" "%0, w, w") - (match_operand:SVE_F 4 "register_operand" "w, w, w") - (match_operand:SVE_F 2 "register_operand" "w, 0, w"))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" - "@ - fmad\t%0., %1/m, %4., %2. - fmla\t%0., %1/m, %3., %4. - movprfx\t%0, %2\;fmla\t%0., %1/m, %3., %4." - [(set_attr "movprfx" "*,*,yes")] -) +;; ------------------------------------------------------------------------- +;; ---- [FP<-FP] Packs +;; ------------------------------------------------------------------------- +;; Includes: +;; - FCVT +;; ------------------------------------------------------------------------- -;; Unpredicated fnma (%0 = (-%1 * %2) + %3). -(define_expand "fnma4" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 4) - (fma:SVE_F (neg:SVE_F - (match_operand:SVE_F 1 "register_operand")) - (match_operand:SVE_F 2 "register_operand") - (match_operand:SVE_F 3 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; Convert two vectors of DF to SF, or two vectors of SF to HF, and pack +;; the results into a single vector. 
+(define_expand "vec_pack_trunc_" + [(set (match_dup 4) + (unspec:SVE_FULL_HSF + [(match_dup 3) + (const_int SVE_RELAXED_GP) + (match_operand: 1 "register_operand")] + UNSPEC_COND_FCVT)) + (set (match_dup 5) + (unspec:SVE_FULL_HSF + [(match_dup 3) + (const_int SVE_RELAXED_GP) + (match_operand: 2 "register_operand")] + UNSPEC_COND_FCVT)) + (set (match_operand:SVE_FULL_HSF 0 "register_operand") + (unspec:SVE_FULL_HSF [(match_dup 4) (match_dup 5)] UNSPEC_UZP1))] "TARGET_SVE" { - operands[4] = force_reg (mode, CONSTM1_RTX (mode)); + operands[3] = aarch64_ptrue_reg (mode); + operands[4] = gen_reg_rtx (mode); + operands[5] = gen_reg_rtx (mode); } ) -;; fnma predicated with a PTRUE. -(define_insn "*fnma4" - [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (fma:SVE_F (neg:SVE_F - (match_operand:SVE_F 3 "register_operand" "%0, w, w")) - (match_operand:SVE_F 4 "register_operand" "w, w, w") - (match_operand:SVE_F 2 "register_operand" "w, 0, w"))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" +;; Predicated float-to-float truncation. +(define_insn "@aarch64_sve__trunc" + [(set (match_operand:SVE_FULL_HSF 0 "register_operand" "=w") + (unspec:SVE_FULL_HSF + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 3 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_SDF 2 "register_operand" "w")] + SVE_COND_FCVT))] + "TARGET_SVE && > " + "fcvt\t%0., %1/m, %2." +) + +;; Predicated float-to-float truncation with merging. +(define_expand "@cond__trunc" + [(set (match_operand:SVE_FULL_HSF 0 "register_operand") + (unspec:SVE_FULL_HSF + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_HSF + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_SDF 2 "register_operand")] + SVE_COND_FCVT) + (match_operand:SVE_FULL_HSF 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE && > " +) + +(define_insn "*cond__trunc" + [(set (match_operand:SVE_FULL_HSF 0 "register_operand" "=w, ?&w, ?&w") + (unspec:SVE_FULL_HSF + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_HSF + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_SDF 2 "register_operand" "w, w, w")] + SVE_COND_FCVT) + (match_operand:SVE_FULL_HSF 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && > " "@ - fmsb\t%0., %1/m, %4., %2. - fmls\t%0., %1/m, %3., %4. - movprfx\t%0, %2\;fmls\t%0., %1/m, %3., %4." - [(set_attr "movprfx" "*,*,yes")] + fcvt\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;fcvt\t%0., %1/m, %2. + movprfx\t%0, %3\;fcvt\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes,yes")] +) + +;; ------------------------------------------------------------------------- +;; ---- [FP<-FP] Packs (bfloat16) +;; ------------------------------------------------------------------------- +;; Includes: +;; - BFCVT (BF16) +;; - BFCVTNT (BF16) +;; ------------------------------------------------------------------------- + +;; Predicated BFCVT. +(define_insn "@aarch64_sve__trunc" + [(set (match_operand:VNx8BF_ONLY 0 "register_operand" "=w") + (unspec:VNx8BF_ONLY + [(match_operand:VNx4BI 1 "register_operand" "Upl") + (match_operand:SI 3 "aarch64_sve_gp_strictness") + (match_operand:VNx4SF_ONLY 2 "register_operand" "w")] + SVE_COND_FCVT))] + "TARGET_SVE_BF16" + "bfcvt\t%0.h, %1/m, %2.s" +) + +;; Predicated BFCVT with merging. 
+(define_expand "@cond__trunc" + [(set (match_operand:VNx8BF_ONLY 0 "register_operand") + (unspec:VNx8BF_ONLY + [(match_operand:VNx4BI 1 "register_operand") + (unspec:VNx8BF_ONLY + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:VNx4SF_ONLY 2 "register_operand")] + SVE_COND_FCVT) + (match_operand:VNx8BF_ONLY 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE_BF16" +) + +(define_insn "*cond__trunc" + [(set (match_operand:VNx8BF_ONLY 0 "register_operand" "=w, ?&w, ?&w") + (unspec:VNx8BF_ONLY + [(match_operand:VNx4BI 1 "register_operand" "Upl, Upl, Upl") + (unspec:VNx8BF_ONLY + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:VNx4SF_ONLY 2 "register_operand" "w, w, w")] + SVE_COND_FCVT) + (match_operand:VNx8BF_ONLY 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE_BF16" + "@ + bfcvt\t%0.h, %1/m, %2.s + movprfx\t%0.s, %1/z, %2.s\;bfcvt\t%0.h, %1/m, %2.s + movprfx\t%0, %3\;bfcvt\t%0.h, %1/m, %2.s" + [(set_attr "movprfx" "*,yes,yes")] ) -;; Unpredicated fms (%0 = (%1 * %2) - %3). -(define_expand "fms4" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 4) - (fma:SVE_F (match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "register_operand") - (neg:SVE_F - (match_operand:SVE_F 3 "register_operand")))] - UNSPEC_MERGE_PTRUE))] +;; Predicated BFCVTNT. This doesn't give a natural aarch64_pred_*/cond_* +;; pair because the even elements always have to be supplied for active +;; elements, even if the inactive elements don't matter. +;; +;; This instructions does not take MOVPRFX. +(define_insn "@aarch64_sve_cvtnt" + [(set (match_operand:VNx8BF_ONLY 0 "register_operand" "=w") + (unspec:VNx8BF_ONLY + [(match_operand:VNx4BI 2 "register_operand" "Upl") + (const_int SVE_STRICT_GP) + (match_operand:VNx8BF_ONLY 1 "register_operand" "0") + (match_operand:VNx4SF 3 "register_operand" "w")] + UNSPEC_COND_FCVTNT))] + "TARGET_SVE_BF16" + "bfcvtnt\t%0.h, %2/m, %3.s" +) + +;; ------------------------------------------------------------------------- +;; ---- [FP<-FP] Unpacks +;; ------------------------------------------------------------------------- +;; Includes: +;; - FCVT +;; ------------------------------------------------------------------------- + +;; Unpack one half of a VNx4SF to VNx2DF, or one half of a VNx8HF to VNx4SF. +;; First unpack the source without conversion, then float-convert the +;; unpacked source. +(define_expand "vec_unpacks__" + [(match_operand: 0 "register_operand") + (unspec:SVE_FULL_HSF + [(match_operand:SVE_FULL_HSF 1 "register_operand")] + UNPACK_UNSIGNED)] "TARGET_SVE" { - operands[4] = force_reg (mode, CONSTM1_RTX (mode)); + /* Use ZIP to do the unpack, since we don't care about the upper halves + and since it has the nice property of not needing any subregs. + If using UUNPK* turns out to be preferable, we could model it as + a ZIP whose first operand is zero. */ + rtx temp = gen_reg_rtx (mode); + emit_insn (( + ? gen_aarch64_sve_zip2 + : gen_aarch64_sve_zip1) + (temp, operands[1], operands[1])); + rtx ptrue = aarch64_ptrue_reg (mode); + rtx strictness = gen_int_mode (SVE_RELAXED_GP, SImode); + emit_insn (gen_aarch64_sve_fcvt_nontrunc + (operands[0], ptrue, temp, strictness)); + DONE; } ) -;; fms predicated with a PTRUE. 
-(define_insn "*fms4" - [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (fma:SVE_F (match_operand:SVE_F 3 "register_operand" "%0, w, w") - (match_operand:SVE_F 4 "register_operand" "w, w, w") - (neg:SVE_F - (match_operand:SVE_F 2 "register_operand" "w, 0, w")))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" +;; Predicated float-to-float extension. +(define_insn "@aarch64_sve__nontrunc" + [(set (match_operand:SVE_FULL_SDF 0 "register_operand" "=w") + (unspec:SVE_FULL_SDF + [(match_operand: 1 "register_operand" "Upl") + (match_operand:SI 3 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_HSF 2 "register_operand" "w")] + SVE_COND_FCVT))] + "TARGET_SVE && > " + "fcvt\t%0., %1/m, %2." +) + +;; Predicated float-to-float extension with merging. +(define_expand "@cond__nontrunc" + [(set (match_operand:SVE_FULL_SDF 0 "register_operand") + (unspec:SVE_FULL_SDF + [(match_operand: 1 "register_operand") + (unspec:SVE_FULL_SDF + [(match_dup 1) + (const_int SVE_STRICT_GP) + (match_operand:SVE_FULL_HSF 2 "register_operand")] + SVE_COND_FCVT) + (match_operand:SVE_FULL_SDF 3 "aarch64_simd_reg_or_zero")] + UNSPEC_SEL))] + "TARGET_SVE && > " +) + +(define_insn "*cond__nontrunc" + [(set (match_operand:SVE_FULL_SDF 0 "register_operand" "=w, ?&w, ?&w") + (unspec:SVE_FULL_SDF + [(match_operand: 1 "register_operand" "Upl, Upl, Upl") + (unspec:SVE_FULL_SDF + [(match_dup 1) + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (match_operand:SVE_FULL_HSF 2 "register_operand" "w, w, w")] + SVE_COND_FCVT) + (match_operand:SVE_FULL_SDF 3 "aarch64_simd_reg_or_zero" "0, Dz, w")] + UNSPEC_SEL))] + "TARGET_SVE && > " "@ - fnmsb\t%0., %1/m, %4., %2. - fnmls\t%0., %1/m, %3., %4. - movprfx\t%0, %2\;fnmls\t%0., %1/m, %3., %4." - [(set_attr "movprfx" "*,*,yes")] + fcvt\t%0., %1/m, %2. + movprfx\t%0., %1/z, %2.\;fcvt\t%0., %1/m, %2. + movprfx\t%0, %3\;fcvt\t%0., %1/m, %2." + [(set_attr "movprfx" "*,yes,yes")] ) -;; Unpredicated fnms (%0 = (-%1 * %2) - %3). -(define_expand "fnms4" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 4) - (fma:SVE_F (neg:SVE_F - (match_operand:SVE_F 1 "register_operand")) - (match_operand:SVE_F 2 "register_operand") - (neg:SVE_F - (match_operand:SVE_F 3 "register_operand")))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" - { - operands[4] = force_reg (mode, CONSTM1_RTX (mode)); - } -) +;; ------------------------------------------------------------------------- +;; ---- [PRED<-PRED] Packs +;; ------------------------------------------------------------------------- +;; Includes: +;; - UZP1 +;; ------------------------------------------------------------------------- -;; fnms predicated with a PTRUE. -(define_insn "*fnms4" - [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (fma:SVE_F (neg:SVE_F - (match_operand:SVE_F 3 "register_operand" "%0, w, w")) - (match_operand:SVE_F 4 "register_operand" "w, w, w") - (neg:SVE_F - (match_operand:SVE_F 2 "register_operand" "w, 0, w")))] - UNSPEC_MERGE_PTRUE))] +;; Predicate pack. Use UZP1 on the narrower type, which discards +;; the high part of each wide element. +(define_insn "vec_pack_trunc_" + [(set (match_operand:PRED_BHS 0 "register_operand" "=Upa") + (unspec:PRED_BHS + [(match_operand: 1 "register_operand" "Upa") + (match_operand: 2 "register_operand" "Upa")] + UNSPEC_PACK))] "TARGET_SVE" - "@ - fnmad\t%0., %1/m, %4., %2. - fnmla\t%0., %1/m, %3., %4. 
- movprfx\t%0, %2\;fnmla\t%0., %1/m, %3., %4." - [(set_attr "movprfx" "*,*,yes")] + "uzp1\t%0., %1., %2." ) -;; Unpredicated floating-point division. -(define_expand "div3" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 3) - (div:SVE_F (match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [PRED<-PRED] Unpacks +;; ------------------------------------------------------------------------- +;; Includes: +;; - PUNPKHI +;; - PUNPKLO +;; ------------------------------------------------------------------------- + +;; Unpack the low or high half of a predicate, where "high" refers to +;; the low-numbered lanes for big-endian and the high-numbered lanes +;; for little-endian. +(define_expand "vec_unpack__" + [(match_operand: 0 "register_operand") + (unspec: [(match_operand:PRED_BHS 1 "register_operand")] + UNPACK)] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); + emit_insn (( + ? gen_aarch64_sve_punpkhi_ + : gen_aarch64_sve_punpklo_) + (operands[0], operands[1])); + DONE; } ) -;; Floating-point division predicated with a PTRUE. -(define_insn "*div3" - [(set (match_operand:SVE_F 0 "register_operand" "=w, w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (div:SVE_F (match_operand:SVE_F 2 "register_operand" "0, w, w") - (match_operand:SVE_F 3 "register_operand" "w, 0, w"))] - UNSPEC_MERGE_PTRUE))] +(define_insn "@aarch64_sve_punpk_" + [(set (match_operand: 0 "register_operand" "=Upa") + (unspec: [(match_operand:PRED_BHS 1 "register_operand" "Upa")] + UNPACK_UNSIGNED))] "TARGET_SVE" - "@ - fdiv\t%0., %1/m, %0., %3. - fdivr\t%0., %1/m, %0., %2. - movprfx\t%0, %2\;fdiv\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,*,yes")] + "punpk\t%0.h, %1.b" ) -;; Unpredicated FNEG, FABS and FSQRT. -(define_expand "2" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 2) - (SVE_FP_UNARY:SVE_F (match_operand:SVE_F 1 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; ========================================================================= +;; == Vector partitioning +;; ========================================================================= + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Unary partitioning +;; ------------------------------------------------------------------------- +;; Includes: +;; - BRKA +;; - BRKAS +;; - BRKB +;; - BRKBS +;; ------------------------------------------------------------------------- + +;; Note that unlike most other instructions that have both merging and +;; zeroing forms, these instructions don't operate elementwise and so +;; don't fit the IFN_COND model. +(define_insn "@aarch64_brk" + [(set (match_operand:VNx16BI 0 "register_operand" "=Upa, Upa") + (unspec:VNx16BI + [(match_operand:VNx16BI 1 "register_operand" "Upa, Upa") + (match_operand:VNx16BI 2 "register_operand" "Upa, Upa") + (match_operand:VNx16BI 3 "aarch64_simd_reg_or_zero" "Dz, 0")] + SVE_BRK_UNARY))] + "TARGET_SVE" + "@ + brk\t%0.b, %1/z, %2.b + brk\t%0.b, %1/m, %2.b" +) + +;; Same, but also producing a flags result. 
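The predicate unpack expander above simply dispatches to PUNPKLO or PUNPKHI depending on which half is wanted, with the hi/lo meaning swapped on big-endian as the comment notes. Viewed as arrays of per-lane flags, the operation is a split into halves, each lane widening to govern a double-width element; a throwaway C sketch with invented names:

/* Illustrative sketch only: predicate unpack as performed by PUNPKLO
   and PUNPKHI, modelled on per-lane flags in little-endian lane order.  */
static void
punpk_model (int nelts, const unsigned char *src,
             unsigned char *lo, unsigned char *hi)
{
  for (int i = 0; i < nelts / 2; i++)
    {
      lo[i] = src[i];
      hi[i] = src[nelts / 2 + i];
    }
}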
+(define_insn "*aarch64_brk_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa, Upa") + (match_dup 1) + (match_operand:SI 4 "aarch64_sve_ptrue_flag") + (unspec:VNx16BI + [(match_dup 1) + (match_operand:VNx16BI 2 "register_operand" "Upa, Upa") + (match_operand:VNx16BI 3 "aarch64_simd_reg_or_zero" "Dz, 0")] + SVE_BRK_UNARY)] + UNSPEC_PTEST)) + (set (match_operand:VNx16BI 0 "register_operand" "=Upa, Upa") + (unspec:VNx16BI + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + SVE_BRK_UNARY))] + "TARGET_SVE" + "@ + brks\t%0.b, %1/z, %2.b + brks\t%0.b, %1/m, %2.b" +) + +;; Same, but with only the flags result being interesting. +(define_insn "*aarch64_brk_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa, Upa") + (match_dup 1) + (match_operand:SI 4 "aarch64_sve_ptrue_flag") + (unspec:VNx16BI + [(match_dup 1) + (match_operand:VNx16BI 2 "register_operand" "Upa, Upa") + (match_operand:VNx16BI 3 "aarch64_simd_reg_or_zero" "Dz, 0")] + SVE_BRK_UNARY)] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa, Upa"))] "TARGET_SVE" + "@ + brks\t%0.b, %1/z, %2.b + brks\t%0.b, %1/m, %2.b" +) + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Binary partitioning +;; ------------------------------------------------------------------------- +;; Includes: +;; - BRKN +;; - BRKNS +;; - BRKPA +;; - BRKPAS +;; - BRKPB +;; - BRKPBS +;; ------------------------------------------------------------------------- + +;; Binary BRKs (BRKN, BRKPA, BRKPB). +(define_insn "@aarch64_brk" + [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (unspec:VNx16BI + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand:VNx16BI 2 "register_operand" "Upa") + (match_operand:VNx16BI 3 "register_operand" "")] + SVE_BRK_BINARY))] + "TARGET_SVE" + "brk\t%0.b, %1/z, %2.b, %.b" +) + +;; Same, but also producing a flags result. +(define_insn "*aarch64_brk_cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_dup 1) + (match_operand:SI 4 "aarch64_sve_ptrue_flag") + (unspec:VNx16BI + [(match_dup 1) + (match_operand:VNx16BI 2 "register_operand" "Upa") + (match_operand:VNx16BI 3 "register_operand" "")] + SVE_BRK_BINARY)] + UNSPEC_PTEST)) + (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (unspec:VNx16BI + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + SVE_BRK_BINARY))] + "TARGET_SVE" + "brks\t%0.b, %1/z, %2.b, %.b" +) + +;; Same, but with only the flags result being interesting. 
+(define_insn "*aarch64_brk_ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_dup 1) + (match_operand:SI 4 "aarch64_sve_ptrue_flag") + (unspec:VNx16BI + [(match_dup 1) + (match_operand:VNx16BI 2 "register_operand" "Upa") + (match_operand:VNx16BI 3 "register_operand" "")] + SVE_BRK_BINARY)] + UNSPEC_PTEST)) + (clobber (match_scratch:VNx16BI 0 "=Upa"))] + "TARGET_SVE" + "brks\t%0.b, %1/z, %2.b, %.b" +) + +;; ------------------------------------------------------------------------- +;; ---- [PRED] Scalarization +;; ------------------------------------------------------------------------- +;; Includes: +;; - PFIRST +;; - PNEXT +;; ------------------------------------------------------------------------- + +(define_insn "@aarch64_sve_" + [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL + [(match_operand:PRED_ALL 1 "register_operand" "Upa") + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (match_operand:PRED_ALL 3 "register_operand" "0")] + SVE_PITER)) + (clobber (reg:CC_NZC CC_REGNUM))] + "TARGET_SVE && >= " + "\t%0., %1, %0." +) + +;; Same, but also producing a flags result. +(define_insn_and_rewrite "*aarch64_sve__cc" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand 2) + (match_operand:SI 3 "aarch64_sve_ptrue_flag") + (unspec:PRED_ALL + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (match_operand:PRED_ALL 6 "register_operand" "0")] + SVE_PITER)] + UNSPEC_PTEST)) + (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL + [(match_dup 4) + (match_dup 5) + (match_dup 6)] + SVE_PITER))] + "TARGET_SVE + && >= + && aarch64_sve_same_pred_for_ptest_p (&operands[2], &operands[4])" + "\t%0., %1, %0." + "&& !rtx_equal_p (operands[2], operands[4])" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + operands[4] = operands[2]; + operands[5] = operands[3]; } ) -;; FNEG, FABS and FSQRT predicated with a PTRUE. -(define_insn "*2" - [(set (match_operand:SVE_F 0 "register_operand" "=w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl") - (SVE_FP_UNARY:SVE_F (match_operand:SVE_F 2 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" - "\t%0., %1/m, %2." -) - -;; Unpredicated FRINTy. -(define_expand "2" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 2) - (unspec:SVE_F [(match_operand:SVE_F 1 "register_operand")] - FRINT)] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" +;; Same, but with only the flags result being interesting. +(define_insn_and_rewrite "*aarch64_sve__ptest" + [(set (reg:CC_NZC CC_REGNUM) + (unspec:CC_NZC + [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand 2) + (match_operand:SI 3 "aarch64_sve_ptrue_flag") + (unspec:PRED_ALL + [(match_operand 4) + (match_operand:SI 5 "aarch64_sve_ptrue_flag") + (match_operand:PRED_ALL 6 "register_operand" "0")] + SVE_PITER)] + UNSPEC_PTEST)) + (clobber (match_scratch:PRED_ALL 0 "=Upa"))] + "TARGET_SVE + && >= + && aarch64_sve_same_pred_for_ptest_p (&operands[2], &operands[4])" + "\t%0., %1, %0." + "&& !rtx_equal_p (operands[2], operands[4])" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + operands[4] = operands[2]; + operands[5] = operands[3]; } ) -;; FRINTy predicated with a PTRUE. 
-(define_insn "*2" - [(set (match_operand:SVE_F 0 "register_operand" "=w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl") - (unspec:SVE_F [(match_operand:SVE_F 2 "register_operand" "w")] - FRINT)] - UNSPEC_MERGE_PTRUE))] - "TARGET_SVE" - "frint\t%0., %1/m, %2." -) +;; ========================================================================= +;; == Counting elements +;; ========================================================================= -;; Unpredicated conversion of floats to integers of the same size (HF to HI, -;; SF to SI or DF to DI). -(define_expand "2" - [(set (match_operand: 0 "register_operand") - (unspec: - [(match_dup 2) - (FIXUORS: - (match_operand:SVE_F 1 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [INT] Count elements in a pattern (scalar) +;; ------------------------------------------------------------------------- +;; Includes: +;; - CNTB +;; - CNTD +;; - CNTH +;; - CNTW +;; ------------------------------------------------------------------------- + +;; Count the number of elements in an svpattern. Operand 1 is the pattern, +;; operand 2 is the number of elements that fit in a 128-bit block, and +;; operand 3 is a multiplier in the range [1, 16]. +;; +;; Note that this pattern isn't used for SV_ALL (but would work for that too). +(define_insn "aarch64_sve_cnt_pat" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (unspec:SI [(match_operand:DI 1 "const_int_operand") + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand")] + UNSPEC_SVE_CNT_PAT)))] "TARGET_SVE" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + return aarch64_output_sve_cnt_pat_immediate ("cnt", "%x0", operands + 1); } ) -;; Conversion of SF to DI, SI or HI, predicated with a PTRUE. -(define_insn "*v16hsf2" - [(set (match_operand:SVE_HSDI 0 "register_operand" "=w") - (unspec:SVE_HSDI - [(match_operand: 1 "register_operand" "Upl") - (FIXUORS:SVE_HSDI - (match_operand:VNx8HF 2 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [INT] Increment by the number of elements in a pattern (scalar) +;; ------------------------------------------------------------------------- +;; Includes: +;; - INC +;; - SQINC +;; - UQINC +;; ------------------------------------------------------------------------- + +;; Increment a DImode register by the number of elements in an svpattern. +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_insn "@aarch64_sve__pat" + [(set (match_operand:DI 0 "register_operand" "=r") + (ANY_PLUS:DI (zero_extend:DI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT)) + (match_operand:DI_ONLY 1 "register_operand" "0")))] "TARGET_SVE" - "fcvtz\t%0., %1/m, %2.h" + { + return aarch64_output_sve_cnt_pat_immediate ("", "%x0", + operands + 2); + } ) -;; Conversion of SF to DI or SI, predicated with a PTRUE. -(define_insn "*vnx4sf2" - [(set (match_operand:SVE_SDI 0 "register_operand" "=w") - (unspec:SVE_SDI - [(match_operand: 1 "register_operand" "Upl") - (FIXUORS:SVE_SDI - (match_operand:VNx4SF 2 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] +;; Increment an SImode register by the number of elements in an svpattern +;; using modular arithmetic. See aarch64_sve_cnt_pat for the counting +;; behavior. 
+(define_insn "*aarch64_sve_incsi_pat" + [(set (match_operand:SI 0 "register_operand" "=r") + (plus:SI (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT) + (match_operand:SI 1 "register_operand" "0")))] "TARGET_SVE" - "fcvtz\t%0., %1/m, %2.s" + { + return aarch64_output_sve_cnt_pat_immediate ("inc", "%x0", operands + 2); + } ) -;; Conversion of DF to DI or SI, predicated with a PTRUE. -(define_insn "*vnx2df2" - [(set (match_operand:SVE_SDI 0 "register_operand" "=w") - (unspec:SVE_SDI - [(match_operand:VNx2BI 1 "register_operand" "Upl") - (FIXUORS:SVE_SDI - (match_operand:VNx2DF 2 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] +;; Increment an SImode register by the number of elements in an svpattern +;; using saturating arithmetic, extending the result to 64 bits. +;; +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_insn "@aarch64_sve__pat" + [(set (match_operand:DI 0 "register_operand" "=r") + (:DI + (SAT_PLUS:SI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT) + (match_operand:SI_ONLY 1 "register_operand" "0"))))] "TARGET_SVE" - "fcvtz\t%0., %1/m, %2.d" + { + const char *registers = ( == SS_PLUS ? "%x0, %w0" : "%w0"); + return aarch64_output_sve_cnt_pat_immediate ("", registers, + operands + 2); + } ) -;; Unpredicated conversion of integers to floats of the same size -;; (HI to HF, SI to SF or DI to DF). -(define_expand "2" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_dup 2) - (FLOATUORS:SVE_F - (match_operand: 1 "register_operand"))] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [INT] Increment by the number of elements in a pattern (vector) +;; ------------------------------------------------------------------------- +;; Includes: +;; - INC +;; - SQINC +;; - UQINC +;; ------------------------------------------------------------------------- + +;; Increment a vector of DIs by the number of elements in an svpattern. +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_insn "@aarch64_sve__pat" + [(set (match_operand:VNx2DI 0 "register_operand" "=w, ?&w") + (ANY_PLUS:VNx2DI + (vec_duplicate:VNx2DI + (zero_extend:DI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT))) + (match_operand:VNx2DI_ONLY 1 "register_operand" "0, w")))] "TARGET_SVE" { - operands[2] = force_reg (mode, CONSTM1_RTX (mode)); + if (which_alternative == 1) + output_asm_insn ("movprfx\t%0, %1", operands); + return aarch64_output_sve_cnt_pat_immediate ("", "%0.", + operands + 2); } + [(set_attr "movprfx" "*,yes")] ) -;; Conversion of DI, SI or HI to the same number of HFs, predicated -;; with a PTRUE. -(define_insn "*vnx8hf2" - [(set (match_operand:VNx8HF 0 "register_operand" "=w") - (unspec:VNx8HF - [(match_operand: 1 "register_operand" "Upl") - (FLOATUORS:VNx8HF - (match_operand:SVE_HSDI 2 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] +;; Increment a vector of SIs by the number of elements in an svpattern. +;; See aarch64_sve_cnt_pat for the counting behavior. 
+(define_insn "@aarch64_sve__pat" + [(set (match_operand:VNx4SI 0 "register_operand" "=w, ?&w") + (ANY_PLUS:VNx4SI + (vec_duplicate:VNx4SI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT)) + (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w")))] "TARGET_SVE" - "cvtf\t%0.h, %1/m, %2." + { + if (which_alternative == 1) + output_asm_insn ("movprfx\t%0, %1", operands); + return aarch64_output_sve_cnt_pat_immediate ("", "%0.", + operands + 2); + } + [(set_attr "movprfx" "*,yes")] ) -;; Conversion of DI or SI to the same number of SFs, predicated with a PTRUE. -(define_insn "*vnx4sf2" - [(set (match_operand:VNx4SF 0 "register_operand" "=w") - (unspec:VNx4SF - [(match_operand: 1 "register_operand" "Upl") - (FLOATUORS:VNx4SF - (match_operand:SVE_SDI 2 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] +;; Increment a vector of HIs by the number of elements in an svpattern. +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_expand "@aarch64_sve__pat" + [(set (match_operand:VNx8HI 0 "register_operand") + (ANY_PLUS:VNx8HI + (vec_duplicate:VNx8HI + (truncate:HI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT))) + (match_operand:VNx8HI_ONLY 1 "register_operand")))] + "TARGET_SVE" +) + +(define_insn "*aarch64_sve__pat" + [(set (match_operand:VNx8HI 0 "register_operand" "=w, ?&w") + (ANY_PLUS:VNx8HI + (vec_duplicate:VNx8HI + (match_operator:HI 5 "subreg_lowpart_operator" + [(unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT)])) + (match_operand:VNx8HI_ONLY 1 "register_operand" "0, w")))] "TARGET_SVE" - "cvtf\t%0.s, %1/m, %2." + { + if (which_alternative == 1) + output_asm_insn ("movprfx\t%0, %1", operands); + return aarch64_output_sve_cnt_pat_immediate ("", "%0.", + operands + 2); + } + [(set_attr "movprfx" "*,yes")] ) -;; Conversion of DI or SI to DF, predicated with a PTRUE. -(define_insn "aarch64_sve_vnx2df2" - [(set (match_operand:VNx2DF 0 "register_operand" "=w") - (unspec:VNx2DF - [(match_operand:VNx2BI 1 "register_operand" "Upl") - (FLOATUORS:VNx2DF - (match_operand:SVE_SDI 2 "register_operand" "w"))] - UNSPEC_MERGE_PTRUE))] +;; ------------------------------------------------------------------------- +;; ---- [INT] Decrement by the number of elements in a pattern (scalar) +;; ------------------------------------------------------------------------- +;; Includes: +;; - DEC +;; - SQDEC +;; - UQDEC +;; ------------------------------------------------------------------------- + +;; Decrement a DImode register by the number of elements in an svpattern. +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_insn "@aarch64_sve__pat" + [(set (match_operand:DI 0 "register_operand" "=r") + (ANY_MINUS:DI (match_operand:DI_ONLY 1 "register_operand" "0") + (zero_extend:DI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT))))] "TARGET_SVE" - "cvtf\t%0.d, %1/m, %2." + { + return aarch64_output_sve_cnt_pat_immediate ("", "%x0", + operands + 2); + } ) -;; Conversion of DFs to the same number of SFs, or SFs to the same number -;; of HFs. 
-(define_insn "*trunc2" - [(set (match_operand:SVE_HSF 0 "register_operand" "=w") - (unspec:SVE_HSF - [(match_operand: 1 "register_operand" "Upl") - (unspec:SVE_HSF - [(match_operand: 2 "register_operand" "w")] - UNSPEC_FLOAT_CONVERT)] - UNSPEC_MERGE_PTRUE))] +;; Decrement an SImode register by the number of elements in an svpattern +;; using modular arithmetic. See aarch64_sve_cnt_pat for the counting +;; behavior. +(define_insn "*aarch64_sve_decsi_pat" + [(set (match_operand:SI 0 "register_operand" "=r") + (minus:SI (match_operand:SI 1 "register_operand" "0") + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT)))] "TARGET_SVE" - "fcvt\t%0., %1/m, %2." + { + return aarch64_output_sve_cnt_pat_immediate ("dec", "%x0", operands + 2); + } ) -;; Conversion of SFs to the same number of DFs, or HFs to the same number -;; of SFs. -(define_insn "aarch64_sve_extend2" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: - [(match_operand: 1 "register_operand" "Upl") - (unspec: - [(match_operand:SVE_HSF 2 "register_operand" "w")] - UNSPEC_FLOAT_CONVERT)] - UNSPEC_MERGE_PTRUE))] +;; Decrement an SImode register by the number of elements in an svpattern +;; using saturating arithmetic, extending the result to 64 bits. +;; +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_insn "@aarch64_sve__pat" + [(set (match_operand:DI 0 "register_operand" "=r") + (:DI + (SAT_MINUS:SI + (match_operand:SI_ONLY 1 "register_operand" "0") + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT))))] "TARGET_SVE" - "fcvt\t%0., %1/m, %2." + { + const char *registers = ( == SS_MINUS ? "%x0, %w0" : "%w0"); + return aarch64_output_sve_cnt_pat_immediate ("", registers, + operands + 2); + } ) -;; Unpack the low or high half of a predicate, where "high" refers to -;; the low-numbered lanes for big-endian and the high-numbered lanes -;; for little-endian. -(define_expand "vec_unpack__" - [(match_operand: 0 "register_operand") - (unspec: [(match_operand:PRED_BHS 1 "register_operand")] - UNPACK)] +;; ------------------------------------------------------------------------- +;; ---- [INT] Decrement by the number of elements in a pattern (vector) +;; ------------------------------------------------------------------------- +;; Includes: +;; - DEC +;; - SQDEC +;; - UQDEC +;; ------------------------------------------------------------------------- + +;; Decrement a vector of DIs by the number of elements in an svpattern. +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_insn "@aarch64_sve__pat" + [(set (match_operand:VNx2DI 0 "register_operand" "=w, ?&w") + (ANY_MINUS:VNx2DI + (match_operand:VNx2DI_ONLY 1 "register_operand" "0, w") + (vec_duplicate:VNx2DI + (zero_extend:DI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT)))))] "TARGET_SVE" { - emit_insn (( - ? gen_aarch64_sve_punpkhi_ - : gen_aarch64_sve_punpklo_) - (operands[0], operands[1])); - DONE; + if (which_alternative == 1) + output_asm_insn ("movprfx\t%0, %1", operands); + return aarch64_output_sve_cnt_pat_immediate ("", "%0.", + operands + 2); } + [(set_attr "movprfx" "*,yes")] ) -;; PUNPKHI and PUNPKLO. 
-(define_insn "aarch64_sve_punpk_" - [(set (match_operand: 0 "register_operand" "=Upa") - (unspec: [(match_operand:PRED_BHS 1 "register_operand" "Upa")] - UNPACK_UNSIGNED))] +;; Decrement a vector of SIs by the number of elements in an svpattern. +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_insn "@aarch64_sve__pat" + [(set (match_operand:VNx4SI 0 "register_operand" "=w, ?&w") + (ANY_MINUS:VNx4SI + (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w") + (vec_duplicate:VNx4SI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT))))] "TARGET_SVE" - "punpk\t%0.h, %1.b" + { + if (which_alternative == 1) + output_asm_insn ("movprfx\t%0, %1", operands); + return aarch64_output_sve_cnt_pat_immediate ("", "%0.", + operands + 2); + } + [(set_attr "movprfx" "*,yes")] ) -;; Unpack the low or high half of a vector, where "high" refers to -;; the low-numbered lanes for big-endian and the high-numbered lanes -;; for little-endian. -(define_expand "vec_unpack__" - [(match_operand: 0 "register_operand") - (unspec: [(match_operand:SVE_BHSI 1 "register_operand")] UNPACK)] +;; Decrement a vector of HIs by the number of elements in an svpattern. +;; See aarch64_sve_cnt_pat for the counting behavior. +(define_expand "@aarch64_sve__pat" + [(set (match_operand:VNx8HI 0 "register_operand") + (ANY_MINUS:VNx8HI + (match_operand:VNx8HI_ONLY 1 "register_operand") + (vec_duplicate:VNx8HI + (truncate:HI + (unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT)))))] + "TARGET_SVE" +) + +(define_insn "*aarch64_sve__pat" + [(set (match_operand:VNx8HI 0 "register_operand" "=w, ?&w") + (ANY_MINUS:VNx8HI + (match_operand:VNx8HI_ONLY 1 "register_operand" "0, w") + (vec_duplicate:VNx8HI + (match_operator:HI 5 "subreg_lowpart_operator" + [(unspec:SI [(match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "const_int_operand") + (match_operand:DI 4 "const_int_operand")] + UNSPEC_SVE_CNT_PAT)]))))] "TARGET_SVE" { - emit_insn (( - ? gen_aarch64_sve_unpkhi_ - : gen_aarch64_sve_unpklo_) - (operands[0], operands[1])); - DONE; + if (which_alternative == 1) + output_asm_insn ("movprfx\t%0, %1", operands); + return aarch64_output_sve_cnt_pat_immediate ("", "%0.", + operands + 2); } + [(set_attr "movprfx" "*,yes")] ) -;; SUNPKHI, UUNPKHI, SUNPKLO and UUNPKLO. -(define_insn "aarch64_sve_unpk_" - [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand:SVE_BHSI 1 "register_operand" "w")] - UNPACK))] +;; ------------------------------------------------------------------------- +;; ---- [INT] Count elements in a predicate (scalar) +;; ------------------------------------------------------------------------- +;; Includes: +;; - CNTP +;; ------------------------------------------------------------------------- + +;; Count the number of set bits in a predicate. Operand 3 is true if +;; operand 1 is known to be all-true. 
+(define_insn "@aarch64_pred_cntp" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (unspec:SI [(match_operand:PRED_ALL 1 "register_operand" "Upl") + (match_operand:SI 2 "aarch64_sve_ptrue_flag") + (match_operand:PRED_ALL 3 "register_operand" "Upa")] + UNSPEC_CNTP)))] + "TARGET_SVE" + "cntp\t%x0, %1, %3.") + +;; ------------------------------------------------------------------------- +;; ---- [INT] Increment by the number of elements in a predicate (scalar) +;; ------------------------------------------------------------------------- +;; Includes: +;; - INCP +;; - SQINCP +;; - UQINCP +;; ------------------------------------------------------------------------- + +;; Increment a DImode register by the number of set bits in a predicate. +;; See aarch64_sve_cntp for a description of the operands. +(define_expand "@aarch64_sve__cntp" + [(set (match_operand:DI 0 "register_operand") + (ANY_PLUS:DI + (zero_extend:DI + (unspec:SI [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand")] + UNSPEC_CNTP)) + (match_operand:DI_ONLY 1 "register_operand")))] "TARGET_SVE" - "unpk\t%0., %1." + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Unpack one half of a VNx4SF to VNx2DF, or one half of a VNx8HF to VNx4SF. -;; First unpack the source without conversion, then float-convert the -;; unpacked source. -(define_expand "vec_unpacks__" - [(match_operand: 0 "register_operand") - (unspec:SVE_HSF [(match_operand:SVE_HSF 1 "register_operand")] - UNPACK_UNSIGNED)] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:DI 0 "register_operand" "=r") + (ANY_PLUS:DI + (zero_extend:DI + (unspec:SI [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + UNSPEC_CNTP)) + (match_operand:DI_ONLY 1 "register_operand" "0")))] "TARGET_SVE" + "p\t%x0, %2." + "&& !CONSTANT_P (operands[3])" { - /* Use ZIP to do the unpack, since we don't care about the upper halves - and since it has the nice property of not needing any subregs. - If using UUNPK* turns out to be preferable, we could model it as - a ZIP whose first operand is zero. */ - rtx temp = gen_reg_rtx (mode); - emit_insn (( - ? gen_aarch64_sve_zip2 - : gen_aarch64_sve_zip1) - (temp, operands[1], operands[1])); - rtx ptrue = force_reg (mode, CONSTM1_RTX (mode)); - emit_insn (gen_aarch64_sve_extend2 (operands[0], - ptrue, temp)); - DONE; + operands[3] = CONSTM1_RTX (mode); } ) -;; Unpack one half of a VNx4SI to VNx2DF. First unpack from VNx4SI -;; to VNx2DI, reinterpret the VNx2DI as a VNx4SI, then convert the -;; unpacked VNx4SI to VNx2DF. -(define_expand "vec_unpack_float__vnx4si" - [(match_operand:VNx2DF 0 "register_operand") - (FLOATUORS:VNx2DF - (unspec:VNx2DI [(match_operand:VNx4SI 1 "register_operand")] - UNPACK_UNSIGNED))] - "TARGET_SVE" +;; Increment an SImode register by the number of set bits in a predicate +;; using modular arithmetic. See aarch64_sve_cntp for a description of +;; the operands. +(define_insn_and_rewrite "*aarch64_incsi_cntp" + [(set (match_operand:SI 0 "register_operand" "=r") + (plus:SI + (unspec:SI [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + UNSPEC_CNTP) + (match_operand:SI 1 "register_operand" "0")))] + "TARGET_SVE" + "incp\t%x0, %2." + "&& !CONSTANT_P (operands[3])" { - /* Use ZIP to do the unpack, since we don't care about the upper halves - and since it has the nice property of not needing any subregs. 
- If using UUNPK* turns out to be preferable, we could model it as - a ZIP whose first operand is zero. */ - rtx temp = gen_reg_rtx (VNx4SImode); - emit_insn (( - ? gen_aarch64_sve_zip2vnx4si - : gen_aarch64_sve_zip1vnx4si) - (temp, operands[1], operands[1])); - rtx ptrue = force_reg (VNx2BImode, CONSTM1_RTX (VNx2BImode)); - emit_insn (gen_aarch64_sve_vnx4sivnx2df2 (operands[0], - ptrue, temp)); - DONE; + operands[3] = CONSTM1_RTX (mode); } ) -;; Predicate pack. Use UZP1 on the narrower type, which discards -;; the high part of each wide element. -(define_insn "vec_pack_trunc_" - [(set (match_operand:PRED_BHS 0 "register_operand" "=Upa") - (unspec:PRED_BHS - [(match_operand: 1 "register_operand" "Upa") - (match_operand: 2 "register_operand" "Upa")] - UNSPEC_PACK))] +;; Increment an SImode register by the number of set bits in a predicate +;; using saturating arithmetic, extending the result to 64 bits. +;; +;; See aarch64_sve_cntp for a description of the operands. +(define_expand "@aarch64_sve__cntp" + [(set (match_operand:DI 0 "register_operand") + (:DI + (SAT_PLUS:SI + (unspec:SI [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand")] + UNSPEC_CNTP) + (match_operand:SI_ONLY 1 "register_operand"))))] "TARGET_SVE" - "uzp1\t%0., %1., %2." + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Integer pack. Use UZP1 on the narrower type, which discards -;; the high part of each wide element. -(define_insn "vec_pack_trunc_" - [(set (match_operand:SVE_BHSI 0 "register_operand" "=w") - (unspec:SVE_BHSI - [(match_operand: 1 "register_operand" "w") - (match_operand: 2 "register_operand" "w")] - UNSPEC_PACK))] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:DI 0 "register_operand" "=r") + (:DI + (SAT_PLUS:SI + (unspec:SI [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + UNSPEC_CNTP) + (match_operand:SI_ONLY 1 "register_operand" "0"))))] "TARGET_SVE" - "uzp1\t%0., %1., %2." + { + if ( == SS_PLUS) + return "p\t%x0, %2., %w0"; + else + return "p\t%w0, %2."; + } + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Convert two vectors of DF to SF, or two vectors of SF to HF, and pack -;; the results into a single vector. -(define_expand "vec_pack_trunc_" - [(set (match_dup 4) - (unspec:SVE_HSF - [(match_dup 3) - (unspec:SVE_HSF [(match_operand: 1 "register_operand")] - UNSPEC_FLOAT_CONVERT)] - UNSPEC_MERGE_PTRUE)) - (set (match_dup 5) - (unspec:SVE_HSF - [(match_dup 3) - (unspec:SVE_HSF [(match_operand: 2 "register_operand")] - UNSPEC_FLOAT_CONVERT)] - UNSPEC_MERGE_PTRUE)) - (set (match_operand:SVE_HSF 0 "register_operand") - (unspec:SVE_HSF [(match_dup 4) (match_dup 5)] UNSPEC_UZP1))] +;; ------------------------------------------------------------------------- +;; ---- [INT] Increment by the number of elements in a predicate (vector) +;; ------------------------------------------------------------------------- +;; Includes: +;; - INCP +;; - SQINCP +;; - UQINCP +;; ------------------------------------------------------------------------- + +;; Increment a vector of DIs by the number of set bits in a predicate. +;; See aarch64_sve_cntp for a description of the operands. 
+(define_expand "@aarch64_sve__cntp" + [(set (match_operand:VNx2DI 0 "register_operand") + (ANY_PLUS:VNx2DI + (vec_duplicate:VNx2DI + (zero_extend:DI + (unspec:SI + [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand")] + UNSPEC_CNTP))) + (match_operand:VNx2DI_ONLY 1 "register_operand")))] "TARGET_SVE" { - operands[3] = force_reg (mode, CONSTM1_RTX (mode)); - operands[4] = gen_reg_rtx (mode); - operands[5] = gen_reg_rtx (mode); + operands[3] = CONSTM1_RTX (mode); } ) -;; Convert two vectors of DF to SI and pack the results into a single vector. -(define_expand "vec_pack_fix_trunc_vnx2df" - [(set (match_dup 4) - (unspec:VNx4SI - [(match_dup 3) - (FIXUORS:VNx4SI (match_operand:VNx2DF 1 "register_operand"))] - UNSPEC_MERGE_PTRUE)) - (set (match_dup 5) - (unspec:VNx4SI - [(match_dup 3) - (FIXUORS:VNx4SI (match_operand:VNx2DF 2 "register_operand"))] - UNSPEC_MERGE_PTRUE)) - (set (match_operand:VNx4SI 0 "register_operand") - (unspec:VNx4SI [(match_dup 4) (match_dup 5)] UNSPEC_UZP1))] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:VNx2DI 0 "register_operand" "=w, ?&w") + (ANY_PLUS:VNx2DI + (vec_duplicate:VNx2DI + (zero_extend:DI + (unspec:SI + [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand" "Upa, Upa")] + UNSPEC_CNTP))) + (match_operand:VNx2DI_ONLY 1 "register_operand" "0, w")))] "TARGET_SVE" + "@ + p\t%0.d, %2 + movprfx\t%0, %1\;p\t%0.d, %2" + "&& !CONSTANT_P (operands[3])" { - operands[3] = force_reg (VNx2BImode, CONSTM1_RTX (VNx2BImode)); - operands[4] = gen_reg_rtx (VNx4SImode); - operands[5] = gen_reg_rtx (VNx4SImode); + operands[3] = CONSTM1_RTX (mode); } + [(set_attr "movprfx" "*,yes")] ) -;; Predicated floating-point operations with select. -(define_expand "cond_" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_operand: 1 "register_operand") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand") - (match_operand:SVE_F 3 "register_operand")] - SVE_COND_FP_BINARY) - (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] - UNSPEC_SEL))] +;; Increment a vector of SIs by the number of set bits in a predicate. +;; See aarch64_sve_cntp for a description of the operands. +(define_expand "@aarch64_sve__cntp" + [(set (match_operand:VNx4SI 0 "register_operand") + (ANY_PLUS:VNx4SI + (vec_duplicate:VNx4SI + (unspec:SI + [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand")] + UNSPEC_CNTP)) + (match_operand:VNx4SI_ONLY 1 "register_operand")))] "TARGET_SVE" + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Predicated floating-point operations with select matching output. -(define_insn "*cond__0" - [(set (match_operand:SVE_F 0 "register_operand" "+w, w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand" "0, w, w") - (match_operand:SVE_F 3 "register_operand" "w, 0, w")] - SVE_COND_FP_BINARY) - (match_dup 0)] - UNSPEC_SEL))] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:VNx4SI 0 "register_operand" "=w, ?&w") + (ANY_PLUS:VNx4SI + (vec_duplicate:VNx4SI + (unspec:SI + [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand" "Upa, Upa")] + UNSPEC_CNTP)) + (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w")))] "TARGET_SVE" "@ - \t%0., %1/m, %0., %3. - \t%0., %1/m, %0., %2. - movprfx\t%0, %1/m, %2\;\t%0., %1/m, %0., %3." 
- [(set_attr "movprfx" "*,*,yes")] + p\t%0.s, %2 + movprfx\t%0, %1\;p\t%0.s, %2" + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] ) -;; Predicated floating-point operations with select matching first operand. -(define_insn "*cond__2" - [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand" "0, w") - (match_operand:SVE_F 3 "register_operand" "w, w")] - SVE_COND_FP_BINARY) - (match_dup 2)] - UNSPEC_SEL))] +;; Increment a vector of HIs by the number of set bits in a predicate. +;; See aarch64_sve_cntp for a description of the operands. +(define_expand "@aarch64_sve__cntp" + [(set (match_operand:VNx8HI 0 "register_operand") + (ANY_PLUS:VNx8HI + (vec_duplicate:VNx8HI + (truncate:HI + (unspec:SI + [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand")] + UNSPEC_CNTP))) + (match_operand:VNx8HI_ONLY 1 "register_operand")))] "TARGET_SVE" - "@ - \t%0., %1/m, %0., %3. - movprfx\t%0, %2\;\t%0., %1/m, %0., %3." - [(set_attr "movprfx" "*,yes")] + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Predicated floating-point operations with select matching second operand. -(define_insn "*cond__3" - [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand" "w, w") - (match_operand:SVE_F 3 "register_operand" "0, w")] - SVE_COND_FP_BINARY) - (match_dup 3)] - UNSPEC_SEL))] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:VNx8HI 0 "register_operand" "=w, ?&w") + (ANY_PLUS:VNx8HI + (vec_duplicate:VNx8HI + (match_operator:HI 3 "subreg_lowpart_operator" + [(unspec:SI + [(match_operand 4) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand" "Upa, Upa")] + UNSPEC_CNTP)])) + (match_operand:VNx8HI_ONLY 1 "register_operand" "0, w")))] "TARGET_SVE" "@ - \t%0., %1/m, %0., %2. - movprfx\t%0, %3\;\t%0., %1/m, %0., %2." + p\t%0.h, %2 + movprfx\t%0, %1\;p\t%0.h, %2" + "&& !CONSTANT_P (operands[4])" + { + operands[4] = CONSTM1_RTX (mode); + } [(set_attr "movprfx" "*,yes")] ) -;; Predicated floating-point operations with select matching zero. -(define_insn "*cond__z" - [(set (match_operand:SVE_F 0 "register_operand" "=&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand" "w") - (match_operand:SVE_F 3 "register_operand" "w")] - SVE_COND_FP_BINARY) - (match_operand:SVE_F 4 "aarch64_simd_imm_zero")] - UNSPEC_SEL))] +;; ------------------------------------------------------------------------- +;; ---- [INT] Decrement by the number of elements in a predicate (scalar) +;; ------------------------------------------------------------------------- +;; Includes: +;; - DECP +;; - SQDECP +;; - UQDECP +;; ------------------------------------------------------------------------- + +;; Decrement a DImode register by the number of set bits in a predicate. +;; See aarch64_sve_cntp for a description of the operands. +(define_expand "@aarch64_sve__cntp" + [(set (match_operand:DI 0 "register_operand") + (ANY_MINUS:DI + (match_operand:DI_ONLY 1 "register_operand") + (zero_extend:DI + (unspec:SI [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand")] + UNSPEC_CNTP))))] "TARGET_SVE" - "movprfx\t%0., %1/z, %2.\;\t%0., %1/m, %0., %3." 
- [(set_attr "movprfx" "yes")] + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Synthetic predication of floating-point operations with select unmatched. -(define_insn_and_split "*cond__any" - [(set (match_operand:SVE_F 0 "register_operand" "=&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand" "w") - (match_operand:SVE_F 3 "register_operand" "w")] - SVE_COND_FP_BINARY) - (match_operand:SVE_F 4 "register_operand" "w")] - UNSPEC_SEL))] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:DI 0 "register_operand" "=r") + (ANY_MINUS:DI + (match_operand:DI_ONLY 1 "register_operand" "0") + (zero_extend:DI + (unspec:SI [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + UNSPEC_CNTP))))] "TARGET_SVE" - "#" - "&& reload_completed - && !(rtx_equal_p (operands[0], operands[4]) - || rtx_equal_p (operands[2], operands[4]) - || rtx_equal_p (operands[3], operands[4]))" - ; Not matchable by any one insn or movprfx insn. We need a separate select. - [(set (match_dup 0) - (unspec:SVE_F [(match_dup 1) (match_dup 2) (match_dup 4)] UNSPEC_SEL)) - (set (match_dup 0) - (unspec:SVE_F - [(match_dup 1) - (unspec:SVE_F [(match_dup 0) (match_dup 3)] SVE_COND_FP_BINARY) - (match_dup 0)] - UNSPEC_SEL))] + "p\t%x0, %2." + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Predicated floating-point ternary operations with select. -(define_expand "cond_" - [(set (match_operand:SVE_F 0 "register_operand") - (unspec:SVE_F - [(match_operand: 1 "register_operand") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand") - (match_operand:SVE_F 3 "register_operand") - (match_operand:SVE_F 4 "register_operand")] - SVE_COND_FP_TERNARY) - (match_operand:SVE_F 5 "aarch64_simd_reg_or_zero")] - UNSPEC_SEL))] - "TARGET_SVE" -{ - /* Swap the multiplication operands if the fallback value is the - second of the two. */ - if (rtx_equal_p (operands[3], operands[5])) - std::swap (operands[2], operands[3]); -}) +;; Decrement an SImode register by the number of set bits in a predicate +;; using modular arithmetic. See aarch64_sve_cntp for a description of the +;; operands. +(define_insn_and_rewrite "*aarch64_decsi_cntp" + [(set (match_operand:SI 0 "register_operand" "=r") + (minus:SI + (match_operand:SI 1 "register_operand" "0") + (unspec:SI [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + UNSPEC_CNTP)))] + "TARGET_SVE" + "decp\t%x0, %2." + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (mode); + } +) -;; Predicated floating-point ternary operations using the FMAD-like form. -(define_insn "*cond__2" - [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand" "0, w") - (match_operand:SVE_F 3 "register_operand" "w, w") - (match_operand:SVE_F 4 "register_operand" "w, w")] - SVE_COND_FP_TERNARY) - (match_dup 2)] - UNSPEC_SEL))] +;; Decrement an SImode register by the number of set bits in a predicate +;; using saturating arithmetic, extending the result to 64 bits. +;; +;; See aarch64_sve_cntp for a description of the operands. 
+(define_expand "@aarch64_sve__cntp" + [(set (match_operand:DI 0 "register_operand") + (:DI + (SAT_MINUS:SI + (match_operand:SI_ONLY 1 "register_operand") + (unspec:SI [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand")] + UNSPEC_CNTP))))] "TARGET_SVE" - "@ - \t%0., %1/m, %3., %4. - movprfx\t%0, %2\;\t%0., %1/m, %3., %4." - [(set_attr "movprfx" "*,yes")] + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Predicated floating-point ternary operations using the FMLA-like form. -(define_insn "*cond__4" - [(set (match_operand:SVE_F 0 "register_operand" "=w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand" "w, w") - (match_operand:SVE_F 3 "register_operand" "w, w") - (match_operand:SVE_F 4 "register_operand" "0, w")] - SVE_COND_FP_TERNARY) - (match_dup 4)] - UNSPEC_SEL))] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:DI 0 "register_operand" "=r") + (:DI + (SAT_MINUS:SI + (match_operand:SI_ONLY 1 "register_operand" "0") + (unspec:SI [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand:PRED_ALL 2 "register_operand" "Upa")] + UNSPEC_CNTP))))] "TARGET_SVE" - "@ - \t%0., %1/m, %2., %3. - movprfx\t%0, %4\;\t%0., %1/m, %2., %3." - [(set_attr "movprfx" "*,yes")] + { + if ( == SS_MINUS) + return "p\t%x0, %2., %w0"; + else + return "p\t%w0, %2."; + } + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Predicated floating-point ternary operations in which the value for -;; inactive lanes is distinct from the other inputs. -(define_insn_and_split "*cond__any" - [(set (match_operand:SVE_F 0 "register_operand" "=&w, &w, ?&w") - (unspec:SVE_F - [(match_operand: 1 "register_operand" "Upl, Upl, Upl") - (unspec:SVE_F - [(match_operand:SVE_F 2 "register_operand" "w, w, w") - (match_operand:SVE_F 3 "register_operand" "w, w, w") - (match_operand:SVE_F 4 "register_operand" "w, w, w")] - SVE_COND_FP_TERNARY) - (match_operand:SVE_F 5 "aarch64_simd_reg_or_zero" "Dz, 0, w")] - UNSPEC_SEL))] - "TARGET_SVE - && !rtx_equal_p (operands[2], operands[5]) - && !rtx_equal_p (operands[3], operands[5]) - && !rtx_equal_p (operands[4], operands[5])" - "@ - movprfx\t%0., %1/z, %4.\;\t%0., %1/m, %2., %3. - movprfx\t%0., %1/m, %4.\;\t%0., %1/m, %2., %3. - #" - "&& reload_completed - && !CONSTANT_P (operands[5]) - && !rtx_equal_p (operands[0], operands[5])" - [(set (match_dup 0) - (unspec:SVE_F [(match_dup 1) (match_dup 4) (match_dup 5)] UNSPEC_SEL)) - (set (match_dup 0) - (unspec:SVE_F - [(match_dup 1) - (unspec:SVE_F [(match_dup 2) (match_dup 3) (match_dup 0)] - SVE_COND_FP_TERNARY) - (match_dup 0)] - UNSPEC_SEL))] - "" - [(set_attr "movprfx" "yes")] +;; ------------------------------------------------------------------------- +;; ---- [INT] Decrement by the number of elements in a predicate (vector) +;; ------------------------------------------------------------------------- +;; Includes: +;; - DECP +;; - SQDECP +;; - UQDECP +;; ------------------------------------------------------------------------- + +;; Decrement a vector of DIs by the number of set bits in a predicate. +;; See aarch64_sve_cntp for a description of the operands. 
+(define_expand "@aarch64_sve__cntp" + [(set (match_operand:VNx2DI 0 "register_operand") + (ANY_MINUS:VNx2DI + (match_operand:VNx2DI_ONLY 1 "register_operand") + (vec_duplicate:VNx2DI + (zero_extend:DI + (unspec:SI + [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand")] + UNSPEC_CNTP)))))] + "TARGET_SVE" + { + operands[3] = CONSTM1_RTX (mode); + } ) -;; Shift an SVE vector left and insert a scalar into element 0. -(define_insn "vec_shl_insert_" - [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w") - (unspec:SVE_ALL - [(match_operand:SVE_ALL 1 "register_operand" "0, 0") - (match_operand: 2 "register_operand" "rZ, w")] - UNSPEC_INSR))] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:VNx2DI 0 "register_operand" "=w, ?&w") + (ANY_MINUS:VNx2DI + (match_operand:VNx2DI_ONLY 1 "register_operand" "0, w") + (vec_duplicate:VNx2DI + (zero_extend:DI + (unspec:SI + [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand" "Upa, Upa")] + UNSPEC_CNTP)))))] "TARGET_SVE" "@ - insr\t%0., %2 - insr\t%0., %2" + p\t%0.d, %2 + movprfx\t%0, %1\;p\t%0.d, %2" + "&& !CONSTANT_P (operands[3])" + { + operands[3] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] ) -(define_expand "copysign3" - [(match_operand:SVE_F 0 "register_operand") - (match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "register_operand")] +;; Decrement a vector of SIs by the number of set bits in a predicate. +;; See aarch64_sve_cntp for a description of the operands. +(define_expand "@aarch64_sve__cntp" + [(set (match_operand:VNx4SI 0 "register_operand") + (ANY_MINUS:VNx4SI + (match_operand:VNx4SI_ONLY 1 "register_operand") + (vec_duplicate:VNx4SI + (unspec:SI + [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand")] + UNSPEC_CNTP))))] "TARGET_SVE" { - rtx sign = gen_reg_rtx (mode); - rtx mant = gen_reg_rtx (mode); - rtx int_res = gen_reg_rtx (mode); - int bits = GET_MODE_UNIT_BITSIZE (mode) - 1; - - rtx arg1 = lowpart_subreg (mode, operands[1], mode); - rtx arg2 = lowpart_subreg (mode, operands[2], mode); - - emit_insn (gen_and3 - (sign, arg2, - aarch64_simd_gen_const_vector_dup (mode, - HOST_WIDE_INT_M1U - << bits))); - emit_insn (gen_and3 - (mant, arg1, - aarch64_simd_gen_const_vector_dup (mode, - ~(HOST_WIDE_INT_M1U - << bits)))); - emit_insn (gen_ior3 (int_res, sign, mant)); - emit_move_insn (operands[0], gen_lowpart (mode, int_res)); - DONE; + operands[3] = CONSTM1_RTX (mode); } ) -(define_expand "xorsign3" - [(match_operand:SVE_F 0 "register_operand") - (match_operand:SVE_F 1 "register_operand") - (match_operand:SVE_F 2 "register_operand")] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:VNx4SI 0 "register_operand" "=w, ?&w") + (ANY_MINUS:VNx4SI + (match_operand:VNx4SI_ONLY 1 "register_operand" "0, w") + (vec_duplicate:VNx4SI + (unspec:SI + [(match_operand 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand" "Upa, Upa")] + UNSPEC_CNTP))))] "TARGET_SVE" + "@ + p\t%0.s, %2 + movprfx\t%0, %1\;p\t%0.s, %2" + "&& !CONSTANT_P (operands[3])" { - rtx sign = gen_reg_rtx (mode); - rtx int_res = gen_reg_rtx (mode); - int bits = GET_MODE_UNIT_BITSIZE (mode) - 1; - - rtx arg1 = lowpart_subreg (mode, operands[1], mode); - rtx arg2 = lowpart_subreg (mode, operands[2], mode); + operands[3] = CONSTM1_RTX (mode); + } + [(set_attr "movprfx" "*,yes")] +) - emit_insn (gen_and3 - (sign, arg2, - aarch64_simd_gen_const_vector_dup (mode, - HOST_WIDE_INT_M1U - << bits))); - 
emit_insn (gen_xor3 (int_res, arg1, sign)); - emit_move_insn (operands[0], gen_lowpart (mode, int_res)); - DONE; +;; Decrement a vector of HIs by the number of set bits in a predicate. +;; See aarch64_sve_cntp for a description of the operands. +(define_expand "@aarch64_sve__cntp" + [(set (match_operand:VNx8HI 0 "register_operand") + (ANY_MINUS:VNx8HI + (match_operand:VNx8HI_ONLY 1 "register_operand") + (vec_duplicate:VNx8HI + (truncate:HI + (unspec:SI + [(match_dup 3) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand")] + UNSPEC_CNTP)))))] + "TARGET_SVE" + { + operands[3] = CONSTM1_RTX (mode); } ) -;; Unpredicated DOT product. -(define_insn "dot_prod" - [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w") - (plus:SVE_SDI - (unspec:SVE_SDI - [(match_operand: 1 "register_operand" "w, w") - (match_operand: 2 "register_operand" "w, w")] - DOTPROD) - (match_operand:SVE_SDI 3 "register_operand" "0, w")))] +(define_insn_and_rewrite "*aarch64_sve__cntp" + [(set (match_operand:VNx8HI 0 "register_operand" "=w, ?&w") + (ANY_MINUS:VNx8HI + (match_operand:VNx8HI_ONLY 1 "register_operand" "0, w") + (vec_duplicate:VNx8HI + (match_operator:HI 3 "subreg_lowpart_operator" + [(unspec:SI + [(match_operand 4) + (const_int SVE_KNOWN_PTRUE) + (match_operand: 2 "register_operand" "Upa, Upa")] + UNSPEC_CNTP)]))))] "TARGET_SVE" "@ - dot\\t%0., %1., %2. - movprfx\t%0, %3\;dot\\t%0., %1., %2." + p\t%0.h, %2 + movprfx\t%0, %1\;p\t%0.h, %2" + "&& !CONSTANT_P (operands[4])" + { + operands[4] = CONSTM1_RTX (mode); + } [(set_attr "movprfx" "*,yes")] ) diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index 2b1ec85ae..a6a14b7fc 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,ares,neoversen1,neoversee1,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 5dfcaf57e..ee85bb4e2 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -73,6 +73,8 @@ #include "selftest-rtl.h" #include "rtx-vector-builder.h" #include "intl.h" +#include "expmed.h" +#include "function-abi.h" /* This file should be included last. */ #include "target-def.h" @@ -83,7 +85,7 @@ /* Information about a legitimate vector immediate operand. 
*/ struct simd_immediate_info { - enum insn_type { MOV, MVN }; + enum insn_type { MOV, MVN, INDEX, PTRUE }; enum modifier_type { LSL, MSL }; simd_immediate_info () {} @@ -92,33 +94,51 @@ struct simd_immediate_info insn_type = MOV, modifier_type = LSL, unsigned int = 0); simd_immediate_info (scalar_mode, rtx, rtx); + simd_immediate_info (scalar_int_mode, aarch64_svpattern); /* The mode of the elements. */ scalar_mode elt_mode; - /* The value of each element if all elements are the same, or the - first value if the constant is a series. */ - rtx value; - - /* The value of the step if the constant is a series, null otherwise. */ - rtx step; - /* The instruction to use to move the immediate into a vector. */ insn_type insn; - /* The kind of shift modifier to use, and the number of bits to shift. - This is (LSL, 0) if no shift is needed. */ - modifier_type modifier; - unsigned int shift; + union + { + /* For MOV and MVN. */ + struct + { + /* The value of each element. */ + rtx value; + + /* The kind of shift modifier to use, and the number of bits to shift. + This is (LSL, 0) if no shift is needed. */ + modifier_type modifier; + unsigned int shift; + } mov; + + /* For INDEX. */ + struct + { + /* The value of the first element and the step to be added for each + subsequent element. */ + rtx base, step; + } index; + + /* For PTRUE. */ + aarch64_svpattern pattern; + } u; }; /* Construct a floating-point immediate in which each element has mode ELT_MODE_IN and value VALUE_IN. */ inline simd_immediate_info ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in) - : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV), - modifier (LSL), shift (0) -{} + : elt_mode (elt_mode_in), insn (MOV) +{ + u.mov.value = value_in; + u.mov.modifier = LSL; + u.mov.shift = 0; +} /* Construct an integer immediate in which each element has mode ELT_MODE_IN and value VALUE_IN. The other parameters are as for the structure @@ -128,17 +148,32 @@ inline simd_immediate_info unsigned HOST_WIDE_INT value_in, insn_type insn_in, modifier_type modifier_in, unsigned int shift_in) - : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)), - step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in) -{} + : elt_mode (elt_mode_in), insn (insn_in) +{ + u.mov.value = gen_int_mode (value_in, elt_mode_in); + u.mov.modifier = modifier_in; + u.mov.shift = shift_in; +} /* Construct an integer immediate in which each element has mode ELT_MODE_IN - and where element I is equal to VALUE_IN + I * STEP_IN. */ + and where element I is equal to BASE_IN + I * STEP_IN. */ +inline simd_immediate_info +::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in) + : elt_mode (elt_mode_in), insn (INDEX) +{ + u.index.base = base_in; + u.index.step = step_in; +} + +/* Construct a predicate that controls elements of mode ELT_MODE_IN + and has PTRUE pattern PATTERN_IN. */ inline simd_immediate_info -::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in) - : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV), - modifier (LSL), shift (0) -{} +::simd_immediate_info (scalar_int_mode elt_mode_in, + aarch64_svpattern pattern_in) + : elt_mode (elt_mode_in), insn (PTRUE) +{ + u.pattern = pattern_in; +} /* The current code model. */ enum aarch64_code_model aarch64_cmodel; @@ -177,7 +212,7 @@ unsigned aarch64_architecture_version; enum aarch64_processor aarch64_tune = cortexa53; /* Mask to specify which instruction scheduling options should be used. 
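In the aarch64.c hunk above, simd_immediate_info now keeps its MOV/MVN payload, its INDEX payload and its PTRUE pattern in a union selected by the insn field. The INDEX form describes a vector constant whose element i equals base + i * step, which is what the SVE INDEX instruction materializes; a throwaway sketch of that constant (invented name, int elements used only for illustration):

/* Illustrative sketch only: the series constant described by the new
   INDEX form -- element i is base + i * step.  */
static void
index_constant_model (int nelts, int base, int step, int *out)
{
  for (int i = 0; i < nelts; i++)
    out[i] = base + i * step;
}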
*/ -unsigned long aarch64_tune_flags = 0; +uint64_t aarch64_tune_flags = 0; /* Global flag for PC relative loads. */ bool aarch64_pcrelative_literal_loads; @@ -693,7 +728,7 @@ static const struct tune_params generic_tunings = 4, /* memmov_cost */ 2, /* issue_rate */ (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ - "8", /* function_align. */ + "16:12", /* function_align. */ "4", /* jump_align. */ "8", /* loop_align. */ 2, /* int_reassoc_width. */ @@ -1139,7 +1174,7 @@ struct processor enum aarch64_processor sched_core; enum aarch64_arch arch; unsigned architecture_version; - const unsigned long flags; + const uint64_t flags; const struct tune_params *const tune; }; @@ -1172,15 +1207,46 @@ static const struct processor *selected_arch; static const struct processor *selected_cpu; static const struct processor *selected_tune; +enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A; + /* The current tuning set. */ struct tune_params aarch64_tune_params = generic_tunings; +/* Check whether an 'aarch64_vector_pcs' attribute is valid. */ + +static tree +handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree, + int, bool *no_add_attrs) +{ + /* Since we set fn_type_req to true, the caller should have checked + this for us. */ + gcc_assert (FUNC_OR_METHOD_TYPE_P (*node)); + switch ((arm_pcs) fntype_abi (*node).id ()) + { + case ARM_PCS_AAPCS64: + case ARM_PCS_SIMD: + return NULL_TREE; + + case ARM_PCS_SVE: + error ("the %qE attribute cannot be applied to an SVE function type", + name); + *no_add_attrs = true; + return NULL_TREE; + + case ARM_PCS_TLSDESC: + case ARM_PCS_UNKNOWN: + break; + } + gcc_unreachable (); +} + /* Table of machine attributes. */ static const struct attribute_spec aarch64_attribute_table[] = { /* { name, min_len, max_len, decl_req, type_req, fn_type_req, affects_type_identity, handler, exclude } */ - { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL }, + { "aarch64_vector_pcs", 0, 0, false, true, true, true, + handle_aarch64_vector_pcs_attribute, NULL }, { NULL, 0, 0, false, false, false, false, NULL, NULL } }; @@ -1241,6 +1307,7 @@ static enum aarch64_parse_opt_result aarch64_handle_standard_branch_protection (char* str, char* rest) { aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; + aarch64_ra_sign_key = AARCH64_KEY_A; aarch64_enable_bti = 1; if (rest) { @@ -1255,6 +1322,7 @@ aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED, char* rest ATTRIBUTE_UNUSED) { aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; + aarch64_ra_sign_key = AARCH64_KEY_A; return AARCH64_PARSE_OK; } @@ -1266,6 +1334,14 @@ aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED, return AARCH64_PARSE_OK; } +static enum aarch64_parse_opt_result +aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED, + char* rest ATTRIBUTE_UNUSED) +{ + aarch64_ra_sign_key = AARCH64_KEY_B; + return AARCH64_PARSE_OK; +} + static enum aarch64_parse_opt_result aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED, char* rest ATTRIBUTE_UNUSED) @@ -1276,6 +1352,7 @@ aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED, static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = { { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 }, + { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 }, { NULL, NULL, NULL, 0 } }; @@ -1295,6 +1372,66 @@ static const char * const aarch64_condition_codes[] = "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" }; +/* The preferred condition codes for SVE conditions. 
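For reviewers of the -mbranch-protection changes above: the option value is a '+'-separated list whose tokens are dispatched to the subtype handlers just added (pac-ret gains a b-key subtype that selects AARCH64_KEY_B). A hypothetical sketch of that decomposition, not the GCC parser itself:

#include <stdio.h>
#include <string.h>

int
main (void)
{
  // Hypothetical example value; "b-key" is the new pac-ret subtype above.
  char value[] = "pac-ret+leaf+b-key";
  for (char *tok = strtok (value, "+"); tok; tok = strtok (NULL, "+"))
    printf ("dispatch subtype handler for \"%s\"\n", tok);
  return 0;
}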
*/ +static const char *const aarch64_sve_condition_codes[] = +{ + "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc", + "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv" +}; + +/* Return the assembly token for svpattern value VALUE. */ + +static const char * +svpattern_token (enum aarch64_svpattern pattern) +{ + switch (pattern) + { +#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER; + AARCH64_FOR_SVPATTERN (CASE) +#undef CASE + case AARCH64_NUM_SVPATTERNS: + break; + } + gcc_unreachable (); +} + +/* Return the descriptor of the SIMD ABI. */ + +static const predefined_function_abi & +aarch64_simd_abi (void) +{ + predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD]; + if (!simd_abi.initialized_p ()) + { + HARD_REG_SET full_reg_clobbers + = default_function_abi.full_reg_clobbers (); + for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (FP_SIMD_SAVED_REGNUM_P (regno)) + CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); + simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers); + } + return simd_abi; +} + +/* Return the descriptor of the SVE PCS. */ + +static const predefined_function_abi & +aarch64_sve_abi (void) +{ + predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE]; + if (!sve_abi.initialized_p ()) + { + HARD_REG_SET full_reg_clobbers + = default_function_abi.full_reg_clobbers (); + for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno) + CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); + for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno) + CLEAR_HARD_REG_BIT (full_reg_clobbers, regno); + sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers); + } + return sve_abi; +} + /* Generate code to enable conditional branches in functions over 1 MiB. */ const char * aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest, @@ -1337,6 +1474,14 @@ aarch64_err_no_fpadvsimd (machine_mode mode) " vector types", "+nofp"); } +/* Return true if REGNO is P0-P15 or one of the special FFR-related + registers. */ +inline bool +pr_or_ffr_regnum_p (unsigned int regno) +{ + return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM; +} + /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS. The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much @@ -1413,6 +1558,16 @@ aarch64_dbx_register_number (unsigned regno) return DWARF_FRAME_REGISTERS; } +/* If X is a CONST_DOUBLE, return its bit representation as a constant + integer, otherwise return X unmodified. */ +static rtx +aarch64_bit_representation (rtx x) +{ + if (CONST_DOUBLE_P (x)) + x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x); + return x; +} + /* Return true if MODE is any of the Advanced SIMD structure modes. */ static bool aarch64_advsimd_struct_mode_p (machine_mode mode) @@ -1439,6 +1594,9 @@ const unsigned int VEC_SVE_PRED = 4; /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate a structure of 2, 3 or 4 vectors. */ const unsigned int VEC_STRUCT = 8; +/* Can be used in combination with VEC_SVE_DATA to indicate that the + vector has fewer significant bytes than a full SVE vector. */ +const unsigned int VEC_PARTIAL = 16; /* Useful combinations of the above. 
*/ const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED; const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA; @@ -1454,34 +1612,84 @@ aarch64_classify_vector_mode (machine_mode mode) if (aarch64_sve_pred_mode_p (mode)) return VEC_SVE_PRED; - scalar_mode inner = GET_MODE_INNER (mode); - if (VECTOR_MODE_P (mode) - && (inner == QImode - || inner == HImode - || inner == HFmode - || inner == SImode - || inner == SFmode - || inner == DImode - || inner == DFmode)) - { - if (TARGET_SVE) - { - if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR)) - return VEC_SVE_DATA; - if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2) - || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3) - || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4)) - return VEC_SVE_DATA | VEC_STRUCT; - } + /* Make the decision based on the mode's enum value rather than its + properties, so that we keep the correct classification regardless + of -msve-vector-bits. */ + switch (mode) + { + /* Partial SVE QI vectors. */ + case E_VNx2QImode: + case E_VNx4QImode: + case E_VNx8QImode: + /* Partial SVE HI vectors. */ + case E_VNx2HImode: + case E_VNx4HImode: + /* Partial SVE SI vector. */ + case E_VNx2SImode: + return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0; + + case E_VNx16QImode: + case E_VNx8HImode: + case E_VNx4SImode: + case E_VNx2DImode: + case E_VNx8BFmode: + case E_VNx8HFmode: + case E_VNx4SFmode: + case E_VNx2DFmode: + return TARGET_SVE ? VEC_SVE_DATA : 0; + + /* x2 SVE vectors. */ + case E_VNx32QImode: + case E_VNx16HImode: + case E_VNx8SImode: + case E_VNx4DImode: + case E_VNx16BFmode: + case E_VNx16HFmode: + case E_VNx8SFmode: + case E_VNx4DFmode: + /* x3 SVE vectors. */ + case E_VNx48QImode: + case E_VNx24HImode: + case E_VNx12SImode: + case E_VNx6DImode: + case E_VNx24BFmode: + case E_VNx24HFmode: + case E_VNx12SFmode: + case E_VNx6DFmode: + /* x4 SVE vectors. */ + case E_VNx64QImode: + case E_VNx32HImode: + case E_VNx16SImode: + case E_VNx8DImode: + case E_VNx32BFmode: + case E_VNx32HFmode: + case E_VNx16SFmode: + case E_VNx8DFmode: + return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0; + + /* 64-bit Advanced SIMD vectors. */ + case E_V8QImode: + case E_V4HImode: + case E_V2SImode: + /* ...E_V1DImode doesn't exist. */ + case E_V4HFmode: + case E_V4BFmode: + case E_V2SFmode: + case E_V1DFmode: + /* 128-bit Advanced SIMD vectors. */ + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V8HFmode: + case E_V8BFmode: + case E_V4SFmode: + case E_V2DFmode: + return TARGET_SIMD ? VEC_ADVSIMD : 0; - /* This includes V1DF but not V1DI (which doesn't exist). */ - if (TARGET_SIMD - && (known_eq (GET_MODE_BITSIZE (mode), 64) - || known_eq (GET_MODE_BITSIZE (mode), 128))) - return VEC_ADVSIMD; + default: + return 0; } - - return 0; } /* Return true if MODE is any of the data vector modes, including @@ -1492,6 +1700,14 @@ aarch64_vector_data_mode_p (machine_mode mode) return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA; } +/* Return true if MODE is any form of SVE mode, including predicates, + vectors and structures. */ +bool +aarch64_sve_mode_p (machine_mode mode) +{ + return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE; +} + /* Return true if MODE is an SVE data vector mode; either a single vector or a structure of vectors. 
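The switch above replaces the old size-based test with an explicit per-mode classification. A few illustrative outcomes, assuming both TARGET_SVE and TARGET_SIMD are enabled and using the flag names defined above:

#include <stdio.h>

int
main (void)
{
  puts ("VNx2SImode -> VEC_SVE_DATA | VEC_PARTIAL  (partial SVE vector)");
  puts ("VNx4SImode -> VEC_SVE_DATA                (full SVE vector)");
  puts ("VNx8SImode -> VEC_SVE_DATA | VEC_STRUCT   (x2 tuple)");
  puts ("V4SImode   -> VEC_ADVSIMD                 (128-bit Advanced SIMD)");
  return 0;
}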
*/ static bool @@ -1500,6 +1716,24 @@ aarch64_sve_data_mode_p (machine_mode mode) return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; } +/* Return the number of defined bytes in one constituent vector of + SVE mode MODE, which has vector flags VEC_FLAGS. */ +static poly_int64 +aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags) +{ + if (vec_flags & VEC_PARTIAL) + /* A single partial vector. */ + return GET_MODE_SIZE (mode); + + if (vec_flags & VEC_SVE_DATA) + /* A single vector or a tuple. */ + return BYTES_PER_SVE_VECTOR; + + /* A single predicate. */ + gcc_assert (vec_flags & VEC_SVE_PRED); + return BYTES_PER_SVE_PRED; +} + /* Implement target hook TARGET_ARRAY_MODE. */ static opt_machine_mode aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems) @@ -1582,6 +1816,43 @@ aarch64_vectorize_related_mode (machine_mode vector_mode, return default_vectorize_related_mode (vector_mode, element_mode, nunits); } +/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */ + +opt_machine_mode +aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits) +{ + enum mode_class mclass = (is_a (inner_mode) + ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT); + machine_mode mode; + FOR_EACH_MODE_IN_CLASS (mode, mclass) + if (inner_mode == GET_MODE_INNER (mode) + && known_eq (nunits, GET_MODE_NUNITS (mode)) + && aarch64_sve_data_mode_p (mode)) + return mode; + return opt_machine_mode (); +} + +/* Return the integer element mode associated with SVE mode MODE. */ + +static scalar_int_mode +aarch64_sve_element_int_mode (machine_mode mode) +{ + unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR, + GET_MODE_NUNITS (mode)); + return int_mode_for_size (elt_bits, 0).require (); +} + +/* Return the integer vector mode associated with SVE mode MODE. + Unlike mode_for_int_vector, this can handle the case in which + MODE is a predicate (and thus has a different total size). */ + +machine_mode +aarch64_sve_int_mode (machine_mode mode) +{ + scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode); + return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require (); +} + /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, prefer to use the first arithmetic operand as the else value if the else value doesn't matter, since that exactly matches the SVE @@ -1610,13 +1881,19 @@ aarch64_hard_regno_nregs (unsigned regno, machine_mode mode) { case FP_REGS: case FP_LO_REGS: - if (aarch64_sve_data_mode_p (mode)) - return exact_div (GET_MODE_SIZE (mode), - BYTES_PER_SVE_VECTOR).to_constant (); - return CEIL (lowest_size, UNITS_PER_VREG); + case FP_LO8_REGS: + { + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_SVE_DATA) + return exact_div (GET_MODE_SIZE (mode), + aarch64_vl_bytes (mode, vec_flags)).to_constant (); + return CEIL (lowest_size, UNITS_PER_VREG); + } case PR_REGS: case PR_LO_REGS: case PR_HI_REGS: + case FFR_REGS: + case PR_AND_FFR_REGS: return 1; default: return CEIL (lowest_size, UNITS_PER_WORD); @@ -1637,11 +1914,16 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) return mode == DImode; unsigned int vec_flags = aarch64_classify_vector_mode (mode); + /* At the moment, partial vector modes are only useful for memory + references, but that could change in future. 
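A worked example of the register-count calculation above, under the simplifying assumption of a fixed 256-bit vector length so that the poly_int sizes collapse to plain integers:

#include <stdio.h>

int
main (void)
{
  const int vl_bytes = 32;               // bytes per Z register at VL = 256
  const int tuple_bytes = 4 * vl_bytes;  // an x4 tuple such as VNx8DFmode
  printf ("x4 tuple needs %d Z registers\n", tuple_bytes / vl_bytes);  // 4
  printf ("any predicate mode needs 1 P register\n");
  return 0;
}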
*/ + if (vec_flags & VEC_PARTIAL) + return false; + if (vec_flags & VEC_SVE_PRED) - return PR_REGNUM_P (regno); + return pr_or_ffr_regnum_p (regno); - if (PR_REGNUM_P (regno)) - return 0; + if (pr_or_ffr_regnum_p (regno)) + return false; if (regno == SP_REGNUM) /* The purpose of comparing with ptr_mode is to support the @@ -1670,102 +1952,184 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) return false; } -/* Return true if this is a definition of a vectorized simd function. */ +/* Return true if TYPE is a type that should be passed or returned in + SVE registers, assuming enough registers are available. When returning + true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers + respectively. */ static bool -aarch64_simd_decl_p (tree fndecl) +aarch64_sve_argument_p (const_tree type, unsigned int *num_zr, + unsigned int *num_pr) { - tree fntype; - - if (fndecl == NULL) - return false; - fntype = TREE_TYPE (fndecl); - if (fntype == NULL) - return false; + if (aarch64_sve::svbool_type_p (type)) + { + *num_pr = 1; + *num_zr = 0; + return true; + } - /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */ - if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL) - return true; + if (unsigned int nvectors = aarch64_sve::nvectors_if_data_type (type)) + { + *num_pr = 0; + *num_zr = nvectors; + return true; + } return false; } -/* Return the mode a register save/restore should use. DImode for integer - registers, DFmode for FP registers in non-SIMD functions (they only save - the bottom half of a 128 bit register), or TFmode for FP registers in - SIMD functions. */ +/* Return true if a function with type FNTYPE returns its value in + SVE vector or predicate registers. */ -static machine_mode -aarch64_reg_save_mode (tree fndecl, unsigned regno) +static bool +aarch64_returns_value_in_sve_regs_p (const_tree fntype) { - return GP_REGNUM_P (regno) - ? E_DImode - : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode); + unsigned int num_zr, num_pr; + tree return_type = TREE_TYPE (fntype); + return (return_type != error_mark_node + && aarch64_sve_argument_p (return_type, &num_zr, &num_pr)); } -/* Return true if the instruction is a call to a SIMD function, false - if it is not a SIMD function or if we do not know anything about - the function. */ +/* Return true if a function with type FNTYPE takes arguments in + SVE vector or predicate registers. 
*/ static bool -aarch64_simd_call_p (rtx_insn *insn) +aarch64_takes_arguments_in_sve_regs_p (const_tree fntype) { - rtx symbol; - rtx call; - tree fndecl; + CUMULATIVE_ARGS args_so_far_v; + aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX, + NULL_TREE, 0, true); + cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v); - gcc_assert (CALL_P (insn)); - call = get_call_rtx_from (insn); - symbol = XEXP (XEXP (call, 0), 0); - if (GET_CODE (symbol) != SYMBOL_REF) - return false; - fndecl = SYMBOL_REF_DECL (symbol); - if (!fndecl) - return false; + for (tree chain = TYPE_ARG_TYPES (fntype); + chain && chain != void_list_node; + chain = TREE_CHAIN (chain)) + { + tree arg_type = TREE_VALUE (chain); + if (arg_type == error_mark_node) + return false; + + function_arg_info arg (arg_type, /*named=*/true); + apply_pass_by_reference_rules (&args_so_far_v, arg); + unsigned int num_zr, num_pr; + if (aarch64_sve_argument_p (arg.type, &num_zr, &num_pr)) + return true; - return aarch64_simd_decl_p (fndecl); + targetm.calls.function_arg_advance (args_so_far, arg); + } + return false; } -/* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls - a function that uses the SIMD ABI, take advantage of the extra - call-preserved registers that the ABI provides. */ +/* Implement TARGET_FNTYPE_ABI. */ -void -aarch64_remove_extra_call_preserved_regs (rtx_insn *insn, - HARD_REG_SET *return_set) +static const predefined_function_abi & +aarch64_fntype_abi (const_tree fntype) { - if (aarch64_simd_call_p (insn)) - { - for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (FP_SIMD_SAVED_REGNUM_P (regno)) - CLEAR_HARD_REG_BIT (*return_set, regno); - } + if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype))) + return aarch64_simd_abi (); + + if (aarch64_returns_value_in_sve_regs_p (fntype) + || aarch64_takes_arguments_in_sve_regs_p (fntype)) + return aarch64_sve_abi (); + + return default_function_abi; } -/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves - the lower 64 bits of a 128-bit register. Tell the compiler the callee - clobbers the top 64 bits when restoring the bottom 64 bits. */ +/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */ static bool -aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno, - machine_mode mode) +aarch64_compatible_vector_types_p (const_tree type1, const_tree type2) +{ + return (aarch64_sve::builtin_type_p (type1) + == aarch64_sve::builtin_type_p (type2)); +} + +/* Return true if we should emit CFI for register REGNO. */ + +static bool +aarch64_emit_cfi_for_reg_p (unsigned int regno) { - bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn); - return FP_REGNUM_P (regno) - && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8); + return (GP_REGNUM_P (regno) + || !default_function_abi.clobbers_full_reg_p (regno)); } -/* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */ +/* Return the mode we should use to save and restore register REGNO. */ -rtx_insn * -aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2) +static machine_mode +aarch64_reg_save_mode (unsigned int regno) { - gcc_assert (CALL_P (call_1) && CALL_P (call_2)); + if (GP_REGNUM_P (regno)) + return DImode; - if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2)) - return call_1; - else - return call_2; + if (FP_REGNUM_P (regno)) + switch (crtl->abi->id ()) + { + case ARM_PCS_AAPCS64: + /* Only the low 64 bits are saved by the base PCS. 
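A compact restatement of the decision order in aarch64_fntype_abi above, as a standalone sketch; the boolean parameters stand in for the attribute lookup and the SVE argument/return checks:

#include <stdio.h>

static const char *
select_pcs (int has_vector_pcs_attr, int uses_sve_args_or_return)
{
  if (has_vector_pcs_attr)
    return "ARM_PCS_SIMD";      // explicit aarch64_vector_pcs attribute
  if (uses_sve_args_or_return)
    return "ARM_PCS_SVE";       // SVE vectors or predicates in the signature
  return "ARM_PCS_AAPCS64";     // default base PCS
}

int
main (void)
{
  printf ("%s\n", select_pcs (0, 1));   // ARM_PCS_SVE
  return 0;
}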
*/ + return DFmode; + + case ARM_PCS_SIMD: + /* The vector PCS saves the low 128 bits (which is the full + register on non-SVE targets). */ + return TFmode; + + case ARM_PCS_SVE: + /* Use vectors of DImode for registers that need frame + information, so that the first 64 bytes of the save slot + are always the equivalent of what storing D would give. */ + if (aarch64_emit_cfi_for_reg_p (regno)) + return VNx2DImode; + + /* Use vectors of bytes otherwise, so that the layout is + endian-agnostic, and so that we can use LDR and STR for + big-endian targets. */ + return VNx16QImode; + + case ARM_PCS_TLSDESC: + case ARM_PCS_UNKNOWN: + break; + } + + if (PR_REGNUM_P (regno)) + /* Save the full predicate register. */ + return VNx16BImode; + + gcc_unreachable (); +} + +/* Implement TARGET_INSN_CALLEE_ABI. */ + +const predefined_function_abi & +aarch64_insn_callee_abi (const rtx_insn *insn) +{ + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); + rtx unspec = XVECEXP (pat, 0, 1); + gcc_assert (GET_CODE (unspec) == UNSPEC + && XINT (unspec, 1) == UNSPEC_CALLEE_ABI); + return function_abis[INTVAL (XVECEXP (unspec, 0, 0))]; +} + +/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves + the lower 64 bits of a 128-bit register. Tell the compiler the callee + clobbers the top 64 bits when restoring the bottom 64 bits. */ + +static bool +aarch64_hard_regno_call_part_clobbered (unsigned int abi_id, + unsigned int regno, + machine_mode mode) +{ + if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE) + { + poly_int64 per_register_size = GET_MODE_SIZE (mode); + unsigned int nregs = hard_regno_nregs (regno, mode); + if (nregs > 1) + per_register_size = exact_div (per_register_size, nregs); + if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC) + return maybe_gt (per_register_size, 16); + return maybe_gt (per_register_size, 8); + } + return false; } /* Implement REGMODE_NATURAL_SIZE. */ @@ -1899,10 +2263,33 @@ emit_set_insn (rtx x, rtx y) rtx aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y) { - machine_mode mode = SELECT_CC_MODE (code, x, y); - rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM); + machine_mode cmp_mode = GET_MODE (x); + machine_mode cc_mode; + rtx cc_reg; + + if (cmp_mode == TImode) + { + gcc_assert (code == NE); + + cc_mode = CCmode; + cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); - emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y)); + rtx x_lo = operand_subword (x, 0, 0, TImode); + rtx y_lo = operand_subword (y, 0, 0, TImode); + emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo)); + + rtx x_hi = operand_subword (x, 1, 0, TImode); + rtx y_hi = operand_subword (y, 1, 0, TImode); + emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi, + gen_rtx_EQ (cc_mode, cc_reg, const0_rtx), + GEN_INT (AARCH64_EQ))); + } + else + { + cc_mode = SELECT_CC_MODE (code, x, y); + cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); + emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y)); + } return cc_reg; } @@ -2466,7 +2853,36 @@ aarch64_zero_extend_const_eq (machine_mode xmode, rtx x, gcc_assert (r != NULL); return rtx_equal_p (x, r); } - + +/* Return TARGET if it is nonnull and a register of mode MODE. + Otherwise, return a fresh register of mode MODE if we can, + or TARGET reinterpreted as MODE if we can't. 
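The new TImode path in aarch64_gen_compare_reg above tests 128-bit inequality with one compare plus one conditional compare rather than a wide subtraction. A scalar model of the flag logic, with the expected instruction shape noted in a comment (roughly: cmp on the low halves, then ccmp on the high halves under the EQ condition):

#include <stdint.h>

// The result is "not equal" unless both 64-bit halves compare equal;
// when the low halves differ, the ccmp leaves the flags signalling NE.
static int
ti_not_equal (uint64_t lo1, uint64_t hi1, uint64_t lo2, uint64_t hi2)
{
  int eq_lo = (lo1 == lo2);     // first compare
  int eq_hi = (hi1 == hi2);     // conditional compare, only decisive if eq_lo
  return !(eq_lo && eq_hi);
}

int
main (void)
{
  return ti_not_equal (1, 0, 1, 0);   // equal halves -> 0
}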
*/ + +static rtx +aarch64_target_reg (rtx target, machine_mode mode) +{ + if (target && REG_P (target) && GET_MODE (target) == mode) + return target; + if (!can_create_pseudo_p ()) + { + gcc_assert (target); + return gen_lowpart (mode, target); + } + return gen_reg_rtx (mode); +} + +/* Return a register that contains the constant in BUILDER, given that + the constant is a legitimate move operand. Use TARGET as the register + if it is nonnull and convenient. */ + +static rtx +aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder) +{ + rtx src = builder.build (); + target = aarch64_target_reg (target, GET_MODE (src)); + emit_insn (gen_rtx_SET (target, src)); + return target; +} static rtx aarch64_force_temporary (machine_mode mode, rtx x, rtx value) @@ -2481,82 +2897,474 @@ aarch64_force_temporary (machine_mode mode, rtx x, rtx value) } } -/* Return true if we can move VALUE into a register using a single - CNT[BHWD] instruction. */ +/* Return true if predicate value X is a constant in which every element + is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI + value, i.e. as a predicate in which all bits are significant. */ static bool -aarch64_sve_cnt_immediate_p (poly_int64 value) +aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x) { - HOST_WIDE_INT factor = value.coeffs[0]; - /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */ - return (value.coeffs[1] == factor - && IN_RANGE (factor, 2, 16 * 16) - && (factor & 1) == 0 - && factor <= 16 * (factor & -factor)); + if (GET_CODE (x) != CONST_VECTOR) + return false; + + unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode), + GET_MODE_NUNITS (GET_MODE (x))); + unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor; + unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); + builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern); + + unsigned int nelts = const_vector_encoded_nelts (x); + for (unsigned int i = 0; i < nelts; ++i) + { + rtx elt = CONST_VECTOR_ENCODED_ELT (x, i); + if (!CONST_INT_P (elt)) + return false; + + builder.quick_push (elt); + for (unsigned int j = 1; j < factor; ++j) + builder.quick_push (const0_rtx); + } + builder.finalize (); + return true; } -/* Likewise for rtx X. */ +/* BUILDER contains a predicate constant of mode VNx16BI. Return the + widest predicate element size it can have (that is, the largest size + for which each element would still be 0 or 1). */ -bool -aarch64_sve_cnt_immediate_p (rtx x) +unsigned int +aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder) { - poly_int64 value; - return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value); + /* Start with the most optimistic assumption: that we only need + one bit per pattern. This is what we will use if only the first + bit in each pattern is ever set. */ + unsigned int mask = GET_MODE_SIZE (DImode); + mask |= builder.npatterns (); + + /* Look for set bits. */ + unsigned int nelts = builder.encoded_nelts (); + for (unsigned int i = 1; i < nelts; ++i) + if (INTVAL (builder.elt (i)) != 0) + { + if (i & 1) + return 1; + mask |= i; + } + return mask & -mask; } -/* Return the asm string for an instruction with a CNT-like vector size - operand (a vector pattern followed by a multiplier in the range [1, 16]). - PREFIX is the mnemonic without the size suffix and OPERANDS is the - first part of the operands template (the part that comes before the - vector size itself). FACTOR is the number of quadwords. 
- NELTS_PER_VQ, if nonzero, is the number of elements in each quadword. - If it is zero, we can use any element size. */ +/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode, + return that predicate mode, otherwise return opt_machine_mode (). */ -static char * -aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands, - unsigned int factor, - unsigned int nelts_per_vq) +opt_machine_mode +aarch64_ptrue_all_mode (rtx x) { - static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")]; + gcc_assert (GET_MODE (x) == VNx16BImode); + if (GET_CODE (x) != CONST_VECTOR + || !CONST_VECTOR_DUPLICATE_P (x) + || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0)) + || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0) + return opt_machine_mode (); - if (nelts_per_vq == 0) - /* There is some overlap in the ranges of the four CNT instructions. - Here we always use the smallest possible element size, so that the - multiplier is 1 whereever possible. */ - nelts_per_vq = factor & -factor; - int shift = std::min (exact_log2 (nelts_per_vq), 4); - gcc_assert (IN_RANGE (shift, 1, 4)); - char suffix = "dwhb"[shift - 1]; + unsigned int nelts = const_vector_encoded_nelts (x); + for (unsigned int i = 1; i < nelts; ++i) + if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx) + return opt_machine_mode (); - factor >>= shift; - unsigned int written; - if (factor == 1) - written = snprintf (buffer, sizeof (buffer), "%s%c\t%s", - prefix, suffix, operands); - else - written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d", - prefix, suffix, operands, factor); - gcc_assert (written < sizeof (buffer)); - return buffer; + return aarch64_sve_pred_mode (nelts); } -/* Return the asm string for an instruction with a CNT-like vector size - operand (a vector pattern followed by a multiplier in the range [1, 16]). - PREFIX is the mnemonic without the size suffix and OPERANDS is the - first part of the operands template (the part that comes before the - vector size itself). X is the value of the vector size operand, - as a polynomial integer rtx. */ +/* BUILDER is a predicate constant of mode VNx16BI. Consider the value + that the constant would have with predicate element size ELT_SIZE + (ignoring the upper bits in each element) and return: -char * + * -1 if all bits are set + * N if the predicate has N leading set bits followed by all clear bits + * 0 if the predicate does not have any of these forms. */ + +int +aarch64_partial_ptrue_length (rtx_vector_builder &builder, + unsigned int elt_size) +{ + /* If nelts_per_pattern is 3, we have set bits followed by clear bits + followed by set bits. */ + if (builder.nelts_per_pattern () == 3) + return 0; + + /* Skip over leading set bits. */ + unsigned int nelts = builder.encoded_nelts (); + unsigned int i = 0; + for (; i < nelts; i += elt_size) + if (INTVAL (builder.elt (i)) == 0) + break; + unsigned int vl = i / elt_size; + + /* Check for the all-true case. */ + if (i == nelts) + return -1; + + /* If nelts_per_pattern is 1, then either VL is zero, or we have a + repeating pattern of set bits followed by clear bits. */ + if (builder.nelts_per_pattern () != 2) + return 0; + + /* We have a "foreground" value and a duplicated "background" value. + If the background might repeat and the last set bit belongs to it, + we might have set bits followed by clear bits followed by set bits. */ + if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ())) + return 0; + + /* Make sure that the rest are all clear. 
*/ + for (; i < nelts; i += elt_size) + if (INTVAL (builder.elt (i)) != 0) + return 0; + + return vl; +} + +/* See if there is an svpattern that encodes an SVE predicate of mode + PRED_MODE in which the first VL bits are set and the rest are clear. + Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS. + A VL of -1 indicates an all-true vector. */ + +aarch64_svpattern +aarch64_svpattern_for_vl (machine_mode pred_mode, int vl) +{ + if (vl < 0) + return AARCH64_SV_ALL; + + if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode))) + return AARCH64_NUM_SVPATTERNS; + + if (vl >= 1 && vl <= 8) + return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1)); + + if (vl >= 16 && vl <= 256 && pow2p_hwi (vl)) + return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4)); + + int max_vl; + if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl)) + { + if (vl == (max_vl / 3) * 3) + return AARCH64_SV_MUL3; + /* These would only trigger for non-power-of-2 lengths. */ + if (vl == (max_vl & -4)) + return AARCH64_SV_MUL4; + if (vl == (1 << floor_log2 (max_vl))) + return AARCH64_SV_POW2; + if (vl == max_vl) + return AARCH64_SV_ALL; + } + return AARCH64_NUM_SVPATTERNS; +} + +/* Return a VNx16BImode constant in which every sequence of ELT_SIZE + bits has the lowest bit set and the upper bits clear. This is the + VNx16BImode equivalent of a PTRUE for controlling elements of + ELT_SIZE bytes. However, because the constant is VNx16BImode, + all bits are significant, even the upper zeros. */ + +rtx +aarch64_ptrue_all (unsigned int elt_size) +{ + rtx_vector_builder builder (VNx16BImode, elt_size, 1); + builder.quick_push (const1_rtx); + for (unsigned int i = 1; i < elt_size; ++i) + builder.quick_push (const0_rtx); + return builder.build (); +} + +/* Return an all-true predicate register of mode MODE. */ + +rtx +aarch64_ptrue_reg (machine_mode mode) +{ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); + rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode)); + return gen_lowpart (mode, reg); +} + +/* Return an all-false predicate register of mode MODE. */ + +rtx +aarch64_pfalse_reg (machine_mode mode) +{ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); + rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode)); + return gen_lowpart (mode, reg); +} + +/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is + true, or alternatively if we know that the operation predicated by + PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a + aarch64_sve_gp_strictness operand that describes the operation + predicated by PRED1[0]. */ + +bool +aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2) +{ + machine_mode mode = GET_MODE (pred2); + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL + && mode == GET_MODE (pred1[0]) + && aarch64_sve_gp_strictness (pred1[1], SImode)); + return (pred1[0] == CONSTM1_RTX (mode) + || INTVAL (pred1[1]) == SVE_RELAXED_GP + || rtx_equal_p (pred1[0], pred2)); +} + +/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag + for it. PRED2[0] is the predicate for the instruction whose result + is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag + for it. Return true if we can prove that the two predicates are + equivalent for PTEST purposes; that is, if we can replace PRED2[0] + with PRED1[0] without changing behavior. 
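aarch64_svpattern_for_vl above maps a leading-set-bit count onto the architectural PTRUE patterns. A simplified standalone restatement of the common cases (the MUL3, MUL4, POW2 and ALL fallbacks need a constant vector length and are omitted here):

#include <stdio.h>

static const char *
svpattern_for_vl_model (int vl)
{
  static const char *const small[] =
    { "VL1", "VL2", "VL3", "VL4", "VL5", "VL6", "VL7", "VL8" };
  if (vl >= 1 && vl <= 8)
    return small[vl - 1];                        // VL1..VL8
  if (vl == 16 || vl == 32 || vl == 64 || vl == 128 || vl == 256)
    return "VL16/VL32/VL64/VL128/VL256";         // powers of two from 16 up
  return "no single pattern";                    // caller falls back to WHILE
}

int
main (void)
{
  printf ("vl=6  -> %s\n", svpattern_for_vl_model (6));
  printf ("vl=12 -> %s\n", svpattern_for_vl_model (12));
  return 0;
}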
*/ + +bool +aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2) +{ + machine_mode mode = GET_MODE (pred1[0]); + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL + && mode == GET_MODE (pred2[0]) + && aarch64_sve_ptrue_flag (pred1[1], SImode) + && aarch64_sve_ptrue_flag (pred2[1], SImode)); + + bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode) + || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE); + bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode) + || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE); + return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]); +} + +/* Emit a comparison CMP between OP0 and OP1, both of which have mode + DATA_MODE, and return the result in a predicate of mode PRED_MODE. + Use TARGET as the target register if nonnull and convenient. */ + +static rtx +aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp, + machine_mode data_mode, rtx op1, rtx op2) +{ + insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode); + expand_operand ops[5]; + create_output_operand (&ops[0], target, pred_mode); + create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode); + create_integer_operand (&ops[2], SVE_KNOWN_PTRUE); + create_input_operand (&ops[3], op1, data_mode); + create_input_operand (&ops[4], op2, data_mode); + expand_insn (icode, 5, ops); + return ops[0].value; +} + +/* Use a comparison to convert integer vector SRC into MODE, which is + the corresponding SVE predicate mode. Use TARGET for the result + if it's nonnull and convenient. */ + +rtx +aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src) +{ + machine_mode src_mode = GET_MODE (src); + return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode, + src, CONST0_RTX (src_mode)); +} + +/* Return the assembly token for svprfop value PRFOP. */ + +static const char * +svprfop_token (enum aarch64_svprfop prfop) +{ + switch (prfop) + { +#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER; + AARCH64_FOR_SVPRFOP (CASE) +#undef CASE + case AARCH64_NUM_SVPRFOPS: + break; + } + gcc_unreachable (); +} + +/* Return the assembly string for an SVE prefetch operation with + mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation + and that SUFFIX is the format for the remaining operands. */ + +char * +aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx, + const char *suffix) +{ + static char buffer[128]; + aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx); + unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s", + mnemonic, svprfop_token (prfop), suffix); + gcc_assert (written < sizeof (buffer)); + return buffer; +} + +/* Check whether we can calculate the number of elements in PATTERN + at compile time, given that there are NELTS_PER_VQ elements per + 128-bit block. Return the value if so, otherwise return -1. */ + +HOST_WIDE_INT +aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq) +{ + unsigned int vl, const_vg; + if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8) + vl = 1 + (pattern - AARCH64_SV_VL1); + else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256) + vl = 16 << (pattern - AARCH64_SV_VL16); + else if (aarch64_sve_vg.is_constant (&const_vg)) + { + /* There are two vector granules per quadword. 
*/ + unsigned int nelts = (const_vg / 2) * nelts_per_vq; + switch (pattern) + { + case AARCH64_SV_POW2: return 1 << floor_log2 (nelts); + case AARCH64_SV_MUL4: return nelts & -4; + case AARCH64_SV_MUL3: return (nelts / 3) * 3; + case AARCH64_SV_ALL: return nelts; + default: gcc_unreachable (); + } + } + else + return -1; + + /* There are two vector granules per quadword. */ + poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq; + if (known_le (vl, nelts_all)) + return vl; + + /* Requesting more elements than are available results in a PFALSE. */ + if (known_gt (vl, nelts_all)) + return 0; + + return -1; +} + +/* Return true if we can move VALUE into a register using a single + CNT[BHWD] instruction. */ + +static bool +aarch64_sve_cnt_immediate_p (poly_int64 value) +{ + HOST_WIDE_INT factor = value.coeffs[0]; + /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */ + return (value.coeffs[1] == factor + && IN_RANGE (factor, 2, 16 * 16) + && (factor & 1) == 0 + && factor <= 16 * (factor & -factor)); +} + +/* Likewise for rtx X. */ + +bool +aarch64_sve_cnt_immediate_p (rtx x) +{ + poly_int64 value; + return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value); +} + +/* Return the asm string for an instruction with a CNT-like vector size + operand (a vector pattern followed by a multiplier in the range [1, 16]). + PREFIX is the mnemonic without the size suffix and OPERANDS is the + first part of the operands template (the part that comes before the + vector size itself). PATTERN is the pattern to use. FACTOR is the + number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements + in each quadword. If it is zero, we can use any element size. */ + +static char * +aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands, + aarch64_svpattern pattern, + unsigned int factor, + unsigned int nelts_per_vq) +{ + static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")]; + + if (nelts_per_vq == 0) + /* There is some overlap in the ranges of the four CNT instructions. + Here we always use the smallest possible element size, so that the + multiplier is 1 whereever possible. */ + nelts_per_vq = factor & -factor; + int shift = std::min (exact_log2 (nelts_per_vq), 4); + gcc_assert (IN_RANGE (shift, 1, 4)); + char suffix = "dwhb"[shift - 1]; + + factor >>= shift; + unsigned int written; + if (pattern == AARCH64_SV_ALL && factor == 1) + written = snprintf (buffer, sizeof (buffer), "%s%c\t%s", + prefix, suffix, operands); + else if (factor == 1) + written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s", + prefix, suffix, operands, svpattern_token (pattern)); + else + written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d", + prefix, suffix, operands, svpattern_token (pattern), + factor); + gcc_assert (written < sizeof (buffer)); + return buffer; +} + +/* Return the asm string for an instruction with a CNT-like vector size + operand (a vector pattern followed by a multiplier in the range [1, 16]). + PREFIX is the mnemonic without the size suffix and OPERANDS is the + first part of the operands template (the part that comes before the + vector size itself). X is the value of the vector size operand, + as a polynomial integer rtx; we need to convert this into an "all" + pattern with a multiplier. 
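For reference, the three output shapes produced by aarch64_output_sve_cnt_immediate above, demonstrated with hypothetical operand values; the %x0 placeholder is substituted later by the usual operand-printing machinery:

#include <stdio.h>

int
main (void)
{
  char buf[64];
  // pattern == ALL and factor == 1: plain count, e.g. "cntd  %x0"
  snprintf (buf, sizeof buf, "%s%c\t%s", "cnt", 'd', "%x0");
  puts (buf);
  // factor == 1 with an explicit pattern, e.g. "incb  %x0, vl64"
  snprintf (buf, sizeof buf, "%s%c\t%s, %s", "inc", 'b', "%x0", "vl64");
  puts (buf);
  // general case with a multiplier, e.g. "inch  %x0, all, mul #2"
  snprintf (buf, sizeof buf, "%s%c\t%s, %s, mul #%d", "inc", 'h', "%x0", "all", 2);
  puts (buf);
  return 0;
}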
*/ + +char * aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands, rtx x) { poly_int64 value = rtx_to_poly_int64 (x); gcc_assert (aarch64_sve_cnt_immediate_p (value)); - return aarch64_output_sve_cnt_immediate (prefix, operands, + return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL, value.coeffs[1], 0); } +/* Return the asm string for an instruction with a CNT-like vector size + operand (a vector pattern followed by a multiplier in the range [1, 16]). + PREFIX is the mnemonic without the size suffix and OPERANDS is the + first part of the operands template (the part that comes before the + vector size itself). CNT_PAT[0..2] are the operands of the + UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */ + +char * +aarch64_output_sve_cnt_pat_immediate (const char *prefix, + const char *operands, rtx *cnt_pat) +{ + aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]); + unsigned int nelts_per_vq = INTVAL (cnt_pat[1]); + unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq; + return aarch64_output_sve_cnt_immediate (prefix, operands, pattern, + factor, nelts_per_vq); +} + +/* Return true if we can add X using a single SVE INC or DEC instruction. */ + +bool +aarch64_sve_scalar_inc_dec_immediate_p (rtx x) +{ + poly_int64 value; + return (poly_int_rtx_p (x, &value) + && (aarch64_sve_cnt_immediate_p (value) + || aarch64_sve_cnt_immediate_p (-value))); +} + +/* Return the asm string for adding SVE INC/DEC immediate OFFSET to + operand 0. */ + +char * +aarch64_output_sve_scalar_inc_dec (rtx offset) +{ + poly_int64 offset_value = rtx_to_poly_int64 (offset); + gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]); + if (offset_value.coeffs[1] > 0) + return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL, + offset_value.coeffs[1], 0); + else + return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL, + -offset_value.coeffs[1], 0); +} + /* Return true if we can add VALUE to a register using a single ADDVL or ADDPL instruction. */ @@ -2582,27 +3390,16 @@ aarch64_sve_addvl_addpl_immediate_p (rtx x) && aarch64_sve_addvl_addpl_immediate_p (value)); } -/* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1 - and storing the result in operand 0. */ +/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET + to operand 1 and storing the result in operand 0. */ char * -aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset) +aarch64_output_sve_addvl_addpl (rtx offset) { static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)]; poly_int64 offset_value = rtx_to_poly_int64 (offset); gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value)); - /* Use INC or DEC if possible. */ - if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest))) - { - if (aarch64_sve_cnt_immediate_p (offset_value)) - return aarch64_output_sve_cnt_immediate ("inc", "%x0", - offset_value.coeffs[1], 0); - if (aarch64_sve_cnt_immediate_p (-offset_value)) - return aarch64_output_sve_cnt_immediate ("dec", "%x0", - -offset_value.coeffs[1], 0); - } - int factor = offset_value.coeffs[1]; if ((factor & 15) == 0) snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16); @@ -2617,8 +3414,8 @@ aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset) factor in *FACTOR_OUT (if nonnull). 
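The ADDVL/ADDPL split above keys purely on the byte factor. A standalone mirror of the selection, assuming the same invariants the asserts enforce (the factor is even, and a multiple of 16 selects the ADDVL form):

#include <stdio.h>

// Multiples of 16 bytes per 128-bit granule use ADDVL (whole vector lengths),
// the remaining even factors use ADDPL (predicate lengths).
static void
print_addvl_addpl (int factor)
{
  if ((factor & 15) == 0)
    printf ("addvl\tx0, x1, #%d\n", factor / 16);
  else
    printf ("addpl\tx0, x1, #%d\n", factor / 2);
}

int
main (void)
{
  print_addvl_addpl (32);   // addvl x0, x1, #2
  print_addvl_addpl (6);    // addpl x0, x1, #3
  return 0;
}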
*/ bool -aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out, - unsigned int *nelts_per_vq_out) +aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out, + unsigned int *nelts_per_vq_out) { rtx elt; poly_int64 value; @@ -2652,9 +3449,9 @@ aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out, instruction. */ bool -aarch64_sve_inc_dec_immediate_p (rtx x) +aarch64_sve_vector_inc_dec_immediate_p (rtx x) { - return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL); + return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL); } /* Return the asm template for an SVE vector INC or DEC instruction. @@ -2662,18 +3459,18 @@ aarch64_sve_inc_dec_immediate_p (rtx x) value of the vector count operand itself. */ char * -aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x) +aarch64_output_sve_vector_inc_dec (const char *operands, rtx x) { int factor; unsigned int nelts_per_vq; - if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq)) + if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq)) gcc_unreachable (); if (factor < 0) - return aarch64_output_sve_cnt_immediate ("dec", operands, -factor, - nelts_per_vq); + return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL, + -factor, nelts_per_vq); else - return aarch64_output_sve_cnt_immediate ("inc", operands, factor, - nelts_per_vq); + return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL, + factor, nelts_per_vq); } static int @@ -3056,20 +3853,36 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, } else { - /* Use CNTD, then multiply it by FACTOR. */ - val = gen_int_mode (poly_int64 (2, 2), mode); + /* Base the factor on LOW_BIT if we can calculate LOW_BIT + directly, since that should increase the chances of being + able to use a shift and add sequence. If LOW_BIT itself + is out of range, just use CNTD. */ + if (low_bit <= 16 * 8) + factor /= low_bit; + else + low_bit = 1; + + val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode); val = aarch64_force_temporary (mode, temp1, val); - /* Go back to using a negative multiplication factor if we have - no register from which to subtract. */ - if (code == MINUS && src == const0_rtx) + if (can_create_pseudo_p ()) + { + rtx coeff1 = gen_int_mode (factor, mode); + val = expand_mult (mode, val, coeff1, NULL_RTX, false, true); + } + else { - factor = -factor; - code = PLUS; + /* Go back to using a negative multiplication factor if we have + no register from which to subtract. */ + if (code == MINUS && src == const0_rtx) + { + factor = -factor; + code = PLUS; + } + rtx coeff1 = gen_int_mode (factor, mode); + coeff1 = aarch64_force_temporary (mode, temp2, coeff1); + val = gen_rtx_MULT (mode, val, coeff1); } - rtx coeff1 = gen_int_mode (factor, mode); - coeff1 = aarch64_force_temporary (mode, temp2, coeff1); - val = gen_rtx_MULT (mode, val, coeff1); } if (shift > 0) @@ -3176,32 +3989,55 @@ aarch64_expand_vec_series (rtx dest, rtx base, rtx step) emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step)); } -/* Try to duplicate SRC into SVE register DEST, given that SRC is an - integer of mode INT_MODE. Return true on success. */ +/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE + register of mode MODE. Use TARGET for the result if it's nonnull + and convenient. -static bool -aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode, - rtx src) -{ - /* If the constant is smaller than 128 bits, we can do the move - using a vector of SRC_MODEs. 
*/ - if (src_mode != TImode) - { - poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)), - GET_MODE_SIZE (src_mode)); - machine_mode dup_mode = mode_for_vector (src_mode, count).require (); - emit_move_insn (gen_lowpart (dup_mode, dest), - gen_const_vec_duplicate (dup_mode, src)); - return true; + The two vector modes must have the same element mode. The behavior + is to duplicate architectural lane N of SRC into architectural lanes + N + I * STEP of the result. On big-endian targets, architectural + lane 0 of an Advanced SIMD vector is the last element of the vector + in memory layout, so for big-endian targets this operation has the + effect of reversing SRC before duplicating it. Callers need to + account for this. */ + +rtx +aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src) +{ + machine_mode src_mode = GET_MODE (src); + gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode)); + insn_code icode = (BYTES_BIG_ENDIAN + ? code_for_aarch64_vec_duplicate_vq_be (mode) + : code_for_aarch64_vec_duplicate_vq_le (mode)); + + unsigned int i = 0; + expand_operand ops[3]; + create_output_operand (&ops[i++], target, mode); + create_output_operand (&ops[i++], src, src_mode); + if (BYTES_BIG_ENDIAN) + { + /* Create a PARALLEL describing the reversal of SRC. */ + unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode); + rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq, + nelts_per_vq - 1, -1); + create_fixed_operand (&ops[i++], sel); } + expand_insn (icode, i, ops); + return ops[0].value; +} + +/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch + the memory image into DEST. Return true on success. */ - /* Use LD1RQ[BHWD] to load the 128 bits from memory. */ - src = force_const_mem (src_mode, src); +static bool +aarch64_expand_sve_ld1rq (rtx dest, rtx src) +{ + src = force_const_mem (GET_MODE (src), src); if (!src) return false; /* Make sure that the address is legitimate. */ - if (!aarch64_sve_ld1r_operand_p (src)) + if (!aarch64_sve_ld1rq_operand_p (src)) { rtx addr = force_reg (Pmode, XEXP (src, 0)); src = replace_equiv_address (src, addr); @@ -3210,47 +4046,128 @@ aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode, machine_mode mode = GET_MODE (dest); unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode); machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require (); - rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); - src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ); - emit_insn (gen_rtx_SET (dest, src)); + rtx ptrue = aarch64_ptrue_reg (pred_mode); + emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue)); return true; } -/* Expand a move of general CONST_VECTOR SRC into DEST, given that it - isn't a simple duplicate or series. */ +/* Return a register containing CONST_VECTOR SRC, given that SRC has an + SVE data mode and isn't a legitimate constant. Use TARGET for the + result if convenient. -static void -aarch64_expand_sve_const_vector (rtx dest, rtx src) + The returned register can have whatever mode seems most natural + given the contents of SRC. 
*/ + +static rtx +aarch64_expand_sve_const_vector (rtx target, rtx src) { machine_mode mode = GET_MODE (src); unsigned int npatterns = CONST_VECTOR_NPATTERNS (src); unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src); - gcc_assert (npatterns > 1); + scalar_mode elt_mode = GET_MODE_INNER (mode); + unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode); + unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits; + + if (nelts_per_pattern == 1 && encoded_bits == 128) + { + /* The constant is a duplicated quadword but can't be narrowed + beyond a quadword. Get the memory image of the first quadword + as a 128-bit vector and try using LD1RQ to load it from memory. + + The effect for both endiannesses is to load memory lane N into + architectural lanes N + I * STEP of the result. On big-endian + targets, the layout of the 128-bit vector in an Advanced SIMD + register would be different from its layout in an SVE register, + but this 128-bit vector is a memory value only. */ + machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); + rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0); + if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value)) + return target; + } + + if (nelts_per_pattern == 1 && encoded_bits < 128) + { + /* The vector is a repeating sequence of 64 bits or fewer. + See if we can load them using an Advanced SIMD move and then + duplicate it to fill a vector. This is better than using a GPR + move because it keeps everything in the same register file. */ + machine_mode vq_mode = aarch64_vq_mode (elt_mode).require (); + rtx_vector_builder builder (vq_mode, npatterns, 1); + for (unsigned int i = 0; i < npatterns; ++i) + { + /* We want memory lane N to go into architectural lane N, + so reverse for big-endian targets. The DUP .Q pattern + has a compensating reverse built-in. */ + unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i; + builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci)); + } + rtx vq_src = builder.build (); + if (aarch64_simd_valid_immediate (vq_src, NULL)) + { + vq_src = force_reg (vq_mode, vq_src); + return aarch64_expand_sve_dupq (target, mode, vq_src); + } - if (nelts_per_pattern == 1) - { - /* The constant is a repeating seqeuence of at least two elements, - where the repeating elements occupy no more than 128 bits. - Get an integer representation of the replicated value. */ - scalar_int_mode int_mode; - if (BYTES_BIG_ENDIAN) - /* For now, always use LD1RQ to load the value on big-endian - targets, since the handling of smaller integers includes a - subreg that is semantically an element reverse. */ - int_mode = TImode; - else + /* Get an integer representation of the repeating part of Advanced + SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC, + which for big-endian targets is lane-swapped wrt a normal + Advanced SIMD vector. This means that for both endiannesses, + memory lane N of SVE vector SRC corresponds to architectural + lane N of a register holding VQ_SRC. This in turn means that + memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed + as a single 128-bit value) and thus that memory lane 0 of SRC is + in the lsb of the integer. Duplicating the integer therefore + ensures that memory lane N of SRC goes into architectural lane + N + I * INDEX of the SVE register. 
*/ + scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require (); + rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0); + if (elt_value) { - unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns; - gcc_assert (int_bits <= 128); - int_mode = int_mode_for_size (int_bits, 0).require (); + /* Pretend that we had a vector of INT_MODE to start with. */ + elt_mode = int_mode; + mode = aarch64_full_sve_mode (int_mode).require (); + + /* If the integer can be moved into a general register by a + single instruction, do that and duplicate the result. */ + if (CONST_INT_P (elt_value) + && aarch64_move_imm (INTVAL (elt_value), elt_mode)) + { + elt_value = force_reg (elt_mode, elt_value); + return expand_vector_broadcast (mode, elt_value); + } + } + else if (npatterns == 1) + /* We're duplicating a single value, but can't do better than + force it to memory and load from there. This handles things + like symbolic constants. */ + elt_value = CONST_VECTOR_ENCODED_ELT (src, 0); + + if (elt_value) + { + /* Load the element from memory if we can, otherwise move it into + a register and use a DUP. */ + rtx op = force_const_mem (elt_mode, elt_value); + if (!op) + op = force_reg (elt_mode, elt_value); + return expand_vector_broadcast (mode, op); } - rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0); - if (int_value - && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value)) - return; } + /* Try using INDEX. */ + rtx base, step; + if (const_vec_series_p (src, &base, &step)) + { + aarch64_expand_vec_series (target, base, step); + return target; + } + + /* From here on, it's better to force the whole constant to memory + if we can. */ + if (GET_MODE_NUNITS (mode).is_constant ()) + return NULL_RTX; + /* Expand each pattern individually. */ + gcc_assert (npatterns > 1); rtx_vector_builder builder; auto_vec vectors (npatterns); for (unsigned int i = 0; i < npatterns; ++i) @@ -3267,22 +4184,263 @@ aarch64_expand_sve_const_vector (rtx dest, rtx src) npatterns /= 2; for (unsigned int i = 0; i < npatterns; ++i) { - rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode)); + rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode)); rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]); emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); vectors[i] = tmp; } } - gcc_assert (vectors[0] == dest); + gcc_assert (vectors[0] == target); + return target; +} + +/* Use WHILE to set a predicate register of mode MODE in which the first + VL bits are set and the rest are clear. Use TARGET for the register + if it's nonnull and convenient. */ + +static rtx +aarch64_sve_move_pred_via_while (rtx target, machine_mode mode, + unsigned int vl) +{ + rtx limit = force_reg (DImode, gen_int_mode (vl, DImode)); + target = aarch64_target_reg (target, mode); + emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode, + target, const0_rtx, limit)); + return target; +} + +static rtx +aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool); + +/* BUILDER is a constant predicate in which the index of every set bit + is a multiple of ELT_SIZE (which is <= 8). Try to load the constant + by inverting every element at a multiple of ELT_SIZE and EORing the + result with an ELT_SIZE PTRUE. + + Return a register that contains the constant on success, otherwise + return null. Use TARGET as the register if it is nonnull and + convenient. 
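Taken together, the rewritten aarch64_expand_sve_const_vector above tries progressively heavier strategies. A rough standalone summary of the decision order, using only what is visible in the hunk (the helper name and parameters are illustrative only):

// nelts_per_pattern and encoded_bits correspond to the values computed above.
const char *
sve_const_vector_strategy_model (int nelts_per_pattern, unsigned int encoded_bits,
                                 int is_series)
{
  if (nelts_per_pattern == 1 && encoded_bits == 128)
    return "force to memory and load with LD1RQ";
  if (nelts_per_pattern == 1 && encoded_bits < 128)
    return "build a 128-bit constant and DUP .Q it, or broadcast an integer";
  if (is_series)
    return "use INDEX";
  return "force the whole constant to memory, else expand patterns and ZIP1";
}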
*/ + +static rtx +aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder, + unsigned int elt_size) +{ + /* Invert every element at a multiple of ELT_SIZE, keeping the + other bits zero. */ + rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (), + builder.nelts_per_pattern ()); + for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) + if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0) + inv_builder.quick_push (const1_rtx); + else + inv_builder.quick_push (const0_rtx); + inv_builder.finalize (); + + /* See if we can load the constant cheaply. */ + rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false); + if (!inv) + return NULL_RTX; + + /* EOR the result with an ELT_SIZE PTRUE. */ + rtx mask = aarch64_ptrue_all (elt_size); + mask = force_reg (VNx16BImode, mask); + target = aarch64_target_reg (target, VNx16BImode); + emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask)); + return target; +} + +/* BUILDER is a constant predicate in which the index of every set bit + is a multiple of ELT_SIZE (which is <= 8). Try to load the constant + using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the + register on success, otherwise return null. Use TARGET as the register + if nonnull and convenient. */ + +static rtx +aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder, + unsigned int elt_size, + unsigned int permute_size) +{ + /* We're going to split the constant into two new constants A and B, + with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0 + and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1: + + A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ } + B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ } + + where _ indicates elements that will be discarded by the permute. + + First calculate the ELT_SIZEs for A and B. */ + unsigned int a_elt_size = GET_MODE_SIZE (DImode); + unsigned int b_elt_size = GET_MODE_SIZE (DImode); + for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size) + if (INTVAL (builder.elt (i)) != 0) + { + if (i & permute_size) + b_elt_size |= i - permute_size; + else + a_elt_size |= i; + } + a_elt_size &= -a_elt_size; + b_elt_size &= -b_elt_size; + + /* Now construct the vectors themselves. */ + rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (), + builder.nelts_per_pattern ()); + rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (), + builder.nelts_per_pattern ()); + unsigned int nelts = builder.encoded_nelts (); + for (unsigned int i = 0; i < nelts; ++i) + if (i & (elt_size - 1)) + { + a_builder.quick_push (const0_rtx); + b_builder.quick_push (const0_rtx); + } + else if ((i & permute_size) == 0) + { + /* The A and B elements are significant. */ + a_builder.quick_push (builder.elt (i)); + b_builder.quick_push (builder.elt (i + permute_size)); + } + else + { + /* The A and B elements are going to be discarded, so pick whatever + is likely to give a nice constant. We are targeting element + sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively, + with the aim of each being a sequence of ones followed by + a sequence of zeros. So: + + * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to + duplicate the last X_ELT_SIZE element, to extend the + current sequence of ones or zeros. + + * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a + zero, so that the constant really does have X_ELT_SIZE and + not a smaller size. 
*/ + if (a_elt_size > permute_size) + a_builder.quick_push (const0_rtx); + else + a_builder.quick_push (a_builder.elt (i - a_elt_size)); + if (b_elt_size > permute_size) + b_builder.quick_push (const0_rtx); + else + b_builder.quick_push (b_builder.elt (i - b_elt_size)); + } + a_builder.finalize (); + b_builder.finalize (); + + /* Try loading A into a register. */ + rtx_insn *last = get_last_insn (); + rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false); + if (!a) + return NULL_RTX; + + /* Try loading B into a register. */ + rtx b = a; + if (a_builder != b_builder) + { + b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false); + if (!b) + { + delete_insns_since (last); + return NULL_RTX; + } + } + + /* Emit the TRN1 itself. */ + machine_mode mode = aarch64_sve_pred_mode (permute_size).require (); + target = aarch64_target_reg (target, mode); + emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target, + gen_lowpart (mode, a), + gen_lowpart (mode, b))); + return target; +} + +/* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI + constant in BUILDER into an SVE predicate register. Return the register + on success, otherwise return null. Use TARGET for the register if + nonnull and convenient. + + ALLOW_RECURSE_P is true if we can use methods that would call this + function recursively. */ + +static rtx +aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder, + bool allow_recurse_p) +{ + if (builder.encoded_nelts () == 1) + /* A PFALSE or a PTRUE .B ALL. */ + return aarch64_emit_set_immediate (target, builder); + + unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder); + if (int vl = aarch64_partial_ptrue_length (builder, elt_size)) + { + /* If we can load the constant using PTRUE, use it as-is. */ + machine_mode mode = aarch64_sve_pred_mode (elt_size).require (); + if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS) + return aarch64_emit_set_immediate (target, builder); + + /* Otherwise use WHILE to set the first VL bits. */ + return aarch64_sve_move_pred_via_while (target, mode, vl); + } + + if (!allow_recurse_p) + return NULL_RTX; + + /* Try inverting the vector in element size ELT_SIZE and then EORing + the result with an ELT_SIZE PTRUE. */ + if (INTVAL (builder.elt (0)) == 0) + if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder, + elt_size)) + return res; + + /* Try using TRN1 to permute two simpler constants. */ + for (unsigned int i = elt_size; i <= 8; i *= 2) + if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder, + elt_size, i)) + return res; + + return NULL_RTX; } -/* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE - is a pattern that can be used to set DEST to a replicated scalar - element. */ +/* Return an SVE predicate register that contains the VNx16BImode + constant in BUILDER, without going through the move expanders. + + The returned register can have whatever mode seems most natural + given the contents of BUILDER. Use TARGET for the result if + convenient. */ + +static rtx +aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder) +{ + /* Try loading the constant using pure predicate operations. */ + if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true)) + return res; + + /* Try forcing the constant to memory. 
*/ + if (builder.full_nelts ().is_constant ()) + if (rtx mem = force_const_mem (VNx16BImode, builder.build ())) + { + target = aarch64_target_reg (target, VNx16BImode); + emit_move_insn (target, mem); + return target; + } + + /* The last resort is to load the constant as an integer and then + compare it against zero. Use -1 for set bits in order to increase + the changes of using SVE DUPM or an Advanced SIMD byte mask. */ + rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (), + builder.nelts_per_pattern ()); + for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) + int_builder.quick_push (INTVAL (builder.elt (i)) + ? constm1_rtx : const0_rtx); + return aarch64_convert_sve_data_to_pred (target, VNx16BImode, + int_builder.build ()); +} + +/* Set DEST to immediate IMM. */ void -aarch64_expand_mov_immediate (rtx dest, rtx imm, - rtx (*gen_vec_duplicate) (rtx, rtx)) +aarch64_expand_mov_immediate (rtx dest, rtx imm) { machine_mode mode = GET_MODE (dest); @@ -3405,38 +4563,50 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm, if (!CONST_INT_P (imm)) { - rtx base, step, value; - if (GET_CODE (imm) == HIGH - || aarch64_simd_valid_immediate (imm, NULL)) - emit_insn (gen_rtx_SET (dest, imm)); - else if (const_vec_series_p (imm, &base, &step)) - aarch64_expand_vec_series (dest, base, step); - else if (const_vec_duplicate_p (imm, &value)) + if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) { - /* If the constant is out of range of an SVE vector move, - load it from memory if we can, otherwise move it into - a register and use a DUP. */ - scalar_mode inner_mode = GET_MODE_INNER (mode); - rtx op = force_const_mem (inner_mode, value); - if (!op) - op = force_reg (inner_mode, value); - else if (!aarch64_sve_ld1r_operand_p (op)) + /* Only the low bit of each .H, .S and .D element is defined, + so we can set the upper bits to whatever we like. If the + predicate is all-true in MODE, prefer to set all the undefined + bits as well, so that we can share a single .B predicate for + all modes. */ + if (imm == CONSTM1_RTX (mode)) + imm = CONSTM1_RTX (VNx16BImode); + + /* All methods for constructing predicate modes wider than VNx16BI + will set the upper bits of each element to zero. Expose this + by moving such constants as a VNx16BI, so that all bits are + significant and so that constants for different modes can be + shared. The wider constant will still be available as a + REG_EQUAL note. 
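To make the "move wider predicate constants as VNx16BI" comment above concrete, here is a standalone sketch (not from the patch) of how a hypothetical .H predicate constant maps onto .B lanes: each .H element owns two predicate bits, only the low bit carries the value, and the upper bit is forced to zero, which is what allows one .B constant to serve every element size.

#include <stdio.h>

int
main (void)
{
  int h_pred[8] = { 1, 0, 1, 1, 0, 0, 1, 0 };   /* made-up .H constant */
  int b_pred[16];

  for (int i = 0; i < 8; i++)
    {
      b_pred[2 * i] = h_pred[i];   /* low bit carries the element value */
      b_pred[2 * i + 1] = 0;       /* upper bit of the element is zero  */
    }

  for (int i = 0; i < 16; i++)
    printf ("%d%c", b_pred[i], i == 15 ? '\n' : ' ');
  return 0;
}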
*/ + rtx_vector_builder builder; + if (aarch64_get_sve_pred_bits (builder, imm)) { - rtx addr = force_reg (Pmode, XEXP (op, 0)); - op = replace_equiv_address (op, addr); + rtx res = aarch64_expand_sve_const_pred (dest, builder); + if (dest != res) + emit_move_insn (dest, gen_lowpart (mode, res)); + return; } - emit_insn (gen_vec_duplicate (dest, op)); } - else if (GET_CODE (imm) == CONST_VECTOR - && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ()) - aarch64_expand_sve_const_vector (dest, imm); - else + + if (GET_CODE (imm) == HIGH + || aarch64_simd_valid_immediate (imm, NULL)) { - rtx mem = force_const_mem (mode, imm); - gcc_assert (mem); - emit_move_insn (dest, mem); + emit_insn (gen_rtx_SET (dest, imm)); + return; } + if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode)) + if (rtx res = aarch64_expand_sve_const_vector (dest, imm)) + { + if (dest != res) + emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res)); + return; + } + + rtx mem = force_const_mem (mode, imm); + gcc_assert (mem); + emit_move_insn (dest, mem); return; } @@ -3455,6 +4625,7 @@ aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src) create_output_operand (&ops[0], dest, mode); create_input_operand (&ops[1], pred, GET_MODE(pred)); create_input_operand (&ops[2], src, mode); + temporary_volatile_ok v (true); expand_insn (code_for_aarch64_pred_mov (mode), 3, ops); } @@ -3471,7 +4642,7 @@ void aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode) { machine_mode mode = GET_MODE (dest); - rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); + rtx ptrue = aarch64_ptrue_reg (pred_mode); if (!register_operand (src, mode) && !register_operand (dest, mode)) { @@ -3535,7 +4706,7 @@ aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src) return false; /* Generate *aarch64_sve_mov_subreg_be. */ - rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode)); + rtx ptrue = aarch64_ptrue_reg (VNx16BImode); rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src), UNSPEC_REV_SUBREG); emit_insn (gen_rtx_SET (dest, unspec)); @@ -3557,14 +4728,29 @@ aarch64_replace_reg_mode (rtx x, machine_mode mode) return x; } +/* Return the SVE REV[BHW] unspec for reversing quantites of mode MODE + stored in wider integer containers. */ + +static unsigned int +aarch64_sve_rev_unspec (machine_mode mode) +{ + switch (GET_MODE_UNIT_SIZE (mode)) + { + case 1: return UNSPEC_REVB; + case 2: return UNSPEC_REVH; + case 4: return UNSPEC_REVW; + } + gcc_unreachable (); +} + /* Split a *aarch64_sve_mov_subreg_be pattern with the given operands. */ void aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src) { - /* Decide which REV operation we need. The mode with narrower elements - determines the mode of the operands and the mode with the wider + /* Decide which REV operation we need. The mode with wider elements + determines the mode of the operands and the mode with the narrower elements determines the reverse width. 
*/ machine_mode mode_with_wider_elts = GET_MODE (dest); machine_mode mode_with_narrower_elts = GET_MODE (src); @@ -3572,38 +4758,22 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src) < GET_MODE_UNIT_SIZE (mode_with_narrower_elts)) std::swap (mode_with_wider_elts, mode_with_narrower_elts); + unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts); unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts); - unsigned int unspec; - if (wider_bytes == 8) - unspec = UNSPEC_REV64; - else if (wider_bytes == 4) - unspec = UNSPEC_REV32; - else if (wider_bytes == 2) - unspec = UNSPEC_REV16; - else - gcc_unreachable (); machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require (); - /* Emit: - - (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV)] - UNSPEC_MERGE_PTRUE)) - - with the appropriate modes. */ + /* Get the operands in the appropriate modes and emit the instruction. */ ptrue = gen_lowpart (pred_mode, ptrue); - dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts); - src = aarch64_replace_reg_mode (src, mode_with_narrower_elts); - src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec); - src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src), - UNSPEC_MERGE_PTRUE); - emit_insn (gen_rtx_SET (dest, src)); + dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts); + src = aarch64_replace_reg_mode (src, mode_with_wider_elts); + emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts, + dest, ptrue, src)); } static bool -aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, - tree exp ATTRIBUTE_UNUSED) +aarch64_function_ok_for_sibcall (tree, tree exp) { - if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl)) + if (crtl->abi->id () != expr_callee_abi (exp).id ()) return false; return true; @@ -3612,35 +4782,48 @@ aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, /* Implement TARGET_PASS_BY_REFERENCE. */ static bool -aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED, - machine_mode mode, - const_tree type, - bool named ATTRIBUTE_UNUSED) +aarch64_pass_by_reference (cumulative_args_t pcum_v, + const function_arg_info &arg) { + CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); HOST_WIDE_INT size; machine_mode dummymode; int nregs; + unsigned int num_zr, num_pr; + if (arg.type && aarch64_sve_argument_p (arg.type, &num_zr, &num_pr)) + { + if (pcum && !pcum->silent_p && !TARGET_SVE) + /* We can't gracefully recover at this point, so make this a + fatal error. */ + fatal_error (input_location, "arguments of type %qT require" + " the SVE ISA extension", arg.type); + + /* Variadic SVE types are passed by reference. Normal non-variadic + arguments are too if we've run out of registers. */ + return (!arg.named + || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS + || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS); + } + /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */ - if (mode == BLKmode && type) - size = int_size_in_bytes (type); + if (arg.mode == BLKmode && arg.type) + size = int_size_in_bytes (arg.type); else /* No frontends can create types with variable-sized modes, so we shouldn't be asked to pass or return them. */ - size = GET_MODE_SIZE (mode).to_constant (); + size = GET_MODE_SIZE (arg.mode).to_constant (); /* Aggregates are passed by reference based on their size. 
*/ - if (type && AGGREGATE_TYPE_P (type)) - { - size = int_size_in_bytes (type); - } + if (arg.aggregate_type_p ()) + size = int_size_in_bytes (arg.type); /* Variable sized arguments are always returned by reference. */ if (size < 0) return true; /* Can this be a candidate to be passed in fp/simd register(s)? */ - if (aarch64_vfp_is_call_or_return_candidate (mode, type, + if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type, &dummymode, &nregs, NULL)) return false; @@ -3696,6 +4879,29 @@ aarch64_function_value (const_tree type, const_tree func, if (INTEGRAL_TYPE_P (type)) mode = promote_function_mode (type, mode, &unsignedp, func, 1); + unsigned int num_zr, num_pr; + if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr)) + { + /* Don't raise an error here if we're called when SVE is disabled, + since this is really just a query function. Other code must + do that where appropriate. */ + mode = TYPE_MODE_RAW (type); + gcc_assert (VECTOR_MODE_P (mode) + && (!TARGET_SVE || aarch64_sve_mode_p (mode))); + + if (num_zr > 0 && num_pr == 0) + return gen_rtx_REG (mode, V0_REGNUM); + + if (num_zr == 0 && num_pr == 1) + return gen_rtx_REG (mode, P0_REGNUM); + + gcc_unreachable (); + } + + /* Generic vectors that map to SVE modes with -msve-vector-bits=N are + returned in memory, not by value. */ + gcc_assert (!aarch64_sve_mode_p (mode)); + if (aarch64_return_in_msb (type)) { HOST_WIDE_INT size = int_size_in_bytes (type); @@ -3778,6 +4984,16 @@ aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED) /* Simple scalar types always returned in registers. */ return false; + unsigned int num_zr, num_pr; + if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr)) + { + /* All SVE types we support fit in registers. For example, it isn't + yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE + predicates. */ + gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS); + return false; + } + if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type, &ag_mode, @@ -3853,11 +5069,11 @@ aarch64_function_arg_alignment (machine_mode mode, const_tree type, numbers refer to the rule numbers in the AAPCS64. */ static void -aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode, - const_tree type, - bool named ATTRIBUTE_UNUSED) +aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg) { CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); + tree type = arg.type; + machine_mode mode = arg.mode; int ncrn, nvrn, nregs; bool allocate_ncrn, allocate_nvrn; HOST_WIDE_INT size; @@ -3869,6 +5085,46 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode, pcum->aapcs_arg_processed = true; + unsigned int num_zr, num_pr; + if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr)) + { + /* The PCS says that it is invalid to pass an SVE value to an + unprototyped function. There is no ABI-defined location we + can return in this case, so we have no real choice but to raise + an error immediately, even though this is only a query function. */ + if (arg.named && pcum->pcs_variant != ARM_PCS_SVE) + { + gcc_assert (!pcum->silent_p); + error ("SVE type %qT cannot be passed to an unprototyped function", + arg.type); + /* Avoid repeating the message, and avoid tripping the assert + below. */ + pcum->pcs_variant = ARM_PCS_SVE; + } + + /* We would have converted the argument into pass-by-reference + form if it didn't fit in registers. 
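The register-counting logic added above for SVE arguments can be illustrated with a rough standalone model (not GCC code): each named SVE argument consumes NUM_ZR vector and NUM_PR predicate registers, and falls back to pass-by-reference once either pool is exhausted. The pool sizes below (z0-z7 and p0-p3) and the example prototype are assumptions made for the sketch.

#include <stdio.h>

#define NUM_FP_ARG_REGS 8   /* z0-z7, assumed */
#define NUM_PR_ARG_REGS 4   /* p0-p3, assumed */

int
main (void)
{
  /* Hypothetical prototype: nine SVE vector arguments plus one
     SVE predicate argument, all named.  */
  struct { const char *name; int num_zr, num_pr; } args[] = {
    { "a", 1, 0 }, { "b", 1, 0 }, { "c", 1, 0 }, { "d", 1, 0 },
    { "e", 1, 0 }, { "f", 1, 0 }, { "g", 1, 0 }, { "h", 1, 0 },
    { "i", 1, 0 }, { "pg", 0, 1 },
  };
  int nvrn = 0, nprn = 0;

  for (unsigned int i = 0; i < sizeof args / sizeof args[0]; i++)
    {
      if (nvrn + args[i].num_zr > NUM_FP_ARG_REGS
          || nprn + args[i].num_pr > NUM_PR_ARG_REGS)
        {
          printf ("%s: passed by reference\n", args[i].name);
          continue;
        }
      if (args[i].num_zr)
        printf ("%s: z%d\n", args[i].name, nvrn);
      else
        printf ("%s: p%d\n", args[i].name, nprn);
      nvrn += args[i].num_zr;
      nprn += args[i].num_pr;
    }
  return 0;
}

With these assumptions, arguments a..h land in z0..z7, the ninth vector argument goes by reference, and the predicate argument still gets p0.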
*/ + pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr; + pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr; + gcc_assert (arg.named + && pcum->pcs_variant == ARM_PCS_SVE + && aarch64_sve_mode_p (mode) + && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS + && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS); + + if (num_zr > 0 && num_pr == 0) + pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn); + else if (num_zr == 0 && num_pr == 1) + pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn); + else + gcc_unreachable (); + return; + } + + /* Generic vectors that map to SVE modes with -msve-vector-bits=N are + passed by reference, not by value. */ + gcc_assert (!aarch64_sve_mode_p (mode)); + /* Size in bytes, rounded to the nearest multiple of 8 bytes. */ if (type) size = int_size_in_bytes (type); @@ -3893,7 +5149,7 @@ aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode, and homogenous short-vector aggregates (HVA). */ if (allocate_nvrn) { - if (!TARGET_FLOAT) + if (!pcum->silent_p && !TARGET_FLOAT) aarch64_err_no_fpadvsimd (mode); if (nvrn + nregs <= NUM_FP_ARG_REGS) @@ -4009,37 +5265,46 @@ on_stack: /* Implement TARGET_FUNCTION_ARG. */ static rtx -aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode, - const_tree type, bool named) +aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg) { CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); - gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64); + gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64 + || pcum->pcs_variant == ARM_PCS_SIMD + || pcum->pcs_variant == ARM_PCS_SVE); - if (mode == VOIDmode) - return NULL_RTX; + if (arg.end_marker_p ()) + return gen_int_mode (pcum->pcs_variant, DImode); - aarch64_layout_arg (pcum_v, mode, type, named); + aarch64_layout_arg (pcum_v, arg); return pcum->aapcs_reg; } void aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum, - const_tree fntype ATTRIBUTE_UNUSED, - rtx libname ATTRIBUTE_UNUSED, - const_tree fndecl ATTRIBUTE_UNUSED, - unsigned n_named ATTRIBUTE_UNUSED) + const_tree fntype, + rtx libname ATTRIBUTE_UNUSED, + const_tree fndecl ATTRIBUTE_UNUSED, + unsigned n_named ATTRIBUTE_UNUSED, + bool silent_p) { pcum->aapcs_ncrn = 0; pcum->aapcs_nvrn = 0; + pcum->aapcs_nprn = 0; pcum->aapcs_nextncrn = 0; pcum->aapcs_nextnvrn = 0; - pcum->pcs_variant = ARM_PCS_AAPCS64; + pcum->aapcs_nextnprn = 0; + if (fntype) + pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id (); + else + pcum->pcs_variant = ARM_PCS_AAPCS64; pcum->aapcs_reg = NULL_RTX; pcum->aapcs_arg_processed = false; pcum->aapcs_stack_words = 0; pcum->aapcs_stack_size = 0; + pcum->silent_p = silent_p; - if (!TARGET_FLOAT + if (!silent_p + && !TARGET_FLOAT && fndecl && TREE_PUBLIC (fndecl) && fntype && fntype != error_mark_node) { @@ -4050,24 +5315,38 @@ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum, &mode, &nregs, NULL)) aarch64_err_no_fpadvsimd (TYPE_MODE (type)); } - return; + + if (!silent_p + && !TARGET_SVE + && pcum->pcs_variant == ARM_PCS_SVE) + { + /* We can't gracefully recover at this point, so make this a + fatal error. 
*/ + if (fndecl) + fatal_error (input_location, "%qE requires the SVE ISA extension", + fndecl); + else + fatal_error (input_location, "calls to functions of type %qT require" + " the SVE ISA extension", fntype); + } } static void aarch64_function_arg_advance (cumulative_args_t pcum_v, - machine_mode mode, - const_tree type, - bool named) + const function_arg_info &arg) { CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); - if (pcum->pcs_variant == ARM_PCS_AAPCS64) + if (pcum->pcs_variant == ARM_PCS_AAPCS64 + || pcum->pcs_variant == ARM_PCS_SIMD + || pcum->pcs_variant == ARM_PCS_SVE) { - aarch64_layout_arg (pcum_v, mode, type, named); + aarch64_layout_arg (pcum_v, arg); gcc_assert ((pcum->aapcs_reg != NULL_RTX) != (pcum->aapcs_stack_words != 0)); pcum->aapcs_arg_processed = false; pcum->aapcs_ncrn = pcum->aapcs_nextncrn; pcum->aapcs_nvrn = pcum->aapcs_nextnvrn; + pcum->aapcs_nprn = pcum->aapcs_nextnprn; pcum->aapcs_stack_size += pcum->aapcs_stack_words; pcum->aapcs_stack_words = 0; pcum->aapcs_reg = NULL_RTX; @@ -4500,11 +5779,14 @@ aarch64_needs_frame_chain (void) static void aarch64_layout_frame (void) { - HOST_WIDE_INT offset = 0; + poly_int64 offset = 0; int regno, last_fp_reg = INVALID_REGNUM; - bool simd_function = aarch64_simd_decl_p (cfun->decl); + machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM); + poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode); + bool frame_related_fp_reg_p = false; + aarch64_frame &frame = cfun->machine->frame; - cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain (); + frame.emit_frame_chain = aarch64_needs_frame_chain (); /* Adjust the outgoing arguments size if required. Keep it in sync with what the mid-end is doing. */ @@ -4513,184 +5795,264 @@ aarch64_layout_frame (void) #define SLOT_NOT_REQUIRED (-2) #define SLOT_REQUIRED (-1) - cfun->machine->frame.wb_candidate1 = INVALID_REGNUM; - cfun->machine->frame.wb_candidate2 = INVALID_REGNUM; - - /* If this is a non-leaf simd function with calls we assume that - at least one of those calls is to a non-simd function and thus - we must save V8 to V23 in the prologue. */ - - if (simd_function && !crtl->is_leaf) - { - for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) - if (FP_SIMD_SAVED_REGNUM_P (regno)) - df_set_regs_ever_live (regno, true); - } + frame.wb_candidate1 = INVALID_REGNUM; + frame.wb_candidate2 = INVALID_REGNUM; + frame.spare_pred_reg = INVALID_REGNUM; /* First mark all the registers that really need to be saved... */ - for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) - cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED; - - for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) - cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED; + for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++) + frame.reg_offset[regno] = SLOT_NOT_REQUIRED; /* ... that includes the eh data registers (if needed)... */ if (crtl->calls_eh_return) for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++) - cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] - = SLOT_REQUIRED; + frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED; /* ... and any callee saved register that dataflow says is live. 
*/ for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) if (df_regs_ever_live_p (regno) + && !fixed_regs[regno] && (regno == R30_REGNUM - || !call_used_regs[regno])) - cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; + || !crtl->abi->clobbers_full_reg_p (regno))) + frame.reg_offset[regno] = SLOT_REQUIRED; for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) if (df_regs_ever_live_p (regno) - && (!call_used_regs[regno] - || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))) + && !fixed_regs[regno] + && !crtl->abi->clobbers_full_reg_p (regno)) { - cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; + frame.reg_offset[regno] = SLOT_REQUIRED; last_fp_reg = regno; + if (aarch64_emit_cfi_for_reg_p (regno)) + frame_related_fp_reg_p = true; } - if (cfun->machine->frame.emit_frame_chain) - { - /* FP and LR are placed in the linkage record. */ - cfun->machine->frame.reg_offset[R29_REGNUM] = 0; - cfun->machine->frame.wb_candidate1 = R29_REGNUM; - cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD; - cfun->machine->frame.wb_candidate2 = R30_REGNUM; - offset = 2 * UNITS_PER_WORD; + /* Big-endian SVE frames need a spare predicate register in order + to save Z8-Z15. Decide which register they should use. Prefer + an unused argument register if possible, so that we don't force P4 + to be saved unnecessarily. */ + if (frame_related_fp_reg_p + && crtl->abi->id () == ARM_PCS_SVE + && BYTES_BIG_ENDIAN) + { + bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun)); + for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++) + if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno)) + break; + gcc_assert (regno <= P7_REGNUM); + frame.spare_pred_reg = regno; + df_set_regs_ever_live (regno, true); } + for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) + if (df_regs_ever_live_p (regno) + && !fixed_regs[regno] + && !crtl->abi->clobbers_full_reg_p (regno)) + frame.reg_offset[regno] = SLOT_REQUIRED; + /* With stack-clash, LR must be saved in non-leaf functions. */ gcc_assert (crtl->is_leaf - || (cfun->machine->frame.reg_offset[R30_REGNUM] - != SLOT_NOT_REQUIRED)); + || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED)); + + /* Now assign stack slots for the registers. Start with the predicate + registers, since predicate LDR and STR have a relatively small + offset range. These saves happen below the hard frame pointer. */ + for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) + { + frame.reg_offset[regno] = offset; + offset += BYTES_PER_SVE_PRED; + } + + /* We save a maximum of 8 predicate registers, and since vector + registers are 8 times the size of a predicate register, all the + saved predicates fit within a single vector. Doing this also + rounds the offset to a 128-bit boundary. */ + if (maybe_ne (offset, 0)) + { + gcc_assert (known_le (offset, vector_save_size)); + offset = vector_save_size; + } + + /* If we need to save any SVE vector registers, add them next. */ + if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE) + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) + { + frame.reg_offset[regno] = offset; + offset += vector_save_size; + } + + /* OFFSET is now the offset of the hard frame pointer from the bottom + of the callee save area. 
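A worked example of the below-hard-frame-pointer layout computed just above: predicate saves are laid out first, their area is rounded up to one vector save slot (at most eight predicates fit in one vector), and the SVE vector saves follow. The sketch below is illustration only and fixes the vector length at 256 bits purely to make the poly_int sizes concrete; the save counts are made up.

#include <stdio.h>

int
main (void)
{
  int vector_save_size = 256 / 8;                 /* 32 bytes per Z save  */
  int bytes_per_sve_pred = vector_save_size / 8;  /* 4 bytes per P save   */
  int n_pred_saves = 3, n_sve_vector_saves = 2;

  int offset = n_pred_saves * bytes_per_sve_pred; /* 12 */
  if (offset != 0)
    offset = vector_save_size;                    /* round up to 32 */
  offset += n_sve_vector_saves * vector_save_size;/* + 64 = 96 */

  printf ("below_hard_fp_saved_regs_size = %d bytes\n", offset);
  return 0;
}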
*/ + bool saves_below_hard_fp_p = maybe_ne (offset, 0); + frame.below_hard_fp_saved_regs_size = offset; + if (frame.emit_frame_chain) + { + /* FP and LR are placed in the linkage record. */ + frame.reg_offset[R29_REGNUM] = offset; + frame.wb_candidate1 = R29_REGNUM; + frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD; + frame.wb_candidate2 = R30_REGNUM; + offset += 2 * UNITS_PER_WORD; + } - /* Now assign stack slots for them. */ for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) - if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) { - cfun->machine->frame.reg_offset[regno] = offset; - if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) - cfun->machine->frame.wb_candidate1 = regno; - else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM) - cfun->machine->frame.wb_candidate2 = regno; + frame.reg_offset[regno] = offset; + if (frame.wb_candidate1 == INVALID_REGNUM) + frame.wb_candidate1 = regno; + else if (frame.wb_candidate2 == INVALID_REGNUM) + frame.wb_candidate2 = regno; offset += UNITS_PER_WORD; } - HOST_WIDE_INT max_int_offset = offset; - offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); - bool has_align_gap = offset != max_int_offset; + poly_int64 max_int_offset = offset; + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); + bool has_align_gap = maybe_ne (offset, max_int_offset); for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) - if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) + if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED)) { /* If there is an alignment gap between integer and fp callee-saves, allocate the last fp register to it if possible. */ if (regno == last_fp_reg && has_align_gap - && !simd_function - && (offset & 8) == 0) + && known_eq (vector_save_size, 8) + && multiple_p (offset, 16)) { - cfun->machine->frame.reg_offset[regno] = max_int_offset; + frame.reg_offset[regno] = max_int_offset; break; } - cfun->machine->frame.reg_offset[regno] = offset; - if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) - cfun->machine->frame.wb_candidate1 = regno; - else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM - && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) - cfun->machine->frame.wb_candidate2 = regno; - offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD; + frame.reg_offset[regno] = offset; + if (frame.wb_candidate1 == INVALID_REGNUM) + frame.wb_candidate1 = regno; + else if (frame.wb_candidate2 == INVALID_REGNUM + && frame.wb_candidate1 >= V0_REGNUM) + frame.wb_candidate2 = regno; + offset += vector_save_size; } - offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); + offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT); - cfun->machine->frame.saved_regs_size = offset; + frame.saved_regs_size = offset; - HOST_WIDE_INT varargs_and_saved_regs_size - = offset + cfun->machine->frame.saved_varargs_size; + poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size; - cfun->machine->frame.hard_fp_offset + poly_int64 above_outgoing_args = aligned_upper_bound (varargs_and_saved_regs_size + get_frame_size (), STACK_BOUNDARY / BITS_PER_UNIT); + frame.hard_fp_offset + = above_outgoing_args - frame.below_hard_fp_saved_regs_size; + /* Both these values are already aligned. 
*/ gcc_assert (multiple_p (crtl->outgoing_args_size, STACK_BOUNDARY / BITS_PER_UNIT)); - cfun->machine->frame.frame_size - = (cfun->machine->frame.hard_fp_offset - + crtl->outgoing_args_size); + frame.frame_size = above_outgoing_args + crtl->outgoing_args_size; - cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size; + frame.locals_offset = frame.saved_varargs_size; - cfun->machine->frame.initial_adjust = 0; - cfun->machine->frame.final_adjust = 0; - cfun->machine->frame.callee_adjust = 0; - cfun->machine->frame.callee_offset = 0; + frame.initial_adjust = 0; + frame.final_adjust = 0; + frame.callee_adjust = 0; + frame.sve_callee_adjust = 0; + frame.callee_offset = 0; HOST_WIDE_INT max_push_offset = 0; - if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM) + if (frame.wb_candidate2 != INVALID_REGNUM) max_push_offset = 512; - else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM) + else if (frame.wb_candidate1 != INVALID_REGNUM) max_push_offset = 256; - HOST_WIDE_INT const_size, const_fp_offset; - if (cfun->machine->frame.frame_size.is_constant (&const_size) + HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset; + HOST_WIDE_INT const_saved_regs_size; + if (frame.frame_size.is_constant (&const_size) && const_size < max_push_offset - && known_eq (crtl->outgoing_args_size, 0)) + && known_eq (frame.hard_fp_offset, const_size)) { /* Simple, small frame with no outgoing arguments: + stp reg1, reg2, [sp, -frame_size]! stp reg3, reg4, [sp, 16] */ - cfun->machine->frame.callee_adjust = const_size; - } - else if (known_lt (crtl->outgoing_args_size - + cfun->machine->frame.saved_regs_size, 512) + frame.callee_adjust = const_size; + } + else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size) + && frame.saved_regs_size.is_constant (&const_saved_regs_size) + && const_outgoing_args_size + const_saved_regs_size < 512 + /* We could handle this case even with outgoing args, provided + that the number of args left us with valid offsets for all + predicate and vector save slots. It's such a rare case that + it hardly seems worth the effort though. 
*/ + && (!saves_below_hard_fp_p || const_outgoing_args_size == 0) && !(cfun->calls_alloca - && known_lt (cfun->machine->frame.hard_fp_offset, - max_push_offset))) + && frame.hard_fp_offset.is_constant (&const_fp_offset) + && const_fp_offset < max_push_offset)) { /* Frame with small outgoing arguments: + sub sp, sp, frame_size stp reg1, reg2, [sp, outgoing_args_size] stp reg3, reg4, [sp, outgoing_args_size + 16] */ - cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size; - cfun->machine->frame.callee_offset - = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset; + frame.initial_adjust = frame.frame_size; + frame.callee_offset = const_outgoing_args_size; } - else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset) + else if (saves_below_hard_fp_p + && known_eq (frame.saved_regs_size, + frame.below_hard_fp_saved_regs_size)) + { + /* Frame in which all saves are SVE saves: + + sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size + save SVE registers relative to SP + sub sp, sp, outgoing_args_size */ + frame.initial_adjust = (frame.hard_fp_offset + + frame.below_hard_fp_saved_regs_size); + frame.final_adjust = crtl->outgoing_args_size; + } + else if (frame.hard_fp_offset.is_constant (&const_fp_offset) && const_fp_offset < max_push_offset) { - /* Frame with large outgoing arguments but a small local area: + /* Frame with large outgoing arguments or SVE saves, but with + a small local area: + stp reg1, reg2, [sp, -hard_fp_offset]! stp reg3, reg4, [sp, 16] + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] sub sp, sp, outgoing_args_size */ - cfun->machine->frame.callee_adjust = const_fp_offset; - cfun->machine->frame.final_adjust - = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust; + frame.callee_adjust = const_fp_offset; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; + frame.final_adjust = crtl->outgoing_args_size; } else { - /* Frame with large local area and outgoing arguments using frame pointer: + /* Frame with large local area and outgoing arguments or SVE saves, + using frame pointer: + sub sp, sp, hard_fp_offset stp x29, x30, [sp, 0] add x29, sp, 0 stp reg3, reg4, [sp, 16] + [sub sp, sp, below_hard_fp_saved_regs_size] + [save SVE registers relative to SP] sub sp, sp, outgoing_args_size */ - cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset; - cfun->machine->frame.final_adjust - = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust; + frame.initial_adjust = frame.hard_fp_offset; + frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size; + frame.final_adjust = crtl->outgoing_args_size; } - cfun->machine->frame.laid_out = true; + /* Make sure the individual adjustments add up to the full frame size. 
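The final assertion in aarch64_layout_frame checks that the four adjustments cover the whole frame. The sketch below (illustration only; all byte counts are invented) plugs concrete numbers into the "large local area, frame pointer first" shape described above and verifies the same identity.

#include <assert.h>
#include <stdio.h>

int
main (void)
{
  long hard_fp_offset = 80;                 /* saves and locals above the FP */
  long below_hard_fp_saved_regs_size = 96;  /* SVE vector + predicate saves  */
  long outgoing_args_size = 32;

  long frame_size = hard_fp_offset + below_hard_fp_saved_regs_size
                    + outgoing_args_size;

  /* Adjustments chosen as in the frame-pointer case above.  */
  long initial_adjust = hard_fp_offset;
  long callee_adjust = 0;
  long sve_callee_adjust = below_hard_fp_saved_regs_size;
  long final_adjust = outgoing_args_size;

  assert (initial_adjust + callee_adjust + sve_callee_adjust + final_adjust
          == frame_size);
  printf ("frame_size = %ld bytes\n", frame_size);
  return 0;
}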
*/ + gcc_assert (known_eq (frame.initial_adjust + + frame.callee_adjust + + frame.sve_callee_adjust + + frame.final_adjust, frame.frame_size)); + + frame.laid_out = true; } /* Return true if the register REGNO is saved on entry to @@ -4699,7 +6061,7 @@ aarch64_layout_frame (void) static bool aarch64_register_saved_on_entry (int regno) { - return cfun->machine->frame.reg_offset[regno] >= 0; + return known_ge (cfun->machine->frame.reg_offset[regno], 0); } /* Return the next register up from REGNO up to LIMIT for the callee @@ -4766,7 +6128,7 @@ static void aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment) { rtx_insn *insn; - machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); + machine_mode mode = aarch64_reg_save_mode (regno1); if (regno2 == INVALID_REGNUM) return aarch64_pushwb_single_reg (mode, regno1, adjustment); @@ -4812,7 +6174,7 @@ static void aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, rtx *cfi_ops) { - machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); + machine_mode mode = aarch64_reg_save_mode (regno1); rtx reg1 = gen_rtx_REG (mode, regno1); *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); @@ -4888,10 +6250,10 @@ aarch64_return_address_signing_enabled (void) gcc_assert (cfun->machine->frame.laid_out); /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function - if it's LR is pushed onto stack. */ + if its LR is pushed onto stack. */ return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF - && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0)); + && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0))); } /* Return TRUE if Branch Target Identification Mechanism is enabled. */ @@ -4901,17 +6263,75 @@ aarch64_bti_enabled (void) return (aarch64_enable_bti == 1); } +/* The caller is going to use ST1D or LD1D to save or restore an SVE + register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in + the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by: + + (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D + or LD1D address + + (2) setting PRED to a valid predicate register for the ST1D or LD1D, + if the variable isn't already nonnull + + (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE). + Handle this case using a temporary base register that is suitable for + all offsets in that range. Use ANCHOR_REG as this base register if it + is nonnull, otherwise create a new register and store it in ANCHOR_REG. */ + +static inline void +aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx, + rtx &anchor_reg, poly_int64 &offset, + rtx &ptrue) +{ + if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode))) + { + /* This is the maximum valid offset of the anchor from the base. + Lower values would be valid too. */ + poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode); + if (!anchor_reg) + { + anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); + emit_insn (gen_add3_insn (anchor_reg, base_rtx, + gen_int_mode (anchor_offset, Pmode))); + } + base_rtx = anchor_reg; + offset -= anchor_offset; + } + if (!ptrue) + { + int pred_reg = cfun->machine->frame.spare_pred_reg; + emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg), + CONSTM1_RTX (VNx16BImode)); + ptrue = gen_rtx_REG (VNx2BImode, pred_reg); + } +} + +/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG + is saved at BASE + OFFSET. 
*/ + +static void +aarch64_add_cfa_expression (rtx_insn *insn, rtx reg, + rtx base, poly_int64 offset) +{ + rtx mem = gen_frame_mem (GET_MODE (reg), + plus_constant (Pmode, base, offset)); + add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); +} + /* Emit code to save the callee-saved registers from register number START to LIMIT to the stack at the location starting at offset START_OFFSET, - skipping any write-back candidates if SKIP_WB is true. */ + skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P + is true if the hard frame pointer has been set up. */ static void -aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, - unsigned start, unsigned limit, bool skip_wb) +aarch64_save_callee_saves (poly_int64 start_offset, + unsigned start, unsigned limit, bool skip_wb, + bool hard_fp_valid_p) { rtx_insn *insn; unsigned regno; unsigned regno2; + rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX; for (regno = aarch64_next_callee_save (start, limit); regno <= limit; @@ -4919,7 +6339,7 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, { rtx reg, mem; poly_int64 offset; - int offset_diff; + bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); if (skip_wb && (regno == cfun->machine->frame.wb_candidate1 @@ -4927,27 +6347,53 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, continue; if (cfun->machine->reg_is_wrapped_separately[regno]) - continue; + continue; + machine_mode mode = aarch64_reg_save_mode (regno); reg = gen_rtx_REG (mode, regno); offset = start_offset + cfun->machine->frame.reg_offset[regno]; - mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx, - offset)); + rtx base_rtx = stack_pointer_rtx; + poly_int64 sp_offset = offset; - regno2 = aarch64_next_callee_save (regno + 1, limit); - offset_diff = cfun->machine->frame.reg_offset[regno2] - - cfun->machine->frame.reg_offset[regno]; + HOST_WIDE_INT const_offset; + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, + offset, ptrue); + else if (GP_REGNUM_P (regno) + && (!offset.is_constant (&const_offset) || const_offset >= 512)) + { + gcc_assert (known_eq (start_offset, 0)); + poly_int64 fp_offset + = cfun->machine->frame.below_hard_fp_saved_regs_size; + if (hard_fp_valid_p) + base_rtx = hard_frame_pointer_rtx; + else + { + if (!anchor_reg) + { + anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); + emit_insn (gen_add3_insn (anchor_reg, base_rtx, + gen_int_mode (fp_offset, Pmode))); + } + base_rtx = anchor_reg; + } + offset -= fp_offset; + } + mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); + bool need_cfa_note_p = (base_rtx != stack_pointer_rtx); - if (regno2 <= limit + if (!aarch64_sve_mode_p (mode) + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit && !cfun->machine->reg_is_wrapped_separately[regno2] - && known_eq (GET_MODE_SIZE (mode), offset_diff)) + && known_eq (GET_MODE_SIZE (mode), + cfun->machine->frame.reg_offset[regno2] + - cfun->machine->frame.reg_offset[regno])) { rtx reg2 = gen_rtx_REG (mode, regno2); rtx mem2; - offset = start_offset + cfun->machine->frame.reg_offset[regno2]; - mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx, - offset)); + offset += GET_MODE_SIZE (mode); + mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2)); @@ -4955,71 +6401,96 @@ aarch64_save_callee_saves (machine_mode mode, 
poly_int64 start_offset, always assumed to be relevant to the frame calculations; subsequent parts, are only frame-related if explicitly marked. */ - RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1; + if (aarch64_emit_cfi_for_reg_p (regno2)) + { + if (need_cfa_note_p) + aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx, + sp_offset + GET_MODE_SIZE (mode)); + else + RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1; + } + regno = regno2; } + else if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + { + insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg)); + need_cfa_note_p = true; + } + else if (aarch64_sve_mode_p (mode)) + insn = emit_insn (gen_rtx_SET (mem, reg)); else insn = emit_move_insn (mem, reg); - RTX_FRAME_RELATED_P (insn) = 1; + RTX_FRAME_RELATED_P (insn) = frame_related_p; + if (frame_related_p && need_cfa_note_p) + aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset); } } -/* Emit code to restore the callee registers of mode MODE from register - number START up to and including LIMIT. Restore from the stack offset - START_OFFSET, skipping any write-back candidates if SKIP_WB is true. - Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */ +/* Emit code to restore the callee registers from register number START + up to and including LIMIT. Restore from the stack offset START_OFFSET, + skipping any write-back candidates if SKIP_WB is true. Write the + appropriate REG_CFA_RESTORE notes into CFI_OPS. */ static void -aarch64_restore_callee_saves (machine_mode mode, - poly_int64 start_offset, unsigned start, +aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start, unsigned limit, bool skip_wb, rtx *cfi_ops) { - rtx base_rtx = stack_pointer_rtx; unsigned regno; unsigned regno2; poly_int64 offset; + rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX; for (regno = aarch64_next_callee_save (start, limit); regno <= limit; regno = aarch64_next_callee_save (regno + 1, limit)) { + bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); if (cfun->machine->reg_is_wrapped_separately[regno]) - continue; + continue; rtx reg, mem; - int offset_diff; if (skip_wb && (regno == cfun->machine->frame.wb_candidate1 || regno == cfun->machine->frame.wb_candidate2)) continue; + machine_mode mode = aarch64_reg_save_mode (regno); reg = gen_rtx_REG (mode, regno); offset = start_offset + cfun->machine->frame.reg_offset[regno]; + rtx base_rtx = stack_pointer_rtx; + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg, + offset, ptrue); mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); - regno2 = aarch64_next_callee_save (regno + 1, limit); - offset_diff = cfun->machine->frame.reg_offset[regno2] - - cfun->machine->frame.reg_offset[regno]; - - if (regno2 <= limit + if (!aarch64_sve_mode_p (mode) + && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit && !cfun->machine->reg_is_wrapped_separately[regno2] - && known_eq (GET_MODE_SIZE (mode), offset_diff)) + && known_eq (GET_MODE_SIZE (mode), + cfun->machine->frame.reg_offset[regno2] + - cfun->machine->frame.reg_offset[regno])) { rtx reg2 = gen_rtx_REG (mode, regno2); rtx mem2; - offset = start_offset + cfun->machine->frame.reg_offset[regno2]; + offset += GET_MODE_SIZE (mode); mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops); regno = regno2; } + else if (mode == VNx2DImode && 
BYTES_BIG_ENDIAN) + emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem)); + else if (aarch64_sve_mode_p (mode)) + emit_insn (gen_rtx_SET (reg, mem)); else emit_move_insn (reg, mem); - *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops); + if (frame_related_p) + *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops); } } @@ -5101,13 +6572,35 @@ aarch64_get_separate_components (void) for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) if (aarch64_register_saved_on_entry (regno)) { + /* Punt on saves and restores that use ST1D and LD1D. We could + try to be smarter, but it would involve making sure that the + spare predicate register itself is safe to use at the save + and restore points. Also, when a frame pointer is being used, + the slots are often out of reach of ST1D and LD1D anyway. */ + machine_mode mode = aarch64_reg_save_mode (regno); + if (mode == VNx2DImode && BYTES_BIG_ENDIAN) + continue; + poly_int64 offset = cfun->machine->frame.reg_offset[regno]; - if (!frame_pointer_needed) - offset += cfun->machine->frame.frame_size - - cfun->machine->frame.hard_fp_offset; + + /* If the register is saved in the first SVE save slot, we use + it as a stack probe for -fstack-clash-protection. */ + if (flag_stack_clash_protection + && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0) + && known_eq (offset, 0)) + continue; + + /* Get the offset relative to the register we'll use. */ + if (frame_pointer_needed) + offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; + else + offset += crtl->outgoing_args_size; + /* Check that we can access the stack slot of the register with one direct load with no adjustments needed. */ - if (offset_12bit_unsigned_scaled_p (DImode, offset)) + if (aarch64_sve_mode_p (mode) + ? offset_9bit_signed_scaled_p (mode, offset) + : offset_12bit_unsigned_scaled_p (mode, offset)) bitmap_set_bit (components, regno); } @@ -5115,6 +6608,12 @@ aarch64_get_separate_components (void) if (frame_pointer_needed) bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); + /* If the spare predicate register used by big-endian SVE code + is call-preserved, it must be saved in the main prologue + before any saves that use it. */ + if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM) + bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg); + unsigned reg1 = cfun->machine->frame.wb_candidate1; unsigned reg2 = cfun->machine->frame.wb_candidate2; /* If registers have been chosen to be stored/restored with @@ -5139,31 +6638,48 @@ aarch64_components_for_bb (basic_block bb) bitmap in = DF_LIVE_IN (bb); bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; - bool simd_function = aarch64_simd_decl_p (cfun->decl); sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); bitmap_clear (components); + /* Clobbered registers don't generate values in any meaningful sense, + since nothing after the clobber can rely on their value. And we can't + say that partially-clobbered registers are unconditionally killed, + because whether they're killed or not depends on the mode of the + value they're holding. Thus partially call-clobbered registers + appear in neither the kill set nor the gen set. + + Check manually for any calls that clobber more of a register than the + current function can. 
*/ + function_abi_aggregator callee_abis; + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (CALL_P (insn)) + callee_abis.note_callee_abi (insn_callee_abi (insn)); + HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi); + /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) - if ((!call_used_regs[regno] - || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))) - && (bitmap_bit_p (in, regno) - || bitmap_bit_p (gen, regno) - || bitmap_bit_p (kill, regno))) + if (!fixed_regs[regno] + && !crtl->abi->clobbers_full_reg_p (regno) + && (TEST_HARD_REG_BIT (extra_caller_saves, regno) + || bitmap_bit_p (in, regno) + || bitmap_bit_p (gen, regno) + || bitmap_bit_p (kill, regno))) { - unsigned regno2, offset, offset2; bitmap_set_bit (components, regno); /* If there is a callee-save at an adjacent offset, add it too to increase the use of LDP/STP. */ - offset = cfun->machine->frame.reg_offset[regno]; - regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1; + poly_int64 offset = cfun->machine->frame.reg_offset[regno]; + unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1; if (regno2 <= LAST_SAVED_REGNUM) { - offset2 = cfun->machine->frame.reg_offset[regno2]; - if ((offset & ~8) == (offset2 & ~8)) + poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; + if (regno < regno2 + ? known_eq (offset + 8, offset2) + : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset)) bitmap_set_bit (components, regno2); } } @@ -5218,16 +6734,16 @@ aarch64_process_components (sbitmap components, bool prologue_p) while (regno != last_regno) { - /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved - so DFmode for the vector registers is enough. For simd functions - we want to save the low 128 bits. */ - machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno); + bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); + machine_mode mode = aarch64_reg_save_mode (regno); rtx reg = gen_rtx_REG (mode, regno); poly_int64 offset = cfun->machine->frame.reg_offset[regno]; - if (!frame_pointer_needed) - offset += cfun->machine->frame.frame_size - - cfun->machine->frame.hard_fp_offset; + if (frame_pointer_needed) + offset -= cfun->machine->frame.below_hard_fp_saved_regs_size; + else + offset += crtl->outgoing_args_size; + rtx addr = plus_constant (Pmode, ptr_reg, offset); rtx mem = gen_frame_mem (mode, addr); @@ -5238,39 +6754,49 @@ aarch64_process_components (sbitmap components, bool prologue_p) if (regno2 == last_regno) { insn = emit_insn (set); - RTX_FRAME_RELATED_P (insn) = 1; - if (prologue_p) - add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); - else - add_reg_note (insn, REG_CFA_RESTORE, reg); + if (frame_related_p) + { + RTX_FRAME_RELATED_P (insn) = 1; + if (prologue_p) + add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); + else + add_reg_note (insn, REG_CFA_RESTORE, reg); + } break; } poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; /* The next register is not of the same class or its offset is not mergeable with the current one into a pair. 
*/ - if (!satisfies_constraint_Ump (mem) + if (aarch64_sve_mode_p (mode) + || !satisfies_constraint_Ump (mem) || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) - || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno)) + || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno)) || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), GET_MODE_SIZE (mode))) { insn = emit_insn (set); - RTX_FRAME_RELATED_P (insn) = 1; - if (prologue_p) - add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); - else - add_reg_note (insn, REG_CFA_RESTORE, reg); + if (frame_related_p) + { + RTX_FRAME_RELATED_P (insn) = 1; + if (prologue_p) + add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); + else + add_reg_note (insn, REG_CFA_RESTORE, reg); + } regno = regno2; continue; } + bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2); + /* REGNO2 can be saved/restored in a pair with REGNO. */ rtx reg2 = gen_rtx_REG (mode, regno2); - if (!frame_pointer_needed) - offset2 += cfun->machine->frame.frame_size - - cfun->machine->frame.hard_fp_offset; + if (frame_pointer_needed) + offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size; + else + offset2 += crtl->outgoing_args_size; rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); rtx mem2 = gen_frame_mem (mode, addr2); rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2) @@ -5281,16 +6807,23 @@ aarch64_process_components (sbitmap components, bool prologue_p) else insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); - RTX_FRAME_RELATED_P (insn) = 1; - if (prologue_p) - { - add_reg_note (insn, REG_CFA_OFFSET, set); - add_reg_note (insn, REG_CFA_OFFSET, set2); - } - else + if (frame_related_p || frame_related2_p) { - add_reg_note (insn, REG_CFA_RESTORE, reg); - add_reg_note (insn, REG_CFA_RESTORE, reg2); + RTX_FRAME_RELATED_P (insn) = 1; + if (prologue_p) + { + if (frame_related_p) + add_reg_note (insn, REG_CFA_OFFSET, set); + if (frame_related2_p) + add_reg_note (insn, REG_CFA_OFFSET, set2); + } + else + { + if (frame_related_p) + add_reg_note (insn, REG_CFA_RESTORE, reg); + if (frame_related2_p) + add_reg_note (insn, REG_CFA_RESTORE, reg2); + } } regno = aarch64_get_next_set_bit (components, regno2 + 1); @@ -5359,15 +6892,31 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, HOST_WIDE_INT guard_size = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; - /* When doing the final adjustment for the outgoing argument size we can't - assume that LR was saved at position 0. So subtract it's offset from the - ABI safe buffer so that we don't accidentally allow an adjustment that - would result in an allocation larger than the ABI buffer without - probing. */ HOST_WIDE_INT min_probe_threshold - = final_adjustment_p - ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM] - : guard_size - guard_used_by_caller; + = (final_adjustment_p + ? guard_used_by_caller + : guard_size - guard_used_by_caller); + /* When doing the final adjustment for the outgoing arguments, take into + account any unprobed space there is above the current SP. There are + two cases: + + - When saving SVE registers below the hard frame pointer, we force + the lowest save to take place in the prologue before doing the final + adjustment (i.e. we don't allow the save to be shrink-wrapped). + This acts as a probe at SP, so there is no unprobed space. + + - When there are no SVE register saves, we use the store of the link + register as a probe. 
We can't assume that LR was saved at position 0 + though, so treat any space below it as unprobed. */ + if (final_adjustment_p + && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)) + { + poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM]; + if (known_ge (lr_offset, 0)) + min_probe_threshold -= lr_offset.to_constant (); + else + gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0)); + } poly_int64 frame_size = cfun->machine->frame.frame_size; @@ -5377,13 +6926,15 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, if (flag_stack_clash_protection && !final_adjustment_p) { poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; + poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; poly_int64 final_adjust = cfun->machine->frame.final_adjust; if (known_eq (frame_size, 0)) { dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); } - else if (known_lt (initial_adjust, guard_size - guard_used_by_caller) + else if (known_lt (initial_adjust + sve_callee_adjust, + guard_size - guard_used_by_caller) && known_lt (final_adjust, guard_used_by_caller)) { dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); @@ -5583,24 +7134,10 @@ aarch64_epilogue_uses (int regno) { if (regno == LR_REGNUM) return 1; - if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno)) - return 1; } return 0; } -/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG - is saved at BASE + OFFSET. */ - -static void -aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg, - rtx base, poly_int64 offset) -{ - rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset)); - add_reg_note (insn, REG_CFA_EXPRESSION, - gen_rtx_SET (mem, regno_reg_rtx[reg])); -} - /* AArch64 stack frames generated by this compiler look like: +-------------------------------+ @@ -5622,8 +7159,12 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg, +-------------------------------+ | | LR' | | +-------------------------------+ | - | FP' | / <- hard_frame_pointer_rtx (aligned) - +-------------------------------+ + | FP' | | + +-------------------------------+ |<- hard_frame_pointer_rtx (aligned) + | SVE vector registers | | \ + +-------------------------------+ | | below_hard_fp_saved_regs_size + | SVE predicate registers | / / + +-------------------------------+ | dynamic allocation | +-------------------------------+ | padding | @@ -5656,7 +7197,8 @@ aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg, The following registers are reserved during frame layout and should not be used for any other purpose: - - r11: Used by stack clash protection when SVE is enabled. + - r11: Used by stack clash protection when SVE is enabled, and also + as an anchor register when saving and restoring registers - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment. - r14 and r15: Used for speculation tracking. - r16(IP0), r17(IP1): Used by indirect tailcalls. 
@@ -5679,15 +7221,37 @@ aarch64_expand_prologue (void) HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; poly_int64 final_adjust = cfun->machine->frame.final_adjust; poly_int64 callee_offset = cfun->machine->frame.callee_offset; + poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; + poly_int64 below_hard_fp_saved_regs_size + = cfun->machine->frame.below_hard_fp_saved_regs_size; unsigned reg1 = cfun->machine->frame.wb_candidate1; unsigned reg2 = cfun->machine->frame.wb_candidate2; bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; rtx_insn *insn; + if (flag_stack_clash_protection && known_eq (callee_adjust, 0)) + { + /* Fold the SVE allocation into the initial allocation. + We don't do this in aarch64_layout_arg to avoid pessimizing + the epilogue code. */ + initial_adjust += sve_callee_adjust; + sve_callee_adjust = 0; + } + /* Sign return address for functions. */ if (aarch64_return_address_signing_enabled ()) { - insn = emit_insn (gen_pacisp ()); + switch (aarch64_ra_sign_key) + { + case AARCH64_KEY_A: + insn = emit_insn (gen_paciasp ()); + break; + case AARCH64_KEY_B: + insn = emit_insn (gen_pacibsp ()); + break; + default: + gcc_unreachable (); + } add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); RTX_FRAME_RELATED_P (insn) = 1; } @@ -5726,18 +7290,27 @@ aarch64_expand_prologue (void) if (callee_adjust != 0) aarch64_push_regs (reg1, reg2, callee_adjust); + /* The offset of the frame chain record (if any) from the current SP. */ + poly_int64 chain_offset = (initial_adjust + callee_adjust + - cfun->machine->frame.hard_fp_offset); + gcc_assert (known_ge (chain_offset, 0)); + + /* The offset of the bottom of the save area from the current SP. */ + poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size; + if (emit_frame_chain) { - poly_int64 reg_offset = callee_adjust; if (callee_adjust == 0) { reg1 = R29_REGNUM; reg2 = R30_REGNUM; - reg_offset = callee_offset; - aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false); + aarch64_save_callee_saves (saved_regs_offset, reg1, reg2, + false, false); } + else + gcc_assert (known_eq (chain_offset, 0)); aarch64_add_offset (Pmode, hard_frame_pointer_rtx, - stack_pointer_rtx, callee_offset, + stack_pointer_rtx, chain_offset, tmp1_rtx, tmp0_rtx, frame_pointer_needed); if (frame_pointer_needed && !frame_size.is_constant ()) { @@ -5764,23 +7337,31 @@ aarch64_expand_prologue (void) /* Change the save slot expressions for the registers that we've already saved. 
*/ - reg_offset -= callee_offset; - aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx, - reg_offset + UNITS_PER_WORD); - aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx, - reg_offset); + aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2], + hard_frame_pointer_rtx, UNITS_PER_WORD); + aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1], + hard_frame_pointer_rtx, 0); } emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx)); } - aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, - callee_adjust != 0 || emit_frame_chain); - if (aarch64_simd_decl_p (cfun->decl)) - aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM, - callee_adjust != 0 || emit_frame_chain); - else - aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, - callee_adjust != 0 || emit_frame_chain); + aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM, + callee_adjust != 0 || emit_frame_chain, + emit_frame_chain); + if (maybe_ne (sve_callee_adjust, 0)) + { + gcc_assert (!flag_stack_clash_protection + || known_eq (initial_adjust, 0)); + aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, + sve_callee_adjust, + !frame_pointer_needed, false); + saved_regs_offset += sve_callee_adjust; + } + aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM, + false, emit_frame_chain); + aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM, + callee_adjust != 0 || emit_frame_chain, + emit_frame_chain); /* We may need to probe the final adjustment if it is larger than the guard that is assumed by the called. */ @@ -5806,19 +7387,6 @@ aarch64_use_return_insn_p (void) return known_eq (cfun->machine->frame.frame_size, 0); } -/* Return false for non-leaf SIMD functions in order to avoid - shrink-wrapping them. Doing this will lose the necessary - save/restore of FP registers. */ - -bool -aarch64_use_simple_return_insn_p (void) -{ - if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf) - return false; - - return true; -} - /* Generate the epilogue instructions for returning from a function. This is almost exactly the reverse of the prolog sequence, except that we need to insert barriers to avoid scheduling loads that read @@ -5831,6 +7399,9 @@ aarch64_expand_epilogue (bool for_sibcall) HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; poly_int64 final_adjust = cfun->machine->frame.final_adjust; poly_int64 callee_offset = cfun->machine->frame.callee_offset; + poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust; + poly_int64 below_hard_fp_saved_regs_size + = cfun->machine->frame.below_hard_fp_saved_regs_size; unsigned reg1 = cfun->machine->frame.wb_candidate1; unsigned reg2 = cfun->machine->frame.wb_candidate2; rtx cfi_ops = NULL; @@ -5844,15 +7415,23 @@ aarch64_expand_epilogue (bool for_sibcall) = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; - /* We can re-use the registers when the allocation amount is smaller than - guard_size - guard_used_by_caller because we won't be doing any probes - then. 
In such situations the register should remain live with the correct + /* We can re-use the registers when: + + (a) the deallocation amount is the same as the corresponding + allocation amount (which is false if we combine the initial + and SVE callee save allocations in the prologue); and + + (b) the allocation amount doesn't need a probe (which is false + if the amount is guard_size - guard_used_by_caller or greater). + + In such situations the register should remain live with the correct value. */ bool can_inherit_p = (initial_adjust.is_constant () - && final_adjust.is_constant ()) + && final_adjust.is_constant () && (!flag_stack_clash_protection - || known_lt (initial_adjust, - guard_size - guard_used_by_caller)); + || (known_lt (initial_adjust, + guard_size - guard_used_by_caller) + && known_eq (sve_callee_adjust, 0)))); /* We need to add memory barrier to prevent read from deallocated stack. */ bool need_barrier_p @@ -5877,7 +7456,8 @@ aarch64_expand_epilogue (bool for_sibcall) /* If writeback is used when restoring callee-saves, the CFA is restored on the instruction doing the writeback. */ aarch64_add_offset (Pmode, stack_pointer_rtx, - hard_frame_pointer_rtx, -callee_offset, + hard_frame_pointer_rtx, + -callee_offset - below_hard_fp_saved_regs_size, tmp1_rtx, tmp0_rtx, callee_adjust == 0); else /* The case where we need to re-use the register here is very rare, so @@ -5885,14 +7465,17 @@ aarch64_expand_epilogue (bool for_sibcall) immediate doesn't fit. */ aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true); - aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, + /* Restore the vector registers before the predicate registers, + so that we can use P4 as a temporary for big-endian SVE frames. */ + aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM, + callee_adjust != 0, &cfi_ops); + aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM, + false, &cfi_ops); + if (maybe_ne (sve_callee_adjust, 0)) + aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true); + aarch64_restore_callee_saves (callee_offset - sve_callee_adjust, + R0_REGNUM, R30_REGNUM, callee_adjust != 0, &cfi_ops); - if (aarch64_simd_decl_p (cfun->decl)) - aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM, - callee_adjust != 0, &cfi_ops); - else - aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, - callee_adjust != 0, &cfi_ops); if (need_barrier_p) emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); @@ -5943,13 +7526,23 @@ aarch64_expand_epilogue (bool for_sibcall) if (aarch64_return_address_signing_enabled () && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return)) { - insn = emit_insn (gen_autisp ()); + switch (aarch64_ra_sign_key) + { + case AARCH64_KEY_A: + insn = emit_insn (gen_autiasp ()); + break; + case AARCH64_KEY_B: + insn = emit_insn (gen_autibsp ()); + break; + default: + gcc_unreachable (); + } add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); RTX_FRAME_RELATED_P (insn) = 1; } /* Stack adjustment for exception handler. */ - if (crtl->calls_eh_return) + if (crtl->calls_eh_return && !for_sibcall) { /* We need to unwind the stack by the offset computed by EH_RETURN_STACKADJ_RTX. 
We have already reset the CFA @@ -6015,6 +7608,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, int this_regno = R0_REGNUM; rtx this_rtx, temp0, temp1, addr, funexp; rtx_insn *insn; + const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk)); if (aarch64_bti_enabled ()) emit_insn (gen_bti_c()); @@ -6077,14 +7671,18 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, } funexp = XEXP (DECL_RTL (function), 0); funexp = gen_rtx_MEM (FUNCTION_MODE, funexp); - insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX)); + rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode); + insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi)); SIBLING_CALL_P (insn) = 1; insn = get_insns (); shorten_branches (insn); + + assemble_start_function (thunk, fnname); final_start_function (insn, file, 1); final (insn, file, 1); final_end_function (); + assemble_end_function (thunk, fnname); /* Stop pretending to be a post-reload pass. */ reload_completed = 0; @@ -6608,9 +8206,15 @@ aarch64_classify_address (struct aarch64_address_info *info, HOST_WIDE_INT const_size; + /* Whether a vector mode is partial doesn't affect address legitimacy. + Partial vectors like VNx8QImode allow the same indexed addressing + mode and MUL VL addressing mode as full vectors like VNx16QImode; + in both cases, MUL VL counts multiples of GET_MODE_SIZE. */ + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + vec_flags &= ~VEC_PARTIAL; + /* On BE, we use load/store pair for all large int mode load/stores. TI/TFmode may also use a load/store pair. */ - unsigned int vec_flags = aarch64_classify_vector_mode (mode); bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT)); bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP || type == ADDR_QUERY_LDP_STP_N @@ -6628,7 +8232,7 @@ aarch64_classify_address (struct aarch64_address_info *info, bool allow_reg_index_p = (!load_store_pair_p && (known_lt (GET_MODE_SIZE (mode), 16) || vec_flags == VEC_ADVSIMD - || vec_flags == VEC_SVE_DATA)); + || vec_flags & VEC_SVE_DATA)); /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and [Rn, #offset, MUL VL]. */ @@ -7152,11 +8756,12 @@ aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) RESULT is the register in which the result is returned. It's NULL for "call" and "sibcall". MEM is the location of the function call. + CALLEE_ABI is a const_int that gives the arm_pcs of the callee. SIBCALL indicates whether this function call is normal call or sibling call. It will generate different pattern accordingly. 
*/ void -aarch64_expand_call (rtx result, rtx mem, bool sibcall) +aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall) { rtx call, callee, tmp; rtvec vec; @@ -7186,7 +8791,11 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall) else tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM)); - vec = gen_rtvec (2, call, tmp); + gcc_assert (CONST_INT_P (callee_abi)); + callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi), + UNSPEC_CALLEE_ABI); + + vec = gen_rtvec (3, call, callee_abi, tmp); call = gen_rtx_PARALLEL (VOIDmode, vec); aarch64_emit_call_insn (call); @@ -7382,6 +8991,21 @@ aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code) } break; + case E_CC_NZCmode: + switch (comp_code) + { + case NE: return AARCH64_NE; /* = any */ + case EQ: return AARCH64_EQ; /* = none */ + case GE: return AARCH64_PL; /* = nfrst */ + case LT: return AARCH64_MI; /* = first */ + case GEU: return AARCH64_CS; /* = nlast */ + case GTU: return AARCH64_HI; /* = pmore */ + case LEU: return AARCH64_LS; /* = plast */ + case LTU: return AARCH64_CC; /* = last */ + default: return -1; + } + break; + case E_CC_NZmode: switch (comp_code) { @@ -7524,15 +9148,24 @@ aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate) if (negate) r = real_value_negate (&r); - /* We only handle the SVE single-bit immediates here. */ + /* Handle the SVE single-bit immediates specially, since they have a + fixed form in the assembly syntax. */ if (real_equal (&r, &dconst0)) asm_fprintf (f, "0.0"); + else if (real_equal (&r, &dconst2)) + asm_fprintf (f, "2.0"); else if (real_equal (&r, &dconst1)) asm_fprintf (f, "1.0"); else if (real_equal (&r, &dconsthalf)) asm_fprintf (f, "0.5"); else - return false; + { + const int buf_size = 20; + char float_buf[buf_size] = {'\0'}; + real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, + 1, GET_MODE (elt)); + asm_fprintf (f, "%s", float_buf); + } return true; } @@ -7560,7 +9193,13 @@ sizetochar (int size) 'D': Take the duplicated element in a vector constant and print it as an unsigned integer, in decimal. 'e': Print the sign/zero-extend size as a character 8->b, - 16->h, 32->w. + 16->h, 32->w. Can also be used for masks: + 0xff->b, 0xffff->h, 0xffffffff->w. + 'I': If the operand is a duplicated vector constant, + replace it with the duplicated scalar. If the + operand is then a floating-point constant, replace + it with the integer bit representation. Print the + transformed constant as a signed decimal number. 'p': Prints N such that 2^N == X (X must be power of 2 and const int). 'P': Print the number of non-zero bits in X (a const_int). @@ -7574,7 +9213,7 @@ sizetochar (int size) 'S/T/U/V': Print a FP/SIMD register name for a register list. The register printed is the FP/SIMD register name of X + 0/1/2/3 for S/T/U/V. - 'R': Print a scalar FP/SIMD register name + 1. + 'R': Print a scalar Integer/FP/SIMD register name + 1. 'X': Print bottom 16 bits of integer constant in hex. 'w/x': Print a general register name or the zero register (32-bit or 64-bit). 
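Editorial note on the operand-modifier documentation above: the 'e' modifier now accepts both sign/zero-extend widths and the corresponding masks, as implemented in the '%e' hunk that follows. The short standalone C sketch below mirrors that mapping for illustration; extend_suffix is a name invented here and is not the GCC implementation.

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch: map an extend size (8/16/32) or a mask
   (0xff/0xffff/0xffffffff) to the 'b'/'h'/'w' suffix described above.
   Returns 0 if the value has no valid suffix.  */
static char
extend_suffix (uint64_t val)
{
  if ((val & ~(uint64_t) 7) == 8 || val == 0xff)
    return 'b';
  if ((val & ~(uint64_t) 7) == 16 || val == 0xffff)
    return 'h';
  if ((val & ~(uint64_t) 7) == 32 || val == 0xffffffff)
    return 'w';
  return 0;
}

int
main (void)
{
  printf ("%c %c %c\n", extend_suffix (8), extend_suffix (0xffff),
          extend_suffix (0xffffffff));   /* prints: b h w */
  return 0;
}

Running the sketch prints "b h w": 8 and 0xff select the byte suffix, 0xffff the halfword suffix, and 0xffffffff the word suffix.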
@@ -7626,27 +9265,22 @@ aarch64_print_operand (FILE *f, rtx x, int code) case 'e': { - int n; - - if (!CONST_INT_P (x) - || (n = exact_log2 (INTVAL (x) & ~7)) <= 0) + x = unwrap_const_vec_duplicate (x); + if (!CONST_INT_P (x)) { output_operand_lossage ("invalid operand for '%%%c'", code); return; } - switch (n) + HOST_WIDE_INT val = INTVAL (x); + if ((val & ~7) == 8 || val == 0xff) + fputc ('b', f); + else if ((val & ~7) == 16 || val == 0xffff) + fputc ('h', f); + else if ((val & ~7) == 32 || val == 0xffffffff) + fputc ('w', f); + else { - case 3: - fputc ('b', f); - break; - case 4: - fputc ('h', f); - break; - case 5: - fputc ('w', f); - break; - default: output_operand_lossage ("invalid operand for '%%%c'", code); return; } @@ -7693,6 +9327,19 @@ aarch64_print_operand (FILE *f, rtx x, int code) asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]); break; + case 'I': + { + x = aarch64_bit_representation (unwrap_const_vec_duplicate (x)); + if (CONST_INT_P (x)) + asm_fprintf (f, "%wd", INTVAL (x)); + else + { + output_operand_lossage ("invalid operand for '%%%c'", code); + return; + } + break; + } + case 'M': case 'm': { @@ -7715,7 +9362,10 @@ aarch64_print_operand (FILE *f, rtx x, int code) gcc_assert (cond_code >= 0); if (code == 'M') cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code); - fputs (aarch64_condition_codes[cond_code], f); + if (GET_MODE (XEXP (x, 0)) == CC_NZCmode) + fputs (aarch64_sve_condition_codes[cond_code], f); + else + fputs (aarch64_condition_codes[cond_code], f); } break; @@ -7766,12 +9416,13 @@ aarch64_print_operand (FILE *f, rtx x, int code) break; case 'R': - if (!REG_P (x) || !FP_REGNUM_P (REGNO (x))) - { - output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code); - return; - } - asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1); + if (REG_P (x) && FP_REGNUM_P (REGNO (x))) + asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1); + else if (REG_P (x) && GP_REGNUM_P (REGNO (x))) + asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1); + else + output_operand_lossage ("incompatible register operand for '%%%c'", + code); break; case 'X': @@ -8068,7 +9719,7 @@ aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x, aarch64_addr_query_type type) { struct aarch64_address_info addr; - unsigned int size; + unsigned int size, vec_flags; /* Check all addresses are Pmode - including ILP32. 
*/ if (GET_MODE (x) != Pmode @@ -8084,26 +9735,24 @@ aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x, { case ADDRESS_REG_IMM: if (known_eq (addr.const_offset, 0)) - asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]); - else if (aarch64_sve_data_mode_p (mode)) { - HOST_WIDE_INT vnum - = exact_div (addr.const_offset, - BYTES_PER_SVE_VECTOR).to_constant (); - asm_fprintf (f, "[%s, #%wd, mul vl]", - reg_names[REGNO (addr.base)], vnum); + asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]); + return true; } - else if (aarch64_sve_pred_mode_p (mode)) + + vec_flags = aarch64_classify_vector_mode (mode); + if (vec_flags & VEC_ANY_SVE) { HOST_WIDE_INT vnum = exact_div (addr.const_offset, - BYTES_PER_SVE_PRED).to_constant (); + aarch64_vl_bytes (mode, vec_flags)).to_constant (); asm_fprintf (f, "[%s, #%wd, mul vl]", reg_names[REGNO (addr.base)], vnum); + return true; } - else - asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)], - INTVAL (addr.offset)); + + asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)], + INTVAL (addr.offset)); return true; case ADDRESS_REG_REG: @@ -8234,11 +9883,15 @@ aarch64_regno_regclass (unsigned regno) return POINTER_REGS; if (FP_REGNUM_P (regno)) - return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS; + return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS + : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS); if (PR_REGNUM_P (regno)) return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS; + if (regno == FFR_REGNUM || regno == FFRT_REGNUM) + return FFR_REGS; + return NO_REGS; } @@ -8348,13 +10001,14 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, secondary_reload_info *sri) { /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled - directly by the *aarch64_sve_mov_be move pattern. See the + directly by the *aarch64_sve_mov_[lb]e move patterns. See the comment at the head of aarch64-sve.md for more details about the big-endian handling. */ if (BYTES_BIG_ENDIAN && reg_class_subset_p (rclass, FP_REGS) && !((REG_P (x) && HARD_REGISTER_P (x)) || aarch64_simd_valid_immediate (x, NULL)) + && mode != VNx16QImode && aarch64_sve_data_mode_p (mode)) { sri->icode = CODE_FOR_aarch64_sve_reload_be; @@ -8514,7 +10168,7 @@ aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode) can hold MODE, but at the moment we need to handle all modes. Just ignore any runtime parts for registers that can't store them. */ HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode)); - unsigned int nregs; + unsigned int nregs, vec_flags; switch (regclass) { case TAILCALL_ADDR_REGS: @@ -8524,17 +10178,21 @@ aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode) case POINTER_AND_FP_REGS: case FP_REGS: case FP_LO_REGS: - if (aarch64_sve_data_mode_p (mode) + case FP_LO8_REGS: + vec_flags = aarch64_classify_vector_mode (mode); + if ((vec_flags & VEC_SVE_DATA) && constant_multiple_p (GET_MODE_SIZE (mode), - BYTES_PER_SVE_VECTOR, &nregs)) + aarch64_vl_bytes (mode, vec_flags), &nregs)) return nregs; - return (aarch64_vector_data_mode_p (mode) + return (vec_flags & VEC_ADVSIMD ? CEIL (lowest_size, UNITS_PER_VREG) : CEIL (lowest_size, UNITS_PER_WORD)); case STACK_REG: case PR_REGS: case PR_LO_REGS: case PR_HI_REGS: + case FFR_REGS: + case PR_AND_FFR_REGS: return 1; case NO_REGS: @@ -10715,6 +12373,14 @@ aarch64_register_move_cost (machine_mode mode, if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS) from = GENERAL_REGS; + /* Make RDFFR very expensive. In particular, if we know that the FFR + contains a PTRUE (e.g. 
after a SETFFR), we must never use RDFFR + as a way of obtaining a PTRUE. */ + if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL + && hard_reg_set_subset_p (reg_class_contents[from_i], + reg_class_contents[FFR_REGS])) + return 80; + /* Moving between GPR and stack cost is the same as GP2GP. */ if ((from == GENERAL_REGS && to == STACK_REG) || (to == GENERAL_REGS && from == STACK_REG)) @@ -10764,6 +12430,93 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, return aarch64_tune_params.memmov_cost; } +/* Implement TARGET_INIT_BUILTINS. */ +static void +aarch64_init_builtins () +{ + aarch64_general_init_builtins (); + aarch64_sve::init_builtins (); +} + +/* Implement TARGET_FOLD_BUILTIN. */ +static tree +aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool) +{ + unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); + unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + switch (code & AARCH64_BUILTIN_CLASS) + { + case AARCH64_BUILTIN_GENERAL: + return aarch64_general_fold_builtin (subcode, type, nargs, args); + + case AARCH64_BUILTIN_SVE: + return NULL_TREE; + } + gcc_unreachable (); +} + +/* Implement TARGET_GIMPLE_FOLD_BUILTIN. */ +static bool +aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi) +{ + gcall *stmt = as_a (gsi_stmt (*gsi)); + tree fndecl = gimple_call_fndecl (stmt); + unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); + unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; + gimple *new_stmt = NULL; + switch (code & AARCH64_BUILTIN_CLASS) + { + case AARCH64_BUILTIN_GENERAL: + new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt); + break; + + case AARCH64_BUILTIN_SVE: + new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt); + break; + } + + if (!new_stmt) + return false; + + gsi_replace (gsi, new_stmt, true); + return true; +} + +/* Implement TARGET_EXPAND_BUILTIN. */ +static rtx +aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore) +{ + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); + unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; + switch (code & AARCH64_BUILTIN_CLASS) + { + case AARCH64_BUILTIN_GENERAL: + return aarch64_general_expand_builtin (subcode, exp, target, ignore); + + case AARCH64_BUILTIN_SVE: + return aarch64_sve::expand_builtin (subcode, exp, target); + } + gcc_unreachable (); +} + +/* Implement TARGET_BUILTIN_DECL. */ +static tree +aarch64_builtin_decl (unsigned int code, bool initialize_p) +{ + unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; + switch (code & AARCH64_BUILTIN_CLASS) + { + case AARCH64_BUILTIN_GENERAL: + return aarch64_general_builtin_decl (subcode, initialize_p); + + case AARCH64_BUILTIN_SVE: + return aarch64_sve::builtin_decl (subcode, initialize_p); + } + gcc_unreachable (); +} + /* Return true if it is safe and beneficial to use the approximate rsqrt optabs to optimize 1.0/sqrt. 
*/ @@ -10787,7 +12540,17 @@ aarch64_builtin_reciprocal (tree fndecl) if (!use_rsqrt_p (mode)) return NULL_TREE; - return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl)); + unsigned int code = DECL_MD_FUNCTION_CODE (fndecl); + unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT; + switch (code & AARCH64_BUILTIN_CLASS) + { + case AARCH64_BUILTIN_GENERAL: + return aarch64_general_builtin_rsqrt (subcode); + + case AARCH64_BUILTIN_SVE: + return NULL_TREE; + } + gcc_unreachable (); } /* Emit instruction sequence to compute either the approximate square root @@ -11096,7 +12859,7 @@ static void initialize_aarch64_code_model (struct gcc_options *); static enum aarch64_parse_opt_result aarch64_parse_arch (const char *to_parse, const struct processor **res, - unsigned long *isa_flags, std::string *invalid_extension) + uint64_t *isa_flags, std::string *invalid_extension) { const char *ext; const struct processor *arch; @@ -11119,7 +12882,7 @@ aarch64_parse_arch (const char *to_parse, const struct processor **res, if (strlen (arch->name) == len && strncmp (arch->name, to_parse, len) == 0) { - unsigned long isa_temp = arch->flags; + uint64_t isa_temp = arch->flags; if (ext != NULL) { @@ -11151,7 +12914,7 @@ aarch64_parse_arch (const char *to_parse, const struct processor **res, static enum aarch64_parse_opt_result aarch64_parse_cpu (const char *to_parse, const struct processor **res, - unsigned long *isa_flags, std::string *invalid_extension) + uint64_t *isa_flags, std::string *invalid_extension) { const char *ext; const struct processor *cpu; @@ -11173,7 +12936,7 @@ aarch64_parse_cpu (const char *to_parse, const struct processor **res, { if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0) { - unsigned long isa_temp = cpu->flags; + uint64_t isa_temp = cpu->flags; if (ext != NULL) @@ -11758,7 +13521,7 @@ aarch64_print_hint_for_extensions (const std::string &str) static bool aarch64_validate_mcpu (const char *str, const struct processor **res, - unsigned long *isa_flags) + uint64_t *isa_flags) { std::string invalid_extension; enum aarch64_parse_opt_result parse_res @@ -11885,9 +13648,9 @@ aarch64_validate_mbranch_protection (const char *const_str) enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (const_str, &str); if (res == AARCH64_PARSE_INVALID_ARG) - error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str); + error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str); else if (res == AARCH64_PARSE_MISSING_ARG) - error ("missing arg for %<-mbranch-protection=%>"); + error ("missing argument for %<-mbranch-protection=%>"); free (str); return res == AARCH64_PARSE_OK; } @@ -11899,7 +13662,7 @@ aarch64_validate_mbranch_protection (const char *const_str) static bool aarch64_validate_march (const char *str, const struct processor **res, - unsigned long *isa_flags) + uint64_t *isa_flags) { std::string invalid_extension; enum aarch64_parse_opt_result parse_res @@ -12014,8 +13777,8 @@ aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value) static void aarch64_override_options (void) { - unsigned long cpu_isa = 0; - unsigned long arch_isa = 0; + uint64_t cpu_isa = 0; + uint64_t arch_isa = 0; aarch64_isa_flags = 0; bool valid_cpu = true; @@ -12255,7 +14018,7 @@ aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr) { const struct processor *cpu = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); - unsigned long isa_flags = ptr->x_aarch64_isa_flags; + uint64_t isa_flags = ptr->x_aarch64_isa_flags; const struct processor 
*arch = aarch64_get_arch (ptr->x_explicit_arch); std::string extension = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags); @@ -12508,7 +14271,7 @@ static bool aarch64_handle_attr_isa_flags (char *str) { enum aarch64_parse_opt_result parse_res; - unsigned long isa_flags = aarch64_isa_flags; + uint64_t isa_flags = aarch64_isa_flags; /* We allow "+nothing" in the beginning to clear out all architectural features if the user wants to handpick specific features. */ @@ -12999,6 +14762,26 @@ aarch64_can_inline_p (tree caller, tree callee) return true; } +/* Return the ID of the TLDESC ABI, initializing the descriptor if hasn't + been already. */ + +unsigned int +aarch64_tlsdesc_abi_id () +{ + predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC]; + if (!tlsdesc_abi.initialized_p ()) + { + HARD_REG_SET full_reg_clobbers; + CLEAR_HARD_REG_SET (full_reg_clobbers); + SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM); + SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM); + for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno) + SET_HARD_REG_BIT (full_reg_clobbers, regno); + tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers); + } + return tlsdesc_abi.id (); +} + /* Return true if SYMBOL_REF X binds locally. */ static bool @@ -13104,26 +14887,31 @@ aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset) the offset does not cause overflow of the final address. But we have no way of knowing the address of symbol at compile time so we can't accurately say if the distance between the PC and - symbol + offset is outside the addressible range of +/-1M in the - TINY code model. So we rely on images not being greater than - 1M and cap the offset at 1M and anything beyond 1M will have to - be loaded using an alternative mechanism. Furthermore if the - symbol is a weak reference to something that isn't known to - resolve to a symbol in this module, then force to memory. */ - if ((SYMBOL_REF_WEAK (x) - && !aarch64_symbol_binds_local_p (x)) - || !IN_RANGE (offset, -1048575, 1048575)) + symbol + offset is outside the addressible range of +/-1MB in the + TINY code model. So we limit the maximum offset to +/-64KB and + assume the offset to the symbol is not larger than +/-(1MB - 64KB). + If offset_within_block_p is true we allow larger offsets. + Furthermore force to memory if the symbol is a weak reference to + something that doesn't resolve to a symbol in this module. */ + + if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x)) + return SYMBOL_FORCE_TO_MEM; + if (!(IN_RANGE (offset, -0x10000, 0x10000) + || offset_within_block_p (x, offset))) return SYMBOL_FORCE_TO_MEM; + return SYMBOL_TINY_ABSOLUTE; case AARCH64_CMODEL_SMALL: /* Same reasoning as the tiny code model, but the offset cap here is - 4G. */ - if ((SYMBOL_REF_WEAK (x) - && !aarch64_symbol_binds_local_p (x)) - || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263), - HOST_WIDE_INT_C (4294967264))) + 1MB, allowing +/-3.9GB for the offset to the symbol. 
*/ + + if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x)) return SYMBOL_FORCE_TO_MEM; + if (!(IN_RANGE (offset, -0x100000, 0x100000) + || offset_within_block_p (x, offset))) + return SYMBOL_FORCE_TO_MEM; + return SYMBOL_SMALL_ABSOLUTE; case AARCH64_CMODEL_TINY_PIC: @@ -13432,7 +15220,7 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, HOST_WIDE_INT size, rsize, adjust, align; tree t, u, cond1, cond2; - indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false); + indirect_p = pass_va_arg_by_reference (type); if (indirect_p) type = build_pointer_type (type); @@ -13626,6 +15414,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, field_t = aarch64_fp16_type_node; field_ptr_t = aarch64_fp16_ptr_type_node; break; + case E_BFmode: + field_t = aarch64_bf16_type_node; + field_ptr_t = aarch64_bf16_ptr_type_node; + break; case E_V2SImode: case E_V4SImode: { @@ -13677,9 +15469,9 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, /* Implement TARGET_SETUP_INCOMING_VARARGS. */ static void -aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, - tree type, int *pretend_size ATTRIBUTE_UNUSED, - int no_rtl) +aarch64_setup_incoming_varargs (cumulative_args_t cum_v, + const function_arg_info &arg, + int *pretend_size ATTRIBUTE_UNUSED, int no_rtl) { CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); CUMULATIVE_ARGS local_cum; @@ -13690,7 +15482,7 @@ aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, argument. Advance a local copy of CUM past the last "real" named argument, to find out how many registers are left over. */ local_cum = *cum; - aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true); + aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg); /* Found out how many registers we need to save. Honor tree-stdvar analysis results. */ @@ -13777,6 +15569,10 @@ aarch64_conditional_register_usage (void) call_used_regs[i] = 1; } + /* Only allow the FFR and FFRT to be accessed via special patterns. */ + CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM); + CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM); + /* When tracking speculation, we need a couple of call-clobbered registers to track the speculation state. It would be nice to just use IP0 and IP1, but currently there are numerous places that just @@ -13802,6 +15598,10 @@ aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep) machine_mode mode; HOST_WIDE_INT size; + /* SVE types (and types containing SVE types) must be handled + before calling this function. 
*/ + gcc_assert (!aarch64_sve::builtin_type_p (type)); + switch (TREE_CODE (type)) { case REAL_TYPE: @@ -13973,6 +15773,9 @@ aarch64_short_vector_p (const_tree type, { poly_int64 size = -1; + if (type && aarch64_sve::builtin_type_p (type)) + return false; + if (type && TREE_CODE (type) == VECTOR_TYPE) size = int_size_in_bytes (type); else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT @@ -14033,11 +15836,14 @@ aarch64_vfp_is_call_or_return_candidate (machine_mode mode, int *count, bool *is_ha) { + if (is_ha != NULL) *is_ha = false; + + if (type && aarch64_sve::builtin_type_p (type)) + return false; + machine_mode new_mode = VOIDmode; bool composite_p = aarch64_composite_type_p (type, mode); - if (is_ha != NULL) *is_ha = false; - if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT) || aarch64_short_vector_p (type, mode)) { @@ -14083,7 +15889,63 @@ static bool aarch64_vector_mode_supported_p (machine_mode mode) { unsigned int vec_flags = aarch64_classify_vector_mode (mode); - return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0; + return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0; +} + +/* Return the full-width SVE vector mode for element mode MODE, if one + exists. */ +opt_machine_mode +aarch64_full_sve_mode (scalar_mode mode) +{ + switch (mode) + { + case E_DFmode: + return VNx2DFmode; + case E_SFmode: + return VNx4SFmode; + case E_HFmode: + return VNx8HFmode; + case E_BFmode: + return VNx8BFmode; + case E_DImode: + return VNx2DImode; + case E_SImode: + return VNx4SImode; + case E_HImode: + return VNx8HImode; + case E_QImode: + return VNx16QImode; + default: + return opt_machine_mode (); + } +} + +/* Return the 128-bit Advanced SIMD vector mode for element mode MODE, + if it exists. */ +opt_machine_mode +aarch64_vq_mode (scalar_mode mode) +{ + switch (mode) + { + case E_DFmode: + return V2DFmode; + case E_SFmode: + return V4SFmode; + case E_HFmode: + return V8HFmode; + case E_BFmode: + return V8BFmode; + case E_SImode: + return V4SImode; + case E_HImode: + return V8HImode; + case E_QImode: + return V16QImode; + case E_DImode: + return V2DImode; + default: + return opt_machine_mode (); + } } /* Return appropriate SIMD container @@ -14092,49 +15954,13 @@ static machine_mode aarch64_simd_container_mode (scalar_mode mode, poly_int64 width) { if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR)) - switch (mode) - { - case E_DFmode: - return VNx2DFmode; - case E_SFmode: - return VNx4SFmode; - case E_HFmode: - return VNx8HFmode; - case E_DImode: - return VNx2DImode; - case E_SImode: - return VNx4SImode; - case E_HImode: - return VNx8HImode; - case E_QImode: - return VNx16QImode; - default: - return word_mode; - } + return aarch64_full_sve_mode (mode).else_mode (word_mode); gcc_assert (known_eq (width, 64) || known_eq (width, 128)); if (TARGET_SIMD) { if (known_eq (width, 128)) - switch (mode) - { - case E_DFmode: - return V2DFmode; - case E_SFmode: - return V4SFmode; - case E_HFmode: - return V8HFmode; - case E_SImode: - return V4SImode; - case E_HImode: - return V8HImode; - case E_QImode: - return V16QImode; - case E_DImode: - return V2DImode; - default: - break; - } + return aarch64_vq_mode (mode).else_mode (word_mode); else switch (mode) { @@ -14142,6 +15968,8 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width) return V2SFmode; case E_HFmode: return V4HFmode; + case E_BFmode: + return V4BFmode; case E_SImode: return V2SImode; case E_HImode: @@ -14205,14 +16033,24 @@ aarch64_mangle_type (const_tree type) if (lang_hooks.types_compatible_p 
(CONST_CAST_TREE (type), va_list_type)) return "St9__va_list"; - /* Half-precision float. */ + /* Half-precision floating point types. */ if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16) - return "Dh"; + { + if (TYPE_MODE (type) == BFmode) + return "u6__bf16"; + else + return "Dh"; + } /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for builtin types. */ if (TYPE_NAME (type) != NULL) - return aarch64_mangle_builtin_type (type); + { + const char *res; + if ((res = aarch64_general_mangle_builtin_type (type)) + || (res = aarch64_sve::mangle_builtin_type (type))) + return res; + } /* Use the default mangling. */ return NULL; @@ -14370,6 +16208,27 @@ aarch64_sve_arith_immediate_p (rtx x, bool negate_p) return IN_RANGE (val, 0, 0xff00); } +/* Return true if X is a valid immediate for the SVE SQADD and SQSUB + instructions. Negate X first if NEGATE_P is true. */ + +bool +aarch64_sve_sqadd_sqsub_immediate_p (rtx x, bool negate_p) +{ + rtx elt; + + if (!const_vec_duplicate_p (x, &elt) + || !CONST_INT_P (elt)) + return false; + + if (!aarch64_sve_arith_immediate_p (x, negate_p)) + return false; + + /* After the optional negation, the immediate must be nonnegative. + E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127 + instead of SQADD Zn.B, Zn.B, #129. */ + return negate_p == (INTVAL (elt) < 0); +} + /* Return true if X is a valid immediate operand for an SVE logical instruction such as AND. */ @@ -14390,13 +16249,11 @@ aarch64_sve_bitmask_immediate_p (rtx x) bool aarch64_sve_dup_immediate_p (rtx x) { - rtx elt; - - if (!const_vec_duplicate_p (x, &elt) - || !CONST_INT_P (elt)) + x = aarch64_bit_representation (unwrap_const_vec_duplicate (x)); + if (!CONST_INT_P (x)) return false; - HOST_WIDE_INT val = INTVAL (elt); + HOST_WIDE_INT val = INTVAL (x); if (val & 0xff) return IN_RANGE (val, -0x80, 0x7f); return IN_RANGE (val, -0x8000, 0x7f00); @@ -14408,13 +16265,11 @@ aarch64_sve_dup_immediate_p (rtx x) bool aarch64_sve_cmp_immediate_p (rtx x, bool signed_p) { - rtx elt; - - return (const_vec_duplicate_p (x, &elt) - && CONST_INT_P (elt) + x = unwrap_const_vec_duplicate (x); + return (CONST_INT_P (x) && (signed_p - ? IN_RANGE (INTVAL (elt), -16, 15) - : IN_RANGE (INTVAL (elt), 0, 127))); + ? IN_RANGE (INTVAL (x), -16, 15) + : IN_RANGE (INTVAL (x), 0, 127))); } /* Return true if X is a valid immediate operand for an SVE FADD or FSUB @@ -14450,11 +16305,10 @@ aarch64_sve_float_mul_immediate_p (rtx x) { rtx elt; - /* GCC will never generate a multiply with an immediate of 2, so there is no - point testing for it (even though it is a valid constant). */ return (const_vec_duplicate_p (x, &elt) && GET_CODE (elt) == CONST_DOUBLE - && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)); + && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf) + || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2))); } /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate @@ -14607,6 +16461,77 @@ aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64, return false; } +/* Return true if X is an UNSPEC_PTRUE constant of the form: + + (const (unspec [PATTERN ZERO] UNSPEC_PTRUE)) + + where PATTERN is the svpattern as a CONST_INT and where ZERO + is a zero constant of the required PTRUE mode (which can have + fewer elements than X's mode, if zero bits are significant). + + If so, and if INFO is nonnull, describe the immediate in INFO. 
*/ +bool +aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info) +{ + if (GET_CODE (x) != CONST) + return false; + + x = XEXP (x, 0); + if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE) + return false; + + if (info) + { + aarch64_svpattern pattern + = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0)); + machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1)); + scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode); + *info = simd_immediate_info (int_mode, pattern); + } + return true; +} + +/* Return true if X is a valid SVE predicate. If INFO is nonnull, use + it to describe valid immediates. */ + +static bool +aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info) +{ + if (aarch64_sve_ptrue_svpattern_p (x, info)) + return true; + + if (x == CONST0_RTX (GET_MODE (x))) + { + if (info) + *info = simd_immediate_info (DImode, 0); + return true; + } + + /* Analyze the value as a VNx16BImode. This should be relatively + efficient, since rtx_vector_builder has enough built-in capacity + to store all VLA predicate constants without needing the heap. */ + rtx_vector_builder builder; + if (!aarch64_get_sve_pred_bits (builder, x)) + return false; + + unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder); + if (int vl = aarch64_partial_ptrue_length (builder, elt_size)) + { + machine_mode mode = aarch64_sve_pred_mode (elt_size).require (); + aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl); + if (pattern != AARCH64_NUM_SVPATTERNS) + { + if (info) + { + scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode); + *info = simd_immediate_info (int_mode, pattern); + } + return true; + } + } + return false; +} + /* Return true if OP is a valid SIMD immediate for the operation described by WHICH. If INFO is nonnull, use it to describe valid immediates. */ @@ -14619,6 +16544,9 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT)) return false; + if (vec_flags & VEC_SVE_PRED) + return aarch64_sve_pred_valid_immediate (op, info); + scalar_mode elt_mode = GET_MODE_INNER (mode); rtx base, step; unsigned int n_elts; @@ -14643,11 +16571,6 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, else return false; - /* Handle PFALSE and PTRUE. */ - if (vec_flags & VEC_SVE_PRED) - return (op == CONST0_RTX (mode) - || op == CONSTM1_RTX (mode)); - scalar_float_mode elt_float_mode; if (n_elts == 1 && is_a (elt_mode, &elt_float_mode)) @@ -14731,11 +16654,14 @@ aarch64_check_zero_based_sve_index_immediate (rtx x) bool aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left) { + x = unwrap_const_vec_duplicate (x); + if (!CONST_INT_P (x)) + return false; int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; if (left) - return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1); + return IN_RANGE (INTVAL (x), 0, bit_width - 1); else - return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width); + return IN_RANGE (INTVAL (x), 1, bit_width); } /* Return the bitmask CONST_INT to select the bits required by a zero extract @@ -14763,7 +16689,17 @@ aarch64_mov_operand_p (rtx x, machine_mode mode) return true; if (VECTOR_MODE_P (GET_MODE (x))) - return aarch64_simd_valid_immediate (x, NULL); + { + /* Require predicate constants to be VNx16BI before RA, so that we + force everything to have a canonical form. 
*/ + if (!lra_in_progress + && !reload_completed + && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL + && GET_MODE (x) != VNx16BImode) + return false; + + return aarch64_simd_valid_immediate (x, NULL); + } if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x)) return true; @@ -14953,6 +16889,72 @@ aarch64_sve_ld1r_operand_p (rtx op) && offset_6bit_unsigned_scaled_p (mode, addr.const_offset)); } +/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction + where the size of the read data is specified by `mode` and the size of the + vector elements are specified by `elem_mode`. */ +bool +aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode, + scalar_mode elem_mode) +{ + struct aarch64_address_info addr; + if (!MEM_P (op) + || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false)) + return false; + + if (addr.type == ADDRESS_REG_IMM) + return offset_4bit_signed_scaled_p (mode, addr.const_offset); + + if (addr.type == ADDRESS_REG_REG) + return (1U << addr.shift) == GET_MODE_SIZE (elem_mode); + + return false; +} + +/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */ +bool +aarch64_sve_ld1rq_operand_p (rtx op) +{ + return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode, + GET_MODE_INNER (GET_MODE (op))); +} + +/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for + accessing a vector where the element size is specified by `elem_mode`. */ +bool +aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode) +{ + return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode); +} + +/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */ +bool +aarch64_sve_ldff1_operand_p (rtx op) +{ + if (!MEM_P (op)) + return false; + + struct aarch64_address_info addr; + if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false)) + return false; + + if (addr.type == ADDRESS_REG_IMM) + return known_eq (addr.const_offset, 0); + + return addr.type == ADDRESS_REG_REG; +} + +/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */ +bool +aarch64_sve_ldnf1_operand_p (rtx op) +{ + struct aarch64_address_info addr; + + return (MEM_P (op) + && aarch64_classify_address (&addr, XEXP (op, 0), + GET_MODE (op), false) + && addr.type == ADDRESS_REG_IMM); +} + /* Return true if OP is a valid MEM operand for an SVE LDR instruction. The conditions for STR are the same. */ bool @@ -14966,6 +16968,21 @@ aarch64_sve_ldr_operand_p (rtx op) && addr.type == ADDRESS_REG_IMM); } +/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction, + addressing memory of mode MODE. */ +bool +aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode) +{ + struct aarch64_address_info addr; + if (!aarch64_classify_address (&addr, op, mode, false)) + return false; + + if (addr.type == ADDRESS_REG_IMM) + return known_eq (addr.const_offset, 0); + + return addr.type == ADDRESS_REG_REG; +} + /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode. We need to be able to access the individual pieces, so the range is different from LD[234] and ST[234]. */ @@ -15027,11 +17044,13 @@ aarch64_simd_attr_length_rglist (machine_mode mode) static HOST_WIDE_INT aarch64_simd_vector_alignment (const_tree type) { + /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can + be set for non-predicate vectors of booleans. Modes are the most + direct way we have of identifying real SVE predicate types. 
*/ + if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL) + return 16; if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) - /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can - be set for non-predicate vectors of booleans. Modes are the most - direct way we have of identifying real SVE predicate types. */ - return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128; + return 128; return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi (); } @@ -15361,34 +17380,383 @@ aarch64_expand_vector_init (rtx target, rtx vals) (see aarch64_simd_valid_immediate). */ for (int i = 0; i < n_elts; i++) { - rtx x = XVECEXP (vals, 0, i); - if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) - continue; - rtx subst = any_const; - for (int bit = n_elts / 2; bit > 0; bit /= 2) - { - /* Look in the copied vector, as more elements are const. */ - rtx test = XVECEXP (copy, 0, i ^ bit); - if (CONST_INT_P (test) || CONST_DOUBLE_P (test)) - { - subst = test; - break; - } - } - XVECEXP (copy, 0, i) = subst; + rtx x = XVECEXP (vals, 0, i); + if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) + continue; + rtx subst = any_const; + for (int bit = n_elts / 2; bit > 0; bit /= 2) + { + /* Look in the copied vector, as more elements are const. */ + rtx test = XVECEXP (copy, 0, i ^ bit); + if (CONST_INT_P (test) || CONST_DOUBLE_P (test)) + { + subst = test; + break; + } + } + XVECEXP (copy, 0, i) = subst; + } + aarch64_expand_vector_init (target, copy); + } + + /* Insert the variable lanes directly. */ + for (int i = 0; i < n_elts; i++) + { + rtx x = XVECEXP (vals, 0, i); + if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) + continue; + x = copy_to_mode_reg (inner_mode, x); + emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); + } +} + +/* Emit RTL corresponding to: + insr TARGET, ELEM. */ + +static void +emit_insr (rtx target, rtx elem) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + elem = force_reg (elem_mode, elem); + + insn_code icode = optab_handler (vec_shl_insert_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + emit_insn (GEN_FCN (icode) (target, target, elem)); +} + +/* Subroutine of aarch64_sve_expand_vector_init for handling + trailing constants. + This function works as follows: + (a) Create a new vector consisting of trailing constants. + (b) Initialize TARGET with the constant vector using emit_move_insn. + (c) Insert remaining elements in TARGET using insr. + NELTS is the total number of elements in original vector while + while NELTS_REQD is the number of elements that are actually + significant. + + ??? The heuristic used is to do above only if number of constants + is at least half the total number of elements. May need fine tuning. 
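Editorial aside on the heuristic described above: a trailing run of constants is only materialised as a constant vector (with the remaining elements inserted via INSR) when it covers at least half of the requested elements. The minimal C sketch below illustrates that decision only, not the RTL construction; use_trailing_constant_strategy is a name invented here.

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the "trailing constants" heuristic: IS_CONST[i] says whether
   element i of the initialiser is a usable constant.  Returns true when the
   trailing constant run covers at least half of NELTS_REQD elements, which
   is when building a constant vector plus INSRs is assumed to pay off.  */
static bool
use_trailing_constant_strategy (const bool *is_const, int nelts_reqd)
{
  int n_trailing = 0;
  for (int i = nelts_reqd - 1; i >= 0 && is_const[i]; i--)
    n_trailing++;
  return n_trailing >= nelts_reqd / 2;
}

int
main (void)
{
  /* Last five of eight elements are constants, so the strategy applies.  */
  bool elems[8] = { false, false, false, true, true, true, true, true };
  printf ("%d\n", use_trailing_constant_strategy (elems, 8)); /* prints 1 */
  return 0;
}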
*/ + +static bool +aarch64_sve_expand_vector_init_handle_trailing_constants + (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + int n_trailing_constants = 0; + + for (int i = nelts_reqd - 1; + i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i)); + i--) + n_trailing_constants++; + + if (n_trailing_constants >= nelts_reqd / 2) + { + rtx_vector_builder v (mode, 1, nelts); + for (int i = 0; i < nelts; i++) + v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants)); + rtx const_vec = v.build (); + emit_move_insn (target, const_vec); + + for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--) + emit_insr (target, builder.elt (i)); + + return true; + } + + return false; +} + +/* Subroutine of aarch64_sve_expand_vector_init. + Works as follows: + (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. + (b) Skip trailing elements from BUILDER, which are the same as + element NELTS_REQD - 1. + (c) Insert earlier elements in reverse order in TARGET using insr. */ + +static void +aarch64_sve_expand_vector_init_insert_elems (rtx target, + const rtx_vector_builder &builder, + int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + scalar_mode elem_mode = GET_MODE_INNER (mode); + + struct expand_operand ops[2]; + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); + gcc_assert (icode != CODE_FOR_nothing); + + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode); + expand_insn (icode, 2, ops); + + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + for (int i = nelts_reqd - ndups - 1; i >= 0; i--) + emit_insr (target, builder.elt (i)); +} + +/* Subroutine of aarch64_sve_expand_vector_init to handle case + when all trailing elements of builder are same. + This works as follows: + (a) Use expand_insn interface to broadcast last vector element in TARGET. + (b) Insert remaining elements in TARGET using insr. + + ??? The heuristic used is to do above if number of same trailing elements + is at least 3/4 of total number of elements, loosely based on + heuristic from mostly_zeros_p. May need fine-tuning. */ + +static bool +aarch64_sve_expand_vector_init_handle_trailing_same_elem + (rtx target, const rtx_vector_builder &builder, int nelts_reqd) +{ + int ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + if (ndups >= (3 * nelts_reqd) / 4) + { + aarch64_sve_expand_vector_init_insert_elems (target, builder, + nelts_reqd - ndups + 1); + return true; + } + + return false; +} + +/* Initialize register TARGET from BUILDER. NELTS is the constant number + of elements in BUILDER. + + The function tries to initialize TARGET from BUILDER if it fits one + of the special cases outlined below. + + Failing that, the function divides BUILDER into two sub-vectors: + v_even = even elements of BUILDER; + v_odd = odd elements of BUILDER; + + and recursively calls itself with v_even and v_odd. + + if (recursive call succeeded for v_even or v_odd) + TARGET = zip (v_even, v_odd) + + The function returns true if it managed to build TARGET from BUILDER + with one of the special cases, false otherwise. + + Example: {a, 1, b, 2, c, 3, d, 4} + + The vector gets divided into: + v_even = {a, b, c, d} + v_odd = {1, 2, 3, 4} + + aarch64_sve_expand_vector_init(v_odd) hits case 1 and + initialize tmp2 from constant vector v_odd using emit_move_insn. 
+ + aarch64_sve_expand_vector_init(v_even) fails since v_even contains + 4 elements, so we construct tmp1 from v_even using insr: + tmp1 = dup(d) + insr tmp1, c + insr tmp1, b + insr tmp1, a + + And finally: + TARGET = zip (tmp1, tmp2) + which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */ + +static bool +aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder, + int nelts, int nelts_reqd) +{ + machine_mode mode = GET_MODE (target); + + /* Case 1: Vector contains trailing constants. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_constants + (target, builder, nelts, nelts_reqd)) + return true; + + /* Case 2: Vector contains leading constants. */ + + rtx_vector_builder rev_builder (mode, 1, nelts_reqd); + for (int i = 0; i < nelts_reqd; i++) + rev_builder.quick_push (builder.elt (nelts_reqd - i - 1)); + rev_builder.finalize (); + + if (aarch64_sve_expand_vector_init_handle_trailing_constants + (target, rev_builder, nelts, nelts_reqd)) + { + emit_insn (gen_aarch64_sve_rev (mode, target, target)); + return true; + } + + /* Case 3: Vector contains trailing same element. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_same_elem + (target, builder, nelts_reqd)) + return true; + + /* Case 4: Vector contains leading same element. */ + + if (aarch64_sve_expand_vector_init_handle_trailing_same_elem + (target, rev_builder, nelts_reqd) && nelts_reqd == nelts) + { + emit_insn (gen_aarch64_sve_rev (mode, target, target)); + return true; + } + + /* Avoid recursing below 4-elements. + ??? The threshold 4 may need fine-tuning. */ + + if (nelts_reqd <= 4) + return false; + + rtx_vector_builder v_even (mode, 1, nelts); + rtx_vector_builder v_odd (mode, 1, nelts); + + for (int i = 0; i < nelts * 2; i += 2) + { + v_even.quick_push (builder.elt (i)); + v_odd.quick_push (builder.elt (i + 1)); + } + + v_even.finalize (); + v_odd.finalize (); + + rtx tmp1 = gen_reg_rtx (mode); + bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even, + nelts, nelts_reqd / 2); + + rtx tmp2 = gen_reg_rtx (mode); + bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd, + nelts, nelts_reqd / 2); + + if (!did_even_p && !did_odd_p) + return false; + + /* Initialize v_even and v_odd using INSR if it didn't match any of the + special cases and zip v_even, v_odd. */ + + if (!did_even_p) + aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2); + + if (!did_odd_p) + aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2); + + rtvec v = gen_rtvec (2, tmp1, tmp2); + emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); + return true; +} + +/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ + +void +aarch64_sve_expand_vector_init (rtx target, rtx vals) +{ + machine_mode mode = GET_MODE (target); + int nelts = XVECLEN (vals, 0); + + rtx_vector_builder v (mode, 1, nelts); + for (int i = 0; i < nelts; i++) + v.quick_push (XVECEXP (vals, 0, i)); + v.finalize (); + + /* If neither sub-vectors of v could be initialized specially, + then use INSR to insert all elements from v into TARGET. + ??? This might not be optimal for vectors with large + initializers like 16-element or above. + For nelts < 4, it probably isn't useful to handle specially. */ + + if (nelts < 4 + || !aarch64_sve_expand_vector_init (target, v, nelts, nelts)) + aarch64_sve_expand_vector_init_insert_elems (target, v, nelts); +} + +/* Check whether VALUE is a vector constant in which every element + is either a power of 2 or a negated power of 2. 
If so, return + a constant vector of log2s, and flip CODE between PLUS and MINUS + if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */ + +static rtx +aarch64_convert_mult_to_shift (rtx value, rtx_code &code) +{ + if (GET_CODE (value) != CONST_VECTOR) + return NULL_RTX; + + rtx_vector_builder builder; + if (!builder.new_unary_operation (GET_MODE (value), value, false)) + return NULL_RTX; + + scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value)); + /* 1 if the result of the multiplication must be negated, + 0 if it mustn't, or -1 if we don't yet care. */ + int negate = -1; + unsigned int encoded_nelts = const_vector_encoded_nelts (value); + for (unsigned int i = 0; i < encoded_nelts; ++i) + { + rtx elt = CONST_VECTOR_ENCODED_ELT (value, i); + if (!CONST_SCALAR_INT_P (elt)) + return NULL_RTX; + rtx_mode_t val (elt, int_mode); + wide_int pow2 = wi::neg (val); + if (val != pow2) + { + /* It matters whether we negate or not. Make that choice, + and make sure that it's consistent with previous elements. */ + if (negate == !wi::neg_p (val)) + return NULL_RTX; + negate = wi::neg_p (val); + if (!negate) + pow2 = val; } - aarch64_expand_vector_init (target, copy); + /* POW2 is now the value that we want to be a power of 2. */ + int shift = wi::exact_log2 (pow2); + if (shift < 0) + return NULL_RTX; + builder.quick_push (gen_int_mode (shift, int_mode)); + } + if (negate == -1) + /* PLUS and MINUS are equivalent; canonicalize on PLUS. */ + code = PLUS; + else if (negate == 1) + code = code == PLUS ? MINUS : PLUS; + return builder.build (); +} + +/* Prepare for an integer SVE multiply-add or multiply-subtract pattern; + CODE is PLUS for the former and MINUS for the latter. OPERANDS is the + operands array, in the same order as for fma_optab. Return true if + the function emitted all the necessary instructions, false if the caller + should generate the pattern normally with the new OPERANDS array. */ + +bool +aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code) +{ + machine_mode mode = GET_MODE (operands[0]); + if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code)) + { + rtx product = expand_binop (mode, vashl_optab, operands[1], shifts, + NULL_RTX, true, OPTAB_DIRECT); + force_expand_binop (mode, code == PLUS ? add_optab : sub_optab, + operands[3], product, operands[0], true, + OPTAB_DIRECT); + return true; } + operands[2] = force_reg (mode, operands[2]); + return false; +} - /* Insert the variable lanes directly. */ - for (int i = 0; i < n_elts; i++) +/* Likewise, but for a conditional pattern. 
*/ + +bool +aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code) +{ + machine_mode mode = GET_MODE (operands[0]); + if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code)) { - rtx x = XVECEXP (vals, 0, i); - if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) - continue; - x = copy_to_mode_reg (inner_mode, x); - emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); + rtx product = expand_binop (mode, vashl_optab, operands[2], shifts, + NULL_RTX, true, OPTAB_DIRECT); + emit_insn (gen_cond (code, mode, operands[0], operands[1], + operands[4], product, operands[5])); + return true; } + operands[3] = force_reg (mode, operands[3]); + return false; } static unsigned HOST_WIDE_INT @@ -15428,11 +17796,15 @@ aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global) static void aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name) { - if (aarch64_simd_decl_p (decl)) + if (TREE_CODE (decl) == FUNCTION_DECL) { - fprintf (stream, "\t.variant_pcs\t"); - assemble_name (stream, name); - fprintf (stream, "\n"); + arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id (); + if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE) + { + fprintf (stream, "\t.variant_pcs\t"); + assemble_name (stream, name); + fprintf (stream, "\n"); + } } } @@ -15459,7 +17831,7 @@ aarch64_declare_function_name (FILE *stream, const char* name, const struct processor *this_arch = aarch64_get_arch (targ_options->x_explicit_arch); - unsigned long isa_flags = targ_options->x_aarch64_isa_flags; + uint64_t isa_flags = targ_options->x_aarch64_isa_flags; std::string extension = aarch64_get_extension_string_for_isa_flags (isa_flags, this_arch->flags); @@ -15541,6 +17913,18 @@ aarch64_asm_output_external (FILE *stream, tree decl, const char* name) aarch64_asm_output_variant_pcs (stream, decl, name); } +/* Triggered after a .cfi_startproc directive is emitted into the assembly file. + Used to output the .cfi_b_key_frame directive when signing the current + function with the B key. */ + +void +aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED) +{ + if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled () + && aarch64_ra_sign_key == AARCH64_KEY_B) + asm_fprintf (f, "\t.cfi_b_key_frame\n"); +} + /* Implements TARGET_ASM_FILE_START. Output the assembly header. */ static void @@ -15551,7 +17935,7 @@ aarch64_start_file (void) const struct processor *default_arch = aarch64_get_arch (default_options->x_explicit_arch); - unsigned long default_isa_flags = default_options->x_aarch64_isa_flags; + uint64_t default_isa_flags = default_options->x_aarch64_isa_flags; std::string extension = aarch64_get_extension_string_for_isa_flags (default_isa_flags, default_arch->flags); @@ -15570,16 +17954,26 @@ static void aarch64_emit_load_exclusive (machine_mode mode, rtx rval, rtx mem, rtx model_rtx) { - emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx)); + if (mode == TImode) + emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval), + gen_highpart (DImode, rval), + mem, model_rtx)); + else + emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx)); } /* Emit store exclusive. 
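For illustration only (not part of the patch): the TImode case above models a 128-bit exclusive access as a pair of 64-bit halves (LDXP/STXP). A plain C sketch of the low/high split, using the GCC __int128 extension and ignoring the endianness details that gen_lowpart/gen_highpart and operand_subword handle in the real code:

    #include <stdio.h>
    #include <stdint.h>

    int
    main (void)
    {
      unsigned __int128 v = ((unsigned __int128) 0x0123456789abcdefULL << 64)
                            | 0xfedcba9876543210ULL;
      uint64_t lo = (uint64_t) v;           /* low 64 bits  */
      uint64_t hi = (uint64_t) (v >> 64);   /* high 64 bits */
      unsigned __int128 back = ((unsigned __int128) hi << 64) | lo;
      printf ("%d\n", v == back);           /* prints 1 */
      return 0;
    }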
*/ static void aarch64_emit_store_exclusive (machine_mode mode, rtx bval, - rtx rval, rtx mem, rtx model_rtx) + rtx mem, rtx rval, rtx model_rtx) { - emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx)); + if (mode == TImode) + emit_insn (gen_aarch64_store_exclusive_pair + (bval, mem, operand_subword (rval, 0, 0, TImode), + operand_subword (rval, 1, 0, TImode), model_rtx)); + else + emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx)); } /* Mark the previous jump instruction as unlikely. */ @@ -15591,6 +17985,82 @@ aarch64_emit_unlikely_jump (rtx insn) add_reg_br_prob_note (jump, profile_probability::very_unlikely ()); } +/* We store the names of the various atomic helpers in a 5x4 array. + Return the libcall function given MODE, MODEL and NAMES. */ + +rtx +aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx, + const atomic_ool_names *names) +{ + memmodel model = memmodel_base (INTVAL (model_rtx)); + int mode_idx, model_idx; + + switch (mode) + { + case E_QImode: + mode_idx = 0; + break; + case E_HImode: + mode_idx = 1; + break; + case E_SImode: + mode_idx = 2; + break; + case E_DImode: + mode_idx = 3; + break; + case E_TImode: + mode_idx = 4; + break; + default: + gcc_unreachable (); + } + + switch (model) + { + case MEMMODEL_RELAXED: + model_idx = 0; + break; + case MEMMODEL_CONSUME: + case MEMMODEL_ACQUIRE: + model_idx = 1; + break; + case MEMMODEL_RELEASE: + model_idx = 2; + break; + case MEMMODEL_ACQ_REL: + case MEMMODEL_SEQ_CST: + model_idx = 3; + break; + default: + gcc_unreachable (); + } + + return init_one_libfunc_visibility (names->str[mode_idx][model_idx], + VISIBILITY_HIDDEN); +} + +#define DEF0(B, N) \ + { "__aarch64_" #B #N "_relax", \ + "__aarch64_" #B #N "_acq", \ + "__aarch64_" #B #N "_rel", \ + "__aarch64_" #B #N "_acq_rel" } + +#define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \ + { NULL, NULL, NULL, NULL } +#define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16) + +static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } }; +const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } }; +const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } }; +const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } }; +const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } }; +const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } }; + +#undef DEF0 +#undef DEF4 +#undef DEF5 + /* Expand a compare and swap pattern. */ void @@ -15637,6 +18107,17 @@ aarch64_expand_compare_and_swap (rtx operands[]) newval, mod_s)); cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); } + else if (TARGET_OUTLINE_ATOMICS) + { + /* Oldval must satisfy compare afterward. */ + if (!aarch64_plus_operand (oldval, mode)) + oldval = force_reg (mode, oldval); + rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names); + rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode, + oldval, mode, newval, mode, + XEXP (mem, 0), Pmode); + cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); + } else { /* The oldval predicate varies by mode. Test it and force to reg. */ @@ -15682,13 +18163,11 @@ aarch64_split_compare_and_swap (rtx operands[]) /* Split after prolog/epilog to avoid interactions with shrinkwrapping. 
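For illustration only (not part of the patch): the table above names the out-of-line atomic helpers as __aarch64_<op><size>_<model>, where acquire and consume share the _acq entry and seq_cst shares _acq_rel, per the switch in aarch64_atomic_ool_func. A small C sketch reproducing the naming scheme:

    #include <stdio.h>

    static const char *suffix[4] = { "_relax", "_acq", "_rel", "_acq_rel" };
    static const int sizes[5] = { 1, 2, 4, 8, 16 };

    int
    main (void)
    {
      char name[64];
      /* E.g. a 4-byte compare-and-swap with acquire semantics.  */
      snprintf (name, sizeof name, "__aarch64_%s%d%s", "cas", sizes[2], suffix[1]);
      printf ("%s\n", name);   /* prints __aarch64_cas4_acq */
      return 0;
    }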
*/ gcc_assert (epilogue_completed); - rtx rval, mem, oldval, newval, scratch; + rtx rval, mem, oldval, newval, scratch, x, model_rtx; machine_mode mode; bool is_weak; rtx_code_label *label1, *label2; - rtx x, cond; enum memmodel model; - rtx model_rtx; rval = operands[0]; mem = operands[1]; @@ -15709,7 +18188,8 @@ aarch64_split_compare_and_swap (rtx operands[]) CBNZ scratch, .label1 .label2: CMP rval, 0. */ - bool strong_zero_p = !is_weak && oldval == const0_rtx; + bool strong_zero_p = (!is_weak && !aarch64_track_speculation && + oldval == const0_rtx && mode != TImode); label1 = NULL; if (!is_weak) @@ -15722,35 +18202,20 @@ aarch64_split_compare_and_swap (rtx operands[]) /* The initial load can be relaxed for a __sync operation since a final barrier will be emitted to stop code hoisting. */ if (is_mm_sync (model)) - aarch64_emit_load_exclusive (mode, rval, mem, - GEN_INT (MEMMODEL_RELAXED)); + aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED)); else aarch64_emit_load_exclusive (mode, rval, mem, model_rtx); if (strong_zero_p) - { - if (aarch64_track_speculation) - { - /* Emit an explicit compare instruction, so that we can correctly - track the condition codes. */ - rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx); - x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx); - } - else - x = gen_rtx_NE (VOIDmode, rval, const0_rtx); - - x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, - gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); - aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); - } + x = gen_rtx_NE (VOIDmode, rval, const0_rtx); else { - cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); - x = gen_rtx_NE (VOIDmode, cond, const0_rtx); - x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, - gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); - aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); + rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); + x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx); } + x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, + gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); + aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx); @@ -15771,22 +18236,16 @@ aarch64_split_compare_and_swap (rtx operands[]) aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); } else - { - cond = gen_rtx_REG (CCmode, CC_REGNUM); - x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx); - emit_insn (gen_rtx_SET (cond, x)); - } + aarch64_gen_compare_reg (NE, scratch, const0_rtx); emit_label (label2); + /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL to set the condition flags. If this is not used it will be removed by later passes. */ if (strong_zero_p) - { - cond = gen_rtx_REG (CCmode, CC_REGNUM); - x = gen_rtx_COMPARE (CCmode, rval, const0_rtx); - emit_insn (gen_rtx_SET (cond, x)); - } + aarch64_gen_compare_reg (NE, rval, const0_rtx); + /* Emit any final barrier needed for a __sync operation. 
*/ if (is_mm_sync (model)) aarch64_emit_post_barrier (model); @@ -15939,6 +18398,7 @@ aarch64_float_const_representable_p (rtx x) REAL_VALUE_TYPE r, m; bool fail; + x = unwrap_const_vec_duplicate (x); if (!CONST_DOUBLE_P (x)) return false; @@ -16034,17 +18494,18 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT) { - gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV); + gcc_assert (info.insn == simd_immediate_info::MOV + && info.u.mov.shift == 0); /* For FP zero change it to a CONST_INT 0 and use the integer SIMD move immediate path. */ - if (aarch64_float_const_zero_rtx_p (info.value)) - info.value = GEN_INT (0); + if (aarch64_float_const_zero_rtx_p (info.u.mov.value)) + info.u.mov.value = GEN_INT (0); else { const unsigned int buf_size = 20; char float_buf[buf_size] = {'\0'}; real_to_decimal_for_mode (float_buf, - CONST_DOUBLE_REAL_VALUE (info.value), + CONST_DOUBLE_REAL_VALUE (info.u.mov.value), buf_size, buf_size, 1, info.elt_mode); if (lane_count == 1) @@ -16056,36 +18517,39 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, } } - gcc_assert (CONST_INT_P (info.value)); + gcc_assert (CONST_INT_P (info.u.mov.value)); if (which == AARCH64_CHECK_MOV) { mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi"; - shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl"; + shift_op = (info.u.mov.modifier == simd_immediate_info::MSL + ? "msl" : "lsl"); if (lane_count == 1) snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX, - mnemonic, UINTVAL (info.value)); - else if (info.shift) + mnemonic, UINTVAL (info.u.mov.value)); + else if (info.u.mov.shift) snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count, - element_char, UINTVAL (info.value), shift_op, info.shift); + element_char, UINTVAL (info.u.mov.value), shift_op, + info.u.mov.shift); else snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count, - element_char, UINTVAL (info.value)); + element_char, UINTVAL (info.u.mov.value)); } else { /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */ mnemonic = info.insn == simd_immediate_info::MVN ? 
"bic" : "orr"; - if (info.shift) + if (info.u.mov.shift) snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count, - element_char, UINTVAL (info.value), "lsl", info.shift); + element_char, UINTVAL (info.u.mov.value), "lsl", + info.u.mov.shift); else snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count, - element_char, UINTVAL (info.value)); + element_char, UINTVAL (info.u.mov.value)); } return templ; } @@ -16129,24 +18593,49 @@ aarch64_output_sve_mov_immediate (rtx const_vector) element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); - if (info.step) + machine_mode vec_mode = GET_MODE (const_vector); + if (aarch64_sve_pred_mode_p (vec_mode)) + { + static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")]; + if (info.insn == simd_immediate_info::MOV) + { + gcc_assert (info.u.mov.value == const0_rtx); + snprintf (buf, sizeof (buf), "pfalse\t%%0.b"); + } + else + { + gcc_assert (info.insn == simd_immediate_info::PTRUE); + unsigned int total_bytes; + if (info.u.pattern == AARCH64_SV_ALL + && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes)) + snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char, + total_bytes / GET_MODE_SIZE (info.elt_mode)); + else + snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char, + svpattern_token (info.u.pattern)); + } + return buf; + } + + if (info.insn == simd_immediate_info::INDEX) { snprintf (templ, sizeof (templ), "index\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC, - element_char, INTVAL (info.value), INTVAL (info.step)); + element_char, INTVAL (info.u.index.base), + INTVAL (info.u.index.step)); return templ; } if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT) { - if (aarch64_float_const_zero_rtx_p (info.value)) - info.value = GEN_INT (0); + if (aarch64_float_const_zero_rtx_p (info.u.mov.value)) + info.u.mov.value = GEN_INT (0); else { const int buf_size = 20; char float_buf[buf_size] = {}; real_to_decimal_for_mode (float_buf, - CONST_DOUBLE_REAL_VALUE (info.value), + CONST_DOUBLE_REAL_VALUE (info.u.mov.value), buf_size, buf_size, 1, info.elt_mode); snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s", @@ -16156,23 +18645,27 @@ aarch64_output_sve_mov_immediate (rtx const_vector) } snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC, - element_char, INTVAL (info.value)); + element_char, INTVAL (info.u.mov.value)); return templ; } -/* Return the asm format for a PTRUE instruction whose destination has - mode MODE. SUFFIX is the element size suffix. */ +/* Return the asm template for a PTRUES. CONST_UNSPEC is the + aarch64_sve_ptrue_svpattern_immediate that describes the predicate + pattern. 
*/ char * -aarch64_output_ptrue (machine_mode mode, char suffix) +aarch64_output_sve_ptrues (rtx const_unspec) { - unsigned int nunits; - static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")]; - if (GET_MODE_NUNITS (mode).is_constant (&nunits)) - snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits); - else - snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix); - return buf; + static char templ[40]; + + struct simd_immediate_info info; + bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info); + gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE); + + char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); + snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char, + svpattern_token (info.u.pattern)); + return templ; } /* Split operands into moves from op[1] + op[2] into op[0]. */ @@ -16590,13 +19083,31 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d) if (d->testing_p) return true; - rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec); if (d->vec_flags == VEC_SVE_DATA) { - rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); - src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src), - UNSPEC_MERGE_PTRUE); + machine_mode int_mode = aarch64_sve_int_mode (pred_mode); + rtx target = gen_reg_rtx (int_mode); + if (BYTES_BIG_ENDIAN) + /* The act of taking a subreg between INT_MODE and d->vmode + is itself a reversing operation on big-endian targets; + see the comment at the head of aarch64-sve.md for details. + First reinterpret OP0 as INT_MODE without using a subreg + and without changing the contents. */ + emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0)); + else + { + /* For SVE we use REV[BHW] unspecs derived from the element size + of v->mode and vector modes whose elements have SIZE bytes. + This ensures that the vector modes match the predicate modes. */ + int unspec = aarch64_sve_rev_unspec (d->vmode); + rtx pred = aarch64_ptrue_reg (pred_mode); + emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred, + gen_lowpart (int_mode, d->op0))); + } + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + return true; } + rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec); emit_set_insn (d->target, src); return true; } @@ -16609,7 +19120,7 @@ aarch64_evpc_rev_global (struct expand_vec_perm_d *d) { poly_uint64 nelt = d->perm.length (); - if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA) + if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD) return false; if (!d->perm.series_p (0, 1, nelt - 1, -1)) @@ -16722,6 +19233,50 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) return true; } +/* Try to implement D using SVE SEL instruction. 
*/ + +static bool +aarch64_evpc_sel (struct expand_vec_perm_d *d) +{ + machine_mode vmode = d->vmode; + int unit_size = GET_MODE_UNIT_SIZE (vmode); + + if (d->vec_flags != VEC_SVE_DATA + || unit_size > 8) + return false; + + int n_patterns = d->perm.encoding ().npatterns (); + poly_int64 vec_len = d->perm.length (); + + for (int i = 0; i < n_patterns; ++i) + if (!known_eq (d->perm[i], i) + && !known_eq (d->perm[i], vec_len + i)) + return false; + + for (int i = n_patterns; i < n_patterns * 2; i++) + if (!d->perm.series_p (i, n_patterns, i, n_patterns) + && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns)) + return false; + + if (d->testing_p) + return true; + + machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require (); + + rtx_vector_builder builder (pred_mode, n_patterns, 2); + for (int i = 0; i < n_patterns * 2; i++) + { + rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode) + : CONST0_RTX (BImode); + builder.quick_push (elem); + } + + rtx const_vec = builder.build (); + rtx pred = force_reg (pred_mode, const_vec); + emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred)); + return true; +} + static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) { @@ -16754,6 +19309,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; else if (aarch64_evpc_trn (d)) return true; + else if (aarch64_evpc_sel (d)) + return true; if (d->vec_flags == VEC_SVE_DATA) return aarch64_evpc_sve_tbl (d); else if (d->vec_flags == VEC_ADVSIMD) @@ -16829,60 +19386,19 @@ aarch64_reverse_mask (machine_mode mode, unsigned int nunits) return force_reg (V16QImode, mask); } -/* Return true if X is a valid second operand for the SVE instruction - that implements integer comparison OP_CODE. */ - -static bool -aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x) -{ - if (register_operand (x, VOIDmode)) - return true; - - switch (op_code) - { - case LTU: - case LEU: - case GEU: - case GTU: - return aarch64_sve_cmp_immediate_p (x, false); - case LT: - case LE: - case GE: - case GT: - case NE: - case EQ: - return aarch64_sve_cmp_immediate_p (x, true); - default: - gcc_unreachable (); - } -} - -/* Use predicated SVE instructions to implement the equivalent of: - - (set TARGET OP) - - given that PTRUE is an all-true predicate of the appropriate mode. */ - -static void -aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op) -{ - rtx unspec = gen_rtx_UNSPEC (GET_MODE (target), - gen_rtvec (2, ptrue, op), - UNSPEC_MERGE_PTRUE); - rtx_insn *insn = emit_set_insn (target, unspec); - set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op)); -} +/* Expand an SVE integer comparison using the SVE equivalent of: -/* Likewise, but also clobber the condition codes. */ + (set TARGET (CODE OP0 OP1)). */ -static void -aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op) +void +aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1) { - rtx unspec = gen_rtx_UNSPEC (GET_MODE (target), - gen_rtvec (2, ptrue, op), - UNSPEC_MERGE_PTRUE); - rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec)); - set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op)); + machine_mode pred_mode = GET_MODE (target); + machine_mode data_mode = GET_MODE (op0); + rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode, + op0, op1); + if (!rtx_equal_p (target, res)) + emit_move_insn (target, res); } /* Return the UNSPEC_COND_* code for comparison CODE. 
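For illustration only (not part of the patch): aarch64_evpc_sel above accepts a permutation only when every output lane i takes lane i of one of the two inputs, i.e. each index is either i or len + i, so the permutation can be done with a predicated SEL. A simplified C sketch of that test on a plain index array (the real code works on the variable-length npatterns encoding and also requires an SVE data mode with unit size at most 8):

    #include <stdbool.h>
    #include <stdio.h>

    static bool
    sel_permute_p (const int *perm, int len)
    {
      for (int i = 0; i < len; i++)
        if (perm[i] != i && perm[i] != len + i)
          return false;
      return true;
    }

    int
    main (void)
    {
      int ok[4] = { 0, 5, 2, 7 };   /* lanes from op0, op1, op0, op1 */
      int bad[4] = { 1, 5, 2, 7 };  /* lane 0 taken from the wrong position */
      printf ("%d %d\n", sel_permute_p (ok, 4), sel_permute_p (bad, 4));
      return 0;
    }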
*/ @@ -16893,17 +19409,19 @@ aarch64_unspec_cond_code (rtx_code code) switch (code) { case NE: - return UNSPEC_COND_NE; + return UNSPEC_COND_FCMNE; case EQ: - return UNSPEC_COND_EQ; + return UNSPEC_COND_FCMEQ; case LT: - return UNSPEC_COND_LT; + return UNSPEC_COND_FCMLT; case GT: - return UNSPEC_COND_GT; + return UNSPEC_COND_FCMGT; case LE: - return UNSPEC_COND_LE; + return UNSPEC_COND_FCMLE; case GE: - return UNSPEC_COND_GE; + return UNSPEC_COND_FCMGE; + case UNORDERED: + return UNSPEC_COND_FCMUO; default: gcc_unreachable (); } @@ -16911,78 +19429,58 @@ aarch64_unspec_cond_code (rtx_code code) /* Emit: - (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_)) + (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_)) - where is the operation associated with comparison CODE. This form - of instruction is used when (and (CODE OP0 OP1) PRED) would have different - semantics, such as when PRED might not be all-true and when comparing - inactive lanes could have side effects. */ + where is the operation associated with comparison CODE. + KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */ static void -aarch64_emit_sve_predicated_cond (rtx target, rtx_code code, - rtx pred, rtx op0, rtx op1) +aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred, + bool known_ptrue_p, rtx op0, rtx op1) { + rtx flag = gen_int_mode (known_ptrue_p, SImode); rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred), - gen_rtvec (3, pred, op0, op1), + gen_rtvec (4, pred, flag, op0, op1), aarch64_unspec_cond_code (code)); emit_set_insn (target, unspec); } -/* Expand an SVE integer comparison using the SVE equivalent of: - - (set TARGET (CODE OP0 OP1)). */ - -void -aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1) -{ - machine_mode pred_mode = GET_MODE (target); - machine_mode data_mode = GET_MODE (op0); - - if (!aarch64_sve_cmp_operand_p (code, op1)) - op1 = force_reg (data_mode, op1); - - rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); - rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); - aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond); -} - /* Emit the SVE equivalent of: - (set TMP1 (CODE1 OP0 OP1)) - (set TMP2 (CODE2 OP0 OP1)) + (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_)) + (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_)) (set TARGET (ior:PRED_MODE TMP1 TMP2)) - PTRUE is an all-true predicate with the same mode as TARGET. */ + where is the operation associated with comparison CODEi. + KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */ static void -aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2, - rtx ptrue, rtx op0, rtx op1) +aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2, + rtx pred, bool known_ptrue_p, rtx op0, rtx op1) { - machine_mode pred_mode = GET_MODE (ptrue); + machine_mode pred_mode = GET_MODE (pred); rtx tmp1 = gen_reg_rtx (pred_mode); - aarch64_emit_sve_ptrue_op (tmp1, ptrue, - gen_rtx_fmt_ee (code1, pred_mode, op0, op1)); + aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1); rtx tmp2 = gen_reg_rtx (pred_mode); - aarch64_emit_sve_ptrue_op (tmp2, ptrue, - gen_rtx_fmt_ee (code2, pred_mode, op0, op1)); + aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1); aarch64_emit_binop (target, ior_optab, tmp1, tmp2); } /* Emit the SVE equivalent of: - (set TMP (CODE OP0 OP1)) + (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_)) (set TARGET (not TMP)) - PTRUE is an all-true predicate with the same mode as TARGET. 
*/ + where is the operation associated with comparison CODE. + KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */ static void -aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code, - rtx op0, rtx op1) +aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred, + bool known_ptrue_p, rtx op0, rtx op1) { - machine_mode pred_mode = GET_MODE (ptrue); + machine_mode pred_mode = GET_MODE (pred); rtx tmp = gen_reg_rtx (pred_mode); - aarch64_emit_sve_ptrue_op (tmp, ptrue, - gen_rtx_fmt_ee (code, pred_mode, op0, op1)); + aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1); aarch64_emit_unop (target, one_cmpl_optab, tmp); } @@ -17000,7 +19498,7 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, machine_mode pred_mode = GET_MODE (target); machine_mode data_mode = GET_MODE (op0); - rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); + rtx ptrue = aarch64_ptrue_reg (pred_mode); switch (code) { case UNORDERED: @@ -17015,14 +19513,13 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, case NE: { /* There is native support for the comparison. */ - rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); - aarch64_emit_sve_ptrue_op (target, ptrue, cond); + aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1); return false; } case LTGT: /* This is a trapping operation (LT or GT). */ - aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1); + aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1); return false; case UNEQ: @@ -17030,7 +19527,8 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, { /* This would trap for signaling NaNs. */ op1 = force_reg (data_mode, op1); - aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1); + aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ, + ptrue, true, op0, op1); return false; } /* fall through */ @@ -17043,7 +19541,8 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, /* Work out which elements are ordered. */ rtx ordered = gen_reg_rtx (pred_mode); op1 = force_reg (data_mode, op1); - aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1); + aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED, + ptrue, true, op0, op1); /* Test the opposite condition for the ordered elements, then invert the result. 
*/ @@ -17053,13 +19552,12 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, code = reverse_condition_maybe_unordered (code); if (can_invert_p) { - aarch64_emit_sve_predicated_cond (target, code, - ordered, op0, op1); + aarch64_emit_sve_fp_cond (target, code, + ordered, false, op0, op1); return true; } - rtx tmp = gen_reg_rtx (pred_mode); - aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1); - aarch64_emit_unop (target, one_cmpl_optab, tmp); + aarch64_emit_sve_invert_fp_cond (target, code, + ordered, false, op0, op1); return false; } break; @@ -17077,11 +19575,10 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, code = reverse_condition_maybe_unordered (code); if (can_invert_p) { - rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); - aarch64_emit_sve_ptrue_op (target, ptrue, cond); + aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1); return true; } - aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1); + aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1); return false; } @@ -17104,6 +19601,13 @@ aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode, else aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]); + if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode)) + ops[1] = force_reg (data_mode, ops[1]); + /* The "false" value can only be zero if the "true" value is a constant. */ + if (register_operand (ops[1], data_mode) + || !aarch64_simd_reg_or_zero (ops[2], data_mode)) + ops[2] = force_reg (data_mode, ops[2]); + rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]); emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL)); } @@ -17181,11 +19685,11 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, *dst = aarch64_progress_pointer (*dst); } -/* Expand movmem, as if from a __builtin_memcpy. Return true if +/* Expand cpymem, as if from a __builtin_memcpy. Return true if we succeed, otherwise return false. */ bool -aarch64_expand_movmem (rtx *operands) +aarch64_expand_cpymem (rtx *operands) { int n, mode_bits; rtx dst = operands[0]; @@ -17452,7 +19956,10 @@ aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1, static unsigned HOST_WIDE_INT aarch64_asan_shadow_offset (void) { - return (HOST_WIDE_INT_1 << 36); + if (TARGET_ILP32) + return (HOST_WIDE_INT_1 << 29); + else + return (HOST_WIDE_INT_1 << 36); } static rtx @@ -17758,10 +20265,6 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) } } - if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC) - && aarch_crypto_can_dual_issue (prev, curr)) - return true; - if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH) && any_condjump_p (curr)) { @@ -18545,6 +21048,29 @@ aarch64_fpconst_pow_of_2 (rtx x) return exact_log2 (real_to_integer (r)); } +/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a + power of 2 (i.e 1/2^n) return the number of float bits. e.g. for x==(1/2^n) + return n. Otherwise return -1. */ + +int +aarch64_fpconst_pow2_recip (rtx x) +{ + REAL_VALUE_TYPE r0; + + if (!CONST_DOUBLE_P (x)) + return -1; + + r0 = *CONST_DOUBLE_REAL_VALUE (x); + if (exact_real_inverse (DFmode, &r0) + && !REAL_VALUE_NEGATIVE (r0)) + { + int ret = exact_log2 (real_to_integer (&r0)); + if (ret >= 1 && ret <= 32) + return ret; + } + return -1; +} + /* If X is a vector of equal CONST_DOUBLE values and that value is Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. 
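For illustration only (not part of the patch): aarch64_fpconst_pow2_recip above recognizes positive constants of the form 1/2^n with 1 <= n <= 32. A rough C sketch of that check; the real code uses exact_real_inverse and real_to_integer rather than double arithmetic:

    #include <stdio.h>

    /* Return n if X == 1/2^n for 1 <= n <= 32, else -1.  */
    static int
    recip_pow2_exponent (double x)
    {
      if (!(x > 0.0))
        return -1;
      double inv = 1.0 / x;
      unsigned long long n = (unsigned long long) inv;
      if ((double) n != inv || n == 0 || (n & (n - 1)) != 0)
        return -1;                       /* inverse is not exactly 2^k */
      int k = 0;
      while ((n >>= 1) != 0)
        k++;
      return (k >= 1 && k <= 32) ? k : -1;
    }

    int
    main (void)
    {
      printf ("%d %d %d\n",
              recip_pow2_exponent (0.25),   /* 2  */
              recip_pow2_exponent (0.125),  /* 3  */
              recip_pow2_exponent (0.3));   /* -1 */
      return 0;
    }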
*/ @@ -18765,12 +21291,8 @@ aarch64_select_early_remat_modes (sbitmap modes) /* SVE values are not normally live across a call, so it should be worth doing early rematerialization even in VL-specific mode. */ for (int i = 0; i < NUM_MACHINE_MODES; ++i) - { - machine_mode mode = (machine_mode) i; - unsigned int vec_flags = aarch64_classify_vector_mode (mode); - if (vec_flags & VEC_ANY_SVE) - bitmap_set_bit (modes, i); - } + if (aarch64_sve_mode_p ((machine_mode) i)) + bitmap_set_bit (modes, i); } /* Override the default target speculation_safe_value. */ @@ -18994,6 +21516,55 @@ aarch64_stack_protect_guard (void) return NULL_TREE; } +/* Return the diagnostic message string if conversion from FROMTYPE to + TOTYPE is not allowed, NULL otherwise. */ + +static const char * +aarch64_invalid_conversion (const_tree fromtype, const_tree totype) +{ + if (element_mode (fromtype) != element_mode (totype)) + { + /* Do no allow conversions to/from BFmode scalar types. */ + if (TYPE_MODE (fromtype) == BFmode) + return N_("invalid conversion from type %"); + if (TYPE_MODE (totype) == BFmode) + return N_("invalid conversion to type %"); + } + + /* Conversion allowed. */ + return NULL; +} + +/* Return the diagnostic message string if the unary operation OP is + not permitted on TYPE, NULL otherwise. */ + +static const char * +aarch64_invalid_unary_op (int op, const_tree type) +{ + /* Reject all single-operand operations on BFmode except for &. */ + if (element_mode (type) == BFmode && op != ADDR_EXPR) + return N_("operation not permitted on type %"); + + /* Operation allowed. */ + return NULL; +} + +/* Return the diagnostic message string if the binary operation OP is + not permitted on TYPE1 and TYPE2, NULL otherwise. */ + +static const char * +aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, + const_tree type2) +{ + /* Reject all 2-operand operations on BFmode. */ + if (element_mode (type1) == BFmode + || element_mode (type2) == BFmode) + return N_("operation not permitted on type %"); + + /* Operation allowed. */ + return NULL; +} + /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE section at the end if needed. 
*/ #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 @@ -19137,7 +21708,7 @@ aarch64_run_selftests (void) #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list #undef TARGET_CALLEE_COPIES -#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false +#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false #undef TARGET_CAN_ELIMINATE #define TARGET_CAN_ELIMINATE aarch64_can_eliminate @@ -19247,6 +21818,15 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_MANGLE_TYPE #define TARGET_MANGLE_TYPE aarch64_mangle_type +#undef TARGET_INVALID_CONVERSION +#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion + +#undef TARGET_INVALID_UNARY_OP +#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op + +#undef TARGET_INVALID_BINARY_OP +#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op + #undef TARGET_MEMORY_MOVE_COST #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost @@ -19370,6 +21950,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p +#undef TARGET_COMPATIBLE_VECTOR_TYPES_P +#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p + #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \ aarch64_builtin_support_vector_misalignment @@ -19517,13 +22100,8 @@ aarch64_libgcc_floating_mode_supported_p #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \ aarch64_hard_regno_call_part_clobbered -#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS -#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \ - aarch64_remove_extra_call_preserved_regs - -#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS -#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \ - aarch64_return_call_with_max_clobbers +#undef TARGET_INSN_CALLEE_ABI +#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi #undef TARGET_CONSTANT_ALIGNMENT #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment @@ -19566,11 +22144,20 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_GET_MULTILIB_ABI_NAME #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name +#undef TARGET_FNTYPE_ABI +#define TARGET_FNTYPE_ABI aarch64_fntype_abi + #if CHECKING_P #undef TARGET_RUN_TARGET_SELFTESTS #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests #endif /* #if CHECKING_P */ +#undef TARGET_ASM_POST_CFI_STARTPROC +#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc + +#undef TARGET_STRICT_ARGUMENT_NAMING +#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-aarch64.h" diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 772a97296..d5341656f 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -192,6 +192,31 @@ extern unsigned aarch64_architecture_version; /* Execution and Data Prediction Restriction instructions supported. */ #define AARCH64_FL_PREDRES (1 << 27) +/* SVE2 instruction supported. */ +#define AARCH64_FL_SVE2 (1 << 28) +#define AARCH64_FL_SVE2_AES (1 << 29) +#define AARCH64_FL_SVE2_SM4 (1 << 30) +#define AARCH64_FL_SVE2_SHA3 (1ULL << 31) +#define AARCH64_FL_SVE2_BITPERM (1ULL << 32) + +/* Transactional Memory Extension. */ +#define AARCH64_FL_TME (1ULL << 33) /* Has TME instructions. */ + +/* Armv8.6-A architecture extensions. */ +#define AARCH64_FL_V8_6 (1ULL << 34) + +/* 8-bit Integer Matrix Multiply (I8MM) extensions. 
*/ +#define AARCH64_FL_I8MM (1ULL << 35) + +/* Brain half-precision floating-point (BFloat16) Extension. */ +#define AARCH64_FL_BF16 (1ULL << 36) + +/* 32-bit Floating-point Matrix Multiply (F32MM) extensions. */ +#define AARCH64_FL_F32MM (1ULL << 37) + +/* 64-bit Floating-point Matrix Multiply (F64MM) extensions. */ +#define AARCH64_FL_F64MM (1ULL << 38) + /* Has FP and SIMD. */ #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) @@ -213,6 +238,9 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_FOR_ARCH8_5 \ (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_5 \ | AARCH64_FL_SB | AARCH64_FL_SSBS | AARCH64_FL_PREDRES) +#define AARCH64_FL_FOR_ARCH8_6 \ + (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \ + | AARCH64_FL_I8MM | AARCH64_FL_BF16) /* Macros to test ISA flags. */ @@ -225,6 +253,7 @@ extern unsigned aarch64_architecture_version; #define AARCH64_ISA_V8_2 (aarch64_isa_flags & AARCH64_FL_V8_2) #define AARCH64_ISA_F16 (aarch64_isa_flags & AARCH64_FL_F16) #define AARCH64_ISA_SVE (aarch64_isa_flags & AARCH64_FL_SVE) +#define AARCH64_ISA_SVE2 (aarch64_isa_flags & AARCH64_FL_SVE2) #define AARCH64_ISA_V8_3 (aarch64_isa_flags & AARCH64_FL_V8_3) #define AARCH64_ISA_DOTPROD (aarch64_isa_flags & AARCH64_FL_DOTPROD) #define AARCH64_ISA_AES (aarch64_isa_flags & AARCH64_FL_AES) @@ -234,7 +263,14 @@ extern unsigned aarch64_architecture_version; #define AARCH64_ISA_SHA3 (aarch64_isa_flags & AARCH64_FL_SHA3) #define AARCH64_ISA_F16FML (aarch64_isa_flags & AARCH64_FL_F16FML) #define AARCH64_ISA_RCPC8_4 (aarch64_isa_flags & AARCH64_FL_RCPC8_4) +#define AARCH64_ISA_RNG (aarch64_isa_flags & AARCH64_FL_RNG) #define AARCH64_ISA_V8_5 (aarch64_isa_flags & AARCH64_FL_V8_5) +#define AARCH64_ISA_TME (aarch64_isa_flags & AARCH64_FL_TME) +#define AARCH64_ISA_V8_6 (aarch64_isa_flags & AARCH64_FL_V8_6) +#define AARCH64_ISA_I8MM (aarch64_isa_flags & AARCH64_FL_I8MM) +#define AARCH64_ISA_F32MM (aarch64_isa_flags & AARCH64_FL_F32MM) +#define AARCH64_ISA_F64MM (aarch64_isa_flags & AARCH64_FL_F64MM) +#define AARCH64_ISA_BF16 (aarch64_isa_flags & AARCH64_FL_BF16) /* Crypto is an optional extension to AdvSIMD. */ #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) @@ -270,12 +306,44 @@ extern unsigned aarch64_architecture_version; /* SVE instructions, enabled through +sve. */ #define TARGET_SVE (AARCH64_ISA_SVE) +/* SVE2 instructions, enabled through +sve2. */ +#define TARGET_SVE2 (AARCH64_ISA_SVE2) + /* ARMv8.3-A features. */ #define TARGET_ARMV8_3 (AARCH64_ISA_V8_3) +/* Javascript conversion instruction from Armv8.3-a. */ +#define TARGET_JSCVT (TARGET_FLOAT && AARCH64_ISA_V8_3) + /* Armv8.3-a Complex number extension to AdvSIMD extensions. */ #define TARGET_COMPLEX (TARGET_SIMD && TARGET_ARMV8_3) +/* Floating-point rounding instructions from Armv8.5-a. */ +#define TARGET_FRINT (AARCH64_ISA_V8_5 && TARGET_FLOAT) + +/* TME instructions are enabled. */ +#define TARGET_TME (AARCH64_ISA_TME) + +/* Random number instructions from Armv8.5-a. */ +#define TARGET_RNG (AARCH64_ISA_RNG) + +/* I8MM instructions are enabled through +i8mm. */ +#define TARGET_I8MM (AARCH64_ISA_I8MM) +#define TARGET_SVE_I8MM (TARGET_SVE && AARCH64_ISA_I8MM) + +/* F32MM instructions are enabled through +f32mm. */ +#define TARGET_F32MM (AARCH64_ISA_F32MM) +#define TARGET_SVE_F32MM (TARGET_SVE && AARCH64_ISA_F32MM) + +/* F64MM instructions are enabled through +f64mm. */ +#define TARGET_F64MM (AARCH64_ISA_F64MM) +#define TARGET_SVE_F64MM (TARGET_SVE && AARCH64_ISA_F64MM) + +/* BF16 instructions are enabled through +bf16. 
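For illustration only (not part of the patch): the feature bits above grow past bit 31, which is why the new masks use 1ULL and why the isa-flags variables elsewhere in this backport move from unsigned long to uint64_t. A tiny C sketch of testing such 64-bit masks, with the two flag values copied from the hunk:

    #include <stdint.h>
    #include <stdio.h>

    #define FL_SVE2_SHA3 (1ULL << 31)
    #define FL_F64MM     (1ULL << 38)

    int
    main (void)
    {
      uint64_t isa_flags = FL_SVE2_SHA3 | FL_F64MM;
      printf ("sha3: %d  f64mm: %d\n",
              (isa_flags & FL_SVE2_SHA3) != 0,
              (isa_flags & FL_F64MM) != 0);
      return 0;
    }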
*/ +#define TARGET_BF16_FP (AARCH64_ISA_BF16) +#define TARGET_BF16_SIMD (AARCH64_ISA_BF16 && TARGET_SIMD) +#define TARGET_SVE_BF16 (TARGET_SVE && AARCH64_ISA_BF16) + /* Make sure this is always defined so we don't have to check for ifdefs but rather use normal ifs. */ #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT @@ -338,6 +406,9 @@ extern unsigned aarch64_architecture_version; P0-P7 Predicate low registers: valid in all predicate contexts P8-P15 Predicate high registers: used as scratch space + FFR First Fault Register, a fixed-use SVE predicate register + FFRT FFR token: a fake register used for modelling dependencies + VG Pseudo "vector granules" register VG is the number of 64-bit elements in an SVE vector. We define @@ -358,6 +429,7 @@ extern unsigned aarch64_architecture_version; 1, 1, 1, 1, /* SFP, AP, CC, VG */ \ 0, 0, 0, 0, 0, 0, 0, 0, /* P0 - P7 */ \ 0, 0, 0, 0, 0, 0, 0, 0, /* P8 - P15 */ \ + 1, 1 /* FFR and FFRT */ \ } /* X30 is marked as caller-saved which is in line with regular function call @@ -380,6 +452,7 @@ extern unsigned aarch64_architecture_version; 1, 1, 1, 1, /* SFP, AP, CC, VG */ \ 1, 1, 1, 1, 1, 1, 1, 1, /* P0 - P7 */ \ 1, 1, 1, 1, 1, 1, 1, 1, /* P8 - P15 */ \ + 1, 1 /* FFR and FFRT */ \ } #define REGISTER_NAMES \ @@ -395,6 +468,7 @@ extern unsigned aarch64_architecture_version; "sfp", "ap", "cc", "vg", \ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", \ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", \ + "ffr", "ffrt" \ } /* Generate the register aliases for core register N */ @@ -443,11 +517,12 @@ extern unsigned aarch64_architecture_version; #define FRAME_POINTER_REGNUM SFP_REGNUM #define STACK_POINTER_REGNUM SP_REGNUM #define ARG_POINTER_REGNUM AP_REGNUM -#define FIRST_PSEUDO_REGISTER (P15_REGNUM + 1) +#define FIRST_PSEUDO_REGISTER (FFRT_REGNUM + 1) -/* The number of (integer) argument register available. */ +/* The number of argument registers available for each class. */ #define NUM_ARG_REGS 8 #define NUM_FP_ARG_REGS 8 +#define NUM_PR_ARG_REGS 4 /* A Homogeneous Floating-Point or Short-Vector Aggregate may have at most four members. */ @@ -514,6 +589,9 @@ extern unsigned aarch64_architecture_version; #define ASM_OUTPUT_EXTERNAL(STR, DECL, NAME) \ aarch64_asm_output_external (STR, DECL, NAME) +/* Output assembly strings after .cfi_startproc is emitted. */ +#define ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc + /* For EH returns X4 contains the stack adjustment. 
*/ #define EH_RETURN_STACKADJ_RTX gen_rtx_REG (Pmode, R4_REGNUM) #define EH_RETURN_HANDLER_RTX aarch64_eh_return_handler_rtx () @@ -542,6 +620,9 @@ extern unsigned aarch64_architecture_version; #define FP_LO_REGNUM_P(REGNO) \ (((unsigned) (REGNO - V0_REGNUM)) <= (V15_REGNUM - V0_REGNUM)) +#define FP_LO8_REGNUM_P(REGNO) \ + (((unsigned) (REGNO - V0_REGNUM)) <= (V7_REGNUM - V0_REGNUM)) + #define PR_REGNUM_P(REGNO)\ (((unsigned) (REGNO - P0_REGNUM)) <= (P15_REGNUM - P0_REGNUM)) @@ -560,12 +641,15 @@ enum reg_class GENERAL_REGS, STACK_REG, POINTER_REGS, + FP_LO8_REGS, FP_LO_REGS, FP_REGS, POINTER_AND_FP_REGS, PR_LO_REGS, PR_HI_REGS, PR_REGS, + FFR_REGS, + PR_AND_FFR_REGS, ALL_REGS, LIM_REG_CLASSES /* Last */ }; @@ -579,12 +663,15 @@ enum reg_class "GENERAL_REGS", \ "STACK_REG", \ "POINTER_REGS", \ + "FP_LO8_REGS", \ "FP_LO_REGS", \ "FP_REGS", \ "POINTER_AND_FP_REGS", \ "PR_LO_REGS", \ "PR_HI_REGS", \ "PR_REGS", \ + "FFR_REGS", \ + "PR_AND_FFR_REGS", \ "ALL_REGS" \ } @@ -595,12 +682,15 @@ enum reg_class { 0x7fffffff, 0x00000000, 0x00000003 }, /* GENERAL_REGS */ \ { 0x80000000, 0x00000000, 0x00000000 }, /* STACK_REG */ \ { 0xffffffff, 0x00000000, 0x00000003 }, /* POINTER_REGS */ \ + { 0x00000000, 0x000000ff, 0x00000000 }, /* FP_LO8_REGS */ \ { 0x00000000, 0x0000ffff, 0x00000000 }, /* FP_LO_REGS */ \ { 0x00000000, 0xffffffff, 0x00000000 }, /* FP_REGS */ \ { 0xffffffff, 0xffffffff, 0x00000003 }, /* POINTER_AND_FP_REGS */\ { 0x00000000, 0x00000000, 0x00000ff0 }, /* PR_LO_REGS */ \ { 0x00000000, 0x00000000, 0x000ff000 }, /* PR_HI_REGS */ \ { 0x00000000, 0x00000000, 0x000ffff0 }, /* PR_REGS */ \ + { 0x00000000, 0x00000000, 0x00300000 }, /* FFR_REGS */ \ + { 0x00000000, 0x00000000, 0x003ffff0 }, /* PR_AND_FFR_REGS */ \ { 0xffffffff, 0xffffffff, 0x000fffff } /* ALL_REGS */ \ } @@ -676,7 +766,7 @@ extern enum aarch64_processor aarch64_tune; #ifdef HAVE_POLY_INT_H struct GTY (()) aarch64_frame { - HOST_WIDE_INT reg_offset[FIRST_PSEUDO_REGISTER]; + poly_int64 reg_offset[LAST_SAVED_REGNUM + 1]; /* The number of extra stack bytes taken up by register varargs. This area is allocated by the callee at the very top of the @@ -684,9 +774,12 @@ struct GTY (()) aarch64_frame STACK_BOUNDARY. */ HOST_WIDE_INT saved_varargs_size; - /* The size of the saved callee-save int/FP registers. */ + /* The size of the callee-save registers with a slot in REG_OFFSET. */ + poly_int64 saved_regs_size; - HOST_WIDE_INT saved_regs_size; + /* The size of the callee-save registers with a slot in REG_OFFSET that + are saved below the hard frame pointer. */ + poly_int64 below_hard_fp_saved_regs_size; /* Offset from the base of the frame (incomming SP) to the top of the locals area. This value is always a multiple of @@ -714,6 +807,10 @@ struct GTY (()) aarch64_frame It may be non-zero if no push is used (ie. callee_adjust == 0). */ poly_int64 callee_offset; + /* The size of the stack adjustment before saving or after restoring + SVE registers. */ + poly_int64 sve_callee_adjust; + /* The size of the stack adjustment after saving callee-saves. */ poly_int64 final_adjust; @@ -723,6 +820,11 @@ struct GTY (()) aarch64_frame unsigned wb_candidate1; unsigned wb_candidate2; + /* Big-endian SVE frames need a spare predicate register in order + to save vector registers in the correct layout for unwinding. + This is the register they should use. */ + unsigned spare_pred_reg; + bool laid_out; }; @@ -751,6 +853,10 @@ enum aarch64_abi_type enum arm_pcs { ARM_PCS_AAPCS64, /* Base standard AAPCS for 64 bit. 
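For illustration only (not part of the patch): in REG_CLASS_CONTENTS above, the third word covers hard registers 64 to 95, so the new FFR (84) and FFRT (85) registers land in bits 20 and 21, which is the 0x00300000 mask given for FFR_REGS. A one-line C check (the register numbers come from the aarch64.md hunk later in this diff):

    #include <stdio.h>

    int
    main (void)
    {
      const int FFR_REGNUM = 84, FFRT_REGNUM = 85;
      unsigned mask = (1u << (FFR_REGNUM - 64)) | (1u << (FFRT_REGNUM - 64));
      printf ("0x%08x\n", mask);   /* prints 0x00300000 */
      return 0;
    }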
*/ + ARM_PCS_SIMD, /* For aarch64_vector_pcs functions. */ + ARM_PCS_SVE, /* For functions that pass or return + values in SVE registers. */ + ARM_PCS_TLSDESC, /* For targets of tlsdesc calls. */ ARM_PCS_UNKNOWN }; @@ -777,6 +883,8 @@ typedef struct int aapcs_nextncrn; /* Next next core register number. */ int aapcs_nvrn; /* Next Vector register number. */ int aapcs_nextnvrn; /* Next Next Vector register number. */ + int aapcs_nprn; /* Next Predicate register number. */ + int aapcs_nextnprn; /* Next Next Predicate register number. */ rtx aapcs_reg; /* Register assigned to this argument. This is NULL_RTX if this parameter goes on the stack. */ @@ -787,6 +895,8 @@ typedef struct aapcs_reg == NULL_RTX. */ int aapcs_stack_size; /* The total size (in words, per 8 byte) of the stack arg area so far. */ + bool silent_p; /* True if we should act silently, rather than + raise an error for invalid calls. */ } CUMULATIVE_ARGS; #endif @@ -842,7 +952,7 @@ typedef struct /* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure. move_by_pieces will continually copy the largest safe chunks. So a 7-byte copy is a 4-byte + 2-byte + byte copy. This proves inefficient - for both size and speed of copy, so we will instead use the "movmem" + for both size and speed of copy, so we will instead use the "cpymem" standard name to implement the copy. This logic does not apply when targeting -mstrict-align, so keep a sensible default in that case. */ #define MOVE_RATIO(speed) \ @@ -1025,13 +1135,13 @@ extern enum aarch64_code_model aarch64_cmodel; #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \ ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \ || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \ - || (MODE) == DFmode) + || (MODE) == DFmode || (MODE) == V4BFmode) /* Modes valid for AdvSIMD Q registers. */ #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \ ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \ || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \ - || (MODE) == V2DFmode) + || (MODE) == V2DFmode || (MODE) == V8BFmode) #define ENDIAN_LANE_N(NUNITS, N) \ (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N) @@ -1079,6 +1189,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); extern tree aarch64_fp16_type_node; extern tree aarch64_fp16_ptr_type_node; +/* This type is the user-visible __bf16, and a pointer to that type. Defined + in aarch64-builtins.c. */ +extern tree aarch64_bf16_type_node; +extern tree aarch64_bf16_ptr_type_node; + /* The generic unwind code in libgcc does not initialize the frame pointer. So in order to unwind a function using a frame pointer, the very first function that is unwound must save the frame pointer. That way the frame @@ -1094,7 +1209,8 @@ extern poly_uint16 aarch64_sve_vg; #define BITS_PER_SVE_VECTOR (poly_uint16 (aarch64_sve_vg * 64)) #define BYTES_PER_SVE_VECTOR (poly_uint16 (aarch64_sve_vg * 8)) -/* The number of bytes in an SVE predicate. */ +/* The number of bits and bytes in an SVE predicate. */ +#define BITS_PER_SVE_PRED BYTES_PER_SVE_VECTOR #define BYTES_PER_SVE_PRED aarch64_sve_vg /* The SVE mode for a vector of bytes. 
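For illustration only (not part of the patch): the BITS_PER_SVE_PRED and BYTES_PER_SVE_PRED definitions above encode that an SVE predicate has one bit per vector byte, so with VG 64-bit granules the vector holds 8*VG bytes and the predicate holds 8*VG bits, i.e. VG bytes. A short C table of those relations:

    #include <stdio.h>

    int
    main (void)
    {
      for (int vg = 2; vg <= 16; vg += 2)   /* 128-bit .. 1024-bit vectors */
        printf ("vg=%2d: vector=%3d bytes, predicate=%3d bits (%2d bytes)\n",
                vg, 8 * vg, 8 * vg, vg);
      return 0;
    }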
*/ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 73c34a227..34cccc7cd 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -85,7 +85,6 @@ (V29_REGNUM 61) (V30_REGNUM 62) (V31_REGNUM 63) - (LAST_SAVED_REGNUM 63) (SFP_REGNUM 64) (AP_REGNUM 65) (CC_REGNUM 66) @@ -107,6 +106,11 @@ (P13_REGNUM 81) (P14_REGNUM 82) (P15_REGNUM 83) + (LAST_SAVED_REGNUM 83) + (FFR_REGNUM 84) + ;; "FFR token": a fake register used for representing the scheduling + ;; restrictions on FFR-related operations. + (FFRT_REGNUM 85) ;; Scratch register used by stack clash protection to calculate ;; SVE CFA offsets during probing. (STACK_CLASH_SVE_CFA_REGNUM 11) @@ -120,13 +124,17 @@ ;; Scratch registers used in frame layout. (IP0_REGNUM 16) (IP1_REGNUM 17) + (FP_REGNUM 29) (LR_REGNUM 30) ] ) (define_c_enum "unspec" [ - UNSPEC_AUTI1716 - UNSPEC_AUTISP + UNSPEC_AUTIA1716 + UNSPEC_AUTIB1716 + UNSPEC_AUTIASP + UNSPEC_AUTIBSP + UNSPEC_CALLEE_ABI UNSPEC_CASESI UNSPEC_CRC32B UNSPEC_CRC32CB @@ -138,6 +146,11 @@ UNSPEC_CRC32X UNSPEC_FCVTZS UNSPEC_FCVTZU + UNSPEC_FJCVTZS + UNSPEC_FRINT32Z + UNSPEC_FRINT32X + UNSPEC_FRINT64Z + UNSPEC_FRINT64X UNSPEC_URECPE UNSPEC_FRECPE UNSPEC_FRECPS @@ -169,8 +182,10 @@ UNSPEC_LD4_LANE UNSPEC_MB UNSPEC_NOP - UNSPEC_PACI1716 - UNSPEC_PACISP + UNSPEC_PACIA1716 + UNSPEC_PACIB1716 + UNSPEC_PACIASP + UNSPEC_PACIBSP UNSPEC_PRLG_STK UNSPEC_REV UNSPEC_RBIT @@ -211,26 +226,49 @@ UNSPEC_XPACLRI UNSPEC_LD1_SVE UNSPEC_ST1_SVE + UNSPEC_LDNT1_SVE + UNSPEC_STNT1_SVE UNSPEC_LD1RQ UNSPEC_LD1_GATHER + UNSPEC_LDFF1_GATHER UNSPEC_ST1_SCATTER - UNSPEC_MERGE_PTRUE - UNSPEC_PTEST_PTRUE + UNSPEC_PRED_X + UNSPEC_PRED_Z + UNSPEC_PTEST + UNSPEC_PTRUE UNSPEC_UNPACKSHI UNSPEC_UNPACKUHI UNSPEC_UNPACKSLO UNSPEC_UNPACKULO UNSPEC_PACK - UNSPEC_FLOAT_CONVERT - UNSPEC_WHILE_LO + UNSPEC_WHILELE + UNSPEC_WHILELO + UNSPEC_WHILELS + UNSPEC_WHILELT UNSPEC_LDN UNSPEC_STN UNSPEC_INSR + UNSPEC_CLASTA UNSPEC_CLASTB UNSPEC_FADDA UNSPEC_REV_SUBREG + UNSPEC_REINTERPRET UNSPEC_SPECULATION_TRACKER UNSPEC_COPYSIGN + UNSPEC_TTEST ; Represent transaction test. + UNSPEC_UPDATE_FFR + UNSPEC_UPDATE_FFRT + UNSPEC_RDFFR + UNSPEC_WRFFR + ;; Represents an SVE-style lane index, in which the indexing applies + ;; within the containing 128-bit block. + UNSPEC_SVE_LANE_SELECT + UNSPEC_SVE_CNT_PAT + UNSPEC_SVE_PREFETCH + UNSPEC_SVE_PREFETCH_GATHER + UNSPEC_SVE_COMPACT + UNSPEC_SVE_SPLICE + UNSPEC_LD1RO ]) (define_c_enum "unspecv" [ @@ -246,9 +284,35 @@ UNSPECV_BTI_C ; Represent BTI c. UNSPECV_BTI_J ; Represent BTI j. UNSPECV_BTI_JC ; Represent BTI jc. + UNSPECV_TSTART ; Represent transaction start. + UNSPECV_TCOMMIT ; Represent transaction commit. + UNSPECV_TCANCEL ; Represent transaction cancel. + UNSPEC_RNDR ; Represent RNDR + UNSPEC_RNDRRS ; Represent RNDRRS ] ) +;; These constants are used as a const_int in various SVE unspecs +;; to indicate whether the governing predicate is known to be a PTRUE. +(define_constants + [; Indicates that the predicate might not be a PTRUE. + (SVE_MAYBE_NOT_PTRUE 0) + + ; Indicates that the predicate is known to be a PTRUE. + (SVE_KNOWN_PTRUE 1)]) + +;; These constants are used as a const_int in predicated SVE FP arithmetic +;; to indicate whether the operation is allowed to make additional lanes +;; active without worrying about the effect on faulting behavior. +(define_constants + [; Indicates either that all lanes are active or that the instruction may + ; operate on inactive inputs even if doing so could induce a fault. 
+ (SVE_RELAXED_GP 0) + + ; Indicates that some lanes might be inactive and that the instruction + ; must not operate on inactive inputs if doing so could induce a fault. + (SVE_STRICT_GP 1)]) + ;; If further include files are added the defintion of MD_INCLUDES ;; must be updated. @@ -383,8 +447,8 @@ (define_expand "cbranch4" [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator" - [(match_operand:GPI 1 "register_operand" "") - (match_operand:GPI 2 "aarch64_plus_operand" "")]) + [(match_operand:GPI 1 "register_operand") + (match_operand:GPI 2 "aarch64_plus_operand")]) (label_ref (match_operand 3 "" "")) (pc)))] "" @@ -397,8 +461,8 @@ (define_expand "cbranch4" [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator" - [(match_operand:GPF 1 "register_operand" "") - (match_operand:GPF 2 "aarch64_fp_compare_operand" "")]) + [(match_operand:GPF 1 "register_operand") + (match_operand:GPF 2 "aarch64_fp_compare_operand")]) (label_ref (match_operand 3 "" "")) (pc)))] "" @@ -412,7 +476,7 @@ (define_expand "cbranchcc4" [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator" - [(match_operand 1 "cc_register" "") + [(match_operand 1 "cc_register") (match_operand 2 "const0_operand")]) (label_ref (match_operand 3 "" "")) (pc)))] @@ -475,9 +539,9 @@ ;; csneg x0, x0, x1, mi (define_expand "mod3" - [(match_operand:GPI 0 "register_operand" "") - (match_operand:GPI 1 "register_operand" "") - (match_operand:GPI 2 "const_int_operand" "")] + [(match_operand:GPI 0 "register_operand") + (match_operand:GPI 1 "register_operand") + (match_operand:GPI 2 "const_int_operand")] "" { HOST_WIDE_INT val = INTVAL (operands[2]); @@ -530,10 +594,14 @@ (pc)))] "" { + /* GCC's traditional style has been to use "beq" instead of "b.eq", etc., + but the "." is required for SVE conditions. */ + bool use_dot_p = GET_MODE (operands[1]) == CC_NZCmode; if (get_attr_length (insn) == 8) - return aarch64_gen_far_branch (operands, 2, "Lbcond", "b%M0\\t"); + return aarch64_gen_far_branch (operands, 2, "Lbcond", + use_dot_p ? "b.%M0\\t" : "b%M0\\t"); else - return "b%m0\\t%l2"; + return use_dot_p ? 
"b.%m0\\t%l2" : "b%m0\\t%l2"; } [(set_attr "type" "branch") (set (attr "length") @@ -558,14 +626,14 @@ ;; sub x0, x1, #(CST & 0xfff000) ;; subs x0, x0, #(CST & 0x000fff) ;; b .Label -(define_insn_and_split "*compare_condjump" +(define_insn_and_split "*compare_condjump" [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r") (match_operand:GPI 1 "aarch64_imm24" "n")) (label_ref:P (match_operand 2 "" "")) (pc)))] - "!aarch64_move_imm (INTVAL (operands[1]), mode) - && !aarch64_plus_operand (operands[1], mode) + "!aarch64_move_imm (INTVAL (operands[1]), mode) + && !aarch64_plus_operand (operands[1], mode) && !reload_completed" "#" "&& true" @@ -573,20 +641,21 @@ { HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff; HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000; - rtx tmp = gen_reg_rtx (mode); - emit_insn (gen_add3 (tmp, operands[0], GEN_INT (-hi_imm))); - emit_insn (gen_add3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_add3 (tmp, operands[0], GEN_INT (-hi_imm))); + emit_insn (gen_add3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); - rtx cmp_rtx = gen_rtx_fmt_ee (, mode, cc_reg, const0_rtx); + rtx cmp_rtx = gen_rtx_fmt_ee (, mode, + cc_reg, const0_rtx); emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2])); DONE; } ) (define_expand "casesi" - [(match_operand:SI 0 "register_operand" "") ; Index - (match_operand:SI 1 "const_int_operand" "") ; Lower bound - (match_operand:SI 2 "const_int_operand" "") ; Total range + [(match_operand:SI 0 "register_operand") ; Index + (match_operand:SI 1 "const_int_operand") ; Lower bound + (match_operand:SI 2 "const_int_operand") ; Total range (match_operand:DI 3 "" "") ; Table label (match_operand:DI 4 "" "")] ; Out of range label "" @@ -739,8 +808,12 @@ if (aarch64_return_address_signing_enabled () && TARGET_ARMV8_3 && !crtl->calls_eh_return) - return "retaa"; - + { + if (aarch64_ra_sign_key == AARCH64_KEY_B) + return "retab"; + else + return "retaa"; + } return "ret"; } [(set_attr "type" "branch")] @@ -754,7 +827,7 @@ (define_insn "simple_return" [(simple_return)] - "aarch64_use_simple_return_insn_p ()" + "" "ret" [(set_attr "type" "branch")] ) @@ -868,14 +941,15 @@ ;; ------------------------------------------------------------------- (define_expand "call" - [(parallel [(call (match_operand 0 "memory_operand" "") - (match_operand 1 "general_operand" "")) - (use (match_operand 2 "" "")) - (clobber (reg:DI LR_REGNUM))])] + [(parallel + [(call (match_operand 0 "memory_operand") + (match_operand 1 "general_operand")) + (unspec:DI [(match_operand 2 "const_int_operand")] UNSPEC_CALLEE_ABI) + (clobber (reg:DI LR_REGNUM))])] "" " { - aarch64_expand_call (NULL_RTX, operands[0], false); + aarch64_expand_call (NULL_RTX, operands[0], operands[2], false); DONE; }" ) @@ -883,6 +957,7 @@ (define_insn "*call_insn" [(call (mem:DI (match_operand:DI 0 "aarch64_call_insn_operand" "r, Usf")) (match_operand 1 "" "")) + (unspec:DI [(match_operand:DI 2 "const_int_operand")] UNSPEC_CALLEE_ABI) (clobber (reg:DI LR_REGNUM))] "" "@ @@ -892,15 +967,16 @@ ) (define_expand "call_value" - [(parallel [(set (match_operand 0 "" "") - (call (match_operand 1 "memory_operand" "") - (match_operand 2 "general_operand" ""))) - (use (match_operand 3 "" "")) - (clobber (reg:DI LR_REGNUM))])] + [(parallel + [(set (match_operand 0 "") + (call (match_operand 1 "memory_operand") + (match_operand 2 "general_operand"))) + (unspec:DI [(match_operand 3 "const_int_operand")] 
UNSPEC_CALLEE_ABI) + (clobber (reg:DI LR_REGNUM))])] "" " { - aarch64_expand_call (operands[0], operands[1], false); + aarch64_expand_call (operands[0], operands[1], operands[3], false); DONE; }" ) @@ -909,6 +985,7 @@ [(set (match_operand 0 "" "") (call (mem:DI (match_operand:DI 1 "aarch64_call_insn_operand" "r, Usf")) (match_operand 2 "" ""))) + (unspec:DI [(match_operand:DI 3 "const_int_operand")] UNSPEC_CALLEE_ABI) (clobber (reg:DI LR_REGNUM))] "" "@ @@ -918,33 +995,36 @@ ) (define_expand "sibcall" - [(parallel [(call (match_operand 0 "memory_operand" "") - (match_operand 1 "general_operand" "")) - (return) - (use (match_operand 2 "" ""))])] + [(parallel + [(call (match_operand 0 "memory_operand") + (match_operand 1 "general_operand")) + (unspec:DI [(match_operand 2 "const_int_operand")] UNSPEC_CALLEE_ABI) + (return)])] "" { - aarch64_expand_call (NULL_RTX, operands[0], true); + aarch64_expand_call (NULL_RTX, operands[0], operands[2], true); DONE; } ) (define_expand "sibcall_value" - [(parallel [(set (match_operand 0 "" "") - (call (match_operand 1 "memory_operand" "") - (match_operand 2 "general_operand" ""))) - (return) - (use (match_operand 3 "" ""))])] + [(parallel + [(set (match_operand 0 "") + (call (match_operand 1 "memory_operand") + (match_operand 2 "general_operand"))) + (unspec:DI [(match_operand 3 "const_int_operand")] UNSPEC_CALLEE_ABI) + (return)])] "" { - aarch64_expand_call (operands[0], operands[1], true); + aarch64_expand_call (operands[0], operands[1], operands[3], true); DONE; } ) (define_insn "*sibcall_insn" [(call (mem:DI (match_operand:DI 0 "aarch64_call_insn_operand" "Ucs, Usf")) - (match_operand 1 "" "")) + (match_operand 1 "")) + (unspec:DI [(match_operand:DI 2 "const_int_operand")] UNSPEC_CALLEE_ABI) (return)] "SIBLING_CALL_P (insn)" "@ @@ -954,10 +1034,11 @@ ) (define_insn "*sibcall_value_insn" - [(set (match_operand 0 "" "") + [(set (match_operand 0 "") (call (mem:DI (match_operand:DI 1 "aarch64_call_insn_operand" "Ucs, Usf")) - (match_operand 2 "" ""))) + (match_operand 2 ""))) + (unspec:DI [(match_operand:DI 3 "const_int_operand")] UNSPEC_CALLEE_ABI) (return)] "SIBLING_CALL_P (insn)" "@ @@ -977,7 +1058,9 @@ { int i; - emit_call_insn (gen_call (operands[0], const0_rtx, NULL)); + /* Untyped calls always use the default ABI. It's only possible to use + ABI variants if we know the type of the target function. 
*/ + emit_call_insn (gen_call (operands[0], const0_rtx, const0_rtx)); for (i = 0; i < XVECLEN (operands[2], 0); i++) { @@ -998,8 +1081,8 @@ ;; ------------------------------------------------------------------- (define_expand "mov" - [(set (match_operand:SHORT 0 "nonimmediate_operand" "") - (match_operand:SHORT 1 "general_operand" ""))] + [(set (match_operand:SHORT 0 "nonimmediate_operand") + (match_operand:SHORT 1 "general_operand"))] "" " if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx) @@ -1055,8 +1138,8 @@ ) (define_expand "mov" - [(set (match_operand:GPI 0 "nonimmediate_operand" "") - (match_operand:GPI 1 "general_operand" ""))] + [(set (match_operand:GPI 0 "nonimmediate_operand") + (match_operand:GPI 1 "general_operand"))] "" " if (MEM_P (operands[0]) && !MEM_VOLATILE_P (operands[0]) @@ -1162,8 +1245,8 @@ ) (define_expand "movti" - [(set (match_operand:TI 0 "nonimmediate_operand" "") - (match_operand:TI 1 "general_operand" ""))] + [(set (match_operand:TI 0 "nonimmediate_operand") + (match_operand:TI 1 "general_operand"))] "" " if (GET_CODE (operands[0]) == MEM && operands[1] != const0_rtx) @@ -1217,8 +1300,8 @@ }) (define_expand "mov" - [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand" "") - (match_operand:GPF_TF_F16 1 "general_operand" ""))] + [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand") + (match_operand:GPF_TF_F16_MOV 1 "general_operand"))] "" { if (!TARGET_FLOAT) @@ -1234,11 +1317,11 @@ } ) -(define_insn "*movhf_aarch64" - [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w , w,?r,w,w ,w ,w,m,r,m ,r") - (match_operand:HF 1 "general_operand" "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))] - "TARGET_FLOAT && (register_operand (operands[0], HFmode) - || aarch64_reg_or_fp_zero (operands[1], HFmode))" +(define_insn "*mov_aarch64" + [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w , w,?r,w,w ,w ,w,m,r,m ,r") + (match_operand:HFBF 1 "general_operand" "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))] + "TARGET_FLOAT && (register_operand (operands[0], mode) + || aarch64_reg_or_fp_zero (operands[1], mode))" "@ movi\\t%0.4h, #0 fmov\\t%h0, %w1 @@ -1363,17 +1446,17 @@ ;; 0 is dst ;; 1 is src -;; 2 is size of move in bytes +;; 2 is size of copy in bytes ;; 3 is alignment -(define_expand "movmemdi" +(define_expand "cpymemdi" [(match_operand:BLK 0 "memory_operand") (match_operand:BLK 1 "memory_operand") (match_operand:DI 2 "immediate_operand") (match_operand:DI 3 "immediate_operand")] "!STRICT_ALIGNMENT" { - if (aarch64_expand_movmem (operands)) + if (aarch64_expand_cpymem (operands)) DONE; FAIL; } @@ -1492,8 +1575,8 @@ (mem:GPI (plus:P (match_dup 1) (match_operand:P 5 "const_int_operand" "n"))))])] "INTVAL (operands[5]) == GET_MODE_SIZE (mode)" - "ldp\\t%2, %3, [%1], %4" - [(set_attr "type" "load_")] + "ldp\\t%2, %3, [%1], %4" + [(set_attr "type" "load_")] ) (define_insn "loadwb_pair_" @@ -1507,7 +1590,7 @@ (mem:GPF (plus:P (match_dup 1) (match_operand:P 5 "const_int_operand" "n"))))])] "INTVAL (operands[5]) == GET_MODE_SIZE (mode)" - "ldp\\t%2, %3, [%1], %4" + "ldp\\t%2, %3, [%1], %4" [(set_attr "type" "neon_load1_2reg")] ) @@ -1540,8 +1623,8 @@ (match_operand:P 5 "const_int_operand" "n"))) (match_operand:GPI 3 "register_operand" "r"))])] "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (mode)" - "stp\\t%2, %3, [%0, %4]!" - [(set_attr "type" "store_")] + "stp\\t%2, %3, [%0, %4]!" 
+ [(set_attr "type" "store_")] ) (define_insn "storewb_pair_" @@ -1556,7 +1639,7 @@ (match_operand:P 5 "const_int_operand" "n"))) (match_operand:GPF 3 "register_operand" "w"))])] "INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (mode)" - "stp\\t%2, %3, [%0, %4]!" + "stp\\t%2, %3, [%0, %4]!" [(set_attr "type" "neon_store1_2reg")] ) @@ -1702,9 +1785,9 @@ (define_expand "add3" [(set - (match_operand:GPI 0 "register_operand" "") - (plus:GPI (match_operand:GPI 1 "register_operand" "") - (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "")))] + (match_operand:GPI 0 "register_operand") + (plus:GPI (match_operand:GPI 1 "register_operand") + (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand")))] "" { /* If operands[1] is a subreg extract the inner RTX. */ @@ -1713,6 +1796,7 @@ /* If the constant is too large for a single instruction and isn't frame based, split off the immediate so it is available for CSE. */ if (!aarch64_plus_immediate (operands[2], mode) + && !(TARGET_SVE && aarch64_sve_plus_immediate (operands[2], mode)) && can_create_pseudo_p () && (!REG_P (op1) || !REGNO_PTR_FRAME_P (REGNO (op1)))) @@ -1730,10 +1814,10 @@ (define_insn "*add3_aarch64" [(set - (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,rk") + (match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,r,rk") (plus:GPI - (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,rk") - (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Uaa,Uav")))] + (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,0,rk") + (match_operand:GPI 2 "aarch64_pluslong_operand" "I,r,w,J,Uaa,Uai,Uav")))] "" "@ add\\t%0, %1, %2 @@ -1741,10 +1825,11 @@ add\\t%0, %1, %2 sub\\t%0, %1, #%n2 # - * return aarch64_output_sve_addvl_addpl (operands[0], operands[1], operands[2]);" - ;; The "alu_imm" type for ADDVL/ADDPL is just a placeholder. - [(set_attr "type" "alu_imm,alu_sreg,neon_add,alu_imm,multiple,alu_imm") - (set_attr "arch" "*,*,simd,*,*,*")] + * return aarch64_output_sve_scalar_inc_dec (operands[2]); + * return aarch64_output_sve_addvl_addpl (operands[2]);" + ;; The "alu_imm" types for INC/DEC and ADDVL/ADDPL are just placeholders. + [(set_attr "type" "alu_imm,alu_sreg,neon_add,alu_imm,multiple,alu_imm,alu_imm") + (set_attr "arch" "*,*,simd,*,*,sve,sve")] ) ;; zero_extend version of above @@ -1823,17 +1908,18 @@ ;; this pattern. (define_insn_and_split "*add3_poly_1" [(set - (match_operand:GPI 0 "register_operand" "=r,r,r,r,r,&r") + (match_operand:GPI 0 "register_operand" "=r,r,r,r,r,r,&r") (plus:GPI - (match_operand:GPI 1 "register_operand" "%rk,rk,rk,rk,rk,rk") - (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "I,r,J,Uaa,Uav,Uat")))] + (match_operand:GPI 1 "register_operand" "%rk,rk,rk,rk,rk,0,rk") + (match_operand:GPI 2 "aarch64_pluslong_or_poly_operand" "I,r,J,Uaa,Uav,Uai,Uat")))] "TARGET_SVE && operands[0] != stack_pointer_rtx" "@ add\\t%0, %1, %2 add\\t%0, %1, %2 sub\\t%0, %1, #%n2 # - * return aarch64_output_sve_addvl_addpl (operands[0], operands[1], operands[2]); + * return aarch64_output_sve_scalar_inc_dec (operands[2]); + * return aarch64_output_sve_addvl_addpl (operands[2]); #" "&& epilogue_completed && !reg_overlap_mentioned_p (operands[0], operands[1]) @@ -1844,8 +1930,8 @@ operands[2], operands[0], NULL_RTX); DONE; } - ;; The "alu_imm" type for ADDVL/ADDPL is just a placeholder. - [(set_attr "type" "alu_imm,alu_sreg,alu_imm,multiple,alu_imm,multiple")] + ;; The "alu_imm" types for INC/DEC and ADDVL/ADDPL are just placeholders. 
+ [(set_attr "type" "alu_imm,alu_sreg,alu_imm,multiple,alu_imm,alu_imm,multiple")] ) (define_split @@ -1897,9 +1983,9 @@ }) (define_expand "addti3" - [(set (match_operand:TI 0 "register_operand" "") - (plus:TI (match_operand:TI 1 "register_operand" "") - (match_operand:TI 2 "aarch64_reg_or_imm" "")))] + [(set (match_operand:TI 0 "register_operand") + (plus:TI (match_operand:TI 1 "register_operand") + (match_operand:TI 2 "aarch64_reg_or_imm")))] "" { rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high; @@ -1930,9 +2016,9 @@ }) (define_expand "addvti4" - [(match_operand:TI 0 "register_operand" "") - (match_operand:TI 1 "register_operand" "") - (match_operand:TI 2 "aarch64_reg_or_imm" "") + [(match_operand:TI 0 "register_operand") + (match_operand:TI 1 "register_operand") + (match_operand:TI 2 "aarch64_reg_or_imm") (label_ref (match_operand 3 "" ""))] "" { @@ -1964,9 +2050,9 @@ }) (define_expand "uaddvti4" - [(match_operand:TI 0 "register_operand" "") - (match_operand:TI 1 "register_operand" "") - (match_operand:TI 2 "aarch64_reg_or_imm" "") + [(match_operand:TI 0 "register_operand") + (match_operand:TI 1 "register_operand") + (match_operand:TI 2 "aarch64_reg_or_imm") (label_ref (match_operand 3 "" ""))] "" { @@ -2501,9 +2587,9 @@ (plus: (match_dup 4) (zero_extend: - (match_operand:GPI 1 "register_operand" ""))) + (match_operand:GPI 1 "register_operand"))) (zero_extend: - (match_operand:GPI 2 "register_operand" ""))) + (match_operand:GPI 2 "register_operand"))) (match_dup 6))) (set (match_operand:GPI 0 "register_operand") (plus:GPI @@ -2564,9 +2650,9 @@ (plus: (match_dup 3) (sign_extend: - (match_operand:GPI 1 "register_operand" ""))) + (match_operand:GPI 1 "register_operand"))) (sign_extend: - (match_operand:GPI 2 "register_operand" ""))) + (match_operand:GPI 2 "register_operand"))) (sign_extend: (plus:GPI (plus:GPI (match_dup 4) (match_dup 1)) @@ -2835,9 +2921,9 @@ }) (define_expand "subti3" - [(set (match_operand:TI 0 "register_operand" "") - (minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "") - (match_operand:TI 2 "register_operand" "")))] + [(set (match_operand:TI 0 "register_operand") + (minus:TI (match_operand:TI 1 "aarch64_reg_or_zero") + (match_operand:TI 2 "register_operand")))] "" { rtx low_dest, op1_low, op2_low, high_dest, op1_high, op2_high; @@ -3285,12 +3371,12 @@ [(set (reg:CC CC_REGNUM) (compare:CC (zero_extend: - (match_operand:GPI 1 "aarch64_reg_or_zero" "")) + (match_operand:GPI 1 "aarch64_reg_or_zero")) (plus: (zero_extend: - (match_operand:GPI 2 "register_operand" "")) + (match_operand:GPI 2 "register_operand")) (ltu: (reg:CC CC_REGNUM) (const_int 0))))) - (set (match_operand:GPI 0 "register_operand" "") + (set (match_operand:GPI 0 "register_operand") (minus:GPI (minus:GPI (match_dup 1) (match_dup 2)) (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])] @@ -3353,16 +3439,16 @@ (compare:CC_V (minus: (sign_extend: - (match_operand:GPI 1 "aarch64_reg_or_zero" "")) + (match_operand:GPI 1 "aarch64_reg_or_zero")) (plus: (sign_extend: - (match_operand:GPI 2 "register_operand" "")) + (match_operand:GPI 2 "register_operand")) (ltu: (reg:CC CC_REGNUM) (const_int 0)))) (sign_extend: (minus:GPI (match_dup 1) (plus:GPI (ltu:GPI (reg:CC CC_REGNUM) (const_int 0)) (match_dup 2)))))) - (set (match_operand:GPI 0 "register_operand" "") + (set (match_operand:GPI 0 "register_operand") (minus:GPI (minus:GPI (match_dup 1) (match_dup 2)) (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])] @@ -3475,8 +3561,8 @@ ) (define_expand "abs2" - [(match_operand:GPI 0 "register_operand" "") - 
(match_operand:GPI 1 "register_operand" "")] + [(match_operand:GPI 0 "register_operand") + (match_operand:GPI 1 "register_operand")] "" { rtx ccreg = aarch64_gen_compare_reg (LT, operands[1], const0_rtx); @@ -3889,10 +3975,10 @@ ;; ------------------------------------------------------------------- (define_expand "cstore4" - [(set (match_operand:SI 0 "register_operand" "") + [(set (match_operand:SI 0 "register_operand") (match_operator:SI 1 "aarch64_comparison_operator" - [(match_operand:GPI 2 "register_operand" "") - (match_operand:GPI 3 "aarch64_plus_operand" "")]))] + [(match_operand:GPI 2 "register_operand") + (match_operand:GPI 3 "aarch64_plus_operand")]))] "" " operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2], @@ -3914,10 +4000,10 @@ (define_expand "cstore4" - [(set (match_operand:SI 0 "register_operand" "") + [(set (match_operand:SI 0 "register_operand") (match_operator:SI 1 "aarch64_comparison_operator_mode" - [(match_operand:GPF 2 "register_operand" "") - (match_operand:GPF 3 "aarch64_fp_compare_operand" "")]))] + [(match_operand:GPF 2 "register_operand") + (match_operand:GPF 3 "aarch64_fp_compare_operand")]))] "" " operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2], @@ -4002,13 +4088,13 @@ ) (define_expand "cmov6" - [(set (match_operand:GPI 0 "register_operand" "") + [(set (match_operand:GPI 0 "register_operand") (if_then_else:GPI (match_operator 1 "aarch64_comparison_operator" - [(match_operand:GPI 2 "register_operand" "") - (match_operand:GPI 3 "aarch64_plus_operand" "")]) - (match_operand:GPI 4 "register_operand" "") - (match_operand:GPI 5 "register_operand" "")))] + [(match_operand:GPI 2 "register_operand") + (match_operand:GPI 3 "aarch64_plus_operand")]) + (match_operand:GPI 4 "register_operand") + (match_operand:GPI 5 "register_operand")))] "" " operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2], @@ -4018,13 +4104,13 @@ ) (define_expand "cmov6" - [(set (match_operand:GPF 0 "register_operand" "") + [(set (match_operand:GPF 0 "register_operand") (if_then_else:GPF (match_operator 1 "aarch64_comparison_operator" - [(match_operand:GPF 2 "register_operand" "") - (match_operand:GPF 3 "aarch64_fp_compare_operand" "")]) - (match_operand:GPF 4 "register_operand" "") - (match_operand:GPF 5 "register_operand" "")))] + [(match_operand:GPF 2 "register_operand") + (match_operand:GPF 3 "aarch64_fp_compare_operand")]) + (match_operand:GPF 4 "register_operand") + (match_operand:GPF 5 "register_operand")))] "" " operands[2] = aarch64_gen_compare_reg (GET_CODE (operands[1]), operands[2], @@ -4102,10 +4188,10 @@ ) (define_expand "movcc" - [(set (match_operand:ALLI 0 "register_operand" "") - (if_then_else:ALLI (match_operand 1 "aarch64_comparison_operator" "") - (match_operand:ALLI 2 "register_operand" "") - (match_operand:ALLI 3 "register_operand" "")))] + [(set (match_operand:ALLI 0 "register_operand") + (if_then_else:ALLI (match_operand 1 "aarch64_comparison_operator") + (match_operand:ALLI 2 "register_operand") + (match_operand:ALLI 3 "register_operand")))] "" { rtx ccreg; @@ -4121,10 +4207,10 @@ ) (define_expand "movcc" - [(set (match_operand:GPI 0 "register_operand" "") - (if_then_else:GPI (match_operand 1 "aarch64_comparison_operator" "") - (match_operand:GPF 2 "register_operand" "") - (match_operand:GPF 3 "register_operand" "")))] + [(set (match_operand:GPI 0 "register_operand") + (if_then_else:GPI (match_operand 1 "aarch64_comparison_operator") + (match_operand:GPF 2 "register_operand") + (match_operand:GPF 3 
"register_operand")))] "" { rtx ccreg; @@ -4140,10 +4226,10 @@ ) (define_expand "movcc" - [(set (match_operand:GPF 0 "register_operand" "") - (if_then_else:GPF (match_operand 1 "aarch64_comparison_operator" "") - (match_operand:GPF 2 "register_operand" "") - (match_operand:GPF 3 "register_operand" "")))] + [(set (match_operand:GPF 0 "register_operand") + (if_then_else:GPF (match_operand 1 "aarch64_comparison_operator") + (match_operand:GPF 2 "register_operand") + (match_operand:GPF 3 "register_operand")))] "" { rtx ccreg; @@ -4159,10 +4245,10 @@ ) (define_expand "cc" - [(set (match_operand:GPI 0 "register_operand" "") - (if_then_else:GPI (match_operand 1 "aarch64_comparison_operator" "") - (NEG_NOT:GPI (match_operand:GPI 2 "register_operand" "")) - (match_operand:GPI 3 "register_operand" "")))] + [(set (match_operand:GPI 0 "register_operand") + (if_then_else:GPI (match_operand 1 "aarch64_comparison_operator") + (NEG_NOT:GPI (match_operand:GPI 2 "register_operand")) + (match_operand:GPI 3 "register_operand")))] "" { rtx ccreg; @@ -4769,7 +4855,7 @@ [(set_attr "type" "alus_imm")] ) -(define_insn "*ands_compare0" +(define_insn "*ands_compare0" [(set (reg:CC_NZ CC_REGNUM) (compare:CC_NZ (zero_extend:GPI (match_operand:SHORT 1 "register_operand" "r")) @@ -5391,7 +5477,7 @@ ;; ------------------------------------------------------------------- (define_expand "" - [(set (match_operand:DI 0 "register_operand" "=r") + [(set (match_operand:DI 0 "register_operand") (ANY_EXTRACT:DI (match_operand:DI 1 "register_operand") (match_operand 2 "aarch64_simd_shift_imm_offset_di") @@ -5647,6 +5733,21 @@ [(set_attr "type" "bfx")] ) +(define_insn "*ashiftsi_extvdi_bfiz" + [(set (match_operand:SI 0 "register_operand" "=r") + (ashift:SI + (match_operator:SI 4 "subreg_lowpart_operator" + [(sign_extract:DI + (match_operand:DI 1 "register_operand" "r") + (match_operand 2 "aarch64_simd_shift_imm_offset_si") + (const_int 0))]) + (match_operand 3 "aarch64_simd_shift_imm_si")))] + "IN_RANGE (INTVAL (operands[2]) + INTVAL (operands[3]), + 1, GET_MODE_BITSIZE (SImode) - 1)" + "sbfiz\\t%w0, %w1, %3, %2" + [(set_attr "type" "bfx")] +) + ;; When the bit position and width of the equivalent extraction add up to 32 ;; we can use a W-reg LSL instruction taking advantage of the implicit ;; zero-extension of the X-reg. @@ -6008,6 +6109,44 @@ [(set_attr "type" "f_cvtf2i")] ) +;; Equal width integer to fp and multiply combine. +(define_insn "*aarch64_cvtf2_mult" + [(set (match_operand:GPF 0 "register_operand" "=w,w") + (mult:GPF (FLOATUORS:GPF + (match_operand: 1 "register_operand" "w,?r")) + (match_operand:GPF 2 "aarch64_fp_pow2_recip" "Dt,Dt")))] + "TARGET_FLOAT" + { + operands[2] = GEN_INT (aarch64_fpconst_pow2_recip (operands[2])); + switch (which_alternative) + { + case 0: + return "cvtf\t%0, %1, #%2"; + case 1: + return "cvtf\t%0, %1, #%2"; + default: + gcc_unreachable (); + } + } + [(set_attr "type" "neon_int_to_fp_,f_cvti2f") + (set_attr "arch" "simd,fp")] +) + +;; Unequal width integer to fp and multiply combine. +(define_insn "*aarch64_cvtf2_mult" + [(set (match_operand:GPF 0 "register_operand" "=w") + (mult:GPF (FLOATUORS:GPF + (match_operand: 1 "register_operand" "r")) + (match_operand:GPF 2 "aarch64_fp_pow2_recip" "Dt")))] + "TARGET_FLOAT" + { + operands[2] = GEN_INT (aarch64_fpconst_pow2_recip (operands[2])); + return "cvtf\t%0, %1, #%2"; + } + [(set_attr "type" "f_cvti2f")] +) + +;; Equal width integer to fp conversion. 
(define_insn "2" [(set (match_operand:GPF 0 "register_operand" "=w,w") (FLOATUORS:GPF (match_operand: 1 "register_operand" "w,?r")))] @@ -6019,6 +6158,7 @@ (set_attr "arch" "simd,fp")] ) +;; Unequal width integer to fp conversions. (define_insn "2" [(set (match_operand:GPF 0 "register_operand" "=w") (FLOATUORS:GPF (match_operand: 1 "register_operand" "r")))] @@ -6241,8 +6381,8 @@ ) (define_expand "sqrt2" - [(set (match_operand:GPF_F16 0 "register_operand" "=w") - (sqrt:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))] + [(set (match_operand:GPF_F16 0 "register_operand") + (sqrt:GPF_F16 (match_operand:GPF_F16 1 "register_operand")))] "TARGET_FLOAT" { if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) @@ -6401,6 +6541,7 @@ ;; ------------------------------------------------------------------- ;; Reload Scalar Floating point modes from constant pool. ;; The AArch64 port doesn't have __int128 constant move support. +;; The patterns need constraints due to TARGET_SECONDARY_RELOAD hook. (define_expand "@aarch64_reload_movcp" [(set (match_operand:GPF_TF 0 "register_operand" "=w") (mem:GPF_TF (match_operand 1 "aarch64_constant_pool_symref" "S"))) @@ -6501,9 +6642,9 @@ ;; rodata section. (define_expand "add_losym" - [(set (match_operand 0 "register_operand" "=r") - (lo_sum (match_operand 1 "register_operand" "r") - (match_operand 2 "aarch64_valid_symref" "S")))] + [(set (match_operand 0 "register_operand") + (lo_sum (match_operand 1 "register_operand") + (match_operand 2 "aarch64_valid_symref")))] "" { machine_mode mode = GET_MODE (operands[0]); @@ -6602,9 +6743,10 @@ ;; instructions in the TLS stubs, in order to enable linker relaxation. ;; Therefore we treat the stubs as an atomic sequence. (define_expand "tlsgd_small_" - [(parallel [(set (match_operand 0 "register_operand" "") + [(parallel [(set (match_operand 0 "register_operand") (call (mem:DI (match_dup 2)) (const_int 1))) - (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "")] UNSPEC_GOTSMALLTLS) + (unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI) + (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref")] UNSPEC_GOTSMALLTLS) (clobber (reg:DI LR_REGNUM))])] "" { @@ -6614,6 +6756,7 @@ (define_insn "*tlsgd_small_" [(set (match_operand 0 "register_operand" "") (call (mem:DI (match_operand:DI 2 "" "")) (const_int 1))) + (unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI) (unspec:DI [(match_operand:PTR 1 "aarch64_valid_symref" "S")] UNSPEC_GOTSMALLTLS) (clobber (reg:DI LR_REGNUM)) ] @@ -6714,7 +6857,12 @@ "TARGET_TLS_DESC" { if (TARGET_SVE) - emit_insn (gen_tlsdesc_small_sve_ (operands[0])); + { + rtx abi = gen_int_mode (aarch64_tlsdesc_abi_id (), DImode); + rtx_insn *call + = emit_call_insn (gen_tlsdesc_small_sve_ (operands[0], abi)); + RTL_CONST_CALL_P (call) = 1; + } else emit_insn (gen_tlsdesc_small_advsimd_ (operands[0])); DONE; @@ -6729,72 +6877,27 @@ UNSPEC_TLSDESC)) (clobber (reg:DI LR_REGNUM)) (clobber (reg:CC CC_REGNUM)) - (clobber (match_scratch:DI 1 "=r"))] + (clobber (match_scratch:DI 1 "=r")) + (use (reg:DI FP_REGNUM))] "TARGET_TLS_DESC && !TARGET_SVE" "adrp\\tx0, %A0\;ldr\\t%1, [x0, #%L0]\;add\\t0, 0, %L0\;.tlsdesccall\\t%0\;blr\\t%1" [(set_attr "type" "call") (set_attr "length" "16")]) -;; For SVE, model tlsdesc calls as clobbering the lower 128 bits of -;; all vector registers, and clobber all predicate registers, on -;; top of the usual R0 and LR. +;; For SVE, model tlsdesc calls as normal calls, with the callee ABI +;; describing the extra call-preserved guarantees. 
This would work +;; for non-SVE too, but avoiding a call is probably better if we can. (define_insn "tlsdesc_small_sve_" [(set (reg:PTR R0_REGNUM) - (unspec:PTR [(match_operand 0 "aarch64_valid_symref" "S")] - UNSPEC_TLSDESC)) + (call (mem:DI (unspec:PTR + [(match_operand 0 "aarch64_valid_symref")] + UNSPEC_TLSDESC)) + (const_int 0))) + (unspec:DI [(match_operand:DI 1 "const_int_operand")] UNSPEC_CALLEE_ABI) (clobber (reg:DI LR_REGNUM)) - (clobber (reg:CC CC_REGNUM)) - (clobber_high (reg:TI V0_REGNUM)) - (clobber_high (reg:TI V1_REGNUM)) - (clobber_high (reg:TI V2_REGNUM)) - (clobber_high (reg:TI V3_REGNUM)) - (clobber_high (reg:TI V4_REGNUM)) - (clobber_high (reg:TI V5_REGNUM)) - (clobber_high (reg:TI V6_REGNUM)) - (clobber_high (reg:TI V7_REGNUM)) - (clobber_high (reg:TI V8_REGNUM)) - (clobber_high (reg:TI V9_REGNUM)) - (clobber_high (reg:TI V10_REGNUM)) - (clobber_high (reg:TI V11_REGNUM)) - (clobber_high (reg:TI V12_REGNUM)) - (clobber_high (reg:TI V13_REGNUM)) - (clobber_high (reg:TI V14_REGNUM)) - (clobber_high (reg:TI V15_REGNUM)) - (clobber_high (reg:TI V16_REGNUM)) - (clobber_high (reg:TI V17_REGNUM)) - (clobber_high (reg:TI V18_REGNUM)) - (clobber_high (reg:TI V19_REGNUM)) - (clobber_high (reg:TI V20_REGNUM)) - (clobber_high (reg:TI V21_REGNUM)) - (clobber_high (reg:TI V22_REGNUM)) - (clobber_high (reg:TI V23_REGNUM)) - (clobber_high (reg:TI V24_REGNUM)) - (clobber_high (reg:TI V25_REGNUM)) - (clobber_high (reg:TI V26_REGNUM)) - (clobber_high (reg:TI V27_REGNUM)) - (clobber_high (reg:TI V28_REGNUM)) - (clobber_high (reg:TI V29_REGNUM)) - (clobber_high (reg:TI V30_REGNUM)) - (clobber_high (reg:TI V31_REGNUM)) - (clobber (reg:VNx2BI P0_REGNUM)) - (clobber (reg:VNx2BI P1_REGNUM)) - (clobber (reg:VNx2BI P2_REGNUM)) - (clobber (reg:VNx2BI P3_REGNUM)) - (clobber (reg:VNx2BI P4_REGNUM)) - (clobber (reg:VNx2BI P5_REGNUM)) - (clobber (reg:VNx2BI P6_REGNUM)) - (clobber (reg:VNx2BI P7_REGNUM)) - (clobber (reg:VNx2BI P8_REGNUM)) - (clobber (reg:VNx2BI P9_REGNUM)) - (clobber (reg:VNx2BI P10_REGNUM)) - (clobber (reg:VNx2BI P11_REGNUM)) - (clobber (reg:VNx2BI P12_REGNUM)) - (clobber (reg:VNx2BI P13_REGNUM)) - (clobber (reg:VNx2BI P14_REGNUM)) - (clobber (reg:VNx2BI P15_REGNUM)) - (clobber (match_scratch:DI 1 "=r"))] + (clobber (match_scratch:DI 2 "=r"))] "TARGET_TLS_DESC && TARGET_SVE" - "adrp\\tx0, %A0\;ldr\\t%1, [x0, #%L0]\;add\\t0, 0, %L0\;.tlsdesccall\\t%0\;blr\\t%1" + "adrp\\tx0, %A0\;ldr\\t%2, [x0, #%L0]\;add\\t0, 0, %L0\;.tlsdesccall\\t%0\;blr\\t%2" [(set_attr "type" "call") (set_attr "length" "16")]) @@ -6808,6 +6911,15 @@ [(set_attr "length" "0")] ) +(define_insn "aarch64_fjcvtzs" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI [(match_operand:DF 1 "register_operand" "w")] + UNSPEC_FJCVTZS))] + "TARGET_JSCVT" + "fjcvtzs\\t%w0, %d1" + [(set_attr "type" "f_cvtf2i")] +) + ;; Pointer authentication patterns are always provided. In architecture ;; revisions prior to ARMv8.3-A these HINT instructions operate as NOPs. ;; This lets the user write portable software which authenticates pointers @@ -6821,7 +6933,7 @@ [(set (reg:DI R30_REGNUM) (unspec:DI [(reg:DI R30_REGNUM) (reg:DI SP_REGNUM)] PAUTH_LR_SP))] "" - "hint\t // asp"; + "hint\t // sp"; ) ;; Signing/Authenticating X17 using X16 as the salt. @@ -6830,7 +6942,7 @@ [(set (reg:DI R17_REGNUM) (unspec:DI [(reg:DI R17_REGNUM) (reg:DI R16_REGNUM)] PAUTH_17_16))] "" - "hint\t // a1716"; + "hint\t // 1716"; ) ;; Stripping the signature in R30. @@ -6885,7 +6997,7 @@ ;; Named pattern for expanding thread pointer reference. 
(define_expand "get_thread_pointerdi" - [(match_operand:DI 0 "register_operand" "=r")] + [(match_operand:DI 0 "register_operand")] "" { rtx tmp = aarch64_load_tp (operands[0]); @@ -6941,13 +7053,15 @@ } [(set_attr "type" "mrs")]) +;; DO NOT SPLIT THIS PATTERN. It is important for security reasons that the +;; canary value does not live beyond the life of this sequence. (define_insn "stack_protect_set_" [(set (match_operand:PTR 0 "memory_operand" "=m") (unspec:PTR [(match_operand:PTR 1 "memory_operand" "m")] UNSPEC_SP_SET)) (set (match_scratch:PTR 2 "=&r") (const_int 0))] "" - "ldr\\t%2, %1\;str\\t%2, %0\;mov\t%2,0" + "ldr\\t%2, %1\;str\\t%2, %0\;mov\t%2, 0" [(set_attr "length" "12") (set_attr "type" "multiple")]) @@ -7122,12 +7236,6 @@ [(set_attr "type" "no_insn")] ) -;; Helper for aarch64.c code. -(define_expand "set_clobber_cc" - [(parallel [(set (match_operand 0) - (match_operand 1)) - (clobber (reg:CC CC_REGNUM))])]) - ;; Hard speculation barrier. (define_insn "speculation_barrier" [(unspec_volatile [(const_int 0)] UNSPECV_SPECULATION_BARRIER)] @@ -7142,10 +7250,10 @@ ;; tracking enabled. Use the speculation tracker to decide whether to ;; copy operand 1 to the target, or to copy the fail value (operand 2). (define_expand "@despeculate_copy" - [(set (match_operand:ALLI_TI 0 "register_operand" "=r") + [(set (match_operand:ALLI_TI 0 "register_operand") (unspec_volatile:ALLI_TI - [(match_operand:ALLI_TI 1 "register_operand" "r") - (match_operand:ALLI_TI 2 "aarch64_reg_or_zero" "rZ") + [(match_operand:ALLI_TI 1 "register_operand") + (match_operand:ALLI_TI 2 "aarch64_reg_or_zero") (use (reg:DI SPECULATION_TRACKER_REGNUM)) (clobber (reg:CC CC_REGNUM))] UNSPECV_SPECULATION_BARRIER))] "" @@ -7235,6 +7343,73 @@ (set_attr "speculation_barrier" "true")] ) +(define_insn "aarch64_" + [(set (match_operand:VSFDF 0 "register_operand" "=w") + (unspec:VSFDF [(match_operand:VSFDF 1 "register_operand" "w")] + FRINTNZX))] + "TARGET_FRINT && TARGET_FLOAT + && !(VECTOR_MODE_P (mode) && !TARGET_SIMD)" + "\\t%0, %1" + [(set_attr "type" "f_rint")] +) + +;; Transactional Memory Extension (TME) instructions. 
+ +(define_insn "tstart" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec_volatile:DI [(const_int 0)] UNSPECV_TSTART)) + (clobber (mem:BLK (scratch)))] + "TARGET_TME" + "tstart\\t%0" + [(set_attr "type" "tme")] +) + +(define_insn "ttest" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec_volatile:DI [(const_int 0)] UNSPEC_TTEST)) + (clobber (mem:BLK (scratch)))] + "TARGET_TME" + "ttest\\t%0" + [(set_attr "type" "tme")] +) + +(define_insn "tcommit" + [(unspec_volatile:BLK [(const_int 0)] UNSPECV_TCOMMIT) + (clobber (mem:BLK (scratch)))] + "TARGET_TME" + "tcommit" + [(set_attr "type" "tme")] +) + +(define_insn "tcancel" + [(unspec_volatile:BLK + [(match_operand 0 "const_int_operand" "n")] UNSPECV_TCANCEL) + (clobber (mem:BLK (scratch)))] + "TARGET_TME && (UINTVAL (operands[0]) <= 65535)" + "tcancel\\t#%0" + [(set_attr "type" "tme")] +) + +(define_insn "aarch64_rndr" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec_volatile:DI [(const_int 0)] UNSPEC_RNDR)) + (set (reg:CC_Z CC_REGNUM) + (unspec_volatile:CC_Z [(const_int 0)] UNSPEC_RNDR))] + "TARGET_RNG" + "mrs\t%0, RNDR" + [(set_attr "type" "mrs")] +) + +(define_insn "aarch64_rndrrs" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec_volatile:DI [(const_int 0)] UNSPEC_RNDRRS)) + (set (reg:CC_Z CC_REGNUM) + (unspec_volatile:CC_Z [(const_int 0)] UNSPEC_RNDRRS))] + "TARGET_RNG" + "mrs\t%0, RNDRRS" + [(set_attr "type" "mrs")] +) + ;; AdvSIMD Stuff (include "aarch64-simd.md") diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index d2cb41be6..e2be8ff6f 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -31,7 +31,7 @@ TargetSave const char *x_aarch64_override_tune_string TargetVariable -unsigned long aarch64_isa_flags = 0 +uint64_t aarch64_isa_flags = 0 TargetVariable unsigned aarch64_enable_bti = 2 @@ -261,3 +261,6 @@ user-land code. TargetVariable long aarch64_stack_protector_guard_offset = 0 +moutline-atomics +Target Report Mask(OUTLINE_ATOMICS) Save +Generate local calls to out-of-line atomic operations. 
diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 534a989c3..2284e7164 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -29,14 +29,77 @@ #include -#pragma GCC push_options - -#pragma GCC target ("+nothing+crc") - #ifdef __cplusplus extern "C" { #endif +#pragma GCC push_options +#pragma GCC target ("arch=armv8.3-a") +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +__jcvt (double __a) +{ + return __builtin_aarch64_jcvtzs (__a); +} + +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.5-a") +__extension__ static __inline float __attribute__ ((__always_inline__)) +__rint32zf (float __a) +{ + return __builtin_aarch64_frint32zsf (__a); +} + +__extension__ static __inline double __attribute__ ((__always_inline__)) +__rint32z (double __a) +{ + return __builtin_aarch64_frint32zdf (__a); +} + +__extension__ static __inline float __attribute__ ((__always_inline__)) +__rint64zf (float __a) +{ + return __builtin_aarch64_frint64zsf (__a); +} + +__extension__ static __inline double __attribute__ ((__always_inline__)) +__rint64z (double __a) +{ + return __builtin_aarch64_frint64zdf (__a); +} + +__extension__ static __inline float __attribute__ ((__always_inline__)) +__rint32xf (float __a) +{ + return __builtin_aarch64_frint32xsf (__a); +} + +__extension__ static __inline double __attribute__ ((__always_inline__)) +__rint32x (double __a) +{ + return __builtin_aarch64_frint32xdf (__a); +} + +__extension__ static __inline float __attribute__ ((__always_inline__)) +__rint64xf (float __a) +{ + return __builtin_aarch64_frint64xsf (__a); +} + +__extension__ static __inline double __attribute__ ((__always_inline__)) +__rint64x (double __a) +{ + return __builtin_aarch64_frint64xdf (__a); +} + + +#pragma GCC pop_options + +#pragma GCC push_options + +#pragma GCC target ("+nothing+crc") + __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) __crc32b (uint32_t __a, uint8_t __b) { @@ -85,10 +148,69 @@ __crc32d (uint32_t __a, uint64_t __b) return __builtin_aarch64_crc32x (__a, __b); } -#ifdef __cplusplus +#pragma GCC pop_options + +#ifdef __ARM_FEATURE_TME +#pragma GCC push_options +#pragma GCC target ("+nothing+tme") + +#define _TMFAILURE_REASON 0x00007fffu +#define _TMFAILURE_RTRY 0x00008000u +#define _TMFAILURE_CNCL 0x00010000u +#define _TMFAILURE_MEM 0x00020000u +#define _TMFAILURE_IMP 0x00040000u +#define _TMFAILURE_ERR 0x00080000u +#define _TMFAILURE_SIZE 0x00100000u +#define _TMFAILURE_NEST 0x00200000u +#define _TMFAILURE_DBG 0x00400000u +#define _TMFAILURE_INT 0x00800000u +#define _TMFAILURE_TRIVIAL 0x01000000u + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__tstart (void) +{ + return __builtin_aarch64_tstart (); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +__tcommit (void) +{ + __builtin_aarch64_tcommit (); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +__tcancel (const uint64_t __reason) +{ + __builtin_aarch64_tcancel (__reason); } + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +__ttest (void) +{ + return __builtin_aarch64_ttest (); +} + +#pragma GCC pop_options #endif +#pragma GCC push_options +#pragma GCC target ("+nothing+rng") +__extension__ static __inline int __attribute__ ((__always_inline__)) +__rndr (uint64_t *__res) +{ + return __builtin_aarch64_rndr (__res); +} + +__extension__ static __inline int __attribute__ 
((__always_inline__)) +__rndrrs (uint64_t *__res) +{ + return __builtin_aarch64_rndrrs (__res); +} + #pragma GCC pop_options +#ifdef __cplusplus +} +#endif + #endif diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h new file mode 100644 index 000000000..984875dcc --- /dev/null +++ b/gcc/config/aarch64/arm_bf16.h @@ -0,0 +1,45 @@ +/* Arm BF16 instrinsics include file. + + Copyright (C) 2019-2020 Free Software Foundation, Inc. + Contributed by Arm. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _AARCH64_BF16_H_ +#define _AARCH64_BF16_H_ + +typedef __bf16 bfloat16_t; +typedef float float32_t; + +#pragma GCC push_options +#pragma GCC target ("+nothing+bf16+nosimd") + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvth_bf16_f32 (float32_t __a) +{ + return __builtin_aarch64_bfcvtbf (__a); +} + +#pragma GCC pop_options + +#endif diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 314ef3018..7435905ff 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -73,6 +73,39 @@ typedef __fp16 float16_t; typedef float float32_t; typedef double float64_t; +typedef __Bfloat16x4_t bfloat16x4_t; +typedef __Bfloat16x8_t bfloat16x8_t; + +typedef struct bfloat16x4x2_t +{ + bfloat16x4_t val[2]; +} bfloat16x4x2_t; + +typedef struct bfloat16x8x2_t +{ + bfloat16x8_t val[2]; +} bfloat16x8x2_t; + +typedef struct bfloat16x4x3_t +{ + bfloat16x4_t val[3]; +} bfloat16x4x3_t; + +typedef struct bfloat16x8x3_t +{ + bfloat16x8_t val[3]; +} bfloat16x8x3_t; + +typedef struct bfloat16x4x4_t +{ + bfloat16x4_t val[4]; +} bfloat16x4x4_t; + +typedef struct bfloat16x8x4_t +{ + bfloat16x8_t val[4]; +} bfloat16x8x4_t; + typedef struct int8x8x2_t { int8x8_t val[2]; @@ -6572,867 +6605,867 @@ vcombine_p64 (poly64x1_t __a, poly64x1_t __b) __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { - int8x8_t result; + int8x8_t __result; __asm__ ("saba %0.8b,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { - int16x4_t result; + int16x4_t __result; __asm__ ("saba %0.4h,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) 
: /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { - int32x2_t result; + int32x2_t __result; __asm__ ("saba %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint8x8_t result; + uint8x8_t __result; __asm__ ("uaba %0.8b,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("uaba %0.4h,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("uaba %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +vabal_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("sabal2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +vabal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("sabal2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +vabal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("sabal2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +vabal_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uabal2 
%0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +vabal_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uabal2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +vabal_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("uabal2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("sabal %0.8h,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("sabal %0.4s,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("sabal %0.2d,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uabal %0.8h,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uabal %0.4s,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +vabal_u32 
(uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("uabal %0.2d,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { - int8x16_t result; + int8x16_t __result; __asm__ ("saba %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("saba %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("saba %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - uint8x16_t result; + uint8x16_t __result; __asm__ ("uaba %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uaba %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uaba %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_s8 (int8x8_t a, int8x8_t b) +vabd_s8 (int8x8_t __a, int8x8_t __b) { - int8x8_t result; + int8x8_t __result; __asm__ ("sabd %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_s16 (int16x4_t a, int16x4_t b) 
+vabd_s16 (int16x4_t __a, int16x4_t __b) { - int16x4_t result; + int16x4_t __result; __asm__ ("sabd %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_s32 (int32x2_t a, int32x2_t b) +vabd_s32 (int32x2_t __a, int32x2_t __b) { - int32x2_t result; + int32x2_t __result; __asm__ ("sabd %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_u8 (uint8x8_t a, uint8x8_t b) +vabd_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8_t result; + uint8x8_t __result; __asm__ ("uabd %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_u16 (uint16x4_t a, uint16x4_t b) +vabd_u16 (uint16x4_t __a, uint16x4_t __b) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("uabd %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_u32 (uint32x2_t a, uint32x2_t b) +vabd_u32 (uint32x2_t __a, uint32x2_t __b) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("uabd %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_s8 (int8x16_t a, int8x16_t b) +vabdl_high_s8 (int8x16_t __a, int8x16_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("sabdl2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_s16 (int16x8_t a, int16x8_t b) +vabdl_high_s16 (int16x8_t __a, int16x8_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("sabdl2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_s32 (int32x4_t a, int32x4_t b) +vabdl_high_s32 (int32x4_t __a, int32x4_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("sabdl2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_u8 (uint8x16_t a, uint8x16_t b) +vabdl_high_u8 (uint8x16_t __a, uint8x16_t __b) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uabdl2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return 
__result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_u16 (uint16x8_t a, uint16x8_t b) +vabdl_high_u16 (uint16x8_t __a, uint16x8_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uabdl2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_u32 (uint32x4_t a, uint32x4_t b) +vabdl_high_u32 (uint32x4_t __a, uint32x4_t __b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("uabdl2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_s8 (int8x8_t a, int8x8_t b) +vabdl_s8 (int8x8_t __a, int8x8_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("sabdl %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_s16 (int16x4_t a, int16x4_t b) +vabdl_s16 (int16x4_t __a, int16x4_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("sabdl %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_s32 (int32x2_t a, int32x2_t b) +vabdl_s32 (int32x2_t __a, int32x2_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("sabdl %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_u8 (uint8x8_t a, uint8x8_t b) +vabdl_u8 (uint8x8_t __a, uint8x8_t __b) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uabdl %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_u16 (uint16x4_t a, uint16x4_t b) +vabdl_u16 (uint16x4_t __a, uint16x4_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uabdl %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_u32 (uint32x2_t a, uint32x2_t b) +vabdl_u32 (uint32x2_t __a, uint32x2_t __b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("uabdl %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_s8 (int8x16_t a, int8x16_t b) +vabdq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16_t result; + int8x16_t __result; __asm__ ("sabd 
%0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_s16 (int16x8_t a, int16x8_t b) +vabdq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("sabd %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_s32 (int32x4_t a, int32x4_t b) +vabdq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("sabd %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_u8 (uint8x16_t a, uint8x16_t b) +vabdq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t result; + uint8x16_t __result; __asm__ ("uabd %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_u16 (uint16x8_t a, uint16x8_t b) +vabdq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uabd %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_u32 (uint32x4_t a, uint32x4_t b) +vabdq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uabd %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_s8 (int8x8_t a) +vaddlv_s8 (int8x8_t __a) { - int16_t result; + int16_t __result; __asm__ ("saddlv %h0,%1.8b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_s16 (int16x4_t a) +vaddlv_s16 (int16x4_t __a) { - int32_t result; + int32_t __result; __asm__ ("saddlv %s0,%1.4h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_u8 (uint8x8_t a) +vaddlv_u8 (uint8x8_t __a) { - uint16_t result; + uint16_t __result; __asm__ ("uaddlv %h0,%1.8b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_u16 (uint16x4_t a) +vaddlv_u16 (uint16x4_t __a) { - uint32_t result; + uint32_t __result; __asm__ ("uaddlv %s0,%1.4h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* 
No clobbers */); - return result; + return __result; } __extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_s8 (int8x16_t a) +vaddlvq_s8 (int8x16_t __a) { - int16_t result; + int16_t __result; __asm__ ("saddlv %h0,%1.16b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_s16 (int16x8_t a) +vaddlvq_s16 (int16x8_t __a) { - int32_t result; + int32_t __result; __asm__ ("saddlv %s0,%1.8h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_s32 (int32x4_t a) +vaddlvq_s32 (int32x4_t __a) { - int64_t result; + int64_t __result; __asm__ ("saddlv %d0,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_u8 (uint8x16_t a) +vaddlvq_u8 (uint8x16_t __a) { - uint16_t result; + uint16_t __result; __asm__ ("uaddlv %h0,%1.16b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_u16 (uint16x8_t a) +vaddlvq_u16 (uint16x8_t __a) { - uint32_t result; + uint32_t __result; __asm__ ("uaddlv %s0,%1.8h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_u32 (uint32x4_t a) +vaddlvq_u32 (uint32x4_t __a) { - uint64_t result; + uint64_t __result; __asm__ ("uaddlv %d0,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtx_f32_f64 (float64x2_t a) +vcvtx_f32_f64 (float64x2_t __a) { - float32x2_t result; + float32x2_t __result; __asm__ ("fcvtxn %0.2s,%1.2d" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtx_high_f32_f64 (float32x2_t a, float64x2_t b) +vcvtx_high_f32_f64 (float32x2_t __a, float64x2_t __b) { - float32x4_t result; + float32x4_t __result; __asm__ ("fcvtxn2 %0.4s,%1.2d" - : "=w"(result) - : "w" (b), "0"(a) + : "=w"(__result) + : "w" (__b), "0"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtxd_f32_f64 (float64_t a) +vcvtxd_f32_f64 (float64_t __a) { - float32_t result; + float32_t __result; __asm__ ("fcvtxn %s0,%d1" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +vmla_n_f32 (float32x2_t __a, float32x2_t __b, 
float32_t __c) { - float32x2_t result; - float32x2_t t1; + float32x2_t __result; + float32x2_t __t1; __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result), "=w"(__t1) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c) { - int16x4_t result; + int16x4_t __result; __asm__ ("mla %0.4h,%2.4h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c) +vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) { - int32x2_t result; + int32x2_t __result; __asm__ ("mla %0.2s,%2.2s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("mla %0.4h,%2.4h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("mla %0.2s,%2.2s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { - int8x8_t result; + int8x8_t __result; __asm__ ("mla %0.8b, %2.8b, %3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { - int16x4_t result; + int16x4_t __result; __asm__ ("mla %0.4h, %2.4h, %3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { - int32x2_t result; + int32x2_t __result; __asm__ ("mla %0.2s, %2.2s, %3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint8x8_t result; + uint8x8_t __result; __asm__ ("mla %0.8b, %2.8b, %3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("mla %0.4h, %2.4h, %3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("mla %0.2s, %2.2s, %3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } #define vmlal_high_lane_s16(a, b, c, d) \ @@ -7549,122 +7582,122 @@ vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) +vmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) +vmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +vmlal_high_n_u16 (uint32x4_t __a, uint16x8_t __b, uint16_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) +vmlal_high_n_u32 (uint64x2_t __a, uint32x4_t __b, uint32_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +vmlal_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) { - int16x8_t result; + 
int16x8_t __result; __asm__ ("smlal2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +vmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("smlal2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +vmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("smlal2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +vmlal_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("umlal2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +vmlal_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umlal2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +vmlal_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umlal2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } #define vmlal_lane_s16(a, b, c, d) \ @@ -7781,388 +7814,388 @@ vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("smlal %0.4s,%2.4h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("smlal %0.2d,%2.2s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return 
result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umlal %0.4s,%2.4h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umlal %0.2d,%2.2s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("smlal %0.8h,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("smlal %0.4s,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("smlal %0.2d,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("umlal %0.8h,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umlal %0.4s,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umlal %0.2d,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), 
"w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - float32x4_t result; - float32x4_t t1; + float32x4_t __result; + float32x4_t __t1; __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result), "=w"(__t1) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) +vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("mla %0.8h,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) +vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("mla %0.4s,%2.4s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) +vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("mla %0.8h,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) +vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("mla %0.4s,%2.4s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { - int8x16_t result; + int8x16_t __result; __asm__ ("mla %0.16b, %2.16b, %3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("mla %0.8h, %2.8h, %3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_s32 (int32x4_t a, int32x4_t b, 
int32x4_t c) +vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("mla %0.4s, %2.4s, %3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - uint8x16_t result; + uint8x16_t __result; __asm__ ("mla %0.16b, %2.16b, %3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("mla %0.8h, %2.8h, %3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("mla %0.4s, %2.4s, %3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { - float32x2_t result; - float32x2_t t1; + float32x2_t __result; + float32x2_t __t1; __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result), "=w"(__t1) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c) { - int16x4_t result; + int16x4_t __result; __asm__ ("mls %0.4h, %2.4h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c) +vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) { - int32x2_t result; + int32x2_t __result; __asm__ ("mls %0.2s, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("mls %0.4h, %2.4h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No 
clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("mls %0.2s, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { - int8x8_t result; + int8x8_t __result; __asm__ ("mls %0.8b,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { - int16x4_t result; + int16x4_t __result; __asm__ ("mls %0.4h,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { - int32x2_t result; + int32x2_t __result; __asm__ ("mls %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint8x8_t result; + uint8x8_t __result; __asm__ ("mls %0.8b,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("mls %0.4h,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("mls %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } #define vmlsl_high_lane_s16(a, b, c, d) \ @@ -8279,122 +8312,122 @@ vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) +vmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) { - 
int32x4_t result; + int32x4_t __result; __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) +vmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +vmlsl_high_n_u16 (uint32x4_t __a, uint16x8_t __b, uint16_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) +vmlsl_high_n_u32 (uint64x2_t __a, uint32x4_t __b, uint32_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +vmlsl_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +vmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +vmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +vmlsl_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +vmlsl_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +vmlsl_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } #define vmlsl_lane_s16(a, b, c, d) \ @@ -8511,543 +8544,543 @@ vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("smlsl %0.8h, %2.8b, %3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { - int32x4_t result; + 
int32x4_t __result; __asm__ ("smlsl %0.4s, %2.4h, %3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { - int64x2_t result; + int64x2_t __result; __asm__ ("smlsl %0.2d, %2.2s, %3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("umlsl %0.8h, %2.8b, %3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umlsl %0.4s, %2.4h, %3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umlsl %0.2d, %2.2s, %3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - float32x4_t result; - float32x4_t t1; + float32x4_t __result; + float32x4_t __t1; __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result), "=w"(__t1) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) +vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("mls %0.8h, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) +vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("mls %0.4s, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) +vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("mls %0.8h, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "x"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) +vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("mls %0.4s, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { - int8x16_t result; + int8x16_t __result; __asm__ ("mls %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { - int16x8_t result; + int16x8_t __result; __asm__ ("mls %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { - int32x4_t result; + int32x4_t __result; __asm__ ("mls %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - uint8x16_t result; + uint8x16_t __result; __asm__ ("mls %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("mls %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("mls %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result) + : "0"(__a), "w"(__b), "w"(__c) : /* No 
clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_s8 (int8x16_t a) +vmovl_high_s8 (int8x16_t __a) { - int16x8_t result; + int16x8_t __result; __asm__ ("sshll2 %0.8h,%1.16b,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_s16 (int16x8_t a) +vmovl_high_s16 (int16x8_t __a) { - int32x4_t result; + int32x4_t __result; __asm__ ("sshll2 %0.4s,%1.8h,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_s32 (int32x4_t a) +vmovl_high_s32 (int32x4_t __a) { - int64x2_t result; + int64x2_t __result; __asm__ ("sshll2 %0.2d,%1.4s,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_u8 (uint8x16_t a) +vmovl_high_u8 (uint8x16_t __a) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("ushll2 %0.8h,%1.16b,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_u16 (uint16x8_t a) +vmovl_high_u16 (uint16x8_t __a) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("ushll2 %0.4s,%1.8h,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_u32 (uint32x4_t a) +vmovl_high_u32 (uint32x4_t __a) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("ushll2 %0.2d,%1.4s,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_s8 (int8x8_t a) +vmovl_s8 (int8x8_t __a) { - int16x8_t result; + int16x8_t __result; __asm__ ("sshll %0.8h,%1.8b,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_s16 (int16x4_t a) +vmovl_s16 (int16x4_t __a) { - int32x4_t result; + int32x4_t __result; __asm__ ("sshll %0.4s,%1.4h,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_s32 (int32x2_t a) +vmovl_s32 (int32x2_t __a) { - int64x2_t result; + int64x2_t __result; __asm__ ("sshll %0.2d,%1.2s,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_u8 (uint8x8_t a) +vmovl_u8 (uint8x8_t __a) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("ushll 
%0.8h,%1.8b,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_u16 (uint16x4_t a) +vmovl_u16 (uint16x4_t __a) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("ushll %0.4s,%1.4h,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_u32 (uint32x2_t a) +vmovl_u32 (uint32x2_t __a) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("ushll %0.2d,%1.2s,#0" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_s16 (int8x8_t a, int16x8_t b) +vmovn_high_s16 (int8x8_t __a, int16x8_t __b) { - int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); __asm__ ("xtn2 %0.16b,%1.8h" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_s32 (int16x4_t a, int32x4_t b) +vmovn_high_s32 (int16x4_t __a, int32x4_t __b) { - int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); + int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); __asm__ ("xtn2 %0.8h,%1.4s" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_s64 (int32x2_t a, int64x2_t b) +vmovn_high_s64 (int32x2_t __a, int64x2_t __b) { - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); + int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); __asm__ ("xtn2 %0.4s,%1.2d" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_u16 (uint8x8_t a, uint16x8_t b) +vmovn_high_u16 (uint8x8_t __a, uint16x8_t __b) { - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); __asm__ ("xtn2 %0.16b,%1.8h" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_u32 (uint16x4_t a, uint32x4_t b) +vmovn_high_u32 (uint16x4_t __a, uint32x4_t __b) { - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); __asm__ ("xtn2 %0.8h,%1.4s" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_u64 (uint32x2_t a, 
uint64x2_t b) +vmovn_high_u64 (uint32x2_t __a, uint64x2_t __b) { - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); __asm__ ("xtn2 %0.4s,%1.2d" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_s16 (int16x8_t a) +vmovn_s16 (int16x8_t __a) { - int8x8_t result; + int8x8_t __result; __asm__ ("xtn %0.8b,%1.8h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_s32 (int32x4_t a) +vmovn_s32 (int32x4_t __a) { - int16x4_t result; + int16x4_t __result; __asm__ ("xtn %0.4h,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_s64 (int64x2_t a) +vmovn_s64 (int64x2_t __a) { - int32x2_t result; + int32x2_t __result; __asm__ ("xtn %0.2s,%1.2d" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_u16 (uint16x8_t a) +vmovn_u16 (uint16x8_t __a) { - uint8x8_t result; + uint8x8_t __result; __asm__ ("xtn %0.8b,%1.8h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_u32 (uint32x4_t a) +vmovn_u32 (uint32x4_t __a) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("xtn %0.4h,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_u64 (uint64x2_t a) +vmovn_u64 (uint64x2_t __a) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("xtn %0.2s,%1.2d" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } #define vmull_high_lane_s16(a, b, c) \ @@ -9156,134 +9189,134 @@ vmovn_u64 (uint64x2_t a) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_n_s16 (int16x8_t a, int16_t b) +vmull_high_n_s16 (int16x8_t __a, int16_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_n_s32 (int32x4_t a, int32_t b) +vmull_high_n_s32 (int32x4_t __a, int32_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_n_u16 
(uint16x8_t a, uint16_t b) +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_n_u32 (uint32x4_t a, uint32_t b) +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_p8 (poly8x16_t a, poly8x16_t b) +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) { - poly16x8_t result; + poly16x8_t __result; __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_s8 (int8x16_t a, int8x16_t b) +vmull_high_s8 (int8x16_t __a, int8x16_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("smull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_s16 (int16x8_t a, int16x8_t b) +vmull_high_s16 (int16x8_t __a, int16x8_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("smull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_s32 (int32x4_t a, int32x4_t b) +vmull_high_s32 (int32x4_t __a, int32x4_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("smull2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_u8 (uint8x16_t a, uint8x16_t b) +vmull_high_u8 (uint8x16_t __a, uint8x16_t __b) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("umull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_u16 (uint16x8_t a, uint16x8_t b) +vmull_high_u16 (uint16x8_t __a, uint16x8_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_u32 (uint32x4_t a, uint32x4_t b) +vmull_high_u32 (uint32x4_t __a, uint32x4_t __b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umull2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), 
"w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } #define vmull_lane_s16(a, b, c) \ @@ -9392,722 +9425,722 @@ vmull_high_u32 (uint32x4_t a, uint32x4_t b) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_n_s16 (int16x4_t a, int16_t b) +vmull_n_s16 (int16x4_t __a, int16_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("smull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_n_s32 (int32x2_t a, int32_t b) +vmull_n_s32 (int32x2_t __a, int32_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("smull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_n_u16 (uint16x4_t a, uint16_t b) +vmull_n_u16 (uint16x4_t __a, uint16_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_n_u32 (uint32x2_t a, uint32_t b) +vmull_n_u32 (uint32x2_t __a, uint32_t __b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_p8 (poly8x8_t a, poly8x8_t b) +vmull_p8 (poly8x8_t __a, poly8x8_t __b) { - poly16x8_t result; + poly16x8_t __result; __asm__ ("pmull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_s8 (int8x8_t a, int8x8_t b) +vmull_s8 (int8x8_t __a, int8x8_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("smull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_s16 (int16x4_t a, int16x4_t b) +vmull_s16 (int16x4_t __a, int16x4_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("smull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_s32 (int32x2_t a, int32x2_t b) +vmull_s32 (int32x2_t __a, int32x2_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("smull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vmull_u8 (uint8x8_t a, uint8x8_t b) +vmull_u8 (uint8x8_t __a, uint8x8_t __b) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("umull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_u16 (uint16x4_t a, uint16x4_t b) +vmull_u16 (uint16x4_t __a, uint16x4_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("umull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_u32 (uint32x2_t a, uint32x2_t b) +vmull_u32 (uint32x2_t __a, uint32x2_t __b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("umull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_s8 (int16x4_t a, int8x8_t b) +vpadal_s8 (int16x4_t __a, int8x8_t __b) { - int16x4_t result; + int16x4_t __result; __asm__ ("sadalp %0.4h,%2.8b" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_s16 (int32x2_t a, int16x4_t b) +vpadal_s16 (int32x2_t __a, int16x4_t __b) { - int32x2_t result; + int32x2_t __result; __asm__ ("sadalp %0.2s,%2.4h" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_s32 (int64x1_t a, int32x2_t b) +vpadal_s32 (int64x1_t __a, int32x2_t __b) { - int64x1_t result; + int64x1_t __result; __asm__ ("sadalp %0.1d,%2.2s" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_u8 (uint16x4_t a, uint8x8_t b) +vpadal_u8 (uint16x4_t __a, uint8x8_t __b) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("uadalp %0.4h,%2.8b" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_u16 (uint32x2_t a, uint16x4_t b) +vpadal_u16 (uint32x2_t __a, uint16x4_t __b) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("uadalp %0.2s,%2.4h" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_u32 (uint64x1_t a, uint32x2_t b) +vpadal_u32 (uint64x1_t __a, uint32x2_t __b) { - uint64x1_t result; + uint64x1_t __result; __asm__ ("uadalp %0.1d,%2.2s" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - 
return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_s8 (int16x8_t a, int8x16_t b) +vpadalq_s8 (int16x8_t __a, int8x16_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("sadalp %0.8h,%2.16b" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_s16 (int32x4_t a, int16x8_t b) +vpadalq_s16 (int32x4_t __a, int16x8_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("sadalp %0.4s,%2.8h" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_s32 (int64x2_t a, int32x4_t b) +vpadalq_s32 (int64x2_t __a, int32x4_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("sadalp %0.2d,%2.4s" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_u8 (uint16x8_t a, uint8x16_t b) +vpadalq_u8 (uint16x8_t __a, uint8x16_t __b) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uadalp %0.8h,%2.16b" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_u16 (uint32x4_t a, uint16x8_t b) +vpadalq_u16 (uint32x4_t __a, uint16x8_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uadalp %0.4s,%2.8h" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_u32 (uint64x2_t a, uint32x4_t b) +vpadalq_u32 (uint64x2_t __a, uint32x4_t __b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("uadalp %0.2d,%2.4s" - : "=w"(result) - : "0"(a), "w"(b) + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_s8 (int8x8_t a) +vpaddl_s8 (int8x8_t __a) { - int16x4_t result; + int16x4_t __result; __asm__ ("saddlp %0.4h,%1.8b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_s16 (int16x4_t a) +vpaddl_s16 (int16x4_t __a) { - int32x2_t result; + int32x2_t __result; __asm__ ("saddlp %0.2s,%1.4h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_s32 (int32x2_t a) +vpaddl_s32 (int32x2_t __a) { - int64x1_t result; + int64x1_t __result; __asm__ ("saddlp %0.1d,%1.2s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } 
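The hunks above and below apply one mechanical change across the inline-asm portion of arm_neon.h: parameters and locals spelled a, b, result, tab, idx and temp become __a, __b, __result, __tab, __idx and __temp.  Identifiers beginning with two underscores are reserved for the implementation, so the renamed intrinsics can no longer be broken by user macros that happen to reuse the old plain names.  A minimal sketch of the failure mode being closed, assuming the whole header has been converted as this series of hunks works toward (the macros and the widen() wrapper are illustrative, not part of the patch):

/* Legal user code: plain identifiers are not reserved, so a program
   may define them as macros before including the header.  */
#define result 0
#define a (-1)
#include <arm_neon.h>   /* with the old spelling, "int16x4_t result;"
                           expanded to "int16x4_t 0;" and the include failed */

int16x4_t
widen (int8x8_t v)
{
  return vpaddl_s8 (v);   /* pairwise widening add (saddlp), renamed above */
}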
__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_u8 (uint8x8_t a) +vpaddl_u8 (uint8x8_t __a) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("uaddlp %0.4h,%1.8b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_u16 (uint16x4_t a) +vpaddl_u16 (uint16x4_t __a) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("uaddlp %0.2s,%1.4h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_u32 (uint32x2_t a) +vpaddl_u32 (uint32x2_t __a) { - uint64x1_t result; + uint64x1_t __result; __asm__ ("uaddlp %0.1d,%1.2s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_s8 (int8x16_t a) +vpaddlq_s8 (int8x16_t __a) { - int16x8_t result; + int16x8_t __result; __asm__ ("saddlp %0.8h,%1.16b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_s16 (int16x8_t a) +vpaddlq_s16 (int16x8_t __a) { - int32x4_t result; + int32x4_t __result; __asm__ ("saddlp %0.4s,%1.8h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_s32 (int32x4_t a) +vpaddlq_s32 (int32x4_t __a) { - int64x2_t result; + int64x2_t __result; __asm__ ("saddlp %0.2d,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_u8 (uint8x16_t a) +vpaddlq_u8 (uint8x16_t __a) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uaddlp %0.8h,%1.16b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_u16 (uint16x8_t a) +vpaddlq_u16 (uint16x8_t __a) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uaddlp %0.4s,%1.8h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_u32 (uint32x4_t a) +vpaddlq_u32 (uint32x4_t __a) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("uaddlp %0.2d,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_s8 (int8x16_t a, int8x16_t b) +vpaddq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16_t result; + int8x16_t __result; __asm__ ("addp %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : 
"=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_s16 (int16x8_t a, int16x8_t b) +vpaddq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("addp %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_s32 (int32x4_t a, int32x4_t b) +vpaddq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("addp %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_s64 (int64x2_t a, int64x2_t b) +vpaddq_s64 (int64x2_t __a, int64x2_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("addp %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_u8 (uint8x16_t a, uint8x16_t b) +vpaddq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t result; + uint8x16_t __result; __asm__ ("addp %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_u16 (uint16x8_t a, uint16x8_t b) +vpaddq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("addp %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_u32 (uint32x4_t a, uint32x4_t b) +vpaddq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("addp %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_u64 (uint64x2_t a, uint64x2_t b) +vpaddq_u64 (uint64x2_t __a, uint64x2_t __b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("addp %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulh_n_s16 (int16x4_t a, int16_t b) +vqdmulh_n_s16 (int16x4_t __a, int16_t __b) { - int16x4_t result; + int16x4_t __result; __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulh_n_s32 (int32x2_t a, int32_t b) +vqdmulh_n_s32 
(int32x2_t __a, int32_t __b) { - int32x2_t result; + int32x2_t __result; __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhq_n_s16 (int16x8_t a, int16_t b) +vqdmulhq_n_s16 (int16x8_t __a, int16_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhq_n_s32 (int32x4_t a, int32_t b) +vqdmulhq_n_s32 (int32x4_t __a, int32_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_s16 (int8x8_t a, int16x8_t b) +vqmovn_high_s16 (int8x8_t __a, int16x8_t __b) { - int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); __asm__ ("sqxtn2 %0.16b, %1.8h" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_s32 (int16x4_t a, int32x4_t b) +vqmovn_high_s32 (int16x4_t __a, int32x4_t __b) { - int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); + int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); __asm__ ("sqxtn2 %0.8h, %1.4s" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_s64 (int32x2_t a, int64x2_t b) +vqmovn_high_s64 (int32x2_t __a, int64x2_t __b) { - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); + int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); __asm__ ("sqxtn2 %0.4s, %1.2d" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_u16 (uint8x8_t a, uint16x8_t b) +vqmovn_high_u16 (uint8x8_t __a, uint16x8_t __b) { - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); __asm__ ("uqxtn2 %0.16b, %1.8h" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_u32 (uint16x4_t a, uint32x4_t b) +vqmovn_high_u32 (uint16x4_t __a, uint32x4_t __b) { - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); __asm__ ("uqxtn2 %0.8h, %1.4s" - : 
"+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_u64 (uint32x2_t a, uint64x2_t b) +vqmovn_high_u64 (uint32x2_t __a, uint64x2_t __b) { - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); __asm__ ("uqxtn2 %0.4s, %1.2d" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_high_s16 (uint8x8_t a, int16x8_t b) +vqmovun_high_s16 (uint8x8_t __a, int16x8_t __b) { - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); __asm__ ("sqxtun2 %0.16b, %1.8h" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_high_s32 (uint16x4_t a, int32x4_t b) +vqmovun_high_s32 (uint16x4_t __a, int32x4_t __b) { - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); + uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); __asm__ ("sqxtun2 %0.8h, %1.4s" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_high_s64 (uint32x2_t a, int64x2_t b) +vqmovun_high_s64 (uint32x2_t __a, int64x2_t __b) { - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); + uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); __asm__ ("sqxtun2 %0.4s, %1.2d" - : "+w"(result) - : "w"(b) + : "+w"(__result) + : "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulh_n_s16 (int16x4_t a, int16_t b) +vqrdmulh_n_s16 (int16x4_t __a, int16_t __b) { - int16x4_t result; + int16x4_t __result; __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulh_n_s32 (int32x2_t a, int32_t b) +vqrdmulh_n_s32 (int32x2_t __a, int32_t __b) { - int32x2_t result; + int32x2_t __result; __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhq_n_s16 (int16x8_t a, int16_t b) +vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vqrdmulhq_n_s32 (int32x4_t a, int32_t b) +vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } #define vqrshrn_high_n_s16(a, b, c) \ @@ -10544,26 +10577,26 @@ vqrdmulhq_n_s32 (int32x4_t a, int32_t b) __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrte_u32 (uint32x2_t a) +vrsqrte_u32 (uint32x2_t __a) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("ursqrte %0.2s,%1.2s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrteq_u32 (uint32x4_t a) +vrsqrteq_u32 (uint32x4_t __a) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("ursqrte %0.4s,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } #define vshrn_high_n_s16(a, b, c) \ @@ -10860,48 +10893,48 @@ vrsqrteq_u32 (uint32x4_t a) __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_p8 (poly8x8_t a, poly8x8_t b) +vtst_p8 (poly8x8_t __a, poly8x8_t __b) { - return (uint8x8_t) ((((uint8x8_t) a) & ((uint8x8_t) b)) + return (uint8x8_t) ((((uint8x8_t) __a) & ((uint8x8_t) __b)) != 0); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_p16 (poly16x4_t a, poly16x4_t b) +vtst_p16 (poly16x4_t __a, poly16x4_t __b) { - return (uint16x4_t) ((((uint16x4_t) a) & ((uint16x4_t) b)) + return (uint16x4_t) ((((uint16x4_t) __a) & ((uint16x4_t) __b)) != 0); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_p64 (poly64x1_t a, poly64x1_t b) +vtst_p64 (poly64x1_t __a, poly64x1_t __b) { - return (uint64x1_t) ((a & b) != __AARCH64_INT64_C (0)); + return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0)); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_p8 (poly8x16_t a, poly8x16_t b) +vtstq_p8 (poly8x16_t __a, poly8x16_t __b) { - return (uint8x16_t) ((((uint8x16_t) a) & ((uint8x16_t) b)) + return (uint8x16_t) ((((uint8x16_t) __a) & ((uint8x16_t) __b)) != 0); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_p16 (poly16x8_t a, poly16x8_t b) +vtstq_p16 (poly16x8_t __a, poly16x8_t __b) { - return (uint16x8_t) ((((uint16x8_t) a) & ((uint16x8_t) b)) + return (uint16x8_t) ((((uint16x8_t) __a) & ((uint16x8_t) __b)) != 0); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_p64 (poly64x2_t a, poly64x2_t b) +vtstq_p64 (poly64x2_t __a, poly64x2_t __b) { - return (uint64x2_t) ((((uint64x2_t) a) & ((uint64x2_t) b)) + return (uint64x2_t) ((((uint64x2_t) __a) & ((uint64x2_t) __b)) != __AARCH64_INT64_C (0)); } @@ -11248,20 +11281,20 @@ __ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_s32 (int32x2_t a) +vaddlv_s32 (int32x2_t __a) { - int64_t result; - __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); - return 
result; + int64_t __result; + __asm__ ("saddlp %0.1d, %1.2s" : "=w"(__result) : "w"(__a) : ); + return __result; } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_u32 (uint32x2_t a) +vaddlv_u32 (uint32x2_t __a) { - uint64_t result; - __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); - return result; + uint64_t __result; + __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(__result) : "w"(__a) : ); + return __result; } __extension__ extern __inline int16x4_t @@ -11324,367 +11357,367 @@ vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_p8 (poly8x16_t a, uint8x8_t b) +vqtbl1_p8 (poly8x16_t __a, uint8x8_t __b) { - poly8x8_t result; + poly8x8_t __result; __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_s8 (int8x16_t a, uint8x8_t b) +vqtbl1_s8 (int8x16_t __a, uint8x8_t __b) { - int8x8_t result; + int8x8_t __result; __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +vqtbl1_u8 (uint8x16_t __a, uint8x8_t __b) { - uint8x8_t result; + uint8x8_t __result; __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) +vqtbl1q_p8 (poly8x16_t __a, uint8x16_t __b) { - poly8x16_t result; + poly8x16_t __result; __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_s8 (int8x16_t a, uint8x16_t b) +vqtbl1q_s8 (int8x16_t __a, uint8x16_t __b) { - int8x16_t result; + int8x16_t __result; __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) +vqtbl1q_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t result; + uint8x16_t __result; __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1_s8 (int8x8_t r, int8x16_t tab, uint8x8_t idx) +vqtbx1_s8 (int8x8_t __r, int8x16_t __tab, uint8x8_t __idx) { - int8x8_t result = r; + int8x8_t __result = __r; __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } 
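For reference while reading the vqtbl1*/vqtbx1* hunks just above: the TBL forms return zero for any index outside the 16-byte table, while the TBX forms leave the corresponding lane of the accumulator argument untouched.  A short usage sketch; the reverse_bytes helper and its index table are examples, not code from the patch:

#include <arm_neon.h>

/* Reverse the 16 bytes of a vector with a single TBL lookup.  */
uint8x16_t
reverse_bytes (uint8x16_t v)
{
  static const uint8_t rev_idx[16]
    = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
  /* out[i] = v[rev_idx[i]]; an index >= 16 would yield 0 here (TBL),
     whereas vqtbx1q_u8 would keep the destination lane instead.  */
  return vqtbl1q_u8 (v, vld1q_u8 (rev_idx));
}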
__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) +vqtbx1_u8 (uint8x8_t __r, uint8x16_t __tab, uint8x8_t __idx) { - uint8x8_t result = r; + uint8x8_t __result = __r; __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) +vqtbx1_p8 (poly8x8_t __r, poly8x16_t __tab, uint8x8_t __idx) { - poly8x8_t result = r; + poly8x8_t __result = __r; __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1q_s8 (int8x16_t r, int8x16_t tab, uint8x16_t idx) +vqtbx1q_s8 (int8x16_t __r, int8x16_t __tab, uint8x16_t __idx) { - int8x16_t result = r; + int8x16_t __result = __r; __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) +vqtbx1q_u8 (uint8x16_t __r, uint8x16_t __tab, uint8x16_t __idx) { - uint8x16_t result = r; + uint8x16_t __result = __r; __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) +vqtbx1q_p8 (poly8x16_t __r, poly8x16_t __tab, uint8x16_t __idx) { - poly8x16_t result = r; + poly8x16_t __result = __r; __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } /* V7 legacy table intrinsics. 
*/ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl1_s8 (int8x8_t tab, int8x8_t idx) +vtbl1_s8 (int8x8_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + int8x8_t __result; + int8x16_t __temp = vcombine_s8 (__tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl1_u8 (uint8x8_t tab, uint8x8_t idx) +vtbl1_u8 (uint8x8_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + uint8x8_t __result; + uint8x16_t __temp = vcombine_u8 (__tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl1_p8 (poly8x8_t tab, uint8x8_t idx) +vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); + poly8x8_t __result; + poly8x16_t __temp = vcombine_p8 (__tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl2_s8 (int8x8x2_t tab, int8x8_t idx) +vtbl2_s8 (int8x8x2_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + int8x8_t __result; + int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx) +vtbl2_u8 (uint8x8x2_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + uint8x8_t __result; + uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx) +vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + poly8x8_t __result; + poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) +vtbl3_s8 
(int8x8x3_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16x2_t temp; + int8x8_t __result; + int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_s8 (__tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbl3v8qi (__o, idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = __builtin_aarch64_tbl3v8qi (__o, __idx); + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) +vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16x2_t temp; + uint8x8_t __result; + uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_u8 (__tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) +vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16x2_t temp; + poly8x8_t __result; + poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_p8 (__tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) +vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16x2_t temp; + int8x8_t __result; + int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); + __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbl3v8qi 
(__o, idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = __builtin_aarch64_tbl3v8qi (__o, __idx); + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) +vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16x2_t temp; + uint8x8_t __result; + uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); + __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) +vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16x2_t temp; + poly8x8_t __result; + poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); + __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) +vtbx2_s8 (int8x8_t __r, int8x8x2_t __tab, int8x8_t __idx) { - int8x8_t result = r; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + int8x8_t __result = __r; + int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) + : "+w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) +vtbx2_u8 (uint8x8_t __r, uint8x8x2_t __tab, uint8x8_t __idx) { - uint8x8_t result = r; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + uint8x8_t __result = __r; + uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) + : "+w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) +vtbx2_p8 (poly8x8_t __r, poly8x8x2_t __tab, uint8x8_t __idx) { - poly8x8_t result = r; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + poly8x8_t __result = __r; + poly8x16_t 
__temp = vcombine_p8 (__tab.val[0], __tab.val[1]); __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) + : "+w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } /* End of temporary inline asm. */ @@ -17063,98 +17096,98 @@ vld1_f16 (const float16_t *__a) __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f32 (const float32_t *a) +vld1_f32 (const float32_t *__a) { - return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) a); + return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) __a); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f64 (const float64_t *a) +vld1_f64 (const float64_t *__a) { - return (float64x1_t) {*a}; + return (float64x1_t) {*__a}; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p8 (const poly8_t *a) +vld1_p8 (const poly8_t *__a) { return (poly8x8_t) - __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); + __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p16 (const poly16_t *a) +vld1_p16 (const poly16_t *__a) { return (poly16x4_t) - __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a); + __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p64 (const poly64_t *a) +vld1_p64 (const poly64_t *__a) { - return (poly64x1_t) {*a}; + return (poly64x1_t) {*__a}; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s8 (const int8_t *a) +vld1_s8 (const int8_t *__a) { - return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); + return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s16 (const int16_t *a) +vld1_s16 (const int16_t *__a) { - return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a); + return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s32 (const int32_t *a) +vld1_s32 (const int32_t *__a) { - return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a); + return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) __a); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s64 (const int64_t *a) +vld1_s64 (const int64_t *__a) { - return (int64x1_t) {*a}; + return (int64x1_t) {*__a}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u8 (const uint8_t *a) +vld1_u8 (const uint8_t *__a) { return (uint8x8_t) - __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); + __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u16 (const uint16_t *a) +vld1_u16 (const uint16_t *__a) { return (uint16x4_t) - __builtin_aarch64_ld1v4hi 
((const __builtin_aarch64_simd_hi *) a); + __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u32 (const uint32_t *a) +vld1_u32 (const uint32_t *__a) { return (uint32x2_t) - __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a); + __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) __a); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u64 (const uint64_t *a) +vld1_u64 (const uint64_t *__a) { - return (uint64x1_t) {*a}; + return (uint64x1_t) {*__a}; } /* vld1x3 */ @@ -17536,76 +17569,76 @@ vld1q_f16 (const float16_t *__a) __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_f32 (const float32_t *a) +vld1q_f32 (const float32_t *__a) { - return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a); + return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) __a); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_f64 (const float64_t *a) +vld1q_f64 (const float64_t *__a) { - return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a); + return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) __a); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p8 (const poly8_t *a) +vld1q_p8 (const poly8_t *__a) { return (poly8x16_t) - __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); + __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p16 (const poly16_t *a) +vld1q_p16 (const poly16_t *__a) { return (poly16x8_t) - __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); + __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p64 (const poly64_t *a) +vld1q_p64 (const poly64_t *__a) { return (poly64x2_t) - __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); + __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s8 (const int8_t *a) +vld1q_s8 (const int8_t *__a) { - return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); + return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s16 (const int16_t *a) +vld1q_s16 (const int16_t *__a) { - return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); + return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s32 (const int32_t *a) +vld1q_s32 (const int32_t *__a) { - return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a); + return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s64 (const int64_t *a) 
+vld1q_s64 (const int64_t *__a) { - return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); + return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u8 (const uint8_t *a) +vld1q_u8 (const uint8_t *__a) { return (uint8x16_t) - __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); + __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline uint8x8x2_t @@ -17946,26 +17979,308 @@ vld1q_p64_x2 (const poly64_t *__a) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u16 (const uint16_t *a) +vld1q_u16 (const uint16_t *__a) { return (uint16x8_t) - __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); + __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u32 (const uint32_t *a) +vld1q_u32 (const uint32_t *__a) { return (uint32x4_t) - __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a); + __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u64 (const uint64_t *a) +vld1q_u64 (const uint64_t *__a) { return (uint64x2_t) - __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); + __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); +} + +/* vld1(q)_x4. */ + +__extension__ extern __inline int8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_s8_x4 (const int8_t *__a) +{ + union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; +} + +__extension__ extern __inline int8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_s8_x4 (const int8_t *__a) +{ + union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; +} + +__extension__ extern __inline int16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_s16_x4 (const int16_t *__a) +{ + union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; +} + +__extension__ extern __inline int16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_s16_x4 (const int16_t *__a) +{ + union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; +} + +__extension__ extern __inline int32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_s32_x4 (const int32_t *__a) +{ + union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); + return __au.__i; +} + +__extension__ extern __inline int32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_s32_x4 (const int32_t *__a) +{ + union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4si ((const 
__builtin_aarch64_simd_si *) __a); + return __au.__i; +} + +__extension__ extern __inline uint8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_u8_x4 (const uint8_t *__a) +{ + union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; +} + +__extension__ extern __inline uint8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_u8_x4 (const uint8_t *__a) +{ + union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; +} + +__extension__ extern __inline uint16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_u16_x4 (const uint16_t *__a) +{ + union { uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; +} + +__extension__ extern __inline uint16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_u16_x4 (const uint16_t *__a) +{ + union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; +} + +__extension__ extern __inline uint32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_u32_x4 (const uint32_t *__a) +{ + union { uint32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); + return __au.__i; +} + +__extension__ extern __inline uint32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_u32_x4 (const uint32_t *__a) +{ + union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4si ((const __builtin_aarch64_simd_si *) __a); + return __au.__i; +} + +__extension__ extern __inline float16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_f16_x4 (const float16_t *__a) +{ + union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4hf ((const __builtin_aarch64_simd_hf *) __a); + return __au.__i; +} + +__extension__ extern __inline float16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_f16_x4 (const float16_t *__a) +{ + union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8hf ((const __builtin_aarch64_simd_hf *) __a); + return __au.__i; +} + +__extension__ extern __inline float32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_f32_x4 (const float32_t *__a) +{ + union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2sf ((const __builtin_aarch64_simd_sf *) __a); + return __au.__i; +} + +__extension__ extern __inline float32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_f32_x4 (const float32_t *__a) +{ + union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4sf ((const __builtin_aarch64_simd_sf *) __a); + return __au.__i; +} + +__extension__ extern __inline poly8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_p8_x4 (const poly8_t *__a) +{ + union { poly8x8x4_t __i; __builtin_aarch64_simd_xi 
__o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; +} + +__extension__ extern __inline poly8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_p8_x4 (const poly8_t *__a) +{ + union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; +} + +__extension__ extern __inline poly16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_p16_x4 (const poly16_t *__a) +{ + union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; +} + +__extension__ extern __inline poly16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_p16_x4 (const poly16_t *__a) +{ + union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; +} + +__extension__ extern __inline int64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_s64_x4 (const int64_t *__a) +{ + union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; +} + +__extension__ extern __inline uint64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_u64_x4 (const uint64_t *__a) +{ + union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; +} + +__extension__ extern __inline poly64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_p64_x4 (const poly64_t *__a) +{ + union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; +} + +__extension__ extern __inline int64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_s64_x4 (const int64_t *__a) +{ + union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; +} + +__extension__ extern __inline uint64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_u64_x4 (const uint64_t *__a) +{ + union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; +} + +__extension__ extern __inline poly64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_p64_x4 (const poly64_t *__a) +{ + union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; +} + +__extension__ extern __inline float64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_f64_x4 (const float64_t *__a) +{ + union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4df ((const __builtin_aarch64_simd_df *) __a); + return __au.__i; +} + +__extension__ extern __inline float64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_f64_x4 (const float64_t *__a) +{ + 
union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2df ((const __builtin_aarch64_simd_df *) __a); + return __au.__i; } /* vld1_dup */ @@ -21115,328 +21430,328 @@ vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane) __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_s8 (int8x8_t a, int8x8_t b) +vpmax_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_smaxpv8qi (a, b); + return __builtin_aarch64_smaxpv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_s16 (int16x4_t a, int16x4_t b) +vpmax_s16 (int16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_smaxpv4hi (a, b); + return __builtin_aarch64_smaxpv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_s32 (int32x2_t a, int32x2_t b) +vpmax_s32 (int32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_smaxpv2si (a, b); + return __builtin_aarch64_smaxpv2si (__a, __b); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_u8 (uint8x8_t a, uint8x8_t b) +vpmax_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a, - (int8x8_t) b); + return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) __a, + (int8x8_t) __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_u16 (uint16x4_t a, uint16x4_t b) +vpmax_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a, - (int16x4_t) b); + return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) __a, + (int16x4_t) __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_u32 (uint32x2_t a, uint32x2_t b) +vpmax_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a, - (int32x2_t) b); + return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) __a, + (int32x2_t) __b); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_s8 (int8x16_t a, int8x16_t b) +vpmaxq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_smaxpv16qi (a, b); + return __builtin_aarch64_smaxpv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_s16 (int16x8_t a, int16x8_t b) +vpmaxq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_smaxpv8hi (a, b); + return __builtin_aarch64_smaxpv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_s32 (int32x4_t a, int32x4_t b) +vpmaxq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_smaxpv4si (a, b); + return __builtin_aarch64_smaxpv4si (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_u8 (uint8x16_t a, uint8x16_t b) +vpmaxq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a, - (int8x16_t) b); + return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vpmaxq_u16 (uint16x8_t a, uint16x8_t b) +vpmaxq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a, - (int16x8_t) b); + return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) __a, + (int16x8_t) __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_u32 (uint32x4_t a, uint32x4_t b) +vpmaxq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a, - (int32x4_t) b); + return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) __a, + (int32x4_t) __b); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_f32 (float32x2_t a, float32x2_t b) +vpmax_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_smax_nanpv2sf (a, b); + return __builtin_aarch64_smax_nanpv2sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_f32 (float32x4_t a, float32x4_t b) +vpmaxq_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_smax_nanpv4sf (a, b); + return __builtin_aarch64_smax_nanpv4sf (__a, __b); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_f64 (float64x2_t a, float64x2_t b) +vpmaxq_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_smax_nanpv2df (a, b); + return __builtin_aarch64_smax_nanpv2df (__a, __b); } __extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxqd_f64 (float64x2_t a) +vpmaxqd_f64 (float64x2_t __a) { - return __builtin_aarch64_reduc_smax_nan_scal_v2df (a); + return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a); } __extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxs_f32 (float32x2_t a) +vpmaxs_f32 (float32x2_t __a) { - return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a); + return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a); } /* vpmaxnm */ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnm_f32 (float32x2_t a, float32x2_t b) +vpmaxnm_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_smaxpv2sf (a, b); + return __builtin_aarch64_smaxpv2sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnmq_f32 (float32x4_t a, float32x4_t b) +vpmaxnmq_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_smaxpv4sf (a, b); + return __builtin_aarch64_smaxpv4sf (__a, __b); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnmq_f64 (float64x2_t a, float64x2_t b) +vpmaxnmq_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_smaxpv2df (a, b); + return __builtin_aarch64_smaxpv2df (__a, __b); } __extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnmqd_f64 (float64x2_t a) +vpmaxnmqd_f64 (float64x2_t __a) { - return __builtin_aarch64_reduc_smax_scal_v2df (a); + return __builtin_aarch64_reduc_smax_scal_v2df (__a); } __extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnms_f32 (float32x2_t a) +vpmaxnms_f32 (float32x2_t __a) { - return 
__builtin_aarch64_reduc_smax_scal_v2sf (a); + return __builtin_aarch64_reduc_smax_scal_v2sf (__a); } /* vpmin */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_s8 (int8x8_t a, int8x8_t b) +vpmin_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sminpv8qi (a, b); + return __builtin_aarch64_sminpv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_s16 (int16x4_t a, int16x4_t b) +vpmin_s16 (int16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_sminpv4hi (a, b); + return __builtin_aarch64_sminpv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_s32 (int32x2_t a, int32x2_t b) +vpmin_s32 (int32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_sminpv2si (a, b); + return __builtin_aarch64_sminpv2si (__a, __b); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_u8 (uint8x8_t a, uint8x8_t b) +vpmin_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a, - (int8x8_t) b); + return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) __a, + (int8x8_t) __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_u16 (uint16x4_t a, uint16x4_t b) +vpmin_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a, - (int16x4_t) b); + return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) __a, + (int16x4_t) __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_u32 (uint32x2_t a, uint32x2_t b) +vpmin_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a, - (int32x2_t) b); + return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) __a, + (int32x2_t) __b); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_s8 (int8x16_t a, int8x16_t b) +vpminq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sminpv16qi (a, b); + return __builtin_aarch64_sminpv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_s16 (int16x8_t a, int16x8_t b) +vpminq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_sminpv8hi (a, b); + return __builtin_aarch64_sminpv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_s32 (int32x4_t a, int32x4_t b) +vpminq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_sminpv4si (a, b); + return __builtin_aarch64_sminpv4si (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_u8 (uint8x16_t a, uint8x16_t b) +vpminq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a, - (int8x16_t) b); + return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) __a, + (int8x16_t) __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_u16 (uint16x8_t a, uint16x8_t b) +vpminq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) 
a, - (int16x8_t) b); + return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) __a, + (int16x8_t) __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_u32 (uint32x4_t a, uint32x4_t b) +vpminq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a, - (int32x4_t) b); + return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) __a, + (int32x4_t) __b); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_f32 (float32x2_t a, float32x2_t b) +vpmin_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_smin_nanpv2sf (a, b); + return __builtin_aarch64_smin_nanpv2sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_f32 (float32x4_t a, float32x4_t b) +vpminq_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_smin_nanpv4sf (a, b); + return __builtin_aarch64_smin_nanpv4sf (__a, __b); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_f64 (float64x2_t a, float64x2_t b) +vpminq_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_smin_nanpv2df (a, b); + return __builtin_aarch64_smin_nanpv2df (__a, __b); } __extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminqd_f64 (float64x2_t a) +vpminqd_f64 (float64x2_t __a) { - return __builtin_aarch64_reduc_smin_nan_scal_v2df (a); + return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a); } __extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmins_f32 (float32x2_t a) +vpmins_f32 (float32x2_t __a) { - return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a); + return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a); } /* vpminnm */ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnm_f32 (float32x2_t a, float32x2_t b) +vpminnm_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_sminpv2sf (a, b); + return __builtin_aarch64_sminpv2sf (__a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnmq_f32 (float32x4_t a, float32x4_t b) +vpminnmq_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_sminpv4sf (a, b); + return __builtin_aarch64_sminpv4sf (__a, __b); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnmq_f64 (float64x2_t a, float64x2_t b) +vpminnmq_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_sminpv2df (a, b); + return __builtin_aarch64_sminpv2df (__a, __b); } __extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnmqd_f64 (float64x2_t a) +vpminnmqd_f64 (float64x2_t __a) { - return __builtin_aarch64_reduc_smin_scal_v2df (a); + return __builtin_aarch64_reduc_smin_scal_v2df (__a); } __extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnms_f32 (float32x2_t a) +vpminnms_f32 (float32x2_t __a) { - return __builtin_aarch64_reduc_smin_scal_v2sf (a); + return __builtin_aarch64_reduc_smin_scal_v2sf (__a); } /* vmaxnm */ @@ -21889,9 +22204,9 @@ vminnmvq_f64 (float64x2_t __a) __extension__ extern __inline 
float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c) +vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { - return a + b * c; + return __a + __b * __c; } __extension__ extern __inline float64x1_t @@ -21903,16 +22218,16 @@ vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) +vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return a + b * c; + return __a + __b * __c; } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) +vmlaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) { - return a + b * c; + return __a + __b * __c; } /* vmla_lane */ @@ -22087,9 +22402,9 @@ vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c) +vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) { - return a - b * c; + return __a - __b * __c; } __extension__ extern __inline float64x1_t @@ -22101,16 +22416,16 @@ vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) +vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) { - return a - b * c; + return __a - __b * __c; } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) +vmlsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) { - return a - b * c; + return __a - __b * __c; } /* vmls_lane */ @@ -24874,419 +25189,419 @@ vqsubd_u64 (uint64_t __a, uint64_t __b) __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx) +vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) +vqtbl2_u8 (uint8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) +vqtbl2_p8 (poly8x16x2_t __tab, 
uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx) +vqtbl2q_s8 (int8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) +vqtbl2q_u8 (uint8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx) +vqtbl2q_p8 (poly8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx); } /* vqtbl3 */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx) +vqtbl3_s8 (int8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) +vqtbl3_u8 (uint8x16x3_t __tab, 
uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx) +vqtbl3_p8 (poly8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx) +vqtbl3q_s8 (int8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) +vqtbl3q_u8 (uint8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) +vqtbl3q_p8 (poly8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = 
__builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx); } /* vqtbl4 */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx) +vqtbl4_s8 (int8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx) +vqtbl4_u8 (uint8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) +vqtbl4_p8 (poly8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx) 
+vqtbl4q_s8 (int8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) +vqtbl4q_u8 (uint8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) +vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx); } /* vqtbx2 */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx) +vqtbx2_s8 (int8x8_t __r, int8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + return __builtin_aarch64_tbx4v8qi (__r, __o, (int8x8_t)__idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) +vqtbx2_u8 (uint8x8_t __r, uint8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) +vqtbx2_p8 (poly8x8_t __r, poly8x16x2_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx) +vqtbx2q_s8 (int8x16_t __r, int8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1); + return __builtin_aarch64_tbx4v16qi (__r, __o, (int8x16_t)__idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) +vqtbx2q_u8 (uint8x16_t __r, uint8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)__r, __o, + (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) +vqtbx2q_p8 (poly8x16_t __r, poly8x16x2_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1); + return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)__r, __o, + 
(int8x16_t)__idx); } /* vqtbx3 */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx) +vqtbx3_s8 (int8x8_t __r, int8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); - return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2); + return __builtin_aarch64_qtbx3v8qi (__r, __o, (int8x8_t)__idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) +vqtbx3_u8 (uint8x8_t __r, uint8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) +vqtbx3_p8 (poly8x8_t __r, poly8x16x3_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx) +vqtbx3q_s8 (int8x16_t __r, int8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); - return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2); + return __builtin_aarch64_qtbx3v16qi (__r, __o, (int8x16_t)__idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, 
uint8x16_t idx) +vqtbx3q_u8 (uint8x16_t __r, uint8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o, + (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) +vqtbx3q_p8 (poly8x16_t __r, poly8x16x3_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2); + return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o, + (int8x16_t)__idx); } /* vqtbx4 */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx) +vqtbx4_s8 (int8x8_t __r, int8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); - return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3); + return __builtin_aarch64_qtbx4v8qi (__r, __o, (int8x8_t)__idx); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) +vqtbx4_u8 (uint8x8_t __r, uint8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return 
(uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) +vqtbx4_p8 (poly8x8_t __r, poly8x16x4_t __tab, uint8x8_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx) +vqtbx4q_s8 (int8x16_t __r, int8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); - return __builtin_aarch64_qtbx4v16qi (r, __o, (int8x16_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3); + return __builtin_aarch64_qtbx4v16qi (__r, __o, (int8x16_t)__idx); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) +vqtbx4q_u8 (uint8x16_t __r, uint8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o, + (int8x16_t)__idx); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) +vqtbx4q_p8 (poly8x16_t __r, poly8x16x4_t __tab, uint8x16_t __idx) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = 
__builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3); + return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o, + (int8x16_t)__idx); } /* vrbit */ @@ -25457,134 +25772,134 @@ vrecpxd_f64 (float64_t __a) __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16_p8 (poly8x8_t a) +vrev16_p8 (poly8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16_s8 (int8x8_t a) +vrev16_s8 (int8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16_u8 (uint8x8_t a) +vrev16_u8 (uint8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16q_p8 (poly8x16_t a) +vrev16q_p8 (poly8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16q_s8 (int8x16_t a) +vrev16q_s8 (int8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16q_u8 (uint8x16_t a) +vrev16q_u8 (uint8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_p8 (poly8x8_t a) +vrev32_p8 (poly8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_p16 (poly16x4_t a) +vrev32_p16 (poly16x4_t __a) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_s8 (int8x8_t a) +vrev32_s8 (int8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vrev32_s16 (int16x4_t a) +vrev32_s16 (int16x4_t __a) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_u8 (uint8x8_t a) +vrev32_u8 (uint8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_u16 (uint16x4_t a) +vrev32_u16 (uint16x4_t __a) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_p8 (poly8x16_t a) +vrev32q_p8 (poly8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_p16 (poly16x8_t a) +vrev32q_p16 (poly16x8_t __a) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_s8 (int8x16_t a) +vrev32q_s8 (int8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_s16 (int16x8_t a) +vrev32q_s16 (int16x8_t __a) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_u8 (uint8x16_t a) +vrev32q_u8 (uint8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_u16 (uint16x8_t a) +vrev32q_u16 (uint16x8_t __a) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } __extension__ extern __inline float16x4_t @@ -25596,65 +25911,65 @@ vrev64_f16 (float16x4_t __a) __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_f32 (float32x2_t a) +vrev64_f32 (float32x2_t __a) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + return __builtin_shuffle (__a, (uint32x2_t) { 1, 0 }); } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_p8 (poly8x8_t a) +vrev64_p8 (poly8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_p16 (poly16x4_t a) +vrev64_p16 (poly16x4_t __a) { - return __builtin_shuffle 
(a, (uint16x4_t) { 3, 2, 1, 0 }); + return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_s8 (int8x8_t a) +vrev64_s8 (int8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_s16 (int16x4_t a) +vrev64_s16 (int16x4_t __a) { - return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); + return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_s32 (int32x2_t a) +vrev64_s32 (int32x2_t __a) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + return __builtin_shuffle (__a, (uint32x2_t) { 1, 0 }); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_u8 (uint8x8_t a) +vrev64_u8 (uint8x8_t __a) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_u16 (uint16x4_t a) +vrev64_u16 (uint16x4_t __a) { - return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); + return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_u32 (uint32x2_t a) +vrev64_u32 (uint32x2_t __a) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + return __builtin_shuffle (__a, (uint32x2_t) { 1, 0 }); } __extension__ extern __inline float16x8_t @@ -25666,68 +25981,68 @@ vrev64q_f16 (float16x8_t __a) __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_f32 (float32x4_t a) +vrev64q_f32 (float32x4_t __a) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_p8 (poly8x16_t a) +vrev64q_p8 (poly8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_p16 (poly16x8_t a) +vrev64q_p16 (poly16x8_t __a) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_s8 (int8x16_t a) +vrev64q_s8 (int8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_s16 (int16x8_t a) +vrev64q_s16 (int16x8_t __a) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } __extension__ extern __inline int32x4_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_s32 (int32x4_t a) +vrev64q_s32 (int32x4_t __a) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_u8 (uint8x16_t a) +vrev64q_u8 (uint8x16_t __a) { - return __builtin_shuffle (a, + return __builtin_shuffle (__a, (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_u16 (uint16x8_t a) +vrev64q_u16 (uint16x8_t __a) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_u32 (uint32x4_t a) +vrev64q_u32 (uint32x4_t __a) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); } /* vrnd */ @@ -26420,87 +26735,90 @@ vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) +vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk); + return __builtin_aarch64_crypto_sha1cv4si_uuuu (__hash_abcd, __hash_e, __wk); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) +vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk); + return __builtin_aarch64_crypto_sha1mv4si_uuuu (__hash_abcd, __hash_e, __wk); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) +vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk); + return __builtin_aarch64_crypto_sha1pv4si_uuuu (__hash_abcd, __hash_e, __wk); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1h_u32 (uint32_t hash_e) +vsha1h_u32 (uint32_t __hash_e) { - return __builtin_aarch64_crypto_sha1hsi_uu (hash_e); + return __builtin_aarch64_crypto_sha1hsi_uu (__hash_e); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11) +vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11) { - return __builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11); + return __builtin_aarch64_crypto_sha1su0v4si_uuuu (__w0_3, __w4_7, __w8_11); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15) +vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15) { - return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15); + return __builtin_aarch64_crypto_sha1su1v4si_uuu (__tw0_3, __w12_15); } __extension__ extern 
__inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk) +vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk) { - return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk); + return __builtin_aarch64_crypto_sha256hv4si_uuuu (__hash_abcd, __hash_efgh, + __wk); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk) +vsha256h2q_u32 (uint32x4_t __hash_efgh, uint32x4_t __hash_abcd, uint32x4_t __wk) { - return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk); + return __builtin_aarch64_crypto_sha256h2v4si_uuuu (__hash_efgh, __hash_abcd, + __wk); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7) +vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7) { - return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7); + return __builtin_aarch64_crypto_sha256su0v4si_uuu (__w0_3, __w4_7); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15) +vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15) { - return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15); + return __builtin_aarch64_crypto_sha256su1v4si_uuuu (__tw0_3, __w8_11, + __w12_15); } __extension__ extern __inline poly128_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_p64 (poly64_t a, poly64_t b) +vmull_p64 (poly64_t __a, poly64_t __b) { return - __builtin_aarch64_crypto_pmulldi_ppp (a, b); + __builtin_aarch64_crypto_pmulldi_ppp (__a, __b); } __extension__ extern __inline poly128_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_p64 (poly64x2_t a, poly64x2_t b) +vmull_high_p64 (poly64x2_t __a, poly64x2_t __b) { - return __builtin_aarch64_crypto_pmullv2di_ppp (a, b); + return __builtin_aarch64_crypto_pmullv2di_ppp (__a, __b); } #pragma GCC pop_options @@ -27202,30 +27520,30 @@ vsqaddd_u64 (uint64_t __a, int64_t __b) /* vsqrt */ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrt_f32 (float32x2_t a) +vsqrt_f32 (float32x2_t __a) { - return __builtin_aarch64_sqrtv2sf (a); + return __builtin_aarch64_sqrtv2sf (__a); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrtq_f32 (float32x4_t a) +vsqrtq_f32 (float32x4_t __a) { - return __builtin_aarch64_sqrtv4sf (a); + return __builtin_aarch64_sqrtv4sf (__a); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrt_f64 (float64x1_t a) +vsqrt_f64 (float64x1_t __a) { - return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) }; + return (float64x1_t) { __builtin_aarch64_sqrtdf (__a[0]) }; } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrtq_f64 (float64x2_t a) +vsqrtq_f64 (float64x2_t __a) { - return __builtin_aarch64_sqrtv2df (a); + return __builtin_aarch64_sqrtv2df (__a); } /* vsra */ @@ -27495,98 +27813,98 @@ vst1_f16 (float16_t *__a, float16x4_t __b) __extension__ extern __inline void __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f32 (float32_t *a, float32x2_t b) +vst1_f32 (float32_t *__a, float32x2_t __b) { - __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b); + __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f64 (float64_t *a, float64x1_t b) +vst1_f64 (float64_t *__a, float64x1_t __b) { - *a = b[0]; + *__a = __b[0]; } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p8 (poly8_t *a, poly8x8_t b) +vst1_p8 (poly8_t *__a, poly8x8_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, - (int8x8_t) b); + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, + (int8x8_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p16 (poly16_t *a, poly16x4_t b) +vst1_p16 (poly16_t *__a, poly16x4_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, - (int16x4_t) b); + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, + (int16x4_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p64 (poly64_t *a, poly64x1_t b) +vst1_p64 (poly64_t *__a, poly64x1_t __b) { - *a = b[0]; + *__a = __b[0]; } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s8 (int8_t *a, int8x8_t b) +vst1_s8 (int8_t *__a, int8x8_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b); + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s16 (int16_t *a, int16x4_t b) +vst1_s16 (int16_t *__a, int16x4_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b); + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s32 (int32_t *a, int32x2_t b) +vst1_s32 (int32_t *__a, int32x2_t __b) { - __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b); + __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s64 (int64_t *a, int64x1_t b) +vst1_s64 (int64_t *__a, int64x1_t __b) { - *a = b[0]; + *__a = __b[0]; } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u8 (uint8_t *a, uint8x8_t b) +vst1_u8 (uint8_t *__a, uint8x8_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, - (int8x8_t) b); + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, + (int8x8_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u16 (uint16_t *a, uint16x4_t b) +vst1_u16 (uint16_t *__a, uint16x4_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, - (int16x4_t) b); + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, + (int16x4_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u32 (uint32_t *a, uint32x2_t b) +vst1_u32 (uint32_t *__a, uint32x2_t __b) { - __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, - (int32x2_t) b); + 
__builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) __a, + (int32x2_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u64 (uint64_t *a, uint64x1_t b) +vst1_u64 (uint64_t *__a, uint64x1_t __b) { - *a = b[0]; + *__a = __b[0]; } /* vst1q */ @@ -27600,100 +27918,100 @@ vst1q_f16 (float16_t *__a, float16x8_t __b) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f32 (float32_t *a, float32x4_t b) +vst1q_f32 (float32_t *__a, float32x4_t __b) { - __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b); + __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f64 (float64_t *a, float64x2_t b) +vst1q_f64 (float64_t *__a, float64x2_t __b) { - __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b); + __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p8 (poly8_t *a, poly8x16_t b) +vst1q_p8 (poly8_t *__a, poly8x16_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, - (int8x16_t) b); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, + (int8x16_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p16 (poly16_t *a, poly16x8_t b) +vst1q_p16 (poly16_t *__a, poly16x8_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, - (int16x8_t) b); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, + (int16x8_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p64 (poly64_t *a, poly64x2_t b) +vst1q_p64 (poly64_t *__a, poly64x2_t __b) { - __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) a, - (poly64x2_t) b); + __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) __a, + (poly64x2_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s8 (int8_t *a, int8x16_t b) +vst1q_s8 (int8_t *__a, int8x16_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s16 (int16_t *a, int16x8_t b) +vst1q_s16 (int16_t *__a, int16x8_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s32 (int32_t *a, int32x4_t b) +vst1q_s32 (int32_t *__a, int32x4_t __b) { - __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b); + __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s64 (int64_t *a, int64x2_t b) +vst1q_s64 (int64_t *__a, int64x2_t __b) { - __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b); + __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) __a, __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u8 (uint8_t *a, uint8x16_t b) +vst1q_u8 (uint8_t *__a, 
uint8x16_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, - (int8x16_t) b); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, + (int8x16_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u16 (uint16_t *a, uint16x8_t b) +vst1q_u16 (uint16_t *__a, uint16x8_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, - (int16x8_t) b); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, + (int16x8_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u32 (uint32_t *a, uint32x4_t b) +vst1q_u32 (uint32_t *__a, uint32x4_t __b) { - __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, - (int32x4_t) b); + __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) __a, + (int32x4_t) __b); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u64 (uint64_t *a, uint64x2_t b) +vst1q_u64 (uint64_t *__a, uint64x2_t __b) { - __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, - (int64x2_t) b); + __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) __a, + (int64x2_t) __b); } /* vst1_lane */ @@ -27900,327 +28218,343 @@ vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s64_x2 (int64_t * __a, int64x1x2_t val) +vst1_s64_x2 (int64_t * __a, int64x1x2_t __val) { __builtin_aarch64_simd_oi __o; - int64x2x2_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); + int64x2x2_t __temp; + __temp.val[0] + = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] + = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val) +vst1_u64_x2 (uint64_t * __a, uint64x1x2_t __val) { __builtin_aarch64_simd_oi __o; - uint64x2x2_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); + uint64x2x2_t __temp; + __temp.val[0] + = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] + = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f64_x2 (float64_t * __a, float64x1x2_t val) +vst1_f64_x2 (float64_t * __a, float64x1x2_t __val) { 
__builtin_aarch64_simd_oi __o; - float64x2x2_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1); + float64x2x2_t __temp; + __temp.val[0] + = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] + = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s8_x2 (int8_t * __a, int8x8x2_t val) +vst1_s8_x2 (int8_t * __a, int8x8x2_t __val) { __builtin_aarch64_simd_oi __o; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); + int8x16x2_t __temp; + __temp.val[0] + = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] + = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val) +vst1_p8_x2 (poly8_t * __a, poly8x8x2_t __val) { __builtin_aarch64_simd_oi __o; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); + poly8x16x2_t __temp; + __temp.val[0] + = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] + = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s16_x2 (int16_t * __a, int16x4x2_t val) +vst1_s16_x2 (int16_t * __a, int16x4x2_t __val) { __builtin_aarch64_simd_oi __o; - int16x8x2_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + int16x8x2_t __temp; + __temp.val[0] + = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] + = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi 
(__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val) +vst1_p16_x2 (poly16_t * __a, poly16x4x2_t __val) { __builtin_aarch64_simd_oi __o; - poly16x8x2_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + poly16x8x2_t __temp; + __temp.val[0] + = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] + = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s32_x2 (int32_t * __a, int32x2x2_t val) +vst1_s32_x2 (int32_t * __a, int32x2x2_t __val) { __builtin_aarch64_simd_oi __o; - int32x4x2_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); + int32x4x2_t __temp; + __temp.val[0] + = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] + = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val) +vst1_u8_x2 (uint8_t * __a, uint8x8x2_t __val) { __builtin_aarch64_simd_oi __o; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); + uint8x16x2_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val) +vst1_u16_x2 (uint16_t * __a, uint16x4x2_t __val) { __builtin_aarch64_simd_oi __o; - uint16x8x2_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C 
(0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); + uint16x8x2_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val) +vst1_u32_x2 (uint32_t * __a, uint32x2x2_t __val) { __builtin_aarch64_simd_oi __o; - uint32x4x2_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); + uint32x4x2_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f16_x2 (float16_t * __a, float16x4x2_t val) +vst1_f16_x2 (float16_t * __a, float16x4x2_t __val) { __builtin_aarch64_simd_oi __o; - float16x8x2_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1); + float16x8x2_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[1], 1); __builtin_aarch64_st1x2v4hf (__a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f32_x2 (float32_t * __a, float32x2x2_t val) +vst1_f32_x2 (float32_t * __a, float32x2x2_t __val) { __builtin_aarch64_simd_oi __o; - float32x4x2_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); + float32x4x2_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val) +vst1_p64_x2 (poly64_t * __a, poly64x1x2_t __val) { __builtin_aarch64_simd_oi __o; - poly64x2x2_t temp; - temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + poly64x2x2_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) temp.val[0], 0); + (poly64x2_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) temp.val[1], 1); + (poly64x2_t) __temp.val[1], 1); __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s8_x2 (int8_t * __a, int8x16x2_t val) +vst1q_s8_x2 (int8_t * __a, int8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val) +vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s16_x2 (int16_t * __a, int16x8x2_t val) +vst1q_s16_x2 (int16_t * __a, int16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val) +vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s32_x2 (int32_t * __a, int32x4x2_t val) +vst1q_s32_x2 (int32_t * __a, int32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = 
__builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s64_x2 (int64_t * __a, int64x2x2_t val) +vst1q_s64_x2 (int64_t * __a, int64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val) +vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val) +vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val) +vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val) +vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f16_x2 (float16_t * 
__a, float16x8x2_t val) +vst1q_f16_x2 (float16_t * __a, float16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[1], 1); __builtin_aarch64_st1x2v8hf (__a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f32_x2 (float32_t * __a, float32x4x2_t val) +vst1q_f32_x2 (float32_t * __a, float32x4x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[1], 1); __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f64_x2 (float64_t * __a, float64x2x2_t val) +vst1q_f64_x2 (float64_t * __a, float64x2x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[1], 1); __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val) +vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t __val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) val.val[0], 0); + (poly64x2_t) __val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) val.val[1], 1); + (poly64x2_t) __val.val[1], 1); __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -28228,1483 +28562,1709 @@ vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s64_x3 (int64_t * __a, int64x1x3_t val) +vst1_s64_x3 (int64_t * __a, int64x1x3_t __val) { __builtin_aarch64_simd_ci __o; - int64x2x3_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); + int64x2x3_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) 
__a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u64_x3 (uint64_t * __a, uint64x1x3_t val) +vst1_u64_x3 (uint64_t * __a, uint64x1x3_t __val) { __builtin_aarch64_simd_ci __o; - uint64x2x3_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); + uint64x2x3_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f64_x3 (float64_t * __a, float64x1x3_t val) +vst1_f64_x3 (float64_t * __a, float64x1x3_t __val) { __builtin_aarch64_simd_ci __o; - float64x2x3_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2); + float64x2x3_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s8_x3 (int8_t * __a, int8x8x3_t val) +vst1_s8_x3 (int8_t * __a, int8x8x3_t __val) { __builtin_aarch64_simd_ci __o; - int8x16x3_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + int8x16x3_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + 
__temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p8_x3 (poly8_t * __a, poly8x8x3_t val) +vst1_p8_x3 (poly8_t * __a, poly8x8x3_t __val) { __builtin_aarch64_simd_ci __o; - poly8x16x3_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + poly8x16x3_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s16_x3 (int16_t * __a, int16x4x3_t val) +vst1_s16_x3 (int16_t * __a, int16x4x3_t __val) { __builtin_aarch64_simd_ci __o; - int16x8x3_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + int16x8x3_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p16_x3 (poly16_t * __a, poly16x4x3_t val) +vst1_p16_x3 (poly16_t * __a, poly16x4x3_t __val) { __builtin_aarch64_simd_ci __o; - poly16x8x3_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 
0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + poly16x8x3_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s32_x3 (int32_t * __a, int32x2x3_t val) +vst1_s32_x3 (int32_t * __a, int32x2x3_t __val) { __builtin_aarch64_simd_ci __o; - int32x4x3_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); + int32x4x3_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u8_x3 (uint8_t * __a, uint8x8x3_t val) +vst1_u8_x3 (uint8_t * __a, uint8x8x3_t __val) { __builtin_aarch64_simd_ci __o; - uint8x16x3_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + uint8x16x3_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u16_x3 (uint16_t * __a, uint16x4x3_t val) +vst1_u16_x3 (uint16_t * __a, uint16x4x3_t __val) { 
__builtin_aarch64_simd_ci __o; - uint16x8x3_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + uint16x8x3_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u32_x3 (uint32_t * __a, uint32x2x3_t val) +vst1_u32_x3 (uint32_t * __a, uint32x2x3_t __val) { __builtin_aarch64_simd_ci __o; - uint32x4x3_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); + uint32x4x3_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f16_x3 (float16_t * __a, float16x4x3_t val) +vst1_f16_x3 (float16_t * __a, float16x4x3_t __val) { __builtin_aarch64_simd_ci __o; - float16x8x3_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2); + float16x8x3_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf 
(__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f32_x3 (float32_t * __a, float32x2x3_t val) +vst1_f32_x3 (float32_t * __a, float32x2x3_t __val) { __builtin_aarch64_simd_ci __o; - float32x4x3_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); + float32x4x3_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p64_x3 (poly64_t * __a, poly64x1x3_t val) +vst1_p64_x3 (poly64_t * __a, poly64x1x3_t __val) { __builtin_aarch64_simd_ci __o; - poly64x2x3_t temp; - temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + poly64x2x3_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[0], 0); + (poly64x2_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[1], 1); + (poly64x2_t) __temp.val[1], 1); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[2], 2); + (poly64x2_t) __temp.val[2], 2); __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s8_x3 (int8_t * __a, int8x16x3_t val) +vst1q_s8_x3 (int8_t * __a, int8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t val) +vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s16_x3 (int16_t * __a, int16x8x3_t val) +vst1q_s16_x3 (int16_t * __a, int16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t val) +vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s32_x3 (int32_t * __a, int32x4x3_t val) +vst1q_s32_x3 (int32_t * __a, int32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s64_x3 (int64_t * __a, int64x2x3_t val) +vst1q_s64_x3 (int64_t * __a, int64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); + __o = 
__builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t val) +vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t val) +vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t val) +vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t val) +vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f16_x3 (float16_t * __a, float16x8x3_t val) +vst1q_f16_x3 (float16_t * __a, float16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = 
__builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[2], 2); __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f32_x3 (float32_t * __a, float32x4x3_t val) +vst1q_f32_x3 (float32_t * __a, float32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[2], 2); __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f64_x3 (float64_t * __a, float64x2x3_t val) +vst1q_f64_x3 (float64_t * __a, float64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[2], 2); __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t val) +vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[0], 0); + (poly64x2_t) __val.val[0], 0); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[1], 1); + (poly64x2_t) __val.val[1], 1); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[2], 2); + (poly64x2_t) __val.val[2], 2); __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -/* vstn */ +/* vst1(q)_x4. 
*/ __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_s64 (int64_t * __a, int64x1x2_t val) +vst1_s8_x4 (int8_t * __a, int8x8x4_t val) { - __builtin_aarch64_simd_oi __o; - int64x2x2_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_u64 (uint64_t * __a, uint64x1x2_t val) +vst1q_s8_x4 (int8_t * __a, int8x16x4_t val) { - __builtin_aarch64_simd_oi __o; - uint64x2x2_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_f64 (float64_t * __a, float64x1x2_t val) +vst1_s16_x4 (int16_t * __a, int16x4x4_t val) { - __builtin_aarch64_simd_oi __o; - float64x2x2_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1); - __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); + union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_s8 (int8_t * __a, int8x8x2_t val) +vst1q_s16_x4 (int16_t * __a, int16x8x4_t val) { - __builtin_aarch64_simd_oi __o; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_p8 (poly8_t * __a, poly8x8x2_t val) +vst1_s32_x4 (int32_t * __a, int32x2x4_t val) { - __builtin_aarch64_simd_oi __o; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = 
__builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_s16 (int16_t * __a, int16x4x2_t val) +vst1q_s32_x4 (int32_t * __a, int32x4x4_t val) { - __builtin_aarch64_simd_oi __o; - int16x8x2_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_p16 (poly16_t * __a, poly16x4x2_t val) +vst1_u8_x4 (uint8_t * __a, uint8x8x4_t val) { - __builtin_aarch64_simd_oi __o; - poly16x8x2_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_s32 (int32_t * __a, int32x2x2_t val) +vst1q_u8_x4 (uint8_t * __a, uint8x16x4_t val) { - __builtin_aarch64_simd_oi __o; - int32x4x2_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); + union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_u8 (uint8_t * __a, uint8x8x2_t val) +vst1_u16_x4 (uint16_t * __a, uint16x4x4_t val) { - __builtin_aarch64_simd_oi __o; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + union { uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_u16 (uint16_t * __a, uint16x4x2_t val) +vst1q_u16_x4 (uint16_t * __a, uint16x8x4_t val) { - __builtin_aarch64_simd_oi __o; - uint16x8x2_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_u32 (uint32_t * __a, uint32x2x2_t val) +vst1_u32_x4 (uint32_t * __a, uint32x2x4_t val) { - __builtin_aarch64_simd_oi __o; - uint32x4x2_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); + union { uint32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_f16 (float16_t * __a, float16x4x2_t val) +vst1q_u32_x4 (uint32_t * __a, uint32x4x4_t val) { - __builtin_aarch64_simd_oi __o; - float16x8x2_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1); - __builtin_aarch64_st2v4hf (__a, __o); + union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_f32 (float32_t * __a, float32x2x2_t val) +vst1_f16_x4 (float16_t * __a, float16x4x4_t val) { - __builtin_aarch64_simd_oi __o; - float32x4x2_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_p64 (poly64_t * __a, poly64x1x2_t val) +vst1q_f16_x4 (float16_t * __a, float16x8x4_t val) { - __builtin_aarch64_simd_oi __o; - poly64x2x2_t temp; - temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, 
- (poly64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_s8 (int8_t * __a, int8x16x2_t val) +vst1_f32_x4 (float32_t * __a, float32x2x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_p8 (poly8_t * __a, poly8x16x2_t val) +vst1q_f32_x4 (float32_t * __a, float32x4x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_s16 (int16_t * __a, int16x8x2_t val) +vst1_p8_x4 (poly8_t * __a, poly8x8x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + union { poly8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_p16 (poly16_t * __a, poly16x8x2_t val) +vst1q_p8_x4 (poly8_t * __a, poly8x16x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_s32 (int32_t * __a, int32x4x2_t val) +vst1_p16_x4 (poly16_t * __a, poly16x4x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); + union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_s64 (int64_t * __a, int64x2x2_t val) +vst1q_p16_x4 (poly16_t * __a, 
poly16x8x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); + union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_u8 (uint8_t * __a, uint8x16x2_t val) +vst1_s64_x4 (int64_t * __a, int64x1x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_u16 (uint16_t * __a, uint16x8x2_t val) +vst1_u64_x4 (uint64_t * __a, uint64x1x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_u32 (uint32_t * __a, uint32x4x2_t val) +vst1_p64_x4 (poly64_t * __a, poly64x1x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); + union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_u64 (uint64_t * __a, uint64x2x2_t val) +vst1q_s64_x4 (int64_t * __a, int64x2x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); + union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_f16 (float16_t * __a, float16x8x2_t val) +vst1q_u64_x4 (uint64_t * __a, uint64x2x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1); - __builtin_aarch64_st2v8hf (__a, __o); + union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_f32 (float32_t * __a, float32x4x2_t val) 
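/* A minimal, target-independent sketch of the idiom used by the vst1(q)_x4
   intrinsics above: the __builtin_aarch64_simd_xi argument is built with a
   single union initialisation and read back through the other member (a
   reinterpretation GCC documents as supported), rather than with the
   per-lane __builtin_aarch64_set_qreg* calls used elsewhere in the header.
   The val -> __val and temp -> __temp renames in the rest of the hunk keep
   the header's local identifiers in the implementation-reserved "__"
   namespace, out of the reach of user macros such as "#define val ...".
   tuple4_t, builtin4_t and reinterpret_tuple below are hypothetical
   stand-ins for illustration, not types or functions from the patch.  */

#include <stdint.h>

typedef struct { uint64_t val[4]; } tuple4_t;     /* stand-in for a NEON x4 tuple type  */
typedef struct { uint64_t regs[4]; } builtin4_t;  /* stand-in for the builtin aggregate */

static inline builtin4_t
reinterpret_tuple (tuple4_t __v)
{
  /* One union initialisation replaces four per-lane copies; the value is
     then read back through the other union member.  */
  union { tuple4_t __i; builtin4_t __o; } __u = { __v };
  return __u.__o;
}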
+vst1q_p64_x4 (poly64_t * __a, poly64x2x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_f64 (float64_t * __a, float64x2x2_t val) +vst1_f64_x4 (float64_t * __a, float64x1x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o); + union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4df ((__builtin_aarch64_simd_df *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_p64 (poly64_t * __a, poly64x2x2_t val) +vst1q_f64_x4 (float64_t * __a, float64x2x4_t val) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); + union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2df ((__builtin_aarch64_simd_df *) __a, __u.__o); } +/* vstn */ + __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_s64 (int64_t * __a, int64x1x3_t val) +vst2_s64 (int64_t * __a, int64x1x2_t __val) { - __builtin_aarch64_simd_ci __o; - int64x2x3_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + __builtin_aarch64_simd_oi __o; + int64x2x2_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_u64 (uint64_t * __a, uint64x1x3_t val) +vst2_u64 (uint64_t * __a, uint64x1x2_t __val) { - __builtin_aarch64_simd_ci __o; - uint64x2x3_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = 
__builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + __builtin_aarch64_simd_oi __o; + uint64x2x2_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_f64 (float64_t * __a, float64x1x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + float64x2x2_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); + __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_s8 (int8_t * __a, int8x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + int8x16x2_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_p8 (poly8_t * __a, poly8x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + poly8x16x2_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_s16 (int16_t * __a, int16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + int16x8x2_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_p16 (poly16_t * __a, poly16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + poly16x8x2_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 
0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_s32 (int32_t * __a, int32x2x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + int32x4x2_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_u8 (uint8_t * __a, uint8x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + uint8x16x2_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_u16 (uint16_t * __a, uint16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + uint16x8x2_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_u32 (uint32_t * __a, uint32x2x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + uint32x4x2_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_f16 (float16_t * __a, float16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + float16x8x2_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[1], 1); + __builtin_aarch64_st2v4hf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_f32 (float32_t * __a, float32x2x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + float32x4x2_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = 
__builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_p64 (poly64_t * __a, poly64x1x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + poly64x2x2_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __temp.val[1], 1); + __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_s8 (int8_t * __a, int8x16x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_p8 (poly8_t * __a, poly8x16x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_s16 (int16_t * __a, int16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_p16 (poly16_t * __a, poly16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_s32 (int32_t * __a, int32x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_s64 (int64_t * __a, int64x2x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_u8 (uint8_t * __a, uint8x16x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = 
__builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_u16 (uint16_t * __a, uint16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_u32 (uint32_t * __a, uint32x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_u64 (uint64_t * __a, uint64x2x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_f16 (float16_t * __a, float16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[1], 1); + __builtin_aarch64_st2v8hf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_f32 (float32_t * __a, float32x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[1], 1); + __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_f64 (float64_t * __a, float64x2x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[1], 1); + __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_p64 (poly64_t * __a, poly64x2x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_s64 (int64_t * __a, int64x1x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + int64x2x3_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], 
vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_u64 (uint64_t * __a, uint64x1x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + uint64x2x3_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_f64 (float64_t * __a, float64x1x3_t val) +vst3_f64 (float64_t * __a, float64x1x3_t __val) { __builtin_aarch64_simd_ci __o; - float64x2x3_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2); + float64x2x3_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_s8 (int8_t * __a, int8x8x3_t val) +vst3_s8 (int8_t * __a, int8x8x3_t __val) { __builtin_aarch64_simd_ci __o; - int8x16x3_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + int8x16x3_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) 
__temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_p8 (poly8_t * __a, poly8x8x3_t val) +vst3_p8 (poly8_t * __a, poly8x8x3_t __val) { __builtin_aarch64_simd_ci __o; - poly8x16x3_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + poly8x16x3_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_s16 (int16_t * __a, int16x4x3_t val) +vst3_s16 (int16_t * __a, int16x4x3_t __val) { __builtin_aarch64_simd_ci __o; - int16x8x3_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + int16x8x3_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_p16 (poly16_t * __a, poly16x4x3_t val) +vst3_p16 (poly16_t * __a, poly16x4x3_t __val) { __builtin_aarch64_simd_ci __o; - poly16x8x3_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + poly16x8x3_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 
(__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_s32 (int32_t * __a, int32x2x3_t val) +vst3_s32 (int32_t * __a, int32x2x3_t __val) { __builtin_aarch64_simd_ci __o; - int32x4x3_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); + int32x4x3_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_u8 (uint8_t * __a, uint8x8x3_t val) +vst3_u8 (uint8_t * __a, uint8x8x3_t __val) { __builtin_aarch64_simd_ci __o; - uint8x16x3_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); + uint8x16x3_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_u16 (uint16_t * __a, uint16x4x3_t val) +vst3_u16 (uint16_t * __a, uint16x4x3_t __val) { __builtin_aarch64_simd_ci __o; - uint16x8x3_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 
(__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); + uint16x8x3_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_u32 (uint32_t * __a, uint32x2x3_t val) +vst3_u32 (uint32_t * __a, uint32x2x3_t __val) { __builtin_aarch64_simd_ci __o; - uint32x4x3_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); + uint32x4x3_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_f16 (float16_t * __a, float16x4x3_t val) +vst3_f16 (float16_t * __a, float16x4x3_t __val) { __builtin_aarch64_simd_ci __o; - float16x8x3_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2); + float16x8x3_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vst3_f32 (float32_t * __a, float32x2x3_t val) +vst3_f32 (float32_t * __a, float32x2x3_t __val) { __builtin_aarch64_simd_ci __o; - float32x4x3_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); + float32x4x3_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_p64 (poly64_t * __a, poly64x1x3_t val) +vst3_p64 (poly64_t * __a, poly64x1x3_t __val) { __builtin_aarch64_simd_ci __o; - poly64x2x3_t temp; - temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + poly64x2x3_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[0], 0); + (poly64x2_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[1], 1); + (poly64x2_t) __temp.val[1], 1); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[2], 2); + (poly64x2_t) __temp.val[2], 2); __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_s8 (int8_t * __a, int8x16x3_t val) +vst3q_s8 (int8_t * __a, int8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_p8 (poly8_t * __a, poly8x16x3_t val) +vst3q_p8 (poly8_t * __a, poly8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - 
__o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_s16 (int16_t * __a, int16x8x3_t val) +vst3q_s16 (int16_t * __a, int16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_p16 (poly16_t * __a, poly16x8x3_t val) +vst3q_p16 (poly16_t * __a, poly16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_s32 (int32_t * __a, int32x4x3_t val) +vst3q_s32 (int32_t * __a, int32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_s64 (int64_t * __a, int64x2x3_t val) +vst3q_s64 (int64_t * __a, int64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_u8 (uint8_t * __a, uint8x16x3_t val) +vst3q_u8 
(uint8_t * __a, uint8x16x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_u16 (uint16_t * __a, uint16x8x3_t val) +vst3q_u16 (uint16_t * __a, uint16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_u32 (uint32_t * __a, uint32x4x3_t val) +vst3q_u32 (uint32_t * __a, uint32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_u64 (uint64_t * __a, uint64x2x3_t val) +vst3q_u64 (uint64_t * __a, uint64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_f16 (float16_t * __a, float16x8x3_t val) +vst3q_f16 (float16_t * __a, float16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) 
__val.val[2], 2); __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_f32 (float32_t * __a, float32x4x3_t val) +vst3q_f32 (float32_t * __a, float32x4x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[2], 2); __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_f64 (float64_t * __a, float64x2x3_t val) +vst3q_f64 (float64_t * __a, float64x2x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[2], 2); __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_p64 (poly64_t * __a, poly64x2x3_t val) +vst3q_p64 (poly64_t * __a, poly64x2x3_t __val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[0], 0); + (poly64x2_t) __val.val[0], 0); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[1], 1); + (poly64x2_t) __val.val[1], 1); __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[2], 2); + (poly64x2_t) __val.val[2], 2); __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_s64 (int64_t * __a, int64x1x4_t val) +vst4_s64 (int64_t * __a, int64x1x4_t __val) { __builtin_aarch64_simd_xi __o; - int64x2x4_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); + int64x2x4_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s64 (__val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, 
(int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_u64 (uint64_t * __a, uint64x1x4_t val) +vst4_u64 (uint64_t * __a, uint64x1x4_t __val) { __builtin_aarch64_simd_xi __o; - uint64x2x4_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); + uint64x2x4_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u64 (__val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_f64 (float64_t * __a, float64x1x4_t val) +vst4_f64 (float64_t * __a, float64x1x4_t __val) { __builtin_aarch64_simd_xi __o; - float64x2x4_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3); + float64x2x4_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f64 (__val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[3], 3); __builtin_aarch64_st4df 
((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_s8 (int8_t * __a, int8x8x4_t val) +vst4_s8 (int8_t * __a, int8x8x4_t __val) { __builtin_aarch64_simd_xi __o; - int8x16x4_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + int8x16x4_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s8 (__val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_p8 (poly8_t * __a, poly8x8x4_t val) +vst4_p8 (poly8_t * __a, poly8x8x4_t __val) { __builtin_aarch64_simd_xi __o; - poly8x16x4_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + poly8x16x4_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p8 (__val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_s16 (int16_t * __a, int16x4x4_t val) +vst4_s16 (int16_t * __a, int16x4x4_t __val) { __builtin_aarch64_simd_xi __o; - int16x8x4_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 
(__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + int16x8x4_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s16 (__val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_p16 (poly16_t * __a, poly16x4x4_t val) +vst4_p16 (poly16_t * __a, poly16x4x4_t __val) { __builtin_aarch64_simd_xi __o; - poly16x8x4_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + poly16x8x4_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p16 (__val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_s32 (int32_t * __a, int32x2x4_t val) +vst4_s32 (int32_t * __a, int32x2x4_t __val) { __builtin_aarch64_simd_xi __o; - int32x4x4_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) 
temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); + int32x4x4_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s32 (__val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_u8 (uint8_t * __a, uint8x8x4_t val) +vst4_u8 (uint8_t * __a, uint8x8x4_t __val) { __builtin_aarch64_simd_xi __o; - uint8x16x4_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); + uint8x16x4_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u8 (__val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_u16 (uint16_t * __a, uint16x4x4_t val) +vst4_u16 (uint16_t * __a, uint16x4x4_t __val) { __builtin_aarch64_simd_xi __o; - uint16x8x4_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); + uint16x8x4_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 
(__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u16 (__val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_u32 (uint32_t * __a, uint32x2x4_t val) +vst4_u32 (uint32_t * __a, uint32x2x4_t __val) { __builtin_aarch64_simd_xi __o; - uint32x4x4_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); + uint32x4x4_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u32 (__val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_f16 (float16_t * __a, float16x4x4_t val) +vst4_f16 (float16_t * __a, float16x4x4_t __val) { __builtin_aarch64_simd_xi __o; - float16x8x4_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3); + float16x8x4_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f16 (__val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = 
__builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[3], 3); __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_f32 (float32_t * __a, float32x2x4_t val) +vst4_f32 (float32_t * __a, float32x2x4_t __val) { __builtin_aarch64_simd_xi __o; - float32x4x4_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3); + float32x4x4_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f32 (__val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[3], 3); __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_p64 (poly64_t * __a, poly64x1x4_t val) +vst4_p64 (poly64_t * __a, poly64x1x4_t __val) { __builtin_aarch64_simd_xi __o; - poly64x2x4_t temp; - temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_p64 (val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); + poly64x2x4_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p64 (__val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) temp.val[0], 0); + (poly64x2_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) temp.val[1], 1); + (poly64x2_t) __temp.val[1], 1); __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) temp.val[2], 2); + (poly64x2_t) __temp.val[2], 2); __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) temp.val[3], 3); + (poly64x2_t) __temp.val[3], 3); __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_s8 (int8_t * __a, int8x16x4_t val) +vst4q_s8 (int8_t * __a, int8x16x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_p8 (poly8_t * __a, poly8x16x4_t val) +vst4q_p8 (poly8_t * __a, poly8x16x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_s16 (int16_t * __a, int16x8x4_t val) +vst4q_s16 (int16_t * __a, int16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_p16 (poly16_t * __a, poly16x8x4_t val) +vst4q_p16 (poly16_t * __a, poly16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); __builtin_aarch64_st4v8hi 
((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_s32 (int32_t * __a, int32x4x4_t val) +vst4q_s32 (int32_t * __a, int32x4x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[3], 3); __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_s64 (int64_t * __a, int64x2x4_t val) +vst4q_s64 (int64_t * __a, int64x2x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[3], 3); __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_u8 (uint8_t * __a, uint8x16x4_t val) +vst4q_u8 (uint8_t * __a, uint8x16x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_u16 (uint16_t * __a, uint16x8x4_t val) +vst4q_u16 (uint16_t * __a, uint16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); + __o = 
__builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_u32 (uint32_t * __a, uint32x4x4_t val) +vst4q_u32 (uint32_t * __a, uint32x4x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[3], 3); __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_u64 (uint64_t * __a, uint64x2x4_t val) +vst4q_u64 (uint64_t * __a, uint64x2x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[3], 3); __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_f16 (float16_t * __a, float16x8x4_t val) +vst4q_f16 (float16_t * __a, float16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[3], 3); __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_f32 (float32_t * __a, float32x4x4_t val) +vst4q_f32 (float32_t * __a, float32x4x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, 
(float32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[3], 3); __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_f64 (float64_t * __a, float64x2x4_t val) +vst4q_f64 (float64_t * __a, float64x2x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[3], 3); __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_p64 (poly64_t * __a, poly64x2x4_t val) +vst4q_p64 (poly64_t * __a, poly64x2x4_t __val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) val.val[0], 0); + (poly64x2_t) __val.val[0], 0); __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) val.val[1], 1); + (poly64x2_t) __val.val[1], 1); __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) val.val[2], 2); + (poly64x2_t) __val.val[2], 2); __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) val.val[3], 3); + (poly64x2_t) __val.val[3], 3); __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } @@ -29796,53 +30356,53 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16x2_t temp; + int8x8_t __result; + int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); - temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); + __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16x2_t temp; + uint8x8_t __result; + uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); - temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); + __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) 
temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x16_t) __temp.val[1], 1); + __result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16x2_t temp; + poly8x8_t __result; + poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); - temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); + __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x16_t) __temp.val[1], 1); + __result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, (int8x8_t)__idx); - return result; + return __result; } /* vtrn */ @@ -30374,65 +30934,65 @@ vtrn_f16 (float16x4_t __a, float16x4_t __b) __extension__ extern __inline float32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_f32 (float32x2_t a, float32x2_t b) +vtrn_f32 (float32x2_t __a, float32x2_t __b) { - return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)}; + return (float32x2x2_t) {vtrn1_f32 (__a, __b), vtrn2_f32 (__a, __b)}; } __extension__ extern __inline poly8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_p8 (poly8x8_t a, poly8x8_t b) +vtrn_p8 (poly8x8_t __a, poly8x8_t __b) { - return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)}; + return (poly8x8x2_t) {vtrn1_p8 (__a, __b), vtrn2_p8 (__a, __b)}; } __extension__ extern __inline poly16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_p16 (poly16x4_t a, poly16x4_t b) +vtrn_p16 (poly16x4_t __a, poly16x4_t __b) { - return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)}; + return (poly16x4x2_t) {vtrn1_p16 (__a, __b), vtrn2_p16 (__a, __b)}; } __extension__ extern __inline int8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_s8 (int8x8_t a, int8x8_t b) +vtrn_s8 (int8x8_t __a, int8x8_t __b) { - return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)}; + return (int8x8x2_t) {vtrn1_s8 (__a, __b), vtrn2_s8 (__a, __b)}; } __extension__ extern __inline int16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_s16 (int16x4_t a, int16x4_t b) +vtrn_s16 (int16x4_t __a, int16x4_t __b) { - return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)}; + return (int16x4x2_t) {vtrn1_s16 (__a, __b), vtrn2_s16 (__a, __b)}; } __extension__ extern __inline int32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_s32 (int32x2_t a, int32x2_t b) +vtrn_s32 (int32x2_t __a, int32x2_t __b) { - return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)}; + return (int32x2x2_t) {vtrn1_s32 (__a, __b), vtrn2_s32 (__a, __b)}; } __extension__ extern __inline uint8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_u8 (uint8x8_t a, uint8x8_t b) +vtrn_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)}; + return (uint8x8x2_t) {vtrn1_u8 (__a, __b), vtrn2_u8 (__a, __b)}; } __extension__ extern __inline 
uint16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_u16 (uint16x4_t a, uint16x4_t b) +vtrn_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)}; + return (uint16x4x2_t) {vtrn1_u16 (__a, __b), vtrn2_u16 (__a, __b)}; } __extension__ extern __inline uint32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_u32 (uint32x2_t a, uint32x2_t b) +vtrn_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)}; + return (uint32x2x2_t) {vtrn1_u32 (__a, __b), vtrn2_u32 (__a, __b)}; } __extension__ extern __inline float16x8x2_t @@ -30444,65 +31004,65 @@ vtrnq_f16 (float16x8_t __a, float16x8_t __b) __extension__ extern __inline float32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_f32 (float32x4_t a, float32x4_t b) +vtrnq_f32 (float32x4_t __a, float32x4_t __b) { - return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)}; + return (float32x4x2_t) {vtrn1q_f32 (__a, __b), vtrn2q_f32 (__a, __b)}; } __extension__ extern __inline poly8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_p8 (poly8x16_t a, poly8x16_t b) +vtrnq_p8 (poly8x16_t __a, poly8x16_t __b) { - return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)}; + return (poly8x16x2_t) {vtrn1q_p8 (__a, __b), vtrn2q_p8 (__a, __b)}; } __extension__ extern __inline poly16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_p16 (poly16x8_t a, poly16x8_t b) +vtrnq_p16 (poly16x8_t __a, poly16x8_t __b) { - return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)}; + return (poly16x8x2_t) {vtrn1q_p16 (__a, __b), vtrn2q_p16 (__a, __b)}; } __extension__ extern __inline int8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_s8 (int8x16_t a, int8x16_t b) +vtrnq_s8 (int8x16_t __a, int8x16_t __b) { - return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)}; + return (int8x16x2_t) {vtrn1q_s8 (__a, __b), vtrn2q_s8 (__a, __b)}; } __extension__ extern __inline int16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_s16 (int16x8_t a, int16x8_t b) +vtrnq_s16 (int16x8_t __a, int16x8_t __b) { - return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)}; + return (int16x8x2_t) {vtrn1q_s16 (__a, __b), vtrn2q_s16 (__a, __b)}; } __extension__ extern __inline int32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_s32 (int32x4_t a, int32x4_t b) +vtrnq_s32 (int32x4_t __a, int32x4_t __b) { - return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)}; + return (int32x4x2_t) {vtrn1q_s32 (__a, __b), vtrn2q_s32 (__a, __b)}; } __extension__ extern __inline uint8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_u8 (uint8x16_t a, uint8x16_t b) +vtrnq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)}; + return (uint8x16x2_t) {vtrn1q_u8 (__a, __b), vtrn2q_u8 (__a, __b)}; } __extension__ extern __inline uint16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_u16 (uint16x8_t a, uint16x8_t b) +vtrnq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)}; + return (uint16x8x2_t) {vtrn1q_u16 (__a, __b), vtrn2q_u16 (__a, __b)}; } __extension__ extern __inline uint32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_u32 (uint32x4_t a, uint32x4_t b) +vtrnq_u32 
(uint32x4_t __a, uint32x4_t __b) { - return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)}; + return (uint32x4x2_t) {vtrn1q_u32 (__a, __b), vtrn2q_u32 (__a, __b)}; } /* vtst */ @@ -32200,30 +32760,30 @@ vrndxq_f16 (float16x8_t __a) __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrte_f16 (float16x4_t a) +vrsqrte_f16 (float16x4_t __a) { - return __builtin_aarch64_rsqrtev4hf (a); + return __builtin_aarch64_rsqrtev4hf (__a); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrteq_f16 (float16x8_t a) +vrsqrteq_f16 (float16x8_t __a) { - return __builtin_aarch64_rsqrtev8hf (a); + return __builtin_aarch64_rsqrtev8hf (__a); } __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrt_f16 (float16x4_t a) +vsqrt_f16 (float16x4_t __a) { - return __builtin_aarch64_sqrtv4hf (a); + return __builtin_aarch64_sqrtv4hf (__a); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrtq_f16 (float16x8_t a) +vsqrtq_f16 (float16x8_t __a) { - return __builtin_aarch64_sqrtv8hf (a); + return __builtin_aarch64_sqrtv8hf (__a); } /* ARMv8.2-A FP16 two operands vector intrinsics. */ @@ -32244,16 +32804,16 @@ vaddq_f16 (float16x8_t __a, float16x8_t __b) __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_f16 (float16x4_t a, float16x4_t b) +vabd_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_fabdv4hf (a, b); + return __builtin_aarch64_fabdv4hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_f16 (float16x8_t a, float16x8_t b) +vabdq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_fabdv8hf (a, b); + return __builtin_aarch64_fabdv8hf (__a, __b); } __extension__ extern __inline uint16x4_t @@ -32538,72 +33098,72 @@ vmulxq_f16 (float16x8_t __a, float16x8_t __b) __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_f16 (float16x4_t a, float16x4_t b) +vpadd_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_faddpv4hf (a, b); + return __builtin_aarch64_faddpv4hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_f16 (float16x8_t a, float16x8_t b) +vpaddq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_faddpv8hf (a, b); + return __builtin_aarch64_faddpv8hf (__a, __b); } __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_f16 (float16x4_t a, float16x4_t b) +vpmax_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_smax_nanpv4hf (a, b); + return __builtin_aarch64_smax_nanpv4hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_f16 (float16x8_t a, float16x8_t b) +vpmaxq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_smax_nanpv8hf (a, b); + return __builtin_aarch64_smax_nanpv8hf (__a, __b); } __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnm_f16 (float16x4_t a, float16x4_t b) +vpmaxnm_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_smaxpv4hf (a, b); + 
return __builtin_aarch64_smaxpv4hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnmq_f16 (float16x8_t a, float16x8_t b) +vpmaxnmq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_smaxpv8hf (a, b); + return __builtin_aarch64_smaxpv8hf (__a, __b); } __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_f16 (float16x4_t a, float16x4_t b) +vpmin_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_smin_nanpv4hf (a, b); + return __builtin_aarch64_smin_nanpv4hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_f16 (float16x8_t a, float16x8_t b) +vpminq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_smin_nanpv8hf (a, b); + return __builtin_aarch64_smin_nanpv8hf (__a, __b); } __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnm_f16 (float16x4_t a, float16x4_t b) +vpminnm_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_sminpv4hf (a, b); + return __builtin_aarch64_sminpv4hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnmq_f16 (float16x8_t a, float16x8_t b) +vpminnmq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_sminpv8hf (a, b); + return __builtin_aarch64_sminpv8hf (__a, __b); } __extension__ extern __inline float16x4_t @@ -32622,16 +33182,16 @@ vrecpsq_f16 (float16x8_t __a, float16x8_t __b) __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrts_f16 (float16x4_t a, float16x4_t b) +vrsqrts_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_rsqrtsv4hf (a, b); + return __builtin_aarch64_rsqrtsv4hf (__a, __b); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrtsq_f16 (float16x8_t a, float16x8_t b) +vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_rsqrtsv8hf (a, b); + return __builtin_aarch64_rsqrtsv8hf (__a, __b); } __extension__ extern __inline float16x4_t @@ -33961,6 +34521,1308 @@ vfmlslq_laneq_high_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b, #pragma GCC pop_options +#pragma GCC push_options +#pragma GCC target ("arch=armv8.5-a") + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd32z_f32 (float32x2_t __a) +{ + return __builtin_aarch64_frint32zv2sf (__a); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd32zq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_frint32zv4sf (__a); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd32z_f64 (float64x1_t __a) +{ + return (float64x1_t) + {__builtin_aarch64_frint32zdf (vget_lane_f64 (__a, 0))}; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd32zq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_frint32zv2df (__a); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd32x_f32 (float32x2_t __a) +{ + return __builtin_aarch64_frint32xv2sf (__a); +} + 
+__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd32xq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_frint32xv4sf (__a); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd32x_f64 (float64x1_t __a) +{ + return (float64x1_t) {__builtin_aarch64_frint32xdf (vget_lane_f64 (__a, 0))}; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd32xq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_frint32xv2df (__a); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd64z_f32 (float32x2_t __a) +{ + return __builtin_aarch64_frint64zv2sf (__a); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd64zq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_frint64zv4sf (__a); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd64z_f64 (float64x1_t __a) +{ + return (float64x1_t) {__builtin_aarch64_frint64zdf (vget_lane_f64 (__a, 0))}; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd64zq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_frint64zv2df (__a); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd64x_f32 (float32x2_t __a) +{ + return __builtin_aarch64_frint64xv2sf (__a); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd64xq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_frint64xv4sf (__a); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd64x_f64 (float64x1_t __a) +{ + return (float64x1_t) {__builtin_aarch64_frint64xdf (vget_lane_f64 (__a, 0))}; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrnd64xq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_frint64xv2df (__a); +} + +#pragma GCC pop_options + +#include "arm_bf16.h" + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vset_lane_bf16 (bfloat16_t __elem, bfloat16x4_t __vec, const int __index) +{ + return __aarch64_vset_lane_any (__elem, __vec, __index); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsetq_lane_bf16 (bfloat16_t __elem, bfloat16x8_t __vec, const int __index) +{ + return __aarch64_vset_lane_any (__elem, __vec, __index); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vget_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vgetq_lane_bf16 (bfloat16x8_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcreate_bf16 (uint64_t __a) +{ + return (bfloat16x4_t) __a; +} + +__extension__ extern 
__inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b) +{ + return (bfloat16x8_t)__builtin_aarch64_combinev4bf (__a, __b); +} + +/* vdup */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_bf16 (bfloat16_t __a) +{ + return (bfloat16x4_t) {__a, __a, __a, __a}; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_bf16 (bfloat16_t __a) +{ + return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +/* vld */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16 (const bfloat16_t *__a) +{ + return (bfloat16x4_t) __builtin_aarch64_ld1v4bf (__a); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16 (const bfloat16_t *__a) +{ + return __builtin_aarch64_ld1v8bf (__a); +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x2 (const bfloat16_t *__a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x2 (const bfloat16_t *__a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x3 (const bfloat16_t *__a) +{ + bfloat16x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o 
= __builtin_aarch64_ld1x3v4bf ((const __builtin_aarch64_simd_bf *) __a); + __i.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + __i.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + __i.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return __i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x3 (const bfloat16_t *__a) +{ + bfloat16x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8bf ((const __builtin_aarch64_simd_bf *) __a); + __i.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + __i.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + __i.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return __i; +} +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x4 (const bfloat16_t *__a) +{ + union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a); + return __au.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x4 (const bfloat16_t *__a) +{ + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8bf ((const __builtin_aarch64_simd_bf *) __a); + return __au.__i; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_lane_bf16 (const bfloat16_t *__src, bfloat16x4_t __vec, const int __lane) +{ + return __aarch64_vset_lane_any (*__src, __vec, __lane); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_lane_bf16 (const bfloat16_t *__src, bfloat16x8_t __vec, const int __lane) +{ + return __aarch64_vset_lane_any (*__src, __vec, __lane); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_dup_bf16 (const bfloat16_t* __a) +{ + return vdup_n_bf16 (*__a); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_dup_bf16 (const bfloat16_t* __a) +{ + return vdupq_n_bf16 (*__a); +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4bf (__a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4bf ((const 
__builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x4_t ret; + 
__builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); + return ret; +} + +/* vst */ + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16 (bfloat16_t *__a, bfloat16x4_t __b) +{ + __builtin_aarch64_st1v4bf (__a, __b); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_aarch64_st1x2v4bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_aarch64_st1x2v8bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4bf 
((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) +{ + union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) +{ + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16 (bfloat16_t *__a, bfloat16x8_t __b) +{ + __builtin_aarch64_st1v8bf (__a, __b); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_lane_bf16 (bfloat16_t *__a, bfloat16x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_lane_bf16 (bfloat16_t *__a, bfloat16x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_aarch64_st2v4bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_aarch64_st2v8bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4bf 
((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val) +{ + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +/* vreinterpret */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u8 (uint8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u16 (uint16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u32 (uint32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u64 (uint64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s8 (int8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s16 (int16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s32 (int32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vreinterpret_bf16_s64 (int64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p8 (poly8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p16 (poly16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p64 (poly64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f16 (float16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f32 (float32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f64 (float64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u8 (uint8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u16 (uint16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u32 (uint32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u64 (uint64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s8 (int8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s16 (int16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s32 (int32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s64 (int64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p8 (poly8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p16 (poly16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p64 (poly64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p128 (poly128_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f16 (float16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f32 (float32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f64 (float64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s8_bf16 (bfloat16x4_t __a) +{ + return (int8x8_t)__a; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s16_bf16 (bfloat16x4_t __a) +{ + return (int16x4_t)__a; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s32_bf16 (bfloat16x4_t __a) +{ + return (int32x2_t)__a; +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s64_bf16 (bfloat16x4_t __a) +{ + return (int64x1_t)__a; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u8_bf16 (bfloat16x4_t __a) +{ + return (uint8x8_t)__a; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u16_bf16 (bfloat16x4_t __a) +{ + return (uint16x4_t)__a; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u32_bf16 (bfloat16x4_t __a) +{ + return (uint32x2_t)__a; +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u64_bf16 (bfloat16x4_t __a) +{ + return (uint64x1_t)__a; +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f16_bf16 (bfloat16x4_t __a) +{ + return (float16x4_t)__a; +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f32_bf16 (bfloat16x4_t __a) +{ + return (float32x2_t)__a; +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f64_bf16 (bfloat16x4_t __a) +{ + return (float64x1_t)__a; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p8_bf16 (bfloat16x4_t __a) +{ + return (poly8x8_t)__a; +} + +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p16_bf16 (bfloat16x4_t __a) +{ + return (poly16x4_t)__a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_bf16 (bfloat16x4_t __a) +{ + return (poly64x1_t)__a; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s8_bf16 (bfloat16x8_t __a) +{ + return (int8x16_t)__a; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s16_bf16 (bfloat16x8_t __a) +{ + return (int16x8_t)__a; +} + +__extension__ extern __inline int32x4_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s32_bf16 (bfloat16x8_t __a) +{ + return (int32x4_t)__a; +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s64_bf16 (bfloat16x8_t __a) +{ + return (int64x2_t)__a; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u8_bf16 (bfloat16x8_t __a) +{ + return (uint8x16_t)__a; +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u16_bf16 (bfloat16x8_t __a) +{ + return (uint16x8_t)__a; +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u32_bf16 (bfloat16x8_t __a) +{ + return (uint32x4_t)__a; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u64_bf16 (bfloat16x8_t __a) +{ + return (uint64x2_t)__a; +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f16_bf16 (bfloat16x8_t __a) +{ + return (float16x8_t)__a; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f32_bf16 (bfloat16x8_t __a) +{ + return (float32x4_t)__a; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f64_bf16 (bfloat16x8_t __a) +{ + return (float64x2_t)__a; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p8_bf16 (bfloat16x8_t __a) +{ + return (poly8x16_t)__a; +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p16_bf16 (bfloat16x8_t __a) +{ + return (poly16x8_t)__a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_bf16 (bfloat16x8_t __a) +{ + return (poly64x2_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_bf16 (bfloat16x8_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) +{ + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) + +{ + return __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfmlalbv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfmlaltv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfmlalb_lane_qv4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvt_bf16_f32 (float32x4_t __a) +{ + return __builtin_aarch64_bfcvtnv4bf (__a); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtq_low_bf16_f32 (float32x4_t __a) +{ + return __builtin_aarch64_bfcvtn_qv8bf (__a); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a) +{ + return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a); +} + +#pragma GCC pop_options + +/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. 
*/ + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+i8mm") + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int __index) +{ + return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, int8x16_t __b, + const int __index) +{ + return __builtin_aarch64_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_lane_s32 (int32x4_t __r, uint8x16_t __a, int8x8_t __b, + const int __index) +{ + return __builtin_aarch64_usdot_lanev16qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b, + const int __index) +{ + return __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index) +{ + return __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b, + const int __index) +{ + return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b, + const int __index) +{ + return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b, + const int __index) +{ + return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index); +} + +/* Matrix Multiply-Accumulate. 
*/ + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) +{ + return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b); +} + +#pragma GCC pop_options + #undef __aarch64_vget_lane_any #undef __aarch64_vdup_lane_any diff --git a/gcc/config/aarch64/arm_sve.h b/gcc/config/aarch64/arm_sve.h new file mode 100644 index 000000000..0a316c0a0 --- /dev/null +++ b/gcc/config/aarch64/arm_sve.h @@ -0,0 +1,37 @@ +/* AArch64 SVE intrinsics include file. + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
*/ + +#ifndef _ARM_SVE_H_ +#define _ARM_SVE_H_ + +#include +#include + +typedef __fp16 float16_t; +typedef float float32_t; +typedef double float64_t; + +#pragma GCC aarch64 "arm_sve.h" + +#endif diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md index 0f357662a..002e91d2b 100644 --- a/gcc/config/aarch64/atomics.md +++ b/gcc/config/aarch64/atomics.md @@ -22,10 +22,10 @@ (define_expand "@atomic_compare_and_swap" [(match_operand:SI 0 "register_operand" "") ;; bool out - (match_operand:ALLI 1 "register_operand" "") ;; val out - (match_operand:ALLI 2 "aarch64_sync_memory_operand" "") ;; memory - (match_operand:ALLI 3 "nonmemory_operand" "") ;; expected - (match_operand:ALLI 4 "aarch64_reg_or_zero" "") ;; desired + (match_operand:ALLI_TI 1 "register_operand" "") ;; val out + (match_operand:ALLI_TI 2 "aarch64_sync_memory_operand" "") ;; memory + (match_operand:ALLI_TI 3 "nonmemory_operand" "") ;; expected + (match_operand:ALLI_TI 4 "aarch64_reg_or_zero" "") ;; desired (match_operand:SI 5 "const_int_operand") ;; is_weak (match_operand:SI 6 "const_int_operand") ;; mod_s (match_operand:SI 7 "const_int_operand")] ;; mod_f @@ -88,6 +88,30 @@ } ) +(define_insn_and_split "@aarch64_compare_and_swap" + [(set (reg:CC CC_REGNUM) ;; bool out + (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW)) + (set (match_operand:JUST_TI 0 "register_operand" "=&r") ;; val out + (match_operand:JUST_TI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory + (set (match_dup 1) + (unspec_volatile:JUST_TI + [(match_operand:JUST_TI 2 "aarch64_reg_or_zero" "rZ") ;; expect + (match_operand:JUST_TI 3 "aarch64_reg_or_zero" "rZ") ;; desired + (match_operand:SI 4 "const_int_operand") ;; is_weak + (match_operand:SI 5 "const_int_operand") ;; mod_s + (match_operand:SI 6 "const_int_operand")] ;; mod_f + UNSPECV_ATOMIC_CMPSW)) + (clobber (match_scratch:SI 7 "=&r"))] + "" + "#" + "&& epilogue_completed" + [(const_int 0)] + { + aarch64_split_compare_and_swap (operands); + DONE; + } +) + (define_insn "@aarch64_compare_and_swap_lse" [(set (match_operand:SI 0 "register_operand" "+r") ;; val out (zero_extend:SI @@ -133,23 +157,56 @@ return "casal\t%0, %2, %1"; }) +(define_insn "@aarch64_compare_and_swap_lse" + [(set (match_operand:JUST_TI 0 "register_operand" "+r") ;; val out + (match_operand:JUST_TI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory + (set (match_dup 1) + (unspec_volatile:JUST_TI + [(match_dup 0) ;; expect + (match_operand:JUST_TI 2 "register_operand" "r") ;; desired + (match_operand:SI 3 "const_int_operand")] ;; mod_s + UNSPECV_ATOMIC_CMPSW))] + "TARGET_LSE" +{ + enum memmodel model = memmodel_from_int (INTVAL (operands[3])); + if (is_mm_relaxed (model)) + return "casp\t%0, %R0, %2, %R2, %1"; + else if (is_mm_acquire (model) || is_mm_consume (model)) + return "caspa\t%0, %R0, %2, %R2, %1"; + else if (is_mm_release (model)) + return "caspl\t%0, %R0, %2, %R2, %1"; + else + return "caspal\t%0, %R0, %2, %R2, %1"; +}) + (define_expand "atomic_exchange" - [(match_operand:ALLI 0 "register_operand" "") - (match_operand:ALLI 1 "aarch64_sync_memory_operand" "") - (match_operand:ALLI 2 "aarch64_reg_or_zero" "") - (match_operand:SI 3 "const_int_operand" "")] + [(match_operand:ALLI 0 "register_operand") + (match_operand:ALLI 1 "aarch64_sync_memory_operand") + (match_operand:ALLI 2 "aarch64_reg_or_zero") + (match_operand:SI 3 "const_int_operand")] "" { - rtx (*gen) (rtx, rtx, rtx, rtx); - /* Use an atomic SWP when available. 
*/ if (TARGET_LSE) - gen = gen_aarch64_atomic_exchange_lse; + { + emit_insn (gen_aarch64_atomic_exchange_lse + (operands[0], operands[1], operands[2], operands[3])); + } + else if (TARGET_OUTLINE_ATOMICS) + { + machine_mode mode = mode; + rtx func = aarch64_atomic_ool_func (mode, operands[3], + &aarch64_ool_swp_names); + rtx rval = emit_library_call_value (func, operands[0], LCT_NORMAL, + mode, operands[2], mode, + XEXP (operands[1], 0), Pmode); + emit_move_insn (operands[0], rval); + } else - gen = gen_aarch64_atomic_exchange; - - emit_insn (gen (operands[0], operands[1], operands[2], operands[3])); - + { + emit_insn (gen_aarch64_atomic_exchange + (operands[0], operands[1], operands[2], operands[3])); + } DONE; } ) @@ -198,9 +255,9 @@ ) (define_expand "atomic_" - [(match_operand:ALLI 0 "aarch64_sync_memory_operand" "") + [(match_operand:ALLI 0 "aarch64_sync_memory_operand") (atomic_op:ALLI - (match_operand:ALLI 1 "" "") + (match_operand:ALLI 1 "") (match_operand:SI 2 "const_int_operand"))] "" { @@ -234,6 +291,39 @@ } operands[1] = force_reg (mode, operands[1]); } + else if (TARGET_OUTLINE_ATOMICS) + { + const atomic_ool_names *names; + switch () + { + case MINUS: + operands[1] = expand_simple_unop (mode, NEG, operands[1], + NULL, 1); + /* fallthru */ + case PLUS: + names = &aarch64_ool_ldadd_names; + break; + case IOR: + names = &aarch64_ool_ldset_names; + break; + case XOR: + names = &aarch64_ool_ldeor_names; + break; + case AND: + operands[1] = expand_simple_unop (mode, NOT, operands[1], + NULL, 1); + names = &aarch64_ool_ldclr_names; + break; + default: + gcc_unreachable (); + } + machine_mode mode = mode; + rtx func = aarch64_atomic_ool_func (mode, operands[2], names); + emit_library_call_value (func, NULL_RTX, LCT_NORMAL, mode, + operands[1], mode, + XEXP (operands[0], 0), Pmode); + DONE; + } else gen = gen_aarch64_atomic_; @@ -322,10 +412,10 @@ ;; Load-operate-store, returning the original memory data. (define_expand "atomic_fetch_" - [(match_operand:ALLI 0 "register_operand" "") - (match_operand:ALLI 1 "aarch64_sync_memory_operand" "") + [(match_operand:ALLI 0 "register_operand") + (match_operand:ALLI 1 "aarch64_sync_memory_operand") (atomic_op:ALLI - (match_operand:ALLI 2 "" "") + (match_operand:ALLI 2 "") (match_operand:SI 3 "const_int_operand"))] "" { @@ -359,6 +449,40 @@ } operands[2] = force_reg (mode, operands[2]); } + else if (TARGET_OUTLINE_ATOMICS) + { + const atomic_ool_names *names; + switch () + { + case MINUS: + operands[2] = expand_simple_unop (mode, NEG, operands[2], + NULL, 1); + /* fallthru */ + case PLUS: + names = &aarch64_ool_ldadd_names; + break; + case IOR: + names = &aarch64_ool_ldset_names; + break; + case XOR: + names = &aarch64_ool_ldeor_names; + break; + case AND: + operands[2] = expand_simple_unop (mode, NOT, operands[2], + NULL, 1); + names = &aarch64_ool_ldclr_names; + break; + default: + gcc_unreachable (); + } + machine_mode mode = mode; + rtx func = aarch64_atomic_ool_func (mode, operands[3], names); + rtx rval = emit_library_call_value (func, operands[0], LCT_NORMAL, mode, + operands[2], mode, + XEXP (operands[1], 0), Pmode); + emit_move_insn (operands[0], rval); + DONE; + } else gen = gen_aarch64_atomic_fetch_; @@ -439,16 +563,16 @@ ;; Load-operate-store, returning the updated memory data. 
(define_expand "atomic__fetch" - [(match_operand:ALLI 0 "register_operand" "") + [(match_operand:ALLI 0 "register_operand") (atomic_op:ALLI - (match_operand:ALLI 1 "aarch64_sync_memory_operand" "") - (match_operand:ALLI 2 "" "")) + (match_operand:ALLI 1 "aarch64_sync_memory_operand") + (match_operand:ALLI 2 "")) (match_operand:SI 3 "const_int_operand")] "" { /* Use an atomic load-operate instruction when possible. In this case we will re-compute the result from the original mem value. */ - if (TARGET_LSE) + if (TARGET_LSE || TARGET_OUTLINE_ATOMICS) { rtx tmp = gen_reg_rtx (mode); operands[2] = force_reg (mode, operands[2]); @@ -581,6 +705,24 @@ } ) +(define_insn "aarch64_load_exclusive_pair" + [(set (match_operand:DI 0 "register_operand" "=r") + (unspec_volatile:DI + [(match_operand:TI 2 "aarch64_sync_memory_operand" "Q") + (match_operand:SI 3 "const_int_operand")] + UNSPECV_LX)) + (set (match_operand:DI 1 "register_operand" "=r") + (unspec_volatile:DI [(match_dup 2) (match_dup 3)] UNSPECV_LX))] + "" + { + enum memmodel model = memmodel_from_int (INTVAL (operands[3])); + if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model)) + return "ldxp\t%0, %1, %2"; + else + return "ldaxp\t%0, %1, %2"; + } +) + (define_insn "@aarch64_store_exclusive" [(set (match_operand:SI 0 "register_operand" "=&r") (unspec_volatile:SI [(const_int 0)] UNSPECV_SX)) @@ -599,8 +741,27 @@ } ) +(define_insn "aarch64_store_exclusive_pair" + [(set (match_operand:SI 0 "register_operand" "=&r") + (unspec_volatile:SI [(const_int 0)] UNSPECV_SX)) + (set (match_operand:TI 1 "aarch64_sync_memory_operand" "=Q") + (unspec_volatile:TI + [(match_operand:DI 2 "aarch64_reg_or_zero" "rZ") + (match_operand:DI 3 "aarch64_reg_or_zero" "rZ") + (match_operand:SI 4 "const_int_operand")] + UNSPECV_SX))] + "" + { + enum memmodel model = memmodel_from_int (INTVAL (operands[4])); + if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model)) + return "stxp\t%w0, %x2, %x3, %1"; + else + return "stlxp\t%w0, %x2, %x3, %1"; + } +) + (define_expand "mem_thread_fence" - [(match_operand:SI 0 "const_int_operand" "")] + [(match_operand:SI 0 "const_int_operand")] "" { enum memmodel model = memmodel_from_int (INTVAL (operands[0])); diff --git a/gcc/config/aarch64/check-sve-md.awk b/gcc/config/aarch64/check-sve-md.awk new file mode 100644 index 000000000..3da78f3dd --- /dev/null +++ b/gcc/config/aarch64/check-sve-md.awk @@ -0,0 +1,66 @@ +#!/usr/bin/awk -f +# Copyright (C) 2019 Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; see the file COPYING3. If not see +# . + +# This awk script checks that aarch64-sve.md (passed either on the +# command line or via stdin) has an up-to-date contents section. + +BEGIN { + seen1 = 0 + seen2 = 0 + errors = 0 +} + +# The headings in the comments use a two-level hierarchy: ";; == ..." +# for major sections and ";; ---- ..." for minor sections. Each section +# heading must be unique. 
+# +# The contents section should list all the section headings, using the +# same text and in the same order. We should therefore see exactly two +# copies of the section list. +/^;; == / || /^;; ---- / { + if ($0 in seen || seen2 > 0) + { + if (seen2 >= seen1) + { + printf "error: line not in contents: %s\n", $0 > "/dev/stderr" + errors += 1 + exit(1) + } + if ($0 != order[seen2]) + { + printf "error: mismatched contents\n saw: %s\nexpected: %s\n", \ + $0, order[seen2] > "/dev/stderr" + errors += 1 + exit(1) + } + seen2 += 1 + } + else + { + seen[$0] = 1 + order[seen1] = $0 + seen1 += 1 + } +} + +END { + if (seen2 < seen1 && errors == 0) + { + printf "error: line only in contents: %s\n", order[seen2] > "/dev/stderr" + exit(1) + } +} diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index 21f9549e6..191c996c1 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -36,6 +36,9 @@ (define_register_constraint "x" "FP_LO_REGS" "Floating point and SIMD vector registers V0 - V15.") +(define_register_constraint "y" "FP_LO8_REGS" + "Floating point and SIMD vector registers V0 - V7.") + (define_constraint "I" "A constant that can be used with an ADD operation." (and (match_code "const_int") @@ -46,6 +49,12 @@ (and (match_code "const_int") (match_test "aarch64_pluslong_strict_immedate (op, VOIDmode)"))) +(define_constraint "Uai" + "@internal + A constraint that matches a VG-based constant that can be added by + a single INC or DEC." + (match_operand 0 "aarch64_sve_scalar_inc_dec_immediate")) + (define_constraint "Uav" "@internal A constraint that matches a VG-based constant that can be added by @@ -114,8 +123,8 @@ (match_test "aarch64_float_const_zero_rtx_p (op)"))) (define_constraint "Z" - "Integer constant zero." - (match_test "op == const0_rtx")) + "Integer or floating-point constant zero." + (match_test "op == CONST0_RTX (GET_MODE (op))")) (define_constraint "Ush" "A constraint that matches an absolute symbolic address high part." @@ -248,6 +257,38 @@ true, ADDR_QUERY_LDP_STP_N)"))) +(define_address_constraint "UPb" + "@internal + An address valid for SVE PRFB instructions." + (match_test "aarch64_sve_prefetch_operand_p (op, VNx16QImode)")) + +(define_address_constraint "UPd" + "@internal + An address valid for SVE PRFD instructions." + (match_test "aarch64_sve_prefetch_operand_p (op, VNx2DImode)")) + +(define_address_constraint "UPh" + "@internal + An address valid for SVE PRFH instructions." + (match_test "aarch64_sve_prefetch_operand_p (op, VNx8HImode)")) + +(define_address_constraint "UPw" + "@internal + An address valid for SVE PRFW instructions." + (match_test "aarch64_sve_prefetch_operand_p (op, VNx4SImode)")) + +(define_memory_constraint "Utf" + "@internal + An address valid for SVE LDFF1 instructions." + (and (match_code "mem") + (match_test "aarch64_sve_ldff1_operand_p (op)"))) + +(define_memory_constraint "Utn" + "@internal + An address valid for SVE LDNF1 instructions." + (and (match_code "mem") + (match_test "aarch64_sve_ldnf1_operand_p (op)"))) + (define_memory_constraint "Utr" "@internal An address valid for SVE LDR and STR instructions (as distinct from @@ -269,6 +310,37 @@ (match_test "aarch64_legitimate_address_p (V2DImode, XEXP (op, 0), 1)"))) +(define_memory_constraint "UtQ" + "@internal + An address valid for SVE LD1RQs." + (and (match_code "mem") + (match_test "aarch64_sve_ld1rq_operand_p (op)"))) + +(define_memory_constraint "UOb" + "@internal + An address valid for SVE LD1ROH." 
+ (and (match_code "mem") + (match_test "aarch64_sve_ld1ro_operand_p (op, QImode)"))) + +(define_memory_constraint "UOh" + "@internal + An address valid for SVE LD1ROH." + (and (match_code "mem") + (match_test "aarch64_sve_ld1ro_operand_p (op, HImode)"))) + + +(define_memory_constraint "UOw" + "@internal + An address valid for SVE LD1ROW." + (and (match_code "mem") + (match_test "aarch64_sve_ld1ro_operand_p (op, SImode)"))) + +(define_memory_constraint "UOd" + "@internal + An address valid for SVE LD1ROD." + (and (match_code "mem") + (match_test "aarch64_sve_ld1ro_operand_p (op, DImode)"))) + (define_memory_constraint "Uty" "@internal An address valid for SVE LD1Rs." @@ -284,7 +356,7 @@ (define_constraint "Ufc" "A floating point constant which can be used with an\ FMOV immediate operation." - (and (match_code "const_double") + (and (match_code "const_double,const_vector") (match_test "aarch64_float_const_representable_p (op)"))) (define_constraint "Uvi" @@ -329,6 +401,13 @@ (match_test "aarch64_simd_scalar_immediate_valid_for_move (op, QImode)"))) +(define_constraint "Dt" + "@internal + A const_double which is the reciprocal of an exact power of two, can be + used in an scvtf with fract bits operation" + (and (match_code "const_double") + (match_test "aarch64_fpconst_pow2_recip (op) > 0"))) + (define_constraint "Dl" "@internal A constraint that matches vector of immediates for left shifts." @@ -373,18 +452,54 @@ An address valid for a prefetch instruction." (match_test "aarch64_address_valid_for_prefetch_p (op, true)")) +(define_constraint "vgb" + "@internal + A constraint that matches an immediate offset valid for SVE LD1B + gather instructions." + (match_operand 0 "aarch64_sve_gather_immediate_b")) + +(define_constraint "vgd" + "@internal + A constraint that matches an immediate offset valid for SVE LD1D + gather instructions." + (match_operand 0 "aarch64_sve_gather_immediate_d")) + +(define_constraint "vgh" + "@internal + A constraint that matches an immediate offset valid for SVE LD1H + gather instructions." + (match_operand 0 "aarch64_sve_gather_immediate_h")) + +(define_constraint "vgw" + "@internal + A constraint that matches an immediate offset valid for SVE LD1W + gather instructions." + (match_operand 0 "aarch64_sve_gather_immediate_w")) + (define_constraint "vsa" "@internal A constraint that matches an immediate operand valid for SVE arithmetic instructions." (match_operand 0 "aarch64_sve_arith_immediate")) +(define_constraint "vsb" + "@internal + A constraint that matches an immediate operand valid for SVE UMAX + and UMIN operations." + (match_operand 0 "aarch64_sve_vsb_immediate")) + (define_constraint "vsc" "@internal A constraint that matches a signed immediate operand valid for SVE CMP instructions." (match_operand 0 "aarch64_sve_cmp_vsc_immediate")) +(define_constraint "vss" + "@internal + A constraint that matches a signed immediate operand valid for SVE + DUP instructions." + (match_test "aarch64_sve_dup_immediate_p (op)")) + (define_constraint "vsd" "@internal A constraint that matches an unsigned immediate operand valid for SVE @@ -395,7 +510,7 @@ "@internal A constraint that matches a vector count operand valid for SVE INC and DEC instructions." - (match_operand 0 "aarch64_sve_inc_dec_immediate")) + (match_operand 0 "aarch64_sve_vector_inc_dec_immediate")) (define_constraint "vsn" "@internal @@ -403,6 +518,18 @@ is valid for SVE SUB instructions." 
(match_operand 0 "aarch64_sve_sub_arith_immediate")) +(define_constraint "vsQ" + "@internal + Like vsa, but additionally check that the immediate is nonnegative + when interpreted as a signed value." + (match_operand 0 "aarch64_sve_qadd_immediate")) + +(define_constraint "vsS" + "@internal + Like vsn, but additionally check that the immediate is negative + when interpreted as a signed value." + (match_operand 0 "aarch64_sve_qsub_immediate")) + (define_constraint "vsl" "@internal A constraint that matches an immediate operand valid for SVE logical @@ -411,9 +538,9 @@ (define_constraint "vsm" "@internal - A constraint that matches an immediate operand valid for SVE MUL - operations." - (match_operand 0 "aarch64_sve_mul_immediate")) + A constraint that matches an immediate operand valid for SVE MUL, + SMAX and SMIN operations." + (match_operand 0 "aarch64_sve_vsm_immediate")) (define_constraint "vsA" "@internal @@ -421,13 +548,20 @@ and FSUB operations." (match_operand 0 "aarch64_sve_float_arith_immediate")) +;; "B" for "bound". +(define_constraint "vsB" + "@internal + A constraint that matches an immediate operand valid for SVE FMAX + and FMIN operations." + (match_operand 0 "aarch64_sve_float_maxmin_immediate")) + (define_constraint "vsM" "@internal - A constraint that matches an imediate operand valid for SVE FMUL + A constraint that matches an immediate operand valid for SVE FMUL operations." (match_operand 0 "aarch64_sve_float_mul_immediate")) (define_constraint "vsN" "@internal A constraint that matches the negative of vsA" - (match_operand 0 "aarch64_sve_float_arith_with_sub_immediate")) + (match_operand 0 "aarch64_sve_float_negated_arith_immediate")) diff --git a/gcc/config/aarch64/cortex-a57-fma-steering.c b/gcc/config/aarch64/cortex-a57-fma-steering.c index eb91662b6..d8e6038d1 100644 --- a/gcc/config/aarch64/cortex-a57-fma-steering.c +++ b/gcc/config/aarch64/cortex-a57-fma-steering.c @@ -37,6 +37,7 @@ #include "insn-attr.h" #include "context.h" #include "tree-pass.h" +#include "function-abi.h" #include "regrename.h" #include "aarch64-protos.h" @@ -267,7 +268,7 @@ rename_single_chain (du_head_p head, HARD_REG_SET *unavailable) if (DEBUG_INSN_P (tmp->insn)) continue; n_uses++; - IOR_COMPL_HARD_REG_SET (*unavailable, reg_class_contents[tmp->cl]); + *unavailable |= ~reg_class_contents[tmp->cl]; super_class = reg_class_superunion[(int) super_class][(int) tmp->cl]; } @@ -281,7 +282,7 @@ rename_single_chain (du_head_p head, HARD_REG_SET *unavailable) { fprintf (dump_file, "Register %s in insn %d", reg_names[reg], INSN_UID (head->first->insn)); - if (head->need_caller_save_reg) + if (head->call_abis) fprintf (dump_file, " crosses a call"); } diff --git a/gcc/config/aarch64/driver-aarch64.c b/gcc/config/aarch64/driver-aarch64.c index 6f16775f4..ef4f18352 100644 --- a/gcc/config/aarch64/driver-aarch64.c +++ b/gcc/config/aarch64/driver-aarch64.c @@ -32,7 +32,7 @@ std::string aarch64_get_extension_string_for_isa_flags (unsigned long, struct aarch64_arch_extension { const char *ext; - unsigned int flag; + uint64_t flag; const char *feat_string; }; @@ -52,7 +52,7 @@ struct aarch64_core_data unsigned char implementer_id; /* Exactly 8 bits */ unsigned int part_no; /* 12 bits + 12 bits */ unsigned variant; - const unsigned long flags; + const uint64_t flags; }; #define AARCH64_BIG_LITTLE(BIG, LITTLE) \ @@ -75,7 +75,7 @@ struct aarch64_arch_driver_info { const char* id; const char* name; - const unsigned long flags; + const uint64_t flags; }; #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, 
FLAGS) \ @@ -179,8 +179,8 @@ host_detect_local_cpu (int argc, const char **argv) unsigned int variants[2] = { ALL_VARIANTS, ALL_VARIANTS }; unsigned int n_variants = 0; bool processed_exts = false; - unsigned long extension_flags = 0; - unsigned long default_flags = 0; + uint64_t extension_flags = 0; + uint64_t default_flags = 0; gcc_assert (argc); diff --git a/gcc/config/aarch64/falkor-tag-collision-avoidance.c b/gcc/config/aarch64/falkor-tag-collision-avoidance.c index 779dee81f..35ca79232 100644 --- a/gcc/config/aarch64/falkor-tag-collision-avoidance.c +++ b/gcc/config/aarch64/falkor-tag-collision-avoidance.c @@ -38,6 +38,7 @@ #include "optabs.h" #include "regs.h" #include "recog.h" +#include "function-abi.h" #include "regrename.h" #include "print-rtl.h" @@ -229,7 +230,7 @@ init_unavailable (tag_insn_info *insn_info, tag_map_t &tag_map, du_head_p head, if (DEBUG_INSN_P (tmp->insn)) continue; - IOR_COMPL_HARD_REG_SET (*unavailable, reg_class_contents[tmp->cl]); + *unavailable |= ~reg_class_contents[tmp->cl]; super_class = reg_class_superunion[(int) super_class][(int) tmp->cl]; } diff --git a/gcc/config/aarch64/falkor.md b/gcc/config/aarch64/falkor.md index 41955af81..2bcc661e5 100644 --- a/gcc/config/aarch64/falkor.md +++ b/gcc/config/aarch64/falkor.md @@ -648,7 +648,7 @@ (define_insn_reservation "falkor_other_0_nothing" 0 (and (eq_attr "tune" "falkor") - (eq_attr "type" "no_insn,trap,block")) + (eq_attr "type" "trap,block")) "nothing") (define_insn_reservation "falkor_other_2_z" 2 diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index c7ccd5bf6..7b6456961 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -29,9 +29,16 @@ ;; Iterator for HI, SI, DI, some instructions can only work on these modes. (define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI]) +;; "Iterator" for just TI -- features like @pattern only work with iterators. +(define_mode_iterator JUST_TI [TI]) + ;; Iterator for QI and HI modes (define_mode_iterator SHORT [QI HI]) +;; Iterators for single modes, for "@" patterns. +(define_mode_iterator SI_ONLY [SI]) +(define_mode_iterator DI_ONLY [DI]) + ;; Iterator for all integer modes (up to 64-bit) (define_mode_iterator ALLI [QI HI SI DI]) @@ -50,9 +57,16 @@ ;; Iterator for all scalar floating point modes (HF, SF, DF) (define_mode_iterator GPF_HF [HF SF DF]) +;; Iterator for all 16-bit scalar floating point modes (HF, BF) +(define_mode_iterator HFBF [HF BF]) + ;; Iterator for all scalar floating point modes (HF, SF, DF and TF) (define_mode_iterator GPF_TF_F16 [HF SF DF TF]) +;; Iterator for all scalar floating point modes suitable for moving, including +;; special BF type (HF, SF, DF, TF and BF) +(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF]) + ;; Double vector modes. (define_mode_iterator VDF [V2SF V4HF]) @@ -70,7 +84,10 @@ (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI]) ;; Double vector modes. -(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF]) +(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF]) + +;; Double vector modes suitable for moving. Includes BFmode. +(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF]) ;; All modes stored in registers d0-d31. (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF]) @@ -85,20 +102,29 @@ (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI]) ;; Quad vector modes. 
-(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) +(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF]) ;; Copy of the above. -(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) +(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF]) + +;; Quad vector modes suitable for moving. Includes BFmode. +(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF]) + +;; VQMOV without 2-element modes. +(define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF]) ;; Quad integer vector modes. (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI]) ;; VQ without 2 element modes. -(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF]) +(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF]) ;; Quad vector with only 2 element modes. (define_mode_iterator VQ_2E [V2DI V2DF]) +;; BFmode vector modes. +(define_mode_iterator VBF [V4BF V8BF]) + ;; This mode iterator allows :P to be used for patterns that operate on ;; addresses in different modes. In LP64, only DI will match, while in ;; ILP32, either can match. @@ -110,7 +136,8 @@ (define_mode_iterator PTR [(SI "ptr_mode == SImode") (DI "ptr_mode == DImode")]) ;; Advanced SIMD Float modes suitable for moving, loading and storing. -(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF]) +(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF + V4BF V8BF]) ;; Advanced SIMD Float modes. (define_mode_iterator VDQF [V2SF V4SF V2DF]) @@ -128,6 +155,9 @@ (HF "TARGET_SIMD_F16INST") SF DF]) +;; Scalar and vetor modes for SF, DF. +(define_mode_iterator VSFDF [V2SF V4SF V2DF DF SF]) + ;; Advanced SIMD single Float modes. (define_mode_iterator VDQSF [V2SF V4SF]) @@ -148,7 +178,12 @@ ;; All Advanced SIMD modes suitable for moving, loading, and storing. (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI - V4HF V8HF V2SF V4SF V2DF]) + V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) + +;; All Advanced SIMD modes suitable for moving, loading, and storing, +;; including special Bfloat vector types. +(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI + V4HF V8HF V4BF V8BF V2SF V4SF V2DF]) ;; The VALL_F16 modes except the 128-bit 2-element ones. (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI @@ -159,10 +194,10 @@ ;; All Advanced SIMD modes and DI. (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI - V4HF V8HF V2SF V4SF V2DF DI]) + V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI]) ;; All Advanced SIMD modes, plus DI and DF. -(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI +(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF V2DI V4HF V8HF V2SF V4SF V2DF DI DF]) ;; Advanced SIMD modes for Integer reduction across lanes. @@ -185,7 +220,7 @@ (define_mode_iterator VQW [V16QI V8HI V4SI]) ;; Double vector modes for combines. -(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF]) +(define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF]) ;; Advanced SIMD modes except double int. (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) @@ -274,50 +309,85 @@ ;; count. (define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF]) -;; All SVE vector modes. -(define_mode_iterator SVE_ALL [VNx16QI VNx8HI VNx4SI VNx2DI - VNx8HF VNx4SF VNx2DF]) +;; Iterators for single modes, for "@" patterns. 
+(define_mode_iterator VNx8HI_ONLY [VNx8HI]) +(define_mode_iterator VNx8BF_ONLY [VNx8BF]) +(define_mode_iterator VNx4SI_ONLY [VNx4SI]) +(define_mode_iterator VNx4SF_ONLY [VNx4SF]) +(define_mode_iterator VNx2DI_ONLY [VNx2DI]) +(define_mode_iterator VNx2DF_ONLY [VNx2DF]) ;; All SVE vector structure modes. (define_mode_iterator SVE_STRUCT [VNx32QI VNx16HI VNx8SI VNx4DI - VNx16HF VNx8SF VNx4DF + VNx16BF VNx16HF VNx8SF VNx4DF VNx48QI VNx24HI VNx12SI VNx6DI - VNx24HF VNx12SF VNx6DF + VNx24BF VNx24HF VNx12SF VNx6DF VNx64QI VNx32HI VNx16SI VNx8DI - VNx32HF VNx16SF VNx8DF]) + VNx32BF VNx32HF VNx16SF VNx8DF]) -;; All SVE vector modes that have 8-bit or 16-bit elements. -(define_mode_iterator SVE_BH [VNx16QI VNx8HI VNx8HF]) +;; All fully-packed SVE vector modes. +(define_mode_iterator SVE_FULL [VNx16QI VNx8HI VNx4SI VNx2DI + VNx8BF VNx8HF VNx4SF VNx2DF]) -;; All SVE vector modes that have 8-bit, 16-bit or 32-bit elements. -(define_mode_iterator SVE_BHS [VNx16QI VNx8HI VNx4SI VNx8HF VNx4SF]) +;; All fully-packed SVE integer vector modes. +(define_mode_iterator SVE_FULL_I [VNx16QI VNx8HI VNx4SI VNx2DI]) -;; All SVE integer vector modes that have 8-bit, 16-bit or 32-bit elements. -(define_mode_iterator SVE_BHSI [VNx16QI VNx8HI VNx4SI]) +;; All fully-packed SVE floating-point vector modes. +(define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF]) -;; All SVE integer vector modes that have 16-bit, 32-bit or 64-bit elements. -(define_mode_iterator SVE_HSDI [VNx16QI VNx8HI VNx4SI]) +;; Fully-packed SVE integer vector modes that have 8-bit, 16-bit or 32-bit +;; elements. +(define_mode_iterator SVE_FULL_BHSI [VNx16QI VNx8HI VNx4SI]) -;; All SVE floating-point vector modes that have 16-bit or 32-bit elements. -(define_mode_iterator SVE_HSF [VNx8HF VNx4SF]) +;; Fully-packed SVE vector modes that have 16-bit, 32-bit or 64-bit elements. +(define_mode_iterator SVE_FULL_HSD [VNx8HI VNx4SI VNx2DI + VNx8BF VNx8HF VNx4SF VNx2DF]) -;; All SVE vector modes that have 32-bit or 64-bit elements. -(define_mode_iterator SVE_SD [VNx4SI VNx2DI VNx4SF VNx2DF]) +;; Fully-packed SVE integer vector modes that have 16-bit, 32-bit or 64-bit +;; elements. +(define_mode_iterator SVE_FULL_HSDI [VNx8HI VNx4SI VNx2DI]) -;; All SVE vector modes that have 32-bit elements. -(define_mode_iterator SVE_S [VNx4SI VNx4SF]) +;; Fully-packed SVE floating-point vector modes that have 16-bit or 32-bit +;; elements. +(define_mode_iterator SVE_FULL_HSF [VNx8HF VNx4SF]) -;; All SVE vector modes that have 64-bit elements. -(define_mode_iterator SVE_D [VNx2DI VNx2DF]) +;; Fully-packed SVE vector modes that have 32-bit or 64-bit elements. +(define_mode_iterator SVE_FULL_SD [VNx4SI VNx2DI VNx4SF VNx2DF]) -;; All SVE integer vector modes that have 32-bit or 64-bit elements. -(define_mode_iterator SVE_SDI [VNx4SI VNx2DI]) +;; Fully-packed SVE integer vector modes that have 32-bit or 64-bit elements. +(define_mode_iterator SVE_FULL_SDI [VNx4SI VNx2DI]) -;; All SVE integer vector modes. -(define_mode_iterator SVE_I [VNx16QI VNx8HI VNx4SI VNx2DI]) +;; Fully-packed SVE floating-point vector modes that have 32-bit or 64-bit +;; elements. +(define_mode_iterator SVE_FULL_SDF [VNx4SF VNx2DF]) -;; All SVE floating-point vector modes. -(define_mode_iterator SVE_F [VNx8HF VNx4SF VNx2DF]) +;; Same, but with the appropriate conditions for FMMLA support. +(define_mode_iterator SVE_MATMULF [(VNx4SF "TARGET_SVE_F32MM") + (VNx2DF "TARGET_SVE_F64MM")]) + +;; Fully-packed SVE vector modes that have 32-bit elements. 
+(define_mode_iterator SVE_FULL_S [VNx4SI VNx4SF]) + +;; Fully-packed SVE vector modes that have 64-bit elements. +(define_mode_iterator SVE_FULL_D [VNx2DI VNx2DF]) + +;; All partial SVE modes. +(define_mode_iterator SVE_PARTIAL [VNx2QI + VNx4QI VNx2HI + VNx8QI VNx4HI VNx2SI]) + +;; Modes involved in extending or truncating SVE data, for 8 elements per +;; 128-bit block. +(define_mode_iterator VNx8_NARROW [VNx8QI]) +(define_mode_iterator VNx8_WIDE [VNx8HI]) + +;; ...same for 4 elements per 128-bit block. +(define_mode_iterator VNx4_NARROW [VNx4QI VNx4HI]) +(define_mode_iterator VNx4_WIDE [VNx4SI]) + +;; ...same for 2 elements per 128-bit block. +(define_mode_iterator VNx2_NARROW [VNx2QI VNx2HI VNx2SI]) +(define_mode_iterator VNx2_WIDE [VNx2DI]) ;; All SVE predicate modes. (define_mode_iterator PRED_ALL [VNx16BI VNx8BI VNx4BI VNx2BI]) @@ -325,6 +395,12 @@ ;; SVE predicate modes that control 8-bit, 16-bit or 32-bit elements. (define_mode_iterator PRED_BHS [VNx16BI VNx8BI VNx4BI]) +;; SVE predicate modes that control 16-bit, 32-bit or 64-bit elements. +(define_mode_iterator PRED_HSD [VNx8BI VNx4BI VNx2BI]) + +;; Bfloat16 modes to which V4SF can be converted +(define_mode_iterator V4SF_TO_BF [V4BF V8BF]) + ;; ------------------------------------------------------------------ ;; Unspec enumerations for Advance SIMD. These could well go into ;; aarch64.md but for their use in int_iterators here. @@ -365,6 +441,10 @@ UNSPEC_RSUBHN2 ; Used in aarch64-simd.md. UNSPEC_SQDMULH ; Used in aarch64-simd.md. UNSPEC_SQRDMULH ; Used in aarch64-simd.md. + UNSPEC_SMULLB ; Used in aarch64-sve2.md. + UNSPEC_SMULLT ; Used in aarch64-sve2.md. + UNSPEC_UMULLB ; Used in aarch64-sve2.md. + UNSPEC_UMULLT ; Used in aarch64-sve2.md. UNSPEC_PMUL ; Used in aarch64-simd.md. UNSPEC_FMULX ; Used in aarch64-simd.md. UNSPEC_USQADD ; Used in aarch64-simd.md. @@ -387,6 +467,10 @@ UNSPEC_UQSHRN ; Used in aarch64-simd.md. UNSPEC_SQRSHRN ; Used in aarch64-simd.md. UNSPEC_UQRSHRN ; Used in aarch64-simd.md. + UNSPEC_SHRNB ; Used in aarch64-sve2.md. + UNSPEC_SHRNT ; Used in aarch64-sve2.md. + UNSPEC_RSHRNB ; Used in aarch64-sve2.md. + UNSPEC_RSHRNT ; Used in aarch64-sve2.md. UNSPEC_SSHL ; Used in aarch64-simd.md. UNSPEC_USHL ; Used in aarch64-simd.md. UNSPEC_SRSHL ; Used in aarch64-simd.md. @@ -459,38 +543,126 @@ UNSPEC_FMLSL ; Used in aarch64-simd.md. UNSPEC_FMLAL2 ; Used in aarch64-simd.md. UNSPEC_FMLSL2 ; Used in aarch64-simd.md. + UNSPEC_ADR ; Used in aarch64-sve.md. UNSPEC_SEL ; Used in aarch64-sve.md. + UNSPEC_BRKA ; Used in aarch64-sve.md. + UNSPEC_BRKB ; Used in aarch64-sve.md. + UNSPEC_BRKN ; Used in aarch64-sve.md. + UNSPEC_BRKPA ; Used in aarch64-sve.md. + UNSPEC_BRKPB ; Used in aarch64-sve.md. + UNSPEC_PFIRST ; Used in aarch64-sve.md. + UNSPEC_PNEXT ; Used in aarch64-sve.md. + UNSPEC_CNTP ; Used in aarch64-sve.md. + UNSPEC_SADDV ; Used in aarch64-sve.md. + UNSPEC_UADDV ; Used in aarch64-sve.md. UNSPEC_ANDV ; Used in aarch64-sve.md. UNSPEC_IORV ; Used in aarch64-sve.md. UNSPEC_XORV ; Used in aarch64-sve.md. UNSPEC_ANDF ; Used in aarch64-sve.md. UNSPEC_IORF ; Used in aarch64-sve.md. UNSPEC_XORF ; Used in aarch64-sve.md. + UNSPEC_REVB ; Used in aarch64-sve.md. + UNSPEC_REVH ; Used in aarch64-sve.md. + UNSPEC_REVW ; Used in aarch64-sve.md. UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md. UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md. - UNSPEC_COND_ADD ; Used in aarch64-sve.md. - UNSPEC_COND_SUB ; Used in aarch64-sve.md. - UNSPEC_COND_MUL ; Used in aarch64-sve.md. - UNSPEC_COND_DIV ; Used in aarch64-sve.md. 
- UNSPEC_COND_MAX ; Used in aarch64-sve.md. - UNSPEC_COND_MIN ; Used in aarch64-sve.md. + UNSPEC_FMLA ; Used in aarch64-sve.md. + UNSPEC_FMLS ; Used in aarch64-sve.md. + UNSPEC_FEXPA ; Used in aarch64-sve.md. + UNSPEC_FMMLA ; Used in aarch64-sve.md. + UNSPEC_FTMAD ; Used in aarch64-sve.md. + UNSPEC_FTSMUL ; Used in aarch64-sve.md. + UNSPEC_FTSSEL ; Used in aarch64-sve.md. + UNSPEC_SMATMUL ; Used in aarch64-sve.md. + UNSPEC_UMATMUL ; Used in aarch64-sve.md. + UNSPEC_USMATMUL ; Used in aarch64-sve.md. + UNSPEC_TRN1Q ; Used in aarch64-sve.md. + UNSPEC_TRN2Q ; Used in aarch64-sve.md. + UNSPEC_UZP1Q ; Used in aarch64-sve.md. + UNSPEC_UZP2Q ; Used in aarch64-sve.md. + UNSPEC_ZIP1Q ; Used in aarch64-sve.md. + UNSPEC_ZIP2Q ; Used in aarch64-sve.md. + UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPHI_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPHS_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPLE_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPLO_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPLS_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPLT_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_CMPNE_WIDE ; Used in aarch64-sve.md. + UNSPEC_COND_FABS ; Used in aarch64-sve.md. + UNSPEC_COND_FADD ; Used in aarch64-sve.md. + UNSPEC_COND_FCADD90 ; Used in aarch64-sve.md. + UNSPEC_COND_FCADD270 ; Used in aarch64-sve.md. + UNSPEC_COND_FCMEQ ; Used in aarch64-sve.md. + UNSPEC_COND_FCMGE ; Used in aarch64-sve.md. + UNSPEC_COND_FCMGT ; Used in aarch64-sve.md. + UNSPEC_COND_FCMLA ; Used in aarch64-sve.md. + UNSPEC_COND_FCMLA90 ; Used in aarch64-sve.md. + UNSPEC_COND_FCMLA180 ; Used in aarch64-sve.md. + UNSPEC_COND_FCMLA270 ; Used in aarch64-sve.md. + UNSPEC_COND_FCMLE ; Used in aarch64-sve.md. + UNSPEC_COND_FCMLT ; Used in aarch64-sve.md. + UNSPEC_COND_FCMNE ; Used in aarch64-sve.md. + UNSPEC_COND_FCMUO ; Used in aarch64-sve.md. + UNSPEC_COND_FCVT ; Used in aarch64-sve.md. + UNSPEC_COND_FCVTZS ; Used in aarch64-sve.md. + UNSPEC_COND_FCVTZU ; Used in aarch64-sve.md. + UNSPEC_COND_FDIV ; Used in aarch64-sve.md. + UNSPEC_COND_FMAX ; Used in aarch64-sve.md. + UNSPEC_COND_FMAXNM ; Used in aarch64-sve.md. + UNSPEC_COND_FMIN ; Used in aarch64-sve.md. + UNSPEC_COND_FMINNM ; Used in aarch64-sve.md. UNSPEC_COND_FMLA ; Used in aarch64-sve.md. UNSPEC_COND_FMLS ; Used in aarch64-sve.md. + UNSPEC_COND_FMUL ; Used in aarch64-sve.md. + UNSPEC_COND_FMULX ; Used in aarch64-sve.md. + UNSPEC_COND_FNEG ; Used in aarch64-sve.md. UNSPEC_COND_FNMLA ; Used in aarch64-sve.md. UNSPEC_COND_FNMLS ; Used in aarch64-sve.md. - UNSPEC_COND_LT ; Used in aarch64-sve.md. - UNSPEC_COND_LE ; Used in aarch64-sve.md. - UNSPEC_COND_EQ ; Used in aarch64-sve.md. - UNSPEC_COND_NE ; Used in aarch64-sve.md. - UNSPEC_COND_GE ; Used in aarch64-sve.md. - UNSPEC_COND_GT ; Used in aarch64-sve.md. + UNSPEC_COND_FRECPX ; Used in aarch64-sve.md. + UNSPEC_COND_FRINTA ; Used in aarch64-sve.md. + UNSPEC_COND_FRINTI ; Used in aarch64-sve.md. + UNSPEC_COND_FRINTM ; Used in aarch64-sve.md. + UNSPEC_COND_FRINTN ; Used in aarch64-sve.md. + UNSPEC_COND_FRINTP ; Used in aarch64-sve.md. + UNSPEC_COND_FRINTX ; Used in aarch64-sve.md. + UNSPEC_COND_FRINTZ ; Used in aarch64-sve.md. + UNSPEC_COND_FSCALE ; Used in aarch64-sve.md. + UNSPEC_COND_FSQRT ; Used in aarch64-sve.md. + UNSPEC_COND_FSUB ; Used in aarch64-sve.md. + UNSPEC_COND_SCVTF ; Used in aarch64-sve.md. + UNSPEC_COND_UCVTF ; Used in aarch64-sve.md. + UNSPEC_LASTA ; Used in aarch64-sve.md. 
UNSPEC_LASTB ; Used in aarch64-sve.md. + UNSPEC_ASHIFT_WIDE ; Used in aarch64-sve.md. + UNSPEC_ASHIFTRT_WIDE ; Used in aarch64-sve.md. + UNSPEC_LSHIFTRT_WIDE ; Used in aarch64-sve.md. + UNSPEC_LDFF1 ; Used in aarch64-sve.md. + UNSPEC_LDNF1 ; Used in aarch64-sve.md. UNSPEC_FCADD90 ; Used in aarch64-simd.md. UNSPEC_FCADD270 ; Used in aarch64-simd.md. UNSPEC_FCMLA ; Used in aarch64-simd.md. UNSPEC_FCMLA90 ; Used in aarch64-simd.md. UNSPEC_FCMLA180 ; Used in aarch64-simd.md. UNSPEC_FCMLA270 ; Used in aarch64-simd.md. + UNSPEC_COND_FCVTNT ; Used in aarch64-sve2.md. + UNSPEC_SMULHS ; Used in aarch64-sve2.md. + UNSPEC_SMULHRS ; Used in aarch64-sve2.md. + UNSPEC_UMULHS ; Used in aarch64-sve2.md. + UNSPEC_UMULHRS ; Used in aarch64-sve2.md. + UNSPEC_ASRD ; Used in aarch64-sve.md. + UNSPEC_USDOT ; Used in aarch64-simd.md. + UNSPEC_SUDOT ; Used in aarch64-simd.md. + UNSPEC_BFDOT ; Used in aarch64-simd.md. + UNSPEC_BFMLALB ; Used in aarch64-sve.md. + UNSPEC_BFMLALT ; Used in aarch64-sve.md. + UNSPEC_BFMMLA ; Used in aarch64-sve.md. + UNSPEC_BFCVTN ; Used in aarch64-simd.md. + UNSPEC_BFCVTN2 ; Used in aarch64-simd.md. + UNSPEC_BFCVT ; Used in aarch64-simd.md. ]) ;; ------------------------------------------------------------------ @@ -586,6 +758,7 @@ (V2SI "2") (V4SI "4") (V2DI "2") (V4HF "4") (V8HF "8") + (V4BF "4") (V8BF "8") (V2SF "2") (V4SF "4") (V1DF "1") (V2DF "2") (DI "1") (DF "1")]) @@ -610,6 +783,14 @@ (define_mode_attr sizem1 [(QI "#7") (HI "#15") (SI "#31") (DI "#63") (HF "#15") (SF "#31") (DF "#63")]) +;; The number of bits in a vector element, or controlled by a predicate +;; element. +(define_mode_attr elem_bits [(VNx16BI "8") (VNx8BI "16") + (VNx4BI "32") (VNx2BI "64") + (VNx16QI "8") (VNx8HI "16") + (VNx4SI "32") (VNx2DI "64") + (VNx8HF "16") (VNx4SF "32") (VNx2DF "64")]) + ;; Attribute to describe constants acceptable in logical operations (define_mode_attr lconst [(SI "K") (DI "L")]) @@ -624,6 +805,7 @@ (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b") (V4HI "4h") (V8HI "8h") + (V4BF "4h") (V8BF "8h") (V2SI "2s") (V4SI "4s") (DI "1d") (DF "1d") (V2DI "2d") (V2SF "2s") @@ -637,7 +819,8 @@ (V4HI ".4h") (V8HI ".8h") (V2SI ".2s") (V4SI ".4s") (V2DI ".2d") (V4HF ".4h") - (V8HF ".8h") (V2SF ".2s") + (V8HF ".8h") (V4BF ".4h") + (V8BF ".8h") (V2SF ".2s") (V4SF ".4s") (V2DF ".2d") (DI "") (SI "") (HI "") (QI "") @@ -655,9 +838,10 @@ (V4HI "h") (V8HI "h") (VNx8HI "h") (VNx8BI "h") (V2SI "s") (V4SI "s") (VNx4SI "s") (VNx4BI "s") (V2DI "d") (VNx2DI "d") (VNx2BI "d") - (V4HF "h") (V8HF "h") (VNx8HF "h") + (V4HF "h") (V8HF "h") (VNx8HF "h") (VNx8BF "h") (V2SF "s") (V4SF "s") (VNx4SF "s") (V2DF "d") (VNx2DF "d") + (BF "h") (V4BF "h") (V8BF "h") (HF "h") (SF "s") (DF "d") (QI "b") (HI "h") @@ -667,13 +851,17 @@ (define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")]) ;; Equivalent of "size" for a vector element. 
-(define_mode_attr Vesize [(VNx16QI "b") - (VNx8HI "h") (VNx8HF "h") - (VNx4SI "w") (VNx4SF "w") +(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") + (VNx4QI "b") (VNx2QI "b") + (VNx8HI "h") (VNx4HI "h") + (VNx2HI "h") (VNx8HF "h") + (VNx4SI "w") (VNx2SI "w") (VNx4SF "w") (VNx2DI "d") (VNx2DF "d") (VNx32QI "b") (VNx48QI "b") (VNx64QI "b") (VNx16HI "h") (VNx24HI "h") (VNx32HI "h") (VNx16HF "h") (VNx24HF "h") (VNx32HF "h") + (VNx16BF "h") (VNx24BF "h") (VNx32BF "h") + (VNx8BF "h") (VNx8SI "w") (VNx12SI "w") (VNx16SI "w") (VNx8SF "w") (VNx12SF "w") (VNx16SF "w") (VNx4DI "d") (VNx6DI "d") (VNx8DI "d") @@ -697,13 +885,16 @@ (V8HF "16b") (V2SF "8b") (V4SF "16b") (V2DF "16b") (DI "8b") (DF "8b") - (SI "8b") (SF "8b")]) + (SI "8b") (SF "8b") + (V4BF "8b") (V8BF "16b")]) ;; Define element mode for each vector mode. (define_mode_attr VEL [(V8QI "QI") (V16QI "QI") (VNx16QI "QI") (V4HI "HI") (V8HI "HI") (VNx8HI "HI") (V2SI "SI") (V4SI "SI") (VNx4SI "SI") + (VNx8BF "BF") (DI "DI") (V2DI "DI") (VNx2DI "DI") + (V4BF "BF") (V8BF "BF") (V4HF "HF") (V8HF "HF") (VNx8HF "HF") (V2SF "SF") (V4SF "SF") (VNx4SF "SF") (DF "DF") (V2DF "DF") (VNx2DF "DF") @@ -713,8 +904,10 @@ ;; Define element mode for each vector mode (lower case). (define_mode_attr Vel [(V8QI "qi") (V16QI "qi") (VNx16QI "qi") (V4HI "hi") (V8HI "hi") (VNx8HI "hi") + (VNx8BF "bf") (V2SI "si") (V4SI "si") (VNx4SI "si") (DI "di") (V2DI "di") (VNx2DI "di") + (V4BF "bf") (V8BF "bf") (V4HF "hf") (V8HF "hf") (VNx8HF "hf") (V2SF "sf") (V4SF "sf") (VNx4SF "sf") (V2DF "df") (DF "df") (VNx2DF "df") @@ -723,19 +916,19 @@ ;; Element mode with floating-point values replaced by like-sized integers. (define_mode_attr VEL_INT [(VNx16QI "QI") - (VNx8HI "HI") (VNx8HF "HI") + (VNx8HI "HI") (VNx8HF "HI") (VNx8BF "HI") (VNx4SI "SI") (VNx4SF "SI") (VNx2DI "DI") (VNx2DF "DI")]) ;; Gives the mode of the 128-bit lowpart of an SVE vector. (define_mode_attr V128 [(VNx16QI "V16QI") - (VNx8HI "V8HI") (VNx8HF "V8HF") + (VNx8HI "V8HI") (VNx8HF "V8HF") (VNx8BF "V8BF") (VNx4SI "V4SI") (VNx4SF "V4SF") (VNx2DI "V2DI") (VNx2DF "V2DF")]) ;; ...and again in lower case. (define_mode_attr v128 [(VNx16QI "v16qi") - (VNx8HI "v8hi") (VNx8HF "v8hf") + (VNx8HI "v8hi") (VNx8HF "v8hf") (VNx8BF "v8bf") (VNx4SI "v4si") (VNx4SF "v4sf") (VNx2DI "v2di") (VNx2DF "v2df")]) @@ -763,19 +956,20 @@ (V2SI "SI") (V4SI "V2SI") (V2DI "DI") (V2SF "SF") (V4SF "V2SF") (V4HF "V2HF") - (V8HF "V4HF") (V2DF "DF")]) + (V8HF "V4HF") (V2DF "DF") + (V8BF "V4BF")]) ;; Half modes of all vector modes, in lower-case. (define_mode_attr Vhalf [(V8QI "v4qi") (V16QI "v8qi") (V4HI "v2hi") (V8HI "v4hi") - (V8HF "v4hf") + (V8HF "v4hf") (V8BF "v4bf") (V2SI "si") (V4SI "v2si") (V2DI "di") (V2SF "sf") (V4SF "v2sf") (V2DF "df")]) ;; Double modes of vector modes. (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI") - (V4HF "V8HF") + (V4HF "V8HF") (V4BF "V8BF") (V2SI "V4SI") (V2SF "V4SF") (SI "V2SI") (DI "V2DI") (DF "V2DF")]) @@ -785,7 +979,7 @@ ;; Double modes of vector modes (lower case). (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi") - (V4HF "v8hf") + (V4HF "v8hf") (V4BF "v8bf") (V2SI "v4si") (V2SF "v4sf") (SI "v2si") (DI "v2di") (DF "v2df")]) @@ -879,6 +1073,7 @@ ;; variation on mapping FP modes to GP regs. 
(define_mode_attr vwcore [(V8QI "w") (V16QI "w") (VNx16QI "w") (V4HI "w") (V8HI "w") (VNx8HI "w") + (VNx8BF "w") (V2SI "w") (V4SI "w") (VNx4SI "w") (DI "x") (V2DI "x") (VNx2DI "x") (V4HF "w") (V8HF "w") (VNx8HF "w") @@ -894,12 +1089,14 @@ (V2SI "V2SI") (V4SI "V4SI") (DI "DI") (V2DI "V2DI") (V4HF "V4HI") (V8HF "V8HI") + (V4BF "V4HI") (V8BF "V8HI") (V2SF "V2SI") (V4SF "V4SI") (DF "DI") (V2DF "V2DI") (SF "SI") (SI "SI") (HF "HI") (VNx16QI "VNx16QI") (VNx8HI "VNx8HI") (VNx8HF "VNx8HI") + (VNx8BF "VNx8HI") (VNx4SI "VNx4SI") (VNx4SF "VNx4SI") (VNx2DI "VNx2DI") (VNx2DF "VNx2DI") ]) @@ -910,19 +1107,25 @@ (V2SI "v2si") (V4SI "v4si") (DI "di") (V2DI "v2di") (V4HF "v4hi") (V8HF "v8hi") + (V4BF "v4hi") (V8BF "v8hi") (V2SF "v2si") (V4SF "v4si") (DF "di") (V2DF "v2di") (SF "si") (VNx16QI "vnx16qi") (VNx8HI "vnx8hi") (VNx8HF "vnx8hi") + (VNx8BF "vnx8hi") (VNx4SI "vnx4si") (VNx4SF "vnx4si") (VNx2DI "vnx2di") (VNx2DF "vnx2di") ]) ;; Floating-point equivalent of selected modes. -(define_mode_attr V_FP_EQUIV [(VNx4SI "VNx4SF") (VNx4SF "VNx4SF") +(define_mode_attr V_FP_EQUIV [(VNx8HI "VNx8HF") (VNx8HF "VNx8HF") + (VNx8BF "VNx8HF") + (VNx4SI "VNx4SF") (VNx4SF "VNx4SF") (VNx2DI "VNx2DF") (VNx2DF "VNx2DF")]) -(define_mode_attr v_fp_equiv [(VNx4SI "vnx4sf") (VNx4SF "vnx4sf") +(define_mode_attr v_fp_equiv [(VNx8HI "vnx8hf") (VNx8HF "vnx8hf") + (VNx8BF "vnx8hf") + (VNx4SI "vnx4sf") (VNx4SF "vnx4sf") (VNx2DI "vnx2df") (VNx2DF "vnx2df")]) ;; Mode for vector conditional operations where the comparison has @@ -976,6 +1179,7 @@ (define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI") (V4HI "V8HI") (V8HI "V4HI") + (V8BF "V4BF") (V4BF "V8BF") (V2SI "V4SI") (V4SI "V2SI") (DI "V2DI") (V2DI "DI") (V2SF "V4SF") (V4SF "V2SF") @@ -988,6 +1192,7 @@ (DI "to_128") (V2DI "to_64") (V4HF "to_128") (V8HF "to_64") (V2SF "to_128") (V4SF "to_64") + (V4BF "to_128") (V8BF "to_64") (DF "to_128") (V2DF "to_64")]) ;; For certain vector-by-element multiplication instructions we must @@ -1021,9 +1226,11 @@ ;; Defined to '_q' for 128-bit types. (define_mode_attr q [(V8QI "") (V16QI "_q") (V4HI "") (V8HI "_q") + (V4BF "") (V8BF "_q") (V2SI "") (V4SI "_q") (DI "") (V2DI "_q") (V4HF "") (V8HF "_q") + (V4BF "") (V8BF "_q") (V2SF "") (V4SF "_q") (V2DF "_q") (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")]) @@ -1044,6 +1251,9 @@ ;; Register suffix for DOTPROD input types from the return type. (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")]) +;; Register suffix for BFDOT input types from the return type. +(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")]) + ;; Sum of lengths of instructions needed to move vector registers of a mode. (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")]) @@ -1054,63 +1264,83 @@ ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")]) +;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub +(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")]) + (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")]) (define_mode_attr f16quad [(V2SF "") (V4SF "q")]) +(define_mode_attr isquadop [(V8QI "") (V16QI "q") (V4BF "") (V8BF "q")]) + (define_code_attr f16mac [(plus "a") (minus "s")]) ;; Map smax to smin and umax to umin. (define_code_attr max_opp [(smax "smin") (umax "umin")]) +;; Same as above, but louder. +(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")]) + ;; The number of subvectors in an SVE_STRUCT. 
(define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2") (VNx8SI "2") (VNx4DI "2") + (VNx16BF "2") (VNx16HF "2") (VNx8SF "2") (VNx4DF "2") (VNx48QI "3") (VNx24HI "3") (VNx12SI "3") (VNx6DI "3") + (VNx24BF "3") (VNx24HF "3") (VNx12SF "3") (VNx6DF "3") (VNx64QI "4") (VNx32HI "4") (VNx16SI "4") (VNx8DI "4") + (VNx32BF "4") (VNx32HF "4") (VNx16SF "4") (VNx8DF "4")]) ;; The number of instruction bytes needed for an SVE_STRUCT move. This is ;; equal to vector_count * 4. (define_mode_attr insn_length [(VNx32QI "8") (VNx16HI "8") (VNx8SI "8") (VNx4DI "8") + (VNx16BF "8") (VNx16HF "8") (VNx8SF "8") (VNx4DF "8") (VNx48QI "12") (VNx24HI "12") (VNx12SI "12") (VNx6DI "12") + (VNx24BF "12") (VNx24HF "12") (VNx12SF "12") (VNx6DF "12") (VNx64QI "16") (VNx32HI "16") (VNx16SI "16") (VNx8DI "16") + (VNx32BF "16") (VNx32HF "16") (VNx16SF "16") (VNx8DF "16")]) ;; The type of a subvector in an SVE_STRUCT. (define_mode_attr VSINGLE [(VNx32QI "VNx16QI") (VNx16HI "VNx8HI") (VNx16HF "VNx8HF") + (VNx16BF "VNx8BF") (VNx8SI "VNx4SI") (VNx8SF "VNx4SF") (VNx4DI "VNx2DI") (VNx4DF "VNx2DF") (VNx48QI "VNx16QI") (VNx24HI "VNx8HI") (VNx24HF "VNx8HF") + (VNx24BF "VNx8BF") (VNx12SI "VNx4SI") (VNx12SF "VNx4SF") (VNx6DI "VNx2DI") (VNx6DF "VNx2DF") (VNx64QI "VNx16QI") (VNx32HI "VNx8HI") (VNx32HF "VNx8HF") + (VNx32BF "VNx8BF") (VNx16SI "VNx4SI") (VNx16SF "VNx4SF") (VNx8DI "VNx2DI") (VNx8DF "VNx2DF")]) ;; ...and again in lower case. (define_mode_attr vsingle [(VNx32QI "vnx16qi") (VNx16HI "vnx8hi") (VNx16HF "vnx8hf") + (VNx16BF "vnx8bf") (VNx8SI "vnx4si") (VNx8SF "vnx4sf") (VNx4DI "vnx2di") (VNx4DF "vnx2df") (VNx48QI "vnx16qi") (VNx24HI "vnx8hi") (VNx24HF "vnx8hf") + (VNx24BF "vnx8bf") (VNx12SI "vnx4si") (VNx12SF "vnx4sf") (VNx6DI "vnx2di") (VNx6DF "vnx2df") (VNx64QI "vnx16qi") (VNx32HI "vnx8hi") (VNx32HF "vnx8hf") + (VNx32BF "vnx8bf") (VNx16SI "vnx4si") (VNx16SF "vnx4sf") (VNx8DI "vnx2di") (VNx8DF "vnx2df")]) @@ -1118,36 +1348,44 @@ ;; this is equivalent to the of the subvector mode. (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8HI "VNx8BI") (VNx8HF "VNx8BI") + (VNx8BF "VNx8BI") (VNx4SI "VNx4BI") (VNx4SF "VNx4BI") (VNx2DI "VNx2BI") (VNx2DF "VNx2BI") (VNx32QI "VNx16BI") (VNx16HI "VNx8BI") (VNx16HF "VNx8BI") + (VNx16BF "VNx8BI") (VNx8SI "VNx4BI") (VNx8SF "VNx4BI") (VNx4DI "VNx2BI") (VNx4DF "VNx2BI") (VNx48QI "VNx16BI") (VNx24HI "VNx8BI") (VNx24HF "VNx8BI") + (VNx24BF "VNx8BI") (VNx12SI "VNx4BI") (VNx12SF "VNx4BI") (VNx6DI "VNx2BI") (VNx6DF "VNx2BI") (VNx64QI "VNx16BI") (VNx32HI "VNx8BI") (VNx32HF "VNx8BI") + (VNx32BF "VNx8BI") (VNx16SI "VNx4BI") (VNx16SF "VNx4BI") (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")]) ;; ...and again in lower case. (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8HI "vnx8bi") (VNx8HF "vnx8bi") + (VNx8BF "vnx8bi") (VNx4SI "vnx4bi") (VNx4SF "vnx4bi") (VNx2DI "vnx2bi") (VNx2DF "vnx2bi") (VNx32QI "vnx16bi") (VNx16HI "vnx8bi") (VNx16HF "vnx8bi") + (VNx16BF "vnx8bi") (VNx8SI "vnx4bi") (VNx8SF "vnx4bi") (VNx4DI "vnx2bi") (VNx4DF "vnx2bi") (VNx48QI "vnx16bi") (VNx24HI "vnx8bi") (VNx24HF "vnx8bi") + (VNx24BF "vnx8bi") (VNx12SI "vnx4bi") (VNx12SF "vnx4bi") (VNx6DI "vnx2bi") (VNx6DF "vnx2bi") (VNx64QI "vnx16bi") (VNx32HI "vnx8bi") (VNx32HF "vnx4bi") + (VNx32BF "vnx8bi") (VNx16SI "vnx4bi") (VNx16SF "vnx4bi") (VNx8DI "vnx2bi") (VNx8DF "vnx2bi")]) @@ -1158,6 +1396,30 @@ (V4HF "[%4]") (V8HF "[%4]") ]) +;; The number of bytes controlled by a predicate +(define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2") + (VNx4BI "4") (VNx2BI "8")]) + +;; Two-nybble mask for partial vector modes: nunits, byte size. 
+(define_mode_attr self_mask [(VNx8QI "0x81") + (VNx4QI "0x41") + (VNx2QI "0x21") + (VNx4HI "0x42") + (VNx2HI "0x22") + (VNx2SI "0x24")]) + +;; For full vector modes, the mask of narrower modes, encoded as above. +(define_mode_attr narrower_mask [(VNx8HI "0x81") + (VNx4SI "0x43") + (VNx2DI "0x27")]) + +;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index. +(define_mode_attr sve_lane_con [(VNx4SI "y") (VNx2DI "x") + (VNx8HF "y") (VNx4SF "y") (VNx2DF "x")]) + +;; The constraint to use for an SVE FCMLA lane index. +(define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")]) + ;; ------------------------------------------------------------------- ;; Code Iterators ;; ------------------------------------------------------------------- @@ -1168,6 +1430,8 @@ ;; This code iterator allows the shifts supported in arithmetic instructions (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt]) +(define_code_iterator SHIFTRT [ashiftrt lshiftrt]) + ;; Code iterator for logical operations (define_code_iterator LOGICAL [and ior xor]) @@ -1214,7 +1478,7 @@ ;; Signed and unsigned max operations. (define_code_iterator USMAX [smax umax]) -;; Code iterator for variants of vector max and min. +;; Code iterator for plus and minus. (define_code_iterator ADDSUB [plus minus]) ;; Code iterator for variants of vector saturating binary ops. @@ -1226,6 +1490,21 @@ ;; Code iterator for signed variants of vector saturating binary ops. (define_code_iterator SBINQOPS [ss_plus ss_minus]) +;; Code iterator for unsigned variants of vector saturating binary ops. +(define_code_iterator UBINQOPS [us_plus us_minus]) + +;; Modular and saturating addition. +(define_code_iterator ANY_PLUS [plus ss_plus us_plus]) + +;; Saturating addition. +(define_code_iterator SAT_PLUS [ss_plus us_plus]) + +;; Modular and saturating subtraction. +(define_code_iterator ANY_MINUS [minus ss_minus us_minus]) + +;; Saturating subtraction. +(define_code_iterator SAT_MINUS [ss_minus us_minus]) + ;; Comparison operators for CM. (define_code_iterator COMPARISONS [lt le eq ge gt]) @@ -1236,27 +1515,25 @@ (define_code_iterator FAC_COMPARISONS [lt le ge gt]) ;; SVE integer unary operations. -(define_code_iterator SVE_INT_UNARY [abs neg not popcount]) - -;; SVE floating-point unary operations. -(define_code_iterator SVE_FP_UNARY [abs neg sqrt]) +(define_code_iterator SVE_INT_UNARY [abs neg not clrsb clz popcount]) ;; SVE integer binary operations. (define_code_iterator SVE_INT_BINARY [plus minus mult smax umax smin umin + ashift ashiftrt lshiftrt and ior xor]) ;; SVE integer binary division operations. (define_code_iterator SVE_INT_BINARY_SD [div udiv]) +;; SVE integer binary operations that have an immediate form. +(define_code_iterator SVE_INT_BINARY_IMM [mult smax smin umax umin]) + ;; SVE floating-point operations with an unpredicated all-register form. (define_code_iterator SVE_UNPRED_FP_BINARY [plus minus mult]) ;; SVE integer comparisons. (define_code_iterator SVE_INT_CMP [lt le eq ne ge gt ltu leu geu gtu]) -;; SVE floating-point comparisons. 
-(define_code_iterator SVE_FP_CMP [lt le eq ne ge gt]) - ;; ------------------------------------------------------------------- ;; Code Attributes ;; ------------------------------------------------------------------- @@ -1273,6 +1550,8 @@ (unsigned_fix "fixuns") (float "float") (unsigned_float "floatuns") + (clrsb "clrsb") + (clz "clz") (popcount "popcount") (and "and") (ior "ior") @@ -1304,8 +1583,7 @@ (leu "leu") (geu "geu") (gtu "gtu") - (abs "abs") - (sqrt "sqrt")]) + (abs "abs")]) ;; For comparison operators we use the FCM* and CM* instructions. ;; As there are no CMLE or CMLT instructions which act on 3 vector @@ -1350,6 +1628,9 @@ (define_code_attr shift [(ashift "lsl") (ashiftrt "asr") (lshiftrt "lsr") (rotatert "ror")]) +;; Op prefix for shift right and accumulate. +(define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")]) + ;; Map shift operators onto underlying bit-field instructions (define_code_attr bfshift [(ashift "ubfiz") (ashiftrt "sbfx") (lshiftrt "ubfx") (rotatert "extr")]) @@ -1374,6 +1655,15 @@ (smax "s") (umax "u") (smin "s") (umin "u")]) +;; "s" for signed ops, empty for unsigned ones. +(define_code_attr s [(sign_extend "s") (zero_extend "")]) + +;; Map signed/unsigned ops to the corresponding extension. +(define_code_attr paired_extend [(ss_plus "sign_extend") + (us_plus "zero_extend") + (ss_minus "sign_extend") + (us_minus "zero_extend")]) + ;; Whether a shift is left or right. (define_code_attr lr [(ashift "l") (ashiftrt "r") (lshiftrt "r")]) @@ -1434,35 +1724,45 @@ (smax "smax") (umin "umin") (umax "umax") + (ashift "lsl") + (ashiftrt "asr") + (lshiftrt "lsr") (and "and") (ior "orr") (xor "eor") (not "not") + (clrsb "cls") + (clz "clz") (popcount "cnt")]) (define_code_attr sve_int_op_rev [(plus "add") - (minus "subr") - (mult "mul") - (div "sdivr") - (udiv "udivr") - (smin "smin") - (smax "smax") - (umin "umin") - (umax "umax") - (and "and") - (ior "orr") - (xor "eor")]) + (minus "subr") + (mult "mul") + (div "sdivr") + (udiv "udivr") + (smin "smin") + (smax "smax") + (umin "umin") + (umax "umax") + (ashift "lslr") + (ashiftrt "asrr") + (lshiftrt "lsrr") + (and "and") + (ior "orr") + (xor "eor")]) ;; The floating-point SVE instruction that implements an rtx code. (define_code_attr sve_fp_op [(plus "fadd") (minus "fsub") - (mult "fmul") - (neg "fneg") - (abs "fabs") - (sqrt "fsqrt")]) + (mult "fmul")]) ;; The SVE immediate constraint to use for an rtl code. -(define_code_attr sve_imm_con [(eq "vsc") +(define_code_attr sve_imm_con [(mult "vsm") + (smax "vsm") + (smin "vsm") + (umax "vsb") + (umin "vsb") + (eq "vsc") (ne "vsc") (lt "vsc") (ge "vsc") @@ -1473,6 +1773,33 @@ (geu "vsd") (gtu "vsd")]) +;; The prefix letter to use when printing an immediate operand. +(define_code_attr sve_imm_prefix [(mult "") + (smax "") + (smin "") + (umax "D") + (umin "D")]) + +;; The predicate to use for the second input operand in a cond_ +;; pattern. 
+(define_code_attr sve_pred_int_rhs2_operand + [(plus "register_operand") + (minus "register_operand") + (mult "register_operand") + (smax "register_operand") + (umax "register_operand") + (smin "register_operand") + (umin "register_operand") + (ashift "aarch64_sve_lshift_operand") + (ashiftrt "aarch64_sve_rshift_operand") + (lshiftrt "aarch64_sve_rshift_operand") + (and "aarch64_sve_pred_and_operand") + (ior "register_operand") + (xor "register_operand")]) + +(define_code_attr inc_dec [(minus "dec") (ss_minus "sqdec") (us_minus "uqdec") + (plus "inc") (ss_plus "sqinc") (us_plus "uqinc")]) + ;; ------------------------------------------------------------------- ;; Int Iterators. ;; ------------------------------------------------------------------- @@ -1492,7 +1819,7 @@ (define_int_iterator FMAXMINV [UNSPEC_FMAXV UNSPEC_FMINV UNSPEC_FMAXNMV UNSPEC_FMINNMV]) -(define_int_iterator BITWISEV [UNSPEC_ANDV UNSPEC_IORV UNSPEC_XORV]) +(define_int_iterator SVE_INT_ADDV [UNSPEC_SADDV UNSPEC_UADDV]) (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF]) @@ -1505,8 +1832,20 @@ (define_int_iterator RHADD [UNSPEC_SRHADD UNSPEC_URHADD]) +(define_int_iterator MULLBT [UNSPEC_SMULLB UNSPEC_UMULLB + UNSPEC_SMULLT UNSPEC_UMULLT]) + +(define_int_iterator SHRNB [UNSPEC_SHRNB UNSPEC_RSHRNB]) + +(define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT]) + +(define_int_iterator BSL_DUP [1 2]) + (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT]) +(define_int_iterator DOTPROD_I8MM [UNSPEC_USDOT UNSPEC_SUDOT]) +(define_int_iterator DOTPROD_US_ONLY [UNSPEC_USDOT]) + (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN UNSPEC_SUBHN UNSPEC_RSUBHN]) @@ -1516,12 +1855,17 @@ (define_int_iterator FMAXMIN_UNS [UNSPEC_FMAX UNSPEC_FMIN UNSPEC_FMAXNM UNSPEC_FMINNM]) -(define_int_iterator PAUTH_LR_SP [UNSPEC_PACISP UNSPEC_AUTISP]) +(define_int_iterator PAUTH_LR_SP [UNSPEC_PACIASP UNSPEC_AUTIASP + UNSPEC_PACIBSP UNSPEC_AUTIBSP]) -(define_int_iterator PAUTH_17_16 [UNSPEC_PACI1716 UNSPEC_AUTI1716]) +(define_int_iterator PAUTH_17_16 [UNSPEC_PACIA1716 UNSPEC_AUTIA1716 + UNSPEC_PACIB1716 UNSPEC_AUTIB1716]) (define_int_iterator VQDMULH [UNSPEC_SQDMULH UNSPEC_SQRDMULH]) +(define_int_iterator MULHRS [UNSPEC_SMULHS UNSPEC_UMULHS + UNSPEC_SMULHRS UNSPEC_UMULHRS]) + (define_int_iterator USSUQADD [UNSPEC_SUQADD UNSPEC_USQADD]) (define_int_iterator SUQMOVN [UNSPEC_SQXTN UNSPEC_UQXTN]) @@ -1555,6 +1899,10 @@ UNSPEC_TRN1 UNSPEC_TRN2 UNSPEC_UZP1 UNSPEC_UZP2]) +(define_int_iterator PERMUTEQ [UNSPEC_ZIP1Q UNSPEC_ZIP2Q + UNSPEC_TRN1Q UNSPEC_TRN2Q + UNSPEC_UZP1Q UNSPEC_UZP2Q]) + (define_int_iterator OPTAB_PERMUTE [UNSPEC_ZIP1 UNSPEC_ZIP2 UNSPEC_UZP1 UNSPEC_UZP2]) @@ -1601,18 +1949,144 @@ (define_int_iterator MUL_HIGHPART [UNSPEC_SMUL_HIGHPART UNSPEC_UMUL_HIGHPART]) -(define_int_iterator SVE_COND_FP_BINARY [UNSPEC_COND_ADD UNSPEC_COND_SUB - UNSPEC_COND_MUL UNSPEC_COND_DIV - UNSPEC_COND_MAX UNSPEC_COND_MIN]) +(define_int_iterator CLAST [UNSPEC_CLASTA UNSPEC_CLASTB]) + +(define_int_iterator LAST [UNSPEC_LASTA UNSPEC_LASTB]) + +(define_int_iterator SVE_INT_UNARY [UNSPEC_RBIT UNSPEC_REVB + UNSPEC_REVH UNSPEC_REVW]) + +(define_int_iterator SVE_FP_UNARY [UNSPEC_FRECPE UNSPEC_RSQRTE]) + +(define_int_iterator SVE_FP_UNARY_INT [UNSPEC_FEXPA]) + +(define_int_iterator SVE_FP_BINARY [UNSPEC_FRECPS UNSPEC_RSQRTS]) + +(define_int_iterator SVE_FP_BINARY_INT [UNSPEC_FTSMUL UNSPEC_FTSSEL]) + +(define_int_iterator SVE_BFLOAT_TERNARY_LONG [UNSPEC_BFDOT + UNSPEC_BFMLALB + UNSPEC_BFMLALT + UNSPEC_BFMMLA]) + +(define_int_iterator 
SVE_BFLOAT_TERNARY_LONG_LANE [UNSPEC_BFDOT + UNSPEC_BFMLALB + UNSPEC_BFMLALT]) + +(define_int_iterator SVE_INT_REDUCTION [UNSPEC_ANDV + UNSPEC_IORV + UNSPEC_SMAXV + UNSPEC_SMINV + UNSPEC_UMAXV + UNSPEC_UMINV + UNSPEC_XORV]) + +(define_int_iterator SVE_FP_REDUCTION [UNSPEC_FADDV + UNSPEC_FMAXV + UNSPEC_FMAXNMV + UNSPEC_FMINV + UNSPEC_FMINNMV]) + +(define_int_iterator SVE_COND_FP_UNARY [UNSPEC_COND_FABS + UNSPEC_COND_FNEG + UNSPEC_COND_FRECPX + UNSPEC_COND_FRINTA + UNSPEC_COND_FRINTI + UNSPEC_COND_FRINTM + UNSPEC_COND_FRINTN + UNSPEC_COND_FRINTP + UNSPEC_COND_FRINTX + UNSPEC_COND_FRINTZ + UNSPEC_COND_FSQRT]) + +(define_int_iterator SVE_COND_FCVT [UNSPEC_COND_FCVT]) +(define_int_iterator SVE_COND_FCVTI [UNSPEC_COND_FCVTZS UNSPEC_COND_FCVTZU]) +(define_int_iterator SVE_COND_ICVTF [UNSPEC_COND_SCVTF UNSPEC_COND_UCVTF]) + +(define_int_iterator SVE_COND_FP_BINARY [UNSPEC_COND_FADD + UNSPEC_COND_FDIV + UNSPEC_COND_FMAX + UNSPEC_COND_FMAXNM + UNSPEC_COND_FMIN + UNSPEC_COND_FMINNM + UNSPEC_COND_FMUL + UNSPEC_COND_FMULX + UNSPEC_COND_FSUB]) + +(define_int_iterator SVE_COND_FP_BINARY_INT [UNSPEC_COND_FSCALE]) + +(define_int_iterator SVE_COND_FP_ADD [UNSPEC_COND_FADD]) +(define_int_iterator SVE_COND_FP_SUB [UNSPEC_COND_FSUB]) +(define_int_iterator SVE_COND_FP_MUL [UNSPEC_COND_FMUL]) + +(define_int_iterator SVE_COND_FP_BINARY_I1 [UNSPEC_COND_FMAX + UNSPEC_COND_FMAXNM + UNSPEC_COND_FMIN + UNSPEC_COND_FMINNM + UNSPEC_COND_FMUL]) + +(define_int_iterator SVE_COND_FP_BINARY_REG [UNSPEC_COND_FDIV + UNSPEC_COND_FMULX]) + +(define_int_iterator SVE_COND_FCADD [UNSPEC_COND_FCADD90 + UNSPEC_COND_FCADD270]) + +(define_int_iterator SVE_COND_FP_MAXMIN [UNSPEC_COND_FMAX + UNSPEC_COND_FMAXNM + UNSPEC_COND_FMIN + UNSPEC_COND_FMINNM]) + +;; Floating-point max/min operations that correspond to optabs, +;; as opposed to those that are internal to the port. +(define_int_iterator SVE_COND_FP_MAXMIN_PUBLIC [UNSPEC_COND_FMAXNM + UNSPEC_COND_FMINNM]) (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS UNSPEC_COND_FNMLA UNSPEC_COND_FNMLS]) -(define_int_iterator SVE_COND_FP_CMP [UNSPEC_COND_LT UNSPEC_COND_LE - UNSPEC_COND_EQ UNSPEC_COND_NE - UNSPEC_COND_GE UNSPEC_COND_GT]) +(define_int_iterator SVE_COND_FCMLA [UNSPEC_COND_FCMLA + UNSPEC_COND_FCMLA90 + UNSPEC_COND_FCMLA180 + UNSPEC_COND_FCMLA270]) + +(define_int_iterator SVE_COND_INT_CMP_WIDE [UNSPEC_COND_CMPEQ_WIDE + UNSPEC_COND_CMPGE_WIDE + UNSPEC_COND_CMPGT_WIDE + UNSPEC_COND_CMPHI_WIDE + UNSPEC_COND_CMPHS_WIDE + UNSPEC_COND_CMPLE_WIDE + UNSPEC_COND_CMPLO_WIDE + UNSPEC_COND_CMPLS_WIDE + UNSPEC_COND_CMPLT_WIDE + UNSPEC_COND_CMPNE_WIDE]) + +;; SVE FP comparisons that accept #0.0. 
+(define_int_iterator SVE_COND_FP_CMP_I0 [UNSPEC_COND_FCMEQ + UNSPEC_COND_FCMGE + UNSPEC_COND_FCMGT + UNSPEC_COND_FCMLE + UNSPEC_COND_FCMLT + UNSPEC_COND_FCMNE]) + +(define_int_iterator SVE_COND_FP_ABS_CMP [UNSPEC_COND_FCMGE + UNSPEC_COND_FCMGT + UNSPEC_COND_FCMLE + UNSPEC_COND_FCMLT]) + +(define_int_iterator SVE_FP_TERNARY_LANE [UNSPEC_FMLA UNSPEC_FMLS]) + +(define_int_iterator SVE_CFP_TERNARY_LANE [UNSPEC_FCMLA UNSPEC_FCMLA90 + UNSPEC_FCMLA180 UNSPEC_FCMLA270]) + +(define_int_iterator SVE_WHILE [UNSPEC_WHILELE UNSPEC_WHILELO UNSPEC_WHILELS UNSPEC_WHILELT]) + +(define_int_iterator SVE_SHIFT_WIDE [UNSPEC_ASHIFT_WIDE + UNSPEC_ASHIFTRT_WIDE + UNSPEC_LSHIFTRT_WIDE]) + +(define_int_iterator SVE_LDFF1_LDNF1 [UNSPEC_LDFF1 UNSPEC_LDNF1]) (define_int_iterator FCADD [UNSPEC_FCADD90 UNSPEC_FCADD270]) @@ -1622,6 +2096,23 @@ UNSPEC_FCMLA180 UNSPEC_FCMLA270]) +(define_int_iterator FRINTNZX [UNSPEC_FRINT32Z UNSPEC_FRINT32X + UNSPEC_FRINT64Z UNSPEC_FRINT64X]) + +(define_int_iterator SVE_BRK_UNARY [UNSPEC_BRKA UNSPEC_BRKB]) + +(define_int_iterator SVE_BRK_BINARY [UNSPEC_BRKN UNSPEC_BRKPA UNSPEC_BRKPB]) + +(define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT]) + +(define_int_iterator MATMUL [UNSPEC_SMATMUL UNSPEC_UMATMUL + UNSPEC_USMATMUL]) + +(define_int_iterator FMMLA [UNSPEC_FMMLA]) + +(define_int_iterator BF_MLA [UNSPEC_BFMLALB + UNSPEC_BFMLALT]) + ;; Iterators for atomic operations. (define_int_iterator ATOMIC_LDOP @@ -1646,19 +2137,84 @@ (define_int_attr optab [(UNSPEC_ANDF "and") (UNSPEC_IORF "ior") (UNSPEC_XORF "xor") + (UNSPEC_SADDV "sadd") + (UNSPEC_UADDV "uadd") (UNSPEC_ANDV "and") (UNSPEC_IORV "ior") (UNSPEC_XORV "xor") - (UNSPEC_COND_ADD "add") - (UNSPEC_COND_SUB "sub") - (UNSPEC_COND_MUL "mul") - (UNSPEC_COND_DIV "div") - (UNSPEC_COND_MAX "smax") - (UNSPEC_COND_MIN "smin") + (UNSPEC_FRECPE "frecpe") + (UNSPEC_FRECPS "frecps") + (UNSPEC_RSQRTE "frsqrte") + (UNSPEC_RSQRTS "frsqrts") + (UNSPEC_RBIT "rbit") + (UNSPEC_REVB "revb") + (UNSPEC_REVH "revh") + (UNSPEC_REVW "revw") + (UNSPEC_UMAXV "umax") + (UNSPEC_UMINV "umin") + (UNSPEC_SMAXV "smax") + (UNSPEC_SMINV "smin") + (UNSPEC_FADDV "plus") + (UNSPEC_FMAXNMV "smax") + (UNSPEC_FMAXV "smax_nan") + (UNSPEC_FMINNMV "smin") + (UNSPEC_FMINV "smin_nan") + (UNSPEC_SMUL_HIGHPART "smulh") + (UNSPEC_UMUL_HIGHPART "umulh") + (UNSPEC_FMLA "fma") + (UNSPEC_FMLS "fnma") + (UNSPEC_FCMLA "fcmla") + (UNSPEC_FCMLA90 "fcmla90") + (UNSPEC_FCMLA180 "fcmla180") + (UNSPEC_FCMLA270 "fcmla270") + (UNSPEC_FEXPA "fexpa") + (UNSPEC_FTSMUL "ftsmul") + (UNSPEC_FTSSEL "ftssel") + (UNSPEC_SMATMUL "smatmul") + (UNSPEC_TRN1Q "trn1q") + (UNSPEC_TRN2Q "trn2q") + (UNSPEC_UMATMUL "umatmul") + (UNSPEC_USMATMUL "usmatmul") + (UNSPEC_UZP1Q "uzp1q") + (UNSPEC_UZP2Q "uzp2q") + (UNSPEC_ZIP1Q "zip1q") + (UNSPEC_ZIP2Q "zip2q") + (UNSPEC_COND_FABS "abs") + (UNSPEC_COND_FADD "add") + (UNSPEC_COND_FCADD90 "cadd90") + (UNSPEC_COND_FCADD270 "cadd270") + (UNSPEC_COND_FCMLA "fcmla") + (UNSPEC_COND_FCMLA90 "fcmla90") + (UNSPEC_COND_FCMLA180 "fcmla180") + (UNSPEC_COND_FCMLA270 "fcmla270") + (UNSPEC_COND_FCVT "fcvt") + (UNSPEC_COND_FCVTZS "fix_trunc") + (UNSPEC_COND_FCVTZU "fixuns_trunc") + (UNSPEC_COND_FDIV "div") + (UNSPEC_COND_FMAX "smax_nan") + (UNSPEC_COND_FMAXNM "smax") + (UNSPEC_COND_FMIN "smin_nan") + (UNSPEC_COND_FMINNM "smin") (UNSPEC_COND_FMLA "fma") (UNSPEC_COND_FMLS "fnma") + (UNSPEC_COND_FMUL "mul") + (UNSPEC_COND_FMULX "mulx") + (UNSPEC_COND_FNEG "neg") (UNSPEC_COND_FNMLA "fnms") - (UNSPEC_COND_FNMLS "fms")]) + (UNSPEC_COND_FNMLS "fms") + (UNSPEC_COND_FRECPX "frecpx") + 
(UNSPEC_COND_FRINTA "round") + (UNSPEC_COND_FRINTI "nearbyint") + (UNSPEC_COND_FRINTM "floor") + (UNSPEC_COND_FRINTN "frintn") + (UNSPEC_COND_FRINTP "ceil") + (UNSPEC_COND_FRINTX "rint") + (UNSPEC_COND_FRINTZ "btrunc") + (UNSPEC_COND_FSCALE "fscale") + (UNSPEC_COND_FSQRT "sqrt") + (UNSPEC_COND_FSUB "sub") + (UNSPEC_COND_SCVTF "float") + (UNSPEC_COND_UCVTF "floatuns")]) (define_int_attr maxmin_uns [(UNSPEC_UMAXV "umax") (UNSPEC_UMINV "umin") @@ -1671,7 +2227,11 @@ (UNSPEC_FMINNMV "smin") (UNSPEC_FMINV "smin_nan") (UNSPEC_FMAXNM "fmax") - (UNSPEC_FMINNM "fmin")]) + (UNSPEC_FMINNM "fmin") + (UNSPEC_COND_FMAX "fmax_nan") + (UNSPEC_COND_FMAXNM "fmax") + (UNSPEC_COND_FMIN "fmin_nan") + (UNSPEC_COND_FMINNM "fmin")]) (define_int_attr maxmin_uns_op [(UNSPEC_UMAXV "umax") (UNSPEC_UMINV "umin") @@ -1686,22 +2246,41 @@ (UNSPEC_FMAXNM "fmaxnm") (UNSPEC_FMINNM "fminnm")]) -(define_int_attr bit_reduc_op [(UNSPEC_ANDV "andv") - (UNSPEC_IORV "orv") - (UNSPEC_XORV "eorv")]) +(define_code_attr binqops_op [(ss_plus "sqadd") + (us_plus "uqadd") + (ss_minus "sqsub") + (us_minus "uqsub")]) + +(define_code_attr binqops_op_rev [(ss_plus "sqsub") + (ss_minus "sqadd")]) ;; The SVE logical instruction that implements an unspec. (define_int_attr logicalf_op [(UNSPEC_ANDF "and") (UNSPEC_IORF "orr") (UNSPEC_XORF "eor")]) +(define_int_attr last_op [(UNSPEC_CLASTA "after_last") + (UNSPEC_CLASTB "last") + (UNSPEC_LASTA "after_last") + (UNSPEC_LASTB "last")]) + ;; "s" for signed operations and "u" for unsigned ones. -(define_int_attr su [(UNSPEC_UNPACKSHI "s") +(define_int_attr su [(UNSPEC_SADDV "s") + (UNSPEC_UADDV "u") + (UNSPEC_UNPACKSHI "s") (UNSPEC_UNPACKUHI "u") (UNSPEC_UNPACKSLO "s") (UNSPEC_UNPACKULO "u") (UNSPEC_SMUL_HIGHPART "s") - (UNSPEC_UMUL_HIGHPART "u")]) + (UNSPEC_UMUL_HIGHPART "u") + (UNSPEC_COND_FCVTZS "s") + (UNSPEC_COND_FCVTZU "u") + (UNSPEC_COND_SCVTF "s") + (UNSPEC_COND_UCVTF "u") + (UNSPEC_SMULLB "s") (UNSPEC_UMULLB "u") + (UNSPEC_SMULLT "s") (UNSPEC_UMULLT "u") + (UNSPEC_SMULHS "s") (UNSPEC_UMULHS "u") + (UNSPEC_SMULHRS "s") (UNSPEC_UMULHRS "u")]) (define_int_attr sur [(UNSPEC_SHADD "s") (UNSPEC_UHADD "u") (UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur") @@ -1731,6 +2310,9 @@ (UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr") (UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s") (UNSPEC_SDOT "s") (UNSPEC_UDOT "u") + (UNSPEC_USDOT "us") (UNSPEC_SUDOT "su") + (UNSPEC_SMATMUL "s") (UNSPEC_UMATMUL "u") + (UNSPEC_USMATMUL "us") ]) (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r") @@ -1739,6 +2321,10 @@ (UNSPEC_SQRSHRN "r") (UNSPEC_UQRSHRN "r") (UNSPEC_SQSHL "") (UNSPEC_UQSHL "") (UNSPEC_SQRSHL "r")(UNSPEC_UQRSHL "r") + (UNSPEC_SHRNB "") (UNSPEC_SHRNT "") + (UNSPEC_RSHRNB "r") (UNSPEC_RSHRNT "r") + (UNSPEC_SMULHS "") (UNSPEC_UMULHS "") + (UNSPEC_SMULHRS "r") (UNSPEC_UMULHRS "r") ]) (define_int_attr lr [(UNSPEC_SSLI "l") (UNSPEC_USLI "l") @@ -1751,6 +2337,13 @@ (UNSPEC_SHADD "") (UNSPEC_UHADD "u") (UNSPEC_SRHADD "") (UNSPEC_URHADD "u")]) +(define_int_attr fn [(UNSPEC_LDFF1 "f") (UNSPEC_LDNF1 "n")]) + +(define_int_attr ab [(UNSPEC_CLASTA "a") (UNSPEC_CLASTB "b") + (UNSPEC_LASTA "a") (UNSPEC_LASTB "b")]) + +(define_int_attr bt [(UNSPEC_BFMLALB "b") (UNSPEC_BFMLALT "t")]) + (define_int_attr addsub [(UNSPEC_SHADD "add") (UNSPEC_UHADD "add") (UNSPEC_SRHADD "add") @@ -1768,6 +2361,18 @@ (UNSPEC_RADDHN2 "add") (UNSPEC_RSUBHN2 "sub")]) +;; BSL variants: first commutative operand. +(define_int_attr bsl_1st [(1 "w") (2 "0")]) + +;; BSL variants: second commutative operand. 
+(define_int_attr bsl_2nd [(1 "0") (2 "w")]) + +;; BSL variants: duplicated input operand. +(define_int_attr bsl_dup [(1 "1") (2 "2")]) + +;; BSL variants: operand which requires preserving via movprfx. +(define_int_attr bsl_mov [(1 "2") (2 "1")]) + (define_int_attr offsetlr [(UNSPEC_SSLI "") (UNSPEC_USLI "") (UNSPEC_SSRI "offset_") (UNSPEC_USRI "offset_")]) @@ -1797,29 +2402,47 @@ (UNSPEC_FCVTZU "fcvtzu")]) ;; Pointer authentication mnemonic prefix. -(define_int_attr pauth_mnem_prefix [(UNSPEC_PACISP "paci") - (UNSPEC_AUTISP "auti") - (UNSPEC_PACI1716 "paci") - (UNSPEC_AUTI1716 "auti")]) - -;; Pointer authentication HINT number for NOP space instructions using A Key. -(define_int_attr pauth_hint_num_a [(UNSPEC_PACISP "25") - (UNSPEC_AUTISP "29") - (UNSPEC_PACI1716 "8") - (UNSPEC_AUTI1716 "12")]) - -(define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip") - (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn") - (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")]) +(define_int_attr pauth_mnem_prefix [(UNSPEC_PACIASP "pacia") + (UNSPEC_PACIBSP "pacib") + (UNSPEC_PACIA1716 "pacia") + (UNSPEC_PACIB1716 "pacib") + (UNSPEC_AUTIASP "autia") + (UNSPEC_AUTIBSP "autib") + (UNSPEC_AUTIA1716 "autia") + (UNSPEC_AUTIB1716 "autib")]) + +(define_int_attr pauth_key [(UNSPEC_PACIASP "AARCH64_KEY_A") + (UNSPEC_PACIBSP "AARCH64_KEY_B") + (UNSPEC_PACIA1716 "AARCH64_KEY_A") + (UNSPEC_PACIB1716 "AARCH64_KEY_B") + (UNSPEC_AUTIASP "AARCH64_KEY_A") + (UNSPEC_AUTIBSP "AARCH64_KEY_B") + (UNSPEC_AUTIA1716 "AARCH64_KEY_A") + (UNSPEC_AUTIB1716 "AARCH64_KEY_B")]) + +;; Pointer authentication HINT number for NOP space instructions using A and +;; B key. +(define_int_attr pauth_hint_num [(UNSPEC_PACIASP "25") + (UNSPEC_PACIBSP "27") + (UNSPEC_AUTIASP "29") + (UNSPEC_AUTIBSP "31") + (UNSPEC_PACIA1716 "8") + (UNSPEC_PACIB1716 "10") + (UNSPEC_AUTIA1716 "12") + (UNSPEC_AUTIB1716 "14")]) + +(define_int_attr perm_insn [(UNSPEC_ZIP1 "zip1") (UNSPEC_ZIP2 "zip2") + (UNSPEC_ZIP1Q "zip1") (UNSPEC_ZIP2Q "zip2") + (UNSPEC_TRN1 "trn1") (UNSPEC_TRN2 "trn2") + (UNSPEC_TRN1Q "trn1") (UNSPEC_TRN2Q "trn2") + (UNSPEC_UZP1 "uzp1") (UNSPEC_UZP2 "uzp2") + (UNSPEC_UZP1Q "uzp1") (UNSPEC_UZP2Q "uzp2")]) ; op code for REV instructions (size within which elements are reversed). (define_int_attr rev_op [(UNSPEC_REV64 "64") (UNSPEC_REV32 "32") (UNSPEC_REV16 "16")]) -(define_int_attr perm_hilo [(UNSPEC_ZIP1 "1") (UNSPEC_ZIP2 "2") - (UNSPEC_TRN1 "1") (UNSPEC_TRN2 "2") - (UNSPEC_UZP1 "1") (UNSPEC_UZP2 "2") - (UNSPEC_UNPACKSHI "hi") (UNSPEC_UNPACKUHI "hi") +(define_int_attr perm_hilo [(UNSPEC_UNPACKSHI "hi") (UNSPEC_UNPACKUHI "hi") (UNSPEC_UNPACKSLO "lo") (UNSPEC_UNPACKULO "lo")]) ;; Return true if the associated optab refers to the high-numbered lanes, @@ -1861,34 +2484,122 @@ (define_int_attr f16mac1 [(UNSPEC_FMLAL "a") (UNSPEC_FMLSL "s") (UNSPEC_FMLAL2 "a") (UNSPEC_FMLSL2 "s")]) +(define_int_attr frintnzs_op [(UNSPEC_FRINT32Z "frint32z") (UNSPEC_FRINT32X "frint32x") + (UNSPEC_FRINT64Z "frint64z") (UNSPEC_FRINT64X "frint64x")]) + ;; The condition associated with an UNSPEC_COND_. 
-(define_int_attr cmp_op [(UNSPEC_COND_LT "lt") - (UNSPEC_COND_LE "le") - (UNSPEC_COND_EQ "eq") - (UNSPEC_COND_NE "ne") - (UNSPEC_COND_GE "ge") - (UNSPEC_COND_GT "gt")]) - -(define_int_attr sve_fp_op [(UNSPEC_COND_ADD "fadd") - (UNSPEC_COND_SUB "fsub") - (UNSPEC_COND_MUL "fmul") - (UNSPEC_COND_DIV "fdiv") - (UNSPEC_COND_MAX "fmaxnm") - (UNSPEC_COND_MIN "fminnm")]) - -(define_int_attr sve_fp_op_rev [(UNSPEC_COND_ADD "fadd") - (UNSPEC_COND_SUB "fsubr") - (UNSPEC_COND_MUL "fmul") - (UNSPEC_COND_DIV "fdivr") - (UNSPEC_COND_MAX "fmaxnm") - (UNSPEC_COND_MIN "fminnm")]) +(define_int_attr cmp_op [(UNSPEC_COND_CMPEQ_WIDE "eq") + (UNSPEC_COND_CMPGE_WIDE "ge") + (UNSPEC_COND_CMPGT_WIDE "gt") + (UNSPEC_COND_CMPHI_WIDE "hi") + (UNSPEC_COND_CMPHS_WIDE "hs") + (UNSPEC_COND_CMPLE_WIDE "le") + (UNSPEC_COND_CMPLO_WIDE "lo") + (UNSPEC_COND_CMPLS_WIDE "ls") + (UNSPEC_COND_CMPLT_WIDE "lt") + (UNSPEC_COND_CMPNE_WIDE "ne") + (UNSPEC_COND_FCMEQ "eq") + (UNSPEC_COND_FCMGE "ge") + (UNSPEC_COND_FCMGT "gt") + (UNSPEC_COND_FCMLE "le") + (UNSPEC_COND_FCMLT "lt") + (UNSPEC_COND_FCMNE "ne") + (UNSPEC_WHILELE "le") + (UNSPEC_WHILELO "lo") + (UNSPEC_WHILELS "ls") + (UNSPEC_WHILELT "lt")]) + +(define_int_attr while_optab_cmp [(UNSPEC_WHILELE "le") + (UNSPEC_WHILELO "ult") + (UNSPEC_WHILELS "ule") + (UNSPEC_WHILELT "lt")]) + +(define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b") + (UNSPEC_BRKN "n") + (UNSPEC_BRKPA "pa") (UNSPEC_BRKPB "pb")]) + +(define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")]) + +(define_int_attr sve_int_op [(UNSPEC_ANDV "andv") + (UNSPEC_IORV "orv") + (UNSPEC_XORV "eorv") + (UNSPEC_UMAXV "umaxv") + (UNSPEC_UMINV "uminv") + (UNSPEC_SMAXV "smaxv") + (UNSPEC_SMINV "sminv") + (UNSPEC_SMUL_HIGHPART "smulh") + (UNSPEC_UMUL_HIGHPART "umulh") + (UNSPEC_ASHIFT_WIDE "lsl") + (UNSPEC_ASHIFTRT_WIDE "asr") + (UNSPEC_LSHIFTRT_WIDE "lsr") + (UNSPEC_RBIT "rbit") + (UNSPEC_REVB "revb") + (UNSPEC_REVH "revh") + (UNSPEC_REVW "revw")]) + +(define_int_attr sve_fp_op [(UNSPEC_BFDOT "bfdot") + (UNSPEC_BFMLALB "bfmlalb") + (UNSPEC_BFMLALT "bfmlalt") + (UNSPEC_BFMMLA "bfmmla") + (UNSPEC_FRECPE "frecpe") + (UNSPEC_FRECPS "frecps") + (UNSPEC_RSQRTE "frsqrte") + (UNSPEC_RSQRTS "frsqrts") + (UNSPEC_FADDV "faddv") + (UNSPEC_FEXPA "fexpa") + (UNSPEC_FMAXNMV "fmaxnmv") + (UNSPEC_FMAXV "fmaxv") + (UNSPEC_FMINNMV "fminnmv") + (UNSPEC_FMINV "fminv") + (UNSPEC_FMLA "fmla") + (UNSPEC_FMLS "fmls") + (UNSPEC_FMMLA "fmmla") + (UNSPEC_FTSMUL "ftsmul") + (UNSPEC_FTSSEL "ftssel") + (UNSPEC_COND_FABS "fabs") + (UNSPEC_COND_FADD "fadd") + (UNSPEC_COND_FDIV "fdiv") + (UNSPEC_COND_FMAX "fmax") + (UNSPEC_COND_FMAXNM "fmaxnm") + (UNSPEC_COND_FMIN "fmin") + (UNSPEC_COND_FMINNM "fminnm") + (UNSPEC_COND_FMUL "fmul") + (UNSPEC_COND_FMULX "fmulx") + (UNSPEC_COND_FNEG "fneg") + (UNSPEC_COND_FRECPX "frecpx") + (UNSPEC_COND_FRINTA "frinta") + (UNSPEC_COND_FRINTI "frinti") + (UNSPEC_COND_FRINTM "frintm") + (UNSPEC_COND_FRINTN "frintn") + (UNSPEC_COND_FRINTP "frintp") + (UNSPEC_COND_FRINTX "frintx") + (UNSPEC_COND_FRINTZ "frintz") + (UNSPEC_COND_FSCALE "fscale") + (UNSPEC_COND_FSQRT "fsqrt") + (UNSPEC_COND_FSUB "fsub")]) + +(define_int_attr sve_fp_op_rev [(UNSPEC_COND_FADD "fadd") + (UNSPEC_COND_FDIV "fdivr") + (UNSPEC_COND_FMAX "fmax") + (UNSPEC_COND_FMAXNM "fmaxnm") + (UNSPEC_COND_FMIN "fmin") + (UNSPEC_COND_FMINNM "fminnm") + (UNSPEC_COND_FMUL "fmul") + (UNSPEC_COND_FMULX "fmulx") + (UNSPEC_COND_FSUB "fsubr")]) (define_int_attr rot [(UNSPEC_FCADD90 "90") (UNSPEC_FCADD270 "270") (UNSPEC_FCMLA "0") (UNSPEC_FCMLA90 "90") 
(UNSPEC_FCMLA180 "180") - (UNSPEC_FCMLA270 "270")]) + (UNSPEC_FCMLA270 "270") + (UNSPEC_COND_FCADD90 "90") + (UNSPEC_COND_FCADD270 "270") + (UNSPEC_COND_FCMLA "0") + (UNSPEC_COND_FCMLA90 "90") + (UNSPEC_COND_FCMLA180 "180") + (UNSPEC_COND_FCMLA270 "270")]) (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla") (UNSPEC_COND_FMLS "fmls") @@ -1900,9 +2611,54 @@ (UNSPEC_COND_FNMLA "fnmad") (UNSPEC_COND_FNMLS "fnmsb")]) -(define_int_attr commutative [(UNSPEC_COND_ADD "true") - (UNSPEC_COND_SUB "false") - (UNSPEC_COND_MUL "true") - (UNSPEC_COND_DIV "false") - (UNSPEC_COND_MIN "true") - (UNSPEC_COND_MAX "true")]) +;; The register constraint to use for the final operand in a binary BRK. +(define_int_attr brk_reg_con [(UNSPEC_BRKN "0") + (UNSPEC_BRKPA "Upa") (UNSPEC_BRKPB "Upa")]) + +;; The register number to print for the above. +(define_int_attr brk_reg_opno [(UNSPEC_BRKN "0") + (UNSPEC_BRKPA "3") (UNSPEC_BRKPB "3")]) + +;; The predicate to use for the first input operand in a floating-point +;; 3 pattern. +(define_int_attr sve_pred_fp_rhs1_operand + [(UNSPEC_COND_FADD "register_operand") + (UNSPEC_COND_FDIV "register_operand") + (UNSPEC_COND_FMAX "register_operand") + (UNSPEC_COND_FMAXNM "register_operand") + (UNSPEC_COND_FMIN "register_operand") + (UNSPEC_COND_FMINNM "register_operand") + (UNSPEC_COND_FMUL "register_operand") + (UNSPEC_COND_FMULX "register_operand") + (UNSPEC_COND_FSUB "aarch64_sve_float_arith_operand")]) + +;; The predicate to use for the second input operand in a floating-point +;; 3 pattern. +(define_int_attr sve_pred_fp_rhs2_operand + [(UNSPEC_COND_FADD "aarch64_sve_float_arith_with_sub_operand") + (UNSPEC_COND_FDIV "register_operand") + (UNSPEC_COND_FMAX "aarch64_sve_float_maxmin_operand") + (UNSPEC_COND_FMAXNM "aarch64_sve_float_maxmin_operand") + (UNSPEC_COND_FMIN "aarch64_sve_float_maxmin_operand") + (UNSPEC_COND_FMINNM "aarch64_sve_float_maxmin_operand") + (UNSPEC_COND_FMUL "aarch64_sve_float_mul_operand") + (UNSPEC_COND_FMULX "register_operand") + (UNSPEC_COND_FSUB "register_operand")]) + +;; Likewise for immediates only. +(define_int_attr sve_pred_fp_rhs2_immediate + [(UNSPEC_COND_FMAX "aarch64_sve_float_maxmin_immediate") + (UNSPEC_COND_FMAXNM "aarch64_sve_float_maxmin_immediate") + (UNSPEC_COND_FMIN "aarch64_sve_float_maxmin_immediate") + (UNSPEC_COND_FMINNM "aarch64_sve_float_maxmin_immediate") + (UNSPEC_COND_FMUL "aarch64_sve_float_mul_immediate")]) + +;; The maximum number of element bits that an instruction can handle. +(define_int_attr max_elem_bits [(UNSPEC_UADDV "64") (UNSPEC_SADDV "32") + (UNSPEC_PFIRST "8") (UNSPEC_PNEXT "64")]) + +;; The minimum number of element bits that an instruction can handle. 
+(define_int_attr min_elem_bits [(UNSPEC_RBIT "8") + (UNSPEC_REVB "16") + (UNSPEC_REVH "32") + (UNSPEC_REVW "64")]) diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index 5f7f281e2..0b6bf6172 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -39,9 +39,17 @@ (and (match_code "const_int") (match_test "op == CONST0_RTX (mode)"))) -(define_special_predicate "subreg_lowpart_operator" - (and (match_code "subreg") - (match_test "subreg_lowpart_p (op)"))) +(define_predicate "const_1_to_3_operand" + (match_code "const_int,const_vector") +{ + op = unwrap_const_vec_duplicate (op); + return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3); +}) + +(define_predicate "subreg_lowpart_operator" + (ior (match_code "truncate") + (and (match_code "subreg") + (match_test "subreg_lowpart_p (op)")))) (define_predicate "aarch64_ccmp_immediate" (and (match_code "const_int") @@ -53,13 +61,12 @@ (define_predicate "aarch64_simd_register" (and (match_code "reg") - (ior (match_test "REGNO_REG_CLASS (REGNO (op)) == FP_LO_REGS") - (match_test "REGNO_REG_CLASS (REGNO (op)) == FP_REGS")))) + (match_test "FP_REGNUM_P (REGNO (op))"))) (define_predicate "aarch64_reg_or_zero" - (and (match_code "reg,subreg,const_int") + (and (match_code "reg,subreg,const_int,const_double") (ior (match_operand 0 "register_operand") - (match_test "op == const0_rtx")))) + (match_test "op == CONST0_RTX (GET_MODE (op))")))) (define_predicate "aarch64_reg_or_fp_zero" (ior (match_operand 0 "register_operand") @@ -98,6 +105,10 @@ (and (match_code "const_double") (match_test "aarch64_fpconst_pow_of_2 (op) > 0"))) +(define_predicate "aarch64_fp_pow2_recip" + (and (match_code "const_double") + (match_test "aarch64_fpconst_pow2_recip (op) > 0"))) + (define_predicate "aarch64_fp_vec_pow2" (match_test "aarch64_vec_fpconst_pow_of_2 (op) > 0")) @@ -138,10 +149,18 @@ (and (match_operand 0 "aarch64_pluslong_immediate") (not (match_operand 0 "aarch64_plus_immediate")))) +(define_predicate "aarch64_sve_scalar_inc_dec_immediate" + (and (match_code "const_poly_int") + (match_test "aarch64_sve_scalar_inc_dec_immediate_p (op)"))) + (define_predicate "aarch64_sve_addvl_addpl_immediate" (and (match_code "const_poly_int") (match_test "aarch64_sve_addvl_addpl_immediate_p (op)"))) +(define_predicate "aarch64_sve_plus_immediate" + (ior (match_operand 0 "aarch64_sve_scalar_inc_dec_immediate") + (match_operand 0 "aarch64_sve_addvl_addpl_immediate"))) + (define_predicate "aarch64_split_add_offset_immediate" (and (match_code "const_poly_int") (match_test "aarch64_add_offset_temporaries (op) == 1"))) @@ -149,7 +168,8 @@ (define_predicate "aarch64_pluslong_operand" (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_pluslong_immediate") - (match_operand 0 "aarch64_sve_addvl_addpl_immediate"))) + (and (match_test "TARGET_SVE") + (match_operand 0 "aarch64_sve_plus_immediate")))) (define_predicate "aarch64_pluslong_or_poly_operand" (ior (match_operand 0 "aarch64_pluslong_operand") @@ -323,12 +343,6 @@ (ior (match_operand 0 "register_operand") (match_operand 0 "const_scalar_int_operand"))) -(define_predicate "aarch64_smin" - (match_code "smin")) - -(define_predicate "aarch64_umin" - (match_code "umin")) - ;; True for integer comparisons and for FP comparisons other than LTGT or UNEQ. 
(define_special_predicate "aarch64_comparison_operator" (match_code "eq,ne,le,lt,ge,gt,geu,gtu,leu,ltu,unordered, @@ -444,6 +458,12 @@ return aarch64_stepped_int_parallel_p (op, 1); }) +(define_predicate "descending_int_parallel" + (match_code "parallel") +{ + return aarch64_stepped_int_parallel_p (op, -1); +}) + (define_special_predicate "aarch64_simd_lshift_imm" (match_code "const,const_vector") { @@ -460,6 +480,10 @@ (and (match_code "const,const_vector") (match_test "op == CONST0_RTX (GET_MODE (op))"))) +(define_predicate "aarch64_simd_imm_one" + (and (match_code "const_vector") + (match_test "op == CONST1_RTX (GET_MODE (op))"))) + (define_predicate "aarch64_simd_or_scalar_imm_zero" (and (match_code "const_int,const_double,const,const_vector") (match_test "op == CONST0_RTX (GET_MODE (op))"))) @@ -474,6 +498,10 @@ (match_test "op == const0_rtx") (match_operand 0 "aarch64_simd_or_scalar_imm_zero")))) +(define_predicate "aarch64_simd_reg_or_minus_one" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_simd_imm_minus_one"))) + (define_predicate "aarch64_simd_struct_operand" (and (match_code "mem") (match_test "TARGET_SIMD && aarch64_simd_mem_operand_p (op)"))) @@ -556,12 +584,44 @@ (and (match_operand 0 "memory_operand") (match_test "aarch64_sve_ld1r_operand_p (op)"))) +(define_predicate "aarch64_sve_ld1rq_operand" + (and (match_code "mem") + (match_test "aarch64_sve_ld1rq_operand_p (op)"))) + +(define_predicate "aarch64_sve_ld1ro_operand_b" + (and (match_code "mem") + (match_test "aarch64_sve_ld1ro_operand_p (op, QImode)"))) + +(define_predicate "aarch64_sve_ld1ro_operand_h" + (and (match_code "mem") + (match_test "aarch64_sve_ld1ro_operand_p (op, HImode)"))) + +(define_predicate "aarch64_sve_ld1ro_operand_w" + (and (match_code "mem") + (match_test "aarch64_sve_ld1ro_operand_p (op, SImode)"))) + +(define_predicate "aarch64_sve_ld1ro_operand_d" + (and (match_code "mem") + (match_test "aarch64_sve_ld1ro_operand_p (op, DImode)"))) + +(define_predicate "aarch64_sve_ldff1_operand" + (and (match_code "mem") + (match_test "aarch64_sve_ldff1_operand_p (op)"))) + +(define_predicate "aarch64_sve_ldnf1_operand" + (and (match_code "mem") + (match_test "aarch64_sve_ldnf1_operand_p (op)"))) + ;; Like memory_operand, but restricted to addresses that are valid for ;; SVE LDR and STR instructions. 
(define_predicate "aarch64_sve_ldr_operand" (and (match_code "mem") (match_test "aarch64_sve_ldr_operand_p (op)"))) +(define_special_predicate "aarch64_sve_prefetch_operand" + (and (match_code "reg, plus") + (match_test "aarch64_sve_prefetch_operand_p (op, mode)"))) + (define_predicate "aarch64_sve_nonimmediate_operand" (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_sve_ldr_operand"))) @@ -586,6 +646,10 @@ (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_sve_ld1r_operand"))) +(define_predicate "aarch64_sve_ptrue_svpattern_immediate" + (and (match_code "const") + (match_test "aarch64_sve_ptrue_svpattern_p (op, NULL)"))) + (define_predicate "aarch64_sve_arith_immediate" (and (match_code "const,const_vector") (match_test "aarch64_sve_arith_immediate_p (op, false)"))) @@ -594,28 +658,84 @@ (and (match_code "const,const_vector") (match_test "aarch64_sve_arith_immediate_p (op, true)"))) -(define_predicate "aarch64_sve_inc_dec_immediate" +(define_predicate "aarch64_sve_qadd_immediate" (and (match_code "const,const_vector") - (match_test "aarch64_sve_inc_dec_immediate_p (op)"))) + (match_test "aarch64_sve_sqadd_sqsub_immediate_p (op, false)"))) + +(define_predicate "aarch64_sve_qsub_immediate" + (and (match_code "const,const_vector") + (match_test "aarch64_sve_sqadd_sqsub_immediate_p (op, true)"))) + +(define_predicate "aarch64_sve_vector_inc_dec_immediate" + (and (match_code "const,const_vector") + (match_test "aarch64_sve_vector_inc_dec_immediate_p (op)"))) + +(define_predicate "aarch64_sve_gather_immediate_b" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 31)"))) + +(define_predicate "aarch64_sve_gather_immediate_h" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 62)") + (match_test "(INTVAL (op) & 1) == 0"))) + +(define_predicate "aarch64_sve_gather_immediate_w" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 124)") + (match_test "(INTVAL (op) & 3) == 0"))) + +(define_predicate "aarch64_sve_gather_immediate_d" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 248)") + (match_test "(INTVAL (op) & 7) == 0"))) + +(define_predicate "aarch64_sve_uxtb_immediate" + (and (match_code "const_vector") + (match_test "GET_MODE_UNIT_BITSIZE (GET_MODE (op)) > 8") + (match_test "aarch64_const_vec_all_same_int_p (op, 0xff)"))) + +(define_predicate "aarch64_sve_uxth_immediate" + (and (match_code "const_vector") + (match_test "GET_MODE_UNIT_BITSIZE (GET_MODE (op)) > 16") + (match_test "aarch64_const_vec_all_same_int_p (op, 0xffff)"))) + +(define_predicate "aarch64_sve_uxtw_immediate" + (and (match_code "const_vector") + (match_test "GET_MODE_UNIT_BITSIZE (GET_MODE (op)) > 32") + (match_test "aarch64_const_vec_all_same_int_p (op, 0xffffffff)"))) + +(define_predicate "aarch64_sve_uxt_immediate" + (ior (match_operand 0 "aarch64_sve_uxtb_immediate") + (match_operand 0 "aarch64_sve_uxth_immediate") + (match_operand 0 "aarch64_sve_uxtw_immediate"))) (define_predicate "aarch64_sve_logical_immediate" (and (match_code "const,const_vector") (match_test "aarch64_sve_bitmask_immediate_p (op)"))) -(define_predicate "aarch64_sve_mul_immediate" +;; Used for SVE UMAX and UMIN. +(define_predicate "aarch64_sve_vsb_immediate" + (and (match_code "const_vector") + (match_test "GET_MODE_INNER (GET_MODE (op)) == QImode + ? aarch64_const_vec_all_same_in_range_p (op, -128, 127) + : aarch64_const_vec_all_same_in_range_p (op, 0, 255)"))) + +;; Used for SVE MUL, SMAX and SMIN. 
+(define_predicate "aarch64_sve_vsm_immediate" (and (match_code "const,const_vector") (match_test "aarch64_const_vec_all_same_in_range_p (op, -128, 127)"))) (define_predicate "aarch64_sve_dup_immediate" (and (match_code "const,const_vector") - (match_test "aarch64_sve_dup_immediate_p (op)"))) + (ior (match_test "aarch64_sve_dup_immediate_p (op)") + (match_test "aarch64_float_const_representable_p (op)")))) (define_predicate "aarch64_sve_cmp_vsc_immediate" - (and (match_code "const,const_vector") + (and (match_code "const_int,const_vector") (match_test "aarch64_sve_cmp_immediate_p (op, true)"))) (define_predicate "aarch64_sve_cmp_vsd_immediate" - (and (match_code "const,const_vector") + (and (match_code "const_int,const_vector") (match_test "aarch64_sve_cmp_immediate_p (op, false)"))) (define_predicate "aarch64_sve_index_immediate" @@ -626,14 +746,23 @@ (and (match_code "const,const_vector") (match_test "aarch64_sve_float_arith_immediate_p (op, false)"))) -(define_predicate "aarch64_sve_float_arith_with_sub_immediate" +(define_predicate "aarch64_sve_float_negated_arith_immediate" (and (match_code "const,const_vector") (match_test "aarch64_sve_float_arith_immediate_p (op, true)"))) +(define_predicate "aarch64_sve_float_arith_with_sub_immediate" + (ior (match_operand 0 "aarch64_sve_float_arith_immediate") + (match_operand 0 "aarch64_sve_float_negated_arith_immediate"))) + (define_predicate "aarch64_sve_float_mul_immediate" (and (match_code "const,const_vector") (match_test "aarch64_sve_float_mul_immediate_p (op)"))) +(define_predicate "aarch64_sve_float_maxmin_immediate" + (and (match_code "const_vector") + (ior (match_test "op == CONST0_RTX (GET_MODE (op))") + (match_test "op == CONST1_RTX (GET_MODE (op))")))) + (define_predicate "aarch64_sve_arith_operand" (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_sve_arith_immediate"))) @@ -641,12 +770,37 @@ (define_predicate "aarch64_sve_add_operand" (ior (match_operand 0 "aarch64_sve_arith_operand") (match_operand 0 "aarch64_sve_sub_arith_immediate") - (match_operand 0 "aarch64_sve_inc_dec_immediate"))) + (match_operand 0 "aarch64_sve_vector_inc_dec_immediate"))) + +(define_predicate "aarch64_sve_sqadd_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_qadd_immediate") + (match_operand 0 "aarch64_sve_qsub_immediate"))) + +(define_predicate "aarch64_sve_pred_and_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_uxt_immediate"))) (define_predicate "aarch64_sve_logical_operand" (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_sve_logical_immediate"))) +(define_predicate "aarch64_sve_gather_offset_b" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_gather_immediate_b"))) + +(define_predicate "aarch64_sve_gather_offset_h" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_gather_immediate_h"))) + +(define_predicate "aarch64_sve_gather_offset_w" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_gather_immediate_w"))) + +(define_predicate "aarch64_sve_gather_offset_d" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_gather_immediate_d"))) + (define_predicate "aarch64_sve_lshift_operand" (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_simd_lshift_imm"))) @@ -655,9 +809,17 @@ (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_simd_rshift_imm"))) -(define_predicate "aarch64_sve_mul_operand" +(define_predicate 
"aarch64_sve_vsb_operand" (ior (match_operand 0 "register_operand") - (match_operand 0 "aarch64_sve_mul_immediate"))) + (match_operand 0 "aarch64_sve_vsb_immediate"))) + +(define_predicate "aarch64_sve_vsm_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_vsm_immediate"))) + +(define_predicate "aarch64_sve_reg_or_dup_imm" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_dup_immediate"))) (define_predicate "aarch64_sve_cmp_vsc_operand" (ior (match_operand 0 "register_operand") @@ -676,17 +838,39 @@ (match_operand 0 "aarch64_sve_float_arith_immediate"))) (define_predicate "aarch64_sve_float_arith_with_sub_operand" - (ior (match_operand 0 "aarch64_sve_float_arith_operand") + (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_sve_float_arith_with_sub_immediate"))) (define_predicate "aarch64_sve_float_mul_operand" (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_sve_float_mul_immediate"))) +(define_predicate "aarch64_sve_float_maxmin_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "aarch64_sve_float_maxmin_immediate"))) + (define_predicate "aarch64_sve_vec_perm_operand" (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_constant_vector_operand"))) +(define_predicate "aarch64_sve_ptrue_flag" + (and (match_code "const_int") + (ior (match_test "INTVAL (op) == SVE_MAYBE_NOT_PTRUE") + (match_test "INTVAL (op) == SVE_KNOWN_PTRUE")))) + +(define_predicate "aarch64_sve_gp_strictness" + (and (match_code "const_int") + (ior (match_test "INTVAL (op) == SVE_RELAXED_GP") + (match_test "INTVAL (op) == SVE_STRICT_GP")))) + +(define_predicate "aarch64_gather_scale_operand_b" + (and (match_code "const_int") + (match_test "INTVAL (op) == 1"))) + +(define_predicate "aarch64_gather_scale_operand_h" + (and (match_code "const_int") + (match_test "INTVAL (op) == 1 || INTVAL (op) == 2"))) + (define_predicate "aarch64_gather_scale_operand_w" (and (match_code "const_int") (match_test "INTVAL (op) == 1 || INTVAL (op) == 4"))) diff --git a/gcc/config/aarch64/saphira.md b/gcc/config/aarch64/saphira.md index 853deeef0..3cc7bc410 100644 --- a/gcc/config/aarch64/saphira.md +++ b/gcc/config/aarch64/saphira.md @@ -520,7 +520,7 @@ (define_insn_reservation "saphira_other_0_nothing" 0 (and (eq_attr "tune" "saphira") - (eq_attr "type" "no_insn,trap,block")) + (eq_attr "type" "trap,block")) "nothing") (define_insn_reservation "saphira_other_2_ld" 2 diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index ee471f898..28e1c7aec 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -40,6 +40,43 @@ aarch64-builtins.o: $(srcdir)/config/aarch64/aarch64-builtins.c $(CONFIG_H) \ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/aarch64/aarch64-builtins.c +aarch64-sve-builtins.o: $(srcdir)/config/aarch64/aarch64-sve-builtins.cc \ + $(srcdir)/config/aarch64/aarch64-sve-builtins.def \ + $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(RTL_H) \ + $(TM_P_H) memmodel.h insn-codes.h $(OPTABS_H) $(RECOG_H) $(DIAGNOSTIC_H) \ + $(EXPR_H) $(BASIC_BLOCK_H) $(FUNCTION_H) fold-const.h $(GIMPLE_H) \ + gimple-iterator.h gimplify.h explow.h $(EMIT_RTL_H) tree-vector-builder.h \ + stor-layout.h $(REG_H) alias.h gimple-fold.h langhooks.h \ + stringpool.h \ + $(srcdir)/config/aarch64/aarch64-sve-builtins.h \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.h \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-base.h + $(COMPILER) -c 
$(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/aarch64/aarch64-sve-builtins.cc + +aarch64-sve-builtins-shapes.o: \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.cc \ + $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(RTL_H) \ + $(TM_P_H) memmodel.h insn-codes.h $(OPTABS_H) \ + $(srcdir)/config/aarch64/aarch64-sve-builtins.h \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.cc + +aarch64-sve-builtins-base.o: \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-base.cc \ + $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(RTL_H) \ + $(TM_P_H) memmodel.h insn-codes.h $(OPTABS_H) $(RECOG_H) \ + $(EXPR_H) $(BASIC_BLOCK_H) $(FUNCTION_H) fold-const.h $(GIMPLE_H) \ + gimple-iterator.h gimplify.h explow.h $(EMIT_RTL_H) tree-vector-builder.h \ + rtx-vector-builder.h vec-perm-indices.h \ + $(srcdir)/config/aarch64/aarch64-sve-builtins.h \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-shapes.h \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-base.h \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-functions.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/aarch64/aarch64-sve-builtins-base.cc + aarch64-builtin-iterators.h: $(srcdir)/config/aarch64/geniterators.sh \ $(srcdir)/config/aarch64/iterators.md $(SHELL) $(srcdir)/config/aarch64/geniterators.sh \ @@ -103,3 +140,10 @@ aarch64-bti-insert.o: $(srcdir)/config/aarch64/aarch64-bti-insert.c \ comma=, MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG)))) MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG)) + +insn-conditions.md: s-check-sve-md +s-check-sve-md: $(srcdir)/config/aarch64/check-sve-md.awk \ + $(srcdir)/config/aarch64/aarch64-sve.md + $(AWK) -f $(srcdir)/config/aarch64/check-sve-md.awk \ + $(srcdir)/config/aarch64/aarch64-sve.md + $(STAMP) s-check-sve-md diff --git a/gcc/config/aarch64/t-aarch64-netbsd b/gcc/config/aarch64/t-aarch64-netbsd new file mode 100644 index 000000000..aa447d0f6 --- /dev/null +++ b/gcc/config/aarch64/t-aarch64-netbsd @@ -0,0 +1,21 @@ +# Machine description for AArch64 architecture. +# Copyright (C) 2016-2019 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . 
+ +LIB1ASMSRC = aarch64/lib1funcs.asm +LIB1ASMFUNCS = _aarch64_sync_cache_range diff --git a/gcc/config/aarch64/thunderx2t99.md b/gcc/config/aarch64/thunderx2t99.md index c43c39ecd..bb6e0abb0 100644 --- a/gcc/config/aarch64/thunderx2t99.md +++ b/gcc/config/aarch64/thunderx2t99.md @@ -74,7 +74,7 @@ (define_insn_reservation "thunderx2t99_nothing" 0 (and (eq_attr "tune" "thunderx2t99") - (eq_attr "type" "no_insn,block")) + (eq_attr "type" "block")) "nothing") (define_insn_reservation "thunderx2t99_mrs" 0 diff --git a/gcc/config/aarch64/tsv110.md b/gcc/config/aarch64/tsv110.md index 680c48a68..f20055dae 100644 --- a/gcc/config/aarch64/tsv110.md +++ b/gcc/config/aarch64/tsv110.md @@ -281,7 +281,7 @@ shift_imm,shift_reg,\ mov_imm,mov_reg,\ mvn_imm,mvn_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "tsv110_alu1|tsv110_alu2|tsv110_alu3") (define_insn_reservation "tsv110_alus" 1 diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c index 524379d37..cd6aa117c 100644 --- a/gcc/config/alpha/alpha.c +++ b/gcc/config/alpha/alpha.c @@ -6380,7 +6380,7 @@ alpha_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, t = fold_convert (build_nonstandard_integer_type (64, 0), offset_field); offset = get_initialized_tmp_var (t, pre_p, NULL); - indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); + indirect = pass_va_arg_by_reference (type); if (indirect) { diff --git a/gcc/config/alpha/alpha.h b/gcc/config/alpha/alpha.h index e2008202a..68eafe194 100644 --- a/gcc/config/alpha/alpha.h +++ b/gcc/config/alpha/alpha.h @@ -759,7 +759,7 @@ do { \ #define MOVE_MAX 8 /* If a memory-to-memory move would take MOVE_RATIO or more simple - move-instruction pairs, we will do a movmem or libcall instead. + move-instruction pairs, we will do a cpymem or libcall instead. Without byte/word accesses, we want no more than four instructions; with, several single byte accesses are better. 
*/ diff --git a/gcc/config/alpha/alpha.md b/gcc/config/alpha/alpha.md index dd340a08e..228dee44c 100644 --- a/gcc/config/alpha/alpha.md +++ b/gcc/config/alpha/alpha.md @@ -4673,7 +4673,7 @@ ;; Argument 2 is the length ;; Argument 3 is the alignment -(define_expand "movmemqi" +(define_expand "cpymemqi" [(parallel [(set (match_operand:BLK 0 "memory_operand") (match_operand:BLK 1 "memory_operand")) (use (match_operand:DI 2 "immediate_operand")) @@ -4686,7 +4686,7 @@ FAIL; }) -(define_expand "movmemdi" +(define_expand "cpymemdi" [(parallel [(set (match_operand:BLK 0 "memory_operand") (match_operand:BLK 1 "memory_operand")) (use (match_operand:DI 2 "immediate_operand")) @@ -4703,7 +4703,7 @@ "TARGET_ABI_OPEN_VMS" "operands[4] = gen_rtx_SYMBOL_REF (Pmode, \"OTS$MOVE\");") -(define_insn "*movmemdi_1" +(define_insn "*cpymemdi_1" [(set (match_operand:BLK 0 "memory_operand" "=m,m") (match_operand:BLK 1 "memory_operand" "m,m")) (use (match_operand:DI 2 "nonmemory_operand" "r,i")) diff --git a/gcc/config/arc/arc-protos.h b/gcc/config/arc/arc-protos.h index ac0de6b28..00d2dd2c6 100644 --- a/gcc/config/arc/arc-protos.h +++ b/gcc/config/arc/arc-protos.h @@ -35,7 +35,7 @@ extern void arc_final_prescan_insn (rtx_insn *, rtx *, int); extern const char *arc_output_libcall (const char *); extern int arc_output_addsi (rtx *operands, bool, bool); extern int arc_output_commutative_cond_exec (rtx *operands, bool); -extern bool arc_expand_movmem (rtx *operands); +extern bool arc_expand_cpymem (rtx *operands); extern bool prepare_move_operands (rtx *operands, machine_mode mode); extern void emit_shift (enum rtx_code, rtx, rtx, rtx); extern void arc_expand_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx); diff --git a/gcc/config/arc/arc.c b/gcc/config/arc/arc.c index 325dd3cea..c0f13ebe7 100644 --- a/gcc/config/arc/arc.c +++ b/gcc/config/arc/arc.c @@ -8791,7 +8791,7 @@ arc_output_commutative_cond_exec (rtx *operands, bool output_p) return 8; } -/* Helper function of arc_expand_movmem. ADDR points to a chunk of memory. +/* Helper function of arc_expand_cpymem. ADDR points to a chunk of memory. Emit code and return an potentially modified address such that offsets up to SIZE are can be added to yield a legitimate address. if REUSE is set, ADDR is a register that may be modified. */ @@ -8825,7 +8825,7 @@ force_offsettable (rtx addr, HOST_WIDE_INT size, bool reuse) offset ranges. Return true on success. */ bool -arc_expand_movmem (rtx *operands) +arc_expand_cpymem (rtx *operands) { rtx dst = operands[0]; rtx src = operands[1]; @@ -10335,7 +10335,7 @@ arc_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, enum by_pieces_operation op, bool speed_p) { - /* Let the movmem expander handle small block moves. */ + /* Let the cpymem expander handle small block moves. */ if (op == MOVE_BY_PIECES) return false; diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index 00fc3e471..7ae10a666 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -1423,7 +1423,7 @@ do { \ in one reasonably fast instruction. */ #define MOVE_MAX 4 -/* Undo the effects of the movmem pattern presence on STORE_BY_PIECES_P . */ +/* Undo the effects of the cpymem pattern presence on STORE_BY_PIECES_P . */ #define MOVE_RATIO(SPEED) ((SPEED) ? 
15 : 3) /* Define this to be nonzero if shift instructions ignore all but the diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 34e8248bc..2cfcf8bdd 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -5114,13 +5114,13 @@ core_3, archs4x, archs4xd, archs4xd_slow" (set_attr "type" "loop_end") (set_attr "length" "4,20")]) -(define_expand "movmemsi" +(define_expand "cpymemsi" [(match_operand:BLK 0 "" "") (match_operand:BLK 1 "" "") (match_operand:SI 2 "nonmemory_operand" "") (match_operand 3 "immediate_operand" "")] "" - "if (arc_expand_movmem (operands)) DONE; else FAIL;") + "if (arc_expand_cpymem (operands)) DONE; else FAIL;") ;; Close http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35803 if this works ;; to the point that we can generate cmove instructions. diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 98beb6109..dd1f32798 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -127,8 +127,8 @@ extern bool offset_ok_for_ldrd_strd (HOST_WIDE_INT); extern bool operands_ok_ldrd_strd (rtx, rtx, rtx, HOST_WIDE_INT, bool, bool); extern bool gen_operands_ldrd_strd (rtx *, bool, bool, bool); extern bool valid_operands_ldrd_strd (rtx *, bool); -extern int arm_gen_movmemqi (rtx *); -extern bool gen_movmem_ldrd_strd (rtx *); +extern int arm_gen_cpymemqi (rtx *); +extern bool gen_cpymem_ldrd_strd (rtx *); extern machine_mode arm_select_cc_mode (RTX_CODE, rtx, rtx); extern machine_mode arm_select_dominance_cc_mode (rtx, rtx, HOST_WIDE_INT); @@ -204,7 +204,7 @@ extern void thumb2_final_prescan_insn (rtx_insn *); extern const char *thumb_load_double_from_address (rtx *); extern const char *thumb_output_move_mem_multiple (int, rtx *); extern const char *thumb_call_via_reg (rtx); -extern void thumb_expand_movmemqi (rtx *); +extern void thumb_expand_cpymemqi (rtx *); extern rtx arm_return_addr (int, rtx); extern void thumb_reload_out_hi (rtx *); extern void thumb_set_return_address (rtx, rtx); diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index eba26011e..c8a09329a 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -14426,7 +14426,7 @@ arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length, core type, optimize_size setting, etc. */ static int -arm_movmemqi_unaligned (rtx *operands) +arm_cpymemqi_unaligned (rtx *operands) { HOST_WIDE_INT length = INTVAL (operands[2]); @@ -14463,7 +14463,7 @@ arm_movmemqi_unaligned (rtx *operands) } int -arm_gen_movmemqi (rtx *operands) +arm_gen_cpymemqi (rtx *operands) { HOST_WIDE_INT in_words_to_go, out_words_to_go, last_bytes; HOST_WIDE_INT srcoffset, dstoffset; @@ -14477,7 +14477,7 @@ arm_gen_movmemqi (rtx *operands) return 0; if (unaligned_access && (INTVAL (operands[3]) & 3) != 0) - return arm_movmemqi_unaligned (operands); + return arm_cpymemqi_unaligned (operands); if (INTVAL (operands[3]) & 3) return 0; @@ -14611,7 +14611,7 @@ arm_gen_movmemqi (rtx *operands) return 1; } -/* Helper for gen_movmem_ldrd_strd. Increase the address of memory rtx +/* Helper for gen_cpymem_ldrd_strd. Increase the address of memory rtx by mode size. */ inline static rtx next_consecutive_mem (rtx mem) @@ -14626,7 +14626,7 @@ next_consecutive_mem (rtx mem) /* Copy using LDRD/STRD instructions whenever possible. Returns true upon success. 
*/ bool -gen_movmem_ldrd_strd (rtx *operands) +gen_cpymem_ldrd_strd (rtx *operands) { unsigned HOST_WIDE_INT len; HOST_WIDE_INT align; @@ -14670,7 +14670,7 @@ gen_movmem_ldrd_strd (rtx *operands) /* If we cannot generate any LDRD/STRD, try to generate LDM/STM. */ if (!(dst_aligned || src_aligned)) - return arm_gen_movmemqi (operands); + return arm_gen_cpymemqi (operands); /* If the either src or dst is unaligned we'll be accessing it as pairs of unaligned SImode accesses. Otherwise we can generate DImode @@ -26472,7 +26472,7 @@ thumb_call_via_reg (rtx reg) /* Routines for generating rtl. */ void -thumb_expand_movmemqi (rtx *operands) +thumb_expand_cpymemqi (rtx *operands) { rtx out = copy_to_mode_reg (SImode, XEXP (operands[0], 0)); rtx in = copy_to_mode_reg (SImode, XEXP (operands[1], 0)); @@ -26481,13 +26481,13 @@ thumb_expand_movmemqi (rtx *operands) while (len >= 12) { - emit_insn (gen_movmem12b (out, in, out, in)); + emit_insn (gen_cpymem12b (out, in, out, in)); len -= 12; } if (len >= 8) { - emit_insn (gen_movmem8b (out, in, out, in)); + emit_insn (gen_cpymem8b (out, in, out, in)); len -= 8; } diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 53e54874c..a1b9d9fac 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -7260,7 +7260,7 @@ ;; We could let this apply for blocks of less than this, but it clobbers so ;; many registers that there is then probably a better way. -(define_expand "movmemqi" +(define_expand "cpymemqi" [(match_operand:BLK 0 "general_operand" "") (match_operand:BLK 1 "general_operand" "") (match_operand:SI 2 "const_int_operand" "") @@ -7272,12 +7272,12 @@ if (TARGET_LDRD && current_tune->prefer_ldrd_strd && !optimize_function_for_size_p (cfun)) { - if (gen_movmem_ldrd_strd (operands)) + if (gen_cpymem_ldrd_strd (operands)) DONE; FAIL; } - if (arm_gen_movmemqi (operands)) + if (arm_gen_cpymemqi (operands)) DONE; FAIL; } @@ -7287,7 +7287,7 @@ || INTVAL (operands[2]) > 48) FAIL; - thumb_expand_movmemqi (operands); + thumb_expand_cpymemqi (operands); DONE; } " @@ -8807,6 +8807,8 @@ [(set_attr "arch" "t1,32")] ) +;; DO NOT SPLIT THIS INSN. It's important for security reasons that the +;; canary value does not live beyond the life of this sequence. 
(define_insn "*stack_protect_set_insn" [(set (match_operand:SI 0 "memory_operand" "=m,m") (unspec:SI [(mem:SI (match_operand:SI 1 "register_operand" "+&l,&r"))] @@ -8814,8 +8816,8 @@ (clobber (match_dup 1))] "" "@ - ldr\\t%1, [%1]\;str\\t%1, %0\;movs\t%1,#0 - ldr\\t%1, [%1]\;str\\t%1, %0\;mov\t%1,#0" + ldr\\t%1, [%1]\;str\\t%1, %0\;movs\t%1, #0 + ldr\\t%1, [%1]\;str\\t%1, %0\;mov\t%1, #0" [(set_attr "length" "8,12") (set_attr "conds" "clob,nocond") (set_attr "type" "multiple") diff --git a/gcc/config/arm/arm1020e.md b/gcc/config/arm/arm1020e.md index b835cbaaa..c4c038b04 100644 --- a/gcc/config/arm/arm1020e.md +++ b/gcc/config/arm/arm1020e.md @@ -72,7 +72,7 @@ adr,bfm,rev,\ shift_imm,shift_reg,\ mov_imm,mov_reg,mvn_imm,mvn_reg,\ - multiple,no_insn")) + multiple")) "1020a_e,1020a_m,1020a_w") ;; ALU operations with a shift-by-constant operand diff --git a/gcc/config/arm/arm1026ejs.md b/gcc/config/arm/arm1026ejs.md index 05f4d724f..88546872a 100644 --- a/gcc/config/arm/arm1026ejs.md +++ b/gcc/config/arm/arm1026ejs.md @@ -72,7 +72,7 @@ adr,bfm,rev,\ shift_imm,shift_reg,\ mov_imm,mov_reg,mvn_imm,mvn_reg,\ - multiple,no_insn")) + multiple")) "a_e,a_m,a_w") ;; ALU operations with a shift-by-constant operand diff --git a/gcc/config/arm/arm1136jfs.md b/gcc/config/arm/arm1136jfs.md index ae0b54f5e..e7fd53afe 100644 --- a/gcc/config/arm/arm1136jfs.md +++ b/gcc/config/arm/arm1136jfs.md @@ -81,7 +81,7 @@ adr,bfm,rev,\ shift_imm,shift_reg,\ mov_imm,mov_reg,mvn_imm,mvn_reg,\ - multiple,no_insn")) + multiple")) "e_1,e_2,e_3,e_wb") ;; ALU operations with a shift-by-constant operand diff --git a/gcc/config/arm/arm926ejs.md b/gcc/config/arm/arm926ejs.md index db4c7db8c..b4f503159 100644 --- a/gcc/config/arm/arm926ejs.md +++ b/gcc/config/arm/arm926ejs.md @@ -67,7 +67,7 @@ shift_imm,shift_reg,extend,\ mov_imm,mov_reg,mov_shift,\ mvn_imm,mvn_reg,mvn_shift,\ - multiple,no_insn")) + multiple")) "e,m,w") ;; ALU operations with a shift-by-register operand diff --git a/gcc/config/arm/cortex-a15.md b/gcc/config/arm/cortex-a15.md index f57f98675..26765c3db 100644 --- a/gcc/config/arm/cortex-a15.md +++ b/gcc/config/arm/cortex-a15.md @@ -68,7 +68,7 @@ shift_imm,shift_reg,\ mov_imm,mov_reg,\ mvn_imm,mvn_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "ca15_issue1,(ca15_sx1,ca15_sx1_alu)|(ca15_sx2,ca15_sx2_alu)") ;; ALU ops with immediate shift diff --git a/gcc/config/arm/cortex-a17.md b/gcc/config/arm/cortex-a17.md index a0c6e5141..97b716414 100644 --- a/gcc/config/arm/cortex-a17.md +++ b/gcc/config/arm/cortex-a17.md @@ -42,7 +42,7 @@ adc_imm,adcs_imm,adc_reg,adcs_reg,\ adr, mov_imm,mov_reg,\ mvn_imm,mvn_reg,extend,\ - mrs,multiple,no_insn")) + mrs,multiple")) "ca17_alu") (define_insn_reservation "cortex_a17_alu_shiftimm" 2 diff --git a/gcc/config/arm/cortex-a5.md b/gcc/config/arm/cortex-a5.md index efced646a..08aa90856 100644 --- a/gcc/config/arm/cortex-a5.md +++ b/gcc/config/arm/cortex-a5.md @@ -64,7 +64,7 @@ adr,bfm,clz,rbit,rev,alu_dsp_reg,\ shift_imm,shift_reg,\ mov_imm,mov_reg,mvn_imm,mvn_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "cortex_a5_ex1") (define_insn_reservation "cortex_a5_alu_shift" 2 diff --git a/gcc/config/arm/cortex-a53.md b/gcc/config/arm/cortex-a53.md index b55d34e91..9b29f3874 100644 --- a/gcc/config/arm/cortex-a53.md +++ b/gcc/config/arm/cortex-a53.md @@ -86,7 +86,7 @@ alu_sreg,alus_sreg,logic_reg,logics_reg, adc_imm,adcs_imm,adc_reg,adcs_reg, csel,clz,rbit,rev,alu_dsp_reg, - mov_reg,mvn_reg,mrs,multiple,no_insn")) + mov_reg,mvn_reg,mrs,multiple")) "cortex_a53_slot_any") 
(define_insn_reservation "cortex_a53_alu_shift" 3 diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md index 577dc8d7f..49654bf18 100644 --- a/gcc/config/arm/cortex-a57.md +++ b/gcc/config/arm/cortex-a57.md @@ -301,7 +301,7 @@ rotate_imm,shift_imm,shift_reg,\ mov_imm,mov_reg,\ mvn_imm,mvn_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "ca57_sx1|ca57_sx2") ;; ALU ops with immediate shift diff --git a/gcc/config/arm/cortex-a7.md b/gcc/config/arm/cortex-a7.md index 1f9d6414e..f1b60aa27 100644 --- a/gcc/config/arm/cortex-a7.md +++ b/gcc/config/arm/cortex-a7.md @@ -149,7 +149,7 @@ logic_shift_reg,logics_shift_reg,\ mov_shift,mov_shift_reg,\ mvn_shift,mvn_shift_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "cortex_a7_ex1") ;; Forwarding path for unshifted operands. diff --git a/gcc/config/arm/cortex-a8.md b/gcc/config/arm/cortex-a8.md index 980aed86e..e3372453d 100644 --- a/gcc/config/arm/cortex-a8.md +++ b/gcc/config/arm/cortex-a8.md @@ -90,7 +90,7 @@ adc_imm,adcs_imm,adc_reg,adcs_reg,\ adr,bfm,clz,rbit,rev,alu_dsp_reg,\ shift_imm,shift_reg,\ - multiple,no_insn")) + multiple")) "cortex_a8_default") (define_insn_reservation "cortex_a8_alu_shift" 2 diff --git a/gcc/config/arm/cortex-a9.md b/gcc/config/arm/cortex-a9.md index 6402a4438..c8474152c 100644 --- a/gcc/config/arm/cortex-a9.md +++ b/gcc/config/arm/cortex-a9.md @@ -87,7 +87,7 @@ cortex_a9_p1_e2 + cortex_a9_p0_e1 + cortex_a9_p1_e1") shift_imm,shift_reg,\ mov_imm,mov_reg,mvn_imm,mvn_reg,\ mov_shift_reg,mov_shift,\ - mrs,multiple,no_insn")) + mrs,multiple")) "cortex_a9_p0_default|cortex_a9_p1_default") ;; An instruction using the shifter will go down E1. diff --git a/gcc/config/arm/cortex-m4.md b/gcc/config/arm/cortex-m4.md index 60038c1e7..f8efcfcfc 100644 --- a/gcc/config/arm/cortex-m4.md +++ b/gcc/config/arm/cortex-m4.md @@ -42,7 +42,7 @@ logic_shift_reg,logics_shift_reg,\ mov_imm,mov_reg,mov_shift,mov_shift_reg,\ mvn_imm,mvn_reg,mvn_shift,mvn_shift_reg,\ - mrs,multiple,no_insn") + mrs,multiple") (ior (eq_attr "mul32" "yes") (eq_attr "widen_mul64" "yes")))) "cortex_m4_ex") diff --git a/gcc/config/arm/cortex-m7.md b/gcc/config/arm/cortex-m7.md index e4695ad66..dfe9a742c 100644 --- a/gcc/config/arm/cortex-m7.md +++ b/gcc/config/arm/cortex-m7.md @@ -48,7 +48,7 @@ logic_shift_imm,logics_shift_imm,\ alu_shift_reg,alus_shift_reg,\ logic_shift_reg,logics_shift_reg,\ - mrs,clz,f_mcr,f_mrc,multiple,no_insn")) + mrs,clz,f_mcr,f_mrc,multiple")) "cm7_i0|cm7_i1,cm7_a0|cm7_a1") ;; Simple alu with inline shift operation. diff --git a/gcc/config/arm/cortex-r4.md b/gcc/config/arm/cortex-r4.md index d7c0135fc..af5db23a6 100644 --- a/gcc/config/arm/cortex-r4.md +++ b/gcc/config/arm/cortex-r4.md @@ -102,7 +102,7 @@ (eq_attr "type" "alu_shift_reg,alus_shift_reg,\ logic_shift_reg,logics_shift_reg,\ mov_shift_reg,mvn_shift_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "cortex_r4_alu_shift_reg") ;; An ALU instruction followed by an ALU instruction with no early dep. 
diff --git a/gcc/config/arm/fa526.md b/gcc/config/arm/fa526.md index e6625b011..294b79692 100644 --- a/gcc/config/arm/fa526.md +++ b/gcc/config/arm/fa526.md @@ -68,7 +68,7 @@ adr,bfm,rev,\ shift_imm,shift_reg,\ mov_imm,mov_reg,mvn_imm,mvn_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "fa526_core") (define_insn_reservation "526_alu_shift_op" 2 diff --git a/gcc/config/arm/fa606te.md b/gcc/config/arm/fa606te.md index f2c104fb1..9007050ed 100644 --- a/gcc/config/arm/fa606te.md +++ b/gcc/config/arm/fa606te.md @@ -73,7 +73,7 @@ logic_shift_reg,logics_shift_reg,\ mov_imm,mov_reg,mov_shift,mov_shift_reg,\ mvn_imm,mvn_reg,mvn_shift,mvn_shift_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "fa606te_core") ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/gcc/config/arm/fa626te.md b/gcc/config/arm/fa626te.md index 880090fd7..6bdc2e8b5 100644 --- a/gcc/config/arm/fa626te.md +++ b/gcc/config/arm/fa626te.md @@ -74,7 +74,7 @@ adr,bfm,rev,\ shift_imm,shift_reg,\ mov_imm,mov_reg,mvn_imm,mvn_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "fa626te_core") (define_insn_reservation "626te_alu_shift_op" 2 diff --git a/gcc/config/arm/fa726te.md b/gcc/config/arm/fa726te.md index cb5fbaf99..f6f2531c8 100644 --- a/gcc/config/arm/fa726te.md +++ b/gcc/config/arm/fa726te.md @@ -91,7 +91,7 @@ adc_imm,adcs_imm,adc_reg,adcs_reg,\ adr,bfm,rev,\ shift_imm,shift_reg,\ - mrs,multiple,no_insn")) + mrs,multiple")) "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)") ;; ALU operations with a shift-by-register operand. diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md index 041e2db34..f8eb732ac 100644 --- a/gcc/config/arm/thumb1.md +++ b/gcc/config/arm/thumb1.md @@ -985,7 +985,7 @@ ;; Thumb block-move insns -(define_insn "movmem12b" +(define_insn "cpymem12b" [(set (mem:SI (match_operand:SI 2 "register_operand" "0")) (mem:SI (match_operand:SI 3 "register_operand" "1"))) (set (mem:SI (plus:SI (match_dup 2) (const_int 4))) @@ -1007,7 +1007,7 @@ (set_attr "type" "store_12")] ) -(define_insn "movmem8b" +(define_insn "cpymem8b" [(set (mem:SI (match_operand:SI 2 "register_operand" "0")) (mem:SI (match_operand:SI 3 "register_operand" "1"))) (set (mem:SI (plus:SI (match_dup 2) (const_int 4))) diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md index f8f8dd090..60faad659 100644 --- a/gcc/config/arm/types.md +++ b/gcc/config/arm/types.md @@ -546,6 +546,10 @@ ; The classification below is for coprocessor instructions ; ; coproc +; +; The classification below is for TME instructions +; +; tme (define_attr "type" "adc_imm,\ @@ -1091,7 +1095,8 @@ crypto_sha3,\ crypto_sm3,\ crypto_sm4,\ - coproc" + coproc,\ + tme" (const_string "untyped")) ; Is this an (integer side) multiply with a 32-bit (or smaller) result? 
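The first types.md hunk above adds a "tme" value to the "type" attribute so that transactional-memory instructions get their own scheduling classification, and the hunk just below adds the generic "no_reservation" mentioned earlier. A minimal, purely illustrative use of the new classification could look like the following; the pattern name, unspec, insn condition and operands here are assumptions made for the sketch and are not part of the backport:

;; Hypothetical sketch: a transaction-start style pattern tagged with the
;; new "tme" type, so pipeline models can give TME operations their own
;; reservation (or let them fall through to a default).
(define_insn "example_tme_start"
  [(set (match_operand:DI 0 "register_operand" "=r")
        (unspec_volatile:DI [(const_int 0)] UNSPECV_EXAMPLE_TME_START))]
  ""
  "tstart\\t%x0"
  [(set_attr "type" "tme")])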
@@ -1215,3 +1220,7 @@ crypto_sha256_fast, crypto_sha256_slow") (const_string "yes") (const_string "no"))) + +(define_insn_reservation "no_reservation" 0 + (eq_attr "type" "no_insn") + "nothing") diff --git a/gcc/config/arm/xgene1.md b/gcc/config/arm/xgene1.md index 14156421d..81498daa0 100644 --- a/gcc/config/arm/xgene1.md +++ b/gcc/config/arm/xgene1.md @@ -64,11 +64,6 @@ (eq_attr "type" "branch")) "xgene1_decode1op") -(define_insn_reservation "xgene1_nop" 1 - (and (eq_attr "tune" "xgene1") - (eq_attr "type" "no_insn")) - "xgene1_decode1op") - (define_insn_reservation "xgene1_call" 1 (and (eq_attr "tune" "xgene1") (eq_attr "type" "call")) diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h index dd0babbd7..31fe3a66d 100644 --- a/gcc/config/avr/avr-protos.h +++ b/gcc/config/avr/avr-protos.h @@ -82,7 +82,7 @@ extern rtx avr_to_int_mode (rtx); extern void avr_expand_prologue (void); extern void avr_expand_epilogue (bool); -extern bool avr_emit_movmemhi (rtx*); +extern bool avr_emit_cpymemhi (rtx*); extern int avr_epilogue_uses (int regno); extern void avr_output_addr_vec (rtx_insn*, rtx); @@ -92,7 +92,7 @@ extern const char* avr_out_plus (rtx, rtx*, int* =NULL, int* =NULL, bool =true); extern const char* avr_out_round (rtx_insn *, rtx*, int* =NULL); extern const char* avr_out_addto_sp (rtx*, int*); extern const char* avr_out_xload (rtx_insn *, rtx*, int*); -extern const char* avr_out_movmem (rtx_insn *, rtx*, int*); +extern const char* avr_out_cpymem (rtx_insn *, rtx*, int*); extern const char* avr_out_insert_bits (rtx*, int*); extern bool avr_popcount_each_byte (rtx, int, int); extern bool avr_has_nibble_0xf (rtx); diff --git a/gcc/config/avr/avr.c b/gcc/config/avr/avr.c index cb4b14ae3..3e6e5d2ee 100644 --- a/gcc/config/avr/avr.c +++ b/gcc/config/avr/avr.c @@ -9421,7 +9421,7 @@ avr_adjust_insn_length (rtx_insn *insn, int len) case ADJUST_LEN_MOV16: output_movhi (insn, op, &len); break; case ADJUST_LEN_MOV24: avr_out_movpsi (insn, op, &len); break; case ADJUST_LEN_MOV32: output_movsisf (insn, op, &len); break; - case ADJUST_LEN_MOVMEM: avr_out_movmem (insn, op, &len); break; + case ADJUST_LEN_CPYMEM: avr_out_cpymem (insn, op, &len); break; case ADJUST_LEN_XLOAD: avr_out_xload (insn, op, &len); break; case ADJUST_LEN_SEXT: avr_out_sign_extend (insn, op, &len); break; @@ -13338,7 +13338,7 @@ avr_emit3_fix_outputs (rtx (*gen)(rtx,rtx,rtx), rtx *op, } -/* Worker function for movmemhi expander. +/* Worker function for cpymemhi expander. XOP[0] Destination as MEM:BLK XOP[1] Source " " XOP[2] # Bytes to copy @@ -13347,7 +13347,7 @@ avr_emit3_fix_outputs (rtx (*gen)(rtx,rtx,rtx), rtx *op, Return FALSE if the operand compination is not supported. */ bool -avr_emit_movmemhi (rtx *xop) +avr_emit_cpymemhi (rtx *xop) { HOST_WIDE_INT count; machine_mode loop_mode; @@ -13424,14 +13424,14 @@ avr_emit_movmemhi (rtx *xop) Do the copy-loop inline. */ rtx (*fun) (rtx, rtx, rtx) - = QImode == loop_mode ? gen_movmem_qi : gen_movmem_hi; + = QImode == loop_mode ? gen_cpymem_qi : gen_cpymem_hi; insn = fun (xas, loop_reg, loop_reg); } else { rtx (*fun) (rtx, rtx) - = QImode == loop_mode ? gen_movmemx_qi : gen_movmemx_hi; + = QImode == loop_mode ? gen_cpymemx_qi : gen_cpymemx_hi; emit_move_insn (gen_rtx_REG (QImode, 23), a_hi8); @@ -13445,7 +13445,7 @@ avr_emit_movmemhi (rtx *xop) } -/* Print assembler for movmem_qi, movmem_hi insns... +/* Print assembler for cpymem_qi, cpymem_hi insns... 
$0 : Address Space $1, $2 : Loop register Z : Source address @@ -13453,7 +13453,7 @@ avr_emit_movmemhi (rtx *xop) */ const char* -avr_out_movmem (rtx_insn *insn ATTRIBUTE_UNUSED, rtx *op, int *plen) +avr_out_cpymem (rtx_insn *insn ATTRIBUTE_UNUSED, rtx *op, int *plen) { addr_space_t as = (addr_space_t) INTVAL (op[0]); machine_mode loop_mode = GET_MODE (op[1]); diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md index f263b693c..e85bf4963 100644 --- a/gcc/config/avr/avr.md +++ b/gcc/config/avr/avr.md @@ -70,7 +70,7 @@ (define_c_enum "unspec" [UNSPEC_STRLEN - UNSPEC_MOVMEM + UNSPEC_CPYMEM UNSPEC_INDEX_JMP UNSPEC_FMUL UNSPEC_FMULS @@ -158,7 +158,7 @@ tsthi, tstpsi, tstsi, compare, compare64, call, mov8, mov16, mov24, mov32, reload_in16, reload_in24, reload_in32, ufract, sfract, round, - xload, movmem, + xload, cpymem, ashlqi, ashrqi, lshrqi, ashlhi, ashrhi, lshrhi, ashlsi, ashrsi, lshrsi, @@ -992,20 +992,20 @@ ;;========================================================================= ;; move string (like memcpy) -(define_expand "movmemhi" +(define_expand "cpymemhi" [(parallel [(set (match_operand:BLK 0 "memory_operand" "") (match_operand:BLK 1 "memory_operand" "")) (use (match_operand:HI 2 "const_int_operand" "")) (use (match_operand:HI 3 "const_int_operand" ""))])] "" { - if (avr_emit_movmemhi (operands)) + if (avr_emit_cpymemhi (operands)) DONE; FAIL; }) -(define_mode_attr MOVMEM_r_d [(QI "r") +(define_mode_attr CPYMEM_r_d [(QI "r") (HI "wd")]) ;; $0 : Address Space @@ -1013,23 +1013,23 @@ ;; R30 : source address ;; R26 : destination address -;; "movmem_qi" -;; "movmem_hi" -(define_insn "movmem_" +;; "cpymem_qi" +;; "cpymem_hi" +(define_insn "cpymem_" [(set (mem:BLK (reg:HI REG_X)) (mem:BLK (reg:HI REG_Z))) (unspec [(match_operand:QI 0 "const_int_operand" "n")] - UNSPEC_MOVMEM) - (use (match_operand:QIHI 1 "register_operand" "")) + UNSPEC_CPYMEM) + (use (match_operand:QIHI 1 "register_operand" "")) (clobber (reg:HI REG_X)) (clobber (reg:HI REG_Z)) (clobber (reg:QI LPM_REGNO)) (clobber (match_operand:QIHI 2 "register_operand" "=1"))] "" { - return avr_out_movmem (insn, operands, NULL); + return avr_out_cpymem (insn, operands, NULL); } - [(set_attr "adjust_len" "movmem") + [(set_attr "adjust_len" "cpymem") (set_attr "cc" "clobber")]) @@ -1039,14 +1039,14 @@ ;; R23:Z : 24-bit source address ;; R26 : 16-bit destination address -;; "movmemx_qi" -;; "movmemx_hi" -(define_insn "movmemx_" +;; "cpymemx_qi" +;; "cpymemx_hi" +(define_insn "cpymemx_" [(set (mem:BLK (reg:HI REG_X)) (mem:BLK (lo_sum:PSI (reg:QI 23) (reg:HI REG_Z)))) (unspec [(match_operand:QI 0 "const_int_operand" "n")] - UNSPEC_MOVMEM) + UNSPEC_CPYMEM) (use (reg:QIHI 24)) (clobber (reg:HI REG_X)) (clobber (reg:HI REG_Z)) diff --git a/gcc/config/bfin/bfin-protos.h b/gcc/config/bfin/bfin-protos.h index 64a184275..7d0f705e0 100644 --- a/gcc/config/bfin/bfin-protos.h +++ b/gcc/config/bfin/bfin-protos.h @@ -81,7 +81,7 @@ extern bool expand_move (rtx *, machine_mode); extern void bfin_expand_call (rtx, rtx, rtx, rtx, int); extern bool bfin_longcall_p (rtx, int); extern bool bfin_dsp_memref_p (rtx); -extern bool bfin_expand_movmem (rtx, rtx, rtx, rtx); +extern bool bfin_expand_cpymem (rtx, rtx, rtx, rtx); extern enum reg_class secondary_input_reload_class (enum reg_class, machine_mode, diff --git a/gcc/config/bfin/bfin.c b/gcc/config/bfin/bfin.c index 97c2c12d5..288a2ff59 100644 --- a/gcc/config/bfin/bfin.c +++ b/gcc/config/bfin/bfin.c @@ -3208,7 +3208,7 @@ output_pop_multiple (rtx insn, rtx *operands) /* Adjust DST and SRC by OFFSET 
bytes, and generate one move in mode MODE. */ static void -single_move_for_movmem (rtx dst, rtx src, machine_mode mode, HOST_WIDE_INT offset) +single_move_for_cpymem (rtx dst, rtx src, machine_mode mode, HOST_WIDE_INT offset) { rtx scratch = gen_reg_rtx (mode); rtx srcmem, dstmem; @@ -3224,7 +3224,7 @@ single_move_for_movmem (rtx dst, rtx src, machine_mode mode, HOST_WIDE_INT offse back on a different method. */ bool -bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) +bfin_expand_cpymem (rtx dst, rtx src, rtx count_exp, rtx align_exp) { rtx srcreg, destreg, countreg; HOST_WIDE_INT align = 0; @@ -3269,7 +3269,7 @@ bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) { if ((count & ~3) == 4) { - single_move_for_movmem (dst, src, SImode, offset); + single_move_for_cpymem (dst, src, SImode, offset); offset = 4; } else if (count & ~3) @@ -3282,7 +3282,7 @@ bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) } if (count & 2) { - single_move_for_movmem (dst, src, HImode, offset); + single_move_for_cpymem (dst, src, HImode, offset); offset += 2; } } @@ -3290,7 +3290,7 @@ bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) { if ((count & ~1) == 2) { - single_move_for_movmem (dst, src, HImode, offset); + single_move_for_cpymem (dst, src, HImode, offset); offset = 2; } else if (count & ~1) @@ -3304,7 +3304,7 @@ bfin_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) } if (count & 1) { - single_move_for_movmem (dst, src, QImode, offset); + single_move_for_cpymem (dst, src, QImode, offset); } return true; } diff --git a/gcc/config/bfin/bfin.h b/gcc/config/bfin/bfin.h index 19b7f819d..4aba596f6 100644 --- a/gcc/config/bfin/bfin.h +++ b/gcc/config/bfin/bfin.h @@ -793,7 +793,7 @@ typedef struct { #define MOVE_MAX UNITS_PER_WORD /* If a memory-to-memory move would take MOVE_RATIO or more simple - move-instruction pairs, we will do a movmem or libcall instead. */ + move-instruction pairs, we will do a cpymem or libcall instead. 
*/ #define MOVE_RATIO(speed) 5 diff --git a/gcc/config/bfin/bfin.md b/gcc/config/bfin/bfin.md index ac5892424..6ac208d04 100644 --- a/gcc/config/bfin/bfin.md +++ b/gcc/config/bfin/bfin.md @@ -2316,14 +2316,14 @@ (set_attr "length" "16") (set_attr "seq_insns" "multi")]) -(define_expand "movmemsi" +(define_expand "cpymemsi" [(match_operand:BLK 0 "general_operand" "") (match_operand:BLK 1 "general_operand" "") (match_operand:SI 2 "const_int_operand" "") (match_operand:SI 3 "const_int_operand" "")] "" { - if (bfin_expand_movmem (operands[0], operands[1], operands[2], operands[3])) + if (bfin_expand_cpymem (operands[0], operands[1], operands[2], operands[3])) DONE; FAIL; }) diff --git a/gcc/config/c6x/c6x-protos.h b/gcc/config/c6x/c6x-protos.h index a657969a2..8c04c315a 100644 --- a/gcc/config/c6x/c6x-protos.h +++ b/gcc/config/c6x/c6x-protos.h @@ -35,7 +35,7 @@ extern bool c6x_long_call_p (rtx); extern void c6x_expand_call (rtx, rtx, bool); extern rtx c6x_expand_compare (rtx, machine_mode); extern bool c6x_force_op_for_comparison_p (enum rtx_code, rtx); -extern bool c6x_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx); +extern bool c6x_expand_cpymem (rtx, rtx, rtx, rtx, rtx, rtx); extern rtx c6x_subword (rtx, bool); extern void split_di (rtx *, int, rtx *, rtx *); diff --git a/gcc/config/c6x/c6x.c b/gcc/config/c6x/c6x.c index 9a07c4013..e4176774b 100644 --- a/gcc/config/c6x/c6x.c +++ b/gcc/config/c6x/c6x.c @@ -1683,10 +1683,10 @@ c6x_valid_mask_p (HOST_WIDE_INT val) return true; } -/* Expand a block move for a movmemM pattern. */ +/* Expand a block move for a cpymemM pattern. */ bool -c6x_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, +c6x_expand_cpymem (rtx dst, rtx src, rtx count_exp, rtx align_exp, rtx expected_align_exp ATTRIBUTE_UNUSED, rtx expected_size_exp ATTRIBUTE_UNUSED) { diff --git a/gcc/config/c6x/c6x.md b/gcc/config/c6x/c6x.md index 8218e1dad..f9bf9ba99 100644 --- a/gcc/config/c6x/c6x.md +++ b/gcc/config/c6x/c6x.md @@ -2844,7 +2844,7 @@ ;; Block moves ;; ------------------------------------------------------------------------- -(define_expand "movmemsi" +(define_expand "cpymemsi" [(use (match_operand:BLK 0 "memory_operand" "")) (use (match_operand:BLK 1 "memory_operand" "")) (use (match_operand:SI 2 "nonmemory_operand" "")) @@ -2853,7 +2853,7 @@ (use (match_operand:SI 5 "const_int_operand" ""))] "" { - if (c6x_expand_movmem (operands[0], operands[1], operands[2], operands[3], + if (c6x_expand_cpymem (operands[0], operands[1], operands[2], operands[3], operands[4], operands[5])) DONE; else diff --git a/gcc/config/darwin.c b/gcc/config/darwin.c index a7610829f..dcd69698f 100644 --- a/gcc/config/darwin.c +++ b/gcc/config/darwin.c @@ -2150,7 +2150,7 @@ darwin_emit_unwind_label (FILE *file, tree decl, int for_eh, int empty) if (! for_eh || ! ld_needs_eh_markers) return; - /* FIXME: This only works when the eh for all sections of a function are + /* FIXME: This only works when the eh for all sections of a function are emitted at the same time. If that changes, we would need to use a lookup table of some form to determine what to do. Also, we should emit the unadorned label for the partition containing the public label for a @@ -3325,7 +3325,7 @@ darwin_override_options (void) /* Linkers >= ld64-62.1 (at least) are capable of making the necessary PIC indirections and we no longer need to emit pic symbol stubs. 
- However, if we are generating code for earlier ones (or for use in the + However, if we are generating code for earlier ones (or for use in the kernel) the stubs might still be required, and this will be set true. If the user sets it on or off - then that takes precedence. @@ -3334,18 +3334,18 @@ darwin_override_options (void) if (!global_options_set.x_darwin_symbol_stubs) { - if (darwin_target_linker) + if (darwin_target_linker) { if (strverscmp (darwin_target_linker, MIN_LD64_OMIT_STUBS) < 0) { darwin_symbol_stubs = true; ld_needs_eh_markers = true; } - } + } else if (generating_for_darwin_version < 9) { /* If we don't know the linker version and we're targeting an old - system, we know no better than to assume the use of an earlier + system, we know no better than to assume the use of an earlier linker. */ darwin_symbol_stubs = true; ld_needs_eh_markers = true; @@ -3354,7 +3354,7 @@ darwin_override_options (void) else if (DARWIN_X86 && darwin_symbol_stubs && TARGET_64BIT) { inform (input_location, - "%<-msymbol-stubs%> is not required for 64b code (ignored)"); + "%<-mpic-symbol-stubs%> is not required for 64b code (ignored)"); darwin_symbol_stubs = false; } diff --git a/gcc/config/frv/frv.md b/gcc/config/frv/frv.md index 064bf53ea..6e8db59fd 100644 --- a/gcc/config/frv/frv.md +++ b/gcc/config/frv/frv.md @@ -1887,7 +1887,7 @@ ;; Argument 2 is the length ;; Argument 3 is the alignment -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "" "") (match_operand:BLK 1 "" "")) (use (match_operand:SI 2 "" "")) diff --git a/gcc/config/ft32/ft32.md b/gcc/config/ft32/ft32.md index de2394644..9e31f2ca7 100644 --- a/gcc/config/ft32/ft32.md +++ b/gcc/config/ft32/ft32.md @@ -851,7 +851,7 @@ "stpcpy %b1,%b2 # %0 %b1 %b2" ) -(define_insn "movmemsi" +(define_insn "cpymemsi" [(set (match_operand:BLK 0 "memory_operand" "=W,BW") (match_operand:BLK 1 "memory_operand" "W,BW")) (use (match_operand:SI 2 "ft32_imm_operand" "KA,KA")) diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index eb06ff9e0..480bb22ee 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -2495,7 +2495,7 @@ gcn_gimplify_va_arg_expr (tree valist, tree type, tree t, u; bool indirect; - indirect = pass_by_reference (NULL, TYPE_MODE (type), type, 0); + indirect = pass_va_arg_by_reference (type); if (indirect) { type = ptr; diff --git a/gcc/config/h8300/h8300.md b/gcc/config/h8300/h8300.md index eb0ae835f..42610fddb 100644 --- a/gcc/config/h8300/h8300.md +++ b/gcc/config/h8300/h8300.md @@ -474,11 +474,11 @@ (set_attr "length_table" "*,movl") (set_attr "cc" "set_zn,set_znv")]) -;; Implement block moves using movmd. Defining movmemsi allows the full +;; Implement block copies using movmd. Defining cpymemsi allows the full ;; range of constant lengths (up to 0x40000 bytes when using movmd.l). ;; See h8sx_emit_movmd for details. -(define_expand "movmemsi" +(define_expand "cpymemsi" [(use (match_operand:BLK 0 "memory_operand" "")) (use (match_operand:BLK 1 "memory_operand" "")) (use (match_operand:SI 2 "" "")) diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c new file mode 100644 index 000000000..6afb246eb --- /dev/null +++ b/gcc/config/i386/i386-builtins.c @@ -0,0 +1,2539 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. 
+ +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "cfgloop.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "cfgbuild.h" +#include "alias.h" +#include "fold-const.h" +#include "attribs.h" +#include "calls.h" +#include "stor-layout.h" +#include "varasm.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "explow.h" +#include "expr.h" +#include "cfgrtl.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "reload.h" +#include "gimplify.h" +#include "dwarf2.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "sched-int.h" +#include "opts.h" +#include "tree-pass.h" +#include "context.h" +#include "pass_manager.h" +#include "target-globals.h" +#include "gimple-iterator.h" +#include "tree-vectorizer.h" +#include "shrink-wrap.h" +#include "builtins.h" +#include "rtl-iter.h" +#include "tree-iterator.h" +#include "dbgcnt.h" +#include "case-cfn-macros.h" +#include "dojump.h" +#include "fold-const-call.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" +#include "selftest.h" +#include "selftest-rtl.h" +#include "print-rtl.h" +#include "intl.h" +#include "ifcvt.h" +#include "symbol-summary.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "wide-int-bitmask.h" +#include "tree-vector-builder.h" +#include "debug.h" +#include "dwarf2out.h" +#include "i386-builtins.h" + +#undef BDESC +#undef BDESC_FIRST +#undef BDESC_END + +/* Macros for verification of enum ix86_builtins order. */ +#define BDESC_VERIFY(x, y, z) \ + gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z))) +#define BDESC_VERIFYS(x, y, z) \ + STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z))) + +BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST, + IX86_BUILTIN__BDESC_COMI_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST, + IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, + IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, + IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, + IX86_BUILTIN__BDESC_ARGS_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, + IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST, + IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, + IX86_BUILTIN__BDESC_CET_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN_MAX, + IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); + + +/* Table for the ix86 builtin non-function types. 
*/ +static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; + +/* Retrieve an element from the above table, building some of + the types lazily. */ + +static tree +ix86_get_builtin_type (enum ix86_builtin_type tcode) +{ + unsigned int index; + tree type, itype; + + gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab)); + + type = ix86_builtin_type_tab[(int) tcode]; + if (type != NULL) + return type; + + gcc_assert (tcode > IX86_BT_LAST_PRIM); + if (tcode <= IX86_BT_LAST_VECT) + { + machine_mode mode; + + index = tcode - IX86_BT_LAST_PRIM - 1; + itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]); + mode = ix86_builtin_type_vect_mode[index]; + + type = build_vector_type_for_mode (itype, mode); + } + else + { + int quals; + + index = tcode - IX86_BT_LAST_VECT - 1; + if (tcode <= IX86_BT_LAST_PTR) + quals = TYPE_UNQUALIFIED; + else + quals = TYPE_QUAL_CONST; + + itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]); + if (quals != TYPE_UNQUALIFIED) + itype = build_qualified_type (itype, quals); + + type = build_pointer_type (itype); + } + + ix86_builtin_type_tab[(int) tcode] = type; + return type; +} + +/* Table for the ix86 builtin function types. */ +static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1]; + +/* Retrieve an element from the above table, building some of + the types lazily. */ + +static tree +ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode) +{ + tree type; + + gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab)); + + type = ix86_builtin_func_type_tab[(int) tcode]; + if (type != NULL) + return type; + + if (tcode <= IX86_BT_LAST_FUNC) + { + unsigned start = ix86_builtin_func_start[(int) tcode]; + unsigned after = ix86_builtin_func_start[(int) tcode + 1]; + tree rtype, atype, args = void_list_node; + unsigned i; + + rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]); + for (i = after - 1; i > start; --i) + { + atype = ix86_get_builtin_type (ix86_builtin_func_args[i]); + args = tree_cons (NULL, atype, args); + } + + type = build_function_type (rtype, args); + } + else + { + unsigned index = tcode - IX86_BT_LAST_FUNC - 1; + enum ix86_builtin_func_type icode; + + icode = ix86_builtin_func_alias_base[index]; + type = ix86_get_builtin_func_type (icode); + } + + ix86_builtin_func_type_tab[(int) tcode] = type; + return type; +} + +/* Table for the ix86 builtin decls. */ +static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; + +struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; + +tree get_ix86_builtin (enum ix86_builtins c) +{ + return ix86_builtins[c]; +} + +/* Bits that can still enable any inclusion of a builtin. */ +HOST_WIDE_INT deferred_isa_values = 0; +HOST_WIDE_INT deferred_isa_values2 = 0; + +/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the + MASK and MASK2 of which isa_flags and ix86_isa_flags2 to use in the + ix86_builtins_isa array. Stores the function decl in the ix86_builtins + array. Returns the function decl or NULL_TREE, if the builtin was not + added. + + If the front end has a special hook for builtin functions, delay adding + builtin functions that aren't in the current ISA until the ISA is changed + with function specific optimization. Doing so, can save about 300K for the + default compiler. When the builtin is expanded, check at that time whether + it is valid. 
+ + If the front end doesn't have a special hook, record all builtins, even if + it isn't an instruction set in the current ISA in case the user uses + function specific options for a different ISA, so that we don't get scope + errors if a builtin is added in the middle of a function scope. */ + +static inline tree +def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, + const char *name, + enum ix86_builtin_func_type tcode, + enum ix86_builtins code) +{ + tree decl = NULL_TREE; + + /* An instruction may be 64bit only regardless of ISAs. */ + if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT) + { + ix86_builtins_isa[(int) code].isa = mask; + ix86_builtins_isa[(int) code].isa2 = mask2; + + mask &= ~OPTION_MASK_ISA_64BIT; + + /* Filter out the masks most often ored together with others. */ + if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL) + && mask != OPTION_MASK_ISA_AVX512VL) + mask &= ~OPTION_MASK_ISA_AVX512VL; + if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW) + && mask != OPTION_MASK_ISA_AVX512BW) + mask &= ~OPTION_MASK_ISA_AVX512BW; + + if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0) + && (mask == 0 || (mask & ix86_isa_flags) != 0)) + || (lang_hooks.builtin_function + == lang_hooks.builtin_function_ext_scope)) + { + tree type = ix86_get_builtin_func_type (tcode); + decl = add_builtin_function (name, type, code, BUILT_IN_MD, + NULL, NULL_TREE); + ix86_builtins[(int) code] = decl; + ix86_builtins_isa[(int) code].set_and_not_built_p = false; + } + else + { + /* Just MASK and MASK2 where set_and_not_built_p == true can potentially + include a builtin. */ + deferred_isa_values |= mask; + deferred_isa_values2 |= mask2; + ix86_builtins[(int) code] = NULL_TREE; + ix86_builtins_isa[(int) code].tcode = tcode; + ix86_builtins_isa[(int) code].name = name; + ix86_builtins_isa[(int) code].const_p = false; + ix86_builtins_isa[(int) code].pure_p = false; + ix86_builtins_isa[(int) code].set_and_not_built_p = true; + } + } + + return decl; +} + +/* Like def_builtin, but also marks the function decl "const". */ + +static inline tree +def_builtin_const (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, + enum ix86_builtin_func_type tcode, enum ix86_builtins code) +{ + tree decl = def_builtin (mask, mask2, name, tcode, code); + if (decl) + TREE_READONLY (decl) = 1; + else + ix86_builtins_isa[(int) code].const_p = true; + + return decl; +} + +/* Like def_builtin, but also marks the function decl "pure". */ + +static inline tree +def_builtin_pure (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, + enum ix86_builtin_func_type tcode, enum ix86_builtins code) +{ + tree decl = def_builtin (mask, mask2, name, tcode, code); + if (decl) + DECL_PURE_P (decl) = 1; + else + ix86_builtins_isa[(int) code].pure_p = true; + + return decl; +} + +/* Add any new builtin functions for a given ISA that may not have been + declared. This saves a bit of space compared to adding all of the + declarations to the tree, even if we didn't use them. */ + +void +ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2) +{ + isa &= ~OPTION_MASK_ISA_64BIT; + + if ((isa & deferred_isa_values) == 0 + && (isa2 & deferred_isa_values2) == 0) + return; + + /* Bits in ISA value can be removed from potential isa values. 
*/ + deferred_isa_values &= ~isa; + deferred_isa_values2 &= ~isa2; + + int i; + tree saved_current_target_pragma = current_target_pragma; + current_target_pragma = NULL_TREE; + + for (i = 0; i < (int)IX86_BUILTIN_MAX; i++) + { + if (((ix86_builtins_isa[i].isa & isa) != 0 + || (ix86_builtins_isa[i].isa2 & isa2) != 0) + && ix86_builtins_isa[i].set_and_not_built_p) + { + tree decl, type; + + /* Don't define the builtin again. */ + ix86_builtins_isa[i].set_and_not_built_p = false; + + type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode); + decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name, + type, i, BUILT_IN_MD, NULL, + NULL_TREE); + + ix86_builtins[i] = decl; + if (ix86_builtins_isa[i].const_p) + TREE_READONLY (decl) = 1; + } + } + + current_target_pragma = saved_current_target_pragma; +} + +/* TM vector builtins. */ + +/* Reuse the existing x86-specific `struct builtin_description' cause + we're lazy. Add casts to make them fit. */ +static const struct builtin_description bdesc_tm[] = +{ + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, 
UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID }, +}; + +/* Initialize the transactional memory vector load/store builtins. */ + +static void +ix86_init_tm_builtins (void) +{ + enum ix86_builtin_func_type ftype; + const struct builtin_description *d; + size_t i; + tree decl; + tree attrs_load, attrs_type_load, attrs_store, attrs_type_store; + tree attrs_log, attrs_type_log; + + if (!flag_tm) + return; + + /* If there are no builtins defined, we must be compiling in a + language without trans-mem support. */ + if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1)) + return; + + /* Use whatever attributes a normal TM load has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1); + attrs_load = DECL_ATTRIBUTES (decl); + attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + /* Use whatever attributes a normal TM store has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1); + attrs_store = DECL_ATTRIBUTES (decl); + attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + /* Use whatever attributes a normal TM log has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_LOG); + attrs_log = DECL_ATTRIBUTES (decl); + attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + + for (i = 0, d = bdesc_tm; + i < ARRAY_SIZE (bdesc_tm); + i++, d++) + { + if ((d->mask & ix86_isa_flags) != 0 + || (lang_hooks.builtin_function + == lang_hooks.builtin_function_ext_scope)) + { + tree type, attrs, attrs_type; + enum built_in_function code = (enum built_in_function) d->code; + + ftype = (enum ix86_builtin_func_type) d->flag; + type = ix86_get_builtin_func_type (ftype); + + if (BUILTIN_TM_LOAD_P (code)) + { + attrs = attrs_load; + attrs_type = attrs_type_load; + } + else if (BUILTIN_TM_STORE_P (code)) + { + attrs = attrs_store; + attrs_type = attrs_type_store; + } + else + { + attrs = attrs_log; + attrs_type = attrs_type_log; + } + decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL, + /* The builtin without the prefix for + calling it directly. */ + d->name + strlen ("__builtin_"), + attrs); + /* add_builtin_function() will set the DECL_ATTRIBUTES, now + set the TYPE_ATTRIBUTES. */ + decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN); + + set_builtin_decl (code, decl, false); + } + } +} + +/* Set up all the MMX/SSE builtins, even builtins for instructions that are not + in the current target ISA to allow the user to compile particular modules + with different target specific options that differ from the command line + options. 
*/ +static void +ix86_init_mmx_sse_builtins (void) +{ + const struct builtin_description * d; + enum ix86_builtin_func_type ftype; + size_t i; + + /* Add all special builtins with variable number of operands. */ + for (i = 0, d = bdesc_special_args; + i < ARRAY_SIZE (bdesc_special_args); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, + IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, + ARRAY_SIZE (bdesc_special_args) - 1); + + /* Add all builtins with variable number of operands. */ + for (i = 0, d = bdesc_args; + i < ARRAY_SIZE (bdesc_args); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST, + IX86_BUILTIN__BDESC_ARGS_FIRST, + ARRAY_SIZE (bdesc_args) - 1); + + /* Add all builtins with rounding. */ + for (i = 0, d = bdesc_round_args; + i < ARRAY_SIZE (bdesc_round_args); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, + IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, + ARRAY_SIZE (bdesc_round_args) - 1); + + /* pcmpestr[im] insns. */ + for (i = 0, d = bdesc_pcmpestr; + i < ARRAY_SIZE (bdesc_pcmpestr); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i); + if (d->code == IX86_BUILTIN_PCMPESTRM128) + ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT; + else + ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST, + IX86_BUILTIN__BDESC_PCMPESTR_FIRST, + ARRAY_SIZE (bdesc_pcmpestr) - 1); + + /* pcmpistr[im] insns. */ + for (i = 0, d = bdesc_pcmpistr; + i < ARRAY_SIZE (bdesc_pcmpistr); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i); + if (d->code == IX86_BUILTIN_PCMPISTRM128) + ftype = V16QI_FTYPE_V16QI_V16QI_INT; + else + ftype = INT_FTYPE_V16QI_V16QI_INT; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST, + IX86_BUILTIN__BDESC_PCMPISTR_FIRST, + ARRAY_SIZE (bdesc_pcmpistr) - 1); + + /* comi/ucomi insns. */ + for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i); + if (d->mask == OPTION_MASK_ISA_SSE2) + ftype = INT_FTYPE_V2DF_V2DF; + else + ftype = INT_FTYPE_V4SF_V4SF; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST, + IX86_BUILTIN__BDESC_COMI_FIRST, + ARRAY_SIZE (bdesc_comi) - 1); + + /* SSE */ + def_builtin (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_ldmxcsr", + VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR); + def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr", + UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR); + + /* SSE or 3DNow!A */ + def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A + /* As it uses V4HImode, we have to require -mmmx too. 
*/ + | OPTION_MASK_ISA_MMX, 0, + "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR, + IX86_BUILTIN_MASKMOVQ); + + /* SSE2 */ + def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu", + VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU); + + def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_clflush", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH); + x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_mfence", + VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); + + /* SSE3. */ + def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor", + VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); + def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait", + VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); + + /* AES */ + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesenc128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesenclast128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesdec128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesdeclast128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesimc128", + V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aeskeygenassist128", + V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128); + + /* PCLMUL */ + def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_pclmulqdq128", + V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128); + + /* RDRND */ + def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand16_step", + INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP); + def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand32_step", + INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP); + def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, + IX86_BUILTIN_RDRAND64_STEP); + + /* AVX2 */ + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2df", + V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, + IX86_BUILTIN_GATHERSIV2DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4df", + V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, + IX86_BUILTIN_GATHERSIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2df", + V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, + IX86_BUILTIN_GATHERDIV2DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4df", + V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, + IX86_BUILTIN_GATHERDIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4sf", + V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, + IX86_BUILTIN_GATHERSIV4SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8sf", + V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, + IX86_BUILTIN_GATHERSIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf", + V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, + IX86_BUILTIN_GATHERDIV4SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf256", + V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, + IX86_BUILTIN_GATHERDIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2di", 
+ V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, + IX86_BUILTIN_GATHERSIV2DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4di", + V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, + IX86_BUILTIN_GATHERSIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2di", + V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, + IX86_BUILTIN_GATHERDIV2DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4di", + V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, + IX86_BUILTIN_GATHERDIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4si", + V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, + IX86_BUILTIN_GATHERSIV4SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8si", + V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, + IX86_BUILTIN_GATHERSIV8SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si", + V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, + IX86_BUILTIN_GATHERDIV4SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si256", + V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, + IX86_BUILTIN_GATHERDIV8SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4df ", + V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, + IX86_BUILTIN_GATHERALTSIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8sf ", + V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, + IX86_BUILTIN_GATHERALTDIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4di ", + V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, + IX86_BUILTIN_GATHERALTSIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8si ", + V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, + IX86_BUILTIN_GATHERALTDIV8SI); + + /* AVX512F */ + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16sf", + V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT, + IX86_BUILTIN_GATHER3SIV16SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8df", + V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16sf", + V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV16SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8df", + V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16si", + V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT, + IX86_BUILTIN_GATHER3SIV16SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8di", + V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16si", + V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV16SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8di", + V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8df ", + V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV8DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16sf ", + V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, + IX86_BUILTIN_GATHER3ALTDIV16SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8di ", + V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV8DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16si ", + 
V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, + IX86_BUILTIN_GATHER3ALTDIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16sf", + VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT, + IX86_BUILTIN_SCATTERSIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8df", + VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT, + IX86_BUILTIN_SCATTERSIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16sf", + VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT, + IX86_BUILTIN_SCATTERDIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8df", + VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT, + IX86_BUILTIN_SCATTERDIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16si", + VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT, + IX86_BUILTIN_SCATTERSIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8di", + VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT, + IX86_BUILTIN_SCATTERSIV8DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16si", + VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT, + IX86_BUILTIN_SCATTERDIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8di", + VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT, + IX86_BUILTIN_SCATTERDIV8DI); + + /* AVX512VL */ + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2df", + V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV2DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4df", + V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2df", + V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT, + IX86_BUILTIN_GATHER3DIV2DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4df", + V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT, + IX86_BUILTIN_GATHER3DIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4sf", + V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV4SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8sf", + V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4sf", + V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT, + IX86_BUILTIN_GATHER3DIV4SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8sf", + V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2di", + V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV2DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4di", + V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2di", + V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT, + IX86_BUILTIN_GATHER3DIV2DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4di", + V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT, + IX86_BUILTIN_GATHER3DIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4si", + V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV4SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8si", + V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4si", + V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT, + IX86_BUILTIN_GATHER3DIV4SI); + + 
def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8si", + V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4df ", + V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8sf ", + V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT, + IX86_BUILTIN_GATHER3ALTDIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4di ", + V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8si ", + V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT, + IX86_BUILTIN_GATHER3ALTDIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8sf", + VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT, + IX86_BUILTIN_SCATTERSIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4sf", + VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT, + IX86_BUILTIN_SCATTERSIV4SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4df", + VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT, + IX86_BUILTIN_SCATTERSIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2df", + VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT, + IX86_BUILTIN_SCATTERSIV2DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8sf", + VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT, + IX86_BUILTIN_SCATTERDIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4sf", + VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT, + IX86_BUILTIN_SCATTERDIV4SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4df", + VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT, + IX86_BUILTIN_SCATTERDIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2df", + VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT, + IX86_BUILTIN_SCATTERDIV2DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8si", + VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT, + IX86_BUILTIN_SCATTERSIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4si", + VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT, + IX86_BUILTIN_SCATTERSIV4SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4di", + VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT, + IX86_BUILTIN_SCATTERSIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2di", + VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT, + IX86_BUILTIN_SCATTERSIV2DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8si", + VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT, + IX86_BUILTIN_SCATTERDIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4si", + VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT, + IX86_BUILTIN_SCATTERDIV4SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4di", + VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT, + IX86_BUILTIN_SCATTERDIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2di", + VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT, + IX86_BUILTIN_SCATTERDIV2DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8df ", + VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT, + IX86_BUILTIN_SCATTERALTSIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16sf ", + VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT, + IX86_BUILTIN_SCATTERALTDIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8di ", + 
VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT, + IX86_BUILTIN_SCATTERALTSIV8DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16si ", + VOID_FTYPE_PINT_HI_V8DI_V16SI_INT, + IX86_BUILTIN_SCATTERALTDIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4df ", + VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, + IX86_BUILTIN_SCATTERALTSIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8sf ", + VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, + IX86_BUILTIN_SCATTERALTDIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4di ", + VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, + IX86_BUILTIN_SCATTERALTSIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8si ", + VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, + IX86_BUILTIN_SCATTERALTDIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2df ", + VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, + IX86_BUILTIN_SCATTERALTSIV2DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4sf ", + VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, + IX86_BUILTIN_SCATTERALTDIV4SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2di ", + VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, + IX86_BUILTIN_SCATTERALTSIV2DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4si ", + VOID_FTYPE_PINT_QI_V2DI_V4SI_INT, + IX86_BUILTIN_SCATTERALTDIV4SI); + + /* AVX512PF */ + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdpd", + VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, + IX86_BUILTIN_GATHERPFDPD); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdps", + VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, + IX86_BUILTIN_GATHERPFDPS); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqpd", + VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, + IX86_BUILTIN_GATHERPFQPD); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqps", + VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, + IX86_BUILTIN_GATHERPFQPS); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdpd", + VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, + IX86_BUILTIN_SCATTERPFDPD); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdps", + VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, + IX86_BUILTIN_SCATTERPFDPS); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqpd", + VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, + IX86_BUILTIN_SCATTERPFQPD); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqps", + VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, + IX86_BUILTIN_SCATTERPFQPS); + + /* SHA */ + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg1", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg2", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1nexte", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1rnds4", + V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg1", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg2", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256rnds2", + V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2); + + /* RTM. 
*/ + def_builtin (OPTION_MASK_ISA_RTM, 0, "__builtin_ia32_xabort", + VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT); + + /* MMX access to the vec_init patterns. */ + def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v2si", + V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); + + def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v4hi", + V4HI_FTYPE_HI_HI_HI_HI, + IX86_BUILTIN_VEC_INIT_V4HI); + + def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v8qi", + V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI, + IX86_BUILTIN_VEC_INIT_V8QI); + + /* Access to the vec_extract patterns. */ + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2df", + DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF); + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2di", + DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI); + def_builtin_const (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_vec_ext_v4sf", + FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF); + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v4si", + SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI); + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v8hi", + HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI); + + def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A + /* As it uses V4HImode, we have to require -mmmx too. */ + | OPTION_MASK_ISA_MMX, 0, + "__builtin_ia32_vec_ext_v4hi", + HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI); + + def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_ext_v2si", + SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI); + + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v16qi", + QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI); + + /* Access to the vec_set patterns. */ + def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_vec_set_v2di", + V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4sf", + V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4si", + V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI); + + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_set_v8hi", + V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI); + + def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A + /* As it uses V4HImode, we have to require -mmmx too. 
*/ + | OPTION_MASK_ISA_MMX, 0, + "__builtin_ia32_vec_set_v4hi", + V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v16qi", + V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI); + + /* RDSEED */ + def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_hi_step", + INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP); + def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_si_step", + INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP); + def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_rdseed_di_step", + INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP); + + /* ADCX */ + def_builtin (0, 0, "__builtin_ia32_addcarryx_u32", + UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32); + def_builtin (OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_addcarryx_u64", + UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, + IX86_BUILTIN_ADDCARRYX64); + + /* SBB */ + def_builtin (0, 0, "__builtin_ia32_sbb_u32", + UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32); + def_builtin (OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_sbb_u64", + UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, + IX86_BUILTIN_SBB64); + + /* Read/write FLAGS. */ + if (TARGET_64BIT) + { + def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_readeflags_u64", + UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); + def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_writeeflags_u64", + VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS); + } + else + { + def_builtin (0, 0, "__builtin_ia32_readeflags_u32", + UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); + def_builtin (0, 0, "__builtin_ia32_writeeflags_u32", + VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS); + } + + /* CLFLUSHOPT. */ + def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, 0, "__builtin_ia32_clflushopt", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT); + + /* CLWB. */ + def_builtin (OPTION_MASK_ISA_CLWB, 0, "__builtin_ia32_clwb", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB); + + /* MONITORX and MWAITX. */ + def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx", + VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX); + def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx", + VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX); + + /* CLZERO. */ + def_builtin (0, OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO); + + /* WAITPKG. */ + def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umonitor", + VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR); + def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umwait", + UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT); + def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_tpause", + UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE); + + /* CLDEMOTE. */ + def_builtin (0, OPTION_MASK_ISA_CLDEMOTE, "__builtin_ia32_cldemote", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE); + + /* Add FMA4 multi-arg argument instructions */ + for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST, + IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, + ARRAY_SIZE (bdesc_multi_arg) - 1); + + /* Add CET inrinsics. 
*/ + for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, + IX86_BUILTIN__BDESC_CET_FIRST, + ARRAY_SIZE (bdesc_cet) - 1); + + for (i = 0, d = bdesc_cet_rdssp; + i < ARRAY_SIZE (bdesc_cet_rdssp); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, + IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, + ARRAY_SIZE (bdesc_cet_rdssp) - 1); +} + +#undef BDESC_VERIFY +#undef BDESC_VERIFYS + +/* Make builtins to detect cpu type and features supported. NAME is + the builtin name, CODE is the builtin code, and FTYPE is the function + type of the builtin. */ + +static void +make_cpu_type_builtin (const char* name, int code, + enum ix86_builtin_func_type ftype, bool is_const) +{ + tree decl; + tree type; + + type = ix86_get_builtin_func_type (ftype); + decl = add_builtin_function (name, type, code, BUILT_IN_MD, + NULL, NULL_TREE); + gcc_assert (decl != NULL_TREE); + ix86_builtins[(int) code] = decl; + TREE_READONLY (decl) = is_const; +} + +/* Make builtins to get CPU type and features supported. The created + builtins are : + + __builtin_cpu_init (), to detect cpu type and features, + __builtin_cpu_is (""), to check if cpu is of type , + __builtin_cpu_supports (""), to check if cpu supports + */ + +static void +ix86_init_platform_type_builtins (void) +{ + make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT, + INT_FTYPE_VOID, false); + make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS, + INT_FTYPE_PCCHAR, true); + make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS, + INT_FTYPE_PCCHAR, true); +} + +/* Internal method for ix86_init_builtins. 
*/ + +static void +ix86_init_builtins_va_builtins_abi (void) +{ + tree ms_va_ref, sysv_va_ref; + tree fnvoid_va_end_ms, fnvoid_va_end_sysv; + tree fnvoid_va_start_ms, fnvoid_va_start_sysv; + tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv; + tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE; + + if (!TARGET_64BIT) + return; + fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE); + fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE); + ms_va_ref = build_reference_type (ms_va_list_type_node); + sysv_va_ref = build_pointer_type (TREE_TYPE (sysv_va_list_type_node)); + + fnvoid_va_end_ms = build_function_type_list (void_type_node, ms_va_ref, + NULL_TREE); + fnvoid_va_start_ms + = build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE); + fnvoid_va_end_sysv + = build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE); + fnvoid_va_start_sysv + = build_varargs_function_type_list (void_type_node, sysv_va_ref, + NULL_TREE); + fnvoid_va_copy_ms + = build_function_type_list (void_type_node, ms_va_ref, + ms_va_list_type_node, NULL_TREE); + fnvoid_va_copy_sysv + = build_function_type_list (void_type_node, sysv_va_ref, + sysv_va_ref, NULL_TREE); + + add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms, + BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms, + BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms, + BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv, + BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv); + add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv, + BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv); + add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv, + BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv); +} + +static void +ix86_init_builtin_types (void) +{ + tree float80_type_node, const_string_type_node; + + /* The __float80 type. */ + float80_type_node = long_double_type_node; + if (TYPE_MODE (float80_type_node) != XFmode) + { + if (float64x_type_node != NULL_TREE + && TYPE_MODE (float64x_type_node) == XFmode) + float80_type_node = float64x_type_node; + else + { + /* The __float80 type. */ + float80_type_node = make_node (REAL_TYPE); + + TYPE_PRECISION (float80_type_node) = 80; + layout_type (float80_type_node); + } + } + lang_hooks.types.register_builtin_type (float80_type_node, "__float80"); + + /* The __float128 type. The node has already been created as + _Float128, so we only need to register the __float128 name for + it. */ + lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); + + const_string_type_node + = build_pointer_type (build_qualified_type + (char_type_node, TYPE_QUAL_CONST)); + + /* This macro is built by i386-builtin-types.awk. */ + DEFINE_BUILTIN_PRIMITIVE_TYPES; +} + +void +ix86_init_builtins (void) +{ + tree ftype, decl; + + ix86_init_builtin_types (); + + /* Builtins to get CPU type and features. */ + ix86_init_platform_type_builtins (); + + /* TFmode support builtins. 
*/ + def_builtin_const (0, 0, "__builtin_infq", + FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ); + def_builtin_const (0, 0, "__builtin_huge_valq", + FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ); + + ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING); + decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ, + BUILT_IN_MD, "nanq", NULL_TREE); + TREE_READONLY (decl) = 1; + ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl; + + decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ, + BUILT_IN_MD, "nansq", NULL_TREE); + TREE_READONLY (decl) = 1; + ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl; + + /* We will expand them to normal call if SSE isn't available since + they are used by libgcc. */ + ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128); + decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ, + BUILT_IN_MD, "__fabstf2", NULL_TREE); + TREE_READONLY (decl) = 1; + ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl; + + ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128); + decl = add_builtin_function ("__builtin_copysignq", ftype, + IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD, + "__copysigntf3", NULL_TREE); + TREE_READONLY (decl) = 1; + ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl; + + ix86_init_tm_builtins (); + ix86_init_mmx_sse_builtins (); + + if (TARGET_LP64) + ix86_init_builtins_va_builtins_abi (); + +#ifdef SUBTARGET_INIT_BUILTINS + SUBTARGET_INIT_BUILTINS; +#endif +} + +/* Return the ix86 builtin for CODE. */ + +tree +ix86_builtin_decl (unsigned code, bool) +{ + if (code >= IX86_BUILTIN_MAX) + return error_mark_node; + + return ix86_builtins[code]; +} + +/* This returns the target-specific builtin with code CODE if + current_function_decl has visibility on this builtin, which is checked + using isa flags. Returns NULL_TREE otherwise. */ + +static tree ix86_get_builtin (enum ix86_builtins code) +{ + struct cl_target_option *opts; + tree target_tree = NULL_TREE; + + /* Determine the isa flags of current_function_decl. */ + + if (current_function_decl) + target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl); + + if (target_tree == NULL) + target_tree = target_option_default_node; + + opts = TREE_TARGET_OPTION (target_tree); + + if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags) + || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2)) + return ix86_builtin_decl (code, true); + else + return NULL_TREE; +} + +/* Vectorization library interface and handlers. */ +tree (*ix86_veclib_handler) (combined_fn, tree, tree); + +/* Returns a function decl for a vectorized version of the combined function + with combined_fn code FN and the result vector type TYPE, or NULL_TREE + if it is not available. */ + +tree +ix86_builtin_vectorized_function (unsigned int fn, tree type_out, + tree type_in) +{ + machine_mode in_mode, out_mode; + int in_n, out_n; + + if (TREE_CODE (type_out) != VECTOR_TYPE + || TREE_CODE (type_in) != VECTOR_TYPE) + return NULL_TREE; + + out_mode = TYPE_MODE (TREE_TYPE (type_out)); + out_n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + + switch (fn) + { + CASE_CFN_EXP2: + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_EXP2PS); + } + break; + + CASE_CFN_IFLOOR: + CASE_CFN_LFLOOR: + CASE_CFN_LLFLOOR: + /* The round insn does not trap on denormals. 
*/ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX); + else if (out_n == 8 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); + } + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512); + } + break; + + CASE_CFN_ICEIL: + CASE_CFN_LCEIL: + CASE_CFN_LLCEIL: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX); + else if (out_n == 8 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); + } + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512); + } + break; + + CASE_CFN_IRINT: + CASE_CFN_LRINT: + CASE_CFN_LLRINT: + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX); + else if (out_n == 8 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512); + } + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512); + } + break; + + CASE_CFN_IROUND: + CASE_CFN_LROUND: + CASE_CFN_LLROUND: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX); + else if (out_n == 8 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); + } + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512); + } + break; + + CASE_CFN_FLOOR: + /* The round insn does not trap on denormals. 
*/ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD); + else if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD256); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD512); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); + } + break; + + CASE_CFN_CEIL: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_CEILPD); + else if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CEILPD256); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPD512); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CEILPS); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPS256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_CEILPS512); + } + break; + + CASE_CFN_TRUNC: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPD); + else if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPS); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); + } + break; + + CASE_CFN_RINT: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_RINTPD); + else if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_RINTPD256); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_RINTPS); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_RINTPS256); + } + break; + + CASE_CFN_FMA: + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_VFMADDPD); + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_VFMADDPS); + if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256); + } + break; + + default: + break; + } + + /* Dispatch to a handler for a vectorization library. 
*/ + if (ix86_veclib_handler) + return ix86_veclib_handler (combined_fn (fn), type_out, type_in); + + return NULL_TREE; +} + +/* Returns a decl of a function that implements gather load with + memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. + Return NULL_TREE if it is not available. */ + +tree +ix86_vectorize_builtin_gather (const_tree mem_vectype, + const_tree index_type, int scale) +{ + bool si; + enum ix86_builtins code; + + if (! TARGET_AVX2 || !TARGET_USE_GATHER) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE + && !POINTER_TYPE_P (index_type)) + || (TYPE_MODE (index_type) != SImode + && TYPE_MODE (index_type) != DImode)) + return NULL_TREE; + + if (TYPE_PRECISION (index_type) > POINTER_SIZE) + return NULL_TREE; + + /* v*gather* insn sign extends index to pointer mode. */ + if (TYPE_PRECISION (index_type) < POINTER_SIZE + && TYPE_UNSIGNED (index_type)) + return NULL_TREE; + + if (scale <= 0 + || scale > 8 + || (scale & (scale - 1)) != 0) + return NULL_TREE; + + si = TYPE_MODE (index_type) == SImode; + switch (TYPE_MODE (mem_vectype)) + { + case E_V2DFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF; + else + code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; + break; + case E_V4DFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF; + else + code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; + break; + case E_V2DImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI; + else + code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; + break; + case E_V4DImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI; + else + code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; + break; + case E_V4SFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF; + else + code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; + break; + case E_V8SFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF; + else + code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; + break; + case E_V4SImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI; + else + code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; + break; + case E_V8SImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI; + else + code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; + break; + case E_V8DFmode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF; + else + return NULL_TREE; + break; + case E_V8DImode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI; + else + return NULL_TREE; + break; + case E_V16SFmode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF; + else + return NULL_TREE; + break; + case E_V16SImode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI; + else + return NULL_TREE; + break; + default: + return NULL_TREE; + } + + return ix86_get_builtin (code); +} + +/* Returns a code for a target-specific builtin that implements + reciprocal of the function, or NULL_TREE if not available. 
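As an illustration (not part of the backported patch): ix86_vectorize_builtin_gather above only offers a gather builtin when TARGET_AVX2 and the gather tuning knob are on, the index is SImode or DImode no wider than a pointer, and the scale is 1, 2, 4 or 8. The hedged sketch below shows the kind of indexed-load loop this hook lets the vectorizer map onto the IX86_BUILTIN_GATHER* builtins; the function and parameter names are invented.

/* Illustrative sketch only, not part of the patch.  An indexed load of this
   shape is what the gather selection above serves; for example, a V4DF
   vector with an SImode index picks IX86_BUILTIN_GATHERALTSIV4DF (or its
   AVX-512VL variant) per the switch above.  */
void
indexed_load (double *restrict dst, const double *restrict table,
              const int *restrict idx, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = table[idx[i]];
}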
*/ + +tree +ix86_builtin_reciprocal (tree fndecl) +{ + enum ix86_builtins fn_code + = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); + switch (fn_code) + { + /* Vectorized version of sqrt to rsqrt conversion. */ + case IX86_BUILTIN_SQRTPS_NR: + return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR); + + case IX86_BUILTIN_SQRTPS_NR256: + return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256); + + default: + return NULL_TREE; + } +} + +/* Priority of i386 features, greater value is higher priority. This is + used to decide the order in which function dispatch must happen. For + instance, a version specialized for SSE4.2 should be checked for dispatch + before a version for SSE3, as SSE4.2 implies SSE3. */ +enum feature_priority +{ + P_ZERO = 0, + P_MMX, + P_SSE, + P_SSE2, + P_SSE3, + P_SSSE3, + P_PROC_SSSE3, + P_SSE4_A, + P_PROC_SSE4_A, + P_SSE4_1, + P_SSE4_2, + P_PROC_SSE4_2, + P_POPCNT, + P_AES, + P_PCLMUL, + P_AVX, + P_PROC_AVX, + P_BMI, + P_PROC_BMI, + P_FMA4, + P_XOP, + P_PROC_XOP, + P_FMA, + P_PROC_FMA, + P_BMI2, + P_AVX2, + P_PROC_AVX2, + P_AVX512F, + P_PROC_AVX512F +}; + +/* This is the order of bit-fields in __processor_features in cpuinfo.c */ +enum processor_features +{ + F_CMOV = 0, + F_MMX, + F_POPCNT, + F_SSE, + F_SSE2, + F_SSE3, + F_SSSE3, + F_SSE4_1, + F_SSE4_2, + F_AVX, + F_AVX2, + F_SSE4_A, + F_FMA4, + F_XOP, + F_FMA, + F_AVX512F, + F_BMI, + F_BMI2, + F_AES, + F_PCLMUL, + F_AVX512VL, + F_AVX512BW, + F_AVX512DQ, + F_AVX512CD, + F_AVX512ER, + F_AVX512PF, + F_AVX512VBMI, + F_AVX512IFMA, + F_AVX5124VNNIW, + F_AVX5124FMAPS, + F_AVX512VPOPCNTDQ, + F_AVX512VBMI2, + F_GFNI, + F_VPCLMULQDQ, + F_AVX512VNNI, + F_AVX512BITALG, + F_MAX +}; + +/* These are the values for vendor types and cpu types and subtypes + in cpuinfo.c. Cpu types and subtypes should be subtracted by + the corresponding start value. 
*/ +enum processor_model +{ + M_INTEL = 1, + M_AMD, + M_CPU_TYPE_START, + M_INTEL_BONNELL, + M_INTEL_CORE2, + M_INTEL_COREI7, + M_AMDFAM10H, + M_AMDFAM15H, + M_INTEL_SILVERMONT, + M_INTEL_KNL, + M_AMD_BTVER1, + M_AMD_BTVER2, + M_AMDFAM17H, + M_INTEL_KNM, + M_INTEL_GOLDMONT, + M_INTEL_GOLDMONT_PLUS, + M_INTEL_TREMONT, + M_CPU_SUBTYPE_START, + M_INTEL_COREI7_NEHALEM, + M_INTEL_COREI7_WESTMERE, + M_INTEL_COREI7_SANDYBRIDGE, + M_AMDFAM10H_BARCELONA, + M_AMDFAM10H_SHANGHAI, + M_AMDFAM10H_ISTANBUL, + M_AMDFAM15H_BDVER1, + M_AMDFAM15H_BDVER2, + M_AMDFAM15H_BDVER3, + M_AMDFAM15H_BDVER4, + M_AMDFAM17H_ZNVER1, + M_INTEL_COREI7_IVYBRIDGE, + M_INTEL_COREI7_HASWELL, + M_INTEL_COREI7_BROADWELL, + M_INTEL_COREI7_SKYLAKE, + M_INTEL_COREI7_SKYLAKE_AVX512, + M_INTEL_COREI7_CANNONLAKE, + M_INTEL_COREI7_ICELAKE_CLIENT, + M_INTEL_COREI7_ICELAKE_SERVER, + M_AMDFAM17H_ZNVER2, + M_INTEL_COREI7_CASCADELAKE +}; + +struct _arch_names_table +{ + const char *const name; + const enum processor_model model; +}; + +static const _arch_names_table arch_names_table[] = +{ + {"amd", M_AMD}, + {"intel", M_INTEL}, + {"atom", M_INTEL_BONNELL}, + {"slm", M_INTEL_SILVERMONT}, + {"core2", M_INTEL_CORE2}, + {"corei7", M_INTEL_COREI7}, + {"nehalem", M_INTEL_COREI7_NEHALEM}, + {"westmere", M_INTEL_COREI7_WESTMERE}, + {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE}, + {"ivybridge", M_INTEL_COREI7_IVYBRIDGE}, + {"haswell", M_INTEL_COREI7_HASWELL}, + {"broadwell", M_INTEL_COREI7_BROADWELL}, + {"skylake", M_INTEL_COREI7_SKYLAKE}, + {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512}, + {"cannonlake", M_INTEL_COREI7_CANNONLAKE}, + {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT}, + {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER}, + {"cascadelake", M_INTEL_COREI7_CASCADELAKE}, + {"bonnell", M_INTEL_BONNELL}, + {"silvermont", M_INTEL_SILVERMONT}, + {"goldmont", M_INTEL_GOLDMONT}, + {"goldmont-plus", M_INTEL_GOLDMONT_PLUS}, + {"tremont", M_INTEL_TREMONT}, + {"knl", M_INTEL_KNL}, + {"knm", M_INTEL_KNM}, + {"amdfam10h", M_AMDFAM10H}, + {"barcelona", M_AMDFAM10H_BARCELONA}, + {"shanghai", M_AMDFAM10H_SHANGHAI}, + {"istanbul", M_AMDFAM10H_ISTANBUL}, + {"btver1", M_AMD_BTVER1}, + {"amdfam15h", M_AMDFAM15H}, + {"bdver1", M_AMDFAM15H_BDVER1}, + {"bdver2", M_AMDFAM15H_BDVER2}, + {"bdver3", M_AMDFAM15H_BDVER3}, + {"bdver4", M_AMDFAM15H_BDVER4}, + {"btver2", M_AMD_BTVER2}, + {"amdfam17h", M_AMDFAM17H}, + {"znver1", M_AMDFAM17H_ZNVER1}, + {"znver2", M_AMDFAM17H_ZNVER2}, +}; + +/* These are the target attribute strings for which a dispatcher is + available, from fold_builtin_cpu. 
*/ +struct _isa_names_table +{ + const char *const name; + const enum processor_features feature; + const enum feature_priority priority; +}; + +static const _isa_names_table isa_names_table[] = +{ + {"cmov", F_CMOV, P_ZERO}, + {"mmx", F_MMX, P_MMX}, + {"popcnt", F_POPCNT, P_POPCNT}, + {"sse", F_SSE, P_SSE}, + {"sse2", F_SSE2, P_SSE2}, + {"sse3", F_SSE3, P_SSE3}, + {"ssse3", F_SSSE3, P_SSSE3}, + {"sse4a", F_SSE4_A, P_SSE4_A}, + {"sse4.1", F_SSE4_1, P_SSE4_1}, + {"sse4.2", F_SSE4_2, P_SSE4_2}, + {"avx", F_AVX, P_AVX}, + {"fma4", F_FMA4, P_FMA4}, + {"xop", F_XOP, P_XOP}, + {"fma", F_FMA, P_FMA}, + {"avx2", F_AVX2, P_AVX2}, + {"avx512f", F_AVX512F, P_AVX512F}, + {"bmi", F_BMI, P_BMI}, + {"bmi2", F_BMI2, P_BMI2}, + {"aes", F_AES, P_AES}, + {"pclmul", F_PCLMUL, P_PCLMUL}, + {"avx512vl",F_AVX512VL, P_ZERO}, + {"avx512bw",F_AVX512BW, P_ZERO}, + {"avx512dq",F_AVX512DQ, P_ZERO}, + {"avx512cd",F_AVX512CD, P_ZERO}, + {"avx512er",F_AVX512ER, P_ZERO}, + {"avx512pf",F_AVX512PF, P_ZERO}, + {"avx512vbmi",F_AVX512VBMI, P_ZERO}, + {"avx512ifma",F_AVX512IFMA, P_ZERO}, + {"avx5124vnniw",F_AVX5124VNNIW, P_ZERO}, + {"avx5124fmaps",F_AVX5124FMAPS, P_ZERO}, + {"avx512vpopcntdq",F_AVX512VPOPCNTDQ, P_ZERO}, + {"avx512vbmi2", F_AVX512VBMI2, P_ZERO}, + {"gfni", F_GFNI, P_ZERO}, + {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, + {"avx512vnni", F_AVX512VNNI, P_ZERO}, + {"avx512bitalg", F_AVX512BITALG, P_ZERO} +}; + +/* This parses the attribute arguments to target in DECL and determines + the right builtin to use to match the platform specification. + It returns the priority value for this version decl. If PREDICATE_LIST + is not NULL, it stores the list of cpu features that need to be checked + before dispatching this function. */ + +unsigned int +get_builtin_code_for_version (tree decl, tree *predicate_list) +{ + tree attrs; + struct cl_target_option cur_target; + tree target_node; + struct cl_target_option *new_target; + const char *arg_str = NULL; + const char *attrs_str = NULL; + char *tok_str = NULL; + char *token; + + enum feature_priority priority = P_ZERO; + + static unsigned int NUM_FEATURES + = sizeof (isa_names_table) / sizeof (_isa_names_table); + + unsigned int i; + + tree predicate_chain = NULL_TREE; + tree predicate_decl, predicate_arg; + + attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); + gcc_assert (attrs != NULL); + + attrs = TREE_VALUE (TREE_VALUE (attrs)); + + gcc_assert (TREE_CODE (attrs) == STRING_CST); + attrs_str = TREE_STRING_POINTER (attrs); + + /* Return priority zero for default function. */ + if (strcmp (attrs_str, "default") == 0) + return 0; + + /* Handle arch= if specified. For priority, set it to be 1 more than + the best instruction set the processor can handle. For instance, if + there is a version for atom and a version for ssse3 (the highest ISA + priority for atom), the atom version must be checked for dispatch + before the ssse3 version. 
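As an illustration (not part of the backported patch): this priority machinery is what backs function multiversioning. In the sketch below, which uses the target_clones attribute documented for GCC, get_builtin_code_for_version gives the "avx2" clone priority P_AVX2, gives the "arch=atom" clone the priority of the best ISA that processor handles (ssse3, per the surrounding comment), and leaves the "default" clone to be checked last; the function body is invented.

/* Illustrative sketch only, not part of the patch.  Each clone's target
   string is fed through get_builtin_code_for_version to decide dispatch
   order and to build the __builtin_cpu_is/__builtin_cpu_supports checks
   used by the resolver.  */
__attribute__ ((target_clones ("avx2", "arch=atom", "default")))
double
dot (const double *a, const double *b, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    s += a[i] * b[i];
  return s;
}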
*/ + if (strstr (attrs_str, "arch=") != NULL) + { + cl_target_option_save (&cur_target, &global_options); + target_node + = ix86_valid_target_attribute_tree (decl, attrs, &global_options, + &global_options_set, 0); + + gcc_assert (target_node); + if (target_node == error_mark_node) + return 0; + new_target = TREE_TARGET_OPTION (target_node); + gcc_assert (new_target); + + if (new_target->arch_specified && new_target->arch > 0) + { + switch (new_target->arch) + { + case PROCESSOR_CORE2: + arg_str = "core2"; + priority = P_PROC_SSSE3; + break; + case PROCESSOR_NEHALEM: + if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_PCLMUL) + { + arg_str = "westmere"; + priority = P_PCLMUL; + } + else + { + /* We translate "arch=corei7" and "arch=nehalem" to + "corei7" so that it will be mapped to M_INTEL_COREI7 + as cpu type to cover all M_INTEL_COREI7_XXXs. */ + arg_str = "corei7"; + priority = P_PROC_SSE4_2; + } + break; + case PROCESSOR_SANDYBRIDGE: + if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C) + arg_str = "ivybridge"; + else + arg_str = "sandybridge"; + priority = P_PROC_AVX; + break; + case PROCESSOR_HASWELL: + if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX) + arg_str = "broadwell"; + else + arg_str = "haswell"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_SKYLAKE: + arg_str = "skylake"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_SKYLAKE_AVX512: + arg_str = "skylake-avx512"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_CANNONLAKE: + arg_str = "cannonlake"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_ICELAKE_CLIENT: + arg_str = "icelake-client"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_ICELAKE_SERVER: + arg_str = "icelake-server"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_CASCADELAKE: + arg_str = "cascadelake"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_BONNELL: + arg_str = "bonnell"; + priority = P_PROC_SSSE3; + break; + case PROCESSOR_KNL: + arg_str = "knl"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_KNM: + arg_str = "knm"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_SILVERMONT: + arg_str = "silvermont"; + priority = P_PROC_SSE4_2; + break; + case PROCESSOR_GOLDMONT: + arg_str = "goldmont"; + priority = P_PROC_SSE4_2; + break; + case PROCESSOR_GOLDMONT_PLUS: + arg_str = "goldmont-plus"; + priority = P_PROC_SSE4_2; + break; + case PROCESSOR_TREMONT: + arg_str = "tremont"; + priority = P_PROC_SSE4_2; + break; + case PROCESSOR_AMDFAM10: + arg_str = "amdfam10h"; + priority = P_PROC_SSE4_A; + break; + case PROCESSOR_BTVER1: + arg_str = "btver1"; + priority = P_PROC_SSE4_A; + break; + case PROCESSOR_BTVER2: + arg_str = "btver2"; + priority = P_PROC_BMI; + break; + case PROCESSOR_BDVER1: + arg_str = "bdver1"; + priority = P_PROC_XOP; + break; + case PROCESSOR_BDVER2: + arg_str = "bdver2"; + priority = P_PROC_FMA; + break; + case PROCESSOR_BDVER3: + arg_str = "bdver3"; + priority = P_PROC_FMA; + break; + case PROCESSOR_BDVER4: + arg_str = "bdver4"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_ZNVER1: + arg_str = "znver1"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_ZNVER2: + arg_str = "znver2"; + priority = P_PROC_AVX2; + break; + } + } + + cl_target_option_restore (&global_options, &cur_target); + + if (predicate_list && arg_str == NULL) + { + error_at (DECL_SOURCE_LOCATION (decl), + "no dispatcher found for the versioning attributes"); + return 0; + } + + if (predicate_list) + { + predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS]; + /* For a C string literal 
the length includes the trailing NULL. */ + predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str); + predicate_chain = tree_cons (predicate_decl, predicate_arg, + predicate_chain); + } + } + + /* Process feature name. */ + tok_str = (char *) xmalloc (strlen (attrs_str) + 1); + strcpy (tok_str, attrs_str); + token = strtok (tok_str, ","); + predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS]; + + while (token != NULL) + { + /* Do not process "arch=" */ + if (strncmp (token, "arch=", 5) == 0) + { + token = strtok (NULL, ","); + continue; + } + for (i = 0; i < NUM_FEATURES; ++i) + { + if (strcmp (token, isa_names_table[i].name) == 0) + { + if (predicate_list) + { + predicate_arg = build_string_literal ( + strlen (isa_names_table[i].name) + 1, + isa_names_table[i].name); + predicate_chain = tree_cons (predicate_decl, predicate_arg, + predicate_chain); + } + /* Find the maximum priority feature. */ + if (isa_names_table[i].priority > priority) + priority = isa_names_table[i].priority; + + break; + } + } + if (predicate_list && priority == P_ZERO) + { + error_at (DECL_SOURCE_LOCATION (decl), + "ISA %qs is not supported in % attribute, " + "use % syntax", token); + return 0; + } + token = strtok (NULL, ","); + } + free (tok_str); + + if (predicate_list && predicate_chain == NULL_TREE) + { + error_at (DECL_SOURCE_LOCATION (decl), + "no dispatcher found for the versioning attributes: %s", + attrs_str); + return 0; + } + else if (predicate_list) + { + predicate_chain = nreverse (predicate_chain); + *predicate_list = predicate_chain; + } + + return priority; +} + +/* This builds the processor_model struct type defined in + libgcc/config/i386/cpuinfo.c */ + +static tree +build_processor_model_struct (void) +{ + const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype", + "__cpu_features"}; + tree field = NULL_TREE, field_chain = NULL_TREE; + int i; + tree type = make_node (RECORD_TYPE); + + /* The first 3 fields are unsigned int. */ + for (i = 0; i < 3; ++i) + { + field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier (field_name[i]), unsigned_type_node); + if (field_chain != NULL_TREE) + DECL_CHAIN (field) = field_chain; + field_chain = field; + } + + /* The last field is an array of unsigned integers of size one. */ + field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier (field_name[3]), + build_array_type (unsigned_type_node, + build_index_type (size_one_node))); + if (field_chain != NULL_TREE) + DECL_CHAIN (field) = field_chain; + field_chain = field; + + finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE); + return type; +} + +/* Returns a extern, comdat VAR_DECL of type TYPE and name NAME. 
*/ + +static tree +make_var_decl (tree type, const char *name) +{ + tree new_decl; + + new_decl = build_decl (UNKNOWN_LOCATION, + VAR_DECL, + get_identifier(name), + type); + + DECL_EXTERNAL (new_decl) = 1; + TREE_STATIC (new_decl) = 1; + TREE_PUBLIC (new_decl) = 1; + DECL_INITIAL (new_decl) = 0; + DECL_ARTIFICIAL (new_decl) = 0; + DECL_PRESERVE_P (new_decl) = 1; + + make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl)); + assemble_variable (new_decl, 0, 0, 0); + + return new_decl; +} + +/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded + into an integer defined in libgcc/config/i386/cpuinfo.c */ + +tree +fold_builtin_cpu (tree fndecl, tree *args) +{ + unsigned int i; + enum ix86_builtins fn_code + = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); + tree param_string_cst = NULL; + + tree __processor_model_type = build_processor_model_struct (); + tree __cpu_model_var = make_var_decl (__processor_model_type, + "__cpu_model"); + + + varpool_node::add (__cpu_model_var); + + gcc_assert ((args != NULL) && (*args != NULL)); + + param_string_cst = *args; + while (param_string_cst + && TREE_CODE (param_string_cst) != STRING_CST) + { + /* *args must be a expr that can contain other EXPRS leading to a + STRING_CST. */ + if (!EXPR_P (param_string_cst)) + { + error ("parameter to builtin must be a string constant or literal"); + return integer_zero_node; + } + param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0); + } + + gcc_assert (param_string_cst); + + if (fn_code == IX86_BUILTIN_CPU_IS) + { + tree ref; + tree field; + tree final; + + unsigned int field_val = 0; + unsigned int NUM_ARCH_NAMES + = sizeof (arch_names_table) / sizeof (struct _arch_names_table); + + for (i = 0; i < NUM_ARCH_NAMES; i++) + if (strcmp (arch_names_table[i].name, + TREE_STRING_POINTER (param_string_cst)) == 0) + break; + + if (i == NUM_ARCH_NAMES) + { + error ("parameter to builtin not valid: %s", + TREE_STRING_POINTER (param_string_cst)); + return integer_zero_node; + } + + field = TYPE_FIELDS (__processor_model_type); + field_val = arch_names_table[i].model; + + /* CPU types are stored in the next field. */ + if (field_val > M_CPU_TYPE_START + && field_val < M_CPU_SUBTYPE_START) + { + field = DECL_CHAIN (field); + field_val -= M_CPU_TYPE_START; + } + + /* CPU subtypes are stored in the next field. */ + if (field_val > M_CPU_SUBTYPE_START) + { + field = DECL_CHAIN ( DECL_CHAIN (field)); + field_val -= M_CPU_SUBTYPE_START; + } + + /* Get the appropriate field in __cpu_model. */ + ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, + field, NULL_TREE); + + /* Check the value. 
*/ + final = build2 (EQ_EXPR, unsigned_type_node, ref, + build_int_cstu (unsigned_type_node, field_val)); + return build1 (CONVERT_EXPR, integer_type_node, final); + } + else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS) + { + tree ref; + tree array_elt; + tree field; + tree final; + + unsigned int field_val = 0; + unsigned int NUM_ISA_NAMES + = sizeof (isa_names_table) / sizeof (struct _isa_names_table); + + for (i = 0; i < NUM_ISA_NAMES; i++) + if (strcmp (isa_names_table[i].name, + TREE_STRING_POINTER (param_string_cst)) == 0) + break; + + if (i == NUM_ISA_NAMES) + { + error ("parameter to builtin not valid: %s", + TREE_STRING_POINTER (param_string_cst)); + return integer_zero_node; + } + + if (isa_names_table[i].feature >= 32) + { + tree __cpu_features2_var = make_var_decl (unsigned_type_node, + "__cpu_features2"); + + varpool_node::add (__cpu_features2_var); + field_val = (1U << (isa_names_table[i].feature - 32)); + /* Return __cpu_features2 & field_val */ + final = build2 (BIT_AND_EXPR, unsigned_type_node, + __cpu_features2_var, + build_int_cstu (unsigned_type_node, field_val)); + return build1 (CONVERT_EXPR, integer_type_node, final); + } + + field = TYPE_FIELDS (__processor_model_type); + /* Get the last field, which is __cpu_features. */ + while (DECL_CHAIN (field)) + field = DECL_CHAIN (field); + + /* Get the appropriate field: __cpu_model.__cpu_features */ + ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, + field, NULL_TREE); + + /* Access the 0th element of __cpu_features array. */ + array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, + integer_zero_node, NULL_TREE, NULL_TREE); + + field_val = (1U << isa_names_table[i].feature); + /* Return __cpu_model.__cpu_features[0] & field_val */ + final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, + build_int_cstu (unsigned_type_node, field_val)); + return build1 (CONVERT_EXPR, integer_type_node, final); + } + gcc_unreachable (); +} + +#include "gt-i386-builtins.h" diff --git a/gcc/config/i386/i386-builtins.h b/gcc/config/i386/i386-builtins.h new file mode 100644 index 000000000..c0264e5bf --- /dev/null +++ b/gcc/config/i386/i386-builtins.h @@ -0,0 +1,330 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_I386_BUILTINS_H +#define GCC_I386_BUILTINS_H + +/* The following file contains several enumerations and data structures + built from the definitions in i386-builtin-types.def. */ + +#include "i386-builtin-types.inc" + +/* Codes for all the SSE/MMX builtins. Builtins not mentioned in any + bdesc_* arrays below should come first, then builtins for each bdesc_* + array in ascending order, so that we can use direct array accesses. 
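As an illustration (not part of the backported patch): fold_builtin_cpu above lowers the two CPU-introspection builtins into direct reads of the libgcc __cpu_model variable; __builtin_cpu_is compares the type/subtype field against the value from arch_names_table, while __builtin_cpu_supports masks a bit of __cpu_model.__cpu_features[0] (or of __cpu_features2 for feature numbers of 32 and above). A small usage sketch follows; the builtins and the argument strings come from the tables above.

/* Illustrative sketch only, not part of the patch.  These calls are folded
   by fold_builtin_cpu into field and bit tests on __cpu_model.  */
#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_is ("skylake"))
    puts ("CPU type is Skylake");
  if (__builtin_cpu_supports ("avx2"))
    puts ("AVX2 is supported");
  return 0;
}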
*/ +enum ix86_builtins +{ + IX86_BUILTIN_MASKMOVQ, + IX86_BUILTIN_LDMXCSR, + IX86_BUILTIN_STMXCSR, + IX86_BUILTIN_MASKMOVDQU, + IX86_BUILTIN_PSLLDQ128, + IX86_BUILTIN_CLFLUSH, + IX86_BUILTIN_MONITOR, + IX86_BUILTIN_MWAIT, + IX86_BUILTIN_UMONITOR, + IX86_BUILTIN_UMWAIT, + IX86_BUILTIN_TPAUSE, + IX86_BUILTIN_CLZERO, + IX86_BUILTIN_CLDEMOTE, + IX86_BUILTIN_VEC_INIT_V2SI, + IX86_BUILTIN_VEC_INIT_V4HI, + IX86_BUILTIN_VEC_INIT_V8QI, + IX86_BUILTIN_VEC_EXT_V2DF, + IX86_BUILTIN_VEC_EXT_V2DI, + IX86_BUILTIN_VEC_EXT_V4SF, + IX86_BUILTIN_VEC_EXT_V4SI, + IX86_BUILTIN_VEC_EXT_V8HI, + IX86_BUILTIN_VEC_EXT_V2SI, + IX86_BUILTIN_VEC_EXT_V4HI, + IX86_BUILTIN_VEC_EXT_V16QI, + IX86_BUILTIN_VEC_SET_V2DI, + IX86_BUILTIN_VEC_SET_V4SF, + IX86_BUILTIN_VEC_SET_V4SI, + IX86_BUILTIN_VEC_SET_V8HI, + IX86_BUILTIN_VEC_SET_V4HI, + IX86_BUILTIN_VEC_SET_V16QI, + IX86_BUILTIN_GATHERSIV2DF, + IX86_BUILTIN_GATHERSIV4DF, + IX86_BUILTIN_GATHERDIV2DF, + IX86_BUILTIN_GATHERDIV4DF, + IX86_BUILTIN_GATHERSIV4SF, + IX86_BUILTIN_GATHERSIV8SF, + IX86_BUILTIN_GATHERDIV4SF, + IX86_BUILTIN_GATHERDIV8SF, + IX86_BUILTIN_GATHERSIV2DI, + IX86_BUILTIN_GATHERSIV4DI, + IX86_BUILTIN_GATHERDIV2DI, + IX86_BUILTIN_GATHERDIV4DI, + IX86_BUILTIN_GATHERSIV4SI, + IX86_BUILTIN_GATHERSIV8SI, + IX86_BUILTIN_GATHERDIV4SI, + IX86_BUILTIN_GATHERDIV8SI, + IX86_BUILTIN_GATHER3SIV8SF, + IX86_BUILTIN_GATHER3SIV4SF, + IX86_BUILTIN_GATHER3SIV4DF, + IX86_BUILTIN_GATHER3SIV2DF, + IX86_BUILTIN_GATHER3DIV8SF, + IX86_BUILTIN_GATHER3DIV4SF, + IX86_BUILTIN_GATHER3DIV4DF, + IX86_BUILTIN_GATHER3DIV2DF, + IX86_BUILTIN_GATHER3SIV8SI, + IX86_BUILTIN_GATHER3SIV4SI, + IX86_BUILTIN_GATHER3SIV4DI, + IX86_BUILTIN_GATHER3SIV2DI, + IX86_BUILTIN_GATHER3DIV8SI, + IX86_BUILTIN_GATHER3DIV4SI, + IX86_BUILTIN_GATHER3DIV4DI, + IX86_BUILTIN_GATHER3DIV2DI, + IX86_BUILTIN_SCATTERSIV8SF, + IX86_BUILTIN_SCATTERSIV4SF, + IX86_BUILTIN_SCATTERSIV4DF, + IX86_BUILTIN_SCATTERSIV2DF, + IX86_BUILTIN_SCATTERDIV8SF, + IX86_BUILTIN_SCATTERDIV4SF, + IX86_BUILTIN_SCATTERDIV4DF, + IX86_BUILTIN_SCATTERDIV2DF, + IX86_BUILTIN_SCATTERSIV8SI, + IX86_BUILTIN_SCATTERSIV4SI, + IX86_BUILTIN_SCATTERSIV4DI, + IX86_BUILTIN_SCATTERSIV2DI, + IX86_BUILTIN_SCATTERDIV8SI, + IX86_BUILTIN_SCATTERDIV4SI, + IX86_BUILTIN_SCATTERDIV4DI, + IX86_BUILTIN_SCATTERDIV2DI, + /* Alternate 4 and 8 element gather/scatter for the vectorizer + where all operands are 32-byte or 64-byte wide respectively. 
*/ + IX86_BUILTIN_GATHERALTSIV4DF, + IX86_BUILTIN_GATHERALTDIV8SF, + IX86_BUILTIN_GATHERALTSIV4DI, + IX86_BUILTIN_GATHERALTDIV8SI, + IX86_BUILTIN_GATHER3ALTDIV16SF, + IX86_BUILTIN_GATHER3ALTDIV16SI, + IX86_BUILTIN_GATHER3ALTSIV4DF, + IX86_BUILTIN_GATHER3ALTDIV8SF, + IX86_BUILTIN_GATHER3ALTSIV4DI, + IX86_BUILTIN_GATHER3ALTDIV8SI, + IX86_BUILTIN_GATHER3ALTSIV8DF, + IX86_BUILTIN_GATHER3ALTSIV8DI, + IX86_BUILTIN_GATHER3DIV16SF, + IX86_BUILTIN_GATHER3DIV16SI, + IX86_BUILTIN_GATHER3DIV8DF, + IX86_BUILTIN_GATHER3DIV8DI, + IX86_BUILTIN_GATHER3SIV16SF, + IX86_BUILTIN_GATHER3SIV16SI, + IX86_BUILTIN_GATHER3SIV8DF, + IX86_BUILTIN_GATHER3SIV8DI, + IX86_BUILTIN_SCATTERALTSIV8DF, + IX86_BUILTIN_SCATTERALTDIV16SF, + IX86_BUILTIN_SCATTERALTSIV8DI, + IX86_BUILTIN_SCATTERALTDIV16SI, + IX86_BUILTIN_SCATTERALTSIV4DF, + IX86_BUILTIN_SCATTERALTDIV8SF, + IX86_BUILTIN_SCATTERALTSIV4DI, + IX86_BUILTIN_SCATTERALTDIV8SI, + IX86_BUILTIN_SCATTERALTSIV2DF, + IX86_BUILTIN_SCATTERALTDIV4SF, + IX86_BUILTIN_SCATTERALTSIV2DI, + IX86_BUILTIN_SCATTERALTDIV4SI, + IX86_BUILTIN_SCATTERDIV16SF, + IX86_BUILTIN_SCATTERDIV16SI, + IX86_BUILTIN_SCATTERDIV8DF, + IX86_BUILTIN_SCATTERDIV8DI, + IX86_BUILTIN_SCATTERSIV16SF, + IX86_BUILTIN_SCATTERSIV16SI, + IX86_BUILTIN_SCATTERSIV8DF, + IX86_BUILTIN_SCATTERSIV8DI, + IX86_BUILTIN_GATHERPFQPD, + IX86_BUILTIN_GATHERPFDPS, + IX86_BUILTIN_GATHERPFDPD, + IX86_BUILTIN_GATHERPFQPS, + IX86_BUILTIN_SCATTERPFDPD, + IX86_BUILTIN_SCATTERPFDPS, + IX86_BUILTIN_SCATTERPFQPD, + IX86_BUILTIN_SCATTERPFQPS, + IX86_BUILTIN_CLWB, + IX86_BUILTIN_CLFLUSHOPT, + IX86_BUILTIN_INFQ, + IX86_BUILTIN_HUGE_VALQ, + IX86_BUILTIN_NANQ, + IX86_BUILTIN_NANSQ, + IX86_BUILTIN_XABORT, + IX86_BUILTIN_ADDCARRYX32, + IX86_BUILTIN_ADDCARRYX64, + IX86_BUILTIN_SBB32, + IX86_BUILTIN_SBB64, + IX86_BUILTIN_RDRAND16_STEP, + IX86_BUILTIN_RDRAND32_STEP, + IX86_BUILTIN_RDRAND64_STEP, + IX86_BUILTIN_RDSEED16_STEP, + IX86_BUILTIN_RDSEED32_STEP, + IX86_BUILTIN_RDSEED64_STEP, + IX86_BUILTIN_MONITORX, + IX86_BUILTIN_MWAITX, + IX86_BUILTIN_CFSTRING, + IX86_BUILTIN_CPU_INIT, + IX86_BUILTIN_CPU_IS, + IX86_BUILTIN_CPU_SUPPORTS, + IX86_BUILTIN_READ_FLAGS, + IX86_BUILTIN_WRITE_FLAGS, + + /* All the remaining builtins are tracked in bdesc_* arrays in + i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after + this point. */ +#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ + code, +#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ + code, \ + IX86_BUILTIN__BDESC_##kindu##_FIRST = code, +#define BDESC_END(kind, next_kind) + +#include "i386-builtin.def" + +#undef BDESC +#undef BDESC_FIRST +#undef BDESC_END + + IX86_BUILTIN_MAX, + + IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX, + + /* Now just the aliases for bdesc_* start/end. */ +#define BDESC(mask, mask2, icode, name, code, comparison, flag) +#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) +#define BDESC_END(kind, next_kind) \ + IX86_BUILTIN__BDESC_##kind##_LAST \ + = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1, + +#include "i386-builtin.def" + +#undef BDESC +#undef BDESC_FIRST +#undef BDESC_END + + /* Just to make sure there is no comma after the last enumerator. */ + IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST +}; + +/* Table of all of the builtin functions that are possible with different ISA's + but are waiting to be built until a function is declared to use that + ISA. 
*/ +struct builtin_isa { + HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */ + HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */ + const char *name; /* function name */ + enum ix86_builtin_func_type tcode; /* type to use in the declaration */ + unsigned char const_p:1; /* true if the declaration is constant */ + unsigned char pure_p:1; /* true if the declaration has pure attribute */ + bool set_and_not_built_p; +}; + +/* Bits for builtin_description.flag. */ + +/* Set when we don't support the comparison natively, and should + swap_comparison in order to support it. */ +#define BUILTIN_DESC_SWAP_OPERANDS 1 + +struct builtin_description +{ + const HOST_WIDE_INT mask; + const HOST_WIDE_INT mask2; + const enum insn_code icode; + const char *const name; + const enum ix86_builtins code; + const enum rtx_code comparison; + const int flag; +}; + +#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT +#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT +#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT +#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT +#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF +#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF +#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF +#define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF +#define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI +#define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI +#define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI +#define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI +#define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI +#define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI +#define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI +#define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI +#define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI +#define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI +#define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF +#define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF +#define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI +#define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI +#define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI +#define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI +#define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI +#define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI +#define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI +#define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI +#define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP +#define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP +#define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP +#define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP +#define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF +#define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF +#define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF +#define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF +#define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF +#define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF +#define MULTI_ARG_1_SF V4SF_FTYPE_V4SF +#define MULTI_ARG_1_DF V2DF_FTYPE_V2DF +#define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF +#define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF +#define MULTI_ARG_1_DI V2DI_FTYPE_V2DI +#define MULTI_ARG_1_SI V4SI_FTYPE_V4SI +#define MULTI_ARG_1_HI V8HI_FTYPE_V8HI +#define MULTI_ARG_1_QI V16QI_FTYPE_V16QI +#define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI +#define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI +#define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI +#define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI +#define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI +#define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI + +#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ + { mask, mask2, icode, name, code, comparison, flag }, 
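As an illustration (not part of the backported patch): BDESC, BDESC_FIRST and BDESC_END form an X-macro scheme — i386-builtin.def is included once while building enum ix86_builtins and again here to emit the bdesc_* description arrays, so an enumerator can serve as a direct index into its table, as the earlier comment about ascending order requires. The self-contained sketch below reduces the pattern to its essentials; the stand-in for demo-builtin.def and every DEMO_* identifier are invented.

/* Illustrative sketch only, not part of the patch: the X-macro pattern
   behind BDESC/BDESC_FIRST/BDESC_END, reduced to one file.  */
#include <stdio.h>

/* Stands in for a separate demo-builtin.def file that would normally be
   re-included under different macro definitions.  */
#define DEMO_BUILTIN_DEF \
  DEMO_BDESC (DEMO_BUILTIN_ADD, "add") \
  DEMO_BDESC (DEMO_BUILTIN_SUB, "sub")

/* First expansion: the enumeration.  */
#define DEMO_BDESC(code, name) code,
enum demo_builtins { DEMO_BUILTIN_DEF DEMO_BUILTIN_MAX };
#undef DEMO_BDESC

/* Second expansion: the description table, indexable by the enum.  */
struct demo_builtin_description { const char *name; };
#define DEMO_BDESC(code, name) { name },
static const struct demo_builtin_description demo_bdesc[] =
{
  DEMO_BUILTIN_DEF
};
#undef DEMO_BDESC

int
main (void)
{
  /* Direct array access, just as the bdesc_* ordering comment describes.  */
  printf ("%s\n", demo_bdesc[DEMO_BUILTIN_SUB].name);
  return 0;
}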
+#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ +static const struct builtin_description bdesc_##kind[] = \ +{ \ + BDESC (mask, mask2, icode, name, code, comparison, flag) +#define BDESC_END(kind, next_kind) \ +}; + +#include "i386-builtin.def" + +extern builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; + +tree ix86_builtin_vectorized_function (unsigned int fn, tree type_out, + tree type_in); +void ix86_init_builtins (void); +tree ix86_vectorize_builtin_gather (const_tree mem_vectype, + const_tree index_type, int scale); +tree ix86_builtin_decl (unsigned code, bool); +tree ix86_builtin_reciprocal (tree fndecl); +unsigned int get_builtin_code_for_version (tree decl, tree *predicate_list); +tree fold_builtin_cpu (tree fndecl, tree *args); +tree get_ix86_builtin (enum ix86_builtins c); + +#endif /* GCC_I386_BUILTINS_H */ diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 5e7e46fce..50cac3b1a 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -586,8 +586,9 @@ ix86_pragma_target_parse (tree args, tree pop_target) } else { - cur_tree = ix86_valid_target_attribute_tree (args, &global_options, - &global_options_set); + cur_tree = ix86_valid_target_attribute_tree (NULL_TREE, args, + &global_options, + &global_options_set, 0); if (!cur_tree || cur_tree == error_mark_node) { cl_target_option_restore (&global_options, diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c new file mode 100644 index 000000000..01f38b9ea --- /dev/null +++ b/gcc/config/i386/i386-expand.c @@ -0,0 +1,19842 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "cfgloop.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "cfgbuild.h" +#include "alias.h" +#include "fold-const.h" +#include "attribs.h" +#include "calls.h" +#include "stor-layout.h" +#include "varasm.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "explow.h" +#include "expr.h" +#include "cfgrtl.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "reload.h" +#include "gimplify.h" +#include "dwarf2.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "sched-int.h" +#include "opts.h" +#include "tree-pass.h" +#include "context.h" +#include "pass_manager.h" +#include "target-globals.h" +#include "gimple-iterator.h" +#include "tree-vectorizer.h" +#include "shrink-wrap.h" +#include "builtins.h" +#include "rtl-iter.h" +#include "tree-iterator.h" +#include "dbgcnt.h" +#include "case-cfn-macros.h" +#include "dojump.h" +#include "fold-const-call.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" +#include "selftest.h" +#include "selftest-rtl.h" +#include "print-rtl.h" +#include "intl.h" +#include "ifcvt.h" +#include "symbol-summary.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "wide-int-bitmask.h" +#include "tree-vector-builder.h" +#include "debug.h" +#include "dwarf2out.h" +#include "i386-options.h" +#include "i386-builtins.h" +#include "i386-expand.h" + +/* Split one or more double-mode RTL references into pairs of half-mode + references. The RTL can be REG, offsettable MEM, integer constant, or + CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to + split and "num" is its length. lo_half and hi_half are output arrays + that parallel "operands". */ + +void +split_double_mode (machine_mode mode, rtx operands[], + int num, rtx lo_half[], rtx hi_half[]) +{ + machine_mode half_mode; + unsigned int byte; + + switch (mode) + { + case E_TImode: + half_mode = DImode; + break; + case E_DImode: + half_mode = SImode; + break; + default: + gcc_unreachable (); + } + + byte = GET_MODE_SIZE (half_mode); + + while (num--) + { + rtx op = operands[num]; + + /* simplify_subreg refuse to split volatile memory addresses, + but we still have to handle it. */ + if (MEM_P (op)) + { + lo_half[num] = adjust_address (op, half_mode, 0); + hi_half[num] = adjust_address (op, half_mode, byte); + } + else + { + lo_half[num] = simplify_gen_subreg (half_mode, op, + GET_MODE (op) == VOIDmode + ? mode : GET_MODE (op), 0); + hi_half[num] = simplify_gen_subreg (half_mode, op, + GET_MODE (op) == VOIDmode + ? mode : GET_MODE (op), byte); + } + } +} + +/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate + for the target. */ + +void +ix86_expand_clear (rtx dest) +{ + rtx tmp; + + /* We play register width games, which are only valid after reload. */ + gcc_assert (reload_completed); + + /* Avoid HImode and its attendant prefix byte. 
*/ + if (GET_MODE_SIZE (GET_MODE (dest)) < 4) + dest = gen_rtx_REG (SImode, REGNO (dest)); + tmp = gen_rtx_SET (dest, const0_rtx); + + if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) + { + rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); + } + + emit_insn (tmp); +} + +void +ix86_expand_move (machine_mode mode, rtx operands[]) +{ + rtx op0, op1; + rtx tmp, addend = NULL_RTX; + enum tls_model model; + + op0 = operands[0]; + op1 = operands[1]; + + switch (GET_CODE (op1)) + { + case CONST: + tmp = XEXP (op1, 0); + + if (GET_CODE (tmp) != PLUS + || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) + break; + + op1 = XEXP (tmp, 0); + addend = XEXP (tmp, 1); + /* FALLTHRU */ + + case SYMBOL_REF: + model = SYMBOL_REF_TLS_MODEL (op1); + + if (model) + op1 = legitimize_tls_address (op1, model, true); + else if (ix86_force_load_from_GOT_p (op1)) + { + /* Load the external function address via GOT slot to avoid PLT. */ + op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), + (TARGET_64BIT + ? UNSPEC_GOTPCREL + : UNSPEC_GOT)); + op1 = gen_rtx_CONST (Pmode, op1); + op1 = gen_const_mem (Pmode, op1); + set_mem_alias_set (op1, ix86_GOT_alias_set ()); + } + else + { + tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); + if (tmp) + { + op1 = tmp; + if (!addend) + break; + } + else + { + op1 = operands[1]; + break; + } + } + + if (addend) + { + op1 = force_operand (op1, NULL_RTX); + op1 = expand_simple_binop (Pmode, PLUS, op1, addend, + op0, 1, OPTAB_DIRECT); + } + else + op1 = force_operand (op1, op0); + + if (op1 == op0) + return; + + op1 = convert_to_mode (mode, op1, 1); + + default: + break; + } + + if ((flag_pic || MACHOPIC_INDIRECT) + && symbolic_operand (op1, mode)) + { + if (TARGET_MACHO && !TARGET_64BIT) + { +#if TARGET_MACHO + /* dynamic-no-pic */ + if (MACHOPIC_INDIRECT) + { + rtx temp = (op0 && REG_P (op0) && mode == Pmode) + ? op0 : gen_reg_rtx (Pmode); + op1 = machopic_indirect_data_reference (op1, temp); + if (MACHOPIC_PURE) + op1 = machopic_legitimize_pic_address (op1, mode, + temp == op1 ? 0 : temp); + } + if (op0 != op1 && GET_CODE (op0) != MEM) + { + rtx insn = gen_rtx_SET (op0, op1); + emit_insn (insn); + return; + } + if (GET_CODE (op0) == MEM) + op1 = force_reg (Pmode, op1); + else + { + rtx temp = op0; + if (GET_CODE (temp) != REG) + temp = gen_reg_rtx (Pmode); + temp = legitimize_pic_address (op1, temp); + if (temp == op0) + return; + op1 = temp; + } + /* dynamic-no-pic */ +#endif + } + else + { + if (MEM_P (op0)) + op1 = force_reg (mode, op1); + else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) + { + rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; + op1 = legitimize_pic_address (op1, reg); + if (op0 == op1) + return; + op1 = convert_to_mode (mode, op1, 1); + } + } + } + else + { + if (MEM_P (op0) + && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) + || !push_operand (op0, mode)) + && MEM_P (op1)) + op1 = force_reg (mode, op1); + + if (push_operand (op0, mode) + && ! general_no_elim_operand (op1, mode)) + op1 = copy_to_mode_reg (mode, op1); + + /* Force large constants in 64bit compilation into register + to get them CSEed. 
*/ + if (can_create_pseudo_p () + && (mode == DImode) && TARGET_64BIT + && immediate_operand (op1, mode) + && !x86_64_zext_immediate_operand (op1, VOIDmode) + && !register_operand (op0, mode) + && optimize) + op1 = copy_to_mode_reg (mode, op1); + + if (can_create_pseudo_p () + && CONST_DOUBLE_P (op1)) + { + /* If we are loading a floating point constant to a register, + force the value to memory now, since we'll get better code + out the back end. */ + + op1 = validize_mem (force_const_mem (mode, op1)); + if (!register_operand (op0, mode)) + { + rtx temp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (temp, op1)); + emit_move_insn (op0, temp); + return; + } + } + } + + emit_insn (gen_rtx_SET (op0, op1)); +} + +void +ix86_expand_vector_move (machine_mode mode, rtx operands[]) +{ + rtx op0 = operands[0], op1 = operands[1]; + /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU + psABI since the biggest alignment is 4 byte for IA MCU psABI. */ + unsigned int align = (TARGET_IAMCU + ? GET_MODE_BITSIZE (mode) + : GET_MODE_ALIGNMENT (mode)); + + if (push_operand (op0, VOIDmode)) + op0 = emit_move_resolve_push (mode, op0); + + /* Force constants other than zero into memory. We do not know how + the instructions used to build constants modify the upper 64 bits + of the register, once we have that information we may be able + to handle some of them more efficiently. */ + if (can_create_pseudo_p () + && (CONSTANT_P (op1) + || (SUBREG_P (op1) + && CONSTANT_P (SUBREG_REG (op1)))) + && ((register_operand (op0, mode) + && !standard_sse_constant_p (op1, mode)) + /* ix86_expand_vector_move_misalign() does not like constants. */ + || (SSE_REG_MODE_P (mode) + && MEM_P (op0) + && MEM_ALIGN (op0) < align))) + { + if (SUBREG_P (op1)) + { + machine_mode imode = GET_MODE (SUBREG_REG (op1)); + rtx r = force_const_mem (imode, SUBREG_REG (op1)); + if (r) + r = validize_mem (r); + else + r = force_reg (imode, SUBREG_REG (op1)); + op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); + } + else + op1 = validize_mem (force_const_mem (mode, op1)); + } + + /* We need to check memory alignment for SSE mode since attribute + can make operands unaligned. */ + if (can_create_pseudo_p () + && SSE_REG_MODE_P (mode) + && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) + || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) + { + rtx tmp[2]; + + /* ix86_expand_vector_move_misalign() does not like both + arguments in memory. */ + if (!register_operand (op0, mode) + && !register_operand (op1, mode)) + op1 = force_reg (mode, op1); + + tmp[0] = op0; tmp[1] = op1; + ix86_expand_vector_move_misalign (mode, tmp); + return; + } + + /* Make operand1 a register if it isn't already. */ + if (can_create_pseudo_p () + && !register_operand (op0, mode) + && !register_operand (op1, mode)) + { + emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); + return; + } + + emit_insn (gen_rtx_SET (op0, op1)); +} + +/* Split 32-byte AVX unaligned load and store if needed. 
*/ + +static void +ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) +{ + rtx m; + rtx (*extract) (rtx, rtx, rtx); + machine_mode mode; + + if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) + || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) + { + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + rtx orig_op0 = NULL_RTX; + mode = GET_MODE (op0); + switch (GET_MODE_CLASS (mode)) + { + case MODE_VECTOR_INT: + case MODE_INT: + if (mode != V32QImode) + { + if (!MEM_P (op0)) + { + orig_op0 = op0; + op0 = gen_reg_rtx (V32QImode); + } + else + op0 = gen_lowpart (V32QImode, op0); + op1 = gen_lowpart (V32QImode, op1); + mode = V32QImode; + } + break; + case MODE_VECTOR_FLOAT: + break; + default: + gcc_unreachable (); + } + + switch (mode) + { + default: + gcc_unreachable (); + case E_V32QImode: + extract = gen_avx_vextractf128v32qi; + mode = V16QImode; + break; + case E_V8SFmode: + extract = gen_avx_vextractf128v8sf; + mode = V4SFmode; + break; + case E_V4DFmode: + extract = gen_avx_vextractf128v4df; + mode = V2DFmode; + break; + } + + if (MEM_P (op1)) + { + rtx r = gen_reg_rtx (mode); + m = adjust_address (op1, mode, 0); + emit_move_insn (r, m); + m = adjust_address (op1, mode, 16); + r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); + emit_move_insn (op0, r); + } + else if (MEM_P (op0)) + { + m = adjust_address (op0, mode, 0); + emit_insn (extract (m, op1, const0_rtx)); + m = adjust_address (op0, mode, 16); + emit_insn (extract (m, copy_rtx (op1), const1_rtx)); + } + else + gcc_unreachable (); + + if (orig_op0) + emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); +} + +/* Implement the movmisalign patterns for SSE. Non-SSE modes go + straight to ix86_expand_vector_move. */ +/* Code generation for scalar reg-reg moves of single and double precision data: + if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) + movaps reg, reg + else + movss reg, reg + if (x86_sse_partial_reg_dependency == true) + movapd reg, reg + else + movsd reg, reg + + Code generation for scalar loads of double precision data: + if (x86_sse_split_regs == true) + movlpd mem, reg (gas syntax) + else + movsd mem, reg + + Code generation for unaligned packed loads of single precision data + (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): + if (x86_sse_unaligned_move_optimal) + movups mem, reg + + if (x86_sse_partial_reg_dependency == true) + { + xorps reg, reg + movlps mem, reg + movhps mem+8, reg + } + else + { + movlps mem, reg + movhps mem+8, reg + } + + Code generation for unaligned packed loads of double precision data + (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): + if (x86_sse_unaligned_move_optimal) + movupd mem, reg + + if (x86_sse_split_regs == true) + { + movlpd mem, reg + movhpd mem+8, reg + } + else + { + movsd mem, reg + movhpd mem+8, reg + } + */ + +void +ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) +{ + rtx op0, op1, m; + + op0 = operands[0]; + op1 = operands[1]; + + /* Use unaligned load/store for AVX512 or when optimizing for size. */ + if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) + { + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + if (TARGET_AVX) + { + if (GET_MODE_SIZE (mode) == 32) + ix86_avx256_split_vector_move_misalign (op0, op1); + else + /* Always use 128-bit mov_internal pattern for AVX. 
*/ + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + /* ??? If we have typed data, then it would appear that using + movdqu is the only way to get unaligned data loaded with + integer type. */ + if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + if (MEM_P (op1)) + { + if (TARGET_SSE2 && mode == V2DFmode) + { + rtx zero; + + /* When SSE registers are split into halves, we can avoid + writing to the top half twice. */ + if (TARGET_SSE_SPLIT_REGS) + { + emit_clobber (op0); + zero = op0; + } + else + { + /* ??? Not sure about the best option for the Intel chips. + The following would seem to satisfy; the register is + entirely cleared, breaking the dependency chain. We + then store to the upper half, with a dependency depth + of one. A rumor has it that Intel recommends two movsd + followed by an unpacklpd, but this is unconfirmed. And + given that the dependency depth of the unpacklpd would + still be one, I'm not sure why this would be better. */ + zero = CONST0_RTX (V2DFmode); + } + + m = adjust_address (op1, DFmode, 0); + emit_insn (gen_sse2_loadlpd (op0, zero, m)); + m = adjust_address (op1, DFmode, 8); + emit_insn (gen_sse2_loadhpd (op0, op0, m)); + } + else + { + rtx t; + + if (mode != V4SFmode) + t = gen_reg_rtx (V4SFmode); + else + t = op0; + + if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) + emit_move_insn (t, CONST0_RTX (V4SFmode)); + else + emit_clobber (t); + + m = adjust_address (op1, V2SFmode, 0); + emit_insn (gen_sse_loadlps (t, t, m)); + m = adjust_address (op1, V2SFmode, 8); + emit_insn (gen_sse_loadhps (t, t, m)); + if (mode != V4SFmode) + emit_move_insn (op0, gen_lowpart (mode, t)); + } + } + else if (MEM_P (op0)) + { + if (TARGET_SSE2 && mode == V2DFmode) + { + m = adjust_address (op0, DFmode, 0); + emit_insn (gen_sse2_storelpd (m, op1)); + m = adjust_address (op0, DFmode, 8); + emit_insn (gen_sse2_storehpd (m, op1)); + } + else + { + if (mode != V4SFmode) + op1 = gen_lowpart (V4SFmode, op1); + + m = adjust_address (op0, V2SFmode, 0); + emit_insn (gen_sse_storelps (m, op1)); + m = adjust_address (op0, V2SFmode, 8); + emit_insn (gen_sse_storehps (m, copy_rtx (op1))); + } + } + else + gcc_unreachable (); +} + +/* Helper function of ix86_fixup_binary_operands to canonicalize + operand order. Returns true if the operands should be swapped. */ + +static bool +ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* If the operation is not commutative, we can't do anything. */ + if (GET_RTX_CLASS (code) != RTX_COMM_ARITH + && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) + return false; + + /* Highest priority is that src1 should match dst. */ + if (rtx_equal_p (dst, src1)) + return false; + if (rtx_equal_p (dst, src2)) + return true; + + /* Next highest priority is that immediate constants come second. */ + if (immediate_operand (src2, mode)) + return false; + if (immediate_operand (src1, mode)) + return true; + + /* Lowest priority is that memory references should come second. */ + if (MEM_P (src2)) + return false; + if (MEM_P (src1)) + return true; + + return false; +} + + +/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the + destination to use for the operation. 
If different from the true + destination in operands[0], a copy operation will be required. */ + +rtx +ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* Canonicalize operand order. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) + { + /* It is invalid to swap operands of different modes. */ + gcc_assert (GET_MODE (src1) == GET_MODE (src2)); + + std::swap (src1, src2); + } + + /* Both source operands cannot be in memory. */ + if (MEM_P (src1) && MEM_P (src2)) + { + /* Optimization: Only read from memory once. */ + if (rtx_equal_p (src1, src2)) + { + src2 = force_reg (mode, src2); + src1 = src2; + } + else if (rtx_equal_p (dst, src1)) + src2 = force_reg (mode, src2); + else + src1 = force_reg (mode, src1); + } + + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + dst = gen_reg_rtx (mode); + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) + src1 = force_reg (mode, src1); + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) + src1 = force_reg (mode, src1); + + /* Improve address combine. */ + if (code == PLUS + && GET_MODE_CLASS (mode) == MODE_INT + && MEM_P (src2)) + src2 = force_reg (mode, src2); + + operands[1] = src1; + operands[2] = src2; + return dst; +} + +/* Similarly, but assume that the destination has already been + set up properly. */ + +void +ix86_fixup_binary_operands_no_copy (enum rtx_code code, + machine_mode mode, rtx operands[]) +{ + rtx dst = ix86_fixup_binary_operands (code, mode, operands); + gcc_assert (dst == operands[0]); +} + +/* Attempt to expand a binary operator. Make the expansion closer to the + actual machine, then just general_operand, which will allow 3 separate + memory references (one output, two input) in a single insn. */ + +void +ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx src1, src2, dst, op, clob; + + dst = ix86_fixup_binary_operands (code, mode, operands); + src1 = operands[1]; + src2 = operands[2]; + + /* Emit the instruction. */ + + op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); + + if (reload_completed + && code == PLUS + && !rtx_equal_p (dst, src1)) + { + /* This is going to be an LEA; avoid splitting it later. */ + emit_insn (op); + } + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + } + + /* Fix up the destination if needed. */ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); +} + +/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with + the given OPERANDS. */ + +void +ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx op1 = NULL_RTX, op2 = NULL_RTX; + if (SUBREG_P (operands[1])) + { + op1 = operands[1]; + op2 = operands[2]; + } + else if (SUBREG_P (operands[2])) + { + op1 = operands[2]; + op2 = operands[1]; + } + /* Optimize (__m128i) d | (__m128i) e and similar code + when d and e are float vectors into float vector logical + insn. In C/C++ without using intrinsics there is no other way + to express vector logical operation on float vectors than + to cast them temporarily to integer vectors. 
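As an illustration (not part of the backported patch): the comment above refers to source like the sketch below, written with GCC's generic vector extensions — a bitwise operation on float vectors has to go through a cast to an integer vector type, and ix86_expand_vector_logical_operator peels that cast off again so the operation can stay in the float domain. The type and function names are invented.

/* Illustrative sketch only, not part of the patch.  The integer-vector cast
   is the pattern the expander above strips back off.  */
typedef float v4sf __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));

v4sf
vec_and (v4sf a, v4sf b)
{
  return (v4sf) ((v4si) a & (v4si) b);
}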
*/ + if (op1 + && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL + && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) + && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT + && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) + && SUBREG_BYTE (op1) == 0 + && (GET_CODE (op2) == CONST_VECTOR + || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) + && SUBREG_BYTE (op2) == 0)) + && can_create_pseudo_p ()) + { + rtx dst; + switch (GET_MODE (SUBREG_REG (op1))) + { + case E_V4SFmode: + case E_V8SFmode: + case E_V16SFmode: + case E_V2DFmode: + case E_V4DFmode: + case E_V8DFmode: + dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); + if (GET_CODE (op2) == CONST_VECTOR) + { + op2 = gen_lowpart (GET_MODE (dst), op2); + op2 = force_reg (GET_MODE (dst), op2); + } + else + { + op1 = operands[1]; + op2 = SUBREG_REG (operands[2]); + if (!vector_operand (op2, GET_MODE (dst))) + op2 = force_reg (GET_MODE (dst), op2); + } + op1 = SUBREG_REG (op1); + if (!vector_operand (op1, GET_MODE (dst))) + op1 = force_reg (GET_MODE (dst), op1); + emit_insn (gen_rtx_SET (dst, + gen_rtx_fmt_ee (code, GET_MODE (dst), + op1, op2))); + emit_move_insn (operands[0], gen_lowpart (mode, dst)); + return; + default: + break; + } + } + if (!vector_operand (operands[1], mode)) + operands[1] = force_reg (mode, operands[1]); + if (!vector_operand (operands[2], mode)) + operands[2] = force_reg (mode, operands[2]); + ix86_fixup_binary_operands_no_copy (code, mode, operands); + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_fmt_ee (code, mode, operands[1], + operands[2]))); +} + +/* Return TRUE or FALSE depending on whether the binary operator meets the + appropriate constraints. */ + +bool +ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, + rtx operands[3]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* Both source operands cannot be in memory. */ + if (MEM_P (src1) && MEM_P (src2)) + return false; + + /* Canonicalize operand order for commutative operators. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) + std::swap (src1, src2); + + /* If the destination is memory, we must have a matching source operand. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + return false; + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) + return false; + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) + /* Support "andhi/andsi/anddi" as a zero-extending move. */ + return (code == AND + && (mode == HImode + || mode == SImode + || (TARGET_64BIT && mode == DImode)) + && satisfies_constraint_L (src2)); + + return true; +} + +/* Attempt to expand a unary operator. Make the expansion closer to the + actual machine, then just general_operand, which will allow 2 separate + memory references (one output, one input) in a single insn. */ + +void +ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + bool matching_memory = false; + rtx src, dst, op, clob; + + dst = operands[0]; + src = operands[1]; + + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + if (MEM_P (dst)) + { + if (rtx_equal_p (dst, src)) + matching_memory = true; + else + dst = gen_reg_rtx (mode); + } + + /* When source operand is memory, destination must match. */ + if (MEM_P (src) && !matching_memory) + src = force_reg (mode, src); + + /* Emit the instruction. 
*/ + + op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src)); + + if (code == NOT) + emit_insn (op); + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + } + + /* Fix up the destination if needed. */ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); +} + +/* Predict just emitted jump instruction to be taken with probability PROB. */ + +static void +predict_jump (int prob) +{ + rtx_insn *insn = get_last_insn (); + gcc_assert (JUMP_P (insn)); + add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob)); +} + +/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and + divisor are within the range [0-255]. */ + +void +ix86_split_idivmod (machine_mode mode, rtx operands[], + bool signed_p) +{ + rtx_code_label *end_label, *qimode_label; + rtx div, mod; + rtx_insn *insn; + rtx scratch, tmp0, tmp1, tmp2; + rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); + rtx (*gen_zero_extend) (rtx, rtx); + rtx (*gen_test_ccno_1) (rtx, rtx); + + switch (mode) + { + case E_SImode: + if (GET_MODE (operands[0]) == SImode) + { + if (GET_MODE (operands[1]) == SImode) + gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1; + else + gen_divmod4_1 + = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2; + gen_zero_extend = gen_zero_extendqisi2; + } + else + { + gen_divmod4_1 + = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1; + gen_zero_extend = gen_zero_extendqidi2; + } + gen_test_ccno_1 = gen_testsi_ccno_1; + break; + case E_DImode: + gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1; + gen_test_ccno_1 = gen_testdi_ccno_1; + gen_zero_extend = gen_zero_extendqidi2; + break; + default: + gcc_unreachable (); + } + + end_label = gen_label_rtx (); + qimode_label = gen_label_rtx (); + + scratch = gen_reg_rtx (mode); + + /* Use 8bit unsigned divimod if dividend and divisor are within + the range [0-255]. */ + emit_move_insn (scratch, operands[2]); + scratch = expand_simple_binop (mode, IOR, scratch, operands[3], + scratch, 1, OPTAB_DIRECT); + emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100))); + tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); + tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, + gen_rtx_LABEL_REF (VOIDmode, qimode_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = qimode_label; + + /* Generate original signed/unsigned divimod. */ + div = gen_divmod4_1 (operands[0], operands[1], + operands[2], operands[3]); + emit_insn (div); + + /* Branch to the end. */ + emit_jump_insn (gen_jump (end_label)); + emit_barrier (); + + /* Generate 8bit unsigned divide. */ + emit_label (qimode_label); + /* Don't use operands[0] for result of 8bit divide since not all + registers support QImode ZERO_EXTRACT. 
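   After the HImode DIV below, the scratch register holds the quotient in its
   low byte (AL) and the remainder in its high byte (AH), so the two results
   are recovered roughly as

     remainder = (scratch >> 8) & 0xff;      picked out via ZERO_EXTRACT
     quotient  =  scratch       & 0xff;      zero-extended from QImode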
*/ + tmp0 = lowpart_subreg (HImode, scratch, mode); + tmp1 = lowpart_subreg (HImode, operands[2], mode); + tmp2 = lowpart_subreg (QImode, operands[3], mode); + emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); + + if (signed_p) + { + div = gen_rtx_DIV (mode, operands[2], operands[3]); + mod = gen_rtx_MOD (mode, operands[2], operands[3]); + } + else + { + div = gen_rtx_UDIV (mode, operands[2], operands[3]); + mod = gen_rtx_UMOD (mode, operands[2], operands[3]); + } + if (mode == SImode) + { + if (GET_MODE (operands[0]) != SImode) + div = gen_rtx_ZERO_EXTEND (DImode, div); + if (GET_MODE (operands[1]) != SImode) + mod = gen_rtx_ZERO_EXTEND (DImode, mod); + } + + /* Extract remainder from AH. */ + tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), + tmp0, GEN_INT (8), GEN_INT (8)); + if (REG_P (operands[1])) + insn = emit_move_insn (operands[1], tmp1); + else + { + /* Need a new scratch register since the old one has result + of 8bit divide. */ + scratch = gen_reg_rtx (GET_MODE (operands[1])); + emit_move_insn (scratch, tmp1); + insn = emit_move_insn (operands[1], scratch); + } + set_unique_reg_note (insn, REG_EQUAL, mod); + + /* Zero extend quotient from AL. */ + tmp1 = gen_lowpart (QImode, tmp0); + insn = emit_insn (gen_zero_extend (operands[0], tmp1)); + set_unique_reg_note (insn, REG_EQUAL, div); + + emit_label (end_label); +} + +/* Emit x86 binary operand CODE in mode MODE, where the first operand + matches destination. RTX includes clobber of FLAGS_REG. */ + +void +ix86_emit_binop (enum rtx_code code, machine_mode mode, + rtx dst, rtx src) +{ + rtx op, clob; + + op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); +} + +/* Return true if regno1 def is nearest to the insn. */ + +static bool +find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) +{ + rtx_insn *prev = insn; + rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn)); + + if (insn == start) + return false; + while (prev && prev != start) + { + if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev)) + { + prev = PREV_INSN (prev); + continue; + } + if (insn_defines_reg (regno1, INVALID_REGNUM, prev)) + return true; + else if (insn_defines_reg (regno2, INVALID_REGNUM, prev)) + return false; + prev = PREV_INSN (prev); + } + + /* None of the regs is defined in the bb. */ + return false; +} + +/* Split lea instructions into a sequence of instructions + which are executed on ALU to avoid AGU stalls. + It is assumed that it is allowed to clobber flags register + at lea position. */ + +void +ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) +{ + unsigned int regno0, regno1, regno2; + struct ix86_address parts; + rtx target, tmp; + int ok, adds; + + ok = ix86_decompose_address (operands[1], &parts); + gcc_assert (ok); + + target = gen_lowpart (mode, operands[0]); + + regno0 = true_regnum (target); + regno1 = INVALID_REGNUM; + regno2 = INVALID_REGNUM; + + if (parts.base) + { + parts.base = gen_lowpart (mode, parts.base); + regno1 = true_regnum (parts.base); + } + + if (parts.index) + { + parts.index = gen_lowpart (mode, parts.index); + regno2 = true_regnum (parts.index); + } + + if (parts.disp) + parts.disp = gen_lowpart (mode, parts.disp); + + if (parts.scale > 1) + { + /* Case r1 = r1 + ... */ + if (regno1 == regno0) + { + /* If we have a case r1 = r1 + C * r2 then we + should use multiplication which is very + expensive. 
Assume cost model is wrong if we + have such case here. */ + gcc_assert (regno2 != regno0); + + for (adds = parts.scale; adds > 0; adds--) + ix86_emit_binop (PLUS, mode, target, parts.index); + } + else + { + /* r1 = r2 + r3 * C case. Need to move r3 into r1. */ + if (regno0 != regno2) + emit_insn (gen_rtx_SET (target, parts.index)); + + /* Use shift for scaling. */ + ix86_emit_binop (ASHIFT, mode, target, + GEN_INT (exact_log2 (parts.scale))); + + if (parts.base) + ix86_emit_binop (PLUS, mode, target, parts.base); + + if (parts.disp && parts.disp != const0_rtx) + ix86_emit_binop (PLUS, mode, target, parts.disp); + } + } + else if (!parts.base && !parts.index) + { + gcc_assert(parts.disp); + emit_insn (gen_rtx_SET (target, parts.disp)); + } + else + { + if (!parts.base) + { + if (regno0 != regno2) + emit_insn (gen_rtx_SET (target, parts.index)); + } + else if (!parts.index) + { + if (regno0 != regno1) + emit_insn (gen_rtx_SET (target, parts.base)); + } + else + { + if (regno0 == regno1) + tmp = parts.index; + else if (regno0 == regno2) + tmp = parts.base; + else + { + rtx tmp1; + + /* Find better operand for SET instruction, depending + on which definition is farther from the insn. */ + if (find_nearest_reg_def (insn, regno1, regno2)) + tmp = parts.index, tmp1 = parts.base; + else + tmp = parts.base, tmp1 = parts.index; + + emit_insn (gen_rtx_SET (target, tmp)); + + if (parts.disp && parts.disp != const0_rtx) + ix86_emit_binop (PLUS, mode, target, parts.disp); + + ix86_emit_binop (PLUS, mode, target, tmp1); + return; + } + + ix86_emit_binop (PLUS, mode, target, tmp); + } + + if (parts.disp && parts.disp != const0_rtx) + ix86_emit_binop (PLUS, mode, target, parts.disp); + } +} + +/* Post-reload splitter for converting an SF or DFmode value in an + SSE register into an unsigned SImode. */ + +void +ix86_split_convert_uns_si_sse (rtx operands[]) +{ + machine_mode vecmode; + rtx value, large, zero_or_two31, input, two31, x; + + large = operands[1]; + zero_or_two31 = operands[2]; + input = operands[3]; + two31 = operands[4]; + vecmode = GET_MODE (large); + value = gen_rtx_REG (vecmode, REGNO (operands[0])); + + /* Load up the value into the low element. We must ensure that the other + elements are valid floats -- zero is the easiest such value. */ + if (MEM_P (input)) + { + if (vecmode == V4SFmode) + emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); + else + emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); + } + else + { + input = gen_rtx_REG (vecmode, REGNO (input)); + emit_move_insn (value, CONST0_RTX (vecmode)); + if (vecmode == V4SFmode) + emit_insn (gen_sse_movss (value, value, input)); + else + emit_insn (gen_sse2_movsd (value, value, input)); + } + + emit_move_insn (large, two31); + emit_move_insn (zero_or_two31, MEM_P (two31) ? 
large : two31); + + x = gen_rtx_fmt_ee (LE, vecmode, large, value); + emit_insn (gen_rtx_SET (large, x)); + + x = gen_rtx_AND (vecmode, zero_or_two31, large); + emit_insn (gen_rtx_SET (zero_or_two31, x)); + + x = gen_rtx_MINUS (vecmode, value, zero_or_two31); + emit_insn (gen_rtx_SET (value, x)); + + large = gen_rtx_REG (V4SImode, REGNO (large)); + emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); + + x = gen_rtx_REG (V4SImode, REGNO (value)); + if (vecmode == V4SFmode) + emit_insn (gen_fix_truncv4sfv4si2 (x, value)); + else + emit_insn (gen_sse2_cvttpd2dq (x, value)); + value = x; + + emit_insn (gen_xorv4si3 (value, value, large)); +} + +static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, + machine_mode mode, rtx target, + rtx var, int one_var); + +/* Convert an unsigned DImode value into a DFmode, using only SSE. + Expects the 64-bit DImode to be supplied in a pair of integral + registers. Requires SSE2; will use SSE3 if available. For x86_32, + -mfpmath=sse, !optimize_size only. */ + +void +ix86_expand_convert_uns_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; + rtx int_xmm, fp_xmm; + rtx biases, exponents; + rtx x; + + int_xmm = gen_reg_rtx (V4SImode); + if (TARGET_INTER_UNIT_MOVES_TO_VEC) + emit_insn (gen_movdi_to_sse (int_xmm, input)); + else if (TARGET_SSE_SPLIT_REGS) + { + emit_clobber (int_xmm); + emit_move_insn (gen_lowpart (DImode, int_xmm), input); + } + else + { + x = gen_reg_rtx (V2DImode); + ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); + emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); + } + + x = gen_rtx_CONST_VECTOR (V4SImode, + gen_rtvec (4, GEN_INT (0x43300000UL), + GEN_INT (0x45300000UL), + const0_rtx, const0_rtx)); + exponents = validize_mem (force_const_mem (V4SImode, x)); + + /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ + emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); + + /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) + yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). + Similarly (0x45300000UL ## fp_value_hi_xmm) yields + (0x1.0p84 + double(fp_value_hi_xmm)). + Note these exponents differ by 32. */ + + fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); + + /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values + in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ + real_ldexp (&bias_lo_rvt, &dconst1, 52); + real_ldexp (&bias_hi_rvt, &dconst1, 84); + biases = const_double_from_real_value (bias_lo_rvt, DFmode); + x = const_double_from_real_value (bias_hi_rvt, DFmode); + biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); + biases = validize_mem (force_const_mem (V2DFmode, biases)); + emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); + + /* Add the upper and lower DFmode values together. */ + if (TARGET_SSE3) + emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); + else + { + x = copy_to_mode_reg (V2DFmode, fp_xmm); + emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); + emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); + } + + ix86_expand_vector_extract (false, target, fp_xmm, 0); +} + +/* Not used, but eases macroization of patterns. */ +void +ix86_expand_convert_uns_sixf_sse (rtx, rtx) +{ + gcc_unreachable (); +} + +/* Convert an unsigned SImode value into a DFmode. Only currently used + for SSE, but applicable anywhere. 
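   The expansion below uses the usual biasing trick; roughly the scalar
   equivalent (exact, since DFmode has a 53-bit mantissa) is

     (double) (int) (u + 0x80000000u) + 0x1p31

   i.e. shift the unsigned value into the signed range, use the ordinary
   signed SImode-to-DFmode conversion, then add the bias back in double
   precision.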
*/ + +void +ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO31r; + rtx x, fp; + + x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), + NULL, 1, OPTAB_DIRECT); + + fp = gen_reg_rtx (DFmode); + emit_insn (gen_floatsidf2 (fp, x)); + + real_ldexp (&TWO31r, &dconst1, 31); + x = const_double_from_real_value (TWO31r, DFmode); + + x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert a signed DImode value into a DFmode. Only used for SSE in + 32-bit mode; otherwise we have a direct convert instruction. */ + +void +ix86_expand_convert_sign_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO32r; + rtx fp_lo, fp_hi, x; + + fp_lo = gen_reg_rtx (DFmode); + fp_hi = gen_reg_rtx (DFmode); + + emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, DFmode); + fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); + + ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); + + x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert an unsigned SImode value into a SFmode, using only SSE. + For x86_32, -mfpmath=sse, !optimize_size only. */ +void +ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE ONE16r; + rtx fp_hi, fp_lo, int_hi, int_lo, x; + + real_ldexp (&ONE16r, &dconst1, 16); + x = const_double_from_real_value (ONE16r, SFmode); + int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), + NULL, 0, OPTAB_DIRECT); + int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), + NULL, 0, OPTAB_DIRECT); + fp_hi = gen_reg_rtx (SFmode); + fp_lo = gen_reg_rtx (SFmode); + emit_insn (gen_floatsisf2 (fp_hi, int_hi)); + emit_insn (gen_floatsisf2 (fp_lo, int_lo)); + fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, + 0, OPTAB_DIRECT); + fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (target, fp_hi)) + emit_move_insn (target, fp_hi); +} + +/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert + a vector of unsigned ints VAL to vector of floats TARGET. 
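   Roughly the per-element computation performed below is

     (float) (u >> 16) * 0x1p16f + (float) (u & 0xffff)

   Both halves convert exactly, so only the final addition can round.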
*/ + +void +ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) +{ + rtx tmp[8]; + REAL_VALUE_TYPE TWO16r; + machine_mode intmode = GET_MODE (val); + machine_mode fltmode = GET_MODE (target); + rtx (*cvt) (rtx, rtx); + + if (intmode == V4SImode) + cvt = gen_floatv4siv4sf2; + else + cvt = gen_floatv8siv8sf2; + tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff)); + tmp[0] = force_reg (intmode, tmp[0]); + tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1, + OPTAB_DIRECT); + tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16), + NULL_RTX, 1, OPTAB_DIRECT); + tmp[3] = gen_reg_rtx (fltmode); + emit_insn (cvt (tmp[3], tmp[1])); + tmp[4] = gen_reg_rtx (fltmode); + emit_insn (cvt (tmp[4], tmp[2])); + real_ldexp (&TWO16r, &dconst1, 16); + tmp[5] = const_double_from_real_value (TWO16r, SFmode); + tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); + tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, + OPTAB_DIRECT); + tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, + OPTAB_DIRECT); + if (tmp[7] != target) + emit_move_insn (target, tmp[7]); +} + +/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* + pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*. + This is done by doing just signed conversion if < 0x1p31, and otherwise by + subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */ + +rtx +ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp) +{ + REAL_VALUE_TYPE TWO31r; + rtx two31r, tmp[4]; + machine_mode mode = GET_MODE (val); + machine_mode scalarmode = GET_MODE_INNER (mode); + machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode; + rtx (*cmp) (rtx, rtx, rtx, rtx); + int i; + + for (i = 0; i < 3; i++) + tmp[i] = gen_reg_rtx (mode); + real_ldexp (&TWO31r, &dconst1, 31); + two31r = const_double_from_real_value (TWO31r, scalarmode); + two31r = ix86_build_const_vector (mode, 1, two31r); + two31r = force_reg (mode, two31r); + switch (mode) + { + case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break; + case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break; + case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break; + case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break; + default: gcc_unreachable (); + } + tmp[3] = gen_rtx_LE (mode, two31r, val); + emit_insn (cmp (tmp[0], two31r, val, tmp[3])); + tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1], + 0, OPTAB_DIRECT); + if (intmode == V4SImode || TARGET_AVX2) + *xorp = expand_simple_binop (intmode, ASHIFT, + gen_lowpart (intmode, tmp[0]), + GEN_INT (31), NULL_RTX, 0, + OPTAB_DIRECT); + else + { + rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31); + two31 = ix86_build_const_vector (intmode, 1, two31); + *xorp = expand_simple_binop (intmode, AND, + gen_lowpart (intmode, tmp[0]), + two31, NULL_RTX, 0, + OPTAB_DIRECT); + } + return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2], + 0, OPTAB_DIRECT); +} + +/* Generate code for floating point ABS or NEG. 
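   With SSE both operations come down to masking the sign bit, schematically

     abs (x)  =>  x & ~SIGN_MASK        andps / andpd
     neg (x)  =>  x ^  SIGN_MASK        xorps / xorpd

   where SIGN_MASK stands for the constant built by ix86_build_signbit_mask.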
*/ + +void +ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx mask, set, dst, src; + bool use_sse = false; + bool vector_mode = VECTOR_MODE_P (mode); + machine_mode vmode = mode; + + if (vector_mode) + use_sse = true; + else if (mode == TFmode) + use_sse = true; + else if (TARGET_SSE_MATH) + { + use_sse = SSE_FLOAT_MODE_P (mode); + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + } + + /* NEG and ABS performed with SSE use bitwise mask operations. + Create the appropriate mask now. */ + if (use_sse) + mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); + else + mask = NULL_RTX; + + dst = operands[0]; + src = operands[1]; + + set = gen_rtx_fmt_e (code, mode, src); + set = gen_rtx_SET (dst, set); + + if (mask) + { + rtx use, clob; + rtvec par; + + use = gen_rtx_USE (VOIDmode, mask); + if (vector_mode) + par = gen_rtvec (2, set, use); + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + par = gen_rtvec (3, set, use, clob); + } + emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); + } + else + emit_insn (set); +} + +/* Expand a copysign operation. Special case operand 0 being a constant. */ + +void +ix86_expand_copysign (rtx operands[]) +{ + machine_mode mode, vmode; + rtx dest, op0, op1, mask, nmask; + + dest = operands[0]; + op0 = operands[1]; + op1 = operands[2]; + + mode = GET_MODE (dest); + + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + + if (CONST_DOUBLE_P (op0)) + { + rtx (*copysign_insn)(rtx, rtx, rtx, rtx); + + if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) + op0 = simplify_unary_operation (ABS, mode, op0, mode); + + if (mode == SFmode || mode == DFmode) + { + if (op0 == CONST0_RTX (mode)) + op0 = CONST0_RTX (vmode); + else + { + rtx v = ix86_build_const_vector (vmode, false, op0); + + op0 = force_reg (vmode, v); + } + } + else if (op0 != CONST0_RTX (mode)) + op0 = force_reg (mode, op0); + + mask = ix86_build_signbit_mask (vmode, 0, 0); + + if (mode == SFmode) + copysign_insn = gen_copysignsf3_const; + else if (mode == DFmode) + copysign_insn = gen_copysigndf3_const; + else + copysign_insn = gen_copysigntf3_const; + + emit_insn (copysign_insn (dest, op0, op1, mask)); + } + else + { + rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx); + + nmask = ix86_build_signbit_mask (vmode, 0, 1); + mask = ix86_build_signbit_mask (vmode, 0, 0); + + if (mode == SFmode) + copysign_insn = gen_copysignsf3_var; + else if (mode == DFmode) + copysign_insn = gen_copysigndf3_var; + else + copysign_insn = gen_copysigntf3_var; + + emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask)); + } +} + +/* Deconstruct a copysign operation into bit masks. Operand 0 is known to + be a constant, and so has already been expanded into a vector constant. */ + +void +ix86_split_copysign_const (rtx operands[]) +{ + machine_mode mode, vmode; + rtx dest, op0, mask, x; + + dest = operands[0]; + op0 = operands[1]; + mask = operands[3]; + + mode = GET_MODE (dest); + vmode = GET_MODE (mask); + + dest = lowpart_subreg (vmode, dest, mode); + x = gen_rtx_AND (vmode, dest, mask); + emit_insn (gen_rtx_SET (dest, x)); + + if (op0 != CONST0_RTX (vmode)) + { + x = gen_rtx_IOR (vmode, dest, op0); + emit_insn (gen_rtx_SET (dest, x)); + } +} + +/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, + so we have to do two masks. 
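   Schematically the variable case below computes

     copysign (x, y)  =>  (x & ~SIGN_MASK) | (y & SIGN_MASK)

   with ~SIGN_MASK and SIGN_MASK arriving as the NMASK and MASK operands.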
*/ + +void +ix86_split_copysign_var (rtx operands[]) +{ + machine_mode mode, vmode; + rtx dest, scratch, op0, op1, mask, nmask, x; + + dest = operands[0]; + scratch = operands[1]; + op0 = operands[2]; + op1 = operands[3]; + nmask = operands[4]; + mask = operands[5]; + + mode = GET_MODE (dest); + vmode = GET_MODE (mask); + + if (rtx_equal_p (op0, op1)) + { + /* Shouldn't happen often (it's useless, obviously), but when it does + we'd generate incorrect code if we continue below. */ + emit_move_insn (dest, op0); + return; + } + + if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ + { + gcc_assert (REGNO (op1) == REGNO (scratch)); + + x = gen_rtx_AND (vmode, scratch, mask); + emit_insn (gen_rtx_SET (scratch, x)); + + dest = mask; + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_NOT (vmode, dest); + x = gen_rtx_AND (vmode, x, op0); + emit_insn (gen_rtx_SET (dest, x)); + } + else + { + if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ + { + x = gen_rtx_AND (vmode, scratch, mask); + } + else /* alternative 2,4 */ + { + gcc_assert (REGNO (mask) == REGNO (scratch)); + op1 = lowpart_subreg (vmode, op1, mode); + x = gen_rtx_AND (vmode, scratch, op1); + } + emit_insn (gen_rtx_SET (scratch, x)); + + if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ + { + dest = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_AND (vmode, dest, nmask); + } + else /* alternative 3,4 */ + { + gcc_assert (REGNO (nmask) == REGNO (dest)); + dest = nmask; + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_AND (vmode, dest, op0); + } + emit_insn (gen_rtx_SET (dest, x)); + } + + x = gen_rtx_IOR (vmode, dest, scratch); + emit_insn (gen_rtx_SET (dest, x)); +} + +/* Expand an xorsign operation. */ + +void +ix86_expand_xorsign (rtx operands[]) +{ + rtx (*xorsign_insn)(rtx, rtx, rtx, rtx); + machine_mode mode, vmode; + rtx dest, op0, op1, mask; + + dest = operands[0]; + op0 = operands[1]; + op1 = operands[2]; + + mode = GET_MODE (dest); + + if (mode == SFmode) + { + xorsign_insn = gen_xorsignsf3_1; + vmode = V4SFmode; + } + else if (mode == DFmode) + { + xorsign_insn = gen_xorsigndf3_1; + vmode = V2DFmode; + } + else + gcc_unreachable (); + + mask = ix86_build_signbit_mask (vmode, 0, 0); + + emit_insn (xorsign_insn (dest, op0, op1, mask)); +} + +/* Deconstruct an xorsign operation into bit masks. */ + +void +ix86_split_xorsign (rtx operands[]) +{ + machine_mode mode, vmode; + rtx dest, op0, mask, x; + + dest = operands[0]; + op0 = operands[1]; + mask = operands[3]; + + mode = GET_MODE (dest); + vmode = GET_MODE (mask); + + dest = lowpart_subreg (vmode, dest, mode); + x = gen_rtx_AND (vmode, dest, mask); + emit_insn (gen_rtx_SET (dest, x)); + + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_XOR (vmode, dest, op0); + emit_insn (gen_rtx_SET (dest, x)); +} + +static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1); + +void +ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) +{ + machine_mode mode = GET_MODE (op0); + rtx tmp; + + /* Handle special case - vector comparsion with boolean result, transform + it using ptest instruction. */ + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); + machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode; + + gcc_assert (code == EQ || code == NE); + /* Generate XOR since we can't check that one operand is zero vector. 
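   That is, the branch is lowered roughly as

     tmp = op0 ^ op1
     ptest tmp, tmp          ZF is set iff tmp is all-zero
     je / jne label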
*/ + tmp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); + tmp = gen_lowpart (p_mode, tmp); + emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG), + gen_rtx_UNSPEC (CCmode, + gen_rtvec (2, tmp, tmp), + UNSPEC_PTEST))); + tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + return; + } + + switch (mode) + { + case E_SFmode: + case E_DFmode: + case E_XFmode: + case E_QImode: + case E_HImode: + case E_SImode: + simple: + tmp = ix86_expand_compare (code, op0, op1); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + return; + + case E_DImode: + if (TARGET_64BIT) + goto simple; + /* For 32-bit target DI comparison may be performed on + SSE registers. To allow this we should avoid split + to SI mode which is achieved by doing xor in DI mode + and then comparing with zero (which is recognized by + STV pass). We don't compare using xor when optimizing + for size. */ + if (!optimize_insn_for_size_p () + && TARGET_STV + && (code == EQ || code == NE)) + { + op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); + op1 = const0_rtx; + } + /* FALLTHRU */ + case E_TImode: + /* Expand DImode branch into multiple compare+branch. */ + { + rtx lo[2], hi[2]; + rtx_code_label *label2; + enum rtx_code code1, code2, code3; + machine_mode submode; + + if (CONSTANT_P (op0) && !CONSTANT_P (op1)) + { + std::swap (op0, op1); + code = swap_condition (code); + } + + split_double_mode (mode, &op0, 1, lo+0, hi+0); + split_double_mode (mode, &op1, 1, lo+1, hi+1); + + submode = mode == DImode ? SImode : DImode; + + /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to + avoid two branches. This costs one extra insn, so disable when + optimizing for size. */ + + if ((code == EQ || code == NE) + && (!optimize_insn_for_size_p () + || hi[1] == const0_rtx || lo[1] == const0_rtx)) + { + rtx xor0, xor1; + + xor1 = hi[0]; + if (hi[1] != const0_rtx) + xor1 = expand_binop (submode, xor_optab, xor1, hi[1], + NULL_RTX, 0, OPTAB_WIDEN); + + xor0 = lo[0]; + if (lo[1] != const0_rtx) + xor0 = expand_binop (submode, xor_optab, xor0, lo[1], + NULL_RTX, 0, OPTAB_WIDEN); + + tmp = expand_binop (submode, ior_optab, xor1, xor0, + NULL_RTX, 0, OPTAB_WIDEN); + + ix86_expand_branch (code, tmp, const0_rtx, label); + return; + } + + /* Otherwise, if we are doing less-than or greater-or-equal-than, + op1 is a constant and the low word is zero, then we can just + examine the high word. Similarly for low word -1 and + less-or-equal-than or greater-than. */ + + if (CONST_INT_P (hi[1])) + switch (code) + { + case LT: case LTU: case GE: case GEU: + if (lo[1] == const0_rtx) + { + ix86_expand_branch (code, hi[0], hi[1], label); + return; + } + break; + case LE: case LEU: case GT: case GTU: + if (lo[1] == constm1_rtx) + { + ix86_expand_branch (code, hi[0], hi[1], label); + return; + } + break; + default: + break; + } + + /* Emulate comparisons that do not depend on Zero flag with + double-word subtraction. Note that only Overflow, Sign + and Carry flags are valid, so swap arguments and condition + of comparisons that would otherwise test Zero flag. 
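   For example, a 64-bit signed a < b on a 32-bit target becomes,
   schematically,

     lo(a) - lo(b)               plain CMP, produces the borrow
     hi(a) - hi(b) - borrow      SBB into a scratch, only the flags survive
     branch on SF/OF (signed) or on CF (unsigned)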
*/ + + switch (code) + { + case LE: case LEU: case GT: case GTU: + std::swap (lo[0], lo[1]); + std::swap (hi[0], hi[1]); + code = swap_condition (code); + /* FALLTHRU */ + + case LT: case LTU: case GE: case GEU: + { + rtx (*cmp_insn) (rtx, rtx); + rtx (*sbb_insn) (rtx, rtx, rtx); + bool uns = (code == LTU || code == GEU); + + if (TARGET_64BIT) + { + cmp_insn = gen_cmpdi_1; + sbb_insn + = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz; + } + else + { + cmp_insn = gen_cmpsi_1; + sbb_insn + = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz; + } + + if (!nonimmediate_operand (lo[0], submode)) + lo[0] = force_reg (submode, lo[0]); + if (!x86_64_general_operand (lo[1], submode)) + lo[1] = force_reg (submode, lo[1]); + + if (!register_operand (hi[0], submode)) + hi[0] = force_reg (submode, hi[0]); + if ((uns && !nonimmediate_operand (hi[1], submode)) + || (!uns && !x86_64_general_operand (hi[1], submode))) + hi[1] = force_reg (submode, hi[1]); + + emit_insn (cmp_insn (lo[0], lo[1])); + emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1])); + + tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); + + ix86_expand_branch (code, tmp, const0_rtx, label); + return; + } + + default: + break; + } + + /* Otherwise, we need two or three jumps. */ + + label2 = gen_label_rtx (); + + code1 = code; + code2 = swap_condition (code); + code3 = unsigned_condition (code); + + switch (code) + { + case LT: case GT: case LTU: case GTU: + break; + + case LE: code1 = LT; code2 = GT; break; + case GE: code1 = GT; code2 = LT; break; + case LEU: code1 = LTU; code2 = GTU; break; + case GEU: code1 = GTU; code2 = LTU; break; + + case EQ: code1 = UNKNOWN; code2 = NE; break; + case NE: code2 = UNKNOWN; break; + + default: + gcc_unreachable (); + } + + /* + * a < b => + * if (hi(a) < hi(b)) goto true; + * if (hi(a) > hi(b)) goto false; + * if (lo(a) < lo(b)) goto true; + * false: + */ + + if (code1 != UNKNOWN) + ix86_expand_branch (code1, hi[0], hi[1], label); + if (code2 != UNKNOWN) + ix86_expand_branch (code2, hi[0], hi[1], label2); + + ix86_expand_branch (code3, lo[0], lo[1], label); + + if (code2 != UNKNOWN) + emit_label (label2); + return; + } + + default: + gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); + goto simple; + } +} + +/* Figure out whether to use unordered fp comparisons. */ + +static bool +ix86_unordered_fp_compare (enum rtx_code code) +{ + if (!TARGET_IEEE_FP) + return false; + + switch (code) + { + case GT: + case GE: + case LT: + case LE: + return false; + + case EQ: + case NE: + + case LTGT: + case UNORDERED: + case ORDERED: + case UNLT: + case UNLE: + case UNGT: + case UNGE: + case UNEQ: + return true; + + default: + gcc_unreachable (); + } +} + +/* Return a comparison we can do and that it is equivalent to + swap_condition (code) apart possibly from orderedness. + But, never change orderedness if TARGET_IEEE_FP, returning + UNKNOWN in that case if necessary. */ + +static enum rtx_code +ix86_fp_swap_condition (enum rtx_code code) +{ + switch (code) + { + case GT: /* GTU - CF=0 & ZF=0 */ + return TARGET_IEEE_FP ? UNKNOWN : UNLT; + case GE: /* GEU - CF=0 */ + return TARGET_IEEE_FP ? UNKNOWN : UNLE; + case UNLT: /* LTU - CF=1 */ + return TARGET_IEEE_FP ? UNKNOWN : GT; + case UNLE: /* LEU - CF=1 | ZF=1 */ + return TARGET_IEEE_FP ? UNKNOWN : GE; + default: + return swap_condition (code); + } +} + +/* Return cost of comparison CODE using the best strategy for performance. + All following functions do use number of instructions as a cost metrics. 
+ In future this should be tweaked to compute bytes for optimize_size and + take into account performance of various instructions on various CPUs. */ + +static int +ix86_fp_comparison_cost (enum rtx_code code) +{ + int arith_cost; + + /* The cost of code using bit-twiddling on %ah. */ + switch (code) + { + case UNLE: + case UNLT: + case LTGT: + case GT: + case GE: + case UNORDERED: + case ORDERED: + case UNEQ: + arith_cost = 4; + break; + case LT: + case NE: + case EQ: + case UNGE: + arith_cost = TARGET_IEEE_FP ? 5 : 4; + break; + case LE: + case UNGT: + arith_cost = TARGET_IEEE_FP ? 6 : 4; + break; + default: + gcc_unreachable (); + } + + switch (ix86_fp_comparison_strategy (code)) + { + case IX86_FPCMP_COMI: + return arith_cost > 4 ? 3 : 2; + case IX86_FPCMP_SAHF: + return arith_cost > 4 ? 4 : 3; + default: + return arith_cost; + } +} + +/* Swap, force into registers, or otherwise massage the two operands + to a fp comparison. The operands are updated in place; the new + comparison code is returned. */ + +static enum rtx_code +ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) +{ + bool unordered_compare = ix86_unordered_fp_compare (code); + rtx op0 = *pop0, op1 = *pop1; + machine_mode op_mode = GET_MODE (op0); + bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); + + /* All of the unordered compare instructions only work on registers. + The same is true of the fcomi compare instructions. The XFmode + compare instructions require registers except when comparing + against zero or when converting operand 1 from fixed point to + floating point. */ + + if (!is_sse + && (unordered_compare + || (op_mode == XFmode + && ! (standard_80387_constant_p (op0) == 1 + || standard_80387_constant_p (op1) == 1) + && GET_CODE (op1) != FLOAT) + || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) + { + op0 = force_reg (op_mode, op0); + op1 = force_reg (op_mode, op1); + } + else + { + /* %%% We only allow op1 in memory; op0 must be st(0). So swap + things around if they appear profitable, otherwise force op0 + into a register. */ + + if (standard_80387_constant_p (op0) == 0 + || (MEM_P (op0) + && ! (standard_80387_constant_p (op1) == 0 + || MEM_P (op1)))) + { + enum rtx_code new_code = ix86_fp_swap_condition (code); + if (new_code != UNKNOWN) + { + std::swap (op0, op1); + code = new_code; + } + } + + if (!REG_P (op0)) + op0 = force_reg (op_mode, op0); + + if (CONSTANT_P (op1)) + { + int tmp = standard_80387_constant_p (op1); + if (tmp == 0) + op1 = validize_mem (force_const_mem (op_mode, op1)); + else if (tmp == 1) + { + if (TARGET_CMOVE) + op1 = force_reg (op_mode, op1); + } + else + op1 = force_reg (op_mode, op1); + } + } + + /* Try to rearrange the comparison to make it cheaper. */ + if (ix86_fp_comparison_cost (code) + > ix86_fp_comparison_cost (swap_condition (code)) + && (REG_P (op1) || can_create_pseudo_p ())) + { + std::swap (op0, op1); + code = swap_condition (code); + if (!REG_P (op0)) + op0 = force_reg (op_mode, op0); + } + + *pop0 = op0; + *pop1 = op1; + return code; +} + +/* Generate insn patterns to do a floating point compare of OPERANDS. 
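   Depending on ix86_fp_comparison_strategy the comparison is emitted
   roughly as

     IX86_FPCMP_COMI:   (u)comiss / (u)comisd or f(u)comi, flags set directly
     IX86_FPCMP_SAHF:   x87 compare + fnstsw %ax + sahf
     IX86_FPCMP_ARITH:  fnstsw %ax + bit tests on %ah (handled below)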
*/ + +static rtx +ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) +{ + bool unordered_compare = ix86_unordered_fp_compare (code); + machine_mode cmp_mode; + rtx tmp, scratch; + + code = ix86_prepare_fp_compare_args (code, &op0, &op1); + + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); + + /* Do fcomi/sahf based test when profitable. */ + switch (ix86_fp_comparison_strategy (code)) + { + case IX86_FPCMP_COMI: + cmp_mode = CCFPmode; + emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); + break; + + case IX86_FPCMP_SAHF: + cmp_mode = CCFPmode; + tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); + scratch = gen_reg_rtx (HImode); + emit_insn (gen_rtx_SET (scratch, tmp)); + emit_insn (gen_x86_sahf_1 (scratch)); + break; + + case IX86_FPCMP_ARITH: + cmp_mode = CCNOmode; + tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); + scratch = gen_reg_rtx (HImode); + emit_insn (gen_rtx_SET (scratch, tmp)); + + /* In the unordered case, we have to check C2 for NaN's, which + doesn't happen to work out to anything nice combination-wise. + So do some bit twiddling on the value we've got in AH to come + up with an appropriate set of condition codes. */ + + switch (code) + { + case GT: + case UNGT: + if (code == GT || !TARGET_IEEE_FP) + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); + code = EQ; + } + else + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); + cmp_mode = CCmode; + code = GEU; + } + break; + case LT: + case UNLT: + if (code == LT && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); + cmp_mode = CCmode; + code = EQ; + } + else + { + emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx)); + code = NE; + } + break; + case GE: + case UNGE: + if (code == GE || !TARGET_IEEE_FP) + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05))); + code = EQ; + } + else + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx)); + code = NE; + } + break; + case LE: + case UNLE: + if (code == LE && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); + cmp_mode = CCmode; + code = LTU; + } + else + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); + code = NE; + } + break; + case EQ: + case UNEQ: + if (code == EQ && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); + cmp_mode = CCmode; + code = EQ; + } + else + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); + code = NE; + } + break; + case NE: + case LTGT: + if (code == NE && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, + GEN_INT (0x40))); + code = NE; + } + else + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); + code = EQ; + } + break; + + case UNORDERED: + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); + code = NE; + break; + case ORDERED: + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT 
(0x04))); + code = EQ; + break; + + default: + gcc_unreachable (); + } + break; + + default: + gcc_unreachable(); + } + + /* Return the test that should be put into the flags user, i.e. + the bcc, scc, or cmov instruction. */ + return gen_rtx_fmt_ee (code, VOIDmode, + gen_rtx_REG (cmp_mode, FLAGS_REG), + const0_rtx); +} + +/* Generate insn patterns to do an integer compare of OPERANDS. */ + +static rtx +ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) +{ + machine_mode cmpmode; + rtx tmp, flags; + + cmpmode = SELECT_CC_MODE (code, op0, op1); + flags = gen_rtx_REG (cmpmode, FLAGS_REG); + + /* This is very simple, but making the interface the same as in the + FP case makes the rest of the code easier. */ + tmp = gen_rtx_COMPARE (cmpmode, op0, op1); + emit_insn (gen_rtx_SET (flags, tmp)); + + /* Return the test that should be put into the flags user, i.e. + the bcc, scc, or cmov instruction. */ + return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); +} + +static rtx +ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) +{ + rtx ret; + + if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) + ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); + + else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); + ret = ix86_expand_fp_compare (code, op0, op1); + } + else + ret = ix86_expand_int_compare (code, op0, op1); + + return ret; +} + +void +ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) +{ + rtx ret; + + gcc_assert (GET_MODE (dest) == QImode); + + ret = ix86_expand_compare (code, op0, op1); + PUT_MODE (ret, QImode); + emit_insn (gen_rtx_SET (dest, ret)); +} + +/* Expand comparison setting or clearing carry flag. Return true when + successful and set pop for the operation. */ +static bool +ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) +{ + machine_mode mode + = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); + + /* Do not handle double-mode compares that go through special path. */ + if (mode == (TARGET_64BIT ? TImode : DImode)) + return false; + + if (SCALAR_FLOAT_MODE_P (mode)) + { + rtx compare_op; + rtx_insn *compare_seq; + + gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); + + /* Shortcut: following common codes never translate + into carry flag compares. */ + if (code == EQ || code == NE || code == UNEQ || code == LTGT + || code == ORDERED || code == UNORDERED) + return false; + + /* These comparisons require zero flag; swap operands so they won't. */ + if ((code == GT || code == UNLE || code == LE || code == UNGT) + && !TARGET_IEEE_FP) + { + std::swap (op0, op1); + code = swap_condition (code); + } + + /* Try to expand the comparison and verify that we end up with + carry flag based comparison. This fails to be true only when + we decide to expand comparison using arithmetic that is not + too common scenario. */ + start_sequence (); + compare_op = ix86_expand_fp_compare (code, op0, op1); + compare_seq = get_insns (); + end_sequence (); + + if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) + code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); + else + code = GET_CODE (compare_op); + + if (code != LTU && code != GEU) + return false; + + emit_insn (compare_seq); + *pop = compare_op; + return true; + } + + if (!INTEGRAL_MODE_P (mode)) + return false; + + switch (code) + { + case LTU: + case GEU: + break; + + /* Convert a==0 into (unsigned)a<1. */ + case EQ: + case NE: + if (op1 != const0_rtx) + return false; + op1 = const1_rtx; + code = (code == EQ ? 
LTU : GEU); + break; + + /* Convert a>b into b=b-1. */ + case GTU: + case LEU: + if (CONST_INT_P (op1)) + { + op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); + /* Bail out on overflow. We still can swap operands but that + would force loading of the constant into register. */ + if (op1 == const0_rtx + || !x86_64_immediate_operand (op1, GET_MODE (op1))) + return false; + code = (code == GTU ? GEU : LTU); + } + else + { + std::swap (op0, op1); + code = (code == GTU ? LTU : GEU); + } + break; + + /* Convert a>=0 into (unsigned)a<0x80000000. */ + case LT: + case GE: + if (mode == DImode || op1 != const0_rtx) + return false; + op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); + code = (code == LT ? GEU : LTU); + break; + case LE: + case GT: + if (mode == DImode || op1 != constm1_rtx) + return false; + op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); + code = (code == LE ? GEU : LTU); + break; + + default: + return false; + } + /* Swapping operands may cause constant to appear as first operand. */ + if (!nonimmediate_operand (op0, VOIDmode)) + { + if (!can_create_pseudo_p ()) + return false; + op0 = force_reg (mode, op0); + } + *pop = ix86_expand_compare (code, op0, op1); + gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); + return true; +} + +/* Expand conditional increment or decrement using adb/sbb instructions. + The default case using setcc followed by the conditional move can be + done by generic code. */ +bool +ix86_expand_int_addcc (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[1]); + rtx flags; + rtx (*insn)(rtx, rtx, rtx, rtx, rtx); + rtx compare_op; + rtx val = const0_rtx; + bool fpcmp = false; + machine_mode mode; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (operands[3] != const1_rtx + && operands[3] != constm1_rtx) + return false; + if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) + return false; + code = GET_CODE (compare_op); + + flags = XEXP (compare_op, 0); + + if (GET_MODE (flags) == CCFPmode) + { + fpcmp = true; + code = ix86_fp_compare_code_to_integer (code); + } + + if (code != LTU) + { + val = constm1_rtx; + if (fpcmp) + PUT_CODE (compare_op, + reverse_condition_maybe_unordered + (GET_CODE (compare_op))); + else + PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); + } + + mode = GET_MODE (operands[0]); + + /* Construct either adc or sbb insn. 
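   For example, a conditional increment such as

     x = (a < b) ? y + 1 : y        unsigned comparison

   is emitted as a CMP that leaves the condition in the carry flag followed
   by an ADC of zero, with no branch or setcc needed.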
*/ + if ((code == LTU) == (operands[3] == constm1_rtx)) + { + switch (mode) + { + case E_QImode: + insn = gen_subqi3_carry; + break; + case E_HImode: + insn = gen_subhi3_carry; + break; + case E_SImode: + insn = gen_subsi3_carry; + break; + case E_DImode: + insn = gen_subdi3_carry; + break; + default: + gcc_unreachable (); + } + } + else + { + switch (mode) + { + case E_QImode: + insn = gen_addqi3_carry; + break; + case E_HImode: + insn = gen_addhi3_carry; + break; + case E_SImode: + insn = gen_addsi3_carry; + break; + case E_DImode: + insn = gen_adddi3_carry; + break; + default: + gcc_unreachable (); + } + } + emit_insn (insn (operands[0], operands[2], val, flags, compare_op)); + + return true; +} + +bool +ix86_expand_int_movcc (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[1]), compare_code; + rtx_insn *compare_seq; + rtx compare_op; + machine_mode mode = GET_MODE (operands[0]); + bool sign_bit_compare_p = false; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (GET_MODE (op0) == TImode + || (GET_MODE (op0) == DImode + && !TARGET_64BIT)) + return false; + + start_sequence (); + compare_op = ix86_expand_compare (code, op0, op1); + compare_seq = get_insns (); + end_sequence (); + + compare_code = GET_CODE (compare_op); + + if ((op1 == const0_rtx && (code == GE || code == LT)) + || (op1 == constm1_rtx && (code == GT || code == LE))) + sign_bit_compare_p = true; + + /* Don't attempt mode expansion here -- if we had to expand 5 or 6 + HImode insns, we'd be swallowed in word prefix ops. */ + + if ((mode != HImode || TARGET_FAST_PREFIX) + && (mode != (TARGET_64BIT ? TImode : DImode)) + && CONST_INT_P (operands[2]) + && CONST_INT_P (operands[3])) + { + rtx out = operands[0]; + HOST_WIDE_INT ct = INTVAL (operands[2]); + HOST_WIDE_INT cf = INTVAL (operands[3]); + HOST_WIDE_INT diff; + + diff = ct - cf; + /* Sign bit compares are better done using shifts than we do by using + sbb. */ + if (sign_bit_compare_p + || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) + { + /* Detect overlap between destination and compare sources. */ + rtx tmp = out; + + if (!sign_bit_compare_p) + { + rtx flags; + bool fpcmp = false; + + compare_code = GET_CODE (compare_op); + + flags = XEXP (compare_op, 0); + + if (GET_MODE (flags) == CCFPmode) + { + fpcmp = true; + compare_code + = ix86_fp_compare_code_to_integer (compare_code); + } + + /* To simplify rest of code, restrict to the GEU case. */ + if (compare_code == LTU) + { + std::swap (ct, cf); + compare_code = reverse_condition (compare_code); + code = reverse_condition (code); + } + else + { + if (fpcmp) + PUT_CODE (compare_op, + reverse_condition_maybe_unordered + (GET_CODE (compare_op))); + else + PUT_CODE (compare_op, + reverse_condition (GET_CODE (compare_op))); + } + diff = ct - cf; + + if (reg_overlap_mentioned_p (out, op0) + || reg_overlap_mentioned_p (out, op1)) + tmp = gen_reg_rtx (mode); + + if (mode == DImode) + emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); + else + emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), + flags, compare_op)); + } + else + { + if (code == GT || code == GE) + code = reverse_condition (code); + else + { + std::swap (ct, cf); + diff = ct - cf; + } + tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1); + } + + if (diff == 1) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * [addl dest, ct] + * + * Size 5 - 8. 
+ */ + if (ct) + tmp = expand_simple_binop (mode, PLUS, + tmp, GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else if (cf == -1) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * orl $ct, dest + * + * Size 8. + */ + tmp = expand_simple_binop (mode, IOR, + tmp, GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else if (diff == -1 && ct) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * notl dest + * [addl dest, cf] + * + * Size 8 - 11. + */ + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); + if (cf) + tmp = expand_simple_binop (mode, PLUS, + copy_rtx (tmp), GEN_INT (cf), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * [notl dest] + * andl cf - ct, dest + * [addl dest, ct] + * + * Size 8 - 11. + */ + + if (cf == 0) + { + cf = ct; + ct = 0; + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); + } + + tmp = expand_simple_binop (mode, AND, + copy_rtx (tmp), + gen_int_mode (cf - ct, mode), + copy_rtx (tmp), 1, OPTAB_DIRECT); + if (ct) + tmp = expand_simple_binop (mode, PLUS, + copy_rtx (tmp), GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + + if (!rtx_equal_p (tmp, out)) + emit_move_insn (copy_rtx (out), copy_rtx (tmp)); + + return true; + } + + if (diff < 0) + { + machine_mode cmp_mode = GET_MODE (op0); + enum rtx_code new_code; + + if (SCALAR_FLOAT_MODE_P (cmp_mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); + + /* We may be reversing unordered compare to normal compare, that + is not valid in general (we may convert non-trapping condition + to trapping one), however on i386 we currently emit all + comparisons unordered. */ + new_code = reverse_condition_maybe_unordered (code); + } + else + new_code = ix86_reverse_condition (code, cmp_mode); + if (new_code != UNKNOWN) + { + std::swap (ct, cf); + diff = -diff; + code = new_code; + } + } + + compare_code = UNKNOWN; + if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT + && CONST_INT_P (op1)) + { + if (op1 == const0_rtx + && (code == LT || code == GE)) + compare_code = code; + else if (op1 == constm1_rtx) + { + if (code == LE) + compare_code = LT; + else if (code == GT) + compare_code = GE; + } + } + + /* Optimize dest = (op0 < 0) ? -1 : cf. */ + if (compare_code != UNKNOWN + && GET_MODE (op0) == GET_MODE (out) + && (cf == -1 || ct == -1)) + { + /* If lea code below could be used, only optimize + if it results in a 2 insn sequence. */ + + if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8 + || diff == 3 || diff == 5 || diff == 9) + || (compare_code == LT && ct == -1) + || (compare_code == GE && cf == -1)) + { + /* + * notl op1 (if necessary) + * sarl $31, op1 + * orl cf, op1 + */ + if (ct != -1) + { + cf = ct; + ct = -1; + code = reverse_condition (code); + } + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); + + out = expand_simple_binop (mode, IOR, + out, GEN_INT (cf), + out, 1, OPTAB_DIRECT); + if (out != operands[0]) + emit_move_insn (operands[0], out); + + return true; + } + } + + + if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 + || diff == 3 || diff == 5 || diff == 9) + && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) + && (mode != DImode + || x86_64_immediate_operand (GEN_INT (cf), VOIDmode))) + { + /* + * xorl dest,dest + * cmpl op1,op2 + * setcc dest + * lea cf(dest*(ct-cf)),dest + * + * Size 14. + * + * This also catches the degenerate setcc-only case. 
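   As a concrete instance (hypothetical register assignment),
   r = (a == b) ? 5 : 2 has ct = 5, cf = 2, diff = 3 and may be emitted as

     xorl  %eax, %eax
     cmpl  %edx, %ecx
     sete  %al
     leal  2(%eax,%eax,2), %eax      3 * %eax + 2, i.e. 5 or 2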
+ */ + + rtx tmp; + int nops; + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); + + nops = 0; + /* On x86_64 the lea instruction operates on Pmode, so we need + to get arithmetics done in proper mode to match. */ + if (diff == 1) + tmp = copy_rtx (out); + else + { + rtx out1; + out1 = copy_rtx (out); + tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1)); + nops++; + if (diff & 1) + { + tmp = gen_rtx_PLUS (mode, tmp, out1); + nops++; + } + } + if (cf != 0) + { + tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); + nops++; + } + if (!rtx_equal_p (tmp, out)) + { + if (nops == 1) + out = force_operand (tmp, copy_rtx (out)); + else + emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp))); + } + if (!rtx_equal_p (out, operands[0])) + emit_move_insn (operands[0], copy_rtx (out)); + + return true; + } + + /* + * General case: Jumpful: + * xorl dest,dest cmpl op1, op2 + * cmpl op1, op2 movl ct, dest + * setcc dest jcc 1f + * decl dest movl cf, dest + * andl (cf-ct),dest 1: + * addl ct,dest + * + * Size 20. Size 14. + * + * This is reasonably steep, but branch mispredict costs are + * high on modern cpus, so consider failing only if optimizing + * for space. + */ + + if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) + && BRANCH_COST (optimize_insn_for_speed_p (), + false) >= 2) + { + if (cf == 0) + { + machine_mode cmp_mode = GET_MODE (op0); + enum rtx_code new_code; + + if (SCALAR_FLOAT_MODE_P (cmp_mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); + + /* We may be reversing unordered compare to normal compare, + that is not valid in general (we may convert non-trapping + condition to trapping one), however on i386 we currently + emit all comparisons unordered. */ + new_code = reverse_condition_maybe_unordered (code); + } + else + { + new_code = ix86_reverse_condition (code, cmp_mode); + if (compare_code != UNKNOWN && new_code != UNKNOWN) + compare_code = reverse_condition (compare_code); + } + + if (new_code != UNKNOWN) + { + cf = ct; + ct = 0; + code = new_code; + } + } + + if (compare_code != UNKNOWN) + { + /* notl op1 (if needed) + sarl $31, op1 + andl (cf-ct), op1 + addl ct, op1 + + For x < 0 (resp. x <= -1) there will be no notl, + so if possible swap the constants to get rid of the + complement. + True/false will be -1/0 while code below (store flag + followed by decrement) is 0/-1, so the constants need + to be exchanged once more. */ + + if (compare_code == GE || !cf) + { + code = reverse_condition (code); + compare_code = LT; + } + else + std::swap (ct, cf); + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); + } + else + { + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); + + out = expand_simple_binop (mode, PLUS, copy_rtx (out), + constm1_rtx, + copy_rtx (out), 1, OPTAB_DIRECT); + } + + out = expand_simple_binop (mode, AND, copy_rtx (out), + gen_int_mode (cf - ct, mode), + copy_rtx (out), 1, OPTAB_DIRECT); + if (ct) + out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), + copy_rtx (out), 1, OPTAB_DIRECT); + if (!rtx_equal_p (out, operands[0])) + emit_move_insn (operands[0], copy_rtx (out)); + + return true; + } + } + + if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) + { + /* Try a few things more with specific constants and a variable. 
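   The idea used below when one arm is 0 or -1, sketched:

     r = cond ? 0  : var    =>   mask = cond ? 0  : -1;   r = var & mask
     r = cond ? -1 : var    =>   mask = cond ? -1 : 0;    r = var | mask

   (and symmetrically when the constant sits in the false arm), so only the
   constant arm is fed back into the recursive movcc expansion.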
*/ + + optab op; + rtx var, orig_out, out, tmp; + + if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) + return false; + + /* If one of the two operands is an interesting constant, load a + constant with the above and mask it in with a logical operation. */ + + if (CONST_INT_P (operands[2])) + { + var = operands[3]; + if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) + operands[3] = constm1_rtx, op = and_optab; + else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) + operands[3] = const0_rtx, op = ior_optab; + else + return false; + } + else if (CONST_INT_P (operands[3])) + { + var = operands[2]; + if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) + operands[2] = constm1_rtx, op = and_optab; + else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) + operands[2] = const0_rtx, op = ior_optab; + else + return false; + } + else + return false; + + orig_out = operands[0]; + tmp = gen_reg_rtx (mode); + operands[0] = tmp; + + /* Recurse to get the constant loaded. */ + if (!ix86_expand_int_movcc (operands)) + return false; + + /* Mask in the interesting variable. */ + out = expand_binop (mode, op, var, tmp, orig_out, 0, + OPTAB_WIDEN); + if (!rtx_equal_p (out, orig_out)) + emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); + + return true; + } + + /* + * For comparison with above, + * + * movl cf,dest + * movl ct,tmp + * cmpl op1,op2 + * cmovcc tmp,dest + * + * Size 15. + */ + + if (! nonimmediate_operand (operands[2], mode)) + operands[2] = force_reg (mode, operands[2]); + if (! nonimmediate_operand (operands[3], mode)) + operands[3] = force_reg (mode, operands[3]); + + if (! register_operand (operands[2], VOIDmode) + && (mode == QImode + || ! register_operand (operands[3], VOIDmode))) + operands[2] = force_reg (mode, operands[2]); + + if (mode == QImode + && ! register_operand (operands[3], VOIDmode)) + operands[3] = force_reg (mode, operands[3]); + + emit_insn (compare_seq); + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_IF_THEN_ELSE (mode, + compare_op, operands[2], + operands[3]))); + return true; +} + +/* Detect conditional moves that exactly match min/max operational + semantics. Note that this is IEEE safe, as long as we don't + interchange the operands. + + Returns FALSE if this conditional move doesn't match a MIN/MAX, + and TRUE if the operation is successful and instructions are emitted. */ + +static bool +ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, + rtx cmp_op1, rtx if_true, rtx if_false) +{ + machine_mode mode; + bool is_min; + rtx tmp; + + if (code == LT) + ; + else if (code == UNGE) + std::swap (if_true, if_false); + else + return false; + + if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false)) + is_min = true; + else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false)) + is_min = false; + else + return false; + + mode = GET_MODE (dest); + + /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here, + but MODE may be a vector mode and thus not appropriate. */ + if (!flag_finite_math_only || flag_signed_zeros) + { + int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX; + rtvec v; + + if_true = force_reg (mode, if_true); + v = gen_rtvec (2, if_true, if_false); + tmp = gen_rtx_UNSPEC (mode, v, u); + } + else + { + code = is_min ? 
SMIN : SMAX; + if (MEM_P (if_true) && MEM_P (if_false)) + if_true = force_reg (mode, if_true); + tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false); + } + + emit_insn (gen_rtx_SET (dest, tmp)); + return true; +} + +/* Expand an SSE comparison. Return the register with the result. */ + +static rtx +ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, + rtx op_true, rtx op_false) +{ + machine_mode mode = GET_MODE (dest); + machine_mode cmp_ops_mode = GET_MODE (cmp_op0); + + /* In general case result of comparison can differ from operands' type. */ + machine_mode cmp_mode; + + /* In AVX512F the result of comparison is an integer mask. */ + bool maskcmp = false; + rtx x; + + if (GET_MODE_SIZE (cmp_ops_mode) == 64) + { + unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); + cmp_mode = int_mode_for_size (nbits, 0).require (); + maskcmp = true; + } + else + cmp_mode = cmp_ops_mode; + + cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); + + int (*op1_predicate)(rtx, machine_mode) + = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand; + + if (!op1_predicate (cmp_op1, cmp_ops_mode)) + cmp_op1 = force_reg (cmp_ops_mode, cmp_op1); + + if (optimize + || (maskcmp && cmp_mode != mode) + || (op_true && reg_overlap_mentioned_p (dest, op_true)) + || (op_false && reg_overlap_mentioned_p (dest, op_false))) + dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); + + /* Compare patterns for int modes are unspec in AVX512F only. */ + if (maskcmp && (code == GT || code == EQ)) + { + rtx (*gen)(rtx, rtx, rtx); + + switch (cmp_ops_mode) + { + case E_V64QImode: + gcc_assert (TARGET_AVX512BW); + gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1; + break; + case E_V32HImode: + gcc_assert (TARGET_AVX512BW); + gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1; + break; + case E_V16SImode: + gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1; + break; + case E_V8DImode: + gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1; + break; + default: + gen = NULL; + } + + if (gen) + { + emit_insn (gen (dest, cmp_op0, cmp_op1)); + return dest; + } + } + x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); + + if (cmp_mode != mode && !maskcmp) + { + x = force_reg (cmp_ops_mode, x); + convert_move (dest, x, false); + } + else + emit_insn (gen_rtx_SET (dest, x)); + + return dest; +} + +/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical + operations. This is used for both scalar and vector conditional moves. */ + +void +ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) +{ + machine_mode mode = GET_MODE (dest); + machine_mode cmpmode = GET_MODE (cmp); + + /* In AVX512F the result of comparison is an integer mask. */ + bool maskcmp = (mode != cmpmode && TARGET_AVX512F); + + rtx t2, t3, x; + + /* If we have an integer mask and FP value then we need + to cast mask to FP mode. 
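For reference, ix86_expand_sse_fp_minmax above accepts only comparison shapes whose operand order already matches min/max semantics; a hedged scalar model of the two recognized patterns (function names are invented for illustration):

  /* The shapes detected as min/max; operand order is preserved, which is
     what keeps the transformation IEEE safe per the comment above.  */
  static double scalar_min (double a, double b) { return a < b ? a : b; }  /* LT, is_min */
  static double scalar_max (double a, double b) { return a < b ? b : a; }  /* LT, is_max */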
*/ + if (mode != cmpmode && VECTOR_MODE_P (cmpmode)) + { + cmp = force_reg (cmpmode, cmp); + cmp = gen_rtx_SUBREG (mode, cmp, 0); + } + + if (maskcmp) + { + rtx (*gen) (rtx, rtx) = NULL; + if ((op_true == CONST0_RTX (mode) + && vector_all_ones_operand (op_false, mode)) + || (op_false == CONST0_RTX (mode) + && vector_all_ones_operand (op_true, mode))) + switch (mode) + { + case E_V64QImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_cvtmask2bv64qi; + break; + case E_V32QImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_cvtmask2bv32qi; + break; + case E_V16QImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_cvtmask2bv16qi; + break; + case E_V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_cvtmask2wv32hi; + break; + case E_V16HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_cvtmask2wv16hi; + break; + case E_V8HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_cvtmask2wv8hi; + break; + case E_V16SImode: + if (TARGET_AVX512DQ) + gen = gen_avx512f_cvtmask2dv16si; + break; + case E_V8SImode: + if (TARGET_AVX512VL && TARGET_AVX512DQ) + gen = gen_avx512vl_cvtmask2dv8si; + break; + case E_V4SImode: + if (TARGET_AVX512VL && TARGET_AVX512DQ) + gen = gen_avx512vl_cvtmask2dv4si; + break; + case E_V8DImode: + if (TARGET_AVX512DQ) + gen = gen_avx512f_cvtmask2qv8di; + break; + case E_V4DImode: + if (TARGET_AVX512VL && TARGET_AVX512DQ) + gen = gen_avx512vl_cvtmask2qv4di; + break; + case E_V2DImode: + if (TARGET_AVX512VL && TARGET_AVX512DQ) + gen = gen_avx512vl_cvtmask2qv2di; + break; + default: + break; + } + if (gen && SCALAR_INT_MODE_P (cmpmode)) + { + cmp = force_reg (cmpmode, cmp); + if (op_true == CONST0_RTX (mode)) + { + rtx (*gen_not) (rtx, rtx); + switch (cmpmode) + { + case E_QImode: gen_not = gen_knotqi; break; + case E_HImode: gen_not = gen_knothi; break; + case E_SImode: gen_not = gen_knotsi; break; + case E_DImode: gen_not = gen_knotdi; break; + default: gcc_unreachable (); + } + rtx n = gen_reg_rtx (cmpmode); + emit_insn (gen_not (n, cmp)); + cmp = n; + } + emit_insn (gen (dest, cmp)); + return; + } + } + else if (vector_all_ones_operand (op_true, mode) + && op_false == CONST0_RTX (mode)) + { + emit_insn (gen_rtx_SET (dest, cmp)); + return; + } + else if (op_false == CONST0_RTX (mode)) + { + op_true = force_reg (mode, op_true); + x = gen_rtx_AND (mode, cmp, op_true); + emit_insn (gen_rtx_SET (dest, x)); + return; + } + else if (op_true == CONST0_RTX (mode)) + { + op_false = force_reg (mode, op_false); + x = gen_rtx_NOT (mode, cmp); + x = gen_rtx_AND (mode, x, op_false); + emit_insn (gen_rtx_SET (dest, x)); + return; + } + else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) + { + op_false = force_reg (mode, op_false); + x = gen_rtx_IOR (mode, cmp, op_false); + emit_insn (gen_rtx_SET (dest, x)); + return; + } + else if (TARGET_XOP) + { + op_true = force_reg (mode, op_true); + + if (!nonimmediate_operand (op_false, mode)) + op_false = force_reg (mode, op_false); + + emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, + op_true, + op_false))); + return; + } + + rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; + rtx d = dest; + + if (!vector_operand (op_true, mode)) + op_true = force_reg (mode, op_true); + + op_false = force_reg (mode, op_false); + + switch (mode) + { + case E_V4SFmode: + if (TARGET_SSE4_1) + gen = gen_sse4_1_blendvps; + break; + case E_V2DFmode: + if (TARGET_SSE4_1) + gen = gen_sse4_1_blendvpd; + break; + case E_SFmode: + if (TARGET_SSE4_1) + { + gen = gen_sse4_1_blendvss; + 
op_true = force_reg (mode, op_true); + } + break; + case E_DFmode: + if (TARGET_SSE4_1) + { + gen = gen_sse4_1_blendvsd; + op_true = force_reg (mode, op_true); + } + break; + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + if (TARGET_SSE4_1) + { + gen = gen_sse4_1_pblendvb; + if (mode != V16QImode) + d = gen_reg_rtx (V16QImode); + op_false = gen_lowpart (V16QImode, op_false); + op_true = gen_lowpart (V16QImode, op_true); + cmp = gen_lowpart (V16QImode, cmp); + } + break; + case E_V8SFmode: + if (TARGET_AVX) + gen = gen_avx_blendvps256; + break; + case E_V4DFmode: + if (TARGET_AVX) + gen = gen_avx_blendvpd256; + break; + case E_V32QImode: + case E_V16HImode: + case E_V8SImode: + case E_V4DImode: + if (TARGET_AVX2) + { + gen = gen_avx2_pblendvb; + if (mode != V32QImode) + d = gen_reg_rtx (V32QImode); + op_false = gen_lowpart (V32QImode, op_false); + op_true = gen_lowpart (V32QImode, op_true); + cmp = gen_lowpart (V32QImode, cmp); + } + break; + + case E_V64QImode: + gen = gen_avx512bw_blendmv64qi; + break; + case E_V32HImode: + gen = gen_avx512bw_blendmv32hi; + break; + case E_V16SImode: + gen = gen_avx512f_blendmv16si; + break; + case E_V8DImode: + gen = gen_avx512f_blendmv8di; + break; + case E_V8DFmode: + gen = gen_avx512f_blendmv8df; + break; + case E_V16SFmode: + gen = gen_avx512f_blendmv16sf; + break; + + default: + break; + } + + if (gen != NULL) + { + emit_insn (gen (d, op_false, op_true, cmp)); + if (d != dest) + emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); + } + else + { + op_true = force_reg (mode, op_true); + + t2 = gen_reg_rtx (mode); + if (optimize) + t3 = gen_reg_rtx (mode); + else + t3 = dest; + + x = gen_rtx_AND (mode, op_true, cmp); + emit_insn (gen_rtx_SET (t2, x)); + + x = gen_rtx_NOT (mode, cmp); + x = gen_rtx_AND (mode, x, op_false); + emit_insn (gen_rtx_SET (t3, x)); + + x = gen_rtx_IOR (mode, t3, t2); + emit_insn (gen_rtx_SET (dest, x)); + } +} + +/* Swap, force into registers, or otherwise massage the two operands + to an sse comparison with a mask result. Thus we differ a bit from + ix86_prepare_fp_compare_args which expects to produce a flags result. + + The DEST operand exists to help determine whether to commute commutative + operators. The POP0/POP1 operands are updated in place. The new + comparison code is returned, or UNKNOWN if not implementable. */ + +static enum rtx_code +ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, + rtx *pop0, rtx *pop1) +{ + switch (code) + { + case LTGT: + case UNEQ: + /* AVX supports all the needed comparisons. */ + if (TARGET_AVX) + break; + /* We have no LTGT as an operator. We could implement it with + NE & ORDERED, but this requires an extra temporary. It's + not clear that it's worth it. */ + return UNKNOWN; + + case LT: + case LE: + case UNGT: + case UNGE: + /* These are supported directly. */ + break; + + case EQ: + case NE: + case UNORDERED: + case ORDERED: + /* AVX has 3 operand comparisons, no need to swap anything. */ + if (TARGET_AVX) + break; + /* For commutative operators, try to canonicalize the destination + operand to be first in the comparison - this helps reload to + avoid extra moves. */ + if (!dest || !rtx_equal_p (dest, *pop1)) + break; + /* FALLTHRU */ + + case GE: + case GT: + case UNLE: + case UNLT: + /* These are not supported directly before AVX, and furthermore + ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the + comparison operands to transform into something that is + supported. 
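When no blend instruction applies, ix86_expand_sse_movcc above falls back to plain bitwise selection. The equivalent scalar identity, shown as a hedged sketch that assumes cmp is an all-ones or all-zeros mask per element:

  /* dest = (cmp AND op_true) OR (NOT cmp AND op_false).  */
  static unsigned int
  bitwise_select (unsigned int cmp, unsigned int op_true, unsigned int op_false)
  {
    return (cmp & op_true) | (~cmp & op_false);
  }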
*/ + std::swap (*pop0, *pop1); + code = swap_condition (code); + break; + + default: + gcc_unreachable (); + } + + return code; +} + +/* Expand a floating-point conditional move. Return true if successful. */ + +bool +ix86_expand_fp_movcc (rtx operands[]) +{ + machine_mode mode = GET_MODE (operands[0]); + enum rtx_code code = GET_CODE (operands[1]); + rtx tmp, compare_op; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) + { + machine_mode cmode; + + /* Since we've no cmove for sse registers, don't force bad register + allocation just to gain access to it. Deny movcc when the + comparison mode doesn't match the move mode. */ + cmode = GET_MODE (op0); + if (cmode == VOIDmode) + cmode = GET_MODE (op1); + if (cmode != mode) + return false; + + code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); + if (code == UNKNOWN) + return false; + + if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, + operands[2], operands[3])) + return true; + + tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, + operands[2], operands[3]); + ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); + return true; + } + + if (GET_MODE (op0) == TImode + || (GET_MODE (op0) == DImode + && !TARGET_64BIT)) + return false; + + /* The floating point conditional move instructions don't directly + support conditions resulting from a signed integer comparison. */ + + compare_op = ix86_expand_compare (code, op0, op1); + if (!fcmov_comparison_operator (compare_op, VOIDmode)) + { + tmp = gen_reg_rtx (QImode); + ix86_expand_setcc (tmp, code, op0, op1); + + compare_op = ix86_expand_compare (NE, tmp, const0_rtx); + } + + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_IF_THEN_ELSE (mode, compare_op, + operands[2], operands[3]))); + + return true; +} + +/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */ + +static int +ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code) +{ + switch (code) + { + case EQ: + return 0; + case LT: + case LTU: + return 1; + case LE: + case LEU: + return 2; + case NE: + return 4; + case GE: + case GEU: + return 5; + case GT: + case GTU: + return 6; + default: + gcc_unreachable (); + } +} + +/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */ + +static int +ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code) +{ + switch (code) + { + case EQ: + return 0x00; + case NE: + return 0x04; + case GT: + return 0x0e; + case LE: + return 0x02; + case GE: + return 0x0d; + case LT: + return 0x01; + case UNLE: + return 0x0a; + case UNLT: + return 0x09; + case UNGE: + return 0x05; + case UNGT: + return 0x06; + case UNEQ: + return 0x18; + case LTGT: + return 0x0c; + case ORDERED: + return 0x07; + case UNORDERED: + return 0x03; + default: + gcc_unreachable (); + } +} + +/* Return immediate value to be used in UNSPEC_PCMP + for comparison CODE in MODE. */ + +static int +ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) +{ + if (FLOAT_MODE_P (mode)) + return ix86_fp_cmp_code_to_pcmp_immediate (code); + return ix86_int_cmp_code_to_pcmp_immediate (code); +} + +/* Expand AVX-512 vector comparison. 
*/ + +bool +ix86_expand_mask_vec_cmp (rtx operands[]) +{ + machine_mode mask_mode = GET_MODE (operands[0]); + machine_mode cmp_mode = GET_MODE (operands[2]); + enum rtx_code code = GET_CODE (operands[1]); + rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); + int unspec_code; + rtx unspec; + + switch (code) + { + case LEU: + case GTU: + case GEU: + case LTU: + unspec_code = UNSPEC_UNSIGNED_PCMP; + break; + + default: + unspec_code = UNSPEC_PCMP; + } + + unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2], + operands[3], imm), + unspec_code); + emit_insn (gen_rtx_SET (operands[0], unspec)); + + return true; +} + +/* Expand fp vector comparison. */ + +bool +ix86_expand_fp_vec_cmp (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[1]); + rtx cmp; + + code = ix86_prepare_sse_fp_compare_args (operands[0], code, + &operands[2], &operands[3]); + if (code == UNKNOWN) + { + rtx temp; + switch (GET_CODE (operands[1])) + { + case LTGT: + temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2], + operands[3], NULL, NULL); + cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2], + operands[3], NULL, NULL); + code = AND; + break; + case UNEQ: + temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2], + operands[3], NULL, NULL); + cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2], + operands[3], NULL, NULL); + code = IOR; + break; + default: + gcc_unreachable (); + } + cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, + OPTAB_DIRECT); + } + else + cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], + operands[1], operands[2]); + + if (operands[0] != cmp) + emit_move_insn (operands[0], cmp); + + return true; +} + +static rtx +ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, + rtx op_true, rtx op_false, bool *negate) +{ + machine_mode data_mode = GET_MODE (dest); + machine_mode mode = GET_MODE (cop0); + rtx x; + + *negate = false; + + /* XOP supports all of the comparisons on all 128-bit vector int types. */ + if (TARGET_XOP + && (mode == V16QImode || mode == V8HImode + || mode == V4SImode || mode == V2DImode)) + ; + else + { + /* Canonicalize the comparison to EQ, GT, GTU. */ + switch (code) + { + case EQ: + case GT: + case GTU: + break; + + case NE: + case LE: + case LEU: + code = reverse_condition (code); + *negate = true; + break; + + case GE: + case GEU: + code = reverse_condition (code); + *negate = true; + /* FALLTHRU */ + + case LT: + case LTU: + std::swap (cop0, cop1); + code = swap_condition (code); + break; + + default: + gcc_unreachable (); + } + + /* Only SSE4.1/SSE4.2 supports V2DImode. */ + if (mode == V2DImode) + { + switch (code) + { + case EQ: + /* SSE4.1 supports EQ. */ + if (!TARGET_SSE4_1) + return NULL; + break; + + case GT: + case GTU: + /* SSE4.2 supports GT/GTU. */ + if (!TARGET_SSE4_2) + return NULL; + break; + + default: + gcc_unreachable (); + } + } + + rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode); + rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode); + if (*negate) + std::swap (optrue, opfalse); + + /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when + not using integer masks into min (x, y) == x ? -1 : 0 (i.e. + min (x, y) == x). While we add one instruction (the minimum), + we remove the need for two instructions in the negation, as the + result is done this way. + When using masks, do it for SI/DImode element types, as it is shorter + than the two subtractions. 
*/ + if ((code != EQ + && GET_MODE_SIZE (mode) != 64 + && vector_all_ones_operand (opfalse, data_mode) + && optrue == CONST0_RTX (data_mode)) + || (code == GTU + && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 + /* Don't do it if not using integer masks and we'd end up with + the right values in the registers though. */ + && (GET_MODE_SIZE (mode) == 64 + || !vector_all_ones_operand (optrue, data_mode) + || opfalse != CONST0_RTX (data_mode)))) + { + rtx (*gen) (rtx, rtx, rtx) = NULL; + + switch (mode) + { + case E_V16SImode: + gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3; + break; + case E_V8DImode: + gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3; + cop0 = force_reg (mode, cop0); + cop1 = force_reg (mode, cop1); + break; + case E_V32QImode: + if (TARGET_AVX2) + gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3; + break; + case E_V16HImode: + if (TARGET_AVX2) + gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3; + break; + case E_V8SImode: + if (TARGET_AVX2) + gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3; + break; + case E_V4DImode: + if (TARGET_AVX512VL) + { + gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3; + cop0 = force_reg (mode, cop0); + cop1 = force_reg (mode, cop1); + } + break; + case E_V16QImode: + if (code == GTU && TARGET_SSE2) + gen = gen_uminv16qi3; + else if (code == GT && TARGET_SSE4_1) + gen = gen_sminv16qi3; + break; + case E_V8HImode: + if (code == GTU && TARGET_SSE4_1) + gen = gen_uminv8hi3; + else if (code == GT && TARGET_SSE2) + gen = gen_sminv8hi3; + break; + case E_V4SImode: + if (TARGET_SSE4_1) + gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; + break; + case E_V2DImode: + if (TARGET_AVX512VL) + { + gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3; + cop0 = force_reg (mode, cop0); + cop1 = force_reg (mode, cop1); + } + break; + default: + break; + } + + if (gen) + { + rtx tem = gen_reg_rtx (mode); + if (!vector_operand (cop0, mode)) + cop0 = force_reg (mode, cop0); + if (!vector_operand (cop1, mode)) + cop1 = force_reg (mode, cop1); + *negate = !*negate; + emit_insn (gen (tem, cop0, cop1)); + cop1 = tem; + code = EQ; + } + } + + /* Unsigned parallel compare is not supported by the hardware. + Play some tricks to turn this into a signed comparison + against 0. */ + if (code == GTU) + { + cop0 = force_reg (mode, cop0); + + switch (mode) + { + case E_V16SImode: + case E_V8DImode: + case E_V8SImode: + case E_V4DImode: + case E_V4SImode: + case E_V2DImode: + { + rtx t1, t2, mask; + rtx (*gen_sub3) (rtx, rtx, rtx); + + switch (mode) + { + case E_V16SImode: gen_sub3 = gen_subv16si3; break; + case E_V8DImode: gen_sub3 = gen_subv8di3; break; + case E_V8SImode: gen_sub3 = gen_subv8si3; break; + case E_V4DImode: gen_sub3 = gen_subv4di3; break; + case E_V4SImode: gen_sub3 = gen_subv4si3; break; + case E_V2DImode: gen_sub3 = gen_subv2di3; break; + default: + gcc_unreachable (); + } + /* Subtract (-(INT MAX) - 1) from both operands to make + them signed. */ + mask = ix86_build_signbit_mask (mode, true, false); + t1 = gen_reg_rtx (mode); + emit_insn (gen_sub3 (t1, cop0, mask)); + + t2 = gen_reg_rtx (mode); + emit_insn (gen_sub3 (t2, cop1, mask)); + + cop0 = t1; + cop1 = t2; + code = GT; + } + break; + + case E_V64QImode: + case E_V32HImode: + case E_V32QImode: + case E_V16HImode: + case E_V16QImode: + case E_V8HImode: + /* Perform a parallel unsigned saturating subtraction. 
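The canonicalization above sidesteps the missing unsigned vector greater-than in three ways: an unsigned min plus equality test, a sign-bit bias that turns the compare into a signed one, and (for narrow elements) an unsigned saturating subtraction. A rough scalar model of the three identities; the names and the 32-bit/8-bit element choices are purely illustrative:

  #include <stdint.h>

  /* (1) x <=u y  ==  (umin (x, y) == x).  */
  static int
  leu_via_umin (uint32_t x, uint32_t y)
  {
    uint32_t m = x < y ? x : y;
    return m == x;
  }

  /* (2) Bias both operands by the sign-bit value; the unsigned compare
     becomes a signed one.  The emitted code does this mod 2^32 with a
     vector subtraction; 64-bit arithmetic is used here only to keep the
     sketch portable.  */
  static int
  gtu_via_signbit_bias (uint32_t a, uint32_t b)
  {
    return ((int64_t) a - 0x80000000LL) > ((int64_t) b - 0x80000000LL);
  }

  /* (3) For byte/word elements: a >u b  ==  ((a -sat b) != 0), where -sat
     is unsigned saturating subtraction.  */
  static int
  gtu_via_ussub (uint8_t a, uint8_t b)
  {
    int d = (int) a - (int) b;
    uint8_t sat = d < 0 ? 0 : (uint8_t) d;   /* saturating subtract */
    return sat != 0;
  }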
*/ + x = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0, + cop1))); + + cop0 = x; + cop1 = CONST0_RTX (mode); + code = EQ; + *negate = !*negate; + break; + + default: + gcc_unreachable (); + } + } + } + + if (*negate) + std::swap (op_true, op_false); + + /* Allow the comparison to be done in one mode, but the movcc to + happen in another mode. */ + if (data_mode == mode) + { + x = ix86_expand_sse_cmp (dest, code, cop0, cop1, + op_true, op_false); + } + else + { + gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); + x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, + op_true, op_false); + if (GET_MODE (x) == mode) + x = gen_lowpart (data_mode, x); + } + + return x; +} + +/* Expand integer vector comparison. */ + +bool +ix86_expand_int_vec_cmp (rtx operands[]) +{ + rtx_code code = GET_CODE (operands[1]); + bool negate = false; + rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2], + operands[3], NULL, NULL, &negate); + + if (!cmp) + return false; + + if (negate) + cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, + CONST0_RTX (GET_MODE (cmp)), + NULL, NULL, &negate); + + gcc_assert (!negate); + + if (operands[0] != cmp) + emit_move_insn (operands[0], cmp); + + return true; +} + +/* Expand a floating-point vector conditional move; a vcond operation + rather than a movcc operation. */ + +bool +ix86_expand_fp_vcond (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[3]); + rtx cmp; + + code = ix86_prepare_sse_fp_compare_args (operands[0], code, + &operands[4], &operands[5]); + if (code == UNKNOWN) + { + rtx temp; + switch (GET_CODE (operands[3])) + { + case LTGT: + temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], + operands[5], operands[1], operands[2]); + code = AND; + break; + case UNEQ: + temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], + operands[5], operands[1], operands[2]); + code = IOR; + break; + default: + gcc_unreachable (); + } + cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, + OPTAB_DIRECT); + ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); + return true; + } + + if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], + operands[5], operands[1], operands[2])) + return true; + + cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], + operands[1], operands[2]); + ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); + return true; +} + +/* Expand a signed/unsigned integral vector conditional move. */ + +bool +ix86_expand_int_vcond (rtx operands[]) +{ + machine_mode data_mode = GET_MODE (operands[0]); + machine_mode mode = GET_MODE (operands[4]); + enum rtx_code code = GET_CODE (operands[3]); + bool negate = false; + rtx x, cop0, cop1; + + cop0 = operands[4]; + cop1 = operands[5]; + + /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 + and x < 0 ? 1 : 0 into (unsigned) x >> 31. 
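The shift optimization described in the comment above rests on two shift identities; a small illustration in C, assuming 32-bit elements and an arithmetic right shift of negative values (true for the targets this file supports):

  #include <stdint.h>

  /* x < 0 ? -1 : 0  ==  sign bit smeared across the word.  */
  static int32_t ltz_all_ones (int32_t x) { return x >> 31; }

  /* x < 0 ? 1 : 0  ==  sign bit moved to bit 0 by a logical shift.  */
  static int32_t ltz_one (int32_t x) { return (int32_t) ((uint32_t) x >> 31); }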
*/ + if ((code == LT || code == GE) + && data_mode == mode + && cop1 == CONST0_RTX (mode) + && operands[1 + (code == LT)] == CONST0_RTX (data_mode) + && GET_MODE_UNIT_SIZE (data_mode) > 1 + && GET_MODE_UNIT_SIZE (data_mode) <= 8 + && (GET_MODE_SIZE (data_mode) == 16 + || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) + { + rtx negop = operands[2 - (code == LT)]; + int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1; + if (negop == CONST1_RTX (data_mode)) + { + rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), + operands[0], 1, OPTAB_DIRECT); + if (res != operands[0]) + emit_move_insn (operands[0], res); + return true; + } + else if (GET_MODE_INNER (data_mode) != DImode + && vector_all_ones_operand (negop, data_mode)) + { + rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), + operands[0], 0, OPTAB_DIRECT); + if (res != operands[0]) + emit_move_insn (operands[0], res); + return true; + } + } + + if (!nonimmediate_operand (cop1, mode)) + cop1 = force_reg (mode, cop1); + if (!general_operand (operands[1], data_mode)) + operands[1] = force_reg (data_mode, operands[1]); + if (!general_operand (operands[2], data_mode)) + operands[2] = force_reg (data_mode, operands[2]); + + x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1, + operands[1], operands[2], &negate); + + if (!x) + return false; + + ix86_expand_sse_movcc (operands[0], x, operands[1+negate], + operands[2-negate]); + return true; +} + +static bool +ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, + struct expand_vec_perm_d *d) +{ + /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + machine_mode mode = GET_MODE (d ? d->op0 : op0); + machine_mode maskmode = mode; + rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; + + switch (mode) + { + case E_V8HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_vpermt2varv8hi3; + break; + case E_V16HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_vpermt2varv16hi3; + break; + case E_V64QImode: + if (TARGET_AVX512VBMI) + gen = gen_avx512bw_vpermt2varv64qi3; + break; + case E_V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vpermt2varv32hi3; + break; + case E_V4SImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermt2varv4si3; + break; + case E_V8SImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermt2varv8si3; + break; + case E_V16SImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vpermt2varv16si3; + break; + case E_V4SFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermt2varv4sf3; + maskmode = V4SImode; + } + break; + case E_V8SFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermt2varv8sf3; + maskmode = V8SImode; + } + break; + case E_V16SFmode: + if (TARGET_AVX512F) + { + gen = gen_avx512f_vpermt2varv16sf3; + maskmode = V16SImode; + } + break; + case E_V2DImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermt2varv2di3; + break; + case E_V4DImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermt2varv4di3; + break; + case E_V8DImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vpermt2varv8di3; + break; + case E_V2DFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermt2varv2df3; + maskmode = V2DImode; + } + break; + case E_V4DFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermt2varv4df3; + maskmode = V4DImode; + } + break; + case E_V8DFmode: + if (TARGET_AVX512F) + { + gen = gen_avx512f_vpermt2varv8df3; + maskmode = V8DImode; + } + break; + default: + break; + } + + if (gen == 
NULL) + return false; + + /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + if (d) + { + rtx vec[64]; + target = d->target; + op0 = d->op0; + op1 = d->op1; + for (int i = 0; i < d->nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); + } + + emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); + return true; +} + +/* Expand a variable vector permutation. */ + +void +ix86_expand_vec_perm (rtx operands[]) +{ + rtx target = operands[0]; + rtx op0 = operands[1]; + rtx op1 = operands[2]; + rtx mask = operands[3]; + rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32]; + machine_mode mode = GET_MODE (op0); + machine_mode maskmode = GET_MODE (mask); + int w, e, i; + bool one_operand_shuffle = rtx_equal_p (op0, op1); + + /* Number of elements in the vector. */ + w = GET_MODE_NUNITS (mode); + e = GET_MODE_UNIT_SIZE (mode); + gcc_assert (w <= 64); + + if (TARGET_AVX512F && one_operand_shuffle) + { + rtx (*gen) (rtx, rtx, rtx) = NULL; + switch (mode) + { + case E_V16SImode: + gen =gen_avx512f_permvarv16si; + break; + case E_V16SFmode: + gen = gen_avx512f_permvarv16sf; + break; + case E_V8DImode: + gen = gen_avx512f_permvarv8di; + break; + case E_V8DFmode: + gen = gen_avx512f_permvarv8df; + break; + default: + break; + } + if (gen != NULL) + { + emit_insn (gen (target, op0, mask)); + return; + } + } + + if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) + return; + + if (TARGET_AVX2) + { + if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) + { + /* Unfortunately, the VPERMQ and VPERMPD instructions only support + an constant shuffle operand. With a tiny bit of effort we can + use VPERMD instead. A re-interpretation stall for V4DFmode is + unfortunate but there's no avoiding it. + Similarly for V16HImode we don't have instructions for variable + shuffling, while for V32QImode we can use after preparing suitable + masks vpshufb; vpshufb; vpermq; vpor. */ + + if (mode == V16HImode) + { + maskmode = mode = V32QImode; + w = 32; + e = 1; + } + else + { + maskmode = mode = V8SImode; + w = 8; + e = 4; + } + t1 = gen_reg_rtx (maskmode); + + /* Replicate the low bits of the V4DImode mask into V8SImode: + mask = { A B C D } + t1 = { A A B B C C D D }. */ + for (i = 0; i < w / 2; ++i) + vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + vt = force_reg (maskmode, vt); + mask = gen_lowpart (maskmode, mask); + if (maskmode == V8SImode) + emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); + else + emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); + + /* Multiply the shuffle indicies by two. */ + t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, + OPTAB_DIRECT); + + /* Add one to the odd shuffle indicies: + t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ + for (i = 0; i < w / 2; ++i) + { + vec[i * 2] = const0_rtx; + vec[i * 2 + 1] = const1_rtx; + } + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + vt = validize_mem (force_const_mem (maskmode, vt)); + t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, + OPTAB_DIRECT); + + /* Continue as if V8SImode (resp. V32QImode) was used initially. */ + operands[3] = mask = t1; + target = gen_reg_rtx (mode); + op0 = gen_lowpart (mode, op0); + op1 = gen_lowpart (mode, op1); + } + + switch (mode) + { + case E_V8SImode: + /* The VPERMD and VPERMPS instructions already properly ignore + the high bits of the shuffle elements. 
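The V4DImode-to-V8SImode mask rewrite above (replicate, double, add one to the odd lanes) can be modelled on scalar indices: each 64-bit lane index k becomes the pair of 32-bit lane indices {2k, 2k+1}. A hedged sketch of that expansion, with invented names:

  /* Expand double-word shuffle indices into word shuffle indices:
     { A B C D }  ->  { 2A 2A+1  2B 2B+1  2C 2C+1  2D 2D+1 }.  */
  static void
  widen_perm_indices (const unsigned idx4[4], unsigned idx8[8])
  {
    for (int i = 0; i < 4; i++)
      {
        idx8[2 * i]     = 2 * idx4[i];      /* even half of the pair */
        idx8[2 * i + 1] = 2 * idx4[i] + 1;  /* odd half of the pair  */
      }
  }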
No need for us to + perform an AND ourselves. */ + if (one_operand_shuffle) + { + emit_insn (gen_avx2_permvarv8si (target, op0, mask)); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + } + else + { + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); + emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); + goto merge_two; + } + return; + + case E_V8SFmode: + mask = gen_lowpart (V8SImode, mask); + if (one_operand_shuffle) + emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); + else + { + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SFmode); + emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); + emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); + goto merge_two; + } + return; + + case E_V4SImode: + /* By combining the two 128-bit input vectors into one 256-bit + input vector, we can use VPERMD and VPERMPS for the full + two-operand shuffle. */ + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); + emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); + emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); + emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); + return; + + case E_V4SFmode: + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SImode); + mask = gen_lowpart (V4SImode, mask); + emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); + emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); + emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); + emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); + return; + + case E_V32QImode: + t1 = gen_reg_rtx (V32QImode); + t2 = gen_reg_rtx (V32QImode); + t3 = gen_reg_rtx (V32QImode); + vt2 = GEN_INT (-128); + vt = gen_const_vec_duplicate (V32QImode, vt2); + vt = force_reg (V32QImode, vt); + for (i = 0; i < 32; i++) + vec[i] = i < 16 ? vt2 : const0_rtx; + vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); + vt2 = force_reg (V32QImode, vt2); + /* From mask create two adjusted masks, which contain the same + bits as mask in the low 7 bits of each vector element. + The first mask will have the most significant bit clear + if it requests element from the same 128-bit lane + and MSB set if it requests element from the other 128-bit lane. + The second mask will have the opposite values of the MSB, + and additionally will have its 128-bit lanes swapped. + E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have + t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and + t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... + stands for other 12 bytes. */ + /* The bit whether element is from the same lane or the other + lane is bit 4, so shift it up by 3 to the MSB position. */ + t5 = gen_reg_rtx (V4DImode); + emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask), + GEN_INT (3))); + /* Clear MSB bits from the mask just in case it had them set. */ + emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); + /* After this t1 will have MSB set for elements from other lane. */ + emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2)); + /* Clear bits other than MSB. */ + emit_insn (gen_andv32qi3 (t1, t1, vt)); + /* Or in the lower bits from mask into t3. */ + emit_insn (gen_iorv32qi3 (t3, t1, t2)); + /* And invert MSB bits in t1, so MSB is set for elements from the same + lane. */ + emit_insn (gen_xorv32qi3 (t1, t1, vt)); + /* Swap 128-bit lanes in t3. 
*/ + t6 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + /* And or in the lower bits from mask into t1. */ + emit_insn (gen_iorv32qi3 (t1, t1, t2)); + if (one_operand_shuffle) + { + /* Each of these shuffles will put 0s in places where + element from the other 128-bit lane is needed, otherwise + will shuffle in the requested value. */ + emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, + gen_lowpart (V32QImode, t6))); + emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); + /* For t3 the 128-bit lanes are swapped again. */ + t7 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + /* And oring both together leads to the result. */ + emit_insn (gen_iorv32qi3 (target, t1, + gen_lowpart (V32QImode, t7))); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + return; + } + + t4 = gen_reg_rtx (V32QImode); + /* Similarly to the above one_operand_shuffle code, + just for repeated twice for each operand. merge_two: + code will merge the two results together. */ + emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, + gen_lowpart (V32QImode, t6))); + emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, + gen_lowpart (V32QImode, t6))); + emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); + emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); + t7 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + t8 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7))); + emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8))); + t1 = t4; + t2 = t3; + goto merge_two; + + default: + gcc_assert (GET_MODE_SIZE (mode) <= 16); + break; + } + } + + if (TARGET_XOP) + { + /* The XOP VPPERM insn supports three inputs. By ignoring the + one_operand_shuffle special case, we avoid creating another + set of constant vectors in memory. */ + one_operand_shuffle = false; + + /* mask = mask & {2*w-1, ...} */ + vt = GEN_INT (2*w - 1); + } + else + { + /* mask = mask & {w-1, ...} */ + vt = GEN_INT (w - 1); + } + + vt = gen_const_vec_duplicate (maskmode, vt); + mask = expand_simple_binop (maskmode, AND, mask, vt, + NULL_RTX, 0, OPTAB_DIRECT); + + /* For non-QImode operations, convert the word permutation control + into a byte permutation control. */ + if (mode != V16QImode) + { + mask = expand_simple_binop (maskmode, ASHIFT, mask, + GEN_INT (exact_log2 (e)), + NULL_RTX, 0, OPTAB_DIRECT); + + /* Convert mask to vector of chars. */ + mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); + + /* Replicate each of the input bytes into byte positions: + (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} + (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} + (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. 
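The two steps that follow (replicating each scaled index into its element's byte positions, then adding the intra-element byte offsets) amount to the formula byte_ctl[i] = elt_idx[i / e] * e + (i % e). A minimal illustration, assuming e is the element size in bytes and the names are made up:

  /* Turn per-element shuffle indices into a per-byte pshufb control.  */
  static void
  build_byte_control (const unsigned *elt_idx, unsigned char ctl[16], unsigned e)
  {
    for (unsigned i = 0; i < 16; i++)
      ctl[i] = (unsigned char) (elt_idx[i / e] * e + (i % e));
  }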
*/ + for (i = 0; i < 16; ++i) + vec[i] = GEN_INT (i/e * e); + vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); + vt = validize_mem (force_const_mem (V16QImode, vt)); + if (TARGET_XOP) + emit_insn (gen_xop_pperm (mask, mask, mask, vt)); + else + emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); + + /* Convert it into the byte positions by doing + mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ + for (i = 0; i < 16; ++i) + vec[i] = GEN_INT (i % e); + vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); + vt = validize_mem (force_const_mem (V16QImode, vt)); + emit_insn (gen_addv16qi3 (mask, mask, vt)); + } + + /* The actual shuffle operations all operate on V16QImode. */ + op0 = gen_lowpart (V16QImode, op0); + op1 = gen_lowpart (V16QImode, op1); + + if (TARGET_XOP) + { + if (GET_MODE (target) != V16QImode) + target = gen_reg_rtx (V16QImode); + emit_insn (gen_xop_pperm (target, op0, op1, mask)); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + } + else if (one_operand_shuffle) + { + if (GET_MODE (target) != V16QImode) + target = gen_reg_rtx (V16QImode); + emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + } + else + { + rtx xops[6]; + bool ok; + + /* Shuffle the two input vectors independently. */ + t1 = gen_reg_rtx (V16QImode); + t2 = gen_reg_rtx (V16QImode); + emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); + emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); + + merge_two: + /* Then merge them together. The key is whether any given control + element contained a bit set that indicates the second word. */ + mask = operands[3]; + vt = GEN_INT (w); + if (maskmode == V2DImode && !TARGET_SSE4_1) + { + /* Without SSE4.1, we don't have V2DImode EQ. Perform one + more shuffle to convert the V2DI input mask into a V4SI + input mask. At which point the masking that expand_int_vcond + will work as desired. */ + rtx t3 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), + const0_rtx, const0_rtx, + const2_rtx, const2_rtx)); + mask = t3; + maskmode = V4SImode; + e = w = 4; + } + + vt = gen_const_vec_duplicate (maskmode, vt); + vt = force_reg (maskmode, vt); + mask = expand_simple_binop (maskmode, AND, mask, vt, + NULL_RTX, 0, OPTAB_DIRECT); + + if (GET_MODE (target) != mode) + target = gen_reg_rtx (mode); + xops[0] = target; + xops[1] = gen_lowpart (mode, t2); + xops[2] = gen_lowpart (mode, t1); + xops[3] = gen_rtx_EQ (maskmode, mask, vt); + xops[4] = mask; + xops[5] = vt; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + } +} + +/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is + true if we should do zero extension, else sign extension. HIGH_P is + true if we want the N/2 high elements, else the low elements. */ + +void +ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) +{ + machine_mode imode = GET_MODE (src); + rtx tmp; + + if (TARGET_SSE4_1) + { + rtx (*unpack)(rtx, rtx); + rtx (*extract)(rtx, rtx) = NULL; + machine_mode halfmode = BLKmode; + + switch (imode) + { + case E_V64QImode: + if (unsigned_p) + unpack = gen_avx512bw_zero_extendv32qiv32hi2; + else + unpack = gen_avx512bw_sign_extendv32qiv32hi2; + halfmode = V32QImode; + extract + = high_p ? 
gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; + break; + case E_V32QImode: + if (unsigned_p) + unpack = gen_avx2_zero_extendv16qiv16hi2; + else + unpack = gen_avx2_sign_extendv16qiv16hi2; + halfmode = V16QImode; + extract + = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi; + break; + case E_V32HImode: + if (unsigned_p) + unpack = gen_avx512f_zero_extendv16hiv16si2; + else + unpack = gen_avx512f_sign_extendv16hiv16si2; + halfmode = V16HImode; + extract + = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi; + break; + case E_V16HImode: + if (unsigned_p) + unpack = gen_avx2_zero_extendv8hiv8si2; + else + unpack = gen_avx2_sign_extendv8hiv8si2; + halfmode = V8HImode; + extract + = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi; + break; + case E_V16SImode: + if (unsigned_p) + unpack = gen_avx512f_zero_extendv8siv8di2; + else + unpack = gen_avx512f_sign_extendv8siv8di2; + halfmode = V8SImode; + extract + = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si; + break; + case E_V8SImode: + if (unsigned_p) + unpack = gen_avx2_zero_extendv4siv4di2; + else + unpack = gen_avx2_sign_extendv4siv4di2; + halfmode = V4SImode; + extract + = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si; + break; + case E_V16QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv8qiv8hi2; + else + unpack = gen_sse4_1_sign_extendv8qiv8hi2; + break; + case E_V8HImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv4hiv4si2; + else + unpack = gen_sse4_1_sign_extendv4hiv4si2; + break; + case E_V4SImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2siv2di2; + else + unpack = gen_sse4_1_sign_extendv2siv2di2; + break; + default: + gcc_unreachable (); + } + + if (GET_MODE_SIZE (imode) >= 32) + { + tmp = gen_reg_rtx (halfmode); + emit_insn (extract (tmp, src)); + } + else if (high_p) + { + /* Shift higher 8 bytes to lower 8 bytes. */ + tmp = gen_reg_rtx (V1TImode); + emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), + GEN_INT (64))); + tmp = gen_lowpart (imode, tmp); + } + else + tmp = src; + + emit_insn (unpack (dest, tmp)); + } + else + { + rtx (*unpack)(rtx, rtx, rtx); + + switch (imode) + { + case E_V16QImode: + if (high_p) + unpack = gen_vec_interleave_highv16qi; + else + unpack = gen_vec_interleave_lowv16qi; + break; + case E_V8HImode: + if (high_p) + unpack = gen_vec_interleave_highv8hi; + else + unpack = gen_vec_interleave_lowv8hi; + break; + case E_V4SImode: + if (high_p) + unpack = gen_vec_interleave_highv4si; + else + unpack = gen_vec_interleave_lowv4si; + break; + default: + gcc_unreachable (); + } + + if (unsigned_p) + tmp = force_reg (imode, CONST0_RTX (imode)); + else + tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), + src, pc_rtx, pc_rtx); + + rtx tmp2 = gen_reg_rtx (imode); + emit_insn (unpack (tmp2, src, tmp)); + emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2)); + } +} + +/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, + but works for floating pointer parameters and nonoffsetable memories. + For pushes, it returns just stack offsets; the values will be saved + in the right order. Maximally three parts are generated. */ + +static int +ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode) +{ + int size; + + if (!TARGET_64BIT) + size = mode==XFmode ? 
3 : GET_MODE_SIZE (mode) / 4; + else + size = (GET_MODE_SIZE (mode) + 4) / 8; + + gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); + gcc_assert (size >= 2 && size <= 4); + + /* Optimize constant pool reference to immediates. This is used by fp + moves, that force all constants to memory to allow combining. */ + if (MEM_P (operand) && MEM_READONLY_P (operand)) + operand = avoid_constant_pool_reference (operand); + + if (MEM_P (operand) && !offsettable_memref_p (operand)) + { + /* The only non-offsetable memories we handle are pushes. */ + int ok = push_operand (operand, VOIDmode); + + gcc_assert (ok); + + operand = copy_rtx (operand); + PUT_MODE (operand, word_mode); + parts[0] = parts[1] = parts[2] = parts[3] = operand; + return size; + } + + if (GET_CODE (operand) == CONST_VECTOR) + { + scalar_int_mode imode = int_mode_for_mode (mode).require (); + /* Caution: if we looked through a constant pool memory above, + the operand may actually have a different mode now. That's + ok, since we want to pun this all the way back to an integer. */ + operand = simplify_subreg (imode, operand, GET_MODE (operand), 0); + gcc_assert (operand != NULL); + mode = imode; + } + + if (!TARGET_64BIT) + { + if (mode == DImode) + split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); + else + { + int i; + + if (REG_P (operand)) + { + gcc_assert (reload_completed); + for (i = 0; i < size; i++) + parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i); + } + else if (offsettable_memref_p (operand)) + { + operand = adjust_address (operand, SImode, 0); + parts[0] = operand; + for (i = 1; i < size; i++) + parts[i] = adjust_address (operand, SImode, 4 * i); + } + else if (CONST_DOUBLE_P (operand)) + { + const REAL_VALUE_TYPE *r; + long l[4]; + + r = CONST_DOUBLE_REAL_VALUE (operand); + switch (mode) + { + case E_TFmode: + real_to_target (l, r, mode); + parts[3] = gen_int_mode (l[3], SImode); + parts[2] = gen_int_mode (l[2], SImode); + break; + case E_XFmode: + /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since + long double may not be 80-bit. */ + real_to_target (l, r, mode); + parts[2] = gen_int_mode (l[2], SImode); + break; + case E_DFmode: + REAL_VALUE_TO_TARGET_DOUBLE (*r, l); + break; + default: + gcc_unreachable (); + } + parts[1] = gen_int_mode (l[1], SImode); + parts[0] = gen_int_mode (l[0], SImode); + } + else + gcc_unreachable (); + } + } + else + { + if (mode == TImode) + split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); + if (mode == XFmode || mode == TFmode) + { + machine_mode upper_mode = mode==XFmode ? SImode : DImode; + if (REG_P (operand)) + { + gcc_assert (reload_completed); + parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); + parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1); + } + else if (offsettable_memref_p (operand)) + { + operand = adjust_address (operand, DImode, 0); + parts[0] = operand; + parts[1] = adjust_address (operand, upper_mode, 8); + } + else if (CONST_DOUBLE_P (operand)) + { + long l[4]; + + real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode); + + /* real_to_target puts 32-bit pieces in each long. 
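The constant handling that follows reassembles the 32-bit pieces produced by real_to_target into word-sized immediates, and split_double_mode does the inverse for the 32-bit case. A hedged scalar sketch of both directions (illustrative helpers only):

  #include <stdint.h>

  /* Combine two 32-bit target words (low word first) into one 64-bit
     immediate, mirroring (l[0] & 0xffffffff) | (l[1] << 32).  */
  static uint64_t
  combine_words (uint32_t lo, uint32_t hi)
  {
    return (uint64_t) lo | ((uint64_t) hi << 32);
  }

  /* Split a double-word value into its low and high word parts.  */
  static void
  split_words (uint64_t v, uint32_t *lo, uint32_t *hi)
  {
    *lo = (uint32_t) v;
    *hi = (uint32_t) (v >> 32);
  }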
*/ + parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff)) + | ((l[1] & HOST_WIDE_INT_C (0xffffffff)) + << 32), DImode); + + if (upper_mode == SImode) + parts[1] = gen_int_mode (l[2], SImode); + else + parts[1] + = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff)) + | ((l[3] & HOST_WIDE_INT_C (0xffffffff)) + << 32), DImode); + } + else + gcc_unreachable (); + } + } + + return size; +} + +/* Emit insns to perform a move or push of DI, DF, XF, and TF values. + Return false when normal moves are needed; true when all required + insns have been emitted. Operands 2-4 contain the input values + int the correct order; operands 5-7 contain the output values. */ + +void +ix86_split_long_move (rtx operands[]) +{ + rtx part[2][4]; + int nparts, i, j; + int push = 0; + int collisions = 0; + machine_mode mode = GET_MODE (operands[0]); + bool collisionparts[4]; + + /* The DFmode expanders may ask us to move double. + For 64bit target this is single move. By hiding the fact + here we simplify i386.md splitters. */ + if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8) + { + /* Optimize constant pool reference to immediates. This is used by + fp moves, that force all constants to memory to allow combining. */ + + if (MEM_P (operands[1]) + && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) + operands[1] = get_pool_constant (XEXP (operands[1], 0)); + if (push_operand (operands[0], VOIDmode)) + { + operands[0] = copy_rtx (operands[0]); + PUT_MODE (operands[0], word_mode); + } + else + operands[0] = gen_lowpart (DImode, operands[0]); + operands[1] = gen_lowpart (DImode, operands[1]); + emit_move_insn (operands[0], operands[1]); + return; + } + + /* The only non-offsettable memory we handle is push. */ + if (push_operand (operands[0], VOIDmode)) + push = 1; + else + gcc_assert (!MEM_P (operands[0]) + || offsettable_memref_p (operands[0])); + + nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); + ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); + + /* When emitting push, take care for source operands on the stack. */ + if (push && MEM_P (operands[1]) + && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) + { + rtx src_base = XEXP (part[1][nparts - 1], 0); + + /* Compensate for the stack decrement by 4. */ + if (!TARGET_64BIT && nparts == 3 + && mode == XFmode && TARGET_128BIT_LONG_DOUBLE) + src_base = plus_constant (Pmode, src_base, 4); + + /* src_base refers to the stack pointer and is + automatically decreased by emitted push. */ + for (i = 0; i < nparts; i++) + part[1][i] = change_address (part[1][i], + GET_MODE (part[1][i]), src_base); + } + + /* We need to do copy in the right order in case an address register + of the source overlaps the destination. */ + if (REG_P (part[0][0]) && MEM_P (part[1][0])) + { + rtx tmp; + + for (i = 0; i < nparts; i++) + { + collisionparts[i] + = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0)); + if (collisionparts[i]) + collisions++; + } + + /* Collision in the middle part can be handled by reordering. 
*/ + if (collisions == 1 && nparts == 3 && collisionparts [1]) + { + std::swap (part[0][1], part[0][2]); + std::swap (part[1][1], part[1][2]); + } + else if (collisions == 1 + && nparts == 4 + && (collisionparts [1] || collisionparts [2])) + { + if (collisionparts [1]) + { + std::swap (part[0][1], part[0][2]); + std::swap (part[1][1], part[1][2]); + } + else + { + std::swap (part[0][2], part[0][3]); + std::swap (part[1][2], part[1][3]); + } + } + + /* If there are more collisions, we can't handle it by reordering. + Do an lea to the last part and use only one colliding move. */ + else if (collisions > 1) + { + rtx base, addr; + + collisions = 1; + + base = part[0][nparts - 1]; + + /* Handle the case when the last part isn't valid for lea. + Happens in 64-bit mode storing the 12-byte XFmode. */ + if (GET_MODE (base) != Pmode) + base = gen_rtx_REG (Pmode, REGNO (base)); + + addr = XEXP (part[1][0], 0); + if (TARGET_TLS_DIRECT_SEG_REFS) + { + struct ix86_address parts; + int ok = ix86_decompose_address (addr, &parts); + gcc_assert (ok); + /* It is not valid to use %gs: or %fs: in lea. */ + gcc_assert (parts.seg == ADDR_SPACE_GENERIC); + } + emit_insn (gen_rtx_SET (base, addr)); + part[1][0] = replace_equiv_address (part[1][0], base); + for (i = 1; i < nparts; i++) + { + tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i); + part[1][i] = replace_equiv_address (part[1][i], tmp); + } + } + } + + if (push) + { + if (!TARGET_64BIT) + { + if (nparts == 3) + { + if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode) + emit_insn (ix86_gen_add3 (stack_pointer_rtx, + stack_pointer_rtx, GEN_INT (-4))); + emit_move_insn (part[0][2], part[1][2]); + } + else if (nparts == 4) + { + emit_move_insn (part[0][3], part[1][3]); + emit_move_insn (part[0][2], part[1][2]); + } + } + else + { + /* In 64bit mode we don't have 32bit push available. In case this is + register, it is OK - we will just use larger counterpart. We also + retype memory - these comes from attempt to avoid REX prefix on + moving of second half of TFmode value. */ + if (GET_MODE (part[1][1]) == SImode) + { + switch (GET_CODE (part[1][1])) + { + case MEM: + part[1][1] = adjust_address (part[1][1], DImode, 0); + break; + + case REG: + part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); + break; + + default: + gcc_unreachable (); + } + + if (GET_MODE (part[1][0]) == SImode) + part[1][0] = part[1][1]; + } + } + emit_move_insn (part[0][1], part[1][1]); + emit_move_insn (part[0][0], part[1][0]); + return; + } + + /* Choose correct order to not overwrite the source before it is copied. */ + if ((REG_P (part[0][0]) + && REG_P (part[1][1]) + && (REGNO (part[0][0]) == REGNO (part[1][1]) + || (nparts == 3 + && REGNO (part[0][0]) == REGNO (part[1][2])) + || (nparts == 4 + && REGNO (part[0][0]) == REGNO (part[1][3])))) + || (collisions > 0 + && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) + { + for (i = 0, j = nparts - 1; i < nparts; i++, j--) + { + operands[2 + i] = part[0][j]; + operands[6 + i] = part[1][j]; + } + } + else + { + for (i = 0; i < nparts; i++) + { + operands[2 + i] = part[0][i]; + operands[6 + i] = part[1][i]; + } + } + + /* If optimizing for size, attempt to locally unCSE nonzero constants. 
*/ + if (optimize_insn_for_size_p ()) + { + for (j = 0; j < nparts - 1; j++) + if (CONST_INT_P (operands[6 + j]) + && operands[6 + j] != const0_rtx + && REG_P (operands[2 + j])) + for (i = j; i < nparts - 1; i++) + if (CONST_INT_P (operands[7 + i]) + && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) + operands[7 + i] = operands[2 + j]; + } + + for (i = 0; i < nparts; i++) + emit_move_insn (operands[2 + i], operands[6 + i]); + + return; +} + +/* Helper function of ix86_split_ashl used to generate an SImode/DImode + left shift by a constant, either using a single shift or + a sequence of add instructions. */ + +static void +ix86_expand_ashl_const (rtx operand, int count, machine_mode mode) +{ + rtx (*insn)(rtx, rtx, rtx); + + if (count == 1 + || (count * ix86_cost->add <= ix86_cost->shift_const + && !optimize_insn_for_size_p ())) + { + insn = mode == DImode ? gen_addsi3 : gen_adddi3; + while (count-- > 0) + emit_insn (insn (operand, operand, operand)); + } + else + { + insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3; + emit_insn (insn (operand, operand, GEN_INT (count))); + } +} + +void +ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode) +{ + rtx (*gen_ashl3)(rtx, rtx, rtx); + rtx (*gen_shld)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count >= half_width) + { + emit_move_insn (high[0], low[1]); + emit_move_insn (low[0], const0_rtx); + + if (count > half_width) + ix86_expand_ashl_const (high[0], count - half_width, mode); + } + else + { + gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shld (high[0], low[0], GEN_INT (count))); + ix86_expand_ashl_const (low[0], count, mode); + } + return; + } + + split_double_mode (mode, operands, 1, low, high); + + gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3; + + if (operands[1] == const1_rtx) + { + /* Assuming we've chosen a QImode capable registers, then 1 << N + can be done with two 32/64-bit shifts, no branches, no cmoves. */ + if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0])) + { + rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG); + + ix86_expand_clear (low[0]); + ix86_expand_clear (high[0]); + emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width))); + + d = gen_lowpart (QImode, low[0]); + d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); + s = gen_rtx_EQ (QImode, flags, const0_rtx); + emit_insn (gen_rtx_SET (d, s)); + + d = gen_lowpart (QImode, high[0]); + d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); + s = gen_rtx_NE (QImode, flags, const0_rtx); + emit_insn (gen_rtx_SET (d, s)); + } + + /* Otherwise, we can get the same results by manually performing + a bit extract operation on bit 5/6, and then performing the two + shifts. The two methods of getting 0/1 into low/high are exactly + the same size. Avoiding the shift in the bit extract case helps + pentium4 a bit; no one else seems to care much either way. 
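The constant-count path of ix86_split_ashl above is the usual double-word shift decomposition. A scalar sketch, assuming a 64-bit value held in two 32-bit halves and 0 < count < 64 (names are illustrative):

  #include <stdint.h>

  /* Left-shift a 64-bit value stored as {lo, hi} by a constant COUNT.  */
  static void
  ashl_double (uint32_t *lo, uint32_t *hi, unsigned count)
  {
    if (count >= 32)
      {
        *hi = *lo << (count - 32);   /* high part receives the shifted low part */
        *lo = 0;
      }
    else
      {
        /* shld: high keeps its own bits plus the bits shifted out of low.  */
        *hi = (*hi << count) | (*lo >> (32 - count));
        *lo <<= count;
      }
  }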
*/ + else + { + machine_mode half_mode; + rtx (*gen_lshr3)(rtx, rtx, rtx); + rtx (*gen_and3)(rtx, rtx, rtx); + rtx (*gen_xor3)(rtx, rtx, rtx); + HOST_WIDE_INT bits; + rtx x; + + if (mode == DImode) + { + half_mode = SImode; + gen_lshr3 = gen_lshrsi3; + gen_and3 = gen_andsi3; + gen_xor3 = gen_xorsi3; + bits = 5; + } + else + { + half_mode = DImode; + gen_lshr3 = gen_lshrdi3; + gen_and3 = gen_anddi3; + gen_xor3 = gen_xordi3; + bits = 6; + } + + if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ()) + x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]); + else + x = gen_lowpart (half_mode, operands[2]); + emit_insn (gen_rtx_SET (high[0], x)); + + emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits))); + emit_insn (gen_and3 (high[0], high[0], const1_rtx)); + emit_move_insn (low[0], high[0]); + emit_insn (gen_xor3 (low[0], low[0], const1_rtx)); + } + + emit_insn (gen_ashl3 (low[0], low[0], operands[2])); + emit_insn (gen_ashl3 (high[0], high[0], operands[2])); + return; + } + + if (operands[1] == constm1_rtx) + { + /* For -1 << N, we can avoid the shld instruction, because we + know that we're shifting 0...31/63 ones into a -1. */ + emit_move_insn (low[0], constm1_rtx); + if (optimize_insn_for_size_p ()) + emit_move_insn (high[0], low[0]); + else + emit_move_insn (high[0], constm1_rtx); + } + else + { + gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + emit_insn (gen_shld (high[0], low[0], operands[2])); + } + + emit_insn (gen_ashl3 (low[0], low[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + ix86_expand_clear (scratch); + emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch)); + } + else + { + rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; + + emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2])); + } +} + +void +ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode) +{ + rtx (*gen_ashr3)(rtx, rtx, rtx) + = mode == DImode ? gen_ashrsi3 : gen_ashrdi3; + rtx (*gen_shrd)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count == GET_MODE_BITSIZE (mode) - 1) + { + emit_move_insn (high[0], high[1]); + emit_insn (gen_ashr3 (high[0], high[0], + GEN_INT (half_width - 1))); + emit_move_insn (low[0], high[0]); + + } + else if (count >= half_width) + { + emit_move_insn (low[0], high[1]); + emit_move_insn (high[0], low[0]); + emit_insn (gen_ashr3 (high[0], high[0], + GEN_INT (half_width - 1))); + + if (count > half_width) + emit_insn (gen_ashr3 (low[0], low[0], + GEN_INT (count - half_width))); + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); + emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count))); + } + } + else + { + gen_shrd = mode == DImode ? 
gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + + emit_insn (gen_shrd (low[0], high[0], operands[2])); + emit_insn (gen_ashr3 (high[0], high[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + emit_move_insn (scratch, high[0]); + emit_insn (gen_ashr3 (scratch, scratch, + GEN_INT (half_width - 1))); + emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], + scratch)); + } + else + { + rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3; + + emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2])); + } + } +} + +void +ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode) +{ + rtx (*gen_lshr3)(rtx, rtx, rtx) + = mode == DImode ? gen_lshrsi3 : gen_lshrdi3; + rtx (*gen_shrd)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count >= half_width) + { + emit_move_insn (low[0], high[1]); + ix86_expand_clear (high[0]); + + if (count > half_width) + emit_insn (gen_lshr3 (low[0], low[0], + GEN_INT (count - half_width))); + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); + emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count))); + } + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + + emit_insn (gen_shrd (low[0], high[0], operands[2])); + emit_insn (gen_lshr3 (high[0], high[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + ix86_expand_clear (scratch); + emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], + scratch)); + } + else + { + rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; + + emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2])); + } + } +} + +/* Return mode for the memcpy/memset loop counter. Prefer SImode over + DImode for constant loop counts. */ + +static machine_mode +counter_mode (rtx count_exp) +{ + if (GET_MODE (count_exp) != VOIDmode) + return GET_MODE (count_exp); + if (!CONST_INT_P (count_exp)) + return Pmode; + if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) + return DImode; + return SImode; +} + +/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR + to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT + specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set + memory by VALUE (supposed to be in MODE). + + The size is rounded down to whole number of chunk size moved at once. + SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. 
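A rough C analogue of the unrolled loop described here and emitted by the helper that follows (chunk width, unroll factor and names are invented for the example; the real code parameterizes them by MODE and UNROLL):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustration only: copy SIZE bytes rounded down to whole chunks of
   8 * UNROLL bytes; the remainder is left for a separate epilogue,
   just as the RTL loop leaves it to its callers.  */
static size_t
copy_main_loop (uint8_t *dst, const uint8_t *src, size_t size)
{
  const size_t unroll = 4;
  const size_t piece = 8 * unroll;
  size_t rounded = size & ~(piece - 1);   /* round down to chunk size */

  for (size_t i = 0; i < rounded; i += piece)
    for (size_t j = 0; j < unroll; j++)
      memcpy (dst + i + 8 * j, src + i + 8 * j, 8);  /* one MODE-sized move */

  return rounded;   /* bytes copied; the tail is handled elsewhere */
}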
*/ + + +static void +expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx count, machine_mode mode, int unroll, + int expected_size, bool issetmem) +{ + rtx_code_label *out_label, *top_label; + rtx iter, tmp; + machine_mode iter_mode = counter_mode (count); + int piece_size_n = GET_MODE_SIZE (mode) * unroll; + rtx piece_size = GEN_INT (piece_size_n); + rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); + rtx size; + int i; + + top_label = gen_label_rtx (); + out_label = gen_label_rtx (); + iter = gen_reg_rtx (iter_mode); + + size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, + NULL, 1, OPTAB_DIRECT); + /* Those two should combine. */ + if (piece_size == const1_rtx) + { + emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, + true, out_label); + predict_jump (REG_BR_PROB_BASE * 10 / 100); + } + emit_move_insn (iter, const0_rtx); + + emit_label (top_label); + + tmp = convert_modes (Pmode, iter_mode, iter, true); + + /* This assert could be relaxed - in this case we'll need to compute + smallest power of two, containing in PIECE_SIZE_N and pass it to + offset_address. */ + gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0); + destmem = offset_address (destmem, tmp, piece_size_n); + destmem = adjust_address (destmem, mode, 0); + + if (!issetmem) + { + srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n); + srcmem = adjust_address (srcmem, mode, 0); + + /* When unrolling for chips that reorder memory reads and writes, + we can save registers by using single temporary. + Also using 4 temporaries is overkill in 32bit mode. */ + if (!TARGET_64BIT && 0) + { + for (i = 0; i < unroll; i++) + { + if (i) + { + destmem = adjust_address (copy_rtx (destmem), mode, + GET_MODE_SIZE (mode)); + srcmem = adjust_address (copy_rtx (srcmem), mode, + GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, srcmem); + } + } + else + { + rtx tmpreg[4]; + gcc_assert (unroll <= 4); + for (i = 0; i < unroll; i++) + { + tmpreg[i] = gen_reg_rtx (mode); + if (i) + srcmem = adjust_address (copy_rtx (srcmem), mode, + GET_MODE_SIZE (mode)); + emit_move_insn (tmpreg[i], srcmem); + } + for (i = 0; i < unroll; i++) + { + if (i) + destmem = adjust_address (copy_rtx (destmem), mode, + GET_MODE_SIZE (mode)); + emit_move_insn (destmem, tmpreg[i]); + } + } + } + else + for (i = 0; i < unroll; i++) + { + if (i) + destmem = adjust_address (copy_rtx (destmem), mode, + GET_MODE_SIZE (mode)); + emit_move_insn (destmem, value); + } + + tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, + true, OPTAB_LIB_WIDEN); + if (tmp != iter) + emit_move_insn (iter, tmp); + + emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + true, top_label); + if (expected_size != -1) + { + expected_size /= GET_MODE_SIZE (mode) * unroll; + if (expected_size == 0) + predict_jump (0); + else if (expected_size > REG_BR_PROB_BASE) + predict_jump (REG_BR_PROB_BASE - 1); + else + predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) + / expected_size); + } + else + predict_jump (REG_BR_PROB_BASE * 80 / 100); + iter = ix86_zero_extend_to_Pmode (iter); + tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, + true, OPTAB_LIB_WIDEN); + if (tmp != destptr) + emit_move_insn (destptr, tmp); + if (!issetmem) + { + tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, + true, OPTAB_LIB_WIDEN); + if (tmp != srcptr) + emit_move_insn (srcptr, tmp); + } + emit_label (out_label); +} + +/* Divide COUNTREG by 
SCALE. */ +static rtx +scale_counter (rtx countreg, int scale) +{ + rtx sc; + + if (scale == 1) + return countreg; + if (CONST_INT_P (countreg)) + return GEN_INT (INTVAL (countreg) / scale); + gcc_assert (REG_P (countreg)); + + sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, + GEN_INT (exact_log2 (scale)), + NULL, 1, OPTAB_DIRECT); + return sc; +} + +/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument. + When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. + When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. + For setmem case, VALUE is a promoted to a wider size ORIG_VALUE. + ORIG_VALUE is the original value passed to memset to fill the memory with. + Other arguments have same meaning as for previous function. */ + +static void +expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, rtx orig_value, + rtx count, + machine_mode mode, bool issetmem) +{ + rtx destexp; + rtx srcexp; + rtx countreg; + HOST_WIDE_INT rounded_count; + + /* If possible, it is shorter to use rep movs. + TODO: Maybe it is better to move this logic to decide_alg. */ + if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) + && (!issetmem || orig_value == const0_rtx)) + mode = SImode; + + if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, + GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + } + else + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count)) + { + rounded_count + = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); + destmem = shallow_copy_rtx (destmem); + set_mem_size (destmem, rounded_count); + } + else if (MEM_SIZE_KNOWN_P (destmem)) + clear_mem_size (destmem); + + if (issetmem) + { + value = force_reg (mode, gen_lowpart (mode, value)); + emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); + } + else + { + if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) + srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); + if (mode != QImode) + { + srcexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); + } + else + srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); + if (CONST_INT_P (count)) + { + rounded_count + = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); + srcmem = shallow_copy_rtx (srcmem); + set_mem_size (srcmem, rounded_count); + } + else + { + if (MEM_SIZE_KNOWN_P (srcmem)) + clear_mem_size (srcmem); + } + emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, + destexp, srcexp)); + } +} + +/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to + DESTMEM. + SRC is passed by pointer to be updated on return. + Return value is updated DST. */ +static rtx +emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr, + HOST_WIDE_INT size_to_move) +{ + rtx dst = destmem, src = *srcmem, adjust, tempreg; + enum insn_code code; + machine_mode move_mode; + int piece_size, i; + + /* Find the widest mode in which we could perform moves. 
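In C terms the rep-based path above boils down to scaling the byte count by the chunk size, which is what scale_counter does (folded at compile time for constant counts, a right shift by log2 of the scale otherwise). A behavioural sketch, not the emitted RTL; the chunk choice here only mirrors the QImode-to-SImode promotion for counts divisible by four:

#include <stdint.h>
#include <string.h>

/* Illustration only: model of the "rep mov" style copy set up by
   expand_set_or_cpymem_via_rep.  A count divisible by four is copied
   as count/4 four-byte chunks, otherwise byte by byte.  */
static void
rep_copy_model (uint8_t *dst, const uint8_t *src, uint64_t count)
{
  if ((count & 3) == 0)
    {
      uint64_t chunks = count >> 2;   /* scale_counter (count, 4) */
      for (uint64_t i = 0; i < chunks; i++)
        memcpy (dst + 4 * i, src + 4 * i, 4);
    }
  else
    for (uint64_t i = 0; i < count; i++)   /* QImode path */
      dst[i] = src[i];
}

The real code additionally requires a constant count, and for memset a zero fill value, before promoting from byte to four-byte chunks.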
+ Start with the biggest power of 2 less than SIZE_TO_MOVE and half + it until move of such size is supported. */ + piece_size = 1 << floor_log2 (size_to_move); + while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode) + || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) + { + gcc_assert (piece_size > 1); + piece_size >>= 1; + } + + /* Find the corresponding vector mode with the same size as MOVE_MODE. + MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ + if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) + { + int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); + if (!mode_for_vector (word_mode, nunits).exists (&move_mode) + || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) + { + move_mode = word_mode; + piece_size = GET_MODE_SIZE (move_mode); + code = optab_handler (mov_optab, move_mode); + } + } + gcc_assert (code != CODE_FOR_nothing); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); + src = adjust_automodify_address_nv (src, move_mode, srcptr, 0); + + /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ + gcc_assert (size_to_move % piece_size == 0); + adjust = GEN_INT (piece_size); + for (i = 0; i < size_to_move; i += piece_size) + { + /* We move from memory to memory, so we'll need to do it via + a temporary register. */ + tempreg = gen_reg_rtx (move_mode); + emit_insn (GEN_FCN (code) (tempreg, src)); + emit_insn (GEN_FCN (code) (dst, tempreg)); + + emit_move_insn (destptr, + gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); + emit_move_insn (srcptr, + gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust)); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, + piece_size); + src = adjust_automodify_address_nv (src, move_mode, srcptr, + piece_size); + } + + /* Update DST and SRC rtx. */ + *srcmem = src; + return dst; +} + +/* Helper function for the string operations below. Dest VARIABLE whether + it is aligned to VALUE bytes. If true, jump to the label. */ + +static rtx_code_label * +ix86_expand_aligntest (rtx variable, int value, bool epilogue) +{ + rtx_code_label *label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); + if (GET_MODE (variable) == DImode) + emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); + else + emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); + emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), + 1, label); + if (epilogue) + predict_jump (REG_BR_PROB_BASE * 50 / 100); + else + predict_jump (REG_BR_PROB_BASE * 90 / 100); + return label; +} + + +/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ + +static void +expand_cpymem_epilogue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx count, int max_size) +{ + rtx src, dest; + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + HOST_WIDE_INT epilogue_size = countval % max_size; + int i; + + /* For now MAX_SIZE should be a power of 2. This assert could be + relaxed, but it'll require a bit more complicated epilogue + expanding. 
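The piece-size search above can be read as the following small C routine (the predicate parameter stands in for the optab_handler check and is not part of the patch):

#include <assert.h>

/* Illustration only: start from the largest power of two not larger
   than SIZE and halve it until a plain move of that width is
   supported, as emit_memmov does with int_mode_for_size.  */
static int
widest_supported_piece (int size, int (*move_supported_p) (int))
{
  assert (size >= 1);
  int piece = 1;
  while (piece * 2 <= size)   /* piece = 1 << floor_log2 (size) */
    piece *= 2;
  while (!move_supported_p (piece))
    {
      assert (piece > 1);     /* byte moves are always supported */
      piece /= 2;
    }
  return piece;
}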
*/ + gcc_assert ((max_size & (max_size - 1)) == 0); + for (i = max_size; i >= 1; i >>= 1) + { + if (epilogue_size & i) + destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); + } + return; + } + if (max_size > 8) + { + count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), + count, 1, OPTAB_DIRECT); + expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL, + count, QImode, 1, 4, false); + return; + } + + /* When there are stringops, we can cheaply increase dest and src pointers. + Otherwise we save code size by maintaining offset (zero is readily + available from preceding rep operation) and using x86 addressing modes. + */ + if (TARGET_SINGLE_STRINGOP) + { + if (max_size > 4) + { + rtx_code_label *label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx_code_label *label = ix86_expand_aligntest (count, 2, true); + src = change_address (srcmem, HImode, srcptr); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx_code_label *label = ix86_expand_aligntest (count, 1, true); + src = change_address (srcmem, QImode, srcptr); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } + else + { + rtx offset = force_reg (Pmode, const0_rtx); + rtx tmp; + + if (max_size > 4) + { + rtx_code_label *label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx_code_label *label = ix86_expand_aligntest (count, 2, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, HImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, HImode, tmp); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx_code_label *label = ix86_expand_aligntest (count, 1, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, QImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, QImode, tmp); + emit_move_insn (dest, src); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } +} + +/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM + with value PROMOTED_VAL. + SRC is passed by pointer to be updated on return. + Return value is updated DST. */ +static rtx +emit_memset (rtx destmem, rtx destptr, rtx promoted_val, + HOST_WIDE_INT size_to_move) +{ + rtx dst = destmem, adjust; + enum insn_code code; + machine_mode move_mode; + int piece_size, i; + + /* Find the widest mode in which we could perform moves. + Start with the biggest power of 2 less than SIZE_TO_MOVE and half + it until move of such size is supported. 
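A C picture of the constant-count epilogue above (names are invented; the real code advances DESTPTR/SRCPTR through emit_memmov): the remaining COUNT % MAX_SIZE bytes are copied with one move per set bit, largest chunk first.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustration only: copy the tail left over after the main loop,
   assuming the first COUNT - (COUNT % MAX_SIZE) bytes are done and
   MAX_SIZE is a power of two.  */
static void
copy_tail (uint8_t *dst, const uint8_t *src, size_t count, size_t max_size)
{
  size_t tail = count % max_size;
  size_t off = count - tail;

  for (size_t i = max_size; i >= 1; i >>= 1)
    if (tail & i)
      {
        memcpy (dst + off, src + off, i);
        off += i;
      }
}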
*/ + move_mode = GET_MODE (promoted_val); + if (move_mode == VOIDmode) + move_mode = QImode; + if (size_to_move < GET_MODE_SIZE (move_mode)) + { + unsigned int move_bits = size_to_move * BITS_PER_UNIT; + move_mode = int_mode_for_size (move_bits, 0).require (); + promoted_val = gen_lowpart (move_mode, promoted_val); + } + piece_size = GET_MODE_SIZE (move_mode); + code = optab_handler (mov_optab, move_mode); + gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); + + /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ + gcc_assert (size_to_move % piece_size == 0); + adjust = GEN_INT (piece_size); + for (i = 0; i < size_to_move; i += piece_size) + { + if (piece_size <= GET_MODE_SIZE (word_mode)) + { + emit_insn (gen_strset (destptr, dst, promoted_val)); + dst = adjust_automodify_address_nv (dst, move_mode, destptr, + piece_size); + continue; + } + + emit_insn (GEN_FCN (code) (dst, promoted_val)); + + emit_move_insn (destptr, + gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, + piece_size); + } + + /* Update DST rtx. */ + return dst; +} +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, + rtx count, int max_size) +{ + count = expand_simple_binop (counter_mode (count), AND, count, + GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); + expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL, + gen_lowpart (QImode, value), count, QImode, + 1, max_size / 2, true); +} + +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, + rtx count, int max_size) +{ + rtx dest; + + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + HOST_WIDE_INT epilogue_size = countval % max_size; + int i; + + /* For now MAX_SIZE should be a power of 2. This assert could be + relaxed, but it'll require a bit more complicated epilogue + expanding. 
*/ + gcc_assert ((max_size & (max_size - 1)) == 0); + for (i = max_size; i >= 1; i >>= 1) + { + if (epilogue_size & i) + { + if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) + destmem = emit_memset (destmem, destptr, vec_value, i); + else + destmem = emit_memset (destmem, destptr, value, i); + } + } + return; + } + if (max_size > 32) + { + expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); + return; + } + if (max_size > 16) + { + rtx_code_label *label = ix86_expand_aligntest (count, 16, true); + if (TARGET_64BIT) + { + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, DImode, destptr, 8); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, SImode, destptr, 8); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, SImode, destptr, 12); + emit_insn (gen_strset (destptr, dest, value)); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 8) + { + rtx_code_label *label = ix86_expand_aligntest (count, 8, true); + if (TARGET_64BIT) + { + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); + emit_insn (gen_strset (destptr, dest, value)); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 4) + { + rtx_code_label *label = ix86_expand_aligntest (count, 4, true); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx_code_label *label = ix86_expand_aligntest (count, 2, true); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx_code_label *label = ix86_expand_aligntest (count, 1, true); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } +} + +/* Adjust COUNTER by the VALUE. */ +static void +ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value) +{ + rtx (*gen_add)(rtx, rtx, rtx) + = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3; + + emit_insn (gen_add (countreg, countreg, GEN_INT (-value))); +} + +/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to + DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. + Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are + ignored. + Return value is updated DESTMEM. 
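The variable-count setmem epilogue above does, in effect, the following (sketch only; VALUE is assumed to be the already byte-replicated fill value, and each branch corresponds to one ix86_expand_aligntest; the 16-byte step of the real code is omitted for brevity):

#include <stdint.h>
#include <string.h>

/* Illustration only: store a tail of COUNT (less than 16) bytes by
   testing one bit of COUNT per step, largest store first, as the
   8/4/2/1 chain above does on a 64-bit target.  */
static void
setmem_tail (uint8_t *dst, uint64_t value, unsigned count)
{
  if (count & 8) { memcpy (dst, &value, 8); dst += 8; }
  if (count & 4) { memcpy (dst, &value, 4); dst += 4; }
  if (count & 2) { memcpy (dst, &value, 2); dst += 2; }
  if (count & 1) *dst = (uint8_t) value;
}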
*/ + +static rtx +expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx vec_value, rtx count, int align, + int desired_alignment, bool issetmem) +{ + int i; + for (i = 1; i < desired_alignment; i <<= 1) + { + if (align <= i) + { + rtx_code_label *label = ix86_expand_aligntest (destptr, i, false); + if (issetmem) + { + if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) + destmem = emit_memset (destmem, destptr, vec_value, i); + else + destmem = emit_memset (destmem, destptr, value, i); + } + else + destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); + ix86_adjust_counter (count, i); + emit_label (label); + LABEL_NUSES (label) = 1; + set_mem_align (destmem, i * 2 * BITS_PER_UNIT); + } + } + return destmem; +} + +/* Test if COUNT&SIZE is nonzero and if so, expand movme + or setmem sequence that is valid for SIZE..2*SIZE-1 bytes + and jump to DONE_LABEL. */ +static void +expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, + rtx value, rtx vec_value, + rtx count, int size, + rtx done_label, bool issetmem) +{ + rtx_code_label *label = ix86_expand_aligntest (count, size, false); + machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk (); + rtx modesize; + int n; + + /* If we do not have vector value to copy, we must reduce size. */ + if (issetmem) + { + if (!vec_value) + { + if (GET_MODE (value) == VOIDmode && size > 8) + mode = Pmode; + else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value))) + mode = GET_MODE (value); + } + else + mode = GET_MODE (vec_value), value = vec_value; + } + else + { + /* Choose appropriate vector mode. */ + if (size >= 32) + mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode; + else if (size >= 16) + mode = TARGET_SSE ? V16QImode : DImode; + srcmem = change_address (srcmem, mode, srcptr); + } + destmem = change_address (destmem, mode, destptr); + modesize = GEN_INT (GET_MODE_SIZE (mode)); + gcc_assert (GET_MODE_SIZE (mode) <= size); + for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) + { + if (issetmem) + emit_move_insn (destmem, gen_lowpart (mode, value)); + else + { + emit_move_insn (destmem, srcmem); + srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); + } + destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); + } + + destmem = offset_address (destmem, count, 1); + destmem = offset_address (destmem, GEN_INT (-2 * size), + GET_MODE_SIZE (mode)); + if (!issetmem) + { + srcmem = offset_address (srcmem, count, 1); + srcmem = offset_address (srcmem, GEN_INT (-2 * size), + GET_MODE_SIZE (mode)); + } + for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) + { + if (issetmem) + emit_move_insn (destmem, gen_lowpart (mode, value)); + else + { + emit_move_insn (destmem, srcmem); + srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); + } + destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); + } + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + + emit_label (label); + LABEL_NUSES (label) = 1; +} + +/* Handle small memcpy (up to SIZE that is supposed to be small power of 2. + and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN + bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can + proceed with an loop copying SIZE bytes at once. Do moves in MODE. + DONE_LABEL is a label after the whole copying sequence. The label is created + on demand if *DONE_LABEL is NULL. + MIN_SIZE is minimal size of block copied. 
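The alignment prologue above amounts to the following C loop (sketch only; the RTL version emits a conditional jump per step via ix86_expand_aligntest instead of testing the pointer directly, and ALIGN is the alignment already known to hold):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustration only: bring *DST up to DESIRED_ALIGN by copying 1, 2,
   4, ... bytes, one step per alignment bit that is still set, and
   return the reduced COUNT.  DESIRED_ALIGN is a power of two.  */
static size_t
align_prologue (uint8_t **dst, const uint8_t **src, size_t count,
                size_t align, size_t desired_align)
{
  for (size_t i = 1; i < desired_align; i <<= 1)
    if (align <= i && ((uintptr_t) *dst & i) != 0)
      {
        memcpy (*dst, *src, i);
        *dst += i;
        *src += i;
        count -= i;
      }
  return count;
}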
This value gets adjusted for new + bounds after the initial copies. + + DESTMEM/SRCMEM are memory expressions pointing to the copies block, + DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether + we will dispatch to a library call for large blocks. + + In pseudocode we do: + + if (COUNT < SIZE) + { + Assume that SIZE is 4. Bigger sizes are handled analogously + if (COUNT & 4) + { + copy 4 bytes from SRCPTR to DESTPTR + copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4 + goto done_label + } + if (!COUNT) + goto done_label; + copy 1 byte from SRCPTR to DESTPTR + if (COUNT & 2) + { + copy 2 bytes from SRCPTR to DESTPTR + copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2 + } + } + else + { + copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR + copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE + + OLD_DESPTR = DESTPTR; + Align DESTPTR up to DESIRED_ALIGN + SRCPTR += DESTPTR - OLD_DESTPTR + COUNT -= DEST_PTR - OLD_DESTPTR + if (DYNAMIC_CHECK) + Round COUNT down to multiple of SIZE + << optional caller supplied zero size guard is here >> + << optional caller supplied dynamic check is here >> + << caller supplied main copy loop is here >> + } + done_label: + */ +static void +expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem, + rtx *destptr, rtx *srcptr, + machine_mode mode, + rtx value, rtx vec_value, + rtx *count, + rtx_code_label **done_label, + int size, + int desired_align, + int align, + unsigned HOST_WIDE_INT *min_size, + bool dynamic_check, + bool issetmem) +{ + rtx_code_label *loop_label = NULL, *label; + int n; + rtx modesize; + int prolog_size = 0; + rtx mode_value; + + /* Chose proper value to copy. */ + if (issetmem && VECTOR_MODE_P (mode)) + mode_value = vec_value; + else + mode_value = value; + gcc_assert (GET_MODE_SIZE (mode) <= size); + + /* See if block is big or small, handle small blocks. */ + if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size) + { + int size2 = size; + loop_label = gen_label_rtx (); + + if (!*done_label) + *done_label = gen_label_rtx (); + + emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count), + 1, loop_label); + size2 >>= 1; + + /* Handle sizes > 3. */ + for (;size2 > 2; size2 >>= 1) + expand_small_cpymem_or_setmem (destmem, srcmem, + *destptr, *srcptr, + value, vec_value, + *count, + size2, *done_label, issetmem); + /* Nothing to copy? Jump to DONE_LABEL if so */ + emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count), + 1, *done_label); + + /* Do a byte copy. */ + destmem = change_address (destmem, QImode, *destptr); + if (issetmem) + emit_move_insn (destmem, gen_lowpart (QImode, value)); + else + { + srcmem = change_address (srcmem, QImode, *srcptr); + emit_move_insn (destmem, srcmem); + } + + /* Handle sizes 2 and 3. 
*/ + label = ix86_expand_aligntest (*count, 2, false); + destmem = change_address (destmem, HImode, *destptr); + destmem = offset_address (destmem, *count, 1); + destmem = offset_address (destmem, GEN_INT (-2), 2); + if (issetmem) + emit_move_insn (destmem, gen_lowpart (HImode, value)); + else + { + srcmem = change_address (srcmem, HImode, *srcptr); + srcmem = offset_address (srcmem, *count, 1); + srcmem = offset_address (srcmem, GEN_INT (-2), 2); + emit_move_insn (destmem, srcmem); + } + + emit_label (label); + LABEL_NUSES (label) = 1; + emit_jump_insn (gen_jump (*done_label)); + emit_barrier (); + } + else + gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size + || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size); + + /* Start memcpy for COUNT >= SIZE. */ + if (loop_label) + { + emit_label (loop_label); + LABEL_NUSES (loop_label) = 1; + } + + /* Copy first desired_align bytes. */ + if (!issetmem) + srcmem = change_address (srcmem, mode, *srcptr); + destmem = change_address (destmem, mode, *destptr); + modesize = GEN_INT (GET_MODE_SIZE (mode)); + for (n = 0; prolog_size < desired_align - align; n++) + { + if (issetmem) + emit_move_insn (destmem, mode_value); + else + { + emit_move_insn (destmem, srcmem); + srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); + } + destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); + prolog_size += GET_MODE_SIZE (mode); + } + + + /* Copy last SIZE bytes. */ + destmem = offset_address (destmem, *count, 1); + destmem = offset_address (destmem, + GEN_INT (-size - prolog_size), + 1); + if (issetmem) + emit_move_insn (destmem, mode_value); + else + { + srcmem = offset_address (srcmem, *count, 1); + srcmem = offset_address (srcmem, + GEN_INT (-size - prolog_size), + 1); + emit_move_insn (destmem, srcmem); + } + for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) + { + destmem = offset_address (destmem, modesize, 1); + if (issetmem) + emit_move_insn (destmem, mode_value); + else + { + srcmem = offset_address (srcmem, modesize, 1); + emit_move_insn (destmem, srcmem); + } + } + + /* Align destination. */ + if (desired_align > 1 && desired_align > align) + { + rtx saveddest = *destptr; + + gcc_assert (desired_align <= size); + /* Align destptr up, place it to new register. */ + *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr, + GEN_INT (prolog_size), + NULL_RTX, 1, OPTAB_DIRECT); + if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest)) + REG_POINTER (*destptr) = 1; + *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr, + GEN_INT (-desired_align), + *destptr, 1, OPTAB_DIRECT); + /* See how many bytes we skipped. */ + saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest, + *destptr, + saveddest, 1, OPTAB_DIRECT); + /* Adjust srcptr and count. */ + if (!issetmem) + *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, + saveddest, *srcptr, 1, OPTAB_DIRECT); + *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, + saveddest, *count, 1, OPTAB_DIRECT); + /* We copied at most size + prolog_size. */ + if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size)) + *min_size + = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size); + else + *min_size = 0; + + /* Our loops always round down the block size, but for dispatch to + library we need precise value. 
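The destination-alignment step above corresponds to the following pointer arithmetic (sketch only; DESIRED_ALIGN is assumed to be a power of two):

#include <stddef.h>
#include <stdint.h>

/* Illustration only: advance *DST by PROLOG_SIZE, round down to
   DESIRED_ALIGN (the bytes in between were already written by the
   overlapping prologue moves), and shrink *SRC and *COUNT by the
   number of bytes skipped, as the expander does with saveddest.  */
static void
align_dst_after_prologue (uint8_t **dst, const uint8_t **src, size_t *count,
                          size_t prolog_size, size_t desired_align)
{
  uint8_t *old = *dst;
  uintptr_t p = (uintptr_t) *dst + prolog_size;

  p &= ~(uintptr_t) (desired_align - 1);   /* AND with -desired_align */
  *dst = (uint8_t *) p;

  size_t skipped = (size_t) (*dst - old);
  *src += skipped;
  *count -= skipped;
}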
*/ + if (dynamic_check) + *count = expand_simple_binop (GET_MODE (*count), AND, *count, + GEN_INT (-size), *count, 1, OPTAB_DIRECT); + } + else + { + gcc_assert (prolog_size == 0); + /* Decrease count, so we won't end up copying last word twice. */ + if (!CONST_INT_P (*count)) + *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, + constm1_rtx, *count, 1, OPTAB_DIRECT); + else + *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1, + (unsigned HOST_WIDE_INT)size)); + if (*min_size) + *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size); + } +} + + +/* This function is like the previous one, except here we know how many bytes + need to be copied. That allows us to update alignment not only of DST, which + is returned, but also of SRC, which is passed as a pointer for that + reason. */ +static rtx +expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg, + rtx srcreg, rtx value, rtx vec_value, + int desired_align, int align_bytes, + bool issetmem) +{ + rtx src = NULL; + rtx orig_dst = dst; + rtx orig_src = NULL; + int piece_size = 1; + int copied_bytes = 0; + + if (!issetmem) + { + gcc_assert (srcp != NULL); + src = *srcp; + orig_src = src; + } + + for (piece_size = 1; + piece_size <= desired_align && copied_bytes < align_bytes; + piece_size <<= 1) + { + if (align_bytes & piece_size) + { + if (issetmem) + { + if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) + dst = emit_memset (dst, destreg, vec_value, piece_size); + else + dst = emit_memset (dst, destreg, value, piece_size); + } + else + dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); + copied_bytes += piece_size; + } + } + if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) + set_mem_align (dst, desired_align * BITS_PER_UNIT); + if (MEM_SIZE_KNOWN_P (orig_dst)) + set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); + + if (!issetmem) + { + int src_align_bytes = get_mem_align_offset (src, desired_align + * BITS_PER_UNIT); + if (src_align_bytes >= 0) + src_align_bytes = desired_align - src_align_bytes; + if (src_align_bytes >= 0) + { + unsigned int src_align; + for (src_align = desired_align; src_align >= 2; src_align >>= 1) + { + if ((src_align_bytes & (src_align - 1)) + == (align_bytes & (src_align - 1))) + break; + } + if (src_align > (unsigned int) desired_align) + src_align = desired_align; + if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) + set_mem_align (src, src_align * BITS_PER_UNIT); + } + if (MEM_SIZE_KNOWN_P (orig_src)) + set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); + *srcp = src; + } + + return dst; +} + +/* Return true if ALG can be used in current context. + Assume we expand memset if MEMSET is true. */ +static bool +alg_usable_p (enum stringop_alg alg, bool memset, bool have_as) +{ + if (alg == no_stringop) + return false; + if (alg == vector_loop) + return TARGET_SSE || TARGET_AVX; + /* Algorithms using the rep prefix want at least edi and ecx; + additionally, memset wants eax and memcpy wants esi. Don't + consider such algorithms if the user has appropriated those + registers for their own purposes, or if we have a non-default + address space, since some string insns cannot override the segment. */ + if (alg == rep_prefix_1_byte + || alg == rep_prefix_4_byte + || alg == rep_prefix_8_byte) + { + if (have_as) + return false; + if (fixed_regs[CX_REG] + || fixed_regs[DI_REG] + || (memset ? 
fixed_regs[AX_REG] : fixed_regs[SI_REG])) + return false; + } + return true; +} + +/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ +static enum stringop_alg +decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, + unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, + bool memset, bool zero_memset, bool have_as, + int *dynamic_check, bool *noalign, bool recur) +{ + const struct stringop_algs *algs; + bool optimize_for_speed; + int max = 0; + const struct processor_costs *cost; + int i; + bool any_alg_usable_p = false; + + *noalign = false; + *dynamic_check = -1; + + /* Even if the string operation call is cold, we still might spend a lot + of time processing large blocks. */ + if (optimize_function_for_size_p (cfun) + || (optimize_insn_for_size_p () + && (max_size < 256 + || (expected_size != -1 && expected_size < 256)))) + optimize_for_speed = false; + else + optimize_for_speed = true; + + cost = optimize_for_speed ? ix86_cost : &ix86_size_cost; + if (memset) + algs = &cost->memset[TARGET_64BIT != 0]; + else + algs = &cost->memcpy[TARGET_64BIT != 0]; + + /* See maximal size for user defined algorithm. */ + for (i = 0; i < MAX_STRINGOP_ALGS; i++) + { + enum stringop_alg candidate = algs->size[i].alg; + bool usable = alg_usable_p (candidate, memset, have_as); + any_alg_usable_p |= usable; + + if (candidate != libcall && candidate && usable) + max = algs->size[i].max; + } + + /* If expected size is not known but max size is small enough + so inline version is a win, set expected size into + the range. */ + if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1) + && expected_size == -1) + expected_size = min_size / 2 + max_size / 2; + + /* If user specified the algorithm, honor it if possible. */ + if (ix86_stringop_alg != no_stringop + && alg_usable_p (ix86_stringop_alg, memset, have_as)) + return ix86_stringop_alg; + /* rep; movq or rep; movl is the smallest variant. */ + else if (!optimize_for_speed) + { + *noalign = true; + if (!count || (count & 3) || (memset && !zero_memset)) + return alg_usable_p (rep_prefix_1_byte, memset, have_as) + ? rep_prefix_1_byte : loop_1_byte; + else + return alg_usable_p (rep_prefix_4_byte, memset, have_as) + ? rep_prefix_4_byte : loop; + } + /* Very tiny blocks are best handled via the loop, REP is expensive to + setup. */ + else if (expected_size != -1 && expected_size < 4) + return loop_1_byte; + else if (expected_size != -1) + { + enum stringop_alg alg = libcall; + bool alg_noalign = false; + for (i = 0; i < MAX_STRINGOP_ALGS; i++) + { + /* We get here if the algorithms that were not libcall-based + were rep-prefix based and we are unable to use rep prefixes + based on global register usage. Break out of the loop and + use the heuristic below. */ + if (algs->size[i].max == 0) + break; + if (algs->size[i].max >= expected_size || algs->size[i].max == -1) + { + enum stringop_alg candidate = algs->size[i].alg; + + if (candidate != libcall + && alg_usable_p (candidate, memset, have_as)) + { + alg = candidate; + alg_noalign = algs->size[i].noalign; + } + /* Honor TARGET_INLINE_ALL_STRINGOPS by picking + last non-libcall inline algorithm. */ + if (TARGET_INLINE_ALL_STRINGOPS) + { + /* When the current size is best to be copied by a libcall, + but we are still forced to inline, run the heuristic below + that will pick code for medium sized blocks. 
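decide_alg's size-table walk above can be pictured as a lookup like the one below; the algorithm names and thresholds here are invented for the example, while the real tables live in the per-CPU cost structures (cost->memcpy / cost->memset):

/* Illustration only: pick the first usable algorithm whose MAX covers
   the expected block size; -1 marks the catch-all entry.  */
enum alg_kind { ALG_LOOP_1_BYTE, ALG_LOOP, ALG_REP_BYTE, ALG_LIBCALL };

struct alg_entry { long max; enum alg_kind alg; };

static enum alg_kind
pick_alg (long expected_size)
{
  static const struct alg_entry table[] = {
    {   32, ALG_LOOP_1_BYTE },  /* tiny blocks: byte loop */
    {  256, ALG_LOOP },         /* small blocks: word-sized loop */
    { 8192, ALG_REP_BYTE },     /* medium blocks: rep-prefixed copy */
    {   -1, ALG_LIBCALL },      /* everything else: library call */
  };

  for (unsigned i = 0; i < sizeof table / sizeof table[0]; i++)
    if (table[i].max == -1 || expected_size <= table[i].max)
      return table[i].alg;
  return ALG_LIBCALL;
}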
*/ + if (alg != libcall) + { + *noalign = alg_noalign; + return alg; + } + else if (!any_alg_usable_p) + break; + } + else if (alg_usable_p (candidate, memset, have_as)) + { + *noalign = algs->size[i].noalign; + return candidate; + } + } + } + } + /* When asked to inline the call anyway, try to pick meaningful choice. + We look for maximal size of block that is faster to copy by hand and + take blocks of at most of that size guessing that average size will + be roughly half of the block. + + If this turns out to be bad, we might simply specify the preferred + choice in ix86_costs. */ + if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) + && (algs->unknown_size == libcall + || !alg_usable_p (algs->unknown_size, memset, have_as))) + { + enum stringop_alg alg; + HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2; + + /* If there aren't any usable algorithms or if recursing already, + then recursing on smaller sizes or same size isn't going to + find anything. Just return the simple byte-at-a-time copy loop. */ + if (!any_alg_usable_p || recur) + { + /* Pick something reasonable. */ + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur) + *dynamic_check = 128; + return loop_1_byte; + } + alg = decide_alg (count, new_expected_size, min_size, max_size, memset, + zero_memset, have_as, dynamic_check, noalign, true); + gcc_assert (*dynamic_check == -1); + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) + *dynamic_check = max; + else + gcc_assert (alg != libcall); + return alg; + } + return (alg_usable_p (algs->unknown_size, memset, have_as) + ? algs->unknown_size : libcall); +} + +/* Decide on alignment. We know that the operand is already aligned to ALIGN + (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ +static int +decide_alignment (int align, + enum stringop_alg alg, + int expected_size, + machine_mode move_mode) +{ + int desired_align = 0; + + gcc_assert (alg != no_stringop); + + if (alg == libcall) + return 0; + if (move_mode == VOIDmode) + return 0; + + desired_align = GET_MODE_SIZE (move_mode); + /* PentiumPro has special logic triggering for 8 byte aligned blocks. + copying whole cacheline at once. */ + if (TARGET_PENTIUMPRO + && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) + desired_align = 8; + + if (optimize_size) + desired_align = 1; + if (desired_align < align) + desired_align = align; + if (expected_size != -1 && expected_size < 4) + desired_align = align; + + return desired_align; +} + + +/* Helper function for memcpy. For QImode value 0xXY produce + 0xXYXYXYXY of wide specified by MODE. This is essentially + a * 0x10101010, but we can do slightly better than + synth_mult by unwinding the sequence by hand on CPUs with + slow multiply. */ +static rtx +promote_duplicated_reg (machine_mode mode, rtx val) +{ + machine_mode valmode = GET_MODE (val); + rtx tmp; + int nops = mode == DImode ? 3 : 2; + + gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); + if (val == const0_rtx) + return copy_to_mode_reg (mode, CONST0_RTX (mode)); + if (CONST_INT_P (val)) + { + HOST_WIDE_INT v = INTVAL (val) & 255; + + v |= v << 8; + v |= v << 16; + if (mode == DImode) + v |= (v << 16) << 16; + return copy_to_mode_reg (mode, gen_int_mode (v, mode)); + } + + if (valmode == VOIDmode) + valmode = QImode; + if (valmode != QImode) + val = gen_lowpart (QImode, val); + if (mode == QImode) + return val; + if (!TARGET_PARTIAL_REG_STALL) + nops--; + if (ix86_cost->mult_init[mode == DImode ? 
3 : 2] + + ix86_cost->mult_bit * (mode == DImode ? 8 : 4) + <= (ix86_cost->shift_const + ix86_cost->add) * nops + + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) + { + rtx reg = convert_modes (mode, QImode, val, true); + tmp = promote_duplicated_reg (mode, const1_rtx); + return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, + OPTAB_DIRECT); + } + else + { + rtx reg = convert_modes (mode, QImode, val, true); + + if (!TARGET_PARTIAL_REG_STALL) + if (mode == SImode) + emit_insn (gen_insvsi_1 (reg, reg)); + else + emit_insn (gen_insvdi_1 (reg, reg)); + else + { + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, + OPTAB_DIRECT); + } + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + if (mode == SImode) + return reg; + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + return reg; + } +} + +/* Duplicate value VAL using promote_duplicated_reg into maximal size that will + be needed by main loop copying SIZE_NEEDED chunks and prologue getting + alignment from ALIGN to DESIRED_ALIGN. */ +static rtx +promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, + int align) +{ + rtx promoted_val; + + if (TARGET_64BIT + && (size_needed > 4 || (desired_align > align && desired_align > 4))) + promoted_val = promote_duplicated_reg (DImode, val); + else if (size_needed > 2 || (desired_align > align && desired_align > 2)) + promoted_val = promote_duplicated_reg (SImode, val); + else if (size_needed > 1 || (desired_align > align && desired_align > 1)) + promoted_val = promote_duplicated_reg (HImode, val); + else + promoted_val = val; + + return promoted_val; +} + +/* Copy the address to a Pmode register. This is used for x32 to + truncate DImode TLS address to a SImode register. */ + +static rtx +ix86_copy_addr_to_reg (rtx addr) +{ + rtx reg; + if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode) + { + reg = copy_addr_to_reg (addr); + REG_POINTER (reg) = 1; + return reg; + } + else + { + gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode); + reg = copy_to_mode_reg (DImode, addr); + REG_POINTER (reg) = 1; + return gen_rtx_SUBREG (SImode, reg, 0); + } +} + +/* Expand string move (memcpy) ot store (memset) operation. Use i386 string + operations when profitable. The code depends upon architecture, block size + and alignment, but always has one of the following overall structures: + + Aligned move sequence: + + 1) Prologue guard: Conditional that jumps up to epilogues for small + blocks that can be handled by epilogue alone. This is faster + but also needed for correctness, since prologue assume the block + is larger than the desired alignment. + + Optional dynamic check for size and libcall for large + blocks is emitted here too, with -minline-stringops-dynamically. + + 2) Prologue: copy first few bytes in order to get destination + aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less + than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be + copied. We emit either a jump tree on power of two sized + blocks, or a byte loop. + + 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks + with specified algorithm. 
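A worked example of the byte broadcast that promote_duplicated_reg implements above (plain C; the real code chooses between a multiply and the shift/or chain based on the CPU's multiply cost):

#include <stdint.h>

/* Illustration only: replicate the low byte across a 32- or 64-bit
   value, i.e. 0xXY becomes 0xXYXYXYXY (0xXYXYXYXYXYXYXYXY).  The
   result equals b * 0x01010101, or b * 0x0101010101010101 for 64
   bits.  */
static uint64_t
broadcast_byte (uint8_t b, int sixty_four_bits)
{
  uint64_t v = b;
  v |= v << 8;          /* 0x000000000000XYXY */
  v |= v << 16;         /* 0x00000000XYXYXYXY */
  if (sixty_four_bits)
    v |= v << 32;       /* 0xXYXYXYXYXYXYXYXY */
  return v;
}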
+ + 4) Epilogue: code copying tail of the block that is too small to be + handled by main body (or up to size guarded by prologue guard). + + Misaligned move sequence + + 1) missaligned move prologue/epilogue containing: + a) Prologue handling small memory blocks and jumping to done_label + (skipped if blocks are known to be large enough) + b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is + needed by single possibly misaligned move + (skipped if alignment is not needed) + c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves + + 2) Zero size guard dispatching to done_label, if needed + + 3) dispatch to library call, if needed, + + 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks + with specified algorithm. */ +bool +ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, + rtx align_exp, rtx expected_align_exp, + rtx expected_size_exp, rtx min_size_exp, + rtx max_size_exp, rtx probable_max_size_exp, + bool issetmem) +{ + rtx destreg; + rtx srcreg = NULL; + rtx_code_label *label = NULL; + rtx tmp; + rtx_code_label *jump_around_label = NULL; + HOST_WIDE_INT align = 1; + unsigned HOST_WIDE_INT count = 0; + HOST_WIDE_INT expected_size = -1; + int size_needed = 0, epilogue_size_needed; + int desired_align = 0, align_bytes = 0; + enum stringop_alg alg; + rtx promoted_val = NULL; + rtx vec_promoted_val = NULL; + bool force_loopy_epilogue = false; + int dynamic_check; + bool need_zero_guard = false; + bool noalign; + machine_mode move_mode = VOIDmode; + machine_mode wider_mode; + int unroll_factor = 1; + /* TODO: Once value ranges are available, fill in proper data. */ + unsigned HOST_WIDE_INT min_size = 0; + unsigned HOST_WIDE_INT max_size = -1; + unsigned HOST_WIDE_INT probable_max_size = -1; + bool misaligned_prologue_used = false; + bool have_as; + + if (CONST_INT_P (align_exp)) + align = INTVAL (align_exp); + /* i386 can do misaligned access on reasonably increased cost. */ + if (CONST_INT_P (expected_align_exp) + && INTVAL (expected_align_exp) > align) + align = INTVAL (expected_align_exp); + /* ALIGN is the minimum of destination and source alignment, but we care here + just about destination alignment. */ + else if (!issetmem + && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) + align = MEM_ALIGN (dst) / BITS_PER_UNIT; + + if (CONST_INT_P (count_exp)) + { + min_size = max_size = probable_max_size = count = expected_size + = INTVAL (count_exp); + /* When COUNT is 0, there is nothing to do. */ + if (!count) + return true; + } + else + { + if (min_size_exp) + min_size = INTVAL (min_size_exp); + if (max_size_exp) + max_size = INTVAL (max_size_exp); + if (probable_max_size_exp) + probable_max_size = INTVAL (probable_max_size_exp); + if (CONST_INT_P (expected_size_exp)) + expected_size = INTVAL (expected_size_exp); + } + + /* Make sure we don't need to care about overflow later on. */ + if (count > (HOST_WIDE_INT_1U << 30)) + return false; + + have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst)); + if (!issetmem) + have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)); + + /* Step 0: Decide on preferred algorithm, desired alignment and + size of chunks to be copied by main loop. 
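Putting the steps described above together, the aligned expansion has roughly this C shape (behavioural sketch only; the chunk size, the 8-byte alignment test and the byte-wise epilogue are placeholders for what the expander actually chooses):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustration only: 1) prologue guard for small blocks, 2) alignment
   prologue, 3) main loop in SIZE_NEEDED chunks, 4) epilogue for the
   remaining bytes.  */
static void
cpymem_shape (uint8_t *dst, const uint8_t *src, size_t count)
{
  const size_t size_needed = 16;        /* bytes per main-loop iteration */

  if (count < size_needed)              /* 1) prologue guard */
    goto epilogue;

  while ((((uintptr_t) dst & 7) != 0) && count)   /* 2) alignment prologue */
    {
      *dst++ = *src++;
      count--;
    }

  while (count >= size_needed)          /* 3) main loop */
    {
      memcpy (dst, src, size_needed);
      dst += size_needed;
      src += size_needed;
      count -= size_needed;
    }

 epilogue:                              /* 4) epilogue */
  for (; count > 0; count--)
    *dst++ = *src++;
}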
*/ + alg = decide_alg (count, expected_size, min_size, probable_max_size, + issetmem, + issetmem && val_exp == const0_rtx, have_as, + &dynamic_check, &noalign, false); + + if (dump_file) + fprintf (dump_file, "Selected stringop expansion strategy: %s\n", + stringop_alg_names[alg]); + + if (alg == libcall) + return false; + gcc_assert (alg != no_stringop); + + /* For now vector-version of memset is generated only for memory zeroing, as + creating of promoted vector value is very cheap in this case. */ + if (issetmem && alg == vector_loop && val_exp != const0_rtx) + alg = unrolled_loop; + + if (!count) + count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); + destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); + if (!issetmem) + srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); + + unroll_factor = 1; + move_mode = word_mode; + switch (alg) + { + case libcall: + case no_stringop: + case last_alg: + gcc_unreachable (); + case loop_1_byte: + need_zero_guard = true; + move_mode = QImode; + break; + case loop: + need_zero_guard = true; + break; + case unrolled_loop: + need_zero_guard = true; + unroll_factor = (TARGET_64BIT ? 4 : 2); + break; + case vector_loop: + need_zero_guard = true; + unroll_factor = 4; + /* Find the widest supported mode. */ + move_mode = word_mode; + while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) + && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) + move_mode = wider_mode; + + if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128) + move_mode = TImode; + + /* Find the corresponding vector mode with the same size as MOVE_MODE. + MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ + if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) + { + int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); + if (!mode_for_vector (word_mode, nunits).exists (&move_mode) + || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) + move_mode = word_mode; + } + gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); + break; + case rep_prefix_8_byte: + move_mode = DImode; + break; + case rep_prefix_4_byte: + move_mode = SImode; + break; + case rep_prefix_1_byte: + move_mode = QImode; + break; + } + size_needed = GET_MODE_SIZE (move_mode) * unroll_factor; + epilogue_size_needed = size_needed; + + /* If we are going to call any library calls conditionally, make sure any + pending stack adjustment happen before the first conditional branch, + otherwise they will be emitted before the library call only and won't + happen from the other branches. */ + if (dynamic_check != -1) + do_pending_stack_adjust (); + + desired_align = decide_alignment (align, alg, expected_size, move_mode); + if (!TARGET_ALIGN_STRINGOPS || noalign) + align = desired_align; + + /* Step 1: Prologue guard. */ + + /* Alignment code needs count to be in register. */ + if (CONST_INT_P (count_exp) && desired_align > align) + { + if (INTVAL (count_exp) > desired_align + && INTVAL (count_exp) > size_needed) + { + align_bytes + = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); + if (align_bytes <= 0) + align_bytes = 0; + else + align_bytes = desired_align - align_bytes; + } + if (align_bytes == 0) + count_exp = force_reg (counter_mode (count_exp), count_exp); + } + gcc_assert (desired_align >= 1 && align >= 1); + + /* Misaligned move sequences handle both prologue and epilogue at once. + Default code generation results in a smaller code for large alignments + and also avoids redundant job when sizes are known precisely. 
*/ + misaligned_prologue_used + = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES + && MAX (desired_align, epilogue_size_needed) <= 32 + && desired_align <= epilogue_size_needed + && ((desired_align > align && !align_bytes) + || (!count && epilogue_size_needed > 1))); + + /* Do the cheap promotion to allow better CSE across the + main loop and epilogue (ie one load of the big constant in the + front of all code. + For now the misaligned move sequences do not have fast path + without broadcasting. */ + if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used))) + { + if (alg == vector_loop) + { + gcc_assert (val_exp == const0_rtx); + vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); + promoted_val = promote_duplicated_reg_to_size (val_exp, + GET_MODE_SIZE (word_mode), + desired_align, align); + } + else + { + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + } + } + /* Misaligned move sequences handles both prologues and epilogues at once. + Default code generation results in smaller code for large alignments and + also avoids redundant job when sizes are known precisely. */ + if (misaligned_prologue_used) + { + /* Misaligned move prologue handled small blocks by itself. */ + expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves + (dst, src, &destreg, &srcreg, + move_mode, promoted_val, vec_promoted_val, + &count_exp, + &jump_around_label, + desired_align < align + ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed, + desired_align, align, &min_size, dynamic_check, issetmem); + if (!issetmem) + src = change_address (src, BLKmode, srcreg); + dst = change_address (dst, BLKmode, destreg); + set_mem_align (dst, desired_align * BITS_PER_UNIT); + epilogue_size_needed = 0; + if (need_zero_guard + && min_size < (unsigned HOST_WIDE_INT) size_needed) + { + /* It is possible that we copied enough so the main loop will not + execute. */ + gcc_assert (size_needed > 1); + if (jump_around_label == NULL_RTX) + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (size_needed), + LTU, 0, counter_mode (count_exp), 1, jump_around_label); + if (expected_size == -1 + || expected_size < (desired_align - align) / 2 + size_needed) + predict_jump (REG_BR_PROB_BASE * 20 / 100); + else + predict_jump (REG_BR_PROB_BASE * 60 / 100); + } + } + /* Ensure that alignment prologue won't copy past end of block. */ + else if (size_needed > 1 || (desired_align > 1 && desired_align > align)) + { + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); + /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. + Make sure it is power of 2. */ + epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); + + /* To improve performance of small blocks, we jump around the VAL + promoting mode. This mean that if the promoted VAL is not constant, + we might not use it in the epilogue and have to use byte + loop variant. */ + if (issetmem && epilogue_size_needed > 2 && !promoted_val) + force_loopy_epilogue = true; + if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed) + || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) + { + /* If main algorithm works on QImode, no epilogue is needed. + For small sizes just don't align anything. 
*/ + if (size_needed == 1) + desired_align = align; + else + goto epilogue; + } + else if (!count + && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (epilogue_size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 || expected_size < epilogue_size_needed) + predict_jump (REG_BR_PROB_BASE * 60 / 100); + else + predict_jump (REG_BR_PROB_BASE * 20 / 100); + } + } + + /* Emit code to decide on runtime whether library call or inline should be + used. */ + if (dynamic_check != -1) + { + if (!issetmem && CONST_INT_P (count_exp)) + { + if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) + { + emit_block_copy_via_libcall (dst, src, count_exp); + count_exp = const0_rtx; + goto epilogue; + } + } + else + { + rtx_code_label *hot_label = gen_label_rtx (); + if (jump_around_label == NULL_RTX) + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), + LEU, 0, counter_mode (count_exp), + 1, hot_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + if (issetmem) + set_storage_via_libcall (dst, count_exp, val_exp); + else + emit_block_copy_via_libcall (dst, src, count_exp); + emit_jump (jump_around_label); + emit_label (hot_label); + } + } + + /* Step 2: Alignment prologue. */ + /* Do the expensive promotion once we branched off the small blocks. */ + if (issetmem && !promoted_val) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + + if (desired_align > align && !misaligned_prologue_used) + { + if (align_bytes == 0) + { + /* Except for the first move in prologue, we no longer know + constant offset in aliasing info. It don't seems to worth + the pain to maintain it for the first move, so throw away + the info early. */ + dst = change_address (dst, BLKmode, destreg); + if (!issetmem) + src = change_address (src, BLKmode, srcreg); + dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg, + promoted_val, vec_promoted_val, + count_exp, align, desired_align, + issetmem); + /* At most desired_align - align bytes are copied. */ + if (min_size < (unsigned)(desired_align - align)) + min_size = 0; + else + min_size -= desired_align - align; + } + else + { + /* If we know how many bytes need to be stored before dst is + sufficiently aligned, maintain aliasing info accurately. */ + dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg, + srcreg, + promoted_val, + vec_promoted_val, + desired_align, + align_bytes, + issetmem); + + count_exp = plus_constant (counter_mode (count_exp), + count_exp, -align_bytes); + count -= align_bytes; + min_size -= align_bytes; + max_size -= align_bytes; + } + if (need_zero_guard + && min_size < (unsigned HOST_WIDE_INT) size_needed + && (count < (unsigned HOST_WIDE_INT) size_needed + || (align_bytes == 0 + && count < ((unsigned HOST_WIDE_INT) size_needed + + desired_align - align)))) + { + /* It is possible that we copied enough so the main loop will not + execute. 
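With -minline-stringops-dynamically the dynamic check emitted above behaves like this dispatch (sketch only; THRESHOLD plays the role of dynamic_check and the inline_copy callback stands in for the inline expansion):

#include <stddef.h>
#include <string.h>

/* Illustration only: blocks below the threshold take the inlined
   path, larger ones are handed to the library, matching the LEU
   compare against dynamic_check - 1 above.  */
static void
copy_with_dynamic_check (void *dst, const void *src, size_t n,
                         size_t threshold,
                         void (*inline_copy) (void *, const void *, size_t))
{
  if (n < threshold)
    inline_copy (dst, src, n);   /* predicted hot: inline expansion */
  else
    memcpy (dst, src, n);        /* cold: library call */
}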
*/ + gcc_assert (size_needed > 1); + if (label == NULL_RTX) + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 + || expected_size < (desired_align - align) / 2 + size_needed) + predict_jump (REG_BR_PROB_BASE * 20 / 100); + else + predict_jump (REG_BR_PROB_BASE * 60 / 100); + } + } + if (label && size_needed == 1) + { + emit_label (label); + LABEL_NUSES (label) = 1; + label = NULL; + epilogue_size_needed = 1; + if (issetmem) + promoted_val = val_exp; + } + else if (label == NULL_RTX && !misaligned_prologue_used) + epilogue_size_needed = size_needed; + + /* Step 3: Main loop. */ + + switch (alg) + { + case libcall: + case no_stringop: + case last_alg: + gcc_unreachable (); + case loop_1_byte: + case loop: + case unrolled_loop: + expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val, + count_exp, move_mode, unroll_factor, + expected_size, issetmem); + break; + case vector_loop: + expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, + vec_promoted_val, count_exp, move_mode, + unroll_factor, expected_size, issetmem); + break; + case rep_prefix_8_byte: + case rep_prefix_4_byte: + case rep_prefix_1_byte: + expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val, + val_exp, count_exp, move_mode, issetmem); + break; + } + /* Adjust properly the offset of src and dest memory for aliasing. */ + if (CONST_INT_P (count_exp)) + { + if (!issetmem) + src = adjust_automodify_address_nv (src, BLKmode, srcreg, + (count / size_needed) * size_needed); + dst = adjust_automodify_address_nv (dst, BLKmode, destreg, + (count / size_needed) * size_needed); + } + else + { + if (!issetmem) + src = change_address (src, BLKmode, srcreg); + dst = change_address (dst, BLKmode, destreg); + } + + /* Step 4: Epilogue to copy the remaining bytes. */ + epilogue: + if (label) + { + /* When the main loop is done, COUNT_EXP might hold original count, + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED + bytes. Compensate if needed. */ + + if (size_needed < epilogue_size_needed) + { + tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp, + GEN_INT (size_needed - 1), count_exp, 1, + OPTAB_DIRECT); + if (tmp != count_exp) + emit_move_insn (count_exp, tmp); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (count_exp != const0_rtx && epilogue_size_needed > 1) + { + if (force_loopy_epilogue) + expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, + epilogue_size_needed); + else + { + if (issetmem) + expand_setmem_epilogue (dst, destreg, promoted_val, + vec_promoted_val, count_exp, + epilogue_size_needed); + else + expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp, + epilogue_size_needed); + } + } + if (jump_around_label) + emit_label (jump_around_label); + return true; +} + + +/* Expand the appropriate insns for doing strlen if not just doing + repnz; scasb + + out = result, initialized with the start address + align_rtx = alignment of the address. + scratch = scratch register, initialized with the startaddress when + not aligned, otherwise undefined + + This is just the body. It needs the initializations mentioned above and + some address computing at the end. These things are done in i386.md. 
*/ + +static void +ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) +{ + int align; + rtx tmp; + rtx_code_label *align_2_label = NULL; + rtx_code_label *align_3_label = NULL; + rtx_code_label *align_4_label = gen_label_rtx (); + rtx_code_label *end_0_label = gen_label_rtx (); + rtx mem; + rtx tmpreg = gen_reg_rtx (SImode); + rtx scratch = gen_reg_rtx (SImode); + rtx cmp; + + align = 0; + if (CONST_INT_P (align_rtx)) + align = INTVAL (align_rtx); + + /* Loop to check 1..3 bytes for null to get an aligned pointer. */ + + /* Is there a known alignment and is it less than 4? */ + if (align < 4) + { + rtx scratch1 = gen_reg_rtx (Pmode); + emit_move_insn (scratch1, out); + /* Is there a known alignment and is it not 2? */ + if (align != 2) + { + align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ + align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ + + /* Leave just the 3 lower bits. */ + align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), + NULL_RTX, 0, OPTAB_WIDEN); + + emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, + Pmode, 1, align_4_label); + emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, + Pmode, 1, align_2_label); + emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, + Pmode, 1, align_3_label); + } + else + { + /* Since the alignment is 2, we have to check 2 or 0 bytes; + check if is aligned to 4 - byte. */ + + align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, + NULL_RTX, 0, OPTAB_WIDEN); + + emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, + Pmode, 1, align_4_label); + } + + mem = change_address (src, QImode, out); + + /* Now compare the bytes. */ + + /* Compare the first n unaligned byte on a byte per byte basis. */ + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, + QImode, 1, end_0_label); + + /* Increment the address. */ + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + + /* Not needed with an alignment of 2 */ + if (align != 2) + { + emit_label (align_2_label); + + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, + end_0_label); + + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + + emit_label (align_3_label); + } + + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, + end_0_label); + + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + } + + /* Generate loop to check 4 bytes at a time. It is not a good idea to + align this loop. It gives only huge programs, but does not help to + speed up. */ + emit_label (align_4_label); + + mem = change_address (src, SImode, out); + emit_move_insn (scratch, mem); + emit_insn (ix86_gen_add3 (out, out, GEN_INT (4))); + + /* This formula yields a nonzero result iff one of the bytes is zero. + This saves three branches inside loop and many cycles. */ + + emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); + emit_insn (gen_one_cmplsi2 (scratch, scratch)); + emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); + emit_insn (gen_andsi3 (tmpreg, tmpreg, + gen_int_mode (0x80808080, SImode))); + emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, + align_4_label); + + if (TARGET_CMOVE) + { + rtx reg = gen_reg_rtx (SImode); + rtx reg2 = gen_reg_rtx (Pmode); + emit_move_insn (reg, tmpreg); + emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); + + /* If zero is not in the first two bytes, move two bytes forward. 
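For illustration only: the add/not/and sequence emitted above is the classic word-at-a-time zero-byte test. A portable sketch of the same formula, assuming a little-endian target (which x86 is) and the GCC builtin __builtin_ctz; the function name is mine:

#include <stdint.h>

/* The mask is nonzero exactly when one of the four bytes of V is zero;
   on little-endian the lowest set 0x80 bit marks the first zero byte.  */
static int
first_zero_byte_index (uint32_t v)
{
  uint32_t mask = (v - 0x01010101u) & ~v & 0x80808080u;
  if (mask == 0)
    return -1;                       /* no zero byte in these four bytes */
  return __builtin_ctz (mask) >> 3;  /* 0x80 bit of byte N is bit 8*N + 7 */
}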
*/ + emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); + emit_insn (gen_rtx_SET (tmpreg, + gen_rtx_IF_THEN_ELSE (SImode, tmp, + reg, + tmpreg))); + /* Emit lea manually to avoid clobbering of flags. */ + emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx))); + + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); + emit_insn (gen_rtx_SET (out, + gen_rtx_IF_THEN_ELSE (Pmode, tmp, + reg2, + out))); + } + else + { + rtx_code_label *end_2_label = gen_label_rtx (); + /* Is zero in the first two bytes? */ + + emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, end_2_label), + pc_rtx); + tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + JUMP_LABEL (tmp) = end_2_label; + + /* Not in the first two. Move two bytes forward. */ + emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); + emit_insn (ix86_gen_add3 (out, out, const2_rtx)); + + emit_label (end_2_label); + + } + + /* Avoid branch in fixing the byte. */ + tmpreg = gen_lowpart (QImode, tmpreg); + emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg)); + tmp = gen_rtx_REG (CCmode, FLAGS_REG); + cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); + emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp)); + + emit_label (end_0_label); +} + +/* Expand strlen. */ + +bool +ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) +{ +if (TARGET_UNROLL_STRLEN + && TARGET_INLINE_ALL_STRINGOPS + && eoschar == const0_rtx + && optimize > 1) + { + /* The generic case of strlen expander is long. Avoid it's + expanding unless TARGET_INLINE_ALL_STRINGOPS. */ + rtx addr = force_reg (Pmode, XEXP (src, 0)); + /* Well it seems that some optimizer does not combine a call like + foo(strlen(bar), strlen(bar)); + when the move and the subtraction is done here. It does calculate + the length just once when these instructions are done inside of + output_strlen_unroll(). But I think since &bar[strlen(bar)] is + often used and I use one fewer register for the lifetime of + output_strlen_unroll() this is better. */ + + emit_move_insn (out, addr); + + ix86_expand_strlensi_unroll_1 (out, src, align); + + /* strlensi_unroll_1 returns the address of the zero at the end of + the string, like memchr(), so compute the length by subtracting + the start address. */ + emit_insn (ix86_gen_sub3 (out, out, addr)); + return true; + } + else + return false; +} + +/* For given symbol (function) construct code to compute address of it's PLT + entry in large x86-64 PIC model. */ + +static rtx +construct_plt_address (rtx symbol) +{ + rtx tmp, unspec; + + gcc_assert (GET_CODE (symbol) == SYMBOL_REF); + gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF); + gcc_assert (Pmode == DImode); + + tmp = gen_reg_rtx (Pmode); + unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); + + emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); + emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx)); + return tmp; +} + +/* Additional registers that are clobbered by SYSV calls. 
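For illustration only, a usage-level sketch of the code shape the strlen expander above produces: scan byte by byte until the pointer is 4-byte aligned, then scan four bytes at a time with the zero-byte mask. Like the emitted assembly, the aligned word load may read a few bytes past the terminator, which is fine for the generated machine code but not strictly portable C; all names are mine:

#include <stdint.h>
#include <string.h>

static size_t
strlen_by_words (const char *s)
{
  const char *p = s;
  /* Byte loop until P is 4-byte aligned, mirroring the 1..3 byte checks.  */
  while (((uintptr_t) p & 3) != 0)
    {
      if (*p == '\0')
        return (size_t) (p - s);
      p++;
    }
  /* Aligned 4-byte loop using the zero-byte mask.  */
  for (;;)
    {
      uint32_t v;
      memcpy (&v, p, 4);
      uint32_t mask = (v - 0x01010101u) & ~v & 0x80808080u;
      if (mask != 0)
        return (size_t) (p - s) + (__builtin_ctz (mask) >> 3);
      p += 4;
    }
}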
*/ + +static int const x86_64_ms_sysv_extra_clobbered_registers + [NUM_X86_64_MS_CLOBBERED_REGS] = +{ + SI_REG, DI_REG, + XMM6_REG, XMM7_REG, + XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG, + XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG +}; + +rtx_insn * +ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, + rtx callarg2, + rtx pop, bool sibcall) +{ + rtx vec[3]; + rtx use = NULL, call; + unsigned int vec_len = 0; + tree fndecl; + + if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) + { + fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); + if (fndecl + && (lookup_attribute ("interrupt", + TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))) + error ("interrupt service routine cannot be called directly"); + } + else + fndecl = NULL_TREE; + + if (pop == const0_rtx) + pop = NULL; + gcc_assert (!TARGET_64BIT || !pop); + + if (TARGET_MACHO && !TARGET_64BIT) + { +#if TARGET_MACHO + if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) + fnaddr = machopic_indirect_call_target (fnaddr); +#endif + } + else + { + /* Static functions and indirect calls don't need the pic register. Also, + check if PLT was explicitly avoided via no-plt or "noplt" attribute, making + it an indirect call. */ + rtx addr = XEXP (fnaddr, 0); + if (flag_pic + && GET_CODE (addr) == SYMBOL_REF + && !SYMBOL_REF_LOCAL_P (addr)) + { + if (flag_plt + && (SYMBOL_REF_DECL (addr) == NULL_TREE + || !lookup_attribute ("noplt", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr))))) + { + if (!TARGET_64BIT + || (ix86_cmodel == CM_LARGE_PIC + && DEFAULT_ABI != MS_ABI)) + { + use_reg (&use, gen_rtx_REG (Pmode, + REAL_PIC_OFFSET_TABLE_REGNUM)); + if (ix86_use_pseudo_pic_reg ()) + emit_move_insn (gen_rtx_REG (Pmode, + REAL_PIC_OFFSET_TABLE_REGNUM), + pic_offset_table_rtx); + } + } + else if (!TARGET_PECOFF && !TARGET_MACHO) + { + if (TARGET_64BIT) + { + fnaddr = gen_rtx_UNSPEC (Pmode, + gen_rtvec (1, addr), + UNSPEC_GOTPCREL); + fnaddr = gen_rtx_CONST (Pmode, fnaddr); + } + else + { + fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), + UNSPEC_GOT); + fnaddr = gen_rtx_CONST (Pmode, fnaddr); + fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, + fnaddr); + } + fnaddr = gen_const_mem (Pmode, fnaddr); + /* Pmode may not be the same as word_mode for x32, which + doesn't support indirect branch via 32-bit memory slot. + Since x32 GOT slot is 64 bit with zero upper 32 bits, + indirect branch via x32 GOT slot is OK. */ + if (GET_MODE (fnaddr) != word_mode) + fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); + fnaddr = gen_rtx_MEM (QImode, fnaddr); + } + } + } + + /* Skip setting up RAX register for -mskip-rax-setup when there are no + parameters passed in vector registers. */ + if (TARGET_64BIT + && (INTVAL (callarg2) > 0 + || (INTVAL (callarg2) == 0 + && (TARGET_SSE || !flag_skip_rax_setup)))) + { + rtx al = gen_rtx_REG (QImode, AX_REG); + emit_move_insn (al, callarg2); + use_reg (&use, al); + } + + if (ix86_cmodel == CM_LARGE_PIC + && !TARGET_PECOFF + && MEM_P (fnaddr) + && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF + && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) + fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); + /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect + branch via x32 GOT slot is OK. */ + else if (!(TARGET_X32 + && MEM_P (fnaddr) + && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND + && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)) + && (sibcall + ? 
!sibcall_insn_operand (XEXP (fnaddr, 0), word_mode) + : !call_insn_operand (XEXP (fnaddr, 0), word_mode))) + { + fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1); + fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr)); + } + + call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); + + if (retval) + call = gen_rtx_SET (retval, call); + vec[vec_len++] = call; + + if (pop) + { + pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); + pop = gen_rtx_SET (stack_pointer_rtx, pop); + vec[vec_len++] = pop; + } + + if (cfun->machine->no_caller_saved_registers + && (!fndecl + || (!TREE_THIS_VOLATILE (fndecl) + && !lookup_attribute ("no_caller_saved_registers", + TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))) + { + static const char ix86_call_used_regs[] = CALL_USED_REGISTERS; + bool is_64bit_ms_abi = (TARGET_64BIT + && ix86_function_abi (fndecl) == MS_ABI); + char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi); + + /* If there are no caller-saved registers, add all registers + that are clobbered by the call which returns. */ + for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (!fixed_regs[i] + && (ix86_call_used_regs[i] == 1 + || (ix86_call_used_regs[i] & c_mask)) + && !STACK_REGNO_P (i) + && !MMX_REGNO_P (i)) + clobber_reg (&use, + gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i)); + } + else if (TARGET_64BIT_MS_ABI + && (!callarg2 || INTVAL (callarg2) != -2)) + { + unsigned i; + + for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++) + { + int regno = x86_64_ms_sysv_extra_clobbered_registers[i]; + machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode; + + clobber_reg (&use, gen_rtx_REG (mode, regno)); + } + + /* Set here, but it may get cleared later. */ + if (TARGET_CALL_MS2SYSV_XLOGUES) + { + if (!TARGET_SSE) + ; + + /* Don't break hot-patched functions. */ + else if (ix86_function_ms_hook_prologue (current_function_decl)) + ; + + /* TODO: Cases not yet examined. */ + else if (flag_split_stack) + warn_once_call_ms2sysv_xlogues ("-fsplit-stack"); + + else + { + gcc_assert (!reload_completed); + cfun->machine->call_ms2sysv = true; + } + } + } + + if (vec_len > 1) + call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); + rtx_insn *call_insn = emit_call_insn (call); + if (use) + CALL_INSN_FUNCTION_USAGE (call_insn) = use; + + return call_insn; +} + +/* Split simple return with popping POPC bytes from stack to indirect + branch with stack adjustment . */ + +void +ix86_split_simple_return_pop_internal (rtx popc) +{ + struct machine_function *m = cfun->machine; + rtx ecx = gen_rtx_REG (SImode, CX_REG); + rtx_insn *insn; + + /* There is no "pascal" calling convention in any 64bit ABI. */ + gcc_assert (!TARGET_64BIT); + + insn = emit_insn (gen_pop (ecx)); + m->fs.cfa_offset -= UNITS_PER_WORD; + m->fs.sp_offset -= UNITS_PER_WORD; + + rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; + + x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc); + x = gen_rtx_SET (stack_pointer_rtx, x); + insn = emit_insn (x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn) = 1; + + /* Now return address is in ECX. */ + emit_jump_insn (gen_simple_return_indirect_internal (ecx)); +} + +/* Errors in the source file can cause expand_expr to return const0_rtx + where we expect a vector. To avoid crashing, use one of the vector + clear instructions. 
*/ + +static rtx +safe_vector_operand (rtx x, machine_mode mode) +{ + if (x == const0_rtx) + x = CONST0_RTX (mode); + return x; +} + +/* Subroutine of ix86_expand_builtin to take care of binop insns. */ + +static rtx +ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + machine_mode tmode = insn_data[icode].operand[0].mode; + machine_mode mode0 = insn_data[icode].operand[1].mode; + machine_mode mode1 = insn_data[icode].operand[2].mode; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if (GET_MODE (op1) == SImode && mode1 == TImode) + { + rtx x = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_loadd (x, op1)); + op1 = gen_lowpart (TImode, x); + } + + if (!insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[2].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + + emit_insn (pat); + + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */ + +static rtx +ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, + enum ix86_builtin_func_type m_type, + enum rtx_code sub_code) +{ + rtx pat; + int i; + int nargs; + bool comparison_p = false; + bool tf_p = false; + bool last_arg_constant = false; + int num_memory = 0; + struct { + rtx op; + machine_mode mode; + } args[4]; + + machine_mode tmode = insn_data[icode].operand[0].mode; + + switch (m_type) + { + case MULTI_ARG_4_DF2_DI_I: + case MULTI_ARG_4_DF2_DI_I1: + case MULTI_ARG_4_SF2_SI_I: + case MULTI_ARG_4_SF2_SI_I1: + nargs = 4; + last_arg_constant = true; + break; + + case MULTI_ARG_3_SF: + case MULTI_ARG_3_DF: + case MULTI_ARG_3_SF2: + case MULTI_ARG_3_DF2: + case MULTI_ARG_3_DI: + case MULTI_ARG_3_SI: + case MULTI_ARG_3_SI_DI: + case MULTI_ARG_3_HI: + case MULTI_ARG_3_HI_SI: + case MULTI_ARG_3_QI: + case MULTI_ARG_3_DI2: + case MULTI_ARG_3_SI2: + case MULTI_ARG_3_HI2: + case MULTI_ARG_3_QI2: + nargs = 3; + break; + + case MULTI_ARG_2_SF: + case MULTI_ARG_2_DF: + case MULTI_ARG_2_DI: + case MULTI_ARG_2_SI: + case MULTI_ARG_2_HI: + case MULTI_ARG_2_QI: + nargs = 2; + break; + + case MULTI_ARG_2_DI_IMM: + case MULTI_ARG_2_SI_IMM: + case MULTI_ARG_2_HI_IMM: + case MULTI_ARG_2_QI_IMM: + nargs = 2; + last_arg_constant = true; + break; + + case MULTI_ARG_1_SF: + case MULTI_ARG_1_DF: + case MULTI_ARG_1_SF2: + case MULTI_ARG_1_DF2: + case MULTI_ARG_1_DI: + case MULTI_ARG_1_SI: + case MULTI_ARG_1_HI: + case MULTI_ARG_1_QI: + case MULTI_ARG_1_SI_DI: + case MULTI_ARG_1_HI_DI: + case MULTI_ARG_1_HI_SI: + case MULTI_ARG_1_QI_DI: + case MULTI_ARG_1_QI_SI: + case MULTI_ARG_1_QI_HI: + nargs = 1; + break; + + case MULTI_ARG_2_DI_CMP: + case MULTI_ARG_2_SI_CMP: + case MULTI_ARG_2_HI_CMP: + case MULTI_ARG_2_QI_CMP: + nargs = 2; + comparison_p = true; + break; + + case MULTI_ARG_2_SF_TF: + case MULTI_ARG_2_DF_TF: + case MULTI_ARG_2_DI_TF: + case MULTI_ARG_2_SI_TF: + case MULTI_ARG_2_HI_TF: + case MULTI_ARG_2_QI_TF: + nargs = 2; + tf_p = true; + break; + + default: + gcc_unreachable (); + } + + if (optimize || !target + || GET_MODE 
(target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + else if (memory_operand (target, tmode)) + num_memory++; + + gcc_assert (nargs <= 4); + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + int adjust = (comparison_p) ? 1 : 0; + machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; + + if (last_arg_constant && i == nargs - 1) + { + if (!insn_data[icode].operand[i + 1].predicate (op, mode)) + { + enum insn_code new_icode = icode; + switch (icode) + { + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case CODE_FOR_xop_vpermil2v8sf3: + error ("the last argument must be a 2-bit immediate"); + return gen_reg_rtx (tmode); + case CODE_FOR_xop_rotlv2di3: + new_icode = CODE_FOR_rotlv2di3; + goto xop_rotl; + case CODE_FOR_xop_rotlv4si3: + new_icode = CODE_FOR_rotlv4si3; + goto xop_rotl; + case CODE_FOR_xop_rotlv8hi3: + new_icode = CODE_FOR_rotlv8hi3; + goto xop_rotl; + case CODE_FOR_xop_rotlv16qi3: + new_icode = CODE_FOR_rotlv16qi3; + xop_rotl: + if (CONST_INT_P (op)) + { + int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1; + op = GEN_INT (INTVAL (op) & mask); + gcc_checking_assert + (insn_data[icode].operand[i + 1].predicate (op, mode)); + } + else + { + gcc_checking_assert + (nargs == 2 + && insn_data[new_icode].operand[0].mode == tmode + && insn_data[new_icode].operand[1].mode == tmode + && insn_data[new_icode].operand[2].mode == mode + && insn_data[new_icode].operand[0].predicate + == insn_data[icode].operand[0].predicate + && insn_data[new_icode].operand[1].predicate + == insn_data[icode].operand[1].predicate); + icode = new_icode; + goto non_constant; + } + break; + default: + gcc_unreachable (); + } + } + } + else + { + non_constant: + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + /* If we aren't optimizing, only allow one memory operand to be + generated. */ + if (memory_operand (op, mode)) + num_memory++; + + gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); + + if (optimize + || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) + || num_memory > 1) + op = force_reg (mode, op); + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + + case 2: + if (tf_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + GEN_INT ((int)sub_code)); + else if (! comparison_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + else + { + rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), + args[0].op, + args[1].op); + + pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); + } + break; + + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); + break; + + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); + break; + + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_args_builtin to take care of scalar unop + insns with vec_merge. 
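For illustration only: for the XOP rotate builtins the code above reduces a constant count modulo the element width, using GET_MODE_UNIT_BITSIZE - 1 as a mask. The same idiom in portable C, with a name of my own:

#include <stdint.h>

/* Rotate left with the count masked to the 32-bit element width;
   the (32 - count) & 31 form keeps the shift defined for count == 0.  */
static uint32_t
rotl32 (uint32_t x, unsigned int count)
{
  count &= 31;
  return (x << count) | (x >> ((32 - count) & 31));
}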
*/ + +static rtx +ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op1, op0 = expand_normal (arg0); + machine_mode tmode = insn_data[icode].operand[0].mode; + machine_mode mode0 = insn_data[icode].operand[1].mode; + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + + op1 = op0; + if (!insn_data[icode].operand[2].predicate (op1, mode0)) + op1 = copy_to_mode_reg (mode0, op1); + + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ + +static rtx +ix86_expand_sse_compare (const struct builtin_description *d, + tree exp, rtx target, bool swap) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2; + machine_mode tmode = insn_data[d->icode].operand[0].mode; + machine_mode mode0 = insn_data[d->icode].operand[1].mode; + machine_mode mode1 = insn_data[d->icode].operand[2].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + /* Swap operands if we have a comparison that isn't available in + hardware. */ + if (swap) + std::swap (op0, op1); + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[d->icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[2].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); + pat = GEN_FCN (d->icode) (target, op0, op1, op2); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of comi insns. */ + +static rtx +ix86_expand_sse_comi (const struct builtin_description *d, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + machine_mode mode0 = insn_data[d->icode].operand[0].mode; + machine_mode mode1 = insn_data[d->icode].operand[1].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + /* Swap operands if we have a comparison that isn't available in + hardware. 
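For illustration only: the SWAP variants handled above implement a comparison the hardware lacks by exchanging the operands, since a > b is the same element-wise test as b < a. A scalar-loop sketch of that idea (names are mine):

/* Element-wise "a > b" expressed through the available "less than",
   by swapping the operands.  Mask convention: -1 for true, 0 for false.  */
static void
cmpgt_via_cmplt (const float *a, const float *b, int *mask, int n)
{
  for (int i = 0; i < n; i++)
    mask[i] = (b[i] < a[i]) ? -1 : 0;
}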
*/ + if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) + std::swap (op0, op1); + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (d->icode) (op0, op1); + if (! pat) + return 0; + emit_insn (pat); + emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + SET_DEST (pat), + const0_rtx))); + + return SUBREG_REG (target); +} + +/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ + +static rtx +ix86_expand_sse_round (const struct builtin_description *d, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op1, op0 = expand_normal (arg0); + machine_mode tmode = insn_data[d->icode].operand[0].mode; + machine_mode mode0 = insn_data[d->icode].operand[1].mode; + + if (optimize || target == 0 + || GET_MODE (target) != tmode + || !insn_data[d->icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + + op1 = GEN_INT (d->comparison); + + pat = GEN_FCN (d->icode) (target, op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +static rtx +ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2; + machine_mode tmode = insn_data[d->icode].operand[0].mode; + machine_mode mode0 = insn_data[d->icode].operand[1].mode; + machine_mode mode1 = insn_data[d->icode].operand[2].mode; + + if (optimize || target == 0 + || GET_MODE (target) != tmode + || !insn_data[d->icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + op0 = safe_vector_operand (op0, mode0); + op1 = safe_vector_operand (op1, mode1); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + op2 = GEN_INT (d->comparison); + + pat = GEN_FCN (d->icode) (target, op0, op1, op2); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of ptest insns. 
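For illustration only: the comi expander above materializes the comparison flags as a 0/1 integer, which is how the scalar comparison intrinsics behave at the source level. A minimal usage sketch, assuming SSE2 and the standard <emmintrin.h> intrinsics:

#include <emmintrin.h>

/* Returns 1 when the low doubles compare equal, 0 otherwise;
   backed by a comisd followed by a flags-to-register sequence.  */
static int
doubles_equal (__m128d a, __m128d b)
{
  return _mm_comieq_sd (a, b);
}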
*/ + +static rtx +ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + machine_mode mode0 = insn_data[d->icode].operand[0].mode; + machine_mode mode1 = insn_data[d->icode].operand[1].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (d->icode) (op0, op1); + if (! pat) + return 0; + emit_insn (pat); + emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + SET_DEST (pat), + const0_rtx))); + + return SUBREG_REG (target); +} + +/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */ + +static rtx +ix86_expand_sse_pcmpestr (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + tree arg3 = CALL_EXPR_ARG (exp, 3); + tree arg4 = CALL_EXPR_ARG (exp, 4); + rtx scratch0, scratch1; + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + rtx op3 = expand_normal (arg3); + rtx op4 = expand_normal (arg4); + machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; + + tmode0 = insn_data[d->icode].operand[0].mode; + tmode1 = insn_data[d->icode].operand[1].mode; + modev2 = insn_data[d->icode].operand[2].mode; + modei3 = insn_data[d->icode].operand[3].mode; + modev4 = insn_data[d->icode].operand[4].mode; + modei5 = insn_data[d->icode].operand[5].mode; + modeimm = insn_data[d->icode].operand[6].mode; + + if (VECTOR_MODE_P (modev2)) + op0 = safe_vector_operand (op0, modev2); + if (VECTOR_MODE_P (modev4)) + op2 = safe_vector_operand (op2, modev4); + + if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) + op0 = copy_to_mode_reg (modev2, op0); + if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) + op1 = copy_to_mode_reg (modei3, op1); + if ((optimize && !register_operand (op2, modev4)) + || !insn_data[d->icode].operand[4].predicate (op2, modev4)) + op2 = copy_to_mode_reg (modev4, op2); + if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) + op3 = copy_to_mode_reg (modei5, op3); + + if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) + { + error ("the fifth argument must be an 8-bit immediate"); + return const0_rtx; + } + + if (d->code == IX86_BUILTIN_PCMPESTRI128) + { + if (optimize || !target + || GET_MODE (target) != tmode0 + || !insn_data[d->icode].operand[0].predicate (target, tmode0)) + target = gen_reg_rtx (tmode0); + + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); + } + else if (d->code == IX86_BUILTIN_PCMPESTRM128) + { + if (optimize || !target + || GET_MODE (target) != tmode1 + || !insn_data[d->icode].operand[1].predicate (target, tmode1)) + target = gen_reg_rtx (tmode1); + + 
scratch0 = gen_reg_rtx (tmode0); + + pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); + } + else + { + gcc_assert (d->flag); + + scratch0 = gen_reg_rtx (tmode0); + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); + } + + if (! pat) + return 0; + + emit_insn (pat); + + if (d->flag) + { + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + emit_insn + (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG ((machine_mode) d->flag, + FLAGS_REG), + const0_rtx))); + return SUBREG_REG (target); + } + else + return target; +} + + +/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */ + +static rtx +ix86_expand_sse_pcmpistr (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + rtx scratch0, scratch1; + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + machine_mode tmode0, tmode1, modev2, modev3, modeimm; + + tmode0 = insn_data[d->icode].operand[0].mode; + tmode1 = insn_data[d->icode].operand[1].mode; + modev2 = insn_data[d->icode].operand[2].mode; + modev3 = insn_data[d->icode].operand[3].mode; + modeimm = insn_data[d->icode].operand[4].mode; + + if (VECTOR_MODE_P (modev2)) + op0 = safe_vector_operand (op0, modev2); + if (VECTOR_MODE_P (modev3)) + op1 = safe_vector_operand (op1, modev3); + + if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) + op0 = copy_to_mode_reg (modev2, op0); + if ((optimize && !register_operand (op1, modev3)) + || !insn_data[d->icode].operand[3].predicate (op1, modev3)) + op1 = copy_to_mode_reg (modev3, op1); + + if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) + { + error ("the third argument must be an 8-bit immediate"); + return const0_rtx; + } + + if (d->code == IX86_BUILTIN_PCMPISTRI128) + { + if (optimize || !target + || GET_MODE (target) != tmode0 + || !insn_data[d->icode].operand[0].predicate (target, tmode0)) + target = gen_reg_rtx (tmode0); + + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); + } + else if (d->code == IX86_BUILTIN_PCMPISTRM128) + { + if (optimize || !target + || GET_MODE (target) != tmode1 + || !insn_data[d->icode].operand[1].predicate (target, tmode1)) + target = gen_reg_rtx (tmode1); + + scratch0 = gen_reg_rtx (tmode0); + + pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2); + } + else + { + gcc_assert (d->flag); + + scratch0 = gen_reg_rtx (tmode0); + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); + } + + if (! pat) + return 0; + + emit_insn (pat); + + if (d->flag) + { + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + emit_insn + (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG ((machine_mode) d->flag, + FLAGS_REG), + const0_rtx))); + return SUBREG_REG (target); + } + else + return target; +} + +/* Fixup modeless constants to fit required mode. */ + +static rtx +fixup_modeless_constant (rtx x, machine_mode mode) +{ + if (GET_MODE (x) == VOIDmode) + x = convert_to_mode (mode, x, 1); + return x; +} + +/* Subroutine of ix86_expand_builtin to take care of insns with + variable number of operands. 
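For illustration only: the pcmpistr expander above is normally reached through the SSE4.2 string intrinsics, and the mode argument must be a compile-time constant, which is what the "8-bit immediate" check enforces. A minimal usage sketch assuming <nmmintrin.h>:

#include <nmmintrin.h>

/* Index of the first byte of TEXT that matches any byte of SET,
   or 16 when there is no match within the 16-byte block.  */
static int
index_of_any (__m128i set, __m128i text)
{
  return _mm_cmpistri (set, text,
                       _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY
                       | _SIDD_LEAST_SIGNIFICANT);
}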
*/ + +static rtx +ix86_expand_args_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat, real_target; + unsigned int i, nargs; + unsigned int nargs_constant = 0; + unsigned int mask_pos = 0; + int num_memory = 0; + struct + { + rtx op; + machine_mode mode; + } args[6]; + bool second_arg_count = false; + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + machine_mode tmode = insn_p->operand[0].mode; + machine_mode rmode = VOIDmode; + bool swap = false; + enum rtx_code comparison = d->comparison; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case V2DF_FTYPE_V2DF_ROUND: + case V4DF_FTYPE_V4DF_ROUND: + case V8DF_FTYPE_V8DF_ROUND: + case V4SF_FTYPE_V4SF_ROUND: + case V8SF_FTYPE_V8SF_ROUND: + case V16SF_FTYPE_V16SF_ROUND: + case V4SI_FTYPE_V4SF_ROUND: + case V8SI_FTYPE_V8SF_ROUND: + case V16SI_FTYPE_V16SF_ROUND: + return ix86_expand_sse_round (d, exp, target); + case V4SI_FTYPE_V2DF_V2DF_ROUND: + case V8SI_FTYPE_V4DF_V4DF_ROUND: + case V16SI_FTYPE_V8DF_V8DF_ROUND: + return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); + case INT_FTYPE_V8SF_V8SF_PTEST: + case INT_FTYPE_V4DI_V4DI_PTEST: + case INT_FTYPE_V4DF_V4DF_PTEST: + case INT_FTYPE_V4SF_V4SF_PTEST: + case INT_FTYPE_V2DI_V2DI_PTEST: + case INT_FTYPE_V2DF_V2DF_PTEST: + return ix86_expand_sse_ptest (d, exp, target); + case FLOAT128_FTYPE_FLOAT128: + case FLOAT_FTYPE_FLOAT: + case INT_FTYPE_INT: + case UINT_FTYPE_UINT: + case UINT16_FTYPE_UINT16: + case UINT64_FTYPE_INT: + case UINT64_FTYPE_UINT64: + case INT64_FTYPE_INT64: + case INT64_FTYPE_V4SF: + case INT64_FTYPE_V2DF: + case INT_FTYPE_V16QI: + case INT_FTYPE_V8QI: + case INT_FTYPE_V8SF: + case INT_FTYPE_V4DF: + case INT_FTYPE_V4SF: + case INT_FTYPE_V2DF: + case INT_FTYPE_V32QI: + case V16QI_FTYPE_V16QI: + case V8SI_FTYPE_V8SF: + case V8SI_FTYPE_V4SI: + case V8HI_FTYPE_V8HI: + case V8HI_FTYPE_V16QI: + case V8QI_FTYPE_V8QI: + case V8SF_FTYPE_V8SF: + case V8SF_FTYPE_V8SI: + case V8SF_FTYPE_V4SF: + case V8SF_FTYPE_V8HI: + case V4SI_FTYPE_V4SI: + case V4SI_FTYPE_V16QI: + case V4SI_FTYPE_V4SF: + case V4SI_FTYPE_V8SI: + case V4SI_FTYPE_V8HI: + case V4SI_FTYPE_V4DF: + case V4SI_FTYPE_V2DF: + case V4HI_FTYPE_V4HI: + case V4DF_FTYPE_V4DF: + case V4DF_FTYPE_V4SI: + case V4DF_FTYPE_V4SF: + case V4DF_FTYPE_V2DF: + case V4SF_FTYPE_V4SF: + case V4SF_FTYPE_V4SI: + case V4SF_FTYPE_V8SF: + case V4SF_FTYPE_V4DF: + case V4SF_FTYPE_V8HI: + case V4SF_FTYPE_V2DF: + case V2DI_FTYPE_V2DI: + case V2DI_FTYPE_V16QI: + case V2DI_FTYPE_V8HI: + case V2DI_FTYPE_V4SI: + case V2DF_FTYPE_V2DF: + case V2DF_FTYPE_V4SI: + case V2DF_FTYPE_V4DF: + case V2DF_FTYPE_V4SF: + case V2DF_FTYPE_V2SI: + case V2SI_FTYPE_V2SI: + case V2SI_FTYPE_V4SF: + case V2SI_FTYPE_V2SF: + case V2SI_FTYPE_V2DF: + case V2SF_FTYPE_V2SF: + case V2SF_FTYPE_V2SI: + case V32QI_FTYPE_V32QI: + case V32QI_FTYPE_V16QI: + case V16HI_FTYPE_V16HI: + case V16HI_FTYPE_V8HI: + case V8SI_FTYPE_V8SI: + case V16HI_FTYPE_V16QI: + case V8SI_FTYPE_V16QI: + case V4DI_FTYPE_V16QI: + case V8SI_FTYPE_V8HI: + case V4DI_FTYPE_V8HI: + case V4DI_FTYPE_V4SI: + case V4DI_FTYPE_V2DI: + case UQI_FTYPE_UQI: + case UHI_FTYPE_UHI: + case USI_FTYPE_USI: + case USI_FTYPE_UQI: + case USI_FTYPE_UHI: + case UDI_FTYPE_UDI: + case UHI_FTYPE_V16QI: + case USI_FTYPE_V32QI: + case UDI_FTYPE_V64QI: + case V16QI_FTYPE_UHI: + case V32QI_FTYPE_USI: + case V64QI_FTYPE_UDI: + case V8HI_FTYPE_UQI: + case V16HI_FTYPE_UHI: + case V32HI_FTYPE_USI: + case V4SI_FTYPE_UQI: + case V8SI_FTYPE_UQI: + case V4SI_FTYPE_UHI: + case 
V8SI_FTYPE_UHI: + case UQI_FTYPE_V8HI: + case UHI_FTYPE_V16HI: + case USI_FTYPE_V32HI: + case UQI_FTYPE_V4SI: + case UQI_FTYPE_V8SI: + case UHI_FTYPE_V16SI: + case UQI_FTYPE_V2DI: + case UQI_FTYPE_V4DI: + case UQI_FTYPE_V8DI: + case V16SI_FTYPE_UHI: + case V2DI_FTYPE_UQI: + case V4DI_FTYPE_UQI: + case V16SI_FTYPE_INT: + case V16SF_FTYPE_V8SF: + case V16SI_FTYPE_V8SI: + case V16SF_FTYPE_V4SF: + case V16SI_FTYPE_V4SI: + case V16SI_FTYPE_V16SF: + case V16SI_FTYPE_V16SI: + case V64QI_FTYPE_V64QI: + case V32HI_FTYPE_V32HI: + case V16SF_FTYPE_V16SF: + case V8DI_FTYPE_UQI: + case V8DI_FTYPE_V8DI: + case V8DF_FTYPE_V4DF: + case V8DF_FTYPE_V2DF: + case V8DF_FTYPE_V8DF: + case V4DI_FTYPE_V4DI: + nargs = 1; + break; + case V4SF_FTYPE_V4SF_VEC_MERGE: + case V2DF_FTYPE_V2DF_VEC_MERGE: + return ix86_expand_unop_vec_merge_builtin (icode, exp, target); + case FLOAT128_FTYPE_FLOAT128_FLOAT128: + case V16QI_FTYPE_V16QI_V16QI: + case V16QI_FTYPE_V8HI_V8HI: + case V16SF_FTYPE_V16SF_V16SF: + case V8QI_FTYPE_V8QI_V8QI: + case V8QI_FTYPE_V4HI_V4HI: + case V8HI_FTYPE_V8HI_V8HI: + case V8HI_FTYPE_V16QI_V16QI: + case V8HI_FTYPE_V4SI_V4SI: + case V8SF_FTYPE_V8SF_V8SF: + case V8SF_FTYPE_V8SF_V8SI: + case V8DF_FTYPE_V8DF_V8DF: + case V4SI_FTYPE_V4SI_V4SI: + case V4SI_FTYPE_V8HI_V8HI: + case V4SI_FTYPE_V2DF_V2DF: + case V4HI_FTYPE_V4HI_V4HI: + case V4HI_FTYPE_V8QI_V8QI: + case V4HI_FTYPE_V2SI_V2SI: + case V4DF_FTYPE_V4DF_V4DF: + case V4DF_FTYPE_V4DF_V4DI: + case V4SF_FTYPE_V4SF_V4SF: + case V4SF_FTYPE_V4SF_V4SI: + case V4SF_FTYPE_V4SF_V2SI: + case V4SF_FTYPE_V4SF_V2DF: + case V4SF_FTYPE_V4SF_UINT: + case V4SF_FTYPE_V4SF_DI: + case V4SF_FTYPE_V4SF_SI: + case V2DI_FTYPE_V2DI_V2DI: + case V2DI_FTYPE_V16QI_V16QI: + case V2DI_FTYPE_V4SI_V4SI: + case V2DI_FTYPE_V2DI_V16QI: + case V2SI_FTYPE_V2SI_V2SI: + case V2SI_FTYPE_V4HI_V4HI: + case V2SI_FTYPE_V2SF_V2SF: + case V2DF_FTYPE_V2DF_V2DF: + case V2DF_FTYPE_V2DF_V4SF: + case V2DF_FTYPE_V2DF_V2DI: + case V2DF_FTYPE_V2DF_DI: + case V2DF_FTYPE_V2DF_SI: + case V2DF_FTYPE_V2DF_UINT: + case V2SF_FTYPE_V2SF_V2SF: + case V1DI_FTYPE_V1DI_V1DI: + case V1DI_FTYPE_V8QI_V8QI: + case V1DI_FTYPE_V2SI_V2SI: + case V32QI_FTYPE_V16HI_V16HI: + case V16HI_FTYPE_V8SI_V8SI: + case V64QI_FTYPE_V64QI_V64QI: + case V32QI_FTYPE_V32QI_V32QI: + case V16HI_FTYPE_V32QI_V32QI: + case V16HI_FTYPE_V16HI_V16HI: + case V8SI_FTYPE_V4DF_V4DF: + case V8SI_FTYPE_V8SI_V8SI: + case V8SI_FTYPE_V16HI_V16HI: + case V4DI_FTYPE_V4DI_V4DI: + case V4DI_FTYPE_V8SI_V8SI: + case V8DI_FTYPE_V64QI_V64QI: + if (comparison == UNKNOWN) + return ix86_expand_binop_builtin (icode, exp, target); + nargs = 2; + break; + case V4SF_FTYPE_V4SF_V4SF_SWAP: + case V2DF_FTYPE_V2DF_V2DF_SWAP: + gcc_assert (comparison != UNKNOWN); + nargs = 2; + swap = true; + break; + case V16HI_FTYPE_V16HI_V8HI_COUNT: + case V16HI_FTYPE_V16HI_SI_COUNT: + case V8SI_FTYPE_V8SI_V4SI_COUNT: + case V8SI_FTYPE_V8SI_SI_COUNT: + case V4DI_FTYPE_V4DI_V2DI_COUNT: + case V4DI_FTYPE_V4DI_INT_COUNT: + case V8HI_FTYPE_V8HI_V8HI_COUNT: + case V8HI_FTYPE_V8HI_SI_COUNT: + case V4SI_FTYPE_V4SI_V4SI_COUNT: + case V4SI_FTYPE_V4SI_SI_COUNT: + case V4HI_FTYPE_V4HI_V4HI_COUNT: + case V4HI_FTYPE_V4HI_SI_COUNT: + case V2DI_FTYPE_V2DI_V2DI_COUNT: + case V2DI_FTYPE_V2DI_SI_COUNT: + case V2SI_FTYPE_V2SI_V2SI_COUNT: + case V2SI_FTYPE_V2SI_SI_COUNT: + case V1DI_FTYPE_V1DI_V1DI_COUNT: + case V1DI_FTYPE_V1DI_SI_COUNT: + nargs = 2; + second_arg_count = true; + break; + case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT: + case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT: + case 
V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT: + case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT: + case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT: + case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT: + case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT: + case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT: + case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT: + case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT: + case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT: + case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT: + case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT: + case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT: + case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT: + case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT: + case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT: + case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT: + nargs = 4; + second_arg_count = true; + break; + case UINT64_FTYPE_UINT64_UINT64: + case UINT_FTYPE_UINT_UINT: + case UINT_FTYPE_UINT_USHORT: + case UINT_FTYPE_UINT_UCHAR: + case UINT16_FTYPE_UINT16_INT: + case UINT8_FTYPE_UINT8_INT: + case UQI_FTYPE_UQI_UQI: + case UHI_FTYPE_UHI_UHI: + case USI_FTYPE_USI_USI: + case UDI_FTYPE_UDI_UDI: + case V16SI_FTYPE_V8DF_V8DF: + nargs = 2; + break; + case V2DI_FTYPE_V2DI_INT_CONVERT: + nargs = 2; + rmode = V1TImode; + nargs_constant = 1; + break; + case V4DI_FTYPE_V4DI_INT_CONVERT: + nargs = 2; + rmode = V2TImode; + nargs_constant = 1; + break; + case V8DI_FTYPE_V8DI_INT_CONVERT: + nargs = 2; + rmode = V4TImode; + nargs_constant = 1; + break; + case V8HI_FTYPE_V8HI_INT: + case V8HI_FTYPE_V8SF_INT: + case V16HI_FTYPE_V16SF_INT: + case V8HI_FTYPE_V4SF_INT: + case V8SF_FTYPE_V8SF_INT: + case V4SF_FTYPE_V16SF_INT: + case V16SF_FTYPE_V16SF_INT: + case V4SI_FTYPE_V4SI_INT: + case V4SI_FTYPE_V8SI_INT: + case V4HI_FTYPE_V4HI_INT: + case V4DF_FTYPE_V4DF_INT: + case V4DF_FTYPE_V8DF_INT: + case V4SF_FTYPE_V4SF_INT: + case V4SF_FTYPE_V8SF_INT: + case V2DI_FTYPE_V2DI_INT: + case V2DF_FTYPE_V2DF_INT: + case V2DF_FTYPE_V4DF_INT: + case V16HI_FTYPE_V16HI_INT: + case V8SI_FTYPE_V8SI_INT: + case V16SI_FTYPE_V16SI_INT: + case V4SI_FTYPE_V16SI_INT: + case V4DI_FTYPE_V4DI_INT: + case V2DI_FTYPE_V4DI_INT: + case V4DI_FTYPE_V8DI_INT: + case QI_FTYPE_V4SF_INT: + case QI_FTYPE_V2DF_INT: + case UQI_FTYPE_UQI_UQI_CONST: + case UHI_FTYPE_UHI_UQI: + case USI_FTYPE_USI_UQI: + case UDI_FTYPE_UDI_UQI: + nargs = 2; + nargs_constant = 1; + break; + case V16QI_FTYPE_V16QI_V16QI_V16QI: + case V8SF_FTYPE_V8SF_V8SF_V8SF: + case V4DF_FTYPE_V4DF_V4DF_V4DF: + case V4SF_FTYPE_V4SF_V4SF_V4SF: + case V2DF_FTYPE_V2DF_V2DF_V2DF: + case V32QI_FTYPE_V32QI_V32QI_V32QI: + case UHI_FTYPE_V16SI_V16SI_UHI: + case UQI_FTYPE_V8DI_V8DI_UQI: + case V16HI_FTYPE_V16SI_V16HI_UHI: + case V16QI_FTYPE_V16SI_V16QI_UHI: + case V16QI_FTYPE_V8DI_V16QI_UQI: + case V16SF_FTYPE_V16SF_V16SF_UHI: + case V16SF_FTYPE_V4SF_V16SF_UHI: + case V16SI_FTYPE_SI_V16SI_UHI: + case V16SI_FTYPE_V16HI_V16SI_UHI: + case V16SI_FTYPE_V16QI_V16SI_UHI: + case V8SF_FTYPE_V4SF_V8SF_UQI: + case V4DF_FTYPE_V2DF_V4DF_UQI: + case V8SI_FTYPE_V4SI_V8SI_UQI: + case V8SI_FTYPE_SI_V8SI_UQI: + case V4SI_FTYPE_V4SI_V4SI_UQI: + case V4SI_FTYPE_SI_V4SI_UQI: + case V4DI_FTYPE_V2DI_V4DI_UQI: + case V4DI_FTYPE_DI_V4DI_UQI: + case V2DI_FTYPE_V2DI_V2DI_UQI: + case V2DI_FTYPE_DI_V2DI_UQI: + case V64QI_FTYPE_V64QI_V64QI_UDI: + case V64QI_FTYPE_V16QI_V64QI_UDI: + case V64QI_FTYPE_QI_V64QI_UDI: + case V32QI_FTYPE_V32QI_V32QI_USI: + case V32QI_FTYPE_V16QI_V32QI_USI: + case V32QI_FTYPE_QI_V32QI_USI: + case V16QI_FTYPE_V16QI_V16QI_UHI: + case V16QI_FTYPE_QI_V16QI_UHI: + case V32HI_FTYPE_V8HI_V32HI_USI: + case V32HI_FTYPE_HI_V32HI_USI: + case V16HI_FTYPE_V8HI_V16HI_UHI: + case 
V16HI_FTYPE_HI_V16HI_UHI: + case V8HI_FTYPE_V8HI_V8HI_UQI: + case V8HI_FTYPE_HI_V8HI_UQI: + case V8SF_FTYPE_V8HI_V8SF_UQI: + case V4SF_FTYPE_V8HI_V4SF_UQI: + case V8SI_FTYPE_V8SF_V8SI_UQI: + case V4SI_FTYPE_V4SF_V4SI_UQI: + case V4DI_FTYPE_V4SF_V4DI_UQI: + case V2DI_FTYPE_V4SF_V2DI_UQI: + case V4SF_FTYPE_V4DI_V4SF_UQI: + case V4SF_FTYPE_V2DI_V4SF_UQI: + case V4DF_FTYPE_V4DI_V4DF_UQI: + case V2DF_FTYPE_V2DI_V2DF_UQI: + case V16QI_FTYPE_V8HI_V16QI_UQI: + case V16QI_FTYPE_V16HI_V16QI_UHI: + case V16QI_FTYPE_V4SI_V16QI_UQI: + case V16QI_FTYPE_V8SI_V16QI_UQI: + case V8HI_FTYPE_V4SI_V8HI_UQI: + case V8HI_FTYPE_V8SI_V8HI_UQI: + case V16QI_FTYPE_V2DI_V16QI_UQI: + case V16QI_FTYPE_V4DI_V16QI_UQI: + case V8HI_FTYPE_V2DI_V8HI_UQI: + case V8HI_FTYPE_V4DI_V8HI_UQI: + case V4SI_FTYPE_V2DI_V4SI_UQI: + case V4SI_FTYPE_V4DI_V4SI_UQI: + case V32QI_FTYPE_V32HI_V32QI_USI: + case UHI_FTYPE_V16QI_V16QI_UHI: + case USI_FTYPE_V32QI_V32QI_USI: + case UDI_FTYPE_V64QI_V64QI_UDI: + case UQI_FTYPE_V8HI_V8HI_UQI: + case UHI_FTYPE_V16HI_V16HI_UHI: + case USI_FTYPE_V32HI_V32HI_USI: + case UQI_FTYPE_V4SI_V4SI_UQI: + case UQI_FTYPE_V8SI_V8SI_UQI: + case UQI_FTYPE_V2DI_V2DI_UQI: + case UQI_FTYPE_V4DI_V4DI_UQI: + case V4SF_FTYPE_V2DF_V4SF_UQI: + case V4SF_FTYPE_V4DF_V4SF_UQI: + case V16SI_FTYPE_V16SI_V16SI_UHI: + case V16SI_FTYPE_V4SI_V16SI_UHI: + case V2DI_FTYPE_V4SI_V2DI_UQI: + case V2DI_FTYPE_V8HI_V2DI_UQI: + case V2DI_FTYPE_V16QI_V2DI_UQI: + case V4DI_FTYPE_V4DI_V4DI_UQI: + case V4DI_FTYPE_V4SI_V4DI_UQI: + case V4DI_FTYPE_V8HI_V4DI_UQI: + case V4DI_FTYPE_V16QI_V4DI_UQI: + case V4DI_FTYPE_V4DF_V4DI_UQI: + case V2DI_FTYPE_V2DF_V2DI_UQI: + case V4SI_FTYPE_V4DF_V4SI_UQI: + case V4SI_FTYPE_V2DF_V4SI_UQI: + case V4SI_FTYPE_V8HI_V4SI_UQI: + case V4SI_FTYPE_V16QI_V4SI_UQI: + case V4DI_FTYPE_V4DI_V4DI_V4DI: + case V8DF_FTYPE_V2DF_V8DF_UQI: + case V8DF_FTYPE_V4DF_V8DF_UQI: + case V8DF_FTYPE_V8DF_V8DF_UQI: + case V8SF_FTYPE_V8SF_V8SF_UQI: + case V8SF_FTYPE_V8SI_V8SF_UQI: + case V4DF_FTYPE_V4DF_V4DF_UQI: + case V4SF_FTYPE_V4SF_V4SF_UQI: + case V2DF_FTYPE_V2DF_V2DF_UQI: + case V2DF_FTYPE_V4SF_V2DF_UQI: + case V2DF_FTYPE_V4SI_V2DF_UQI: + case V4SF_FTYPE_V4SI_V4SF_UQI: + case V4DF_FTYPE_V4SF_V4DF_UQI: + case V4DF_FTYPE_V4SI_V4DF_UQI: + case V8SI_FTYPE_V8SI_V8SI_UQI: + case V8SI_FTYPE_V8HI_V8SI_UQI: + case V8SI_FTYPE_V16QI_V8SI_UQI: + case V8DF_FTYPE_V8SI_V8DF_UQI: + case V8DI_FTYPE_DI_V8DI_UQI: + case V16SF_FTYPE_V8SF_V16SF_UHI: + case V16SI_FTYPE_V8SI_V16SI_UHI: + case V16HI_FTYPE_V16HI_V16HI_UHI: + case V8HI_FTYPE_V16QI_V8HI_UQI: + case V16HI_FTYPE_V16QI_V16HI_UHI: + case V32HI_FTYPE_V32HI_V32HI_USI: + case V32HI_FTYPE_V32QI_V32HI_USI: + case V8DI_FTYPE_V16QI_V8DI_UQI: + case V8DI_FTYPE_V2DI_V8DI_UQI: + case V8DI_FTYPE_V4DI_V8DI_UQI: + case V8DI_FTYPE_V8DI_V8DI_UQI: + case V8DI_FTYPE_V8HI_V8DI_UQI: + case V8DI_FTYPE_V8SI_V8DI_UQI: + case V8HI_FTYPE_V8DI_V8HI_UQI: + case V8SI_FTYPE_V8DI_V8SI_UQI: + case V4SI_FTYPE_V4SI_V4SI_V4SI: + case V16SI_FTYPE_V16SI_V16SI_V16SI: + case V8DI_FTYPE_V8DI_V8DI_V8DI: + case V32HI_FTYPE_V32HI_V32HI_V32HI: + case V2DI_FTYPE_V2DI_V2DI_V2DI: + case V16HI_FTYPE_V16HI_V16HI_V16HI: + case V8SI_FTYPE_V8SI_V8SI_V8SI: + case V8HI_FTYPE_V8HI_V8HI_V8HI: + nargs = 3; + break; + case V32QI_FTYPE_V32QI_V32QI_INT: + case V16HI_FTYPE_V16HI_V16HI_INT: + case V16QI_FTYPE_V16QI_V16QI_INT: + case V4DI_FTYPE_V4DI_V4DI_INT: + case V8HI_FTYPE_V8HI_V8HI_INT: + case V8SI_FTYPE_V8SI_V8SI_INT: + case V8SI_FTYPE_V8SI_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_INT: + case V8SF_FTYPE_V8SF_V4SF_INT: + case V4SI_FTYPE_V4SI_V4SI_INT: + 
case V4DF_FTYPE_V4DF_V4DF_INT: + case V16SF_FTYPE_V16SF_V16SF_INT: + case V16SF_FTYPE_V16SF_V4SF_INT: + case V16SI_FTYPE_V16SI_V4SI_INT: + case V4DF_FTYPE_V4DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V4SF_INT: + case V2DI_FTYPE_V2DI_V2DI_INT: + case V4DI_FTYPE_V4DI_V2DI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT: + case UQI_FTYPE_V8DI_V8UDI_INT: + case UQI_FTYPE_V8DF_V8DF_INT: + case UQI_FTYPE_V2DF_V2DF_INT: + case UQI_FTYPE_V4SF_V4SF_INT: + case UHI_FTYPE_V16SI_V16SI_INT: + case UHI_FTYPE_V16SF_V16SF_INT: + case V64QI_FTYPE_V64QI_V64QI_INT: + case V32HI_FTYPE_V32HI_V32HI_INT: + case V16SI_FTYPE_V16SI_V16SI_INT: + case V8DI_FTYPE_V8DI_V8DI_INT: + nargs = 3; + nargs_constant = 1; + break; + case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: + nargs = 3; + rmode = V4DImode; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: + nargs = 3; + rmode = V2DImode; + nargs_constant = 1; + break; + case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: + nargs = 3; + rmode = DImode; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_UINT_UINT: + nargs = 3; + nargs_constant = 2; + break; + case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT: + nargs = 3; + rmode = V8DImode; + nargs_constant = 1; + break; + case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT: + nargs = 5; + rmode = V8DImode; + mask_pos = 2; + nargs_constant = 1; + break; + case QI_FTYPE_V8DF_INT_UQI: + case QI_FTYPE_V4DF_INT_UQI: + case QI_FTYPE_V2DF_INT_UQI: + case HI_FTYPE_V16SF_INT_UHI: + case QI_FTYPE_V8SF_INT_UQI: + case QI_FTYPE_V4SF_INT_UQI: + case V4SI_FTYPE_V4SI_V4SI_UHI: + case V8SI_FTYPE_V8SI_V8SI_UHI: + nargs = 3; + mask_pos = 1; + nargs_constant = 1; + break; + case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT: + nargs = 5; + rmode = V4DImode; + mask_pos = 2; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT: + nargs = 5; + rmode = V2DImode; + mask_pos = 2; + nargs_constant = 1; + break; + case V32QI_FTYPE_V32QI_V32QI_V32QI_USI: + case V32HI_FTYPE_V32HI_V32HI_V32HI_USI: + case V32HI_FTYPE_V64QI_V64QI_V32HI_USI: + case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI: + case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI: + case V32HI_FTYPE_V32HI_V8HI_V32HI_USI: + case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI: + case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI: + case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI: + case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI: + case V32QI_FTYPE_V16HI_V16HI_V32QI_USI: + case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI: + case V32HI_FTYPE_V16SI_V16SI_V32HI_USI: + case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: + case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: + case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: + case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: + case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: + case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: + case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI: + case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI: + case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI: + case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI: + case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI: + case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI: + case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI: + case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI: + case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI: + case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI: + case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI: + case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: + case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: + case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: + case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: + case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: + case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: + case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI: + case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: + case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: + case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: + case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: + case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: + case 
V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: + case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI: + case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI: + case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI: + case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI: + case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI: + case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: + case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: + case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: + nargs = 4; + break; + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: + nargs = 4; + nargs_constant = 1; + break; + case UQI_FTYPE_V4DI_V4DI_INT_UQI: + case UQI_FTYPE_V8SI_V8SI_INT_UQI: + case QI_FTYPE_V4DF_V4DF_INT_UQI: + case QI_FTYPE_V8SF_V8SF_INT_UQI: + case UQI_FTYPE_V2DI_V2DI_INT_UQI: + case UQI_FTYPE_V4SI_V4SI_INT_UQI: + case UQI_FTYPE_V2DF_V2DF_INT_UQI: + case UQI_FTYPE_V4SF_V4SF_INT_UQI: + case UDI_FTYPE_V64QI_V64QI_INT_UDI: + case USI_FTYPE_V32QI_V32QI_INT_USI: + case UHI_FTYPE_V16QI_V16QI_INT_UHI: + case USI_FTYPE_V32HI_V32HI_INT_USI: + case UHI_FTYPE_V16HI_V16HI_INT_UHI: + case UQI_FTYPE_V8HI_V8HI_INT_UQI: + case V32HI_FTYPE_V32HI_V32HI_V32HI_INT: + case V16HI_FTYPE_V16HI_V16HI_V16HI_INT: + case V8HI_FTYPE_V8HI_V8HI_V8HI_INT: + case V8SI_FTYPE_V8SI_V8SI_V8SI_INT: + case V4DI_FTYPE_V4DI_V4DI_V4DI_INT: + case V8DI_FTYPE_V8DI_V8DI_V8DI_INT: + case V16SI_FTYPE_V16SI_V16SI_V16SI_INT: + case V2DI_FTYPE_V2DI_V2DI_V2DI_INT: + case V4SI_FTYPE_V4SI_V4SI_V4SI_INT: + nargs = 4; + mask_pos = 1; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: + nargs = 4; + nargs_constant = 2; + break; + case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: + case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: + nargs = 4; + break; + case UQI_FTYPE_V8DI_V8DI_INT_UQI: + case UHI_FTYPE_V16SI_V16SI_INT_UHI: + mask_pos = 1; + nargs = 4; + nargs_constant = 1; + break; + case V8SF_FTYPE_V8SF_INT_V8SF_UQI: + case V4SF_FTYPE_V4SF_INT_V4SF_UQI: + case V2DF_FTYPE_V4DF_INT_V2DF_UQI: + case V2DI_FTYPE_V4DI_INT_V2DI_UQI: + case V8SF_FTYPE_V16SF_INT_V8SF_UQI: + case V8SI_FTYPE_V16SI_INT_V8SI_UQI: + case V2DF_FTYPE_V8DF_INT_V2DF_UQI: + case V2DI_FTYPE_V8DI_INT_V2DI_UQI: + case V4SF_FTYPE_V8SF_INT_V4SF_UQI: + case V4SI_FTYPE_V8SI_INT_V4SI_UQI: + case V8HI_FTYPE_V8SF_INT_V8HI_UQI: + case V8HI_FTYPE_V4SF_INT_V8HI_UQI: + case V32HI_FTYPE_V32HI_INT_V32HI_USI: + case V16HI_FTYPE_V16HI_INT_V16HI_UHI: + case V8HI_FTYPE_V8HI_INT_V8HI_UQI: + case V4DI_FTYPE_V4DI_INT_V4DI_UQI: + case V2DI_FTYPE_V2DI_INT_V2DI_UQI: + case V8SI_FTYPE_V8SI_INT_V8SI_UQI: + case V4SI_FTYPE_V4SI_INT_V4SI_UQI: + case V4DF_FTYPE_V4DF_INT_V4DF_UQI: + case V2DF_FTYPE_V2DF_INT_V2DF_UQI: + case V8DF_FTYPE_V8DF_INT_V8DF_UQI: + case V16SF_FTYPE_V16SF_INT_V16SF_UHI: + case V16HI_FTYPE_V16SF_INT_V16HI_UHI: + case V16SI_FTYPE_V16SI_INT_V16SI_UHI: + case V4SI_FTYPE_V16SI_INT_V4SI_UQI: + case V4DI_FTYPE_V8DI_INT_V4DI_UQI: + case V4DF_FTYPE_V8DF_INT_V4DF_UQI: + case V4SF_FTYPE_V16SF_INT_V4SF_UQI: + case V8DI_FTYPE_V8DI_INT_V8DI_UQI: + nargs = 4; + mask_pos = 2; + nargs_constant = 1; + break; + case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI: + case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI: + case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI: + case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI: + case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI: + case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI: + case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI: + case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI: + case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI: + case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI: + case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI: + case 
V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI: + case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI: + case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI: + case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI: + case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI: + case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI: + case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI: + case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI: + case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI: + case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI: + case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI: + case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI: + case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI: + case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI: + case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI: + case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI: + nargs = 5; + mask_pos = 2; + nargs_constant = 1; + break; + case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI: + case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI: + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI: + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI: + case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI: + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI: + case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI: + case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI: + case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI: + nargs = 5; + mask_pos = 1; + nargs_constant = 1; + break; + case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: + case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: + case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: + case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT: + case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT: + case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT: + case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT: + case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT: + case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT: + case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT: + case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT: + case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT: + nargs = 5; + mask_pos = 1; + nargs_constant = 2; + break; + + default: + gcc_unreachable (); + } + + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (comparison != UNKNOWN) + { + gcc_assert (nargs == 2); + return ix86_expand_sse_compare (d, exp, target, swap); + } + + if (rmode == VOIDmode || rmode == tmode) + { + if (optimize + || target == 0 + || GET_MODE (target) != tmode + || !insn_p->operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + else if (memory_operand (target, tmode)) + num_memory++; + real_target = target; + } + else + { + real_target = gen_reg_rtx (tmode); + target = lowpart_subreg (rmode, real_target, tmode); + } + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + machine_mode mode = insn_p->operand[i + 1].mode; + bool match = insn_p->operand[i + 1].predicate (op, mode); + + if (second_arg_count && i == 1) + { + /* SIMD shift insns take either an 8-bit immediate or + register as count. But builtin functions take int as + count. If count doesn't match, we put it in register. + The instructions are using 64-bit count, if op is just + 32-bit, zero-extend it, as negative shift counts + are undefined behavior and zero-extension is more + efficient. 
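Illustrative sketch (not part of the patch itself): the second_arg_count handling described in the comment above implements the user-visible contract of the SSE shift intrinsics, whose builtins take an int count. A minimal C example assuming only <immintrin.h> and SSE2; counts of 32 or more (or "negative" counts) give an all-zero vector because the count is zero-extended and used as a 64-bit register count.

    #include <immintrin.h>

    /* Variable count: the expander moves N into a register, so lanes are
       zeroed once N is out of range instead of the count wrapping.  */
    __m128i
    shift_lanes (__m128i x, int n)
    {
      return _mm_slli_epi32 (x, n);
    }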
*/ + if (!match) + { + if (SCALAR_INT_MODE_P (GET_MODE (op))) + op = convert_modes (mode, GET_MODE (op), op, 1); + else + op = lowpart_subreg (mode, op, GET_MODE (op)); + if (!insn_p->operand[i + 1].predicate (op, mode)) + op = copy_to_reg (op); + } + } + else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || + (!mask_pos && (nargs - i) <= nargs_constant)) + { + if (!match) + switch (icode) + { + case CODE_FOR_avx_vinsertf128v4di: + case CODE_FOR_avx_vextractf128v4di: + error ("the last argument must be an 1-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx512f_cmpv8di3_mask: + case CODE_FOR_avx512f_cmpv16si3_mask: + case CODE_FOR_avx512f_ucmpv8di3_mask: + case CODE_FOR_avx512f_ucmpv16si3_mask: + case CODE_FOR_avx512vl_cmpv4di3_mask: + case CODE_FOR_avx512vl_cmpv8si3_mask: + case CODE_FOR_avx512vl_ucmpv4di3_mask: + case CODE_FOR_avx512vl_ucmpv8si3_mask: + case CODE_FOR_avx512vl_cmpv2di3_mask: + case CODE_FOR_avx512vl_cmpv4si3_mask: + case CODE_FOR_avx512vl_ucmpv2di3_mask: + case CODE_FOR_avx512vl_ucmpv4si3_mask: + error ("the last argument must be a 3-bit immediate"); + return const0_rtx; + + case CODE_FOR_sse4_1_roundsd: + case CODE_FOR_sse4_1_roundss: + + case CODE_FOR_sse4_1_roundpd: + case CODE_FOR_sse4_1_roundps: + case CODE_FOR_avx_roundpd256: + case CODE_FOR_avx_roundps256: + + case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: + case CODE_FOR_sse4_1_roundps_sfix: + case CODE_FOR_avx_roundpd_vec_pack_sfix256: + case CODE_FOR_avx_roundps_sfix256: + + case CODE_FOR_sse4_1_blendps: + case CODE_FOR_avx_blendpd256: + case CODE_FOR_avx_vpermilv4df: + case CODE_FOR_avx_vpermilv4df_mask: + case CODE_FOR_avx512f_getmantv8df_mask: + case CODE_FOR_avx512f_getmantv16sf_mask: + case CODE_FOR_avx512vl_getmantv8sf_mask: + case CODE_FOR_avx512vl_getmantv4df_mask: + case CODE_FOR_avx512vl_getmantv4sf_mask: + case CODE_FOR_avx512vl_getmantv2df_mask: + case CODE_FOR_avx512dq_rangepv8df_mask_round: + case CODE_FOR_avx512dq_rangepv16sf_mask_round: + case CODE_FOR_avx512dq_rangepv4df_mask: + case CODE_FOR_avx512dq_rangepv8sf_mask: + case CODE_FOR_avx512dq_rangepv2df_mask: + case CODE_FOR_avx512dq_rangepv4sf_mask: + case CODE_FOR_avx_shufpd256_mask: + error ("the last argument must be a 4-bit immediate"); + return const0_rtx; + + case CODE_FOR_sha1rnds4: + case CODE_FOR_sse4_1_blendpd: + case CODE_FOR_avx_vpermilv2df: + case CODE_FOR_avx_vpermilv2df_mask: + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case CODE_FOR_xop_vpermil2v8sf3: + case CODE_FOR_avx512f_vinsertf32x4_mask: + case CODE_FOR_avx512f_vinserti32x4_mask: + case CODE_FOR_avx512f_vextractf32x4_mask: + case CODE_FOR_avx512f_vextracti32x4_mask: + case CODE_FOR_sse2_shufpd: + case CODE_FOR_sse2_shufpd_mask: + case CODE_FOR_avx512dq_shuf_f64x2_mask: + case CODE_FOR_avx512dq_shuf_i64x2_mask: + case CODE_FOR_avx512vl_shuf_i32x4_mask: + case CODE_FOR_avx512vl_shuf_f32x4_mask: + error ("the last argument must be a 2-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx_vextractf128v4df: + case CODE_FOR_avx_vextractf128v8sf: + case CODE_FOR_avx_vextractf128v8si: + case CODE_FOR_avx_vinsertf128v4df: + case CODE_FOR_avx_vinsertf128v8sf: + case CODE_FOR_avx_vinsertf128v8si: + case CODE_FOR_avx512f_vinsertf64x4_mask: + case CODE_FOR_avx512f_vinserti64x4_mask: + case CODE_FOR_avx512f_vextractf64x4_mask: + case CODE_FOR_avx512f_vextracti64x4_mask: + case CODE_FOR_avx512dq_vinsertf32x8_mask: + case CODE_FOR_avx512dq_vinserti32x8_mask: + case CODE_FOR_avx512vl_vinsertv4df: + case 
CODE_FOR_avx512vl_vinsertv4di: + case CODE_FOR_avx512vl_vinsertv8sf: + case CODE_FOR_avx512vl_vinsertv8si: + error ("the last argument must be a 1-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx_vmcmpv2df3: + case CODE_FOR_avx_vmcmpv4sf3: + case CODE_FOR_avx_cmpv2df3: + case CODE_FOR_avx_cmpv4sf3: + case CODE_FOR_avx_cmpv4df3: + case CODE_FOR_avx_cmpv8sf3: + case CODE_FOR_avx512f_cmpv8df3_mask: + case CODE_FOR_avx512f_cmpv16sf3_mask: + case CODE_FOR_avx512f_vmcmpv2df3_mask: + case CODE_FOR_avx512f_vmcmpv4sf3_mask: + error ("the last argument must be a 5-bit immediate"); + return const0_rtx; + + default: + switch (nargs_constant) + { + case 2: + if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || + (!mask_pos && (nargs - i) == nargs_constant)) + { + error ("the next to last argument must be an 8-bit immediate"); + break; + } + /* FALLTHRU */ + case 1: + error ("the last argument must be an 8-bit immediate"); + break; + default: + gcc_unreachable (); + } + return const0_rtx; + } + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + /* If we aren't optimizing, only allow one memory operand to + be generated. */ + if (memory_operand (op, mode)) + num_memory++; + + op = fixup_modeless_constant (op, mode); + + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + { + if (optimize || !match || num_memory > 1) + op = copy_to_mode_reg (mode, op); + } + else + { + op = copy_to_reg (op); + op = lowpart_subreg (mode, op, GET_MODE (op)); + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (real_target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op); + break; + case 4: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op); + break; + case 5: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op); + break; + case 6: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op, + args[5].op); + break; + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + + emit_insn (pat); + return target; +} + +/* Transform pattern of following layout: + (set A + (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) + ) + into: + (set (A B)) */ + +static rtx +ix86_erase_embedded_rounding (rtx pat) +{ + if (GET_CODE (pat) == INSN) + pat = PATTERN (pat); + + gcc_assert (GET_CODE (pat) == SET); + rtx src = SET_SRC (pat); + gcc_assert (XVECLEN (src, 0) == 2); + rtx p0 = XVECEXP (src, 0, 0); + gcc_assert (GET_CODE (src) == UNSPEC + && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING); + rtx res = gen_rtx_SET (SET_DEST (pat), p0); + return res; +} + +/* Subroutine of ix86_expand_round_builtin to take care of comi insns + with rounding. 
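Illustrative sketch (not part of the patch itself): a user-level call that reaches ix86_expand_sse_comi_round below. Both the comparison predicate and the SAE operand must be compile-time constants; passing _MM_FROUND_CUR_DIRECTION instead makes the expander strip the embedded-rounding wrapper via ix86_erase_embedded_rounding. Assumes AVX-512F and <immintrin.h>.

    #include <immintrin.h>

    __attribute__ ((target ("avx512f"))) int
    gt_suppress_exceptions (__m128d a, __m128d b)
    {
      /* Predicate 30 (_CMP_GT_OQ) indexes the comi_comparisons[] table;
         _MM_FROUND_NO_EXC selects the SAE form of the comparison.  */
      return _mm_comi_round_sd (a, b, _CMP_GT_OQ, _MM_FROUND_NO_EXC);
    }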
*/ +static rtx +ix86_expand_sse_comi_round (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat, set_dst; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + tree arg3 = CALL_EXPR_ARG (exp, 3); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + rtx op3 = expand_normal (arg3); + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + machine_mode mode0 = insn_p->operand[0].mode; + machine_mode mode1 = insn_p->operand[1].mode; + enum rtx_code comparison = UNEQ; + bool need_ucomi = false; + + /* See avxintrin.h for values. */ + enum rtx_code comi_comparisons[32] = + { + UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT, + UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE, + UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT + }; + bool need_ucomi_values[32] = + { + true, false, false, true, true, false, false, true, + true, false, false, true, true, false, false, true, + false, true, true, false, false, true, true, false, + false, true, true, false, false, true, true, false + }; + + if (!CONST_INT_P (op2)) + { + error ("the third argument must be comparison constant"); + return const0_rtx; + } + if (INTVAL (op2) < 0 || INTVAL (op2) >= 32) + { + error ("incorrect comparison mode"); + return const0_rtx; + } + + if (!insn_p->operand[2].predicate (op3, SImode)) + { + error ("incorrect rounding operand"); + return const0_rtx; + } + + comparison = comi_comparisons[INTVAL (op2)]; + need_ucomi = need_ucomi_values[INTVAL (op2)]; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_p->operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_p->operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + if (need_ucomi) + icode = icode == CODE_FOR_sse_comi_round + ? CODE_FOR_sse_ucomi_round + : CODE_FOR_sse2_ucomi_round; + + pat = GEN_FCN (icode) (op0, op1, op3); + if (! pat) + return 0; + + /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */ + if (INTVAL (op3) == NO_ROUND) + { + pat = ix86_erase_embedded_rounding (pat); + if (! 
pat) + return 0; + + set_dst = SET_DEST (pat); + } + else + { + gcc_assert (GET_CODE (pat) == SET); + set_dst = SET_DEST (pat); + } + + emit_insn (pat); + emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + set_dst, + const0_rtx))); + + return SUBREG_REG (target); +} + +static rtx +ix86_expand_round_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + unsigned int i, nargs; + struct + { + rtx op; + machine_mode mode; + } args[6]; + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + machine_mode tmode = insn_p->operand[0].mode; + unsigned int nargs_constant = 0; + unsigned int redundant_embed_rnd = 0; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case UINT64_FTYPE_V2DF_INT: + case UINT64_FTYPE_V4SF_INT: + case UINT_FTYPE_V2DF_INT: + case UINT_FTYPE_V4SF_INT: + case INT64_FTYPE_V2DF_INT: + case INT64_FTYPE_V4SF_INT: + case INT_FTYPE_V2DF_INT: + case INT_FTYPE_V4SF_INT: + nargs = 2; + break; + case V4SF_FTYPE_V4SF_UINT_INT: + case V4SF_FTYPE_V4SF_UINT64_INT: + case V2DF_FTYPE_V2DF_UINT64_INT: + case V4SF_FTYPE_V4SF_INT_INT: + case V4SF_FTYPE_V4SF_INT64_INT: + case V2DF_FTYPE_V2DF_INT64_INT: + case V4SF_FTYPE_V4SF_V4SF_INT: + case V2DF_FTYPE_V2DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V2DF_INT: + case V2DF_FTYPE_V2DF_V4SF_INT: + nargs = 3; + break; + case V8SF_FTYPE_V8DF_V8SF_QI_INT: + case V8DF_FTYPE_V8DF_V8DF_QI_INT: + case V8SI_FTYPE_V8DF_V8SI_QI_INT: + case V8DI_FTYPE_V8DF_V8DI_QI_INT: + case V8SF_FTYPE_V8DI_V8SF_QI_INT: + case V8DF_FTYPE_V8DI_V8DF_QI_INT: + case V16SF_FTYPE_V16SF_V16SF_HI_INT: + case V8DI_FTYPE_V8SF_V8DI_QI_INT: + case V16SF_FTYPE_V16SI_V16SF_HI_INT: + case V16SI_FTYPE_V16SF_V16SI_HI_INT: + case V8DF_FTYPE_V8SF_V8DF_QI_INT: + case V16SF_FTYPE_V16HI_V16SF_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SF_INT: + nargs = 4; + break; + case V4SF_FTYPE_V4SF_V4SF_INT_INT: + case V2DF_FTYPE_V2DF_V2DF_INT_INT: + nargs_constant = 2; + nargs = 4; + break; + case INT_FTYPE_V4SF_V4SF_INT_INT: + case INT_FTYPE_V2DF_V2DF_INT_INT: + return ix86_expand_sse_comi_round (d, exp, target); + case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: + case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: + case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: + nargs = 5; + break; + case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: + case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: + nargs_constant = 4; + nargs = 5; + break; + case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT: + case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: + case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: + case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: + nargs_constant = 3; + nargs = 5; + break; + case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT: + case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT: + case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: + case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: + nargs = 6; + nargs_constant = 4; + break; + case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT: + nargs = 6; + nargs_constant = 3; + break; + default: + gcc_unreachable (); + } + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (optimize + || target == 0 + || GET_MODE 
(target) != tmode + || !insn_p->operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + machine_mode mode = insn_p->operand[i + 1].mode; + bool match = insn_p->operand[i + 1].predicate (op, mode); + + if (i == nargs - nargs_constant) + { + if (!match) + { + switch (icode) + { + case CODE_FOR_avx512f_getmantv8df_mask_round: + case CODE_FOR_avx512f_getmantv16sf_mask_round: + case CODE_FOR_avx512f_vgetmantv2df_round: + case CODE_FOR_avx512f_vgetmantv2df_mask_round: + case CODE_FOR_avx512f_vgetmantv4sf_round: + case CODE_FOR_avx512f_vgetmantv4sf_mask_round: + error ("the immediate argument must be a 4-bit immediate"); + return const0_rtx; + case CODE_FOR_avx512f_cmpv8df3_mask_round: + case CODE_FOR_avx512f_cmpv16sf3_mask_round: + case CODE_FOR_avx512f_vmcmpv2df3_mask_round: + case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: + error ("the immediate argument must be a 5-bit immediate"); + return const0_rtx; + default: + error ("the immediate argument must be an 8-bit immediate"); + return const0_rtx; + } + } + } + else if (i == nargs-1) + { + if (!insn_p->operand[nargs].predicate (op, SImode)) + { + error ("incorrect rounding operand"); + return const0_rtx; + } + + /* If there is no rounding use normal version of the pattern. */ + if (INTVAL (op) == NO_ROUND) + redundant_embed_rnd = 1; + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + op = fixup_modeless_constant (op, mode); + + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + { + if (optimize || !match) + op = copy_to_mode_reg (mode, op); + } + else + { + op = copy_to_reg (op); + op = lowpart_subreg (mode, op, GET_MODE (op)); + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op); + break; + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op); + break; + case 5: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op); + break; + case 6: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op, + args[5].op); + break; + default: + gcc_unreachable (); + } + + if (!pat) + return 0; + + if (redundant_embed_rnd) + pat = ix86_erase_embedded_rounding (pat); + + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of special insns + with variable number of operands. 
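Illustrative sketch (not part of the patch itself): the embedded-rounding operand handled by ix86_expand_round_builtin above. The rounding argument must be a constant; with _MM_FROUND_CUR_DIRECTION the expander sets redundant_embed_rnd and emits the ordinary non-round pattern. Assumes AVX-512F.

    #include <immintrin.h>

    __attribute__ ((target ("avx512f"))) __m512d
    add_round_to_nearest (__m512d a, __m512d b)
    {
      /* An explicit rounding mode has to be combined with _MM_FROUND_NO_EXC.  */
      return _mm512_add_round_pd (a, b,
                                  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }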
*/ + +static rtx +ix86_expand_special_args_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + tree arg; + rtx pat, op; + unsigned int i, nargs, arg_adjust, memory; + bool aligned_mem = false; + struct + { + rtx op; + machine_mode mode; + } args[3]; + enum insn_code icode = d->icode; + bool last_arg_constant = false; + const struct insn_data_d *insn_p = &insn_data[icode]; + machine_mode tmode = insn_p->operand[0].mode; + enum { load, store } klass; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case VOID_FTYPE_VOID: + emit_insn (GEN_FCN (icode) (target)); + return 0; + case VOID_FTYPE_UINT64: + case VOID_FTYPE_UNSIGNED: + nargs = 0; + klass = store; + memory = 0; + break; + + case INT_FTYPE_VOID: + case USHORT_FTYPE_VOID: + case UINT64_FTYPE_VOID: + case UINT_FTYPE_VOID: + case UNSIGNED_FTYPE_VOID: + nargs = 0; + klass = load; + memory = 0; + break; + case UINT64_FTYPE_PUNSIGNED: + case V2DI_FTYPE_PV2DI: + case V4DI_FTYPE_PV4DI: + case V32QI_FTYPE_PCCHAR: + case V16QI_FTYPE_PCCHAR: + case V8SF_FTYPE_PCV4SF: + case V8SF_FTYPE_PCFLOAT: + case V4SF_FTYPE_PCFLOAT: + case V4DF_FTYPE_PCV2DF: + case V4DF_FTYPE_PCDOUBLE: + case V2DF_FTYPE_PCDOUBLE: + case VOID_FTYPE_PVOID: + case V8DI_FTYPE_PV8DI: + nargs = 1; + klass = load; + memory = 0; + switch (icode) + { + case CODE_FOR_sse4_1_movntdqa: + case CODE_FOR_avx2_movntdqa: + case CODE_FOR_avx512f_movntdqa: + aligned_mem = true; + break; + default: + break; + } + break; + case VOID_FTYPE_PV2SF_V4SF: + case VOID_FTYPE_PV8DI_V8DI: + case VOID_FTYPE_PV4DI_V4DI: + case VOID_FTYPE_PV2DI_V2DI: + case VOID_FTYPE_PCHAR_V32QI: + case VOID_FTYPE_PCHAR_V16QI: + case VOID_FTYPE_PFLOAT_V16SF: + case VOID_FTYPE_PFLOAT_V8SF: + case VOID_FTYPE_PFLOAT_V4SF: + case VOID_FTYPE_PDOUBLE_V8DF: + case VOID_FTYPE_PDOUBLE_V4DF: + case VOID_FTYPE_PDOUBLE_V2DF: + case VOID_FTYPE_PLONGLONG_LONGLONG: + case VOID_FTYPE_PULONGLONG_ULONGLONG: + case VOID_FTYPE_PUNSIGNED_UNSIGNED: + case VOID_FTYPE_PINT_INT: + nargs = 1; + klass = store; + /* Reserve memory operand for target. */ + memory = ARRAY_SIZE (args); + switch (icode) + { + /* These builtins and instructions require the memory + to be properly aligned. 
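Illustrative sketch (not part of the patch itself): the aligned_mem handling that follows covers the non-temporal stores. For example _mm256_stream_pd requires a 32-byte aligned destination, and raising MEM_ALIGN here keeps ix86_legitimate_combined_insn from later rejecting the insn. Assumes AVX.

    #include <immintrin.h>

    __attribute__ ((target ("avx"))) void
    stream_store (double *dst, __m256d v)
    {
      /* dst is assumed to be 32-byte aligned, as the instruction requires.  */
      _mm256_stream_pd (dst, v);
    }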
*/ + case CODE_FOR_avx_movntv4di: + case CODE_FOR_sse2_movntv2di: + case CODE_FOR_avx_movntv8sf: + case CODE_FOR_sse_movntv4sf: + case CODE_FOR_sse4a_vmmovntv4sf: + case CODE_FOR_avx_movntv4df: + case CODE_FOR_sse2_movntv2df: + case CODE_FOR_sse4a_vmmovntv2df: + case CODE_FOR_sse2_movntidi: + case CODE_FOR_sse_movntq: + case CODE_FOR_sse2_movntisi: + case CODE_FOR_avx512f_movntv16sf: + case CODE_FOR_avx512f_movntv8df: + case CODE_FOR_avx512f_movntv8di: + aligned_mem = true; + break; + default: + break; + } + break; + case VOID_FTYPE_PVOID_PCVOID: + nargs = 1; + klass = store; + memory = 0; + + break; + case V4SF_FTYPE_V4SF_PCV2SF: + case V2DF_FTYPE_V2DF_PCDOUBLE: + nargs = 2; + klass = load; + memory = 1; + break; + case V8SF_FTYPE_PCV8SF_V8SI: + case V4DF_FTYPE_PCV4DF_V4DI: + case V4SF_FTYPE_PCV4SF_V4SI: + case V2DF_FTYPE_PCV2DF_V2DI: + case V8SI_FTYPE_PCV8SI_V8SI: + case V4DI_FTYPE_PCV4DI_V4DI: + case V4SI_FTYPE_PCV4SI_V4SI: + case V2DI_FTYPE_PCV2DI_V2DI: + case VOID_FTYPE_INT_INT64: + nargs = 2; + klass = load; + memory = 0; + break; + case VOID_FTYPE_PV8DF_V8DF_UQI: + case VOID_FTYPE_PV4DF_V4DF_UQI: + case VOID_FTYPE_PV2DF_V2DF_UQI: + case VOID_FTYPE_PV16SF_V16SF_UHI: + case VOID_FTYPE_PV8SF_V8SF_UQI: + case VOID_FTYPE_PV4SF_V4SF_UQI: + case VOID_FTYPE_PV8DI_V8DI_UQI: + case VOID_FTYPE_PV4DI_V4DI_UQI: + case VOID_FTYPE_PV2DI_V2DI_UQI: + case VOID_FTYPE_PV16SI_V16SI_UHI: + case VOID_FTYPE_PV8SI_V8SI_UQI: + case VOID_FTYPE_PV4SI_V4SI_UQI: + case VOID_FTYPE_PV64QI_V64QI_UDI: + case VOID_FTYPE_PV32HI_V32HI_USI: + case VOID_FTYPE_PV32QI_V32QI_USI: + case VOID_FTYPE_PV16QI_V16QI_UHI: + case VOID_FTYPE_PV16HI_V16HI_UHI: + case VOID_FTYPE_PV8HI_V8HI_UQI: + switch (icode) + { + /* These builtins and instructions require the memory + to be properly aligned. 
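Illustrative sketch (not part of the patch itself): one of the aligned masked stores the next list covers. _mm512_mask_store_pd needs a 64-byte aligned address (its unaligned counterpart is _mm512_mask_storeu_pd), so the expander bumps MEM_ALIGN on the destination. Assumes AVX-512F.

    #include <immintrin.h>

    __attribute__ ((target ("avx512f"))) void
    masked_store (double *dst, __mmask8 k, __m512d v)
    {
      /* dst is assumed to be 64-byte aligned; masked-off lanes are untouched.  */
      _mm512_mask_store_pd (dst, k, v);
    }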
*/ + case CODE_FOR_avx512f_storev16sf_mask: + case CODE_FOR_avx512f_storev16si_mask: + case CODE_FOR_avx512f_storev8df_mask: + case CODE_FOR_avx512f_storev8di_mask: + case CODE_FOR_avx512vl_storev8sf_mask: + case CODE_FOR_avx512vl_storev8si_mask: + case CODE_FOR_avx512vl_storev4df_mask: + case CODE_FOR_avx512vl_storev4di_mask: + case CODE_FOR_avx512vl_storev4sf_mask: + case CODE_FOR_avx512vl_storev4si_mask: + case CODE_FOR_avx512vl_storev2df_mask: + case CODE_FOR_avx512vl_storev2di_mask: + aligned_mem = true; + break; + default: + break; + } + /* FALLTHRU */ + case VOID_FTYPE_PV8SF_V8SI_V8SF: + case VOID_FTYPE_PV4DF_V4DI_V4DF: + case VOID_FTYPE_PV4SF_V4SI_V4SF: + case VOID_FTYPE_PV2DF_V2DI_V2DF: + case VOID_FTYPE_PV8SI_V8SI_V8SI: + case VOID_FTYPE_PV4DI_V4DI_V4DI: + case VOID_FTYPE_PV4SI_V4SI_V4SI: + case VOID_FTYPE_PV2DI_V2DI_V2DI: + case VOID_FTYPE_PV8SI_V8DI_UQI: + case VOID_FTYPE_PV8HI_V8DI_UQI: + case VOID_FTYPE_PV16HI_V16SI_UHI: + case VOID_FTYPE_PV16QI_V8DI_UQI: + case VOID_FTYPE_PV16QI_V16SI_UHI: + case VOID_FTYPE_PV4SI_V4DI_UQI: + case VOID_FTYPE_PV4SI_V2DI_UQI: + case VOID_FTYPE_PV8HI_V4DI_UQI: + case VOID_FTYPE_PV8HI_V2DI_UQI: + case VOID_FTYPE_PV8HI_V8SI_UQI: + case VOID_FTYPE_PV8HI_V4SI_UQI: + case VOID_FTYPE_PV16QI_V4DI_UQI: + case VOID_FTYPE_PV16QI_V2DI_UQI: + case VOID_FTYPE_PV16QI_V8SI_UQI: + case VOID_FTYPE_PV16QI_V4SI_UQI: + case VOID_FTYPE_PCHAR_V64QI_UDI: + case VOID_FTYPE_PCHAR_V32QI_USI: + case VOID_FTYPE_PCHAR_V16QI_UHI: + case VOID_FTYPE_PSHORT_V32HI_USI: + case VOID_FTYPE_PSHORT_V16HI_UHI: + case VOID_FTYPE_PSHORT_V8HI_UQI: + case VOID_FTYPE_PINT_V16SI_UHI: + case VOID_FTYPE_PINT_V8SI_UQI: + case VOID_FTYPE_PINT_V4SI_UQI: + case VOID_FTYPE_PINT64_V8DI_UQI: + case VOID_FTYPE_PINT64_V4DI_UQI: + case VOID_FTYPE_PINT64_V2DI_UQI: + case VOID_FTYPE_PDOUBLE_V8DF_UQI: + case VOID_FTYPE_PDOUBLE_V4DF_UQI: + case VOID_FTYPE_PDOUBLE_V2DF_UQI: + case VOID_FTYPE_PFLOAT_V16SF_UHI: + case VOID_FTYPE_PFLOAT_V8SF_UQI: + case VOID_FTYPE_PFLOAT_V4SF_UQI: + case VOID_FTYPE_PV32QI_V32HI_USI: + case VOID_FTYPE_PV16QI_V16HI_UHI: + case VOID_FTYPE_PV8QI_V8HI_UQI: + nargs = 2; + klass = store; + /* Reserve memory operand for target. */ + memory = ARRAY_SIZE (args); + break; + case V4SF_FTYPE_PCV4SF_V4SF_UQI: + case V8SF_FTYPE_PCV8SF_V8SF_UQI: + case V16SF_FTYPE_PCV16SF_V16SF_UHI: + case V4SI_FTYPE_PCV4SI_V4SI_UQI: + case V8SI_FTYPE_PCV8SI_V8SI_UQI: + case V16SI_FTYPE_PCV16SI_V16SI_UHI: + case V2DF_FTYPE_PCV2DF_V2DF_UQI: + case V4DF_FTYPE_PCV4DF_V4DF_UQI: + case V8DF_FTYPE_PCV8DF_V8DF_UQI: + case V2DI_FTYPE_PCV2DI_V2DI_UQI: + case V4DI_FTYPE_PCV4DI_V4DI_UQI: + case V8DI_FTYPE_PCV8DI_V8DI_UQI: + case V64QI_FTYPE_PCV64QI_V64QI_UDI: + case V32HI_FTYPE_PCV32HI_V32HI_USI: + case V32QI_FTYPE_PCV32QI_V32QI_USI: + case V16QI_FTYPE_PCV16QI_V16QI_UHI: + case V16HI_FTYPE_PCV16HI_V16HI_UHI: + case V8HI_FTYPE_PCV8HI_V8HI_UQI: + switch (icode) + { + /* These builtins and instructions require the memory + to be properly aligned. 
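Illustrative sketch (not part of the patch itself): the matching aligned masked loads handled by the list that follows. _mm512_mask_load_pd requires 64-byte alignment, while _mm512_mask_loadu_pd accepts any address. Assumes AVX-512F.

    #include <immintrin.h>

    __attribute__ ((target ("avx512f"))) __m512d
    masked_load (const double *src, __m512d fallback, __mmask8 k)
    {
      /* src is assumed to be 64-byte aligned; masked-off lanes keep fallback.  */
      return _mm512_mask_load_pd (fallback, k, src);
    }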
*/ + case CODE_FOR_avx512f_loadv16sf_mask: + case CODE_FOR_avx512f_loadv16si_mask: + case CODE_FOR_avx512f_loadv8df_mask: + case CODE_FOR_avx512f_loadv8di_mask: + case CODE_FOR_avx512vl_loadv8sf_mask: + case CODE_FOR_avx512vl_loadv8si_mask: + case CODE_FOR_avx512vl_loadv4df_mask: + case CODE_FOR_avx512vl_loadv4di_mask: + case CODE_FOR_avx512vl_loadv4sf_mask: + case CODE_FOR_avx512vl_loadv4si_mask: + case CODE_FOR_avx512vl_loadv2df_mask: + case CODE_FOR_avx512vl_loadv2di_mask: + case CODE_FOR_avx512bw_loadv64qi_mask: + case CODE_FOR_avx512vl_loadv32qi_mask: + case CODE_FOR_avx512vl_loadv16qi_mask: + case CODE_FOR_avx512bw_loadv32hi_mask: + case CODE_FOR_avx512vl_loadv16hi_mask: + case CODE_FOR_avx512vl_loadv8hi_mask: + aligned_mem = true; + break; + default: + break; + } + /* FALLTHRU */ + case V64QI_FTYPE_PCCHAR_V64QI_UDI: + case V32QI_FTYPE_PCCHAR_V32QI_USI: + case V16QI_FTYPE_PCCHAR_V16QI_UHI: + case V32HI_FTYPE_PCSHORT_V32HI_USI: + case V16HI_FTYPE_PCSHORT_V16HI_UHI: + case V8HI_FTYPE_PCSHORT_V8HI_UQI: + case V16SI_FTYPE_PCINT_V16SI_UHI: + case V8SI_FTYPE_PCINT_V8SI_UQI: + case V4SI_FTYPE_PCINT_V4SI_UQI: + case V8DI_FTYPE_PCINT64_V8DI_UQI: + case V4DI_FTYPE_PCINT64_V4DI_UQI: + case V2DI_FTYPE_PCINT64_V2DI_UQI: + case V8DF_FTYPE_PCDOUBLE_V8DF_UQI: + case V4DF_FTYPE_PCDOUBLE_V4DF_UQI: + case V2DF_FTYPE_PCDOUBLE_V2DF_UQI: + case V16SF_FTYPE_PCFLOAT_V16SF_UHI: + case V8SF_FTYPE_PCFLOAT_V8SF_UQI: + case V4SF_FTYPE_PCFLOAT_V4SF_UQI: + nargs = 3; + klass = load; + memory = 0; + break; + case VOID_FTYPE_UINT_UINT_UINT: + case VOID_FTYPE_UINT64_UINT_UINT: + case UCHAR_FTYPE_UINT_UINT_UINT: + case UCHAR_FTYPE_UINT64_UINT_UINT: + nargs = 3; + klass = load; + memory = ARRAY_SIZE (args); + last_arg_constant = true; + break; + default: + gcc_unreachable (); + } + + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (klass == store) + { + arg = CALL_EXPR_ARG (exp, 0); + op = expand_normal (arg); + gcc_assert (target == 0); + if (memory) + { + op = ix86_zero_extend_to_Pmode (op); + target = gen_rtx_MEM (tmode, op); + /* target at this point has just BITS_PER_UNIT MEM_ALIGN + on it. Try to improve it using get_pointer_alignment, + and if the special builtin is one that requires strict + mode alignment, also from it's GET_MODE_ALIGNMENT. + Failure to do so could lead to ix86_legitimate_combined_insn + rejecting all changes to such insns. */ + unsigned int align = get_pointer_alignment (arg); + if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode)) + align = GET_MODE_ALIGNMENT (tmode); + if (MEM_ALIGN (target) < align) + set_mem_align (target, align); + } + else + target = force_reg (tmode, op); + arg_adjust = 1; + } + else + { + arg_adjust = 0; + if (optimize + || target == 0 + || !register_operand (target, tmode) + || GET_MODE (target) != tmode) + target = gen_reg_rtx (tmode); + } + + for (i = 0; i < nargs; i++) + { + machine_mode mode = insn_p->operand[i + 1].mode; + bool match; + + arg = CALL_EXPR_ARG (exp, i + arg_adjust); + op = expand_normal (arg); + match = insn_p->operand[i + 1].predicate (op, mode); + + if (last_arg_constant && (i + 1) == nargs) + { + if (!match) + { + if (icode == CODE_FOR_lwp_lwpvalsi3 + || icode == CODE_FOR_lwp_lwpinssi3 + || icode == CODE_FOR_lwp_lwpvaldi3 + || icode == CODE_FOR_lwp_lwpinsdi3) + error ("the last argument must be a 32-bit immediate"); + else + error ("the last argument must be an 8-bit immediate"); + return const0_rtx; + } + } + else + { + if (i == memory) + { + /* This must be the memory operand. 
*/ + op = ix86_zero_extend_to_Pmode (op); + op = gen_rtx_MEM (mode, op); + /* op at this point has just BITS_PER_UNIT MEM_ALIGN + on it. Try to improve it using get_pointer_alignment, + and if the special builtin is one that requires strict + mode alignment, also from it's GET_MODE_ALIGNMENT. + Failure to do so could lead to ix86_legitimate_combined_insn + rejecting all changes to such insns. */ + unsigned int align = get_pointer_alignment (arg); + if (aligned_mem && align < GET_MODE_ALIGNMENT (mode)) + align = GET_MODE_ALIGNMENT (mode); + if (MEM_ALIGN (op) < align) + set_mem_align (op, align); + } + else + { + /* This must be register. */ + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + op = fixup_modeless_constant (op, mode); + + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + op = copy_to_mode_reg (mode, op); + else + { + op = copy_to_reg (op); + op = lowpart_subreg (mode, op, GET_MODE (op)); + } + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 0: + pat = GEN_FCN (icode) (target); + break; + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); + break; + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + emit_insn (pat); + return klass == store ? 0 : target; +} + +/* Return the integer constant in ARG. Constrain it to be in the range + of the subparts of VEC_TYPE; issue an error if not. */ + +static int +get_element_number (tree vec_type, tree arg) +{ + unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; + + if (!tree_fits_uhwi_p (arg) + || (elt = tree_to_uhwi (arg), elt > max)) + { + error ("selector must be an integer constant in the range " + "[0, %wi]", max); + return 0; + } + + return elt; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_init. We DO have language-level syntax for this, in + the form of (type){ init-list }. Except that since we can't place emms + instructions from inside the compiler, we can't allow the use of MMX + registers unless the user explicitly asks for it. So we do *not* define + vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead + we have builtins invoked by mmintrin.h that gives us license to emit + these sorts of instructions. */ + +static rtx +ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) +{ + machine_mode tmode = TYPE_MODE (type); + machine_mode inner_mode = GET_MODE_INNER (tmode); + int i, n_elt = GET_MODE_NUNITS (tmode); + rtvec v = rtvec_alloc (n_elt); + + gcc_assert (VECTOR_MODE_P (tmode)); + gcc_assert (call_expr_nargs (exp) == n_elt); + + for (i = 0; i < n_elt; ++i) + { + rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); + RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); + } + + if (!target || !register_operand (target, tmode)) + target = gen_reg_rtx (tmode); + + ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); + return target; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_extract. They would be redundant (for non-MMX) if we + had a language-level syntax for referencing vector elements. 
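Illustrative sketch (not part of the patch itself): the mmintrin.h/xmmintrin.h wrappers that reach ix86_expand_vec_init_builtin above and ix86_expand_vec_ext_builtin below. The element selector must be an in-range integer constant or get_element_number reports an error. Assumes MMX/SSE; the builtin mapping is the one used by GCC's own headers.

    #include <xmmintrin.h>

    __attribute__ ((target ("mmx,sse"))) __m64
    make_v4hi (short a, short b, short c, short d)
    {
      return _mm_set_pi16 (d, c, b, a);   /* vec_init of a V4HI value */
    }

    __attribute__ ((target ("mmx,sse"))) int
    lane1 (__m64 v)
    {
      return _mm_extract_pi16 (v, 1);     /* selector must be a constant 0..3 */
    }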
*/ + +static rtx +ix86_expand_vec_ext_builtin (tree exp, rtx target) +{ + machine_mode tmode, mode0; + tree arg0, arg1; + int elt; + rtx op0; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + + op0 = expand_normal (arg0); + elt = get_element_number (TREE_TYPE (arg0), arg1); + + tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); + mode0 = TYPE_MODE (TREE_TYPE (arg0)); + gcc_assert (VECTOR_MODE_P (mode0)); + + op0 = force_reg (mode0, op0); + + if (optimize || !target || !register_operand (target, tmode)) + target = gen_reg_rtx (tmode); + + ix86_expand_vector_extract (true, target, op0, elt); + + return target; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_set. They would be redundant (for non-MMX) if we had + a language-level syntax for referencing vector elements. */ + +static rtx +ix86_expand_vec_set_builtin (tree exp) +{ + machine_mode tmode, mode1; + tree arg0, arg1, arg2; + int elt; + rtx op0, op1, target; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + + tmode = TYPE_MODE (TREE_TYPE (arg0)); + mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); + gcc_assert (VECTOR_MODE_P (tmode)); + + op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); + op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); + elt = get_element_number (TREE_TYPE (arg0), arg2); + + if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) + op1 = convert_modes (mode1, GET_MODE (op1), op1, true); + + op0 = force_reg (tmode, op0); + op1 = force_reg (mode1, op1); + + /* OP0 is the source of these builtin functions and shouldn't be + modified. Create a copy, use it and return it as target. */ + target = gen_reg_rtx (tmode); + emit_move_insn (target, op0); + ix86_expand_vector_set (true, target, op1, elt); + + return target; +} + +/* Expand an expression EXP that calls a built-in function, + with result going to TARGET if that's convenient + (and in mode MODE if that's convenient). + SUBTARGET may be used as the target for computing one of EXP's operands. + IGNORE is nonzero if the value is to be ignored. */ + +rtx +ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + machine_mode mode, int ignore) +{ + size_t i; + enum insn_code icode, icode2; + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + tree arg0, arg1, arg2, arg3, arg4; + rtx op0, op1, op2, op3, op4, pat, pat2, insn; + machine_mode mode0, mode1, mode2, mode3, mode4; + unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl); + + /* For CPU builtins that can be folded, fold first and expand the fold. */ + switch (fcode) + { + case IX86_BUILTIN_CPU_INIT: + { + /* Make it call __cpu_indicator_init in libgcc. */ + tree call_expr, fndecl, type; + type = build_function_type_list (integer_type_node, NULL_TREE); + fndecl = build_fn_decl ("__cpu_indicator_init", type); + call_expr = build_call_expr (fndecl, 0); + return expand_expr (call_expr, target, mode, EXPAND_NORMAL); + } + case IX86_BUILTIN_CPU_IS: + case IX86_BUILTIN_CPU_SUPPORTS: + { + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree fold_expr = fold_builtin_cpu (fndecl, &arg0); + gcc_assert (fold_expr != NULL_TREE); + return expand_expr (fold_expr, target, mode, EXPAND_NORMAL); + } + } + + HOST_WIDE_INT isa = ix86_isa_flags; + HOST_WIDE_INT isa2 = ix86_isa_flags2; + HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa; + HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2; + /* The general case is we require all the ISAs specified in bisa{,2} + to be enabled. 
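Illustrative sketch (not part of the patch itself, hypothetical names): the shape of the ISA check described in this comment, where the pairs spelled out in the exception list that follows are satisfied by either member.

    #include <stdbool.h>
    #include <stdint.h>

    /* If the builtin needs both bits of PAIR and at least one of them is
       enabled, treat the whole pair as enabled before the subset test.  */
    static bool
    builtin_isa_ok (uint64_t required, uint64_t enabled, uint64_t pair)
    {
      if ((required & pair) == pair && (enabled & pair) != 0)
        enabled |= pair;
      return (required & enabled) == required;
    }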
+ The exceptions are: + OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A + OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 + OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 + where for each this pair it is sufficient if either of the ISAs is + enabled, plus if it is ored with other options also those others. */ + if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) + == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) + && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0) + isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A); + if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) + == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) + && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0) + isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32); + if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) + == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) + && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0) + isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4); + if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2) + { + bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT; + if (TARGET_ABI_X32) + bisa |= OPTION_MASK_ABI_X32; + else + bisa |= OPTION_MASK_ABI_64; + char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL, + (enum fpmath_unit) 0, false, add_abi_p); + if (!opts) + error ("%qE needs unknown isa option", fndecl); + else + { + gcc_assert (opts != NULL); + error ("%qE needs isa option %s", fndecl, opts); + free (opts); + } + return expand_call (exp, target, ignore); + } + + switch (fcode) + { + case IX86_BUILTIN_MASKMOVQ: + case IX86_BUILTIN_MASKMOVDQU: + icode = (fcode == IX86_BUILTIN_MASKMOVQ + ? CODE_FOR_mmx_maskmovq + : CODE_FOR_sse2_maskmovdqu); + /* Note the arg order is different from the operand order. */ + arg1 = CALL_EXPR_ARG (exp, 0); + arg2 = CALL_EXPR_ARG (exp, 1); + arg0 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + mode0 = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + + op0 = ix86_zero_extend_to_Pmode (op0); + op0 = gen_rtx_MEM (mode1, op0); + + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + if (!insn_data[icode].operand[2].predicate (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + pat = GEN_FCN (icode) (op0, op1, op2); + if (! 
pat) + return 0; + emit_insn (pat); + return 0; + + case IX86_BUILTIN_LDMXCSR: + op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); + target = assign_386_stack_local (SImode, SLOT_TEMP); + emit_move_insn (target, op0); + emit_insn (gen_sse_ldmxcsr (target)); + return 0; + + case IX86_BUILTIN_STMXCSR: + target = assign_386_stack_local (SImode, SLOT_TEMP); + emit_insn (gen_sse_stmxcsr (target)); + return copy_to_mode_reg (SImode, target); + + case IX86_BUILTIN_CLFLUSH: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_sse2_clflush; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + + emit_insn (gen_sse2_clflush (op0)); + return 0; + + case IX86_BUILTIN_CLWB: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_clwb; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + + emit_insn (gen_clwb (op0)); + return 0; + + case IX86_BUILTIN_CLFLUSHOPT: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_clflushopt; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + + emit_insn (gen_clflushopt (op0)); + return 0; + + case IX86_BUILTIN_MONITOR: + case IX86_BUILTIN_MONITORX: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + if (!REG_P (op0)) + op0 = ix86_zero_extend_to_Pmode (op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + if (!REG_P (op2)) + op2 = copy_to_mode_reg (SImode, op2); + + emit_insn (fcode == IX86_BUILTIN_MONITOR + ? ix86_gen_monitor (op0, op1, op2) + : ix86_gen_monitorx (op0, op1, op2)); + return 0; + + case IX86_BUILTIN_MWAIT: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + emit_insn (gen_sse3_mwait (op0, op1)); + return 0; + + case IX86_BUILTIN_MWAITX: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + if (!REG_P (op2)) + op2 = copy_to_mode_reg (SImode, op2); + emit_insn (gen_mwaitx (op0, op1, op2)); + return 0; + + case IX86_BUILTIN_UMONITOR: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + + op0 = ix86_zero_extend_to_Pmode (op0); + + insn = (TARGET_64BIT + ? 
gen_umonitor_di (op0) + : gen_umonitor_si (op0)); + + emit_insn (insn); + return 0; + + case IX86_BUILTIN_UMWAIT: + case IX86_BUILTIN_TPAUSE: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + + op1 = force_reg (DImode, op1); + + if (TARGET_64BIT) + { + op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + switch (fcode) + { + case IX86_BUILTIN_UMWAIT: + icode = CODE_FOR_umwait_rex64; + break; + case IX86_BUILTIN_TPAUSE: + icode = CODE_FOR_tpause_rex64; + break; + default: + gcc_unreachable (); + } + + op2 = gen_lowpart (SImode, op2); + op1 = gen_lowpart (SImode, op1); + pat = GEN_FCN (icode) (op0, op1, op2); + } + else + { + switch (fcode) + { + case IX86_BUILTIN_UMWAIT: + icode = CODE_FOR_umwait; + break; + case IX86_BUILTIN_TPAUSE: + icode = CODE_FOR_tpause; + break; + default: + gcc_unreachable (); + } + pat = GEN_FCN (icode) (op0, op1); + } + + if (!pat) + return 0; + + emit_insn (pat); + + if (target == 0 + || !register_operand (target, QImode)) + target = gen_reg_rtx (QImode); + + pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (target, pat)); + + return target; + + case IX86_BUILTIN_CLZERO: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + if (!REG_P (op0)) + op0 = ix86_zero_extend_to_Pmode (op0); + emit_insn (ix86_gen_clzero (op0)); + return 0; + + case IX86_BUILTIN_CLDEMOTE: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_cldemote; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + + emit_insn (gen_cldemote (op0)); + return 0; + + case IX86_BUILTIN_VEC_INIT_V2SI: + case IX86_BUILTIN_VEC_INIT_V4HI: + case IX86_BUILTIN_VEC_INIT_V8QI: + return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); + + case IX86_BUILTIN_VEC_EXT_V2DF: + case IX86_BUILTIN_VEC_EXT_V2DI: + case IX86_BUILTIN_VEC_EXT_V4SF: + case IX86_BUILTIN_VEC_EXT_V4SI: + case IX86_BUILTIN_VEC_EXT_V8HI: + case IX86_BUILTIN_VEC_EXT_V2SI: + case IX86_BUILTIN_VEC_EXT_V4HI: + case IX86_BUILTIN_VEC_EXT_V16QI: + return ix86_expand_vec_ext_builtin (exp, target); + + case IX86_BUILTIN_VEC_SET_V2DI: + case IX86_BUILTIN_VEC_SET_V4SF: + case IX86_BUILTIN_VEC_SET_V4SI: + case IX86_BUILTIN_VEC_SET_V8HI: + case IX86_BUILTIN_VEC_SET_V4HI: + case IX86_BUILTIN_VEC_SET_V16QI: + return ix86_expand_vec_set_builtin (exp); + + case IX86_BUILTIN_NANQ: + case IX86_BUILTIN_NANSQ: + return expand_call (exp, target, ignore); + + case IX86_BUILTIN_RDPID: + + op0 = gen_reg_rtx (word_mode); + + if (TARGET_64BIT) + { + insn = gen_rdpid_rex64 (op0); + op0 = convert_to_mode (SImode, op0, 1); + } + else + insn = gen_rdpid (op0); + + emit_insn (insn); + + if (target == 0 + || !register_operand (target, SImode)) + target = gen_reg_rtx (SImode); + + emit_move_insn (target, op0); + return target; + + case IX86_BUILTIN_RDPMC: + case IX86_BUILTIN_RDTSC: + case IX86_BUILTIN_RDTSCP: + case IX86_BUILTIN_XGETBV: + + op0 = gen_reg_rtx (DImode); + op1 = gen_reg_rtx (DImode); + + if (fcode == IX86_BUILTIN_RDPMC) + { + arg0 = CALL_EXPR_ARG (exp, 0); + op2 = expand_normal (arg0); + if (!register_operand (op2, SImode)) + op2 = copy_to_mode_reg (SImode, op2); + + insn = (TARGET_64BIT + ? 
gen_rdpmc_rex64 (op0, op1, op2) + : gen_rdpmc (op0, op2)); + emit_insn (insn); + } + else if (fcode == IX86_BUILTIN_XGETBV) + { + arg0 = CALL_EXPR_ARG (exp, 0); + op2 = expand_normal (arg0); + if (!register_operand (op2, SImode)) + op2 = copy_to_mode_reg (SImode, op2); + + insn = (TARGET_64BIT + ? gen_xgetbv_rex64 (op0, op1, op2) + : gen_xgetbv (op0, op2)); + emit_insn (insn); + } + else if (fcode == IX86_BUILTIN_RDTSC) + { + insn = (TARGET_64BIT + ? gen_rdtsc_rex64 (op0, op1) + : gen_rdtsc (op0)); + emit_insn (insn); + } + else + { + op2 = gen_reg_rtx (SImode); + + insn = (TARGET_64BIT + ? gen_rdtscp_rex64 (op0, op1, op2) + : gen_rdtscp (op0, op2)); + emit_insn (insn); + + arg0 = CALL_EXPR_ARG (exp, 0); + op4 = expand_normal (arg0); + if (!address_operand (op4, VOIDmode)) + { + op4 = convert_memory_address (Pmode, op4); + op4 = copy_addr_to_reg (op4); + } + emit_move_insn (gen_rtx_MEM (SImode, op4), op2); + } + + if (target == 0 + || !register_operand (target, DImode)) + target = gen_reg_rtx (DImode); + + if (TARGET_64BIT) + { + op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32), + op1, 1, OPTAB_DIRECT); + op0 = expand_simple_binop (DImode, IOR, op0, op1, + op0, 1, OPTAB_DIRECT); + } + + emit_move_insn (target, op0); + return target; + + case IX86_BUILTIN_MOVDIR64B: + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + op0 = ix86_zero_extend_to_Pmode (op0); + if (!address_operand (op1, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op1); + } + op1 = gen_rtx_MEM (XImode, op1); + + insn = (TARGET_64BIT + ? gen_movdir64b_di (op0, op1) + : gen_movdir64b_si (op0, op1)); + emit_insn (insn); + return 0; + + case IX86_BUILTIN_FXSAVE: + case IX86_BUILTIN_FXRSTOR: + case IX86_BUILTIN_FXSAVE64: + case IX86_BUILTIN_FXRSTOR64: + case IX86_BUILTIN_FNSTENV: + case IX86_BUILTIN_FLDENV: + mode0 = BLKmode; + switch (fcode) + { + case IX86_BUILTIN_FXSAVE: + icode = CODE_FOR_fxsave; + break; + case IX86_BUILTIN_FXRSTOR: + icode = CODE_FOR_fxrstor; + break; + case IX86_BUILTIN_FXSAVE64: + icode = CODE_FOR_fxsave64; + break; + case IX86_BUILTIN_FXRSTOR64: + icode = CODE_FOR_fxrstor64; + break; + case IX86_BUILTIN_FNSTENV: + icode = CODE_FOR_fnstenv; + break; + case IX86_BUILTIN_FLDENV: + icode = CODE_FOR_fldenv; + break; + default: + gcc_unreachable (); + } + + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + + if (!address_operand (op0, VOIDmode)) + { + op0 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op0); + } + op0 = gen_rtx_MEM (mode0, op0); + + pat = GEN_FCN (icode) (op0); + if (pat) + emit_insn (pat); + return 0; + + case IX86_BUILTIN_XSETBV: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + + op1 = force_reg (DImode, op1); + + if (TARGET_64BIT) + { + op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + + icode = CODE_FOR_xsetbv_rex64; + + op2 = gen_lowpart (SImode, op2); + op1 = gen_lowpart (SImode, op1); + pat = GEN_FCN (icode) (op0, op1, op2); + } + else + { + icode = CODE_FOR_xsetbv; + + pat = GEN_FCN (icode) (op0, op1); + } + if (pat) + emit_insn (pat); + return 0; + + case IX86_BUILTIN_XSAVE: + case IX86_BUILTIN_XRSTOR: + case IX86_BUILTIN_XSAVE64: + case IX86_BUILTIN_XRSTOR64: + case IX86_BUILTIN_XSAVEOPT: + case IX86_BUILTIN_XSAVEOPT64: + case 
IX86_BUILTIN_XSAVES: + case IX86_BUILTIN_XRSTORS: + case IX86_BUILTIN_XSAVES64: + case IX86_BUILTIN_XRSTORS64: + case IX86_BUILTIN_XSAVEC: + case IX86_BUILTIN_XSAVEC64: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + if (!address_operand (op0, VOIDmode)) + { + op0 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op0); + } + op0 = gen_rtx_MEM (BLKmode, op0); + + op1 = force_reg (DImode, op1); + + if (TARGET_64BIT) + { + op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + switch (fcode) + { + case IX86_BUILTIN_XSAVE: + icode = CODE_FOR_xsave_rex64; + break; + case IX86_BUILTIN_XRSTOR: + icode = CODE_FOR_xrstor_rex64; + break; + case IX86_BUILTIN_XSAVE64: + icode = CODE_FOR_xsave64; + break; + case IX86_BUILTIN_XRSTOR64: + icode = CODE_FOR_xrstor64; + break; + case IX86_BUILTIN_XSAVEOPT: + icode = CODE_FOR_xsaveopt_rex64; + break; + case IX86_BUILTIN_XSAVEOPT64: + icode = CODE_FOR_xsaveopt64; + break; + case IX86_BUILTIN_XSAVES: + icode = CODE_FOR_xsaves_rex64; + break; + case IX86_BUILTIN_XRSTORS: + icode = CODE_FOR_xrstors_rex64; + break; + case IX86_BUILTIN_XSAVES64: + icode = CODE_FOR_xsaves64; + break; + case IX86_BUILTIN_XRSTORS64: + icode = CODE_FOR_xrstors64; + break; + case IX86_BUILTIN_XSAVEC: + icode = CODE_FOR_xsavec_rex64; + break; + case IX86_BUILTIN_XSAVEC64: + icode = CODE_FOR_xsavec64; + break; + default: + gcc_unreachable (); + } + + op2 = gen_lowpart (SImode, op2); + op1 = gen_lowpart (SImode, op1); + pat = GEN_FCN (icode) (op0, op1, op2); + } + else + { + switch (fcode) + { + case IX86_BUILTIN_XSAVE: + icode = CODE_FOR_xsave; + break; + case IX86_BUILTIN_XRSTOR: + icode = CODE_FOR_xrstor; + break; + case IX86_BUILTIN_XSAVEOPT: + icode = CODE_FOR_xsaveopt; + break; + case IX86_BUILTIN_XSAVES: + icode = CODE_FOR_xsaves; + break; + case IX86_BUILTIN_XRSTORS: + icode = CODE_FOR_xrstors; + break; + case IX86_BUILTIN_XSAVEC: + icode = CODE_FOR_xsavec; + break; + default: + gcc_unreachable (); + } + pat = GEN_FCN (icode) (op0, op1); + } + + if (pat) + emit_insn (pat); + return 0; + + case IX86_BUILTIN_LLWPCB: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_lwp_llwpcb; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + emit_insn (gen_lwp_llwpcb (op0)); + return 0; + + case IX86_BUILTIN_SLWPCB: + icode = CODE_FOR_lwp_slwpcb; + if (!target + || !insn_data[icode].operand[0].predicate (target, Pmode)) + target = gen_reg_rtx (Pmode); + emit_insn (gen_lwp_slwpcb (target)); + return target; + + case IX86_BUILTIN_BEXTRI32: + case IX86_BUILTIN_BEXTRI64: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + icode = (fcode == IX86_BUILTIN_BEXTRI32 + ? 
CODE_FOR_tbm_bextri_si + : CODE_FOR_tbm_bextri_di); + if (!CONST_INT_P (op1)) + { + error ("last argument must be an immediate"); + return const0_rtx; + } + else + { + unsigned char length = (INTVAL (op1) >> 8) & 0xFF; + unsigned char lsb_index = INTVAL (op1) & 0xFF; + op1 = GEN_INT (length); + op2 = GEN_INT (lsb_index); + + mode1 = insn_data[icode].operand[1].mode; + if (!insn_data[icode].operand[1].predicate (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + + mode0 = insn_data[icode].operand[0].mode; + if (target == 0 + || !register_operand (target, mode0)) + target = gen_reg_rtx (mode0); + + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (pat) + emit_insn (pat); + return target; + } + + case IX86_BUILTIN_RDRAND16_STEP: + icode = CODE_FOR_rdrandhi_1; + mode0 = HImode; + goto rdrand_step; + + case IX86_BUILTIN_RDRAND32_STEP: + icode = CODE_FOR_rdrandsi_1; + mode0 = SImode; + goto rdrand_step; + + case IX86_BUILTIN_RDRAND64_STEP: + icode = CODE_FOR_rdranddi_1; + mode0 = DImode; + +rdrand_step: + arg0 = CALL_EXPR_ARG (exp, 0); + op1 = expand_normal (arg0); + if (!address_operand (op1, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op1); + } + + op0 = gen_reg_rtx (mode0); + emit_insn (GEN_FCN (icode) (op0)); + + emit_move_insn (gen_rtx_MEM (mode0, op1), op0); + + op1 = gen_reg_rtx (SImode); + emit_move_insn (op1, CONST1_RTX (SImode)); + + /* Emit SImode conditional move. */ + if (mode0 == HImode) + { + if (TARGET_ZERO_EXTEND_WITH_AND + && optimize_function_for_speed_p (cfun)) + { + op2 = force_reg (SImode, const0_rtx); + + emit_insn (gen_movstricthi + (gen_lowpart (HImode, op2), op0)); + } + else + { + op2 = gen_reg_rtx (SImode); + + emit_insn (gen_zero_extendhisi2 (op2, op0)); + } + } + else if (mode0 == SImode) + op2 = op0; + else + op2 = gen_rtx_SUBREG (SImode, op0, 0); + + if (target == 0 + || !register_operand (target, SImode)) + target = gen_reg_rtx (SImode); + + pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (target, + gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); + return target; + + case IX86_BUILTIN_RDSEED16_STEP: + icode = CODE_FOR_rdseedhi_1; + mode0 = HImode; + goto rdseed_step; + + case IX86_BUILTIN_RDSEED32_STEP: + icode = CODE_FOR_rdseedsi_1; + mode0 = SImode; + goto rdseed_step; + + case IX86_BUILTIN_RDSEED64_STEP: + icode = CODE_FOR_rdseeddi_1; + mode0 = DImode; + +rdseed_step: + arg0 = CALL_EXPR_ARG (exp, 0); + op1 = expand_normal (arg0); + if (!address_operand (op1, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op1); + } + + op0 = gen_reg_rtx (mode0); + emit_insn (GEN_FCN (icode) (op0)); + + emit_move_insn (gen_rtx_MEM (mode0, op1), op0); + + op2 = gen_reg_rtx (QImode); + + pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (op2, pat)); + + if (target == 0 + || !register_operand (target, SImode)) + target = gen_reg_rtx (SImode); + + emit_insn (gen_zero_extendqisi2 (target, op2)); + return target; + + case IX86_BUILTIN_SBB32: + icode = CODE_FOR_subborrowsi; + icode2 = CODE_FOR_subborrowsi_0; + mode0 = SImode; + mode1 = DImode; + mode2 = CCmode; + goto handlecarry; + + case IX86_BUILTIN_SBB64: + icode = CODE_FOR_subborrowdi; + icode2 = CODE_FOR_subborrowdi_0; + mode0 = DImode; + mode1 = TImode; + mode2 = CCmode; + goto handlecarry; + + case IX86_BUILTIN_ADDCARRYX32: + icode = CODE_FOR_addcarrysi; + icode2 = CODE_FOR_addcarrysi_0; + mode0 = SImode; + mode1 = DImode; + mode2 = 
CCCmode; + goto handlecarry; + + case IX86_BUILTIN_ADDCARRYX64: + icode = CODE_FOR_addcarrydi; + icode2 = CODE_FOR_addcarrydi_0; + mode0 = DImode; + mode1 = TImode; + mode2 = CCCmode; + + handlecarry: + arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ + arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */ + arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */ + arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ + + op1 = expand_normal (arg0); + if (!integer_zerop (arg0)) + op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); + + op2 = expand_normal (arg1); + if (!register_operand (op2, mode0)) + op2 = copy_to_mode_reg (mode0, op2); + + op3 = expand_normal (arg2); + if (!register_operand (op3, mode0)) + op3 = copy_to_mode_reg (mode0, op3); + + op4 = expand_normal (arg3); + if (!address_operand (op4, VOIDmode)) + { + op4 = convert_memory_address (Pmode, op4); + op4 = copy_addr_to_reg (op4); + } + + op0 = gen_reg_rtx (mode0); + if (integer_zerop (arg0)) + { + /* If arg0 is 0, optimize right away into add or sub + instruction that sets CCCmode flags. */ + op1 = gen_rtx_REG (mode2, FLAGS_REG); + emit_insn (GEN_FCN (icode2) (op0, op2, op3)); + } + else + { + /* Generate CF from input operand. */ + emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); + + /* Generate instruction that consumes CF. */ + op1 = gen_rtx_REG (CCCmode, FLAGS_REG); + pat = gen_rtx_LTU (mode1, op1, const0_rtx); + pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); + emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); + } + + /* Return current CF value. */ + if (target == 0) + target = gen_reg_rtx (QImode); + + pat = gen_rtx_LTU (QImode, op1, const0_rtx); + emit_insn (gen_rtx_SET (target, pat)); + + /* Store the result. */ + emit_move_insn (gen_rtx_MEM (mode0, op4), op0); + + return target; + + case IX86_BUILTIN_READ_FLAGS: + emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); + + if (optimize + || target == NULL_RTX + || !nonimmediate_operand (target, word_mode) + || GET_MODE (target) != word_mode) + target = gen_reg_rtx (word_mode); + + emit_insn (gen_pop (target)); + return target; + + case IX86_BUILTIN_WRITE_FLAGS: + + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + if (!general_no_elim_operand (op0, word_mode)) + op0 = copy_to_mode_reg (word_mode, op0); + + emit_insn (gen_push (op0)); + emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); + return 0; + + case IX86_BUILTIN_KTESTC8: + icode = CODE_FOR_ktestqi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KTESTZ8: + icode = CODE_FOR_ktestqi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KTESTC16: + icode = CODE_FOR_ktesthi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KTESTZ16: + icode = CODE_FOR_ktesthi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KTESTC32: + icode = CODE_FOR_ktestsi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KTESTZ32: + icode = CODE_FOR_ktestsi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KTESTC64: + icode = CODE_FOR_ktestdi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KTESTZ64: + icode = CODE_FOR_ktestdi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KORTESTC8: + icode = CODE_FOR_kortestqi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ8: + icode = CODE_FOR_kortestqi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KORTESTC16: + icode = CODE_FOR_kortesthi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ16: + icode = CODE_FOR_kortesthi; + 
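Illustrative sketch (not part of the patch itself): a user-level use of the carry builtins expanded by the handlecarry code above. A literal zero carry-in takes the icode2 path (a plain add/sub that sets the flags); a non-zero carry-in first regenerates CF and then uses the CF-consuming pattern. Assumes <immintrin.h> provides _addcarry_u32, as in current GCC.

    #include <immintrin.h>

    unsigned char
    add_64_via_32 (unsigned int a_lo, unsigned int a_hi,
                   unsigned int b_lo, unsigned int b_hi, unsigned int out[2])
    {
      unsigned char cf = _addcarry_u32 (0, a_lo, b_lo, &out[0]); /* c_in == 0 */
      return _addcarry_u32 (cf, a_hi, b_hi, &out[1]);            /* consumes CF */
    }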
mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KORTESTC32: + icode = CODE_FOR_kortestsi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ32: + icode = CODE_FOR_kortestsi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KORTESTC64: + icode = CODE_FOR_kortestdi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ64: + icode = CODE_FOR_kortestdi; + mode3 = CCZmode; + + kortest: + arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ + arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */ + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + mode0 = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + + if (GET_MODE (op0) != VOIDmode) + op0 = force_reg (GET_MODE (op0), op0); + + op0 = gen_lowpart (mode0, op0); + + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + + if (GET_MODE (op1) != VOIDmode) + op1 = force_reg (GET_MODE (op1), op1); + + op1 = gen_lowpart (mode1, op1); + + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + target = gen_reg_rtx (QImode); + + /* Emit kortest. */ + emit_insn (GEN_FCN (icode) (op0, op1)); + /* And use setcc to return result from flags. */ + ix86_expand_setcc (target, EQ, + gen_rtx_REG (mode3, FLAGS_REG), const0_rtx); + return target; + + case IX86_BUILTIN_GATHERSIV2DF: + icode = CODE_FOR_avx2_gathersiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4DF: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV2DF: + icode = CODE_FOR_avx2_gatherdiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4DF: + icode = CODE_FOR_avx2_gatherdiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4SF: + icode = CODE_FOR_avx2_gathersiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV8SF: + icode = CODE_FOR_avx2_gathersiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4SF: + icode = CODE_FOR_avx2_gatherdiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV8SF: + icode = CODE_FOR_avx2_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV2DI: + icode = CODE_FOR_avx2_gathersiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4DI: + icode = CODE_FOR_avx2_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV2DI: + icode = CODE_FOR_avx2_gatherdiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4DI: + icode = CODE_FOR_avx2_gatherdiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4SI: + icode = CODE_FOR_avx2_gathersiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV8SI: + icode = CODE_FOR_avx2_gathersiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4SI: + icode = CODE_FOR_avx2_gatherdiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV8SI: + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DF: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SF: + icode = CODE_FOR_avx2_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DI: + icode = CODE_FOR_avx2_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SI: + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV16SF: + icode = CODE_FOR_avx512f_gathersiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8DF: + icode = CODE_FOR_avx512f_gathersiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV16SF: + icode = CODE_FOR_avx512f_gatherdiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8DF: + icode = 
CODE_FOR_avx512f_gatherdiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV16SI: + icode = CODE_FOR_avx512f_gathersiv16si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8DI: + icode = CODE_FOR_avx512f_gathersiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV16SI: + icode = CODE_FOR_avx512f_gatherdiv16si; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8DI: + icode = CODE_FOR_avx512f_gatherdiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV8DF: + icode = CODE_FOR_avx512f_gathersiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV16SF: + icode = CODE_FOR_avx512f_gatherdiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV8DI: + icode = CODE_FOR_avx512f_gathersiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV16SI: + icode = CODE_FOR_avx512f_gatherdiv16si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV2DF: + icode = CODE_FOR_avx512vl_gathersiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV4DF: + icode = CODE_FOR_avx512vl_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV2DF: + icode = CODE_FOR_avx512vl_gatherdiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV4DF: + icode = CODE_FOR_avx512vl_gatherdiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV4SF: + icode = CODE_FOR_avx512vl_gathersiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8SF: + icode = CODE_FOR_avx512vl_gathersiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV4SF: + icode = CODE_FOR_avx512vl_gatherdiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8SF: + icode = CODE_FOR_avx512vl_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV2DI: + icode = CODE_FOR_avx512vl_gathersiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV4DI: + icode = CODE_FOR_avx512vl_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV2DI: + icode = CODE_FOR_avx512vl_gatherdiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV4DI: + icode = CODE_FOR_avx512vl_gatherdiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV4SI: + icode = CODE_FOR_avx512vl_gathersiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8SI: + icode = CODE_FOR_avx512vl_gathersiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV4SI: + icode = CODE_FOR_avx512vl_gatherdiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8SI: + icode = CODE_FOR_avx512vl_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV4DF: + icode = CODE_FOR_avx512vl_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV8SF: + icode = CODE_FOR_avx512vl_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV4DI: + icode = CODE_FOR_avx512vl_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV8SI: + icode = CODE_FOR_avx512vl_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_SCATTERSIV16SF: + icode = CODE_FOR_avx512f_scattersiv16sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV8DF: + icode = CODE_FOR_avx512f_scattersiv8df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV16SF: + icode = CODE_FOR_avx512f_scatterdiv16sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8DF: + icode = CODE_FOR_avx512f_scatterdiv8df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV16SI: + icode = CODE_FOR_avx512f_scattersiv16si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV8DI: + icode = CODE_FOR_avx512f_scattersiv8di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV16SI: + icode = CODE_FOR_avx512f_scatterdiv16si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8DI: + icode = CODE_FOR_avx512f_scatterdiv8di; + goto scatter_gen; + case 
IX86_BUILTIN_SCATTERSIV8SF: + icode = CODE_FOR_avx512vl_scattersiv8sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV4SF: + icode = CODE_FOR_avx512vl_scattersiv4sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV4DF: + icode = CODE_FOR_avx512vl_scattersiv4df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV2DF: + icode = CODE_FOR_avx512vl_scattersiv2df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8SF: + icode = CODE_FOR_avx512vl_scatterdiv8sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV4SF: + icode = CODE_FOR_avx512vl_scatterdiv4sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV4DF: + icode = CODE_FOR_avx512vl_scatterdiv4df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV2DF: + icode = CODE_FOR_avx512vl_scatterdiv2df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV8SI: + icode = CODE_FOR_avx512vl_scattersiv8si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV4SI: + icode = CODE_FOR_avx512vl_scattersiv4si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV4DI: + icode = CODE_FOR_avx512vl_scattersiv4di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV2DI: + icode = CODE_FOR_avx512vl_scattersiv2di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8SI: + icode = CODE_FOR_avx512vl_scatterdiv8si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV4SI: + icode = CODE_FOR_avx512vl_scatterdiv4si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV4DI: + icode = CODE_FOR_avx512vl_scatterdiv4di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV2DI: + icode = CODE_FOR_avx512vl_scatterdiv2di; + goto scatter_gen; + case IX86_BUILTIN_GATHERPFDPD: + icode = CODE_FOR_avx512pf_gatherpfv8sidf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERALTSIV8DF: + icode = CODE_FOR_avx512f_scattersiv8df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV16SF: + icode = CODE_FOR_avx512f_scatterdiv16sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV8DI: + icode = CODE_FOR_avx512f_scattersiv8di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV16SI: + icode = CODE_FOR_avx512f_scatterdiv16si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV4DF: + icode = CODE_FOR_avx512vl_scattersiv4df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV8SF: + icode = CODE_FOR_avx512vl_scatterdiv8sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV4DI: + icode = CODE_FOR_avx512vl_scattersiv4di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV8SI: + icode = CODE_FOR_avx512vl_scatterdiv8si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV2DF: + icode = CODE_FOR_avx512vl_scattersiv2df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV4SF: + icode = CODE_FOR_avx512vl_scatterdiv4sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV2DI: + icode = CODE_FOR_avx512vl_scattersiv2di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV4SI: + icode = CODE_FOR_avx512vl_scatterdiv4si; + goto scatter_gen; + case IX86_BUILTIN_GATHERPFDPS: + icode = CODE_FOR_avx512pf_gatherpfv16sisf; + goto vec_prefetch_gen; + case IX86_BUILTIN_GATHERPFQPD: + icode = CODE_FOR_avx512pf_gatherpfv8didf; + goto vec_prefetch_gen; + case IX86_BUILTIN_GATHERPFQPS: + icode = CODE_FOR_avx512pf_gatherpfv8disf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFDPD: + icode = CODE_FOR_avx512pf_scatterpfv8sidf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFDPS: + icode = CODE_FOR_avx512pf_scatterpfv16sisf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFQPD: + icode = CODE_FOR_avx512pf_scatterpfv8didf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFQPS: + icode = 
CODE_FOR_avx512pf_scatterpfv8disf; + goto vec_prefetch_gen; + + gather_gen: + rtx half; + rtx (*gen) (rtx, rtx); + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + arg4 = CALL_EXPR_ARG (exp, 4); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + op4 = expand_normal (arg4); + /* Note the arg order is different from the operand order. */ + mode0 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[3].mode; + mode3 = insn_data[icode].operand[4].mode; + mode4 = insn_data[icode].operand[5].mode; + + if (target == NULL_RTX + || GET_MODE (target) != insn_data[icode].operand[0].mode + || !insn_data[icode].operand[0].predicate (target, + GET_MODE (target))) + subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); + else + subtarget = target; + + switch (fcode) + { + case IX86_BUILTIN_GATHER3ALTSIV8DF: + case IX86_BUILTIN_GATHER3ALTSIV8DI: + half = gen_reg_rtx (V8SImode); + if (!nonimmediate_operand (op2, V16SImode)) + op2 = copy_to_mode_reg (V16SImode, op2); + emit_insn (gen_vec_extract_lo_v16si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_GATHER3ALTSIV4DF: + case IX86_BUILTIN_GATHER3ALTSIV4DI: + case IX86_BUILTIN_GATHERALTSIV4DF: + case IX86_BUILTIN_GATHERALTSIV4DI: + half = gen_reg_rtx (V4SImode); + if (!nonimmediate_operand (op2, V8SImode)) + op2 = copy_to_mode_reg (V8SImode, op2); + emit_insn (gen_vec_extract_lo_v8si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_GATHER3ALTDIV16SF: + case IX86_BUILTIN_GATHER3ALTDIV16SI: + half = gen_reg_rtx (mode0); + if (mode0 == V8SFmode) + gen = gen_vec_extract_lo_v16sf; + else + gen = gen_vec_extract_lo_v16si; + if (!nonimmediate_operand (op0, GET_MODE (op0))) + op0 = copy_to_mode_reg (GET_MODE (op0), op0); + emit_insn (gen (half, op0)); + op0 = half; + op3 = lowpart_subreg (QImode, op3, HImode); + break; + case IX86_BUILTIN_GATHER3ALTDIV8SF: + case IX86_BUILTIN_GATHER3ALTDIV8SI: + case IX86_BUILTIN_GATHERALTDIV8SF: + case IX86_BUILTIN_GATHERALTDIV8SI: + half = gen_reg_rtx (mode0); + if (mode0 == V4SFmode) + gen = gen_vec_extract_lo_v8sf; + else + gen = gen_vec_extract_lo_v8si; + if (!nonimmediate_operand (op0, GET_MODE (op0))) + op0 = copy_to_mode_reg (GET_MODE (op0), op0); + emit_insn (gen (half, op0)); + op0 = half; + if (VECTOR_MODE_P (GET_MODE (op3))) + { + half = gen_reg_rtx (mode0); + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + } + break; + default: + break; + } + + /* Force memory operand only with base register here. But we + don't want to do it on memory operand for other builtin + functions. 
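[Editorial note, not part of the upstream patch: in this gather_gen path op1 is the user-supplied base pointer, op2 the index vector, op3 the writemask and op4 the scale. A minimal sketch of user code that reaches it, assuming <immintrin.h> and -mavx2, is:

    #include <immintrin.h>
    __m256d
    gather4 (const double *base, __m128i idx)
    {
      // base becomes op1 below; the scale must be a literal 1, 2, 4 or 8
      return _mm256_i32gather_pd (base, idx, 8);
    }

which is why only the scalar base address, never a full memory operand, is zero-extended to Pmode here.]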
*/ + op1 = ix86_zero_extend_to_Pmode (op1); + + if (!insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[2].predicate (op1, Pmode)) + op1 = copy_to_mode_reg (Pmode, op1); + if (!insn_data[icode].operand[3].predicate (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + + op3 = fixup_modeless_constant (op3, mode3); + + if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) + { + if (!insn_data[icode].operand[4].predicate (op3, mode3)) + op3 = copy_to_mode_reg (mode3, op3); + } + else + { + op3 = copy_to_reg (op3); + op3 = lowpart_subreg (mode3, op3, GET_MODE (op3)); + } + if (!insn_data[icode].operand[5].predicate (op4, mode4)) + { + error ("the last argument must be scale 1, 2, 4, 8"); + return const0_rtx; + } + + /* Optimize. If mask is known to have all high bits set, + replace op0 with pc_rtx to signal that the instruction + overwrites the whole destination and doesn't use its + previous contents. */ + if (optimize) + { + if (TREE_CODE (arg3) == INTEGER_CST) + { + if (integer_all_onesp (arg3)) + op0 = pc_rtx; + } + else if (TREE_CODE (arg3) == VECTOR_CST) + { + unsigned int negative = 0; + for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) + { + tree cst = VECTOR_CST_ELT (arg3, i); + if (TREE_CODE (cst) == INTEGER_CST + && tree_int_cst_sign_bit (cst)) + negative++; + else if (TREE_CODE (cst) == REAL_CST + && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) + negative++; + } + if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) + op0 = pc_rtx; + } + else if (TREE_CODE (arg3) == SSA_NAME + && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) + { + /* Recognize also when mask is like: + __v2df src = _mm_setzero_pd (); + __v2df mask = _mm_cmpeq_pd (src, src); + or + __v8sf src = _mm256_setzero_ps (); + __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); + as that is a cheaper way to load all ones into + a register than having to load a constant from + memory. */ + gimple *def_stmt = SSA_NAME_DEF_STMT (arg3); + if (is_gimple_call (def_stmt)) + { + tree fndecl = gimple_call_fndecl (def_stmt); + if (fndecl + && fndecl_built_in_p (fndecl, BUILT_IN_MD)) + switch (DECL_MD_FUNCTION_CODE (fndecl)) + { + case IX86_BUILTIN_CMPPD: + case IX86_BUILTIN_CMPPS: + case IX86_BUILTIN_CMPPD256: + case IX86_BUILTIN_CMPPS256: + if (!integer_zerop (gimple_call_arg (def_stmt, 2))) + break; + /* FALLTHRU */ + case IX86_BUILTIN_CMPEQPD: + case IX86_BUILTIN_CMPEQPS: + if (initializer_zerop (gimple_call_arg (def_stmt, 0)) + && initializer_zerop (gimple_call_arg (def_stmt, + 1))) + op0 = pc_rtx; + break; + default: + break; + } + } + } + } + + pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); + if (! 
pat) + return const0_rtx; + emit_insn (pat); + + switch (fcode) + { + case IX86_BUILTIN_GATHER3DIV16SF: + if (target == NULL_RTX) + target = gen_reg_rtx (V8SFmode); + emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); + break; + case IX86_BUILTIN_GATHER3DIV16SI: + if (target == NULL_RTX) + target = gen_reg_rtx (V8SImode); + emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); + break; + case IX86_BUILTIN_GATHER3DIV8SF: + case IX86_BUILTIN_GATHERDIV8SF: + if (target == NULL_RTX) + target = gen_reg_rtx (V4SFmode); + emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); + break; + case IX86_BUILTIN_GATHER3DIV8SI: + case IX86_BUILTIN_GATHERDIV8SI: + if (target == NULL_RTX) + target = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); + break; + default: + target = subtarget; + break; + } + return target; + + scatter_gen: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + arg4 = CALL_EXPR_ARG (exp, 4); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + op4 = expand_normal (arg4); + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; + + /* Scatter instruction stores operand op3 to memory with + indices from op2 and scale from op4 under writemask op1. + If index operand op2 has more elements then source operand + op3 one need to use only its low half. And vice versa. */ + switch (fcode) + { + case IX86_BUILTIN_SCATTERALTSIV8DF: + case IX86_BUILTIN_SCATTERALTSIV8DI: + half = gen_reg_rtx (V8SImode); + if (!nonimmediate_operand (op2, V16SImode)) + op2 = copy_to_mode_reg (V16SImode, op2); + emit_insn (gen_vec_extract_lo_v16si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_SCATTERALTDIV16SF: + case IX86_BUILTIN_SCATTERALTDIV16SI: + half = gen_reg_rtx (mode3); + if (mode3 == V8SFmode) + gen = gen_vec_extract_lo_v16sf; + else + gen = gen_vec_extract_lo_v16si; + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + break; + case IX86_BUILTIN_SCATTERALTSIV4DF: + case IX86_BUILTIN_SCATTERALTSIV4DI: + half = gen_reg_rtx (V4SImode); + if (!nonimmediate_operand (op2, V8SImode)) + op2 = copy_to_mode_reg (V8SImode, op2); + emit_insn (gen_vec_extract_lo_v8si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_SCATTERALTDIV8SF: + case IX86_BUILTIN_SCATTERALTDIV8SI: + half = gen_reg_rtx (mode3); + if (mode3 == V4SFmode) + gen = gen_vec_extract_lo_v8sf; + else + gen = gen_vec_extract_lo_v8si; + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + break; + case IX86_BUILTIN_SCATTERALTSIV2DF: + case IX86_BUILTIN_SCATTERALTSIV2DI: + if (!nonimmediate_operand (op2, V4SImode)) + op2 = copy_to_mode_reg (V4SImode, op2); + break; + case IX86_BUILTIN_SCATTERALTDIV4SF: + case IX86_BUILTIN_SCATTERALTDIV4SI: + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + break; + default: + break; + } + + /* Force memory operand only with base register here. But we + don't want to do it on memory operand for other builtin + functions. 
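[Editorial note, not part of the upstream patch: for the scatter builtins op0 is the base pointer, op1 the writemask, op2 the index vector, op3 the stored data and op4 the scale; the ALT cases above extract the low half of whichever of op2/op3 has twice as many elements as the other. A hedged user-level sketch, assuming <immintrin.h> and -mavx512f:

    #include <immintrin.h>
    void
    scatter8 (double *base, __m256i idx, __m512d val)
    {
      // eight doubles stored at base + idx[i] * 8; base becomes op0 below
      _mm512_i32scatter_pd (base, idx, val, 8);
    }
]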
*/ + op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1)); + + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = copy_to_mode_reg (Pmode, op0); + + op1 = fixup_modeless_constant (op1, mode1); + + if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode) + { + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + } + else + { + op1 = copy_to_reg (op1); + op1 = lowpart_subreg (mode1, op1, GET_MODE (op1)); + } + + if (!insn_data[icode].operand[2].predicate (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + + if (!insn_data[icode].operand[3].predicate (op3, mode3)) + op3 = copy_to_mode_reg (mode3, op3); + + if (!insn_data[icode].operand[4].predicate (op4, mode4)) + { + error ("the last argument must be scale 1, 2, 4, 8"); + return const0_rtx; + } + + pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); + if (! pat) + return const0_rtx; + + emit_insn (pat); + return 0; + + vec_prefetch_gen: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + arg4 = CALL_EXPR_ARG (exp, 4); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + op4 = expand_normal (arg4); + mode0 = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; + + op0 = fixup_modeless_constant (op0, mode0); + + if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode) + { + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + } + else + { + op0 = copy_to_reg (op0); + op0 = lowpart_subreg (mode0, op0, GET_MODE (op0)); + } + + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + /* Force memory operand only with base register here. But we + don't want to do it on memory operand for other builtin + functions. */ + op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1)); + + if (!insn_data[icode].operand[2].predicate (op2, Pmode)) + op2 = copy_to_mode_reg (Pmode, op2); + + if (!insn_data[icode].operand[3].predicate (op3, mode3)) + { + error ("the forth argument must be scale 1, 2, 4, 8"); + return const0_rtx; + } + + if (!insn_data[icode].operand[4].predicate (op4, mode4)) + { + error ("incorrect hint operand"); + return const0_rtx; + } + + pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); + if (! pat) + return const0_rtx; + + emit_insn (pat); + + return 0; + + case IX86_BUILTIN_XABORT: + icode = CODE_FOR_xabort; + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + mode0 = insn_data[icode].operand[0].mode; + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + { + error ("the argument to %<xabort%> intrinsic must " + "be an 8-bit immediate"); + return const0_rtx; + } + emit_insn (gen_xabort (op0)); + return 0; + + case IX86_BUILTIN_RSTORSSP: + case IX86_BUILTIN_CLRSSBSY: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = (fcode == IX86_BUILTIN_RSTORSSP + ?
CODE_FOR_rstorssp + : CODE_FOR_clrssbsy); + if (!address_operand (op0, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op1); + } + emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); + return 0; + + case IX86_BUILTIN_WRSSD: + case IX86_BUILTIN_WRSSQ: + case IX86_BUILTIN_WRUSSD: + case IX86_BUILTIN_WRUSSQ: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + arg1 = CALL_EXPR_ARG (exp, 1); + op1 = expand_normal (arg1); + switch (fcode) + { + case IX86_BUILTIN_WRSSD: + icode = CODE_FOR_wrsssi; + mode = SImode; + break; + case IX86_BUILTIN_WRSSQ: + icode = CODE_FOR_wrssdi; + mode = DImode; + break; + case IX86_BUILTIN_WRUSSD: + icode = CODE_FOR_wrusssi; + mode = SImode; + break; + case IX86_BUILTIN_WRUSSQ: + icode = CODE_FOR_wrussdi; + mode = DImode; + break; + } + op0 = force_reg (mode, op0); + if (!address_operand (op1, VOIDmode)) + { + op2 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op2); + } + emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); + return 0; + + default: + break; + } + + if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST + && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST; + return ix86_expand_special_args_builtin (bdesc_special_args + i, exp, + target); + } + + if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST + && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST; + rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL; + rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx); + rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx); + int masked = 1; + machine_mode mode, wide_mode, nar_mode; + + nar_mode = V4SFmode; + mode = V16SFmode; + wide_mode = V64SFmode; + fcn_mask = gen_avx5124fmaddps_4fmaddps_mask; + fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz; + + switch (fcode) + { + case IX86_BUILTIN_4FMAPS: + fcn = gen_avx5124fmaddps_4fmaddps; + masked = 0; + goto v4fma_expand; + + case IX86_BUILTIN_4DPWSSD: + nar_mode = V4SImode; + mode = V16SImode; + wide_mode = V64SImode; + fcn = gen_avx5124vnniw_vp4dpwssd; + masked = 0; + goto v4fma_expand; + + case IX86_BUILTIN_4DPWSSDS: + nar_mode = V4SImode; + mode = V16SImode; + wide_mode = V64SImode; + fcn = gen_avx5124vnniw_vp4dpwssds; + masked = 0; + goto v4fma_expand; + + case IX86_BUILTIN_4FNMAPS: + fcn = gen_avx5124fmaddps_4fnmaddps; + masked = 0; + goto v4fma_expand; + + case IX86_BUILTIN_4FNMAPS_MASK: + fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask; + fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz; + goto v4fma_expand; + + case IX86_BUILTIN_4DPWSSD_MASK: + nar_mode = V4SImode; + mode = V16SImode; + wide_mode = V64SImode; + fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask; + fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz; + goto v4fma_expand; + + case IX86_BUILTIN_4DPWSSDS_MASK: + nar_mode = V4SImode; + mode = V16SImode; + wide_mode = V64SImode; + fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask; + fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz; + goto v4fma_expand; + + case IX86_BUILTIN_4FMAPS_MASK: + { + tree args[4]; + rtx ops[4]; + rtx wide_reg; + rtx accum; + rtx addr; + rtx mem; + +v4fma_expand: + wide_reg = gen_reg_rtx (wide_mode); + for (i = 0; i < 4; i++) + { + args[i] = CALL_EXPR_ARG (exp, i); + ops[i] = expand_normal (args[i]); + + emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64), + ops[i]); + } + + accum = expand_normal (CALL_EXPR_ARG (exp, 4)); + accum = force_reg (mode, accum); + + addr = expand_normal (CALL_EXPR_ARG (exp, 5)); + addr = force_reg (Pmode, addr); + + mem = gen_rtx_MEM 
(nar_mode, addr); + + target = gen_reg_rtx (mode); + + emit_move_insn (target, accum); + + if (! masked) + emit_insn (fcn (target, accum, wide_reg, mem)); + else + { + rtx merge, mask; + merge = expand_normal (CALL_EXPR_ARG (exp, 6)); + + mask = expand_normal (CALL_EXPR_ARG (exp, 7)); + + if (CONST_INT_P (mask)) + mask = fixup_modeless_constant (mask, HImode); + + mask = force_reg (HImode, mask); + + if (GET_MODE (mask) != HImode) + mask = gen_rtx_SUBREG (HImode, mask, 0); + + /* If merge is 0 then we're about to emit z-masked variant. */ + if (const0_operand (merge, mode)) + emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); + /* If merge is the same as accum then emit merge-masked variant. */ + else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) + { + merge = force_reg (mode, merge); + emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); + } + /* Merge with something unknown might happen if we z-mask w/ -O0. */ + else + { + target = gen_reg_rtx (mode); + emit_move_insn (target, merge); + emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); + } + } + return target; + } + + case IX86_BUILTIN_4FNMASS: + fcn = gen_avx5124fmaddps_4fnmaddss; + masked = 0; + goto s4fma_expand; + + case IX86_BUILTIN_4FMASS: + fcn = gen_avx5124fmaddps_4fmaddss; + masked = 0; + goto s4fma_expand; + + case IX86_BUILTIN_4FNMASS_MASK: + fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask; + fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz; + goto s4fma_expand; + + case IX86_BUILTIN_4FMASS_MASK: + { + tree args[4]; + rtx ops[4]; + rtx wide_reg; + rtx accum; + rtx addr; + rtx mem; + + fcn_mask = gen_avx5124fmaddps_4fmaddss_mask; + fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz; + +s4fma_expand: + mode = V4SFmode; + wide_reg = gen_reg_rtx (V64SFmode); + for (i = 0; i < 4; i++) + { + rtx tmp; + args[i] = CALL_EXPR_ARG (exp, i); + ops[i] = expand_normal (args[i]); + + tmp = gen_reg_rtx (SFmode); + emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0)); + + emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64), + gen_rtx_SUBREG (V16SFmode, tmp, 0)); + } + + accum = expand_normal (CALL_EXPR_ARG (exp, 4)); + accum = force_reg (V4SFmode, accum); + + addr = expand_normal (CALL_EXPR_ARG (exp, 5)); + addr = force_reg (Pmode, addr); + + mem = gen_rtx_MEM (V4SFmode, addr); + + target = gen_reg_rtx (V4SFmode); + + emit_move_insn (target, accum); + + if (! masked) + emit_insn (fcn (target, accum, wide_reg, mem)); + else + { + rtx merge, mask; + merge = expand_normal (CALL_EXPR_ARG (exp, 6)); + + mask = expand_normal (CALL_EXPR_ARG (exp, 7)); + + if (CONST_INT_P (mask)) + mask = fixup_modeless_constant (mask, QImode); + + mask = force_reg (QImode, mask); + + if (GET_MODE (mask) != QImode) + mask = gen_rtx_SUBREG (QImode, mask, 0); + + /* If merge is 0 then we're about to emit z-masked variant. */ + if (const0_operand (merge, mode)) + emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); + /* If merge is the same as accum then emit merge-masked + variant. */ + else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) + { + merge = force_reg (mode, merge); + emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); + } + /* Merge with something unknown might happen if we z-mask + w/ -O0. 
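[Editorial note, not part of the upstream patch: the three branches below implement the usual AVX-512 masking semantics for the 4FMA/4VNNIW builtins. Schematically, with k the mask and res the unmasked result:

    // zero-masking   (merge operand is a literal 0)
    dst[i] = (k >> i) & 1 ? res[i] : 0;
    // merge-masking  (merge operand is the accumulator itself)
    dst[i] = (k >> i) & 1 ? res[i] : accum[i];
    // otherwise: merge into a fresh copy of the explicit merge value

so the expander picks fcn_maskz when the merge argument is constant zero and fcn_mask when it is the same tree as the accumulator.]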
*/ + else + { + target = gen_reg_rtx (mode); + emit_move_insn (target, merge); + emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); + } + } + return target; + } + case IX86_BUILTIN_RDPID: + return ix86_expand_special_args_builtin (bdesc_args + i, exp, + target); + case IX86_BUILTIN_FABSQ: + case IX86_BUILTIN_COPYSIGNQ: + if (!TARGET_SSE) + /* Emit a normal call if SSE isn't available. */ + return expand_call (exp, target, ignore); + /* FALLTHRU */ + default: + return ix86_expand_args_builtin (bdesc_args + i, exp, target); + } + } + + if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST + && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; + return ix86_expand_sse_comi (bdesc_comi + i, exp, target); + } + + if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST + && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST; + return ix86_expand_round_builtin (bdesc_round_args + i, exp, target); + } + + if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST + && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST; + return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target); + } + + if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST + && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST; + return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target); + } + + if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST + && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST; + const struct builtin_description *d = bdesc_multi_arg + i; + return ix86_expand_multi_arg_builtin (d->icode, exp, target, + (enum ix86_builtin_func_type) + d->flag, d->comparison); + } + + if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST + && fcode <= IX86_BUILTIN__BDESC_CET_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; + return ix86_expand_special_args_builtin (bdesc_cet + i, exp, + target); + } + + if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST + && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; + return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp, + target); + } + + gcc_unreachable (); +} + +/* A subroutine of ix86_expand_vector_init_duplicate. Tries to + fill target with val via vec_duplicate. */ + +static bool +ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val) +{ + bool ok; + rtx_insn *insn; + rtx dup; + + /* First attempt to recognize VAL as-is. */ + dup = gen_vec_duplicate (mode, val); + insn = emit_insn (gen_rtx_SET (target, dup)); + if (recog_memoized (insn) < 0) + { + rtx_insn *seq; + machine_mode innermode = GET_MODE_INNER (mode); + rtx reg; + + /* If that fails, force VAL into a register. */ + + start_sequence (); + reg = force_reg (innermode, val); + if (GET_MODE (reg) != innermode) + reg = gen_lowpart (innermode, reg); + SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg); + seq = get_insns (); + end_sequence (); + if (seq) + emit_insn_before (seq, insn); + + ok = recog_memoized (insn) >= 0; + gcc_assert (ok); + } + return true; +} + +/* Get a vector mode of the same size as the original but with elements + twice as wide. This is only guaranteed to apply to integral vectors. */ + +static machine_mode +get_mode_wider_vector (machine_mode o) +{ + /* ??? Rely on the ordering that genmodes.c gives to vectors. 
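[Editorial note, not part of the upstream patch: for the integral vector modes this helper is used with, the chain produced by GET_MODE_WIDER_MODE keeps the byte size and halves the element count, e.g.

    V16QImode (16 x 1 byte) -> V8HImode (8 x 2) -> V4SImode (4 x 4) -> V2DImode (2 x 8)

which is exactly what the two assertions below check.]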
*/ + machine_mode n = GET_MODE_WIDER_MODE (o).require (); + gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2); + gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n)); + return n; +} + +static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); +static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + with all elements equal to VAR. Return true if successful. */ + +static bool +ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, + rtx target, rtx val) +{ + bool ok; + + switch (mode) + { + case E_V2SImode: + case E_V2SFmode: + if (!mmx_ok) + return false; + /* FALLTHRU */ + + case E_V4DFmode: + case E_V4DImode: + case E_V8SFmode: + case E_V8SImode: + case E_V2DFmode: + case E_V2DImode: + case E_V4SFmode: + case E_V4SImode: + case E_V16SImode: + case E_V8DImode: + case E_V16SFmode: + case E_V8DFmode: + return ix86_vector_duplicate_value (mode, target, val); + + case E_V4HImode: + if (!mmx_ok) + return false; + if (TARGET_SSE || TARGET_3DNOW_A) + { + rtx x; + + val = gen_lowpart (SImode, val); + x = gen_rtx_TRUNCATE (HImode, val); + x = gen_rtx_VEC_DUPLICATE (mode, x); + emit_insn (gen_rtx_SET (target, x)); + return true; + } + goto widen; + + case E_V8QImode: + if (!mmx_ok) + return false; + goto widen; + + case E_V8HImode: + if (TARGET_AVX2) + return ix86_vector_duplicate_value (mode, target, val); + + if (TARGET_SSE2) + { + struct expand_vec_perm_d dperm; + rtx tmp1, tmp2; + + permute: + memset (&dperm, 0, sizeof (dperm)); + dperm.target = target; + dperm.vmode = mode; + dperm.nelt = GET_MODE_NUNITS (mode); + dperm.op0 = dperm.op1 = gen_reg_rtx (mode); + dperm.one_operand_p = true; + + /* Extend to SImode using a paradoxical SUBREG. */ + tmp1 = gen_reg_rtx (SImode); + emit_move_insn (tmp1, gen_lowpart (SImode, val)); + + /* Insert the SImode value as low element of a V4SImode vector. */ + tmp2 = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); + emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); + + ok = (expand_vec_perm_1 (&dperm) + || expand_vec_perm_broadcast_1 (&dperm)); + gcc_assert (ok); + return ok; + } + goto widen; + + case E_V16QImode: + if (TARGET_AVX2) + return ix86_vector_duplicate_value (mode, target, val); + + if (TARGET_SSE2) + goto permute; + goto widen; + + widen: + /* Replicate the value once into the next wider mode and recurse. */ + { + machine_mode smode, wsmode, wvmode; + rtx x; + + smode = GET_MODE_INNER (mode); + wvmode = get_mode_wider_vector (mode); + wsmode = GET_MODE_INNER (wvmode); + + val = convert_modes (wsmode, smode, val, true); + x = expand_simple_binop (wsmode, ASHIFT, val, + GEN_INT (GET_MODE_BITSIZE (smode)), + NULL_RTX, 1, OPTAB_LIB_WIDEN); + val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); + + x = gen_reg_rtx (wvmode); + ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); + gcc_assert (ok); + emit_move_insn (target, gen_lowpart (GET_MODE (target), x)); + return ok; + } + + case E_V16HImode: + case E_V32QImode: + if (TARGET_AVX2) + return ix86_vector_duplicate_value (mode, target, val); + else + { + machine_mode hvmode = (mode == V16HImode ? 
V8HImode : V16QImode); + rtx x = gen_reg_rtx (hvmode); + + ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); + gcc_assert (ok); + + x = gen_rtx_VEC_CONCAT (mode, x, x); + emit_insn (gen_rtx_SET (target, x)); + } + return true; + + case E_V64QImode: + case E_V32HImode: + if (TARGET_AVX512BW) + return ix86_vector_duplicate_value (mode, target, val); + else + { + machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode); + rtx x = gen_reg_rtx (hvmode); + + ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); + gcc_assert (ok); + + x = gen_rtx_VEC_CONCAT (mode, x, x); + emit_insn (gen_rtx_SET (target, x)); + } + return true; + + default: + return false; + } +} + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + whose ONE_VAR element is VAR, and other elements are zero. Return true + if successful. */ + +static bool +ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, + rtx target, rtx var, int one_var) +{ + machine_mode vsimode; + rtx new_target; + rtx x, tmp; + bool use_vector_set = false; + rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL; + + switch (mode) + { + case E_V2DImode: + /* For SSE4.1, we normally use vector set. But if the second + element is zero and inter-unit moves are OK, we use movq + instead. */ + use_vector_set = (TARGET_64BIT && TARGET_SSE4_1 + && !(TARGET_INTER_UNIT_MOVES_TO_VEC + && one_var == 0)); + break; + case E_V16QImode: + case E_V4SImode: + case E_V4SFmode: + use_vector_set = TARGET_SSE4_1; + break; + case E_V8HImode: + use_vector_set = TARGET_SSE2; + break; + case E_V4HImode: + use_vector_set = TARGET_SSE || TARGET_3DNOW_A; + break; + case E_V32QImode: + case E_V16HImode: + use_vector_set = TARGET_AVX; + break; + case E_V8SImode: + use_vector_set = TARGET_AVX; + gen_vec_set_0 = gen_vec_setv8si_0; + break; + case E_V8SFmode: + use_vector_set = TARGET_AVX; + gen_vec_set_0 = gen_vec_setv8sf_0; + break; + case E_V4DFmode: + use_vector_set = TARGET_AVX; + gen_vec_set_0 = gen_vec_setv4df_0; + break; + case E_V4DImode: + /* Use ix86_expand_vector_set in 64bit mode only. */ + use_vector_set = TARGET_AVX && TARGET_64BIT; + gen_vec_set_0 = gen_vec_setv4di_0; + break; + case E_V16SImode: + use_vector_set = TARGET_AVX512F && one_var == 0; + gen_vec_set_0 = gen_vec_setv16si_0; + break; + case E_V16SFmode: + use_vector_set = TARGET_AVX512F && one_var == 0; + gen_vec_set_0 = gen_vec_setv16sf_0; + break; + case E_V8DFmode: + use_vector_set = TARGET_AVX512F && one_var == 0; + gen_vec_set_0 = gen_vec_setv8df_0; + break; + case E_V8DImode: + /* Use ix86_expand_vector_set in 64bit mode only. 
*/ + use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; + gen_vec_set_0 = gen_vec_setv8di_0; + break; + default: + break; + } + + if (use_vector_set) + { + if (gen_vec_set_0 && one_var == 0) + { + var = force_reg (GET_MODE_INNER (mode), var); + emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var)); + return true; + } + emit_insn (gen_rtx_SET (target, CONST0_RTX (mode))); + var = force_reg (GET_MODE_INNER (mode), var); + ix86_expand_vector_set (mmx_ok, target, var, one_var); + return true; + } + + switch (mode) + { + case E_V2SFmode: + case E_V2SImode: + if (!mmx_ok) + return false; + /* FALLTHRU */ + + case E_V2DFmode: + case E_V2DImode: + if (one_var != 0) + return false; + var = force_reg (GET_MODE_INNER (mode), var); + x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); + emit_insn (gen_rtx_SET (target, x)); + return true; + + case E_V4SFmode: + case E_V4SImode: + if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) + new_target = gen_reg_rtx (mode); + else + new_target = target; + var = force_reg (GET_MODE_INNER (mode), var); + x = gen_rtx_VEC_DUPLICATE (mode, var); + x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); + emit_insn (gen_rtx_SET (new_target, x)); + if (one_var != 0) + { + /* We need to shuffle the value to the correct position, so + create a new pseudo to store the intermediate result. */ + + /* With SSE2, we can use the integer shuffle insns. */ + if (mode != V4SFmode && TARGET_SSE2) + { + emit_insn (gen_sse2_pshufd_1 (new_target, new_target, + const1_rtx, + GEN_INT (one_var == 1 ? 0 : 1), + GEN_INT (one_var == 2 ? 0 : 1), + GEN_INT (one_var == 3 ? 0 : 1))); + if (target != new_target) + emit_move_insn (target, new_target); + return true; + } + + /* Otherwise convert the intermediate result to V4SFmode and + use the SSE1 shuffle instructions. */ + if (mode != V4SFmode) + { + tmp = gen_reg_rtx (V4SFmode); + emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); + } + else + tmp = new_target; + + emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp, + const1_rtx, + GEN_INT (one_var == 1 ? 0 : 1), + GEN_INT (one_var == 2 ? 0+4 : 1+4), + GEN_INT (one_var == 3 ? 0+4 : 1+4))); + + if (mode != V4SFmode) + emit_move_insn (target, gen_lowpart (V4SImode, tmp)); + else if (tmp != target) + emit_move_insn (target, tmp); + } + else if (target != new_target) + emit_move_insn (target, new_target); + return true; + + case E_V8HImode: + case E_V16QImode: + vsimode = V4SImode; + goto widen; + case E_V4HImode: + case E_V8QImode: + if (!mmx_ok) + return false; + vsimode = V2SImode; + goto widen; + widen: + if (one_var != 0) + return false; + + /* Zero extend the variable element to SImode and recurse. */ + var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); + + x = gen_reg_rtx (vsimode); + if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, + var, one_var)) + gcc_unreachable (); + + emit_move_insn (target, gen_lowpart (mode, x)); + return true; + + default: + return false; + } +} + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + consisting of the values in VALS. It is known that all elements + except ONE_VAR are constants. Return true if successful. 
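[Editorial note, not part of the upstream patch: a typical input is a vector with a single variable lane, e.g., assuming <immintrin.h>:

    __m128 v = _mm_set_ps (3.0f, 2.0f, 1.0f, x);   // lanes {x, 1, 2, 3}

The routine below rewrites the variable lane of the constant vector to zero, loads {0, 1, 2, 3} from the constant pool, and then overwrites lane 0 with x via ix86_expand_vector_set.]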
*/ + +static bool +ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, + rtx target, rtx vals, int one_var) +{ + rtx var = XVECEXP (vals, 0, one_var); + machine_mode wmode; + rtx const_vec, x; + + const_vec = copy_rtx (vals); + XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode)); + const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0)); + + switch (mode) + { + case E_V2DFmode: + case E_V2DImode: + case E_V2SFmode: + case E_V2SImode: + /* For the two element vectors, it's just as easy to use + the general case. */ + return false; + + case E_V4DImode: + /* Use ix86_expand_vector_set in 64bit mode only. */ + if (!TARGET_64BIT) + return false; + /* FALLTHRU */ + case E_V4DFmode: + case E_V8SFmode: + case E_V8SImode: + case E_V16HImode: + case E_V32QImode: + case E_V4SFmode: + case E_V4SImode: + case E_V8HImode: + case E_V4HImode: + break; + + case E_V16QImode: + if (TARGET_SSE4_1) + break; + wmode = V8HImode; + goto widen; + case E_V8QImode: + wmode = V4HImode; + goto widen; + widen: + /* There's no way to set one QImode entry easily. Combine + the variable value with its adjacent constant value, and + promote to an HImode set. */ + x = XVECEXP (vals, 0, one_var ^ 1); + if (one_var & 1) + { + var = convert_modes (HImode, QImode, var, true); + var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8), + NULL_RTX, 1, OPTAB_LIB_WIDEN); + x = GEN_INT (INTVAL (x) & 0xff); + } + else + { + var = convert_modes (HImode, QImode, var, true); + x = gen_int_mode (UINTVAL (x) << 8, HImode); + } + if (x != const0_rtx) + var = expand_simple_binop (HImode, IOR, var, x, var, + 1, OPTAB_LIB_WIDEN); + + x = gen_reg_rtx (wmode); + emit_move_insn (x, gen_lowpart (wmode, const_vec)); + ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1); + + emit_move_insn (target, gen_lowpart (mode, x)); + return true; + + default: + return false; + } + + emit_move_insn (target, const_vec); + ix86_expand_vector_set (mmx_ok, target, var, one_var); + return true; +} + +/* A subroutine of ix86_expand_vector_init_general. Use vector + concatenate to handle the most general case: all values variable, + and none identical. 
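[Editorial note, not part of the upstream patch: the recursion below splits the element list in half and concatenates; e.g. a V4SFmode init from {a, b, c, d} is built, roughly, as

    lo = {a, b}   hi = {c, d}              // two recursive V2SFmode inits
    target = (vec_concat:V4SF lo hi)

with the halves processed back to front to help the register allocator, as the PR 36222 note inside the function explains.]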
*/ + +static void +ix86_expand_vector_init_concat (machine_mode mode, + rtx target, rtx *ops, int n) +{ + machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode; + rtx first[16], second[8], third[4]; + rtvec v; + int i, j; + + switch (n) + { + case 2: + switch (mode) + { + case E_V16SImode: + cmode = V8SImode; + break; + case E_V16SFmode: + cmode = V8SFmode; + break; + case E_V8DImode: + cmode = V4DImode; + break; + case E_V8DFmode: + cmode = V4DFmode; + break; + case E_V8SImode: + cmode = V4SImode; + break; + case E_V8SFmode: + cmode = V4SFmode; + break; + case E_V4DImode: + cmode = V2DImode; + break; + case E_V4DFmode: + cmode = V2DFmode; + break; + case E_V4SImode: + cmode = V2SImode; + break; + case E_V4SFmode: + cmode = V2SFmode; + break; + case E_V2DImode: + cmode = DImode; + break; + case E_V2SImode: + cmode = SImode; + break; + case E_V2DFmode: + cmode = DFmode; + break; + case E_V2SFmode: + cmode = SFmode; + break; + default: + gcc_unreachable (); + } + + if (!register_operand (ops[1], cmode)) + ops[1] = force_reg (cmode, ops[1]); + if (!register_operand (ops[0], cmode)) + ops[0] = force_reg (cmode, ops[0]); + emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0], + ops[1]))); + break; + + case 4: + switch (mode) + { + case E_V4DImode: + cmode = V2DImode; + break; + case E_V4DFmode: + cmode = V2DFmode; + break; + case E_V4SImode: + cmode = V2SImode; + break; + case E_V4SFmode: + cmode = V2SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + + case 8: + switch (mode) + { + case E_V8DImode: + cmode = V2DImode; + hmode = V4DImode; + break; + case E_V8DFmode: + cmode = V2DFmode; + hmode = V4DFmode; + break; + case E_V8SImode: + cmode = V2SImode; + hmode = V4SImode; + break; + case E_V8SFmode: + cmode = V2SFmode; + hmode = V4SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + + case 16: + switch (mode) + { + case E_V16SImode: + cmode = V2SImode; + hmode = V4SImode; + gmode = V8SImode; + break; + case E_V16SFmode: + cmode = V2SFmode; + hmode = V4SFmode; + gmode = V8SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + +half: + /* FIXME: We process inputs backward to help RA. PR 36222. */ + i = n - 1; + j = (n >> 1) - 1; + for (; i > 0; i -= 2, j--) + { + first[j] = gen_reg_rtx (cmode); + v = gen_rtvec (2, ops[i - 1], ops[i]); + ix86_expand_vector_init (false, first[j], + gen_rtx_PARALLEL (cmode, v)); + } + + n >>= 1; + if (n > 4) + { + gcc_assert (hmode != VOIDmode); + gcc_assert (gmode != VOIDmode); + for (i = j = 0; i < n; i += 2, j++) + { + second[j] = gen_reg_rtx (hmode); + ix86_expand_vector_init_concat (hmode, second [j], + &first [i], 2); + } + n >>= 1; + for (i = j = 0; i < n; i += 2, j++) + { + third[j] = gen_reg_rtx (gmode); + ix86_expand_vector_init_concat (gmode, third[j], + &second[i], 2); + } + n >>= 1; + ix86_expand_vector_init_concat (mode, target, third, n); + } + else if (n > 2) + { + gcc_assert (hmode != VOIDmode); + for (i = j = 0; i < n; i += 2, j++) + { + second[j] = gen_reg_rtx (hmode); + ix86_expand_vector_init_concat (hmode, second [j], + &first [i], 2); + } + n >>= 1; + ix86_expand_vector_init_concat (mode, target, second, n); + } + else + ix86_expand_vector_init_concat (mode, target, first, n); + break; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vector_init_general. Use vector + interleave to handle the most general case: all values variable, + and none identical. 
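[Editorial note, not part of the upstream patch: for a V8HImode init from {a, b, c, d, e, f, g, h} the interleave strategy below proceeds, schematically, as

    step 1:  p0 = (a b x x x x x x) ... p3 = (g h x x x x x x)   // each pair in the low 32 bits
    step 2:  q0 = punpckldq (p0, p1) = (a b c d x x x x)
             q1 = punpckldq (p2, p3) = (e f g h x x x x)
    step 3:  target = punpcklqdq (q0, q1) = (a b c d e f g h)

i.e. two low-interleave V4SI operations followed by one low-interleave V2DI operation on SSE2.]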
*/ + +static void +ix86_expand_vector_init_interleave (machine_mode mode, + rtx target, rtx *ops, int n) +{ + machine_mode first_imode, second_imode, third_imode, inner_mode; + int i, j; + rtx op0, op1; + rtx (*gen_load_even) (rtx, rtx, rtx); + rtx (*gen_interleave_first_low) (rtx, rtx, rtx); + rtx (*gen_interleave_second_low) (rtx, rtx, rtx); + + switch (mode) + { + case E_V8HImode: + gen_load_even = gen_vec_setv8hi; + gen_interleave_first_low = gen_vec_interleave_lowv4si; + gen_interleave_second_low = gen_vec_interleave_lowv2di; + inner_mode = HImode; + first_imode = V4SImode; + second_imode = V2DImode; + third_imode = VOIDmode; + break; + case E_V16QImode: + gen_load_even = gen_vec_setv16qi; + gen_interleave_first_low = gen_vec_interleave_lowv8hi; + gen_interleave_second_low = gen_vec_interleave_lowv4si; + inner_mode = QImode; + first_imode = V8HImode; + second_imode = V4SImode; + third_imode = V2DImode; + break; + default: + gcc_unreachable (); + } + + for (i = 0; i < n; i++) + { + /* Extend the odd elment to SImode using a paradoxical SUBREG. */ + op0 = gen_reg_rtx (SImode); + emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); + + /* Insert the SImode value as low element of V4SImode vector. */ + op1 = gen_reg_rtx (V4SImode); + op0 = gen_rtx_VEC_MERGE (V4SImode, + gen_rtx_VEC_DUPLICATE (V4SImode, + op0), + CONST0_RTX (V4SImode), + const1_rtx); + emit_insn (gen_rtx_SET (op1, op0)); + + /* Cast the V4SImode vector back to a vector in orignal mode. */ + op0 = gen_reg_rtx (mode); + emit_move_insn (op0, gen_lowpart (mode, op1)); + + /* Load even elements into the second position. */ + emit_insn (gen_load_even (op0, + force_reg (inner_mode, + ops [i + i + 1]), + const1_rtx)); + + /* Cast vector to FIRST_IMODE vector. */ + ops[i] = gen_reg_rtx (first_imode); + emit_move_insn (ops[i], gen_lowpart (first_imode, op0)); + } + + /* Interleave low FIRST_IMODE vectors. */ + for (i = j = 0; i < n; i += 2, j++) + { + op0 = gen_reg_rtx (first_imode); + emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1])); + + /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */ + ops[j] = gen_reg_rtx (second_imode); + emit_move_insn (ops[j], gen_lowpart (second_imode, op0)); + } + + /* Interleave low SECOND_IMODE vectors. */ + switch (second_imode) + { + case E_V4SImode: + for (i = j = 0; i < n / 2; i += 2, j++) + { + op0 = gen_reg_rtx (second_imode); + emit_insn (gen_interleave_second_low (op0, ops[i], + ops[i + 1])); + + /* Cast the SECOND_IMODE vector to the THIRD_IMODE + vector. */ + ops[j] = gen_reg_rtx (third_imode); + emit_move_insn (ops[j], gen_lowpart (third_imode, op0)); + } + second_imode = V2DImode; + gen_interleave_second_low = gen_vec_interleave_lowv2di; + /* FALLTHRU */ + + case E_V2DImode: + op0 = gen_reg_rtx (second_imode); + emit_insn (gen_interleave_second_low (op0, ops[0], + ops[1])); + + /* Cast the SECOND_IMODE vector back to a vector on original + mode. */ + emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0))); + break; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vector_init. Handle the most general case: + all values variable, and none identical. 
*/ + +static void +ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, + rtx target, rtx vals) +{ + rtx ops[64], op0, op1, op2, op3, op4, op5; + machine_mode half_mode = VOIDmode; + machine_mode quarter_mode = VOIDmode; + int n, i; + + switch (mode) + { + case E_V2SFmode: + case E_V2SImode: + if (!mmx_ok && !TARGET_SSE) + break; + /* FALLTHRU */ + + case E_V16SImode: + case E_V16SFmode: + case E_V8DFmode: + case E_V8DImode: + case E_V8SFmode: + case E_V8SImode: + case E_V4DFmode: + case E_V4DImode: + case E_V4SFmode: + case E_V4SImode: + case E_V2DFmode: + case E_V2DImode: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + ix86_expand_vector_init_concat (mode, target, ops, n); + return; + + case E_V2TImode: + for (i = 0; i < 2; i++) + ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); + op0 = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); + emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); + return; + + case E_V4TImode: + for (i = 0; i < 4; i++) + ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); + ops[4] = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); + ops[5] = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); + op0 = gen_reg_rtx (V8DImode); + ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); + emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); + return; + + case E_V32QImode: + half_mode = V16QImode; + goto half; + + case E_V16HImode: + half_mode = V8HImode; + goto half; + +half: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + op0 = gen_reg_rtx (half_mode); + op1 = gen_reg_rtx (half_mode); + ix86_expand_vector_init_interleave (half_mode, op0, ops, + n >> 2); + ix86_expand_vector_init_interleave (half_mode, op1, + &ops [n >> 1], n >> 2); + emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1))); + return; + + case E_V64QImode: + quarter_mode = V16QImode; + half_mode = V32QImode; + goto quarter; + + case E_V32HImode: + quarter_mode = V8HImode; + half_mode = V16HImode; + goto quarter; + +quarter: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + op0 = gen_reg_rtx (quarter_mode); + op1 = gen_reg_rtx (quarter_mode); + op2 = gen_reg_rtx (quarter_mode); + op3 = gen_reg_rtx (quarter_mode); + op4 = gen_reg_rtx (half_mode); + op5 = gen_reg_rtx (half_mode); + ix86_expand_vector_init_interleave (quarter_mode, op0, ops, + n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op1, + &ops [n >> 2], n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op2, + &ops [n >> 1], n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op3, + &ops [(n >> 1) | (n >> 2)], n >> 3); + emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1))); + emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3))); + emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5))); + return; + + case E_V16QImode: + if (!TARGET_SSE4_1) + break; + /* FALLTHRU */ + + case E_V8HImode: + if (!TARGET_SSE2) + break; + + /* Don't use ix86_expand_vector_init_interleave if we can't + move from GPR to SSE register directly. 
*/ + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + break; + + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); + return; + + case E_V4HImode: + case E_V8QImode: + break; + + default: + gcc_unreachable (); + } + + { + int i, j, n_elts, n_words, n_elt_per_word; + machine_mode inner_mode; + rtx words[4], shift; + + inner_mode = GET_MODE_INNER (mode); + n_elts = GET_MODE_NUNITS (mode); + n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; + n_elt_per_word = n_elts / n_words; + shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); + + for (i = 0; i < n_words; ++i) + { + rtx word = NULL_RTX; + + for (j = 0; j < n_elt_per_word; ++j) + { + rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); + elt = convert_modes (word_mode, inner_mode, elt, true); + + if (j == 0) + word = elt; + else + { + word = expand_simple_binop (word_mode, ASHIFT, word, shift, + word, 1, OPTAB_LIB_WIDEN); + word = expand_simple_binop (word_mode, IOR, word, elt, + word, 1, OPTAB_LIB_WIDEN); + } + } + + words[i] = word; + } + + if (n_words == 1) + emit_move_insn (target, gen_lowpart (mode, words[0])); + else if (n_words == 2) + { + rtx tmp = gen_reg_rtx (mode); + emit_clobber (tmp); + emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); + emit_move_insn (gen_highpart (word_mode, tmp), words[1]); + emit_move_insn (target, tmp); + } + else if (n_words == 4) + { + rtx tmp = gen_reg_rtx (V4SImode); + gcc_assert (word_mode == SImode); + vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); + ix86_expand_vector_init_general (false, V4SImode, tmp, vals); + emit_move_insn (target, gen_lowpart (mode, tmp)); + } + else + gcc_unreachable (); + } +} + +/* Initialize vector TARGET via VALS. Suppress the use of MMX + instructions unless MMX_OK is true. */ + +void +ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) +{ + machine_mode mode = GET_MODE (target); + machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + int n_var = 0, one_var = -1; + bool all_same = true, all_const_zero = true; + int i; + rtx x; + + /* Handle first initialization from vector elts. */ + if (n_elts != XVECLEN (vals, 0)) + { + rtx subtarget = target; + x = XVECEXP (vals, 0, 0); + gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode); + if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) + { + rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; + if (inner_mode == QImode || inner_mode == HImode) + { + unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); + mode = mode_for_vector (SImode, n_bits / 4).require (); + inner_mode = mode_for_vector (SImode, n_bits / 8).require (); + ops[0] = gen_lowpart (inner_mode, ops[0]); + ops[1] = gen_lowpart (inner_mode, ops[1]); + subtarget = gen_reg_rtx (mode); + } + ix86_expand_vector_init_concat (mode, subtarget, ops, 2); + if (subtarget != target) + emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget)); + return; + } + gcc_unreachable (); + } + + for (i = 0; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (!(CONST_SCALAR_INT_P (x) + || CONST_DOUBLE_P (x) + || CONST_FIXED_P (x))) + n_var++, one_var = i; + else if (x != CONST0_RTX (inner_mode)) + all_const_zero = false; + if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) + all_same = false; + } + + /* Constants are best loaded from the constant pool. 
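[Editorial note, not part of the upstream patch: with no variable elements the whole initializer is emitted as a single CONST_VECTOR move, which later becomes one full-width load from the constant pool. For example, assuming <immintrin.h> and optimization enabled,

    __m128i v = _mm_set_epi32 (4, 3, 2, 1);

typically compiles to a single movdqa/vmovdqa of a 16-byte .rodata entry rather than four scalar inserts.]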
*/ + if (n_var == 0) + { + emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0))); + return; + } + + /* If all values are identical, broadcast the value. */ + if (all_same + && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, + XVECEXP (vals, 0, 0))) + return; + + /* Values where only one field is non-constant are best loaded from + the pool and overwritten via move later. */ + if (n_var == 1) + { + if (all_const_zero + && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target, + XVECEXP (vals, 0, one_var), + one_var)) + return; + + if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var)) + return; + } + + ix86_expand_vector_init_general (mmx_ok, mode, target, vals); +} + +void +ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) +{ + machine_mode mode = GET_MODE (target); + machine_mode inner_mode = GET_MODE_INNER (mode); + machine_mode half_mode; + bool use_vec_merge = false; + rtx tmp; + static rtx (*gen_extract[6][2]) (rtx, rtx) + = { + { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, + { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, + { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, + { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, + { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, + { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } + }; + static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) + = { + { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, + { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, + { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, + { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, + { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, + { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } + }; + int i, j, n; + machine_mode mmode = VOIDmode; + rtx (*gen_blendm) (rtx, rtx, rtx, rtx); + + switch (mode) + { + case E_V2SFmode: + case E_V2SImode: + if (mmx_ok) + { + tmp = gen_reg_rtx (GET_MODE_INNER (mode)); + ix86_expand_vector_extract (true, tmp, target, 1 - elt); + if (elt == 0) + tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); + else + tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); + emit_insn (gen_rtx_SET (target, tmp)); + return; + } + break; + + case E_V2DImode: + use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT; + if (use_vec_merge) + break; + + tmp = gen_reg_rtx (GET_MODE_INNER (mode)); + ix86_expand_vector_extract (false, tmp, target, 1 - elt); + if (elt == 0) + tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); + else + tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); + emit_insn (gen_rtx_SET (target, tmp)); + return; + + case E_V2DFmode: + { + rtx op0, op1; + + /* For the two element vectors, we implement a VEC_CONCAT with + the extraction of the other element. 
*/ + + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt))); + tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp); + + if (elt == 0) + op0 = val, op1 = tmp; + else + op0 = tmp, op1 = val; + + tmp = gen_rtx_VEC_CONCAT (mode, op0, op1); + emit_insn (gen_rtx_SET (target, tmp)); + } + return; + + case E_V4SFmode: + use_vec_merge = TARGET_SSE4_1; + if (use_vec_merge) + break; + + switch (elt) + { + case 0: + use_vec_merge = true; + break; + + case 1: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* target = A A B B */ + emit_insn (gen_vec_interleave_lowv4sf (target, target, target)); + /* target = X A B B */ + ix86_expand_vector_set (false, target, val, 0); + /* target = A X C D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const1_rtx, const0_rtx, + GEN_INT (2+4), GEN_INT (3+4))); + return; + + case 2: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* tmp = X B C D */ + ix86_expand_vector_set (false, tmp, val, 0); + /* target = A B X D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const0_rtx, const1_rtx, + GEN_INT (0+4), GEN_INT (3+4))); + return; + + case 3: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* tmp = X B C D */ + ix86_expand_vector_set (false, tmp, val, 0); + /* target = A B X D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const0_rtx, const1_rtx, + GEN_INT (2+4), GEN_INT (0+4))); + return; + + default: + gcc_unreachable (); + } + break; + + case E_V4SImode: + use_vec_merge = TARGET_SSE4_1; + if (use_vec_merge) + break; + + /* Element 0 handled by vec_merge below. */ + if (elt == 0) + { + use_vec_merge = true; + break; + } + + if (TARGET_SSE2) + { + /* With SSE2, use integer shuffles to swap element 0 and ELT, + store into element 0, then shuffle them back. */ + + rtx order[4]; + + order[0] = GEN_INT (elt); + order[1] = const1_rtx; + order[2] = const2_rtx; + order[3] = GEN_INT (3); + order[elt] = const0_rtx; + + emit_insn (gen_sse2_pshufd_1 (target, target, order[0], + order[1], order[2], order[3])); + + ix86_expand_vector_set (false, target, val, 0); + + emit_insn (gen_sse2_pshufd_1 (target, target, order[0], + order[1], order[2], order[3])); + } + else + { + /* For SSE1, we have to reuse the V4SF code. */ + rtx t = gen_reg_rtx (V4SFmode); + emit_move_insn (t, gen_lowpart (V4SFmode, target)); + ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt); + emit_move_insn (target, gen_lowpart (mode, t)); + } + return; + + case E_V8HImode: + use_vec_merge = TARGET_SSE2; + break; + case E_V4HImode: + use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); + break; + + case E_V16QImode: + use_vec_merge = TARGET_SSE4_1; + break; + + case E_V8QImode: + break; + + case E_V32QImode: + half_mode = V16QImode; + j = 0; + n = 16; + goto half; + + case E_V16HImode: + half_mode = V8HImode; + j = 1; + n = 8; + goto half; + + case E_V8SImode: + half_mode = V4SImode; + j = 2; + n = 4; + goto half; + + case E_V4DImode: + half_mode = V2DImode; + j = 3; + n = 2; + goto half; + + case E_V8SFmode: + half_mode = V4SFmode; + j = 4; + n = 4; + goto half; + + case E_V4DFmode: + half_mode = V2DFmode; + j = 5; + n = 2; + goto half; + +half: + /* Compute offset. */ + i = elt / n; + elt %= n; + + gcc_assert (i <= 1); + + /* Extract the half. */ + tmp = gen_reg_rtx (half_mode); + emit_insn (gen_extract[j][i] (tmp, target)); + + /* Put val in tmp at elt. */ + ix86_expand_vector_set (false, tmp, val, elt); + + /* Put it back. 
*/ + emit_insn (gen_insert[j][i] (target, target, tmp)); + return; + + case E_V8DFmode: + if (TARGET_AVX512F) + { + mmode = QImode; + gen_blendm = gen_avx512f_blendmv8df; + } + break; + + case E_V8DImode: + if (TARGET_AVX512F) + { + mmode = QImode; + gen_blendm = gen_avx512f_blendmv8di; + } + break; + + case E_V16SFmode: + if (TARGET_AVX512F) + { + mmode = HImode; + gen_blendm = gen_avx512f_blendmv16sf; + } + break; + + case E_V16SImode: + if (TARGET_AVX512F) + { + mmode = HImode; + gen_blendm = gen_avx512f_blendmv16si; + } + break; + + case E_V32HImode: + if (TARGET_AVX512BW) + { + mmode = SImode; + gen_blendm = gen_avx512bw_blendmv32hi; + } + else if (TARGET_AVX512F) + { + half_mode = E_V8HImode; + n = 8; + goto quarter; + } + break; + + case E_V64QImode: + if (TARGET_AVX512BW) + { + mmode = DImode; + gen_blendm = gen_avx512bw_blendmv64qi; + } + else if (TARGET_AVX512F) + { + half_mode = E_V16QImode; + n = 16; + goto quarter; + } + break; + +quarter: + /* Compute offset. */ + i = elt / n; + elt %= n; + + gcc_assert (i <= 3); + + { + /* Extract the quarter. */ + tmp = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_lowpart (V16SImode, target); + rtx mask = gen_reg_rtx (QImode); + + emit_move_insn (mask, constm1_rtx); + emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i), + tmp, mask)); + + tmp2 = gen_reg_rtx (half_mode); + emit_move_insn (tmp2, gen_lowpart (half_mode, tmp)); + tmp = tmp2; + + /* Put val in tmp at elt. */ + ix86_expand_vector_set (false, tmp, val, elt); + + /* Put it back. */ + tmp2 = gen_reg_rtx (V16SImode); + rtx tmp3 = gen_lowpart (V16SImode, target); + mask = gen_reg_rtx (HImode); + emit_move_insn (mask, constm1_rtx); + tmp = gen_lowpart (V4SImode, tmp); + emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i), + tmp3, mask)); + emit_move_insn (target, gen_lowpart (mode, tmp2)); + } + return; + + default: + break; + } + + if (mmode != VOIDmode) + { + tmp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); + /* The avx512*_blendm expanders have different operand order + from VEC_MERGE. In VEC_MERGE, the first input operand is used for + elements where the mask is set and second input operand otherwise, + in {sse,avx}*_*blend* the first input operand is used for elements + where the mask is clear and second input operand otherwise. 
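One detail worth spelling out for the blendm path here: the mask operand is just the single-bit constant HOST_WIDE_INT_1U << elt forced into the mask register, so with an invented example of setting element 5 of a V16SFmode vector the HImode mask is 0x0020 and only lane 5 takes the value broadcast into the temporary, while every other lane keeps the original contents of target.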
*/ + emit_insn (gen_blendm (target, target, tmp, + force_reg (mmode, + gen_int_mode (HOST_WIDE_INT_1U << elt, + mmode)))); + } + else if (use_vec_merge) + { + tmp = gen_rtx_VEC_DUPLICATE (mode, val); + tmp = gen_rtx_VEC_MERGE (mode, tmp, target, + GEN_INT (HOST_WIDE_INT_1U << elt)); + emit_insn (gen_rtx_SET (target, tmp)); + } + else + { + rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); + + emit_move_insn (mem, target); + + tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode)); + emit_move_insn (tmp, val); + + emit_move_insn (target, mem); + } +} + +void +ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) +{ + machine_mode mode = GET_MODE (vec); + machine_mode inner_mode = GET_MODE_INNER (mode); + bool use_vec_extr = false; + rtx tmp; + + switch (mode) + { + case E_V2SImode: + case E_V2SFmode: + if (!mmx_ok) + break; + /* FALLTHRU */ + + case E_V2DFmode: + case E_V2DImode: + case E_V2TImode: + case E_V4TImode: + use_vec_extr = true; + break; + + case E_V4SFmode: + use_vec_extr = TARGET_SSE4_1; + if (use_vec_extr) + break; + + switch (elt) + { + case 0: + tmp = vec; + break; + + case 1: + case 3: + tmp = gen_reg_rtx (mode); + emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec, + GEN_INT (elt), GEN_INT (elt), + GEN_INT (elt+4), GEN_INT (elt+4))); + break; + + case 2: + tmp = gen_reg_rtx (mode); + emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec)); + break; + + default: + gcc_unreachable (); + } + vec = tmp; + use_vec_extr = true; + elt = 0; + break; + + case E_V4SImode: + use_vec_extr = TARGET_SSE4_1; + if (use_vec_extr) + break; + + if (TARGET_SSE2) + { + switch (elt) + { + case 0: + tmp = vec; + break; + + case 1: + case 3: + tmp = gen_reg_rtx (mode); + emit_insn (gen_sse2_pshufd_1 (tmp, vec, + GEN_INT (elt), GEN_INT (elt), + GEN_INT (elt), GEN_INT (elt))); + break; + + case 2: + tmp = gen_reg_rtx (mode); + emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec)); + break; + + default: + gcc_unreachable (); + } + vec = tmp; + use_vec_extr = true; + elt = 0; + } + else + { + /* For SSE1, we have to reuse the V4SF code. 
*/ + ix86_expand_vector_extract (false, gen_lowpart (SFmode, target), + gen_lowpart (V4SFmode, vec), elt); + return; + } + break; + + case E_V8HImode: + use_vec_extr = TARGET_SSE2; + break; + case E_V4HImode: + use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); + break; + + case E_V16QImode: + use_vec_extr = TARGET_SSE4_1; + break; + + case E_V8SFmode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V4SFmode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8sf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8sf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + } + break; + + case E_V4DFmode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V2DFmode); + if (elt < 2) + emit_insn (gen_vec_extract_lo_v4df (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v4df (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 1); + return; + } + break; + + case E_V32QImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V16QImode); + if (elt < 16) + emit_insn (gen_vec_extract_lo_v32qi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v32qi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 15); + return; + } + break; + + case E_V16HImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V8HImode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16hi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16hi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + } + break; + + case E_V8SImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V4SImode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8si (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8si (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + } + break; + + case E_V4DImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V2DImode); + if (elt < 2) + emit_insn (gen_vec_extract_lo_v4di (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v4di (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 1); + return; + } + break; + + case E_V32HImode: + if (TARGET_AVX512BW) + { + tmp = gen_reg_rtx (V16HImode); + if (elt < 16) + emit_insn (gen_vec_extract_lo_v32hi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v32hi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 15); + return; + } + break; + + case E_V64QImode: + if (TARGET_AVX512BW) + { + tmp = gen_reg_rtx (V32QImode); + if (elt < 32) + emit_insn (gen_vec_extract_lo_v64qi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v64qi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 31); + return; + } + break; + + case E_V16SFmode: + tmp = gen_reg_rtx (V8SFmode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16sf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + + case E_V8DFmode: + tmp = gen_reg_rtx (V4DFmode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8df (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + + case E_V16SImode: + tmp = gen_reg_rtx (V8SImode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16si (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + + case E_V8DImode: + tmp = gen_reg_rtx (V4DImode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8di (tmp, vec)); + 
ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + + case E_V8QImode: + /* ??? Could extract the appropriate HImode element and shift. */ + default: + break; + } + + if (use_vec_extr) + { + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt))); + tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp); + + /* Let the rtl optimizers know about the zero extension performed. */ + if (inner_mode == QImode || inner_mode == HImode) + { + tmp = gen_rtx_ZERO_EXTEND (SImode, tmp); + target = gen_lowpart (SImode, target); + } + + emit_insn (gen_rtx_SET (target, tmp)); + } + else + { + rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); + + emit_move_insn (mem, vec); + + tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); + emit_move_insn (target, tmp); + } +} + +/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC + to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode. + The upper bits of DEST are undefined, though they shouldn't cause + exceptions (some bits from src or all zeros are ok). */ + +static void +emit_reduc_half (rtx dest, rtx src, int i) +{ + rtx tem, d = dest; + switch (GET_MODE (src)) + { + case E_V4SFmode: + if (i == 128) + tem = gen_sse_movhlps (dest, src, src); + else + tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx, + GEN_INT (1 + 4), GEN_INT (1 + 4)); + break; + case E_V2DFmode: + tem = gen_vec_interleave_highv2df (dest, src, src); + break; + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + d = gen_reg_rtx (V1TImode); + tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), + GEN_INT (i / 2)); + break; + case E_V8SFmode: + if (i == 256) + tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx); + else + tem = gen_avx_shufps256 (dest, src, src, + GEN_INT (i == 128 ? 2 + (3 << 2) : 1)); + break; + case E_V4DFmode: + if (i == 256) + tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx); + else + tem = gen_avx_shufpd256 (dest, src, src, const1_rtx); + break; + case E_V32QImode: + case E_V16HImode: + case E_V8SImode: + case E_V4DImode: + if (i == 256) + { + if (GET_MODE (dest) != V4DImode) + d = gen_reg_rtx (V4DImode); + tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src), + gen_lowpart (V4DImode, src), + const1_rtx); + } + else + { + d = gen_reg_rtx (V2TImode); + tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src), + GEN_INT (i / 2)); + } + break; + case E_V64QImode: + case E_V32HImode: + case E_V16SImode: + case E_V16SFmode: + case E_V8DImode: + case E_V8DFmode: + if (i > 128) + tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), + gen_lowpart (V16SImode, src), + gen_lowpart (V16SImode, src), + GEN_INT (0x4 + (i == 512 ? 4 : 0)), + GEN_INT (0x5 + (i == 512 ? 4 : 0)), + GEN_INT (0x6 + (i == 512 ? 4 : 0)), + GEN_INT (0x7 + (i == 512 ? 4 : 0)), + GEN_INT (0xC), GEN_INT (0xD), + GEN_INT (0xE), GEN_INT (0xF), + GEN_INT (0x10), GEN_INT (0x11), + GEN_INT (0x12), GEN_INT (0x13), + GEN_INT (0x14), GEN_INT (0x15), + GEN_INT (0x16), GEN_INT (0x17)); + else + tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), + gen_lowpart (V16SImode, src), + GEN_INT (i == 128 ? 0x2 : 0x1), + GEN_INT (0x3), + GEN_INT (0x3), + GEN_INT (0x3), + GEN_INT (i == 128 ? 0x6 : 0x5), + GEN_INT (0x7), + GEN_INT (0x7), + GEN_INT (0x7), + GEN_INT (i == 128 ? 0xA : 0x9), + GEN_INT (0xB), + GEN_INT (0xB), + GEN_INT (0xB), + GEN_INT (i == 128 ? 
0xE : 0xD), + GEN_INT (0xF), + GEN_INT (0xF), + GEN_INT (0xF)); + break; + default: + gcc_unreachable (); + } + emit_insn (tem); + if (d != dest) + emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); +} + +/* Expand a vector reduction. FN is the binary pattern to reduce; + DEST is the destination; IN is the input vector. */ + +void +ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) +{ + rtx half, dst, vec = in; + machine_mode mode = GET_MODE (in); + int i; + + /* SSE4 has a special instruction for V8HImode UMIN reduction. */ + if (TARGET_SSE4_1 + && mode == V8HImode + && fn == gen_uminv8hi3) + { + emit_insn (gen_sse4_1_phminposuw (dest, in)); + return; + } + + for (i = GET_MODE_BITSIZE (mode); + i > GET_MODE_UNIT_BITSIZE (mode); + i >>= 1) + { + half = gen_reg_rtx (mode); + emit_reduc_half (half, vec, i); + if (i == GET_MODE_UNIT_BITSIZE (mode) * 2) + dst = dest; + else + dst = gen_reg_rtx (mode); + emit_insn (fn (dst, half, vec)); + vec = dst; + } +} + +/* Output code to perform a conditional jump to LABEL, if C2 flag in + FP status register is set. */ + +void +ix86_emit_fp_unordered_jump (rtx label) +{ + rtx reg = gen_reg_rtx (HImode); + rtx_insn *insn; + rtx temp; + + emit_insn (gen_x86_fnstsw_1 (reg)); + + if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) + { + emit_insn (gen_x86_sahf_1 (reg)); + + temp = gen_rtx_REG (CCmode, FLAGS_REG); + temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx); + } + else + { + emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04))); + + temp = gen_rtx_REG (CCNOmode, FLAGS_REG); + temp = gen_rtx_NE (VOIDmode, temp, const0_rtx); + } + + temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); + predict_jump (REG_BR_PROB_BASE * 10 / 100); + JUMP_LABEL (insn) = label; +} + +/* Output code to perform an sinh XFmode calculation. */ + +void ix86_emit_i387_sinh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx half = const_double_from_real_value (dconsthalf, XFmode); + rtx cst1, tmp; + rtx_code_label *jump_label = gen_label_rtx (); + rtx_insn *insn; + + /* scratch = fxam (op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e1 = expm1 (|op1|) */ + emit_insn (gen_absxf2 (e2, op1)); + emit_insn (gen_expm1xf2 (e1, e2)); + + /* e2 = e1 / (e1 + 1.0) + e1 */ + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_addxf3 (e2, e1, cst1)); + emit_insn (gen_divxf3 (e2, e1, e2)); + emit_insn (gen_addxf3 (e2, e2, e1)); + + /* flags = signbit (op1) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (flags) then e2 = -e2 */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_EQ (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (gen_negxf2 (e2, e2)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + /* op0 = 0.5 * e2 */ + half = force_reg (XFmode, half); + emit_insn (gen_mulxf3 (op0, e2, half)); +} + +/* Output code to perform an cosh XFmode calculation. 
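The sinh expansion just above relies on the identity sinh(x) = sgn(x) * 0.5 * (u / (u + 1) + u) with u = expm1(|x|), which avoids the cancellation in (exp(x) - exp(-x)) / 2 for small |x|. A small host-side check of that identity (ordinary C using libm, purely illustrative and not part of the backported patch):

#include <math.h>
#include <stdio.h>

/* Check sinh(x) == sgn(x) * 0.5 * (u / (u + 1) + u), u = expm1(|x|),
   which is the formula the i387 sequence above emits.  */
int
main (void)
{
  for (double x = -5.0; x <= 5.0; x += 0.25)
    {
      double u = expm1 (fabs (x));
      double f = 0.5 * (u / (u + 1.0) + u);
      if (signbit (x))
        f = -f;
      printf ("x=% .2f  sinh=% .17g  formula=% .17g\n", x, sinh (x), f);
    }
  return 0;
}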
*/ + +void ix86_emit_i387_cosh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx half = const_double_from_real_value (dconsthalf, XFmode); + rtx cst1; + + /* e1 = exp (op1) */ + emit_insn (gen_expxf2 (e1, op1)); + + /* e2 = e1 + 1.0 / e1 */ + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_divxf3 (e2, cst1, e1)); + emit_insn (gen_addxf3 (e2, e1, e2)); + + /* op0 = 0.5 * e2 */ + half = force_reg (XFmode, half); + emit_insn (gen_mulxf3 (op0, e2, half)); +} + +/* Output code to perform an tanh XFmode calculation. */ + +void ix86_emit_i387_tanh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx cst2, tmp; + rtx_code_label *jump_label = gen_label_rtx (); + rtx_insn *insn; + + /* scratch = fxam (op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e1 = expm1 (-|2 * op1|) */ + emit_insn (gen_addxf3 (e2, op1, op1)); + emit_insn (gen_absxf2 (e2, e2)); + emit_insn (gen_negxf2 (e2, e2)); + emit_insn (gen_expm1xf2 (e1, e2)); + + /* e2 = e1 / (e1 + 2.0) */ + cst2 = force_reg (XFmode, CONST2_RTX (XFmode)); + emit_insn (gen_addxf3 (e2, e1, cst2)); + emit_insn (gen_divxf3 (e2, e1, e2)); + + /* flags = signbit (op1) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (!flags) then e2 = -e2 */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_NE (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (gen_negxf2 (e2, e2)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + emit_move_insn (op0, e2); +} + +/* Output code to perform an asinh XFmode calculation. */ + +void ix86_emit_i387_asinh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx cst1, tmp; + rtx_code_label *jump_label = gen_label_rtx (); + rtx_insn *insn; + + /* e2 = sqrt (op1^2 + 1.0) + 1.0 */ + emit_insn (gen_mulxf3 (e1, op1, op1)); + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_addxf3 (e2, e1, cst1)); + emit_insn (gen_sqrtxf2 (e2, e2)); + emit_insn (gen_addxf3 (e2, e2, cst1)); + + /* e1 = e1 / e2 */ + emit_insn (gen_divxf3 (e1, e1, e2)); + + /* scratch = fxam (op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e1 = e1 + |op1| */ + emit_insn (gen_absxf2 (e2, op1)); + emit_insn (gen_addxf3 (e1, e1, e2)); + + /* e2 = log1p (e1) */ + ix86_emit_i387_log1p (e2, e1); + + /* flags = signbit (op1) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (flags) then e2 = -e2 */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_EQ (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (gen_negxf2 (e2, e2)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + emit_move_insn (op0, e2); +} + +/* Output code to perform an acosh XFmode calculation. 
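The tanh sequence above uses the same expm1 trick: with v = expm1(-2|x|), v / (v + 2) = (exp(-2|x|) - 1) / (exp(-2|x|) + 1) = -tanh(|x|), so the quotient only needs to be negated when the sign bit of the input is clear, which is exactly what the conditional jump around the negxf2 does.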
*/ + +void ix86_emit_i387_acosh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + + /* e2 = sqrt (op1 + 1.0) */ + emit_insn (gen_addxf3 (e2, op1, cst1)); + emit_insn (gen_sqrtxf2 (e2, e2)); + + /* e1 = sqrt (op1 - 1.0) */ + emit_insn (gen_subxf3 (e1, op1, cst1)); + emit_insn (gen_sqrtxf2 (e1, e1)); + + /* e1 = e1 * e2 */ + emit_insn (gen_mulxf3 (e1, e1, e2)); + + /* e1 = e1 + op1 */ + emit_insn (gen_addxf3 (e1, e1, op1)); + + /* op0 = log (e1) */ + emit_insn (gen_logxf2 (op0, e1)); +} + +/* Output code to perform an atanh XFmode calculation. */ + +void ix86_emit_i387_atanh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx half = const_double_from_real_value (dconsthalf, XFmode); + rtx cst1, tmp; + rtx_code_label *jump_label = gen_label_rtx (); + rtx_insn *insn; + + /* scratch = fxam (op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e2 = |op1| */ + emit_insn (gen_absxf2 (e2, op1)); + + /* e1 = -(e2 + e2) / (e2 + 1.0) */ + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_addxf3 (e1, e2, cst1)); + emit_insn (gen_addxf3 (e2, e2, e2)); + emit_insn (gen_negxf2 (e2, e2)); + emit_insn (gen_divxf3 (e1, e2, e1)); + + /* e2 = log1p (e1) */ + ix86_emit_i387_log1p (e2, e1); + + /* flags = signbit (op1) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (!flags) then e2 = -e2 */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_NE (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (gen_negxf2 (e2, e2)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + /* op0 = 0.5 * e2 */ + half = force_reg (XFmode, half); + emit_insn (gen_mulxf3 (op0, e2, half)); +} + +/* Output code to perform a log1p XFmode calculation. */ + +void ix86_emit_i387_log1p (rtx op0, rtx op1) +{ + rtx_code_label *label1 = gen_label_rtx (); + rtx_code_label *label2 = gen_label_rtx (); + + rtx tmp = gen_reg_rtx (XFmode); + rtx res = gen_reg_rtx (XFmode); + rtx cst, cstln2, cst1; + rtx_insn *insn; + + cst = const_double_from_real_value + (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); + cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ + + emit_insn (gen_absxf2 (tmp, op1)); + + cst = force_reg (XFmode, cst); + ix86_expand_branch (GE, tmp, cst, label1); + predict_jump (REG_BR_PROB_BASE * 10 / 100); + insn = get_last_insn (); + JUMP_LABEL (insn) = label1; + + emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2)); + emit_jump (label2); + + emit_label (label1); + LABEL_NUSES (label1) = 1; + + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1))); + emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2)); + + emit_label (label2); + LABEL_NUSES (label2) = 1; + + emit_move_insn (op0, res); +} + +/* Emit code for round calculation. 
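For the log1p helper above, the constant 0.29289321881345247561810596348408353 is 1 - sqrt(2)/2: the x87 fyl2xp1 instruction is only specified for arguments of magnitude below roughly that bound, so ix86_emit_i387_log1p uses fyl2xp1 (scaled by ln 2, loaded via fldln2) inside that range and falls back to fyl2x on 1 + x outside it. The asinh and atanh expansions above both funnel through this helper.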
*/ +void ix86_emit_i387_round (rtx op0, rtx op1) +{ + machine_mode inmode = GET_MODE (op1); + machine_mode outmode = GET_MODE (op0); + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx half = const_double_from_real_value (dconsthalf, XFmode); + rtx res = gen_reg_rtx (outmode); + rtx_code_label *jump_label = gen_label_rtx (); + rtx (*floor_insn) (rtx, rtx); + rtx (*neg_insn) (rtx, rtx); + rtx_insn *insn; + rtx tmp; + + switch (inmode) + { + case E_SFmode: + case E_DFmode: + tmp = gen_reg_rtx (XFmode); + + emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1))); + op1 = tmp; + break; + case E_XFmode: + break; + default: + gcc_unreachable (); + } + + switch (outmode) + { + case E_SFmode: + floor_insn = gen_frndintxf2_floor; + neg_insn = gen_negsf2; + break; + case E_DFmode: + floor_insn = gen_frndintxf2_floor; + neg_insn = gen_negdf2; + break; + case E_XFmode: + floor_insn = gen_frndintxf2_floor; + neg_insn = gen_negxf2; + break; + case E_HImode: + floor_insn = gen_lfloorxfhi2; + neg_insn = gen_neghi2; + break; + case E_SImode: + floor_insn = gen_lfloorxfsi2; + neg_insn = gen_negsi2; + break; + case E_DImode: + floor_insn = gen_lfloorxfdi2; + neg_insn = gen_negdi2; + break; + default: + gcc_unreachable (); + } + + /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ + + /* scratch = fxam(op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e1 = fabs(op1) */ + emit_insn (gen_absxf2 (e1, op1)); + + /* e2 = e1 + 0.5 */ + half = force_reg (XFmode, half); + emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half))); + + /* res = floor(e2) */ + switch (outmode) + { + case E_SFmode: + case E_DFmode: + { + tmp = gen_reg_rtx (XFmode); + + emit_insn (floor_insn (tmp, e2)); + emit_insn (gen_rtx_SET (res, + gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp), + UNSPEC_TRUNC_NOOP))); + } + break; + default: + emit_insn (floor_insn (res, e2)); + } + + /* flags = signbit(a) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (flags) then res = -res */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_EQ (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (neg_insn (res, res)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + emit_move_insn (op0, res); +} + +/* Output code to perform a Newton-Rhapson approximation of a single precision + floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. 
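The ix86_emit_i387_round sequence above implements round(a) = sgn(a) * floor(|a| + 0.5), i.e. halfway cases are rounded away from zero. A quick host-side check of the formula itself, not of the emitted x87 code (plain C, illustrative only):

#include <math.h>
#include <stdio.h>

/* Compare round(a) against sgn(a) * floor(|a| + 0.5).  */
int
main (void)
{
  const double tests[] = { 0.0, 0.4, 0.5, 1.5, 2.5, -0.5, -1.5, -2.5, -3.7 };
  for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; ++i)
    {
      double a = tests[i];
      double f = floor (fabs (a) + 0.5);
      if (signbit (a))
        f = -f;
      printf ("a=% .1f  round=% .1f  formula=% .1f\n", a, round (a), f);
    }
  return 0;
}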
*/ + +void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) +{ + rtx x0, x1, e0, e1; + + x0 = gen_reg_rtx (mode); + e0 = gen_reg_rtx (mode); + e1 = gen_reg_rtx (mode); + x1 = gen_reg_rtx (mode); + + /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ + + b = force_reg (mode, b); + + /* x0 = rcp(b) estimate */ + if (mode == V16SFmode || mode == V8DFmode) + { + if (TARGET_AVX512ER) + { + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP28))); + /* res = a * x0 */ + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); + return; + } + else + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP14))); + } + else + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP))); + + /* e0 = x0 * b */ + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); + + /* e0 = x0 * e0 */ + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); + + /* e1 = x0 + x0 */ + emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); + + /* x1 = e1 - e0 */ + emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); + + /* res = a * x1 */ + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); +} + +/* Output code to perform a Newton-Rhapson approximation of a + single precision floating point [reciprocal] square root. */ + +void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) +{ + rtx x0, e0, e1, e2, e3, mthree, mhalf; + REAL_VALUE_TYPE r; + int unspec; + + x0 = gen_reg_rtx (mode); + e0 = gen_reg_rtx (mode); + e1 = gen_reg_rtx (mode); + e2 = gen_reg_rtx (mode); + e3 = gen_reg_rtx (mode); + + if (TARGET_AVX512ER && mode == V16SFmode) + { + if (recip) + /* res = rsqrt28(a) estimate */ + emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + UNSPEC_RSQRT28))); + else + { + /* x0 = rsqrt28(a) estimate */ + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + UNSPEC_RSQRT28))); + /* res = rcp28(x0) estimate */ + emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), + UNSPEC_RCP28))); + } + return; + } + + real_from_integer (&r, VOIDmode, -3, SIGNED); + mthree = const_double_from_real_value (r, SFmode); + + real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); + mhalf = const_double_from_real_value (r, SFmode); + unspec = UNSPEC_RSQRT; + + if (VECTOR_MODE_P (mode)) + { + mthree = ix86_build_const_vector (mode, true, mthree); + mhalf = ix86_build_const_vector (mode, true, mhalf); + /* There is no 512-bit rsqrt. There is however rsqrt14. */ + if (GET_MODE_SIZE (mode) == 64) + unspec = UNSPEC_RSQRT14; + } + + /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) + rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ + + a = force_reg (mode, a); + + /* x0 = rsqrt(a) estimate */ + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + unspec))); + + /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ + if (!recip) + { + rtx zero = force_reg (mode, CONST0_RTX(mode)); + rtx mask; + + /* Handle masked compare. */ + if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) + { + mask = gen_reg_rtx (HImode); + /* Imm value 0x4 corresponds to not-equal comparison. 
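The division sequence in ix86_emit_swdivsf above is one Newton-Raphson refinement of the hardware reciprocal estimate, emitted as x1 = (x0 + x0) - (b * x0 * x0), which is algebraically x0 * (2 - b * x0) and roughly squares the relative error of the estimate. A scalar sketch with an invented two-digit starting estimate standing in for rcpps (illustrative only):

#include <stdio.h>

/* One Newton-Raphson step for a reciprocal estimate, in the exact shape
   emitted above: x1 = (x0 + x0) - (b * x0 * x0).  */
int
main (void)
{
  float b = 3.0f;
  float x0 = 0.33f;            /* crude stand-in for the rcp(b) estimate */
  float e0 = x0 * b;
  e0 = x0 * e0;                /* b * x0 * x0 */
  float e1 = x0 + x0;          /* 2 * x0 */
  float x1 = e1 - e0;          /* refined 1/b */
  float a = 7.0f;
  printf ("a/b = %.9g, a * x1 = %.9g\n", a / b, a * x1);
  return 0;
}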
*/ + emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4))); + emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask)); + } + else + { + mask = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a))); + emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask))); + } + } + + /* e0 = x0 * a */ + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); + /* e1 = e0 * x0 */ + emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0))); + + /* e2 = e1 - 3. */ + mthree = force_reg (mode, mthree); + emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree))); + + mhalf = force_reg (mode, mhalf); + if (recip) + /* e3 = -.5 * x0 */ + emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf))); + else + /* e3 = -.5 * e0 */ + emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf))); + /* ret = e2 * e3 */ + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3))); +} + +/* Expand fabs (OP0) and return a new rtx that holds the result. The + mask for masking out the sign-bit is stored in *SMASK, if that is + non-null. */ + +static rtx +ix86_expand_sse_fabs (rtx op0, rtx *smask) +{ + machine_mode vmode, mode = GET_MODE (op0); + rtx xa, mask; + + xa = gen_reg_rtx (mode); + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true); + if (!VECTOR_MODE_P (mode)) + { + /* We need to generate a scalar mode mask in this case. */ + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); + mask = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (mask, tmp)); + } + emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask))); + + if (smask) + *smask = mask; + + return xa; +} + +/* Expands a comparison of OP0 with OP1 using comparison code CODE, + swapping the operands if SWAP_OPERANDS is true. The expanded + code is a forward jump to a newly created label in case the + comparison is true. The generated label rtx is returned. */ +static rtx_code_label * +ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, + bool swap_operands) +{ + bool unordered_compare = ix86_unordered_fp_compare (code); + rtx_code_label *label; + rtx tmp, reg; + + if (swap_operands) + std::swap (op0, op1); + + label = gen_label_rtx (); + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); + reg = gen_rtx_REG (CCFPmode, FLAGS_REG); + emit_insn (gen_rtx_SET (reg, tmp)); + tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); + tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + JUMP_LABEL (tmp) = label; + + return label; +} + +/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1 + using comparison code CODE. Operands are swapped for the comparison if + SWAP_OPERANDS is true. Returns a rtx for the generated mask. */ +static rtx +ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, + bool swap_operands) +{ + rtx (*insn)(rtx, rtx, rtx, rtx); + machine_mode mode = GET_MODE (op0); + rtx mask = gen_reg_rtx (mode); + + if (swap_operands) + std::swap (op0, op1); + + insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse; + + emit_insn (insn (mask, op0, op1, + gen_rtx_fmt_ee (code, mode, op0, op1))); + return mask; +} + +/* Expand copysign from SIGN to the positive value ABS_VALUE + storing in RESULT. 
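Both ix86_expand_sse_fabs above and the copysign helper documented here work purely by masking the IEEE sign bit: fabs ANDs it away, copysign ORs the other operand's sign bit back in. A scalar model of the same bit manipulation (helper names invented, illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t
bits (double d)
{
  uint64_t u;
  memcpy (&u, &d, sizeof u);
  return u;
}

static double
from_bits (uint64_t u)
{
  double d;
  memcpy (&d, &u, sizeof d);
  return d;
}

int
main (void)
{
  const uint64_t sign = UINT64_C (1) << 63;
  double x = -2.5, s = -1.0;
  double abs_x = from_bits (bits (x) & ~sign);                 /* fabs */
  double res = from_bits (bits (abs_x) | (bits (s) & sign));   /* copysign */
  printf ("fabs(%g) = %g, copysign(%g, %g) = %g\n", x, abs_x, abs_x, s, res);
  return 0;
}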
If MASK is non-null, it shall be a mask to mask out + the sign-bit. */ + +static void +ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) +{ + machine_mode mode = GET_MODE (sign); + rtx sgn = gen_reg_rtx (mode); + if (mask == NULL_RTX) + { + machine_mode vmode; + + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + + mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); + if (!VECTOR_MODE_P (mode)) + { + /* We need to generate a scalar mode mask in this case. */ + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); + mask = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (mask, tmp)); + } + } + else + mask = gen_rtx_NOT (mode, mask); + emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign))); + emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn))); +} + +/* Expand SSE sequence for computing lround from OP1 storing + into OP0. */ + +void +ix86_expand_lround (rtx op0, rtx op1) +{ + /* C code for the stuff we're doing below: + tmp = op1 + copysign (nextafter (0.5, 0.0), op1) + return (long)tmp; + */ + machine_mode mode = GET_MODE (op1); + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + rtx adj; + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + + /* adj = copysign (0.5, op1) */ + adj = force_reg (mode, const_double_from_real_value (pred_half, mode)); + ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); + + /* adj = op1 + adj */ + adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); + + /* op0 = (imode)adj */ + expand_fix (op0, adj, 0); +} + +/* Expand SSE2 sequence for computing lround from OPERAND1 storing + into OPERAND0. */ + +void +ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) +{ + /* C code for the stuff we're doing below (for do_floor): + xi = (long)op1; + xi -= (double)xi > op1 ? 1 : 0; + return xi; + */ + machine_mode fmode = GET_MODE (op1); + machine_mode imode = GET_MODE (op0); + rtx ireg, freg, tmp; + rtx_code_label *label; + + /* reg = (long)op1 */ + ireg = gen_reg_rtx (imode); + expand_fix (ireg, op1, 0); + + /* freg = (double)reg */ + freg = gen_reg_rtx (fmode); + expand_float (freg, ireg, 0); + + /* ireg = (freg > op1) ? ireg - 1 : ireg */ + label = ix86_expand_sse_compare_and_jump (UNLE, + freg, op1, !do_floor); + tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS, + ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (ireg, tmp); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (op0, ireg); +} + +/* Generate and return a rtx of mode MODE for 2**n where n is the number + of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */ + +static rtx +ix86_gen_TWO52 (machine_mode mode) +{ + REAL_VALUE_TYPE TWO52r; + rtx TWO52; + + real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23); + TWO52 = const_double_from_real_value (TWO52r, mode); + TWO52 = force_reg (mode, TWO52); + + return TWO52; +} + +/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. 
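The lround expansion above adds copysign (nextafter (0.5, 0.0), op1) rather than a plain 0.5. The reason: for an input just below a halfway point, adding exactly 0.5 can round up in the addition itself, and the truncating conversion then lands one too high. A small demonstration, assuming IEEE doubles and the default round-to-nearest-even mode (illustrative only):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  double x = nextafter (0.5, 0.0);     /* largest double below 0.5 */
  double adj = nextafter (0.5, 0.0);   /* the pred(0.5) adjustment used above */
  printf ("(long)(x + 0.5) = %ld\n", (long) (x + 0.5));   /* 1 (too high) */
  printf ("(long)(x + adj) = %ld\n", (long) (x + adj));   /* 0 (correct) */
  printf ("lround(x)       = %ld\n", lround (x));         /* 0 */
  return 0;
}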
*/ + +void +ix86_expand_rint (rtx operand0, rtx operand1) +{ + /* C code for the stuff we're doing below: + xa = fabs (operand1); + if (!isless (xa, 2**52)) + return operand1; + two52 = 2**52; + if (flag_rounding_math) + { + two52 = copysign (two52, operand1); + xa = operand1; + } + xa = xa + two52 - two52; + return copysign (xa, operand1); + */ + machine_mode mode = GET_MODE (operand0); + rtx res, xa, TWO52, two52, mask; + rtx_code_label *label; + + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + TWO52 = ix86_gen_TWO52 (mode); + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + two52 = TWO52; + if (flag_rounding_math) + { + two52 = gen_reg_rtx (mode); + ix86_sse_copysign_to_positive (two52, TWO52, res, mask); + xa = res; + } + + xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT); + + ix86_sse_copysign_to_positive (res, xa, res, mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + xa = xa + TWO52 - TWO52; + x2 = copysign (xa, x); + Compensate. Floor: + if (x2 > x) + x2 -= 1; + Compensate. Ceil: + if (x2 < x) + x2 -= -1; + return x2; + */ + machine_mode mode = GET_MODE (operand0); + rtx xa, TWO52, tmp, one, res, mask; + rtx_code_label *label; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa = xa + TWO52 - TWO52; */ + xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); + + /* xa = copysign (xa, operand1) */ + ix86_sse_copysign_to_positive (xa, xa, res, mask); + + /* generate 1.0 or -1.0 */ + one = force_reg (mode, + const_double_from_real_value (do_floor + ? dconst1 : dconstm1, mode)); + + /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + /* We always need to subtract here to preserve signed zero. */ + tmp = expand_simple_binop (mode, MINUS, + xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + x2 = (double)(long)x; + Compensate. Floor: + if (x2 > x) + x2 -= 1; + Compensate. 
Ceil: + if (x2 < x) + x2 += 1; + if (HONOR_SIGNED_ZEROS (mode)) + return copysign (x2, x); + return x2; + */ + machine_mode mode = GET_MODE (operand0); + rtx xa, xi, TWO52, tmp, one, res, mask; + rtx_code_label *label; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa = (double)(long)x */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, res, 0); + expand_float (xa, xi, 0); + + /* generate 1.0 */ + one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + + /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, + xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + if (HONOR_SIGNED_ZEROS (mode)) + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing round from OPERAND1 storing + into OPERAND0. Sequence that works without relying on DImode truncation + via cvttsd2siq that is only available on 64bit targets. */ +void +ix86_expand_rounddf_32 (rtx operand0, rtx operand1) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), xa2, x2; + if (!isless (xa, TWO52)) + return x; + Using the absolute value and copying back sign makes + -0.0 -> -0.0 correct. + xa2 = xa + TWO52 - TWO52; + Compensate. + dxa = xa2 - xa; + if (dxa <= -0.5) + xa2 += 1; + else if (dxa > 0.5) + xa2 -= 1; + x2 = copysign (xa2, x); + return x2; + */ + machine_mode mode = GET_MODE (operand0); + rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask; + rtx_code_label *label; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa2 = xa + TWO52 - TWO52; */ + xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); + + /* dxa = xa2 - xa; */ + dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); + + /* generate 0.5, 1.0 and -0.5 */ + half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); + one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); + mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, + 0, OPTAB_DIRECT); + + /* Compensate. */ + tmp = gen_reg_rtx (mode); + /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + /* xa2 = xa2 + (dxa <= -0.5 ? 
1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + + /* res = copysign (xa2, operand1) */ + ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing trunc from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_trunc (rtx operand0, rtx operand1) +{ + /* C code for SSE variant we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + x2 = (double)(long)x; + if (HONOR_SIGNED_ZEROS (mode)) + return copysign (x2, x); + return x2; + */ + machine_mode mode = GET_MODE (operand0); + rtx xa, xi, TWO52, res, mask; + rtx_code_label *label; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* x = (double)(long)x */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, res, 0); + expand_float (res, xi, 0); + + if (HONOR_SIGNED_ZEROS (mode)) + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing trunc from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_truncdf_32 (rtx operand0, rtx operand1) +{ + machine_mode mode = GET_MODE (operand0); + rtx xa, mask, TWO52, one, res, smask, tmp; + rtx_code_label *label; + + /* C code for SSE variant we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + xa2 = xa + TWO52 - TWO52; + Compensate: + if (xa2 > xa) + xa2 -= 1.0; + x2 = copysign (xa2, x); + return x2; + */ + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &smask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* res = xa + TWO52 - TWO52; */ + tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + /* generate 1.0 */ + one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + + /* Compensate: res = xa2 - (res > xa ? 1 : 0) */ + mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); + emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one))); + tmp = expand_simple_binop (mode, MINUS, + res, mask, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + /* res = copysign (res, operand1) */ + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing round from OPERAND1 storing + into OPERAND0. 
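The SSE rounding expanders above all compare |x| against 2**52 to leave already-integral large values untouched, and the rint and *_32 variants additionally use the classic add-and-subtract trick supported by ix86_gen_TWO52: for |x| < 2**52 the spacing of doubles near x + 2**52 is exactly 1.0, so the addition rounds x to an integer in the current rounding mode and the subtraction recovers that integer, after which sign and half-ulp compensations are applied. A minimal illustration, assuming IEEE double arithmetic and the default rounding mode (not part of the patch):

#include <stdio.h>

int
main (void)
{
  const double two52 = 4503599627370496.0;   /* 2**52 */
  const double xs[] = { 0.3, 1.5, 2.5, 3.7 };
  for (unsigned i = 0; i < sizeof xs / sizeof xs[0]; ++i)
    {
      double r = (xs[i] + two52) - two52;    /* rounds ties to even */
      printf ("x=%.1f  rint-like=%.1f\n", xs[i], r);
    }
  return 0;
}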
*/ +void +ix86_expand_round (rtx operand0, rtx operand1) +{ + /* C code for the stuff we're doing below: + double xa = fabs (x); + if (!isless (xa, TWO52)) + return x; + xa = (double)(long)(xa + nextafter (0.5, 0.0)); + return copysign (xa, x); + */ + machine_mode mode = GET_MODE (operand0); + rtx res, TWO52, xa, xi, half, mask; + rtx_code_label *label; + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + TWO52 = ix86_gen_TWO52 (mode); + xa = ix86_expand_sse_fabs (res, &mask); + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + + /* xa = xa + 0.5 */ + half = force_reg (mode, const_double_from_real_value (pred_half, mode)); + xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); + + /* xa = (double)(int64_t)xa */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, xa, 0); + expand_float (xa, xi, 0); + + /* res = copysign (xa, operand1) */ + ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing round + from OP1 storing into OP0 using sse4 round insn. */ +void +ix86_expand_round_sse4 (rtx op0, rtx op1) +{ + machine_mode mode = GET_MODE (op0); + rtx e1, e2, res, half; + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + rtx (*gen_copysign) (rtx, rtx, rtx); + rtx (*gen_round) (rtx, rtx, rtx); + + switch (mode) + { + case E_SFmode: + gen_copysign = gen_copysignsf3; + gen_round = gen_sse4_1_roundsf2; + break; + case E_DFmode: + gen_copysign = gen_copysigndf3; + gen_round = gen_sse4_1_rounddf2; + break; + default: + gcc_unreachable (); + } + + /* round (a) = trunc (a + copysign (0.5, a)) */ + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + half = const_double_from_real_value (pred_half, mode); + + /* e1 = copysign (0.5, op1) */ + e1 = gen_reg_rtx (mode); + emit_insn (gen_copysign (e1, half, op1)); + + /* e2 = op1 + e1 */ + e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT); + + /* res = trunc (e2) */ + res = gen_reg_rtx (mode); + emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC))); + + emit_move_insn (op0, res); +} + +/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel []))) + insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh + insn every time. */ + +static GTY(()) rtx_insn *vselect_insn; + +/* Initialize vselect_insn. 
*/ + +static void +init_vselect_insn (void) +{ + unsigned i; + rtx x; + + x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN)); + for (i = 0; i < MAX_VECT_LEN; ++i) + XVECEXP (x, 0, i) = const0_rtx; + x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx, + const0_rtx), x); + x = gen_rtx_SET (const0_rtx, x); + start_sequence (); + vselect_insn = emit_insn (x); + end_sequence (); +} + +/* Construct (set target (vec_select op0 (parallel perm))) and + return true if that's a valid instruction in the active ISA. */ + +static bool +expand_vselect (rtx target, rtx op0, const unsigned char *perm, + unsigned nelt, bool testing_p) +{ + unsigned int i; + rtx x, save_vconcat; + int icode; + + if (vselect_insn == NULL_RTX) + init_vselect_insn (); + + x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1); + PUT_NUM_ELEM (XVEC (x, 0), nelt); + for (i = 0; i < nelt; ++i) + XVECEXP (x, 0, i) = GEN_INT (perm[i]); + save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); + XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0; + PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target)); + SET_DEST (PATTERN (vselect_insn)) = target; + icode = recog_memoized (vselect_insn); + + if (icode >= 0 && !testing_p) + emit_insn (copy_rtx (PATTERN (vselect_insn))); + + SET_DEST (PATTERN (vselect_insn)) = const0_rtx; + XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat; + INSN_CODE (vselect_insn) = -1; + + return icode >= 0; +} + +/* Similar, but generate a vec_concat from op0 and op1 as well. */ + +static bool +expand_vselect_vconcat (rtx target, rtx op0, rtx op1, + const unsigned char *perm, unsigned nelt, + bool testing_p) +{ + machine_mode v2mode; + rtx x; + bool ok; + + if (vselect_insn == NULL_RTX) + init_vselect_insn (); + + if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode)) + return false; + x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); + PUT_MODE (x, v2mode); + XEXP (x, 0) = op0; + XEXP (x, 1) = op1; + ok = expand_vselect (target, x, perm, nelt, testing_p); + XEXP (x, 0) = const0_rtx; + XEXP (x, 1) = const0_rtx; + return ok; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + using movss or movsd. */ +static bool +expand_vec_perm_movs (struct expand_vec_perm_d *d) +{ + machine_mode vmode = d->vmode; + unsigned i, nelt = d->nelt; + rtx x; + + if (d->one_operand_p) + return false; + + if (!(TARGET_SSE && vmode == V4SFmode) + && !(TARGET_SSE2 && vmode == V2DFmode)) + return false; + + /* Only the first element is changed. */ + if (d->perm[0] != nelt && d->perm[0] != 0) + return false; + for (i = 1; i < nelt; ++i) + if (d->perm[i] != i + nelt - d->perm[0]) + return false; + + if (d->testing_p) + return true; + + if (d->perm[0] == nelt) + x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1)); + else + x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1)); + + emit_insn (gen_rtx_SET (d->target, x)); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of blendp[sd] / pblendw / pblendvb / vpblendd. 
*/ + +static bool +expand_vec_perm_blend (struct expand_vec_perm_d *d) +{ + machine_mode mmode, vmode = d->vmode; + unsigned i, nelt = d->nelt; + unsigned HOST_WIDE_INT mask; + rtx target, op0, op1, maskop, x; + rtx rperm[32], vperm; + + if (d->one_operand_p) + return false; + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 + && (TARGET_AVX512BW + || GET_MODE_UNIT_SIZE (vmode) >= 4)) + ; + else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + ; + else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) + ; + else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + ; + else + return false; + + /* This is a blend, not a permute. Elements must stay in their + respective lanes. */ + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (!(e == i || e == i + nelt)) + return false; + } + + if (d->testing_p) + return true; + + /* ??? Without SSE4.1, we could implement this with and/andn/or. This + decision should be extracted elsewhere, so that we only try that + sequence once all budget==3 options have been tried. */ + target = d->target; + op0 = d->op0; + op1 = d->op1; + mask = 0; + + switch (vmode) + { + case E_V8DFmode: + case E_V16SFmode: + case E_V4DFmode: + case E_V8SFmode: + case E_V2DFmode: + case E_V4SFmode: + case E_V8HImode: + case E_V8SImode: + case E_V32HImode: + case E_V64QImode: + case E_V16SImode: + case E_V8DImode: + for (i = 0; i < nelt; ++i) + mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i; + break; + + case E_V2DImode: + for (i = 0; i < 2; ++i) + mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); + vmode = V8HImode; + goto do_subreg; + + case E_V4SImode: + for (i = 0; i < 4; ++i) + mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); + vmode = V8HImode; + goto do_subreg; + + case E_V16QImode: + /* See if bytes move in pairs so we can use pblendw with + an immediate argument, rather than pblendvb with a vector + argument. */ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + { + use_pblendvb: + for (i = 0; i < nelt; ++i) + rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); + + finish_pblendvb: + vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); + vperm = force_reg (vmode, vperm); + + if (GET_MODE_SIZE (vmode) == 16) + emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); + else + emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + return true; + } + + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 2] >= 16) << i; + vmode = V8HImode; + /* FALLTHRU */ + + do_subreg: + target = gen_reg_rtx (vmode); + op0 = gen_lowpart (vmode, op0); + op1 = gen_lowpart (vmode, op1); + break; + + case E_V32QImode: + /* See if bytes move in pairs. If not, vpblendvb must be used. */ + for (i = 0; i < 32; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + goto use_pblendvb; + /* See if bytes move in quadruplets. If yes, vpblendd + with immediate can be used. */ + for (i = 0; i < 32; i += 4) + if (d->perm[i] + 2 != d->perm[i + 2]) + break; + if (i < 32) + { + /* See if bytes move the same in both lanes. If yes, + vpblendw with immediate can be used. */ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 16 != d->perm[i + 16]) + goto use_pblendvb; + + /* Use vpblendw. */ + for (i = 0; i < 16; ++i) + mask |= (d->perm[i * 2] >= 32) << i; + vmode = V16HImode; + goto do_subreg; + } + + /* Use vpblendd. 
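To make the mask computation in expand_vec_perm_blend above concrete with an invented example: for a V8SImode blend with perm = { 0, 9, 2, 11, 4, 13, 6, 15 }, the result positions taken from op1 are those whose index is >= 8, i.e. positions 1, 3, 5 and 7, so the loop builds mask = 0xaa and that value becomes the immediate operand of the blend pattern matched by the final VEC_MERGE.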
*/ + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 4] >= 32) << i; + vmode = V8SImode; + goto do_subreg; + + case E_V16HImode: + /* See if words move in pairs. If yes, vpblendd can be used. */ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + break; + if (i < 16) + { + /* See if words move the same in both lanes. If not, + vpblendvb must be used. */ + for (i = 0; i < 8; i++) + if (d->perm[i] + 8 != d->perm[i + 8]) + { + /* Use vpblendvb. */ + for (i = 0; i < 32; ++i) + rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx); + + vmode = V32QImode; + nelt = 32; + target = gen_reg_rtx (vmode); + op0 = gen_lowpart (vmode, op0); + op1 = gen_lowpart (vmode, op1); + goto finish_pblendvb; + } + + /* Use vpblendw. */ + for (i = 0; i < 16; ++i) + mask |= (d->perm[i] >= 16) << i; + break; + } + + /* Use vpblendd. */ + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 2] >= 16) << i; + vmode = V8SImode; + goto do_subreg; + + case E_V4DImode: + /* Use vpblendd. */ + for (i = 0; i < 4; ++i) + mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); + vmode = V8SImode; + goto do_subreg; + + default: + gcc_unreachable (); + } + + switch (vmode) + { + case E_V8DFmode: + case E_V8DImode: + mmode = QImode; + break; + case E_V16SFmode: + case E_V16SImode: + mmode = HImode; + break; + case E_V32HImode: + mmode = SImode; + break; + case E_V64QImode: + mmode = DImode; + break; + default: + mmode = VOIDmode; + } + + if (mmode != VOIDmode) + maskop = force_reg (mmode, gen_int_mode (mask, mmode)); + else + maskop = GEN_INT (mask); + + /* This matches five different patterns with the different modes. */ + x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop); + x = gen_rtx_SET (target, x); + emit_insn (x); + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of the variable form of vpermilps. + + Note that we will have already failed the immediate input vpermilps, + which requires that the high and low part shuffle be identical; the + variable form doesn't require that. */ + +static bool +expand_vec_perm_vpermil (struct expand_vec_perm_d *d) +{ + rtx rperm[8], vperm; + unsigned i; + + if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p) + return false; + + /* We can only permute within the 128-bit lane. */ + for (i = 0; i < 8; ++i) + { + unsigned e = d->perm[i]; + if (i < 4 ? e >= 4 : e < 4) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < 8; ++i) + { + unsigned e = d->perm[i]; + + /* Within each 128-bit lane, the elements of op0 are numbered + from 0 and the elements of op1 are numbered from 4. */ + if (e >= 8 + 4) + e -= 8; + else if (e >= 4) + e -= 4; + + rperm[i] = GEN_INT (e); + } + + vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); + vperm = force_reg (V8SImode, vperm); + emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); + + return true; +} + +/* Return true if permutation D can be performed as VMODE permutation + instead. 
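For instance, the V16QImode permutation { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 } only moves aligned 4-byte chunks, so it can equally be carried out as the V4SImode permutation { 1, 0, 3, 2 }.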
*/ + +static bool +valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d) +{ + unsigned int i, j, chunk; + + if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT + || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT + || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode)) + return false; + + if (GET_MODE_NUNITS (vmode) >= d->nelt) + return true; + + chunk = d->nelt / GET_MODE_NUNITS (vmode); + for (i = 0; i < d->nelt; i += chunk) + if (d->perm[i] & (chunk - 1)) + return false; + else + for (j = 1; j < chunk; ++j) + if (d->perm[i] + j != d->perm[i + j]) + return false; + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */ + +static bool +expand_vec_perm_pshufb (struct expand_vec_perm_d *d) +{ + unsigned i, nelt, eltsz, mask; + unsigned char perm[64]; + machine_mode vmode = V16QImode; + rtx rperm[64], vperm, target, op0, op1; + + nelt = d->nelt; + + if (!d->one_operand_p) + { + if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) + { + if (TARGET_AVX2 + && valid_perm_using_mode_p (V2TImode, d)) + { + if (d->testing_p) + return true; + + /* Use vperm2i128 insn. The pattern uses + V4DImode instead of V2TImode. */ + target = d->target; + if (d->vmode != V4DImode) + target = gen_reg_rtx (V4DImode); + op0 = gen_lowpart (V4DImode, d->op0); + op1 = gen_lowpart (V4DImode, d->op1); + rperm[0] + = GEN_INT ((d->perm[0] / (nelt / 2)) + | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); + emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + return true; + } + return false; + } + } + else + { + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_SSSE3) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX2) + return false; + + /* V4DImode should be already handled through + expand_vselect by vpermq instruction. */ + gcc_assert (d->vmode != V4DImode); + + vmode = V32QImode; + if (d->vmode == V8SImode + || d->vmode == V16HImode + || d->vmode == V32QImode) + { + /* First see if vpermq can be used for + V8SImode/V16HImode/V32QImode. */ + if (valid_perm_using_mode_p (V4DImode, d)) + { + for (i = 0; i < 4; i++) + perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; + if (d->testing_p) + return true; + target = gen_reg_rtx (V4DImode); + if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), + perm, 4, false)) + { + emit_move_insn (d->target, + gen_lowpart (d->vmode, target)); + return true; + } + return false; + } + + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V8SImode, d)) + vmode = V8SImode; + } + /* Or if vpermps can be used. */ + else if (d->vmode == V8SFmode) + vmode = V8SImode; + + if (vmode == V32QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 2)) + return false; + } + } + else if (GET_MODE_SIZE (d->vmode) == 64) + { + if (!TARGET_AVX512BW) + return false; + + /* If vpermq didn't work, vpshufb won't work either. */ + if (d->vmode == V8DFmode || d->vmode == V8DImode) + return false; + + vmode = V64QImode; + if (d->vmode == V16SImode + || d->vmode == V32HImode + || d->vmode == V64QImode) + { + /* First see if vpermq can be used for + V16SImode/V32HImode/V64QImode. 
*/ + if (valid_perm_using_mode_p (V8DImode, d)) + { + for (i = 0; i < 8; i++) + perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; + if (d->testing_p) + return true; + target = gen_reg_rtx (V8DImode); + if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), + perm, 8, false)) + { + emit_move_insn (d->target, + gen_lowpart (d->vmode, target)); + return true; + } + return false; + } + + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V16SImode, d)) + vmode = V16SImode; + } + /* Or if vpermps can be used. */ + else if (d->vmode == V16SFmode) + vmode = V16SImode; + if (vmode == V64QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 4)) + return false; + } + } + else + return false; + } + + if (d->testing_p) + return true; + + if (vmode == V8SImode) + for (i = 0; i < 8; ++i) + rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); + else if (vmode == V16SImode) + for (i = 0; i < 16; ++i) + rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); + else + { + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + if (!d->one_operand_p) + mask = 2 * nelt - 1; + else if (vmode == V16QImode) + mask = nelt - 1; + else if (vmode == V64QImode) + mask = nelt / 4 - 1; + else + mask = nelt / 2 - 1; + + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & mask; + for (j = 0; j < eltsz; ++j) + rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + } + } + + vperm = gen_rtx_CONST_VECTOR (vmode, + gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); + vperm = force_reg (vmode, vperm); + + target = d->target; + if (d->vmode != vmode) + target = gen_reg_rtx (vmode); + op0 = gen_lowpart (vmode, d->op0); + if (d->one_operand_p) + { + if (vmode == V16QImode) + emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); + else if (vmode == V32QImode) + emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); + else if (vmode == V64QImode) + emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); + else if (vmode == V8SFmode) + emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); + else if (vmode == V8SImode) + emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); + else if (vmode == V16SFmode) + emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); + else if (vmode == V16SImode) + emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); + else + gcc_unreachable (); + } + else + { + op1 = gen_lowpart (vmode, d->op1); + emit_insn (gen_xop_pperm (target, op0, op1, vperm)); + } + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + + return true; +} + +/* For V*[QHS]Imode permutations, check if the same permutation + can't be performed in a 2x, 4x or 8x wider inner mode. 
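For example, a V16QImode permutation that swaps adjacent byte pairs, { 2, 3, 0, 1, 6, 7, 4, 5, ... }, is narrowed to the V8HImode halfword permutation { 1, 0, 3, 2, ... } before the usual expansion strategies are retried.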
*/ + +static bool +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, + struct expand_vec_perm_d *nd) +{ + int i; + machine_mode mode = VOIDmode; + + switch (d->vmode) + { + case E_V16QImode: mode = V8HImode; break; + case E_V32QImode: mode = V16HImode; break; + case E_V64QImode: mode = V32HImode; break; + case E_V8HImode: mode = V4SImode; break; + case E_V16HImode: mode = V8SImode; break; + case E_V32HImode: mode = V16SImode; break; + case E_V4SImode: mode = V2DImode; break; + case E_V8SImode: mode = V4DImode; break; + case E_V16SImode: mode = V8DImode; break; + default: return false; + } + for (i = 0; i < d->nelt; i += 2) + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) + return false; + nd->vmode = mode; + nd->nelt = d->nelt / 2; + for (i = 0; i < nd->nelt; i++) + nd->perm[i] = d->perm[2 * i] / 2; + if (GET_MODE_INNER (mode) != DImode) + canonicalize_vector_int_perm (nd, nd); + if (nd != d) + { + nd->one_operand_p = d->one_operand_p; + nd->testing_p = d->testing_p; + if (d->op0 == d->op1) + nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); + else + { + nd->op0 = gen_lowpart (nd->vmode, d->op0); + nd->op1 = gen_lowpart (nd->vmode, d->op1); + } + if (d->testing_p) + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); + else + nd->target = gen_reg_rtx (nd->vmode); + } + return true; +} + +/* Try to expand one-operand permutation with constant mask. */ + +static bool +ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) +{ + machine_mode mode = GET_MODE (d->op0); + machine_mode maskmode = mode; + rtx (*gen) (rtx, rtx, rtx) = NULL; + rtx target, op0, mask; + rtx vec[64]; + + if (!rtx_equal_p (d->op0, d->op1)) + return false; + + if (!TARGET_AVX512F) + return false; + + switch (mode) + { + case E_V16SImode: + gen = gen_avx512f_permvarv16si; + break; + case E_V16SFmode: + gen = gen_avx512f_permvarv16sf; + maskmode = V16SImode; + break; + case E_V8DImode: + gen = gen_avx512f_permvarv8di; + break; + case E_V8DFmode: + gen = gen_avx512f_permvarv8df; + maskmode = V8DImode; + break; + default: + return false; + } + + target = d->target; + op0 = d->op0; + for (int i = 0; i < d->nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); + emit_insn (gen (target, op0, force_reg (maskmode, mask))); + return true; +} + +static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool); + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D + in a single instruction. */ + +static bool +expand_vec_perm_1 (struct expand_vec_perm_d *d) +{ + unsigned i, nelt = d->nelt; + struct expand_vec_perm_d nd; + + /* Check plain VEC_SELECT first, because AVX has instructions that could + match both SEL and SEL+CONCAT, but the plain SEL will allow a memory + input where SEL+CONCAT may not. */ + if (d->one_operand_p) + { + int mask = nelt - 1; + bool identity_perm = true; + bool broadcast_perm = true; + + for (i = 0; i < nelt; i++) + { + nd.perm[i] = d->perm[i] & mask; + if (nd.perm[i] != i) + identity_perm = false; + if (nd.perm[i]) + broadcast_perm = false; + } + + if (identity_perm) + { + if (!d->testing_p) + emit_move_insn (d->target, d->op0); + return true; + } + else if (broadcast_perm && TARGET_AVX2) + { + /* Use vpbroadcast{b,w,d}. 
*/ + rtx (*gen) (rtx, rtx) = NULL; + switch (d->vmode) + { + case E_V64QImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv64qi_1; + break; + case E_V32QImode: + gen = gen_avx2_pbroadcastv32qi_1; + break; + case E_V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv32hi_1; + break; + case E_V16HImode: + gen = gen_avx2_pbroadcastv16hi_1; + break; + case E_V16SImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16si_1; + break; + case E_V8SImode: + gen = gen_avx2_pbroadcastv8si_1; + break; + case E_V16QImode: + gen = gen_avx2_pbroadcastv16qi; + break; + case E_V8HImode: + gen = gen_avx2_pbroadcastv8hi; + break; + case E_V16SFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16sf_1; + break; + case E_V8SFmode: + gen = gen_avx2_vec_dupv8sf_1; + break; + case E_V8DFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8df_1; + break; + case E_V8DImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8di_1; + break; + /* For other modes prefer other shuffles this function creates. */ + default: break; + } + if (gen != NULL) + { + if (!d->testing_p) + emit_insn (gen (d->target, d->op0)); + return true; + } + } + + if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) + return true; + + /* There are plenty of patterns in sse.md that are written for + SEL+CONCAT and are not replicated for a single op. Perhaps + that should be changed, to avoid the nastiness here. */ + + /* Recognize interleave style patterns, which means incrementing + every other permutation operand. */ + for (i = 0; i < nelt; i += 2) + { + nd.perm[i] = d->perm[i] & mask; + nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; + } + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, + d->testing_p)) + return true; + + /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ + if (nelt >= 4) + { + for (i = 0; i < nelt; i += 4) + { + nd.perm[i + 0] = d->perm[i + 0] & mask; + nd.perm[i + 1] = d->perm[i + 1] & mask; + nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; + nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; + } + + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, + d->testing_p)) + return true; + } + } + + /* Try movss/movsd instructions. */ + if (expand_vec_perm_movs (d)) + return true; + + /* Finally, try the fully general two operand permute. */ + if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt, + d->testing_p)) + return true; + + /* Recognize interleave style patterns with reversed operands. */ + if (!d->one_operand_p) + { + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (e >= nelt) + e -= nelt; + else + e += nelt; + nd.perm[i] = e; + } + + if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, + d->testing_p)) + return true; + } + + /* Try the SSE4.1 blend variable merge instructions. */ + if (expand_vec_perm_blend (d)) + return true; + + /* Try one of the AVX vpermil variable permutations. */ + if (expand_vec_perm_vpermil (d)) + return true; + + /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, + vpshufb, vpermd, vpermps or vpermq variable permutation. */ + if (expand_vec_perm_pshufb (d)) + return true; + + /* Try the AVX2 vpalignr instruction. */ + if (expand_vec_perm_palignr (d, true)) + return true; + + /* Try the AVX512F vperm{s,d} instructions. */ + if (ix86_expand_vec_one_operand_perm_avx512 (d)) + return true; + + /* Try the AVX512F vpermt2/vpermi2 instructions. 
*/ + if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) + return true; + + /* See if we can get the same permutation in different vector integer + mode. */ + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + { + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); + return true; + } + return false; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of a pair of pshuflw + pshufhw instructions. */ + +static bool +expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) +{ + unsigned char perm2[MAX_VECT_LEN]; + unsigned i; + bool ok; + + if (d->vmode != V8HImode || !d->one_operand_p) + return false; + + /* The two permutations only operate in 64-bit lanes. */ + for (i = 0; i < 4; ++i) + if (d->perm[i] >= 4) + return false; + for (i = 4; i < 8; ++i) + if (d->perm[i] < 4) + return false; + + if (d->testing_p) + return true; + + /* Emit the pshuflw. */ + memcpy (perm2, d->perm, 4); + for (i = 4; i < 8; ++i) + perm2[i] = i; + ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p); + gcc_assert (ok); + + /* Emit the pshufhw. */ + memcpy (perm2 + 4, d->perm + 4, 4); + for (i = 0; i < 4; ++i) + perm2[i] = i; + ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p); + gcc_assert (ok); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + the permutation using the SSSE3 palignr instruction. This succeeds + when all of the elements in PERM fit within one vector and we merely + need to shift them down so that a single vector permutation has a + chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only + the vpalignr instruction itself can perform the requested permutation. */ + +static bool +expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p) +{ + unsigned i, nelt = d->nelt; + unsigned min, max, minswap, maxswap; + bool in_order, ok, swap = false; + rtx shift, target; + struct expand_vec_perm_d dcopy; + + /* Even with AVX, palignr only operates on 128-bit vectors, + in AVX2 palignr operates on both 128-bit lanes. */ + if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) + return false; + + min = 2 * nelt; + max = 0; + minswap = 2 * nelt; + maxswap = 0; + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + unsigned eswap = d->perm[i] ^ nelt; + if (GET_MODE_SIZE (d->vmode) == 32) + { + e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1); + eswap = e ^ (nelt / 2); + } + if (e < min) + min = e; + if (e > max) + max = e; + if (eswap < minswap) + minswap = eswap; + if (eswap > maxswap) + maxswap = eswap; + } + if (min == 0 + || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt)) + { + if (d->one_operand_p + || minswap == 0 + || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32 + ? nelt / 2 : nelt)) + return false; + swap = true; + min = minswap; + max = maxswap; + } + + /* Given that we have SSSE3, we know we'll be able to implement the + single operand permutation after the palignr with pshufb for + 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed + first. 
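For instance, the two-operand V16QImode permutation { 3, 4, ..., 18 } picks sixteen consecutive bytes spanning both inputs, so a single palignr that shifts the concatenated operands down by three bytes already yields the result; when the selected window of bytes is not in order, a follow-up one-operand shuffle (pshufb) finishes the job.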
*/ + if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p) + return true; + + dcopy = *d; + if (swap) + { + dcopy.op0 = d->op1; + dcopy.op1 = d->op0; + for (i = 0; i < nelt; ++i) + dcopy.perm[i] ^= nelt; + } + + in_order = true; + for (i = 0; i < nelt; ++i) + { + unsigned e = dcopy.perm[i]; + if (GET_MODE_SIZE (d->vmode) == 32 + && e >= nelt + && (e & (nelt / 2 - 1)) < min) + e = e - min - (nelt / 2); + else + e = e - min; + if (e != i) + in_order = false; + dcopy.perm[i] = e; + } + dcopy.one_operand_p = true; + + if (single_insn_only_p && !in_order) + return false; + + /* For AVX2, test whether we can permute the result in one instruction. */ + if (d->testing_p) + { + if (in_order) + return true; + dcopy.op1 = dcopy.op0; + return expand_vec_perm_1 (&dcopy); + } + + shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode)); + if (GET_MODE_SIZE (d->vmode) == 16) + { + target = gen_reg_rtx (TImode); + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1), + gen_lowpart (TImode, dcopy.op0), shift)); + } + else + { + target = gen_reg_rtx (V2TImode); + emit_insn (gen_avx2_palignrv2ti (target, + gen_lowpart (V2TImode, dcopy.op1), + gen_lowpart (V2TImode, dcopy.op0), + shift)); + } + + dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); + + /* Test for the degenerate case where the alignment by itself + produces the desired permutation. */ + if (in_order) + { + emit_move_insn (d->target, dcopy.op0); + return true; + } + + ok = expand_vec_perm_1 (&dcopy); + gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32); + + return ok; +} + +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify + the permutation using the SSE4_1 pblendv instruction. Potentially + reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */ + +static bool +expand_vec_perm_pblendv (struct expand_vec_perm_d *d) +{ + unsigned i, which, nelt = d->nelt; + struct expand_vec_perm_d dcopy, dcopy1; + machine_mode vmode = d->vmode; + bool ok; + + /* Use the same checks as in expand_vec_perm_blend. */ + if (d->one_operand_p) + return false; + if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + ; + else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) + ; + else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + ; + else + return false; + + /* Figure out where permutation elements stay not in their + respective lanes. */ + for (i = 0, which = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (e != i) + which |= (e < nelt ? 1 : 2); + } + /* We can pblend the part where elements stay not in their + respective lanes only when these elements are all in one + half of a permutation. + {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective + lanes, but both 8 and 9 >= 8 + {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their + respective lanes and 8 >= 8, but 2 not. */ + if (which != 1 && which != 2) + return false; + if (d->testing_p && GET_MODE_SIZE (vmode) == 16) + return true; + + /* First we apply one operand permutation to the part where + elements stay not in their respective lanes. 
*/ + dcopy = *d; + if (which == 2) + dcopy.op0 = dcopy.op1 = d->op1; + else + dcopy.op0 = dcopy.op1 = d->op0; + if (!d->testing_p) + dcopy.target = gen_reg_rtx (vmode); + dcopy.one_operand_p = true; + + for (i = 0; i < nelt; ++i) + dcopy.perm[i] = d->perm[i] & (nelt - 1); + + ok = expand_vec_perm_1 (&dcopy); + if (GET_MODE_SIZE (vmode) != 16 && !ok) + return false; + else + gcc_assert (ok); + if (d->testing_p) + return true; + + /* Next we put permuted elements into their positions. */ + dcopy1 = *d; + if (which == 2) + dcopy1.op1 = dcopy.target; + else + dcopy1.op0 = dcopy.target; + + for (i = 0; i < nelt; ++i) + dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i); + + ok = expand_vec_perm_blend (&dcopy1); + gcc_assert (ok); + + return true; +} + +static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + a two vector permutation into a single vector permutation by using + an interleave operation to merge the vectors. */ + +static bool +expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dremap, dfinal; + unsigned i, nelt = d->nelt, nelt2 = nelt / 2; + unsigned HOST_WIDE_INT contents; + unsigned char remap[2 * MAX_VECT_LEN]; + rtx_insn *seq; + bool ok, same_halves = false; + + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (d->one_operand_p) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX) + return false; + /* For 32-byte modes allow even d->one_operand_p. + The lack of cross-lane shuffling in some instructions + might prevent a single insn shuffle. */ + dfinal = *d; + dfinal.testing_p = true; + /* If expand_vec_perm_interleave3 can expand this into + a 3 insn sequence, give up and let it be expanded as + 3 insn sequence. While that is one insn longer, + it doesn't need a memory operand and in the common + case that both interleave low and high permutations + with the same operands are adjacent needs 4 insns + for both after CSE. */ + if (expand_vec_perm_interleave3 (&dfinal)) + return false; + } + else + return false; + + /* Examine from whence the elements come. */ + contents = 0; + for (i = 0; i < nelt; ++i) + contents |= HOST_WIDE_INT_1U << d->perm[i]; + + memset (remap, 0xff, sizeof (remap)); + dremap = *d; + + if (GET_MODE_SIZE (d->vmode) == 16) + { + unsigned HOST_WIDE_INT h1, h2, h3, h4; + + /* Split the two input vectors into 4 halves. */ + h1 = (HOST_WIDE_INT_1U << nelt2) - 1; + h2 = h1 << nelt2; + h3 = h2 << nelt2; + h4 = h3 << nelt2; + + /* If the elements from the low halves use interleave low, and similarly + for interleave high. If the elements are from mis-matched halves, we + can use shufps for V4SF/V4SI or do a DImode shuffle. 
*/ + if ((contents & (h1 | h3)) == contents) + { + /* punpckl* */ + for (i = 0; i < nelt2; ++i) + { + remap[i] = i * 2; + remap[i + nelt] = i * 2 + 1; + dremap.perm[i * 2] = i; + dremap.perm[i * 2 + 1] = i + nelt; + } + if (!TARGET_SSE2 && d->vmode == V4SImode) + dremap.vmode = V4SFmode; + } + else if ((contents & (h2 | h4)) == contents) + { + /* punpckh* */ + for (i = 0; i < nelt2; ++i) + { + remap[i + nelt2] = i * 2; + remap[i + nelt + nelt2] = i * 2 + 1; + dremap.perm[i * 2] = i + nelt2; + dremap.perm[i * 2 + 1] = i + nelt + nelt2; + } + if (!TARGET_SSE2 && d->vmode == V4SImode) + dremap.vmode = V4SFmode; + } + else if ((contents & (h1 | h4)) == contents) + { + /* shufps */ + for (i = 0; i < nelt2; ++i) + { + remap[i] = i; + remap[i + nelt + nelt2] = i + nelt2; + dremap.perm[i] = i; + dremap.perm[i + nelt2] = i + nelt + nelt2; + } + if (nelt != 4) + { + /* shufpd */ + dremap.vmode = V2DImode; + dremap.nelt = 2; + dremap.perm[0] = 0; + dremap.perm[1] = 3; + } + } + else if ((contents & (h2 | h3)) == contents) + { + /* shufps */ + for (i = 0; i < nelt2; ++i) + { + remap[i + nelt2] = i; + remap[i + nelt] = i + nelt2; + dremap.perm[i] = i + nelt2; + dremap.perm[i + nelt2] = i + nelt; + } + if (nelt != 4) + { + /* shufpd */ + dremap.vmode = V2DImode; + dremap.nelt = 2; + dremap.perm[0] = 1; + dremap.perm[1] = 2; + } + } + else + return false; + } + else + { + unsigned int nelt4 = nelt / 4, nzcnt = 0; + unsigned HOST_WIDE_INT q[8]; + unsigned int nonzero_halves[4]; + + /* Split the two input vectors into 8 quarters. */ + q[0] = (HOST_WIDE_INT_1U << nelt4) - 1; + for (i = 1; i < 8; ++i) + q[i] = q[0] << (nelt4 * i); + for (i = 0; i < 4; ++i) + if (((q[2 * i] | q[2 * i + 1]) & contents) != 0) + { + nonzero_halves[nzcnt] = i; + ++nzcnt; + } + + if (nzcnt == 1) + { + gcc_assert (d->one_operand_p); + nonzero_halves[1] = nonzero_halves[0]; + same_halves = true; + } + else if (d->one_operand_p) + { + gcc_assert (nonzero_halves[0] == 0); + gcc_assert (nonzero_halves[1] == 1); + } + + if (nzcnt <= 2) + { + if (d->perm[0] / nelt2 == nonzero_halves[1]) + { + /* Attempt to increase the likelihood that dfinal + shuffle will be intra-lane. */ + std::swap (nonzero_halves[0], nonzero_halves[1]); + } + + /* vperm2f128 or vperm2i128. 
*/ + for (i = 0; i < nelt2; ++i) + { + remap[i + nonzero_halves[1] * nelt2] = i + nelt2; + remap[i + nonzero_halves[0] * nelt2] = i; + dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2; + dremap.perm[i] = i + nonzero_halves[0] * nelt2; + } + + if (d->vmode != V8SFmode + && d->vmode != V4DFmode + && d->vmode != V8SImode) + { + dremap.vmode = V8SImode; + dremap.nelt = 8; + for (i = 0; i < 4; ++i) + { + dremap.perm[i] = i + nonzero_halves[0] * 4; + dremap.perm[i + 4] = i + nonzero_halves[1] * 4; + } + } + } + else if (d->one_operand_p) + return false; + else if (TARGET_AVX2 + && (contents & (q[0] | q[2] | q[4] | q[6])) == contents) + { + /* vpunpckl* */ + for (i = 0; i < nelt4; ++i) + { + remap[i] = i * 2; + remap[i + nelt] = i * 2 + 1; + remap[i + nelt2] = i * 2 + nelt2; + remap[i + nelt + nelt2] = i * 2 + nelt2 + 1; + dremap.perm[i * 2] = i; + dremap.perm[i * 2 + 1] = i + nelt; + dremap.perm[i * 2 + nelt2] = i + nelt2; + dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2; + } + } + else if (TARGET_AVX2 + && (contents & (q[1] | q[3] | q[5] | q[7])) == contents) + { + /* vpunpckh* */ + for (i = 0; i < nelt4; ++i) + { + remap[i + nelt4] = i * 2; + remap[i + nelt + nelt4] = i * 2 + 1; + remap[i + nelt2 + nelt4] = i * 2 + nelt2; + remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1; + dremap.perm[i * 2] = i + nelt4; + dremap.perm[i * 2 + 1] = i + nelt + nelt4; + dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4; + dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4; + } + } + else + return false; + } + + /* Use the remapping array set up above to move the elements from their + swizzled locations into their final destinations. */ + dfinal = *d; + for (i = 0; i < nelt; ++i) + { + unsigned e = remap[d->perm[i]]; + gcc_assert (e < nelt); + /* If same_halves is true, both halves of the remapped vector are the + same. Avoid cross-lane accesses if possible. */ + if (same_halves && i >= nelt2) + { + gcc_assert (e < nelt2); + dfinal.perm[i] = e + nelt2; + } + else + dfinal.perm[i] = e; + } + if (!d->testing_p) + { + dremap.target = gen_reg_rtx (dremap.vmode); + dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); + } + dfinal.op1 = dfinal.op0; + dfinal.one_operand_p = true; + + /* Test if the final remap can be done with a single insn. For V4SFmode or + V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */ + start_sequence (); + ok = expand_vec_perm_1 (&dfinal); + seq = get_insns (); + end_sequence (); + + if (!ok) + return false; + + if (d->testing_p) + return true; + + if (dremap.vmode != dfinal.vmode) + { + dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0); + dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1); + } + + ok = expand_vec_perm_1 (&dremap); + gcc_assert (ok); + + emit_insn (seq); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + a single vector cross-lane permutation into vpermq followed + by any of the single insn permutations. 
*/ + +static bool +expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dremap, dfinal; + unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4; + unsigned contents[2]; + bool ok; + + if (!(TARGET_AVX2 + && (d->vmode == V32QImode || d->vmode == V16HImode) + && d->one_operand_p)) + return false; + + contents[0] = 0; + contents[1] = 0; + for (i = 0; i < nelt2; ++i) + { + contents[0] |= 1u << (d->perm[i] / nelt4); + contents[1] |= 1u << (d->perm[i + nelt2] / nelt4); + } + + for (i = 0; i < 2; ++i) + { + unsigned int cnt = 0; + for (j = 0; j < 4; ++j) + if ((contents[i] & (1u << j)) != 0 && ++cnt > 2) + return false; + } + + if (d->testing_p) + return true; + + dremap = *d; + dremap.vmode = V4DImode; + dremap.nelt = 4; + dremap.target = gen_reg_rtx (V4DImode); + dremap.op0 = gen_lowpart (V4DImode, d->op0); + dremap.op1 = dremap.op0; + dremap.one_operand_p = true; + for (i = 0; i < 2; ++i) + { + unsigned int cnt = 0; + for (j = 0; j < 4; ++j) + if ((contents[i] & (1u << j)) != 0) + dremap.perm[2 * i + cnt++] = j; + for (; cnt < 2; ++cnt) + dremap.perm[2 * i + cnt] = 0; + } + + dfinal = *d; + dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); + dfinal.op1 = dfinal.op0; + dfinal.one_operand_p = true; + for (i = 0, j = 0; i < nelt; ++i) + { + if (i == nelt2) + j = 2; + dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0); + if ((d->perm[i] / nelt4) == dremap.perm[j]) + ; + else if ((d->perm[i] / nelt4) == dremap.perm[j + 1]) + dfinal.perm[i] |= nelt4; + else + gcc_unreachable (); + } + + ok = expand_vec_perm_1 (&dremap); + gcc_assert (ok); + + ok = expand_vec_perm_1 (&dfinal); + gcc_assert (ok); + + return true; +} + +static bool canonicalize_perm (struct expand_vec_perm_d *d); + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand + a vector permutation using two instructions, vperm2f128 resp. + vperm2i128 followed by any single in-lane permutation. */ + +static bool +expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dfirst, dsecond; + unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm; + bool ok; + + if (!TARGET_AVX + || GET_MODE_SIZE (d->vmode) != 32 + || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)) + return false; + + dsecond = *d; + dsecond.one_operand_p = false; + dsecond.testing_p = true; + + /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128 + immediate. For perm < 16 the second permutation uses + d->op0 as first operand, for perm >= 16 it uses d->op1 + as first operand. The second operand is the result of + vperm2[fi]128. */ + for (perm = 0; perm < 32; perm++) + { + /* Ignore permutations which do not move anything cross-lane. */ + if (perm < 16) + { + /* The second shuffle for e.g. V4DFmode has + 0123 and ABCD operands. + Ignore AB23, as 23 is already in the second lane + of the first operand. */ + if ((perm & 0xc) == (1 << 2)) continue; + /* And 01CD, as 01 is in the first lane of the first + operand. */ + if ((perm & 3) == 0) continue; + /* And 4567, as then the vperm2[fi]128 doesn't change + anything on the original 4567 second operand. */ + if ((perm & 0xf) == ((3 << 2) | 2)) continue; + } + else + { + /* The second shuffle for e.g. V4DFmode has + 4567 and ABCD operands. + Ignore AB67, as 67 is already in the second lane + of the first operand. */ + if ((perm & 0xc) == (3 << 2)) continue; + /* And 45CD, as 45 is in the first lane of the first + operand. 
*/ + if ((perm & 3) == 2) continue; + /* And 0123, as then the vperm2[fi]128 doesn't change + anything on the original 0123 first operand. */ + if ((perm & 0xf) == (1 << 2)) continue; + } + + for (i = 0; i < nelt; i++) + { + j = d->perm[i] / nelt2; + if (j == ((perm >> (2 * (i >= nelt2))) & 3)) + dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1)); + else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16)) + dsecond.perm[i] = d->perm[i] & (nelt - 1); + else + break; + } + + if (i == nelt) + { + start_sequence (); + ok = expand_vec_perm_1 (&dsecond); + end_sequence (); + } + else + ok = false; + + if (ok) + { + if (d->testing_p) + return true; + + /* Found a usable second shuffle. dfirst will be + vperm2f128 on d->op0 and d->op1. */ + dsecond.testing_p = false; + dfirst = *d; + dfirst.target = gen_reg_rtx (d->vmode); + for (i = 0; i < nelt; i++) + dfirst.perm[i] = (i & (nelt2 - 1)) + + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2; + + canonicalize_perm (&dfirst); + ok = expand_vec_perm_1 (&dfirst); + gcc_assert (ok); + + /* And dsecond is some single insn shuffle, taking + d->op0 and result of vperm2f128 (if perm < 16) or + d->op1 and result of vperm2f128 (otherwise). */ + if (perm >= 16) + dsecond.op0 = dsecond.op1; + dsecond.op1 = dfirst.target; + + ok = expand_vec_perm_1 (&dsecond); + gcc_assert (ok); + + return true; + } + + /* For one operand, the only useful vperm2f128 permutation is 0x01 + aka lanes swap. */ + if (d->one_operand_p) + return false; + } + + return false; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + a two vector permutation using 2 intra-lane interleave insns + and cross-lane shuffle for 32-byte vectors. */ + +static bool +expand_vec_perm_interleave3 (struct expand_vec_perm_d *d) +{ + unsigned i, nelt; + rtx (*gen) (rtx, rtx, rtx); + + if (d->one_operand_p) + return false; + if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) + ; + else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode)) + ; + else + return false; + + nelt = d->nelt; + if (d->perm[0] != 0 && d->perm[0] != nelt / 2) + return false; + for (i = 0; i < nelt; i += 2) + if (d->perm[i] != d->perm[0] + i / 2 + || d->perm[i + 1] != d->perm[0] + i / 2 + nelt) + return false; + + if (d->testing_p) + return true; + + switch (d->vmode) + { + case E_V32QImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv32qi; + else + gen = gen_vec_interleave_lowv32qi; + break; + case E_V16HImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv16hi; + else + gen = gen_vec_interleave_lowv16hi; + break; + case E_V8SImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv8si; + else + gen = gen_vec_interleave_lowv8si; + break; + case E_V4DImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv4di; + else + gen = gen_vec_interleave_lowv4di; + break; + case E_V8SFmode: + if (d->perm[0]) + gen = gen_vec_interleave_highv8sf; + else + gen = gen_vec_interleave_lowv8sf; + break; + case E_V4DFmode: + if (d->perm[0]) + gen = gen_vec_interleave_highv4df; + else + gen = gen_vec_interleave_lowv4df; + break; + default: + gcc_unreachable (); + } + + emit_insn (gen (d->target, d->op0, d->op1)); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement + a single vector permutation using a single intra-lane vector + permutation, vperm2f128 swapping the lanes and vblend* insn blending + the non-swapped and swapped vectors together. 
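For example, the one-operand V4DFmode permutation { 2, 1, 0, 3 } is handled as an in-lane shuffle (here the identity), a vperm2f128 lane swap of that result, and a vblendpd with mask 0b0101 that takes elements 0 and 2 from the swapped copy.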
*/ + +static bool +expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dfirst, dsecond; + unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2; + rtx_insn *seq; + bool ok; + rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; + + if (!TARGET_AVX + || TARGET_AVX2 + || (d->vmode != V8SFmode && d->vmode != V4DFmode) + || !d->one_operand_p) + return false; + + dfirst = *d; + for (i = 0; i < nelt; i++) + dfirst.perm[i] = 0xff; + for (i = 0, msk = 0; i < nelt; i++) + { + j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; + if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i]) + return false; + dfirst.perm[j] = d->perm[i]; + if (j != i) + msk |= (1 << i); + } + for (i = 0; i < nelt; i++) + if (dfirst.perm[i] == 0xff) + dfirst.perm[i] = i; + + if (!d->testing_p) + dfirst.target = gen_reg_rtx (dfirst.vmode); + + start_sequence (); + ok = expand_vec_perm_1 (&dfirst); + seq = get_insns (); + end_sequence (); + + if (!ok) + return false; + + if (d->testing_p) + return true; + + emit_insn (seq); + + dsecond = *d; + dsecond.op0 = dfirst.target; + dsecond.op1 = dfirst.target; + dsecond.one_operand_p = true; + dsecond.target = gen_reg_rtx (dsecond.vmode); + for (i = 0; i < nelt; i++) + dsecond.perm[i] = i ^ nelt2; + + ok = expand_vec_perm_1 (&dsecond); + gcc_assert (ok); + + blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; + emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk))); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF + permutation using two vperm2f128, followed by a vshufpd insn blending + the two vectors together. */ + +static bool +expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dfirst, dsecond, dthird; + bool ok; + + if (!TARGET_AVX || (d->vmode != V4DFmode)) + return false; + + if (d->testing_p) + return true; + + dfirst = *d; + dsecond = *d; + dthird = *d; + + dfirst.perm[0] = (d->perm[0] & ~1); + dfirst.perm[1] = (d->perm[0] & ~1) + 1; + dfirst.perm[2] = (d->perm[2] & ~1); + dfirst.perm[3] = (d->perm[2] & ~1) + 1; + dsecond.perm[0] = (d->perm[1] & ~1); + dsecond.perm[1] = (d->perm[1] & ~1) + 1; + dsecond.perm[2] = (d->perm[3] & ~1); + dsecond.perm[3] = (d->perm[3] & ~1) + 1; + dthird.perm[0] = (d->perm[0] % 2); + dthird.perm[1] = (d->perm[1] % 2) + 4; + dthird.perm[2] = (d->perm[2] % 2) + 2; + dthird.perm[3] = (d->perm[3] % 2) + 6; + + dfirst.target = gen_reg_rtx (dfirst.vmode); + dsecond.target = gen_reg_rtx (dsecond.vmode); + dthird.op0 = dfirst.target; + dthird.op1 = dsecond.target; + dthird.one_operand_p = false; + + canonicalize_perm (&dfirst); + canonicalize_perm (&dsecond); + + ok = expand_vec_perm_1 (&dfirst) + && expand_vec_perm_1 (&dsecond) + && expand_vec_perm_1 (&dthird); + + gcc_assert (ok); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word + permutation with two pshufb insns and an ior. We should have already + failed all two instruction sequences. */ + +static bool +expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) +{ + rtx rperm[2][16], vperm, l, h, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + return false; + gcc_assert (!d->one_operand_p); + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + + /* Generate two permutation masks. If the required element is within + the given vector it is shuffled into the proper lane. 
If the required + element is in the other vector, force a zero into the lane by setting + bit 7 in the permutation mask. */ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i]; + unsigned which = (e >= nelt); + if (e >= nelt) + e -= nelt; + + for (j = 0; j < eltsz; ++j) + { + rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); + rperm[1-which][i*eltsz + j] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); + vperm = force_reg (V16QImode, vperm); + + l = gen_reg_rtx (V16QImode); + op = gen_lowpart (V16QImode, d->op0); + emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); + + vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); + vperm = force_reg (V16QImode, vperm); + + h = gen_reg_rtx (V16QImode); + op = gen_lowpart (V16QImode, d->op1); + emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); + + op = d->target; + if (d->vmode != V16QImode) + op = gen_reg_rtx (V16QImode); + emit_insn (gen_iorv16qi3 (op, l, h)); + if (op != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + + return true; +} + +/* Implement arbitrary permutation of one V32QImode and V16QImode operand + with two vpshufb insns, vpermq and vpor. We should have already failed + all two or three instruction sequences. */ + +static bool +expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d) +{ + rtx rperm[2][32], vperm, l, h, hp, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_AVX2 + || !d->one_operand_p + || (d->vmode != V32QImode && d->vmode != V16HImode)) + return false; + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + + /* Generate two permutation masks. If the required element is within + the same lane, it is shuffled in. If the required element from the + other lane, force a zero by setting bit 7 in the permutation mask. + In the other mask the mask has non-negative elements if element + is requested from the other lane, but also moved to the other lane, + so that the result of vpshufb can have the two V2TImode halves + swapped. */ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & (nelt / 2 - 1); + unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; + + for (j = 0; j < eltsz; ++j) + { + rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j); + rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); + vperm = force_reg (V32QImode, vperm); + + h = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + + /* Swap the 128-byte lanes of h into hp. */ + hp = gen_reg_rtx (V4DImode); + op = gen_lowpart (V4DImode, h); + emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx, + const1_rtx)); + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); + vperm = force_reg (V32QImode, vperm); + + l = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + + op = d->target; + if (d->vmode != V32QImode) + op = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp))); + if (op != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. 
Implement extract-even + and extract-odd permutations of two V32QImode and V16QImode operand + with two vpshufb insns, vpor and vpermq. We should have already + failed all two or three instruction sequences. */ + +static bool +expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) +{ + rtx rperm[2][32], vperm, l, h, ior, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_AVX2 + || d->one_operand_p + || (d->vmode != V32QImode && d->vmode != V16HImode)) + return false; + + for (i = 0; i < d->nelt; ++i) + if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2)) + return false; + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + + /* Generate two permutation masks. In the first permutation mask + the first quarter will contain indexes for the first half + of the op0, the second quarter will contain bit 7 set, third quarter + will contain indexes for the second half of the op0 and the + last quarter bit 7 set. In the second permutation mask + the first quarter will contain bit 7 set, the second quarter + indexes for the first half of the op1, the third quarter bit 7 set + and last quarter indexes for the second half of the op1. + I.e. the first mask e.g. for V32QImode extract even will be: + 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128 + (all values masked with 0xf except for -128) and second mask + for extract even will be + -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & (nelt / 2 - 1); + unsigned which = d->perm[i] >= nelt; + unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0; + + for (j = 0; j < eltsz; ++j) + { + rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j); + rperm[1 - which][(i * eltsz + j) ^ xorv] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); + vperm = force_reg (V32QImode, vperm); + + l = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); + vperm = force_reg (V32QImode, vperm); + + h = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op1); + emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + + ior = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (ior, l, h)); + + /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */ + op = gen_reg_rtx (V4DImode); + ior = gen_lowpart (V4DImode, ior); + emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even + and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands + with two "and" and "pack" or two "shift" and "pack" insns. We should + have already failed all two instruction sequences. */ + +static bool +expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) +{ + rtx op, dop0, dop1, t; + unsigned i, odd, c, s, nelt = d->nelt; + bool end_perm = false; + machine_mode half_mode; + rtx (*gen_and) (rtx, rtx, rtx); + rtx (*gen_pack) (rtx, rtx, rtx); + rtx (*gen_shift) (rtx, rtx, rtx); + + if (d->one_operand_p) + return false; + + switch (d->vmode) + { + case E_V8HImode: + /* Required for "pack". 
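(packusdw is an SSE4.1 instruction; the byte variant packuswb used for the V16QImode case below is plain SSE2, hence no target check there.)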
*/ + if (!TARGET_SSE4_1) + return false; + c = 0xffff; + s = 16; + half_mode = V4SImode; + gen_and = gen_andv4si3; + gen_pack = gen_sse4_1_packusdw; + gen_shift = gen_lshrv4si3; + break; + case E_V16QImode: + /* No check as all instructions are SSE2. */ + c = 0xff; + s = 8; + half_mode = V8HImode; + gen_and = gen_andv8hi3; + gen_pack = gen_sse2_packuswb; + gen_shift = gen_lshrv8hi3; + break; + case E_V16HImode: + if (!TARGET_AVX2) + return false; + c = 0xffff; + s = 16; + half_mode = V8SImode; + gen_and = gen_andv8si3; + gen_pack = gen_avx2_packusdw; + gen_shift = gen_lshrv8si3; + end_perm = true; + break; + case E_V32QImode: + if (!TARGET_AVX2) + return false; + c = 0xff; + s = 8; + half_mode = V16HImode; + gen_and = gen_andv16hi3; + gen_pack = gen_avx2_packuswb; + gen_shift = gen_lshrv16hi3; + end_perm = true; + break; + default: + /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than + general shuffles. */ + return false; + } + + /* Check that permutation is even or odd. */ + odd = d->perm[0]; + if (odd > 1) + return false; + + for (i = 1; i < nelt; ++i) + if (d->perm[i] != 2 * i + odd) + return false; + + if (d->testing_p) + return true; + + dop0 = gen_reg_rtx (half_mode); + dop1 = gen_reg_rtx (half_mode); + if (odd == 0) + { + t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); + t = force_reg (half_mode, t); + emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); + emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); + } + else + { + emit_insn (gen_shift (dop0, + gen_lowpart (half_mode, d->op0), + GEN_INT (s))); + emit_insn (gen_shift (dop1, + gen_lowpart (half_mode, d->op1), + GEN_INT (s))); + } + /* In AVX2 for 256 bit case we need to permute pack result. */ + if (TARGET_AVX2 && end_perm) + { + op = gen_reg_rtx (d->vmode); + t = gen_reg_rtx (V4DImode); + emit_insn (gen_pack (op, dop0, dop1)); + emit_insn (gen_avx2_permv4di_1 (t, + gen_lowpart (V4DImode, op), + const0_rtx, + const2_rtx, + const1_rtx, + GEN_INT (3))); + emit_move_insn (d->target, gen_lowpart (d->vmode, t)); + } + else + emit_insn (gen_pack (d->target, dop0, dop1)); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even + and extract-odd permutations of two V64QI operands + with two "shifts", two "truncs" and one "concat" insns for "odd" + and two "truncs" and one concat insn for "even." + Have already failed all two instruction sequences. */ + +static bool +expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d) +{ + rtx t1, t2, t3, t4; + unsigned i, odd, nelt = d->nelt; + + if (!TARGET_AVX512BW + || d->one_operand_p + || d->vmode != V64QImode) + return false; + + /* Check that permutation is even or odd. */ + odd = d->perm[0]; + if (odd > 1) + return false; + + for (i = 1; i < nelt; ++i) + if (d->perm[i] != 2 * i + odd) + return false; + + if (d->testing_p) + return true; + + + if (odd) + { + t1 = gen_reg_rtx (V32HImode); + t2 = gen_reg_rtx (V32HImode); + emit_insn (gen_lshrv32hi3 (t1, + gen_lowpart (V32HImode, d->op0), + GEN_INT (8))); + emit_insn (gen_lshrv32hi3 (t2, + gen_lowpart (V32HImode, d->op1), + GEN_INT (8))); + } + else + { + t1 = gen_lowpart (V32HImode, d->op0); + t2 = gen_lowpart (V32HImode, d->op1); + } + + t3 = gen_reg_rtx (V32QImode); + t4 = gen_reg_rtx (V32QImode); + emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1)); + emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2)); + emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4)); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. 
Implement extract-even + and extract-odd permutations. */ + +static bool +expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) +{ + rtx t1, t2, t3, t4, t5; + + switch (d->vmode) + { + case E_V4DFmode: + if (d->testing_p) + break; + t1 = gen_reg_rtx (V4DFmode); + t2 = gen_reg_rtx (V4DFmode); + + /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ + emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20))); + emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31))); + + /* Now an unpck[lh]pd will produce the result required. */ + if (odd) + t3 = gen_avx_unpckhpd256 (d->target, t1, t2); + else + t3 = gen_avx_unpcklpd256 (d->target, t1, t2); + emit_insn (t3); + break; + + case E_V8SFmode: + { + int mask = odd ? 0xdd : 0x88; + + if (d->testing_p) + break; + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SFmode); + t3 = gen_reg_rtx (V8SFmode); + + /* Shuffle within the 128-bit lanes to produce: + { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */ + emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1, + GEN_INT (mask))); + + /* Shuffle the lanes around to produce: + { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */ + emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1, + GEN_INT (0x3))); + + /* Shuffle within the 128-bit lanes to produce: + { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */ + emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44))); + + /* Shuffle within the 128-bit lanes to produce: + { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */ + emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee))); + + /* Shuffle the lanes around to produce: + { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ + emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, + GEN_INT (0x20))); + } + break; + + case E_V2DFmode: + case E_V4SFmode: + case E_V2DImode: + case E_V4SImode: + /* These are always directly implementable by expand_vec_perm_1. */ + gcc_unreachable (); + + case E_V8HImode: + if (TARGET_SSE4_1) + return expand_vec_perm_even_odd_pack (d); + else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) + return expand_vec_perm_pshufb2 (d); + else + { + if (d->testing_p) + break; + /* We need 2*log2(N)-1 operations to achieve odd/even + with interleave. 
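For V8HImode that is 2*3 - 1 = 5 interleaves, matching the five interleave insns emitted below.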
*/ + t1 = gen_reg_rtx (V8HImode); + t2 = gen_reg_rtx (V8HImode); + emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1)); + emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1)); + emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1)); + emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1)); + if (odd) + t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2); + else + t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2); + emit_insn (t3); + } + break; + + case E_V16QImode: + return expand_vec_perm_even_odd_pack (d); + + case E_V16HImode: + case E_V32QImode: + return expand_vec_perm_even_odd_pack (d); + + case E_V64QImode: + return expand_vec_perm_even_odd_trunc (d); + + case E_V4DImode: + if (!TARGET_AVX2) + { + struct expand_vec_perm_d d_copy = *d; + d_copy.vmode = V4DFmode; + if (d->testing_p) + d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); + else + d_copy.target = gen_reg_rtx (V4DFmode); + d_copy.op0 = gen_lowpart (V4DFmode, d->op0); + d_copy.op1 = gen_lowpart (V4DFmode, d->op1); + if (expand_vec_perm_even_odd_1 (&d_copy, odd)) + { + if (!d->testing_p) + emit_move_insn (d->target, + gen_lowpart (V4DImode, d_copy.target)); + return true; + } + return false; + } + + if (d->testing_p) + break; + + t1 = gen_reg_rtx (V4DImode); + t2 = gen_reg_rtx (V4DImode); + + /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ + emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20))); + emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31))); + + /* Now an vpunpck[lh]qdq will produce the result required. */ + if (odd) + t3 = gen_avx2_interleave_highv4di (d->target, t1, t2); + else + t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2); + emit_insn (t3); + break; + + case E_V8SImode: + if (!TARGET_AVX2) + { + struct expand_vec_perm_d d_copy = *d; + d_copy.vmode = V8SFmode; + if (d->testing_p) + d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); + else + d_copy.target = gen_reg_rtx (V8SFmode); + d_copy.op0 = gen_lowpart (V8SFmode, d->op0); + d_copy.op1 = gen_lowpart (V8SFmode, d->op1); + if (expand_vec_perm_even_odd_1 (&d_copy, odd)) + { + if (!d->testing_p) + emit_move_insn (d->target, + gen_lowpart (V8SImode, d_copy.target)); + return true; + } + return false; + } + + if (d->testing_p) + break; + + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + t3 = gen_reg_rtx (V4DImode); + t4 = gen_reg_rtx (V4DImode); + t5 = gen_reg_rtx (V4DImode); + + /* Shuffle the lanes around into + { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */ + emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (0x20))); + emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (0x31))); + + /* Swap the 2nd and 3rd position in each lane into + { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ + emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3), + GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); + emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4), + GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); + + /* Now an vpunpck[lh]qdq will produce + { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. 
*/ + if (odd) + t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, t2)); + else + t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, t2)); + emit_insn (t3); + emit_move_insn (d->target, gen_lowpart (V8SImode, t5)); + break; + + default: + gcc_unreachable (); + } + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match + extract-even and extract-odd permutations. */ + +static bool +expand_vec_perm_even_odd (struct expand_vec_perm_d *d) +{ + unsigned i, odd, nelt = d->nelt; + + odd = d->perm[0]; + if (odd != 0 && odd != 1) + return false; + + for (i = 1; i < nelt; ++i) + if (d->perm[i] != 2 * i + odd) + return false; + + return expand_vec_perm_even_odd_1 (d, odd); +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast + permutations. We assume that expand_vec_perm_1 has already failed. */ + +static bool +expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) +{ + unsigned elt = d->perm[0], nelt2 = d->nelt / 2; + machine_mode vmode = d->vmode; + unsigned char perm2[4]; + rtx op0 = d->op0, dest; + bool ok; + + switch (vmode) + { + case E_V4DFmode: + case E_V8SFmode: + /* These are special-cased in sse.md so that we can optionally + use the vbroadcast instruction. They expand to two insns + if the input happens to be in a register. */ + gcc_unreachable (); + + case E_V2DFmode: + case E_V2DImode: + case E_V4SFmode: + case E_V4SImode: + /* These are always implementable using standard shuffle patterns. */ + gcc_unreachable (); + + case E_V8HImode: + case E_V16QImode: + /* These can be implemented via interleave. We save one insn by + stopping once we have promoted to V4SImode and then use pshufd. */ + if (d->testing_p) + return true; + do + { + rtx dest; + rtx (*gen) (rtx, rtx, rtx) + = vmode == V16QImode ? gen_vec_interleave_lowv16qi + : gen_vec_interleave_lowv8hi; + + if (elt >= nelt2) + { + gen = vmode == V16QImode ? gen_vec_interleave_highv16qi + : gen_vec_interleave_highv8hi; + elt -= nelt2; + } + nelt2 /= 2; + + dest = gen_reg_rtx (vmode); + emit_insn (gen (dest, op0, op0)); + vmode = get_mode_wider_vector (vmode); + op0 = gen_lowpart (vmode, dest); + } + while (vmode != V4SImode); + + memset (perm2, elt, 4); + dest = gen_reg_rtx (V4SImode); + ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); + gcc_assert (ok); + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + return true; + + case E_V64QImode: + case E_V32QImode: + case E_V16HImode: + case E_V8SImode: + case E_V4DImode: + /* For AVX2 broadcasts of the first element vpbroadcast* or + vpermq should be used by expand_vec_perm_1. */ + gcc_assert (!TARGET_AVX2 || d->perm[0]); + return false; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match + broadcast permutations. */ + +static bool +expand_vec_perm_broadcast (struct expand_vec_perm_d *d) +{ + unsigned i, elt, nelt = d->nelt; + + if (!d->one_operand_p) + return false; + + elt = d->perm[0]; + for (i = 1; i < nelt; ++i) + if (d->perm[i] != elt) + return false; + + return expand_vec_perm_broadcast_1 (d); +} + +/* Implement arbitrary permutations of two V64QImode operands + with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. 
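Concretely, one V32HImode permutation moves, for every even destination byte, the halfword that contains the wanted source byte into place, and the other does the same for the odd destination bytes; the two vpshufb masks then pick the correct byte out of each halfword while zeroing the rest, and the final vpor merges the two results.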
*/ +static bool +expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d) +{ + if (!TARGET_AVX512BW || !(d->vmode == V64QImode)) + return false; + + if (d->testing_p) + return true; + + struct expand_vec_perm_d ds[2]; + rtx rperm[128], vperm, target0, target1; + unsigned int i, nelt; + machine_mode vmode; + + nelt = d->nelt; + vmode = V64QImode; + + for (i = 0; i < 2; i++) + { + ds[i] = *d; + ds[i].vmode = V32HImode; + ds[i].nelt = 32; + ds[i].target = gen_reg_rtx (V32HImode); + ds[i].op0 = gen_lowpart (V32HImode, d->op0); + ds[i].op1 = gen_lowpart (V32HImode, d->op1); + } + + /* Prepare permutations such that the first one takes care of + putting the even bytes into the right positions or one higher + positions (ds[0]) and the second one takes care of + putting the odd bytes into the right positions or one below + (ds[1]). */ + + for (i = 0; i < nelt; i++) + { + ds[i & 1].perm[i / 2] = d->perm[i] / 2; + if (i & 1) + { + rperm[i] = constm1_rtx; + rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1)); + } + else + { + rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1)); + rperm[i + 64] = constm1_rtx; + } + } + + bool ok = expand_vec_perm_1 (&ds[0]); + gcc_assert (ok); + ds[0].target = gen_lowpart (V64QImode, ds[0].target); + + ok = expand_vec_perm_1 (&ds[1]); + gcc_assert (ok); + ds[1].target = gen_lowpart (V64QImode, ds[1].target); + + vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm)); + vperm = force_reg (vmode, vperm); + target0 = gen_reg_rtx (V64QImode); + emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm)); + + vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64)); + vperm = force_reg (vmode, vperm); + target1 = gen_reg_rtx (V64QImode); + emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm)); + + emit_insn (gen_iorv64qi3 (d->target, target0, target1)); + return true; +} + +/* Implement arbitrary permutation of two V32QImode and V16QImode operands + with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed + all the shorter instruction sequences. */ + +static bool +expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d) +{ + rtx rperm[4][32], vperm, l[2], h[2], op, m128; + unsigned int i, nelt, eltsz; + bool used[4]; + + if (!TARGET_AVX2 + || d->one_operand_p + || (d->vmode != V32QImode && d->vmode != V16HImode)) + return false; + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + + /* Generate 4 permutation masks. If the required element is within + the same lane, it is shuffled in. If the required element from the + other lane, force a zero by setting bit 7 in the permutation mask. + In the other mask the mask has non-negative elements if element + is requested from the other lane, but also moved to the other lane, + so that the result of vpshufb can have the two V2TImode halves + swapped. */ + m128 = GEN_INT (-128); + for (i = 0; i < 32; ++i) + { + rperm[0][i] = m128; + rperm[1][i] = m128; + rperm[2][i] = m128; + rperm[3][i] = m128; + } + used[0] = false; + used[1] = false; + used[2] = false; + used[3] = false; + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & (nelt / 2 - 1); + unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; + unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 
1 : 0); + + for (j = 0; j < eltsz; ++j) + rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j); + used[which] = true; + } + + for (i = 0; i < 2; ++i) + { + if (!used[2 * i + 1]) + { + h[i] = NULL_RTX; + continue; + } + vperm = gen_rtx_CONST_VECTOR (V32QImode, + gen_rtvec_v (32, rperm[2 * i + 1])); + vperm = force_reg (V32QImode, vperm); + h[i] = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm)); + } + + /* Swap the 128-byte lanes of h[X]. */ + for (i = 0; i < 2; ++i) + { + if (h[i] == NULL_RTX) + continue; + op = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]), + const2_rtx, GEN_INT (3), const0_rtx, + const1_rtx)); + h[i] = gen_lowpart (V32QImode, op); + } + + for (i = 0; i < 2; ++i) + { + if (!used[2 * i]) + { + l[i] = NULL_RTX; + continue; + } + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i])); + vperm = force_reg (V32QImode, vperm); + l[i] = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm)); + } + + for (i = 0; i < 2; ++i) + { + if (h[i] && l[i]) + { + op = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (op, l[i], h[i])); + l[i] = op; + } + else if (h[i]) + l[i] = h[i]; + } + + gcc_assert (l[0] && l[1]); + op = d->target; + if (d->vmode != V32QImode) + op = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (op, l[0], l[1])); + if (op != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + return true; +} + +/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits + taken care of, perform the expansion in D and return true on success. */ + +static bool +ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) +{ + /* Try a single instruction expansion. */ + if (expand_vec_perm_1 (d)) + return true; + + /* Try sequences of two instructions. */ + + if (expand_vec_perm_pshuflw_pshufhw (d)) + return true; + + if (expand_vec_perm_palignr (d, false)) + return true; + + if (expand_vec_perm_interleave2 (d)) + return true; + + if (expand_vec_perm_broadcast (d)) + return true; + + if (expand_vec_perm_vpermq_perm_1 (d)) + return true; + + if (expand_vec_perm_vperm2f128 (d)) + return true; + + if (expand_vec_perm_pblendv (d)) + return true; + + /* Try sequences of three instructions. */ + + if (expand_vec_perm_even_odd_pack (d)) + return true; + + if (expand_vec_perm_2vperm2f128_vshuf (d)) + return true; + + if (expand_vec_perm_pshufb2 (d)) + return true; + + if (expand_vec_perm_interleave3 (d)) + return true; + + if (expand_vec_perm_vperm2f128_vblend (d)) + return true; + + /* Try sequences of four instructions. */ + + if (expand_vec_perm_even_odd_trunc (d)) + return true; + if (expand_vec_perm_vpshufb2_vpermq (d)) + return true; + + if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) + return true; + + if (expand_vec_perm_vpermt2_vpshub2 (d)) + return true; + + /* ??? Look for narrow permutations whose element orderings would + allow the promotion to a wider mode. */ + + /* ??? Look for sequences of interleave or a wider permute that place + the data into the correct lanes for a half-vector shuffle like + pshuf[lh]w or vpermilps. */ + + /* ??? Look for sequences of interleave that produce the desired results. + The combinatorics of punpck[lh] get pretty ugly... */ + + if (expand_vec_perm_even_odd (d)) + return true; + + /* Even longer sequences. 
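+ Currently this means the vpshufb4_vpermq2 fallback below, which spends 4 vpshufb, 2 vpermq and 3 vpor insns.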
*/ + if (expand_vec_perm_vpshufb4_vpermq2 (d)) + return true; + + /* See if we can get the same permutation in different vector integer + mode. */ + struct expand_vec_perm_d nd; + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + { + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); + return true; + } + + return false; +} + +/* If a permutation only uses one operand, make it clear. Returns true + if the permutation references both operands. */ + +static bool +canonicalize_perm (struct expand_vec_perm_d *d) +{ + int i, which, nelt = d->nelt; + + for (i = which = 0; i < nelt; ++i) + which |= (d->perm[i] < nelt ? 1 : 2); + + d->one_operand_p = true; + switch (which) + { + default: + gcc_unreachable(); + + case 3: + if (!rtx_equal_p (d->op0, d->op1)) + { + d->one_operand_p = false; + break; + } + /* The elements of PERM do not suggest that only the first operand + is used, but both operands are identical. Allow easier matching + of the permutation by folding the permutation into the single + input vector. */ + /* FALLTHRU */ + + case 2: + for (i = 0; i < nelt; ++i) + d->perm[i] &= nelt - 1; + d->op0 = d->op1; + break; + + case 1: + d->op1 = d->op0; + break; + } + + return (which == 3); +} + +/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ + +bool +ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, + rtx op1, const vec_perm_indices &sel) +{ + struct expand_vec_perm_d d; + unsigned char perm[MAX_VECT_LEN]; + unsigned int i, nelt, which; + bool two_args; + + d.target = target; + d.op0 = op0; + d.op1 = op1; + + d.vmode = vmode; + gcc_assert (VECTOR_MODE_P (d.vmode)); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = !target; + + gcc_assert (sel.length () == nelt); + gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); + + /* Given sufficient ISA support we can just return true here + for selected vector modes. */ + switch (d.vmode) + { + case E_V16SFmode: + case E_V16SImode: + case E_V8DImode: + case E_V8DFmode: + if (!TARGET_AVX512F) + return false; + /* All implementable with a single vperm[it]2 insn. */ + if (d.testing_p) + return true; + break; + case E_V32HImode: + if (!TARGET_AVX512BW) + return false; + if (d.testing_p) + /* All implementable with a single vperm[it]2 insn. */ + return true; + break; + case E_V64QImode: + if (!TARGET_AVX512BW) + return false; + if (d.testing_p) + /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ + return true; + break; + case E_V8SImode: + case E_V8SFmode: + case E_V4DFmode: + case E_V4DImode: + if (!TARGET_AVX) + return false; + if (d.testing_p && TARGET_AVX512VL) + /* All implementable with a single vperm[it]2 insn. */ + return true; + break; + case E_V16HImode: + if (!TARGET_SSE2) + return false; + if (d.testing_p && TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case E_V32QImode: + if (!TARGET_SSE2) + return false; + if (d.testing_p && TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case E_V8HImode: + case E_V16QImode: + if (!TARGET_SSE2) + return false; + /* Fall through. */ + case E_V4SImode: + case E_V4SFmode: + if (!TARGET_SSE) + return false; + /* All implementable with a single vpperm insn. */ + if (d.testing_p && TARGET_XOP) + return true; + /* All implementable with 2 pshufb + 1 ior. 
*/ + if (d.testing_p && TARGET_SSSE3) + return true; + break; + case E_V2DImode: + case E_V2DFmode: + if (!TARGET_SSE) + return false; + /* All implementable with shufpd or unpck[lh]pd. */ + if (d.testing_p) + return true; + break; + default: + return false; + } + + for (i = which = 0; i < nelt; ++i) + { + unsigned char e = sel[i]; + gcc_assert (e < 2 * nelt); + d.perm[i] = e; + perm[i] = e; + which |= (e < nelt ? 1 : 2); + } + + if (d.testing_p) + { + /* For all elements from second vector, fold the elements to first. */ + if (which == 2) + for (i = 0; i < nelt; ++i) + d.perm[i] -= nelt; + + /* Check whether the mask can be applied to the vector type. */ + d.one_operand_p = (which != 3); + + /* Implementable with shufps or pshufd. */ + if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode)) + return true; + + /* Otherwise we have to go through the motions and see if we can + figure out how to generate the requested permutation. */ + d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); + d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); + if (!d.one_operand_p) + d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); + + start_sequence (); + bool ret = ix86_expand_vec_perm_const_1 (&d); + end_sequence (); + + return ret; + } + + two_args = canonicalize_perm (&d); + + if (ix86_expand_vec_perm_const_1 (&d)) + return true; + + /* If the selector says both arguments are needed, but the operands are the + same, the above tried to expand with one_operand_p and flattened selector. + If that didn't work, retry without one_operand_p; we succeeded with that + during testing. */ + if (two_args && d.one_operand_p) + { + d.one_operand_p = false; + memcpy (d.perm, perm, sizeof (perm)); + return ix86_expand_vec_perm_const_1 (&d); + } + + return false; +} + +void +ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) +{ + struct expand_vec_perm_d d; + unsigned i, nelt; + + d.target = targ; + d.op0 = op0; + d.op1 = op1; + d.vmode = GET_MODE (targ); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.one_operand_p = false; + d.testing_p = false; + + for (i = 0; i < nelt; ++i) + d.perm[i] = i * 2 + odd; + + /* We'll either be able to implement the permutation directly... */ + if (expand_vec_perm_1 (&d)) + return; + + /* ... or we use the special-case patterns. */ + expand_vec_perm_even_odd_1 (&d, odd); +} + +static void +ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) +{ + struct expand_vec_perm_d d; + unsigned i, nelt, base; + bool ok; + + d.target = targ; + d.op0 = op0; + d.op1 = op1; + d.vmode = GET_MODE (targ); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.one_operand_p = false; + d.testing_p = false; + + base = high_p ? nelt / 2 : 0; + for (i = 0; i < nelt / 2; ++i) + { + d.perm[i * 2] = i + base; + d.perm[i * 2 + 1] = i + base + nelt; + } + + /* Note that for AVX this isn't one instruction. */ + ok = ix86_expand_vec_perm_const_1 (&d); + gcc_assert (ok); +} + + +/* Expand a vector operation CODE for a V*QImode in terms of the + same operation on V*HImode. 
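+ The byte operands are first widened to two HImode vectors (by self-interleaving for MULT, by sign or zero extension for the shifts), the operation is performed at word width, and a constant permutation finally gathers the low byte of every word result back into the QImode destination.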
*/ + +void +ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) +{ + machine_mode qimode = GET_MODE (dest); + machine_mode himode; + rtx (*gen_il) (rtx, rtx, rtx); + rtx (*gen_ih) (rtx, rtx, rtx); + rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h; + struct expand_vec_perm_d d; + bool ok, full_interleave; + bool uns_p = false; + int i; + + switch (qimode) + { + case E_V16QImode: + himode = V8HImode; + gen_il = gen_vec_interleave_lowv16qi; + gen_ih = gen_vec_interleave_highv16qi; + break; + case E_V32QImode: + himode = V16HImode; + gen_il = gen_avx2_interleave_lowv32qi; + gen_ih = gen_avx2_interleave_highv32qi; + break; + case E_V64QImode: + himode = V32HImode; + gen_il = gen_avx512bw_interleave_lowv64qi; + gen_ih = gen_avx512bw_interleave_highv64qi; + break; + default: + gcc_unreachable (); + } + + op2_l = op2_h = op2; + switch (code) + { + case MULT: + /* Unpack data such that we've got a source byte in each low byte of + each word. We don't care what goes into the high byte of each word. + Rather than trying to get zero in there, most convenient is to let + it be a copy of the low byte. */ + op2_l = gen_reg_rtx (qimode); + op2_h = gen_reg_rtx (qimode); + emit_insn (gen_il (op2_l, op2, op2)); + emit_insn (gen_ih (op2_h, op2, op2)); + + op1_l = gen_reg_rtx (qimode); + op1_h = gen_reg_rtx (qimode); + emit_insn (gen_il (op1_l, op1, op1)); + emit_insn (gen_ih (op1_h, op1, op1)); + full_interleave = qimode == V16QImode; + break; + + case ASHIFT: + case LSHIFTRT: + uns_p = true; + /* FALLTHRU */ + case ASHIFTRT: + op1_l = gen_reg_rtx (himode); + op1_h = gen_reg_rtx (himode); + ix86_expand_sse_unpack (op1_l, op1, uns_p, false); + ix86_expand_sse_unpack (op1_h, op1, uns_p, true); + full_interleave = true; + break; + default: + gcc_unreachable (); + } + + /* Perform the operation. */ + res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, + 1, OPTAB_DIRECT); + res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, + 1, OPTAB_DIRECT); + gcc_assert (res_l && res_h); + + /* Merge the data back into the right place. */ + d.target = dest; + d.op0 = gen_lowpart (qimode, res_l); + d.op1 = gen_lowpart (qimode, res_h); + d.vmode = qimode; + d.nelt = GET_MODE_NUNITS (qimode); + d.one_operand_p = false; + d.testing_p = false; + + if (full_interleave) + { + /* For SSE2, we used an full interleave, so the desired + results are in the even elements. */ + for (i = 0; i < d.nelt; ++i) + d.perm[i] = i * 2; + } + else + { + /* For AVX, the interleave used above was not cross-lane. So the + extraction is evens but with the second and third quarter swapped. + Happily, that is even one insn shorter than even extraction. + For AVX512BW we have 4 lanes. We extract evens from within a lane, + always first from the first and then from the second source operand, + the index bits above the low 4 bits remains the same. + Thus, for d.nelt == 32 we want permutation + 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 + and for d.nelt == 64 we want permutation + 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, + 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ + for (i = 0; i < d.nelt; ++i) + d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); + } + + ok = ix86_expand_vec_perm_const_1 (&d); + gcc_assert (ok); + + set_unique_reg_note (get_last_insn (), REG_EQUAL, + gen_rtx_fmt_ee (code, qimode, op1, op2)); +} + +/* Helper function of ix86_expand_mul_widen_evenodd. 
Return true + if op is CONST_VECTOR with all odd elements equal to their + preceding element. */ + +static bool +const_vector_equal_evenodd_p (rtx op) +{ + machine_mode mode = GET_MODE (op); + int i, nunits = GET_MODE_NUNITS (mode); + if (GET_CODE (op) != CONST_VECTOR + || nunits != CONST_VECTOR_NUNITS (op)) + return false; + for (i = 0; i < nunits; i += 2) + if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) + return false; + return true; +} + +void +ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, + bool uns_p, bool odd_p) +{ + machine_mode mode = GET_MODE (op1); + machine_mode wmode = GET_MODE (dest); + rtx x; + rtx orig_op1 = op1, orig_op2 = op2; + + if (!nonimmediate_operand (op1, mode)) + op1 = force_reg (mode, op1); + if (!nonimmediate_operand (op2, mode)) + op2 = force_reg (mode, op2); + + /* We only play even/odd games with vectors of SImode. */ + gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode); + + /* If we're looking for the odd results, shift those members down to + the even slots. For some cpus this is faster than a PSHUFD. */ + if (odd_p) + { + /* For XOP use vpmacsdqh, but only for smult, as it is only + signed. */ + if (TARGET_XOP && mode == V4SImode && !uns_p) + { + x = force_reg (wmode, CONST0_RTX (wmode)); + emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); + return; + } + + x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); + if (!const_vector_equal_evenodd_p (orig_op1)) + op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), + x, NULL, 1, OPTAB_DIRECT); + if (!const_vector_equal_evenodd_p (orig_op2)) + op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), + x, NULL, 1, OPTAB_DIRECT); + op1 = gen_lowpart (mode, op1); + op2 = gen_lowpart (mode, op2); + } + + if (mode == V16SImode) + { + if (uns_p) + x = gen_vec_widen_umult_even_v16si (dest, op1, op2); + else + x = gen_vec_widen_smult_even_v16si (dest, op1, op2); + } + else if (mode == V8SImode) + { + if (uns_p) + x = gen_vec_widen_umult_even_v8si (dest, op1, op2); + else + x = gen_vec_widen_smult_even_v8si (dest, op1, op2); + } + else if (uns_p) + x = gen_vec_widen_umult_even_v4si (dest, op1, op2); + else if (TARGET_SSE4_1) + x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); + else + { + rtx s1, s2, t0, t1, t2; + + /* The easiest way to implement this without PMULDQ is to go through + the motions as if we are performing a full 64-bit multiply. With + the exception that we need to do less shuffling of the elements. */ + + /* Compute the sign-extension, aka highparts, of the two operands. */ + s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), + op1, pc_rtx, pc_rtx); + s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), + op2, pc_rtx, pc_rtx); + + /* Multiply LO(A) * HI(B), and vice-versa. */ + t1 = gen_reg_rtx (wmode); + t2 = gen_reg_rtx (wmode); + emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2)); + emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1)); + + /* Multiply LO(A) * LO(B). */ + t0 = gen_reg_rtx (wmode); + emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2)); + + /* Combine and shift the highparts into place. */ + t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT); + t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1, + 1, OPTAB_DIRECT); + + /* Combine high and low parts. 
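+ The result is signed a * b computed as unsigned a * b minus (b << 32 when a is negative) minus (a << 32 when b is negative), all modulo 2^64; the two cross products with the all-ones sign masks, shifted up by 32 bits, supply exactly those corrections.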
*/ + force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT); + return; + } + emit_insn (x); +} + +void +ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2, + bool uns_p, bool high_p) +{ + machine_mode wmode = GET_MODE (dest); + machine_mode mode = GET_MODE (op1); + rtx t1, t2, t3, t4, mask; + + switch (mode) + { + case E_V4SImode: + t1 = gen_reg_rtx (mode); + t2 = gen_reg_rtx (mode); + if (TARGET_XOP && !uns_p) + { + /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case, + shuffle the elements once so that all elements are in the right + place for immediate use: { A C B D }. */ + emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + } + else + { + /* Put the elements into place for the multiply. */ + ix86_expand_vec_interleave (t1, op1, op1, high_p); + ix86_expand_vec_interleave (t2, op2, op2, high_p); + high_p = false; + } + ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p); + break; + + case E_V8SImode: + /* Shuffle the elements between the lanes. After this we + have { A B E F | C D G H } for each operand. */ + t1 = gen_reg_rtx (V4DImode); + t2 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1), + const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2), + const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + + /* Shuffle the elements within the lanes. After this we + have { A A B B | C C D D } or { E E F F | G G H H }. */ + t3 = gen_reg_rtx (V8SImode); + t4 = gen_reg_rtx (V8SImode); + mask = GEN_INT (high_p + ? 2 + (2 << 2) + (3 << 4) + (3 << 6) + : 0 + (0 << 2) + (1 << 4) + (1 << 6)); + emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask)); + emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask)); + + ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false); + break; + + case E_V8HImode: + case E_V16HImode: + t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX, + uns_p, OPTAB_DIRECT); + t2 = expand_binop (mode, + uns_p ? umul_highpart_optab : smul_highpart_optab, + op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT); + gcc_assert (t1 && t2); + + t3 = gen_reg_rtx (mode); + ix86_expand_vec_interleave (t3, t1, t2, high_p); + emit_move_insn (dest, gen_lowpart (wmode, t3)); + break; + + case E_V16QImode: + case E_V32QImode: + case E_V32HImode: + case E_V16SImode: + case E_V64QImode: + t1 = gen_reg_rtx (wmode); + t2 = gen_reg_rtx (wmode); + ix86_expand_sse_unpack (t1, op1, uns_p, high_p); + ix86_expand_sse_unpack (t2, op2, uns_p, high_p); + + emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2))); + break; + + default: + gcc_unreachable (); + } +} + +void +ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) +{ + rtx res_1, res_2, res_3, res_4; + + res_1 = gen_reg_rtx (V4SImode); + res_2 = gen_reg_rtx (V4SImode); + res_3 = gen_reg_rtx (V2DImode); + res_4 = gen_reg_rtx (V2DImode); + ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false); + ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true); + + /* Move the results in element 2 down to element 1; we don't care + what goes in elements 2 and 3. Then we can merge the parts + back together with an interleave. + + Note that two other sequences were tried: + (1) Use interleaves at the start instead of psrldq, which allows + us to use a single shufps to merge things back at the end. 
+ (2) Use shufps here to combine the two vectors, then pshufd to + put the elements in the correct order. + In both cases the cost of the reformatting stall was too high + and the overall sequence slower. */ + + emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3), + const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); + emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4), + const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); + res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2)); + + set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2)); +} + +void +ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) +{ + machine_mode mode = GET_MODE (op0); + rtx t1, t2, t3, t4, t5, t6; + + if (TARGET_AVX512DQ && mode == V8DImode) + emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2)); + else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode) + emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2)); + else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode) + emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2)); + else if (TARGET_XOP && mode == V2DImode) + { + /* op1: A,B,C,D, op2: E,F,G,H */ + op1 = gen_lowpart (V4SImode, op1); + op2 = gen_lowpart (V4SImode, op2); + + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V2DImode); + t4 = gen_reg_rtx (V2DImode); + + /* t1: B,A,D,C */ + emit_insn (gen_sse2_pshufd_1 (t1, op1, + GEN_INT (1), + GEN_INT (0), + GEN_INT (3), + GEN_INT (2))); + + /* t2: (B*E),(A*F),(D*G),(C*H) */ + emit_insn (gen_mulv4si3 (t2, t1, op2)); + + /* t3: (B*E)+(A*F), (D*G)+(C*H) */ + emit_insn (gen_xop_phadddq (t3, t2)); + + /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ + emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32))); + + /* Multiply lower parts and add all */ + t5 = gen_reg_rtx (V2DImode); + emit_insn (gen_vec_widen_umult_even_v4si (t5, + gen_lowpart (V4SImode, op1), + gen_lowpart (V4SImode, op2))); + op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT); + + } + else + { + machine_mode nmode; + rtx (*umul) (rtx, rtx, rtx); + + if (mode == V2DImode) + { + umul = gen_vec_widen_umult_even_v4si; + nmode = V4SImode; + } + else if (mode == V4DImode) + { + umul = gen_vec_widen_umult_even_v8si; + nmode = V8SImode; + } + else if (mode == V8DImode) + { + umul = gen_vec_widen_umult_even_v16si; + nmode = V16SImode; + } + else + gcc_unreachable (); + + + /* Multiply low parts. */ + t1 = gen_reg_rtx (mode); + emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2))); + + /* Shift input vectors right 32 bits so we can multiply high parts. */ + t6 = GEN_INT (32); + t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT); + t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT); + + /* Multiply high parts by low parts. */ + t4 = gen_reg_rtx (mode); + t5 = gen_reg_rtx (mode); + emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2))); + emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1))); + + /* Combine and shift the highparts back. */ + t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT); + t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT); + + /* Combine high and low parts. */ + force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT); + } + + set_unique_reg_note (get_last_insn (), REG_EQUAL, + gen_rtx_MULT (mode, op1, op2)); +} + +/* Return 1 if control tansfer instruction INSN + should be encoded with notrack prefix. 
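+ That is, only when -fcf-protection includes branch tracking: for an indirect call the decision follows its REG_CALL_NOCF_CHECK note, and an indirect jump through a jump table gets the prefix unless flag_cet_switch is set.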
*/ + +bool +ix86_notrack_prefixed_insn_p (rtx insn) +{ + if (!insn || !((flag_cf_protection & CF_BRANCH))) + return false; + + if (CALL_P (insn)) + { + rtx call = get_call_rtx_from (insn); + gcc_assert (call != NULL_RTX); + rtx addr = XEXP (call, 0); + + /* Do not emit 'notrack' if it's not an indirect call. */ + if (MEM_P (addr) + && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF) + return false; + else + return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0); + } + + if (JUMP_P (insn) && !flag_cet_switch) + { + rtx target = JUMP_LABEL (insn); + if (target == NULL_RTX || ANY_RETURN_P (target)) + return false; + + /* Check the jump is a switch table. */ + rtx_insn *label = as_a (target); + rtx_insn *table = next_insn (label); + if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) + return false; + else + return true; + } + return false; +} + +/* Calculate integer abs() using only SSE2 instructions. */ + +void +ix86_expand_sse2_abs (rtx target, rtx input) +{ + machine_mode mode = GET_MODE (target); + rtx tmp0, tmp1, x; + + switch (mode) + { + case E_V2DImode: + case E_V4DImode: + /* For 64-bit signed integer X, with SSE4.2 use + pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. + Otherwise handle it similarly to V4SImode, except use 64 as W instead of + 32 and use logical instead of arithmetic right shift (which is + unimplemented) and subtract. */ + if (TARGET_SSE4_2) + { + tmp0 = gen_reg_rtx (mode); + tmp1 = gen_reg_rtx (mode); + emit_move_insn (tmp1, CONST0_RTX (mode)); + if (mode == E_V2DImode) + emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); + else + emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); + } + else + { + tmp0 = expand_simple_binop (mode, LSHIFTRT, input, + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) + - 1), NULL, 0, OPTAB_DIRECT); + tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); + } + + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, + NULL, 0, OPTAB_DIRECT); + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, + target, 0, OPTAB_DIRECT); + break; + + case E_V4SImode: + /* For 32-bit signed integer X, the best way to calculate the absolute + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ + tmp0 = expand_simple_binop (mode, ASHIFTRT, input, + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), + NULL, 0, OPTAB_DIRECT); + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, + NULL, 0, OPTAB_DIRECT); + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, + target, 0, OPTAB_DIRECT); + break; + + case E_V8HImode: + /* For 16-bit signed integer X, the best way to calculate the absolute + value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ + tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); + + x = expand_simple_binop (mode, SMAX, tmp0, input, + target, 0, OPTAB_DIRECT); + break; + + case E_V16QImode: + /* For 8-bit signed integer X, the best way to calculate the absolute + value of X is min ((unsigned char) X, (unsigned char) (-X)), + as SSE2 provides the PMINUB insn. */ + tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); + + x = expand_simple_binop (V16QImode, UMIN, tmp0, input, + target, 0, OPTAB_DIRECT); + break; + + default: + gcc_unreachable (); + } + + if (x != target) + emit_move_insn (target, x); +} + +/* Expand an extract from a vector register through pextr insn. + Return true if successful. 
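+ Only element-aligned extractions are handled, mapped to pextrb/pextrw/pextrd/pextrq as the size dictates; byte and word results are expressed as zero extensions to SImode so the rtl optimizers can exploit the cleared upper bits.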
*/ + +bool +ix86_expand_pextr (rtx *operands) +{ + rtx dst = operands[0]; + rtx src = operands[1]; + + unsigned int size = INTVAL (operands[2]); + unsigned int pos = INTVAL (operands[3]); + + if (SUBREG_P (dst)) + { + /* Reject non-lowpart subregs. */ + if (SUBREG_BYTE (dst) > 0) + return false; + dst = SUBREG_REG (dst); + } + + if (SUBREG_P (src)) + { + pos += SUBREG_BYTE (src) * BITS_PER_UNIT; + src = SUBREG_REG (src); + } + + switch (GET_MODE (src)) + { + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V1TImode: + case E_TImode: + { + machine_mode srcmode, dstmode; + rtx d, pat; + + if (!int_mode_for_size (size, 0).exists (&dstmode)) + return false; + + switch (dstmode) + { + case E_QImode: + if (!TARGET_SSE4_1) + return false; + srcmode = V16QImode; + break; + + case E_HImode: + if (!TARGET_SSE2) + return false; + srcmode = V8HImode; + break; + + case E_SImode: + if (!TARGET_SSE4_1) + return false; + srcmode = V4SImode; + break; + + case E_DImode: + gcc_assert (TARGET_64BIT); + if (!TARGET_SSE4_1) + return false; + srcmode = V2DImode; + break; + + default: + return false; + } + + /* Reject extractions from misaligned positions. */ + if (pos & (size-1)) + return false; + + if (GET_MODE (dst) == dstmode) + d = dst; + else + d = gen_reg_rtx (dstmode); + + /* Construct insn pattern. */ + pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size))); + pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat); + + /* Let the rtl optimizers know about the zero extension performed. */ + if (dstmode == QImode || dstmode == HImode) + { + pat = gen_rtx_ZERO_EXTEND (SImode, pat); + d = gen_lowpart (SImode, d); + } + + emit_insn (gen_rtx_SET (d, pat)); + + if (d != dst) + emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); + return true; + } + + default: + return false; + } +} + +/* Expand an insert into a vector register through pinsr insn. + Return true if successful. */ + +bool +ix86_expand_pinsr (rtx *operands) +{ + rtx dst = operands[0]; + rtx src = operands[3]; + + unsigned int size = INTVAL (operands[1]); + unsigned int pos = INTVAL (operands[2]); + + if (SUBREG_P (dst)) + { + pos += SUBREG_BYTE (dst) * BITS_PER_UNIT; + dst = SUBREG_REG (dst); + } + + switch (GET_MODE (dst)) + { + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V1TImode: + case E_TImode: + { + machine_mode srcmode, dstmode; + rtx (*pinsr)(rtx, rtx, rtx, rtx); + rtx d; + + if (!int_mode_for_size (size, 0).exists (&srcmode)) + return false; + + switch (srcmode) + { + case E_QImode: + if (!TARGET_SSE4_1) + return false; + dstmode = V16QImode; + pinsr = gen_sse4_1_pinsrb; + break; + + case E_HImode: + if (!TARGET_SSE2) + return false; + dstmode = V8HImode; + pinsr = gen_sse2_pinsrw; + break; + + case E_SImode: + if (!TARGET_SSE4_1) + return false; + dstmode = V4SImode; + pinsr = gen_sse4_1_pinsrd; + break; + + case E_DImode: + gcc_assert (TARGET_64BIT); + if (!TARGET_SSE4_1) + return false; + dstmode = V2DImode; + pinsr = gen_sse4_1_pinsrq; + break; + + default: + return false; + } + + /* Reject insertions to misaligned positions. 
*/ + if (pos & (size-1)) + return false; + + if (SUBREG_P (src)) + { + unsigned int srcpos = SUBREG_BYTE (src); + + if (srcpos > 0) + { + rtx extr_ops[4]; + + extr_ops[0] = gen_reg_rtx (srcmode); + extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src)); + extr_ops[2] = GEN_INT (size); + extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT); + + if (!ix86_expand_pextr (extr_ops)) + return false; + + src = extr_ops[0]; + } + else + src = gen_lowpart (srcmode, SUBREG_REG (src)); + } + + if (GET_MODE (dst) == dstmode) + d = dst; + else + d = gen_reg_rtx (dstmode); + + emit_insn (pinsr (d, gen_lowpart (dstmode, dst), + gen_lowpart (srcmode, src), + GEN_INT (1 << (pos / size)))); + if (d != dst) + emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); + return true; + } + + default: + return false; + } +} + +/* All CPUs prefer to avoid cross-lane operations so perform reductions + upper against lower halves up to SSE reg size. */ + +machine_mode +ix86_split_reduction (machine_mode mode) +{ + /* Reduce lowpart against highpart until we reach SSE reg width to + avoid cross-lane operations. */ + switch (mode) + { + case E_V8DImode: + case E_V4DImode: + return V2DImode; + case E_V16SImode: + case E_V8SImode: + return V4SImode; + case E_V32HImode: + case E_V16HImode: + return V8HImode; + case E_V64QImode: + case E_V32QImode: + return V16QImode; + case E_V16SFmode: + case E_V8SFmode: + return V4SFmode; + case E_V8DFmode: + case E_V4DFmode: + return V2DFmode; + default: + return mode; + } +} + +/* Generate call to __divmoddi4. */ + +void +ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, + rtx op0, rtx op1, + rtx *quot_p, rtx *rem_p) +{ + rtx rem = assign_386_stack_local (mode, SLOT_TEMP); + + rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL, + mode, op0, mode, op1, mode, + XEXP (rem, 0), Pmode); + *quot_p = quot; + *rem_p = rem; +} + +#include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h new file mode 100644 index 000000000..9271bb85a --- /dev/null +++ b/gcc/config/i386/i386-expand.h @@ -0,0 +1,58 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_I386_EXPAND_H +#define GCC_I386_EXPAND_H + +/* AVX512F does support 64-byte integer vector operations, + thus the longest vector we are faced with is V64QImode. 
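+ (Hence MAX_VECT_LEN below is 64, and the perm array of expand_vec_perm_d holds at most 64 one-byte element selectors.)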
*/ +#define MAX_VECT_LEN 64 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + machine_mode vmode; + unsigned char nelt; + bool one_operand_p; + bool testing_p; +}; + +rtx legitimize_tls_address (rtx x, enum tls_model model, bool for_mov); +alias_set_type ix86_GOT_alias_set (void); +rtx legitimize_pic_address (rtx orig, rtx reg); +rtx legitimize_pe_coff_symbol (rtx addr, bool inreg); + +bool insn_defines_reg (unsigned int regno1, unsigned int regno2, + rtx_insn *insn); +void ix86_emit_binop (enum rtx_code code, machine_mode mode, rtx dst, rtx src); +enum calling_abi ix86_function_abi (const_tree fndecl); +bool ix86_function_ms_hook_prologue (const_tree fn); +void warn_once_call_ms2sysv_xlogues (const char *feature); +rtx gen_push (rtx arg); +rtx gen_pop (rtx arg); +rtx ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + machine_mode mode, int ignore); +bool ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, + rtx op1, const vec_perm_indices &sel); +bool ix86_notrack_prefixed_insn_p (rtx insn); +machine_mode ix86_split_reduction (machine_mode mode); +void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0, + rtx op1, rtx *quot_p, rtx *rem_p); + +#endif /* GCC_I386_EXPAND_H */ diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c new file mode 100644 index 000000000..60a120f4d --- /dev/null +++ b/gcc/config/i386/i386-features.c @@ -0,0 +1,2742 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "cfgloop.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "cfgbuild.h" +#include "alias.h" +#include "fold-const.h" +#include "attribs.h" +#include "calls.h" +#include "stor-layout.h" +#include "varasm.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "explow.h" +#include "expr.h" +#include "cfgrtl.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "reload.h" +#include "gimplify.h" +#include "dwarf2.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "sched-int.h" +#include "opts.h" +#include "tree-pass.h" +#include "context.h" +#include "pass_manager.h" +#include "target-globals.h" +#include "gimple-iterator.h" +#include "tree-vectorizer.h" +#include "shrink-wrap.h" +#include "builtins.h" +#include "rtl-iter.h" +#include "tree-iterator.h" +#include "dbgcnt.h" +#include "case-cfn-macros.h" +#include "dojump.h" +#include "fold-const-call.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" +#include "selftest.h" +#include "selftest-rtl.h" +#include "print-rtl.h" +#include "intl.h" +#include "ifcvt.h" +#include "symbol-summary.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "wide-int-bitmask.h" +#include "tree-vector-builder.h" +#include "debug.h" +#include "dwarf2out.h" +#include "i386-builtins.h" +#include "i386-features.h" + +const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = { + "savms64", + "resms64", + "resms64x", + "savms64f", + "resms64f", + "resms64fx" +}; + +const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = { +/* The below offset values are where each register is stored for the layout + relative to incoming stack pointer. The value of each m_regs[].offset will + be relative to the incoming base pointer (rax or rsi) used by the stub. + + s_instances: 0 1 2 3 + Offset: realigned or aligned + 8 + Register aligned aligned + 8 aligned w/HFP w/HFP */ + XMM15_REG, /* 0x10 0x18 0x10 0x18 */ + XMM14_REG, /* 0x20 0x28 0x20 0x28 */ + XMM13_REG, /* 0x30 0x38 0x30 0x38 */ + XMM12_REG, /* 0x40 0x48 0x40 0x48 */ + XMM11_REG, /* 0x50 0x58 0x50 0x58 */ + XMM10_REG, /* 0x60 0x68 0x60 0x68 */ + XMM9_REG, /* 0x70 0x78 0x70 0x78 */ + XMM8_REG, /* 0x80 0x88 0x80 0x88 */ + XMM7_REG, /* 0x90 0x98 0x90 0x98 */ + XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */ + SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */ + DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */ + BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */ + BP_REG, /* 0xc0 0xc8 N/A N/A */ + R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */ + R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */ + R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */ + R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */ +}; + +/* Instantiate static const values. */ +const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET; +const unsigned xlogue_layout::MIN_REGS; +const unsigned xlogue_layout::MAX_REGS; +const unsigned xlogue_layout::MAX_EXTRA_REGS; +const unsigned xlogue_layout::VARIANT_COUNT; +const unsigned xlogue_layout::STUB_NAME_MAX_LEN; + +/* Initialize xlogue_layout::s_stub_names to zero. */ +char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] + [STUB_NAME_MAX_LEN]; + +/* Instantiates all xlogue_layout instances. 
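+ There is one instance per combination of an extra 8-byte incoming stack offset and use of a hard frame pointer, matching the four stub sets selected in get_instance.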
*/ +const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = { + xlogue_layout (0, false), + xlogue_layout (8, false), + xlogue_layout (0, true), + xlogue_layout (8, true) +}; + +/* Return an appropriate const instance of xlogue_layout based upon values + in cfun->machine and crtl. */ +const struct xlogue_layout & +xlogue_layout::get_instance () +{ + enum xlogue_stub_sets stub_set; + bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in; + + if (stack_realign_fp) + stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; + else if (frame_pointer_needed) + stub_set = aligned_plus_8 + ? XLOGUE_SET_HFP_ALIGNED_PLUS_8 + : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; + else + stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED; + + return s_instances[stub_set]; +} + +/* Determine how many clobbered registers can be saved by the stub. + Returns the count of registers the stub will save and restore. */ +unsigned +xlogue_layout::count_stub_managed_regs () +{ + bool hfp = frame_pointer_needed || stack_realign_fp; + unsigned i, count; + unsigned regno; + + for (count = i = MIN_REGS; i < MAX_REGS; ++i) + { + regno = REG_ORDER[i]; + if (regno == BP_REG && hfp) + continue; + if (!ix86_save_reg (regno, false, false)) + break; + ++count; + } + return count; +} + +/* Determine if register REGNO is a stub managed register given the + total COUNT of stub managed registers. */ +bool +xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count) +{ + bool hfp = frame_pointer_needed || stack_realign_fp; + unsigned i; + + for (i = 0; i < count; ++i) + { + gcc_assert (i < MAX_REGS); + if (REG_ORDER[i] == BP_REG && hfp) + ++count; + else if (REG_ORDER[i] == regno) + return true; + } + return false; +} + +/* Constructor for xlogue_layout. */ +xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp) + : m_hfp (hfp) , m_nregs (hfp ? 17 : 18), + m_stack_align_off_in (stack_align_off_in) +{ + HOST_WIDE_INT offset = stack_align_off_in; + unsigned i, j; + + for (i = j = 0; i < MAX_REGS; ++i) + { + unsigned regno = REG_ORDER[i]; + + if (regno == BP_REG && hfp) + continue; + if (SSE_REGNO_P (regno)) + { + offset += 16; + /* Verify that SSE regs are always aligned. */ + gcc_assert (!((stack_align_off_in + offset) & 15)); + } + else + offset += 8; + + m_regs[j].regno = regno; + m_regs[j++].offset = offset - STUB_INDEX_OFFSET; + } + gcc_assert (j == m_nregs); +} + +const char * +xlogue_layout::get_stub_name (enum xlogue_stub stub, + unsigned n_extra_regs) +{ + const int have_avx = TARGET_AVX; + char *name = s_stub_names[!!have_avx][stub][n_extra_regs]; + + /* Lazy init */ + if (!*name) + { + int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u", + (have_avx ? "avx" : "sse"), + STUB_BASE_NAMES[stub], + MIN_REGS + n_extra_regs); + gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN); + } + + return name; +} + +/* Return rtx of a symbol ref for the entry point (based upon + cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */ +rtx +xlogue_layout::get_stub_rtx (enum xlogue_stub stub) +{ + const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs; + gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS); + gcc_assert (stub < XLOGUE_STUB_COUNT); + gcc_assert (crtl->stack_realign_finalized); + + return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs)); +} + +unsigned scalar_chain::max_id = 0; + +/* Initialize new chain. 
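+ A chain collects the connected scalar instructions, together with their defs and uses, that the scalar-to-vector (STV) conversion pass will convert to V2DImode or V1TImode as a single unit.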
*/ + +scalar_chain::scalar_chain () +{ + chain_id = ++max_id; + + if (dump_file) + fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); + + bitmap_obstack_initialize (NULL); + insns = BITMAP_ALLOC (NULL); + defs = BITMAP_ALLOC (NULL); + defs_conv = BITMAP_ALLOC (NULL); + queue = NULL; +} + +/* Free chain's data. */ + +scalar_chain::~scalar_chain () +{ + BITMAP_FREE (insns); + BITMAP_FREE (defs); + BITMAP_FREE (defs_conv); + bitmap_obstack_release (NULL); +} + +/* Add instruction into chains' queue. */ + +void +scalar_chain::add_to_queue (unsigned insn_uid) +{ + if (bitmap_bit_p (insns, insn_uid) + || bitmap_bit_p (queue, insn_uid)) + return; + + if (dump_file) + fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", + insn_uid, chain_id); + bitmap_set_bit (queue, insn_uid); +} + +/* For DImode conversion, mark register defined by DEF as requiring + conversion. */ + +void +dimode_scalar_chain::mark_dual_mode_def (df_ref def) +{ + gcc_assert (DF_REF_REG_DEF_P (def)); + + if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def))) + return; + + if (dump_file) + fprintf (dump_file, + " Mark r%d def in insn %d as requiring both modes in chain #%d\n", + DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); + + bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); +} + +/* For TImode conversion, it is unused. */ + +void +timode_scalar_chain::mark_dual_mode_def (df_ref) +{ + gcc_unreachable (); +} + +/* Check REF's chain to add new insns into a queue + and find registers requiring conversion. */ + +void +scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) +{ + df_link *chain; + + gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)) + || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))); + add_to_queue (DF_REF_INSN_UID (ref)); + + for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) + { + unsigned uid = DF_REF_INSN_UID (chain->ref); + + if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref))) + continue; + + if (!DF_REF_REG_MEM_P (chain->ref)) + { + if (bitmap_bit_p (insns, uid)) + continue; + + if (bitmap_bit_p (candidates, uid)) + { + add_to_queue (uid); + continue; + } + } + + if (DF_REF_REG_DEF_P (chain->ref)) + { + if (dump_file) + fprintf (dump_file, " r%d def in insn %d isn't convertible\n", + DF_REF_REGNO (chain->ref), uid); + mark_dual_mode_def (chain->ref); + } + else + { + if (dump_file) + fprintf (dump_file, " r%d use in insn %d isn't convertible\n", + DF_REF_REGNO (chain->ref), uid); + mark_dual_mode_def (ref); + } + } +} + +/* Add instruction into a chain. 
*/ + +void +scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) +{ + if (bitmap_bit_p (insns, insn_uid)) + return; + + if (dump_file) + fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id); + + bitmap_set_bit (insns, insn_uid); + + rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; + rtx def_set = single_set (insn); + if (def_set && REG_P (SET_DEST (def_set)) + && !HARD_REGISTER_P (SET_DEST (def_set))) + bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); + + df_ref ref; + df_ref def; + for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) + if (!HARD_REGISTER_P (DF_REF_REG (ref))) + for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); + def; + def = DF_REF_NEXT_REG (def)) + analyze_register_chain (candidates, def); + for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) + if (!DF_REF_REG_MEM_P (ref)) + analyze_register_chain (candidates, ref); +} + +/* Build new chain starting from insn INSN_UID recursively + adding all dependent uses and definitions. */ + +void +scalar_chain::build (bitmap candidates, unsigned insn_uid) +{ + queue = BITMAP_ALLOC (NULL); + bitmap_set_bit (queue, insn_uid); + + if (dump_file) + fprintf (dump_file, "Building chain #%d...\n", chain_id); + + while (!bitmap_empty_p (queue)) + { + insn_uid = bitmap_first_set_bit (queue); + bitmap_clear_bit (queue, insn_uid); + bitmap_clear_bit (candidates, insn_uid); + add_insn (candidates, insn_uid); + } + + if (dump_file) + { + fprintf (dump_file, "Collected chain #%d...\n", chain_id); + fprintf (dump_file, " insns: "); + dump_bitmap (dump_file, insns); + if (!bitmap_empty_p (defs_conv)) + { + bitmap_iterator bi; + unsigned id; + const char *comma = ""; + fprintf (dump_file, " defs to convert: "); + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) + { + fprintf (dump_file, "%sr%d", comma, id); + comma = ", "; + } + fprintf (dump_file, "\n"); + } + } + + BITMAP_FREE (queue); +} + +/* Return a cost of building a vector costant + instead of using a scalar one. */ + +int +dimode_scalar_chain::vector_const_cost (rtx exp) +{ + gcc_assert (CONST_INT_P (exp)); + + if (standard_sse_constant_p (exp, V2DImode)) + return COSTS_N_INSNS (1); + return ix86_cost->sse_load[1]; +} + +/* Compute a gain for chain conversion. */ + +int +dimode_scalar_chain::compute_convert_gain () +{ + bitmap_iterator bi; + unsigned insn_uid; + int gain = 0; + int cost = 0; + + if (dump_file) + fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); + + EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) + { + rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + if (REG_P (src) && REG_P (dst)) + gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move; + else if (REG_P (src) && MEM_P (dst)) + gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; + else if (MEM_P (src) && REG_P (dst)) + gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; + else if (GET_CODE (src) == ASHIFT + || GET_CODE (src) == ASHIFTRT + || GET_CODE (src) == LSHIFTRT) + { + if (CONST_INT_P (XEXP (src, 0))) + gain -= vector_const_cost (XEXP (src, 0)); + gain += ix86_cost->shift_const; + if (INTVAL (XEXP (src, 1)) >= 32) + gain -= COSTS_N_INSNS (1); + } + else if (GET_CODE (src) == PLUS + || GET_CODE (src) == MINUS + || GET_CODE (src) == IOR + || GET_CODE (src) == XOR + || GET_CODE (src) == AND) + { + gain += ix86_cost->add; + /* Additional gain for andnot for targets without BMI. 
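+ Without BMI andn the scalar sequence needs explicit not insns, while the vector side can use pandn directly.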
*/ + if (GET_CODE (XEXP (src, 0)) == NOT + && !TARGET_BMI) + gain += 2 * ix86_cost->add; + + if (CONST_INT_P (XEXP (src, 0))) + gain -= vector_const_cost (XEXP (src, 0)); + if (CONST_INT_P (XEXP (src, 1))) + gain -= vector_const_cost (XEXP (src, 1)); + } + else if (GET_CODE (src) == NEG + || GET_CODE (src) == NOT) + gain += ix86_cost->add - COSTS_N_INSNS (1); + else if (GET_CODE (src) == COMPARE) + { + /* Assume comparison cost is the same. */ + } + else if (CONST_INT_P (src)) + { + if (REG_P (dst)) + gain += COSTS_N_INSNS (2); + else if (MEM_P (dst)) + gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; + gain -= vector_const_cost (src); + } + else + gcc_unreachable (); + } + + if (dump_file) + fprintf (dump_file, " Instruction conversion gain: %d\n", gain); + + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi) + cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer; + + if (dump_file) + fprintf (dump_file, " Registers conversion cost: %d\n", cost); + + gain -= cost; + + if (dump_file) + fprintf (dump_file, " Total gain: %d\n", gain); + + return gain; +} + +/* Replace REG in X with a V2DI subreg of NEW_REG. */ + +rtx +dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg) +{ + if (x == reg) + return gen_rtx_SUBREG (V2DImode, new_reg, 0); + + const char *fmt = GET_RTX_FORMAT (GET_CODE (x)); + int i, j; + for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) + { + if (fmt[i] == 'e') + XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg); + else if (fmt[i] == 'E') + for (j = XVECLEN (x, i) - 1; j >= 0; j--) + XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j), + reg, new_reg); + } + + return x; +} + +/* Replace REG in INSN with a V2DI subreg of NEW_REG. */ + +void +dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, + rtx reg, rtx new_reg) +{ + replace_with_subreg (single_set (insn), reg, new_reg); +} + +/* Insert generated conversion instruction sequence INSNS + after instruction AFTER. New BB may be required in case + instruction has EH region attached. */ + +void +scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after) +{ + if (!control_flow_insn_p (after)) + { + emit_insn_after (insns, after); + return; + } + + basic_block bb = BLOCK_FOR_INSN (after); + edge e = find_fallthru_edge (bb->succs); + gcc_assert (e); + + basic_block new_bb = split_edge (e); + emit_insn_after (insns, BB_HEAD (new_bb)); +} + +/* Make vector copies for all register REGNO definitions + and replace its uses in a chain. 
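+ The DImode value is moved into the vector register either through a stack slot when direct GPR-to-XMM moves are not wanted, with movd plus pinsrd on SSE4.1, or with two movd loads combined by punpckldq otherwise.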
*/ + +void +dimode_scalar_chain::make_vector_copies (unsigned regno) +{ + rtx reg = regno_reg_rtx[regno]; + rtx vreg = gen_reg_rtx (DImode); + df_ref ref; + + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + start_sequence (); + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + { + rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); + emit_move_insn (adjust_address (tmp, SImode, 0), + gen_rtx_SUBREG (SImode, reg, 0)); + emit_move_insn (adjust_address (tmp, SImode, 4), + gen_rtx_SUBREG (SImode, reg, 4)); + emit_move_insn (vreg, tmp); + } + else if (TARGET_SSE4_1) + { + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 0))); + emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (SImode, reg, 4), + GEN_INT (2))); + } + else + { + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 0))); + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 4))); + emit_insn (gen_vec_interleave_lowv4si + (gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, tmp, 0))); + } + rtx_insn *seq = get_insns (); + end_sequence (); + rtx_insn *insn = DF_REF_INSN (ref); + emit_conversion_insns (seq, insn); + + if (dump_file) + fprintf (dump_file, + " Copied r%d to a vector register r%d for insn %d\n", + regno, REGNO (vreg), INSN_UID (insn)); + } + + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + rtx_insn *insn = DF_REF_INSN (ref); + replace_with_subreg_in_insn (insn, reg, vreg); + + if (dump_file) + fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", + regno, REGNO (vreg), INSN_UID (insn)); + } +} + +/* Convert all definitions of register REGNO + and fix its uses. Scalar copies may be created + in case register is used in not convertible insn. 
*/ + +void +dimode_scalar_chain::convert_reg (unsigned regno) +{ + bool scalar_copy = bitmap_bit_p (defs_conv, regno); + rtx reg = regno_reg_rtx[regno]; + rtx scopy = NULL_RTX; + df_ref ref; + bitmap conv; + + conv = BITMAP_ALLOC (NULL); + bitmap_copy (conv, insns); + + if (scalar_copy) + scopy = gen_reg_rtx (DImode); + + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + { + rtx_insn *insn = DF_REF_INSN (ref); + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx reg = DF_REF_REG (ref); + + if (!MEM_P (src)) + { + replace_with_subreg_in_insn (insn, reg, reg); + bitmap_clear_bit (conv, INSN_UID (insn)); + } + + if (scalar_copy) + { + start_sequence (); + if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) + { + rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); + emit_move_insn (tmp, reg); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), + adjust_address (tmp, SImode, 0)); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), + adjust_address (tmp, SImode, 4)); + } + else if (TARGET_SSE4_1) + { + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + emit_insn + (gen_rtx_SET + (gen_rtx_SUBREG (SImode, scopy, 0), + gen_rtx_VEC_SELECT (SImode, + gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); + + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); + emit_insn + (gen_rtx_SET + (gen_rtx_SUBREG (SImode, scopy, 4), + gen_rtx_VEC_SELECT (SImode, + gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); + } + else + { + rtx vcopy = gen_reg_rtx (V2DImode); + emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0)); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), + gen_rtx_SUBREG (SImode, vcopy, 0)); + emit_move_insn (vcopy, + gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32))); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), + gen_rtx_SUBREG (SImode, vcopy, 0)); + } + rtx_insn *seq = get_insns (); + end_sequence (); + emit_conversion_insns (seq, insn); + + if (dump_file) + fprintf (dump_file, + " Copied r%d to a scalar register r%d for insn %d\n", + regno, REGNO (scopy), INSN_UID (insn)); + } + } + + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) + { + rtx_insn *insn = DF_REF_INSN (ref); + + rtx def_set = single_set (insn); + gcc_assert (def_set); + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + if (!MEM_P (dst) || !REG_P (src)) + replace_with_subreg_in_insn (insn, reg, reg); + + bitmap_clear_bit (conv, INSN_UID (insn)); + } + } + /* Skip debug insns and uninitialized uses. */ + else if (DF_REF_CHAIN (ref) + && NONDEBUG_INSN_P (DF_REF_INSN (ref))) + { + gcc_assert (scopy); + replace_rtx (DF_REF_INSN (ref), reg, scopy); + df_insn_rescan (DF_REF_INSN (ref)); + } + + BITMAP_FREE (conv); +} + +/* Convert operand OP in INSN. We should handle + memory operands and uninitialized registers. + All other register uses are converted during + registers conversion. 
*/ + +void +dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn) +{ + *op = copy_rtx_if_shared (*op); + + if (GET_CODE (*op) == NOT) + { + convert_op (&XEXP (*op, 0), insn); + PUT_MODE (*op, V2DImode); + } + else if (MEM_P (*op)) + { + rtx tmp = gen_reg_rtx (DImode); + + emit_insn_before (gen_move_insn (tmp, *op), insn); + *op = gen_rtx_SUBREG (V2DImode, tmp, 0); + + if (dump_file) + fprintf (dump_file, " Preloading operand for insn %d into r%d\n", + INSN_UID (insn), REGNO (tmp)); + } + else if (REG_P (*op)) + { + /* We may have not converted register usage in case + this register has no definition. Otherwise it + should be converted in convert_reg. */ + df_ref ref; + FOR_EACH_INSN_USE (ref, insn) + if (DF_REF_REGNO (ref) == REGNO (*op)) + { + gcc_assert (!DF_REF_CHAIN (ref)); + break; + } + *op = gen_rtx_SUBREG (V2DImode, *op, 0); + } + else if (CONST_INT_P (*op)) + { + rtx vec_cst; + rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0); + + /* Prefer all ones vector in case of -1. */ + if (constm1_operand (*op, GET_MODE (*op))) + vec_cst = CONSTM1_RTX (V2DImode); + else + vec_cst = gen_rtx_CONST_VECTOR (V2DImode, + gen_rtvec (2, *op, const0_rtx)); + + if (!standard_sse_constant_p (vec_cst, V2DImode)) + { + start_sequence (); + vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst)); + rtx_insn *seq = get_insns (); + end_sequence (); + emit_insn_before (seq, insn); + } + + emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn); + *op = tmp; + } + else + { + gcc_assert (SUBREG_P (*op)); + gcc_assert (GET_MODE (*op) == V2DImode); + } +} + +/* Convert INSN to vector mode. */ + +void +dimode_scalar_chain::convert_insn (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + rtx subreg; + + if (MEM_P (dst) && !REG_P (src)) + { + /* There are no scalar integer instructions and therefore + temporary register usage is required. 
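+   That is, for (set (mem:DI ...) (op ...)) the operation is carried out
+   into a fresh pseudo in vector mode and an ordinary DImode store of
+   that pseudo is emitted after the converted insn.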
*/ + rtx tmp = gen_reg_rtx (DImode); + emit_conversion_insns (gen_move_insn (dst, tmp), insn); + dst = gen_rtx_SUBREG (V2DImode, tmp, 0); + } + + switch (GET_CODE (src)) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + convert_op (&XEXP (src, 0), insn); + PUT_MODE (src, V2DImode); + break; + + case PLUS: + case MINUS: + case IOR: + case XOR: + case AND: + convert_op (&XEXP (src, 0), insn); + convert_op (&XEXP (src, 1), insn); + PUT_MODE (src, V2DImode); + break; + + case NEG: + src = XEXP (src, 0); + convert_op (&src, insn); + subreg = gen_reg_rtx (V2DImode); + emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn); + src = gen_rtx_MINUS (V2DImode, subreg, src); + break; + + case NOT: + src = XEXP (src, 0); + convert_op (&src, insn); + subreg = gen_reg_rtx (V2DImode); + emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn); + src = gen_rtx_XOR (V2DImode, src, subreg); + break; + + case MEM: + if (!REG_P (dst)) + convert_op (&src, insn); + break; + + case REG: + if (!MEM_P (dst)) + convert_op (&src, insn); + break; + + case SUBREG: + gcc_assert (GET_MODE (src) == V2DImode); + break; + + case COMPARE: + src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); + + gcc_assert ((REG_P (src) && GET_MODE (src) == DImode) + || (SUBREG_P (src) && GET_MODE (src) == V2DImode)); + + if (REG_P (src)) + subreg = gen_rtx_SUBREG (V2DImode, src, 0); + else + subreg = copy_rtx_if_shared (src); + emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), + copy_rtx_if_shared (subreg), + copy_rtx_if_shared (subreg)), + insn); + dst = gen_rtx_REG (CCmode, FLAGS_REG); + src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src), + copy_rtx_if_shared (src)), + UNSPEC_PTEST); + break; + + case CONST_INT: + convert_op (&src, insn); + break; + + default: + gcc_unreachable (); + } + + SET_SRC (def_set) = src; + SET_DEST (def_set) = dst; + + /* Drop possible dead definitions. */ + PATTERN (insn) = def_set; + + INSN_CODE (insn) = -1; + recog_memoized (insn); + df_insn_rescan (insn); +} + +/* Fix uses of converted REG in debug insns. */ + +void +timode_scalar_chain::fix_debug_reg_uses (rtx reg) +{ + if (!flag_var_tracking) + return; + + df_ref ref, next; + for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next) + { + rtx_insn *insn = DF_REF_INSN (ref); + /* Make sure the next ref is for a different instruction, + so that we're not affected by the rescan. */ + next = DF_REF_NEXT_REG (ref); + while (next && DF_REF_INSN (next) == insn) + next = DF_REF_NEXT_REG (next); + + if (DEBUG_INSN_P (insn)) + { + /* It may be a debug insn with a TImode variable in + register. */ + bool changed = false; + for (; ref != next; ref = DF_REF_NEXT_REG (ref)) + { + rtx *loc = DF_REF_LOC (ref); + if (REG_P (*loc) && GET_MODE (*loc) == V1TImode) + { + *loc = gen_rtx_SUBREG (TImode, *loc, 0); + changed = true; + } + } + if (changed) + df_insn_rescan (insn); + } + } +} + +/* Convert INSN from TImode to V1T1mode. 
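+   For example (the pseudo number is only illustrative), a 128-bit copy
+   such as (set (mem:TI ...) (reg:TI 110)) keeps its shape but has both
+   operands switched to V1TImode so that it matches the SSE move
+   patterns; wide constants are forced into the constant pool and
+   loaded through a temporary register first.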
*/ + +void +timode_scalar_chain::convert_insn (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + switch (GET_CODE (dst)) + { + case REG: + { + rtx tmp = find_reg_equal_equiv_note (insn); + if (tmp) + PUT_MODE (XEXP (tmp, 0), V1TImode); + PUT_MODE (dst, V1TImode); + fix_debug_reg_uses (dst); + } + break; + case MEM: + PUT_MODE (dst, V1TImode); + break; + + default: + gcc_unreachable (); + } + + switch (GET_CODE (src)) + { + case REG: + PUT_MODE (src, V1TImode); + /* Call fix_debug_reg_uses only if SRC is never defined. */ + if (!DF_REG_DEF_CHAIN (REGNO (src))) + fix_debug_reg_uses (src); + break; + + case MEM: + PUT_MODE (src, V1TImode); + break; + + case CONST_WIDE_INT: + if (NONDEBUG_INSN_P (insn)) + { + /* Since there are no instructions to store 128-bit constant, + temporary register usage is required. */ + rtx tmp = gen_reg_rtx (V1TImode); + start_sequence (); + src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); + src = validize_mem (force_const_mem (V1TImode, src)); + rtx_insn *seq = get_insns (); + end_sequence (); + if (seq) + emit_insn_before (seq, insn); + emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); + dst = tmp; + } + break; + + case CONST_INT: + switch (standard_sse_constant_p (src, TImode)) + { + case 1: + src = CONST0_RTX (GET_MODE (dst)); + break; + case 2: + src = CONSTM1_RTX (GET_MODE (dst)); + break; + default: + gcc_unreachable (); + } + if (NONDEBUG_INSN_P (insn)) + { + rtx tmp = gen_reg_rtx (V1TImode); + /* Since there are no instructions to store standard SSE + constant, temporary register usage is required. */ + emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); + dst = tmp; + } + break; + + default: + gcc_unreachable (); + } + + SET_SRC (def_set) = src; + SET_DEST (def_set) = dst; + + /* Drop possible dead definitions. */ + PATTERN (insn) = def_set; + + INSN_CODE (insn) = -1; + recog_memoized (insn); + df_insn_rescan (insn); +} + +void +dimode_scalar_chain::convert_registers () +{ + bitmap_iterator bi; + unsigned id; + + EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) + convert_reg (id); + + EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) + make_vector_copies (id); +} + +/* Convert whole chain creating required register + conversions and copies. */ + +int +scalar_chain::convert () +{ + bitmap_iterator bi; + unsigned id; + int converted_insns = 0; + + if (!dbg_cnt (stv_conversion)) + return 0; + + if (dump_file) + fprintf (dump_file, "Converting chain #%d...\n", chain_id); + + convert_registers (); + + EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) + { + convert_insn (DF_INSN_UID_GET (id)->insn); + converted_insns++; + } + + return converted_insns; +} + +/* Return 1 if INSN uses or defines a hard register. + Hard register uses in a memory address are ignored. + Clobbers and flags definitions are ignored. */ + +static bool +has_non_address_hard_reg (rtx_insn *insn) +{ + df_ref ref; + FOR_EACH_INSN_DEF (ref, insn) + if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) + && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) + && DF_REF_REGNO (ref) != FLAGS_REG) + return true; + + FOR_EACH_INSN_USE (ref, insn) + if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) + return true; + + return false; +} + +/* Check if comparison INSN may be transformed + into vector comparison. 
Currently we transform + zero checks only which look like: + + (set (reg:CCZ 17 flags) + (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) + (subreg:SI (reg:DI x) 0)) + (const_int 0 [0]))) */ + +static bool +convertible_comparison_p (rtx_insn *insn) +{ + if (!TARGET_SSE4_1) + return false; + + rtx def_set = single_set (insn); + + gcc_assert (def_set); + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + gcc_assert (GET_CODE (src) == COMPARE); + + if (GET_CODE (dst) != REG + || REGNO (dst) != FLAGS_REG + || GET_MODE (dst) != CCZmode) + return false; + + rtx op1 = XEXP (src, 0); + rtx op2 = XEXP (src, 1); + + if (op2 != CONST0_RTX (GET_MODE (op2))) + return false; + + if (GET_CODE (op1) != IOR) + return false; + + op2 = XEXP (op1, 1); + op1 = XEXP (op1, 0); + + if (!SUBREG_P (op1) + || !SUBREG_P (op2) + || GET_MODE (op1) != SImode + || GET_MODE (op2) != SImode + || ((SUBREG_BYTE (op1) != 0 + || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) + && (SUBREG_BYTE (op2) != 0 + || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) + return false; + + op1 = SUBREG_REG (op1); + op2 = SUBREG_REG (op2); + + if (op1 != op2 + || !REG_P (op1) + || GET_MODE (op1) != DImode) + return false; + + return true; +} + +/* The DImode version of scalar_to_vector_candidate_p. */ + +static bool +dimode_scalar_to_vector_candidate_p (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + + if (!def_set) + return false; + + if (has_non_address_hard_reg (insn)) + return false; + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + if (GET_CODE (src) == COMPARE) + return convertible_comparison_p (insn); + + /* We are interested in DImode promotion only. */ + if ((GET_MODE (src) != DImode + && !CONST_INT_P (src)) + || GET_MODE (dst) != DImode) + return false; + + if (!REG_P (dst) && !MEM_P (dst)) + return false; + + switch (GET_CODE (src)) + { + case ASHIFTRT: + if (!TARGET_AVX512VL) + return false; + /* FALLTHRU */ + + case ASHIFT: + case LSHIFTRT: + if (!CONST_INT_P (XEXP (src, 1)) + || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)) + return false; + break; + + case PLUS: + case MINUS: + case IOR: + case XOR: + case AND: + if (!REG_P (XEXP (src, 1)) + && !MEM_P (XEXP (src, 1)) + && !CONST_INT_P (XEXP (src, 1))) + return false; + + if (GET_MODE (XEXP (src, 1)) != DImode + && !CONST_INT_P (XEXP (src, 1))) + return false; + break; + + case NEG: + case NOT: + break; + + case REG: + return true; + + case MEM: + case CONST_INT: + return REG_P (dst); + + default: + return false; + } + + if (!REG_P (XEXP (src, 0)) + && !MEM_P (XEXP (src, 0)) + && !CONST_INT_P (XEXP (src, 0)) + /* Check for andnot case. */ + && (GET_CODE (src) != AND + || GET_CODE (XEXP (src, 0)) != NOT + || !REG_P (XEXP (XEXP (src, 0), 0)))) + return false; + + if (GET_MODE (XEXP (src, 0)) != DImode + && !CONST_INT_P (XEXP (src, 0))) + return false; + + return true; +} + +/* The TImode version of scalar_to_vector_candidate_p. */ + +static bool +timode_scalar_to_vector_candidate_p (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + + if (!def_set) + return false; + + if (has_non_address_hard_reg (insn)) + return false; + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + /* Only TImode load and store are allowed. */ + if (GET_MODE (dst) != TImode) + return false; + + if (MEM_P (dst)) + { + /* Check for store. Memory must be aligned or unaligned store + is optimal. Only support store from register, standard SSE + constant or CONST_WIDE_INT generated from piecewise store. + + ??? 
Verify performance impact before enabling CONST_INT for + __int128 store. */ + if (misaligned_operand (dst, TImode) + && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + return false; + + switch (GET_CODE (src)) + { + default: + return false; + + case REG: + case CONST_WIDE_INT: + return true; + + case CONST_INT: + return standard_sse_constant_p (src, TImode); + } + } + else if (MEM_P (src)) + { + /* Check for load. Memory must be aligned or unaligned load is + optimal. */ + return (REG_P (dst) + && (!misaligned_operand (src, TImode) + || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)); + } + + return false; +} + +/* Return 1 if INSN may be converted into vector + instruction. */ + +static bool +scalar_to_vector_candidate_p (rtx_insn *insn) +{ + if (TARGET_64BIT) + return timode_scalar_to_vector_candidate_p (insn); + else + return dimode_scalar_to_vector_candidate_p (insn); +} + +/* The DImode version of remove_non_convertible_regs. */ + +static void +dimode_remove_non_convertible_regs (bitmap candidates) +{ + bitmap_iterator bi; + unsigned id; + bitmap regs = BITMAP_ALLOC (NULL); + + EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) + { + rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); + rtx reg = SET_DEST (def_set); + + if (!REG_P (reg) + || bitmap_bit_p (regs, REGNO (reg)) + || HARD_REGISTER_P (reg)) + continue; + + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); + def; + def = DF_REF_NEXT_REG (def)) + { + if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, + "r%d has non convertible definition in insn %d\n", + REGNO (reg), DF_REF_INSN_UID (def)); + + bitmap_set_bit (regs, REGNO (reg)); + break; + } + } + } + + EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) + { + for (df_ref def = DF_REG_DEF_CHAIN (id); + def; + def = DF_REF_NEXT_REG (def)) + if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, "Removing insn %d from candidates list\n", + DF_REF_INSN_UID (def)); + + bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); + } + } + + BITMAP_FREE (regs); +} + +/* For a register REGNO, scan instructions for its defs and uses. + Put REGNO in REGS if a def or use isn't in CANDIDATES. */ + +static void +timode_check_non_convertible_regs (bitmap candidates, bitmap regs, + unsigned int regno) +{ + for (df_ref def = DF_REG_DEF_CHAIN (regno); + def; + def = DF_REF_NEXT_REG (def)) + { + if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, + "r%d has non convertible def in insn %d\n", + regno, DF_REF_INSN_UID (def)); + + bitmap_set_bit (regs, regno); + break; + } + } + + for (df_ref ref = DF_REG_USE_CHAIN (regno); + ref; + ref = DF_REF_NEXT_REG (ref)) + { + /* Debug instructions are skipped. */ + if (NONDEBUG_INSN_P (DF_REF_INSN (ref)) + && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) + { + if (dump_file) + fprintf (dump_file, + "r%d has non convertible use in insn %d\n", + regno, DF_REF_INSN_UID (ref)); + + bitmap_set_bit (regs, regno); + break; + } + } +} + +/* The TImode version of remove_non_convertible_regs. 
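+   For example, if (reg:TI 120) is set by a candidate load but is also
+   read by a non-candidate insn, the register is recorded and every
+   candidate insn that defines or uses it is dropped from the set.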
*/ + +static void +timode_remove_non_convertible_regs (bitmap candidates) +{ + bitmap_iterator bi; + unsigned id; + bitmap regs = BITMAP_ALLOC (NULL); + + EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) + { + rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); + rtx dest = SET_DEST (def_set); + rtx src = SET_SRC (def_set); + + if ((!REG_P (dest) + || bitmap_bit_p (regs, REGNO (dest)) + || HARD_REGISTER_P (dest)) + && (!REG_P (src) + || bitmap_bit_p (regs, REGNO (src)) + || HARD_REGISTER_P (src))) + continue; + + if (REG_P (dest)) + timode_check_non_convertible_regs (candidates, regs, + REGNO (dest)); + + if (REG_P (src)) + timode_check_non_convertible_regs (candidates, regs, + REGNO (src)); + } + + EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) + { + for (df_ref def = DF_REG_DEF_CHAIN (id); + def; + def = DF_REF_NEXT_REG (def)) + if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, "Removing insn %d from candidates list\n", + DF_REF_INSN_UID (def)); + + bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); + } + + for (df_ref ref = DF_REG_USE_CHAIN (id); + ref; + ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) + { + if (dump_file) + fprintf (dump_file, "Removing insn %d from candidates list\n", + DF_REF_INSN_UID (ref)); + + bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref)); + } + } + + BITMAP_FREE (regs); +} + +/* For a given bitmap of insn UIDs scans all instruction and + remove insn from CANDIDATES in case it has both convertible + and not convertible definitions. + + All insns in a bitmap are conversion candidates according to + scalar_to_vector_candidate_p. Currently it implies all insns + are single_set. */ + +static void +remove_non_convertible_regs (bitmap candidates) +{ + if (TARGET_64BIT) + timode_remove_non_convertible_regs (candidates); + else + dimode_remove_non_convertible_regs (candidates); +} + +/* Main STV pass function. Find and convert scalar + instructions into vector mode when profitable. */ + +static unsigned int +convert_scalars_to_vector () +{ + basic_block bb; + bitmap candidates; + int converted_insns = 0; + + bitmap_obstack_initialize (NULL); + candidates = BITMAP_ALLOC (NULL); + + calculate_dominance_info (CDI_DOMINATORS); + df_set_flags (DF_DEFER_INSN_RESCAN); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_md_add_problem (); + df_analyze (); + + /* Find all instructions we want to convert into vector mode. */ + if (dump_file) + fprintf (dump_file, "Searching for mode conversion candidates...\n"); + + FOR_EACH_BB_FN (bb, cfun) + { + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (scalar_to_vector_candidate_p (insn)) + { + if (dump_file) + fprintf (dump_file, " insn %d is marked as a candidate\n", + INSN_UID (insn)); + + bitmap_set_bit (candidates, INSN_UID (insn)); + } + } + + remove_non_convertible_regs (candidates); + + if (bitmap_empty_p (candidates)) + if (dump_file) + fprintf (dump_file, "There are no candidates for optimization.\n"); + + while (!bitmap_empty_p (candidates)) + { + unsigned uid = bitmap_first_set_bit (candidates); + scalar_chain *chain; + + if (TARGET_64BIT) + chain = new timode_scalar_chain; + else + chain = new dimode_scalar_chain; + + /* Find instructions chain we want to convert to vector mode. + Check all uses and definitions to estimate all required + conversions. 
*/ + chain->build (candidates, uid); + + if (chain->compute_convert_gain () > 0) + converted_insns += chain->convert (); + else + if (dump_file) + fprintf (dump_file, "Chain #%d conversion is not profitable\n", + chain->chain_id); + + delete chain; + } + + if (dump_file) + fprintf (dump_file, "Total insns converted: %d\n", converted_insns); + + BITMAP_FREE (candidates); + bitmap_obstack_release (NULL); + df_process_deferred_rescans (); + + /* Conversion means we may have 128bit register spills/fills + which require aligned stack. */ + if (converted_insns) + { + if (crtl->stack_alignment_needed < 128) + crtl->stack_alignment_needed = 128; + if (crtl->stack_alignment_estimated < 128) + crtl->stack_alignment_estimated = 128; + /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */ + if (TARGET_64BIT) + for (tree parm = DECL_ARGUMENTS (current_function_decl); + parm; parm = DECL_CHAIN (parm)) + { + if (TYPE_MODE (TREE_TYPE (parm)) != TImode) + continue; + if (DECL_RTL_SET_P (parm) + && GET_MODE (DECL_RTL (parm)) == V1TImode) + { + rtx r = DECL_RTL (parm); + if (REG_P (r)) + SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0)); + } + if (DECL_INCOMING_RTL (parm) + && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode) + { + rtx r = DECL_INCOMING_RTL (parm); + if (REG_P (r)) + DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); + } + } + } + + return 0; +} + +static unsigned int +rest_of_handle_insert_vzeroupper (void) +{ + int i; + + /* vzeroupper instructions are inserted immediately after reload to + account for possible spills from 256bit or 512bit registers. The pass + reuses mode switching infrastructure by re-running mode insertion + pass, so disable entities that have already been processed. */ + for (i = 0; i < MAX_386_ENTITIES; i++) + ix86_optimize_mode_switching[i] = 0; + + ix86_optimize_mode_switching[AVX_U128] = 1; + + /* Call optimize_mode_switching. 
*/ + g->get_passes ()->execute_pass_mode_switching (); + return 0; +} + +namespace { + +const pass_data pass_data_insert_vzeroupper = +{ + RTL_PASS, /* type */ + "vzeroupper", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_insert_vzeroupper : public rtl_opt_pass +{ +public: + pass_insert_vzeroupper(gcc::context *ctxt) + : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return TARGET_AVX + && TARGET_VZEROUPPER && flag_expensive_optimizations + && !optimize_size; + } + + virtual unsigned int execute (function *) + { + return rest_of_handle_insert_vzeroupper (); + } + +}; // class pass_insert_vzeroupper + +const pass_data pass_data_stv = +{ + RTL_PASS, /* type */ + "stv", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_stv : public rtl_opt_pass +{ +public: + pass_stv (gcc::context *ctxt) + : rtl_opt_pass (pass_data_stv, ctxt), + timode_p (false) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return (timode_p == !!TARGET_64BIT + && TARGET_STV && TARGET_SSE2 && optimize > 1); + } + + virtual unsigned int execute (function *) + { + return convert_scalars_to_vector (); + } + + opt_pass *clone () + { + return new pass_stv (m_ctxt); + } + + void set_pass_param (unsigned int n, bool param) + { + gcc_assert (n == 0); + timode_p = param; + } + +private: + bool timode_p; +}; // class pass_stv + +} // anon namespace + +rtl_opt_pass * +make_pass_insert_vzeroupper (gcc::context *ctxt) +{ + return new pass_insert_vzeroupper (ctxt); +} + +rtl_opt_pass * +make_pass_stv (gcc::context *ctxt) +{ + return new pass_stv (ctxt); +} + +/* Inserting ENDBRANCH instructions. */ + +static unsigned int +rest_of_insert_endbranch (void) +{ + timevar_push (TV_MACH_DEP); + + rtx cet_eb; + rtx_insn *insn; + basic_block bb; + + /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is + absent among function attributes. Later an optimization will be + introduced to make analysis if an address of a static function is + taken. A static function whose address is not taken will get a + nocf_check attribute. This will allow to reduce the number of EB. */ + + if (!lookup_attribute ("nocf_check", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + && (!flag_manual_endbr + || lookup_attribute ("cf_check", + DECL_ATTRIBUTES (cfun->decl))) + && !cgraph_node::get (cfun->decl)->only_called_directly_p ()) + { + /* Queue ENDBR insertion to x86_function_profiler. 
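+   (With -pg -mfentry the entry sequence is emitted as text by
+   x86_function_profiler, so the ENDBR is queued for that hook, which
+   outputs it ahead of the __fentry__ call, instead of being inserted
+   as a separate insn here.)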
*/ + if (crtl->profile && flag_fentry) + cfun->machine->endbr_queued_at_entrance = true; + else + { + cet_eb = gen_nop_endbr (); + + bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + insn = BB_HEAD (bb); + emit_insn_before (cet_eb, insn); + } + } + + bb = 0; + FOR_EACH_BB_FN (bb, cfun) + { + for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); + insn = NEXT_INSN (insn)) + { + if (CALL_P (insn)) + { + bool need_endbr; + need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL; + if (!need_endbr && !SIBLING_CALL_P (insn)) + { + rtx call = get_call_rtx_from (insn); + rtx fnaddr = XEXP (call, 0); + tree fndecl = NULL_TREE; + + /* Also generate ENDBRANCH for non-tail call which + may return via indirect branch. */ + if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) + fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); + if (fndecl == NULL_TREE) + fndecl = MEM_EXPR (fnaddr); + if (fndecl + && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE + && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE) + fndecl = NULL_TREE; + if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl))) + { + tree fntype = TREE_TYPE (fndecl); + if (lookup_attribute ("indirect_return", + TYPE_ATTRIBUTES (fntype))) + need_endbr = true; + } + } + if (!need_endbr) + continue; + /* Generate ENDBRANCH after CALL, which can return more than + twice, setjmp-like functions. */ + + cet_eb = gen_nop_endbr (); + emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn)); + continue; + } + + if (JUMP_P (insn) && flag_cet_switch) + { + rtx target = JUMP_LABEL (insn); + if (target == NULL_RTX || ANY_RETURN_P (target)) + continue; + + /* Check the jump is a switch table. */ + rtx_insn *label = as_a (target); + rtx_insn *table = next_insn (label); + if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) + continue; + + /* For the indirect jump find out all places it jumps and insert + ENDBRANCH there. It should be done under a special flag to + control ENDBRANCH generation for switch stmts. */ + edge_iterator ei; + edge e; + basic_block dest_blk; + + FOR_EACH_EDGE (e, ei, bb->succs) + { + rtx_insn *insn; + + dest_blk = e->dest; + insn = BB_HEAD (dest_blk); + gcc_assert (LABEL_P (insn)); + cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, insn); + } + continue; + } + + if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn)) + || (NOTE_P (insn) + && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) + /* TODO. Check /s bit also. */ + { + cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, insn); + continue; + } + } + } + + timevar_pop (TV_MACH_DEP); + return 0; +} + +namespace { + +const pass_data pass_data_insert_endbranch = +{ + RTL_PASS, /* type. */ + "cet", /* name. */ + OPTGROUP_NONE, /* optinfo_flags. */ + TV_MACH_DEP, /* tv_id. */ + 0, /* properties_required. */ + 0, /* properties_provided. */ + 0, /* properties_destroyed. */ + 0, /* todo_flags_start. */ + 0, /* todo_flags_finish. 
*/ +}; + +class pass_insert_endbranch : public rtl_opt_pass +{ +public: + pass_insert_endbranch (gcc::context *ctxt) + : rtl_opt_pass (pass_data_insert_endbranch, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return ((flag_cf_protection & CF_BRANCH)); + } + + virtual unsigned int execute (function *) + { + return rest_of_insert_endbranch (); + } + +}; // class pass_insert_endbranch + +} // anon namespace + +rtl_opt_pass * +make_pass_insert_endbranch (gcc::context *ctxt) +{ + return new pass_insert_endbranch (ctxt); +} + +/* At entry of the nearest common dominator for basic blocks with + conversions, generate a single + vxorps %xmmN, %xmmN, %xmmN + for all + vcvtss2sd op, %xmmN, %xmmX + vcvtsd2ss op, %xmmN, %xmmX + vcvtsi2ss op, %xmmN, %xmmX + vcvtsi2sd op, %xmmN, %xmmX + + NB: We want to generate only a single vxorps to cover the whole + function. The LCM algorithm isn't appropriate here since it may + place a vxorps inside the loop. */ + +static unsigned int +remove_partial_avx_dependency (void) +{ + timevar_push (TV_MACH_DEP); + + bitmap_obstack_initialize (NULL); + bitmap convert_bbs = BITMAP_ALLOC (NULL); + + basic_block bb; + rtx_insn *insn, *set_insn; + rtx set; + rtx v4sf_const0 = NULL_RTX; + + auto_vec control_flow_insns; + + FOR_EACH_BB_FN (bb, cfun) + { + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + set = single_set (insn); + if (!set) + continue; + + if (get_attr_avx_partial_xmm_update (insn) + != AVX_PARTIAL_XMM_UPDATE_TRUE) + continue; + + if (!v4sf_const0) + { + calculate_dominance_info (CDI_DOMINATORS); + df_set_flags (DF_DEFER_INSN_RESCAN); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_md_add_problem (); + df_analyze (); + v4sf_const0 = gen_reg_rtx (V4SFmode); + } + + /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, + SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and + vec_merge with subreg. */ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + machine_mode dest_mode = GET_MODE (dest); + + rtx zero; + machine_mode dest_vecmode; + if (dest_mode == E_SFmode) + { + dest_vecmode = V4SFmode; + zero = v4sf_const0; + } + else + { + dest_vecmode = V2DFmode; + zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); + } + + /* Change source to vector mode. */ + src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src); + src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero, + GEN_INT (HOST_WIDE_INT_1U)); + /* Change destination to vector mode. */ + rtx vec = gen_reg_rtx (dest_vecmode); + /* Generate an XMM vector SET. */ + set = gen_rtx_SET (vec, src); + set_insn = emit_insn_before (set, insn); + df_insn_rescan (set_insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note. */ + rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); + if (note) + { + control_flow_insns.safe_push (set_insn); + add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0)); + } + } + + src = gen_rtx_SUBREG (dest_mode, vec, 0); + set = gen_rtx_SET (dest, src); + + /* Drop possible dead definitions. */ + PATTERN (insn) = set; + + INSN_CODE (insn) = -1; + recog_memoized (insn); + df_insn_rescan (insn); + bitmap_set_bit (convert_bbs, bb->index); + } + } + + if (v4sf_const0) + { + /* (Re-)discover loops so that bb->loop_father can be used in the + analysis below. 
*/ + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + + /* Generate a vxorps at entry of the nearest dominator for basic + blocks with conversions, which is in the the fake loop that + contains the whole function, so that there is only a single + vxorps in the whole function. */ + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, + convert_bbs); + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + + set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode)); + + insn = BB_HEAD (bb); + while (insn && !NONDEBUG_INSN_P (insn)) + { + if (insn == BB_END (bb)) + { + insn = NULL; + break; + } + insn = NEXT_INSN (insn); + } + if (insn == BB_HEAD (bb)) + set_insn = emit_insn_before (set, insn); + else + set_insn = emit_insn_after (set, + insn ? PREV_INSN (insn) : BB_END (bb)); + df_insn_rescan (set_insn); + df_process_deferred_rescans (); + loop_optimizer_finalize (); + + if (!control_flow_insns.is_empty ()) + { + free_dominance_info (CDI_DOMINATORS); + + unsigned int i; + FOR_EACH_VEC_ELT (control_flow_insns, i, insn) + if (control_flow_insn_p (insn)) + { + /* Split the block after insn. There will be a fallthru + edge, which is OK so we keep it. We have to create + the exception edges ourselves. */ + bb = BLOCK_FOR_INSN (insn); + split_block (bb, insn); + rtl_make_eh_edge (NULL, bb, BB_END (bb)); + } + } + } + + bitmap_obstack_release (NULL); + BITMAP_FREE (convert_bbs); + + timevar_pop (TV_MACH_DEP); + return 0; +} + +namespace { + +const pass_data pass_data_remove_partial_avx_dependency = +{ + RTL_PASS, /* type */ + "rpad", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_remove_partial_avx_dependency : public rtl_opt_pass +{ +public: + pass_remove_partial_avx_dependency (gcc::context *ctxt) + : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); + } + + virtual unsigned int execute (function *) + { + return remove_partial_avx_dependency (); + } +}; // class pass_rpad + +} // anon namespace + +rtl_opt_pass * +make_pass_remove_partial_avx_dependency (gcc::context *ctxt) +{ + return new pass_remove_partial_avx_dependency (ctxt); +} + +/* This compares the priority of target features in function DECL1 + and DECL2. It returns positive value if DECL1 is higher priority, + negative value if DECL2 is higher priority and 0 if they are the + same. */ + +int +ix86_compare_version_priority (tree decl1, tree decl2) +{ + unsigned int priority1 = get_builtin_code_for_version (decl1, NULL); + unsigned int priority2 = get_builtin_code_for_version (decl2, NULL); + + return (int)priority1 - (int)priority2; +} + +/* V1 and V2 point to function versions with different priorities + based on the target ISA. This function compares their priorities. 
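+   The result feeds qsort, so the versions end up ordered from highest
+   to lowest dispatch priority (e.g. an avx2 version is tried before a
+   plain sse4.2 one).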
*/ + +static int +feature_compare (const void *v1, const void *v2) +{ + typedef struct _function_version_info + { + tree version_decl; + tree predicate_chain; + unsigned int dispatch_priority; + } function_version_info; + + const function_version_info c1 = *(const function_version_info *)v1; + const function_version_info c2 = *(const function_version_info *)v2; + return (c2.dispatch_priority - c1.dispatch_priority); +} + +/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL + to return a pointer to VERSION_DECL if the outcome of the expression + formed by PREDICATE_CHAIN is true. This function will be called during + version dispatch to decide which function version to execute. It returns + the basic block at the end, to which more conditions can be added. */ + +static basic_block +add_condition_to_bb (tree function_decl, tree version_decl, + tree predicate_chain, basic_block new_bb) +{ + gimple *return_stmt; + tree convert_expr, result_var; + gimple *convert_stmt; + gimple *call_cond_stmt; + gimple *if_else_stmt; + + basic_block bb1, bb2, bb3; + edge e12, e23; + + tree cond_var, and_expr_var = NULL_TREE; + gimple_seq gseq; + + tree predicate_decl, predicate_arg; + + push_cfun (DECL_STRUCT_FUNCTION (function_decl)); + + gcc_assert (new_bb != NULL); + gseq = bb_seq (new_bb); + + + convert_expr = build1 (CONVERT_EXPR, ptr_type_node, + build_fold_addr_expr (version_decl)); + result_var = create_tmp_var (ptr_type_node); + convert_stmt = gimple_build_assign (result_var, convert_expr); + return_stmt = gimple_build_return (result_var); + + if (predicate_chain == NULL_TREE) + { + gimple_seq_add_stmt (&gseq, convert_stmt); + gimple_seq_add_stmt (&gseq, return_stmt); + set_bb_seq (new_bb, gseq); + gimple_set_bb (convert_stmt, new_bb); + gimple_set_bb (return_stmt, new_bb); + pop_cfun (); + return new_bb; + } + + while (predicate_chain != NULL) + { + cond_var = create_tmp_var (integer_type_node); + predicate_decl = TREE_PURPOSE (predicate_chain); + predicate_arg = TREE_VALUE (predicate_chain); + call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg); + gimple_call_set_lhs (call_cond_stmt, cond_var); + + gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl)); + gimple_set_bb (call_cond_stmt, new_bb); + gimple_seq_add_stmt (&gseq, call_cond_stmt); + + predicate_chain = TREE_CHAIN (predicate_chain); + + if (and_expr_var == NULL) + and_expr_var = cond_var; + else + { + gimple *assign_stmt; + /* Use MIN_EXPR to check if any integer is zero?. 
+ and_expr_var = min_expr */ + assign_stmt = gimple_build_assign (and_expr_var, + build2 (MIN_EXPR, integer_type_node, + cond_var, and_expr_var)); + + gimple_set_block (assign_stmt, DECL_INITIAL (function_decl)); + gimple_set_bb (assign_stmt, new_bb); + gimple_seq_add_stmt (&gseq, assign_stmt); + } + } + + if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, + integer_zero_node, + NULL_TREE, NULL_TREE); + gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); + gimple_set_bb (if_else_stmt, new_bb); + gimple_seq_add_stmt (&gseq, if_else_stmt); + + gimple_seq_add_stmt (&gseq, convert_stmt); + gimple_seq_add_stmt (&gseq, return_stmt); + set_bb_seq (new_bb, gseq); + + bb1 = new_bb; + e12 = split_block (bb1, if_else_stmt); + bb2 = e12->dest; + e12->flags &= ~EDGE_FALLTHRU; + e12->flags |= EDGE_TRUE_VALUE; + + e23 = split_block (bb2, return_stmt); + + gimple_set_bb (convert_stmt, bb2); + gimple_set_bb (return_stmt, bb2); + + bb3 = e23->dest; + make_edge (bb1, bb3, EDGE_FALSE_VALUE); + + remove_edge (e23); + make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); + + pop_cfun (); + + return bb3; +} + +/* This function generates the dispatch function for + multi-versioned functions. DISPATCH_DECL is the function which will + contain the dispatch logic. FNDECLS are the function choices for + dispatch, and is a tree chain. EMPTY_BB is the basic block pointer + in DISPATCH_DECL in which the dispatch code is generated. */ + +static int +dispatch_function_versions (tree dispatch_decl, + void *fndecls_p, + basic_block *empty_bb) +{ + tree default_decl; + gimple *ifunc_cpu_init_stmt; + gimple_seq gseq; + int ix; + tree ele; + vec *fndecls; + unsigned int num_versions = 0; + unsigned int actual_versions = 0; + unsigned int i; + + struct _function_version_info + { + tree version_decl; + tree predicate_chain; + unsigned int dispatch_priority; + }*function_version_info; + + gcc_assert (dispatch_decl != NULL + && fndecls_p != NULL + && empty_bb != NULL); + + /*fndecls_p is actually a vector. */ + fndecls = static_cast *> (fndecls_p); + + /* At least one more version other than the default. */ + num_versions = fndecls->length (); + gcc_assert (num_versions >= 2); + + function_version_info = (struct _function_version_info *) + XNEWVEC (struct _function_version_info, (num_versions - 1)); + + /* The first version in the vector is the default decl. */ + default_decl = (*fndecls)[0]; + + push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl)); + + gseq = bb_seq (*empty_bb); + /* Function version dispatch is via IFUNC. IFUNC resolvers fire before + constructors, so explicity call __builtin_cpu_init here. */ + ifunc_cpu_init_stmt + = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL); + gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt); + gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb); + set_bb_seq (*empty_bb, gseq); + + pop_cfun (); + + + for (ix = 1; fndecls->iterate (ix, &ele); ++ix) + { + tree version_decl = ele; + tree predicate_chain = NULL_TREE; + unsigned int priority; + /* Get attribute string, parse it and find the right predicate decl. + The predicate function could be a lengthy combination of many + features, like arch-type and various isa-variants. 
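+   For instance, a version tagged __attribute__((target ("avx2")))
+   would typically get a predicate chain built around
+   __builtin_cpu_supports ("avx2"), while an "arch=" version is checked
+   with __builtin_cpu_is; the exact chain is whatever
+   get_builtin_code_for_version hands back.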
*/ + priority = get_builtin_code_for_version (version_decl, + &predicate_chain); + + if (predicate_chain == NULL_TREE) + continue; + + function_version_info [actual_versions].version_decl = version_decl; + function_version_info [actual_versions].predicate_chain + = predicate_chain; + function_version_info [actual_versions].dispatch_priority = priority; + actual_versions++; + } + + /* Sort the versions according to descending order of dispatch priority. The + priority is based on the ISA. This is not a perfect solution. There + could still be ambiguity. If more than one function version is suitable + to execute, which one should be dispatched? In future, allow the user + to specify a dispatch priority next to the version. */ + qsort (function_version_info, actual_versions, + sizeof (struct _function_version_info), feature_compare); + + for (i = 0; i < actual_versions; ++i) + *empty_bb = add_condition_to_bb (dispatch_decl, + function_version_info[i].version_decl, + function_version_info[i].predicate_chain, + *empty_bb); + + /* dispatch default version at the end. */ + *empty_bb = add_condition_to_bb (dispatch_decl, default_decl, + NULL, *empty_bb); + + free (function_version_info); + return 0; +} + +/* This function changes the assembler name for functions that are + versions. If DECL is a function version and has a "target" + attribute, it appends the attribute string to its assembler name. */ + +static tree +ix86_mangle_function_version_assembler_name (tree decl, tree id) +{ + tree version_attr; + const char *orig_name, *version_string; + char *attr_str, *assembler_name; + + if (DECL_DECLARED_INLINE_P (decl) + && lookup_attribute ("gnu_inline", + DECL_ATTRIBUTES (decl))) + error_at (DECL_SOURCE_LOCATION (decl), + "function versions cannot be marked as %," + " bodies have to be generated"); + + if (DECL_VIRTUAL_P (decl) + || DECL_VINDEX (decl)) + sorry ("virtual function multiversioning not supported"); + + version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); + + /* target attribute string cannot be NULL. */ + gcc_assert (version_attr != NULL_TREE); + + orig_name = IDENTIFIER_POINTER (id); + version_string + = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr))); + + if (strcmp (version_string, "default") == 0) + return id; + + attr_str = sorted_attr_string (TREE_VALUE (version_attr)); + assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2); + + sprintf (assembler_name, "%s.%s", orig_name, attr_str); + + /* Allow assembler name to be modified if already set. */ + if (DECL_ASSEMBLER_NAME_SET_P (decl)) + SET_DECL_RTL (decl, NULL); + + tree ret = get_identifier (assembler_name); + XDELETEVEC (attr_str); + XDELETEVEC (assembler_name); + return ret; +} + +tree +ix86_mangle_decl_assembler_name (tree decl, tree id) +{ + /* For function version, add the target suffix to the assembler name. */ + if (TREE_CODE (decl) == FUNCTION_DECL + && DECL_FUNCTION_VERSIONED (decl)) + id = ix86_mangle_function_version_assembler_name (decl, id); +#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME + id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id); +#endif + + return id; +} + +/* Make a dispatcher declaration for the multi-versioned function DECL. + Calls to DECL function will be replaced with calls to the dispatcher + by the front-end. Returns the decl of the dispatcher function. 
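+   As a source-level sketch (hypothetical user code):
+
+     __attribute__((target ("default"))) int foo (void) { return 0; }
+     __attribute__((target ("avx2")))    int foo (void) { return 1; }
+
+   every call to foo is redirected by the front end to the dispatcher
+   created here, which is resolved at load time through an ifunc.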
*/ + +tree +ix86_get_function_versions_dispatcher (void *decl) +{ + tree fn = (tree) decl; + struct cgraph_node *node = NULL; + struct cgraph_node *default_node = NULL; + struct cgraph_function_version_info *node_v = NULL; + struct cgraph_function_version_info *first_v = NULL; + + tree dispatch_decl = NULL; + + struct cgraph_function_version_info *default_version_info = NULL; + + gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn)); + + node = cgraph_node::get (fn); + gcc_assert (node != NULL); + + node_v = node->function_version (); + gcc_assert (node_v != NULL); + + if (node_v->dispatcher_resolver != NULL) + return node_v->dispatcher_resolver; + + /* Find the default version and make it the first node. */ + first_v = node_v; + /* Go to the beginning of the chain. */ + while (first_v->prev != NULL) + first_v = first_v->prev; + default_version_info = first_v; + while (default_version_info != NULL) + { + if (is_function_default_version + (default_version_info->this_node->decl)) + break; + default_version_info = default_version_info->next; + } + + /* If there is no default node, just return NULL. */ + if (default_version_info == NULL) + return NULL; + + /* Make default info the first node. */ + if (first_v != default_version_info) + { + default_version_info->prev->next = default_version_info->next; + if (default_version_info->next) + default_version_info->next->prev = default_version_info->prev; + first_v->prev = default_version_info; + default_version_info->next = first_v; + default_version_info->prev = NULL; + } + + default_node = default_version_info->this_node; + +#if defined (ASM_OUTPUT_TYPE_DIRECTIVE) + if (targetm.has_ifunc_p ()) + { + struct cgraph_function_version_info *it_v = NULL; + struct cgraph_node *dispatcher_node = NULL; + struct cgraph_function_version_info *dispatcher_version_info = NULL; + + /* Right now, the dispatching is done via ifunc. */ + dispatch_decl = make_dispatcher_decl (default_node->decl); + + dispatcher_node = cgraph_node::get_create (dispatch_decl); + gcc_assert (dispatcher_node != NULL); + dispatcher_node->dispatcher_function = 1; + dispatcher_version_info + = dispatcher_node->insert_new_function_version (); + dispatcher_version_info->next = default_version_info; + dispatcher_node->definition = 1; + + /* Set the dispatcher for all the versions. */ + it_v = default_version_info; + while (it_v != NULL) + { + it_v->dispatcher_resolver = dispatch_decl; + it_v = it_v->next; + } + } + else +#endif + { + error_at (DECL_SOURCE_LOCATION (default_node->decl), + "multiversioning needs ifunc which is not supported " + "on this target"); + } + + return dispatch_decl; +} + +/* Make the resolver function decl to dispatch the versions of + a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is + ifunc alias that will point to the created resolver. Create an + empty basic block in the resolver and store the pointer in + EMPTY_BB. Return the decl of the resolver function. */ + +static tree +make_resolver_func (const tree default_decl, + const tree ifunc_alias_decl, + basic_block *empty_bb) +{ + char *resolver_name; + tree decl, type, decl_name, t; + + /* IFUNC's have to be globally visible. So, if the default_decl is + not, then the name of the IFUNC should be made unique. 
*/ + if (TREE_PUBLIC (default_decl) == 0) + { + char *ifunc_name = make_unique_name (default_decl, "ifunc", true); + symtab->change_decl_assembler_name (ifunc_alias_decl, + get_identifier (ifunc_name)); + XDELETEVEC (ifunc_name); + } + + resolver_name = make_unique_name (default_decl, "resolver", false); + + /* The resolver function should return a (void *). */ + type = build_function_type_list (ptr_type_node, NULL_TREE); + + decl = build_fn_decl (resolver_name, type); + decl_name = get_identifier (resolver_name); + SET_DECL_ASSEMBLER_NAME (decl, decl_name); + + DECL_NAME (decl) = decl_name; + TREE_USED (decl) = 1; + DECL_ARTIFICIAL (decl) = 1; + DECL_IGNORED_P (decl) = 1; + TREE_PUBLIC (decl) = 0; + DECL_UNINLINABLE (decl) = 1; + + /* Resolver is not external, body is generated. */ + DECL_EXTERNAL (decl) = 0; + DECL_EXTERNAL (ifunc_alias_decl) = 0; + + DECL_CONTEXT (decl) = NULL_TREE; + DECL_INITIAL (decl) = make_node (BLOCK); + DECL_STATIC_CONSTRUCTOR (decl) = 0; + + if (DECL_COMDAT_GROUP (default_decl) + || TREE_PUBLIC (default_decl)) + { + /* In this case, each translation unit with a call to this + versioned function will put out a resolver. Ensure it + is comdat to keep just one copy. */ + DECL_COMDAT (decl) = 1; + make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); + } + /* Build result decl and add to function_decl. */ + t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node); + DECL_CONTEXT (t) = decl; + DECL_ARTIFICIAL (t) = 1; + DECL_IGNORED_P (t) = 1; + DECL_RESULT (decl) = t; + + gimplify_function_tree (decl); + push_cfun (DECL_STRUCT_FUNCTION (decl)); + *empty_bb = init_lowered_empty_function (decl, false, + profile_count::uninitialized ()); + + cgraph_node::add_new_function (decl, true); + symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl)); + + pop_cfun (); + + gcc_assert (ifunc_alias_decl != NULL); + /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */ + DECL_ATTRIBUTES (ifunc_alias_decl) + = make_attribute ("ifunc", resolver_name, + DECL_ATTRIBUTES (ifunc_alias_decl)); + + /* Create the alias for dispatch to resolver here. */ + cgraph_node::create_same_body_alias (ifunc_alias_decl, decl); + XDELETEVEC (resolver_name); + return decl; +} + +/* Generate the dispatching code body to dispatch multi-versioned function + DECL. The target hook is called to process the "target" attributes and + provide the code to dispatch the right function at run-time. NODE points + to the dispatcher decl whose body will be created. */ + +tree +ix86_generate_version_dispatcher_body (void *node_p) +{ + tree resolver_decl; + basic_block empty_bb; + tree default_ver_decl; + struct cgraph_node *versn; + struct cgraph_node *node; + + struct cgraph_function_version_info *node_version_info = NULL; + struct cgraph_function_version_info *versn_info = NULL; + + node = (cgraph_node *)node_p; + + node_version_info = node->function_version (); + gcc_assert (node->dispatcher_function + && node_version_info != NULL); + + if (node_version_info->dispatcher_resolver) + return node_version_info->dispatcher_resolver; + + /* The first version in the chain corresponds to the default version. */ + default_ver_decl = node_version_info->next->this_node->decl; + + /* node is going to be an alias, so remove the finalized bit. 
*/ + node->definition = false; + + resolver_decl = make_resolver_func (default_ver_decl, + node->decl, &empty_bb); + + node_version_info->dispatcher_resolver = resolver_decl; + + push_cfun (DECL_STRUCT_FUNCTION (resolver_decl)); + + auto_vec fn_ver_vec; + + for (versn_info = node_version_info->next; versn_info; + versn_info = versn_info->next) + { + versn = versn_info->this_node; + /* Check for virtual functions here again, as by this time it should + have been determined if this function needs a vtable index or + not. This happens for methods in derived classes that override + virtual methods in base classes but are not explicitly marked as + virtual. */ + if (DECL_VINDEX (versn->decl)) + sorry ("virtual function multiversioning not supported"); + + fn_ver_vec.safe_push (versn->decl); + } + + dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb); + cgraph_edge::rebuild_edges (); + pop_cfun (); + return resolver_decl; +} + + diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h new file mode 100644 index 000000000..358122249 --- /dev/null +++ b/gcc/config/i386/i386-features.h @@ -0,0 +1,201 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_I386_FEATURES_H +#define GCC_I386_FEATURES_H + +enum xlogue_stub { + XLOGUE_STUB_SAVE, + XLOGUE_STUB_RESTORE, + XLOGUE_STUB_RESTORE_TAIL, + XLOGUE_STUB_SAVE_HFP, + XLOGUE_STUB_RESTORE_HFP, + XLOGUE_STUB_RESTORE_HFP_TAIL, + + XLOGUE_STUB_COUNT +}; + +enum xlogue_stub_sets { + XLOGUE_SET_ALIGNED, + XLOGUE_SET_ALIGNED_PLUS_8, + XLOGUE_SET_HFP_ALIGNED_OR_REALIGN, + XLOGUE_SET_HFP_ALIGNED_PLUS_8, + + XLOGUE_SET_COUNT +}; + +/* Register save/restore layout used by out-of-line stubs. */ +class xlogue_layout { +public: + struct reginfo + { + unsigned regno; + HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or + rsi) to where each register is stored. */ + }; + + unsigned get_nregs () const {return m_nregs;} + HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;} + + const reginfo &get_reginfo (unsigned reg) const + { + gcc_assert (reg < m_nregs); + return m_regs[reg]; + } + + static const char *get_stub_name (enum xlogue_stub stub, + unsigned n_extra_args); + + /* Returns an rtx for the stub's symbol based upon + 1.) the specified stub (save, restore or restore_ret) and + 2.) the value of cfun->machine->call_ms2sysv_extra_regs and + 3.) rather or not stack alignment is being performed. */ + static rtx get_stub_rtx (enum xlogue_stub stub); + + /* Returns the amount of stack space (including padding) that the stub + needs to store registers based upon data in the machine_function. 
*/ + HOST_WIDE_INT get_stack_space_used () const + { + const struct machine_function *m = cfun->machine; + unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1; + + gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS); + return m_regs[last_reg].offset + STUB_INDEX_OFFSET; + } + + /* Returns the offset for the base pointer used by the stub. */ + HOST_WIDE_INT get_stub_ptr_offset () const + { + return STUB_INDEX_OFFSET + m_stack_align_off_in; + } + + static const struct xlogue_layout &get_instance (); + static unsigned count_stub_managed_regs (); + static bool is_stub_managed_reg (unsigned regno, unsigned count); + + static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70; + static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS; + static const unsigned MAX_REGS = 18; + static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS; + static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1; + static const unsigned STUB_NAME_MAX_LEN = 20; + static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT]; + static const unsigned REG_ORDER[MAX_REGS]; + static const unsigned REG_ORDER_REALIGN[MAX_REGS]; + +private: + xlogue_layout (); + xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp); + xlogue_layout (const xlogue_layout &); + + /* True if hard frame pointer is used. */ + bool m_hfp; + + /* Max number of register this layout manages. */ + unsigned m_nregs; + + /* Incoming offset from 16-byte alignment. */ + HOST_WIDE_INT m_stack_align_off_in; + + /* Register order and offsets. */ + struct reginfo m_regs[MAX_REGS]; + + /* Lazy-inited cache of symbol names for stubs. */ + static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] + [STUB_NAME_MAX_LEN]; + + static const xlogue_layout s_instances[XLOGUE_SET_COUNT]; +}; + +namespace { + +class scalar_chain +{ + public: + scalar_chain (); + virtual ~scalar_chain (); + + static unsigned max_id; + + /* ID of a chain. */ + unsigned int chain_id; + /* A queue of instructions to be included into a chain. */ + bitmap queue; + /* Instructions included into a chain. */ + bitmap insns; + /* All registers defined by a chain. */ + bitmap defs; + /* Registers used in both vector and sclar modes. */ + bitmap defs_conv; + + void build (bitmap candidates, unsigned insn_uid); + virtual int compute_convert_gain () = 0; + int convert (); + + protected: + void add_to_queue (unsigned insn_uid); + void emit_conversion_insns (rtx insns, rtx_insn *pos); + + private: + void add_insn (bitmap candidates, unsigned insn_uid); + void analyze_register_chain (bitmap candidates, df_ref ref); + virtual void mark_dual_mode_def (df_ref def) = 0; + virtual void convert_insn (rtx_insn *insn) = 0; + virtual void convert_registers () = 0; +}; + +class dimode_scalar_chain : public scalar_chain +{ + public: + int compute_convert_gain (); + private: + void mark_dual_mode_def (df_ref def); + rtx replace_with_subreg (rtx x, rtx reg, rtx subreg); + void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg); + void convert_insn (rtx_insn *insn); + void convert_op (rtx *op, rtx_insn *insn); + void convert_reg (unsigned regno); + void make_vector_copies (unsigned regno); + void convert_registers (); + int vector_const_cost (rtx exp); +}; + +class timode_scalar_chain : public scalar_chain +{ + public: + /* Convert from TImode to V1TImode is always faster. 
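+   (A TImode value would otherwise occupy a pair of general registers,
+   so moving aligned 128-bit loads, stores and constants through a
+   single XMM register is assumed never to lose; hence the constant
+   gain.)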
*/ + int compute_convert_gain () { return 1; } + + private: + void mark_dual_mode_def (df_ref def); + void fix_debug_reg_uses (rtx reg); + void convert_insn (rtx_insn *insn); + /* We don't convert registers to difference size. */ + void convert_registers () {} +}; + +} // anon namespace + +bool ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined); +int ix86_compare_version_priority (tree decl1, tree decl2); +tree ix86_generate_version_dispatcher_body (void *node_p); +tree ix86_get_function_versions_dispatcher (void *decl); +tree ix86_mangle_decl_assembler_name (tree decl, tree id); + + +#endif /* GCC_I386_FEATURES_H */ diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c new file mode 100644 index 000000000..4a03bead8 --- /dev/null +++ b/gcc/config/i386/i386-options.c @@ -0,0 +1,3707 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "cfgloop.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "cfgbuild.h" +#include "alias.h" +#include "fold-const.h" +#include "attribs.h" +#include "calls.h" +#include "stor-layout.h" +#include "varasm.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "explow.h" +#include "expr.h" +#include "cfgrtl.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "reload.h" +#include "gimplify.h" +#include "dwarf2.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "sched-int.h" +#include "opts.h" +#include "tree-pass.h" +#include "context.h" +#include "pass_manager.h" +#include "target-globals.h" +#include "gimple-iterator.h" +#include "tree-vectorizer.h" +#include "shrink-wrap.h" +#include "builtins.h" +#include "rtl-iter.h" +#include "tree-iterator.h" +#include "dbgcnt.h" +#include "case-cfn-macros.h" +#include "dojump.h" +#include "fold-const-call.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" +#include "selftest.h" +#include "selftest-rtl.h" +#include "print-rtl.h" +#include "intl.h" +#include "ifcvt.h" +#include "symbol-summary.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "wide-int-bitmask.h" +#include "tree-vector-builder.h" +#include "debug.h" +#include "dwarf2out.h" +#include "i386-options.h" + +#include "x86-tune-costs.h" + +#ifndef SUBTARGET32_DEFAULT_CPU +#define SUBTARGET32_DEFAULT_CPU "i386" +#endif + +/* Processor feature/optimization bitmasks. 
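+   Each m_CPU macro below is a one-bit-per-processor mask (formed by
+   shifting HOST_WIDE_INT_1U by the corresponding PROCESSOR_* value),
+   so tuning tables can simply OR together the processors an entry
+   applies to.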
*/ +#define m_386 (HOST_WIDE_INT_1U< 70) + { + *ptr++ = '\\'; + *ptr++ = '\n'; + line_len = 0; + } + } + + for (j = 0; j < 2; j++) + if (opts[i][j]) + { + memcpy (ptr, opts[i][j], len2[j]); + ptr += len2[j]; + line_len += len2[j]; + } + } + + *ptr = '\0'; + gcc_assert (ret + len >= ptr); + + return ret; +} + +/* Function that is callable from the debugger to print the current + options. */ +void ATTRIBUTE_UNUSED +ix86_debug_options (void) +{ + char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2, + target_flags, ix86_target_flags, + ix86_arch_string,ix86_tune_string, + ix86_fpmath, true, true); + + if (opts) + { + fprintf (stderr, "%s\n\n", opts); + free (opts); + } + else + fputs ("\n\n", stderr); + + return; +} + +/* Save the current options */ + +void +ix86_function_specific_save (struct cl_target_option *ptr, + struct gcc_options *opts) +{ + ptr->arch = ix86_arch; + ptr->schedule = ix86_schedule; + ptr->prefetch_sse = x86_prefetch_sse; + ptr->tune = ix86_tune; + ptr->branch_cost = ix86_branch_cost; + ptr->tune_defaulted = ix86_tune_defaulted; + ptr->arch_specified = ix86_arch_specified; + ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit; + ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit; + ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit; + ptr->x_ix86_arch_string = opts->x_ix86_arch_string; + ptr->x_ix86_tune_string = opts->x_ix86_tune_string; + ptr->x_ix86_cmodel = opts->x_ix86_cmodel; + ptr->x_ix86_abi = opts->x_ix86_abi; + ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect; + ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost; + ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes; + ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer; + ptr->x_ix86_force_drap = opts->x_ix86_force_drap; + ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg; + ptr->x_ix86_pmode = opts->x_ix86_pmode; + ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg; + ptr->x_ix86_recip_name = opts->x_ix86_recip_name; + ptr->x_ix86_regparm = opts->x_ix86_regparm; + ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold; + ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx; + ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard; + ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg; + ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect; + ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string; + ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy; + ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy; + ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default; + ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type; + + /* The fields are char but the variables are not; make sure the + values fit in the fields. */ + gcc_assert (ptr->arch == ix86_arch); + gcc_assert (ptr->schedule == ix86_schedule); + gcc_assert (ptr->tune == ix86_tune); + gcc_assert (ptr->branch_cost == ix86_branch_cost); +} + +/* Feature tests against the various architecture variations, used to create + ix86_arch_features based on the processor mask. */ +static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = { + /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */ + ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6), + + /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */ + ~m_386, + + /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. 
*/ + ~(m_386 | m_486), + + /* X86_ARCH_XADD: Exchange and add was added for 80486. */ + ~m_386, + + /* X86_ARCH_BSWAP: Byteswap was added for 80486. */ + ~m_386, +}; + +/* This table must be in sync with enum processor_type in i386.h. */ +static const struct processor_costs *processor_cost_table[] = +{ + &generic_cost, + &i386_cost, + &i486_cost, + &pentium_cost, + &lakemont_cost, + &pentiumpro_cost, + &pentium4_cost, + &nocona_cost, + &core_cost, + &core_cost, + &core_cost, + &core_cost, + &atom_cost, + &slm_cost, + &slm_cost, + &slm_cost, + &slm_cost, + &slm_cost, + &slm_cost, + &skylake_cost, + &skylake_cost, + &skylake_cost, + &skylake_cost, + &skylake_cost, + &skylake_cost, + &intel_cost, + &geode_cost, + &k6_cost, + &athlon_cost, + &k8_cost, + &amdfam10_cost, + &bdver_cost, + &bdver_cost, + &bdver_cost, + &bdver_cost, + &btver1_cost, + &btver2_cost, + &znver1_cost, + &znver2_cost +}; + +/* Guarantee that the array is aligned with enum processor_type. */ +STATIC_ASSERT (ARRAY_SIZE (processor_cost_table) == PROCESSOR_max); + +static bool +ix86_option_override_internal (bool main_args_p, + struct gcc_options *opts, + struct gcc_options *opts_set); +static void +set_ix86_tune_features (enum processor_type ix86_tune, bool dump); + +/* Restore the current options */ + +void +ix86_function_specific_restore (struct gcc_options *opts, + struct cl_target_option *ptr) +{ + enum processor_type old_tune = ix86_tune; + enum processor_type old_arch = ix86_arch; + unsigned HOST_WIDE_INT ix86_arch_mask; + int i; + + /* We don't change -fPIC. */ + opts->x_flag_pic = flag_pic; + + ix86_arch = (enum processor_type) ptr->arch; + ix86_schedule = (enum attr_cpu) ptr->schedule; + ix86_tune = (enum processor_type) ptr->tune; + x86_prefetch_sse = ptr->prefetch_sse; + opts->x_ix86_branch_cost = ptr->branch_cost; + ix86_tune_defaulted = ptr->tune_defaulted; + ix86_arch_specified = ptr->arch_specified; + opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; + opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit; + opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit; + opts->x_ix86_arch_string = ptr->x_ix86_arch_string; + opts->x_ix86_tune_string = ptr->x_ix86_tune_string; + opts->x_ix86_cmodel = ptr->x_ix86_cmodel; + opts->x_ix86_abi = ptr->x_ix86_abi; + opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect; + opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost; + opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes; + opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer; + opts->x_ix86_force_drap = ptr->x_ix86_force_drap; + opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg; + opts->x_ix86_pmode = ptr->x_ix86_pmode; + opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg; + opts->x_ix86_recip_name = ptr->x_ix86_recip_name; + opts->x_ix86_regparm = ptr->x_ix86_regparm; + opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold; + opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx; + opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard; + opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg; + opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect; + opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string; + opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy; + opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy; + opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default; + opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type; + ix86_tune_cost = 
processor_cost_table[ix86_tune]; + /* TODO: ix86_cost should be chosen at instruction or function granuality + so for cold code we use size_cost even in !optimize_size compilation. */ + if (opts->x_optimize_size) + ix86_cost = &ix86_size_cost; + else + ix86_cost = ix86_tune_cost; + + /* Recreate the arch feature tests if the arch changed */ + if (old_arch != ix86_arch) + { + ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; + for (i = 0; i < X86_ARCH_LAST; ++i) + ix86_arch_features[i] + = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + } + + /* Recreate the tune optimization tests */ + if (old_tune != ix86_tune) + set_ix86_tune_features (ix86_tune, false); +} + +/* Adjust target options after streaming them in. This is mainly about + reconciling them with global options. */ + +void +ix86_function_specific_post_stream_in (struct cl_target_option *ptr) +{ + /* flag_pic is a global option, but ix86_cmodel is target saved option + partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel + for PIC, or error out. */ + if (flag_pic) + switch (ptr->x_ix86_cmodel) + { + case CM_SMALL: + ptr->x_ix86_cmodel = CM_SMALL_PIC; + break; + + case CM_MEDIUM: + ptr->x_ix86_cmodel = CM_MEDIUM_PIC; + break; + + case CM_LARGE: + ptr->x_ix86_cmodel = CM_LARGE_PIC; + break; + + case CM_KERNEL: + error ("code model %s does not support PIC mode", "kernel"); + break; + + default: + break; + } + else + switch (ptr->x_ix86_cmodel) + { + case CM_SMALL_PIC: + ptr->x_ix86_cmodel = CM_SMALL; + break; + + case CM_MEDIUM_PIC: + ptr->x_ix86_cmodel = CM_MEDIUM; + break; + + case CM_LARGE_PIC: + ptr->x_ix86_cmodel = CM_LARGE; + break; + + default: + break; + } +} + +/* Print the current options */ + +void +ix86_function_specific_print (FILE *file, int indent, + struct cl_target_option *ptr) +{ + char *target_string + = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2, + ptr->x_target_flags, ptr->x_ix86_target_flags, + NULL, NULL, ptr->x_ix86_fpmath, false, true); + + gcc_assert (ptr->arch < PROCESSOR_max); + fprintf (file, "%*sarch = %d (%s)\n", + indent, "", + ptr->arch, processor_names[ptr->arch]); + + gcc_assert (ptr->tune < PROCESSOR_max); + fprintf (file, "%*stune = %d (%s)\n", + indent, "", + ptr->tune, processor_names[ptr->tune]); + + fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); + + if (target_string) + { + fprintf (file, "%*s%s\n", indent, "", target_string); + free (target_string); + } +} + + +/* Inner function to process the attribute((target(...))), take an argument and + set the current options from the argument. If we have a list, recursively go + over the list. 
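   For illustration only (the declaration below is a made-up example, not
   something taken from GCC itself), the string handled here comes from code
   such as:

     __attribute__((target("avx2,no-fma")))
     void fast_path (float *dst, const float *src, int n);

   Each comma-separated token is looked up in the attrs[] table that follows,
   and a leading "no-" inverts the option.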
*/ + +static bool +ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], + struct gcc_options *opts, + struct gcc_options *opts_set, + struct gcc_options *enum_opts_set, + bool target_clone_attr) +{ + char *next_optstr; + bool ret = true; + +#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 } +#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 } +#define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 } +#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } +#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } + + enum ix86_opt_type + { + ix86_opt_unknown, + ix86_opt_yes, + ix86_opt_no, + ix86_opt_str, + ix86_opt_enum, + ix86_opt_isa + }; + + static const struct + { + const char *string; + size_t len; + enum ix86_opt_type type; + int opt; + int mask; + } attrs[] = { + /* isa options */ + IX86_ATTR_ISA ("pconfig", OPT_mpconfig), + IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd), + IX86_ATTR_ISA ("sgx", OPT_msgx), + IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps), + IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw), + IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq), + IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2), + IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni), + IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg), + + IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi), + IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma), + IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl), + IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw), + IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq), + IX86_ATTR_ISA ("avx512er", OPT_mavx512er), + IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf), + IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd), + IX86_ATTR_ISA ("avx512f", OPT_mavx512f), + IX86_ATTR_ISA ("avx2", OPT_mavx2), + IX86_ATTR_ISA ("fma", OPT_mfma), + IX86_ATTR_ISA ("xop", OPT_mxop), + IX86_ATTR_ISA ("fma4", OPT_mfma4), + IX86_ATTR_ISA ("f16c", OPT_mf16c), + IX86_ATTR_ISA ("avx", OPT_mavx), + IX86_ATTR_ISA ("sse4", OPT_msse4), + IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), + IX86_ATTR_ISA ("sse4.1", OPT_msse4_1), + IX86_ATTR_ISA ("sse4a", OPT_msse4a), + IX86_ATTR_ISA ("ssse3", OPT_mssse3), + IX86_ATTR_ISA ("sse3", OPT_msse3), + IX86_ATTR_ISA ("aes", OPT_maes), + IX86_ATTR_ISA ("sha", OPT_msha), + IX86_ATTR_ISA ("pclmul", OPT_mpclmul), + IX86_ATTR_ISA ("sse2", OPT_msse2), + IX86_ATTR_ISA ("sse", OPT_msse), + IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa), + IX86_ATTR_ISA ("3dnow", OPT_m3dnow), + IX86_ATTR_ISA ("mmx", OPT_mmmx), + IX86_ATTR_ISA ("rtm", OPT_mrtm), + IX86_ATTR_ISA ("prfchw", OPT_mprfchw), + IX86_ATTR_ISA ("rdseed", OPT_mrdseed), + IX86_ATTR_ISA ("adx", OPT_madx), + IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1), + IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt), + IX86_ATTR_ISA ("xsaves", OPT_mxsaves), + IX86_ATTR_ISA ("xsavec", OPT_mxsavec), + IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt), + IX86_ATTR_ISA ("xsave", OPT_mxsave), + IX86_ATTR_ISA ("abm", OPT_mabm), + IX86_ATTR_ISA ("bmi", OPT_mbmi), + IX86_ATTR_ISA ("bmi2", OPT_mbmi2), + IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt), + IX86_ATTR_ISA ("tbm", OPT_mtbm), + IX86_ATTR_ISA ("popcnt", OPT_mpopcnt), + IX86_ATTR_ISA ("cx16", OPT_mcx16), + IX86_ATTR_ISA ("sahf", OPT_msahf), + IX86_ATTR_ISA ("movbe", OPT_mmovbe), + IX86_ATTR_ISA ("crc32", OPT_mcrc32), + IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), + IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), + IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx), + IX86_ATTR_ISA ("clzero", OPT_mclzero), + IX86_ATTR_ISA ("pku", OPT_mpku), + IX86_ATTR_ISA ("lwp", OPT_mlwp), + IX86_ATTR_ISA ("hle", 
OPT_mhle), + IX86_ATTR_ISA ("fxsr", OPT_mfxsr), + IX86_ATTR_ISA ("clwb", OPT_mclwb), + IX86_ATTR_ISA ("rdpid", OPT_mrdpid), + IX86_ATTR_ISA ("gfni", OPT_mgfni), + IX86_ATTR_ISA ("shstk", OPT_mshstk), + IX86_ATTR_ISA ("vaes", OPT_mvaes), + IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq), + IX86_ATTR_ISA ("movdiri", OPT_mmovdiri), + IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b), + IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), + IX86_ATTR_ISA ("cldemote", OPT_mcldemote), + IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), + + /* enum options */ + IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), + + /* string options */ + IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), + IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE), + + /* flag options */ + IX86_ATTR_YES ("cld", + OPT_mcld, + MASK_CLD), + + IX86_ATTR_NO ("fancy-math-387", + OPT_mfancy_math_387, + MASK_NO_FANCY_MATH_387), + + IX86_ATTR_YES ("ieee-fp", + OPT_mieee_fp, + MASK_IEEE_FP), + + IX86_ATTR_YES ("inline-all-stringops", + OPT_minline_all_stringops, + MASK_INLINE_ALL_STRINGOPS), + + IX86_ATTR_YES ("inline-stringops-dynamically", + OPT_minline_stringops_dynamically, + MASK_INLINE_STRINGOPS_DYNAMICALLY), + + IX86_ATTR_NO ("align-stringops", + OPT_mno_align_stringops, + MASK_NO_ALIGN_STRINGOPS), + + IX86_ATTR_YES ("recip", + OPT_mrecip, + MASK_RECIP), + }; + + location_t loc + = fndecl == NULL ? UNKNOWN_LOCATION : DECL_SOURCE_LOCATION (fndecl); + const char *attr_name = target_clone_attr ? "target_clone" : "target"; + + /* If this is a list, recurse to get the options. */ + if (TREE_CODE (args) == TREE_LIST) + { + bool ret = true; + + for (; args; args = TREE_CHAIN (args)) + if (TREE_VALUE (args) + && !ix86_valid_target_attribute_inner_p (fndecl, TREE_VALUE (args), + p_strings, opts, opts_set, + enum_opts_set, + target_clone_attr)) + ret = false; + + return ret; + } + + else if (TREE_CODE (args) != STRING_CST) + { + error_at (loc, "attribute %qs argument is not a string", attr_name); + return false; + } + + /* Handle multiple arguments separated by commas. */ + next_optstr = ASTRDUP (TREE_STRING_POINTER (args)); + + while (next_optstr && *next_optstr != '\0') + { + char *p = next_optstr; + char *orig_p = p; + char *comma = strchr (next_optstr, ','); + size_t len, opt_len; + int opt; + bool opt_set_p; + char ch; + unsigned i; + enum ix86_opt_type type = ix86_opt_unknown; + int mask = 0; + + if (comma) + { + *comma = '\0'; + len = comma - next_optstr; + next_optstr = comma + 1; + } + else + { + len = strlen (p); + next_optstr = NULL; + } + + /* Recognize no-xxx. */ + if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-') + { + opt_set_p = false; + p += 3; + len -= 3; + } + else + opt_set_p = true; + + /* Find the option. */ + ch = *p; + opt = N_OPTS; + for (i = 0; i < ARRAY_SIZE (attrs); i++) + { + type = attrs[i].type; + opt_len = attrs[i].len; + if (ch == attrs[i].string[0] + && ((type != ix86_opt_str && type != ix86_opt_enum) + ? len == opt_len + : len > opt_len) + && memcmp (p, attrs[i].string, opt_len) == 0) + { + opt = attrs[i].opt; + mask = attrs[i].mask; + break; + } + } + + /* Process the option. 
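   As a hypothetical worked example, "arch=skylake,no-sse4.2" is split at the
   comma; "arch=skylake" matches the ix86_opt_str entry for "arch=" and keeps
   the rest of the token as its value, while "no-sse4.2" had its "no-" prefix
   stripped above and reaches the ix86_opt_isa handling below with opt_set_p
   equal to false.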
*/ + if (opt == N_OPTS) + { + error_at (loc, "attribute %qs argument %qs is unknown", + orig_p, attr_name); + ret = false; + } + + else if (type == ix86_opt_isa) + { + struct cl_decoded_option decoded; + + generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded); + ix86_handle_option (opts, opts_set, + &decoded, input_location); + } + + else if (type == ix86_opt_yes || type == ix86_opt_no) + { + if (type == ix86_opt_no) + opt_set_p = !opt_set_p; + + if (opt_set_p) + opts->x_target_flags |= mask; + else + opts->x_target_flags &= ~mask; + } + + else if (type == ix86_opt_str) + { + if (p_strings[opt]) + { + error_at (loc, "attribute value %qs was already specified " + "in %qs attribute", orig_p, attr_name); + ret = false; + } + else + { + p_strings[opt] = xstrdup (p + opt_len); + if (opt == IX86_FUNCTION_SPECIFIC_ARCH) + { + /* If arch= is set, clear all bits in x_ix86_isa_flags, + except for ISA_64BIT, ABI_64, ABI_X32, and CODE16 + and all bits in x_ix86_isa_flags2. */ + opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT + | OPTION_MASK_ABI_64 + | OPTION_MASK_ABI_X32 + | OPTION_MASK_CODE16); + opts->x_ix86_isa_flags_explicit &= (OPTION_MASK_ISA_64BIT + | OPTION_MASK_ABI_64 + | OPTION_MASK_ABI_X32 + | OPTION_MASK_CODE16); + opts->x_ix86_isa_flags2 = 0; + opts->x_ix86_isa_flags2_explicit = 0; + } + } + } + + else if (type == ix86_opt_enum) + { + bool arg_ok; + int value; + + arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); + if (arg_ok) + set_option (opts, enum_opts_set, opt, value, + p + opt_len, DK_UNSPECIFIED, input_location, + global_dc); + else + { + error_at (loc, "attribute value %qs is unknown in %qs attribute", + orig_p, attr_name); + ret = false; + } + } + + else + gcc_unreachable (); + } + + return ret; +} + +/* Release allocated strings. */ +static void +release_options_strings (char **option_strings) +{ + /* Free up memory allocated to hold the strings */ + for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) + free (option_strings[i]); +} + +/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */ + +tree +ix86_valid_target_attribute_tree (tree fndecl, tree args, + struct gcc_options *opts, + struct gcc_options *opts_set, + bool target_clone_attr) +{ + const char *orig_arch_string = opts->x_ix86_arch_string; + const char *orig_tune_string = opts->x_ix86_tune_string; + enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath; + int orig_tune_defaulted = ix86_tune_defaulted; + int orig_arch_specified = ix86_arch_specified; + char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL }; + tree t = NULL_TREE; + struct cl_target_option *def + = TREE_TARGET_OPTION (target_option_default_node); + struct gcc_options enum_opts_set; + + memset (&enum_opts_set, 0, sizeof (enum_opts_set)); + + /* Process each of the options on the chain. */ + if (!ix86_valid_target_attribute_inner_p (fndecl, args, option_strings, opts, + opts_set, &enum_opts_set, + target_clone_attr)) + return error_mark_node; + + /* If the changed options are different from the default, rerun + ix86_option_override_internal, and then save the options away. + The string options are attribute options, and will be undone + when we copy the save structure. 
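   For instance (hypothetical attribute), target("arch=haswell") leaves
   option_strings[IX86_FUNCTION_SPECIFIC_ARCH] non-NULL, so the block below
   reruns ix86_option_override_internal for the new architecture before the
   option node is built and the original strings are put back.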
*/ + if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags + || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2 + || opts->x_target_flags != def->x_target_flags + || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] + || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] + || enum_opts_set.x_ix86_fpmath) + { + /* If we are using the default tune= or arch=, undo the string assigned, + and use the default. */ + if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) + opts->x_ix86_arch_string + = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]); + else if (!orig_arch_specified) + opts->x_ix86_arch_string = NULL; + + if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) + opts->x_ix86_tune_string + = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); + else if (orig_tune_defaulted) + opts->x_ix86_tune_string = NULL; + + /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ + if (enum_opts_set.x_ix86_fpmath) + opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; + + /* Do any overrides, such as arch=xxx, or tune=xxx support. */ + bool r = ix86_option_override_internal (false, opts, opts_set); + if (!r) + { + release_options_strings (option_strings); + return error_mark_node; + } + + /* Add any builtin functions with the new isa if any. */ + ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2); + + /* Save the current options unless we are validating options for + #pragma. */ + t = build_target_option_node (opts); + + opts->x_ix86_arch_string = orig_arch_string; + opts->x_ix86_tune_string = orig_tune_string; + opts_set->x_ix86_fpmath = orig_fpmath_set; + + release_options_strings (option_strings); + } + + return t; +} + +/* Hook to validate attribute((target("string"))). */ + +bool +ix86_valid_target_attribute_p (tree fndecl, + tree ARG_UNUSED (name), + tree args, + int flags) +{ + struct gcc_options func_options; + tree new_target, new_optimize; + bool ret = true; + + /* attribute((target("default"))) does nothing, beyond + affecting multi-versioning. */ + if (TREE_VALUE (args) + && TREE_CODE (TREE_VALUE (args)) == STRING_CST + && TREE_CHAIN (args) == NULL_TREE + && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) + return true; + + tree old_optimize = build_optimization_node (&global_options); + + /* Get the optimization options of the current function. */ + tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); + + if (!func_optimize) + func_optimize = old_optimize; + + /* Init func_options. */ + memset (&func_options, 0, sizeof (func_options)); + init_options_struct (&func_options, NULL); + lang_hooks.init_options_struct (&func_options); + + cl_optimization_restore (&func_options, + TREE_OPTIMIZATION (func_optimize)); + + /* Initialize func_options to the default before its target options can + be set. */ + cl_target_option_restore (&func_options, + TREE_TARGET_OPTION (target_option_default_node)); + + /* FLAGS == 1 is used for target_clones attribute. 
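   A sketch of that use (illustrative declaration, not taken from this
   patch):

     __attribute__((target_clones("default","avx2")))
     int dot (const int *a, const int *b, int n);

   The option string of each clone is validated through this same hook, and
   flags == 1 essentially just selects the target_clone wording in the
   diagnostics emitted by the helpers above.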
*/ + new_target + = ix86_valid_target_attribute_tree (fndecl, args, &func_options, + &global_options_set, flags == 1); + + new_optimize = build_optimization_node (&func_options); + + if (new_target == error_mark_node) + ret = false; + + else if (fndecl && new_target) + { + DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; + + if (old_optimize != new_optimize) + DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; + } + + finalize_options_struct (&func_options); + + return ret; +} + +const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. + The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. */ + +struct stringop_size_range +{ + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do + { + int maxs; + char alg_name[128]; + char align[16]; + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs, + align) != 3) + { + error ("wrong argument %qs to option %qs", curr_range_str, opt); + return; + } + + if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1)) + { + error ("size ranges of option %qs should be increasing", opt); + return; + } + + for (i = 0; i < last_alg; i++) + if (!strcmp (alg_name, stringop_alg_names[i])) + break; + + if (i == last_alg) + { + error ("wrong strategy name %qs specified for option %qs", + alg_name, opt); + + auto_vec candidates; + for (i = 0; i < last_alg; i++) + if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT) + candidates.safe_push (stringop_alg_names[i]); + + char *s; + const char *hint + = candidates_list_and_hint (alg_name, s, candidates); + if (hint) + inform (input_location, + "valid arguments to %qs are: %s; did you mean %qs?", + opt, s, hint); + else + inform (input_location, "valid arguments to %qs are: %s", + opt, s); + XDELETEVEC (s); + return; + } + + if ((stringop_alg) i == rep_prefix_8_byte + && !TARGET_64BIT) + { + /* rep; movq isn't available in 32-bit code. 
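   A command line such as "-m32 -mmemcpy-strategy=rep_8byte:-1:align"
   (hypothetical invocation) therefore lands in this branch and is rejected.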
*/ + error ("strategy name %qs specified for option %qs " + "not supported for 32-bit code", alg_name, opt); + return; + } + + input_ranges[n].max = maxs; + input_ranges[n].alg = (stringop_alg) i; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + error ("unknown alignment %qs specified for option %qs", align, opt); + return; + } + n++; + curr_range_str = next_range_str; + } + while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + error ("the max value for the last size range should be -1" + " for option %qs", opt); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + error ("too many size ranges specified in option %qs", opt); + return; + } + + /* Now override the default algs array. */ + for (i = 0; i < n; i++) + { + *const_cast(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + + +/* parse -mtune-ctrl= option. When DUMP is true, + print the features that are explicitly set. */ + +static void +parse_mtune_ctrl_str (bool dump) +{ + if (!ix86_tune_ctrl_string) + return; + + char *next_feature_string = NULL; + char *curr_feature_string = xstrdup (ix86_tune_ctrl_string); + char *orig = curr_feature_string; + int i; + do + { + bool clear = false; + + next_feature_string = strchr (curr_feature_string, ','); + if (next_feature_string) + *next_feature_string++ = '\0'; + if (*curr_feature_string == '^') + { + curr_feature_string++; + clear = true; + } + for (i = 0; i < X86_TUNE_LAST; i++) + { + if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) + { + ix86_tune_features[i] = !clear; + if (dump) + fprintf (stderr, "Explicitly %s feature %s\n", + clear ? "clear" : "set", ix86_tune_feature_names[i]); + break; + } + } + if (i == X86_TUNE_LAST) + error ("unknown parameter to option %<-mtune-ctrl%>: %s", + clear ? curr_feature_string - 1 : curr_feature_string); + curr_feature_string = next_feature_string; + } + while (curr_feature_string); + free (orig); +} + +/* Helper function to set ix86_tune_features. IX86_TUNE is the + processor type. */ + +static void +set_ix86_tune_features (enum processor_type ix86_tune, bool dump) +{ + unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune; + int i; + + for (i = 0; i < X86_TUNE_LAST; ++i) + { + if (ix86_tune_no_default) + ix86_tune_features[i] = 0; + else + ix86_tune_features[i] + = !!(initial_ix86_tune_features[i] & ix86_tune_mask); + } + + if (dump) + { + fprintf (stderr, "List of x86 specific tuning parameter names:\n"); + for (i = 0; i < X86_TUNE_LAST; i++) + fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i], + ix86_tune_features[i] ? "on" : "off"); + } + + parse_mtune_ctrl_str (dump); +} + + +/* Default align_* from the processor table. */ + +static void +ix86_default_align (struct gcc_options *opts) +{ + /* -falign-foo without argument: supply one. 
*/ + if (opts->x_flag_align_loops && !opts->x_str_align_loops) + opts->x_str_align_loops = processor_cost_table[ix86_tune]->align_loop; + if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) + opts->x_str_align_jumps = processor_cost_table[ix86_tune]->align_jump; + if (opts->x_flag_align_labels && !opts->x_str_align_labels) + opts->x_str_align_labels = processor_cost_table[ix86_tune]->align_label; + if (opts->x_flag_align_functions && !opts->x_str_align_functions) + opts->x_str_align_functions = processor_cost_table[ix86_tune]->align_func; +} + +/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */ + +void +ix86_override_options_after_change (void) +{ + ix86_default_align (&global_options); +} + +/* Clear stack slot assignments remembered from previous functions. + This is called from INIT_EXPANDERS once before RTL is emitted for each + function. */ + +static struct machine_function * +ix86_init_machine_status (void) +{ + struct machine_function *f; + + f = ggc_cleared_alloc (); + f->call_abi = ix86_abi; + + return f; +} + +/* Override various settings based on options. If MAIN_ARGS_P, the + options are from the command line, otherwise they are from + attributes. Return true if there's an error related to march + option. */ + +static bool +ix86_option_override_internal (bool main_args_p, + struct gcc_options *opts, + struct gcc_options *opts_set) +{ + int i; + unsigned HOST_WIDE_INT ix86_arch_mask; + const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL); + + /* -mrecip options. */ + static struct + { + const char *string; /* option name */ + unsigned int mask; /* mask bits to set */ + } + const recip_options[] = + { + { "all", RECIP_MASK_ALL }, + { "none", RECIP_MASK_NONE }, + { "div", RECIP_MASK_DIV }, + { "sqrt", RECIP_MASK_SQRT }, + { "vec-div", RECIP_MASK_VEC_DIV }, + { "vec-sqrt", RECIP_MASK_VEC_SQRT }, + }; + + + /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if + TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */ + if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); +#ifdef TARGET_BI_ARCH + else + { +#if TARGET_BI_ARCH == 1 + /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64 + is on and OPTION_MASK_ABI_X32 is off. We turn off + OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by + -mx32. */ + if (TARGET_X32_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; +#else + /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is + on and OPTION_MASK_ABI_64 is off. We turn off + OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by + -m64 or OPTION_MASK_CODE16 is turned on by -m16. */ + if (TARGET_LP64_P (opts->x_ix86_isa_flags) + || TARGET_16BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; +#endif + if (TARGET_64BIT_P (opts->x_ix86_isa_flags) + && TARGET_IAMCU_P (opts->x_target_flags)) + sorry ("Intel MCU psABI isn%'t supported in %s mode", + TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit"); + } +#endif + + if (TARGET_X32_P (opts->x_ix86_isa_flags)) + { + /* Always turn on OPTION_MASK_ISA_64BIT and turn off + OPTION_MASK_ABI_64 for TARGET_X32. 
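   (This is what makes -mx32 select the full 64-bit instruction set while
   keeping the 32-bit x32 ABI.)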
*/ + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; + } + else if (TARGET_16BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT + | OPTION_MASK_ABI_X32 + | OPTION_MASK_ABI_64); + else if (TARGET_LP64_P (opts->x_ix86_isa_flags)) + { + /* Always turn on OPTION_MASK_ISA_64BIT and turn off + OPTION_MASK_ABI_X32 for TARGET_LP64. */ + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; + } + +#ifdef SUBTARGET_OVERRIDE_OPTIONS + SUBTARGET_OVERRIDE_OPTIONS; +#endif + +#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS + SUBSUBTARGET_OVERRIDE_OPTIONS; +#endif + + /* -fPIC is the default for x86_64. */ + if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_flag_pic = 2; + + /* Need to check -mtune=generic first. */ + if (opts->x_ix86_tune_string) + { + /* As special support for cross compilers we read -mtune=native + as -mtune=generic. With native compilers we won't see the + -mtune=native, as it was changed by the driver. */ + if (!strcmp (opts->x_ix86_tune_string, "native")) + { + opts->x_ix86_tune_string = "generic"; + } + else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) + warning (OPT_Wdeprecated, + main_args_p + ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> " + "or %<-mtune=generic%> instead as appropriate") + : G_("% is deprecated; use " + "% or %" + " instead as appropriate")); + } + else + { + if (opts->x_ix86_arch_string) + opts->x_ix86_tune_string = opts->x_ix86_arch_string; + if (!opts->x_ix86_tune_string) + { + opts->x_ix86_tune_string = processor_names[TARGET_CPU_DEFAULT]; + ix86_tune_defaulted = 1; + } + + /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string + or defaulted. We need to use a sensible tune option. */ + if (!strcmp (opts->x_ix86_tune_string, "x86-64")) + { + opts->x_ix86_tune_string = "generic"; + } + } + + if (opts->x_ix86_stringop_alg == rep_prefix_8_byte + && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + /* rep; movq isn't available in 32-bit code. */ + error ("%<-mstringop-strategy=rep_8byte%> not supported for 32-bit code"); + opts->x_ix86_stringop_alg = no_stringop; + } + + if (!opts->x_ix86_arch_string) + opts->x_ix86_arch_string + = TARGET_64BIT_P (opts->x_ix86_isa_flags) + ? "x86-64" : SUBTARGET32_DEFAULT_CPU; + else + ix86_arch_specified = 1; + + if (opts_set->x_ix86_pmode) + { + if ((TARGET_LP64_P (opts->x_ix86_isa_flags) + && opts->x_ix86_pmode == PMODE_SI) + || (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + && opts->x_ix86_pmode == PMODE_DI)) + error ("address mode %qs not supported in the %s bit mode", + TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long", + TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32"); + } + else + opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags) + ? PMODE_DI : PMODE_SI; + + if (!opts_set->x_ix86_abi) + opts->x_ix86_abi = DEFAULT_ABI; + + if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags)) + error ("%<-mabi=ms%> not supported with X32 ABI"); + gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI); + + const char *abi_name = opts->x_ix86_abi == MS_ABI ? 
"ms" : "sysv"; + if ((opts->x_flag_sanitize & SANITIZE_USER_ADDRESS) + && opts->x_ix86_abi != DEFAULT_ABI) + error ("%<-mabi=%s%> not supported with %<-fsanitize=address%>", abi_name); + if ((opts->x_flag_sanitize & SANITIZE_KERNEL_ADDRESS) + && opts->x_ix86_abi != DEFAULT_ABI) + error ("%<-mabi=%s%> not supported with %<-fsanitize=kernel-address%>", + abi_name); + if ((opts->x_flag_sanitize & SANITIZE_THREAD) + && opts->x_ix86_abi != DEFAULT_ABI) + error ("%<-mabi=%s%> not supported with %<-fsanitize=thread%>", abi_name); + + /* For targets using ms ABI enable ms-extensions, if not + explicit turned off. For non-ms ABI we turn off this + option. */ + if (!opts_set->x_flag_ms_extensions) + opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI); + + if (opts_set->x_ix86_cmodel) + { + switch (opts->x_ix86_cmodel) + { + case CM_SMALL: + case CM_SMALL_PIC: + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_SMALL_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "small", "32"); + break; + + case CM_MEDIUM: + case CM_MEDIUM_PIC: + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_MEDIUM_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "medium", "32"); + else if (TARGET_X32_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in x32 mode", + "medium"); + break; + + case CM_LARGE: + case CM_LARGE_PIC: + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_LARGE_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "large", "32"); + else if (TARGET_X32_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in x32 mode", + "large"); + break; + + case CM_32: + if (opts->x_flag_pic) + error ("code model %s does not support PIC mode", "32"); + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "32", "64"); + break; + + case CM_KERNEL: + if (opts->x_flag_pic) + { + error ("code model %s does not support PIC mode", "kernel"); + opts->x_ix86_cmodel = CM_32; + } + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "kernel", "32"); + break; + + default: + gcc_unreachable (); + } + } + else + { + /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the + use of rip-relative addressing. This eliminates fixups that + would otherwise be needed if this object is to be placed in a + DLL, and is essentially just as efficient as direct addressing. */ + if (TARGET_64BIT_P (opts->x_ix86_isa_flags) + && (TARGET_RDOS || TARGET_PECOFF)) + opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1; + else if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL; + else + opts->x_ix86_cmodel = CM_32; + } + if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL) + { + error ("%<-masm=intel%> not supported in this configuration"); + opts->x_ix86_asm_dialect = ASM_ATT; + } + if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0) + != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) + sorry ("%i-bit mode not compiled in", + (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); + + for (i = 0; i < pta_size; i++) + if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) + { + if (!strcmp (opts->x_ix86_arch_string, "generic")) + { + error (main_args_p + ? 
G_("% CPU can be used only for %<-mtune=%> " + "switch") + : G_("% CPU can be used only for " + "% attribute")); + return false; + } + else if (!strcmp (opts->x_ix86_arch_string, "intel")) + { + error (main_args_p + ? G_("% CPU can be used only for %<-mtune=%> " + "switch") + : G_("% CPU can be used only for " + "% attribute")); + return false; + } + + if (TARGET_64BIT_P (opts->x_ix86_isa_flags) + && !((processor_alias_table[i].flags & PTA_64BIT) != 0)) + { + error ("CPU you selected does not support x86-64 " + "instruction set"); + return false; + } + + ix86_schedule = processor_alias_table[i].schedule; + ix86_arch = processor_alias_table[i].processor; + /* Default cpu tuning to the architecture. */ + ix86_tune = ix86_arch; + + if (((processor_alias_table[i].flags & PTA_MMX) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; + if (((processor_alias_table[i].flags & PTA_3DNOW) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; + if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; + if (((processor_alias_table[i].flags & PTA_SSE) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; + if (((processor_alias_table[i].flags & PTA_SSE2) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; + if (((processor_alias_table[i].flags & PTA_SSE3) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; + if (((processor_alias_table[i].flags & PTA_SSSE3) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; + if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; + if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; + if (((processor_alias_table[i].flags & PTA_AVX) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; + if (((processor_alias_table[i].flags & PTA_AVX2) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; + if (((processor_alias_table[i].flags & PTA_FMA) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; + if (((processor_alias_table[i].flags & PTA_SSE4A) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; + if (((processor_alias_table[i].flags & PTA_FMA4) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; + if (((processor_alias_table[i].flags & PTA_XOP) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; + if (((processor_alias_table[i].flags & PTA_LWP) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; + if (((processor_alias_table[i].flags & PTA_ABM) != 0) + && 
!(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; + if (((processor_alias_table[i].flags & PTA_BMI) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; + if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; + if (((processor_alias_table[i].flags & PTA_TBM) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; + if (((processor_alias_table[i].flags & PTA_BMI2) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; + if (((processor_alias_table[i].flags & PTA_CX16) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16; + if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; + if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) + && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; + if (((processor_alias_table[i].flags & PTA_MOVBE) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE; + if (((processor_alias_table[i].flags & PTA_AES) != 0) + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) + ix86_isa_flags |= OPTION_MASK_ISA_AES; + if (((processor_alias_table[i].flags & PTA_SHA) != 0) + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) + ix86_isa_flags |= OPTION_MASK_ISA_SHA; + if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; + if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; + if (((processor_alias_table[i].flags & PTA_RDRND) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; + if (((processor_alias_table[i].flags & PTA_F16C) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; + if (((processor_alias_table[i].flags & PTA_RTM) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; + if (((processor_alias_table[i].flags & PTA_HLE) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE; + if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; + if (((processor_alias_table[i].flags & PTA_RDSEED) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; + if (((processor_alias_table[i].flags & PTA_ADX) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; + if (((processor_alias_table[i].flags & PTA_FXSR) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) + 
opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; + if (((processor_alias_table[i].flags & PTA_XSAVE) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; + if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; + if (((processor_alias_table[i].flags & PTA_AVX512F) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; + if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; + if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; + if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; + if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1; + if (((processor_alias_table[i].flags & PTA_CLWB) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB; + if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT; + if (((processor_alias_table[i].flags & PTA_CLZERO) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO; + if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC; + if (((processor_alias_table[i].flags & PTA_XSAVES) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES; + if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ; + if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW; + if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL; + if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI; + if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA; + if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI; + if (((processor_alias_table[i].flags & PTA_GFNI) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI; + if (((processor_alias_table[i].flags & 
PTA_AVX512VBMI2) != 0) + && !(opts->x_ix86_isa_flags_explicit + & OPTION_MASK_ISA_AVX512VBMI2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2; + if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ; + if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0) + && !(opts->x_ix86_isa_flags_explicit + & OPTION_MASK_ISA_AVX512BITALG)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG; + + if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA_AVX5124VNNIW)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW; + if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA_AVX5124FMAPS)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS; + if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0) + && !(opts->x_ix86_isa_flags_explicit + & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; + if (((processor_alias_table[i].flags & PTA_SGX) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX; + if (((processor_alias_table[i].flags & PTA_VAES) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES; + if (((processor_alias_table[i].flags & PTA_RDPID) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID; + if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG; + if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD; + if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PTWRITE)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PTWRITE; + + if ((processor_alias_table[i].flags + & (PTA_PREFETCH_SSE | PTA_SSE)) != 0) + x86_prefetch_sse = true; + if (((processor_alias_table[i].flags & PTA_MWAITX) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX; + if (((processor_alias_table[i].flags & PTA_PKU) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; + + /* Don't enable x87 instructions if only + general registers are allowed. */ + if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) + && !(opts_set->x_target_flags & MASK_80387)) + { + if (((processor_alias_table[i].flags & PTA_NO_80387) != 0)) + opts->x_target_flags &= ~MASK_80387; + else + opts->x_target_flags |= MASK_80387; + } + break; + } + + if (i == pta_size) + { + error (main_args_p + ? 
G_("bad value (%qs) for %<-march=%> switch") + : G_("bad value (%qs) for % attribute"), + opts->x_ix86_arch_string); + + auto_vec candidates; + for (i = 0; i < pta_size; i++) + if (strcmp (processor_alias_table[i].name, "generic") + && strcmp (processor_alias_table[i].name, "intel") + && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + || ((processor_alias_table[i].flags & PTA_64BIT) != 0))) + candidates.safe_push (processor_alias_table[i].name); + +#ifdef HAVE_LOCAL_CPU_DETECT + /* Add also "native" as possible value. */ + candidates.safe_push ("native"); +#endif + + char *s; + const char *hint + = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates); + if (hint) + inform (input_location, + main_args_p + ? G_("valid arguments to %<-march=%> switch are: " + "%s; did you mean %qs?") + : G_("valid arguments to % attribute are: " + "%s; did you mean %qs?"), s, hint); + else + inform (input_location, + main_args_p + ? G_("valid arguments to %<-march=%> switch are: %s") + : G_("valid arguments to % attribute " + "are: %s"), s); + XDELETEVEC (s); + } + + ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; + for (i = 0; i < X86_ARCH_LAST; ++i) + ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + + for (i = 0; i < pta_size; i++) + if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) + { + ix86_schedule = processor_alias_table[i].schedule; + ix86_tune = processor_alias_table[i].processor; + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + if (!((processor_alias_table[i].flags & PTA_64BIT) != 0)) + { + if (ix86_tune_defaulted) + { + opts->x_ix86_tune_string = "x86-64"; + for (i = 0; i < pta_size; i++) + if (! strcmp (opts->x_ix86_tune_string, + processor_alias_table[i].name)) + break; + ix86_schedule = processor_alias_table[i].schedule; + ix86_tune = processor_alias_table[i].processor; + } + else + error ("CPU you selected does not support x86-64 " + "instruction set"); + } + } + /* Intel CPUs have always interpreted SSE prefetch instructions as + NOPs; so, we can enable SSE prefetch instructions even when + -mtune (rather than -march) points us to a processor that has them. + However, the VIA C3 gives a SIGILL, so we only do that for i686 and + higher processors. */ + if (TARGET_CMOV + && ((processor_alias_table[i].flags + & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)) + x86_prefetch_sse = true; + break; + } + + if (ix86_tune_specified && i == pta_size) + { + error (main_args_p + ? G_("bad value (%qs) for %<-mtune=%> switch") + : G_("bad value (%qs) for % attribute"), + opts->x_ix86_tune_string); + + auto_vec candidates; + for (i = 0; i < pta_size; i++) + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) + candidates.safe_push (processor_alias_table[i].name); + +#ifdef HAVE_LOCAL_CPU_DETECT + /* Add also "native" as possible value. */ + candidates.safe_push ("native"); +#endif + + char *s; + const char *hint + = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates); + if (hint) + inform (input_location, + main_args_p + ? G_("valid arguments to %<-mtune=%> switch are: " + "%s; did you mean %qs?") + : G_("valid arguments to % attribute are: " + "%s; did you mean %qs?"), s, hint); + else + inform (input_location, + main_args_p + ? 
G_("valid arguments to %<-mtune=%> switch are: %s") + : G_("valid arguments to % attribute " + "are: %s"), s); + XDELETEVEC (s); + } + + set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes); + +#ifndef USE_IX86_FRAME_POINTER +#define USE_IX86_FRAME_POINTER 0 +#endif + +#ifndef USE_X86_64_FRAME_POINTER +#define USE_X86_64_FRAME_POINTER 0 +#endif + + /* Set the default values for switches whose default depends on TARGET_64BIT + in case they weren't overwritten by command line options. */ + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) + opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; + if (opts->x_flag_asynchronous_unwind_tables + && !opts_set->x_flag_unwind_tables + && TARGET_64BIT_MS_ABI) + opts->x_flag_unwind_tables = 1; + if (opts->x_flag_asynchronous_unwind_tables == 2) + opts->x_flag_unwind_tables + = opts->x_flag_asynchronous_unwind_tables = 1; + if (opts->x_flag_pcc_struct_return == 2) + opts->x_flag_pcc_struct_return = 0; + } + else + { + if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) + opts->x_flag_omit_frame_pointer + = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size); + if (opts->x_flag_asynchronous_unwind_tables == 2) + opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; + if (opts->x_flag_pcc_struct_return == 2) + { + /* Intel MCU psABI specifies that -freg-struct-return should + be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1, + we check -miamcu so that -freg-struct-return is always + turned on if -miamcu is used. */ + if (TARGET_IAMCU_P (opts->x_target_flags)) + opts->x_flag_pcc_struct_return = 0; + else + opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; + } + } + + ix86_tune_cost = processor_cost_table[ix86_tune]; + /* TODO: ix86_cost should be chosen at instruction or function granuality + so for cold code we use size_cost even in !optimize_size compilation. */ + if (opts->x_optimize_size) + ix86_cost = &ix86_size_cost; + else + ix86_cost = ix86_tune_cost; + + /* Arrange to set up i386_stack_locals for all functions. */ + init_machine_status = ix86_init_machine_status; + + /* Validate -mregparm= value. */ + if (opts_set->x_ix86_regparm) + { + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + warning (0, "%<-mregparm%> is ignored in 64-bit mode"); + else if (TARGET_IAMCU_P (opts->x_target_flags)) + warning (0, "%<-mregparm%> is ignored for Intel MCU psABI"); + if (opts->x_ix86_regparm > REGPARM_MAX) + { + error ("%<-mregparm=%d%> is not between 0 and %d", + opts->x_ix86_regparm, REGPARM_MAX); + opts->x_ix86_regparm = 0; + } + } + if (TARGET_IAMCU_P (opts->x_target_flags) + || TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_regparm = REGPARM_MAX; + + /* Default align_* from the processor table. */ + ix86_default_align (opts); + + /* Provide default for -mbranch-cost= value. */ + if (!opts_set->x_ix86_branch_cost) + opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost; + + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + opts->x_target_flags + |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags; + + if (!ix86_arch_specified) + opts->x_ix86_isa_flags + |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; + + if (TARGET_RTD_P (opts->x_target_flags)) + warning (0, + main_args_p + ? 
G_("%<-mrtd%> is ignored in 64bit mode") + : G_("% is ignored in 64bit mode")); + } + else + { + opts->x_target_flags + |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags; + + if (!ix86_arch_specified) + opts->x_ix86_isa_flags + |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; + + /* i386 ABI does not specify red zone. It still makes sense to use it + when programmer takes care to stack from being destroyed. */ + if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE)) + opts->x_target_flags |= MASK_NO_RED_ZONE; + } + + /* Keep nonleaf frame pointers. */ + if (opts->x_flag_omit_frame_pointer) + opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; + else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags)) + opts->x_flag_omit_frame_pointer = 1; + + /* If we're doing fast math, we don't care about comparison order + wrt NaNs. This lets us use a shorter comparison sequence. */ + if (opts->x_flag_finite_math_only) + opts->x_target_flags &= ~MASK_IEEE_FP; + + /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, + since the insns won't need emulation. */ + if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) + opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387; + + /* Likewise, if the target doesn't have a 387, or we've specified + software floating point, don't use 387 inline intrinsics. */ + if (!TARGET_80387_P (opts->x_target_flags)) + opts->x_target_flags |= MASK_NO_FANCY_MATH_387; + + /* Turn on MMX builtins for -msse. */ + if (TARGET_SSE_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit; + + /* Enable SSE prefetch. */ + if (TARGET_SSE_P (opts->x_ix86_isa_flags) + || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags) + && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)) + || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags)) + x86_prefetch_sse = true; + + /* Enable popcnt instruction for -msse4.2 or -mabm. */ + if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) + || TARGET_ABM_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; + + /* Enable lzcnt instruction for -mabm. */ + if (TARGET_ABM_P(opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit; + + /* Disable BMI, BMI2 and TBM instructions for -m16. */ + if (TARGET_16BIT_P(opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM) + & ~opts->x_ix86_isa_flags_explicit); + + /* Validate -mpreferred-stack-boundary= value or default it to + PREFERRED_STACK_BOUNDARY_DEFAULT. */ + ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; + if (opts_set->x_ix86_preferred_stack_boundary_arg) + { + int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2; + int max = TARGET_SEH ? 4 : 12; + + if (opts->x_ix86_preferred_stack_boundary_arg < min + || opts->x_ix86_preferred_stack_boundary_arg > max) + { + if (min == max) + error ("%<-mpreferred-stack-boundary%> is not supported " + "for this target"); + else + error ("%<-mpreferred-stack-boundary=%d%> is not between %d and %d", + opts->x_ix86_preferred_stack_boundary_arg, min, max); + } + else + ix86_preferred_stack_boundary + = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; + } + + /* Set the default value for -mstackrealign. 
*/ + if (!opts_set->x_ix86_force_align_arg_pointer) + opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; + + ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; + + /* Validate -mincoming-stack-boundary= value or default it to + MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ + ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; + if (opts_set->x_ix86_incoming_stack_boundary_arg) + { + int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2; + + if (opts->x_ix86_incoming_stack_boundary_arg < min + || opts->x_ix86_incoming_stack_boundary_arg > 12) + error ("%<-mincoming-stack-boundary=%d%> is not between %d and 12", + opts->x_ix86_incoming_stack_boundary_arg, min); + else + { + ix86_user_incoming_stack_boundary + = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT; + ix86_incoming_stack_boundary + = ix86_user_incoming_stack_boundary; + } + } + +#ifndef NO_PROFILE_COUNTERS + if (flag_nop_mcount) + error ("%<-mnop-mcount%> is not compatible with this target"); +#endif + if (flag_nop_mcount && flag_pic) + error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>"); + + /* Accept -msseregparm only if at least SSE support is enabled. */ + if (TARGET_SSEREGPARM_P (opts->x_target_flags) + && ! TARGET_SSE_P (opts->x_ix86_isa_flags)) + error (main_args_p + ? G_("%<-msseregparm%> used without SSE enabled") + : G_("% used without SSE enabled")); + + if (opts_set->x_ix86_fpmath) + { + if (opts->x_ix86_fpmath & FPMATH_SSE) + { + if (!TARGET_SSE_P (opts->x_ix86_isa_flags)) + { + if (TARGET_80387_P (opts->x_target_flags)) + { + warning (0, "SSE instruction set disabled, using 387 arithmetics"); + opts->x_ix86_fpmath = FPMATH_387; + } + } + else if ((opts->x_ix86_fpmath & FPMATH_387) + && !TARGET_80387_P (opts->x_target_flags)) + { + warning (0, "387 instruction set disabled, using SSE arithmetics"); + opts->x_ix86_fpmath = FPMATH_SSE; + } + } + } + /* For all chips supporting SSE2, -mfpmath=sse performs better than + fpmath=387. The second is however default at many targets since the + extra 80bit precision of temporaries is considered to be part of ABI. + Overwrite the default at least for -ffast-math. + TODO: -mfpmath=both seems to produce same performing code with bit + smaller binaries. It is however not clear if register allocation is + ready for this setting. + Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE + codegen. We may switch to 387 with -ffast-math for size optimized + functions. */ + else if (fast_math_flags_set_p (&global_options) + && TARGET_SSE2_P (opts->x_ix86_isa_flags)) + opts->x_ix86_fpmath = FPMATH_SSE; + else + opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags); + + /* Use external vectorized library in vectorizing intrinsics. */ + if (opts_set->x_ix86_veclibabi_type) + switch (opts->x_ix86_veclibabi_type) + { + case ix86_veclibabi_type_svml: + ix86_veclib_handler = &ix86_veclibabi_svml; + break; + + case ix86_veclibabi_type_acml: + ix86_veclib_handler = &ix86_veclibabi_acml; + break; + + default: + gcc_unreachable (); + } + + if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] + && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + + /* If stack probes are required, the space used for large function + arguments on the stack must also be probed, so enable + -maccumulate-outgoing-args so this happens in the prologue. 
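The -mpreferred-stack-boundary= and -mincoming-stack-boundary= arguments validated above are the base-2 logarithm of the alignment in bytes; the code converts them to bits with (1 << arg) * BITS_PER_UNIT. A small standalone program showing that mapping, assuming BITS_PER_UNIT is 8 as on x86:

#include <stdio.h>

#define BITS_PER_UNIT 8   /* byte size in bits on x86 */

int
main (void)
{
  /* arg is log2 of the alignment in bytes, so 2..12 covers 4-byte up to
     4096-byte alignment; the option code above stores the value in bits.  */
  for (int arg = 2; arg <= 4; arg++)
    printf ("-mpreferred-stack-boundary=%d -> %d bits (%d-byte alignment)\n",
            arg, (1 << arg) * BITS_PER_UNIT, 1 << arg);
  return 0;
}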
*/ + if (TARGET_STACK_PROBE_P (opts->x_target_flags) + && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + { + if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) + warning (0, + main_args_p + ? G_("stack probing requires %<-maccumulate-outgoing-args%> " + "for correctness") + : G_("stack probing requires " + "% for " + "correctness")); + opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + } + + /* Stack realignment without -maccumulate-outgoing-args requires %ebp, + so enable -maccumulate-outgoing-args when %ebp is fixed. */ + if (fixed_regs[BP_REG] + && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + { + if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) + warning (0, + main_args_p + ? G_("fixed ebp register requires " + "%<-maccumulate-outgoing-args%>") + : G_("fixed ebp register requires " + "%")); + opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + } + + /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ + { + char *p; + ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0); + p = strchr (internal_label_prefix, 'X'); + internal_label_prefix_len = p - internal_label_prefix; + *p = '\0'; + } + + /* When scheduling description is not available, disable scheduler pass + so it won't slow down the compilation and make x87 code slower. */ + if (!TARGET_SCHEDULE) + opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0; + + maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, + ix86_tune_cost->simultaneous_prefetches, + opts->x_param_values, + opts_set->x_param_values); + maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, + ix86_tune_cost->prefetch_block, + opts->x_param_values, + opts_set->x_param_values); + maybe_set_param_value (PARAM_L1_CACHE_SIZE, + ix86_tune_cost->l1_cache_size, + opts->x_param_values, + opts_set->x_param_values); + maybe_set_param_value (PARAM_L2_CACHE_SIZE, + ix86_tune_cost->l2_cache_size, + opts->x_param_values, + opts_set->x_param_values); + + /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ + if (opts->x_flag_prefetch_loop_arrays < 0 + && HAVE_prefetch + && (opts->x_optimize >= 3 || opts->x_flag_profile_use) + && !opts->x_optimize_size + && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL) + opts->x_flag_prefetch_loop_arrays = 1; + + /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) + can be opts->x_optimized to ap = __builtin_next_arg (0). 
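The internal-label-prefix trick above can be reproduced in isolation: build a throwaway internal label whose name contains a sentinel character, then cut the buffer at the sentinel and keep the leading part as the assembler's local-label prefix. The label string below is a hypothetical expansion of the "LX" label, not the output of any particular target's ASM_GENERATE_INTERNAL_LABEL.

#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* Hypothetical result of ASM_GENERATE_INTERNAL_LABEL (buf, "LX", 0);
     the exact spelling depends on the target's LOCAL_LABEL_PREFIX.  */
  char buf[16] = ".LX0";

  /* Same steps as the hunk above: locate the sentinel 'X', record the
     prefix length, then terminate the string there.  */
  char *p = strchr (buf, 'X');
  size_t prefix_len = (size_t) (p - buf);
  *p = '\0';

  printf ("prefix \"%s\", length %zu\n", buf, prefix_len);
  return 0;
}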
*/ + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack) + targetm.expand_builtin_va_start = NULL; + + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + ix86_gen_leave = gen_leave_rex64; + if (Pmode == DImode) + { + ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di; + ix86_gen_tls_local_dynamic_base_64 + = gen_tls_local_dynamic_base_64_di; + } + else + { + ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si; + ix86_gen_tls_local_dynamic_base_64 + = gen_tls_local_dynamic_base_64_si; + } + } + else + ix86_gen_leave = gen_leave; + + if (Pmode == DImode) + { + ix86_gen_add3 = gen_adddi3; + ix86_gen_sub3 = gen_subdi3; + ix86_gen_sub3_carry = gen_subdi3_carry; + ix86_gen_one_cmpl2 = gen_one_cmpldi2; + ix86_gen_andsp = gen_anddi3; + ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di; + ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi; + ix86_gen_probe_stack_range = gen_probe_stack_rangedi; + ix86_gen_monitor = gen_sse3_monitor_di; + ix86_gen_monitorx = gen_monitorx_di; + ix86_gen_clzero = gen_clzero_di; + } + else + { + ix86_gen_add3 = gen_addsi3; + ix86_gen_sub3 = gen_subsi3; + ix86_gen_sub3_carry = gen_subsi3_carry; + ix86_gen_one_cmpl2 = gen_one_cmplsi2; + ix86_gen_andsp = gen_andsi3; + ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si; + ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi; + ix86_gen_probe_stack_range = gen_probe_stack_rangesi; + ix86_gen_monitor = gen_sse3_monitor_si; + ix86_gen_monitorx = gen_monitorx_si; + ix86_gen_clzero = gen_clzero_si; + } + +#ifdef USE_IX86_CLD + /* Use -mcld by default for 32-bit code if configured with --enable-cld. */ + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags; +#endif + + /* Set the default value for -mfentry. */ + if (!opts_set->x_flag_fentry) + opts->x_flag_fentry = TARGET_SEH; + else + { + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic + && opts->x_flag_fentry) + sorry ("%<-mfentry%> isn%'t supported for 32-bit in combination " + "with %<-fpic%>"); + else if (TARGET_SEH && !opts->x_flag_fentry) + sorry ("%<-mno-fentry%> isn%'t compatible with SEH"); + } + + if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES) + sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); + + if (!(opts_set->x_target_flags & MASK_VZEROUPPER) + && TARGET_EMIT_VZEROUPPER) + opts->x_target_flags |= MASK_VZEROUPPER; + if (!(opts_set->x_target_flags & MASK_STV)) + opts->x_target_flags |= MASK_STV; + /* Disable STV if -mpreferred-stack-boundary={2,3} or + -mincoming-stack-boundary={2,3} or -mstackrealign - the needed + stack realignment will be extra cost the pass doesn't take into + account and the pass can't realign the stack. */ + if (ix86_preferred_stack_boundary < 128 + || ix86_incoming_stack_boundary < 128 + || opts->x_ix86_force_align_arg_pointer) + opts->x_target_flags &= ~MASK_STV; + if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] + && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) + opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; + if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] + && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) + opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; + + /* Enable 128-bit AVX instruction generation + for the auto-vectorizer. 
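The ix86_gen_* assignments above pick the DImode or SImode insn generators once, at option-override time, so later expansion code calls through a function pointer instead of re-testing Pmode. The sketch below reproduces only that dispatch shape with stand-in functions; the real generators are produced from the machine description and return an rtx.

#include <stdio.h>

/* Stand-ins for insn generators such as gen_adddi3 / gen_addsi3.  */
static void gen_add_di (void) { puts ("64-bit add pattern"); }
static void gen_add_si (void) { puts ("32-bit add pattern"); }

/* Selected once, the way ix86_option_override_internal fills
   ix86_gen_add3, so callers never branch on the pointer mode again.  */
static void (*gen_add) (void);

static void
select_generators (int pmode_is_di)
{
  gen_add = pmode_is_di ? gen_add_di : gen_add_si;
}

int
main (void)
{
  select_generators (1);   /* Pmode == DImode */
  gen_add ();
  select_generators (0);   /* Pmode == SImode, e.g. -m32 or x32 */
  gen_add ();
  return 0;
}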
*/ + if (TARGET_AVX128_OPTIMAL + && (opts_set->x_prefer_vector_width_type == PVW_NONE)) + opts->x_prefer_vector_width_type = PVW_AVX128; + + /* Use 256-bit AVX instruction generation + in the auto-vectorizer. */ + if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL] + && (opts_set->x_prefer_vector_width_type == PVW_NONE)) + opts->x_prefer_vector_width_type = PVW_AVX256; + + if (opts->x_ix86_recip_name) + { + char *p = ASTRDUP (opts->x_ix86_recip_name); + char *q; + unsigned int mask, i; + bool invert; + + while ((q = strtok (p, ",")) != NULL) + { + p = NULL; + if (*q == '!') + { + invert = true; + q++; + } + else + invert = false; + + if (!strcmp (q, "default")) + mask = RECIP_MASK_ALL; + else + { + for (i = 0; i < ARRAY_SIZE (recip_options); i++) + if (!strcmp (q, recip_options[i].string)) + { + mask = recip_options[i].mask; + break; + } + + if (i == ARRAY_SIZE (recip_options)) + { + error ("unknown option for %<-mrecip=%s%>", q); + invert = false; + mask = RECIP_MASK_NONE; + } + } + + opts->x_recip_mask_explicit |= mask; + if (invert) + opts->x_recip_mask &= ~mask; + else + opts->x_recip_mask |= mask; + } + } + + if (TARGET_RECIP_P (opts->x_target_flags)) + opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit; + else if (opts_set->x_target_flags & MASK_RECIP) + opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit); + + /* Default long double to 64-bit for 32-bit Bionic and to __float128 + for 64-bit Bionic. Also default long double to 64-bit for Intel + MCU psABI. */ + if ((TARGET_HAS_BIONIC || TARGET_IAMCU) + && !(opts_set->x_target_flags + & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128))) + opts->x_target_flags |= (TARGET_64BIT + ? MASK_LONG_DOUBLE_128 + : MASK_LONG_DOUBLE_64); + + /* Only one of them can be active. */ + gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0 + || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0); + + /* Handle stack protector */ + if (!opts_set->x_ix86_stack_protector_guard) + { +#ifdef TARGET_THREAD_SSP_OFFSET + if (!TARGET_HAS_BIONIC) + opts->x_ix86_stack_protector_guard = SSP_TLS; + else +#endif + opts->x_ix86_stack_protector_guard = SSP_GLOBAL; + } + + if (opts_set->x_ix86_stack_protector_guard_offset_str) + { + char *endp; + const char *str = opts->x_ix86_stack_protector_guard_offset_str; + + errno = 0; + int64_t offset; + +#if defined(INT64_T_IS_LONG) + offset = strtol (str, &endp, 0); +#else + offset = strtoll (str, &endp, 0); +#endif + + if (!*str || *endp || errno) + error ("%qs is not a valid number " + "in %<-mstack-protector-guard-offset=%>", str); + + if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000), + HOST_WIDE_INT_C (0x7fffffff))) + error ("%qs is not a valid offset " + "in %<-mstack-protector-guard-offset=%>", str); + + opts->x_ix86_stack_protector_guard_offset = offset; + } +#ifdef TARGET_THREAD_SSP_OFFSET + else + opts->x_ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET; +#endif + + if (opts_set->x_ix86_stack_protector_guard_reg_str) + { + const char *str = opts->x_ix86_stack_protector_guard_reg_str; + addr_space_t seg = ADDR_SPACE_GENERIC; + + /* Discard optional register prefix. 
*/ + if (str[0] == '%') + str++; + + if (strlen (str) == 2 && str[1] == 's') + { + if (str[0] == 'f') + seg = ADDR_SPACE_SEG_FS; + else if (str[0] == 'g') + seg = ADDR_SPACE_SEG_GS; + } + + if (seg == ADDR_SPACE_GENERIC) + error ("%qs is not a valid base register " + "in %<-mstack-protector-guard-reg=%>", + opts->x_ix86_stack_protector_guard_reg_str); + + opts->x_ix86_stack_protector_guard_reg = seg; + } + else + { + opts->x_ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG; + + /* The kernel uses a different segment register for performance + reasons; a system call would not have to trash the userspace + segment register, which would be expensive. */ + if (opts->x_ix86_cmodel == CM_KERNEL) + opts->x_ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS; + } + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (opts->x_ix86_tune_memcpy_strategy) + { + char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (opts->x_ix86_tune_memset_strategy) + { + char *str = xstrdup (opts->x_ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } + + /* Save the initial options in case the user does function specific + options. */ + if (main_args_p) + target_option_default_node = target_option_current_node + = build_target_option_node (opts); + + if (opts->x_flag_cf_protection != CF_NONE) + opts->x_flag_cf_protection + = (cf_protection_level) (opts->x_flag_cf_protection | CF_SET); + + if (ix86_tune_features [X86_TUNE_AVOID_256FMA_CHAINS]) + maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 256, + opts->x_param_values, + opts_set->x_param_values); + else if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS]) + maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128, + opts->x_param_values, + opts_set->x_param_values); + + /* PR86952: jump table usage with retpolines is slow. + The PR provides some numbers about the slowness. */ + if (ix86_indirect_branch != indirect_branch_keep + && !opts_set->x_flag_jump_tables) + opts->x_flag_jump_tables = 0; + + return true; +} + +/* Implement the TARGET_OPTION_OVERRIDE hook. */ + +void +ix86_option_override (void) +{ + ix86_option_override_internal (true, &global_options, &global_options_set); +} + +/* Remember the last target of ix86_set_current_function. */ +static GTY(()) tree ix86_previous_fndecl; + +/* Set targets globals to the default (or current #pragma GCC target + if active). Invalidate ix86_previous_fndecl cache. */ + +void +ix86_reset_previous_fndecl (void) +{ + tree new_tree = target_option_current_node; + cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + if (TREE_TARGET_GLOBALS (new_tree)) + restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); + else if (new_tree == target_option_default_node) + restore_target_globals (&default_target_globals); + else + TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); + ix86_previous_fndecl = NULL_TREE; +} + +/* Add target attribute to SIMD clone NODE if needed. */ + +void +ix86_simd_clone_adjust (struct cgraph_node *node) +{ + const char *str = NULL; + + /* Attributes need to be adjusted for definitions, not declarations. 
*/ + if (!node->definition) + return; + + gcc_assert (node->decl == cfun->decl); + switch (node->simdclone->vecsize_mangle) + { + case 'b': + if (!TARGET_SSE2) + str = "sse2"; + break; + case 'c': + if (!TARGET_AVX) + str = "avx"; + break; + case 'd': + if (!TARGET_AVX2) + str = "avx2"; + break; + case 'e': + if (!TARGET_AVX512F) + str = "avx512f"; + break; + default: + gcc_unreachable (); + } + if (str == NULL) + return; + push_cfun (NULL); + tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str)); + bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0); + gcc_assert (ok); + pop_cfun (); + ix86_reset_previous_fndecl (); + ix86_set_current_function (node->decl); +} + + + +/* Set the func_type field from the function FNDECL. */ + +static void +ix86_set_func_type (tree fndecl) +{ + if (cfun->machine->func_type == TYPE_UNKNOWN) + { + if (lookup_attribute ("interrupt", + TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) + { + if (ix86_function_naked (fndecl)) + error_at (DECL_SOURCE_LOCATION (fndecl), + "interrupt and naked attributes are not compatible"); + + int nargs = 0; + for (tree arg = DECL_ARGUMENTS (fndecl); + arg; + arg = TREE_CHAIN (arg)) + nargs++; + cfun->machine->no_caller_saved_registers = true; + cfun->machine->func_type + = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT; + + ix86_optimize_mode_switching[X86_DIRFLAG] = 1; + + /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */ + if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG) + sorry ("only DWARF debug format is supported for interrupt " + "service routine"); + } + else + { + cfun->machine->func_type = TYPE_NORMAL; + if (lookup_attribute ("no_caller_saved_registers", + TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) + cfun->machine->no_caller_saved_registers = true; + } + } +} + +/* Set the indirect_branch_type field from the function FNDECL. */ + +static void +ix86_set_indirect_branch_type (tree fndecl) +{ + if (cfun->machine->indirect_branch_type == indirect_branch_unset) + { + tree attr = lookup_attribute ("indirect_branch", + DECL_ATTRIBUTES (fndecl)); + if (attr != NULL) + { + tree args = TREE_VALUE (attr); + if (args == NULL) + gcc_unreachable (); + tree cst = TREE_VALUE (args); + if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) + cfun->machine->indirect_branch_type = indirect_branch_keep; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) + cfun->machine->indirect_branch_type = indirect_branch_thunk; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) + cfun->machine->indirect_branch_type = indirect_branch_thunk_inline; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) + cfun->machine->indirect_branch_type = indirect_branch_thunk_extern; + else + gcc_unreachable (); + } + else + cfun->machine->indirect_branch_type = ix86_indirect_branch; + + /* -mcmodel=large is not compatible with -mindirect-branch=thunk + nor -mindirect-branch=thunk-extern. */ + if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) + && ((cfun->machine->indirect_branch_type + == indirect_branch_thunk_extern) + || (cfun->machine->indirect_branch_type + == indirect_branch_thunk))) + error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not " + "compatible", + ((cfun->machine->indirect_branch_type + == indirect_branch_thunk_extern) + ? 
"thunk-extern" : "thunk")); + + if (cfun->machine->indirect_branch_type != indirect_branch_keep + && (flag_cf_protection & CF_RETURN)) + error ("%<-mindirect-branch%> and %<-fcf-protection%> are not " + "compatible"); + } + + if (cfun->machine->function_return_type == indirect_branch_unset) + { + tree attr = lookup_attribute ("function_return", + DECL_ATTRIBUTES (fndecl)); + if (attr != NULL) + { + tree args = TREE_VALUE (attr); + if (args == NULL) + gcc_unreachable (); + tree cst = TREE_VALUE (args); + if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) + cfun->machine->function_return_type = indirect_branch_keep; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) + cfun->machine->function_return_type = indirect_branch_thunk; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) + cfun->machine->function_return_type = indirect_branch_thunk_inline; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) + cfun->machine->function_return_type = indirect_branch_thunk_extern; + else + gcc_unreachable (); + } + else + cfun->machine->function_return_type = ix86_function_return; + + /* -mcmodel=large is not compatible with -mfunction-return=thunk + nor -mfunction-return=thunk-extern. */ + if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) + && ((cfun->machine->function_return_type + == indirect_branch_thunk_extern) + || (cfun->machine->function_return_type + == indirect_branch_thunk))) + error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not " + "compatible", + ((cfun->machine->function_return_type + == indirect_branch_thunk_extern) + ? "thunk-extern" : "thunk")); + + if (cfun->machine->function_return_type != indirect_branch_keep + && (flag_cf_protection & CF_RETURN)) + error ("%<-mfunction-return%> and %<-fcf-protection%> are not " + "compatible"); + } +} + +/* Establish appropriate back-end context for processing the function + FNDECL. The argument might be NULL to indicate processing at top + level, outside of any function scope. */ +void +ix86_set_current_function (tree fndecl) +{ + /* Only change the context if the function changes. This hook is called + several times in the course of compiling a function, and we don't want to + slow things down too much or call target_reinit when it isn't safe. */ + if (fndecl == ix86_previous_fndecl) + { + /* There may be 2 function bodies for the same function FNDECL, + one is extern inline and one isn't. Call ix86_set_func_type + to set the func_type field. 
*/ + if (fndecl != NULL_TREE) + { + ix86_set_func_type (fndecl); + ix86_set_indirect_branch_type (fndecl); + } + return; + } + + tree old_tree; + if (ix86_previous_fndecl == NULL_TREE) + old_tree = target_option_current_node; + else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)) + old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl); + else + old_tree = target_option_default_node; + + if (fndecl == NULL_TREE) + { + if (old_tree != target_option_current_node) + ix86_reset_previous_fndecl (); + return; + } + + ix86_set_func_type (fndecl); + ix86_set_indirect_branch_type (fndecl); + + tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); + if (new_tree == NULL_TREE) + new_tree = target_option_default_node; + + if (old_tree != new_tree) + { + cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + if (TREE_TARGET_GLOBALS (new_tree)) + restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); + else if (new_tree == target_option_default_node) + restore_target_globals (&default_target_globals); + else + TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); + } + ix86_previous_fndecl = fndecl; + + static bool prev_no_caller_saved_registers; + + /* 64-bit MS and SYSV ABI have different set of call used registers. + Avoid expensive re-initialization of init_regs each time we switch + function context. */ + if (TARGET_64BIT + && (call_used_or_fixed_reg_p (SI_REG) + == (cfun->machine->call_abi == MS_ABI))) + reinit_regs (); + /* Need to re-initialize init_regs if caller-saved registers are + changed. */ + else if (prev_no_caller_saved_registers + != cfun->machine->no_caller_saved_registers) + reinit_regs (); + + if (cfun->machine->func_type != TYPE_NORMAL + || cfun->machine->no_caller_saved_registers) + { + /* Don't allow SSE, MMX nor x87 instructions since they + may change processor state. */ + const char *isa; + if (TARGET_SSE) + isa = "SSE"; + else if (TARGET_MMX) + isa = "MMX/3Dnow"; + else if (TARGET_80387) + isa = "80387"; + else + isa = NULL; + if (isa != NULL) + { + if (cfun->machine->func_type != TYPE_NORMAL) + sorry (cfun->machine->func_type == TYPE_EXCEPTION + ? G_("%s instructions aren%'t allowed in an" + " exception service routine") + : G_("%s instructions aren%'t allowed in an" + " interrupt service routine"), + isa); + else + sorry ("%s instructions aren%'t allowed in a function with " + "the % attribute", isa); + /* Don't issue the same error twice. */ + cfun->machine->func_type = TYPE_NORMAL; + cfun->machine->no_caller_saved_registers = false; + } + } + + prev_no_caller_saved_registers + = cfun->machine->no_caller_saved_registers; +} + +/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ +char * +ix86_offload_options (void) +{ + if (TARGET_LP64) + return xstrdup ("-foffload-abi=lp64"); + return xstrdup ("-foffload-abi=ilp32"); +} + +/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", + and "sseregparm" calling convention attributes; + arguments as in struct attribute_spec.handler. */ + +static tree +ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine regparm with all attributes but fastcall, and thiscall. 
*/ + if (is_attribute_p ("regparm", name)) + { + tree cst; + + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and regparm attributes are not compatible"); + } + + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("regparam and thiscall attributes are not compatible"); + } + + cst = TREE_VALUE (args); + if (TREE_CODE (cst) != INTEGER_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires an integer constant argument", + name); + *no_add_attrs = true; + } + else if (compare_tree_int (cst, REGPARM_MAX) > 0) + { + warning (OPT_Wattributes, "argument to %qE attribute larger than %d", + name, REGPARM_MAX); + *no_add_attrs = true; + } + + return NULL_TREE; + } + + if (TARGET_64BIT) + { + /* Do not warn when emulating the MS ABI. */ + if ((TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE) + || ix86_function_type_abi (*node) != MS_ABI) + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine fastcall with stdcall (redundant) and sseregparm. */ + if (is_attribute_p ("fastcall", name)) + { + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and stdcall attributes are not compatible"); + } + if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and regparm attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } + } + + /* Can combine stdcall with fastcall (redundant), regparm and + sseregparm. */ + else if (is_attribute_p ("stdcall", name)) + { + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and fastcall attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } + } + + /* Can combine cdecl with regparm and sseregparm. */ + else if (is_attribute_p ("cdecl", name)) + { + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } + } + else if (is_attribute_p ("thiscall", name)) + { + if (TREE_CODE (*node) != METHOD_TYPE && pedantic) + warning (OPT_Wattributes, "%qE attribute is used for non-class method", + name); + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } + } + + /* Can combine sseregparm with all attributes. 
*/ + + return NULL_TREE; +} + +#ifndef CHECK_STACK_LIMIT +#define CHECK_STACK_LIMIT (-1) +#endif + +/* The transactional memory builtins are implicitly regparm or fastcall + depending on the ABI. Override the generic do-nothing attribute that + these builtins were declared with, and replace it with one of the two + attributes that we expect elsewhere. */ + +static tree +ix86_handle_tm_regparm_attribute (tree *node, tree, tree, + int flags, bool *no_add_attrs) +{ + tree alt; + + /* In no case do we want to add the placeholder attribute. */ + *no_add_attrs = true; + + /* The 64-bit ABI is unchanged for transactional memory. */ + if (TARGET_64BIT) + return NULL_TREE; + + /* ??? Is there a better way to validate 32-bit windows? We have + cfun->machine->call_abi, but that seems to be set only for 64-bit. */ + if (CHECK_STACK_LIMIT > 0) + alt = tree_cons (get_identifier ("fastcall"), NULL, NULL); + else + { + alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL); + alt = tree_cons (get_identifier ("regparm"), alt, NULL); + } + decl_attributes (node, alt, flags); + + return NULL_TREE; +} + +/* Handle a "force_align_arg_pointer" attribute. */ + +static tree +ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name, + tree, int, bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in + struct attribute_spec.handler. */ + +static tree +ix86_handle_struct_attribute (tree *node, tree name, tree, int, + bool *no_add_attrs) +{ + tree *type = NULL; + if (DECL_P (*node)) + { + if (TREE_CODE (*node) == TYPE_DECL) + type = &TREE_TYPE (*node); + } + else + type = node; + + if (!(type && RECORD_OR_UNION_TYPE_P (*type))) + { + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + } + + else if ((is_attribute_p ("ms_struct", name) + && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type))) + || ((is_attribute_p ("gcc_struct", name) + && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type))))) + { + warning (OPT_Wattributes, "%qE incompatible attribute ignored", + name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +/* Handle a "callee_pop_aggregate_return" attribute; arguments as + in struct attribute_spec handler. 
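Typical uses of two of the attributes whose handlers appear above: force_align_arg_pointer is the per-function form of -mstackrealign, and ms_struct / gcc_struct select the record layout rules; combining the two on one type is diagnosed as incompatible by ix86_handle_struct_attribute.

/* Realign the stack on entry, e.g. for a callback invoked by legacy
   32-bit code that only guarantees 4-byte stack alignment.  */
__attribute__ ((force_align_arg_pointer))
void
from_legacy_abi (void)
{
}

/* Layout attributes handled by ix86_handle_struct_attribute.  */
struct __attribute__ ((ms_struct))  ms_layout  { char c; int b : 7; };
struct __attribute__ ((gcc_struct)) gcc_layout { char c; int b : 7; };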
*/ + +static tree +ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + if (TARGET_64BIT) + { + warning (OPT_Wattributes, "%qE attribute only available for 32-bit", + name); + *no_add_attrs = true; + return NULL_TREE; + } + if (is_attribute_p ("callee_pop_aggregate_return", name)) + { + tree cst; + + cst = TREE_VALUE (args); + if (TREE_CODE (cst) != INTEGER_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires an integer constant argument", + name); + *no_add_attrs = true; + } + else if (compare_tree_int (cst, 0) != 0 + && compare_tree_int (cst, 1) != 0) + { + warning (OPT_Wattributes, + "argument to %qE attribute is neither zero, nor one", + name); + *no_add_attrs = true; + } + + return NULL_TREE; + } + + return NULL_TREE; +} + +/* Handle a "ms_abi" or "sysv" attribute; arguments as in + struct attribute_spec.handler. */ + +static tree +ix86_handle_abi_attribute (tree *node, tree name, tree, int, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine regparm with all attributes but fastcall. */ + if (is_attribute_p ("ms_abi", name)) + { + if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node))) + { + error ("%qs and %qs attributes are not compatible", + "ms_abi", "sysv_abi"); + } + + return NULL_TREE; + } + else if (is_attribute_p ("sysv_abi", name)) + { + if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node))) + { + error ("%qs and %qs attributes are not compatible", + "ms_abi", "sysv_abi"); + } + + return NULL_TREE; + } + + return NULL_TREE; +} + +static tree +ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + + if (is_attribute_p ("indirect_branch", name)) + { + tree cst = TREE_VALUE (args); + if (TREE_CODE (cst) != STRING_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires a string constant argument", + name); + *no_add_attrs = true; + } + else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) + { + warning (OPT_Wattributes, + "argument to %qE attribute is not " + "(keep|thunk|thunk-inline|thunk-extern)", name); + *no_add_attrs = true; + } + } + + if (is_attribute_p ("function_return", name)) + { + tree cst = TREE_VALUE (args); + if (TREE_CODE (cst) != STRING_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires a string constant argument", + name); + *no_add_attrs = true; + } + else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) + { + warning (OPT_Wattributes, + "argument to %qE attribute is not " + 
"(keep|thunk|thunk-inline|thunk-extern)", name); + *no_add_attrs = true; + } + } + + return NULL_TREE; +} + +static tree +ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree, + int, bool *) +{ + return NULL_TREE; +} + +static tree +ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *) +{ + /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet, + but the function type contains args and return type data. */ + tree func_type = *node; + tree return_type = TREE_TYPE (func_type); + + int nargs = 0; + tree current_arg_type = TYPE_ARG_TYPES (func_type); + while (current_arg_type + && ! VOID_TYPE_P (TREE_VALUE (current_arg_type))) + { + if (nargs == 0) + { + if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type))) + error ("interrupt service routine should have a pointer " + "as the first argument"); + } + else if (nargs == 1) + { + if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE + || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode) + error ("interrupt service routine should have %qs " + "as the second argument", + TARGET_64BIT + ? (TARGET_X32 ? "unsigned long long int" + : "unsigned long int") + : "unsigned int"); + } + nargs++; + current_arg_type = TREE_CHAIN (current_arg_type); + } + if (!nargs || nargs > 2) + error ("interrupt service routine can only have a pointer argument " + "and an optional integer argument"); + if (! VOID_TYPE_P (return_type)) + error ("interrupt service routine must return %"); + + return NULL_TREE; +} + +/* Handle fentry_name / fentry_section attribute. */ + +static tree +ix86_handle_fentry_name (tree *node, tree name, tree args, + int, bool *no_add_attrs) +{ + if (TREE_CODE (*node) == FUNCTION_DECL + && TREE_CODE (TREE_VALUE (args)) == STRING_CST) + /* Do nothing else, just set the attribute. We'll get at + it later with lookup_attribute. */ + ; + else + { + warning (OPT_Wattributes, "%qE attribute ignored", name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +/* Table of valid machine attributes. */ +const struct attribute_spec ix86_attribute_table[] = +{ + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, + affects_type_identity, handler, exclude } */ + /* Stdcall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Fastcall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Thiscall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Cdecl attribute says the callee is a normal C declaration */ + { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Regparm attribute specifies how many integer arguments are to be + passed in registers. */ + { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Sseregparm attribute says we are using x86_64 calling conventions + for FP arguments. */ + { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* The transactional memory builtins are implicitly regparm or fastcall + depending on the ABI. Override the generic do-nothing attribute that + these builtins were declared with. 
*/ + { "*tm regparm", 0, 0, false, true, true, true, + ix86_handle_tm_regparm_attribute, NULL }, + /* force_align_arg_pointer says this function realigns the stack at entry. */ + { "force_align_arg_pointer", 0, 0, + false, true, true, false, ix86_handle_force_align_arg_pointer_attribute, + NULL }, +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES + { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, + NULL }, + { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, + NULL }, + { "shared", 0, 0, true, false, false, false, + ix86_handle_shared_attribute, NULL }, +#endif + { "ms_struct", 0, 0, false, false, false, false, + ix86_handle_struct_attribute, NULL }, + { "gcc_struct", 0, 0, false, false, false, false, + ix86_handle_struct_attribute, NULL }, +#ifdef SUBTARGET_ATTRIBUTE_TABLE + SUBTARGET_ATTRIBUTE_TABLE, +#endif + /* ms_abi and sysv_abi calling convention function attributes. */ + { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL }, + { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, + NULL }, + { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, + { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, + { "ms_hook_prologue", 0, 0, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + { "callee_pop_aggregate_return", 1, 1, false, true, true, true, + ix86_handle_callee_pop_aggregate_return, NULL }, + { "interrupt", 0, 0, false, true, true, false, + ix86_handle_interrupt_attribute, NULL }, + { "no_caller_saved_registers", 0, 0, false, true, true, false, + ix86_handle_no_caller_saved_registers_attribute, NULL }, + { "naked", 0, 0, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + { "indirect_branch", 1, 1, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + { "function_return", 1, 1, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + { "indirect_return", 0, 0, false, true, true, false, + NULL, NULL }, + { "fentry_name", 1, 1, true, false, false, false, + ix86_handle_fentry_name, NULL }, + { "fentry_section", 1, 1, true, false, false, false, + ix86_handle_fentry_name, NULL }, + { "cf_check", 0, 0, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + + /* End element. */ + { NULL, 0, 0, false, false, false, false, NULL, NULL } +}; + +#include "gt-i386-options.h" diff --git a/gcc/config/i386/i386-options.h b/gcc/config/i386/i386-options.h new file mode 100644 index 000000000..817ddda5c --- /dev/null +++ b/gcc/config/i386/i386-options.h @@ -0,0 +1,95 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#ifndef GCC_I386_OPTIONS_H +#define GCC_I386_OPTIONS_H + +char *ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, + int flags, int flags2, + const char *arch, const char *tune, + enum fpmath_unit fpmath, bool add_nl_p, + bool add_abi_p); + +extern enum attr_cpu ix86_schedule; + +extern enum processor_type ix86_tune; +extern enum processor_type ix86_arch; +extern unsigned char x86_prefetch_sse; +extern const struct processor_costs *ix86_tune_cost; + +extern int ix86_tune_defaulted; +extern int ix86_arch_specified; + +extern unsigned int ix86_default_incoming_stack_boundary; +extern HOST_WIDE_INT deferred_isa_values; +extern HOST_WIDE_INT deferred_isa_values2; + +extern unsigned int ix86_preferred_stack_boundary; +extern unsigned int ix86_user_incoming_stack_boundary; +extern unsigned int ix86_default_incoming_stack_boundary; +extern unsigned int ix86_incoming_stack_boundary; + +extern char *ix86_offload_options (void); +extern void ix86_option_override (void); +extern void ix86_override_options_after_change (void); +void ix86_set_current_function (tree fndecl); +bool ix86_function_naked (const_tree fn); +void ix86_simd_clone_adjust (struct cgraph_node *node); + +extern tree (*ix86_veclib_handler) (combined_fn, tree, tree); +extern tree ix86_veclibabi_svml (combined_fn, tree, tree); +extern tree ix86_veclibabi_acml (combined_fn, tree, tree); + +extern rtx (*ix86_gen_leave) (void); +extern rtx (*ix86_gen_add3) (rtx, rtx, rtx); +extern rtx (*ix86_gen_sub3) (rtx, rtx, rtx); +extern rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); +extern rtx (*ix86_gen_one_cmpl2) (rtx, rtx); +extern rtx (*ix86_gen_monitor) (rtx, rtx, rtx); +extern rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); +extern rtx (*ix86_gen_clzero) (rtx); +extern rtx (*ix86_gen_andsp) (rtx, rtx, rtx); +extern rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); +extern rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); +extern rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); +extern rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); +extern rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); + +enum ix86_function_specific_strings +{ + IX86_FUNCTION_SPECIFIC_ARCH, + IX86_FUNCTION_SPECIFIC_TUNE, + IX86_FUNCTION_SPECIFIC_MAX +}; + +extern const char *stringop_alg_names[]; + +void ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2); +void ix86_function_specific_save (struct cl_target_option *, + struct gcc_options *opts); +void ix86_function_specific_restore (struct gcc_options *opts, + struct cl_target_option *); +void ix86_function_specific_post_stream_in (struct cl_target_option *); +void ix86_function_specific_print (FILE *, int, + struct cl_target_option *); +bool ix86_valid_target_attribute_p (tree, tree, tree, int); + +extern const struct attribute_spec ix86_attribute_table[]; + + +#endif /* GCC_I386_OPTIONS_H */ diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 83645e89a..4afba5bc2 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -65,7 +65,7 @@ extern int avx_vpermilp_parallel (rtx par, machine_mode mode); extern int avx_vperm2f128_parallel (rtx par, machine_mode mode); extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); -extern bool ix86_expand_set_or_movmem (rtx, rtx, rtx, rtx, rtx, rtx, +extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, bool); extern bool constant_address_p (rtx); @@ -207,7 +207,7 @@ extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); #endif /* 
RTX_CODE */ #ifdef TREE_CODE -extern int ix86_data_alignment (tree, int, bool); +extern int ix86_data_alignment (tree, unsigned int, bool); extern unsigned int ix86_local_alignment (tree, machine_mode, unsigned int); extern unsigned int ix86_minimum_alignment (tree, machine_mode, @@ -215,9 +215,9 @@ extern unsigned int ix86_minimum_alignment (tree, machine_mode, extern tree ix86_handle_shared_attribute (tree *, tree, tree, int, bool *); extern tree ix86_handle_selectany_attribute (tree *, tree, tree, int, bool *); extern int x86_field_alignment (tree, int); -extern tree ix86_valid_target_attribute_tree (tree, +extern tree ix86_valid_target_attribute_tree (tree, tree, struct gcc_options *, - struct gcc_options *); + struct gcc_options *, bool); extern unsigned int ix86_get_callcvt (const_tree); #endif diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5a0f8a0eb..9282a8fb6 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -91,19 +91,17 @@ along with GCC; see the file COPYING3. If not see #include "tree-vector-builder.h" #include "debug.h" #include "dwarf2out.h" +#include "i386-options.h" +#include "i386-builtins.h" +#include "i386-expand.h" +#include "i386-features.h" /* This file should be included last. */ #include "target-def.h" -#include "x86-tune-costs.h" - static rtx legitimize_dllimport_symbol (rtx, bool); static rtx legitimize_pe_coff_extern_decl (rtx, bool); -static rtx legitimize_pe_coff_symbol (rtx, bool); static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool); -static bool ix86_save_reg (unsigned int, bool, bool); -static bool ix86_function_naked (const_tree); -static bool ix86_notrack_prefixed_insn_p (rtx); static void ix86_emit_restore_reg_using_pop (rtx); @@ -126,102 +124,6 @@ const struct processor_costs *ix86_tune_cost = NULL; /* Set by -mtune or -Os. */ const struct processor_costs *ix86_cost = NULL; -/* Processor feature/optimization bitmasks. */ -#define m_386 (HOST_WIDE_INT_1U<machine->call_ms2sysv_extra_regs and - 3.) rather or not stack alignment is being performed. */ - static rtx get_stub_rtx (enum xlogue_stub stub); - - /* Returns the amount of stack space (including padding) that the stub - needs to store registers based upon data in the machine_function. */ - HOST_WIDE_INT get_stack_space_used () const - { - const struct machine_function *m = cfun->machine; - unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1; - - gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS); - return m_regs[last_reg].offset + STUB_INDEX_OFFSET; - } - - /* Returns the offset for the base pointer used by the stub. 
*/ - HOST_WIDE_INT get_stub_ptr_offset () const - { - return STUB_INDEX_OFFSET + m_stack_align_off_in; - } - - static const struct xlogue_layout &get_instance (); - static unsigned count_stub_managed_regs (); - static bool is_stub_managed_reg (unsigned regno, unsigned count); - - static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70; - static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS; - static const unsigned MAX_REGS = 18; - static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS; - static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1; - static const unsigned STUB_NAME_MAX_LEN = 20; - static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT]; - static const unsigned REG_ORDER[MAX_REGS]; - static const unsigned REG_ORDER_REALIGN[MAX_REGS]; - -private: - xlogue_layout (); - xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp); - xlogue_layout (const xlogue_layout &); - - /* True if hard frame pointer is used. */ - bool m_hfp; - - /* Max number of register this layout manages. */ - unsigned m_nregs; - - /* Incoming offset from 16-byte alignment. */ - HOST_WIDE_INT m_stack_align_off_in; - - /* Register order and offsets. */ - struct reginfo m_regs[MAX_REGS]; - - /* Lazy-inited cache of symbol names for stubs. */ - static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] - [STUB_NAME_MAX_LEN]; - - static const xlogue_layout s_instances[XLOGUE_SET_COUNT]; -}; - -const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = { - "savms64", - "resms64", - "resms64x", - "savms64f", - "resms64f", - "resms64fx" -}; - -const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = { -/* The below offset values are where each register is stored for the layout - relative to incoming stack pointer. The value of each m_regs[].offset will - be relative to the incoming base pointer (rax or rsi) used by the stub. - - s_instances: 0 1 2 3 - Offset: realigned or aligned + 8 - Register aligned aligned + 8 aligned w/HFP w/HFP */ - XMM15_REG, /* 0x10 0x18 0x10 0x18 */ - XMM14_REG, /* 0x20 0x28 0x20 0x28 */ - XMM13_REG, /* 0x30 0x38 0x30 0x38 */ - XMM12_REG, /* 0x40 0x48 0x40 0x48 */ - XMM11_REG, /* 0x50 0x58 0x50 0x58 */ - XMM10_REG, /* 0x60 0x68 0x60 0x68 */ - XMM9_REG, /* 0x70 0x78 0x70 0x78 */ - XMM8_REG, /* 0x80 0x88 0x80 0x88 */ - XMM7_REG, /* 0x90 0x98 0x90 0x98 */ - XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */ - SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */ - DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */ - BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */ - BP_REG, /* 0xc0 0xc8 N/A N/A */ - R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */ - R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */ - R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */ - R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */ -}; - -/* Instantiate static const values. */ -const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET; -const unsigned xlogue_layout::MIN_REGS; -const unsigned xlogue_layout::MAX_REGS; -const unsigned xlogue_layout::MAX_EXTRA_REGS; -const unsigned xlogue_layout::VARIANT_COUNT; -const unsigned xlogue_layout::STUB_NAME_MAX_LEN; - -/* Initialize xlogue_layout::s_stub_names to zero. */ -char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] - [STUB_NAME_MAX_LEN]; - -/* Instantiates all xlogue_layout instances. */ -const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = { - xlogue_layout (0, false), - xlogue_layout (8, false), - xlogue_layout (0, true), - xlogue_layout (8, true) -}; - -/* Return an appropriate const instance of xlogue_layout based upon values - in cfun->machine and crtl. 
*/ -const struct xlogue_layout & -xlogue_layout::get_instance () -{ - enum xlogue_stub_sets stub_set; - bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in; - - if (stack_realign_fp) - stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; - else if (frame_pointer_needed) - stub_set = aligned_plus_8 - ? XLOGUE_SET_HFP_ALIGNED_PLUS_8 - : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; - else - stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED; - - return s_instances[stub_set]; -} - -/* Determine how many clobbered registers can be saved by the stub. - Returns the count of registers the stub will save and restore. */ -unsigned -xlogue_layout::count_stub_managed_regs () -{ - bool hfp = frame_pointer_needed || stack_realign_fp; - unsigned i, count; - unsigned regno; - - for (count = i = MIN_REGS; i < MAX_REGS; ++i) - { - regno = REG_ORDER[i]; - if (regno == BP_REG && hfp) - continue; - if (!ix86_save_reg (regno, false, false)) - break; - ++count; - } - return count; -} - -/* Determine if register REGNO is a stub managed register given the - total COUNT of stub managed registers. */ -bool -xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count) -{ - bool hfp = frame_pointer_needed || stack_realign_fp; - unsigned i; - - for (i = 0; i < count; ++i) - { - gcc_assert (i < MAX_REGS); - if (REG_ORDER[i] == BP_REG && hfp) - ++count; - else if (REG_ORDER[i] == regno) - return true; - } - return false; -} - -/* Constructor for xlogue_layout. */ -xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp) - : m_hfp (hfp) , m_nregs (hfp ? 17 : 18), - m_stack_align_off_in (stack_align_off_in) -{ - HOST_WIDE_INT offset = stack_align_off_in; - unsigned i, j; - - for (i = j = 0; i < MAX_REGS; ++i) - { - unsigned regno = REG_ORDER[i]; - - if (regno == BP_REG && hfp) - continue; - if (SSE_REGNO_P (regno)) - { - offset += 16; - /* Verify that SSE regs are always aligned. */ - gcc_assert (!((stack_align_off_in + offset) & 15)); - } - else - offset += 8; - - m_regs[j].regno = regno; - m_regs[j++].offset = offset - STUB_INDEX_OFFSET; - } - gcc_assert (j == m_nregs); -} - -const char * -xlogue_layout::get_stub_name (enum xlogue_stub stub, - unsigned n_extra_regs) -{ - const int have_avx = TARGET_AVX; - char *name = s_stub_names[!!have_avx][stub][n_extra_regs]; - - /* Lazy init */ - if (!*name) - { - int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u", - (have_avx ? "avx" : "sse"), - STUB_BASE_NAMES[stub], - MIN_REGS + n_extra_regs); - gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN); - } - - return name; -} - -/* Return rtx of a symbol ref for the entry point (based upon - cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */ -rtx -xlogue_layout::get_stub_rtx (enum xlogue_stub stub) -{ - const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs; - gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS); - gcc_assert (stub < XLOGUE_STUB_COUNT); - gcc_assert (crtl->stack_realign_finalized); - - return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs)); -} - /* Define the structure for the machine field in struct function. */ struct GTY(()) stack_local_entry { @@ -741,41 +349,37 @@ enum processor_type ix86_arch; /* True if processor has SSE prefetch instruction. 
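The get_stub_name code above (removed here as part of the file split) assembles the out-of-line save/restore stub symbols from an SSE/AVX prefix, a base name from STUB_BASE_NAMES, and a managed-register count. Below is a standalone sketch of the same formatting; the register counts passed in main are illustrative values in the MIN_REGS..MAX_REGS range, not anything computed from a real frame.

#include <stdio.h>

/* Mirrors the snprintf in xlogue_layout::get_stub_name above.  */
static void
format_stub_name (char *buf, size_t len, int have_avx,
                  const char *base, unsigned int nregs)
{
  snprintf (buf, len, "__%s_%s_%u", have_avx ? "avx" : "sse", base, nregs);
}

int
main (void)
{
  char name[20];                        /* STUB_NAME_MAX_LEN in the source */
  format_stub_name (name, sizeof name, 1, "savms64", 12);
  puts (name);                          /* __avx_savms64_12 */
  format_stub_name (name, sizeof name, 0, "resms64fx", 18);
  puts (name);                          /* __sse_resms64fx_18 */
  return 0;
}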
*/ unsigned char x86_prefetch_sse; -/* -mstackrealign option */ -static const char ix86_force_align_arg_pointer_string[] - = "force_align_arg_pointer"; - -static rtx (*ix86_gen_leave) (void); -static rtx (*ix86_gen_add3) (rtx, rtx, rtx); -static rtx (*ix86_gen_sub3) (rtx, rtx, rtx); -static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); -static rtx (*ix86_gen_one_cmpl2) (rtx, rtx); -static rtx (*ix86_gen_monitor) (rtx, rtx, rtx); -static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); -static rtx (*ix86_gen_clzero) (rtx); -static rtx (*ix86_gen_andsp) (rtx, rtx, rtx); -static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); -static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); -static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); -static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); -static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); +rtx (*ix86_gen_leave) (void); +rtx (*ix86_gen_add3) (rtx, rtx, rtx); +rtx (*ix86_gen_sub3) (rtx, rtx, rtx); +rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); +rtx (*ix86_gen_one_cmpl2) (rtx, rtx); +rtx (*ix86_gen_monitor) (rtx, rtx, rtx); +rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); +rtx (*ix86_gen_clzero) (rtx); +rtx (*ix86_gen_andsp) (rtx, rtx, rtx); +rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); +rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); +rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); +rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); +rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); /* Preferred alignment for stack boundary in bits. */ unsigned int ix86_preferred_stack_boundary; /* Alignment for incoming stack boundary in bits specified at command line. */ -static unsigned int ix86_user_incoming_stack_boundary; +unsigned int ix86_user_incoming_stack_boundary; /* Default alignment for incoming stack boundary in bits. */ -static unsigned int ix86_default_incoming_stack_boundary; +unsigned int ix86_default_incoming_stack_boundary; /* Alignment for incoming stack boundary in bits. */ unsigned int ix86_incoming_stack_boundary; /* Calling abi specific va_list type nodes. */ -static GTY(()) tree sysv_va_list_type_node; -static GTY(()) tree ms_va_list_type_node; +tree sysv_va_list_type_node; +tree ms_va_list_type_node; /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. 
*/ char internal_label_prefix[16]; @@ -813,7 +417,6 @@ static REAL_VALUE_TYPE ext_80387_constants_table [5]; static bool ext_80387_constants_init; -static struct machine_function * ix86_init_machine_status (void); static rtx ix86_function_value (const_tree, const_tree, bool); static bool ix86_function_value_regno_p (const unsigned int); static unsigned int ix86_function_arg_boundary (machine_mode, @@ -821,49173 +424,20710 @@ static unsigned int ix86_function_arg_boundary (machine_mode, static rtx ix86_static_chain (const_tree, bool); static int ix86_function_regparm (const_tree, const_tree); static void ix86_compute_frame_layout (void); -static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, - rtx, rtx, int); -static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT); static tree ix86_canonical_va_list_type (tree); -static void predict_jump (int); static unsigned int split_stack_prologue_scratch_regno (void); static bool i386_asm_output_addr_const_extra (FILE *, rtx); -enum ix86_function_specific_strings -{ - IX86_FUNCTION_SPECIFIC_ARCH, - IX86_FUNCTION_SPECIFIC_TUNE, - IX86_FUNCTION_SPECIFIC_MAX -}; - -static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int, - const char *, const char *, enum fpmath_unit, - bool, bool); -static void ix86_function_specific_save (struct cl_target_option *, - struct gcc_options *opts); -static void ix86_function_specific_restore (struct gcc_options *opts, - struct cl_target_option *); -static void ix86_function_specific_post_stream_in (struct cl_target_option *); -static void ix86_function_specific_print (FILE *, int, - struct cl_target_option *); -static bool ix86_valid_target_attribute_p (tree, tree, tree, int); -static bool ix86_valid_target_attribute_inner_p (tree, char *[], - struct gcc_options *, - struct gcc_options *, - struct gcc_options *); static bool ix86_can_inline_p (tree, tree); -static void ix86_set_current_function (tree); static unsigned int ix86_minimum_incoming_stack_boundary (bool); -static enum calling_abi ix86_function_abi (const_tree); - -#ifndef SUBTARGET32_DEFAULT_CPU -#define SUBTARGET32_DEFAULT_CPU "i386" -#endif - /* Whether -mtune= or -march= were specified */ -static int ix86_tune_defaulted; -static int ix86_arch_specified; - -/* Vectorization library interface and handlers. */ -static tree (*ix86_veclib_handler) (combined_fn, tree, tree); - -static tree ix86_veclibabi_svml (combined_fn, tree, tree); -static tree ix86_veclibabi_acml (combined_fn, tree, tree); - -/* This table must be in sync with enum processor_type in i386.h. */ -static const struct processor_costs *processor_cost_table[] = -{ - &generic_cost, - &i386_cost, - &i486_cost, - &pentium_cost, - &lakemont_cost, - &pentiumpro_cost, - &pentium4_cost, - &nocona_cost, - &core_cost, - &core_cost, - &core_cost, - &core_cost, - &atom_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &intel_cost, - &geode_cost, - &k6_cost, - &athlon_cost, - &k8_cost, - &amdfam10_cost, - &bdver_cost, - &bdver_cost, - &bdver_cost, - &bdver_cost, - &btver1_cost, - &btver2_cost, - &znver1_cost, - &znver2_cost -}; - -/* Guarantee that the array is aligned with enum processor_type. 
*/ -STATIC_ASSERT (ARRAY_SIZE (processor_cost_table) == PROCESSOR_max); +int ix86_tune_defaulted; +int ix86_arch_specified; -static unsigned int -rest_of_handle_insert_vzeroupper (void) -{ - int i; - - /* vzeroupper instructions are inserted immediately after reload to - account for possible spills from 256bit or 512bit registers. The pass - reuses mode switching infrastructure by re-running mode insertion - pass, so disable entities that have already been processed. */ - for (i = 0; i < MAX_386_ENTITIES; i++) - ix86_optimize_mode_switching[i] = 0; +/* Return true if a red-zone is in use. We can't use red-zone when + there are local indirect jumps, like "indirect_jump" or "tablejump", + which jumps to another place in the function, since "call" in the + indirect thunk pushes the return address onto stack, destroying + red-zone. - ix86_optimize_mode_switching[AVX_U128] = 1; + TODO: If we can reserve the first 2 WORDs, for PUSH and, another + for CALL, in red-zone, we can allow local indirect jumps with + indirect thunk. */ - /* Call optimize_mode_switching. */ - g->get_passes ()->execute_pass_mode_switching (); - return 0; +bool +ix86_using_red_zone (void) +{ + return (TARGET_RED_ZONE + && !TARGET_64BIT_MS_ABI + && (!cfun->machine->has_local_indirect_jump + || cfun->machine->indirect_branch_type == indirect_branch_keep)); } - -/* Return 1 if INSN uses or defines a hard register. - Hard register uses in a memory address are ignored. - Clobbers and flags definitions are ignored. */ - + +/* Return true, if profiling code should be emitted before + prologue. Otherwise it returns false. + Note: For x86 with "hotfix" it is sorried. */ static bool -has_non_address_hard_reg (rtx_insn *insn) +ix86_profile_before_prologue (void) { - df_ref ref; - FOR_EACH_INSN_DEF (ref, insn) - if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) - && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) - && DF_REF_REGNO (ref) != FLAGS_REG) - return true; - - FOR_EACH_INSN_USE (ref, insn) - if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) - return true; - - return false; + return flag_fentry != 0; } -/* Check if comparison INSN may be transformed - into vector comparison. Currently we transform - zero checks only which look like: - - (set (reg:CCZ 17 flags) - (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) - (subreg:SI (reg:DI x) 0)) - (const_int 0 [0]))) */ +/* Update register usage after having seen the compiler flags. */ -static bool -convertible_comparison_p (rtx_insn *insn) +static void +ix86_conditional_register_usage (void) { - if (!TARGET_SSE4_1) - return false; + int i, c_mask; - rtx def_set = single_set (insn); + /* If there are no caller-saved registers, preserve all registers. + except fixed_regs and registers used for function return value + since aggregate_value_p checks call_used_regs[regno] on return + value. */ + if (cfun && cfun->machine->no_caller_saved_registers) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (!fixed_regs[i] && !ix86_function_value_regno_p (i)) + call_used_regs[i] = 0; - gcc_assert (def_set); + /* For 32-bit targets, disable the REX registers. */ + if (! 
TARGET_64BIT) + { + for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++) + CLEAR_HARD_REG_BIT (accessible_reg_set, i); + for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) + CLEAR_HARD_REG_BIT (accessible_reg_set, i); + for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) + CLEAR_HARD_REG_BIT (accessible_reg_set, i); + } - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); + /* See the definition of CALL_USED_REGISTERS in i386.h. */ + c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI); + + CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]); - gcc_assert (GET_CODE (src) == COMPARE); + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + { + /* Set/reset conditionally defined registers from + CALL_USED_REGISTERS initializer. */ + if (call_used_regs[i] > 1) + call_used_regs[i] = !!(call_used_regs[i] & c_mask); - if (GET_CODE (dst) != REG - || REGNO (dst) != FLAGS_REG - || GET_MODE (dst) != CCZmode) - return false; + /* Calculate registers of CLOBBERED_REGS register set + as call used registers from GENERAL_REGS register set. */ + if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i) + && call_used_regs[i]) + SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i); + } - rtx op1 = XEXP (src, 0); - rtx op2 = XEXP (src, 1); + /* If MMX is disabled, disable the registers. */ + if (! TARGET_MMX) + accessible_reg_set &= ~reg_class_contents[MMX_REGS]; - if (op2 != CONST0_RTX (GET_MODE (op2))) - return false; + /* If SSE is disabled, disable the registers. */ + if (! TARGET_SSE) + accessible_reg_set &= ~reg_class_contents[ALL_SSE_REGS]; - if (GET_CODE (op1) != IOR) - return false; + /* If the FPU is disabled, disable the registers. */ + if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) + accessible_reg_set &= ~reg_class_contents[FLOAT_REGS]; - op2 = XEXP (op1, 1); - op1 = XEXP (op1, 0); - - if (!SUBREG_P (op1) - || !SUBREG_P (op2) - || GET_MODE (op1) != SImode - || GET_MODE (op2) != SImode - || ((SUBREG_BYTE (op1) != 0 - || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) - && (SUBREG_BYTE (op2) != 0 - || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) - return false; + /* If AVX512F is disabled, disable the registers. */ + if (! TARGET_AVX512F) + { + for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) + CLEAR_HARD_REG_BIT (accessible_reg_set, i); - op1 = SUBREG_REG (op1); - op2 = SUBREG_REG (op2); + accessible_reg_set &= ~reg_class_contents[ALL_MASK_REGS]; + } +} - if (op1 != op2 - || !REG_P (op1) - || GET_MODE (op1) != DImode) - return false; +/* Canonicalize a comparison from one we don't have to one we do have. */ - return true; -} +static void +ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1, + bool op0_preserve_value) +{ + /* The order of operands in x87 ficom compare is forced by combine in + simplify_comparison () function. Float operator is treated as RTX_OBJ + with a precedence over other operators and is always put in the first + place. Swap condition and operands to match ficom instruction. */ + if (!op0_preserve_value + && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1)) + { + enum rtx_code scode = swap_condition ((enum rtx_code) *code); -/* The DImode version of scalar_to_vector_candidate_p. */ + /* We are called only for compares that are split to SAHF instruction. + Ensure that we have setcc/jcc insn for the swapped condition. 
*/ + if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN) + { + std::swap (*op0, *op1); + *code = (int) scode; + } + } +} + + +/* Hook to determine if one function can safely inline another. */ static bool -dimode_scalar_to_vector_candidate_p (rtx_insn *insn) +ix86_can_inline_p (tree caller, tree callee) { - rtx def_set = single_set (insn); - - if (!def_set) - return false; + tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); + tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); - if (has_non_address_hard_reg (insn)) - return false; + /* Changes of those flags can be tolerated for always inlines. Lets hope + user knows what he is doing. */ + const unsigned HOST_WIDE_INT always_inline_safe_mask + = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS + | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD + | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD + | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS + | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE + | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER + | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - if (GET_CODE (src) == COMPARE) - return convertible_comparison_p (insn); + if (!callee_tree) + callee_tree = target_option_default_node; + if (!caller_tree) + caller_tree = target_option_default_node; + if (callee_tree == caller_tree) + return true; - /* We are interested in DImode promotion only. */ - if ((GET_MODE (src) != DImode - && !CONST_INT_P (src)) - || GET_MODE (dst) != DImode) - return false; + struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); + struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); + bool ret = false; + bool always_inline + = (DECL_DISREGARD_INLINE_LIMITS (callee) + && lookup_attribute ("always_inline", + DECL_ATTRIBUTES (callee))); - if (!REG_P (dst) && !MEM_P (dst)) - return false; - - switch (GET_CODE (src)) - { - case ASHIFTRT: - if (!TARGET_AVX512VL) - return false; - /* FALLTHRU */ - - case ASHIFT: - case LSHIFTRT: - if (!CONST_INT_P (XEXP (src, 1)) - || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)) - return false; - break; - - case PLUS: - case MINUS: - case IOR: - case XOR: - case AND: - if (!REG_P (XEXP (src, 1)) - && !MEM_P (XEXP (src, 1)) - && !CONST_INT_P (XEXP (src, 1))) - return false; - - if (GET_MODE (XEXP (src, 1)) != DImode - && !CONST_INT_P (XEXP (src, 1))) - return false; - break; + cgraph_node *callee_node = cgraph_node::get (callee); + /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 + function can inline a SSE2 function but a SSE2 function can't inline + a SSE4 function. */ + if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) + != callee_opts->x_ix86_isa_flags) + || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) + != callee_opts->x_ix86_isa_flags2)) + ret = false; - case NEG: - case NOT: - break; + /* See if we have the same non-isa options. */ + else if ((!always_inline + && caller_opts->x_target_flags != callee_opts->x_target_flags) + || (caller_opts->x_target_flags & ~always_inline_safe_mask) + != (callee_opts->x_target_flags & ~always_inline_safe_mask)) + ret = false; - case REG: - return true; + /* See if arch, tune, etc. are the same. 
*/ + else if (caller_opts->arch != callee_opts->arch) + ret = false; - case MEM: - case CONST_INT: - return REG_P (dst); + else if (!always_inline && caller_opts->tune != callee_opts->tune) + ret = false; - default: - return false; - } + else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath + /* If the calle doesn't use FP expressions differences in + ix86_fpmath can be ignored. We are called from FEs + for multi-versioning call optimization, so beware of + ipa_fn_summaries not available. */ + && (! ipa_fn_summaries + || ipa_fn_summaries->get (callee_node) == NULL + || ipa_fn_summaries->get (callee_node)->fp_expressions)) + ret = false; - if (!REG_P (XEXP (src, 0)) - && !MEM_P (XEXP (src, 0)) - && !CONST_INT_P (XEXP (src, 0)) - /* Check for andnot case. */ - && (GET_CODE (src) != AND - || GET_CODE (XEXP (src, 0)) != NOT - || !REG_P (XEXP (XEXP (src, 0), 0)))) - return false; + else if (!always_inline + && caller_opts->branch_cost != callee_opts->branch_cost) + ret = false; - if (GET_MODE (XEXP (src, 0)) != DImode - && !CONST_INT_P (XEXP (src, 0))) - return false; + else + ret = true; - return true; + return ret; } - -/* The TImode version of scalar_to_vector_candidate_p. */ + +/* Return true if this goes in large data/bss. */ static bool -timode_scalar_to_vector_candidate_p (rtx_insn *insn) +ix86_in_large_data_p (tree exp) { - rtx def_set = single_set (insn); - - if (!def_set) + if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) return false; - if (has_non_address_hard_reg (insn)) + if (exp == NULL_TREE) return false; - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - - /* Only TImode load and store are allowed. */ - if (GET_MODE (dst) != TImode) + /* Functions are never large data. */ + if (TREE_CODE (exp) == FUNCTION_DECL) return false; - if (MEM_P (dst)) - { - /* Check for store. Memory must be aligned or unaligned store - is optimal. Only support store from register, standard SSE - constant or CONST_WIDE_INT generated from piecewise store. - - ??? Verify performance impact before enabling CONST_INT for - __int128 store. */ - if (misaligned_operand (dst, TImode) - && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL) - return false; - - switch (GET_CODE (src)) - { - default: - return false; - - case REG: - case CONST_WIDE_INT: - return true; + /* Automatic variables are never large data. */ + if (VAR_P (exp) && !is_global_var (exp)) + return false; - case CONST_INT: - return standard_sse_constant_p (src, TImode); - } - } - else if (MEM_P (src)) + if (VAR_P (exp) && DECL_SECTION_NAME (exp)) { - /* Check for load. Memory must be aligned or unaligned load is - optimal. */ - return (REG_P (dst) - && (!misaligned_operand (src, TImode) - || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)); + const char *section = DECL_SECTION_NAME (exp); + if (strcmp (section, ".ldata") == 0 + || strcmp (section, ".lbss") == 0) + return true; + return false; } - - return false; -} - -/* Return 1 if INSN may be converted into vector - instruction. */ - -static bool -scalar_to_vector_candidate_p (rtx_insn *insn) -{ - if (TARGET_64BIT) - return timode_scalar_to_vector_candidate_p (insn); else - return dimode_scalar_to_vector_candidate_p (insn); -} - -/* The DImode version of remove_non_convertible_regs. 
*/ - -static void -dimode_remove_non_convertible_regs (bitmap candidates) -{ - bitmap_iterator bi; - unsigned id; - bitmap regs = BITMAP_ALLOC (NULL); - - EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) - { - rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); - rtx reg = SET_DEST (def_set); - - if (!REG_P (reg) - || bitmap_bit_p (regs, REGNO (reg)) - || HARD_REGISTER_P (reg)) - continue; - - for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); - def; - def = DF_REF_NEXT_REG (def)) - { - if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) - { - if (dump_file) - fprintf (dump_file, - "r%d has non convertible definition in insn %d\n", - REGNO (reg), DF_REF_INSN_UID (def)); - - bitmap_set_bit (regs, REGNO (reg)); - break; - } - } - } - - EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) { - for (df_ref def = DF_REG_DEF_CHAIN (id); - def; - def = DF_REF_NEXT_REG (def)) - if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) - { - if (dump_file) - fprintf (dump_file, "Removing insn %d from candidates list\n", - DF_REF_INSN_UID (def)); + HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); - bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); - } + /* If this is an incomplete type with size 0, then we can't put it + in data because it might be too big when completed. Also, + int_size_in_bytes returns -1 if size can vary or is larger than + an integer in which case also it is safer to assume that it goes in + large data. */ + if (size <= 0 || size > ix86_section_threshold) + return true; } - BITMAP_FREE (regs); + return false; } -/* For a register REGNO, scan instructions for its defs and uses. - Put REGNO in REGS if a def or use isn't in CANDIDATES. */ +/* i386-specific section flag to mark large sections. */ +#define SECTION_LARGE SECTION_MACH_DEP + +/* Switch to the appropriate section for output of DECL. + DECL is either a `VAR_DECL' node or a constant of some sort. + RELOC indicates whether forming the initial value of DECL requires + link-time relocations. */ -static void -timode_check_non_convertible_regs (bitmap candidates, bitmap regs, - unsigned int regno) +ATTRIBUTE_UNUSED static section * +x86_64_elf_select_section (tree decl, int reloc, + unsigned HOST_WIDE_INT align) { - for (df_ref def = DF_REG_DEF_CHAIN (regno); - def; - def = DF_REF_NEXT_REG (def)) + if (ix86_in_large_data_p (decl)) { - if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + const char *sname = NULL; + unsigned int flags = SECTION_WRITE | SECTION_LARGE; + switch (categorize_decl_for_section (decl, reloc)) { - if (dump_file) - fprintf (dump_file, - "r%d has non convertible def in insn %d\n", - regno, DF_REF_INSN_UID (def)); - - bitmap_set_bit (regs, regno); + case SECCAT_DATA: + sname = ".ldata"; + break; + case SECCAT_DATA_REL: + sname = ".ldata.rel"; + break; + case SECCAT_DATA_REL_LOCAL: + sname = ".ldata.rel.local"; + break; + case SECCAT_DATA_REL_RO: + sname = ".ldata.rel.ro"; + break; + case SECCAT_DATA_REL_RO_LOCAL: + sname = ".ldata.rel.ro.local"; + break; + case SECCAT_BSS: + sname = ".lbss"; + flags |= SECTION_BSS; + break; + case SECCAT_RODATA: + case SECCAT_RODATA_MERGE_STR: + case SECCAT_RODATA_MERGE_STR_INIT: + case SECCAT_RODATA_MERGE_CONST: + sname = ".lrodata"; + flags &= ~SECTION_WRITE; + break; + case SECCAT_SRODATA: + case SECCAT_SDATA: + case SECCAT_SBSS: + gcc_unreachable (); + case SECCAT_TEXT: + case SECCAT_TDATA: + case SECCAT_TBSS: + /* We don't split these for medium model. Place them into + default sections and hope for best. 
*/ break; } - } - - for (df_ref ref = DF_REG_USE_CHAIN (regno); - ref; - ref = DF_REF_NEXT_REG (ref)) - { - /* Debug instructions are skipped. */ - if (NONDEBUG_INSN_P (DF_REF_INSN (ref)) - && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) + if (sname) { - if (dump_file) - fprintf (dump_file, - "r%d has non convertible use in insn %d\n", - regno, DF_REF_INSN_UID (ref)); - - bitmap_set_bit (regs, regno); - break; + /* We might get called with string constants, but get_named_section + doesn't like them as they are not DECLs. Also, we need to set + flags in that case. */ + if (!DECL_P (decl)) + return get_section (sname, flags, NULL); + return get_named_section (decl, sname, reloc); } } + return default_elf_select_section (decl, reloc, align); } -/* The TImode version of remove_non_convertible_regs. */ +/* Select a set of attributes for section NAME based on the properties + of DECL and whether or not RELOC indicates that DECL's initializer + might contain runtime relocations. */ -static void -timode_remove_non_convertible_regs (bitmap candidates) +static unsigned int ATTRIBUTE_UNUSED +x86_64_elf_section_type_flags (tree decl, const char *name, int reloc) { - bitmap_iterator bi; - unsigned id; - bitmap regs = BITMAP_ALLOC (NULL); - - EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) - { - rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); - rtx dest = SET_DEST (def_set); - rtx src = SET_SRC (def_set); - - if ((!REG_P (dest) - || bitmap_bit_p (regs, REGNO (dest)) - || HARD_REGISTER_P (dest)) - && (!REG_P (src) - || bitmap_bit_p (regs, REGNO (src)) - || HARD_REGISTER_P (src))) - continue; - - if (REG_P (dest)) - timode_check_non_convertible_regs (candidates, regs, - REGNO (dest)); - - if (REG_P (src)) - timode_check_non_convertible_regs (candidates, regs, - REGNO (src)); - } - - EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) - { - for (df_ref def = DF_REG_DEF_CHAIN (id); - def; - def = DF_REF_NEXT_REG (def)) - if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) - { - if (dump_file) - fprintf (dump_file, "Removing insn %d from candidates list\n", - DF_REF_INSN_UID (def)); + unsigned int flags = default_section_type_flags (decl, name, reloc); - bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); - } + if (ix86_in_large_data_p (decl)) + flags |= SECTION_LARGE; - for (df_ref ref = DF_REG_USE_CHAIN (id); - ref; - ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) - { - if (dump_file) - fprintf (dump_file, "Removing insn %d from candidates list\n", - DF_REF_INSN_UID (ref)); + if (decl == NULL_TREE + && (strcmp (name, ".ldata.rel.ro") == 0 + || strcmp (name, ".ldata.rel.ro.local") == 0)) + flags |= SECTION_RELRO; - bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref)); - } - } + if (strcmp (name, ".lbss") == 0 + || strncmp (name, ".lbss.", 5) == 0 + || strncmp (name, ".gnu.linkonce.lb.", 16) == 0) + flags |= SECTION_BSS; - BITMAP_FREE (regs); + return flags; } -/* For a given bitmap of insn UIDs scans all instruction and - remove insn from CANDIDATES in case it has both convertible - and not convertible definitions. - - All insns in a bitmap are conversion candidates according to - scalar_to_vector_candidate_p. Currently it implies all insns - are single_set. 
*/ - -static void -remove_non_convertible_regs (bitmap candidates) -{ - if (TARGET_64BIT) - timode_remove_non_convertible_regs (candidates); - else - dimode_remove_non_convertible_regs (candidates); -} - -class scalar_chain -{ - public: - scalar_chain (); - virtual ~scalar_chain (); - - static unsigned max_id; - - /* ID of a chain. */ - unsigned int chain_id; - /* A queue of instructions to be included into a chain. */ - bitmap queue; - /* Instructions included into a chain. */ - bitmap insns; - /* All registers defined by a chain. */ - bitmap defs; - /* Registers used in both vector and sclar modes. */ - bitmap defs_conv; - - void build (bitmap candidates, unsigned insn_uid); - virtual int compute_convert_gain () = 0; - int convert (); - - protected: - void add_to_queue (unsigned insn_uid); - void emit_conversion_insns (rtx insns, rtx_insn *pos); - - private: - void add_insn (bitmap candidates, unsigned insn_uid); - void analyze_register_chain (bitmap candidates, df_ref ref); - virtual void mark_dual_mode_def (df_ref def) = 0; - virtual void convert_insn (rtx_insn *insn) = 0; - virtual void convert_registers () = 0; -}; - -class dimode_scalar_chain : public scalar_chain -{ - public: - int compute_convert_gain (); - private: - void mark_dual_mode_def (df_ref def); - rtx replace_with_subreg (rtx x, rtx reg, rtx subreg); - void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg); - void convert_insn (rtx_insn *insn); - void convert_op (rtx *op, rtx_insn *insn); - void convert_reg (unsigned regno); - void make_vector_copies (unsigned regno); - void convert_registers (); - int vector_const_cost (rtx exp); -}; +/* Build up a unique section name, expressed as a + STRING_CST node, and assign it to DECL_SECTION_NAME (decl). + RELOC indicates whether the initial value of EXP requires + link-time relocations. */ -class timode_scalar_chain : public scalar_chain +static void ATTRIBUTE_UNUSED +x86_64_elf_unique_section (tree decl, int reloc) { - public: - /* Convert from TImode to V1TImode is always faster. */ - int compute_convert_gain () { return 1; } - - private: - void mark_dual_mode_def (df_ref def); - void fix_debug_reg_uses (rtx reg); - void convert_insn (rtx_insn *insn); - /* We don't convert registers to difference size. */ - void convert_registers () {} -}; - -unsigned scalar_chain::max_id = 0; - -/* Initialize new chain. */ + if (ix86_in_large_data_p (decl)) + { + const char *prefix = NULL; + /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */ + bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP; -scalar_chain::scalar_chain () -{ - chain_id = ++max_id; + switch (categorize_decl_for_section (decl, reloc)) + { + case SECCAT_DATA: + case SECCAT_DATA_REL: + case SECCAT_DATA_REL_LOCAL: + case SECCAT_DATA_REL_RO: + case SECCAT_DATA_REL_RO_LOCAL: + prefix = one_only ? ".ld" : ".ldata"; + break; + case SECCAT_BSS: + prefix = one_only ? ".lb" : ".lbss"; + break; + case SECCAT_RODATA: + case SECCAT_RODATA_MERGE_STR: + case SECCAT_RODATA_MERGE_STR_INIT: + case SECCAT_RODATA_MERGE_CONST: + prefix = one_only ? ".lr" : ".lrodata"; + break; + case SECCAT_SRODATA: + case SECCAT_SDATA: + case SECCAT_SBSS: + gcc_unreachable (); + case SECCAT_TEXT: + case SECCAT_TDATA: + case SECCAT_TBSS: + /* We don't split these for medium model. Place them into + default sections and hope for best. 
*/ + break; + } + if (prefix) + { + const char *name, *linkonce; + char *string; - if (dump_file) - fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = targetm.strip_name_encoding (name); - bitmap_obstack_initialize (NULL); - insns = BITMAP_ALLOC (NULL); - defs = BITMAP_ALLOC (NULL); - defs_conv = BITMAP_ALLOC (NULL); - queue = NULL; -} + /* If we're using one_only, then there needs to be a .gnu.linkonce + prefix to the section name. */ + linkonce = one_only ? ".gnu.linkonce" : ""; -/* Free chain's data. */ + string = ACONCAT ((linkonce, prefix, ".", name, NULL)); -scalar_chain::~scalar_chain () -{ - BITMAP_FREE (insns); - BITMAP_FREE (defs); - BITMAP_FREE (defs_conv); - bitmap_obstack_release (NULL); + set_decl_section_name (decl, string); + return; + } + } + default_unique_section (decl, reloc); } -/* Add instruction into chains' queue. */ - -void -scalar_chain::add_to_queue (unsigned insn_uid) -{ - if (bitmap_bit_p (insns, insn_uid) - || bitmap_bit_p (queue, insn_uid)) - return; +#ifdef COMMON_ASM_OP - if (dump_file) - fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", - insn_uid, chain_id); - bitmap_set_bit (queue, insn_uid); -} +#ifndef LARGECOMM_SECTION_ASM_OP +#define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t" +#endif -/* For DImode conversion, mark register defined by DEF as requiring - conversion. */ +/* This says how to output assembler code to declare an + uninitialized external linkage data object. + For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for + large objects. */ void -dimode_scalar_chain::mark_dual_mode_def (df_ref def) +x86_elf_aligned_decl_common (FILE *file, tree decl, + const char *name, unsigned HOST_WIDE_INT size, + int align) { - gcc_assert (DF_REF_REG_DEF_P (def)); - - if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def))) - return; - - if (dump_file) - fprintf (dump_file, - " Mark r%d def in insn %d as requiring both modes in chain #%d\n", - DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); - - bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); + if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) + && size > (unsigned int)ix86_section_threshold) + { + switch_to_section (get_named_section (decl, ".lbss", 0)); + fputs (LARGECOMM_SECTION_ASM_OP, file); + } + else + fputs (COMMON_ASM_OP, file); + assemble_name (file, name); + fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", + size, align / BITS_PER_UNIT); } +#endif -/* For TImode conversion, it is unused. */ +/* Utility function for targets to use in implementing + ASM_OUTPUT_ALIGNED_BSS. */ void -timode_scalar_chain::mark_dual_mode_def (df_ref) +x86_output_aligned_bss (FILE *file, tree decl, const char *name, + unsigned HOST_WIDE_INT size, int align) { - gcc_unreachable (); + if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) + && size > (unsigned int)ix86_section_threshold) + switch_to_section (get_named_section (decl, ".lbss", 0)); + else + switch_to_section (bss_section); + ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT)); +#ifdef ASM_DECLARE_OBJECT_NAME + last_assemble_variable_decl = decl; + ASM_DECLARE_OBJECT_NAME (file, name, decl); +#else + /* Standard thing is just output label for the object. */ + ASM_OUTPUT_LABEL (file, name); +#endif /* ASM_DECLARE_OBJECT_NAME */ + ASM_OUTPUT_SKIP (file, size ? size : 1); } + +/* Decide whether we must probe the stack before any space allocation + on this target. 
It's essentially TARGET_STACK_PROBE except when + -fstack-check causes the stack to be already probed differently. */ -/* Check REF's chain to add new insns into a queue - and find registers requiring conversion. */ - -void -scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) +bool +ix86_target_stack_probe (void) { - df_link *chain; - - gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)) - || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))); - add_to_queue (DF_REF_INSN_UID (ref)); - - for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) - { - unsigned uid = DF_REF_INSN_UID (chain->ref); - - if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref))) - continue; - - if (!DF_REF_REG_MEM_P (chain->ref)) - { - if (bitmap_bit_p (insns, uid)) - continue; - - if (bitmap_bit_p (candidates, uid)) - { - add_to_queue (uid); - continue; - } - } + /* Do not probe the stack twice if static stack checking is enabled. */ + if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) + return false; - if (DF_REF_REG_DEF_P (chain->ref)) - { - if (dump_file) - fprintf (dump_file, " r%d def in insn %d isn't convertible\n", - DF_REF_REGNO (chain->ref), uid); - mark_dual_mode_def (chain->ref); - } - else - { - if (dump_file) - fprintf (dump_file, " r%d use in insn %d isn't convertible\n", - DF_REF_REGNO (chain->ref), uid); - mark_dual_mode_def (ref); - } - } + return TARGET_STACK_PROBE; } + +/* Decide whether we can make a sibling call to a function. DECL is the + declaration of the function being targeted by the call and EXP is the + CALL_EXPR representing the call. */ -/* Add instruction into a chain. */ - -void -scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) +static bool +ix86_function_ok_for_sibcall (tree decl, tree exp) { - if (bitmap_bit_p (insns, insn_uid)) - return; - - if (dump_file) - fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id); - - bitmap_set_bit (insns, insn_uid); + tree type, decl_or_type; + rtx a, b; + bool bind_global = decl && !targetm.binds_local_p (decl); - rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; - rtx def_set = single_set (insn); - if (def_set && REG_P (SET_DEST (def_set)) - && !HARD_REGISTER_P (SET_DEST (def_set))) - bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); + if (ix86_function_naked (current_function_decl)) + return false; - df_ref ref; - df_ref def; - for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) - if (!HARD_REGISTER_P (DF_REF_REG (ref))) - for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); - def; - def = DF_REF_NEXT_REG (def)) - analyze_register_chain (candidates, def); - for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) - if (!DF_REF_REG_MEM_P (ref)) - analyze_register_chain (candidates, ref); -} + /* Sibling call isn't OK if there are no caller-saved registers + since all registers must be preserved before return. */ + if (cfun->machine->no_caller_saved_registers) + return false; -/* Build new chain starting from insn INSN_UID recursively - adding all dependent uses and definitions. */ + /* If we are generating position-independent code, we cannot sibcall + optimize direct calls to global functions, as the PLT requires + %ebx be live. (Darwin does not have a PLT.) 
*/ + if (!TARGET_MACHO + && !TARGET_64BIT + && flag_pic + && flag_plt + && bind_global) + return false; -void -scalar_chain::build (bitmap candidates, unsigned insn_uid) -{ - queue = BITMAP_ALLOC (NULL); - bitmap_set_bit (queue, insn_uid); + /* If we need to align the outgoing stack, then sibcalling would + unalign the stack, which may break the called function. */ + if (ix86_minimum_incoming_stack_boundary (true) + < PREFERRED_STACK_BOUNDARY) + return false; - if (dump_file) - fprintf (dump_file, "Building chain #%d...\n", chain_id); + if (decl) + { + decl_or_type = decl; + type = TREE_TYPE (decl); + } + else + { + /* We're looking at the CALL_EXPR, we need the type of the function. */ + type = CALL_EXPR_FN (exp); /* pointer expression */ + type = TREE_TYPE (type); /* pointer type */ + type = TREE_TYPE (type); /* function type */ + decl_or_type = type; + } - while (!bitmap_empty_p (queue)) + /* Check that the return value locations are the same. Like + if we are returning floats on the 80387 register stack, we cannot + make a sibcall from a function that doesn't return a float to a + function that does or, conversely, from a function that does return + a float to a function that doesn't; the necessary stack adjustment + would not be executed. This is also the place we notice + differences in the return value ABI. Note that it is ok for one + of the functions to have void return type as long as the return + value of the other is passed in a register. */ + a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false); + b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), + cfun->decl, false); + if (STACK_REG_P (a) || STACK_REG_P (b)) { - insn_uid = bitmap_first_set_bit (queue); - bitmap_clear_bit (queue, insn_uid); - bitmap_clear_bit (candidates, insn_uid); - add_insn (candidates, insn_uid); + if (!rtx_equal_p (a, b)) + return false; } + else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) + ; + else if (!rtx_equal_p (a, b)) + return false; - if (dump_file) + if (TARGET_64BIT) + { + /* The SYSV ABI has more call-clobbered registers; + disallow sibcalls from MS to SYSV. */ + if (cfun->machine->call_abi == MS_ABI + && ix86_function_type_abi (type) == SYSV_ABI) + return false; + } + else { - fprintf (dump_file, "Collected chain #%d...\n", chain_id); - fprintf (dump_file, " insns: "); - dump_bitmap (dump_file, insns); - if (!bitmap_empty_p (defs_conv)) + /* If this call is indirect, we'll need to be able to use a + call-clobbered register for the address of the target function. + Make sure that all such registers are not used for passing + parameters. Note that DLLIMPORT functions and call to global + function via GOT slot are indirect. */ + if (!decl + || (bind_global && flag_pic && !flag_plt) + || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)) + || flag_force_indirect_call) { - bitmap_iterator bi; - unsigned id; - const char *comma = ""; - fprintf (dump_file, " defs to convert: "); - EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) - { - fprintf (dump_file, "%sr%d", comma, id); - comma = ", "; - } - fprintf (dump_file, "\n"); + /* Check if regparm >= 3 since arg_reg_available is set to + false if regparm == 0. If regparm is 1 or 2, there is + always a call-clobbered register available. + + ??? The symbol indirect call doesn't need a call-clobbered + register. But we don't know if this is a symbol indirect + call or not here. 
*/ + if (ix86_function_regparm (type, decl) >= 3 + && !cfun->machine->arg_reg_available) + return false; } } - BITMAP_FREE (queue); + /* Otherwise okay. That also includes certain types of indirect calls. */ + return true; } -/* Return a cost of building a vector costant - instead of using a scalar one. */ +/* This function determines from TYPE the calling-convention. */ -int -dimode_scalar_chain::vector_const_cost (rtx exp) +unsigned int +ix86_get_callcvt (const_tree type) { - gcc_assert (CONST_INT_P (exp)); + unsigned int ret = 0; + bool is_stdarg; + tree attrs; - if (standard_sse_constant_p (exp, V2DImode)) - return COSTS_N_INSNS (1); - return ix86_cost->sse_load[1]; -} + if (TARGET_64BIT) + return IX86_CALLCVT_CDECL; -/* Compute a gain for chain conversion. */ + attrs = TYPE_ATTRIBUTES (type); + if (attrs != NULL_TREE) + { + if (lookup_attribute ("cdecl", attrs)) + ret |= IX86_CALLCVT_CDECL; + else if (lookup_attribute ("stdcall", attrs)) + ret |= IX86_CALLCVT_STDCALL; + else if (lookup_attribute ("fastcall", attrs)) + ret |= IX86_CALLCVT_FASTCALL; + else if (lookup_attribute ("thiscall", attrs)) + ret |= IX86_CALLCVT_THISCALL; -int -dimode_scalar_chain::compute_convert_gain () -{ - bitmap_iterator bi; - unsigned insn_uid; - int gain = 0; - int cost = 0; - - if (dump_file) - fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); - - EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) - { - rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - - if (REG_P (src) && REG_P (dst)) - gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move; - else if (REG_P (src) && MEM_P (dst)) - gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; - else if (MEM_P (src) && REG_P (dst)) - gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; - else if (GET_CODE (src) == ASHIFT - || GET_CODE (src) == ASHIFTRT - || GET_CODE (src) == LSHIFTRT) - { - if (CONST_INT_P (XEXP (src, 0))) - gain -= vector_const_cost (XEXP (src, 0)); - - gain += ix86_cost->shift_const; - if (INTVAL (XEXP (src, 1)) >= 32) - gain -= COSTS_N_INSNS (1); - } - else if (GET_CODE (src) == PLUS - || GET_CODE (src) == MINUS - || GET_CODE (src) == IOR - || GET_CODE (src) == XOR - || GET_CODE (src) == AND) - { - gain += ix86_cost->add; - /* Additional gain for andnot for targets without BMI. */ - if (GET_CODE (XEXP (src, 0)) == NOT - && !TARGET_BMI) - gain += 2 * ix86_cost->add; - - if (CONST_INT_P (XEXP (src, 0))) - gain -= vector_const_cost (XEXP (src, 0)); - if (CONST_INT_P (XEXP (src, 1))) - gain -= vector_const_cost (XEXP (src, 1)); - } - else if (GET_CODE (src) == NEG - || GET_CODE (src) == NOT) - gain += ix86_cost->add - COSTS_N_INSNS (1); - else if (GET_CODE (src) == COMPARE) - { - /* Assume comparison cost is the same. */ - } - else if (CONST_INT_P (src)) - { - if (REG_P (dst)) - gain += COSTS_N_INSNS (2); - else if (MEM_P (dst)) - gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; - gain -= vector_const_cost (src); - } - else - gcc_unreachable (); - } - - if (dump_file) - fprintf (dump_file, " Instruction conversion gain: %d\n", gain); - - EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi) - cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer; + /* Regparam isn't allowed for thiscall and fastcall. 
*/ + if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0) + { + if (lookup_attribute ("regparm", attrs)) + ret |= IX86_CALLCVT_REGPARM; + if (lookup_attribute ("sseregparm", attrs)) + ret |= IX86_CALLCVT_SSEREGPARM; + } - if (dump_file) - fprintf (dump_file, " Registers conversion cost: %d\n", cost); + if (IX86_BASE_CALLCVT(ret) != 0) + return ret; + } - gain -= cost; + is_stdarg = stdarg_p (type); + if (TARGET_RTD && !is_stdarg) + return IX86_CALLCVT_STDCALL | ret; - if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); + if (ret != 0 + || is_stdarg + || TREE_CODE (type) != METHOD_TYPE + || ix86_function_type_abi (type) != MS_ABI) + return IX86_CALLCVT_CDECL | ret; - return gain; + return IX86_CALLCVT_THISCALL; } -/* Replace REG in X with a V2DI subreg of NEW_REG. */ +/* Return 0 if the attributes for two types are incompatible, 1 if they + are compatible, and 2 if they are nearly compatible (which causes a + warning to be generated). */ -rtx -dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg) +static int +ix86_comp_type_attributes (const_tree type1, const_tree type2) { - if (x == reg) - return gen_rtx_SUBREG (V2DImode, new_reg, 0); + unsigned int ccvt1, ccvt2; - const char *fmt = GET_RTX_FORMAT (GET_CODE (x)); - int i, j; - for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) - { - if (fmt[i] == 'e') - XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg); - else if (fmt[i] == 'E') - for (j = XVECLEN (x, i) - 1; j >= 0; j--) - XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j), - reg, new_reg); - } + if (TREE_CODE (type1) != FUNCTION_TYPE + && TREE_CODE (type1) != METHOD_TYPE) + return 1; - return x; -} + ccvt1 = ix86_get_callcvt (type1); + ccvt2 = ix86_get_callcvt (type2); + if (ccvt1 != ccvt2) + return 0; + if (ix86_function_regparm (type1, NULL) + != ix86_function_regparm (type2, NULL)) + return 0; -/* Replace REG in INSN with a V2DI subreg of NEW_REG. */ + return 1; +} + +/* Return the regparm value for a function with the indicated TYPE and DECL. + DECL may be NULL when calling function indirectly + or considering a libcall. */ -void -dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, - rtx reg, rtx new_reg) +static int +ix86_function_regparm (const_tree type, const_tree decl) { - replace_with_subreg (single_set (insn), reg, new_reg); -} + tree attr; + int regparm; + unsigned int ccvt; -/* Insert generated conversion instruction sequence INSNS - after instruction AFTER. New BB may be required in case - instruction has EH region attached. */ + if (TARGET_64BIT) + return (ix86_function_type_abi (type) == SYSV_ABI + ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX); + ccvt = ix86_get_callcvt (type); + regparm = ix86_regparm; -void -scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after) -{ - if (!control_flow_insn_p (after)) + if ((ccvt & IX86_CALLCVT_REGPARM) != 0) { - emit_insn_after (insns, after); - return; + attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); + if (attr) + { + regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + return regparm; + } } + else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + return 2; + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + return 1; - basic_block bb = BLOCK_FOR_INSN (after); - edge e = find_fallthru_edge (bb->succs); - gcc_assert (e); + /* Use register calling convention for local functions when possible. 
*/ + if (decl + && TREE_CODE (decl) == FUNCTION_DECL) + { + cgraph_node *target = cgraph_node::get (decl); + if (target) + target = target->function_symbol (); - basic_block new_bb = split_edge (e); - emit_insn_after (insns, BB_HEAD (new_bb)); -} + /* Caller and callee must agree on the calling convention, so + checking here just optimize means that with + __attribute__((optimize (...))) caller could use regparm convention + and callee not, or vice versa. Instead look at whether the callee + is optimized or not. */ + if (target && opt_for_fn (target->decl, optimize) + && !(profile_flag && !flag_fentry)) + { + cgraph_local_info *i = &target->local; + if (i && i->local && i->can_change_signature) + { + int local_regparm, globals = 0, regno; -/* Make vector copies for all register REGNO definitions - and replace its uses in a chain. */ + /* Make sure no regparm register is taken by a + fixed register variable. */ + for (local_regparm = 0; local_regparm < REGPARM_MAX; + local_regparm++) + if (fixed_regs[local_regparm]) + break; -void -dimode_scalar_chain::make_vector_copies (unsigned regno) -{ - rtx reg = regno_reg_rtx[regno]; - rtx vreg = gen_reg_rtx (DImode); - df_ref ref; + /* We don't want to use regparm(3) for nested functions as + these use a static chain pointer in the third argument. */ + if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl)) + local_regparm = 2; - for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - start_sequence (); + /* Save a register for the split stack. */ + if (flag_split_stack) + { + if (local_regparm == 3) + local_regparm = 2; + else if (local_regparm == 2 + && DECL_STATIC_CHAIN (target->decl)) + local_regparm = 1; + } - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) - { - rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); - emit_move_insn (adjust_address (tmp, SImode, 0), - gen_rtx_SUBREG (SImode, reg, 0)); - emit_move_insn (adjust_address (tmp, SImode, 4), - gen_rtx_SUBREG (SImode, reg, 4)); - emit_move_insn (vreg, tmp); - } - else if (TARGET_SSE4_1) - { - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 0))); - emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (SImode, reg, 4), - GEN_INT (2))); - } - else - { - rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 0))); - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 4))); - emit_insn (gen_vec_interleave_lowv4si - (gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, tmp, 0))); - } - rtx_insn *seq = get_insns (); - end_sequence (); - rtx_insn *insn = DF_REF_INSN (ref); - emit_conversion_insns (seq, insn); - - if (dump_file) - fprintf (dump_file, - " Copied r%d to a vector register r%d for insn %d\n", - regno, REGNO (vreg), INSN_UID (insn)); - } + /* Each fixed register usage increases register pressure, + so less registers should be used for argument passing. + This functionality can be overriden by an explicit + regparm value. 
*/ + for (regno = AX_REG; regno <= DI_REG; regno++) + if (fixed_regs[regno]) + globals++; - for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - rtx_insn *insn = DF_REF_INSN (ref); + local_regparm + = globals < local_regparm ? local_regparm - globals : 0; - replace_with_subreg_in_insn (insn, reg, vreg); + if (local_regparm > regparm) + regparm = local_regparm; + } + } + } - if (dump_file) - fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", - regno, REGNO (vreg), INSN_UID (insn)); - } + return regparm; } -/* Convert all definitions of register REGNO - and fix its uses. Scalar copies may be created - in case register is used in not convertible insn. */ +/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and + DFmode (2) arguments in SSE registers for a function with the + indicated TYPE and DECL. DECL may be NULL when calling function + indirectly or considering a libcall. Return -1 if any FP parameter + should be rejected by error. This is used in siutation we imply SSE + calling convetion but the function is called from another function with + SSE disabled. Otherwise return 0. */ -void -dimode_scalar_chain::convert_reg (unsigned regno) +static int +ix86_function_sseregparm (const_tree type, const_tree decl, bool warn) { - bool scalar_copy = bitmap_bit_p (defs_conv, regno); - rtx reg = regno_reg_rtx[regno]; - rtx scopy = NULL_RTX; - df_ref ref; - bitmap conv; - - conv = BITMAP_ALLOC (NULL); - bitmap_copy (conv, insns); - - if (scalar_copy) - scopy = gen_reg_rtx (DImode); + gcc_assert (!TARGET_64BIT); - for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + /* Use SSE registers to pass SFmode and DFmode arguments if requested + by the sseregparm attribute. 
*/ + if (TARGET_SSEREGPARM + || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) { - rtx_insn *insn = DF_REF_INSN (ref); - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx reg = DF_REF_REG (ref); - - if (!MEM_P (src)) - { - replace_with_subreg_in_insn (insn, reg, reg); - bitmap_clear_bit (conv, INSN_UID (insn)); - } - - if (scalar_copy) + if (!TARGET_SSE) { - start_sequence (); - if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) - { - rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); - emit_move_insn (tmp, reg); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), - adjust_address (tmp, SImode, 0)); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), - adjust_address (tmp, SImode, 4)); - } - else if (TARGET_SSE4_1) - { - rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); - emit_insn - (gen_rtx_SET - (gen_rtx_SUBREG (SImode, scopy, 0), - gen_rtx_VEC_SELECT (SImode, - gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); - - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); - emit_insn - (gen_rtx_SET - (gen_rtx_SUBREG (SImode, scopy, 4), - gen_rtx_VEC_SELECT (SImode, - gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); - } - else + if (warn) { - rtx vcopy = gen_reg_rtx (V2DImode); - emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0)); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), - gen_rtx_SUBREG (SImode, vcopy, 0)); - emit_move_insn (vcopy, - gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32))); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), - gen_rtx_SUBREG (SImode, vcopy, 0)); + if (decl) + error ("calling %qD with attribute sseregparm without " + "SSE/SSE2 enabled", decl); + else + error ("calling %qT with attribute sseregparm without " + "SSE/SSE2 enabled", type); } - rtx_insn *seq = get_insns (); - end_sequence (); - emit_conversion_insns (seq, insn); - - if (dump_file) - fprintf (dump_file, - " Copied r%d to a scalar register r%d for insn %d\n", - regno, REGNO (scopy), INSN_UID (insn)); + return 0; } + + return 2; } - for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) - { - rtx_insn *insn = DF_REF_INSN (ref); + if (!decl) + return 0; - rtx def_set = single_set (insn); - gcc_assert (def_set); + cgraph_node *target = cgraph_node::get (decl); + if (target) + target = target->function_symbol (); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); + /* For local functions, pass up to SSE_REGPARM_MAX SFmode + (and DFmode for SSE2) arguments in SSE registers. */ + if (target + /* TARGET_SSE_MATH */ + && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE) + && opt_for_fn (target->decl, optimize) + && !(profile_flag && !flag_fentry)) + { + cgraph_local_info *i = &target->local; + if (i && i->local && i->can_change_signature) + { + /* Refuse to produce wrong code when local function with SSE enabled + is called from SSE disabled function. + FIXME: We need a way to detect these cases cross-ltrans partition + and avoid using SSE calling conventions on local functions called + from function with SSE disabled. For now at least delay the + warning until we know we are going to produce wrong code. + See PR66047 */ + if (!TARGET_SSE && warn) + return -1; + return TARGET_SSE2_P (target_opts_for_fn (target->decl) + ->x_ix86_isa_flags) ? 
2 : 1; + } + } - if (!MEM_P (dst) || !REG_P (src)) - replace_with_subreg_in_insn (insn, reg, reg); + return 0; +} - bitmap_clear_bit (conv, INSN_UID (insn)); - } - } - /* Skip debug insns and uninitialized uses. */ - else if (DF_REF_CHAIN (ref) - && NONDEBUG_INSN_P (DF_REF_INSN (ref))) - { - gcc_assert (scopy); - replace_rtx (DF_REF_INSN (ref), reg, scopy); - df_insn_rescan (DF_REF_INSN (ref)); - } +/* Return true if EAX is live at the start of the function. Used by + ix86_expand_prologue to determine if we need special help before + calling allocate_stack_worker. */ - BITMAP_FREE (conv); +static bool +ix86_eax_live_at_start_p (void) +{ + /* Cheat. Don't bother working forward from ix86_function_regparm + to the function type to whether an actual argument is located in + eax. Instead just look at cfg info, which is still close enough + to correct at this point. This gives false positives for broken + functions that might use uninitialized data that happens to be + allocated in eax, but who cares? */ + return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0); } -/* Convert operand OP in INSN. We should handle - memory operands and uninitialized registers. - All other register uses are converted during - registers conversion. */ - -void -dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn) +static bool +ix86_keep_aggregate_return_pointer (tree fntype) { - *op = copy_rtx_if_shared (*op); + tree attr; - if (GET_CODE (*op) == NOT) - { - convert_op (&XEXP (*op, 0), insn); - PUT_MODE (*op, V2DImode); - } - else if (MEM_P (*op)) + if (!TARGET_64BIT) { - rtx tmp = gen_reg_rtx (DImode); - - emit_insn_before (gen_move_insn (tmp, *op), insn); - *op = gen_rtx_SUBREG (V2DImode, tmp, 0); + attr = lookup_attribute ("callee_pop_aggregate_return", + TYPE_ATTRIBUTES (fntype)); + if (attr) + return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0); - if (dump_file) - fprintf (dump_file, " Preloading operand for insn %d into r%d\n", - INSN_UID (insn), REGNO (tmp)); - } - else if (REG_P (*op)) - { - /* We may have not converted register usage in case - this register has no definition. Otherwise it - should be converted in convert_reg. */ - df_ref ref; - FOR_EACH_INSN_USE (ref, insn) - if (DF_REF_REGNO (ref) == REGNO (*op)) - { - gcc_assert (!DF_REF_CHAIN (ref)); - break; - } - *op = gen_rtx_SUBREG (V2DImode, *op, 0); - } - else if (CONST_INT_P (*op)) - { - rtx vec_cst; - rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0); - - /* Prefer all ones vector in case of -1. */ - if (constm1_operand (*op, GET_MODE (*op))) - vec_cst = CONSTM1_RTX (V2DImode); - else - vec_cst = gen_rtx_CONST_VECTOR (V2DImode, - gen_rtvec (2, *op, const0_rtx)); - - if (!standard_sse_constant_p (vec_cst, V2DImode)) - { - start_sequence (); - vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst)); - rtx_insn *seq = get_insns (); - end_sequence (); - emit_insn_before (seq, insn); - } - - emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn); - *op = tmp; - } - else - { - gcc_assert (SUBREG_P (*op)); - gcc_assert (GET_MODE (*op) == V2DImode); + /* For 32-bit MS-ABI the default is to keep aggregate + return pointer. */ + if (ix86_function_type_abi (fntype) == MS_ABI) + return true; } + return KEEP_AGGREGATE_RETURN_POINTER != 0; } -/* Convert INSN to vector mode. 
*/ - -void -dimode_scalar_chain::convert_insn (rtx_insn *insn) -{ - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - rtx subreg; +/* Value is the number of bytes of arguments automatically + popped when returning from a subroutine call. + FUNDECL is the declaration node of the function (as a tree), + FUNTYPE is the data type of the function (as a tree), + or for a library call it is an identifier node for the subroutine name. + SIZE is the number of bytes of arguments passed on the stack. - if (MEM_P (dst) && !REG_P (src)) - { - /* There are no scalar integer instructions and therefore - temporary register usage is required. */ - rtx tmp = gen_reg_rtx (DImode); - emit_conversion_insns (gen_move_insn (dst, tmp), insn); - dst = gen_rtx_SUBREG (V2DImode, tmp, 0); - } + On the 80386, the RTD insn may be used to pop them if the number + of args is fixed, but if the number is variable then the caller + must pop them all. RTD can't be used for library calls now + because the library is compiled with the Unix compiler. + Use of RTD is a selectable option, since it is incompatible with + standard Unix calling sequences. If the option is not selected, + the caller must always pop the args. - switch (GET_CODE (src)) - { - case ASHIFT: - case ASHIFTRT: - case LSHIFTRT: - convert_op (&XEXP (src, 0), insn); - PUT_MODE (src, V2DImode); - break; + The attribute stdcall is equivalent to RTD on a per module basis. */ - case PLUS: - case MINUS: - case IOR: - case XOR: - case AND: - convert_op (&XEXP (src, 0), insn); - convert_op (&XEXP (src, 1), insn); - PUT_MODE (src, V2DImode); - break; +static poly_int64 +ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size) +{ + unsigned int ccvt; - case NEG: - src = XEXP (src, 0); - convert_op (&src, insn); - subreg = gen_reg_rtx (V2DImode); - emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn); - src = gen_rtx_MINUS (V2DImode, subreg, src); - break; + /* None of the 64-bit ABIs pop arguments. */ + if (TARGET_64BIT) + return 0; - case NOT: - src = XEXP (src, 0); - convert_op (&src, insn); - subreg = gen_reg_rtx (V2DImode); - emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn); - src = gen_rtx_XOR (V2DImode, src, subreg); - break; + ccvt = ix86_get_callcvt (funtype); - case MEM: - if (!REG_P (dst)) - convert_op (&src, insn); - break; + if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL + | IX86_CALLCVT_THISCALL)) != 0 + && ! stdarg_p (funtype)) + return size; - case REG: - if (!MEM_P (dst)) - convert_op (&src, insn); - break; + /* Lose any fake structure return argument if it is passed on the stack. */ + if (aggregate_value_p (TREE_TYPE (funtype), fundecl) + && !ix86_keep_aggregate_return_pointer (funtype)) + { + int nregs = ix86_function_regparm (funtype, fundecl); + if (nregs == 0) + return GET_MODE_SIZE (Pmode); + } - case SUBREG: - gcc_assert (GET_MODE (src) == V2DImode); - break; + return 0; +} - case COMPARE: - src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); +/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. 
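/* Callee-pop illustration for ix86_return_pops_args above; a minimal
   32-bit sketch, declarations illustrative only.  For a prototyped,
   non-variadic stdcall function SIZE is returned, so the callee cleans
   up its own stack arguments.  */
__attribute__((stdcall)) int sum2 (int a, int b);	/* returns with "ret $8" */
int
call_sum2 (void)
{
  return sum2 (1, 2);	/* no %esp adjustment at the call site */
}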
*/ - gcc_assert ((REG_P (src) && GET_MODE (src) == DImode) - || (SUBREG_P (src) && GET_MODE (src) == V2DImode)); +static bool +ix86_legitimate_combined_insn (rtx_insn *insn) +{ + int i; - if (REG_P (src)) - subreg = gen_rtx_SUBREG (V2DImode, src, 0); - else - subreg = copy_rtx_if_shared (src); - emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg)), - insn); - dst = gen_rtx_REG (CCmode, FLAGS_REG); - src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src), - copy_rtx_if_shared (src)), - UNSPEC_PTEST); - break; + /* Check operand constraints in case hard registers were propagated + into insn pattern. This check prevents combine pass from + generating insn patterns with invalid hard register operands. + These invalid insns can eventually confuse reload to error out + with a spill failure. See also PRs 46829 and 46843. */ - case CONST_INT: - convert_op (&src, insn); - break; + gcc_assert (INSN_CODE (insn) >= 0); - default: - gcc_unreachable (); - } + extract_insn (insn); + preprocess_constraints (insn); - SET_SRC (def_set) = src; - SET_DEST (def_set) = dst; + int n_operands = recog_data.n_operands; + int n_alternatives = recog_data.n_alternatives; + for (i = 0; i < n_operands; i++) + { + rtx op = recog_data.operand[i]; + machine_mode mode = GET_MODE (op); + const operand_alternative *op_alt; + int offset = 0; + bool win; + int j; - /* Drop possible dead definitions. */ - PATTERN (insn) = def_set; + /* A unary operator may be accepted by the predicate, but it + is irrelevant for matching constraints. */ + if (UNARY_P (op)) + op = XEXP (op, 0); - INSN_CODE (insn) = -1; - recog_memoized (insn); - df_insn_rescan (insn); -} + if (SUBREG_P (op)) + { + if (REG_P (SUBREG_REG (op)) + && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER) + offset = subreg_regno_offset (REGNO (SUBREG_REG (op)), + GET_MODE (SUBREG_REG (op)), + SUBREG_BYTE (op), + GET_MODE (op)); + op = SUBREG_REG (op); + } -/* Fix uses of converted REG in debug insns. */ + if (!(REG_P (op) && HARD_REGISTER_P (op))) + continue; -void -timode_scalar_chain::fix_debug_reg_uses (rtx reg) -{ - if (!flag_var_tracking) - return; + op_alt = recog_op_alt; - df_ref ref, next; - for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next) - { - rtx_insn *insn = DF_REF_INSN (ref); - /* Make sure the next ref is for a different instruction, - so that we're not affected by the rescan. */ - next = DF_REF_NEXT_REG (ref); - while (next && DF_REF_INSN (next) == insn) - next = DF_REF_NEXT_REG (next); + /* Operand has no constraints, anything is OK. */ + win = !n_alternatives; - if (DEBUG_INSN_P (insn)) + alternative_mask preferred = get_preferred_alternatives (insn); + for (j = 0; j < n_alternatives; j++, op_alt += n_operands) { - /* It may be a debug insn with a TImode variable in - register. */ - bool changed = false; - for (; ref != next; ref = DF_REF_NEXT_REG (ref)) + if (!TEST_BIT (preferred, j)) + continue; + if (op_alt[i].anything_ok + || (op_alt[i].matches != -1 + && operands_match_p + (recog_data.operand[i], + recog_data.operand[op_alt[i].matches])) + || reg_fits_class_p (op, op_alt[i].cl, offset, mode)) { - rtx *loc = DF_REF_LOC (ref); - if (REG_P (*loc) && GET_MODE (*loc) == V1TImode) - { - *loc = gen_rtx_SUBREG (TImode, *loc, 0); - changed = true; - } + win = true; + break; } - if (changed) - df_insn_rescan (insn); } + + if (!win) + return false; } + + return true; } + +/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. 
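/* Context for the hook below: AddressSanitizer finds the shadow byte of
   an address as (addr >> 3) + offset, and this hook supplies the offset
   (0x7fff8000 for LP64 targets other than Darwin, 1 << 29 for 32-bit).
   A minimal sketch of that mapping, assuming the LP64 value; the helper
   name is illustrative only.  */
static unsigned long long
asan_shadow_address (unsigned long long addr)
{
  return (addr >> 3) + 0x7fff8000ULL;
}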
*/ -/* Convert INSN from TImode to V1T1mode. */ +static unsigned HOST_WIDE_INT +ix86_asan_shadow_offset (void) +{ + return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44) + : HOST_WIDE_INT_C (0x7fff8000)) + : (HOST_WIDE_INT_1 << 29); +} + +/* Argument support functions. */ -void -timode_scalar_chain::convert_insn (rtx_insn *insn) +/* Return true when register may be used to pass function parameters. */ +bool +ix86_function_arg_regno_p (int regno) { - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); + int i; + enum calling_abi call_abi; + const int *parm_regs; - switch (GET_CODE (dst)) + if (!TARGET_64BIT) { - case REG: - { - rtx tmp = find_reg_equal_equiv_note (insn); - if (tmp) - PUT_MODE (XEXP (tmp, 0), V1TImode); - PUT_MODE (dst, V1TImode); - fix_debug_reg_uses (dst); - } - break; - case MEM: - PUT_MODE (dst, V1TImode); - break; - - default: - gcc_unreachable (); + if (TARGET_MACHO) + return (regno < REGPARM_MAX + || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); + else + return (regno < REGPARM_MAX + || (TARGET_MMX && MMX_REGNO_P (regno) + && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) + || (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); } - switch (GET_CODE (src)) - { - case REG: - PUT_MODE (src, V1TImode); - /* Call fix_debug_reg_uses only if SRC is never defined. */ - if (!DF_REG_DEF_CHAIN (REGNO (src))) - fix_debug_reg_uses (src); - break; - - case MEM: - PUT_MODE (src, V1TImode); - break; - - case CONST_WIDE_INT: - if (NONDEBUG_INSN_P (insn)) - { - /* Since there are no instructions to store 128-bit constant, - temporary register usage is required. */ - rtx tmp = gen_reg_rtx (V1TImode); - start_sequence (); - src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); - src = validize_mem (force_const_mem (V1TImode, src)); - rtx_insn *seq = get_insns (); - end_sequence (); - if (seq) - emit_insn_before (seq, insn); - emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); - dst = tmp; - } - break; - - case CONST_INT: - switch (standard_sse_constant_p (src, TImode)) - { - case 1: - src = CONST0_RTX (GET_MODE (dst)); - break; - case 2: - src = CONSTM1_RTX (GET_MODE (dst)); - break; - default: - gcc_unreachable (); - } - if (NONDEBUG_INSN_P (insn)) - { - rtx tmp = gen_reg_rtx (V1TImode); - /* Since there are no instructions to store standard SSE - constant, temporary register usage is required. */ - emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); - dst = tmp; - } - break; + if (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) + return true; - default: - gcc_unreachable (); - } + /* TODO: The function should depend on current function ABI but + builtins.c would need updating then. Therefore we use the + default ABI. */ + call_abi = ix86_cfun_abi (); - SET_SRC (def_set) = src; - SET_DEST (def_set) = dst; + /* RAX is used as hidden argument to va_arg functions. */ + if (call_abi == SYSV_ABI && regno == AX_REG) + return true; - /* Drop possible dead definitions. */ - PATTERN (insn) = def_set; + if (call_abi == MS_ABI) + parm_regs = x86_64_ms_abi_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; - INSN_CODE (insn) = -1; - recog_memoized (insn); - df_insn_rescan (insn); + for (i = 0; i < (call_abi == MS_ABI + ? 
X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++) + if (regno == parm_regs[i]) + return true; + return false; } -void -dimode_scalar_chain::convert_registers () -{ - bitmap_iterator bi; - unsigned id; +/* Return if we do not know how to pass ARG solely in registers. */ - EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) - convert_reg (id); +static bool +ix86_must_pass_in_stack (const function_arg_info &arg) +{ + if (must_pass_in_stack_var_size_or_pad (arg)) + return true; - EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) - make_vector_copies (id); + /* For 32-bit, we want TImode aggregates to go on the stack. But watch out! + The layout_type routine is crafty and tries to trick us into passing + currently unsupported vector types on the stack by using TImode. */ + return (!TARGET_64BIT && arg.mode == TImode + && arg.type && TREE_CODE (arg.type) != VECTOR_TYPE); } -/* Convert whole chain creating required register - conversions and copies. */ - +/* It returns the size, in bytes, of the area reserved for arguments passed + in registers for the function represented by fndecl dependent to the used + abi format. */ int -scalar_chain::convert () +ix86_reg_parm_stack_space (const_tree fndecl) { - bitmap_iterator bi; - unsigned id; - int converted_insns = 0; - - if (!dbg_cnt (stv_conversion)) - return 0; - - if (dump_file) - fprintf (dump_file, "Converting chain #%d...\n", chain_id); + enum calling_abi call_abi = SYSV_ABI; + if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL) + call_abi = ix86_function_abi (fndecl); + else + call_abi = ix86_function_type_abi (fndecl); + if (TARGET_64BIT && call_abi == MS_ABI) + return 32; + return 0; +} - convert_registers (); - - EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) - { - convert_insn (DF_INSN_UID_GET (id)->insn); - converted_insns++; - } - - return converted_insns; +/* We add this as a workaround in order to use libc_has_function + hook in i386.md. */ +bool +ix86_libc_has_function (enum function_class fn_class) +{ + return targetm.libc_has_function (fn_class); } -/* Main STV pass function. Find and convert scalar - instructions into vector mode when profitable. */ - -static unsigned int -convert_scalars_to_vector () +/* Returns value SYSV_ABI, MS_ABI dependent on fntype, + specifying the call abi used. */ +enum calling_abi +ix86_function_type_abi (const_tree fntype) { - basic_block bb; - bitmap candidates; - int converted_insns = 0; - - bitmap_obstack_initialize (NULL); - candidates = BITMAP_ALLOC (NULL); - - calculate_dominance_info (CDI_DOMINATORS); - df_set_flags (DF_DEFER_INSN_RESCAN); - df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); - df_md_add_problem (); - df_analyze (); - - /* Find all instructions we want to convert into vector mode. 
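/* The 32 bytes returned by ix86_reg_parm_stack_space above for the
   64-bit MS ABI are the caller-allocated "home area" for the four
   register parameters.  A minimal sketch, declaration illustrative
   only: the caller passes a and b in %rcx and %rdx yet still reserves
   32 bytes of stack for them at the call site.  */
__attribute__((ms_abi)) long long msfn (long long a, long long b);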
*/ - if (dump_file) - fprintf (dump_file, "Searching for mode conversion candidates...\n"); - - FOR_EACH_BB_FN (bb, cfun) - { - rtx_insn *insn; - FOR_BB_INSNS (bb, insn) - if (scalar_to_vector_candidate_p (insn)) - { - if (dump_file) - fprintf (dump_file, " insn %d is marked as a candidate\n", - INSN_UID (insn)); - - bitmap_set_bit (candidates, INSN_UID (insn)); - } - } - - remove_non_convertible_regs (candidates); + enum calling_abi abi = ix86_abi; - if (bitmap_empty_p (candidates)) - if (dump_file) - fprintf (dump_file, "There are no candidates for optimization.\n"); + if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE) + return abi; - while (!bitmap_empty_p (candidates)) + if (abi == SYSV_ABI + && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype))) { - unsigned uid = bitmap_first_set_bit (candidates); - scalar_chain *chain; - - if (TARGET_64BIT) - chain = new timode_scalar_chain; - else - chain = new dimode_scalar_chain; - - /* Find instructions chain we want to convert to vector mode. - Check all uses and definitions to estimate all required - conversions. */ - chain->build (candidates, uid); - - if (chain->compute_convert_gain () > 0) - converted_insns += chain->convert (); - else - if (dump_file) - fprintf (dump_file, "Chain #%d conversion is not profitable\n", - chain->chain_id); - - delete chain; - } - - if (dump_file) - fprintf (dump_file, "Total insns converted: %d\n", converted_insns); - - BITMAP_FREE (candidates); - bitmap_obstack_release (NULL); - df_process_deferred_rescans (); + static int warned; + if (TARGET_X32 && !warned) + { + error ("X32 does not support % attribute"); + warned = 1; + } - /* Conversion means we may have 128bit register spills/fills - which require aligned stack. */ - if (converted_insns) - { - if (crtl->stack_alignment_needed < 128) - crtl->stack_alignment_needed = 128; - if (crtl->stack_alignment_estimated < 128) - crtl->stack_alignment_estimated = 128; - /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */ - if (TARGET_64BIT) - for (tree parm = DECL_ARGUMENTS (current_function_decl); - parm; parm = DECL_CHAIN (parm)) - { - if (TYPE_MODE (TREE_TYPE (parm)) != TImode) - continue; - if (DECL_RTL_SET_P (parm) - && GET_MODE (DECL_RTL (parm)) == V1TImode) - { - rtx r = DECL_RTL (parm); - if (REG_P (r)) - SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0)); - } - if (DECL_INCOMING_RTL (parm) - && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode) - { - rtx r = DECL_INCOMING_RTL (parm); - if (REG_P (r)) - DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); - } - } + abi = MS_ABI; } + else if (abi == MS_ABI + && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype))) + abi = SYSV_ABI; - return 0; + return abi; } -namespace { - -const pass_data pass_data_insert_vzeroupper = +enum calling_abi +ix86_function_abi (const_tree fndecl) { - RTL_PASS, /* type */ - "vzeroupper", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; + return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi; +} -class pass_insert_vzeroupper : public rtl_opt_pass +/* Returns value SYSV_ABI, MS_ABI dependent on cfun, + specifying the call abi used. */ +enum calling_abi +ix86_cfun_abi (void) { -public: - pass_insert_vzeroupper(gcc::context *ctxt) - : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt) - {} + return cfun ? 
cfun->machine->call_abi : ix86_abi; +} - /* opt_pass methods: */ - virtual bool gate (function *) +bool +ix86_function_ms_hook_prologue (const_tree fn) +{ + if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn))) { - return TARGET_AVX - && TARGET_VZEROUPPER && flag_expensive_optimizations - && !optimize_size; + if (decl_function_context (fn) != NULL_TREE) + error_at (DECL_SOURCE_LOCATION (fn), + "% attribute is not compatible " + "with nested function"); + else + return true; } + return false; +} - virtual unsigned int execute (function *) - { - return rest_of_handle_insert_vzeroupper (); - } +bool +ix86_function_naked (const_tree fn) +{ + if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn))) + return true; -}; // class pass_insert_vzeroupper + return false; +} -const pass_data pass_data_stv = -{ - RTL_PASS, /* type */ - "stv", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; +/* Write the extra assembler code needed to declare a function properly. */ -class pass_stv : public rtl_opt_pass +void +ix86_asm_output_function_label (FILE *asm_out_file, const char *fname, + tree decl) { -public: - pass_stv (gcc::context *ctxt) - : rtl_opt_pass (pass_data_stv, ctxt), - timode_p (false) - {} + bool is_ms_hook = ix86_function_ms_hook_prologue (decl); - /* opt_pass methods: */ - virtual bool gate (function *) + if (is_ms_hook) { - return (timode_p == !!TARGET_64BIT - && TARGET_STV && TARGET_SSE2 && optimize > 1); - } + int i, filler_count = (TARGET_64BIT ? 32 : 16); + unsigned int filler_cc = 0xcccccccc; - virtual unsigned int execute (function *) - { - return convert_scalars_to_vector (); + for (i = 0; i < filler_count; i += 4) + fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc); } - opt_pass *clone () - { - return new pass_stv (m_ctxt); - } +#ifdef SUBTARGET_ASM_UNWIND_INIT + SUBTARGET_ASM_UNWIND_INIT (asm_out_file); +#endif + + ASM_OUTPUT_LABEL (asm_out_file, fname); - void set_pass_param (unsigned int n, bool param) + /* Output magic byte marker, if hot-patch attribute is set. */ + if (is_ms_hook) { - gcc_assert (n == 0); - timode_p = param; + if (TARGET_64BIT) + { + /* leaq [%rsp + 0], %rsp */ + fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n", + asm_out_file); + } + else + { + /* movl.s %edi, %edi + push %ebp + movl.s %esp, %ebp */ + fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file); + } } +} -private: - bool timode_p; -}; // class pass_stv - -} // anon namespace - -rtl_opt_pass * -make_pass_insert_vzeroupper (gcc::context *ctxt) +/* Implementation of call abi switching target hook. Specific to FNDECL + the specific call register sets are set. See also + ix86_conditional_register_usage for more details. */ +void +ix86_call_abi_override (const_tree fndecl) { - return new pass_insert_vzeroupper (ctxt); + cfun->machine->call_abi = ix86_function_abi (fndecl); } -rtl_opt_pass * -make_pass_stv (gcc::context *ctxt) +/* Return 1 if pseudo register should be created and used to hold + GOT address for PIC code. */ +bool +ix86_use_pseudo_pic_reg (void) { - return new pass_stv (ctxt); + if ((TARGET_64BIT + && (ix86_cmodel == CM_SMALL_PIC + || TARGET_PECOFF)) + || !flag_pic) + return false; + return true; } -/* Inserting ENDBRANCH instructions. */ +/* Initialize large model PIC register. 
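/* Usage sketch for the hot-patch support above (declaration is
   illustrative only): a function carrying the attribute gets the 0xCC
   filler bytes and the "leaq 0(%rsp), %rsp" (64-bit) or
   "movl.s %edi, %edi; push %ebp; movl.s %esp, %ebp" (32-bit) entry
   sequence from ix86_asm_output_function_label, and the attribute is
   rejected on nested functions.  */
__attribute__((ms_hook_prologue)) void patchable_entry (void);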
*/ -static unsigned int -rest_of_insert_endbranch (void) +static void +ix86_init_large_pic_reg (unsigned int tmp_regno) { - timevar_push (TV_MACH_DEP); - - rtx cet_eb; - rtx_insn *insn; - basic_block bb; - - /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is - absent among function attributes. Later an optimization will be - introduced to make analysis if an address of a static function is - taken. A static function whose address is not taken will get a - nocf_check attribute. This will allow to reduce the number of EB. */ - - if (!lookup_attribute ("nocf_check", - TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) - && (!flag_manual_endbr - || lookup_attribute ("cf_check", - DECL_ATTRIBUTES (cfun->decl))) - && !cgraph_node::get (cfun->decl)->only_called_directly_p ()) - { - /* Queue ENDBR insertion to x86_function_profiler. */ - if (crtl->profile && flag_fentry) - cfun->machine->endbr_queued_at_entrance = true; - else - { - cet_eb = gen_nop_endbr (); - - bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; - insn = BB_HEAD (bb); - emit_insn_before (cet_eb, insn); - } - } - - bb = 0; - FOR_EACH_BB_FN (bb, cfun) - { - for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); - insn = NEXT_INSN (insn)) - { - if (CALL_P (insn)) - { - bool need_endbr; - need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL; - if (!need_endbr && !SIBLING_CALL_P (insn)) - { - rtx call = get_call_rtx_from (insn); - rtx fnaddr = XEXP (call, 0); - tree fndecl = NULL_TREE; - - /* Also generate ENDBRANCH for non-tail call which - may return via indirect branch. */ - if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); - if (fndecl == NULL_TREE) - fndecl = MEM_EXPR (fnaddr); - if (fndecl - && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE - && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE) - fndecl = NULL_TREE; - if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl))) - { - tree fntype = TREE_TYPE (fndecl); - if (lookup_attribute ("indirect_return", - TYPE_ATTRIBUTES (fntype))) - need_endbr = true; - } - } - if (!need_endbr) - continue; - /* Generate ENDBRANCH after CALL, which can return more than - twice, setjmp-like functions. */ - - cet_eb = gen_nop_endbr (); - emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn)); - continue; - } - - if (JUMP_P (insn) && flag_cet_switch) - { - rtx target = JUMP_LABEL (insn); - if (target == NULL_RTX || ANY_RETURN_P (target)) - continue; - - /* Check the jump is a switch table. */ - rtx_insn *label = as_a (target); - rtx_insn *table = next_insn (label); - if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) - continue; - - /* For the indirect jump find out all places it jumps and insert - ENDBRANCH there. It should be done under a special flag to - control ENDBRANCH generation for switch stmts. */ - edge_iterator ei; - edge e; - basic_block dest_blk; - - FOR_EACH_EDGE (e, ei, bb->succs) - { - rtx_insn *insn; - - dest_blk = e->dest; - insn = BB_HEAD (dest_blk); - gcc_assert (LABEL_P (insn)); - cet_eb = gen_nop_endbr (); - emit_insn_after (cet_eb, insn); - } - continue; - } - - if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn)) - || (NOTE_P (insn) - && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) - /* TODO. Check /s bit also. 
*/ - { - cet_eb = gen_nop_endbr (); - emit_insn_after (cet_eb, insn); - continue; - } - } - } + rtx_code_label *label; + rtx tmp_reg; - timevar_pop (TV_MACH_DEP); - return 0; + gcc_assert (Pmode == DImode); + label = gen_label_rtx (); + emit_label (label); + LABEL_PRESERVE_P (label) = 1; + tmp_reg = gen_rtx_REG (Pmode, tmp_regno); + gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno); + emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, + label)); + emit_insn (gen_set_got_offset_rex64 (tmp_reg, label)); + emit_insn (ix86_gen_add3 (pic_offset_table_rtx, + pic_offset_table_rtx, tmp_reg)); + const char *name = LABEL_NAME (label); + PUT_CODE (label, NOTE); + NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL; + NOTE_DELETED_LABEL_NAME (label) = name; } -namespace { - -const pass_data pass_data_insert_endbranch = +/* Create and initialize PIC register if required. */ +static void +ix86_init_pic_reg (void) { - RTL_PASS, /* type. */ - "cet", /* name. */ - OPTGROUP_NONE, /* optinfo_flags. */ - TV_MACH_DEP, /* tv_id. */ - 0, /* properties_required. */ - 0, /* properties_provided. */ - 0, /* properties_destroyed. */ - 0, /* todo_flags_start. */ - 0, /* todo_flags_finish. */ -}; + edge entry_edge; + rtx_insn *seq; -class pass_insert_endbranch : public rtl_opt_pass -{ -public: - pass_insert_endbranch (gcc::context *ctxt) - : rtl_opt_pass (pass_data_insert_endbranch, ctxt) - {} + if (!ix86_use_pseudo_pic_reg ()) + return; + + start_sequence (); - /* opt_pass methods: */ - virtual bool gate (function *) + if (TARGET_64BIT) { - return ((flag_cf_protection & CF_BRANCH)); + if (ix86_cmodel == CM_LARGE_PIC) + ix86_init_large_pic_reg (R11_REG); + else + emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); } - - virtual unsigned int execute (function *) + else { - return rest_of_insert_endbranch (); + /* If there is future mcount call in the function it is more profitable + to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */ + rtx reg = crtl->profile + ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM) + : pic_offset_table_rtx; + rtx_insn *insn = emit_insn (gen_set_got (reg)); + RTX_FRAME_RELATED_P (insn) = 1; + if (crtl->profile) + emit_move_insn (pic_offset_table_rtx, reg); + add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); } -}; // class pass_insert_endbranch - -} // anon namespace + seq = get_insns (); + end_sequence (); -rtl_opt_pass * -make_pass_insert_endbranch (gcc::context *ctxt) -{ - return new pass_insert_endbranch (ctxt); + entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + insert_insn_on_edge (seq, entry_edge); + commit_one_edge_insertion (entry_edge); } -/* At entry of the nearest common dominator for basic blocks with - conversions, generate a single - vxorps %xmmN, %xmmN, %xmmN - for all - vcvtss2sd op, %xmmN, %xmmX - vcvtsd2ss op, %xmmN, %xmmX - vcvtsi2ss op, %xmmN, %xmmX - vcvtsi2sd op, %xmmN, %xmmX - - NB: We want to generate only a single vxorps to cover the whole - function. The LCM algorithm isn't appropriate here since it may - place a vxorps inside the loop. */ - -static unsigned int -remove_partial_avx_dependency (void) -{ - timevar_push (TV_MACH_DEP); - - bitmap_obstack_initialize (NULL); - bitmap convert_bbs = BITMAP_ALLOC (NULL); +/* Initialize a variable CUM of type CUMULATIVE_ARGS + for a call to a function whose data type is FNTYPE. + For a library call, FNTYPE is 0. 
*/ - basic_block bb; - rtx_insn *insn, *set_insn; - rtx set; - rtx v4sf_const0 = NULL_RTX; +void +init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ + tree fntype, /* tree ptr for function decl */ + rtx libname, /* SYMBOL_REF of library name or 0 */ + tree fndecl, + int caller) +{ + struct cgraph_local_info *i = NULL; + struct cgraph_node *target = NULL; - auto_vec control_flow_insns; + memset (cum, 0, sizeof (*cum)); - FOR_EACH_BB_FN (bb, cfun) + if (fndecl) { - FOR_BB_INSNS (bb, insn) + target = cgraph_node::get (fndecl); + if (target) { - if (!NONDEBUG_INSN_P (insn)) - continue; - - set = single_set (insn); - if (!set) - continue; + target = target->function_symbol (); + i = cgraph_node::local_info (target->decl); + cum->call_abi = ix86_function_abi (target->decl); + } + else + cum->call_abi = ix86_function_abi (fndecl); + } + else + cum->call_abi = ix86_function_type_abi (fntype); - if (get_attr_avx_partial_xmm_update (insn) - != AVX_PARTIAL_XMM_UPDATE_TRUE) - continue; + cum->caller = caller; - if (!v4sf_const0) - { - calculate_dominance_info (CDI_DOMINATORS); - df_set_flags (DF_DEFER_INSN_RESCAN); - df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); - df_md_add_problem (); - df_analyze (); - v4sf_const0 = gen_reg_rtx (V4SFmode); - } + /* Set up the number of registers to use for passing arguments. */ + cum->nregs = ix86_regparm; + if (TARGET_64BIT) + { + cum->nregs = (cum->call_abi == SYSV_ABI + ? X86_64_REGPARM_MAX + : X86_64_MS_REGPARM_MAX); + } + if (TARGET_SSE) + { + cum->sse_nregs = SSE_REGPARM_MAX; + if (TARGET_64BIT) + { + cum->sse_nregs = (cum->call_abi == SYSV_ABI + ? X86_64_SSE_REGPARM_MAX + : X86_64_MS_SSE_REGPARM_MAX); + } + } + if (TARGET_MMX) + cum->mmx_nregs = MMX_REGPARM_MAX; + cum->warn_avx512f = true; + cum->warn_avx = true; + cum->warn_sse = true; + cum->warn_mmx = true; - /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, - SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and - vec_merge with subreg. */ - rtx src = SET_SRC (set); - rtx dest = SET_DEST (set); - machine_mode dest_mode = GET_MODE (dest); + /* Because type might mismatch in between caller and callee, we need to + use actual type of function for local calls. + FIXME: cgraph_analyze can be told to actually record if function uses + va_start so for local functions maybe_vaarg can be made aggressive + helping K&R code. + FIXME: once typesytem is fixed, we won't need this code anymore. */ + if (i && i->local && i->can_change_signature) + fntype = TREE_TYPE (target->decl); + cum->stdarg = stdarg_p (fntype); + cum->maybe_vaarg = (fntype + ? (!prototype_p (fntype) || stdarg_p (fntype)) + : !libname); - rtx zero; - machine_mode dest_vecmode; - if (dest_mode == E_SFmode) - { - dest_vecmode = V4SFmode; - zero = v4sf_const0; - } - else - { - dest_vecmode = V2DFmode; - zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); - } + cum->decl = fndecl; - /* Change source to vector mode. */ - src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src); - src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero, - GEN_INT (HOST_WIDE_INT_1U)); - /* Change destination to vector mode. */ - rtx vec = gen_reg_rtx (dest_vecmode); - /* Generate an XMM vector SET. 
*/ - set = gen_rtx_SET (vec, src); - set_insn = emit_insn_before (set, insn); - df_insn_rescan (set_insn); - - if (cfun->can_throw_non_call_exceptions) + cum->warn_empty = !warn_abi || cum->stdarg; + if (!cum->warn_empty && fntype) + { + function_args_iterator iter; + tree argtype; + bool seen_empty_type = false; + FOREACH_FUNCTION_ARGS (fntype, argtype, iter) + { + if (argtype == error_mark_node || VOID_TYPE_P (argtype)) + break; + if (TYPE_EMPTY_P (argtype)) + seen_empty_type = true; + else if (seen_empty_type) { - /* Handle REG_EH_REGION note. */ - rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); - if (note) - { - control_flow_insns.safe_push (set_insn); - add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0)); - } + cum->warn_empty = true; + break; } - - src = gen_rtx_SUBREG (dest_mode, vec, 0); - set = gen_rtx_SET (dest, src); - - /* Drop possible dead definitions. */ - PATTERN (insn) = set; - - INSN_CODE (insn) = -1; - recog_memoized (insn); - df_insn_rescan (insn); - bitmap_set_bit (convert_bbs, bb->index); } } - if (v4sf_const0) + if (!TARGET_64BIT) { - /* (Re-)discover loops so that bb->loop_father can be used in the - analysis below. */ - loop_optimizer_init (AVOID_CFG_MODIFICATIONS); - - /* Generate a vxorps at entry of the nearest dominator for basic - blocks with conversions, which is in the the fake loop that - contains the whole function, so that there is only a single - vxorps in the whole function. */ - bb = nearest_common_dominator_for_set (CDI_DOMINATORS, - convert_bbs); - while (bb->loop_father->latch - != EXIT_BLOCK_PTR_FOR_FN (cfun)) - bb = get_immediate_dominator (CDI_DOMINATORS, - bb->loop_father->header); - - set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode)); + /* If there are variable arguments, then we won't pass anything + in registers in 32-bit mode. */ + if (stdarg_p (fntype)) + { + cum->nregs = 0; + /* Since in 32-bit, variable arguments are always passed on + stack, there is scratch register available for indirect + sibcall. */ + cfun->machine->arg_reg_available = true; + cum->sse_nregs = 0; + cum->mmx_nregs = 0; + cum->warn_avx512f = false; + cum->warn_avx = false; + cum->warn_sse = false; + cum->warn_mmx = false; + return; + } - insn = BB_HEAD (bb); - while (insn && !NONDEBUG_INSN_P (insn)) + /* Use ecx and edx registers if function has fastcall attribute, + else look for regparm information. */ + if (fntype) { - if (insn == BB_END (bb)) + unsigned int ccvt = ix86_get_callcvt (fntype); + if ((ccvt & IX86_CALLCVT_THISCALL) != 0) { - insn = NULL; - break; + cum->nregs = 1; + cum->fastcall = 1; /* Same first register as in fastcall. */ + } + else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + { + cum->nregs = 2; + cum->fastcall = 1; } - insn = NEXT_INSN (insn); + else + cum->nregs = ix86_function_regparm (fntype, fndecl); } - if (insn == BB_HEAD (bb)) - set_insn = emit_insn_before (set, insn); - else - set_insn = emit_insn_after (set, - insn ? PREV_INSN (insn) : BB_END (bb)); - df_insn_rescan (set_insn); - df_process_deferred_rescans (); - loop_optimizer_finalize (); - - if (!control_flow_insns.is_empty ()) - { - free_dominance_info (CDI_DOMINATORS); - unsigned int i; - FOR_EACH_VEC_ELT (control_flow_insns, i, insn) - if (control_flow_insn_p (insn)) - { - /* Split the block after insn. There will be a fallthru - edge, which is OK so we keep it. We have to create - the exception edges ourselves. 
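/* What the 32-bit register counts set up above mean in practice; a
   minimal sketch, declarations illustrative only.  */
__attribute__((fastcall)) int f3 (int a, int b, int c);
	/* cum->nregs == 2: a in %ecx, b in %edx, c on the stack */
__attribute__((regparm(3))) int g3 (int a, int b, int c);
	/* cum->nregs == 3: a in %eax, b in %edx, c in %ecx */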
*/ - bb = BLOCK_FOR_INSN (insn); - split_block (bb, insn); - rtl_make_eh_edge (NULL, bb, BB_END (bb)); - } - } + /* Set up the number of SSE registers used for passing SFmode + and DFmode arguments. Warn for mismatching ABI. */ + cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true); } - bitmap_obstack_release (NULL); - BITMAP_FREE (convert_bbs); - - timevar_pop (TV_MACH_DEP); - return 0; + cfun->machine->arg_reg_available = (cum->nregs > 0); } -namespace { - -const pass_data pass_data_remove_partial_avx_dependency = -{ - RTL_PASS, /* type */ - "rpad", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_remove_partial_avx_dependency : public rtl_opt_pass -{ -public: - pass_remove_partial_avx_dependency (gcc::context *ctxt) - : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); - } - - virtual unsigned int execute (function *) - { - return remove_partial_avx_dependency (); - } -}; // class pass_rpad - -} // anon namespace +/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. + But in the case of vector types, it is some vector mode. -rtl_opt_pass * -make_pass_remove_partial_avx_dependency (gcc::context *ctxt) -{ - return new pass_remove_partial_avx_dependency (ctxt); -} + When we have only some of our vector isa extensions enabled, then there + are some modes for which vector_mode_supported_p is false. For these + modes, the generic vector support in gcc will choose some non-vector mode + in order to implement the type. By computing the natural mode, we'll + select the proper ABI location for the operand and not depend on whatever + the middle-end decides to do with these vector types. -/* Return true if a red-zone is in use. We can't use red-zone when - there are local indirect jumps, like "indirect_jump" or "tablejump", - which jumps to another place in the function, since "call" in the - indirect thunk pushes the return address onto stack, destroying - red-zone. + The midde-end can't deal with the vector types > 16 bytes. In this + case, we return the original mode and warn ABI change if CUM isn't + NULL. - TODO: If we can reserve the first 2 WORDs, for PUSH and, another - for CALL, in red-zone, we can allow local indirect jumps with - indirect thunk. */ + If INT_RETURN is true, warn ABI change if the vector mode isn't + available for function return value. */ -bool -ix86_using_red_zone (void) +static machine_mode +type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, + bool in_return) { - return (TARGET_RED_ZONE - && !TARGET_64BIT_MS_ABI - && (!cfun->machine->has_local_indirect_jump - || cfun->machine->indirect_branch_type == indirect_branch_keep)); -} - -/* Return a string that documents the current -m options. The caller is - responsible for freeing the string. 
*/ + machine_mode mode = TYPE_MODE (type); -static char * -ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, - int flags, int flags2, - const char *arch, const char *tune, - enum fpmath_unit fpmath, bool add_nl_p, bool add_abi_p) -{ - struct ix86_target_opts - { - const char *option; /* option string */ - HOST_WIDE_INT mask; /* isa mask options */ - }; + if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode)) + { + HOST_WIDE_INT size = int_size_in_bytes (type); + if ((size == 8 || size == 16 || size == 32 || size == 64) + /* ??? Generic code allows us to create width 1 vectors. Ignore. */ + && TYPE_VECTOR_SUBPARTS (type) > 1) + { + machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); - /* This table is ordered so that options like -msse4.2 that imply other - ISAs come first. Target string will be displayed in the same order. */ - static struct ix86_target_opts isa2_opts[] = - { - { "-mcx16", OPTION_MASK_ISA_CX16 }, - { "-mvaes", OPTION_MASK_ISA_VAES }, - { "-mrdpid", OPTION_MASK_ISA_RDPID }, - { "-mpconfig", OPTION_MASK_ISA_PCONFIG }, - { "-mwbnoinvd", OPTION_MASK_ISA_WBNOINVD }, - { "-msgx", OPTION_MASK_ISA_SGX }, - { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW }, - { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS }, - { "-mhle", OPTION_MASK_ISA_HLE }, - { "-mmovbe", OPTION_MASK_ISA_MOVBE }, - { "-mclzero", OPTION_MASK_ISA_CLZERO }, - { "-mmwaitx", OPTION_MASK_ISA_MWAITX }, - { "-mmovdir64b", OPTION_MASK_ISA_MOVDIR64B }, - { "-mwaitpkg", OPTION_MASK_ISA_WAITPKG }, - { "-mcldemote", OPTION_MASK_ISA_CLDEMOTE }, - { "-mptwrite", OPTION_MASK_ISA_PTWRITE } - }; - static struct ix86_target_opts isa_opts[] = - { - { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }, - { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG }, - { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ }, - { "-mgfni", OPTION_MASK_ISA_GFNI }, - { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI }, - { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 }, - { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI }, - { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA }, - { "-mavx512vl", OPTION_MASK_ISA_AVX512VL }, - { "-mavx512bw", OPTION_MASK_ISA_AVX512BW }, - { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ }, - { "-mavx512er", OPTION_MASK_ISA_AVX512ER }, - { "-mavx512pf", OPTION_MASK_ISA_AVX512PF }, - { "-mavx512cd", OPTION_MASK_ISA_AVX512CD }, - { "-mavx512f", OPTION_MASK_ISA_AVX512F }, - { "-mavx2", OPTION_MASK_ISA_AVX2 }, - { "-mfma", OPTION_MASK_ISA_FMA }, - { "-mxop", OPTION_MASK_ISA_XOP }, - { "-mfma4", OPTION_MASK_ISA_FMA4 }, - { "-mf16c", OPTION_MASK_ISA_F16C }, - { "-mavx", OPTION_MASK_ISA_AVX }, -/* { "-msse4" OPTION_MASK_ISA_SSE4 }, */ - { "-msse4.2", OPTION_MASK_ISA_SSE4_2 }, - { "-msse4.1", OPTION_MASK_ISA_SSE4_1 }, - { "-msse4a", OPTION_MASK_ISA_SSE4A }, - { "-mssse3", OPTION_MASK_ISA_SSSE3 }, - { "-msse3", OPTION_MASK_ISA_SSE3 }, - { "-maes", OPTION_MASK_ISA_AES }, - { "-msha", OPTION_MASK_ISA_SHA }, - { "-mpclmul", OPTION_MASK_ISA_PCLMUL }, - { "-msse2", OPTION_MASK_ISA_SSE2 }, - { "-msse", OPTION_MASK_ISA_SSE }, - { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A }, - { "-m3dnow", OPTION_MASK_ISA_3DNOW }, - { "-mmmx", OPTION_MASK_ISA_MMX }, - { "-mrtm", OPTION_MASK_ISA_RTM }, - { "-mprfchw", OPTION_MASK_ISA_PRFCHW }, - { "-mrdseed", OPTION_MASK_ISA_RDSEED }, - { "-madx", OPTION_MASK_ISA_ADX }, - { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 }, - { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT }, - { "-mxsaves", OPTION_MASK_ISA_XSAVES }, - { "-mxsavec", OPTION_MASK_ISA_XSAVEC }, - { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT }, - { "-mxsave", 
OPTION_MASK_ISA_XSAVE }, - { "-mabm", OPTION_MASK_ISA_ABM }, - { "-mbmi", OPTION_MASK_ISA_BMI }, - { "-mbmi2", OPTION_MASK_ISA_BMI2 }, - { "-mlzcnt", OPTION_MASK_ISA_LZCNT }, - { "-mtbm", OPTION_MASK_ISA_TBM }, - { "-mpopcnt", OPTION_MASK_ISA_POPCNT }, - { "-msahf", OPTION_MASK_ISA_SAHF }, - { "-mcrc32", OPTION_MASK_ISA_CRC32 }, - { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE }, - { "-mrdrnd", OPTION_MASK_ISA_RDRND }, - { "-mpku", OPTION_MASK_ISA_PKU }, - { "-mlwp", OPTION_MASK_ISA_LWP }, - { "-mfxsr", OPTION_MASK_ISA_FXSR }, - { "-mclwb", OPTION_MASK_ISA_CLWB }, - { "-mshstk", OPTION_MASK_ISA_SHSTK }, - { "-mmovdiri", OPTION_MASK_ISA_MOVDIRI } - }; + /* There are no XFmode vector modes. */ + if (innermode == XFmode) + return mode; - /* Flag options. */ - static struct ix86_target_opts flag_opts[] = - { - { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE }, - { "-mlong-double-128", MASK_LONG_DOUBLE_128 }, - { "-mlong-double-64", MASK_LONG_DOUBLE_64 }, - { "-m80387", MASK_80387 }, - { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS }, - { "-malign-double", MASK_ALIGN_DOUBLE }, - { "-mcld", MASK_CLD }, - { "-mfp-ret-in-387", MASK_FLOAT_RETURNS }, - { "-mieee-fp", MASK_IEEE_FP }, - { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS }, - { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY }, - { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT }, - { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS }, - { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 }, - { "-mno-push-args", MASK_NO_PUSH_ARGS }, - { "-mno-red-zone", MASK_NO_RED_ZONE }, - { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER }, - { "-mrecip", MASK_RECIP }, - { "-mrtd", MASK_RTD }, - { "-msseregparm", MASK_SSEREGPARM }, - { "-mstack-arg-probe", MASK_STACK_PROBE }, - { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS }, - { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS }, - { "-m8bit-idiv", MASK_USE_8BIT_IDIV }, - { "-mvzeroupper", MASK_VZEROUPPER }, - { "-mstv", MASK_STV }, - { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD }, - { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE }, - { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES } - }; + if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) + mode = MIN_MODE_VECTOR_FLOAT; + else + mode = MIN_MODE_VECTOR_INT; - /* Additional flag options. */ - static struct ix86_target_opts flag2_opts[] = - { - { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY } - }; + /* Get the mode which has this inner mode and number of units. 
*/ + FOR_EACH_MODE_FROM (mode, mode) + if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) + && GET_MODE_INNER (mode) == innermode) + { + if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) + { + static bool warnedavx512f; + static bool warnedavx512f_ret; - const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts) - + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2]; + if (cum && cum->warn_avx512f && !warnedavx512f) + { + if (warning (OPT_Wpsabi, "AVX512F vector argument " + "without AVX512F enabled changes the ABI")) + warnedavx512f = true; + } + else if (in_return && !warnedavx512f_ret) + { + if (warning (OPT_Wpsabi, "AVX512F vector return " + "without AVX512F enabled changes the ABI")) + warnedavx512f_ret = true; + } - char isa_other[40]; - char isa2_other[40]; - char flags_other[40]; - char flags2_other[40]; - unsigned num = 0; - unsigned i, j; - char *ret; - char *ptr; - size_t len; - size_t line_len; - size_t sep_len; - const char *abi; + return TYPE_MODE (type); + } + else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU) + { + static bool warnedavx; + static bool warnedavx_ret; - memset (opts, '\0', sizeof (opts)); + if (cum && cum->warn_avx && !warnedavx) + { + if (warning (OPT_Wpsabi, "AVX vector argument " + "without AVX enabled changes the ABI")) + warnedavx = true; + } + else if (in_return && !warnedavx_ret) + { + if (warning (OPT_Wpsabi, "AVX vector return " + "without AVX enabled changes the ABI")) + warnedavx_ret = true; + } - /* Add -march= option. */ - if (arch) - { - opts[num][0] = "-march="; - opts[num++][1] = arch; - } - - /* Add -mtune= option. */ - if (tune) - { - opts[num][0] = "-mtune="; - opts[num++][1] = tune; - } - - /* Add -m32/-m64/-mx32. */ - if (add_abi_p) - { - if ((isa & OPTION_MASK_ISA_64BIT) != 0) - { - if ((isa & OPTION_MASK_ABI_64) != 0) - abi = "-m64"; - else - abi = "-mx32"; - } - else - abi = "-m32"; - opts[num++][0] = abi; - } - isa &= ~(OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); - - /* Pick out the options in isa2 options. */ - for (i = 0; i < ARRAY_SIZE (isa2_opts); i++) - { - if ((isa2 & isa2_opts[i].mask) != 0) - { - opts[num++][0] = isa2_opts[i].option; - isa2 &= ~ isa2_opts[i].mask; - } - } - - if (isa2 && add_nl_p) - { - opts[num++][0] = isa2_other; - sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2); - } - - /* Pick out the options in isa options. */ - for (i = 0; i < ARRAY_SIZE (isa_opts); i++) - { - if ((isa & isa_opts[i].mask) != 0) - { - opts[num++][0] = isa_opts[i].option; - isa &= ~ isa_opts[i].mask; - } - } - - if (isa && add_nl_p) - { - opts[num++][0] = isa_other; - sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa); - } - - /* Add flag options. */ - for (i = 0; i < ARRAY_SIZE (flag_opts); i++) - { - if ((flags & flag_opts[i].mask) != 0) - { - opts[num++][0] = flag_opts[i].option; - flags &= ~ flag_opts[i].mask; - } - } - - if (flags && add_nl_p) - { - opts[num++][0] = flags_other; - sprintf (flags_other, "(other flags: %#x)", flags); - } - - /* Add additional flag options. */ - for (i = 0; i < ARRAY_SIZE (flag2_opts); i++) - { - if ((flags2 & flag2_opts[i].mask) != 0) - { - opts[num++][0] = flag2_opts[i].option; - flags2 &= ~ flag2_opts[i].mask; - } - } - - if (flags2 && add_nl_p) - { - opts[num++][0] = flags2_other; - sprintf (flags2_other, "(other flags2: %#x)", flags2); - } - - /* Add -fpmath= option. 
*/ - if (fpmath) - { - opts[num][0] = "-mfpmath="; - switch ((int) fpmath) - { - case FPMATH_387: - opts[num++][1] = "387"; - break; + return TYPE_MODE (type); + } + else if (((size == 8 && TARGET_64BIT) || size == 16) + && !TARGET_SSE + && !TARGET_IAMCU) + { + static bool warnedsse; + static bool warnedsse_ret; - case FPMATH_SSE: - opts[num++][1] = "sse"; - break; + if (cum && cum->warn_sse && !warnedsse) + { + if (warning (OPT_Wpsabi, "SSE vector argument " + "without SSE enabled changes the ABI")) + warnedsse = true; + } + else if (!TARGET_64BIT && in_return && !warnedsse_ret) + { + if (warning (OPT_Wpsabi, "SSE vector return " + "without SSE enabled changes the ABI")) + warnedsse_ret = true; + } + } + else if ((size == 8 && !TARGET_64BIT) + && (!cfun + || cfun->machine->func_type == TYPE_NORMAL) + && !TARGET_MMX + && !TARGET_IAMCU) + { + static bool warnedmmx; + static bool warnedmmx_ret; - case FPMATH_387 | FPMATH_SSE: - opts[num++][1] = "sse+387"; - break; + if (cum && cum->warn_mmx && !warnedmmx) + { + if (warning (OPT_Wpsabi, "MMX vector argument " + "without MMX enabled changes the ABI")) + warnedmmx = true; + } + else if (in_return && !warnedmmx_ret) + { + if (warning (OPT_Wpsabi, "MMX vector return " + "without MMX enabled changes the ABI")) + warnedmmx_ret = true; + } + } + return mode; + } - default: gcc_unreachable (); } } - /* Any options? */ - if (num == 0) - return NULL; - - gcc_assert (num < ARRAY_SIZE (opts)); - - /* Size the string. */ - len = 0; - sep_len = (add_nl_p) ? 3 : 1; - for (i = 0; i < num; i++) - { - len += sep_len; - for (j = 0; j < 2; j++) - if (opts[i][j]) - len += strlen (opts[i][j]); - } - - /* Build the string. */ - ret = ptr = (char *) xmalloc (len); - line_len = 0; - - for (i = 0; i < num; i++) - { - size_t len2[2]; - - for (j = 0; j < 2; j++) - len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0; - - if (i != 0) - { - *ptr++ = ' '; - line_len++; - - if (add_nl_p && line_len + len2[0] + len2[1] > 70) - { - *ptr++ = '\\'; - *ptr++ = '\n'; - line_len = 0; - } - } - - for (j = 0; j < 2; j++) - if (opts[i][j]) - { - memcpy (ptr, opts[i][j], len2[j]); - ptr += len2[j]; - line_len += len2[j]; - } - } - - *ptr = '\0'; - gcc_assert (ret + len >= ptr); - - return ret; + return mode; } -/* Return true, if profiling code should be emitted before - prologue. Otherwise it returns false. - Note: For x86 with "hotfix" it is sorried. */ -static bool -ix86_profile_before_prologue (void) -{ - return flag_fentry != 0; -} +/* We want to pass a value in REGNO whose "natural" mode is MODE. However, + this may not agree with the mode that the type system has chosen for the + register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can + go ahead and use it. Otherwise we have to build a PARALLEL instead. */ -/* Function that is callable from the debugger to print the current - options. 
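/* A minimal case that trips the ABI-change diagnostics above when the
   matching ISA is disabled, e.g. compiled on x86-64 with -mno-avx
   (type and function names are illustrative only):  */
typedef double v4df __attribute__ ((vector_size (32)));
double
first_lane (v4df v)	/* warning: AVX vector argument without AVX enabled changes the ABI */
{
  return v[0];
}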
*/ -void ATTRIBUTE_UNUSED -ix86_debug_options (void) +static rtx +gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode, + unsigned int regno) { - char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2, - target_flags, ix86_target_flags, - ix86_arch_string,ix86_tune_string, - ix86_fpmath, true, true); + rtx tmp; - if (opts) + if (orig_mode != BLKmode) + tmp = gen_rtx_REG (orig_mode, regno); + else { - fprintf (stderr, "%s\n\n", opts); - free (opts); + tmp = gen_rtx_REG (mode, regno); + tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp)); } - else - fputs ("\n\n", stderr); - return; + return tmp; } -static const char *stringop_alg_names[] = { -#define DEF_ENUM -#define DEF_ALG(alg, name) #name, -#include "stringop.def" -#undef DEF_ENUM -#undef DEF_ALG -}; +/* x86-64 register passing implementation. See x86-64 ABI for details. Goal + of this code is to classify each 8bytes of incoming argument by the register + class and assign registers accordingly. */ -/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. - The string is of the following form (or comma separated list of it): +/* Return the union class of CLASS1 and CLASS2. + See the x86-64 PS ABI for details. */ - strategy_alg:max_size:[align|noalign] +static enum x86_64_reg_class +merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) +{ + /* Rule #1: If both classes are equal, this is the resulting class. */ + if (class1 == class2) + return class1; - where the full size range for the strategy is either [0, max_size] or - [min_size, max_size], in which min_size is the max_size + 1 of the - preceding range. The last size range must have max_size == -1. + /* Rule #2: If one of the classes is NO_CLASS, the resulting class is + the other class. */ + if (class1 == X86_64_NO_CLASS) + return class2; + if (class2 == X86_64_NO_CLASS) + return class1; - Examples: + /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */ + if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS) + return X86_64_MEMORY_CLASS; - 1. - -mmemcpy-strategy=libcall:-1:noalign + /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ + if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) + || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) + return X86_64_INTEGERSI_CLASS; + if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS + || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) + return X86_64_INTEGER_CLASS; - this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, + MEMORY is used. */ + if (class1 == X86_64_X87_CLASS + || class1 == X86_64_X87UP_CLASS + || class1 == X86_64_COMPLEX_X87_CLASS + || class2 == X86_64_X87_CLASS + || class2 == X86_64_X87UP_CLASS + || class2 == X86_64_COMPLEX_X87_CLASS) + return X86_64_MEMORY_CLASS; + /* Rule #6: Otherwise class SSE is used. */ + return X86_64_SSE_CLASS; +} - 2. - -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign +/* Classify the argument of type TYPE and mode MODE. + CLASSES will be filled by the register class used to pass each word + of the operand. The number of words is returned. In case the parameter + should be passed in memory, 0 is returned. As a special case for zero + sized containers, classes[0] will be NO_CLASS and 1 is returned. 
- This is to tell the compiler to use the following strategy for memset - 1) when the expected size is between [1, 16], use rep_8byte strategy; - 2) when the size is between [17, 2048], use vector_loop; - 3) when the size is > 2048, use libcall. */ + BIT_OFFSET is used internally for handling records and specifies offset + of the offset in bits modulo 512 to avoid overflow cases. -struct stringop_size_range -{ - int max; - stringop_alg alg; - bool noalign; -}; + See the x86-64 PS ABI for details. +*/ -static void -ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +static int +classify_argument (machine_mode mode, const_tree type, + enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) { - const struct stringop_algs *default_algs; - stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; - char *curr_range_str, *next_range_str; - const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="; - int i = 0, n = 0; - - if (is_memset) - default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; - else - default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + HOST_WIDE_INT bytes + = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); + int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD); - curr_range_str = strategy_str; + /* Variable sized entities are always passed/returned in memory. */ + if (bytes < 0) + return 0; - do + if (mode != VOIDmode) { - int maxs; - char alg_name[128]; - char align[16]; - next_range_str = strchr (curr_range_str, ','); - if (next_range_str) - *next_range_str++ = '\0'; - - if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs, - align) != 3) - { - error ("wrong argument %qs to option %qs", curr_range_str, opt); - return; - } + /* The value of "named" doesn't matter. */ + function_arg_info arg (const_cast (type), mode, /*named=*/true); + if (targetm.calls.must_pass_in_stack (arg)) + return 0; + } - if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1)) - { - error ("size ranges of option %qs should be increasing", opt); - return; - } + if (type && AGGREGATE_TYPE_P (type)) + { + int i; + tree field; + enum x86_64_reg_class subclasses[MAX_CLASSES]; - for (i = 0; i < last_alg; i++) - if (!strcmp (alg_name, stringop_alg_names[i])) - break; + /* On x86-64 we pass structures larger than 64 bytes on the stack. */ + if (bytes > 64) + return 0; - if (i == last_alg) - { - error ("wrong strategy name %qs specified for option %qs", - alg_name, opt); - - auto_vec candidates; - for (i = 0; i < last_alg; i++) - if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT) - candidates.safe_push (stringop_alg_names[i]); - - char *s; - const char *hint - = candidates_list_and_hint (alg_name, s, candidates); - if (hint) - inform (input_location, - "valid arguments to %qs are: %s; did you mean %qs?", - opt, s, hint); - else - inform (input_location, "valid arguments to %qs are: %s", - opt, s); - XDELETEVEC (s); - return; - } + for (i = 0; i < words; i++) + classes[i] = X86_64_NO_CLASS; - if ((stringop_alg) i == rep_prefix_8_byte - && !TARGET_64BIT) + /* Zero sized arrays or structures are NO_CLASS. We return 0 to + signalize memory class, so handle it as special case. */ + if (!words) { - /* rep; movq isn't available in 32-bit code. 
*/ - error ("strategy name %qs specified for option %qs " - "not supported for 32-bit code", alg_name, opt); - return; + classes[0] = X86_64_NO_CLASS; + return 1; } - input_ranges[n].max = maxs; - input_ranges[n].alg = (stringop_alg) i; - if (!strcmp (align, "align")) - input_ranges[n].noalign = false; - else if (!strcmp (align, "noalign")) - input_ranges[n].noalign = true; - else - { - error ("unknown alignment %qs specified for option %qs", align, opt); - return; - } - n++; - curr_range_str = next_range_str; - } - while (curr_range_str); - - if (input_ranges[n - 1].max != -1) - { - error ("the max value for the last size range should be -1" - " for option %qs", opt); - return; - } + /* Classify each field of record and merge classes. */ + switch (TREE_CODE (type)) + { + case RECORD_TYPE: + /* And now merge the fields of structure. */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL) + { + int num; - if (n > MAX_STRINGOP_ALGS) - { - error ("too many size ranges specified in option %qs", opt); - return; - } + if (TREE_TYPE (field) == error_mark_node) + continue; - /* Now override the default algs array. */ - for (i = 0; i < n; i++) - { - *const_cast(&default_algs->size[i].max) = input_ranges[i].max; - *const_cast(&default_algs->size[i].alg) - = input_ranges[i].alg; - *const_cast(&default_algs->size[i].noalign) - = input_ranges[i].noalign; - } -} - - -/* parse -mtune-ctrl= option. When DUMP is true, - print the features that are explicitly set. */ - -static void -parse_mtune_ctrl_str (bool dump) -{ - if (!ix86_tune_ctrl_string) - return; - - char *next_feature_string = NULL; - char *curr_feature_string = xstrdup (ix86_tune_ctrl_string); - char *orig = curr_feature_string; - int i; - do - { - bool clear = false; - - next_feature_string = strchr (curr_feature_string, ','); - if (next_feature_string) - *next_feature_string++ = '\0'; - if (*curr_feature_string == '^') - { - curr_feature_string++; - clear = true; - } - for (i = 0; i < X86_TUNE_LAST; i++) - { - if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) - { - ix86_tune_features[i] = !clear; - if (dump) - fprintf (stderr, "Explicitly %s feature %s\n", - clear ? "clear" : "set", ix86_tune_feature_names[i]); - break; - } - } - if (i == X86_TUNE_LAST) - error ("unknown parameter to option %<-mtune-ctrl%>: %s", - clear ? curr_feature_string - 1 : curr_feature_string); - curr_feature_string = next_feature_string; - } - while (curr_feature_string); - free (orig); -} - -/* Helper function to set ix86_tune_features. IX86_TUNE is the - processor type. */ - -static void -set_ix86_tune_features (enum processor_type ix86_tune, bool dump) -{ - unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune; - int i; - - for (i = 0; i < X86_TUNE_LAST; ++i) - { - if (ix86_tune_no_default) - ix86_tune_features[i] = 0; - else - ix86_tune_features[i] - = !!(initial_ix86_tune_features[i] & ix86_tune_mask); - } - - if (dump) - { - fprintf (stderr, "List of x86 specific tuning parameter names:\n"); - for (i = 0; i < X86_TUNE_LAST; i++) - fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i], - ix86_tune_features[i] ? "on" : "off"); - } - - parse_mtune_ctrl_str (dump); -} - - -/* Default align_* from the processor table. */ + /* Bitfields are always classified as integer. Handle them + early, since later code would consider them to be + misaligned integers. 
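/* A worked example of the merge_classes rules and the field walk above,
   following the SysV x86-64 psABI (struct and function names are
   illustrative only): a 16-byte aggregate is split into two eightbytes
   and each one is classified on its own.  */
struct dp { double d; int i; };
	/* eightbyte 0 (d): SSE class     -> first free SSE register (%xmm0)
	   eightbyte 1 (i): INTEGER class -> first free integer register (%rdi) */
double
take_dp (struct dp p)
{
  return p.d + (double) p.i;
}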
*/ + if (DECL_BIT_FIELD (field)) + { + for (i = (int_bit_position (field) + + (bit_offset % 64)) / 8 / 8; + i < ((int_bit_position (field) + (bit_offset % 64)) + + tree_to_shwi (DECL_SIZE (field)) + + 63) / 8 / 8; i++) + classes[i] + = merge_classes (X86_64_INTEGER_CLASS, classes[i]); + } + else + { + int pos; -static void -ix86_default_align (struct gcc_options *opts) -{ - /* -falign-foo without argument: supply one. */ - if (opts->x_flag_align_loops && !opts->x_str_align_loops) - opts->x_str_align_loops = processor_cost_table[ix86_tune]->align_loop; - if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) - opts->x_str_align_jumps = processor_cost_table[ix86_tune]->align_jump; - if (opts->x_flag_align_labels && !opts->x_str_align_labels) - opts->x_str_align_labels = processor_cost_table[ix86_tune]->align_label; - if (opts->x_flag_align_functions && !opts->x_str_align_functions) - opts->x_str_align_functions = processor_cost_table[ix86_tune]->align_func; -} + type = TREE_TYPE (field); -/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */ + /* Flexible array member is ignored. */ + if (TYPE_MODE (type) == BLKmode + && TREE_CODE (type) == ARRAY_TYPE + && TYPE_SIZE (type) == NULL_TREE + && TYPE_DOMAIN (type) != NULL_TREE + && (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) + == NULL_TREE)) + { + static bool warned; -static void -ix86_override_options_after_change (void) -{ - ix86_default_align (&global_options); -} + if (!warned && warn_psabi) + { + warned = true; + inform (input_location, + "the ABI of passing struct with" + " a flexible array member has" + " changed in GCC 4.4"); + } + continue; + } + num = classify_argument (TYPE_MODE (type), type, + subclasses, + (int_bit_position (field) + + bit_offset) % 512); + if (!num) + return 0; + pos = (int_bit_position (field) + + (bit_offset % 64)) / 8 / 8; + for (i = 0; i < num && (i + pos) < words; i++) + classes[i + pos] + = merge_classes (subclasses[i], classes[i + pos]); + } + } + } + break; + case ARRAY_TYPE: + /* Arrays are handled as small records. */ + { + int num; + num = classify_argument (TYPE_MODE (TREE_TYPE (type)), + TREE_TYPE (type), subclasses, bit_offset); + if (!num) + return 0; + /* The partial classes are now full classes. */ + if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) + subclasses[0] = X86_64_SSE_CLASS; + if (subclasses[0] == X86_64_INTEGERSI_CLASS + && !((bit_offset % 64) == 0 && bytes == 4)) + subclasses[0] = X86_64_INTEGER_CLASS; -/* Override various settings based on options. If MAIN_ARGS_P, the - options are from the command line, otherwise they are from - attributes. Return true if there's an error related to march - option. */ + for (i = 0; i < words; i++) + classes[i] = subclasses[i % num]; -static bool -ix86_option_override_internal (bool main_args_p, - struct gcc_options *opts, - struct gcc_options *opts_set) -{ - int i; - unsigned HOST_WIDE_INT ix86_arch_mask; - const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL); + break; + } + case UNION_TYPE: + case QUAL_UNION_TYPE: + /* Unions are similar to RECORD_TYPE but offset is always 0. + */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL) + { + int num; - /* -mrecip options. 
*/ - static struct - { - const char *string; /* option name */ - unsigned int mask; /* mask bits to set */ - } - const recip_options[] = - { - { "all", RECIP_MASK_ALL }, - { "none", RECIP_MASK_NONE }, - { "div", RECIP_MASK_DIV }, - { "sqrt", RECIP_MASK_SQRT }, - { "vec-div", RECIP_MASK_VEC_DIV }, - { "vec-sqrt", RECIP_MASK_VEC_SQRT }, - }; + if (TREE_TYPE (field) == error_mark_node) + continue; + num = classify_argument (TYPE_MODE (TREE_TYPE (field)), + TREE_TYPE (field), subclasses, + bit_offset); + if (!num) + return 0; + for (i = 0; i < num && i < words; i++) + classes[i] = merge_classes (subclasses[i], classes[i]); + } + } + break; - /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if - TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */ - if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); -#ifdef TARGET_BI_ARCH - else - { -#if TARGET_BI_ARCH == 1 - /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64 - is on and OPTION_MASK_ABI_X32 is off. We turn off - OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by - -mx32. */ - if (TARGET_X32_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; -#else - /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is - on and OPTION_MASK_ABI_64 is off. We turn off - OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by - -m64 or OPTION_MASK_CODE16 is turned on by -m16. */ - if (TARGET_LP64_P (opts->x_ix86_isa_flags) - || TARGET_16BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; -#endif - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && TARGET_IAMCU_P (opts->x_target_flags)) - sorry ("Intel MCU psABI isn%'t supported in %s mode", - TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit"); - } -#endif + default: + gcc_unreachable (); + } - if (TARGET_X32_P (opts->x_ix86_isa_flags)) - { - /* Always turn on OPTION_MASK_ISA_64BIT and turn off - OPTION_MASK_ABI_64 for TARGET_X32. */ - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; - } - else if (TARGET_16BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT - | OPTION_MASK_ABI_X32 - | OPTION_MASK_ABI_64); - else if (TARGET_LP64_P (opts->x_ix86_isa_flags)) - { - /* Always turn on OPTION_MASK_ISA_64BIT and turn off - OPTION_MASK_ABI_X32 for TARGET_LP64. */ - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; - } + if (words > 2) + { + /* When size > 16 bytes, if the first one isn't + X86_64_SSE_CLASS or any other ones aren't + X86_64_SSEUP_CLASS, everything should be passed in + memory. */ + if (classes[0] != X86_64_SSE_CLASS) + return 0; -#ifdef SUBTARGET_OVERRIDE_OPTIONS - SUBTARGET_OVERRIDE_OPTIONS; -#endif + for (i = 1; i < words; i++) + if (classes[i] != X86_64_SSEUP_CLASS) + return 0; + } -#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS - SUBSUBTARGET_OVERRIDE_OPTIONS; -#endif + /* Final merger cleanup. */ + for (i = 0; i < words; i++) + { + /* If one class is MEMORY, everything should be passed in + memory. */ + if (classes[i] == X86_64_MEMORY_CLASS) + return 0; - /* -fPIC is the default for x86_64. */ - if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_flag_pic = 2; + /* The X86_64_SSEUP_CLASS should be always preceded by + X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. 
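The words > 2 check a few lines up is what keeps most aggregates larger than 16 bytes out of registers: only a value whose first eightbyte is SSE and whose remaining eightbytes are all SSEUP, in practice a single 256-bit or 512-bit vector, survives it. Two illustrative declarations, not taken from the patch:

/* Classified SSE, SSEUP, SSEUP, SSEUP by the E_V8SFmode case later in
   this function, so it passes the words > 2 check and, with AVX enabled,
   stays in a YMM register.  */
typedef float example_v8sf __attribute__ ((vector_size (32)));

/* Each eightbyte classifies on its own as an SSE double class, so the
   first one is not plain X86_64_SSE_CLASS, the check fails, and the
   whole struct is passed in memory.  */
struct example_in_memory
{
  double a, b, c, d;
};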
*/ + if (classes[i] == X86_64_SSEUP_CLASS + && classes[i - 1] != X86_64_SSE_CLASS + && classes[i - 1] != X86_64_SSEUP_CLASS) + { + /* The first one should never be X86_64_SSEUP_CLASS. */ + gcc_assert (i != 0); + classes[i] = X86_64_SSE_CLASS; + } - /* Need to check -mtune=generic first. */ - if (opts->x_ix86_tune_string) - { - /* As special support for cross compilers we read -mtune=native - as -mtune=generic. With native compilers we won't see the - -mtune=native, as it was changed by the driver. */ - if (!strcmp (opts->x_ix86_tune_string, "native")) - { - opts->x_ix86_tune_string = "generic"; - } - else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) - warning (OPT_Wdeprecated, - main_args_p - ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> " - "or %<-mtune=generic%> instead as appropriate") - : G_("% is deprecated; use " - "% or %" - " instead as appropriate")); - } - else - { - if (opts->x_ix86_arch_string) - opts->x_ix86_tune_string = opts->x_ix86_arch_string; - if (!opts->x_ix86_tune_string) - { - opts->x_ix86_tune_string = processor_names[TARGET_CPU_DEFAULT]; - ix86_tune_defaulted = 1; - } + /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS, + everything should be passed in memory. */ + if (classes[i] == X86_64_X87UP_CLASS + && (classes[i - 1] != X86_64_X87_CLASS)) + { + static bool warned; - /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string - or defaulted. We need to use a sensible tune option. */ - if (!strcmp (opts->x_ix86_tune_string, "x86-64")) - { - opts->x_ix86_tune_string = "generic"; + /* The first one should never be X86_64_X87UP_CLASS. */ + gcc_assert (i != 0); + if (!warned && warn_psabi) + { + warned = true; + inform (input_location, + "the ABI of passing union with %" + " has changed in GCC 4.4"); + } + return 0; + } } + return words; } - if (opts->x_ix86_stringop_alg == rep_prefix_8_byte - && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) + /* Compute alignment needed. We align all types to natural boundaries with + exception of XFmode that is aligned to 64bits. */ + if (mode != VOIDmode && mode != BLKmode) { - /* rep; movq isn't available in 32-bit code. */ - error ("%<-mstringop-strategy=rep_8byte%> not supported for 32-bit code"); - opts->x_ix86_stringop_alg = no_stringop; - } - - if (!opts->x_ix86_arch_string) - opts->x_ix86_arch_string - = TARGET_64BIT_P (opts->x_ix86_isa_flags) - ? "x86-64" : SUBTARGET32_DEFAULT_CPU; - else - ix86_arch_specified = 1; + int mode_alignment = GET_MODE_BITSIZE (mode); - if (opts_set->x_ix86_pmode) - { - if ((TARGET_LP64_P (opts->x_ix86_isa_flags) - && opts->x_ix86_pmode == PMODE_SI) - || (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - && opts->x_ix86_pmode == PMODE_DI)) - error ("address mode %qs not supported in the %s bit mode", - TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long", - TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32"); + if (mode == XFmode) + mode_alignment = 128; + else if (mode == XCmode) + mode_alignment = 256; + if (COMPLEX_MODE_P (mode)) + mode_alignment /= 2; + /* Misaligned fields are always returned in memory. */ + if (bit_offset % mode_alignment) + return 0; } - else - opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags) - ? 
PMODE_DI : PMODE_SI; - - if (!opts_set->x_ix86_abi) - opts->x_ix86_abi = DEFAULT_ABI; - - if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("%<-mabi=ms%> not supported with X32 ABI"); - gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI); - - const char *abi_name = opts->x_ix86_abi == MS_ABI ? "ms" : "sysv"; - if ((opts->x_flag_sanitize & SANITIZE_USER_ADDRESS) - && opts->x_ix86_abi != DEFAULT_ABI) - error ("%<-mabi=%s%> not supported with %<-fsanitize=address%>", abi_name); - if ((opts->x_flag_sanitize & SANITIZE_KERNEL_ADDRESS) - && opts->x_ix86_abi != DEFAULT_ABI) - error ("%<-mabi=%s%> not supported with %<-fsanitize=kernel-address%>", - abi_name); - if ((opts->x_flag_sanitize & SANITIZE_THREAD) - && opts->x_ix86_abi != DEFAULT_ABI) - error ("%<-mabi=%s%> not supported with %<-fsanitize=thread%>", abi_name); - - /* For targets using ms ABI enable ms-extensions, if not - explicit turned off. For non-ms ABI we turn off this - option. */ - if (!opts_set->x_flag_ms_extensions) - opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI); - - if (opts_set->x_ix86_cmodel) - { - switch (opts->x_ix86_cmodel) - { - case CM_SMALL: - case CM_SMALL_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_SMALL_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "small", "32"); - break; - case CM_MEDIUM: - case CM_MEDIUM_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_MEDIUM_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "medium", "32"); - else if (TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in x32 mode", - "medium"); - break; + /* for V1xx modes, just use the base mode */ + if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode + && GET_MODE_UNIT_SIZE (mode) == bytes) + mode = GET_MODE_INNER (mode); - case CM_LARGE: - case CM_LARGE_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_LARGE_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "large", "32"); - else if (TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in x32 mode", - "large"); - break; + /* Classification of atomic types. */ + switch (mode) + { + case E_SDmode: + case E_DDmode: + classes[0] = X86_64_SSE_CLASS; + return 1; + case E_TDmode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + return 2; + case E_DImode: + case E_SImode: + case E_HImode: + case E_QImode: + case E_CSImode: + case E_CHImode: + case E_CQImode: + { + int size = bit_offset + (int) GET_MODE_BITSIZE (mode); - case CM_32: - if (opts->x_flag_pic) - error ("code model %s does not support PIC mode", "32"); - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "32", "64"); - break; + /* Analyze last 128 bits only. */ + size = (size - 1) & 0x7f; - case CM_KERNEL: - if (opts->x_flag_pic) - { - error ("code model %s does not support PIC mode", "kernel"); - opts->x_ix86_cmodel = CM_32; - } - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "kernel", "32"); - break; - - default: - gcc_unreachable (); - } - } - else - { - /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the - use of rip-relative addressing. 
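The bit_offset % mode_alignment test in classify_argument above is the rule that sends misaligned members to memory. A small case that trips it, not taken from the patch:

/* With the packed attribute, 'd' starts at bit offset 32.  DFmode needs
   64-bit alignment, 32 % 64 != 0, so classify_argument returns 0 and the
   whole struct is passed and returned on the stack.  */
struct __attribute__ ((packed)) example_packed
{
  int i;       /* occupies bits [0, 32)  */
  double d;    /* occupies bits [32, 96) */
};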
This eliminates fixups that - would otherwise be needed if this object is to be placed in a - DLL, and is essentially just as efficient as direct addressing. */ - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && (TARGET_RDOS || TARGET_PECOFF)) - opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1; - else if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL; - else - opts->x_ix86_cmodel = CM_32; - } - if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL) - { - error ("%<-masm=intel%> not supported in this configuration"); - opts->x_ix86_asm_dialect = ASM_ATT; - } - if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0) - != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) - sorry ("%i-bit mode not compiled in", - (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); - - for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) - { - if (!strcmp (opts->x_ix86_arch_string, "generic")) + if (size < 32) { - error (main_args_p - ? G_("% CPU can be used only for %<-mtune=%> " - "switch") - : G_("% CPU can be used only for " - "% attribute")); - return false; + classes[0] = X86_64_INTEGERSI_CLASS; + return 1; } - else if (!strcmp (opts->x_ix86_arch_string, "intel")) + else if (size < 64) { - error (main_args_p - ? G_("% CPU can be used only for %<-mtune=%> " - "switch") - : G_("% CPU can be used only for " - "% attribute")); - return false; + classes[0] = X86_64_INTEGER_CLASS; + return 1; } - - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && !((processor_alias_table[i].flags & PTA_64BIT) != 0)) + else if (size < 64+32) { - error ("CPU you selected does not support x86-64 " - "instruction set"); - return false; + classes[0] = X86_64_INTEGER_CLASS; + classes[1] = X86_64_INTEGERSI_CLASS; + return 2; } - - ix86_schedule = processor_alias_table[i].schedule; - ix86_arch = processor_alias_table[i].processor; - /* Default cpu tuning to the architecture. 
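For the size-based integer cases interleaved above (the size < 32, size < 64 and size < 64+32 branches), a few plain declarations show where the two integer classes come from. The INTEGERSI variant records that only the low 32 bits of the eightbyte are used, which construct_container later exploits by picking SImode instead of DImode for that register. Sketch, not from the patch:

int      ex_si;   /* 32 bits at offset 0 -> X86_64_INTEGERSI_CLASS, one GPR */
long     ex_di;   /* 64 bits (LP64)      -> X86_64_INTEGER_CLASS,   one GPR */
__int128 ex_ti;   /* 128 bits, handled by the E_TImode case further down:
                     INTEGER + INTEGER, two GPRs                            */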
*/ - ix86_tune = ix86_arch; - - if (((processor_alias_table[i].flags & PTA_MMX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; - if (((processor_alias_table[i].flags & PTA_3DNOW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; - if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; - if (((processor_alias_table[i].flags & PTA_SSE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; - if (((processor_alias_table[i].flags & PTA_SSE2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; - if (((processor_alias_table[i].flags & PTA_SSE3) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; - if (((processor_alias_table[i].flags & PTA_SSSE3) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; - if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; - if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; - if (((processor_alias_table[i].flags & PTA_AVX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; - if (((processor_alias_table[i].flags & PTA_AVX2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; - if (((processor_alias_table[i].flags & PTA_FMA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; - if (((processor_alias_table[i].flags & PTA_SSE4A) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; - if (((processor_alias_table[i].flags & PTA_FMA4) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; - if (((processor_alias_table[i].flags & PTA_XOP) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; - if (((processor_alias_table[i].flags & PTA_LWP) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; - if (((processor_alias_table[i].flags & PTA_ABM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; - if (((processor_alias_table[i].flags & PTA_BMI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; - if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; - if (((processor_alias_table[i].flags & PTA_TBM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; - if (((processor_alias_table[i].flags & PTA_BMI2) != 0) - && 
!(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; - if (((processor_alias_table[i].flags & PTA_CX16) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16; - if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; - if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) - && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; - if (((processor_alias_table[i].flags & PTA_MOVBE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE; - if (((processor_alias_table[i].flags & PTA_AES) != 0) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) - ix86_isa_flags |= OPTION_MASK_ISA_AES; - if (((processor_alias_table[i].flags & PTA_SHA) != 0) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) - ix86_isa_flags |= OPTION_MASK_ISA_SHA; - if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; - if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; - if (((processor_alias_table[i].flags & PTA_RDRND) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; - if (((processor_alias_table[i].flags & PTA_F16C) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; - if (((processor_alias_table[i].flags & PTA_RTM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; - if (((processor_alias_table[i].flags & PTA_HLE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE; - if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; - if (((processor_alias_table[i].flags & PTA_RDSEED) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; - if (((processor_alias_table[i].flags & PTA_ADX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; - if (((processor_alias_table[i].flags & PTA_FXSR) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; - if (((processor_alias_table[i].flags & PTA_XSAVE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; - if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; - if (((processor_alias_table[i].flags & PTA_AVX512F) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; - if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0) - && !(opts->x_ix86_isa_flags_explicit & 
OPTION_MASK_ISA_AVX512ER)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; - if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; - if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; - if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1; - if (((processor_alias_table[i].flags & PTA_CLWB) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB; - if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT; - if (((processor_alias_table[i].flags & PTA_CLZERO) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO; - if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC; - if (((processor_alias_table[i].flags & PTA_XSAVES) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES; - if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ; - if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW; - if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL; - if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI; - if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA; - if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI; - if (((processor_alias_table[i].flags & PTA_GFNI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI; - if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512VBMI2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2; - if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ; - if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512BITALG)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG; - - if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA_AVX5124VNNIW)) - 
opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW; - if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA_AVX5124FMAPS)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS; - if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; - if (((processor_alias_table[i].flags & PTA_SGX) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX; - if (((processor_alias_table[i].flags & PTA_VAES) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES; - if (((processor_alias_table[i].flags & PTA_RDPID) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID; - if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG; - if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD; - if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PTWRITE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PTWRITE; - - if ((processor_alias_table[i].flags - & (PTA_PREFETCH_SSE | PTA_SSE)) != 0) - x86_prefetch_sse = true; - if (((processor_alias_table[i].flags & PTA_MWAITX) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX; - if (((processor_alias_table[i].flags & PTA_PKU) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; - - /* Don't enable x87 instructions if only - general registers are allowed. */ - if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) - && !(opts_set->x_target_flags & MASK_80387)) + else if (size < 64+64) { - if (((processor_alias_table[i].flags & PTA_NO_80387) != 0)) - opts->x_target_flags &= ~MASK_80387; - else - opts->x_target_flags |= MASK_80387; + classes[0] = classes[1] = X86_64_INTEGER_CLASS; + return 2; } - break; + else + gcc_unreachable (); } + case E_CDImode: + case E_TImode: + classes[0] = classes[1] = X86_64_INTEGER_CLASS; + return 2; + case E_COImode: + case E_OImode: + /* OImode shouldn't be used directly. */ + gcc_unreachable (); + case E_CTImode: + return 0; + case E_SFmode: + if (!(bit_offset % 64)) + classes[0] = X86_64_SSESF_CLASS; + else + classes[0] = X86_64_SSE_CLASS; + return 1; + case E_DFmode: + classes[0] = X86_64_SSEDF_CLASS; + return 1; + case E_XFmode: + classes[0] = X86_64_X87_CLASS; + classes[1] = X86_64_X87UP_CLASS; + return 2; + case E_TFmode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + return 2; + case E_SCmode: + classes[0] = X86_64_SSE_CLASS; + if (!(bit_offset % 64)) + return 1; + else + { + static bool warned; - if (i == pta_size) - { - error (main_args_p - ? 
G_("bad value (%qs) for %<-march=%> switch") - : G_("bad value (%qs) for % attribute"), - opts->x_ix86_arch_string); + if (!warned && warn_psabi) + { + warned = true; + inform (input_location, + "the ABI of passing structure with %" + " member has changed in GCC 4.4"); + } + classes[1] = X86_64_SSESF_CLASS; + return 2; + } + case E_DCmode: + classes[0] = X86_64_SSEDF_CLASS; + classes[1] = X86_64_SSEDF_CLASS; + return 2; + case E_XCmode: + classes[0] = X86_64_COMPLEX_X87_CLASS; + return 1; + case E_TCmode: + /* This modes is larger than 16 bytes. */ + return 0; + case E_V8SFmode: + case E_V8SImode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + classes[2] = X86_64_SSEUP_CLASS; + classes[3] = X86_64_SSEUP_CLASS; + return 4; + case E_V8DFmode: + case E_V16SFmode: + case E_V8DImode: + case E_V16SImode: + case E_V32HImode: + case E_V64QImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + classes[2] = X86_64_SSEUP_CLASS; + classes[3] = X86_64_SSEUP_CLASS; + classes[4] = X86_64_SSEUP_CLASS; + classes[5] = X86_64_SSEUP_CLASS; + classes[6] = X86_64_SSEUP_CLASS; + classes[7] = X86_64_SSEUP_CLASS; + return 8; + case E_V4SFmode: + case E_V4SImode: + case E_V16QImode: + case E_V8HImode: + case E_V2DFmode: + case E_V2DImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + return 2; + case E_V1TImode: + case E_V1DImode: + case E_V2SFmode: + case E_V2SImode: + case E_V4HImode: + case E_V8QImode: + classes[0] = X86_64_SSE_CLASS; + return 1; + case E_BLKmode: + case E_VOIDmode: + return 0; + default: + gcc_assert (VECTOR_MODE_P (mode)); - auto_vec candidates; - for (i = 0; i < pta_size; i++) - if (strcmp (processor_alias_table[i].name, "generic") - && strcmp (processor_alias_table[i].name, "intel") - && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - || ((processor_alias_table[i].flags & PTA_64BIT) != 0))) - candidates.safe_push (processor_alias_table[i].name); + if (bytes > 16) + return 0; -#ifdef HAVE_LOCAL_CPU_DETECT - /* Add also "native" as possible value. */ - candidates.safe_push ("native"); -#endif + gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT); - char *s; - const char *hint - = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates); - if (hint) - inform (input_location, - main_args_p - ? G_("valid arguments to %<-march=%> switch are: " - "%s; did you mean %qs?") - : G_("valid arguments to % attribute are: " - "%s; did you mean %qs?"), s, hint); + if (bit_offset + GET_MODE_BITSIZE (mode) <= 32) + classes[0] = X86_64_INTEGERSI_CLASS; else - inform (input_location, - main_args_p - ? G_("valid arguments to %<-march=%> switch are: %s") - : G_("valid arguments to % attribute " - "are: %s"), s); - XDELETEVEC (s); + classes[0] = X86_64_INTEGER_CLASS; + classes[1] = X86_64_INTEGER_CLASS; + return 1 + (bytes > 8); } +} + +/* Examine the argument and return set number of register required in each + class. Return true iff parameter should be passed in memory. */ + +static bool +examine_argument (machine_mode mode, const_tree type, int in_return, + int *int_nregs, int *sse_nregs) +{ + enum x86_64_reg_class regclass[MAX_CLASSES]; + int n = classify_argument (mode, type, regclass, 0); - ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; - for (i = 0; i < X86_ARCH_LAST; ++i) - ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + *int_nregs = 0; + *sse_nregs = 0; - for (i = 0; i < pta_size; i++) - if (! 
strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) + if (!n) + return true; + for (n--; n >= 0; n--) + switch (regclass[n]) { - ix86_schedule = processor_alias_table[i].schedule; - ix86_tune = processor_alias_table[i].processor; - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - if (!((processor_alias_table[i].flags & PTA_64BIT) != 0)) - { - if (ix86_tune_defaulted) - { - opts->x_ix86_tune_string = "x86-64"; - for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_tune_string, - processor_alias_table[i].name)) - break; - ix86_schedule = processor_alias_table[i].schedule; - ix86_tune = processor_alias_table[i].processor; - } - else - error ("CPU you selected does not support x86-64 " - "instruction set"); - } - } - /* Intel CPUs have always interpreted SSE prefetch instructions as - NOPs; so, we can enable SSE prefetch instructions even when - -mtune (rather than -march) points us to a processor that has them. - However, the VIA C3 gives a SIGILL, so we only do that for i686 and - higher processors. */ - if (TARGET_CMOV - && ((processor_alias_table[i].flags - & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)) - x86_prefetch_sse = true; + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + (*int_nregs)++; break; + case X86_64_SSE_CLASS: + case X86_64_SSESF_CLASS: + case X86_64_SSEDF_CLASS: + (*sse_nregs)++; + break; + case X86_64_NO_CLASS: + case X86_64_SSEUP_CLASS: + break; + case X86_64_X87_CLASS: + case X86_64_X87UP_CLASS: + case X86_64_COMPLEX_X87_CLASS: + if (!in_return) + return true; + break; + case X86_64_MEMORY_CLASS: + gcc_unreachable (); } - if (ix86_tune_specified && i == pta_size) - { - error (main_args_p - ? G_("bad value (%qs) for %<-mtune=%> switch") - : G_("bad value (%qs) for % attribute"), - opts->x_ix86_tune_string); - - auto_vec candidates; - for (i = 0; i < pta_size; i++) - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) - candidates.safe_push (processor_alias_table[i].name); - -#ifdef HAVE_LOCAL_CPU_DETECT - /* Add also "native" as possible value. */ - candidates.safe_push ("native"); -#endif + return false; +} - char *s; - const char *hint - = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates); - if (hint) - inform (input_location, - main_args_p - ? G_("valid arguments to %<-mtune=%> switch are: " - "%s; did you mean %qs?") - : G_("valid arguments to % attribute are: " - "%s; did you mean %qs?"), s, hint); - else - inform (input_location, - main_args_p - ? G_("valid arguments to %<-mtune=%> switch are: %s") - : G_("valid arguments to % attribute " - "are: %s"), s); - XDELETEVEC (s); - } +/* Construct container for the argument used by GCC interface. See + FUNCTION_ARG for the detailed description. */ - set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes); +static rtx +construct_container (machine_mode mode, machine_mode orig_mode, + const_tree type, int in_return, int nintregs, int nsseregs, + const int *intreg, int sse_regno) +{ + /* The following variables hold the static issued_error state. */ + static bool issued_sse_arg_error; + static bool issued_sse_ret_error; + static bool issued_x87_ret_error; -#ifndef USE_IX86_FRAME_POINTER -#define USE_IX86_FRAME_POINTER 0 -#endif + machine_mode tmpmode; + int bytes + = mode == BLKmode ? 
int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); + enum x86_64_reg_class regclass[MAX_CLASSES]; + int n; + int i; + int nexps = 0; + int needed_sseregs, needed_intregs; + rtx exp[MAX_CLASSES]; + rtx ret; -#ifndef USE_X86_64_FRAME_POINTER -#define USE_X86_64_FRAME_POINTER 0 -#endif + n = classify_argument (mode, type, regclass, 0); + if (!n) + return NULL; + if (examine_argument (mode, type, in_return, &needed_intregs, + &needed_sseregs)) + return NULL; + if (needed_intregs > nintregs || needed_sseregs > nsseregs) + return NULL; - /* Set the default values for switches whose default depends on TARGET_64BIT - in case they weren't overwritten by command line options. */ - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) - opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; - if (opts->x_flag_asynchronous_unwind_tables - && !opts_set->x_flag_unwind_tables - && TARGET_64BIT_MS_ABI) - opts->x_flag_unwind_tables = 1; - if (opts->x_flag_asynchronous_unwind_tables == 2) - opts->x_flag_unwind_tables - = opts->x_flag_asynchronous_unwind_tables = 1; - if (opts->x_flag_pcc_struct_return == 2) - opts->x_flag_pcc_struct_return = 0; - } - else + /* We allowed the user to turn off SSE for kernel mode. Don't crash if + some less clueful developer tries to use floating-point anyway. */ + if (needed_sseregs && !TARGET_SSE) { - if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) - opts->x_flag_omit_frame_pointer - = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size); - if (opts->x_flag_asynchronous_unwind_tables == 2) - opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; - if (opts->x_flag_pcc_struct_return == 2) - { - /* Intel MCU psABI specifies that -freg-struct-return should - be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1, - we check -miamcu so that -freg-struct-return is always - turned on if -miamcu is used. */ - if (TARGET_IAMCU_P (opts->x_target_flags)) - opts->x_flag_pcc_struct_return = 0; - else - opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; + if (in_return) + { + if (!issued_sse_ret_error) + { + error ("SSE register return with SSE disabled"); + issued_sse_ret_error = true; + } } - } - - ix86_tune_cost = processor_cost_table[ix86_tune]; - /* TODO: ix86_cost should be chosen at instruction or function granuality - so for cold code we use size_cost even in !optimize_size compilation. */ - if (opts->x_optimize_size) - ix86_cost = &ix86_size_cost; - else - ix86_cost = ix86_tune_cost; - - /* Arrange to set up i386_stack_locals for all functions. */ - init_machine_status = ix86_init_machine_status; - - /* Validate -mregparm= value. */ - if (opts_set->x_ix86_regparm) - { - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - warning (0, "%<-mregparm%> is ignored in 64-bit mode"); - else if (TARGET_IAMCU_P (opts->x_target_flags)) - warning (0, "%<-mregparm%> is ignored for Intel MCU psABI"); - if (opts->x_ix86_regparm > REGPARM_MAX) + else if (!issued_sse_arg_error) { - error ("%<-mregparm=%d%> is not between 0 and %d", - opts->x_ix86_regparm, REGPARM_MAX); - opts->x_ix86_regparm = 0; + error ("SSE register argument with SSE disabled"); + issued_sse_arg_error = true; } + return NULL; } - if (TARGET_IAMCU_P (opts->x_target_flags) - || TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_regparm = REGPARM_MAX; - - /* Default align_* from the processor table. */ - ix86_default_align (opts); - - /* Provide default for -mbranch-cost= value. 
*/ - if (!opts_set->x_ix86_branch_cost) - opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost; - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - opts->x_target_flags - |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags; - - if (!ix86_arch_specified) - opts->x_ix86_isa_flags - |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; - - if (TARGET_RTD_P (opts->x_target_flags)) - warning (0, - main_args_p - ? G_("%<-mrtd%> is ignored in 64bit mode") - : G_("% is ignored in 64bit mode")); - } - else - { - opts->x_target_flags - |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags; - - if (!ix86_arch_specified) - opts->x_ix86_isa_flags - |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; - - /* i386 ABI does not specify red zone. It still makes sense to use it - when programmer takes care to stack from being destroyed. */ - if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE)) - opts->x_target_flags |= MASK_NO_RED_ZONE; - } - - /* Keep nonleaf frame pointers. */ - if (opts->x_flag_omit_frame_pointer) - opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; - else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags)) - opts->x_flag_omit_frame_pointer = 1; - - /* If we're doing fast math, we don't care about comparison order - wrt NaNs. This lets us use a shorter comparison sequence. */ - if (opts->x_flag_finite_math_only) - opts->x_target_flags &= ~MASK_IEEE_FP; - - /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, - since the insns won't need emulation. */ - if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) - opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387; - - /* Likewise, if the target doesn't have a 387, or we've specified - software floating point, don't use 387 inline intrinsics. */ - if (!TARGET_80387_P (opts->x_target_flags)) - opts->x_target_flags |= MASK_NO_FANCY_MATH_387; - - /* Turn on MMX builtins for -msse. */ - if (TARGET_SSE_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit; - - /* Enable SSE prefetch. */ - if (TARGET_SSE_P (opts->x_ix86_isa_flags) - || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags) - && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)) - || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags)) - x86_prefetch_sse = true; - - /* Enable popcnt instruction for -msse4.2 or -mabm. */ - if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) - || TARGET_ABM_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; - - /* Enable lzcnt instruction for -mabm. */ - if (TARGET_ABM_P(opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit; - - /* Disable BMI, BMI2 and TBM instructions for -m16. */ - if (TARGET_16BIT_P(opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM) - & ~opts->x_ix86_isa_flags_explicit); - - /* Validate -mpreferred-stack-boundary= value or default it to - PREFERRED_STACK_BOUNDARY_DEFAULT. */ - ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; - if (opts_set->x_ix86_preferred_stack_boundary_arg) - { - int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2; - int max = TARGET_SEH ? 
4 : 12; - - if (opts->x_ix86_preferred_stack_boundary_arg < min - || opts->x_ix86_preferred_stack_boundary_arg > max) - { - if (min == max) - error ("%<-mpreferred-stack-boundary%> is not supported " - "for this target"); - else - error ("%<-mpreferred-stack-boundary=%d%> is not between %d and %d", - opts->x_ix86_preferred_stack_boundary_arg, min, max); - } - else - ix86_preferred_stack_boundary - = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; - } - - /* Set the default value for -mstackrealign. */ - if (!opts_set->x_ix86_force_align_arg_pointer) - opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; - - ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; - - /* Validate -mincoming-stack-boundary= value or default it to - MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ - ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; - if (opts_set->x_ix86_incoming_stack_boundary_arg) - { - int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2; - - if (opts->x_ix86_incoming_stack_boundary_arg < min - || opts->x_ix86_incoming_stack_boundary_arg > 12) - error ("%<-mincoming-stack-boundary=%d%> is not between %d and 12", - opts->x_ix86_incoming_stack_boundary_arg, min); - else - { - ix86_user_incoming_stack_boundary - = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT; - ix86_incoming_stack_boundary - = ix86_user_incoming_stack_boundary; - } - } - -#ifndef NO_PROFILE_COUNTERS - if (flag_nop_mcount) - error ("%<-mnop-mcount%> is not compatible with this target"); -#endif - if (flag_nop_mcount && flag_pic) - error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>"); - - /* Accept -msseregparm only if at least SSE support is enabled. */ - if (TARGET_SSEREGPARM_P (opts->x_target_flags) - && ! TARGET_SSE_P (opts->x_ix86_isa_flags)) - error (main_args_p - ? G_("%<-msseregparm%> used without SSE enabled") - : G_("% used without SSE enabled")); - - if (opts_set->x_ix86_fpmath) - { - if (opts->x_ix86_fpmath & FPMATH_SSE) + /* Likewise, error if the ABI requires us to return values in the + x87 registers and the user specified -mno-80387. */ + if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return) + for (i = 0; i < n; i++) + if (regclass[i] == X86_64_X87_CLASS + || regclass[i] == X86_64_X87UP_CLASS + || regclass[i] == X86_64_COMPLEX_X87_CLASS) { - if (!TARGET_SSE_P (opts->x_ix86_isa_flags)) - { - if (TARGET_80387_P (opts->x_target_flags)) - { - warning (0, "SSE instruction set disabled, using 387 arithmetics"); - opts->x_ix86_fpmath = FPMATH_387; - } - } - else if ((opts->x_ix86_fpmath & FPMATH_387) - && !TARGET_80387_P (opts->x_target_flags)) + if (!issued_x87_ret_error) { - warning (0, "387 instruction set disabled, using SSE arithmetics"); - opts->x_ix86_fpmath = FPMATH_SSE; + error ("x87 register return with x87 disabled"); + issued_x87_ret_error = true; } + return NULL; } - } - /* For all chips supporting SSE2, -mfpmath=sse performs better than - fpmath=387. The second is however default at many targets since the - extra 80bit precision of temporaries is considered to be part of ABI. - Overwrite the default at least for -ffast-math. - TODO: -mfpmath=both seems to produce same performing code with bit - smaller binaries. It is however not clear if register allocation is - ready for this setting. - Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE - codegen. We may switch to 387 with -ffast-math for size optimized - functions. 
*/ - else if (fast_math_flags_set_p (&global_options) - && TARGET_SSE2_P (opts->x_ix86_isa_flags)) - opts->x_ix86_fpmath = FPMATH_SSE; - else - opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags); - /* Use external vectorized library in vectorizing intrinsics. */ - if (opts_set->x_ix86_veclibabi_type) - switch (opts->x_ix86_veclibabi_type) + /* First construct simple cases. Avoid SCmode, since we want to use + single register to pass this type. */ + if (n == 1 && mode != SCmode) + switch (regclass[0]) { - case ix86_veclibabi_type_svml: - ix86_veclib_handler = ix86_veclibabi_svml; - break; - - case ix86_veclibabi_type_acml: - ix86_veclib_handler = ix86_veclibabi_acml; + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + return gen_rtx_REG (mode, intreg[0]); + case X86_64_SSE_CLASS: + case X86_64_SSESF_CLASS: + case X86_64_SSEDF_CLASS: + if (mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + GET_SSE_REGNO (sse_regno)); break; - + case X86_64_X87_CLASS: + case X86_64_COMPLEX_X87_CLASS: + return gen_rtx_REG (mode, FIRST_STACK_REG); + case X86_64_NO_CLASS: + /* Zero sized array, struct or class. */ + return NULL; default: gcc_unreachable (); } + if (n == 2 + && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS + && mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + GET_SSE_REGNO (sse_regno)); + if (n == 4 + && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + GET_SSE_REGNO (sse_regno)); + if (n == 8 + && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && regclass[4] == X86_64_SSEUP_CLASS + && regclass[5] == X86_64_SSEUP_CLASS + && regclass[6] == X86_64_SSEUP_CLASS + && regclass[7] == X86_64_SSEUP_CLASS + && mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + GET_SSE_REGNO (sse_regno)); + if (n == 2 + && regclass[0] == X86_64_X87_CLASS + && regclass[1] == X86_64_X87UP_CLASS) + return gen_rtx_REG (XFmode, FIRST_STACK_REG); - if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] - && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - - /* If stack probes are required, the space used for large function - arguments on the stack must also be probed, so enable - -maccumulate-outgoing-args so this happens in the prologue. */ - if (TARGET_STACK_PROBE_P (opts->x_target_flags) - && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - { - if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) - warning (0, - main_args_p - ? G_("stack probing requires %<-maccumulate-outgoing-args%> " - "for correctness") - : G_("stack probing requires " - "% for " - "correctness")); - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - } - - /* Stack realignment without -maccumulate-outgoing-args requires %ebp, - so enable -maccumulate-outgoing-args when %ebp is fixed. */ - if (fixed_regs[BP_REG] - && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - { - if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) - warning (0, - main_args_p - ? 
G_("fixed ebp register requires " - "%<-maccumulate-outgoing-args%>") - : G_("fixed ebp register requires " - "%")); - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - } - - /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ - { - char *p; - ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0); - p = strchr (internal_label_prefix, 'X'); - internal_label_prefix_len = p - internal_label_prefix; - *p = '\0'; - } - - /* When scheduling description is not available, disable scheduler pass - so it won't slow down the compilation and make x87 code slower. */ - if (!TARGET_SCHEDULE) - opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0; - - maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, - ix86_tune_cost->simultaneous_prefetches, - opts->x_param_values, - opts_set->x_param_values); - maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, - ix86_tune_cost->prefetch_block, - opts->x_param_values, - opts_set->x_param_values); - maybe_set_param_value (PARAM_L1_CACHE_SIZE, - ix86_tune_cost->l1_cache_size, - opts->x_param_values, - opts_set->x_param_values); - maybe_set_param_value (PARAM_L2_CACHE_SIZE, - ix86_tune_cost->l2_cache_size, - opts->x_param_values, - opts_set->x_param_values); - - /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ - if (opts->x_flag_prefetch_loop_arrays < 0 - && HAVE_prefetch - && (opts->x_optimize >= 3 || opts->x_flag_profile_use) - && !opts->x_optimize_size - && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL) - opts->x_flag_prefetch_loop_arrays = 1; - - /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) - can be opts->x_optimized to ap = __builtin_next_arg (0). */ - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack) - targetm.expand_builtin_va_start = NULL; - - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - ix86_gen_leave = gen_leave_rex64; - if (Pmode == DImode) - { - ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di; - ix86_gen_tls_local_dynamic_base_64 - = gen_tls_local_dynamic_base_64_di; - } - else + if (n == 2 + && regclass[0] == X86_64_INTEGER_CLASS + && regclass[1] == X86_64_INTEGER_CLASS + && (mode == CDImode || mode == TImode || mode == BLKmode) + && intreg[0] + 1 == intreg[1]) + { + if (mode == BLKmode) { - ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si; - ix86_gen_tls_local_dynamic_base_64 - = gen_tls_local_dynamic_base_64_si; + /* Use TImode for BLKmode values in 2 integer registers. 
*/ + exp[0] = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (TImode, intreg[0]), + GEN_INT (0)); + ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1)); + XVECEXP (ret, 0, 0) = exp[0]; + return ret; } + else + return gen_rtx_REG (mode, intreg[0]); } - else - ix86_gen_leave = gen_leave; - - if (Pmode == DImode) - { - ix86_gen_add3 = gen_adddi3; - ix86_gen_sub3 = gen_subdi3; - ix86_gen_sub3_carry = gen_subdi3_carry; - ix86_gen_one_cmpl2 = gen_one_cmpldi2; - ix86_gen_andsp = gen_anddi3; - ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di; - ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi; - ix86_gen_probe_stack_range = gen_probe_stack_rangedi; - ix86_gen_monitor = gen_sse3_monitor_di; - ix86_gen_monitorx = gen_monitorx_di; - ix86_gen_clzero = gen_clzero_di; - } - else - { - ix86_gen_add3 = gen_addsi3; - ix86_gen_sub3 = gen_subsi3; - ix86_gen_sub3_carry = gen_subsi3_carry; - ix86_gen_one_cmpl2 = gen_one_cmplsi2; - ix86_gen_andsp = gen_andsi3; - ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si; - ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi; - ix86_gen_probe_stack_range = gen_probe_stack_rangesi; - ix86_gen_monitor = gen_sse3_monitor_si; - ix86_gen_monitorx = gen_monitorx_si; - ix86_gen_clzero = gen_clzero_si; - } - -#ifdef USE_IX86_CLD - /* Use -mcld by default for 32-bit code if configured with --enable-cld. */ - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags; -#endif - /* Set the default value for -mfentry. */ - if (!opts_set->x_flag_fentry) - opts->x_flag_fentry = TARGET_SEH; - else + /* Otherwise figure out the entries of the PARALLEL. */ + for (i = 0; i < n; i++) { - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic - && opts->x_flag_fentry) - sorry ("%<-mfentry%> isn%'t supported for 32-bit in combination " - "with %<-fpic%>"); - else if (TARGET_SEH && !opts->x_flag_fentry) - sorry ("%<-mno-fentry%> isn%'t compatible with SEH"); - } - - if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES) - sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); - - if (!(opts_set->x_target_flags & MASK_VZEROUPPER) - && TARGET_EMIT_VZEROUPPER) - opts->x_target_flags |= MASK_VZEROUPPER; - if (!(opts_set->x_target_flags & MASK_STV)) - opts->x_target_flags |= MASK_STV; - /* Disable STV if -mpreferred-stack-boundary={2,3} or - -mincoming-stack-boundary={2,3} or -mstackrealign - the needed - stack realignment will be extra cost the pass doesn't take into - account and the pass can't realign the stack. */ - if (ix86_preferred_stack_boundary < 128 - || ix86_incoming_stack_boundary < 128 - || opts->x_ix86_force_align_arg_pointer) - opts->x_target_flags &= ~MASK_STV; - if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] - && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) - opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; - if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] - && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) - opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; - - /* Enable 128-bit AVX instruction generation - for the auto-vectorizer. */ - if (TARGET_AVX128_OPTIMAL - && (opts_set->x_prefer_vector_width_type == PVW_NONE)) - opts->x_prefer_vector_width_type = PVW_AVX128; - - /* Use 256-bit AVX instruction generation - in the auto-vectorizer. 
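As a reading aid for the loop above: every non-trivial case ends up as one EXPR_LIST per eightbyte, pairing the hard register that carries it with the byte offset it occupies inside the argument, and the lists are then wrapped in a single PARALLEL further down. The shape sketched below is illustrative only; the concrete registers depend on how many integer and SSE argument registers are already in use.

/* Sketch of the container built for an argument such as
     struct ex { long l; double d; };     (classified INTEGER, then SSEDF)
   assuming it is the first argument, so the first integer and first SSE
   argument registers are still free:

     (parallel [(expr_list (reg:DI di)   (const_int 0))
                (expr_list (reg:DF xmm0) (const_int 8))])
*/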
*/ - if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL] - && (opts_set->x_prefer_vector_width_type == PVW_NONE)) - opts->x_prefer_vector_width_type = PVW_AVX256; - - if (opts->x_ix86_recip_name) - { - char *p = ASTRDUP (opts->x_ix86_recip_name); - char *q; - unsigned int mask, i; - bool invert; - - while ((q = strtok (p, ",")) != NULL) - { - p = NULL; - if (*q == '!') - { - invert = true; - q++; - } - else - invert = false; + int pos; - if (!strcmp (q, "default")) - mask = RECIP_MASK_ALL; - else - { - for (i = 0; i < ARRAY_SIZE (recip_options); i++) - if (!strcmp (q, recip_options[i].string)) + switch (regclass[i]) + { + case X86_64_NO_CLASS: + break; + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + /* Merge TImodes on aligned occasions here too. */ + if (i * 8 + 8 > bytes) + { + unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT; + if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode)) + /* We've requested 24 bytes we + don't have mode for. Use DImode. */ + tmpmode = DImode; + } + else if (regclass[i] == X86_64_INTEGERSI_CLASS) + tmpmode = SImode; + else + tmpmode = DImode; + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (tmpmode, *intreg), + GEN_INT (i*8)); + intreg++; + break; + case X86_64_SSESF_CLASS: + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (SFmode, + GET_SSE_REGNO (sse_regno)), + GEN_INT (i*8)); + sse_regno++; + break; + case X86_64_SSEDF_CLASS: + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (DFmode, + GET_SSE_REGNO (sse_regno)), + GEN_INT (i*8)); + sse_regno++; + break; + case X86_64_SSE_CLASS: + pos = i; + switch (n) + { + case 1: + tmpmode = DImode; + break; + case 2: + if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS) { - mask = recip_options[i].mask; - break; + tmpmode = TImode; + i++; } - - if (i == ARRAY_SIZE (recip_options)) - { - error ("unknown option for %<-mrecip=%s%>", q); - invert = false; - mask = RECIP_MASK_NONE; - } - } - - opts->x_recip_mask_explicit |= mask; - if (invert) - opts->x_recip_mask &= ~mask; - else - opts->x_recip_mask |= mask; + else + tmpmode = DImode; + break; + case 4: + gcc_assert (i == 0 + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS); + tmpmode = OImode; + i += 3; + break; + case 8: + gcc_assert (i == 0 + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && regclass[4] == X86_64_SSEUP_CLASS + && regclass[5] == X86_64_SSEUP_CLASS + && regclass[6] == X86_64_SSEUP_CLASS + && regclass[7] == X86_64_SSEUP_CLASS); + tmpmode = XImode; + i += 7; + break; + default: + gcc_unreachable (); + } + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (tmpmode, + GET_SSE_REGNO (sse_regno)), + GEN_INT (pos*8)); + sse_regno++; + break; + default: + gcc_unreachable (); } } - if (TARGET_RECIP_P (opts->x_target_flags)) - opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit; - else if (opts_set->x_target_flags & MASK_RECIP) - opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit); + /* Empty aligned struct, union or class. */ + if (nexps == 0) + return NULL; + + ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps)); + for (i = 0; i < nexps; i++) + XVECEXP (ret, 0, i) = exp [i]; + return ret; +} + +/* Update the data in CUM to advance over an argument of mode MODE + and data type TYPE. (TYPE is null for libcalls where that information + may not be available.) 
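For the removed -mrecip= handling a few lines up: strtok walks a comma-separated list of the keywords from the recip_options table near the top of ix86_option_override_internal, and a leading '!' clears the corresponding mask bits instead of setting them. A minimal sketch of that mask arithmetic, using placeholder bit values rather than the real RECIP_MASK_* macros:

#define EX_RECIP_DIV       0x01
#define EX_RECIP_SQRT      0x02
#define EX_RECIP_VEC_DIV   0x04
#define EX_RECIP_VEC_SQRT  0x08
#define EX_RECIP_ALL       0x0f

/* What "-mrecip=all,!sqrt" folds into: everything enabled except the
   scalar square-root approximation.  */
static unsigned int
example_recip_mask (void)
{
  unsigned int mask = 0;
  mask |= EX_RECIP_ALL;       /* "all"   sets every bit            */
  mask &= ~EX_RECIP_SQRT;     /* "!sqrt" clears just that one bit  */
  return mask;                /* div, vec-div and vec-sqrt remain  */
}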
- /* Default long double to 64-bit for 32-bit Bionic and to __float128 - for 64-bit Bionic. Also default long double to 64-bit for Intel - MCU psABI. */ - if ((TARGET_HAS_BIONIC || TARGET_IAMCU) - && !(opts_set->x_target_flags - & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128))) - opts->x_target_flags |= (TARGET_64BIT - ? MASK_LONG_DOUBLE_128 - : MASK_LONG_DOUBLE_64); + Return a number of integer regsiters advanced over. */ - /* Only one of them can be active. */ - gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0 - || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0); +static int +function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode, + const_tree type, HOST_WIDE_INT bytes, + HOST_WIDE_INT words) +{ + int res = 0; + bool error_p = false; - /* Handle stack protector */ - if (!opts_set->x_ix86_stack_protector_guard) + if (TARGET_IAMCU) { -#ifdef TARGET_THREAD_SSP_OFFSET - if (!TARGET_HAS_BIONIC) - opts->x_ix86_stack_protector_guard = SSP_TLS; - else -#endif - opts->x_ix86_stack_protector_guard = SSP_GLOBAL; + /* Intel MCU psABI passes scalars and aggregates no larger than 8 + bytes in registers. */ + if (!VECTOR_MODE_P (mode) && bytes <= 8) + goto pass_in_reg; + return res; } - if (opts_set->x_ix86_stack_protector_guard_offset_str) + switch (mode) { - char *endp; - const char *str = opts->x_ix86_stack_protector_guard_offset_str; - - errno = 0; - int64_t offset; - -#if defined(INT64_T_IS_LONG) - offset = strtol (str, &endp, 0); -#else - offset = strtoll (str, &endp, 0); -#endif - - if (!*str || *endp || errno) - error ("%qs is not a valid number " - "in %<-mstack-protector-guard-offset=%>", str); - - if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000), - HOST_WIDE_INT_C (0x7fffffff))) - error ("%qs is not a valid offset " - "in %<-mstack-protector-guard-offset=%>", str); - - opts->x_ix86_stack_protector_guard_offset = offset; - } -#ifdef TARGET_THREAD_SSP_OFFSET - else - opts->x_ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET; -#endif - - if (opts_set->x_ix86_stack_protector_guard_reg_str) - { - const char *str = opts->x_ix86_stack_protector_guard_reg_str; - addr_space_t seg = ADDR_SPACE_GENERIC; + default: + break; - /* Discard optional register prefix. */ - if (str[0] == '%') - str++; + case E_BLKmode: + if (bytes < 0) + break; + /* FALLTHRU */ - if (strlen (str) == 2 && str[1] == 's') + case E_DImode: + case E_SImode: + case E_HImode: + case E_QImode: +pass_in_reg: + cum->words += words; + cum->nregs -= words; + cum->regno += words; + if (cum->nregs >= 0) + res = words; + if (cum->nregs <= 0) { - if (str[0] == 'f') - seg = ADDR_SPACE_SEG_FS; - else if (str[0] == 'g') - seg = ADDR_SPACE_SEG_GS; + cum->nregs = 0; + cfun->machine->arg_reg_available = false; + cum->regno = 0; } + break; - if (seg == ADDR_SPACE_GENERIC) - error ("%qs is not a valid base register " - "in %<-mstack-protector-guard-reg=%>", - opts->x_ix86_stack_protector_guard_reg_str); + case E_OImode: + /* OImode shouldn't be used directly. 
*/ + gcc_unreachable (); - opts->x_ix86_stack_protector_guard_reg = seg; - } - else - { - opts->x_ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG; + case E_DFmode: + if (cum->float_in_sse == -1) + error_p = true; + if (cum->float_in_sse < 2) + break; + /* FALLTHRU */ + case E_SFmode: + if (cum->float_in_sse == -1) + error_p = true; + if (cum->float_in_sse < 1) + break; + /* FALLTHRU */ - /* The kernel uses a different segment register for performance - reasons; a system call would not have to trash the userspace - segment register, which would be expensive. */ - if (opts->x_ix86_cmodel == CM_KERNEL) - opts->x_ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS; - } + case E_V8SFmode: + case E_V8SImode: + case E_V64QImode: + case E_V32HImode: + case E_V16SImode: + case E_V8DImode: + case E_V16SFmode: + case E_V8DFmode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + case E_TImode: + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V4SFmode: + case E_V2DFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + cum->sse_words += words; + cum->sse_nregs -= 1; + cum->sse_regno += 1; + if (cum->sse_nregs <= 0) + { + cum->sse_nregs = 0; + cum->sse_regno = 0; + } + } + break; - /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ - if (opts->x_ix86_tune_memcpy_strategy) - { - char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy); - ix86_parse_stringop_strategy_string (str, false); - free (str); + case E_V8QImode: + case E_V4HImode: + case E_V2SImode: + case E_V2SFmode: + case E_V1TImode: + case E_V1DImode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + cum->mmx_words += words; + cum->mmx_nregs -= 1; + cum->mmx_regno += 1; + if (cum->mmx_nregs <= 0) + { + cum->mmx_nregs = 0; + cum->mmx_regno = 0; + } + } + break; } - - if (opts->x_ix86_tune_memset_strategy) + if (error_p) { - char *str = xstrdup (opts->x_ix86_tune_memset_strategy); - ix86_parse_stringop_strategy_string (str, true); - free (str); + cum->float_in_sse = 0; + error ("calling %qD with SSE calling convention without " + "SSE/SSE2 enabled", cum->decl); + sorry ("this is a GCC bug that can be worked around by adding " + "attribute used to function called"); } - /* Save the initial options in case the user does function specific - options. */ - if (main_args_p) - target_option_default_node = target_option_current_node - = build_target_option_node (opts); - - if (opts->x_flag_cf_protection != CF_NONE) - opts->x_flag_cf_protection - = (cf_protection_level) (opts->x_flag_cf_protection | CF_SET); - - if (ix86_tune_features [X86_TUNE_AVOID_256FMA_CHAINS]) - maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 256, - opts->x_param_values, - opts_set->x_param_values); - else if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS]) - maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128, - opts->x_param_values, - opts_set->x_param_values); - - /* PR86952: jump table usage with retpolines is slow. - The PR provides some numbers about the slowness. */ - if (ix86_indirect_branch != indirect_branch_keep - && !opts_set->x_flag_jump_tables) - opts->x_flag_jump_tables = 0; - - return true; -} - -/* Implement the TARGET_OPTION_OVERRIDE hook. */ - -static void -ix86_option_override (void) -{ - ix86_option_override_internal (true, &global_options, &global_options_set); -} - -/* Implement the TARGET_OFFLOAD_OPTIONS hook. 
*/ -static char * -ix86_offload_options (void) -{ - if (TARGET_LP64) - return xstrdup ("-foffload-abi=lp64"); - return xstrdup ("-foffload-abi=ilp32"); + return res; } -/* Update register usage after having seen the compiler flags. */ - -static void -ix86_conditional_register_usage (void) +static int +function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode, + const_tree type, HOST_WIDE_INT words, bool named) { - int i, c_mask; + int int_nregs, sse_nregs; - /* If there are no caller-saved registers, preserve all registers. - except fixed_regs and registers used for function return value - since aggregate_value_p checks call_used_regs[regno] on return - value. */ - if (cfun && cfun->machine->no_caller_saved_registers) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (!fixed_regs[i] && !ix86_function_value_regno_p (i)) - call_used_regs[i] = 0; + /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */ + if (!named && (VALID_AVX512F_REG_MODE (mode) + || VALID_AVX256_REG_MODE (mode))) + return 0; - /* For 32-bit targets, squash the REX registers. */ - if (! TARGET_64BIT) + if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs) + && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) { - for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + cum->nregs -= int_nregs; + cum->sse_nregs -= sse_nregs; + cum->regno += int_nregs; + cum->sse_regno += sse_nregs; + return int_nregs; } - - /* See the definition of CALL_USED_REGISTERS in i386.h. */ - c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI); - - CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]); - - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + else { - /* Set/reset conditionally defined registers from - CALL_USED_REGISTERS initializer. */ - if (call_used_regs[i] > 1) - call_used_regs[i] = !!(call_used_regs[i] & c_mask); - - /* Calculate registers of CLOBBERED_REGS register set - as call used registers from GENERAL_REGS register set. */ - if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i) - && call_used_regs[i]) - SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i); + int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD; + cum->words = ROUND_UP (cum->words, align); + cum->words += words; + return 0; } +} - /* If MMX is disabled, squash the registers. */ - if (! TARGET_MMX) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i)) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - - /* If SSE is disabled, squash the registers. */ - if (! TARGET_SSE) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i)) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - - /* If the FPU is disabled, squash the registers. */ - if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i)) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; +static int +function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, + HOST_WIDE_INT words) +{ + /* Otherwise, this should be passed indirect. 
*/ + gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8); - /* If AVX512F is disabled, squash the registers. */ - if (! TARGET_AVX512F) + cum->words += words; + if (cum->nregs > 0) { - for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - - for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + cum->nregs -= 1; + cum->regno += 1; + return 1; } + return 0; } -/* Canonicalize a comparison from one we don't have to one we do have. */ +/* Update the data in CUM to advance over argument ARG. */ static void -ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1, - bool op0_preserve_value) +ix86_function_arg_advance (cumulative_args_t cum_v, + const function_arg_info &arg) { - /* The order of operands in x87 ficom compare is forced by combine in - simplify_comparison () function. Float operator is treated as RTX_OBJ - with a precedence over other operators and is always put in the first - place. Swap condition and operands to match ficom instruction. */ - if (!op0_preserve_value - && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1)) - { - enum rtx_code scode = swap_condition ((enum rtx_code) *code); + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + machine_mode mode = arg.mode; + HOST_WIDE_INT bytes, words; + int nregs; - /* We are called only for compares that are split to SAHF instruction. - Ensure that we have setcc/jcc insn for the swapped condition. */ - if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN) - { - std::swap (*op0, *op1); - *code = (int) scode; - } - } -} - -/* Save the current options */ + /* The argument of interrupt handler is a special case and is + handled in ix86_function_arg. 
*/ + if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) + return; -static void -ix86_function_specific_save (struct cl_target_option *ptr, - struct gcc_options *opts) -{ - ptr->arch = ix86_arch; - ptr->schedule = ix86_schedule; - ptr->prefetch_sse = x86_prefetch_sse; - ptr->tune = ix86_tune; - ptr->branch_cost = ix86_branch_cost; - ptr->tune_defaulted = ix86_tune_defaulted; - ptr->arch_specified = ix86_arch_specified; - ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit; - ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit; - ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit; - ptr->x_ix86_arch_string = opts->x_ix86_arch_string; - ptr->x_ix86_tune_string = opts->x_ix86_tune_string; - ptr->x_ix86_cmodel = opts->x_ix86_cmodel; - ptr->x_ix86_abi = opts->x_ix86_abi; - ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect; - ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost; - ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes; - ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer; - ptr->x_ix86_force_drap = opts->x_ix86_force_drap; - ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg; - ptr->x_ix86_pmode = opts->x_ix86_pmode; - ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg; - ptr->x_ix86_recip_name = opts->x_ix86_recip_name; - ptr->x_ix86_regparm = opts->x_ix86_regparm; - ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold; - ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx; - ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard; - ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg; - ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect; - ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string; - ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy; - ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy; - ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default; - ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type; - - /* The fields are char but the variables are not; make sure the - values fit in the fields. */ - gcc_assert (ptr->arch == ix86_arch); - gcc_assert (ptr->schedule == ix86_schedule); - gcc_assert (ptr->tune == ix86_tune); - gcc_assert (ptr->branch_cost == ix86_branch_cost); -} - -/* Restore the current options */ + bytes = arg.promoted_size_in_bytes (); + words = CEIL (bytes, UNITS_PER_WORD); -static void -ix86_function_specific_restore (struct gcc_options *opts, - struct cl_target_option *ptr) -{ - enum processor_type old_tune = ix86_tune; - enum processor_type old_arch = ix86_arch; - unsigned HOST_WIDE_INT ix86_arch_mask; - int i; + if (arg.type) + mode = type_natural_mode (arg.type, NULL, false); + + if (TARGET_64BIT) + { + enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; - /* We don't change -fPIC. 
*/ - opts->x_flag_pic = flag_pic; - - ix86_arch = (enum processor_type) ptr->arch; - ix86_schedule = (enum attr_cpu) ptr->schedule; - ix86_tune = (enum processor_type) ptr->tune; - x86_prefetch_sse = ptr->prefetch_sse; - opts->x_ix86_branch_cost = ptr->branch_cost; - ix86_tune_defaulted = ptr->tune_defaulted; - ix86_arch_specified = ptr->arch_specified; - opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; - opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit; - opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit; - opts->x_ix86_arch_string = ptr->x_ix86_arch_string; - opts->x_ix86_tune_string = ptr->x_ix86_tune_string; - opts->x_ix86_cmodel = ptr->x_ix86_cmodel; - opts->x_ix86_abi = ptr->x_ix86_abi; - opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect; - opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost; - opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes; - opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer; - opts->x_ix86_force_drap = ptr->x_ix86_force_drap; - opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg; - opts->x_ix86_pmode = ptr->x_ix86_pmode; - opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg; - opts->x_ix86_recip_name = ptr->x_ix86_recip_name; - opts->x_ix86_regparm = ptr->x_ix86_regparm; - opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold; - opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx; - opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard; - opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg; - opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect; - opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string; - opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy; - opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy; - opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default; - opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type; - ix86_tune_cost = processor_cost_table[ix86_tune]; - /* TODO: ix86_cost should be chosen at instruction or function granuality - so for cold code we use size_cost even in !optimize_size compilation. */ - if (opts->x_optimize_size) - ix86_cost = &ix86_size_cost; + if (call_abi == MS_ABI) + nregs = function_arg_advance_ms_64 (cum, bytes, words); + else + nregs = function_arg_advance_64 (cum, mode, arg.type, words, + arg.named); + } else - ix86_cost = ix86_tune_cost; + nregs = function_arg_advance_32 (cum, mode, arg.type, bytes, words); - /* Recreate the arch feature tests if the arch changed */ - if (old_arch != ix86_arch) + if (!nregs) { - ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; - for (i = 0; i < X86_ARCH_LAST; ++i) - ix86_arch_features[i] - = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + /* Track if there are outgoing arguments on stack. */ + if (cum->caller) + cfun->machine->outgoing_args_on_stack = true; } - - /* Recreate the tune optimization tests */ - if (old_tune != ix86_tune) - set_ix86_tune_features (ix86_tune, false); } -/* Adjust target options after streaming them in. This is mainly about - reconciling them with global options. */ - -static void -ix86_function_specific_post_stream_in (struct cl_target_option *ptr) -{ - /* flag_pic is a global option, but ix86_cmodel is target saved option - partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel - for PIC, or error out. 
*/ - if (flag_pic) - switch (ptr->x_ix86_cmodel) - { - case CM_SMALL: - ptr->x_ix86_cmodel = CM_SMALL_PIC; - break; +/* Define where to put the arguments to a function. + Value is zero to push the argument on the stack, + or a hard register in which to store the argument. - case CM_MEDIUM: - ptr->x_ix86_cmodel = CM_MEDIUM_PIC; - break; - - case CM_LARGE: - ptr->x_ix86_cmodel = CM_LARGE_PIC; - break; - - case CM_KERNEL: - error ("code model %s does not support PIC mode", "kernel"); - break; - - default: - break; - } - else - switch (ptr->x_ix86_cmodel) - { - case CM_SMALL_PIC: - ptr->x_ix86_cmodel = CM_SMALL; - break; - - case CM_MEDIUM_PIC: - ptr->x_ix86_cmodel = CM_MEDIUM; - break; - - case CM_LARGE_PIC: - ptr->x_ix86_cmodel = CM_LARGE; - break; - - default: - break; - } -} - -/* Print the current options */ - -static void -ix86_function_specific_print (FILE *file, int indent, - struct cl_target_option *ptr) -{ - char *target_string - = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2, - ptr->x_target_flags, ptr->x_ix86_target_flags, - NULL, NULL, ptr->x_ix86_fpmath, false, true); - - gcc_assert (ptr->arch < PROCESSOR_max); - fprintf (file, "%*sarch = %d (%s)\n", - indent, "", - ptr->arch, processor_names[ptr->arch]); - - gcc_assert (ptr->tune < PROCESSOR_max); - fprintf (file, "%*stune = %d (%s)\n", - indent, "", - ptr->tune, processor_names[ptr->tune]); - - fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); - - if (target_string) - { - fprintf (file, "%*s%s\n", indent, "", target_string); - free (target_string); - } -} - - -/* Inner function to process the attribute((target(...))), take an argument and - set the current options from the argument. If we have a list, recursively go - over the list. */ + MODE is the argument's machine mode. + TYPE is the data type of the argument (as a tree). + This is null for libcalls where that information may + not be available. + CUM is a variable of type CUMULATIVE_ARGS which gives info about + the preceding args and about the function being called. + NAMED is nonzero if this argument is a named parameter + (otherwise it is an extra parameter matching an ellipsis). 
*/ -static bool -ix86_valid_target_attribute_inner_p (tree args, char *p_strings[], - struct gcc_options *opts, - struct gcc_options *opts_set, - struct gcc_options *enum_opts_set) +static rtx +function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode, + machine_mode orig_mode, const_tree type, + HOST_WIDE_INT bytes, HOST_WIDE_INT words) { - char *next_optstr; - bool ret = true; - -#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 } -#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 } -#define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 } -#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } -#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } - - enum ix86_opt_type - { - ix86_opt_unknown, - ix86_opt_yes, - ix86_opt_no, - ix86_opt_str, - ix86_opt_enum, - ix86_opt_isa - }; - - static const struct - { - const char *string; - size_t len; - enum ix86_opt_type type; - int opt; - int mask; - } attrs[] = { - /* isa options */ - IX86_ATTR_ISA ("pconfig", OPT_mpconfig), - IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd), - IX86_ATTR_ISA ("sgx", OPT_msgx), - IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps), - IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw), - IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq), - IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2), - IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni), - IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg), - - IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi), - IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma), - IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl), - IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw), - IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq), - IX86_ATTR_ISA ("avx512er", OPT_mavx512er), - IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf), - IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd), - IX86_ATTR_ISA ("avx512f", OPT_mavx512f), - IX86_ATTR_ISA ("avx2", OPT_mavx2), - IX86_ATTR_ISA ("fma", OPT_mfma), - IX86_ATTR_ISA ("xop", OPT_mxop), - IX86_ATTR_ISA ("fma4", OPT_mfma4), - IX86_ATTR_ISA ("f16c", OPT_mf16c), - IX86_ATTR_ISA ("avx", OPT_mavx), - IX86_ATTR_ISA ("sse4", OPT_msse4), - IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), - IX86_ATTR_ISA ("sse4.1", OPT_msse4_1), - IX86_ATTR_ISA ("sse4a", OPT_msse4a), - IX86_ATTR_ISA ("ssse3", OPT_mssse3), - IX86_ATTR_ISA ("sse3", OPT_msse3), - IX86_ATTR_ISA ("aes", OPT_maes), - IX86_ATTR_ISA ("sha", OPT_msha), - IX86_ATTR_ISA ("pclmul", OPT_mpclmul), - IX86_ATTR_ISA ("sse2", OPT_msse2), - IX86_ATTR_ISA ("sse", OPT_msse), - IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa), - IX86_ATTR_ISA ("3dnow", OPT_m3dnow), - IX86_ATTR_ISA ("mmx", OPT_mmmx), - IX86_ATTR_ISA ("rtm", OPT_mrtm), - IX86_ATTR_ISA ("prfchw", OPT_mprfchw), - IX86_ATTR_ISA ("rdseed", OPT_mrdseed), - IX86_ATTR_ISA ("adx", OPT_madx), - IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1), - IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt), - IX86_ATTR_ISA ("xsaves", OPT_mxsaves), - IX86_ATTR_ISA ("xsavec", OPT_mxsavec), - IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt), - IX86_ATTR_ISA ("xsave", OPT_mxsave), - IX86_ATTR_ISA ("abm", OPT_mabm), - IX86_ATTR_ISA ("bmi", OPT_mbmi), - IX86_ATTR_ISA ("bmi2", OPT_mbmi2), - IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt), - IX86_ATTR_ISA ("tbm", OPT_mtbm), - IX86_ATTR_ISA ("popcnt", OPT_mpopcnt), - IX86_ATTR_ISA ("cx16", OPT_mcx16), - IX86_ATTR_ISA ("sahf", OPT_msahf), - IX86_ATTR_ISA ("movbe", OPT_mmovbe), - IX86_ATTR_ISA ("crc32", OPT_mcrc32), - IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), - IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), - IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx), - IX86_ATTR_ISA ("clzero", 
OPT_mclzero), - IX86_ATTR_ISA ("pku", OPT_mpku), - IX86_ATTR_ISA ("lwp", OPT_mlwp), - IX86_ATTR_ISA ("hle", OPT_mhle), - IX86_ATTR_ISA ("fxsr", OPT_mfxsr), - IX86_ATTR_ISA ("clwb", OPT_mclwb), - IX86_ATTR_ISA ("rdpid", OPT_mrdpid), - IX86_ATTR_ISA ("gfni", OPT_mgfni), - IX86_ATTR_ISA ("shstk", OPT_mshstk), - IX86_ATTR_ISA ("vaes", OPT_mvaes), - IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq), - IX86_ATTR_ISA ("movdiri", OPT_mmovdiri), - IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b), - IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), - IX86_ATTR_ISA ("cldemote", OPT_mcldemote), - IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), - - /* enum options */ - IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), - - /* string options */ - IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), - IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE), - - /* flag options */ - IX86_ATTR_YES ("cld", - OPT_mcld, - MASK_CLD), - - IX86_ATTR_NO ("fancy-math-387", - OPT_mfancy_math_387, - MASK_NO_FANCY_MATH_387), - - IX86_ATTR_YES ("ieee-fp", - OPT_mieee_fp, - MASK_IEEE_FP), - - IX86_ATTR_YES ("inline-all-stringops", - OPT_minline_all_stringops, - MASK_INLINE_ALL_STRINGOPS), - - IX86_ATTR_YES ("inline-stringops-dynamically", - OPT_minline_stringops_dynamically, - MASK_INLINE_STRINGOPS_DYNAMICALLY), - - IX86_ATTR_NO ("align-stringops", - OPT_mno_align_stringops, - MASK_NO_ALIGN_STRINGOPS), - - IX86_ATTR_YES ("recip", - OPT_mrecip, - MASK_RECIP), - - }; - - /* If this is a list, recurse to get the options. */ - if (TREE_CODE (args) == TREE_LIST) - { - bool ret = true; - - for (; args; args = TREE_CHAIN (args)) - if (TREE_VALUE (args) - && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args), - p_strings, opts, opts_set, - enum_opts_set)) - ret = false; + bool error_p = false; - return ret; - } + /* Avoid the AL settings for the Unix64 ABI. */ + if (mode == VOIDmode) + return constm1_rtx; - else if (TREE_CODE (args) != STRING_CST) + if (TARGET_IAMCU) { - error ("attribute %<target%> argument not a string"); - return false; + /* Intel MCU psABI passes scalars and aggregates no larger than 8 + bytes in registers. */ + if (!VECTOR_MODE_P (mode) && bytes <= 8) + goto pass_in_reg; + return NULL_RTX; } - /* Handle multiple arguments separated by commas. */ - next_optstr = ASTRDUP (TREE_STRING_POINTER (args)); - - while (next_optstr && *next_optstr != '\0') + switch (mode) { - char *p = next_optstr; - char *orig_p = p; - char *comma = strchr (next_optstr, ','); - const char *opt_string; - size_t len, opt_len; - int opt; - bool opt_set_p; - char ch; - unsigned i; - enum ix86_opt_type type = ix86_opt_unknown; - int mask = 0; + default: + break; - if (comma) - { - *comma = '\0'; - len = comma - next_optstr; - next_optstr = comma + 1; - } - else + case E_BLKmode: + if (bytes < 0) + break; + /* FALLTHRU */ + case E_DImode: + case E_SImode: + case E_HImode: + case E_QImode: +pass_in_reg: + if (words <= cum->nregs) { - len = strlen (p); - next_optstr = NULL; - } + int regno = cum->regno; - /* Recognize no-xxx. */ - if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-') - { - opt_set_p = false; - p += 3; - len -= 3; - } - else - opt_set_p = true; - - /* Find the option. */ - ch = *p; - opt = N_OPTS; - for (i = 0; i < ARRAY_SIZE (attrs); i++) - { - type = attrs[i].type; - opt_len = attrs[i].len; - if (ch == attrs[i].string[0] - && ((type != ix86_opt_str && type != ix86_opt_enum) - ?
len == opt_len - : len > opt_len) - && memcmp (p, attrs[i].string, opt_len) == 0) + /* Fastcall allocates the first two DWORD (SImode) or + smaller arguments to ECX and EDX if it isn't an + aggregate type . */ + if (cum->fastcall) { - opt = attrs[i].opt; - mask = attrs[i].mask; - opt_string = attrs[i].string; - break; - } - } + if (mode == BLKmode + || mode == DImode + || (type && AGGREGATE_TYPE_P (type))) + break; - /* Process the option. */ - if (opt == N_OPTS) - { - error ("attribute(target(\"%s\")) is unknown", orig_p); - ret = false; + /* ECX not EAX is the first allocated register. */ + if (regno == AX_REG) + regno = CX_REG; + } + return gen_rtx_REG (mode, regno); } + break; - else if (type == ix86_opt_isa) + case E_DFmode: + if (cum->float_in_sse == -1) + error_p = true; + if (cum->float_in_sse < 2) + break; + /* FALLTHRU */ + case E_SFmode: + if (cum->float_in_sse == -1) + error_p = true; + if (cum->float_in_sse < 1) + break; + /* FALLTHRU */ + case E_TImode: + /* In 32bit, we pass TImode in xmm registers. */ + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V4SFmode: + case E_V2DFmode: + if (!type || !AGGREGATE_TYPE_P (type)) { - struct cl_decoded_option decoded; - - generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded); - ix86_handle_option (opts, opts_set, - &decoded, input_location); + if (cum->sse_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->sse_regno + FIRST_SSE_REG); } + break; - else if (type == ix86_opt_yes || type == ix86_opt_no) - { - if (type == ix86_opt_no) - opt_set_p = !opt_set_p; - - if (opt_set_p) - opts->x_target_flags |= mask; - else - opts->x_target_flags &= ~mask; - } + case E_OImode: + case E_XImode: + /* OImode and XImode shouldn't be used directly. */ + gcc_unreachable (); - else if (type == ix86_opt_str) + case E_V64QImode: + case E_V32HImode: + case E_V16SImode: + case E_V8DImode: + case E_V16SFmode: + case E_V8DFmode: + case E_V8SFmode: + case E_V8SImode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + if (!type || !AGGREGATE_TYPE_P (type)) { - if (p_strings[opt]) - { - error ("option(\"%s\") was already specified", opt_string); - ret = false; - } - else - { - p_strings[opt] = xstrdup (p + opt_len); - if (opt == IX86_FUNCTION_SPECIFIC_ARCH) - { - /* If arch= is set, clear all bits in x_ix86_isa_flags, - except for ISA_64BIT, ABI_64, ABI_X32, and CODE16 - and all bits in x_ix86_isa_flags2. 
*/ - opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT - | OPTION_MASK_ABI_64 - | OPTION_MASK_ABI_X32 - | OPTION_MASK_CODE16); - opts->x_ix86_isa_flags_explicit &= (OPTION_MASK_ISA_64BIT - | OPTION_MASK_ABI_64 - | OPTION_MASK_ABI_X32 - | OPTION_MASK_CODE16); - opts->x_ix86_isa_flags2 = 0; - opts->x_ix86_isa_flags2_explicit = 0; - } - } + if (cum->sse_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->sse_regno + FIRST_SSE_REG); } + break; - else if (type == ix86_opt_enum) + case E_V8QImode: + case E_V4HImode: + case E_V2SImode: + case E_V2SFmode: + case E_V1TImode: + case E_V1DImode: + if (!type || !AGGREGATE_TYPE_P (type)) { - bool arg_ok; - int value; - - arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); - if (arg_ok) - set_option (opts, enum_opts_set, opt, value, - p + opt_len, DK_UNSPECIFIED, input_location, - global_dc); - else - { - error ("attribute(target(\"%s\")) is unknown", orig_p); - ret = false; - } + if (cum->mmx_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->mmx_regno + FIRST_MMX_REG); } - - else - gcc_unreachable (); + break; + } + if (error_p) + { + cum->float_in_sse = 0; + error ("calling %qD with SSE calling convention without " + "SSE/SSE2 enabled", cum->decl); + sorry ("this is a GCC bug that can be worked around by adding " + "attribute used to function called"); } - return ret; -} - -/* Release allocated strings. */ -static void -release_options_strings (char **option_strings) -{ - /* Free up memory allocated to hold the strings */ - for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) - free (option_strings[i]); + return NULL_RTX; } -/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */ - -tree -ix86_valid_target_attribute_tree (tree args, - struct gcc_options *opts, - struct gcc_options *opts_set) +static rtx +function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, + machine_mode orig_mode, const_tree type, bool named) { - const char *orig_arch_string = opts->x_ix86_arch_string; - const char *orig_tune_string = opts->x_ix86_tune_string; - enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath; - int orig_tune_defaulted = ix86_tune_defaulted; - int orig_arch_specified = ix86_arch_specified; - char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL }; - tree t = NULL_TREE; - struct cl_target_option *def - = TREE_TARGET_OPTION (target_option_default_node); - struct gcc_options enum_opts_set; - - memset (&enum_opts_set, 0, sizeof (enum_opts_set)); - - /* Process each of the options on the chain. */ - if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts, - opts_set, &enum_opts_set)) - return error_mark_node; + /* Handle a hidden AL argument containing number of registers + for varargs x86-64 functions. */ + if (mode == VOIDmode) + return GEN_INT (cum->maybe_vaarg + ? (cum->sse_nregs < 0 + ? X86_64_SSE_REGPARM_MAX + : cum->sse_regno) + : -1); - /* If the changed options are different from the default, rerun - ix86_option_override_internal, and then save the options away. - The string options are attribute options, and will be undone - when we copy the save structure. 
*/ - if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags - || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2 - || opts->x_target_flags != def->x_target_flags - || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] - || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] - || enum_opts_set.x_ix86_fpmath) + switch (mode) { - /* If we are using the default tune= or arch=, undo the string assigned, - and use the default. */ - if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) - opts->x_ix86_arch_string - = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]); - else if (!orig_arch_specified) - opts->x_ix86_arch_string = NULL; - - if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) - opts->x_ix86_tune_string - = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); - else if (orig_tune_defaulted) - opts->x_ix86_tune_string = NULL; - - /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ - if (enum_opts_set.x_ix86_fpmath) - opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; - - /* Do any overrides, such as arch=xxx, or tune=xxx support. */ - bool r = ix86_option_override_internal (false, opts, opts_set); - if (!r) - { - release_options_strings (option_strings); - return error_mark_node; - } - - /* Add any builtin functions with the new isa if any. */ - ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2); - - /* Save the current options unless we are validating options for - #pragma. */ - t = build_target_option_node (opts); - - opts->x_ix86_arch_string = orig_arch_string; - opts->x_ix86_tune_string = orig_tune_string; - opts_set->x_ix86_fpmath = orig_fpmath_set; + default: + break; - release_options_strings (option_strings); + case E_V8SFmode: + case E_V8SImode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + case E_V16SFmode: + case E_V16SImode: + case E_V64QImode: + case E_V32HImode: + case E_V8DFmode: + case E_V8DImode: + /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ + if (!named) + return NULL; + break; } - return t; + return construct_container (mode, orig_mode, type, 0, cum->nregs, + cum->sse_nregs, + &x86_64_int_parameter_registers [cum->regno], + cum->sse_regno); } -/* Hook to validate attribute((target("string"))). */ - -static bool -ix86_valid_target_attribute_p (tree fndecl, - tree ARG_UNUSED (name), - tree args, - int ARG_UNUSED (flags)) -{ - struct gcc_options func_options; - tree new_target, new_optimize; - bool ret = true; - - /* attribute((target("default"))) does nothing, beyond - affecting multi-versioning. */ - if (TREE_VALUE (args) - && TREE_CODE (TREE_VALUE (args)) == STRING_CST - && TREE_CHAIN (args) == NULL_TREE - && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) - return true; +static rtx +function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, + machine_mode orig_mode, bool named, const_tree type, + HOST_WIDE_INT bytes) +{ + unsigned int regno; - tree old_optimize = build_optimization_node (&global_options); - - /* Get the optimization options of the current function. */ - tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); - - if (!func_optimize) - func_optimize = old_optimize; - - /* Init func_options. */ - memset (&func_options, 0, sizeof (func_options)); - init_options_struct (&func_options, NULL); - lang_hooks.init_options_struct (&func_options); - - cl_optimization_restore (&func_options, - TREE_OPTIMIZATION (func_optimize)); - - /* Initialize func_options to the default before its target options can - be set. 
*/ - cl_target_option_restore (&func_options, - TREE_TARGET_OPTION (target_option_default_node)); - - new_target = ix86_valid_target_attribute_tree (args, &func_options, - &global_options_set); - - new_optimize = build_optimization_node (&func_options); - - if (new_target == error_mark_node) - ret = false; - - else if (fndecl && new_target) - { - DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; - - if (old_optimize != new_optimize) - DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; - } - - finalize_options_struct (&func_options); - - return ret; -} - - -/* Hook to determine if one function can safely inline another. */ - -static bool -ix86_can_inline_p (tree caller, tree callee) -{ - tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); - tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); - - /* Changes of those flags can be tolerated for always inlines. Lets hope - user knows what he is doing. */ - const unsigned HOST_WIDE_INT always_inline_safe_mask - = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS - | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD - | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD - | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS - | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE - | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER - | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER); - - - if (!callee_tree) - callee_tree = target_option_default_node; - if (!caller_tree) - caller_tree = target_option_default_node; - if (callee_tree == caller_tree) - return true; - - struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); - struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); - bool ret = false; - bool always_inline - = (DECL_DISREGARD_INLINE_LIMITS (callee) - && lookup_attribute ("always_inline", - DECL_ATTRIBUTES (callee))); - - cgraph_node *callee_node = cgraph_node::get (callee); - /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 - function can inline a SSE2 function but a SSE2 function can't inline - a SSE4 function. */ - if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) - != callee_opts->x_ix86_isa_flags) - || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) - != callee_opts->x_ix86_isa_flags2)) - ret = false; - - /* See if we have the same non-isa options. */ - else if ((!always_inline - && caller_opts->x_target_flags != callee_opts->x_target_flags) - || (caller_opts->x_target_flags & ~always_inline_safe_mask) - != (callee_opts->x_target_flags & ~always_inline_safe_mask)) - ret = false; - - /* See if arch, tune, etc. are the same. */ - else if (caller_opts->arch != callee_opts->arch) - ret = false; - - else if (!always_inline && caller_opts->tune != callee_opts->tune) - ret = false; - - else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath - /* If the calle doesn't use FP expressions differences in - ix86_fpmath can be ignored. We are called from FEs - for multi-versioning call optimization, so beware of - ipa_fn_summaries not available. */ - && (! ipa_fn_summaries - || ipa_fn_summaries->get (callee_node) == NULL - || ipa_fn_summaries->get (callee_node)->fp_expressions)) - ret = false; - - else if (!always_inline - && caller_opts->branch_cost != callee_opts->branch_cost) - ret = false; - - else - ret = true; - - return ret; -} - - -/* Remember the last target of ix86_set_current_function. 
*/ -static GTY(()) tree ix86_previous_fndecl; - -/* Set targets globals to the default (or current #pragma GCC target - if active). Invalidate ix86_previous_fndecl cache. */ + /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call. + We use value of -2 to specify that current function call is MSABI. */ + if (mode == VOIDmode) + return GEN_INT (-2); -void -ix86_reset_previous_fndecl (void) -{ - tree new_tree = target_option_current_node; - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); - if (TREE_TARGET_GLOBALS (new_tree)) - restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); - else if (new_tree == target_option_default_node) - restore_target_globals (&default_target_globals); - else - TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); - ix86_previous_fndecl = NULL_TREE; -} + /* If we've run out of registers, it goes on the stack. */ + if (cum->nregs == 0) + return NULL_RTX; -/* Set the func_type field from the function FNDECL. */ + regno = x86_64_ms_abi_int_parameter_registers[cum->regno]; -static void -ix86_set_func_type (tree fndecl) -{ - if (cfun->machine->func_type == TYPE_UNKNOWN) + /* Only floating point modes are passed in anything but integer regs. */ + if (TARGET_SSE && (mode == SFmode || mode == DFmode)) { - if (lookup_attribute ("interrupt", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) + if (named) { - if (ix86_function_naked (fndecl)) - error_at (DECL_SOURCE_LOCATION (fndecl), - "interrupt and naked attributes are not compatible"); - - int nargs = 0; - for (tree arg = DECL_ARGUMENTS (fndecl); - arg; - arg = TREE_CHAIN (arg)) - nargs++; - cfun->machine->no_caller_saved_registers = true; - cfun->machine->func_type - = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT; - - ix86_optimize_mode_switching[X86_DIRFLAG] = 1; - - /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */ - if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG) - sorry ("only DWARF debug format is supported for interrupt " - "service routine"); + if (type == NULL_TREE || !AGGREGATE_TYPE_P (type)) + regno = cum->regno + FIRST_SSE_REG; } else { - cfun->machine->func_type = TYPE_NORMAL; - if (lookup_attribute ("no_caller_saved_registers", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) - cfun->machine->no_caller_saved_registers = true; + rtx t1, t2; + + /* Unnamed floating parameters are passed in both the + SSE and integer registers. */ + t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG); + t2 = gen_rtx_REG (mode, regno); + t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx); + t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx); + return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2)); } } + /* Handle aggregated types passed in register. */ + if (orig_mode == BLKmode) + { + if (bytes > 0 && bytes <= 8) + mode = (bytes > 4 ? DImode : SImode); + if (mode == BLKmode) + mode = DImode; + } + + return gen_reg_or_parallel (mode, orig_mode, regno); } -/* Set the indirect_branch_type field from the function FNDECL. */ +/* Return where to put the arguments to a function. + Return zero to push the argument on the stack, or a hard register in which to store the argument. -static void -ix86_set_indirect_branch_type (tree fndecl) + ARG describes the argument while CUM gives information about the + preceding args and about the function being called. 
*/ + +static rtx +ix86_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) { - if (cfun->machine->indirect_branch_type == indirect_branch_unset) + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + machine_mode mode = arg.mode; + HOST_WIDE_INT bytes, words; + rtx reg; + + if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) { - tree attr = lookup_attribute ("indirect_branch", - DECL_ATTRIBUTES (fndecl)); - if (attr != NULL) + gcc_assert (arg.type != NULL_TREE); + if (POINTER_TYPE_P (arg.type)) { - tree args = TREE_VALUE (attr); - if (args == NULL) - gcc_unreachable (); - tree cst = TREE_VALUE (args); - if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) - cfun->machine->indirect_branch_type = indirect_branch_keep; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk_inline; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk_extern; - else - gcc_unreachable (); - } - else - cfun->machine->indirect_branch_type = ix86_indirect_branch; - - /* -mcmodel=large is not compatible with -mindirect-branch=thunk - nor -mindirect-branch=thunk-extern. */ - if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) - && ((cfun->machine->indirect_branch_type - == indirect_branch_thunk_extern) - || (cfun->machine->indirect_branch_type - == indirect_branch_thunk))) - error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not " - "compatible", - ((cfun->machine->indirect_branch_type - == indirect_branch_thunk_extern) - ? "thunk-extern" : "thunk")); - - if (cfun->machine->indirect_branch_type != indirect_branch_keep - && (flag_cf_protection & CF_RETURN)) - error ("%<-mindirect-branch%> and %<-fcf-protection%> are not " - "compatible"); - } - - if (cfun->machine->function_return_type == indirect_branch_unset) - { - tree attr = lookup_attribute ("function_return", - DECL_ATTRIBUTES (fndecl)); - if (attr != NULL) - { - tree args = TREE_VALUE (attr); - if (args == NULL) - gcc_unreachable (); - tree cst = TREE_VALUE (args); - if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) - cfun->machine->function_return_type = indirect_branch_keep; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) - cfun->machine->function_return_type = indirect_branch_thunk; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) - cfun->machine->function_return_type = indirect_branch_thunk_inline; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) - cfun->machine->function_return_type = indirect_branch_thunk_extern; - else - gcc_unreachable (); + /* This is the pointer argument. */ + gcc_assert (TYPE_MODE (arg.type) == Pmode); + /* It is at -WORD(AP) in the current frame in interrupt and + exception handlers. */ + reg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD); } else - cfun->machine->function_return_type = ix86_function_return; - - /* -mcmodel=large is not compatible with -mfunction-return=thunk - nor -mfunction-return=thunk-extern. 
*/ - if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) - && ((cfun->machine->function_return_type - == indirect_branch_thunk_extern) - || (cfun->machine->function_return_type - == indirect_branch_thunk))) - error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not " - "compatible", - ((cfun->machine->function_return_type - == indirect_branch_thunk_extern) - ? "thunk-extern" : "thunk")); - - if (cfun->machine->function_return_type != indirect_branch_keep - && (flag_cf_protection & CF_RETURN)) - error ("%<-mfunction-return%> and %<-fcf-protection%> are not " - "compatible"); - } -} - -/* Establish appropriate back-end context for processing the function - FNDECL. The argument might be NULL to indicate processing at top - level, outside of any function scope. */ -static void -ix86_set_current_function (tree fndecl) -{ - /* Only change the context if the function changes. This hook is called - several times in the course of compiling a function, and we don't want to - slow things down too much or call target_reinit when it isn't safe. */ - if (fndecl == ix86_previous_fndecl) - { - /* There may be 2 function bodies for the same function FNDECL, - one is extern inline and one isn't. Call ix86_set_func_type - to set the func_type field. */ - if (fndecl != NULL_TREE) { - ix86_set_func_type (fndecl); - ix86_set_indirect_branch_type (fndecl); + gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION + && TREE_CODE (arg.type) == INTEGER_TYPE + && TYPE_MODE (arg.type) == word_mode); + /* The error code is the word-mode integer argument at + -2 * WORD(AP) in the current frame of the exception + handler. */ + reg = gen_rtx_MEM (word_mode, + plus_constant (Pmode, + arg_pointer_rtx, + -2 * UNITS_PER_WORD)); } - return; - } - - tree old_tree; - if (ix86_previous_fndecl == NULL_TREE) - old_tree = target_option_current_node; - else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)) - old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl); - else - old_tree = target_option_default_node; - - if (fndecl == NULL_TREE) - { - if (old_tree != target_option_current_node) - ix86_reset_previous_fndecl (); - return; + return reg; } - ix86_set_func_type (fndecl); - ix86_set_indirect_branch_type (fndecl); + bytes = arg.promoted_size_in_bytes (); + words = CEIL (bytes, UNITS_PER_WORD); - tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); - if (new_tree == NULL_TREE) - new_tree = target_option_default_node; + /* To simplify the code below, represent vector types with a vector mode + even if MMX/SSE are not active. */ + if (arg.type && TREE_CODE (arg.type) == VECTOR_TYPE) + mode = type_natural_mode (arg.type, cum, false); - if (old_tree != new_tree) + if (TARGET_64BIT) { - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); - if (TREE_TARGET_GLOBALS (new_tree)) - restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); - else if (new_tree == target_option_default_node) - restore_target_globals (&default_target_globals); - else - TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); - } - ix86_previous_fndecl = fndecl; - - static bool prev_no_caller_saved_registers; + enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; - /* 64-bit MS and SYSV ABI have different set of call used registers. - Avoid expensive re-initialization of init_regs each time we switch - function context. 
*/ - if (TARGET_64BIT - && (call_used_regs[SI_REG] - == (cfun->machine->call_abi == MS_ABI))) - reinit_regs (); - /* Need to re-initialize init_regs if caller-saved registers are - changed. */ - else if (prev_no_caller_saved_registers - != cfun->machine->no_caller_saved_registers) - reinit_regs (); - - if (cfun->machine->func_type != TYPE_NORMAL - || cfun->machine->no_caller_saved_registers) - { - /* Don't allow SSE, MMX nor x87 instructions since they - may change processor state. */ - const char *isa; - if (TARGET_SSE) - isa = "SSE"; - else if (TARGET_MMX) - isa = "MMX/3Dnow"; - else if (TARGET_80387) - isa = "80387"; + if (call_abi == MS_ABI) + reg = function_arg_ms_64 (cum, mode, arg.mode, arg.named, + arg.type, bytes); else - isa = NULL; - if (isa != NULL) - { - if (cfun->machine->func_type != TYPE_NORMAL) - sorry (cfun->machine->func_type == TYPE_EXCEPTION - ? G_("%s instructions aren%'t allowed in an" - " exception service routine") - : G_("%s instructions aren%'t allowed in an" - " interrupt service routine"), - isa); - else - sorry ("%s instructions aren%'t allowed in a function with " - "the %<no_caller_saved_registers%> attribute", isa); - /* Don't issue the same error twice. */ - cfun->machine->func_type = TYPE_NORMAL; - cfun->machine->no_caller_saved_registers = false; - } + reg = function_arg_64 (cum, mode, arg.mode, arg.type, arg.named); } + else + reg = function_arg_32 (cum, mode, arg.mode, arg.type, bytes, words); - prev_no_caller_saved_registers - = cfun->machine->no_caller_saved_registers; + /* Track if there are outgoing arguments on stack. */ + if (reg == NULL_RTX && cum->caller) + cfun->machine->outgoing_args_on_stack = true; + + return reg; } - -/* Return true if this goes in large data/bss. */ +/* A C expression that indicates when an argument must be passed by + reference. If nonzero for an argument, a copy of that argument is + made in memory and a pointer to the argument is passed instead of + the argument itself. The pointer is passed in whatever way is + appropriate for passing a pointer to that type. */ static bool -ix86_in_large_data_p (tree exp) +ix86_pass_by_reference (cumulative_args_t cum_v, const function_arg_info &arg) { - if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) - return false; + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - if (exp == NULL_TREE) - return false; + if (TARGET_64BIT) + { + enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; - /* Functions are never large data. */ - if (TREE_CODE (exp) == FUNCTION_DECL) - return false; + /* See Windows x64 Software Convention. */ + if (call_abi == MS_ABI) + { + HOST_WIDE_INT msize = GET_MODE_SIZE (arg.mode); - /* Automatic variables are never large data. */ - if (VAR_P (exp) && !is_global_var (exp)) - return false; + if (tree type = arg.type) + { + /* Arrays are passed by reference. */ + if (TREE_CODE (type) == ARRAY_TYPE) + return true; - if (VAR_P (exp) && DECL_SECTION_NAME (exp)) - { - const char *section = DECL_SECTION_NAME (exp); - if (strcmp (section, ".ldata") == 0 - || strcmp (section, ".lbss") == 0) - return true; - return false; - } - else - { - HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); + if (RECORD_OR_UNION_TYPE_P (type)) + { + /* Structs/unions of sizes other than 8, 16, 32, or 64 bits + are passed by reference. */ + msize = int_size_in_bytes (type); + } + } - /* If this is an incomplete type with size 0, then we can't put it - in data because it might be too big when completed.
Also, - int_size_in_bytes returns -1 if size can vary or is larger than - an integer in which case also it is safer to assume that it goes in - large data. */ - if (size <= 0 || size > ix86_section_threshold) + /* __m128 is passed by reference. */ + return msize != 1 && msize != 2 && msize != 4 && msize != 8; + } + else if (arg.type && int_size_in_bytes (arg.type) == -1) return true; } return false; } -/* i386-specific section flag to mark large sections. */ -#define SECTION_LARGE SECTION_MACH_DEP - -/* Switch to the appropriate section for output of DECL. - DECL is either a `VAR_DECL' node or a constant of some sort. - RELOC indicates whether forming the initial value of DECL requires - link-time relocations. */ +/* Return true when TYPE should be 128bit aligned for 32bit argument + passing ABI. XXX: This function is obsolete and is only used for + checking psABI compatibility with previous versions of GCC. */ -ATTRIBUTE_UNUSED static section * -x86_64_elf_select_section (tree decl, int reloc, - unsigned HOST_WIDE_INT align) +static bool +ix86_compat_aligned_value_p (const_tree type) { - if (ix86_in_large_data_p (decl)) + machine_mode mode = TYPE_MODE (type); + if (((TARGET_SSE && SSE_REG_MODE_P (mode)) + || mode == TDmode + || mode == TFmode + || mode == TCmode) + && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128)) + return true; + if (TYPE_ALIGN (type) < 128) + return false; + + if (AGGREGATE_TYPE_P (type)) { - const char *sname = NULL; - unsigned int flags = SECTION_WRITE | SECTION_LARGE; - switch (categorize_decl_for_section (decl, reloc)) + /* Walk the aggregates recursively. */ + switch (TREE_CODE (type)) { - case SECCAT_DATA: - sname = ".ldata"; - break; - case SECCAT_DATA_REL: - sname = ".ldata.rel"; - break; - case SECCAT_DATA_REL_LOCAL: - sname = ".ldata.rel.local"; - break; - case SECCAT_DATA_REL_RO: - sname = ".ldata.rel.ro"; - break; - case SECCAT_DATA_REL_RO_LOCAL: - sname = ".ldata.rel.ro.local"; - break; - case SECCAT_BSS: - sname = ".lbss"; - flags |= SECTION_BSS; - break; - case SECCAT_RODATA: - case SECCAT_RODATA_MERGE_STR: - case SECCAT_RODATA_MERGE_STR_INIT: - case SECCAT_RODATA_MERGE_CONST: - sname = ".lrodata"; - flags &= ~SECTION_WRITE; + case RECORD_TYPE: + case UNION_TYPE: + case QUAL_UNION_TYPE: + { + tree field; + + /* Walk all the structure fields. */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL + && ix86_compat_aligned_value_p (TREE_TYPE (field))) + return true; + } + break; + } + + case ARRAY_TYPE: + /* Just for use if some languages passes arrays by value. */ + if (ix86_compat_aligned_value_p (TREE_TYPE (type))) + return true; break; - case SECCAT_SRODATA: - case SECCAT_SDATA: - case SECCAT_SBSS: + + default: gcc_unreachable (); - case SECCAT_TEXT: - case SECCAT_TDATA: - case SECCAT_TBSS: - /* We don't split these for medium model. Place them into - default sections and hope for best. */ - break; - } - if (sname) - { - /* We might get called with string constants, but get_named_section - doesn't like them as they are not DECLs. Also, we need to set - flags in that case. */ - if (!DECL_P (decl)) - return get_section (sname, flags, NULL); - return get_named_section (decl, sname, reloc); } } - return default_elf_select_section (decl, reloc, align); + return false; } -/* Select a set of attributes for section NAME based on the properties - of DECL and whether or not RELOC indicates that DECL's initializer - might contain runtime relocations. 
*/ +/* Return the alignment boundary for MODE and TYPE with alignment ALIGN. + XXX: This function is obsolete and is only used for checking psABI + compatibility with previous versions of GCC. */ -static unsigned int ATTRIBUTE_UNUSED -x86_64_elf_section_type_flags (tree decl, const char *name, int reloc) +static unsigned int +ix86_compat_function_arg_boundary (machine_mode mode, + const_tree type, unsigned int align) { - unsigned int flags = default_section_type_flags (decl, name, reloc); + /* In 32bit, only _Decimal128 and __float128 are aligned to their + natural boundaries. */ + if (!TARGET_64BIT && mode != TDmode && mode != TFmode) + { + /* i386 ABI defines all arguments to be 4 byte aligned. We have to + make an exception for SSE modes since these require 128bit + alignment. - if (ix86_in_large_data_p (decl)) - flags |= SECTION_LARGE; + The handling here differs from field_alignment. ICC aligns MMX + arguments to 4 byte boundaries, while structure fields are aligned + to 8 byte boundaries. */ + if (!type) + { + if (!(TARGET_SSE && SSE_REG_MODE_P (mode))) + align = PARM_BOUNDARY; + } + else + { + if (!ix86_compat_aligned_value_p (type)) + align = PARM_BOUNDARY; + } + } + if (align > BIGGEST_ALIGNMENT) + align = BIGGEST_ALIGNMENT; + return align; +} - if (decl == NULL_TREE - && (strcmp (name, ".ldata.rel.ro") == 0 - || strcmp (name, ".ldata.rel.ro.local") == 0)) - flags |= SECTION_RELRO; +/* Return true when TYPE should be 128bit aligned for 32bit argument + passing ABI. */ - if (strcmp (name, ".lbss") == 0 - || strncmp (name, ".lbss.", 5) == 0 - || strncmp (name, ".gnu.linkonce.lb.", 16) == 0) - flags |= SECTION_BSS; +static bool +ix86_contains_aligned_value_p (const_tree type) +{ + machine_mode mode = TYPE_MODE (type); - return flags; -} + if (mode == XFmode || mode == XCmode) + return false; -/* Build up a unique section name, expressed as a - STRING_CST node, and assign it to DECL_SECTION_NAME (decl). - RELOC indicates whether the initial value of EXP requires - link-time relocations. */ + if (TYPE_ALIGN (type) < 128) + return false; -static void ATTRIBUTE_UNUSED -x86_64_elf_unique_section (tree decl, int reloc) -{ - if (ix86_in_large_data_p (decl)) + if (AGGREGATE_TYPE_P (type)) { - const char *prefix = NULL; - /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */ - bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP; - - switch (categorize_decl_for_section (decl, reloc)) - { - case SECCAT_DATA: - case SECCAT_DATA_REL: - case SECCAT_DATA_REL_LOCAL: - case SECCAT_DATA_REL_RO: - case SECCAT_DATA_REL_RO_LOCAL: - prefix = one_only ? ".ld" : ".ldata"; - break; - case SECCAT_BSS: - prefix = one_only ? ".lb" : ".lbss"; - break; - case SECCAT_RODATA: - case SECCAT_RODATA_MERGE_STR: - case SECCAT_RODATA_MERGE_STR_INIT: - case SECCAT_RODATA_MERGE_CONST: - prefix = one_only ? ".lr" : ".lrodata"; - break; - case SECCAT_SRODATA: - case SECCAT_SDATA: - case SECCAT_SBSS: - gcc_unreachable (); - case SECCAT_TEXT: - case SECCAT_TDATA: - case SECCAT_TBSS: - /* We don't split these for medium model. Place them into - default sections and hope for best. */ - break; - } - if (prefix) + /* Walk the aggregates recursively. 
*/ + switch (TREE_CODE (type)) { - const char *name, *linkonce; - char *string; - - name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); - name = targetm.strip_name_encoding (name); + case RECORD_TYPE: + case UNION_TYPE: + case QUAL_UNION_TYPE: + { + tree field; - /* If we're using one_only, then there needs to be a .gnu.linkonce - prefix to the section name. */ - linkonce = one_only ? ".gnu.linkonce" : ""; + /* Walk all the structure fields. */ + for (field = TYPE_FIELDS (type); + field; + field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL + && ix86_contains_aligned_value_p (TREE_TYPE (field))) + return true; + } + break; + } - string = ACONCAT ((linkonce, prefix, ".", name, NULL)); + case ARRAY_TYPE: + /* Just for use if some languages passes arrays by value. */ + if (ix86_contains_aligned_value_p (TREE_TYPE (type))) + return true; + break; - set_decl_section_name (decl, string); - return; + default: + gcc_unreachable (); } } - default_unique_section (decl, reloc); -} - -#ifdef COMMON_ASM_OP + else + return TYPE_ALIGN (type) >= 128; -#ifndef LARGECOMM_SECTION_ASM_OP -#define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t" -#endif + return false; +} -/* This says how to output assembler code to declare an - uninitialized external linkage data object. +/* Gives the alignment boundary, in bits, of an argument with the + specified mode and type. */ - For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for - large objects. */ -void -x86_elf_aligned_decl_common (FILE *file, tree decl, - const char *name, unsigned HOST_WIDE_INT size, - int align) +static unsigned int +ix86_function_arg_boundary (machine_mode mode, const_tree type) { - if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) - && size > (unsigned int)ix86_section_threshold) + unsigned int align; + if (type) { - switch_to_section (get_named_section (decl, ".lbss", 0)); - fputs (LARGECOMM_SECTION_ASM_OP, file); + /* Since the main variant type is used for call, we convert it to + the main variant type. */ + type = TYPE_MAIN_VARIANT (type); + align = TYPE_ALIGN (type); + if (TYPE_EMPTY_P (type)) + return PARM_BOUNDARY; } else - fputs (COMMON_ASM_OP, file); - assemble_name (file, name); - fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", - size, align / BITS_PER_UNIT); -} -#endif + align = GET_MODE_ALIGNMENT (mode); + if (align < PARM_BOUNDARY) + align = PARM_BOUNDARY; + else + { + static bool warned; + unsigned int saved_align = align; -/* Utility function for targets to use in implementing - ASM_OUTPUT_ALIGNED_BSS. */ + if (!TARGET_64BIT) + { + /* i386 ABI defines XFmode arguments to be 4 byte aligned. */ + if (!type) + { + if (mode == XFmode || mode == XCmode) + align = PARM_BOUNDARY; + } + else if (!ix86_contains_aligned_value_p (type)) + align = PARM_BOUNDARY; -void -x86_output_aligned_bss (FILE *file, tree decl, const char *name, - unsigned HOST_WIDE_INT size, int align) -{ - if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) - && size > (unsigned int)ix86_section_threshold) - switch_to_section (get_named_section (decl, ".lbss", 0)); - else - switch_to_section (bss_section); - ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT)); -#ifdef ASM_DECLARE_OBJECT_NAME - last_assemble_variable_decl = decl; - ASM_DECLARE_OBJECT_NAME (file, name, decl); -#else - /* Standard thing is just output label for the object. */ - ASM_OUTPUT_LABEL (file, name); -#endif /* ASM_DECLARE_OBJECT_NAME */ - ASM_OUTPUT_SKIP (file, size ? 
size : 1); -} - -/* Decide whether we must probe the stack before any space allocation - on this target. It's essentially TARGET_STACK_PROBE except when - -fstack-check causes the stack to be already probed differently. */ + if (align < 128) + align = PARM_BOUNDARY; + } -bool -ix86_target_stack_probe (void) -{ - /* Do not probe the stack twice if static stack checking is enabled. */ - if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) - return false; + if (warn_psabi + && !warned + && align != ix86_compat_function_arg_boundary (mode, type, + saved_align)) + { + warned = true; + inform (input_location, + "the ABI for passing parameters with %d-byte" + " alignment has changed in GCC 4.6", + align / BITS_PER_UNIT); + } + } - return TARGET_STACK_PROBE; + return align; } - -/* Decide whether we can make a sibling call to a function. DECL is the - declaration of the function being targeted by the call and EXP is the - CALL_EXPR representing the call. */ + +/* Return true if N is a possible register number of function value. */ static bool -ix86_function_ok_for_sibcall (tree decl, tree exp) +ix86_function_value_regno_p (const unsigned int regno) { - tree type, decl_or_type; - rtx a, b; - bool bind_global = decl && !targetm.binds_local_p (decl); - - if (ix86_function_naked (current_function_decl)) - return false; - - /* Sibling call isn't OK if there are no caller-saved registers - since all registers must be preserved before return. */ - if (cfun->machine->no_caller_saved_registers) - return false; - - /* If we are generating position-independent code, we cannot sibcall - optimize direct calls to global functions, as the PLT requires - %ebx be live. (Darwin does not have a PLT.) */ - if (!TARGET_MACHO - && !TARGET_64BIT - && flag_pic - && flag_plt - && bind_global) - return false; - - /* If we need to align the outgoing stack, then sibcalling would - unalign the stack, which may break the called function. */ - if (ix86_minimum_incoming_stack_boundary (true) - < PREFERRED_STACK_BOUNDARY) - return false; - - if (decl) - { - decl_or_type = decl; - type = TREE_TYPE (decl); - } - else + switch (regno) { - /* We're looking at the CALL_EXPR, we need the type of the function. */ - type = CALL_EXPR_FN (exp); /* pointer expression */ - type = TREE_TYPE (type); /* pointer type */ - type = TREE_TYPE (type); /* function type */ - decl_or_type = type; - } + case AX_REG: + return true; + case DX_REG: + return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI); + case DI_REG: + case SI_REG: + return TARGET_64BIT && ix86_cfun_abi () != MS_ABI; - /* Check that the return value locations are the same. Like - if we are returning floats on the 80387 register stack, we cannot - make a sibcall from a function that doesn't return a float to a - function that does or, conversely, from a function that does return - a float to a function that doesn't; the necessary stack adjustment - would not be executed. This is also the place we notice - differences in the return value ABI. Note that it is ok for one - of the functions to have void return type as long as the return - value of the other is passed in a register. */ - a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false); - b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), - cfun->decl, false); - if (STACK_REG_P (a) || STACK_REG_P (b)) - { - if (!rtx_equal_p (a, b)) + /* Complex values are returned in %st(0)/%st(1) pair. 
*/ + case ST0_REG: + case ST1_REG: + /* TODO: The function should depend on current function ABI but + builtins.c would need updating then. Therefore we use the + default ABI. */ + if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI) return false; - } - else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) - ; - else if (!rtx_equal_p (a, b)) - return false; + return TARGET_FLOAT_RETURNS_IN_80387; - if (TARGET_64BIT) - { - /* The SYSV ABI has more call-clobbered registers; - disallow sibcalls from MS to SYSV. */ - if (cfun->machine->call_abi == MS_ABI - && ix86_function_type_abi (type) == SYSV_ABI) - return false; - } - else - { - /* If this call is indirect, we'll need to be able to use a - call-clobbered register for the address of the target function. - Make sure that all such registers are not used for passing - parameters. Note that DLLIMPORT functions and call to global - function via GOT slot are indirect. */ - if (!decl - || (bind_global && flag_pic && !flag_plt) - || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)) - || flag_force_indirect_call) - { - /* Check if regparm >= 3 since arg_reg_available is set to - false if regparm == 0. If regparm is 1 or 2, there is - always a call-clobbered register available. + /* Complex values are returned in %xmm0/%xmm1 pair. */ + case XMM0_REG: + case XMM1_REG: + return TARGET_SSE; - ??? The symbol indirect call doesn't need a call-clobbered - register. But we don't know if this is a symbol indirect - call or not here. */ - if (ix86_function_regparm (type, decl) >= 3 - && !cfun->machine->arg_reg_available) - return false; - } + case MM0_REG: + if (TARGET_MACHO || TARGET_64BIT) + return false; + return TARGET_MMX; } - /* Otherwise okay. That also includes certain types of indirect calls. */ - return true; + return false; } -/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", - and "sseregparm" calling convention attributes; - arguments as in struct attribute_spec.handler. */ +/* Define how to find the value returned by a function. + VALTYPE is the data type of the value (as a tree). + If the precise function being called is known, FUNC is its FUNCTION_DECL; + otherwise, FUNC is 0. */ -static tree -ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, - bool *no_add_attrs) +static rtx +function_value_32 (machine_mode orig_mode, machine_mode mode, + const_tree fntype, const_tree fn) { - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine regparm with all attributes but fastcall, and thiscall. */ - if (is_attribute_p ("regparm", name)) - { - tree cst; + unsigned int regno; - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and regparm attributes are not compatible"); - } + /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where + we normally prevent this case when mmx is not available. However + some ABIs may require the result to be returned like DImode. */ + if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) + regno = FIRST_MMX_REG; - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("regparam and thiscall attributes are not compatible"); - } + /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where + we prevent this case when sse is not available. 
However some ABIs + may require the result to be returned like integer TImode. */ + else if (mode == TImode + || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) + regno = FIRST_SSE_REG; - cst = TREE_VALUE (args); - if (TREE_CODE (cst) != INTEGER_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires an integer constant argument", - name); - *no_add_attrs = true; - } - else if (compare_tree_int (cst, REGPARM_MAX) > 0) - { - warning (OPT_Wattributes, "argument to %qE attribute larger than %d", - name, REGPARM_MAX); - *no_add_attrs = true; - } + /* 32-byte vector modes in %ymm0. */ + else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32) + regno = FIRST_SSE_REG; - return NULL_TREE; - } + /* 64-byte vector modes in %zmm0. */ + else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) + regno = FIRST_SSE_REG; - if (TARGET_64BIT) - { - /* Do not warn when emulating the MS ABI. */ - if ((TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE) - || ix86_function_type_abi (*node) != MS_ABI) - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - return NULL_TREE; - } + /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ + else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) + regno = FIRST_FLOAT_REG; + else + /* Most things go in %eax. */ + regno = AX_REG; - /* Can combine fastcall with stdcall (redundant) and sseregparm. */ - if (is_attribute_p ("fastcall", name)) + /* Override FP return register with %xmm0 for local functions when + SSE math is enabled or for functions with sseregparm attribute. */ + if ((fn || fntype) && (mode == SFmode || mode == DFmode)) { - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and stdcall attributes are not compatible"); - } - if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and regparm attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + int sse_level = ix86_function_sseregparm (fntype, fn, false); + if (sse_level == -1) { - error ("fastcall and thiscall attributes are not compatible"); + error ("calling %qD with SSE calling convention without " + "SSE/SSE2 enabled", fn); + sorry ("this is a GCC bug that can be worked around by adding " + "attribute used to function called"); } + else if ((sse_level >= 1 && mode == SFmode) + || (sse_level == 2 && mode == DFmode)) + regno = FIRST_SSE_REG; } - /* Can combine stdcall with fastcall (redundant), regparm and - sseregparm. */ - else if (is_attribute_p ("stdcall", name)) - { - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and fastcall attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and thiscall attributes are not compatible"); - } - } + /* OImode shouldn't be used directly. */ + gcc_assert (mode != OImode); + + return gen_rtx_REG (orig_mode, regno); +} + +static rtx +function_value_64 (machine_mode orig_mode, machine_mode mode, + const_tree valtype) +{ + rtx ret; - /* Can combine cdecl with regparm and sseregparm. */ - else if (is_attribute_p ("cdecl", name)) + /* Handle libcalls, which don't provide a type node. 
*/ + if (valtype == NULL) { - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + unsigned int regno; + + switch (mode) { - error ("cdecl and thiscall attributes are not compatible"); + case E_SFmode: + case E_SCmode: + case E_DFmode: + case E_DCmode: + case E_TFmode: + case E_SDmode: + case E_DDmode: + case E_TDmode: + regno = FIRST_SSE_REG; + break; + case E_XFmode: + case E_XCmode: + regno = FIRST_FLOAT_REG; + break; + case E_TCmode: + return NULL; + default: + regno = AX_REG; } + + return gen_rtx_REG (mode, regno); } - else if (is_attribute_p ("thiscall", name)) + else if (POINTER_TYPE_P (valtype)) { - if (TREE_CODE (*node) != METHOD_TYPE && pedantic) - warning (OPT_Wattributes, "%qE attribute is used for non-class method", - name); - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and thiscall attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and thiscall attributes are not compatible"); - } - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) - { - error ("cdecl and thiscall attributes are not compatible"); - } + /* Pointers are always returned in word_mode. */ + mode = word_mode; } - /* Can combine sseregparm with all attributes. */ + ret = construct_container (mode, orig_mode, valtype, 1, + X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX, + x86_64_int_return_registers, 0); - return NULL_TREE; -} + /* For zero sized structures, construct_container returns NULL, but we + need to keep rest of compiler happy by returning meaningful value. */ + if (!ret) + ret = gen_rtx_REG (orig_mode, AX_REG); -/* The transactional memory builtins are implicitly regparm or fastcall - depending on the ABI. Override the generic do-nothing attribute that - these builtins were declared with, and replace it with one of the two - attributes that we expect elsewhere. */ + return ret; +} -static tree -ix86_handle_tm_regparm_attribute (tree *node, tree, tree, - int flags, bool *no_add_attrs) +static rtx +function_value_ms_32 (machine_mode orig_mode, machine_mode mode, + const_tree fntype, const_tree fn, const_tree valtype) { - tree alt; + unsigned int regno; - /* In no case do we want to add the placeholder attribute. */ - *no_add_attrs = true; + /* Floating point return values in %st(0) + (unless -mno-fp-ret-in-387 or aggregate type of up to 8 bytes). */ + if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387 + && (GET_MODE_SIZE (mode) > 8 + || valtype == NULL_TREE || !AGGREGATE_TYPE_P (valtype))) + { + regno = FIRST_FLOAT_REG; + return gen_rtx_REG (orig_mode, regno); + } + else + return function_value_32(orig_mode, mode, fntype,fn); +} - /* The 64-bit ABI is unchanged for transactional memory. */ - if (TARGET_64BIT) - return NULL_TREE; +static rtx +function_value_ms_64 (machine_mode orig_mode, machine_mode mode, + const_tree valtype) +{ + unsigned int regno = AX_REG; - /* ??? Is there a better way to validate 32-bit windows? We have - cfun->machine->call_abi, but that seems to be set only for 64-bit. 
*/ - if (CHECK_STACK_LIMIT > 0) - alt = tree_cons (get_identifier ("fastcall"), NULL, NULL); - else + if (TARGET_SSE) { - alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL); - alt = tree_cons (get_identifier ("regparm"), alt, NULL); + switch (GET_MODE_SIZE (mode)) + { + case 16: + if (valtype != NULL_TREE + && !VECTOR_INTEGER_TYPE_P (valtype) + && !VECTOR_INTEGER_TYPE_P (valtype) + && !INTEGRAL_TYPE_P (valtype) + && !VECTOR_FLOAT_TYPE_P (valtype)) + break; + if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) + && !COMPLEX_MODE_P (mode)) + regno = FIRST_SSE_REG; + break; + case 8: + case 4: + if (valtype != NULL_TREE && AGGREGATE_TYPE_P (valtype)) + break; + if (mode == SFmode || mode == DFmode) + regno = FIRST_SSE_REG; + break; + default: + break; + } } - decl_attributes (node, alt, flags); - - return NULL_TREE; + return gen_rtx_REG (orig_mode, regno); } -/* This function determines from TYPE the calling-convention. */ - -unsigned int -ix86_get_callcvt (const_tree type) +static rtx +ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl, + machine_mode orig_mode, machine_mode mode) { - unsigned int ret = 0; - bool is_stdarg; - tree attrs; - - if (TARGET_64BIT) - return IX86_CALLCVT_CDECL; + const_tree fn, fntype; - attrs = TYPE_ATTRIBUTES (type); - if (attrs != NULL_TREE) + fn = NULL_TREE; + if (fntype_or_decl && DECL_P (fntype_or_decl)) + fn = fntype_or_decl; + fntype = fn ? TREE_TYPE (fn) : fntype_or_decl; + + if (ix86_function_type_abi (fntype) == MS_ABI) { - if (lookup_attribute ("cdecl", attrs)) - ret |= IX86_CALLCVT_CDECL; - else if (lookup_attribute ("stdcall", attrs)) - ret |= IX86_CALLCVT_STDCALL; - else if (lookup_attribute ("fastcall", attrs)) - ret |= IX86_CALLCVT_FASTCALL; - else if (lookup_attribute ("thiscall", attrs)) - ret |= IX86_CALLCVT_THISCALL; - - /* Regparam isn't allowed for thiscall and fastcall. */ - if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0) - { - if (lookup_attribute ("regparm", attrs)) - ret |= IX86_CALLCVT_REGPARM; - if (lookup_attribute ("sseregparm", attrs)) - ret |= IX86_CALLCVT_SSEREGPARM; - } - - if (IX86_BASE_CALLCVT(ret) != 0) - return ret; + if (TARGET_64BIT) + return function_value_ms_64 (orig_mode, mode, valtype); + else + return function_value_ms_32 (orig_mode, mode, fntype, fn, valtype); } + else if (TARGET_64BIT) + return function_value_64 (orig_mode, mode, valtype); + else + return function_value_32 (orig_mode, mode, fntype, fn); +} - is_stdarg = stdarg_p (type); - if (TARGET_RTD && !is_stdarg) - return IX86_CALLCVT_STDCALL | ret; - - if (ret != 0 - || is_stdarg - || TREE_CODE (type) != METHOD_TYPE - || ix86_function_type_abi (type) != MS_ABI) - return IX86_CALLCVT_CDECL | ret; +static rtx +ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool) +{ + machine_mode mode, orig_mode; - return IX86_CALLCVT_THISCALL; + orig_mode = TYPE_MODE (valtype); + mode = type_natural_mode (valtype, NULL, true); + return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode); } -/* Return 0 if the attributes for two types are incompatible, 1 if they - are compatible, and 2 if they are nearly compatible (which causes a - warning to be generated). */ +/* Pointer function arguments and return values are promoted to + word_mode for normal functions. 
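As a quick orientation (a sketch only, not part of the patch hunks; the typedef and function names below are invented), the helpers above, function_value_32 and function_value_64, pick the register a return value lands in. Assuming default options (x87 returns enabled on ia32, SSE available where noted), a few common cases look like this:

/* Illustrative sketch of where function_value_32 / function_value_64
   place common return types under default options; not part of the patch. */
typedef float v4sf __attribute__ ((vector_size (16)));

int         ret_int    (void);   /* both: %eax (wider integers: %rax on x86-64) */
double      ret_double (void);   /* ia32: %st(0)            x86-64: %xmm0       */
long double ret_ld     (void);   /* ia32: %st(0)            x86-64: %st(0)      */
v4sf        ret_v4sf   (void);   /* ia32: %xmm0 (with SSE)  x86-64: %xmm0       */
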
*/ -static int -ix86_comp_type_attributes (const_tree type1, const_tree type2) +static machine_mode +ix86_promote_function_mode (const_tree type, machine_mode mode, + int *punsignedp, const_tree fntype, + int for_return) { - unsigned int ccvt1, ccvt2; - - if (TREE_CODE (type1) != FUNCTION_TYPE - && TREE_CODE (type1) != METHOD_TYPE) - return 1; + if (cfun->machine->func_type == TYPE_NORMAL + && type != NULL_TREE + && POINTER_TYPE_P (type)) + { + *punsignedp = POINTERS_EXTEND_UNSIGNED; + return word_mode; + } + return default_promote_function_mode (type, mode, punsignedp, fntype, + for_return); +} - ccvt1 = ix86_get_callcvt (type1); - ccvt2 = ix86_get_callcvt (type2); - if (ccvt1 != ccvt2) - return 0; - if (ix86_function_regparm (type1, NULL) - != ix86_function_regparm (type2, NULL)) - return 0; +/* Return true if a structure, union or array with MODE containing FIELD + should be accessed using BLKmode. */ - return 1; +static bool +ix86_member_type_forces_blk (const_tree field, machine_mode mode) +{ + /* Union with XFmode must be in BLKmode. */ + return (mode == XFmode + && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE + || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE)); } - -/* Return the regparm value for a function with the indicated TYPE and DECL. - DECL may be NULL when calling function indirectly - or considering a libcall. */ -static int -ix86_function_regparm (const_tree type, const_tree decl) +rtx +ix86_libcall_value (machine_mode mode) { - tree attr; - int regparm; - unsigned int ccvt; + return ix86_function_value_1 (NULL, NULL, mode, mode); +} - if (TARGET_64BIT) - return (ix86_function_type_abi (type) == SYSV_ABI - ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX); - ccvt = ix86_get_callcvt (type); - regparm = ix86_regparm; +/* Return true iff type is returned in memory. */ - if ((ccvt & IX86_CALLCVT_REGPARM) != 0) +static bool +ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) +{ +#ifdef SUBTARGET_RETURN_IN_MEMORY + return SUBTARGET_RETURN_IN_MEMORY (type, fntype); +#else + const machine_mode mode = type_natural_mode (type, NULL, true); + HOST_WIDE_INT size; + + if (TARGET_64BIT) { - attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); - if (attr) + if (ix86_function_type_abi (fntype) == MS_ABI) { - regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); - return regparm; - } - } - else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - return 2; - else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - return 1; + size = int_size_in_bytes (type); - /* Use register calling convention for local functions when possible. */ - if (decl - && TREE_CODE (decl) == FUNCTION_DECL) - { - cgraph_node *target = cgraph_node::get (decl); - if (target) - target = target->function_symbol (); + /* __m128 is returned in xmm0. */ + if ((!type || VECTOR_INTEGER_TYPE_P (type) + || INTEGRAL_TYPE_P (type) + || VECTOR_FLOAT_TYPE_P (type)) + && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) + && !COMPLEX_MODE_P (mode) + && (GET_MODE_SIZE (mode) == 16 || size == 16)) + return false; - /* Caller and callee must agree on the calling convention, so - checking here just optimize means that with - __attribute__((optimize (...))) caller could use regparm convention - and callee not, or vice versa. Instead look at whether the callee - is optimized or not. */ - if (target && opt_for_fn (target->decl, optimize) - && !(profile_flag && !flag_fentry)) + /* Otherwise, the size must be exactly in [1248]. 
*/ + return size != 1 && size != 2 && size != 4 && size != 8; + } + else { - cgraph_local_info *i = &target->local; - if (i && i->local && i->can_change_signature) - { - int local_regparm, globals = 0, regno; - - /* Make sure no regparm register is taken by a - fixed register variable. */ - for (local_regparm = 0; local_regparm < REGPARM_MAX; - local_regparm++) - if (fixed_regs[local_regparm]) - break; + int needed_intregs, needed_sseregs; - /* We don't want to use regparm(3) for nested functions as - these use a static chain pointer in the third argument. */ - if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl)) - local_regparm = 2; + return examine_argument (mode, type, 1, + &needed_intregs, &needed_sseregs); + } + } + else + { + size = int_size_in_bytes (type); - /* Save a register for the split stack. */ - if (flag_split_stack) - { - if (local_regparm == 3) - local_regparm = 2; - else if (local_regparm == 2 - && DECL_STATIC_CHAIN (target->decl)) - local_regparm = 1; - } + /* Intel MCU psABI returns scalars and aggregates no larger than 8 + bytes in registers. */ + if (TARGET_IAMCU) + return VECTOR_MODE_P (mode) || size < 0 || size > 8; - /* Each fixed register usage increases register pressure, - so less registers should be used for argument passing. - This functionality can be overriden by an explicit - regparm value. */ - for (regno = AX_REG; regno <= DI_REG; regno++) - if (fixed_regs[regno]) - globals++; + if (mode == BLKmode) + return true; - local_regparm - = globals < local_regparm ? local_regparm - globals : 0; + if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8) + return false; - if (local_regparm > regparm) - regparm = local_regparm; - } - } - } + if (VECTOR_MODE_P (mode) || mode == TImode) + { + /* User-created vectors small enough to fit in EAX. */ + if (size < 8) + return false; - return regparm; -} + /* Unless ABI prescibes otherwise, + MMX/3dNow values are returned in MM0 if available. */ + + if (size == 8) + return TARGET_VECT8_RETURNS || !TARGET_MMX; -/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and - DFmode (2) arguments in SSE registers for a function with the - indicated TYPE and DECL. DECL may be NULL when calling function - indirectly or considering a libcall. Return -1 if any FP parameter - should be rejected by error. This is used in siutation we imply SSE - calling convetion but the function is called from another function with - SSE disabled. Otherwise return 0. */ + /* SSE values are returned in XMM0 if available. */ + if (size == 16) + return !TARGET_SSE; -static int -ix86_function_sseregparm (const_tree type, const_tree decl, bool warn) -{ - gcc_assert (!TARGET_64BIT); + /* AVX values are returned in YMM0 if available. */ + if (size == 32) + return !TARGET_AVX; - /* Use SSE registers to pass SFmode and DFmode arguments if requested - by the sseregparm attribute. */ - if (TARGET_SSEREGPARM - || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) - { - if (!TARGET_SSE) - { - if (warn) - { - if (decl) - error ("calling %qD with attribute sseregparm without " - "SSE/SSE2 enabled", decl); - else - error ("calling %qT with attribute sseregparm without " - "SSE/SSE2 enabled", type); - } - return 0; + /* AVX512F values are returned in ZMM0 if available. 
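To make the size-driven rules of ix86_return_in_memory concrete, here is a minimal sketch (the type and function names are invented, not from the patch): on ia32, whether a vector return stays in a register depends on which ISA is enabled.

/* Illustrative only: ia32 vector returns per ix86_return_in_memory.
   Each type comes back in the named register when the ISA is enabled,
   otherwise through a hidden return pointer in memory. */
typedef int v4si  __attribute__ ((vector_size (16)));  /* %xmm0 with -msse,     else memory */
typedef int v8si  __attribute__ ((vector_size (32)));  /* %ymm0 with -mavx,     else memory */
typedef int v16si __attribute__ ((vector_size (64)));  /* %zmm0 with -mavx512f, else memory */

v4si  ret_v4si  (v4si x)  { return x; }
v8si  ret_v8si  (v8si x)  { return x; }
v16si ret_v16si (v16si x) { return x; }
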
*/ + if (size == 64) + return !TARGET_AVX512F; } - return 2; - } + if (mode == XFmode) + return false; - if (!decl) - return 0; + if (size > 12) + return true; - cgraph_node *target = cgraph_node::get (decl); - if (target) - target = target->function_symbol (); + /* OImode shouldn't be used directly. */ + gcc_assert (mode != OImode); - /* For local functions, pass up to SSE_REGPARM_MAX SFmode - (and DFmode for SSE2) arguments in SSE registers. */ - if (target - /* TARGET_SSE_MATH */ - && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE) - && opt_for_fn (target->decl, optimize) - && !(profile_flag && !flag_fentry)) - { - cgraph_local_info *i = &target->local; - if (i && i->local && i->can_change_signature) - { - /* Refuse to produce wrong code when local function with SSE enabled - is called from SSE disabled function. - FIXME: We need a way to detect these cases cross-ltrans partition - and avoid using SSE calling conventions on local functions called - from function with SSE disabled. For now at least delay the - warning until we know we are going to produce wrong code. - See PR66047 */ - if (!TARGET_SSE && warn) - return -1; - return TARGET_SSE2_P (target_opts_for_fn (target->decl) - ->x_ix86_isa_flags) ? 2 : 1; - } + return false; } - - return 0; +#endif } -/* Return true if EAX is live at the start of the function. Used by - ix86_expand_prologue to determine if we need special help before - calling allocate_stack_worker. */ + +/* Create the va_list data type. */ -static bool -ix86_eax_live_at_start_p (void) +static tree +ix86_build_builtin_va_list_64 (void) { - /* Cheat. Don't bother working forward from ix86_function_regparm - to the function type to whether an actual argument is located in - eax. Instead just look at cfg info, which is still close enough - to correct at this point. This gives false positives for broken - functions that might use uninitialized data that happens to be - allocated in eax, but who cares? */ - return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0); -} + tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl; -static bool -ix86_keep_aggregate_return_pointer (tree fntype) -{ - tree attr; + record = lang_hooks.types.make_type (RECORD_TYPE); + type_decl = build_decl (BUILTINS_LOCATION, + TYPE_DECL, get_identifier ("__va_list_tag"), record); - if (!TARGET_64BIT) - { - attr = lookup_attribute ("callee_pop_aggregate_return", - TYPE_ATTRIBUTES (fntype)); - if (attr) - return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0); + f_gpr = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("gp_offset"), + unsigned_type_node); + f_fpr = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("fp_offset"), + unsigned_type_node); + f_ovf = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("overflow_arg_area"), + ptr_type_node); + f_sav = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("reg_save_area"), + ptr_type_node); - /* For 32-bit MS-ABI the default is to keep aggregate - return pointer. */ - if (ix86_function_type_abi (fntype) == MS_ABI) - return true; - } - return KEEP_AGGREGATE_RETURN_POINTER != 0; -} + va_list_gpr_counter_field = f_gpr; + va_list_fpr_counter_field = f_fpr; -/* Value is the number of bytes of arguments automatically - popped when returning from a subroutine call. - FUNDECL is the declaration node of the function (as a tree), - FUNTYPE is the data type of the function (as a tree), - or for a library call it is an identifier node for the subroutine name. 
- SIZE is the number of bytes of arguments passed on the stack. + DECL_FIELD_CONTEXT (f_gpr) = record; + DECL_FIELD_CONTEXT (f_fpr) = record; + DECL_FIELD_CONTEXT (f_ovf) = record; + DECL_FIELD_CONTEXT (f_sav) = record; - On the 80386, the RTD insn may be used to pop them if the number - of args is fixed, but if the number is variable then the caller - must pop them all. RTD can't be used for library calls now - because the library is compiled with the Unix compiler. - Use of RTD is a selectable option, since it is incompatible with - standard Unix calling sequences. If the option is not selected, - the caller must always pop the args. + TYPE_STUB_DECL (record) = type_decl; + TYPE_NAME (record) = type_decl; + TYPE_FIELDS (record) = f_gpr; + DECL_CHAIN (f_gpr) = f_fpr; + DECL_CHAIN (f_fpr) = f_ovf; + DECL_CHAIN (f_ovf) = f_sav; - The attribute stdcall is equivalent to RTD on a per module basis. */ + layout_type (record); -static poly_int64 -ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size) -{ - unsigned int ccvt; + TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"), + NULL_TREE, TYPE_ATTRIBUTES (record)); - /* None of the 64-bit ABIs pop arguments. */ + /* The correct type is an array type of one element. */ + return build_array_type (record, build_index_type (size_zero_node)); +} + +/* Setup the builtin va_list data type and for 64-bit the additional + calling convention specific va_list data types. */ + +static tree +ix86_build_builtin_va_list (void) +{ if (TARGET_64BIT) - return 0; + { + /* Initialize ABI specific va_list builtin types. - ccvt = ix86_get_callcvt (funtype); + In lto1, we can encounter two va_list types: + - one as a result of the type-merge across TUs, and + - the one constructed here. + These two types will not have the same TYPE_MAIN_VARIANT, and therefore + a type identity check in canonical_va_list_type based on + TYPE_MAIN_VARIANT (which we used to have) will not work. + Instead, we tag each va_list_type_node with its unique attribute, and + look for the attribute in the type identity check in + canonical_va_list_type. - if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL - | IX86_CALLCVT_THISCALL)) != 0 - && ! stdarg_p (funtype)) - return size; + Tagging sysv_va_list_type_node directly with the attribute is + problematic since it's a array of one record, which will degrade into a + pointer to record when used as parameter (see build_va_arg comments for + an example), dropping the attribute in the process. So we tag the + record instead. */ - /* Lose any fake structure return argument if it is passed on the stack. */ - if (aggregate_value_p (TREE_TYPE (funtype), fundecl) - && !ix86_keep_aggregate_return_pointer (funtype)) + /* For SYSV_ABI we use an array of one record. */ + sysv_va_list_type_node = ix86_build_builtin_va_list_64 (); + + /* For MS_ABI we use plain pointer to argument area. */ + tree char_ptr_type = build_pointer_type (char_type_node); + tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE, + TYPE_ATTRIBUTES (char_ptr_type)); + ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr); + + return ((ix86_abi == MS_ABI) + ? ms_va_list_type_node + : sysv_va_list_type_node); + } + else { - int nregs = ix86_function_regparm (funtype, fundecl); - if (nregs == 0) - return GET_MODE_SIZE (Pmode); + /* For i386 we use plain pointer to argument area. */ + return build_pointer_type (char_type_node); } - - return 0; } -/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. 
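The record assembled by ix86_build_builtin_va_list_64 is the x86-64 SysV __va_list_tag; as a user-level sketch (the struct name here is invented, the field names and the one-element-array trick come straight from the code above):

#include <stdio.h>
#include <stddef.h>

/* Mirrors the __va_list_tag record built by ix86_build_builtin_va_list_64;
   the real va_list is an array of one such record. */
struct example_va_list_tag
{
  unsigned int gp_offset;        /* next GP slot in reg_save_area, in bytes  */
  unsigned int fp_offset;        /* next SSE slot in reg_save_area, in bytes */
  void *overflow_arg_area;       /* arguments that were passed on the stack  */
  void *reg_save_area;           /* save area written by the prologue        */
};
typedef struct example_va_list_tag example_va_list[1];

int
main (void)
{
  printf ("size %zu, reg_save_area at offset %zu\n",
          sizeof (struct example_va_list_tag),
          offsetof (struct example_va_list_tag, reg_save_area));
  return 0;
}
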
*/ +/* Worker function for TARGET_SETUP_INCOMING_VARARGS. */ -static bool -ix86_legitimate_combined_insn (rtx_insn *insn) +static void +setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) { - int i; + rtx save_area, mem; + alias_set_type set; + int i, max; - /* Check operand constraints in case hard registers were propagated - into insn pattern. This check prevents combine pass from - generating insn patterns with invalid hard register operands. - These invalid insns can eventually confuse reload to error out - with a spill failure. See also PRs 46829 and 46843. */ + /* GPR size of varargs save area. */ + if (cfun->va_list_gpr_size) + ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD; + else + ix86_varargs_gpr_size = 0; - gcc_assert (INSN_CODE (insn) >= 0); + /* FPR size of varargs save area. We don't need it if we don't pass + anything in SSE registers. */ + if (TARGET_SSE && cfun->va_list_fpr_size) + ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16; + else + ix86_varargs_fpr_size = 0; - extract_insn (insn); - preprocess_constraints (insn); + if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size) + return; - int n_operands = recog_data.n_operands; - int n_alternatives = recog_data.n_alternatives; - for (i = 0; i < n_operands; i++) + save_area = frame_pointer_rtx; + set = get_varargs_alias_set (); + + max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; + if (max > X86_64_REGPARM_MAX) + max = X86_64_REGPARM_MAX; + + for (i = cum->regno; i < max; i++) { - rtx op = recog_data.operand[i]; - machine_mode mode = GET_MODE (op); - const operand_alternative *op_alt; - int offset = 0; - bool win; - int j; + mem = gen_rtx_MEM (word_mode, + plus_constant (Pmode, save_area, i * UNITS_PER_WORD)); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + emit_move_insn (mem, + gen_rtx_REG (word_mode, + x86_64_int_parameter_registers[i])); + } - /* A unary operator may be accepted by the predicate, but it - is irrelevant for matching constraints. */ - if (UNARY_P (op)) - op = XEXP (op, 0); + if (ix86_varargs_fpr_size) + { + machine_mode smode; + rtx_code_label *label; + rtx test; - if (SUBREG_P (op)) - { - if (REG_P (SUBREG_REG (op)) - && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER) - offset = subreg_regno_offset (REGNO (SUBREG_REG (op)), - GET_MODE (SUBREG_REG (op)), - SUBREG_BYTE (op), - GET_MODE (op)); - op = SUBREG_REG (op); - } + /* Now emit code to save SSE registers. The AX parameter contains number + of SSE parameter registers used to call this function, though all we + actually check here is the zero/non-zero status. */ - if (!(REG_P (op) && HARD_REGISTER_P (op))) - continue; + label = gen_label_rtx (); + test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx); + emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1), + label)); - op_alt = recog_op_alt; + /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if + we used movdqa (i.e. TImode) instead? Perhaps even better would + be if we could determine the real mode of the data, via a hook + into pass_stdarg. Ignore all that for now. */ + smode = V4SFmode; + if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode)) + crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode); - /* Operand has no constraints, anything is OK. 
*/ - win = !n_alternatives; + max = cum->sse_regno + cfun->va_list_fpr_size / 16; + if (max > X86_64_SSE_REGPARM_MAX) + max = X86_64_SSE_REGPARM_MAX; - alternative_mask preferred = get_preferred_alternatives (insn); - for (j = 0; j < n_alternatives; j++, op_alt += n_operands) + for (i = cum->sse_regno; i < max; ++i) { - if (!TEST_BIT (preferred, j)) - continue; - if (op_alt[i].anything_ok - || (op_alt[i].matches != -1 - && operands_match_p - (recog_data.operand[i], - recog_data.operand[op_alt[i].matches])) - || reg_fits_class_p (op, op_alt[i].cl, offset, mode)) - { - win = true; - break; - } + mem = plus_constant (Pmode, save_area, + i * 16 + ix86_varargs_gpr_size); + mem = gen_rtx_MEM (smode, mem); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + set_mem_align (mem, GET_MODE_ALIGNMENT (smode)); + + emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i))); } - if (!win) - return false; + emit_label (label); } - - return true; -} - -/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */ - -static unsigned HOST_WIDE_INT -ix86_asan_shadow_offset (void) -{ - return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44) - : HOST_WIDE_INT_C (0x7fff8000)) - : (HOST_WIDE_INT_1 << 29); } - -/* Argument support functions. */ -/* Return true when register may be used to pass function parameters. */ -bool -ix86_function_arg_regno_p (int regno) +static void +setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum) { + alias_set_type set = get_varargs_alias_set (); int i; - enum calling_abi call_abi; - const int *parm_regs; - if (!TARGET_64BIT) - { - if (TARGET_MACHO) - return (regno < REGPARM_MAX - || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); - else - return (regno < REGPARM_MAX - || (TARGET_MMX && MMX_REGNO_P (regno) - && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) - || (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); - } + /* Reset to zero, as there might be a sysv vaarg used + before. */ + ix86_varargs_gpr_size = 0; + ix86_varargs_fpr_size = 0; - if (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) - return true; - - /* TODO: The function should depend on current function ABI but - builtins.c would need updating then. Therefore we use the - default ABI. */ - call_abi = ix86_cfun_abi (); - - /* RAX is used as hidden argument to va_arg functions. */ - if (call_abi == SYSV_ABI && regno == AX_REG) - return true; - - if (call_abi == MS_ABI) - parm_regs = x86_64_ms_abi_int_parameter_registers; - else - parm_regs = x86_64_int_parameter_registers; - - for (i = 0; i < (call_abi == MS_ABI - ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++) - if (regno == parm_regs[i]) - return true; - return false; -} - -/* Return if we do not know how to pass TYPE solely in registers. */ - -static bool -ix86_must_pass_in_stack (machine_mode mode, const_tree type) -{ - if (must_pass_in_stack_var_size_or_pad (mode, type)) - return true; - - /* For 32-bit, we want TImode aggregates to go on the stack. But watch out! - The layout_type routine is crafty and tries to trick us into passing - currently unsupported vector types on the stack by using TImode. */ - return (!TARGET_64BIT && mode == TImode - && type && TREE_CODE (type) != VECTOR_TYPE); -} + for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++) + { + rtx reg, mem; -/* It returns the size, in bytes, of the area reserved for arguments passed - in registers for the function represented by fndecl dependent to the used - abi format. 
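A sketch of the layout that setup_incoming_varargs_64 writes (assuming both the GP and SSE parts of the save area are present; the macro and helper names are invented): the six integer argument registers occupy the first 48 bytes, and the eight SSE registers follow in 16-byte slots, which is also where ix86_va_start later points gp_offset and fp_offset.

#include <stdio.h>

/* Illustrative offsets into the x86-64 varargs register save area,
   assuming the GP area is present (X86_64_REGPARM_MAX words of 8 bytes). */
#define GP_REGS   6    /* rdi, rsi, rdx, rcx, r8, r9 */
#define SSE_REGS  8    /* xmm0 .. xmm7 */

static unsigned int gp_slot  (unsigned int i) { return i * 8; }
static unsigned int sse_slot (unsigned int i) { return GP_REGS * 8 + i * 16; }

int
main (void)
{
  for (unsigned int i = 0; i < GP_REGS; i++)
    printf ("GP reg %u  saved at offset %3u\n", i, gp_slot (i));
  for (unsigned int i = 0; i < SSE_REGS; i++)
    printf ("SSE reg %u saved at offset %3u\n", i, sse_slot (i));
  return 0;
}
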
*/ -int -ix86_reg_parm_stack_space (const_tree fndecl) -{ - enum calling_abi call_abi = SYSV_ABI; - if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL) - call_abi = ix86_function_abi (fndecl); - else - call_abi = ix86_function_type_abi (fndecl); - if (TARGET_64BIT && call_abi == MS_ABI) - return 32; - return 0; -} + mem = gen_rtx_MEM (Pmode, + plus_constant (Pmode, virtual_incoming_args_rtx, + i * UNITS_PER_WORD)); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); -/* We add this as a workaround in order to use libc_has_function - hook in i386.md. */ -bool -ix86_libc_has_function (enum function_class fn_class) -{ - return targetm.libc_has_function (fn_class); + reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]); + emit_move_insn (mem, reg); + } } -/* Returns value SYSV_ABI, MS_ABI dependent on fntype, - specifying the call abi used. */ -enum calling_abi -ix86_function_type_abi (const_tree fntype) +static void +ix86_setup_incoming_varargs (cumulative_args_t cum_v, + const function_arg_info &arg, + int *, int no_rtl) { - enum calling_abi abi = ix86_abi; + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS next_cum; + tree fntype; - if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE) - return abi; + /* This argument doesn't appear to be used anymore. Which is good, + because the old code here didn't suppress rtl generation. */ + gcc_assert (!no_rtl); - if (abi == SYSV_ABI - && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype))) - { - static int warned; - if (TARGET_X32 && !warned) - { - error ("X32 does not support ms_abi attribute"); - warned = 1; - } + if (!TARGET_64BIT) + return; - abi = MS_ABI; - } - else if (abi == MS_ABI - && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype))) - abi = SYSV_ABI; + fntype = TREE_TYPE (current_function_decl); - return abi; -} + /* For varargs, we do not want to skip the dummy va_dcl argument. + For stdargs, we do want to skip the last named argument. */ + next_cum = *cum; + if (stdarg_p (fntype)) + ix86_function_arg_advance (pack_cumulative_args (&next_cum), arg); -static enum calling_abi -ix86_function_abi (const_tree fndecl) -{ - return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi; + if (cum->call_abi == MS_ABI) + setup_incoming_varargs_ms_64 (&next_cum); + else + setup_incoming_varargs_64 (&next_cum); } -/* Returns value SYSV_ABI, MS_ABI dependent on cfun, - specifying the call abi used. */ -enum calling_abi -ix86_cfun_abi (void) -{ - return cfun ? cfun->machine->call_abi : ix86_abi; -} +/* Checks if TYPE is of kind va_list char *. */ static bool -ix86_function_ms_hook_prologue (const_tree fn) +is_va_list_char_pointer (tree type) { - if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn))) - { - if (decl_function_context (fn) != NULL_TREE) - error_at (DECL_SOURCE_LOCATION (fn), - "ms_hook_prologue is not compatible with nested function"); - else - return true; - } - return false; -} + tree canonic; -static bool -ix86_function_naked (const_tree fn) -{ - if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn))) + /* For 32-bit it is always true. */ + if (!TARGET_64BIT) return true; - - return false; + canonic = ix86_canonical_va_list_type (type); + return (canonic == ms_va_list_type_node + || (ix86_abi == MS_ABI && canonic == va_list_type_node)); } -/* Write the extra assembler code needed to declare a function properly. */ +/* Implement va_start. 
*/ -void -ix86_asm_output_function_label (FILE *asm_out_file, const char *fname, - tree decl) +static void +ix86_va_start (tree valist, rtx nextarg) { - bool is_ms_hook = ix86_function_ms_hook_prologue (decl); + HOST_WIDE_INT words, n_gpr, n_fpr; + tree f_gpr, f_fpr, f_ovf, f_sav; + tree gpr, fpr, ovf, sav, t; + tree type; + rtx ovf_rtx; - if (is_ms_hook) + if (flag_split_stack + && cfun->machine->split_stack_varargs_pointer == NULL_RTX) { - int i, filler_count = (TARGET_64BIT ? 32 : 16); - unsigned int filler_cc = 0xcccccccc; + unsigned int scratch_regno; - for (i = 0; i < filler_count; i += 4) - fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc); - } + /* When we are splitting the stack, we can't refer to the stack + arguments using internal_arg_pointer, because they may be on + the old stack. The split stack prologue will arrange to + leave a pointer to the old stack arguments in a scratch + register, which we here copy to a pseudo-register. The split + stack prologue can't set the pseudo-register directly because + it (the prologue) runs before any registers have been saved. */ -#ifdef SUBTARGET_ASM_UNWIND_INIT - SUBTARGET_ASM_UNWIND_INIT (asm_out_file); -#endif + scratch_regno = split_stack_prologue_scratch_regno (); + if (scratch_regno != INVALID_REGNUM) + { + rtx reg; + rtx_insn *seq; - ASM_OUTPUT_LABEL (asm_out_file, fname); + reg = gen_reg_rtx (Pmode); + cfun->machine->split_stack_varargs_pointer = reg; - /* Output magic byte marker, if hot-patch attribute is set. */ - if (is_ms_hook) - { - if (TARGET_64BIT) - { - /* leaq [%rsp + 0], %rsp */ - fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n", - asm_out_file); + start_sequence (); + emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); + seq = get_insns (); + end_sequence (); + + push_topmost_sequence (); + emit_insn_after (seq, entry_of_function ()); + pop_topmost_sequence (); } + } + + /* Only 64bit target needs something special. */ + if (is_va_list_char_pointer (TREE_TYPE (valist))) + { + if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) + std_expand_builtin_va_start (valist, nextarg); else { - /* movl.s %edi, %edi - push %ebp - movl.s %esp, %ebp */ - fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file); + rtx va_r, next; + + va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE); + next = expand_binop (ptr_mode, add_optab, + cfun->machine->split_stack_varargs_pointer, + crtl->args.arg_offset_rtx, + NULL_RTX, 0, OPTAB_LIB_WIDEN); + convert_move (va_r, next, 0); } + return; } -} -/* Implementation of call abi switching target hook. Specific to FNDECL - the specific call register sets are set. See also - ix86_conditional_register_usage for more details. */ -void -ix86_call_abi_override (const_tree fndecl) -{ - cfun->machine->call_abi = ix86_function_abi (fndecl); -} + f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); + f_fpr = DECL_CHAIN (f_gpr); + f_ovf = DECL_CHAIN (f_fpr); + f_sav = DECL_CHAIN (f_ovf); -/* Return 1 if pseudo register should be created and used to hold - GOT address for PIC code. */ -bool -ix86_use_pseudo_pic_reg (void) -{ - if ((TARGET_64BIT - && (ix86_cmodel == CM_SMALL_PIC - || TARGET_PECOFF)) - || !flag_pic) - return false; - return true; -} + valist = build_simple_mem_ref (valist); + TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node); + /* The following should be folded into the MEM_REF offset. 
*/ + gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist), + f_gpr, NULL_TREE); + fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist), + f_fpr, NULL_TREE); + ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist), + f_ovf, NULL_TREE); + sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist), + f_sav, NULL_TREE); -/* Initialize large model PIC register. */ + /* Count number of gp and fp argument registers used. */ + words = crtl->args.info.words; + n_gpr = crtl->args.info.regno; + n_fpr = crtl->args.info.sse_regno; -static void -ix86_init_large_pic_reg (unsigned int tmp_regno) -{ - rtx_code_label *label; - rtx tmp_reg; - - gcc_assert (Pmode == DImode); - label = gen_label_rtx (); - emit_label (label); - LABEL_PRESERVE_P (label) = 1; - tmp_reg = gen_rtx_REG (Pmode, tmp_regno); - gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno); - emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, - label)); - emit_insn (gen_set_got_offset_rex64 (tmp_reg, label)); - emit_insn (ix86_gen_add3 (pic_offset_table_rtx, - pic_offset_table_rtx, tmp_reg)); - const char *name = LABEL_NAME (label); - PUT_CODE (label, NOTE); - NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL; - NOTE_DELETED_LABEL_NAME (label) = name; -} - -/* Create and initialize PIC register if required. */ -static void -ix86_init_pic_reg (void) -{ - edge entry_edge; - rtx_insn *seq; - - if (!ix86_use_pseudo_pic_reg ()) - return; - - start_sequence (); - - if (TARGET_64BIT) + if (cfun->va_list_gpr_size) { - if (ix86_cmodel == CM_LARGE_PIC) - ix86_init_large_pic_reg (R11_REG); - else - emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); + type = TREE_TYPE (gpr); + t = build2 (MODIFY_EXPR, type, + gpr, build_int_cst (type, n_gpr * 8)); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); } - else + + if (TARGET_SSE && cfun->va_list_fpr_size) { - /* If there is future mcount call in the function it is more profitable - to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */ - rtx reg = crtl->profile - ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM) - : pic_offset_table_rtx; - rtx_insn *insn = emit_insn (gen_set_got (reg)); - RTX_FRAME_RELATED_P (insn) = 1; - if (crtl->profile) - emit_move_insn (pic_offset_table_rtx, reg); - add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); + type = TREE_TYPE (fpr); + t = build2 (MODIFY_EXPR, type, fpr, + build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX)); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); } - seq = get_insns (); - end_sequence (); + /* Find the overflow area. */ + type = TREE_TYPE (ovf); + if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) + ovf_rtx = crtl->args.internal_arg_pointer; + else + ovf_rtx = cfun->machine->split_stack_varargs_pointer; + t = make_tree (type, ovf_rtx); + if (words != 0) + t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD); - entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); - insert_insn_on_edge (seq, entry_edge); - commit_one_edge_insertion (entry_edge); + t = build2 (MODIFY_EXPR, type, ovf, t); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + + if (ix86_varargs_gpr_size || ix86_varargs_fpr_size) + { + /* Find the register save area. + Prologue of the function save it right above stack frame. 
*/ + type = TREE_TYPE (sav); + t = make_tree (type, frame_pointer_rtx); + if (!ix86_varargs_gpr_size) + t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX); + + t = build2 (MODIFY_EXPR, type, sav, t); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + } } -/* Initialize a variable CUM of type CUMULATIVE_ARGS - for a call to a function whose data type is FNTYPE. - For a library call, FNTYPE is 0. */ +/* Implement va_arg. */ -void -init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ - tree fntype, /* tree ptr for function decl */ - rtx libname, /* SYMBOL_REF of library name or 0 */ - tree fndecl, - int caller) +static tree +ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, + gimple_seq *post_p) { - struct cgraph_local_info *i = NULL; - struct cgraph_node *target = NULL; + static const int intreg[6] = { 0, 1, 2, 3, 4, 5 }; + tree f_gpr, f_fpr, f_ovf, f_sav; + tree gpr, fpr, ovf, sav, t; + int size, rsize; + tree lab_false, lab_over = NULL_TREE; + tree addr, t2; + rtx container; + int indirect_p = 0; + tree ptrtype; + machine_mode nat_mode; + unsigned int arg_boundary; + unsigned int type_align; - memset (cum, 0, sizeof (*cum)); + /* Only 64bit target needs something special. */ + if (is_va_list_char_pointer (TREE_TYPE (valist))) + return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); - if (fndecl) + f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); + f_fpr = DECL_CHAIN (f_gpr); + f_ovf = DECL_CHAIN (f_fpr); + f_sav = DECL_CHAIN (f_ovf); + + gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), + valist, f_gpr, NULL_TREE); + + fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE); + ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE); + sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE); + + indirect_p = pass_va_arg_by_reference (type); + if (indirect_p) + type = build_pointer_type (type); + size = arg_int_size_in_bytes (type); + rsize = CEIL (size, UNITS_PER_WORD); + + nat_mode = type_natural_mode (type, NULL, false); + switch (nat_mode) { - target = cgraph_node::get (fndecl); - if (target) + case E_V8SFmode: + case E_V8SImode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + case E_V16SFmode: + case E_V16SImode: + case E_V64QImode: + case E_V32HImode: + case E_V8DFmode: + case E_V8DImode: + /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ + if (!TARGET_64BIT_MS_ABI) { - target = target->function_symbol (); - i = cgraph_node::local_info (target->decl); - cum->call_abi = ix86_function_abi (target->decl); + container = NULL; + break; } - else - cum->call_abi = ix86_function_abi (fndecl); + /* FALLTHRU */ + + default: + container = construct_container (nat_mode, TYPE_MODE (type), + type, 0, X86_64_REGPARM_MAX, + X86_64_SSE_REGPARM_MAX, intreg, + 0); + break; } - else - cum->call_abi = ix86_function_type_abi (fntype); - cum->caller = caller; + /* Pull the value out of the saved registers. */ - /* Set up the number of registers to use for passing arguments. */ - cum->nregs = ix86_regparm; - if (TARGET_64BIT) - { - cum->nregs = (cum->call_abi == SYSV_ABI - ? X86_64_REGPARM_MAX - : X86_64_MS_REGPARM_MAX); - } - if (TARGET_SSE) + addr = create_tmp_var (ptr_type_node, "addr"); + type_align = TYPE_ALIGN (type); + + if (container) { - cum->sse_nregs = SSE_REGPARM_MAX; - if (TARGET_64BIT) - { - cum->sse_nregs = (cum->call_abi == SYSV_ABI - ? 
X86_64_SSE_REGPARM_MAX - : X86_64_MS_SSE_REGPARM_MAX); - } - } - if (TARGET_MMX) - cum->mmx_nregs = MMX_REGPARM_MAX; - cum->warn_avx512f = true; - cum->warn_avx = true; - cum->warn_sse = true; - cum->warn_mmx = true; + int needed_intregs, needed_sseregs; + bool need_temp; + tree int_addr, sse_addr; - /* Because type might mismatch in between caller and callee, we need to - use actual type of function for local calls. - FIXME: cgraph_analyze can be told to actually record if function uses - va_start so for local functions maybe_vaarg can be made aggressive - helping K&R code. - FIXME: once typesytem is fixed, we won't need this code anymore. */ - if (i && i->local && i->can_change_signature) - fntype = TREE_TYPE (target->decl); - cum->stdarg = stdarg_p (fntype); - cum->maybe_vaarg = (fntype - ? (!prototype_p (fntype) || stdarg_p (fntype)) - : !libname); + lab_false = create_artificial_label (UNKNOWN_LOCATION); + lab_over = create_artificial_label (UNKNOWN_LOCATION); - cum->decl = fndecl; + examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs); - cum->warn_empty = !warn_abi || cum->stdarg; - if (!cum->warn_empty && fntype) - { - function_args_iterator iter; - tree argtype; - bool seen_empty_type = false; - FOREACH_FUNCTION_ARGS (fntype, argtype, iter) + need_temp = (!REG_P (container) + && ((needed_intregs && TYPE_ALIGN (type) > 64) + || TYPE_ALIGN (type) > 128)); + + /* In case we are passing structure, verify that it is consecutive block + on the register save area. If not we need to do moves. */ + if (!need_temp && !REG_P (container)) { - if (argtype == error_mark_node || VOID_TYPE_P (argtype)) - break; - if (TYPE_EMPTY_P (argtype)) - seen_empty_type = true; - else if (seen_empty_type) + /* Verify that all registers are strictly consecutive */ + if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0)))) { - cum->warn_empty = true; - break; - } - } - } + int i; - if (!TARGET_64BIT) - { - /* If there are variable arguments, then we won't pass anything - in registers in 32-bit mode. */ - if (stdarg_p (fntype)) + for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) + { + rtx slot = XVECEXP (container, 0, i); + if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i + || INTVAL (XEXP (slot, 1)) != i * 16) + need_temp = true; + } + } + else + { + int i; + + for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) + { + rtx slot = XVECEXP (container, 0, i); + if (REGNO (XEXP (slot, 0)) != (unsigned int) i + || INTVAL (XEXP (slot, 1)) != i * 8) + need_temp = true; + } + } + } + if (!need_temp) { - cum->nregs = 0; - /* Since in 32-bit, variable arguments are always passed on - stack, there is scratch register available for indirect - sibcall. */ - cfun->machine->arg_reg_available = true; - cum->sse_nregs = 0; - cum->mmx_nregs = 0; - cum->warn_avx512f = false; - cum->warn_avx = false; - cum->warn_sse = false; - cum->warn_mmx = false; - return; + int_addr = addr; + sse_addr = addr; + } + else + { + int_addr = create_tmp_var (ptr_type_node, "int_addr"); + sse_addr = create_tmp_var (ptr_type_node, "sse_addr"); } - /* Use ecx and edx registers if function has fastcall attribute, - else look for regparm information. */ - if (fntype) + /* First ensure that we fit completely in registers. */ + if (needed_intregs) { - unsigned int ccvt = ix86_get_callcvt (fntype); - if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - { - cum->nregs = 1; - cum->fastcall = 1; /* Same first register as in fastcall. 
*/ - } - else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - { - cum->nregs = 2; - cum->fastcall = 1; - } - else - cum->nregs = ix86_function_regparm (fntype, fndecl); + t = build_int_cst (TREE_TYPE (gpr), + (X86_64_REGPARM_MAX - needed_intregs + 1) * 8); + t = build2 (GE_EXPR, boolean_type_node, gpr, t); + t2 = build1 (GOTO_EXPR, void_type_node, lab_false); + t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); + gimplify_and_add (t, pre_p); + } + if (needed_sseregs) + { + t = build_int_cst (TREE_TYPE (fpr), + (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16 + + X86_64_REGPARM_MAX * 8); + t = build2 (GE_EXPR, boolean_type_node, fpr, t); + t2 = build1 (GOTO_EXPR, void_type_node, lab_false); + t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); + gimplify_and_add (t, pre_p); } - /* Set up the number of SSE registers used for passing SFmode - and DFmode arguments. Warn for mismatching ABI. */ - cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true); - } + /* Compute index to start of area used for integer regs. */ + if (needed_intregs) + { + /* int_addr = gpr + sav; */ + t = fold_build_pointer_plus (sav, gpr); + gimplify_assign (int_addr, t, pre_p); + } + if (needed_sseregs) + { + /* sse_addr = fpr + sav; */ + t = fold_build_pointer_plus (sav, fpr); + gimplify_assign (sse_addr, t, pre_p); + } + if (need_temp) + { + int i, prev_size = 0; + tree temp = create_tmp_var (type, "va_arg_tmp"); - cfun->machine->arg_reg_available = (cum->nregs > 0); -} + /* addr = &temp; */ + t = build1 (ADDR_EXPR, build_pointer_type (type), temp); + gimplify_assign (addr, t, pre_p); -/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. - But in the case of vector types, it is some vector mode. + for (i = 0; i < XVECLEN (container, 0); i++) + { + rtx slot = XVECEXP (container, 0, i); + rtx reg = XEXP (slot, 0); + machine_mode mode = GET_MODE (reg); + tree piece_type; + tree addr_type; + tree daddr_type; + tree src_addr, src; + int src_offset; + tree dest_addr, dest; + int cur_size = GET_MODE_SIZE (mode); - When we have only some of our vector isa extensions enabled, then there - are some modes for which vector_mode_supported_p is false. For these - modes, the generic vector support in gcc will choose some non-vector mode - in order to implement the type. By computing the natural mode, we'll - select the proper ABI location for the operand and not depend on whatever - the middle-end decides to do with these vector types. + gcc_assert (prev_size <= INTVAL (XEXP (slot, 1))); + prev_size = INTVAL (XEXP (slot, 1)); + if (prev_size + cur_size > size) + { + cur_size = size - prev_size; + unsigned int nbits = cur_size * BITS_PER_UNIT; + if (!int_mode_for_size (nbits, 1).exists (&mode)) + mode = QImode; + } + piece_type = lang_hooks.types.type_for_mode (mode, 1); + if (mode == GET_MODE (reg)) + addr_type = build_pointer_type (piece_type); + else + addr_type = build_pointer_type_for_mode (piece_type, ptr_mode, + true); + daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode, + true); - The midde-end can't deal with the vector types > 16 bytes. In this - case, we return the original mode and warn ABI change if CUM isn't - NULL. 
+ if (SSE_REGNO_P (REGNO (reg))) + { + src_addr = sse_addr; + src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16; + } + else + { + src_addr = int_addr; + src_offset = REGNO (reg) * 8; + } + src_addr = fold_convert (addr_type, src_addr); + src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset); - If INT_RETURN is true, warn ABI change if the vector mode isn't - available for function return value. */ + dest_addr = fold_convert (daddr_type, addr); + dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size); + if (cur_size == GET_MODE_SIZE (mode)) + { + src = build_va_arg_indirect_ref (src_addr); + dest = build_va_arg_indirect_ref (dest_addr); -static machine_mode -type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, - bool in_return) -{ - machine_mode mode = TYPE_MODE (type); + gimplify_assign (dest, src, pre_p); + } + else + { + tree copy + = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY), + 3, dest_addr, src_addr, + size_int (cur_size)); + gimplify_and_add (copy, pre_p); + } + prev_size += cur_size; + } + } - if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode)) - { - HOST_WIDE_INT size = int_size_in_bytes (type); - if ((size == 8 || size == 16 || size == 32 || size == 64) - /* ??? Generic code allows us to create width 1 vectors. Ignore. */ - && TYPE_VECTOR_SUBPARTS (type) > 1) + if (needed_intregs) { - machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); + t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, + build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); + gimplify_assign (gpr, t, pre_p); + /* The GPR save area guarantees only 8-byte alignment. */ + if (!need_temp) + type_align = MIN (type_align, 64); + } - /* There are no XFmode vector modes. */ - if (innermode == XFmode) - return mode; + if (needed_sseregs) + { + t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, + build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); + gimplify_assign (unshare_expr (fpr), t, pre_p); + } - if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) - mode = MIN_MODE_VECTOR_FLOAT; - else - mode = MIN_MODE_VECTOR_INT; + gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over)); - /* Get the mode which has this inner mode and number of units. */ - FOR_EACH_MODE_FROM (mode, mode) - if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) - && GET_MODE_INNER (mode) == innermode) - { - if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) - { - static bool warnedavx512f; - static bool warnedavx512f_ret; + gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false)); + } - if (cum && cum->warn_avx512f && !warnedavx512f) - { - if (warning (OPT_Wpsabi, "AVX512F vector argument " - "without AVX512F enabled changes the ABI")) - warnedavx512f = true; - } - else if (in_return && !warnedavx512f_ret) - { - if (warning (OPT_Wpsabi, "AVX512F vector return " - "without AVX512F enabled changes the ABI")) - warnedavx512f_ret = true; - } + /* ... otherwise out of the overflow area. */ - return TYPE_MODE (type); - } - else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU) - { - static bool warnedavx; - static bool warnedavx_ret; + /* When we align parameter on stack for caller, if the parameter + alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be + aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee + here with caller. 
*/ + arg_boundary = ix86_function_arg_boundary (VOIDmode, type); + if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT) + arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT; - if (cum && cum->warn_avx && !warnedavx) - { - if (warning (OPT_Wpsabi, "AVX vector argument " - "without AVX enabled changes the ABI")) - warnedavx = true; - } - else if (in_return && !warnedavx_ret) - { - if (warning (OPT_Wpsabi, "AVX vector return " - "without AVX enabled changes the ABI")) - warnedavx_ret = true; - } + /* Care for on-stack alignment if needed. */ + if (arg_boundary <= 64 || size == 0) + t = ovf; + else + { + HOST_WIDE_INT align = arg_boundary / 8; + t = fold_build_pointer_plus_hwi (ovf, align - 1); + t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, + build_int_cst (TREE_TYPE (t), -align)); + } - return TYPE_MODE (type); - } - else if (((size == 8 && TARGET_64BIT) || size == 16) - && !TARGET_SSE - && !TARGET_IAMCU) - { - static bool warnedsse; - static bool warnedsse_ret; + gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); + gimplify_assign (addr, t, pre_p); - if (cum && cum->warn_sse && !warnedsse) - { - if (warning (OPT_Wpsabi, "SSE vector argument " - "without SSE enabled changes the ABI")) - warnedsse = true; - } - else if (!TARGET_64BIT && in_return && !warnedsse_ret) - { - if (warning (OPT_Wpsabi, "SSE vector return " - "without SSE enabled changes the ABI")) - warnedsse_ret = true; - } - } - else if ((size == 8 && !TARGET_64BIT) - && (!cfun - || cfun->machine->func_type == TYPE_NORMAL) - && !TARGET_MMX - && !TARGET_IAMCU) - { - static bool warnedmmx; - static bool warnedmmx_ret; + t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD); + gimplify_assign (unshare_expr (ovf), t, pre_p); - if (cum && cum->warn_mmx && !warnedmmx) - { - if (warning (OPT_Wpsabi, "MMX vector argument " - "without MMX enabled changes the ABI")) - warnedmmx = true; - } - else if (in_return && !warnedmmx_ret) - { - if (warning (OPT_Wpsabi, "MMX vector return " - "without MMX enabled changes the ABI")) - warnedmmx_ret = true; - } - } - return mode; - } + if (container) + gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over)); - gcc_unreachable (); - } - } + type = build_aligned_type (type, type_align); + ptrtype = build_pointer_type_for_mode (type, ptr_mode, true); + addr = fold_convert (ptrtype, addr); - return mode; + if (indirect_p) + addr = build_va_arg_indirect_ref (addr); + return build_va_arg_indirect_ref (addr); } + +/* Return true if OPNUM's MEM should be matched + in movabs* patterns. */ -/* We want to pass a value in REGNO whose "natural" mode is MODE. However, - this may not agree with the mode that the type system has chosen for the - register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can - go ahead and use it. Otherwise we have to build a PARALLEL instead. */ - -static rtx -gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode, - unsigned int regno) +bool +ix86_check_movabs (rtx insn, int opnum) { - rtx tmp; + rtx set, mem; - if (orig_mode != BLKmode) - tmp = gen_rtx_REG (orig_mode, regno); - else + set = PATTERN (insn); + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); + gcc_assert (GET_CODE (set) == SET); + mem = XEXP (set, opnum); + while (SUBREG_P (mem)) + mem = SUBREG_REG (mem); + gcc_assert (MEM_P (mem)); + return volatile_ok || !MEM_VOLATILE_P (mem); +} + +/* Return false if INSN contains a MEM with a non-default address space. 
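   (The following is an illustrative sketch, not part of the patch itself.  It shows
   the user-level behaviour that ix86_gimplify_va_arg above implements: on x86-64
   SysV targets, va_arg first consumes the register save area and then falls back
   to the overflow area on the stack.  The struct below mirrors the va_list layout
   described in the psABI; its name is made up for the illustration.)  */

#include <stdarg.h>
#include <stdio.h>

/* Shape of va_list on x86-64 SysV targets, per the psABI: gp_offset/fp_offset
   index into reg_save_area, overflow_arg_area points at stack-passed arguments.  */
struct sysv_va_list_shape
{
  unsigned int gp_offset;      /* up to 6 GPRs * 8 bytes  (offsets 0..48)    */
  unsigned int fp_offset;      /* up to 8 XMMs * 16 bytes (offsets 48..176)  */
  void *overflow_arg_area;     /* next argument passed on the stack          */
  void *reg_save_area;         /* spilled rdi, rsi, rdx, rcx, r8, r9, xmm0-7 */
};

/* A plain variadic sum: the first few ints are fetched from the GPR part of
   the register save area, the remainder from the overflow area.  */
static int
sum_ints (int count, ...)
{
  va_list ap;
  int total = 0;
  va_start (ap, count);
  for (int i = 0; i < count; i++)
    total += va_arg (ap, int);
  va_end (ap);
  return total;
}

int
main (void)
{
  printf ("%d\n", sum_ints (8, 1, 2, 3, 4, 5, 6, 7, 8));   /* prints 36 */
  return 0;
}

/* (end of sketch)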
*/ +bool +ix86_check_no_addr_space (rtx insn) +{ + subrtx_var_iterator::array_type array; + FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL) { - tmp = gen_rtx_REG (mode, regno); - tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx); - tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp)); + rtx x = *iter; + if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))) + return false; } - - return tmp; + return true; } + +/* Initialize the table of extra 80387 mathematical constants. */ -/* x86-64 register passing implementation. See x86-64 ABI for details. Goal - of this code is to classify each 8bytes of incoming argument by the register - class and assign registers accordingly. */ - -/* Return the union class of CLASS1 and CLASS2. - See the x86-64 PS ABI for details. */ - -static enum x86_64_reg_class -merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) +static void +init_ext_80387_constants (void) { - /* Rule #1: If both classes are equal, this is the resulting class. */ - if (class1 == class2) - return class1; - - /* Rule #2: If one of the classes is NO_CLASS, the resulting class is - the other class. */ - if (class1 == X86_64_NO_CLASS) - return class2; - if (class2 == X86_64_NO_CLASS) - return class1; - - /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */ - if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS) - return X86_64_MEMORY_CLASS; - - /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ - if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) - || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) - return X86_64_INTEGERSI_CLASS; - if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS - || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) - return X86_64_INTEGER_CLASS; + static const char * cst[5] = + { + "0.3010299956639811952256464283594894482", /* 0: fldlg2 */ + "0.6931471805599453094286904741849753009", /* 1: fldln2 */ + "1.4426950408889634073876517827983434472", /* 2: fldl2e */ + "3.3219280948873623478083405569094566090", /* 3: fldl2t */ + "3.1415926535897932385128089594061862044", /* 4: fldpi */ + }; + int i; - /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, - MEMORY is used. */ - if (class1 == X86_64_X87_CLASS - || class1 == X86_64_X87UP_CLASS - || class1 == X86_64_COMPLEX_X87_CLASS - || class2 == X86_64_X87_CLASS - || class2 == X86_64_X87UP_CLASS - || class2 == X86_64_COMPLEX_X87_CLASS) - return X86_64_MEMORY_CLASS; + for (i = 0; i < 5; i++) + { + real_from_string (&ext_80387_constants_table[i], cst[i]); + /* Ensure each constant is rounded to XFmode precision. */ + real_convert (&ext_80387_constants_table[i], + XFmode, &ext_80387_constants_table[i]); + } - /* Rule #6: Otherwise class SSE is used. */ - return X86_64_SSE_CLASS; + ext_80387_constants_init = 1; } -/* Classify the argument of type TYPE and mode MODE. - CLASSES will be filled by the register class used to pass each word - of the operand. The number of words is returned. In case the parameter - should be passed in memory, 0 is returned. As a special case for zero - sized containers, classes[0] will be NO_CLASS and 1 is returned. +/* Return non-zero if the constant is something that + can be loaded with a special instruction. */ - BIT_OFFSET is used internally for handling records and specifies offset - of the offset in bits modulo 512 to avoid overflow cases. 
+int +standard_80387_constant_p (rtx x) +{ + machine_mode mode = GET_MODE (x); - See the x86-64 PS ABI for details. -*/ + const REAL_VALUE_TYPE *r; -static int -classify_argument (machine_mode mode, const_tree type, - enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) -{ - HOST_WIDE_INT bytes - = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); - int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD); + if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode))) + return -1; - /* Variable sized entities are always passed/returned in memory. */ - if (bytes < 0) - return 0; + if (x == CONST0_RTX (mode)) + return 1; + if (x == CONST1_RTX (mode)) + return 2; - if (mode != VOIDmode - && targetm.calls.must_pass_in_stack (mode, type)) - return 0; + r = CONST_DOUBLE_REAL_VALUE (x); - if (type && AGGREGATE_TYPE_P (type)) + /* For XFmode constants, try to find a special 80387 instruction when + optimizing for size or on those CPUs that benefit from them. */ + if (mode == XFmode + && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) { int i; - tree field; - enum x86_64_reg_class subclasses[MAX_CLASSES]; - /* On x86-64 we pass structures larger than 64 bytes on the stack. */ - if (bytes > 64) - return 0; + if (! ext_80387_constants_init) + init_ext_80387_constants (); - for (i = 0; i < words; i++) - classes[i] = X86_64_NO_CLASS; + for (i = 0; i < 5; i++) + if (real_identical (r, &ext_80387_constants_table[i])) + return i + 3; + } - /* Zero sized arrays or structures are NO_CLASS. We return 0 to - signalize memory class, so handle it as special case. */ - if (!words) - { - classes[0] = X86_64_NO_CLASS; - return 1; - } + /* Load of the constant -0.0 or -1.0 will be split as + fldz;fchs or fld1;fchs sequence. */ + if (real_isnegzero (r)) + return 8; + if (real_identical (r, &dconstm1)) + return 9; - /* Classify each field of record and merge classes. */ - switch (TREE_CODE (type)) - { - case RECORD_TYPE: - /* And now merge the fields of structure. */ - for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) == FIELD_DECL) - { - int num; + return 0; +} - if (TREE_TYPE (field) == error_mark_node) - continue; +/* Return the opcode of the special instruction to be used to load + the constant X. */ - /* Bitfields are always classified as integer. Handle them - early, since later code would consider them to be - misaligned integers. */ - if (DECL_BIT_FIELD (field)) - { - for (i = (int_bit_position (field) - + (bit_offset % 64)) / 8 / 8; - i < ((int_bit_position (field) + (bit_offset % 64)) - + tree_to_shwi (DECL_SIZE (field)) - + 63) / 8 / 8; i++) - classes[i] - = merge_classes (X86_64_INTEGER_CLASS, classes[i]); - } - else - { - int pos; +const char * +standard_80387_constant_opcode (rtx x) +{ + switch (standard_80387_constant_p (x)) + { + case 1: + return "fldz"; + case 2: + return "fld1"; + case 3: + return "fldlg2"; + case 4: + return "fldln2"; + case 5: + return "fldl2e"; + case 6: + return "fldl2t"; + case 7: + return "fldpi"; + case 8: + case 9: + return "#"; + default: + gcc_unreachable (); + } +} - type = TREE_TYPE (field); +/* Return the CONST_DOUBLE representing the 80387 constant that is + loaded by the specified special instruction. The argument IDX + matches the return value from standard_80387_constant_p. */ - /* Flexible array member is ignored. 
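   (The following is an illustrative sketch, not part of the patch itself.  Besides
   0.0 and 1.0, standard_80387_constant_p above recognises the five values that the
   x87 can materialise with a single dedicated instruction.  The program prints the
   same values via libm as a cross-check; build with -lm.)  */

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* Instruction -> value it pushes onto the x87 stack.  */
  printf ("fldlg2  log10(2) = %.19f\n", log10 (2.0));
  printf ("fldln2  ln(2)    = %.19f\n", log (2.0));
  printf ("fldl2e  log2(e)  = %.19f\n", log2 (exp (1.0)));
  printf ("fldl2t  log2(10) = %.19f\n", log2 (10.0));
  printf ("fldpi   pi       = %.19f\n", acos (-1.0));
  return 0;
}

/* (end of sketch)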
*/ - if (TYPE_MODE (type) == BLKmode - && TREE_CODE (type) == ARRAY_TYPE - && TYPE_SIZE (type) == NULL_TREE - && TYPE_DOMAIN (type) != NULL_TREE - && (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) - == NULL_TREE)) - { - static bool warned; +rtx +standard_80387_constant_rtx (int idx) +{ + int i; - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the ABI of passing struct with" - " a flexible array member has" - " changed in GCC 4.4"); - } - continue; - } - num = classify_argument (TYPE_MODE (type), type, - subclasses, - (int_bit_position (field) - + bit_offset) % 512); - if (!num) - return 0; - pos = (int_bit_position (field) - + (bit_offset % 64)) / 8 / 8; - for (i = 0; i < num && (i + pos) < words; i++) - classes[i + pos] - = merge_classes (subclasses[i], classes[i + pos]); - } - } - } - break; + if (! ext_80387_constants_init) + init_ext_80387_constants (); - case ARRAY_TYPE: - /* Arrays are handled as small records. */ - { - int num; - num = classify_argument (TYPE_MODE (TREE_TYPE (type)), - TREE_TYPE (type), subclasses, bit_offset); - if (!num) - return 0; + switch (idx) + { + case 3: + case 4: + case 5: + case 6: + case 7: + i = idx - 3; + break; - /* The partial classes are now full classes. */ - if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) - subclasses[0] = X86_64_SSE_CLASS; - if (subclasses[0] == X86_64_INTEGERSI_CLASS - && !((bit_offset % 64) == 0 && bytes == 4)) - subclasses[0] = X86_64_INTEGER_CLASS; + default: + gcc_unreachable (); + } - for (i = 0; i < words; i++) - classes[i] = subclasses[i % num]; + return const_double_from_real_value (ext_80387_constants_table[i], + XFmode); +} - break; - } - case UNION_TYPE: - case QUAL_UNION_TYPE: - /* Unions are similar to RECORD_TYPE but offset is always 0. - */ - for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) == FIELD_DECL) - { - int num; +/* Return 1 if X is all bits 0 and 2 if X is all bits 1 + in supported SSE/AVX vector mode. */ - if (TREE_TYPE (field) == error_mark_node) - continue; +int +standard_sse_constant_p (rtx x, machine_mode pred_mode) +{ + machine_mode mode; - num = classify_argument (TYPE_MODE (TREE_TYPE (field)), - TREE_TYPE (field), subclasses, - bit_offset); - if (!num) - return 0; - for (i = 0; i < num && i < words; i++) - classes[i] = merge_classes (subclasses[i], classes[i]); - } - } - break; + if (!TARGET_SSE) + return 0; - default: - gcc_unreachable (); - } + mode = GET_MODE (x); - if (words > 2) - { - /* When size > 16 bytes, if the first one isn't - X86_64_SSE_CLASS or any other ones aren't - X86_64_SSEUP_CLASS, everything should be passed in - memory. */ - if (classes[0] != X86_64_SSE_CLASS) - return 0; + if (x == const0_rtx || const0_operand (x, mode)) + return 1; - for (i = 1; i < words; i++) - if (classes[i] != X86_64_SSEUP_CLASS) - return 0; - } + if (x == constm1_rtx || vector_all_ones_operand (x, mode)) + { + /* VOIDmode integer constant, get mode from the predicate. */ + if (mode == VOIDmode) + mode = pred_mode; - /* Final merger cleanup. */ - for (i = 0; i < words; i++) + switch (GET_MODE_SIZE (mode)) { - /* If one class is MEMORY, everything should be passed in - memory. */ - if (classes[i] == X86_64_MEMORY_CLASS) - return 0; - - /* The X86_64_SSEUP_CLASS should be always preceded by - X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */ - if (classes[i] == X86_64_SSEUP_CLASS - && classes[i - 1] != X86_64_SSE_CLASS - && classes[i - 1] != X86_64_SSEUP_CLASS) - { - /* The first one should never be X86_64_SSEUP_CLASS. 
*/ - gcc_assert (i != 0); - classes[i] = X86_64_SSE_CLASS; - } - - /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS, - everything should be passed in memory. */ - if (classes[i] == X86_64_X87UP_CLASS - && (classes[i - 1] != X86_64_X87_CLASS)) - { - static bool warned; - - /* The first one should never be X86_64_X87UP_CLASS. */ - gcc_assert (i != 0); - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the ABI of passing union with long double" - " has changed in GCC 4.4"); - } - return 0; - } + case 64: + if (TARGET_AVX512F) + return 2; + break; + case 32: + if (TARGET_AVX2) + return 2; + break; + case 16: + if (TARGET_SSE2) + return 2; + break; + case 0: + /* VOIDmode */ + gcc_unreachable (); + default: + break; } - return words; } - /* Compute alignment needed. We align all types to natural boundaries with - exception of XFmode that is aligned to 64bits. */ - if (mode != VOIDmode && mode != BLKmode) - { - int mode_alignment = GET_MODE_BITSIZE (mode); - - if (mode == XFmode) - mode_alignment = 128; - else if (mode == XCmode) - mode_alignment = 256; - if (COMPLEX_MODE_P (mode)) - mode_alignment /= 2; - /* Misaligned fields are always returned in memory. */ - if (bit_offset % mode_alignment) - return 0; - } + return 0; +} - /* for V1xx modes, just use the base mode */ - if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode - && GET_MODE_UNIT_SIZE (mode) == bytes) - mode = GET_MODE_INNER (mode); +/* Return the opcode of the special instruction to be used to load + the constant operands[1] into operands[0]. */ - /* Classification of atomic types. */ - switch (mode) - { - case E_SDmode: - case E_DDmode: - classes[0] = X86_64_SSE_CLASS; - return 1; - case E_TDmode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - return 2; - case E_DImode: - case E_SImode: - case E_HImode: - case E_QImode: - case E_CSImode: - case E_CHImode: - case E_CQImode: - { - int size = bit_offset + (int) GET_MODE_BITSIZE (mode); +const char * +standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) +{ + machine_mode mode; + rtx x = operands[1]; - /* Analyze last 128 bits only. */ - size = (size - 1) & 0x7f; + gcc_assert (TARGET_SSE); - if (size < 32) - { - classes[0] = X86_64_INTEGERSI_CLASS; - return 1; - } - else if (size < 64) - { - classes[0] = X86_64_INTEGER_CLASS; - return 1; - } - else if (size < 64+32) - { - classes[0] = X86_64_INTEGER_CLASS; - classes[1] = X86_64_INTEGERSI_CLASS; - return 2; - } - else if (size < 64+64) - { - classes[0] = classes[1] = X86_64_INTEGER_CLASS; - return 2; - } - else - gcc_unreachable (); - } - case E_CDImode: - case E_TImode: - classes[0] = classes[1] = X86_64_INTEGER_CLASS; - return 2; - case E_COImode: - case E_OImode: - /* OImode shouldn't be used directly. 
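   (The following is an illustrative sketch, not part of the patch itself.  It spells
   out typical results of the psABI classification that classify_argument above
   computes; the expected classes are stated in comments, and the code merely gives
   the structs a use so the example compiles and runs.)  */

#include <stdio.h>

struct sse_int  { double d; int i; };   /* 16 bytes: { SSE, INTEGER } -> one XMM + one GPR */
struct two_ints { int a; int b; };      /*  8 bytes: { INTEGER }      -> a single GPR      */
struct too_big  { double d[3]; };       /* 24 bytes, no SSEUP tail    -> passed in memory  */

static double
take (struct sse_int x, struct two_ints y, struct too_big z)
{
  return x.d + x.i + y.a + y.b + z.d[0];
}

int
main (void)
{
  struct sse_int  a = { 1.5, 2 };
  struct two_ints b = { 3, 4 };
  struct too_big  c = { { 5.0, 0.0, 0.0 } };
  printf ("%g\n", take (a, b, c));   /* prints 15.5 */
  return 0;
}

/* (end of sketch)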
*/ - gcc_unreachable (); - case E_CTImode: - return 0; - case E_SFmode: - if (!(bit_offset % 64)) - classes[0] = X86_64_SSESF_CLASS; - else - classes[0] = X86_64_SSE_CLASS; - return 1; - case E_DFmode: - classes[0] = X86_64_SSEDF_CLASS; - return 1; - case E_XFmode: - classes[0] = X86_64_X87_CLASS; - classes[1] = X86_64_X87UP_CLASS; - return 2; - case E_TFmode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - return 2; - case E_SCmode: - classes[0] = X86_64_SSE_CLASS; - if (!(bit_offset % 64)) - return 1; - else + mode = GET_MODE (x); + + if (x == const0_rtx || const0_operand (x, mode)) + { + switch (get_attr_mode (insn)) { - static bool warned; + case MODE_TI: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "%vpxor\t%0, %d0"; + /* FALLTHRU */ + case MODE_XI: + case MODE_OI: + if (EXT_REX_SSE_REG_P (operands[0])) + return (TARGET_AVX512VL + ? "vpxord\t%x0, %x0, %x0" + : "vpxord\t%g0, %g0, %g0"); + return "vpxor\t%x0, %x0, %x0"; - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the ABI of passing structure with complex float" - " member has changed in GCC 4.4"); - } - classes[1] = X86_64_SSESF_CLASS; - return 2; + case MODE_V2DF: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "%vxorpd\t%0, %d0"; + /* FALLTHRU */ + case MODE_V8DF: + case MODE_V4DF: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "vxorpd\t%x0, %x0, %x0"; + else if (TARGET_AVX512DQ) + return (TARGET_AVX512VL + ? "vxorpd\t%x0, %x0, %x0" + : "vxorpd\t%g0, %g0, %g0"); + else + return (TARGET_AVX512VL + ? "vpxorq\t%x0, %x0, %x0" + : "vpxorq\t%g0, %g0, %g0"); + + case MODE_V4SF: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "%vxorps\t%0, %d0"; + /* FALLTHRU */ + case MODE_V16SF: + case MODE_V8SF: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "vxorps\t%x0, %x0, %x0"; + else if (TARGET_AVX512DQ) + return (TARGET_AVX512VL + ? "vxorps\t%x0, %x0, %x0" + : "vxorps\t%g0, %g0, %g0"); + else + return (TARGET_AVX512VL + ? "vpxord\t%x0, %x0, %x0" + : "vpxord\t%g0, %g0, %g0"); + + default: + gcc_unreachable (); } - case E_DCmode: - classes[0] = X86_64_SSEDF_CLASS; - classes[1] = X86_64_SSEDF_CLASS; - return 2; - case E_XCmode: - classes[0] = X86_64_COMPLEX_X87_CLASS; - return 1; - case E_TCmode: - /* This modes is larger than 16 bytes. 
*/ - return 0; - case E_V8SFmode: - case E_V8SImode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - classes[2] = X86_64_SSEUP_CLASS; - classes[3] = X86_64_SSEUP_CLASS; - return 4; - case E_V8DFmode: - case E_V16SFmode: - case E_V8DImode: - case E_V16SImode: - case E_V32HImode: - case E_V64QImode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - classes[2] = X86_64_SSEUP_CLASS; - classes[3] = X86_64_SSEUP_CLASS; - classes[4] = X86_64_SSEUP_CLASS; - classes[5] = X86_64_SSEUP_CLASS; - classes[6] = X86_64_SSEUP_CLASS; - classes[7] = X86_64_SSEUP_CLASS; - return 8; - case E_V4SFmode: - case E_V4SImode: - case E_V16QImode: - case E_V8HImode: - case E_V2DFmode: - case E_V2DImode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - return 2; - case E_V1TImode: - case E_V1DImode: - case E_V2SFmode: - case E_V2SImode: - case E_V4HImode: - case E_V8QImode: - classes[0] = X86_64_SSE_CLASS; - return 1; - case E_BLKmode: - case E_VOIDmode: - return 0; - default: - gcc_assert (VECTOR_MODE_P (mode)); + } + else if (x == constm1_rtx || vector_all_ones_operand (x, mode)) + { + enum attr_mode insn_mode = get_attr_mode (insn); + + switch (insn_mode) + { + case MODE_XI: + case MODE_V8DF: + case MODE_V16SF: + gcc_assert (TARGET_AVX512F); + return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; - if (bytes > 16) - return 0; + case MODE_OI: + case MODE_V4DF: + case MODE_V8SF: + gcc_assert (TARGET_AVX2); + /* FALLTHRU */ + case MODE_TI: + case MODE_V2DF: + case MODE_V4SF: + gcc_assert (TARGET_SSE2); + if (!EXT_REX_SSE_REG_P (operands[0])) + return (TARGET_AVX + ? "vpcmpeqd\t%0, %0, %0" + : "pcmpeqd\t%0, %0"); + else if (TARGET_AVX512VL) + return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"; + else + return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; - gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT); + default: + gcc_unreachable (); + } + } - if (bit_offset + GET_MODE_BITSIZE (mode) <= 32) - classes[0] = X86_64_INTEGERSI_CLASS; - else - classes[0] = X86_64_INTEGER_CLASS; - classes[1] = X86_64_INTEGER_CLASS; - return 1 + (bytes > 8); - } + gcc_unreachable (); } -/* Examine the argument and return set number of register required in each - class. Return true iff parameter should be passed in memory. */ +/* Returns true if INSN can be transformed from a memory load + to a supported FP constant load. 
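   (The following is an illustrative sketch, not part of the patch itself.  It shows
   why standard_sse_constant_p/standard_sse_constant_opcode above special-case the
   all-zeros and all-ones vectors: both can be produced register-to-register with no
   constant-pool load.  Assumes an SSE2 target and the usual <immintrin.h>.)  */

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  /* All-zeros vector: typically emitted as pxor %xmm0, %xmm0 (vpxor with AVX).  */
  __m128i zeros = _mm_setzero_si128 ();

  /* All-ones vector: typically emitted as pcmpeqd %xmm1, %xmm1 rather than a
     16-byte load from memory.  */
  __m128i ones = _mm_set1_epi32 (-1);

  int z[4], o[4];
  _mm_storeu_si128 ((__m128i *) z, zeros);
  _mm_storeu_si128 ((__m128i *) o, ones);
  printf ("%d %d\n", z[0], o[0]);   /* prints "0 -1" */
  return 0;
}

/* (end of sketch)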
*/ -static bool -examine_argument (machine_mode mode, const_tree type, int in_return, - int *int_nregs, int *sse_nregs) +bool +ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst) { - enum x86_64_reg_class regclass[MAX_CLASSES]; - int n = classify_argument (mode, type, regclass, 0); + rtx src = find_constant_src (insn); - *int_nregs = 0; - *sse_nregs = 0; + gcc_assert (REG_P (dst)); - if (!n) - return true; - for (n--; n >= 0; n--) - switch (regclass[n]) - { - case X86_64_INTEGER_CLASS: - case X86_64_INTEGERSI_CLASS: - (*int_nregs)++; - break; - case X86_64_SSE_CLASS: - case X86_64_SSESF_CLASS: - case X86_64_SSEDF_CLASS: - (*sse_nregs)++; - break; - case X86_64_NO_CLASS: - case X86_64_SSEUP_CLASS: - break; - case X86_64_X87_CLASS: - case X86_64_X87UP_CLASS: - case X86_64_COMPLEX_X87_CLASS: - if (!in_return) - return true; - break; - case X86_64_MEMORY_CLASS: - gcc_unreachable (); - } + if (src == NULL + || (SSE_REGNO_P (REGNO (dst)) + && standard_sse_constant_p (src, GET_MODE (dst)) != 1) + || (STACK_REGNO_P (REGNO (dst)) + && standard_80387_constant_p (src) < 1)) + return false; - return false; + return true; } -/* Construct container for the argument used by GCC interface. See - FUNCTION_ARG for the detailed description. */ +/* Returns true if OP contains a symbol reference */ -static rtx -construct_container (machine_mode mode, machine_mode orig_mode, - const_tree type, int in_return, int nintregs, int nsseregs, - const int *intreg, int sse_regno) +bool +symbolic_reference_mentioned_p (rtx op) { - /* The following variables hold the static issued_error state. */ - static bool issued_sse_arg_error; - static bool issued_sse_ret_error; - static bool issued_x87_ret_error; - - machine_mode tmpmode; - int bytes - = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); - enum x86_64_reg_class regclass[MAX_CLASSES]; - int n; + const char *fmt; int i; - int nexps = 0; - int needed_sseregs, needed_intregs; - rtx exp[MAX_CLASSES]; - rtx ret; - n = classify_argument (mode, type, regclass, 0); - if (!n) - return NULL; - if (examine_argument (mode, type, in_return, &needed_intregs, - &needed_sseregs)) - return NULL; - if (needed_intregs > nintregs || needed_sseregs > nsseregs) - return NULL; + if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) + return true; - /* We allowed the user to turn off SSE for kernel mode. Don't crash if - some less clueful developer tries to use floating-point anyway. */ - if (needed_sseregs && !TARGET_SSE) + fmt = GET_RTX_FORMAT (GET_CODE (op)); + for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--) { - if (in_return) - { - if (!issued_sse_ret_error) - { - error ("SSE register return with SSE disabled"); - issued_sse_ret_error = true; - } - } - else if (!issued_sse_arg_error) + if (fmt[i] == 'E') { - error ("SSE register argument with SSE disabled"); - issued_sse_arg_error = true; + int j; + + for (j = XVECLEN (op, i) - 1; j >= 0; j--) + if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) + return true; } - return NULL; + + else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i))) + return true; } - /* Likewise, error if the ABI requires us to return values in the - x87 registers and the user specified -mno-80387. 
*/ - if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return) - for (i = 0; i < n; i++) - if (regclass[i] == X86_64_X87_CLASS - || regclass[i] == X86_64_X87UP_CLASS - || regclass[i] == X86_64_COMPLEX_X87_CLASS) - { - if (!issued_x87_ret_error) - { - error ("x87 register return with x87 disabled"); - issued_x87_ret_error = true; - } - return NULL; - } + return false; +} - /* First construct simple cases. Avoid SCmode, since we want to use - single register to pass this type. */ - if (n == 1 && mode != SCmode) - switch (regclass[0]) - { - case X86_64_INTEGER_CLASS: - case X86_64_INTEGERSI_CLASS: - return gen_rtx_REG (mode, intreg[0]); - case X86_64_SSE_CLASS: - case X86_64_SSESF_CLASS: - case X86_64_SSEDF_CLASS: - if (mode != BLKmode) - return gen_reg_or_parallel (mode, orig_mode, - GET_SSE_REGNO (sse_regno)); - break; - case X86_64_X87_CLASS: - case X86_64_COMPLEX_X87_CLASS: - return gen_rtx_REG (mode, FIRST_STACK_REG); - case X86_64_NO_CLASS: - /* Zero sized array, struct or class. */ - return NULL; - default: - gcc_unreachable (); - } - if (n == 2 - && regclass[0] == X86_64_SSE_CLASS - && regclass[1] == X86_64_SSEUP_CLASS - && mode != BLKmode) - return gen_reg_or_parallel (mode, orig_mode, - GET_SSE_REGNO (sse_regno)); - if (n == 4 - && regclass[0] == X86_64_SSE_CLASS - && regclass[1] == X86_64_SSEUP_CLASS - && regclass[2] == X86_64_SSEUP_CLASS - && regclass[3] == X86_64_SSEUP_CLASS - && mode != BLKmode) - return gen_reg_or_parallel (mode, orig_mode, - GET_SSE_REGNO (sse_regno)); - if (n == 8 - && regclass[0] == X86_64_SSE_CLASS - && regclass[1] == X86_64_SSEUP_CLASS - && regclass[2] == X86_64_SSEUP_CLASS - && regclass[3] == X86_64_SSEUP_CLASS - && regclass[4] == X86_64_SSEUP_CLASS - && regclass[5] == X86_64_SSEUP_CLASS - && regclass[6] == X86_64_SSEUP_CLASS - && regclass[7] == X86_64_SSEUP_CLASS - && mode != BLKmode) - return gen_reg_or_parallel (mode, orig_mode, - GET_SSE_REGNO (sse_regno)); - if (n == 2 - && regclass[0] == X86_64_X87_CLASS - && regclass[1] == X86_64_X87UP_CLASS) - return gen_rtx_REG (XFmode, FIRST_STACK_REG); +/* Return true if it is appropriate to emit `ret' instructions in the + body of a function. Do this only if the epilogue is simple, needing a + couple of insns. Prior to reloading, we can't tell how many registers + must be saved, so return false then. Return false if there is no frame + marker to de-allocate. */ - if (n == 2 - && regclass[0] == X86_64_INTEGER_CLASS - && regclass[1] == X86_64_INTEGER_CLASS - && (mode == CDImode || mode == TImode || mode == BLKmode) - && intreg[0] + 1 == intreg[1]) - { - if (mode == BLKmode) - { - /* Use TImode for BLKmode values in 2 integer registers. */ - exp[0] = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (TImode, intreg[0]), - GEN_INT (0)); - ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1)); - XVECEXP (ret, 0, 0) = exp[0]; - return ret; - } - else - return gen_rtx_REG (mode, intreg[0]); - } +bool +ix86_can_use_return_insn_p (void) +{ + if (ix86_function_naked (current_function_decl)) + return false; - /* Otherwise figure out the entries of the PARALLEL. */ - for (i = 0; i < n; i++) - { - int pos; + /* Don't use `ret' instruction in interrupt handler. */ + if (! reload_completed + || frame_pointer_needed + || cfun->machine->func_type != TYPE_NORMAL) + return 0; - switch (regclass[i]) - { - case X86_64_NO_CLASS: - break; - case X86_64_INTEGER_CLASS: - case X86_64_INTEGERSI_CLASS: - /* Merge TImodes on aligned occasions here too. 
*/ - if (i * 8 + 8 > bytes) - { - unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT; - if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode)) - /* We've requested 24 bytes we - don't have mode for. Use DImode. */ - tmpmode = DImode; - } - else if (regclass[i] == X86_64_INTEGERSI_CLASS) - tmpmode = SImode; - else - tmpmode = DImode; - exp [nexps++] - = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (tmpmode, *intreg), - GEN_INT (i*8)); - intreg++; - break; - case X86_64_SSESF_CLASS: - exp [nexps++] - = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (SFmode, - GET_SSE_REGNO (sse_regno)), - GEN_INT (i*8)); - sse_regno++; - break; - case X86_64_SSEDF_CLASS: - exp [nexps++] - = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (DFmode, - GET_SSE_REGNO (sse_regno)), - GEN_INT (i*8)); - sse_regno++; - break; - case X86_64_SSE_CLASS: - pos = i; - switch (n) - { - case 1: - tmpmode = DImode; - break; - case 2: - if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS) - { - tmpmode = TImode; - i++; - } - else - tmpmode = DImode; - break; - case 4: - gcc_assert (i == 0 - && regclass[1] == X86_64_SSEUP_CLASS - && regclass[2] == X86_64_SSEUP_CLASS - && regclass[3] == X86_64_SSEUP_CLASS); - tmpmode = OImode; - i += 3; - break; - case 8: - gcc_assert (i == 0 - && regclass[1] == X86_64_SSEUP_CLASS - && regclass[2] == X86_64_SSEUP_CLASS - && regclass[3] == X86_64_SSEUP_CLASS - && regclass[4] == X86_64_SSEUP_CLASS - && regclass[5] == X86_64_SSEUP_CLASS - && regclass[6] == X86_64_SSEUP_CLASS - && regclass[7] == X86_64_SSEUP_CLASS); - tmpmode = XImode; - i += 7; - break; - default: - gcc_unreachable (); - } - exp [nexps++] - = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (tmpmode, - GET_SSE_REGNO (sse_regno)), - GEN_INT (pos*8)); - sse_regno++; - break; - default: - gcc_unreachable (); - } - } + /* Don't allow more than 32k pop, since that's all we can do + with one instruction. */ + if (crtl->args.pops_args && crtl->args.size >= 32768) + return 0; - /* Empty aligned struct, union or class. */ - if (nexps == 0) - return NULL; + struct ix86_frame &frame = cfun->machine->frame; + return (frame.stack_pointer_offset == UNITS_PER_WORD + && (frame.nregs + frame.nsseregs) == 0); +} + +/* Value should be nonzero if functions must have frame pointers. + Zero means the frame pointer need not be set up (and parms may + be accessed via the stack pointer) in functions that seem suitable. */ - ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps)); - for (i = 0; i < nexps; i++) - XVECEXP (ret, 0, i) = exp [i]; - return ret; +static bool +ix86_frame_pointer_required (void) +{ + /* If we accessed previous frames, then the generated code expects + to be able to access the saved ebp value in our frame. */ + if (cfun->machine->accesses_prev_frame) + return true; + + /* Several x86 os'es need a frame pointer for other reasons, + usually pertaining to setjmp. */ + if (SUBTARGET_FRAME_POINTER_REQUIRED) + return true; + + /* For older 32-bit runtimes setjmp requires valid frame-pointer. */ + if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp) + return true; + + /* Win64 SEH, very large frames need a frame-pointer as maximum stack + allocation is 4GB. */ + if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE) + return true; + + /* SSE saves require frame-pointer when stack is misaligned. */ + if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128) + return true; + + /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER + turns off the frame pointer by default. Turn it back on now if + we've not got a leaf function. 
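   (The following is an illustrative sketch, not part of the patch itself.  It shows
   one situation ix86_frame_pointer_required above has to handle: inspecting an
   outer frame with __builtin_frame_address forces the frame pointer back on even
   under -fomit-frame-pointer.  Compile with gcc -O2 -fomit-frame-pointer -S and
   compare the two functions.)  */

#include <stdio.h>

int
leaf (int x)
{
  return x * 2 + 1;   /* can be compiled without a frame pointer */
}

void
walker (void)
{
  /* Reading the caller's frame needs the saved-%rbp chain, so this function
     typically keeps its frame pointer.  */
  printf ("caller frame: %p\n", __builtin_frame_address (1));
}

int
main (void)
{
  printf ("%d\n", leaf (20));
  walker ();
  return 0;
}

/* (end of sketch)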
*/ + if (TARGET_OMIT_LEAF_FRAME_POINTER + && (!crtl->is_leaf + || ix86_current_function_calls_tls_descriptor)) + return true; + + if (crtl->profile && !flag_fentry) + return true; + + return false; } -/* Update the data in CUM to advance over an argument of mode MODE - and data type TYPE. (TYPE is null for libcalls where that information - may not be available.) +/* Record that the current function accesses previous call frames. */ - Return a number of integer regsiters advanced over. */ +void +ix86_setup_frame_addresses (void) +{ + cfun->machine->accesses_prev_frame = 1; +} + +#ifndef USE_HIDDEN_LINKONCE +# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0) +# define USE_HIDDEN_LINKONCE 1 +# else +# define USE_HIDDEN_LINKONCE 0 +# endif +#endif -static int -function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode, - const_tree type, HOST_WIDE_INT bytes, - HOST_WIDE_INT words) +/* Label count for call and return thunks. It is used to make unique + labels in call and return thunks. */ +static int indirectlabelno; + +/* True if call thunk function is needed. */ +static bool indirect_thunk_needed = false; + +/* Bit masks of integer registers, which contain branch target, used + by call thunk functions. */ +static int indirect_thunks_used; + +/* True if return thunk function is needed. */ +static bool indirect_return_needed = false; + +/* True if return thunk function via CX is needed. */ +static bool indirect_return_via_cx; + +#ifndef INDIRECT_LABEL +# define INDIRECT_LABEL "LIND" +#endif + +/* Indicate what prefix is needed for an indirect branch. */ +enum indirect_thunk_prefix { - int res = 0; - bool error_p = false; + indirect_thunk_prefix_none, + indirect_thunk_prefix_nt +}; - if (TARGET_IAMCU) +/* Return the prefix needed for an indirect branch INSN. */ + +enum indirect_thunk_prefix +indirect_thunk_need_prefix (rtx_insn *insn) +{ + enum indirect_thunk_prefix need_prefix; + if ((cfun->machine->indirect_branch_type + == indirect_branch_thunk_extern) + && ix86_notrack_prefixed_insn_p (insn)) { - /* Intel MCU psABI passes scalars and aggregates no larger than 8 - bytes in registers. */ - if (!VECTOR_MODE_P (mode) && bytes <= 8) - goto pass_in_reg; - return res; + /* NOTRACK prefix is only used with external thunk so that it + can be properly updated to support CET at run-time. */ + need_prefix = indirect_thunk_prefix_nt; } + else + need_prefix = indirect_thunk_prefix_none; + return need_prefix; +} - switch (mode) - { - default: - break; +/* Fills in the label name that should be used for the indirect thunk. */ - case E_BLKmode: - if (bytes < 0) - break; - /* FALLTHRU */ +static void +indirect_thunk_name (char name[32], unsigned int regno, + enum indirect_thunk_prefix need_prefix, + bool ret_p) +{ + if (regno != INVALID_REGNUM && regno != CX_REG && ret_p) + gcc_unreachable (); - case E_DImode: - case E_SImode: - case E_HImode: - case E_QImode: -pass_in_reg: - cum->words += words; - cum->nregs -= words; - cum->regno += words; - if (cum->nregs >= 0) - res = words; - if (cum->nregs <= 0) + if (USE_HIDDEN_LINKONCE) + { + const char *prefix; + + if (need_prefix == indirect_thunk_prefix_nt + && regno != INVALID_REGNUM) { - cum->nregs = 0; - cfun->machine->arg_reg_available = false; - cum->regno = 0; + /* NOTRACK prefix is only used with external thunk via + register so that NOTRACK prefix can be added to indirect + branch via register to support CET at run-time. */ + prefix = "_nt"; } - break; - - case E_OImode: - /* OImode shouldn't be used directly. 
*/ - gcc_unreachable (); + else + prefix = ""; - case E_DFmode: - if (cum->float_in_sse == -1) - error_p = true; - if (cum->float_in_sse < 2) - break; - /* FALLTHRU */ - case E_SFmode: - if (cum->float_in_sse == -1) - error_p = true; - if (cum->float_in_sse < 1) - break; - /* FALLTHRU */ + const char *ret = ret_p ? "return" : "indirect"; - case E_V8SFmode: - case E_V8SImode: - case E_V64QImode: - case E_V32HImode: - case E_V16SImode: - case E_V8DImode: - case E_V16SFmode: - case E_V8DFmode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - case E_TImode: - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V4SFmode: - case E_V2DFmode: - if (!type || !AGGREGATE_TYPE_P (type)) + if (regno != INVALID_REGNUM) { - cum->sse_words += words; - cum->sse_nregs -= 1; - cum->sse_regno += 1; - if (cum->sse_nregs <= 0) - { - cum->sse_nregs = 0; - cum->sse_regno = 0; - } - } - break; - - case E_V8QImode: - case E_V4HImode: - case E_V2SImode: - case E_V2SFmode: - case E_V1TImode: - case E_V1DImode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - cum->mmx_words += words; - cum->mmx_nregs -= 1; - cum->mmx_regno += 1; - if (cum->mmx_nregs <= 0) - { - cum->mmx_nregs = 0; - cum->mmx_regno = 0; - } + const char *reg_prefix; + if (LEGACY_INT_REGNO_P (regno)) + reg_prefix = TARGET_64BIT ? "r" : "e"; + else + reg_prefix = ""; + sprintf (name, "__x86_%s_thunk%s_%s%s", + ret, prefix, reg_prefix, reg_names[regno]); } - break; + else + sprintf (name, "__x86_%s_thunk%s", ret, prefix); } - if (error_p) + else { - cum->float_in_sse = 0; - error ("calling %qD with SSE calling convention without " - "SSE/SSE2 enabled", cum->decl); - sorry ("this is a GCC bug that can be worked around by adding " - "attribute used to function called"); + if (regno != INVALID_REGNUM) + ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno); + else + { + if (ret_p) + ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0); + else + ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0); + } } - - return res; } -static int -function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode, - const_tree type, HOST_WIDE_INT words, bool named) -{ - int int_nregs, sse_nregs; +/* Output a call and return thunk for indirect branch. If REGNO != -1, + the function address is in REGNO and the call and return thunk looks like: - /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */ - if (!named && (VALID_AVX512F_REG_MODE (mode) - || VALID_AVX256_REG_MODE (mode))) - return 0; + call L2 + L1: + pause + lfence + jmp L1 + L2: + mov %REG, (%sp) + ret - if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs) - && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) - { - cum->nregs -= int_nregs; - cum->sse_nregs -= sse_nregs; - cum->regno += int_nregs; - cum->sse_regno += sse_nregs; - return int_nregs; - } - else - { - int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD; - cum->words = ROUND_UP (cum->words, align); - cum->words += words; - return 0; - } -} + Otherwise, the function address is on the top of stack and the + call and return thunk looks like: -static int -function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, - HOST_WIDE_INT words) + call L2 + L1: + pause + lfence + jmp L1 + L2: + lea WORD_SIZE(%sp), %sp + ret + */ + +static void +output_indirect_thunk (unsigned int regno) { - /* Otherwise, this should be passed indirect. 
*/ - gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8); + char indirectlabel1[32]; + char indirectlabel2[32]; - cum->words += words; - if (cum->nregs > 0) - { - cum->nregs -= 1; - cum->regno += 1; - return 1; - } - return 0; -} + ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL, + indirectlabelno++); + ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL, + indirectlabelno++); -/* Update the data in CUM to advance over an argument of mode MODE and - data type TYPE. (TYPE is null for libcalls where that information - may not be available.) */ + /* Call */ + fputs ("\tcall\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel2); + fputc ('\n', asm_out_file); -static void -ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode, - const_tree type, bool named) -{ - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - HOST_WIDE_INT bytes, words; - int nregs; + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); - /* The argument of interrupt handler is a special case and is - handled in ix86_function_arg. */ - if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) - return; + /* AMD and Intel CPUs prefer each a different instruction as loop filler. + Usage of both pause + lfence is compromise solution. */ + fprintf (asm_out_file, "\tpause\n\tlfence\n"); - if (mode == BLKmode) - bytes = int_size_in_bytes (type); - else - bytes = GET_MODE_SIZE (mode); - words = CEIL (bytes, UNITS_PER_WORD); + /* Jump. */ + fputs ("\tjmp\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel1); + fputc ('\n', asm_out_file); - if (type) - mode = type_natural_mode (type, NULL, false); + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); - if (TARGET_64BIT) + /* The above call insn pushed a word to stack. Adjust CFI info. */ + if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ()) { - enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; + if (! dwarf2out_do_cfi_asm ()) + { + dw_cfi_ref xcfi = ggc_cleared_alloc (); + xcfi->dw_cfi_opc = DW_CFA_advance_loc4; + xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2); + vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); + } + dw_cfi_ref xcfi = ggc_cleared_alloc (); + xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset; + xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD; + vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); + dwarf2out_emit_cfi (xcfi); + } - if (call_abi == MS_ABI) - nregs = function_arg_advance_ms_64 (cum, bytes, words); - else - nregs = function_arg_advance_64 (cum, mode, type, words, named); + if (regno != INVALID_REGNUM) + { + /* MOV. */ + rtx xops[2]; + xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx); + xops[1] = gen_rtx_REG (word_mode, regno); + output_asm_insn ("mov\t{%1, %0|%0, %1}", xops); } else - nregs = function_arg_advance_32 (cum, mode, type, bytes, words); - - if (!nregs) { - /* Track if there are outgoing arguments on stack. */ - if (cum->caller) - cfun->machine->outgoing_args_on_stack = true; + /* LEA. */ + rtx xops[2]; + xops[0] = stack_pointer_rtx; + xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops); } -} -/* Define where to put the arguments to a function. - Value is zero to push the argument on the stack, - or a hard register in which to store the argument. + fputs ("\tret\n", asm_out_file); +} - MODE is the argument's machine mode. - TYPE is the data type of the argument (as a tree). - This is null for libcalls where that information may - not be available. 
- CUM is a variable of type CUMULATIVE_ARGS which gives info about - the preceding args and about the function being called. - NAMED is nonzero if this argument is a named parameter - (otherwise it is an extra parameter matching an ellipsis). */ +/* Output a funtion with a call and return thunk for indirect branch. + If REGNO != INVALID_REGNUM, the function address is in REGNO. + Otherwise, the function address is on the top of stack. Thunk is + used for function return if RET_P is true. */ -static rtx -function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode, - machine_mode orig_mode, const_tree type, - HOST_WIDE_INT bytes, HOST_WIDE_INT words) +static void +output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix, + unsigned int regno, bool ret_p) { - bool error_p = false; + char name[32]; + tree decl; - /* Avoid the AL settings for the Unix64 ABI. */ - if (mode == VOIDmode) - return constm1_rtx; + /* Create __x86_indirect_thunk. */ + indirect_thunk_name (name, regno, need_prefix, ret_p); + decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, + get_identifier (name), + build_function_type_list (void_type_node, NULL_TREE)); + DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, + NULL_TREE, void_type_node); + TREE_PUBLIC (decl) = 1; + TREE_STATIC (decl) = 1; + DECL_IGNORED_P (decl) = 1; - if (TARGET_IAMCU) +#if TARGET_MACHO + if (TARGET_MACHO) { - /* Intel MCU psABI passes scalars and aggregates no larger than 8 - bytes in registers. */ - if (!VECTOR_MODE_P (mode) && bytes <= 8) - goto pass_in_reg; - return NULL_RTX; + switch_to_section (darwin_sections[picbase_thunk_section]); + fputs ("\t.weak_definition\t", asm_out_file); + assemble_name (asm_out_file, name); + fputs ("\n\t.private_extern\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_OUTPUT_LABEL (asm_out_file, name); + DECL_WEAK (decl) = 1; } + else +#endif + if (USE_HIDDEN_LINKONCE) + { + cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); - switch (mode) - { - default: - break; + targetm.asm_out.unique_section (decl, 0); + switch_to_section (get_named_section (decl, NULL, 0)); - case E_BLKmode: - if (bytes < 0) - break; - /* FALLTHRU */ - case E_DImode: - case E_SImode: - case E_HImode: - case E_QImode: -pass_in_reg: - if (words <= cum->nregs) - { - int regno = cum->regno; + targetm.asm_out.globalize_label (asm_out_file, name); + fputs ("\t.hidden\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); + } + else + { + switch_to_section (text_section); + ASM_OUTPUT_LABEL (asm_out_file, name); + } - /* Fastcall allocates the first two DWORD (SImode) or - smaller arguments to ECX and EDX if it isn't an - aggregate type . */ - if (cum->fastcall) - { - if (mode == BLKmode - || mode == DImode - || (type && AGGREGATE_TYPE_P (type))) - break; + DECL_INITIAL (decl) = make_node (BLOCK); + current_function_decl = decl; + allocate_struct_function (decl, false); + init_function_start (decl); + /* We're about to hide the function body from callees of final_* by + emitting it directly; tell them we're a thunk, if they care. */ + cfun->is_thunk = true; + first_function_block_is_cold = false; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), asm_out_file, 1); - /* ECX not EAX is the first allocated register. 
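   (The following is an illustrative sketch, not part of the patch itself.  The
   thunks emitted by output_indirect_thunk_function above back GCC's retpoline
   support; the option and symbol names below are the ones used by GCC 8 and later,
   and the exact register picked for the call is up to the compiler.)  */

/* Build with:  gcc -O2 -mindirect-branch=thunk demo.c
   The indirect call below is then routed through one of the
   __x86_indirect_thunk_<reg> helpers.  */

#include <stdio.h>

static void
greet (void)
{
  puts ("hello");
}

/* volatile keeps the call genuinely indirect even at -O2.  */
static void (*volatile target) (void) = greet;

int
main (void)
{
  target ();   /* e.g. call __x86_indirect_thunk_rax */
  return 0;
}

/* (end of sketch)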
*/ - if (regno == AX_REG) - regno = CX_REG; - } - return gen_rtx_REG (mode, regno); - } - break; + output_indirect_thunk (regno); - case E_DFmode: - if (cum->float_in_sse == -1) - error_p = true; - if (cum->float_in_sse < 2) - break; - /* FALLTHRU */ - case E_SFmode: - if (cum->float_in_sse == -1) - error_p = true; - if (cum->float_in_sse < 1) - break; - /* FALLTHRU */ - case E_TImode: - /* In 32bit, we pass TImode in xmm registers. */ - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V4SFmode: - case E_V2DFmode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - if (cum->sse_nregs) - return gen_reg_or_parallel (mode, orig_mode, - cum->sse_regno + FIRST_SSE_REG); - } - break; + final_end_function (); + init_insn_lengths (); + free_after_compilation (cfun); + set_cfun (NULL); + current_function_decl = NULL; +} - case E_OImode: - case E_XImode: - /* OImode and XImode shouldn't be used directly. */ - gcc_unreachable (); +static int pic_labels_used; - case E_V64QImode: - case E_V32HImode: - case E_V16SImode: - case E_V8DImode: - case E_V16SFmode: - case E_V8DFmode: - case E_V8SFmode: - case E_V8SImode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - if (cum->sse_nregs) - return gen_reg_or_parallel (mode, orig_mode, - cum->sse_regno + FIRST_SSE_REG); - } - break; +/* Fills in the label name that should be used for a pc thunk for + the given register. */ - case E_V8QImode: - case E_V4HImode: - case E_V2SImode: - case E_V2SFmode: - case E_V1TImode: - case E_V1DImode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - if (cum->mmx_nregs) - return gen_reg_or_parallel (mode, orig_mode, - cum->mmx_regno + FIRST_MMX_REG); - } - break; - } - if (error_p) - { - cum->float_in_sse = 0; - error ("calling %qD with SSE calling convention without " - "SSE/SSE2 enabled", cum->decl); - sorry ("this is a GCC bug that can be worked around by adding " - "attribute used to function called"); - } +static void +get_pc_thunk_name (char name[32], unsigned int regno) +{ + gcc_assert (!TARGET_64BIT); - return NULL_RTX; + if (USE_HIDDEN_LINKONCE) + sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]); + else + ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno); } -static rtx -function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, - machine_mode orig_mode, const_tree type, bool named) + +/* This function generates code for -fpic that loads %ebx with + the return address of the caller and then returns. */ + +static void +ix86_code_end (void) { - /* Handle a hidden AL argument containing number of registers - for varargs x86-64 functions. */ - if (mode == VOIDmode) - return GEN_INT (cum->maybe_vaarg - ? (cum->sse_nregs < 0 - ? X86_64_SSE_REGPARM_MAX - : cum->sse_regno) - : -1); + rtx xops[2]; + unsigned int regno; - switch (mode) - { - default: - break; + if (indirect_return_needed) + output_indirect_thunk_function (indirect_thunk_prefix_none, + INVALID_REGNUM, true); + if (indirect_return_via_cx) + output_indirect_thunk_function (indirect_thunk_prefix_none, + CX_REG, true); + if (indirect_thunk_needed) + output_indirect_thunk_function (indirect_thunk_prefix_none, + INVALID_REGNUM, false); - case E_V8SFmode: - case E_V8SImode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - case E_V16SFmode: - case E_V16SImode: - case E_V64QImode: - case E_V32HImode: - case E_V8DFmode: - case E_V8DImode: - /* Unnamed 256 and 512bit vector mode parameters are passed on stack. 
*/ - if (!named) - return NULL; - break; + for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++) + { + unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1; + if ((indirect_thunks_used & (1 << i))) + output_indirect_thunk_function (indirect_thunk_prefix_none, + regno, false); } - return construct_container (mode, orig_mode, type, 0, cum->nregs, - cum->sse_nregs, - &x86_64_int_parameter_registers [cum->regno], - cum->sse_regno); -} + for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++) + { + char name[32]; + tree decl; -static rtx -function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, - machine_mode orig_mode, bool named, const_tree type, - HOST_WIDE_INT bytes) -{ - unsigned int regno; + if ((indirect_thunks_used & (1 << regno))) + output_indirect_thunk_function (indirect_thunk_prefix_none, + regno, false); - /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call. - We use value of -2 to specify that current function call is MSABI. */ - if (mode == VOIDmode) - return GEN_INT (-2); + if (!(pic_labels_used & (1 << regno))) + continue; - /* If we've run out of registers, it goes on the stack. */ - if (cum->nregs == 0) - return NULL_RTX; + get_pc_thunk_name (name, regno); - regno = x86_64_ms_abi_int_parameter_registers[cum->regno]; + decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, + get_identifier (name), + build_function_type_list (void_type_node, NULL_TREE)); + DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, + NULL_TREE, void_type_node); + TREE_PUBLIC (decl) = 1; + TREE_STATIC (decl) = 1; + DECL_IGNORED_P (decl) = 1; - /* Only floating point modes are passed in anything but integer regs. */ - if (TARGET_SSE && (mode == SFmode || mode == DFmode)) - { - if (named) +#if TARGET_MACHO + if (TARGET_MACHO) { - if (type == NULL_TREE || !AGGREGATE_TYPE_P (type)) - regno = cum->regno + FIRST_SSE_REG; + switch_to_section (darwin_sections[picbase_thunk_section]); + fputs ("\t.weak_definition\t", asm_out_file); + assemble_name (asm_out_file, name); + fputs ("\n\t.private_extern\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_OUTPUT_LABEL (asm_out_file, name); + DECL_WEAK (decl) = 1; } else +#endif + if (USE_HIDDEN_LINKONCE) { - rtx t1, t2; + cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); - /* Unnamed floating parameters are passed in both the - SSE and integer registers. */ - t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG); - t2 = gen_rtx_REG (mode, regno); - t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx); - t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx); - return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2)); - } - } - /* Handle aggregated types passed in register. */ - if (orig_mode == BLKmode) - { - if (bytes > 0 && bytes <= 8) - mode = (bytes > 4 ? DImode : SImode); - if (mode == BLKmode) - mode = DImode; - } - - return gen_reg_or_parallel (mode, orig_mode, regno); -} - -/* Return where to put the arguments to a function. - Return zero to push the argument on the stack, or a hard register in which to store the argument. - - MODE is the argument's machine mode. TYPE is the data type of the - argument. It is null for libcalls where that information may not be - available. CUM gives information about the preceding args and about - the function being called. NAMED is nonzero if this argument is a - named parameter (otherwise it is an extra parameter matching an - ellipsis). 
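   (The following is an illustrative sketch, not part of the patch itself.  The
   __x86.get_pc_thunk.<reg> helpers emitted by ix86_code_end/output_set_got above
   are how 32-bit PIC code obtains its own address, since IA-32 has no PC-relative
   data addressing; the register chosen varies.)  */

/* Build with:  gcc -m32 -fPIC -O2 -S demo.c
   The assembly for bump() typically starts with something like
       call  __x86.get_pc_thunk.ax
       addl  $_GLOBAL_OFFSET_TABLE_, %eax
   before the global below is reached through the GOT.  */

int counter;

int
bump (void)
{
  return ++counter;
}

/* (end of sketch)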
*/ - -static rtx -ix86_function_arg (cumulative_args_t cum_v, machine_mode omode, - const_tree type, bool named) -{ - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - machine_mode mode = omode; - HOST_WIDE_INT bytes, words; - rtx arg; + targetm.asm_out.unique_section (decl, 0); + switch_to_section (get_named_section (decl, NULL, 0)); - if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) - { - gcc_assert (type != NULL_TREE); - if (POINTER_TYPE_P (type)) - { - /* This is the pointer argument. */ - gcc_assert (TYPE_MODE (type) == Pmode); - /* It is at -WORD(AP) in the current frame in interrupt and - exception handlers. */ - arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD); + targetm.asm_out.globalize_label (asm_out_file, name); + fputs ("\t.hidden\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); } else { - gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION - && TREE_CODE (type) == INTEGER_TYPE - && TYPE_MODE (type) == word_mode); - /* The error code is the word-mode integer argument at - -2 * WORD(AP) in the current frame of the exception - handler. */ - arg = gen_rtx_MEM (word_mode, - plus_constant (Pmode, - arg_pointer_rtx, - -2 * UNITS_PER_WORD)); + switch_to_section (text_section); + ASM_OUTPUT_LABEL (asm_out_file, name); } - return arg; - } - if (mode == BLKmode) - bytes = int_size_in_bytes (type); - else - bytes = GET_MODE_SIZE (mode); - words = CEIL (bytes, UNITS_PER_WORD); + DECL_INITIAL (decl) = make_node (BLOCK); + current_function_decl = decl; + allocate_struct_function (decl, false); + init_function_start (decl); + /* We're about to hide the function body from callees of final_* by + emitting it directly; tell them we're a thunk, if they care. */ + cfun->is_thunk = true; + first_function_block_is_cold = false; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), asm_out_file, 1); - /* To simplify the code below, represent vector types with a vector mode - even if MMX/SSE are not active. */ - if (type && TREE_CODE (type) == VECTOR_TYPE) - mode = type_natural_mode (type, cum, false); + /* Pad stack IP move with 4 instructions (two NOPs count + as one instruction). */ + if (TARGET_PAD_SHORT_FUNCTION) + { + int i = 8; - if (TARGET_64BIT) - { - enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; + while (i--) + fputs ("\tnop\n", asm_out_file); + } - if (call_abi == MS_ABI) - arg = function_arg_ms_64 (cum, mode, omode, named, type, bytes); - else - arg = function_arg_64 (cum, mode, omode, type, named); + xops[0] = gen_rtx_REG (Pmode, regno); + xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); + output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); + output_asm_insn ("%!ret", NULL); + final_end_function (); + init_insn_lengths (); + free_after_compilation (cfun); + set_cfun (NULL); + current_function_decl = NULL; } - else - arg = function_arg_32 (cum, mode, omode, type, bytes, words); - /* Track if there are outgoing arguments on stack. */ - if (arg == NULL_RTX && cum->caller) - cfun->machine->outgoing_args_on_stack = true; - - return arg; + if (flag_split_stack) + file_end_indicate_split_stack (); } -/* A C expression that indicates when an argument must be passed by - reference. If nonzero for an argument, a copy of that argument is - made in memory and a pointer to the argument is passed instead of - the argument itself. 
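For reference, get_pc_thunk_name and ix86_code_end above emit the 32-bit PIC helper thunks: each thunk copies the return address from the top of the stack into its target register and returns, and with USE_HIDDEN_LINKONCE it is named "__x86.get_pc_thunk.<reg>". The short standalone sketch below only mirrors the naming scheme; ia32_reg_names and pc_thunk_name are invented stand-ins for GCC's reg_names[] and the real helper, so it is an illustration rather than code from the patch.

#include <stdio.h>

/* Toy stand-in for the first eight entries of GCC's reg_names[].  */
static const char *const ia32_reg_names[] = {
  "ax", "dx", "cx", "bx", "si", "di", "bp", "sp"
};

/* Build the thunk symbol the way get_pc_thunk_name does when
   USE_HIDDEN_LINKONCE is set: "__x86.get_pc_thunk." + register name.  */
static void
pc_thunk_name (char name[32], unsigned int regno)
{
  snprintf (name, 32, "__x86.get_pc_thunk.%s", ia32_reg_names[regno]);
}

int
main (void)
{
  char name[32];

  /* Register 3 is "bx" in the toy table above; the body ix86_code_end
     emits for that thunk is essentially
         mov (%esp), %ebx
         ret                                                         */
  pc_thunk_name (name, 3);
  printf ("%s\n", name);   /* __x86.get_pc_thunk.bx */
  return 0;
}

The thunk is needed because 32-bit x86 has no PC-relative addressing for data; output_set_got, added just below, calls this thunk to materialize the PIC base register.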
The pointer is passed in whatever way is - appropriate for passing a pointer to that type. */ +/* Emit code for the SET_GOT patterns. */ -static bool -ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode, - const_tree type, bool) +const char * +output_set_got (rtx dest, rtx label) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + rtx xops[3]; - if (TARGET_64BIT) + xops[0] = dest; + + if (TARGET_VXWORKS_RTP && flag_pic) { - enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; + /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ + xops[2] = gen_rtx_MEM (Pmode, + gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE)); + output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); - /* See Windows x64 Software Convention. */ - if (call_abi == MS_ABI) - { - HOST_WIDE_INT msize = GET_MODE_SIZE (mode); + /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register. + Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as + an unadorned address. */ + xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); + SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL; + output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops); + return ""; + } - if (type) - { - /* Arrays are passed by reference. */ - if (TREE_CODE (type) == ARRAY_TYPE) - return true; + xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); - if (RECORD_OR_UNION_TYPE_P (type)) - { - /* Structs/unions of sizes other than 8, 16, 32, or 64 bits - are passed by reference. */ - msize = int_size_in_bytes (type); - } - } + if (flag_pic) + { + char name[32]; + get_pc_thunk_name (name, REGNO (dest)); + pic_labels_used |= 1 << REGNO (dest); - /* __m128 is passed by reference. */ - return msize != 1 && msize != 2 && msize != 4 && msize != 8; - } - else if (type && int_size_in_bytes (type) == -1) - return true; + xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); + xops[2] = gen_rtx_MEM (QImode, xops[2]); + output_asm_insn ("%!call\t%X2", xops); + +#if TARGET_MACHO + /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here. + This is what will be referenced by the Mach-O PIC subsystem. */ + if (machopic_should_output_picbase_label () || !label) + ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); + + /* When we are restoring the pic base at the site of a nonlocal label, + and we decided to emit the pic base above, we will still output a + local label used for calculating the correction offset (even though + the offset will be 0 in that case). */ + if (label) + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (label)); +#endif + } + else + { + if (TARGET_MACHO) + /* We don't need a pic base, we're not producing pic. */ + gcc_unreachable (); + + xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ()); + output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (XEXP (xops[2], 0))); } - return false; + if (!TARGET_MACHO) + output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops); + + return ""; } -/* Return true when TYPE should be 128bit aligned for 32bit argument - passing ABI. XXX: This function is obsolete and is only used for - checking psABI compatibility with previous versions of GCC. */ +/* Generate an "push" pattern for input ARG. 
*/ -static bool -ix86_compat_aligned_value_p (const_tree type) +rtx +gen_push (rtx arg) { - machine_mode mode = TYPE_MODE (type); - if (((TARGET_SSE && SSE_REG_MODE_P (mode)) - || mode == TDmode - || mode == TFmode - || mode == TCmode) - && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128)) - return true; - if (TYPE_ALIGN (type) < 128) - return false; + struct machine_function *m = cfun->machine; - if (AGGREGATE_TYPE_P (type)) - { - /* Walk the aggregates recursively. */ - switch (TREE_CODE (type)) - { - case RECORD_TYPE: - case UNION_TYPE: - case QUAL_UNION_TYPE: - { - tree field; + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_offset += UNITS_PER_WORD; + m->fs.sp_offset += UNITS_PER_WORD; - /* Walk all the structure fields. */ - for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) == FIELD_DECL - && ix86_compat_aligned_value_p (TREE_TYPE (field))) - return true; - } - break; - } + if (REG_P (arg) && GET_MODE (arg) != word_mode) + arg = gen_rtx_REG (word_mode, REGNO (arg)); - case ARRAY_TYPE: - /* Just for use if some languages passes arrays by value. */ - if (ix86_compat_aligned_value_p (TREE_TYPE (type))) - return true; - break; + return gen_rtx_SET (gen_rtx_MEM (word_mode, + gen_rtx_PRE_DEC (Pmode, + stack_pointer_rtx)), + arg); +} - default: - gcc_unreachable (); - } - } - return false; +/* Generate an "pop" pattern for input ARG. */ + +rtx +gen_pop (rtx arg) +{ + if (REG_P (arg) && GET_MODE (arg) != word_mode) + arg = gen_rtx_REG (word_mode, REGNO (arg)); + + return gen_rtx_SET (arg, + gen_rtx_MEM (word_mode, + gen_rtx_POST_INC (Pmode, + stack_pointer_rtx))); } -/* Return the alignment boundary for MODE and TYPE with alignment ALIGN. - XXX: This function is obsolete and is only used for checking psABI - compatibility with previous versions of GCC. */ +/* Return >= 0 if there is an unused call-clobbered register available + for the entire function. */ static unsigned int -ix86_compat_function_arg_boundary (machine_mode mode, - const_tree type, unsigned int align) +ix86_select_alt_pic_regnum (void) { - /* In 32bit, only _Decimal128 and __float128 are aligned to their - natural boundaries. */ - if (!TARGET_64BIT && mode != TDmode && mode != TFmode) - { - /* i386 ABI defines all arguments to be 4 byte aligned. We have to - make an exception for SSE modes since these require 128bit - alignment. - - The handling here differs from field_alignment. ICC aligns MMX - arguments to 4 byte boundaries, while structure fields are aligned - to 8 byte boundaries. */ - if (!type) - { - if (!(TARGET_SSE && SSE_REG_MODE_P (mode))) - align = PARM_BOUNDARY; - } + if (ix86_use_pseudo_pic_reg ()) + return INVALID_REGNUM; + + if (crtl->is_leaf + && !crtl->profile + && !ix86_current_function_calls_tls_descriptor) + { + int i, drap; + /* Can't use the same register for both PIC and DRAP. */ + if (crtl->drap_reg) + drap = REGNO (crtl->drap_reg); else - { - if (!ix86_compat_aligned_value_p (type)) - align = PARM_BOUNDARY; - } + drap = -1; + for (i = 2; i >= 0; --i) + if (i != drap && !df_regs_ever_live_p (i)) + return i; } - if (align > BIGGEST_ALIGNMENT) - align = BIGGEST_ALIGNMENT; - return align; + + return INVALID_REGNUM; } -/* Return true when TYPE should be 128bit aligned for 32bit argument - passing ABI. */ +/* Return true if REGNO is used by the epilogue. 
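gen_push above does two things: it updates the prologue's running frame-state bookkeeping (m->fs.sp_offset always grows by a word, and m->fs.cfa_offset grows as well while the CFA is still computed from the stack pointer) and only then builds the pre-decrement store. The sketch below models just that bookkeeping in plain C; frame_state, record_push and the 8-byte UNITS_PER_WORD are assumptions made for the example, not GCC types.

#include <stdio.h>

#define UNITS_PER_WORD 8   /* assumes a 64-bit target */

/* Simplified model of the machine_frame_state fields gen_push touches.  */
struct frame_state
{
  int cfa_is_sp;      /* models m->fs.cfa_reg == stack_pointer_rtx */
  long cfa_offset;    /* offset of the CFA register from the CFA   */
  long sp_offset;     /* offset of the stack pointer from the CFA  */
};

/* Mirror of the bookkeeping at the top of gen_push: a push moves the
   stack pointer down one word, and while the CFA is still derived from
   the stack pointer the CFA offset moves with it.  */
static void
record_push (struct frame_state *fs)
{
  if (fs->cfa_is_sp)
    fs->cfa_offset += UNITS_PER_WORD;
  fs->sp_offset += UNITS_PER_WORD;
}

int
main (void)
{
  /* At function entry only the return address is on the stack.  */
  struct frame_state fs = { 1, UNITS_PER_WORD, UNITS_PER_WORD };

  record_push (&fs);   /* e.g. push of the frame pointer   */
  record_push (&fs);   /* e.g. push of a callee-saved reg  */
  printf ("cfa_offset=%ld sp_offset=%ld\n", fs.cfa_offset, fs.sp_offset);
  return 0;
}

gen_pop, by contrast, builds only the post-increment load and leaves these frame-state fields alone.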
*/ -static bool -ix86_contains_aligned_value_p (const_tree type) +bool +ix86_epilogue_uses (int regno) { - machine_mode mode = TYPE_MODE (type); - - if (mode == XFmode || mode == XCmode) - return false; - - if (TYPE_ALIGN (type) < 128) - return false; - - if (AGGREGATE_TYPE_P (type)) - { - /* Walk the aggregates recursively. */ - switch (TREE_CODE (type)) - { - case RECORD_TYPE: - case UNION_TYPE: - case QUAL_UNION_TYPE: - { - tree field; - - /* Walk all the structure fields. */ - for (field = TYPE_FIELDS (type); - field; - field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) == FIELD_DECL - && ix86_contains_aligned_value_p (TREE_TYPE (field))) - return true; - } - break; - } - - case ARRAY_TYPE: - /* Just for use if some languages passes arrays by value. */ - if (ix86_contains_aligned_value_p (TREE_TYPE (type))) - return true; - break; + /* If there are no caller-saved registers, we preserve all registers, + except for MMX and x87 registers which aren't supported when saving + and restoring registers. Don't explicitly save SP register since + it is always preserved. */ + return (epilogue_completed + && cfun->machine->no_caller_saved_registers + && !fixed_regs[regno] + && !STACK_REGNO_P (regno) + && !MMX_REGNO_P (regno)); +} - default: - gcc_unreachable (); - } - } - else - return TYPE_ALIGN (type) >= 128; +/* Return nonzero if register REGNO can be used as a scratch register + in peephole2. */ - return false; +static bool +ix86_hard_regno_scratch_ok (unsigned int regno) +{ + /* If there are no caller-saved registers, we can't use any register + as a scratch register after epilogue and use REGNO as scratch + register only if it has been used before to avoid saving and + restoring it. */ + return (!cfun->machine->no_caller_saved_registers + || (!epilogue_completed + && df_regs_ever_live_p (regno))); } -/* Gives the alignment boundary, in bits, of an argument with the - specified mode and type. */ +/* Return TRUE if we need to save REGNO. */ -static unsigned int -ix86_function_arg_boundary (machine_mode mode, const_tree type) +bool +ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined) { - unsigned int align; - if (type) + /* If there are no caller-saved registers, we preserve all registers, + except for MMX and x87 registers which aren't supported when saving + and restoring registers. Don't explicitly save SP register since + it is always preserved. */ + if (cfun->machine->no_caller_saved_registers) { - /* Since the main variant type is used for call, we convert it to - the main variant type. */ - type = TYPE_MAIN_VARIANT (type); - align = TYPE_ALIGN (type); - if (TYPE_EMPTY_P (type)) - return PARM_BOUNDARY; + /* Don't preserve registers used for function return value. */ + rtx reg = crtl->return_rtx; + if (reg) + { + unsigned int i = REGNO (reg); + unsigned int nregs = REG_NREGS (reg); + while (nregs-- > 0) + if ((i + nregs) == regno) + return false; + } + + return (df_regs_ever_live_p (regno) + && !fixed_regs[regno] + && !STACK_REGNO_P (regno) + && !MMX_REGNO_P (regno) + && (regno != HARD_FRAME_POINTER_REGNUM + || !frame_pointer_needed)); } - else - align = GET_MODE_ALIGNMENT (mode); - if (align < PARM_BOUNDARY) - align = PARM_BOUNDARY; - else - { - static bool warned; - unsigned int saved_align = align; - if (!TARGET_64BIT) + if (regno == REAL_PIC_OFFSET_TABLE_REGNUM + && pic_offset_table_rtx) + { + if (ix86_use_pseudo_pic_reg ()) { - /* i386 ABI defines XFmode arguments to be 4 byte aligned. 
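The first block of ix86_save_reg above excludes the registers carrying the function's return value: with REGNO (reg) as the first hard register and REG_NREGS (reg) consecutive registers, the while loop asks whether REGNO is one of them. A small standalone equivalent of that containment test follows; regno_in_return_value, first and count are names invented for the illustration.

#include <stdio.h>

/* Same test as the "while (nregs-- > 0)" loop in ix86_save_reg: REGNO
   matches if it falls in the range [FIRST, FIRST + COUNT).  */
static int
regno_in_return_value (unsigned int regno, unsigned int first,
                       unsigned int count)
{
  while (count-- > 0)
    if (first + count == regno)
      return 1;
  return 0;
}

int
main (void)
{
  /* A value returned in a pair of hard registers starting at reg 0
     (illustrative numbers only).  */
  printf ("%d\n", regno_in_return_value (1, 0, 2));   /* 1: part of the value */
  printf ("%d\n", regno_in_return_value (2, 0, 2));   /* 0: not part of it    */
  return 0;
}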
*/ - if (!type) - { - if (mode == XFmode || mode == XCmode) - align = PARM_BOUNDARY; - } - else if (!ix86_contains_aligned_value_p (type)) - align = PARM_BOUNDARY; - - if (align < 128) - align = PARM_BOUNDARY; + /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to + _mcount in prologue. */ + if (!TARGET_64BIT && flag_pic && crtl->profile) + return true; } + else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) + || crtl->profile + || crtl->calls_eh_return + || crtl->uses_const_pool + || cfun->has_nonlocal_label) + return ix86_select_alt_pic_regnum () == INVALID_REGNUM; + } - if (warn_psabi - && !warned - && align != ix86_compat_function_arg_boundary (mode, type, - saved_align)) + if (crtl->calls_eh_return && maybe_eh_return) + { + unsigned i; + for (i = 0; ; i++) { - warned = true; - inform (input_location, - "the ABI for passing parameters with %d-byte" - " alignment has changed in GCC 4.6", - align / BITS_PER_UNIT); + unsigned test = EH_RETURN_DATA_REGNO (i); + if (test == INVALID_REGNUM) + break; + if (test == regno) + return true; } } - return align; -} - -/* Return true if N is a possible register number of function value. */ - -static bool -ix86_function_value_regno_p (const unsigned int regno) -{ - switch (regno) + if (ignore_outlined && cfun->machine->call_ms2sysv) { - case AX_REG: - return true; - case DX_REG: - return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI); - case DI_REG: - case SI_REG: - return TARGET_64BIT && ix86_cfun_abi () != MS_ABI; - - /* Complex values are returned in %st(0)/%st(1) pair. */ - case ST0_REG: - case ST1_REG: - /* TODO: The function should depend on current function ABI but - builtins.c would need updating then. Therefore we use the - default ABI. */ - if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI) - return false; - return TARGET_FLOAT_RETURNS_IN_80387; - - /* Complex values are returned in %xmm0/%xmm1 pair. */ - case XMM0_REG: - case XMM1_REG: - return TARGET_SSE; - - case MM0_REG: - if (TARGET_MACHO || TARGET_64BIT) + unsigned count = cfun->machine->call_ms2sysv_extra_regs + + xlogue_layout::MIN_REGS; + if (xlogue_layout::is_stub_managed_reg (regno, count)) return false; - return TARGET_MMX; } - return false; + if (crtl->drap_reg + && regno == REGNO (crtl->drap_reg) + && !cfun->machine->no_drap_save_restore) + return true; + + return (df_regs_ever_live_p (regno) + && !call_used_or_fixed_reg_p (regno) + && !fixed_regs[regno] + && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)); } -/* Define how to find the value returned by a function. - VALTYPE is the data type of the value (as a tree). - If the precise function being called is known, FUNC is its FUNCTION_DECL; - otherwise, FUNC is 0. */ +/* Return number of saved general prupose registers. */ -static rtx -function_value_32 (machine_mode orig_mode, machine_mode mode, - const_tree fntype, const_tree fn) +static int +ix86_nsaved_regs (void) { - unsigned int regno; + int nregs = 0; + int regno; - /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where - we normally prevent this case when mmx is not available. However - some ABIs may require the result to be returned like DImode. */ - if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) - regno = FIRST_MMX_REG; - - /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where - we prevent this case when sse is not available. However some ABIs - may require the result to be returned like integer TImode. 
*/ - else if (mode == TImode - || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) - regno = FIRST_SSE_REG; - - /* 32-byte vector modes in %ymm0. */ - else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32) - regno = FIRST_SSE_REG; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + nregs ++; + return nregs; +} - /* 64-byte vector modes in %zmm0. */ - else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) - regno = FIRST_SSE_REG; +/* Return number of saved SSE registers. */ - /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ - else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) - regno = FIRST_FLOAT_REG; - else - /* Most things go in %eax. */ - regno = AX_REG; +static int +ix86_nsaved_sseregs (void) +{ + int nregs = 0; + int regno; - /* Override FP return register with %xmm0 for local functions when - SSE math is enabled or for functions with sseregparm attribute. */ - if ((fn || fntype) && (mode == SFmode || mode == DFmode)) - { - int sse_level = ix86_function_sseregparm (fntype, fn, false); - if (sse_level == -1) - { - error ("calling %qD with SSE calling convention without " - "SSE/SSE2 enabled", fn); - sorry ("this is a GCC bug that can be worked around by adding " - "attribute used to function called"); - } - else if ((sse_level >= 1 && mode == SFmode) - || (sse_level == 2 && mode == DFmode)) - regno = FIRST_SSE_REG; - } + if (!TARGET_64BIT_MS_ABI) + return 0; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + nregs ++; + return nregs; +} - /* OImode shouldn't be used directly. */ - gcc_assert (mode != OImode); +/* Given FROM and TO register numbers, say whether this elimination is + allowed. If stack alignment is needed, we can only replace argument + pointer with hard frame pointer, or replace frame pointer with stack + pointer. Otherwise, frame pointer elimination is automatically + handled and all other eliminations are valid. */ - return gen_rtx_REG (orig_mode, regno); +static bool +ix86_can_eliminate (const int from, const int to) +{ + if (stack_realign_fp) + return ((from == ARG_POINTER_REGNUM + && to == HARD_FRAME_POINTER_REGNUM) + || (from == FRAME_POINTER_REGNUM + && to == STACK_POINTER_REGNUM)); + else + return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true; } -static rtx -function_value_64 (machine_mode orig_mode, machine_mode mode, - const_tree valtype) +/* Return the offset between two registers, one to be eliminated, and the other + its replacement, at the start of a routine. */ + +HOST_WIDE_INT +ix86_initial_elimination_offset (int from, int to) { - rtx ret; + struct ix86_frame &frame = cfun->machine->frame; - /* Handle libcalls, which don't provide a type node. 
*/ - if (valtype == NULL) + if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM) + return frame.hard_frame_pointer_offset; + else if (from == FRAME_POINTER_REGNUM + && to == HARD_FRAME_POINTER_REGNUM) + return frame.hard_frame_pointer_offset - frame.frame_pointer_offset; + else { - unsigned int regno; + gcc_assert (to == STACK_POINTER_REGNUM); - switch (mode) - { - case E_SFmode: - case E_SCmode: - case E_DFmode: - case E_DCmode: - case E_TFmode: - case E_SDmode: - case E_DDmode: - case E_TDmode: - regno = FIRST_SSE_REG; - break; - case E_XFmode: - case E_XCmode: - regno = FIRST_FLOAT_REG; - break; - case E_TCmode: - return NULL; - default: - regno = AX_REG; - } + if (from == ARG_POINTER_REGNUM) + return frame.stack_pointer_offset; - return gen_rtx_REG (mode, regno); - } - else if (POINTER_TYPE_P (valtype)) - { - /* Pointers are always returned in word_mode. */ - mode = word_mode; + gcc_assert (from == FRAME_POINTER_REGNUM); + return frame.stack_pointer_offset - frame.frame_pointer_offset; } - - ret = construct_container (mode, orig_mode, valtype, 1, - X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX, - x86_64_int_return_registers, 0); - - /* For zero sized structures, construct_container returns NULL, but we - need to keep rest of compiler happy by returning meaningful value. */ - if (!ret) - ret = gen_rtx_REG (orig_mode, AX_REG); - - return ret; } +/* In a dynamically-aligned function, we can't know the offset from + stack pointer to frame pointer, so we must ensure that setjmp + eliminates fp against the hard fp (%ebp) rather than trying to + index from %esp up to the top of the frame across a gap that is + of unknown (at compile-time) size. */ static rtx -function_value_ms_32 (machine_mode orig_mode, machine_mode mode, - const_tree fntype, const_tree fn, const_tree valtype) +ix86_builtin_setjmp_frame_value (void) { - unsigned int regno; - - /* Floating point return values in %st(0) - (unless -mno-fp-ret-in-387 or aggregate type of up to 8 bytes). */ - if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387 - && (GET_MODE_SIZE (mode) > 8 - || valtype == NULL_TREE || !AGGREGATE_TYPE_P (valtype))) - { - regno = FIRST_FLOAT_REG; - return gen_rtx_REG (orig_mode, regno); - } - else - return function_value_32(orig_mode, mode, fntype,fn); + return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx; } -static rtx -function_value_ms_64 (machine_mode orig_mode, machine_mode mode, - const_tree valtype) +/* Emits a warning for unsupported msabi to sysv pro/epilogues. 
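ix86_initial_elimination_offset above is pure arithmetic over three precomputed frame offsets. The standalone sketch below repeats the same four cases; struct frame_offsets, the enums and the numbers in main are invented for the example, while the formulas match the function above.

#include <stdio.h>

/* The three ix86_frame fields the elimination code reads (an
   illustrative stand-in for cfun->machine->frame).  */
struct frame_offsets
{
  long hard_frame_pointer_offset;
  long frame_pointer_offset;
  long stack_pointer_offset;
};

enum from_reg { ARG_POINTER, FRAME_POINTER };
enum to_reg { HARD_FRAME_POINTER, STACK_POINTER };

/* Same four cases as ix86_initial_elimination_offset, without the
   gcc_assert checks.  */
static long
elimination_offset (const struct frame_offsets *f,
                    enum from_reg from, enum to_reg to)
{
  if (to == HARD_FRAME_POINTER)
    return (from == ARG_POINTER
            ? f->hard_frame_pointer_offset
            : f->hard_frame_pointer_offset - f->frame_pointer_offset);

  return (from == ARG_POINTER
          ? f->stack_pointer_offset
          : f->stack_pointer_offset - f->frame_pointer_offset);
}

int
main (void)
{
  /* Hypothetical layout: hard frame pointer 16 bytes below the CFA,
     locals starting at 48, stack pointer at 112.  */
  struct frame_offsets f = { 16, 48, 112 };

  printf ("arg -> hard fp: %ld\n",
          elimination_offset (&f, ARG_POINTER, HARD_FRAME_POINTER));
  printf ("frame -> sp:    %ld\n",
          elimination_offset (&f, FRAME_POINTER, STACK_POINTER));
  return 0;
}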
*/ +void warn_once_call_ms2sysv_xlogues (const char *feature) { - unsigned int regno = AX_REG; - - if (TARGET_SSE) + static bool warned_once = false; + if (!warned_once) { - switch (GET_MODE_SIZE (mode)) - { - case 16: - if (valtype != NULL_TREE - && !VECTOR_INTEGER_TYPE_P (valtype) - && !VECTOR_INTEGER_TYPE_P (valtype) - && !INTEGRAL_TYPE_P (valtype) - && !VECTOR_FLOAT_TYPE_P (valtype)) - break; - if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) - && !COMPLEX_MODE_P (mode)) - regno = FIRST_SSE_REG; - break; - case 8: - case 4: - if (valtype != NULL_TREE && AGGREGATE_TYPE_P (valtype)) - break; - if (mode == SFmode || mode == DFmode) - regno = FIRST_SSE_REG; - break; - default: - break; - } + warning (0, "%<-mcall-ms2sysv-xlogues%> is not compatible with %s", + feature); + warned_once = true; } - return gen_rtx_REG (orig_mode, regno); } -static rtx -ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl, - machine_mode orig_mode, machine_mode mode) -{ - const_tree fn, fntype; +/* Return the probing interval for -fstack-clash-protection. */ - fn = NULL_TREE; - if (fntype_or_decl && DECL_P (fntype_or_decl)) - fn = fntype_or_decl; - fntype = fn ? TREE_TYPE (fn) : fntype_or_decl; - - if (ix86_function_type_abi (fntype) == MS_ABI) - { - if (TARGET_64BIT) - return function_value_ms_64 (orig_mode, mode, valtype); - else - return function_value_ms_32 (orig_mode, mode, fntype, fn, valtype); - } - else if (TARGET_64BIT) - return function_value_64 (orig_mode, mode, valtype); +static HOST_WIDE_INT +get_probe_interval (void) +{ + if (flag_stack_clash_protection) + return (HOST_WIDE_INT_1U + << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL)); else - return function_value_32 (orig_mode, mode, fntype, fn); + return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP); } -static rtx -ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool) -{ - machine_mode mode, orig_mode; +/* When using -fsplit-stack, the allocation routines set a field in + the TCB to the bottom of the stack plus this much space, measured + in bytes. */ - orig_mode = TYPE_MODE (valtype); - mode = type_natural_mode (valtype, NULL, true); - return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode); -} +#define SPLIT_STACK_AVAILABLE 256 -/* Pointer function arguments and return values are promoted to - word_mode for normal functions. */ +/* Fill structure ix86_frame about frame of currently computed function. */ -static machine_mode -ix86_promote_function_mode (const_tree type, machine_mode mode, - int *punsignedp, const_tree fntype, - int for_return) +static void +ix86_compute_frame_layout (void) { - if (cfun->machine->func_type == TYPE_NORMAL - && type != NULL_TREE - && POINTER_TYPE_P (type)) - { - *punsignedp = POINTERS_EXTEND_UNSIGNED; - return word_mode; - } - return default_promote_function_mode (type, mode, punsignedp, fntype, - for_return); -} - -/* Return true if a structure, union or array with MODE containing FIELD - should be accessed using BLKmode. */ - -static bool -ix86_member_type_forces_blk (const_tree field, machine_mode mode) -{ - /* Union with XFmode must be in BLKmode. */ - return (mode == XFmode - && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE - || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE)); -} - -rtx -ix86_libcall_value (machine_mode mode) -{ - return ix86_function_value_1 (NULL, NULL, mode, mode); -} - -/* Return true iff type is returned in memory. 
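get_probe_interval above turns an exponent into a byte count with a single shift. A short sketch of that conversion follows; the exponent 12 used below is only an example value (it gives a 4 KiB interval), the real number comes from --param stack-clash-protection-probe-interval or STACK_CHECK_PROBE_INTERVAL_EXP.

#include <stdio.h>
#include <stdint.h>

/* Same conversion as get_probe_interval: the parameter is an exponent,
   the result is the probe spacing in bytes.  */
static uint64_t
probe_interval_bytes (unsigned int exponent)
{
  return UINT64_C (1) << exponent;
}

int
main (void)
{
  /* 12 is an illustrative exponent: probe every 4096 bytes.  */
  printf ("%llu\n", (unsigned long long) probe_interval_bytes (12));
  return 0;
}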
*/ - -static bool -ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) -{ -#ifdef SUBTARGET_RETURN_IN_MEMORY - return SUBTARGET_RETURN_IN_MEMORY (type, fntype); -#else - const machine_mode mode = type_natural_mode (type, NULL, true); - HOST_WIDE_INT size; + struct ix86_frame *frame = &cfun->machine->frame; + struct machine_function *m = cfun->machine; + unsigned HOST_WIDE_INT stack_alignment_needed; + HOST_WIDE_INT offset; + unsigned HOST_WIDE_INT preferred_alignment; + HOST_WIDE_INT size = get_frame_size (); + HOST_WIDE_INT to_allocate; - if (TARGET_64BIT) + /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit + * ms_abi functions that call a sysv function. We now need to prune away + * cases where it should be disabled. */ + if (TARGET_64BIT && m->call_ms2sysv) { - if (ix86_function_type_abi (fntype) == MS_ABI) - { - size = int_size_in_bytes (type); + gcc_assert (TARGET_64BIT_MS_ABI); + gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES); + gcc_assert (!TARGET_SEH); + gcc_assert (TARGET_SSE); + gcc_assert (!ix86_using_red_zone ()); - /* __m128 is returned in xmm0. */ - if ((!type || VECTOR_INTEGER_TYPE_P (type) - || INTEGRAL_TYPE_P (type) - || VECTOR_FLOAT_TYPE_P (type)) - && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) - && !COMPLEX_MODE_P (mode) - && (GET_MODE_SIZE (mode) == 16 || size == 16)) - return false; + if (crtl->calls_eh_return) + { + gcc_assert (!reload_completed); + m->call_ms2sysv = false; + warn_once_call_ms2sysv_xlogues ("__builtin_eh_return"); + } - /* Otherwise, the size must be exactly in [1248]. */ - return size != 1 && size != 2 && size != 4 && size != 8; + else if (ix86_static_chain_on_stack) + { + gcc_assert (!reload_completed); + m->call_ms2sysv = false; + warn_once_call_ms2sysv_xlogues ("static call chains"); } + + /* Finally, compute which registers the stub will manage. */ else { - int needed_intregs, needed_sseregs; - - return examine_argument (mode, type, 1, - &needed_intregs, &needed_sseregs); + unsigned count = xlogue_layout::count_stub_managed_regs (); + m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS; + m->call_ms2sysv_pad_in = 0; } } - else - { - size = int_size_in_bytes (type); - - /* Intel MCU psABI returns scalars and aggregates no larger than 8 - bytes in registers. */ - if (TARGET_IAMCU) - return VECTOR_MODE_P (mode) || size < 0 || size > 8; - - if (mode == BLKmode) - return true; - - if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8) - return false; - if (VECTOR_MODE_P (mode) || mode == TImode) - { - /* User-created vectors small enough to fit in EAX. */ - if (size < 8) - return false; + frame->nregs = ix86_nsaved_regs (); + frame->nsseregs = ix86_nsaved_sseregs (); - /* Unless ABI prescibes otherwise, - MMX/3dNow values are returned in MM0 if available. */ - - if (size == 8) - return TARGET_VECT8_RETURNS || !TARGET_MMX; + /* 64-bit MS ABI seem to require stack alignment to be always 16, + except for function prologues, leaf functions and when the defult + incoming stack boundary is overriden at command line or via + force_align_arg_pointer attribute. - /* SSE values are returned in XMM0 if available. */ - if (size == 16) - return !TARGET_SSE; + Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants + at call sites, including profile function calls. 
+ */ + if (((TARGET_64BIT_MS_ABI || TARGET_MACHO) + && crtl->preferred_stack_boundary < 128) + && (!crtl->is_leaf || cfun->calls_alloca != 0 + || ix86_current_function_calls_tls_descriptor + || (TARGET_MACHO && crtl->profile) + || ix86_incoming_stack_boundary < 128)) + { + crtl->preferred_stack_boundary = 128; + crtl->stack_alignment_needed = 128; + } - /* AVX values are returned in YMM0 if available. */ - if (size == 32) - return !TARGET_AVX; + stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; + preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; - /* AVX512F values are returned in ZMM0 if available. */ - if (size == 64) - return !TARGET_AVX512F; - } + gcc_assert (!size || stack_alignment_needed); + gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); + gcc_assert (preferred_alignment <= stack_alignment_needed); - if (mode == XFmode) - return false; + /* The only ABI saving SSE regs should be 64-bit ms_abi. */ + gcc_assert (TARGET_64BIT || !frame->nsseregs); + if (TARGET_64BIT && m->call_ms2sysv) + { + gcc_assert (stack_alignment_needed >= 16); + gcc_assert (!frame->nsseregs); + } - if (size > 12) - return true; + /* For SEH we have to limit the amount of code movement into the prologue. + At present we do this via a BLOCKAGE, at which point there's very little + scheduling that can be done, which means that there's very little point + in doing anything except PUSHs. */ + if (TARGET_SEH) + m->use_fast_prologue_epilogue = false; + else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))) + { + int count = frame->nregs; + struct cgraph_node *node = cgraph_node::get (current_function_decl); - /* OImode shouldn't be used directly. */ - gcc_assert (mode != OImode); + /* The fast prologue uses move instead of push to save registers. This + is significantly longer, but also executes faster as modern hardware + can execute the moves in parallel, but can't do that for push/pop. - return false; + Be careful about choosing what prologue to emit: When function takes + many instructions to execute we may use slow version as well as in + case function is known to be outside hot spot (this is known with + feedback only). Weight the size of function by number of registers + to save as it is cheap to use one or two push instructions but very + slow to use many of them. */ + if (count) + count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; + if (node->frequency < NODE_FREQUENCY_NORMAL + || (flag_branch_probabilities + && node->frequency < NODE_FREQUENCY_HOT)) + m->use_fast_prologue_epilogue = false; + else + m->use_fast_prologue_epilogue + = !expensive_function_p (count); } -#endif -} - -/* Create the va_list data type. */ + frame->save_regs_using_mov + = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue + /* If static stack checking is enabled and done with probes, + the registers need to be saved before allocating the frame. */ + && flag_stack_check != STATIC_BUILTIN_STACK_CHECK); -static tree -ix86_build_builtin_va_list_64 (void) -{ - tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl; + /* Skip return address and error code in exception handler. */ + offset = INCOMING_FRAME_SP_OFFSET; - record = lang_hooks.types.make_type (RECORD_TYPE); - type_decl = build_decl (BUILTINS_LOCATION, - TYPE_DECL, get_identifier ("__va_list_tag"), record); + /* Skip pushed static chain. 
*/ + if (ix86_static_chain_on_stack) + offset += UNITS_PER_WORD; - f_gpr = build_decl (BUILTINS_LOCATION, - FIELD_DECL, get_identifier ("gp_offset"), - unsigned_type_node); - f_fpr = build_decl (BUILTINS_LOCATION, - FIELD_DECL, get_identifier ("fp_offset"), - unsigned_type_node); - f_ovf = build_decl (BUILTINS_LOCATION, - FIELD_DECL, get_identifier ("overflow_arg_area"), - ptr_type_node); - f_sav = build_decl (BUILTINS_LOCATION, - FIELD_DECL, get_identifier ("reg_save_area"), - ptr_type_node); + /* Skip saved base pointer. */ + if (frame_pointer_needed) + offset += UNITS_PER_WORD; + frame->hfp_save_offset = offset; - va_list_gpr_counter_field = f_gpr; - va_list_fpr_counter_field = f_fpr; + /* The traditional frame pointer location is at the top of the frame. */ + frame->hard_frame_pointer_offset = offset; - DECL_FIELD_CONTEXT (f_gpr) = record; - DECL_FIELD_CONTEXT (f_fpr) = record; - DECL_FIELD_CONTEXT (f_ovf) = record; - DECL_FIELD_CONTEXT (f_sav) = record; + /* Register save area */ + offset += frame->nregs * UNITS_PER_WORD; + frame->reg_save_offset = offset; - TYPE_STUB_DECL (record) = type_decl; - TYPE_NAME (record) = type_decl; - TYPE_FIELDS (record) = f_gpr; - DECL_CHAIN (f_gpr) = f_fpr; - DECL_CHAIN (f_fpr) = f_ovf; - DECL_CHAIN (f_ovf) = f_sav; + /* On SEH target, registers are pushed just before the frame pointer + location. */ + if (TARGET_SEH) + frame->hard_frame_pointer_offset = offset; - layout_type (record); + /* Calculate the size of the va-arg area (not including padding, if any). */ + frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size; - TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"), - NULL_TREE, TYPE_ATTRIBUTES (record)); + /* Also adjust stack_realign_offset for the largest alignment of + stack slot actually used. */ + if (stack_realign_fp + || (cfun->machine->max_used_stack_alignment != 0 + && (offset % cfun->machine->max_used_stack_alignment) != 0)) + { + /* We may need a 16-byte aligned stack for the remainder of the + register save area, but the stack frame for the local function + may require a greater alignment if using AVX/2/512. In order + to avoid wasting space, we first calculate the space needed for + the rest of the register saves, add that to the stack pointer, + and then realign the stack to the boundary of the start of the + frame for the local function. */ + HOST_WIDE_INT space_needed = 0; + HOST_WIDE_INT sse_reg_space_needed = 0; - /* The correct type is an array type of one element. */ - return build_array_type (record, build_index_type (size_zero_node)); -} + if (TARGET_64BIT) + { + if (m->call_ms2sysv) + { + m->call_ms2sysv_pad_in = 0; + space_needed = xlogue_layout::get_instance ().get_stack_space_used (); + } -/* Setup the builtin va_list data type and for 64-bit the additional - calling convention specific va_list data types. */ + else if (frame->nsseregs) + /* The only ABI that has saved SSE registers (Win64) also has a + 16-byte aligned default stack. However, many programs violate + the ABI, and Wine64 forces stack realignment to compensate. */ + space_needed = frame->nsseregs * 16; -static tree -ix86_build_builtin_va_list (void) -{ - if (TARGET_64BIT) - { - /* Initialize ABI specific va_list builtin types. + sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16); - In lto1, we can encounter two va_list types: - - one as a result of the type-merge across TUs, and - - the one constructed here. 
- These two types will not have the same TYPE_MAIN_VARIANT, and therefore - a type identity check in canonical_va_list_type based on - TYPE_MAIN_VARIANT (which we used to have) will not work. - Instead, we tag each va_list_type_node with its unique attribute, and - look for the attribute in the type identity check in - canonical_va_list_type. + /* 64-bit frame->va_arg_size should always be a multiple of 16, but + rounding to be pedantic. */ + space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16); + } + else + space_needed = frame->va_arg_size; - Tagging sysv_va_list_type_node directly with the attribute is - problematic since it's a array of one record, which will degrade into a - pointer to record when used as parameter (see build_va_arg comments for - an example), dropping the attribute in the process. So we tag the - record instead. */ + /* Record the allocation size required prior to the realignment AND. */ + frame->stack_realign_allocate = space_needed; - /* For SYSV_ABI we use an array of one record. */ - sysv_va_list_type_node = ix86_build_builtin_va_list_64 (); - - /* For MS_ABI we use plain pointer to argument area. */ - tree char_ptr_type = build_pointer_type (char_type_node); - tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE, - TYPE_ATTRIBUTES (char_ptr_type)); - ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr); + /* The re-aligned stack starts at frame->stack_realign_offset. Values + before this point are not directly comparable with values below + this point. Use sp_valid_at to determine if the stack pointer is + valid for a given offset, fp_valid_at for the frame pointer, or + choose_baseaddr to have a base register chosen for you. - return ((ix86_abi == MS_ABI) - ? ms_va_list_type_node - : sysv_va_list_type_node); + Note that the result of (frame->stack_realign_offset + & (stack_alignment_needed - 1)) may not equal zero. */ + offset = ROUND_UP (offset + space_needed, stack_alignment_needed); + frame->stack_realign_offset = offset - space_needed; + frame->sse_reg_save_offset = frame->stack_realign_offset + + sse_reg_space_needed; } else { - /* For i386 we use plain pointer to argument area. */ - return build_pointer_type (char_type_node); + frame->stack_realign_offset = offset; + + if (TARGET_64BIT && m->call_ms2sysv) + { + m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD); + offset += xlogue_layout::get_instance ().get_stack_space_used (); + } + + /* Align and set SSE register save area. */ + else if (frame->nsseregs) + { + /* If the incoming stack boundary is at least 16 bytes, or DRAP is + required and the DRAP re-alignment boundary is at least 16 bytes, + then we want the SSE register save area properly aligned. */ + if (ix86_incoming_stack_boundary >= 128 + || (stack_realign_drap && stack_alignment_needed >= 16)) + offset = ROUND_UP (offset, 16); + offset += frame->nsseregs * 16; + } + frame->sse_reg_save_offset = offset; + offset += frame->va_arg_size; } -} -/* Worker function for TARGET_SETUP_INCOMING_VARARGS. */ + /* Align start of frame for local function. When a function call + is removed, it may become a leaf function. But if argument may + be passed on stack, we need to align the stack when there is no + tail call. 
*/ + if (m->call_ms2sysv + || frame->va_arg_size != 0 + || size != 0 + || !crtl->is_leaf + || (!crtl->tail_call_emit + && cfun->machine->outgoing_args_on_stack) + || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor) + offset = ROUND_UP (offset, stack_alignment_needed); -static void -setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) -{ - rtx save_area, mem; - alias_set_type set; - int i, max; + /* Frame pointer points here. */ + frame->frame_pointer_offset = offset; - /* GPR size of varargs save area. */ - if (cfun->va_list_gpr_size) - ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD; - else - ix86_varargs_gpr_size = 0; + offset += size; - /* FPR size of varargs save area. We don't need it if we don't pass - anything in SSE registers. */ - if (TARGET_SSE && cfun->va_list_fpr_size) - ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16; + /* Add outgoing arguments area. Can be skipped if we eliminated + all the function calls as dead code. + Skipping is however impossible when function calls alloca. Alloca + expander assumes that last crtl->outgoing_args_size + of stack frame are unused. */ + if (ACCUMULATE_OUTGOING_ARGS + && (!crtl->is_leaf || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor)) + { + offset += crtl->outgoing_args_size; + frame->outgoing_arguments_size = crtl->outgoing_args_size; + } else - ix86_varargs_fpr_size = 0; + frame->outgoing_arguments_size = 0; - if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size) - return; + /* Align stack boundary. Only needed if we're calling another function + or using alloca. */ + if (!crtl->is_leaf || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor) + offset = ROUND_UP (offset, preferred_alignment); - save_area = frame_pointer_rtx; - set = get_varargs_alias_set (); + /* We've reached end of stack frame. */ + frame->stack_pointer_offset = offset; - max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; - if (max > X86_64_REGPARM_MAX) - max = X86_64_REGPARM_MAX; + /* Size prologue needs to allocate. */ + to_allocate = offset - frame->sse_reg_save_offset; - for (i = cum->regno; i < max; i++) + if ((!to_allocate && frame->nregs <= 1) + || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) + /* If stack clash probing needs a loop, then it needs a + scratch register. But the returned register is only guaranteed + to be safe to use after register saves are complete. So if + stack clash protections are enabled and the allocated frame is + larger than the probe interval, then use pushes to save + callee saved registers. */ + || (flag_stack_clash_protection && to_allocate > get_probe_interval ())) + frame->save_regs_using_mov = false; + + if (ix86_using_red_zone () + && crtl->sp_is_unchanging + && crtl->is_leaf + && !ix86_pc_thunk_call_expanded + && !ix86_current_function_calls_tls_descriptor) { - mem = gen_rtx_MEM (word_mode, - plus_constant (Pmode, save_area, i * UNITS_PER_WORD)); - MEM_NOTRAP_P (mem) = 1; - set_mem_alias_set (mem, set); - emit_move_insn (mem, - gen_rtx_REG (word_mode, - x86_64_int_parameter_registers[i])); + frame->red_zone_size = to_allocate; + if (frame->save_regs_using_mov) + frame->red_zone_size += frame->nregs * UNITS_PER_WORD; + if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE) + frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE; } + else + frame->red_zone_size = 0; + frame->stack_pointer_offset -= frame->red_zone_size; - if (ix86_varargs_fpr_size) + /* The SEH frame pointer location is near the bottom of the frame. 
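ix86_compute_frame_layout above is essentially one running offset that is bumped and re-aligned as each area of the frame is laid out. The sketch below walks only the plain path (no stack realignment, no ms2sysv stub, no SEH, no red zone) with invented sizes; every number and the ROUND_UP macro are illustrative, while the order of the steps follows the function above.

#include <stdio.h>

#define UNITS_PER_WORD 8                           /* assumes a 64-bit target */
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((long) (a) - 1))

int
main (void)
{
  /* Illustrative inputs.  */
  long nregs = 3;              /* saved general registers        */
  long nsseregs = 0;           /* SSE saves (64-bit ms_abi only) */
  long va_arg_size = 0;
  long locals = 40;            /* get_frame_size ()              */
  long outgoing_args = 32;     /* crtl->outgoing_args_size       */
  long stack_align = 16;
  long preferred_align = 16;
  int frame_pointer_needed = 1;

  long offset = UNITS_PER_WORD;        /* return address (INCOMING_FRAME_SP_OFFSET) */
  if (frame_pointer_needed)
    offset += UNITS_PER_WORD;          /* saved frame pointer */
  long hard_frame_pointer_offset = offset;

  offset += nregs * UNITS_PER_WORD;    /* general register save area */
  long reg_save_offset = offset;

  if (nsseregs)                        /* SSE save area, 16-byte aligned */
    offset = ROUND_UP (offset, 16) + nsseregs * 16;
  long sse_reg_save_offset = offset;

  offset += va_arg_size;
  offset = ROUND_UP (offset, stack_align);
  long frame_pointer_offset = offset;  /* "Frame pointer points here" */

  offset += locals;
  offset += outgoing_args;
  offset = ROUND_UP (offset, preferred_align);
  long stack_pointer_offset = offset;

  long to_allocate = stack_pointer_offset - sse_reg_save_offset;

  printf ("hfp=%ld regs=%ld sse=%ld fp=%ld sp=%ld allocate=%ld\n",
          hard_frame_pointer_offset, reg_save_offset, sse_reg_save_offset,
          frame_pointer_offset, stack_pointer_offset, to_allocate);
  return 0;
}

With these inputs the prologue would need to allocate 88 bytes beyond the pushes; that is the kind of to_allocate value the save_regs_using_mov and red-zone decisions above are based on.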
+ This is enforced by the fact that the difference between the + stack pointer and the frame pointer is limited to 240 bytes in + the unwind data structure. */ + if (TARGET_SEH) { - machine_mode smode; - rtx_code_label *label; - rtx test; - - /* Now emit code to save SSE registers. The AX parameter contains number - of SSE parameter registers used to call this function, though all we - actually check here is the zero/non-zero status. */ - - label = gen_label_rtx (); - test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx); - emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1), - label)); - - /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if - we used movdqa (i.e. TImode) instead? Perhaps even better would - be if we could determine the real mode of the data, via a hook - into pass_stdarg. Ignore all that for now. */ - smode = V4SFmode; - if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode)) - crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode); - - max = cum->sse_regno + cfun->va_list_fpr_size / 16; - if (max > X86_64_SSE_REGPARM_MAX) - max = X86_64_SSE_REGPARM_MAX; + HOST_WIDE_INT diff; - for (i = cum->sse_regno; i < max; ++i) + /* If we can leave the frame pointer where it is, do so. Also, returns + the establisher frame for __builtin_frame_address (0). */ + diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; + if (diff <= SEH_MAX_FRAME_SIZE + && (diff > 240 || (diff & 15) != 0) + && !crtl->accesses_prior_frames) { - mem = plus_constant (Pmode, save_area, - i * 16 + ix86_varargs_gpr_size); - mem = gen_rtx_MEM (smode, mem); - MEM_NOTRAP_P (mem) = 1; - set_mem_alias_set (mem, set); - set_mem_align (mem, GET_MODE_ALIGNMENT (smode)); - - emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i))); + /* Ideally we'd determine what portion of the local stack frame + (within the constraint of the lowest 240) is most heavily used. + But without that complication, simply bias the frame pointer + by 128 bytes so as to maximize the amount of the local stack + frame that is addressable with 8-bit offsets. */ + frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128; } - - emit_label (label); } } -static void -setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum) -{ - alias_set_type set = get_varargs_alias_set (); - int i; +/* This is semi-inlined memory_address_length, but simplified + since we know that we're always dealing with reg+offset, and + to avoid having to create and discard all that rtl. */ - /* Reset to zero, as there might be a sysv vaarg used - before. */ - ix86_varargs_gpr_size = 0; - ix86_varargs_fpr_size = 0; +static inline int +choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset) +{ + int len = 4; - for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++) + if (offset == 0) { - rtx reg, mem; + /* EBP and R13 cannot be encoded without an offset. */ + len = (regno == BP_REG || regno == R13_REG); + } + else if (IN_RANGE (offset, -128, 127)) + len = 1; - mem = gen_rtx_MEM (Pmode, - plus_constant (Pmode, virtual_incoming_args_rtx, - i * UNITS_PER_WORD)); - MEM_NOTRAP_P (mem) = 1; - set_mem_alias_set (mem, set); + /* ESP and R12 must be encoded with a SIB byte. */ + if (regno == SP_REG || regno == R12_REG) + len++; - reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]); - emit_move_insn (mem, reg); + return len; +} + +/* Determine if the stack pointer is valid for accessing the CFA_OFFSET in + the frame save area. The register is saved at CFA - CFA_OFFSET. 
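choose_baseaddr_len above scores a base register plus displacement by how many extra encoding bytes it costs: a zero displacement is free except on EBP/R13, displacements in [-128, 127] cost one byte, anything else four, and ESP/R12 always add a SIB byte. The same heuristic rewritten as a standalone function follows; the x86_reg enum replaces GCC's hard-register numbers, which are not available outside the compiler.

#include <stdio.h>

/* Stand-ins for the hard registers the heuristic singles out.  */
enum x86_reg { REG_AX, REG_SP, REG_BP, REG_R12, REG_R13 };

/* Same logic as choose_baseaddr_len: extra encoding bytes needed to
   address (REG + OFFSET).  */
static int
baseaddr_len (enum x86_reg reg, long offset)
{
  int len = 4;                        /* 32-bit displacement */

  if (offset == 0)
    /* EBP and R13 cannot be encoded without a displacement.  */
    len = (reg == REG_BP || reg == REG_R13);
  else if (offset >= -128 && offset <= 127)
    len = 1;                          /* 8-bit displacement */

  /* ESP and R12 always need a SIB byte.  */
  if (reg == REG_SP || reg == REG_R12)
    len++;

  return len;
}

int
main (void)
{
  printf ("bp + 0   -> %d\n", baseaddr_len (REG_BP, 0));    /* 1 */
  printf ("ax + 0   -> %d\n", baseaddr_len (REG_AX, 0));    /* 0 */
  printf ("sp - 64  -> %d\n", baseaddr_len (REG_SP, -64));  /* 2 */
  printf ("bp + 512 -> %d\n", baseaddr_len (REG_BP, 512));  /* 4 */
  return 0;
}

choose_basereg, just below, feeds lengths like these into its size-optimized base-register selection.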
*/ + +static bool +sp_valid_at (HOST_WIDE_INT cfa_offset) +{ + const struct machine_frame_state &fs = cfun->machine->fs; + if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset) + { + /* Validate that the cfa_offset isn't in a "no-man's land". */ + gcc_assert (cfa_offset <= fs.sp_realigned_fp_last); + return false; + } + return fs.sp_valid; +} + +/* Determine if the frame pointer is valid for accessing the CFA_OFFSET in + the frame save area. The register is saved at CFA - CFA_OFFSET. */ + +static inline bool +fp_valid_at (HOST_WIDE_INT cfa_offset) +{ + const struct machine_frame_state &fs = cfun->machine->fs; + if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last) + { + /* Validate that the cfa_offset isn't in a "no-man's land". */ + gcc_assert (cfa_offset >= fs.sp_realigned_offset); + return false; } + return fs.fp_valid; } +/* Choose a base register based upon alignment requested, speed and/or + size. */ + static void -ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, - tree type, int *, int no_rtl) +choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg, + HOST_WIDE_INT &base_offset, + unsigned int align_reqested, unsigned int *align) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - CUMULATIVE_ARGS next_cum; - tree fntype; + const struct machine_function *m = cfun->machine; + unsigned int hfp_align; + unsigned int drap_align; + unsigned int sp_align; + bool hfp_ok = fp_valid_at (cfa_offset); + bool drap_ok = m->fs.drap_valid; + bool sp_ok = sp_valid_at (cfa_offset); - /* This argument doesn't appear to be used anymore. Which is good, - because the old code here didn't suppress rtl generation. */ - gcc_assert (!no_rtl); + hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY; - if (!TARGET_64BIT) - return; + /* Filter out any registers that don't meet the requested alignment + criteria. */ + if (align_reqested) + { + if (m->fs.realigned) + hfp_align = drap_align = sp_align = crtl->stack_alignment_needed; + /* SEH unwind code does do not currently support REG_CFA_EXPRESSION + notes (which we would need to use a realigned stack pointer), + so disable on SEH targets. */ + else if (m->fs.sp_realigned) + sp_align = crtl->stack_alignment_needed; - fntype = TREE_TYPE (current_function_decl); + hfp_ok = hfp_ok && hfp_align >= align_reqested; + drap_ok = drap_ok && drap_align >= align_reqested; + sp_ok = sp_ok && sp_align >= align_reqested; + } - /* For varargs, we do not want to skip the dummy va_dcl argument. - For stdargs, we do want to skip the last named argument. */ - next_cum = *cum; - if (stdarg_p (fntype)) - ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, - true); + if (m->use_fast_prologue_epilogue) + { + /* Choose the base register most likely to allow the most scheduling + opportunities. Generally FP is valid throughout the function, + while DRAP must be reloaded within the epilogue. But choose either + over the SP due to increased encoding size. */ - if (cum->call_abi == MS_ABI) - setup_incoming_varargs_ms_64 (&next_cum); + if (hfp_ok) + { + base_reg = hard_frame_pointer_rtx; + base_offset = m->fs.fp_offset - cfa_offset; + } + else if (drap_ok) + { + base_reg = crtl->drap_reg; + base_offset = 0 - cfa_offset; + } + else if (sp_ok) + { + base_reg = stack_pointer_rtx; + base_offset = m->fs.sp_offset - cfa_offset; + } + } else - setup_incoming_varargs_64 (&next_cum); + { + HOST_WIDE_INT toffset; + int len = 16, tlen; + + /* Choose the base register with the smallest address encoding. 
+ With a tie, choose FP > DRAP > SP. */ + if (sp_ok) + { + base_reg = stack_pointer_rtx; + base_offset = m->fs.sp_offset - cfa_offset; + len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset); + } + if (drap_ok) + { + toffset = 0 - cfa_offset; + tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset); + if (tlen <= len) + { + base_reg = crtl->drap_reg; + base_offset = toffset; + len = tlen; + } + } + if (hfp_ok) + { + toffset = m->fs.fp_offset - cfa_offset; + tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset); + if (tlen <= len) + { + base_reg = hard_frame_pointer_rtx; + base_offset = toffset; + len = tlen; + } + } + } + + /* Set the align return value. */ + if (align) + { + if (base_reg == stack_pointer_rtx) + *align = sp_align; + else if (base_reg == crtl->drap_reg) + *align = drap_align; + else if (base_reg == hard_frame_pointer_rtx) + *align = hfp_align; + } } -static void -ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v, - machine_mode mode, - tree type, - int *pretend_size ATTRIBUTE_UNUSED, - int no_rtl) +/* Return an RTX that points to CFA_OFFSET within the stack frame and + the alignment of address. If ALIGN is non-null, it should point to + an alignment value (in bits) that is preferred or zero and will + recieve the alignment of the base register that was selected, + irrespective of rather or not CFA_OFFSET is a multiple of that + alignment value. If it is possible for the base register offset to be + non-immediate then SCRATCH_REGNO should specify a scratch register to + use. + + The valid base registers are taken from CFUN->MACHINE->FS. */ + +static rtx +choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align, + unsigned int scratch_regno = INVALID_REGNUM) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - CUMULATIVE_ARGS next_cum; - tree fntype; + rtx base_reg = NULL; + HOST_WIDE_INT base_offset = 0; - gcc_assert (!no_rtl); + /* If a specific alignment is requested, try to get a base register + with that alignment first. */ + if (align && *align) + choose_basereg (cfa_offset, base_reg, base_offset, *align, align); - /* Do nothing if we use plain pointer to argument area. */ - if (!TARGET_64BIT || cum->call_abi == MS_ABI) - return; + if (!base_reg) + choose_basereg (cfa_offset, base_reg, base_offset, 0, align); - fntype = TREE_TYPE (current_function_decl); + gcc_assert (base_reg != NULL); - /* For varargs, we do not want to skip the dummy va_dcl argument. - For stdargs, we do want to skip the last named argument. */ - next_cum = *cum; - if (stdarg_p (fntype)) - ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, - true); -} + rtx base_offset_rtx = GEN_INT (base_offset); + if (!x86_64_immediate_operand (base_offset_rtx, Pmode)) + { + gcc_assert (scratch_regno != INVALID_REGNUM); -/* Checks if TYPE is of kind va_list char *. */ + rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + emit_move_insn (scratch_reg, base_offset_rtx); -static bool -is_va_list_char_pointer (tree type) -{ - tree canonic; + return gen_rtx_PLUS (Pmode, base_reg, scratch_reg); + } - /* For 32-bit it is always true. */ - if (!TARGET_64BIT) - return true; - canonic = ix86_canonical_va_list_type (type); - return (canonic == ms_va_list_type_node - || (ix86_abi == MS_ABI && canonic == va_list_type_node)); + return plus_constant (Pmode, base_reg, base_offset); } -/* Implement va_start. */ +/* Emit code to save registers in the prologue. 
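In the size-optimized branch of choose_basereg above, the candidates are evaluated in the order SP, DRAP, FP, and a later candidate wins whenever its encoding is no longer than the current best ("tlen <= len"), so equal lengths resolve to FP over DRAP over SP. A minimal sketch of that selection rule follows; the candidate struct, pick_base and the sample lengths are invented for the illustration.

#include <stdio.h>

struct candidate
{
  const char *name;
  int valid;
  int len;    /* encoding cost, e.g. a choose_baseaddr_len-style score */
};

/* Keep the last candidate whose length is <= the best so far, mirroring
   the "tlen <= len" comparisons in choose_basereg.  */
static const char *
pick_base (const struct candidate *c, int n)
{
  const char *best = "none";
  int len = 16;    /* same out-of-range starting value as the GCC code */

  for (int i = 0; i < n; i++)
    if (c[i].valid && c[i].len <= len)
      {
        best = c[i].name;
        len = c[i].len;
      }
  return best;
}

int
main (void)
{
  /* Order matters: SP first, DRAP second, FP last, so FP wins ties.  */
  struct candidate c[] = {
    { "sp", 1, 1 },
    { "drap", 1, 1 },
    { "fp", 1, 1 },
  };

  printf ("chosen base: %s\n", pick_base (c, 3));   /* fp */
  return 0;
}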
*/ static void -ix86_va_start (tree valist, rtx nextarg) +ix86_emit_save_regs (void) { - HOST_WIDE_INT words, n_gpr, n_fpr; - tree f_gpr, f_fpr, f_ovf, f_sav; - tree gpr, fpr, ovf, sav, t; - tree type; - rtx ovf_rtx; - - if (flag_split_stack - && cfun->machine->split_stack_varargs_pointer == NULL_RTX) - { - unsigned int scratch_regno; + unsigned int regno; + rtx_insn *insn; - /* When we are splitting the stack, we can't refer to the stack - arguments using internal_arg_pointer, because they may be on - the old stack. The split stack prologue will arrange to - leave a pointer to the old stack arguments in a scratch - register, which we here copy to a pseudo-register. The split - stack prologue can't set the pseudo-register directly because - it (the prologue) runs before any registers have been saved. */ + for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; ) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); + RTX_FRAME_RELATED_P (insn) = 1; + } +} - scratch_regno = split_stack_prologue_scratch_regno (); - if (scratch_regno != INVALID_REGNUM) - { - rtx reg; - rtx_insn *seq; +/* Emit a single register save at CFA - CFA_OFFSET. */ - reg = gen_reg_rtx (Pmode); - cfun->machine->split_stack_varargs_pointer = reg; +static void +ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno, + HOST_WIDE_INT cfa_offset) +{ + struct machine_function *m = cfun->machine; + rtx reg = gen_rtx_REG (mode, regno); + rtx mem, addr, base, insn; + unsigned int align = GET_MODE_ALIGNMENT (mode); - start_sequence (); - emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); - seq = get_insns (); - end_sequence (); + addr = choose_baseaddr (cfa_offset, &align); + mem = gen_frame_mem (mode, addr); - push_topmost_sequence (); - emit_insn_after (seq, entry_of_function ()); - pop_topmost_sequence (); - } - } + /* The location aligment depends upon the base register. */ + align = MIN (GET_MODE_ALIGNMENT (mode), align); + gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); + set_mem_align (mem, align); - /* Only 64bit target needs something special. */ - if (is_va_list_char_pointer (TREE_TYPE (valist))) + insn = emit_insn (gen_rtx_SET (mem, reg)); + RTX_FRAME_RELATED_P (insn) = 1; + + base = addr; + if (GET_CODE (base) == PLUS) + base = XEXP (base, 0); + gcc_checking_assert (REG_P (base)); + + /* When saving registers into a re-aligned local stack frame, avoid + any tricky guessing by dwarf2out. */ + if (m->fs.realigned) { - if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) - std_expand_builtin_va_start (valist, nextarg); + gcc_checking_assert (stack_realign_drap); + + if (regno == REGNO (crtl->drap_reg)) + { + /* A bit of a hack. We force the DRAP register to be saved in + the re-aligned stack frame, which provides us with a copy + of the CFA that will last past the prologue. Install it. */ + gcc_checking_assert (cfun->machine->fs.fp_valid); + addr = plus_constant (Pmode, hard_frame_pointer_rtx, + cfun->machine->fs.fp_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_DEF_CFA, mem); + } else { - rtx va_r, next; - - va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE); - next = expand_binop (ptr_mode, add_optab, - cfun->machine->split_stack_varargs_pointer, - crtl->args.arg_offset_rtx, - NULL_RTX, 0, OPTAB_LIB_WIDEN); - convert_move (va_r, next, 0); + /* The frame pointer is a stable reference within the + aligned frame. Use it. 
*/ + gcc_checking_assert (cfun->machine->fs.fp_valid); + addr = plus_constant (Pmode, hard_frame_pointer_rtx, + cfun->machine->fs.fp_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); } - return; } - f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); - f_fpr = DECL_CHAIN (f_gpr); - f_ovf = DECL_CHAIN (f_fpr); - f_sav = DECL_CHAIN (f_ovf); - - valist = build_simple_mem_ref (valist); - TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node); - /* The following should be folded into the MEM_REF offset. */ - gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist), - f_gpr, NULL_TREE); - fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist), - f_fpr, NULL_TREE); - ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist), - f_ovf, NULL_TREE); - sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist), - f_sav, NULL_TREE); - - /* Count number of gp and fp argument registers used. */ - words = crtl->args.info.words; - n_gpr = crtl->args.info.regno; - n_fpr = crtl->args.info.sse_regno; - - if (cfun->va_list_gpr_size) + else if (base == stack_pointer_rtx && m->fs.sp_realigned + && cfa_offset >= m->fs.sp_realigned_offset) { - type = TREE_TYPE (gpr); - t = build2 (MODIFY_EXPR, type, - gpr, build_int_cst (type, n_gpr * 8)); - TREE_SIDE_EFFECTS (t) = 1; - expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + gcc_checking_assert (stack_realign_fp); + add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); } - if (TARGET_SSE && cfun->va_list_fpr_size) + /* The memory may not be relative to the current CFA register, + which means that we may need to generate a new pattern for + use by the unwind info. */ + else if (base != m->fs.cfa_reg) { - type = TREE_TYPE (fpr); - t = build2 (MODIFY_EXPR, type, fpr, - build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX)); - TREE_SIDE_EFFECTS (t) = 1; - expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + addr = plus_constant (Pmode, m->fs.cfa_reg, + m->fs.cfa_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg)); } +} - /* Find the overflow area. */ - type = TREE_TYPE (ovf); - if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) - ovf_rtx = crtl->args.internal_arg_pointer; - else - ovf_rtx = cfun->machine->split_stack_varargs_pointer; - t = make_tree (type, ovf_rtx); - if (words != 0) - t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD); +/* Emit code to save registers using MOV insns. + First register is stored at CFA - CFA_OFFSET. */ +static void +ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) +{ + unsigned int regno; - t = build2 (MODIFY_EXPR, type, ovf, t); - TREE_SIDE_EFFECTS (t) = 1; - expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + cfa_offset -= UNITS_PER_WORD; + } +} - if (ix86_varargs_gpr_size || ix86_varargs_fpr_size) - { - /* Find the register save area. - Prologue of the function save it right above stack frame. */ - type = TREE_TYPE (sav); - t = make_tree (type, frame_pointer_rtx); - if (!ix86_varargs_gpr_size) - t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX); +/* Emit code to save SSE registers using MOV insns. + First register is stored at CFA - CFA_OFFSET. 
*/ +static void +ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset) +{ + unsigned int regno; - t = build2 (MODIFY_EXPR, type, sav, t); - TREE_SIDE_EFFECTS (t) = 1; - expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); - } + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset); + cfa_offset -= GET_MODE_SIZE (V4SFmode); + } } -/* Implement va_arg. */ +static GTY(()) rtx queued_cfa_restores; -static tree -ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, - gimple_seq *post_p) +/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack + manipulation insn. The value is on the stack at CFA - CFA_OFFSET. + Don't add the note if the previously saved value will be left untouched + within stack red-zone till return, as unwinders can find the same value + in the register and on the stack. */ + +static void +ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset) { - static const int intreg[6] = { 0, 1, 2, 3, 4, 5 }; - tree f_gpr, f_fpr, f_ovf, f_sav; - tree gpr, fpr, ovf, sav, t; - int size, rsize; - tree lab_false, lab_over = NULL_TREE; - tree addr, t2; - rtx container; - int indirect_p = 0; - tree ptrtype; - machine_mode nat_mode; - unsigned int arg_boundary; - unsigned int type_align; + if (!crtl->shrink_wrapped + && cfa_offset <= cfun->machine->fs.red_zone_offset) + return; - /* Only 64bit target needs something special. */ - if (is_va_list_char_pointer (TREE_TYPE (valist))) - return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); + if (insn) + { + add_reg_note (insn, REG_CFA_RESTORE, reg); + RTX_FRAME_RELATED_P (insn) = 1; + } + else + queued_cfa_restores + = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores); +} - f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); - f_fpr = DECL_CHAIN (f_gpr); - f_ovf = DECL_CHAIN (f_fpr); - f_sav = DECL_CHAIN (f_ovf); +/* Add queued REG_CFA_RESTORE notes if any to INSN. */ - gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), - valist, f_gpr, NULL_TREE); +static void +ix86_add_queued_cfa_restore_notes (rtx insn) +{ + rtx last; + if (!queued_cfa_restores) + return; + for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1)) + ; + XEXP (last, 1) = REG_NOTES (insn); + REG_NOTES (insn) = queued_cfa_restores; + queued_cfa_restores = NULL_RTX; + RTX_FRAME_RELATED_P (insn) = 1; +} - fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE); - ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE); - sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE); +/* Expand prologue or epilogue stack adjustment. + The pattern exist to put a dependency on all ebp-based memory accesses. + STYLE should be negative if instructions should be marked as frame related, + zero if %r11 register is live and cannot be freely used and positive + otherwise. 
*/ - indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false); - if (indirect_p) - type = build_pointer_type (type); - size = arg_int_size_in_bytes (type); - rsize = CEIL (size, UNITS_PER_WORD); +static rtx +pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, + int style, bool set_cfa) +{ + struct machine_function *m = cfun->machine; + rtx insn; + bool add_frame_related_expr = false; - nat_mode = type_natural_mode (type, NULL, false); - switch (nat_mode) + if (Pmode == SImode) + insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset); + else if (x86_64_immediate_operand (offset, DImode)) + insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset); + else { - case E_V8SFmode: - case E_V8SImode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - case E_V16SFmode: - case E_V16SImode: - case E_V64QImode: - case E_V32HImode: - case E_V8DFmode: - case E_V8DImode: - /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ - if (!TARGET_64BIT_MS_ABI) + rtx tmp; + /* r11 is used by indirect sibcall return as well, set before the + epilogue and used after the epilogue. */ + if (style) + tmp = gen_rtx_REG (DImode, R11_REG); + else { - container = NULL; - break; + gcc_assert (src != hard_frame_pointer_rtx + && dest != hard_frame_pointer_rtx); + tmp = hard_frame_pointer_rtx; } - /* FALLTHRU */ + insn = emit_insn (gen_rtx_SET (tmp, offset)); + if (style < 0) + add_frame_related_expr = true; - default: - container = construct_container (nat_mode, TYPE_MODE (type), - type, 0, X86_64_REGPARM_MAX, - X86_64_SSE_REGPARM_MAX, intreg, - 0); - break; + insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp); } - /* Pull the value out of the saved registers. */ - - addr = create_tmp_var (ptr_type_node, "addr"); - type_align = TYPE_ALIGN (type); + insn = emit_insn (insn); + if (style >= 0) + ix86_add_queued_cfa_restore_notes (insn); - if (container) + if (set_cfa) { - int needed_intregs, needed_sseregs; - bool need_temp; - tree int_addr, sse_addr; - - lab_false = create_artificial_label (UNKNOWN_LOCATION); - lab_over = create_artificial_label (UNKNOWN_LOCATION); - - examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs); + rtx r; - need_temp = (!REG_P (container) - && ((needed_intregs && TYPE_ALIGN (type) > 64) - || TYPE_ALIGN (type) > 128)); + gcc_assert (m->fs.cfa_reg == src); + m->fs.cfa_offset += INTVAL (offset); + m->fs.cfa_reg = dest; - /* In case we are passing structure, verify that it is consecutive block - on the register save area. If not we need to do moves. 
*/ - if (!need_temp && !REG_P (container)) + r = gen_rtx_PLUS (Pmode, src, offset); + r = gen_rtx_SET (dest, r); + add_reg_note (insn, REG_CFA_ADJUST_CFA, r); + RTX_FRAME_RELATED_P (insn) = 1; + } + else if (style < 0) + { + RTX_FRAME_RELATED_P (insn) = 1; + if (add_frame_related_expr) { - /* Verify that all registers are strictly consecutive */ - if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0)))) - { - int i; + rtx r = gen_rtx_PLUS (Pmode, src, offset); + r = gen_rtx_SET (dest, r); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, r); + } + } - for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) - { - rtx slot = XVECEXP (container, 0, i); - if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i - || INTVAL (XEXP (slot, 1)) != i * 16) - need_temp = true; - } - } - else - { - int i; + if (dest == stack_pointer_rtx) + { + HOST_WIDE_INT ooffset = m->fs.sp_offset; + bool valid = m->fs.sp_valid; + bool realigned = m->fs.sp_realigned; - for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) - { - rtx slot = XVECEXP (container, 0, i); - if (REGNO (XEXP (slot, 0)) != (unsigned int) i - || INTVAL (XEXP (slot, 1)) != i * 8) - need_temp = true; - } - } - } - if (!need_temp) - { - int_addr = addr; - sse_addr = addr; - } - else + if (src == hard_frame_pointer_rtx) { - int_addr = create_tmp_var (ptr_type_node, "int_addr"); - sse_addr = create_tmp_var (ptr_type_node, "sse_addr"); + valid = m->fs.fp_valid; + realigned = false; + ooffset = m->fs.fp_offset; } - - /* First ensure that we fit completely in registers. */ - if (needed_intregs) + else if (src == crtl->drap_reg) { - t = build_int_cst (TREE_TYPE (gpr), - (X86_64_REGPARM_MAX - needed_intregs + 1) * 8); - t = build2 (GE_EXPR, boolean_type_node, gpr, t); - t2 = build1 (GOTO_EXPR, void_type_node, lab_false); - t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); - gimplify_and_add (t, pre_p); + valid = m->fs.drap_valid; + realigned = false; + ooffset = 0; } - if (needed_sseregs) + else { - t = build_int_cst (TREE_TYPE (fpr), - (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16 - + X86_64_REGPARM_MAX * 8); - t = build2 (GE_EXPR, boolean_type_node, fpr, t); - t2 = build1 (GOTO_EXPR, void_type_node, lab_false); - t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); - gimplify_and_add (t, pre_p); + /* Else there are two possibilities: SP itself, which we set + up as the default above. Or EH_RETURN_STACKADJ_RTX, which is + taken care of this by hand along the eh_return path. */ + gcc_checking_assert (src == stack_pointer_rtx + || offset == const0_rtx); } - /* Compute index to start of area used for integer regs. */ - if (needed_intregs) - { - /* int_addr = gpr + sav; */ - t = fold_build_pointer_plus (sav, gpr); - gimplify_assign (int_addr, t, pre_p); - } - if (needed_sseregs) - { - /* sse_addr = fpr + sav; */ - t = fold_build_pointer_plus (sav, fpr); - gimplify_assign (sse_addr, t, pre_p); - } - if (need_temp) - { - int i, prev_size = 0; - tree temp = create_tmp_var (type, "va_arg_tmp"); + m->fs.sp_offset = ooffset - INTVAL (offset); + m->fs.sp_valid = valid; + m->fs.sp_realigned = realigned; + } + return insn; +} - /* addr = &temp; */ - t = build1 (ADDR_EXPR, build_pointer_type (type), temp); - gimplify_assign (addr, t, pre_p); +/* Find an available register to be used as dynamic realign argument + pointer regsiter. Such a register will be written in prologue and + used in begin of body, so it must not be + 1. parameter passing register. + 2. GOT pointer. + We reuse static-chain register if it is available. 
Otherwise, we + use DI for i386 and R13 for x86-64. We chose R13 since it has + shorter encoding. - for (i = 0; i < XVECLEN (container, 0); i++) - { - rtx slot = XVECEXP (container, 0, i); - rtx reg = XEXP (slot, 0); - machine_mode mode = GET_MODE (reg); - tree piece_type; - tree addr_type; - tree daddr_type; - tree src_addr, src; - int src_offset; - tree dest_addr, dest; - int cur_size = GET_MODE_SIZE (mode); + Return: the regno of chosen register. */ - gcc_assert (prev_size <= INTVAL (XEXP (slot, 1))); - prev_size = INTVAL (XEXP (slot, 1)); - if (prev_size + cur_size > size) - { - cur_size = size - prev_size; - unsigned int nbits = cur_size * BITS_PER_UNIT; - if (!int_mode_for_size (nbits, 1).exists (&mode)) - mode = QImode; - } - piece_type = lang_hooks.types.type_for_mode (mode, 1); - if (mode == GET_MODE (reg)) - addr_type = build_pointer_type (piece_type); - else - addr_type = build_pointer_type_for_mode (piece_type, ptr_mode, - true); - daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode, - true); +static unsigned int +find_drap_reg (void) +{ + tree decl = cfun->decl; - if (SSE_REGNO_P (REGNO (reg))) - { - src_addr = sse_addr; - src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16; - } - else - { - src_addr = int_addr; - src_offset = REGNO (reg) * 8; - } - src_addr = fold_convert (addr_type, src_addr); - src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset); + /* Always use callee-saved register if there are no caller-saved + registers. */ + if (TARGET_64BIT) + { + /* Use R13 for nested function or function need static chain. + Since function with tail call may use any caller-saved + registers in epilogue, DRAP must not use caller-saved + register in such case. */ + if (DECL_STATIC_CHAIN (decl) + || cfun->machine->no_caller_saved_registers + || crtl->tail_call_emit) + return R13_REG; - dest_addr = fold_convert (daddr_type, addr); - dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size); - if (cur_size == GET_MODE_SIZE (mode)) - { - src = build_va_arg_indirect_ref (src_addr); - dest = build_va_arg_indirect_ref (dest_addr); + return R10_REG; + } + else + { + /* Use DI for nested function or function need static chain. + Since function with tail call may use any caller-saved + registers in epilogue, DRAP must not use caller-saved + register in such case. */ + if (DECL_STATIC_CHAIN (decl) + || cfun->machine->no_caller_saved_registers + || crtl->tail_call_emit) + return DI_REG; - gimplify_assign (dest, src, pre_p); - } - else - { - tree copy - = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY), - 3, dest_addr, src_addr, - size_int (cur_size)); - gimplify_and_add (copy, pre_p); - } - prev_size += cur_size; - } - } - - if (needed_intregs) - { - t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, - build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); - gimplify_assign (gpr, t, pre_p); - /* The GPR save area guarantees only 8-byte alignment. */ - if (!need_temp) - type_align = MIN (type_align, 64); - } - - if (needed_sseregs) + /* Reuse static chain register if it isn't used for parameter + passing. 
*/ + if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2) { - t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, - build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); - gimplify_assign (unshare_expr (fpr), t, pre_p); + unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl)); + if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0) + return CX_REG; } - - gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over)); - - gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false)); - } - - /* ... otherwise out of the overflow area. */ - - /* When we align parameter on stack for caller, if the parameter - alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be - aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee - here with caller. */ - arg_boundary = ix86_function_arg_boundary (VOIDmode, type); - if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT) - arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT; - - /* Care for on-stack alignment if needed. */ - if (arg_boundary <= 64 || size == 0) - t = ovf; - else - { - HOST_WIDE_INT align = arg_boundary / 8; - t = fold_build_pointer_plus_hwi (ovf, align - 1); - t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, - build_int_cst (TREE_TYPE (t), -align)); + return DI_REG; } +} - gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); - gimplify_assign (addr, t, pre_p); +/* Return minimum incoming stack alignment. */ - t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD); - gimplify_assign (unshare_expr (ovf), t, pre_p); +static unsigned int +ix86_minimum_incoming_stack_boundary (bool sibcall) +{ + unsigned int incoming_stack_boundary; - if (container) - gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over)); + /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */ + if (cfun->machine->func_type != TYPE_NORMAL) + incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY; + /* Prefer the one specified at command line. */ + else if (ix86_user_incoming_stack_boundary) + incoming_stack_boundary = ix86_user_incoming_stack_boundary; + /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary + if -mstackrealign is used, it isn't used for sibcall check and + estimated stack alignment is 128bit. */ + else if (!sibcall + && ix86_force_align_arg_pointer + && crtl->stack_alignment_estimated == 128) + incoming_stack_boundary = MIN_STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; - type = build_aligned_type (type, type_align); - ptrtype = build_pointer_type_for_mode (type, ptr_mode, true); - addr = fold_convert (ptrtype, addr); + /* Incoming stack alignment can be changed on individual functions + via force_align_arg_pointer attribute. We use the smallest + incoming stack boundary. */ + if (incoming_stack_boundary > MIN_STACK_BOUNDARY + && lookup_attribute ("force_align_arg_pointer", + TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) + incoming_stack_boundary = MIN_STACK_BOUNDARY; - if (indirect_p) - addr = build_va_arg_indirect_ref (addr); - return build_va_arg_indirect_ref (addr); -} - -/* Return true if OPNUM's MEM should be matched - in movabs* patterns. */ + /* The incoming stack frame has to be aligned at least at + parm_stack_boundary. */ + if (incoming_stack_boundary < crtl->parm_stack_boundary) + incoming_stack_boundary = crtl->parm_stack_boundary; -bool -ix86_check_movabs (rtx insn, int opnum) -{ - rtx set, mem; + /* Stack at entrance of main is aligned by runtime. We use the + smallest incoming stack boundary. 
*/ + if (incoming_stack_boundary > MAIN_STACK_BOUNDARY + && DECL_NAME (current_function_decl) + && MAIN_NAME_P (DECL_NAME (current_function_decl)) + && DECL_FILE_SCOPE_P (current_function_decl)) + incoming_stack_boundary = MAIN_STACK_BOUNDARY; - set = PATTERN (insn); - if (GET_CODE (set) == PARALLEL) - set = XVECEXP (set, 0, 0); - gcc_assert (GET_CODE (set) == SET); - mem = XEXP (set, opnum); - while (SUBREG_P (mem)) - mem = SUBREG_REG (mem); - gcc_assert (MEM_P (mem)); - return volatile_ok || !MEM_VOLATILE_P (mem); + return incoming_stack_boundary; } -/* Return false if INSN contains a MEM with a non-default address space. */ -bool -ix86_check_no_addr_space (rtx insn) -{ - subrtx_var_iterator::array_type array; - FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL) - { - rtx x = *iter; - if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))) - return false; - } - return true; -} - -/* Initialize the table of extra 80387 mathematical constants. */ +/* Update incoming stack boundary and estimated stack alignment. */ static void -init_ext_80387_constants (void) +ix86_update_stack_boundary (void) { - static const char * cst[5] = - { - "0.3010299956639811952256464283594894482", /* 0: fldlg2 */ - "0.6931471805599453094286904741849753009", /* 1: fldln2 */ - "1.4426950408889634073876517827983434472", /* 2: fldl2e */ - "3.3219280948873623478083405569094566090", /* 3: fldl2t */ - "3.1415926535897932385128089594061862044", /* 4: fldpi */ - }; - int i; + ix86_incoming_stack_boundary + = ix86_minimum_incoming_stack_boundary (false); - for (i = 0; i < 5; i++) - { - real_from_string (&ext_80387_constants_table[i], cst[i]); - /* Ensure each constant is rounded to XFmode precision. */ - real_convert (&ext_80387_constants_table[i], - XFmode, &ext_80387_constants_table[i]); - } + /* x86_64 vararg needs 16byte stack alignment for register save area. */ + if (TARGET_64BIT + && cfun->stdarg + && crtl->stack_alignment_estimated < 128) + crtl->stack_alignment_estimated = 128; - ext_80387_constants_init = 1; + /* __tls_get_addr needs to be called with 16-byte aligned stack. */ + if (ix86_tls_descriptor_calls_expanded_in_cfun + && crtl->preferred_stack_boundary < 128) + crtl->preferred_stack_boundary = 128; } -/* Return non-zero if the constant is something that - can be loaded with a special instruction. */ +/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is + needed or an rtx for DRAP otherwise. */ -int -standard_80387_constant_p (rtx x) +static rtx +ix86_get_drap_rtx (void) { - machine_mode mode = GET_MODE (x); - - const REAL_VALUE_TYPE *r; - - if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode))) - return -1; - - if (x == CONST0_RTX (mode)) - return 1; - if (x == CONST1_RTX (mode)) - return 2; - - r = CONST_DOUBLE_REAL_VALUE (x); + /* We must use DRAP if there are outgoing arguments on stack and + ACCUMULATE_OUTGOING_ARGS is false. */ + if (ix86_force_drap + || (cfun->machine->outgoing_args_on_stack + && !ACCUMULATE_OUTGOING_ARGS)) + crtl->need_drap = true; - /* For XFmode constants, try to find a special 80387 instruction when - optimizing for size or on those CPUs that benefit from them. */ - if (mode == XFmode - && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) + if (stack_realign_drap) { - int i; + /* Assign DRAP to vDRAP and returns vDRAP */ + unsigned int regno = find_drap_reg (); + rtx drap_vreg; + rtx arg_ptr; + rtx_insn *seq, *insn; - if (! 
ext_80387_constants_init) - init_ext_80387_constants (); + arg_ptr = gen_rtx_REG (Pmode, regno); + crtl->drap_reg = arg_ptr; - for (i = 0; i < 5; i++) - if (real_identical (r, &ext_80387_constants_table[i])) - return i + 3; + start_sequence (); + drap_vreg = copy_to_reg (arg_ptr); + seq = get_insns (); + end_sequence (); + + insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); + if (!optimize) + { + add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); + RTX_FRAME_RELATED_P (insn) = 1; + } + return drap_vreg; } + else + return NULL; +} - /* Load of the constant -0.0 or -1.0 will be split as - fldz;fchs or fld1;fchs sequence. */ - if (real_isnegzero (r)) - return 8; - if (real_identical (r, &dconstm1)) - return 9; +/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */ - return 0; +static rtx +ix86_internal_arg_pointer (void) +{ + return virtual_incoming_args_rtx; } -/* Return the opcode of the special instruction to be used to load - the constant X. */ +struct scratch_reg { + rtx reg; + bool saved; +}; -const char * -standard_80387_constant_opcode (rtx x) +/* Return a short-lived scratch register for use on function entry. + In 32-bit mode, it is valid only after the registers are saved + in the prologue. This register must be released by means of + release_scratch_register_on_entry once it is dead. */ + +static void +get_scratch_register_on_entry (struct scratch_reg *sr) { - switch (standard_80387_constant_p (x)) - { - case 1: - return "fldz"; - case 2: - return "fld1"; - case 3: - return "fldlg2"; - case 4: - return "fldln2"; - case 5: - return "fldl2e"; - case 6: - return "fldl2t"; - case 7: - return "fldpi"; - case 8: - case 9: - return "#"; - default: - gcc_unreachable (); - } -} - -/* Return the CONST_DOUBLE representing the 80387 constant that is - loaded by the specified special instruction. The argument IDX - matches the return value from standard_80387_constant_p. */ - -rtx -standard_80387_constant_rtx (int idx) -{ - int i; + int regno; - if (! ext_80387_constants_init) - init_ext_80387_constants (); + sr->saved = false; - switch (idx) + if (TARGET_64BIT) { - case 3: - case 4: - case 5: - case 6: - case 7: - i = idx - 3; - break; + /* We always use R11 in 64-bit mode. */ + regno = R11_REG; + } + else + { + tree decl = current_function_decl, fntype = TREE_TYPE (decl); + bool fastcall_p + = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; + bool thiscall_p + = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; + bool static_chain_p = DECL_STATIC_CHAIN (decl); + int regparm = ix86_function_regparm (fntype, decl); + int drap_regno + = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM; - default: - gcc_unreachable (); + /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax + for the static chain register. */ + if ((regparm < 1 || (fastcall_p && !static_chain_p)) + && drap_regno != AX_REG) + regno = AX_REG; + /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx + for the static chain register. */ + else if (thiscall_p && !static_chain_p && drap_regno != AX_REG) + regno = AX_REG; + else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG) + regno = DX_REG; + /* ecx is the static chain register. */ + else if (regparm < 3 && !fastcall_p && !thiscall_p + && !static_chain_p + && drap_regno != CX_REG) + regno = CX_REG; + else if (ix86_save_reg (BX_REG, true, false)) + regno = BX_REG; + /* esi is the static chain register. 
*/ + else if (!(regparm == 3 && static_chain_p) + && ix86_save_reg (SI_REG, true, false)) + regno = SI_REG; + else if (ix86_save_reg (DI_REG, true, false)) + regno = DI_REG; + else + { + regno = (drap_regno == AX_REG ? DX_REG : AX_REG); + sr->saved = true; + } } - return const_double_from_real_value (ext_80387_constants_table[i], - XFmode); + sr->reg = gen_rtx_REG (Pmode, regno); + if (sr->saved) + { + rtx_insn *insn = emit_insn (gen_push (sr->reg)); + RTX_FRAME_RELATED_P (insn) = 1; + } } -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 - in supported SSE/AVX vector mode. */ - -int -standard_sse_constant_p (rtx x, machine_mode pred_mode) -{ - machine_mode mode; - - if (!TARGET_SSE) - return 0; +/* Release a scratch register obtained from the preceding function. - mode = GET_MODE (x); + If RELEASE_VIA_POP is true, we just pop the register off the stack + to release it. This is what non-Linux systems use with -fstack-check. - if (x == const0_rtx || const0_operand (x, mode)) - return 1; + Otherwise we use OFFSET to locate the saved register and the + allocated stack space becomes part of the local frame and is + deallocated by the epilogue. */ - if (x == constm1_rtx || vector_all_ones_operand (x, mode)) +static void +release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset, + bool release_via_pop) +{ + if (sr->saved) { - /* VOIDmode integer constant, get mode from the predicate. */ - if (mode == VOIDmode) - mode = pred_mode; + if (release_via_pop) + { + struct machine_function *m = cfun->machine; + rtx x, insn = emit_insn (gen_pop (sr->reg)); - switch (GET_MODE_SIZE (mode)) + /* The RX FRAME_RELATED_P mechanism doesn't know about pop. */ + RTX_FRAME_RELATED_P (insn) = 1; + x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD)); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, x); + m->fs.sp_offset -= UNITS_PER_WORD; + } + else { - case 64: - if (TARGET_AVX512F) - return 2; - break; - case 32: - if (TARGET_AVX2) - return 2; - break; - case 16: - if (TARGET_SSE2) - return 2; - break; - case 0: - /* VOIDmode */ - gcc_unreachable (); - default: - break; + rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset)); + x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x)); + emit_insn (x); } } - - return 0; } -/* Return the opcode of the special instruction to be used to load - the constant operands[1] into operands[0]. */ +/* Emit code to adjust the stack pointer by SIZE bytes while probing it. -const char * -standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) -{ - machine_mode mode; - rtx x = operands[1]; + This differs from the next routine in that it tries hard to prevent + attacks that jump the stack guard. Thus it is never allowed to allocate + more than PROBE_INTERVAL bytes of stack space without a suitable + probe. - gcc_assert (TARGET_SSE); + INT_REGISTERS_SAVED is true if integer registers have already been + pushed on the stack. */ - mode = GET_MODE (x); +static void +ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, + const bool int_registers_saved) +{ + struct machine_function *m = cfun->machine; - if (x == const0_rtx || const0_operand (x, mode)) + /* If this function does not statically allocate stack space, then + no probes are needed. 
*/ + if (!size) { - switch (get_attr_mode (insn)) - { - case MODE_TI: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "%vpxor\t%0, %d0"; - /* FALLTHRU */ - case MODE_XI: - case MODE_OI: - if (EXT_REX_SSE_REG_P (operands[0])) - return (TARGET_AVX512VL - ? "vpxord\t%x0, %x0, %x0" - : "vpxord\t%g0, %g0, %g0"); - return "vpxor\t%x0, %x0, %x0"; + /* However, the allocation of space via pushes for register + saves could be viewed as allocating space, but without the + need to probe. */ + if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed) + dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); + else + dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); + return; + } - case MODE_V2DF: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "%vxorpd\t%0, %d0"; - /* FALLTHRU */ - case MODE_V8DF: - case MODE_V4DF: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "vxorpd\t%x0, %x0, %x0"; - else if (TARGET_AVX512DQ) - return (TARGET_AVX512VL - ? "vxorpd\t%x0, %x0, %x0" - : "vxorpd\t%g0, %g0, %g0"); - else - return (TARGET_AVX512VL - ? "vpxorq\t%x0, %x0, %x0" - : "vpxorq\t%g0, %g0, %g0"); + /* If we are a noreturn function, then we have to consider the + possibility that we're called via a jump rather than a call. - case MODE_V4SF: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "%vxorps\t%0, %d0"; - /* FALLTHRU */ - case MODE_V16SF: - case MODE_V8SF: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "vxorps\t%x0, %x0, %x0"; - else if (TARGET_AVX512DQ) - return (TARGET_AVX512VL - ? "vxorps\t%x0, %x0, %x0" - : "vxorps\t%g0, %g0, %g0"); - else - return (TARGET_AVX512VL - ? "vpxord\t%x0, %x0, %x0" - : "vpxord\t%g0, %g0, %g0"); + Thus we don't have the implicit probe generated by saving the + return address into the stack at the call. Thus, the stack + pointer could be anywhere in the guard page. The safe thing + to do is emit a probe now. - default: - gcc_unreachable (); - } - } - else if (x == constm1_rtx || vector_all_ones_operand (x, mode)) - { - enum attr_mode insn_mode = get_attr_mode (insn); - - switch (insn_mode) - { - case MODE_XI: - case MODE_V8DF: - case MODE_V16SF: - gcc_assert (TARGET_AVX512F); - return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; - - case MODE_OI: - case MODE_V4DF: - case MODE_V8SF: - gcc_assert (TARGET_AVX2); - /* FALLTHRU */ - case MODE_TI: - case MODE_V2DF: - case MODE_V4SF: - gcc_assert (TARGET_SSE2); - if (!EXT_REX_SSE_REG_P (operands[0])) - return (TARGET_AVX - ? "vpcmpeqd\t%0, %0, %0" - : "pcmpeqd\t%0, %0"); - else if (TARGET_AVX512VL) - return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"; - else - return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; + The probe can be avoided if we have already emitted any callee + register saves into the stack or have a frame pointer (which will + have been saved as well). Those saves will function as implicit + probes. - default: - gcc_unreachable (); + ?!? This should be revamped to work like aarch64 and s390 where + we track the offset from the most recent probe. Normally that + offset would be zero. For a noreturn function we would reset + it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then + we just probe when we cross PROBE_INTERVAL. */ + if (TREE_THIS_VOLATILE (cfun->decl) + && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)) + { + /* We can safely use any register here since we're just going to push + its value and immediately pop it back. But we do try and avoid + argument passing registers so as not to introduce dependencies in + the pipeline. 
For 32 bit we use %esi and for 64 bit we use %rax. */ + rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG); + rtx_insn *insn_push = emit_insn (gen_push (dummy_reg)); + rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg)); + m->fs.sp_offset -= UNITS_PER_WORD; + if (m->fs.cfa_reg == stack_pointer_rtx) + { + m->fs.cfa_offset -= UNITS_PER_WORD; + rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn_push) = 1; + x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn_pop) = 1; } - } - - gcc_unreachable (); -} - -/* Returns true if INSN can be transformed from a memory load - to a supported FP constant load. */ + emit_insn (gen_blockage ()); + } -bool -ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst) -{ - rtx src = find_constant_src (insn); + /* If we allocate less than the size of the guard statically, + then no probing is necessary, but we do need to allocate + the stack. */ + if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE))) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-size), -1, + m->fs.cfa_reg == stack_pointer_rtx); + dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); + return; + } - gcc_assert (REG_P (dst)); + /* We're allocating a large enough stack frame that we need to + emit probes. Either emit them inline or in a loop depending + on the size. */ + HOST_WIDE_INT probe_interval = get_probe_interval (); + if (size <= 4 * probe_interval) + { + HOST_WIDE_INT i; + for (i = probe_interval; i <= size; i += probe_interval) + { + /* Allocate PROBE_INTERVAL bytes. */ + rtx insn + = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-probe_interval), -1, + m->fs.cfa_reg == stack_pointer_rtx); + add_reg_note (insn, REG_STACK_CHECK, const0_rtx); - if (src == NULL - || (SSE_REGNO_P (REGNO (dst)) - && standard_sse_constant_p (src, GET_MODE (dst)) != 1) - || (STACK_REGNO_P (REGNO (dst)) - && standard_80387_constant_p (src) < 1)) - return false; + /* And probe at *sp. */ + emit_stack_probe (stack_pointer_rtx); + emit_insn (gen_blockage ()); + } - return true; -} + /* We need to allocate space for the residual, but we do not need + to probe the residual. */ + HOST_WIDE_INT residual = (i - probe_interval - size); + if (residual) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (residual), -1, + m->fs.cfa_reg == stack_pointer_rtx); + dump_stack_clash_frame_info (PROBE_INLINE, residual != 0); + } + else + { + /* We expect the GP registers to be saved when probes are used + as the probing sequences might need a scratch register and + the routine to allocate one assumes the integer registers + have already been saved. */ + gcc_assert (int_registers_saved); -/* Returns true if OP contains a symbol reference */ + struct scratch_reg sr; + get_scratch_register_on_entry (&sr); -bool -symbolic_reference_mentioned_p (rtx op) -{ - const char *fmt; - int i; + /* If we needed to save a register, then account for any space + that was pushed (we are not going to pop the register when + we do the restore). */ + if (sr.saved) + size -= UNITS_PER_WORD; - if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) - return true; + /* Step 1: round SIZE down to a multiple of the interval. 
*/ + HOST_WIDE_INT rounded_size = size & -probe_interval; - fmt = GET_RTX_FORMAT (GET_CODE (op)); - for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--) - { - if (fmt[i] == 'E') + /* Step 2: compute final value of the loop counter. Use lea if + possible. */ + rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size); + rtx insn; + if (address_no_seg_operand (addr, Pmode)) + insn = emit_insn (gen_rtx_SET (sr.reg, addr)); + else { - int j; - - for (j = XVECLEN (op, i) - 1; j >= 0; j--) - if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) - return true; + emit_move_insn (sr.reg, GEN_INT (-rounded_size)); + insn = emit_insn (gen_rtx_SET (sr.reg, + gen_rtx_PLUS (Pmode, sr.reg, + stack_pointer_rtx))); + } + if (m->fs.cfa_reg == stack_pointer_rtx) + { + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, sr.reg, + m->fs.cfa_offset + rounded_size)); + RTX_FRAME_RELATED_P (insn) = 1; } - else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i))) - return true; - } + /* Step 3: the loop. */ + rtx size_rtx = GEN_INT (rounded_size); + insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, + size_rtx)); + if (m->fs.cfa_reg == stack_pointer_rtx) + { + m->fs.cfa_offset += rounded_size; + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_pointer_rtx, + m->fs.cfa_offset)); + RTX_FRAME_RELATED_P (insn) = 1; + } + m->fs.sp_offset += rounded_size; + emit_insn (gen_blockage ()); - return false; -} + /* Step 4: adjust SP if we cannot assert at compile-time that SIZE + is equal to ROUNDED_SIZE. */ -/* Return true if it is appropriate to emit `ret' instructions in the - body of a function. Do this only if the epilogue is simple, needing a - couple of insns. Prior to reloading, we can't tell how many registers - must be saved, so return false then. Return false if there is no frame - marker to de-allocate. */ + if (size != rounded_size) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (rounded_size - size), -1, + m->fs.cfa_reg == stack_pointer_rtx); + dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); -bool -ix86_can_use_return_insn_p (void) -{ - if (ix86_function_naked (current_function_decl)) - return false; + /* This does not deallocate the space reserved for the scratch + register. That will be deallocated in the epilogue. */ + release_scratch_register_on_entry (&sr, size, false); + } - /* Don't use `ret' instruction in interrupt handler. */ - if (! reload_completed - || frame_pointer_needed - || cfun->machine->func_type != TYPE_NORMAL) - return 0; + /* Make sure nothing is scheduled before we are done. */ + emit_insn (gen_blockage ()); +} - /* Don't allow more than 32k pop, since that's all we can do - with one instruction. */ - if (crtl->args.pops_args && crtl->args.size >= 32768) - return 0; +/* Emit code to adjust the stack pointer by SIZE bytes while probing it. - struct ix86_frame &frame = cfun->machine->frame; - return (frame.stack_pointer_offset == UNITS_PER_WORD - && (frame.nregs + frame.nsseregs) == 0); -} - -/* Value should be nonzero if functions must have frame pointers. - Zero means the frame pointer need not be set up (and parms may - be accessed via the stack pointer) in functions that seem suitable. */ + INT_REGISTERS_SAVED is true if integer registers have already been + pushed on the stack. 
*/ -static bool -ix86_frame_pointer_required (void) +static void +ix86_adjust_stack_and_probe (HOST_WIDE_INT size, + const bool int_registers_saved) { - /* If we accessed previous frames, then the generated code expects - to be able to access the saved ebp value in our frame. */ - if (cfun->machine->accesses_prev_frame) - return true; + /* We skip the probe for the first interval + a small dope of 4 words and + probe that many bytes past the specified size to maintain a protection + area at the botton of the stack. */ + const int dope = 4 * UNITS_PER_WORD; + rtx size_rtx = GEN_INT (size), last; - /* Several x86 os'es need a frame pointer for other reasons, - usually pertaining to setjmp. */ - if (SUBTARGET_FRAME_POINTER_REQUIRED) - return true; + /* See if we have a constant small number of probes to generate. If so, + that's the easy case. The run-time loop is made up of 9 insns in the + generic case while the compile-time loop is made up of 3+2*(n-1) insns + for n # of intervals. */ + if (size <= 4 * get_probe_interval ()) + { + HOST_WIDE_INT i, adjust; + bool first_probe = true; - /* For older 32-bit runtimes setjmp requires valid frame-pointer. */ - if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp) - return true; + /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for + values of N from 1 until it exceeds SIZE. If only one probe is + needed, this will not generate any code. Then adjust and probe + to PROBE_INTERVAL + SIZE. */ + for (i = get_probe_interval (); i < size; i += get_probe_interval ()) + { + if (first_probe) + { + adjust = 2 * get_probe_interval () + dope; + first_probe = false; + } + else + adjust = get_probe_interval (); - /* Win64 SEH, very large frames need a frame-pointer as maximum stack - allocation is 4GB. */ - if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE) - return true; + emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -adjust))); + emit_stack_probe (stack_pointer_rtx); + } - /* SSE saves require frame-pointer when stack is misaligned. */ - if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128) - return true; - - /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER - turns off the frame pointer by default. Turn it back on now if - we've not got a leaf function. */ - if (TARGET_OMIT_LEAF_FRAME_POINTER - && (!crtl->is_leaf - || ix86_current_function_calls_tls_descriptor)) - return true; + if (first_probe) + adjust = size + get_probe_interval () + dope; + else + adjust = size + get_probe_interval () - i; - if (crtl->profile && !flag_fentry) - return true; + emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -adjust))); + emit_stack_probe (stack_pointer_rtx); - return false; -} + /* Adjust back to account for the additional first interval. */ + last = emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + (get_probe_interval () + + dope)))); + } -/* Record that the current function accesses previous call frames. */ + /* Otherwise, do the same as above, but in a loop. Note that we must be + extra careful with variables wrapping around because we might be at + the very top (or the very bottom) of the address space and we have + to be able to handle this case properly; in particular, we use an + equality test for the loop condition. 
*/ + else + { + /* We expect the GP registers to be saved when probes are used + as the probing sequences might need a scratch register and + the routine to allocate one assumes the integer registers + have already been saved. */ + gcc_assert (int_registers_saved); -void -ix86_setup_frame_addresses (void) -{ - cfun->machine->accesses_prev_frame = 1; -} - -#ifndef USE_HIDDEN_LINKONCE -# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0) -# define USE_HIDDEN_LINKONCE 1 -# else -# define USE_HIDDEN_LINKONCE 0 -# endif -#endif + HOST_WIDE_INT rounded_size; + struct scratch_reg sr; -/* Label count for call and return thunks. It is used to make unique - labels in call and return thunks. */ -static int indirectlabelno; + get_scratch_register_on_entry (&sr); -/* True if call thunk function is needed. */ -static bool indirect_thunk_needed = false; + /* If we needed to save a register, then account for any space + that was pushed (we are not going to pop the register when + we do the restore). */ + if (sr.saved) + size -= UNITS_PER_WORD; -/* Bit masks of integer registers, which contain branch target, used - by call thunk functions. */ -static int indirect_thunks_used; + /* Step 1: round SIZE to the previous multiple of the interval. */ -/* True if return thunk function is needed. */ -static bool indirect_return_needed = false; + rounded_size = ROUND_DOWN (size, get_probe_interval ()); -/* True if return thunk function via CX is needed. */ -static bool indirect_return_via_cx; -#ifndef INDIRECT_LABEL -# define INDIRECT_LABEL "LIND" -#endif + /* Step 2: compute initial and final value of the loop counter. */ -/* Indicate what prefix is needed for an indirect branch. */ -enum indirect_thunk_prefix -{ - indirect_thunk_prefix_none, - indirect_thunk_prefix_nt -}; + /* SP = SP_0 + PROBE_INTERVAL. */ + emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + - (get_probe_interval () + dope)))); -/* Return the prefix needed for an indirect branch INSN. */ + /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */ + if (rounded_size <= (HOST_WIDE_INT_1 << 31)) + emit_insn (gen_rtx_SET (sr.reg, + plus_constant (Pmode, stack_pointer_rtx, + -rounded_size))); + else + { + emit_move_insn (sr.reg, GEN_INT (-rounded_size)); + emit_insn (gen_rtx_SET (sr.reg, + gen_rtx_PLUS (Pmode, sr.reg, + stack_pointer_rtx))); + } -enum indirect_thunk_prefix -indirect_thunk_need_prefix (rtx_insn *insn) -{ - enum indirect_thunk_prefix need_prefix; - if ((cfun->machine->indirect_branch_type - == indirect_branch_thunk_extern) - && ix86_notrack_prefixed_insn_p (insn)) - { - /* NOTRACK prefix is only used with external thunk so that it - can be properly updated to support CET at run-time. */ - need_prefix = indirect_thunk_prefix_nt; - } - else - need_prefix = indirect_thunk_prefix_none; - return need_prefix; -} -/* Fills in the label name that should be used for the indirect thunk. */ + /* Step 3: the loop -static void -indirect_thunk_name (char name[32], unsigned int regno, - enum indirect_thunk_prefix need_prefix, - bool ret_p) -{ - if (regno != INVALID_REGNUM && regno != CX_REG && ret_p) - gcc_unreachable (); + do + { + SP = SP + PROBE_INTERVAL + probe at SP + } + while (SP != LAST_ADDR) - if (USE_HIDDEN_LINKONCE) - { - const char *prefix; + adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for + values of N from 1 until it is equal to ROUNDED_SIZE. 
*/ - if (need_prefix == indirect_thunk_prefix_nt - && regno != INVALID_REGNUM) - { - /* NOTRACK prefix is only used with external thunk via - register so that NOTRACK prefix can be added to indirect - branch via register to support CET at run-time. */ - prefix = "_nt"; - } - else - prefix = ""; + emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx)); - const char *ret = ret_p ? "return" : "indirect"; - if (regno != INVALID_REGNUM) + /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot + assert at compile-time that SIZE is equal to ROUNDED_SIZE. */ + + if (size != rounded_size) { - const char *reg_prefix; - if (LEGACY_INT_REGNO_P (regno)) - reg_prefix = TARGET_64BIT ? "r" : "e"; - else - reg_prefix = ""; - sprintf (name, "__x86_%s_thunk%s_%s%s", - ret, prefix, reg_prefix, reg_names[regno]); + emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + rounded_size - size))); + emit_stack_probe (stack_pointer_rtx); } - else - sprintf (name, "__x86_%s_thunk%s", ret, prefix); + + /* Adjust back to account for the additional first interval. */ + last = emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + (get_probe_interval () + + dope)))); + + /* This does not deallocate the space reserved for the scratch + register. That will be deallocated in the epilogue. */ + release_scratch_register_on_entry (&sr, size, false); } - else + + /* Even if the stack pointer isn't the CFA register, we need to correctly + describe the adjustments made to it, in particular differentiate the + frame-related ones from the frame-unrelated ones. */ + if (size > 0) { - if (regno != INVALID_REGNUM) - ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno); - else - { - if (ret_p) - ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0); - else - ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0); - } + rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); + XVECEXP (expr, 0, 0) + = gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, -size)); + XVECEXP (expr, 0, 1) + = gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + get_probe_interval () + dope + size)); + add_reg_note (last, REG_FRAME_RELATED_EXPR, expr); + RTX_FRAME_RELATED_P (last) = 1; + + cfun->machine->fs.sp_offset += size; } + + /* Make sure nothing is scheduled before we are done. */ + emit_insn (gen_blockage ()); } -/* Output a call and return thunk for indirect branch. If REGNO != -1, - the function address is in REGNO and the call and return thunk looks like: +/* Adjust the stack pointer up to REG while probing it. */ - call L2 - L1: - pause - lfence - jmp L1 - L2: - mov %REG, (%sp) - ret +const char * +output_adjust_stack_and_probe (rtx reg) +{ + static int labelno = 0; + char loop_lab[32]; + rtx xops[2]; - Otherwise, the function address is on the top of stack and the - call and return thunk looks like: + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); - call L2 - L1: - pause - lfence - jmp L1 - L2: - lea WORD_SIZE(%sp), %sp - ret - */ + /* Loop. */ + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); -static void -output_indirect_thunk (unsigned int regno) -{ - char indirectlabel1[32]; - char indirectlabel2[32]; + /* SP = SP + PROBE_INTERVAL. 
*/ + xops[0] = stack_pointer_rtx; + xops[1] = GEN_INT (get_probe_interval ()); + output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); - ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL, - indirectlabelno++); - ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL, - indirectlabelno++); + /* Probe at SP. */ + xops[1] = const0_rtx; + output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops); - /* Call */ - fputs ("\tcall\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel2); - fputc ('\n', asm_out_file); + /* Test if SP == LAST_ADDR. */ + xops[0] = stack_pointer_rtx; + xops[1] = reg; + output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); + /* Branch. */ + fputs ("\tjne\t", asm_out_file); + assemble_name_raw (asm_out_file, loop_lab); + fputc ('\n', asm_out_file); - /* AMD and Intel CPUs prefer each a different instruction as loop filler. - Usage of both pause + lfence is compromise solution. */ - fprintf (asm_out_file, "\tpause\n\tlfence\n"); + return ""; +} - /* Jump. */ - fputs ("\tjmp\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel1); - fputc ('\n', asm_out_file); +/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE, + inclusive. These are offsets from the current stack pointer. - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); + INT_REGISTERS_SAVED is true if integer registers have already been + pushed on the stack. */ - /* The above call insn pushed a word to stack. Adjust CFI info. */ - if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ()) +static void +ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size, + const bool int_registers_saved) +{ + /* See if we have a constant small number of probes to generate. If so, + that's the easy case. The run-time loop is made up of 6 insns in the + generic case while the compile-time loop is made up of n insns for n # + of intervals. */ + if (size <= 6 * get_probe_interval ()) { - if (! dwarf2out_do_cfi_asm ()) - { - dw_cfi_ref xcfi = ggc_cleared_alloc (); - xcfi->dw_cfi_opc = DW_CFA_advance_loc4; - xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2); - vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); - } - dw_cfi_ref xcfi = ggc_cleared_alloc (); - xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset; - xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD; - vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); - dwarf2out_emit_cfi (xcfi); - } + HOST_WIDE_INT i; - if (regno != INVALID_REGNUM) - { - /* MOV. */ - rtx xops[2]; - xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx); - xops[1] = gen_rtx_REG (word_mode, regno); - output_asm_insn ("mov\t{%1, %0|%0, %1}", xops); + /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until + it exceeds SIZE. If only one probe is needed, this will not + generate any code. Then probe at FIRST + SIZE. */ + for (i = get_probe_interval (); i < size; i += get_probe_interval ()) + emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, + -(first + i))); + + emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, + -(first + size))); } + + /* Otherwise, do the same as above, but in a loop. Note that we must be + extra careful with variables wrapping around because we might be at + the very top (or the very bottom) of the address space and we have + to be able to handle this case properly; in particular, we use an + equality test for the loop condition. */ else { - /* LEA. 
*/ - rtx xops[2]; - xops[0] = stack_pointer_rtx; - xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops); - } - - fputs ("\tret\n", asm_out_file); -} + /* We expect the GP registers to be saved when probes are used + as the probing sequences might need a scratch register and + the routine to allocate one assumes the integer registers + have already been saved. */ + gcc_assert (int_registers_saved); -/* Output a funtion with a call and return thunk for indirect branch. - If REGNO != INVALID_REGNUM, the function address is in REGNO. - Otherwise, the function address is on the top of stack. Thunk is - used for function return if RET_P is true. */ + HOST_WIDE_INT rounded_size, last; + struct scratch_reg sr; -static void -output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix, - unsigned int regno, bool ret_p) -{ - char name[32]; - tree decl; + get_scratch_register_on_entry (&sr); - /* Create __x86_indirect_thunk. */ - indirect_thunk_name (name, regno, need_prefix, ret_p); - decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, - get_identifier (name), - build_function_type_list (void_type_node, NULL_TREE)); - DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, - NULL_TREE, void_type_node); - TREE_PUBLIC (decl) = 1; - TREE_STATIC (decl) = 1; - DECL_IGNORED_P (decl) = 1; -#if TARGET_MACHO - if (TARGET_MACHO) - { - switch_to_section (darwin_sections[picbase_thunk_section]); - fputs ("\t.weak_definition\t", asm_out_file); - assemble_name (asm_out_file, name); - fputs ("\n\t.private_extern\t", asm_out_file); - assemble_name (asm_out_file, name); - putc ('\n', asm_out_file); - ASM_OUTPUT_LABEL (asm_out_file, name); - DECL_WEAK (decl) = 1; - } - else -#endif - if (USE_HIDDEN_LINKONCE) - { - cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); + /* Step 1: round SIZE to the previous multiple of the interval. */ - targetm.asm_out.unique_section (decl, 0); - switch_to_section (get_named_section (decl, NULL, 0)); + rounded_size = ROUND_DOWN (size, get_probe_interval ()); - targetm.asm_out.globalize_label (asm_out_file, name); - fputs ("\t.hidden\t", asm_out_file); - assemble_name (asm_out_file, name); - putc ('\n', asm_out_file); - ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); - } - else - { - switch_to_section (text_section); - ASM_OUTPUT_LABEL (asm_out_file, name); - } - DECL_INITIAL (decl) = make_node (BLOCK); - current_function_decl = decl; - allocate_struct_function (decl, false); - init_function_start (decl); - /* We're about to hide the function body from callees of final_* by - emitting it directly; tell them we're a thunk, if they care. */ - cfun->is_thunk = true; - first_function_block_is_cold = false; - /* Make sure unwind info is emitted for the thunk if needed. */ - final_start_function (emit_barrier (), asm_out_file, 1); + /* Step 2: compute initial and final value of the loop counter. */ - output_indirect_thunk (regno); + /* TEST_OFFSET = FIRST. */ + emit_move_insn (sr.reg, GEN_INT (-first)); - final_end_function (); - init_insn_lengths (); - free_after_compilation (cfun); - set_cfun (NULL); - current_function_decl = NULL; -} + /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */ + last = first + rounded_size; -static int pic_labels_used; -/* Fills in the label name that should be used for a pc thunk for - the given register. 
*/ + /* Step 3: the loop -static void -get_pc_thunk_name (char name[32], unsigned int regno) -{ - gcc_assert (!TARGET_64BIT); + do + { + TEST_ADDR = TEST_ADDR + PROBE_INTERVAL + probe at TEST_ADDR + } + while (TEST_ADDR != LAST_ADDR) - if (USE_HIDDEN_LINKONCE) - sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]); - else - ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno); -} + probes at FIRST + N * PROBE_INTERVAL for values of N from 1 + until it is equal to ROUNDED_SIZE. */ + emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last))); -/* This function generates code for -fpic that loads %ebx with - the return address of the caller and then returns. */ -static void -ix86_code_end (void) -{ - rtx xops[2]; - unsigned int regno; + /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time + that SIZE is equal to ROUNDED_SIZE. */ - if (indirect_return_needed) - output_indirect_thunk_function (indirect_thunk_prefix_none, - INVALID_REGNUM, true); - if (indirect_return_via_cx) - output_indirect_thunk_function (indirect_thunk_prefix_none, - CX_REG, true); - if (indirect_thunk_needed) - output_indirect_thunk_function (indirect_thunk_prefix_none, - INVALID_REGNUM, false); + if (size != rounded_size) + emit_stack_probe (plus_constant (Pmode, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + sr.reg), + rounded_size - size)); - for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++) - { - unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1; - if ((indirect_thunks_used & (1 << i))) - output_indirect_thunk_function (indirect_thunk_prefix_none, - regno, false); + release_scratch_register_on_entry (&sr, size, true); } - for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++) - { - char name[32]; - tree decl; + /* Make sure nothing is scheduled before we are done. 
*/ + emit_insn (gen_blockage ()); +} - if ((indirect_thunks_used & (1 << regno))) - output_indirect_thunk_function (indirect_thunk_prefix_none, - regno, false); - - if (!(pic_labels_used & (1 << regno))) - continue; - - get_pc_thunk_name (name, regno); - - decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, - get_identifier (name), - build_function_type_list (void_type_node, NULL_TREE)); - DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, - NULL_TREE, void_type_node); - TREE_PUBLIC (decl) = 1; - TREE_STATIC (decl) = 1; - DECL_IGNORED_P (decl) = 1; - -#if TARGET_MACHO - if (TARGET_MACHO) - { - switch_to_section (darwin_sections[picbase_thunk_section]); - fputs ("\t.weak_definition\t", asm_out_file); - assemble_name (asm_out_file, name); - fputs ("\n\t.private_extern\t", asm_out_file); - assemble_name (asm_out_file, name); - putc ('\n', asm_out_file); - ASM_OUTPUT_LABEL (asm_out_file, name); - DECL_WEAK (decl) = 1; - } - else -#endif - if (USE_HIDDEN_LINKONCE) - { - cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); - - targetm.asm_out.unique_section (decl, 0); - switch_to_section (get_named_section (decl, NULL, 0)); - - targetm.asm_out.globalize_label (asm_out_file, name); - fputs ("\t.hidden\t", asm_out_file); - assemble_name (asm_out_file, name); - putc ('\n', asm_out_file); - ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); - } - else - { - switch_to_section (text_section); - ASM_OUTPUT_LABEL (asm_out_file, name); - } - - DECL_INITIAL (decl) = make_node (BLOCK); - current_function_decl = decl; - allocate_struct_function (decl, false); - init_function_start (decl); - /* We're about to hide the function body from callees of final_* by - emitting it directly; tell them we're a thunk, if they care. */ - cfun->is_thunk = true; - first_function_block_is_cold = false; - /* Make sure unwind info is emitted for the thunk if needed. */ - final_start_function (emit_barrier (), asm_out_file, 1); - - /* Pad stack IP move with 4 instructions (two NOPs count - as one instruction). */ - if (TARGET_PAD_SHORT_FUNCTION) - { - int i = 8; - - while (i--) - fputs ("\tnop\n", asm_out_file); - } - - xops[0] = gen_rtx_REG (Pmode, regno); - xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); - output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); - output_asm_insn ("%!ret", NULL); - final_end_function (); - init_insn_lengths (); - free_after_compilation (cfun); - set_cfun (NULL); - current_function_decl = NULL; - } - - if (flag_split_stack) - file_end_indicate_split_stack (); -} - -/* Emit code for the SET_GOT patterns. */ +/* Probe a range of stack addresses from REG to END, inclusive. These are + offsets from the current stack pointer. */ const char * -output_set_got (rtx dest, rtx label) +output_probe_stack_range (rtx reg, rtx end) { + static int labelno = 0; + char loop_lab[32]; rtx xops[3]; - xops[0] = dest; - - if (TARGET_VXWORKS_RTP && flag_pic) - { - /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ - xops[2] = gen_rtx_MEM (Pmode, - gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE)); - output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); - - /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register. - Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as - an unadorned address. 
*/ - xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); - SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL; - output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops); - return ""; - } - - xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); - - if (flag_pic) - { - char name[32]; - get_pc_thunk_name (name, REGNO (dest)); - pic_labels_used |= 1 << REGNO (dest); + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); - xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); - xops[2] = gen_rtx_MEM (QImode, xops[2]); - output_asm_insn ("%!call\t%X2", xops); + /* Loop. */ + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); -#if TARGET_MACHO - /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here. - This is what will be referenced by the Mach-O PIC subsystem. */ - if (machopic_should_output_picbase_label () || !label) - ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); + /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ + xops[0] = reg; + xops[1] = GEN_INT (get_probe_interval ()); + output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); - /* When we are restoring the pic base at the site of a nonlocal label, - and we decided to emit the pic base above, we will still output a - local label used for calculating the correction offset (even though - the offset will be 0 in that case). */ - if (label) - targetm.asm_out.internal_label (asm_out_file, "L", - CODE_LABEL_NUMBER (label)); -#endif - } - else - { - if (TARGET_MACHO) - /* We don't need a pic base, we're not producing pic. */ - gcc_unreachable (); + /* Probe at TEST_ADDR. */ + xops[0] = stack_pointer_rtx; + xops[1] = reg; + xops[2] = const0_rtx; + output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops); - xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ()); - output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); - targetm.asm_out.internal_label (asm_out_file, "L", - CODE_LABEL_NUMBER (XEXP (xops[2], 0))); - } + /* Test if TEST_ADDR == LAST_ADDR. */ + xops[0] = reg; + xops[1] = end; + output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); - if (!TARGET_MACHO) - output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops); + /* Branch. */ + fputs ("\tjne\t", asm_out_file); + assemble_name_raw (asm_out_file, loop_lab); + fputc ('\n', asm_out_file); return ""; } -/* Generate an "push" pattern for input ARG. */ +/* Return true if stack frame is required. Update STACK_ALIGNMENT + to the largest alignment, in bits, of stack slot used if stack + frame is required and CHECK_STACK_SLOT is true. */ -static rtx -gen_push (rtx arg) +static bool +ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, + bool check_stack_slot) { - struct machine_function *m = cfun->machine; + HARD_REG_SET set_up_by_prologue, prologue_used; + basic_block bb; - if (m->fs.cfa_reg == stack_pointer_rtx) - m->fs.cfa_offset += UNITS_PER_WORD; - m->fs.sp_offset += UNITS_PER_WORD; + CLEAR_HARD_REG_SET (prologue_used); + CLEAR_HARD_REG_SET (set_up_by_prologue); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, + HARD_FRAME_POINTER_REGNUM); - if (REG_P (arg) && GET_MODE (arg) != word_mode) - arg = gen_rtx_REG (word_mode, REGNO (arg)); + /* The preferred stack alignment is the minimum stack alignment. 
*/ + if (stack_alignment > crtl->preferred_stack_boundary) + stack_alignment = crtl->preferred_stack_boundary; - return gen_rtx_SET (gen_rtx_MEM (word_mode, - gen_rtx_PRE_DEC (Pmode, - stack_pointer_rtx)), - arg); -} + bool require_stack_frame = false; -/* Generate an "pop" pattern for input ARG. */ + FOR_EACH_BB_FN (bb, cfun) + { + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (NONDEBUG_INSN_P (insn) + && requires_stack_frame_p (insn, prologue_used, + set_up_by_prologue)) + { + require_stack_frame = true; -static rtx -gen_pop (rtx arg) -{ - if (REG_P (arg) && GET_MODE (arg) != word_mode) - arg = gen_rtx_REG (word_mode, REGNO (arg)); + if (check_stack_slot) + { + /* Find the maximum stack alignment. */ + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL) + if (MEM_P (*iter) + && (reg_mentioned_p (stack_pointer_rtx, + *iter) + || reg_mentioned_p (frame_pointer_rtx, + *iter))) + { + unsigned int alignment = MEM_ALIGN (*iter); + if (alignment > stack_alignment) + stack_alignment = alignment; + } + } + } + } - return gen_rtx_SET (arg, - gen_rtx_MEM (word_mode, - gen_rtx_POST_INC (Pmode, - stack_pointer_rtx))); + return require_stack_frame; } -/* Return >= 0 if there is an unused call-clobbered register available - for the entire function. */ +/* Finalize stack_realign_needed and frame_pointer_needed flags, which + will guide prologue/epilogue to be generated in correct form. */ -static unsigned int -ix86_select_alt_pic_regnum (void) +static void +ix86_finalize_stack_frame_flags (void) { - if (ix86_use_pseudo_pic_reg ()) - return INVALID_REGNUM; + /* Check if stack realign is really needed after reload, and + stores result in cfun */ + unsigned int incoming_stack_boundary + = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary + ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary); + unsigned int stack_alignment + = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor + ? crtl->max_used_stack_slot_alignment + : crtl->stack_alignment_needed); + unsigned int stack_realign + = (incoming_stack_boundary < stack_alignment); + bool recompute_frame_layout_p = false; - if (crtl->is_leaf - && !crtl->profile - && !ix86_current_function_calls_tls_descriptor) + if (crtl->stack_realign_finalized) { - int i, drap; - /* Can't use the same register for both PIC and DRAP. */ - if (crtl->drap_reg) - drap = REGNO (crtl->drap_reg); - else - drap = -1; - for (i = 2; i >= 0; --i) - if (i != drap && !df_regs_ever_live_p (i)) - return i; + /* After stack_realign_needed is finalized, we can't no longer + change it. */ + gcc_assert (crtl->stack_realign_needed == stack_realign); + return; } - return INVALID_REGNUM; -} - -/* Return true if REGNO is used by the epilogue. */ + /* If the only reason for frame_pointer_needed is that we conservatively + assumed stack realignment might be needed or -fno-omit-frame-pointer + is used, but in the end nothing that needed the stack alignment had + been spilled nor stack access, clear frame_pointer_needed and say we + don't need stack realignment. */ + if ((stack_realign || (!flag_omit_frame_pointer && optimize)) + && frame_pointer_needed + && crtl->is_leaf + && crtl->sp_is_unchanging + && !ix86_current_function_calls_tls_descriptor + && !crtl->accesses_prior_frames + && !cfun->calls_alloca + && !crtl->calls_eh_return + /* See ira_setup_eliminable_regset for the rationale. 
*/ + && !(STACK_CHECK_MOVING_SP + && flag_stack_check + && flag_exceptions + && cfun->can_throw_non_call_exceptions) + && !ix86_frame_pointer_required () + && get_frame_size () == 0 + && ix86_nsaved_sseregs () == 0 + && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0) + { + if (ix86_find_max_used_stack_alignment (stack_alignment, + stack_realign)) + { + /* Stack frame is required. If stack alignment needed is less + than incoming stack boundary, don't realign stack. */ + stack_realign = incoming_stack_boundary < stack_alignment; + if (!stack_realign) + { + crtl->max_used_stack_slot_alignment + = incoming_stack_boundary; + crtl->stack_alignment_needed + = incoming_stack_boundary; + /* Also update preferred_stack_boundary for leaf + functions. */ + crtl->preferred_stack_boundary + = incoming_stack_boundary; + } + } + else + { + /* If drap has been set, but it actually isn't live at the + start of the function, there is no reason to set it up. */ + if (crtl->drap_reg) + { + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + if (! REGNO_REG_SET_P (DF_LR_IN (bb), + REGNO (crtl->drap_reg))) + { + crtl->drap_reg = NULL_RTX; + crtl->need_drap = false; + } + } + else + cfun->machine->no_drap_save_restore = true; -bool -ix86_epilogue_uses (int regno) -{ - /* If there are no caller-saved registers, we preserve all registers, - except for MMX and x87 registers which aren't supported when saving - and restoring registers. Don't explicitly save SP register since - it is always preserved. */ - return (epilogue_completed - && cfun->machine->no_caller_saved_registers - && !fixed_regs[regno] - && !STACK_REGNO_P (regno) - && !MMX_REGNO_P (regno)); -} + frame_pointer_needed = false; + stack_realign = false; + crtl->max_used_stack_slot_alignment = incoming_stack_boundary; + crtl->stack_alignment_needed = incoming_stack_boundary; + crtl->stack_alignment_estimated = incoming_stack_boundary; + if (crtl->preferred_stack_boundary > incoming_stack_boundary) + crtl->preferred_stack_boundary = incoming_stack_boundary; + df_finish_pass (true); + df_scan_alloc (NULL); + df_scan_blocks (); + df_compute_regs_ever_live (true); + df_analyze (); -/* Return nonzero if register REGNO can be used as a scratch register - in peephole2. */ + if (flag_var_tracking) + { + /* Since frame pointer is no longer available, replace it with + stack pointer - UNITS_PER_WORD in debug insns. */ + df_ref ref, next; + for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM); + ref; ref = next) + { + next = DF_REF_NEXT_REG (ref); + if (!DF_REF_INSN_INFO (ref)) + continue; -static bool -ix86_hard_regno_scratch_ok (unsigned int regno) -{ - /* If there are no caller-saved registers, we can't use any register - as a scratch register after epilogue and use REGNO as scratch - register only if it has been used before to avoid saving and - restoring it. */ - return (!cfun->machine->no_caller_saved_registers - || (!epilogue_completed - && df_regs_ever_live_p (regno))); -} + /* Make sure the next ref is for a different instruction, + so that we're not affected by the rescan. */ + rtx_insn *insn = DF_REF_INSN (ref); + while (next && DF_REF_INSN (next) == insn) + next = DF_REF_NEXT_REG (next); -/* Return TRUE if we need to save REGNO. 
*/ + if (DEBUG_INSN_P (insn)) + { + bool changed = false; + for (; ref != next; ref = DF_REF_NEXT_REG (ref)) + { + rtx *loc = DF_REF_LOC (ref); + if (*loc == hard_frame_pointer_rtx) + { + *loc = plus_constant (Pmode, + stack_pointer_rtx, + -UNITS_PER_WORD); + changed = true; + } + } + if (changed) + df_insn_rescan (insn); + } + } + } -static bool -ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined) -{ - /* If there are no caller-saved registers, we preserve all registers, - except for MMX and x87 registers which aren't supported when saving - and restoring registers. Don't explicitly save SP register since - it is always preserved. */ - if (cfun->machine->no_caller_saved_registers) - { - /* Don't preserve registers used for function return value. */ - rtx reg = crtl->return_rtx; - if (reg) - { - unsigned int i = REGNO (reg); - unsigned int nregs = REG_NREGS (reg); - while (nregs-- > 0) - if ((i + nregs) == regno) - return false; + recompute_frame_layout_p = true; } - - return (df_regs_ever_live_p (regno) - && !fixed_regs[regno] - && !STACK_REGNO_P (regno) - && !MMX_REGNO_P (regno) - && (regno != HARD_FRAME_POINTER_REGNUM - || !frame_pointer_needed)); } - - if (regno == REAL_PIC_OFFSET_TABLE_REGNUM - && pic_offset_table_rtx) + else if (crtl->max_used_stack_slot_alignment >= 128) { - if (ix86_use_pseudo_pic_reg ()) - { - /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to - _mcount in prologue. */ - if (!TARGET_64BIT && flag_pic && crtl->profile) - return true; - } - else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) - || crtl->profile - || crtl->calls_eh_return - || crtl->uses_const_pool - || cfun->has_nonlocal_label) - return ix86_select_alt_pic_regnum () == INVALID_REGNUM; + /* We don't need to realign stack. max_used_stack_alignment is + used to decide how stack frame should be aligned. This is + independent of any psABIs nor 32-bit vs 64-bit. It is always + safe to compute max_used_stack_alignment. We compute it only + if 128-bit aligned load/store may be generated on misaligned + stack slot which will lead to segfault. */ + if (ix86_find_max_used_stack_alignment (stack_alignment, true)) + cfun->machine->max_used_stack_alignment + = stack_alignment / BITS_PER_UNIT; } - if (crtl->calls_eh_return && maybe_eh_return) + if (crtl->stack_realign_needed != stack_realign) + recompute_frame_layout_p = true; + crtl->stack_realign_needed = stack_realign; + crtl->stack_realign_finalized = true; + if (recompute_frame_layout_p) + ix86_compute_frame_layout (); +} + +/* Delete SET_GOT right after entry block if it is allocated to reg. 
*/ + +static void +ix86_elim_entry_set_got (rtx reg) +{ + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + rtx_insn *c_insn = BB_HEAD (bb); + if (!NONDEBUG_INSN_P (c_insn)) + c_insn = next_nonnote_nondebug_insn (c_insn); + if (c_insn && NONJUMP_INSN_P (c_insn)) { - unsigned i; - for (i = 0; ; i++) + rtx pat = PATTERN (c_insn); + if (GET_CODE (pat) == PARALLEL) { - unsigned test = EH_RETURN_DATA_REGNO (i); - if (test == INVALID_REGNUM) - break; - if (test == regno) - return true; + rtx vec = XVECEXP (pat, 0, 0); + if (GET_CODE (vec) == SET + && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT + && REGNO (XEXP (vec, 0)) == REGNO (reg)) + delete_insn (c_insn); } } - - if (ignore_outlined && cfun->machine->call_ms2sysv) - { - unsigned count = cfun->machine->call_ms2sysv_extra_regs - + xlogue_layout::MIN_REGS; - if (xlogue_layout::is_stub_managed_reg (regno, count)) - return false; - } - - if (crtl->drap_reg - && regno == REGNO (crtl->drap_reg) - && !cfun->machine->no_drap_save_restore) - return true; - - return (df_regs_ever_live_p (regno) - && !call_used_regs[regno] - && !fixed_regs[regno] - && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)); } -/* Return number of saved general prupose registers. */ - -static int -ix86_nsaved_regs (void) +static rtx +gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store) { - int nregs = 0; - int regno; + rtx addr, mem; - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - nregs ++; - return nregs; + if (offset) + addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset)); + mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg); + return gen_rtx_SET (store ? mem : reg, store ? reg : mem); } -/* Return number of saved SSE registers. */ - -static int -ix86_nsaved_sseregs (void) +static inline rtx +gen_frame_load (rtx reg, rtx frame_reg, int offset) { - int nregs = 0; - int regno; - - if (!TARGET_64BIT_MS_ABI) - return 0; - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - nregs ++; - return nregs; + return gen_frame_set (reg, frame_reg, offset, false); } -/* Given FROM and TO register numbers, say whether this elimination is - allowed. If stack alignment is needed, we can only replace argument - pointer with hard frame pointer, or replace frame pointer with stack - pointer. Otherwise, frame pointer elimination is automatically - handled and all other eliminations are valid. */ - -static bool -ix86_can_eliminate (const int from, const int to) +static inline rtx +gen_frame_store (rtx reg, rtx frame_reg, int offset) { - if (stack_realign_fp) - return ((from == ARG_POINTER_REGNUM - && to == HARD_FRAME_POINTER_REGNUM) - || (from == FRAME_POINTER_REGNUM - && to == STACK_POINTER_REGNUM)); - else - return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true; + return gen_frame_set (reg, frame_reg, offset, true); } -/* Return the offset between two registers, one to be eliminated, and the other - its replacement, at the start of a routine. 
*/ - -HOST_WIDE_INT -ix86_initial_elimination_offset (int from, int to) +static void +ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame) { - struct ix86_frame &frame = cfun->machine->frame; + struct machine_function *m = cfun->machine; + const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS + + m->call_ms2sysv_extra_regs; + rtvec v = rtvec_alloc (ncregs + 1); + unsigned int align, i, vi = 0; + rtx_insn *insn; + rtx sym, addr; + rtx rax = gen_rtx_REG (word_mode, AX_REG); + const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); - if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM) - return frame.hard_frame_pointer_offset; - else if (from == FRAME_POINTER_REGNUM - && to == HARD_FRAME_POINTER_REGNUM) - return frame.hard_frame_pointer_offset - frame.frame_pointer_offset; - else - { - gcc_assert (to == STACK_POINTER_REGNUM); + /* AL should only be live with sysv_abi. */ + gcc_assert (!ix86_eax_live_at_start_p ()); + gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset); - if (from == ARG_POINTER_REGNUM) - return frame.stack_pointer_offset; + /* Setup RAX as the stub's base pointer. We use stack_realign_offset rather + we've actually realigned the stack or not. */ + align = GET_MODE_ALIGNMENT (V4SFmode); + addr = choose_baseaddr (frame.stack_realign_offset + + xlogue.get_stub_ptr_offset (), &align, AX_REG); + gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); - gcc_assert (from == FRAME_POINTER_REGNUM); - return frame.stack_pointer_offset - frame.frame_pointer_offset; - } -} + emit_insn (gen_rtx_SET (rax, addr)); -/* In a dynamically-aligned function, we can't know the offset from - stack pointer to frame pointer, so we must ensure that setjmp - eliminates fp against the hard fp (%ebp) rather than trying to - index from %esp up to the top of the frame across a gap that is - of unknown (at compile-time) size. */ -static rtx -ix86_builtin_setjmp_frame_value (void) -{ - return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx; -} + /* Get the stub symbol. */ + sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP + : XLOGUE_STUB_SAVE); + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); -/* Emits a warning for unsupported msabi to sysv pro/epilogues. */ -static void warn_once_call_ms2sysv_xlogues (const char *feature) -{ - static bool warned_once = false; - if (!warned_once) + for (i = 0; i < ncregs; ++i) { - warning (0, "%<-mcall-ms2sysv-xlogues%> is not compatible with %s", - feature); - warned_once = true; + const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); + rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode), + r.regno); + RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset); } -} -/* Return the probing interval for -fstack-clash-protection. */ + gcc_assert (vi == (unsigned)GET_NUM_ELEM (v)); -static HOST_WIDE_INT -get_probe_interval (void) -{ - if (flag_stack_clash_protection) - return (HOST_WIDE_INT_1U - << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL)); - else - return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP); + insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v)); + RTX_FRAME_RELATED_P (insn) = true; } -/* When using -fsplit-stack, the allocation routines set a field in - the TCB to the bottom of the stack plus this much space, measured - in bytes. */ - -#define SPLIT_STACK_AVAILABLE 256 - -/* Fill structure ix86_frame about frame of currently computed function. */ +/* Expand the prologue into a bunch of separate insns. 
*/ -static void -ix86_compute_frame_layout (void) +void +ix86_expand_prologue (void) { - struct ix86_frame *frame = &cfun->machine->frame; struct machine_function *m = cfun->machine; - unsigned HOST_WIDE_INT stack_alignment_needed; - HOST_WIDE_INT offset; - unsigned HOST_WIDE_INT preferred_alignment; - HOST_WIDE_INT size = get_frame_size (); - HOST_WIDE_INT to_allocate; + rtx insn, t; + HOST_WIDE_INT allocate; + bool int_registers_saved; + bool sse_registers_saved; + bool save_stub_call_needed; + rtx static_chain = NULL_RTX; - /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit - * ms_abi functions that call a sysv function. We now need to prune away - * cases where it should be disabled. */ - if (TARGET_64BIT && m->call_ms2sysv) - { - gcc_assert (TARGET_64BIT_MS_ABI); - gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES); - gcc_assert (!TARGET_SEH); - gcc_assert (TARGET_SSE); - gcc_assert (!ix86_using_red_zone ()); + if (ix86_function_naked (current_function_decl)) + return; - if (crtl->calls_eh_return) - { - gcc_assert (!reload_completed); - m->call_ms2sysv = false; - warn_once_call_ms2sysv_xlogues ("__builtin_eh_return"); - } + ix86_finalize_stack_frame_flags (); - else if (ix86_static_chain_on_stack) - { - gcc_assert (!reload_completed); - m->call_ms2sysv = false; - warn_once_call_ms2sysv_xlogues ("static call chains"); - } + /* DRAP should not coexist with stack_realign_fp */ + gcc_assert (!(crtl->drap_reg && stack_realign_fp)); - /* Finally, compute which registers the stub will manage. */ - else - { - unsigned count = xlogue_layout::count_stub_managed_regs (); - m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS; - m->call_ms2sysv_pad_in = 0; - } - } + memset (&m->fs, 0, sizeof (m->fs)); - frame->nregs = ix86_nsaved_regs (); - frame->nsseregs = ix86_nsaved_sseregs (); + /* Initialize CFA state for before the prologue. */ + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET; - /* 64-bit MS ABI seem to require stack alignment to be always 16, - except for function prologues, leaf functions and when the defult - incoming stack boundary is overriden at command line or via - force_align_arg_pointer attribute. + /* Track SP offset to the CFA. We continue tracking this after we've + swapped the CFA register away from SP. In the case of re-alignment + this is fudged; we're interested to offsets within the local frame. */ + m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; + m->fs.sp_valid = true; + m->fs.sp_realigned = false; - Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants - at call sites, including profile function calls. - */ - if (((TARGET_64BIT_MS_ABI || TARGET_MACHO) - && crtl->preferred_stack_boundary < 128) - && (!crtl->is_leaf || cfun->calls_alloca != 0 - || ix86_current_function_calls_tls_descriptor - || (TARGET_MACHO && crtl->profile) - || ix86_incoming_stack_boundary < 128)) + const struct ix86_frame &frame = cfun->machine->frame; + + if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl)) { - crtl->preferred_stack_boundary = 128; - crtl->stack_alignment_needed = 128; - } + /* We should have already generated an error for any use of + ms_hook on a nested function. */ + gcc_checking_assert (!ix86_static_chain_on_stack); - stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; - preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; + /* Check if profiling is active and we shall use profiling before + prologue variant. If so sorry. 
*/ + if (crtl->profile && flag_fentry != 0) + sorry ("% attribute is not compatible " + "with %<-mfentry%> for 32-bit"); - gcc_assert (!size || stack_alignment_needed); - gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); - gcc_assert (preferred_alignment <= stack_alignment_needed); + /* In ix86_asm_output_function_label we emitted: + 8b ff movl.s %edi,%edi + 55 push %ebp + 8b ec movl.s %esp,%ebp - /* The only ABI saving SSE regs should be 64-bit ms_abi. */ - gcc_assert (TARGET_64BIT || !frame->nsseregs); - if (TARGET_64BIT && m->call_ms2sysv) - { - gcc_assert (stack_alignment_needed >= 16); - gcc_assert (!frame->nsseregs); - } + This matches the hookable function prologue in Win32 API + functions in Microsoft Windows XP Service Pack 2 and newer. + Wine uses this to enable Windows apps to hook the Win32 API + functions provided by Wine. - /* For SEH we have to limit the amount of code movement into the prologue. - At present we do this via a BLOCKAGE, at which point there's very little - scheduling that can be done, which means that there's very little point - in doing anything except PUSHs. */ - if (TARGET_SEH) - m->use_fast_prologue_epilogue = false; - else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))) - { - int count = frame->nregs; - struct cgraph_node *node = cgraph_node::get (current_function_decl); + What that means is that we've already set up the frame pointer. */ - /* The fast prologue uses move instead of push to save registers. This - is significantly longer, but also executes faster as modern hardware - can execute the moves in parallel, but can't do that for push/pop. + if (frame_pointer_needed + && !(crtl->drap_reg && crtl->stack_realign_needed)) + { + rtx push, mov; - Be careful about choosing what prologue to emit: When function takes - many instructions to execute we may use slow version as well as in - case function is known to be outside hot spot (this is known with - feedback only). Weight the size of function by number of registers - to save as it is cheap to use one or two push instructions but very - slow to use many of them. */ - if (count) - count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; - if (node->frequency < NODE_FREQUENCY_NORMAL - || (flag_branch_probabilities - && node->frequency < NODE_FREQUENCY_HOT)) - m->use_fast_prologue_epilogue = false; - else - m->use_fast_prologue_epilogue - = !expensive_function_p (count); - } + /* We've decided to use the frame pointer already set up. + Describe this to the unwinder by pretending that both + push and mov insns happen right here. - frame->save_regs_using_mov - = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue - /* If static stack checking is enabled and done with probes, - the registers need to be saved before allocating the frame. */ - && flag_stack_check != STATIC_BUILTIN_STACK_CHECK); + Putting the unwind info here at the end of the ms_hook + is done so that we can make absolutely certain we get + the required byte sequence at the start of the function, + rather than relying on an assembler that can produce + the exact encoding required. - /* Skip return address and error code in exception handler. */ - offset = INCOMING_FRAME_SP_OFFSET; + However it does mean (in the unpatched case) that we have + a 1 insn window where the asynchronous unwind info is + incorrect. However, if we placed the unwind info at + its correct location we would have incorrect unwind info + in the patched case. 
Which is probably all moot since + I don't expect Wine generates dwarf2 unwind info for the + system libraries that use this feature. */ - /* Skip pushed static chain. */ - if (ix86_static_chain_on_stack) - offset += UNITS_PER_WORD; + insn = emit_insn (gen_blockage ()); - /* Skip saved base pointer. */ - if (frame_pointer_needed) - offset += UNITS_PER_WORD; - frame->hfp_save_offset = offset; + push = gen_push (hard_frame_pointer_rtx); + mov = gen_rtx_SET (hard_frame_pointer_rtx, + stack_pointer_rtx); + RTX_FRAME_RELATED_P (push) = 1; + RTX_FRAME_RELATED_P (mov) = 1; - /* The traditional frame pointer location is at the top of the frame. */ - frame->hard_frame_pointer_offset = offset; + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov))); - /* Register save area */ - offset += frame->nregs * UNITS_PER_WORD; - frame->reg_save_offset = offset; + /* Note that gen_push incremented m->fs.cfa_offset, even + though we didn't emit the push insn here. */ + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = m->fs.cfa_offset; + m->fs.fp_valid = true; + } + else + { + /* The frame pointer is not needed so pop %ebp again. + This leaves us with a pristine state. */ + emit_insn (gen_pop (hard_frame_pointer_rtx)); + } + } - /* On SEH target, registers are pushed just before the frame pointer - location. */ - if (TARGET_SEH) - frame->hard_frame_pointer_offset = offset; + /* The first insn of a function that accepts its static chain on the + stack is to push the register that would be filled in by a direct + call. This insn will be skipped by the trampoline. */ + else if (ix86_static_chain_on_stack) + { + static_chain = ix86_static_chain (cfun->decl, false); + insn = emit_insn (gen_push (static_chain)); + emit_insn (gen_blockage ()); - /* Calculate the size of the va-arg area (not including padding, if any). */ - frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size; + /* We don't want to interpret this push insn as a register save, + only as a stack adjustment. The real copy of the register as + a save will be done later, if needed. */ + t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); + t = gen_rtx_SET (stack_pointer_rtx, t); + add_reg_note (insn, REG_CFA_ADJUST_CFA, t); + RTX_FRAME_RELATED_P (insn) = 1; + } - /* Also adjust stack_realign_offset for the largest alignment of - stack slot actually used. */ - if (stack_realign_fp - || (cfun->machine->max_used_stack_alignment != 0 - && (offset % cfun->machine->max_used_stack_alignment) != 0)) + /* Emit prologue code to adjust stack alignment and setup DRAP, in case + of DRAP is needed and stack realignment is really needed after reload */ + if (stack_realign_drap) { - /* We may need a 16-byte aligned stack for the remainder of the - register save area, but the stack frame for the local function - may require a greater alignment if using AVX/2/512. In order - to avoid wasting space, we first calculate the space needed for - the rest of the register saves, add that to the stack pointer, - and then realign the stack to the boundary of the start of the - frame for the local function. */ - HOST_WIDE_INT space_needed = 0; - HOST_WIDE_INT sse_reg_space_needed = 0; + int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; - if (TARGET_64BIT) - { - if (m->call_ms2sysv) - { - m->call_ms2sysv_pad_in = 0; - space_needed = xlogue_layout::get_instance ().get_stack_space_used (); - } + /* Can't use DRAP in interrupt function. 
*/ + if (cfun->machine->func_type != TYPE_NORMAL) + sorry ("Dynamic Realign Argument Pointer (DRAP) not supported " + "in interrupt service routine. This may be worked " + "around by avoiding functions with aggregate return."); - else if (frame->nsseregs) - /* The only ABI that has saved SSE registers (Win64) also has a - 16-byte aligned default stack. However, many programs violate - the ABI, and Wine64 forces stack realignment to compensate. */ - space_needed = frame->nsseregs * 16; + /* Only need to push parameter pointer reg if it is caller saved. */ + if (!call_used_or_fixed_reg_p (REGNO (crtl->drap_reg))) + { + /* Push arg pointer reg */ + insn = emit_insn (gen_push (crtl->drap_reg)); + RTX_FRAME_RELATED_P (insn) = 1; + } - sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16); + /* Grab the argument pointer. */ + t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset); + insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t)); + RTX_FRAME_RELATED_P (insn) = 1; + m->fs.cfa_reg = crtl->drap_reg; + m->fs.cfa_offset = 0; - /* 64-bit frame->va_arg_size should always be a multiple of 16, but - rounding to be pedantic. */ - space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16); - } - else - space_needed = frame->va_arg_size; + /* Align the stack. */ + insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, + stack_pointer_rtx, + GEN_INT (-align_bytes))); + RTX_FRAME_RELATED_P (insn) = 1; - /* Record the allocation size required prior to the realignment AND. */ - frame->stack_realign_allocate = space_needed; + /* Replicate the return address on the stack so that return + address can be reached via (argp - 1) slot. This is needed + to implement macro RETURN_ADDR_RTX and intrinsic function + expand_builtin_return_addr etc. */ + t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD); + t = gen_frame_mem (word_mode, t); + insn = emit_insn (gen_push (t)); + RTX_FRAME_RELATED_P (insn) = 1; - /* The re-aligned stack starts at frame->stack_realign_offset. Values - before this point are not directly comparable with values below - this point. Use sp_valid_at to determine if the stack pointer is - valid for a given offset, fp_valid_at for the frame pointer, or - choose_baseaddr to have a base register chosen for you. + /* For the purposes of frame and register save area addressing, + we've started over with a new frame. */ + m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; + m->fs.realigned = true; - Note that the result of (frame->stack_realign_offset - & (stack_alignment_needed - 1)) may not equal zero. */ - offset = ROUND_UP (offset + space_needed, stack_alignment_needed); - frame->stack_realign_offset = offset - space_needed; - frame->sse_reg_save_offset = frame->stack_realign_offset - + sse_reg_space_needed; + if (static_chain) + { + /* Replicate static chain on the stack so that static chain + can be reached via (argp - 2) slot. This is needed for + nested function with stack realignment. */ + insn = emit_insn (gen_push (static_chain)); + RTX_FRAME_RELATED_P (insn) = 1; + } } - else + + int_registers_saved = (frame.nregs == 0); + sse_registers_saved = (frame.nsseregs == 0); + save_stub_call_needed = (m->call_ms2sysv); + gcc_assert (sse_registers_saved || !save_stub_call_needed); + + if (frame_pointer_needed && !m->fs.fp_valid) { - frame->stack_realign_offset = offset; + /* Note: AT&T enter does NOT have reversed args. Enter is probably + slower on all targets. Also sdb didn't like it. 
*/ + insn = emit_insn (gen_push (hard_frame_pointer_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; - if (TARGET_64BIT && m->call_ms2sysv) + /* Push registers now, before setting the frame pointer + on SEH target. */ + if (!int_registers_saved + && TARGET_SEH + && !frame.save_regs_using_mov) { - m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD); - offset += xlogue_layout::get_instance ().get_stack_space_used (); + ix86_emit_save_regs (); + int_registers_saved = true; + gcc_assert (m->fs.sp_offset == frame.reg_save_offset); } - /* Align and set SSE register save area. */ - else if (frame->nsseregs) + if (m->fs.sp_offset == frame.hard_frame_pointer_offset) { - /* If the incoming stack boundary is at least 16 bytes, or DRAP is - required and the DRAP re-alignment boundary is at least 16 bytes, - then we want the SSE register save area properly aligned. */ - if (ix86_incoming_stack_boundary >= 128 - || (stack_realign_drap && stack_alignment_needed >= 16)) - offset = ROUND_UP (offset, 16); - offset += frame->nsseregs * 16; + insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx); + RTX_FRAME_RELATED_P (insn) = 1; + + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = m->fs.sp_offset; + m->fs.fp_valid = true; } - frame->sse_reg_save_offset = offset; - offset += frame->va_arg_size; } - /* Align start of frame for local function. When a function call - is removed, it may become a leaf function. But if argument may - be passed on stack, we need to align the stack when there is no - tail call. */ - if (m->call_ms2sysv - || frame->va_arg_size != 0 - || size != 0 - || !crtl->is_leaf - || (!crtl->tail_call_emit - && cfun->machine->outgoing_args_on_stack) - || cfun->calls_alloca - || ix86_current_function_calls_tls_descriptor) - offset = ROUND_UP (offset, stack_alignment_needed); - - /* Frame pointer points here. */ - frame->frame_pointer_offset = offset; - - offset += size; - - /* Add outgoing arguments area. Can be skipped if we eliminated - all the function calls as dead code. - Skipping is however impossible when function calls alloca. Alloca - expander assumes that last crtl->outgoing_args_size - of stack frame are unused. */ - if (ACCUMULATE_OUTGOING_ARGS - && (!crtl->is_leaf || cfun->calls_alloca - || ix86_current_function_calls_tls_descriptor)) + if (!int_registers_saved) { - offset += crtl->outgoing_args_size; - frame->outgoing_arguments_size = crtl->outgoing_args_size; - } - else - frame->outgoing_arguments_size = 0; + /* If saving registers via PUSH, do so now. */ + if (!frame.save_regs_using_mov) + { + ix86_emit_save_regs (); + int_registers_saved = true; + gcc_assert (m->fs.sp_offset == frame.reg_save_offset); + } - /* Align stack boundary. Only needed if we're calling another function - or using alloca. */ - if (!crtl->is_leaf || cfun->calls_alloca - || ix86_current_function_calls_tls_descriptor) - offset = ROUND_UP (offset, preferred_alignment); + /* When using red zone we may start register saving before allocating + the stack frame saving one cycle of the prologue. However, avoid + doing this if we have to probe the stack; at least on x86_64 the + stack probe can turn into a call that clobbers a red zone location. */ + else if (ix86_using_red_zone () + && (! TARGET_STACK_PROBE + || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) + { + ix86_emit_save_regs_using_mov (frame.reg_save_offset); + int_registers_saved = true; + } + } - /* We've reached end of stack frame. 
*/ - frame->stack_pointer_offset = offset; + if (stack_realign_fp) + { + int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; + gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT); - /* Size prologue needs to allocate. */ - to_allocate = offset - frame->sse_reg_save_offset; + /* Record last valid frame pointer offset. */ + m->fs.sp_realigned_fp_last = frame.reg_save_offset; - if ((!to_allocate && frame->nregs <= 1) - || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) - /* If stack clash probing needs a loop, then it needs a - scratch register. But the returned register is only guaranteed - to be safe to use after register saves are complete. So if - stack clash protections are enabled and the allocated frame is - larger than the probe interval, then use pushes to save - callee saved registers. */ - || (flag_stack_clash_protection && to_allocate > get_probe_interval ())) - frame->save_regs_using_mov = false; + /* The computation of the size of the re-aligned stack frame means + that we must allocate the size of the register save area before + performing the actual alignment. Otherwise we cannot guarantee + that there's enough storage above the realignment point. */ + allocate = frame.reg_save_offset - m->fs.sp_offset + + frame.stack_realign_allocate; + if (allocate) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-allocate), -1, false); - if (ix86_using_red_zone () - && crtl->sp_is_unchanging - && crtl->is_leaf - && !ix86_pc_thunk_call_expanded - && !ix86_current_function_calls_tls_descriptor) - { - frame->red_zone_size = to_allocate; - if (frame->save_regs_using_mov) - frame->red_zone_size += frame->nregs * UNITS_PER_WORD; - if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE) - frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE; - } - else - frame->red_zone_size = 0; - frame->stack_pointer_offset -= frame->red_zone_size; + /* Align the stack. */ + insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, + stack_pointer_rtx, + GEN_INT (-align_bytes))); + m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes); + m->fs.sp_realigned_offset = m->fs.sp_offset + - frame.stack_realign_allocate; + /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset. + Beyond this point, stack access should be done via choose_baseaddr or + by using sp_valid_at and fp_valid_at to determine the correct base + register. Henceforth, any CFA offset should be thought of as logical + and not physical. */ + gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last); + gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset); + m->fs.sp_realigned = true; - /* The SEH frame pointer location is near the bottom of the frame. - This is enforced by the fact that the difference between the - stack pointer and the frame pointer is limited to 240 bytes in - the unwind data structure. */ - if (TARGET_SEH) - { - HOST_WIDE_INT diff; + /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which + is needed to describe where a register is saved using a realigned + stack pointer, so we need to invalidate the stack pointer for that + target. */ + if (TARGET_SEH) + m->fs.sp_valid = false; - /* If we can leave the frame pointer where it is, do so. Also, returns - the establisher frame for __builtin_frame_address (0). 
*/ - diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; - if (diff <= SEH_MAX_FRAME_SIZE - && (diff > 240 || (diff & 15) != 0) - && !crtl->accesses_prior_frames) + /* If SP offset is non-immediate after allocation of the stack frame, + then emit SSE saves or stub call prior to allocating the rest of the + stack frame. This is less efficient for the out-of-line stub because + we can't combine allocations across the call barrier, but it's better + than using a scratch register. */ + else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset + - m->fs.sp_realigned_offset), + Pmode)) { - /* Ideally we'd determine what portion of the local stack frame - (within the constraint of the lowest 240) is most heavily used. - But without that complication, simply bias the frame pointer - by 128 bytes so as to maximize the amount of the local stack - frame that is addressable with 8-bit offsets. */ - frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128; + if (!sse_registers_saved) + { + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); + sse_registers_saved = true; + } + else if (save_stub_call_needed) + { + ix86_emit_outlined_ms2sysv_save (frame); + save_stub_call_needed = false; + } } } -} - -/* This is semi-inlined memory_address_length, but simplified - since we know that we're always dealing with reg+offset, and - to avoid having to create and discard all that rtl. */ -static inline int -choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset) -{ - int len = 4; + allocate = frame.stack_pointer_offset - m->fs.sp_offset; - if (offset == 0) + if (flag_stack_usage_info) { - /* EBP and R13 cannot be encoded without an offset. */ - len = (regno == BP_REG || regno == R13_REG); - } - else if (IN_RANGE (offset, -128, 127)) - len = 1; + /* We start to count from ARG_POINTER. */ + HOST_WIDE_INT stack_size = frame.stack_pointer_offset; - /* ESP and R12 must be encoded with a SIB byte. */ - if (regno == SP_REG || regno == R12_REG) - len++; + /* If it was realigned, take into account the fake frame. */ + if (stack_realign_drap) + { + if (ix86_static_chain_on_stack) + stack_size += UNITS_PER_WORD; - return len; -} + if (!call_used_or_fixed_reg_p (REGNO (crtl->drap_reg))) + stack_size += UNITS_PER_WORD; -/* Determine if the stack pointer is valid for accessing the CFA_OFFSET in - the frame save area. The register is saved at CFA - CFA_OFFSET. */ + /* This over-estimates by 1 minimal-stack-alignment-unit but + mitigates that by counting in the new return address slot. */ + current_function_dynamic_stack_size + += crtl->stack_alignment_needed / BITS_PER_UNIT; + } -static bool -sp_valid_at (HOST_WIDE_INT cfa_offset) -{ - const struct machine_frame_state &fs = cfun->machine->fs; - if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset) - { - /* Validate that the cfa_offset isn't in a "no-man's land". */ - gcc_assert (cfa_offset <= fs.sp_realigned_fp_last); - return false; + current_function_static_stack_size = stack_size; } - return fs.sp_valid; -} -/* Determine if the frame pointer is valid for accessing the CFA_OFFSET in - the frame save area. The register is saved at CFA - CFA_OFFSET. */ - -static inline bool -fp_valid_at (HOST_WIDE_INT cfa_offset) -{ - const struct machine_frame_state &fs = cfun->machine->fs; - if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last) + /* On SEH target with very large frame size, allocate an area to save + SSE registers (as the very large allocation won't be described). 
*/ + if (TARGET_SEH + && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE + && !sse_registers_saved) { - /* Validate that the cfa_offset isn't in a "no-man's land". */ - gcc_assert (cfa_offset >= fs.sp_realigned_offset); - return false; - } - return fs.fp_valid; -} - -/* Choose a base register based upon alignment requested, speed and/or - size. */ - -static void -choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg, - HOST_WIDE_INT &base_offset, - unsigned int align_reqested, unsigned int *align) -{ - const struct machine_function *m = cfun->machine; - unsigned int hfp_align; - unsigned int drap_align; - unsigned int sp_align; - bool hfp_ok = fp_valid_at (cfa_offset); - bool drap_ok = m->fs.drap_valid; - bool sp_ok = sp_valid_at (cfa_offset); - - hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY; + HOST_WIDE_INT sse_size + = frame.sse_reg_save_offset - frame.reg_save_offset; - /* Filter out any registers that don't meet the requested alignment - criteria. */ - if (align_reqested) - { - if (m->fs.realigned) - hfp_align = drap_align = sp_align = crtl->stack_alignment_needed; - /* SEH unwind code does do not currently support REG_CFA_EXPRESSION - notes (which we would need to use a realigned stack pointer), - so disable on SEH targets. */ - else if (m->fs.sp_realigned) - sp_align = crtl->stack_alignment_needed; + gcc_assert (int_registers_saved); - hfp_ok = hfp_ok && hfp_align >= align_reqested; - drap_ok = drap_ok && drap_align >= align_reqested; - sp_ok = sp_ok && sp_align >= align_reqested; + /* No need to do stack checking as the area will be immediately + written. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-sse_size), -1, + m->fs.cfa_reg == stack_pointer_rtx); + allocate -= sse_size; + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); + sse_registers_saved = true; } - if (m->use_fast_prologue_epilogue) + /* The stack has already been decremented by the instruction calling us + so probe if the size is non-negative to preserve the protection area. */ + if (allocate >= 0 + && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || flag_stack_clash_protection)) { - /* Choose the base register most likely to allow the most scheduling - opportunities. Generally FP is valid throughout the function, - while DRAP must be reloaded within the epilogue. But choose either - over the SP due to increased encoding size. 
*/ - - if (hfp_ok) + if (flag_stack_clash_protection) { - base_reg = hard_frame_pointer_rtx; - base_offset = m->fs.fp_offset - cfa_offset; + ix86_adjust_stack_and_probe_stack_clash (allocate, + int_registers_saved); + allocate = 0; } - else if (drap_ok) + else if (STACK_CHECK_MOVING_SP) { - base_reg = crtl->drap_reg; - base_offset = 0 - cfa_offset; + if (!(crtl->is_leaf && !cfun->calls_alloca + && allocate <= get_probe_interval ())) + { + ix86_adjust_stack_and_probe (allocate, int_registers_saved); + allocate = 0; + } } - else if (sp_ok) + else { - base_reg = stack_pointer_rtx; - base_offset = m->fs.sp_offset - cfa_offset; + HOST_WIDE_INT size = allocate; + + if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000)) + size = 0x80000000 - get_stack_check_protect () - 1; + + if (TARGET_STACK_PROBE) + { + if (crtl->is_leaf && !cfun->calls_alloca) + { + if (size > get_probe_interval ()) + ix86_emit_probe_stack_range (0, size, int_registers_saved); + } + else + ix86_emit_probe_stack_range (0, + size + get_stack_check_protect (), + int_registers_saved); + } + else + { + if (crtl->is_leaf && !cfun->calls_alloca) + { + if (size > get_probe_interval () + && size > get_stack_check_protect ()) + ix86_emit_probe_stack_range (get_stack_check_protect (), + (size + - get_stack_check_protect ()), + int_registers_saved); + } + else + ix86_emit_probe_stack_range (get_stack_check_protect (), size, + int_registers_saved); + } } } + + if (allocate == 0) + ; + else if (!ix86_target_stack_probe () + || frame.stack_pointer_offset < CHECK_STACK_LIMIT) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-allocate), -1, + m->fs.cfa_reg == stack_pointer_rtx); + } else { - HOST_WIDE_INT toffset; - int len = 16, tlen; + rtx eax = gen_rtx_REG (Pmode, AX_REG); + rtx r10 = NULL; + rtx (*adjust_stack_insn)(rtx, rtx, rtx); + const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); + bool eax_live = ix86_eax_live_at_start_p (); + bool r10_live = false; - /* Choose the base register with the smallest address encoding. - With a tie, choose FP > DRAP > SP. */ - if (sp_ok) - { - base_reg = stack_pointer_rtx; - base_offset = m->fs.sp_offset - cfa_offset; - len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset); - } - if (drap_ok) + if (TARGET_64BIT) + r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0); + + if (eax_live) { - toffset = 0 - cfa_offset; - tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset); - if (tlen <= len) + insn = emit_insn (gen_push (eax)); + allocate -= UNITS_PER_WORD; + /* Note that SEH directives need to continue tracking the stack + pointer even after the frame pointer has been set up. 
*/ + if (sp_is_cfa_reg || TARGET_SEH) { - base_reg = crtl->drap_reg; - base_offset = toffset; - len = tlen; + if (sp_is_cfa_reg) + m->fs.cfa_offset += UNITS_PER_WORD; + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -UNITS_PER_WORD))); } } - if (hfp_ok) + + if (r10_live) { - toffset = m->fs.fp_offset - cfa_offset; - tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset); - if (tlen <= len) + r10 = gen_rtx_REG (Pmode, R10_REG); + insn = emit_insn (gen_push (r10)); + allocate -= UNITS_PER_WORD; + if (sp_is_cfa_reg || TARGET_SEH) { - base_reg = hard_frame_pointer_rtx; - base_offset = toffset; + if (sp_is_cfa_reg) + m->fs.cfa_offset += UNITS_PER_WORD; + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -UNITS_PER_WORD))); } } - } - /* Set the align return value. */ - if (align) - { - if (base_reg == stack_pointer_rtx) - *align = sp_align; - else if (base_reg == crtl->drap_reg) - *align = drap_align; - else if (base_reg == hard_frame_pointer_rtx) - *align = hfp_align; - } -} + emit_move_insn (eax, GEN_INT (allocate)); + emit_insn (ix86_gen_allocate_stack_worker (eax, eax)); -/* Return an RTX that points to CFA_OFFSET within the stack frame and - the alignment of address. If ALIGN is non-null, it should point to - an alignment value (in bits) that is preferred or zero and will - recieve the alignment of the base register that was selected, - irrespective of rather or not CFA_OFFSET is a multiple of that - alignment value. If it is possible for the base register offset to be - non-immediate then SCRATCH_REGNO should specify a scratch register to - use. + /* Use the fact that AX still contains ALLOCATE. */ + adjust_stack_insn = (Pmode == DImode + ? gen_pro_epilogue_adjust_stack_di_sub + : gen_pro_epilogue_adjust_stack_si_sub); - The valid base registers are taken from CFUN->MACHINE->FS. */ + insn = emit_insn (adjust_stack_insn (stack_pointer_rtx, + stack_pointer_rtx, eax)); -static rtx -choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align, - unsigned int scratch_regno = INVALID_REGNUM) -{ - rtx base_reg = NULL; - HOST_WIDE_INT base_offset = 0; + if (sp_is_cfa_reg || TARGET_SEH) + { + if (sp_is_cfa_reg) + m->fs.cfa_offset += allocate; + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -allocate))); + } + m->fs.sp_offset += allocate; - /* If a specific alignment is requested, try to get a base register - with that alignment first. */ - if (align && *align) - choose_basereg (cfa_offset, base_reg, base_offset, *align, align); + /* Use stack_pointer_rtx for relative addressing so that code works for + realigned stack. But this means that we need a blockage to prevent + stores based on the frame pointer from being scheduled before. */ + if (r10_live && eax_live) + { + t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); + emit_move_insn (gen_rtx_REG (word_mode, R10_REG), + gen_frame_mem (word_mode, t)); + t = plus_constant (Pmode, t, UNITS_PER_WORD); + emit_move_insn (gen_rtx_REG (word_mode, AX_REG), + gen_frame_mem (word_mode, t)); + emit_insn (gen_memory_blockage ()); + } + else if (eax_live || r10_live) + { + t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); + emit_move_insn (gen_rtx_REG (word_mode, + (eax_live ? 
AX_REG : R10_REG)), + gen_frame_mem (word_mode, t)); + emit_insn (gen_memory_blockage ()); + } + } + gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset); - if (!base_reg) - choose_basereg (cfa_offset, base_reg, base_offset, 0, align); + /* If we havn't already set up the frame pointer, do so now. */ + if (frame_pointer_needed && !m->fs.fp_valid) + { + insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx, + GEN_INT (frame.stack_pointer_offset + - frame.hard_frame_pointer_offset)); + insn = emit_insn (insn); + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL); - gcc_assert (base_reg != NULL); + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = frame.hard_frame_pointer_offset; + m->fs.fp_valid = true; + } - rtx base_offset_rtx = GEN_INT (base_offset); + if (!int_registers_saved) + ix86_emit_save_regs_using_mov (frame.reg_save_offset); + if (!sse_registers_saved) + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); + else if (save_stub_call_needed) + ix86_emit_outlined_ms2sysv_save (frame); - if (!x86_64_immediate_operand (base_offset_rtx, Pmode)) + /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT + in PROLOGUE. */ + if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry) { - gcc_assert (scratch_regno != INVALID_REGNUM); - - rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno); - emit_move_insn (scratch_reg, base_offset_rtx); - - return gen_rtx_PLUS (Pmode, base_reg, scratch_reg); + rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM); + insn = emit_insn (gen_set_got (pic)); + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); + emit_insn (gen_prologue_use (pic)); + /* Deleting already emmitted SET_GOT if exist and allocated to + REAL_PIC_OFFSET_TABLE_REGNUM. */ + ix86_elim_entry_set_got (pic); } - return plus_constant (Pmode, base_reg, base_offset); -} - -/* Emit code to save registers in the prologue. */ + if (crtl->drap_reg && !crtl->stack_realign_needed) + { + /* vDRAP is setup but after reload it turns out stack realign + isn't necessary, here we will emit prologue to setup DRAP + without stack realign adjustment */ + t = choose_baseaddr (0, NULL); + emit_insn (gen_rtx_SET (crtl->drap_reg, t)); + } -static void -ix86_emit_save_regs (void) -{ - unsigned int regno; - rtx_insn *insn; + /* Prevent instructions from being scheduled into register save push + sequence when access to the redzone area is done through frame pointer. + The offset between the frame pointer and the stack pointer is calculated + relative to the value of the stack pointer at the end of the function + prologue, and moving instructions that access redzone area via frame + pointer inside push sequence violates this assumption. */ + if (frame_pointer_needed && frame.red_zone_size) + emit_insn (gen_memory_blockage ()); - for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; ) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - { - insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); - RTX_FRAME_RELATED_P (insn) = 1; - } + /* SEH requires that the prologue end within 256 bytes of the start of + the function. Prevent instruction schedules that would extend that. + Further, prevent alloca modifications to the stack pointer from being + combined with prologue modifications. */ + if (TARGET_SEH) + emit_insn (gen_prologue_use (stack_pointer_rtx)); } -/* Emit a single register save at CFA - CFA_OFFSET. 
*/ +/* Emit code to restore REG using a POP insn. */ static void -ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno, - HOST_WIDE_INT cfa_offset) +ix86_emit_restore_reg_using_pop (rtx reg) { struct machine_function *m = cfun->machine; - rtx reg = gen_rtx_REG (mode, regno); - rtx mem, addr, base, insn; - unsigned int align = GET_MODE_ALIGNMENT (mode); - - addr = choose_baseaddr (cfa_offset, &align); - mem = gen_frame_mem (mode, addr); + rtx_insn *insn = emit_insn (gen_pop (reg)); - /* The location aligment depends upon the base register. */ - align = MIN (GET_MODE_ALIGNMENT (mode), align); - gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); - set_mem_align (mem, align); + ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); + m->fs.sp_offset -= UNITS_PER_WORD; - insn = emit_insn (gen_rtx_SET (mem, reg)); - RTX_FRAME_RELATED_P (insn) = 1; + if (m->fs.cfa_reg == crtl->drap_reg + && REGNO (reg) == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. */ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; - base = addr; - if (GET_CODE (base) == PLUS) - base = XEXP (base, 0); - gcc_checking_assert (REG_P (base)); + /* This means that the DRAP register is valid for addressing too. */ + m->fs.drap_valid = true; + return; + } - /* When saving registers into a re-aligned local stack frame, avoid - any tricky guessing by dwarf2out. */ - if (m->fs.realigned) + if (m->fs.cfa_reg == stack_pointer_rtx) { - gcc_checking_assert (stack_realign_drap); + rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn) = 1; - if (regno == REGNO (crtl->drap_reg)) - { - /* A bit of a hack. We force the DRAP register to be saved in - the re-aligned stack frame, which provides us with a copy - of the CFA that will last past the prologue. Install it. */ - gcc_checking_assert (cfun->machine->fs.fp_valid); - addr = plus_constant (Pmode, hard_frame_pointer_rtx, - cfun->machine->fs.fp_offset - cfa_offset); - mem = gen_rtx_MEM (mode, addr); - add_reg_note (insn, REG_CFA_DEF_CFA, mem); - } - else - { - /* The frame pointer is a stable reference within the - aligned frame. Use it. */ - gcc_checking_assert (cfun->machine->fs.fp_valid); - addr = plus_constant (Pmode, hard_frame_pointer_rtx, - cfun->machine->fs.fp_offset - cfa_offset); - mem = gen_rtx_MEM (mode, addr); - add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); - } + m->fs.cfa_offset -= UNITS_PER_WORD; } - else if (base == stack_pointer_rtx && m->fs.sp_realigned - && cfa_offset >= m->fs.sp_realigned_offset) + /* When the frame pointer is the CFA, and we pop it, we are + swapping back to the stack pointer as the CFA. This happens + for stack frames that don't allocate other data, so we assume + the stack pointer is now pointing at the return address, i.e. + the function entry state, which makes the offset be 1 word. 
*/ + if (reg == hard_frame_pointer_rtx) { - gcc_checking_assert (stack_realign_fp); - add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); + m->fs.fp_valid = false; + if (m->fs.cfa_reg == hard_frame_pointer_rtx) + { + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset -= UNITS_PER_WORD; + + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (m->fs.cfa_offset))); + RTX_FRAME_RELATED_P (insn) = 1; + } } +} - /* The memory may not be relative to the current CFA register, - which means that we may need to generate a new pattern for - use by the unwind info. */ - else if (base != m->fs.cfa_reg) +/* Emit code to restore saved registers using POP insns. */ + +static void +ix86_emit_restore_regs_using_pop (void) +{ + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true)) + ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); +} + +/* Emit code and notes for the LEAVE instruction. If insn is non-null, + omits the emit and only attaches the notes. */ + +static void +ix86_emit_leave (rtx_insn *insn) +{ + struct machine_function *m = cfun->machine; + if (!insn) + insn = emit_insn (ix86_gen_leave ()); + + ix86_add_queued_cfa_restore_notes (insn); + + gcc_assert (m->fs.fp_valid); + m->fs.sp_valid = true; + m->fs.sp_realigned = false; + m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD; + m->fs.fp_valid = false; + + if (m->fs.cfa_reg == hard_frame_pointer_rtx) { - addr = plus_constant (Pmode, m->fs.cfa_reg, - m->fs.cfa_offset - cfa_offset); - mem = gen_rtx_MEM (mode, addr); - add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg)); + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = m->fs.sp_offset; + + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_pointer_rtx, + m->fs.sp_offset)); + RTX_FRAME_RELATED_P (insn) = 1; } + ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, + m->fs.fp_offset); } -/* Emit code to save registers using MOV insns. - First register is stored at CFA - CFA_OFFSET. */ +/* Emit code to restore saved registers using MOV insns. + First register is restored from CFA - CFA_OFFSET. */ static void -ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) +ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, + bool maybe_eh_return) { + struct machine_function *m = cfun->machine; unsigned int regno; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) { - ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + rtx reg = gen_rtx_REG (word_mode, regno); + rtx mem; + rtx_insn *insn; + + mem = choose_baseaddr (cfa_offset, NULL); + mem = gen_frame_mem (word_mode, mem); + insn = emit_move_insn (reg, mem); + + if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. */ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; + + /* This means that the DRAP register is valid for addressing. 
*/ + m->fs.drap_valid = true; + } + else + ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + cfa_offset -= UNITS_PER_WORD; } } -/* Emit code to save SSE registers using MOV insns. - First register is stored at CFA - CFA_OFFSET. */ +/* Emit code to restore saved registers using MOV insns. + First register is restored from CFA - CFA_OFFSET. */ static void -ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset) +ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, + bool maybe_eh_return) { unsigned int regno; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) { - ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset); - cfa_offset -= GET_MODE_SIZE (V4SFmode); - } -} + rtx reg = gen_rtx_REG (V4SFmode, regno); + rtx mem; + unsigned int align = GET_MODE_ALIGNMENT (V4SFmode); -static GTY(()) rtx queued_cfa_restores; + mem = choose_baseaddr (cfa_offset, &align); + mem = gen_rtx_MEM (V4SFmode, mem); -/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack - manipulation insn. The value is on the stack at CFA - CFA_OFFSET. - Don't add the note if the previously saved value will be left untouched - within stack red-zone till return, as unwinders can find the same value - in the register and on the stack. */ + /* The location aligment depends upon the base register. */ + align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align); + gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); + set_mem_align (mem, align); + emit_insn (gen_rtx_SET (reg, mem)); -static void -ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset) -{ - if (!crtl->shrink_wrapped - && cfa_offset <= cfun->machine->fs.red_zone_offset) - return; + ix86_add_cfa_restore_note (NULL, reg, cfa_offset); - if (insn) - { - add_reg_note (insn, REG_CFA_RESTORE, reg); - RTX_FRAME_RELATED_P (insn) = 1; - } - else - queued_cfa_restores - = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores); + cfa_offset -= GET_MODE_SIZE (V4SFmode); + } } -/* Add queued REG_CFA_RESTORE notes if any to INSN. */ - static void -ix86_add_queued_cfa_restore_notes (rtx insn) -{ - rtx last; - if (!queued_cfa_restores) - return; - for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1)) - ; - XEXP (last, 1) = REG_NOTES (insn); - REG_NOTES (insn) = queued_cfa_restores; - queued_cfa_restores = NULL_RTX; - RTX_FRAME_RELATED_P (insn) = 1; -} - -/* Expand prologue or epilogue stack adjustment. - The pattern exist to put a dependency on all ebp-based memory accesses. - STYLE should be negative if instructions should be marked as frame related, - zero if %r11 register is live and cannot be freely used and positive - otherwise. 
*/ - -static rtx -pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, - int style, bool set_cfa) +ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame, + bool use_call, int style) { struct machine_function *m = cfun->machine; - rtx insn; - bool add_frame_related_expr = false; + const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS + + m->call_ms2sysv_extra_regs; + rtvec v; + unsigned int elems_needed, align, i, vi = 0; + rtx_insn *insn; + rtx sym, tmp; + rtx rsi = gen_rtx_REG (word_mode, SI_REG); + rtx r10 = NULL_RTX; + const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); + HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset (); + HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset; + rtx rsi_frame_load = NULL_RTX; + HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1; + enum xlogue_stub stub; - if (Pmode == SImode) - insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset); - else if (x86_64_immediate_operand (offset, DImode)) - insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset); - else - { - rtx tmp; - /* r11 is used by indirect sibcall return as well, set before the - epilogue and used after the epilogue. */ - if (style) - tmp = gen_rtx_REG (DImode, R11_REG); - else - { - gcc_assert (src != hard_frame_pointer_rtx - && dest != hard_frame_pointer_rtx); - tmp = hard_frame_pointer_rtx; - } - insn = emit_insn (gen_rtx_SET (tmp, offset)); - if (style < 0) - add_frame_related_expr = true; + gcc_assert (!m->fs.fp_valid || frame_pointer_needed); - insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp); - } + /* If using a realigned stack, we should never start with padding. */ + gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ()); - insn = emit_insn (insn); - if (style >= 0) - ix86_add_queued_cfa_restore_notes (insn); + /* Setup RSI as the stub's base pointer. */ + align = GET_MODE_ALIGNMENT (V4SFmode); + tmp = choose_baseaddr (rsi_offset, &align, SI_REG); + gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); - if (set_cfa) - { - rtx r; + emit_insn (gen_rtx_SET (rsi, tmp)); - gcc_assert (m->fs.cfa_reg == src); - m->fs.cfa_offset += INTVAL (offset); - m->fs.cfa_reg = dest; + /* Get a symbol for the stub. */ + if (frame_pointer_needed) + stub = use_call ? XLOGUE_STUB_RESTORE_HFP + : XLOGUE_STUB_RESTORE_HFP_TAIL; + else + stub = use_call ? XLOGUE_STUB_RESTORE + : XLOGUE_STUB_RESTORE_TAIL; + sym = xlogue.get_stub_rtx (stub); - r = gen_rtx_PLUS (Pmode, src, offset); - r = gen_rtx_SET (dest, r); - add_reg_note (insn, REG_CFA_ADJUST_CFA, r); - RTX_FRAME_RELATED_P (insn) = 1; - } - else if (style < 0) + elems_needed = ncregs; + if (use_call) + elems_needed += 1; + else + elems_needed += frame_pointer_needed ? 5 : 3; + v = rtvec_alloc (elems_needed); + + /* We call the epilogue stub when we need to pop incoming args or we are + doing a sibling call as the tail. Otherwise, we will emit a jmp to the + epilogue stub and it is the tail-call. 
*/ + if (use_call) + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + else { - RTX_FRAME_RELATED_P (insn) = 1; - if (add_frame_related_expr) + RTVEC_ELT (v, vi++) = ret_rtx; + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + if (frame_pointer_needed) { - rtx r = gen_rtx_PLUS (Pmode, src, offset); - r = gen_rtx_SET (dest, r); - add_reg_note (insn, REG_FRAME_RELATED_EXPR, r); + rtx rbp = gen_rtx_REG (DImode, BP_REG); + gcc_assert (m->fs.fp_valid); + gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx); + + tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8)); + RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp); + RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp)); + tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode)); + RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp); + } + else + { + /* If no hard frame pointer, we set R10 to the SP restore value. */ + gcc_assert (!m->fs.fp_valid); + gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); + gcc_assert (m->fs.sp_valid); + + r10 = gen_rtx_REG (DImode, R10_REG); + tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset)); + emit_insn (gen_rtx_SET (r10, tmp)); + + RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10); } } - if (dest == stack_pointer_rtx) + /* Generate frame load insns and restore notes. */ + for (i = 0; i < ncregs; ++i) { - HOST_WIDE_INT ooffset = m->fs.sp_offset; - bool valid = m->fs.sp_valid; - bool realigned = m->fs.sp_realigned; + const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); + machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode; + rtx reg, frame_load; - if (src == hard_frame_pointer_rtx) - { - valid = m->fs.fp_valid; - realigned = false; - ooffset = m->fs.fp_offset; - } - else if (src == crtl->drap_reg) + reg = gen_rtx_REG (mode, r.regno); + frame_load = gen_frame_load (reg, rsi, r.offset); + + /* Save RSI frame load insn & note to add last. */ + if (r.regno == SI_REG) { - valid = m->fs.drap_valid; - realigned = false; - ooffset = 0; + gcc_assert (!rsi_frame_load); + rsi_frame_load = frame_load; + rsi_restore_offset = r.offset; } else { - /* Else there are two possibilities: SP itself, which we set - up as the default above. Or EH_RETURN_STACKADJ_RTX, which is - taken care of this by hand along the eh_return path. */ - gcc_checking_assert (src == stack_pointer_rtx - || offset == const0_rtx); + RTVEC_ELT (v, vi++) = frame_load; + ix86_add_cfa_restore_note (NULL, reg, r.offset); } - - m->fs.sp_offset = ooffset - INTVAL (offset); - m->fs.sp_valid = valid; - m->fs.sp_realigned = realigned; } - return insn; -} - -/* Find an available register to be used as dynamic realign argument - pointer regsiter. Such a register will be written in prologue and - used in begin of body, so it must not be - 1. parameter passing register. - 2. GOT pointer. - We reuse static-chain register if it is available. Otherwise, we - use DI for i386 and R13 for x86-64. We chose R13 since it has - shorter encoding. - - Return: the regno of chosen register. */ -static unsigned int -find_drap_reg (void) -{ - tree decl = cfun->decl; + /* Add RSI frame load & restore note at the end. */ + gcc_assert (rsi_frame_load); + gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1); + RTVEC_ELT (v, vi++) = rsi_frame_load; + ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG), + rsi_restore_offset); - /* Always use callee-saved register if there are no caller-saved - registers. */ - if (TARGET_64BIT) + /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. 
*/ + if (!use_call && !frame_pointer_needed) { - /* Use R13 for nested function or function need static chain. - Since function with tail call may use any caller-saved - registers in epilogue, DRAP must not use caller-saved - register in such case. */ - if (DECL_STATIC_CHAIN (decl) - || cfun->machine->no_caller_saved_registers - || crtl->tail_call_emit) - return R13_REG; + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.sp_realigned); - return R10_REG; + /* At this point, R10 should point to frame.stack_realign_offset. */ + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset; + m->fs.sp_offset = frame.stack_realign_offset; } + + gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v)); + tmp = gen_rtx_PARALLEL (VOIDmode, v); + if (use_call) + insn = emit_insn (tmp); else { - /* Use DI for nested function or function need static chain. - Since function with tail call may use any caller-saved - registers in epilogue, DRAP must not use caller-saved - register in such case. */ - if (DECL_STATIC_CHAIN (decl) - || cfun->machine->no_caller_saved_registers - || crtl->tail_call_emit) - return DI_REG; + insn = emit_jump_insn (tmp); + JUMP_LABEL (insn) = ret_rtx; - /* Reuse static chain register if it isn't used for parameter - passing. */ - if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2) + if (frame_pointer_needed) + ix86_emit_leave (insn); + else { - unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl)); - if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0) - return CX_REG; + /* Need CFA adjust note. */ + tmp = gen_rtx_SET (stack_pointer_rtx, r10); + add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp); } - return DI_REG; } -} -/* Handle a "force_align_arg_pointer" attribute. */ + RTX_FRAME_RELATED_P (insn) = true; + ix86_add_queued_cfa_restore_notes (insn); -static tree -ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name, - tree, int, bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) + /* If we're not doing a tail-call, we need to adjust the stack. */ + if (use_call && m->fs.sp_valid) { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; + HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset; + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (dealloc), style, + m->fs.cfa_reg == stack_pointer_rtx); } - - return NULL_TREE; } -/* Return minimum incoming stack alignment. */ +/* Restore function stack, frame, and registers. */ -static unsigned int -ix86_minimum_incoming_stack_boundary (bool sibcall) +void +ix86_expand_epilogue (int style) { - unsigned int incoming_stack_boundary; - - /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */ - if (cfun->machine->func_type != TYPE_NORMAL) - incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY; - /* Prefer the one specified at command line. */ - else if (ix86_user_incoming_stack_boundary) - incoming_stack_boundary = ix86_user_incoming_stack_boundary; - /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary - if -mstackrealign is used, it isn't used for sibcall check and - estimated stack alignment is 128bit. 
*/ - else if (!sibcall - && ix86_force_align_arg_pointer - && crtl->stack_alignment_estimated == 128) - incoming_stack_boundary = MIN_STACK_BOUNDARY; - else - incoming_stack_boundary = ix86_default_incoming_stack_boundary; + struct machine_function *m = cfun->machine; + struct machine_frame_state frame_state_save = m->fs; + bool restore_regs_via_mov; + bool using_drap; + bool restore_stub_is_tail = false; - /* Incoming stack alignment can be changed on individual functions - via force_align_arg_pointer attribute. We use the smallest - incoming stack boundary. */ - if (incoming_stack_boundary > MIN_STACK_BOUNDARY - && lookup_attribute (ix86_force_align_arg_pointer_string, - TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) - incoming_stack_boundary = MIN_STACK_BOUNDARY; + if (ix86_function_naked (current_function_decl)) + { + /* The program should not reach this point. */ + emit_insn (gen_ud2 ()); + return; + } - /* The incoming stack frame has to be aligned at least at - parm_stack_boundary. */ - if (incoming_stack_boundary < crtl->parm_stack_boundary) - incoming_stack_boundary = crtl->parm_stack_boundary; + ix86_finalize_stack_frame_flags (); + const struct ix86_frame &frame = cfun->machine->frame; - /* Stack at entrance of main is aligned by runtime. We use the - smallest incoming stack boundary. */ - if (incoming_stack_boundary > MAIN_STACK_BOUNDARY - && DECL_NAME (current_function_decl) - && MAIN_NAME_P (DECL_NAME (current_function_decl)) - && DECL_FILE_SCOPE_P (current_function_decl)) - incoming_stack_boundary = MAIN_STACK_BOUNDARY; + m->fs.sp_realigned = stack_realign_fp; + m->fs.sp_valid = stack_realign_fp + || !frame_pointer_needed + || crtl->sp_is_unchanging; + gcc_assert (!m->fs.sp_valid + || m->fs.sp_offset == frame.stack_pointer_offset); - return incoming_stack_boundary; -} + /* The FP must be valid if the frame pointer is present. */ + gcc_assert (frame_pointer_needed == m->fs.fp_valid); + gcc_assert (!m->fs.fp_valid + || m->fs.fp_offset == frame.hard_frame_pointer_offset); -/* Update incoming stack boundary and estimated stack alignment. */ + /* We must have *some* valid pointer to the stack frame. */ + gcc_assert (m->fs.sp_valid || m->fs.fp_valid); -static void -ix86_update_stack_boundary (void) -{ - ix86_incoming_stack_boundary - = ix86_minimum_incoming_stack_boundary (false); + /* The DRAP is never valid at this point. */ + gcc_assert (!m->fs.drap_valid); - /* x86_64 vararg needs 16byte stack alignment for register save area. */ - if (TARGET_64BIT - && cfun->stdarg - && crtl->stack_alignment_estimated < 128) - crtl->stack_alignment_estimated = 128; + /* See the comment about red zone and frame + pointer usage in ix86_expand_prologue. */ + if (frame_pointer_needed && frame.red_zone_size) + emit_insn (gen_memory_blockage ()); - /* __tls_get_addr needs to be called with 16-byte aligned stack. */ - if (ix86_tls_descriptor_calls_expanded_in_cfun - && crtl->preferred_stack_boundary < 128) - crtl->preferred_stack_boundary = 128; -} + using_drap = crtl->drap_reg && crtl->stack_realign_needed; + gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg); -/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is - needed or an rtx for DRAP otherwise. */ + /* Determine the CFA offset of the end of the red-zone. */ + m->fs.red_zone_offset = 0; + if (ix86_using_red_zone () && crtl->args.pops_args < 65536) + { + /* The red-zone begins below return address and error code in + exception handler. 
*/ + m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET; -static rtx -ix86_get_drap_rtx (void) -{ - /* We must use DRAP if there are outgoing arguments on stack and - ACCUMULATE_OUTGOING_ARGS is false. */ - if (ix86_force_drap - || (cfun->machine->outgoing_args_on_stack - && !ACCUMULATE_OUTGOING_ARGS)) - crtl->need_drap = true; + /* When the register save area is in the aligned portion of + the stack, determine the maximum runtime displacement that + matches up with the aligned frame. */ + if (stack_realign_drap) + m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT + + UNITS_PER_WORD); + } - if (stack_realign_drap) - { - /* Assign DRAP to vDRAP and returns vDRAP */ - unsigned int regno = find_drap_reg (); - rtx drap_vreg; - rtx arg_ptr; - rtx_insn *seq, *insn; + HOST_WIDE_INT reg_save_offset = frame.reg_save_offset; - arg_ptr = gen_rtx_REG (Pmode, regno); - crtl->drap_reg = arg_ptr; + /* Special care must be taken for the normal return case of a function + using eh_return: the eax and edx registers are marked as saved, but + not restored along this path. Adjust the save location to match. */ + if (crtl->calls_eh_return && style != 2) + reg_save_offset -= 2 * UNITS_PER_WORD; - start_sequence (); - drap_vreg = copy_to_reg (arg_ptr); - seq = get_insns (); - end_sequence (); + /* EH_RETURN requires the use of moves to function properly. */ + if (crtl->calls_eh_return) + restore_regs_via_mov = true; + /* SEH requires the use of pops to identify the epilogue. */ + else if (TARGET_SEH) + restore_regs_via_mov = false; + /* If we're only restoring one register and sp cannot be used then + using a move instruction to restore the register since it's + less work than reloading sp and popping the register. */ + else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) + restore_regs_via_mov = true; + else if (TARGET_EPILOGUE_USING_MOVE + && cfun->machine->use_fast_prologue_epilogue + && (frame.nregs > 1 + || m->fs.sp_offset != reg_save_offset)) + restore_regs_via_mov = true; + else if (frame_pointer_needed + && !frame.nregs + && m->fs.sp_offset != reg_save_offset) + restore_regs_via_mov = true; + else if (frame_pointer_needed + && TARGET_USE_LEAVE + && cfun->machine->use_fast_prologue_epilogue + && frame.nregs == 1) + restore_regs_via_mov = true; + else + restore_regs_via_mov = false; - insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); - if (!optimize) + if (restore_regs_via_mov || frame.nsseregs) + { + /* Ensure that the entire register save area is addressable via + the stack pointer, if we will restore SSE regs via sp. */ + if (TARGET_64BIT + && m->fs.sp_offset > 0x7fffffff + && sp_valid_at (frame.stack_realign_offset + 1) + && (frame.nsseregs + frame.nregs) != 0) { - add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); - RTX_FRAME_RELATED_P (insn) = 1; + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset + - frame.sse_reg_save_offset), + style, + m->fs.cfa_reg == stack_pointer_rtx); } - return drap_vreg; } - else - return NULL; -} - -/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */ -static rtx -ix86_internal_arg_pointer (void) -{ - return virtual_incoming_args_rtx; -} + /* If there are any SSE registers to restore, then we have to do it + via moves, since there's obviously no pop for SSE regs. 
*/ + if (frame.nsseregs) + ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset, + style == 2); -struct scratch_reg { - rtx reg; - bool saved; -}; + if (m->call_ms2sysv) + { + int pop_incoming_args = crtl->args.pops_args && crtl->args.size; -/* Return a short-lived scratch register for use on function entry. - In 32-bit mode, it is valid only after the registers are saved - in the prologue. This register must be released by means of - release_scratch_register_on_entry once it is dead. */ + /* We cannot use a tail-call for the stub if: + 1. We have to pop incoming args, + 2. We have additional int regs to restore, or + 3. A sibling call will be the tail-call, or + 4. We are emitting an eh_return_internal epilogue. -static void -get_scratch_register_on_entry (struct scratch_reg *sr) -{ - int regno; + TODO: Item 4 has not yet tested! - sr->saved = false; + If any of the above are true, we will call the stub rather than + jump to it. */ + restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1); + ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style); + } - if (TARGET_64BIT) + /* If using out-of-line stub that is a tail-call, then...*/ + if (m->call_ms2sysv && restore_stub_is_tail) { - /* We always use R11 in 64-bit mode. */ - regno = R11_REG; + /* TODO: parinoid tests. (remove eventually) */ + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.sp_realigned); + gcc_assert (!m->fs.fp_valid); + gcc_assert (!m->fs.realigned); + gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); + gcc_assert (!crtl->drap_reg); + gcc_assert (!frame.nregs); } - else + else if (restore_regs_via_mov) { - tree decl = current_function_decl, fntype = TREE_TYPE (decl); - bool fastcall_p - = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; - bool thiscall_p - = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; - bool static_chain_p = DECL_STATIC_CHAIN (decl); - int regparm = ix86_function_regparm (fntype, decl); - int drap_regno - = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM; + rtx t; - /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax - for the static chain register. */ - if ((regparm < 1 || (fastcall_p && !static_chain_p)) - && drap_regno != AX_REG) - regno = AX_REG; - /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx - for the static chain register. */ - else if (thiscall_p && !static_chain_p && drap_regno != AX_REG) - regno = AX_REG; - else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG) - regno = DX_REG; - /* ecx is the static chain register. */ - else if (regparm < 3 && !fastcall_p && !thiscall_p - && !static_chain_p - && drap_regno != CX_REG) - regno = CX_REG; - else if (ix86_save_reg (BX_REG, true, false)) - regno = BX_REG; - /* esi is the static chain register. */ - else if (!(regparm == 3 && static_chain_p) - && ix86_save_reg (SI_REG, true, false)) - regno = SI_REG; - else if (ix86_save_reg (DI_REG, true, false)) - regno = DI_REG; - else + if (frame.nregs) + ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2); + + /* eh_return epilogues need %ecx added to the stack pointer. */ + if (style == 2) { - regno = (drap_regno == AX_REG ? DX_REG : AX_REG); - sr->saved = true; - } - } + rtx sa = EH_RETURN_STACKADJ_RTX; + rtx_insn *insn; - sr->reg = gen_rtx_REG (Pmode, regno); - if (sr->saved) - { - rtx_insn *insn = emit_insn (gen_push (sr->reg)); - RTX_FRAME_RELATED_P (insn) = 1; - } -} + /* %ecx can't be used for both DRAP register and eh_return. 
*/ + if (crtl->drap_reg) + gcc_assert (REGNO (crtl->drap_reg) != CX_REG); -/* Release a scratch register obtained from the preceding function. + /* regparm nested functions don't work with eh_return. */ + gcc_assert (!ix86_static_chain_on_stack); - If RELEASE_VIA_POP is true, we just pop the register off the stack - to release it. This is what non-Linux systems use with -fstack-check. + if (frame_pointer_needed) + { + t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa); + t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD); + emit_insn (gen_rtx_SET (sa, t)); - Otherwise we use OFFSET to locate the saved register and the - allocated stack space becomes part of the local frame and is - deallocated by the epilogue. */ + t = gen_frame_mem (Pmode, hard_frame_pointer_rtx); + insn = emit_move_insn (hard_frame_pointer_rtx, t); -static void -release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset, - bool release_via_pop) -{ - if (sr->saved) + /* Note that we use SA as a temporary CFA, as the return + address is at the proper place relative to it. We + pretend this happens at the FP restore insn because + prior to this insn the FP would be stored at the wrong + offset relative to SA, and after this insn we have no + other reasonable register to use for the CFA. We don't + bother resetting the CFA to the SP for the duration of + the return insn, unless the control flow instrumentation + is done. In this case the SP is used later and we have + to reset CFA to SP. */ + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, sa, UNITS_PER_WORD)); + ix86_add_queued_cfa_restore_notes (insn); + add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx); + RTX_FRAME_RELATED_P (insn) = 1; + + m->fs.cfa_reg = sa; + m->fs.cfa_offset = UNITS_PER_WORD; + m->fs.fp_valid = false; + + pro_epilogue_adjust_stack (stack_pointer_rtx, sa, + const0_rtx, style, + flag_cf_protection); + } + else + { + t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa); + t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD); + insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t)); + ix86_add_queued_cfa_restore_notes (insn); + + gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); + if (m->fs.cfa_offset != UNITS_PER_WORD) + { + m->fs.cfa_offset = UNITS_PER_WORD; + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_pointer_rtx, + UNITS_PER_WORD)); + RTX_FRAME_RELATED_P (insn) = 1; + } + } + m->fs.sp_offset = UNITS_PER_WORD; + m->fs.sp_valid = true; + m->fs.sp_realigned = false; + } + } + else { - if (release_via_pop) + /* SEH requires that the function end with (1) a stack adjustment + if necessary, (2) a sequence of pops, and (3) a return or + jump instruction. Prevent insns from the function body from + being scheduled into this sequence. */ + if (TARGET_SEH) { - struct machine_function *m = cfun->machine; - rtx x, insn = emit_insn (gen_pop (sr->reg)); + /* Prevent a catch region from being adjacent to the standard + epilogue sequence. Unfortunately neither crtl->uses_eh_lsda + nor several other flags that would be interesting to test are + set up yet. */ + if (flag_non_call_exceptions) + emit_insn (gen_nops (const1_rtx)); + else + emit_insn (gen_blockage ()); + } - /* The RX FRAME_RELATED_P mechanism doesn't know about pop. 
*/ - RTX_FRAME_RELATED_P (insn) = 1; - x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD)); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_FRAME_RELATED_EXPR, x); - m->fs.sp_offset -= UNITS_PER_WORD; + /* First step is to deallocate the stack frame so that we can + pop the registers. If the stack pointer was realigned, it needs + to be restored now. Also do it on SEH target for very large + frame as the emitted instructions aren't allowed by the ABI + in epilogues. */ + if (!m->fs.sp_valid || m->fs.sp_realigned + || (TARGET_SEH + && (m->fs.sp_offset - reg_save_offset + >= SEH_MAX_FRAME_SIZE))) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, + GEN_INT (m->fs.fp_offset + - reg_save_offset), + style, false); } - else + else if (m->fs.sp_offset != reg_save_offset) { - rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset)); - x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x)); - emit_insn (x); + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset + - reg_save_offset), + style, + m->fs.cfa_reg == stack_pointer_rtx); } + + ix86_emit_restore_regs_using_pop (); } -} -/* Emit code to adjust the stack pointer by SIZE bytes while probing it. + /* If we used a stack pointer and haven't already got rid of it, + then do so now. */ + if (m->fs.fp_valid) + { + /* If the stack pointer is valid and pointing at the frame + pointer store address, then we only need a pop. */ + if (sp_valid_at (frame.hfp_save_offset) + && m->fs.sp_offset == frame.hfp_save_offset) + ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); + /* Leave results in shorter dependency chains on CPUs that are + able to grok it fast. */ + else if (TARGET_USE_LEAVE + || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun)) + || !cfun->machine->use_fast_prologue_epilogue) + ix86_emit_leave (NULL); + else + { + pro_epilogue_adjust_stack (stack_pointer_rtx, + hard_frame_pointer_rtx, + const0_rtx, style, !using_drap); + ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); + } + } - This differs from the next routine in that it tries hard to prevent - attacks that jump the stack guard. Thus it is never allowed to allocate - more than PROBE_INTERVAL bytes of stack space without a suitable - probe. + if (using_drap) + { + int param_ptr_offset = UNITS_PER_WORD; + rtx_insn *insn; - INT_REGISTERS_SAVED is true if integer registers have already been - pushed on the stack. */ + gcc_assert (stack_realign_drap); -static void -ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, - const bool int_registers_saved) -{ - struct machine_function *m = cfun->machine; + if (ix86_static_chain_on_stack) + param_ptr_offset += UNITS_PER_WORD; + if (!call_used_or_fixed_reg_p (REGNO (crtl->drap_reg))) + param_ptr_offset += UNITS_PER_WORD; - /* If this function does not statically allocate stack space, then - no probes are needed. 
*/ - if (!size) + insn = emit_insn (gen_rtx_SET + (stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + crtl->drap_reg, + GEN_INT (-param_ptr_offset)))); + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = param_ptr_offset; + m->fs.sp_offset = param_ptr_offset; + m->fs.realigned = false; + + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (param_ptr_offset))); + RTX_FRAME_RELATED_P (insn) = 1; + + if (!call_used_or_fixed_reg_p (REGNO (crtl->drap_reg))) + ix86_emit_restore_reg_using_pop (crtl->drap_reg); + } + + /* At this point the stack pointer must be valid, and we must have + restored all of the registers. We may not have deallocated the + entire stack frame. We've delayed this until now because it may + be possible to merge the local stack deallocation with the + deallocation forced by ix86_static_chain_on_stack. */ + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.sp_realigned); + gcc_assert (!m->fs.fp_valid); + gcc_assert (!m->fs.realigned); + if (m->fs.sp_offset != UNITS_PER_WORD) { - /* However, the allocation of space via pushes for register - saves could be viewed as allocating space, but without the - need to probe. */ - if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed) - dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); - else - dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); - return; + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset - UNITS_PER_WORD), + style, true); } + else + ix86_add_queued_cfa_restore_notes (get_last_insn ()); - /* If we are a noreturn function, then we have to consider the - possibility that we're called via a jump rather than a call. + /* Sibcall epilogues don't want a return instruction. */ + if (style == 0) + { + m->fs = frame_state_save; + return; + } - Thus we don't have the implicit probe generated by saving the - return address into the stack at the call. Thus, the stack - pointer could be anywhere in the guard page. The safe thing - to do is emit a probe now. + if (cfun->machine->func_type != TYPE_NORMAL) + emit_jump_insn (gen_interrupt_return ()); + else if (crtl->args.pops_args && crtl->args.size) + { + rtx popc = GEN_INT (crtl->args.pops_args); - The probe can be avoided if we have already emitted any callee - register saves into the stack or have a frame pointer (which will - have been saved as well). Those saves will function as implicit - probes. + /* i386 can only pop 64K bytes. If asked to pop more, pop return + address, do explicit add, and jump indirectly to the caller. */ - ?!? This should be revamped to work like aarch64 and s390 where - we track the offset from the most recent probe. Normally that - offset would be zero. For a noreturn function we would reset - it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then - we just probe when we cross PROBE_INTERVAL. */ - if (TREE_THIS_VOLATILE (cfun->decl) - && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)) - { - /* We can safely use any register here since we're just going to push - its value and immediately pop it back. But we do try and avoid - argument passing registers so as not to introduce dependencies in - the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */ - rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? 
AX_REG : SI_REG); - rtx_insn *insn_push = emit_insn (gen_push (dummy_reg)); - rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg)); - m->fs.sp_offset -= UNITS_PER_WORD; - if (m->fs.cfa_reg == stack_pointer_rtx) + if (crtl->args.pops_args >= 65536) { - m->fs.cfa_offset -= UNITS_PER_WORD; - rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn_push) = 1; - x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn_pop) = 1; - } - emit_insn (gen_blockage ()); - } + rtx ecx = gen_rtx_REG (SImode, CX_REG); + rtx_insn *insn; - /* If we allocate less than the size of the guard statically, - then no probing is necessary, but we do need to allocate - the stack. */ - if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE))) - { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-size), -1, - m->fs.cfa_reg == stack_pointer_rtx); - dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); - return; - } + /* There is no "pascal" calling convention in any 64bit ABI. */ + gcc_assert (!TARGET_64BIT); - /* We're allocating a large enough stack frame that we need to - emit probes. Either emit them inline or in a loop depending - on the size. */ - HOST_WIDE_INT probe_interval = get_probe_interval (); - if (size <= 4 * probe_interval) - { - HOST_WIDE_INT i; - for (i = probe_interval; i <= size; i += probe_interval) - { - /* Allocate PROBE_INTERVAL bytes. */ - rtx insn - = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-probe_interval), -1, - m->fs.cfa_reg == stack_pointer_rtx); - add_reg_note (insn, REG_STACK_CHECK, const0_rtx); + insn = emit_insn (gen_pop (ecx)); + m->fs.cfa_offset -= UNITS_PER_WORD; + m->fs.sp_offset -= UNITS_PER_WORD; - /* And probe at *sp. */ - emit_stack_probe (stack_pointer_rtx); - emit_insn (gen_blockage ()); - } + rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; - /* We need to allocate space for the residual, but we do not need - to probe the residual. */ - HOST_WIDE_INT residual = (i - probe_interval - size); - if (residual) - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (residual), -1, - m->fs.cfa_reg == stack_pointer_rtx); - dump_stack_clash_frame_info (PROBE_INLINE, residual != 0); + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + popc, -1, true); + emit_jump_insn (gen_simple_return_indirect_internal (ecx)); + } + else + emit_jump_insn (gen_simple_return_pop_internal (popc)); } - else + else if (!m->call_ms2sysv || !restore_stub_is_tail) { - /* We expect the GP registers to be saved when probes are used - as the probing sequences might need a scratch register and - the routine to allocate one assumes the integer registers - have already been saved. */ - gcc_assert (int_registers_saved); - - struct scratch_reg sr; - get_scratch_register_on_entry (&sr); - - /* If we needed to save a register, then account for any space - that was pushed (we are not going to pop the register when - we do the restore). 
*/ - if (sr.saved) - size -= UNITS_PER_WORD; + /* In case of return from EH a simple return cannot be used + as a return address will be compared with a shadow stack + return address. Use indirect jump instead. */ + if (style == 2 && flag_cf_protection) + { + /* Register used in indirect jump must be in word_mode. But + Pmode may not be the same as word_mode for x32. */ + rtx ecx = gen_rtx_REG (word_mode, CX_REG); + rtx_insn *insn; - /* Step 1: round SIZE down to a multiple of the interval. */ - HOST_WIDE_INT rounded_size = size & -probe_interval; + insn = emit_insn (gen_pop (ecx)); + m->fs.cfa_offset -= UNITS_PER_WORD; + m->fs.sp_offset -= UNITS_PER_WORD; - /* Step 2: compute final value of the loop counter. Use lea if - possible. */ - rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size); - rtx insn; - if (address_no_seg_operand (addr, Pmode)) - insn = emit_insn (gen_rtx_SET (sr.reg, addr)); - else - { - emit_move_insn (sr.reg, GEN_INT (-rounded_size)); - insn = emit_insn (gen_rtx_SET (sr.reg, - gen_rtx_PLUS (Pmode, sr.reg, - stack_pointer_rtx))); - } - if (m->fs.cfa_reg == stack_pointer_rtx) - { - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, sr.reg, - m->fs.cfa_offset + rounded_size)); + rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); RTX_FRAME_RELATED_P (insn) = 1; - } - /* Step 3: the loop. */ - rtx size_rtx = GEN_INT (rounded_size); - insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, - size_rtx)); - if (m->fs.cfa_reg == stack_pointer_rtx) - { - m->fs.cfa_offset += rounded_size; - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, stack_pointer_rtx, - m->fs.cfa_offset)); - RTX_FRAME_RELATED_P (insn) = 1; + emit_jump_insn (gen_simple_return_indirect_internal (ecx)); } - m->fs.sp_offset += rounded_size; - emit_insn (gen_blockage ()); - - /* Step 4: adjust SP if we cannot assert at compile-time that SIZE - is equal to ROUNDED_SIZE. */ - - if (size != rounded_size) - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (rounded_size - size), -1, - m->fs.cfa_reg == stack_pointer_rtx); - dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); - - /* This does not deallocate the space reserved for the scratch - register. That will be deallocated in the epilogue. */ - release_scratch_register_on_entry (&sr, size, false); + else + emit_jump_insn (gen_simple_return_internal ()); } - /* Make sure nothing is scheduled before we are done. */ - emit_insn (gen_blockage ()); + /* Restore the state back to the state from the prologue, + so that it's correct for the next epilogue. */ + m->fs = frame_state_save; } -/* Emit code to adjust the stack pointer by SIZE bytes while probing it. - - INT_REGISTERS_SAVED is true if integer registers have already been - pushed on the stack. */ +/* Reset from the function's potential modifications. */ static void -ix86_adjust_stack_and_probe (HOST_WIDE_INT size, - const bool int_registers_saved) +ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED) { - /* We skip the probe for the first interval + a small dope of 4 words and - probe that many bytes past the specified size to maintain a protection - area at the botton of the stack. 
*/ - const int dope = 4 * UNITS_PER_WORD; - rtx size_rtx = GEN_INT (size), last; + if (pic_offset_table_rtx + && !ix86_use_pseudo_pic_reg ()) + SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM); - /* See if we have a constant small number of probes to generate. If so, - that's the easy case. The run-time loop is made up of 9 insns in the - generic case while the compile-time loop is made up of 3+2*(n-1) insns - for n # of intervals. */ - if (size <= 4 * get_probe_interval ()) + if (TARGET_MACHO) { - HOST_WIDE_INT i, adjust; - bool first_probe = true; + rtx_insn *insn = get_last_insn (); + rtx_insn *deleted_debug_label = NULL; - /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for - values of N from 1 until it exceeds SIZE. If only one probe is - needed, this will not generate any code. Then adjust and probe - to PROBE_INTERVAL + SIZE. */ - for (i = get_probe_interval (); i < size; i += get_probe_interval ()) + /* Mach-O doesn't support labels at the end of objects, so if + it looks like we might want one, take special action. + First, collect any sequence of deleted debug labels. */ + while (insn + && NOTE_P (insn) + && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL) { - if (first_probe) - { - adjust = 2 * get_probe_interval () + dope; - first_probe = false; - } - else - adjust = get_probe_interval (); - - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -adjust))); - emit_stack_probe (stack_pointer_rtx); + /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL + notes only, instead set their CODE_LABEL_NUMBER to -1, + otherwise there would be code generation differences + in between -g and -g0. */ + if (NOTE_P (insn) && NOTE_KIND (insn) + == NOTE_INSN_DELETED_DEBUG_LABEL) + deleted_debug_label = insn; + insn = PREV_INSN (insn); } - if (first_probe) - adjust = size + get_probe_interval () + dope; - else - adjust = size + get_probe_interval () - i; - - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -adjust))); - emit_stack_probe (stack_pointer_rtx); + /* If we have: + label: + barrier + then this needs to be detected, so skip past the barrier. */ - /* Adjust back to account for the additional first interval. */ - last = emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - (get_probe_interval () - + dope)))); - } + if (insn && BARRIER_P (insn)) + insn = PREV_INSN (insn); - /* Otherwise, do the same as above, but in a loop. Note that we must be - extra careful with variables wrapping around because we might be at - the very top (or the very bottom) of the address space and we have - to be able to handle this case properly; in particular, we use an - equality test for the loop condition. */ - else - { - /* We expect the GP registers to be saved when probes are used - as the probing sequences might need a scratch register and - the routine to allocate one assumes the integer registers - have already been saved. */ - gcc_assert (int_registers_saved); - - HOST_WIDE_INT rounded_size; - struct scratch_reg sr; - - get_scratch_register_on_entry (&sr); - - /* If we needed to save a register, then account for any space - that was pushed (we are not going to pop the register when - we do the restore). */ - if (sr.saved) - size -= UNITS_PER_WORD; - - /* Step 1: round SIZE to the previous multiple of the interval. */ - - rounded_size = ROUND_DOWN (size, get_probe_interval ()); + /* Up to now we've only seen notes or barriers. 
*/ + if (insn) + { + if (LABEL_P (insn) + || (NOTE_P (insn) + && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) + /* Trailing label. */ + fputs ("\tnop\n", file); + else if (cfun && ! cfun->is_thunk) + { + /* See if we have a completely empty function body, skipping + the special case of the picbase thunk emitted as asm. */ + while (insn && ! INSN_P (insn)) + insn = PREV_INSN (insn); + /* If we don't find any insns, we've got an empty function body; + I.e. completely empty - without a return or branch. This is + taken as the case where a function body has been removed + because it contains an inline __builtin_unreachable(). GCC + declares that reaching __builtin_unreachable() means UB so + we're not obliged to do anything special; however, we want + non-zero-sized function bodies. To meet this, and help the + user out, let's trap the case. */ + if (insn == NULL) + fputs ("\tud2\n", file); + } + } + else if (deleted_debug_label) + for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn)) + if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL) + CODE_LABEL_NUMBER (insn) = -1; + } +} +/* Return a scratch register to use in the split stack prologue. The + split stack prologue is used for -fsplit-stack. It is the first + instructions in the function, even before the regular prologue. + The scratch register can be any caller-saved register which is not + used for parameters or for the static chain. */ - /* Step 2: compute initial and final value of the loop counter. */ +static unsigned int +split_stack_prologue_scratch_regno (void) +{ + if (TARGET_64BIT) + return R11_REG; + else + { + bool is_fastcall, is_thiscall; + int regparm; - /* SP = SP_0 + PROBE_INTERVAL. */ - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - - (get_probe_interval () + dope)))); + is_fastcall = (lookup_attribute ("fastcall", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + != NULL); + is_thiscall = (lookup_attribute ("thiscall", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + != NULL); + regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl); - /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */ - if (rounded_size <= (HOST_WIDE_INT_1 << 31)) - emit_insn (gen_rtx_SET (sr.reg, - plus_constant (Pmode, stack_pointer_rtx, - -rounded_size))); + if (is_fastcall) + { + if (DECL_STATIC_CHAIN (cfun->decl)) + { + sorry ("%<-fsplit-stack%> does not support fastcall with " + "nested function"); + return INVALID_REGNUM; + } + return AX_REG; + } + else if (is_thiscall) + { + if (!DECL_STATIC_CHAIN (cfun->decl)) + return DX_REG; + return AX_REG; + } + else if (regparm < 3) + { + if (!DECL_STATIC_CHAIN (cfun->decl)) + return CX_REG; + else + { + if (regparm >= 2) + { + sorry ("%<-fsplit-stack%> does not support 2 register " + "parameters for a nested function"); + return INVALID_REGNUM; + } + return DX_REG; + } + } else { - emit_move_insn (sr.reg, GEN_INT (-rounded_size)); - emit_insn (gen_rtx_SET (sr.reg, - gen_rtx_PLUS (Pmode, sr.reg, - stack_pointer_rtx))); + /* FIXME: We could make this work by pushing a register + around the addition and comparison. */ + sorry ("%<-fsplit-stack%> does not support 3 register parameters"); + return INVALID_REGNUM; } + } +} +/* A SYMBOL_REF for the function which allocates new stackspace for + -fsplit-stack. */ - /* Step 3: the loop - - do - { - SP = SP + PROBE_INTERVAL - probe at SP - } - while (SP != LAST_ADDR) - - adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for - values of N from 1 until it is equal to ROUNDED_SIZE. 
*/ - - emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx)); +static GTY(()) rtx split_stack_fn; +/* A SYMBOL_REF for the more stack function when using the large + model. */ - /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot - assert at compile-time that SIZE is equal to ROUNDED_SIZE. */ +static GTY(()) rtx split_stack_fn_large; - if (size != rounded_size) - { - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - rounded_size - size))); - emit_stack_probe (stack_pointer_rtx); - } +/* Return location of the stack guard value in the TLS block. */ - /* Adjust back to account for the additional first interval. */ - last = emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - (get_probe_interval () - + dope)))); +rtx +ix86_split_stack_guard (void) +{ + int offset; + addr_space_t as = DEFAULT_TLS_SEG_REG; + rtx r; - /* This does not deallocate the space reserved for the scratch - register. That will be deallocated in the epilogue. */ - release_scratch_register_on_entry (&sr, size, false); - } + gcc_assert (flag_split_stack); - /* Even if the stack pointer isn't the CFA register, we need to correctly - describe the adjustments made to it, in particular differentiate the - frame-related ones from the frame-unrelated ones. */ - if (size > 0) - { - rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); - XVECEXP (expr, 0, 0) - = gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, -size)); - XVECEXP (expr, 0, 1) - = gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - get_probe_interval () + dope + size)); - add_reg_note (last, REG_FRAME_RELATED_EXPR, expr); - RTX_FRAME_RELATED_P (last) = 1; +#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET + offset = TARGET_THREAD_SPLIT_STACK_OFFSET; +#else + gcc_unreachable (); +#endif - cfun->machine->fs.sp_offset += size; - } + r = GEN_INT (offset); + r = gen_const_mem (Pmode, r); + set_mem_addr_space (r, as); - /* Make sure nothing is scheduled before we are done. */ - emit_insn (gen_blockage ()); + return r; } -/* Adjust the stack pointer up to REG while probing it. */ +/* Handle -fsplit-stack. These are the first instructions in the + function, even before the regular prologue. */ -const char * -output_adjust_stack_and_probe (rtx reg) +void +ix86_expand_split_stack_prologue (void) { - static int labelno = 0; - char loop_lab[32]; - rtx xops[2]; - - ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); + HOST_WIDE_INT allocate; + unsigned HOST_WIDE_INT args_size; + rtx_code_label *label; + rtx limit, current, allocate_rtx, call_fusage; + rtx_insn *call_insn; + rtx scratch_reg = NULL_RTX; + rtx_code_label *varargs_label = NULL; + rtx fn; - /* Loop. */ - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + gcc_assert (flag_split_stack && reload_completed); - /* SP = SP + PROBE_INTERVAL. */ - xops[0] = stack_pointer_rtx; - xops[1] = GEN_INT (get_probe_interval ()); - output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); + ix86_finalize_stack_frame_flags (); + struct ix86_frame &frame = cfun->machine->frame; + allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET; - /* Probe at SP. */ - xops[1] = const0_rtx; - output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops); + /* This is the label we will branch to if we have enough stack + space. We expect the basic block reordering pass to reverse this + branch if optimizing, so that we branch in the unlikely case. 
*/ + label = gen_label_rtx (); - /* Test if SP == LAST_ADDR. */ - xops[0] = stack_pointer_rtx; - xops[1] = reg; - output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + /* We need to compare the stack pointer minus the frame size with + the stack boundary in the TCB. The stack boundary always gives + us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we + can compare directly. Otherwise we need to do an addition. */ - /* Branch. */ - fputs ("\tjne\t", asm_out_file); - assemble_name_raw (asm_out_file, loop_lab); - fputc ('\n', asm_out_file); + limit = ix86_split_stack_guard (); - return ""; -} + if (allocate < SPLIT_STACK_AVAILABLE) + current = stack_pointer_rtx; + else + { + unsigned int scratch_regno; + rtx offset; -/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE, - inclusive. These are offsets from the current stack pointer. - - INT_REGISTERS_SAVED is true if integer registers have already been - pushed on the stack. */ + /* We need a scratch register to hold the stack pointer minus + the required frame size. Since this is the very start of the + function, the scratch register can be any caller-saved + register which is not used for parameters. */ + offset = GEN_INT (- allocate); + scratch_regno = split_stack_prologue_scratch_regno (); + if (scratch_regno == INVALID_REGNUM) + return; + scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode)) + { + /* We don't use ix86_gen_add3 in this case because it will + want to split to lea, but when not optimizing the insn + will not be split after this point. */ + emit_insn (gen_rtx_SET (scratch_reg, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + offset))); + } + else + { + emit_move_insn (scratch_reg, offset); + emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg, + stack_pointer_rtx)); + } + current = scratch_reg; + } -static void -ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size, - const bool int_registers_saved) -{ - /* See if we have a constant small number of probes to generate. If so, - that's the easy case. The run-time loop is made up of 6 insns in the - generic case while the compile-time loop is made up of n insns for n # - of intervals. */ - if (size <= 6 * get_probe_interval ()) - { - HOST_WIDE_INT i; + ix86_expand_branch (GEU, current, limit, label); + rtx_insn *jump_insn = get_last_insn (); + JUMP_LABEL (jump_insn) = label; - /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until - it exceeds SIZE. If only one probe is needed, this will not - generate any code. Then probe at FIRST + SIZE. */ - for (i = get_probe_interval (); i < size; i += get_probe_interval ()) - emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, - -(first + i))); + /* Mark the jump as very likely to be taken. */ + add_reg_br_prob_note (jump_insn, profile_probability::very_likely ()); - emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, - -(first + size))); + if (split_stack_fn == NULL_RTX) + { + split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); + SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL; } + fn = split_stack_fn; - /* Otherwise, do the same as above, but in a loop. Note that we must be - extra careful with variables wrapping around because we might be at - the very top (or the very bottom) of the address space and we have - to be able to handle this case properly; in particular, we use an - equality test for the loop condition. */ - else + /* Get more stack space. 
We pass in the desired stack space and the + size of the arguments to copy to the new stack. In 32-bit mode + we push the parameters; __morestack will return on a new stack + anyhow. In 64-bit mode we pass the parameters in r10 and + r11. */ + allocate_rtx = GEN_INT (allocate); + args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0; + call_fusage = NULL_RTX; + rtx pop = NULL_RTX; + if (TARGET_64BIT) { - /* We expect the GP registers to be saved when probes are used - as the probing sequences might need a scratch register and - the routine to allocate one assumes the integer registers - have already been saved. */ - gcc_assert (int_registers_saved); + rtx reg10, reg11; - HOST_WIDE_INT rounded_size, last; - struct scratch_reg sr; + reg10 = gen_rtx_REG (Pmode, R10_REG); + reg11 = gen_rtx_REG (Pmode, R11_REG); - get_scratch_register_on_entry (&sr); + /* If this function uses a static chain, it will be in %r10. + Preserve it across the call to __morestack. */ + if (DECL_STATIC_CHAIN (cfun->decl)) + { + rtx rax; + rax = gen_rtx_REG (word_mode, AX_REG); + emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG)); + use_reg (&call_fusage, rax); + } - /* Step 1: round SIZE to the previous multiple of the interval. */ + if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) + && !TARGET_PECOFF) + { + HOST_WIDE_INT argval; - rounded_size = ROUND_DOWN (size, get_probe_interval ()); + gcc_assert (Pmode == DImode); + /* When using the large model we need to load the address + into a register, and we've run out of registers. So we + switch to a different calling convention, and we call a + different function: __morestack_large. We pass the + argument size in the upper 32 bits of r10 and pass the + frame size in the lower 32 bits. */ + gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate); + gcc_assert ((args_size & 0xffffffff) == args_size); + if (split_stack_fn_large == NULL_RTX) + { + split_stack_fn_large + = gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model"); + SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL; + } + if (ix86_cmodel == CM_LARGE_PIC) + { + rtx_code_label *label; + rtx x; - /* Step 2: compute initial and final value of the loop counter. */ + label = gen_label_rtx (); + emit_label (label); + LABEL_PRESERVE_P (label) = 1; + emit_insn (gen_set_rip_rex64 (reg10, label)); + emit_insn (gen_set_got_offset_rex64 (reg11, label)); + emit_insn (ix86_gen_add3 (reg10, reg10, reg11)); + x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large), + UNSPEC_GOT); + x = gen_rtx_CONST (Pmode, x); + emit_move_insn (reg11, x); + x = gen_rtx_PLUS (Pmode, reg10, reg11); + x = gen_const_mem (Pmode, x); + emit_move_insn (reg11, x); + } + else + emit_move_insn (reg11, split_stack_fn_large); - /* TEST_OFFSET = FIRST. */ - emit_move_insn (sr.reg, GEN_INT (-first)); + fn = reg11; - /* LAST_OFFSET = FIRST + ROUNDED_SIZE. 
*/ - last = first + rounded_size; + argval = ((args_size << 16) << 16) + allocate; + emit_move_insn (reg10, GEN_INT (argval)); + } + else + { + emit_move_insn (reg10, allocate_rtx); + emit_move_insn (reg11, GEN_INT (args_size)); + use_reg (&call_fusage, reg11); + } + use_reg (&call_fusage, reg10); + } + else + { + rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size))); + add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD)); + insn = emit_insn (gen_push (allocate_rtx)); + add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD)); + pop = GEN_INT (2 * UNITS_PER_WORD); + } + call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn), + GEN_INT (UNITS_PER_WORD), constm1_rtx, + pop, false); + add_function_usage_to (call_insn, call_fusage); + if (!TARGET_64BIT) + add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0)); + /* Indicate that this function can't jump to non-local gotos. */ + make_reg_eh_region_note_nothrow_nononlocal (call_insn); - /* Step 3: the loop + /* In order to make call/return prediction work right, we now need + to execute a return instruction. See + libgcc/config/i386/morestack.S for the details on how this works. - do - { - TEST_ADDR = TEST_ADDR + PROBE_INTERVAL - probe at TEST_ADDR - } - while (TEST_ADDR != LAST_ADDR) + For flow purposes gcc must not see this as a return + instruction--we need control flow to continue at the subsequent + label. Therefore, we use an unspec. */ + gcc_assert (crtl->args.pops_args < 65536); + rtx_insn *ret_insn + = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args))); - probes at FIRST + N * PROBE_INTERVAL for values of N from 1 - until it is equal to ROUNDED_SIZE. */ + if ((flag_cf_protection & CF_BRANCH)) + { + /* Insert ENDBR since __morestack will jump back here via indirect + call. */ + rtx cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, ret_insn); + } - emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last))); + /* If we are in 64-bit mode and this function uses a static chain, + we saved %r10 in %rax before calling _morestack. */ + if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl)) + emit_move_insn (gen_rtx_REG (word_mode, R10_REG), + gen_rtx_REG (word_mode, AX_REG)); + /* If this function calls va_start, we need to store a pointer to + the arguments on the old stack, because they may not have been + all copied to the new stack. At this point the old stack can be + found at the frame pointer value used by __morestack, because + __morestack has set that up before calling back to us. Here we + store that pointer in a scratch register, and in + ix86_expand_prologue we store the scratch register in a stack + slot. */ + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + { + unsigned int scratch_regno; + rtx frame_reg; + int words; - /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time - that SIZE is equal to ROUNDED_SIZE. */ + scratch_regno = split_stack_prologue_scratch_regno (); + scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + frame_reg = gen_rtx_REG (Pmode, BP_REG); - if (size != rounded_size) - emit_stack_probe (plus_constant (Pmode, - gen_rtx_PLUS (Pmode, - stack_pointer_rtx, - sr.reg), - rounded_size - size)); + /* 64-bit: + fp -> old fp value + return address within this function + return address of caller of this function + stack arguments + So we add three words to get to the stack arguments. 
- release_scratch_register_on_entry (&sr, size, true); - } + 32-bit: + fp -> old fp value + return address within this function + first argument to __morestack + second argument to __morestack + return address of caller of this function + stack arguments + So we add five words to get to the stack arguments. + */ + words = TARGET_64BIT ? 3 : 5; + emit_insn (gen_rtx_SET (scratch_reg, + gen_rtx_PLUS (Pmode, frame_reg, + GEN_INT (words * UNITS_PER_WORD)))); - /* Make sure nothing is scheduled before we are done. */ - emit_insn (gen_blockage ()); -} + varargs_label = gen_label_rtx (); + emit_jump_insn (gen_jump (varargs_label)); + JUMP_LABEL (get_last_insn ()) = varargs_label; -/* Probe a range of stack addresses from REG to END, inclusive. These are - offsets from the current stack pointer. */ + emit_barrier (); + } -const char * -output_probe_stack_range (rtx reg, rtx end) -{ - static int labelno = 0; - char loop_lab[32]; - rtx xops[3]; - - ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); - - /* Loop. */ - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); - - /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ - xops[0] = reg; - xops[1] = GEN_INT (get_probe_interval ()); - output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); - - /* Probe at TEST_ADDR. */ - xops[0] = stack_pointer_rtx; - xops[1] = reg; - xops[2] = const0_rtx; - output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops); - - /* Test if TEST_ADDR == LAST_ADDR. */ - xops[0] = reg; - xops[1] = end; - output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + emit_label (label); + LABEL_NUSES (label) = 1; - /* Branch. */ - fputs ("\tjne\t", asm_out_file); - assemble_name_raw (asm_out_file, loop_lab); - fputc ('\n', asm_out_file); + /* If this function calls va_start, we now have to set the scratch + register for the case where we do not call __morestack. In this + case we need to set it based on the stack pointer. */ + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + { + emit_insn (gen_rtx_SET (scratch_reg, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (UNITS_PER_WORD)))); - return ""; + emit_label (varargs_label); + LABEL_NUSES (varargs_label) = 1; + } } -/* Return true if stack frame is required. Update STACK_ALIGNMENT - to the largest alignment, in bits, of stack slot used if stack - frame is required and CHECK_STACK_SLOT is true. */ +/* We may have to tell the dataflow pass that the split stack prologue + is initializing a scratch register. */ -static bool -ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, - bool check_stack_slot) +static void +ix86_live_on_entry (bitmap regs) { - HARD_REG_SET set_up_by_prologue, prologue_used; - basic_block bb; - - CLEAR_HARD_REG_SET (prologue_used); - CLEAR_HARD_REG_SET (set_up_by_prologue); - add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM); - add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM); - add_to_hard_reg_set (&set_up_by_prologue, Pmode, - HARD_FRAME_POINTER_REGNUM); - - /* The preferred stack alignment is the minimum stack alignment. 
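
The probing code above (ix86_emit_probe_stack_range / output_probe_stack_range) touches the stack one probe interval at a time so the OS can grow or fault the stack predictably. A schematic plain-C model of the constant-size path, for illustration only; the toy_* name and the plain store standing in for the emitted "or $0, (%reg)" probe are hypothetical.

#include <stddef.h>

void
toy_probe_stack_range (volatile char *sp, size_t first, size_t size,
                       size_t probe_interval)
{
  size_t i;

  /* Probe at FIRST + N * PROBE_INTERVAL for N = 1, 2, ... while still
     below SIZE, then probe once at FIRST + SIZE.  The stack grows
     downwards, hence the negative offsets from the stack pointer.  */
  for (i = probe_interval; i < size; i += probe_interval)
    sp[-(ptrdiff_t) (first + i)] = 0;

  sp[-(ptrdiff_t) (first + size)] = 0;
}
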
*/ - if (stack_alignment > crtl->preferred_stack_boundary) - stack_alignment = crtl->preferred_stack_boundary; - - bool require_stack_frame = false; - - FOR_EACH_BB_FN (bb, cfun) + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) { - rtx_insn *insn; - FOR_BB_INSNS (bb, insn) - if (NONDEBUG_INSN_P (insn) - && requires_stack_frame_p (insn, prologue_used, - set_up_by_prologue)) - { - require_stack_frame = true; - - if (check_stack_slot) - { - /* Find the maximum stack alignment. */ - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL) - if (MEM_P (*iter) - && (reg_mentioned_p (stack_pointer_rtx, - *iter) - || reg_mentioned_p (frame_pointer_rtx, - *iter))) - { - unsigned int alignment = MEM_ALIGN (*iter); - if (alignment > stack_alignment) - stack_alignment = alignment; - } - } - } + gcc_assert (flag_split_stack); + bitmap_set_bit (regs, split_stack_prologue_scratch_regno ()); } - - return require_stack_frame; } + +/* Extract the parts of an RTL expression that is a valid memory address + for an instruction. Return 0 if the structure of the address is + grossly off. Return -1 if the address contains ASHIFT, so it is not + strictly valid, but still used for computing length of lea instruction. */ -/* Finalize stack_realign_needed and frame_pointer_needed flags, which - will guide prologue/epilogue to be generated in correct form. */ - -static void -ix86_finalize_stack_frame_flags (void) +int +ix86_decompose_address (rtx addr, struct ix86_address *out) { - /* Check if stack realign is really needed after reload, and - stores result in cfun */ - unsigned int incoming_stack_boundary - = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary - ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary); - unsigned int stack_alignment - = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor - ? crtl->max_used_stack_slot_alignment - : crtl->stack_alignment_needed); - unsigned int stack_realign - = (incoming_stack_boundary < stack_alignment); - bool recompute_frame_layout_p = false; + rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; + rtx base_reg, index_reg; + HOST_WIDE_INT scale = 1; + rtx scale_rtx = NULL_RTX; + rtx tmp; + int retval = 1; + addr_space_t seg = ADDR_SPACE_GENERIC; - if (crtl->stack_realign_finalized) + /* Allow zero-extended SImode addresses, + they will be emitted with addr32 prefix. */ + if (TARGET_64BIT && GET_MODE (addr) == DImode) { - /* After stack_realign_needed is finalized, we can't no longer - change it. */ - gcc_assert (crtl->stack_realign_needed == stack_realign); - return; + if (GET_CODE (addr) == ZERO_EXTEND + && GET_MODE (XEXP (addr, 0)) == SImode) + { + addr = XEXP (addr, 0); + if (CONST_INT_P (addr)) + return 0; + } + else if (GET_CODE (addr) == AND + && const_32bit_mask (XEXP (addr, 1), DImode)) + { + addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode); + if (addr == NULL_RTX) + return 0; + + if (CONST_INT_P (addr)) + return 0; + } } - /* If the only reason for frame_pointer_needed is that we conservatively - assumed stack realignment might be needed or -fno-omit-frame-pointer - is used, but in the end nothing that needed the stack alignment had - been spilled nor stack access, clear frame_pointer_needed and say we - don't need stack realignment. 
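
The core comparison made by ix86_finalize_stack_frame_flags above, reduced to plain C for illustration: dynamic realignment is needed exactly when the boundary guaranteed on entry is smaller than the alignment the frame requires. The helper name is hypothetical.

#include <stdbool.h>

bool
toy_needs_stack_realign (unsigned int incoming_stack_boundary,
                         unsigned int stack_alignment_needed)
{
  /* Both values are alignments in bits, as in the patch.  */
  return incoming_stack_boundary < stack_alignment_needed;
}
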
*/ - if ((stack_realign || (!flag_omit_frame_pointer && optimize)) - && frame_pointer_needed - && crtl->is_leaf - && crtl->sp_is_unchanging - && !ix86_current_function_calls_tls_descriptor - && !crtl->accesses_prior_frames - && !cfun->calls_alloca - && !crtl->calls_eh_return - /* See ira_setup_eliminable_regset for the rationale. */ - && !(STACK_CHECK_MOVING_SP - && flag_stack_check - && flag_exceptions - && cfun->can_throw_non_call_exceptions) - && !ix86_frame_pointer_required () - && get_frame_size () == 0 - && ix86_nsaved_sseregs () == 0 - && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0) + /* Allow SImode subregs of DImode addresses, + they will be emitted with addr32 prefix. */ + if (TARGET_64BIT && GET_MODE (addr) == SImode) { - if (ix86_find_max_used_stack_alignment (stack_alignment, - stack_realign)) + if (SUBREG_P (addr) + && GET_MODE (SUBREG_REG (addr)) == DImode) { - /* Stack frame is required. If stack alignment needed is less - than incoming stack boundary, don't realign stack. */ - stack_realign = incoming_stack_boundary < stack_alignment; - if (!stack_realign) - { - crtl->max_used_stack_slot_alignment - = incoming_stack_boundary; - crtl->stack_alignment_needed - = incoming_stack_boundary; - /* Also update preferred_stack_boundary for leaf - functions. */ - crtl->preferred_stack_boundary - = incoming_stack_boundary; - } + addr = SUBREG_REG (addr); + if (CONST_INT_P (addr)) + return 0; } + } + + if (REG_P (addr)) + base = addr; + else if (SUBREG_P (addr)) + { + if (REG_P (SUBREG_REG (addr))) + base = addr; else + return 0; + } + else if (GET_CODE (addr) == PLUS) + { + rtx addends[4], op; + int n = 0, i; + + op = addr; + do { - /* If drap has been set, but it actually isn't live at the - start of the function, there is no reason to set it up. */ - if (crtl->drap_reg) + if (n >= 4) + return 0; + addends[n++] = XEXP (op, 1); + op = XEXP (op, 0); + } + while (GET_CODE (op) == PLUS); + if (n >= 4) + return 0; + addends[n] = op; + + for (i = n; i >= 0; --i) + { + op = addends[i]; + switch (GET_CODE (op)) { - basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; - if (! REGNO_REG_SET_P (DF_LR_IN (bb), - REGNO (crtl->drap_reg))) - { - crtl->drap_reg = NULL_RTX; - crtl->need_drap = false; - } - } - else - cfun->machine->no_drap_save_restore = true; + case MULT: + if (index) + return 0; + index = XEXP (op, 0); + scale_rtx = XEXP (op, 1); + break; - frame_pointer_needed = false; - stack_realign = false; - crtl->max_used_stack_slot_alignment = incoming_stack_boundary; - crtl->stack_alignment_needed = incoming_stack_boundary; - crtl->stack_alignment_estimated = incoming_stack_boundary; - if (crtl->preferred_stack_boundary > incoming_stack_boundary) - crtl->preferred_stack_boundary = incoming_stack_boundary; - df_finish_pass (true); - df_scan_alloc (NULL); - df_scan_blocks (); - df_compute_regs_ever_live (true); - df_analyze (); + case ASHIFT: + if (index) + return 0; + index = XEXP (op, 0); + tmp = XEXP (op, 1); + if (!CONST_INT_P (tmp)) + return 0; + scale = INTVAL (tmp); + if ((unsigned HOST_WIDE_INT) scale > 3) + return 0; + scale = 1 << scale; + break; - if (flag_var_tracking) - { - /* Since frame pointer is no longer available, replace it with - stack pointer - UNITS_PER_WORD in debug insns. 
*/ - df_ref ref, next; - for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM); - ref; ref = next) - { - next = DF_REF_NEXT_REG (ref); - if (!DF_REF_INSN_INFO (ref)) - continue; + case ZERO_EXTEND: + op = XEXP (op, 0); + if (GET_CODE (op) != UNSPEC) + return 0; + /* FALLTHRU */ - /* Make sure the next ref is for a different instruction, - so that we're not affected by the rescan. */ - rtx_insn *insn = DF_REF_INSN (ref); - while (next && DF_REF_INSN (next) == insn) - next = DF_REF_NEXT_REG (next); + case UNSPEC: + if (XINT (op, 1) == UNSPEC_TP + && TARGET_TLS_DIRECT_SEG_REFS + && seg == ADDR_SPACE_GENERIC) + seg = DEFAULT_TLS_SEG_REG; + else + return 0; + break; - if (DEBUG_INSN_P (insn)) - { - bool changed = false; - for (; ref != next; ref = DF_REF_NEXT_REG (ref)) - { - rtx *loc = DF_REF_LOC (ref); - if (*loc == hard_frame_pointer_rtx) - { - *loc = plus_constant (Pmode, - stack_pointer_rtx, - -UNITS_PER_WORD); - changed = true; - } - } - if (changed) - df_insn_rescan (insn); - } - } - } + case SUBREG: + if (!REG_P (SUBREG_REG (op))) + return 0; + /* FALLTHRU */ - recompute_frame_layout_p = true; + case REG: + if (!base) + base = op; + else if (!index) + index = op; + else + return 0; + break; + + case CONST: + case CONST_INT: + case SYMBOL_REF: + case LABEL_REF: + if (disp) + return 0; + disp = op; + break; + + default: + return 0; + } } } - else if (crtl->max_used_stack_slot_alignment >= 128) + else if (GET_CODE (addr) == MULT) { - /* We don't need to realign stack. max_used_stack_alignment is - used to decide how stack frame should be aligned. This is - independent of any psABIs nor 32-bit vs 64-bit. It is always - safe to compute max_used_stack_alignment. We compute it only - if 128-bit aligned load/store may be generated on misaligned - stack slot which will lead to segfault. */ - if (ix86_find_max_used_stack_alignment (stack_alignment, true)) - cfun->machine->max_used_stack_alignment - = stack_alignment / BITS_PER_UNIT; + index = XEXP (addr, 0); /* index*scale */ + scale_rtx = XEXP (addr, 1); } + else if (GET_CODE (addr) == ASHIFT) + { + /* We're called for lea too, which implements ashift on occasion. */ + index = XEXP (addr, 0); + tmp = XEXP (addr, 1); + if (!CONST_INT_P (tmp)) + return 0; + scale = INTVAL (tmp); + if ((unsigned HOST_WIDE_INT) scale > 3) + return 0; + scale = 1 << scale; + retval = -1; + } + else + disp = addr; /* displacement */ - if (crtl->stack_realign_needed != stack_realign) - recompute_frame_layout_p = true; - crtl->stack_realign_needed = stack_realign; - crtl->stack_realign_finalized = true; - if (recompute_frame_layout_p) - ix86_compute_frame_layout (); -} - -/* Delete SET_GOT right after entry block if it is allocated to reg. */ - -static void -ix86_elim_entry_set_got (rtx reg) -{ - basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; - rtx_insn *c_insn = BB_HEAD (bb); - if (!NONDEBUG_INSN_P (c_insn)) - c_insn = next_nonnote_nondebug_insn (c_insn); - if (c_insn && NONJUMP_INSN_P (c_insn)) + if (index) { - rtx pat = PATTERN (c_insn); - if (GET_CODE (pat) == PARALLEL) - { - rtx vec = XVECEXP (pat, 0, 0); - if (GET_CODE (vec) == SET - && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT - && REGNO (XEXP (vec, 0)) == REGNO (reg)) - delete_insn (c_insn); - } + if (REG_P (index)) + ; + else if (SUBREG_P (index) + && REG_P (SUBREG_REG (index))) + ; + else + return 0; } -} -static rtx -gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store) -{ - rtx addr, mem; + /* Extract the integral value of scale. 
*/ + if (scale_rtx) + { + if (!CONST_INT_P (scale_rtx)) + return 0; + scale = INTVAL (scale_rtx); + } - if (offset) - addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset)); - mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg); - return gen_rtx_SET (store ? mem : reg, store ? reg : mem); -} + base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base; + index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index; -static inline rtx -gen_frame_load (rtx reg, rtx frame_reg, int offset) -{ - return gen_frame_set (reg, frame_reg, offset, false); -} + /* Avoid useless 0 displacement. */ + if (disp == const0_rtx && (base || index)) + disp = NULL_RTX; -static inline rtx -gen_frame_store (rtx reg, rtx frame_reg, int offset) -{ - return gen_frame_set (reg, frame_reg, offset, true); -} + /* Allow arg pointer and stack pointer as index if there is not scaling. */ + if (base_reg && index_reg && scale == 1 + && (REGNO (index_reg) == ARG_POINTER_REGNUM + || REGNO (index_reg) == FRAME_POINTER_REGNUM + || REGNO (index_reg) == SP_REG)) + { + std::swap (base, index); + std::swap (base_reg, index_reg); + } -static void -ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame) -{ - struct machine_function *m = cfun->machine; - const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS - + m->call_ms2sysv_extra_regs; - rtvec v = rtvec_alloc (ncregs + 1); - unsigned int align, i, vi = 0; - rtx_insn *insn; - rtx sym, addr; - rtx rax = gen_rtx_REG (word_mode, AX_REG); - const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); + /* Special case: %ebp cannot be encoded as a base without a displacement. + Similarly %r13. */ + if (!disp && base_reg + && (REGNO (base_reg) == ARG_POINTER_REGNUM + || REGNO (base_reg) == FRAME_POINTER_REGNUM + || REGNO (base_reg) == BP_REG + || REGNO (base_reg) == R13_REG)) + disp = const0_rtx; - /* AL should only be live with sysv_abi. */ - gcc_assert (!ix86_eax_live_at_start_p ()); - gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset); + /* Special case: on K6, [%esi] makes the instruction vector decoded. + Avoid this by transforming to [%esi+0]. + Reload calls address legitimization without cfun defined, so we need + to test cfun for being non-NULL. */ + if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun) + && base_reg && !index_reg && !disp + && REGNO (base_reg) == SI_REG) + disp = const0_rtx; - /* Setup RAX as the stub's base pointer. We use stack_realign_offset rather - we've actually realigned the stack or not. */ - align = GET_MODE_ALIGNMENT (V4SFmode); - addr = choose_baseaddr (frame.stack_realign_offset - + xlogue.get_stub_ptr_offset (), &align, AX_REG); - gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); + /* Special case: encode reg+reg instead of reg*2. */ + if (!base && index && scale == 2) + base = index, base_reg = index_reg, scale = 1; - emit_insn (gen_rtx_SET (rax, addr)); + /* Special case: scaling cannot be encoded without base or displacement. */ + if (!base && !disp && index && scale != 1) + disp = const0_rtx; - /* Get the stub symbol. */ - sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP - : XLOGUE_STUB_SAVE); - RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + out->base = base; + out->index = index; + out->disp = disp; + out->scale = scale; + out->seg = seg; - for (i = 0; i < ncregs; ++i) - { - const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); - rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? 
V4SFmode : word_mode), - r.regno); - RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset); - } - - gcc_assert (vi == (unsigned)GET_NUM_ELEM (v)); - - insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v)); - RTX_FRAME_RELATED_P (insn) = true; + return retval; } + +/* Return cost of the memory address x. + For i386, it is better to use a complex address than let gcc copy + the address into a reg and make a new pseudo. But not if the address + requires to two regs - that would mean more pseudos with longer + lifetimes. */ +static int +ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) +{ + struct ix86_address parts; + int cost = 1; + int ok = ix86_decompose_address (x, &parts); -/* Expand the prologue into a bunch of separate insns. */ + gcc_assert (ok); -void -ix86_expand_prologue (void) -{ - struct machine_function *m = cfun->machine; - rtx insn, t; - HOST_WIDE_INT allocate; - bool int_registers_saved; - bool sse_registers_saved; - bool save_stub_call_needed; - rtx static_chain = NULL_RTX; + if (parts.base && SUBREG_P (parts.base)) + parts.base = SUBREG_REG (parts.base); + if (parts.index && SUBREG_P (parts.index)) + parts.index = SUBREG_REG (parts.index); - if (ix86_function_naked (current_function_decl)) - return; + /* Attempt to minimize number of registers in the address by increasing + address cost for each used register. We don't increase address cost + for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx" + is not invariant itself it most likely means that base or index is not + invariant. Therefore only "pic_offset_table_rtx" could be hoisted out, + which is not profitable for x86. */ + if (parts.base + && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER) + && (current_pass->type == GIMPLE_PASS + || !pic_offset_table_rtx + || !REG_P (parts.base) + || REGNO (pic_offset_table_rtx) != REGNO (parts.base))) + cost++; - ix86_finalize_stack_frame_flags (); + if (parts.index + && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER) + && (current_pass->type == GIMPLE_PASS + || !pic_offset_table_rtx + || !REG_P (parts.index) + || REGNO (pic_offset_table_rtx) != REGNO (parts.index))) + cost++; - /* DRAP should not coexist with stack_realign_fp */ - gcc_assert (!(crtl->drap_reg && stack_realign_fp)); + /* AMD-K6 don't like addresses with ModR/M set to 00_xxx_100b, + since it's predecode logic can't detect the length of instructions + and it degenerates to vector decoded. Increase cost of such + addresses here. The penalty is minimally 2 cycles. It may be worthwhile + to split such addresses or even refuse such addresses at all. - memset (&m->fs, 0, sizeof (m->fs)); + Following addressing modes are affected: + [base+scale*index] + [scale*index+disp] + [base+index] - /* Initialize CFA state for before the prologue. */ - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET; + The first and last case may be avoidable by explicitly coding the zero in + memory address, but I don't have AMD-K6 machine handy to check this + theory. */ - /* Track SP offset to the CFA. We continue tracking this after we've - swapped the CFA register away from SP. In the case of re-alignment - this is fudged; we're interested to offsets within the local frame. 
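
ix86_decompose_address above splits an RTL address into the x86 effective-address form base + index * scale + displacement (plus an optional segment). A toy model of that form and of two of the normalisation special cases spelled out in the patch; the struct, field layout and helper name are hypothetical, for illustration only.

#include <stdint.h>

struct toy_x86_address
{
  int      base;      /* base register number, or -1 if absent  */
  int      index;     /* index register number, or -1 if absent */
  int      scale;     /* 1, 2, 4 or 8                           */
  int      has_disp;  /* nonzero if a displacement is present   */
  intptr_t disp;      /* the displacement, when has_disp is set */
};

void
toy_normalize_address (struct toy_x86_address *a)
{
  /* From the patch: reg*2 with no base is better encoded as reg+reg.  */
  if (a->base < 0 && a->index >= 0 && a->scale == 2)
    {
      a->base = a->index;
      a->scale = 1;
    }

  /* From the patch: a scaled index cannot be encoded without a base or a
     displacement, so materialise an explicit zero displacement.  */
  if (a->base < 0 && a->index >= 0 && a->scale != 1 && !a->has_disp)
    {
      a->has_disp = 1;
      a->disp = 0;
    }
}
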
*/ - m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; - m->fs.sp_valid = true; - m->fs.sp_realigned = false; + if (TARGET_K6 + && ((!parts.disp && parts.base && parts.index && parts.scale != 1) + || (parts.disp && !parts.base && parts.index && parts.scale != 1) + || (!parts.disp && parts.base && parts.index && parts.scale == 1))) + cost += 10; - const struct ix86_frame &frame = cfun->machine->frame; + return cost; +} + +/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as + this is used for to form addresses to local data when -fPIC is in + use. */ - if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl)) - { - /* We should have already generated an error for any use of - ms_hook on a nested function. */ - gcc_checking_assert (!ix86_static_chain_on_stack); +static bool +darwin_local_data_pic (rtx disp) +{ + return (GET_CODE (disp) == UNSPEC + && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET); +} - /* Check if profiling is active and we shall use profiling before - prologue variant. If so sorry. */ - if (crtl->profile && flag_fentry != 0) - sorry ("ms_hook_prologue attribute isn%'t compatible " - "with %<-mfentry%> for 32-bit"); +/* True if operand X should be loaded from GOT. */ - /* In ix86_asm_output_function_label we emitted: - 8b ff movl.s %edi,%edi - 55 push %ebp - 8b ec movl.s %esp,%ebp +bool +ix86_force_load_from_GOT_p (rtx x) +{ + return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X) + && !TARGET_PECOFF && !TARGET_MACHO + && !flag_pic + && ix86_cmodel != CM_LARGE + && GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_FUNCTION_P (x) + && (!flag_plt + || (SYMBOL_REF_DECL (x) + && lookup_attribute ("noplt", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))) + && !SYMBOL_REF_LOCAL_P (x)); +} - This matches the hookable function prologue in Win32 API - functions in Microsoft Windows XP Service Pack 2 and newer. - Wine uses this to enable Windows apps to hook the Win32 API - functions provided by Wine. +/* Determine if a given RTX is a valid constant. We already know this + satisfies CONSTANT_P. */ - What that means is that we've already set up the frame pointer. */ +static bool +ix86_legitimate_constant_p (machine_mode mode, rtx x) +{ + switch (GET_CODE (x)) + { + case CONST: + x = XEXP (x, 0); - if (frame_pointer_needed - && !(crtl->drap_reg && crtl->stack_realign_needed)) + if (GET_CODE (x) == PLUS) { - rtx push, mov; + if (!CONST_INT_P (XEXP (x, 1))) + return false; + x = XEXP (x, 0); + } - /* We've decided to use the frame pointer already set up. - Describe this to the unwinder by pretending that both - push and mov insns happen right here. + if (TARGET_MACHO && darwin_local_data_pic (x)) + return true; - Putting the unwind info here at the end of the ms_hook - is done so that we can make absolutely certain we get - the required byte sequence at the start of the function, - rather than relying on an assembler that can produce - the exact encoding required. + /* Only some unspecs are valid as "constants". 
*/ + if (GET_CODE (x) == UNSPEC) + switch (XINT (x, 1)) + { + case UNSPEC_GOT: + case UNSPEC_GOTOFF: + case UNSPEC_PLTOFF: + return TARGET_64BIT; + case UNSPEC_TPOFF: + case UNSPEC_NTPOFF: + x = XVECEXP (x, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_DTPOFF: + x = XVECEXP (x, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); + default: + return false; + } - However it does mean (in the unpatched case) that we have - a 1 insn window where the asynchronous unwind info is - incorrect. However, if we placed the unwind info at - its correct location we would have incorrect unwind info - in the patched case. Which is probably all moot since - I don't expect Wine generates dwarf2 unwind info for the - system libraries that use this feature. */ + /* We must have drilled down to a symbol. */ + if (GET_CODE (x) == LABEL_REF) + return true; + if (GET_CODE (x) != SYMBOL_REF) + return false; + /* FALLTHRU */ - insn = emit_insn (gen_blockage ()); + case SYMBOL_REF: + /* TLS symbols are never valid. */ + if (SYMBOL_REF_TLS_MODEL (x)) + return false; - push = gen_push (hard_frame_pointer_rtx); - mov = gen_rtx_SET (hard_frame_pointer_rtx, - stack_pointer_rtx); - RTX_FRAME_RELATED_P (push) = 1; - RTX_FRAME_RELATED_P (mov) = 1; + /* DLLIMPORT symbols are never valid. */ + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && SYMBOL_REF_DLLIMPORT_P (x)) + return false; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov))); +#if TARGET_MACHO + /* mdynamic-no-pic */ + if (MACHO_DYNAMIC_NO_PIC_P) + return machopic_symbol_defined_p (x); +#endif - /* Note that gen_push incremented m->fs.cfa_offset, even - though we didn't emit the push insn here. */ - m->fs.cfa_reg = hard_frame_pointer_rtx; - m->fs.fp_offset = m->fs.cfa_offset; - m->fs.fp_valid = true; - } - else + /* External function address should be loaded + via the GOT slot to avoid PLT. */ + if (ix86_force_load_from_GOT_p (x)) + return false; + + break; + + CASE_CONST_SCALAR_INT: + switch (mode) { - /* The frame pointer is not needed so pop %ebp again. - This leaves us with a pristine state. */ - emit_insn (gen_pop (hard_frame_pointer_rtx)); + case E_TImode: + if (TARGET_64BIT) + return true; + /* FALLTHRU */ + case E_OImode: + case E_XImode: + if (!standard_sse_constant_p (x, mode)) + return false; + default: + break; } + break; + + case CONST_VECTOR: + if (!standard_sse_constant_p (x, mode)) + return false; + + default: + break; } - /* The first insn of a function that accepts its static chain on the - stack is to push the register that would be filled in by a direct - call. This insn will be skipped by the trampoline. */ - else if (ix86_static_chain_on_stack) - { - static_chain = ix86_static_chain (cfun->decl, false); - insn = emit_insn (gen_push (static_chain)); - emit_insn (gen_blockage ()); + /* Otherwise we handle everything else in the move patterns. */ + return true; +} - /* We don't want to interpret this push insn as a register save, - only as a stack adjustment. The real copy of the register as - a save will be done later, if needed. */ - t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); - t = gen_rtx_SET (stack_pointer_rtx, t); - add_reg_note (insn, REG_CFA_ADJUST_CFA, t); - RTX_FRAME_RELATED_P (insn) = 1; - } +/* Determine if it's legal to put X into the constant pool. 
This + is not possible for the address of thread-local symbols, which + is checked above. */ - /* Emit prologue code to adjust stack alignment and setup DRAP, in case - of DRAP is needed and stack realignment is really needed after reload */ - if (stack_realign_drap) +static bool +ix86_cannot_force_const_mem (machine_mode mode, rtx x) +{ + /* We can put any immediate constant in memory. */ + switch (GET_CODE (x)) { - int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; + CASE_CONST_ANY: + return false; - /* Can't use DRAP in interrupt function. */ - if (cfun->machine->func_type != TYPE_NORMAL) - sorry ("Dynamic Realign Argument Pointer (DRAP) not supported " - "in interrupt service routine. This may be worked " - "around by avoiding functions with aggregate return."); + default: + break; + } - /* Only need to push parameter pointer reg if it is caller saved. */ - if (!call_used_regs[REGNO (crtl->drap_reg)]) - { - /* Push arg pointer reg */ - insn = emit_insn (gen_push (crtl->drap_reg)); - RTX_FRAME_RELATED_P (insn) = 1; - } + return !ix86_legitimate_constant_p (mode, x); +} - /* Grab the argument pointer. */ - t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset); - insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t)); - RTX_FRAME_RELATED_P (insn) = 1; - m->fs.cfa_reg = crtl->drap_reg; - m->fs.cfa_offset = 0; +/* Nonzero if the symbol is marked as dllimport, or as stub-variable, + otherwise zero. */ - /* Align the stack. */ - insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, - stack_pointer_rtx, - GEN_INT (-align_bytes))); - RTX_FRAME_RELATED_P (insn) = 1; +static bool +is_imported_p (rtx x) +{ + if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES + || GET_CODE (x) != SYMBOL_REF) + return false; - /* Replicate the return address on the stack so that return - address can be reached via (argp - 1) slot. This is needed - to implement macro RETURN_ADDR_RTX and intrinsic function - expand_builtin_return_addr etc. */ - t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD); - t = gen_frame_mem (word_mode, t); - insn = emit_insn (gen_push (t)); - RTX_FRAME_RELATED_P (insn) = 1; + return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x); +} - /* For the purposes of frame and register save area addressing, - we've started over with a new frame. */ - m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; - m->fs.realigned = true; - if (static_chain) - { - /* Replicate static chain on the stack so that static chain - can be reached via (argp - 2) slot. This is needed for - nested function with stack realignment. */ - insn = emit_insn (gen_push (static_chain)); - RTX_FRAME_RELATED_P (insn) = 1; - } - } +/* Nonzero if the constant value X is a legitimate general operand + when generating PIC code. It is given that flag_pic is on and + that X satisfies CONSTANT_P. */ - int_registers_saved = (frame.nregs == 0); - sse_registers_saved = (frame.nsseregs == 0); - save_stub_call_needed = (m->call_ms2sysv); - gcc_assert (sse_registers_saved || !save_stub_call_needed); +bool +legitimate_pic_operand_p (rtx x) +{ + rtx inner; - if (frame_pointer_needed && !m->fs.fp_valid) + switch (GET_CODE (x)) { - /* Note: AT&T enter does NOT have reversed args. Enter is probably - slower on all targets. Also sdb didn't like it. */ - insn = emit_insn (gen_push (hard_frame_pointer_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; + case CONST: + inner = XEXP (x, 0); + if (GET_CODE (inner) == PLUS + && CONST_INT_P (XEXP (inner, 1))) + inner = XEXP (inner, 0); - /* Push registers now, before setting the frame pointer - on SEH target. 
*/ - if (!int_registers_saved - && TARGET_SEH - && !frame.save_regs_using_mov) - { - ix86_emit_save_regs (); - int_registers_saved = true; - gcc_assert (m->fs.sp_offset == frame.reg_save_offset); - } + /* Only some unspecs are valid as "constants". */ + if (GET_CODE (inner) == UNSPEC) + switch (XINT (inner, 1)) + { + case UNSPEC_GOT: + case UNSPEC_GOTOFF: + case UNSPEC_PLTOFF: + return TARGET_64BIT; + case UNSPEC_TPOFF: + x = XVECEXP (inner, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_MACHOPIC_OFFSET: + return legitimate_pic_address_disp_p (x); + default: + return false; + } + /* FALLTHRU */ - if (m->fs.sp_offset == frame.hard_frame_pointer_offset) - { - insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx); - RTX_FRAME_RELATED_P (insn) = 1; + case SYMBOL_REF: + case LABEL_REF: + return legitimate_pic_address_disp_p (x); - if (m->fs.cfa_reg == stack_pointer_rtx) - m->fs.cfa_reg = hard_frame_pointer_rtx; - m->fs.fp_offset = m->fs.sp_offset; - m->fs.fp_valid = true; - } + default: + return true; } +} - if (!int_registers_saved) - { - /* If saving registers via PUSH, do so now. */ - if (!frame.save_regs_using_mov) - { - ix86_emit_save_regs (); - int_registers_saved = true; - gcc_assert (m->fs.sp_offset == frame.reg_save_offset); - } +/* Determine if a given CONST RTX is a valid memory displacement + in PIC mode. */ - /* When using red zone we may start register saving before allocating - the stack frame saving one cycle of the prologue. However, avoid - doing this if we have to probe the stack; at least on x86_64 the - stack probe can turn into a call that clobbers a red zone location. */ - else if (ix86_using_red_zone () - && (! TARGET_STACK_PROBE - || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) - { - ix86_emit_save_regs_using_mov (frame.reg_save_offset); - int_registers_saved = true; - } - } +bool +legitimate_pic_address_disp_p (rtx disp) +{ + bool saw_plus; - if (stack_realign_fp) + /* In 64bit mode we can allow direct addresses of symbols and labels + when they are not dynamic symbols. */ + if (TARGET_64BIT) { - int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; - gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT); + rtx op0 = disp, op1; - /* Record last valid frame pointer offset. */ - m->fs.sp_realigned_fp_last = frame.reg_save_offset; + switch (GET_CODE (disp)) + { + case LABEL_REF: + return true; - /* The computation of the size of the re-aligned stack frame means - that we must allocate the size of the register save area before - performing the actual alignment. Otherwise we cannot guarantee - that there's enough storage above the realignment point. 
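
The prologue code here (both the DRAP and stack_realign_fp paths) realigns the stack by AND-ing the stack pointer with the negated alignment after allocating the register-save area. The equivalent rounding written as plain C, for illustration only; the helper name is hypothetical and align_bytes is assumed to be a power of two, as it is in the patch.

#include <stdint.h>

uintptr_t
toy_realign_sp (uintptr_t sp, uintptr_t allocate, uintptr_t align_bytes)
{
  /* For a power-of-two alignment, "& -align" and "& ~(align - 1)" are the
     same rounding-down operation.  */
  return (sp - allocate) & ~(align_bytes - 1);
}
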
*/ - allocate = frame.reg_save_offset - m->fs.sp_offset - + frame.stack_realign_allocate; - if (allocate) - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-allocate), -1, false); + case CONST: + if (GET_CODE (XEXP (disp, 0)) != PLUS) + break; + op0 = XEXP (XEXP (disp, 0), 0); + op1 = XEXP (XEXP (disp, 0), 1); + if (!CONST_INT_P (op1)) + break; + if (GET_CODE (op0) == UNSPEC + && (XINT (op0, 1) == UNSPEC_DTPOFF + || XINT (op0, 1) == UNSPEC_NTPOFF) + && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1)) + return true; + if (INTVAL (op1) >= 16*1024*1024 + || INTVAL (op1) < -16*1024*1024) + break; + if (GET_CODE (op0) == LABEL_REF) + return true; + if (GET_CODE (op0) == CONST + && GET_CODE (XEXP (op0, 0)) == UNSPEC + && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL) + return true; + if (GET_CODE (op0) == UNSPEC + && XINT (op0, 1) == UNSPEC_PCREL) + return true; + if (GET_CODE (op0) != SYMBOL_REF) + break; + /* FALLTHRU */ - /* Align the stack. */ - insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, - stack_pointer_rtx, - GEN_INT (-align_bytes))); - m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes); - m->fs.sp_realigned_offset = m->fs.sp_offset - - frame.stack_realign_allocate; - /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset. - Beyond this point, stack access should be done via choose_baseaddr or - by using sp_valid_at and fp_valid_at to determine the correct base - register. Henceforth, any CFA offset should be thought of as logical - and not physical. */ - gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last); - gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset); - m->fs.sp_realigned = true; + case SYMBOL_REF: + /* TLS references should always be enclosed in UNSPEC. + The dllimported symbol needs always to be resolved. */ + if (SYMBOL_REF_TLS_MODEL (op0) + || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0))) + return false; - /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which - is needed to describe where a register is saved using a realigned - stack pointer, so we need to invalidate the stack pointer for that - target. */ - if (TARGET_SEH) - m->fs.sp_valid = false; - - /* If SP offset is non-immediate after allocation of the stack frame, - then emit SSE saves or stub call prior to allocating the rest of the - stack frame. This is less efficient for the out-of-line stub because - we can't combine allocations across the call barrier, but it's better - than using a scratch register. */ - else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset - - m->fs.sp_realigned_offset), - Pmode)) - { - if (!sse_registers_saved) - { - ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); - sse_registers_saved = true; - } - else if (save_stub_call_needed) + if (TARGET_PECOFF) { - ix86_emit_outlined_ms2sysv_save (frame); - save_stub_call_needed = false; + if (is_imported_p (op0)) + return true; + + if (SYMBOL_REF_FAR_ADDR_P (op0) + || !SYMBOL_REF_LOCAL_P (op0)) + break; + + /* Function-symbols need to be resolved only for + large-model. + For the small-model we don't need to resolve anything + here. */ + if ((ix86_cmodel != CM_LARGE_PIC + && SYMBOL_REF_FUNCTION_P (op0)) + || ix86_cmodel == CM_SMALL_PIC) + return true; + /* Non-external symbols don't need to be resolved for + large, and medium-model. 
*/ + if ((ix86_cmodel == CM_LARGE_PIC + || ix86_cmodel == CM_MEDIUM_PIC) + && !SYMBOL_REF_EXTERNAL_P (op0)) + return true; } + else if (!SYMBOL_REF_FAR_ADDR_P (op0) + && (SYMBOL_REF_LOCAL_P (op0) + || (HAVE_LD_PIE_COPYRELOC + && flag_pie + && !SYMBOL_REF_WEAK (op0) + && !SYMBOL_REF_FUNCTION_P (op0))) + && ix86_cmodel != CM_LARGE_PIC) + return true; + break; + + default: + break; } } + if (GET_CODE (disp) != CONST) + return false; + disp = XEXP (disp, 0); - allocate = frame.stack_pointer_offset - m->fs.sp_offset; - - if (flag_stack_usage_info) + if (TARGET_64BIT) { - /* We start to count from ARG_POINTER. */ - HOST_WIDE_INT stack_size = frame.stack_pointer_offset; + /* We are unsafe to allow PLUS expressions. This limit allowed distance + of GOT tables. We should not need these anyway. */ + if (GET_CODE (disp) != UNSPEC + || (XINT (disp, 1) != UNSPEC_GOTPCREL + && XINT (disp, 1) != UNSPEC_GOTOFF + && XINT (disp, 1) != UNSPEC_PCREL + && XINT (disp, 1) != UNSPEC_PLTOFF)) + return false; - /* If it was realigned, take into account the fake frame. */ - if (stack_realign_drap) - { - if (ix86_static_chain_on_stack) - stack_size += UNITS_PER_WORD; + if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF + && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF) + return false; + return true; + } - if (!call_used_regs[REGNO (crtl->drap_reg)]) - stack_size += UNITS_PER_WORD; + saw_plus = false; + if (GET_CODE (disp) == PLUS) + { + if (!CONST_INT_P (XEXP (disp, 1))) + return false; + disp = XEXP (disp, 0); + saw_plus = true; + } - /* This over-estimates by 1 minimal-stack-alignment-unit but - mitigates that by counting in the new return address slot. */ - current_function_dynamic_stack_size - += crtl->stack_alignment_needed / BITS_PER_UNIT; - } + if (TARGET_MACHO && darwin_local_data_pic (disp)) + return true; - current_function_static_stack_size = stack_size; - } + if (GET_CODE (disp) != UNSPEC) + return false; - /* On SEH target with very large frame size, allocate an area to save - SSE registers (as the very large allocation won't be described). */ - if (TARGET_SEH - && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE - && !sse_registers_saved) + switch (XINT (disp, 1)) { - HOST_WIDE_INT sse_size - = frame.sse_reg_save_offset - frame.reg_save_offset; + case UNSPEC_GOT: + if (saw_plus) + return false; + /* We need to check for both symbols and labels because VxWorks loads + text labels with @GOT rather than @GOTOFF. See gotoff_operand for + details. */ + return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF + || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); + case UNSPEC_GOTOFF: + /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. + While ABI specify also 32bit relocation but we don't produce it in + small PIC model at all. 
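
In 64-bit mode, legitimate_pic_address_disp_p above accepts a plain symbol-plus-constant displacement only when the constant stays within roughly a +/-16 MiB window. That window written out as a plain-C predicate, for illustration only; the helper name is hypothetical.

#include <stdbool.h>
#include <stdint.h>

bool
toy_pic_symbol_offset_ok (int64_t offset)
{
  return offset >= -16 * 1024 * 1024 && offset < 16 * 1024 * 1024;
}
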
*/ + if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF + || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) + && !TARGET_64BIT) + return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode); + return false; + case UNSPEC_GOTTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_INDNTPOFF: + if (saw_plus) + return false; + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC); + case UNSPEC_NTPOFF: + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_DTPOFF: + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); + } - gcc_assert (int_registers_saved); + return false; +} - /* No need to do stack checking as the area will be immediately - written. */ - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-sse_size), -1, - m->fs.cfa_reg == stack_pointer_rtx); - allocate -= sse_size; - ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); - sse_registers_saved = true; - } +/* Determine if op is suitable RTX for an address register. + Return naked register if a register or a register subreg is + found, otherwise return NULL_RTX. */ - /* The stack has already been decremented by the instruction calling us - so probe if the size is non-negative to preserve the protection area. */ - if (allocate >= 0 - && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK - || flag_stack_clash_protection)) +static rtx +ix86_validate_address_register (rtx op) +{ + machine_mode mode = GET_MODE (op); + + /* Only SImode or DImode registers can form the address. */ + if (mode != SImode && mode != DImode) + return NULL_RTX; + + if (REG_P (op)) + return op; + else if (SUBREG_P (op)) { - if (flag_stack_clash_protection) - { - ix86_adjust_stack_and_probe_stack_clash (allocate, - int_registers_saved); - allocate = 0; - } - else if (STACK_CHECK_MOVING_SP) - { - if (!(crtl->is_leaf && !cfun->calls_alloca - && allocate <= get_probe_interval ())) - { - ix86_adjust_stack_and_probe (allocate, int_registers_saved); - allocate = 0; - } - } - else - { - HOST_WIDE_INT size = allocate; + rtx reg = SUBREG_REG (op); - if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000)) - size = 0x80000000 - get_stack_check_protect () - 1; + if (!REG_P (reg)) + return NULL_RTX; - if (TARGET_STACK_PROBE) - { - if (crtl->is_leaf && !cfun->calls_alloca) - { - if (size > get_probe_interval ()) - ix86_emit_probe_stack_range (0, size, int_registers_saved); - } - else - ix86_emit_probe_stack_range (0, - size + get_stack_check_protect (), - int_registers_saved); - } - else - { - if (crtl->is_leaf && !cfun->calls_alloca) - { - if (size > get_probe_interval () - && size > get_stack_check_protect ()) - ix86_emit_probe_stack_range (get_stack_check_protect (), - (size - - get_stack_check_protect ()), - int_registers_saved); - } - else - ix86_emit_probe_stack_range (get_stack_check_protect (), size, - int_registers_saved); - } - } - } + mode = GET_MODE (reg); - if (allocate == 0) - ; - else if (!ix86_target_stack_probe () - || frame.stack_pointer_offset < CHECK_STACK_LIMIT) - { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-allocate), -1, - m->fs.cfa_reg == stack_pointer_rtx); - } - else - { - rtx eax = gen_rtx_REG (Pmode, AX_REG); - rtx r10 = NULL; - rtx (*adjust_stack_insn)(rtx, rtx, rtx); - const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); - bool 
eax_live = ix86_eax_live_at_start_p (); - bool r10_live = false; + /* Don't allow SUBREGs that span more than a word. It can + lead to spill failures when the register is one word out + of a two word structure. */ + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + return NULL_RTX; - if (TARGET_64BIT) - r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0); + /* Allow only SUBREGs of non-eliminable hard registers. */ + if (register_no_elim_operand (reg, mode)) + return reg; + } - if (eax_live) - { - insn = emit_insn (gen_push (eax)); - allocate -= UNITS_PER_WORD; - /* Note that SEH directives need to continue tracking the stack - pointer even after the frame pointer has been set up. */ - if (sp_is_cfa_reg || TARGET_SEH) - { - if (sp_is_cfa_reg) - m->fs.cfa_offset += UNITS_PER_WORD; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -UNITS_PER_WORD))); - } - } - - if (r10_live) - { - r10 = gen_rtx_REG (Pmode, R10_REG); - insn = emit_insn (gen_push (r10)); - allocate -= UNITS_PER_WORD; - if (sp_is_cfa_reg || TARGET_SEH) - { - if (sp_is_cfa_reg) - m->fs.cfa_offset += UNITS_PER_WORD; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -UNITS_PER_WORD))); - } - } + /* Op is not a register. */ + return NULL_RTX; +} - emit_move_insn (eax, GEN_INT (allocate)); - emit_insn (ix86_gen_allocate_stack_worker (eax, eax)); +/* Recognizes RTL expressions that are valid memory addresses for an + instruction. The MODE argument is the machine mode for the MEM + expression that wants to use this address. - /* Use the fact that AX still contains ALLOCATE. */ - adjust_stack_insn = (Pmode == DImode - ? gen_pro_epilogue_adjust_stack_di_sub - : gen_pro_epilogue_adjust_stack_si_sub); + It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should + convert common non-canonical forms to canonical form so that they will + be recognized. */ - insn = emit_insn (adjust_stack_insn (stack_pointer_rtx, - stack_pointer_rtx, eax)); +static bool +ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) +{ + struct ix86_address parts; + rtx base, index, disp; + HOST_WIDE_INT scale; + addr_space_t seg; - if (sp_is_cfa_reg || TARGET_SEH) - { - if (sp_is_cfa_reg) - m->fs.cfa_offset += allocate; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -allocate))); - } - m->fs.sp_offset += allocate; + if (ix86_decompose_address (addr, &parts) <= 0) + /* Decomposition failed. */ + return false; - /* Use stack_pointer_rtx for relative addressing so that code works for - realigned stack. But this means that we need a blockage to prevent - stores based on the frame pointer from being scheduled before. */ - if (r10_live && eax_live) - { - t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); - emit_move_insn (gen_rtx_REG (word_mode, R10_REG), - gen_frame_mem (word_mode, t)); - t = plus_constant (Pmode, t, UNITS_PER_WORD); - emit_move_insn (gen_rtx_REG (word_mode, AX_REG), - gen_frame_mem (word_mode, t)); - emit_insn (gen_memory_blockage ()); - } - else if (eax_live || r10_live) - { - t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); - emit_move_insn (gen_rtx_REG (word_mode, - (eax_live ? 
AX_REG : R10_REG)), - gen_frame_mem (word_mode, t)); - emit_insn (gen_memory_blockage ()); - } - } - gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset); + base = parts.base; + index = parts.index; + disp = parts.disp; + scale = parts.scale; + seg = parts.seg; - /* If we havn't already set up the frame pointer, do so now. */ - if (frame_pointer_needed && !m->fs.fp_valid) + /* Validate base register. */ + if (base) { - insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx, - GEN_INT (frame.stack_pointer_offset - - frame.hard_frame_pointer_offset)); - insn = emit_insn (insn); - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL); - - if (m->fs.cfa_reg == stack_pointer_rtx) - m->fs.cfa_reg = hard_frame_pointer_rtx; - m->fs.fp_offset = frame.hard_frame_pointer_offset; - m->fs.fp_valid = true; - } + rtx reg = ix86_validate_address_register (base); - if (!int_registers_saved) - ix86_emit_save_regs_using_mov (frame.reg_save_offset); - if (!sse_registers_saved) - ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); - else if (save_stub_call_needed) - ix86_emit_outlined_ms2sysv_save (frame); + if (reg == NULL_RTX) + return false; - /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT - in PROLOGUE. */ - if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry) - { - rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM); - insn = emit_insn (gen_set_got (pic)); - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); - emit_insn (gen_prologue_use (pic)); - /* Deleting already emmitted SET_GOT if exist and allocated to - REAL_PIC_OFFSET_TABLE_REGNUM. */ - ix86_elim_entry_set_got (pic); + if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg)) + || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg))) + /* Base is not valid. */ + return false; } - if (crtl->drap_reg && !crtl->stack_realign_needed) + /* Validate index register. */ + if (index) { - /* vDRAP is setup but after reload it turns out stack realign - isn't necessary, here we will emit prologue to setup DRAP - without stack realign adjustment */ - t = choose_baseaddr (0, NULL); - emit_insn (gen_rtx_SET (crtl->drap_reg, t)); - } - - /* Prevent instructions from being scheduled into register save push - sequence when access to the redzone area is done through frame pointer. - The offset between the frame pointer and the stack pointer is calculated - relative to the value of the stack pointer at the end of the function - prologue, and moving instructions that access redzone area via frame - pointer inside push sequence violates this assumption. */ - if (frame_pointer_needed && frame.red_zone_size) - emit_insn (gen_memory_blockage ()); + rtx reg = ix86_validate_address_register (index); - /* SEH requires that the prologue end within 256 bytes of the start of - the function. Prevent instruction schedules that would extend that. - Further, prevent alloca modifications to the stack pointer from being - combined with prologue modifications. */ - if (TARGET_SEH) - emit_insn (gen_prologue_use (stack_pointer_rtx)); -} + if (reg == NULL_RTX) + return false; -/* Emit code to restore REG using a POP insn. */ + if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg)) + || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg))) + /* Index is not valid. 
*/ + return false; + } -static void -ix86_emit_restore_reg_using_pop (rtx reg) -{ - struct machine_function *m = cfun->machine; - rtx_insn *insn = emit_insn (gen_pop (reg)); + /* Index and base should have the same mode. */ + if (base && index + && GET_MODE (base) != GET_MODE (index)) + return false; - ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); - m->fs.sp_offset -= UNITS_PER_WORD; + /* Address override works only on the (%reg) part of %fs:(%reg). */ + if (seg != ADDR_SPACE_GENERIC + && ((base && GET_MODE (base) != word_mode) + || (index && GET_MODE (index) != word_mode))) + return false; - if (m->fs.cfa_reg == crtl->drap_reg - && REGNO (reg) == REGNO (crtl->drap_reg)) + /* Validate scale factor. */ + if (scale != 1) { - /* Previously we'd represented the CFA as an expression - like *(%ebp - 8). We've just popped that value from - the stack, which means we need to reset the CFA to - the drap register. This will remain until we restore - the stack pointer. */ - add_reg_note (insn, REG_CFA_DEF_CFA, reg); - RTX_FRAME_RELATED_P (insn) = 1; + if (!index) + /* Scale without index. */ + return false; - /* This means that the DRAP register is valid for addressing too. */ - m->fs.drap_valid = true; - return; + if (scale != 2 && scale != 4 && scale != 8) + /* Scale is not a valid multiplier. */ + return false; } - if (m->fs.cfa_reg == stack_pointer_rtx) + /* Validate displacement. */ + if (disp) { - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn) = 1; - - m->fs.cfa_offset -= UNITS_PER_WORD; - } + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == UNSPEC + && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET) + switch (XINT (XEXP (disp, 0), 1)) + { + /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit + when used. While ABI specify also 32bit relocations, we + don't produce them at all and use IP relative instead. + Allow GOT in 32bit mode for both PIC and non-PIC if symbol + should be loaded via GOT. */ + case UNSPEC_GOT: + if (!TARGET_64BIT + && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) + goto is_legitimate_pic; + /* FALLTHRU */ + case UNSPEC_GOTOFF: + gcc_assert (flag_pic); + if (!TARGET_64BIT) + goto is_legitimate_pic; - /* When the frame pointer is the CFA, and we pop it, we are - swapping back to the stack pointer as the CFA. This happens - for stack frames that don't allocate other data, so we assume - the stack pointer is now pointing at the return address, i.e. - the function entry state, which makes the offset be 1 word. */ - if (reg == hard_frame_pointer_rtx) - { - m->fs.fp_valid = false; - if (m->fs.cfa_reg == hard_frame_pointer_rtx) - { - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset -= UNITS_PER_WORD; + /* 64bit address unspec. */ + return false; - add_reg_note (insn, REG_CFA_DEF_CFA, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - GEN_INT (m->fs.cfa_offset))); - RTX_FRAME_RELATED_P (insn) = 1; - } - } -} - -/* Emit code to restore saved registers using POP insns. 
*/ + case UNSPEC_GOTPCREL: + if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) + goto is_legitimate_pic; + /* FALLTHRU */ + case UNSPEC_PCREL: + gcc_assert (flag_pic); + goto is_legitimate_pic; -static void -ix86_emit_restore_regs_using_pop (void) -{ - unsigned int regno; + case UNSPEC_GOTTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_INDNTPOFF: + case UNSPEC_NTPOFF: + case UNSPEC_DTPOFF: + break; - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true)) - ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); -} + default: + /* Invalid address unspec. */ + return false; + } -/* Emit code and notes for the LEAVE instruction. If insn is non-null, - omits the emit and only attaches the notes. */ + else if (SYMBOLIC_CONST (disp) + && (flag_pic + || (TARGET_MACHO +#if TARGET_MACHO + && MACHOPIC_INDIRECT + && !machopic_operand_p (disp) +#endif + ))) + { -static void -ix86_emit_leave (rtx_insn *insn) -{ - struct machine_function *m = cfun->machine; - if (!insn) - insn = emit_insn (ix86_gen_leave ()); + is_legitimate_pic: + if (TARGET_64BIT && (index || base)) + { + /* foo@dtpoff(%rX) is ok. */ + if (GET_CODE (disp) != CONST + || GET_CODE (XEXP (disp, 0)) != PLUS + || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC + || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) + || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF + && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) + /* Non-constant pic memory reference. */ + return false; + } + else if ((!TARGET_MACHO || flag_pic) + && ! legitimate_pic_address_disp_p (disp)) + /* Displacement is an invalid pic construct. */ + return false; +#if TARGET_MACHO + else if (MACHO_DYNAMIC_NO_PIC_P + && !ix86_legitimate_constant_p (Pmode, disp)) + /* displacment must be referenced via non_lazy_pointer */ + return false; +#endif - ix86_add_queued_cfa_restore_notes (insn); + /* This code used to verify that a symbolic pic displacement + includes the pic_offset_table_rtx register. - gcc_assert (m->fs.fp_valid); - m->fs.sp_valid = true; - m->fs.sp_realigned = false; - m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD; - m->fs.fp_valid = false; + While this is good idea, unfortunately these constructs may + be created by "adds using lea" optimization for incorrect + code like: - if (m->fs.cfa_reg == hard_frame_pointer_rtx) - { - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset = m->fs.sp_offset; + int a; + int foo(int i) + { + return *(&a+i); + } - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, stack_pointer_rtx, - m->fs.sp_offset)); - RTX_FRAME_RELATED_P (insn) = 1; + This code is nonsensical, but results in addressing + GOT table with pic_offset_table_rtx base. We can't + just refuse it easily, since it gets matched by + "addsi3" pattern, that later gets split to lea in the + case output register differs from input. While this + can be handled by separate addsi pattern for this case + that never results in lea, this seems to be easier and + correct fix for crash to disable this test. */ + } + else if (GET_CODE (disp) != LABEL_REF + && !CONST_INT_P (disp) + && (GET_CODE (disp) != CONST + || !ix86_legitimate_constant_p (Pmode, disp)) + && (GET_CODE (disp) != SYMBOL_REF + || !ix86_legitimate_constant_p (Pmode, disp))) + /* Displacement is not constant. */ + return false; + else if (TARGET_64BIT + && !x86_64_immediate_operand (disp, VOIDmode)) + /* Displacement is out of range. 
*/ + return false; + /* In x32 mode, constant addresses are sign extended to 64bit, so + we have to prevent addresses from 0x80000000 to 0xffffffff. */ + else if (TARGET_X32 && !(index || base) + && CONST_INT_P (disp) + && val_signbit_known_set_p (SImode, INTVAL (disp))) + return false; } - ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, - m->fs.fp_offset); -} - -/* Emit code to restore saved registers using MOV insns. - First register is restored from CFA - CFA_OFFSET. */ -static void -ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, - bool maybe_eh_return) -{ - struct machine_function *m = cfun->machine; - unsigned int regno; - - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) - { - rtx reg = gen_rtx_REG (word_mode, regno); - rtx mem; - rtx_insn *insn; - - mem = choose_baseaddr (cfa_offset, NULL); - mem = gen_frame_mem (word_mode, mem); - insn = emit_move_insn (reg, mem); - if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) - { - /* Previously we'd represented the CFA as an expression - like *(%ebp - 8). We've just popped that value from - the stack, which means we need to reset the CFA to - the drap register. This will remain until we restore - the stack pointer. */ - add_reg_note (insn, REG_CFA_DEF_CFA, reg); - RTX_FRAME_RELATED_P (insn) = 1; + /* Everything looks valid. */ + return true; +} - /* This means that the DRAP register is valid for addressing. */ - m->fs.drap_valid = true; - } - else - ix86_add_cfa_restore_note (NULL, reg, cfa_offset); +/* Determine if a given RTX is a valid constant address. */ - cfa_offset -= UNITS_PER_WORD; - } +bool +constant_address_p (rtx x) +{ + return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1); } + +/* Return a unique alias set for the GOT. */ -/* Emit code to restore saved registers using MOV insns. - First register is restored from CFA - CFA_OFFSET. */ -static void -ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, - bool maybe_eh_return) +alias_set_type +ix86_GOT_alias_set (void) { - unsigned int regno; + static alias_set_type set = -1; + if (set == -1) + set = new_alias_set (); + return set; +} - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) - { - rtx reg = gen_rtx_REG (V4SFmode, regno); - rtx mem; - unsigned int align = GET_MODE_ALIGNMENT (V4SFmode); +/* Return a legitimate reference for ORIG (an address) using the + register REG. If REG is 0, a new pseudo is generated. - mem = choose_baseaddr (cfa_offset, &align); - mem = gen_rtx_MEM (V4SFmode, mem); + There are two types of references that must be handled: - /* The location aligment depends upon the base register. */ - align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align); - gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); - set_mem_align (mem, align); - emit_insn (gen_rtx_SET (reg, mem)); + 1. Global data references must load the address from the GOT, via + the PIC reg. An insn is emitted to do this load, and the reg is + returned. - ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + 2. Static data references, constant pool addresses, and code labels + compute the address as an offset from the GOT, whose base is in + the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to + differentiate them from global data objects. The returned + address is the PIC reg + an unspec constant. 
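The x32 check above exists because a 32-bit constant with the sign bit set changes value once the processor sign-extends it to 64 bits, landing outside the low 4 GiB that x32 pointers live in. A small self-contained demonstration of the arithmetic, not taken from the patch:

#include <stdio.h>
#include <inttypes.h>

int
main (void)
{
  uint32_t disp = 0x80000000u;             /* smallest offending constant */
  int64_t sign_extended = (int32_t) disp;  /* what the hardware does */

  /* Prints 0xffffffff80000000: no longer a low-4GiB address, hence
     the rejection of such displacements above.  */
  printf ("0x%" PRIx64 "\n", (uint64_t) sign_extended);
  return 0;
}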
- cfa_offset -= GET_MODE_SIZE (V4SFmode); - } -} + TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC + reg also appears in the address. */ -static void -ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame, - bool use_call, int style) +rtx +legitimize_pic_address (rtx orig, rtx reg) { - struct machine_function *m = cfun->machine; - const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS - + m->call_ms2sysv_extra_regs; - rtvec v; - unsigned int elems_needed, align, i, vi = 0; - rtx_insn *insn; - rtx sym, tmp; - rtx rsi = gen_rtx_REG (word_mode, SI_REG); - rtx r10 = NULL_RTX; - const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); - HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset (); - HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset; - rtx rsi_frame_load = NULL_RTX; - HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1; - enum xlogue_stub stub; + rtx addr = orig; + rtx new_rtx = orig; - gcc_assert (!m->fs.fp_valid || frame_pointer_needed); +#if TARGET_MACHO + if (TARGET_MACHO && !TARGET_64BIT) + { + if (reg == 0) + reg = gen_reg_rtx (Pmode); + /* Use the generic Mach-O PIC machinery. */ + return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); + } +#endif - /* If using a realigned stack, we should never start with padding. */ - gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ()); + if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES) + { + rtx tmp = legitimize_pe_coff_symbol (addr, true); + if (tmp) + return tmp; + } - /* Setup RSI as the stub's base pointer. */ - align = GET_MODE_ALIGNMENT (V4SFmode); - tmp = choose_baseaddr (rsi_offset, &align, SI_REG); - gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); + if (TARGET_64BIT && legitimate_pic_address_disp_p (addr)) + new_rtx = addr; + else if ((!TARGET_64BIT + || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC) + && !TARGET_PECOFF + && gotoff_operand (addr, Pmode)) + { + /* This symbol may be referenced via a displacement + from the PIC base address (@GOTOFF). */ + if (GET_CODE (addr) == CONST) + addr = XEXP (addr, 0); - emit_insn (gen_rtx_SET (rsi, tmp)); + if (GET_CODE (addr) == PLUS) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), + UNSPEC_GOTOFF); + new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1)); + } + else + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF); - /* Get a symbol for the stub. */ - if (frame_pointer_needed) - stub = use_call ? XLOGUE_STUB_RESTORE_HFP - : XLOGUE_STUB_RESTORE_HFP_TAIL; - else - stub = use_call ? XLOGUE_STUB_RESTORE - : XLOGUE_STUB_RESTORE_TAIL; - sym = xlogue.get_stub_rtx (stub); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); - elems_needed = ncregs; - if (use_call) - elems_needed += 1; - else - elems_needed += frame_pointer_needed ? 5 : 3; - v = rtvec_alloc (elems_needed); + if (TARGET_64BIT) + new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); - /* We call the epilogue stub when we need to pop incoming args or we are - doing a sibling call as the tail. Otherwise, we will emit a jmp to the - epilogue stub and it is the tail-call. 
*/ - if (use_call) - RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); - else - { - RTVEC_ELT (v, vi++) = ret_rtx; - RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); - if (frame_pointer_needed) + if (reg != 0) { - rtx rbp = gen_rtx_REG (DImode, BP_REG); - gcc_assert (m->fs.fp_valid); - gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx); - - tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8)); - RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp); - RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp)); - tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode)); - RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp); - } + gcc_assert (REG_P (reg)); + new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx, + new_rtx, reg, 1, OPTAB_DIRECT); + } else - { - /* If no hard frame pointer, we set R10 to the SP restore value. */ - gcc_assert (!m->fs.fp_valid); - gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); - gcc_assert (m->fs.sp_valid); - - r10 = gen_rtx_REG (DImode, R10_REG); - tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset)); - emit_insn (gen_rtx_SET (r10, tmp)); - - RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10); - } + new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); } - - /* Generate frame load insns and restore notes. */ - for (i = 0; i < ncregs; ++i) + else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) + /* We can't use @GOTOFF for text labels + on VxWorks, see gotoff_operand. */ + || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) { - const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); - machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode; - rtx reg, frame_load; - - reg = gen_rtx_REG (mode, r.regno); - frame_load = gen_frame_load (reg, rsi, r.offset); + rtx tmp = legitimize_pe_coff_symbol (addr, true); + if (tmp) + return tmp; - /* Save RSI frame load insn & note to add last. */ - if (r.regno == SI_REG) + /* For x64 PE-COFF there is no GOT table, + so we use address directly. */ + if (TARGET_64BIT && TARGET_PECOFF) { - gcc_assert (!rsi_frame_load); - rsi_frame_load = frame_load; - rsi_restore_offset = r.offset; + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + } + else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), + UNSPEC_GOTPCREL); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + new_rtx = gen_const_mem (Pmode, new_rtx); + set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); } else { - RTVEC_ELT (v, vi++) = frame_load; - ix86_add_cfa_restore_note (NULL, reg, r.offset); + /* This symbol must be referenced via a load + from the Global Offset Table (@GOT). */ + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + if (TARGET_64BIT) + new_rtx = force_reg (Pmode, new_rtx); + new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + new_rtx = gen_const_mem (Pmode, new_rtx); + set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); } - } - - /* Add RSI frame load & restore note at the end. */ - gcc_assert (rsi_frame_load); - gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1); - RTVEC_ELT (v, vi++) = rsi_frame_load; - ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG), - rsi_restore_offset); - - /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. 
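As a rough user-level illustration of the two reference kinds described in the legitimize_pic_address comment above, the snippet below (an assumed example, not from the patch) can be compiled with -fPIC and inspected with -S. The extern variable is normally reached through the GOT (for instance @GOTPCREL on x86-64), while the static one is addressed PC-relatively on x86-64 or as an @GOTOFF offset from the PIC register on ia32:

/* pic-example.c: try `gcc -O2 -fPIC -S pic-example.c`.  */
extern int imported;   /* preemptible global: typically a GOT load       */
static int local;      /* local data: PC-relative or @GOTOFF addressing  */

int
sum (void)
{
  return imported + local;
}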
*/ - if (!use_call && !frame_pointer_needed) - { - gcc_assert (m->fs.sp_valid); - gcc_assert (!m->fs.sp_realigned); - /* At this point, R10 should point to frame.stack_realign_offset. */ - if (m->fs.cfa_reg == stack_pointer_rtx) - m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset; - m->fs.sp_offset = frame.stack_realign_offset; + new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); } - - gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v)); - tmp = gen_rtx_PARALLEL (VOIDmode, v); - if (use_call) - insn = emit_insn (tmp); else { - insn = emit_jump_insn (tmp); - JUMP_LABEL (insn) = ret_rtx; - - if (frame_pointer_needed) - ix86_emit_leave (insn); - else + if (CONST_INT_P (addr) + && !x86_64_immediate_operand (addr, VOIDmode)) + new_rtx = copy_to_suggested_reg (addr, reg, Pmode); + else if (GET_CODE (addr) == CONST) { - /* Need CFA adjust note. */ - tmp = gen_rtx_SET (stack_pointer_rtx, r10); - add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp); - } - } + addr = XEXP (addr, 0); - RTX_FRAME_RELATED_P (insn) = true; - ix86_add_queued_cfa_restore_notes (insn); + /* We must match stuff we generate before. Assume the only + unspecs that can get here are ours. Not that we could do + anything with them anyway.... */ + if (GET_CODE (addr) == UNSPEC + || (GET_CODE (addr) == PLUS + && GET_CODE (XEXP (addr, 0)) == UNSPEC)) + return orig; + gcc_assert (GET_CODE (addr) == PLUS); + } - /* If we're not doing a tail-call, we need to adjust the stack. */ - if (use_call && m->fs.sp_valid) - { - HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset; - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (dealloc), style, - m->fs.cfa_reg == stack_pointer_rtx); - } -} + if (GET_CODE (addr) == PLUS) + { + rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1); -/* Restore function stack, frame, and registers. */ + /* Check first to see if this is a constant + offset from a @GOTOFF symbol reference. */ + if (!TARGET_PECOFF + && gotoff_operand (op0, Pmode) + && CONST_INT_P (op1)) + { + if (!TARGET_64BIT) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0), + UNSPEC_GOTOFF); + new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); -void -ix86_expand_epilogue (int style) -{ - struct machine_function *m = cfun->machine; - struct machine_frame_state frame_state_save = m->fs; - bool restore_regs_via_mov; - bool using_drap; - bool restore_stub_is_tail = false; + if (reg != 0) + { + gcc_assert (REG_P (reg)); + new_rtx = expand_simple_binop (Pmode, PLUS, + pic_offset_table_rtx, + new_rtx, reg, 1, + OPTAB_DIRECT); + } + else + new_rtx + = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + } + else + { + if (INTVAL (op1) < -16*1024*1024 + || INTVAL (op1) >= 16*1024*1024) + { + if (!x86_64_immediate_operand (op1, Pmode)) + op1 = force_reg (Pmode, op1); - if (ix86_function_naked (current_function_decl)) - { - /* The program should not reach this point. */ - emit_insn (gen_ud2 ()); - return; - } + new_rtx + = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1); + } + } + } + else + { + rtx base = legitimize_pic_address (op0, reg); + machine_mode mode = GET_MODE (base); + new_rtx + = legitimize_pic_address (op1, base == reg ? 
NULL_RTX : reg); - ix86_finalize_stack_frame_flags (); - const struct ix86_frame &frame = cfun->machine->frame; + if (CONST_INT_P (new_rtx)) + { + if (INTVAL (new_rtx) < -16*1024*1024 + || INTVAL (new_rtx) >= 16*1024*1024) + { + if (!x86_64_immediate_operand (new_rtx, mode)) + new_rtx = force_reg (mode, new_rtx); - m->fs.sp_realigned = stack_realign_fp; - m->fs.sp_valid = stack_realign_fp - || !frame_pointer_needed - || crtl->sp_is_unchanging; - gcc_assert (!m->fs.sp_valid - || m->fs.sp_offset == frame.stack_pointer_offset); + new_rtx + = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx); + } + else + new_rtx = plus_constant (mode, base, INTVAL (new_rtx)); + } + else + { + /* For %rip addressing, we have to use + just disp32, not base nor index. */ + if (TARGET_64BIT + && (GET_CODE (base) == SYMBOL_REF + || GET_CODE (base) == LABEL_REF)) + base = force_reg (mode, base); + if (GET_CODE (new_rtx) == PLUS + && CONSTANT_P (XEXP (new_rtx, 1))) + { + base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0)); + new_rtx = XEXP (new_rtx, 1); + } + new_rtx = gen_rtx_PLUS (mode, base, new_rtx); + } + } + } + } + return new_rtx; +} + +/* Load the thread pointer. If TO_REG is true, force it into a register. */ - /* The FP must be valid if the frame pointer is present. */ - gcc_assert (frame_pointer_needed == m->fs.fp_valid); - gcc_assert (!m->fs.fp_valid - || m->fs.fp_offset == frame.hard_frame_pointer_offset); +static rtx +get_thread_pointer (machine_mode tp_mode, bool to_reg) +{ + rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP); - /* We must have *some* valid pointer to the stack frame. */ - gcc_assert (m->fs.sp_valid || m->fs.fp_valid); + if (GET_MODE (tp) != tp_mode) + { + gcc_assert (GET_MODE (tp) == SImode); + gcc_assert (tp_mode == DImode); - /* The DRAP is never valid at this point. */ - gcc_assert (!m->fs.drap_valid); + tp = gen_rtx_ZERO_EXTEND (tp_mode, tp); + } - /* See the comment about red zone and frame - pointer usage in ix86_expand_prologue. */ - if (frame_pointer_needed && frame.red_zone_size) - emit_insn (gen_memory_blockage ()); + if (to_reg) + tp = copy_to_mode_reg (tp_mode, tp); - using_drap = crtl->drap_reg && crtl->stack_realign_needed; - gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg); + return tp; +} - /* Determine the CFA offset of the end of the red-zone. */ - m->fs.red_zone_offset = 0; - if (ix86_using_red_zone () && crtl->args.pops_args < 65536) +/* Construct the SYMBOL_REF for the tls_get_addr function. */ + +static GTY(()) rtx ix86_tls_symbol; + +static rtx +ix86_tls_get_addr (void) +{ + if (!ix86_tls_symbol) { - /* The red-zone begins below return address and error code in - exception handler. */ - m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET; + const char *sym + = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT) + ? "___tls_get_addr" : "__tls_get_addr"); - /* When the register save area is in the aligned portion of - the stack, determine the maximum runtime displacement that - matches up with the aligned frame. 
*/ - if (stack_realign_drap) - m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT - + UNITS_PER_WORD); + ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym); } - HOST_WIDE_INT reg_save_offset = frame.reg_save_offset; + if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF) + { + rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol), + UNSPEC_PLTOFF); + return gen_rtx_PLUS (Pmode, pic_offset_table_rtx, + gen_rtx_CONST (Pmode, unspec)); + } - /* Special care must be taken for the normal return case of a function - using eh_return: the eax and edx registers are marked as saved, but - not restored along this path. Adjust the save location to match. */ - if (crtl->calls_eh_return && style != 2) - reg_save_offset -= 2 * UNITS_PER_WORD; + return ix86_tls_symbol; +} - /* EH_RETURN requires the use of moves to function properly. */ - if (crtl->calls_eh_return) - restore_regs_via_mov = true; - /* SEH requires the use of pops to identify the epilogue. */ - else if (TARGET_SEH) - restore_regs_via_mov = false; - /* If we're only restoring one register and sp cannot be used then - using a move instruction to restore the register since it's - less work than reloading sp and popping the register. */ - else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) - restore_regs_via_mov = true; - else if (TARGET_EPILOGUE_USING_MOVE - && cfun->machine->use_fast_prologue_epilogue - && (frame.nregs > 1 - || m->fs.sp_offset != reg_save_offset)) - restore_regs_via_mov = true; - else if (frame_pointer_needed - && !frame.nregs - && m->fs.sp_offset != reg_save_offset) - restore_regs_via_mov = true; - else if (frame_pointer_needed - && TARGET_USE_LEAVE - && cfun->machine->use_fast_prologue_epilogue - && frame.nregs == 1) - restore_regs_via_mov = true; - else - restore_regs_via_mov = false; +/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */ - if (restore_regs_via_mov || frame.nsseregs) +static GTY(()) rtx ix86_tls_module_base_symbol; + +rtx +ix86_tls_module_base (void) +{ + if (!ix86_tls_module_base_symbol) { - /* Ensure that the entire register save area is addressable via - the stack pointer, if we will restore SSE regs via sp. */ - if (TARGET_64BIT - && m->fs.sp_offset > 0x7fffffff - && sp_valid_at (frame.stack_realign_offset + 1) - && (frame.nsseregs + frame.nregs) != 0) - { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (m->fs.sp_offset - - frame.sse_reg_save_offset), - style, - m->fs.cfa_reg == stack_pointer_rtx); - } - } + ix86_tls_module_base_symbol + = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_"); - /* If there are any SSE registers to restore, then we have to do it - via moves, since there's obviously no pop for SSE regs. */ - if (frame.nsseregs) - ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset, - style == 2); + SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol) + |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT; + } - if (m->call_ms2sysv) - { - int pop_incoming_args = crtl->args.pops_args && crtl->args.size; + return ix86_tls_module_base_symbol; +} - /* We cannot use a tail-call for the stub if: - 1. We have to pop incoming args, - 2. We have additional int regs to restore, or - 3. A sibling call will be the tail-call, or - 4. We are emitting an eh_return_internal epilogue. +/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is + false if we expect this to be used for a memory address and true if + we expect to load the address into a register. */ - TODO: Item 4 has not yet tested! 
+rtx +legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) +{ + rtx dest, base, off; + rtx pic = NULL_RTX, tp = NULL_RTX; + machine_mode tp_mode = Pmode; + int type; - If any of the above are true, we will call the stub rather than - jump to it. */ - restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1); - ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style); - } + /* Fall back to global dynamic model if tool chain cannot support local + dynamic. */ + if (TARGET_SUN_TLS && !TARGET_64BIT + && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM + && model == TLS_MODEL_LOCAL_DYNAMIC) + model = TLS_MODEL_GLOBAL_DYNAMIC; - /* If using out-of-line stub that is a tail-call, then...*/ - if (m->call_ms2sysv && restore_stub_is_tail) - { - /* TODO: parinoid tests. (remove eventually) */ - gcc_assert (m->fs.sp_valid); - gcc_assert (!m->fs.sp_realigned); - gcc_assert (!m->fs.fp_valid); - gcc_assert (!m->fs.realigned); - gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); - gcc_assert (!crtl->drap_reg); - gcc_assert (!frame.nregs); - } - else if (restore_regs_via_mov) + switch (model) { - rtx t; + case TLS_MODEL_GLOBAL_DYNAMIC: + dest = gen_reg_rtx (Pmode); - if (frame.nregs) - ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2); + if (!TARGET_64BIT) + { + if (flag_pic && !TARGET_PECOFF) + pic = pic_offset_table_rtx; + else + { + pic = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (pic)); + } + } - /* eh_return epilogues need %ecx added to the stack pointer. */ - if (style == 2) + if (TARGET_GNU2_TLS) { - rtx sa = EH_RETURN_STACKADJ_RTX; - rtx_insn *insn; + if (TARGET_64BIT) + emit_insn (gen_tls_dynamic_gnu2_64 (dest, x)); + else + emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic)); - /* %ecx can't be used for both DRAP register and eh_return. */ - if (crtl->drap_reg) - gcc_assert (REGNO (crtl->drap_reg) != CX_REG); + tp = get_thread_pointer (Pmode, true); + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest)); - /* regparm nested functions don't work with eh_return. */ - gcc_assert (!ix86_static_chain_on_stack); + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); - if (frame_pointer_needed) + set_unique_reg_note (get_last_insn (), REG_EQUAL, x); + } + else + { + rtx caddr = ix86_tls_get_addr (); + + if (TARGET_64BIT) { - t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa); - t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD); - emit_insn (gen_rtx_SET (sa, t)); + rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx_insn *insns; - t = gen_frame_mem (Pmode, hard_frame_pointer_rtx); - insn = emit_move_insn (hard_frame_pointer_rtx, t); + start_sequence (); + emit_call_insn + (ix86_gen_tls_global_dynamic_64 (rax, x, caddr)); + insns = get_insns (); + end_sequence (); - /* Note that we use SA as a temporary CFA, as the return - address is at the proper place relative to it. We - pretend this happens at the FP restore insn because - prior to this insn the FP would be stored at the wrong - offset relative to SA, and after this insn we have no - other reasonable register to use for the CFA. We don't - bother resetting the CFA to the SP for the duration of - the return insn, unless the control flow instrumentation - is done. In this case the SP is used later and we have - to reset CFA to SP. 
*/ - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, sa, UNITS_PER_WORD)); - ix86_add_queued_cfa_restore_notes (insn); - add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx); - RTX_FRAME_RELATED_P (insn) = 1; - - m->fs.cfa_reg = sa; - m->fs.cfa_offset = UNITS_PER_WORD; - m->fs.fp_valid = false; + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); - pro_epilogue_adjust_stack (stack_pointer_rtx, sa, - const0_rtx, style, - flag_cf_protection); + RTL_CONST_CALL_P (insns) = 1; + emit_libcall_block (insns, dest, rax, x); } else - { - t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa); - t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD); - insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t)); - ix86_add_queued_cfa_restore_notes (insn); - - gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); - if (m->fs.cfa_offset != UNITS_PER_WORD) - { - m->fs.cfa_offset = UNITS_PER_WORD; - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, stack_pointer_rtx, - UNITS_PER_WORD)); - RTX_FRAME_RELATED_P (insn) = 1; - } - } - m->fs.sp_offset = UNITS_PER_WORD; - m->fs.sp_valid = true; - m->fs.sp_realigned = false; + emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr)); } - } - else - { - /* SEH requires that the function end with (1) a stack adjustment - if necessary, (2) a sequence of pops, and (3) a return or - jump instruction. Prevent insns from the function body from - being scheduled into this sequence. */ - if (TARGET_SEH) + break; + + case TLS_MODEL_LOCAL_DYNAMIC: + base = gen_reg_rtx (Pmode); + + if (!TARGET_64BIT) { - /* Prevent a catch region from being adjacent to the standard - epilogue sequence. Unfortunately neither crtl->uses_eh_lsda - nor several other flags that would be interesting to test are - set up yet. */ - if (flag_non_call_exceptions) - emit_insn (gen_nops (const1_rtx)); + if (flag_pic) + pic = pic_offset_table_rtx; else - emit_insn (gen_blockage ()); + { + pic = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (pic)); + } } - /* First step is to deallocate the stack frame so that we can - pop the registers. If the stack pointer was realigned, it needs - to be restored now. Also do it on SEH target for very large - frame as the emitted instructions aren't allowed by the ABI - in epilogues. */ - if (!m->fs.sp_valid || m->fs.sp_realigned - || (TARGET_SEH - && (m->fs.sp_offset - reg_save_offset - >= SEH_MAX_FRAME_SIZE))) - { - pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, - GEN_INT (m->fs.fp_offset - - reg_save_offset), - style, false); - } - else if (m->fs.sp_offset != reg_save_offset) + if (TARGET_GNU2_TLS) { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (m->fs.sp_offset - - reg_save_offset), - style, - m->fs.cfa_reg == stack_pointer_rtx); - } + rtx tmp = ix86_tls_module_base (); - ix86_emit_restore_regs_using_pop (); - } + if (TARGET_64BIT) + emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp)); + else + emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic)); - /* If we used a stack pointer and haven't already got rid of it, - then do so now. */ - if (m->fs.fp_valid) - { - /* If the stack pointer is valid and pointing at the frame - pointer store address, then we only need a pop. */ - if (sp_valid_at (frame.hfp_save_offset) - && m->fs.sp_offset == frame.hfp_save_offset) - ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); - /* Leave results in shorter dependency chains on CPUs that are - able to grok it fast. 
*/ - else if (TARGET_USE_LEAVE - || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun)) - || !cfun->machine->use_fast_prologue_epilogue) - ix86_emit_leave (NULL); + tp = get_thread_pointer (Pmode, true); + set_unique_reg_note (get_last_insn (), REG_EQUAL, + gen_rtx_MINUS (Pmode, tmp, tp)); + } else - { - pro_epilogue_adjust_stack (stack_pointer_rtx, - hard_frame_pointer_rtx, - const0_rtx, style, !using_drap); - ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); - } - } - - if (using_drap) - { - int param_ptr_offset = UNITS_PER_WORD; - rtx_insn *insn; + { + rtx caddr = ix86_tls_get_addr (); - gcc_assert (stack_realign_drap); + if (TARGET_64BIT) + { + rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx_insn *insns; + rtx eqv; - if (ix86_static_chain_on_stack) - param_ptr_offset += UNITS_PER_WORD; - if (!call_used_regs[REGNO (crtl->drap_reg)]) - param_ptr_offset += UNITS_PER_WORD; + start_sequence (); + emit_call_insn + (ix86_gen_tls_local_dynamic_base_64 (rax, caddr)); + insns = get_insns (); + end_sequence (); - insn = emit_insn (gen_rtx_SET - (stack_pointer_rtx, - gen_rtx_PLUS (Pmode, - crtl->drap_reg, - GEN_INT (-param_ptr_offset)))); - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset = param_ptr_offset; - m->fs.sp_offset = param_ptr_offset; - m->fs.realigned = false; + /* Attach a unique REG_EQUAL, to allow the RTL optimizers to + share the LD_BASE result with other LD model accesses. */ + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_TLS_LD_BASE); - add_reg_note (insn, REG_CFA_DEF_CFA, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - GEN_INT (param_ptr_offset))); - RTX_FRAME_RELATED_P (insn) = 1; + RTL_CONST_CALL_P (insns) = 1; + emit_libcall_block (insns, base, rax, eqv); + } + else + emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr)); + } - if (!call_used_regs[REGNO (crtl->drap_reg)]) - ix86_emit_restore_reg_using_pop (crtl->drap_reg); - } + off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF); + off = gen_rtx_CONST (Pmode, off); - /* At this point the stack pointer must be valid, and we must have - restored all of the registers. We may not have deallocated the - entire stack frame. We've delayed this until now because it may - be possible to merge the local stack deallocation with the - deallocation forced by ix86_static_chain_on_stack. */ - gcc_assert (m->fs.sp_valid); - gcc_assert (!m->fs.sp_realigned); - gcc_assert (!m->fs.fp_valid); - gcc_assert (!m->fs.realigned); - if (m->fs.sp_offset != UNITS_PER_WORD) - { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (m->fs.sp_offset - UNITS_PER_WORD), - style, true); - } - else - ix86_add_queued_cfa_restore_notes (get_last_insn ()); + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off)); - /* Sibcall epilogues don't want a return instruction. */ - if (style == 0) - { - m->fs = frame_state_save; - return; - } + if (TARGET_GNU2_TLS) + { + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); - if (cfun->machine->func_type != TYPE_NORMAL) - emit_jump_insn (gen_interrupt_return ()); - else if (crtl->args.pops_args && crtl->args.size) - { - rtx popc = GEN_INT (crtl->args.pops_args); + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); - /* i386 can only pop 64K bytes. If asked to pop more, pop return - address, do explicit add, and jump indirectly to the caller. 
*/ + set_unique_reg_note (get_last_insn (), REG_EQUAL, x); + } + break; - if (crtl->args.pops_args >= 65536) + case TLS_MODEL_INITIAL_EXEC: + if (TARGET_64BIT) { - rtx ecx = gen_rtx_REG (SImode, CX_REG); - rtx_insn *insn; - - /* There is no "pascal" calling convention in any 64bit ABI. */ - gcc_assert (!TARGET_64BIT); - - insn = emit_insn (gen_pop (ecx)); - m->fs.cfa_offset -= UNITS_PER_WORD; - m->fs.sp_offset -= UNITS_PER_WORD; + if (TARGET_SUN_TLS && !TARGET_X32) + { + /* The Sun linker took the AMD64 TLS spec literally + and can only handle %rax as destination of the + initial executable code sequence. */ - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; + dest = gen_reg_rtx (DImode); + emit_insn (gen_tls_initial_exec_64_sun (dest, x)); + return dest; + } - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - popc, -1, true); - emit_jump_insn (gen_simple_return_indirect_internal (ecx)); + /* Generate DImode references to avoid %fs:(%reg32) + problems and linker IE->LE relaxation bug. */ + tp_mode = DImode; + pic = NULL; + type = UNSPEC_GOTNTPOFF; } - else - emit_jump_insn (gen_simple_return_pop_internal (popc)); - } - else if (!m->call_ms2sysv || !restore_stub_is_tail) - { - /* In case of return from EH a simple return cannot be used - as a return address will be compared with a shadow stack - return address. Use indirect jump instead. */ - if (style == 2 && flag_cf_protection) + else if (flag_pic) { - /* Register used in indirect jump must be in word_mode. But - Pmode may not be the same as word_mode for x32. */ - rtx ecx = gen_rtx_REG (word_mode, CX_REG); - rtx_insn *insn; - - insn = emit_insn (gen_pop (ecx)); - m->fs.cfa_offset -= UNITS_PER_WORD; - m->fs.sp_offset -= UNITS_PER_WORD; - - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; - - emit_jump_insn (gen_simple_return_indirect_internal (ecx)); + pic = pic_offset_table_rtx; + type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF; + } + else if (!TARGET_ANY_GNU_TLS) + { + pic = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (pic)); + type = UNSPEC_GOTTPOFF; } else - emit_jump_insn (gen_simple_return_internal ()); - } - - /* Restore the state back to the state from the prologue, - so that it's correct for the next epilogue. */ - m->fs = frame_state_save; -} + { + pic = NULL; + type = UNSPEC_INDNTPOFF; + } -/* Reset from the function's potential modifications. 
*/ + off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type); + off = gen_rtx_CONST (tp_mode, off); + if (pic) + off = gen_rtx_PLUS (tp_mode, pic, off); + off = gen_const_mem (tp_mode, off); + set_mem_alias_set (off, ix86_GOT_alias_set ()); -static void -ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED) -{ - if (pic_offset_table_rtx - && !ix86_use_pseudo_pic_reg ()) - SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM); + if (TARGET_64BIT || TARGET_ANY_GNU_TLS) + { + base = get_thread_pointer (tp_mode, + for_mov || !TARGET_TLS_DIRECT_SEG_REFS); + off = force_reg (tp_mode, off); + dest = gen_rtx_PLUS (tp_mode, base, off); + if (tp_mode != Pmode) + dest = convert_to_mode (Pmode, dest, 1); + } + else + { + base = get_thread_pointer (Pmode, true); + dest = gen_reg_rtx (Pmode); + emit_insn (ix86_gen_sub3 (dest, base, off)); + } + break; - if (TARGET_MACHO) - { - rtx_insn *insn = get_last_insn (); - rtx_insn *deleted_debug_label = NULL; + case TLS_MODEL_LOCAL_EXEC: + off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), + (TARGET_64BIT || TARGET_ANY_GNU_TLS) + ? UNSPEC_NTPOFF : UNSPEC_TPOFF); + off = gen_rtx_CONST (Pmode, off); - /* Mach-O doesn't support labels at the end of objects, so if - it looks like we might want one, take special action. - First, collect any sequence of deleted debug labels. */ - while (insn - && NOTE_P (insn) - && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL) + if (TARGET_64BIT || TARGET_ANY_GNU_TLS) { - /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL - notes only, instead set their CODE_LABEL_NUMBER to -1, - otherwise there would be code generation differences - in between -g and -g0. */ - if (NOTE_P (insn) && NOTE_KIND (insn) - == NOTE_INSN_DELETED_DEBUG_LABEL) - deleted_debug_label = insn; - insn = PREV_INSN (insn); + base = get_thread_pointer (Pmode, + for_mov || !TARGET_TLS_DIRECT_SEG_REFS); + return gen_rtx_PLUS (Pmode, base, off); + } + else + { + base = get_thread_pointer (Pmode, true); + dest = gen_reg_rtx (Pmode); + emit_insn (ix86_gen_sub3 (dest, base, off)); } + break; - /* If we have: - label: - barrier - then this needs to be detected, so skip past the barrier. */ + default: + gcc_unreachable (); + } - if (insn && BARRIER_P (insn)) - insn = PREV_INSN (insn); + return dest; +} - /* Up to now we've only seen notes or barriers. */ - if (insn) +/* Return true if OP refers to a TLS address. */ +bool +ix86_tls_address_pattern_p (rtx op) +{ + subrtx_var_iterator::array_type array; + FOR_EACH_SUBRTX_VAR (iter, array, op, ALL) + { + rtx op = *iter; + if (MEM_P (op)) { - if (LABEL_P (insn) - || (NOTE_P (insn) - && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) - /* Trailing label. */ - fputs ("\tnop\n", file); - else if (cfun && ! cfun->is_thunk) + rtx *x = &XEXP (op, 0); + while (GET_CODE (*x) == PLUS) { - /* See if we have a completely empty function body, skipping - the special case of the picbase thunk emitted as asm. */ - while (insn && ! INSN_P (insn)) - insn = PREV_INSN (insn); - /* If we don't find any insns, we've got an empty function body; - I.e. completely empty - without a return or branch. This is - taken as the case where a function body has been removed - because it contains an inline __builtin_unreachable(). GCC - declares that reaching __builtin_unreachable() means UB so - we're not obliged to do anything special; however, we want - non-zero-sized function bodies. To meet this, and help the - user out, let's trap the case. 
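The four TLS models handled by the switch above can also be forced from the command line, which makes it easy to compare the access sequences this code produces. A small assumed example, not part of the patch:

/* tls-example.c: compare, e.g.,
     gcc -O2 -fPIC -ftls-model=global-dynamic -S tls-example.c
     gcc -O2 -fPIC -ftls-model=local-dynamic  -S tls-example.c
     gcc -O2       -ftls-model=initial-exec   -S tls-example.c
     gcc -O2       -ftls-model=local-exec     -S tls-example.c  */
__thread int counter;

int
bump (void)
{
  return ++counter;   /* the access sequence depends on the model above */
}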
*/ - if (insn == NULL) - fputs ("\tud2\n", file); + int i; + for (i = 0; i < 2; i++) + { + rtx u = XEXP (*x, i); + if (GET_CODE (u) == ZERO_EXTEND) + u = XEXP (u, 0); + if (GET_CODE (u) == UNSPEC + && XINT (u, 1) == UNSPEC_TP) + return true; + } + x = &XEXP (*x, 0); } + + iter.skip_subrtxes (); } - else if (deleted_debug_label) - for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn)) - if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL) - CODE_LABEL_NUMBER (insn) = -1; } -} -/* Return a scratch register to use in the split stack prologue. The - split stack prologue is used for -fsplit-stack. It is the first - instructions in the function, even before the regular prologue. - The scratch register can be any caller-saved register which is not - used for parameters or for the static chain. */ + return false; +} -static unsigned int -split_stack_prologue_scratch_regno (void) +/* Rewrite *LOC so that it refers to a default TLS address space. */ +void +ix86_rewrite_tls_address_1 (rtx *loc) { - if (TARGET_64BIT) - return R11_REG; - else + subrtx_ptr_iterator::array_type array; + FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL) { - bool is_fastcall, is_thiscall; - int regparm; - - is_fastcall = (lookup_attribute ("fastcall", - TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) - != NULL); - is_thiscall = (lookup_attribute ("thiscall", - TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) - != NULL); - regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl); - - if (is_fastcall) - { - if (DECL_STATIC_CHAIN (cfun->decl)) - { - sorry ("%<-fsplit-stack%> does not support fastcall with " - "nested function"); - return INVALID_REGNUM; - } - return AX_REG; - } - else if (is_thiscall) - { - if (!DECL_STATIC_CHAIN (cfun->decl)) - return DX_REG; - return AX_REG; - } - else if (regparm < 3) + rtx *loc = *iter; + if (MEM_P (*loc)) { - if (!DECL_STATIC_CHAIN (cfun->decl)) - return CX_REG; - else + rtx addr = XEXP (*loc, 0); + rtx *x = &addr; + while (GET_CODE (*x) == PLUS) { - if (regparm >= 2) + int i; + for (i = 0; i < 2; i++) { - sorry ("%<-fsplit-stack%> does not support 2 register " - "parameters for a nested function"); - return INVALID_REGNUM; + rtx u = XEXP (*x, i); + if (GET_CODE (u) == ZERO_EXTEND) + u = XEXP (u, 0); + if (GET_CODE (u) == UNSPEC + && XINT (u, 1) == UNSPEC_TP) + { + addr_space_t as = DEFAULT_TLS_SEG_REG; + + *x = XEXP (*x, 1 - i); + + *loc = replace_equiv_address_nv (*loc, addr, true); + set_mem_addr_space (*loc, as); + return; + } } - return DX_REG; + x = &XEXP (*x, 0); } - } - else - { - /* FIXME: We could make this work by pushing a register - around the addition and comparison. */ - sorry ("%<-fsplit-stack%> does not support 3 register parameters"); - return INVALID_REGNUM; + + iter.skip_subrtxes (); } } } -/* A SYMBOL_REF for the function which allocates new stackspace for - -fsplit-stack. */ - -static GTY(()) rtx split_stack_fn; - -/* A SYMBOL_REF for the more stack function when using the large - model. */ - -static GTY(()) rtx split_stack_fn_large; +/* Rewrite instruction pattern involvning TLS address + so that it refers to a default TLS address space. */ +rtx +ix86_rewrite_tls_address (rtx pattern) +{ + pattern = copy_insn (pattern); + ix86_rewrite_tls_address_1 (&pattern); + return pattern; +} -/* Return location of the stack guard value in the TLS block. */ +/* Create or return the unique __imp_DECL dllimport symbol corresponding + to symbol DECL if BEIMPORT is true. Otherwise create or return the + unique refptr-DECL symbol corresponding to symbol DECL. 
*/ -rtx -ix86_split_stack_guard (void) +struct dllimport_hasher : ggc_cache_ptr_hash { - int offset; - addr_space_t as = DEFAULT_TLS_SEG_REG; - rtx r; + static inline hashval_t hash (tree_map *m) { return m->hash; } + static inline bool + equal (tree_map *a, tree_map *b) + { + return a->base.from == b->base.from; + } - gcc_assert (flag_split_stack); + static int + keep_cache_entry (tree_map *&m) + { + return ggc_marked_p (m->base.from); + } +}; -#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET - offset = TARGET_THREAD_SPLIT_STACK_OFFSET; -#else - gcc_unreachable (); +static GTY((cache)) hash_table *dllimport_map; + +static tree +get_dllimport_decl (tree decl, bool beimport) +{ + struct tree_map *h, in; + const char *name; + const char *prefix; + size_t namelen, prefixlen; + char *imp_name; + tree to; + rtx rtl; + + if (!dllimport_map) + dllimport_map = hash_table::create_ggc (512); + + in.hash = htab_hash_pointer (decl); + in.base.from = decl; + tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT); + h = *loc; + if (h) + return h->to; + + *loc = h = ggc_alloc (); + h->hash = in.hash; + h->base.from = decl; + h->to = to = build_decl (DECL_SOURCE_LOCATION (decl), + VAR_DECL, NULL, ptr_type_node); + DECL_ARTIFICIAL (to) = 1; + DECL_IGNORED_P (to) = 1; + DECL_EXTERNAL (to) = 1; + TREE_READONLY (to) = 1; + + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = targetm.strip_name_encoding (name); + if (beimport) + prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0 + ? "*__imp_" : "*__imp__"; + else + prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr."; + namelen = strlen (name); + prefixlen = strlen (prefix); + imp_name = (char *) alloca (namelen + prefixlen + 1); + memcpy (imp_name, prefix, prefixlen); + memcpy (imp_name + prefixlen, name, namelen + 1); + + name = ggc_alloc_string (imp_name, namelen + prefixlen); + rtl = gen_rtx_SYMBOL_REF (Pmode, name); + SET_SYMBOL_REF_DECL (rtl, to); + SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR; + if (!beimport) + { + SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL; +#ifdef SUB_TARGET_RECORD_STUB + SUB_TARGET_RECORD_STUB (name); #endif + } - r = GEN_INT (offset); - r = gen_const_mem (Pmode, r); - set_mem_addr_space (r, as); + rtl = gen_const_mem (Pmode, rtl); + set_mem_alias_set (rtl, ix86_GOT_alias_set ()); - return r; + SET_DECL_RTL (to, rtl); + SET_DECL_ASSEMBLER_NAME (to, get_identifier (name)); + + return to; } -/* Handle -fsplit-stack. These are the first instructions in the - function, even before the regular prologue. */ +/* Expand SYMBOL into its corresponding far-address symbol. + WANT_REG is true if we require the result be a register. 
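A standalone sketch of the name construction done in get_dllimport_decl above, simplified to plain C (the real code also honors FASTCALL_PREFIX and user_label_prefix and interns the string in GC-managed memory); the helper name here is invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build the "*__imp_"-style indirection name for a dllimport symbol,
   or the "*refptr."-style name for a far external reference.  */
static char *
make_indirect_name (const char *name, int beimport)
{
  const char *prefix = beimport ? "*__imp_" : "*refptr.";
  size_t len = strlen (prefix) + strlen (name) + 1;
  char *buf = malloc (len);
  if (buf)
    snprintf (buf, len, "%s%s", prefix, name);
  return buf;   /* caller frees */
}

int
main (void)
{
  char *p = make_indirect_name ("foo", 1);
  puts (p ? p : "alloc failed");        /* prints *__imp_foo */
  free (p);
  return 0;
}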
*/ -void -ix86_expand_split_stack_prologue (void) +static rtx +legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg) { - HOST_WIDE_INT allocate; - unsigned HOST_WIDE_INT args_size; - rtx_code_label *label; - rtx limit, current, allocate_rtx, call_fusage; - rtx_insn *call_insn; - rtx scratch_reg = NULL_RTX; - rtx_code_label *varargs_label = NULL; - rtx fn; + tree imp_decl; + rtx x; - gcc_assert (flag_split_stack && reload_completed); + gcc_assert (SYMBOL_REF_DECL (symbol)); + imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false); - ix86_finalize_stack_frame_flags (); - struct ix86_frame &frame = cfun->machine->frame; - allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET; + x = DECL_RTL (imp_decl); + if (want_reg) + x = force_reg (Pmode, x); + return x; +} - /* This is the label we will branch to if we have enough stack - space. We expect the basic block reordering pass to reverse this - branch if optimizing, so that we branch in the unlikely case. */ - label = gen_label_rtx (); +/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is + true if we require the result be a register. */ - /* We need to compare the stack pointer minus the frame size with - the stack boundary in the TCB. The stack boundary always gives - us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we - can compare directly. Otherwise we need to do an addition. */ +static rtx +legitimize_dllimport_symbol (rtx symbol, bool want_reg) +{ + tree imp_decl; + rtx x; - limit = ix86_split_stack_guard (); + gcc_assert (SYMBOL_REF_DECL (symbol)); + imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true); - if (allocate < SPLIT_STACK_AVAILABLE) - current = stack_pointer_rtx; - else - { - unsigned int scratch_regno; - rtx offset; + x = DECL_RTL (imp_decl); + if (want_reg) + x = force_reg (Pmode, x); + return x; +} - /* We need a scratch register to hold the stack pointer minus - the required frame size. Since this is the very start of the - function, the scratch register can be any caller-saved - register which is not used for parameters. */ - offset = GEN_INT (- allocate); - scratch_regno = split_stack_prologue_scratch_regno (); - if (scratch_regno == INVALID_REGNUM) - return; - scratch_reg = gen_rtx_REG (Pmode, scratch_regno); - if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode)) - { - /* We don't use ix86_gen_add3 in this case because it will - want to split to lea, but when not optimizing the insn - will not be split after this point. */ - emit_insn (gen_rtx_SET (scratch_reg, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - offset))); - } - else +/* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG + is true if we require the result be a register. 
*/ + +rtx +legitimize_pe_coff_symbol (rtx addr, bool inreg) +{ + if (!TARGET_PECOFF) + return NULL_RTX; + + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) + { + if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr)) + return legitimize_dllimport_symbol (addr, inreg); + if (GET_CODE (addr) == CONST + && GET_CODE (XEXP (addr, 0)) == PLUS + && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF + && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0))) { - emit_move_insn (scratch_reg, offset); - emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg, - stack_pointer_rtx)); + rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); } - current = scratch_reg; } - ix86_expand_branch (GEU, current, limit, label); - rtx_insn *jump_insn = get_last_insn (); - JUMP_LABEL (jump_insn) = label; - - /* Mark the jump as very likely to be taken. */ - add_reg_br_prob_note (jump_insn, profile_probability::very_likely ()); + if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC) + return NULL_RTX; + if (GET_CODE (addr) == SYMBOL_REF + && !is_imported_p (addr) + && SYMBOL_REF_EXTERNAL_P (addr) + && SYMBOL_REF_DECL (addr)) + return legitimize_pe_coff_extern_decl (addr, inreg); - if (split_stack_fn == NULL_RTX) + if (GET_CODE (addr) == CONST + && GET_CODE (XEXP (addr, 0)) == PLUS + && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF + && !is_imported_p (XEXP (XEXP (addr, 0), 0)) + && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0)) + && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0))) { - split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); - SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL; + rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); } - fn = split_stack_fn; + return NULL_RTX; +} - /* Get more stack space. We pass in the desired stack space and the - size of the arguments to copy to the new stack. In 32-bit mode - we push the parameters; __morestack will return on a new stack - anyhow. In 64-bit mode we pass the parameters in r10 and - r11. */ - allocate_rtx = GEN_INT (allocate); - args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0; - call_fusage = NULL_RTX; - rtx pop = NULL_RTX; - if (TARGET_64BIT) - { - rtx reg10, reg11; +/* Try machine-dependent ways of modifying an illegitimate address + to be legitimate. If we find one, return the new, valid address. + This macro is used in only one place: `memory_address' in explow.c. - reg10 = gen_rtx_REG (Pmode, R10_REG); - reg11 = gen_rtx_REG (Pmode, R11_REG); + OLDX is the address as it was before break_out_memory_refs was called. + In some cases it is useful to look at this to decide what needs to be done. - /* If this function uses a static chain, it will be in %r10. - Preserve it across the call to __morestack. */ - if (DECL_STATIC_CHAIN (cfun->decl)) - { - rtx rax; + It is always safe for this macro to do nothing. It exists to recognize + opportunities to optimize the output. - rax = gen_rtx_REG (word_mode, AX_REG); - emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG)); - use_reg (&call_fusage, rax); - } - - if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) - && !TARGET_PECOFF) - { - HOST_WIDE_INT argval; - - gcc_assert (Pmode == DImode); - /* When using the large model we need to load the address - into a register, and we've run out of registers. So we - switch to a different calling convention, and we call a - different function: __morestack_large. 
We pass the - argument size in the upper 32 bits of r10 and pass the - frame size in the lower 32 bits. */ - gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate); - gcc_assert ((args_size & 0xffffffff) == args_size); - - if (split_stack_fn_large == NULL_RTX) - { - split_stack_fn_large - = gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model"); - SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL; - } - if (ix86_cmodel == CM_LARGE_PIC) - { - rtx_code_label *label; - rtx x; - - label = gen_label_rtx (); - emit_label (label); - LABEL_PRESERVE_P (label) = 1; - emit_insn (gen_set_rip_rex64 (reg10, label)); - emit_insn (gen_set_got_offset_rex64 (reg11, label)); - emit_insn (ix86_gen_add3 (reg10, reg10, reg11)); - x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large), - UNSPEC_GOT); - x = gen_rtx_CONST (Pmode, x); - emit_move_insn (reg11, x); - x = gen_rtx_PLUS (Pmode, reg10, reg11); - x = gen_const_mem (Pmode, x); - emit_move_insn (reg11, x); - } - else - emit_move_insn (reg11, split_stack_fn_large); + For the 80386, we handle X+REG by loading X into a register R and + using R+REG. R will go in a general reg and indexing will be used. + However, if REG is a broken-out memory address or multiplication, + nothing needs to be done because REG can certainly go in a general reg. - fn = reg11; + When -fpic is used, special handling is needed for symbolic references. + See comments by legitimize_pic_address in i386.c for details. */ - argval = ((args_size << 16) << 16) + allocate; - emit_move_insn (reg10, GEN_INT (argval)); - } - else - { - emit_move_insn (reg10, allocate_rtx); - emit_move_insn (reg11, GEN_INT (args_size)); - use_reg (&call_fusage, reg11); - } +static rtx +ix86_legitimize_address (rtx x, rtx, machine_mode mode) +{ + bool changed = false; + unsigned log; - use_reg (&call_fusage, reg10); + log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0; + if (log) + return legitimize_tls_address (x, (enum tls_model) log, false); + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF + && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)))) + { + rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), + (enum tls_model) log, false); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); } - else + + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) { - rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size))); - add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD)); - insn = emit_insn (gen_push (allocate_rtx)); - add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD)); - pop = GEN_INT (2 * UNITS_PER_WORD); + rtx tmp = legitimize_pe_coff_symbol (x, true); + if (tmp) + return tmp; } - call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn), - GEN_INT (UNITS_PER_WORD), constm1_rtx, - pop, false); - add_function_usage_to (call_insn, call_fusage); - if (!TARGET_64BIT) - add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0)); - /* Indicate that this function can't jump to non-local gotos. */ - make_reg_eh_region_note_nothrow_nononlocal (call_insn); - /* In order to make call/return prediction work right, we now need - to execute a return instruction. See - libgcc/config/i386/morestack.S for the details on how this works. + if (flag_pic && SYMBOLIC_CONST (x)) + return legitimize_pic_address (x, 0); - For flow purposes gcc must not see this as a return - instruction--we need control flow to continue at the subsequent - label. Therefore, we use an unspec. 
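The large-model path above packs two 32-bit quantities into a single 64-bit register argument, the argument size in the upper half and the frame size in the lower half. A self-contained check of that packing, using made-up example values:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t args_size = 0x30;            /* example values, both < 2^32 */
  uint64_t allocate  = 0x1000;

  /* Same packing as the __morestack_large_model call argument:
     argument size in the upper 32 bits, frame size in the lower 32.  */
  uint64_t argval = ((args_size << 16) << 16) + allocate;

  assert ((argval >> 32) == args_size);
  assert ((argval & 0xffffffffu) == allocate);
  return 0;
}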
*/ - gcc_assert (crtl->args.pops_args < 65536); - rtx_insn *ret_insn - = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args))); +#if TARGET_MACHO + if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x)) + return machopic_indirect_data_reference (x, 0); +#endif - if ((flag_cf_protection & CF_BRANCH)) + /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ + if (GET_CODE (x) == ASHIFT + && CONST_INT_P (XEXP (x, 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) { - /* Insert ENDBR since __morestack will jump back here via indirect - call. */ - rtx cet_eb = gen_nop_endbr (); - emit_insn_after (cet_eb, ret_insn); + changed = true; + log = INTVAL (XEXP (x, 1)); + x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)), + GEN_INT (1 << log)); } - /* If we are in 64-bit mode and this function uses a static chain, - we saved %r10 in %rax before calling _morestack. */ - if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl)) - emit_move_insn (gen_rtx_REG (word_mode, R10_REG), - gen_rtx_REG (word_mode, AX_REG)); - - /* If this function calls va_start, we need to store a pointer to - the arguments on the old stack, because they may not have been - all copied to the new stack. At this point the old stack can be - found at the frame pointer value used by __morestack, because - __morestack has set that up before calling back to us. Here we - store that pointer in a scratch register, and in - ix86_expand_prologue we store the scratch register in a stack - slot. */ - if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + if (GET_CODE (x) == PLUS) { - unsigned int scratch_regno; - rtx frame_reg; - int words; + /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ - scratch_regno = split_stack_prologue_scratch_regno (); - scratch_reg = gen_rtx_REG (Pmode, scratch_regno); - frame_reg = gen_rtx_REG (Pmode, BP_REG); + if (GET_CODE (XEXP (x, 0)) == ASHIFT + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) + { + changed = true; + log = INTVAL (XEXP (XEXP (x, 0), 1)); + XEXP (x, 0) = gen_rtx_MULT (Pmode, + force_reg (Pmode, XEXP (XEXP (x, 0), 0)), + GEN_INT (1 << log)); + } - /* 64-bit: - fp -> old fp value - return address within this function - return address of caller of this function - stack arguments - So we add three words to get to the stack arguments. + if (GET_CODE (XEXP (x, 1)) == ASHIFT + && CONST_INT_P (XEXP (XEXP (x, 1), 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) + { + changed = true; + log = INTVAL (XEXP (XEXP (x, 1), 1)); + XEXP (x, 1) = gen_rtx_MULT (Pmode, + force_reg (Pmode, XEXP (XEXP (x, 1), 0)), + GEN_INT (1 << log)); + } - 32-bit: - fp -> old fp value - return address within this function - first argument to __morestack - second argument to __morestack - return address of caller of this function - stack arguments - So we add five words to get to the stack arguments. - */ - words = TARGET_64BIT ? 3 : 5; - emit_insn (gen_rtx_SET (scratch_reg, - gen_rtx_PLUS (Pmode, frame_reg, - GEN_INT (words * UNITS_PER_WORD)))); + /* Put multiply first if it isn't already. */ + if (GET_CODE (XEXP (x, 1)) == MULT) + { + std::swap (XEXP (x, 0), XEXP (x, 1)); + changed = true; + } - varargs_label = gen_label_rtx (); - emit_jump_insn (gen_jump (varargs_label)); - JUMP_LABEL (get_last_insn ()) = varargs_label; + /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const))) + into (plus (plus (mult (reg) (const)) (reg)) (const)). 
This can be + created by virtual register instantiation, register elimination, and + similar optimizations. */ + if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS) + { + changed = true; + x = gen_rtx_PLUS (Pmode, + gen_rtx_PLUS (Pmode, XEXP (x, 0), + XEXP (XEXP (x, 1), 0)), + XEXP (XEXP (x, 1), 1)); + } - emit_barrier (); - } + /* Canonicalize + (plus (plus (mult (reg) (const)) (plus (reg) (const))) const) + into (plus (plus (mult (reg) (const)) (reg)) (const)). */ + else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS + && CONSTANT_P (XEXP (x, 1))) + { + rtx constant; + rtx other = NULL_RTX; - emit_label (label); - LABEL_NUSES (label) = 1; + if (CONST_INT_P (XEXP (x, 1))) + { + constant = XEXP (x, 1); + other = XEXP (XEXP (XEXP (x, 0), 1), 1); + } + else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) + { + constant = XEXP (XEXP (XEXP (x, 0), 1), 1); + other = XEXP (x, 1); + } + else + constant = 0; - /* If this function calls va_start, we now have to set the scratch - register for the case where we do not call __morestack. In this - case we need to set it based on the stack pointer. */ - if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) - { - emit_insn (gen_rtx_SET (scratch_reg, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - GEN_INT (UNITS_PER_WORD)))); + if (constant) + { + changed = true; + x = gen_rtx_PLUS (Pmode, + gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0), + XEXP (XEXP (XEXP (x, 0), 1), 0)), + plus_constant (Pmode, other, + INTVAL (constant))); + } + } - emit_label (varargs_label); - LABEL_NUSES (varargs_label) = 1; - } -} - -/* We may have to tell the dataflow pass that the split stack prologue - is initializing a scratch register. */ - -static void -ix86_live_on_entry (bitmap regs) -{ - if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) - { - gcc_assert (flag_split_stack); - bitmap_set_bit (regs, split_stack_prologue_scratch_regno ()); - } -} - -/* Extract the parts of an RTL expression that is a valid memory address - for an instruction. Return 0 if the structure of the address is - grossly off. Return -1 if the address contains ASHIFT, so it is not - strictly valid, but still used for computing length of lea instruction. */ - -int -ix86_decompose_address (rtx addr, struct ix86_address *out) -{ - rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; - rtx base_reg, index_reg; - HOST_WIDE_INT scale = 1; - rtx scale_rtx = NULL_RTX; - rtx tmp; - int retval = 1; - addr_space_t seg = ADDR_SPACE_GENERIC; + if (changed && ix86_legitimate_address_p (mode, x, false)) + return x; - /* Allow zero-extended SImode addresses, - they will be emitted with addr32 prefix. */ - if (TARGET_64BIT && GET_MODE (addr) == DImode) - { - if (GET_CODE (addr) == ZERO_EXTEND - && GET_MODE (XEXP (addr, 0)) == SImode) - { - addr = XEXP (addr, 0); - if (CONST_INT_P (addr)) - return 0; - } - else if (GET_CODE (addr) == AND - && const_32bit_mask (XEXP (addr, 1), DImode)) + if (GET_CODE (XEXP (x, 0)) == MULT) { - addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode); - if (addr == NULL_RTX) - return 0; - - if (CONST_INT_P (addr)) - return 0; + changed = true; + XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0)); } - } - /* Allow SImode subregs of DImode addresses, - they will be emitted with addr32 prefix. 
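The canonicalization above rewrites left shifts by 0, 1, 2 or 3 into multiplies because an x86 address can only encode the scales 1, 2, 4 and 8, i.e. 1 << log for log < 4. A trivial standalone sketch of that rule (hypothetical names):

#include <stdbool.h>
#include <stdio.h>

/* Map a shift count onto an encodable address scale, using the same
   bound the patch checks (count < 4).  */
static bool
shift_count_to_scale (unsigned log, unsigned *scale)
{
  if (log > 3)
    return false;
  *scale = 1u << log;   /* 0 -> 1, 1 -> 2, 2 -> 4, 3 -> 8 */
  return true;
}

int
main (void)
{
  for (unsigned log = 0; log < 5; log++)
    {
      unsigned scale;
      if (shift_count_to_scale (log, &scale))
        printf ("x << %u can be encoded as x * %u\n", log, scale);
      else
        printf ("x << %u is not an encodable address scale\n", log);
    }
  return 0;
}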
*/ - if (TARGET_64BIT && GET_MODE (addr) == SImode) - { - if (SUBREG_P (addr) - && GET_MODE (SUBREG_REG (addr)) == DImode) + if (GET_CODE (XEXP (x, 1)) == MULT) { - addr = SUBREG_REG (addr); - if (CONST_INT_P (addr)) - return 0; + changed = true; + XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1)); } - } - if (REG_P (addr)) - base = addr; - else if (SUBREG_P (addr)) - { - if (REG_P (SUBREG_REG (addr))) - base = addr; - else - return 0; - } - else if (GET_CODE (addr) == PLUS) - { - rtx addends[4], op; - int n = 0, i; + if (changed + && REG_P (XEXP (x, 1)) + && REG_P (XEXP (x, 0))) + return x; - op = addr; - do + if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) { - if (n >= 4) - return 0; - addends[n++] = XEXP (op, 1); - op = XEXP (op, 0); + changed = true; + x = legitimize_pic_address (x, 0); } - while (GET_CODE (op) == PLUS); - if (n >= 4) - return 0; - addends[n] = op; - for (i = n; i >= 0; --i) + if (changed && ix86_legitimate_address_p (mode, x, false)) + return x; + + if (REG_P (XEXP (x, 0))) { - op = addends[i]; - switch (GET_CODE (op)) + rtx temp = gen_reg_rtx (Pmode); + rtx val = force_operand (XEXP (x, 1), temp); + if (val != temp) { - case MULT: - if (index) - return 0; - index = XEXP (op, 0); - scale_rtx = XEXP (op, 1); - break; - - case ASHIFT: - if (index) - return 0; - index = XEXP (op, 0); - tmp = XEXP (op, 1); - if (!CONST_INT_P (tmp)) - return 0; - scale = INTVAL (tmp); - if ((unsigned HOST_WIDE_INT) scale > 3) - return 0; - scale = 1 << scale; - break; - - case ZERO_EXTEND: - op = XEXP (op, 0); - if (GET_CODE (op) != UNSPEC) - return 0; - /* FALLTHRU */ - - case UNSPEC: - if (XINT (op, 1) == UNSPEC_TP - && TARGET_TLS_DIRECT_SEG_REFS - && seg == ADDR_SPACE_GENERIC) - seg = DEFAULT_TLS_SEG_REG; - else - return 0; - break; - - case SUBREG: - if (!REG_P (SUBREG_REG (op))) - return 0; - /* FALLTHRU */ - - case REG: - if (!base) - base = op; - else if (!index) - index = op; - else - return 0; - break; + val = convert_to_mode (Pmode, val, 1); + emit_move_insn (temp, val); + } - case CONST: - case CONST_INT: - case SYMBOL_REF: - case LABEL_REF: - if (disp) - return 0; - disp = op; - break; + XEXP (x, 1) = temp; + return x; + } - default: - return 0; + else if (REG_P (XEXP (x, 1))) + { + rtx temp = gen_reg_rtx (Pmode); + rtx val = force_operand (XEXP (x, 0), temp); + if (val != temp) + { + val = convert_to_mode (Pmode, val, 1); + emit_move_insn (temp, val); } + + XEXP (x, 0) = temp; + return x; } } - else if (GET_CODE (addr) == MULT) - { - index = XEXP (addr, 0); /* index*scale */ - scale_rtx = XEXP (addr, 1); - } - else if (GET_CODE (addr) == ASHIFT) - { - /* We're called for lea too, which implements ashift on occasion. */ - index = XEXP (addr, 0); - tmp = XEXP (addr, 1); - if (!CONST_INT_P (tmp)) - return 0; - scale = INTVAL (tmp); - if ((unsigned HOST_WIDE_INT) scale > 3) - return 0; - scale = 1 << scale; - retval = -1; - } - else - disp = addr; /* displacement */ - if (index) - { - if (REG_P (index)) - ; - else if (SUBREG_P (index) - && REG_P (SUBREG_REG (index))) - ; - else - return 0; - } + return x; +} + +/* Print an integer constant expression in assembler syntax. Addition + and subtraction are the only arithmetic that may appear in these + expressions. FILE is the stdio stream to write to, X is the rtx, and + CODE is the operand print code from the output string. */ - /* Extract the integral value of scale. 
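The removed ix86_decompose_address walks the addends of a PLUS chain and classifies each one: a MULT or ASHIFT becomes the scaled index, the first plain register becomes the base, the second the index, and a constant becomes the displacement, with duplicates rejected. A simplified standalone sketch of that classification over a toy term list (hypothetical types, no rtl):

#include <stdbool.h>
#include <stdio.h>

struct toy_term
{
  const char *reg;   /* register name, or NULL for a constant term */
  int scale;         /* 1 for a plain register */
  long value;        /* constant value when reg == NULL */
};

struct toy_parts
{
  const char *base, *index;
  int scale;
  long disp;
  bool has_disp;
};

/* Classify the addends of base + index*scale + disp, mirroring the
   switch in the removed code: one scaled index, at most two plain
   registers, one displacement.  */
static bool
toy_decompose (const struct toy_term *terms, int n, struct toy_parts *out)
{
  out->base = out->index = NULL;
  out->scale = 1;
  out->disp = 0;
  out->has_disp = false;

  for (int i = 0; i < n; i++)
    {
      const struct toy_term *t = &terms[i];
      if (t->reg == NULL)
        {
          if (out->has_disp)
            return false;          /* second displacement */
          out->disp = t->value;
          out->has_disp = true;
        }
      else if (t->scale != 1)
        {
          if (out->index)
            return false;          /* second scaled index */
          out->index = t->reg;
          out->scale = t->scale;
        }
      else if (!out->base)
        out->base = t->reg;
      else if (!out->index)
        out->index = t->reg;
      else
        return false;              /* third register */
    }
  return true;
}

int
main (void)
{
  struct toy_term terms[] = { { "%rcx", 4, 0 }, { "%rax", 1, 0 }, { NULL, 1, 16 } };
  struct toy_parts p;
  if (toy_decompose (terms, 3, &p))
    printf ("base=%s index=%s scale=%d disp=%ld\n",
            p.base, p.index ? p.index : "-", p.scale, p.disp);
  return 0;
}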
*/ - if (scale_rtx) +static void +output_pic_addr_const (FILE *file, rtx x, int code) +{ + char buf[256]; + + switch (GET_CODE (x)) { - if (!CONST_INT_P (scale_rtx)) - return 0; - scale = INTVAL (scale_rtx); - } + case PC: + gcc_assert (flag_pic); + putc ('.', file); + break; - base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base; - index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index; + case SYMBOL_REF: + if (TARGET_64BIT || ! TARGET_MACHO_SYMBOL_STUBS) + output_addr_const (file, x); + else + { + const char *name = XSTR (x, 0); - /* Avoid useless 0 displacement. */ - if (disp == const0_rtx && (base || index)) - disp = NULL_RTX; + /* Mark the decl as referenced so that cgraph will + output the function. */ + if (SYMBOL_REF_DECL (x)) + mark_decl_referenced (SYMBOL_REF_DECL (x)); - /* Allow arg pointer and stack pointer as index if there is not scaling. */ - if (base_reg && index_reg && scale == 1 - && (REGNO (index_reg) == ARG_POINTER_REGNUM - || REGNO (index_reg) == FRAME_POINTER_REGNUM - || REGNO (index_reg) == SP_REG)) - { - std::swap (base, index); - std::swap (base_reg, index_reg); - } - - /* Special case: %ebp cannot be encoded as a base without a displacement. - Similarly %r13. */ - if (!disp && base_reg - && (REGNO (base_reg) == ARG_POINTER_REGNUM - || REGNO (base_reg) == FRAME_POINTER_REGNUM - || REGNO (base_reg) == BP_REG - || REGNO (base_reg) == R13_REG)) - disp = const0_rtx; - - /* Special case: on K6, [%esi] makes the instruction vector decoded. - Avoid this by transforming to [%esi+0]. - Reload calls address legitimization without cfun defined, so we need - to test cfun for being non-NULL. */ - if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun) - && base_reg && !index_reg && !disp - && REGNO (base_reg) == SI_REG) - disp = const0_rtx; - - /* Special case: encode reg+reg instead of reg*2. */ - if (!base && index && scale == 2) - base = index, base_reg = index_reg, scale = 1; - - /* Special case: scaling cannot be encoded without base or displacement. */ - if (!base && !disp && index && scale != 1) - disp = const0_rtx; - - out->base = base; - out->index = index; - out->disp = disp; - out->scale = scale; - out->seg = seg; - - return retval; -} - -/* Return cost of the memory address x. - For i386, it is better to use a complex address than let gcc copy - the address into a reg and make a new pseudo. But not if the address - requires to two regs - that would mean more pseudos with longer - lifetimes. */ -static int -ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) -{ - struct ix86_address parts; - int cost = 1; - int ok = ix86_decompose_address (x, &parts); - - gcc_assert (ok); +#if TARGET_MACHO + if (MACHOPIC_INDIRECT + && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION) + name = machopic_indirection_name (x, /*stub_p=*/true); +#endif + assemble_name (file, name); + } + if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF) + && code == 'P' && ! SYMBOL_REF_LOCAL_P (x)) + fputs ("@PLT", file); + break; - if (parts.base && SUBREG_P (parts.base)) - parts.base = SUBREG_REG (parts.base); - if (parts.index && SUBREG_P (parts.index)) - parts.index = SUBREG_REG (parts.index); + case LABEL_REF: + x = XEXP (x, 0); + /* FALLTHRU */ + case CODE_LABEL: + ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x)); + assemble_name (asm_out_file, buf); + break; - /* Attempt to minimize number of registers in the address by increasing - address cost for each used register. We don't increase address cost - for "pic_offset_table_rtx". 
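The special cases listed in the removed code above all come down to what the x86 encoding can express: %ebp or %r13 as a base, or a scaled index with no base, cannot be emitted without an explicit displacement, so a zero displacement is forced in. A standalone sketch that renders the decomposed form in AT&T syntax and applies those fix-ups (hypothetical names, register names hard-coded for illustration):

#include <stdio.h>
#include <string.h>

struct toy_address
{
  const char *base;   /* e.g. "%rbp", or NULL */
  const char *index;  /* e.g. "%rcx", or NULL */
  int scale;          /* 1, 2, 4 or 8 */
  long disp;
  int has_disp;
};

/* Print disp(base,index,scale), forcing a zero displacement where the
   encoding requires one, as the removed special cases do.  */
static void
print_att_address (struct toy_address *a)
{
  if (!a->has_disp
      && ((a->base
           && (!strcmp (a->base, "%rbp") || !strcmp (a->base, "%r13")))
          || (!a->base && a->index && a->scale != 1)))
    {
      a->disp = 0;
      a->has_disp = 1;
    }

  if (a->has_disp)
    printf ("%ld", a->disp);
  printf ("(%s", a->base ? a->base : "");
  if (a->index)
    printf (",%s,%d", a->index, a->scale);
  printf (")\n");
}

int
main (void)
{
  struct toy_address a = { "%rax", "%rcx", 4, 16, 1 };  /* 16(%rax,%rcx,4) */
  struct toy_address b = { "%rbp", NULL, 1, 0, 0 };     /* 0(%rbp)         */
  struct toy_address c = { NULL, "%rcx", 8, 0, 0 };     /* 0(,%rcx,8)      */
  print_att_address (&a);
  print_att_address (&b);
  print_att_address (&c);
  return 0;
}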
When a memopt with "pic_offset_table_rtx" - is not invariant itself it most likely means that base or index is not - invariant. Therefore only "pic_offset_table_rtx" could be hoisted out, - which is not profitable for x86. */ - if (parts.base - && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER) - && (current_pass->type == GIMPLE_PASS - || !pic_offset_table_rtx - || !REG_P (parts.base) - || REGNO (pic_offset_table_rtx) != REGNO (parts.base))) - cost++; + case CONST_INT: + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + break; - if (parts.index - && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER) - && (current_pass->type == GIMPLE_PASS - || !pic_offset_table_rtx - || !REG_P (parts.index) - || REGNO (pic_offset_table_rtx) != REGNO (parts.index))) - cost++; + case CONST: + /* This used to output parentheses around the expression, + but that does not work on the 386 (either ATT or BSD assembler). */ + output_pic_addr_const (file, XEXP (x, 0), code); + break; - /* AMD-K6 don't like addresses with ModR/M set to 00_xxx_100b, - since it's predecode logic can't detect the length of instructions - and it degenerates to vector decoded. Increase cost of such - addresses here. The penalty is minimally 2 cycles. It may be worthwhile - to split such addresses or even refuse such addresses at all. + case CONST_DOUBLE: + /* We can't handle floating point constants; + TARGET_PRINT_OPERAND must handle them. */ + output_operand_lossage ("floating constant misused"); + break; - Following addressing modes are affected: - [base+scale*index] - [scale*index+disp] - [base+index] + case PLUS: + /* Some assemblers need integer constants to appear first. */ + if (CONST_INT_P (XEXP (x, 0))) + { + output_pic_addr_const (file, XEXP (x, 0), code); + putc ('+', file); + output_pic_addr_const (file, XEXP (x, 1), code); + } + else + { + gcc_assert (CONST_INT_P (XEXP (x, 1))); + output_pic_addr_const (file, XEXP (x, 1), code); + putc ('+', file); + output_pic_addr_const (file, XEXP (x, 0), code); + } + break; - The first and last case may be avoidable by explicitly coding the zero in - memory address, but I don't have AMD-K6 machine handy to check this - theory. */ + case MINUS: + if (!TARGET_MACHO) + putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file); + output_pic_addr_const (file, XEXP (x, 0), code); + putc ('-', file); + output_pic_addr_const (file, XEXP (x, 1), code); + if (!TARGET_MACHO) + putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file); + break; - if (TARGET_K6 - && ((!parts.disp && parts.base && parts.index && parts.scale != 1) - || (parts.disp && !parts.base && parts.index && parts.scale != 1) - || (!parts.disp && parts.base && parts.index && parts.scale == 1))) - cost += 10; + case UNSPEC: + gcc_assert (XVECLEN (x, 0) == 1); + output_pic_addr_const (file, XVECEXP (x, 0, 0), code); + switch (XINT (x, 1)) + { + case UNSPEC_GOT: + fputs ("@GOT", file); + break; + case UNSPEC_GOTOFF: + fputs ("@GOTOFF", file); + break; + case UNSPEC_PLTOFF: + fputs ("@PLTOFF", file); + break; + case UNSPEC_PCREL: + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "(%rip)" : "[rip]", file); + break; + case UNSPEC_GOTPCREL: + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file); + break; + case UNSPEC_GOTTPOFF: + /* FIXME: This might be @TPOFF in Sun ld too. 
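The removed ix86_address_cost above prefers keeping a complex address over copying it into a new pseudo, but charges one extra unit for each register the address ties up (with exceptions for hard registers and the PIC register that this sketch ignores). A minimal sketch of that counting, plain C with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

struct toy_parts
{
  bool has_base;
  bool has_index;
};

/* Start at 1 and add 1 per register consumed, so base+index forms look
   more expensive than disp-only or single-register forms.  */
static int
toy_address_cost (const struct toy_parts *p)
{
  int cost = 1;
  if (p->has_base)
    cost++;
  if (p->has_index)
    cost++;
  return cost;
}

int
main (void)
{
  struct toy_parts disp_only = { false, false };
  struct toy_parts base_only = { true, false };
  struct toy_parts base_index = { true, true };
  printf ("disp: %d, base+disp: %d, base+index*scale+disp: %d\n",
          toy_address_cost (&disp_only),
          toy_address_cost (&base_only),
          toy_address_cost (&base_index));
  return 0;
}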
*/ + fputs ("@gottpoff", file); + break; + case UNSPEC_TPOFF: + fputs ("@tpoff", file); + break; + case UNSPEC_NTPOFF: + if (TARGET_64BIT) + fputs ("@tpoff", file); + else + fputs ("@ntpoff", file); + break; + case UNSPEC_DTPOFF: + fputs ("@dtpoff", file); + break; + case UNSPEC_GOTNTPOFF: + if (TARGET_64BIT) + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@gottpoff(%rip)": "@gottpoff[rip]", file); + else + fputs ("@gotntpoff", file); + break; + case UNSPEC_INDNTPOFF: + fputs ("@indntpoff", file); + break; +#if TARGET_MACHO + case UNSPEC_MACHOPIC_OFFSET: + putc ('-', file); + machopic_output_function_base_name (file); + break; +#endif + default: + output_operand_lossage ("invalid UNSPEC as operand"); + break; + } + break; - return cost; + default: + output_operand_lossage ("invalid expression as operand"); + } } - -/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as - this is used for to form addresses to local data when -fPIC is in - use. */ -static bool -darwin_local_data_pic (rtx disp) +/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL. + We need to emit DTP-relative relocations. */ + +static void ATTRIBUTE_UNUSED +i386_output_dwarf_dtprel (FILE *file, int size, rtx x) { - return (GET_CODE (disp) == UNSPEC - && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET); + fputs (ASM_LONG, file); + output_addr_const (file, x); + fputs ("@dtpoff", file); + switch (size) + { + case 4: + break; + case 8: + fputs (", 0", file); + break; + default: + gcc_unreachable (); + } } -/* True if operand X should be loaded from GOT. */ +/* Return true if X is a representation of the PIC register. This copes + with calls from ix86_find_base_term, where the register might have + been replaced by a cselib value. */ -bool -ix86_force_load_from_GOT_p (rtx x) +static bool +ix86_pic_register_p (rtx x) { - return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X) - && !TARGET_PECOFF && !TARGET_MACHO - && !flag_pic - && ix86_cmodel != CM_LARGE - && GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_FUNCTION_P (x) - && (!flag_plt - || (SYMBOL_REF_DECL (x) - && lookup_attribute ("noplt", - DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))) - && !SYMBOL_REF_LOCAL_P (x)); + if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x)) + return (pic_offset_table_rtx + && rtx_equal_for_cselib_p (x, pic_offset_table_rtx)); + else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SET_GOT) + return true; + else if (!REG_P (x)) + return false; + else if (pic_offset_table_rtx) + { + if (REGNO (x) == REGNO (pic_offset_table_rtx)) + return true; + if (HARD_REGISTER_P (x) + && !HARD_REGISTER_P (pic_offset_table_rtx) + && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx)) + return true; + return false; + } + else + return REGNO (x) == PIC_OFFSET_TABLE_REGNUM; } -/* Determine if a given RTX is a valid constant. We already know this - satisfies CONSTANT_P. */ +/* Helper function for ix86_delegitimize_address. + Attempt to delegitimize TLS local-exec accesses. 
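The UNSPEC switch in output_pic_addr_const above boils down to choosing the relocation suffix the assembler expects after the symbol name. A standalone table-style sketch of a few of those cases (hypothetical enum; only the suffix strings are taken from the hunk):

#include <stdio.h>

enum toy_reloc
{
  TOY_GOT,        /* @GOT                                 */
  TOY_GOTOFF,     /* @GOTOFF                              */
  TOY_PLTOFF,     /* @PLTOFF                              */
  TOY_GOTPCREL,   /* @GOTPCREL(%rip) in AT&T syntax       */
  TOY_NTPOFF      /* @tpoff on 64-bit, @ntpoff on 32-bit  */
};

static const char *
toy_reloc_suffix (enum toy_reloc r, int is_64bit)
{
  switch (r)
    {
    case TOY_GOT:      return "@GOT";
    case TOY_GOTOFF:   return "@GOTOFF";
    case TOY_PLTOFF:   return "@PLTOFF";
    case TOY_GOTPCREL: return "@GOTPCREL(%rip)";
    case TOY_NTPOFF:   return is_64bit ? "@tpoff" : "@ntpoff";
    }
  return "";
}

int
main (void)
{
  printf ("foo%s\n", toy_reloc_suffix (TOY_GOTOFF, 0));
  printf ("bar%s\n", toy_reloc_suffix (TOY_NTPOFF, 1));
  return 0;
}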
*/ -static bool -ix86_legitimate_constant_p (machine_mode mode, rtx x) +static rtx +ix86_delegitimize_tls_address (rtx orig_x) { - switch (GET_CODE (x)) - { - case CONST: - x = XEXP (x, 0); - - if (GET_CODE (x) == PLUS) - { - if (!CONST_INT_P (XEXP (x, 1))) - return false; - x = XEXP (x, 0); - } + rtx x = orig_x, unspec; + struct ix86_address addr; - if (TARGET_MACHO && darwin_local_data_pic (x)) - return true; + if (!TARGET_TLS_DIRECT_SEG_REFS) + return orig_x; + if (MEM_P (x)) + x = XEXP (x, 0); + if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode) + return orig_x; + if (ix86_decompose_address (x, &addr) == 0 + || addr.seg != DEFAULT_TLS_SEG_REG + || addr.disp == NULL_RTX + || GET_CODE (addr.disp) != CONST) + return orig_x; + unspec = XEXP (addr.disp, 0); + if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1))) + unspec = XEXP (unspec, 0); + if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF) + return orig_x; + x = XVECEXP (unspec, 0, 0); + gcc_assert (GET_CODE (x) == SYMBOL_REF); + if (unspec != XEXP (addr.disp, 0)) + x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1)); + if (addr.index) + { + rtx idx = addr.index; + if (addr.scale != 1) + idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale)); + x = gen_rtx_PLUS (Pmode, idx, x); + } + if (addr.base) + x = gen_rtx_PLUS (Pmode, addr.base, x); + if (MEM_P (orig_x)) + x = replace_equiv_address_nv (orig_x, x); + return x; +} - /* Only some unspecs are valid as "constants". */ - if (GET_CODE (x) == UNSPEC) - switch (XINT (x, 1)) - { - case UNSPEC_GOT: - case UNSPEC_GOTOFF: - case UNSPEC_PLTOFF: - return TARGET_64BIT; - case UNSPEC_TPOFF: - case UNSPEC_NTPOFF: - x = XVECEXP (x, 0, 0); - return (GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); - case UNSPEC_DTPOFF: - x = XVECEXP (x, 0, 0); - return (GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); - default: - return false; - } +/* In the name of slightly smaller debug output, and to cater to + general assembler lossage, recognize PIC+GOTOFF and turn it back + into a direct symbol reference. - /* We must have drilled down to a symbol. */ - if (GET_CODE (x) == LABEL_REF) - return true; - if (GET_CODE (x) != SYMBOL_REF) - return false; - /* FALLTHRU */ + On Darwin, this is necessary to avoid a crash, because Darwin + has a different PIC label for each routine but the DWARF debugging + information is not associated with any particular routine, so it's + necessary to remove references to the PIC label from RTL stored by + the DWARF output code. - case SYMBOL_REF: - /* TLS symbols are never valid. */ - if (SYMBOL_REF_TLS_MODEL (x)) - return false; + This helper is used in the normal ix86_delegitimize_address + entrypoint (e.g. used in the target delegitimization hook) and + in ix86_find_base_term. As compile time memory optimization, we + avoid allocating rtxes that will not change anything on the outcome + of the callers (find_base_value and find_base_term). */ - /* DLLIMPORT symbols are never valid. */ - if (TARGET_DLLIMPORT_DECL_ATTRIBUTES - && SYMBOL_REF_DLLIMPORT_P (x)) - return false; +static inline rtx +ix86_delegitimize_address_1 (rtx x, bool base_term_p) +{ + rtx orig_x = delegitimize_mem_from_attrs (x); + /* addend is NULL or some rtx if x is something+GOTOFF where + something doesn't include the PIC register. */ + rtx addend = NULL_RTX; + /* reg_addend is NULL or a multiple of some register. */ + rtx reg_addend = NULL_RTX; + /* const_addend is NULL or a const_int. 
*/ + rtx const_addend = NULL_RTX; + /* This is the result, or NULL. */ + rtx result = NULL_RTX; -#if TARGET_MACHO - /* mdynamic-no-pic */ - if (MACHO_DYNAMIC_NO_PIC_P) - return machopic_symbol_defined_p (x); -#endif + x = orig_x; - /* External function address should be loaded - via the GOT slot to avoid PLT. */ - if (ix86_force_load_from_GOT_p (x)) - return false; + if (MEM_P (x)) + x = XEXP (x, 0); - break; + if (TARGET_64BIT) + { + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_MODE (XEXP (x, 0)) == Pmode + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) + && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC + && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL) + { + /* find_base_{value,term} only care about MEMs with arg_pointer_rtx + base. A CONST can't be arg_pointer_rtx based. */ + if (base_term_p && MEM_P (orig_x)) + return orig_x; + rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0); + x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2); + if (MEM_P (orig_x)) + x = replace_equiv_address_nv (orig_x, x); + return x; + } - CASE_CONST_SCALAR_INT: - switch (mode) + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == UNSPEC + && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL + || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL) + && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)) { - case E_TImode: - if (TARGET_64BIT) - return true; - /* FALLTHRU */ - case E_OImode: - case E_XImode: - if (!standard_sse_constant_p (x, mode)) - return false; - default: - break; + x = XVECEXP (XEXP (x, 0), 0, 0); + if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x)) + { + x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x)); + if (x == NULL_RTX) + return orig_x; + } + return x; } - break; - case CONST_VECTOR: - if (!standard_sse_constant_p (x, mode)) - return false; + if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC) + return ix86_delegitimize_tls_address (orig_x); - default: - break; + /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic + and -mcmodel=medium -fpic. */ } - /* Otherwise we handle everything else in the move patterns. */ - return true; -} - -/* Determine if it's legal to put X into the constant pool. This - is not possible for the address of thread-local symbols, which - is checked above. */ + if (GET_CODE (x) != PLUS + || GET_CODE (XEXP (x, 1)) != CONST) + return ix86_delegitimize_tls_address (orig_x); -static bool -ix86_cannot_force_const_mem (machine_mode mode, rtx x) -{ - /* We can put any immediate constant in memory. 
*/ - switch (GET_CODE (x)) + if (ix86_pic_register_p (XEXP (x, 0))) + /* %ebx + GOT/GOTOFF */ + ; + else if (GET_CODE (XEXP (x, 0)) == PLUS) { - CASE_CONST_ANY: - return false; + /* %ebx + %reg * scale + GOT/GOTOFF */ + reg_addend = XEXP (x, 0); + if (ix86_pic_register_p (XEXP (reg_addend, 0))) + reg_addend = XEXP (reg_addend, 1); + else if (ix86_pic_register_p (XEXP (reg_addend, 1))) + reg_addend = XEXP (reg_addend, 0); + else + { + reg_addend = NULL_RTX; + addend = XEXP (x, 0); + } + } + else + addend = XEXP (x, 0); - default: - break; + x = XEXP (XEXP (x, 1), 0); + if (GET_CODE (x) == PLUS + && CONST_INT_P (XEXP (x, 1))) + { + const_addend = XEXP (x, 1); + x = XEXP (x, 0); } - return !ix86_legitimate_constant_p (mode, x); -} + if (GET_CODE (x) == UNSPEC + && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) + || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)) + || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC + && !MEM_P (orig_x) && !addend))) + result = XVECEXP (x, 0, 0); -/* Nonzero if the symbol is marked as dllimport, or as stub-variable, - otherwise zero. */ + if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x) + && !MEM_P (orig_x)) + result = XVECEXP (x, 0, 0); -static bool -is_imported_p (rtx x) -{ - if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES - || GET_CODE (x) != SYMBOL_REF) - return false; + if (! result) + return ix86_delegitimize_tls_address (orig_x); - return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x); + /* For (PLUS something CONST_INT) both find_base_{value,term} just + recurse on the first operand. */ + if (const_addend && !base_term_p) + result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); + if (reg_addend) + result = gen_rtx_PLUS (Pmode, reg_addend, result); + if (addend) + { + /* If the rest of original X doesn't involve the PIC register, add + addend and subtract pic_offset_table_rtx. This can happen e.g. + for code like: + leal (%ebx, %ecx, 4), %ecx + ... + movl foo@GOTOFF(%ecx), %edx + in which case we return (%ecx - %ebx) + foo + or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg + and reload has completed. Don't do the latter for debug, + as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */ + if (pic_offset_table_rtx + && (!reload_completed || !ix86_use_pseudo_pic_reg ())) + result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), + pic_offset_table_rtx), + result); + else if (base_term_p + && pic_offset_table_rtx + && !TARGET_MACHO + && !TARGET_VXWORKS_RTP) + { + rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); + tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); + result = gen_rtx_PLUS (Pmode, tmp, result); + } + else + return orig_x; + } + if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) + { + result = lowpart_subreg (GET_MODE (orig_x), result, Pmode); + if (result == NULL_RTX) + return orig_x; + } + return result; } +/* The normal instantiation of the above template. */ -/* Nonzero if the constant value X is a legitimate general operand - when generating PIC code. It is given that flag_pic is on and - that X satisfies CONSTANT_P. */ - -bool -legitimate_pic_operand_p (rtx x) +static rtx +ix86_delegitimize_address (rtx x) { - rtx inner; - - switch (GET_CODE (x)) - { - case CONST: - inner = XEXP (x, 0); - if (GET_CODE (inner) == PLUS - && CONST_INT_P (XEXP (inner, 1))) - inner = XEXP (inner, 0); - - /* Only some unspecs are valid as "constants". 
*/ - if (GET_CODE (inner) == UNSPEC) - switch (XINT (inner, 1)) - { - case UNSPEC_GOT: - case UNSPEC_GOTOFF: - case UNSPEC_PLTOFF: - return TARGET_64BIT; - case UNSPEC_TPOFF: - x = XVECEXP (inner, 0, 0); - return (GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); - case UNSPEC_MACHOPIC_OFFSET: - return legitimate_pic_address_disp_p (x); - default: - return false; - } - /* FALLTHRU */ - - case SYMBOL_REF: - case LABEL_REF: - return legitimate_pic_address_disp_p (x); - - default: - return true; - } + return ix86_delegitimize_address_1 (x, false); } -/* Determine if a given CONST RTX is a valid memory displacement - in PIC mode. */ +/* If X is a machine specific address (i.e. a symbol or label being + referenced as a displacement from the GOT implemented using an + UNSPEC), then return the base term. Otherwise return X. */ -bool -legitimate_pic_address_disp_p (rtx disp) +rtx +ix86_find_base_term (rtx x) { - bool saw_plus; + rtx term; - /* In 64bit mode we can allow direct addresses of symbols and labels - when they are not dynamic symbols. */ if (TARGET_64BIT) { - rtx op0 = disp, op1; - - switch (GET_CODE (disp)) - { - case LABEL_REF: - return true; + if (GET_CODE (x) != CONST) + return x; + term = XEXP (x, 0); + if (GET_CODE (term) == PLUS + && CONST_INT_P (XEXP (term, 1))) + term = XEXP (term, 0); + if (GET_CODE (term) != UNSPEC + || (XINT (term, 1) != UNSPEC_GOTPCREL + && XINT (term, 1) != UNSPEC_PCREL)) + return x; - case CONST: - if (GET_CODE (XEXP (disp, 0)) != PLUS) - break; - op0 = XEXP (XEXP (disp, 0), 0); - op1 = XEXP (XEXP (disp, 0), 1); - if (!CONST_INT_P (op1)) - break; - if (GET_CODE (op0) == UNSPEC - && (XINT (op0, 1) == UNSPEC_DTPOFF - || XINT (op0, 1) == UNSPEC_NTPOFF) - && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1)) - return true; - if (INTVAL (op1) >= 16*1024*1024 - || INTVAL (op1) < -16*1024*1024) - break; - if (GET_CODE (op0) == LABEL_REF) - return true; - if (GET_CODE (op0) == CONST - && GET_CODE (XEXP (op0, 0)) == UNSPEC - && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL) - return true; - if (GET_CODE (op0) == UNSPEC - && XINT (op0, 1) == UNSPEC_PCREL) - return true; - if (GET_CODE (op0) != SYMBOL_REF) - break; - /* FALLTHRU */ + return XVECEXP (term, 0, 0); + } - case SYMBOL_REF: - /* TLS references should always be enclosed in UNSPEC. - The dllimported symbol needs always to be resolved. */ - if (SYMBOL_REF_TLS_MODEL (op0) - || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0))) - return false; + return ix86_delegitimize_address_1 (x, true); +} - if (TARGET_PECOFF) - { - if (is_imported_p (op0)) - return true; +/* Return true if X shouldn't be emitted into the debug info. + Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_ + symbol easily into the .debug_info section, so we need not to + delegitimize, but instead assemble as @gotoff. + Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically + assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */ - if (SYMBOL_REF_FAR_ADDR_P (op0) - || !SYMBOL_REF_LOCAL_P (op0)) - break; +static bool +ix86_const_not_ok_for_debug_p (rtx x) +{ + if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF) + return true; - /* Function-symbols need to be resolved only for - large-model. - For the small-model we don't need to resolve anything - here. 
*/ - if ((ix86_cmodel != CM_LARGE_PIC - && SYMBOL_REF_FUNCTION_P (op0)) - || ix86_cmodel == CM_SMALL_PIC) - return true; - /* Non-external symbols don't need to be resolved for - large, and medium-model. */ - if ((ix86_cmodel == CM_LARGE_PIC - || ix86_cmodel == CM_MEDIUM_PIC) - && !SYMBOL_REF_EXTERNAL_P (op0)) - return true; - } - else if (!SYMBOL_REF_FAR_ADDR_P (op0) - && (SYMBOL_REF_LOCAL_P (op0) - || (HAVE_LD_PIE_COPYRELOC - && flag_pie - && !SYMBOL_REF_WEAK (op0) - && !SYMBOL_REF_FUNCTION_P (op0))) - && ix86_cmodel != CM_LARGE_PIC) - return true; - break; + if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0) + return true; - default: - break; - } - } - if (GET_CODE (disp) != CONST) - return false; - disp = XEXP (disp, 0); + return false; +} + +static void +put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, + bool fp, FILE *file) +{ + const char *suffix; - if (TARGET_64BIT) + if (mode == CCFPmode) { - /* We are unsafe to allow PLUS expressions. This limit allowed distance - of GOT tables. We should not need these anyway. */ - if (GET_CODE (disp) != UNSPEC - || (XINT (disp, 1) != UNSPEC_GOTPCREL - && XINT (disp, 1) != UNSPEC_GOTOFF - && XINT (disp, 1) != UNSPEC_PCREL - && XINT (disp, 1) != UNSPEC_PLTOFF)) - return false; - - if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF - && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF) - return false; - return true; + code = ix86_fp_compare_code_to_integer (code); + mode = CCmode; } + if (reverse) + code = reverse_condition (code); - saw_plus = false; - if (GET_CODE (disp) == PLUS) + switch (code) { - if (!CONST_INT_P (XEXP (disp, 1))) - return false; - disp = XEXP (disp, 0); - saw_plus = true; - } - - if (TARGET_MACHO && darwin_local_data_pic (disp)) - return true; + case EQ: + gcc_assert (mode != CCGZmode); + switch (mode) + { + case E_CCAmode: + suffix = "a"; + break; + case E_CCCmode: + suffix = "c"; + break; + case E_CCOmode: + suffix = "o"; + break; + case E_CCPmode: + suffix = "p"; + break; + case E_CCSmode: + suffix = "s"; + break; + default: + suffix = "e"; + break; + } + break; + case NE: + gcc_assert (mode != CCGZmode); + switch (mode) + { + case E_CCAmode: + suffix = "na"; + break; + case E_CCCmode: + suffix = "nc"; + break; + case E_CCOmode: + suffix = "no"; + break; + case E_CCPmode: + suffix = "np"; + break; + case E_CCSmode: + suffix = "ns"; + break; + default: + suffix = "ne"; + break; + } + break; + case GT: + gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode); + suffix = "g"; + break; + case GTU: + /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers. + Those same assemblers have the same but opposite lossage on cmov. */ + if (mode == CCmode) + suffix = fp ? "nbe" : "a"; + else + gcc_unreachable (); + break; + case LT: + switch (mode) + { + case E_CCNOmode: + case E_CCGOCmode: + suffix = "s"; + break; - if (GET_CODE (disp) != UNSPEC) - return false; + case E_CCmode: + case E_CCGCmode: + case E_CCGZmode: + suffix = "l"; + break; - switch (XINT (disp, 1)) - { - case UNSPEC_GOT: - if (saw_plus) - return false; - /* We need to check for both symbols and labels because VxWorks loads - text labels with @GOT rather than @GOTOFF. See gotoff_operand for - details. */ - return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF - || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); - case UNSPEC_GOTOFF: - /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. - While ABI specify also 32bit relocation but we don't produce it in - small PIC model at all. 
*/ - if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF - || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) - && !TARGET_64BIT) - return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode); - return false; - case UNSPEC_GOTTPOFF: - case UNSPEC_GOTNTPOFF: - case UNSPEC_INDNTPOFF: - if (saw_plus) - return false; - disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC); - case UNSPEC_NTPOFF: - disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC); - case UNSPEC_DTPOFF: - disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); - } + default: + gcc_unreachable (); + } + break; + case LTU: + if (mode == CCmode || mode == CCGZmode) + suffix = "b"; + else if (mode == CCCmode) + suffix = fp ? "b" : "c"; + else + gcc_unreachable (); + break; + case GE: + switch (mode) + { + case E_CCNOmode: + case E_CCGOCmode: + suffix = "ns"; + break; - return false; + case E_CCmode: + case E_CCGCmode: + case E_CCGZmode: + suffix = "ge"; + break; + + default: + gcc_unreachable (); + } + break; + case GEU: + if (mode == CCmode || mode == CCGZmode) + suffix = "nb"; + else if (mode == CCCmode) + suffix = fp ? "nb" : "nc"; + else + gcc_unreachable (); + break; + case LE: + gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode); + suffix = "le"; + break; + case LEU: + if (mode == CCmode) + suffix = "be"; + else + gcc_unreachable (); + break; + case UNORDERED: + suffix = fp ? "u" : "p"; + break; + case ORDERED: + suffix = fp ? "nu" : "np"; + break; + default: + gcc_unreachable (); + } + fputs (suffix, file); } -/* Determine if op is suitable RTX for an address register. - Return naked register if a register or a register subreg is - found, otherwise return NULL_RTX. */ +/* Print the name of register X to FILE based on its machine mode and number. + If CODE is 'w', pretend the mode is HImode. + If CODE is 'b', pretend the mode is QImode. + If CODE is 'k', pretend the mode is SImode. + If CODE is 'q', pretend the mode is DImode. + If CODE is 'x', pretend the mode is V4SFmode. + If CODE is 't', pretend the mode is V8SFmode. + If CODE is 'g', pretend the mode is V16SFmode. + If CODE is 'h', pretend the reg is the 'high' byte register. + If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. + If CODE is 'd', duplicate the operand for AVX instruction. + If CODE is 'V', print naked full integer register name without %. + */ -static rtx -ix86_validate_address_register (rtx op) +void +print_reg (rtx x, int code, FILE *file) { - machine_mode mode = GET_MODE (op); + const char *reg; + int msize; + unsigned int regno; + bool duplicated; - /* Only SImode or DImode registers can form the address. 
*/ - if (mode != SImode && mode != DImode) - return NULL_RTX; + if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V') + putc ('%', file); - if (REG_P (op)) - return op; - else if (SUBREG_P (op)) + if (x == pc_rtx) { - rtx reg = SUBREG_REG (op); + gcc_assert (TARGET_64BIT); + fputs ("rip", file); + return; + } - if (!REG_P (reg)) - return NULL_RTX; + if (code == 'y' && STACK_TOP_P (x)) + { + fputs ("st(0)", file); + return; + } - mode = GET_MODE (reg); + if (code == 'w') + msize = 2; + else if (code == 'b') + msize = 1; + else if (code == 'k') + msize = 4; + else if (code == 'q') + msize = 8; + else if (code == 'h') + msize = 0; + else if (code == 'x') + msize = 16; + else if (code == 't') + msize = 32; + else if (code == 'g') + msize = 64; + else + msize = GET_MODE_SIZE (GET_MODE (x)); - /* Don't allow SUBREGs that span more than a word. It can - lead to spill failures when the register is one word out - of a two word structure. */ - if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - return NULL_RTX; + regno = REGNO (x); - /* Allow only SUBREGs of non-eliminable hard registers. */ - if (register_no_elim_operand (reg, mode)) - return reg; + if (regno == ARG_POINTER_REGNUM + || regno == FRAME_POINTER_REGNUM + || regno == FPSR_REG) + { + output_operand_lossage + ("invalid use of register '%s'", reg_names[regno]); + return; + } + else if (regno == FLAGS_REG) + { + output_operand_lossage ("invalid use of asm flag output"); + return; } - /* Op is not a register. */ - return NULL_RTX; -} - -/* Recognizes RTL expressions that are valid memory addresses for an - instruction. The MODE argument is the machine mode for the MEM - expression that wants to use this address. - - It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should - convert common non-canonical forms to canonical form so that they will - be recognized. */ - -static bool -ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) -{ - struct ix86_address parts; - rtx base, index, disp; - HOST_WIDE_INT scale; - addr_space_t seg; - - if (ix86_decompose_address (addr, &parts) <= 0) - /* Decomposition failed. */ - return false; + if (code == 'V') + { + if (GENERAL_REGNO_P (regno)) + msize = GET_MODE_SIZE (word_mode); + else + error ("% modifier on non-integer register"); + } - base = parts.base; - index = parts.index; - disp = parts.disp; - scale = parts.scale; - seg = parts.seg; + duplicated = code == 'd' && TARGET_AVX; - /* Validate base register. */ - if (base) + switch (msize) { - rtx reg = ix86_validate_address_register (base); + case 16: + case 12: + case 8: + if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode)) + warning (0, "unsupported size for integer register"); + /* FALLTHRU */ + case 4: + if (LEGACY_INT_REGNO_P (regno)) + putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file); + /* FALLTHRU */ + case 2: + normal: + reg = hi_reg_name[regno]; + break; + case 1: + if (regno >= ARRAY_SIZE (qi_reg_name)) + goto normal; + if (!ANY_QI_REGNO_P (regno)) + error ("unsupported size for integer register"); + reg = qi_reg_name[regno]; + break; + case 0: + if (regno >= ARRAY_SIZE (qi_high_reg_name)) + goto normal; + reg = qi_high_reg_name[regno]; + break; + case 32: + case 64: + if (SSE_REGNO_P (regno)) + { + gcc_assert (!duplicated); + putc (msize == 32 ? 'y' : 'z', file); + reg = hi_reg_name[regno] + 1; + break; + } + goto normal; + default: + gcc_unreachable (); + } - if (reg == NULL_RTX) - return false; + fputs (reg, file); - if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg)) - || (! strict && ! 
REG_OK_FOR_BASE_NONSTRICT_P (reg))) - /* Base is not valid. */ - return false; + /* Irritatingly, AMD extended registers use + different naming convention: "r%d[bwd]" */ + if (REX_INT_REGNO_P (regno)) + { + gcc_assert (TARGET_64BIT); + switch (msize) + { + case 0: + error ("extended registers have no high halves"); + break; + case 1: + putc ('b', file); + break; + case 2: + putc ('w', file); + break; + case 4: + putc ('d', file); + break; + case 8: + /* no suffix */ + break; + default: + error ("unsupported operand size for extended register"); + break; + } + return; } - /* Validate index register. */ - if (index) + if (duplicated) { - rtx reg = ix86_validate_address_register (index); - - if (reg == NULL_RTX) - return false; - - if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg)) - || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg))) - /* Index is not valid. */ - return false; + if (ASSEMBLER_DIALECT == ASM_ATT) + fprintf (file, ", %%%s", reg); + else + fprintf (file, ", %s", reg); } +} - /* Index and base should have the same mode. */ - if (base && index - && GET_MODE (base) != GET_MODE (index)) - return false; - - /* Address override works only on the (%reg) part of %fs:(%reg). */ - if (seg != ADDR_SPACE_GENERIC - && ((base && GET_MODE (base) != word_mode) - || (index && GET_MODE (index) != word_mode))) - return false; +/* Meaning of CODE: + L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. + C -- print opcode suffix for set/cmov insn. + c -- like C, but print reversed condition + F,f -- likewise, but for floating-point. + O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", + otherwise nothing + R -- print embedded rounding and sae. + r -- print only sae. + z -- print the opcode suffix for the size of the current operand. + Z -- likewise, with special suffixes for x87 instructions. + * -- print a star (in certain assembler syntax) + A -- print an absolute memory reference. + E -- print address with DImode register names if TARGET_64BIT. + w -- print the operand as if it's a "word" (HImode) even if it isn't. + s -- print a shift double count, followed by the assemblers argument + delimiter. + b -- print the QImode name of the register for the indicated operand. + %b0 would print %al if operands[0] is reg 0. + w -- likewise, print the HImode name of the register. + k -- likewise, print the SImode name of the register. + q -- likewise, print the DImode name of the register. + x -- likewise, print the V4SFmode name of the register. + t -- likewise, print the V8SFmode name of the register. + g -- likewise, print the V16SFmode name of the register. + h -- print the QImode name for a "high" register, either ah, bh, ch or dh. + y -- print "st(0)" instead of "st" as a register. + d -- print duplicated register operand for AVX instruction. + D -- print condition for SSE cmp instruction. + P -- if PIC, print an @PLT suffix. + p -- print raw symbol name. + X -- don't print any sort of PIC '@' suffix for a symbol. + & -- print some in-use local-dynamic symbol name. + H -- print a memory address offset by 8; used for sse high-parts + Y -- print condition for XOP pcom* instruction. + V -- print naked full integer register name without %. + + -- print a branch hint as 'cs' or 'ds' prefix + ; -- print a semicolon (after prefixes due to bug in older gas). + ~ -- print "i" if TARGET_AVX2, "f" otherwise. + ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode + M -- print addr32 prefix for TARGET_X32 with VSIB address. + ! 
-- print NOTRACK prefix for jxx/call/ret instructions if required. + */ - /* Validate scale factor. */ - if (scale != 1) +void +ix86_print_operand (FILE *file, rtx x, int code) +{ + if (code) { - if (!index) - /* Scale without index. */ - return false; + switch (code) + { + case 'A': + switch (ASSEMBLER_DIALECT) + { + case ASM_ATT: + putc ('*', file); + break; - if (scale != 2 && scale != 4 && scale != 8) - /* Scale is not a valid multiplier. */ - return false; - } + case ASM_INTEL: + /* Intel syntax. For absolute addresses, registers should not + be surrounded by braces. */ + if (!REG_P (x)) + { + putc ('[', file); + ix86_print_operand (file, x, 0); + putc (']', file); + return; + } + break; - /* Validate displacement. */ - if (disp) - { - if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == UNSPEC - && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET) - switch (XINT (XEXP (disp, 0), 1)) - { - /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit - when used. While ABI specify also 32bit relocations, we - don't produce them at all and use IP relative instead. - Allow GOT in 32bit mode for both PIC and non-PIC if symbol - should be loaded via GOT. */ - case UNSPEC_GOT: - if (!TARGET_64BIT - && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) - goto is_legitimate_pic; - /* FALLTHRU */ - case UNSPEC_GOTOFF: - gcc_assert (flag_pic); - if (!TARGET_64BIT) - goto is_legitimate_pic; + default: + gcc_unreachable (); + } - /* 64bit address unspec. */ - return false; + ix86_print_operand (file, x, 0); + return; - case UNSPEC_GOTPCREL: - if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) - goto is_legitimate_pic; - /* FALLTHRU */ - case UNSPEC_PCREL: - gcc_assert (flag_pic); - goto is_legitimate_pic; + case 'E': + /* Wrap address in an UNSPEC to declare special handling. */ + if (TARGET_64BIT) + x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR); - case UNSPEC_GOTTPOFF: - case UNSPEC_GOTNTPOFF: - case UNSPEC_INDNTPOFF: - case UNSPEC_NTPOFF: - case UNSPEC_DTPOFF: - break; - - default: - /* Invalid address unspec. */ - return false; - } + output_address (VOIDmode, x); + return; - else if (SYMBOLIC_CONST (disp) - && (flag_pic - || (TARGET_MACHO -#if TARGET_MACHO - && MACHOPIC_INDIRECT - && !machopic_operand_p (disp) -#endif - ))) - { + case 'L': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('l', file); + return; - is_legitimate_pic: - if (TARGET_64BIT && (index || base)) - { - /* foo@dtpoff(%rX) is ok. */ - if (GET_CODE (disp) != CONST - || GET_CODE (XEXP (disp, 0)) != PLUS - || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC - || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) - || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF - && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) - /* Non-constant pic memory reference. */ - return false; - } - else if ((!TARGET_MACHO || flag_pic) - && ! legitimate_pic_address_disp_p (disp)) - /* Displacement is an invalid pic construct. */ - return false; -#if TARGET_MACHO - else if (MACHO_DYNAMIC_NO_PIC_P - && !ix86_legitimate_constant_p (Pmode, disp)) - /* displacment must be referenced via non_lazy_pointer */ - return false; -#endif + case 'W': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('w', file); + return; - /* This code used to verify that a symbolic pic displacement - includes the pic_offset_table_rtx register. 
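Most of the single-letter operand codes documented above simply force a particular operand width before print_reg picks the register name. A sketch of that letter-to-width convention (plain C, hypothetical function; the widths are the ones listed in the comment):

#include <stdio.h>

/* 'b' = QImode, 'w' = HImode, 'k' = SImode, 'q' = DImode,
   'x'/'t'/'g' = the 16-, 32- and 64-byte vector register widths.  */
static int
toy_code_to_size (int code)
{
  switch (code)
    {
    case 'b': return 1;
    case 'w': return 2;
    case 'k': return 4;
    case 'q': return 8;
    case 'x': return 16;
    case 't': return 32;
    case 'g': return 64;
    default:  return -1;   /* width comes from the operand's own mode */
    }
}

int
main (void)
{
  const char *codes = "bwkqxtg";
  for (const char *c = codes; *c; c++)
    printf ("%%%c0 selects a %d-byte operand\n", *c, toy_code_to_size (*c));
  return 0;
}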
+ case 'B': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('b', file); + return; - While this is good idea, unfortunately these constructs may - be created by "adds using lea" optimization for incorrect - code like: + case 'Q': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('l', file); + return; - int a; - int foo(int i) - { - return *(&a+i); - } + case 'S': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('s', file); + return; - This code is nonsensical, but results in addressing - GOT table with pic_offset_table_rtx base. We can't - just refuse it easily, since it gets matched by - "addsi3" pattern, that later gets split to lea in the - case output register differs from input. While this - can be handled by separate addsi pattern for this case - that never results in lea, this seems to be easier and - correct fix for crash to disable this test. */ - } - else if (GET_CODE (disp) != LABEL_REF - && !CONST_INT_P (disp) - && (GET_CODE (disp) != CONST - || !ix86_legitimate_constant_p (Pmode, disp)) - && (GET_CODE (disp) != SYMBOL_REF - || !ix86_legitimate_constant_p (Pmode, disp))) - /* Displacement is not constant. */ - return false; - else if (TARGET_64BIT - && !x86_64_immediate_operand (disp, VOIDmode)) - /* Displacement is out of range. */ - return false; - /* In x32 mode, constant addresses are sign extended to 64bit, so - we have to prevent addresses from 0x80000000 to 0xffffffff. */ - else if (TARGET_X32 && !(index || base) - && CONST_INT_P (disp) - && val_signbit_known_set_p (SImode, INTVAL (disp))) - return false; - } + case 'T': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('t', file); + return; - /* Everything looks valid. */ - return true; -} + case 'O': +#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX + if (ASSEMBLER_DIALECT != ASM_ATT) + return; -/* Determine if a given RTX is a valid constant address. */ + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 2: + putc ('w', file); + break; + + case 4: + putc ('l', file); + break; -bool -constant_address_p (rtx x) -{ - return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1); -} - -/* Return a unique alias set for the GOT. */ + case 8: + putc ('q', file); + break; -static alias_set_type -ix86_GOT_alias_set (void) -{ - static alias_set_type set = -1; - if (set == -1) - set = new_alias_set (); - return set; -} + default: + output_operand_lossage ("invalid operand size for operand " + "code 'O'"); + return; + } -/* Return a legitimate reference for ORIG (an address) using the - register REG. If REG is 0, a new pseudo is generated. + putc ('.', file); +#endif + return; - There are two types of references that must be handled: + case 'z': + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) + { + /* Opcodes don't get size suffixes if using Intel opcodes. */ + if (ASSEMBLER_DIALECT == ASM_INTEL) + return; - 1. Global data references must load the address from the GOT, via - the PIC reg. An insn is emitted to do this load, and the reg is - returned. + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 1: + putc ('b', file); + return; - 2. Static data references, constant pool addresses, and code labels - compute the address as an offset from the GOT, whose base is in - the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to - differentiate them from global data objects. The returned - address is the PIC reg + an unspec constant. + case 2: + putc ('w', file); + return; - TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC - reg also appears in the address. 
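The 'z' case above derives the AT&T mnemonic suffix from the operand's size: b, w, l or q for 1-, 2-, 4- and 8-byte integer operands. The same rule in a standalone sketch (hypothetical name):

#include <stdio.h>

static char
toy_att_suffix (int size_in_bytes)
{
  switch (size_in_bytes)
    {
    case 1:  return 'b';   /* byte */
    case 2:  return 'w';   /* word */
    case 4:  return 'l';   /* long */
    case 8:  return 'q';   /* quad */
    default: return '?';   /* the real code reports an operand lossage */
    }
}

int
main (void)
{
  for (int size = 1; size <= 8; size *= 2)
    printf ("%d-byte integer operand -> mov%c\n", size, toy_att_suffix (size));
  return 0;
}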
*/ + case 4: + putc ('l', file); + return; -static rtx -legitimize_pic_address (rtx orig, rtx reg) -{ - rtx addr = orig; - rtx new_rtx = orig; + case 8: + putc ('q', file); + return; -#if TARGET_MACHO - if (TARGET_MACHO && !TARGET_64BIT) - { - if (reg == 0) - reg = gen_reg_rtx (Pmode); - /* Use the generic Mach-O PIC machinery. */ - return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); - } -#endif + default: + output_operand_lossage ("invalid operand size for operand " + "code 'z'"); + return; + } + } - if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES) - { - rtx tmp = legitimize_pe_coff_symbol (addr, true); - if (tmp) - return tmp; - } + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) + warning (0, "non-integer operand used with operand code %"); + /* FALLTHRU */ - if (TARGET_64BIT && legitimate_pic_address_disp_p (addr)) - new_rtx = addr; - else if ((!TARGET_64BIT - || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC) - && !TARGET_PECOFF - && gotoff_operand (addr, Pmode)) - { - /* This symbol may be referenced via a displacement - from the PIC base address (@GOTOFF). */ - if (GET_CODE (addr) == CONST) - addr = XEXP (addr, 0); + case 'Z': + /* 387 opcodes don't get size suffixes if using Intel opcodes. */ + if (ASSEMBLER_DIALECT == ASM_INTEL) + return; - if (GET_CODE (addr) == PLUS) - { - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), - UNSPEC_GOTOFF); - new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1)); - } - else - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF); + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) + { + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 2: +#ifdef HAVE_AS_IX86_FILDS + putc ('s', file); +#endif + return; - new_rtx = gen_rtx_CONST (Pmode, new_rtx); + case 4: + putc ('l', file); + return; - if (TARGET_64BIT) - new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); + case 8: +#ifdef HAVE_AS_IX86_FILDQ + putc ('q', file); +#else + fputs ("ll", file); +#endif + return; - if (reg != 0) - { - gcc_assert (REG_P (reg)); - new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx, - new_rtx, reg, 1, OPTAB_DIRECT); - } - else - new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); - } - else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) - /* We can't use @GOTOFF for text labels - on VxWorks, see gotoff_operand. */ - || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) - { - rtx tmp = legitimize_pe_coff_symbol (addr, true); - if (tmp) - return tmp; - - /* For x64 PE-COFF there is no GOT table, - so we use address directly. */ - if (TARGET_64BIT && TARGET_PECOFF) - { - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); - } - else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) - { - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), - UNSPEC_GOTPCREL); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); - new_rtx = gen_const_mem (Pmode, new_rtx); - set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); - } - else - { - /* This symbol must be referenced via a load - from the Global Offset Table (@GOT). 
*/ - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); - if (TARGET_64BIT) - new_rtx = force_reg (Pmode, new_rtx); - new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); - new_rtx = gen_const_mem (Pmode, new_rtx); - set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); - } - - new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); - } - else - { - if (CONST_INT_P (addr) - && !x86_64_immediate_operand (addr, VOIDmode)) - new_rtx = copy_to_suggested_reg (addr, reg, Pmode); - else if (GET_CODE (addr) == CONST) - { - addr = XEXP (addr, 0); - - /* We must match stuff we generate before. Assume the only - unspecs that can get here are ours. Not that we could do - anything with them anyway.... */ - if (GET_CODE (addr) == UNSPEC - || (GET_CODE (addr) == PLUS - && GET_CODE (XEXP (addr, 0)) == UNSPEC)) - return orig; - gcc_assert (GET_CODE (addr) == PLUS); - } - - if (GET_CODE (addr) == PLUS) - { - rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1); - - /* Check first to see if this is a constant - offset from a @GOTOFF symbol reference. */ - if (!TARGET_PECOFF - && gotoff_operand (op0, Pmode) - && CONST_INT_P (op1)) + default: + break; + } + } + else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) { - if (!TARGET_64BIT) - { - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0), - UNSPEC_GOTOFF); - new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); + /* 387 opcodes don't get size suffixes + if the operands are registers. */ + if (STACK_REG_P (x)) + return; - if (reg != 0) - { - gcc_assert (REG_P (reg)); - new_rtx = expand_simple_binop (Pmode, PLUS, - pic_offset_table_rtx, - new_rtx, reg, 1, - OPTAB_DIRECT); - } - else - new_rtx - = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); - } - else + switch (GET_MODE_SIZE (GET_MODE (x))) { - if (INTVAL (op1) < -16*1024*1024 - || INTVAL (op1) >= 16*1024*1024) - { - if (!x86_64_immediate_operand (op1, Pmode)) - op1 = force_reg (Pmode, op1); + case 4: + putc ('s', file); + return; - new_rtx - = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1); - } + case 8: + putc ('l', file); + return; + + case 12: + case 16: + putc ('t', file); + return; + + default: + break; } } else { - rtx base = legitimize_pic_address (op0, reg); - machine_mode mode = GET_MODE (base); - new_rtx - = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg); - - if (CONST_INT_P (new_rtx)) - { - if (INTVAL (new_rtx) < -16*1024*1024 - || INTVAL (new_rtx) >= 16*1024*1024) - { - if (!x86_64_immediate_operand (new_rtx, mode)) - new_rtx = force_reg (mode, new_rtx); - - new_rtx - = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx); - } - else - new_rtx = plus_constant (mode, base, INTVAL (new_rtx)); - } - else - { - /* For %rip addressing, we have to use - just disp32, not base nor index. */ - if (TARGET_64BIT - && (GET_CODE (base) == SYMBOL_REF - || GET_CODE (base) == LABEL_REF)) - base = force_reg (mode, base); - if (GET_CODE (new_rtx) == PLUS - && CONSTANT_P (XEXP (new_rtx, 1))) - { - base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0)); - new_rtx = XEXP (new_rtx, 1); - } - new_rtx = gen_rtx_PLUS (mode, base, new_rtx); - } + output_operand_lossage ("invalid operand type used with " + "operand code 'Z'"); + return; } - } - } - return new_rtx; -} - -/* Load the thread pointer. If TO_REG is true, force it into a register. 
*/ - -static rtx -get_thread_pointer (machine_mode tp_mode, bool to_reg) -{ - rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP); - - if (GET_MODE (tp) != tp_mode) - { - gcc_assert (GET_MODE (tp) == SImode); - gcc_assert (tp_mode == DImode); - - tp = gen_rtx_ZERO_EXTEND (tp_mode, tp); - } - - if (to_reg) - tp = copy_to_mode_reg (tp_mode, tp); - - return tp; -} - -/* Construct the SYMBOL_REF for the tls_get_addr function. */ - -static GTY(()) rtx ix86_tls_symbol; - -static rtx -ix86_tls_get_addr (void) -{ - if (!ix86_tls_symbol) - { - const char *sym - = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT) - ? "___tls_get_addr" : "__tls_get_addr"); - - ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym); - } - - if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF) - { - rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol), - UNSPEC_PLTOFF); - return gen_rtx_PLUS (Pmode, pic_offset_table_rtx, - gen_rtx_CONST (Pmode, unspec)); - } - - return ix86_tls_symbol; -} - -/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */ -static GTY(()) rtx ix86_tls_module_base_symbol; - -rtx -ix86_tls_module_base (void) -{ - if (!ix86_tls_module_base_symbol) - { - ix86_tls_module_base_symbol - = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_"); + output_operand_lossage ("invalid operand size for operand code 'Z'"); + return; - SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol) - |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT; - } + case 'd': + case 'b': + case 'w': + case 'k': + case 'q': + case 'h': + case 't': + case 'g': + case 'y': + case 'x': + case 'X': + case 'P': + case 'p': + case 'V': + break; - return ix86_tls_module_base_symbol; -} + case 's': + if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) + { + ix86_print_operand (file, x, 0); + fputs (", ", file); + } + return; -/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is - false if we expect this to be used for a memory address and true if - we expect to load the address into a register. */ - -static rtx -legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) -{ - rtx dest, base, off; - rtx pic = NULL_RTX, tp = NULL_RTX; - machine_mode tp_mode = Pmode; - int type; + case 'Y': + switch (GET_CODE (x)) + { + case NE: + fputs ("neq", file); + break; + case EQ: + fputs ("eq", file); + break; + case GE: + case GEU: + fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file); + break; + case GT: + case GTU: + fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file); + break; + case LE: + case LEU: + fputs ("le", file); + break; + case LT: + case LTU: + fputs ("lt", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case ORDERED: + fputs ("ord", file); + break; + case UNEQ: + fputs ("ueq", file); + break; + case UNGE: + fputs ("nlt", file); + break; + case UNGT: + fputs ("nle", file); + break; + case UNLE: + fputs ("ule", file); + break; + case UNLT: + fputs ("ult", file); + break; + case LTGT: + fputs ("une", file); + break; + default: + output_operand_lossage ("operand is not a condition code, " + "invalid operand code 'Y'"); + return; + } + return; - /* Fall back to global dynamic model if tool chain cannot support local - dynamic. */ - if (TARGET_SUN_TLS && !TARGET_64BIT - && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM - && model == TLS_MODEL_LOCAL_DYNAMIC) - model = TLS_MODEL_GLOBAL_DYNAMIC; + case 'D': + /* Little bit of braindamage here. The SSE compare instructions + does use completely different names for the comparisons that the + fp conditional moves. 
*/ + switch (GET_CODE (x)) + { + case UNEQ: + if (TARGET_AVX) + { + fputs ("eq_us", file); + break; + } + /* FALLTHRU */ + case EQ: + fputs ("eq", file); + break; + case UNLT: + if (TARGET_AVX) + { + fputs ("nge", file); + break; + } + /* FALLTHRU */ + case LT: + fputs ("lt", file); + break; + case UNLE: + if (TARGET_AVX) + { + fputs ("ngt", file); + break; + } + /* FALLTHRU */ + case LE: + fputs ("le", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case LTGT: + if (TARGET_AVX) + { + fputs ("neq_oq", file); + break; + } + /* FALLTHRU */ + case NE: + fputs ("neq", file); + break; + case GE: + if (TARGET_AVX) + { + fputs ("ge", file); + break; + } + /* FALLTHRU */ + case UNGE: + fputs ("nlt", file); + break; + case GT: + if (TARGET_AVX) + { + fputs ("gt", file); + break; + } + /* FALLTHRU */ + case UNGT: + fputs ("nle", file); + break; + case ORDERED: + fputs ("ord", file); + break; + default: + output_operand_lossage ("operand is not a condition code, " + "invalid operand code 'D'"); + return; + } + return; - switch (model) - { - case TLS_MODEL_GLOBAL_DYNAMIC: - dest = gen_reg_rtx (Pmode); + case 'F': + case 'f': +#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('.', file); + gcc_fallthrough (); +#endif - if (!TARGET_64BIT) - { - if (flag_pic && !TARGET_PECOFF) - pic = pic_offset_table_rtx; - else + case 'C': + case 'c': + if (!COMPARISON_P (x)) { - pic = gen_reg_rtx (Pmode); - emit_insn (gen_set_got (pic)); + output_operand_lossage ("operand is not a condition code, " + "invalid operand code '%c'", code); + return; } - } + put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), + code == 'c' || code == 'f', + code == 'F' || code == 'f', + file); + return; - if (TARGET_GNU2_TLS) - { - if (TARGET_64BIT) - emit_insn (gen_tls_dynamic_gnu2_64 (dest, x)); - else - emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic)); + case 'H': + if (!offsettable_memref_p (x)) + { + output_operand_lossage ("operand is not an offsettable memory " + "reference, invalid operand code 'H'"); + return; + } + /* It doesn't actually matter what mode we use here, as we're + only going to use this for printing. */ + x = adjust_address_nv (x, DImode, 8); + /* Output 'qword ptr' for intel assembler dialect. */ + if (ASSEMBLER_DIALECT == ASM_INTEL) + code = 'q'; + break; - tp = get_thread_pointer (Pmode, true); - dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest)); + case 'K': + if (!CONST_INT_P (x)) + { + output_operand_lossage ("operand is not an integer, invalid " + "operand code 'K'"); + return; + } - if (GET_MODE (x) != Pmode) - x = gen_rtx_ZERO_EXTEND (Pmode, x); + if (INTVAL (x) & IX86_HLE_ACQUIRE) +#ifdef HAVE_AS_IX86_HLE + fputs ("xacquire ", file); +#else + fputs ("\n" ASM_BYTE "0xf2\n\t", file); +#endif + else if (INTVAL (x) & IX86_HLE_RELEASE) +#ifdef HAVE_AS_IX86_HLE + fputs ("xrelease ", file); +#else + fputs ("\n" ASM_BYTE "0xf3\n\t", file); +#endif + /* We do not want to print value of the operand. 
*/ + return; - set_unique_reg_note (get_last_insn (), REG_EQUAL, x); - } - else - { - rtx caddr = ix86_tls_get_addr (); + case 'N': + if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) + fputs ("{z}", file); + return; - if (TARGET_64BIT) + case 'r': + if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE) { - rtx rax = gen_rtx_REG (Pmode, AX_REG); - rtx_insn *insns; + output_operand_lossage ("operand is not a specific integer, " + "invalid operand code 'r'"); + return; + } - start_sequence (); - emit_call_insn - (ix86_gen_tls_global_dynamic_64 (rax, x, caddr)); - insns = get_insns (); - end_sequence (); + if (ASSEMBLER_DIALECT == ASM_INTEL) + fputs (", ", file); - if (GET_MODE (x) != Pmode) - x = gen_rtx_ZERO_EXTEND (Pmode, x); + fputs ("{sae}", file); - RTL_CONST_CALL_P (insns) = 1; - emit_libcall_block (insns, dest, rax, x); - } - else - emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr)); - } - break; + if (ASSEMBLER_DIALECT == ASM_ATT) + fputs (", ", file); - case TLS_MODEL_LOCAL_DYNAMIC: - base = gen_reg_rtx (Pmode); + return; - if (!TARGET_64BIT) - { - if (flag_pic) - pic = pic_offset_table_rtx; - else + case 'R': + if (!CONST_INT_P (x)) { - pic = gen_reg_rtx (Pmode); - emit_insn (gen_set_got (pic)); + output_operand_lossage ("operand is not an integer, invalid " + "operand code 'R'"); + return; } - } - if (TARGET_GNU2_TLS) - { - rtx tmp = ix86_tls_module_base (); + if (ASSEMBLER_DIALECT == ASM_INTEL) + fputs (", ", file); - if (TARGET_64BIT) - emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp)); - else - emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic)); + switch (INTVAL (x)) + { + case ROUND_NEAREST_INT | ROUND_SAE: + fputs ("{rn-sae}", file); + break; + case ROUND_NEG_INF | ROUND_SAE: + fputs ("{rd-sae}", file); + break; + case ROUND_POS_INF | ROUND_SAE: + fputs ("{ru-sae}", file); + break; + case ROUND_ZERO | ROUND_SAE: + fputs ("{rz-sae}", file); + break; + default: + output_operand_lossage ("operand is not a specific integer, " + "invalid operand code 'R'"); + } - tp = get_thread_pointer (Pmode, true); - set_unique_reg_note (get_last_insn (), REG_EQUAL, - gen_rtx_MINUS (Pmode, tmp, tp)); - } - else - { - rtx caddr = ix86_tls_get_addr (); + if (ASSEMBLER_DIALECT == ASM_ATT) + fputs (", ", file); - if (TARGET_64BIT) - { - rtx rax = gen_rtx_REG (Pmode, AX_REG); - rtx_insn *insns; - rtx eqv; + return; - start_sequence (); - emit_call_insn - (ix86_gen_tls_local_dynamic_base_64 (rax, caddr)); - insns = get_insns (); - end_sequence (); + case '*': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('*', file); + return; - /* Attach a unique REG_EQUAL, to allow the RTL optimizers to - share the LD_BASE result with other LD model accesses. 
*/ - eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), - UNSPEC_TLS_LD_BASE); + case '&': + { + const char *name = get_some_local_dynamic_name (); + if (name == NULL) + output_operand_lossage ("'%%&' used without any " + "local dynamic TLS references"); + else + assemble_name (file, name); + return; + } - RTL_CONST_CALL_P (insns) = 1; - emit_libcall_block (insns, base, rax, eqv); - } - else - emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr)); - } + case '+': + { + rtx x; - off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF); - off = gen_rtx_CONST (Pmode, off); + if (!optimize + || optimize_function_for_size_p (cfun) + || !TARGET_BRANCH_PREDICTION_HINTS) + return; - dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off)); + x = find_reg_note (current_output_insn, REG_BR_PROB, 0); + if (x) + { + int pred_val = profile_probability::from_reg_br_prob_note + (XINT (x, 0)).to_reg_br_prob_base (); - if (TARGET_GNU2_TLS) - { - dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); + if (pred_val < REG_BR_PROB_BASE * 45 / 100 + || pred_val > REG_BR_PROB_BASE * 55 / 100) + { + bool taken = pred_val > REG_BR_PROB_BASE / 2; + bool cputaken + = final_forward_branch_p (current_output_insn) == 0; - if (GET_MODE (x) != Pmode) - x = gen_rtx_ZERO_EXTEND (Pmode, x); + /* Emit hints only in the case default branch prediction + heuristics would fail. */ + if (taken != cputaken) + { + /* We use 3e (DS) prefix for taken branches and + 2e (CS) prefix for not taken branches. */ + if (taken) + fputs ("ds ; ", file); + else + fputs ("cs ; ", file); + } + } + } + return; + } - set_unique_reg_note (get_last_insn (), REG_EQUAL, x); - } - break; + case ';': +#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX + putc (';', file); +#endif + return; - case TLS_MODEL_INITIAL_EXEC: - if (TARGET_64BIT) - { - if (TARGET_SUN_TLS && !TARGET_X32) - { - /* The Sun linker took the AMD64 TLS spec literally - and can only handle %rax as destination of the - initial executable code sequence. */ + case '~': + putc (TARGET_AVX2 ? 'i' : 'f', file); + return; - dest = gen_reg_rtx (DImode); - emit_insn (gen_tls_initial_exec_64_sun (dest, x)); - return dest; + case 'M': + if (TARGET_X32) + { + /* NB: 32-bit indices in VSIB address are sign-extended + to 64 bits. In x32, if 32-bit address 0xf7fa3010 is + sign-extended to 0xfffffffff7fa3010 which is invalid + address. Add addr32 prefix if there is no base + register nor symbol. */ + bool ok; + struct ix86_address parts; + ok = ix86_decompose_address (x, &parts); + gcc_assert (ok && parts.index == NULL_RTX); + if (parts.base == NULL_RTX + && (parts.disp == NULL_RTX + || !symbolic_operand (parts.disp, + GET_MODE (parts.disp)))) + fputs ("addr32 ", file); } + return; - /* Generate DImode references to avoid %fs:(%reg32) - problems and linker IE->LE relaxation bug. */ - tp_mode = DImode; - pic = NULL; - type = UNSPEC_GOTNTPOFF; - } - else if (flag_pic) - { - pic = pic_offset_table_rtx; - type = TARGET_ANY_GNU_TLS ? 
UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF; - } - else if (!TARGET_ANY_GNU_TLS) - { - pic = gen_reg_rtx (Pmode); - emit_insn (gen_set_got (pic)); - type = UNSPEC_GOTTPOFF; - } - else - { - pic = NULL; - type = UNSPEC_INDNTPOFF; - } + case '^': + if (TARGET_64BIT && Pmode != word_mode) + fputs ("addr32 ", file); + return; - off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type); - off = gen_rtx_CONST (tp_mode, off); - if (pic) - off = gen_rtx_PLUS (tp_mode, pic, off); - off = gen_const_mem (tp_mode, off); - set_mem_alias_set (off, ix86_GOT_alias_set ()); + case '!': + if (ix86_notrack_prefixed_insn_p (current_output_insn)) + fputs ("notrack ", file); + return; - if (TARGET_64BIT || TARGET_ANY_GNU_TLS) - { - base = get_thread_pointer (tp_mode, - for_mov || !TARGET_TLS_DIRECT_SEG_REFS); - off = force_reg (tp_mode, off); - dest = gen_rtx_PLUS (tp_mode, base, off); - if (tp_mode != Pmode) - dest = convert_to_mode (Pmode, dest, 1); - } - else - { - base = get_thread_pointer (Pmode, true); - dest = gen_reg_rtx (Pmode); - emit_insn (ix86_gen_sub3 (dest, base, off)); + default: + output_operand_lossage ("invalid operand code '%c'", code); } - break; + } - case TLS_MODEL_LOCAL_EXEC: - off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), - (TARGET_64BIT || TARGET_ANY_GNU_TLS) - ? UNSPEC_NTPOFF : UNSPEC_TPOFF); - off = gen_rtx_CONST (Pmode, off); + if (REG_P (x)) + print_reg (x, code, file); - if (TARGET_64BIT || TARGET_ANY_GNU_TLS) - { - base = get_thread_pointer (Pmode, - for_mov || !TARGET_TLS_DIRECT_SEG_REFS); - return gen_rtx_PLUS (Pmode, base, off); - } - else + else if (MEM_P (x)) + { + rtx addr = XEXP (x, 0); + + /* No `byte ptr' prefix for call instructions ... */ + if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') { - base = get_thread_pointer (Pmode, true); - dest = gen_reg_rtx (Pmode); - emit_insn (ix86_gen_sub3 (dest, base, off)); - } - break; + machine_mode mode = GET_MODE (x); + const char *size; - default: - gcc_unreachable (); - } + /* Check for explicit size override codes. */ + if (code == 'b') + size = "BYTE"; + else if (code == 'w') + size = "WORD"; + else if (code == 'k') + size = "DWORD"; + else if (code == 'q') + size = "QWORD"; + else if (code == 'x') + size = "XMMWORD"; + else if (code == 't') + size = "YMMWORD"; + else if (code == 'g') + size = "ZMMWORD"; + else if (mode == BLKmode) + /* ... or BLKmode operands, when not overridden. */ + size = NULL; + else + switch (GET_MODE_SIZE (mode)) + { + case 1: size = "BYTE"; break; + case 2: size = "WORD"; break; + case 4: size = "DWORD"; break; + case 8: size = "QWORD"; break; + case 12: size = "TBYTE"; break; + case 16: + if (mode == XFmode) + size = "TBYTE"; + else + size = "XMMWORD"; + break; + case 32: size = "YMMWORD"; break; + case 64: size = "ZMMWORD"; break; + default: + gcc_unreachable (); + } + if (size) + { + fputs (size, file); + fputs (" PTR ", file); + } + } - return dest; -} + if (this_is_asm_operands && ! address_operand (addr, VOIDmode)) + output_operand_lossage ("invalid constraints for operand"); + else + ix86_print_operand_address_as + (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P'); + } -/* Return true if OP refers to a TLS address. 
*/ -bool -ix86_tls_address_pattern_p (rtx op) -{ - subrtx_var_iterator::array_type array; - FOR_EACH_SUBRTX_VAR (iter, array, op, ALL) + else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode) { - rtx op = *iter; - if (MEM_P (op)) - { - rtx *x = &XEXP (op, 0); - while (GET_CODE (*x) == PLUS) - { - int i; - for (i = 0; i < 2; i++) - { - rtx u = XEXP (*x, i); - if (GET_CODE (u) == ZERO_EXTEND) - u = XEXP (u, 0); - if (GET_CODE (u) == UNSPEC - && XINT (u, 1) == UNSPEC_TP) - return true; - } - x = &XEXP (*x, 0); - } + long l; - iter.skip_subrtxes (); - } + REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l); + + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + /* Sign extend 32bit SFmode immediate to 8 bytes. */ + if (code == 'q') + fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x", + (unsigned long long) (int) l); + else + fprintf (file, "0x%08x", (unsigned int) l); } - return false; -} + else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode) + { + long l[2]; -/* Rewrite *LOC so that it refers to a default TLS address space. */ -void -ix86_rewrite_tls_address_1 (rtx *loc) -{ - subrtx_ptr_iterator::array_type array; - FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL) + REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l); + + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff); + } + + /* These float cases don't actually occur as immediate operands. */ + else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode) { - rtx *loc = *iter; - if (MEM_P (*loc)) - { - rtx addr = XEXP (*loc, 0); - rtx *x = &addr; - while (GET_CODE (*x) == PLUS) - { - int i; - for (i = 0; i < 2; i++) - { - rtx u = XEXP (*x, i); - if (GET_CODE (u) == ZERO_EXTEND) - u = XEXP (u, 0); - if (GET_CODE (u) == UNSPEC - && XINT (u, 1) == UNSPEC_TP) - { - addr_space_t as = DEFAULT_TLS_SEG_REG; + char dstr[30]; - *x = XEXP (*x, 1 - i); + real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); + fputs (dstr, file); + } - *loc = replace_equiv_address_nv (*loc, addr, true); - set_mem_addr_space (*loc, as); - return; - } - } - x = &XEXP (*x, 0); - } + else + { + /* We have patterns that allow zero sets of memory, for instance. + In 64-bit mode, we should probably support all 8-byte vectors, + since we can in fact encode that into an immediate. */ + if (GET_CODE (x) == CONST_VECTOR) + { + if (x != CONST0_RTX (GET_MODE (x))) + output_operand_lossage ("invalid vector immediate"); + x = const0_rtx; + } - iter.skip_subrtxes (); + if (code != 'P' && code != 'p') + { + if (CONST_INT_P (x)) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + } + else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF + || GET_CODE (x) == LABEL_REF) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + else + fputs ("OFFSET FLAT:", file); + } } + if (CONST_INT_P (x)) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + else if (flag_pic || MACHOPIC_INDIRECT) + output_pic_addr_const (file, x, code); + else + output_addr_const (file, x); } } -/* Rewrite instruction pattern involvning TLS address - so that it refers to a default TLS address space. */ -rtx -ix86_rewrite_tls_address (rtx pattern) +static bool +ix86_print_operand_punct_valid_p (unsigned char code) { - pattern = copy_insn (pattern); - ix86_rewrite_tls_address_1 (&pattern); - return pattern; + return (code == '*' || code == '+' || code == '&' || code == ';' + || code == '~' || code == '^' || code == '!'); } + +/* Print a memory operand whose address is ADDR. 
*/ -/* Create or return the unique __imp_DECL dllimport symbol corresponding - to symbol DECL if BEIMPORT is true. Otherwise create or return the - unique refptr-DECL symbol corresponding to symbol DECL. */ - -struct dllimport_hasher : ggc_cache_ptr_hash +static void +ix86_print_operand_address_as (FILE *file, rtx addr, + addr_space_t as, bool no_rip) { - static inline hashval_t hash (tree_map *m) { return m->hash; } - static inline bool - equal (tree_map *a, tree_map *b) - { - return a->base.from == b->base.from; - } - - static int - keep_cache_entry (tree_map *&m) - { - return ggc_marked_p (m->base.from); - } -}; + struct ix86_address parts; + rtx base, index, disp; + int scale; + int ok; + bool vsib = false; + int code = 0; -static GTY((cache)) hash_table *dllimport_map; + if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR) + { + ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); + gcc_assert (parts.index == NULL_RTX); + parts.index = XVECEXP (addr, 0, 1); + parts.scale = INTVAL (XVECEXP (addr, 0, 2)); + addr = XVECEXP (addr, 0, 0); + vsib = true; + } + else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR) + { + gcc_assert (TARGET_64BIT); + ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); + code = 'q'; + } + else + ok = ix86_decompose_address (addr, &parts); -static tree -get_dllimport_decl (tree decl, bool beimport) -{ - struct tree_map *h, in; - const char *name; - const char *prefix; - size_t namelen, prefixlen; - char *imp_name; - tree to; - rtx rtl; + gcc_assert (ok); - if (!dllimport_map) - dllimport_map = hash_table::create_ggc (512); + base = parts.base; + index = parts.index; + disp = parts.disp; + scale = parts.scale; - in.hash = htab_hash_pointer (decl); - in.base.from = decl; - tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT); - h = *loc; - if (h) - return h->to; + if (ADDR_SPACE_GENERIC_P (as)) + as = parts.seg; + else + gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg)); - *loc = h = ggc_alloc (); - h->hash = in.hash; - h->base.from = decl; - h->to = to = build_decl (DECL_SOURCE_LOCATION (decl), - VAR_DECL, NULL, ptr_type_node); - DECL_ARTIFICIAL (to) = 1; - DECL_IGNORED_P (to) = 1; - DECL_EXTERNAL (to) = 1; - TREE_READONLY (to) = 1; + if (!ADDR_SPACE_GENERIC_P (as)) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('%', file); - name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); - name = targetm.strip_name_encoding (name); - if (beimport) - prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0 - ? "*__imp_" : "*__imp__"; - else - prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr."; - namelen = strlen (name); - prefixlen = strlen (prefix); - imp_name = (char *) alloca (namelen + prefixlen + 1); - memcpy (imp_name, prefix, prefixlen); - memcpy (imp_name + prefixlen, name, namelen + 1); + switch (as) + { + case ADDR_SPACE_SEG_FS: + fputs ("fs:", file); + break; + case ADDR_SPACE_SEG_GS: + fputs ("gs:", file); + break; + default: + gcc_unreachable (); + } + } - name = ggc_alloc_string (imp_name, namelen + prefixlen); - rtl = gen_rtx_SYMBOL_REF (Pmode, name); - SET_SYMBOL_REF_DECL (rtl, to); - SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR; - if (!beimport) + /* Use one byte shorter RIP relative addressing for 64bit mode. 
*/ + if (TARGET_64BIT && !base && !index && !no_rip) { - SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL; -#ifdef SUB_TARGET_RECORD_STUB - SUB_TARGET_RECORD_STUB (name); -#endif - } + rtx symbol = disp; - rtl = gen_const_mem (Pmode, rtl); - set_mem_alias_set (rtl, ix86_GOT_alias_set ()); + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == PLUS + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) + symbol = XEXP (XEXP (disp, 0), 0); - SET_DECL_RTL (to, rtl); - SET_DECL_ASSEMBLER_NAME (to, get_identifier (name)); + if (GET_CODE (symbol) == LABEL_REF + || (GET_CODE (symbol) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (symbol) == 0)) + base = pc_rtx; + } - return to; -} + if (!base && !index) + { + /* Displacement only requires special attention. */ + if (CONST_INT_P (disp)) + { + if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as)) + fputs ("ds:", file); + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp)); + } + /* Load the external function address via the GOT slot to avoid PLT. */ + else if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == UNSPEC + && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL + || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT) + && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) + output_pic_addr_const (file, disp, 0); + else if (flag_pic) + output_pic_addr_const (file, disp, 0); + else + output_addr_const (file, disp); + } + else + { + /* Print SImode register names to force addr32 prefix. */ + if (SImode_address_operand (addr, VOIDmode)) + { + if (flag_checking) + { + gcc_assert (TARGET_64BIT); + switch (GET_CODE (addr)) + { + case SUBREG: + gcc_assert (GET_MODE (addr) == SImode); + gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode); + break; + case ZERO_EXTEND: + case AND: + gcc_assert (GET_MODE (addr) == DImode); + break; + default: + gcc_unreachable (); + } + } + gcc_assert (!code); + code = 'k'; + } + else if (code == 0 + && TARGET_X32 + && disp + && CONST_INT_P (disp) + && INTVAL (disp) < -16*1024*1024) + { + /* X32 runs in 64-bit mode, where displacement, DISP, in + address DISP(%r64), is encoded as 32-bit immediate sign- + extended from 32-bit to 64-bit. For -0x40000300(%r64), + address is %r64 + 0xffffffffbffffd00. When %r64 < + 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64, + which is invalid for x32. The correct address is %r64 + - 0x40000300 == 0xf7ffdd64. To properly encode + -0x40000300(%r64) for x32, we zero-extend negative + displacement by forcing addr32 prefix which truncates + 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should + zero-extend all negative displacements, including -1(%rsp). + However, for small negative displacements, sign-extension + won't cause overflow. We only zero-extend negative + displacements if they < -16*1024*1024, which is also used + to check legitimate address displacements for PIC. */ + code = 'k'; + } -/* Expand SYMBOL into its corresponding far-address symbol. - WANT_REG is true if we require the result be a register. */ + /* Since the upper 32 bits of RSP are always zero for x32, + we can encode %esp as %rsp to avoid 0x67 prefix if + there is no index register. 
*/ + if (TARGET_X32 && Pmode == SImode + && !index && base && REG_P (base) && REGNO (base) == SP_REG) + code = 'q'; -static rtx -legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg) -{ - tree imp_decl; - rtx x; + if (ASSEMBLER_DIALECT == ASM_ATT) + { + if (disp) + { + if (flag_pic) + output_pic_addr_const (file, disp, 0); + else if (GET_CODE (disp) == LABEL_REF) + output_asm_label (disp); + else + output_addr_const (file, disp); + } - gcc_assert (SYMBOL_REF_DECL (symbol)); - imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false); + putc ('(', file); + if (base) + print_reg (base, code, file); + if (index) + { + putc (',', file); + print_reg (index, vsib ? 0 : code, file); + if (scale != 1 || vsib) + fprintf (file, ",%d", scale); + } + putc (')', file); + } + else + { + rtx offset = NULL_RTX; - x = DECL_RTL (imp_decl); - if (want_reg) - x = force_reg (Pmode, x); - return x; -} + if (disp) + { + /* Pull out the offset of a symbol; print any symbol itself. */ + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == PLUS + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) + { + offset = XEXP (XEXP (disp, 0), 1); + disp = gen_rtx_CONST (VOIDmode, + XEXP (XEXP (disp, 0), 0)); + } -/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is - true if we require the result be a register. */ + if (flag_pic) + output_pic_addr_const (file, disp, 0); + else if (GET_CODE (disp) == LABEL_REF) + output_asm_label (disp); + else if (CONST_INT_P (disp)) + offset = disp; + else + output_addr_const (file, disp); + } -static rtx -legitimize_dllimport_symbol (rtx symbol, bool want_reg) -{ - tree imp_decl; - rtx x; + putc ('[', file); + if (base) + { + print_reg (base, code, file); + if (offset) + { + if (INTVAL (offset) >= 0) + putc ('+', file); + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); + } + } + else if (offset) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); + else + putc ('0', file); - gcc_assert (SYMBOL_REF_DECL (symbol)); - imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true); + if (index) + { + putc ('+', file); + print_reg (index, vsib ? 0 : code, file); + if (scale != 1 || vsib) + fprintf (file, "*%d", scale); + } + putc (']', file); + } + } +} - x = DECL_RTL (imp_decl); - if (want_reg) - x = force_reg (Pmode, x); - return x; +static void +ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) +{ + ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false); } -/* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG - is true if we require the result be a register. */ +/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. 
*/ -static rtx -legitimize_pe_coff_symbol (rtx addr, bool inreg) +static bool +i386_asm_output_addr_const_extra (FILE *file, rtx x) { - if (!TARGET_PECOFF) - return NULL_RTX; + rtx op; - if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) - { - if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr)) - return legitimize_dllimport_symbol (addr, inreg); - if (GET_CODE (addr) == CONST - && GET_CODE (XEXP (addr, 0)) == PLUS - && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF - && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0))) - { - rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg); - return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); - } - } - - if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC) - return NULL_RTX; - if (GET_CODE (addr) == SYMBOL_REF - && !is_imported_p (addr) - && SYMBOL_REF_EXTERNAL_P (addr) - && SYMBOL_REF_DECL (addr)) - return legitimize_pe_coff_extern_decl (addr, inreg); + if (GET_CODE (x) != UNSPEC) + return false; - if (GET_CODE (addr) == CONST - && GET_CODE (XEXP (addr, 0)) == PLUS - && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF - && !is_imported_p (XEXP (XEXP (addr, 0), 0)) - && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0)) - && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0))) + op = XVECEXP (x, 0, 0); + switch (XINT (x, 1)) { - rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg); - return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); + case UNSPEC_GOTOFF: + output_addr_const (file, op); + fputs ("@gotoff", file); + break; + case UNSPEC_GOTTPOFF: + output_addr_const (file, op); + /* FIXME: This might be @TPOFF in Sun ld. */ + fputs ("@gottpoff", file); + break; + case UNSPEC_TPOFF: + output_addr_const (file, op); + fputs ("@tpoff", file); + break; + case UNSPEC_NTPOFF: + output_addr_const (file, op); + if (TARGET_64BIT) + fputs ("@tpoff", file); + else + fputs ("@ntpoff", file); + break; + case UNSPEC_DTPOFF: + output_addr_const (file, op); + fputs ("@dtpoff", file); + break; + case UNSPEC_GOTNTPOFF: + output_addr_const (file, op); + if (TARGET_64BIT) + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@gottpoff(%rip)" : "@gottpoff[rip]", file); + else + fputs ("@gotntpoff", file); + break; + case UNSPEC_INDNTPOFF: + output_addr_const (file, op); + fputs ("@indntpoff", file); + break; +#if TARGET_MACHO + case UNSPEC_MACHOPIC_OFFSET: + output_addr_const (file, op); + putc ('-', file); + machopic_output_function_base_name (file); + break; +#endif + + default: + return false; } - return NULL_RTX; -} -/* Try machine-dependent ways of modifying an illegitimate address - to be legitimate. If we find one, return the new, valid address. - This macro is used in only one place: `memory_address' in explow.c. + return true; +} + + +/* Output code to perform a 387 binary operation in INSN, one of PLUS, + MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3] + is the expression of the binary operation. The output may either be + emitted here, or returned to the caller, like all output_* functions. - OLDX is the address as it was before break_out_memory_refs was called. - In some cases it is useful to look at this to decide what needs to be done. + There is no guarantee that the operands are the same mode, as they + might be within FLOAT or FLOAT_EXTEND expressions. */ - It is always safe for this macro to do nothing. It exists to recognize - opportunities to optimize the output. +#ifndef SYSV386_COMPAT +/* Set to 1 for compatibility with brain-damaged assemblers. 
No-one + wants to fix the assemblers because that causes incompatibility + with gcc. No-one wants to fix gcc because that causes + incompatibility with assemblers... You can use the option of + -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */ +#define SYSV386_COMPAT 1 +#endif - For the 80386, we handle X+REG by loading X into a register R and - using R+REG. R will go in a general reg and indexing will be used. - However, if REG is a broken-out memory address or multiplication, - nothing needs to be done because REG can certainly go in a general reg. +const char * +output_387_binary_op (rtx_insn *insn, rtx *operands) +{ + static char buf[40]; + const char *p; + bool is_sse + = (SSE_REG_P (operands[0]) + || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2])); - When -fpic is used, special handling is needed for symbolic references. - See comments by legitimize_pic_address in i386.c for details. */ + if (is_sse) + p = "%v"; + else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) + p = "fi"; + else + p = "f"; -static rtx -ix86_legitimize_address (rtx x, rtx, machine_mode mode) -{ - bool changed = false; - unsigned log; + strcpy (buf, p); - log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0; - if (log) - return legitimize_tls_address (x, (enum tls_model) log, false); - if (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF - && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)))) + switch (GET_CODE (operands[3])) { - rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), - (enum tls_model) log, false); - return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); + case PLUS: + p = "add"; break; + case MINUS: + p = "sub"; break; + case MULT: + p = "mul"; break; + case DIV: + p = "div"; break; + default: + gcc_unreachable (); } - if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) - { - rtx tmp = legitimize_pe_coff_symbol (x, true); - if (tmp) - return tmp; - } + strcat (buf, p); - if (flag_pic && SYMBOLIC_CONST (x)) - return legitimize_pic_address (x, 0); + if (is_sse) + { + p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd"; + strcat (buf, p); -#if TARGET_MACHO - if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x)) - return machopic_indirect_data_reference (x, 0); -#endif + if (TARGET_AVX) + p = "\t{%2, %1, %0|%0, %1, %2}"; + else + p = "\t{%2, %0|%0, %2}"; - /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ - if (GET_CODE (x) == ASHIFT - && CONST_INT_P (XEXP (x, 1)) - && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) + strcat (buf, p); + return buf; + } + + /* Even if we do not want to check the inputs, this documents input + constraints. Which helps in understanding the following code. */ + if (flag_checking) { - changed = true; - log = INTVAL (XEXP (x, 1)); - x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)), - GEN_INT (1 << log)); + if (STACK_REG_P (operands[0]) + && ((REG_P (operands[1]) + && REGNO (operands[0]) == REGNO (operands[1]) + && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) + || (REG_P (operands[2]) + && REGNO (operands[0]) == REGNO (operands[2]) + && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) + && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) + ; /* ok */ + else + gcc_unreachable (); } - if (GET_CODE (x) == PLUS) + switch (GET_CODE (operands[3])) { - /* Canonicalize shifts by 0, 1, 2, 3 into multiply. 
*/ + case MULT: + case PLUS: + if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2])) + std::swap (operands[1], operands[2]); - if (GET_CODE (XEXP (x, 0)) == ASHIFT - && CONST_INT_P (XEXP (XEXP (x, 0), 1)) - && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) + /* know operands[0] == operands[1]. */ + + if (MEM_P (operands[2])) { - changed = true; - log = INTVAL (XEXP (XEXP (x, 0), 1)); - XEXP (x, 0) = gen_rtx_MULT (Pmode, - force_reg (Pmode, XEXP (XEXP (x, 0), 0)), - GEN_INT (1 << log)); + p = "%Z2\t%2"; + break; } - if (GET_CODE (XEXP (x, 1)) == ASHIFT - && CONST_INT_P (XEXP (XEXP (x, 1), 1)) - && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) + if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) { - changed = true; - log = INTVAL (XEXP (XEXP (x, 1), 1)); - XEXP (x, 1) = gen_rtx_MULT (Pmode, - force_reg (Pmode, XEXP (XEXP (x, 1), 0)), - GEN_INT (1 << log)); + if (STACK_TOP_P (operands[0])) + /* How is it that we are storing to a dead operand[2]? + Well, presumably operands[1] is dead too. We can't + store the result to st(0) as st(0) gets popped on this + instruction. Instead store to operands[2] (which I + think has to be st(1)). st(1) will be popped later. + gcc <= 2.8.1 didn't have this check and generated + assembly code that the Unixware assembler rejected. */ + p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ + else + p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ + break; } - /* Put multiply first if it isn't already. */ - if (GET_CODE (XEXP (x, 1)) == MULT) + if (STACK_TOP_P (operands[0])) + p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ + else + p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ + break; + + case MINUS: + case DIV: + if (MEM_P (operands[1])) { - std::swap (XEXP (x, 0), XEXP (x, 1)); - changed = true; + p = "r%Z1\t%1"; + break; } - /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const))) - into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be - created by virtual register instantiation, register elimination, and - similar optimizations. */ - if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS) + if (MEM_P (operands[2])) { - changed = true; - x = gen_rtx_PLUS (Pmode, - gen_rtx_PLUS (Pmode, XEXP (x, 0), - XEXP (XEXP (x, 1), 0)), - XEXP (XEXP (x, 1), 1)); + p = "%Z2\t%2"; + break; } - /* Canonicalize - (plus (plus (mult (reg) (const)) (plus (reg) (const))) const) - into (plus (plus (mult (reg) (const)) (reg)) (const)). */ - else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS - && CONSTANT_P (XEXP (x, 1))) + if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) { - rtx constant; - rtx other = NULL_RTX; - - if (CONST_INT_P (XEXP (x, 1))) - { - constant = XEXP (x, 1); - other = XEXP (XEXP (XEXP (x, 0), 1), 1); - } - else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) - { - constant = XEXP (XEXP (XEXP (x, 0), 1), 1); - other = XEXP (x, 1); - } +#if SYSV386_COMPAT + /* The SystemV/386 SVR3.2 assembler, and probably all AT&T + derived assemblers, confusingly reverse the direction of + the operation for fsub{r} and fdiv{r} when the + destination register is not st(0). The Intel assembler + doesn't have this brain damage. Read !SYSV386_COMPAT to + figure out what the hardware really does. 
*/ + if (STACK_TOP_P (operands[0])) + p = "{p\t%0, %2|rp\t%2, %0}"; else - constant = 0; - - if (constant) - { - changed = true; - x = gen_rtx_PLUS (Pmode, - gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0), - XEXP (XEXP (XEXP (x, 0), 1), 0)), - plus_constant (Pmode, other, - INTVAL (constant))); - } + p = "{rp\t%2, %0|p\t%0, %2}"; +#else + if (STACK_TOP_P (operands[0])) + /* As above for fmul/fadd, we can't store to st(0). */ + p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ + else + p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ +#endif + break; } - if (changed && ix86_legitimate_address_p (mode, x, false)) - return x; - - if (GET_CODE (XEXP (x, 0)) == MULT) + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) { - changed = true; - XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0)); +#if SYSV386_COMPAT + if (STACK_TOP_P (operands[0])) + p = "{rp\t%0, %1|p\t%1, %0}"; + else + p = "{p\t%1, %0|rp\t%0, %1}"; +#else + if (STACK_TOP_P (operands[0])) + p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */ + else + p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */ +#endif + break; } - if (GET_CODE (XEXP (x, 1)) == MULT) + if (STACK_TOP_P (operands[0])) { - changed = true; - XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1)); + if (STACK_TOP_P (operands[1])) + p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ + else + p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */ + break; } - - if (changed - && REG_P (XEXP (x, 1)) - && REG_P (XEXP (x, 0))) - return x; - - if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) + else if (STACK_TOP_P (operands[1])) { - changed = true; - x = legitimize_pic_address (x, 0); +#if SYSV386_COMPAT + p = "{\t%1, %0|r\t%0, %1}"; +#else + p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */ +#endif + } + else + { +#if SYSV386_COMPAT + p = "{r\t%2, %0|\t%0, %2}"; +#else + p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ +#endif } + break; - if (changed && ix86_legitimate_address_p (mode, x, false)) - return x; + default: + gcc_unreachable (); + } - if (REG_P (XEXP (x, 0))) - { - rtx temp = gen_reg_rtx (Pmode); - rtx val = force_operand (XEXP (x, 1), temp); - if (val != temp) - { - val = convert_to_mode (Pmode, val, 1); - emit_move_insn (temp, val); - } + strcat (buf, p); + return buf; +} - XEXP (x, 1) = temp; - return x; - } +/* Return needed mode for entity in optimize_mode_switching pass. */ - else if (REG_P (XEXP (x, 1))) - { - rtx temp = gen_reg_rtx (Pmode); - rtx val = force_operand (XEXP (x, 0), temp); - if (val != temp) - { - val = convert_to_mode (Pmode, val, 1); - emit_move_insn (temp, val); - } +static int +ix86_dirflag_mode_needed (rtx_insn *insn) +{ + if (CALL_P (insn)) + { + if (cfun->machine->func_type == TYPE_NORMAL) + return X86_DIRFLAG_ANY; + else + /* No need to emit CLD in interrupt handler for TARGET_CLD. */ + return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET; + } - XEXP (x, 0) = temp; - return x; - } + if (recog_memoized (insn) < 0) + return X86_DIRFLAG_ANY; + + if (get_attr_type (insn) == TYPE_STR) + { + /* Emit cld instruction if stringops are used in the function. */ + if (cfun->machine->func_type == TYPE_NORMAL) + return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY; + else + return X86_DIRFLAG_RESET; } - return x; + return X86_DIRFLAG_ANY; } - -/* Print an integer constant expression in assembler syntax. Addition - and subtraction are the only arithmetic that may appear in these - expressions. 
FILE is the stdio stream to write to, X is the rtx, and - CODE is the operand print code from the output string. */ -static void -output_pic_addr_const (FILE *file, rtx x, int code) +/* Check if a 256bit or 512 bit AVX register is referenced inside of EXP. */ + +static bool +ix86_check_avx_upper_register (const_rtx exp) { - char buf[256]; + return SSE_REG_P (exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128; +} - switch (GET_CODE (x)) +/* Return needed mode for entity in optimize_mode_switching pass. */ + +static int +ix86_avx_u128_mode_needed (rtx_insn *insn) +{ + if (CALL_P (insn)) { - case PC: - gcc_assert (flag_pic); - putc ('.', file); - break; + rtx link; - case SYMBOL_REF: - if (TARGET_64BIT || ! TARGET_MACHO_SYMBOL_STUBS) - output_addr_const (file, x); - else + /* Needed mode is set to AVX_U128_CLEAN if there are + no 256bit or 512bit modes used in function arguments. */ + for (link = CALL_INSN_FUNCTION_USAGE (insn); + link; + link = XEXP (link, 1)) { - const char *name = XSTR (x, 0); - - /* Mark the decl as referenced so that cgraph will - output the function. */ - if (SYMBOL_REF_DECL (x)) - mark_decl_referenced (SYMBOL_REF_DECL (x)); + if (GET_CODE (XEXP (link, 0)) == USE) + { + rtx arg = XEXP (XEXP (link, 0), 0); -#if TARGET_MACHO - if (MACHOPIC_INDIRECT - && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION) - name = machopic_indirection_name (x, /*stub_p=*/true); -#endif - assemble_name (file, name); + if (ix86_check_avx_upper_register (arg)) + return AVX_U128_DIRTY; + } } - if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF) - && code == 'P' && ! SYMBOL_REF_LOCAL_P (x)) - fputs ("@PLT", file); - break; - case LABEL_REF: - x = XEXP (x, 0); - /* FALLTHRU */ - case CODE_LABEL: - ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x)); - assemble_name (asm_out_file, buf); - break; + return AVX_U128_CLEAN; + } - case CONST_INT: - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); - break; + /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced. + Hardware changes state only when a 256bit register is written to, + but we need to prevent the compiler from moving optimal insertion + point above eventual read from 256bit or 512 bit register. */ + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) + if (ix86_check_avx_upper_register (*iter)) + return AVX_U128_DIRTY; - case CONST: - /* This used to output parentheses around the expression, - but that does not work on the 386 (either ATT or BSD assembler). */ - output_pic_addr_const (file, XEXP (x, 0), code); - break; + return AVX_U128_ANY; +} - case CONST_DOUBLE: - /* We can't handle floating point constants; - TARGET_PRINT_OPERAND must handle them. */ - output_operand_lossage ("floating constant misused"); - break; +/* Return mode that i387 must be switched into + prior to the execution of insn. */ - case PLUS: - /* Some assemblers need integer constants to appear first. */ - if (CONST_INT_P (XEXP (x, 0))) - { - output_pic_addr_const (file, XEXP (x, 0), code); - putc ('+', file); - output_pic_addr_const (file, XEXP (x, 1), code); - } - else - { - gcc_assert (CONST_INT_P (XEXP (x, 1))); - output_pic_addr_const (file, XEXP (x, 1), code); - putc ('+', file); - output_pic_addr_const (file, XEXP (x, 0), code); - } +static int +ix86_i387_mode_needed (int entity, rtx_insn *insn) +{ + enum attr_i387_cw mode; + + /* The mode UNINITIALIZED is used to store control word after a + function call or ASM pattern. 
The mode ANY specify that function + has no requirements on the control word and make no changes in the + bits we are interested in. */ + + if (CALL_P (insn) + || (NONJUMP_INSN_P (insn) + && (asm_noperands (PATTERN (insn)) >= 0 + || GET_CODE (PATTERN (insn)) == ASM_INPUT))) + return I387_CW_UNINITIALIZED; + + if (recog_memoized (insn) < 0) + return I387_CW_ANY; + + mode = get_attr_i387_cw (insn); + + switch (entity) + { + case I387_TRUNC: + if (mode == I387_CW_TRUNC) + return mode; break; - case MINUS: - if (!TARGET_MACHO) - putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file); - output_pic_addr_const (file, XEXP (x, 0), code); - putc ('-', file); - output_pic_addr_const (file, XEXP (x, 1), code); - if (!TARGET_MACHO) - putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file); + case I387_FLOOR: + if (mode == I387_CW_FLOOR) + return mode; break; - case UNSPEC: - gcc_assert (XVECLEN (x, 0) == 1); - output_pic_addr_const (file, XVECEXP (x, 0, 0), code); - switch (XINT (x, 1)) - { - case UNSPEC_GOT: - fputs ("@GOT", file); - break; - case UNSPEC_GOTOFF: - fputs ("@GOTOFF", file); - break; - case UNSPEC_PLTOFF: - fputs ("@PLTOFF", file); - break; - case UNSPEC_PCREL: - fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "(%rip)" : "[rip]", file); - break; - case UNSPEC_GOTPCREL: - fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file); - break; - case UNSPEC_GOTTPOFF: - /* FIXME: This might be @TPOFF in Sun ld too. */ - fputs ("@gottpoff", file); - break; - case UNSPEC_TPOFF: - fputs ("@tpoff", file); - break; - case UNSPEC_NTPOFF: - if (TARGET_64BIT) - fputs ("@tpoff", file); - else - fputs ("@ntpoff", file); - break; - case UNSPEC_DTPOFF: - fputs ("@dtpoff", file); - break; - case UNSPEC_GOTNTPOFF: - if (TARGET_64BIT) - fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@gottpoff(%rip)": "@gottpoff[rip]", file); - else - fputs ("@gotntpoff", file); - break; - case UNSPEC_INDNTPOFF: - fputs ("@indntpoff", file); - break; -#if TARGET_MACHO - case UNSPEC_MACHOPIC_OFFSET: - putc ('-', file); - machopic_output_function_base_name (file); - break; -#endif - default: - output_operand_lossage ("invalid UNSPEC as operand"); - break; - } - break; + case I387_CEIL: + if (mode == I387_CW_CEIL) + return mode; + break; default: - output_operand_lossage ("invalid expression as operand"); + gcc_unreachable (); } + + return I387_CW_ANY; } -/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL. - We need to emit DTP-relative relocations. */ +/* Return mode that entity must be switched into + prior to the execution of insn. */ -static void ATTRIBUTE_UNUSED -i386_output_dwarf_dtprel (FILE *file, int size, rtx x) +static int +ix86_mode_needed (int entity, rtx_insn *insn) { - fputs (ASM_LONG, file); - output_addr_const (file, x); - fputs ("@dtpoff", file); - switch (size) + switch (entity) { - case 4: - break; - case 8: - fputs (", 0", file); - break; + case X86_DIRFLAG: + return ix86_dirflag_mode_needed (insn); + case AVX_U128: + return ix86_avx_u128_mode_needed (insn); + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + return ix86_i387_mode_needed (entity, insn); default: gcc_unreachable (); - } + } + return 0; } -/* Return true if X is a representation of the PIC register. This copes - with calls from ix86_find_base_term, where the register might have - been replaced by a cselib value. 
*/ - -static bool -ix86_pic_register_p (rtx x) -{ - if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x)) - return (pic_offset_table_rtx - && rtx_equal_for_cselib_p (x, pic_offset_table_rtx)); - else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SET_GOT) - return true; - else if (!REG_P (x)) - return false; - else if (pic_offset_table_rtx) +/* Check if a 256bit or 512bit AVX register is referenced in stores. */ + +static void +ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) + { + if (ix86_check_avx_upper_register (dest)) { - if (REGNO (x) == REGNO (pic_offset_table_rtx)) - return true; - if (HARD_REGISTER_P (x) - && !HARD_REGISTER_P (pic_offset_table_rtx) - && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx)) - return true; - return false; + bool *used = (bool *) data; + *used = true; } - else - return REGNO (x) == PIC_OFFSET_TABLE_REGNUM; -} + } -/* Helper function for ix86_delegitimize_address. - Attempt to delegitimize TLS local-exec accesses. */ +/* Calculate mode of upper 128bit AVX registers after the insn. */ -static rtx -ix86_delegitimize_tls_address (rtx orig_x) +static int +ix86_avx_u128_mode_after (int mode, rtx_insn *insn) { - rtx x = orig_x, unspec; - struct ix86_address addr; + rtx pat = PATTERN (insn); - if (!TARGET_TLS_DIRECT_SEG_REFS) - return orig_x; - if (MEM_P (x)) - x = XEXP (x, 0); - if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode) - return orig_x; - if (ix86_decompose_address (x, &addr) == 0 - || addr.seg != DEFAULT_TLS_SEG_REG - || addr.disp == NULL_RTX - || GET_CODE (addr.disp) != CONST) - return orig_x; - unspec = XEXP (addr.disp, 0); - if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1))) - unspec = XEXP (unspec, 0); - if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF) - return orig_x; - x = XVECEXP (unspec, 0, 0); - gcc_assert (GET_CODE (x) == SYMBOL_REF); - if (unspec != XEXP (addr.disp, 0)) - x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1)); - if (addr.index) + if (vzeroupper_pattern (pat, VOIDmode) + || vzeroall_pattern (pat, VOIDmode)) + return AVX_U128_CLEAN; + + /* We know that state is clean after CALL insn if there are no + 256bit or 512bit registers used in the function return register. */ + if (CALL_P (insn)) { - rtx idx = addr.index; - if (addr.scale != 1) - idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale)); - x = gen_rtx_PLUS (Pmode, idx, x); + bool avx_upper_reg_found = false; + note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found); + + return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN; } - if (addr.base) - x = gen_rtx_PLUS (Pmode, addr.base, x); - if (MEM_P (orig_x)) - x = replace_equiv_address_nv (orig_x, x); - return x; -} -/* In the name of slightly smaller debug output, and to cater to - general assembler lossage, recognize PIC+GOTOFF and turn it back - into a direct symbol reference. + /* Otherwise, return current mode. Remember that if insn + references AVX 256bit or 512bit registers, the mode was already + changed to DIRTY from MODE_NEEDED. */ + return mode; +} - On Darwin, this is necessary to avoid a crash, because Darwin - has a different PIC label for each routine but the DWARF debugging - information is not associated with any particular routine, so it's - necessary to remove references to the PIC label from RTL stored by - the DWARF output code. +/* Return the mode that an insn results in. */ - This helper is used in the normal ix86_delegitimize_address - entrypoint (e.g. used in the target delegitimization hook) and - in ix86_find_base_term. 
As compile time memory optimization, we - avoid allocating rtxes that will not change anything on the outcome - of the callers (find_base_value and find_base_term). */ +static int +ix86_mode_after (int entity, int mode, rtx_insn *insn) +{ + switch (entity) + { + case X86_DIRFLAG: + return mode; + case AVX_U128: + return ix86_avx_u128_mode_after (mode, insn); + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + return mode; + default: + gcc_unreachable (); + } +} -static inline rtx -ix86_delegitimize_address_1 (rtx x, bool base_term_p) +static int +ix86_dirflag_mode_entry (void) { - rtx orig_x = delegitimize_mem_from_attrs (x); - /* addend is NULL or some rtx if x is something+GOTOFF where - something doesn't include the PIC register. */ - rtx addend = NULL_RTX; - /* reg_addend is NULL or a multiple of some register. */ - rtx reg_addend = NULL_RTX; - /* const_addend is NULL or a const_int. */ - rtx const_addend = NULL_RTX; - /* This is the result, or NULL. */ - rtx result = NULL_RTX; + /* For TARGET_CLD or in the interrupt handler we can't assume + direction flag state at function entry. */ + if (TARGET_CLD + || cfun->machine->func_type != TYPE_NORMAL) + return X86_DIRFLAG_ANY; - x = orig_x; + return X86_DIRFLAG_RESET; +} - if (MEM_P (x)) - x = XEXP (x, 0); +static int +ix86_avx_u128_mode_entry (void) +{ + tree arg; - if (TARGET_64BIT) + /* Entry mode is set to AVX_U128_DIRTY if there are + 256bit or 512bit modes used in function arguments. */ + for (arg = DECL_ARGUMENTS (current_function_decl); arg; + arg = TREE_CHAIN (arg)) { - if (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_MODE (XEXP (x, 0)) == Pmode - && CONST_INT_P (XEXP (XEXP (x, 0), 1)) - && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC - && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL) - { - /* find_base_{value,term} only care about MEMs with arg_pointer_rtx - base. A CONST can't be arg_pointer_rtx based. */ - if (base_term_p && MEM_P (orig_x)) - return orig_x; - rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0); - x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2); - if (MEM_P (orig_x)) - x = replace_equiv_address_nv (orig_x, x); - return x; - } - - if (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == UNSPEC - && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL - || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL) - && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)) - { - x = XVECEXP (XEXP (x, 0), 0, 0); - if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x)) - { - x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x)); - if (x == NULL_RTX) - return orig_x; - } - return x; - } - - if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC) - return ix86_delegitimize_tls_address (orig_x); + rtx incoming = DECL_INCOMING_RTL (arg); - /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic - and -mcmodel=medium -fpic. 
*/ + if (incoming && ix86_check_avx_upper_register (incoming)) + return AVX_U128_DIRTY; } - if (GET_CODE (x) != PLUS - || GET_CODE (XEXP (x, 1)) != CONST) - return ix86_delegitimize_tls_address (orig_x); + return AVX_U128_CLEAN; +} - if (ix86_pic_register_p (XEXP (x, 0))) - /* %ebx + GOT/GOTOFF */ - ; - else if (GET_CODE (XEXP (x, 0)) == PLUS) - { - /* %ebx + %reg * scale + GOT/GOTOFF */ - reg_addend = XEXP (x, 0); - if (ix86_pic_register_p (XEXP (reg_addend, 0))) - reg_addend = XEXP (reg_addend, 1); - else if (ix86_pic_register_p (XEXP (reg_addend, 1))) - reg_addend = XEXP (reg_addend, 0); - else - { - reg_addend = NULL_RTX; - addend = XEXP (x, 0); - } - } - else - addend = XEXP (x, 0); +/* Return a mode that ENTITY is assumed to be + switched to at function entry. */ - x = XEXP (XEXP (x, 1), 0); - if (GET_CODE (x) == PLUS - && CONST_INT_P (XEXP (x, 1))) +static int +ix86_mode_entry (int entity) +{ + switch (entity) { - const_addend = XEXP (x, 1); - x = XEXP (x, 0); + case X86_DIRFLAG: + return ix86_dirflag_mode_entry (); + case AVX_U128: + return ix86_avx_u128_mode_entry (); + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + return I387_CW_ANY; + default: + gcc_unreachable (); } +} - if (GET_CODE (x) == UNSPEC - && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) - || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)) - || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC - && !MEM_P (orig_x) && !addend))) - result = XVECEXP (x, 0, 0); +static int +ix86_avx_u128_mode_exit (void) +{ + rtx reg = crtl->return_rtx; - if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x) - && !MEM_P (orig_x)) - result = XVECEXP (x, 0, 0); + /* Exit mode is set to AVX_U128_DIRTY if there are 256bit + or 512 bit modes used in the function return register. */ + if (reg && ix86_check_avx_upper_register (reg)) + return AVX_U128_DIRTY; - if (! result) - return ix86_delegitimize_tls_address (orig_x); + /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit + modes used in function arguments, otherwise return AVX_U128_CLEAN. + */ + return ix86_avx_u128_mode_entry (); +} - /* For (PLUS something CONST_INT) both find_base_{value,term} just - recurse on the first operand. */ - if (const_addend && !base_term_p) - result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); - if (reg_addend) - result = gen_rtx_PLUS (Pmode, reg_addend, result); - if (addend) - { - /* If the rest of original X doesn't involve the PIC register, add - addend and subtract pic_offset_table_rtx. This can happen e.g. - for code like: - leal (%ebx, %ecx, 4), %ecx - ... - movl foo@GOTOFF(%ecx), %edx - in which case we return (%ecx - %ebx) + foo - or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg - and reload has completed. Don't do the latter for debug, - as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */ - if (pic_offset_table_rtx - && (!reload_completed || !ix86_use_pseudo_pic_reg ())) - result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), - pic_offset_table_rtx), - result); - else if (base_term_p - && pic_offset_table_rtx - && !TARGET_MACHO - && !TARGET_VXWORKS_RTP) - { - rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); - tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); - result = gen_rtx_PLUS (Pmode, tmp, result); - } - else - return orig_x; - } - if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) +/* Return a mode that ENTITY is assumed to be + switched to at function exit. 
*/ + +static int +ix86_mode_exit (int entity) +{ + switch (entity) { - result = lowpart_subreg (GET_MODE (orig_x), result, Pmode); - if (result == NULL_RTX) - return orig_x; + case X86_DIRFLAG: + return X86_DIRFLAG_ANY; + case AVX_U128: + return ix86_avx_u128_mode_exit (); + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + return I387_CW_ANY; + default: + gcc_unreachable (); } - return result; } -/* The normal instantiation of the above template. */ - -static rtx -ix86_delegitimize_address (rtx x) +static int +ix86_mode_priority (int, int n) { - return ix86_delegitimize_address_1 (x, false); + return n; } -/* If X is a machine specific address (i.e. a symbol or label being - referenced as a displacement from the GOT implemented using an - UNSPEC), then return the base term. Otherwise return X. */ +/* Output code to initialize control word copies used by trunc?f?i and + rounding patterns. CURRENT_MODE is set to current control word, + while NEW_MODE is set to new control word. */ -rtx -ix86_find_base_term (rtx x) +static void +emit_i387_cw_initialization (int mode) { - rtx term; + rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED); + rtx new_mode; - if (TARGET_64BIT) - { - if (GET_CODE (x) != CONST) - return x; - term = XEXP (x, 0); - if (GET_CODE (term) == PLUS - && CONST_INT_P (XEXP (term, 1))) - term = XEXP (term, 0); - if (GET_CODE (term) != UNSPEC - || (XINT (term, 1) != UNSPEC_GOTPCREL - && XINT (term, 1) != UNSPEC_PCREL)) - return x; + enum ix86_stack_slot slot; - return XVECEXP (term, 0, 0); - } + rtx reg = gen_reg_rtx (HImode); - return ix86_delegitimize_address_1 (x, true); -} - -/* Return true if X shouldn't be emitted into the debug info. - Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_ - symbol easily into the .debug_info section, so we need not to - delegitimize, but instead assemble as @gotoff. - Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically - assembles that as _GLOBAL_OFFSET_TABLE_-. expression. 
*/ - -static bool -ix86_const_not_ok_for_debug_p (rtx x) -{ - if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF) - return true; - - if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0) - return true; - - return false; -} - -static void -put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, - bool fp, FILE *file) -{ - const char *suffix; - - if (mode == CCFPmode) - { - code = ix86_fp_compare_code_to_integer (code); - mode = CCmode; - } - if (reverse) - code = reverse_condition (code); + emit_insn (gen_x86_fnstcw_1 (stored_mode)); + emit_move_insn (reg, copy_rtx (stored_mode)); - switch (code) + switch (mode) { - case EQ: - gcc_assert (mode != CCGZmode); - switch (mode) - { - case E_CCAmode: - suffix = "a"; - break; - case E_CCCmode: - suffix = "c"; - break; - case E_CCOmode: - suffix = "o"; - break; - case E_CCPmode: - suffix = "p"; - break; - case E_CCSmode: - suffix = "s"; - break; - default: - suffix = "e"; - break; - } - break; - case NE: - gcc_assert (mode != CCGZmode); - switch (mode) - { - case E_CCAmode: - suffix = "na"; - break; - case E_CCCmode: - suffix = "nc"; - break; - case E_CCOmode: - suffix = "no"; - break; - case E_CCPmode: - suffix = "np"; - break; - case E_CCSmode: - suffix = "ns"; - break; - default: - suffix = "ne"; - break; - } + case I387_CW_TRUNC: + /* round toward zero (truncate) */ + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00))); + slot = SLOT_CW_TRUNC; break; - case GT: - gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode); - suffix = "g"; + + case I387_CW_FLOOR: + /* round down toward -oo */ + emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400))); + slot = SLOT_CW_FLOOR; break; - case GTU: - /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers. - Those same assemblers have the same but opposite lossage on cmov. */ - if (mode == CCmode) - suffix = fp ? "nbe" : "a"; - else - gcc_unreachable (); + + case I387_CW_CEIL: + /* round up toward +oo */ + emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800))); + slot = SLOT_CW_CEIL; break; - case LT: - switch (mode) - { - case E_CCNOmode: - case E_CCGOCmode: - suffix = "s"; - break; - case E_CCmode: - case E_CCGCmode: - case E_CCGZmode: - suffix = "l"; - break; + default: + gcc_unreachable (); + } - default: - gcc_unreachable (); - } - break; - case LTU: - if (mode == CCmode || mode == CCGZmode) - suffix = "b"; - else if (mode == CCCmode) - suffix = fp ? "b" : "c"; - else - gcc_unreachable (); - break; - case GE: - switch (mode) - { - case E_CCNOmode: - case E_CCGOCmode: - suffix = "ns"; - break; + gcc_assert (slot < MAX_386_STACK_LOCALS); - case E_CCmode: - case E_CCGCmode: - case E_CCGZmode: - suffix = "ge"; - break; + new_mode = assign_386_stack_local (HImode, slot); + emit_move_insn (new_mode, reg); +} - default: - gcc_unreachable (); - } - break; - case GEU: - if (mode == CCmode || mode == CCGZmode) - suffix = "nb"; - else if (mode == CCCmode) - suffix = fp ? "nb" : "nc"; - else - gcc_unreachable (); - break; - case LE: - gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode); - suffix = "le"; - break; - case LEU: - if (mode == CCmode) - suffix = "be"; - else - gcc_unreachable (); +/* Generate one or more insns to set ENTITY to MODE. 
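Editorial aside, not part of the patch: the control-word initialization shown just above only toggles the two rounding-control bits (bits 10-11) of the x87 FCW and stores the result in a stack slot for fldcw. A small standalone sketch of that bit manipulation, assuming the standard FCW layout; names and the sample default word are illustrative.

/* Illustrative only: bits 10-11 of the x87 control word select the
   rounding mode; everything else is preserved from the stored word.
   The patch's truncate case can OR the mask in directly, because the
   truncate encoding (11) sets both bits regardless of their old value.  */
#include <stdio.h>

#define FCW_RC_MASK   0x0c00   /* rounding-control field            */
#define FCW_RC_FLOOR  0x0400   /* 01: round toward -infinity        */
#define FCW_RC_CEIL   0x0800   /* 10: round toward +infinity        */
#define FCW_RC_TRUNC  0x0c00   /* 11: round toward zero (truncate)  */

static unsigned short
cw_for (unsigned short stored_cw, unsigned short rc)
{
  return (unsigned short) ((stored_cw & ~FCW_RC_MASK) | rc);
}

int
main (void)
{
  unsigned short cw = 0x037f;              /* a typical default FCW */
  printf ("trunc: %#06x\n", (unsigned) cw_for (cw, FCW_RC_TRUNC));
  printf ("floor: %#06x\n", (unsigned) cw_for (cw, FCW_RC_FLOOR));
  printf ("ceil:  %#06x\n", (unsigned) cw_for (cw, FCW_RC_CEIL));
  return 0;
}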
*/ + +static void +ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED, + HARD_REG_SET regs_live ATTRIBUTE_UNUSED) +{ + switch (entity) + { + case X86_DIRFLAG: + if (mode == X86_DIRFLAG_RESET) + emit_insn (gen_cld ()); break; - case UNORDERED: - suffix = fp ? "u" : "p"; + case AVX_U128: + if (mode == AVX_U128_CLEAN) + emit_insn (gen_avx_vzeroupper ()); break; - case ORDERED: - suffix = fp ? "nu" : "np"; + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + if (mode != I387_CW_ANY + && mode != I387_CW_UNINITIALIZED) + emit_i387_cw_initialization (mode); break; default: gcc_unreachable (); } - fputs (suffix, file); } -/* Print the name of register X to FILE based on its machine mode and number. - If CODE is 'w', pretend the mode is HImode. - If CODE is 'b', pretend the mode is QImode. - If CODE is 'k', pretend the mode is SImode. - If CODE is 'q', pretend the mode is DImode. - If CODE is 'x', pretend the mode is V4SFmode. - If CODE is 't', pretend the mode is V8SFmode. - If CODE is 'g', pretend the mode is V16SFmode. - If CODE is 'h', pretend the reg is the 'high' byte register. - If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. - If CODE is 'd', duplicate the operand for AVX instruction. - If CODE is 'V', print naked full integer register name without %. - */ +/* Output code for INSN to convert a float to a signed int. OPERANDS + are the insn operands. The output may be [HSD]Imode and the input + operand may be [SDX]Fmode. */ -void -print_reg (rtx x, int code, FILE *file) +const char * +output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp) { - const char *reg; - int msize; - unsigned int regno; - bool duplicated; + bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); + bool dimode_p = GET_MODE (operands[0]) == DImode; + int round_mode = get_attr_i387_cw (insn); - if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V') - putc ('%', file); + static char buf[40]; + const char *p; - if (x == pc_rtx) - { - gcc_assert (TARGET_64BIT); - fputs ("rip", file); - return; - } + /* Jump through a hoop or two for DImode, since the hardware has no + non-popping instruction. We used to do this a different way, but + that was somewhat fragile and broke with post-reload splitters. 
*/ + if ((dimode_p || fisttp) && !stack_top_dies) + output_asm_insn ("fld\t%y1", operands); - if (code == 'y' && STACK_TOP_P (x)) - { - fputs ("st(0)", file); - return; - } + gcc_assert (STACK_TOP_P (operands[1])); + gcc_assert (MEM_P (operands[0])); + gcc_assert (GET_MODE (operands[1]) != TFmode); - if (code == 'w') - msize = 2; - else if (code == 'b') - msize = 1; - else if (code == 'k') - msize = 4; - else if (code == 'q') - msize = 8; - else if (code == 'h') - msize = 0; - else if (code == 'x') - msize = 16; - else if (code == 't') - msize = 32; - else if (code == 'g') - msize = 64; - else - msize = GET_MODE_SIZE (GET_MODE (x)); - - regno = REGNO (x); - - if (regno == ARG_POINTER_REGNUM - || regno == FRAME_POINTER_REGNUM - || regno == FPSR_REG) - { - output_operand_lossage - ("invalid use of register '%s'", reg_names[regno]); - return; - } - else if (regno == FLAGS_REG) - { - output_operand_lossage ("invalid use of asm flag output"); - return; - } + if (fisttp) + return "fisttp%Z0\t%0"; - if (code == 'V') - { - if (GENERAL_REGNO_P (regno)) - msize = GET_MODE_SIZE (word_mode); - else - error ("% modifier on non-integer register"); - } + strcpy (buf, "fist"); - duplicated = code == 'd' && TARGET_AVX; + if (round_mode != I387_CW_ANY) + output_asm_insn ("fldcw\t%3", operands); - switch (msize) - { - case 16: - case 12: - case 8: - if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode)) - warning (0, "unsupported size for integer register"); - /* FALLTHRU */ - case 4: - if (LEGACY_INT_REGNO_P (regno)) - putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file); - /* FALLTHRU */ - case 2: - normal: - reg = hi_reg_name[regno]; - break; - case 1: - if (regno >= ARRAY_SIZE (qi_reg_name)) - goto normal; - if (!ANY_QI_REGNO_P (regno)) - error ("unsupported size for integer register"); - reg = qi_reg_name[regno]; - break; - case 0: - if (regno >= ARRAY_SIZE (qi_high_reg_name)) - goto normal; - reg = qi_high_reg_name[regno]; - break; - case 32: - case 64: - if (SSE_REGNO_P (regno)) - { - gcc_assert (!duplicated); - putc (msize == 32 ? 'y' : 'z', file); - reg = hi_reg_name[regno] + 1; - break; - } - goto normal; - default: - gcc_unreachable (); - } + p = "p%Z0\t%0"; + strcat (buf, p + !(stack_top_dies || dimode_p)); - fputs (reg, file); + output_asm_insn (buf, operands); - /* Irritatingly, AMD extended registers use - different naming convention: "r%d[bwd]" */ - if (REX_INT_REGNO_P (regno)) - { - gcc_assert (TARGET_64BIT); - switch (msize) - { - case 0: - error ("extended registers have no high halves"); - break; - case 1: - putc ('b', file); - break; - case 2: - putc ('w', file); - break; - case 4: - putc ('d', file); - break; - case 8: - /* no suffix */ - break; - default: - error ("unsupported operand size for extended register"); - break; - } - return; - } + if (round_mode != I387_CW_ANY) + output_asm_insn ("fldcw\t%2", operands); - if (duplicated) - { - if (ASSEMBLER_DIALECT == ASM_ATT) - fprintf (file, ", %%%s", reg); - else - fprintf (file, ", %s", reg); - } + return ""; } -/* Meaning of CODE: - L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. - C -- print opcode suffix for set/cmov insn. - c -- like C, but print reversed condition - F,f -- likewise, but for floating-point. - O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", - otherwise nothing - R -- print embedded rounding and sae. - r -- print only sae. - z -- print the opcode suffix for the size of the current operand. - Z -- likewise, with special suffixes for x87 instructions. 
- * -- print a star (in certain assembler syntax) - A -- print an absolute memory reference. - E -- print address with DImode register names if TARGET_64BIT. - w -- print the operand as if it's a "word" (HImode) even if it isn't. - s -- print a shift double count, followed by the assemblers argument - delimiter. - b -- print the QImode name of the register for the indicated operand. - %b0 would print %al if operands[0] is reg 0. - w -- likewise, print the HImode name of the register. - k -- likewise, print the SImode name of the register. - q -- likewise, print the DImode name of the register. - x -- likewise, print the V4SFmode name of the register. - t -- likewise, print the V8SFmode name of the register. - g -- likewise, print the V16SFmode name of the register. - h -- print the QImode name for a "high" register, either ah, bh, ch or dh. - y -- print "st(0)" instead of "st" as a register. - d -- print duplicated register operand for AVX instruction. - D -- print condition for SSE cmp instruction. - P -- if PIC, print an @PLT suffix. - p -- print raw symbol name. - X -- don't print any sort of PIC '@' suffix for a symbol. - & -- print some in-use local-dynamic symbol name. - H -- print a memory address offset by 8; used for sse high-parts - Y -- print condition for XOP pcom* instruction. - V -- print naked full integer register name without %. - + -- print a branch hint as 'cs' or 'ds' prefix - ; -- print a semicolon (after prefixes due to bug in older gas). - ~ -- print "i" if TARGET_AVX2, "f" otherwise. - ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode - M -- print addr32 prefix for TARGET_X32 with VSIB address. - ! -- print NOTRACK prefix for jxx/call/ret instructions if required. - */ +/* Output code for x87 ffreep insn. The OPNO argument, which may only + have the values zero or one, indicates the ffreep insn's operand + from the OPERANDS array. */ -void -ix86_print_operand (FILE *file, rtx x, int code) +static const char * +output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) { - if (code) + if (TARGET_USE_FFREEP) +#ifdef HAVE_AS_IX86_FFREEP + return opno ? "ffreep\t%y1" : "ffreep\t%y0"; +#else { - switch (code) - { - case 'A': - switch (ASSEMBLER_DIALECT) - { - case ASM_ATT: - putc ('*', file); - break; - - case ASM_INTEL: - /* Intel syntax. For absolute addresses, registers should not - be surrounded by braces. */ - if (!REG_P (x)) - { - putc ('[', file); - ix86_print_operand (file, x, 0); - putc (']', file); - return; - } - break; + static char retval[32]; + int regno = REGNO (operands[opno]); - default: - gcc_unreachable (); - } + gcc_assert (STACK_REGNO_P (regno)); - ix86_print_operand (file, x, 0); - return; + regno -= FIRST_STACK_REG; - case 'E': - /* Wrap address in an UNSPEC to declare special handling. */ - if (TARGET_64BIT) - x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR); + snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno); + return retval; + } +#endif - output_address (VOIDmode, x); - return; + return opno ? "fstp\t%y1" : "fstp\t%y0"; +} - case 'L': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('l', file); - return; - case 'W': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('w', file); - return; +/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi + should be used. UNORDERED_P is true when fucom should be used. 
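Editorial aside, not part of the patch: the compare-output routine that follows picks among several x87 compare mnemonics. This is a condensed, standalone restatement of that selection with the reasoning as comments; the function and parameter names are illustrative, not the routine's actual interface.

/* Illustrative only: fcomi/fucomi write EFLAGS directly, while the
   plain fcom family sets the FPU status word, which the caller then
   transfers with fnstsw.  A trailing 'p' pops ST(0) when the top of
   the stack dies; fcompp/fucompp pop both operands.  */
#include <stdbool.h>
#include <stdio.h>

static const char *
fp_compare_mnemonic (bool eflags, bool unordered, bool both_stack_die,
                     bool other_is_zero, bool other_is_int_mem)
{
  if (eflags)
    return unordered ? "fucomi" : "fcomi";
  if (both_stack_die)
    return unordered ? "fucompp" : "fcompp";  /* compare and pop both */
  if (other_is_zero)
    return "ftst";                            /* compare ST(0) with 0.0 */
  if (other_is_int_mem)
    return "ficom";                           /* compare with integer memory */
  return unordered ? "fucom" : "fcom";
}

int
main (void)
{
  printf ("%s\n", fp_compare_mnemonic (false, true, true, false, false));
  return 0;
}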
*/ - case 'B': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('b', file); - return; +const char * +output_fp_compare (rtx_insn *insn, rtx *operands, + bool eflags_p, bool unordered_p) +{ + rtx *xops = eflags_p ? &operands[0] : &operands[1]; + bool stack_top_dies; - case 'Q': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('l', file); - return; + static char buf[40]; + const char *p; - case 'S': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('s', file); - return; + gcc_assert (STACK_TOP_P (xops[0])); - case 'T': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('t', file); - return; + stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); - case 'O': -#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX - if (ASSEMBLER_DIALECT != ASM_ATT) - return; - - switch (GET_MODE_SIZE (GET_MODE (x))) - { - case 2: - putc ('w', file); - break; - - case 4: - putc ('l', file); - break; + if (eflags_p) + { + p = unordered_p ? "fucomi" : "fcomi"; + strcpy (buf, p); - case 8: - putc ('q', file); - break; + p = "p\t{%y1, %0|%0, %y1}"; + strcat (buf, p + !stack_top_dies); - default: - output_operand_lossage ("invalid operand size for operand " - "code 'O'"); - return; - } + return buf; + } - putc ('.', file); -#endif - return; + if (STACK_REG_P (xops[1]) + && stack_top_dies + && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1)) + { + gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1); - case 'z': - if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) - { - /* Opcodes don't get size suffixes if using Intel opcodes. */ - if (ASSEMBLER_DIALECT == ASM_INTEL) - return; + /* If both the top of the 387 stack die, and the other operand + is also a stack register that dies, then this must be a + `fcompp' float compare. */ + p = unordered_p ? "fucompp" : "fcompp"; + strcpy (buf, p); + } + else if (const0_operand (xops[1], VOIDmode)) + { + gcc_assert (!unordered_p); + strcpy (buf, "ftst"); + } + else + { + if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT) + { + gcc_assert (!unordered_p); + p = "ficom"; + } + else + p = unordered_p ? "fucom" : "fcom"; - switch (GET_MODE_SIZE (GET_MODE (x))) - { - case 1: - putc ('b', file); - return; + strcpy (buf, p); - case 2: - putc ('w', file); - return; + p = "p%Z2\t%y2"; + strcat (buf, p + !stack_top_dies); + } - case 4: - putc ('l', file); - return; + output_asm_insn (buf, operands); + return "fnstsw\t%0"; +} - case 8: - putc ('q', file); - return; +void +ix86_output_addr_vec_elt (FILE *file, int value) +{ + const char *directive = ASM_LONG; - default: - output_operand_lossage ("invalid operand size for operand " - "code 'z'"); - return; - } - } +#ifdef ASM_QUAD + if (TARGET_LP64) + directive = ASM_QUAD; +#else + gcc_assert (!TARGET_64BIT); +#endif - if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) - warning (0, "non-integer operand used with operand code %"); - /* FALLTHRU */ + fprintf (file, "%s%s%d\n", directive, LPREFIX, value); +} - case 'Z': - /* 387 opcodes don't get size suffixes if using Intel opcodes. */ - if (ASSEMBLER_DIALECT == ASM_INTEL) - return; +void +ix86_output_addr_diff_elt (FILE *file, int value, int rel) +{ + const char *directive = ASM_LONG; - if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) - { - switch (GET_MODE_SIZE (GET_MODE (x))) - { - case 2: -#ifdef HAVE_AS_IX86_FILDS - putc ('s', file); +#ifdef ASM_QUAD + if (TARGET_64BIT && CASE_VECTOR_MODE == DImode) + directive = ASM_QUAD; +#else + gcc_assert (!TARGET_64BIT); #endif - return; + /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. 
*/ + if (TARGET_64BIT || TARGET_VXWORKS_RTP) + fprintf (file, "%s%s%d-%s%d\n", + directive, LPREFIX, value, LPREFIX, rel); +#if TARGET_MACHO + else if (TARGET_MACHO) + { + fprintf (file, ASM_LONG "%s%d-", LPREFIX, value); + machopic_output_function_base_name (file); + putc ('\n', file); + } +#endif + else if (HAVE_AS_GOTOFF_IN_DATA) + fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value); + else + asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n", + GOT_SYMBOL_NAME, LPREFIX, value); +} + +#define LEA_MAX_STALL (3) +#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1) - case 4: - putc ('l', file); - return; +/* Increase given DISTANCE in half-cycles according to + dependencies between PREV and NEXT instructions. + Add 1 half-cycle if there is no dependency and + go to next cycle if there is some dependecy. */ - case 8: -#ifdef HAVE_AS_IX86_FILDQ - putc ('q', file); -#else - fputs ("ll", file); -#endif - return; +static unsigned int +increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance) +{ + df_ref def, use; - default: - break; - } - } - else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) - { - /* 387 opcodes don't get size suffixes - if the operands are registers. */ - if (STACK_REG_P (x)) - return; + if (!prev || !next) + return distance + (distance & 1) + 2; - switch (GET_MODE_SIZE (GET_MODE (x))) - { - case 4: - putc ('s', file); - return; + if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev)) + return distance + 1; - case 8: - putc ('l', file); - return; + FOR_EACH_INSN_USE (use, next) + FOR_EACH_INSN_DEF (def, prev) + if (!DF_REF_IS_ARTIFICIAL (def) + && DF_REF_REGNO (use) == DF_REF_REGNO (def)) + return distance + (distance & 1) + 2; - case 12: - case 16: - putc ('t', file); - return; + return distance + 1; +} - default: - break; - } - } - else - { - output_operand_lossage ("invalid operand type used with " - "operand code 'Z'"); - return; - } +/* Function checks if instruction INSN defines register number + REGNO1 or REGNO2. */ - output_operand_lossage ("invalid operand size for operand code 'Z'"); - return; +bool +insn_defines_reg (unsigned int regno1, unsigned int regno2, + rtx_insn *insn) +{ + df_ref def; - case 'd': - case 'b': - case 'w': - case 'k': - case 'q': - case 'h': - case 't': - case 'g': - case 'y': - case 'x': - case 'X': - case 'P': - case 'p': - case 'V': - break; + FOR_EACH_INSN_DEF (def, insn) + if (DF_REF_REG_DEF_P (def) + && !DF_REF_IS_ARTIFICIAL (def) + && (regno1 == DF_REF_REGNO (def) + || regno2 == DF_REF_REGNO (def))) + return true; - case 's': - if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) - { - ix86_print_operand (file, x, 0); - fputs (", ", file); - } - return; + return false; +} - case 'Y': - switch (GET_CODE (x)) - { - case NE: - fputs ("neq", file); - break; - case EQ: - fputs ("eq", file); - break; - case GE: - case GEU: - fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file); - break; - case GT: - case GTU: - fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? 
"gt" : "unle", file); - break; - case LE: - case LEU: - fputs ("le", file); - break; - case LT: - case LTU: - fputs ("lt", file); - break; - case UNORDERED: - fputs ("unord", file); - break; - case ORDERED: - fputs ("ord", file); - break; - case UNEQ: - fputs ("ueq", file); - break; - case UNGE: - fputs ("nlt", file); - break; - case UNGT: - fputs ("nle", file); - break; - case UNLE: - fputs ("ule", file); - break; - case UNLT: - fputs ("ult", file); - break; - case LTGT: - fputs ("une", file); - break; - default: - output_operand_lossage ("operand is not a condition code, " - "invalid operand code 'Y'"); - return; - } - return; +/* Function checks if instruction INSN uses register number + REGNO as a part of address expression. */ - case 'D': - /* Little bit of braindamage here. The SSE compare instructions - does use completely different names for the comparisons that the - fp conditional moves. */ - switch (GET_CODE (x)) - { - case UNEQ: - if (TARGET_AVX) - { - fputs ("eq_us", file); - break; - } - /* FALLTHRU */ - case EQ: - fputs ("eq", file); - break; - case UNLT: - if (TARGET_AVX) - { - fputs ("nge", file); - break; - } - /* FALLTHRU */ - case LT: - fputs ("lt", file); - break; - case UNLE: - if (TARGET_AVX) - { - fputs ("ngt", file); - break; - } - /* FALLTHRU */ - case LE: - fputs ("le", file); - break; - case UNORDERED: - fputs ("unord", file); - break; - case LTGT: - if (TARGET_AVX) - { - fputs ("neq_oq", file); - break; - } - /* FALLTHRU */ - case NE: - fputs ("neq", file); - break; - case GE: - if (TARGET_AVX) - { - fputs ("ge", file); - break; - } - /* FALLTHRU */ - case UNGE: - fputs ("nlt", file); - break; - case GT: - if (TARGET_AVX) - { - fputs ("gt", file); - break; - } - /* FALLTHRU */ - case UNGT: - fputs ("nle", file); - break; - case ORDERED: - fputs ("ord", file); - break; - default: - output_operand_lossage ("operand is not a condition code, " - "invalid operand code 'D'"); - return; - } - return; +static bool +insn_uses_reg_mem (unsigned int regno, rtx insn) +{ + df_ref use; - case 'F': - case 'f': -#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('.', file); - gcc_fallthrough (); -#endif + FOR_EACH_INSN_USE (use, insn) + if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use)) + return true; - case 'C': - case 'c': - if (!COMPARISON_P (x)) - { - output_operand_lossage ("operand is not a condition code, " - "invalid operand code '%c'", code); - return; - } - put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), - code == 'c' || code == 'f', - code == 'F' || code == 'f', - file); - return; + return false; +} - case 'H': - if (!offsettable_memref_p (x)) - { - output_operand_lossage ("operand is not an offsettable memory " - "reference, invalid operand code 'H'"); - return; - } - /* It doesn't actually matter what mode we use here, as we're - only going to use this for printing. */ - x = adjust_address_nv (x, DImode, 8); - /* Output 'qword ptr' for intel assembler dialect. */ - if (ASSEMBLER_DIALECT == ASM_INTEL) - code = 'q'; - break; +/* Search backward for non-agu definition of register number REGNO1 + or register number REGNO2 in basic block starting from instruction + START up to head of basic block or instruction INSN. - case 'K': - if (!CONST_INT_P (x)) - { - output_operand_lossage ("operand is not an integer, invalid " - "operand code 'K'"); - return; - } + Function puts true value into *FOUND var if definition was found + and false otherwise. 
- if (INTVAL (x) & IX86_HLE_ACQUIRE) -#ifdef HAVE_AS_IX86_HLE - fputs ("xacquire ", file); -#else - fputs ("\n" ASM_BYTE "0xf2\n\t", file); -#endif - else if (INTVAL (x) & IX86_HLE_RELEASE) -#ifdef HAVE_AS_IX86_HLE - fputs ("xrelease ", file); -#else - fputs ("\n" ASM_BYTE "0xf3\n\t", file); -#endif - /* We do not want to print value of the operand. */ - return; + Distance in half-cycles between START and found instruction or head + of BB is added to DISTANCE and returned. */ - case 'N': - if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) - fputs ("{z}", file); - return; +static int +distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2, + rtx_insn *insn, int distance, + rtx_insn *start, bool *found) +{ + basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL; + rtx_insn *prev = start; + rtx_insn *next = NULL; - case 'r': - if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE) + *found = false; + + while (prev + && prev != insn + && distance < LEA_SEARCH_THRESHOLD) + { + if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev)) + { + distance = increase_distance (prev, next, distance); + if (insn_defines_reg (regno1, regno2, prev)) { - output_operand_lossage ("operand is not a specific integer, " - "invalid operand code 'r'"); - return; + if (recog_memoized (prev) < 0 + || get_attr_type (prev) != TYPE_LEA) + { + *found = true; + return distance; + } } - if (ASSEMBLER_DIALECT == ASM_INTEL) - fputs (", ", file); + next = prev; + } + if (prev == BB_HEAD (bb)) + break; - fputs ("{sae}", file); + prev = PREV_INSN (prev); + } - if (ASSEMBLER_DIALECT == ASM_ATT) - fputs (", ", file); + return distance; +} - return; +/* Search backward for non-agu definition of register number REGNO1 + or register number REGNO2 in INSN's basic block until + 1. Pass LEA_SEARCH_THRESHOLD instructions, or + 2. Reach neighbor BBs boundary, or + 3. Reach agu definition. + Returns the distance between the non-agu definition point and INSN. + If no definition point, returns -1. 
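Editorial aside, not part of the patch: the distance bookkeeping in these LEA helpers is kept in half-cycles, which is why the public entry points shift the accumulated value right by one before comparing it with LEA_MAX_STALL. A minimal standalone sketch of the rounding rule used by the increase-distance step; names are illustrative.

/* Illustrative only: independent adjacent insns advance the count by
   one half-cycle; a true dependency (or missing def/use information)
   rounds the count up to a whole cycle and then adds one full cycle.
   Callers convert back to cycles with ">> 1".  */
#include <stdbool.h>
#include <stdio.h>

static unsigned int
advance (unsigned int dist_half_cycles, bool dependent)
{
  if (dependent)
    /* Round up to an even half-cycle count, then add one full cycle.  */
    return dist_half_cycles + (dist_half_cycles & 1) + 2;
  return dist_half_cycles + 1;   /* same cycle, next issue slot */
}

int
main (void)
{
  unsigned int d = 0;
  d = advance (d, false);   /* 1 half-cycle            */
  d = advance (d, true);    /* dependency: 1 -> 4      */
  d = advance (d, false);   /* 5 half-cycles           */
  printf ("cycles seen by the heuristic: %u\n", d >> 1);
  return 0;
}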
*/ - case 'R': - if (!CONST_INT_P (x)) - { - output_operand_lossage ("operand is not an integer, invalid " - "operand code 'R'"); - return; - } +static int +distance_non_agu_define (unsigned int regno1, unsigned int regno2, + rtx_insn *insn) +{ + basic_block bb = BLOCK_FOR_INSN (insn); + int distance = 0; + bool found = false; - if (ASSEMBLER_DIALECT == ASM_INTEL) - fputs (", ", file); + if (insn != BB_HEAD (bb)) + distance = distance_non_agu_define_in_bb (regno1, regno2, insn, + distance, PREV_INSN (insn), + &found); - switch (INTVAL (x)) + if (!found && distance < LEA_SEARCH_THRESHOLD) + { + edge e; + edge_iterator ei; + bool simple_loop = false; + + FOR_EACH_EDGE (e, ei, bb->preds) + if (e->src == bb) + { + simple_loop = true; + break; + } + + if (simple_loop) + distance = distance_non_agu_define_in_bb (regno1, regno2, + insn, distance, + BB_END (bb), &found); + else + { + int shortest_dist = -1; + bool found_in_bb = false; + + FOR_EACH_EDGE (e, ei, bb->preds) { - case ROUND_NEAREST_INT | ROUND_SAE: - fputs ("{rn-sae}", file); - break; - case ROUND_NEG_INF | ROUND_SAE: - fputs ("{rd-sae}", file); - break; - case ROUND_POS_INF | ROUND_SAE: - fputs ("{ru-sae}", file); - break; - case ROUND_ZERO | ROUND_SAE: - fputs ("{rz-sae}", file); - break; - default: - output_operand_lossage ("operand is not a specific integer, " - "invalid operand code 'R'"); - } + int bb_dist + = distance_non_agu_define_in_bb (regno1, regno2, + insn, distance, + BB_END (e->src), + &found_in_bb); + if (found_in_bb) + { + if (shortest_dist < 0) + shortest_dist = bb_dist; + else if (bb_dist > 0) + shortest_dist = MIN (bb_dist, shortest_dist); - if (ASSEMBLER_DIALECT == ASM_ATT) - fputs (", ", file); + found = true; + } + } - return; + distance = shortest_dist; + } + } - case '*': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('*', file); - return; + /* get_attr_type may modify recog data. We want to make sure + that recog data is valid for instruction INSN, on which + distance_non_agu_define is called. INSN is unchanged here. */ + extract_insn_cached (insn); - case '&': - { - const char *name = get_some_local_dynamic_name (); - if (name == NULL) - output_operand_lossage ("'%%&' used without any " - "local dynamic TLS references"); - else - assemble_name (file, name); - return; - } + if (!found) + return -1; - case '+': - { - rtx x; + return distance >> 1; +} - if (!optimize - || optimize_function_for_size_p (cfun) - || !TARGET_BRANCH_PREDICTION_HINTS) - return; +/* Return the distance in half-cycles between INSN and the next + insn that uses register number REGNO in memory address added + to DISTANCE. Return -1 if REGNO0 is set. - x = find_reg_note (current_output_insn, REG_BR_PROB, 0); - if (x) - { - int pred_val = profile_probability::from_reg_br_prob_note - (XINT (x, 0)).to_reg_br_prob_base (); + Put true value into *FOUND if register usage was found and + false otherwise. + Put true value into *REDEFINED if register redefinition was + found and false otherwise. */ - if (pred_val < REG_BR_PROB_BASE * 45 / 100 - || pred_val > REG_BR_PROB_BASE * 55 / 100) - { - bool taken = pred_val > REG_BR_PROB_BASE / 2; - bool cputaken - = final_forward_branch_p (current_output_insn) == 0; +static int +distance_agu_use_in_bb (unsigned int regno, + rtx_insn *insn, int distance, rtx_insn *start, + bool *found, bool *redefined) +{ + basic_block bb = NULL; + rtx_insn *next = start; + rtx_insn *prev = NULL; - /* Emit hints only in the case default branch prediction - heuristics would fail. 
*/ - if (taken != cputaken) - { - /* We use 3e (DS) prefix for taken branches and - 2e (CS) prefix for not taken branches. */ - if (taken) - fputs ("ds ; ", file); - else - fputs ("cs ; ", file); - } - } - } - return; - } + *found = false; + *redefined = false; - case ';': -#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX - putc (';', file); -#endif - return; + if (start != NULL_RTX) + { + bb = BLOCK_FOR_INSN (start); + if (start != BB_HEAD (bb)) + /* If insn and start belong to the same bb, set prev to insn, + so the call to increase_distance will increase the distance + between insns by 1. */ + prev = insn; + } - case '~': - putc (TARGET_AVX2 ? 'i' : 'f', file); - return; + while (next + && next != insn + && distance < LEA_SEARCH_THRESHOLD) + { + if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next)) + { + distance = increase_distance(prev, next, distance); + if (insn_uses_reg_mem (regno, next)) + { + /* Return DISTANCE if OP0 is used in memory + address in NEXT. */ + *found = true; + return distance; + } - case 'M': - if (TARGET_X32) + if (insn_defines_reg (regno, INVALID_REGNUM, next)) { - /* NB: 32-bit indices in VSIB address are sign-extended - to 64 bits. In x32, if 32-bit address 0xf7fa3010 is - sign-extended to 0xfffffffff7fa3010 which is invalid - address. Add addr32 prefix if there is no base - register nor symbol. */ - bool ok; - struct ix86_address parts; - ok = ix86_decompose_address (x, &parts); - gcc_assert (ok && parts.index == NULL_RTX); - if (parts.base == NULL_RTX - && (parts.disp == NULL_RTX - || !symbolic_operand (parts.disp, - GET_MODE (parts.disp)))) - fputs ("addr32 ", file); + /* Return -1 if OP0 is set in NEXT. */ + *redefined = true; + return -1; } - return; - case '^': - if (TARGET_64BIT && Pmode != word_mode) - fputs ("addr32 ", file); - return; + prev = next; + } - case '!': - if (ix86_notrack_prefixed_insn_p (current_output_insn)) - fputs ("notrack ", file); - return; + if (next == BB_END (bb)) + break; - default: - output_operand_lossage ("invalid operand code '%c'", code); - } + next = NEXT_INSN (next); } - if (REG_P (x)) - print_reg (x, code, file); + return distance; +} - else if (MEM_P (x)) +/* Return the distance between INSN and the next insn that uses + register number REGNO0 in memory address. Return -1 if no such + a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */ + +static int +distance_agu_use (unsigned int regno0, rtx_insn *insn) +{ + basic_block bb = BLOCK_FOR_INSN (insn); + int distance = 0; + bool found = false; + bool redefined = false; + + if (insn != BB_END (bb)) + distance = distance_agu_use_in_bb (regno0, insn, distance, + NEXT_INSN (insn), + &found, &redefined); + + if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD) { - rtx addr = XEXP (x, 0); + edge e; + edge_iterator ei; + bool simple_loop = false; - /* No `byte ptr' prefix for call instructions ... */ - if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') - { - machine_mode mode = GET_MODE (x); - const char *size; + FOR_EACH_EDGE (e, ei, bb->succs) + if (e->dest == bb) + { + simple_loop = true; + break; + } - /* Check for explicit size override codes. */ - if (code == 'b') - size = "BYTE"; - else if (code == 'w') - size = "WORD"; - else if (code == 'k') - size = "DWORD"; - else if (code == 'q') - size = "QWORD"; - else if (code == 'x') - size = "XMMWORD"; - else if (code == 't') - size = "YMMWORD"; - else if (code == 'g') - size = "ZMMWORD"; - else if (mode == BLKmode) - /* ... or BLKmode operands, when not overridden. 
*/ - size = NULL; - else - switch (GET_MODE_SIZE (mode)) - { - case 1: size = "BYTE"; break; - case 2: size = "WORD"; break; - case 4: size = "DWORD"; break; - case 8: size = "QWORD"; break; - case 12: size = "TBYTE"; break; - case 16: - if (mode == XFmode) - size = "TBYTE"; - else - size = "XMMWORD"; - break; - case 32: size = "YMMWORD"; break; - case 64: size = "ZMMWORD"; break; - default: - gcc_unreachable (); - } - if (size) + if (simple_loop) + distance = distance_agu_use_in_bb (regno0, insn, + distance, BB_HEAD (bb), + &found, &redefined); + else + { + int shortest_dist = -1; + bool found_in_bb = false; + bool redefined_in_bb = false; + + FOR_EACH_EDGE (e, ei, bb->succs) { - fputs (size, file); - fputs (" PTR ", file); + int bb_dist + = distance_agu_use_in_bb (regno0, insn, + distance, BB_HEAD (e->dest), + &found_in_bb, &redefined_in_bb); + if (found_in_bb) + { + if (shortest_dist < 0) + shortest_dist = bb_dist; + else if (bb_dist > 0) + shortest_dist = MIN (bb_dist, shortest_dist); + + found = true; + } } - } - if (this_is_asm_operands && ! address_operand (addr, VOIDmode)) - output_operand_lossage ("invalid constraints for operand"); - else - ix86_print_operand_address_as - (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P'); + distance = shortest_dist; + } } - else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode) + if (!found || redefined) + return -1; + + return distance >> 1; +} + +/* Define this macro to tune LEA priority vs ADD, it take effect when + there is a dilemma of choicing LEA or ADD + Negative value: ADD is more preferred than LEA + Zero: Netrual + Positive value: LEA is more preferred than ADD*/ +#define IX86_LEA_PRIORITY 0 + +/* Return true if usage of lea INSN has performance advantage + over a sequence of instructions. Instructions sequence has + SPLIT_COST cycles higher latency than lea latency. */ + +static bool +ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1, + unsigned int regno2, int split_cost, bool has_scale) +{ + int dist_define, dist_use; + + /* For Silvermont if using a 2-source or 3-source LEA for + non-destructive destination purposes, or due to wanting + ability to use SCALE, the use of LEA is justified. */ + if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS + || TARGET_TREMONT || TARGET_INTEL) { - long l; + if (has_scale) + return true; + if (split_cost < 1) + return false; + if (regno0 == regno1 || regno0 == regno2) + return false; + return true; + } - REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l); + dist_define = distance_non_agu_define (regno1, regno2, insn); + dist_use = distance_agu_use (regno0, insn); - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); - /* Sign extend 32bit SFmode immediate to 8 bytes. */ - if (code == 'q') - fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x", - (unsigned long long) (int) l); + if (dist_define < 0 || dist_define >= LEA_MAX_STALL) + { + /* If there is no non AGU operand definition, no AGU + operand usage and split cost is 0 then both lea + and non lea variants have same priority. Currently + we prefer lea for 64 bit code and non lea on 32 bit + code. */ + if (dist_use < 0 && split_cost == 0) + return TARGET_64BIT || IX86_LEA_PRIORITY; else - fprintf (file, "0x%08x", (unsigned int) l); + return true; } - else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode) - { - long l[2]; + /* With longer definitions distance lea is more preferable. + Here we change it to take into account splitting cost and + lea priority. 
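Editorial aside, not part of the patch: once the backward definition distance and the forward address-use distance are known, the LEA-versus-split decision reduces to a short comparison. A standalone restatement of that rule; names are illustrative, the 64-bit preference is abstracted into a parameter, and the constants mirror the values used in the patch (IX86_LEA_PRIORITY is 0, LEA_MAX_STALL is 3).

/* Illustrative only: dist_define is how many cycles back the last
   non-AGU definition of an address input is, dist_use how soon the
   result is next used in a memory address; both are -1 when nothing
   was found within the search threshold.  */
#include <stdbool.h>
#include <stdio.h>

#define LEA_MAX_STALL  3
#define LEA_PRIORITY   0   /* 0 in the patch: neither form preferred */

static bool
lea_outperforms (int dist_define, int dist_use, int split_cost,
                 bool prefer_lea_when_unknown)
{
  /* No recent non-AGU definition: LEA cannot stall on its inputs, so
     keep it, unless nothing is known at all and splitting is free.  */
  if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
    return (dist_use < 0 && split_cost == 0)
           ? prefer_lea_when_unknown : true;

  /* Fold the cost of splitting into the definition distance.  */
  dist_define += split_cost + LEA_PRIORITY;

  /* No upcoming address use: split only if that is cheaper than the
     potential AGU stall.  */
  if (dist_use < 0)
    return dist_define > LEA_MAX_STALL;

  /* Both hazards present: whichever is closer decides.  */
  return dist_define >= dist_use;
}

int
main (void)
{
  /* Definition 1 cycle back, next address use 2 cycles ahead,
     splitting would add 1 cycle.  */
  printf ("keep lea: %d\n", lea_outperforms (1, 2, 1, false));
  return 0;
}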
*/ + dist_define += split_cost + IX86_LEA_PRIORITY; - REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l); + /* If there is no use in memory addess then we just check + that split cost exceeds AGU stall. */ + if (dist_use < 0) + return dist_define > LEA_MAX_STALL; - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); - fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff); - } + /* If this insn has both backward non-agu dependence and forward + agu dependence, the one with short distance takes effect. */ + return dist_define >= dist_use; +} - /* These float cases don't actually occur as immediate operands. */ - else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode) - { - char dstr[30]; +/* Return true if it is legal to clobber flags by INSN and + false otherwise. */ - real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); - fputs (dstr, file); - } +static bool +ix86_ok_to_clobber_flags (rtx_insn *insn) +{ + basic_block bb = BLOCK_FOR_INSN (insn); + df_ref use; + bitmap live; - else + while (insn) { - /* We have patterns that allow zero sets of memory, for instance. - In 64-bit mode, we should probably support all 8-byte vectors, - since we can in fact encode that into an immediate. */ - if (GET_CODE (x) == CONST_VECTOR) + if (NONDEBUG_INSN_P (insn)) { - if (x != CONST0_RTX (GET_MODE (x))) - output_operand_lossage ("invalid vector immediate"); - x = const0_rtx; - } + FOR_EACH_INSN_USE (use, insn) + if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG) + return false; - if (code != 'P' && code != 'p') - { - if (CONST_INT_P (x)) - { - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); - } - else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF - || GET_CODE (x) == LABEL_REF) - { - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); - else - fputs ("OFFSET FLAT:", file); - } + if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn)) + return true; } - if (CONST_INT_P (x)) - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); - else if (flag_pic || MACHOPIC_INDIRECT) - output_pic_addr_const (file, x, code); - else - output_addr_const (file, x); + + if (insn == BB_END (bb)) + break; + + insn = NEXT_INSN (insn); } -} -static bool -ix86_print_operand_punct_valid_p (unsigned char code) -{ - return (code == '*' || code == '+' || code == '&' || code == ';' - || code == '~' || code == '^' || code == '!'); + live = df_get_live_out(bb); + return !REGNO_REG_SET_P (live, FLAGS_REG); } - -/* Print a memory operand whose address is ADDR. */ -static void -ix86_print_operand_address_as (FILE *file, rtx addr, - addr_space_t as, bool no_rip) +/* Return true if we need to split op0 = op1 + op2 into a sequence of + move and add to avoid AGU stalls. */ + +bool +ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[]) { - struct ix86_address parts; - rtx base, index, disp; - int scale; - int ok; - bool vsib = false; - int code = 0; + unsigned int regno0, regno1, regno2; - if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR) - { - ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); - gcc_assert (parts.index == NULL_RTX); - parts.index = XVECEXP (addr, 0, 1); - parts.scale = INTVAL (XVECEXP (addr, 0, 2)); - addr = XVECEXP (addr, 0, 0); - vsib = true; - } - else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR) - { - gcc_assert (TARGET_64BIT); - ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); - code = 'q'; - } - else - ok = ix86_decompose_address (addr, &parts); + /* Check if we need to optimize. 
*/ + if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) + return false; - gcc_assert (ok); + /* Check it is correct to split here. */ + if (!ix86_ok_to_clobber_flags(insn)) + return false; - base = parts.base; - index = parts.index; - disp = parts.disp; - scale = parts.scale; + regno0 = true_regnum (operands[0]); + regno1 = true_regnum (operands[1]); + regno2 = true_regnum (operands[2]); - if (ADDR_SPACE_GENERIC_P (as)) - as = parts.seg; + /* We need to split only adds with non destructive + destination operand. */ + if (regno0 == regno1 || regno0 == regno2) + return false; else - gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg)); - - if (!ADDR_SPACE_GENERIC_P (as)) - { - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('%', file); - - switch (as) - { - case ADDR_SPACE_SEG_FS: - fputs ("fs:", file); - break; - case ADDR_SPACE_SEG_GS: - fputs ("gs:", file); - break; - default: - gcc_unreachable (); - } - } - - /* Use one byte shorter RIP relative addressing for 64bit mode. */ - if (TARGET_64BIT && !base && !index && !no_rip) - { - rtx symbol = disp; + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); +} - if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == PLUS - && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) - symbol = XEXP (XEXP (disp, 0), 0); +/* Return true if we should emit lea instruction instead of mov + instruction. */ - if (GET_CODE (symbol) == LABEL_REF - || (GET_CODE (symbol) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (symbol) == 0)) - base = pc_rtx; - } +bool +ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[]) +{ + unsigned int regno0, regno1; - if (!base && !index) - { - /* Displacement only requires special attention. */ - if (CONST_INT_P (disp)) - { - if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as)) - fputs ("ds:", file); - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp)); - } - /* Load the external function address via the GOT slot to avoid PLT. */ - else if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == UNSPEC - && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL - || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT) - && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) - output_pic_addr_const (file, disp, 0); - else if (flag_pic) - output_pic_addr_const (file, disp, 0); - else - output_addr_const (file, disp); - } - else - { - /* Print SImode register names to force addr32 prefix. */ - if (SImode_address_operand (addr, VOIDmode)) - { - if (flag_checking) - { - gcc_assert (TARGET_64BIT); - switch (GET_CODE (addr)) - { - case SUBREG: - gcc_assert (GET_MODE (addr) == SImode); - gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode); - break; - case ZERO_EXTEND: - case AND: - gcc_assert (GET_MODE (addr) == DImode); - break; - default: - gcc_unreachable (); - } - } - gcc_assert (!code); - code = 'k'; - } - else if (code == 0 - && TARGET_X32 - && disp - && CONST_INT_P (disp) - && INTVAL (disp) < -16*1024*1024) - { - /* X32 runs in 64-bit mode, where displacement, DISP, in - address DISP(%r64), is encoded as 32-bit immediate sign- - extended from 32-bit to 64-bit. For -0x40000300(%r64), - address is %r64 + 0xffffffffbffffd00. When %r64 < - 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64, - which is invalid for x32. The correct address is %r64 - - 0x40000300 == 0xf7ffdd64. To properly encode - -0x40000300(%r64) for x32, we zero-extend negative - displacement by forcing addr32 prefix which truncates - 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should - zero-extend all negative displacements, including -1(%rsp). 
- However, for small negative displacements, sign-extension - won't cause overflow. We only zero-extend negative - displacements if they < -16*1024*1024, which is also used - to check legitimate address displacements for PIC. */ - code = 'k'; - } + /* Check if we need to optimize. */ + if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) + return false; - /* Since the upper 32 bits of RSP are always zero for x32, - we can encode %esp as %rsp to avoid 0x67 prefix if - there is no index register. */ - if (TARGET_X32 && Pmode == SImode - && !index && base && REG_P (base) && REGNO (base) == SP_REG) - code = 'q'; + /* Use lea for reg to reg moves only. */ + if (!REG_P (operands[0]) || !REG_P (operands[1])) + return false; - if (ASSEMBLER_DIALECT == ASM_ATT) - { - if (disp) - { - if (flag_pic) - output_pic_addr_const (file, disp, 0); - else if (GET_CODE (disp) == LABEL_REF) - output_asm_label (disp); - else - output_addr_const (file, disp); - } + regno0 = true_regnum (operands[0]); + regno1 = true_regnum (operands[1]); - putc ('(', file); - if (base) - print_reg (base, code, file); - if (index) - { - putc (',', file); - print_reg (index, vsib ? 0 : code, file); - if (scale != 1 || vsib) - fprintf (file, ",%d", scale); - } - putc (')', file); - } - else - { - rtx offset = NULL_RTX; + return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); +} - if (disp) - { - /* Pull out the offset of a symbol; print any symbol itself. */ - if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == PLUS - && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) - { - offset = XEXP (XEXP (disp, 0), 1); - disp = gen_rtx_CONST (VOIDmode, - XEXP (XEXP (disp, 0), 0)); - } +/* Return true if we need to split lea into a sequence of + instructions to avoid AGU stalls. */ - if (flag_pic) - output_pic_addr_const (file, disp, 0); - else if (GET_CODE (disp) == LABEL_REF) - output_asm_label (disp); - else if (CONST_INT_P (disp)) - offset = disp; - else - output_addr_const (file, disp); - } +bool +ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[]) +{ + unsigned int regno0, regno1, regno2; + int split_cost; + struct ix86_address parts; + int ok; - putc ('[', file); - if (base) - { - print_reg (base, code, file); - if (offset) - { - if (INTVAL (offset) >= 0) - putc ('+', file); - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); - } - } - else if (offset) - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); - else - putc ('0', file); + /* Check we need to optimize. */ + if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun)) + return false; - if (index) - { - putc ('+', file); - print_reg (index, vsib ? 0 : code, file); - if (scale != 1 || vsib) - fprintf (file, "*%d", scale); - } - putc (']', file); - } - } -} + /* The "at least two components" test below might not catch simple + move or zero extension insns if parts.base is non-NULL and parts.disp + is const0_rtx as the only components in the address, e.g. if the + register is %rbp or %r13. As this test is much cheaper and moves or + zero extensions are the common case, do this check first. */ + if (REG_P (operands[1]) + || (SImode_address_operand (operands[1], VOIDmode) + && REG_P (XEXP (operands[1], 0)))) + return false; -static void -ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) -{ - ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false); -} + /* Check if it is OK to split here. 
*/ + if (!ix86_ok_to_clobber_flags (insn)) + return false; -/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */ + ok = ix86_decompose_address (operands[1], &parts); + gcc_assert (ok); -static bool -i386_asm_output_addr_const_extra (FILE *file, rtx x) -{ - rtx op; + /* There should be at least two components in the address. */ + if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX) + + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2) + return false; - if (GET_CODE (x) != UNSPEC) + /* We should not split into add if non legitimate pic + operand is used as displacement. */ + if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp)) return false; - op = XVECEXP (x, 0, 0); - switch (XINT (x, 1)) - { - case UNSPEC_GOTOFF: - output_addr_const (file, op); - fputs ("@gotoff", file); - break; - case UNSPEC_GOTTPOFF: - output_addr_const (file, op); - /* FIXME: This might be @TPOFF in Sun ld. */ - fputs ("@gottpoff", file); - break; - case UNSPEC_TPOFF: - output_addr_const (file, op); - fputs ("@tpoff", file); - break; - case UNSPEC_NTPOFF: - output_addr_const (file, op); - if (TARGET_64BIT) - fputs ("@tpoff", file); - else - fputs ("@ntpoff", file); - break; - case UNSPEC_DTPOFF: - output_addr_const (file, op); - fputs ("@dtpoff", file); - break; - case UNSPEC_GOTNTPOFF: - output_addr_const (file, op); - if (TARGET_64BIT) - fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@gottpoff(%rip)" : "@gottpoff[rip]", file); - else - fputs ("@gotntpoff", file); - break; - case UNSPEC_INDNTPOFF: - output_addr_const (file, op); - fputs ("@indntpoff", file); - break; -#if TARGET_MACHO - case UNSPEC_MACHOPIC_OFFSET: - output_addr_const (file, op); - putc ('-', file); - machopic_output_function_base_name (file); - break; -#endif - - default: - return false; - } + regno0 = true_regnum (operands[0]) ; + regno1 = INVALID_REGNUM; + regno2 = INVALID_REGNUM; - return true; -} - -/* Split one or more double-mode RTL references into pairs of half-mode - references. The RTL can be REG, offsettable MEM, integer constant, or - CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to - split and "num" is its length. lo_half and hi_half are output arrays - that parallel "operands". */ + if (parts.base) + regno1 = true_regnum (parts.base); + if (parts.index) + regno2 = true_regnum (parts.index); -void -split_double_mode (machine_mode mode, rtx operands[], - int num, rtx lo_half[], rtx hi_half[]) -{ - machine_mode half_mode; - unsigned int byte; + split_cost = 0; - switch (mode) + /* Compute how many cycles we will add to execution time + if split lea into a sequence of instructions. */ + if (parts.base || parts.index) { - case E_TImode: - half_mode = DImode; - break; - case E_DImode: - half_mode = SImode; - break; - default: - gcc_unreachable (); - } - - byte = GET_MODE_SIZE (half_mode); + /* Have to use mov instruction if non desctructive + destination form is used. */ + if (regno1 != regno0 && regno2 != regno0) + split_cost += 1; - while (num--) - { - rtx op = operands[num]; + /* Have to add index to base if both exist. */ + if (parts.base && parts.index) + split_cost += 1; - /* simplify_subreg refuse to split volatile memory addresses, - but we still have to handle it. */ - if (MEM_P (op)) - { - lo_half[num] = adjust_address (op, half_mode, 0); - hi_half[num] = adjust_address (op, half_mode, byte); - } - else + /* Have to use shift and adds if scale is 2 or greater. */ + if (parts.scale > 1) { - lo_half[num] = simplify_gen_subreg (half_mode, op, - GET_MODE (op) == VOIDmode - ? 
mode : GET_MODE (op), 0); - hi_half[num] = simplify_gen_subreg (half_mode, op, - GET_MODE (op) == VOIDmode - ? mode : GET_MODE (op), byte); + if (regno0 != regno1) + split_cost += 1; + else if (regno2 == regno0) + split_cost += 4; + else + split_cost += parts.scale; } - } -} - -/* Output code to perform a 387 binary operation in INSN, one of PLUS, - MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3] - is the expression of the binary operation. The output may either be - emitted here, or returned to the caller, like all output_* functions. - There is no guarantee that the operands are the same mode, as they - might be within FLOAT or FLOAT_EXTEND expressions. */ + /* Have to use add instruction with immediate if + disp is non zero. */ + if (parts.disp && parts.disp != const0_rtx) + split_cost += 1; -#ifndef SYSV386_COMPAT -/* Set to 1 for compatibility with brain-damaged assemblers. No-one - wants to fix the assemblers because that causes incompatibility - with gcc. No-one wants to fix gcc because that causes - incompatibility with assemblers... You can use the option of - -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */ -#define SYSV386_COMPAT 1 -#endif + /* Subtract the price of lea. */ + split_cost -= 1; + } -const char * -output_387_binary_op (rtx_insn *insn, rtx *operands) -{ - static char buf[40]; - const char *p; - bool is_sse - = (SSE_REG_P (operands[0]) - || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2])); + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, + parts.scale > 1); +} - if (is_sse) - p = "%v"; - else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT - || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) - p = "fi"; - else - p = "f"; +/* Return true if it is ok to optimize an ADD operation to LEA + operation to avoid flag register consumation. For most processors, + ADD is faster than LEA. For the processors like BONNELL, if the + destination register of LEA holds an actual address which will be + used soon, LEA is better and otherwise ADD is better. */ - strcpy (buf, p); +bool +ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[]) +{ + unsigned int regno0 = true_regnum (operands[0]); + unsigned int regno1 = true_regnum (operands[1]); + unsigned int regno2 = true_regnum (operands[2]); - switch (GET_CODE (operands[3])) - { - case PLUS: - p = "add"; break; - case MINUS: - p = "sub"; break; - case MULT: - p = "mul"; break; - case DIV: - p = "div"; break; - default: - gcc_unreachable (); - } + /* If a = b + c, (a!=b && a!=c), must use lea form. */ + if (regno0 != regno1 && regno0 != regno2) + return true; - strcat (buf, p); + if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) + return false; - if (is_sse) - { - p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd"; - strcat (buf, p); + return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); +} - if (TARGET_AVX) - p = "\t{%2, %1, %0|%0, %1, %2}"; - else - p = "\t{%2, %0|%0, %2}"; +/* Return true if destination reg of SET_BODY is shift count of + USE_BODY. */ - strcat (buf, p); - return buf; - } +static bool +ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body) +{ + rtx set_dest; + rtx shift_rtx; + int i; - /* Even if we do not want to check the inputs, this documents input - constraints. Which helps in understanding the following code. */ - if (flag_checking) + /* Retrieve destination of SET_BODY. 
*/ + switch (GET_CODE (set_body)) { - if (STACK_REG_P (operands[0]) - && ((REG_P (operands[1]) - && REGNO (operands[0]) == REGNO (operands[1]) - && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) - || (REG_P (operands[2]) - && REGNO (operands[0]) == REGNO (operands[2]) - && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) - && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) - ; /* ok */ - else - gcc_unreachable (); + case SET: + set_dest = SET_DEST (set_body); + if (!set_dest || !REG_P (set_dest)) + return false; + break; + case PARALLEL: + for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--) + if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i), + use_body)) + return true; + /* FALLTHROUGH */ + default: + return false; } - switch (GET_CODE (operands[3])) + /* Retrieve shift count of USE_BODY. */ + switch (GET_CODE (use_body)) { - case MULT: - case PLUS: - if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2])) - std::swap (operands[1], operands[2]); + case SET: + shift_rtx = XEXP (use_body, 1); + break; + case PARALLEL: + for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--) + if (ix86_dep_by_shift_count_body (set_body, + XVECEXP (use_body, 0, i))) + return true; + /* FALLTHROUGH */ + default: + return false; + } - /* know operands[0] == operands[1]. */ + if (shift_rtx + && (GET_CODE (shift_rtx) == ASHIFT + || GET_CODE (shift_rtx) == LSHIFTRT + || GET_CODE (shift_rtx) == ASHIFTRT + || GET_CODE (shift_rtx) == ROTATE + || GET_CODE (shift_rtx) == ROTATERT)) + { + rtx shift_count = XEXP (shift_rtx, 1); - if (MEM_P (operands[2])) + /* Return true if shift count is dest of SET_BODY. */ + if (REG_P (shift_count)) { - p = "%Z2\t%2"; - break; + /* Add check since it can be invoked before register + allocation in pre-reload schedule. */ + if (reload_completed + && true_regnum (set_dest) == true_regnum (shift_count)) + return true; + else if (REGNO(set_dest) == REGNO(shift_count)) + return true; } + } - if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) - { - if (STACK_TOP_P (operands[0])) - /* How is it that we are storing to a dead operand[2]? - Well, presumably operands[1] is dead too. We can't - store the result to st(0) as st(0) gets popped on this - instruction. Instead store to operands[2] (which I - think has to be st(1)). st(1) will be popped later. - gcc <= 2.8.1 didn't have this check and generated - assembly code that the Unixware assembler rejected. */ - p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ - else - p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ - break; - } - - if (STACK_TOP_P (operands[0])) - p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ - else - p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ - break; - - case MINUS: - case DIV: - if (MEM_P (operands[1])) - { - p = "r%Z1\t%1"; - break; - } - - if (MEM_P (operands[2])) - { - p = "%Z2\t%2"; - break; - } - - if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) - { -#if SYSV386_COMPAT - /* The SystemV/386 SVR3.2 assembler, and probably all AT&T - derived assemblers, confusingly reverse the direction of - the operation for fsub{r} and fdiv{r} when the - destination register is not st(0). The Intel assembler - doesn't have this brain damage. Read !SYSV386_COMPAT to - figure out what the hardware really does. */ - if (STACK_TOP_P (operands[0])) - p = "{p\t%0, %2|rp\t%2, %0}"; - else - p = "{rp\t%2, %0|p\t%0, %2}"; -#else - if (STACK_TOP_P (operands[0])) - /* As above for fmul/fadd, we can't store to st(0). 
*/ - p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ - else - p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ -#endif - break; - } - - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - { -#if SYSV386_COMPAT - if (STACK_TOP_P (operands[0])) - p = "{rp\t%0, %1|p\t%1, %0}"; - else - p = "{p\t%1, %0|rp\t%0, %1}"; -#else - if (STACK_TOP_P (operands[0])) - p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */ - else - p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */ -#endif - break; - } - - if (STACK_TOP_P (operands[0])) - { - if (STACK_TOP_P (operands[1])) - p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ - else - p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */ - break; - } - else if (STACK_TOP_P (operands[1])) - { -#if SYSV386_COMPAT - p = "{\t%1, %0|r\t%0, %1}"; -#else - p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */ -#endif - } - else - { -#if SYSV386_COMPAT - p = "{r\t%2, %0|\t%0, %2}"; -#else - p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ -#endif - } - break; - - default: - gcc_unreachable (); - } - - strcat (buf, p); - return buf; + return false; } -/* Return needed mode for entity in optimize_mode_switching pass. */ +/* Return true if destination reg of SET_INSN is shift count of + USE_INSN. */ -static int -ix86_dirflag_mode_needed (rtx_insn *insn) +bool +ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn) { - if (CALL_P (insn)) - { - if (cfun->machine->func_type == TYPE_NORMAL) - return X86_DIRFLAG_ANY; - else - /* No need to emit CLD in interrupt handler for TARGET_CLD. */ - return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET; - } - - if (recog_memoized (insn) < 0) - return X86_DIRFLAG_ANY; + return ix86_dep_by_shift_count_body (PATTERN (set_insn), + PATTERN (use_insn)); +} - if (get_attr_type (insn) == TYPE_STR) - { - /* Emit cld instruction if stringops are used in the function. */ - if (cfun->machine->func_type == TYPE_NORMAL) - return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY; - else - return X86_DIRFLAG_RESET; - } +/* Return TRUE or FALSE depending on whether the unary operator meets the + appropriate constraints. */ - return X86_DIRFLAG_ANY; +bool +ix86_unary_operator_ok (enum rtx_code, + machine_mode, + rtx operands[2]) +{ + /* If one of operands is memory, source and destination must match. */ + if ((MEM_P (operands[0]) + || MEM_P (operands[1])) + && ! rtx_equal_p (operands[0], operands[1])) + return false; + return true; } -/* Check if a 256bit or 512 bit AVX register is referenced inside of EXP. */ +/* Return TRUE if the operands to a vec_interleave_{high,low}v2df + are ok, keeping in mind the possible movddup alternative. */ -static bool -ix86_check_avx_upper_register (const_rtx exp) +bool +ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high) { - return SSE_REG_P (exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128; + if (MEM_P (operands[0])) + return rtx_equal_p (operands[0], operands[1 + high]); + if (MEM_P (operands[1]) && MEM_P (operands[2])) + return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]); + return true; } -/* Return needed mode for entity in optimize_mode_switching pass. */ +/* A subroutine of ix86_build_signbit_mask. If VECT is true, + then replicate the value for all elements of the vector + register. 
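Editorial aside, not part of the patch: the sign-bit mask built below is a vector constant whose only set bit is the sign bit of each element, so float abs and neg reduce to AND/XOR with that mask (and copysign to a combination of both). A small standalone sketch of the constants involved, assuming IEEE single- and double-precision layouts; it operates on scalars only, for illustration.

/* Illustrative only: abs(x) clears the sign bit (AND with the inverted
   mask), neg(x) flips it (XOR with the mask).  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const uint32_t sign32 = UINT32_C (1) << 31;     /* SFmode sign bit */
  const uint64_t sign64 = UINT64_C (1) << 63;     /* DFmode sign bit */

  float f = -3.5f;
  uint32_t bits;
  memcpy (&bits, &f, sizeof bits);

  uint32_t abs_bits = bits & ~sign32;   /* what AND with the inverted mask does */
  uint32_t neg_bits = bits ^ sign32;    /* what XOR with the mask does */

  float abs_f, neg_f;
  memcpy (&abs_f, &abs_bits, sizeof abs_f);
  memcpy (&neg_f, &neg_bits, sizeof neg_f);

  printf ("mask32=%#010x mask64=%#018llx abs=%g neg=%g\n",
          (unsigned) sign32, (unsigned long long) sign64, abs_f, neg_f);
  return 0;
}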
*/ -static int -ix86_avx_u128_mode_needed (rtx_insn *insn) +rtx +ix86_build_const_vector (machine_mode mode, bool vect, rtx value) { - if (CALL_P (insn)) - { - rtx link; + int i, n_elt; + rtvec v; + machine_mode scalar_mode; - /* Needed mode is set to AVX_U128_CLEAN if there are - no 256bit or 512bit modes used in function arguments. */ - for (link = CALL_INSN_FUNCTION_USAGE (insn); - link; - link = XEXP (link, 1)) - { - if (GET_CODE (XEXP (link, 0)) == USE) - { - rtx arg = XEXP (XEXP (link, 0), 0); + switch (mode) + { + case E_V64QImode: + case E_V32QImode: + case E_V16QImode: + case E_V32HImode: + case E_V16HImode: + case E_V8HImode: + case E_V16SImode: + case E_V8SImode: + case E_V4SImode: + case E_V8DImode: + case E_V4DImode: + case E_V2DImode: + gcc_assert (vect); + /* FALLTHRU */ + case E_V16SFmode: + case E_V8SFmode: + case E_V4SFmode: + case E_V8DFmode: + case E_V4DFmode: + case E_V2DFmode: + n_elt = GET_MODE_NUNITS (mode); + v = rtvec_alloc (n_elt); + scalar_mode = GET_MODE_INNER (mode); - if (ix86_check_avx_upper_register (arg)) - return AVX_U128_DIRTY; - } - } + RTVEC_ELT (v, 0) = value; - return AVX_U128_CLEAN; - } + for (i = 1; i < n_elt; ++i) + RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode); - /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced. - Hardware changes state only when a 256bit register is written to, - but we need to prevent the compiler from moving optimal insertion - point above eventual read from 256bit or 512 bit register. */ - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) - if (ix86_check_avx_upper_register (*iter)) - return AVX_U128_DIRTY; + return gen_rtx_CONST_VECTOR (mode, v); - return AVX_U128_ANY; + default: + gcc_unreachable (); + } } -/* Return mode that i387 must be switched into - prior to the execution of insn. */ +/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders + and ix86_expand_int_vcond. Create a mask for the sign bit in MODE + for an SSE register. If VECT is true, then replicate the mask for + all elements of the vector register. If INVERT is true, then create + a mask excluding the sign bit. */ -static int -ix86_i387_mode_needed (int entity, rtx_insn *insn) +rtx +ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) { - enum attr_i387_cw mode; - - /* The mode UNINITIALIZED is used to store control word after a - function call or ASM pattern. The mode ANY specify that function - has no requirements on the control word and make no changes in the - bits we are interested in. 
*/ + machine_mode vec_mode, imode; + wide_int w; + rtx mask, v; - if (CALL_P (insn) - || (NONJUMP_INSN_P (insn) - && (asm_noperands (PATTERN (insn)) >= 0 - || GET_CODE (PATTERN (insn)) == ASM_INPUT))) - return I387_CW_UNINITIALIZED; - - if (recog_memoized (insn) < 0) - return I387_CW_ANY; - - mode = get_attr_i387_cw (insn); - - switch (entity) + switch (mode) { - case I387_TRUNC: - if (mode == I387_CW_TRUNC) - return mode; + case E_V16SImode: + case E_V16SFmode: + case E_V8SImode: + case E_V4SImode: + case E_V8SFmode: + case E_V4SFmode: + vec_mode = mode; + imode = SImode; break; - case I387_FLOOR: - if (mode == I387_CW_FLOOR) - return mode; + case E_V8DImode: + case E_V4DImode: + case E_V2DImode: + case E_V8DFmode: + case E_V4DFmode: + case E_V2DFmode: + vec_mode = mode; + imode = DImode; break; - case I387_CEIL: - if (mode == I387_CW_CEIL) - return mode; + case E_TImode: + case E_TFmode: + vec_mode = VOIDmode; + imode = TImode; break; default: gcc_unreachable (); } - return I387_CW_ANY; -} + machine_mode inner_mode = GET_MODE_INNER (mode); + w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1, + GET_MODE_BITSIZE (inner_mode)); + if (invert) + w = wi::bit_not (w); -/* Return mode that entity must be switched into - prior to the execution of insn. */ + /* Force this value into the low part of a fp vector constant. */ + mask = immed_wide_int_const (w, imode); + mask = gen_lowpart (inner_mode, mask); -static int -ix86_mode_needed (int entity, rtx_insn *insn) -{ - switch (entity) - { - case X86_DIRFLAG: - return ix86_dirflag_mode_needed (insn); - case AVX_U128: - return ix86_avx_u128_mode_needed (insn); - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - return ix86_i387_mode_needed (entity, insn); - default: - gcc_unreachable (); - } - return 0; -} + if (vec_mode == VOIDmode) + return force_reg (inner_mode, mask); -/* Check if a 256bit or 512bit AVX register is referenced in stores. */ - -static void -ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) - { - if (ix86_check_avx_upper_register (dest)) - { - bool *used = (bool *) data; - *used = true; - } - } + v = ix86_build_const_vector (vec_mode, vect, mask); + return force_reg (vec_mode, v); +} -/* Calculate mode of upper 128bit AVX registers after the insn. */ +/* Return TRUE or FALSE depending on whether the first SET in INSN + has source and destination with matching CC modes, and that the + CC mode is at least as constrained as REQ_MODE. */ -static int -ix86_avx_u128_mode_after (int mode, rtx_insn *insn) +bool +ix86_match_ccmode (rtx insn, machine_mode req_mode) { - rtx pat = PATTERN (insn); + rtx set; + machine_mode set_mode; - if (vzeroupper_pattern (pat, VOIDmode) - || vzeroall_pattern (pat, VOIDmode)) - return AVX_U128_CLEAN; + set = PATTERN (insn); + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); + gcc_assert (GET_CODE (set) == SET); + gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE); - /* We know that state is clean after CALL insn if there are no - 256bit or 512bit registers used in the function return register. */ - if (CALL_P (insn)) + set_mode = GET_MODE (SET_DEST (set)); + switch (set_mode) { - bool avx_upper_reg_found = false; - note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found); - - return avx_upper_reg_found ? 
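The mask built by ix86_build_signbit_mask is just the IEEE sign bit of each element (or its complement when INVERT is set): negation XORs with it and absolute value ANDs with the inverted form. A standalone sketch with scalar bit-twiddling (the names here are illustrative only, not GCC code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* Sign-bit mask for an SFmode element (invert == false) and the
     complementary mask (invert == true).  */
  uint32_t sign_mask = UINT32_C (1) << 31;
  uint32_t abs_mask  = ~sign_mask;

  float x = -1.5f, negated, absolute;
  uint32_t bits;

  memcpy (&bits, &x, sizeof bits);
  bits ^= sign_mask;                    /* negation: xorps with the mask */
  memcpy (&negated, &bits, sizeof negated);

  memcpy (&bits, &x, sizeof bits);
  bits &= abs_mask;                     /* abs: andps with inverted mask */
  memcpy (&absolute, &bits, sizeof absolute);

  printf ("masks %#x %#x  neg(-1.5)=%g  abs(-1.5)=%g\n",
          (unsigned) sign_mask, (unsigned) abs_mask,
          (double) negated, (double) absolute);
  return 0;
}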
AVX_U128_DIRTY : AVX_U128_CLEAN; - } + case E_CCNOmode: + if (req_mode != CCNOmode + && (req_mode != CCmode + || XEXP (SET_SRC (set), 1) != const0_rtx)) + return false; + break; + case E_CCmode: + if (req_mode == CCGCmode) + return false; + /* FALLTHRU */ + case E_CCGCmode: + if (req_mode == CCGOCmode || req_mode == CCNOmode) + return false; + /* FALLTHRU */ + case E_CCGOCmode: + if (req_mode == CCZmode) + return false; + /* FALLTHRU */ + case E_CCZmode: + break; - /* Otherwise, return current mode. Remember that if insn - references AVX 256bit or 512bit registers, the mode was already - changed to DIRTY from MODE_NEEDED. */ - return mode; -} + case E_CCGZmode: -/* Return the mode that an insn results in. */ + case E_CCAmode: + case E_CCCmode: + case E_CCOmode: + case E_CCPmode: + case E_CCSmode: + if (set_mode != req_mode) + return false; + break; -static int -ix86_mode_after (int entity, int mode, rtx_insn *insn) -{ - switch (entity) - { - case X86_DIRFLAG: - return mode; - case AVX_U128: - return ix86_avx_u128_mode_after (mode, insn); - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - return mode; default: gcc_unreachable (); } -} - -static int -ix86_dirflag_mode_entry (void) -{ - /* For TARGET_CLD or in the interrupt handler we can't assume - direction flag state at function entry. */ - if (TARGET_CLD - || cfun->machine->func_type != TYPE_NORMAL) - return X86_DIRFLAG_ANY; - return X86_DIRFLAG_RESET; + return GET_MODE (SET_SRC (set)) == set_mode; } -static int -ix86_avx_u128_mode_entry (void) +machine_mode +ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) { - tree arg; + machine_mode mode = GET_MODE (op0); - /* Entry mode is set to AVX_U128_DIRTY if there are - 256bit or 512bit modes used in function arguments. */ - for (arg = DECL_ARGUMENTS (current_function_decl); arg; - arg = TREE_CHAIN (arg)) + if (SCALAR_FLOAT_MODE_P (mode)) { - rtx incoming = DECL_INCOMING_RTL (arg); - - if (incoming && ix86_check_avx_upper_register (incoming)) - return AVX_U128_DIRTY; + gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); + return CCFPmode; } - return AVX_U128_CLEAN; -} - -/* Return a mode that ENTITY is assumed to be - switched to at function entry. */ - -static int -ix86_mode_entry (int entity) -{ - switch (entity) + switch (code) { - case X86_DIRFLAG: - return ix86_dirflag_mode_entry (); - case AVX_U128: - return ix86_avx_u128_mode_entry (); - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - return I387_CW_ANY; + /* Only zero flag is needed. */ + case EQ: /* ZF=0 */ + case NE: /* ZF!=0 */ + return CCZmode; + /* Codes needing carry flag. */ + case GEU: /* CF=0 */ + case LTU: /* CF=1 */ + /* Detect overflow checks. They need just the carry flag. */ + if (GET_CODE (op0) == PLUS + && (rtx_equal_p (op1, XEXP (op0, 0)) + || rtx_equal_p (op1, XEXP (op0, 1)))) + return CCCmode; + else + return CCmode; + case GTU: /* CF=0 & ZF=0 */ + case LEU: /* CF=1 | ZF=1 */ + return CCmode; + /* Codes possibly doable only with sign flag when + comparing against zero. */ + case GE: /* SF=OF or SF=0 */ + case LT: /* SF<>OF or SF=1 */ + if (op1 == const0_rtx) + return CCGOCmode; + else + /* For other cases Carry flag is not required. */ + return CCGCmode; + /* Codes doable only with sign flag when comparing + against zero, but we miss jump instruction for it + so we need to use relational tests against overflow + that thus needs to be zero. 
*/ + case GT: /* ZF=0 & SF=OF */ + case LE: /* ZF=1 | SF<>OF */ + if (op1 == const0_rtx) + return CCNOmode; + else + return CCGCmode; + /* strcmp pattern do (use flags) and combine may ask us for proper + mode. */ + case USE: + return CCmode; default: gcc_unreachable (); } } -static int -ix86_avx_u128_mode_exit (void) -{ - rtx reg = crtl->return_rtx; +/* Return the fixed registers used for condition codes. */ - /* Exit mode is set to AVX_U128_DIRTY if there are 256bit - or 512 bit modes used in the function return register. */ - if (reg && ix86_check_avx_upper_register (reg)) - return AVX_U128_DIRTY; - - /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit - modes used in function arguments, otherwise return AVX_U128_CLEAN. - */ - return ix86_avx_u128_mode_entry (); -} - -/* Return a mode that ENTITY is assumed to be - switched to at function exit. */ - -static int -ix86_mode_exit (int entity) -{ - switch (entity) - { - case X86_DIRFLAG: - return X86_DIRFLAG_ANY; - case AVX_U128: - return ix86_avx_u128_mode_exit (); - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - return I387_CW_ANY; - default: - gcc_unreachable (); - } -} - -static int -ix86_mode_priority (int, int n) +static bool +ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) { - return n; + *p1 = FLAGS_REG; + *p2 = INVALID_REGNUM; + return true; } -/* Output code to initialize control word copies used by trunc?f?i and - rounding patterns. CURRENT_MODE is set to current control word, - while NEW_MODE is set to new control word. */ +/* If two condition code modes are compatible, return a condition code + mode which is compatible with both. Otherwise, return + VOIDmode. */ -static void -emit_i387_cw_initialization (int mode) +static machine_mode +ix86_cc_modes_compatible (machine_mode m1, machine_mode m2) { - rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED); - rtx new_mode; + if (m1 == m2) + return m1; - enum ix86_stack_slot slot; + if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC) + return VOIDmode; - rtx reg = gen_reg_rtx (HImode); + if ((m1 == CCGCmode && m2 == CCGOCmode) + || (m1 == CCGOCmode && m2 == CCGCmode)) + return CCGCmode; - emit_insn (gen_x86_fnstcw_1 (stored_mode)); - emit_move_insn (reg, copy_rtx (stored_mode)); + if ((m1 == CCNOmode && m2 == CCGOCmode) + || (m1 == CCGOCmode && m2 == CCNOmode)) + return CCNOmode; - switch (mode) + if (m1 == CCZmode + && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode)) + return m2; + else if (m2 == CCZmode + && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode)) + return m1; + + switch (m1) { - case I387_CW_TRUNC: - /* round toward zero (truncate) */ - emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00))); - slot = SLOT_CW_TRUNC; - break; + default: + gcc_unreachable (); - case I387_CW_FLOOR: - /* round down toward -oo */ - emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); - emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400))); - slot = SLOT_CW_FLOOR; - break; + case E_CCmode: + case E_CCGCmode: + case E_CCGOCmode: + case E_CCNOmode: + case E_CCAmode: + case E_CCCmode: + case E_CCOmode: + case E_CCPmode: + case E_CCSmode: + case E_CCZmode: + switch (m2) + { + default: + return VOIDmode; - case I387_CW_CEIL: - /* round up toward +oo */ - emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); - emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800))); - slot = SLOT_CW_CEIL; - break; + case E_CCmode: + case E_CCGCmode: + case E_CCGOCmode: + case E_CCNOmode: + case E_CCAmode: + case E_CCCmode: + case E_CCOmode: 
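The overflow-check case in ix86_cc_mode corresponds to the common source idiom of comparing a sum against one of its addends; only the carry flag of the addition is needed, so CCCmode suffices. A runnable sketch (the helper name is hypothetical):

#include <stdint.h>
#include <stdio.h>

/* (a + b) compared LTU against an addend is precisely the pattern the
   PLUS check above recognizes; on x86 it is answered by the carry
   flag that the addition already produced.  */
static int
add_overflows (uint32_t a, uint32_t b)
{
  return (uint32_t) (a + b) < a;
}

int
main (void)
{
  printf ("%d %d\n", add_overflows (0xffffffffu, 1), add_overflows (1, 2));
  return 0;
}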
+ case E_CCPmode: + case E_CCSmode: + case E_CCZmode: + return CCmode; + } - default: - gcc_unreachable (); + case E_CCFPmode: + /* These are only compatible with themselves, which we already + checked above. */ + return VOIDmode; } +} - gcc_assert (slot < MAX_386_STACK_LOCALS); +/* Return strategy to use for floating-point. We assume that fcomi is always + preferrable where available, since that is also true when looking at size + (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */ - new_mode = assign_386_stack_local (HImode, slot); - emit_move_insn (new_mode, reg); +enum ix86_fpcmp_strategy +ix86_fp_comparison_strategy (enum rtx_code) +{ + /* Do fcomi/sahf based test when profitable. */ + + if (TARGET_CMOVE) + return IX86_FPCMP_COMI; + + if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) + return IX86_FPCMP_SAHF; + + return IX86_FPCMP_ARITH; } -/* Generate one or more insns to set ENTITY to MODE. */ +/* Convert comparison codes we use to represent FP comparison to integer + code that will result in proper branch. Return UNKNOWN if no such code + is available. */ -static void -ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED, - HARD_REG_SET regs_live ATTRIBUTE_UNUSED) +enum rtx_code +ix86_fp_compare_code_to_integer (enum rtx_code code) { - switch (entity) + switch (code) { - case X86_DIRFLAG: - if (mode == X86_DIRFLAG_RESET) - emit_insn (gen_cld ()); - break; - case AVX_U128: - if (mode == AVX_U128_CLEAN) - emit_insn (gen_avx_vzeroupper ()); - break; - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - if (mode != I387_CW_ANY - && mode != I387_CW_UNINITIALIZED) - emit_i387_cw_initialization (mode); - break; + case GT: + return GTU; + case GE: + return GEU; + case ORDERED: + case UNORDERED: + return code; + case UNEQ: + return EQ; + case UNLT: + return LTU; + case UNLE: + return LEU; + case LTGT: + return NE; default: - gcc_unreachable (); + return UNKNOWN; } } -/* Output code for INSN to convert a float to a signed int. OPERANDS - are the insn operands. The output may be [HSD]Imode and the input - operand may be [SDX]Fmode. */ - -const char * -output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp) +/* Zero extend possibly SImode EXP to Pmode register. */ +rtx +ix86_zero_extend_to_Pmode (rtx exp) { - bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); - bool dimode_p = GET_MODE (operands[0]) == DImode; - int round_mode = get_attr_i387_cw (insn); - - static char buf[40]; - const char *p; - - /* Jump through a hoop or two for DImode, since the hardware has no - non-popping instruction. We used to do this a different way, but - that was somewhat fragile and broke with post-reload splitters. */ - if ((dimode_p || fisttp) && !stack_top_dies) - output_asm_insn ("fld\t%y1", operands); - - gcc_assert (STACK_TOP_P (operands[1])); - gcc_assert (MEM_P (operands[0])); - gcc_assert (GET_MODE (operands[1]) != TFmode); - - if (fisttp) - return "fisttp%Z0\t%0"; - - strcpy (buf, "fist"); + return force_reg (Pmode, convert_to_mode (Pmode, exp, 1)); +} - if (round_mode != I387_CW_ANY) - output_asm_insn ("fldcw\t%3", operands); +/* Return true if the function being called was marked with attribute + "noplt" or using -fno-plt and we are compiling for non-PIC. We need + to handle the non-PIC case in the backend because there is no easy + interface for the front-end to force non-PLT calls to use the GOT. + This is currently used only with 64-bit or 32-bit GOT32X ELF targets + to call the function marked "noplt" indirectly. 
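The mapping in ix86_fp_compare_code_to_integer reflects how comiss/ucomisd set the flags: an ordered FP compare leaves ZF/CF looking like an unsigned integer compare, with PF marking the unordered case, which is why GT becomes GTU, UNLT becomes LTU, and so on. A small model of those flag settings (names are illustrative; not GCC code):

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

struct flags { bool zf, pf, cf; };

/* Rough model of the flags left by ucomisd a, b.  */
static struct flags
ucomisd_model (double a, double b)
{
  struct flags f;
  if (isnan (a) || isnan (b))
    f.zf = f.pf = f.cf = true;          /* unordered */
  else
    {
      f.pf = false;
      f.zf = (a == b);
      f.cf = (a < b);
    }
  return f;
}

int
main (void)
{
  double a = 2.0, b = 1.0;
  struct flags f = ucomisd_model (a, b);
  /* GT maps to GTU: "ja" tests CF == 0 && ZF == 0.  */
  printf ("a > b: %d, ja: %d\n", a > b, !f.cf && !f.zf);
  return 0;
}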
*/ - p = "p%Z0\t%0"; - strcat (buf, p + !(stack_top_dies || dimode_p)); +static bool +ix86_nopic_noplt_attribute_p (rtx call_op) +{ + if (flag_pic || ix86_cmodel == CM_LARGE + || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X) + || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF + || SYMBOL_REF_LOCAL_P (call_op)) + return false; - output_asm_insn (buf, operands); + tree symbol_decl = SYMBOL_REF_DECL (call_op); - if (round_mode != I387_CW_ANY) - output_asm_insn ("fldcw\t%2", operands); + if (!flag_plt + || (symbol_decl != NULL_TREE + && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl)))) + return true; - return ""; + return false; } -/* Output code for x87 ffreep insn. The OPNO argument, which may only - have the values zero or one, indicates the ffreep insn's operand - from the OPERANDS array. */ - -static const char * -output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) +/* Helper to output the jmp/call. */ +static void +ix86_output_jmp_thunk_or_indirect (const char *thunk_name, const int regno) { - if (TARGET_USE_FFREEP) -#ifdef HAVE_AS_IX86_FFREEP - return opno ? "ffreep\t%y1" : "ffreep\t%y0"; -#else + if (thunk_name != NULL) { - static char retval[32]; - int regno = REGNO (operands[opno]); - - gcc_assert (STACK_REGNO_P (regno)); - - regno -= FIRST_STACK_REG; - - snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno); - return retval; + fprintf (asm_out_file, "\tjmp\t"); + assemble_name (asm_out_file, thunk_name); + putc ('\n', asm_out_file); } -#endif - - return opno ? "fstp\t%y1" : "fstp\t%y0"; + else + output_indirect_thunk (regno); } +/* Output indirect branch via a call and return thunk. CALL_OP is a + register which contains the branch target. XASM is the assembly + template for CALL_OP. Branch is a tail call if SIBCALL_P is true. + A normal call is converted to: -/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi - should be used. UNORDERED_P is true when fucom should be used. */ - -const char * -output_fp_compare (rtx_insn *insn, rtx *operands, - bool eflags_p, bool unordered_p) -{ - rtx *xops = eflags_p ? &operands[0] : &operands[1]; - bool stack_top_dies; + call __x86_indirect_thunk_reg - static char buf[40]; - const char *p; + and a tail call is converted to: - gcc_assert (STACK_TOP_P (xops[0])); + jmp __x86_indirect_thunk_reg + */ - stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); +static void +ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p) +{ + char thunk_name_buf[32]; + char *thunk_name; + enum indirect_thunk_prefix need_prefix + = indirect_thunk_need_prefix (current_output_insn); + int regno = REGNO (call_op); - if (eflags_p) + if (cfun->machine->indirect_branch_type + != indirect_branch_thunk_inline) { - p = unordered_p ? "fucomi" : "fcomi"; - strcpy (buf, p); - - p = "p\t{%y1, %0|%0, %y1}"; - strcat (buf, p + !stack_top_dies); - - return buf; + if (cfun->machine->indirect_branch_type == indirect_branch_thunk) + { + int i = regno; + if (i >= FIRST_REX_INT_REG) + i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1); + indirect_thunks_used |= 1 << i; + } + indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); + thunk_name = thunk_name_buf; } + else + thunk_name = NULL; - if (STACK_REG_P (xops[1]) - && stack_top_dies - && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1)) - { - gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1); - - /* If both the top of the 387 stack die, and the other operand - is also a stack register that dies, then this must be a - `fcompp' float compare. */ - p = unordered_p ? 
"fucompp" : "fcompp"; - strcpy (buf, p); - } - else if (const0_operand (xops[1], VOIDmode)) - { - gcc_assert (!unordered_p); - strcpy (buf, "ftst"); - } + if (sibcall_p) + ix86_output_jmp_thunk_or_indirect (thunk_name, regno); else { - if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT) + if (thunk_name != NULL) { - gcc_assert (!unordered_p); - p = "ficom"; + fprintf (asm_out_file, "\tcall\t"); + assemble_name (asm_out_file, thunk_name); + putc ('\n', asm_out_file); + return; } - else - p = unordered_p ? "fucom" : "fcom"; - - strcpy (buf, p); - p = "p%Z2\t%y2"; - strcat (buf, p + !stack_top_dies); - } + char indirectlabel1[32]; + char indirectlabel2[32]; - output_asm_insn (buf, operands); - return "fnstsw\t%0"; -} + ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, + INDIRECT_LABEL, + indirectlabelno++); + ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, + INDIRECT_LABEL, + indirectlabelno++); -void -ix86_output_addr_vec_elt (FILE *file, int value) -{ - const char *directive = ASM_LONG; + /* Jump. */ + fputs ("\tjmp\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel2); + fputc ('\n', asm_out_file); -#ifdef ASM_QUAD - if (TARGET_LP64) - directive = ASM_QUAD; -#else - gcc_assert (!TARGET_64BIT); -#endif + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); - fprintf (file, "%s%s%d\n", directive, LPREFIX, value); -} + ix86_output_jmp_thunk_or_indirect (thunk_name, regno); -void -ix86_output_addr_diff_elt (FILE *file, int value, int rel) -{ - const char *directive = ASM_LONG; + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); -#ifdef ASM_QUAD - if (TARGET_64BIT && CASE_VECTOR_MODE == DImode) - directive = ASM_QUAD; -#else - gcc_assert (!TARGET_64BIT); -#endif - /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ - if (TARGET_64BIT || TARGET_VXWORKS_RTP) - fprintf (file, "%s%s%d-%s%d\n", - directive, LPREFIX, value, LPREFIX, rel); -#if TARGET_MACHO - else if (TARGET_MACHO) - { - fprintf (file, ASM_LONG "%s%d-", LPREFIX, value); - machopic_output_function_base_name (file); - putc ('\n', file); + /* Call. */ + fputs ("\tcall\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel1); + fputc ('\n', asm_out_file); } -#endif - else if (HAVE_AS_GOTOFF_IN_DATA) - fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value); - else - asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n", - GOT_SYMBOL_NAME, LPREFIX, value); } - -/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate - for the target. */ - -void -ix86_expand_clear (rtx dest) -{ - rtx tmp; - /* We play register width games, which are only valid after reload. */ - gcc_assert (reload_completed); +/* Output indirect branch via a call and return thunk. CALL_OP is + the branch target. XASM is the assembly template for CALL_OP. + Branch is a tail call if SIBCALL_P is true. A normal call is + converted to: - /* Avoid HImode and its attendant prefix byte. 
*/ - if (GET_MODE_SIZE (GET_MODE (dest)) < 4) - dest = gen_rtx_REG (SImode, REGNO (dest)); - tmp = gen_rtx_SET (dest, const0_rtx); + jmp L2 + L1: + push CALL_OP + jmp __x86_indirect_thunk + L2: + call L1 - if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) - { - rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); - } + and a tail call is converted to: - emit_insn (tmp); -} + push CALL_OP + jmp __x86_indirect_thunk + */ -void -ix86_expand_move (machine_mode mode, rtx operands[]) +static void +ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm, + bool sibcall_p) { - rtx op0, op1; - rtx tmp, addend = NULL_RTX; - enum tls_model model; + char thunk_name_buf[32]; + char *thunk_name; + char push_buf[64]; + enum indirect_thunk_prefix need_prefix + = indirect_thunk_need_prefix (current_output_insn); + int regno = -1; + + if (cfun->machine->indirect_branch_type + != indirect_branch_thunk_inline) + { + if (cfun->machine->indirect_branch_type == indirect_branch_thunk) + indirect_thunk_needed = true; + indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); + thunk_name = thunk_name_buf; + } + else + thunk_name = NULL; - op0 = operands[0]; - op1 = operands[1]; + snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s", + TARGET_64BIT ? 'q' : 'l', xasm); - switch (GET_CODE (op1)) + if (sibcall_p) { - case CONST: - tmp = XEXP (op1, 0); + output_asm_insn (push_buf, &call_op); + ix86_output_jmp_thunk_or_indirect (thunk_name, regno); + } + else + { + char indirectlabel1[32]; + char indirectlabel2[32]; - if (GET_CODE (tmp) != PLUS - || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) - break; + ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, + INDIRECT_LABEL, + indirectlabelno++); + ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, + INDIRECT_LABEL, + indirectlabelno++); - op1 = XEXP (tmp, 0); - addend = XEXP (tmp, 1); - /* FALLTHRU */ + /* Jump. */ + fputs ("\tjmp\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel2); + fputc ('\n', asm_out_file); - case SYMBOL_REF: - model = SYMBOL_REF_TLS_MODEL (op1); + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); - if (model) - op1 = legitimize_tls_address (op1, model, true); - else if (ix86_force_load_from_GOT_p (op1)) - { - /* Load the external function address via GOT slot to avoid PLT. */ - op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), - (TARGET_64BIT - ? UNSPEC_GOTPCREL - : UNSPEC_GOT)); - op1 = gen_rtx_CONST (Pmode, op1); - op1 = gen_const_mem (Pmode, op1); - set_mem_alias_set (op1, ix86_GOT_alias_set ()); - } - else + /* An external function may be called via GOT, instead of PLT. */ + if (MEM_P (call_op)) { - tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); - if (tmp) - { - op1 = tmp; - if (!addend) - break; - } - else + struct ix86_address parts; + rtx addr = XEXP (call_op, 0); + if (ix86_decompose_address (addr, &parts) + && parts.base == stack_pointer_rtx) { - op1 = operands[1]; - break; - } - } - - if (addend) - { - op1 = force_operand (op1, NULL_RTX); - op1 = expand_simple_binop (Pmode, PLUS, op1, addend, - op0, 1, OPTAB_DIRECT); - } - else - op1 = force_operand (op1, op0); - - if (op1 == op0) - return; - - op1 = convert_to_mode (mode, op1, 1); + /* Since call will adjust stack by -UNITS_PER_WORD, + we must convert "disp(stack, index, scale)" to + "disp+UNITS_PER_WORD(stack, index, scale)". 
*/ + if (parts.index) + { + addr = gen_rtx_MULT (Pmode, parts.index, + GEN_INT (parts.scale)); + addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx, + addr); + } + else + addr = stack_pointer_rtx; - default: - break; - } + rtx disp; + if (parts.disp != NULL_RTX) + disp = plus_constant (Pmode, parts.disp, + UNITS_PER_WORD); + else + disp = GEN_INT (UNITS_PER_WORD); - if ((flag_pic || MACHOPIC_INDIRECT) - && symbolic_operand (op1, mode)) - { - if (TARGET_MACHO && !TARGET_64BIT) - { -#if TARGET_MACHO - /* dynamic-no-pic */ - if (MACHOPIC_INDIRECT) - { - rtx temp = (op0 && REG_P (op0) && mode == Pmode) - ? op0 : gen_reg_rtx (Pmode); - op1 = machopic_indirect_data_reference (op1, temp); - if (MACHOPIC_PURE) - op1 = machopic_legitimize_pic_address (op1, mode, - temp == op1 ? 0 : temp); - } - if (op0 != op1 && GET_CODE (op0) != MEM) - { - rtx insn = gen_rtx_SET (op0, op1); - emit_insn (insn); - return; - } - if (GET_CODE (op0) == MEM) - op1 = force_reg (Pmode, op1); - else - { - rtx temp = op0; - if (GET_CODE (temp) != REG) - temp = gen_reg_rtx (Pmode); - temp = legitimize_pic_address (op1, temp); - if (temp == op0) - return; - op1 = temp; - } - /* dynamic-no-pic */ -#endif - } - else - { - if (MEM_P (op0)) - op1 = force_reg (mode, op1); - else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) - { - rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; - op1 = legitimize_pic_address (op1, reg); - if (op0 == op1) - return; - op1 = convert_to_mode (mode, op1, 1); - } - } - } - else - { - if (MEM_P (op0) - && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) - || !push_operand (op0, mode)) - && MEM_P (op1)) - op1 = force_reg (mode, op1); - - if (push_operand (op0, mode) - && ! general_no_elim_operand (op1, mode)) - op1 = copy_to_mode_reg (mode, op1); - - /* Force large constants in 64bit compilation into register - to get them CSEed. */ - if (can_create_pseudo_p () - && (mode == DImode) && TARGET_64BIT - && immediate_operand (op1, mode) - && !x86_64_zext_immediate_operand (op1, VOIDmode) - && !register_operand (op0, mode) - && optimize) - op1 = copy_to_mode_reg (mode, op1); - - if (can_create_pseudo_p () - && CONST_DOUBLE_P (op1)) - { - /* If we are loading a floating point constant to a register, - force the value to memory now, since we'll get better code - out the back end. */ - - op1 = validize_mem (force_const_mem (mode, op1)); - if (!register_operand (op0, mode)) - { - rtx temp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (temp, op1)); - emit_move_insn (op0, temp); - return; + addr = gen_rtx_PLUS (Pmode, addr, disp); + call_op = gen_rtx_MEM (GET_MODE (call_op), addr); } } - } - - emit_insn (gen_rtx_SET (op0, op1)); -} - -void -ix86_expand_vector_move (machine_mode mode, rtx operands[]) -{ - rtx op0 = operands[0], op1 = operands[1]; - /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU - psABI since the biggest alignment is 4 byte for IA MCU psABI. */ - unsigned int align = (TARGET_IAMCU - ? GET_MODE_BITSIZE (mode) - : GET_MODE_ALIGNMENT (mode)); - - if (push_operand (op0, VOIDmode)) - op0 = emit_move_resolve_push (mode, op0); - - /* Force constants other than zero into memory. We do not know how - the instructions used to build constants modify the upper 64 bits - of the register, once we have that information we may be able - to handle some of them more efficiently. 
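The kind of call rewritten by the indirect-branch output routines above can be requested per function. A compile-only sketch (the type and function names are hypothetical) using the x86 indirect_branch attribute, under which the call through FP goes via an __x86_indirect_thunk_* helper instead of a plain indirect call:

typedef int (*handler_fn) (int);

__attribute__ ((indirect_branch ("thunk")))
int
dispatch (handler_fn fp, int arg)
{
  return fp (arg);   /* emitted through the return thunk machinery */
}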
*/ - if (can_create_pseudo_p () - && (CONSTANT_P (op1) - || (SUBREG_P (op1) - && CONSTANT_P (SUBREG_REG (op1)))) - && ((register_operand (op0, mode) - && !standard_sse_constant_p (op1, mode)) - /* ix86_expand_vector_move_misalign() does not like constants. */ - || (SSE_REG_MODE_P (mode) - && MEM_P (op0) - && MEM_ALIGN (op0) < align))) - { - if (SUBREG_P (op1)) - { - machine_mode imode = GET_MODE (SUBREG_REG (op1)); - rtx r = force_const_mem (imode, SUBREG_REG (op1)); - if (r) - r = validize_mem (r); - else - r = force_reg (imode, SUBREG_REG (op1)); - op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); - } - else - op1 = validize_mem (force_const_mem (mode, op1)); - } - /* We need to check memory alignment for SSE mode since attribute - can make operands unaligned. */ - if (can_create_pseudo_p () - && SSE_REG_MODE_P (mode) - && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) - || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) - { - rtx tmp[2]; + output_asm_insn (push_buf, &call_op); - /* ix86_expand_vector_move_misalign() does not like both - arguments in memory. */ - if (!register_operand (op0, mode) - && !register_operand (op1, mode)) - op1 = force_reg (mode, op1); + ix86_output_jmp_thunk_or_indirect (thunk_name, regno); - tmp[0] = op0; tmp[1] = op1; - ix86_expand_vector_move_misalign (mode, tmp); - return; - } + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); - /* Make operand1 a register if it isn't already. */ - if (can_create_pseudo_p () - && !register_operand (op0, mode) - && !register_operand (op1, mode)) - { - emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); - return; + /* Call. */ + fputs ("\tcall\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel1); + fputc ('\n', asm_out_file); } - - emit_insn (gen_rtx_SET (op0, op1)); } -/* Split 32-byte AVX unaligned load and store if needed. */ +/* Output indirect branch via a call and return thunk. CALL_OP is + the branch target. XASM is the assembly template for CALL_OP. + Branch is a tail call if SIBCALL_P is true. */ static void -ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) +ix86_output_indirect_branch (rtx call_op, const char *xasm, + bool sibcall_p) { - rtx m; - rtx (*extract) (rtx, rtx, rtx); - machine_mode mode; + if (REG_P (call_op)) + ix86_output_indirect_branch_via_reg (call_op, sibcall_p); + else + ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p); +} + +/* Output indirect jump. CALL_OP is the jump target. */ - if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) - || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) +const char * +ix86_output_indirect_jmp (rtx call_op) +{ + if (cfun->machine->indirect_branch_type != indirect_branch_keep) { - emit_insn (gen_rtx_SET (op0, op1)); - return; + /* We can't have red-zone since "call" in the indirect thunk + pushes the return address onto stack, destroying red-zone. */ + if (ix86_red_zone_size != 0) + gcc_unreachable (); + + ix86_output_indirect_branch (call_op, "%0", true); + return ""; } + else + return "%!jmp\t%A0"; +} - rtx orig_op0 = NULL_RTX; - mode = GET_MODE (op0); - switch (GET_MODE_CLASS (mode)) +/* Output return instrumentation for current function if needed. 
*/ + +static void +output_return_instrumentation (void) +{ + if (ix86_instrument_return != instrument_return_none + && flag_fentry + && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl)) { - case MODE_VECTOR_INT: - case MODE_INT: - if (mode != V32QImode) + if (ix86_flag_record_return) + fprintf (asm_out_file, "1:\n"); + switch (ix86_instrument_return) { - if (!MEM_P (op0)) - { - orig_op0 = op0; - op0 = gen_reg_rtx (V32QImode); - } - else - op0 = gen_lowpart (V32QImode, op0); - op1 = gen_lowpart (V32QImode, op1); - mode = V32QImode; + case instrument_return_call: + fprintf (asm_out_file, "\tcall\t__return__\n"); + break; + case instrument_return_nop5: + /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ + fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); + break; + case instrument_return_none: + break; } - break; - case MODE_VECTOR_FLOAT: - break; - default: - gcc_unreachable (); - } - - switch (mode) - { - default: - gcc_unreachable (); - case E_V32QImode: - extract = gen_avx_vextractf128v32qi; - mode = V16QImode; - break; - case E_V8SFmode: - extract = gen_avx_vextractf128v8sf; - mode = V4SFmode; - break; - case E_V4DFmode: - extract = gen_avx_vextractf128v4df; - mode = V2DFmode; - break; - } - if (MEM_P (op1)) - { - rtx r = gen_reg_rtx (mode); - m = adjust_address (op1, mode, 0); - emit_move_insn (r, m); - m = adjust_address (op1, mode, 16); - r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); - emit_move_insn (op0, r); - } - else if (MEM_P (op0)) - { - m = adjust_address (op0, mode, 0); - emit_insn (extract (m, op1, const0_rtx)); - m = adjust_address (op0, mode, 16); - emit_insn (extract (m, copy_rtx (op1), const1_rtx)); + if (ix86_flag_record_return) + { + fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n"); + fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); + fprintf (asm_out_file, "\t.previous\n"); + } } - else - gcc_unreachable (); - - if (orig_op0) - emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); } -/* Implement the movmisalign patterns for SSE. Non-SSE modes go - straight to ix86_expand_vector_move. */ -/* Code generation for scalar reg-reg moves of single and double precision data: - if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) - movaps reg, reg - else - movss reg, reg - if (x86_sse_partial_reg_dependency == true) - movapd reg, reg - else - movsd reg, reg - - Code generation for scalar loads of double precision data: - if (x86_sse_split_regs == true) - movlpd mem, reg (gas syntax) - else - movsd mem, reg - - Code generation for unaligned packed loads of single precision data - (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): - if (x86_sse_unaligned_move_optimal) - movups mem, reg - - if (x86_sse_partial_reg_dependency == true) - { - xorps reg, reg - movlps mem, reg - movhps mem+8, reg - } - else - { - movlps mem, reg - movhps mem+8, reg - } - - Code generation for unaligned packed loads of double precision data - (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): - if (x86_sse_unaligned_move_optimal) - movupd mem, reg - - if (x86_sse_split_regs == true) - { - movlpd mem, reg - movhpd mem+8, reg - } - else - { - movsd mem, reg - movhpd mem+8, reg - } - */ +/* Output function return. CALL_OP is the jump target. Add a REP + prefix to RET if LONG_P is true and function return is kept. 
*/ -void -ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) +const char * +ix86_output_function_return (bool long_p) { - rtx op0, op1, m; - - op0 = operands[0]; - op1 = operands[1]; + output_return_instrumentation (); - /* Use unaligned load/store for AVX512 or when optimizing for size. */ - if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) + if (cfun->machine->function_return_type != indirect_branch_keep) { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } + char thunk_name[32]; + enum indirect_thunk_prefix need_prefix + = indirect_thunk_need_prefix (current_output_insn); - if (TARGET_AVX) - { - if (GET_MODE_SIZE (mode) == 32) - ix86_avx256_split_vector_move_misalign (op0, op1); + if (cfun->machine->function_return_type + != indirect_branch_thunk_inline) + { + bool need_thunk = (cfun->machine->function_return_type + == indirect_branch_thunk); + indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix, + true); + indirect_return_needed |= need_thunk; + fprintf (asm_out_file, "\tjmp\t"); + assemble_name (asm_out_file, thunk_name); + putc ('\n', asm_out_file); + } else - /* Always use 128-bit mov_internal pattern for AVX. */ - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } + output_indirect_thunk (INVALID_REGNUM); - /* ??? If we have typed data, then it would appear that using - movdqu is the only way to get unaligned data loaded with - integer type. */ - if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; + return ""; } - if (MEM_P (op1)) - { - if (TARGET_SSE2 && mode == V2DFmode) - { - rtx zero; - - /* When SSE registers are split into halves, we can avoid - writing to the top half twice. */ - if (TARGET_SSE_SPLIT_REGS) - { - emit_clobber (op0); - zero = op0; - } - else - { - /* ??? Not sure about the best option for the Intel chips. - The following would seem to satisfy; the register is - entirely cleared, breaking the dependency chain. We - then store to the upper half, with a dependency depth - of one. A rumor has it that Intel recommends two movsd - followed by an unpacklpd, but this is unconfirmed. And - given that the dependency depth of the unpacklpd would - still be one, I'm not sure why this would be better. */ - zero = CONST0_RTX (V2DFmode); - } + if (!long_p) + return "%!ret"; - m = adjust_address (op1, DFmode, 0); - emit_insn (gen_sse2_loadlpd (op0, zero, m)); - m = adjust_address (op1, DFmode, 8); - emit_insn (gen_sse2_loadhpd (op0, op0, m)); - } - else - { - rtx t; + return "rep%; ret"; +} - if (mode != V4SFmode) - t = gen_reg_rtx (V4SFmode); - else - t = op0; - - if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) - emit_move_insn (t, CONST0_RTX (V4SFmode)); - else - emit_clobber (t); +/* Output indirect function return. RET_OP is the function return + target. 
*/ - m = adjust_address (op1, V2SFmode, 0); - emit_insn (gen_sse_loadlps (t, t, m)); - m = adjust_address (op1, V2SFmode, 8); - emit_insn (gen_sse_loadhps (t, t, m)); - if (mode != V4SFmode) - emit_move_insn (op0, gen_lowpart (mode, t)); - } - } - else if (MEM_P (op0)) +const char * +ix86_output_indirect_function_return (rtx ret_op) +{ + if (cfun->machine->function_return_type != indirect_branch_keep) { - if (TARGET_SSE2 && mode == V2DFmode) + char thunk_name[32]; + enum indirect_thunk_prefix need_prefix + = indirect_thunk_need_prefix (current_output_insn); + unsigned int regno = REGNO (ret_op); + gcc_assert (regno == CX_REG); + + if (cfun->machine->function_return_type + != indirect_branch_thunk_inline) { - m = adjust_address (op0, DFmode, 0); - emit_insn (gen_sse2_storelpd (m, op1)); - m = adjust_address (op0, DFmode, 8); - emit_insn (gen_sse2_storehpd (m, op1)); + bool need_thunk = (cfun->machine->function_return_type + == indirect_branch_thunk); + indirect_thunk_name (thunk_name, regno, need_prefix, true); + + if (need_thunk) + { + indirect_return_via_cx = true; + indirect_thunks_used |= 1 << CX_REG; + } + fprintf (asm_out_file, "\tjmp\t"); + assemble_name (asm_out_file, thunk_name); + putc ('\n', asm_out_file); } else - { - if (mode != V4SFmode) - op1 = gen_lowpart (V4SFmode, op1); + output_indirect_thunk (regno); - m = adjust_address (op0, V2SFmode, 0); - emit_insn (gen_sse_storelps (m, op1)); - m = adjust_address (op0, V2SFmode, 8); - emit_insn (gen_sse_storehps (m, copy_rtx (op1))); - } + return ""; } else - gcc_unreachable (); + return "%!jmp\t%A0"; } -/* Helper function of ix86_fixup_binary_operands to canonicalize - operand order. Returns true if the operands should be swapped. */ +/* Output the assembly for a call instruction. */ -static bool -ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, - rtx operands[]) +const char * +ix86_output_call_insn (rtx_insn *insn, rtx call_op) { - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; - - /* If the operation is not commutative, we can't do anything. */ - if (GET_RTX_CLASS (code) != RTX_COMM_ARITH - && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) - return false; - - /* Highest priority is that src1 should match dst. */ - if (rtx_equal_p (dst, src1)) - return false; - if (rtx_equal_p (dst, src2)) - return true; - - /* Next highest priority is that immediate constants come second. */ - if (immediate_operand (src2, mode)) - return false; - if (immediate_operand (src1, mode)) - return true; - - /* Lowest priority is that memory references should come second. */ - if (MEM_P (src2)) - return false; - if (MEM_P (src1)) - return true; + bool direct_p = constant_call_address_operand (call_op, VOIDmode); + bool output_indirect_p + = (!TARGET_SEH + && cfun->machine->indirect_branch_type != indirect_branch_keep); + bool seh_nop_p = false; + const char *xasm; - return false; -} + if (SIBLING_CALL_P (insn)) + { + output_return_instrumentation (); + if (direct_p) + { + if (ix86_nopic_noplt_attribute_p (call_op)) + { + direct_p = false; + if (TARGET_64BIT) + { + if (output_indirect_p) + xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; + else + xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; + } + else + { + if (output_indirect_p) + xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; + else + xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; + } + } + else + xasm = "%!jmp\t%P0"; + } + /* SEH epilogue detection requires the indirect branch case + to include REX.W. 
*/ + else if (TARGET_SEH) + xasm = "%!rex.W jmp\t%A0"; + else + { + if (output_indirect_p) + xasm = "%0"; + else + xasm = "%!jmp\t%A0"; + } + if (output_indirect_p && !direct_p) + ix86_output_indirect_branch (call_op, xasm, true); + else + output_asm_insn (xasm, &call_op); + return ""; + } -/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the - destination to use for the operation. If different from the true - destination in operands[0], a copy operation will be required. */ + /* SEH unwinding can require an extra nop to be emitted in several + circumstances. Determine if we have one of those. */ + if (TARGET_SEH) + { + rtx_insn *i; -rtx -ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; + for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i)) + { + /* Prevent a catch region from being adjacent to a jump that would + be interpreted as an epilogue sequence by the unwinder. */ + if (JUMP_P(i) && CROSSING_JUMP_P (i)) + { + seh_nop_p = true; + break; + } + + /* If we get to another real insn, we don't need the nop. */ + if (INSN_P (i)) + break; - /* Canonicalize operand order. */ - if (ix86_swap_binary_operands_p (code, mode, operands)) - { - /* It is invalid to swap operands of different modes. */ - gcc_assert (GET_MODE (src1) == GET_MODE (src2)); + /* If we get to the epilogue note, prevent a catch region from + being adjacent to the standard epilogue sequence. If non- + call-exceptions, we'll have done this during epilogue emission. */ + if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG + && !flag_non_call_exceptions + && !can_throw_internal (insn)) + { + seh_nop_p = true; + break; + } + } - std::swap (src1, src2); + /* If we didn't find a real insn following the call, prevent the + unwinder from looking into the next function. */ + if (i == NULL) + seh_nop_p = true; } - /* Both source operands cannot be in memory. */ - if (MEM_P (src1) && MEM_P (src2)) + if (direct_p) { - /* Optimization: Only read from memory once. */ - if (rtx_equal_p (src1, src2)) + if (ix86_nopic_noplt_attribute_p (call_op)) { - src2 = force_reg (mode, src2); - src1 = src2; + direct_p = false; + if (TARGET_64BIT) + { + if (output_indirect_p) + xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; + else + xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; + } + else + { + if (output_indirect_p) + xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; + else + xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; + } } - else if (rtx_equal_p (dst, src1)) - src2 = force_reg (mode, src2); else - src1 = force_reg (mode, src1); + xasm = "%!call\t%P0"; + } + else + { + if (output_indirect_p) + xasm = "%0"; + else + xasm = "%!call\t%A0"; } - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - if (MEM_P (dst) && !rtx_equal_p (dst, src1)) - dst = gen_reg_rtx (mode); - - /* Source 1 cannot be a constant. */ - if (CONSTANT_P (src1)) - src1 = force_reg (mode, src1); - - /* Source 1 cannot be a non-matching memory. */ - if (MEM_P (src1) && !rtx_equal_p (dst, src1)) - src1 = force_reg (mode, src1); - - /* Improve address combine. 
*/ - if (code == PLUS - && GET_MODE_CLASS (mode) == MODE_INT - && MEM_P (src2)) - src2 = force_reg (mode, src2); - - operands[1] = src1; - operands[2] = src2; - return dst; -} + if (output_indirect_p && !direct_p) + ix86_output_indirect_branch (call_op, xasm, false); + else + output_asm_insn (xasm, &call_op); -/* Similarly, but assume that the destination has already been - set up properly. */ + if (seh_nop_p) + return "nop"; -void -ix86_fixup_binary_operands_no_copy (enum rtx_code code, - machine_mode mode, rtx operands[]) -{ - rtx dst = ix86_fixup_binary_operands (code, mode, operands); - gcc_assert (dst == operands[0]); + return ""; } + +/* Return a MEM corresponding to a stack slot with mode MODE. + Allocate a new slot if necessary. -/* Attempt to expand a binary operator. Make the expansion closer to the - actual machine, then just general_operand, which will allow 3 separate - memory references (one output, two input) in a single insn. */ + The RTL for a function can have several slots available: N is + which slot to use. */ -void -ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) +rtx +assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n) { - rtx src1, src2, dst, op, clob; - - dst = ix86_fixup_binary_operands (code, mode, operands); - src1 = operands[1]; - src2 = operands[2]; + struct stack_local_entry *s; - /* Emit the instruction. */ + gcc_assert (n < MAX_386_STACK_LOCALS); - op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); + for (s = ix86_stack_locals; s; s = s->next) + if (s->mode == mode && s->n == n) + return validize_mem (copy_rtx (s->rtl)); - if (reload_completed - && code == PLUS - && !rtx_equal_p (dst, src1)) - { - /* This is going to be an LEA; avoid splitting it later. */ - emit_insn (op); - } - else - { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); - } + s = ggc_alloc (); + s->n = n; + s->mode = mode; + s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0); - /* Fix up the destination if needed. */ - if (dst != operands[0]) - emit_move_insn (operands[0], dst); + s->next = ix86_stack_locals; + ix86_stack_locals = s; + return validize_mem (copy_rtx (s->rtl)); } -/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with - the given OPERANDS. */ +static void +ix86_instantiate_decls (void) +{ + struct stack_local_entry *s; -void -ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx op1 = NULL_RTX, op2 = NULL_RTX; - if (SUBREG_P (operands[1])) - { - op1 = operands[1]; - op2 = operands[2]; - } - else if (SUBREG_P (operands[2])) - { - op1 = operands[2]; - op2 = operands[1]; - } - /* Optimize (__m128i) d | (__m128i) e and similar code - when d and e are float vectors into float vector logical - insn. In C/C++ without using intrinsics there is no other way - to express vector logical operation on float vectors than - to cast them temporarily to integer vectors. 
*/ - if (op1 - && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL - && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) - && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT - && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) - && SUBREG_BYTE (op1) == 0 - && (GET_CODE (op2) == CONST_VECTOR - || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) - && SUBREG_BYTE (op2) == 0)) - && can_create_pseudo_p ()) - { - rtx dst; - switch (GET_MODE (SUBREG_REG (op1))) - { - case E_V4SFmode: - case E_V8SFmode: - case E_V16SFmode: - case E_V2DFmode: - case E_V4DFmode: - case E_V8DFmode: - dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); - if (GET_CODE (op2) == CONST_VECTOR) - { - op2 = gen_lowpart (GET_MODE (dst), op2); - op2 = force_reg (GET_MODE (dst), op2); - } - else - { - op1 = operands[1]; - op2 = SUBREG_REG (operands[2]); - if (!vector_operand (op2, GET_MODE (dst))) - op2 = force_reg (GET_MODE (dst), op2); - } - op1 = SUBREG_REG (op1); - if (!vector_operand (op1, GET_MODE (dst))) - op1 = force_reg (GET_MODE (dst), op1); - emit_insn (gen_rtx_SET (dst, - gen_rtx_fmt_ee (code, GET_MODE (dst), - op1, op2))); - emit_move_insn (operands[0], gen_lowpart (mode, dst)); - return; - default: - break; - } - } - if (!vector_operand (operands[1], mode)) - operands[1] = force_reg (mode, operands[1]); - if (!vector_operand (operands[2], mode)) - operands[2] = force_reg (mode, operands[2]); - ix86_fixup_binary_operands_no_copy (code, mode, operands); - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_fmt_ee (code, mode, operands[1], - operands[2]))); + for (s = ix86_stack_locals; s; s = s->next) + if (s->rtl != NULL_RTX) + instantiate_decl_rtl (s->rtl); } - -/* Return TRUE or FALSE depending on whether the binary operator meets the - appropriate constraints. */ + +/* Check whether x86 address PARTS is a pc-relative address. */ bool -ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, - rtx operands[3]) +ix86_rip_relative_addr_p (struct ix86_address *parts) { - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; - - /* Both source operands cannot be in memory. */ - if (MEM_P (src1) && MEM_P (src2)) - return false; - - /* Canonicalize operand order for commutative operators. */ - if (ix86_swap_binary_operands_p (code, mode, operands)) - std::swap (src1, src2); + rtx base, index, disp; - /* If the destination is memory, we must have a matching source operand. */ - if (MEM_P (dst) && !rtx_equal_p (dst, src1)) - return false; + base = parts->base; + index = parts->index; + disp = parts->disp; - /* Source 1 cannot be a constant. */ - if (CONSTANT_P (src1)) - return false; + if (disp && !base && !index) + { + if (TARGET_64BIT) + { + rtx symbol = disp; - /* Source 1 cannot be a non-matching memory. */ - if (MEM_P (src1) && !rtx_equal_p (dst, src1)) - /* Support "andhi/andsi/anddi" as a zero-extending move. 
*/ - return (code == AND - && (mode == HImode - || mode == SImode - || (TARGET_64BIT && mode == DImode)) - && satisfies_constraint_L (src2)); + if (GET_CODE (disp) == CONST) + symbol = XEXP (disp, 0); + if (GET_CODE (symbol) == PLUS + && CONST_INT_P (XEXP (symbol, 1))) + symbol = XEXP (symbol, 0); - return true; + if (GET_CODE (symbol) == LABEL_REF + || (GET_CODE (symbol) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (symbol) == 0) + || (GET_CODE (symbol) == UNSPEC + && (XINT (symbol, 1) == UNSPEC_GOTPCREL + || XINT (symbol, 1) == UNSPEC_PCREL + || XINT (symbol, 1) == UNSPEC_GOTNTPOFF))) + return true; + } + } + return false; } -/* Attempt to expand a unary operator. Make the expansion closer to the - actual machine, then just general_operand, which will allow 2 separate - memory references (one output, one input) in a single insn. */ +/* Calculate the length of the memory address in the instruction encoding. + Includes addr32 prefix, does not include the one-byte modrm, opcode, + or other prefixes. We never generate addr32 prefix for LEA insn. */ -void -ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) +int +memory_address_length (rtx addr, bool lea) { - bool matching_memory = false; - rtx src, dst, op, clob; - - dst = operands[0]; - src = operands[1]; + struct ix86_address parts; + rtx base, index, disp; + int len; + int ok; - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - if (MEM_P (dst)) - { - if (rtx_equal_p (dst, src)) - matching_memory = true; - else - dst = gen_reg_rtx (mode); - } + if (GET_CODE (addr) == PRE_DEC + || GET_CODE (addr) == POST_INC + || GET_CODE (addr) == PRE_MODIFY + || GET_CODE (addr) == POST_MODIFY) + return 0; - /* When source operand is memory, destination must match. */ - if (MEM_P (src) && !matching_memory) - src = force_reg (mode, src); + ok = ix86_decompose_address (addr, &parts); + gcc_assert (ok); - /* Emit the instruction. */ + len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1; - op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src)); + /* If this is not LEA instruction, add the length of addr32 prefix. */ + if (TARGET_64BIT && !lea + && (SImode_address_operand (addr, VOIDmode) + || (parts.base && GET_MODE (parts.base) == SImode) + || (parts.index && GET_MODE (parts.index) == SImode))) + len++; - if (code == NOT) - emit_insn (op); - else - { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); - } + base = parts.base; + index = parts.index; + disp = parts.disp; - /* Fix up the destination if needed. */ - if (dst != operands[0]) - emit_move_insn (operands[0], dst); -} + if (base && SUBREG_P (base)) + base = SUBREG_REG (base); + if (index && SUBREG_P (index)) + index = SUBREG_REG (index); -/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and - divisor are within the range [0-255]. 
*/ + gcc_assert (base == NULL_RTX || REG_P (base)); + gcc_assert (index == NULL_RTX || REG_P (index)); -void -ix86_split_idivmod (machine_mode mode, rtx operands[], - bool signed_p) -{ - rtx_code_label *end_label, *qimode_label; - rtx div, mod; - rtx_insn *insn; - rtx scratch, tmp0, tmp1, tmp2; - rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); - rtx (*gen_zero_extend) (rtx, rtx); - rtx (*gen_test_ccno_1) (rtx, rtx); + /* Rule of thumb: + - esp as the base always wants an index, + - ebp as the base always wants a displacement, + - r12 as the base always wants an index, + - r13 as the base always wants a displacement. */ - switch (mode) + /* Register Indirect. */ + if (base && !index && !disp) { - case E_SImode: - if (GET_MODE (operands[0]) == SImode) - { - if (GET_MODE (operands[1]) == SImode) - gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1; - else - gen_divmod4_1 - = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2; - gen_zero_extend = gen_zero_extendqisi2; - } - else - { - gen_divmod4_1 - = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1; - gen_zero_extend = gen_zero_extendqidi2; - } - gen_test_ccno_1 = gen_testsi_ccno_1; - break; - case E_DImode: - gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1; - gen_test_ccno_1 = gen_testdi_ccno_1; - gen_zero_extend = gen_zero_extendqidi2; - break; - default: - gcc_unreachable (); + /* esp (for its index) and ebp (for its displacement) need + the two-byte modrm form. Similarly for r12 and r13 in 64-bit + code. */ + if (base == arg_pointer_rtx + || base == frame_pointer_rtx + || REGNO (base) == SP_REG + || REGNO (base) == BP_REG + || REGNO (base) == R12_REG + || REGNO (base) == R13_REG) + len++; } - end_label = gen_label_rtx (); - qimode_label = gen_label_rtx (); - - scratch = gen_reg_rtx (mode); - - /* Use 8bit unsigned divimod if dividend and divisor are within - the range [0-255]. */ - emit_move_insn (scratch, operands[2]); - scratch = expand_simple_binop (mode, IOR, scratch, operands[3], - scratch, 1, OPTAB_DIRECT); - emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100))); - tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); - tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, - gen_rtx_LABEL_REF (VOIDmode, qimode_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = qimode_label; - - /* Generate original signed/unsigned divimod. */ - div = gen_divmod4_1 (operands[0], operands[1], - operands[2], operands[3]); - emit_insn (div); - - /* Branch to the end. */ - emit_jump_insn (gen_jump (end_label)); - emit_barrier (); - - /* Generate 8bit unsigned divide. */ - emit_label (qimode_label); - /* Don't use operands[0] for result of 8bit divide since not all - registers support QImode ZERO_EXTRACT. */ - tmp0 = lowpart_subreg (HImode, scratch, mode); - tmp1 = lowpart_subreg (HImode, operands[2], mode); - tmp2 = lowpart_subreg (QImode, operands[3], mode); - emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); - - if (signed_p) + /* Direct Addressing. In 64-bit mode mod 00 r/m 5 + is not disp32, but disp32(%rip), so for disp32 + SIB byte is needed, unless print_operand_address + optimizes it into disp32(%rip) or (%rip) is implied + by UNSPEC. 
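The run-time guard emitted by ix86_split_idivmod ORs the two operands and tests the bits above the low byte; only when none are set does the cheap 8-bit divide path apply. A runnable sketch of that predicate (the function name is hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the "or" of the operands followed by "test $-0x100".  */
static int
fits_8bit_divmod (uint32_t dividend, uint32_t divisor)
{
  return ((dividend | divisor) & ~(uint32_t) 0xff) == 0;
}

int
main (void)
{
  printf ("%d %d\n", fits_8bit_divmod (200, 7), fits_8bit_divmod (300, 7));
  return 0;
}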
*/ + else if (disp && !base && !index) { - div = gen_rtx_DIV (mode, operands[2], operands[3]); - mod = gen_rtx_MOD (mode, operands[2], operands[3]); + len += 4; + if (!ix86_rip_relative_addr_p (&parts)) + len++; } else { - div = gen_rtx_UDIV (mode, operands[2], operands[3]); - mod = gen_rtx_UMOD (mode, operands[2], operands[3]); - } - if (mode == SImode) - { - if (GET_MODE (operands[0]) != SImode) - div = gen_rtx_ZERO_EXTEND (DImode, div); - if (GET_MODE (operands[1]) != SImode) - mod = gen_rtx_ZERO_EXTEND (DImode, mod); - } + /* Find the length of the displacement constant. */ + if (disp) + { + if (base && satisfies_constraint_K (disp)) + len += 1; + else + len += 4; + } + /* ebp always wants a displacement. Similarly r13. */ + else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) + len++; - /* Extract remainder from AH. */ - tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), - tmp0, GEN_INT (8), GEN_INT (8)); - if (REG_P (operands[1])) - insn = emit_move_insn (operands[1], tmp1); - else - { - /* Need a new scratch register since the old one has result - of 8bit divide. */ - scratch = gen_reg_rtx (GET_MODE (operands[1])); - emit_move_insn (scratch, tmp1); - insn = emit_move_insn (operands[1], scratch); + /* An index requires the two-byte modrm form.... */ + if (index + /* ...like esp (or r12), which always wants an index. */ + || base == arg_pointer_rtx + || base == frame_pointer_rtx + || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) + len++; } - set_unique_reg_note (insn, REG_EQUAL, mod); - /* Zero extend quotient from AL. */ - tmp1 = gen_lowpart (QImode, tmp0); - insn = emit_insn (gen_zero_extend (operands[0], tmp1)); - set_unique_reg_note (insn, REG_EQUAL, div); - - emit_label (end_label); + return len; } -#define LEA_MAX_STALL (3) -#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1) +/* Compute default value for "length_immediate" attribute. When SHORTFORM + is set, expect that insn have 8bit immediate alternative. */ +int +ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform) +{ + int len = 0; + int i; + extract_insn_cached (insn); + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (CONSTANT_P (recog_data.operand[i])) + { + enum attr_mode mode = get_attr_mode (insn); -/* Increase given DISTANCE in half-cycles according to - dependencies between PREV and NEXT instructions. - Add 1 half-cycle if there is no dependency and - go to next cycle if there is some dependecy. */ + gcc_assert (!len); + if (shortform && CONST_INT_P (recog_data.operand[i])) + { + HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]); + switch (mode) + { + case MODE_QI: + len = 1; + continue; + case MODE_HI: + ival = trunc_int_for_mode (ival, HImode); + break; + case MODE_SI: + ival = trunc_int_for_mode (ival, SImode); + break; + default: + break; + } + if (IN_RANGE (ival, -128, 127)) + { + len = 1; + continue; + } + } + switch (mode) + { + case MODE_QI: + len = 1; + break; + case MODE_HI: + len = 2; + break; + case MODE_SI: + len = 4; + break; + /* Immediates for DImode instructions are encoded + as 32bit sign extended values. */ + case MODE_DI: + len = 4; + break; + default: + fatal_insn ("unknown insn mode", insn); + } + } + return len; +} -static unsigned int -increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance) +/* Compute default value for "length_address" attribute. 
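Editor's note: memory_address_length above encodes the classic ModRM/SIB corner cases: %esp/%r12 as a base force a SIB byte, %ebp/%r13 as a base force at least a disp8, and a displacement that fits a signed byte costs 1 byte instead of 4. A minimal standalone sketch of those size rules (register numbers are the hardware encodings, the helper name is invented, and the %rip-relative disp-only special case is left out):

#include <stdbool.h>

enum { SP_REGNO = 4, BP_REGNO = 5, R12_REGNO = 12, R13_REGNO = 13, NO_REG = -1 };

/* Extra bytes beyond opcode + ModRM for base/index/disp, mirroring the
   "rule of thumb" comment in the hunk above.  */
int
addr_extra_bytes (int base, int index, long disp, bool has_disp)
{
  int len = 0;

  if (has_disp)
    len += (base != NO_REG && disp >= -128 && disp <= 127) ? 1 : 4;
  else if (base == BP_REGNO || base == R13_REGNO)
    len += 1;               /* mod=00 with ebp/r13 still needs a disp8  */

  if (index != NO_REG || base == SP_REGNO || base == R12_REGNO)
    len += 1;               /* SIB byte                                 */

  return len;
}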
*/ +int +ix86_attr_length_address_default (rtx_insn *insn) { - df_ref def, use; + int i; - if (!prev || !next) - return distance + (distance & 1) + 2; + if (get_attr_type (insn) == TYPE_LEA) + { + rtx set = PATTERN (insn), addr; - if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev)) - return distance + 1; + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); - FOR_EACH_INSN_USE (use, next) - FOR_EACH_INSN_DEF (def, prev) - if (!DF_REF_IS_ARTIFICIAL (def) - && DF_REF_REGNO (use) == DF_REF_REGNO (def)) - return distance + (distance & 1) + 2; + gcc_assert (GET_CODE (set) == SET); - return distance + 1; -} + addr = SET_SRC (set); -/* Function checks if instruction INSN defines register number - REGNO1 or REGNO2. */ + return memory_address_length (addr, true); + } -static bool -insn_defines_reg (unsigned int regno1, unsigned int regno2, - rtx_insn *insn) -{ - df_ref def; + extract_insn_cached (insn); + for (i = recog_data.n_operands - 1; i >= 0; --i) + { + rtx op = recog_data.operand[i]; + if (MEM_P (op)) + { + constrain_operands_cached (insn, reload_completed); + if (which_alternative != -1) + { + const char *constraints = recog_data.constraints[i]; + int alt = which_alternative; - FOR_EACH_INSN_DEF (def, insn) - if (DF_REF_REG_DEF_P (def) - && !DF_REF_IS_ARTIFICIAL (def) - && (regno1 == DF_REF_REGNO (def) - || regno2 == DF_REF_REGNO (def))) - return true; + while (*constraints == '=' || *constraints == '+') + constraints++; + while (alt-- > 0) + while (*constraints++ != ',') + ; + /* Skip ignored operands. */ + if (*constraints == 'X') + continue; + } - return false; + int len = memory_address_length (XEXP (op, 0), false); + + /* Account for segment prefix for non-default addr spaces. */ + if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op))) + len++; + + return len; + } + } + return 0; } -/* Function checks if instruction INSN uses register number - REGNO as a part of address expression. */ +/* Compute default value for "length_vex" attribute. It includes + 2 or 3 byte VEX prefix and 1 opcode byte. */ -static bool -insn_uses_reg_mem (unsigned int regno, rtx insn) +int +ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, + bool has_vex_w) { - df_ref use; + int i; - FOR_EACH_INSN_USE (use, insn) - if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use)) - return true; + /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 + byte VEX prefix. */ + if (!has_0f_opcode || has_vex_w) + return 3 + 1; - return false; -} + /* We can always use 2 byte VEX prefix in 32bit. */ + if (!TARGET_64BIT) + return 2 + 1; -/* Search backward for non-agu definition of register number REGNO1 - or register number REGNO2 in basic block starting from instruction - START up to head of basic block or instruction INSN. + extract_insn_cached (insn); - Function puts true value into *FOUND var if definition was found - and false otherwise. + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (REG_P (recog_data.operand[i])) + { + /* REX.W bit uses 3 byte VEX prefix. */ + if (GET_MODE (recog_data.operand[i]) == DImode + && GENERAL_REG_P (recog_data.operand[i])) + return 3 + 1; + } + else + { + /* REX.X or REX.B bits use 3 byte VEX prefix. */ + if (MEM_P (recog_data.operand[i]) + && x86_extended_reg_mentioned_p (recog_data.operand[i])) + return 3 + 1; + } - Distance in half-cycles between START and found instruction or head - of BB is added to DISTANCE and returned. 
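Editor's note: ix86_attr_length_vex_default above chooses between the 2-byte (C5) and 3-byte (C4) VEX prefixes. The short form can only express the 0F opcode map, has no VEX.W bit, and cannot encode the REX.X/REX.B-style bits needed when an extended register appears in a memory operand, while a 64-bit general-register operand (REX.W) also forces the long form. A hedged standalone sketch of that decision (parameter names are invented):

#include <stdbool.h>

/* Bytes used by the VEX prefix plus the opcode byte:
   2-byte VEX + opcode = 3, 3-byte VEX + opcode = 4.  */
int
vex_prefix_and_opcode_len (bool map_is_0f, bool vex_w,
                           bool is_64bit, bool needs_rex_w_operand,
                           bool ext_reg_in_mem_operand)
{
  /* Only the 0F map fits the 2-byte form; VEX.W exists only in the
     3-byte form.  */
  if (!map_is_0f || vex_w)
    return 3 + 1;

  /* Outside 64-bit mode there are no extended registers to encode.  */
  if (!is_64bit)
    return 2 + 1;

  /* DImode general registers need REX.W; extended registers in a memory
     operand need REX.X/REX.B.  Both require the 3-byte form.  */
  if (needs_rex_w_operand || ext_reg_in_mem_operand)
    return 3 + 1;

  return 2 + 1;
}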
*/ + return 2 + 1; +} + -static int -distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2, - rtx_insn *insn, int distance, - rtx_insn *start, bool *found) -{ - basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL; - rtx_insn *prev = start; - rtx_insn *next = NULL; +static bool +ix86_class_likely_spilled_p (reg_class_t); - *found = false; +/* Returns true if lhs of insn is HW function argument register and set up + is_spilled to true if it is likely spilled HW register. */ +static bool +insn_is_function_arg (rtx insn, bool* is_spilled) +{ + rtx dst; - while (prev - && prev != insn - && distance < LEA_SEARCH_THRESHOLD) + if (!NONDEBUG_INSN_P (insn)) + return false; + /* Call instructions are not movable, ignore it. */ + if (CALL_P (insn)) + return false; + insn = PATTERN (insn); + if (GET_CODE (insn) == PARALLEL) + insn = XVECEXP (insn, 0, 0); + if (GET_CODE (insn) != SET) + return false; + dst = SET_DEST (insn); + if (REG_P (dst) && HARD_REGISTER_P (dst) + && ix86_function_arg_regno_p (REGNO (dst))) { - if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev)) - { - distance = increase_distance (prev, next, distance); - if (insn_defines_reg (regno1, regno2, prev)) - { - if (recog_memoized (prev) < 0 - || get_attr_type (prev) != TYPE_LEA) - { - *found = true; - return distance; - } - } - - next = prev; - } - if (prev == BB_HEAD (bb)) - break; - - prev = PREV_INSN (prev); + /* Is it likely spilled HW register? */ + if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst)) + && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))) + *is_spilled = true; + return true; } - - return distance; + return false; } -/* Search backward for non-agu definition of register number REGNO1 - or register number REGNO2 in INSN's basic block until - 1. Pass LEA_SEARCH_THRESHOLD instructions, or - 2. Reach neighbor BBs boundary, or - 3. Reach agu definition. - Returns the distance between the non-agu definition point and INSN. - If no definition point, returns -1. */ - -static int -distance_non_agu_define (unsigned int regno1, unsigned int regno2, - rtx_insn *insn) +/* Add output dependencies for chain of function adjacent arguments if only + there is a move to likely spilled HW register. Return first argument + if at least one dependence was added or NULL otherwise. */ +static rtx_insn * +add_parameter_dependencies (rtx_insn *call, rtx_insn *head) { - basic_block bb = BLOCK_FOR_INSN (insn); - int distance = 0; - bool found = false; + rtx_insn *insn; + rtx_insn *last = call; + rtx_insn *first_arg = NULL; + bool is_spilled = false; - if (insn != BB_HEAD (bb)) - distance = distance_non_agu_define_in_bb (regno1, regno2, insn, - distance, PREV_INSN (insn), - &found); + head = PREV_INSN (head); - if (!found && distance < LEA_SEARCH_THRESHOLD) + /* Find nearest to call argument passing instruction. 
*/ + while (true) { - edge e; - edge_iterator ei; - bool simple_loop = false; - - FOR_EACH_EDGE (e, ei, bb->preds) - if (e->src == bb) - { - simple_loop = true; - break; - } + last = PREV_INSN (last); + if (last == head) + return NULL; + if (!NONDEBUG_INSN_P (last)) + continue; + if (insn_is_function_arg (last, &is_spilled)) + break; + return NULL; + } - if (simple_loop) - distance = distance_non_agu_define_in_bb (regno1, regno2, - insn, distance, - BB_END (bb), &found); - else + first_arg = last; + while (true) + { + insn = PREV_INSN (last); + if (!INSN_P (insn)) + break; + if (insn == head) + break; + if (!NONDEBUG_INSN_P (insn)) { - int shortest_dist = -1; - bool found_in_bb = false; - - FOR_EACH_EDGE (e, ei, bb->preds) - { - int bb_dist - = distance_non_agu_define_in_bb (regno1, regno2, - insn, distance, - BB_END (e->src), - &found_in_bb); - if (found_in_bb) - { - if (shortest_dist < 0) - shortest_dist = bb_dist; - else if (bb_dist > 0) - shortest_dist = MIN (bb_dist, shortest_dist); - - found = true; - } - } - - distance = shortest_dist; + last = insn; + continue; + } + if (insn_is_function_arg (insn, &is_spilled)) + { + /* Add output depdendence between two function arguments if chain + of output arguments contains likely spilled HW registers. */ + if (is_spilled) + add_dependence (first_arg, insn, REG_DEP_OUTPUT); + first_arg = last = insn; } + else + break; } - - /* get_attr_type may modify recog data. We want to make sure - that recog data is valid for instruction INSN, on which - distance_non_agu_define is called. INSN is unchanged here. */ - extract_insn_cached (insn); - - if (!found) - return -1; - - return distance >> 1; + if (!is_spilled) + return NULL; + return first_arg; } -/* Return the distance in half-cycles between INSN and the next - insn that uses register number REGNO in memory address added - to DISTANCE. Return -1 if REGNO0 is set. - - Put true value into *FOUND if register usage was found and - false otherwise. - Put true value into *REDEFINED if register redefinition was - found and false otherwise. */ - -static int -distance_agu_use_in_bb (unsigned int regno, - rtx_insn *insn, int distance, rtx_insn *start, - bool *found, bool *redefined) +/* Add output or anti dependency from insn to first_arg to restrict its code + motion. */ +static void +avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn) { - basic_block bb = NULL; - rtx_insn *next = start; - rtx_insn *prev = NULL; - - *found = false; - *redefined = false; + rtx set; + rtx tmp; - if (start != NULL_RTX) + set = single_set (insn); + if (!set) + return; + tmp = SET_DEST (set); + if (REG_P (tmp)) { - bb = BLOCK_FOR_INSN (start); - if (start != BB_HEAD (bb)) - /* If insn and start belong to the same bb, set prev to insn, - so the call to increase_distance will increase the distance - between insns by 1. */ - prev = insn; + /* Add output dependency to the first function argument. */ + add_dependence (first_arg, insn, REG_DEP_OUTPUT); + return; } + /* Add anti dependency. */ + add_dependence (first_arg, insn, REG_DEP_ANTI); +} - while (next - && next != insn - && distance < LEA_SEARCH_THRESHOLD) +/* Avoid cross block motion of function argument through adding dependency + from the first non-jump instruction in bb. 
*/ +static void +add_dependee_for_func_arg (rtx_insn *arg, basic_block bb) +{ + rtx_insn *insn = BB_END (bb); + + while (insn) { - if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next)) + if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn)) { - distance = increase_distance(prev, next, distance); - if (insn_uses_reg_mem (regno, next)) - { - /* Return DISTANCE if OP0 is used in memory - address in NEXT. */ - *found = true; - return distance; - } - - if (insn_defines_reg (regno, INVALID_REGNUM, next)) + rtx set = single_set (insn); + if (set) { - /* Return -1 if OP0 is set in NEXT. */ - *redefined = true; - return -1; + avoid_func_arg_motion (arg, insn); + return; } - - prev = next; } - - if (next == BB_END (bb)) - break; - - next = NEXT_INSN (next); + if (insn == BB_HEAD (bb)) + return; + insn = PREV_INSN (insn); } - - return distance; } -/* Return the distance between INSN and the next insn that uses - register number REGNO0 in memory address. Return -1 if no such - a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */ +/* Hook for pre-reload schedule - avoid motion of function arguments + passed in likely spilled HW registers. */ +static void +ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail) +{ + rtx_insn *insn; + rtx_insn *first_arg = NULL; + if (reload_completed) + return; + while (head != tail && DEBUG_INSN_P (head)) + head = NEXT_INSN (head); + for (insn = tail; insn != head; insn = PREV_INSN (insn)) + if (INSN_P (insn) && CALL_P (insn)) + { + first_arg = add_parameter_dependencies (insn, head); + if (first_arg) + { + /* Add dependee for first argument to predecessors if only + region contains more than one block. */ + basic_block bb = BLOCK_FOR_INSN (insn); + int rgn = CONTAINING_RGN (bb->index); + int nr_blks = RGN_NR_BLOCKS (rgn); + /* Skip trivial regions and region head blocks that can have + predecessors outside of region. */ + if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0) + { + edge e; + edge_iterator ei; + + /* Regions are SCCs with the exception of selective + scheduling with pipelining of outer blocks enabled. + So also check that immediate predecessors of a non-head + block are in the same region. */ + FOR_EACH_EDGE (e, ei, bb->preds) + { + /* Avoid creating of loop-carried dependencies through + using topological ordering in the region. */ + if (rgn == CONTAINING_RGN (e->src->index) + && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index)) + add_dependee_for_func_arg (first_arg, e->src); + } + } + insn = first_arg; + if (insn == head) + break; + } + } + else if (first_arg) + avoid_func_arg_motion (first_arg, insn); +} +/* Hook for pre-reload schedule - set priority of moves from likely spilled + HW registers to maximum, to schedule them at soon as possible. These are + moves from function argument registers at the top of the function entry + and moves from function return value registers after call. 
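Editor's note: the pre-reload scheduling hooks above walk backwards from each call, find the adjacent moves into argument-passing hard registers, and add artificial dependencies through them when one of those registers is likely to be spilled, so the scheduler cannot interleave unrelated code. The toy model below is not GCC code; it only illustrates the "walk back from the call and pin the argument setups" shape on a plain array of records (all names are invented):

#include <stdbool.h>

struct toy_insn
{
  bool sets_arg_reg;       /* move into an argument-passing hard register */
  bool arg_reg_is_spilly;  /* that register is likely spilled             */
  bool keep_in_order;      /* artificial scheduling constraint            */
};

/* Mark the run of argument-setup moves immediately before the call at
   index CALL so a (toy) scheduler keeps them together whenever one of
   them targets a likely-spilled register.  Returns how many were marked.  */
int
pin_arg_setups (struct toy_insn *insn, int call)
{
  bool spilly = false;
  int n = 0;

  for (int i = call - 1; i >= 0 && insn[i].sets_arg_reg; i--, n++)
    spilly |= insn[i].arg_reg_is_spilly;

  if (!spilly)
    return 0;

  for (int i = call - 1; i >= call - n; i--)
    insn[i].keep_in_order = true;

  return n;
}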
*/ static int -distance_agu_use (unsigned int regno0, rtx_insn *insn) +ix86_adjust_priority (rtx_insn *insn, int priority) { - basic_block bb = BLOCK_FOR_INSN (insn); - int distance = 0; - bool found = false; - bool redefined = false; + rtx set; - if (insn != BB_END (bb)) - distance = distance_agu_use_in_bb (regno0, insn, distance, - NEXT_INSN (insn), - &found, &redefined); + if (reload_completed) + return priority; - if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD) + if (!NONDEBUG_INSN_P (insn)) + return priority; + + set = single_set (insn); + if (set) { - edge e; - edge_iterator ei; - bool simple_loop = false; + rtx tmp = SET_SRC (set); + if (REG_P (tmp) + && HARD_REGISTER_P (tmp) + && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp)) + && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp)))) + return current_sched_info->sched_max_insns_priority; + } - FOR_EACH_EDGE (e, ei, bb->succs) - if (e->dest == bb) - { - simple_loop = true; - break; - } + return priority; +} - if (simple_loop) - distance = distance_agu_use_in_bb (regno0, insn, - distance, BB_HEAD (bb), - &found, &redefined); - else +/* Prepare for scheduling pass. */ +static void +ix86_sched_init_global (FILE *, int, int) +{ + /* Install scheduling hooks for current CPU. Some of these hooks are used + in time-critical parts of the scheduler, so we only set them up when + they are actually used. */ + switch (ix86_tune) + { + case PROCESSOR_CORE2: + case PROCESSOR_NEHALEM: + case PROCESSOR_SANDYBRIDGE: + case PROCESSOR_HASWELL: + case PROCESSOR_GENERIC: + /* Do not perform multipass scheduling for pre-reload schedule + to save compile time. */ + if (reload_completed) { - int shortest_dist = -1; - bool found_in_bb = false; - bool redefined_in_bb = false; - - FOR_EACH_EDGE (e, ei, bb->succs) - { - int bb_dist - = distance_agu_use_in_bb (regno0, insn, - distance, BB_HEAD (e->dest), - &found_in_bb, &redefined_in_bb); - if (found_in_bb) - { - if (shortest_dist < 0) - shortest_dist = bb_dist; - else if (bb_dist > 0) - shortest_dist = MIN (bb_dist, shortest_dist); - - found = true; - } - } - - distance = shortest_dist; + ix86_core2i7_init_hooks (); + break; } + /* Fall through. */ + default: + targetm.sched.dfa_post_advance_cycle = NULL; + targetm.sched.first_cycle_multipass_init = NULL; + targetm.sched.first_cycle_multipass_begin = NULL; + targetm.sched.first_cycle_multipass_issue = NULL; + targetm.sched.first_cycle_multipass_backtrack = NULL; + targetm.sched.first_cycle_multipass_end = NULL; + targetm.sched.first_cycle_multipass_fini = NULL; + break; } - - if (!found || redefined) - return -1; - - return distance >> 1; } -/* Define this macro to tune LEA priority vs ADD, it take effect when - there is a dilemma of choicing LEA or ADD - Negative value: ADD is more preferred than LEA - Zero: Netrual - Positive value: LEA is more preferred than ADD*/ -#define IX86_LEA_PRIORITY 0 - -/* Return true if usage of lea INSN has performance advantage - over a sequence of instructions. Instructions sequence has - SPLIT_COST cycles higher latency than lea latency. */ + +/* Implement TARGET_STATIC_RTX_ALIGNMENT. */ -static bool -ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1, - unsigned int regno2, int split_cost, bool has_scale) +static HOST_WIDE_INT +ix86_static_rtx_alignment (machine_mode mode) { - int dist_define, dist_use; - - /* For Silvermont if using a 2-source or 3-source LEA for - non-destructive destination purposes, or due to wanting - ability to use SCALE, the use of LEA is justified. 
*/ - if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS - || TARGET_TREMONT || TARGET_INTEL) - { - if (has_scale) - return true; - if (split_cost < 1) - return false; - if (regno0 == regno1 || regno0 == regno2) - return false; - return true; - } + if (mode == DFmode) + return 64; + if (ALIGN_MODE_128 (mode)) + return MAX (128, GET_MODE_ALIGNMENT (mode)); + return GET_MODE_ALIGNMENT (mode); +} - dist_define = distance_non_agu_define (regno1, regno2, insn); - dist_use = distance_agu_use (regno0, insn); +/* Implement TARGET_CONSTANT_ALIGNMENT. */ - if (dist_define < 0 || dist_define >= LEA_MAX_STALL) +static HOST_WIDE_INT +ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align) +{ + if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST + || TREE_CODE (exp) == INTEGER_CST) { - /* If there is no non AGU operand definition, no AGU - operand usage and split cost is 0 then both lea - and non lea variants have same priority. Currently - we prefer lea for 64 bit code and non lea on 32 bit - code. */ - if (dist_use < 0 && split_cost == 0) - return TARGET_64BIT || IX86_LEA_PRIORITY; - else - return true; + machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); + HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode); + return MAX (mode_align, align); } + else if (!optimize_size && TREE_CODE (exp) == STRING_CST + && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD) + return BITS_PER_WORD; - /* With longer definitions distance lea is more preferable. - Here we change it to take into account splitting cost and - lea priority. */ - dist_define += split_cost + IX86_LEA_PRIORITY; + return align; +} - /* If there is no use in memory addess then we just check - that split cost exceeds AGU stall. */ - if (dist_use < 0) - return dist_define > LEA_MAX_STALL; +/* Implement TARGET_EMPTY_RECORD_P. */ - /* If this insn has both backward non-agu dependence and forward - agu dependence, the one with short distance takes effect. */ - return dist_define >= dist_use; +static bool +ix86_is_empty_record (const_tree type) +{ + if (!TARGET_64BIT) + return false; + return default_is_empty_record (type); } -/* Return true if it is legal to clobber flags by INSN and - false otherwise. */ +/* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */ -static bool -ix86_ok_to_clobber_flags (rtx_insn *insn) +static void +ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type) { - basic_block bb = BLOCK_FOR_INSN (insn); - df_ref use; - bitmap live; + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - while (insn) - { - if (NONDEBUG_INSN_P (insn)) - { - FOR_EACH_INSN_USE (use, insn) - if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG) - return false; + if (!cum->warn_empty) + return; - if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn)) - return true; - } - - if (insn == BB_END (bb)) - break; - - insn = NEXT_INSN (insn); - } - - live = df_get_live_out(bb); - return !REGNO_REG_SET_P (live, FLAGS_REG); -} - -/* Return true if we need to split op0 = op1 + op2 into a sequence of - move and add to avoid AGU stalls. */ + if (!TYPE_EMPTY_P (type)) + return; -bool -ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[]) -{ - unsigned int regno0, regno1, regno2; + /* Don't warn if the function isn't visible outside of the TU. */ + if (cum->decl && !TREE_PUBLIC (cum->decl)) + return; - /* Check if we need to optimize. 
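Editor's note: ix86_static_rtx_alignment and ix86_constant_alignment above bump DFmode constants to 64 bits, 128-bit-or-wider modes to at least 128 bits, and long string constants to word alignment when not optimizing for size. A standalone sketch of that mapping, assuming invented names, treating the mode as a plain bit width, and hard-coding the ia32 word size:

#include <stdbool.h>

/* Alignment in bits for a simplified constant.  MODE_BITS is the bit
   width of the constant's mode, IS_DOUBLE marks DFmode, STRING_LEN is
   nonzero for string literals, ALIGN is the default alignment.  */
unsigned int
constant_alignment_bits (unsigned int mode_bits, bool is_double,
                         unsigned int string_len, bool optimize_size,
                         unsigned int align)
{
  const unsigned int word_bits = 32;   /* BITS_PER_WORD on ia32 (assumed) */

  if (is_double && align < 64)
    return 64;
  if (mode_bits >= 128 && align < 128)
    return 128;
  if (!optimize_size && string_len >= 31 && align < word_bits)
    return word_bits;
  return align;
}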
*/ - if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) - return false; + const_tree ctx = get_ultimate_context (cum->decl); + if (ctx != NULL_TREE + && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) + return; - /* Check it is correct to split here. */ - if (!ix86_ok_to_clobber_flags(insn)) - return false; + /* If the actual size of the type is zero, then there is no change + in how objects of this size are passed. */ + if (int_size_in_bytes (type) == 0) + return; - regno0 = true_regnum (operands[0]); - regno1 = true_regnum (operands[1]); - regno2 = true_regnum (operands[2]); + warning (OPT_Wabi, "empty class %qT parameter passing ABI " + "changes in %<-fabi-version=12%> (GCC 8)", type); - /* We need to split only adds with non destructive - destination operand. */ - if (regno0 == regno1 || regno0 == regno2) - return false; - else - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); + /* Only warn once. */ + cum->warn_empty = false; } -/* Return true if we should emit lea instruction instead of mov - instruction. */ +/* This hook returns name of multilib ABI. */ -bool -ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[]) +static const char * +ix86_get_multilib_abi_name (void) { - unsigned int regno0, regno1; - - /* Check if we need to optimize. */ - if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) - return false; - - /* Use lea for reg to reg moves only. */ - if (!REG_P (operands[0]) || !REG_P (operands[1])) - return false; - - regno0 = true_regnum (operands[0]); - regno1 = true_regnum (operands[1]); - - return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); + if (!(TARGET_64BIT_P (ix86_isa_flags))) + return "i386"; + else if (TARGET_X32_P (ix86_isa_flags)) + return "x32"; + else + return "x86_64"; } -/* Return true if we need to split lea into a sequence of - instructions to avoid AGU stalls. */ +/* Compute the alignment for a variable for Intel MCU psABI. TYPE is + the data type, and ALIGN is the alignment that the object would + ordinarily have. */ -bool -ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[]) +static int +iamcu_alignment (tree type, int align) { - unsigned int regno0, regno1, regno2; - int split_cost; - struct ix86_address parts; - int ok; - - /* Check we need to optimize. */ - if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun)) - return false; - - /* The "at least two components" test below might not catch simple - move or zero extension insns if parts.base is non-NULL and parts.disp - is const0_rtx as the only components in the address, e.g. if the - register is %rbp or %r13. As this test is much cheaper and moves or - zero extensions are the common case, do this check first. */ - if (REG_P (operands[1]) - || (SImode_address_operand (operands[1], VOIDmode) - && REG_P (XEXP (operands[1], 0)))) - return false; - - /* Check if it is OK to split here. */ - if (!ix86_ok_to_clobber_flags (insn)) - return false; - - ok = ix86_decompose_address (operands[1], &parts); - gcc_assert (ok); - - /* There should be at least two components in the address. */ - if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX) - + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2) - return false; - - /* We should not split into add if non legitimate pic - operand is used as displacement. 
*/ - if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp)) - return false; - - regno0 = true_regnum (operands[0]) ; - regno1 = INVALID_REGNUM; - regno2 = INVALID_REGNUM; - - if (parts.base) - regno1 = true_regnum (parts.base); - if (parts.index) - regno2 = true_regnum (parts.index); + machine_mode mode; - split_cost = 0; + if (align < 32 || TYPE_USER_ALIGN (type)) + return align; - /* Compute how many cycles we will add to execution time - if split lea into a sequence of instructions. */ - if (parts.base || parts.index) + /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4 + bytes. */ + mode = TYPE_MODE (strip_array_types (type)); + switch (GET_MODE_CLASS (mode)) { - /* Have to use mov instruction if non desctructive - destination form is used. */ - if (regno1 != regno0 && regno2 != regno0) - split_cost += 1; - - /* Have to add index to base if both exist. */ - if (parts.base && parts.index) - split_cost += 1; - - /* Have to use shift and adds if scale is 2 or greater. */ - if (parts.scale > 1) - { - if (regno0 != regno1) - split_cost += 1; - else if (regno2 == regno0) - split_cost += 4; - else - split_cost += parts.scale; - } - - /* Have to use add instruction with immediate if - disp is non zero. */ - if (parts.disp && parts.disp != const0_rtx) - split_cost += 1; - - /* Subtract the price of lea. */ - split_cost -= 1; + case MODE_INT: + case MODE_COMPLEX_INT: + case MODE_COMPLEX_FLOAT: + case MODE_FLOAT: + case MODE_DECIMAL_FLOAT: + return 32; + default: + return align; } - - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, - parts.scale > 1); } -/* Emit x86 binary operand CODE in mode MODE, where the first operand - matches destination. RTX includes clobber of FLAGS_REG. */ +/* Compute the alignment for a static variable. + TYPE is the data type, and ALIGN is the alignment that + the object would ordinarily have. The value of this function is used + instead of that alignment to align the object. */ -static void -ix86_emit_binop (enum rtx_code code, machine_mode mode, - rtx dst, rtx src) +int +ix86_data_alignment (tree type, unsigned int align, bool opt) { - rtx op, clob; + /* GCC 4.8 and earlier used to incorrectly assume this alignment even + for symbols from other compilation units or symbols that don't need + to bind locally. In order to preserve some ABI compatibility with + those compilers, ensure we don't decrease alignment from what we + used to assume. */ - op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); -} + unsigned int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT); -/* Return true if regno1 def is nearest to the insn. */ + /* A data structure, equal or greater than the size of a cache line + (64 bytes in the Pentium 4 and other recent Intel processors, including + processors based on Intel Core microarchitecture) should be aligned + so that its base address is a multiple of a cache line size. 
*/ -static bool -find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) -{ - rtx_insn *prev = insn; - rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn)); + unsigned int max_align + = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT); - if (insn == start) - return false; - while (prev && prev != start) + if (max_align < BITS_PER_WORD) + max_align = BITS_PER_WORD; + + switch (ix86_align_data_type) { - if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev)) - { - prev = PREV_INSN (prev); - continue; - } - if (insn_defines_reg (regno1, INVALID_REGNUM, prev)) - return true; - else if (insn_defines_reg (regno2, INVALID_REGNUM, prev)) - return false; - prev = PREV_INSN (prev); + case ix86_align_data_type_abi: opt = false; break; + case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break; + case ix86_align_data_type_cacheline: break; } - /* None of the regs is defined in the bb. */ - return false; -} - -/* Split lea instructions into a sequence of instructions - which are executed on ALU to avoid AGU stalls. - It is assumed that it is allowed to clobber flags register - at lea position. */ - -void -ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) -{ - unsigned int regno0, regno1, regno2; - struct ix86_address parts; - rtx target, tmp; - int ok, adds; - - ok = ix86_decompose_address (operands[1], &parts); - gcc_assert (ok); - - target = gen_lowpart (mode, operands[0]); - - regno0 = true_regnum (target); - regno1 = INVALID_REGNUM; - regno2 = INVALID_REGNUM; + if (TARGET_IAMCU) + align = iamcu_alignment (type, align); - if (parts.base) + if (opt + && AGGREGATE_TYPE_P (type) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST) { - parts.base = gen_lowpart (mode, parts.base); - regno1 = true_regnum (parts.base); + if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat) + && align < max_align_compat) + align = max_align_compat; + if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align) + && align < max_align) + align = max_align; } - if (parts.index) + /* x86-64 ABI requires arrays greater than 16 bytes to be aligned + to 16byte boundary. */ + if (TARGET_64BIT) { - parts.index = gen_lowpart (mode, parts.index); - regno2 = true_regnum (parts.index); + if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST + && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) + && align < 128) + return 128; } - if (parts.disp) - parts.disp = gen_lowpart (mode, parts.disp); + if (!opt) + return align; - if (parts.scale > 1) + if (TREE_CODE (type) == ARRAY_TYPE) + { + if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == COMPLEX_TYPE) { - /* Case r1 = r1 + ... */ - if (regno1 == regno0) - { - /* If we have a case r1 = r1 + C * r2 then we - should use multiplication which is very - expensive. Assume cost model is wrong if we - have such case here. */ - gcc_assert (regno2 != regno0); - - for (adds = parts.scale; adds > 0; adds--) - ix86_emit_binop (PLUS, mode, target, parts.index); - } - else - { - /* r1 = r2 + r3 * C case. Need to move r3 into r1. */ - if (regno0 != regno2) - emit_insn (gen_rtx_SET (target, parts.index)); - - /* Use shift for scaling. 
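Editor's note: ix86_data_alignment above applies three independent bumps: keep at least the alignment GCC 4.8 used to assume (capped at 256 bits) for ABI compatibility, align aggregates of at least a cache line to the cache-line size, and on x86-64 give arrays of 16 bytes or more the 16-byte alignment the psABI requires. A minimal standalone sketch of those size thresholds, with invented names and a 64-byte cache line assumed:

#include <stdbool.h>

/* Simplified static-data alignment, in bits.  SIZE_BITS is the object
   size, IS_AGGREGATE marks structs/arrays, ALIGN is the default.  */
unsigned int
data_alignment_bits (unsigned long long size_bits, bool is_aggregate,
                     bool is_array, bool lp64, unsigned int align)
{
  const unsigned int compat_cap = 256;          /* GCC 4.8 compatibility  */
  const unsigned int cache_line_bits = 64 * 8;  /* assumed 64-byte line   */

  if (is_aggregate && size_bits >= compat_cap && align < compat_cap)
    align = compat_cap;
  if (is_aggregate && size_bits >= cache_line_bits
      && align < cache_line_bits)
    align = cache_line_bits;

  /* x86-64 psABI: arrays of at least 16 bytes get 16-byte alignment.  */
  if (lp64 && is_array && size_bits >= 128 && align < 128)
    align = 128;

  return align;
}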
*/ - ix86_emit_binop (ASHIFT, mode, target, - GEN_INT (exact_log2 (parts.scale))); - - if (parts.base) - ix86_emit_binop (PLUS, mode, target, parts.base); - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); - } + if (TYPE_MODE (type) == DCmode && align < 64) + return 64; + if ((TYPE_MODE (type) == XCmode + || TYPE_MODE (type) == TCmode) && align < 128) + return 128; } - else if (!parts.base && !parts.index) + else if ((TREE_CODE (type) == RECORD_TYPE + || TREE_CODE (type) == UNION_TYPE + || TREE_CODE (type) == QUAL_UNION_TYPE) + && TYPE_FIELDS (type)) { - gcc_assert(parts.disp); - emit_insn (gen_rtx_SET (target, parts.disp)); + if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) + return 128; } - else + else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE + || TREE_CODE (type) == INTEGER_TYPE) { - if (!parts.base) - { - if (regno0 != regno2) - emit_insn (gen_rtx_SET (target, parts.index)); - } - else if (!parts.index) - { - if (regno0 != regno1) - emit_insn (gen_rtx_SET (target, parts.base)); - } - else - { - if (regno0 == regno1) - tmp = parts.index; - else if (regno0 == regno2) - tmp = parts.base; - else - { - rtx tmp1; + if (TYPE_MODE (type) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) + return 128; + } - /* Find better operand for SET instruction, depending - on which definition is farther from the insn. */ - if (find_nearest_reg_def (insn, regno1, regno2)) - tmp = parts.index, tmp1 = parts.base; - else - tmp = parts.base, tmp1 = parts.index; + return align; +} - emit_insn (gen_rtx_SET (target, tmp)); +/* Compute the alignment for a local variable or a stack slot. EXP is + the data type or decl itself, MODE is the widest mode available and + ALIGN is the alignment that the object would ordinarily have. The + value of this macro is used instead of that alignment to align the + object. */ - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); +unsigned int +ix86_local_alignment (tree exp, machine_mode mode, + unsigned int align) +{ + tree type, decl; - ix86_emit_binop (PLUS, mode, target, tmp1); - return; - } + if (exp && DECL_P (exp)) + { + type = TREE_TYPE (exp); + decl = exp; + } + else + { + type = exp; + decl = NULL; + } - ix86_emit_binop (PLUS, mode, target, tmp); - } + /* Don't do dynamic stack realignment for long long objects with + -mpreferred-stack-boundary=2. */ + if (!TARGET_64BIT + && align == 64 + && ix86_preferred_stack_boundary < 64 + && (mode == DImode || (type && TYPE_MODE (type) == DImode)) + && (!type || !TYPE_USER_ALIGN (type)) + && (!decl || !DECL_USER_ALIGN (decl))) + align = 32; - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); + /* If TYPE is NULL, we are allocating a stack slot for caller-save + register in MODE. We will return the largest alignment of XF + and DF. */ + if (!type) + { + if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode)) + align = GET_MODE_ALIGNMENT (DFmode); + return align; } -} -/* Return true if it is ok to optimize an ADD operation to LEA - operation to avoid flag register consumation. For most processors, - ADD is faster than LEA. For the processors like BONNELL, if the - destination register of LEA holds an actual address which will be - used soon, LEA is better and otherwise ADD is better. 
*/ + /* Don't increase alignment for Intel MCU psABI. */ + if (TARGET_IAMCU) + return align; -bool -ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[]) -{ - unsigned int regno0 = true_regnum (operands[0]); - unsigned int regno1 = true_regnum (operands[1]); - unsigned int regno2 = true_regnum (operands[2]); + /* x86-64 ABI requires arrays greater than 16 bytes to be aligned + to 16byte boundary. Exact wording is: - /* If a = b + c, (a!=b && a!=c), must use lea form. */ - if (regno0 != regno1 && regno0 != regno2) - return true; + An array uses the same alignment as its elements, except that a local or + global array variable of length at least 16 bytes or + a C99 variable-length array variable always has alignment of at least 16 bytes. - if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) - return false; + This was added to allow use of aligned SSE instructions at arrays. This + rule is meant for static storage (where compiler cannot do the analysis + by itself). We follow it for automatic variables only when convenient. + We fully control everything in the function compiled and functions from + other unit cannot rely on the alignment. - return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); + Exclude va_list type. It is the common case of local array where + we cannot benefit from the alignment. + + TODO: Probably one should optimize for size only when var is not escaping. */ + if (TARGET_64BIT && optimize_function_for_speed_p (cfun) + && TARGET_SSE) + { + if (AGGREGATE_TYPE_P (type) + && (va_list_type_node == NULL_TREE + || (TYPE_MAIN_VARIANT (type) + != TYPE_MAIN_VARIANT (va_list_type_node))) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST + && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) + && align < 128) + return 128; + } + if (TREE_CODE (type) == ARRAY_TYPE) + { + if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == COMPLEX_TYPE) + { + if (TYPE_MODE (type) == DCmode && align < 64) + return 64; + if ((TYPE_MODE (type) == XCmode + || TYPE_MODE (type) == TCmode) && align < 128) + return 128; + } + else if ((TREE_CODE (type) == RECORD_TYPE + || TREE_CODE (type) == UNION_TYPE + || TREE_CODE (type) == QUAL_UNION_TYPE) + && TYPE_FIELDS (type)) + { + if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE + || TREE_CODE (type) == INTEGER_TYPE) + { + + if (TYPE_MODE (type) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) + return 128; + } + return align; } -/* Return true if destination reg of SET_BODY is shift count of - USE_BODY. */ +/* Compute the minimum required alignment for dynamic stack realignment + purposes for a local variable, parameter or a stack slot. EXP is + the data type or decl itself, MODE is its mode and ALIGN is the + alignment that the object would ordinarily have. */ -static bool -ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body) +unsigned int +ix86_minimum_alignment (tree exp, machine_mode mode, + unsigned int align) { - rtx set_dest; - rtx shift_rtx; - int i; + tree type, decl; - /* Retrieve destination of SET_BODY. 
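Editor's note: ix86_local_alignment above and the ix86_minimum_alignment that follows share one 32-bit quirk: with -mpreferred-stack-boundary=2 the stack is only 4-byte aligned, so a long long (DImode) local that would normally ask for 64-bit alignment is deliberately dropped to 32 bits rather than forcing dynamic stack realignment. A hedged sketch of just that rule (names are invented):

#include <stdbool.h>

/* Alignment in bits to actually request for a local DImode object.
   PREFERRED_STACK_BITS models ix86_preferred_stack_boundary and
   USER_ALIGNED models an explicit aligned attribute on the decl/type.  */
unsigned int
local_dimode_alignment (bool is_64bit, unsigned int preferred_stack_bits,
                        bool user_aligned, unsigned int align)
{
  if (!is_64bit
      && align == 64
      && preferred_stack_bits < 64
      && !user_aligned)
    return 32;        /* avoid dynamic realignment just for a long long */
  return align;
}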
*/ - switch (GET_CODE (set_body)) + if (exp && DECL_P (exp)) { - case SET: - set_dest = SET_DEST (set_body); - if (!set_dest || !REG_P (set_dest)) - return false; - break; - case PARALLEL: - for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--) - if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i), - use_body)) - return true; - /* FALLTHROUGH */ - default: - return false; + type = TREE_TYPE (exp); + decl = exp; } - - /* Retrieve shift count of USE_BODY. */ - switch (GET_CODE (use_body)) + else { - case SET: - shift_rtx = XEXP (use_body, 1); - break; - case PARALLEL: - for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--) - if (ix86_dep_by_shift_count_body (set_body, - XVECEXP (use_body, 0, i))) - return true; - /* FALLTHROUGH */ - default: - return false; + type = exp; + decl = NULL; } - if (shift_rtx - && (GET_CODE (shift_rtx) == ASHIFT - || GET_CODE (shift_rtx) == LSHIFTRT - || GET_CODE (shift_rtx) == ASHIFTRT - || GET_CODE (shift_rtx) == ROTATE - || GET_CODE (shift_rtx) == ROTATERT)) - { - rtx shift_count = XEXP (shift_rtx, 1); + if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) + return align; - /* Return true if shift count is dest of SET_BODY. */ - if (REG_P (shift_count)) - { - /* Add check since it can be invoked before register - allocation in pre-reload schedule. */ - if (reload_completed - && true_regnum (set_dest) == true_regnum (shift_count)) - return true; - else if (REGNO(set_dest) == REGNO(shift_count)) - return true; - } + /* Don't do dynamic stack realignment for long long objects with + -mpreferred-stack-boundary=2. */ + if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) + && (!type || !TYPE_USER_ALIGN (type)) + && (!decl || !DECL_USER_ALIGN (decl))) + { + gcc_checking_assert (!TARGET_STV); + return 32; } - return false; + return align; } + +/* Find a location for the static chain incoming to a nested function. + This is a register, unless all free registers are used by arguments. */ -/* Return true if destination reg of SET_INSN is shift count of - USE_INSN. */ - -bool -ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn) +static rtx +ix86_static_chain (const_tree fndecl_or_type, bool incoming_p) { - return ix86_dep_by_shift_count_body (PATTERN (set_insn), - PATTERN (use_insn)); -} + unsigned regno; -/* Return TRUE or FALSE depending on whether the unary operator meets the - appropriate constraints. */ + if (TARGET_64BIT) + { + /* We always use R10 in 64-bit mode. */ + regno = R10_REG; + } + else + { + const_tree fntype, fndecl; + unsigned int ccvt; -bool -ix86_unary_operator_ok (enum rtx_code, - machine_mode, - rtx operands[2]) -{ - /* If one of operands is memory, source and destination must match. */ - if ((MEM_P (operands[0]) - || MEM_P (operands[1])) - && ! rtx_equal_p (operands[0], operands[1])) - return false; - return true; -} + /* By default in 32-bit mode we use ECX to pass the static chain. */ + regno = CX_REG; -/* Return TRUE if the operands to a vec_interleave_{high,low}v2df - are ok, keeping in mind the possible movddup alternative. 
*/ + if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL) + { + fntype = TREE_TYPE (fndecl_or_type); + fndecl = fndecl_or_type; + } + else + { + fntype = fndecl_or_type; + fndecl = NULL; + } -bool -ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high) -{ - if (MEM_P (operands[0])) - return rtx_equal_p (operands[0], operands[1 + high]); - if (MEM_P (operands[1]) && MEM_P (operands[2])) - return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]); - return true; + ccvt = ix86_get_callcvt (fntype); + if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + { + /* Fastcall functions use ecx/edx for arguments, which leaves + us with EAX for the static chain. + Thiscall functions use ecx for arguments, which also + leaves us with EAX for the static chain. */ + regno = AX_REG; + } + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + { + /* Thiscall functions use ecx for arguments, which leaves + us with EAX and EDX for the static chain. + We are using for abi-compatibility EAX. */ + regno = AX_REG; + } + else if (ix86_function_regparm (fntype, fndecl) == 3) + { + /* For regparm 3, we have no free call-clobbered registers in + which to store the static chain. In order to implement this, + we have the trampoline push the static chain to the stack. + However, we can't push a value below the return address when + we call the nested function directly, so we have to use an + alternate entry point. For this we use ESI, and have the + alternate entry point push ESI, so that things appear the + same once we're executing the nested function. */ + if (incoming_p) + { + if (fndecl == current_function_decl + && !ix86_static_chain_on_stack) + { + gcc_assert (!reload_completed); + ix86_static_chain_on_stack = true; + } + return gen_frame_mem (SImode, + plus_constant (Pmode, + arg_pointer_rtx, -8)); + } + regno = SI_REG; + } + } + + return gen_rtx_REG (Pmode, regno); } -/* Post-reload splitter for converting an SF or DFmode value in an - SSE register into an unsigned SImode. */ +/* Emit RTL insns to initialize the variable parts of a trampoline. + FNDECL is the decl of the target address; M_TRAMP is a MEM for + the trampoline, and CHAIN_VALUE is an RTX for the static chain + to be passed to the target function. */ -void -ix86_split_convert_uns_si_sse (rtx operands[]) +static void +ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) { - machine_mode vecmode; - rtx value, large, zero_or_two31, input, two31, x; + rtx mem, fnaddr; + int opcode; + int offset = 0; + bool need_endbr = (flag_cf_protection & CF_BRANCH); - large = operands[1]; - zero_or_two31 = operands[2]; - input = operands[3]; - two31 = operands[4]; - vecmode = GET_MODE (large); - value = gen_rtx_REG (vecmode, REGNO (operands[0])); + fnaddr = XEXP (DECL_RTL (fndecl), 0); - /* Load up the value into the low element. We must ensure that the other - elements are valid floats -- zero is the easiest such value. */ - if (MEM_P (input)) - { - if (vecmode == V4SFmode) - emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); - else - emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); - } - else + if (TARGET_64BIT) { - input = gen_rtx_REG (vecmode, REGNO (input)); - emit_move_insn (value, CONST0_RTX (vecmode)); - if (vecmode == V4SFmode) - emit_insn (gen_sse_movss (value, value, input)); - else - emit_insn (gen_sse2_movsd (value, value, input)); - } - - emit_move_insn (large, two31); - emit_move_insn (zero_or_two31, MEM_P (two31) ? 
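Editor's note: ix86_static_chain above picks where the static chain of a nested function lives: %r10 in 64-bit code, %ecx by default on ia32, %eax when fastcall/thiscall already claim %ecx (and, for fastcall, %edx), and a stack slot when regparm=3 leaves no call-clobbered register free. A compact standalone sketch of that decision table (the enum values are invented):

enum callcvt { CALLCVT_DEFAULT, CALLCVT_FASTCALL, CALLCVT_THISCALL };
enum chain_loc { CHAIN_R10, CHAIN_ECX, CHAIN_EAX, CHAIN_STACK };

/* Where the static chain for a nested function is passed.  */
enum chain_loc
static_chain_location (int is_64bit, enum callcvt cvt, int regparm)
{
  if (is_64bit)
    return CHAIN_R10;               /* always %r10 in 64-bit mode      */
  if (cvt == CALLCVT_FASTCALL || cvt == CALLCVT_THISCALL)
    return CHAIN_EAX;               /* %ecx (and %edx) carry arguments */
  if (regparm == 3)
    return CHAIN_STACK;             /* no free call-clobbered GPR left */
  return CHAIN_ECX;                 /* default ia32 choice             */
}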
large : two31); + int size; - x = gen_rtx_fmt_ee (LE, vecmode, large, value); - emit_insn (gen_rtx_SET (large, x)); + if (need_endbr) + { + /* Insert ENDBR64. */ + mem = adjust_address (m_tramp, SImode, offset); + emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode)); + offset += 4; + } - x = gen_rtx_AND (vecmode, zero_or_two31, large); - emit_insn (gen_rtx_SET (zero_or_two31, x)); + /* Load the function address to r11. Try to load address using + the shorter movl instead of movabs. We may want to support + movq for kernel mode, but kernel does not use trampolines at + the moment. FNADDR is a 32bit address and may not be in + DImode when ptr_mode == SImode. Always use movl in this + case. */ + if (ptr_mode == SImode + || x86_64_zext_immediate_operand (fnaddr, VOIDmode)) + { + fnaddr = copy_addr_to_reg (fnaddr); - x = gen_rtx_MINUS (vecmode, value, zero_or_two31); - emit_insn (gen_rtx_SET (value, x)); + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xbb41, HImode)); - large = gen_rtx_REG (V4SImode, REGNO (large)); - emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); + mem = adjust_address (m_tramp, SImode, offset + 2); + emit_move_insn (mem, gen_lowpart (SImode, fnaddr)); + offset += 6; + } + else + { + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xbb49, HImode)); - x = gen_rtx_REG (V4SImode, REGNO (value)); - if (vecmode == V4SFmode) - emit_insn (gen_fix_truncv4sfv4si2 (x, value)); - else - emit_insn (gen_sse2_cvttpd2dq (x, value)); - value = x; + mem = adjust_address (m_tramp, DImode, offset + 2); + emit_move_insn (mem, fnaddr); + offset += 10; + } - emit_insn (gen_xorv4si3 (value, value, large)); -} + /* Load static chain using movabs to r10. Use the shorter movl + instead of movabs when ptr_mode == SImode. */ + if (ptr_mode == SImode) + { + opcode = 0xba41; + size = 6; + } + else + { + opcode = 0xba49; + size = 10; + } -/* Convert an unsigned DImode value into a DFmode, using only SSE. - Expects the 64-bit DImode to be supplied in a pair of integral - registers. Requires SSE2; will use SSE3 if available. For x86_32, - -mfpmath=sse, !optimize_size only. */ + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (opcode, HImode)); -void -ix86_expand_convert_uns_didf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; - rtx int_xmm, fp_xmm; - rtx biases, exponents; - rtx x; + mem = adjust_address (m_tramp, ptr_mode, offset + 2); + emit_move_insn (mem, chain_value); + offset += size; - int_xmm = gen_reg_rtx (V4SImode); - if (TARGET_INTER_UNIT_MOVES_TO_VEC) - emit_insn (gen_movdi_to_sse (int_xmm, input)); - else if (TARGET_SSE_SPLIT_REGS) - { - emit_clobber (int_xmm); - emit_move_insn (gen_lowpart (DImode, int_xmm), input); + /* Jump to r11; the last (unused) byte is a nop, only there to + pad the write out to a single 32-bit store. 
*/ + mem = adjust_address (m_tramp, SImode, offset); + emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode)); + offset += 4; } else { - x = gen_reg_rtx (V2DImode); - ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); - emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); - } - - x = gen_rtx_CONST_VECTOR (V4SImode, - gen_rtvec (4, GEN_INT (0x43300000UL), - GEN_INT (0x45300000UL), - const0_rtx, const0_rtx)); - exponents = validize_mem (force_const_mem (V4SImode, x)); - - /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ - emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); - - /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) - yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). - Similarly (0x45300000UL ## fp_value_hi_xmm) yields - (0x1.0p84 + double(fp_value_hi_xmm)). - Note these exponents differ by 32. */ - - fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); - - /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values - in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ - real_ldexp (&bias_lo_rvt, &dconst1, 52); - real_ldexp (&bias_hi_rvt, &dconst1, 84); - biases = const_double_from_real_value (bias_lo_rvt, DFmode); - x = const_double_from_real_value (bias_hi_rvt, DFmode); - biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); - biases = validize_mem (force_const_mem (V2DFmode, biases)); - emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); - - /* Add the upper and lower DFmode values together. */ - if (TARGET_SSE3) - emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); - else - { - x = copy_to_mode_reg (V2DFmode, fp_xmm); - emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); - emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); - } + rtx disp, chain; - ix86_expand_vector_extract (false, target, fp_xmm, 0); -} + /* Depending on the static chain location, either load a register + with a constant, or push the constant to the stack. All of the + instructions are the same size. */ + chain = ix86_static_chain (fndecl, true); + if (REG_P (chain)) + { + switch (REGNO (chain)) + { + case AX_REG: + opcode = 0xb8; break; + case CX_REG: + opcode = 0xb9; break; + default: + gcc_unreachable (); + } + } + else + opcode = 0x68; -/* Not used, but eases macroization of patterns. */ -void -ix86_expand_convert_uns_sixf_sse (rtx, rtx) -{ - gcc_unreachable (); -} + if (need_endbr) + { + /* Insert ENDBR32. */ + mem = adjust_address (m_tramp, SImode, offset); + emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode)); + offset += 4; + } -/* Convert an unsigned SImode value into a DFmode. Only currently used - for SSE, but applicable anywhere. */ + mem = adjust_address (m_tramp, QImode, offset); + emit_move_insn (mem, gen_int_mode (opcode, QImode)); -void -ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE TWO31r; - rtx x, fp; + mem = adjust_address (m_tramp, SImode, offset + 1); + emit_move_insn (mem, chain_value); + offset += 5; - x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), - NULL, 1, OPTAB_DIRECT); + mem = adjust_address (m_tramp, QImode, offset); + emit_move_insn (mem, gen_int_mode (0xe9, QImode)); + + mem = adjust_address (m_tramp, SImode, offset + 1); - fp = gen_reg_rtx (DFmode); - emit_insn (gen_floatsidf2 (fp, x)); + /* Compute offset from the end of the jmp to the target function. 
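Editor's note: the 64-bit trampoline written above is a short literal byte sequence: an optional endbr64, a mov of the target address into %r11, a mov of the static chain into %r10, and a jmp *%r11 padded with a nop so every store is a whole word. The sketch below spells out the same opcode constants from the hunk (0x49 0xbb, 0x49 0xba, 0x49 0xff 0xe3, 0x90) for the plain DImode-pointer, no-endbr case; the buffer size and function name are assumptions, and a little-endian host is assumed when copying the immediates:

#include <stdint.h>
#include <string.h>

/* Fill BUF (at least 24 bytes, assumed) with an x86-64 trampoline:
     49 bb <func>   movabs $func,  %r11
     49 ba <chain>  movabs $chain, %r10
     49 ff e3       jmp    *%r11
     90             nop (pads the last store to 4 bytes)  */
void
fill_trampoline64 (unsigned char *buf, uint64_t func, uint64_t chain)
{
  size_t off = 0;

  buf[off++] = 0x49; buf[off++] = 0xbb;           /* movabs ..., %r11 */
  memcpy (buf + off, &func, 8);  off += 8;        /* imm64, host assumed LE */

  buf[off++] = 0x49; buf[off++] = 0xba;           /* movabs ..., %r10 */
  memcpy (buf + off, &chain, 8); off += 8;

  buf[off++] = 0x49; buf[off++] = 0xff; buf[off++] = 0xe3;  /* jmp *%r11 */
  buf[off++] = 0x90;                                        /* nop       */
}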
+ In the case in which the trampoline stores the static chain on + the stack, we need to skip the first insn which pushes the + (call-saved) register static chain; this push is 1 byte. */ + offset += 5; + int skip = MEM_P (chain) ? 1 : 0; + /* Skip ENDBR32 at the entry of the target function. */ + if (need_endbr + && !cgraph_node::get (fndecl)->only_called_directly_p ()) + skip += 4; + disp = expand_binop (SImode, sub_optab, fnaddr, + plus_constant (Pmode, XEXP (m_tramp, 0), + offset - skip), + NULL_RTX, 1, OPTAB_DIRECT); + emit_move_insn (mem, disp); + } - real_ldexp (&TWO31r, &dconst1, 31); - x = const_double_from_real_value (TWO31r, DFmode); + gcc_assert (offset <= TRAMPOLINE_SIZE); - x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); - if (x != target) - emit_move_insn (target, x); +#ifdef HAVE_ENABLE_EXECUTE_STACK +#ifdef CHECK_EXECUTE_STACK_ENABLED + if (CHECK_EXECUTE_STACK_ENABLED) +#endif + emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"), + LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode); +#endif } -/* Convert a signed DImode value into a DFmode. Only used for SSE in - 32-bit mode; otherwise we have a direct convert instruction. */ - -void -ix86_expand_convert_sign_didf_sse (rtx target, rtx input) +static bool +ix86_allocate_stack_slots_for_args (void) { - REAL_VALUE_TYPE TWO32r; - rtx fp_lo, fp_hi, x; - - fp_lo = gen_reg_rtx (DFmode); - fp_hi = gen_reg_rtx (DFmode); + /* Naked functions should not allocate stack slots for arguments. */ + return !ix86_function_naked (current_function_decl); +} - emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); - - real_ldexp (&TWO32r, &dconst1, 32); - x = const_double_from_real_value (TWO32r, DFmode); - fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); - - ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); - - x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, - 0, OPTAB_DIRECT); - if (x != target) - emit_move_insn (target, x); -} - -/* Convert an unsigned SImode value into a SFmode, using only SSE. - For x86_32, -mfpmath=sse, !optimize_size only. */ -void -ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE ONE16r; - rtx fp_hi, fp_lo, int_hi, int_lo, x; - - real_ldexp (&ONE16r, &dconst1, 16); - x = const_double_from_real_value (ONE16r, SFmode); - int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), - NULL, 0, OPTAB_DIRECT); - int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), - NULL, 0, OPTAB_DIRECT); - fp_hi = gen_reg_rtx (SFmode); - fp_lo = gen_reg_rtx (SFmode); - emit_insn (gen_floatsisf2 (fp_hi, int_hi)); - emit_insn (gen_floatsisf2 (fp_lo, int_lo)); - fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, - 0, OPTAB_DIRECT); - fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, - 0, OPTAB_DIRECT); - if (!rtx_equal_p (target, fp_hi)) - emit_move_insn (target, fp_hi); -} - -/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert - a vector of unsigned ints VAL to vector of floats TARGET. 
*/ - -void -ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) -{ - rtx tmp[8]; - REAL_VALUE_TYPE TWO16r; - machine_mode intmode = GET_MODE (val); - machine_mode fltmode = GET_MODE (target); - rtx (*cvt) (rtx, rtx); - - if (intmode == V4SImode) - cvt = gen_floatv4siv4sf2; - else - cvt = gen_floatv8siv8sf2; - tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff)); - tmp[0] = force_reg (intmode, tmp[0]); - tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1, - OPTAB_DIRECT); - tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16), - NULL_RTX, 1, OPTAB_DIRECT); - tmp[3] = gen_reg_rtx (fltmode); - emit_insn (cvt (tmp[3], tmp[1])); - tmp[4] = gen_reg_rtx (fltmode); - emit_insn (cvt (tmp[4], tmp[2])); - real_ldexp (&TWO16r, &dconst1, 16); - tmp[5] = const_double_from_real_value (TWO16r, SFmode); - tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); - tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, - OPTAB_DIRECT); - tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, - OPTAB_DIRECT); - if (tmp[7] != target) - emit_move_insn (target, tmp[7]); -} - -/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* - pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*. - This is done by doing just signed conversion if < 0x1p31, and otherwise by - subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */ - -rtx -ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp) -{ - REAL_VALUE_TYPE TWO31r; - rtx two31r, tmp[4]; - machine_mode mode = GET_MODE (val); - machine_mode scalarmode = GET_MODE_INNER (mode); - machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode; - rtx (*cmp) (rtx, rtx, rtx, rtx); - int i; - - for (i = 0; i < 3; i++) - tmp[i] = gen_reg_rtx (mode); - real_ldexp (&TWO31r, &dconst1, 31); - two31r = const_double_from_real_value (TWO31r, scalarmode); - two31r = ix86_build_const_vector (mode, 1, two31r); - two31r = force_reg (mode, two31r); - switch (mode) - { - case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break; - case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break; - case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break; - case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break; - default: gcc_unreachable (); - } - tmp[3] = gen_rtx_LE (mode, two31r, val); - emit_insn (cmp (tmp[0], two31r, val, tmp[3])); - tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1], - 0, OPTAB_DIRECT); - if (intmode == V4SImode || TARGET_AVX2) - *xorp = expand_simple_binop (intmode, ASHIFT, - gen_lowpart (intmode, tmp[0]), - GEN_INT (31), NULL_RTX, 0, - OPTAB_DIRECT); - else - { - rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31); - two31 = ix86_build_const_vector (intmode, 1, two31); - *xorp = expand_simple_binop (intmode, AND, - gen_lowpart (intmode, tmp[0]), - two31, NULL_RTX, 0, - OPTAB_DIRECT); - } - return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2], - 0, OPTAB_DIRECT); -} - -/* A subroutine of ix86_build_signbit_mask. If VECT is true, - then replicate the value for all elements of the vector - register. 
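Editor's note: ix86_vector_shift_count above handles the odd calling convention of the vector-by-scalar shift builtins: the count arrives as a whole 128-bit (or 64-bit) vector constant, and only its low 64 bits are the shift amount, recovered by encoding the constant to target bytes and reinterpreting them. A standalone sketch of that byte-level reinterpretation (the byte buffer stands in for the native_encode_expr output; x86's little-endian byte order is assumed):

#include <stdint.h>
#include <stddef.h>

/* Reinterpret the low 64 bits of an encoded vector constant as the
   shift count, the way the target-byte buffer is read above.  */
uint64_t
shift_count_from_bytes (const unsigned char *enc, size_t len)
{
  uint64_t count = 0;

  /* Low 64 bits only; bytes are in target (little-endian) order.  */
  for (size_t i = 0; i < 8 && i < len; i++)
    count |= (uint64_t) enc[i] << (8 * i);

  return count;
}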
*/ - -rtx -ix86_build_const_vector (machine_mode mode, bool vect, rtx value) +static bool +ix86_warn_func_return (tree decl) { - int i, n_elt; - rtvec v; - machine_mode scalar_mode; - - switch (mode) - { - case E_V64QImode: - case E_V32QImode: - case E_V16QImode: - case E_V32HImode: - case E_V16HImode: - case E_V8HImode: - case E_V16SImode: - case E_V8SImode: - case E_V4SImode: - case E_V8DImode: - case E_V4DImode: - case E_V2DImode: - gcc_assert (vect); - /* FALLTHRU */ - case E_V16SFmode: - case E_V8SFmode: - case E_V4SFmode: - case E_V8DFmode: - case E_V4DFmode: - case E_V2DFmode: - n_elt = GET_MODE_NUNITS (mode); - v = rtvec_alloc (n_elt); - scalar_mode = GET_MODE_INNER (mode); - - RTVEC_ELT (v, 0) = value; - - for (i = 1; i < n_elt; ++i) - RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode); - - return gen_rtx_CONST_VECTOR (mode, v); - - default: - gcc_unreachable (); - } + /* Naked functions are implemented entirely in assembly, including the + return sequence, so suppress warnings about this. */ + return !ix86_function_naked (decl); } - -/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders - and ix86_expand_int_vcond. Create a mask for the sign bit in MODE - for an SSE register. If VECT is true, then replicate the mask for - all elements of the vector register. If INVERT is true, then create - a mask excluding the sign bit. */ - -rtx -ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) + +/* Return the shift count of a vector by scalar shift builtin second argument + ARG1. */ +static tree +ix86_vector_shift_count (tree arg1) { - machine_mode vec_mode, imode; - wide_int w; - rtx mask, v; - - switch (mode) + if (tree_fits_uhwi_p (arg1)) + return arg1; + else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8) { - case E_V16SImode: - case E_V16SFmode: - case E_V8SImode: - case E_V4SImode: - case E_V8SFmode: - case E_V4SFmode: - vec_mode = mode; - imode = SImode; - break; - - case E_V8DImode: - case E_V4DImode: - case E_V2DImode: - case E_V8DFmode: - case E_V4DFmode: - case E_V2DFmode: - vec_mode = mode; - imode = DImode; - break; - - case E_TImode: - case E_TFmode: - vec_mode = VOIDmode; - imode = TImode; - break; - - default: - gcc_unreachable (); + /* The count argument is weird, passed in as various 128-bit + (or 64-bit) vectors, the low 64 bits from it are the count. */ + unsigned char buf[16]; + int len = native_encode_expr (arg1, buf, 16); + if (len == 0) + return NULL_TREE; + tree t = native_interpret_expr (uint64_type_node, buf, len); + if (t && tree_fits_uhwi_p (t)) + return t; } - - machine_mode inner_mode = GET_MODE_INNER (mode); - w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1, - GET_MODE_BITSIZE (inner_mode)); - if (invert) - w = wi::bit_not (w); - - /* Force this value into the low part of a fp vector constant. */ - mask = immed_wide_int_const (w, imode); - mask = gen_lowpart (inner_mode, mask); - - if (vec_mode == VOIDmode) - return force_reg (inner_mode, mask); - - v = ix86_build_const_vector (vec_mode, vect, mask); - return force_reg (vec_mode, v); + return NULL_TREE; } -/* Generate code for floating point ABS or NEG. 
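ix86_build_signbit_mask above exists because SSE has no floating-point negate or abs instruction; both are done by XORing or masking the sign bit with a constant vector. The scalar bit-level picture, with memcpy standing in for the register reinterpretation:

  #include <stdint.h>
  #include <string.h>

  /* Illustrative helpers showing what XORPS/ANDPS with the sign-bit mask do.  */
  static float
  negf_by_mask (float x)
  {
    uint32_t u;
    memcpy (&u, &x, sizeof u);
    u ^= 0x80000000u;               /* flip the sign bit: NEG */
    memcpy (&x, &u, sizeof u);
    return x;
  }

  static float
  absf_by_mask (float x)
  {
    uint32_t u;
    memcpy (&u, &x, sizeof u);
    u &= 0x7fffffffu;               /* clear the sign bit: ABS */
    memcpy (&x, &u, sizeof u);
    return x;
  }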
*/ - -void -ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) +static tree +ix86_fold_builtin (tree fndecl, int n_args, + tree *args, bool ignore ATTRIBUTE_UNUSED) { - rtx mask, set, dst, src; - bool use_sse = false; - bool vector_mode = VECTOR_MODE_P (mode); - machine_mode vmode = mode; - - if (vector_mode) - use_sse = true; - else if (mode == TFmode) - use_sse = true; - else if (TARGET_SSE_MATH) - { - use_sse = SSE_FLOAT_MODE_P (mode); - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - } - - /* NEG and ABS performed with SSE use bitwise mask operations. - Create the appropriate mask now. */ - if (use_sse) - mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); - else - mask = NULL_RTX; - - dst = operands[0]; - src = operands[1]; - - set = gen_rtx_fmt_e (code, mode, src); - set = gen_rtx_SET (dst, set); - - if (mask) + if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) { - rtx use, clob; - rtvec par; + enum ix86_builtins fn_code + = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); + enum rtx_code rcode; + bool is_vshift; + unsigned HOST_WIDE_INT mask; - use = gen_rtx_USE (VOIDmode, mask); - if (vector_mode) - par = gen_rtvec (2, set, use); - else + switch (fn_code) { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (3, set, use, clob); - } - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); - } - else - emit_insn (set); -} - -/* Expand a copysign operation. Special case operand 0 being a constant. */ - -void -ix86_expand_copysign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, op1, mask, nmask; - - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; - - mode = GET_MODE (dest); + case IX86_BUILTIN_CPU_IS: + case IX86_BUILTIN_CPU_SUPPORTS: + gcc_assert (n_args == 1); + return fold_builtin_cpu (fndecl, args); - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - else - vmode = mode; + case IX86_BUILTIN_NANQ: + case IX86_BUILTIN_NANSQ: + { + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + const char *str = c_getstr (*args); + int quiet = fn_code == IX86_BUILTIN_NANQ; + REAL_VALUE_TYPE real; - if (CONST_DOUBLE_P (op0)) - { - rtx (*copysign_insn)(rtx, rtx, rtx, rtx); + if (str && real_nan (&real, str, quiet, TYPE_MODE (type))) + return build_real (type, real); + return NULL_TREE; + } - if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) - op0 = simplify_unary_operation (ABS, mode, op0, mode); + case IX86_BUILTIN_INFQ: + case IX86_BUILTIN_HUGE_VALQ: + { + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + REAL_VALUE_TYPE inf; + real_inf (&inf); + return build_real (type, inf); + } - if (mode == SFmode || mode == DFmode) - { - if (op0 == CONST0_RTX (mode)) - op0 = CONST0_RTX (vmode); - else + case IX86_BUILTIN_TZCNT16: + case IX86_BUILTIN_CTZS: + case IX86_BUILTIN_TZCNT32: + case IX86_BUILTIN_TZCNT64: + gcc_assert (n_args == 1); + if (TREE_CODE (args[0]) == INTEGER_CST) { - rtx v = ix86_build_const_vector (vmode, false, op0); - - op0 = force_reg (vmode, v); + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + tree arg = args[0]; + if (fn_code == IX86_BUILTIN_TZCNT16 + || fn_code == IX86_BUILTIN_CTZS) + arg = fold_convert (short_unsigned_type_node, arg); + if (integer_zerop (arg)) + return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); + else + return fold_const_call (CFN_CTZ, type, arg); } - } - else if (op0 != CONST0_RTX (mode)) - op0 = force_reg (mode, op0); - - mask = ix86_build_signbit_mask (vmode, 0, 0); - - 
if (mode == SFmode) - copysign_insn = gen_copysignsf3_const; - else if (mode == DFmode) - copysign_insn = gen_copysigndf3_const; - else - copysign_insn = gen_copysigntf3_const; - - emit_insn (copysign_insn (dest, op0, op1, mask)); - } - else - { - rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx); - - nmask = ix86_build_signbit_mask (vmode, 0, 1); - mask = ix86_build_signbit_mask (vmode, 0, 0); - - if (mode == SFmode) - copysign_insn = gen_copysignsf3_var; - else if (mode == DFmode) - copysign_insn = gen_copysigndf3_var; - else - copysign_insn = gen_copysigntf3_var; - - emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask)); - } -} + break; -/* Deconstruct a copysign operation into bit masks. Operand 0 is known to - be a constant, and so has already been expanded into a vector constant. */ + case IX86_BUILTIN_LZCNT16: + case IX86_BUILTIN_CLZS: + case IX86_BUILTIN_LZCNT32: + case IX86_BUILTIN_LZCNT64: + gcc_assert (n_args == 1); + if (TREE_CODE (args[0]) == INTEGER_CST) + { + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + tree arg = args[0]; + if (fn_code == IX86_BUILTIN_LZCNT16 + || fn_code == IX86_BUILTIN_CLZS) + arg = fold_convert (short_unsigned_type_node, arg); + if (integer_zerop (arg)) + return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); + else + return fold_const_call (CFN_CLZ, type, arg); + } + break; -void -ix86_split_copysign_const (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - if (op0 != CONST0_RTX (vmode)) - { - x = gen_rtx_IOR (vmode, dest, op0); - emit_insn (gen_rtx_SET (dest, x)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, - so we have to do two masks. */ - -void -ix86_split_copysign_var (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, scratch, op0, op1, mask, nmask, x; - - dest = operands[0]; - scratch = operands[1]; - op0 = operands[2]; - op1 = operands[3]; - nmask = operands[4]; - mask = operands[5]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - if (rtx_equal_p (op0, op1)) - { - /* Shouldn't happen often (it's useless, obviously), but when it does - we'd generate incorrect code if we continue below. 
*/ - emit_move_insn (dest, op0); - return; - } - - if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ - { - gcc_assert (REGNO (op1) == REGNO (scratch)); - - x = gen_rtx_AND (vmode, scratch, mask); - emit_insn (gen_rtx_SET (scratch, x)); - - dest = mask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_NOT (vmode, dest); - x = gen_rtx_AND (vmode, x, op0); - emit_insn (gen_rtx_SET (dest, x)); - } - else - { - if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ - { - x = gen_rtx_AND (vmode, scratch, mask); - } - else /* alternative 2,4 */ - { - gcc_assert (REGNO (mask) == REGNO (scratch)); - op1 = lowpart_subreg (vmode, op1, mode); - x = gen_rtx_AND (vmode, scratch, op1); - } - emit_insn (gen_rtx_SET (scratch, x)); - - if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ - { - dest = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, nmask); - } - else /* alternative 3,4 */ - { - gcc_assert (REGNO (nmask) == REGNO (dest)); - dest = nmask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, op0); - } - emit_insn (gen_rtx_SET (dest, x)); - } - - x = gen_rtx_IOR (vmode, dest, scratch); - emit_insn (gen_rtx_SET (dest, x)); -} - -/* Expand an xorsign operation. */ - -void -ix86_expand_xorsign (rtx operands[]) -{ - rtx (*xorsign_insn)(rtx, rtx, rtx, rtx); - machine_mode mode, vmode; - rtx dest, op0, op1, mask; - - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; - - mode = GET_MODE (dest); - - if (mode == SFmode) - { - xorsign_insn = gen_xorsignsf3_1; - vmode = V4SFmode; - } - else if (mode == DFmode) - { - xorsign_insn = gen_xorsigndf3_1; - vmode = V2DFmode; - } - else - gcc_unreachable (); - - mask = ix86_build_signbit_mask (vmode, 0, 0); - - emit_insn (xorsign_insn (dest, op0, op1, mask)); -} - -/* Deconstruct an xorsign operation into bit masks. */ - -void -ix86_split_xorsign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_XOR (vmode, dest, op0); - emit_insn (gen_rtx_SET (dest, x)); -} - -/* Return TRUE or FALSE depending on whether the first SET in INSN - has source and destination with matching CC modes, and that the - CC mode is at least as constrained as REQ_MODE. 
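The copysign and xorsign splitters above reduce both operations to the same sign-bit mask: copysign keeps the first operand's magnitude and the second operand's sign, while xorsign merely flips the first operand's sign when the second is negative. A scalar sketch with illustrative names:

  #include <stdint.h>
  #include <string.h>

  #define SIGNBIT 0x80000000u

  static float
  copysignf_by_mask (float a, float b)
  {
    uint32_t ua, ub;
    memcpy (&ua, &a, sizeof ua);
    memcpy (&ub, &b, sizeof ub);
    ua = (ua & ~SIGNBIT) | (ub & SIGNBIT);   /* AND/ANDN + OR */
    memcpy (&a, &ua, sizeof ua);
    return a;
  }

  static float
  xorsignf_by_mask (float a, float b)
  {
    uint32_t ua, ub;
    memcpy (&ua, &a, sizeof ua);
    memcpy (&ub, &b, sizeof ub);
    ua ^= ub & SIGNBIT;                      /* AND + XOR */
    memcpy (&a, &ua, sizeof ua);
    return a;
  }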
*/ - -bool -ix86_match_ccmode (rtx insn, machine_mode req_mode) -{ - rtx set; - machine_mode set_mode; - - set = PATTERN (insn); - if (GET_CODE (set) == PARALLEL) - set = XVECEXP (set, 0, 0); - gcc_assert (GET_CODE (set) == SET); - gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE); - - set_mode = GET_MODE (SET_DEST (set)); - switch (set_mode) - { - case E_CCNOmode: - if (req_mode != CCNOmode - && (req_mode != CCmode - || XEXP (SET_SRC (set), 1) != const0_rtx)) - return false; - break; - case E_CCmode: - if (req_mode == CCGCmode) - return false; - /* FALLTHRU */ - case E_CCGCmode: - if (req_mode == CCGOCmode || req_mode == CCNOmode) - return false; - /* FALLTHRU */ - case E_CCGOCmode: - if (req_mode == CCZmode) - return false; - /* FALLTHRU */ - case E_CCZmode: - break; - - case E_CCGZmode: - - case E_CCAmode: - case E_CCCmode: - case E_CCOmode: - case E_CCPmode: - case E_CCSmode: - if (set_mode != req_mode) - return false; - break; - - default: - gcc_unreachable (); - } - - return GET_MODE (SET_SRC (set)) == set_mode; -} - -/* Generate insn patterns to do an integer compare of OPERANDS. */ - -static rtx -ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) -{ - machine_mode cmpmode; - rtx tmp, flags; - - cmpmode = SELECT_CC_MODE (code, op0, op1); - flags = gen_rtx_REG (cmpmode, FLAGS_REG); - - /* This is very simple, but making the interface the same as in the - FP case makes the rest of the code easier. */ - tmp = gen_rtx_COMPARE (cmpmode, op0, op1); - emit_insn (gen_rtx_SET (flags, tmp)); - - /* Return the test that should be put into the flags user, i.e. - the bcc, scc, or cmov instruction. */ - return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); -} + case IX86_BUILTIN_BEXTR32: + case IX86_BUILTIN_BEXTR64: + case IX86_BUILTIN_BEXTRI32: + case IX86_BUILTIN_BEXTRI64: + gcc_assert (n_args == 2); + if (tree_fits_uhwi_p (args[1])) + { + unsigned HOST_WIDE_INT res = 0; + unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0])); + unsigned int start = tree_to_uhwi (args[1]); + unsigned int len = (start & 0xff00) >> 8; + start &= 0xff; + if (start >= prec || len == 0) + res = 0; + else if (!tree_fits_uhwi_p (args[0])) + break; + else + res = tree_to_uhwi (args[0]) >> start; + if (len > prec) + len = prec; + if (len < HOST_BITS_PER_WIDE_INT) + res &= (HOST_WIDE_INT_1U << len) - 1; + return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; -/* Figure out whether to use unordered fp comparisons. 
*/ + case IX86_BUILTIN_BZHI32: + case IX86_BUILTIN_BZHI64: + gcc_assert (n_args == 2); + if (tree_fits_uhwi_p (args[1])) + { + unsigned int idx = tree_to_uhwi (args[1]) & 0xff; + if (idx >= TYPE_PRECISION (TREE_TYPE (args[0]))) + return args[0]; + if (idx == 0) + return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), 0); + if (!tree_fits_uhwi_p (args[0])) + break; + unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]); + res &= ~(HOST_WIDE_INT_M1U << idx); + return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; -static bool -ix86_unordered_fp_compare (enum rtx_code code) -{ - if (!TARGET_IEEE_FP) - return false; + case IX86_BUILTIN_PDEP32: + case IX86_BUILTIN_PDEP64: + gcc_assert (n_args == 2); + if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) + { + unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); + unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); + unsigned HOST_WIDE_INT res = 0; + unsigned HOST_WIDE_INT m, k = 1; + for (m = 1; m; m <<= 1) + if ((mask & m) != 0) + { + if ((src & k) != 0) + res |= m; + k <<= 1; + } + return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; - switch (code) - { - case GT: - case GE: - case LT: - case LE: - return false; + case IX86_BUILTIN_PEXT32: + case IX86_BUILTIN_PEXT64: + gcc_assert (n_args == 2); + if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) + { + unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); + unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); + unsigned HOST_WIDE_INT res = 0; + unsigned HOST_WIDE_INT m, k = 1; + for (m = 1; m; m <<= 1) + if ((mask & m) != 0) + { + if ((src & m) != 0) + res |= k; + k <<= 1; + } + return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; - case EQ: - case NE: + case IX86_BUILTIN_MOVMSKPS: + case IX86_BUILTIN_PMOVMSKB: + case IX86_BUILTIN_MOVMSKPD: + case IX86_BUILTIN_PMOVMSKB128: + case IX86_BUILTIN_MOVMSKPD256: + case IX86_BUILTIN_MOVMSKPS256: + case IX86_BUILTIN_PMOVMSKB256: + gcc_assert (n_args == 1); + if (TREE_CODE (args[0]) == VECTOR_CST) + { + HOST_WIDE_INT res = 0; + for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i) + { + tree e = VECTOR_CST_ELT (args[0], i); + if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e)) + { + if (wi::neg_p (wi::to_wide (e))) + res |= HOST_WIDE_INT_1 << i; + } + else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e)) + { + if (TREE_REAL_CST (e).sign) + res |= HOST_WIDE_INT_1 << i; + } + else + return NULL_TREE; + } + return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; - case LTGT: - case UNORDERED: - case ORDERED: - case UNLT: - case UNLE: - case UNGT: - case UNGE: - case UNEQ: - return true; - - default: - gcc_unreachable (); - } -} - -machine_mode -ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) -{ - machine_mode mode = GET_MODE (op0); - - if (SCALAR_FLOAT_MODE_P (mode)) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); - return CCFPmode; - } - - switch (code) - { - /* Only zero flag is needed. */ - case EQ: /* ZF=0 */ - case NE: /* ZF!=0 */ - return CCZmode; - /* Codes needing carry flag. */ - case GEU: /* CF=0 */ - case LTU: /* CF=1 */ - /* Detect overflow checks. They need just the carry flag. */ - if (GET_CODE (op0) == PLUS - && (rtx_equal_p (op1, XEXP (op0, 0)) - || rtx_equal_p (op1, XEXP (op0, 1)))) - return CCCmode; - else - return CCmode; - case GTU: /* CF=0 & ZF=0 */ - case LEU: /* CF=1 | ZF=1 */ - return CCmode; - /* Codes possibly doable only with sign flag when - comparing against zero. 
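The CCCmode case in ix86_cc_mode above fires when a GEU/LTU comparison tests a sum against one of its own addends; that is the source-level shape of an unsigned overflow check, and it needs nothing beyond the carry flag the addition already set. For example:

  /* Illustrative function: "s < a" is exactly the carry-out of a + b,
     which is why the compare can use CCCmode and fold into the add.  */
  static unsigned int
  add_saturating (unsigned int a, unsigned int b)
  {
    unsigned int s = a + b;
    return s < a ? ~0u : s;
  }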
*/ - case GE: /* SF=OF or SF=0 */ - case LT: /* SF<>OF or SF=1 */ - if (op1 == const0_rtx) - return CCGOCmode; - else - /* For other cases Carry flag is not required. */ - return CCGCmode; - /* Codes doable only with sign flag when comparing - against zero, but we miss jump instruction for it - so we need to use relational tests against overflow - that thus needs to be zero. */ - case GT: /* ZF=0 & SF=OF */ - case LE: /* ZF=1 | SF<>OF */ - if (op1 == const0_rtx) - return CCNOmode; - else - return CCGCmode; - /* strcmp pattern do (use flags) and combine may ask us for proper - mode. */ - case USE: - return CCmode; - default: - gcc_unreachable (); - } -} - -/* Return the fixed registers used for condition codes. */ - -static bool -ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) -{ - *p1 = FLAGS_REG; - *p2 = INVALID_REGNUM; - return true; -} - -/* If two condition code modes are compatible, return a condition code - mode which is compatible with both. Otherwise, return - VOIDmode. */ - -static machine_mode -ix86_cc_modes_compatible (machine_mode m1, machine_mode m2) -{ - if (m1 == m2) - return m1; - - if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC) - return VOIDmode; - - if ((m1 == CCGCmode && m2 == CCGOCmode) - || (m1 == CCGOCmode && m2 == CCGCmode)) - return CCGCmode; - - if ((m1 == CCNOmode && m2 == CCGOCmode) - || (m1 == CCGOCmode && m2 == CCNOmode)) - return CCNOmode; - - if (m1 == CCZmode - && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode)) - return m2; - else if (m2 == CCZmode - && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode)) - return m1; - - switch (m1) - { - default: - gcc_unreachable (); - - case E_CCmode: - case E_CCGCmode: - case E_CCGOCmode: - case E_CCNOmode: - case E_CCAmode: - case E_CCCmode: - case E_CCOmode: - case E_CCPmode: - case E_CCSmode: - case E_CCZmode: - switch (m2) - { - default: - return VOIDmode; - - case E_CCmode: - case E_CCGCmode: - case E_CCGOCmode: - case E_CCNOmode: - case E_CCAmode: - case E_CCCmode: - case E_CCOmode: - case E_CCPmode: - case E_CCSmode: - case E_CCZmode: - return CCmode; - } - - case E_CCFPmode: - /* These are only compatible with themselves, which we already - checked above. */ - return VOIDmode; - } -} - - -/* Return a comparison we can do and that it is equivalent to - swap_condition (code) apart possibly from orderedness. - But, never change orderedness if TARGET_IEEE_FP, returning - UNKNOWN in that case if necessary. */ - -static enum rtx_code -ix86_fp_swap_condition (enum rtx_code code) -{ - switch (code) - { - case GT: /* GTU - CF=0 & ZF=0 */ - return TARGET_IEEE_FP ? UNKNOWN : UNLT; - case GE: /* GEU - CF=0 */ - return TARGET_IEEE_FP ? UNKNOWN : UNLE; - case UNLT: /* LTU - CF=1 */ - return TARGET_IEEE_FP ? UNKNOWN : GT; - case UNLE: /* LEU - CF=1 | ZF=1 */ - return TARGET_IEEE_FP ? UNKNOWN : GE; - default: - return swap_condition (code); - } -} - -/* Return cost of comparison CODE using the best strategy for performance. - All following functions do use number of instructions as a cost metrics. - In future this should be tweaked to compute bytes for optimize_size and - take into account performance of various instructions on various CPUs. */ - -static int -ix86_fp_comparison_cost (enum rtx_code code) -{ - int arith_cost; - - /* The cost of code using bit-twiddling on %ah. 
*/ - switch (code) - { - case UNLE: - case UNLT: - case LTGT: - case GT: - case GE: - case UNORDERED: - case ORDERED: - case UNEQ: - arith_cost = 4; - break; - case LT: - case NE: - case EQ: - case UNGE: - arith_cost = TARGET_IEEE_FP ? 5 : 4; - break; - case LE: - case UNGT: - arith_cost = TARGET_IEEE_FP ? 6 : 4; - break; - default: - gcc_unreachable (); - } - - switch (ix86_fp_comparison_strategy (code)) - { - case IX86_FPCMP_COMI: - return arith_cost > 4 ? 3 : 2; - case IX86_FPCMP_SAHF: - return arith_cost > 4 ? 4 : 3; - default: - return arith_cost; - } -} - -/* Return strategy to use for floating-point. We assume that fcomi is always - preferrable where available, since that is also true when looking at size - (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */ - -enum ix86_fpcmp_strategy -ix86_fp_comparison_strategy (enum rtx_code) -{ - /* Do fcomi/sahf based test when profitable. */ - - if (TARGET_CMOVE) - return IX86_FPCMP_COMI; - - if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) - return IX86_FPCMP_SAHF; - - return IX86_FPCMP_ARITH; -} - -/* Swap, force into registers, or otherwise massage the two operands - to a fp comparison. The operands are updated in place; the new - comparison code is returned. */ - -static enum rtx_code -ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - rtx op0 = *pop0, op1 = *pop1; - machine_mode op_mode = GET_MODE (op0); - bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); - - /* All of the unordered compare instructions only work on registers. - The same is true of the fcomi compare instructions. The XFmode - compare instructions require registers except when comparing - against zero or when converting operand 1 from fixed point to - floating point. */ - - if (!is_sse - && (unordered_compare - || (op_mode == XFmode - && ! (standard_80387_constant_p (op0) == 1 - || standard_80387_constant_p (op1) == 1) - && GET_CODE (op1) != FLOAT) - || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) - { - op0 = force_reg (op_mode, op0); - op1 = force_reg (op_mode, op1); - } - else - { - /* %%% We only allow op1 in memory; op0 must be st(0). So swap - things around if they appear profitable, otherwise force op0 - into a register. */ - - if (standard_80387_constant_p (op0) == 0 - || (MEM_P (op0) - && ! (standard_80387_constant_p (op1) == 0 - || MEM_P (op1)))) - { - enum rtx_code new_code = ix86_fp_swap_condition (code); - if (new_code != UNKNOWN) - { - std::swap (op0, op1); - code = new_code; - } - } - - if (!REG_P (op0)) - op0 = force_reg (op_mode, op0); - - if (CONSTANT_P (op1)) - { - int tmp = standard_80387_constant_p (op1); - if (tmp == 0) - op1 = validize_mem (force_const_mem (op_mode, op1)); - else if (tmp == 1) - { - if (TARGET_CMOVE) - op1 = force_reg (op_mode, op1); - } - else - op1 = force_reg (op_mode, op1); - } - } - - /* Try to rearrange the comparison to make it cheaper. */ - if (ix86_fp_comparison_cost (code) - > ix86_fp_comparison_cost (swap_condition (code)) - && (REG_P (op1) || can_create_pseudo_p ())) - { - std::swap (op0, op1); - code = swap_condition (code); - if (!REG_P (op0)) - op0 = force_reg (op_mode, op0); - } - - *pop0 = op0; - *pop1 = op1; - return code; -} - -/* Convert comparison codes we use to represent FP comparison to integer - code that will result in proper branch. Return UNKNOWN if no such code - is available. 
*/ - -enum rtx_code -ix86_fp_compare_code_to_integer (enum rtx_code code) -{ - switch (code) - { - case GT: - return GTU; - case GE: - return GEU; - case ORDERED: - case UNORDERED: - return code; - case UNEQ: - return EQ; - case UNLT: - return LTU; - case UNLE: - return LEU; - case LTGT: - return NE; - default: - return UNKNOWN; - } -} - -/* Generate insn patterns to do a floating point compare of OPERANDS. */ - -static rtx -ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - machine_mode cmp_mode; - rtx tmp, scratch; - - code = ix86_prepare_fp_compare_args (code, &op0, &op1); - - tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); - if (unordered_compare) - tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); - - /* Do fcomi/sahf based test when profitable. */ - switch (ix86_fp_comparison_strategy (code)) - { - case IX86_FPCMP_COMI: - cmp_mode = CCFPmode; - emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); - break; - - case IX86_FPCMP_SAHF: - cmp_mode = CCFPmode; - tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); - scratch = gen_reg_rtx (HImode); - emit_insn (gen_rtx_SET (scratch, tmp)); - emit_insn (gen_x86_sahf_1 (scratch)); - break; - - case IX86_FPCMP_ARITH: - cmp_mode = CCNOmode; - tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); - scratch = gen_reg_rtx (HImode); - emit_insn (gen_rtx_SET (scratch, tmp)); - - /* In the unordered case, we have to check C2 for NaN's, which - doesn't happen to work out to anything nice combination-wise. - So do some bit twiddling on the value we've got in AH to come - up with an appropriate set of condition codes. */ - - switch (code) - { - case GT: - case UNGT: - if (code == GT || !TARGET_IEEE_FP) - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); - code = EQ; - } - else - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); - cmp_mode = CCmode; - code = GEU; - } - break; - case LT: - case UNLT: - if (code == LT && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); - cmp_mode = CCmode; - code = EQ; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx)); - code = NE; - } - break; - case GE: - case UNGE: - if (code == GE || !TARGET_IEEE_FP) - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05))); - code = EQ; - } - else - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx)); - code = NE; - } - break; - case LE: - case UNLE: - if (code == LE && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); - cmp_mode = CCmode; - code = LTU; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); - code = NE; - } - break; - case EQ: - case UNEQ: - if (code == EQ && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); - cmp_mode = CCmode; - code = EQ; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); - code = NE; - } - break; - case NE: - case LTGT: - if (code == NE && TARGET_IEEE_FP) - { - emit_insn 
(gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, - GEN_INT (0x40))); - code = NE; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); - code = EQ; - } - break; - - case UNORDERED: - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); - code = NE; - break; - case ORDERED: - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); - code = EQ; - break; - - default: - gcc_unreachable (); - } - break; - - default: - gcc_unreachable(); - } - - /* Return the test that should be put into the flags user, i.e. - the bcc, scc, or cmov instruction. */ - return gen_rtx_fmt_ee (code, VOIDmode, - gen_rtx_REG (cmp_mode, FLAGS_REG), - const0_rtx); -} - -static rtx -ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) -{ - rtx ret; - - if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) - ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); - - else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); - ret = ix86_expand_fp_compare (code, op0, op1); - } - else - ret = ix86_expand_int_compare (code, op0, op1); - - return ret; -} - -void -ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) -{ - machine_mode mode = GET_MODE (op0); - rtx tmp; - - /* Handle special case - vector comparsion with boolean result, transform - it using ptest instruction. */ - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); - machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode; - - gcc_assert (code == EQ || code == NE); - /* Generate XOR since we can't check that one operand is zero vector. */ - tmp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); - tmp = gen_lowpart (p_mode, tmp); - emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG), - gen_rtx_UNSPEC (CCmode, - gen_rtvec (2, tmp, tmp), - UNSPEC_PTEST))); - tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - return; - } - - switch (mode) - { - case E_SFmode: - case E_DFmode: - case E_XFmode: - case E_QImode: - case E_HImode: - case E_SImode: - simple: - tmp = ix86_expand_compare (code, op0, op1); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - return; - - case E_DImode: - if (TARGET_64BIT) - goto simple; - /* For 32-bit target DI comparison may be performed on - SSE registers. To allow this we should avoid split - to SI mode which is achieved by doing xor in DI mode - and then comparing with zero (which is recognized by - STV pass). We don't compare using xor when optimizing - for size. */ - if (!optimize_insn_for_size_p () - && TARGET_STV - && (code == EQ || code == NE)) - { - op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); - op1 = const0_rtx; - } - /* FALLTHRU */ - case E_TImode: - /* Expand DImode branch into multiple compare+branch. */ - { - rtx lo[2], hi[2]; - rtx_code_label *label2; - enum rtx_code code1, code2, code3; - machine_mode submode; - - if (CONSTANT_P (op0) && !CONSTANT_P (op1)) - { - std::swap (op0, op1); - code = swap_condition (code); - } - - split_double_mode (mode, &op0, 1, lo+0, hi+0); - split_double_mode (mode, &op1, 1, lo+1, hi+1); - - submode = mode == DImode ? 
SImode : DImode; - - /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to - avoid two branches. This costs one extra insn, so disable when - optimizing for size. */ - - if ((code == EQ || code == NE) - && (!optimize_insn_for_size_p () - || hi[1] == const0_rtx || lo[1] == const0_rtx)) - { - rtx xor0, xor1; - - xor1 = hi[0]; - if (hi[1] != const0_rtx) - xor1 = expand_binop (submode, xor_optab, xor1, hi[1], - NULL_RTX, 0, OPTAB_WIDEN); - - xor0 = lo[0]; - if (lo[1] != const0_rtx) - xor0 = expand_binop (submode, xor_optab, xor0, lo[1], - NULL_RTX, 0, OPTAB_WIDEN); - - tmp = expand_binop (submode, ior_optab, xor1, xor0, - NULL_RTX, 0, OPTAB_WIDEN); - - ix86_expand_branch (code, tmp, const0_rtx, label); - return; - } - - /* Otherwise, if we are doing less-than or greater-or-equal-than, - op1 is a constant and the low word is zero, then we can just - examine the high word. Similarly for low word -1 and - less-or-equal-than or greater-than. */ - - if (CONST_INT_P (hi[1])) - switch (code) - { - case LT: case LTU: case GE: case GEU: - if (lo[1] == const0_rtx) - { - ix86_expand_branch (code, hi[0], hi[1], label); - return; - } - break; - case LE: case LEU: case GT: case GTU: - if (lo[1] == constm1_rtx) - { - ix86_expand_branch (code, hi[0], hi[1], label); - return; - } - break; - default: - break; - } - - /* Emulate comparisons that do not depend on Zero flag with - double-word subtraction. Note that only Overflow, Sign - and Carry flags are valid, so swap arguments and condition - of comparisons that would otherwise test Zero flag. */ - - switch (code) - { - case LE: case LEU: case GT: case GTU: - std::swap (lo[0], lo[1]); - std::swap (hi[0], hi[1]); - code = swap_condition (code); - /* FALLTHRU */ - - case LT: case LTU: case GE: case GEU: - { - rtx (*cmp_insn) (rtx, rtx); - rtx (*sbb_insn) (rtx, rtx, rtx); - bool uns = (code == LTU || code == GEU); - - if (TARGET_64BIT) - { - cmp_insn = gen_cmpdi_1; - sbb_insn - = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz; - } - else - { - cmp_insn = gen_cmpsi_1; - sbb_insn - = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz; - } - - if (!nonimmediate_operand (lo[0], submode)) - lo[0] = force_reg (submode, lo[0]); - if (!x86_64_general_operand (lo[1], submode)) - lo[1] = force_reg (submode, lo[1]); - - if (!register_operand (hi[0], submode)) - hi[0] = force_reg (submode, hi[0]); - if ((uns && !nonimmediate_operand (hi[1], submode)) - || (!uns && !x86_64_general_operand (hi[1], submode))) - hi[1] = force_reg (submode, hi[1]); - - emit_insn (cmp_insn (lo[0], lo[1])); - emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1])); - - tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); - - ix86_expand_branch (code, tmp, const0_rtx, label); - return; - } - - default: - break; - } - - /* Otherwise, we need two or three jumps. 
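For 32-bit targets the DImode branch code above avoids a full double-word comparison where it can: equality folds both halves into a single test with XOR/OR, and the remaining orderings either use the sub/sbb sequence or fall back to comparing high then low words. The hi/lo decomposition, as a sketch over explicit halves:

  #include <stdint.h>

  /* Illustrative helpers over separate hi/lo words.  */
  static int
  eq64 (uint32_t lo0, uint32_t hi0, uint32_t lo1, uint32_t hi1)
  {
    return ((hi0 ^ hi1) | (lo0 ^ lo1)) == 0;   /* one test instead of two */
  }

  static int
  ltu64 (uint32_t lo0, uint32_t hi0, uint32_t lo1, uint32_t hi1)
  {
    if (hi0 != hi1)
      return hi0 < hi1;                        /* decided by the high words */
    return lo0 < lo1;                          /* tie: the low words decide */
  }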
*/ - - label2 = gen_label_rtx (); - - code1 = code; - code2 = swap_condition (code); - code3 = unsigned_condition (code); - - switch (code) - { - case LT: case GT: case LTU: case GTU: - break; - - case LE: code1 = LT; code2 = GT; break; - case GE: code1 = GT; code2 = LT; break; - case LEU: code1 = LTU; code2 = GTU; break; - case GEU: code1 = GTU; code2 = LTU; break; - - case EQ: code1 = UNKNOWN; code2 = NE; break; - case NE: code2 = UNKNOWN; break; - - default: - gcc_unreachable (); - } - - /* - * a < b => - * if (hi(a) < hi(b)) goto true; - * if (hi(a) > hi(b)) goto false; - * if (lo(a) < lo(b)) goto true; - * false: - */ - - if (code1 != UNKNOWN) - ix86_expand_branch (code1, hi[0], hi[1], label); - if (code2 != UNKNOWN) - ix86_expand_branch (code2, hi[0], hi[1], label2); - - ix86_expand_branch (code3, lo[0], lo[1], label); - - if (code2 != UNKNOWN) - emit_label (label2); - return; - } - - default: - gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); - goto simple; - } -} - -void -ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) -{ - rtx ret; - - gcc_assert (GET_MODE (dest) == QImode); - - ret = ix86_expand_compare (code, op0, op1); - PUT_MODE (ret, QImode); - emit_insn (gen_rtx_SET (dest, ret)); -} - -/* Expand comparison setting or clearing carry flag. Return true when - successful and set pop for the operation. */ -static bool -ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) -{ - machine_mode mode - = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); - - /* Do not handle double-mode compares that go through special path. */ - if (mode == (TARGET_64BIT ? TImode : DImode)) - return false; - - if (SCALAR_FLOAT_MODE_P (mode)) - { - rtx compare_op; - rtx_insn *compare_seq; - - gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); - - /* Shortcut: following common codes never translate - into carry flag compares. */ - if (code == EQ || code == NE || code == UNEQ || code == LTGT - || code == ORDERED || code == UNORDERED) - return false; - - /* These comparisons require zero flag; swap operands so they won't. */ - if ((code == GT || code == UNLE || code == LE || code == UNGT) - && !TARGET_IEEE_FP) - { - std::swap (op0, op1); - code = swap_condition (code); - } - - /* Try to expand the comparison and verify that we end up with - carry flag based comparison. This fails to be true only when - we decide to expand comparison using arithmetic that is not - too common scenario. */ - start_sequence (); - compare_op = ix86_expand_fp_compare (code, op0, op1); - compare_seq = get_insns (); - end_sequence (); - - if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) - code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); - else - code = GET_CODE (compare_op); - - if (code != LTU && code != GEU) - return false; - - emit_insn (compare_seq); - *pop = compare_op; - return true; - } - - if (!INTEGRAL_MODE_P (mode)) - return false; - - switch (code) - { - case LTU: - case GEU: - break; - - /* Convert a==0 into (unsigned)a<1. */ - case EQ: - case NE: - if (op1 != const0_rtx) - return false; - op1 = const1_rtx; - code = (code == EQ ? LTU : GEU); - break; - - /* Convert a>b into b=b-1. */ - case GTU: - case LEU: - if (CONST_INT_P (op1)) - { - op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); - /* Bail out on overflow. We still can swap operands but that - would force loading of the constant into register. */ - if (op1 == const0_rtx - || !x86_64_immediate_operand (op1, GET_MODE (op1))) - return false; - code = (code == GTU ? 
GEU : LTU); - } - else - { - std::swap (op0, op1); - code = (code == GTU ? LTU : GEU); - } - break; - - /* Convert a>=0 into (unsigned)a<0x80000000. */ - case LT: - case GE: - if (mode == DImode || op1 != const0_rtx) - return false; - op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); - code = (code == LT ? GEU : LTU); - break; - case LE: - case GT: - if (mode == DImode || op1 != constm1_rtx) - return false; - op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); - code = (code == LE ? GEU : LTU); - break; - - default: - return false; - } - /* Swapping operands may cause constant to appear as first operand. */ - if (!nonimmediate_operand (op0, VOIDmode)) - { - if (!can_create_pseudo_p ()) - return false; - op0 = force_reg (mode, op0); - } - *pop = ix86_expand_compare (code, op0, op1); - gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); - return true; -} - -bool -ix86_expand_int_movcc (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]), compare_code; - rtx_insn *compare_seq; - rtx compare_op; - machine_mode mode = GET_MODE (operands[0]); - bool sign_bit_compare_p = false; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (GET_MODE (op0) == TImode - || (GET_MODE (op0) == DImode - && !TARGET_64BIT)) - return false; - - start_sequence (); - compare_op = ix86_expand_compare (code, op0, op1); - compare_seq = get_insns (); - end_sequence (); - - compare_code = GET_CODE (compare_op); - - if ((op1 == const0_rtx && (code == GE || code == LT)) - || (op1 == constm1_rtx && (code == GT || code == LE))) - sign_bit_compare_p = true; - - /* Don't attempt mode expansion here -- if we had to expand 5 or 6 - HImode insns, we'd be swallowed in word prefix ops. */ - - if ((mode != HImode || TARGET_FAST_PREFIX) - && (mode != (TARGET_64BIT ? TImode : DImode)) - && CONST_INT_P (operands[2]) - && CONST_INT_P (operands[3])) - { - rtx out = operands[0]; - HOST_WIDE_INT ct = INTVAL (operands[2]); - HOST_WIDE_INT cf = INTVAL (operands[3]); - HOST_WIDE_INT diff; - - diff = ct - cf; - /* Sign bit compares are better done using shifts than we do by using - sbb. */ - if (sign_bit_compare_p - || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) - { - /* Detect overlap between destination and compare sources. */ - rtx tmp = out; - - if (!sign_bit_compare_p) - { - rtx flags; - bool fpcmp = false; - - compare_code = GET_CODE (compare_op); - - flags = XEXP (compare_op, 0); - - if (GET_MODE (flags) == CCFPmode) - { - fpcmp = true; - compare_code - = ix86_fp_compare_code_to_integer (compare_code); - } - - /* To simplify rest of code, restrict to the GEU case. 
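ix86_expand_carry_flag_compare above rewrites several integer comparisons into forms whose result lands in the carry flag, which later feeds the sbb-based sequences. Written as plain C identities (illustrative only):

  /* The rewrites, as equivalent expressions:
       a == 0      ->  (unsigned) a < 1
       a >= 0      ->  (unsigned) a < 0x80000000
       a >  CONST  ->  a >= CONST + 1   (unsigned, when it does not wrap)  */
  static int is_zero   (unsigned int a) { return a < 1u; }
  static int is_nonneg (int a)          { return (unsigned int) a < 0x80000000u; }
  static int gt_42     (unsigned int a) { return a >= 43u; }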
*/ - if (compare_code == LTU) - { - std::swap (ct, cf); - compare_code = reverse_condition (compare_code); - code = reverse_condition (code); - } - else - { - if (fpcmp) - PUT_CODE (compare_op, - reverse_condition_maybe_unordered - (GET_CODE (compare_op))); - else - PUT_CODE (compare_op, - reverse_condition (GET_CODE (compare_op))); - } - diff = ct - cf; - - if (reg_overlap_mentioned_p (out, op0) - || reg_overlap_mentioned_p (out, op1)) - tmp = gen_reg_rtx (mode); - - if (mode == DImode) - emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); - else - emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), - flags, compare_op)); - } - else - { - if (code == GT || code == GE) - code = reverse_condition (code); - else - { - std::swap (ct, cf); - diff = ct - cf; - } - tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1); - } - - if (diff == 1) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * [addl dest, ct] - * - * Size 5 - 8. - */ - if (ct) - tmp = expand_simple_binop (mode, PLUS, - tmp, GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else if (cf == -1) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * orl $ct, dest - * - * Size 8. - */ - tmp = expand_simple_binop (mode, IOR, - tmp, GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else if (diff == -1 && ct) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * notl dest - * [addl dest, cf] - * - * Size 8 - 11. - */ - tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); - if (cf) - tmp = expand_simple_binop (mode, PLUS, - copy_rtx (tmp), GEN_INT (cf), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * [notl dest] - * andl cf - ct, dest - * [addl dest, ct] - * - * Size 8 - 11. - */ - - if (cf == 0) - { - cf = ct; - ct = 0; - tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); - } - - tmp = expand_simple_binop (mode, AND, - copy_rtx (tmp), - gen_int_mode (cf - ct, mode), - copy_rtx (tmp), 1, OPTAB_DIRECT); - if (ct) - tmp = expand_simple_binop (mode, PLUS, - copy_rtx (tmp), GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - - if (!rtx_equal_p (tmp, out)) - emit_move_insn (copy_rtx (out), copy_rtx (tmp)); - - return true; - } - - if (diff < 0) - { - machine_mode cmp_mode = GET_MODE (op0); - enum rtx_code new_code; - - if (SCALAR_FLOAT_MODE_P (cmp_mode)) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); - - /* We may be reversing unordered compare to normal compare, that - is not valid in general (we may convert non-trapping condition - to trapping one), however on i386 we currently emit all - comparisons unordered. */ - new_code = reverse_condition_maybe_unordered (code); - } - else - new_code = ix86_reverse_condition (code, cmp_mode); - if (new_code != UNKNOWN) - { - std::swap (ct, cf); - diff = -diff; - code = new_code; - } - } - - compare_code = UNKNOWN; - if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT - && CONST_INT_P (op1)) - { - if (op1 == const0_rtx - && (code == LT || code == GE)) - compare_code = code; - else if (op1 == constm1_rtx) - { - if (code == LE) - compare_code = LT; - else if (code == GT) - compare_code = GE; - } - } - - /* Optimize dest = (op0 < 0) ? -1 : cf. */ - if (compare_code != UNKNOWN - && GET_MODE (op0) == GET_MODE (out) - && (cf == -1 || ct == -1)) - { - /* If lea code below could be used, only optimize - if it results in a 2 insn sequence. */ - - if (! 
(diff == 1 || diff == 2 || diff == 4 || diff == 8 - || diff == 3 || diff == 5 || diff == 9) - || (compare_code == LT && ct == -1) - || (compare_code == GE && cf == -1)) - { - /* - * notl op1 (if necessary) - * sarl $31, op1 - * orl cf, op1 - */ - if (ct != -1) - { - cf = ct; - ct = -1; - code = reverse_condition (code); - } - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); - - out = expand_simple_binop (mode, IOR, - out, GEN_INT (cf), - out, 1, OPTAB_DIRECT); - if (out != operands[0]) - emit_move_insn (operands[0], out); - - return true; - } - } - - - if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 - || diff == 3 || diff == 5 || diff == 9) - && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) - && (mode != DImode - || x86_64_immediate_operand (GEN_INT (cf), VOIDmode))) - { - /* - * xorl dest,dest - * cmpl op1,op2 - * setcc dest - * lea cf(dest*(ct-cf)),dest - * - * Size 14. - * - * This also catches the degenerate setcc-only case. - */ - - rtx tmp; - int nops; - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); - - nops = 0; - /* On x86_64 the lea instruction operates on Pmode, so we need - to get arithmetics done in proper mode to match. */ - if (diff == 1) - tmp = copy_rtx (out); - else - { - rtx out1; - out1 = copy_rtx (out); - tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1)); - nops++; - if (diff & 1) - { - tmp = gen_rtx_PLUS (mode, tmp, out1); - nops++; - } - } - if (cf != 0) - { - tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); - nops++; - } - if (!rtx_equal_p (tmp, out)) - { - if (nops == 1) - out = force_operand (tmp, copy_rtx (out)); - else - emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp))); - } - if (!rtx_equal_p (out, operands[0])) - emit_move_insn (operands[0], copy_rtx (out)); - - return true; - } - - /* - * General case: Jumpful: - * xorl dest,dest cmpl op1, op2 - * cmpl op1, op2 movl ct, dest - * setcc dest jcc 1f - * decl dest movl cf, dest - * andl (cf-ct),dest 1: - * addl ct,dest - * - * Size 20. Size 14. - * - * This is reasonably steep, but branch mispredict costs are - * high on modern cpus, so consider failing only if optimizing - * for space. - */ - - if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) - && BRANCH_COST (optimize_insn_for_speed_p (), - false) >= 2) - { - if (cf == 0) - { - machine_mode cmp_mode = GET_MODE (op0); - enum rtx_code new_code; - - if (SCALAR_FLOAT_MODE_P (cmp_mode)) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); - - /* We may be reversing unordered compare to normal compare, - that is not valid in general (we may convert non-trapping - condition to trapping one), however on i386 we currently - emit all comparisons unordered. */ - new_code = reverse_condition_maybe_unordered (code); - } - else - { - new_code = ix86_reverse_condition (code, cmp_mode); - if (compare_code != UNKNOWN && new_code != UNKNOWN) - compare_code = reverse_condition (compare_code); - } - - if (new_code != UNKNOWN) - { - cf = ct; - ct = 0; - code = new_code; - } - } - - if (compare_code != UNKNOWN) - { - /* notl op1 (if needed) - sarl $31, op1 - andl (cf-ct), op1 - addl ct, op1 - - For x < 0 (resp. x <= -1) there will be no notl, - so if possible swap the constants to get rid of the - complement. - True/false will be -1/0 while code below (store flag - followed by decrement) is 0/-1, so the constants need - to be exchanged once more. 
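The constant/constant arm of ix86_expand_int_movcc above replaces the branch with a 0 / all-ones mask (what sbb materializes from the carry flag) followed by AND and ADD, or by OR/NOT for the special constants. The general shape, ignoring the signed-overflow pedantry a real compiler has to respect:

  /* Illustrative branch-free select in the sbb/and/add shape.  */
  static int
  select_const (int cond, int ct, int cf)
  {
    int mask = -(cond != 0);           /* 0 when false, -1 when true */
    return cf + (mask & (ct - cf));    /* ct when true, cf when false */
  }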
*/ - - if (compare_code == GE || !cf) - { - code = reverse_condition (code); - compare_code = LT; - } - else - std::swap (ct, cf); - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); - } - else - { - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); - - out = expand_simple_binop (mode, PLUS, copy_rtx (out), - constm1_rtx, - copy_rtx (out), 1, OPTAB_DIRECT); - } - - out = expand_simple_binop (mode, AND, copy_rtx (out), - gen_int_mode (cf - ct, mode), - copy_rtx (out), 1, OPTAB_DIRECT); - if (ct) - out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), - copy_rtx (out), 1, OPTAB_DIRECT); - if (!rtx_equal_p (out, operands[0])) - emit_move_insn (operands[0], copy_rtx (out)); - - return true; - } - } - - if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) - { - /* Try a few things more with specific constants and a variable. */ - - optab op; - rtx var, orig_out, out, tmp; - - if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) - return false; - - /* If one of the two operands is an interesting constant, load a - constant with the above and mask it in with a logical operation. */ - - if (CONST_INT_P (operands[2])) - { - var = operands[3]; - if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) - operands[3] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) - operands[3] = const0_rtx, op = ior_optab; - else - return false; - } - else if (CONST_INT_P (operands[3])) - { - var = operands[2]; - if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) - operands[2] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) - operands[2] = const0_rtx, op = ior_optab; - else - return false; - } - else - return false; - - orig_out = operands[0]; - tmp = gen_reg_rtx (mode); - operands[0] = tmp; - - /* Recurse to get the constant loaded. */ - if (!ix86_expand_int_movcc (operands)) - return false; - - /* Mask in the interesting variable. */ - out = expand_binop (mode, op, var, tmp, orig_out, 0, - OPTAB_WIDEN); - if (!rtx_equal_p (out, orig_out)) - emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); - - return true; - } - - /* - * For comparison with above, - * - * movl cf,dest - * movl ct,tmp - * cmpl op1,op2 - * cmovcc tmp,dest - * - * Size 15. - */ - - if (! nonimmediate_operand (operands[2], mode)) - operands[2] = force_reg (mode, operands[2]); - if (! nonimmediate_operand (operands[3], mode)) - operands[3] = force_reg (mode, operands[3]); - - if (! register_operand (operands[2], VOIDmode) - && (mode == QImode - || ! register_operand (operands[3], VOIDmode))) - operands[2] = force_reg (mode, operands[2]); - - if (mode == QImode - && ! register_operand (operands[3], VOIDmode)) - operands[3] = force_reg (mode, operands[3]); - - emit_insn (compare_seq); - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_IF_THEN_ELSE (mode, - compare_op, operands[2], - operands[3]))); - return true; -} - -/* Swap, force into registers, or otherwise massage the two operands - to an sse comparison with a mask result. Thus we differ a bit from - ix86_prepare_fp_compare_args which expects to produce a flags result. - - The DEST operand exists to help determine whether to commute commutative - operators. The POP0/POP1 operands are updated in place. The new - comparison code is returned, or UNKNOWN if not implementable. 
*/ - -static enum rtx_code -ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, - rtx *pop0, rtx *pop1) -{ - switch (code) - { - case LTGT: - case UNEQ: - /* AVX supports all the needed comparisons. */ - if (TARGET_AVX) - break; - /* We have no LTGT as an operator. We could implement it with - NE & ORDERED, but this requires an extra temporary. It's - not clear that it's worth it. */ - return UNKNOWN; - - case LT: - case LE: - case UNGT: - case UNGE: - /* These are supported directly. */ - break; - - case EQ: - case NE: - case UNORDERED: - case ORDERED: - /* AVX has 3 operand comparisons, no need to swap anything. */ - if (TARGET_AVX) - break; - /* For commutative operators, try to canonicalize the destination - operand to be first in the comparison - this helps reload to - avoid extra moves. */ - if (!dest || !rtx_equal_p (dest, *pop1)) - break; - /* FALLTHRU */ - - case GE: - case GT: - case UNLE: - case UNLT: - /* These are not supported directly before AVX, and furthermore - ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the - comparison operands to transform into something that is - supported. */ - std::swap (*pop0, *pop1); - code = swap_condition (code); - break; - - default: - gcc_unreachable (); - } - - return code; -} - -/* Detect conditional moves that exactly match min/max operational - semantics. Note that this is IEEE safe, as long as we don't - interchange the operands. - - Returns FALSE if this conditional move doesn't match a MIN/MAX, - and TRUE if the operation is successful and instructions are emitted. */ - -static bool -ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, - rtx cmp_op1, rtx if_true, rtx if_false) -{ - machine_mode mode; - bool is_min; - rtx tmp; - - if (code == LT) - ; - else if (code == UNGE) - std::swap (if_true, if_false); - else - return false; - - if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false)) - is_min = true; - else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false)) - is_min = false; - else - return false; - - mode = GET_MODE (dest); - - /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here, - but MODE may be a vector mode and thus not appropriate. */ - if (!flag_finite_math_only || flag_signed_zeros) - { - int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX; - rtvec v; - - if_true = force_reg (mode, if_true); - v = gen_rtvec (2, if_true, if_false); - tmp = gen_rtx_UNSPEC (mode, v, u); - } - else - { - code = is_min ? SMIN : SMAX; - if (MEM_P (if_true) && MEM_P (if_false)) - if_true = force_reg (mode, if_true); - tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false); - } - - emit_insn (gen_rtx_SET (dest, tmp)); - return true; -} - -/* Expand an SSE comparison. Return the register with the result. */ - -static rtx -ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, - rtx op_true, rtx op_false) -{ - machine_mode mode = GET_MODE (dest); - machine_mode cmp_ops_mode = GET_MODE (cmp_op0); - - /* In general case result of comparison can differ from operands' type. */ - machine_mode cmp_mode; - - /* In AVX512F the result of comparison is an integer mask. */ - bool maskcmp = false; - rtx x; - - if (GET_MODE_SIZE (cmp_ops_mode) == 64) - { - unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); - cmp_mode = int_mode_for_size (nbits, 0).require (); - maskcmp = true; - } - else - cmp_mode = cmp_ops_mode; - - cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); - - int (*op1_predicate)(rtx, machine_mode) - = VECTOR_MODE_P (cmp_ops_mode) ? 
vector_operand : nonimmediate_operand; - - if (!op1_predicate (cmp_op1, cmp_ops_mode)) - cmp_op1 = force_reg (cmp_ops_mode, cmp_op1); - - if (optimize - || (maskcmp && cmp_mode != mode) - || (op_true && reg_overlap_mentioned_p (dest, op_true)) - || (op_false && reg_overlap_mentioned_p (dest, op_false))) - dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); - - /* Compare patterns for int modes are unspec in AVX512F only. */ - if (maskcmp && (code == GT || code == EQ)) - { - rtx (*gen)(rtx, rtx, rtx); - - switch (cmp_ops_mode) - { - case E_V64QImode: - gcc_assert (TARGET_AVX512BW); - gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1; - break; - case E_V32HImode: - gcc_assert (TARGET_AVX512BW); - gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1; - break; - case E_V16SImode: - gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1; - break; - case E_V8DImode: - gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1; - break; - default: - gen = NULL; - } - - if (gen) - { - emit_insn (gen (dest, cmp_op0, cmp_op1)); - return dest; - } - } - x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); - - if (cmp_mode != mode && !maskcmp) - { - x = force_reg (cmp_ops_mode, x); - convert_move (dest, x, false); - } - else - emit_insn (gen_rtx_SET (dest, x)); - - return dest; -} - -/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical - operations. This is used for both scalar and vector conditional moves. */ - -void -ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) -{ - machine_mode mode = GET_MODE (dest); - machine_mode cmpmode = GET_MODE (cmp); - - /* In AVX512F the result of comparison is an integer mask. */ - bool maskcmp = (mode != cmpmode && TARGET_AVX512F); - - rtx t2, t3, x; - - /* If we have an integer mask and FP value then we need - to cast mask to FP mode. 
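ix86_expand_sse_fp_minmax a few chunks above only accepts the LT/UNGE shapes because MINSS/MINPS and friends are not commutative once NaNs and signed zeros are involved: the "false" arm of the conditional is what the instruction returns in the tricky cases, so only that exact operand order may be rewritten. The matched source shape is simply:

  /* The conditional that is safe to turn into a hardware min: on NaN the
     comparison is false and b is returned, matching the instruction.  */
  static float
  min_shape (float a, float b)
  {
    return a < b ? a : b;
  }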
*/ - if (mode != cmpmode && VECTOR_MODE_P (cmpmode)) - { - cmp = force_reg (cmpmode, cmp); - cmp = gen_rtx_SUBREG (mode, cmp, 0); - } - - if (maskcmp) - { - rtx (*gen) (rtx, rtx) = NULL; - if ((op_true == CONST0_RTX (mode) - && vector_all_ones_operand (op_false, mode)) - || (op_false == CONST0_RTX (mode) - && vector_all_ones_operand (op_true, mode))) - switch (mode) - { - case E_V64QImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_cvtmask2bv64qi; - break; - case E_V32QImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_cvtmask2bv32qi; - break; - case E_V16QImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_cvtmask2bv16qi; - break; - case E_V32HImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_cvtmask2wv32hi; - break; - case E_V16HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_cvtmask2wv16hi; - break; - case E_V8HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_cvtmask2wv8hi; - break; - case E_V16SImode: - if (TARGET_AVX512DQ) - gen = gen_avx512f_cvtmask2dv16si; - break; - case E_V8SImode: - if (TARGET_AVX512VL && TARGET_AVX512DQ) - gen = gen_avx512vl_cvtmask2dv8si; - break; - case E_V4SImode: - if (TARGET_AVX512VL && TARGET_AVX512DQ) - gen = gen_avx512vl_cvtmask2dv4si; - break; - case E_V8DImode: - if (TARGET_AVX512DQ) - gen = gen_avx512f_cvtmask2qv8di; - break; - case E_V4DImode: - if (TARGET_AVX512VL && TARGET_AVX512DQ) - gen = gen_avx512vl_cvtmask2qv4di; - break; - case E_V2DImode: - if (TARGET_AVX512VL && TARGET_AVX512DQ) - gen = gen_avx512vl_cvtmask2qv2di; - break; - default: - break; - } - if (gen && SCALAR_INT_MODE_P (cmpmode)) - { - cmp = force_reg (cmpmode, cmp); - if (op_true == CONST0_RTX (mode)) - { - rtx (*gen_not) (rtx, rtx); - switch (cmpmode) - { - case E_QImode: gen_not = gen_knotqi; break; - case E_HImode: gen_not = gen_knothi; break; - case E_SImode: gen_not = gen_knotsi; break; - case E_DImode: gen_not = gen_knotdi; break; - default: gcc_unreachable (); - } - rtx n = gen_reg_rtx (cmpmode); - emit_insn (gen_not (n, cmp)); - cmp = n; - } - emit_insn (gen (dest, cmp)); - return; - } - } - else if (vector_all_ones_operand (op_true, mode) - && op_false == CONST0_RTX (mode)) - { - emit_insn (gen_rtx_SET (dest, cmp)); - return; - } - else if (op_false == CONST0_RTX (mode)) - { - op_true = force_reg (mode, op_true); - x = gen_rtx_AND (mode, cmp, op_true); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (op_true == CONST0_RTX (mode)) - { - op_false = force_reg (mode, op_false); - x = gen_rtx_NOT (mode, cmp); - x = gen_rtx_AND (mode, x, op_false); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) - { - op_false = force_reg (mode, op_false); - x = gen_rtx_IOR (mode, cmp, op_false); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (TARGET_XOP) - { - op_true = force_reg (mode, op_true); - - if (!nonimmediate_operand (op_false, mode)) - op_false = force_reg (mode, op_false); - - emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, - op_true, - op_false))); - return; - } - - rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; - rtx d = dest; - - if (!vector_operand (op_true, mode)) - op_true = force_reg (mode, op_true); - - op_false = force_reg (mode, op_false); - - switch (mode) - { - case E_V4SFmode: - if (TARGET_SSE4_1) - gen = gen_sse4_1_blendvps; - break; - case E_V2DFmode: - if (TARGET_SSE4_1) - gen = gen_sse4_1_blendvpd; - break; - case E_SFmode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_blendvss; - 
op_true = force_reg (mode, op_true); - } - break; - case E_DFmode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_blendvsd; - op_true = force_reg (mode, op_true); - } - break; - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_pblendvb; - if (mode != V16QImode) - d = gen_reg_rtx (V16QImode); - op_false = gen_lowpart (V16QImode, op_false); - op_true = gen_lowpart (V16QImode, op_true); - cmp = gen_lowpart (V16QImode, cmp); - } - break; - case E_V8SFmode: - if (TARGET_AVX) - gen = gen_avx_blendvps256; - break; - case E_V4DFmode: - if (TARGET_AVX) - gen = gen_avx_blendvpd256; - break; - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - if (TARGET_AVX2) - { - gen = gen_avx2_pblendvb; - if (mode != V32QImode) - d = gen_reg_rtx (V32QImode); - op_false = gen_lowpart (V32QImode, op_false); - op_true = gen_lowpart (V32QImode, op_true); - cmp = gen_lowpart (V32QImode, cmp); - } - break; - - case E_V64QImode: - gen = gen_avx512bw_blendmv64qi; - break; - case E_V32HImode: - gen = gen_avx512bw_blendmv32hi; - break; - case E_V16SImode: - gen = gen_avx512f_blendmv16si; - break; - case E_V8DImode: - gen = gen_avx512f_blendmv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_blendmv8df; - break; - case E_V16SFmode: - gen = gen_avx512f_blendmv16sf; - break; - - default: - break; - } - - if (gen != NULL) - { - emit_insn (gen (d, op_false, op_true, cmp)); - if (d != dest) - emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); - } - else - { - op_true = force_reg (mode, op_true); - - t2 = gen_reg_rtx (mode); - if (optimize) - t3 = gen_reg_rtx (mode); - else - t3 = dest; - - x = gen_rtx_AND (mode, op_true, cmp); - emit_insn (gen_rtx_SET (t2, x)); - - x = gen_rtx_NOT (mode, cmp); - x = gen_rtx_AND (mode, x, op_false); - emit_insn (gen_rtx_SET (t3, x)); - - x = gen_rtx_IOR (mode, t3, t2); - emit_insn (gen_rtx_SET (dest, x)); - } -} - -/* Expand a floating-point conditional move. Return true if successful. */ - -bool -ix86_expand_fp_movcc (rtx operands[]) -{ - machine_mode mode = GET_MODE (operands[0]); - enum rtx_code code = GET_CODE (operands[1]); - rtx tmp, compare_op; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) - { - machine_mode cmode; - - /* Since we've no cmove for sse registers, don't force bad register - allocation just to gain access to it. Deny movcc when the - comparison mode doesn't match the move mode. */ - cmode = GET_MODE (op0); - if (cmode == VOIDmode) - cmode = GET_MODE (op1); - if (cmode != mode) - return false; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); - if (code == UNKNOWN) - return false; - - if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, - operands[2], operands[3])) - return true; - - tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, - operands[2], operands[3]); - ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); - return true; - } - - if (GET_MODE (op0) == TImode - || (GET_MODE (op0) == DImode - && !TARGET_64BIT)) - return false; - - /* The floating point conditional move instructions don't directly - support conditions resulting from a signed integer comparison. 
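- In that case the condition is first materialized into a QImode register with setcc, and the fcmov then tests that register for being non-zero.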
*/ - - compare_op = ix86_expand_compare (code, op0, op1); - if (!fcmov_comparison_operator (compare_op, VOIDmode)) - { - tmp = gen_reg_rtx (QImode); - ix86_expand_setcc (tmp, code, op0, op1); - - compare_op = ix86_expand_compare (NE, tmp, const0_rtx); - } - - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_IF_THEN_ELSE (mode, compare_op, - operands[2], operands[3]))); - - return true; -} - -/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */ - -static int -ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code) -{ - switch (code) - { - case EQ: - return 0; - case LT: - case LTU: - return 1; - case LE: - case LEU: - return 2; - case NE: - return 4; - case GE: - case GEU: - return 5; - case GT: - case GTU: - return 6; - default: - gcc_unreachable (); - } -} - -/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */ - -static int -ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code) -{ - switch (code) - { - case EQ: - return 0x00; - case NE: - return 0x04; - case GT: - return 0x0e; - case LE: - return 0x02; - case GE: - return 0x0d; - case LT: - return 0x01; - case UNLE: - return 0x0a; - case UNLT: - return 0x09; - case UNGE: - return 0x05; - case UNGT: - return 0x06; - case UNEQ: - return 0x18; - case LTGT: - return 0x0c; - case ORDERED: - return 0x07; - case UNORDERED: - return 0x03; - default: - gcc_unreachable (); - } -} - -/* Return immediate value to be used in UNSPEC_PCMP - for comparison CODE in MODE. */ - -static int -ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) -{ - if (FLOAT_MODE_P (mode)) - return ix86_fp_cmp_code_to_pcmp_immediate (code); - return ix86_int_cmp_code_to_pcmp_immediate (code); -} - -/* Expand AVX-512 vector comparison. */ - -bool -ix86_expand_mask_vec_cmp (rtx operands[]) -{ - machine_mode mask_mode = GET_MODE (operands[0]); - machine_mode cmp_mode = GET_MODE (operands[2]); - enum rtx_code code = GET_CODE (operands[1]); - rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); - int unspec_code; - rtx unspec; - - switch (code) - { - case LEU: - case GTU: - case GEU: - case LTU: - unspec_code = UNSPEC_UNSIGNED_PCMP; - break; - - default: - unspec_code = UNSPEC_PCMP; - } - - unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2], - operands[3], imm), - unspec_code); - emit_insn (gen_rtx_SET (operands[0], unspec)); - - return true; -} - -/* Expand fp vector comparison. 
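- LTGT and UNEQ have no single SSE compare and are built from two compares (ORDERED and NE, or UNORDERED and EQ) whose results are combined with AND or IOR.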
*/ - -bool -ix86_expand_fp_vec_cmp (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]); - rtx cmp; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, - &operands[2], &operands[3]); - if (code == UNKNOWN) - { - rtx temp; - switch (GET_CODE (operands[1])) - { - case LTGT: - temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2], - operands[3], NULL, NULL); - cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2], - operands[3], NULL, NULL); - code = AND; - break; - case UNEQ: - temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2], - operands[3], NULL, NULL); - cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2], - operands[3], NULL, NULL); - code = IOR; - break; - default: - gcc_unreachable (); - } - cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, - OPTAB_DIRECT); - } - else - cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], - operands[1], operands[2]); - - if (operands[0] != cmp) - emit_move_insn (operands[0], cmp); - - return true; -} - -static rtx -ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, - rtx op_true, rtx op_false, bool *negate) -{ - machine_mode data_mode = GET_MODE (dest); - machine_mode mode = GET_MODE (cop0); - rtx x; - - *negate = false; - - /* XOP supports all of the comparisons on all 128-bit vector int types. */ - if (TARGET_XOP - && (mode == V16QImode || mode == V8HImode - || mode == V4SImode || mode == V2DImode)) - ; - else - { - /* Canonicalize the comparison to EQ, GT, GTU. */ - switch (code) - { - case EQ: - case GT: - case GTU: - break; - - case NE: - case LE: - case LEU: - code = reverse_condition (code); - *negate = true; - break; - - case GE: - case GEU: - code = reverse_condition (code); - *negate = true; - /* FALLTHRU */ - - case LT: - case LTU: - std::swap (cop0, cop1); - code = swap_condition (code); - break; - - default: - gcc_unreachable (); - } - - /* Only SSE4.1/SSE4.2 supports V2DImode. */ - if (mode == V2DImode) - { - switch (code) - { - case EQ: - /* SSE4.1 supports EQ. */ - if (!TARGET_SSE4_1) - return NULL; - break; - - case GT: - case GTU: - /* SSE4.2 supports GT/GTU. */ - if (!TARGET_SSE4_2) - return NULL; - break; - - default: - gcc_unreachable (); - } - } - - rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode); - rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode); - if (*negate) - std::swap (optrue, opfalse); - - /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when - not using integer masks into min (x, y) == x ? -1 : 0 (i.e. - min (x, y) == x). While we add one instruction (the minimum), - we remove the need for two instructions in the negation, as the - result is done this way. - When using masks, do it for SI/DImode element types, as it is shorter - than the two subtractions. */ - if ((code != EQ - && GET_MODE_SIZE (mode) != 64 - && vector_all_ones_operand (opfalse, data_mode) - && optrue == CONST0_RTX (data_mode)) - || (code == GTU - && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 - /* Don't do it if not using integer masks and we'd end up with - the right values in the registers though. */ - && (GET_MODE_SIZE (mode) == 64 - || !vector_all_ones_operand (optrue, data_mode) - || opfalse != CONST0_RTX (data_mode)))) - { - rtx (*gen) (rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V16SImode: - gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3; - break; - case E_V8DImode: - gen = (code == GTU) ? 
gen_uminv8di3 : gen_sminv8di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - break; - case E_V32QImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3; - break; - case E_V16HImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3; - break; - case E_V8SImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - { - gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - } - break; - case E_V16QImode: - if (code == GTU && TARGET_SSE2) - gen = gen_uminv16qi3; - else if (code == GT && TARGET_SSE4_1) - gen = gen_sminv16qi3; - break; - case E_V8HImode: - if (code == GTU && TARGET_SSE4_1) - gen = gen_uminv8hi3; - else if (code == GT && TARGET_SSE2) - gen = gen_sminv8hi3; - break; - case E_V4SImode: - if (TARGET_SSE4_1) - gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; - break; - case E_V2DImode: - if (TARGET_AVX512VL) - { - gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - } - break; - default: - break; - } - - if (gen) - { - rtx tem = gen_reg_rtx (mode); - if (!vector_operand (cop0, mode)) - cop0 = force_reg (mode, cop0); - if (!vector_operand (cop1, mode)) - cop1 = force_reg (mode, cop1); - *negate = !*negate; - emit_insn (gen (tem, cop0, cop1)); - cop1 = tem; - code = EQ; - } - } - - /* Unsigned parallel compare is not supported by the hardware. - Play some tricks to turn this into a signed comparison - against 0. */ - if (code == GTU) - { - cop0 = force_reg (mode, cop0); - - switch (mode) - { - case E_V16SImode: - case E_V8DImode: - case E_V8SImode: - case E_V4DImode: - case E_V4SImode: - case E_V2DImode: - { - rtx t1, t2, mask; - rtx (*gen_sub3) (rtx, rtx, rtx); - - switch (mode) - { - case E_V16SImode: gen_sub3 = gen_subv16si3; break; - case E_V8DImode: gen_sub3 = gen_subv8di3; break; - case E_V8SImode: gen_sub3 = gen_subv8si3; break; - case E_V4DImode: gen_sub3 = gen_subv4di3; break; - case E_V4SImode: gen_sub3 = gen_subv4si3; break; - case E_V2DImode: gen_sub3 = gen_subv2di3; break; - default: - gcc_unreachable (); - } - /* Subtract (-(INT MAX) - 1) from both operands to make - them signed. */ - mask = ix86_build_signbit_mask (mode, true, false); - t1 = gen_reg_rtx (mode); - emit_insn (gen_sub3 (t1, cop0, mask)); - - t2 = gen_reg_rtx (mode); - emit_insn (gen_sub3 (t2, cop1, mask)); - - cop0 = t1; - cop1 = t2; - code = GT; - } - break; - - case E_V64QImode: - case E_V32HImode: - case E_V32QImode: - case E_V16HImode: - case E_V16QImode: - case E_V8HImode: - /* Perform a parallel unsigned saturating subtraction. */ - x = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0, - cop1))); - - cop0 = x; - cop1 = CONST0_RTX (mode); - code = EQ; - *negate = !*negate; - break; - - default: - gcc_unreachable (); - } - } - } - - if (*negate) - std::swap (op_true, op_false); - - /* Allow the comparison to be done in one mode, but the movcc to - happen in another mode. */ - if (data_mode == mode) - { - x = ix86_expand_sse_cmp (dest, code, cop0, cop1, - op_true, op_false); - } - else - { - gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); - x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, - op_true, op_false); - if (GET_MODE (x) == mode) - x = gen_lowpart (data_mode, x); - } - - return x; -} - -/* Expand integer vector comparison. 
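- ix86_expand_int_sse_cmp above may canonicalize the comparison (e.g. NE becomes EQ) and request a negation, which is then performed by comparing the intermediate result against zero.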
*/ - -bool -ix86_expand_int_vec_cmp (rtx operands[]) -{ - rtx_code code = GET_CODE (operands[1]); - bool negate = false; - rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2], - operands[3], NULL, NULL, &negate); - - if (!cmp) - return false; - - if (negate) - cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, - CONST0_RTX (GET_MODE (cmp)), - NULL, NULL, &negate); - - gcc_assert (!negate); - - if (operands[0] != cmp) - emit_move_insn (operands[0], cmp); - - return true; -} - -/* Expand a floating-point vector conditional move; a vcond operation - rather than a movcc operation. */ - -bool -ix86_expand_fp_vcond (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[3]); - rtx cmp; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, - &operands[4], &operands[5]); - if (code == UNKNOWN) - { - rtx temp; - switch (GET_CODE (operands[3])) - { - case LTGT: - temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], - operands[5], operands[0], operands[0]); - cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], - operands[5], operands[1], operands[2]); - code = AND; - break; - case UNEQ: - temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], - operands[5], operands[0], operands[0]); - cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], - operands[5], operands[1], operands[2]); - code = IOR; - break; - default: - gcc_unreachable (); - } - cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, - OPTAB_DIRECT); - ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); - return true; - } - - if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], - operands[5], operands[1], operands[2])) - return true; - - cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], - operands[1], operands[2]); - ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); - return true; -} - -/* Expand a signed/unsigned integral vector conditional move. */ - -bool -ix86_expand_int_vcond (rtx operands[]) -{ - machine_mode data_mode = GET_MODE (operands[0]); - machine_mode mode = GET_MODE (operands[4]); - enum rtx_code code = GET_CODE (operands[3]); - bool negate = false; - rtx x, cop0, cop1; - - cop0 = operands[4]; - cop1 = operands[5]; - - /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 - and x < 0 ? 1 : 0 into (unsigned) x >> 31. 
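- The shift count is the element bit size minus one, so the arithmetic shift smears the sign bit across the whole element (yielding -1 or 0), while the logical shift leaves only the sign bit (yielding 1 or 0).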
*/ - if ((code == LT || code == GE) - && data_mode == mode - && cop1 == CONST0_RTX (mode) - && operands[1 + (code == LT)] == CONST0_RTX (data_mode) - && GET_MODE_UNIT_SIZE (data_mode) > 1 - && GET_MODE_UNIT_SIZE (data_mode) <= 8 - && (GET_MODE_SIZE (data_mode) == 16 - || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) - { - rtx negop = operands[2 - (code == LT)]; - int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1; - if (negop == CONST1_RTX (data_mode)) - { - rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), - operands[0], 1, OPTAB_DIRECT); - if (res != operands[0]) - emit_move_insn (operands[0], res); - return true; - } - else if (GET_MODE_INNER (data_mode) != DImode - && vector_all_ones_operand (negop, data_mode)) - { - rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), - operands[0], 0, OPTAB_DIRECT); - if (res != operands[0]) - emit_move_insn (operands[0], res); - return true; - } - } - - if (!nonimmediate_operand (cop1, mode)) - cop1 = force_reg (mode, cop1); - if (!general_operand (operands[1], data_mode)) - operands[1] = force_reg (data_mode, operands[1]); - if (!general_operand (operands[2], data_mode)) - operands[2] = force_reg (data_mode, operands[2]); - - x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1, - operands[1], operands[2], &negate); - - if (!x) - return false; - - ix86_expand_sse_movcc (operands[0], x, operands[1+negate], - operands[2-negate]); - return true; -} - -/* AVX512F does support 64-byte integer vector operations, - thus the longest vector we are faced with is V64QImode. */ -#define MAX_VECT_LEN 64 - -struct expand_vec_perm_d -{ - rtx target, op0, op1; - unsigned char perm[MAX_VECT_LEN]; - machine_mode vmode; - unsigned char nelt; - bool one_operand_p; - bool testing_p; -}; - -static bool -ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, - struct expand_vec_perm_d *d) -{ - /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const - expander, so args are either in d, or in op0, op1 etc. */ - machine_mode mode = GET_MODE (d ? 
d->op0 : op0); - machine_mode maskmode = mode; - rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V8HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_vpermt2varv8hi3; - break; - case E_V16HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_vpermt2varv16hi3; - break; - case E_V64QImode: - if (TARGET_AVX512VBMI) - gen = gen_avx512bw_vpermt2varv64qi3; - break; - case E_V32HImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vpermt2varv32hi3; - break; - case E_V4SImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv4si3; - break; - case E_V8SImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv8si3; - break; - case E_V16SImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vpermt2varv16si3; - break; - case E_V4SFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv4sf3; - maskmode = V4SImode; - } - break; - case E_V8SFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv8sf3; - maskmode = V8SImode; - } - break; - case E_V16SFmode: - if (TARGET_AVX512F) - { - gen = gen_avx512f_vpermt2varv16sf3; - maskmode = V16SImode; - } - break; - case E_V2DImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv2di3; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv4di3; - break; - case E_V8DImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vpermt2varv8di3; - break; - case E_V2DFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv2df3; - maskmode = V2DImode; - } - break; - case E_V4DFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv4df3; - maskmode = V4DImode; - } - break; - case E_V8DFmode: - if (TARGET_AVX512F) - { - gen = gen_avx512f_vpermt2varv8df3; - maskmode = V8DImode; - } - break; - default: - break; - } - - if (gen == NULL) - return false; - - /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const - expander, so args are either in d, or in op0, op1 etc. */ - if (d) - { - rtx vec[64]; - target = d->target; - op0 = d->op0; - op1 = d->op1; - for (int i = 0; i < d->nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); - } - - emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); - return true; -} - -/* Expand a variable vector permutation. */ - -void -ix86_expand_vec_perm (rtx operands[]) -{ - rtx target = operands[0]; - rtx op0 = operands[1]; - rtx op1 = operands[2]; - rtx mask = operands[3]; - rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32]; - machine_mode mode = GET_MODE (op0); - machine_mode maskmode = GET_MODE (mask); - int w, e, i; - bool one_operand_shuffle = rtx_equal_p (op0, op1); - - /* Number of elements in the vector. */ - w = GET_MODE_NUNITS (mode); - e = GET_MODE_UNIT_SIZE (mode); - gcc_assert (w <= 64); - - if (TARGET_AVX512F && one_operand_shuffle) - { - rtx (*gen) (rtx, rtx, rtx) = NULL; - switch (mode) - { - case E_V16SImode: - gen =gen_avx512f_permvarv16si; - break; - case E_V16SFmode: - gen = gen_avx512f_permvarv16sf; - break; - case E_V8DImode: - gen = gen_avx512f_permvarv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_permvarv8df; - break; - default: - break; - } - if (gen != NULL) - { - emit_insn (gen (target, op0, mask)); - return; - } - } - - if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) - return; - - if (TARGET_AVX2) - { - if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) - { - /* Unfortunately, the VPERMQ and VPERMPD instructions only support - an constant shuffle operand. 
With a tiny bit of effort we can - use VPERMD instead. A re-interpretation stall for V4DFmode is - unfortunate but there's no avoiding it. - Similarly for V16HImode we don't have instructions for variable - shuffling, while for V32QImode we can use after preparing suitable - masks vpshufb; vpshufb; vpermq; vpor. */ - - if (mode == V16HImode) - { - maskmode = mode = V32QImode; - w = 32; - e = 1; - } - else - { - maskmode = mode = V8SImode; - w = 8; - e = 4; - } - t1 = gen_reg_rtx (maskmode); - - /* Replicate the low bits of the V4DImode mask into V8SImode: - mask = { A B C D } - t1 = { A A B B C C D D }. */ - for (i = 0; i < w / 2; ++i) - vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); - vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - vt = force_reg (maskmode, vt); - mask = gen_lowpart (maskmode, mask); - if (maskmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); - else - emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); - - /* Multiply the shuffle indicies by two. */ - t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, - OPTAB_DIRECT); - - /* Add one to the odd shuffle indicies: - t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ - for (i = 0; i < w / 2; ++i) - { - vec[i * 2] = const0_rtx; - vec[i * 2 + 1] = const1_rtx; - } - vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - vt = validize_mem (force_const_mem (maskmode, vt)); - t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, - OPTAB_DIRECT); - - /* Continue as if V8SImode (resp. V32QImode) was used initially. */ - operands[3] = mask = t1; - target = gen_reg_rtx (mode); - op0 = gen_lowpart (mode, op0); - op1 = gen_lowpart (mode, op1); - } - - switch (mode) - { - case E_V8SImode: - /* The VPERMD and VPERMPS instructions already properly ignore - the high bits of the shuffle elements. No need for us to - perform an AND ourselves. */ - if (one_operand_shuffle) - { - emit_insn (gen_avx2_permvarv8si (target, op0, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else - { - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); - emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); - goto merge_two; - } - return; - - case E_V8SFmode: - mask = gen_lowpart (V8SImode, mask); - if (one_operand_shuffle) - emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); - else - { - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SFmode); - emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); - emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); - goto merge_two; - } - return; - - case E_V4SImode: - /* By combining the two 128-bit input vectors into one 256-bit - input vector, we can use VPERMD and VPERMPS for the full - two-operand shuffle. 
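- The desired 128-bit result is then read back from the low half of the wide permute.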
*/ - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); - emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); - emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); - return; - - case E_V4SFmode: - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SImode); - mask = gen_lowpart (V4SImode, mask); - emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); - emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); - emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); - return; - - case E_V32QImode: - t1 = gen_reg_rtx (V32QImode); - t2 = gen_reg_rtx (V32QImode); - t3 = gen_reg_rtx (V32QImode); - vt2 = GEN_INT (-128); - vt = gen_const_vec_duplicate (V32QImode, vt2); - vt = force_reg (V32QImode, vt); - for (i = 0; i < 32; i++) - vec[i] = i < 16 ? vt2 : const0_rtx; - vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); - vt2 = force_reg (V32QImode, vt2); - /* From mask create two adjusted masks, which contain the same - bits as mask in the low 7 bits of each vector element. - The first mask will have the most significant bit clear - if it requests element from the same 128-bit lane - and MSB set if it requests element from the other 128-bit lane. - The second mask will have the opposite values of the MSB, - and additionally will have its 128-bit lanes swapped. - E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have - t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and - t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... - stands for other 12 bytes. */ - /* The bit whether element is from the same lane or the other - lane is bit 4, so shift it up by 3 to the MSB position. */ - t5 = gen_reg_rtx (V4DImode); - emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask), - GEN_INT (3))); - /* Clear MSB bits from the mask just in case it had them set. */ - emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); - /* After this t1 will have MSB set for elements from other lane. */ - emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2)); - /* Clear bits other than MSB. */ - emit_insn (gen_andv32qi3 (t1, t1, vt)); - /* Or in the lower bits from mask into t3. */ - emit_insn (gen_iorv32qi3 (t3, t1, t2)); - /* And invert MSB bits in t1, so MSB is set for elements from the same - lane. */ - emit_insn (gen_xorv32qi3 (t1, t1, vt)); - /* Swap 128-bit lanes in t3. */ - t6 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - /* And or in the lower bits from mask into t1. */ - emit_insn (gen_iorv32qi3 (t1, t1, t2)); - if (one_operand_shuffle) - { - /* Each of these shuffles will put 0s in places where - element from the other 128-bit lane is needed, otherwise - will shuffle in the requested value. */ - emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); - /* For t3 the 128-bit lanes are swapped again. */ - t7 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - /* And oring both together leads to the result. 
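- Every byte position receives the requested value from exactly one of the two pshufb results and zero from the other.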
*/ - emit_insn (gen_iorv32qi3 (target, t1, - gen_lowpart (V32QImode, t7))); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - return; - } - - t4 = gen_reg_rtx (V32QImode); - /* Similarly to the above one_operand_shuffle code, - just for repeated twice for each operand. merge_two: - code will merge the two results together. */ - emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); - emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); - t7 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - t8 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7))); - emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8))); - t1 = t4; - t2 = t3; - goto merge_two; - - default: - gcc_assert (GET_MODE_SIZE (mode) <= 16); - break; - } - } - - if (TARGET_XOP) - { - /* The XOP VPPERM insn supports three inputs. By ignoring the - one_operand_shuffle special case, we avoid creating another - set of constant vectors in memory. */ - one_operand_shuffle = false; - - /* mask = mask & {2*w-1, ...} */ - vt = GEN_INT (2*w - 1); - } - else - { - /* mask = mask & {w-1, ...} */ - vt = GEN_INT (w - 1); - } - - vt = gen_const_vec_duplicate (maskmode, vt); - mask = expand_simple_binop (maskmode, AND, mask, vt, - NULL_RTX, 0, OPTAB_DIRECT); - - /* For non-QImode operations, convert the word permutation control - into a byte permutation control. */ - if (mode != V16QImode) - { - mask = expand_simple_binop (maskmode, ASHIFT, mask, - GEN_INT (exact_log2 (e)), - NULL_RTX, 0, OPTAB_DIRECT); - - /* Convert mask to vector of chars. */ - mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); - - /* Replicate each of the input bytes into byte positions: - (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} - (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} - (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */ - for (i = 0; i < 16; ++i) - vec[i] = GEN_INT (i/e * e); - vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = validize_mem (force_const_mem (V16QImode, vt)); - if (TARGET_XOP) - emit_insn (gen_xop_pperm (mask, mask, mask, vt)); - else - emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); - - /* Convert it into the byte positions by doing - mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ - for (i = 0; i < 16; ++i) - vec[i] = GEN_INT (i % e); - vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = validize_mem (force_const_mem (V16QImode, vt)); - emit_insn (gen_addv16qi3 (mask, mask, vt)); - } - - /* The actual shuffle operations all operate on V16QImode. 
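- Both inputs are therefore punned to V16QImode lowparts before the pshufb (or XOP vpperm) below is emitted.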
*/ - op0 = gen_lowpart (V16QImode, op0); - op1 = gen_lowpart (V16QImode, op1); - - if (TARGET_XOP) - { - if (GET_MODE (target) != V16QImode) - target = gen_reg_rtx (V16QImode); - emit_insn (gen_xop_pperm (target, op0, op1, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else if (one_operand_shuffle) - { - if (GET_MODE (target) != V16QImode) - target = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else - { - rtx xops[6]; - bool ok; - - /* Shuffle the two input vectors independently. */ - t1 = gen_reg_rtx (V16QImode); - t2 = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); - emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); - - merge_two: - /* Then merge them together. The key is whether any given control - element contained a bit set that indicates the second word. */ - mask = operands[3]; - vt = GEN_INT (w); - if (maskmode == V2DImode && !TARGET_SSE4_1) - { - /* Without SSE4.1, we don't have V2DImode EQ. Perform one - more shuffle to convert the V2DI input mask into a V4SI - input mask. At which point the masking that expand_int_vcond - will work as desired. */ - rtx t3 = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), - const0_rtx, const0_rtx, - const2_rtx, const2_rtx)); - mask = t3; - maskmode = V4SImode; - e = w = 4; - } - - vt = gen_const_vec_duplicate (maskmode, vt); - vt = force_reg (maskmode, vt); - mask = expand_simple_binop (maskmode, AND, mask, vt, - NULL_RTX, 0, OPTAB_DIRECT); - - if (GET_MODE (target) != mode) - target = gen_reg_rtx (mode); - xops[0] = target; - xops[1] = gen_lowpart (mode, t2); - xops[2] = gen_lowpart (mode, t1); - xops[3] = gen_rtx_EQ (maskmode, mask, vt); - xops[4] = mask; - xops[5] = vt; - ok = ix86_expand_int_vcond (xops); - gcc_assert (ok); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } -} - -/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is - true if we should do zero extension, else sign extension. HIGH_P is - true if we want the N/2 high elements, else the low elements. */ - -void -ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) -{ - machine_mode imode = GET_MODE (src); - rtx tmp; - - if (TARGET_SSE4_1) - { - rtx (*unpack)(rtx, rtx); - rtx (*extract)(rtx, rtx) = NULL; - machine_mode halfmode = BLKmode; - - switch (imode) - { - case E_V64QImode: - if (unsigned_p) - unpack = gen_avx512bw_zero_extendv32qiv32hi2; - else - unpack = gen_avx512bw_sign_extendv32qiv32hi2; - halfmode = V32QImode; - extract - = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; - break; - case E_V32QImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv16qiv16hi2; - else - unpack = gen_avx2_sign_extendv16qiv16hi2; - halfmode = V16QImode; - extract - = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi; - break; - case E_V32HImode: - if (unsigned_p) - unpack = gen_avx512f_zero_extendv16hiv16si2; - else - unpack = gen_avx512f_sign_extendv16hiv16si2; - halfmode = V16HImode; - extract - = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi; - break; - case E_V16HImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv8hiv8si2; - else - unpack = gen_avx2_sign_extendv8hiv8si2; - halfmode = V8HImode; - extract - = high_p ? 
gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi; - break; - case E_V16SImode: - if (unsigned_p) - unpack = gen_avx512f_zero_extendv8siv8di2; - else - unpack = gen_avx512f_sign_extendv8siv8di2; - halfmode = V8SImode; - extract - = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si; - break; - case E_V8SImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv4siv4di2; - else - unpack = gen_avx2_sign_extendv4siv4di2; - halfmode = V4SImode; - extract - = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si; - break; - case E_V16QImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv8qiv8hi2; - else - unpack = gen_sse4_1_sign_extendv8qiv8hi2; - break; - case E_V8HImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv4hiv4si2; - else - unpack = gen_sse4_1_sign_extendv4hiv4si2; - break; - case E_V4SImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv2siv2di2; - else - unpack = gen_sse4_1_sign_extendv2siv2di2; - break; - default: - gcc_unreachable (); - } - - if (GET_MODE_SIZE (imode) >= 32) - { - tmp = gen_reg_rtx (halfmode); - emit_insn (extract (tmp, src)); - } - else if (high_p) - { - /* Shift higher 8 bytes to lower 8 bytes. */ - tmp = gen_reg_rtx (V1TImode); - emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), - GEN_INT (64))); - tmp = gen_lowpart (imode, tmp); - } - else - tmp = src; - - emit_insn (unpack (dest, tmp)); - } - else - { - rtx (*unpack)(rtx, rtx, rtx); - - switch (imode) - { - case E_V16QImode: - if (high_p) - unpack = gen_vec_interleave_highv16qi; - else - unpack = gen_vec_interleave_lowv16qi; - break; - case E_V8HImode: - if (high_p) - unpack = gen_vec_interleave_highv8hi; - else - unpack = gen_vec_interleave_lowv8hi; - break; - case E_V4SImode: - if (high_p) - unpack = gen_vec_interleave_highv4si; - else - unpack = gen_vec_interleave_lowv4si; - break; - default: - gcc_unreachable (); - } - - if (unsigned_p) - tmp = force_reg (imode, CONST0_RTX (imode)); - else - tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), - src, pc_rtx, pc_rtx); - - rtx tmp2 = gen_reg_rtx (imode); - emit_insn (unpack (tmp2, src, tmp)); - emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2)); - } -} - -/* Expand conditional increment or decrement using adb/sbb instructions. - The default case using setcc followed by the conditional move can be - done by generic code. */ -bool -ix86_expand_int_addcc (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]); - rtx flags; - rtx (*insn)(rtx, rtx, rtx, rtx, rtx); - rtx compare_op; - rtx val = const0_rtx; - bool fpcmp = false; - machine_mode mode; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (operands[3] != const1_rtx - && operands[3] != constm1_rtx) - return false; - if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) - return false; - code = GET_CODE (compare_op); - - flags = XEXP (compare_op, 0); - - if (GET_MODE (flags) == CCFPmode) - { - fpcmp = true; - code = ix86_fp_compare_code_to_integer (code); - } - - if (code != LTU) - { - val = constm1_rtx; - if (fpcmp) - PUT_CODE (compare_op, - reverse_condition_maybe_unordered - (GET_CODE (compare_op))); - else - PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); - } - - mode = GET_MODE (operands[0]); - - /* Construct either adc or sbb insn. 
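- Which of the two is used, and whether the constant input is 0 or -1, depends on the direction of the carry comparison and on whether operand 3 is 1 or -1.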
*/ - if ((code == LTU) == (operands[3] == constm1_rtx)) - { - switch (mode) - { - case E_QImode: - insn = gen_subqi3_carry; - break; - case E_HImode: - insn = gen_subhi3_carry; - break; - case E_SImode: - insn = gen_subsi3_carry; - break; - case E_DImode: - insn = gen_subdi3_carry; - break; - default: - gcc_unreachable (); - } - } - else - { - switch (mode) - { - case E_QImode: - insn = gen_addqi3_carry; - break; - case E_HImode: - insn = gen_addhi3_carry; - break; - case E_SImode: - insn = gen_addsi3_carry; - break; - case E_DImode: - insn = gen_adddi3_carry; - break; - default: - gcc_unreachable (); - } - } - emit_insn (insn (operands[0], operands[2], val, flags, compare_op)); - - return true; -} - - -/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, - but works for floating pointer parameters and nonoffsetable memories. - For pushes, it returns just stack offsets; the values will be saved - in the right order. Maximally three parts are generated. */ - -static int -ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode) -{ - int size; - - if (!TARGET_64BIT) - size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4; - else - size = (GET_MODE_SIZE (mode) + 4) / 8; - - gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); - gcc_assert (size >= 2 && size <= 4); - - /* Optimize constant pool reference to immediates. This is used by fp - moves, that force all constants to memory to allow combining. */ - if (MEM_P (operand) && MEM_READONLY_P (operand)) - operand = avoid_constant_pool_reference (operand); - - if (MEM_P (operand) && !offsettable_memref_p (operand)) - { - /* The only non-offsetable memories we handle are pushes. */ - int ok = push_operand (operand, VOIDmode); - - gcc_assert (ok); - - operand = copy_rtx (operand); - PUT_MODE (operand, word_mode); - parts[0] = parts[1] = parts[2] = parts[3] = operand; - return size; - } - - if (GET_CODE (operand) == CONST_VECTOR) - { - scalar_int_mode imode = int_mode_for_mode (mode).require (); - /* Caution: if we looked through a constant pool memory above, - the operand may actually have a different mode now. That's - ok, since we want to pun this all the way back to an integer. */ - operand = simplify_subreg (imode, operand, GET_MODE (operand), 0); - gcc_assert (operand != NULL); - mode = imode; - } - - if (!TARGET_64BIT) - { - if (mode == DImode) - split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); - else - { - int i; - - if (REG_P (operand)) - { - gcc_assert (reload_completed); - for (i = 0; i < size; i++) - parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i); - } - else if (offsettable_memref_p (operand)) - { - operand = adjust_address (operand, SImode, 0); - parts[0] = operand; - for (i = 1; i < size; i++) - parts[i] = adjust_address (operand, SImode, 4 * i); - } - else if (CONST_DOUBLE_P (operand)) - { - const REAL_VALUE_TYPE *r; - long l[4]; - - r = CONST_DOUBLE_REAL_VALUE (operand); - switch (mode) - { - case E_TFmode: - real_to_target (l, r, mode); - parts[3] = gen_int_mode (l[3], SImode); - parts[2] = gen_int_mode (l[2], SImode); - break; - case E_XFmode: - /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since - long double may not be 80-bit. 
*/ - real_to_target (l, r, mode); - parts[2] = gen_int_mode (l[2], SImode); - break; - case E_DFmode: - REAL_VALUE_TO_TARGET_DOUBLE (*r, l); - break; - default: - gcc_unreachable (); - } - parts[1] = gen_int_mode (l[1], SImode); - parts[0] = gen_int_mode (l[0], SImode); - } - else - gcc_unreachable (); - } - } - else - { - if (mode == TImode) - split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); - if (mode == XFmode || mode == TFmode) - { - machine_mode upper_mode = mode==XFmode ? SImode : DImode; - if (REG_P (operand)) - { - gcc_assert (reload_completed); - parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); - parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1); - } - else if (offsettable_memref_p (operand)) - { - operand = adjust_address (operand, DImode, 0); - parts[0] = operand; - parts[1] = adjust_address (operand, upper_mode, 8); - } - else if (CONST_DOUBLE_P (operand)) - { - long l[4]; - - real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode); - - /* real_to_target puts 32-bit pieces in each long. */ - parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff)) - | ((l[1] & HOST_WIDE_INT_C (0xffffffff)) - << 32), DImode); - - if (upper_mode == SImode) - parts[1] = gen_int_mode (l[2], SImode); - else - parts[1] - = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff)) - | ((l[3] & HOST_WIDE_INT_C (0xffffffff)) - << 32), DImode); - } - else - gcc_unreachable (); - } - } - - return size; -} - -/* Emit insns to perform a move or push of DI, DF, XF, and TF values. - Return false when normal moves are needed; true when all required - insns have been emitted. Operands 2-4 contain the input values - int the correct order; operands 5-7 contain the output values. */ - -void -ix86_split_long_move (rtx operands[]) -{ - rtx part[2][4]; - int nparts, i, j; - int push = 0; - int collisions = 0; - machine_mode mode = GET_MODE (operands[0]); - bool collisionparts[4]; - - /* The DFmode expanders may ask us to move double. - For 64bit target this is single move. By hiding the fact - here we simplify i386.md splitters. */ - if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8) - { - /* Optimize constant pool reference to immediates. This is used by - fp moves, that force all constants to memory to allow combining. */ - - if (MEM_P (operands[1]) - && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF - && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) - operands[1] = get_pool_constant (XEXP (operands[1], 0)); - if (push_operand (operands[0], VOIDmode)) - { - operands[0] = copy_rtx (operands[0]); - PUT_MODE (operands[0], word_mode); - } - else - operands[0] = gen_lowpart (DImode, operands[0]); - operands[1] = gen_lowpart (DImode, operands[1]); - emit_move_insn (operands[0], operands[1]); - return; - } - - /* The only non-offsettable memory we handle is push. */ - if (push_operand (operands[0], VOIDmode)) - push = 1; - else - gcc_assert (!MEM_P (operands[0]) - || offsettable_memref_p (operands[0])); - - nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); - ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); - - /* When emitting push, take care for source operands on the stack. */ - if (push && MEM_P (operands[1]) - && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) - { - rtx src_base = XEXP (part[1][nparts - 1], 0); - - /* Compensate for the stack decrement by 4. 
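- This pairs with the explicit 4-byte stack-pointer adjustment emitted below when pushing a 12-byte XFmode value with TARGET_128BIT_LONG_DOUBLE.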
*/ - if (!TARGET_64BIT && nparts == 3 - && mode == XFmode && TARGET_128BIT_LONG_DOUBLE) - src_base = plus_constant (Pmode, src_base, 4); - - /* src_base refers to the stack pointer and is - automatically decreased by emitted push. */ - for (i = 0; i < nparts; i++) - part[1][i] = change_address (part[1][i], - GET_MODE (part[1][i]), src_base); - } - - /* We need to do copy in the right order in case an address register - of the source overlaps the destination. */ - if (REG_P (part[0][0]) && MEM_P (part[1][0])) - { - rtx tmp; - - for (i = 0; i < nparts; i++) - { - collisionparts[i] - = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0)); - if (collisionparts[i]) - collisions++; - } - - /* Collision in the middle part can be handled by reordering. */ - if (collisions == 1 && nparts == 3 && collisionparts [1]) - { - std::swap (part[0][1], part[0][2]); - std::swap (part[1][1], part[1][2]); - } - else if (collisions == 1 - && nparts == 4 - && (collisionparts [1] || collisionparts [2])) - { - if (collisionparts [1]) - { - std::swap (part[0][1], part[0][2]); - std::swap (part[1][1], part[1][2]); - } - else - { - std::swap (part[0][2], part[0][3]); - std::swap (part[1][2], part[1][3]); - } - } - - /* If there are more collisions, we can't handle it by reordering. - Do an lea to the last part and use only one colliding move. */ - else if (collisions > 1) - { - rtx base, addr; - - collisions = 1; - - base = part[0][nparts - 1]; - - /* Handle the case when the last part isn't valid for lea. - Happens in 64-bit mode storing the 12-byte XFmode. */ - if (GET_MODE (base) != Pmode) - base = gen_rtx_REG (Pmode, REGNO (base)); - - addr = XEXP (part[1][0], 0); - if (TARGET_TLS_DIRECT_SEG_REFS) - { - struct ix86_address parts; - int ok = ix86_decompose_address (addr, &parts); - gcc_assert (ok); - /* It is not valid to use %gs: or %fs: in lea. */ - gcc_assert (parts.seg == ADDR_SPACE_GENERIC); - } - emit_insn (gen_rtx_SET (base, addr)); - part[1][0] = replace_equiv_address (part[1][0], base); - for (i = 1; i < nparts; i++) - { - tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i); - part[1][i] = replace_equiv_address (part[1][i], tmp); - } - } - } - - if (push) - { - if (!TARGET_64BIT) - { - if (nparts == 3) - { - if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode) - emit_insn (ix86_gen_add3 (stack_pointer_rtx, - stack_pointer_rtx, GEN_INT (-4))); - emit_move_insn (part[0][2], part[1][2]); - } - else if (nparts == 4) - { - emit_move_insn (part[0][3], part[1][3]); - emit_move_insn (part[0][2], part[1][2]); - } - } - else - { - /* In 64bit mode we don't have 32bit push available. In case this is - register, it is OK - we will just use larger counterpart. We also - retype memory - these comes from attempt to avoid REX prefix on - moving of second half of TFmode value. */ - if (GET_MODE (part[1][1]) == SImode) - { - switch (GET_CODE (part[1][1])) - { - case MEM: - part[1][1] = adjust_address (part[1][1], DImode, 0); - break; - - case REG: - part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); - break; - - default: - gcc_unreachable (); - } - - if (GET_MODE (part[1][0]) == SImode) - part[1][0] = part[1][1]; - } - } - emit_move_insn (part[0][1], part[1][1]); - emit_move_insn (part[0][0], part[1][0]); - return; - } - - /* Choose correct order to not overwrite the source before it is copied. 
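- If the lowest destination register also appears as a later source part (or in the source address), the moves are emitted from the highest part down instead of from the lowest part up.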
*/ - if ((REG_P (part[0][0]) - && REG_P (part[1][1]) - && (REGNO (part[0][0]) == REGNO (part[1][1]) - || (nparts == 3 - && REGNO (part[0][0]) == REGNO (part[1][2])) - || (nparts == 4 - && REGNO (part[0][0]) == REGNO (part[1][3])))) - || (collisions > 0 - && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) - { - for (i = 0, j = nparts - 1; i < nparts; i++, j--) - { - operands[2 + i] = part[0][j]; - operands[6 + i] = part[1][j]; - } - } - else - { - for (i = 0; i < nparts; i++) - { - operands[2 + i] = part[0][i]; - operands[6 + i] = part[1][i]; - } - } - - /* If optimizing for size, attempt to locally unCSE nonzero constants. */ - if (optimize_insn_for_size_p ()) - { - for (j = 0; j < nparts - 1; j++) - if (CONST_INT_P (operands[6 + j]) - && operands[6 + j] != const0_rtx - && REG_P (operands[2 + j])) - for (i = j; i < nparts - 1; i++) - if (CONST_INT_P (operands[7 + i]) - && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) - operands[7 + i] = operands[2 + j]; - } - - for (i = 0; i < nparts; i++) - emit_move_insn (operands[2 + i], operands[6 + i]); - - return; -} - -/* Helper function of ix86_split_ashl used to generate an SImode/DImode - left shift by a constant, either using a single shift or - a sequence of add instructions. */ - -static void -ix86_expand_ashl_const (rtx operand, int count, machine_mode mode) -{ - rtx (*insn)(rtx, rtx, rtx); - - if (count == 1 - || (count * ix86_cost->add <= ix86_cost->shift_const - && !optimize_insn_for_size_p ())) - { - insn = mode == DImode ? gen_addsi3 : gen_adddi3; - while (count-- > 0) - emit_insn (insn (operand, operand, operand)); - } - else - { - insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3; - emit_insn (insn (operand, operand, GEN_INT (count))); - } -} - -void -ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_ashl3)(rtx, rtx, rtx); - rtx (*gen_shld)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count >= half_width) - { - emit_move_insn (high[0], low[1]); - emit_move_insn (low[0], const0_rtx); - - if (count > half_width) - ix86_expand_ashl_const (high[0], count - half_width, mode); - } - else - { - gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shld (high[0], low[0], GEN_INT (count))); - ix86_expand_ashl_const (low[0], count, mode); - } - return; - } - - split_double_mode (mode, operands, 1, low, high); - - gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3; - - if (operands[1] == const1_rtx) - { - /* Assuming we've chosen a QImode capable registers, then 1 << N - can be done with two 32/64-bit shifts, no branches, no cmoves. 
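- Both halves are cleared first; testing the half-width bit of the count then selects via setcc which half receives the initial 1, and the final variable shifts (which use only the count modulo the half width) move it into position.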
*/ - if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0])) - { - rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG); - - ix86_expand_clear (low[0]); - ix86_expand_clear (high[0]); - emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width))); - - d = gen_lowpart (QImode, low[0]); - d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); - s = gen_rtx_EQ (QImode, flags, const0_rtx); - emit_insn (gen_rtx_SET (d, s)); - - d = gen_lowpart (QImode, high[0]); - d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); - s = gen_rtx_NE (QImode, flags, const0_rtx); - emit_insn (gen_rtx_SET (d, s)); - } - - /* Otherwise, we can get the same results by manually performing - a bit extract operation on bit 5/6, and then performing the two - shifts. The two methods of getting 0/1 into low/high are exactly - the same size. Avoiding the shift in the bit extract case helps - pentium4 a bit; no one else seems to care much either way. */ - else - { - machine_mode half_mode; - rtx (*gen_lshr3)(rtx, rtx, rtx); - rtx (*gen_and3)(rtx, rtx, rtx); - rtx (*gen_xor3)(rtx, rtx, rtx); - HOST_WIDE_INT bits; - rtx x; - - if (mode == DImode) - { - half_mode = SImode; - gen_lshr3 = gen_lshrsi3; - gen_and3 = gen_andsi3; - gen_xor3 = gen_xorsi3; - bits = 5; - } - else - { - half_mode = DImode; - gen_lshr3 = gen_lshrdi3; - gen_and3 = gen_anddi3; - gen_xor3 = gen_xordi3; - bits = 6; - } - - if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ()) - x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]); - else - x = gen_lowpart (half_mode, operands[2]); - emit_insn (gen_rtx_SET (high[0], x)); - - emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits))); - emit_insn (gen_and3 (high[0], high[0], const1_rtx)); - emit_move_insn (low[0], high[0]); - emit_insn (gen_xor3 (low[0], low[0], const1_rtx)); - } - - emit_insn (gen_ashl3 (low[0], low[0], operands[2])); - emit_insn (gen_ashl3 (high[0], high[0], operands[2])); - return; - } - - if (operands[1] == constm1_rtx) - { - /* For -1 << N, we can avoid the shld instruction, because we - know that we're shifting 0...31/63 ones into a -1. */ - emit_move_insn (low[0], constm1_rtx); - if (optimize_insn_for_size_p ()) - emit_move_insn (high[0], low[0]); - else - emit_move_insn (high[0], constm1_rtx); - } - else - { - gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - emit_insn (gen_shld (high[0], low[0], operands[2])); - } - - emit_insn (gen_ashl3 (low[0], low[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; - - ix86_expand_clear (scratch); - emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch)); - } - else - { - rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; - - emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2])); - } -} - -void -ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_ashr3)(rtx, rtx, rtx) - = mode == DImode ? 
gen_ashrsi3 : gen_ashrdi3; - rtx (*gen_shrd)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count == GET_MODE_BITSIZE (mode) - 1) - { - emit_move_insn (high[0], high[1]); - emit_insn (gen_ashr3 (high[0], high[0], - GEN_INT (half_width - 1))); - emit_move_insn (low[0], high[0]); - - } - else if (count >= half_width) - { - emit_move_insn (low[0], high[1]); - emit_move_insn (high[0], low[0]); - emit_insn (gen_ashr3 (high[0], high[0], - GEN_INT (half_width - 1))); - - if (count > half_width) - emit_insn (gen_ashr3 (low[0], low[0], - GEN_INT (count - half_width))); - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); - emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count))); - } - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - - emit_insn (gen_shrd (low[0], high[0], operands[2])); - emit_insn (gen_ashr3 (high[0], high[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; - - emit_move_insn (scratch, high[0]); - emit_insn (gen_ashr3 (scratch, scratch, - GEN_INT (half_width - 1))); - emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], - scratch)); - } - else - { - rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3; - - emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2])); - } - } -} - -void -ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_lshr3)(rtx, rtx, rtx) - = mode == DImode ? gen_lshrsi3 : gen_lshrdi3; - rtx (*gen_shrd)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count >= half_width) - { - emit_move_insn (low[0], high[1]); - ix86_expand_clear (high[0]); - - if (count > half_width) - emit_insn (gen_lshr3 (low[0], low[0], - GEN_INT (count - half_width))); - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); - emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count))); - } - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - - emit_insn (gen_shrd (low[0], high[0], operands[2])); - emit_insn (gen_lshr3 (high[0], high[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) - = mode == DImode ? 
gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; - - ix86_expand_clear (scratch); - emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], - scratch)); - } - else - { - rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; - - emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2])); - } - } -} - -/* Predict just emitted jump instruction to be taken with probability PROB. */ -static void -predict_jump (int prob) -{ - rtx_insn *insn = get_last_insn (); - gcc_assert (JUMP_P (insn)); - add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob)); -} - -/* Helper function for the string operations below. Dest VARIABLE whether - it is aligned to VALUE bytes. If true, jump to the label. */ -static rtx_code_label * -ix86_expand_aligntest (rtx variable, int value, bool epilogue) -{ - rtx_code_label *label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); - if (GET_MODE (variable) == DImode) - emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); - else - emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); - emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), - 1, label); - if (epilogue) - predict_jump (REG_BR_PROB_BASE * 50 / 100); - else - predict_jump (REG_BR_PROB_BASE * 90 / 100); - return label; -} - -/* Adjust COUNTER by the VALUE. */ -static void -ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value) -{ - rtx (*gen_add)(rtx, rtx, rtx) - = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3; - - emit_insn (gen_add (countreg, countreg, GEN_INT (-value))); -} - -/* Zero extend possibly SImode EXP to Pmode register. */ -rtx -ix86_zero_extend_to_Pmode (rtx exp) -{ - return force_reg (Pmode, convert_to_mode (Pmode, exp, 1)); -} - -/* Divide COUNTREG by SCALE. */ -static rtx -scale_counter (rtx countreg, int scale) -{ - rtx sc; - - if (scale == 1) - return countreg; - if (CONST_INT_P (countreg)) - return GEN_INT (INTVAL (countreg) / scale); - gcc_assert (REG_P (countreg)); - - sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, - GEN_INT (exact_log2 (scale)), - NULL, 1, OPTAB_DIRECT); - return sc; -} - -/* Return mode for the memcpy/memset loop counter. Prefer SImode over - DImode for constant loop counts. */ - -static machine_mode -counter_mode (rtx count_exp) -{ - if (GET_MODE (count_exp) != VOIDmode) - return GET_MODE (count_exp); - if (!CONST_INT_P (count_exp)) - return Pmode; - if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) - return DImode; - return SImode; -} - -/* Copy the address to a Pmode register. This is used for x32 to - truncate DImode TLS address to a SImode register. */ - -static rtx -ix86_copy_addr_to_reg (rtx addr) -{ - rtx reg; - if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode) - { - reg = copy_addr_to_reg (addr); - REG_POINTER (reg) = 1; - return reg; - } - else - { - gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode); - reg = copy_to_mode_reg (DImode, addr); - REG_POINTER (reg) = 1; - return gen_rtx_SUBREG (SImode, reg, 0); - } -} - -/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR - to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT - specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set - memory by VALUE (supposed to be in MODE). - - The size is rounded down to whole number of chunk size moved at once. - SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. 
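- For example, with MODE == SImode and UNROLL == 4 every iteration of the emitted loop moves 16 bytes; any remainder below 16 bytes is not handled here.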
*/ - - -static void -expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, - rtx count, machine_mode mode, int unroll, - int expected_size, bool issetmem) -{ - rtx_code_label *out_label, *top_label; - rtx iter, tmp; - machine_mode iter_mode = counter_mode (count); - int piece_size_n = GET_MODE_SIZE (mode) * unroll; - rtx piece_size = GEN_INT (piece_size_n); - rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); - rtx size; - int i; - - top_label = gen_label_rtx (); - out_label = gen_label_rtx (); - iter = gen_reg_rtx (iter_mode); - - size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, - NULL, 1, OPTAB_DIRECT); - /* Those two should combine. */ - if (piece_size == const1_rtx) - { - emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, - true, out_label); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - } - emit_move_insn (iter, const0_rtx); - - emit_label (top_label); - - tmp = convert_modes (Pmode, iter_mode, iter, true); - - /* This assert could be relaxed - in this case we'll need to compute - smallest power of two, containing in PIECE_SIZE_N and pass it to - offset_address. */ - gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0); - destmem = offset_address (destmem, tmp, piece_size_n); - destmem = adjust_address (destmem, mode, 0); - - if (!issetmem) - { - srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n); - srcmem = adjust_address (srcmem, mode, 0); - - /* When unrolling for chips that reorder memory reads and writes, - we can save registers by using single temporary. - Also using 4 temporaries is overkill in 32bit mode. */ - if (!TARGET_64BIT && 0) - { - for (i = 0; i < unroll; i++) - { - if (i) - { - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - srcmem = adjust_address (copy_rtx (srcmem), mode, - GET_MODE_SIZE (mode)); - } - emit_move_insn (destmem, srcmem); - } - } - else - { - rtx tmpreg[4]; - gcc_assert (unroll <= 4); - for (i = 0; i < unroll; i++) - { - tmpreg[i] = gen_reg_rtx (mode); - if (i) - srcmem = adjust_address (copy_rtx (srcmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (tmpreg[i], srcmem); - } - for (i = 0; i < unroll; i++) - { - if (i) - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (destmem, tmpreg[i]); - } - } - } - else - for (i = 0; i < unroll; i++) - { - if (i) - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (destmem, value); - } - - tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, - true, OPTAB_LIB_WIDEN); - if (tmp != iter) - emit_move_insn (iter, tmp); - - emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, - true, top_label); - if (expected_size != -1) - { - expected_size /= GET_MODE_SIZE (mode) * unroll; - if (expected_size == 0) - predict_jump (0); - else if (expected_size > REG_BR_PROB_BASE) - predict_jump (REG_BR_PROB_BASE - 1); - else - predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) - / expected_size); - } - else - predict_jump (REG_BR_PROB_BASE * 80 / 100); - iter = ix86_zero_extend_to_Pmode (iter); - tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, - true, OPTAB_LIB_WIDEN); - if (tmp != destptr) - emit_move_insn (destptr, tmp); - if (!issetmem) - { - tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, - true, OPTAB_LIB_WIDEN); - if (tmp != srcptr) - emit_move_insn (srcptr, tmp); - } - emit_label (out_label); -} - -/* Output "rep; mov" or 
"rep; stos" instruction depending on ISSETMEM argument. - When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. - When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. - For setmem case, VALUE is a promoted to a wider size ORIG_VALUE. - ORIG_VALUE is the original value passed to memset to fill the memory with. - Other arguments have same meaning as for previous function. */ - -static void -expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, rtx orig_value, - rtx count, - machine_mode mode, bool issetmem) -{ - rtx destexp; - rtx srcexp; - rtx countreg; - HOST_WIDE_INT rounded_count; - - /* If possible, it is shorter to use rep movs. - TODO: Maybe it is better to move this logic to decide_alg. */ - if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) - && (!issetmem || orig_value == const0_rtx)) - mode = SImode; - - if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) - destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); - - countreg = ix86_zero_extend_to_Pmode (scale_counter (count, - GET_MODE_SIZE (mode))); - if (mode != QImode) - { - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); - destexp = gen_rtx_PLUS (Pmode, destexp, destptr); - } - else - destexp = gen_rtx_PLUS (Pmode, destptr, countreg); - if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count)) - { - rounded_count - = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); - destmem = shallow_copy_rtx (destmem); - set_mem_size (destmem, rounded_count); - } - else if (MEM_SIZE_KNOWN_P (destmem)) - clear_mem_size (destmem); - - if (issetmem) - { - value = force_reg (mode, gen_lowpart (mode, value)); - emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); - } - else - { - if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) - srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); - if (mode != QImode) - { - srcexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); - srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); - } - else - srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); - if (CONST_INT_P (count)) - { - rounded_count - = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); - srcmem = shallow_copy_rtx (srcmem); - set_mem_size (srcmem, rounded_count); - } - else - { - if (MEM_SIZE_KNOWN_P (srcmem)) - clear_mem_size (srcmem); - } - emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, - destexp, srcexp)); - } -} - -/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to - DESTMEM. - SRC is passed by pointer to be updated on return. - Return value is updated DST. */ -static rtx -emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr, - HOST_WIDE_INT size_to_move) -{ - rtx dst = destmem, src = *srcmem, adjust, tempreg; - enum insn_code code; - machine_mode move_mode; - int piece_size, i; - - /* Find the widest mode in which we could perform moves. - Start with the biggest power of 2 less than SIZE_TO_MOVE and half - it until move of such size is supported. */ - piece_size = 1 << floor_log2 (size_to_move); - while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode) - || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) - { - gcc_assert (piece_size > 1); - piece_size >>= 1; - } - - /* Find the corresponding vector mode with the same size as MOVE_MODE. 
- MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ - if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) - { - int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); - if (!mode_for_vector (word_mode, nunits).exists (&move_mode) - || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) - { - move_mode = word_mode; - piece_size = GET_MODE_SIZE (move_mode); - code = optab_handler (mov_optab, move_mode); - } - } - gcc_assert (code != CODE_FOR_nothing); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); - src = adjust_automodify_address_nv (src, move_mode, srcptr, 0); - - /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ - gcc_assert (size_to_move % piece_size == 0); - adjust = GEN_INT (piece_size); - for (i = 0; i < size_to_move; i += piece_size) - { - /* We move from memory to memory, so we'll need to do it via - a temporary register. */ - tempreg = gen_reg_rtx (move_mode); - emit_insn (GEN_FCN (code) (tempreg, src)); - emit_insn (GEN_FCN (code) (dst, tempreg)); - - emit_move_insn (destptr, - gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); - emit_move_insn (srcptr, - gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust)); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - src = adjust_automodify_address_nv (src, move_mode, srcptr, - piece_size); - } - - /* Update DST and SRC rtx. */ - *srcmem = src; - return dst; -} - -/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ -static void -expand_movmem_epilogue (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx count, int max_size) -{ - rtx src, dest; - if (CONST_INT_P (count)) - { - HOST_WIDE_INT countval = INTVAL (count); - HOST_WIDE_INT epilogue_size = countval % max_size; - int i; - - /* For now MAX_SIZE should be a power of 2. This assert could be - relaxed, but it'll require a bit more complicated epilogue - expanding. */ - gcc_assert ((max_size & (max_size - 1)) == 0); - for (i = max_size; i >= 1; i >>= 1) - { - if (epilogue_size & i) - destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); - } - return; - } - if (max_size > 8) - { - count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), - count, 1, OPTAB_DIRECT); - expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL, - count, QImode, 1, 4, false); - return; - } - - /* When there are stringops, we can cheaply increase dest and src pointers. - Otherwise we save code size by maintaining offset (zero is readily - available from preceding rep operation) and using x86 addressing modes. 
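The constant-count branch of expand_movmem_epilogue above peels the residual count % max_size bytes with one move per set bit of the residue, largest piece first. A stand-alone sketch of that idea (plain C with an explicit offset; the RTL version advances the pointers instead):

    #include <stddef.h>
    #include <string.h>

    /* DST and SRC point just past the part handled by the main loop;
       MAX_SIZE is a power of two, as the assert above requires.  */
    static void
    model_movmem_epilogue (unsigned char *dst, const unsigned char *src,
                           size_t count, size_t max_size)
    {
      size_t residue = count % max_size;
      size_t done = 0;
      for (size_t piece = max_size; piece >= 1; piece >>= 1)
        if (residue & piece)
          {
            memcpy (dst + done, src + done, piece);
            done += piece;
          }
    }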
- */ - if (TARGET_SINGLE_STRINGOP) - { - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - src = change_address (srcmem, SImode, srcptr); - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - src = change_address (srcmem, HImode, srcptr); - dest = change_address (destmem, HImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - src = change_address (srcmem, QImode, srcptr); - dest = change_address (destmem, QImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - } - else - { - rtx offset = force_reg (Pmode, const0_rtx); - rtx tmp; - - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - src = change_address (srcmem, SImode, srcptr); - dest = change_address (destmem, SImode, destptr); - emit_move_insn (dest, src); - tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, - true, OPTAB_LIB_WIDEN); - if (tmp != offset) - emit_move_insn (offset, tmp); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - tmp = gen_rtx_PLUS (Pmode, srcptr, offset); - src = change_address (srcmem, HImode, tmp); - tmp = gen_rtx_PLUS (Pmode, destptr, offset); - dest = change_address (destmem, HImode, tmp); - emit_move_insn (dest, src); - tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, - true, OPTAB_LIB_WIDEN); - if (tmp != offset) - emit_move_insn (offset, tmp); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - tmp = gen_rtx_PLUS (Pmode, srcptr, offset); - src = change_address (srcmem, QImode, tmp); - tmp = gen_rtx_PLUS (Pmode, destptr, offset); - dest = change_address (destmem, QImode, tmp); - emit_move_insn (dest, src); - emit_label (label); - LABEL_NUSES (label) = 1; - } - } -} - -/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM - with value PROMOTED_VAL. - SRC is passed by pointer to be updated on return. - Return value is updated DST. */ -static rtx -emit_memset (rtx destmem, rtx destptr, rtx promoted_val, - HOST_WIDE_INT size_to_move) -{ - rtx dst = destmem, adjust; - enum insn_code code; - machine_mode move_mode; - int piece_size, i; - - /* Find the widest mode in which we could perform moves. - Start with the biggest power of 2 less than SIZE_TO_MOVE and half - it until move of such size is supported. */ - move_mode = GET_MODE (promoted_val); - if (move_mode == VOIDmode) - move_mode = QImode; - if (size_to_move < GET_MODE_SIZE (move_mode)) - { - unsigned int move_bits = size_to_move * BITS_PER_UNIT; - move_mode = int_mode_for_size (move_bits, 0).require (); - promoted_val = gen_lowpart (move_mode, promoted_val); - } - piece_size = GET_MODE_SIZE (move_mode); - code = optab_handler (mov_optab, move_mode); - gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); - - /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. 
*/ - gcc_assert (size_to_move % piece_size == 0); - adjust = GEN_INT (piece_size); - for (i = 0; i < size_to_move; i += piece_size) - { - if (piece_size <= GET_MODE_SIZE (word_mode)) - { - emit_insn (gen_strset (destptr, dst, promoted_val)); - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - continue; - } - - emit_insn (GEN_FCN (code) (dst, promoted_val)); - - emit_move_insn (destptr, - gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - } - - /* Update DST rtx. */ - return dst; -} -/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ -static void -expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, - rtx count, int max_size) -{ - count = expand_simple_binop (counter_mode (count), AND, count, - GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); - expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL, - gen_lowpart (QImode, value), count, QImode, - 1, max_size / 2, true); -} - -/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ -static void -expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, - rtx count, int max_size) -{ - rtx dest; - - if (CONST_INT_P (count)) - { - HOST_WIDE_INT countval = INTVAL (count); - HOST_WIDE_INT epilogue_size = countval % max_size; - int i; - - /* For now MAX_SIZE should be a power of 2. This assert could be - relaxed, but it'll require a bit more complicated epilogue - expanding. */ - gcc_assert ((max_size & (max_size - 1)) == 0); - for (i = max_size; i >= 1; i >>= 1) - { - if (epilogue_size & i) - { - if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) - destmem = emit_memset (destmem, destptr, vec_value, i); - else - destmem = emit_memset (destmem, destptr, value, i); - } - } - return; - } - if (max_size > 32) - { - expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); - return; - } - if (max_size > 16) - { - rtx_code_label *label = ix86_expand_aligntest (count, 16, true); - if (TARGET_64BIT) - { - dest = change_address (destmem, DImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, DImode, destptr, 8); - emit_insn (gen_strset (destptr, dest, value)); - } - else - { - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 8); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 12); - emit_insn (gen_strset (destptr, dest, value)); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 8) - { - rtx_code_label *label = ix86_expand_aligntest (count, 8, true); - if (TARGET_64BIT) - { - dest = change_address (destmem, DImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - } - else - { - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); - emit_insn (gen_strset (destptr, dest, value)); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); - 
emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - dest = change_address (destmem, HImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - dest = change_address (destmem, QImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); - emit_label (label); - LABEL_NUSES (label) = 1; - } -} - -/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to - DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. - Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are - ignored. - Return value is updated DESTMEM. */ -static rtx -expand_set_or_movmem_prologue (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, - rtx vec_value, rtx count, int align, - int desired_alignment, bool issetmem) -{ - int i; - for (i = 1; i < desired_alignment; i <<= 1) - { - if (align <= i) - { - rtx_code_label *label = ix86_expand_aligntest (destptr, i, false); - if (issetmem) - { - if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) - destmem = emit_memset (destmem, destptr, vec_value, i); - else - destmem = emit_memset (destmem, destptr, value, i); - } - else - destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); - ix86_adjust_counter (count, i); - emit_label (label); - LABEL_NUSES (label) = 1; - set_mem_align (destmem, i * 2 * BITS_PER_UNIT); - } - } - return destmem; -} - -/* Test if COUNT&SIZE is nonzero and if so, expand movme - or setmem sequence that is valid for SIZE..2*SIZE-1 bytes - and jump to DONE_LABEL. */ -static void -expand_small_movmem_or_setmem (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, - rtx value, rtx vec_value, - rtx count, int size, - rtx done_label, bool issetmem) -{ - rtx_code_label *label = ix86_expand_aligntest (count, size, false); - machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk (); - rtx modesize; - int n; - - /* If we do not have vector value to copy, we must reduce size. */ - if (issetmem) - { - if (!vec_value) - { - if (GET_MODE (value) == VOIDmode && size > 8) - mode = Pmode; - else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value))) - mode = GET_MODE (value); - } - else - mode = GET_MODE (vec_value), value = vec_value; - } - else - { - /* Choose appropriate vector mode. */ - if (size >= 32) - mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode; - else if (size >= 16) - mode = TARGET_SSE ? 
V16QImode : DImode; - srcmem = change_address (srcmem, mode, srcptr); - } - destmem = change_address (destmem, mode, destptr); - modesize = GEN_INT (GET_MODE_SIZE (mode)); - gcc_assert (GET_MODE_SIZE (mode) <= size); - for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) - { - if (issetmem) - emit_move_insn (destmem, gen_lowpart (mode, value)); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - } - - destmem = offset_address (destmem, count, 1); - destmem = offset_address (destmem, GEN_INT (-2 * size), - GET_MODE_SIZE (mode)); - if (!issetmem) - { - srcmem = offset_address (srcmem, count, 1); - srcmem = offset_address (srcmem, GEN_INT (-2 * size), - GET_MODE_SIZE (mode)); - } - for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) - { - if (issetmem) - emit_move_insn (destmem, gen_lowpart (mode, value)); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - } - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - - emit_label (label); - LABEL_NUSES (label) = 1; -} - -/* Handle small memcpy (up to SIZE that is supposed to be small power of 2. - and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN - bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can - proceed with an loop copying SIZE bytes at once. Do moves in MODE. - DONE_LABEL is a label after the whole copying sequence. The label is created - on demand if *DONE_LABEL is NULL. - MIN_SIZE is minimal size of block copied. This value gets adjusted for new - bounds after the initial copies. - - DESTMEM/SRCMEM are memory expressions pointing to the copies block, - DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether - we will dispatch to a library call for large blocks. - - In pseudocode we do: - - if (COUNT < SIZE) - { - Assume that SIZE is 4. Bigger sizes are handled analogously - if (COUNT & 4) - { - copy 4 bytes from SRCPTR to DESTPTR - copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4 - goto done_label - } - if (!COUNT) - goto done_label; - copy 1 byte from SRCPTR to DESTPTR - if (COUNT & 2) - { - copy 2 bytes from SRCPTR to DESTPTR - copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2 - } - } - else - { - copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR - copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE - - OLD_DESPTR = DESTPTR; - Align DESTPTR up to DESIRED_ALIGN - SRCPTR += DESTPTR - OLD_DESTPTR - COUNT -= DEST_PTR - OLD_DESTPTR - if (DYNAMIC_CHECK) - Round COUNT down to multiple of SIZE - << optional caller supplied zero size guard is here >> - << optional caller supplied dynamic check is here >> - << caller supplied main copy loop is here >> - } - done_label: - */ -static void -expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem, - rtx *destptr, rtx *srcptr, - machine_mode mode, - rtx value, rtx vec_value, - rtx *count, - rtx_code_label **done_label, - int size, - int desired_align, - int align, - unsigned HOST_WIDE_INT *min_size, - bool dynamic_check, - bool issetmem) -{ - rtx_code_label *loop_label = NULL, *label; - int n; - rtx modesize; - int prolog_size = 0; - rtx mode_value; - - /* Chose proper value to copy. 
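The small-block path in the pseudocode above (implemented by expand_small_movmem_or_setmem) covers a block of SIZE..2*SIZE-1 bytes with two possibly overlapping SIZE-byte accesses, one anchored at each end of the block. A minimal C illustration of the memcpy case, assuming size <= count < 2*size:

    #include <stddef.h>
    #include <string.h>

    /* Two SIZE-byte moves that may overlap in the middle cover the whole
       block without a loop and without knowing COUNT exactly.  */
    static void
    model_copy_head_and_tail (unsigned char *dst, const unsigned char *src,
                              size_t count, size_t size)
    {
      memcpy (dst, src, size);                                /* head */
      memcpy (dst + count - size, src + count - size, size);  /* tail */
    }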
*/ - if (issetmem && VECTOR_MODE_P (mode)) - mode_value = vec_value; - else - mode_value = value; - gcc_assert (GET_MODE_SIZE (mode) <= size); - - /* See if block is big or small, handle small blocks. */ - if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size) - { - int size2 = size; - loop_label = gen_label_rtx (); - - if (!*done_label) - *done_label = gen_label_rtx (); - - emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count), - 1, loop_label); - size2 >>= 1; - - /* Handle sizes > 3. */ - for (;size2 > 2; size2 >>= 1) - expand_small_movmem_or_setmem (destmem, srcmem, - *destptr, *srcptr, - value, vec_value, - *count, - size2, *done_label, issetmem); - /* Nothing to copy? Jump to DONE_LABEL if so */ - emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count), - 1, *done_label); - - /* Do a byte copy. */ - destmem = change_address (destmem, QImode, *destptr); - if (issetmem) - emit_move_insn (destmem, gen_lowpart (QImode, value)); - else - { - srcmem = change_address (srcmem, QImode, *srcptr); - emit_move_insn (destmem, srcmem); - } - - /* Handle sizes 2 and 3. */ - label = ix86_expand_aligntest (*count, 2, false); - destmem = change_address (destmem, HImode, *destptr); - destmem = offset_address (destmem, *count, 1); - destmem = offset_address (destmem, GEN_INT (-2), 2); - if (issetmem) - emit_move_insn (destmem, gen_lowpart (HImode, value)); - else - { - srcmem = change_address (srcmem, HImode, *srcptr); - srcmem = offset_address (srcmem, *count, 1); - srcmem = offset_address (srcmem, GEN_INT (-2), 2); - emit_move_insn (destmem, srcmem); - } - - emit_label (label); - LABEL_NUSES (label) = 1; - emit_jump_insn (gen_jump (*done_label)); - emit_barrier (); - } - else - gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size - || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size); - - /* Start memcpy for COUNT >= SIZE. */ - if (loop_label) - { - emit_label (loop_label); - LABEL_NUSES (loop_label) = 1; - } - - /* Copy first desired_align bytes. */ - if (!issetmem) - srcmem = change_address (srcmem, mode, *srcptr); - destmem = change_address (destmem, mode, *destptr); - modesize = GEN_INT (GET_MODE_SIZE (mode)); - for (n = 0; prolog_size < desired_align - align; n++) - { - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - prolog_size += GET_MODE_SIZE (mode); - } - - - /* Copy last SIZE bytes. */ - destmem = offset_address (destmem, *count, 1); - destmem = offset_address (destmem, - GEN_INT (-size - prolog_size), - 1); - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - srcmem = offset_address (srcmem, *count, 1); - srcmem = offset_address (srcmem, - GEN_INT (-size - prolog_size), - 1); - emit_move_insn (destmem, srcmem); - } - for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) - { - destmem = offset_address (destmem, modesize, 1); - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - srcmem = offset_address (srcmem, modesize, 1); - emit_move_insn (destmem, srcmem); - } - } - - /* Align destination. */ - if (desired_align > 1 && desired_align > align) - { - rtx saveddest = *destptr; - - gcc_assert (desired_align <= size); - /* Align destptr up, place it to new register. 
*/ - *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr, - GEN_INT (prolog_size), - NULL_RTX, 1, OPTAB_DIRECT); - if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest)) - REG_POINTER (*destptr) = 1; - *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr, - GEN_INT (-desired_align), - *destptr, 1, OPTAB_DIRECT); - /* See how many bytes we skipped. */ - saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest, - *destptr, - saveddest, 1, OPTAB_DIRECT); - /* Adjust srcptr and count. */ - if (!issetmem) - *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, - saveddest, *srcptr, 1, OPTAB_DIRECT); - *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, - saveddest, *count, 1, OPTAB_DIRECT); - /* We copied at most size + prolog_size. */ - if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size)) - *min_size - = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size); - else - *min_size = 0; - - /* Our loops always round down the block size, but for dispatch to - library we need precise value. */ - if (dynamic_check) - *count = expand_simple_binop (GET_MODE (*count), AND, *count, - GEN_INT (-size), *count, 1, OPTAB_DIRECT); - } - else - { - gcc_assert (prolog_size == 0); - /* Decrease count, so we won't end up copying last word twice. */ - if (!CONST_INT_P (*count)) - *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, - constm1_rtx, *count, 1, OPTAB_DIRECT); - else - *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1, - (unsigned HOST_WIDE_INT)size)); - if (*min_size) - *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size); - } -} - - -/* This function is like the previous one, except here we know how many bytes - need to be copied. That allows us to update alignment not only of DST, which - is returned, but also of SRC, which is passed as a pointer for that - reason. 
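The alignment step above rounds DESTPTR to DESIRED_ALIGN once the prologue bytes are written, then moves SRCPTR forward and shrinks COUNT by however many bytes were skipped. A simplified pointer-arithmetic sketch of that bookkeeping (plain C; it ignores the min_size and dynamic-check adjustments and rounds up directly rather than adding the prologue size first):

    #include <stddef.h>
    #include <stdint.h>

    /* DESIRED_ALIGN is a power of two.  */
    static void
    model_align_after_prologue (uintptr_t *dst, uintptr_t *src,
                                size_t *count, size_t desired_align)
    {
      uintptr_t old_dst = *dst;
      *dst = (*dst + desired_align - 1) & ~(uintptr_t) (desired_align - 1);
      size_t skipped = (size_t) (*dst - old_dst);
      *src += skipped;     /* source advances by the same amount      */
      *count -= skipped;   /* and that much less remains to be copied */
    }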
*/ -static rtx -expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg, - rtx srcreg, rtx value, rtx vec_value, - int desired_align, int align_bytes, - bool issetmem) -{ - rtx src = NULL; - rtx orig_dst = dst; - rtx orig_src = NULL; - int piece_size = 1; - int copied_bytes = 0; - - if (!issetmem) - { - gcc_assert (srcp != NULL); - src = *srcp; - orig_src = src; - } - - for (piece_size = 1; - piece_size <= desired_align && copied_bytes < align_bytes; - piece_size <<= 1) - { - if (align_bytes & piece_size) - { - if (issetmem) - { - if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) - dst = emit_memset (dst, destreg, vec_value, piece_size); - else - dst = emit_memset (dst, destreg, value, piece_size); - } - else - dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); - copied_bytes += piece_size; - } - } - if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) - set_mem_align (dst, desired_align * BITS_PER_UNIT); - if (MEM_SIZE_KNOWN_P (orig_dst)) - set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); - - if (!issetmem) - { - int src_align_bytes = get_mem_align_offset (src, desired_align - * BITS_PER_UNIT); - if (src_align_bytes >= 0) - src_align_bytes = desired_align - src_align_bytes; - if (src_align_bytes >= 0) - { - unsigned int src_align; - for (src_align = desired_align; src_align >= 2; src_align >>= 1) - { - if ((src_align_bytes & (src_align - 1)) - == (align_bytes & (src_align - 1))) - break; - } - if (src_align > (unsigned int) desired_align) - src_align = desired_align; - if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) - set_mem_align (src, src_align * BITS_PER_UNIT); - } - if (MEM_SIZE_KNOWN_P (orig_src)) - set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); - *srcp = src; - } - - return dst; -} - -/* Return true if ALG can be used in current context. - Assume we expand memset if MEMSET is true. */ -static bool -alg_usable_p (enum stringop_alg alg, bool memset, bool have_as) -{ - if (alg == no_stringop) - return false; - if (alg == vector_loop) - return TARGET_SSE || TARGET_AVX; - /* Algorithms using the rep prefix want at least edi and ecx; - additionally, memset wants eax and memcpy wants esi. Don't - consider such algorithms if the user has appropriated those - registers for their own purposes, or if we have a non-default - address space, since some string insns cannot override the segment. */ - if (alg == rep_prefix_1_byte - || alg == rep_prefix_4_byte - || alg == rep_prefix_8_byte) - { - if (have_as) - return false; - if (fixed_regs[CX_REG] - || fixed_regs[DI_REG] - || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])) - return false; - } - return true; -} - -/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ -static enum stringop_alg -decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, - unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, - bool memset, bool zero_memset, bool have_as, - int *dynamic_check, bool *noalign, bool recur) -{ - const struct stringop_algs *algs; - bool optimize_for_speed; - int max = 0; - const struct processor_costs *cost; - int i; - bool any_alg_usable_p = false; - - *noalign = false; - *dynamic_check = -1; - - /* Even if the string operation call is cold, we still might spend a lot - of time processing large blocks. 
*/ - if (optimize_function_for_size_p (cfun) - || (optimize_insn_for_size_p () - && (max_size < 256 - || (expected_size != -1 && expected_size < 256)))) - optimize_for_speed = false; - else - optimize_for_speed = true; - - cost = optimize_for_speed ? ix86_cost : &ix86_size_cost; - if (memset) - algs = &cost->memset[TARGET_64BIT != 0]; - else - algs = &cost->memcpy[TARGET_64BIT != 0]; - - /* See maximal size for user defined algorithm. */ - for (i = 0; i < MAX_STRINGOP_ALGS; i++) - { - enum stringop_alg candidate = algs->size[i].alg; - bool usable = alg_usable_p (candidate, memset, have_as); - any_alg_usable_p |= usable; - - if (candidate != libcall && candidate && usable) - max = algs->size[i].max; - } - - /* If expected size is not known but max size is small enough - so inline version is a win, set expected size into - the range. */ - if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1) - && expected_size == -1) - expected_size = min_size / 2 + max_size / 2; - - /* If user specified the algorithm, honor it if possible. */ - if (ix86_stringop_alg != no_stringop - && alg_usable_p (ix86_stringop_alg, memset, have_as)) - return ix86_stringop_alg; - /* rep; movq or rep; movl is the smallest variant. */ - else if (!optimize_for_speed) - { - *noalign = true; - if (!count || (count & 3) || (memset && !zero_memset)) - return alg_usable_p (rep_prefix_1_byte, memset, have_as) - ? rep_prefix_1_byte : loop_1_byte; - else - return alg_usable_p (rep_prefix_4_byte, memset, have_as) - ? rep_prefix_4_byte : loop; - } - /* Very tiny blocks are best handled via the loop, REP is expensive to - setup. */ - else if (expected_size != -1 && expected_size < 4) - return loop_1_byte; - else if (expected_size != -1) - { - enum stringop_alg alg = libcall; - bool alg_noalign = false; - for (i = 0; i < MAX_STRINGOP_ALGS; i++) - { - /* We get here if the algorithms that were not libcall-based - were rep-prefix based and we are unable to use rep prefixes - based on global register usage. Break out of the loop and - use the heuristic below. */ - if (algs->size[i].max == 0) - break; - if (algs->size[i].max >= expected_size || algs->size[i].max == -1) - { - enum stringop_alg candidate = algs->size[i].alg; - - if (candidate != libcall - && alg_usable_p (candidate, memset, have_as)) - { - alg = candidate; - alg_noalign = algs->size[i].noalign; - } - /* Honor TARGET_INLINE_ALL_STRINGOPS by picking - last non-libcall inline algorithm. */ - if (TARGET_INLINE_ALL_STRINGOPS) - { - /* When the current size is best to be copied by a libcall, - but we are still forced to inline, run the heuristic below - that will pick code for medium sized blocks. */ - if (alg != libcall) - { - *noalign = alg_noalign; - return alg; - } - else if (!any_alg_usable_p) - break; - } - else if (alg_usable_p (candidate, memset, have_as)) - { - *noalign = algs->size[i].noalign; - return candidate; - } - } - } - } - /* When asked to inline the call anyway, try to pick meaningful choice. - We look for maximal size of block that is faster to copy by hand and - take blocks of at most of that size guessing that average size will - be roughly half of the block. - - If this turns out to be bad, we might simply specify the preferred - choice in ix86_costs. */ - if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) - && (algs->unknown_size == libcall - || !alg_usable_p (algs->unknown_size, memset, have_as))) - { - enum stringop_alg alg; - HOST_WIDE_INT new_expected_size = (max > 0 ? 
max : 4096) / 2; - - /* If there aren't any usable algorithms or if recursing already, - then recursing on smaller sizes or same size isn't going to - find anything. Just return the simple byte-at-a-time copy loop. */ - if (!any_alg_usable_p || recur) - { - /* Pick something reasonable. */ - if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur) - *dynamic_check = 128; - return loop_1_byte; - } - alg = decide_alg (count, new_expected_size, min_size, max_size, memset, - zero_memset, have_as, dynamic_check, noalign, true); - gcc_assert (*dynamic_check == -1); - if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) - *dynamic_check = max; - else - gcc_assert (alg != libcall); - return alg; - } - return (alg_usable_p (algs->unknown_size, memset, have_as) - ? algs->unknown_size : libcall); -} - -/* Decide on alignment. We know that the operand is already aligned to ALIGN - (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ -static int -decide_alignment (int align, - enum stringop_alg alg, - int expected_size, - machine_mode move_mode) -{ - int desired_align = 0; - - gcc_assert (alg != no_stringop); - - if (alg == libcall) - return 0; - if (move_mode == VOIDmode) - return 0; - - desired_align = GET_MODE_SIZE (move_mode); - /* PentiumPro has special logic triggering for 8 byte aligned blocks. - copying whole cacheline at once. */ - if (TARGET_PENTIUMPRO - && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) - desired_align = 8; - - if (optimize_size) - desired_align = 1; - if (desired_align < align) - desired_align = align; - if (expected_size != -1 && expected_size < 4) - desired_align = align; - - return desired_align; -} - - -/* Helper function for memcpy. For QImode value 0xXY produce - 0xXYXYXYXY of wide specified by MODE. This is essentially - a * 0x10101010, but we can do slightly better than - synth_mult by unwinding the sequence by hand on CPUs with - slow multiply. */ -static rtx -promote_duplicated_reg (machine_mode mode, rtx val) -{ - machine_mode valmode = GET_MODE (val); - rtx tmp; - int nops = mode == DImode ? 3 : 2; - - gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); - if (val == const0_rtx) - return copy_to_mode_reg (mode, CONST0_RTX (mode)); - if (CONST_INT_P (val)) - { - HOST_WIDE_INT v = INTVAL (val) & 255; - - v |= v << 8; - v |= v << 16; - if (mode == DImode) - v |= (v << 16) << 16; - return copy_to_mode_reg (mode, gen_int_mode (v, mode)); - } - - if (valmode == VOIDmode) - valmode = QImode; - if (valmode != QImode) - val = gen_lowpart (QImode, val); - if (mode == QImode) - return val; - if (!TARGET_PARTIAL_REG_STALL) - nops--; - if (ix86_cost->mult_init[mode == DImode ? 3 : 2] - + ix86_cost->mult_bit * (mode == DImode ? 
8 : 4) - <= (ix86_cost->shift_const + ix86_cost->add) * nops - + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) - { - rtx reg = convert_modes (mode, QImode, val, true); - tmp = promote_duplicated_reg (mode, const1_rtx); - return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, - OPTAB_DIRECT); - } - else - { - rtx reg = convert_modes (mode, QImode, val, true); - - if (!TARGET_PARTIAL_REG_STALL) - if (mode == SImode) - emit_insn (gen_insvsi_1 (reg, reg)); - else - emit_insn (gen_insvdi_1 (reg, reg)); - else - { - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, - OPTAB_DIRECT); - } - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); - if (mode == SImode) - return reg; - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); - return reg; - } -} - -/* Duplicate value VAL using promote_duplicated_reg into maximal size that will - be needed by main loop copying SIZE_NEEDED chunks and prologue getting - alignment from ALIGN to DESIRED_ALIGN. */ -static rtx -promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, - int align) -{ - rtx promoted_val; - - if (TARGET_64BIT - && (size_needed > 4 || (desired_align > align && desired_align > 4))) - promoted_val = promote_duplicated_reg (DImode, val); - else if (size_needed > 2 || (desired_align > align && desired_align > 2)) - promoted_val = promote_duplicated_reg (SImode, val); - else if (size_needed > 1 || (desired_align > align && desired_align > 1)) - promoted_val = promote_duplicated_reg (HImode, val); - else - promoted_val = val; - - return promoted_val; -} - -/* Expand string move (memcpy) ot store (memset) operation. Use i386 string - operations when profitable. The code depends upon architecture, block size - and alignment, but always has one of the following overall structures: - - Aligned move sequence: - - 1) Prologue guard: Conditional that jumps up to epilogues for small - blocks that can be handled by epilogue alone. This is faster - but also needed for correctness, since prologue assume the block - is larger than the desired alignment. - - Optional dynamic check for size and libcall for large - blocks is emitted here too, with -minline-stringops-dynamically. - - 2) Prologue: copy first few bytes in order to get destination - aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less - than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be - copied. We emit either a jump tree on power of two sized - blocks, or a byte loop. - - 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks - with specified algorithm. - - 4) Epilogue: code copying tail of the block that is too small to be - handled by main body (or up to size guarded by prologue guard). 
- - Misaligned move sequence - - 1) missaligned move prologue/epilogue containing: - a) Prologue handling small memory blocks and jumping to done_label - (skipped if blocks are known to be large enough) - b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is - needed by single possibly misaligned move - (skipped if alignment is not needed) - c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves - - 2) Zero size guard dispatching to done_label, if needed - - 3) dispatch to library call, if needed, - - 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks - with specified algorithm. */ -bool -ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp, - rtx align_exp, rtx expected_align_exp, - rtx expected_size_exp, rtx min_size_exp, - rtx max_size_exp, rtx probable_max_size_exp, - bool issetmem) -{ - rtx destreg; - rtx srcreg = NULL; - rtx_code_label *label = NULL; - rtx tmp; - rtx_code_label *jump_around_label = NULL; - HOST_WIDE_INT align = 1; - unsigned HOST_WIDE_INT count = 0; - HOST_WIDE_INT expected_size = -1; - int size_needed = 0, epilogue_size_needed; - int desired_align = 0, align_bytes = 0; - enum stringop_alg alg; - rtx promoted_val = NULL; - rtx vec_promoted_val = NULL; - bool force_loopy_epilogue = false; - int dynamic_check; - bool need_zero_guard = false; - bool noalign; - machine_mode move_mode = VOIDmode; - machine_mode wider_mode; - int unroll_factor = 1; - /* TODO: Once value ranges are available, fill in proper data. */ - unsigned HOST_WIDE_INT min_size = 0; - unsigned HOST_WIDE_INT max_size = -1; - unsigned HOST_WIDE_INT probable_max_size = -1; - bool misaligned_prologue_used = false; - bool have_as; - - if (CONST_INT_P (align_exp)) - align = INTVAL (align_exp); - /* i386 can do misaligned access on reasonably increased cost. */ - if (CONST_INT_P (expected_align_exp) - && INTVAL (expected_align_exp) > align) - align = INTVAL (expected_align_exp); - /* ALIGN is the minimum of destination and source alignment, but we care here - just about destination alignment. */ - else if (!issetmem - && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) - align = MEM_ALIGN (dst) / BITS_PER_UNIT; - - if (CONST_INT_P (count_exp)) - { - min_size = max_size = probable_max_size = count = expected_size - = INTVAL (count_exp); - /* When COUNT is 0, there is nothing to do. */ - if (!count) - return true; - } - else - { - if (min_size_exp) - min_size = INTVAL (min_size_exp); - if (max_size_exp) - max_size = INTVAL (max_size_exp); - if (probable_max_size_exp) - probable_max_size = INTVAL (probable_max_size_exp); - if (CONST_INT_P (expected_size_exp)) - expected_size = INTVAL (expected_size_exp); - } - - /* Make sure we don't need to care about overflow later on. */ - if (count > (HOST_WIDE_INT_1U << 30)) - return false; - - have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst)); - if (!issetmem) - have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)); - - /* Step 0: Decide on preferred algorithm, desired alignment and - size of chunks to be copied by main loop. 
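Before the expander body, the aligned sequence described above can be sketched as ordinary C: a guard sends small blocks straight to the epilogue, the prologue aligns the destination, the main body copies whole chunks, and the epilogue finishes the tail. This is only a shape sketch under the assumption that CHUNK (standing in for SIZE_NEEDED) is a power of two; it is not what GCC emits:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void
    model_aligned_sequence (unsigned char *dst, const unsigned char *src,
                            size_t count, size_t chunk)
    {
      if (count >= chunk)                       /* prologue guard */
        {
          /* Prologue: bring dst up to CHUNK alignment.  */
          size_t head = (size_t) (-(uintptr_t) dst & (chunk - 1));
          memcpy (dst, src, head);
          dst += head; src += head; count -= head;

          /* Main body: whole CHUNK-sized pieces.  */
          for (; count >= chunk; dst += chunk, src += chunk, count -= chunk)
            memcpy (dst, src, chunk);
        }
      /* Epilogue: the remaining count < chunk bytes, or the whole block
         when the guard was taken.  */
      memcpy (dst, src, count);
    }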
*/ - alg = decide_alg (count, expected_size, min_size, probable_max_size, - issetmem, - issetmem && val_exp == const0_rtx, have_as, - &dynamic_check, &noalign, false); - - if (dump_file) - fprintf (dump_file, "Selected stringop expansion strategy: %s\n", - stringop_alg_names[alg]); - - if (alg == libcall) - return false; - gcc_assert (alg != no_stringop); - - /* For now vector-version of memset is generated only for memory zeroing, as - creating of promoted vector value is very cheap in this case. */ - if (issetmem && alg == vector_loop && val_exp != const0_rtx) - alg = unrolled_loop; - - if (!count) - count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); - destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); - if (!issetmem) - srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); - - unroll_factor = 1; - move_mode = word_mode; - switch (alg) - { - case libcall: - case no_stringop: - case last_alg: - gcc_unreachable (); - case loop_1_byte: - need_zero_guard = true; - move_mode = QImode; - break; - case loop: - need_zero_guard = true; - break; - case unrolled_loop: - need_zero_guard = true; - unroll_factor = (TARGET_64BIT ? 4 : 2); - break; - case vector_loop: - need_zero_guard = true; - unroll_factor = 4; - /* Find the widest supported mode. */ - move_mode = word_mode; - while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) - && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) - move_mode = wider_mode; - - if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128) - move_mode = TImode; - - /* Find the corresponding vector mode with the same size as MOVE_MODE. - MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ - if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) - { - int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); - if (!mode_for_vector (word_mode, nunits).exists (&move_mode) - || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) - move_mode = word_mode; - } - gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); - break; - case rep_prefix_8_byte: - move_mode = DImode; - break; - case rep_prefix_4_byte: - move_mode = SImode; - break; - case rep_prefix_1_byte: - move_mode = QImode; - break; - } - size_needed = GET_MODE_SIZE (move_mode) * unroll_factor; - epilogue_size_needed = size_needed; - - /* If we are going to call any library calls conditionally, make sure any - pending stack adjustment happen before the first conditional branch, - otherwise they will be emitted before the library call only and won't - happen from the other branches. */ - if (dynamic_check != -1) - do_pending_stack_adjust (); - - desired_align = decide_alignment (align, alg, expected_size, move_mode); - if (!TARGET_ALIGN_STRINGOPS || noalign) - align = desired_align; - - /* Step 1: Prologue guard. */ - - /* Alignment code needs count to be in register. */ - if (CONST_INT_P (count_exp) && desired_align > align) - { - if (INTVAL (count_exp) > desired_align - && INTVAL (count_exp) > size_needed) - { - align_bytes - = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); - if (align_bytes <= 0) - align_bytes = 0; - else - align_bytes = desired_align - align_bytes; - } - if (align_bytes == 0) - count_exp = force_reg (counter_mode (count_exp), count_exp); - } - gcc_assert (desired_align >= 1 && align >= 1); - - /* Misaligned move sequences handle both prologue and epilogue at once. - Default code generation results in a smaller code for large alignments - and also avoids redundant job when sizes are known precisely. 
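For the memset path, the promoted fill value built by promote_duplicated_reg further up is simply the fill byte replicated across every byte lane of the word. The constant case of that broadcast, in plain C:

    #include <stdint.h>

    /* Same effect as the v |= v << 8; v |= v << 16; ... sequence in
       promote_duplicated_reg for a constant fill byte.  */
    static uint64_t
    model_broadcast_byte (uint8_t byte)
    {
      uint64_t v = byte;
      v |= v << 8;
      v |= v << 16;
      v |= v << 32;
      return v;
    }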
*/ - misaligned_prologue_used - = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES - && MAX (desired_align, epilogue_size_needed) <= 32 - && desired_align <= epilogue_size_needed - && ((desired_align > align && !align_bytes) - || (!count && epilogue_size_needed > 1))); - - /* Do the cheap promotion to allow better CSE across the - main loop and epilogue (ie one load of the big constant in the - front of all code. - For now the misaligned move sequences do not have fast path - without broadcasting. */ - if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used))) - { - if (alg == vector_loop) - { - gcc_assert (val_exp == const0_rtx); - vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); - promoted_val = promote_duplicated_reg_to_size (val_exp, - GET_MODE_SIZE (word_mode), - desired_align, align); - } - else - { - promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, - desired_align, align); - } - } - /* Misaligned move sequences handles both prologues and epilogues at once. - Default code generation results in smaller code for large alignments and - also avoids redundant job when sizes are known precisely. */ - if (misaligned_prologue_used) - { - /* Misaligned move prologue handled small blocks by itself. */ - expand_set_or_movmem_prologue_epilogue_by_misaligned_moves - (dst, src, &destreg, &srcreg, - move_mode, promoted_val, vec_promoted_val, - &count_exp, - &jump_around_label, - desired_align < align - ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed, - desired_align, align, &min_size, dynamic_check, issetmem); - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = change_address (dst, BLKmode, destreg); - set_mem_align (dst, desired_align * BITS_PER_UNIT); - epilogue_size_needed = 0; - if (need_zero_guard - && min_size < (unsigned HOST_WIDE_INT) size_needed) - { - /* It is possible that we copied enough so the main loop will not - execute. */ - gcc_assert (size_needed > 1); - if (jump_around_label == NULL_RTX) - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (size_needed), - LTU, 0, counter_mode (count_exp), 1, jump_around_label); - if (expected_size == -1 - || expected_size < (desired_align - align) / 2 + size_needed) - predict_jump (REG_BR_PROB_BASE * 20 / 100); - else - predict_jump (REG_BR_PROB_BASE * 60 / 100); - } - } - /* Ensure that alignment prologue won't copy past end of block. */ - else if (size_needed > 1 || (desired_align > 1 && desired_align > align)) - { - epilogue_size_needed = MAX (size_needed - 1, desired_align - align); - /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. - Make sure it is power of 2. */ - epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); - - /* To improve performance of small blocks, we jump around the VAL - promoting mode. This mean that if the promoted VAL is not constant, - we might not use it in the epilogue and have to use byte - loop variant. */ - if (issetmem && epilogue_size_needed > 2 && !promoted_val) - force_loopy_epilogue = true; - if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed) - || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) - { - /* If main algorithm works on QImode, no epilogue is needed. - For small sizes just don't align anything. 
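The epilogue size chosen above is the smallest power of two strictly greater than the number of bytes the epilogue may have to handle, which is what 1 << (floor_log2 (x) + 1) computes. A tiny check of that identity, assuming x >= 1:

    /* Smallest power of two strictly greater than X; equals
       1 << (floor_log2 (x) + 1) for x >= 1.  */
    static int
    model_next_pow2_above (int x)
    {
      int p = 1;
      while (p <= x)
        p <<= 1;
      return p;
    }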
*/ - if (size_needed == 1) - desired_align = align; - else - goto epilogue; - } - else if (!count - && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) - { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (epilogue_size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (expected_size == -1 || expected_size < epilogue_size_needed) - predict_jump (REG_BR_PROB_BASE * 60 / 100); - else - predict_jump (REG_BR_PROB_BASE * 20 / 100); - } - } - - /* Emit code to decide on runtime whether library call or inline should be - used. */ - if (dynamic_check != -1) - { - if (!issetmem && CONST_INT_P (count_exp)) - { - if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) - { - emit_block_copy_via_libcall (dst, src, count_exp); - count_exp = const0_rtx; - goto epilogue; - } - } - else - { - rtx_code_label *hot_label = gen_label_rtx (); - if (jump_around_label == NULL_RTX) - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), - LEU, 0, counter_mode (count_exp), - 1, hot_label); - predict_jump (REG_BR_PROB_BASE * 90 / 100); - if (issetmem) - set_storage_via_libcall (dst, count_exp, val_exp); - else - emit_block_copy_via_libcall (dst, src, count_exp); - emit_jump (jump_around_label); - emit_label (hot_label); - } - } - - /* Step 2: Alignment prologue. */ - /* Do the expensive promotion once we branched off the small blocks. */ - if (issetmem && !promoted_val) - promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, - desired_align, align); - - if (desired_align > align && !misaligned_prologue_used) - { - if (align_bytes == 0) - { - /* Except for the first move in prologue, we no longer know - constant offset in aliasing info. It don't seems to worth - the pain to maintain it for the first move, so throw away - the info early. */ - dst = change_address (dst, BLKmode, destreg); - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg, - promoted_val, vec_promoted_val, - count_exp, align, desired_align, - issetmem); - /* At most desired_align - align bytes are copied. */ - if (min_size < (unsigned)(desired_align - align)) - min_size = 0; - else - min_size -= desired_align - align; - } - else - { - /* If we know how many bytes need to be stored before dst is - sufficiently aligned, maintain aliasing info accurately. */ - dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg, - srcreg, - promoted_val, - vec_promoted_val, - desired_align, - align_bytes, - issetmem); - - count_exp = plus_constant (counter_mode (count_exp), - count_exp, -align_bytes); - count -= align_bytes; - min_size -= align_bytes; - max_size -= align_bytes; - } - if (need_zero_guard - && min_size < (unsigned HOST_WIDE_INT) size_needed - && (count < (unsigned HOST_WIDE_INT) size_needed - || (align_bytes == 0 - && count < ((unsigned HOST_WIDE_INT) size_needed - + desired_align - align)))) - { - /* It is possible that we copied enough so the main loop will not - execute. 
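With -minline-stringops-dynamically, the dynamic check above splits the work at a size threshold: blocks below it are expanded inline, larger ones dispatch to the library call. A minimal sketch of that dispatch; copy_inline here is a hypothetical stand-in for whichever inline expansion was selected, not a GCC function:

    #include <stddef.h>
    #include <string.h>

    /* Hypothetical stand-in for the selected inline expansion.  */
    static void
    copy_inline (unsigned char *dst, const unsigned char *src, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        dst[i] = src[i];
    }

    static void
    model_dynamic_dispatch (unsigned char *dst, const unsigned char *src,
                            size_t count, size_t threshold)
    {
      if (count < threshold)
        copy_inline (dst, src, count);   /* inline expansion */
      else
        memcpy (dst, src, count);        /* library call     */
    }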
*/ - gcc_assert (size_needed > 1); - if (label == NULL_RTX) - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (expected_size == -1 - || expected_size < (desired_align - align) / 2 + size_needed) - predict_jump (REG_BR_PROB_BASE * 20 / 100); - else - predict_jump (REG_BR_PROB_BASE * 60 / 100); - } - } - if (label && size_needed == 1) - { - emit_label (label); - LABEL_NUSES (label) = 1; - label = NULL; - epilogue_size_needed = 1; - if (issetmem) - promoted_val = val_exp; - } - else if (label == NULL_RTX && !misaligned_prologue_used) - epilogue_size_needed = size_needed; - - /* Step 3: Main loop. */ - - switch (alg) - { - case libcall: - case no_stringop: - case last_alg: - gcc_unreachable (); - case loop_1_byte: - case loop: - case unrolled_loop: - expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val, - count_exp, move_mode, unroll_factor, - expected_size, issetmem); - break; - case vector_loop: - expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, - vec_promoted_val, count_exp, move_mode, - unroll_factor, expected_size, issetmem); - break; - case rep_prefix_8_byte: - case rep_prefix_4_byte: - case rep_prefix_1_byte: - expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val, - val_exp, count_exp, move_mode, issetmem); - break; - } - /* Adjust properly the offset of src and dest memory for aliasing. */ - if (CONST_INT_P (count_exp)) - { - if (!issetmem) - src = adjust_automodify_address_nv (src, BLKmode, srcreg, - (count / size_needed) * size_needed); - dst = adjust_automodify_address_nv (dst, BLKmode, destreg, - (count / size_needed) * size_needed); - } - else - { - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = change_address (dst, BLKmode, destreg); - } - - /* Step 4: Epilogue to copy the remaining bytes. */ - epilogue: - if (label) - { - /* When the main loop is done, COUNT_EXP might hold original count, - while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. - Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED - bytes. Compensate if needed. */ - - if (size_needed < epilogue_size_needed) - { - tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp, - GEN_INT (size_needed - 1), count_exp, 1, - OPTAB_DIRECT); - if (tmp != count_exp) - emit_move_insn (count_exp, tmp); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - - if (count_exp != const0_rtx && epilogue_size_needed > 1) - { - if (force_loopy_epilogue) - expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, - epilogue_size_needed); - else - { - if (issetmem) - expand_setmem_epilogue (dst, destreg, promoted_val, - vec_promoted_val, count_exp, - epilogue_size_needed); - else - expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, - epilogue_size_needed); - } - } - if (jump_around_label) - emit_label (jump_around_label); - return true; -} - - -/* Expand the appropriate insns for doing strlen if not just doing - repnz; scasb - - out = result, initialized with the start address - align_rtx = alignment of the address. - scratch = scratch register, initialized with the startaddress when - not aligned, otherwise undefined - - This is just the body. It needs the initializations mentioned above and - some address computing at the end. These things are done in i386.md. 
*/ - -static void -ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) -{ - int align; - rtx tmp; - rtx_code_label *align_2_label = NULL; - rtx_code_label *align_3_label = NULL; - rtx_code_label *align_4_label = gen_label_rtx (); - rtx_code_label *end_0_label = gen_label_rtx (); - rtx mem; - rtx tmpreg = gen_reg_rtx (SImode); - rtx scratch = gen_reg_rtx (SImode); - rtx cmp; - - align = 0; - if (CONST_INT_P (align_rtx)) - align = INTVAL (align_rtx); - - /* Loop to check 1..3 bytes for null to get an aligned pointer. */ - - /* Is there a known alignment and is it less than 4? */ - if (align < 4) - { - rtx scratch1 = gen_reg_rtx (Pmode); - emit_move_insn (scratch1, out); - /* Is there a known alignment and is it not 2? */ - if (align != 2) - { - align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ - align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ - - /* Leave just the 3 lower bits. */ - align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), - NULL_RTX, 0, OPTAB_WIDEN); - - emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, - Pmode, 1, align_4_label); - emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, - Pmode, 1, align_2_label); - emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, - Pmode, 1, align_3_label); - } - else - { - /* Since the alignment is 2, we have to check 2 or 0 bytes; - check if is aligned to 4 - byte. */ - - align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, - NULL_RTX, 0, OPTAB_WIDEN); - - emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, - Pmode, 1, align_4_label); - } - - mem = change_address (src, QImode, out); - - /* Now compare the bytes. */ - - /* Compare the first n unaligned byte on a byte per byte basis. */ - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, - QImode, 1, end_0_label); - - /* Increment the address. */ - emit_insn (ix86_gen_add3 (out, out, const1_rtx)); - - /* Not needed with an alignment of 2 */ - if (align != 2) - { - emit_label (align_2_label); - - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, - end_0_label); - - emit_insn (ix86_gen_add3 (out, out, const1_rtx)); - - emit_label (align_3_label); - } - - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, - end_0_label); - - emit_insn (ix86_gen_add3 (out, out, const1_rtx)); - } - - /* Generate loop to check 4 bytes at a time. It is not a good idea to - align this loop. It gives only huge programs, but does not help to - speed up. */ - emit_label (align_4_label); - - mem = change_address (src, SImode, out); - emit_move_insn (scratch, mem); - emit_insn (ix86_gen_add3 (out, out, GEN_INT (4))); - - /* This formula yields a nonzero result iff one of the bytes is zero. - This saves three branches inside loop and many cycles. */ - - emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); - emit_insn (gen_one_cmplsi2 (scratch, scratch)); - emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); - emit_insn (gen_andsi3 (tmpreg, tmpreg, - gen_int_mode (0x80808080, SImode))); - emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, - align_4_label); - - if (TARGET_CMOVE) - { - rtx reg = gen_reg_rtx (SImode); - rtx reg2 = gen_reg_rtx (Pmode); - emit_move_insn (reg, tmpreg); - emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); - - /* If zero is not in the first two bytes, move two bytes forward. 
*/ - emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); - emit_insn (gen_rtx_SET (tmpreg, - gen_rtx_IF_THEN_ELSE (SImode, tmp, - reg, - tmpreg))); - /* Emit lea manually to avoid clobbering of flags. */ - emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx))); - - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); - emit_insn (gen_rtx_SET (out, - gen_rtx_IF_THEN_ELSE (Pmode, tmp, - reg2, - out))); - } - else - { - rtx_code_label *end_2_label = gen_label_rtx (); - /* Is zero in the first two bytes? */ - - emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, end_2_label), - pc_rtx); - tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (tmp) = end_2_label; - - /* Not in the first two. Move two bytes forward. */ - emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); - emit_insn (ix86_gen_add3 (out, out, const2_rtx)); - - emit_label (end_2_label); - - } - - /* Avoid branch in fixing the byte. */ - tmpreg = gen_lowpart (QImode, tmpreg); - emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg)); - tmp = gen_rtx_REG (CCmode, FLAGS_REG); - cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); - emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp)); - - emit_label (end_0_label); -} - -/* Expand strlen. */ - -bool -ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) -{ - rtx addr, scratch1, scratch2, scratch3, scratch4; - - /* The generic case of strlen expander is long. Avoid it's - expanding unless TARGET_INLINE_ALL_STRINGOPS. */ - - if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 - && !TARGET_INLINE_ALL_STRINGOPS - && !optimize_insn_for_size_p () - && (!CONST_INT_P (align) || INTVAL (align) < 4)) - return false; - - addr = force_reg (Pmode, XEXP (src, 0)); - scratch1 = gen_reg_rtx (Pmode); - - if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 - && !optimize_insn_for_size_p ()) - { - /* Well it seems that some optimizer does not combine a call like - foo(strlen(bar), strlen(bar)); - when the move and the subtraction is done here. It does calculate - the length just once when these instructions are done inside of - output_strlen_unroll(). But I think since &bar[strlen(bar)] is - often used and I use one fewer register for the lifetime of - output_strlen_unroll() this is better. */ - - emit_move_insn (out, addr); - - ix86_expand_strlensi_unroll_1 (out, src, align); - - /* strlensi_unroll_1 returns the address of the zero at the end of - the string, like memchr(), so compute the length by subtracting - the start address. */ - emit_insn (ix86_gen_sub3 (out, out, addr)); - } - else - { - rtx unspec; - - /* Can't use this if the user has appropriated eax, ecx, or edi. */ - if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG]) - return false; - /* Can't use this for non-default address spaces. */ - if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src))) - return false; - - scratch2 = gen_reg_rtx (Pmode); - scratch3 = gen_reg_rtx (Pmode); - scratch4 = force_reg (Pmode, constm1_rtx); - - emit_move_insn (scratch3, addr); - eoschar = force_reg (QImode, eoschar); - - src = replace_equiv_address_nv (src, scratch3); - - /* If .md starts supporting :P, this can be done in .md. 
*/ - unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align, - scratch4), UNSPEC_SCAS); - emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec)); - emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1)); - emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx)); - } - return true; -} - -/* For given symbol (function) construct code to compute address of it's PLT - entry in large x86-64 PIC model. */ -static rtx -construct_plt_address (rtx symbol) -{ - rtx tmp, unspec; - - gcc_assert (GET_CODE (symbol) == SYMBOL_REF); - gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF); - gcc_assert (Pmode == DImode); - - tmp = gen_reg_rtx (Pmode); - unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); - - emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); - emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx)); - return tmp; -} - -rtx_insn * -ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, - rtx callarg2, - rtx pop, bool sibcall) -{ - rtx vec[3]; - rtx use = NULL, call; - unsigned int vec_len = 0; - tree fndecl; - - if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - { - fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); - if (fndecl - && (lookup_attribute ("interrupt", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))) - error ("interrupt service routine can%'t be called directly"); - } - else - fndecl = NULL_TREE; - - if (pop == const0_rtx) - pop = NULL; - gcc_assert (!TARGET_64BIT || !pop); - - if (TARGET_MACHO && !TARGET_64BIT) - { -#if TARGET_MACHO - if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - fnaddr = machopic_indirect_call_target (fnaddr); -#endif - } - else - { - /* Static functions and indirect calls don't need the pic register. Also, - check if PLT was explicitly avoided via no-plt or "noplt" attribute, making - it an indirect call. */ - rtx addr = XEXP (fnaddr, 0); - if (flag_pic - && GET_CODE (addr) == SYMBOL_REF - && !SYMBOL_REF_LOCAL_P (addr)) - { - if (flag_plt - && (SYMBOL_REF_DECL (addr) == NULL_TREE - || !lookup_attribute ("noplt", - DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr))))) - { - if (!TARGET_64BIT - || (ix86_cmodel == CM_LARGE_PIC - && DEFAULT_ABI != MS_ABI)) - { - use_reg (&use, gen_rtx_REG (Pmode, - REAL_PIC_OFFSET_TABLE_REGNUM)); - if (ix86_use_pseudo_pic_reg ()) - emit_move_insn (gen_rtx_REG (Pmode, - REAL_PIC_OFFSET_TABLE_REGNUM), - pic_offset_table_rtx); - } - } - else if (!TARGET_PECOFF && !TARGET_MACHO) - { - if (TARGET_64BIT) - { - fnaddr = gen_rtx_UNSPEC (Pmode, - gen_rtvec (1, addr), - UNSPEC_GOTPCREL); - fnaddr = gen_rtx_CONST (Pmode, fnaddr); - } - else - { - fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), - UNSPEC_GOT); - fnaddr = gen_rtx_CONST (Pmode, fnaddr); - fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, - fnaddr); - } - fnaddr = gen_const_mem (Pmode, fnaddr); - /* Pmode may not be the same as word_mode for x32, which - doesn't support indirect branch via 32-bit memory slot. - Since x32 GOT slot is 64 bit with zero upper 32 bits, - indirect branch via x32 GOT slot is OK. */ - if (GET_MODE (fnaddr) != word_mode) - fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); - fnaddr = gen_rtx_MEM (QImode, fnaddr); - } - } - } - - /* Skip setting up RAX register for -mskip-rax-setup when there are no - parameters passed in vector registers. 
*/ - if (TARGET_64BIT - && (INTVAL (callarg2) > 0 - || (INTVAL (callarg2) == 0 - && (TARGET_SSE || !flag_skip_rax_setup)))) - { - rtx al = gen_rtx_REG (QImode, AX_REG); - emit_move_insn (al, callarg2); - use_reg (&use, al); - } - - if (ix86_cmodel == CM_LARGE_PIC - && !TARGET_PECOFF - && MEM_P (fnaddr) - && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF - && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) - fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); - /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect - branch via x32 GOT slot is OK. */ - else if (!(TARGET_X32 - && MEM_P (fnaddr) - && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND - && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)) - && (sibcall - ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode) - : !call_insn_operand (XEXP (fnaddr, 0), word_mode))) - { - fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1); - fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr)); - } - - call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); - - if (retval) - call = gen_rtx_SET (retval, call); - vec[vec_len++] = call; - - if (pop) - { - pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); - pop = gen_rtx_SET (stack_pointer_rtx, pop); - vec[vec_len++] = pop; - } - - if (cfun->machine->no_caller_saved_registers - && (!fndecl - || (!TREE_THIS_VOLATILE (fndecl) - && !lookup_attribute ("no_caller_saved_registers", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))) - { - static const char ix86_call_used_regs[] = CALL_USED_REGISTERS; - bool is_64bit_ms_abi = (TARGET_64BIT - && ix86_function_abi (fndecl) == MS_ABI); - char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi); - - /* If there are no caller-saved registers, add all registers - that are clobbered by the call which returns. */ - for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (!fixed_regs[i] - && (ix86_call_used_regs[i] == 1 - || (ix86_call_used_regs[i] & c_mask)) - && !STACK_REGNO_P (i) - && !MMX_REGNO_P (i)) - clobber_reg (&use, - gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i)); - } - else if (TARGET_64BIT_MS_ABI - && (!callarg2 || INTVAL (callarg2) != -2)) - { - unsigned i; - - for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++) - { - int regno = x86_64_ms_sysv_extra_clobbered_registers[i]; - machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode; - - clobber_reg (&use, gen_rtx_REG (mode, regno)); - } - - /* Set here, but it may get cleared later. */ - if (TARGET_CALL_MS2SYSV_XLOGUES) - { - if (!TARGET_SSE) - ; - - /* Don't break hot-patched functions. */ - else if (ix86_function_ms_hook_prologue (current_function_decl)) - ; - - /* TODO: Cases not yet examined. */ - else if (flag_split_stack) - warn_once_call_ms2sysv_xlogues ("-fsplit-stack"); - - else - { - gcc_assert (!reload_completed); - cfun->machine->call_ms2sysv = true; - } - } - } - - if (vec_len > 1) - call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); - rtx_insn *call_insn = emit_call_insn (call); - if (use) - CALL_INSN_FUNCTION_USAGE (call_insn) = use; - - return call_insn; -} - -/* Return true if the function being called was marked with attribute - "noplt" or using -fno-plt and we are compiling for non-PIC. We need - to handle the non-PIC case in the backend because there is no easy - interface for the front-end to force non-PLT calls to use the GOT. - This is currently used only with 64-bit or 32-bit GOT32X ELF targets - to call the function marked "noplt" indirectly. 
*/ - -static bool -ix86_nopic_noplt_attribute_p (rtx call_op) -{ - if (flag_pic || ix86_cmodel == CM_LARGE - || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X) - || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF - || SYMBOL_REF_LOCAL_P (call_op)) - return false; - - tree symbol_decl = SYMBOL_REF_DECL (call_op); - - if (!flag_plt - || (symbol_decl != NULL_TREE - && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl)))) - return true; - - return false; -} - -/* Helper to output the jmp/call. */ -static void -ix86_output_jmp_thunk_or_indirect (const char *thunk_name, const int regno) -{ - if (thunk_name != NULL) - { - fprintf (asm_out_file, "\tjmp\t"); - assemble_name (asm_out_file, thunk_name); - putc ('\n', asm_out_file); - } - else - output_indirect_thunk (regno); -} - -/* Output indirect branch via a call and return thunk. CALL_OP is a - register which contains the branch target. XASM is the assembly - template for CALL_OP. Branch is a tail call if SIBCALL_P is true. - A normal call is converted to: - - call __x86_indirect_thunk_reg - - and a tail call is converted to: - - jmp __x86_indirect_thunk_reg - */ - -static void -ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p) -{ - char thunk_name_buf[32]; - char *thunk_name; - enum indirect_thunk_prefix need_prefix - = indirect_thunk_need_prefix (current_output_insn); - int regno = REGNO (call_op); - - if (cfun->machine->indirect_branch_type - != indirect_branch_thunk_inline) - { - if (cfun->machine->indirect_branch_type == indirect_branch_thunk) - { - int i = regno; - if (i >= FIRST_REX_INT_REG) - i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1); - indirect_thunks_used |= 1 << i; - } - indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); - thunk_name = thunk_name_buf; - } - else - thunk_name = NULL; - - if (sibcall_p) - ix86_output_jmp_thunk_or_indirect (thunk_name, regno); - else - { - if (thunk_name != NULL) - { - fprintf (asm_out_file, "\tcall\t"); - assemble_name (asm_out_file, thunk_name); - putc ('\n', asm_out_file); - return; - } - - char indirectlabel1[32]; - char indirectlabel2[32]; - - ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, - INDIRECT_LABEL, - indirectlabelno++); - ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, - INDIRECT_LABEL, - indirectlabelno++); - - /* Jump. */ - fputs ("\tjmp\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel2); - fputc ('\n', asm_out_file); - - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); - - ix86_output_jmp_thunk_or_indirect (thunk_name, regno); - - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); - - /* Call. */ - fputs ("\tcall\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel1); - fputc ('\n', asm_out_file); - } -} - -/* Output indirect branch via a call and return thunk. CALL_OP is - the branch target. XASM is the assembly template for CALL_OP. - Branch is a tail call if SIBCALL_P is true. 
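/* Editor's note: illustrative example, not part of the patch.  The "noplt"
   attribute checked by ix86_nopic_noplt_attribute_p above can be placed on an
   external declaration; on a suitable 64-bit (or GOT32X-capable 32-bit) ELF
   target the call is then emitted through the GOT rather than the PLT, and
   with -mindirect-branch=thunk the resulting indirect branch goes through a
   thunk as shown by the output routines above.  */
extern int external_fn (int) __attribute__ ((noplt));

int call_without_plt (int x)
{
  return external_fn (x);   /* expected to become an indirect call via the GOT */
}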
A normal call is - converted to: - - jmp L2 - L1: - push CALL_OP - jmp __x86_indirect_thunk - L2: - call L1 - - and a tail call is converted to: - - push CALL_OP - jmp __x86_indirect_thunk - */ - -static void -ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm, - bool sibcall_p) -{ - char thunk_name_buf[32]; - char *thunk_name; - char push_buf[64]; - enum indirect_thunk_prefix need_prefix - = indirect_thunk_need_prefix (current_output_insn); - int regno = -1; - - if (cfun->machine->indirect_branch_type - != indirect_branch_thunk_inline) - { - if (cfun->machine->indirect_branch_type == indirect_branch_thunk) - indirect_thunk_needed = true; - indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); - thunk_name = thunk_name_buf; - } - else - thunk_name = NULL; - - snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s", - TARGET_64BIT ? 'q' : 'l', xasm); - - if (sibcall_p) - { - output_asm_insn (push_buf, &call_op); - ix86_output_jmp_thunk_or_indirect (thunk_name, regno); - } - else - { - char indirectlabel1[32]; - char indirectlabel2[32]; - - ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, - INDIRECT_LABEL, - indirectlabelno++); - ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, - INDIRECT_LABEL, - indirectlabelno++); - - /* Jump. */ - fputs ("\tjmp\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel2); - fputc ('\n', asm_out_file); - - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); - - /* An external function may be called via GOT, instead of PLT. */ - if (MEM_P (call_op)) - { - struct ix86_address parts; - rtx addr = XEXP (call_op, 0); - if (ix86_decompose_address (addr, &parts) - && parts.base == stack_pointer_rtx) - { - /* Since call will adjust stack by -UNITS_PER_WORD, - we must convert "disp(stack, index, scale)" to - "disp+UNITS_PER_WORD(stack, index, scale)". */ - if (parts.index) - { - addr = gen_rtx_MULT (Pmode, parts.index, - GEN_INT (parts.scale)); - addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx, - addr); - } - else - addr = stack_pointer_rtx; - - rtx disp; - if (parts.disp != NULL_RTX) - disp = plus_constant (Pmode, parts.disp, - UNITS_PER_WORD); - else - disp = GEN_INT (UNITS_PER_WORD); - - addr = gen_rtx_PLUS (Pmode, addr, disp); - call_op = gen_rtx_MEM (GET_MODE (call_op), addr); - } - } - - output_asm_insn (push_buf, &call_op); - - ix86_output_jmp_thunk_or_indirect (thunk_name, regno); - - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); - - /* Call. */ - fputs ("\tcall\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel1); - fputc ('\n', asm_out_file); - } -} - -/* Output indirect branch via a call and return thunk. CALL_OP is - the branch target. XASM is the assembly template for CALL_OP. - Branch is a tail call if SIBCALL_P is true. */ - -static void -ix86_output_indirect_branch (rtx call_op, const char *xasm, - bool sibcall_p) -{ - if (REG_P (call_op)) - ix86_output_indirect_branch_via_reg (call_op, sibcall_p); - else - ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p); -} - -/* Output indirect jump. CALL_OP is the jump target. */ - -const char * -ix86_output_indirect_jmp (rtx call_op) -{ - if (cfun->machine->indirect_branch_type != indirect_branch_keep) - { - /* We can't have red-zone since "call" in the indirect thunk - pushes the return address onto stack, destroying red-zone. 
*/ - if (ix86_red_zone_size != 0) - gcc_unreachable (); - - ix86_output_indirect_branch (call_op, "%0", true); - return ""; - } - else - return "%!jmp\t%A0"; -} - -/* Output return instrumentation for current function if needed. */ - -static void -output_return_instrumentation (void) -{ - if (ix86_instrument_return != instrument_return_none - && flag_fentry - && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl)) - { - if (ix86_flag_record_return) - fprintf (asm_out_file, "1:\n"); - switch (ix86_instrument_return) - { - case instrument_return_call: - fprintf (asm_out_file, "\tcall\t__return__\n"); - break; - case instrument_return_nop5: - /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ - fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); - break; - case instrument_return_none: - break; - } - - if (ix86_flag_record_return) - { - fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n"); - fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); - fprintf (asm_out_file, "\t.previous\n"); - } - } -} - -/* Output function return. CALL_OP is the jump target. Add a REP - prefix to RET if LONG_P is true and function return is kept. */ - -const char * -ix86_output_function_return (bool long_p) -{ - output_return_instrumentation (); - - if (cfun->machine->function_return_type != indirect_branch_keep) - { - char thunk_name[32]; - enum indirect_thunk_prefix need_prefix - = indirect_thunk_need_prefix (current_output_insn); - - if (cfun->machine->function_return_type - != indirect_branch_thunk_inline) - { - bool need_thunk = (cfun->machine->function_return_type - == indirect_branch_thunk); - indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix, - true); - indirect_return_needed |= need_thunk; - fprintf (asm_out_file, "\tjmp\t"); - assemble_name (asm_out_file, thunk_name); - putc ('\n', asm_out_file); - } - else - output_indirect_thunk (INVALID_REGNUM); - - return ""; - } - - if (!long_p) - return "%!ret"; - - return "rep%; ret"; -} - -/* Output indirect function return. RET_OP is the function return - target. */ - -const char * -ix86_output_indirect_function_return (rtx ret_op) -{ - if (cfun->machine->function_return_type != indirect_branch_keep) - { - char thunk_name[32]; - enum indirect_thunk_prefix need_prefix - = indirect_thunk_need_prefix (current_output_insn); - unsigned int regno = REGNO (ret_op); - gcc_assert (regno == CX_REG); - - if (cfun->machine->function_return_type - != indirect_branch_thunk_inline) - { - bool need_thunk = (cfun->machine->function_return_type - == indirect_branch_thunk); - indirect_thunk_name (thunk_name, regno, need_prefix, true); - - if (need_thunk) - { - indirect_return_via_cx = true; - indirect_thunks_used |= 1 << CX_REG; - } - fprintf (asm_out_file, "\tjmp\t"); - assemble_name (asm_out_file, thunk_name); - putc ('\n', asm_out_file); - } - else - output_indirect_thunk (regno); - - return ""; - } - else - return "%!jmp\t%A0"; -} - -/* Split simple return with popping POPC bytes from stack to indirect - branch with stack adjustment . */ - -void -ix86_split_simple_return_pop_internal (rtx popc) -{ - struct machine_function *m = cfun->machine; - rtx ecx = gen_rtx_REG (SImode, CX_REG); - rtx_insn *insn; - - /* There is no "pascal" calling convention in any 64bit ABI. 
*/ - gcc_assert (!TARGET_64BIT); - - insn = emit_insn (gen_pop (ecx)); - m->fs.cfa_offset -= UNITS_PER_WORD; - m->fs.sp_offset -= UNITS_PER_WORD; - - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; - - x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc); - x = gen_rtx_SET (stack_pointer_rtx, x); - insn = emit_insn (x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn) = 1; - - /* Now return address is in ECX. */ - emit_jump_insn (gen_simple_return_indirect_internal (ecx)); -} - -/* Output the assembly for a call instruction. */ - -const char * -ix86_output_call_insn (rtx_insn *insn, rtx call_op) -{ - bool direct_p = constant_call_address_operand (call_op, VOIDmode); - bool output_indirect_p - = (!TARGET_SEH - && cfun->machine->indirect_branch_type != indirect_branch_keep); - bool seh_nop_p = false; - const char *xasm; - - if (SIBLING_CALL_P (insn)) - { - output_return_instrumentation (); - if (direct_p) - { - if (ix86_nopic_noplt_attribute_p (call_op)) - { - direct_p = false; - if (TARGET_64BIT) - { - if (output_indirect_p) - xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; - else - xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; - } - else - { - if (output_indirect_p) - xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; - else - xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; - } - } - else - xasm = "%!jmp\t%P0"; - } - /* SEH epilogue detection requires the indirect branch case - to include REX.W. */ - else if (TARGET_SEH) - xasm = "%!rex.W jmp\t%A0"; - else - { - if (output_indirect_p) - xasm = "%0"; - else - xasm = "%!jmp\t%A0"; - } - - if (output_indirect_p && !direct_p) - ix86_output_indirect_branch (call_op, xasm, true); - else - output_asm_insn (xasm, &call_op); - return ""; - } - - /* SEH unwinding can require an extra nop to be emitted in several - circumstances. Determine if we have one of those. */ - if (TARGET_SEH) - { - rtx_insn *i; - - for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i)) - { - /* Prevent a catch region from being adjacent to a jump that would - be interpreted as an epilogue sequence by the unwinder. */ - if (JUMP_P(i) && CROSSING_JUMP_P (i)) - { - seh_nop_p = true; - break; - } - - /* If we get to another real insn, we don't need the nop. */ - if (INSN_P (i)) - break; - - /* If we get to the epilogue note, prevent a catch region from - being adjacent to the standard epilogue sequence. If non- - call-exceptions, we'll have done this during epilogue emission. */ - if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG - && !flag_non_call_exceptions - && !can_throw_internal (insn)) - { - seh_nop_p = true; - break; - } - } - - /* If we didn't find a real insn following the call, prevent the - unwinder from looking into the next function. 
*/ - if (i == NULL) - seh_nop_p = true; - } - - if (direct_p) - { - if (ix86_nopic_noplt_attribute_p (call_op)) - { - direct_p = false; - if (TARGET_64BIT) - { - if (output_indirect_p) - xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; - else - xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; - } - else - { - if (output_indirect_p) - xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; - else - xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; - } - } - else - xasm = "%!call\t%P0"; - } - else - { - if (output_indirect_p) - xasm = "%0"; - else - xasm = "%!call\t%A0"; - } - - if (output_indirect_p && !direct_p) - ix86_output_indirect_branch (call_op, xasm, false); - else - output_asm_insn (xasm, &call_op); - - if (seh_nop_p) - return "nop"; - - return ""; -} - -/* Clear stack slot assignments remembered from previous functions. - This is called from INIT_EXPANDERS once before RTL is emitted for each - function. */ - -static struct machine_function * -ix86_init_machine_status (void) -{ - struct machine_function *f; - - f = ggc_cleared_alloc (); - f->call_abi = ix86_abi; - - return f; -} - -/* Return a MEM corresponding to a stack slot with mode MODE. - Allocate a new slot if necessary. - - The RTL for a function can have several slots available: N is - which slot to use. */ - -rtx -assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n) -{ - struct stack_local_entry *s; - - gcc_assert (n < MAX_386_STACK_LOCALS); - - for (s = ix86_stack_locals; s; s = s->next) - if (s->mode == mode && s->n == n) - return validize_mem (copy_rtx (s->rtl)); - - s = ggc_alloc (); - s->n = n; - s->mode = mode; - s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0); - - s->next = ix86_stack_locals; - ix86_stack_locals = s; - return validize_mem (copy_rtx (s->rtl)); -} - -static void -ix86_instantiate_decls (void) -{ - struct stack_local_entry *s; - - for (s = ix86_stack_locals; s; s = s->next) - if (s->rtl != NULL_RTX) - instantiate_decl_rtl (s->rtl); -} - -/* Check whether x86 address PARTS is a pc-relative address. */ - -bool -ix86_rip_relative_addr_p (struct ix86_address *parts) -{ - rtx base, index, disp; - - base = parts->base; - index = parts->index; - disp = parts->disp; - - if (disp && !base && !index) - { - if (TARGET_64BIT) - { - rtx symbol = disp; - - if (GET_CODE (disp) == CONST) - symbol = XEXP (disp, 0); - if (GET_CODE (symbol) == PLUS - && CONST_INT_P (XEXP (symbol, 1))) - symbol = XEXP (symbol, 0); - - if (GET_CODE (symbol) == LABEL_REF - || (GET_CODE (symbol) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (symbol) == 0) - || (GET_CODE (symbol) == UNSPEC - && (XINT (symbol, 1) == UNSPEC_GOTPCREL - || XINT (symbol, 1) == UNSPEC_PCREL - || XINT (symbol, 1) == UNSPEC_GOTNTPOFF))) - return true; - } - } - return false; -} - -/* Calculate the length of the memory address in the instruction encoding. - Includes addr32 prefix, does not include the one-byte modrm, opcode, - or other prefixes. We never generate addr32 prefix for LEA insn. */ - -int -memory_address_length (rtx addr, bool lea) -{ - struct ix86_address parts; - rtx base, index, disp; - int len; - int ok; - - if (GET_CODE (addr) == PRE_DEC - || GET_CODE (addr) == POST_INC - || GET_CODE (addr) == PRE_MODIFY - || GET_CODE (addr) == POST_MODIFY) - return 0; - - ok = ix86_decompose_address (addr, &parts); - gcc_assert (ok); - - len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1; - - /* If this is not LEA instruction, add the length of addr32 prefix. 
*/ - if (TARGET_64BIT && !lea - && (SImode_address_operand (addr, VOIDmode) - || (parts.base && GET_MODE (parts.base) == SImode) - || (parts.index && GET_MODE (parts.index) == SImode))) - len++; - - base = parts.base; - index = parts.index; - disp = parts.disp; - - if (base && SUBREG_P (base)) - base = SUBREG_REG (base); - if (index && SUBREG_P (index)) - index = SUBREG_REG (index); - - gcc_assert (base == NULL_RTX || REG_P (base)); - gcc_assert (index == NULL_RTX || REG_P (index)); - - /* Rule of thumb: - - esp as the base always wants an index, - - ebp as the base always wants a displacement, - - r12 as the base always wants an index, - - r13 as the base always wants a displacement. */ - - /* Register Indirect. */ - if (base && !index && !disp) - { - /* esp (for its index) and ebp (for its displacement) need - the two-byte modrm form. Similarly for r12 and r13 in 64-bit - code. */ - if (base == arg_pointer_rtx - || base == frame_pointer_rtx - || REGNO (base) == SP_REG - || REGNO (base) == BP_REG - || REGNO (base) == R12_REG - || REGNO (base) == R13_REG) - len++; - } - - /* Direct Addressing. In 64-bit mode mod 00 r/m 5 - is not disp32, but disp32(%rip), so for disp32 - SIB byte is needed, unless print_operand_address - optimizes it into disp32(%rip) or (%rip) is implied - by UNSPEC. */ - else if (disp && !base && !index) - { - len += 4; - if (!ix86_rip_relative_addr_p (&parts)) - len++; - } - else - { - /* Find the length of the displacement constant. */ - if (disp) - { - if (base && satisfies_constraint_K (disp)) - len += 1; - else - len += 4; - } - /* ebp always wants a displacement. Similarly r13. */ - else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) - len++; - - /* An index requires the two-byte modrm form.... */ - if (index - /* ...like esp (or r12), which always wants an index. */ - || base == arg_pointer_rtx - || base == frame_pointer_rtx - || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) - len++; - } - - return len; -} - -/* Compute default value for "length_immediate" attribute. When SHORTFORM - is set, expect that insn have 8bit immediate alternative. */ -int -ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform) -{ - int len = 0; - int i; - extract_insn_cached (insn); - for (i = recog_data.n_operands - 1; i >= 0; --i) - if (CONSTANT_P (recog_data.operand[i])) - { - enum attr_mode mode = get_attr_mode (insn); - - gcc_assert (!len); - if (shortform && CONST_INT_P (recog_data.operand[i])) - { - HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]); - switch (mode) - { - case MODE_QI: - len = 1; - continue; - case MODE_HI: - ival = trunc_int_for_mode (ival, HImode); - break; - case MODE_SI: - ival = trunc_int_for_mode (ival, SImode); - break; - default: - break; - } - if (IN_RANGE (ival, -128, 127)) - { - len = 1; - continue; - } - } - switch (mode) - { - case MODE_QI: - len = 1; - break; - case MODE_HI: - len = 2; - break; - case MODE_SI: - len = 4; - break; - /* Immediates for DImode instructions are encoded - as 32bit sign extended values. */ - case MODE_DI: - len = 4; - break; - default: - fatal_insn ("unknown insn mode", insn); - } - } - return len; -} - -/* Compute default value for "length_address" attribute. 
*/ -int -ix86_attr_length_address_default (rtx_insn *insn) -{ - int i; - - if (get_attr_type (insn) == TYPE_LEA) - { - rtx set = PATTERN (insn), addr; - - if (GET_CODE (set) == PARALLEL) - set = XVECEXP (set, 0, 0); - - gcc_assert (GET_CODE (set) == SET); - - addr = SET_SRC (set); - - return memory_address_length (addr, true); - } - - extract_insn_cached (insn); - for (i = recog_data.n_operands - 1; i >= 0; --i) - { - rtx op = recog_data.operand[i]; - if (MEM_P (op)) - { - constrain_operands_cached (insn, reload_completed); - if (which_alternative != -1) - { - const char *constraints = recog_data.constraints[i]; - int alt = which_alternative; - - while (*constraints == '=' || *constraints == '+') - constraints++; - while (alt-- > 0) - while (*constraints++ != ',') - ; - /* Skip ignored operands. */ - if (*constraints == 'X') - continue; - } - - int len = memory_address_length (XEXP (op, 0), false); - - /* Account for segment prefix for non-default addr spaces. */ - if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op))) - len++; - - return len; - } - } - return 0; -} - -/* Compute default value for "length_vex" attribute. It includes - 2 or 3 byte VEX prefix and 1 opcode byte. */ - -int -ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, - bool has_vex_w) -{ - int i; - - /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 - byte VEX prefix. */ - if (!has_0f_opcode || has_vex_w) - return 3 + 1; - - /* We can always use 2 byte VEX prefix in 32bit. */ - if (!TARGET_64BIT) - return 2 + 1; - - extract_insn_cached (insn); - - for (i = recog_data.n_operands - 1; i >= 0; --i) - if (REG_P (recog_data.operand[i])) - { - /* REX.W bit uses 3 byte VEX prefix. */ - if (GET_MODE (recog_data.operand[i]) == DImode - && GENERAL_REG_P (recog_data.operand[i])) - return 3 + 1; - } - else - { - /* REX.X or REX.B bits use 3 byte VEX prefix. */ - if (MEM_P (recog_data.operand[i]) - && x86_extended_reg_mentioned_p (recog_data.operand[i])) - return 3 + 1; - } - - return 2 + 1; -} - - -static bool -ix86_class_likely_spilled_p (reg_class_t); - -/* Returns true if lhs of insn is HW function argument register and set up - is_spilled to true if it is likely spilled HW register. */ -static bool -insn_is_function_arg (rtx insn, bool* is_spilled) -{ - rtx dst; - - if (!NONDEBUG_INSN_P (insn)) - return false; - /* Call instructions are not movable, ignore it. */ - if (CALL_P (insn)) - return false; - insn = PATTERN (insn); - if (GET_CODE (insn) == PARALLEL) - insn = XVECEXP (insn, 0, 0); - if (GET_CODE (insn) != SET) - return false; - dst = SET_DEST (insn); - if (REG_P (dst) && HARD_REGISTER_P (dst) - && ix86_function_arg_regno_p (REGNO (dst))) - { - /* Is it likely spilled HW register? */ - if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst)) - && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))) - *is_spilled = true; - return true; - } - return false; -} - -/* Add output dependencies for chain of function adjacent arguments if only - there is a move to likely spilled HW register. Return first argument - if at least one dependence was added or NULL otherwise. */ -static rtx_insn * -add_parameter_dependencies (rtx_insn *call, rtx_insn *head) -{ - rtx_insn *insn; - rtx_insn *last = call; - rtx_insn *first_arg = NULL; - bool is_spilled = false; - - head = PREV_INSN (head); - - /* Find nearest to call argument passing instruction. 
*/ - while (true) - { - last = PREV_INSN (last); - if (last == head) - return NULL; - if (!NONDEBUG_INSN_P (last)) - continue; - if (insn_is_function_arg (last, &is_spilled)) - break; - return NULL; - } - - first_arg = last; - while (true) - { - insn = PREV_INSN (last); - if (!INSN_P (insn)) - break; - if (insn == head) - break; - if (!NONDEBUG_INSN_P (insn)) - { - last = insn; - continue; - } - if (insn_is_function_arg (insn, &is_spilled)) - { - /* Add output depdendence between two function arguments if chain - of output arguments contains likely spilled HW registers. */ - if (is_spilled) - add_dependence (first_arg, insn, REG_DEP_OUTPUT); - first_arg = last = insn; - } - else - break; - } - if (!is_spilled) - return NULL; - return first_arg; -} - -/* Add output or anti dependency from insn to first_arg to restrict its code - motion. */ -static void -avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn) -{ - rtx set; - rtx tmp; - - set = single_set (insn); - if (!set) - return; - tmp = SET_DEST (set); - if (REG_P (tmp)) - { - /* Add output dependency to the first function argument. */ - add_dependence (first_arg, insn, REG_DEP_OUTPUT); - return; - } - /* Add anti dependency. */ - add_dependence (first_arg, insn, REG_DEP_ANTI); -} - -/* Avoid cross block motion of function argument through adding dependency - from the first non-jump instruction in bb. */ -static void -add_dependee_for_func_arg (rtx_insn *arg, basic_block bb) -{ - rtx_insn *insn = BB_END (bb); - - while (insn) - { - if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn)) - { - rtx set = single_set (insn); - if (set) - { - avoid_func_arg_motion (arg, insn); - return; - } - } - if (insn == BB_HEAD (bb)) - return; - insn = PREV_INSN (insn); - } -} - -/* Hook for pre-reload schedule - avoid motion of function arguments - passed in likely spilled HW registers. */ -static void -ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail) -{ - rtx_insn *insn; - rtx_insn *first_arg = NULL; - if (reload_completed) - return; - while (head != tail && DEBUG_INSN_P (head)) - head = NEXT_INSN (head); - for (insn = tail; insn != head; insn = PREV_INSN (insn)) - if (INSN_P (insn) && CALL_P (insn)) - { - first_arg = add_parameter_dependencies (insn, head); - if (first_arg) - { - /* Add dependee for first argument to predecessors if only - region contains more than one block. */ - basic_block bb = BLOCK_FOR_INSN (insn); - int rgn = CONTAINING_RGN (bb->index); - int nr_blks = RGN_NR_BLOCKS (rgn); - /* Skip trivial regions and region head blocks that can have - predecessors outside of region. */ - if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0) - { - edge e; - edge_iterator ei; - - /* Regions are SCCs with the exception of selective - scheduling with pipelining of outer blocks enabled. - So also check that immediate predecessors of a non-head - block are in the same region. */ - FOR_EACH_EDGE (e, ei, bb->preds) - { - /* Avoid creating of loop-carried dependencies through - using topological ordering in the region. */ - if (rgn == CONTAINING_RGN (e->src->index) - && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index)) - add_dependee_for_func_arg (first_arg, e->src); - } - } - insn = first_arg; - if (insn == head) - break; - } - } - else if (first_arg) - avoid_func_arg_motion (first_arg, insn); -} - -/* Hook for pre-reload schedule - set priority of moves from likely spilled - HW registers to maximum, to schedule them at soon as possible. 
These are - moves from function argument registers at the top of the function entry - and moves from function return value registers after call. */ -static int -ix86_adjust_priority (rtx_insn *insn, int priority) -{ - rtx set; - - if (reload_completed) - return priority; - - if (!NONDEBUG_INSN_P (insn)) - return priority; - - set = single_set (insn); - if (set) - { - rtx tmp = SET_SRC (set); - if (REG_P (tmp) - && HARD_REGISTER_P (tmp) - && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp)) - && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp)))) - return current_sched_info->sched_max_insns_priority; - } - - return priority; -} - -/* Prepare for scheduling pass. */ -static void -ix86_sched_init_global (FILE *, int, int) -{ - /* Install scheduling hooks for current CPU. Some of these hooks are used - in time-critical parts of the scheduler, so we only set them up when - they are actually used. */ - switch (ix86_tune) - { - case PROCESSOR_CORE2: - case PROCESSOR_NEHALEM: - case PROCESSOR_SANDYBRIDGE: - case PROCESSOR_HASWELL: - case PROCESSOR_GENERIC: - /* Do not perform multipass scheduling for pre-reload schedule - to save compile time. */ - if (reload_completed) - { - ix86_core2i7_init_hooks (); - break; - } - /* Fall through. */ - default: - targetm.sched.dfa_post_advance_cycle = NULL; - targetm.sched.first_cycle_multipass_init = NULL; - targetm.sched.first_cycle_multipass_begin = NULL; - targetm.sched.first_cycle_multipass_issue = NULL; - targetm.sched.first_cycle_multipass_backtrack = NULL; - targetm.sched.first_cycle_multipass_end = NULL; - targetm.sched.first_cycle_multipass_fini = NULL; - break; - } -} - - -/* Implement TARGET_STATIC_RTX_ALIGNMENT. */ - -static HOST_WIDE_INT -ix86_static_rtx_alignment (machine_mode mode) -{ - if (mode == DFmode) - return 64; - if (ALIGN_MODE_128 (mode)) - return MAX (128, GET_MODE_ALIGNMENT (mode)); - return GET_MODE_ALIGNMENT (mode); -} - -/* Implement TARGET_CONSTANT_ALIGNMENT. */ - -static HOST_WIDE_INT -ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align) -{ - if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST - || TREE_CODE (exp) == INTEGER_CST) - { - machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); - HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode); - return MAX (mode_align, align); - } - else if (!optimize_size && TREE_CODE (exp) == STRING_CST - && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD) - return BITS_PER_WORD; - - return align; -} - -/* Implement TARGET_EMPTY_RECORD_P. */ - -static bool -ix86_is_empty_record (const_tree type) -{ - if (!TARGET_64BIT) - return false; - return default_is_empty_record (type); -} - -/* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */ - -static void -ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type) -{ - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - - if (!cum->warn_empty) - return; - - if (!TYPE_EMPTY_P (type)) - return; - - /* Don't warn if the function isn't visible outside of the TU. */ - if (cum->decl && !TREE_PUBLIC (cum->decl)) - return; - - const_tree ctx = get_ultimate_context (cum->decl); - if (ctx != NULL_TREE - && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) - return; - - /* If the actual size of the type is zero, then there is no change - in how objects of this size are passed. */ - if (int_size_in_bytes (type) == 0) - return; - - warning (OPT_Wabi, "empty class %qT parameter passing ABI " - "changes in %<-fabi-version=12%> (GCC 8)", type); - - /* Only warn once. 
*/ - cum->warn_empty = false; -} - -/* This hook returns name of multilib ABI. */ - -static const char * -ix86_get_multilib_abi_name (void) -{ - if (!(TARGET_64BIT_P (ix86_isa_flags))) - return "i386"; - else if (TARGET_X32_P (ix86_isa_flags)) - return "x32"; - else - return "x86_64"; -} - -/* Compute the alignment for a variable for Intel MCU psABI. TYPE is - the data type, and ALIGN is the alignment that the object would - ordinarily have. */ - -static int -iamcu_alignment (tree type, int align) -{ - machine_mode mode; - - if (align < 32 || TYPE_USER_ALIGN (type)) - return align; - - /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4 - bytes. */ - mode = TYPE_MODE (strip_array_types (type)); - switch (GET_MODE_CLASS (mode)) - { - case MODE_INT: - case MODE_COMPLEX_INT: - case MODE_COMPLEX_FLOAT: - case MODE_FLOAT: - case MODE_DECIMAL_FLOAT: - return 32; - default: - return align; - } -} - -/* Compute the alignment for a static variable. - TYPE is the data type, and ALIGN is the alignment that - the object would ordinarily have. The value of this function is used - instead of that alignment to align the object. */ - -int -ix86_data_alignment (tree type, int align, bool opt) -{ - /* GCC 4.8 and earlier used to incorrectly assume this alignment even - for symbols from other compilation units or symbols that don't need - to bind locally. In order to preserve some ABI compatibility with - those compilers, ensure we don't decrease alignment from what we - used to assume. */ - - int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT); - - /* A data structure, equal or greater than the size of a cache line - (64 bytes in the Pentium 4 and other recent Intel processors, including - processors based on Intel Core microarchitecture) should be aligned - so that its base address is a multiple of a cache line size. */ - - int max_align - = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT); - - if (max_align < BITS_PER_WORD) - max_align = BITS_PER_WORD; - - switch (ix86_align_data_type) - { - case ix86_align_data_type_abi: opt = false; break; - case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break; - case ix86_align_data_type_cacheline: break; - } - - if (TARGET_IAMCU) - align = iamcu_alignment (type, align); - - if (opt - && AGGREGATE_TYPE_P (type) - && TYPE_SIZE (type) - && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST) - { - if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat) - && align < max_align_compat) - align = max_align_compat; - if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align) - && align < max_align) - align = max_align; - } - - /* x86-64 ABI requires arrays greater than 16 bytes to be aligned - to 16byte boundary. */ - if (TARGET_64BIT) - { - if ((opt ? 
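/* Editor's note: illustrative example, not part of the patch.  Concrete
   effect of ix86_data_alignment above on x86-64, visible in the .align /
   .balign directives of the generated assembly: an array of 16 bytes or more
   gets at least 16-byte alignment from the ABI rule, and with optimization a
   large enough aggregate is bumped further, towards the cache-line based
   max_align.  */
static char small_buf[8]  = {1};   /* keeps its natural alignment */
static char abi_buf[32]   = {1};   /* >= 16 bytes: at least 16-byte alignment on x86-64 */
static char big_buf[4096] = {1};   /* candidate for the cache-line sized bump */

char *alignment_examples[] = { small_buf, abi_buf, big_buf };  /* keep them referenced */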
AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE) - && TYPE_SIZE (type) - && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST - && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) - && align < 128) - return 128; - } - - if (!opt) - return align; - - if (TREE_CODE (type) == ARRAY_TYPE) - { - if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) - return 128; - } - else if (TREE_CODE (type) == COMPLEX_TYPE) - { - - if (TYPE_MODE (type) == DCmode && align < 64) - return 64; - if ((TYPE_MODE (type) == XCmode - || TYPE_MODE (type) == TCmode) && align < 128) - return 128; - } - else if ((TREE_CODE (type) == RECORD_TYPE - || TREE_CODE (type) == UNION_TYPE - || TREE_CODE (type) == QUAL_UNION_TYPE) - && TYPE_FIELDS (type)) - { - if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) - return 128; - } - else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE - || TREE_CODE (type) == INTEGER_TYPE) - { - if (TYPE_MODE (type) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) - return 128; - } - - return align; -} - -/* Compute the alignment for a local variable or a stack slot. EXP is - the data type or decl itself, MODE is the widest mode available and - ALIGN is the alignment that the object would ordinarily have. The - value of this macro is used instead of that alignment to align the - object. */ - -unsigned int -ix86_local_alignment (tree exp, machine_mode mode, - unsigned int align) -{ - tree type, decl; - - if (exp && DECL_P (exp)) - { - type = TREE_TYPE (exp); - decl = exp; - } - else - { - type = exp; - decl = NULL; - } - - /* Don't do dynamic stack realignment for long long objects with - -mpreferred-stack-boundary=2. */ - if (!TARGET_64BIT - && align == 64 - && ix86_preferred_stack_boundary < 64 - && (mode == DImode || (type && TYPE_MODE (type) == DImode)) - && (!type || !TYPE_USER_ALIGN (type)) - && (!decl || !DECL_USER_ALIGN (decl))) - align = 32; - - /* If TYPE is NULL, we are allocating a stack slot for caller-save - register in MODE. We will return the largest alignment of XF - and DF. */ - if (!type) - { - if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode)) - align = GET_MODE_ALIGNMENT (DFmode); - return align; - } - - /* Don't increase alignment for Intel MCU psABI. */ - if (TARGET_IAMCU) - return align; - - /* x86-64 ABI requires arrays greater than 16 bytes to be aligned - to 16byte boundary. Exact wording is: - - An array uses the same alignment as its elements, except that a local or - global array variable of length at least 16 bytes or - a C99 variable-length array variable always has alignment of at least 16 bytes. - - This was added to allow use of aligned SSE instructions at arrays. This - rule is meant for static storage (where compiler cannot do the analysis - by itself). We follow it for automatic variables only when convenient. - We fully control everything in the function compiled and functions from - other unit cannot rely on the alignment. - - Exclude va_list type. It is the common case of local array where - we cannot benefit from the alignment. - - TODO: Probably one should optimize for size only when var is not escaping. 
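/* Editor's note: illustrative example, not part of the patch.  The local
   variable counterpart of the rule discussed above: when compiling for
   x86-64 with SSE and optimizing for speed, a local aggregate of at least
   16 bytes is given 16-byte stack alignment so aligned SSE accesses can be
   used on it (va_list objects are excluded, as the comment explains).  */
void local_alignment_example (void)
{
  char buf[64];                              /* eligible for 16-byte stack alignment */
  __builtin_memset (buf, 0, sizeof buf);     /* expansion may then use aligned stores */
  __asm__ volatile ("" :: "r" (buf) : "memory");  /* keep buf live for the example */
}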
*/ - if (TARGET_64BIT && optimize_function_for_speed_p (cfun) - && TARGET_SSE) - { - if (AGGREGATE_TYPE_P (type) - && (va_list_type_node == NULL_TREE - || (TYPE_MAIN_VARIANT (type) - != TYPE_MAIN_VARIANT (va_list_type_node))) - && TYPE_SIZE (type) - && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST - && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) - && align < 128) - return 128; - } - if (TREE_CODE (type) == ARRAY_TYPE) - { - if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) - return 128; - } - else if (TREE_CODE (type) == COMPLEX_TYPE) - { - if (TYPE_MODE (type) == DCmode && align < 64) - return 64; - if ((TYPE_MODE (type) == XCmode - || TYPE_MODE (type) == TCmode) && align < 128) - return 128; - } - else if ((TREE_CODE (type) == RECORD_TYPE - || TREE_CODE (type) == UNION_TYPE - || TREE_CODE (type) == QUAL_UNION_TYPE) - && TYPE_FIELDS (type)) - { - if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) - return 128; - } - else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE - || TREE_CODE (type) == INTEGER_TYPE) - { - - if (TYPE_MODE (type) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) - return 128; - } - return align; -} - -/* Compute the minimum required alignment for dynamic stack realignment - purposes for a local variable, parameter or a stack slot. EXP is - the data type or decl itself, MODE is its mode and ALIGN is the - alignment that the object would ordinarily have. */ - -unsigned int -ix86_minimum_alignment (tree exp, machine_mode mode, - unsigned int align) -{ - tree type, decl; - - if (exp && DECL_P (exp)) - { - type = TREE_TYPE (exp); - decl = exp; - } - else - { - type = exp; - decl = NULL; - } - - if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) - return align; - - /* Don't do dynamic stack realignment for long long objects with - -mpreferred-stack-boundary=2. */ - if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) - && (!type || !TYPE_USER_ALIGN (type)) - && (!decl || !DECL_USER_ALIGN (decl))) - { - gcc_checking_assert (!TARGET_STV); - return 32; - } - - return align; -} - -/* Find a location for the static chain incoming to a nested function. - This is a register, unless all free registers are used by arguments. */ - -static rtx -ix86_static_chain (const_tree fndecl_or_type, bool incoming_p) -{ - unsigned regno; - - if (TARGET_64BIT) - { - /* We always use R10 in 64-bit mode. */ - regno = R10_REG; - } - else - { - const_tree fntype, fndecl; - unsigned int ccvt; - - /* By default in 32-bit mode we use ECX to pass the static chain. */ - regno = CX_REG; - - if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL) - { - fntype = TREE_TYPE (fndecl_or_type); - fndecl = fndecl_or_type; - } - else - { - fntype = fndecl_or_type; - fndecl = NULL; - } - - ccvt = ix86_get_callcvt (fntype); - if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - { - /* Fastcall functions use ecx/edx for arguments, which leaves - us with EAX for the static chain. - Thiscall functions use ecx for arguments, which also - leaves us with EAX for the static chain. */ - regno = AX_REG; - } - else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - { - /* Thiscall functions use ecx for arguments, which leaves - us with EAX and EDX for the static chain. - We are using for abi-compatibility EAX. 
*/ - regno = AX_REG; - } - else if (ix86_function_regparm (fntype, fndecl) == 3) - { - /* For regparm 3, we have no free call-clobbered registers in - which to store the static chain. In order to implement this, - we have the trampoline push the static chain to the stack. - However, we can't push a value below the return address when - we call the nested function directly, so we have to use an - alternate entry point. For this we use ESI, and have the - alternate entry point push ESI, so that things appear the - same once we're executing the nested function. */ - if (incoming_p) - { - if (fndecl == current_function_decl - && !ix86_static_chain_on_stack) - { - gcc_assert (!reload_completed); - ix86_static_chain_on_stack = true; - } - return gen_frame_mem (SImode, - plus_constant (Pmode, - arg_pointer_rtx, -8)); - } - regno = SI_REG; - } - } - - return gen_rtx_REG (Pmode, regno); -} - -/* Emit RTL insns to initialize the variable parts of a trampoline. - FNDECL is the decl of the target address; M_TRAMP is a MEM for - the trampoline, and CHAIN_VALUE is an RTX for the static chain - to be passed to the target function. */ - -static void -ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) -{ - rtx mem, fnaddr; - int opcode; - int offset = 0; - bool need_endbr = (flag_cf_protection & CF_BRANCH); - - fnaddr = XEXP (DECL_RTL (fndecl), 0); - - if (TARGET_64BIT) - { - int size; - - if (need_endbr) - { - /* Insert ENDBR64. */ - mem = adjust_address (m_tramp, SImode, offset); - emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode)); - offset += 4; - } - - /* Load the function address to r11. Try to load address using - the shorter movl instead of movabs. We may want to support - movq for kernel mode, but kernel does not use trampolines at - the moment. FNADDR is a 32bit address and may not be in - DImode when ptr_mode == SImode. Always use movl in this - case. */ - if (ptr_mode == SImode - || x86_64_zext_immediate_operand (fnaddr, VOIDmode)) - { - fnaddr = copy_addr_to_reg (fnaddr); - - mem = adjust_address (m_tramp, HImode, offset); - emit_move_insn (mem, gen_int_mode (0xbb41, HImode)); - - mem = adjust_address (m_tramp, SImode, offset + 2); - emit_move_insn (mem, gen_lowpart (SImode, fnaddr)); - offset += 6; - } - else - { - mem = adjust_address (m_tramp, HImode, offset); - emit_move_insn (mem, gen_int_mode (0xbb49, HImode)); - - mem = adjust_address (m_tramp, DImode, offset + 2); - emit_move_insn (mem, fnaddr); - offset += 10; - } - - /* Load static chain using movabs to r10. Use the shorter movl - instead of movabs when ptr_mode == SImode. */ - if (ptr_mode == SImode) - { - opcode = 0xba41; - size = 6; - } - else - { - opcode = 0xba49; - size = 10; - } - - mem = adjust_address (m_tramp, HImode, offset); - emit_move_insn (mem, gen_int_mode (opcode, HImode)); - - mem = adjust_address (m_tramp, ptr_mode, offset + 2); - emit_move_insn (mem, chain_value); - offset += size; - - /* Jump to r11; the last (unused) byte is a nop, only there to - pad the write out to a single 32-bit store. */ - mem = adjust_address (m_tramp, SImode, offset); - emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode)); - offset += 4; - } - else - { - rtx disp, chain; - - /* Depending on the static chain location, either load a register - with a constant, or push the constant to the stack. All of the - instructions are the same size. 
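/* Editor's note: illustrative sketch, not part of the patch.  Byte layout of
   the 64-bit trampoline written above (movabs variant, without ENDBR64); the
   imm64 slots are filled in with the target address and the static chain
   value at run time by ix86_trampoline_init.  The little-endian stores of
   0xbb49, 0xba49 and 0x90e3ff49 in the code correspond to these bytes.  */
const unsigned char tramp64_layout[] =
{
  0x49, 0xbb, 0,0,0,0,0,0,0,0,   /* movabs $<fnaddr>, %r11 */
  0x49, 0xba, 0,0,0,0,0,0,0,0,   /* movabs $<chain>,  %r10 */
  0x49, 0xff, 0xe3,              /* jmp *%r11 */
  0x90                           /* nop pad so the last store is a full 32-bit word */
};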
*/ - chain = ix86_static_chain (fndecl, true); - if (REG_P (chain)) - { - switch (REGNO (chain)) - { - case AX_REG: - opcode = 0xb8; break; - case CX_REG: - opcode = 0xb9; break; - default: - gcc_unreachable (); - } - } - else - opcode = 0x68; - - if (need_endbr) - { - /* Insert ENDBR32. */ - mem = adjust_address (m_tramp, SImode, offset); - emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode)); - offset += 4; - } - - mem = adjust_address (m_tramp, QImode, offset); - emit_move_insn (mem, gen_int_mode (opcode, QImode)); - - mem = adjust_address (m_tramp, SImode, offset + 1); - emit_move_insn (mem, chain_value); - offset += 5; - - mem = adjust_address (m_tramp, QImode, offset); - emit_move_insn (mem, gen_int_mode (0xe9, QImode)); - - mem = adjust_address (m_tramp, SImode, offset + 1); - - /* Compute offset from the end of the jmp to the target function. - In the case in which the trampoline stores the static chain on - the stack, we need to skip the first insn which pushes the - (call-saved) register static chain; this push is 1 byte. */ - offset += 5; - int skip = MEM_P (chain) ? 1 : 0; - /* Skip ENDBR32 at the entry of the target function. */ - if (need_endbr - && !cgraph_node::get (fndecl)->only_called_directly_p ()) - skip += 4; - disp = expand_binop (SImode, sub_optab, fnaddr, - plus_constant (Pmode, XEXP (m_tramp, 0), - offset - skip), - NULL_RTX, 1, OPTAB_DIRECT); - emit_move_insn (mem, disp); - } - - gcc_assert (offset <= TRAMPOLINE_SIZE); - -#ifdef HAVE_ENABLE_EXECUTE_STACK -#ifdef CHECK_EXECUTE_STACK_ENABLED - if (CHECK_EXECUTE_STACK_ENABLED) -#endif - emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"), - LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode); -#endif -} - -static bool -ix86_allocate_stack_slots_for_args (void) -{ - /* Naked functions should not allocate stack slots for arguments. */ - return !ix86_function_naked (current_function_decl); -} - -static bool -ix86_warn_func_return (tree decl) -{ - /* Naked functions are implemented entirely in assembly, including the - return sequence, so suppress warnings about this. */ - return !ix86_function_naked (decl); -} - -/* The following file contains several enumerations and data structures - built from the definitions in i386-builtin-types.def. */ - -#include "i386-builtin-types.inc" - -/* Table for the ix86 builtin non-function types. */ -static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; - -/* Retrieve an element from the above table, building some of - the types lazily. 
*/ - -static tree -ix86_get_builtin_type (enum ix86_builtin_type tcode) -{ - unsigned int index; - tree type, itype; - - gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab)); - - type = ix86_builtin_type_tab[(int) tcode]; - if (type != NULL) - return type; - - gcc_assert (tcode > IX86_BT_LAST_PRIM); - if (tcode <= IX86_BT_LAST_VECT) - { - machine_mode mode; - - index = tcode - IX86_BT_LAST_PRIM - 1; - itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]); - mode = ix86_builtin_type_vect_mode[index]; - - type = build_vector_type_for_mode (itype, mode); - } - else - { - int quals; - - index = tcode - IX86_BT_LAST_VECT - 1; - if (tcode <= IX86_BT_LAST_PTR) - quals = TYPE_UNQUALIFIED; - else - quals = TYPE_QUAL_CONST; - - itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]); - if (quals != TYPE_UNQUALIFIED) - itype = build_qualified_type (itype, quals); - - type = build_pointer_type (itype); - } - - ix86_builtin_type_tab[(int) tcode] = type; - return type; -} - -/* Table for the ix86 builtin function types. */ -static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1]; - -/* Retrieve an element from the above table, building some of - the types lazily. */ - -static tree -ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode) -{ - tree type; - - gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab)); - - type = ix86_builtin_func_type_tab[(int) tcode]; - if (type != NULL) - return type; - - if (tcode <= IX86_BT_LAST_FUNC) - { - unsigned start = ix86_builtin_func_start[(int) tcode]; - unsigned after = ix86_builtin_func_start[(int) tcode + 1]; - tree rtype, atype, args = void_list_node; - unsigned i; - - rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]); - for (i = after - 1; i > start; --i) - { - atype = ix86_get_builtin_type (ix86_builtin_func_args[i]); - args = tree_cons (NULL, atype, args); - } - - type = build_function_type (rtype, args); - } - else - { - unsigned index = tcode - IX86_BT_LAST_FUNC - 1; - enum ix86_builtin_func_type icode; - - icode = ix86_builtin_func_alias_base[index]; - type = ix86_get_builtin_func_type (icode); - } - - ix86_builtin_func_type_tab[(int) tcode] = type; - return type; -} - - -/* Codes for all the SSE/MMX builtins. Builtins not mentioned in any - bdesc_* arrays below should come first, then builtins for each bdesc_* - array in ascending order, so that we can use direct array accesses. 
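ix86_get_builtin_type and ix86_get_builtin_func_type above both follow the same lazy-table pattern: an entry is built the first time it is asked for and cached for every later caller. A minimal sketch of that idea in ordinary C, under the assumption that make_type stands in for the real tree builders and type_tab for ix86_builtin_type_tab (all names here are invented for illustration):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define N_TYPES 8

static char *type_tab[N_TYPES];        /* NULL until first use, like ix86_builtin_type_tab */

static char *
make_type (size_t i)                   /* hypothetical stand-in for the tree builders */
{
  char *p = malloc (2);
  if (p)
    {
      p[0] = (char) ('A' + i);
      p[1] = '\0';
    }
  return p;
}

static char *
get_type (size_t i)
{
  if (i >= N_TYPES)
    return NULL;
  if (type_tab[i] == NULL)             /* build lazily, exactly once */
    type_tab[i] = make_type (i);
  return type_tab[i];
}

int
main (void)
{
  printf ("%s %s %s\n", get_type (2), get_type (2), get_type (5));  /* C C F */
  return 0;
}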
*/ -enum ix86_builtins -{ - IX86_BUILTIN_MASKMOVQ, - IX86_BUILTIN_LDMXCSR, - IX86_BUILTIN_STMXCSR, - IX86_BUILTIN_MASKMOVDQU, - IX86_BUILTIN_PSLLDQ128, - IX86_BUILTIN_CLFLUSH, - IX86_BUILTIN_MONITOR, - IX86_BUILTIN_MWAIT, - IX86_BUILTIN_UMONITOR, - IX86_BUILTIN_UMWAIT, - IX86_BUILTIN_TPAUSE, - IX86_BUILTIN_CLZERO, - IX86_BUILTIN_CLDEMOTE, - IX86_BUILTIN_VEC_INIT_V2SI, - IX86_BUILTIN_VEC_INIT_V4HI, - IX86_BUILTIN_VEC_INIT_V8QI, - IX86_BUILTIN_VEC_EXT_V2DF, - IX86_BUILTIN_VEC_EXT_V2DI, - IX86_BUILTIN_VEC_EXT_V4SF, - IX86_BUILTIN_VEC_EXT_V4SI, - IX86_BUILTIN_VEC_EXT_V8HI, - IX86_BUILTIN_VEC_EXT_V2SI, - IX86_BUILTIN_VEC_EXT_V4HI, - IX86_BUILTIN_VEC_EXT_V16QI, - IX86_BUILTIN_VEC_SET_V2DI, - IX86_BUILTIN_VEC_SET_V4SF, - IX86_BUILTIN_VEC_SET_V4SI, - IX86_BUILTIN_VEC_SET_V8HI, - IX86_BUILTIN_VEC_SET_V4HI, - IX86_BUILTIN_VEC_SET_V16QI, - IX86_BUILTIN_GATHERSIV2DF, - IX86_BUILTIN_GATHERSIV4DF, - IX86_BUILTIN_GATHERDIV2DF, - IX86_BUILTIN_GATHERDIV4DF, - IX86_BUILTIN_GATHERSIV4SF, - IX86_BUILTIN_GATHERSIV8SF, - IX86_BUILTIN_GATHERDIV4SF, - IX86_BUILTIN_GATHERDIV8SF, - IX86_BUILTIN_GATHERSIV2DI, - IX86_BUILTIN_GATHERSIV4DI, - IX86_BUILTIN_GATHERDIV2DI, - IX86_BUILTIN_GATHERDIV4DI, - IX86_BUILTIN_GATHERSIV4SI, - IX86_BUILTIN_GATHERSIV8SI, - IX86_BUILTIN_GATHERDIV4SI, - IX86_BUILTIN_GATHERDIV8SI, - IX86_BUILTIN_GATHER3SIV8SF, - IX86_BUILTIN_GATHER3SIV4SF, - IX86_BUILTIN_GATHER3SIV4DF, - IX86_BUILTIN_GATHER3SIV2DF, - IX86_BUILTIN_GATHER3DIV8SF, - IX86_BUILTIN_GATHER3DIV4SF, - IX86_BUILTIN_GATHER3DIV4DF, - IX86_BUILTIN_GATHER3DIV2DF, - IX86_BUILTIN_GATHER3SIV8SI, - IX86_BUILTIN_GATHER3SIV4SI, - IX86_BUILTIN_GATHER3SIV4DI, - IX86_BUILTIN_GATHER3SIV2DI, - IX86_BUILTIN_GATHER3DIV8SI, - IX86_BUILTIN_GATHER3DIV4SI, - IX86_BUILTIN_GATHER3DIV4DI, - IX86_BUILTIN_GATHER3DIV2DI, - IX86_BUILTIN_SCATTERSIV8SF, - IX86_BUILTIN_SCATTERSIV4SF, - IX86_BUILTIN_SCATTERSIV4DF, - IX86_BUILTIN_SCATTERSIV2DF, - IX86_BUILTIN_SCATTERDIV8SF, - IX86_BUILTIN_SCATTERDIV4SF, - IX86_BUILTIN_SCATTERDIV4DF, - IX86_BUILTIN_SCATTERDIV2DF, - IX86_BUILTIN_SCATTERSIV8SI, - IX86_BUILTIN_SCATTERSIV4SI, - IX86_BUILTIN_SCATTERSIV4DI, - IX86_BUILTIN_SCATTERSIV2DI, - IX86_BUILTIN_SCATTERDIV8SI, - IX86_BUILTIN_SCATTERDIV4SI, - IX86_BUILTIN_SCATTERDIV4DI, - IX86_BUILTIN_SCATTERDIV2DI, - /* Alternate 4 and 8 element gather/scatter for the vectorizer - where all operands are 32-byte or 64-byte wide respectively. 
*/ - IX86_BUILTIN_GATHERALTSIV4DF, - IX86_BUILTIN_GATHERALTDIV8SF, - IX86_BUILTIN_GATHERALTSIV4DI, - IX86_BUILTIN_GATHERALTDIV8SI, - IX86_BUILTIN_GATHER3ALTDIV16SF, - IX86_BUILTIN_GATHER3ALTDIV16SI, - IX86_BUILTIN_GATHER3ALTSIV4DF, - IX86_BUILTIN_GATHER3ALTDIV8SF, - IX86_BUILTIN_GATHER3ALTSIV4DI, - IX86_BUILTIN_GATHER3ALTDIV8SI, - IX86_BUILTIN_GATHER3ALTSIV8DF, - IX86_BUILTIN_GATHER3ALTSIV8DI, - IX86_BUILTIN_GATHER3DIV16SF, - IX86_BUILTIN_GATHER3DIV16SI, - IX86_BUILTIN_GATHER3DIV8DF, - IX86_BUILTIN_GATHER3DIV8DI, - IX86_BUILTIN_GATHER3SIV16SF, - IX86_BUILTIN_GATHER3SIV16SI, - IX86_BUILTIN_GATHER3SIV8DF, - IX86_BUILTIN_GATHER3SIV8DI, - IX86_BUILTIN_SCATTERALTSIV8DF, - IX86_BUILTIN_SCATTERALTDIV16SF, - IX86_BUILTIN_SCATTERALTSIV8DI, - IX86_BUILTIN_SCATTERALTDIV16SI, - IX86_BUILTIN_SCATTERALTSIV4DF, - IX86_BUILTIN_SCATTERALTDIV8SF, - IX86_BUILTIN_SCATTERALTSIV4DI, - IX86_BUILTIN_SCATTERALTDIV8SI, - IX86_BUILTIN_SCATTERALTSIV2DF, - IX86_BUILTIN_SCATTERALTDIV4SF, - IX86_BUILTIN_SCATTERALTSIV2DI, - IX86_BUILTIN_SCATTERALTDIV4SI, - IX86_BUILTIN_SCATTERDIV16SF, - IX86_BUILTIN_SCATTERDIV16SI, - IX86_BUILTIN_SCATTERDIV8DF, - IX86_BUILTIN_SCATTERDIV8DI, - IX86_BUILTIN_SCATTERSIV16SF, - IX86_BUILTIN_SCATTERSIV16SI, - IX86_BUILTIN_SCATTERSIV8DF, - IX86_BUILTIN_SCATTERSIV8DI, - IX86_BUILTIN_GATHERPFQPD, - IX86_BUILTIN_GATHERPFDPS, - IX86_BUILTIN_GATHERPFDPD, - IX86_BUILTIN_GATHERPFQPS, - IX86_BUILTIN_SCATTERPFDPD, - IX86_BUILTIN_SCATTERPFDPS, - IX86_BUILTIN_SCATTERPFQPD, - IX86_BUILTIN_SCATTERPFQPS, - IX86_BUILTIN_CLWB, - IX86_BUILTIN_CLFLUSHOPT, - IX86_BUILTIN_INFQ, - IX86_BUILTIN_HUGE_VALQ, - IX86_BUILTIN_NANQ, - IX86_BUILTIN_NANSQ, - IX86_BUILTIN_XABORT, - IX86_BUILTIN_ADDCARRYX32, - IX86_BUILTIN_ADDCARRYX64, - IX86_BUILTIN_SBB32, - IX86_BUILTIN_SBB64, - IX86_BUILTIN_RDRAND16_STEP, - IX86_BUILTIN_RDRAND32_STEP, - IX86_BUILTIN_RDRAND64_STEP, - IX86_BUILTIN_RDSEED16_STEP, - IX86_BUILTIN_RDSEED32_STEP, - IX86_BUILTIN_RDSEED64_STEP, - IX86_BUILTIN_MONITORX, - IX86_BUILTIN_MWAITX, - IX86_BUILTIN_CFSTRING, - IX86_BUILTIN_CPU_INIT, - IX86_BUILTIN_CPU_IS, - IX86_BUILTIN_CPU_SUPPORTS, - IX86_BUILTIN_READ_FLAGS, - IX86_BUILTIN_WRITE_FLAGS, - - /* All the remaining builtins are tracked in bdesc_* arrays in - i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after - this point. */ -#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ - code, -#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ - code, \ - IX86_BUILTIN__BDESC_##kindu##_FIRST = code, -#define BDESC_END(kind, next_kind) - -#include "i386-builtin.def" - -#undef BDESC -#undef BDESC_FIRST -#undef BDESC_END - - IX86_BUILTIN_MAX, - - IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX, - - /* Now just the aliases for bdesc_* start/end. */ -#define BDESC(mask, mask2, icode, name, code, comparison, flag) -#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) -#define BDESC_END(kind, next_kind) \ - IX86_BUILTIN__BDESC_##kind##_LAST \ - = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1, - -#include "i386-builtin.def" - -#undef BDESC -#undef BDESC_FIRST -#undef BDESC_END - - /* Just to make sure there is no comma after the last enumerator. */ - IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST -}; - -/* Table for the ix86 builtin decls. */ -static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; - -/* Table of all of the builtin functions that are possible with different ISA's - but are waiting to be built until a function is declared to use that - ISA. 
*/ -struct builtin_isa { - HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */ - HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */ - const char *name; /* function name */ - enum ix86_builtin_func_type tcode; /* type to use in the declaration */ - unsigned char const_p:1; /* true if the declaration is constant */ - unsigned char pure_p:1; /* true if the declaration has pure attribute */ - bool set_and_not_built_p; -}; - -static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; - -/* Bits that can still enable any inclusion of a builtin. */ -static HOST_WIDE_INT deferred_isa_values = 0; -static HOST_WIDE_INT deferred_isa_values2 = 0; - -/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the - MASK and MASK2 of which isa_flags and ix86_isa_flags2 to use in the - ix86_builtins_isa array. Stores the function decl in the ix86_builtins - array. Returns the function decl or NULL_TREE, if the builtin was not - added. - - If the front end has a special hook for builtin functions, delay adding - builtin functions that aren't in the current ISA until the ISA is changed - with function specific optimization. Doing so, can save about 300K for the - default compiler. When the builtin is expanded, check at that time whether - it is valid. - - If the front end doesn't have a special hook, record all builtins, even if - it isn't an instruction set in the current ISA in case the user uses - function specific options for a different ISA, so that we don't get scope - errors if a builtin is added in the middle of a function scope. */ - -static inline tree -def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, - const char *name, - enum ix86_builtin_func_type tcode, - enum ix86_builtins code) -{ - tree decl = NULL_TREE; - - /* An instruction may be 64bit only regardless of ISAs. */ - if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT) - { - ix86_builtins_isa[(int) code].isa = mask; - ix86_builtins_isa[(int) code].isa2 = mask2; - - mask &= ~OPTION_MASK_ISA_64BIT; - - /* Filter out the masks most often ored together with others. */ - if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL) - && mask != OPTION_MASK_ISA_AVX512VL) - mask &= ~OPTION_MASK_ISA_AVX512VL; - if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW) - && mask != OPTION_MASK_ISA_AVX512BW) - mask &= ~OPTION_MASK_ISA_AVX512BW; - - if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0) - && (mask == 0 || (mask & ix86_isa_flags) != 0)) - || (lang_hooks.builtin_function - == lang_hooks.builtin_function_ext_scope)) - { - tree type = ix86_get_builtin_func_type (tcode); - decl = add_builtin_function (name, type, code, BUILT_IN_MD, - NULL, NULL_TREE); - ix86_builtins[(int) code] = decl; - ix86_builtins_isa[(int) code].set_and_not_built_p = false; - } - else - { - /* Just MASK and MASK2 where set_and_not_built_p == true can potentially - include a builtin. */ - deferred_isa_values |= mask; - deferred_isa_values2 |= mask2; - ix86_builtins[(int) code] = NULL_TREE; - ix86_builtins_isa[(int) code].tcode = tcode; - ix86_builtins_isa[(int) code].name = name; - ix86_builtins_isa[(int) code].const_p = false; - ix86_builtins_isa[(int) code].pure_p = false; - ix86_builtins_isa[(int) code].set_and_not_built_p = true; - } - } - - return decl; -} - -/* Like def_builtin, but also marks the function decl "const". 
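An illustrative consequence of the deferral that def_builtin describes above: a builtin whose ISA is not enabled on the command line only has to become available once some function selects that ISA with the target attribute, at which point ix86_add_new_builtins registers it. A small sketch, assuming an x86 GCC and no -mrdrnd on the command line (which front ends actually defer depends on the lang-hook check above); __builtin_ia32_rdrand32_step and its INT_FTYPE_PUNSIGNED signature are taken from the RDRND definitions further down:

__attribute__ ((target ("rdrnd")))
unsigned int
get_random (void)
{
  unsigned int value = 0;

  /* Returns nonzero on success and stores the random value through the
     pointer; retry until the hardware delivers a result.  */
  while (!__builtin_ia32_rdrand32_step (&value))
    ;
  return value;
}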
*/ - -static inline tree -def_builtin_const (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, - enum ix86_builtin_func_type tcode, enum ix86_builtins code) -{ - tree decl = def_builtin (mask, mask2, name, tcode, code); - if (decl) - TREE_READONLY (decl) = 1; - else - ix86_builtins_isa[(int) code].const_p = true; - - return decl; -} - -/* Like def_builtin, but also marks the function decl "pure". */ - -static inline tree -def_builtin_pure (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, - enum ix86_builtin_func_type tcode, enum ix86_builtins code) -{ - tree decl = def_builtin (mask, mask2, name, tcode, code); - if (decl) - DECL_PURE_P (decl) = 1; - else - ix86_builtins_isa[(int) code].pure_p = true; - - return decl; -} - -/* Add any new builtin functions for a given ISA that may not have been - declared. This saves a bit of space compared to adding all of the - declarations to the tree, even if we didn't use them. */ - -static void -ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2) -{ - isa &= ~OPTION_MASK_ISA_64BIT; - - if ((isa & deferred_isa_values) == 0 - && (isa2 & deferred_isa_values2) == 0) - return; - - /* Bits in ISA value can be removed from potential isa values. */ - deferred_isa_values &= ~isa; - deferred_isa_values2 &= ~isa2; - - int i; - tree saved_current_target_pragma = current_target_pragma; - current_target_pragma = NULL_TREE; - - for (i = 0; i < (int)IX86_BUILTIN_MAX; i++) - { - if (((ix86_builtins_isa[i].isa & isa) != 0 - || (ix86_builtins_isa[i].isa2 & isa2) != 0) - && ix86_builtins_isa[i].set_and_not_built_p) - { - tree decl, type; - - /* Don't define the builtin again. */ - ix86_builtins_isa[i].set_and_not_built_p = false; - - type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode); - decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name, - type, i, BUILT_IN_MD, NULL, - NULL_TREE); - - ix86_builtins[i] = decl; - if (ix86_builtins_isa[i].const_p) - TREE_READONLY (decl) = 1; - } - } - - current_target_pragma = saved_current_target_pragma; -} - -/* Bits for builtin_description.flag. */ - -/* Set when we don't support the comparison natively, and should - swap_comparison in order to support it. 
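What marking a builtin "const" (TREE_READONLY) via def_builtin_const buys is that the optimizers may treat the call as depending only on its arguments. A hedged sketch using __builtin_ia32_pclmulqdq128, which is declared through def_builtin_const further down with signature V2DI_FTYPE_V2DI_V2DI_INT; compile with -mpclmul on x86-64:

typedef long long v2di __attribute__ ((vector_size (16)));

v2di
twice (v2di a, v2di b)
{
  /* Because the builtin is const, GCC may CSE these two identical calls
     into a single pclmulqdq instruction at -O2.  */
  v2di x = __builtin_ia32_pclmulqdq128 (a, b, 0);
  v2di y = __builtin_ia32_pclmulqdq128 (a, b, 0);
  return x + y;
}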
*/ -#define BUILTIN_DESC_SWAP_OPERANDS 1 - -struct builtin_description -{ - const HOST_WIDE_INT mask; - const HOST_WIDE_INT mask2; - const enum insn_code icode; - const char *const name; - const enum ix86_builtins code; - const enum rtx_code comparison; - const int flag; -}; - -#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT -#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT -#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT -#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT -#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF -#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF -#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF -#define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF -#define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI -#define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI -#define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI -#define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI -#define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI -#define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI -#define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI -#define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI -#define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI -#define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI -#define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF -#define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF -#define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI -#define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI -#define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI -#define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI -#define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI -#define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI -#define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI -#define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI -#define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP -#define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP -#define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP -#define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP -#define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF -#define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF -#define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF -#define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF -#define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF -#define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF -#define MULTI_ARG_1_SF V4SF_FTYPE_V4SF -#define MULTI_ARG_1_DF V2DF_FTYPE_V2DF -#define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF -#define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF -#define MULTI_ARG_1_DI V2DI_FTYPE_V2DI -#define MULTI_ARG_1_SI V4SI_FTYPE_V4SI -#define MULTI_ARG_1_HI V8HI_FTYPE_V8HI -#define MULTI_ARG_1_QI V16QI_FTYPE_V16QI -#define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI -#define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI -#define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI -#define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI -#define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI -#define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI - -#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ - { mask, mask2, icode, name, code, comparison, flag }, -#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ -static const struct builtin_description bdesc_##kind[] = \ -{ \ - BDESC (mask, mask2, icode, name, code, comparison, flag) -#define BDESC_END(kind, next_kind) \ -}; - -#include "i386-builtin.def" - -#undef BDESC -#undef BDESC_FIRST -#undef BDESC_END - - -/* TM vector builtins. */ - -/* Reuse the existing x86-specific `struct builtin_description' cause - we're lazy. Add casts to make them fit. 
*/ -static const struct builtin_description bdesc_tm[] = -{ - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, 
"__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID }, -}; - -/* Initialize the transactional memory vector load/store builtins. */ - -static void -ix86_init_tm_builtins (void) -{ - enum ix86_builtin_func_type ftype; - const struct builtin_description *d; - size_t i; - tree decl; - tree attrs_load, attrs_type_load, attrs_store, attrs_type_store; - tree attrs_log, attrs_type_log; - - if (!flag_tm) - return; - - /* If there are no builtins defined, we must be compiling in a - language without trans-mem support. */ - if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1)) - return; - - /* Use whatever attributes a normal TM load has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1); - attrs_load = DECL_ATTRIBUTES (decl); - attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - /* Use whatever attributes a normal TM store has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1); - attrs_store = DECL_ATTRIBUTES (decl); - attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - /* Use whatever attributes a normal TM log has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_LOG); - attrs_log = DECL_ATTRIBUTES (decl); - attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - - for (i = 0, d = bdesc_tm; - i < ARRAY_SIZE (bdesc_tm); - i++, d++) - { - if ((d->mask & ix86_isa_flags) != 0 - || (lang_hooks.builtin_function - == lang_hooks.builtin_function_ext_scope)) - { - tree type, attrs, attrs_type; - enum built_in_function code = (enum built_in_function) d->code; - - ftype = (enum ix86_builtin_func_type) d->flag; - type = ix86_get_builtin_func_type (ftype); - - if (BUILTIN_TM_LOAD_P (code)) - { - attrs = attrs_load; - attrs_type = attrs_type_load; - } - else if (BUILTIN_TM_STORE_P (code)) - { - attrs = attrs_store; - attrs_type = attrs_type_store; - } - else - { - attrs = attrs_log; - attrs_type = attrs_type_log; - } - decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL, - /* The builtin without the prefix for - calling it directly. */ - d->name + strlen ("__builtin_"), - attrs); - /* add_builtin_function() will set the DECL_ATTRIBUTES, now - set the TYPE_ATTRIBUTES. */ - decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN); - - set_builtin_decl (code, decl, false); - } - } -} - -/* Macros for verification of enum ix86_builtins order. 
*/ -#define BDESC_VERIFY(x, y, z) \ - gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z))) -#define BDESC_VERIFYS(x, y, z) \ - STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z))) - -BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST, - IX86_BUILTIN__BDESC_COMI_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST, - IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, - IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, - IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, - IX86_BUILTIN__BDESC_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, - IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST, - IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, - IX86_BUILTIN__BDESC_CET_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN_MAX, - IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); - -/* Set up all the MMX/SSE builtins, even builtins for instructions that are not - in the current target ISA to allow the user to compile particular modules - with different target specific options that differ from the command line - options. */ -static void -ix86_init_mmx_sse_builtins (void) -{ - const struct builtin_description * d; - enum ix86_builtin_func_type ftype; - size_t i; - - /* Add all special builtins with variable number of operands. */ - for (i = 0, d = bdesc_special_args; - i < ARRAY_SIZE (bdesc_special_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, - ARRAY_SIZE (bdesc_special_args) - 1); - - /* Add all builtins with variable number of operands. */ - for (i = 0, d = bdesc_args; - i < ARRAY_SIZE (bdesc_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST, - IX86_BUILTIN__BDESC_ARGS_FIRST, - ARRAY_SIZE (bdesc_args) - 1); - - /* Add all builtins with rounding. */ - for (i = 0, d = bdesc_round_args; - i < ARRAY_SIZE (bdesc_round_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, - ARRAY_SIZE (bdesc_round_args) - 1); - - /* pcmpestr[im] insns. */ - for (i = 0, d = bdesc_pcmpestr; - i < ARRAY_SIZE (bdesc_pcmpestr); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i); - if (d->code == IX86_BUILTIN_PCMPESTRM128) - ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT; - else - ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST, - IX86_BUILTIN__BDESC_PCMPESTR_FIRST, - ARRAY_SIZE (bdesc_pcmpestr) - 1); - - /* pcmpistr[im] insns. 
*/ - for (i = 0, d = bdesc_pcmpistr; - i < ARRAY_SIZE (bdesc_pcmpistr); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i); - if (d->code == IX86_BUILTIN_PCMPISTRM128) - ftype = V16QI_FTYPE_V16QI_V16QI_INT; - else - ftype = INT_FTYPE_V16QI_V16QI_INT; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST, - IX86_BUILTIN__BDESC_PCMPISTR_FIRST, - ARRAY_SIZE (bdesc_pcmpistr) - 1); - - /* comi/ucomi insns. */ - for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i); - if (d->mask == OPTION_MASK_ISA_SSE2) - ftype = INT_FTYPE_V2DF_V2DF; - else - ftype = INT_FTYPE_V4SF_V4SF; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST, - IX86_BUILTIN__BDESC_COMI_FIRST, - ARRAY_SIZE (bdesc_comi) - 1); - - /* SSE */ - def_builtin (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_ldmxcsr", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR); - def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr", - UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR); - - /* SSE or 3DNow!A */ - def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. */ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR, - IX86_BUILTIN_MASKMOVQ); - - /* SSE2 */ - def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu", - VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU); - - def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_clflush", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH); - x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_mfence", - VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); - - /* SSE3. 
*/ - def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor", - VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); - def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait", - VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); - - /* AES */ - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesenc128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesenclast128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesdec128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesdeclast128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesimc128", - V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aeskeygenassist128", - V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128); - - /* PCLMUL */ - def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_pclmulqdq128", - V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128); - - /* RDRND */ - def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand16_step", - INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP); - def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand32_step", - INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP); - def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, - IX86_BUILTIN_RDRAND64_STEP); - - /* AVX2 */ - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2df", - V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, - IX86_BUILTIN_GATHERSIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4df", - V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, - IX86_BUILTIN_GATHERSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2df", - V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, - IX86_BUILTIN_GATHERDIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4df", - V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, - IX86_BUILTIN_GATHERDIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4sf", - V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, - IX86_BUILTIN_GATHERSIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8sf", - V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, - IX86_BUILTIN_GATHERSIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf", - V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, - IX86_BUILTIN_GATHERDIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf256", - V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, - IX86_BUILTIN_GATHERDIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2di", - V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, - IX86_BUILTIN_GATHERSIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4di", - V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, - IX86_BUILTIN_GATHERSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2di", - V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, - IX86_BUILTIN_GATHERDIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4di", - V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, - 
IX86_BUILTIN_GATHERDIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4si", - V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, - IX86_BUILTIN_GATHERSIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8si", - V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, - IX86_BUILTIN_GATHERSIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si", - V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, - IX86_BUILTIN_GATHERDIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si256", - V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, - IX86_BUILTIN_GATHERDIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4df ", - V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, - IX86_BUILTIN_GATHERALTSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8sf ", - V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, - IX86_BUILTIN_GATHERALTDIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4di ", - V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, - IX86_BUILTIN_GATHERALTSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8si ", - V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, - IX86_BUILTIN_GATHERALTDIV8SI); - - /* AVX512F */ - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16sf", - V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT, - IX86_BUILTIN_GATHER3SIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8df", - V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16sf", - V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8df", - V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16si", - V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT, - IX86_BUILTIN_GATHER3SIV16SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8di", - V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16si", - V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV16SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8di", - V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8df ", - V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16sf ", - V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, - IX86_BUILTIN_GATHER3ALTDIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8di ", - V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16si ", - V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, - IX86_BUILTIN_GATHER3ALTDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16sf", - VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT, - IX86_BUILTIN_SCATTERSIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8df", - VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT, - IX86_BUILTIN_SCATTERSIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16sf", - VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT, - 
IX86_BUILTIN_SCATTERDIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8df", - VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT, - IX86_BUILTIN_SCATTERDIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16si", - VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT, - IX86_BUILTIN_SCATTERSIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8di", - VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT, - IX86_BUILTIN_SCATTERSIV8DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16si", - VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT, - IX86_BUILTIN_SCATTERDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8di", - VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT, - IX86_BUILTIN_SCATTERDIV8DI); - - /* AVX512VL */ - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2df", - V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4df", - V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2df", - V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4df", - V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4sf", - V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8sf", - V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4sf", - V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8sf", - V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2di", - V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4di", - V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2di", - V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4di", - V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4si", - V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8si", - V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4si", - V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8si", - V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4df ", - V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8sf ", - V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT, - IX86_BUILTIN_GATHER3ALTDIV8SF); - - def_builtin_pure 
(OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4di ", - V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8si ", - V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT, - IX86_BUILTIN_GATHER3ALTDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8sf", - VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT, - IX86_BUILTIN_SCATTERSIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4sf", - VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT, - IX86_BUILTIN_SCATTERSIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4df", - VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT, - IX86_BUILTIN_SCATTERSIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2df", - VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT, - IX86_BUILTIN_SCATTERSIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8sf", - VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT, - IX86_BUILTIN_SCATTERDIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4sf", - VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT, - IX86_BUILTIN_SCATTERDIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4df", - VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT, - IX86_BUILTIN_SCATTERDIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2df", - VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT, - IX86_BUILTIN_SCATTERDIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8si", - VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT, - IX86_BUILTIN_SCATTERSIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4si", - VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT, - IX86_BUILTIN_SCATTERSIV4SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4di", - VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT, - IX86_BUILTIN_SCATTERSIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2di", - VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT, - IX86_BUILTIN_SCATTERSIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8si", - VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT, - IX86_BUILTIN_SCATTERDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4si", - VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT, - IX86_BUILTIN_SCATTERDIV4SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4di", - VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT, - IX86_BUILTIN_SCATTERDIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2di", - VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT, - IX86_BUILTIN_SCATTERDIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8df ", - VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT, - IX86_BUILTIN_SCATTERALTSIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16sf ", - VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT, - IX86_BUILTIN_SCATTERALTDIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8di ", - VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT, - IX86_BUILTIN_SCATTERALTSIV8DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16si ", - VOID_FTYPE_PINT_HI_V8DI_V16SI_INT, - IX86_BUILTIN_SCATTERALTDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4df ", - VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, - IX86_BUILTIN_SCATTERALTSIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8sf ", - VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, - 
IX86_BUILTIN_SCATTERALTDIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4di ", - VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, - IX86_BUILTIN_SCATTERALTSIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8si ", - VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, - IX86_BUILTIN_SCATTERALTDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2df ", - VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, - IX86_BUILTIN_SCATTERALTSIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4sf ", - VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, - IX86_BUILTIN_SCATTERALTDIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2di ", - VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, - IX86_BUILTIN_SCATTERALTSIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4si ", - VOID_FTYPE_PINT_QI_V2DI_V4SI_INT, - IX86_BUILTIN_SCATTERALTDIV4SI); - - /* AVX512PF */ - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdpd", - VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFDPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdps", - VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFDPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqpd", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFQPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqps", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFQPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdpd", - VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFDPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdps", - VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFDPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqpd", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFQPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqps", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFQPS); - - /* SHA */ - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg1", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg2", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1nexte", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1rnds4", - V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg1", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg2", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256rnds2", - V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2); - - /* RTM. */ - def_builtin (OPTION_MASK_ISA_RTM, 0, "__builtin_ia32_xabort", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT); - - /* MMX access to the vec_init patterns. 
*/ - def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v2si", - V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v4hi", - V4HI_FTYPE_HI_HI_HI_HI, - IX86_BUILTIN_VEC_INIT_V4HI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v8qi", - V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI, - IX86_BUILTIN_VEC_INIT_V8QI); - - /* Access to the vec_extract patterns. */ - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2df", - DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2di", - DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI); - def_builtin_const (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_vec_ext_v4sf", - FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v4si", - SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v8hi", - HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI); - - def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. */ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_ext_v4hi", - HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_ext_v2si", - SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI); - - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v16qi", - QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI); - - /* Access to the vec_set patterns. */ - def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_vec_set_v2di", - V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4sf", - V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4si", - V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI); - - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_set_v8hi", - V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI); - - def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. 
*/ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_set_v4hi", - V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v16qi", - V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI); - - /* RDSEED */ - def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_hi_step", - INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP); - def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_si_step", - INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP); - def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_rdseed_di_step", - INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP); - - /* ADCX */ - def_builtin (0, 0, "__builtin_ia32_addcarryx_u32", - UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32); - def_builtin (OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_addcarryx_u64", - UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, - IX86_BUILTIN_ADDCARRYX64); - - /* SBB */ - def_builtin (0, 0, "__builtin_ia32_sbb_u32", - UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32); - def_builtin (OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_sbb_u64", - UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, - IX86_BUILTIN_SBB64); - - /* Read/write FLAGS. */ - if (TARGET_64BIT) - { - def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_readeflags_u64", - UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); - def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_writeeflags_u64", - VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS); - } - else - { - def_builtin (0, 0, "__builtin_ia32_readeflags_u32", - UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); - def_builtin (0, 0, "__builtin_ia32_writeeflags_u32", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS); - } - - /* CLFLUSHOPT. */ - def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, 0, "__builtin_ia32_clflushopt", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT); - - /* CLWB. */ - def_builtin (OPTION_MASK_ISA_CLWB, 0, "__builtin_ia32_clwb", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB); - - /* MONITORX and MWAITX. */ - def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx", - VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX); - def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx", - VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX); - - /* CLZERO. */ - def_builtin (0, OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO); - - /* WAITPKG. */ - def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umonitor", - VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR); - def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umwait", - UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT); - def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_tpause", - UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE); - - /* CLDEMOTE. */ - def_builtin (0, OPTION_MASK_ISA_CLDEMOTE, "__builtin_ia32_cldemote", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE); - - /* Add FMA4 multi-arg argument instructions */ - for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST, - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, - ARRAY_SIZE (bdesc_multi_arg) - 1); - - /* Add CET inrinsics. 
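A usage sketch for one of the builtins registered above: __builtin_ia32_addcarryx_u32, declared with UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED (carry-in, a, b, pointer to the low result; the return value is the carry-out) and registered with mask 0, i.e. unconditionally available on x86:

#include <stdio.h>

int
main (void)
{
  unsigned int lo, hi;
  unsigned char carry;

  /* 0xffffffff + 1 overflows the low word; the carry-out feeds the
     high word of a two-word addition.  */
  carry = __builtin_ia32_addcarryx_u32 (0, 0xffffffffu, 1u, &lo);
  carry = __builtin_ia32_addcarryx_u32 (carry, 0u, 0u, &hi);

  printf ("lo=%u hi=%u carry=%u\n", lo, hi, carry);  /* lo=0 hi=1 carry=0 */
  return 0;
}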
*/ - for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, - IX86_BUILTIN__BDESC_CET_FIRST, - ARRAY_SIZE (bdesc_cet) - 1); - - for (i = 0, d = bdesc_cet_rdssp; - i < ARRAY_SIZE (bdesc_cet_rdssp); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, - ARRAY_SIZE (bdesc_cet_rdssp) - 1); -} - -#undef BDESC_VERIFY -#undef BDESC_VERIFYS - -/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL - to return a pointer to VERSION_DECL if the outcome of the expression - formed by PREDICATE_CHAIN is true. This function will be called during - version dispatch to decide which function version to execute. It returns - the basic block at the end, to which more conditions can be added. */ - -static basic_block -add_condition_to_bb (tree function_decl, tree version_decl, - tree predicate_chain, basic_block new_bb) -{ - gimple *return_stmt; - tree convert_expr, result_var; - gimple *convert_stmt; - gimple *call_cond_stmt; - gimple *if_else_stmt; - - basic_block bb1, bb2, bb3; - edge e12, e23; - - tree cond_var, and_expr_var = NULL_TREE; - gimple_seq gseq; - - tree predicate_decl, predicate_arg; - - push_cfun (DECL_STRUCT_FUNCTION (function_decl)); - - gcc_assert (new_bb != NULL); - gseq = bb_seq (new_bb); - - - convert_expr = build1 (CONVERT_EXPR, ptr_type_node, - build_fold_addr_expr (version_decl)); - result_var = create_tmp_var (ptr_type_node); - convert_stmt = gimple_build_assign (result_var, convert_expr); - return_stmt = gimple_build_return (result_var); - - if (predicate_chain == NULL_TREE) - { - gimple_seq_add_stmt (&gseq, convert_stmt); - gimple_seq_add_stmt (&gseq, return_stmt); - set_bb_seq (new_bb, gseq); - gimple_set_bb (convert_stmt, new_bb); - gimple_set_bb (return_stmt, new_bb); - pop_cfun (); - return new_bb; - } - - while (predicate_chain != NULL) - { - cond_var = create_tmp_var (integer_type_node); - predicate_decl = TREE_PURPOSE (predicate_chain); - predicate_arg = TREE_VALUE (predicate_chain); - call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg); - gimple_call_set_lhs (call_cond_stmt, cond_var); - - gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (call_cond_stmt, new_bb); - gimple_seq_add_stmt (&gseq, call_cond_stmt); - - predicate_chain = TREE_CHAIN (predicate_chain); - - if (and_expr_var == NULL) - and_expr_var = cond_var; - else - { - gimple *assign_stmt; - /* Use MIN_EXPR to check if any integer is zero?. 
- and_expr_var = min_expr */ - assign_stmt = gimple_build_assign (and_expr_var, - build2 (MIN_EXPR, integer_type_node, - cond_var, and_expr_var)); - - gimple_set_block (assign_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (assign_stmt, new_bb); - gimple_seq_add_stmt (&gseq, assign_stmt); - } - } - - if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, - integer_zero_node, - NULL_TREE, NULL_TREE); - gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (if_else_stmt, new_bb); - gimple_seq_add_stmt (&gseq, if_else_stmt); - - gimple_seq_add_stmt (&gseq, convert_stmt); - gimple_seq_add_stmt (&gseq, return_stmt); - set_bb_seq (new_bb, gseq); - - bb1 = new_bb; - e12 = split_block (bb1, if_else_stmt); - bb2 = e12->dest; - e12->flags &= ~EDGE_FALLTHRU; - e12->flags |= EDGE_TRUE_VALUE; - - e23 = split_block (bb2, return_stmt); - - gimple_set_bb (convert_stmt, bb2); - gimple_set_bb (return_stmt, bb2); - - bb3 = e23->dest; - make_edge (bb1, bb3, EDGE_FALSE_VALUE); - - remove_edge (e23); - make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); - - pop_cfun (); - - return bb3; -} - -/* Priority of i386 features, greater value is higher priority. This is - used to decide the order in which function dispatch must happen. For - instance, a version specialized for SSE4.2 should be checked for dispatch - before a version for SSE3, as SSE4.2 implies SSE3. */ -enum feature_priority -{ - P_ZERO = 0, - P_MMX, - P_SSE, - P_SSE2, - P_SSE3, - P_SSSE3, - P_PROC_SSSE3, - P_SSE4_A, - P_PROC_SSE4_A, - P_SSE4_1, - P_SSE4_2, - P_PROC_SSE4_2, - P_POPCNT, - P_AES, - P_PCLMUL, - P_AVX, - P_PROC_AVX, - P_BMI, - P_PROC_BMI, - P_FMA4, - P_XOP, - P_PROC_XOP, - P_FMA, - P_PROC_FMA, - P_BMI2, - P_AVX2, - P_PROC_AVX2, - P_AVX512F, - P_PROC_AVX512F -}; - -/* This is the order of bit-fields in __processor_features in cpuinfo.c */ -enum processor_features -{ - F_CMOV = 0, - F_MMX, - F_POPCNT, - F_SSE, - F_SSE2, - F_SSE3, - F_SSSE3, - F_SSE4_1, - F_SSE4_2, - F_AVX, - F_AVX2, - F_SSE4_A, - F_FMA4, - F_XOP, - F_FMA, - F_AVX512F, - F_BMI, - F_BMI2, - F_AES, - F_PCLMUL, - F_AVX512VL, - F_AVX512BW, - F_AVX512DQ, - F_AVX512CD, - F_AVX512ER, - F_AVX512PF, - F_AVX512VBMI, - F_AVX512IFMA, - F_AVX5124VNNIW, - F_AVX5124FMAPS, - F_AVX512VPOPCNTDQ, - F_AVX512VBMI2, - F_GFNI, - F_VPCLMULQDQ, - F_AVX512VNNI, - F_AVX512BITALG, - F_MAX -}; - -/* These are the values for vendor types and cpu types and subtypes - in cpuinfo.c. Cpu types and subtypes should be subtracted by - the corresponding start value. 
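One common way to exercise the dispatcher machinery above from C is target_clones: GCC emits one clone per listed ISA plus a resolver, and the resolver's checks are laid out in the priority order defined by feature_priority (avx2 before sse4.2 before default). A minimal sketch, assuming a GCC target with ifunc support:

__attribute__ ((target_clones ("avx2", "sse4.2", "default")))
int
sum (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += a[i];            /* each clone is vectorized as its ISA allows */
  return s;
}

int
use (const int *a, int n)
{
  /* The call goes through the run-time resolver whose conditions are
     built by add_condition_to_bb above.  */
  return sum (a, n);
}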
*/ -enum processor_model -{ - M_INTEL = 1, - M_AMD, - M_CPU_TYPE_START, - M_INTEL_BONNELL, - M_INTEL_CORE2, - M_INTEL_COREI7, - M_AMDFAM10H, - M_AMDFAM15H, - M_INTEL_SILVERMONT, - M_INTEL_KNL, - M_AMD_BTVER1, - M_AMD_BTVER2, - M_AMDFAM17H, - M_INTEL_KNM, - M_INTEL_GOLDMONT, - M_INTEL_GOLDMONT_PLUS, - M_INTEL_TREMONT, - M_CPU_SUBTYPE_START, - M_INTEL_COREI7_NEHALEM, - M_INTEL_COREI7_WESTMERE, - M_INTEL_COREI7_SANDYBRIDGE, - M_AMDFAM10H_BARCELONA, - M_AMDFAM10H_SHANGHAI, - M_AMDFAM10H_ISTANBUL, - M_AMDFAM15H_BDVER1, - M_AMDFAM15H_BDVER2, - M_AMDFAM15H_BDVER3, - M_AMDFAM15H_BDVER4, - M_AMDFAM17H_ZNVER1, - M_INTEL_COREI7_IVYBRIDGE, - M_INTEL_COREI7_HASWELL, - M_INTEL_COREI7_BROADWELL, - M_INTEL_COREI7_SKYLAKE, - M_INTEL_COREI7_SKYLAKE_AVX512, - M_INTEL_COREI7_CANNONLAKE, - M_INTEL_COREI7_ICELAKE_CLIENT, - M_INTEL_COREI7_ICELAKE_SERVER, - M_AMDFAM17H_ZNVER2, - M_INTEL_COREI7_CASCADELAKE -}; - -struct _arch_names_table -{ - const char *const name; - const enum processor_model model; -}; - -static const _arch_names_table arch_names_table[] = -{ - {"amd", M_AMD}, - {"intel", M_INTEL}, - {"atom", M_INTEL_BONNELL}, - {"slm", M_INTEL_SILVERMONT}, - {"core2", M_INTEL_CORE2}, - {"corei7", M_INTEL_COREI7}, - {"nehalem", M_INTEL_COREI7_NEHALEM}, - {"westmere", M_INTEL_COREI7_WESTMERE}, - {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE}, - {"ivybridge", M_INTEL_COREI7_IVYBRIDGE}, - {"haswell", M_INTEL_COREI7_HASWELL}, - {"broadwell", M_INTEL_COREI7_BROADWELL}, - {"skylake", M_INTEL_COREI7_SKYLAKE}, - {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512}, - {"cannonlake", M_INTEL_COREI7_CANNONLAKE}, - {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT}, - {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER}, - {"cascadelake", M_INTEL_COREI7_CASCADELAKE}, - {"bonnell", M_INTEL_BONNELL}, - {"silvermont", M_INTEL_SILVERMONT}, - {"goldmont", M_INTEL_GOLDMONT}, - {"goldmont-plus", M_INTEL_GOLDMONT_PLUS}, - {"tremont", M_INTEL_TREMONT}, - {"knl", M_INTEL_KNL}, - {"knm", M_INTEL_KNM}, - {"amdfam10h", M_AMDFAM10H}, - {"barcelona", M_AMDFAM10H_BARCELONA}, - {"shanghai", M_AMDFAM10H_SHANGHAI}, - {"istanbul", M_AMDFAM10H_ISTANBUL}, - {"btver1", M_AMD_BTVER1}, - {"amdfam15h", M_AMDFAM15H}, - {"bdver1", M_AMDFAM15H_BDVER1}, - {"bdver2", M_AMDFAM15H_BDVER2}, - {"bdver3", M_AMDFAM15H_BDVER3}, - {"bdver4", M_AMDFAM15H_BDVER4}, - {"btver2", M_AMD_BTVER2}, - {"amdfam17h", M_AMDFAM17H}, - {"znver1", M_AMDFAM17H_ZNVER1}, - {"znver2", M_AMDFAM17H_ZNVER2}, -}; - -/* These are the target attribute strings for which a dispatcher is - available, from fold_builtin_cpu. 
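For orientation, these strings (the arch names above together with the ISA names in the table that follows) are what appears inside the target attribute of a multi-versioned function. A small C++ usage sketch with an illustrative function name; the dispatcher built by the code below picks the best match at run time:

__attribute__ ((target ("default")))
int impl (void) { return 0; }

__attribute__ ((target ("avx2")))
int impl (void) { return 2; }

__attribute__ ((target ("arch=skylake-avx512")))
int impl (void) { return 3; }

int main () { return impl (); }  /* resolved through the generated dispatcher */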
*/ -struct _isa_names_table -{ - const char *const name; - const enum processor_features feature; - const enum feature_priority priority; -}; - -static const _isa_names_table isa_names_table[] = -{ - {"cmov", F_CMOV, P_ZERO}, - {"mmx", F_MMX, P_MMX}, - {"popcnt", F_POPCNT, P_POPCNT}, - {"sse", F_SSE, P_SSE}, - {"sse2", F_SSE2, P_SSE2}, - {"sse3", F_SSE3, P_SSE3}, - {"ssse3", F_SSSE3, P_SSSE3}, - {"sse4a", F_SSE4_A, P_SSE4_A}, - {"sse4.1", F_SSE4_1, P_SSE4_1}, - {"sse4.2", F_SSE4_2, P_SSE4_2}, - {"avx", F_AVX, P_AVX}, - {"fma4", F_FMA4, P_FMA4}, - {"xop", F_XOP, P_XOP}, - {"fma", F_FMA, P_FMA}, - {"avx2", F_AVX2, P_AVX2}, - {"avx512f", F_AVX512F, P_AVX512F}, - {"bmi", F_BMI, P_BMI}, - {"bmi2", F_BMI2, P_BMI2}, - {"aes", F_AES, P_AES}, - {"pclmul", F_PCLMUL, P_PCLMUL}, - {"avx512vl",F_AVX512VL, P_ZERO}, - {"avx512bw",F_AVX512BW, P_ZERO}, - {"avx512dq",F_AVX512DQ, P_ZERO}, - {"avx512cd",F_AVX512CD, P_ZERO}, - {"avx512er",F_AVX512ER, P_ZERO}, - {"avx512pf",F_AVX512PF, P_ZERO}, - {"avx512vbmi",F_AVX512VBMI, P_ZERO}, - {"avx512ifma",F_AVX512IFMA, P_ZERO}, - {"avx5124vnniw",F_AVX5124VNNIW, P_ZERO}, - {"avx5124fmaps",F_AVX5124FMAPS, P_ZERO}, - {"avx512vpopcntdq",F_AVX512VPOPCNTDQ, P_ZERO}, - {"avx512vbmi2", F_AVX512VBMI2, P_ZERO}, - {"gfni", F_GFNI, P_ZERO}, - {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, - {"avx512vnni", F_AVX512VNNI, P_ZERO}, - {"avx512bitalg", F_AVX512BITALG, P_ZERO} -}; - -/* This parses the attribute arguments to target in DECL and determines - the right builtin to use to match the platform specification. - It returns the priority value for this version decl. If PREDICATE_LIST - is not NULL, it stores the list of cpu features that need to be checked - before dispatching this function. */ - -static unsigned int -get_builtin_code_for_version (tree decl, tree *predicate_list) -{ - tree attrs; - struct cl_target_option cur_target; - tree target_node; - struct cl_target_option *new_target; - const char *arg_str = NULL; - const char *attrs_str = NULL; - char *tok_str = NULL; - char *token; - - enum feature_priority priority = P_ZERO; - - static unsigned int NUM_FEATURES - = sizeof (isa_names_table) / sizeof (_isa_names_table); - - unsigned int i; - - tree predicate_chain = NULL_TREE; - tree predicate_decl, predicate_arg; - - attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); - gcc_assert (attrs != NULL); - - attrs = TREE_VALUE (TREE_VALUE (attrs)); - - gcc_assert (TREE_CODE (attrs) == STRING_CST); - attrs_str = TREE_STRING_POINTER (attrs); - - /* Return priority zero for default function. */ - if (strcmp (attrs_str, "default") == 0) - return 0; - - /* Handle arch= if specified. For priority, set it to be 1 more than - the best instruction set the processor can handle. For instance, if - there is a version for atom and a version for ssse3 (the highest ISA - priority for atom), the atom version must be checked for dispatch - before the ssse3 version. 
*/ - if (strstr (attrs_str, "arch=") != NULL) - { - cl_target_option_save (&cur_target, &global_options); - target_node = ix86_valid_target_attribute_tree (attrs, &global_options, - &global_options_set); - - gcc_assert (target_node); - if (target_node == error_mark_node) - return 0; - new_target = TREE_TARGET_OPTION (target_node); - gcc_assert (new_target); - - if (new_target->arch_specified && new_target->arch > 0) - { - switch (new_target->arch) - { - case PROCESSOR_CORE2: - arg_str = "core2"; - priority = P_PROC_SSSE3; - break; - case PROCESSOR_NEHALEM: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_PCLMUL) - { - arg_str = "westmere"; - priority = P_PCLMUL; - } - else - { - /* We translate "arch=corei7" and "arch=nehalem" to - "corei7" so that it will be mapped to M_INTEL_COREI7 - as cpu type to cover all M_INTEL_COREI7_XXXs. */ - arg_str = "corei7"; - priority = P_PROC_SSE4_2; - } - break; - case PROCESSOR_SANDYBRIDGE: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C) - arg_str = "ivybridge"; - else - arg_str = "sandybridge"; - priority = P_PROC_AVX; - break; - case PROCESSOR_HASWELL: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX) - arg_str = "broadwell"; - else - arg_str = "haswell"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_SKYLAKE: - arg_str = "skylake"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_SKYLAKE_AVX512: - arg_str = "skylake-avx512"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_CANNONLAKE: - arg_str = "cannonlake"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_ICELAKE_CLIENT: - arg_str = "icelake-client"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_ICELAKE_SERVER: - arg_str = "icelake-server"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_CASCADELAKE: - arg_str = "cascadelake"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_BONNELL: - arg_str = "bonnell"; - priority = P_PROC_SSSE3; - break; - case PROCESSOR_KNL: - arg_str = "knl"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_KNM: - arg_str = "knm"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_SILVERMONT: - arg_str = "silvermont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_GOLDMONT: - arg_str = "goldmont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_GOLDMONT_PLUS: - arg_str = "goldmont-plus"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_TREMONT: - arg_str = "tremont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_AMDFAM10: - arg_str = "amdfam10h"; - priority = P_PROC_SSE4_A; - break; - case PROCESSOR_BTVER1: - arg_str = "btver1"; - priority = P_PROC_SSE4_A; - break; - case PROCESSOR_BTVER2: - arg_str = "btver2"; - priority = P_PROC_BMI; - break; - case PROCESSOR_BDVER1: - arg_str = "bdver1"; - priority = P_PROC_XOP; - break; - case PROCESSOR_BDVER2: - arg_str = "bdver2"; - priority = P_PROC_FMA; - break; - case PROCESSOR_BDVER3: - arg_str = "bdver3"; - priority = P_PROC_FMA; - break; - case PROCESSOR_BDVER4: - arg_str = "bdver4"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_ZNVER1: - arg_str = "znver1"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_ZNVER2: - arg_str = "znver2"; - priority = P_PROC_AVX2; - break; - } - } - - cl_target_option_restore (&global_options, &cur_target); - - if (predicate_list && arg_str == NULL) - { - error_at (DECL_SOURCE_LOCATION (decl), - "no dispatcher found for the versioning attributes"); - return 0; - } - - if (predicate_list) - { - predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS]; - /* For a C string literal the length 
includes the trailing NULL. */ - predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str); - predicate_chain = tree_cons (predicate_decl, predicate_arg, - predicate_chain); - } - } - - /* Process feature name. */ - tok_str = (char *) xmalloc (strlen (attrs_str) + 1); - strcpy (tok_str, attrs_str); - token = strtok (tok_str, ","); - predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS]; - - while (token != NULL) - { - /* Do not process "arch=" */ - if (strncmp (token, "arch=", 5) == 0) - { - token = strtok (NULL, ","); - continue; - } - for (i = 0; i < NUM_FEATURES; ++i) - { - if (strcmp (token, isa_names_table[i].name) == 0) - { - if (predicate_list) - { - predicate_arg = build_string_literal ( - strlen (isa_names_table[i].name) + 1, - isa_names_table[i].name); - predicate_chain = tree_cons (predicate_decl, predicate_arg, - predicate_chain); - } - /* Find the maximum priority feature. */ - if (isa_names_table[i].priority > priority) - priority = isa_names_table[i].priority; - - break; - } - } - if (predicate_list && priority == P_ZERO) - { - error_at (DECL_SOURCE_LOCATION (decl), - "ISA %qs is not supported in % attribute, " - "use % syntax", token); - return 0; - } - token = strtok (NULL, ","); - } - free (tok_str); - - if (predicate_list && predicate_chain == NULL_TREE) - { - error_at (DECL_SOURCE_LOCATION (decl), - "no dispatcher found for the versioning attributes: %s", - attrs_str); - return 0; - } - else if (predicate_list) - { - predicate_chain = nreverse (predicate_chain); - *predicate_list = predicate_chain; - } - - return priority; -} - -/* This compares the priority of target features in function DECL1 - and DECL2. It returns positive value if DECL1 is higher priority, - negative value if DECL2 is higher priority and 0 if they are the - same. */ - -static int -ix86_compare_version_priority (tree decl1, tree decl2) -{ - unsigned int priority1 = get_builtin_code_for_version (decl1, NULL); - unsigned int priority2 = get_builtin_code_for_version (decl2, NULL); - - return (int)priority1 - (int)priority2; -} - -/* V1 and V2 point to function versions with different priorities - based on the target ISA. This function compares their priorities. */ - -static int -feature_compare (const void *v1, const void *v2) -{ - typedef struct _function_version_info - { - tree version_decl; - tree predicate_chain; - unsigned int dispatch_priority; - } function_version_info; - - const function_version_info c1 = *(const function_version_info *)v1; - const function_version_info c2 = *(const function_version_info *)v2; - return (c2.dispatch_priority - c1.dispatch_priority); -} - -/* This function generates the dispatch function for - multi-versioned functions. DISPATCH_DECL is the function which will - contain the dispatch logic. FNDECLS are the function choices for - dispatch, and is a tree chain. EMPTY_BB is the basic block pointer - in DISPATCH_DECL in which the dispatch code is generated. */ - -static int -dispatch_function_versions (tree dispatch_decl, - void *fndecls_p, - basic_block *empty_bb) -{ - tree default_decl; - gimple *ifunc_cpu_init_stmt; - gimple_seq gseq; - int ix; - tree ele; - vec *fndecls; - unsigned int num_versions = 0; - unsigned int actual_versions = 0; - unsigned int i; - - struct _function_version_info - { - tree version_decl; - tree predicate_chain; - unsigned int dispatch_priority; - }*function_version_info; - - gcc_assert (dispatch_decl != NULL - && fndecls_p != NULL - && empty_bb != NULL); - - /*fndecls_p is actually a vector. 
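The token loop in get_builtin_code_for_version above turns a comma-separated target string into one predicate per recognized feature plus a single priority. A hedged sketch of what a composite version such as target("avx2,fma") amounts to at run time; the function name is illustrative:

static int
matches_avx2_fma_version (void)
{
  /* One __builtin_cpu_supports test per feature token; the version's
     dispatch priority is the highest P_* value seen, P_AVX2 here.  */
  return __builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("fma");
}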
*/ - fndecls = static_cast *> (fndecls_p); - - /* At least one more version other than the default. */ - num_versions = fndecls->length (); - gcc_assert (num_versions >= 2); - - function_version_info = (struct _function_version_info *) - XNEWVEC (struct _function_version_info, (num_versions - 1)); - - /* The first version in the vector is the default decl. */ - default_decl = (*fndecls)[0]; - - push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl)); - - gseq = bb_seq (*empty_bb); - /* Function version dispatch is via IFUNC. IFUNC resolvers fire before - constructors, so explicity call __builtin_cpu_init here. */ - ifunc_cpu_init_stmt = gimple_build_call_vec ( - ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL); - gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt); - gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb); - set_bb_seq (*empty_bb, gseq); - - pop_cfun (); - - - for (ix = 1; fndecls->iterate (ix, &ele); ++ix) - { - tree version_decl = ele; - tree predicate_chain = NULL_TREE; - unsigned int priority; - /* Get attribute string, parse it and find the right predicate decl. - The predicate function could be a lengthy combination of many - features, like arch-type and various isa-variants. */ - priority = get_builtin_code_for_version (version_decl, - &predicate_chain); - - if (predicate_chain == NULL_TREE) - continue; - - function_version_info [actual_versions].version_decl = version_decl; - function_version_info [actual_versions].predicate_chain - = predicate_chain; - function_version_info [actual_versions].dispatch_priority = priority; - actual_versions++; - } - - /* Sort the versions according to descending order of dispatch priority. The - priority is based on the ISA. This is not a perfect solution. There - could still be ambiguity. If more than one function version is suitable - to execute, which one should be dispatched? In future, allow the user - to specify a dispatch priority next to the version. */ - qsort (function_version_info, actual_versions, - sizeof (struct _function_version_info), feature_compare); - - for (i = 0; i < actual_versions; ++i) - *empty_bb = add_condition_to_bb (dispatch_decl, - function_version_info[i].version_decl, - function_version_info[i].predicate_chain, - *empty_bb); - - /* dispatch default version at the end. */ - *empty_bb = add_condition_to_bb (dispatch_decl, default_decl, - NULL, *empty_bb); - - free (function_version_info); - return 0; -} - -/* This function changes the assembler name for functions that are - versions. If DECL is a function version and has a "target" - attribute, it appends the attribute string to its assembler name. */ - -static tree -ix86_mangle_function_version_assembler_name (tree decl, tree id) -{ - tree version_attr; - const char *orig_name, *version_string; - char *attr_str, *assembler_name; - - if (DECL_DECLARED_INLINE_P (decl) - && lookup_attribute ("gnu_inline", - DECL_ATTRIBUTES (decl))) - error_at (DECL_SOURCE_LOCATION (decl), - "function versions cannot be marked as gnu_inline," - " bodies have to be generated"); - - if (DECL_VIRTUAL_P (decl) - || DECL_VINDEX (decl)) - sorry ("virtual function multiversioning not supported"); - - version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); - - /* target attribute string cannot be NULL. 
*/ - gcc_assert (version_attr != NULL_TREE); - - orig_name = IDENTIFIER_POINTER (id); - version_string - = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr))); - - if (strcmp (version_string, "default") == 0) - return id; - - attr_str = sorted_attr_string (TREE_VALUE (version_attr)); - assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2); - - sprintf (assembler_name, "%s.%s", orig_name, attr_str); - - /* Allow assembler name to be modified if already set. */ - if (DECL_ASSEMBLER_NAME_SET_P (decl)) - SET_DECL_RTL (decl, NULL); - - tree ret = get_identifier (assembler_name); - XDELETEVEC (attr_str); - XDELETEVEC (assembler_name); - return ret; -} - - -static tree -ix86_mangle_decl_assembler_name (tree decl, tree id) -{ - /* For function version, add the target suffix to the assembler name. */ - if (TREE_CODE (decl) == FUNCTION_DECL - && DECL_FUNCTION_VERSIONED (decl)) - id = ix86_mangle_function_version_assembler_name (decl, id); -#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME - id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id); -#endif - - return id; -} - -/* Make a dispatcher declaration for the multi-versioned function DECL. - Calls to DECL function will be replaced with calls to the dispatcher - by the front-end. Returns the decl of the dispatcher function. */ - -static tree -ix86_get_function_versions_dispatcher (void *decl) -{ - tree fn = (tree) decl; - struct cgraph_node *node = NULL; - struct cgraph_node *default_node = NULL; - struct cgraph_function_version_info *node_v = NULL; - struct cgraph_function_version_info *first_v = NULL; - - tree dispatch_decl = NULL; - - struct cgraph_function_version_info *default_version_info = NULL; - - gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn)); - - node = cgraph_node::get (fn); - gcc_assert (node != NULL); - - node_v = node->function_version (); - gcc_assert (node_v != NULL); - - if (node_v->dispatcher_resolver != NULL) - return node_v->dispatcher_resolver; - - /* Find the default version and make it the first node. */ - first_v = node_v; - /* Go to the beginning of the chain. */ - while (first_v->prev != NULL) - first_v = first_v->prev; - default_version_info = first_v; - while (default_version_info != NULL) - { - if (is_function_default_version - (default_version_info->this_node->decl)) - break; - default_version_info = default_version_info->next; - } - - /* If there is no default node, just return NULL. */ - if (default_version_info == NULL) - return NULL; - - /* Make default info the first node. */ - if (first_v != default_version_info) - { - default_version_info->prev->next = default_version_info->next; - if (default_version_info->next) - default_version_info->next->prev = default_version_info->prev; - first_v->prev = default_version_info; - default_version_info->next = first_v; - default_version_info->prev = NULL; - } - - default_node = default_version_info->this_node; - -#if defined (ASM_OUTPUT_TYPE_DIRECTIVE) - if (targetm.has_ifunc_p ()) - { - struct cgraph_function_version_info *it_v = NULL; - struct cgraph_node *dispatcher_node = NULL; - struct cgraph_function_version_info *dispatcher_version_info = NULL; - - /* Right now, the dispatching is done via ifunc. 
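Since dispatch goes through IFUNC, the generated resolver is morally the same as a hand-written GNU ifunc. A sketch under that assumption, with illustrative symbol names (the real names are produced by the mangling and resolver-creation code in this file):

extern "C" int foo_scalar (void) { return 0; }
extern "C" int foo_avx2 (void)   { return 1; }

extern "C" int (*foo_resolver (void)) (void)
{
  /* IFUNC resolvers run before ordinary constructors, hence the
     explicit __builtin_cpu_init call, mirroring dispatch_function_versions.  */
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx2") ? foo_avx2 : foo_scalar;
}

extern "C" int foo (void) __attribute__ ((ifunc ("foo_resolver")));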
*/ - dispatch_decl = make_dispatcher_decl (default_node->decl); - - dispatcher_node = cgraph_node::get_create (dispatch_decl); - gcc_assert (dispatcher_node != NULL); - dispatcher_node->dispatcher_function = 1; - dispatcher_version_info - = dispatcher_node->insert_new_function_version (); - dispatcher_version_info->next = default_version_info; - dispatcher_node->definition = 1; - - /* Set the dispatcher for all the versions. */ - it_v = default_version_info; - while (it_v != NULL) - { - it_v->dispatcher_resolver = dispatch_decl; - it_v = it_v->next; - } - } - else -#endif - { - error_at (DECL_SOURCE_LOCATION (default_node->decl), - "multiversioning needs ifunc which is not supported " - "on this target"); - } - - return dispatch_decl; -} - -/* Make the resolver function decl to dispatch the versions of - a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is - ifunc alias that will point to the created resolver. Create an - empty basic block in the resolver and store the pointer in - EMPTY_BB. Return the decl of the resolver function. */ - -static tree -make_resolver_func (const tree default_decl, - const tree ifunc_alias_decl, - basic_block *empty_bb) -{ - char *resolver_name; - tree decl, type, decl_name, t; - - /* IFUNC's have to be globally visible. So, if the default_decl is - not, then the name of the IFUNC should be made unique. */ - if (TREE_PUBLIC (default_decl) == 0) - { - char *ifunc_name = make_unique_name (default_decl, "ifunc", true); - symtab->change_decl_assembler_name (ifunc_alias_decl, - get_identifier (ifunc_name)); - XDELETEVEC (ifunc_name); - } - - resolver_name = make_unique_name (default_decl, "resolver", false); - - /* The resolver function should return a (void *). */ - type = build_function_type_list (ptr_type_node, NULL_TREE); - - decl = build_fn_decl (resolver_name, type); - decl_name = get_identifier (resolver_name); - SET_DECL_ASSEMBLER_NAME (decl, decl_name); - - DECL_NAME (decl) = decl_name; - TREE_USED (decl) = 1; - DECL_ARTIFICIAL (decl) = 1; - DECL_IGNORED_P (decl) = 1; - TREE_PUBLIC (decl) = 0; - DECL_UNINLINABLE (decl) = 1; - - /* Resolver is not external, body is generated. */ - DECL_EXTERNAL (decl) = 0; - DECL_EXTERNAL (ifunc_alias_decl) = 0; - - DECL_CONTEXT (decl) = NULL_TREE; - DECL_INITIAL (decl) = make_node (BLOCK); - DECL_STATIC_CONSTRUCTOR (decl) = 0; - - if (DECL_COMDAT_GROUP (default_decl) - || TREE_PUBLIC (default_decl)) - { - /* In this case, each translation unit with a call to this - versioned function will put out a resolver. Ensure it - is comdat to keep just one copy. */ - DECL_COMDAT (decl) = 1; - make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); - } - /* Build result decl and add to function_decl. */ - t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node); - DECL_CONTEXT (t) = decl; - DECL_ARTIFICIAL (t) = 1; - DECL_IGNORED_P (t) = 1; - DECL_RESULT (decl) = t; - - gimplify_function_tree (decl); - push_cfun (DECL_STRUCT_FUNCTION (decl)); - *empty_bb = init_lowered_empty_function (decl, false, - profile_count::uninitialized ()); - - cgraph_node::add_new_function (decl, true); - symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl)); - - pop_cfun (); - - gcc_assert (ifunc_alias_decl != NULL); - /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */ - DECL_ATTRIBUTES (ifunc_alias_decl) - = make_attribute ("ifunc", resolver_name, - DECL_ATTRIBUTES (ifunc_alias_decl)); - - /* Create the alias for dispatch to resolver here. 
*/ - cgraph_node::create_same_body_alias (ifunc_alias_decl, decl); - XDELETEVEC (resolver_name); - return decl; -} - -/* Generate the dispatching code body to dispatch multi-versioned function - DECL. The target hook is called to process the "target" attributes and - provide the code to dispatch the right function at run-time. NODE points - to the dispatcher decl whose body will be created. */ - -static tree -ix86_generate_version_dispatcher_body (void *node_p) -{ - tree resolver_decl; - basic_block empty_bb; - tree default_ver_decl; - struct cgraph_node *versn; - struct cgraph_node *node; - - struct cgraph_function_version_info *node_version_info = NULL; - struct cgraph_function_version_info *versn_info = NULL; - - node = (cgraph_node *)node_p; - - node_version_info = node->function_version (); - gcc_assert (node->dispatcher_function - && node_version_info != NULL); - - if (node_version_info->dispatcher_resolver) - return node_version_info->dispatcher_resolver; - - /* The first version in the chain corresponds to the default version. */ - default_ver_decl = node_version_info->next->this_node->decl; - - /* node is going to be an alias, so remove the finalized bit. */ - node->definition = false; - - resolver_decl = make_resolver_func (default_ver_decl, - node->decl, &empty_bb); - - node_version_info->dispatcher_resolver = resolver_decl; - - push_cfun (DECL_STRUCT_FUNCTION (resolver_decl)); - - auto_vec fn_ver_vec; - - for (versn_info = node_version_info->next; versn_info; - versn_info = versn_info->next) - { - versn = versn_info->this_node; - /* Check for virtual functions here again, as by this time it should - have been determined if this function needs a vtable index or - not. This happens for methods in derived classes that override - virtual methods in base classes but are not explicitly marked as - virtual. */ - if (DECL_VINDEX (versn->decl)) - sorry ("virtual function multiversioning not supported"); - - fn_ver_vec.safe_push (versn->decl); - } - - dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb); - cgraph_edge::rebuild_edges (); - pop_cfun (); - return resolver_decl; -} -/* This builds the processor_model struct type defined in - libgcc/config/i386/cpuinfo.c */ - -static tree -build_processor_model_struct (void) -{ - const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype", - "__cpu_features"}; - tree field = NULL_TREE, field_chain = NULL_TREE; - int i; - tree type = make_node (RECORD_TYPE); - - /* The first 3 fields are unsigned int. */ - for (i = 0; i < 3; ++i) - { - field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, - get_identifier (field_name[i]), unsigned_type_node); - if (field_chain != NULL_TREE) - DECL_CHAIN (field) = field_chain; - field_chain = field; - } - - /* The last field is an array of unsigned integers of size one. */ - field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, - get_identifier (field_name[3]), - build_array_type (unsigned_type_node, - build_index_type (size_one_node))); - if (field_chain != NULL_TREE) - DECL_CHAIN (field) = field_chain; - field_chain = field; - - finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE); - return type; -} - -/* Returns a extern, comdat VAR_DECL of type TYPE and name NAME. 
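build_processor_model_struct above rebuilds, field for field, the structure that libgcc exposes. A sketch of the source-level declaration it corresponds to, taken from the field list in that function rather than re-checked against cpuinfo.c:

struct __processor_model
{
  unsigned int __cpu_vendor;
  unsigned int __cpu_type;
  unsigned int __cpu_subtype;
  unsigned int __cpu_features[1];
};
extern "C" struct __processor_model __cpu_model;  /* referenced via make_var_decl below */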
*/ - -static tree -make_var_decl (tree type, const char *name) -{ - tree new_decl; - - new_decl = build_decl (UNKNOWN_LOCATION, - VAR_DECL, - get_identifier(name), - type); - - DECL_EXTERNAL (new_decl) = 1; - TREE_STATIC (new_decl) = 1; - TREE_PUBLIC (new_decl) = 1; - DECL_INITIAL (new_decl) = 0; - DECL_ARTIFICIAL (new_decl) = 0; - DECL_PRESERVE_P (new_decl) = 1; - - make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl)); - assemble_variable (new_decl, 0, 0, 0); - - return new_decl; -} - -/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded - into an integer defined in libgcc/config/i386/cpuinfo.c */ - -static tree -fold_builtin_cpu (tree fndecl, tree *args) -{ - unsigned int i; - enum ix86_builtins fn_code = (enum ix86_builtins) - DECL_FUNCTION_CODE (fndecl); - tree param_string_cst = NULL; - - tree __processor_model_type = build_processor_model_struct (); - tree __cpu_model_var = make_var_decl (__processor_model_type, - "__cpu_model"); - - - varpool_node::add (__cpu_model_var); - - gcc_assert ((args != NULL) && (*args != NULL)); - - param_string_cst = *args; - while (param_string_cst - && TREE_CODE (param_string_cst) != STRING_CST) - { - /* *args must be a expr that can contain other EXPRS leading to a - STRING_CST. */ - if (!EXPR_P (param_string_cst)) - { - error ("parameter to builtin must be a string constant or literal"); - return integer_zero_node; - } - param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0); - } - - gcc_assert (param_string_cst); - - if (fn_code == IX86_BUILTIN_CPU_IS) - { - tree ref; - tree field; - tree final; - - unsigned int field_val = 0; - unsigned int NUM_ARCH_NAMES - = sizeof (arch_names_table) / sizeof (struct _arch_names_table); - - for (i = 0; i < NUM_ARCH_NAMES; i++) - if (strcmp (arch_names_table[i].name, - TREE_STRING_POINTER (param_string_cst)) == 0) - break; - - if (i == NUM_ARCH_NAMES) - { - error ("parameter to builtin not valid: %s", - TREE_STRING_POINTER (param_string_cst)); - return integer_zero_node; - } - - field = TYPE_FIELDS (__processor_model_type); - field_val = arch_names_table[i].model; - - /* CPU types are stored in the next field. */ - if (field_val > M_CPU_TYPE_START - && field_val < M_CPU_SUBTYPE_START) - { - field = DECL_CHAIN (field); - field_val -= M_CPU_TYPE_START; - } - - /* CPU subtypes are stored in the next field. */ - if (field_val > M_CPU_SUBTYPE_START) - { - field = DECL_CHAIN ( DECL_CHAIN (field)); - field_val -= M_CPU_SUBTYPE_START; - } - - /* Get the appropriate field in __cpu_model. */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, - field, NULL_TREE); - - /* Check the value. 
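At the source level the CPU_IS path above behaves as follows; a minimal example, with the fold stated informally in the comment (matching the type/subtype offset handling a few lines up):

int
on_a_corei7_class_cpu (void)
{
  /* Folds to a compare of __cpu_model.__cpu_type against
     M_INTEL_COREI7 - M_CPU_TYPE_START; a subtype name such as
     "skylake" would test __cpu_model.__cpu_subtype instead.  */
  return __builtin_cpu_is ("corei7");
}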
*/ - final = build2 (EQ_EXPR, unsigned_type_node, ref, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS) - { - tree ref; - tree array_elt; - tree field; - tree final; - - unsigned int field_val = 0; - unsigned int NUM_ISA_NAMES - = sizeof (isa_names_table) / sizeof (struct _isa_names_table); - - for (i = 0; i < NUM_ISA_NAMES; i++) - if (strcmp (isa_names_table[i].name, - TREE_STRING_POINTER (param_string_cst)) == 0) - break; - - if (i == NUM_ISA_NAMES) - { - error ("parameter to builtin not valid: %s", - TREE_STRING_POINTER (param_string_cst)); - return integer_zero_node; - } - - if (isa_names_table[i].feature >= 32) - { - tree __cpu_features2_var = make_var_decl (unsigned_type_node, - "__cpu_features2"); - - varpool_node::add (__cpu_features2_var); - field_val = (1U << (isa_names_table[i].feature - 32)); - /* Return __cpu_features2 & field_val */ - final = build2 (BIT_AND_EXPR, unsigned_type_node, - __cpu_features2_var, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - - field = TYPE_FIELDS (__processor_model_type); - /* Get the last field, which is __cpu_features. */ - while (DECL_CHAIN (field)) - field = DECL_CHAIN (field); - - /* Get the appropriate field: __cpu_model.__cpu_features */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, - field, NULL_TREE); - - /* Access the 0th element of __cpu_features array. */ - array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, - integer_zero_node, NULL_TREE, NULL_TREE); - - field_val = (1U << isa_names_table[i].feature); - /* Return __cpu_model.__cpu_features[0] & field_val */ - final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - gcc_unreachable (); -} - -/* Return the shift count of a vector by scalar shift builtin second argument - ARG1. */ -static tree -ix86_vector_shift_count (tree arg1) -{ - if (tree_fits_uhwi_p (arg1)) - return arg1; - else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8) - { - /* The count argument is weird, passed in as various 128-bit - (or 64-bit) vectors, the low 64 bits from it are the count. 
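This is the convention used by the vector-by-scalar shift intrinsics: the count is handed over as a 128-bit (or 64-bit) vector whose low 64 bits hold the shift amount. A small SSE2 example of a constant count that the folding code here can pick apart:

#include <emmintrin.h>

__m128i
shift_left_by_3 (__m128i x)
{
  /* Only the low 64 bits of the second operand are used as the count.  */
  return _mm_sll_epi32 (x, _mm_cvtsi32_si128 (3));
}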
*/ - unsigned char buf[16]; - int len = native_encode_expr (arg1, buf, 16); - if (len == 0) - return NULL_TREE; - tree t = native_interpret_expr (uint64_type_node, buf, len); - if (t && tree_fits_uhwi_p (t)) - return t; - } - return NULL_TREE; -} - -static tree -ix86_fold_builtin (tree fndecl, int n_args, - tree *args, bool ignore ATTRIBUTE_UNUSED) -{ - if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) - { - enum ix86_builtins fn_code = (enum ix86_builtins) - DECL_FUNCTION_CODE (fndecl); - enum rtx_code rcode; - bool is_vshift; - unsigned HOST_WIDE_INT mask; - - switch (fn_code) - { - case IX86_BUILTIN_CPU_IS: - case IX86_BUILTIN_CPU_SUPPORTS: - gcc_assert (n_args == 1); - return fold_builtin_cpu (fndecl, args); - - case IX86_BUILTIN_NANQ: - case IX86_BUILTIN_NANSQ: - { - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - const char *str = c_getstr (*args); - int quiet = fn_code == IX86_BUILTIN_NANQ; - REAL_VALUE_TYPE real; - - if (str && real_nan (&real, str, quiet, TYPE_MODE (type))) - return build_real (type, real); - return NULL_TREE; - } - - case IX86_BUILTIN_INFQ: - case IX86_BUILTIN_HUGE_VALQ: - { - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - REAL_VALUE_TYPE inf; - real_inf (&inf); - return build_real (type, inf); - } - - case IX86_BUILTIN_TZCNT16: - case IX86_BUILTIN_CTZS: - case IX86_BUILTIN_TZCNT32: - case IX86_BUILTIN_TZCNT64: - gcc_assert (n_args == 1); - if (TREE_CODE (args[0]) == INTEGER_CST) - { - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - tree arg = args[0]; - if (fn_code == IX86_BUILTIN_TZCNT16 - || fn_code == IX86_BUILTIN_CTZS) - arg = fold_convert (short_unsigned_type_node, arg); - if (integer_zerop (arg)) - return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); - else - return fold_const_call (CFN_CTZ, type, arg); - } - break; - - case IX86_BUILTIN_LZCNT16: - case IX86_BUILTIN_CLZS: - case IX86_BUILTIN_LZCNT32: - case IX86_BUILTIN_LZCNT64: - gcc_assert (n_args == 1); - if (TREE_CODE (args[0]) == INTEGER_CST) - { - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - tree arg = args[0]; - if (fn_code == IX86_BUILTIN_LZCNT16 - || fn_code == IX86_BUILTIN_CLZS) - arg = fold_convert (short_unsigned_type_node, arg); - if (integer_zerop (arg)) - return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); - else - return fold_const_call (CFN_CLZ, type, arg); - } - break; - - case IX86_BUILTIN_BEXTR32: - case IX86_BUILTIN_BEXTR64: - case IX86_BUILTIN_BEXTRI32: - case IX86_BUILTIN_BEXTRI64: - gcc_assert (n_args == 2); - if (tree_fits_uhwi_p (args[1])) - { - unsigned HOST_WIDE_INT res = 0; - unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0])); - unsigned int start = tree_to_uhwi (args[1]); - unsigned int len = (start & 0xff00) >> 8; - start &= 0xff; - if (start >= prec || len == 0) - res = 0; - else if (!tree_fits_uhwi_p (args[0])) - break; - else - res = tree_to_uhwi (args[0]) >> start; - if (len > prec) - len = prec; - if (len < HOST_BITS_PER_WIDE_INT) - res &= (HOST_WIDE_INT_1U << len) - 1; - return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); - } - break; - - case IX86_BUILTIN_BZHI32: - case IX86_BUILTIN_BZHI64: - gcc_assert (n_args == 2); - if (tree_fits_uhwi_p (args[1])) - { - unsigned int idx = tree_to_uhwi (args[1]) & 0xff; - if (idx >= TYPE_PRECISION (TREE_TYPE (args[0]))) - return args[0]; - if (idx == 0) - return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), 0); - if (!tree_fits_uhwi_p (args[0])) - break; - unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]); - res &= ~(HOST_WIDE_INT_M1U << idx); - return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), 
res); - } - break; - - case IX86_BUILTIN_PDEP32: - case IX86_BUILTIN_PDEP64: - gcc_assert (n_args == 2); - if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) - { - unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); - unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); - unsigned HOST_WIDE_INT res = 0; - unsigned HOST_WIDE_INT m, k = 1; - for (m = 1; m; m <<= 1) - if ((mask & m) != 0) - { - if ((src & k) != 0) - res |= m; - k <<= 1; - } - return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); - } - break; - - case IX86_BUILTIN_PEXT32: - case IX86_BUILTIN_PEXT64: - gcc_assert (n_args == 2); - if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) - { - unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); - unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); - unsigned HOST_WIDE_INT res = 0; - unsigned HOST_WIDE_INT m, k = 1; - for (m = 1; m; m <<= 1) - if ((mask & m) != 0) - { - if ((src & m) != 0) - res |= k; - k <<= 1; - } - return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); - } - break; - - case IX86_BUILTIN_MOVMSKPS: - case IX86_BUILTIN_PMOVMSKB: - case IX86_BUILTIN_MOVMSKPD: - case IX86_BUILTIN_PMOVMSKB128: - case IX86_BUILTIN_MOVMSKPD256: - case IX86_BUILTIN_MOVMSKPS256: - case IX86_BUILTIN_PMOVMSKB256: - gcc_assert (n_args == 1); - if (TREE_CODE (args[0]) == VECTOR_CST) - { - HOST_WIDE_INT res = 0; - for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i) - { - tree e = VECTOR_CST_ELT (args[0], i); - if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e)) - { - if (wi::neg_p (wi::to_wide (e))) - res |= HOST_WIDE_INT_1 << i; - } - else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e)) - { - if (TREE_REAL_CST (e).sign) - res |= HOST_WIDE_INT_1 << i; - } - else - return NULL_TREE; - } - return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res); - } - break; - - case IX86_BUILTIN_PSLLD: - case IX86_BUILTIN_PSLLD128: - case IX86_BUILTIN_PSLLD128_MASK: - case IX86_BUILTIN_PSLLD256: - case IX86_BUILTIN_PSLLD256_MASK: - case IX86_BUILTIN_PSLLD512: - case IX86_BUILTIN_PSLLDI: - case IX86_BUILTIN_PSLLDI128: - case IX86_BUILTIN_PSLLDI128_MASK: - case IX86_BUILTIN_PSLLDI256: - case IX86_BUILTIN_PSLLDI256_MASK: - case IX86_BUILTIN_PSLLDI512: - case IX86_BUILTIN_PSLLQ: - case IX86_BUILTIN_PSLLQ128: - case IX86_BUILTIN_PSLLQ128_MASK: - case IX86_BUILTIN_PSLLQ256: - case IX86_BUILTIN_PSLLQ256_MASK: - case IX86_BUILTIN_PSLLQ512: - case IX86_BUILTIN_PSLLQI: - case IX86_BUILTIN_PSLLQI128: - case IX86_BUILTIN_PSLLQI128_MASK: - case IX86_BUILTIN_PSLLQI256: - case IX86_BUILTIN_PSLLQI256_MASK: - case IX86_BUILTIN_PSLLQI512: - case IX86_BUILTIN_PSLLW: - case IX86_BUILTIN_PSLLW128: - case IX86_BUILTIN_PSLLW128_MASK: - case IX86_BUILTIN_PSLLW256: - case IX86_BUILTIN_PSLLW256_MASK: - case IX86_BUILTIN_PSLLW512_MASK: - case IX86_BUILTIN_PSLLWI: - case IX86_BUILTIN_PSLLWI128: - case IX86_BUILTIN_PSLLWI128_MASK: - case IX86_BUILTIN_PSLLWI256: - case IX86_BUILTIN_PSLLWI256_MASK: - case IX86_BUILTIN_PSLLWI512_MASK: - rcode = ASHIFT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSRAD: - case IX86_BUILTIN_PSRAD128: - case IX86_BUILTIN_PSRAD128_MASK: - case IX86_BUILTIN_PSRAD256: - case IX86_BUILTIN_PSRAD256_MASK: - case IX86_BUILTIN_PSRAD512: - case IX86_BUILTIN_PSRADI: - case IX86_BUILTIN_PSRADI128: - case IX86_BUILTIN_PSRADI128_MASK: - case IX86_BUILTIN_PSRADI256: - case IX86_BUILTIN_PSRADI256_MASK: - case IX86_BUILTIN_PSRADI512: - case IX86_BUILTIN_PSRAQ128_MASK: - case IX86_BUILTIN_PSRAQ256_MASK: - case IX86_BUILTIN_PSRAQ512: - case 
IX86_BUILTIN_PSRAQI128_MASK: - case IX86_BUILTIN_PSRAQI256_MASK: - case IX86_BUILTIN_PSRAQI512: - case IX86_BUILTIN_PSRAW: - case IX86_BUILTIN_PSRAW128: - case IX86_BUILTIN_PSRAW128_MASK: - case IX86_BUILTIN_PSRAW256: - case IX86_BUILTIN_PSRAW256_MASK: - case IX86_BUILTIN_PSRAW512: - case IX86_BUILTIN_PSRAWI: - case IX86_BUILTIN_PSRAWI128: - case IX86_BUILTIN_PSRAWI128_MASK: - case IX86_BUILTIN_PSRAWI256: - case IX86_BUILTIN_PSRAWI256_MASK: - case IX86_BUILTIN_PSRAWI512: - rcode = ASHIFTRT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSRLD: - case IX86_BUILTIN_PSRLD128: - case IX86_BUILTIN_PSRLD128_MASK: - case IX86_BUILTIN_PSRLD256: - case IX86_BUILTIN_PSRLD256_MASK: - case IX86_BUILTIN_PSRLD512: - case IX86_BUILTIN_PSRLDI: - case IX86_BUILTIN_PSRLDI128: - case IX86_BUILTIN_PSRLDI128_MASK: - case IX86_BUILTIN_PSRLDI256: - case IX86_BUILTIN_PSRLDI256_MASK: - case IX86_BUILTIN_PSRLDI512: - case IX86_BUILTIN_PSRLQ: - case IX86_BUILTIN_PSRLQ128: - case IX86_BUILTIN_PSRLQ128_MASK: - case IX86_BUILTIN_PSRLQ256: - case IX86_BUILTIN_PSRLQ256_MASK: - case IX86_BUILTIN_PSRLQ512: - case IX86_BUILTIN_PSRLQI: - case IX86_BUILTIN_PSRLQI128: - case IX86_BUILTIN_PSRLQI128_MASK: - case IX86_BUILTIN_PSRLQI256: - case IX86_BUILTIN_PSRLQI256_MASK: - case IX86_BUILTIN_PSRLQI512: - case IX86_BUILTIN_PSRLW: - case IX86_BUILTIN_PSRLW128: - case IX86_BUILTIN_PSRLW128_MASK: - case IX86_BUILTIN_PSRLW256: - case IX86_BUILTIN_PSRLW256_MASK: - case IX86_BUILTIN_PSRLW512: - case IX86_BUILTIN_PSRLWI: - case IX86_BUILTIN_PSRLWI128: - case IX86_BUILTIN_PSRLWI128_MASK: - case IX86_BUILTIN_PSRLWI256: - case IX86_BUILTIN_PSRLWI256_MASK: - case IX86_BUILTIN_PSRLWI512: - rcode = LSHIFTRT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSLLVV16HI: - case IX86_BUILTIN_PSLLVV16SI: - case IX86_BUILTIN_PSLLVV2DI: - case IX86_BUILTIN_PSLLVV2DI_MASK: - case IX86_BUILTIN_PSLLVV32HI: - case IX86_BUILTIN_PSLLVV4DI: - case IX86_BUILTIN_PSLLVV4DI_MASK: - case IX86_BUILTIN_PSLLVV4SI: - case IX86_BUILTIN_PSLLVV4SI_MASK: - case IX86_BUILTIN_PSLLVV8DI: - case IX86_BUILTIN_PSLLVV8HI: - case IX86_BUILTIN_PSLLVV8SI: - case IX86_BUILTIN_PSLLVV8SI_MASK: - rcode = ASHIFT; - is_vshift = true; - goto do_shift; - case IX86_BUILTIN_PSRAVQ128: - case IX86_BUILTIN_PSRAVQ256: - case IX86_BUILTIN_PSRAVV16HI: - case IX86_BUILTIN_PSRAVV16SI: - case IX86_BUILTIN_PSRAVV32HI: - case IX86_BUILTIN_PSRAVV4SI: - case IX86_BUILTIN_PSRAVV4SI_MASK: - case IX86_BUILTIN_PSRAVV8DI: - case IX86_BUILTIN_PSRAVV8HI: - case IX86_BUILTIN_PSRAVV8SI: - case IX86_BUILTIN_PSRAVV8SI_MASK: - rcode = ASHIFTRT; - is_vshift = true; - goto do_shift; - case IX86_BUILTIN_PSRLVV16HI: - case IX86_BUILTIN_PSRLVV16SI: - case IX86_BUILTIN_PSRLVV2DI: - case IX86_BUILTIN_PSRLVV2DI_MASK: - case IX86_BUILTIN_PSRLVV32HI: - case IX86_BUILTIN_PSRLVV4DI: - case IX86_BUILTIN_PSRLVV4DI_MASK: - case IX86_BUILTIN_PSRLVV4SI: - case IX86_BUILTIN_PSRLVV4SI_MASK: - case IX86_BUILTIN_PSRLVV8DI: - case IX86_BUILTIN_PSRLVV8HI: - case IX86_BUILTIN_PSRLVV8SI: - case IX86_BUILTIN_PSRLVV8SI_MASK: - rcode = LSHIFTRT; - is_vshift = true; - goto do_shift; - - do_shift: - gcc_assert (n_args >= 2); - if (TREE_CODE (args[0]) != VECTOR_CST) - break; - mask = HOST_WIDE_INT_M1U; - if (n_args > 2) - { - /* This is masked shift. 
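For the masked (AVX-512 style) forms, element i of the result is either the shifted element or the corresponding element of the merge operand (args[n_args - 2] above), selected by bit i of the mask. A plain-scalar sketch of that semantic, with illustrative parameter names:

static void
masked_shift_left (unsigned *dst, const unsigned *src, const unsigned *merge,
                   unsigned nelts, unsigned count, unsigned mask)
{
  for (unsigned i = 0; i < nelts; ++i)
    dst[i] = ((mask >> i) & 1) ? src[i] << count : merge[i];
}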
*/ - if (!tree_fits_uhwi_p (args[n_args - 1]) - || TREE_SIDE_EFFECTS (args[n_args - 2])) - break; - mask = tree_to_uhwi (args[n_args - 1]); - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); - mask |= HOST_WIDE_INT_M1U << elems; - if (mask != HOST_WIDE_INT_M1U - && TREE_CODE (args[n_args - 2]) != VECTOR_CST) - break; - if (mask == (HOST_WIDE_INT_M1U << elems)) - return args[n_args - 2]; - } - if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST) - break; - if (tree tem = (is_vshift ? integer_one_node - : ix86_vector_shift_count (args[1]))) - { - unsigned HOST_WIDE_INT count = tree_to_uhwi (tem); - unsigned HOST_WIDE_INT prec - = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0]))); - if (count == 0 && mask == HOST_WIDE_INT_M1U) - return args[0]; - if (count >= prec) - { - if (rcode == ASHIFTRT) - count = prec - 1; - else if (mask == HOST_WIDE_INT_M1U) - return build_zero_cst (TREE_TYPE (args[0])); - } - tree countt = NULL_TREE; - if (!is_vshift) - { - if (count >= prec) - countt = integer_zero_node; - else - countt = build_int_cst (integer_type_node, count); - } - tree_vector_builder builder; - if (mask != HOST_WIDE_INT_M1U || is_vshift) - builder.new_vector (TREE_TYPE (args[0]), - TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])), - 1); - else - builder.new_unary_operation (TREE_TYPE (args[0]), args[0], - false); - unsigned int cnt = builder.encoded_nelts (); - for (unsigned int i = 0; i < cnt; ++i) - { - tree elt = VECTOR_CST_ELT (args[0], i); - if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt)) - return NULL_TREE; - tree type = TREE_TYPE (elt); - if (rcode == LSHIFTRT) - elt = fold_convert (unsigned_type_for (type), elt); - if (is_vshift) - { - countt = VECTOR_CST_ELT (args[1], i); - if (TREE_CODE (countt) != INTEGER_CST - || TREE_OVERFLOW (countt)) - return NULL_TREE; - if (wi::neg_p (wi::to_wide (countt)) - || wi::to_widest (countt) >= prec) - { - if (rcode == ASHIFTRT) - countt = build_int_cst (TREE_TYPE (countt), - prec - 1); - else - { - elt = build_zero_cst (TREE_TYPE (elt)); - countt = build_zero_cst (TREE_TYPE (countt)); - } - } - } - else if (count >= prec) - elt = build_zero_cst (TREE_TYPE (elt)); - elt = const_binop (rcode == ASHIFT - ? LSHIFT_EXPR : RSHIFT_EXPR, - TREE_TYPE (elt), elt, countt); - if (!elt || TREE_CODE (elt) != INTEGER_CST) - return NULL_TREE; - if (rcode == LSHIFTRT) - elt = fold_convert (type, elt); - if ((mask & (HOST_WIDE_INT_1U << i)) == 0) - { - elt = VECTOR_CST_ELT (args[n_args - 2], i); - if (TREE_CODE (elt) != INTEGER_CST - || TREE_OVERFLOW (elt)) - return NULL_TREE; - } - builder.quick_push (elt); - } - return builder.build (); - } - break; - - default: - break; - } - } - -#ifdef SUBTARGET_FOLD_BUILTIN - return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); -#endif - - return NULL_TREE; -} - -/* Fold a MD builtin (use ix86_fold_builtin for folding into - constant) in GIMPLE. 
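One of the simplifications the GIMPLE folder below performs: a tzcnt builtin whose argument is provably non-zero becomes the generic __builtin_ctz, which the middle end optimizes more freely. A sketch assuming the builtin is available in the translation unit (e.g. compiled with -mbmi):

unsigned
trailing_zeros (unsigned x)
{
  x |= 1;  /* provably non-zero, so the tzcnt -> ctz fold applies */
  return __builtin_ia32_tzcnt_u32 (x);
}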
*/ - -bool -ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) -{ - gimple *stmt = gsi_stmt (*gsi); - tree fndecl = gimple_call_fndecl (stmt); - gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD)); - int n_args = gimple_call_num_args (stmt); - enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl); - tree decl = NULL_TREE; - tree arg0, arg1; - enum rtx_code rcode; - unsigned HOST_WIDE_INT count; - bool is_vshift; - - switch (fn_code) - { - case IX86_BUILTIN_TZCNT32: - decl = builtin_decl_implicit (BUILT_IN_CTZ); - goto fold_tzcnt_lzcnt; - - case IX86_BUILTIN_TZCNT64: - decl = builtin_decl_implicit (BUILT_IN_CTZLL); - goto fold_tzcnt_lzcnt; - - case IX86_BUILTIN_LZCNT32: - decl = builtin_decl_implicit (BUILT_IN_CLZ); - goto fold_tzcnt_lzcnt; - - case IX86_BUILTIN_LZCNT64: - decl = builtin_decl_implicit (BUILT_IN_CLZLL); - goto fold_tzcnt_lzcnt; - - fold_tzcnt_lzcnt: - gcc_assert (n_args == 1); - arg0 = gimple_call_arg (stmt, 0); - if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt)) - { - int prec = TYPE_PRECISION (TREE_TYPE (arg0)); - /* If arg0 is provably non-zero, optimize into generic - __builtin_c[tl]z{,ll} function the middle-end handles - better. */ - if (!expr_not_equal_to (arg0, wi::zero (prec))) - return false; - - location_t loc = gimple_location (stmt); - gimple *g = gimple_build_call (decl, 1, arg0); - gimple_set_location (g, loc); - tree lhs = make_ssa_name (integer_type_node); - gimple_call_set_lhs (g, lhs); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - break; - - case IX86_BUILTIN_BZHI32: - case IX86_BUILTIN_BZHI64: - gcc_assert (n_args == 2); - arg1 = gimple_call_arg (stmt, 1); - if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt)) - { - unsigned int idx = tree_to_uhwi (arg1) & 0xff; - arg0 = gimple_call_arg (stmt, 0); - if (idx < TYPE_PRECISION (TREE_TYPE (arg0))) - break; - location_t loc = gimple_location (stmt); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - break; - - case IX86_BUILTIN_PDEP32: - case IX86_BUILTIN_PDEP64: - case IX86_BUILTIN_PEXT32: - case IX86_BUILTIN_PEXT64: - gcc_assert (n_args == 2); - arg1 = gimple_call_arg (stmt, 1); - if (integer_all_onesp (arg1) && gimple_call_lhs (stmt)) - { - location_t loc = gimple_location (stmt); - arg0 = gimple_call_arg (stmt, 0); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - break; - - case IX86_BUILTIN_PSLLD: - case IX86_BUILTIN_PSLLD128: - case IX86_BUILTIN_PSLLD128_MASK: - case IX86_BUILTIN_PSLLD256: - case IX86_BUILTIN_PSLLD256_MASK: - case IX86_BUILTIN_PSLLD512: - case IX86_BUILTIN_PSLLDI: - case IX86_BUILTIN_PSLLDI128: - case IX86_BUILTIN_PSLLDI128_MASK: - case IX86_BUILTIN_PSLLDI256: - case IX86_BUILTIN_PSLLDI256_MASK: - case IX86_BUILTIN_PSLLDI512: - case IX86_BUILTIN_PSLLQ: - case IX86_BUILTIN_PSLLQ128: - case IX86_BUILTIN_PSLLQ128_MASK: - case IX86_BUILTIN_PSLLQ256: - case IX86_BUILTIN_PSLLQ256_MASK: - case IX86_BUILTIN_PSLLQ512: - case IX86_BUILTIN_PSLLQI: - case IX86_BUILTIN_PSLLQI128: - case IX86_BUILTIN_PSLLQI128_MASK: - case IX86_BUILTIN_PSLLQI256: - case IX86_BUILTIN_PSLLQI256_MASK: - case IX86_BUILTIN_PSLLQI512: - case IX86_BUILTIN_PSLLW: - case IX86_BUILTIN_PSLLW128: - case 
IX86_BUILTIN_PSLLW128_MASK: - case IX86_BUILTIN_PSLLW256: - case IX86_BUILTIN_PSLLW256_MASK: - case IX86_BUILTIN_PSLLW512_MASK: - case IX86_BUILTIN_PSLLWI: - case IX86_BUILTIN_PSLLWI128: - case IX86_BUILTIN_PSLLWI128_MASK: - case IX86_BUILTIN_PSLLWI256: - case IX86_BUILTIN_PSLLWI256_MASK: - case IX86_BUILTIN_PSLLWI512_MASK: - rcode = ASHIFT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSRAD: - case IX86_BUILTIN_PSRAD128: - case IX86_BUILTIN_PSRAD128_MASK: - case IX86_BUILTIN_PSRAD256: - case IX86_BUILTIN_PSRAD256_MASK: - case IX86_BUILTIN_PSRAD512: - case IX86_BUILTIN_PSRADI: - case IX86_BUILTIN_PSRADI128: - case IX86_BUILTIN_PSRADI128_MASK: - case IX86_BUILTIN_PSRADI256: - case IX86_BUILTIN_PSRADI256_MASK: - case IX86_BUILTIN_PSRADI512: - case IX86_BUILTIN_PSRAQ128_MASK: - case IX86_BUILTIN_PSRAQ256_MASK: - case IX86_BUILTIN_PSRAQ512: - case IX86_BUILTIN_PSRAQI128_MASK: - case IX86_BUILTIN_PSRAQI256_MASK: - case IX86_BUILTIN_PSRAQI512: - case IX86_BUILTIN_PSRAW: - case IX86_BUILTIN_PSRAW128: - case IX86_BUILTIN_PSRAW128_MASK: - case IX86_BUILTIN_PSRAW256: - case IX86_BUILTIN_PSRAW256_MASK: - case IX86_BUILTIN_PSRAW512: - case IX86_BUILTIN_PSRAWI: - case IX86_BUILTIN_PSRAWI128: - case IX86_BUILTIN_PSRAWI128_MASK: - case IX86_BUILTIN_PSRAWI256: - case IX86_BUILTIN_PSRAWI256_MASK: - case IX86_BUILTIN_PSRAWI512: - rcode = ASHIFTRT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSRLD: - case IX86_BUILTIN_PSRLD128: - case IX86_BUILTIN_PSRLD128_MASK: - case IX86_BUILTIN_PSRLD256: - case IX86_BUILTIN_PSRLD256_MASK: - case IX86_BUILTIN_PSRLD512: - case IX86_BUILTIN_PSRLDI: - case IX86_BUILTIN_PSRLDI128: - case IX86_BUILTIN_PSRLDI128_MASK: - case IX86_BUILTIN_PSRLDI256: - case IX86_BUILTIN_PSRLDI256_MASK: - case IX86_BUILTIN_PSRLDI512: - case IX86_BUILTIN_PSRLQ: - case IX86_BUILTIN_PSRLQ128: - case IX86_BUILTIN_PSRLQ128_MASK: - case IX86_BUILTIN_PSRLQ256: - case IX86_BUILTIN_PSRLQ256_MASK: - case IX86_BUILTIN_PSRLQ512: - case IX86_BUILTIN_PSRLQI: - case IX86_BUILTIN_PSRLQI128: - case IX86_BUILTIN_PSRLQI128_MASK: - case IX86_BUILTIN_PSRLQI256: - case IX86_BUILTIN_PSRLQI256_MASK: - case IX86_BUILTIN_PSRLQI512: - case IX86_BUILTIN_PSRLW: - case IX86_BUILTIN_PSRLW128: - case IX86_BUILTIN_PSRLW128_MASK: - case IX86_BUILTIN_PSRLW256: - case IX86_BUILTIN_PSRLW256_MASK: - case IX86_BUILTIN_PSRLW512: - case IX86_BUILTIN_PSRLWI: - case IX86_BUILTIN_PSRLWI128: - case IX86_BUILTIN_PSRLWI128_MASK: - case IX86_BUILTIN_PSRLWI256: - case IX86_BUILTIN_PSRLWI256_MASK: - case IX86_BUILTIN_PSRLWI512: - rcode = LSHIFTRT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSLLVV16HI: - case IX86_BUILTIN_PSLLVV16SI: - case IX86_BUILTIN_PSLLVV2DI: - case IX86_BUILTIN_PSLLVV2DI_MASK: - case IX86_BUILTIN_PSLLVV32HI: - case IX86_BUILTIN_PSLLVV4DI: - case IX86_BUILTIN_PSLLVV4DI_MASK: - case IX86_BUILTIN_PSLLVV4SI: - case IX86_BUILTIN_PSLLVV4SI_MASK: - case IX86_BUILTIN_PSLLVV8DI: - case IX86_BUILTIN_PSLLVV8HI: - case IX86_BUILTIN_PSLLVV8SI: - case IX86_BUILTIN_PSLLVV8SI_MASK: - rcode = ASHIFT; - is_vshift = true; - goto do_shift; - case IX86_BUILTIN_PSRAVQ128: - case IX86_BUILTIN_PSRAVQ256: - case IX86_BUILTIN_PSRAVV16HI: - case IX86_BUILTIN_PSRAVV16SI: - case IX86_BUILTIN_PSRAVV32HI: - case IX86_BUILTIN_PSRAVV4SI: - case IX86_BUILTIN_PSRAVV4SI_MASK: - case IX86_BUILTIN_PSRAVV8DI: - case IX86_BUILTIN_PSRAVV8HI: - case IX86_BUILTIN_PSRAVV8SI: - case IX86_BUILTIN_PSRAVV8SI_MASK: - rcode = ASHIFTRT; - is_vshift = true; - goto do_shift; - case IX86_BUILTIN_PSRLVV16HI: - case 
IX86_BUILTIN_PSRLVV16SI: - case IX86_BUILTIN_PSRLVV2DI: - case IX86_BUILTIN_PSRLVV2DI_MASK: - case IX86_BUILTIN_PSRLVV32HI: - case IX86_BUILTIN_PSRLVV4DI: - case IX86_BUILTIN_PSRLVV4DI_MASK: - case IX86_BUILTIN_PSRLVV4SI: - case IX86_BUILTIN_PSRLVV4SI_MASK: - case IX86_BUILTIN_PSRLVV8DI: - case IX86_BUILTIN_PSRLVV8HI: - case IX86_BUILTIN_PSRLVV8SI: - case IX86_BUILTIN_PSRLVV8SI_MASK: - rcode = LSHIFTRT; - is_vshift = true; - goto do_shift; - - do_shift: - gcc_assert (n_args >= 2); - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - if (n_args > 2) - { - /* This is masked shift. Only optimize if the mask is all ones. */ - tree argl = gimple_call_arg (stmt, n_args - 1); - if (!tree_fits_uhwi_p (argl)) - break; - unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); - if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) - break; - } - if (is_vshift) - { - if (TREE_CODE (arg1) != VECTOR_CST) - break; - count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))); - if (integer_zerop (arg1)) - count = 0; - else if (rcode == ASHIFTRT) - break; - else - for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i) - { - tree elt = VECTOR_CST_ELT (arg1, i); - if (!wi::neg_p (wi::to_wide (elt)) - && wi::to_widest (elt) < count) - return false; - } - } - else - { - arg1 = ix86_vector_shift_count (arg1); - if (!arg1) - break; - count = tree_to_uhwi (arg1); - } - if (count == 0) - { - /* Just return the first argument for shift by 0. */ - location_t loc = gimple_location (stmt); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - if (rcode != ASHIFTRT - && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)))) - { - /* For shift counts equal or greater than precision, except for - arithmetic right shift the result is zero. */ - location_t loc = gimple_location (stmt); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), - build_zero_cst (TREE_TYPE (arg0))); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - break; - - default: - break; - } - - return false; -} - -/* Make builtins to detect cpu type and features supported. NAME is - the builtin name, CODE is the builtin code, and FTYPE is the function - type of the builtin. */ - -static void -make_cpu_type_builtin (const char* name, int code, - enum ix86_builtin_func_type ftype, bool is_const) -{ - tree decl; - tree type; - - type = ix86_get_builtin_func_type (ftype); - decl = add_builtin_function (name, type, code, BUILT_IN_MD, - NULL, NULL_TREE); - gcc_assert (decl != NULL_TREE); - ix86_builtins[(int) code] = decl; - TREE_READONLY (decl) = is_const; -} - -/* Make builtins to get CPU type and features supported. The created - builtins are : - - __builtin_cpu_init (), to detect cpu type and features, - __builtin_cpu_is (""), to check if cpu is of type , - __builtin_cpu_supports (""), to check if cpu supports - */ - -static void -ix86_init_platform_type_builtins (void) -{ - make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT, - INT_FTYPE_VOID, false); - make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS, - INT_FTYPE_PCCHAR, true); - make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS, - INT_FTYPE_PCCHAR, true); -} - -/* Internal method for ix86_init_builtins. 
*/ - -static void -ix86_init_builtins_va_builtins_abi (void) -{ - tree ms_va_ref, sysv_va_ref; - tree fnvoid_va_end_ms, fnvoid_va_end_sysv; - tree fnvoid_va_start_ms, fnvoid_va_start_sysv; - tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv; - tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE; - - if (!TARGET_64BIT) - return; - fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE); - fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE); - ms_va_ref = build_reference_type (ms_va_list_type_node); - sysv_va_ref = build_pointer_type (TREE_TYPE (sysv_va_list_type_node)); - - fnvoid_va_end_ms = build_function_type_list (void_type_node, ms_va_ref, - NULL_TREE); - fnvoid_va_start_ms - = build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE); - fnvoid_va_end_sysv - = build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE); - fnvoid_va_start_sysv - = build_varargs_function_type_list (void_type_node, sysv_va_ref, - NULL_TREE); - fnvoid_va_copy_ms - = build_function_type_list (void_type_node, ms_va_ref, - ms_va_list_type_node, NULL_TREE); - fnvoid_va_copy_sysv - = build_function_type_list (void_type_node, sysv_va_ref, - sysv_va_ref, NULL_TREE); - - add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms, - BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms, - BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms, - BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv, - BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv); - add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv, - BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv); - add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv, - BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv); -} - -static void -ix86_init_builtin_types (void) -{ - tree float80_type_node, const_string_type_node; - - /* The __float80 type. */ - float80_type_node = long_double_type_node; - if (TYPE_MODE (float80_type_node) != XFmode) - { - if (float64x_type_node != NULL_TREE - && TYPE_MODE (float64x_type_node) == XFmode) - float80_type_node = float64x_type_node; - else - { - /* The __float80 type. */ - float80_type_node = make_node (REAL_TYPE); - - TYPE_PRECISION (float80_type_node) = 80; - layout_type (float80_type_node); - } - } - lang_hooks.types.register_builtin_type (float80_type_node, "__float80"); - - /* The __float128 type. The node has already been created as - _Float128, so we only need to register the __float128 name for - it. */ - lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); - - const_string_type_node - = build_pointer_type (build_qualified_type - (char_type_node, TYPE_QUAL_CONST)); - - /* This macro is built by i386-builtin-types.awk. */ - DEFINE_BUILTIN_PRIMITIVE_TYPES; -} - -static void -ix86_init_builtins (void) -{ - tree ftype, decl; - - ix86_init_builtin_types (); - - /* Builtins to get CPU type and features. */ - ix86_init_platform_type_builtins (); - - /* TFmode support builtins. 
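These are the quad-precision builtins registered just below; a minimal usage sketch (__builtin_fabsq and __builtin_copysignq expand to the __fabstf2/__copysigntf3 library calls when SSE is unavailable, as noted below):

__float128 quiet_nan (void)          { return __builtin_nanq (""); }
__float128 infinity (void)           { return __builtin_infq (); }
__float128 magnitude (__float128 x)  { return __builtin_fabsq (x); }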
*/ - def_builtin_const (0, 0, "__builtin_infq", - FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ); - def_builtin_const (0, 0, "__builtin_huge_valq", - FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ); - - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING); - decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ, - BUILT_IN_MD, "nanq", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl; - - decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ, - BUILT_IN_MD, "nansq", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl; - - /* We will expand them to normal call if SSE isn't available since - they are used by libgcc. */ - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128); - decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ, - BUILT_IN_MD, "__fabstf2", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl; - - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128); - decl = add_builtin_function ("__builtin_copysignq", ftype, - IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD, - "__copysigntf3", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl; - - ix86_init_tm_builtins (); - ix86_init_mmx_sse_builtins (); - - if (TARGET_LP64) - ix86_init_builtins_va_builtins_abi (); - -#ifdef SUBTARGET_INIT_BUILTINS - SUBTARGET_INIT_BUILTINS; -#endif -} - -/* Return the ix86 builtin for CODE. */ - -static tree -ix86_builtin_decl (unsigned code, bool) -{ - if (code >= IX86_BUILTIN_MAX) - return error_mark_node; - - return ix86_builtins[code]; -} - -/* Errors in the source file can cause expand_expr to return const0_rtx - where we expect a vector. To avoid crashing, use one of the vector - clear instructions. */ -static rtx -safe_vector_operand (rtx x, machine_mode mode) -{ - if (x == const0_rtx) - x = CONST0_RTX (mode); - return x; -} - -/* Fixup modeless constants to fit required mode. */ -static rtx -fixup_modeless_constant (rtx x, machine_mode mode) -{ - if (GET_MODE (x) == VOIDmode) - x = convert_to_mode (mode, x, 1); - return x; -} - -/* Subroutine of ix86_expand_builtin to take care of binop insns. */ - -static rtx -ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode tmode = insn_data[icode].operand[0].mode; - machine_mode mode0 = insn_data[icode].operand[1].mode; - machine_mode mode1 = insn_data[icode].operand[2].mode; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (GET_MODE (op1) == SImode && mode1 == TImode) - { - rtx x = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_loadd (x, op1)); - op1 = gen_lowpart (TImode, x); - } - - if (!insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[2].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (icode) (target, op0, op1); - if (! pat) - return 0; - - emit_insn (pat); - - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. 
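   The m_type argument (MULTI_ARG_3_SF, MULTI_ARG_2_DI_CMP and friends) is
   only used to classify the pattern: the switch below derives from it the
   operand count (1-4), whether the trailing operand has to be an immediate
   (last_arg_constant), whether a comparison rtx has to be synthesised from
   sub_code (comparison_p), and whether sub_code is instead passed as an
   extra immediate operand (tf_p).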
*/ - -static rtx -ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, - enum ix86_builtin_func_type m_type, - enum rtx_code sub_code) -{ - rtx pat; - int i; - int nargs; - bool comparison_p = false; - bool tf_p = false; - bool last_arg_constant = false; - int num_memory = 0; - struct { - rtx op; - machine_mode mode; - } args[4]; - - machine_mode tmode = insn_data[icode].operand[0].mode; - - switch (m_type) - { - case MULTI_ARG_4_DF2_DI_I: - case MULTI_ARG_4_DF2_DI_I1: - case MULTI_ARG_4_SF2_SI_I: - case MULTI_ARG_4_SF2_SI_I1: - nargs = 4; - last_arg_constant = true; - break; - - case MULTI_ARG_3_SF: - case MULTI_ARG_3_DF: - case MULTI_ARG_3_SF2: - case MULTI_ARG_3_DF2: - case MULTI_ARG_3_DI: - case MULTI_ARG_3_SI: - case MULTI_ARG_3_SI_DI: - case MULTI_ARG_3_HI: - case MULTI_ARG_3_HI_SI: - case MULTI_ARG_3_QI: - case MULTI_ARG_3_DI2: - case MULTI_ARG_3_SI2: - case MULTI_ARG_3_HI2: - case MULTI_ARG_3_QI2: - nargs = 3; - break; - - case MULTI_ARG_2_SF: - case MULTI_ARG_2_DF: - case MULTI_ARG_2_DI: - case MULTI_ARG_2_SI: - case MULTI_ARG_2_HI: - case MULTI_ARG_2_QI: - nargs = 2; - break; - - case MULTI_ARG_2_DI_IMM: - case MULTI_ARG_2_SI_IMM: - case MULTI_ARG_2_HI_IMM: - case MULTI_ARG_2_QI_IMM: - nargs = 2; - last_arg_constant = true; - break; - - case MULTI_ARG_1_SF: - case MULTI_ARG_1_DF: - case MULTI_ARG_1_SF2: - case MULTI_ARG_1_DF2: - case MULTI_ARG_1_DI: - case MULTI_ARG_1_SI: - case MULTI_ARG_1_HI: - case MULTI_ARG_1_QI: - case MULTI_ARG_1_SI_DI: - case MULTI_ARG_1_HI_DI: - case MULTI_ARG_1_HI_SI: - case MULTI_ARG_1_QI_DI: - case MULTI_ARG_1_QI_SI: - case MULTI_ARG_1_QI_HI: - nargs = 1; - break; - - case MULTI_ARG_2_DI_CMP: - case MULTI_ARG_2_SI_CMP: - case MULTI_ARG_2_HI_CMP: - case MULTI_ARG_2_QI_CMP: - nargs = 2; - comparison_p = true; - break; - - case MULTI_ARG_2_SF_TF: - case MULTI_ARG_2_DF_TF: - case MULTI_ARG_2_DI_TF: - case MULTI_ARG_2_SI_TF: - case MULTI_ARG_2_HI_TF: - case MULTI_ARG_2_QI_TF: - nargs = 2; - tf_p = true; - break; - - default: - gcc_unreachable (); - } - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - else if (memory_operand (target, tmode)) - num_memory++; - - gcc_assert (nargs <= 4); - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - int adjust = (comparison_p) ? 
1 : 0; - machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; - - if (last_arg_constant && i == nargs - 1) - { - if (!insn_data[icode].operand[i + 1].predicate (op, mode)) - { - enum insn_code new_icode = icode; - switch (icode) - { - case CODE_FOR_xop_vpermil2v2df3: - case CODE_FOR_xop_vpermil2v4sf3: - case CODE_FOR_xop_vpermil2v4df3: - case CODE_FOR_xop_vpermil2v8sf3: - error ("the last argument must be a 2-bit immediate"); - return gen_reg_rtx (tmode); - case CODE_FOR_xop_rotlv2di3: - new_icode = CODE_FOR_rotlv2di3; - goto xop_rotl; - case CODE_FOR_xop_rotlv4si3: - new_icode = CODE_FOR_rotlv4si3; - goto xop_rotl; - case CODE_FOR_xop_rotlv8hi3: - new_icode = CODE_FOR_rotlv8hi3; - goto xop_rotl; - case CODE_FOR_xop_rotlv16qi3: - new_icode = CODE_FOR_rotlv16qi3; - xop_rotl: - if (CONST_INT_P (op)) - { - int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1; - op = GEN_INT (INTVAL (op) & mask); - gcc_checking_assert - (insn_data[icode].operand[i + 1].predicate (op, mode)); - } - else - { - gcc_checking_assert - (nargs == 2 - && insn_data[new_icode].operand[0].mode == tmode - && insn_data[new_icode].operand[1].mode == tmode - && insn_data[new_icode].operand[2].mode == mode - && insn_data[new_icode].operand[0].predicate - == insn_data[icode].operand[0].predicate - && insn_data[new_icode].operand[1].predicate - == insn_data[icode].operand[1].predicate); - icode = new_icode; - goto non_constant; - } - break; - default: - gcc_unreachable (); - } - } - } - else - { - non_constant: - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - /* If we aren't optimizing, only allow one memory operand to be - generated. */ - if (memory_operand (op, mode)) - num_memory++; - - gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); - - if (optimize - || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) - || num_memory > 1) - op = force_reg (mode, op); - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - - case 2: - if (tf_p) - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - GEN_INT ((int)sub_code)); - else if (! comparison_p) - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - else - { - rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), - args[0].op, - args[1].op); - - pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); - } - break; - - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); - break; - - case 4: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); - break; - - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_args_builtin to take care of scalar unop - insns with vec_merge. 
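   The single source operand does double duty here: it is both the operand
   of the operation itself and the vec_merge operand that supplies the
   untouched upper vector elements, which is why op0 is simply copied into
   op1 in the body below.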
*/ - -static rtx -ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - rtx op1, op0 = expand_normal (arg0); - machine_mode tmode = insn_data[icode].operand[0].mode; - machine_mode mode0 = insn_data[icode].operand[1].mode; - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - op1 = op0; - if (!insn_data[icode].operand[2].predicate (op1, mode0)) - op1 = copy_to_mode_reg (mode0, op1); - - pat = GEN_FCN (icode) (target, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ - -static rtx -ix86_expand_sse_compare (const struct builtin_description *d, - tree exp, rtx target, bool swap) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2; - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - machine_mode mode1 = insn_data[d->icode].operand[2].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - /* Swap operands if we have a comparison that isn't available in - hardware. */ - if (swap) - std::swap (op0, op1); - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[2].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); - pat = GEN_FCN (d->icode) (target, op0, op1, op2); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of comi insns. */ - -static rtx -ix86_expand_sse_comi (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode mode0 = insn_data[d->icode].operand[0].mode; - machine_mode mode1 = insn_data[d->icode].operand[1].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - /* Swap operands if we have a comparison that isn't available in - hardware. 
*/ - if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) - std::swap (op0, op1); - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (d->icode) (op0, op1); - if (! pat) - return 0; - emit_insn (pat); - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - SET_DEST (pat), - const0_rtx))); - - return SUBREG_REG (target); -} - -/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ - -static rtx -ix86_expand_sse_round (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - rtx op1, op0 = expand_normal (arg0); - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - - if (optimize || target == 0 - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - op1 = GEN_INT (d->comparison); - - pat = GEN_FCN (d->icode) (target, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -static rtx -ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2; - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - machine_mode mode1 = insn_data[d->icode].operand[2].mode; - - if (optimize || target == 0 - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - op0 = safe_vector_operand (op0, mode0); - op1 = safe_vector_operand (op1, mode1); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - op2 = GEN_INT (d->comparison); - - pat = GEN_FCN (d->icode) (target, op0, op1, op2); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of ptest insns. 
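   The ptest patterns only set the condition-code register, so the integer
   value the builtin returns is materialised afterwards: an SImode pseudo is
   cleared and its QImode low part is set from d->comparison applied to the
   flags output of the emitted insn, exactly as the code below does.  (The
   _mm_testz_si128 style intrinsics reach this path through builtins such as
   __builtin_ia32_ptestz128; that name is given only for illustration.)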
*/ - -static rtx -ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode mode0 = insn_data[d->icode].operand[0].mode; - machine_mode mode1 = insn_data[d->icode].operand[1].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (d->icode) (op0, op1); - if (! pat) - return 0; - emit_insn (pat); - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - SET_DEST (pat), - const0_rtx))); - - return SUBREG_REG (target); -} - -/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */ - -static rtx -ix86_expand_sse_pcmpestr (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - tree arg3 = CALL_EXPR_ARG (exp, 3); - tree arg4 = CALL_EXPR_ARG (exp, 4); - rtx scratch0, scratch1; - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - rtx op3 = expand_normal (arg3); - rtx op4 = expand_normal (arg4); - machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; - - tmode0 = insn_data[d->icode].operand[0].mode; - tmode1 = insn_data[d->icode].operand[1].mode; - modev2 = insn_data[d->icode].operand[2].mode; - modei3 = insn_data[d->icode].operand[3].mode; - modev4 = insn_data[d->icode].operand[4].mode; - modei5 = insn_data[d->icode].operand[5].mode; - modeimm = insn_data[d->icode].operand[6].mode; - - if (VECTOR_MODE_P (modev2)) - op0 = safe_vector_operand (op0, modev2); - if (VECTOR_MODE_P (modev4)) - op2 = safe_vector_operand (op2, modev4); - - if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) - op0 = copy_to_mode_reg (modev2, op0); - if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) - op1 = copy_to_mode_reg (modei3, op1); - if ((optimize && !register_operand (op2, modev4)) - || !insn_data[d->icode].operand[4].predicate (op2, modev4)) - op2 = copy_to_mode_reg (modev4, op2); - if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) - op3 = copy_to_mode_reg (modei5, op3); - - if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) - { - error ("the fifth argument must be an 8-bit immediate"); - return const0_rtx; - } - - if (d->code == IX86_BUILTIN_PCMPESTRI128) - { - if (optimize || !target - || GET_MODE (target) != tmode0 - || !insn_data[d->icode].operand[0].predicate (target, tmode0)) - target = gen_reg_rtx (tmode0); - - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); - } - else if (d->code == IX86_BUILTIN_PCMPESTRM128) - { - if (optimize || !target - || GET_MODE (target) != tmode1 - || !insn_data[d->icode].operand[1].predicate (target, tmode1)) - target = gen_reg_rtx (tmode1); - - 
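	  /* Each pcmpestr pattern has two result operands; whichever one the
	     builtin being expanded does not return is only allocated as a
	     throw-away scratch register.  */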
scratch0 = gen_reg_rtx (tmode0); - - pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); - } - else - { - gcc_assert (d->flag); - - scratch0 = gen_reg_rtx (tmode0); - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); - } - - if (! pat) - return 0; - - emit_insn (pat); - - if (d->flag) - { - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - emit_insn - (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (EQ, QImode, - gen_rtx_REG ((machine_mode) d->flag, - FLAGS_REG), - const0_rtx))); - return SUBREG_REG (target); - } - else - return target; -} - - -/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */ - -static rtx -ix86_expand_sse_pcmpistr (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - rtx scratch0, scratch1; - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - machine_mode tmode0, tmode1, modev2, modev3, modeimm; - - tmode0 = insn_data[d->icode].operand[0].mode; - tmode1 = insn_data[d->icode].operand[1].mode; - modev2 = insn_data[d->icode].operand[2].mode; - modev3 = insn_data[d->icode].operand[3].mode; - modeimm = insn_data[d->icode].operand[4].mode; - - if (VECTOR_MODE_P (modev2)) - op0 = safe_vector_operand (op0, modev2); - if (VECTOR_MODE_P (modev3)) - op1 = safe_vector_operand (op1, modev3); - - if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) - op0 = copy_to_mode_reg (modev2, op0); - if ((optimize && !register_operand (op1, modev3)) - || !insn_data[d->icode].operand[3].predicate (op1, modev3)) - op1 = copy_to_mode_reg (modev3, op1); - - if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) - { - error ("the third argument must be an 8-bit immediate"); - return const0_rtx; - } - - if (d->code == IX86_BUILTIN_PCMPISTRI128) - { - if (optimize || !target - || GET_MODE (target) != tmode0 - || !insn_data[d->icode].operand[0].predicate (target, tmode0)) - target = gen_reg_rtx (tmode0); - - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); - } - else if (d->code == IX86_BUILTIN_PCMPISTRM128) - { - if (optimize || !target - || GET_MODE (target) != tmode1 - || !insn_data[d->icode].operand[1].predicate (target, tmode1)) - target = gen_reg_rtx (tmode1); - - scratch0 = gen_reg_rtx (tmode0); - - pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2); - } - else - { - gcc_assert (d->flag); - - scratch0 = gen_reg_rtx (tmode0); - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); - } - - if (! pat) - return 0; - - emit_insn (pat); - - if (d->flag) - { - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - emit_insn - (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (EQ, QImode, - gen_rtx_REG ((machine_mode) d->flag, - FLAGS_REG), - const0_rtx))); - return SUBREG_REG (target); - } - else - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of insns with - variable number of operands. 
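   The ix86_builtin_func_type in d->flag drives the whole expansion: the
   switch below derives from it the argument count (nargs), how many
   trailing arguments must be immediates (nargs_constant), whether a mask
   operand precedes them (mask_pos), whether the second argument is a shift
   count (second_arg_count) and, for a few types, an alternative result
   mode (rmode) that is connected to the real target through a lowpart
   subreg.  As one example, V8SF_FTYPE_V8SF_V8SF_INT is expanded as a
   three-operand insn whose last operand must be an 8-bit immediate.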
*/ - -static rtx -ix86_expand_args_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat, real_target; - unsigned int i, nargs; - unsigned int nargs_constant = 0; - unsigned int mask_pos = 0; - int num_memory = 0; - struct - { - rtx op; - machine_mode mode; - } args[6]; - bool second_arg_count = false; - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - machine_mode rmode = VOIDmode; - bool swap = false; - enum rtx_code comparison = d->comparison; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case V2DF_FTYPE_V2DF_ROUND: - case V4DF_FTYPE_V4DF_ROUND: - case V8DF_FTYPE_V8DF_ROUND: - case V4SF_FTYPE_V4SF_ROUND: - case V8SF_FTYPE_V8SF_ROUND: - case V16SF_FTYPE_V16SF_ROUND: - case V4SI_FTYPE_V4SF_ROUND: - case V8SI_FTYPE_V8SF_ROUND: - case V16SI_FTYPE_V16SF_ROUND: - return ix86_expand_sse_round (d, exp, target); - case V4SI_FTYPE_V2DF_V2DF_ROUND: - case V8SI_FTYPE_V4DF_V4DF_ROUND: - case V16SI_FTYPE_V8DF_V8DF_ROUND: - return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); - case INT_FTYPE_V8SF_V8SF_PTEST: - case INT_FTYPE_V4DI_V4DI_PTEST: - case INT_FTYPE_V4DF_V4DF_PTEST: - case INT_FTYPE_V4SF_V4SF_PTEST: - case INT_FTYPE_V2DI_V2DI_PTEST: - case INT_FTYPE_V2DF_V2DF_PTEST: - return ix86_expand_sse_ptest (d, exp, target); - case FLOAT128_FTYPE_FLOAT128: - case FLOAT_FTYPE_FLOAT: - case INT_FTYPE_INT: - case UINT_FTYPE_UINT: - case UINT16_FTYPE_UINT16: - case UINT64_FTYPE_INT: - case UINT64_FTYPE_UINT64: - case INT64_FTYPE_INT64: - case INT64_FTYPE_V4SF: - case INT64_FTYPE_V2DF: - case INT_FTYPE_V16QI: - case INT_FTYPE_V8QI: - case INT_FTYPE_V8SF: - case INT_FTYPE_V4DF: - case INT_FTYPE_V4SF: - case INT_FTYPE_V2DF: - case INT_FTYPE_V32QI: - case V16QI_FTYPE_V16QI: - case V8SI_FTYPE_V8SF: - case V8SI_FTYPE_V4SI: - case V8HI_FTYPE_V8HI: - case V8HI_FTYPE_V16QI: - case V8QI_FTYPE_V8QI: - case V8SF_FTYPE_V8SF: - case V8SF_FTYPE_V8SI: - case V8SF_FTYPE_V4SF: - case V8SF_FTYPE_V8HI: - case V4SI_FTYPE_V4SI: - case V4SI_FTYPE_V16QI: - case V4SI_FTYPE_V4SF: - case V4SI_FTYPE_V8SI: - case V4SI_FTYPE_V8HI: - case V4SI_FTYPE_V4DF: - case V4SI_FTYPE_V2DF: - case V4HI_FTYPE_V4HI: - case V4DF_FTYPE_V4DF: - case V4DF_FTYPE_V4SI: - case V4DF_FTYPE_V4SF: - case V4DF_FTYPE_V2DF: - case V4SF_FTYPE_V4SF: - case V4SF_FTYPE_V4SI: - case V4SF_FTYPE_V8SF: - case V4SF_FTYPE_V4DF: - case V4SF_FTYPE_V8HI: - case V4SF_FTYPE_V2DF: - case V2DI_FTYPE_V2DI: - case V2DI_FTYPE_V16QI: - case V2DI_FTYPE_V8HI: - case V2DI_FTYPE_V4SI: - case V2DF_FTYPE_V2DF: - case V2DF_FTYPE_V4SI: - case V2DF_FTYPE_V4DF: - case V2DF_FTYPE_V4SF: - case V2DF_FTYPE_V2SI: - case V2SI_FTYPE_V2SI: - case V2SI_FTYPE_V4SF: - case V2SI_FTYPE_V2SF: - case V2SI_FTYPE_V2DF: - case V2SF_FTYPE_V2SF: - case V2SF_FTYPE_V2SI: - case V32QI_FTYPE_V32QI: - case V32QI_FTYPE_V16QI: - case V16HI_FTYPE_V16HI: - case V16HI_FTYPE_V8HI: - case V8SI_FTYPE_V8SI: - case V16HI_FTYPE_V16QI: - case V8SI_FTYPE_V16QI: - case V4DI_FTYPE_V16QI: - case V8SI_FTYPE_V8HI: - case V4DI_FTYPE_V8HI: - case V4DI_FTYPE_V4SI: - case V4DI_FTYPE_V2DI: - case UQI_FTYPE_UQI: - case UHI_FTYPE_UHI: - case USI_FTYPE_USI: - case USI_FTYPE_UQI: - case USI_FTYPE_UHI: - case UDI_FTYPE_UDI: - case UHI_FTYPE_V16QI: - case USI_FTYPE_V32QI: - case UDI_FTYPE_V64QI: - case V16QI_FTYPE_UHI: - case V32QI_FTYPE_USI: - case V64QI_FTYPE_UDI: - case V8HI_FTYPE_UQI: - case V16HI_FTYPE_UHI: - case V32HI_FTYPE_USI: - case V4SI_FTYPE_UQI: - case V8SI_FTYPE_UQI: - case V4SI_FTYPE_UHI: - case 
V8SI_FTYPE_UHI: - case UQI_FTYPE_V8HI: - case UHI_FTYPE_V16HI: - case USI_FTYPE_V32HI: - case UQI_FTYPE_V4SI: - case UQI_FTYPE_V8SI: - case UHI_FTYPE_V16SI: - case UQI_FTYPE_V2DI: - case UQI_FTYPE_V4DI: - case UQI_FTYPE_V8DI: - case V16SI_FTYPE_UHI: - case V2DI_FTYPE_UQI: - case V4DI_FTYPE_UQI: - case V16SI_FTYPE_INT: - case V16SF_FTYPE_V8SF: - case V16SI_FTYPE_V8SI: - case V16SF_FTYPE_V4SF: - case V16SI_FTYPE_V4SI: - case V16SI_FTYPE_V16SF: - case V16SI_FTYPE_V16SI: - case V64QI_FTYPE_V64QI: - case V32HI_FTYPE_V32HI: - case V16SF_FTYPE_V16SF: - case V8DI_FTYPE_UQI: - case V8DI_FTYPE_V8DI: - case V8DF_FTYPE_V4DF: - case V8DF_FTYPE_V2DF: - case V8DF_FTYPE_V8DF: - case V4DI_FTYPE_V4DI: - nargs = 1; - break; - case V4SF_FTYPE_V4SF_VEC_MERGE: - case V2DF_FTYPE_V2DF_VEC_MERGE: - return ix86_expand_unop_vec_merge_builtin (icode, exp, target); - case FLOAT128_FTYPE_FLOAT128_FLOAT128: - case V16QI_FTYPE_V16QI_V16QI: - case V16QI_FTYPE_V8HI_V8HI: - case V16SF_FTYPE_V16SF_V16SF: - case V8QI_FTYPE_V8QI_V8QI: - case V8QI_FTYPE_V4HI_V4HI: - case V8HI_FTYPE_V8HI_V8HI: - case V8HI_FTYPE_V16QI_V16QI: - case V8HI_FTYPE_V4SI_V4SI: - case V8SF_FTYPE_V8SF_V8SF: - case V8SF_FTYPE_V8SF_V8SI: - case V8DF_FTYPE_V8DF_V8DF: - case V4SI_FTYPE_V4SI_V4SI: - case V4SI_FTYPE_V8HI_V8HI: - case V4SI_FTYPE_V2DF_V2DF: - case V4HI_FTYPE_V4HI_V4HI: - case V4HI_FTYPE_V8QI_V8QI: - case V4HI_FTYPE_V2SI_V2SI: - case V4DF_FTYPE_V4DF_V4DF: - case V4DF_FTYPE_V4DF_V4DI: - case V4SF_FTYPE_V4SF_V4SF: - case V4SF_FTYPE_V4SF_V4SI: - case V4SF_FTYPE_V4SF_V2SI: - case V4SF_FTYPE_V4SF_V2DF: - case V4SF_FTYPE_V4SF_UINT: - case V4SF_FTYPE_V4SF_DI: - case V4SF_FTYPE_V4SF_SI: - case V2DI_FTYPE_V2DI_V2DI: - case V2DI_FTYPE_V16QI_V16QI: - case V2DI_FTYPE_V4SI_V4SI: - case V2DI_FTYPE_V2DI_V16QI: - case V2SI_FTYPE_V2SI_V2SI: - case V2SI_FTYPE_V4HI_V4HI: - case V2SI_FTYPE_V2SF_V2SF: - case V2DF_FTYPE_V2DF_V2DF: - case V2DF_FTYPE_V2DF_V4SF: - case V2DF_FTYPE_V2DF_V2DI: - case V2DF_FTYPE_V2DF_DI: - case V2DF_FTYPE_V2DF_SI: - case V2DF_FTYPE_V2DF_UINT: - case V2SF_FTYPE_V2SF_V2SF: - case V1DI_FTYPE_V1DI_V1DI: - case V1DI_FTYPE_V8QI_V8QI: - case V1DI_FTYPE_V2SI_V2SI: - case V32QI_FTYPE_V16HI_V16HI: - case V16HI_FTYPE_V8SI_V8SI: - case V64QI_FTYPE_V64QI_V64QI: - case V32QI_FTYPE_V32QI_V32QI: - case V16HI_FTYPE_V32QI_V32QI: - case V16HI_FTYPE_V16HI_V16HI: - case V8SI_FTYPE_V4DF_V4DF: - case V8SI_FTYPE_V8SI_V8SI: - case V8SI_FTYPE_V16HI_V16HI: - case V4DI_FTYPE_V4DI_V4DI: - case V4DI_FTYPE_V8SI_V8SI: - case V8DI_FTYPE_V64QI_V64QI: - if (comparison == UNKNOWN) - return ix86_expand_binop_builtin (icode, exp, target); - nargs = 2; - break; - case V4SF_FTYPE_V4SF_V4SF_SWAP: - case V2DF_FTYPE_V2DF_V2DF_SWAP: - gcc_assert (comparison != UNKNOWN); - nargs = 2; - swap = true; - break; - case V16HI_FTYPE_V16HI_V8HI_COUNT: - case V16HI_FTYPE_V16HI_SI_COUNT: - case V8SI_FTYPE_V8SI_V4SI_COUNT: - case V8SI_FTYPE_V8SI_SI_COUNT: - case V4DI_FTYPE_V4DI_V2DI_COUNT: - case V4DI_FTYPE_V4DI_INT_COUNT: - case V8HI_FTYPE_V8HI_V8HI_COUNT: - case V8HI_FTYPE_V8HI_SI_COUNT: - case V4SI_FTYPE_V4SI_V4SI_COUNT: - case V4SI_FTYPE_V4SI_SI_COUNT: - case V4HI_FTYPE_V4HI_V4HI_COUNT: - case V4HI_FTYPE_V4HI_SI_COUNT: - case V2DI_FTYPE_V2DI_V2DI_COUNT: - case V2DI_FTYPE_V2DI_SI_COUNT: - case V2SI_FTYPE_V2SI_V2SI_COUNT: - case V2SI_FTYPE_V2SI_SI_COUNT: - case V1DI_FTYPE_V1DI_V1DI_COUNT: - case V1DI_FTYPE_V1DI_SI_COUNT: - nargs = 2; - second_arg_count = true; - break; - case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT: - case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT: - case 
V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT: - case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT: - case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT: - case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT: - case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT: - case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT: - case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT: - case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT: - case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT: - case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT: - case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT: - case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT: - case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT: - case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT: - case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT: - case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT: - nargs = 4; - second_arg_count = true; - break; - case UINT64_FTYPE_UINT64_UINT64: - case UINT_FTYPE_UINT_UINT: - case UINT_FTYPE_UINT_USHORT: - case UINT_FTYPE_UINT_UCHAR: - case UINT16_FTYPE_UINT16_INT: - case UINT8_FTYPE_UINT8_INT: - case UQI_FTYPE_UQI_UQI: - case UHI_FTYPE_UHI_UHI: - case USI_FTYPE_USI_USI: - case UDI_FTYPE_UDI_UDI: - case V16SI_FTYPE_V8DF_V8DF: - nargs = 2; - break; - case V2DI_FTYPE_V2DI_INT_CONVERT: - nargs = 2; - rmode = V1TImode; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_INT_CONVERT: - nargs = 2; - rmode = V2TImode; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_INT_CONVERT: - nargs = 2; - rmode = V4TImode; - nargs_constant = 1; - break; - case V8HI_FTYPE_V8HI_INT: - case V8HI_FTYPE_V8SF_INT: - case V16HI_FTYPE_V16SF_INT: - case V8HI_FTYPE_V4SF_INT: - case V8SF_FTYPE_V8SF_INT: - case V4SF_FTYPE_V16SF_INT: - case V16SF_FTYPE_V16SF_INT: - case V4SI_FTYPE_V4SI_INT: - case V4SI_FTYPE_V8SI_INT: - case V4HI_FTYPE_V4HI_INT: - case V4DF_FTYPE_V4DF_INT: - case V4DF_FTYPE_V8DF_INT: - case V4SF_FTYPE_V4SF_INT: - case V4SF_FTYPE_V8SF_INT: - case V2DI_FTYPE_V2DI_INT: - case V2DF_FTYPE_V2DF_INT: - case V2DF_FTYPE_V4DF_INT: - case V16HI_FTYPE_V16HI_INT: - case V8SI_FTYPE_V8SI_INT: - case V16SI_FTYPE_V16SI_INT: - case V4SI_FTYPE_V16SI_INT: - case V4DI_FTYPE_V4DI_INT: - case V2DI_FTYPE_V4DI_INT: - case V4DI_FTYPE_V8DI_INT: - case QI_FTYPE_V4SF_INT: - case QI_FTYPE_V2DF_INT: - case UQI_FTYPE_UQI_UQI_CONST: - case UHI_FTYPE_UHI_UQI: - case USI_FTYPE_USI_UQI: - case UDI_FTYPE_UDI_UQI: - nargs = 2; - nargs_constant = 1; - break; - case V16QI_FTYPE_V16QI_V16QI_V16QI: - case V8SF_FTYPE_V8SF_V8SF_V8SF: - case V4DF_FTYPE_V4DF_V4DF_V4DF: - case V4SF_FTYPE_V4SF_V4SF_V4SF: - case V2DF_FTYPE_V2DF_V2DF_V2DF: - case V32QI_FTYPE_V32QI_V32QI_V32QI: - case UHI_FTYPE_V16SI_V16SI_UHI: - case UQI_FTYPE_V8DI_V8DI_UQI: - case V16HI_FTYPE_V16SI_V16HI_UHI: - case V16QI_FTYPE_V16SI_V16QI_UHI: - case V16QI_FTYPE_V8DI_V16QI_UQI: - case V16SF_FTYPE_V16SF_V16SF_UHI: - case V16SF_FTYPE_V4SF_V16SF_UHI: - case V16SI_FTYPE_SI_V16SI_UHI: - case V16SI_FTYPE_V16HI_V16SI_UHI: - case V16SI_FTYPE_V16QI_V16SI_UHI: - case V8SF_FTYPE_V4SF_V8SF_UQI: - case V4DF_FTYPE_V2DF_V4DF_UQI: - case V8SI_FTYPE_V4SI_V8SI_UQI: - case V8SI_FTYPE_SI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_UQI: - case V4SI_FTYPE_SI_V4SI_UQI: - case V4DI_FTYPE_V2DI_V4DI_UQI: - case V4DI_FTYPE_DI_V4DI_UQI: - case V2DI_FTYPE_V2DI_V2DI_UQI: - case V2DI_FTYPE_DI_V2DI_UQI: - case V64QI_FTYPE_V64QI_V64QI_UDI: - case V64QI_FTYPE_V16QI_V64QI_UDI: - case V64QI_FTYPE_QI_V64QI_UDI: - case V32QI_FTYPE_V32QI_V32QI_USI: - case V32QI_FTYPE_V16QI_V32QI_USI: - case V32QI_FTYPE_QI_V32QI_USI: - case V16QI_FTYPE_V16QI_V16QI_UHI: - case V16QI_FTYPE_QI_V16QI_UHI: - case V32HI_FTYPE_V8HI_V32HI_USI: - case V32HI_FTYPE_HI_V32HI_USI: - case V16HI_FTYPE_V8HI_V16HI_UHI: - case 
V16HI_FTYPE_HI_V16HI_UHI: - case V8HI_FTYPE_V8HI_V8HI_UQI: - case V8HI_FTYPE_HI_V8HI_UQI: - case V8SF_FTYPE_V8HI_V8SF_UQI: - case V4SF_FTYPE_V8HI_V4SF_UQI: - case V8SI_FTYPE_V8SF_V8SI_UQI: - case V4SI_FTYPE_V4SF_V4SI_UQI: - case V4DI_FTYPE_V4SF_V4DI_UQI: - case V2DI_FTYPE_V4SF_V2DI_UQI: - case V4SF_FTYPE_V4DI_V4SF_UQI: - case V4SF_FTYPE_V2DI_V4SF_UQI: - case V4DF_FTYPE_V4DI_V4DF_UQI: - case V2DF_FTYPE_V2DI_V2DF_UQI: - case V16QI_FTYPE_V8HI_V16QI_UQI: - case V16QI_FTYPE_V16HI_V16QI_UHI: - case V16QI_FTYPE_V4SI_V16QI_UQI: - case V16QI_FTYPE_V8SI_V16QI_UQI: - case V8HI_FTYPE_V4SI_V8HI_UQI: - case V8HI_FTYPE_V8SI_V8HI_UQI: - case V16QI_FTYPE_V2DI_V16QI_UQI: - case V16QI_FTYPE_V4DI_V16QI_UQI: - case V8HI_FTYPE_V2DI_V8HI_UQI: - case V8HI_FTYPE_V4DI_V8HI_UQI: - case V4SI_FTYPE_V2DI_V4SI_UQI: - case V4SI_FTYPE_V4DI_V4SI_UQI: - case V32QI_FTYPE_V32HI_V32QI_USI: - case UHI_FTYPE_V16QI_V16QI_UHI: - case USI_FTYPE_V32QI_V32QI_USI: - case UDI_FTYPE_V64QI_V64QI_UDI: - case UQI_FTYPE_V8HI_V8HI_UQI: - case UHI_FTYPE_V16HI_V16HI_UHI: - case USI_FTYPE_V32HI_V32HI_USI: - case UQI_FTYPE_V4SI_V4SI_UQI: - case UQI_FTYPE_V8SI_V8SI_UQI: - case UQI_FTYPE_V2DI_V2DI_UQI: - case UQI_FTYPE_V4DI_V4DI_UQI: - case V4SF_FTYPE_V2DF_V4SF_UQI: - case V4SF_FTYPE_V4DF_V4SF_UQI: - case V16SI_FTYPE_V16SI_V16SI_UHI: - case V16SI_FTYPE_V4SI_V16SI_UHI: - case V2DI_FTYPE_V4SI_V2DI_UQI: - case V2DI_FTYPE_V8HI_V2DI_UQI: - case V2DI_FTYPE_V16QI_V2DI_UQI: - case V4DI_FTYPE_V4DI_V4DI_UQI: - case V4DI_FTYPE_V4SI_V4DI_UQI: - case V4DI_FTYPE_V8HI_V4DI_UQI: - case V4DI_FTYPE_V16QI_V4DI_UQI: - case V4DI_FTYPE_V4DF_V4DI_UQI: - case V2DI_FTYPE_V2DF_V2DI_UQI: - case V4SI_FTYPE_V4DF_V4SI_UQI: - case V4SI_FTYPE_V2DF_V4SI_UQI: - case V4SI_FTYPE_V8HI_V4SI_UQI: - case V4SI_FTYPE_V16QI_V4SI_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI: - case V8DF_FTYPE_V2DF_V8DF_UQI: - case V8DF_FTYPE_V4DF_V8DF_UQI: - case V8DF_FTYPE_V8DF_V8DF_UQI: - case V8SF_FTYPE_V8SF_V8SF_UQI: - case V8SF_FTYPE_V8SI_V8SF_UQI: - case V4DF_FTYPE_V4DF_V4DF_UQI: - case V4SF_FTYPE_V4SF_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DF_UQI: - case V2DF_FTYPE_V4SF_V2DF_UQI: - case V2DF_FTYPE_V4SI_V2DF_UQI: - case V4SF_FTYPE_V4SI_V4SF_UQI: - case V4DF_FTYPE_V4SF_V4DF_UQI: - case V4DF_FTYPE_V4SI_V4DF_UQI: - case V8SI_FTYPE_V8SI_V8SI_UQI: - case V8SI_FTYPE_V8HI_V8SI_UQI: - case V8SI_FTYPE_V16QI_V8SI_UQI: - case V8DF_FTYPE_V8SI_V8DF_UQI: - case V8DI_FTYPE_DI_V8DI_UQI: - case V16SF_FTYPE_V8SF_V16SF_UHI: - case V16SI_FTYPE_V8SI_V16SI_UHI: - case V16HI_FTYPE_V16HI_V16HI_UHI: - case V8HI_FTYPE_V16QI_V8HI_UQI: - case V16HI_FTYPE_V16QI_V16HI_UHI: - case V32HI_FTYPE_V32HI_V32HI_USI: - case V32HI_FTYPE_V32QI_V32HI_USI: - case V8DI_FTYPE_V16QI_V8DI_UQI: - case V8DI_FTYPE_V2DI_V8DI_UQI: - case V8DI_FTYPE_V4DI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V8DI_UQI: - case V8DI_FTYPE_V8HI_V8DI_UQI: - case V8DI_FTYPE_V8SI_V8DI_UQI: - case V8HI_FTYPE_V8DI_V8HI_UQI: - case V8SI_FTYPE_V8DI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI: - case V16SI_FTYPE_V16SI_V16SI_V16SI: - case V8DI_FTYPE_V8DI_V8DI_V8DI: - case V32HI_FTYPE_V32HI_V32HI_V32HI: - case V2DI_FTYPE_V2DI_V2DI_V2DI: - case V16HI_FTYPE_V16HI_V16HI_V16HI: - case V8SI_FTYPE_V8SI_V8SI_V8SI: - case V8HI_FTYPE_V8HI_V8HI_V8HI: - nargs = 3; - break; - case V32QI_FTYPE_V32QI_V32QI_INT: - case V16HI_FTYPE_V16HI_V16HI_INT: - case V16QI_FTYPE_V16QI_V16QI_INT: - case V4DI_FTYPE_V4DI_V4DI_INT: - case V8HI_FTYPE_V8HI_V8HI_INT: - case V8SI_FTYPE_V8SI_V8SI_INT: - case V8SI_FTYPE_V8SI_V4SI_INT: - case V8SF_FTYPE_V8SF_V8SF_INT: - case V8SF_FTYPE_V8SF_V4SF_INT: - case V4SI_FTYPE_V4SI_V4SI_INT: - 
case V4DF_FTYPE_V4DF_V4DF_INT: - case V16SF_FTYPE_V16SF_V16SF_INT: - case V16SF_FTYPE_V16SF_V4SF_INT: - case V16SI_FTYPE_V16SI_V4SI_INT: - case V4DF_FTYPE_V4DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V4SF_INT: - case V2DI_FTYPE_V2DI_V2DI_INT: - case V4DI_FTYPE_V4DI_V2DI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT: - case UQI_FTYPE_V8DI_V8UDI_INT: - case UQI_FTYPE_V8DF_V8DF_INT: - case UQI_FTYPE_V2DF_V2DF_INT: - case UQI_FTYPE_V4SF_V4SF_INT: - case UHI_FTYPE_V16SI_V16SI_INT: - case UHI_FTYPE_V16SF_V16SF_INT: - case V64QI_FTYPE_V64QI_V64QI_INT: - case V32HI_FTYPE_V32HI_V32HI_INT: - case V16SI_FTYPE_V16SI_V16SI_INT: - case V8DI_FTYPE_V8DI_V8DI_INT: - nargs = 3; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: - nargs = 3; - rmode = V4DImode; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: - nargs = 3; - rmode = V2DImode; - nargs_constant = 1; - break; - case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: - nargs = 3; - rmode = DImode; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_UINT_UINT: - nargs = 3; - nargs_constant = 2; - break; - case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT: - nargs = 3; - rmode = V8DImode; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT: - nargs = 5; - rmode = V8DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case QI_FTYPE_V8DF_INT_UQI: - case QI_FTYPE_V4DF_INT_UQI: - case QI_FTYPE_V2DF_INT_UQI: - case HI_FTYPE_V16SF_INT_UHI: - case QI_FTYPE_V8SF_INT_UQI: - case QI_FTYPE_V4SF_INT_UQI: - case V4SI_FTYPE_V4SI_V4SI_UHI: - case V8SI_FTYPE_V8SI_V8SI_UHI: - nargs = 3; - mask_pos = 1; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT: - nargs = 5; - rmode = V4DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT: - nargs = 5; - rmode = V2DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case V32QI_FTYPE_V32QI_V32QI_V32QI_USI: - case V32HI_FTYPE_V32HI_V32HI_V32HI_USI: - case V32HI_FTYPE_V64QI_V64QI_V32HI_USI: - case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI: - case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI: - case V32HI_FTYPE_V32HI_V8HI_V32HI_USI: - case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI: - case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI: - case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI: - case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI: - case V32QI_FTYPE_V16HI_V16HI_V32QI_USI: - case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI: - case V32HI_FTYPE_V16SI_V16SI_V32HI_USI: - case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: - case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: - case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: - case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: - case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: - case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI: - case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI: - case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI: - case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI: - case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI: - case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI: - case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI: - case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI: - case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI: - case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI: - case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI: - case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: - case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: - case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: - case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: - case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: - case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI: - case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: - case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: - case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: - case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: - case 
V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: - case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI: - case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI: - case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI: - case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: - case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: - case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: - nargs = 4; - break; - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: - case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: - case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: - case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: - nargs = 4; - nargs_constant = 1; - break; - case UQI_FTYPE_V4DI_V4DI_INT_UQI: - case UQI_FTYPE_V8SI_V8SI_INT_UQI: - case QI_FTYPE_V4DF_V4DF_INT_UQI: - case QI_FTYPE_V8SF_V8SF_INT_UQI: - case UQI_FTYPE_V2DI_V2DI_INT_UQI: - case UQI_FTYPE_V4SI_V4SI_INT_UQI: - case UQI_FTYPE_V2DF_V2DF_INT_UQI: - case UQI_FTYPE_V4SF_V4SF_INT_UQI: - case UDI_FTYPE_V64QI_V64QI_INT_UDI: - case USI_FTYPE_V32QI_V32QI_INT_USI: - case UHI_FTYPE_V16QI_V16QI_INT_UHI: - case USI_FTYPE_V32HI_V32HI_INT_USI: - case UHI_FTYPE_V16HI_V16HI_INT_UHI: - case UQI_FTYPE_V8HI_V8HI_INT_UQI: - case V32HI_FTYPE_V32HI_V32HI_V32HI_INT: - case V16HI_FTYPE_V16HI_V16HI_V16HI_INT: - case V8HI_FTYPE_V8HI_V8HI_V8HI_INT: - case V8SI_FTYPE_V8SI_V8SI_V8SI_INT: - case V4DI_FTYPE_V4DI_V4DI_V4DI_INT: - case V8DI_FTYPE_V8DI_V8DI_V8DI_INT: - case V16SI_FTYPE_V16SI_V16SI_V16SI_INT: - case V2DI_FTYPE_V2DI_V2DI_V2DI_INT: - case V4SI_FTYPE_V4SI_V4SI_V4SI_INT: - nargs = 4; - mask_pos = 1; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: - nargs = 4; - nargs_constant = 2; - break; - case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: - case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: - nargs = 4; - break; - case UQI_FTYPE_V8DI_V8DI_INT_UQI: - case UHI_FTYPE_V16SI_V16SI_INT_UHI: - mask_pos = 1; - nargs = 4; - nargs_constant = 1; - break; - case V8SF_FTYPE_V8SF_INT_V8SF_UQI: - case V4SF_FTYPE_V4SF_INT_V4SF_UQI: - case V2DF_FTYPE_V4DF_INT_V2DF_UQI: - case V2DI_FTYPE_V4DI_INT_V2DI_UQI: - case V8SF_FTYPE_V16SF_INT_V8SF_UQI: - case V8SI_FTYPE_V16SI_INT_V8SI_UQI: - case V2DF_FTYPE_V8DF_INT_V2DF_UQI: - case V2DI_FTYPE_V8DI_INT_V2DI_UQI: - case V4SF_FTYPE_V8SF_INT_V4SF_UQI: - case V4SI_FTYPE_V8SI_INT_V4SI_UQI: - case V8HI_FTYPE_V8SF_INT_V8HI_UQI: - case V8HI_FTYPE_V4SF_INT_V8HI_UQI: - case V32HI_FTYPE_V32HI_INT_V32HI_USI: - case V16HI_FTYPE_V16HI_INT_V16HI_UHI: - case V8HI_FTYPE_V8HI_INT_V8HI_UQI: - case V4DI_FTYPE_V4DI_INT_V4DI_UQI: - case V2DI_FTYPE_V2DI_INT_V2DI_UQI: - case V8SI_FTYPE_V8SI_INT_V8SI_UQI: - case V4SI_FTYPE_V4SI_INT_V4SI_UQI: - case V4DF_FTYPE_V4DF_INT_V4DF_UQI: - case V2DF_FTYPE_V2DF_INT_V2DF_UQI: - case V8DF_FTYPE_V8DF_INT_V8DF_UQI: - case V16SF_FTYPE_V16SF_INT_V16SF_UHI: - case V16HI_FTYPE_V16SF_INT_V16HI_UHI: - case V16SI_FTYPE_V16SI_INT_V16SI_UHI: - case V4SI_FTYPE_V16SI_INT_V4SI_UQI: - case V4DI_FTYPE_V8DI_INT_V4DI_UQI: - case V4DF_FTYPE_V8DF_INT_V4DF_UQI: - case V4SF_FTYPE_V16SF_INT_V4SF_UQI: - case V8DI_FTYPE_V8DI_INT_V8DI_UQI: - nargs = 4; - mask_pos = 2; - nargs_constant = 1; - break; - case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI: - case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI: - case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI: - case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI: - case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI: - case 
V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI: - case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI: - case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI: - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI: - case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI: - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI: - case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI: - case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI: - case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI: - case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI: - case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI: - case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI: - case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI: - case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI: - nargs = 5; - mask_pos = 2; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI: - case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI: - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI: - case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI: - case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI: - case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI: - case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI: - nargs = 5; - mask_pos = 1; - nargs_constant = 1; - break; - case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: - case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: - case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: - case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT: - case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT: - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT: - case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT: - case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT: - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT: - case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT: - case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT: - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT: - nargs = 5; - mask_pos = 1; - nargs_constant = 2; - break; - - default: - gcc_unreachable (); - } - - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (comparison != UNKNOWN) - { - gcc_assert (nargs == 2); - return ix86_expand_sse_compare (d, exp, target, swap); - } - - if (rmode == VOIDmode || rmode == tmode) - { - if (optimize - || target == 0 - || GET_MODE (target) != tmode - || !insn_p->operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - else if (memory_operand (target, tmode)) - num_memory++; - real_target = target; - } - else - { - real_target = gen_reg_rtx (tmode); - target = lowpart_subreg (rmode, real_target, tmode); - } - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - machine_mode mode = insn_p->operand[i + 1].mode; - bool match = insn_p->operand[i + 1].predicate (op, mode); - - if (second_arg_count && i == 1) - { - /* SIMD shift insns take either an 8-bit immediate or - register as count. But builtin functions take int as - count. If count doesn't match, we put it in register. - The instructions are using 64-bit count, if op is just - 32-bit, zero-extend it, as negative shift counts - are undefined behavior and zero-extension is more - efficient. 
*/ - if (!match) - { - if (SCALAR_INT_MODE_P (GET_MODE (op))) - op = convert_modes (mode, GET_MODE (op), op, 1); - else - op = lowpart_subreg (mode, op, GET_MODE (op)); - if (!insn_p->operand[i + 1].predicate (op, mode)) - op = copy_to_reg (op); - } - } - else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || - (!mask_pos && (nargs - i) <= nargs_constant)) - { - if (!match) - switch (icode) - { - case CODE_FOR_avx_vinsertf128v4di: - case CODE_FOR_avx_vextractf128v4di: - error ("the last argument must be an 1-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx512f_cmpv8di3_mask: - case CODE_FOR_avx512f_cmpv16si3_mask: - case CODE_FOR_avx512f_ucmpv8di3_mask: - case CODE_FOR_avx512f_ucmpv16si3_mask: - case CODE_FOR_avx512vl_cmpv4di3_mask: - case CODE_FOR_avx512vl_cmpv8si3_mask: - case CODE_FOR_avx512vl_ucmpv4di3_mask: - case CODE_FOR_avx512vl_ucmpv8si3_mask: - case CODE_FOR_avx512vl_cmpv2di3_mask: - case CODE_FOR_avx512vl_cmpv4si3_mask: - case CODE_FOR_avx512vl_ucmpv2di3_mask: - case CODE_FOR_avx512vl_ucmpv4si3_mask: - error ("the last argument must be a 3-bit immediate"); - return const0_rtx; - - case CODE_FOR_sse4_1_roundsd: - case CODE_FOR_sse4_1_roundss: - - case CODE_FOR_sse4_1_roundpd: - case CODE_FOR_sse4_1_roundps: - case CODE_FOR_avx_roundpd256: - case CODE_FOR_avx_roundps256: - - case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: - case CODE_FOR_sse4_1_roundps_sfix: - case CODE_FOR_avx_roundpd_vec_pack_sfix256: - case CODE_FOR_avx_roundps_sfix256: - - case CODE_FOR_sse4_1_blendps: - case CODE_FOR_avx_blendpd256: - case CODE_FOR_avx_vpermilv4df: - case CODE_FOR_avx_vpermilv4df_mask: - case CODE_FOR_avx512f_getmantv8df_mask: - case CODE_FOR_avx512f_getmantv16sf_mask: - case CODE_FOR_avx512vl_getmantv8sf_mask: - case CODE_FOR_avx512vl_getmantv4df_mask: - case CODE_FOR_avx512vl_getmantv4sf_mask: - case CODE_FOR_avx512vl_getmantv2df_mask: - case CODE_FOR_avx512dq_rangepv8df_mask_round: - case CODE_FOR_avx512dq_rangepv16sf_mask_round: - case CODE_FOR_avx512dq_rangepv4df_mask: - case CODE_FOR_avx512dq_rangepv8sf_mask: - case CODE_FOR_avx512dq_rangepv2df_mask: - case CODE_FOR_avx512dq_rangepv4sf_mask: - case CODE_FOR_avx_shufpd256_mask: - error ("the last argument must be a 4-bit immediate"); - return const0_rtx; - - case CODE_FOR_sha1rnds4: - case CODE_FOR_sse4_1_blendpd: - case CODE_FOR_avx_vpermilv2df: - case CODE_FOR_avx_vpermilv2df_mask: - case CODE_FOR_xop_vpermil2v2df3: - case CODE_FOR_xop_vpermil2v4sf3: - case CODE_FOR_xop_vpermil2v4df3: - case CODE_FOR_xop_vpermil2v8sf3: - case CODE_FOR_avx512f_vinsertf32x4_mask: - case CODE_FOR_avx512f_vinserti32x4_mask: - case CODE_FOR_avx512f_vextractf32x4_mask: - case CODE_FOR_avx512f_vextracti32x4_mask: - case CODE_FOR_sse2_shufpd: - case CODE_FOR_sse2_shufpd_mask: - case CODE_FOR_avx512dq_shuf_f64x2_mask: - case CODE_FOR_avx512dq_shuf_i64x2_mask: - case CODE_FOR_avx512vl_shuf_i32x4_mask: - case CODE_FOR_avx512vl_shuf_f32x4_mask: - error ("the last argument must be a 2-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx_vextractf128v4df: - case CODE_FOR_avx_vextractf128v8sf: - case CODE_FOR_avx_vextractf128v8si: - case CODE_FOR_avx_vinsertf128v4df: - case CODE_FOR_avx_vinsertf128v8sf: - case CODE_FOR_avx_vinsertf128v8si: - case CODE_FOR_avx512f_vinsertf64x4_mask: - case CODE_FOR_avx512f_vinserti64x4_mask: - case CODE_FOR_avx512f_vextractf64x4_mask: - case CODE_FOR_avx512f_vextracti64x4_mask: - case CODE_FOR_avx512dq_vinsertf32x8_mask: - case CODE_FOR_avx512dq_vinserti32x8_mask: - case CODE_FOR_avx512vl_vinsertv4df: - case 
CODE_FOR_avx512vl_vinsertv4di: - case CODE_FOR_avx512vl_vinsertv8sf: - case CODE_FOR_avx512vl_vinsertv8si: - error ("the last argument must be a 1-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx_vmcmpv2df3: - case CODE_FOR_avx_vmcmpv4sf3: - case CODE_FOR_avx_cmpv2df3: - case CODE_FOR_avx_cmpv4sf3: - case CODE_FOR_avx_cmpv4df3: - case CODE_FOR_avx_cmpv8sf3: - case CODE_FOR_avx512f_cmpv8df3_mask: - case CODE_FOR_avx512f_cmpv16sf3_mask: - case CODE_FOR_avx512f_vmcmpv2df3_mask: - case CODE_FOR_avx512f_vmcmpv4sf3_mask: - error ("the last argument must be a 5-bit immediate"); - return const0_rtx; - - default: - switch (nargs_constant) - { - case 2: - if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || - (!mask_pos && (nargs - i) == nargs_constant)) - { - error ("the next to last argument must be an 8-bit immediate"); - break; - } - /* FALLTHRU */ - case 1: - error ("the last argument must be an 8-bit immediate"); - break; - default: - gcc_unreachable (); - } - return const0_rtx; - } - } - else - { - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - /* If we aren't optimizing, only allow one memory operand to - be generated. */ - if (memory_operand (op, mode)) - num_memory++; - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - { - if (optimize || !match || num_memory > 1) - op = copy_to_mode_reg (mode, op); - } - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (real_target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op); - break; - case 4: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op); - break; - case 5: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op); - break; - case 6: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op, - args[5].op); - break; - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - - emit_insn (pat); - return target; -} - -/* Transform pattern of following layout: - (set A - (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) - ) - into: - (set (A B)) */ - -static rtx -ix86_erase_embedded_rounding (rtx pat) -{ - if (GET_CODE (pat) == INSN) - pat = PATTERN (pat); - - gcc_assert (GET_CODE (pat) == SET); - rtx src = SET_SRC (pat); - gcc_assert (XVECLEN (src, 0) == 2); - rtx p0 = XVECEXP (src, 0, 0); - gcc_assert (GET_CODE (src) == UNSPEC - && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING); - rtx res = gen_rtx_SET (SET_DEST (pat), p0); - return res; -} - -/* Subroutine of ix86_expand_round_builtin to take care of comi insns - with rounding. 
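   The third argument of these builtins is a _CMP_*-style constant in the
   range 0-31 that indexes the comi_comparisons[] table below and decides
   whether the unordered (ucomi) form of the instruction is needed; the
   fourth argument is the rounding/SAE operand, and when it is NO_ROUND the
   embedded-rounding wrapper is stripped again via
   ix86_erase_embedded_rounding.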
*/ -static rtx -ix86_expand_sse_comi_round (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat, set_dst; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - tree arg3 = CALL_EXPR_ARG (exp, 3); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - rtx op3 = expand_normal (arg3); - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode mode0 = insn_p->operand[0].mode; - machine_mode mode1 = insn_p->operand[1].mode; - enum rtx_code comparison = UNEQ; - bool need_ucomi = false; - - /* See avxintrin.h for values. */ - enum rtx_code comi_comparisons[32] = - { - UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT, - UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE, - UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT - }; - bool need_ucomi_values[32] = - { - true, false, false, true, true, false, false, true, - true, false, false, true, true, false, false, true, - false, true, true, false, false, true, true, false, - false, true, true, false, false, true, true, false - }; - - if (!CONST_INT_P (op2)) - { - error ("the third argument must be comparison constant"); - return const0_rtx; - } - if (INTVAL (op2) < 0 || INTVAL (op2) >= 32) - { - error ("incorrect comparison mode"); - return const0_rtx; - } - - if (!insn_p->operand[2].predicate (op3, SImode)) - { - error ("incorrect rounding operand"); - return const0_rtx; - } - - comparison = comi_comparisons[INTVAL (op2)]; - need_ucomi = need_ucomi_values[INTVAL (op2)]; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_p->operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_p->operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - if (need_ucomi) - icode = icode == CODE_FOR_sse_comi_round - ? CODE_FOR_sse_ucomi_round - : CODE_FOR_sse2_ucomi_round; - - pat = GEN_FCN (icode) (op0, op1, op3); - if (! pat) - return 0; - - /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */ - if (INTVAL (op3) == NO_ROUND) - { - pat = ix86_erase_embedded_rounding (pat); - if (! 
pat) - return 0; - - set_dst = SET_DEST (pat); - } - else - { - gcc_assert (GET_CODE (pat) == SET); - set_dst = SET_DEST (pat); - } - - emit_insn (pat); - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - set_dst, - const0_rtx))); - - return SUBREG_REG (target); -} - -static rtx -ix86_expand_round_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - unsigned int i, nargs; - struct - { - rtx op; - machine_mode mode; - } args[6]; - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - unsigned int nargs_constant = 0; - unsigned int redundant_embed_rnd = 0; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case UINT64_FTYPE_V2DF_INT: - case UINT64_FTYPE_V4SF_INT: - case UINT_FTYPE_V2DF_INT: - case UINT_FTYPE_V4SF_INT: - case INT64_FTYPE_V2DF_INT: - case INT64_FTYPE_V4SF_INT: - case INT_FTYPE_V2DF_INT: - case INT_FTYPE_V4SF_INT: - nargs = 2; - break; - case V4SF_FTYPE_V4SF_UINT_INT: - case V4SF_FTYPE_V4SF_UINT64_INT: - case V2DF_FTYPE_V2DF_UINT64_INT: - case V4SF_FTYPE_V4SF_INT_INT: - case V4SF_FTYPE_V4SF_INT64_INT: - case V2DF_FTYPE_V2DF_INT64_INT: - case V4SF_FTYPE_V4SF_V4SF_INT: - case V2DF_FTYPE_V2DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V2DF_INT: - case V2DF_FTYPE_V2DF_V4SF_INT: - nargs = 3; - break; - case V8SF_FTYPE_V8DF_V8SF_QI_INT: - case V8DF_FTYPE_V8DF_V8DF_QI_INT: - case V8SI_FTYPE_V8DF_V8SI_QI_INT: - case V8DI_FTYPE_V8DF_V8DI_QI_INT: - case V8SF_FTYPE_V8DI_V8SF_QI_INT: - case V8DF_FTYPE_V8DI_V8DF_QI_INT: - case V16SF_FTYPE_V16SF_V16SF_HI_INT: - case V8DI_FTYPE_V8SF_V8DI_QI_INT: - case V16SF_FTYPE_V16SI_V16SF_HI_INT: - case V16SI_FTYPE_V16SF_V16SI_HI_INT: - case V8DF_FTYPE_V8SF_V8DF_QI_INT: - case V16SF_FTYPE_V16HI_V16SF_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_INT: - nargs = 4; - break; - case V4SF_FTYPE_V4SF_V4SF_INT_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_INT: - nargs_constant = 2; - nargs = 4; - break; - case INT_FTYPE_V4SF_V4SF_INT_INT: - case INT_FTYPE_V2DF_V2DF_INT_INT: - return ix86_expand_sse_comi_round (d, exp, target); - case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: - case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: - case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: - case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: - nargs = 5; - break; - case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: - case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: - nargs_constant = 4; - nargs = 5; - break; - case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT: - case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: - case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: - case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: - nargs_constant = 3; - nargs = 5; - break; - case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT: - case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: - nargs = 6; - nargs_constant = 4; - break; - case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT: - case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT: - nargs = 6; - nargs_constant = 3; - break; - default: - gcc_unreachable (); - } - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (optimize - || target == 0 - || GET_MODE 
(target) != tmode - || !insn_p->operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - machine_mode mode = insn_p->operand[i + 1].mode; - bool match = insn_p->operand[i + 1].predicate (op, mode); - - if (i == nargs - nargs_constant) - { - if (!match) - { - switch (icode) - { - case CODE_FOR_avx512f_getmantv8df_mask_round: - case CODE_FOR_avx512f_getmantv16sf_mask_round: - case CODE_FOR_avx512f_vgetmantv2df_round: - case CODE_FOR_avx512f_vgetmantv2df_mask_round: - case CODE_FOR_avx512f_vgetmantv4sf_round: - case CODE_FOR_avx512f_vgetmantv4sf_mask_round: - error ("the immediate argument must be a 4-bit immediate"); - return const0_rtx; - case CODE_FOR_avx512f_cmpv8df3_mask_round: - case CODE_FOR_avx512f_cmpv16sf3_mask_round: - case CODE_FOR_avx512f_vmcmpv2df3_mask_round: - case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: - error ("the immediate argument must be a 5-bit immediate"); - return const0_rtx; - default: - error ("the immediate argument must be an 8-bit immediate"); - return const0_rtx; - } - } - } - else if (i == nargs-1) - { - if (!insn_p->operand[nargs].predicate (op, SImode)) - { - error ("incorrect rounding operand"); - return const0_rtx; - } - - /* If there is no rounding use normal version of the pattern. */ - if (INTVAL (op) == NO_ROUND) - redundant_embed_rnd = 1; - } - else - { - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - { - if (optimize || !match) - op = copy_to_mode_reg (mode, op); - } - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op); - break; - case 4: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op); - break; - case 5: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op); - break; - case 6: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op, - args[5].op); - break; - default: - gcc_unreachable (); - } - - if (!pat) - return 0; - - if (redundant_embed_rnd) - pat = ix86_erase_embedded_rounding (pat); - - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of special insns - with variable number of operands. 
*/ - -static rtx -ix86_expand_special_args_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - tree arg; - rtx pat, op; - unsigned int i, nargs, arg_adjust, memory; - bool aligned_mem = false; - struct - { - rtx op; - machine_mode mode; - } args[3]; - enum insn_code icode = d->icode; - bool last_arg_constant = false; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - enum { load, store } klass; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case VOID_FTYPE_VOID: - emit_insn (GEN_FCN (icode) (target)); - return 0; - case VOID_FTYPE_UINT64: - case VOID_FTYPE_UNSIGNED: - nargs = 0; - klass = store; - memory = 0; - break; - - case INT_FTYPE_VOID: - case USHORT_FTYPE_VOID: - case UINT64_FTYPE_VOID: - case UINT_FTYPE_VOID: - case UNSIGNED_FTYPE_VOID: - nargs = 0; - klass = load; - memory = 0; - break; - case UINT64_FTYPE_PUNSIGNED: - case V2DI_FTYPE_PV2DI: - case V4DI_FTYPE_PV4DI: - case V32QI_FTYPE_PCCHAR: - case V16QI_FTYPE_PCCHAR: - case V8SF_FTYPE_PCV4SF: - case V8SF_FTYPE_PCFLOAT: - case V4SF_FTYPE_PCFLOAT: - case V4DF_FTYPE_PCV2DF: - case V4DF_FTYPE_PCDOUBLE: - case V2DF_FTYPE_PCDOUBLE: - case VOID_FTYPE_PVOID: - case V8DI_FTYPE_PV8DI: - nargs = 1; - klass = load; - memory = 0; - switch (icode) - { - case CODE_FOR_sse4_1_movntdqa: - case CODE_FOR_avx2_movntdqa: - case CODE_FOR_avx512f_movntdqa: - aligned_mem = true; - break; - default: - break; - } - break; - case VOID_FTYPE_PV2SF_V4SF: - case VOID_FTYPE_PV8DI_V8DI: - case VOID_FTYPE_PV4DI_V4DI: - case VOID_FTYPE_PV2DI_V2DI: - case VOID_FTYPE_PCHAR_V32QI: - case VOID_FTYPE_PCHAR_V16QI: - case VOID_FTYPE_PFLOAT_V16SF: - case VOID_FTYPE_PFLOAT_V8SF: - case VOID_FTYPE_PFLOAT_V4SF: - case VOID_FTYPE_PDOUBLE_V8DF: - case VOID_FTYPE_PDOUBLE_V4DF: - case VOID_FTYPE_PDOUBLE_V2DF: - case VOID_FTYPE_PLONGLONG_LONGLONG: - case VOID_FTYPE_PULONGLONG_ULONGLONG: - case VOID_FTYPE_PUNSIGNED_UNSIGNED: - case VOID_FTYPE_PINT_INT: - nargs = 1; - klass = store; - /* Reserve memory operand for target. */ - memory = ARRAY_SIZE (args); - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
*/ - case CODE_FOR_avx_movntv4di: - case CODE_FOR_sse2_movntv2di: - case CODE_FOR_avx_movntv8sf: - case CODE_FOR_sse_movntv4sf: - case CODE_FOR_sse4a_vmmovntv4sf: - case CODE_FOR_avx_movntv4df: - case CODE_FOR_sse2_movntv2df: - case CODE_FOR_sse4a_vmmovntv2df: - case CODE_FOR_sse2_movntidi: - case CODE_FOR_sse_movntq: - case CODE_FOR_sse2_movntisi: - case CODE_FOR_avx512f_movntv16sf: - case CODE_FOR_avx512f_movntv8df: - case CODE_FOR_avx512f_movntv8di: - aligned_mem = true; - break; - default: - break; - } - break; - case VOID_FTYPE_PVOID_PCVOID: - nargs = 1; - klass = store; - memory = 0; - - break; - case V4SF_FTYPE_V4SF_PCV2SF: - case V2DF_FTYPE_V2DF_PCDOUBLE: - nargs = 2; - klass = load; - memory = 1; - break; - case V8SF_FTYPE_PCV8SF_V8SI: - case V4DF_FTYPE_PCV4DF_V4DI: - case V4SF_FTYPE_PCV4SF_V4SI: - case V2DF_FTYPE_PCV2DF_V2DI: - case V8SI_FTYPE_PCV8SI_V8SI: - case V4DI_FTYPE_PCV4DI_V4DI: - case V4SI_FTYPE_PCV4SI_V4SI: - case V2DI_FTYPE_PCV2DI_V2DI: - case VOID_FTYPE_INT_INT64: - nargs = 2; - klass = load; - memory = 0; - break; - case VOID_FTYPE_PV8DF_V8DF_UQI: - case VOID_FTYPE_PV4DF_V4DF_UQI: - case VOID_FTYPE_PV2DF_V2DF_UQI: - case VOID_FTYPE_PV16SF_V16SF_UHI: - case VOID_FTYPE_PV8SF_V8SF_UQI: - case VOID_FTYPE_PV4SF_V4SF_UQI: - case VOID_FTYPE_PV8DI_V8DI_UQI: - case VOID_FTYPE_PV4DI_V4DI_UQI: - case VOID_FTYPE_PV2DI_V2DI_UQI: - case VOID_FTYPE_PV16SI_V16SI_UHI: - case VOID_FTYPE_PV8SI_V8SI_UQI: - case VOID_FTYPE_PV4SI_V4SI_UQI: - case VOID_FTYPE_PV64QI_V64QI_UDI: - case VOID_FTYPE_PV32HI_V32HI_USI: - case VOID_FTYPE_PV32QI_V32QI_USI: - case VOID_FTYPE_PV16QI_V16QI_UHI: - case VOID_FTYPE_PV16HI_V16HI_UHI: - case VOID_FTYPE_PV8HI_V8HI_UQI: - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
*/ - case CODE_FOR_avx512f_storev16sf_mask: - case CODE_FOR_avx512f_storev16si_mask: - case CODE_FOR_avx512f_storev8df_mask: - case CODE_FOR_avx512f_storev8di_mask: - case CODE_FOR_avx512vl_storev8sf_mask: - case CODE_FOR_avx512vl_storev8si_mask: - case CODE_FOR_avx512vl_storev4df_mask: - case CODE_FOR_avx512vl_storev4di_mask: - case CODE_FOR_avx512vl_storev4sf_mask: - case CODE_FOR_avx512vl_storev4si_mask: - case CODE_FOR_avx512vl_storev2df_mask: - case CODE_FOR_avx512vl_storev2di_mask: - aligned_mem = true; - break; - default: - break; - } - /* FALLTHRU */ - case VOID_FTYPE_PV8SF_V8SI_V8SF: - case VOID_FTYPE_PV4DF_V4DI_V4DF: - case VOID_FTYPE_PV4SF_V4SI_V4SF: - case VOID_FTYPE_PV2DF_V2DI_V2DF: - case VOID_FTYPE_PV8SI_V8SI_V8SI: - case VOID_FTYPE_PV4DI_V4DI_V4DI: - case VOID_FTYPE_PV4SI_V4SI_V4SI: - case VOID_FTYPE_PV2DI_V2DI_V2DI: - case VOID_FTYPE_PV8SI_V8DI_UQI: - case VOID_FTYPE_PV8HI_V8DI_UQI: - case VOID_FTYPE_PV16HI_V16SI_UHI: - case VOID_FTYPE_PV16QI_V8DI_UQI: - case VOID_FTYPE_PV16QI_V16SI_UHI: - case VOID_FTYPE_PV4SI_V4DI_UQI: - case VOID_FTYPE_PV4SI_V2DI_UQI: - case VOID_FTYPE_PV8HI_V4DI_UQI: - case VOID_FTYPE_PV8HI_V2DI_UQI: - case VOID_FTYPE_PV8HI_V8SI_UQI: - case VOID_FTYPE_PV8HI_V4SI_UQI: - case VOID_FTYPE_PV16QI_V4DI_UQI: - case VOID_FTYPE_PV16QI_V2DI_UQI: - case VOID_FTYPE_PV16QI_V8SI_UQI: - case VOID_FTYPE_PV16QI_V4SI_UQI: - case VOID_FTYPE_PCHAR_V64QI_UDI: - case VOID_FTYPE_PCHAR_V32QI_USI: - case VOID_FTYPE_PCHAR_V16QI_UHI: - case VOID_FTYPE_PSHORT_V32HI_USI: - case VOID_FTYPE_PSHORT_V16HI_UHI: - case VOID_FTYPE_PSHORT_V8HI_UQI: - case VOID_FTYPE_PINT_V16SI_UHI: - case VOID_FTYPE_PINT_V8SI_UQI: - case VOID_FTYPE_PINT_V4SI_UQI: - case VOID_FTYPE_PINT64_V8DI_UQI: - case VOID_FTYPE_PINT64_V4DI_UQI: - case VOID_FTYPE_PINT64_V2DI_UQI: - case VOID_FTYPE_PDOUBLE_V8DF_UQI: - case VOID_FTYPE_PDOUBLE_V4DF_UQI: - case VOID_FTYPE_PDOUBLE_V2DF_UQI: - case VOID_FTYPE_PFLOAT_V16SF_UHI: - case VOID_FTYPE_PFLOAT_V8SF_UQI: - case VOID_FTYPE_PFLOAT_V4SF_UQI: - case VOID_FTYPE_PV32QI_V32HI_USI: - case VOID_FTYPE_PV16QI_V16HI_UHI: - case VOID_FTYPE_PV8QI_V8HI_UQI: - nargs = 2; - klass = store; - /* Reserve memory operand for target. */ - memory = ARRAY_SIZE (args); - break; - case V4SF_FTYPE_PCV4SF_V4SF_UQI: - case V8SF_FTYPE_PCV8SF_V8SF_UQI: - case V16SF_FTYPE_PCV16SF_V16SF_UHI: - case V4SI_FTYPE_PCV4SI_V4SI_UQI: - case V8SI_FTYPE_PCV8SI_V8SI_UQI: - case V16SI_FTYPE_PCV16SI_V16SI_UHI: - case V2DF_FTYPE_PCV2DF_V2DF_UQI: - case V4DF_FTYPE_PCV4DF_V4DF_UQI: - case V8DF_FTYPE_PCV8DF_V8DF_UQI: - case V2DI_FTYPE_PCV2DI_V2DI_UQI: - case V4DI_FTYPE_PCV4DI_V4DI_UQI: - case V8DI_FTYPE_PCV8DI_V8DI_UQI: - case V64QI_FTYPE_PCV64QI_V64QI_UDI: - case V32HI_FTYPE_PCV32HI_V32HI_USI: - case V32QI_FTYPE_PCV32QI_V32QI_USI: - case V16QI_FTYPE_PCV16QI_V16QI_UHI: - case V16HI_FTYPE_PCV16HI_V16HI_UHI: - case V8HI_FTYPE_PCV8HI_V8HI_UQI: - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
*/ - case CODE_FOR_avx512f_loadv16sf_mask: - case CODE_FOR_avx512f_loadv16si_mask: - case CODE_FOR_avx512f_loadv8df_mask: - case CODE_FOR_avx512f_loadv8di_mask: - case CODE_FOR_avx512vl_loadv8sf_mask: - case CODE_FOR_avx512vl_loadv8si_mask: - case CODE_FOR_avx512vl_loadv4df_mask: - case CODE_FOR_avx512vl_loadv4di_mask: - case CODE_FOR_avx512vl_loadv4sf_mask: - case CODE_FOR_avx512vl_loadv4si_mask: - case CODE_FOR_avx512vl_loadv2df_mask: - case CODE_FOR_avx512vl_loadv2di_mask: - case CODE_FOR_avx512bw_loadv64qi_mask: - case CODE_FOR_avx512vl_loadv32qi_mask: - case CODE_FOR_avx512vl_loadv16qi_mask: - case CODE_FOR_avx512bw_loadv32hi_mask: - case CODE_FOR_avx512vl_loadv16hi_mask: - case CODE_FOR_avx512vl_loadv8hi_mask: - aligned_mem = true; - break; - default: - break; - } - /* FALLTHRU */ - case V64QI_FTYPE_PCCHAR_V64QI_UDI: - case V32QI_FTYPE_PCCHAR_V32QI_USI: - case V16QI_FTYPE_PCCHAR_V16QI_UHI: - case V32HI_FTYPE_PCSHORT_V32HI_USI: - case V16HI_FTYPE_PCSHORT_V16HI_UHI: - case V8HI_FTYPE_PCSHORT_V8HI_UQI: - case V16SI_FTYPE_PCINT_V16SI_UHI: - case V8SI_FTYPE_PCINT_V8SI_UQI: - case V4SI_FTYPE_PCINT_V4SI_UQI: - case V8DI_FTYPE_PCINT64_V8DI_UQI: - case V4DI_FTYPE_PCINT64_V4DI_UQI: - case V2DI_FTYPE_PCINT64_V2DI_UQI: - case V8DF_FTYPE_PCDOUBLE_V8DF_UQI: - case V4DF_FTYPE_PCDOUBLE_V4DF_UQI: - case V2DF_FTYPE_PCDOUBLE_V2DF_UQI: - case V16SF_FTYPE_PCFLOAT_V16SF_UHI: - case V8SF_FTYPE_PCFLOAT_V8SF_UQI: - case V4SF_FTYPE_PCFLOAT_V4SF_UQI: - nargs = 3; - klass = load; - memory = 0; - break; - case VOID_FTYPE_UINT_UINT_UINT: - case VOID_FTYPE_UINT64_UINT_UINT: - case UCHAR_FTYPE_UINT_UINT_UINT: - case UCHAR_FTYPE_UINT64_UINT_UINT: - nargs = 3; - klass = load; - memory = ARRAY_SIZE (args); - last_arg_constant = true; - break; - default: - gcc_unreachable (); - } - - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (klass == store) - { - arg = CALL_EXPR_ARG (exp, 0); - op = expand_normal (arg); - gcc_assert (target == 0); - if (memory) - { - op = ix86_zero_extend_to_Pmode (op); - target = gen_rtx_MEM (tmode, op); - /* target at this point has just BITS_PER_UNIT MEM_ALIGN - on it. Try to improve it using get_pointer_alignment, - and if the special builtin is one that requires strict - mode alignment, also from it's GET_MODE_ALIGNMENT. - Failure to do so could lead to ix86_legitimate_combined_insn - rejecting all changes to such insns. */ - unsigned int align = get_pointer_alignment (arg); - if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode)) - align = GET_MODE_ALIGNMENT (tmode); - if (MEM_ALIGN (target) < align) - set_mem_align (target, align); - } - else - target = force_reg (tmode, op); - arg_adjust = 1; - } - else - { - arg_adjust = 0; - if (optimize - || target == 0 - || !register_operand (target, tmode) - || GET_MODE (target) != tmode) - target = gen_reg_rtx (tmode); - } - - for (i = 0; i < nargs; i++) - { - machine_mode mode = insn_p->operand[i + 1].mode; - bool match; - - arg = CALL_EXPR_ARG (exp, i + arg_adjust); - op = expand_normal (arg); - match = insn_p->operand[i + 1].predicate (op, mode); - - if (last_arg_constant && (i + 1) == nargs) - { - if (!match) - { - if (icode == CODE_FOR_lwp_lwpvalsi3 - || icode == CODE_FOR_lwp_lwpinssi3 - || icode == CODE_FOR_lwp_lwpvaldi3 - || icode == CODE_FOR_lwp_lwpinsdi3) - error ("the last argument must be a 32-bit immediate"); - else - error ("the last argument must be an 8-bit immediate"); - return const0_rtx; - } - } - else - { - if (i == memory) - { - /* This must be the memory operand. 
*/ - op = ix86_zero_extend_to_Pmode (op); - op = gen_rtx_MEM (mode, op); - /* op at this point has just BITS_PER_UNIT MEM_ALIGN - on it. Try to improve it using get_pointer_alignment, - and if the special builtin is one that requires strict - mode alignment, also from it's GET_MODE_ALIGNMENT. - Failure to do so could lead to ix86_legitimate_combined_insn - rejecting all changes to such insns. */ - unsigned int align = get_pointer_alignment (arg); - if (aligned_mem && align < GET_MODE_ALIGNMENT (mode)) - align = GET_MODE_ALIGNMENT (mode); - if (MEM_ALIGN (op) < align) - set_mem_align (op, align); - } - else - { - /* This must be register. */ - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - op = copy_to_mode_reg (mode, op); - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 0: - pat = GEN_FCN (icode) (target); - break; - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); - break; - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - emit_insn (pat); - return klass == store ? 0 : target; -} - -/* Return the integer constant in ARG. Constrain it to be in the range - of the subparts of VEC_TYPE; issue an error if not. */ - -static int -get_element_number (tree vec_type, tree arg) -{ - unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; - - if (!tree_fits_uhwi_p (arg) - || (elt = tree_to_uhwi (arg), elt > max)) - { - error ("selector must be an integer constant in the range 0..%wi", max); - return 0; - } - - return elt; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_init. We DO have language-level syntax for this, in - the form of (type){ init-list }. Except that since we can't place emms - instructions from inside the compiler, we can't allow the use of MMX - registers unless the user explicitly asks for it. So we do *not* define - vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead - we have builtins invoked by mmintrin.h that gives us license to emit - these sorts of instructions. */ - -static rtx -ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) -{ - machine_mode tmode = TYPE_MODE (type); - machine_mode inner_mode = GET_MODE_INNER (tmode); - int i, n_elt = GET_MODE_NUNITS (tmode); - rtvec v = rtvec_alloc (n_elt); - - gcc_assert (VECTOR_MODE_P (tmode)); - gcc_assert (call_expr_nargs (exp) == n_elt); - - for (i = 0; i < n_elt; ++i) - { - rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); - RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); - } - - if (!target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); - return target; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_extract. They would be redundant (for non-MMX) if we - had a language-level syntax for referencing vector elements. 
*/ - -static rtx -ix86_expand_vec_ext_builtin (tree exp, rtx target) -{ - machine_mode tmode, mode0; - tree arg0, arg1; - int elt; - rtx op0; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - - op0 = expand_normal (arg0); - elt = get_element_number (TREE_TYPE (arg0), arg1); - - tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - mode0 = TYPE_MODE (TREE_TYPE (arg0)); - gcc_assert (VECTOR_MODE_P (mode0)); - - op0 = force_reg (mode0, op0); - - if (optimize || !target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - ix86_expand_vector_extract (true, target, op0, elt); - - return target; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_set. They would be redundant (for non-MMX) if we had - a language-level syntax for referencing vector elements. */ - -static rtx -ix86_expand_vec_set_builtin (tree exp) -{ - machine_mode tmode, mode1; - tree arg0, arg1, arg2; - int elt; - rtx op0, op1, target; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - - tmode = TYPE_MODE (TREE_TYPE (arg0)); - mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - gcc_assert (VECTOR_MODE_P (tmode)); - - op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); - op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); - elt = get_element_number (TREE_TYPE (arg0), arg2); - - if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) - op1 = convert_modes (mode1, GET_MODE (op1), op1, true); - - op0 = force_reg (tmode, op0); - op1 = force_reg (mode1, op1); - - /* OP0 is the source of these builtin functions and shouldn't be - modified. Create a copy, use it and return it as target. */ - target = gen_reg_rtx (tmode); - emit_move_insn (target, op0); - ix86_expand_vector_set (true, target, op1, elt); - - return target; -} - -/* Expand an expression EXP that calls a built-in function, - with result going to TARGET if that's convenient - (and in mode MODE if that's convenient). - SUBTARGET may be used as the target for computing one of EXP's operands. - IGNORE is nonzero if the value is to be ignored. */ - -static rtx -ix86_expand_builtin (tree exp, rtx target, rtx subtarget, - machine_mode mode, int ignore) -{ - size_t i; - enum insn_code icode, icode2; - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - tree arg0, arg1, arg2, arg3, arg4; - rtx op0, op1, op2, op3, op4, pat, pat2, insn; - machine_mode mode0, mode1, mode2, mode3, mode4; - unsigned int fcode = DECL_FUNCTION_CODE (fndecl); - - /* For CPU builtins that can be folded, fold first and expand the fold. */ - switch (fcode) - { - case IX86_BUILTIN_CPU_INIT: - { - /* Make it call __cpu_indicator_init in libgcc. */ - tree call_expr, fndecl, type; - type = build_function_type_list (integer_type_node, NULL_TREE); - fndecl = build_fn_decl ("__cpu_indicator_init", type); - call_expr = build_call_expr (fndecl, 0); - return expand_expr (call_expr, target, mode, EXPAND_NORMAL); - } - case IX86_BUILTIN_CPU_IS: - case IX86_BUILTIN_CPU_SUPPORTS: - { - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree fold_expr = fold_builtin_cpu (fndecl, &arg0); - gcc_assert (fold_expr != NULL_TREE); - return expand_expr (fold_expr, target, mode, EXPAND_NORMAL); - } - } - - HOST_WIDE_INT isa = ix86_isa_flags; - HOST_WIDE_INT isa2 = ix86_isa_flags2; - HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa; - HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2; - /* The general case is we require all the ISAs specified in bisa{,2} - to be enabled. 
- The exceptions are: - OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 - OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 - where for each this pair it is sufficient if either of the ISAs is - enabled, plus if it is ored with other options also those others. */ - if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) - == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) - && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0) - isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A); - if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) - == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) - && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0) - isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32); - if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) - == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) - && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0) - isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4); - if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2) - { - bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT; - if (TARGET_ABI_X32) - bisa |= OPTION_MASK_ABI_X32; - else - bisa |= OPTION_MASK_ABI_64; - char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL, - (enum fpmath_unit) 0, false, add_abi_p); - if (!opts) - error ("%qE needs unknown isa option", fndecl); - else - { - gcc_assert (opts != NULL); - error ("%qE needs isa option %s", fndecl, opts); - free (opts); - } - return expand_call (exp, target, ignore); - } - - switch (fcode) - { - case IX86_BUILTIN_MASKMOVQ: - case IX86_BUILTIN_MASKMOVDQU: - icode = (fcode == IX86_BUILTIN_MASKMOVQ - ? CODE_FOR_mmx_maskmovq - : CODE_FOR_sse2_maskmovdqu); - /* Note the arg order is different from the operand order. */ - arg1 = CALL_EXPR_ARG (exp, 0); - arg2 = CALL_EXPR_ARG (exp, 1); - arg0 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[2].mode; - - op0 = ix86_zero_extend_to_Pmode (op0); - op0 = gen_rtx_MEM (mode1, op0); - - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - if (!insn_data[icode].operand[2].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - pat = GEN_FCN (icode) (op0, op1, op2); - if (! 
pat) - return 0; - emit_insn (pat); - return 0; - - case IX86_BUILTIN_LDMXCSR: - op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); - target = assign_386_stack_local (SImode, SLOT_TEMP); - emit_move_insn (target, op0); - emit_insn (gen_sse_ldmxcsr (target)); - return 0; - - case IX86_BUILTIN_STMXCSR: - target = assign_386_stack_local (SImode, SLOT_TEMP); - emit_insn (gen_sse_stmxcsr (target)); - return copy_to_mode_reg (SImode, target); - - case IX86_BUILTIN_CLFLUSH: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_sse2_clflush; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_sse2_clflush (op0)); - return 0; - - case IX86_BUILTIN_CLWB: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_clwb; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_clwb (op0)); - return 0; - - case IX86_BUILTIN_CLFLUSHOPT: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_clflushopt; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_clflushopt (op0)); - return 0; - - case IX86_BUILTIN_MONITOR: - case IX86_BUILTIN_MONITORX: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - if (!REG_P (op0)) - op0 = ix86_zero_extend_to_Pmode (op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - if (!REG_P (op2)) - op2 = copy_to_mode_reg (SImode, op2); - - emit_insn (fcode == IX86_BUILTIN_MONITOR - ? ix86_gen_monitor (op0, op1, op2) - : ix86_gen_monitorx (op0, op1, op2)); - return 0; - - case IX86_BUILTIN_MWAIT: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - emit_insn (gen_sse3_mwait (op0, op1)); - return 0; - - case IX86_BUILTIN_MWAITX: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - if (!REG_P (op2)) - op2 = copy_to_mode_reg (SImode, op2); - emit_insn (gen_mwaitx (op0, op1, op2)); - return 0; - - case IX86_BUILTIN_UMONITOR: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - - op0 = ix86_zero_extend_to_Pmode (op0); - - insn = (TARGET_64BIT - ? 
gen_umonitor_di (op0) - : gen_umonitor_si (op0)); - - emit_insn (insn); - return 0; - - case IX86_BUILTIN_UMWAIT: - case IX86_BUILTIN_TPAUSE: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - switch (fcode) - { - case IX86_BUILTIN_UMWAIT: - icode = CODE_FOR_umwait_rex64; - break; - case IX86_BUILTIN_TPAUSE: - icode = CODE_FOR_tpause_rex64; - break; - default: - gcc_unreachable (); - } - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - switch (fcode) - { - case IX86_BUILTIN_UMWAIT: - icode = CODE_FOR_umwait; - break; - case IX86_BUILTIN_TPAUSE: - icode = CODE_FOR_tpause; - break; - default: - gcc_unreachable (); - } - pat = GEN_FCN (icode) (op0, op1); - } - - if (!pat) - return 0; - - emit_insn (pat); - - if (target == 0 - || !register_operand (target, QImode)) - target = gen_reg_rtx (QImode); - - pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (target, pat)); - - return target; - - case IX86_BUILTIN_CLZERO: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - if (!REG_P (op0)) - op0 = ix86_zero_extend_to_Pmode (op0); - emit_insn (ix86_gen_clzero (op0)); - return 0; - - case IX86_BUILTIN_CLDEMOTE: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_cldemote; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_cldemote (op0)); - return 0; - - case IX86_BUILTIN_VEC_INIT_V2SI: - case IX86_BUILTIN_VEC_INIT_V4HI: - case IX86_BUILTIN_VEC_INIT_V8QI: - return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); - - case IX86_BUILTIN_VEC_EXT_V2DF: - case IX86_BUILTIN_VEC_EXT_V2DI: - case IX86_BUILTIN_VEC_EXT_V4SF: - case IX86_BUILTIN_VEC_EXT_V4SI: - case IX86_BUILTIN_VEC_EXT_V8HI: - case IX86_BUILTIN_VEC_EXT_V2SI: - case IX86_BUILTIN_VEC_EXT_V4HI: - case IX86_BUILTIN_VEC_EXT_V16QI: - return ix86_expand_vec_ext_builtin (exp, target); - - case IX86_BUILTIN_VEC_SET_V2DI: - case IX86_BUILTIN_VEC_SET_V4SF: - case IX86_BUILTIN_VEC_SET_V4SI: - case IX86_BUILTIN_VEC_SET_V8HI: - case IX86_BUILTIN_VEC_SET_V4HI: - case IX86_BUILTIN_VEC_SET_V16QI: - return ix86_expand_vec_set_builtin (exp); - - case IX86_BUILTIN_NANQ: - case IX86_BUILTIN_NANSQ: - return expand_call (exp, target, ignore); - - case IX86_BUILTIN_RDPID: - - op0 = gen_reg_rtx (word_mode); - - if (TARGET_64BIT) - { - insn = gen_rdpid_rex64 (op0); - op0 = convert_to_mode (SImode, op0, 1); - } - else - insn = gen_rdpid (op0); - - emit_insn (insn); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - emit_move_insn (target, op0); - return target; - - case IX86_BUILTIN_RDPMC: - case IX86_BUILTIN_RDTSC: - case IX86_BUILTIN_RDTSCP: - case IX86_BUILTIN_XGETBV: - - op0 = gen_reg_rtx (DImode); - op1 = gen_reg_rtx (DImode); - - if (fcode == IX86_BUILTIN_RDPMC) - { - arg0 = CALL_EXPR_ARG (exp, 0); - op2 = expand_normal (arg0); - if (!register_operand (op2, SImode)) - op2 = copy_to_mode_reg (SImode, op2); - - insn = (TARGET_64BIT - ? 
gen_rdpmc_rex64 (op0, op1, op2) - : gen_rdpmc (op0, op2)); - emit_insn (insn); - } - else if (fcode == IX86_BUILTIN_XGETBV) - { - arg0 = CALL_EXPR_ARG (exp, 0); - op2 = expand_normal (arg0); - if (!register_operand (op2, SImode)) - op2 = copy_to_mode_reg (SImode, op2); - - insn = (TARGET_64BIT - ? gen_xgetbv_rex64 (op0, op1, op2) - : gen_xgetbv (op0, op2)); - emit_insn (insn); - } - else if (fcode == IX86_BUILTIN_RDTSC) - { - insn = (TARGET_64BIT - ? gen_rdtsc_rex64 (op0, op1) - : gen_rdtsc (op0)); - emit_insn (insn); - } - else - { - op2 = gen_reg_rtx (SImode); - - insn = (TARGET_64BIT - ? gen_rdtscp_rex64 (op0, op1, op2) - : gen_rdtscp (op0, op2)); - emit_insn (insn); - - arg0 = CALL_EXPR_ARG (exp, 0); - op4 = expand_normal (arg0); - if (!address_operand (op4, VOIDmode)) - { - op4 = convert_memory_address (Pmode, op4); - op4 = copy_addr_to_reg (op4); - } - emit_move_insn (gen_rtx_MEM (SImode, op4), op2); - } - - if (target == 0 - || !register_operand (target, DImode)) - target = gen_reg_rtx (DImode); - - if (TARGET_64BIT) - { - op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32), - op1, 1, OPTAB_DIRECT); - op0 = expand_simple_binop (DImode, IOR, op0, op1, - op0, 1, OPTAB_DIRECT); - } - - emit_move_insn (target, op0); - return target; - - case IX86_BUILTIN_MOVDIR64B: - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - op0 = ix86_zero_extend_to_Pmode (op0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - op1 = gen_rtx_MEM (XImode, op1); - - insn = (TARGET_64BIT - ? gen_movdir64b_di (op0, op1) - : gen_movdir64b_si (op0, op1)); - emit_insn (insn); - return 0; - - case IX86_BUILTIN_FXSAVE: - case IX86_BUILTIN_FXRSTOR: - case IX86_BUILTIN_FXSAVE64: - case IX86_BUILTIN_FXRSTOR64: - case IX86_BUILTIN_FNSTENV: - case IX86_BUILTIN_FLDENV: - mode0 = BLKmode; - switch (fcode) - { - case IX86_BUILTIN_FXSAVE: - icode = CODE_FOR_fxsave; - break; - case IX86_BUILTIN_FXRSTOR: - icode = CODE_FOR_fxrstor; - break; - case IX86_BUILTIN_FXSAVE64: - icode = CODE_FOR_fxsave64; - break; - case IX86_BUILTIN_FXRSTOR64: - icode = CODE_FOR_fxrstor64; - break; - case IX86_BUILTIN_FNSTENV: - icode = CODE_FOR_fnstenv; - break; - case IX86_BUILTIN_FLDENV: - icode = CODE_FOR_fldenv; - break; - default: - gcc_unreachable (); - } - - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - - if (!address_operand (op0, VOIDmode)) - { - op0 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op0); - } - op0 = gen_rtx_MEM (mode0, op0); - - pat = GEN_FCN (icode) (op0); - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_XSETBV: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - - icode = CODE_FOR_xsetbv_rex64; - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - icode = CODE_FOR_xsetbv; - - pat = GEN_FCN (icode) (op0, op1); - } - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_XSAVE: - case IX86_BUILTIN_XRSTOR: - case IX86_BUILTIN_XSAVE64: - case IX86_BUILTIN_XRSTOR64: - case IX86_BUILTIN_XSAVEOPT: - case IX86_BUILTIN_XSAVEOPT64: - case 
IX86_BUILTIN_XSAVES: - case IX86_BUILTIN_XRSTORS: - case IX86_BUILTIN_XSAVES64: - case IX86_BUILTIN_XRSTORS64: - case IX86_BUILTIN_XSAVEC: - case IX86_BUILTIN_XSAVEC64: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!address_operand (op0, VOIDmode)) - { - op0 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op0); - } - op0 = gen_rtx_MEM (BLKmode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - switch (fcode) - { - case IX86_BUILTIN_XSAVE: - icode = CODE_FOR_xsave_rex64; - break; - case IX86_BUILTIN_XRSTOR: - icode = CODE_FOR_xrstor_rex64; - break; - case IX86_BUILTIN_XSAVE64: - icode = CODE_FOR_xsave64; - break; - case IX86_BUILTIN_XRSTOR64: - icode = CODE_FOR_xrstor64; - break; - case IX86_BUILTIN_XSAVEOPT: - icode = CODE_FOR_xsaveopt_rex64; - break; - case IX86_BUILTIN_XSAVEOPT64: - icode = CODE_FOR_xsaveopt64; - break; - case IX86_BUILTIN_XSAVES: - icode = CODE_FOR_xsaves_rex64; - break; - case IX86_BUILTIN_XRSTORS: - icode = CODE_FOR_xrstors_rex64; - break; - case IX86_BUILTIN_XSAVES64: - icode = CODE_FOR_xsaves64; - break; - case IX86_BUILTIN_XRSTORS64: - icode = CODE_FOR_xrstors64; - break; - case IX86_BUILTIN_XSAVEC: - icode = CODE_FOR_xsavec_rex64; - break; - case IX86_BUILTIN_XSAVEC64: - icode = CODE_FOR_xsavec64; - break; - default: - gcc_unreachable (); - } - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - switch (fcode) - { - case IX86_BUILTIN_XSAVE: - icode = CODE_FOR_xsave; - break; - case IX86_BUILTIN_XRSTOR: - icode = CODE_FOR_xrstor; - break; - case IX86_BUILTIN_XSAVEOPT: - icode = CODE_FOR_xsaveopt; - break; - case IX86_BUILTIN_XSAVES: - icode = CODE_FOR_xsaves; - break; - case IX86_BUILTIN_XRSTORS: - icode = CODE_FOR_xrstors; - break; - case IX86_BUILTIN_XSAVEC: - icode = CODE_FOR_xsavec; - break; - default: - gcc_unreachable (); - } - pat = GEN_FCN (icode) (op0, op1); - } - - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_LLWPCB: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_lwp_llwpcb; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - emit_insn (gen_lwp_llwpcb (op0)); - return 0; - - case IX86_BUILTIN_SLWPCB: - icode = CODE_FOR_lwp_slwpcb; - if (!target - || !insn_data[icode].operand[0].predicate (target, Pmode)) - target = gen_reg_rtx (Pmode); - emit_insn (gen_lwp_slwpcb (target)); - return target; - - case IX86_BUILTIN_BEXTRI32: - case IX86_BUILTIN_BEXTRI64: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - icode = (fcode == IX86_BUILTIN_BEXTRI32 - ? 
CODE_FOR_tbm_bextri_si - : CODE_FOR_tbm_bextri_di); - if (!CONST_INT_P (op1)) - { - error ("last argument must be an immediate"); - return const0_rtx; - } - else - { - unsigned char length = (INTVAL (op1) >> 8) & 0xFF; - unsigned char lsb_index = INTVAL (op1) & 0xFF; - op1 = GEN_INT (length); - op2 = GEN_INT (lsb_index); - - mode1 = insn_data[icode].operand[1].mode; - if (!insn_data[icode].operand[1].predicate (op0, mode1)) - op0 = copy_to_mode_reg (mode1, op0); - - mode0 = insn_data[icode].operand[0].mode; - if (target == 0 - || !register_operand (target, mode0)) - target = gen_reg_rtx (mode0); - - pat = GEN_FCN (icode) (target, op0, op1, op2); - if (pat) - emit_insn (pat); - return target; - } - - case IX86_BUILTIN_RDRAND16_STEP: - icode = CODE_FOR_rdrandhi_1; - mode0 = HImode; - goto rdrand_step; - - case IX86_BUILTIN_RDRAND32_STEP: - icode = CODE_FOR_rdrandsi_1; - mode0 = SImode; - goto rdrand_step; - - case IX86_BUILTIN_RDRAND64_STEP: - icode = CODE_FOR_rdranddi_1; - mode0 = DImode; - -rdrand_step: - arg0 = CALL_EXPR_ARG (exp, 0); - op1 = expand_normal (arg0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - - op0 = gen_reg_rtx (mode0); - emit_insn (GEN_FCN (icode) (op0)); - - emit_move_insn (gen_rtx_MEM (mode0, op1), op0); - - op1 = gen_reg_rtx (SImode); - emit_move_insn (op1, CONST1_RTX (SImode)); - - /* Emit SImode conditional move. */ - if (mode0 == HImode) - { - if (TARGET_ZERO_EXTEND_WITH_AND - && optimize_function_for_speed_p (cfun)) - { - op2 = force_reg (SImode, const0_rtx); - - emit_insn (gen_movstricthi - (gen_lowpart (HImode, op2), op0)); - } - else - { - op2 = gen_reg_rtx (SImode); - - emit_insn (gen_zero_extendhisi2 (op2, op0)); - } - } - else if (mode0 == SImode) - op2 = op0; - else - op2 = gen_rtx_SUBREG (SImode, op0, 0); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (target, - gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); - return target; - - case IX86_BUILTIN_RDSEED16_STEP: - icode = CODE_FOR_rdseedhi_1; - mode0 = HImode; - goto rdseed_step; - - case IX86_BUILTIN_RDSEED32_STEP: - icode = CODE_FOR_rdseedsi_1; - mode0 = SImode; - goto rdseed_step; - - case IX86_BUILTIN_RDSEED64_STEP: - icode = CODE_FOR_rdseeddi_1; - mode0 = DImode; - -rdseed_step: - arg0 = CALL_EXPR_ARG (exp, 0); - op1 = expand_normal (arg0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - - op0 = gen_reg_rtx (mode0); - emit_insn (GEN_FCN (icode) (op0)); - - emit_move_insn (gen_rtx_MEM (mode0, op1), op0); - - op2 = gen_reg_rtx (QImode); - - pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (op2, pat)); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - emit_insn (gen_zero_extendqisi2 (target, op2)); - return target; - - case IX86_BUILTIN_SBB32: - icode = CODE_FOR_subborrowsi; - icode2 = CODE_FOR_subborrowsi_0; - mode0 = SImode; - mode1 = DImode; - mode2 = CCmode; - goto handlecarry; - - case IX86_BUILTIN_SBB64: - icode = CODE_FOR_subborrowdi; - icode2 = CODE_FOR_subborrowdi_0; - mode0 = DImode; - mode1 = TImode; - mode2 = CCmode; - goto handlecarry; - - case IX86_BUILTIN_ADDCARRYX32: - icode = CODE_FOR_addcarrysi; - icode2 = CODE_FOR_addcarrysi_0; - mode0 = SImode; - mode1 = DImode; - mode2 = 
CCCmode; - goto handlecarry; - - case IX86_BUILTIN_ADDCARRYX64: - icode = CODE_FOR_addcarrydi; - icode2 = CODE_FOR_addcarrydi_0; - mode0 = DImode; - mode1 = TImode; - mode2 = CCCmode; - - handlecarry: - arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ - arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */ - arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */ - arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ - - op1 = expand_normal (arg0); - if (!integer_zerop (arg0)) - op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); - - op2 = expand_normal (arg1); - if (!register_operand (op2, mode0)) - op2 = copy_to_mode_reg (mode0, op2); - - op3 = expand_normal (arg2); - if (!register_operand (op3, mode0)) - op3 = copy_to_mode_reg (mode0, op3); - - op4 = expand_normal (arg3); - if (!address_operand (op4, VOIDmode)) - { - op4 = convert_memory_address (Pmode, op4); - op4 = copy_addr_to_reg (op4); - } - - op0 = gen_reg_rtx (mode0); - if (integer_zerop (arg0)) - { - /* If arg0 is 0, optimize right away into add or sub - instruction that sets CCCmode flags. */ - op1 = gen_rtx_REG (mode2, FLAGS_REG); - emit_insn (GEN_FCN (icode2) (op0, op2, op3)); - } - else - { - /* Generate CF from input operand. */ - emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); - - /* Generate instruction that consumes CF. */ - op1 = gen_rtx_REG (CCCmode, FLAGS_REG); - pat = gen_rtx_LTU (mode1, op1, const0_rtx); - pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); - emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); - } - - /* Return current CF value. */ - if (target == 0) - target = gen_reg_rtx (QImode); - - pat = gen_rtx_LTU (QImode, op1, const0_rtx); - emit_insn (gen_rtx_SET (target, pat)); - - /* Store the result. */ - emit_move_insn (gen_rtx_MEM (mode0, op4), op0); - - return target; - - case IX86_BUILTIN_READ_FLAGS: - emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); - - if (optimize - || target == NULL_RTX - || !nonimmediate_operand (target, word_mode) - || GET_MODE (target) != word_mode) - target = gen_reg_rtx (word_mode); - - emit_insn (gen_pop (target)); - return target; - - case IX86_BUILTIN_WRITE_FLAGS: - - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - if (!general_no_elim_operand (op0, word_mode)) - op0 = copy_to_mode_reg (word_mode, op0); - - emit_insn (gen_push (op0)); - emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); - return 0; - - case IX86_BUILTIN_KTESTC8: - icode = CODE_FOR_ktestqi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ8: - icode = CODE_FOR_ktestqi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC16: - icode = CODE_FOR_ktesthi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ16: - icode = CODE_FOR_ktesthi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC32: - icode = CODE_FOR_ktestsi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ32: - icode = CODE_FOR_ktestsi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC64: - icode = CODE_FOR_ktestdi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ64: - icode = CODE_FOR_ktestdi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC8: - icode = CODE_FOR_kortestqi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ8: - icode = CODE_FOR_kortestqi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC16: - icode = CODE_FOR_kortesthi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ16: - icode = CODE_FOR_kortesthi; - 
mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC32: - icode = CODE_FOR_kortestsi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ32: - icode = CODE_FOR_kortestsi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC64: - icode = CODE_FOR_kortestdi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ64: - icode = CODE_FOR_kortestdi; - mode3 = CCZmode; - - kortest: - arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ - arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */ - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - - if (GET_MODE (op0) != VOIDmode) - op0 = force_reg (GET_MODE (op0), op0); - - op0 = gen_lowpart (mode0, op0); - - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - if (GET_MODE (op1) != VOIDmode) - op1 = force_reg (GET_MODE (op1), op1); - - op1 = gen_lowpart (mode1, op1); - - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - target = gen_reg_rtx (QImode); - - /* Emit kortest. */ - emit_insn (GEN_FCN (icode) (op0, op1)); - /* And use setcc to return result from flags. */ - ix86_expand_setcc (target, EQ, - gen_rtx_REG (mode3, FLAGS_REG), const0_rtx); - return target; - - case IX86_BUILTIN_GATHERSIV2DF: - icode = CODE_FOR_avx2_gathersiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4DF: - icode = CODE_FOR_avx2_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV2DF: - icode = CODE_FOR_avx2_gatherdiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4DF: - icode = CODE_FOR_avx2_gatherdiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4SF: - icode = CODE_FOR_avx2_gathersiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV8SF: - icode = CODE_FOR_avx2_gathersiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4SF: - icode = CODE_FOR_avx2_gatherdiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV8SF: - icode = CODE_FOR_avx2_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV2DI: - icode = CODE_FOR_avx2_gathersiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4DI: - icode = CODE_FOR_avx2_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV2DI: - icode = CODE_FOR_avx2_gatherdiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4DI: - icode = CODE_FOR_avx2_gatherdiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4SI: - icode = CODE_FOR_avx2_gathersiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV8SI: - icode = CODE_FOR_avx2_gathersiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4SI: - icode = CODE_FOR_avx2_gatherdiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV8SI: - icode = CODE_FOR_avx2_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHERALTSIV4DF: - icode = CODE_FOR_avx2_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERALTDIV8SF: - icode = CODE_FOR_avx2_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERALTSIV4DI: - icode = CODE_FOR_avx2_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERALTDIV8SI: - icode = CODE_FOR_avx2_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV16SF: - icode = CODE_FOR_avx512f_gathersiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8DF: - icode = CODE_FOR_avx512f_gathersiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV16SF: - icode = CODE_FOR_avx512f_gatherdiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8DF: - icode = 
CODE_FOR_avx512f_gatherdiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV16SI: - icode = CODE_FOR_avx512f_gathersiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8DI: - icode = CODE_FOR_avx512f_gathersiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV16SI: - icode = CODE_FOR_avx512f_gatherdiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8DI: - icode = CODE_FOR_avx512f_gatherdiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV8DF: - icode = CODE_FOR_avx512f_gathersiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV16SF: - icode = CODE_FOR_avx512f_gatherdiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV8DI: - icode = CODE_FOR_avx512f_gathersiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV16SI: - icode = CODE_FOR_avx512f_gatherdiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV2DF: - icode = CODE_FOR_avx512vl_gathersiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4DF: - icode = CODE_FOR_avx512vl_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV2DF: - icode = CODE_FOR_avx512vl_gatherdiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4DF: - icode = CODE_FOR_avx512vl_gatherdiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4SF: - icode = CODE_FOR_avx512vl_gathersiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8SF: - icode = CODE_FOR_avx512vl_gathersiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4SF: - icode = CODE_FOR_avx512vl_gatherdiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8SF: - icode = CODE_FOR_avx512vl_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV2DI: - icode = CODE_FOR_avx512vl_gathersiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4DI: - icode = CODE_FOR_avx512vl_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV2DI: - icode = CODE_FOR_avx512vl_gatherdiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4DI: - icode = CODE_FOR_avx512vl_gatherdiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4SI: - icode = CODE_FOR_avx512vl_gathersiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8SI: - icode = CODE_FOR_avx512vl_gathersiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4SI: - icode = CODE_FOR_avx512vl_gatherdiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8SI: - icode = CODE_FOR_avx512vl_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV4DF: - icode = CODE_FOR_avx512vl_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV8SF: - icode = CODE_FOR_avx512vl_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV4DI: - icode = CODE_FOR_avx512vl_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV8SI: - icode = CODE_FOR_avx512vl_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_SCATTERSIV16SF: - icode = CODE_FOR_avx512f_scattersiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8DF: - icode = CODE_FOR_avx512f_scattersiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV16SF: - icode = CODE_FOR_avx512f_scatterdiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8DF: - icode = CODE_FOR_avx512f_scatterdiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV16SI: - icode = CODE_FOR_avx512f_scattersiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8DI: - icode = CODE_FOR_avx512f_scattersiv8di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV16SI: - icode = CODE_FOR_avx512f_scatterdiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8DI: - icode = CODE_FOR_avx512f_scatterdiv8di; - goto scatter_gen; - case 
IX86_BUILTIN_SCATTERSIV8SF: - icode = CODE_FOR_avx512vl_scattersiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4SF: - icode = CODE_FOR_avx512vl_scattersiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4DF: - icode = CODE_FOR_avx512vl_scattersiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV2DF: - icode = CODE_FOR_avx512vl_scattersiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8SF: - icode = CODE_FOR_avx512vl_scatterdiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4SF: - icode = CODE_FOR_avx512vl_scatterdiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4DF: - icode = CODE_FOR_avx512vl_scatterdiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV2DF: - icode = CODE_FOR_avx512vl_scatterdiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8SI: - icode = CODE_FOR_avx512vl_scattersiv8si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4SI: - icode = CODE_FOR_avx512vl_scattersiv4si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4DI: - icode = CODE_FOR_avx512vl_scattersiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV2DI: - icode = CODE_FOR_avx512vl_scattersiv2di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8SI: - icode = CODE_FOR_avx512vl_scatterdiv8si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4SI: - icode = CODE_FOR_avx512vl_scatterdiv4si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4DI: - icode = CODE_FOR_avx512vl_scatterdiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV2DI: - icode = CODE_FOR_avx512vl_scatterdiv2di; - goto scatter_gen; - case IX86_BUILTIN_GATHERPFDPD: - icode = CODE_FOR_avx512pf_gatherpfv8sidf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERALTSIV8DF: - icode = CODE_FOR_avx512f_scattersiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV16SF: - icode = CODE_FOR_avx512f_scatterdiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV8DI: - icode = CODE_FOR_avx512f_scattersiv8di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV16SI: - icode = CODE_FOR_avx512f_scatterdiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV4DF: - icode = CODE_FOR_avx512vl_scattersiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV8SF: - icode = CODE_FOR_avx512vl_scatterdiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV4DI: - icode = CODE_FOR_avx512vl_scattersiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV8SI: - icode = CODE_FOR_avx512vl_scatterdiv8si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV2DF: - icode = CODE_FOR_avx512vl_scattersiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV4SF: - icode = CODE_FOR_avx512vl_scatterdiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV2DI: - icode = CODE_FOR_avx512vl_scattersiv2di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV4SI: - icode = CODE_FOR_avx512vl_scatterdiv4si; - goto scatter_gen; - case IX86_BUILTIN_GATHERPFDPS: - icode = CODE_FOR_avx512pf_gatherpfv16sisf; - goto vec_prefetch_gen; - case IX86_BUILTIN_GATHERPFQPD: - icode = CODE_FOR_avx512pf_gatherpfv8didf; - goto vec_prefetch_gen; - case IX86_BUILTIN_GATHERPFQPS: - icode = CODE_FOR_avx512pf_gatherpfv8disf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFDPD: - icode = CODE_FOR_avx512pf_scatterpfv8sidf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFDPS: - icode = CODE_FOR_avx512pf_scatterpfv16sisf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFQPD: - icode = CODE_FOR_avx512pf_scatterpfv8didf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFQPS: - icode = 
CODE_FOR_avx512pf_scatterpfv8disf; - goto vec_prefetch_gen; - - gather_gen: - rtx half; - rtx (*gen) (rtx, rtx); - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - /* Note the arg order is different from the operand order. */ - mode0 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[3].mode; - mode3 = insn_data[icode].operand[4].mode; - mode4 = insn_data[icode].operand[5].mode; - - if (target == NULL_RTX - || GET_MODE (target) != insn_data[icode].operand[0].mode - || !insn_data[icode].operand[0].predicate (target, - GET_MODE (target))) - subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); - else - subtarget = target; - - switch (fcode) - { - case IX86_BUILTIN_GATHER3ALTSIV8DF: - case IX86_BUILTIN_GATHER3ALTSIV8DI: - half = gen_reg_rtx (V8SImode); - if (!nonimmediate_operand (op2, V16SImode)) - op2 = copy_to_mode_reg (V16SImode, op2); - emit_insn (gen_vec_extract_lo_v16si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_GATHER3ALTSIV4DF: - case IX86_BUILTIN_GATHER3ALTSIV4DI: - case IX86_BUILTIN_GATHERALTSIV4DF: - case IX86_BUILTIN_GATHERALTSIV4DI: - half = gen_reg_rtx (V4SImode); - if (!nonimmediate_operand (op2, V8SImode)) - op2 = copy_to_mode_reg (V8SImode, op2); - emit_insn (gen_vec_extract_lo_v8si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_GATHER3ALTDIV16SF: - case IX86_BUILTIN_GATHER3ALTDIV16SI: - half = gen_reg_rtx (mode0); - if (mode0 == V8SFmode) - gen = gen_vec_extract_lo_v16sf; - else - gen = gen_vec_extract_lo_v16si; - if (!nonimmediate_operand (op0, GET_MODE (op0))) - op0 = copy_to_mode_reg (GET_MODE (op0), op0); - emit_insn (gen (half, op0)); - op0 = half; - op3 = lowpart_subreg (QImode, op3, HImode); - break; - case IX86_BUILTIN_GATHER3ALTDIV8SF: - case IX86_BUILTIN_GATHER3ALTDIV8SI: - case IX86_BUILTIN_GATHERALTDIV8SF: - case IX86_BUILTIN_GATHERALTDIV8SI: - half = gen_reg_rtx (mode0); - if (mode0 == V4SFmode) - gen = gen_vec_extract_lo_v8sf; - else - gen = gen_vec_extract_lo_v8si; - if (!nonimmediate_operand (op0, GET_MODE (op0))) - op0 = copy_to_mode_reg (GET_MODE (op0), op0); - emit_insn (gen (half, op0)); - op0 = half; - if (VECTOR_MODE_P (GET_MODE (op3))) - { - half = gen_reg_rtx (mode0); - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - } - break; - default: - break; - } - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. 
*/ - op1 = ix86_zero_extend_to_Pmode (op1); - - if (!insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[2].predicate (op1, Pmode)) - op1 = copy_to_mode_reg (Pmode, op1); - if (!insn_data[icode].operand[3].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - - op3 = fixup_modeless_constant (op3, mode3); - - if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) - { - if (!insn_data[icode].operand[4].predicate (op3, mode3)) - op3 = copy_to_mode_reg (mode3, op3); - } - else - { - op3 = copy_to_reg (op3); - op3 = lowpart_subreg (mode3, op3, GET_MODE (op3)); - } - if (!insn_data[icode].operand[5].predicate (op4, mode4)) - { - error ("the last argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - /* Optimize. If mask is known to have all high bits set, - replace op0 with pc_rtx to signal that the instruction - overwrites the whole destination and doesn't use its - previous contents. */ - if (optimize) - { - if (TREE_CODE (arg3) == INTEGER_CST) - { - if (integer_all_onesp (arg3)) - op0 = pc_rtx; - } - else if (TREE_CODE (arg3) == VECTOR_CST) - { - unsigned int negative = 0; - for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) - { - tree cst = VECTOR_CST_ELT (arg3, i); - if (TREE_CODE (cst) == INTEGER_CST - && tree_int_cst_sign_bit (cst)) - negative++; - else if (TREE_CODE (cst) == REAL_CST - && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) - negative++; - } - if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) - op0 = pc_rtx; - } - else if (TREE_CODE (arg3) == SSA_NAME - && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) - { - /* Recognize also when mask is like: - __v2df src = _mm_setzero_pd (); - __v2df mask = _mm_cmpeq_pd (src, src); - or - __v8sf src = _mm256_setzero_ps (); - __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); - as that is a cheaper way to load all ones into - a register than having to load a constant from - memory. */ - gimple *def_stmt = SSA_NAME_DEF_STMT (arg3); - if (is_gimple_call (def_stmt)) - { - tree fndecl = gimple_call_fndecl (def_stmt); - if (fndecl - && fndecl_built_in_p (fndecl, BUILT_IN_MD)) - switch ((unsigned int) DECL_FUNCTION_CODE (fndecl)) - { - case IX86_BUILTIN_CMPPD: - case IX86_BUILTIN_CMPPS: - case IX86_BUILTIN_CMPPD256: - case IX86_BUILTIN_CMPPS256: - if (!integer_zerop (gimple_call_arg (def_stmt, 2))) - break; - /* FALLTHRU */ - case IX86_BUILTIN_CMPEQPD: - case IX86_BUILTIN_CMPEQPS: - if (initializer_zerop (gimple_call_arg (def_stmt, 0)) - && initializer_zerop (gimple_call_arg (def_stmt, - 1))) - op0 = pc_rtx; - break; - default: - break; - } - } - } - } - - pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); - if (! 
pat) - return const0_rtx; - emit_insn (pat); - - switch (fcode) - { - case IX86_BUILTIN_GATHER3DIV16SF: - if (target == NULL_RTX) - target = gen_reg_rtx (V8SFmode); - emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV16SI: - if (target == NULL_RTX) - target = gen_reg_rtx (V8SImode); - emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV8SF: - case IX86_BUILTIN_GATHERDIV8SF: - if (target == NULL_RTX) - target = gen_reg_rtx (V4SFmode); - emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV8SI: - case IX86_BUILTIN_GATHERDIV8SI: - if (target == NULL_RTX) - target = gen_reg_rtx (V4SImode); - emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); - break; - default: - target = subtarget; - break; - } - return target; - - scatter_gen: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - mode1 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[2].mode; - mode3 = insn_data[icode].operand[3].mode; - mode4 = insn_data[icode].operand[4].mode; - - /* Scatter instruction stores operand op3 to memory with - indices from op2 and scale from op4 under writemask op1. - If index operand op2 has more elements then source operand - op3 one need to use only its low half. And vice versa. */ - switch (fcode) - { - case IX86_BUILTIN_SCATTERALTSIV8DF: - case IX86_BUILTIN_SCATTERALTSIV8DI: - half = gen_reg_rtx (V8SImode); - if (!nonimmediate_operand (op2, V16SImode)) - op2 = copy_to_mode_reg (V16SImode, op2); - emit_insn (gen_vec_extract_lo_v16si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_SCATTERALTDIV16SF: - case IX86_BUILTIN_SCATTERALTDIV16SI: - half = gen_reg_rtx (mode3); - if (mode3 == V8SFmode) - gen = gen_vec_extract_lo_v16sf; - else - gen = gen_vec_extract_lo_v16si; - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - break; - case IX86_BUILTIN_SCATTERALTSIV4DF: - case IX86_BUILTIN_SCATTERALTSIV4DI: - half = gen_reg_rtx (V4SImode); - if (!nonimmediate_operand (op2, V8SImode)) - op2 = copy_to_mode_reg (V8SImode, op2); - emit_insn (gen_vec_extract_lo_v8si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_SCATTERALTDIV8SF: - case IX86_BUILTIN_SCATTERALTDIV8SI: - half = gen_reg_rtx (mode3); - if (mode3 == V4SFmode) - gen = gen_vec_extract_lo_v8sf; - else - gen = gen_vec_extract_lo_v8si; - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - break; - case IX86_BUILTIN_SCATTERALTSIV2DF: - case IX86_BUILTIN_SCATTERALTSIV2DI: - if (!nonimmediate_operand (op2, V4SImode)) - op2 = copy_to_mode_reg (V4SImode, op2); - break; - case IX86_BUILTIN_SCATTERALTDIV4SF: - case IX86_BUILTIN_SCATTERALTDIV4SI: - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - break; - default: - break; - } - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. 
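Illustrative example (not part of the patch): the scatter_gen path above expands stores such as the AVX-512F scatter below, where the source vector corresponds to op3, the index vector to op2 and the scale to op4. The helper name is invented for illustration; build with -mavx512f.

#include <immintrin.h>

/* Hypothetical helper, for illustration only.  */
void
scatter8 (double *table, const int *idx, const double *vals)
{
  __m256i vindex = _mm256_loadu_si256 ((const __m256i *) idx);
  __m512d v = _mm512_loadu_pd (vals);
  /* Stores vals[i] to table[idx[i]] for i = 0..7; the scale must be a
     constant 1, 2, 4 or 8.  */
  _mm512_i32scatter_pd (table, vindex, v, 8);
}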
*/ - op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1)); - - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = copy_to_mode_reg (Pmode, op0); - - op1 = fixup_modeless_constant (op1, mode1); - - if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode) - { - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - } - else - { - op1 = copy_to_reg (op1); - op1 = lowpart_subreg (mode1, op1, GET_MODE (op1)); - } - - if (!insn_data[icode].operand[2].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - - if (!insn_data[icode].operand[3].predicate (op3, mode3)) - op3 = copy_to_mode_reg (mode3, op3); - - if (!insn_data[icode].operand[4].predicate (op4, mode4)) - { - error ("the last argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); - if (! pat) - return const0_rtx; - - emit_insn (pat); - return 0; - - vec_prefetch_gen: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - mode3 = insn_data[icode].operand[3].mode; - mode4 = insn_data[icode].operand[4].mode; - - op0 = fixup_modeless_constant (op0, mode0); - - if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode) - { - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - } - else - { - op0 = copy_to_reg (op0); - op0 = lowpart_subreg (mode0, op0, GET_MODE (op0)); - } - - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. */ - op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1)); - - if (!insn_data[icode].operand[2].predicate (op2, Pmode)) - op2 = copy_to_mode_reg (Pmode, op2); - - if (!insn_data[icode].operand[3].predicate (op3, mode3)) - { - error ("the forth argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - if (!insn_data[icode].operand[4].predicate (op4, mode4)) - { - error ("incorrect hint operand"); - return const0_rtx; - } - - pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); - if (! pat) - return const0_rtx; - - emit_insn (pat); - - return 0; - - case IX86_BUILTIN_XABORT: - icode = CODE_FOR_xabort; - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - mode0 = insn_data[icode].operand[0].mode; - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - { - error ("the argument to % intrinsic must " - "be an 8-bit immediate"); - return const0_rtx; - } - emit_insn (gen_xabort (op0)); - return 0; - - case IX86_BUILTIN_RSTORSSP: - case IX86_BUILTIN_CLRSSBSY: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = (fcode == IX86_BUILTIN_RSTORSSP - ? 
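Illustrative example (not part of the patch): the IX86_BUILTIN_XABORT case above insists on an 8-bit immediate, so in user code the argument of _xabort must be a compile-time constant, as in this hypothetical RTM snippet (built with -mrtm; the transaction only starts on RTM-capable hardware).

#include <immintrin.h>

/* Hypothetical helper, for illustration only.  */
void
bump (int *counter)
{
  unsigned status = _xbegin ();
  if (status == _XBEGIN_STARTED)
    {
      if (*counter < 0)
        _xabort (0x42);   /* constant imm8, as required by the check above */
      ++*counter;
      _xend ();
    }
  else
    ++*counter;           /* fallback path; a real one would take a lock */
}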
CODE_FOR_rstorssp - : CODE_FOR_clrssbsy); - if (!address_operand (op0, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op1); - } - emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); - return 0; - - case IX86_BUILTIN_WRSSD: - case IX86_BUILTIN_WRSSQ: - case IX86_BUILTIN_WRUSSD: - case IX86_BUILTIN_WRUSSQ: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - arg1 = CALL_EXPR_ARG (exp, 1); - op1 = expand_normal (arg1); - switch (fcode) - { - case IX86_BUILTIN_WRSSD: - icode = CODE_FOR_wrsssi; - mode = SImode; - break; - case IX86_BUILTIN_WRSSQ: - icode = CODE_FOR_wrssdi; - mode = DImode; - break; - case IX86_BUILTIN_WRUSSD: - icode = CODE_FOR_wrusssi; - mode = SImode; - break; - case IX86_BUILTIN_WRUSSQ: - icode = CODE_FOR_wrussdi; - mode = DImode; - break; - } - op0 = force_reg (mode, op0); - if (!address_operand (op1, VOIDmode)) - { - op2 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op2); - } - emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); - return 0; - - default: - break; - } - - if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST; - return ix86_expand_special_args_builtin (bdesc_special_args + i, exp, - target); - } - - if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST; - rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL; - rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx); - rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx); - int masked = 1; - machine_mode mode, wide_mode, nar_mode; - - nar_mode = V4SFmode; - mode = V16SFmode; - wide_mode = V64SFmode; - fcn_mask = gen_avx5124fmaddps_4fmaddps_mask; - fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz; - - switch (fcode) - { - case IX86_BUILTIN_4FMAPS: - fcn = gen_avx5124fmaddps_4fmaddps; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSD: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn = gen_avx5124vnniw_vp4dpwssd; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSDS: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn = gen_avx5124vnniw_vp4dpwssds; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4FNMAPS: - fcn = gen_avx5124fmaddps_4fnmaddps; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4FNMAPS_MASK: - fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask; - fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSD_MASK: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask; - fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSDS_MASK: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask; - fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4FMAPS_MASK: - { - tree args[4]; - rtx ops[4]; - rtx wide_reg; - rtx accum; - rtx addr; - rtx mem; - -v4fma_expand: - wide_reg = gen_reg_rtx (wide_mode); - for (i = 0; i < 4; i++) - { - args[i] = CALL_EXPR_ARG (exp, i); - ops[i] = expand_normal (args[i]); - - emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64), - ops[i]); - } - - accum = expand_normal (CALL_EXPR_ARG (exp, 4)); - accum = force_reg (mode, accum); - - addr = expand_normal (CALL_EXPR_ARG (exp, 5)); - addr = force_reg (Pmode, addr); - - mem = gen_rtx_MEM 
(nar_mode, addr); - - target = gen_reg_rtx (mode); - - emit_move_insn (target, accum); - - if (! masked) - emit_insn (fcn (target, accum, wide_reg, mem)); - else - { - rtx merge, mask; - merge = expand_normal (CALL_EXPR_ARG (exp, 6)); - - mask = expand_normal (CALL_EXPR_ARG (exp, 7)); - - if (CONST_INT_P (mask)) - mask = fixup_modeless_constant (mask, HImode); - - mask = force_reg (HImode, mask); - - if (GET_MODE (mask) != HImode) - mask = gen_rtx_SUBREG (HImode, mask, 0); - - /* If merge is 0 then we're about to emit z-masked variant. */ - if (const0_operand (merge, mode)) - emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); - /* If merge is the same as accum then emit merge-masked variant. */ - else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) - { - merge = force_reg (mode, merge); - emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); - } - /* Merge with something unknown might happen if we z-mask w/ -O0. */ - else - { - target = gen_reg_rtx (mode); - emit_move_insn (target, merge); - emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); - } - } - return target; - } - - case IX86_BUILTIN_4FNMASS: - fcn = gen_avx5124fmaddps_4fnmaddss; - masked = 0; - goto s4fma_expand; - - case IX86_BUILTIN_4FMASS: - fcn = gen_avx5124fmaddps_4fmaddss; - masked = 0; - goto s4fma_expand; - - case IX86_BUILTIN_4FNMASS_MASK: - fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask; - fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz; - goto s4fma_expand; - - case IX86_BUILTIN_4FMASS_MASK: - { - tree args[4]; - rtx ops[4]; - rtx wide_reg; - rtx accum; - rtx addr; - rtx mem; - - fcn_mask = gen_avx5124fmaddps_4fmaddss_mask; - fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz; - -s4fma_expand: - mode = V4SFmode; - wide_reg = gen_reg_rtx (V64SFmode); - for (i = 0; i < 4; i++) - { - rtx tmp; - args[i] = CALL_EXPR_ARG (exp, i); - ops[i] = expand_normal (args[i]); - - tmp = gen_reg_rtx (SFmode); - emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0)); - - emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64), - gen_rtx_SUBREG (V16SFmode, tmp, 0)); - } - - accum = expand_normal (CALL_EXPR_ARG (exp, 4)); - accum = force_reg (V4SFmode, accum); - - addr = expand_normal (CALL_EXPR_ARG (exp, 5)); - addr = force_reg (Pmode, addr); - - mem = gen_rtx_MEM (V4SFmode, addr); - - target = gen_reg_rtx (V4SFmode); - - emit_move_insn (target, accum); - - if (! masked) - emit_insn (fcn (target, accum, wide_reg, mem)); - else - { - rtx merge, mask; - merge = expand_normal (CALL_EXPR_ARG (exp, 6)); - - mask = expand_normal (CALL_EXPR_ARG (exp, 7)); - - if (CONST_INT_P (mask)) - mask = fixup_modeless_constant (mask, QImode); - - mask = force_reg (QImode, mask); - - if (GET_MODE (mask) != QImode) - mask = gen_rtx_SUBREG (QImode, mask, 0); - - /* If merge is 0 then we're about to emit z-masked variant. */ - if (const0_operand (merge, mode)) - emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); - /* If merge is the same as accum then emit merge-masked - variant. */ - else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) - { - merge = force_reg (mode, merge); - emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); - } - /* Merge with something unknown might happen if we z-mask - w/ -O0. 
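Illustrative example (not part of the patch): the distinction the expander draws above between fcn_maskz (merge operand is zero) and fcn_mask (merge equals the accumulator) is the same one exposed by the ordinary _mm512_maskz_* and _mm512_mask_* intrinsic forms, sketched here with a plain add; function names are invented for illustration, build with -mavx512f.

#include <immintrin.h>

__m512
merge_masked (__m512 src, __mmask16 k, __m512 a, __m512 b)
{
  /* Lanes with a clear mask bit keep the corresponding lane of SRC.  */
  return _mm512_mask_add_ps (src, k, a, b);
}

__m512
zero_masked (__mmask16 k, __m512 a, __m512 b)
{
  /* Lanes with a clear mask bit are zeroed.  */
  return _mm512_maskz_add_ps (k, a, b);
}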
*/ - else - { - target = gen_reg_rtx (mode); - emit_move_insn (target, merge); - emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); - } - } - return target; - } - case IX86_BUILTIN_RDPID: - return ix86_expand_special_args_builtin (bdesc_args + i, exp, - target); - case IX86_BUILTIN_FABSQ: - case IX86_BUILTIN_COPYSIGNQ: - if (!TARGET_SSE) - /* Emit a normal call if SSE isn't available. */ - return expand_call (exp, target, ignore); - /* FALLTHRU */ - default: - return ix86_expand_args_builtin (bdesc_args + i, exp, target); - } - } - - if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST - && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; - return ix86_expand_sse_comi (bdesc_comi + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST; - return ix86_expand_round_builtin (bdesc_round_args + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST - && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST; - return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST - && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST; - return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST - && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST; - const struct builtin_description *d = bdesc_multi_arg + i; - return ix86_expand_multi_arg_builtin (d->icode, exp, target, - (enum ix86_builtin_func_type) - d->flag, d->comparison); - } - - if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST - && fcode <= IX86_BUILTIN__BDESC_CET_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; - return ix86_expand_special_args_builtin (bdesc_cet + i, exp, - target); - } - - if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST - && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; - return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp, - target); - } - - gcc_unreachable (); -} - -/* This returns the target-specific builtin with code CODE if - current_function_decl has visibility on this builtin, which is checked - using isa flags. Returns NULL_TREE otherwise. */ - -static tree ix86_get_builtin (enum ix86_builtins code) -{ - struct cl_target_option *opts; - tree target_tree = NULL_TREE; - - /* Determine the isa flags of current_function_decl. */ - - if (current_function_decl) - target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl); - - if (target_tree == NULL) - target_tree = target_option_default_node; - - opts = TREE_TARGET_OPTION (target_tree); - - if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags) - || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2)) - return ix86_builtin_decl (code, true); - else - return NULL_TREE; -} - -/* Returns a function decl for a vectorized version of the combined function - with combined_fn code FN and the result vector type TYPE, or NULL_TREE - if it is not available. 
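Illustrative example (not part of the patch): ix86_get_builtin above keys a builtin's availability off the target options of current_function_decl, which is what makes per-function ISA selection like the following work even when the translation unit is compiled without -mavx2. Function names are invented for illustration.

#include <immintrin.h>

__attribute__ ((target ("avx2")))
void
add8_avx2 (float *a, const float *b)
{
  /* AVX builtins are visible here because of the target attribute.  */
  __m256 va = _mm256_loadu_ps (a);
  __m256 vb = _mm256_loadu_ps (b);
  _mm256_storeu_ps (a, _mm256_add_ps (va, vb));
}

void
add8_generic (float *a, const float *b)
{
  for (int i = 0; i < 8; i++)
    a[i] += b[i];
}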
*/ - -static tree -ix86_builtin_vectorized_function (unsigned int fn, tree type_out, - tree type_in) -{ - machine_mode in_mode, out_mode; - int in_n, out_n; - - if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) - return NULL_TREE; - - out_mode = TYPE_MODE (TREE_TYPE (type_out)); - out_n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - - switch (fn) - { - CASE_CFN_EXP2: - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_EXP2PS); - } - break; - - CASE_CFN_IFLOOR: - CASE_CFN_LFLOOR: - CASE_CFN_LLFLOOR: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512); - } - break; - - CASE_CFN_ICEIL: - CASE_CFN_LCEIL: - CASE_CFN_LLCEIL: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512); - } - break; - - CASE_CFN_IRINT: - CASE_CFN_LRINT: - CASE_CFN_LLRINT: - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512); - } - break; - - CASE_CFN_IROUND: - CASE_CFN_LROUND: - CASE_CFN_LLROUND: - /* The round insn does not trap on denormals. 
*/ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512); - } - break; - - CASE_CFN_FLOOR: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); - } - break; - - CASE_CFN_CEIL: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_CEILPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CEILPS512); - } - break; - - CASE_CFN_TRUNC: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); - } - break; - - CASE_CFN_RINT: - /* The round insn does not trap on denormals. 
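Illustrative example (not part of the patch): the CASE_CFN_FLOOR handling above only hands the vectorizer a packed variant when SSE4.1 is enabled and -fno-trapping-math (or -ffast-math) is in effect, so a loop like this hypothetical one can become ROUNDPS-based code under roughly -O3 -msse4.1 -fno-trapping-math; the option set is an assumption about a typical build, not something the patch states.

#include <math.h>

void
floor_all (float *restrict dst, const float *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = floorf (src[i]);
}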
*/ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_RINTPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_RINTPD256); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_RINTPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_RINTPS256); - } - break; - - CASE_CFN_FMA: - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPD); - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPS); - if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256); - } - break; - - default: - break; - } - - /* Dispatch to a handler for a vectorization library. */ - if (ix86_veclib_handler) - return ix86_veclib_handler (combined_fn (fn), type_out, type_in); - - return NULL_TREE; -} - -/* Handler for an SVML-style interface to - a library with vectorized intrinsics. */ - -static tree -ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in) -{ - char name[20]; - tree fntype, new_fndecl, args; - unsigned arity; - const char *bname; - machine_mode el_mode, in_mode; - int n, in_n; - - /* The SVML is suitable for unsafe math only. */ - if (!flag_unsafe_math_optimizations) - return NULL_TREE; - - el_mode = TYPE_MODE (TREE_TYPE (type_out)); - n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - if (el_mode != in_mode - || n != in_n) - return NULL_TREE; - - switch (fn) - { - CASE_CFN_EXP: - CASE_CFN_LOG: - CASE_CFN_LOG10: - CASE_CFN_POW: - CASE_CFN_TANH: - CASE_CFN_TAN: - CASE_CFN_ATAN: - CASE_CFN_ATAN2: - CASE_CFN_ATANH: - CASE_CFN_CBRT: - CASE_CFN_SINH: - CASE_CFN_SIN: - CASE_CFN_ASINH: - CASE_CFN_ASIN: - CASE_CFN_COSH: - CASE_CFN_COS: - CASE_CFN_ACOSH: - CASE_CFN_ACOS: - if ((el_mode != DFmode || n != 2) - && (el_mode != SFmode || n != 4)) - return NULL_TREE; - break; - - default: - return NULL_TREE; - } - - tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); - bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); - - if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF) - strcpy (name, "vmlsLn4"); - else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG) - strcpy (name, "vmldLn2"); - else if (n == 4) - { - sprintf (name, "vmls%s", bname+10); - name[strlen (name)-1] = '4'; - } - else - sprintf (name, "vmld%s2", bname+10); - - /* Convert to uppercase. */ - name[4] &= ~0x20; - - arity = 0; - for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) - arity++; - - if (arity == 1) - fntype = build_function_type_list (type_out, type_in, NULL); - else - fntype = build_function_type_list (type_out, type_in, type_in, NULL); - - /* Build a function declaration for the vectorized function. */ - new_fndecl = build_decl (BUILTINS_LOCATION, - FUNCTION_DECL, get_identifier (name), fntype); - TREE_PUBLIC (new_fndecl) = 1; - DECL_EXTERNAL (new_fndecl) = 1; - DECL_IS_NOVOPS (new_fndecl) = 1; - TREE_READONLY (new_fndecl) = 1; - - return new_fndecl; -} - -/* Handler for an ACML-style interface to - a library with vectorized intrinsics. 
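Illustrative example (not part of the patch): with options along the lines of -O2 -ftree-vectorize -ffast-math -mveclibabi=svml, and an SVML library to link against, a loop such as the hypothetical one below can be vectorized into calls whose names follow the scheme constructed above, e.g. vmlsSin4 for four floats. The exact option set and library are assumptions about a typical setup.

#include <math.h>

void
sin_all (float *restrict dst, const float *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = sinf (src[i]);
}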
*/ - -static tree -ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in) -{ - char name[20] = "__vr.._"; - tree fntype, new_fndecl, args; - unsigned arity; - const char *bname; - machine_mode el_mode, in_mode; - int n, in_n; - - /* The ACML is 64bits only and suitable for unsafe math only as - it does not correctly support parts of IEEE with the required - precision such as denormals. */ - if (!TARGET_64BIT - || !flag_unsafe_math_optimizations) - return NULL_TREE; - - el_mode = TYPE_MODE (TREE_TYPE (type_out)); - n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - if (el_mode != in_mode - || n != in_n) - return NULL_TREE; - - switch (fn) - { - CASE_CFN_SIN: - CASE_CFN_COS: - CASE_CFN_EXP: - CASE_CFN_LOG: - CASE_CFN_LOG2: - CASE_CFN_LOG10: - if (el_mode == DFmode && n == 2) - { - name[4] = 'd'; - name[5] = '2'; - } - else if (el_mode == SFmode && n == 4) - { - name[4] = 's'; - name[5] = '4'; - } - else - return NULL_TREE; - break; - - default: - return NULL_TREE; - } - - tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); - bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); - sprintf (name + 7, "%s", bname+10); - - arity = 0; - for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) - arity++; - - if (arity == 1) - fntype = build_function_type_list (type_out, type_in, NULL); - else - fntype = build_function_type_list (type_out, type_in, type_in, NULL); - - /* Build a function declaration for the vectorized function. */ - new_fndecl = build_decl (BUILTINS_LOCATION, - FUNCTION_DECL, get_identifier (name), fntype); - TREE_PUBLIC (new_fndecl) = 1; - DECL_EXTERNAL (new_fndecl) = 1; - DECL_IS_NOVOPS (new_fndecl) = 1; - TREE_READONLY (new_fndecl) = 1; - - return new_fndecl; -} - -/* Returns a decl of a function that implements gather load with - memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. - Return NULL_TREE if it is not available. */ - -static tree -ix86_vectorize_builtin_gather (const_tree mem_vectype, - const_tree index_type, int scale) -{ - bool si; - enum ix86_builtins code; - - if (! TARGET_AVX2 || !TARGET_USE_GATHER) - return NULL_TREE; - - if ((TREE_CODE (index_type) != INTEGER_TYPE - && !POINTER_TYPE_P (index_type)) - || (TYPE_MODE (index_type) != SImode - && TYPE_MODE (index_type) != DImode)) - return NULL_TREE; - - if (TYPE_PRECISION (index_type) > POINTER_SIZE) - return NULL_TREE; - - /* v*gather* insn sign extends index to pointer mode. */ - if (TYPE_PRECISION (index_type) < POINTER_SIZE - && TYPE_UNSIGNED (index_type)) - return NULL_TREE; - - if (scale <= 0 - || scale > 8 - || (scale & (scale - 1)) != 0) - return NULL_TREE; - - si = TYPE_MODE (index_type) == SImode; - switch (TYPE_MODE (mem_vectype)) - { - case E_V2DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF; - else - code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; - break; - case E_V4DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF; - else - code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; - break; - case E_V2DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI; - else - code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI; - else - code = si ? 
IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; - break; - case E_V4SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF; - else - code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; - break; - case E_V8SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF; - else - code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; - break; - case E_V4SImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI; - else - code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; - break; - case E_V8SImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI; - else - code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; - break; - case E_V8DFmode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF; - else - return NULL_TREE; - break; - case E_V8DImode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI; - else - return NULL_TREE; - break; - case E_V16SFmode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF; - else - return NULL_TREE; - break; - case E_V16SImode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI; - else - return NULL_TREE; - break; - default: - return NULL_TREE; - } - - return ix86_get_builtin (code); -} - -/* Returns a decl of a function that implements scatter store with - register type VECTYPE and index type INDEX_TYPE and SCALE. - Return NULL_TREE if it is not available. */ - -static tree -ix86_vectorize_builtin_scatter (const_tree vectype, - const_tree index_type, int scale) -{ - bool si; - enum ix86_builtins code; - - if (!TARGET_AVX512F) - return NULL_TREE; - - if ((TREE_CODE (index_type) != INTEGER_TYPE - && !POINTER_TYPE_P (index_type)) - || (TYPE_MODE (index_type) != SImode - && TYPE_MODE (index_type) != DImode)) - return NULL_TREE; - - if (TYPE_PRECISION (index_type) > POINTER_SIZE) - return NULL_TREE; - - /* v*scatter* insn sign extends index to pointer mode. */ - if (TYPE_PRECISION (index_type) < POINTER_SIZE - && TYPE_UNSIGNED (index_type)) - return NULL_TREE; - - /* Scale can be 1, 2, 4 or 8. */ - if (scale <= 0 - || scale > 8 - || (scale & (scale - 1)) != 0) - return NULL_TREE; - - si = TYPE_MODE (index_type) == SImode; - switch (TYPE_MODE (vectype)) - { - case E_V8DFmode: - code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF; - break; - case E_V8DImode: - code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI; - break; - case E_V16SFmode: - code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF; - break; - case E_V16SImode: - code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI; - break; - case E_V4DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERALTSIV4DF : IX86_BUILTIN_SCATTERDIV4DF; - else - return NULL_TREE; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERALTSIV4DI : IX86_BUILTIN_SCATTERDIV4DI; - else - return NULL_TREE; - break; - case E_V8SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERSIV8SF : IX86_BUILTIN_SCATTERALTDIV8SF; - else - return NULL_TREE; - break; - case E_V8SImode: - if (TARGET_AVX512VL) - code = si ? 
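Illustrative example (not part of the patch): the gather hook above, and the scatter hook that continues below, are what let the vectorizer turn indexed loops like these hypothetical ones into gather or scatter builtins. The int index satisfies the SImode, precision and signedness checks, and the element size gives a legal scale of 8; roughly -O3 -mavx2 suffices for the gather and -mavx512f for the scatter.

void
gather_loop (double *restrict dst, const double *restrict table,
             const int *restrict idx, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = table[idx[i]];
}

void
scatter_loop (double *restrict table, const int *restrict idx,
              const double *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    table[idx[i]] = src[i];
}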
IX86_BUILTIN_SCATTERSIV8SI : IX86_BUILTIN_SCATTERALTDIV8SI; - else - return NULL_TREE; - break; - case E_V2DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : IX86_BUILTIN_SCATTERDIV2DF; - else - return NULL_TREE; - break; - case E_V2DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : IX86_BUILTIN_SCATTERDIV2DI; - else - return NULL_TREE; - break; - case E_V4SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERSIV4SF : IX86_BUILTIN_SCATTERALTDIV4SF; - else - return NULL_TREE; - break; - case E_V4SImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERSIV4SI : IX86_BUILTIN_SCATTERALTDIV4SI; - else - return NULL_TREE; - break; - default: - return NULL_TREE; - } - - return ix86_builtins[code]; -} - -/* Return true if it is safe to use the rsqrt optabs to optimize - 1.0/sqrt. */ - -static bool -use_rsqrt_p () -{ - return (TARGET_SSE && TARGET_SSE_MATH - && flag_finite_math_only - && !flag_trapping_math - && flag_unsafe_math_optimizations); -} - -/* Returns a code for a target-specific builtin that implements - reciprocal of the function, or NULL_TREE if not available. */ - -static tree -ix86_builtin_reciprocal (tree fndecl) -{ - switch (DECL_FUNCTION_CODE (fndecl)) - { - /* Vectorized version of sqrt to rsqrt conversion. */ - case IX86_BUILTIN_SQRTPS_NR: - return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR); - - case IX86_BUILTIN_SQRTPS_NR256: - return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256); - - default: - return NULL_TREE; - } -} - -/* Helper for avx_vpermilps256_operand et al. This is also used by - the expansion functions to turn the parallel back into a mask. - The return value is 0 for no match and the imm8+1 for a match. */ - -int -avx_vpermilp_parallel (rtx par, machine_mode mode) -{ - unsigned i, nelt = GET_MODE_NUNITS (mode); - unsigned mask = 0; - unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */ - - if (XVECLEN (par, 0) != (int) nelt) - return 0; - - /* Validate that all of the elements are constants, and not totally - out of range. Copy the data into an integral array to make the - subsequent checks easier. */ - for (i = 0; i < nelt; ++i) - { - rtx er = XVECEXP (par, 0, i); - unsigned HOST_WIDE_INT ei; - - if (!CONST_INT_P (er)) - return 0; - ei = INTVAL (er); - if (ei >= nelt) - return 0; - ipar[i] = ei; - } - - switch (mode) - { - case E_V8DFmode: - /* In the 512-bit DFmode case, we can only move elements within - a 128-bit lane. First fill the second part of the mask, - then fallthru. */ - for (i = 4; i < 6; ++i) - { - if (ipar[i] < 4 || ipar[i] >= 6) - return 0; - mask |= (ipar[i] - 4) << i; - } - for (i = 6; i < 8; ++i) - { - if (ipar[i] < 6) - return 0; - mask |= (ipar[i] - 6) << i; - } - /* FALLTHRU */ - - case E_V4DFmode: - /* In the 256-bit DFmode case, we can only move elements within - a 128-bit lane. */ - for (i = 0; i < 2; ++i) - { - if (ipar[i] >= 2) - return 0; - mask |= ipar[i] << i; - } - for (i = 2; i < 4; ++i) - { - if (ipar[i] < 2) - return 0; - mask |= (ipar[i] - 2) << i; - } - break; - - case E_V16SFmode: - /* In 512 bit SFmode case, permutation in the upper 256 bits - must mirror the permutation in the lower 256-bits. */ - for (i = 0; i < 8; ++i) - if (ipar[i] + 8 != ipar[i + 8]) - return 0; - /* FALLTHRU */ - - case E_V8SFmode: - /* In 256 bit SFmode case, we have full freedom of - movement within the low 128-bit lane, but the high 128-bit - lane must mirror the exact same pattern. 
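Standalone sketch (not GCC code; the helper name is invented): the 128-bit V4SF case of avx_vpermilp_parallel, whose body follows just below, packs each element selector into two bits of the vpermilps immediate and reports success as imm8 + 1. The same packing can be reproduced in isolation like this.

#include <stdio.h>

static unsigned
v4sf_permil_imm (const unsigned char sel[4])
{
  unsigned mask = 0;
  for (unsigned i = 0; i < 4; i++)
    {
      if (sel[i] >= 4)
        return 0;                 /* out of range: no match, as in GCC */
      mask |= (unsigned) sel[i] << (i * 2);
    }
  return mask + 1;                /* imm8 + 1 so that success is non-zero */
}

int
main (void)
{
  unsigned char rev[4] = { 3, 2, 1, 0 };  /* reverse the four floats */
  unsigned r = v4sf_permil_imm (rev);
  if (r)
    printf ("imm8 = 0x%02x\n", r - 1);    /* prints imm8 = 0x1b */
  return 0;
}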
*/ - for (i = 0; i < 4; ++i) - if (ipar[i] + 4 != ipar[i + 4]) - return 0; - nelt = 4; - /* FALLTHRU */ - - case E_V2DFmode: - case E_V4SFmode: - /* In the 128-bit case, we've full freedom in the placement of - the elements from the source operand. */ - for (i = 0; i < nelt; ++i) - mask |= ipar[i] << (i * (nelt / 2)); - break; - - default: - gcc_unreachable (); - } - - /* Make sure success has a non-zero value by adding one. */ - return mask + 1; -} - -/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by - the expansion functions to turn the parallel back into a mask. - The return value is 0 for no match and the imm8+1 for a match. */ - -int -avx_vperm2f128_parallel (rtx par, machine_mode mode) -{ - unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2; - unsigned mask = 0; - unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */ - - if (XVECLEN (par, 0) != (int) nelt) - return 0; - - /* Validate that all of the elements are constants, and not totally - out of range. Copy the data into an integral array to make the - subsequent checks easier. */ - for (i = 0; i < nelt; ++i) - { - rtx er = XVECEXP (par, 0, i); - unsigned HOST_WIDE_INT ei; - - if (!CONST_INT_P (er)) - return 0; - ei = INTVAL (er); - if (ei >= 2 * nelt) - return 0; - ipar[i] = ei; - } - - /* Validate that the halves of the permute are halves. */ - for (i = 0; i < nelt2 - 1; ++i) - if (ipar[i] + 1 != ipar[i + 1]) - return 0; - for (i = nelt2; i < nelt - 1; ++i) - if (ipar[i] + 1 != ipar[i + 1]) - return 0; - - /* Reconstruct the mask. */ - for (i = 0; i < 2; ++i) - { - unsigned e = ipar[i * nelt2]; - if (e % nelt2) - return 0; - e /= nelt2; - mask |= e << (i * 4); - } - - /* Make sure success has a non-zero value by adding one. */ - return mask + 1; -} - -/* Return a register priority for hard reg REGNO. */ -static int -ix86_register_priority (int hard_regno) -{ - /* ebp and r13 as the base always wants a displacement, r12 as the - base always wants an index. So discourage their usage in an - address. */ - if (hard_regno == R12_REG || hard_regno == R13_REG) - return 0; - if (hard_regno == BP_REG) - return 1; - /* New x86-64 int registers result in bigger code size. Discourage - them. */ - if (IN_RANGE (hard_regno, FIRST_REX_INT_REG, LAST_REX_INT_REG)) - return 2; - /* New x86-64 SSE registers result in bigger code size. Discourage - them. */ - if (IN_RANGE (hard_regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)) - return 2; - if (IN_RANGE (hard_regno, FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG)) - return 1; - /* Usage of AX register results in smaller code. Prefer it. */ - if (hard_regno == AX_REG) - return 4; - return 3; -} - -/* Implement TARGET_PREFERRED_RELOAD_CLASS. - - Put float CONST_DOUBLE in the constant pool instead of fp regs. - QImode must go into class Q_REGS. - Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and - movdf to do mem-to-mem moves through integer regs. */ - -static reg_class_t -ix86_preferred_reload_class (rtx x, reg_class_t regclass) -{ - machine_mode mode = GET_MODE (x); - - /* We're only allowed to return a subclass of CLASS. Many of the - following checks fail for NO_REGS, so eliminate that early. */ - if (regclass == NO_REGS) - return NO_REGS; - - /* All classes can load zeros. */ - if (x == CONST0_RTX (mode)) - return regclass; - - /* Force constants into memory if we are loading a (nonzero) constant into - an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK - instructions to load from a constant. 
*/ - if (CONSTANT_P (x) - && (MAYBE_MMX_CLASS_P (regclass) - || MAYBE_SSE_CLASS_P (regclass) - || MAYBE_MASK_CLASS_P (regclass))) - return NO_REGS; - - /* Floating-point constants need more complex checks. */ - if (CONST_DOUBLE_P (x)) - { - /* General regs can load everything. */ - if (INTEGER_CLASS_P (regclass)) - return regclass; - - /* Floats can load 0 and 1 plus some others. Note that we eliminated - zero above. We only want to wind up preferring 80387 registers if - we plan on doing computation with them. */ - if (IS_STACK_MODE (mode) - && standard_80387_constant_p (x) > 0) - { - /* Limit class to FP regs. */ - if (FLOAT_CLASS_P (regclass)) - return FLOAT_REGS; - } - - return NO_REGS; - } - - /* Prefer SSE regs only, if we can use them for math. */ - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - return SSE_CLASS_P (regclass) ? regclass : NO_REGS; - - /* Generally when we see PLUS here, it's the function invariant - (plus soft-fp const_int). Which can only be computed into general - regs. */ - if (GET_CODE (x) == PLUS) - return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS; - - /* QImode constants are easy to load, but non-constant QImode data - must go into Q_REGS. */ - if (GET_MODE (x) == QImode && !CONSTANT_P (x)) - { - if (Q_CLASS_P (regclass)) - return regclass; - else if (reg_class_subset_p (Q_REGS, regclass)) - return Q_REGS; - else - return NO_REGS; - } - - return regclass; -} - -/* Discourage putting floating-point values in SSE registers unless - SSE math is being used, and likewise for the 387 registers. */ -static reg_class_t -ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) -{ - /* Restrict the output reload class to the register bank that we are doing - math on. If we would like not to return a subset of CLASS, reject this - alternative: if reload cannot do this, it will still use its choice. */ - machine_mode mode = GET_MODE (x); - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS; - - if (IS_STACK_MODE (mode)) - return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS; - - return regclass; -} - -static reg_class_t -ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass, - machine_mode mode, secondary_reload_info *sri) -{ - /* Double-word spills from general registers to non-offsettable memory - references (zero-extended addresses) require special handling. */ - if (TARGET_64BIT - && MEM_P (x) - && GET_MODE_SIZE (mode) > UNITS_PER_WORD - && INTEGER_CLASS_P (rclass) - && !offsettable_memref_p (x)) - { - sri->icode = (in_p - ? CODE_FOR_reload_noff_load - : CODE_FOR_reload_noff_store); - /* Add the cost of moving address to a temporary. */ - sri->extra_cost = 1; - - return NO_REGS; - } - - /* QImode spills from non-QI registers require - intermediate register on 32bit targets. */ - if (mode == QImode - && ((!TARGET_64BIT && !in_p - && INTEGER_CLASS_P (rclass) - && MAYBE_NON_Q_CLASS_P (rclass)) - || (!TARGET_AVX512DQ - && MAYBE_MASK_CLASS_P (rclass)))) - { - int regno = true_regnum (x); - - /* Return Q_REGS if the operand is in memory. */ - if (regno == -1) - return Q_REGS; - - return NO_REGS; - } - - /* This condition handles corner case where an expression involving - pointers gets vectorized. We're trying to use the address of a - stack slot as a vector initializer. 
- - (set (reg:V2DI 74 [ vect_cst_.2 ]) - (vec_duplicate:V2DI (reg/f:DI 20 frame))) - - Eventually frame gets turned into sp+offset like this: - - (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) - (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) - (const_int 392 [0x188])))) - - That later gets turned into: - - (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) - (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) - (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])))) - - We'll have the following reload recorded: - - Reload 0: reload_in (DI) = - (plus:DI (reg/f:DI 7 sp) - (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])) - reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) - SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine - reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188])) - reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) - reload_reg_rtx: (reg:V2DI 22 xmm1) - - Which isn't going to work since SSE instructions can't handle scalar - additions. Returning GENERAL_REGS forces the addition into integer - register and reload can handle subsequent reloads without problems. */ - - if (in_p && GET_CODE (x) == PLUS - && SSE_CLASS_P (rclass) - && SCALAR_INT_MODE_P (mode)) - return GENERAL_REGS; - - return NO_REGS; -} - -/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */ - -static bool -ix86_class_likely_spilled_p (reg_class_t rclass) -{ - switch (rclass) - { - case AREG: - case DREG: - case CREG: - case BREG: - case AD_REGS: - case SIREG: - case DIREG: - case SSE_FIRST_REG: - case FP_TOP_REG: - case FP_SECOND_REG: - return true; - - default: - break; - } - - return false; -} - -/* If we are copying between registers from different register sets - (e.g. FP and integer), we may need a memory location. - - The function can't work reliably when one of the CLASSES is a class - containing registers from multiple sets. We avoid this by never combining - different sets in a single alternative in the machine description. - Ensure that this constraint holds to avoid unexpected surprises. - - When STRICT is false, we are being called from REGISTER_MOVE_COST, - so do not enforce these sanity checks. - - To optimize register_move_cost performance, define inline variant. */ - -static inline bool -inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, - reg_class_t class2, int strict) -{ - if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS)) - return false; - - if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1) - || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2) - || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1) - || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2) - || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1) - || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2) - || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1) - || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2)) - { - gcc_assert (!strict || lra_in_progress); - return true; - } - - if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2)) - return true; - - /* Between mask and general, we have moves no larger than word size. */ - if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2)) - && (GET_MODE_SIZE (mode) > UNITS_PER_WORD)) - return true; - - /* ??? This is a lie. We do have moves between mmx/general, and for - mmx/sse2. But by saying we need secondary memory we discourage the - register allocator from using the mmx registers unless needed. 
*/ - if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)) - return true; - - if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) - { - /* SSE1 doesn't have any direct moves from other classes. */ - if (!TARGET_SSE2) - return true; - - /* If the target says that inter-unit moves are more expensive - than moving through memory, then don't generate them. */ - if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) - || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) - return true; - - /* Between SSE and general, we have moves no larger than word size. */ - if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - return true; - } - - return false; -} - -/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */ - -static bool -ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1, - reg_class_t class2) -{ - return inline_secondary_memory_needed (mode, class1, class2, true); -} - -/* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE. - - get_secondary_mem widens integral modes to BITS_PER_WORD. - There is no need to emit full 64 bit move on 64 bit targets - for integral modes that can be moved using 32 bit move. */ - -static machine_mode -ix86_secondary_memory_needed_mode (machine_mode mode) -{ - if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode)) - return mode_for_size (32, GET_MODE_CLASS (mode), 0).require (); - return mode; -} - -/* Implement the TARGET_CLASS_MAX_NREGS hook. - - On the 80386, this is the size of MODE in words, - except in the FP regs, where a single reg is always enough. */ - -static unsigned char -ix86_class_max_nregs (reg_class_t rclass, machine_mode mode) -{ - if (MAYBE_INTEGER_CLASS_P (rclass)) - { - if (mode == XFmode) - return (TARGET_64BIT ? 2 : 3); - else if (mode == XCmode) - return (TARGET_64BIT ? 4 : 6); - else - return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); - } - else - { - if (COMPLEX_MODE_P (mode)) - return 2; - else - return 1; - } -} - -/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */ - -static bool -ix86_can_change_mode_class (machine_mode from, machine_mode to, - reg_class_t regclass) -{ - if (from == to) - return true; - - /* x87 registers can't do subreg at all, as all values are reformatted - to extended precision. */ - if (MAYBE_FLOAT_CLASS_P (regclass)) - return false; - - if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) - { - /* Vector registers do not support QI or HImode loads. If we don't - disallow a change to these modes, reload will assume it's ok to - drop the subreg from (subreg:SI (reg:HI 100) 0). This affects - the vec_dupv4hi pattern. */ - if (GET_MODE_SIZE (from) < 4) - return false; - } - - return true; -} - -/* Return index of MODE in the sse load/store tables. */ - -static inline int -sse_store_index (machine_mode mode) -{ - switch (GET_MODE_SIZE (mode)) - { - case 4: - return 0; - case 8: - return 1; - case 16: - return 2; - case 32: - return 3; - case 64: - return 4; - default: - return -1; - } -} - -/* Return the cost of moving data of mode M between a - register and memory. A value of 2 is the default; this cost is - relative to those in `REGISTER_MOVE_COST'. - - This function is used extensively by register_move_cost that is used to - build tables at startup. Make it inline in this case. - When IN is 2, return maximum of in and out move cost. - - If moving between registers and memory is more expensive than - between two registers, you should define this macro to express the - relative cost. - - Model also increased moving costs of QImode registers in non - Q_REGS classes. 
- */ -static inline int -inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) -{ - int cost; - if (FLOAT_CLASS_P (regclass)) - { - int index; - switch (mode) - { - case E_SFmode: - index = 0; - break; - case E_DFmode: - index = 1; - break; - case E_XFmode: - index = 2; - break; - default: - return 100; - } - if (in == 2) - return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]); - return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index]; - } - if (SSE_CLASS_P (regclass)) - { - int index = sse_store_index (mode); - if (index == -1) - return 100; - if (in == 2) - return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); - return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; - } - if (MMX_CLASS_P (regclass)) - { - int index; - switch (GET_MODE_SIZE (mode)) - { - case 4: - index = 0; - break; - case 8: - index = 1; - break; - default: - return 100; - } - if (in == 2) - return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]); - return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index]; - } - switch (GET_MODE_SIZE (mode)) - { - case 1: - if (Q_CLASS_P (regclass) || TARGET_64BIT) - { - if (!in) - return ix86_cost->int_store[0]; - if (TARGET_PARTIAL_REG_DEPENDENCY - && optimize_function_for_speed_p (cfun)) - cost = ix86_cost->movzbl_load; - else - cost = ix86_cost->int_load[0]; - if (in == 2) - return MAX (cost, ix86_cost->int_store[0]); - return cost; - } - else - { - if (in == 2) - return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4); - if (in) - return ix86_cost->movzbl_load; - else - return ix86_cost->int_store[0] + 4; - } - break; - case 2: - if (in == 2) - return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]); - return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1]; - default: - if (in == 2) - cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]); - else if (in) - cost = ix86_cost->int_load[2]; - else - cost = ix86_cost->int_store[2]; - /* Multiply with the number of GPR moves needed. */ - return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD); - } -} - -static int -ix86_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in) -{ - return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0); -} - - -/* Return the cost of moving data from a register in class CLASS1 to - one in class CLASS2. - - It is not required that the cost always equal 2 when FROM is the same as TO; - on some machines it is expensive to move between registers if they are not - general registers. */ - -static int -ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, - reg_class_t class2_i) -{ - enum reg_class class1 = (enum reg_class) class1_i; - enum reg_class class2 = (enum reg_class) class2_i; - - /* In case we require secondary memory, compute cost of the store followed - by load. In order to avoid bad register allocation choices, we need - for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */ - - if (inline_secondary_memory_needed (mode, class1, class2, false)) - { - int cost = 1; - - cost += inline_memory_move_cost (mode, class1, 2); - cost += inline_memory_move_cost (mode, class2, 2); - - /* In case of copying from general_purpose_register we may emit multiple - stores followed by single load causing memory size mismatch stall. - Count this as arbitrarily high cost of 20. 
*/ - if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD - && TARGET_MEMORY_MISMATCH_STALL - && targetm.class_max_nregs (class1, mode) - > targetm.class_max_nregs (class2, mode)) - cost += 20; - - /* In the case of FP/MMX moves, the registers actually overlap, and we - have to switch modes in order to treat them differently. */ - if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2)) - || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1))) - cost += 20; - - return cost; - } - - /* Moves between SSE/MMX and integer unit are expensive. */ - if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2) - || SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) - - /* ??? By keeping returned value relatively high, we limit the number - of moves between integer and MMX/SSE registers for all targets. - Additionally, high value prevents problem with x86_modes_tieable_p(), - where integer modes in MMX/SSE registers are not tieable - because of missing QImode and HImode moves to, from or between - MMX/SSE registers. */ - return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2) - ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer); - - if (MAYBE_FLOAT_CLASS_P (class1)) - return ix86_cost->fp_move; - if (MAYBE_SSE_CLASS_P (class1)) - { - if (GET_MODE_BITSIZE (mode) <= 128) - return ix86_cost->xmm_move; - if (GET_MODE_BITSIZE (mode) <= 256) - return ix86_cost->ymm_move; - return ix86_cost->zmm_move; - } - if (MAYBE_MMX_CLASS_P (class1)) - return ix86_cost->mmx_move; - return 2; -} - -/* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in - words of a value of mode MODE but can be less for certain modes in - special long registers. - - Actually there are no two word move instructions for consecutive - registers. And only registers 0-3 may have mov byte instructions - applied to them. */ - -static unsigned int -ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) -{ - if (GENERAL_REGNO_P (regno)) - { - if (mode == XFmode) - return TARGET_64BIT ? 2 : 3; - if (mode == XCmode) - return TARGET_64BIT ? 4 : 6; - return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); - } - if (COMPLEX_MODE_P (mode)) - return 2; - if (mode == V64SFmode || mode == V64SImode) - return 4; - return 1; -} - -/* Implement TARGET_HARD_REGNO_MODE_OK. */ - -static bool -ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) -{ - /* Flags and only flags can only hold CCmode values. */ - if (CC_REGNO_P (regno)) - return GET_MODE_CLASS (mode) == MODE_CC; - if (GET_MODE_CLASS (mode) == MODE_CC - || GET_MODE_CLASS (mode) == MODE_RANDOM - || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) - return false; - if (STACK_REGNO_P (regno)) - return VALID_FP_MODE_P (mode); - if (MASK_REGNO_P (regno)) - return (VALID_MASK_REG_MODE (mode) - || (TARGET_AVX512BW - && VALID_MASK_AVX512BW_MODE (mode))); - if (SSE_REGNO_P (regno)) - { - /* We implement the move patterns for all vector modes into and - out of SSE registers, even when no operation instructions - are available. */ - - /* For AVX-512 we allow, regardless of regno: - - XI mode - - any of 512-bit wide vector mode - - any scalar mode. */ - if (TARGET_AVX512F - && (mode == XImode - || VALID_AVX512F_REG_MODE (mode) - || VALID_AVX512F_SCALAR_MODE (mode))) - return true; - - /* For AVX-5124FMAPS or AVX-5124VNNIW - allow V64SF and V64SI modes for special regnos. */ - if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW) - && (mode == V64SFmode || mode == V64SImode) - && MOD4_SSE_REGNO_P (regno)) - return true; - - /* TODO check for QI/HI scalars. 
*/ - /* AVX512VL allows sse regs16+ for 128/256 bit modes. */ - if (TARGET_AVX512VL - && (mode == OImode - || mode == TImode - || VALID_AVX256_REG_MODE (mode) - || VALID_AVX512VL_128_REG_MODE (mode))) - return true; - - /* xmm16-xmm31 are only available for AVX-512. */ - if (EXT_REX_SSE_REGNO_P (regno)) - return false; - - /* OImode and AVX modes are available only when AVX is enabled. */ - return ((TARGET_AVX - && VALID_AVX256_REG_OR_OI_MODE (mode)) - || VALID_SSE_REG_MODE (mode) - || VALID_SSE2_REG_MODE (mode) - || VALID_MMX_REG_MODE (mode) - || VALID_MMX_REG_MODE_3DNOW (mode)); - } - if (MMX_REGNO_P (regno)) - { - /* We implement the move patterns for 3DNOW modes even in MMX mode, - so if the register is available at all, then we can move data of - the given mode into or out of it. */ - return (VALID_MMX_REG_MODE (mode) - || VALID_MMX_REG_MODE_3DNOW (mode)); - } - - if (mode == QImode) - { - /* Take care for QImode values - they can be in non-QI regs, - but then they do cause partial register stalls. */ - if (ANY_QI_REGNO_P (regno)) - return true; - if (!TARGET_PARTIAL_REG_STALL) - return true; - /* LRA checks if the hard register is OK for the given mode. - QImode values can live in non-QI regs, so we allow all - registers here. */ - if (lra_in_progress) - return true; - return !can_create_pseudo_p (); - } - /* We handle both integer and floats in the general purpose registers. */ - else if (VALID_INT_MODE_P (mode)) - return true; - else if (VALID_FP_MODE_P (mode)) - return true; - else if (VALID_DFP_MODE_P (mode)) - return true; - /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go - on to use that value in smaller contexts, this can easily force a - pseudo to be allocated to GENERAL_REGS. Since this is no worse than - supporting DImode, allow it. */ - else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode)) - return true; - - return false; -} - -/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that - saves SSE registers across calls is Win64 (thus no need to check the - current ABI here), and with AVX enabled Win64 only guarantees that - the low 16 bytes are saved. */ - -static bool -ix86_hard_regno_call_part_clobbered (rtx_insn *insn ATTRIBUTE_UNUSED, - unsigned int regno, machine_mode mode) -{ - return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16; -} - -/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a - tieable integer mode. */ - -static bool -ix86_tieable_integer_mode_p (machine_mode mode) -{ - switch (mode) - { - case E_HImode: - case E_SImode: - return true; - - case E_QImode: - return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL; - - case E_DImode: - return TARGET_64BIT; - - default: - return false; - } -} - -/* Implement TARGET_MODES_TIEABLE_P. - - Return true if MODE1 is accessible in a register that can hold MODE2 - without copying. That is, all register classes that can hold MODE2 - can also hold MODE1. */ - -static bool -ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) -{ - if (mode1 == mode2) - return true; - - if (ix86_tieable_integer_mode_p (mode1) - && ix86_tieable_integer_mode_p (mode2)) - return true; - - /* MODE2 being XFmode implies fp stack or general regs, which means we - can tie any smaller floating point modes to it. Note that we do not - tie this with TFmode. */ - if (mode2 == XFmode) - return mode1 == SFmode || mode1 == DFmode; - - /* MODE2 being DFmode implies fp stack, general or sse regs, which means - that we can tie it with SFmode. 
*/ - if (mode2 == DFmode) - return mode1 == SFmode; - - /* If MODE2 is only appropriate for an SSE register, then tie with - any other mode acceptable to SSE registers. */ - if (GET_MODE_SIZE (mode2) == 64 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 64 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - if (GET_MODE_SIZE (mode2) == 32 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 32 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - if (GET_MODE_SIZE (mode2) == 16 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 16 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - - /* If MODE2 is appropriate for an MMX register, then tie - with any other mode acceptable to MMX registers. */ - if (GET_MODE_SIZE (mode2) == 8 - && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 8 - && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); - - return false; -} - -/* Return the cost of moving between two registers of mode MODE. */ - -static int -ix86_set_reg_reg_cost (machine_mode mode) -{ - unsigned int units = UNITS_PER_WORD; - - switch (GET_MODE_CLASS (mode)) - { - default: - break; - - case MODE_CC: - units = GET_MODE_SIZE (CCmode); - break; - - case MODE_FLOAT: - if ((TARGET_SSE && mode == TFmode) - || (TARGET_80387 && mode == XFmode) - || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode) - || ((TARGET_80387 || TARGET_SSE) && mode == SFmode)) - units = GET_MODE_SIZE (mode); - break; - - case MODE_COMPLEX_FLOAT: - if ((TARGET_SSE && mode == TCmode) - || (TARGET_80387 && mode == XCmode) - || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode) - || ((TARGET_80387 || TARGET_SSE) && mode == SCmode)) - units = GET_MODE_SIZE (mode); - break; - - case MODE_VECTOR_INT: - case MODE_VECTOR_FLOAT: - if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) - || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) - || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) - || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) - || (TARGET_MMX && VALID_MMX_REG_MODE (mode))) - units = GET_MODE_SIZE (mode); - } - - /* Return the cost of moving between two registers of mode MODE, - assuming that the move will be in pieces of at most UNITS bytes. */ - return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units)); -} - -/* Return cost of vector operation in MODE given that scalar version has - COST. */ - -static int -ix86_vec_cost (machine_mode mode, int cost) -{ - if (!VECTOR_MODE_P (mode)) - return cost; - - if (GET_MODE_BITSIZE (mode) == 128 - && TARGET_SSE_SPLIT_REGS) - return cost * 2; - if (GET_MODE_BITSIZE (mode) > 128 - && TARGET_AVX128_OPTIMAL) - return cost * GET_MODE_BITSIZE (mode) / 128; - return cost; -} - -/* Return cost of multiplication in MODE. */ - -static int -ix86_multiplication_cost (const struct processor_costs *cost, - enum machine_mode mode) -{ - machine_mode inner_mode = mode; - if (VECTOR_MODE_P (mode)) - inner_mode = GET_MODE_INNER (mode); - - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - return inner_mode == DFmode ? cost->mulsd : cost->mulss; - else if (X87_FLOAT_MODE_P (mode)) - return cost->fmul; - else if (FLOAT_MODE_P (mode)) - return ix86_vec_cost (mode, - inner_mode == DFmode ? cost->mulsd : cost->mulss); - else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - /* vpmullq is used in this case. No emulation is needed. */ - if (TARGET_AVX512DQ) - return ix86_vec_cost (mode, cost->mulss); - - /* V*QImode is emulated with 7-13 insns. 
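A minimal sketch of the scaling rule applied by ix86_vec_cost above, written on plain integers; the flag parameters stand in for TARGET_SSE_SPLIT_REGS and TARGET_AVX128_OPTIMAL, and the function name is hypothetical.

/* vector_bits == 0 stands for a scalar mode.  */
static int vec_op_cost (int scalar_cost, int vector_bits,
                        int sse_split_regs, int avx128_optimal)
{
  if (vector_bits == 0)
    return scalar_cost;
  if (vector_bits == 128 && sse_split_regs)
    return scalar_cost * 2;                    /* insn is split into two halves */
  if (vector_bits > 128 && avx128_optimal)
    return scalar_cost * vector_bits / 128;    /* 256-bit -> 2x, 512-bit -> 4x */
  return scalar_cost;
}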
*/ - if (mode == V16QImode || mode == V32QImode) - { - int extra = 11; - if (TARGET_XOP && mode == V16QImode) - extra = 5; - else if (TARGET_SSSE3) - extra = 6; - return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra); - } - /* V*DImode is emulated with 5-8 insns. */ - else if (mode == V2DImode || mode == V4DImode) - { - if (TARGET_XOP && mode == V2DImode) - return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3); - else - return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5); - } - /* Without sse4.1, we don't have PMULLD; it's emulated with 7 - insns, including two PMULUDQ. */ - else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) - return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5); - else - return ix86_vec_cost (mode, cost->mulss); - } - else - return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7); -} - -/* Return cost of multiplication in MODE. */ - -static int -ix86_division_cost (const struct processor_costs *cost, - enum machine_mode mode) -{ - machine_mode inner_mode = mode; - if (VECTOR_MODE_P (mode)) - inner_mode = GET_MODE_INNER (mode); - - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - return inner_mode == DFmode ? cost->divsd : cost->divss; - else if (X87_FLOAT_MODE_P (mode)) - return cost->fdiv; - else if (FLOAT_MODE_P (mode)) - return ix86_vec_cost (mode, - inner_mode == DFmode ? cost->divsd : cost->divss); - else - return cost->divide[MODE_INDEX (mode)]; -} - -/* Return cost of shift in MODE. - If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL. - AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE - if op1 is a result of subreg. - - SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */ - -static int -ix86_shift_rotate_cost (const struct processor_costs *cost, - enum machine_mode mode, bool constant_op1, - HOST_WIDE_INT op1_val, - bool speed, - bool and_in_op1, - bool shift_and_truncate, - bool *skip_op0, bool *skip_op1) -{ - if (skip_op0) - *skip_op0 = *skip_op1 = false; - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - /* V*QImode is emulated with 1-11 insns. */ - if (mode == V16QImode || mode == V32QImode) - { - int count = 11; - if (TARGET_XOP && mode == V16QImode) - { - /* For XOP we use vpshab, which requires a broadcast of the - value to the variable shift insn. For constants this - means a V16Q const in mem; even when we can perform the - shift with one insn set the cost to prefer paddb. */ - if (constant_op1) - { - if (skip_op1) - *skip_op1 = true; - return ix86_vec_cost (mode, - cost->sse_op - + (speed - ? 2 - : COSTS_N_BYTES - (GET_MODE_UNIT_SIZE (mode)))); - } - count = 3; - } - else if (TARGET_SSSE3) - count = 7; - return ix86_vec_cost (mode, cost->sse_op * count); - } - else - return ix86_vec_cost (mode, cost->sse_op); - } - if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - { - if (constant_op1) - { - if (op1_val > 32) - return cost->shift_const + COSTS_N_INSNS (2); - else - return cost->shift_const * 2; - } - else - { - if (and_in_op1) - return cost->shift_var * 2; - else - return cost->shift_var * 6 + COSTS_N_INSNS (2); - } - } - else - { - if (constant_op1) - return cost->shift_const; - else if (shift_and_truncate) - { - if (skip_op0) - *skip_op0 = *skip_op1 = true; - /* Return the cost after shift-and truncation. */ - return cost->shift_var; - } - else - return cost->shift_var; - } - return cost->shift_const; -} - -/* Compute a (partial) cost for rtx X. 
Return true if the complete - cost has been computed, and false if subexpressions should be - scanned. In either case, *TOTAL contains the cost result. */ - -static bool -ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, - int *total, bool speed) -{ - rtx mask; - enum rtx_code code = GET_CODE (x); - enum rtx_code outer_code = (enum rtx_code) outer_code_i; - const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; - int src_cost; - - switch (code) - { - case SET: - if (register_operand (SET_DEST (x), VOIDmode) - && register_operand (SET_SRC (x), VOIDmode)) - { - *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x))); - return true; - } - - if (register_operand (SET_SRC (x), VOIDmode)) - /* Avoid potentially incorrect high cost from rtx_costs - for non-tieable SUBREGs. */ - src_cost = 0; - else - { - src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed); - - if (CONSTANT_P (SET_SRC (x))) - /* Constant costs assume a base value of COSTS_N_INSNS (1) and add - a small value, possibly zero for cheap constants. */ - src_cost += COSTS_N_INSNS (1); - } - - *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed); - return true; - - case CONST_INT: - case CONST: - case LABEL_REF: - case SYMBOL_REF: - if (x86_64_immediate_operand (x, VOIDmode)) - *total = 0; - else - *total = 1; - return true; - - case CONST_DOUBLE: - if (IS_STACK_MODE (mode)) - switch (standard_80387_constant_p (x)) - { - case -1: - case 0: - break; - case 1: /* 0.0 */ - *total = 1; - return true; - default: /* Other constants */ - *total = 2; - return true; - } - /* FALLTHRU */ - - case CONST_VECTOR: - switch (standard_sse_constant_p (x, mode)) - { - case 0: - break; - case 1: /* 0: xor eliminates false dependency */ - *total = 0; - return true; - default: /* -1: cmp contains false dependency */ - *total = 1; - return true; - } - /* FALLTHRU */ - - case CONST_WIDE_INT: - /* Fall back to (MEM (SYMBOL_REF)), since that's where - it'll probably end up. Add a penalty for size. */ - *total = (COSTS_N_INSNS (1) - + (!TARGET_64BIT && flag_pic) - + (GET_MODE_SIZE (mode) <= 4 - ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2)); - return true; - - case ZERO_EXTEND: - /* The zero extensions is often completely free on x86_64, so make - it as cheap as possible. */ - if (TARGET_64BIT && mode == DImode - && GET_MODE (XEXP (x, 0)) == SImode) - *total = 1; - else if (TARGET_ZERO_EXTEND_WITH_AND) - *total = cost->add; - else - *total = cost->movzx; - return false; - - case SIGN_EXTEND: - *total = cost->movsx; - return false; - - case ASHIFT: - if (SCALAR_INT_MODE_P (mode) - && GET_MODE_SIZE (mode) < UNITS_PER_WORD - && CONST_INT_P (XEXP (x, 1))) - { - HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); - if (value == 1) - { - *total = cost->add; - return false; - } - if ((value == 2 || value == 3) - && cost->lea <= cost->shift_const) - { - *total = cost->lea; - return false; - } - } - /* FALLTHRU */ - - case ROTATE: - case ASHIFTRT: - case LSHIFTRT: - case ROTATERT: - bool skip_op0, skip_op1; - *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), - CONST_INT_P (XEXP (x, 1)) - ? 
INTVAL (XEXP (x, 1)) : -1, - speed, - GET_CODE (XEXP (x, 1)) == AND, - SUBREG_P (XEXP (x, 1)) - && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, - &skip_op0, &skip_op1); - if (skip_op0 || skip_op1) - { - if (!skip_op0) - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - if (!skip_op1) - *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed); - return true; - } - return false; - - case FMA: - { - rtx sub; - - gcc_assert (FLOAT_MODE_P (mode)); - gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); - - *total = ix86_vec_cost (mode, - GET_MODE_INNER (mode) == SFmode - ? cost->fmass : cost->fmasd); - *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); - - /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */ - sub = XEXP (x, 0); - if (GET_CODE (sub) == NEG) - sub = XEXP (sub, 0); - *total += rtx_cost (sub, mode, FMA, 0, speed); - - sub = XEXP (x, 2); - if (GET_CODE (sub) == NEG) - sub = XEXP (sub, 0); - *total += rtx_cost (sub, mode, FMA, 2, speed); - return true; - } - - case MULT: - if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode)) - { - rtx op0 = XEXP (x, 0); - rtx op1 = XEXP (x, 1); - int nbits; - if (CONST_INT_P (XEXP (x, 1))) - { - unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); - for (nbits = 0; value != 0; value &= value - 1) - nbits++; - } - else - /* This is arbitrary. */ - nbits = 7; - - /* Compute costs correctly for widening multiplication. */ - if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND) - && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2 - == GET_MODE_SIZE (mode)) - { - int is_mulwiden = 0; - machine_mode inner_mode = GET_MODE (op0); - - if (GET_CODE (op0) == GET_CODE (op1)) - is_mulwiden = 1, op1 = XEXP (op1, 0); - else if (CONST_INT_P (op1)) - { - if (GET_CODE (op0) == SIGN_EXTEND) - is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) - == INTVAL (op1); - else - is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode)); - } - - if (is_mulwiden) - op0 = XEXP (op0, 0), mode = GET_MODE (op0); - } - - *total = (cost->mult_init[MODE_INDEX (mode)] - + nbits * cost->mult_bit - + rtx_cost (op0, mode, outer_code, opno, speed) - + rtx_cost (op1, mode, outer_code, opno, speed)); - - return true; - } - *total = ix86_multiplication_cost (cost, mode); - return false; - - case DIV: - case UDIV: - case MOD: - case UMOD: - *total = ix86_division_cost (cost, mode); - return false; - - case PLUS: - if (GET_MODE_CLASS (mode) == MODE_INT - && GET_MODE_SIZE (mode) <= UNITS_PER_WORD) - { - if (GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) - && CONSTANT_P (XEXP (x, 1))) - { - HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); - if (val == 2 || val == 4 || val == 8) - { - *total = cost->lea; - *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); - return true; - } - } - else if (GET_CODE (XEXP (x, 0)) == MULT - && CONST_INT_P (XEXP (XEXP (x, 0), 1))) - { - HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); - if (val == 2 || val == 4 || val == 8) - { - *total = cost->lea; - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); - return true; - } - } - else if (GET_CODE (XEXP (x, 0)) == PLUS) - { - /* Add with carry, ignore the cost of adding a carry flag. 
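The MULT case above prices a multiply by a known constant from the number of set bits in the multiplier; the `value &= value - 1` loop clears one set bit per iteration. A small self-contained sketch of that estimate, with made-up init/per-bit costs:

#include <stdio.h>

static int cost_of_const_mult (unsigned long long value,
                               int mult_init, int mult_bit)
{
  int nbits = 0;
  for (; value != 0; value &= value - 1)   /* clears the lowest set bit */
    nbits++;
  return mult_init + nbits * mult_bit;
}

int main (void)
{
  /* Multiplying by 10 (two set bits) is priced cheaper than multiplying
     by 255 (eight set bits).  */
  printf ("%d %d\n", cost_of_const_mult (10, 4, 1),
                     cost_of_const_mult (255, 4, 1));
  return 0;
}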
*/ - if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode)) - *total = cost->add; - else - { - *total = cost->lea; - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, - outer_code, opno, speed); - } - - *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); - return true; - } - } - /* FALLTHRU */ - - case MINUS: - /* Subtract with borrow, ignore the cost of subtracting a carry flag. */ - if (GET_MODE_CLASS (mode) == MODE_INT - && GET_MODE_SIZE (mode) <= UNITS_PER_WORD - && GET_CODE (XEXP (x, 0)) == MINUS - && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode)) - { - *total = cost->add; - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); - return true; - } - - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - { - *total = cost->addss; - return false; - } - else if (X87_FLOAT_MODE_P (mode)) - { - *total = cost->fadd; - return false; - } - else if (FLOAT_MODE_P (mode)) - { - *total = ix86_vec_cost (mode, cost->addss); - return false; - } - /* FALLTHRU */ - - case AND: - case IOR: - case XOR: - if (GET_MODE_CLASS (mode) == MODE_INT - && GET_MODE_SIZE (mode) > UNITS_PER_WORD) - { - *total = (cost->add * 2 - + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) - << (GET_MODE (XEXP (x, 0)) != DImode)) - + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) - << (GET_MODE (XEXP (x, 1)) != DImode))); - return true; - } - /* FALLTHRU */ - - case NEG: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - { - *total = cost->sse_op; - return false; - } - else if (X87_FLOAT_MODE_P (mode)) - { - *total = cost->fchs; - return false; - } - else if (FLOAT_MODE_P (mode)) - { - *total = ix86_vec_cost (mode, cost->sse_op); - return false; - } - /* FALLTHRU */ - - case NOT: - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - *total = ix86_vec_cost (mode, cost->sse_op); - else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - *total = cost->add * 2; - else - *total = cost->add; - return false; - - case COMPARE: - if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT - && XEXP (XEXP (x, 0), 1) == const1_rtx - && CONST_INT_P (XEXP (XEXP (x, 0), 2)) - && XEXP (x, 1) == const0_rtx) - { - /* This kind of construct is implemented using test[bwl]. - Treat it as if we had an AND. */ - mode = GET_MODE (XEXP (XEXP (x, 0), 0)); - *total = (cost->add - + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, - opno, speed) - + rtx_cost (const1_rtx, mode, outer_code, opno, speed)); - return true; - } - - /* The embedded comparison operand is completely free. */ - if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0))) - && XEXP (x, 1) == const0_rtx) - *total = 0; - - return false; - - case FLOAT_EXTEND: - if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) - *total = 0; - else - *total = ix86_vec_cost (mode, cost->addss); - return false; - - case FLOAT_TRUNCATE: - if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) - *total = cost->fadd; - else - *total = ix86_vec_cost (mode, cost->addss); - return false; - - case ABS: - /* SSE requires memory load for the constant operand. It may make - sense to account for this. Of course the constant operand may or - may not be reused. 
*/ - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - *total = cost->sse_op; - else if (X87_FLOAT_MODE_P (mode)) - *total = cost->fabs; - else if (FLOAT_MODE_P (mode)) - *total = ix86_vec_cost (mode, cost->sse_op); - return false; - - case SQRT: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd; - else if (X87_FLOAT_MODE_P (mode)) - *total = cost->fsqrt; - else if (FLOAT_MODE_P (mode)) - *total = ix86_vec_cost (mode, - mode == SFmode ? cost->sqrtss : cost->sqrtsd); - return false; - - case UNSPEC: - if (XINT (x, 1) == UNSPEC_TP) - *total = 0; - return false; - - case VEC_SELECT: - case VEC_CONCAT: - case VEC_DUPLICATE: - /* ??? Assume all of these vector manipulation patterns are - recognizable. In which case they all pretty much have the - same cost. */ - *total = cost->sse_op; - return true; - case VEC_MERGE: - mask = XEXP (x, 2); - /* This is masked instruction, assume the same cost, - as nonmasked variant. */ - if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) - *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); - else - *total = cost->sse_op; - return true; - - default: - return false; - } -} - -#if TARGET_MACHO - -static int current_machopic_label_num; - -/* Given a symbol name and its associated stub, write out the - definition of the stub. */ - -void -machopic_output_stub (FILE *file, const char *symb, const char *stub) -{ - unsigned int length; - char *binder_name, *symbol_name, lazy_ptr_name[32]; - int label = ++current_machopic_label_num; - - /* For 64-bit we shouldn't get here. */ - gcc_assert (!TARGET_64BIT); - - /* Lose our funky encoding stuff so it doesn't contaminate the stub. */ - symb = targetm.strip_name_encoding (symb); - - length = strlen (stub); - binder_name = XALLOCAVEC (char, length + 32); - GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length); - - length = strlen (symb); - symbol_name = XALLOCAVEC (char, length + 32); - GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length); - - sprintf (lazy_ptr_name, "L%d$lz", label); - - if (MACHOPIC_ATT_STUB) - switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]); - else if (MACHOPIC_PURE) - switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]); - else - switch_to_section (darwin_sections[machopic_symbol_stub_section]); - - fprintf (file, "%s:\n", stub); - fprintf (file, "\t.indirect_symbol %s\n", symbol_name); - - if (MACHOPIC_ATT_STUB) - { - fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n"); - } - else if (MACHOPIC_PURE) - { - /* PIC stub. */ - /* 25-byte PIC stub using "CALL get_pc_thunk". */ - rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */); - output_set_got (tmp, NULL_RTX); /* "CALL ___.get_pc_thunk.cx". */ - fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", - label, lazy_ptr_name, label); - fprintf (file, "\tjmp\t*%%ecx\n"); - } - else - fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name); - - /* The AT&T-style ("self-modifying") stub is not lazily bound, thus - it needs no stub-binding-helper. */ - if (MACHOPIC_ATT_STUB) - return; - - fprintf (file, "%s:\n", binder_name); - - if (MACHOPIC_PURE) - { - fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name); - fprintf (file, "\tpushl\t%%ecx\n"); - } - else - fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name); - - fputs ("\tjmp\tdyld_stub_binding_helper\n", file); - - /* N.B. 
Keep the correspondence of these - 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the - old-pic/new-pic/non-pic stubs; altering this will break - compatibility with existing dylibs. */ - if (MACHOPIC_PURE) - { - /* 25-byte PIC stub using "CALL get_pc_thunk". */ - switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]); - } - else - /* 16-byte -mdynamic-no-pic stub. */ - switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]); - - fprintf (file, "%s:\n", lazy_ptr_name); - fprintf (file, "\t.indirect_symbol %s\n", symbol_name); - fprintf (file, ASM_LONG "%s\n", binder_name); -} -#endif /* TARGET_MACHO */ - -/* Order the registers for register allocator. */ - -void -x86_order_regs_for_local_alloc (void) -{ - int pos = 0; - int i; - - /* First allocate the local general purpose registers. */ - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (GENERAL_REGNO_P (i) && call_used_regs[i]) - reg_alloc_order [pos++] = i; - - /* Global general purpose registers. */ - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (GENERAL_REGNO_P (i) && !call_used_regs[i]) - reg_alloc_order [pos++] = i; - - /* x87 registers come first in case we are doing FP math - using them. */ - if (!TARGET_SSE_MATH) - for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) - reg_alloc_order [pos++] = i; - - /* SSE registers. */ - for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++) - reg_alloc_order [pos++] = i; - for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) - reg_alloc_order [pos++] = i; - - /* Extended REX SSE registers. */ - for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) - reg_alloc_order [pos++] = i; - - /* Mask register. */ - for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) - reg_alloc_order [pos++] = i; - - /* x87 registers. */ - if (TARGET_SSE_MATH) - for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) - reg_alloc_order [pos++] = i; - - for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++) - reg_alloc_order [pos++] = i; - - /* Initialize the rest of array as we do not allocate some registers - at all. */ - while (pos < FIRST_PSEUDO_REGISTER) - reg_alloc_order [pos++] = 0; -} - -/* Handle a "callee_pop_aggregate_return" attribute; arguments as - in struct attribute_spec handler. */ -static tree -ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } - if (TARGET_64BIT) - { - warning (OPT_Wattributes, "%qE attribute only available for 32-bit", - name); - *no_add_attrs = true; - return NULL_TREE; - } - if (is_attribute_p ("callee_pop_aggregate_return", name)) - { - tree cst; - - cst = TREE_VALUE (args); - if (TREE_CODE (cst) != INTEGER_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires an integer constant argument", - name); - *no_add_attrs = true; - } - else if (compare_tree_int (cst, 0) != 0 - && compare_tree_int (cst, 1) != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is neither zero, nor one", - name); - *no_add_attrs = true; - } - - return NULL_TREE; - } - - return NULL_TREE; -} - -/* Handle a "ms_abi" or "sysv" attribute; arguments as in - struct attribute_spec.handler. 
*/ -static tree -ix86_handle_abi_attribute (tree *node, tree name, tree, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine regparm with all attributes but fastcall. */ - if (is_attribute_p ("ms_abi", name)) - { - if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node))) - { - error ("ms_abi and sysv_abi attributes are not compatible"); - } - - return NULL_TREE; - } - else if (is_attribute_p ("sysv_abi", name)) - { - if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node))) - { - error ("ms_abi and sysv_abi attributes are not compatible"); - } - - return NULL_TREE; - } - - return NULL_TREE; -} - -/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in - struct attribute_spec.handler. */ -static tree -ix86_handle_struct_attribute (tree *node, tree name, tree, int, - bool *no_add_attrs) -{ - tree *type = NULL; - if (DECL_P (*node)) - { - if (TREE_CODE (*node) == TYPE_DECL) - type = &TREE_TYPE (*node); - } - else - type = node; - - if (!(type && RECORD_OR_UNION_TYPE_P (*type))) - { - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - } - - else if ((is_attribute_p ("ms_struct", name) - && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type))) - || ((is_attribute_p ("gcc_struct", name) - && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type))))) - { - warning (OPT_Wattributes, "%qE incompatible attribute ignored", - name); - *no_add_attrs = true; - } - - return NULL_TREE; -} - -static tree -ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - } - - if (is_attribute_p ("indirect_branch", name)) - { - tree cst = TREE_VALUE (args); - if (TREE_CODE (cst) != STRING_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires a string constant argument", - name); - *no_add_attrs = true; - } - else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is not " - "(keep|thunk|thunk-inline|thunk-extern)", name); - *no_add_attrs = true; - } - } - - if (is_attribute_p ("function_return", name)) - { - tree cst = TREE_VALUE (args); - if (TREE_CODE (cst) != STRING_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires a string constant argument", - name); - *no_add_attrs = true; - } - else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is not " - "(keep|thunk|thunk-inline|thunk-extern)", name); - *no_add_attrs = true; - } - } - - return NULL_TREE; -} - -static tree -ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree, - int, bool *) -{ - return NULL_TREE; -} - -static tree -ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *) -{ - /* DECL_RESULT and DECL_ARGUMENTS do not 
exist there yet, - but the function type contains args and return type data. */ - tree func_type = *node; - tree return_type = TREE_TYPE (func_type); - - int nargs = 0; - tree current_arg_type = TYPE_ARG_TYPES (func_type); - while (current_arg_type - && ! VOID_TYPE_P (TREE_VALUE (current_arg_type))) - { - if (nargs == 0) - { - if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type))) - error ("interrupt service routine should have a pointer " - "as the first argument"); - } - else if (nargs == 1) - { - if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE - || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode) - error ("interrupt service routine should have %qs " - "as the second argument", - TARGET_64BIT - ? (TARGET_X32 ? "unsigned long long int" - : "unsigned long int") - : "unsigned int"); - } - nargs++; - current_arg_type = TREE_CHAIN (current_arg_type); - } - if (!nargs || nargs > 2) - error ("interrupt service routine can only have a pointer argument " - "and an optional integer argument"); - if (! VOID_TYPE_P (return_type)) - error ("interrupt service routine can%'t have non-void return value"); - - return NULL_TREE; -} - -static bool -ix86_ms_bitfield_layout_p (const_tree record_type) -{ - return ((TARGET_MS_BITFIELD_LAYOUT - && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type))) - || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type))); -} - -/* Returns an expression indicating where the this parameter is - located on entry to the FUNCTION. */ - -static rtx -x86_this_parameter (tree function) -{ - tree type = TREE_TYPE (function); - bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; - int nregs; - - if (TARGET_64BIT) - { - const int *parm_regs; - - if (ix86_function_type_abi (type) == MS_ABI) - parm_regs = x86_64_ms_abi_int_parameter_registers; - else - parm_regs = x86_64_int_parameter_registers; - return gen_rtx_REG (Pmode, parm_regs[aggr]); - } - - nregs = ix86_function_regparm (type, function); - - if (nregs > 0 && !stdarg_p (type)) - { - int regno; - unsigned int ccvt = ix86_get_callcvt (type); - - if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - regno = aggr ? DX_REG : CX_REG; - else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - { - regno = CX_REG; - if (aggr) - return gen_rtx_MEM (SImode, - plus_constant (Pmode, stack_pointer_rtx, 4)); - } - else - { - regno = AX_REG; - if (aggr) - { - regno = DX_REG; - if (nregs == 1) - return gen_rtx_MEM (SImode, - plus_constant (Pmode, - stack_pointer_rtx, 4)); - } - } - return gen_rtx_REG (SImode, regno); - } - - return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx, - aggr ? 8 : 4)); -} - -/* Determine whether x86_output_mi_thunk can succeed. */ - -static bool -x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset, - const_tree function) -{ - /* 64-bit can handle anything. */ - if (TARGET_64BIT) - return true; - - /* For 32-bit, everything's fine if we have one free register. */ - if (ix86_function_regparm (TREE_TYPE (function), function) < 3) - return true; - - /* Need a free register for vcall_offset. */ - if (vcall_offset) - return false; - - /* Need a free register for GOT references. */ - if (flag_pic && !targetm.binds_local_p (function)) - return false; - - /* Otherwise ok. */ - return true; -} - -/* Output the assembler code for a thunk function. THUNK_DECL is the - declaration for the thunk function itself, FUNCTION is the decl for - the target function. DELTA is an immediate constant offset to be - added to THIS. 
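For reference, the argument checks in ix86_handle_interrupt_attribute above correspond to handlers of the following shape. This usage sketch follows the GCC manual's description of the x86 interrupt attribute; uword_t and struct interrupt_frame are the names used there, not something introduced by this patch.

/* A pointer must be the first argument; the optional second argument must
   be a word-mode unsigned integer (the hardware error code); the return
   type must be void.  */
struct interrupt_frame;
typedef unsigned long long uword_t __attribute__ ((mode (__word__)));

__attribute__ ((interrupt))
void external_irq_handler (struct interrupt_frame *frame)
{
  (void) frame;
}

__attribute__ ((interrupt))
void page_fault_handler (struct interrupt_frame *frame, uword_t error_code)
{
  (void) frame;
  (void) error_code;
}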
If VCALL_OFFSET is nonzero, the word at - *(*this + vcall_offset) should be added to THIS. */ - -static void -x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta, - HOST_WIDE_INT vcall_offset, tree function) -{ - rtx this_param = x86_this_parameter (function); - rtx this_reg, tmp, fnaddr; - unsigned int tmp_regno; - rtx_insn *insn; - - if (TARGET_64BIT) - tmp_regno = R10_REG; - else - { - unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function)); - if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - tmp_regno = AX_REG; - else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - tmp_regno = DX_REG; - else - tmp_regno = CX_REG; - } - - emit_note (NOTE_INSN_PROLOGUE_END); - - /* CET is enabled, insert EB instruction. */ - if ((flag_cf_protection & CF_BRANCH)) - emit_insn (gen_nop_endbr ()); - - /* If VCALL_OFFSET, we'll need THIS in a register. Might as well - pull it in now and let DELTA benefit. */ - if (REG_P (this_param)) - this_reg = this_param; - else if (vcall_offset) - { - /* Put the this parameter into %eax. */ - this_reg = gen_rtx_REG (Pmode, AX_REG); - emit_move_insn (this_reg, this_param); - } - else - this_reg = NULL_RTX; - - /* Adjust the this parameter by a fixed constant. */ - if (delta) - { - rtx delta_rtx = GEN_INT (delta); - rtx delta_dst = this_reg ? this_reg : this_param; - - if (TARGET_64BIT) - { - if (!x86_64_general_operand (delta_rtx, Pmode)) - { - tmp = gen_rtx_REG (Pmode, tmp_regno); - emit_move_insn (tmp, delta_rtx); - delta_rtx = tmp; - } - } - - ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx); - } - - /* Adjust the this parameter by a value stored in the vtable. */ - if (vcall_offset) - { - rtx vcall_addr, vcall_mem, this_mem; - - tmp = gen_rtx_REG (Pmode, tmp_regno); - - this_mem = gen_rtx_MEM (ptr_mode, this_reg); - if (Pmode != ptr_mode) - this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem); - emit_move_insn (tmp, this_mem); - - /* Adjust the this parameter. */ - vcall_addr = plus_constant (Pmode, tmp, vcall_offset); - if (TARGET_64BIT - && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true)) - { - rtx tmp2 = gen_rtx_REG (Pmode, R11_REG); - emit_move_insn (tmp2, GEN_INT (vcall_offset)); - vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2); - } - - vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr); - if (Pmode != ptr_mode) - emit_insn (gen_addsi_1_zext (this_reg, - gen_rtx_REG (ptr_mode, - REGNO (this_reg)), - vcall_mem)); - else - ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem); - } - - /* If necessary, drop THIS back to its stack slot. 
*/ - if (this_reg && this_reg != this_param) - emit_move_insn (this_param, this_reg); - - fnaddr = XEXP (DECL_RTL (function), 0); - if (TARGET_64BIT) - { - if (!flag_pic || targetm.binds_local_p (function) - || TARGET_PECOFF) - ; - else - { - tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL); - tmp = gen_rtx_CONST (Pmode, tmp); - fnaddr = gen_const_mem (Pmode, tmp); - } - } - else - { - if (!flag_pic || targetm.binds_local_p (function)) - ; -#if TARGET_MACHO - else if (TARGET_MACHO) - { - fnaddr = machopic_indirect_call_target (DECL_RTL (function)); - fnaddr = XEXP (fnaddr, 0); - } -#endif /* TARGET_MACHO */ - else - { - tmp = gen_rtx_REG (Pmode, CX_REG); - output_set_got (tmp, NULL_RTX); - - fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT); - fnaddr = gen_rtx_CONST (Pmode, fnaddr); - fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr); - fnaddr = gen_const_mem (Pmode, fnaddr); - } - } - - /* Our sibling call patterns do not allow memories, because we have no - predicate that can distinguish between frame and non-frame memory. - For our purposes here, we can get away with (ab)using a jump pattern, - because we're going to do no optimization. */ - if (MEM_P (fnaddr)) - { - if (sibcall_insn_operand (fnaddr, word_mode)) - { - fnaddr = XEXP (DECL_RTL (function), 0); - tmp = gen_rtx_MEM (QImode, fnaddr); - tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); - tmp = emit_call_insn (tmp); - SIBLING_CALL_P (tmp) = 1; - } - else - emit_jump_insn (gen_indirect_jump (fnaddr)); - } - else - { - if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr)) - { - // CM_LARGE_PIC always uses pseudo PIC register which is - // uninitialized. Since FUNCTION is local and calling it - // doesn't go through PLT, we use scratch register %r11 as - // PIC register and initialize it here. - pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG); - ix86_init_large_pic_reg (tmp_regno); - fnaddr = legitimize_pic_address (fnaddr, - gen_rtx_REG (Pmode, tmp_regno)); - } - - if (!sibcall_insn_operand (fnaddr, word_mode)) - { - tmp = gen_rtx_REG (word_mode, tmp_regno); - if (GET_MODE (fnaddr) != word_mode) - fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); - emit_move_insn (tmp, fnaddr); - fnaddr = tmp; - } - - tmp = gen_rtx_MEM (QImode, fnaddr); - tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); - tmp = emit_call_insn (tmp); - SIBLING_CALL_P (tmp) = 1; - } - emit_barrier (); - - /* Emit just enough of rest_of_compilation to get the insns emitted. - Note that use_thunk calls assemble_start_function et al. 
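Stripped of the RTL details, the thunk emitted by x86_output_mi_thunk above behaves like the following C model: adjust the incoming this pointer by DELTA, optionally by a value fetched at VCALL_OFFSET from the object's vtable, then tail-call the real function. All names below are hypothetical; this only illustrates the pointer arithmetic.

#include <stddef.h>

typedef void (*target_fn) (void *thisp);

static void thunk_model (void *thisp, target_fn function,
                         ptrdiff_t delta, ptrdiff_t vcall_offset)
{
  char *p = (char *) thisp + delta;            /* THIS += DELTA */
  if (vcall_offset != 0)
    {
      /* THIS += *(*THIS + VCALL_OFFSET); the vptr is the first word.  */
      char *vtable = *(char **) p;
      p += *(ptrdiff_t *) (vtable + vcall_offset);
    }
  function (p);                                /* sibling call, no new frame */
}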
*/ - insn = get_insns (); - shorten_branches (insn); - final_start_function (insn, file, 1); - final (insn, file, 1); - final_end_function (); -} - -static void -x86_file_start (void) -{ - default_file_start (); - if (TARGET_16BIT) - fputs ("\t.code16gcc\n", asm_out_file); -#if TARGET_MACHO - darwin_file_start (); -#endif - if (X86_FILE_START_VERSION_DIRECTIVE) - fputs ("\t.version\t\"01.01\"\n", asm_out_file); - if (X86_FILE_START_FLTUSED) - fputs ("\t.global\t__fltused\n", asm_out_file); - if (ix86_asm_dialect == ASM_INTEL) - fputs ("\t.intel_syntax noprefix\n", asm_out_file); -} - -int -x86_field_alignment (tree type, int computed) -{ - machine_mode mode; - - if (TARGET_64BIT || TARGET_ALIGN_DOUBLE) - return computed; - if (TARGET_IAMCU) - return iamcu_alignment (type, computed); - mode = TYPE_MODE (strip_array_types (type)); - if (mode == DFmode || mode == DCmode - || GET_MODE_CLASS (mode) == MODE_INT - || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) - return MIN (32, computed); - return computed; -} - -/* Print call to TARGET to FILE. */ - -static void -x86_print_call_or_nop (FILE *file, const char *target) -{ - if (flag_nop_mcount || !strcmp (target, "nop")) - /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ - fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); - else - fprintf (file, "1:\tcall\t%s\n", target); -} - -static bool -current_fentry_name (const char **name) -{ - tree attr = lookup_attribute ("fentry_name", - DECL_ATTRIBUTES (current_function_decl)); - if (!attr) - return false; - *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); - return true; -} - -static bool -current_fentry_section (const char **name) -{ - tree attr = lookup_attribute ("fentry_section", - DECL_ATTRIBUTES (current_function_decl)); - if (!attr) - return false; - *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); - return true; -} - -/* Output assembler code to FILE to increment profiler label # LABELNO - for profiling a function entry. */ -void -x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) -{ - if (cfun->machine->endbr_queued_at_entrance) - fprintf (file, "\t%s\n", TARGET_64BIT ? "endbr64" : "endbr32"); - - const char *mcount_name = MCOUNT_NAME; - - if (current_fentry_name (&mcount_name)) - ; - else if (fentry_name) - mcount_name = fentry_name; - else if (flag_fentry) - mcount_name = MCOUNT_NAME_BEFORE_PROLOGUE; - - if (TARGET_64BIT) - { -#ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno); -#endif - - if (!TARGET_PECOFF && flag_pic) - fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name); - else - x86_print_call_or_nop (file, mcount_name); - } - else if (flag_pic) - { -#ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n", - LPREFIX, labelno); -#endif - fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name); - } - else - { -#ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n", - LPREFIX, labelno); -#endif - x86_print_call_or_nop (file, mcount_name); - } - - if (flag_record_mcount - || lookup_attribute ("fentry_section", - DECL_ATTRIBUTES (current_function_decl))) - { - const char *sname = "__mcount_loc"; - - if (current_fentry_section (&sname)) - ; - else if (fentry_section) - sname = fentry_section; - - fprintf (file, "\t.section %s, \"a\",@progbits\n", sname); - fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? 
"quad" : "long"); - fprintf (file, "\t.previous\n"); - } -} - -/* We don't have exact information about the insn sizes, but we may assume - quite safely that we are informed about all 1 byte insns and memory - address sizes. This is enough to eliminate unnecessary padding in - 99% of cases. */ - -int -ix86_min_insn_size (rtx_insn *insn) -{ - int l = 0, len; - - if (!INSN_P (insn) || !active_insn_p (insn)) - return 0; - - /* Discard alignments we've emit and jump instructions. */ - if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE - && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) - return 0; - - /* Important case - calls are always 5 bytes. - It is common to have many calls in the row. */ - if (CALL_P (insn) - && symbolic_reference_mentioned_p (PATTERN (insn)) - && !SIBLING_CALL_P (insn)) - return 5; - len = get_attr_length (insn); - if (len <= 1) - return 1; - - /* For normal instructions we rely on get_attr_length being exact, - with a few exceptions. */ - if (!JUMP_P (insn)) - { - enum attr_type type = get_attr_type (insn); - - switch (type) - { - case TYPE_MULTI: - if (GET_CODE (PATTERN (insn)) == ASM_INPUT - || asm_noperands (PATTERN (insn)) >= 0) - return 0; - break; - case TYPE_OTHER: - case TYPE_FCMP: - break; - default: - /* Otherwise trust get_attr_length. */ - return len; - } - - l = get_attr_length_address (insn); - if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) - l = 4; - } - if (l) - return 1+l; - else - return 2; -} - -#ifdef ASM_OUTPUT_MAX_SKIP_PAD - -/* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte - window. */ - -static void -ix86_avoid_jump_mispredicts (void) -{ - rtx_insn *insn, *start = get_insns (); - int nbytes = 0, njumps = 0; - bool isjump = false; - - /* Look for all minimal intervals of instructions containing 4 jumps. - The intervals are bounded by START and INSN. NBYTES is the total - size of instructions in the interval including INSN and not including - START. When the NBYTES is smaller than 16 bytes, it is possible - that the end of START and INSN ends up in the same 16byte page. - - The smallest offset in the page INSN can start is the case where START - ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN). - We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN). - - Don't consider asm goto as jump, while it can contain a jump, it doesn't - have to, control transfer to label(s) can be performed through other - means, and also we estimate minimum length of all asm stmts as 0. */ - for (insn = start; insn; insn = NEXT_INSN (insn)) - { - int min_size; - - if (LABEL_P (insn)) - { - align_flags alignment = label_to_alignment (insn); - int align = alignment.levels[0].log; - int max_skip = alignment.levels[0].maxskip; - - if (max_skip > 15) - max_skip = 15; - /* If align > 3, only up to 16 - max_skip - 1 bytes can be - already in the current 16 byte page, because otherwise - ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer - bytes to reach 16 byte boundary. 
*/ - if (align <= 0 - || (align <= 3 && max_skip != (1 << align) - 1)) - max_skip = 0; - if (dump_file) - fprintf (dump_file, "Label %i with max_skip %i\n", - INSN_UID (insn), max_skip); - if (max_skip) - { - while (nbytes + max_skip >= 16) - { - start = NEXT_INSN (start); - if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) - || CALL_P (start)) - njumps--, isjump = true; - else - isjump = false; - nbytes -= ix86_min_insn_size (start); - } - } - continue; - } - - min_size = ix86_min_insn_size (insn); - nbytes += min_size; - if (dump_file) - fprintf (dump_file, "Insn %i estimated to %i bytes\n", - INSN_UID (insn), min_size); - if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0) - || CALL_P (insn)) - njumps++; - else - continue; - - while (njumps > 3) - { - start = NEXT_INSN (start); - if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) - || CALL_P (start)) - njumps--, isjump = true; - else - isjump = false; - nbytes -= ix86_min_insn_size (start); - } - gcc_assert (njumps >= 0); - if (dump_file) - fprintf (dump_file, "Interval %i to %i has %i bytes\n", - INSN_UID (start), INSN_UID (insn), nbytes); - - if (njumps == 3 && isjump && nbytes < 16) - { - int padsize = 15 - nbytes + ix86_min_insn_size (insn); - - if (dump_file) - fprintf (dump_file, "Padding insn %i by %i bytes!\n", - INSN_UID (insn), padsize); - emit_insn_before (gen_pad (GEN_INT (padsize)), insn); - } - } -} -#endif - -/* AMD Athlon works faster - when RET is not destination of conditional jump or directly preceded - by other jump instruction. We avoid the penalty by inserting NOP just - before the RET instructions in such cases. */ -static void -ix86_pad_returns (void) -{ - edge e; - edge_iterator ei; - - FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) - { - basic_block bb = e->src; - rtx_insn *ret = BB_END (bb); - rtx_insn *prev; - bool replace = false; - - if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret)) - || optimize_bb_for_size_p (bb)) - continue; - for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) - if (active_insn_p (prev) || LABEL_P (prev)) - break; - if (prev && LABEL_P (prev)) - { - edge e; - edge_iterator ei; - - FOR_EACH_EDGE (e, ei, bb->preds) - if (EDGE_FREQUENCY (e) && e->src->index >= 0 - && !(e->flags & EDGE_FALLTHRU)) - { - replace = true; - break; - } - } - if (!replace) - { - prev = prev_active_insn (ret); - if (prev - && ((JUMP_P (prev) && any_condjump_p (prev)) - || CALL_P (prev))) - replace = true; - /* Empty functions get branch mispredict even when - the jump destination is not visible to us. */ - if (!prev && !optimize_function_for_size_p (cfun)) - replace = true; - } - if (replace) - { - emit_jump_insn_before (gen_simple_return_internal_long (), ret); - delete_insn (ret); - } - } -} - -/* Count the minimum number of instructions in BB. Return 4 if the - number of instructions >= 4. */ - -static int -ix86_count_insn_bb (basic_block bb) -{ - rtx_insn *insn; - int insn_count = 0; - - /* Count number of instructions in this block. Return 4 if the number - of instructions >= 4. */ - FOR_BB_INSNS (bb, insn) - { - /* Only happen in exit blocks. */ - if (JUMP_P (insn) - && ANY_RETURN_P (PATTERN (insn))) - break; - - if (NONDEBUG_INSN_P (insn) - && GET_CODE (PATTERN (insn)) != USE - && GET_CODE (PATTERN (insn)) != CLOBBER) - { - insn_count++; - if (insn_count >= 4) - return insn_count; - } - } - - return insn_count; -} - - -/* Count the minimum number of instructions in code path in BB. - Return 4 if the number of instructions >= 4. 
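The bookkeeping in ix86_avoid_jump_mispredicts above can be pictured as a sliding window: keep a running byte and jump count, evict instructions from the left while more than three jumps are in the window, and pad when a fourth jump would otherwise share a 16-byte page. A simplified, self-contained model; the instruction sizes and loop structure are illustrative, not the pass itself.

#include <stdio.h>

struct toy_insn { int size; int is_jump; };

int main (void)
{
  struct toy_insn insn[] = { {2,0}, {5,1}, {2,1}, {1,0}, {2,1}, {2,1}, {3,0} };
  int n = sizeof insn / sizeof insn[0];
  int start = 0, nbytes = 0, njumps = 0;

  for (int i = 0; i < n; i++)
    {
      int dropped_jump = 0;

      nbytes += insn[i].size;
      if (insn[i].is_jump)
        njumps++;

      /* Shrink the window from the left until it holds at most 3 jumps.  */
      while (njumps > 3)
        {
          if (insn[start].is_jump)
            {
              njumps--;
              dropped_jump = 1;
            }
          else
            dropped_jump = 0;
          nbytes -= insn[start].size;
          start++;
        }

      /* Four jumps would have fit in one 16-byte page: pad before insn i.  */
      if (njumps == 3 && dropped_jump && nbytes < 16)
        printf ("pad %d bytes before insn %d\n",
                15 - nbytes + insn[i].size, i);
    }
  return 0;
}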
*/ - -static int -ix86_count_insn (basic_block bb) -{ - edge e; - edge_iterator ei; - int min_prev_count; - - /* Only bother counting instructions along paths with no - more than 2 basic blocks between entry and exit. Given - that BB has an edge to exit, determine if a predecessor - of BB has an edge from entry. If so, compute the number - of instructions in the predecessor block. If there - happen to be multiple such blocks, compute the minimum. */ - min_prev_count = 4; - FOR_EACH_EDGE (e, ei, bb->preds) - { - edge prev_e; - edge_iterator prev_ei; - - if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) - { - min_prev_count = 0; - break; - } - FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds) - { - if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) - { - int count = ix86_count_insn_bb (e->src); - if (count < min_prev_count) - min_prev_count = count; - break; - } - } - } - - if (min_prev_count < 4) - min_prev_count += ix86_count_insn_bb (bb); - - return min_prev_count; -} - -/* Pad short function to 4 instructions. */ - -static void -ix86_pad_short_function (void) -{ - edge e; - edge_iterator ei; - - FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) - { - rtx_insn *ret = BB_END (e->src); - if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret))) - { - int insn_count = ix86_count_insn (e->src); - - /* Pad short function. */ - if (insn_count < 4) - { - rtx_insn *insn = ret; - - /* Find epilogue. */ - while (insn - && (!NOTE_P (insn) - || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)) - insn = PREV_INSN (insn); - - if (!insn) - insn = ret; - - /* Two NOPs count as one instruction. */ - insn_count = 2 * (4 - insn_count); - emit_insn_before (gen_nops (GEN_INT (insn_count)), insn); - } - } - } -} - -/* Fix up a Windows system unwinder issue. If an EH region falls through into - the epilogue, the Windows system unwinder will apply epilogue logic and - produce incorrect offsets. This can be avoided by adding a nop between - the last insn that can throw and the first insn of the epilogue. */ - -static void -ix86_seh_fixup_eh_fallthru (void) -{ - edge e; - edge_iterator ei; - - FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) - { - rtx_insn *insn, *next; - - /* Find the beginning of the epilogue. */ - for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn)) - if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG) - break; - if (insn == NULL) - continue; - - /* We only care about preceding insns that can throw. */ - insn = prev_active_insn (insn); - if (insn == NULL || !can_throw_internal (insn)) - continue; - - /* Do not separate calls from their debug information. */ - for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next)) - if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION) - insn = next; - else - break; - - emit_insn_after (gen_nops (const1_rtx), insn); - } -} - -/* Implement machine specific optimizations. We implement padding of returns - for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ -static void -ix86_reorg (void) -{ - /* We are freeing block_for_insn in the toplev to keep compatibility - with old MDEP_REORGS that are not CFG based. Recompute it now. 
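ix86_pad_short_function above pads fast return paths up to four instructions, and because two NOPs are counted as one instruction it emits 2 * (4 - insn_count) NOPs; e.g. a two-instruction path gets four NOPs. A one-line sketch of that arithmetic (the function name is hypothetical):

static int nops_to_emit (int insn_count)
{
  return insn_count >= 4 ? 0 : 2 * (4 - insn_count);
}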
*/ - compute_bb_for_insn (); - - if (TARGET_SEH && current_function_has_exception_handlers ()) - ix86_seh_fixup_eh_fallthru (); - - if (optimize && optimize_function_for_speed_p (cfun)) - { - if (TARGET_PAD_SHORT_FUNCTION) - ix86_pad_short_function (); - else if (TARGET_PAD_RETURNS) - ix86_pad_returns (); -#ifdef ASM_OUTPUT_MAX_SKIP_PAD - if (TARGET_FOUR_JUMP_LIMIT) - ix86_avoid_jump_mispredicts (); -#endif - } -} - -/* Return nonzero when QImode register that must be represented via REX prefix - is used. */ -bool -x86_extended_QIreg_mentioned_p (rtx_insn *insn) -{ - int i; - extract_insn_cached (insn); - for (i = 0; i < recog_data.n_operands; i++) - if (GENERAL_REG_P (recog_data.operand[i]) - && !QI_REGNO_P (REGNO (recog_data.operand[i]))) - return true; - return false; -} - -/* Return true when INSN mentions register that must be encoded using REX - prefix. */ -bool -x86_extended_reg_mentioned_p (rtx insn) -{ - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST) - { - const_rtx x = *iter; - if (REG_P (x) - && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x)))) - return true; - } - return false; -} - -/* If profitable, negate (without causing overflow) integer constant - of mode MODE at location LOC. Return true in this case. */ -bool -x86_maybe_negate_const_int (rtx *loc, machine_mode mode) -{ - HOST_WIDE_INT val; - - if (!CONST_INT_P (*loc)) - return false; - - switch (mode) - { - case E_DImode: - /* DImode x86_64 constants must fit in 32 bits. */ - gcc_assert (x86_64_immediate_operand (*loc, mode)); - - mode = SImode; - break; - - case E_SImode: - case E_HImode: - case E_QImode: - break; - - default: - gcc_unreachable (); - } - - /* Avoid overflows. */ - if (mode_signbit_p (mode, *loc)) - return false; - - val = INTVAL (*loc); - - /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. - Exceptions: -128 encodes smaller than 128, so swap sign and op. */ - if ((val < 0 && val != -128) - || val == 128) - { - *loc = GEN_INT (-val); - return true; - } - - return false; -} - -/* Generate an unsigned DImode/SImode to FP conversion. This is the same code - optabs would emit if we didn't have TFmode patterns. 
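The immediate-negation rule in x86_maybe_negate_const_int above (prefer `subl $4,%eax` over `addl $-4,%eax`, except around +/-128 because -128 fits a signed 8-bit immediate while +128 does not) can be checked with a few lines of plain C; the sign-bit overflow guard of the real function is omitted here.

#include <stdio.h>

static int maybe_negate (long long *val)
{
  if ((*val < 0 && *val != -128) || *val == 128)
    {
      *val = -*val;
      return 1;                 /* caller flips ADD <-> SUB */
    }
  return 0;
}

int main (void)
{
  long long a = -4, b = -128, c = 128;
  printf ("%d %lld\n", maybe_negate (&a), a);   /* 1 4    -> sub $4            */
  printf ("%d %lld\n", maybe_negate (&b), b);   /* 0 -128 -> keep add $-128    */
  printf ("%d %lld\n", maybe_negate (&c), c);   /* 1 -128 -> sub becomes add   */
  return 0;
}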
   */
-
-void
-x86_emit_floatuns (rtx operands[2])
-{
-  rtx_code_label *neglab, *donelab;
-  rtx i0, i1, f0, in, out;
-  machine_mode mode, inmode;
-
-  inmode = GET_MODE (operands[1]);
-  gcc_assert (inmode == SImode || inmode == DImode);
-
-  out = operands[0];
-  in = force_reg (inmode, operands[1]);
-  mode = GET_MODE (out);
-  neglab = gen_label_rtx ();
-  donelab = gen_label_rtx ();
-  f0 = gen_reg_rtx (mode);
-
-  emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
-
-  expand_float (out, in, 0);
-
-  emit_jump_insn (gen_jump (donelab));
-  emit_barrier ();
-
-  emit_label (neglab);
-
-  i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
-                            1, OPTAB_DIRECT);
-  i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
-                            1, OPTAB_DIRECT);
-  i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
-
-  expand_float (f0, i0, 0);
-
-  emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
-
-  emit_label (donelab);
-}
-
-static bool canonicalize_perm (struct expand_vec_perm_d *d);
-static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
-static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
-static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
-
-/* Get a vector mode of the same size as the original but with elements
-   twice as wide.  This is only guaranteed to apply to integral vectors.  */
-
-static inline machine_mode
-get_mode_wider_vector (machine_mode o)
-{
-  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
-  machine_mode n = GET_MODE_WIDER_MODE (o).require ();
-  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
-  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
-  return n;
-}
-
-/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
-   fill target with val via vec_duplicate.  */
-
-static bool
-ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
-{
-  bool ok;
-  rtx_insn *insn;
-  rtx dup;
-
-  /* First attempt to recognize VAL as-is.  */
-  dup = gen_vec_duplicate (mode, val);
-  insn = emit_insn (gen_rtx_SET (target, dup));
-  if (recog_memoized (insn) < 0)
-    {
-      rtx_insn *seq;
-      machine_mode innermode = GET_MODE_INNER (mode);
-      rtx reg;
-
-      /* If that fails, force VAL into a register.  */
-
-      start_sequence ();
-      reg = force_reg (innermode, val);
-      if (GET_MODE (reg) != innermode)
-        reg = gen_lowpart (innermode, reg);
-      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
-      seq = get_insns ();
-      end_sequence ();
-      if (seq)
-        emit_insn_before (seq, insn);
-
-      ok = recog_memoized (insn) >= 0;
-      gcc_assert (ok);
-    }
-  return true;
-}
-
-/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
-   with all elements equal to VAR.  Return true if successful.
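x86_emit_floatuns above expands the classic unsigned-to-float sequence at the RTL level; as plain C for a 64-bit input it looks like the sketch below (the round-to-odd halving keeps the discarded bit so that doubling the converted half still rounds correctly). The function name is hypothetical.

#include <stdint.h>
#include <stdio.h>

static double u64_to_double (uint64_t x)
{
  if ((int64_t) x >= 0)
    return (double) (int64_t) x;          /* sign bit clear: convert directly */
  uint64_t half = (x >> 1) | (x & 1);     /* halve, keeping the lost bit */
  double d = (double) (int64_t) half;
  return d + d;                           /* double the result */
}

int main (void)
{
  printf ("%g\n", u64_to_double (18446744073709551615ull));  /* ~1.84467e+19 */
  return 0;
}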
*/ - -static bool -ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, - rtx target, rtx val) -{ - bool ok; - - switch (mode) - { - case E_V2SImode: - case E_V2SFmode: - if (!mmx_ok) - return false; - /* FALLTHRU */ - - case E_V4DFmode: - case E_V4DImode: - case E_V8SFmode: - case E_V8SImode: - case E_V2DFmode: - case E_V2DImode: - case E_V4SFmode: - case E_V4SImode: - case E_V16SImode: - case E_V8DImode: - case E_V16SFmode: - case E_V8DFmode: - return ix86_vector_duplicate_value (mode, target, val); - - case E_V4HImode: - if (!mmx_ok) - return false; - if (TARGET_SSE || TARGET_3DNOW_A) - { - rtx x; - - val = gen_lowpart (SImode, val); - x = gen_rtx_TRUNCATE (HImode, val); - x = gen_rtx_VEC_DUPLICATE (mode, x); - emit_insn (gen_rtx_SET (target, x)); - return true; - } - goto widen; - - case E_V8QImode: - if (!mmx_ok) - return false; - goto widen; - - case E_V8HImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - - if (TARGET_SSE2) - { - struct expand_vec_perm_d dperm; - rtx tmp1, tmp2; - - permute: - memset (&dperm, 0, sizeof (dperm)); - dperm.target = target; - dperm.vmode = mode; - dperm.nelt = GET_MODE_NUNITS (mode); - dperm.op0 = dperm.op1 = gen_reg_rtx (mode); - dperm.one_operand_p = true; - - /* Extend to SImode using a paradoxical SUBREG. */ - tmp1 = gen_reg_rtx (SImode); - emit_move_insn (tmp1, gen_lowpart (SImode, val)); - - /* Insert the SImode value as low element of a V4SImode vector. */ - tmp2 = gen_reg_rtx (V4SImode); - emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); - emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); - - ok = (expand_vec_perm_1 (&dperm) - || expand_vec_perm_broadcast_1 (&dperm)); - gcc_assert (ok); - return ok; - } - goto widen; - - case E_V16QImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - - if (TARGET_SSE2) - goto permute; - goto widen; - - widen: - /* Replicate the value once into the next wider mode and recurse. */ - { - machine_mode smode, wsmode, wvmode; - rtx x; - - smode = GET_MODE_INNER (mode); - wvmode = get_mode_wider_vector (mode); - wsmode = GET_MODE_INNER (wvmode); - - val = convert_modes (wsmode, smode, val, true); - x = expand_simple_binop (wsmode, ASHIFT, val, - GEN_INT (GET_MODE_BITSIZE (smode)), - NULL_RTX, 1, OPTAB_LIB_WIDEN); - val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); - - x = gen_reg_rtx (wvmode); - ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); - gcc_assert (ok); - emit_move_insn (target, gen_lowpart (GET_MODE (target), x)); - return ok; - } - - case E_V16HImode: - case E_V32QImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - else - { - machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode); - rtx x = gen_reg_rtx (hvmode); - - ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); - gcc_assert (ok); - - x = gen_rtx_VEC_CONCAT (mode, x, x); - emit_insn (gen_rtx_SET (target, x)); - } - return true; - - case E_V64QImode: - case E_V32HImode: - if (TARGET_AVX512BW) - return ix86_vector_duplicate_value (mode, target, val); - else - { - machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode); - rtx x = gen_reg_rtx (hvmode); - - ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); - gcc_assert (ok); - - x = gen_rtx_VEC_CONCAT (mode, x, x); - emit_insn (gen_rtx_SET (target, x)); - } - return true; - - default: - return false; - } -} - -/* A subroutine of ix86_expand_vector_init. 
Store into TARGET a vector - whose ONE_VAR element is VAR, and other elements are zero. Return true - if successful. */ - -static bool -ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, - rtx target, rtx var, int one_var) -{ - machine_mode vsimode; - rtx new_target; - rtx x, tmp; - bool use_vector_set = false; - rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V2DImode: - /* For SSE4.1, we normally use vector set. But if the second - element is zero and inter-unit moves are OK, we use movq - instead. */ - use_vector_set = (TARGET_64BIT && TARGET_SSE4_1 - && !(TARGET_INTER_UNIT_MOVES_TO_VEC - && one_var == 0)); - break; - case E_V16QImode: - case E_V4SImode: - case E_V4SFmode: - use_vector_set = TARGET_SSE4_1; - break; - case E_V8HImode: - use_vector_set = TARGET_SSE2; - break; - case E_V4HImode: - use_vector_set = TARGET_SSE || TARGET_3DNOW_A; - break; - case E_V32QImode: - case E_V16HImode: - use_vector_set = TARGET_AVX; - break; - case E_V8SImode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv8si_0; - break; - case E_V8SFmode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv8sf_0; - break; - case E_V4DFmode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv4df_0; - break; - case E_V4DImode: - /* Use ix86_expand_vector_set in 64bit mode only. */ - use_vector_set = TARGET_AVX && TARGET_64BIT; - gen_vec_set_0 = gen_vec_setv4di_0; - break; - case E_V16SImode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv16si_0; - break; - case E_V16SFmode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv16sf_0; - break; - case E_V8DFmode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv8df_0; - break; - case E_V8DImode: - /* Use ix86_expand_vector_set in 64bit mode only. */ - use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; - gen_vec_set_0 = gen_vec_setv8di_0; - break; - default: - break; - } - - if (use_vector_set) - { - if (gen_vec_set_0 && one_var == 0) - { - var = force_reg (GET_MODE_INNER (mode), var); - emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var)); - return true; - } - emit_insn (gen_rtx_SET (target, CONST0_RTX (mode))); - var = force_reg (GET_MODE_INNER (mode), var); - ix86_expand_vector_set (mmx_ok, target, var, one_var); - return true; - } - - switch (mode) - { - case E_V2SFmode: - case E_V2SImode: - if (!mmx_ok) - return false; - /* FALLTHRU */ - - case E_V2DFmode: - case E_V2DImode: - if (one_var != 0) - return false; - var = force_reg (GET_MODE_INNER (mode), var); - x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); - emit_insn (gen_rtx_SET (target, x)); - return true; - - case E_V4SFmode: - case E_V4SImode: - if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) - new_target = gen_reg_rtx (mode); - else - new_target = target; - var = force_reg (GET_MODE_INNER (mode), var); - x = gen_rtx_VEC_DUPLICATE (mode, var); - x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); - emit_insn (gen_rtx_SET (new_target, x)); - if (one_var != 0) - { - /* We need to shuffle the value to the correct position, so - create a new pseudo to store the intermediate result. */ - - /* With SSE2, we can use the integer shuffle insns. */ - if (mode != V4SFmode && TARGET_SSE2) - { - emit_insn (gen_sse2_pshufd_1 (new_target, new_target, - const1_rtx, - GEN_INT (one_var == 1 ? 0 : 1), - GEN_INT (one_var == 2 ? 0 : 1), - GEN_INT (one_var == 3 ? 
0 : 1))); - if (target != new_target) - emit_move_insn (target, new_target); - return true; - } - - /* Otherwise convert the intermediate result to V4SFmode and - use the SSE1 shuffle instructions. */ - if (mode != V4SFmode) - { - tmp = gen_reg_rtx (V4SFmode); - emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); - } - else - tmp = new_target; - - emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp, - const1_rtx, - GEN_INT (one_var == 1 ? 0 : 1), - GEN_INT (one_var == 2 ? 0+4 : 1+4), - GEN_INT (one_var == 3 ? 0+4 : 1+4))); - - if (mode != V4SFmode) - emit_move_insn (target, gen_lowpart (V4SImode, tmp)); - else if (tmp != target) - emit_move_insn (target, tmp); - } - else if (target != new_target) - emit_move_insn (target, new_target); - return true; - - case E_V8HImode: - case E_V16QImode: - vsimode = V4SImode; - goto widen; - case E_V4HImode: - case E_V8QImode: - if (!mmx_ok) - return false; - vsimode = V2SImode; - goto widen; - widen: - if (one_var != 0) - return false; - - /* Zero extend the variable element to SImode and recurse. */ - var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); - - x = gen_reg_rtx (vsimode); - if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, - var, one_var)) - gcc_unreachable (); - - emit_move_insn (target, gen_lowpart (mode, x)); - return true; - - default: - return false; - } -} - -/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector - consisting of the values in VALS. It is known that all elements - except ONE_VAR are constants. Return true if successful. */ - -static bool -ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, - rtx target, rtx vals, int one_var) -{ - rtx var = XVECEXP (vals, 0, one_var); - machine_mode wmode; - rtx const_vec, x; - - const_vec = copy_rtx (vals); - XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode)); - const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0)); - - switch (mode) - { - case E_V2DFmode: - case E_V2DImode: - case E_V2SFmode: - case E_V2SImode: - /* For the two element vectors, it's just as easy to use - the general case. */ - return false; - - case E_V4DImode: - /* Use ix86_expand_vector_set in 64bit mode only. */ - if (!TARGET_64BIT) - return false; - /* FALLTHRU */ - case E_V4DFmode: - case E_V8SFmode: - case E_V8SImode: - case E_V16HImode: - case E_V32QImode: - case E_V4SFmode: - case E_V4SImode: - case E_V8HImode: - case E_V4HImode: - break; - - case E_V16QImode: - if (TARGET_SSE4_1) - break; - wmode = V8HImode; - goto widen; - case E_V8QImode: - wmode = V4HImode; - goto widen; - widen: - /* There's no way to set one QImode entry easily. Combine - the variable value with its adjacent constant value, and - promote to an HImode set. 
*/ - x = XVECEXP (vals, 0, one_var ^ 1); - if (one_var & 1) - { - var = convert_modes (HImode, QImode, var, true); - var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8), - NULL_RTX, 1, OPTAB_LIB_WIDEN); - x = GEN_INT (INTVAL (x) & 0xff); - } - else - { - var = convert_modes (HImode, QImode, var, true); - x = gen_int_mode (UINTVAL (x) << 8, HImode); - } - if (x != const0_rtx) - var = expand_simple_binop (HImode, IOR, var, x, var, - 1, OPTAB_LIB_WIDEN); - - x = gen_reg_rtx (wmode); - emit_move_insn (x, gen_lowpart (wmode, const_vec)); - ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1); - - emit_move_insn (target, gen_lowpart (mode, x)); - return true; - - default: - return false; - } - - emit_move_insn (target, const_vec); - ix86_expand_vector_set (mmx_ok, target, var, one_var); - return true; -} - -/* A subroutine of ix86_expand_vector_init_general. Use vector - concatenate to handle the most general case: all values variable, - and none identical. */ - -static void -ix86_expand_vector_init_concat (machine_mode mode, - rtx target, rtx *ops, int n) -{ - machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode; - rtx first[16], second[8], third[4]; - rtvec v; - int i, j; - - switch (n) - { - case 2: - switch (mode) - { - case E_V16SImode: - cmode = V8SImode; - break; - case E_V16SFmode: - cmode = V8SFmode; - break; - case E_V8DImode: - cmode = V4DImode; - break; - case E_V8DFmode: - cmode = V4DFmode; - break; - case E_V8SImode: - cmode = V4SImode; - break; - case E_V8SFmode: - cmode = V4SFmode; - break; - case E_V4DImode: - cmode = V2DImode; - break; - case E_V4DFmode: - cmode = V2DFmode; - break; - case E_V4SImode: - cmode = V2SImode; - break; - case E_V4SFmode: - cmode = V2SFmode; - break; - case E_V2DImode: - cmode = DImode; - break; - case E_V2SImode: - cmode = SImode; - break; - case E_V2DFmode: - cmode = DFmode; - break; - case E_V2SFmode: - cmode = SFmode; - break; - default: - gcc_unreachable (); - } - - if (!register_operand (ops[1], cmode)) - ops[1] = force_reg (cmode, ops[1]); - if (!register_operand (ops[0], cmode)) - ops[0] = force_reg (cmode, ops[0]); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0], - ops[1]))); - break; - - case 4: - switch (mode) - { - case E_V4DImode: - cmode = V2DImode; - break; - case E_V4DFmode: - cmode = V2DFmode; - break; - case E_V4SImode: - cmode = V2SImode; - break; - case E_V4SFmode: - cmode = V2SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - - case 8: - switch (mode) - { - case E_V8DImode: - cmode = V2DImode; - hmode = V4DImode; - break; - case E_V8DFmode: - cmode = V2DFmode; - hmode = V4DFmode; - break; - case E_V8SImode: - cmode = V2SImode; - hmode = V4SImode; - break; - case E_V8SFmode: - cmode = V2SFmode; - hmode = V4SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - - case 16: - switch (mode) - { - case E_V16SImode: - cmode = V2SImode; - hmode = V4SImode; - gmode = V8SImode; - break; - case E_V16SFmode: - cmode = V2SFmode; - hmode = V4SFmode; - gmode = V8SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - -half: - /* FIXME: We process inputs backward to help RA. PR 36222. 
*/ - i = n - 1; - j = (n >> 1) - 1; - for (; i > 0; i -= 2, j--) - { - first[j] = gen_reg_rtx (cmode); - v = gen_rtvec (2, ops[i - 1], ops[i]); - ix86_expand_vector_init (false, first[j], - gen_rtx_PARALLEL (cmode, v)); - } - - n >>= 1; - if (n > 4) - { - gcc_assert (hmode != VOIDmode); - gcc_assert (gmode != VOIDmode); - for (i = j = 0; i < n; i += 2, j++) - { - second[j] = gen_reg_rtx (hmode); - ix86_expand_vector_init_concat (hmode, second [j], - &first [i], 2); - } - n >>= 1; - for (i = j = 0; i < n; i += 2, j++) - { - third[j] = gen_reg_rtx (gmode); - ix86_expand_vector_init_concat (gmode, third[j], - &second[i], 2); - } - n >>= 1; - ix86_expand_vector_init_concat (mode, target, third, n); - } - else if (n > 2) - { - gcc_assert (hmode != VOIDmode); - for (i = j = 0; i < n; i += 2, j++) - { - second[j] = gen_reg_rtx (hmode); - ix86_expand_vector_init_concat (hmode, second [j], - &first [i], 2); - } - n >>= 1; - ix86_expand_vector_init_concat (mode, target, second, n); - } - else - ix86_expand_vector_init_concat (mode, target, first, n); - break; - - default: - gcc_unreachable (); - } -} - -/* A subroutine of ix86_expand_vector_init_general. Use vector - interleave to handle the most general case: all values variable, - and none identical. */ - -static void -ix86_expand_vector_init_interleave (machine_mode mode, - rtx target, rtx *ops, int n) -{ - machine_mode first_imode, second_imode, third_imode, inner_mode; - int i, j; - rtx op0, op1; - rtx (*gen_load_even) (rtx, rtx, rtx); - rtx (*gen_interleave_first_low) (rtx, rtx, rtx); - rtx (*gen_interleave_second_low) (rtx, rtx, rtx); - - switch (mode) - { - case E_V8HImode: - gen_load_even = gen_vec_setv8hi; - gen_interleave_first_low = gen_vec_interleave_lowv4si; - gen_interleave_second_low = gen_vec_interleave_lowv2di; - inner_mode = HImode; - first_imode = V4SImode; - second_imode = V2DImode; - third_imode = VOIDmode; - break; - case E_V16QImode: - gen_load_even = gen_vec_setv16qi; - gen_interleave_first_low = gen_vec_interleave_lowv8hi; - gen_interleave_second_low = gen_vec_interleave_lowv4si; - inner_mode = QImode; - first_imode = V8HImode; - second_imode = V4SImode; - third_imode = V2DImode; - break; - default: - gcc_unreachable (); - } - - for (i = 0; i < n; i++) - { - /* Extend the odd elment to SImode using a paradoxical SUBREG. */ - op0 = gen_reg_rtx (SImode); - emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); - - /* Insert the SImode value as low element of V4SImode vector. */ - op1 = gen_reg_rtx (V4SImode); - op0 = gen_rtx_VEC_MERGE (V4SImode, - gen_rtx_VEC_DUPLICATE (V4SImode, - op0), - CONST0_RTX (V4SImode), - const1_rtx); - emit_insn (gen_rtx_SET (op1, op0)); - - /* Cast the V4SImode vector back to a vector in orignal mode. */ - op0 = gen_reg_rtx (mode); - emit_move_insn (op0, gen_lowpart (mode, op1)); - - /* Load even elements into the second position. */ - emit_insn (gen_load_even (op0, - force_reg (inner_mode, - ops [i + i + 1]), - const1_rtx)); - - /* Cast vector to FIRST_IMODE vector. */ - ops[i] = gen_reg_rtx (first_imode); - emit_move_insn (ops[i], gen_lowpart (first_imode, op0)); - } - - /* Interleave low FIRST_IMODE vectors. */ - for (i = j = 0; i < n; i += 2, j++) - { - op0 = gen_reg_rtx (first_imode); - emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1])); - - /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */ - ops[j] = gen_reg_rtx (second_imode); - emit_move_insn (ops[j], gen_lowpart (second_imode, op0)); - } - - /* Interleave low SECOND_IMODE vectors. 
*/ - switch (second_imode) - { - case E_V4SImode: - for (i = j = 0; i < n / 2; i += 2, j++) - { - op0 = gen_reg_rtx (second_imode); - emit_insn (gen_interleave_second_low (op0, ops[i], - ops[i + 1])); - - /* Cast the SECOND_IMODE vector to the THIRD_IMODE - vector. */ - ops[j] = gen_reg_rtx (third_imode); - emit_move_insn (ops[j], gen_lowpart (third_imode, op0)); - } - second_imode = V2DImode; - gen_interleave_second_low = gen_vec_interleave_lowv2di; - /* FALLTHRU */ - - case E_V2DImode: - op0 = gen_reg_rtx (second_imode); - emit_insn (gen_interleave_second_low (op0, ops[0], - ops[1])); - - /* Cast the SECOND_IMODE vector back to a vector on original - mode. */ - emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0))); - break; - - default: - gcc_unreachable (); - } -} - -/* A subroutine of ix86_expand_vector_init. Handle the most general case: - all values variable, and none identical. */ - -static void -ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, - rtx target, rtx vals) -{ - rtx ops[64], op0, op1, op2, op3, op4, op5; - machine_mode half_mode = VOIDmode; - machine_mode quarter_mode = VOIDmode; - int n, i; - - switch (mode) - { - case E_V2SFmode: - case E_V2SImode: - if (!mmx_ok && !TARGET_SSE) - break; - /* FALLTHRU */ - - case E_V16SImode: - case E_V16SFmode: - case E_V8DFmode: - case E_V8DImode: - case E_V8SFmode: - case E_V8SImode: - case E_V4DFmode: - case E_V4DImode: - case E_V4SFmode: - case E_V4SImode: - case E_V2DFmode: - case E_V2DImode: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - ix86_expand_vector_init_concat (mode, target, ops, n); - return; - - case E_V2TImode: - for (i = 0; i < 2; i++) - ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); - op0 = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); - emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); - return; - - case E_V4TImode: - for (i = 0; i < 4; i++) - ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); - ops[4] = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); - ops[5] = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); - op0 = gen_reg_rtx (V8DImode); - ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); - emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); - return; - - case E_V32QImode: - half_mode = V16QImode; - goto half; - - case E_V16HImode: - half_mode = V8HImode; - goto half; - -half: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - op0 = gen_reg_rtx (half_mode); - op1 = gen_reg_rtx (half_mode); - ix86_expand_vector_init_interleave (half_mode, op0, ops, - n >> 2); - ix86_expand_vector_init_interleave (half_mode, op1, - &ops [n >> 1], n >> 2); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1))); - return; - - case E_V64QImode: - quarter_mode = V16QImode; - half_mode = V32QImode; - goto quarter; - - case E_V32HImode: - quarter_mode = V8HImode; - half_mode = V16HImode; - goto quarter; - -quarter: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - op0 = gen_reg_rtx (quarter_mode); - op1 = gen_reg_rtx (quarter_mode); - op2 = gen_reg_rtx (quarter_mode); - op3 = gen_reg_rtx (quarter_mode); - op4 = gen_reg_rtx (half_mode); - op5 = gen_reg_rtx (half_mode); - ix86_expand_vector_init_interleave (quarter_mode, op0, ops, - n >> 3); - ix86_expand_vector_init_interleave (quarter_mode, op1, - &ops [n >> 2], n >> 3); - 
ix86_expand_vector_init_interleave (quarter_mode, op2, - &ops [n >> 1], n >> 3); - ix86_expand_vector_init_interleave (quarter_mode, op3, - &ops [(n >> 1) | (n >> 2)], n >> 3); - emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1))); - emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3))); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5))); - return; - - case E_V16QImode: - if (!TARGET_SSE4_1) - break; - /* FALLTHRU */ - - case E_V8HImode: - if (!TARGET_SSE2) - break; - - /* Don't use ix86_expand_vector_init_interleave if we can't - move from GPR to SSE register directly. */ - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) - break; - - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); - return; - - case E_V4HImode: - case E_V8QImode: - break; - - default: - gcc_unreachable (); - } - - { - int i, j, n_elts, n_words, n_elt_per_word; - machine_mode inner_mode; - rtx words[4], shift; - - inner_mode = GET_MODE_INNER (mode); - n_elts = GET_MODE_NUNITS (mode); - n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; - n_elt_per_word = n_elts / n_words; - shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); - - for (i = 0; i < n_words; ++i) - { - rtx word = NULL_RTX; - - for (j = 0; j < n_elt_per_word; ++j) - { - rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); - elt = convert_modes (word_mode, inner_mode, elt, true); - - if (j == 0) - word = elt; - else - { - word = expand_simple_binop (word_mode, ASHIFT, word, shift, - word, 1, OPTAB_LIB_WIDEN); - word = expand_simple_binop (word_mode, IOR, word, elt, - word, 1, OPTAB_LIB_WIDEN); - } - } - - words[i] = word; - } - - if (n_words == 1) - emit_move_insn (target, gen_lowpart (mode, words[0])); - else if (n_words == 2) - { - rtx tmp = gen_reg_rtx (mode); - emit_clobber (tmp); - emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); - emit_move_insn (gen_highpart (word_mode, tmp), words[1]); - emit_move_insn (target, tmp); - } - else if (n_words == 4) - { - rtx tmp = gen_reg_rtx (V4SImode); - gcc_assert (word_mode == SImode); - vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); - ix86_expand_vector_init_general (false, V4SImode, tmp, vals); - emit_move_insn (target, gen_lowpart (mode, tmp)); - } - else - gcc_unreachable (); - } -} - -/* Initialize vector TARGET via VALS. Suppress the use of MMX - instructions unless MMX_OK is true. */ - -void -ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) -{ - machine_mode mode = GET_MODE (target); - machine_mode inner_mode = GET_MODE_INNER (mode); - int n_elts = GET_MODE_NUNITS (mode); - int n_var = 0, one_var = -1; - bool all_same = true, all_const_zero = true; - int i; - rtx x; - - /* Handle first initialization from vector elts. 
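
The scalar fallback just above builds each machine word of the vector with
shifts and IORs before storing it.  The packing order can be checked with a
short standalone C sketch (not part of the patch; it assumes the usual
little-endian element layout, where element k of a V4HI value occupies bits
16*k .. 16*k+15 of the word):

/* Illustrative sketch: pack four 16-bit vector elements into one 64-bit
   word the way the fallback loop does, highest element index first.  */
#include <stdint.h>
#include <stdio.h>

static uint64_t pack_v4hi (const uint16_t elt[4])
{
  uint64_t word = 0;
  for (int k = 3; k >= 0; k--)        /* process elements from the top down */
    word = (word << 16) | elt[k];     /* word = (word << bits) | elt        */
  return word;
}

int main (void)
{
  uint16_t v[4] = { 0x1111, 0x2222, 0x3333, 0x4444 };
  /* Prints 0x4444333322221111: element 0 ends up in the low 16 bits.  */
  printf ("0x%016llx\n", (unsigned long long) pack_v4hi (v));
  return 0;
}
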
*/ - if (n_elts != XVECLEN (vals, 0)) - { - rtx subtarget = target; - x = XVECEXP (vals, 0, 0); - gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode); - if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) - { - rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; - if (inner_mode == QImode || inner_mode == HImode) - { - unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); - mode = mode_for_vector (SImode, n_bits / 4).require (); - inner_mode = mode_for_vector (SImode, n_bits / 8).require (); - ops[0] = gen_lowpart (inner_mode, ops[0]); - ops[1] = gen_lowpart (inner_mode, ops[1]); - subtarget = gen_reg_rtx (mode); - } - ix86_expand_vector_init_concat (mode, subtarget, ops, 2); - if (subtarget != target) - emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget)); - return; - } - gcc_unreachable (); - } - - for (i = 0; i < n_elts; ++i) - { - x = XVECEXP (vals, 0, i); - if (!(CONST_SCALAR_INT_P (x) - || CONST_DOUBLE_P (x) - || CONST_FIXED_P (x))) - n_var++, one_var = i; - else if (x != CONST0_RTX (inner_mode)) - all_const_zero = false; - if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) - all_same = false; - } - - /* Constants are best loaded from the constant pool. */ - if (n_var == 0) - { - emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0))); - return; - } - - /* If all values are identical, broadcast the value. */ - if (all_same - && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, - XVECEXP (vals, 0, 0))) - return; - - /* Values where only one field is non-constant are best loaded from - the pool and overwritten via move later. */ - if (n_var == 1) - { - if (all_const_zero - && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target, - XVECEXP (vals, 0, one_var), - one_var)) - return; - - if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var)) - return; - } - - ix86_expand_vector_init_general (mmx_ok, mode, target, vals); -} - -void -ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) -{ - machine_mode mode = GET_MODE (target); - machine_mode inner_mode = GET_MODE_INNER (mode); - machine_mode half_mode; - bool use_vec_merge = false; - rtx tmp; - static rtx (*gen_extract[6][2]) (rtx, rtx) - = { - { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, - { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, - { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, - { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, - { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, - { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } - }; - static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) - = { - { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, - { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, - { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, - { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, - { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, - { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } - }; - int i, j, n; - machine_mode mmode = VOIDmode; - rtx (*gen_blendm) (rtx, rtx, rtx, rtx); - - switch (mode) - { - case E_V2SFmode: - case E_V2SImode: - if (mmx_ok) - { - tmp = gen_reg_rtx (GET_MODE_INNER (mode)); - ix86_expand_vector_extract (true, tmp, target, 1 - elt); - if (elt == 0) - tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); - else - tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); - emit_insn (gen_rtx_SET (target, tmp)); - return; - } - break; - - case E_V2DImode: - use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT; - if (use_vec_merge) - break; - - tmp = gen_reg_rtx (GET_MODE_INNER (mode)); - ix86_expand_vector_extract (false, 
tmp, target, 1 - elt); - if (elt == 0) - tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); - else - tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); - emit_insn (gen_rtx_SET (target, tmp)); - return; - - case E_V2DFmode: - { - rtx op0, op1; - - /* For the two element vectors, we implement a VEC_CONCAT with - the extraction of the other element. */ - - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt))); - tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp); - - if (elt == 0) - op0 = val, op1 = tmp; - else - op0 = tmp, op1 = val; - - tmp = gen_rtx_VEC_CONCAT (mode, op0, op1); - emit_insn (gen_rtx_SET (target, tmp)); - } - return; - - case E_V4SFmode: - use_vec_merge = TARGET_SSE4_1; - if (use_vec_merge) - break; - - switch (elt) - { - case 0: - use_vec_merge = true; - break; - - case 1: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* target = A A B B */ - emit_insn (gen_vec_interleave_lowv4sf (target, target, target)); - /* target = X A B B */ - ix86_expand_vector_set (false, target, val, 0); - /* target = A X C D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const1_rtx, const0_rtx, - GEN_INT (2+4), GEN_INT (3+4))); - return; - - case 2: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* tmp = X B C D */ - ix86_expand_vector_set (false, tmp, val, 0); - /* target = A B X D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const0_rtx, const1_rtx, - GEN_INT (0+4), GEN_INT (3+4))); - return; - - case 3: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* tmp = X B C D */ - ix86_expand_vector_set (false, tmp, val, 0); - /* target = A B X D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const0_rtx, const1_rtx, - GEN_INT (2+4), GEN_INT (0+4))); - return; - - default: - gcc_unreachable (); - } - break; - - case E_V4SImode: - use_vec_merge = TARGET_SSE4_1; - if (use_vec_merge) - break; - - /* Element 0 handled by vec_merge below. */ - if (elt == 0) - { - use_vec_merge = true; - break; - } - - if (TARGET_SSE2) - { - /* With SSE2, use integer shuffles to swap element 0 and ELT, - store into element 0, then shuffle them back. */ - - rtx order[4]; - - order[0] = GEN_INT (elt); - order[1] = const1_rtx; - order[2] = const2_rtx; - order[3] = GEN_INT (3); - order[elt] = const0_rtx; - - emit_insn (gen_sse2_pshufd_1 (target, target, order[0], - order[1], order[2], order[3])); - - ix86_expand_vector_set (false, target, val, 0); - - emit_insn (gen_sse2_pshufd_1 (target, target, order[0], - order[1], order[2], order[3])); - } - else - { - /* For SSE1, we have to reuse the V4SF code. */ - rtx t = gen_reg_rtx (V4SFmode); - emit_move_insn (t, gen_lowpart (V4SFmode, target)); - ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt); - emit_move_insn (target, gen_lowpart (mode, t)); - } - return; - - case E_V8HImode: - use_vec_merge = TARGET_SSE2; - break; - case E_V4HImode: - use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); - break; - - case E_V16QImode: - use_vec_merge = TARGET_SSE4_1; - break; - - case E_V8QImode: - break; - - case E_V32QImode: - half_mode = V16QImode; - j = 0; - n = 16; - goto half; - - case E_V16HImode: - half_mode = V8HImode; - j = 1; - n = 8; - goto half; - - case E_V8SImode: - half_mode = V4SImode; - j = 2; - n = 4; - goto half; - - case E_V4DImode: - half_mode = V2DImode; - j = 3; - n = 2; - goto half; - - case E_V8SFmode: - half_mode = V4SFmode; - j = 4; - n = 4; - goto half; - - case E_V4DFmode: - half_mode = V2DFmode; - j = 5; - n = 2; - goto half; - -half: - /* Compute offset. 
*/ - i = elt / n; - elt %= n; - - gcc_assert (i <= 1); - - /* Extract the half. */ - tmp = gen_reg_rtx (half_mode); - emit_insn (gen_extract[j][i] (tmp, target)); - - /* Put val in tmp at elt. */ - ix86_expand_vector_set (false, tmp, val, elt); - - /* Put it back. */ - emit_insn (gen_insert[j][i] (target, target, tmp)); - return; - - case E_V8DFmode: - if (TARGET_AVX512F) - { - mmode = QImode; - gen_blendm = gen_avx512f_blendmv8df; - } - break; - - case E_V8DImode: - if (TARGET_AVX512F) - { - mmode = QImode; - gen_blendm = gen_avx512f_blendmv8di; - } - break; - - case E_V16SFmode: - if (TARGET_AVX512F) - { - mmode = HImode; - gen_blendm = gen_avx512f_blendmv16sf; - } - break; - - case E_V16SImode: - if (TARGET_AVX512F) - { - mmode = HImode; - gen_blendm = gen_avx512f_blendmv16si; - } - break; - - case E_V32HImode: - if (TARGET_AVX512BW) - { - mmode = SImode; - gen_blendm = gen_avx512bw_blendmv32hi; - } - else if (TARGET_AVX512F) - { - half_mode = E_V8HImode; - n = 8; - goto quarter; - } - break; - - case E_V64QImode: - if (TARGET_AVX512BW) - { - mmode = DImode; - gen_blendm = gen_avx512bw_blendmv64qi; - } - else if (TARGET_AVX512F) - { - half_mode = E_V16QImode; - n = 16; - goto quarter; - } - break; - -quarter: - /* Compute offset. */ - i = elt / n; - elt %= n; - - gcc_assert (i <= 3); - - { - /* Extract the quarter. */ - tmp = gen_reg_rtx (V4SImode); - rtx tmp2 = gen_lowpart (V16SImode, target); - rtx mask = gen_reg_rtx (QImode); - - emit_move_insn (mask, constm1_rtx); - emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i), - tmp, mask)); - - tmp2 = gen_reg_rtx (half_mode); - emit_move_insn (tmp2, gen_lowpart (half_mode, tmp)); - tmp = tmp2; - - /* Put val in tmp at elt. */ - ix86_expand_vector_set (false, tmp, val, elt); - - /* Put it back. */ - tmp2 = gen_reg_rtx (V16SImode); - rtx tmp3 = gen_lowpart (V16SImode, target); - mask = gen_reg_rtx (HImode); - emit_move_insn (mask, constm1_rtx); - tmp = gen_lowpart (V4SImode, tmp); - emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i), - tmp3, mask)); - emit_move_insn (target, gen_lowpart (mode, tmp2)); - } - return; - - default: - break; - } - - if (mmode != VOIDmode) - { - tmp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); - /* The avx512*_blendm expanders have different operand order - from VEC_MERGE. In VEC_MERGE, the first input operand is used for - elements where the mask is set and second input operand otherwise, - in {sse,avx}*_*blend* the first input operand is used for elements - where the mask is clear and second input operand otherwise. 
*/ - emit_insn (gen_blendm (target, target, tmp, - force_reg (mmode, - gen_int_mode (HOST_WIDE_INT_1U << elt, - mmode)))); - } - else if (use_vec_merge) - { - tmp = gen_rtx_VEC_DUPLICATE (mode, val); - tmp = gen_rtx_VEC_MERGE (mode, tmp, target, - GEN_INT (HOST_WIDE_INT_1U << elt)); - emit_insn (gen_rtx_SET (target, tmp)); - } - else - { - rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); - - emit_move_insn (mem, target); - - tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode)); - emit_move_insn (tmp, val); - - emit_move_insn (target, mem); - } -} - -void -ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) -{ - machine_mode mode = GET_MODE (vec); - machine_mode inner_mode = GET_MODE_INNER (mode); - bool use_vec_extr = false; - rtx tmp; - - switch (mode) - { - case E_V2SImode: - case E_V2SFmode: - if (!mmx_ok) - break; - /* FALLTHRU */ - - case E_V2DFmode: - case E_V2DImode: - case E_V2TImode: - case E_V4TImode: - use_vec_extr = true; - break; - - case E_V4SFmode: - use_vec_extr = TARGET_SSE4_1; - if (use_vec_extr) - break; - - switch (elt) - { - case 0: - tmp = vec; - break; - - case 1: - case 3: - tmp = gen_reg_rtx (mode); - emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec, - GEN_INT (elt), GEN_INT (elt), - GEN_INT (elt+4), GEN_INT (elt+4))); - break; - - case 2: - tmp = gen_reg_rtx (mode); - emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec)); - break; - - default: - gcc_unreachable (); - } - vec = tmp; - use_vec_extr = true; - elt = 0; - break; - - case E_V4SImode: - use_vec_extr = TARGET_SSE4_1; - if (use_vec_extr) - break; - - if (TARGET_SSE2) - { - switch (elt) - { - case 0: - tmp = vec; - break; - - case 1: - case 3: - tmp = gen_reg_rtx (mode); - emit_insn (gen_sse2_pshufd_1 (tmp, vec, - GEN_INT (elt), GEN_INT (elt), - GEN_INT (elt), GEN_INT (elt))); - break; - - case 2: - tmp = gen_reg_rtx (mode); - emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec)); - break; - - default: - gcc_unreachable (); - } - vec = tmp; - use_vec_extr = true; - elt = 0; - } - else - { - /* For SSE1, we have to reuse the V4SF code. 
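
When none of the dedicated insert patterns apply, ix86_expand_vector_set above
falls back to spilling the vector to a stack temporary, writing the one element
through memory, and reloading the whole vector.  Roughly, in standalone C terms
(a sketch only, not the patch's code):

/* Illustrative sketch of the generic memory fallback of
   ix86_expand_vector_set: spill, poke one element, reload.  */
#include <string.h>
#include <stdio.h>

typedef struct { int e[4]; } v4si;      /* stand-in for a V4SImode value */

static v4si set_element (v4si target, int val, int elt)
{
  int mem[4];
  memcpy (mem, target.e, sizeof mem);   /* emit_move_insn (mem, target) */
  mem[elt] = val;                       /* store VAL into ELT's slot    */
  memcpy (target.e, mem, sizeof mem);   /* emit_move_insn (target, mem) */
  return target;
}

int main (void)
{
  v4si v = { { 1, 2, 3, 4 } };
  v = set_element (v, 99, 2);
  printf ("%d %d %d %d\n", v.e[0], v.e[1], v.e[2], v.e[3]);   /* 1 2 99 4 */
  return 0;
}
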
*/ - ix86_expand_vector_extract (false, gen_lowpart (SFmode, target), - gen_lowpart (V4SFmode, vec), elt); - return; - } - break; - - case E_V8HImode: - use_vec_extr = TARGET_SSE2; - break; - case E_V4HImode: - use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); - break; - - case E_V16QImode: - use_vec_extr = TARGET_SSE4_1; - break; - - case E_V8SFmode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V4SFmode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8sf (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8sf (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - } - break; - - case E_V4DFmode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V2DFmode); - if (elt < 2) - emit_insn (gen_vec_extract_lo_v4df (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v4df (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 1); - return; - } - break; - - case E_V32QImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V16QImode); - if (elt < 16) - emit_insn (gen_vec_extract_lo_v32qi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v32qi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 15); - return; - } - break; - - case E_V16HImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V8HImode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16hi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v16hi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - } - break; - - case E_V8SImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V4SImode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8si (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8si (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - } - break; - - case E_V4DImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V2DImode); - if (elt < 2) - emit_insn (gen_vec_extract_lo_v4di (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v4di (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 1); - return; - } - break; - - case E_V32HImode: - if (TARGET_AVX512BW) - { - tmp = gen_reg_rtx (V16HImode); - if (elt < 16) - emit_insn (gen_vec_extract_lo_v32hi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v32hi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 15); - return; - } - break; - - case E_V64QImode: - if (TARGET_AVX512BW) - { - tmp = gen_reg_rtx (V32QImode); - if (elt < 32) - emit_insn (gen_vec_extract_lo_v64qi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v64qi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 31); - return; - } - break; - - case E_V16SFmode: - tmp = gen_reg_rtx (V8SFmode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v16sf (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - - case E_V8DFmode: - tmp = gen_reg_rtx (V4DFmode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8df (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - - case E_V16SImode: - tmp = gen_reg_rtx (V8SImode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v16si (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - - case E_V8DImode: - tmp = gen_reg_rtx (V4DImode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8di (tmp, vec)); - 
ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - - case E_V8QImode: - /* ??? Could extract the appropriate HImode element and shift. */ - default: - break; - } - - if (use_vec_extr) - { - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt))); - tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp); - - /* Let the rtl optimizers know about the zero extension performed. */ - if (inner_mode == QImode || inner_mode == HImode) - { - tmp = gen_rtx_ZERO_EXTEND (SImode, tmp); - target = gen_lowpart (SImode, target); - } - - emit_insn (gen_rtx_SET (target, tmp)); - } - else - { - rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); - - emit_move_insn (mem, vec); - - tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); - emit_move_insn (target, tmp); - } -} - -/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC - to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode. - The upper bits of DEST are undefined, though they shouldn't cause - exceptions (some bits from src or all zeros are ok). */ - -static void -emit_reduc_half (rtx dest, rtx src, int i) -{ - rtx tem, d = dest; - switch (GET_MODE (src)) - { - case E_V4SFmode: - if (i == 128) - tem = gen_sse_movhlps (dest, src, src); - else - tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx, - GEN_INT (1 + 4), GEN_INT (1 + 4)); - break; - case E_V2DFmode: - tem = gen_vec_interleave_highv2df (dest, src, src); - break; - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - d = gen_reg_rtx (V1TImode); - tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), - GEN_INT (i / 2)); - break; - case E_V8SFmode: - if (i == 256) - tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx); - else - tem = gen_avx_shufps256 (dest, src, src, - GEN_INT (i == 128 ? 2 + (3 << 2) : 1)); - break; - case E_V4DFmode: - if (i == 256) - tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx); - else - tem = gen_avx_shufpd256 (dest, src, src, const1_rtx); - break; - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - if (i == 256) - { - if (GET_MODE (dest) != V4DImode) - d = gen_reg_rtx (V4DImode); - tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src), - gen_lowpart (V4DImode, src), - const1_rtx); - } - else - { - d = gen_reg_rtx (V2TImode); - tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src), - GEN_INT (i / 2)); - } - break; - case E_V64QImode: - case E_V32HImode: - case E_V16SImode: - case E_V16SFmode: - case E_V8DImode: - case E_V8DFmode: - if (i > 128) - tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), - gen_lowpart (V16SImode, src), - gen_lowpart (V16SImode, src), - GEN_INT (0x4 + (i == 512 ? 4 : 0)), - GEN_INT (0x5 + (i == 512 ? 4 : 0)), - GEN_INT (0x6 + (i == 512 ? 4 : 0)), - GEN_INT (0x7 + (i == 512 ? 4 : 0)), - GEN_INT (0xC), GEN_INT (0xD), - GEN_INT (0xE), GEN_INT (0xF), - GEN_INT (0x10), GEN_INT (0x11), - GEN_INT (0x12), GEN_INT (0x13), - GEN_INT (0x14), GEN_INT (0x15), - GEN_INT (0x16), GEN_INT (0x17)); - else - tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), - gen_lowpart (V16SImode, src), - GEN_INT (i == 128 ? 0x2 : 0x1), - GEN_INT (0x3), - GEN_INT (0x3), - GEN_INT (0x3), - GEN_INT (i == 128 ? 0x6 : 0x5), - GEN_INT (0x7), - GEN_INT (0x7), - GEN_INT (0x7), - GEN_INT (i == 128 ? 0xA : 0x9), - GEN_INT (0xB), - GEN_INT (0xB), - GEN_INT (0xB), - GEN_INT (i == 128 ? 
0xE : 0xD),
-				 GEN_INT (0xF),
-				 GEN_INT (0xF),
-				 GEN_INT (0xF));
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  emit_insn (tem);
-  if (d != dest)
-    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
-}
-
-/* Expand a vector reduction.  FN is the binary pattern to reduce;
-   DEST is the destination; IN is the input vector.  */
-
-void
-ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
-{
-  rtx half, dst, vec = in;
-  machine_mode mode = GET_MODE (in);
-  int i;
-
-  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
-  if (TARGET_SSE4_1
-      && mode == V8HImode
-      && fn == gen_uminv8hi3)
-    {
-      emit_insn (gen_sse4_1_phminposuw (dest, in));
-      return;
-    }
-
-  for (i = GET_MODE_BITSIZE (mode);
-       i > GET_MODE_UNIT_BITSIZE (mode);
-       i >>= 1)
-    {
-      half = gen_reg_rtx (mode);
-      emit_reduc_half (half, vec, i);
-      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
-	dst = dest;
-      else
-	dst = gen_reg_rtx (mode);
-      emit_insn (fn (dst, half, vec));
-      vec = dst;
-    }
-}
-
-/* Target hook for scalar_mode_supported_p.  */
-static bool
-ix86_scalar_mode_supported_p (scalar_mode mode)
-{
-  if (DECIMAL_FLOAT_MODE_P (mode))
-    return default_decimal_float_supported_p ();
-  else if (mode == TFmode)
-    return true;
-  else
-    return default_scalar_mode_supported_p (mode);
-}
-
-/* Implements target hook vector_mode_supported_p.  */
-static bool
-ix86_vector_mode_supported_p (machine_mode mode)
-{
-  if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
-    return true;
-  if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
-    return true;
-  if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
-    return true;
-  if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
-    return true;
-  if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
-    return true;
-  if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
-    return true;
-  return false;
-}
-
-/* Target hook for c_mode_for_suffix.  */
-static machine_mode
-ix86_c_mode_for_suffix (char suffix)
-{
-  if (suffix == 'q')
-    return TFmode;
-  if (suffix == 'w')
-    return XFmode;
-
-  return VOIDmode;
-}
-
-/* Worker function for TARGET_MD_ASM_ADJUST.
-
-   We implement asm flag outputs, and maintain source compatibility
-   with the old cc0-based compiler.
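
The reduction loop in ix86_expand_reduc above runs log2(nunits) steps: each
step brings the upper half of the still-live lanes down beside the lower half
(emit_reduc_half) and combines the two with FN, so the result ends up in
element 0.  A scalar C sketch of the same schedule (arrays stand in for vector
registers; this is an illustration, not part of the patch):

/* Illustrative sketch: horizontal add of 8 lanes in 3 halve-and-combine
   steps, mirroring ix86_expand_reduc with FN = addition.  */
#include <stdio.h>

#define N 8

static int reduce_add (const int v[N])
{
  int cur[N], half[N];
  for (int i = 0; i < N; i++)
    cur[i] = v[i];

  for (int width = N; width > 1; width /= 2)
    {
      for (int i = 0; i < width / 2; i++)
        half[i] = cur[i + width / 2];   /* emit_reduc_half: move upper half down */
      for (int i = 0; i < width / 2; i++)
        cur[i] = cur[i] + half[i];      /* apply the binary pattern FN */
    }
  return cur[0];                        /* final value sits in element 0 */
}

int main (void)
{
  int v[N] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  printf ("%d\n", reduce_add (v));      /* prints 36 */
  return 0;
}
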
*/ - -static rtx_insn * -ix86_md_asm_adjust (vec &outputs, vec &/*inputs*/, - vec &constraints, - vec &clobbers, HARD_REG_SET &clobbered_regs) -{ - bool saw_asm_flag = false; - - start_sequence (); - for (unsigned i = 0, n = outputs.length (); i < n; ++i) - { - const char *con = constraints[i]; - if (strncmp (con, "=@cc", 4) != 0) - continue; - con += 4; - if (strchr (con, ',') != NULL) - { - error ("alternatives not allowed in asm flag output"); - continue; - } - - bool invert = false; - if (con[0] == 'n') - invert = true, con++; - - machine_mode mode = CCmode; - rtx_code code = UNKNOWN; - - switch (con[0]) - { - case 'a': - if (con[1] == 0) - mode = CCAmode, code = EQ; - else if (con[1] == 'e' && con[2] == 0) - mode = CCCmode, code = NE; - break; - case 'b': - if (con[1] == 0) - mode = CCCmode, code = EQ; - else if (con[1] == 'e' && con[2] == 0) - mode = CCAmode, code = NE; - break; - case 'c': - if (con[1] == 0) - mode = CCCmode, code = EQ; - break; - case 'e': - if (con[1] == 0) - mode = CCZmode, code = EQ; - break; - case 'g': - if (con[1] == 0) - mode = CCGCmode, code = GT; - else if (con[1] == 'e' && con[2] == 0) - mode = CCGCmode, code = GE; - break; - case 'l': - if (con[1] == 0) - mode = CCGCmode, code = LT; - else if (con[1] == 'e' && con[2] == 0) - mode = CCGCmode, code = LE; - break; - case 'o': - if (con[1] == 0) - mode = CCOmode, code = EQ; - break; - case 'p': - if (con[1] == 0) - mode = CCPmode, code = EQ; - break; - case 's': - if (con[1] == 0) - mode = CCSmode, code = EQ; - break; - case 'z': - if (con[1] == 0) - mode = CCZmode, code = EQ; - break; - } - if (code == UNKNOWN) - { - error ("unknown asm flag output %qs", constraints[i]); - continue; - } - if (invert) - code = reverse_condition (code); - - rtx dest = outputs[i]; - if (!saw_asm_flag) - { - /* This is the first asm flag output. Here we put the flags - register in as the real output and adjust the condition to - allow it. */ - constraints[i] = "=Bf"; - outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG); - saw_asm_flag = true; - } - else - { - /* We don't need the flags register as output twice. */ - constraints[i] = "=X"; - outputs[i] = gen_rtx_SCRATCH (SImode); - } - - rtx x = gen_rtx_REG (mode, FLAGS_REG); - x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx); - - machine_mode dest_mode = GET_MODE (dest); - if (!SCALAR_INT_MODE_P (dest_mode)) - { - error ("invalid type for asm flag output"); - continue; - } - - if (dest_mode == DImode && !TARGET_64BIT) - dest_mode = SImode; - - if (dest_mode != QImode) - { - rtx destqi = gen_reg_rtx (QImode); - emit_insn (gen_rtx_SET (destqi, x)); - - if (TARGET_ZERO_EXTEND_WITH_AND - && optimize_function_for_speed_p (cfun)) - { - x = force_reg (dest_mode, const0_rtx); - - emit_insn (gen_movstrictqi (gen_lowpart (QImode, x), destqi)); - } - else - { - x = gen_rtx_ZERO_EXTEND (dest_mode, destqi); - if (dest_mode == GET_MODE (dest) - && !register_operand (dest, GET_MODE (dest))) - x = force_reg (dest_mode, x); - } - } - - if (dest_mode != GET_MODE (dest)) - { - rtx tmp = gen_reg_rtx (SImode); - - emit_insn (gen_rtx_SET (tmp, x)); - emit_insn (gen_zero_extendsidi2 (dest, tmp)); - } - else - emit_insn (gen_rtx_SET (dest, x)); - } - rtx_insn *seq = get_insns (); - end_sequence (); - - if (saw_asm_flag) - return seq; - else - { - /* If we had no asm flag outputs, clobber the flags. */ - clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG)); - SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG); - return NULL; - } -} - -/* Implements target vector targetm.asm.encode_section_info. 
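
ix86_md_asm_adjust above is the hook that lowers GCC's "=@cc<cond>" asm
flag-output constraints: the asm is rewritten to output the flags register
itself, and a SETcc plus zero-extension is emitted after it for each requested
condition.  Typical use from C on x86 looks like this (an illustrative snippet,
not taken from the patch):

/* Illustrative sketch: consume EFLAGS directly from inline asm via the
   flag-output constraint "=@ccbe" handled by ix86_md_asm_adjust.  */
#include <stdio.h>

static int is_below_or_equal (unsigned a, unsigned b)
{
  int r;
  /* AT&T syntax: computes a - b and sets the flags; "be" means CF or ZF,
     i.e. a <= b as unsigned values.  */
  __asm__ ("cmp %2, %1" : "=@ccbe" (r) : "r" (a), "r" (b));
  return r;
}

int main (void)
{
  printf ("%d %d\n", is_below_or_equal (3, 7), is_below_or_equal (7, 3));  /* 1 0 */
  return 0;
}
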
*/ - -static void ATTRIBUTE_UNUSED -ix86_encode_section_info (tree decl, rtx rtl, int first) -{ - default_encode_section_info (decl, rtl, first); - - if (ix86_in_large_data_p (decl)) - SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR; -} - -/* Worker function for REVERSE_CONDITION. */ - -enum rtx_code -ix86_reverse_condition (enum rtx_code code, machine_mode mode) -{ - return (mode == CCFPmode - ? reverse_condition_maybe_unordered (code) - : reverse_condition (code)); -} - -/* Output code to perform an x87 FP register move, from OPERANDS[1] - to OPERANDS[0]. */ - -const char * -output_387_reg_move (rtx_insn *insn, rtx *operands) -{ - if (REG_P (operands[0])) - { - if (REG_P (operands[1]) - && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - { - if (REGNO (operands[0]) == FIRST_STACK_REG) - return output_387_ffreep (operands, 0); - return "fstp\t%y0"; - } - if (STACK_TOP_P (operands[0])) - return "fld%Z1\t%y1"; - return "fst\t%y0"; - } - else if (MEM_P (operands[0])) - { - gcc_assert (REG_P (operands[1])); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp%Z0\t%y0"; - else - { - /* There is no non-popping store to memory for XFmode. - So if we need one, follow the store with a load. */ - if (GET_MODE (operands[0]) == XFmode) - return "fstp%Z0\t%y0\n\tfld%Z0\t%y0"; - else - return "fst%Z0\t%y0"; - } - } - else - gcc_unreachable(); -} - -/* Output code to perform a conditional jump to LABEL, if C2 flag in - FP status register is set. */ - -void -ix86_emit_fp_unordered_jump (rtx label) -{ - rtx reg = gen_reg_rtx (HImode); - rtx_insn *insn; - rtx temp; - - emit_insn (gen_x86_fnstsw_1 (reg)); - - if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) - { - emit_insn (gen_x86_sahf_1 (reg)); - - temp = gen_rtx_REG (CCmode, FLAGS_REG); - temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx); - } - else - { - emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04))); - - temp = gen_rtx_REG (CCNOmode, FLAGS_REG); - temp = gen_rtx_NE (VOIDmode, temp, const0_rtx); - } - - temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - JUMP_LABEL (insn) = label; -} - -/* Output code to perform an sinh XFmode calculation. 
*/ - -void ix86_emit_i387_sinh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = expm1 (|op1|) */ - emit_insn (gen_absxf2 (e2, op1)); - emit_insn (gen_expm1xf2 (e1, e2)); - - /* e2 = e1 / (e1 + 1.0) + e1 */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst1)); - emit_insn (gen_divxf3 (e2, e1, e2)); - emit_insn (gen_addxf3 (e2, e2, e1)); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform an cosh XFmode calculation. */ - -void ix86_emit_i387_cosh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1; - - /* e1 = exp (op1) */ - emit_insn (gen_expxf2 (e1, op1)); - - /* e2 = e1 + 1.0 / e1 */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_divxf3 (e2, cst1, e1)); - emit_insn (gen_addxf3 (e2, e1, e2)); - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform an tanh XFmode calculation. */ - -void ix86_emit_i387_tanh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx cst2, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = expm1 (-|2 * op1|) */ - emit_insn (gen_addxf3 (e2, op1, op1)); - emit_insn (gen_absxf2 (e2, e2)); - emit_insn (gen_negxf2 (e2, e2)); - emit_insn (gen_expm1xf2 (e1, e2)); - - /* e2 = e1 / (e1 + 2.0) */ - cst2 = force_reg (XFmode, CONST2_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst2)); - emit_insn (gen_divxf3 (e2, e1, e2)); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (!flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_NE (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, e2); -} - -/* Output code to perform an asinh XFmode calculation. 
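
The tanh expansion above avoids cancellation by going through expm1: with
e1 = expm1(-|2x|) it computes e1 / (e1 + 2), which equals -tanh(|x|), and then
fixes the sign from the fxam result.  The identity can be checked with a small
C program (a sketch only, link with -lm; not part of the patch):

/* Illustrative sketch: the expm1-based identity used by
   ix86_emit_i387_tanh, compared against libm's tanh.  */
#include <math.h>
#include <stdio.h>

static double tanh_via_expm1 (double x)
{
  double e1 = expm1 (-fabs (2.0 * x));
  double e2 = e1 / (e1 + 2.0);          /* equals -tanh(|x|) */
  return x < 0.0 ? e2 : -e2;            /* the RTL negates when the sign bit is clear */
}

int main (void)
{
  for (double x = -2.0; x <= 2.0; x += 0.5)
    printf ("%5.2f  %.17g  %.17g\n", x, tanh_via_expm1 (x), tanh (x));
  return 0;
}
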
*/ - -void ix86_emit_i387_asinh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* e2 = sqrt (op1^2 + 1.0) + 1.0 */ - emit_insn (gen_mulxf3 (e1, op1, op1)); - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst1)); - emit_insn (gen_sqrtxf2 (e2, e2)); - emit_insn (gen_addxf3 (e2, e2, cst1)); - - /* e1 = e1 / e2 */ - emit_insn (gen_divxf3 (e1, e1, e2)); - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = e1 + |op1| */ - emit_insn (gen_absxf2 (e2, op1)); - emit_insn (gen_addxf3 (e1, e1, e2)); - - /* e2 = log1p (e1) */ - ix86_emit_i387_log1p (e2, e1); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, e2); -} - -/* Output code to perform an acosh XFmode calculation. */ - -void ix86_emit_i387_acosh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - - /* e2 = sqrt (op1 + 1.0) */ - emit_insn (gen_addxf3 (e2, op1, cst1)); - emit_insn (gen_sqrtxf2 (e2, e2)); - - /* e1 = sqrt (op1 - 1.0) */ - emit_insn (gen_subxf3 (e1, op1, cst1)); - emit_insn (gen_sqrtxf2 (e1, e1)); - - /* e1 = e1 * e2 */ - emit_insn (gen_mulxf3 (e1, e1, e2)); - - /* e1 = e1 + op1 */ - emit_insn (gen_addxf3 (e1, e1, op1)); - - /* op0 = log (e1) */ - emit_insn (gen_logxf2 (op0, e1)); -} - -/* Output code to perform an atanh XFmode calculation. */ - -void ix86_emit_i387_atanh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e2 = |op1| */ - emit_insn (gen_absxf2 (e2, op1)); - - /* e1 = -(e2 + e2) / (e2 + 1.0) */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e1, e2, cst1)); - emit_insn (gen_addxf3 (e2, e2, e2)); - emit_insn (gen_negxf2 (e2, e2)); - emit_insn (gen_divxf3 (e1, e2, e1)); - - /* e2 = log1p (e1) */ - ix86_emit_i387_log1p (e2, e1); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (!flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_NE (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform a log1p XFmode calculation. 
*/ - -void ix86_emit_i387_log1p (rtx op0, rtx op1) -{ - rtx_code_label *label1 = gen_label_rtx (); - rtx_code_label *label2 = gen_label_rtx (); - - rtx tmp = gen_reg_rtx (XFmode); - rtx res = gen_reg_rtx (XFmode); - rtx cst, cstln2, cst1; - rtx_insn *insn; - - cst = const_double_from_real_value - (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); - cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ - - emit_insn (gen_absxf2 (tmp, op1)); - - cst = force_reg (XFmode, cst); - ix86_expand_branch (GE, tmp, cst, label1); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - insn = get_last_insn (); - JUMP_LABEL (insn) = label1; - - emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2)); - emit_jump (label2); - - emit_label (label1); - LABEL_NUSES (label1) = 1; - - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1))); - emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2)); - - emit_label (label2); - LABEL_NUSES (label2) = 1; - - emit_move_insn (op0, res); -} - -/* Emit code for round calculation. */ -void ix86_emit_i387_round (rtx op0, rtx op1) -{ - machine_mode inmode = GET_MODE (op1); - machine_mode outmode = GET_MODE (op0); - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx res = gen_reg_rtx (outmode); - rtx_code_label *jump_label = gen_label_rtx (); - rtx (*floor_insn) (rtx, rtx); - rtx (*neg_insn) (rtx, rtx); - rtx_insn *insn; - rtx tmp; - - switch (inmode) - { - case E_SFmode: - case E_DFmode: - tmp = gen_reg_rtx (XFmode); - - emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1))); - op1 = tmp; - break; - case E_XFmode: - break; - default: - gcc_unreachable (); - } - - switch (outmode) - { - case E_SFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negsf2; - break; - case E_DFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negdf2; - break; - case E_XFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negxf2; - break; - case E_HImode: - floor_insn = gen_lfloorxfhi2; - neg_insn = gen_neghi2; - break; - case E_SImode: - floor_insn = gen_lfloorxfsi2; - neg_insn = gen_negsi2; - break; - case E_DImode: - floor_insn = gen_lfloorxfdi2; - neg_insn = gen_negdi2; - break; - default: - gcc_unreachable (); - } - - /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ - - /* scratch = fxam(op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = fabs(op1) */ - emit_insn (gen_absxf2 (e1, op1)); - - /* e2 = e1 + 0.5 */ - half = force_reg (XFmode, half); - emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half))); - - /* res = floor(e2) */ - switch (outmode) - { - case E_SFmode: - case E_DFmode: - { - tmp = gen_reg_rtx (XFmode); - - emit_insn (floor_insn (tmp, e2)); - emit_insn (gen_rtx_SET (res, - gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp), - UNSPEC_TRUNC_NOOP))); - } - break; - default: - emit_insn (floor_insn (res, e2)); - } - - /* flags = signbit(a) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then res = -res */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (neg_insn (res, res)); - - emit_label (jump_label); - 
LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, res); -} - -/* Output code to perform a Newton-Rhapson approximation of a single precision - floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */ - -void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) -{ - rtx x0, x1, e0, e1; - - x0 = gen_reg_rtx (mode); - e0 = gen_reg_rtx (mode); - e1 = gen_reg_rtx (mode); - x1 = gen_reg_rtx (mode); - - /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ - - b = force_reg (mode, b); - - /* x0 = rcp(b) estimate */ - if (mode == V16SFmode || mode == V8DFmode) - { - if (TARGET_AVX512ER) - { - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP28))); - /* res = a * x0 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); - return; - } - else - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP14))); - } - else - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP))); - - /* e0 = x0 * b */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); - - /* e0 = x0 * e0 */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); - - /* e1 = x0 + x0 */ - emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); - - /* x1 = e1 - e0 */ - emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); - - /* res = a * x1 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); -} - -/* Output code to perform a Newton-Rhapson approximation of a - single precision floating point [reciprocal] square root. */ - -void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) -{ - rtx x0, e0, e1, e2, e3, mthree, mhalf; - REAL_VALUE_TYPE r; - int unspec; - - x0 = gen_reg_rtx (mode); - e0 = gen_reg_rtx (mode); - e1 = gen_reg_rtx (mode); - e2 = gen_reg_rtx (mode); - e3 = gen_reg_rtx (mode); - - if (TARGET_AVX512ER && mode == V16SFmode) - { - if (recip) - /* res = rsqrt28(a) estimate */ - emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - UNSPEC_RSQRT28))); - else - { - /* x0 = rsqrt28(a) estimate */ - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - UNSPEC_RSQRT28))); - /* res = rcp28(x0) estimate */ - emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), - UNSPEC_RCP28))); - } - return; - } - - real_from_integer (&r, VOIDmode, -3, SIGNED); - mthree = const_double_from_real_value (r, SFmode); - - real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); - mhalf = const_double_from_real_value (r, SFmode); - unspec = UNSPEC_RSQRT; - - if (VECTOR_MODE_P (mode)) - { - mthree = ix86_build_const_vector (mode, true, mthree); - mhalf = ix86_build_const_vector (mode, true, mhalf); - /* There is no 512-bit rsqrt. There is however rsqrt14. */ - if (GET_MODE_SIZE (mode) == 64) - unspec = UNSPEC_RSQRT14; - } - - /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) - rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ - - a = force_reg (mode, a); - - /* x0 = rsqrt(a) estimate */ - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - unspec))); - - /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ - if (!recip) - { - rtx zero = force_reg (mode, CONST0_RTX(mode)); - rtx mask; - - /* Handle masked compare. */ - if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) - { - mask = gen_reg_rtx (HImode); - /* Imm value 0x4 corresponds to not-equal comparison. 
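/* One Newton-Raphson refinement step, as used by ix86_emit_swdivsf and
   ix86_emit_swsqrtsf above; an illustrative sketch only, with the initial
   estimate x0 passed in (the real sequences obtain it from the rcp/rsqrt
   hardware estimate instructions).  */

static float refine_recip (float a, float b, float x0 /* ~ 1/b */)
{
  /* a / b ~= a * ((x0 + x0) - b * x0 * x0), i.e. x1 = x0 * (2 - b * x0);
     the relative error is roughly squared by each step.  */
  return a * ((x0 + x0) - (b * x0) * x0);
}

static float refine_rsqrt (float a, float x0 /* ~ 1/sqrt(a) */, int recip)
{
  /* rsqrt(a) ~= -0.5 * x0 * (a*x0*x0 - 3); for sqrt(a) the final factor is
     e0 = a * x0 instead of x0, exactly as in the expander above.  */
  float e0 = a * x0;
  float e1 = e0 * x0;
  float e2 = e1 - 3.0f;
  float e3 = (recip ? x0 : e0) * -0.5f;
  return e2 * e3;
}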
*/ - emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4))); - emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask)); - } - else - { - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a))); - emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask))); - } - } - - /* e0 = x0 * a */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); - /* e1 = e0 * x0 */ - emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0))); - - /* e2 = e1 - 3. */ - mthree = force_reg (mode, mthree); - emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree))); - - mhalf = force_reg (mode, mhalf); - if (recip) - /* e3 = -.5 * x0 */ - emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf))); - else - /* e3 = -.5 * e0 */ - emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf))); - /* ret = e2 * e3 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3))); -} - -#ifdef TARGET_SOLARIS -/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */ + case IX86_BUILTIN_PSLLD: + case IX86_BUILTIN_PSLLD128: + case IX86_BUILTIN_PSLLD128_MASK: + case IX86_BUILTIN_PSLLD256: + case IX86_BUILTIN_PSLLD256_MASK: + case IX86_BUILTIN_PSLLD512: + case IX86_BUILTIN_PSLLDI: + case IX86_BUILTIN_PSLLDI128: + case IX86_BUILTIN_PSLLDI128_MASK: + case IX86_BUILTIN_PSLLDI256: + case IX86_BUILTIN_PSLLDI256_MASK: + case IX86_BUILTIN_PSLLDI512: + case IX86_BUILTIN_PSLLQ: + case IX86_BUILTIN_PSLLQ128: + case IX86_BUILTIN_PSLLQ128_MASK: + case IX86_BUILTIN_PSLLQ256: + case IX86_BUILTIN_PSLLQ256_MASK: + case IX86_BUILTIN_PSLLQ512: + case IX86_BUILTIN_PSLLQI: + case IX86_BUILTIN_PSLLQI128: + case IX86_BUILTIN_PSLLQI128_MASK: + case IX86_BUILTIN_PSLLQI256: + case IX86_BUILTIN_PSLLQI256_MASK: + case IX86_BUILTIN_PSLLQI512: + case IX86_BUILTIN_PSLLW: + case IX86_BUILTIN_PSLLW128: + case IX86_BUILTIN_PSLLW128_MASK: + case IX86_BUILTIN_PSLLW256: + case IX86_BUILTIN_PSLLW256_MASK: + case IX86_BUILTIN_PSLLW512_MASK: + case IX86_BUILTIN_PSLLWI: + case IX86_BUILTIN_PSLLWI128: + case IX86_BUILTIN_PSLLWI128_MASK: + case IX86_BUILTIN_PSLLWI256: + case IX86_BUILTIN_PSLLWI256_MASK: + case IX86_BUILTIN_PSLLWI512_MASK: + rcode = ASHIFT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSRAD: + case IX86_BUILTIN_PSRAD128: + case IX86_BUILTIN_PSRAD128_MASK: + case IX86_BUILTIN_PSRAD256: + case IX86_BUILTIN_PSRAD256_MASK: + case IX86_BUILTIN_PSRAD512: + case IX86_BUILTIN_PSRADI: + case IX86_BUILTIN_PSRADI128: + case IX86_BUILTIN_PSRADI128_MASK: + case IX86_BUILTIN_PSRADI256: + case IX86_BUILTIN_PSRADI256_MASK: + case IX86_BUILTIN_PSRADI512: + case IX86_BUILTIN_PSRAQ128_MASK: + case IX86_BUILTIN_PSRAQ256_MASK: + case IX86_BUILTIN_PSRAQ512: + case IX86_BUILTIN_PSRAQI128_MASK: + case IX86_BUILTIN_PSRAQI256_MASK: + case IX86_BUILTIN_PSRAQI512: + case IX86_BUILTIN_PSRAW: + case IX86_BUILTIN_PSRAW128: + case IX86_BUILTIN_PSRAW128_MASK: + case IX86_BUILTIN_PSRAW256: + case IX86_BUILTIN_PSRAW256_MASK: + case IX86_BUILTIN_PSRAW512: + case IX86_BUILTIN_PSRAWI: + case IX86_BUILTIN_PSRAWI128: + case IX86_BUILTIN_PSRAWI128_MASK: + case IX86_BUILTIN_PSRAWI256: + case IX86_BUILTIN_PSRAWI256_MASK: + case IX86_BUILTIN_PSRAWI512: + rcode = ASHIFTRT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSRLD: + case IX86_BUILTIN_PSRLD128: + case IX86_BUILTIN_PSRLD128_MASK: + case IX86_BUILTIN_PSRLD256: + case IX86_BUILTIN_PSRLD256_MASK: + case IX86_BUILTIN_PSRLD512: + case IX86_BUILTIN_PSRLDI: + case IX86_BUILTIN_PSRLDI128: + case IX86_BUILTIN_PSRLDI128_MASK: + case IX86_BUILTIN_PSRLDI256: + case 
IX86_BUILTIN_PSRLDI256_MASK: + case IX86_BUILTIN_PSRLDI512: + case IX86_BUILTIN_PSRLQ: + case IX86_BUILTIN_PSRLQ128: + case IX86_BUILTIN_PSRLQ128_MASK: + case IX86_BUILTIN_PSRLQ256: + case IX86_BUILTIN_PSRLQ256_MASK: + case IX86_BUILTIN_PSRLQ512: + case IX86_BUILTIN_PSRLQI: + case IX86_BUILTIN_PSRLQI128: + case IX86_BUILTIN_PSRLQI128_MASK: + case IX86_BUILTIN_PSRLQI256: + case IX86_BUILTIN_PSRLQI256_MASK: + case IX86_BUILTIN_PSRLQI512: + case IX86_BUILTIN_PSRLW: + case IX86_BUILTIN_PSRLW128: + case IX86_BUILTIN_PSRLW128_MASK: + case IX86_BUILTIN_PSRLW256: + case IX86_BUILTIN_PSRLW256_MASK: + case IX86_BUILTIN_PSRLW512: + case IX86_BUILTIN_PSRLWI: + case IX86_BUILTIN_PSRLWI128: + case IX86_BUILTIN_PSRLWI128_MASK: + case IX86_BUILTIN_PSRLWI256: + case IX86_BUILTIN_PSRLWI256_MASK: + case IX86_BUILTIN_PSRLWI512: + rcode = LSHIFTRT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSLLVV16HI: + case IX86_BUILTIN_PSLLVV16SI: + case IX86_BUILTIN_PSLLVV2DI: + case IX86_BUILTIN_PSLLVV2DI_MASK: + case IX86_BUILTIN_PSLLVV32HI: + case IX86_BUILTIN_PSLLVV4DI: + case IX86_BUILTIN_PSLLVV4DI_MASK: + case IX86_BUILTIN_PSLLVV4SI: + case IX86_BUILTIN_PSLLVV4SI_MASK: + case IX86_BUILTIN_PSLLVV8DI: + case IX86_BUILTIN_PSLLVV8HI: + case IX86_BUILTIN_PSLLVV8SI: + case IX86_BUILTIN_PSLLVV8SI_MASK: + rcode = ASHIFT; + is_vshift = true; + goto do_shift; + case IX86_BUILTIN_PSRAVQ128: + case IX86_BUILTIN_PSRAVQ256: + case IX86_BUILTIN_PSRAVV16HI: + case IX86_BUILTIN_PSRAVV16SI: + case IX86_BUILTIN_PSRAVV32HI: + case IX86_BUILTIN_PSRAVV4SI: + case IX86_BUILTIN_PSRAVV4SI_MASK: + case IX86_BUILTIN_PSRAVV8DI: + case IX86_BUILTIN_PSRAVV8HI: + case IX86_BUILTIN_PSRAVV8SI: + case IX86_BUILTIN_PSRAVV8SI_MASK: + rcode = ASHIFTRT; + is_vshift = true; + goto do_shift; + case IX86_BUILTIN_PSRLVV16HI: + case IX86_BUILTIN_PSRLVV16SI: + case IX86_BUILTIN_PSRLVV2DI: + case IX86_BUILTIN_PSRLVV2DI_MASK: + case IX86_BUILTIN_PSRLVV32HI: + case IX86_BUILTIN_PSRLVV4DI: + case IX86_BUILTIN_PSRLVV4DI_MASK: + case IX86_BUILTIN_PSRLVV4SI: + case IX86_BUILTIN_PSRLVV4SI_MASK: + case IX86_BUILTIN_PSRLVV8DI: + case IX86_BUILTIN_PSRLVV8HI: + case IX86_BUILTIN_PSRLVV8SI: + case IX86_BUILTIN_PSRLVV8SI_MASK: + rcode = LSHIFTRT; + is_vshift = true; + goto do_shift; -static void -i386_solaris_elf_named_section (const char *name, unsigned int flags, - tree decl) -{ - /* With Binutils 2.15, the "@unwind" marker must be specified on - every occurrence of the ".eh_frame" section, not just the first - one. */ - if (TARGET_64BIT - && strcmp (name, ".eh_frame") == 0) - { - fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name, - flags & SECTION_WRITE ? "aw" : "a"); - return; - } + do_shift: + gcc_assert (n_args >= 2); + if (TREE_CODE (args[0]) != VECTOR_CST) + break; + mask = HOST_WIDE_INT_M1U; + if (n_args > 2) + { + /* This is masked shift. */ + if (!tree_fits_uhwi_p (args[n_args - 1]) + || TREE_SIDE_EFFECTS (args[n_args - 2])) + break; + mask = tree_to_uhwi (args[n_args - 1]); + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); + mask |= HOST_WIDE_INT_M1U << elems; + if (mask != HOST_WIDE_INT_M1U + && TREE_CODE (args[n_args - 2]) != VECTOR_CST) + break; + if (mask == (HOST_WIDE_INT_M1U << elems)) + return args[n_args - 2]; + } + if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST) + break; + if (tree tem = (is_vshift ? 
integer_one_node + : ix86_vector_shift_count (args[1]))) + { + unsigned HOST_WIDE_INT count = tree_to_uhwi (tem); + unsigned HOST_WIDE_INT prec + = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0]))); + if (count == 0 && mask == HOST_WIDE_INT_M1U) + return args[0]; + if (count >= prec) + { + if (rcode == ASHIFTRT) + count = prec - 1; + else if (mask == HOST_WIDE_INT_M1U) + return build_zero_cst (TREE_TYPE (args[0])); + } + tree countt = NULL_TREE; + if (!is_vshift) + { + if (count >= prec) + countt = integer_zero_node; + else + countt = build_int_cst (integer_type_node, count); + } + tree_vector_builder builder; + if (mask != HOST_WIDE_INT_M1U || is_vshift) + builder.new_vector (TREE_TYPE (args[0]), + TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])), + 1); + else + builder.new_unary_operation (TREE_TYPE (args[0]), args[0], + false); + unsigned int cnt = builder.encoded_nelts (); + for (unsigned int i = 0; i < cnt; ++i) + { + tree elt = VECTOR_CST_ELT (args[0], i); + if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt)) + return NULL_TREE; + tree type = TREE_TYPE (elt); + if (rcode == LSHIFTRT) + elt = fold_convert (unsigned_type_for (type), elt); + if (is_vshift) + { + countt = VECTOR_CST_ELT (args[1], i); + if (TREE_CODE (countt) != INTEGER_CST + || TREE_OVERFLOW (countt)) + return NULL_TREE; + if (wi::neg_p (wi::to_wide (countt)) + || wi::to_widest (countt) >= prec) + { + if (rcode == ASHIFTRT) + countt = build_int_cst (TREE_TYPE (countt), + prec - 1); + else + { + elt = build_zero_cst (TREE_TYPE (elt)); + countt = build_zero_cst (TREE_TYPE (countt)); + } + } + } + else if (count >= prec) + elt = build_zero_cst (TREE_TYPE (elt)); + elt = const_binop (rcode == ASHIFT + ? LSHIFT_EXPR : RSHIFT_EXPR, + TREE_TYPE (elt), elt, countt); + if (!elt || TREE_CODE (elt) != INTEGER_CST) + return NULL_TREE; + if (rcode == LSHIFTRT) + elt = fold_convert (type, elt); + if ((mask & (HOST_WIDE_INT_1U << i)) == 0) + { + elt = VECTOR_CST_ELT (args[n_args - 2], i); + if (TREE_CODE (elt) != INTEGER_CST + || TREE_OVERFLOW (elt)) + return NULL_TREE; + } + builder.quick_push (elt); + } + return builder.build (); + } + break; -#ifndef USE_GAS - if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE) - { - solaris_elf_asm_comdat_section (name, flags, decl); - return; + default: + break; + } } - /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flags as the - SPARC assembler. One cannot mix single-letter flags and #exclude, so - only emit the latter here. */ - if (flags & SECTION_EXCLUDE) - { - fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name); - return; - } +#ifdef SUBTARGET_FOLD_BUILTIN + return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); #endif - default_elf_asm_named_section (name, flags, decl); + return NULL_TREE; } -#endif /* TARGET_SOLARIS */ -/* Return the mangling of TYPE if it is an extended fundamental type. */ +/* Fold a MD builtin (use ix86_fold_builtin for folding into + constant) in GIMPLE. 
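/* A scalar model of the per-element folding loop above, assuming 32-bit
   lanes; illustrative only.  Shift counts at or above the element precision
   fold to zero for the logical shifts and saturate to precision - 1 for the
   arithmetic right shifts, logical right shifts are performed in the
   unsigned type, and lanes whose mask bit is clear take their value from
   the pass-through operand instead.  C's signed >> stands in here for the
   arithmetic shift.  */
#include <stdint.h>

static int32_t
fold_shift_lane (int32_t elt, unsigned int count,
		 int rcode /* 0: left, 1: logical right, 2: arithmetic right */)
{
  if (count >= 32)
    {
      if (rcode != 2)
	return 0;		/* logical shift by >= precision is zero */
      count = 31;		/* arithmetic right shift saturates the count */
    }
  if (rcode == 0)
    return (int32_t) ((uint32_t) elt << count);
  if (rcode == 1)
    return (int32_t) ((uint32_t) elt >> count);	/* done in the unsigned type */
  return elt >> count;
}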
*/ -static const char * -ix86_mangle_type (const_tree type) +bool +ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) { - type = TYPE_MAIN_VARIANT (type); - - if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE - && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE) - return NULL; + gimple *stmt = gsi_stmt (*gsi); + tree fndecl = gimple_call_fndecl (stmt); + gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD)); + int n_args = gimple_call_num_args (stmt); + enum ix86_builtins fn_code + = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); + tree decl = NULL_TREE; + tree arg0, arg1, arg2; + enum rtx_code rcode; + unsigned HOST_WIDE_INT count; + bool is_vshift; - switch (TYPE_MODE (type)) + switch (fn_code) { - case E_TFmode: - /* __float128 is "g". */ - return "g"; - case E_XFmode: - /* "long double" or __float80 is "e". */ - return "e"; - default: - return NULL; - } -} + case IX86_BUILTIN_TZCNT32: + decl = builtin_decl_implicit (BUILT_IN_CTZ); + goto fold_tzcnt_lzcnt; -static GTY(()) tree ix86_tls_stack_chk_guard_decl; + case IX86_BUILTIN_TZCNT64: + decl = builtin_decl_implicit (BUILT_IN_CTZLL); + goto fold_tzcnt_lzcnt; -static tree -ix86_stack_protect_guard (void) -{ - if (TARGET_SSP_TLS_GUARD) - { - tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1); - int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg); - tree type = build_qualified_type (type_node, qual); - tree t; + case IX86_BUILTIN_LZCNT32: + decl = builtin_decl_implicit (BUILT_IN_CLZ); + goto fold_tzcnt_lzcnt; + + case IX86_BUILTIN_LZCNT64: + decl = builtin_decl_implicit (BUILT_IN_CLZLL); + goto fold_tzcnt_lzcnt; + + fold_tzcnt_lzcnt: + gcc_assert (n_args == 1); + arg0 = gimple_call_arg (stmt, 0); + if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt)) + { + int prec = TYPE_PRECISION (TREE_TYPE (arg0)); + /* If arg0 is provably non-zero, optimize into generic + __builtin_c[tl]z{,ll} function the middle-end handles + better. 
*/ + if (!expr_not_equal_to (arg0, wi::zero (prec))) + return false; + + location_t loc = gimple_location (stmt); + gimple *g = gimple_build_call (decl, 1, arg0); + gimple_set_location (g, loc); + tree lhs = make_ssa_name (integer_type_node); + gimple_call_set_lhs (g, lhs); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + break; + + case IX86_BUILTIN_BZHI32: + case IX86_BUILTIN_BZHI64: + gcc_assert (n_args == 2); + arg1 = gimple_call_arg (stmt, 1); + if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt)) + { + unsigned int idx = tree_to_uhwi (arg1) & 0xff; + arg0 = gimple_call_arg (stmt, 0); + if (idx < TYPE_PRECISION (TREE_TYPE (arg0))) + break; + location_t loc = gimple_location (stmt); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + break; + + case IX86_BUILTIN_PDEP32: + case IX86_BUILTIN_PDEP64: + case IX86_BUILTIN_PEXT32: + case IX86_BUILTIN_PEXT64: + gcc_assert (n_args == 2); + arg1 = gimple_call_arg (stmt, 1); + if (integer_all_onesp (arg1) && gimple_call_lhs (stmt)) + { + location_t loc = gimple_location (stmt); + arg0 = gimple_call_arg (stmt, 0); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + break; + + case IX86_BUILTIN_PSLLD: + case IX86_BUILTIN_PSLLD128: + case IX86_BUILTIN_PSLLD128_MASK: + case IX86_BUILTIN_PSLLD256: + case IX86_BUILTIN_PSLLD256_MASK: + case IX86_BUILTIN_PSLLD512: + case IX86_BUILTIN_PSLLDI: + case IX86_BUILTIN_PSLLDI128: + case IX86_BUILTIN_PSLLDI128_MASK: + case IX86_BUILTIN_PSLLDI256: + case IX86_BUILTIN_PSLLDI256_MASK: + case IX86_BUILTIN_PSLLDI512: + case IX86_BUILTIN_PSLLQ: + case IX86_BUILTIN_PSLLQ128: + case IX86_BUILTIN_PSLLQ128_MASK: + case IX86_BUILTIN_PSLLQ256: + case IX86_BUILTIN_PSLLQ256_MASK: + case IX86_BUILTIN_PSLLQ512: + case IX86_BUILTIN_PSLLQI: + case IX86_BUILTIN_PSLLQI128: + case IX86_BUILTIN_PSLLQI128_MASK: + case IX86_BUILTIN_PSLLQI256: + case IX86_BUILTIN_PSLLQI256_MASK: + case IX86_BUILTIN_PSLLQI512: + case IX86_BUILTIN_PSLLW: + case IX86_BUILTIN_PSLLW128: + case IX86_BUILTIN_PSLLW128_MASK: + case IX86_BUILTIN_PSLLW256: + case IX86_BUILTIN_PSLLW256_MASK: + case IX86_BUILTIN_PSLLW512_MASK: + case IX86_BUILTIN_PSLLWI: + case IX86_BUILTIN_PSLLWI128: + case IX86_BUILTIN_PSLLWI128_MASK: + case IX86_BUILTIN_PSLLWI256: + case IX86_BUILTIN_PSLLWI256_MASK: + case IX86_BUILTIN_PSLLWI512_MASK: + rcode = ASHIFT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSRAD: + case IX86_BUILTIN_PSRAD128: + case IX86_BUILTIN_PSRAD128_MASK: + case IX86_BUILTIN_PSRAD256: + case IX86_BUILTIN_PSRAD256_MASK: + case IX86_BUILTIN_PSRAD512: + case IX86_BUILTIN_PSRADI: + case IX86_BUILTIN_PSRADI128: + case IX86_BUILTIN_PSRADI128_MASK: + case IX86_BUILTIN_PSRADI256: + case IX86_BUILTIN_PSRADI256_MASK: + case IX86_BUILTIN_PSRADI512: + case IX86_BUILTIN_PSRAQ128_MASK: + case IX86_BUILTIN_PSRAQ256_MASK: + case IX86_BUILTIN_PSRAQ512: + case IX86_BUILTIN_PSRAQI128_MASK: + case IX86_BUILTIN_PSRAQI256_MASK: + case IX86_BUILTIN_PSRAQI512: + case IX86_BUILTIN_PSRAW: + case IX86_BUILTIN_PSRAW128: + case IX86_BUILTIN_PSRAW128_MASK: + case IX86_BUILTIN_PSRAW256: + case IX86_BUILTIN_PSRAW256_MASK: + case IX86_BUILTIN_PSRAW512: + case IX86_BUILTIN_PSRAWI: + case IX86_BUILTIN_PSRAWI128: + case IX86_BUILTIN_PSRAWI128_MASK: + 
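/* Scalar rationale for the folds above, assuming 32-bit operands; the ref_*
   names are illustrative.  tzcnt and lzcnt are defined for a zero input
   (they return the operand width), while __builtin_ctz and __builtin_clz
   are undefined for zero, hence the expr_not_equal_to check before the
   rewrite.  bzhi keeps the bits below index N, so N >= 32 is an identity,
   and pdep or pext with an all-ones mask moves no bits at all, which is the
   identity case folded above.  */
#include <stdint.h>

static unsigned int ref_tzcnt32 (uint32_t x)
{
  return x ? (unsigned int) __builtin_ctz (x) : 32;
}

static uint32_t ref_bzhi32 (uint32_t x, uint32_t n)
{
  n &= 0xff;				/* only the low 8 bits of N are used */
  return n >= 32 ? x : (x & ((UINT32_C (1) << n) - 1));
}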
case IX86_BUILTIN_PSRAWI256: + case IX86_BUILTIN_PSRAWI256_MASK: + case IX86_BUILTIN_PSRAWI512: + rcode = ASHIFTRT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSRLD: + case IX86_BUILTIN_PSRLD128: + case IX86_BUILTIN_PSRLD128_MASK: + case IX86_BUILTIN_PSRLD256: + case IX86_BUILTIN_PSRLD256_MASK: + case IX86_BUILTIN_PSRLD512: + case IX86_BUILTIN_PSRLDI: + case IX86_BUILTIN_PSRLDI128: + case IX86_BUILTIN_PSRLDI128_MASK: + case IX86_BUILTIN_PSRLDI256: + case IX86_BUILTIN_PSRLDI256_MASK: + case IX86_BUILTIN_PSRLDI512: + case IX86_BUILTIN_PSRLQ: + case IX86_BUILTIN_PSRLQ128: + case IX86_BUILTIN_PSRLQ128_MASK: + case IX86_BUILTIN_PSRLQ256: + case IX86_BUILTIN_PSRLQ256_MASK: + case IX86_BUILTIN_PSRLQ512: + case IX86_BUILTIN_PSRLQI: + case IX86_BUILTIN_PSRLQI128: + case IX86_BUILTIN_PSRLQI128_MASK: + case IX86_BUILTIN_PSRLQI256: + case IX86_BUILTIN_PSRLQI256_MASK: + case IX86_BUILTIN_PSRLQI512: + case IX86_BUILTIN_PSRLW: + case IX86_BUILTIN_PSRLW128: + case IX86_BUILTIN_PSRLW128_MASK: + case IX86_BUILTIN_PSRLW256: + case IX86_BUILTIN_PSRLW256_MASK: + case IX86_BUILTIN_PSRLW512: + case IX86_BUILTIN_PSRLWI: + case IX86_BUILTIN_PSRLWI128: + case IX86_BUILTIN_PSRLWI128_MASK: + case IX86_BUILTIN_PSRLWI256: + case IX86_BUILTIN_PSRLWI256_MASK: + case IX86_BUILTIN_PSRLWI512: + rcode = LSHIFTRT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSLLVV16HI: + case IX86_BUILTIN_PSLLVV16SI: + case IX86_BUILTIN_PSLLVV2DI: + case IX86_BUILTIN_PSLLVV2DI_MASK: + case IX86_BUILTIN_PSLLVV32HI: + case IX86_BUILTIN_PSLLVV4DI: + case IX86_BUILTIN_PSLLVV4DI_MASK: + case IX86_BUILTIN_PSLLVV4SI: + case IX86_BUILTIN_PSLLVV4SI_MASK: + case IX86_BUILTIN_PSLLVV8DI: + case IX86_BUILTIN_PSLLVV8HI: + case IX86_BUILTIN_PSLLVV8SI: + case IX86_BUILTIN_PSLLVV8SI_MASK: + rcode = ASHIFT; + is_vshift = true; + goto do_shift; + case IX86_BUILTIN_PSRAVQ128: + case IX86_BUILTIN_PSRAVQ256: + case IX86_BUILTIN_PSRAVV16HI: + case IX86_BUILTIN_PSRAVV16SI: + case IX86_BUILTIN_PSRAVV32HI: + case IX86_BUILTIN_PSRAVV4SI: + case IX86_BUILTIN_PSRAVV4SI_MASK: + case IX86_BUILTIN_PSRAVV8DI: + case IX86_BUILTIN_PSRAVV8HI: + case IX86_BUILTIN_PSRAVV8SI: + case IX86_BUILTIN_PSRAVV8SI_MASK: + rcode = ASHIFTRT; + is_vshift = true; + goto do_shift; + case IX86_BUILTIN_PSRLVV16HI: + case IX86_BUILTIN_PSRLVV16SI: + case IX86_BUILTIN_PSRLVV2DI: + case IX86_BUILTIN_PSRLVV2DI_MASK: + case IX86_BUILTIN_PSRLVV32HI: + case IX86_BUILTIN_PSRLVV4DI: + case IX86_BUILTIN_PSRLVV4DI_MASK: + case IX86_BUILTIN_PSRLVV4SI: + case IX86_BUILTIN_PSRLVV4SI_MASK: + case IX86_BUILTIN_PSRLVV8DI: + case IX86_BUILTIN_PSRLVV8HI: + case IX86_BUILTIN_PSRLVV8SI: + case IX86_BUILTIN_PSRLVV8SI_MASK: + rcode = LSHIFTRT; + is_vshift = true; + goto do_shift; - if (global_options_set.x_ix86_stack_protector_guard_symbol_str) + do_shift: + gcc_assert (n_args >= 2); + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + if (n_args > 2) { - t = ix86_tls_stack_chk_guard_decl; - - if (t == NULL) - { - rtx x; - - t = build_decl - (UNKNOWN_LOCATION, VAR_DECL, - get_identifier (ix86_stack_protector_guard_symbol_str), - type); - TREE_STATIC (t) = 1; - TREE_PUBLIC (t) = 1; - DECL_EXTERNAL (t) = 1; - TREE_USED (t) = 1; - TREE_THIS_VOLATILE (t) = 1; - DECL_ARTIFICIAL (t) = 1; - DECL_IGNORED_P (t) = 1; - - /* Do not share RTL as the declaration is visible outside of - current function. */ - x = DECL_RTL (t); - RTX_FLAG (x, used) = 1; - - ix86_tls_stack_chk_guard_decl = t; - } + /* This is masked shift. Only optimize if the mask is all ones. 
*/ + tree argl = gimple_call_arg (stmt, n_args - 1); + if (!tree_fits_uhwi_p (argl)) + break; + unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) + break; } - else + if (is_vshift) { - tree asptrtype = build_pointer_type (type); - - t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset); - t = build2 (MEM_REF, asptrtype, t, - build_int_cst (asptrtype, 0)); - TREE_THIS_VOLATILE (t) = 1; + if (TREE_CODE (arg1) != VECTOR_CST) + break; + count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))); + if (integer_zerop (arg1)) + count = 0; + else if (rcode == ASHIFTRT) + break; + else + for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i) + { + tree elt = VECTOR_CST_ELT (arg1, i); + if (!wi::neg_p (wi::to_wide (elt)) + && wi::to_widest (elt) < count) + return false; + } } - - return t; - } - - return default_stack_protect_guard (); -} - -/* For 32-bit code we can save PIC register setup by using - __stack_chk_fail_local hidden function instead of calling - __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC - register, so it is better to call __stack_chk_fail directly. */ - -static tree ATTRIBUTE_UNUSED -ix86_stack_protect_fail (void) -{ - return TARGET_64BIT - ? default_external_stack_protect_fail () - : default_hidden_stack_protect_fail (); -} - -/* Select a format to encode pointers in exception handling data. CODE - is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is - true if the symbol may be affected by dynamic relocations. - - ??? All x86 object file formats are capable of representing this. - After all, the relocation needed is the same as for the call insn. - Whether or not a particular assembler allows us to enter such, I - guess we'll have to see. */ -int -asm_preferred_eh_data_format (int code, int global) -{ - if (flag_pic) - { - int type = DW_EH_PE_sdata8; - if (!TARGET_64BIT - || ix86_cmodel == CM_SMALL_PIC - || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) - type = DW_EH_PE_sdata4; - return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; - } - if (ix86_cmodel == CM_SMALL - || (ix86_cmodel == CM_MEDIUM && code)) - return DW_EH_PE_udata4; - return DW_EH_PE_absptr; -} - -/* Expand copysign from SIGN to the positive value ABS_VALUE - storing in RESULT. If MASK is non-null, it shall be a mask to mask out - the sign-bit. */ -static void -ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) -{ - machine_mode mode = GET_MODE (sign); - rtx sgn = gen_reg_rtx (mode); - if (mask == NULL_RTX) - { - machine_mode vmode; - - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; else - vmode = mode; - - mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); - if (!VECTOR_MODE_P (mode)) { - /* We need to generate a scalar mode mask in this case. */ - rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); - tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, tmp)); + arg1 = ix86_vector_shift_count (arg1); + if (!arg1) + break; + count = tree_to_uhwi (arg1); } - } - else - mask = gen_rtx_NOT (mode, mask); - emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign))); - emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn))); -} + if (count == 0) + { + /* Just return the first argument for shift by 0. 
*/ + location_t loc = gimple_location (stmt); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + if (rcode != ASHIFTRT + && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)))) + { + /* For shift counts equal or greater than precision, except for + arithmetic right shift the result is zero. */ + location_t loc = gimple_location (stmt); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + build_zero_cst (TREE_TYPE (arg0))); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + break; -/* Expand fabs (OP0) and return a new rtx that holds the result. The - mask for masking out the sign-bit is stored in *SMASK, if that is - non-null. */ -static rtx -ix86_expand_sse_fabs (rtx op0, rtx *smask) -{ - machine_mode vmode, mode = GET_MODE (op0); - rtx xa, mask; + case IX86_BUILTIN_SHUFPD: + arg2 = gimple_call_arg (stmt, 2); + if (TREE_CODE (arg2) == INTEGER_CST) + { + location_t loc = gimple_location (stmt); + unsigned HOST_WIDE_INT imask = TREE_INT_CST_LOW (arg2); + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + tree itype = long_long_integer_type_node; + tree vtype = build_vector_type (itype, 2); /* V2DI */ + tree_vector_builder elts (vtype, 2, 1); + /* Ignore bits other than the lowest 2. */ + elts.quick_push (build_int_cst (itype, imask & 1)); + imask >>= 1; + elts.quick_push (build_int_cst (itype, 2 + (imask & 1))); + tree omask = elts.build (); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + VEC_PERM_EXPR, + arg0, arg1, omask); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + // Do not error yet, the constant could be propagated later? + break; - xa = gen_reg_rtx (mode); - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - else - vmode = mode; - mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true); - if (!VECTOR_MODE_P (mode)) - { - /* We need to generate a scalar mode mask in this case. */ - rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); - tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, tmp)); + default: + break; } - emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask))); - - if (smask) - *smask = mask; - - return xa; -} - -/* Expands a comparison of OP0 with OP1 using comparison code CODE, - swapping the operands if SWAP_OPERANDS is true. The expanded - code is a forward jump to a newly created label in case the - comparison is true. The generated label rtx is returned. */ -static rtx_code_label * -ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, - bool swap_operands) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - rtx_code_label *label; - rtx tmp, reg; - - if (swap_operands) - std::swap (op0, op1); - - label = gen_label_rtx (); - tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); - if (unordered_compare) - tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); - reg = gen_rtx_REG (CCFPmode, FLAGS_REG); - emit_insn (gen_rtx_SET (reg, tmp)); - tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); - tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (tmp) = label; - - return label; -} - -/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1 - using comparison code CODE. 
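/* A scalar model of the __builtin_ia32_shufpd fold above: only the low two
   immediate bits are used, one selector bit per destination lane, which is
   why the fold builds the VEC_PERM_EXPR mask { imm & 1, 2 + ((imm >> 1) & 1) }.
   Illustrative only; relies on the GNU vector extension.  */
typedef double v2df_t __attribute__ ((vector_size (16)));

static v2df_t ref_shufpd (v2df_t a, v2df_t b, int imm)
{
  v2df_t r;
  r[0] = (imm & 1) ? a[1] : a[0];	/* element 0 or 1 of the first source  */
  r[1] = (imm & 2) ? b[1] : b[0];	/* element 0 or 1 of the second source */
  return r;
}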
Operands are swapped for the comparison if - SWAP_OPERANDS is true. Returns a rtx for the generated mask. */ -static rtx -ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, - bool swap_operands) -{ - rtx (*insn)(rtx, rtx, rtx, rtx); - machine_mode mode = GET_MODE (op0); - rtx mask = gen_reg_rtx (mode); - - if (swap_operands) - std::swap (op0, op1); - - insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse; - - emit_insn (insn (mask, op0, op1, - gen_rtx_fmt_ee (code, mode, op0, op1))); - return mask; -} - -/* Generate and return a rtx of mode MODE for 2**n where n is the number - of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */ -static rtx -ix86_gen_TWO52 (machine_mode mode) -{ - REAL_VALUE_TYPE TWO52r; - rtx TWO52; - - real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23); - TWO52 = const_double_from_real_value (TWO52r, mode); - TWO52 = force_reg (mode, TWO52); - - return TWO52; -} - -/* Expand SSE sequence for computing lround from OP1 storing - into OP0. */ -void -ix86_expand_lround (rtx op0, rtx op1) -{ - /* C code for the stuff we're doing below: - tmp = op1 + copysign (nextafter (0.5, 0.0), op1) - return (long)tmp; - */ - machine_mode mode = GET_MODE (op1); - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; - rtx adj; - - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); - /* adj = copysign (0.5, op1) */ - adj = force_reg (mode, const_double_from_real_value (pred_half, mode)); - ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); - - /* adj = op1 + adj */ - adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); - - /* op0 = (imode)adj */ - expand_fix (op0, adj, 0); -} - -/* Expand SSE2 sequence for computing lround from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) -{ - /* C code for the stuff we're doing below (for do_floor): - xi = (long)op1; - xi -= (double)xi > op1 ? 1 : 0; - return xi; - */ - machine_mode fmode = GET_MODE (op1); - machine_mode imode = GET_MODE (op0); - rtx ireg, freg, tmp; - rtx_code_label *label; - - /* reg = (long)op1 */ - ireg = gen_reg_rtx (imode); - expand_fix (ireg, op1, 0); - - /* freg = (double)reg */ - freg = gen_reg_rtx (fmode); - expand_float (freg, ireg, 0); - - /* ireg = (freg > op1) ? ireg - 1 : ireg */ - label = ix86_expand_sse_compare_and_jump (UNLE, - freg, op1, !do_floor); - tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS, - ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (ireg, tmp); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (op0, ireg); + return false; } -/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. 
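/* The C recipes quoted in the lround/lfloor/lceil expander comments above,
   written out as an illustrative sketch; (long) conversion truncates toward
   zero.  nextafter (0.5, 0.0), the largest double below one half, is used
   instead of 0.5 so that inputs just under 0.5 are not pushed up to 1.0 by
   the rounding of the addition.  */
#include <math.h>

static long ref_lround (double x)
{
  return (long) (x + copysign (nextafter (0.5, 0.0), x));
}

static long ref_lfloor (double x)
{
  long xi = (long) x;			/* truncate toward zero */
  return xi - ((double) xi > x);	/* truncated upward: step back down */
}

static long ref_lceil (double x)
{
  long xi = (long) x;
  return xi + ((double) xi < x);	/* truncated downward: step back up */
}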
*/ -void -ix86_expand_rint (rtx operand0, rtx operand1) -{ - /* C code for the stuff we're doing below: - xa = fabs (operand1); - if (!isless (xa, 2**52)) - return operand1; - two52 = 2**52; - if (flag_rounding_math) - { - two52 = copysign (two52, operand1); - xa = operand1; - } - xa = xa + two52 - two52; - return copysign (xa, operand1); - */ - machine_mode mode = GET_MODE (operand0); - rtx res, xa, TWO52, two52, mask; - rtx_code_label *label; - - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); - - /* if (!isless (xa, TWO52)) goto label; */ - TWO52 = ix86_gen_TWO52 (mode); - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - two52 = TWO52; - if (flag_rounding_math) - { - two52 = gen_reg_rtx (mode); - ix86_sse_copysign_to_positive (two52, TWO52, res, mask); - xa = res; - } - - xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT); - xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT); - - ix86_sse_copysign_to_positive (res, xa, res, mask); - - emit_label (label); - LABEL_NUSES (label) = 1; +/* Handler for an SVML-style interface to + a library with vectorized intrinsics. */ - emit_move_insn (operand0, res); -} +tree +ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in) +{ + char name[20]; + tree fntype, new_fndecl, args; + unsigned arity; + const char *bname; + machine_mode el_mode, in_mode; + int n, in_n; -/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - xa = xa + TWO52 - TWO52; - x2 = copysign (xa, x); - Compensate. Floor: - if (x2 > x) - x2 -= 1; - Compensate. Ceil: - if (x2 < x) - x2 += 1; - if (HONOR_SIGNED_ZEROS (mode)) - x2 = copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, TWO52, tmp, one, res, mask; - rtx_code_label *label; + /* The SVML is suitable for unsafe math only. */ + if (!flag_unsafe_math_optimizations) + return NULL_TREE; - TWO52 = ix86_gen_TWO52 (mode); + el_mode = TYPE_MODE (TREE_TYPE (type_out)); + n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + if (el_mode != in_mode + || n != in_n) + return NULL_TREE; - /* Temporary for holding the result, initialized to the input - operand to ease control flow. 
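/* The 2^52 trick used by ix86_expand_rint above, as an illustrative C
   sketch: once a value with |x| < 2^52 has been added to 2^52 the sum has
   no fractional bits, so the FPU's current rounding mode disposes of the
   fraction, and subtracting 2^52 again leaves the rounded integer.  The
   'volatile' only keeps this illustration from being folded at compile
   time.  */
#include <math.h>

static double ref_rint (double x)
{
  const double two52 = 4503599627370496.0;	/* 2^52 */
  if (!(fabs (x) < two52))
    return x;					/* already integral, or NaN */
  volatile double t = fabs (x) + two52;
  return copysign (t - two52, x);
}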
*/ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); + switch (fn) + { + CASE_CFN_EXP: + CASE_CFN_LOG: + CASE_CFN_LOG10: + CASE_CFN_POW: + CASE_CFN_TANH: + CASE_CFN_TAN: + CASE_CFN_ATAN: + CASE_CFN_ATAN2: + CASE_CFN_ATANH: + CASE_CFN_CBRT: + CASE_CFN_SINH: + CASE_CFN_SIN: + CASE_CFN_ASINH: + CASE_CFN_ASIN: + CASE_CFN_COSH: + CASE_CFN_COS: + CASE_CFN_ACOSH: + CASE_CFN_ACOS: + if ((el_mode != DFmode || n != 2) + && (el_mode != SFmode || n != 4)) + return NULL_TREE; + break; - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); + default: + return NULL_TREE; + } - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); + bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); - /* xa = xa + TWO52 - TWO52; */ - xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); + if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF) + strcpy (name, "vmlsLn4"); + else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG) + strcpy (name, "vmldLn2"); + else if (n == 4) + { + sprintf (name, "vmls%s", bname+10); + name[strlen (name)-1] = '4'; + } + else + sprintf (name, "vmld%s2", bname+10); - /* xa = copysign (xa, operand1) */ - ix86_sse_copysign_to_positive (xa, xa, res, mask); + /* Convert to uppercase. */ + name[4] &= ~0x20; - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + arity = 0; + for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) + arity++; - /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); - tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, - xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); - if (!do_floor && HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (tmp, tmp, res, mask); - emit_move_insn (res, tmp); + if (arity == 1) + fntype = build_function_type_list (type_out, type_in, NULL); + else + fntype = build_function_type_list (type_out, type_in, type_in, NULL); - emit_label (label); - LABEL_NUSES (label) = 1; + /* Build a function declaration for the vectorized function. */ + new_fndecl = build_decl (BUILTINS_LOCATION, + FUNCTION_DECL, get_identifier (name), fntype); + TREE_PUBLIC (new_fndecl) = 1; + DECL_EXTERNAL (new_fndecl) = 1; + DECL_IS_NOVOPS (new_fndecl) = 1; + TREE_READONLY (new_fndecl) = 1; - emit_move_insn (operand0, res); + return new_fndecl; } -/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - x2 = (double)(long)x; - Compensate. Floor: - if (x2 > x) - x2 -= 1; - Compensate. Ceil: - if (x2 < x) - x2 += 1; - if (HONOR_SIGNED_ZEROS (mode)) - return copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xi, TWO52, tmp, one, res, mask; - rtx_code_label *label; +/* Handler for an ACML-style interface to + a library with vectorized intrinsics. 
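/* Sketch of the SVML name mangling performed above; BNAME is assumed to
   carry the usual "__builtin_" prefix (10 characters).  For example,
   "__builtin_sinf" with four float lanes becomes "vmlsSin4" and
   "__builtin_pow" with two double lanes becomes "vmldPow2"; log itself is
   special-cased to vmlsLn4 and vmldLn2 before this point.  */
#include <stdio.h>
#include <string.h>

static void svml_name (char name[20], const char *bname, int four_floats)
{
  if (four_floats)
    {
      sprintf (name, "vmls%s", bname + 10);	/* single-precision variant   */
      name[strlen (name) - 1] = '4';		/* trailing 'f' -> lane count */
    }
  else
    sprintf (name, "vmld%s2", bname + 10);	/* double-precision variant   */
  name[4] &= ~0x20;				/* uppercase the first letter */
}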
*/ - TWO52 = ix86_gen_TWO52 (mode); +tree +ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in) +{ + char name[20] = "__vr.._"; + tree fntype, new_fndecl, args; + unsigned arity; + const char *bname; + machine_mode el_mode, in_mode; + int n, in_n; - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); + /* The ACML is 64bits only and suitable for unsafe math only as + it does not correctly support parts of IEEE with the required + precision such as denormals. */ + if (!TARGET_64BIT + || !flag_unsafe_math_optimizations) + return NULL_TREE; - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); + el_mode = TYPE_MODE (TREE_TYPE (type_out)); + n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + if (el_mode != in_mode + || n != in_n) + return NULL_TREE; - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + switch (fn) + { + CASE_CFN_SIN: + CASE_CFN_COS: + CASE_CFN_EXP: + CASE_CFN_LOG: + CASE_CFN_LOG2: + CASE_CFN_LOG10: + if (el_mode == DFmode && n == 2) + { + name[4] = 'd'; + name[5] = '2'; + } + else if (el_mode == SFmode && n == 4) + { + name[4] = 's'; + name[5] = '4'; + } + else + return NULL_TREE; + break; - /* xa = (double)(long)x */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, res, 0); - expand_float (xa, xi, 0); + default: + return NULL_TREE; + } - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); + bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); + sprintf (name + 7, "%s", bname+10); - /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); - tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, - xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); + arity = 0; + for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) + arity++; - if (HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + if (arity == 1) + fntype = build_function_type_list (type_out, type_in, NULL); + else + fntype = build_function_type_list (type_out, type_in, type_in, NULL); - emit_label (label); - LABEL_NUSES (label) = 1; + /* Build a function declaration for the vectorized function. */ + new_fndecl = build_decl (BUILTINS_LOCATION, + FUNCTION_DECL, get_identifier (name), fntype); + TREE_PUBLIC (new_fndecl) = 1; + DECL_EXTERNAL (new_fndecl) = 1; + DECL_IS_NOVOPS (new_fndecl) = 1; + TREE_READONLY (new_fndecl) = 1; - emit_move_insn (operand0, res); + return new_fndecl; } -/* Expand SSE sequence for computing round from OPERAND1 storing - into OPERAND0. Sequence that works without relying on DImode truncation - via cvttsd2siq that is only available on 64bit targets. */ -void -ix86_expand_rounddf_32 (rtx operand0, rtx operand1) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), xa2, x2; - if (!isless (xa, TWO52)) - return x; - Using the absolute value and copying back sign makes - -0.0 -> -0.0 correct. - xa2 = xa + TWO52 - TWO52; - Compensate. 
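/* Sketch of the ACML name mangling above: "__vrd2_" for two doubles,
   "__vrs4_" for four floats, followed by the builtin name without its
   "__builtin_" prefix, e.g. "__builtin_sin" becomes "__vrd2_sin" and
   "__builtin_logf" becomes "__vrs4_logf".  Illustrative only.  */
#include <stdio.h>

static void acml_name (char name[20], const char *bname, int two_doubles)
{
  sprintf (name, "__vr%c%c_%s",
	   two_doubles ? 'd' : 's',
	   two_doubles ? '2' : '4',
	   bname + 10);
}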
- dxa = xa2 - xa; - if (dxa <= -0.5) - xa2 += 1; - else if (dxa > 0.5) - xa2 -= 1; - x2 = copysign (xa2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask; - rtx_code_label *label; - - TWO52 = ix86_gen_TWO52 (mode); - - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); +/* Returns a decl of a function that implements scatter store with + register type VECTYPE and index type INDEX_TYPE and SCALE. + Return NULL_TREE if it is not available. */ - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); +static tree +ix86_vectorize_builtin_scatter (const_tree vectype, + const_tree index_type, int scale) +{ + bool si; + enum ix86_builtins code; - /* xa2 = xa + TWO52 - TWO52; */ - xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); + if (!TARGET_AVX512F) + return NULL_TREE; - /* dxa = xa2 - xa; */ - dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); + if ((TREE_CODE (index_type) != INTEGER_TYPE + && !POINTER_TYPE_P (index_type)) + || (TYPE_MODE (index_type) != SImode + && TYPE_MODE (index_type) != DImode)) + return NULL_TREE; - /* generate 0.5, 1.0 and -0.5 */ - half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); - one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); - mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, - 0, OPTAB_DIRECT); + if (TYPE_PRECISION (index_type) > POINTER_SIZE) + return NULL_TREE; - /* Compensate. */ - /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); - xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); - /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one))); - xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + /* v*scatter* insn sign extends index to pointer mode. */ + if (TYPE_PRECISION (index_type) < POINTER_SIZE + && TYPE_UNSIGNED (index_type)) + return NULL_TREE; - /* res = copysign (xa2, operand1) */ - ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); + /* Scale can be 1, 2, 4 or 8. */ + if (scale <= 0 + || scale > 8 + || (scale & (scale - 1)) != 0) + return NULL_TREE; - emit_label (label); - LABEL_NUSES (label) = 1; + si = TYPE_MODE (index_type) == SImode; + switch (TYPE_MODE (vectype)) + { + case E_V8DFmode: + code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF; + break; + case E_V8DImode: + code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI; + break; + case E_V16SFmode: + code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF; + break; + case E_V16SImode: + code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI; + break; + case E_V4DFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERALTSIV4DF : IX86_BUILTIN_SCATTERDIV4DF; + else + return NULL_TREE; + break; + case E_V4DImode: + if (TARGET_AVX512VL) + code = si ? 
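/* The scatter availability checks made above, condensed into a scalar
   predicate; illustrative only.  The index must be a 32- or 64-bit integer
   or pointer no wider than a pointer, a narrower unsigned index is rejected
   because the scatter instructions sign-extend it, and the scale must be
   1, 2, 4 or 8.  */
#include <stdbool.h>

static bool scatter_scale_ok (int scale)
{
  /* A power of two between 1 and 8, exactly as tested above.  */
  return scale > 0 && scale <= 8 && (scale & (scale - 1)) == 0;
}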
IX86_BUILTIN_SCATTERALTSIV4DI : IX86_BUILTIN_SCATTERDIV4DI; + else + return NULL_TREE; + break; + case E_V8SFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERSIV8SF : IX86_BUILTIN_SCATTERALTDIV8SF; + else + return NULL_TREE; + break; + case E_V8SImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERSIV8SI : IX86_BUILTIN_SCATTERALTDIV8SI; + else + return NULL_TREE; + break; + case E_V2DFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : IX86_BUILTIN_SCATTERDIV2DF; + else + return NULL_TREE; + break; + case E_V2DImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : IX86_BUILTIN_SCATTERDIV2DI; + else + return NULL_TREE; + break; + case E_V4SFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERSIV4SF : IX86_BUILTIN_SCATTERALTDIV4SF; + else + return NULL_TREE; + break; + case E_V4SImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERSIV4SI : IX86_BUILTIN_SCATTERALTDIV4SI; + else + return NULL_TREE; + break; + default: + return NULL_TREE; + } - emit_move_insn (operand0, res); + return get_ix86_builtin (code); } -/* Expand SSE sequence for computing trunc from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_trunc (rtx operand0, rtx operand1) -{ - /* C code for SSE variant we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - x2 = (double)(long)x; - if (HONOR_SIGNED_ZEROS (mode)) - return copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xi, TWO52, res, mask; - rtx_code_label *label; +/* Return true if it is safe to use the rsqrt optabs to optimize + 1.0/sqrt. */ - TWO52 = ix86_gen_TWO52 (mode); +static bool +use_rsqrt_p () +{ + return (TARGET_SSE && TARGET_SSE_MATH + && flag_finite_math_only + && !flag_trapping_math + && flag_unsafe_math_optimizations); +} + +/* Helper for avx_vpermilps256_operand et al. This is also used by + the expansion functions to turn the parallel back into a mask. + The return value is 0 for no match and the imm8+1 for a match. */ - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); +int +avx_vpermilp_parallel (rtx par, machine_mode mode) +{ + unsigned i, nelt = GET_MODE_NUNITS (mode); + unsigned mask = 0; + unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */ - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); + if (XVECLEN (par, 0) != (int) nelt) + return 0; - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + /* Validate that all of the elements are constants, and not totally + out of range. Copy the data into an integral array to make the + subsequent checks easier. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (par, 0, i); + unsigned HOST_WIDE_INT ei; - /* x = (double)(long)x */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, res, 0); - expand_float (res, xi, 0); + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (ei >= nelt) + return 0; + ipar[i] = ei; + } - if (HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + switch (mode) + { + case E_V8DFmode: + /* In the 512-bit DFmode case, we can only move elements within + a 128-bit lane. First fill the second part of the mask, + then fallthru. 
*/ + for (i = 4; i < 6; ++i) + { + if (ipar[i] < 4 || ipar[i] >= 6) + return 0; + mask |= (ipar[i] - 4) << i; + } + for (i = 6; i < 8; ++i) + { + if (ipar[i] < 6) + return 0; + mask |= (ipar[i] - 6) << i; + } + /* FALLTHRU */ - emit_label (label); - LABEL_NUSES (label) = 1; + case E_V4DFmode: + /* In the 256-bit DFmode case, we can only move elements within + a 128-bit lane. */ + for (i = 0; i < 2; ++i) + { + if (ipar[i] >= 2) + return 0; + mask |= ipar[i] << i; + } + for (i = 2; i < 4; ++i) + { + if (ipar[i] < 2) + return 0; + mask |= (ipar[i] - 2) << i; + } + break; - emit_move_insn (operand0, res); -} + case E_V16SFmode: + /* In 512 bit SFmode case, permutation in the upper 256 bits + must mirror the permutation in the lower 256-bits. */ + for (i = 0; i < 8; ++i) + if (ipar[i] + 8 != ipar[i + 8]) + return 0; + /* FALLTHRU */ -/* Expand SSE sequence for computing trunc from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_truncdf_32 (rtx operand0, rtx operand1) -{ - machine_mode mode = GET_MODE (operand0); - rtx xa, mask, TWO52, one, res, smask, tmp; - rtx_code_label *label; + case E_V8SFmode: + /* In 256 bit SFmode case, we have full freedom of + movement within the low 128-bit lane, but the high 128-bit + lane must mirror the exact same pattern. */ + for (i = 0; i < 4; ++i) + if (ipar[i] + 4 != ipar[i + 4]) + return 0; + nelt = 4; + /* FALLTHRU */ - /* C code for SSE variant we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - xa2 = xa + TWO52 - TWO52; - Compensate: - if (xa2 > xa) - xa2 -= 1.0; - x2 = copysign (xa2, x); - return x2; - */ + case E_V2DFmode: + case E_V4SFmode: + /* In the 128-bit case, we've full freedom in the placement of + the elements from the source operand. */ + for (i = 0; i < nelt; ++i) + mask |= ipar[i] << (i * (nelt / 2)); + break; - TWO52 = ix86_gen_TWO52 (mode); + default: + gcc_unreachable (); + } - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); + /* Make sure success has a non-zero value by adding one. */ + return mask + 1; +} - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &smask); +/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by + the expansion functions to turn the parallel back into a mask. + The return value is 0 for no match and the imm8+1 for a match. */ - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); +int +avx_vperm2f128_parallel (rtx par, machine_mode mode) +{ + unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2; + unsigned mask = 0; + unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */ - /* res = xa + TWO52 - TWO52; */ - tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); + if (XVECLEN (par, 0) != (int) nelt) + return 0; - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + /* Validate that all of the elements are constants, and not totally + out of range. Copy the data into an integral array to make the + subsequent checks easier. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (par, 0, i); + unsigned HOST_WIDE_INT ei; - /* Compensate: res = xa2 - (res > xa ? 
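/* A worked instance of the V4DF case above, as standalone C: the parallel
   (1 0 3 2) swaps the two doubles inside each 128-bit lane, giving the
   vpermilpd immediate 0b0101, so the helper would return imm8 + 1 = 6.
   Illustrative only; the indices are assumed to have been validated to be
   below 4 already, as the caller above does.  */
static int vpermilpd_v4df_mask (const unsigned char ipar[4])
{
  unsigned int mask = 0, i;

  for (i = 0; i < 2; ++i)		/* low lane may only pick 0 or 1  */
    {
      if (ipar[i] >= 2)
	return 0;
      mask |= ipar[i] << i;
    }
  for (i = 2; i < 4; ++i)		/* high lane may only pick 2 or 3 */
    {
      if (ipar[i] < 2)
	return 0;
      mask |= (ipar[i] - 2) << i;
    }
  return mask + 1;			/* 0 means no match, else imm8 + 1 */
}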
1 : 0) */ - mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); - emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one))); - tmp = expand_simple_binop (mode, MINUS, - res, mask, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (ei >= 2 * nelt) + return 0; + ipar[i] = ei; + } - /* res = copysign (res, operand1) */ - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); + /* Validate that the halves of the permute are halves. */ + for (i = 0; i < nelt2 - 1; ++i) + if (ipar[i] + 1 != ipar[i + 1]) + return 0; + for (i = nelt2; i < nelt - 1; ++i) + if (ipar[i] + 1 != ipar[i + 1]) + return 0; - emit_label (label); - LABEL_NUSES (label) = 1; + /* Reconstruct the mask. */ + for (i = 0; i < 2; ++i) + { + unsigned e = ipar[i * nelt2]; + if (e % nelt2) + return 0; + e /= nelt2; + mask |= e << (i * 4); + } - emit_move_insn (operand0, res); + /* Make sure success has a non-zero value by adding one. */ + return mask + 1; +} + +/* Return a register priority for hard reg REGNO. */ +static int +ix86_register_priority (int hard_regno) +{ + /* ebp and r13 as the base always wants a displacement, r12 as the + base always wants an index. So discourage their usage in an + address. */ + if (hard_regno == R12_REG || hard_regno == R13_REG) + return 0; + if (hard_regno == BP_REG) + return 1; + /* New x86-64 int registers result in bigger code size. Discourage + them. */ + if (IN_RANGE (hard_regno, FIRST_REX_INT_REG, LAST_REX_INT_REG)) + return 2; + /* New x86-64 SSE registers result in bigger code size. Discourage + them. */ + if (IN_RANGE (hard_regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)) + return 2; + if (IN_RANGE (hard_regno, FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG)) + return 1; + /* Usage of AX register results in smaller code. Prefer it. */ + if (hard_regno == AX_REG) + return 4; + return 3; } -/* Expand SSE sequence for computing round from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_round (rtx operand0, rtx operand1) -{ - /* C code for the stuff we're doing below: - double xa = fabs (x); - if (!isless (xa, TWO52)) - return x; - xa = (double)(long)(xa + nextafter (0.5, 0.0)); - return copysign (xa, x); - */ - machine_mode mode = GET_MODE (operand0); - rtx res, TWO52, xa, xi, half, mask; - rtx_code_label *label; - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; +/* Implement TARGET_PREFERRED_RELOAD_CLASS. + + Put float CONST_DOUBLE in the constant pool instead of fp regs. + QImode must go into class Q_REGS. + Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and + movdf to do mem-to-mem moves through integer regs. */ - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); +static reg_class_t +ix86_preferred_reload_class (rtx x, reg_class_t regclass) +{ + machine_mode mode = GET_MODE (x); - TWO52 = ix86_gen_TWO52 (mode); - xa = ix86_expand_sse_fabs (res, &mask); - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + /* We're only allowed to return a subclass of CLASS. Many of the + following checks fail for NO_REGS, so eliminate that early. */ + if (regclass == NO_REGS) + return NO_REGS; - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + /* All classes can load zeros. 
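/* A worked instance of the vperm2f128 reconstruction above for V4DF, as
   standalone C: the parallel (2 3 0 1) selects the high then the low
   128-bit half of the first operand, i.e. immediate 0x01, so the helper
   would return imm8 + 1 = 2; indices 4..7 would refer to the second source
   operand.  Illustrative only.  */
static int vperm2f128_v4df_mask (const unsigned char ipar[4])
{
  unsigned int mask;

  /* Each half of the result must be two consecutive source elements
     starting on an even index.  */
  if (ipar[1] != ipar[0] + 1 || ipar[3] != ipar[2] + 1
      || (ipar[0] & 1) != 0 || (ipar[2] & 1) != 0)
    return 0;

  mask = (ipar[0] / 2) | ((ipar[2] / 2) << 4);
  return mask + 1;			/* 0 means no match, else imm8 + 1 */
}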
*/ + if (x == CONST0_RTX (mode)) + return regclass; - /* xa = xa + 0.5 */ - half = force_reg (mode, const_double_from_real_value (pred_half, mode)); - xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); + /* Force constants into memory if we are loading a (nonzero) constant into + an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK + instructions to load from a constant. */ + if (CONSTANT_P (x) + && (MAYBE_MMX_CLASS_P (regclass) + || MAYBE_SSE_CLASS_P (regclass) + || MAYBE_MASK_CLASS_P (regclass))) + return NO_REGS; - /* xa = (double)(int64_t)xa */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, xa, 0); - expand_float (xa, xi, 0); + /* Floating-point constants need more complex checks. */ + if (CONST_DOUBLE_P (x)) + { + /* General regs can load everything. */ + if (INTEGER_CLASS_P (regclass)) + return regclass; - /* res = copysign (xa, operand1) */ - ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask); + /* Floats can load 0 and 1 plus some others. Note that we eliminated + zero above. We only want to wind up preferring 80387 registers if + we plan on doing computation with them. */ + if (IS_STACK_MODE (mode) + && standard_80387_constant_p (x) > 0) + { + /* Limit class to FP regs. */ + if (FLOAT_CLASS_P (regclass)) + return FLOAT_REGS; + } - emit_label (label); - LABEL_NUSES (label) = 1; + return NO_REGS; + } - emit_move_insn (operand0, res); -} + /* Prefer SSE regs only, if we can use them for math. */ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return SSE_CLASS_P (regclass) ? regclass : NO_REGS; -/* Expand SSE sequence for computing round - from OP1 storing into OP0 using sse4 round insn. */ -void -ix86_expand_round_sse4 (rtx op0, rtx op1) -{ - machine_mode mode = GET_MODE (op0); - rtx e1, e2, res, half; - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; - rtx (*gen_copysign) (rtx, rtx, rtx); - rtx (*gen_round) (rtx, rtx, rtx); + /* Generally when we see PLUS here, it's the function invariant + (plus soft-fp const_int). Which can only be computed into general + regs. */ + if (GET_CODE (x) == PLUS) + return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS; - switch (mode) + /* QImode constants are easy to load, but non-constant QImode data + must go into Q_REGS. */ + if (GET_MODE (x) == QImode && !CONSTANT_P (x)) { - case E_SFmode: - gen_copysign = gen_copysignsf3; - gen_round = gen_sse4_1_roundsf2; - break; - case E_DFmode: - gen_copysign = gen_copysigndf3; - gen_round = gen_sse4_1_rounddf2; - break; - default: - gcc_unreachable (); + if (Q_CLASS_P (regclass)) + return regclass; + else if (reg_class_subset_p (Q_REGS, regclass)) + return Q_REGS; + else + return NO_REGS; } - /* round (a) = trunc (a + copysign (0.5, a)) */ - - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); - half = const_double_from_real_value (pred_half, mode); + return regclass; +} - /* e1 = copysign (0.5, op1) */ - e1 = gen_reg_rtx (mode); - emit_insn (gen_copysign (e1, half, op1)); +/* Discourage putting floating-point values in SSE registers unless + SSE math is being used, and likewise for the 387 registers. 
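/* The SSE4.1 round sequence above in plain C: round-half-away-from-zero
   computed as trunc (a + copysign (pred_half, a)), where pred_half is
   nextafter (0.5, 0.0) rather than 0.5 so that inputs just below one half
   are not pushed up by the addition before truncation.  Illustrative
   sketch only.  */
#include <math.h>

static double ref_round_sse4 (double a)
{
  double pred_half = nextafter (0.5, 0.0);	/* largest double < 0.5 */
  return trunc (a + copysign (pred_half, a));
}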
*/ +static reg_class_t +ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) +{ + machine_mode mode = GET_MODE (x); - /* e2 = op1 + e1 */ - e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT); + /* Restrict the output reload class to the register bank that we are doing + math on. If we would like not to return a subset of CLASS, reject this + alternative: if reload cannot do this, it will still use its choice. */ + mode = GET_MODE (x); + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS; - /* res = trunc (e2) */ - res = gen_reg_rtx (mode); - emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC))); + if (IS_STACK_MODE (mode)) + return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS; - emit_move_insn (op0, res); + return regclass; } -/* Handle fentry_name / fentry_section attribute. */ - -static tree -ix86_handle_fentry_name (tree *node, tree name, tree args, - int, bool *no_add_attrs) +static reg_class_t +ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass, + machine_mode mode, secondary_reload_info *sri) { - if (TREE_CODE (*node) == FUNCTION_DECL - && TREE_CODE (TREE_VALUE (args)) == STRING_CST) - /* Do nothing else, just set the attribute. We'll get at - it later with lookup_attribute. */ - ; - else + /* Double-word spills from general registers to non-offsettable memory + references (zero-extended addresses) require special handling. */ + if (TARGET_64BIT + && MEM_P (x) + && GET_MODE_SIZE (mode) > UNITS_PER_WORD + && INTEGER_CLASS_P (rclass) + && !offsettable_memref_p (x)) { - warning (OPT_Wattributes, "%qE attribute ignored", name); - *no_add_attrs = true; - } - - return NULL_TREE; -} - - -/* Table of valid machine attributes. */ -static const struct attribute_spec ix86_attribute_table[] = -{ - /* { name, min_len, max_len, decl_req, type_req, fn_type_req, - affects_type_identity, handler, exclude } */ - /* Stdcall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Fastcall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Thiscall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Cdecl attribute says the callee is a normal C declaration */ - { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Regparm attribute specifies how many integer arguments are to be - passed in registers. */ - { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Sseregparm attribute says we are using x86_64 calling conventions - for FP arguments. */ - { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* The transactional memory builtins are implicitly regparm or fastcall - depending on the ABI. Override the generic do-nothing attribute that - these builtins were declared with. */ - { "*tm regparm", 0, 0, false, true, true, true, - ix86_handle_tm_regparm_attribute, NULL }, - /* force_align_arg_pointer says this function realigns the stack at entry. 
*/ - { (const char *)&ix86_force_align_arg_pointer_string, 0, 0, - false, true, true, false, ix86_handle_force_align_arg_pointer_attribute, - NULL }, -#if TARGET_DLLIMPORT_DECL_ATTRIBUTES - { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, - NULL }, - { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, - NULL }, - { "shared", 0, 0, true, false, false, false, - ix86_handle_shared_attribute, NULL }, -#endif - { "ms_struct", 0, 0, false, false, false, false, - ix86_handle_struct_attribute, NULL }, - { "gcc_struct", 0, 0, false, false, false, false, - ix86_handle_struct_attribute, NULL }, -#ifdef SUBTARGET_ATTRIBUTE_TABLE - SUBTARGET_ATTRIBUTE_TABLE, -#endif - /* ms_abi and sysv_abi calling convention function attributes. */ - { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL }, - { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, - NULL }, - { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, - { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, - { "ms_hook_prologue", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "callee_pop_aggregate_return", 1, 1, false, true, true, true, - ix86_handle_callee_pop_aggregate_return, NULL }, - { "interrupt", 0, 0, false, true, true, false, - ix86_handle_interrupt_attribute, NULL }, - { "no_caller_saved_registers", 0, 0, false, true, true, false, - ix86_handle_no_caller_saved_registers_attribute, NULL }, - { "naked", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "indirect_branch", 1, 1, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "function_return", 1, 1, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "indirect_return", 0, 0, false, true, true, false, - NULL, NULL }, - { "fentry_name", 1, 1, true, false, false, false, - ix86_handle_fentry_name, NULL }, - { "fentry_section", 1, 1, true, false, false, false, - ix86_handle_fentry_name, NULL }, - { "cf_check", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - - /* End element. */ - { NULL, 0, 0, false, false, false, false, NULL, NULL } -}; + sri->icode = (in_p + ? CODE_FOR_reload_noff_load + : CODE_FOR_reload_noff_store); + /* Add the cost of moving address to a temporary. */ + sri->extra_cost = 1; -/* Implement targetm.vectorize.builtin_vectorization_cost. */ -static int -ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype, int) -{ - bool fp = false; - machine_mode mode = TImode; - int index; - if (vectype != NULL) - { - fp = FLOAT_TYPE_P (vectype); - mode = TYPE_MODE (vectype); + return NO_REGS; } - switch (type_of_cost) + /* QImode spills from non-QI registers require + intermediate register on 32bit targets. */ + if (mode == QImode + && ((!TARGET_64BIT && !in_p + && INTEGER_CLASS_P (rclass) + && MAYBE_NON_Q_CLASS_P (rclass)) + || (!TARGET_AVX512DQ + && MAYBE_MASK_CLASS_P (rclass)))) { - case scalar_stmt: - return fp ? ix86_cost->addss : COSTS_N_INSNS (1); + int regno = true_regnum (x); - case scalar_load: - /* load/store costs are relative to register move which is 2. Recompute - it to COSTS_N_INSNS so everything have same base. */ - return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0] - : ix86_cost->int_load [2]) / 2; + /* Return Q_REGS if the operand is in memory. */ + if (regno == -1) + return Q_REGS; - case scalar_store: - return COSTS_N_INSNS (fp ? 
ix86_cost->sse_store[0] - : ix86_cost->int_store [2]) / 2; + return NO_REGS; + } - case vector_stmt: - return ix86_vec_cost (mode, - fp ? ix86_cost->addss : ix86_cost->sse_op); + /* This condition handles corner case where an expression involving + pointers gets vectorized. We're trying to use the address of a + stack slot as a vector initializer. - case vector_load: - index = sse_store_index (mode); - /* See PR82713 - we may end up being called on non-vector type. */ - if (index < 0) - index = 2; - return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2; + (set (reg:V2DI 74 [ vect_cst_.2 ]) + (vec_duplicate:V2DI (reg/f:DI 20 frame))) - case vector_store: - index = sse_store_index (mode); - /* See PR82713 - we may end up being called on non-vector type. */ - if (index < 0) - index = 2; - return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2; + Eventually frame gets turned into sp+offset like this: - case vec_to_scalar: - case scalar_to_vec: - return ix86_vec_cost (mode, ix86_cost->sse_op); + (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) + (const_int 392 [0x188])))) - /* We should have separate costs for unaligned loads and gather/scatter. - Do that incrementally. */ - case unaligned_load: - index = sse_store_index (mode); - /* See PR82713 - we may end up being called on non-vector type. */ - if (index < 0) - index = 2; - return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2; + That later gets turned into: - case unaligned_store: - index = sse_store_index (mode); - /* See PR82713 - we may end up being called on non-vector type. */ - if (index < 0) - index = 2; - return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2; + (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) + (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])))) - case vector_gather_load: - return ix86_vec_cost (mode, - COSTS_N_INSNS - (ix86_cost->gather_static - + ix86_cost->gather_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + We'll have the following reload recorded: - case vector_scatter_store: - return ix86_vec_cost (mode, - COSTS_N_INSNS - (ix86_cost->scatter_static - + ix86_cost->scatter_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + Reload 0: reload_in (DI) = + (plus:DI (reg/f:DI 7 sp) + (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])) + reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine + reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188])) + reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + reload_reg_rtx: (reg:V2DI 22 xmm1) - case cond_branch_taken: - return ix86_cost->cond_taken_branch_cost; + Which isn't going to work since SSE instructions can't handle scalar + additions. Returning GENERAL_REGS forces the addition into integer + register and reload can handle subsequent reloads without problems. */ - case cond_branch_not_taken: - return ix86_cost->cond_not_taken_branch_cost; + if (in_p && GET_CODE (x) == PLUS + && SSE_CLASS_P (rclass) + && SCALAR_INT_MODE_P (mode)) + return GENERAL_REGS; - case vec_perm: - case vec_promote_demote: - return ix86_vec_cost (mode, ix86_cost->sse_op); + return NO_REGS; +} - case vec_construct: - { - /* N element inserts into SSE vectors. */ - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; - /* One vinserti128 for combining two SSE vectors for AVX256. 
*/ - if (GET_MODE_BITSIZE (mode) == 256) - cost += ix86_vec_cost (mode, ix86_cost->addss); - /* One vinserti64x4 and two vinserti128 for combining SSE - and AVX256 vectors to AVX512. */ - else if (GET_MODE_BITSIZE (mode) == 512) - cost += 3 * ix86_vec_cost (mode, ix86_cost->addss); - return cost; - } +/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */ + +static bool +ix86_class_likely_spilled_p (reg_class_t rclass) +{ + switch (rclass) + { + case AREG: + case DREG: + case CREG: + case BREG: + case AD_REGS: + case SIREG: + case DIREG: + case SSE_FIRST_REG: + case FP_TOP_REG: + case FP_SECOND_REG: + return true; default: - gcc_unreachable (); + break; } + + return false; } -/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel []))) - insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh - insn every time. */ +/* If we are copying between registers from different register sets + (e.g. FP and integer), we may need a memory location. + + The function can't work reliably when one of the CLASSES is a class + containing registers from multiple sets. We avoid this by never combining + different sets in a single alternative in the machine description. + Ensure that this constraint holds to avoid unexpected surprises. -static GTY(()) rtx_insn *vselect_insn; + When STRICT is false, we are being called from REGISTER_MOVE_COST, + so do not enforce these sanity checks. -/* Initialize vselect_insn. */ + To optimize register_move_cost performance, define inline variant. */ -static void -init_vselect_insn (void) +static inline bool +inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, + reg_class_t class2, int strict) { - unsigned i; - rtx x; + if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS)) + return false; - x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN)); - for (i = 0; i < MAX_VECT_LEN; ++i) - XVECEXP (x, 0, i) = const0_rtx; - x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx, - const0_rtx), x); - x = gen_rtx_SET (const0_rtx, x); - start_sequence (); - vselect_insn = emit_insn (x); - end_sequence (); -} + if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1) + || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2) + || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1) + || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2) + || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1) + || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2) + || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1) + || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2)) + { + gcc_assert (!strict || lra_in_progress); + return true; + } -/* Construct (set target (vec_select op0 (parallel perm))) and - return true if that's a valid instruction in the active ISA. */ + if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2)) + return true; -static bool -expand_vselect (rtx target, rtx op0, const unsigned char *perm, - unsigned nelt, bool testing_p) -{ - unsigned int i; - rtx x, save_vconcat; - int icode; + /* Between mask and general, we have moves no larger than word size. */ + if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2)) + && (GET_MODE_SIZE (mode) > UNITS_PER_WORD)) + return true; - if (vselect_insn == NULL_RTX) - init_vselect_insn (); + /* ??? This is a lie. We do have moves between mmx/general, and for + mmx/sse2. But by saying we need secondary memory we discourage the + register allocator from using the mmx registers unless needed. 
*/ + if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)) + return true; - x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1); - PUT_NUM_ELEM (XVEC (x, 0), nelt); - for (i = 0; i < nelt; ++i) - XVECEXP (x, 0, i) = GEN_INT (perm[i]); - save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); - XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0; - PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target)); - SET_DEST (PATTERN (vselect_insn)) = target; - icode = recog_memoized (vselect_insn); + if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) + { + /* SSE1 doesn't have any direct moves from other classes. */ + if (!TARGET_SSE2) + return true; - if (icode >= 0 && !testing_p) - emit_insn (copy_rtx (PATTERN (vselect_insn))); + /* If the target says that inter-unit moves are more expensive + than moving through memory, then don't generate them. */ + if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) + || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) + return true; - SET_DEST (PATTERN (vselect_insn)) = const0_rtx; - XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat; - INSN_CODE (vselect_insn) = -1; + /* Between SSE and general, we have moves no larger than word size. */ + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + return true; + } - return icode >= 0; + return false; } -/* Similar, but generate a vec_concat from op0 and op1 as well. */ +/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */ static bool -expand_vselect_vconcat (rtx target, rtx op0, rtx op1, - const unsigned char *perm, unsigned nelt, - bool testing_p) +ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1, + reg_class_t class2) { - machine_mode v2mode; - rtx x; - bool ok; - - if (vselect_insn == NULL_RTX) - init_vselect_insn (); + return inline_secondary_memory_needed (mode, class1, class2, true); +} - if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode)) - return false; - x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); - PUT_MODE (x, v2mode); - XEXP (x, 0) = op0; - XEXP (x, 1) = op1; - ok = expand_vselect (target, x, perm, nelt, testing_p); - XEXP (x, 0) = const0_rtx; - XEXP (x, 1) = const0_rtx; - return ok; -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - using movss or movsd. */ -static bool -expand_vec_perm_movs (struct expand_vec_perm_d *d) -{ - machine_mode vmode = d->vmode; - unsigned i, nelt = d->nelt; - rtx x; +/* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE. - if (d->one_operand_p) - return false; + get_secondary_mem widens integral modes to BITS_PER_WORD. + There is no need to emit full 64 bit move on 64 bit targets + for integral modes that can be moved using 32 bit move. */ - if (!(TARGET_SSE && vmode == V4SFmode) - && !(TARGET_SSE2 && vmode == V2DFmode)) - return false; +static machine_mode +ix86_secondary_memory_needed_mode (machine_mode mode) +{ + if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode)) + return mode_for_size (32, GET_MODE_CLASS (mode), 0).require (); + return mode; +} - /* Only the first element is changed. */ - if (d->perm[0] != nelt && d->perm[0] != 0) - return false; - for (i = 1; i < nelt; ++i) - if (d->perm[i] != i + nelt - d->perm[0]) - return false; +/* Implement the TARGET_CLASS_MAX_NREGS hook. - if (d->testing_p) - return true; + On the 80386, this is the size of MODE in words, + except in the FP regs, where a single reg is always enough. 
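/* Illustrative sketch (plain C, not part of the backported patches) of the
   SSE<->general-register decision inline_secondary_memory_needed makes
   above, with the target tuning flags passed in as plain booleans.  */
static int
sse_gpr_move_needs_memory_sketch (int have_sse2, int inter_unit_from_vec_ok,
                                  int inter_unit_to_vec_ok, int sse_is_source,
                                  unsigned mode_size, unsigned units_per_word)
{
  if (!have_sse2)
    return 1;   /* SSE1 has no direct moves to or from other register files.  */
  if (sse_is_source ? !inter_unit_from_vec_ok : !inter_unit_to_vec_ok)
    return 1;   /* Tuning says inter-unit moves cost more than memory.  */
  return mode_size > units_per_word;   /* No inter-unit moves wider than a word.  */
}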
*/ - if (d->perm[0] == nelt) - x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1)); +static unsigned char +ix86_class_max_nregs (reg_class_t rclass, machine_mode mode) +{ + if (MAYBE_INTEGER_CLASS_P (rclass)) + { + if (mode == XFmode) + return (TARGET_64BIT ? 2 : 3); + else if (mode == XCmode) + return (TARGET_64BIT ? 4 : 6); + else + return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); + } else - x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1)); - - emit_insn (gen_rtx_SET (d->target, x)); - - return true; + { + if (COMPLEX_MODE_P (mode)) + return 2; + else + return 1; + } } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */ +/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */ static bool -expand_vec_perm_blend (struct expand_vec_perm_d *d) +ix86_can_change_mode_class (machine_mode from, machine_mode to, + reg_class_t regclass) { - machine_mode mmode, vmode = d->vmode; - unsigned i, nelt = d->nelt; - unsigned HOST_WIDE_INT mask; - rtx target, op0, op1, maskop, x; - rtx rperm[32], vperm; + if (from == to) + return true; - if (d->one_operand_p) - return false; - if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 - && (TARGET_AVX512BW - || GET_MODE_UNIT_SIZE (vmode) >= 4)) - ; - else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) - ; - else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) - ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) - ; - else + /* x87 registers can't do subreg at all, as all values are reformatted + to extended precision. */ + if (MAYBE_FLOAT_CLASS_P (regclass)) return false; - /* This is a blend, not a permute. Elements must stay in their - respective lanes. */ - for (i = 0; i < nelt; ++i) + if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) { - unsigned e = d->perm[i]; - if (!(e == i || e == i + nelt)) + /* Vector registers do not support QI or HImode loads. If we don't + disallow a change to these modes, reload will assume it's ok to + drop the subreg from (subreg:SI (reg:HI 100) 0). This affects + the vec_dupv4hi pattern. */ + if (GET_MODE_SIZE (from) < 4) return false; } - if (d->testing_p) - return true; - - /* ??? Without SSE4.1, we could implement this with and/andn/or. This - decision should be extracted elsewhere, so that we only try that - sequence once all budget==3 options have been tried. */ - target = d->target; - op0 = d->op0; - op1 = d->op1; - mask = 0; - - switch (vmode) - { - case E_V8DFmode: - case E_V16SFmode: - case E_V4DFmode: - case E_V8SFmode: - case E_V2DFmode: - case E_V4SFmode: - case E_V8HImode: - case E_V8SImode: - case E_V32HImode: - case E_V64QImode: - case E_V16SImode: - case E_V8DImode: - for (i = 0; i < nelt; ++i) - mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i; - break; - - case E_V2DImode: - for (i = 0; i < 2; ++i) - mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); - vmode = V8HImode; - goto do_subreg; - - case E_V4SImode: - for (i = 0; i < 4; ++i) - mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); - vmode = V8HImode; - goto do_subreg; + return true; +} - case E_V16QImode: - /* See if bytes move in pairs so we can use pblendw with - an immediate argument, rather than pblendvb with a vector - argument. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - { - use_pblendvb: - for (i = 0; i < nelt; ++i) - rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); +/* Return index of MODE in the sse load/store tables. 
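/* Illustrative sketch (plain C, not part of the backported patches) of the
   test expand_vec_perm_blend starts with above: a permutation is a blend
   only when every element keeps its position and merely picks one of the
   two operands (perm[i] == i or perm[i] == i + nelt); the per-position
   choice becomes one bit of the blend immediate.  */
static int
blend_mask_sketch (const unsigned char *perm, unsigned nelt,
                   unsigned long long *mask_out)
{
  unsigned long long mask = 0;
  unsigned i;

  for (i = 0; i < nelt; ++i)
    {
      if (perm[i] != i && perm[i] != i + nelt)
        return 0;   /* Element changes lanes: not expressible as a blend.  */
      mask |= (unsigned long long) (perm[i] >= nelt) << i;
    }
  *mask_out = mask;
  return 1;
}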
*/ - finish_pblendvb: - vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); - vperm = force_reg (vmode, vperm); +static inline int +sse_store_index (machine_mode mode) +{ + switch (GET_MODE_SIZE (mode)) + { + case 4: + return 0; + case 8: + return 1; + case 16: + return 2; + case 32: + return 3; + case 64: + return 4; + default: + return -1; + } +} - if (GET_MODE_SIZE (vmode) == 16) - emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); - else - emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - return true; - } +/* Return the cost of moving data of mode M between a + register and memory. A value of 2 is the default; this cost is + relative to those in `REGISTER_MOVE_COST'. - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 2] >= 16) << i; - vmode = V8HImode; - /* FALLTHRU */ + This function is used extensively by register_move_cost that is used to + build tables at startup. Make it inline in this case. + When IN is 2, return maximum of in and out move cost. - do_subreg: - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, op0); - op1 = gen_lowpart (vmode, op1); - break; + If moving between registers and memory is more expensive than + between two registers, you should define this macro to express the + relative cost. - case E_V32QImode: - /* See if bytes move in pairs. If not, vpblendvb must be used. */ - for (i = 0; i < 32; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - goto use_pblendvb; - /* See if bytes move in quadruplets. If yes, vpblendd - with immediate can be used. */ - for (i = 0; i < 32; i += 4) - if (d->perm[i] + 2 != d->perm[i + 2]) - break; - if (i < 32) + Model also increased moving costs of QImode registers in non + Q_REGS classes. + */ +static inline int +inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) +{ + int cost; + if (FLOAT_CLASS_P (regclass)) + { + int index; + switch (mode) { - /* See if bytes move the same in both lanes. If yes, - vpblendw with immediate can be used. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 16 != d->perm[i + 16]) - goto use_pblendvb; - - /* Use vpblendw. */ - for (i = 0; i < 16; ++i) - mask |= (d->perm[i * 2] >= 32) << i; - vmode = V16HImode; - goto do_subreg; + case E_SFmode: + index = 0; + break; + case E_DFmode: + index = 1; + break; + case E_XFmode: + index = 2; + break; + default: + return 100; } - - /* Use vpblendd. */ - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 4] >= 32) << i; - vmode = V8SImode; - goto do_subreg; - - case E_V16HImode: - /* See if words move in pairs. If yes, vpblendd can be used. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - break; - if (i < 16) + if (in == 2) + return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]); + return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index]; + } + if (SSE_CLASS_P (regclass)) + { + int index = sse_store_index (mode); + if (index == -1) + return 100; + if (in == 2) + return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); + return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; + } + if (MMX_CLASS_P (regclass)) + { + int index; + switch (GET_MODE_SIZE (mode)) { - /* See if words move the same in both lanes. If not, - vpblendvb must be used. */ - for (i = 0; i < 8; i++) - if (d->perm[i] + 8 != d->perm[i + 8]) - { - /* Use vpblendvb. */ - for (i = 0; i < 32; ++i) - rperm[i] = (d->perm[i / 2] < 16 ? 
const0_rtx : constm1_rtx); - - vmode = V32QImode; - nelt = 32; - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, op0); - op1 = gen_lowpart (vmode, op1); - goto finish_pblendvb; - } - - /* Use vpblendw. */ - for (i = 0; i < 16; ++i) - mask |= (d->perm[i] >= 16) << i; - break; + case 4: + index = 0; + break; + case 8: + index = 1; + break; + default: + return 100; } - - /* Use vpblendd. */ - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 2] >= 16) << i; - vmode = V8SImode; - goto do_subreg; - - case E_V4DImode: - /* Use vpblendd. */ - for (i = 0; i < 4; ++i) - mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); - vmode = V8SImode; - goto do_subreg; - - default: - gcc_unreachable (); + if (in == 2) + return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]); + return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index]; } - - switch (vmode) + switch (GET_MODE_SIZE (mode)) { - case E_V8DFmode: - case E_V8DImode: - mmode = QImode; - break; - case E_V16SFmode: - case E_V16SImode: - mmode = HImode; - break; - case E_V32HImode: - mmode = SImode; - break; - case E_V64QImode: - mmode = DImode; - break; - default: - mmode = VOIDmode; + case 1: + if (Q_CLASS_P (regclass) || TARGET_64BIT) + { + if (!in) + return ix86_cost->int_store[0]; + if (TARGET_PARTIAL_REG_DEPENDENCY + && optimize_function_for_speed_p (cfun)) + cost = ix86_cost->movzbl_load; + else + cost = ix86_cost->int_load[0]; + if (in == 2) + return MAX (cost, ix86_cost->int_store[0]); + return cost; + } + else + { + if (in == 2) + return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4); + if (in) + return ix86_cost->movzbl_load; + else + return ix86_cost->int_store[0] + 4; + } + break; + case 2: + if (in == 2) + return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]); + return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1]; + default: + if (in == 2) + cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]); + else if (in) + cost = ix86_cost->int_load[2]; + else + cost = ix86_cost->int_store[2]; + /* Multiply with the number of GPR moves needed. */ + return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD); } +} - if (mmode != VOIDmode) - maskop = force_reg (mmode, gen_int_mode (mask, mmode)); - else - maskop = GEN_INT (mask); - - /* This matches five different patterns with the different modes. */ - x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop); - x = gen_rtx_SET (target, x); - emit_insn (x); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - - return true; +static int +ix86_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in) +{ + return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0); } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of the variable form of vpermilps. - Note that we will have already failed the immediate input vpermilps, - which requires that the high and low part shuffle be identical; the - variable form doesn't require that. */ +/* Return the cost of moving data from a register in class CLASS1 to + one in class CLASS2. -static bool -expand_vec_perm_vpermil (struct expand_vec_perm_d *d) + It is not required that the cost always equal 2 when FROM is the same as TO; + on some machines it is expensive to move between registers if they are not + general registers. 
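/* Illustrative sketch (plain C, not part of the backported patches) of the
   IN convention inline_memory_move_cost uses above: IN == 0 asks for the
   store cost, IN == 1 for the load cost, IN == 2 for the worse of the two,
   and integer moves wider than a word are charged once per word-sized
   piece.  The cost arguments stand in for ix86_cost table entries.  */
static int
int_mem_move_cost_sketch (unsigned mode_size, unsigned units_per_word,
                          int load_cost, int store_cost, int in)
{
  int cost;

  if (in == 2)
    cost = load_cost > store_cost ? load_cost : store_cost;
  else
    cost = in ? load_cost : store_cost;

  /* One general-register load/store per word-sized piece.  */
  return cost * ((mode_size + units_per_word - 1) / units_per_word);
}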
*/ + +static int +ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, + reg_class_t class2_i) { - rtx rperm[8], vperm; - unsigned i; + enum reg_class class1 = (enum reg_class) class1_i; + enum reg_class class2 = (enum reg_class) class2_i; - if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p) - return false; + /* In case we require secondary memory, compute cost of the store followed + by load. In order to avoid bad register allocation choices, we need + for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */ - /* We can only permute within the 128-bit lane. */ - for (i = 0; i < 8; ++i) + if (inline_secondary_memory_needed (mode, class1, class2, false)) { - unsigned e = d->perm[i]; - if (i < 4 ? e >= 4 : e < 4) - return false; - } + int cost = 1; - if (d->testing_p) - return true; + cost += inline_memory_move_cost (mode, class1, 2); + cost += inline_memory_move_cost (mode, class2, 2); - for (i = 0; i < 8; ++i) - { - unsigned e = d->perm[i]; + /* In case of copying from general_purpose_register we may emit multiple + stores followed by single load causing memory size mismatch stall. + Count this as arbitrarily high cost of 20. */ + if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD + && TARGET_MEMORY_MISMATCH_STALL + && targetm.class_max_nregs (class1, mode) + > targetm.class_max_nregs (class2, mode)) + cost += 20; - /* Within each 128-bit lane, the elements of op0 are numbered - from 0 and the elements of op1 are numbered from 4. */ - if (e >= 8 + 4) - e -= 8; - else if (e >= 4) - e -= 4; + /* In the case of FP/MMX moves, the registers actually overlap, and we + have to switch modes in order to treat them differently. */ + if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2)) + || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1))) + cost += 20; - rperm[i] = GEN_INT (e); + return cost; } - vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); - vperm = force_reg (V8SImode, vperm); - emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); - - return true; -} - -/* Return true if permutation D can be performed as VMODE permutation - instead. */ + /* Moves between SSE/MMX and integer unit are expensive. */ + if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2) + || SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) -static bool -valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d) -{ - unsigned int i, j, chunk; + /* ??? By keeping returned value relatively high, we limit the number + of moves between integer and MMX/SSE registers for all targets. + Additionally, high value prevents problem with x86_modes_tieable_p(), + where integer modes in MMX/SSE registers are not tieable + because of missing QImode and HImode moves to, from or between + MMX/SSE registers. */ + return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2) + ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer); - if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT - || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT - || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode)) - return false; + if (MAYBE_FLOAT_CLASS_P (class1)) + return ix86_cost->fp_move; + if (MAYBE_SSE_CLASS_P (class1)) + { + if (GET_MODE_BITSIZE (mode) <= 128) + return ix86_cost->xmm_move; + if (GET_MODE_BITSIZE (mode) <= 256) + return ix86_cost->ymm_move; + return ix86_cost->zmm_move; + } + if (MAYBE_MMX_CLASS_P (class1)) + return ix86_cost->mmx_move; + return 2; +} - if (GET_MODE_NUNITS (vmode) >= d->nelt) - return true; +/* Implement TARGET_HARD_REGNO_NREGS. 
This is ordinarily the length in + words of a value of mode MODE but can be less for certain modes in + special long registers. - chunk = d->nelt / GET_MODE_NUNITS (vmode); - for (i = 0; i < d->nelt; i += chunk) - if (d->perm[i] & (chunk - 1)) - return false; - else - for (j = 1; j < chunk; ++j) - if (d->perm[i] + j != d->perm[i + j]) - return false; + Actually there are no two word move instructions for consecutive + registers. And only registers 0-3 may have mov byte instructions + applied to them. */ - return true; +static unsigned int +ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) +{ + if (GENERAL_REGNO_P (regno)) + { + if (mode == XFmode) + return TARGET_64BIT ? 2 : 3; + if (mode == XCmode) + return TARGET_64BIT ? 4 : 6; + return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); + } + if (COMPLEX_MODE_P (mode)) + return 2; + if (mode == V64SFmode || mode == V64SImode) + return 4; + return 1; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */ +/* Implement TARGET_HARD_REGNO_MODE_OK. */ static bool -expand_vec_perm_pshufb (struct expand_vec_perm_d *d) +ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) { - unsigned i, nelt, eltsz, mask; - unsigned char perm[64]; - machine_mode vmode = V16QImode; - rtx rperm[64], vperm, target, op0, op1; - - nelt = d->nelt; - - if (!d->one_operand_p) - { - if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) - { - if (TARGET_AVX2 - && valid_perm_using_mode_p (V2TImode, d)) - { - if (d->testing_p) - return true; - - /* Use vperm2i128 insn. The pattern uses - V4DImode instead of V2TImode. */ - target = d->target; - if (d->vmode != V4DImode) - target = gen_reg_rtx (V4DImode); - op0 = gen_lowpart (V4DImode, d->op0); - op1 = gen_lowpart (V4DImode, d->op1); - rperm[0] - = GEN_INT ((d->perm[0] / (nelt / 2)) - | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); - emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - return true; - } - return false; - } - } - else + /* Flags and only flags can only hold CCmode values. */ + if (CC_REGNO_P (regno)) + return GET_MODE_CLASS (mode) == MODE_CC; + if (GET_MODE_CLASS (mode) == MODE_CC + || GET_MODE_CLASS (mode) == MODE_RANDOM + || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) + return false; + if (STACK_REGNO_P (regno)) + return VALID_FP_MODE_P (mode); + if (MASK_REGNO_P (regno)) + return (VALID_MASK_REG_MODE (mode) + || (TARGET_AVX512BW + && VALID_MASK_AVX512BW_MODE (mode))); + if (SSE_REGNO_P (regno)) { - if (GET_MODE_SIZE (d->vmode) == 16) - { - if (!TARGET_SSSE3) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX2) - return false; - - /* V4DImode should be already handled through - expand_vselect by vpermq instruction. */ - gcc_assert (d->vmode != V4DImode); - - vmode = V32QImode; - if (d->vmode == V8SImode - || d->vmode == V16HImode - || d->vmode == V32QImode) - { - /* First see if vpermq can be used for - V8SImode/V16HImode/V32QImode. */ - if (valid_perm_using_mode_p (V4DImode, d)) - { - for (i = 0; i < 4; i++) - perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; - if (d->testing_p) - return true; - target = gen_reg_rtx (V4DImode); - if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), - perm, 4, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } - - /* Next see if vpermd can be used. 
*/ - if (valid_perm_using_mode_p (V8SImode, d)) - vmode = V8SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V8SFmode) - vmode = V8SImode; + /* We implement the move patterns for all vector modes into and + out of SSE registers, even when no operation instructions + are available. */ - if (vmode == V32QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (nelt / 2)) - return false; - } - } - else if (GET_MODE_SIZE (d->vmode) == 64) - { - if (!TARGET_AVX512BW) - return false; + /* For AVX-512 we allow, regardless of regno: + - XI mode + - any of 512-bit wide vector mode + - any scalar mode. */ + if (TARGET_AVX512F + && (mode == XImode + || VALID_AVX512F_REG_MODE (mode) + || VALID_AVX512F_SCALAR_MODE (mode))) + return true; - /* If vpermq didn't work, vpshufb won't work either. */ - if (d->vmode == V8DFmode || d->vmode == V8DImode) - return false; + /* For AVX-5124FMAPS or AVX-5124VNNIW + allow V64SF and V64SI modes for special regnos. */ + if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW) + && (mode == V64SFmode || mode == V64SImode) + && MOD4_SSE_REGNO_P (regno)) + return true; - vmode = V64QImode; - if (d->vmode == V16SImode - || d->vmode == V32HImode - || d->vmode == V64QImode) - { - /* First see if vpermq can be used for - V16SImode/V32HImode/V64QImode. */ - if (valid_perm_using_mode_p (V8DImode, d)) - { - for (i = 0; i < 8; i++) - perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; - if (d->testing_p) - return true; - target = gen_reg_rtx (V8DImode); - if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), - perm, 8, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } + /* TODO check for QI/HI scalars. */ + /* AVX512VL allows sse regs16+ for 128/256 bit modes. */ + if (TARGET_AVX512VL + && (mode == OImode + || mode == TImode + || VALID_AVX256_REG_MODE (mode) + || VALID_AVX512VL_128_REG_MODE (mode))) + return true; - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V16SImode, d)) - vmode = V16SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V16SFmode) - vmode = V16SImode; - if (vmode == V64QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (nelt / 4)) - return false; - } - } - else + /* xmm16-xmm31 are only available for AVX-512. */ + if (EXT_REX_SSE_REGNO_P (regno)) return false; - } - - if (d->testing_p) - return true; - if (vmode == V8SImode) - for (i = 0; i < 8; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); - else if (vmode == V16SImode) - for (i = 0; i < 16; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); - else + /* OImode and AVX modes are available only when AVX is enabled. 
*/ + return ((TARGET_AVX + && VALID_AVX256_REG_OR_OI_MODE (mode)) + || VALID_SSE_REG_MODE (mode) + || VALID_SSE2_REG_MODE (mode) + || VALID_MMX_REG_MODE (mode) + || VALID_MMX_REG_MODE_3DNOW (mode)); + } + if (MMX_REGNO_P (regno)) { - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - if (!d->one_operand_p) - mask = 2 * nelt - 1; - else if (vmode == V16QImode) - mask = nelt - 1; - else if (vmode == V64QImode) - mask = nelt / 4 - 1; - else - mask = nelt / 2 - 1; - - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i] & mask; - for (j = 0; j < eltsz; ++j) - rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); - } - } - - vperm = gen_rtx_CONST_VECTOR (vmode, - gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); - vperm = force_reg (vmode, vperm); - - target = d->target; - if (d->vmode != vmode) - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, d->op0); - if (d->one_operand_p) - { - if (vmode == V16QImode) - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); - else if (vmode == V32QImode) - emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); - else if (vmode == V64QImode) - emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); - else if (vmode == V8SFmode) - emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); - else if (vmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); - else if (vmode == V16SFmode) - emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); - else if (vmode == V16SImode) - emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); - else - gcc_unreachable (); + /* We implement the move patterns for 3DNOW modes even in MMX mode, + so if the register is available at all, then we can move data of + the given mode into or out of it. */ + return (VALID_MMX_REG_MODE (mode) + || VALID_MMX_REG_MODE_3DNOW (mode)); } - else + + if (mode == QImode) { - op1 = gen_lowpart (vmode, d->op1); - emit_insn (gen_xop_pperm (target, op0, op1, vperm)); + /* Take care for QImode values - they can be in non-QI regs, + but then they do cause partial register stalls. */ + if (ANY_QI_REGNO_P (regno)) + return true; + if (!TARGET_PARTIAL_REG_STALL) + return true; + /* LRA checks if the hard register is OK for the given mode. + QImode values can live in non-QI regs, so we allow all + registers here. */ + if (lra_in_progress) + return true; + return !can_create_pseudo_p (); } - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + /* We handle both integer and floats in the general purpose registers. */ + else if (VALID_INT_MODE_P (mode)) + return true; + else if (VALID_FP_MODE_P (mode)) + return true; + else if (VALID_DFP_MODE_P (mode)) + return true; + /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go + on to use that value in smaller contexts, this can easily force a + pseudo to be allocated to GENERAL_REGS. Since this is no worse than + supporting DImode, allow it. */ + else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode)) + return true; - return true; + return false; } -/* For V*[QHS]Imode permutations, check if the same permutation - can't be performed in a 2x, 4x or 8x wider inner mode. */ +/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that + saves SSE registers across calls is Win64 (thus no need to check the + current ABI here), and with AVX enabled Win64 only guarantees that + the low 16 bytes are saved. 
*/ static bool -canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, - struct expand_vec_perm_d *nd) +ix86_hard_regno_call_part_clobbered (unsigned int, unsigned int regno, + machine_mode mode) { - int i; - machine_mode mode = VOIDmode; - - switch (d->vmode) - { - case E_V16QImode: mode = V8HImode; break; - case E_V32QImode: mode = V16HImode; break; - case E_V64QImode: mode = V32HImode; break; - case E_V8HImode: mode = V4SImode; break; - case E_V16HImode: mode = V8SImode; break; - case E_V32HImode: mode = V16SImode; break; - case E_V4SImode: mode = V2DImode; break; - case E_V8SImode: mode = V4DImode; break; - case E_V16SImode: mode = V8DImode; break; - default: return false; - } - for (i = 0; i < d->nelt; i += 2) - if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) - return false; - nd->vmode = mode; - nd->nelt = d->nelt / 2; - for (i = 0; i < nd->nelt; i++) - nd->perm[i] = d->perm[2 * i] / 2; - if (GET_MODE_INNER (mode) != DImode) - canonicalize_vector_int_perm (nd, nd); - if (nd != d) - { - nd->one_operand_p = d->one_operand_p; - nd->testing_p = d->testing_p; - if (d->op0 == d->op1) - nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); - else - { - nd->op0 = gen_lowpart (nd->vmode, d->op0); - nd->op1 = gen_lowpart (nd->vmode, d->op1); - } - if (d->testing_p) - nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); - else - nd->target = gen_reg_rtx (nd->vmode); - } - return true; + return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16; } -/* Try to expand one-operand permutation with constant mask. */ +/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a + tieable integer mode. */ static bool -ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) +ix86_tieable_integer_mode_p (machine_mode mode) { - machine_mode mode = GET_MODE (d->op0); - machine_mode maskmode = mode; - rtx (*gen) (rtx, rtx, rtx) = NULL; - rtx target, op0, mask; - rtx vec[64]; + switch (mode) + { + case E_HImode: + case E_SImode: + return true; - if (!rtx_equal_p (d->op0, d->op1)) - return false; + case E_QImode: + return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL; - if (!TARGET_AVX512F) - return false; + case E_DImode: + return TARGET_64BIT; - switch (mode) - { - case E_V16SImode: - gen = gen_avx512f_permvarv16si; - break; - case E_V16SFmode: - gen = gen_avx512f_permvarv16sf; - maskmode = V16SImode; - break; - case E_V8DImode: - gen = gen_avx512f_permvarv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_permvarv8df; - maskmode = V8DImode; - break; default: return false; } - - target = d->target; - op0 = d->op0; - for (int i = 0; i < d->nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); - emit_insn (gen (target, op0, force_reg (maskmode, mask))); - return true; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D - in a single instruction. */ +/* Implement TARGET_MODES_TIEABLE_P. + + Return true if MODE1 is accessible in a register that can hold MODE2 + without copying. That is, all register classes that can hold MODE2 + can also hold MODE1. */ static bool -expand_vec_perm_1 (struct expand_vec_perm_d *d) +ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) { - unsigned i, nelt = d->nelt; - struct expand_vec_perm_d nd; - - /* Check plain VEC_SELECT first, because AVX has instructions that could - match both SEL and SEL+CONCAT, but the plain SEL will allow a memory - input where SEL+CONCAT may not. 
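/* Illustrative sketch (plain C, not part of the backported patches) of the
   pair test canonicalize_vector_int_perm applies above: a permutation of
   2*N narrow elements can be redone on N double-width elements when the
   elements move in aligned pairs, i.e. every even slot selects an even
   source index and the next slot selects the index right after it.  */
static int
widen_perm_sketch (const unsigned char *perm, unsigned nelt,
                   unsigned char *wide_perm)
{
  unsigned i;

  for (i = 0; i < nelt; i += 2)
    if ((perm[i] & 1) || perm[i + 1] != perm[i] + 1)
      return 0;   /* A pair is split: the permutation cannot be widened.  */
  for (i = 0; i < nelt / 2; ++i)
    wide_perm[i] = perm[2 * i] / 2;
  return 1;
}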
*/ - if (d->one_operand_p) - { - int mask = nelt - 1; - bool identity_perm = true; - bool broadcast_perm = true; - - for (i = 0; i < nelt; i++) - { - nd.perm[i] = d->perm[i] & mask; - if (nd.perm[i] != i) - identity_perm = false; - if (nd.perm[i]) - broadcast_perm = false; - } + if (mode1 == mode2) + return true; - if (identity_perm) - { - if (!d->testing_p) - emit_move_insn (d->target, d->op0); - return true; - } - else if (broadcast_perm && TARGET_AVX2) - { - /* Use vpbroadcast{b,w,d}. */ - rtx (*gen) (rtx, rtx) = NULL; - switch (d->vmode) - { - case E_V64QImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vec_dupv64qi_1; - break; - case E_V32QImode: - gen = gen_avx2_pbroadcastv32qi_1; - break; - case E_V32HImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vec_dupv32hi_1; - break; - case E_V16HImode: - gen = gen_avx2_pbroadcastv16hi_1; - break; - case E_V16SImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv16si_1; - break; - case E_V8SImode: - gen = gen_avx2_pbroadcastv8si_1; - break; - case E_V16QImode: - gen = gen_avx2_pbroadcastv16qi; - break; - case E_V8HImode: - gen = gen_avx2_pbroadcastv8hi; - break; - case E_V16SFmode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv16sf_1; - break; - case E_V8SFmode: - gen = gen_avx2_vec_dupv8sf_1; - break; - case E_V8DFmode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv8df_1; - break; - case E_V8DImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv8di_1; - break; - /* For other modes prefer other shuffles this function creates. */ - default: break; - } - if (gen != NULL) - { - if (!d->testing_p) - emit_insn (gen (d->target, d->op0)); - return true; - } - } + if (ix86_tieable_integer_mode_p (mode1) + && ix86_tieable_integer_mode_p (mode2)) + return true; - if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) - return true; + /* MODE2 being XFmode implies fp stack or general regs, which means we + can tie any smaller floating point modes to it. Note that we do not + tie this with TFmode. */ + if (mode2 == XFmode) + return mode1 == SFmode || mode1 == DFmode; - /* There are plenty of patterns in sse.md that are written for - SEL+CONCAT and are not replicated for a single op. Perhaps - that should be changed, to avoid the nastiness here. */ + /* MODE2 being DFmode implies fp stack, general or sse regs, which means + that we can tie it with SFmode. */ + if (mode2 == DFmode) + return mode1 == SFmode; - /* Recognize interleave style patterns, which means incrementing - every other permutation operand. */ - for (i = 0; i < nelt; i += 2) - { - nd.perm[i] = d->perm[i] & mask; - nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; - } - if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, - d->testing_p)) - return true; + /* If MODE2 is only appropriate for an SSE register, then tie with + any other mode acceptable to SSE registers. */ + if (GET_MODE_SIZE (mode2) == 64 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 64 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); + if (GET_MODE_SIZE (mode2) == 32 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 32 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); + if (GET_MODE_SIZE (mode2) == 16 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 16 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - /* Recognize shufps, which means adding {0, 0, nelt, nelt}. 
*/ - if (nelt >= 4) - { - for (i = 0; i < nelt; i += 4) - { - nd.perm[i + 0] = d->perm[i + 0] & mask; - nd.perm[i + 1] = d->perm[i + 1] & mask; - nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; - nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; - } + /* If MODE2 is appropriate for an MMX register, then tie + with any other mode acceptable to MMX registers. */ + if (GET_MODE_SIZE (mode2) == 8 + && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 8 + && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); - if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, - d->testing_p)) - return true; - } - } + return false; +} - /* Try movss/movsd instructions. */ - if (expand_vec_perm_movs (d)) - return true; +/* Return the cost of moving between two registers of mode MODE. */ - /* Finally, try the fully general two operand permute. */ - if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt, - d->testing_p)) - return true; +static int +ix86_set_reg_reg_cost (machine_mode mode) +{ + unsigned int units = UNITS_PER_WORD; - /* Recognize interleave style patterns with reversed operands. */ - if (!d->one_operand_p) + switch (GET_MODE_CLASS (mode)) { - for (i = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - if (e >= nelt) - e -= nelt; - else - e += nelt; - nd.perm[i] = e; - } + default: + break; - if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, - d->testing_p)) - return true; - } + case MODE_CC: + units = GET_MODE_SIZE (CCmode); + break; - /* Try the SSE4.1 blend variable merge instructions. */ - if (expand_vec_perm_blend (d)) - return true; + case MODE_FLOAT: + if ((TARGET_SSE && mode == TFmode) + || (TARGET_80387 && mode == XFmode) + || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode) + || ((TARGET_80387 || TARGET_SSE) && mode == SFmode)) + units = GET_MODE_SIZE (mode); + break; - /* Try one of the AVX vpermil variable permutations. */ - if (expand_vec_perm_vpermil (d)) - return true; + case MODE_COMPLEX_FLOAT: + if ((TARGET_SSE && mode == TCmode) + || (TARGET_80387 && mode == XCmode) + || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode) + || ((TARGET_80387 || TARGET_SSE) && mode == SCmode)) + units = GET_MODE_SIZE (mode); + break; - /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, - vpshufb, vpermd, vpermps or vpermq variable permutation. */ - if (expand_vec_perm_pshufb (d)) - return true; + case MODE_VECTOR_INT: + case MODE_VECTOR_FLOAT: + if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) + || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) + || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) + || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) + || (TARGET_MMX && VALID_MMX_REG_MODE (mode))) + units = GET_MODE_SIZE (mode); + } - /* Try the AVX2 vpalignr instruction. */ - if (expand_vec_perm_palignr (d, true)) - return true; + /* Return the cost of moving between two registers of mode MODE, + assuming that the move will be in pieces of at most UNITS bytes. */ + return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units)); +} - /* Try the AVX512F vperm{s,d} instructions. */ - if (ix86_expand_vec_one_operand_perm_avx512 (d)) - return true; +/* Return cost of vector operation in MODE given that scalar version has + COST. */ - /* Try the AVX512F vpermt2/vpermi2 instructions. 
*/ - if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) - return true; +static int +ix86_vec_cost (machine_mode mode, int cost) +{ + if (!VECTOR_MODE_P (mode)) + return cost; - /* See if we can get the same permutation in different vector integer - mode. */ - if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) - { - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); - return true; - } - return false; + if (GET_MODE_BITSIZE (mode) == 128 + && TARGET_SSE_SPLIT_REGS) + return cost * 2; + if (GET_MODE_BITSIZE (mode) > 128 + && TARGET_AVX128_OPTIMAL) + return cost * GET_MODE_BITSIZE (mode) / 128; + return cost; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of a pair of pshuflw + pshufhw instructions. */ +/* Return cost of multiplication in MODE. */ -static bool -expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) +static int +ix86_multiplication_cost (const struct processor_costs *cost, + enum machine_mode mode) { - unsigned char perm2[MAX_VECT_LEN]; - unsigned i; - bool ok; - - if (d->vmode != V8HImode || !d->one_operand_p) - return false; + machine_mode inner_mode = mode; + if (VECTOR_MODE_P (mode)) + inner_mode = GET_MODE_INNER (mode); - /* The two permutations only operate in 64-bit lanes. */ - for (i = 0; i < 4; ++i) - if (d->perm[i] >= 4) - return false; - for (i = 4; i < 8; ++i) - if (d->perm[i] < 4) - return false; + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return inner_mode == DFmode ? cost->mulsd : cost->mulss; + else if (X87_FLOAT_MODE_P (mode)) + return cost->fmul; + else if (FLOAT_MODE_P (mode)) + return ix86_vec_cost (mode, + inner_mode == DFmode ? cost->mulsd : cost->mulss); + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + /* vpmullq is used in this case. No emulation is needed. */ + if (TARGET_AVX512DQ) + return ix86_vec_cost (mode, cost->mulss); - if (d->testing_p) - return true; + /* V*QImode is emulated with 7-13 insns. */ + if (mode == V16QImode || mode == V32QImode) + { + int extra = 11; + if (TARGET_XOP && mode == V16QImode) + extra = 5; + else if (TARGET_SSSE3) + extra = 6; + return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra); + } + /* V*DImode is emulated with 5-8 insns. */ + else if (mode == V2DImode || mode == V4DImode) + { + if (TARGET_XOP && mode == V2DImode) + return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3); + else + return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5); + } + /* Without sse4.1, we don't have PMULLD; it's emulated with 7 + insns, including two PMULUDQ. */ + else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) + return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5); + else + return ix86_vec_cost (mode, cost->mulss); + } + else + return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7); +} - /* Emit the pshuflw. */ - memcpy (perm2, d->perm, 4); - for (i = 4; i < 8; ++i) - perm2[i] = i; - ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p); - gcc_assert (ok); +/* Return cost of multiplication in MODE. */ - /* Emit the pshufhw. 
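/* Illustrative sketch (plain C, not part of the backported patches) of the
   scaling ix86_vec_cost applies above: on targets that internally split
   128-bit operations, a 128-bit vector op costs twice the scalar cost, and
   on targets tuned to prefer 128-bit AVX, wider ops are charged once per
   128-bit chunk.  */
static int
vec_cost_sketch (int scalar_cost, int mode_bits,
                 int sse_split_regs, int avx128_optimal)
{
  if (mode_bits == 128 && sse_split_regs)
    return scalar_cost * 2;
  if (mode_bits > 128 && avx128_optimal)
    return scalar_cost * mode_bits / 128;
  return scalar_cost;
}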
*/ - memcpy (perm2 + 4, d->perm + 4, 4); - for (i = 0; i < 4; ++i) - perm2[i] = i; - ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p); - gcc_assert (ok); +static int +ix86_division_cost (const struct processor_costs *cost, + enum machine_mode mode) +{ + machine_mode inner_mode = mode; + if (VECTOR_MODE_P (mode)) + inner_mode = GET_MODE_INNER (mode); - return true; + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return inner_mode == DFmode ? cost->divsd : cost->divss; + else if (X87_FLOAT_MODE_P (mode)) + return cost->fdiv; + else if (FLOAT_MODE_P (mode)) + return ix86_vec_cost (mode, + inner_mode == DFmode ? cost->divsd : cost->divss); + else + return cost->divide[MODE_INDEX (mode)]; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - the permutation using the SSSE3 palignr instruction. This succeeds - when all of the elements in PERM fit within one vector and we merely - need to shift them down so that a single vector permutation has a - chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only - the vpalignr instruction itself can perform the requested permutation. */ - -static bool -expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p) -{ - unsigned i, nelt = d->nelt; - unsigned min, max, minswap, maxswap; - bool in_order, ok, swap = false; - rtx shift, target; - struct expand_vec_perm_d dcopy; - - /* Even with AVX, palignr only operates on 128-bit vectors, - in AVX2 palignr operates on both 128-bit lanes. */ - if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) - && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) - return false; - - min = 2 * nelt; - max = 0; - minswap = 2 * nelt; - maxswap = 0; - for (i = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - unsigned eswap = d->perm[i] ^ nelt; - if (GET_MODE_SIZE (d->vmode) == 32) - { - e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1); - eswap = e ^ (nelt / 2); - } - if (e < min) - min = e; - if (e > max) - max = e; - if (eswap < minswap) - minswap = eswap; - if (eswap > maxswap) - maxswap = eswap; - } - if (min == 0 - || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt)) - { - if (d->one_operand_p - || minswap == 0 - || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32 - ? nelt / 2 : nelt)) - return false; - swap = true; - min = minswap; - max = maxswap; - } +#define COSTS_N_BYTES(N) ((N) * 2) - /* Given that we have SSSE3, we know we'll be able to implement the - single operand permutation after the palignr with pshufb for - 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed - first. */ - if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p) - return true; +/* Return cost of shift in MODE. + If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL. + AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE + if op1 is a result of subreg. - dcopy = *d; - if (swap) - { - dcopy.op0 = d->op1; - dcopy.op1 = d->op0; - for (i = 0; i < nelt; ++i) - dcopy.perm[i] ^= nelt; - } + SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. 
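/* Illustrative sketch (plain C, not part of the backported patches) of the
   window test expand_vec_perm_palignr relies on above, 128-bit case only:
   after rotating the concatenated {op1,op0} pair down by MIN elements,
   every selected element must land inside one vector, so the spread
   max - min has to stay below nelt, and min must be non-zero for the
   rotation to buy anything.  The 32-byte AVX2 path adds the per-lane
   remapping and operand-swap handling that this sketch leaves out.  */
static int
palignr_window_sketch (const unsigned char *perm, unsigned nelt,
                       unsigned *shift_elts)
{
  unsigned i, min = 2 * nelt, max = 0;

  for (i = 0; i < nelt; ++i)
    {
      if (perm[i] < min)
        min = perm[i];
      if (perm[i] > max)
        max = perm[i];
    }
  if (min == 0 || max - min >= nelt)
    return 0;
  *shift_elts = min;   /* palignr byte count would be min * element size.  */
  return 1;
}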
*/ - in_order = true; - for (i = 0; i < nelt; ++i) +static int +ix86_shift_rotate_cost (const struct processor_costs *cost, + enum machine_mode mode, bool constant_op1, + HOST_WIDE_INT op1_val, + bool speed, + bool and_in_op1, + bool shift_and_truncate, + bool *skip_op0, bool *skip_op1) +{ + if (skip_op0) + *skip_op0 = *skip_op1 = false; + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) { - unsigned e = dcopy.perm[i]; - if (GET_MODE_SIZE (d->vmode) == 32 - && e >= nelt - && (e & (nelt / 2 - 1)) < min) - e = e - min - (nelt / 2); + /* V*QImode is emulated with 1-11 insns. */ + if (mode == V16QImode || mode == V32QImode) + { + int count = 11; + if (TARGET_XOP && mode == V16QImode) + { + /* For XOP we use vpshab, which requires a broadcast of the + value to the variable shift insn. For constants this + means a V16Q const in mem; even when we can perform the + shift with one insn set the cost to prefer paddb. */ + if (constant_op1) + { + if (skip_op1) + *skip_op1 = true; + return ix86_vec_cost (mode, + cost->sse_op + + (speed + ? 2 + : COSTS_N_BYTES + (GET_MODE_UNIT_SIZE (mode)))); + } + count = 3; + } + else if (TARGET_SSSE3) + count = 7; + return ix86_vec_cost (mode, cost->sse_op * count); + } else - e = e - min; - if (e != i) - in_order = false; - dcopy.perm[i] = e; - } - dcopy.one_operand_p = true; - - if (single_insn_only_p && !in_order) - return false; - - /* For AVX2, test whether we can permute the result in one instruction. */ - if (d->testing_p) - { - if (in_order) - return true; - dcopy.op1 = dcopy.op0; - return expand_vec_perm_1 (&dcopy); + return ix86_vec_cost (mode, cost->sse_op); } - - shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode)); - if (GET_MODE_SIZE (d->vmode) == 16) + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) { - target = gen_reg_rtx (TImode); - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1), - gen_lowpart (TImode, dcopy.op0), shift)); + if (constant_op1) + { + if (op1_val > 32) + return cost->shift_const + COSTS_N_INSNS (2); + else + return cost->shift_const * 2; + } + else + { + if (and_in_op1) + return cost->shift_var * 2; + else + return cost->shift_var * 6 + COSTS_N_INSNS (2); + } } else { - target = gen_reg_rtx (V2TImode); - emit_insn (gen_avx2_palignrv2ti (target, - gen_lowpart (V2TImode, dcopy.op1), - gen_lowpart (V2TImode, dcopy.op0), - shift)); - } - - dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); - - /* Test for the degenerate case where the alignment by itself - produces the desired permutation. */ - if (in_order) - { - emit_move_insn (d->target, dcopy.op0); - return true; + if (constant_op1) + return cost->shift_const; + else if (shift_and_truncate) + { + if (skip_op0) + *skip_op0 = *skip_op1 = true; + /* Return the cost after shift-and truncation. */ + return cost->shift_var; + } + else + return cost->shift_var; } - - ok = expand_vec_perm_1 (&dcopy); - gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32); - - return ok; + return cost->shift_const; } -/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify - the permutation using the SSE4_1 pblendv instruction. Potentially - reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */ +/* Compute a (partial) cost for rtx X. Return true if the complete + cost has been computed, and false if subexpressions should be + scanned. In either case, *TOTAL contains the cost result. 
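+   This implements the TARGET_RTX_COSTS hook; when SPEED is false the size-oriented ix86_size_cost table is used instead of the active tuning's ix86_cost table.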
*/ static bool -expand_vec_perm_pblendv (struct expand_vec_perm_d *d) +ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, + int *total, bool speed) { - unsigned i, which, nelt = d->nelt; - struct expand_vec_perm_d dcopy, dcopy1; - machine_mode vmode = d->vmode; - bool ok; - - /* Use the same checks as in expand_vec_perm_blend. */ - if (d->one_operand_p) - return false; - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) - ; - else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) - ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) - ; - else - return false; - - /* Figure out where permutation elements stay not in their - respective lanes. */ - for (i = 0, which = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - if (e != i) - which |= (e < nelt ? 1 : 2); - } - /* We can pblend the part where elements stay not in their - respective lanes only when these elements are all in one - half of a permutation. - {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective - lanes, but both 8 and 9 >= 8 - {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their - respective lanes and 8 >= 8, but 2 not. */ - if (which != 1 && which != 2) - return false; - if (d->testing_p && GET_MODE_SIZE (vmode) == 16) - return true; - - /* First we apply one operand permutation to the part where - elements stay not in their respective lanes. */ - dcopy = *d; - if (which == 2) - dcopy.op0 = dcopy.op1 = d->op1; - else - dcopy.op0 = dcopy.op1 = d->op0; - if (!d->testing_p) - dcopy.target = gen_reg_rtx (vmode); - dcopy.one_operand_p = true; - - for (i = 0; i < nelt; ++i) - dcopy.perm[i] = d->perm[i] & (nelt - 1); - - ok = expand_vec_perm_1 (&dcopy); - if (GET_MODE_SIZE (vmode) != 16 && !ok) - return false; - else - gcc_assert (ok); - if (d->testing_p) - return true; - - /* Next we put permuted elements into their positions. */ - dcopy1 = *d; - if (which == 2) - dcopy1.op1 = dcopy.target; - else - dcopy1.op0 = dcopy.target; - - for (i = 0; i < nelt; ++i) - dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i); + rtx mask; + enum rtx_code code = GET_CODE (x); + enum rtx_code outer_code = (enum rtx_code) outer_code_i; + const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; + int src_cost; - ok = expand_vec_perm_blend (&dcopy1); - gcc_assert (ok); + switch (code) + { + case SET: + if (register_operand (SET_DEST (x), VOIDmode) + && register_operand (SET_SRC (x), VOIDmode)) + { + *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x))); + return true; + } - return true; -} + if (register_operand (SET_SRC (x), VOIDmode)) + /* Avoid potentially incorrect high cost from rtx_costs + for non-tieable SUBREGs. */ + src_cost = 0; + else + { + src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed); -static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); + if (CONSTANT_P (SET_SRC (x))) + /* Constant costs assume a base value of COSTS_N_INSNS (1) and add + a small value, possibly zero for cheap constants. */ + src_cost += COSTS_N_INSNS (1); + } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - a two vector permutation into a single vector permutation by using - an interleave operation to merge the vectors. 
*/ + *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed); + return true; -static bool -expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dremap, dfinal; - unsigned i, nelt = d->nelt, nelt2 = nelt / 2; - unsigned HOST_WIDE_INT contents; - unsigned char remap[2 * MAX_VECT_LEN]; - rtx_insn *seq; - bool ok, same_halves = false; + case CONST_INT: + case CONST: + case LABEL_REF: + case SYMBOL_REF: + if (x86_64_immediate_operand (x, VOIDmode)) + *total = 0; + else + *total = 1; + return true; - if (GET_MODE_SIZE (d->vmode) == 16) - { - if (d->one_operand_p) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX) - return false; - /* For 32-byte modes allow even d->one_operand_p. - The lack of cross-lane shuffling in some instructions - might prevent a single insn shuffle. */ - dfinal = *d; - dfinal.testing_p = true; - /* If expand_vec_perm_interleave3 can expand this into - a 3 insn sequence, give up and let it be expanded as - 3 insn sequence. While that is one insn longer, - it doesn't need a memory operand and in the common - case that both interleave low and high permutations - with the same operands are adjacent needs 4 insns - for both after CSE. */ - if (expand_vec_perm_interleave3 (&dfinal)) - return false; - } - else - return false; + case CONST_DOUBLE: + if (IS_STACK_MODE (mode)) + switch (standard_80387_constant_p (x)) + { + case -1: + case 0: + break; + case 1: /* 0.0 */ + *total = 1; + return true; + default: /* Other constants */ + *total = 2; + return true; + } + /* FALLTHRU */ - /* Examine from whence the elements come. */ - contents = 0; - for (i = 0; i < nelt; ++i) - contents |= HOST_WIDE_INT_1U << d->perm[i]; + case CONST_VECTOR: + switch (standard_sse_constant_p (x, mode)) + { + case 0: + break; + case 1: /* 0: xor eliminates false dependency */ + *total = 0; + return true; + default: /* -1: cmp contains false dependency */ + *total = 1; + return true; + } + /* FALLTHRU */ - memset (remap, 0xff, sizeof (remap)); - dremap = *d; + case CONST_WIDE_INT: + /* Fall back to (MEM (SYMBOL_REF)), since that's where + it'll probably end up. Add a penalty for size. */ + *total = (COSTS_N_INSNS (1) + + (!TARGET_64BIT && flag_pic) + + (GET_MODE_SIZE (mode) <= 4 + ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2)); + return true; - if (GET_MODE_SIZE (d->vmode) == 16) - { - unsigned HOST_WIDE_INT h1, h2, h3, h4; + case ZERO_EXTEND: + /* The zero extensions is often completely free on x86_64, so make + it as cheap as possible. */ + if (TARGET_64BIT && mode == DImode + && GET_MODE (XEXP (x, 0)) == SImode) + *total = 1; + else if (TARGET_ZERO_EXTEND_WITH_AND) + *total = cost->add; + else + *total = cost->movzx; + return false; - /* Split the two input vectors into 4 halves. */ - h1 = (HOST_WIDE_INT_1U << nelt2) - 1; - h2 = h1 << nelt2; - h3 = h2 << nelt2; - h4 = h3 << nelt2; + case SIGN_EXTEND: + *total = cost->movsx; + return false; - /* If the elements from the low halves use interleave low, and similarly - for interleave high. If the elements are from mis-matched halves, we - can use shufps for V4SF/V4SI or do a DImode shuffle. 
*/ - if ((contents & (h1 | h3)) == contents) - { - /* punpckl* */ - for (i = 0; i < nelt2; ++i) - { - remap[i] = i * 2; - remap[i + nelt] = i * 2 + 1; - dremap.perm[i * 2] = i; - dremap.perm[i * 2 + 1] = i + nelt; - } - if (!TARGET_SSE2 && d->vmode == V4SImode) - dremap.vmode = V4SFmode; - } - else if ((contents & (h2 | h4)) == contents) - { - /* punpckh* */ - for (i = 0; i < nelt2; ++i) - { - remap[i + nelt2] = i * 2; - remap[i + nelt + nelt2] = i * 2 + 1; - dremap.perm[i * 2] = i + nelt2; - dremap.perm[i * 2 + 1] = i + nelt + nelt2; - } - if (!TARGET_SSE2 && d->vmode == V4SImode) - dremap.vmode = V4SFmode; - } - else if ((contents & (h1 | h4)) == contents) + case ASHIFT: + if (SCALAR_INT_MODE_P (mode) + && GET_MODE_SIZE (mode) < UNITS_PER_WORD + && CONST_INT_P (XEXP (x, 1))) { - /* shufps */ - for (i = 0; i < nelt2; ++i) + HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); + if (value == 1) { - remap[i] = i; - remap[i + nelt + nelt2] = i + nelt2; - dremap.perm[i] = i; - dremap.perm[i + nelt2] = i + nelt + nelt2; + *total = cost->add; + return false; } - if (nelt != 4) + if ((value == 2 || value == 3) + && cost->lea <= cost->shift_const) { - /* shufpd */ - dremap.vmode = V2DImode; - dremap.nelt = 2; - dremap.perm[0] = 0; - dremap.perm[1] = 3; + *total = cost->lea; + return false; } } - else if ((contents & (h2 | h3)) == contents) + /* FALLTHRU */ + + case ROTATE: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + bool skip_op0, skip_op1; + *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), + CONST_INT_P (XEXP (x, 1)) + ? INTVAL (XEXP (x, 1)) : -1, + speed, + GET_CODE (XEXP (x, 1)) == AND, + SUBREG_P (XEXP (x, 1)) + && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, + &skip_op0, &skip_op1); + if (skip_op0 || skip_op1) { - /* shufps */ - for (i = 0; i < nelt2; ++i) - { - remap[i + nelt2] = i; - remap[i + nelt] = i + nelt2; - dremap.perm[i] = i + nelt2; - dremap.perm[i + nelt2] = i + nelt; - } - if (nelt != 4) - { - /* shufpd */ - dremap.vmode = V2DImode; - dremap.nelt = 2; - dremap.perm[0] = 1; - dremap.perm[1] = 2; - } + if (!skip_op0) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!skip_op1) + *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed); + return true; } - else - return false; - } - else - { - unsigned int nelt4 = nelt / 4, nzcnt = 0; - unsigned HOST_WIDE_INT q[8]; - unsigned int nonzero_halves[4]; + return false; - /* Split the two input vectors into 8 quarters. */ - q[0] = (HOST_WIDE_INT_1U << nelt4) - 1; - for (i = 1; i < 8; ++i) - q[i] = q[0] << (nelt4 * i); - for (i = 0; i < 4; ++i) - if (((q[2 * i] | q[2 * i + 1]) & contents) != 0) - { - nonzero_halves[nzcnt] = i; - ++nzcnt; - } + case FMA: + { + rtx sub; - if (nzcnt == 1) - { - gcc_assert (d->one_operand_p); - nonzero_halves[1] = nonzero_halves[0]; - same_halves = true; - } - else if (d->one_operand_p) - { - gcc_assert (nonzero_halves[0] == 0); - gcc_assert (nonzero_halves[1] == 1); - } + gcc_assert (FLOAT_MODE_P (mode)); + gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); + + *total = ix86_vec_cost (mode, + GET_MODE_INNER (mode) == SFmode + ? cost->fmass : cost->fmasd); + *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); + + /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. 
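+	   The fused forms encode negation of the multiplicand or the addend directly, so a wrapping NEG is stripped before costing those operands.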
*/ + sub = XEXP (x, 0); + if (GET_CODE (sub) == NEG) + sub = XEXP (sub, 0); + *total += rtx_cost (sub, mode, FMA, 0, speed); + + sub = XEXP (x, 2); + if (GET_CODE (sub) == NEG) + sub = XEXP (sub, 0); + *total += rtx_cost (sub, mode, FMA, 2, speed); + return true; + } - if (nzcnt <= 2) + case MULT: + if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode)) { - if (d->perm[0] / nelt2 == nonzero_halves[1]) + rtx op0 = XEXP (x, 0); + rtx op1 = XEXP (x, 1); + int nbits; + if (CONST_INT_P (XEXP (x, 1))) { - /* Attempt to increase the likelihood that dfinal - shuffle will be intra-lane. */ - std::swap (nonzero_halves[0], nonzero_halves[1]); + unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); + for (nbits = 0; value != 0; value &= value - 1) + nbits++; } + else + /* This is arbitrary. */ + nbits = 7; - /* vperm2f128 or vperm2i128. */ - for (i = 0; i < nelt2; ++i) + /* Compute costs correctly for widening multiplication. */ + if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND) + && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2 + == GET_MODE_SIZE (mode)) { - remap[i + nonzero_halves[1] * nelt2] = i + nelt2; - remap[i + nonzero_halves[0] * nelt2] = i; - dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2; - dremap.perm[i] = i + nonzero_halves[0] * nelt2; + int is_mulwiden = 0; + machine_mode inner_mode = GET_MODE (op0); + + if (GET_CODE (op0) == GET_CODE (op1)) + is_mulwiden = 1, op1 = XEXP (op1, 0); + else if (CONST_INT_P (op1)) + { + if (GET_CODE (op0) == SIGN_EXTEND) + is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) + == INTVAL (op1); + else + is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode)); + } + + if (is_mulwiden) + op0 = XEXP (op0, 0), mode = GET_MODE (op0); } - if (d->vmode != V8SFmode - && d->vmode != V4DFmode - && d->vmode != V8SImode) + *total = (cost->mult_init[MODE_INDEX (mode)] + + nbits * cost->mult_bit + + rtx_cost (op0, mode, outer_code, opno, speed) + + rtx_cost (op1, mode, outer_code, opno, speed)); + + return true; + } + *total = ix86_multiplication_cost (cost, mode); + return false; + + case DIV: + case UDIV: + case MOD: + case UMOD: + *total = ix86_division_cost (cost, mode); + return false; + + case PLUS: + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) <= UNITS_PER_WORD) + { + if (GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) + && CONSTANT_P (XEXP (x, 1))) { - dremap.vmode = V8SImode; - dremap.nelt = 8; - for (i = 0; i < 4; ++i) + HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); + if (val == 2 || val == 4 || val == 8) { - dremap.perm[i] = i + nonzero_halves[0] * 4; - dremap.perm[i + 4] = i + nonzero_halves[1] * 4; + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; } } - } - else if (d->one_operand_p) - return false; - else if (TARGET_AVX2 - && (contents & (q[0] | q[2] | q[4] | q[6])) == contents) - { - /* vpunpckl* */ - for (i = 0; i < nelt4; ++i) + else if (GET_CODE (XEXP (x, 0)) == MULT + && CONST_INT_P (XEXP (XEXP (x, 0), 1))) { - remap[i] = i * 2; - remap[i + nelt] = i * 2 + 1; - remap[i + nelt2] = i * 2 + nelt2; - remap[i + nelt + nelt2] = i * 2 + nelt2 + 1; - dremap.perm[i * 2] = i; - dremap.perm[i * 2 + 1] = i + nelt; - dremap.perm[i * 2 + nelt2] = i + nelt2; - dremap.perm[i * 2 + nelt2 + 1] = i + nelt 
+ nelt2; + HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); + if (val == 2 || val == 4 || val == 8) + { + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; + } } - } - else if (TARGET_AVX2 - && (contents & (q[1] | q[3] | q[5] | q[7])) == contents) - { - /* vpunpckh* */ - for (i = 0; i < nelt4; ++i) + else if (GET_CODE (XEXP (x, 0)) == PLUS) { - remap[i + nelt4] = i * 2; - remap[i + nelt + nelt4] = i * 2 + 1; - remap[i + nelt2 + nelt4] = i * 2 + nelt2; - remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1; - dremap.perm[i * 2] = i + nelt4; - dremap.perm[i * 2 + 1] = i + nelt + nelt4; - dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4; - dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4; + /* Add with carry, ignore the cost of adding a carry flag. */ + if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode)) + *total = cost->add; + else + { + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed); + } + + *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; } } - else - return false; - } + /* FALLTHRU */ - /* Use the remapping array set up above to move the elements from their - swizzled locations into their final destinations. */ - dfinal = *d; - for (i = 0; i < nelt; ++i) - { - unsigned e = remap[d->perm[i]]; - gcc_assert (e < nelt); - /* If same_halves is true, both halves of the remapped vector are the - same. Avoid cross-lane accesses if possible. */ - if (same_halves && i >= nelt2) + case MINUS: + /* Subtract with borrow, ignore the cost of subtracting a carry flag. */ + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) <= UNITS_PER_WORD + && GET_CODE (XEXP (x, 0)) == MINUS + && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode)) { - gcc_assert (e < nelt2); - dfinal.perm[i] = e + nelt2; + *total = cost->add; + *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; } - else - dfinal.perm[i] = e; - } - if (!d->testing_p) - { - dremap.target = gen_reg_rtx (dremap.vmode); - dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); - } - dfinal.op1 = dfinal.op0; - dfinal.one_operand_p = true; - /* Test if the final remap can be done with a single insn. For V4SFmode or - V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */ - start_sequence (); - ok = expand_vec_perm_1 (&dfinal); - seq = get_insns (); - end_sequence (); - - if (!ok) - return false; - - if (d->testing_p) - return true; - - if (dremap.vmode != dfinal.vmode) - { - dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0); - dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1); - } - - ok = expand_vec_perm_1 (&dremap); - gcc_assert (ok); - - emit_insn (seq); - return true; -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - a single vector cross-lane permutation into vpermq followed - by any of the single insn permutations. 
*/ - -static bool -expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dremap, dfinal; - unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4; - unsigned contents[2]; - bool ok; - - if (!(TARGET_AVX2 - && (d->vmode == V32QImode || d->vmode == V16HImode) - && d->one_operand_p)) - return false; - - contents[0] = 0; - contents[1] = 0; - for (i = 0; i < nelt2; ++i) - { - contents[0] |= 1u << (d->perm[i] / nelt4); - contents[1] |= 1u << (d->perm[i + nelt2] / nelt4); - } - - for (i = 0; i < 2; ++i) - { - unsigned int cnt = 0; - for (j = 0; j < 4; ++j) - if ((contents[i] & (1u << j)) != 0 && ++cnt > 2) + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + *total = cost->addss; return false; - } - - if (d->testing_p) - return true; - - dremap = *d; - dremap.vmode = V4DImode; - dremap.nelt = 4; - dremap.target = gen_reg_rtx (V4DImode); - dremap.op0 = gen_lowpart (V4DImode, d->op0); - dremap.op1 = dremap.op0; - dremap.one_operand_p = true; - for (i = 0; i < 2; ++i) - { - unsigned int cnt = 0; - for (j = 0; j < 4; ++j) - if ((contents[i] & (1u << j)) != 0) - dremap.perm[2 * i + cnt++] = j; - for (; cnt < 2; ++cnt) - dremap.perm[2 * i + cnt] = 0; - } - - dfinal = *d; - dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); - dfinal.op1 = dfinal.op0; - dfinal.one_operand_p = true; - for (i = 0, j = 0; i < nelt; ++i) - { - if (i == nelt2) - j = 2; - dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0); - if ((d->perm[i] / nelt4) == dremap.perm[j]) - ; - else if ((d->perm[i] / nelt4) == dremap.perm[j + 1]) - dfinal.perm[i] |= nelt4; - else - gcc_unreachable (); - } - - ok = expand_vec_perm_1 (&dremap); - gcc_assert (ok); - - ok = expand_vec_perm_1 (&dfinal); - gcc_assert (ok); - - return true; -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand - a vector permutation using two instructions, vperm2f128 resp. - vperm2i128 followed by any single in-lane permutation. */ - -static bool -expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dfirst, dsecond; - unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm; - bool ok; + } + else if (X87_FLOAT_MODE_P (mode)) + { + *total = cost->fadd; + return false; + } + else if (FLOAT_MODE_P (mode)) + { + *total = ix86_vec_cost (mode, cost->addss); + return false; + } + /* FALLTHRU */ - if (!TARGET_AVX - || GET_MODE_SIZE (d->vmode) != 32 - || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)) - return false; + case AND: + case IOR: + case XOR: + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) > UNITS_PER_WORD) + { + *total = (cost->add * 2 + + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) + << (GET_MODE (XEXP (x, 0)) != DImode)) + + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) + << (GET_MODE (XEXP (x, 1)) != DImode))); + return true; + } + /* FALLTHRU */ - dsecond = *d; - dsecond.one_operand_p = false; - dsecond.testing_p = true; - - /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128 - immediate. For perm < 16 the second permutation uses - d->op0 as first operand, for perm >= 16 it uses d->op1 - as first operand. The second operand is the result of - vperm2[fi]128. */ - for (perm = 0; perm < 32; perm++) - { - /* Ignore permutations which do not move anything cross-lane. */ - if (perm < 16) - { - /* The second shuffle for e.g. V4DFmode has - 0123 and ABCD operands. - Ignore AB23, as 23 is already in the second lane - of the first operand. 
*/ - if ((perm & 0xc) == (1 << 2)) continue; - /* And 01CD, as 01 is in the first lane of the first - operand. */ - if ((perm & 3) == 0) continue; - /* And 4567, as then the vperm2[fi]128 doesn't change - anything on the original 4567 second operand. */ - if ((perm & 0xf) == ((3 << 2) | 2)) continue; + case NEG: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + *total = cost->sse_op; + return false; } - else + else if (X87_FLOAT_MODE_P (mode)) { - /* The second shuffle for e.g. V4DFmode has - 4567 and ABCD operands. - Ignore AB67, as 67 is already in the second lane - of the first operand. */ - if ((perm & 0xc) == (3 << 2)) continue; - /* And 45CD, as 45 is in the first lane of the first - operand. */ - if ((perm & 3) == 2) continue; - /* And 0123, as then the vperm2[fi]128 doesn't change - anything on the original 0123 first operand. */ - if ((perm & 0xf) == (1 << 2)) continue; - } - - for (i = 0; i < nelt; i++) - { - j = d->perm[i] / nelt2; - if (j == ((perm >> (2 * (i >= nelt2))) & 3)) - dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1)); - else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16)) - dsecond.perm[i] = d->perm[i] & (nelt - 1); - else - break; + *total = cost->fchs; + return false; } - - if (i == nelt) + else if (FLOAT_MODE_P (mode)) { - start_sequence (); - ok = expand_vec_perm_1 (&dsecond); - end_sequence (); + *total = ix86_vec_cost (mode, cost->sse_op); + return false; } + /* FALLTHRU */ + + case NOT: + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + *total = ix86_vec_cost (mode, cost->sse_op); + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + *total = cost->add * 2; else - ok = false; + *total = cost->add; + return false; - if (ok) + case COMPARE: + if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT + && XEXP (XEXP (x, 0), 1) == const1_rtx + && CONST_INT_P (XEXP (XEXP (x, 0), 2)) + && XEXP (x, 1) == const0_rtx) { - if (d->testing_p) - return true; - - /* Found a usable second shuffle. dfirst will be - vperm2f128 on d->op0 and d->op1. */ - dsecond.testing_p = false; - dfirst = *d; - dfirst.target = gen_reg_rtx (d->vmode); - for (i = 0; i < nelt; i++) - dfirst.perm[i] = (i & (nelt2 - 1)) - + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2; - - canonicalize_perm (&dfirst); - ok = expand_vec_perm_1 (&dfirst); - gcc_assert (ok); - - /* And dsecond is some single insn shuffle, taking - d->op0 and result of vperm2f128 (if perm < 16) or - d->op1 and result of vperm2f128 (otherwise). */ - if (perm >= 16) - dsecond.op0 = dsecond.op1; - dsecond.op1 = dfirst.target; - - ok = expand_vec_perm_1 (&dsecond); - gcc_assert (ok); - + /* This kind of construct is implemented using test[bwl]. + Treat it as if we had an AND. */ + mode = GET_MODE (XEXP (XEXP (x, 0), 0)); + *total = (cost->add + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, + opno, speed) + + rtx_cost (const1_rtx, mode, outer_code, opno, speed)); return true; } - /* For one operand, the only useful vperm2f128 permutation is 0x01 - aka lanes swap. */ - if (d->one_operand_p) - return false; - } + /* The embedded comparison operand is completely free. */ + if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0))) + && XEXP (x, 1) == const0_rtx) + *total = 0; - return false; -} + return false; -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - a two vector permutation using 2 intra-lane interleave insns - and cross-lane shuffle for 32-byte vectors. 
*/ + case FLOAT_EXTEND: + if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + *total = 0; + else + *total = ix86_vec_cost (mode, cost->addss); + return false; -static bool -expand_vec_perm_interleave3 (struct expand_vec_perm_d *d) -{ - unsigned i, nelt; - rtx (*gen) (rtx, rtx, rtx); + case FLOAT_TRUNCATE: + if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + *total = cost->fadd; + else + *total = ix86_vec_cost (mode, cost->addss); + return false; - if (d->one_operand_p) - return false; - if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) - ; - else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode)) - ; - else - return false; + case ABS: + /* SSE requires memory load for the constant operand. It may make + sense to account for this. Of course the constant operand may or + may not be reused. */ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + *total = cost->sse_op; + else if (X87_FLOAT_MODE_P (mode)) + *total = cost->fabs; + else if (FLOAT_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->sse_op); + return false; - nelt = d->nelt; - if (d->perm[0] != 0 && d->perm[0] != nelt / 2) - return false; - for (i = 0; i < nelt; i += 2) - if (d->perm[i] != d->perm[0] + i / 2 - || d->perm[i + 1] != d->perm[0] + i / 2 + nelt) + case SQRT: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd; + else if (X87_FLOAT_MODE_P (mode)) + *total = cost->fsqrt; + else if (FLOAT_MODE_P (mode)) + *total = ix86_vec_cost (mode, + mode == SFmode ? cost->sqrtss : cost->sqrtsd); return false; - if (d->testing_p) - return true; + case UNSPEC: + if (XINT (x, 1) == UNSPEC_TP) + *total = 0; + return false; - switch (d->vmode) - { - case E_V32QImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv32qi; - else - gen = gen_vec_interleave_lowv32qi; - break; - case E_V16HImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv16hi; - else - gen = gen_vec_interleave_lowv16hi; - break; - case E_V8SImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv8si; - else - gen = gen_vec_interleave_lowv8si; - break; - case E_V4DImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv4di; - else - gen = gen_vec_interleave_lowv4di; - break; - case E_V8SFmode: - if (d->perm[0]) - gen = gen_vec_interleave_highv8sf; - else - gen = gen_vec_interleave_lowv8sf; - break; - case E_V4DFmode: - if (d->perm[0]) - gen = gen_vec_interleave_highv4df; + case VEC_SELECT: + case VEC_CONCAT: + case VEC_DUPLICATE: + /* ??? Assume all of these vector manipulation patterns are + recognizable. In which case they all pretty much have the + same cost. */ + *total = cost->sse_op; + return true; + case VEC_MERGE: + mask = XEXP (x, 2); + /* This is masked instruction, assume the same cost, + as nonmasked variant. */ + if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) + *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); else - gen = gen_vec_interleave_lowv4df; - break; + *total = cost->sse_op; + return true; + default: - gcc_unreachable (); + return false; } - - emit_insn (gen (d->target, d->op0, d->op1)); - return true; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement - a single vector permutation using a single intra-lane vector - permutation, vperm2f128 swapping the lanes and vblend* insn blending - the non-swapped and swapped vectors together. 
*/ - -static bool -expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dfirst, dsecond; - unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2; - rtx_insn *seq; - bool ok; - rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; - - if (!TARGET_AVX - || TARGET_AVX2 - || (d->vmode != V8SFmode && d->vmode != V4DFmode) - || !d->one_operand_p) - return false; - - dfirst = *d; - for (i = 0; i < nelt; i++) - dfirst.perm[i] = 0xff; - for (i = 0, msk = 0; i < nelt; i++) - { - j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; - if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i]) - return false; - dfirst.perm[j] = d->perm[i]; - if (j != i) - msk |= (1 << i); - } - for (i = 0; i < nelt; i++) - if (dfirst.perm[i] == 0xff) - dfirst.perm[i] = i; - - if (!d->testing_p) - dfirst.target = gen_reg_rtx (dfirst.vmode); - - start_sequence (); - ok = expand_vec_perm_1 (&dfirst); - seq = get_insns (); - end_sequence (); - - if (!ok) - return false; - - if (d->testing_p) - return true; - - emit_insn (seq); - - dsecond = *d; - dsecond.op0 = dfirst.target; - dsecond.op1 = dfirst.target; - dsecond.one_operand_p = true; - dsecond.target = gen_reg_rtx (dsecond.vmode); - for (i = 0; i < nelt; i++) - dsecond.perm[i] = i ^ nelt2; - - ok = expand_vec_perm_1 (&dsecond); - gcc_assert (ok); +#if TARGET_MACHO - blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; - emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk))); - return true; -} +static int current_machopic_label_num; -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF - permutation using two vperm2f128, followed by a vshufpd insn blending - the two vectors together. */ +/* Given a symbol name and its associated stub, write out the + definition of the stub. */ -static bool -expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d) +void +machopic_output_stub (FILE *file, const char *symb, const char *stub) { - struct expand_vec_perm_d dfirst, dsecond, dthird; - bool ok; - - if (!TARGET_AVX || (d->vmode != V4DFmode)) - return false; - - if (d->testing_p) - return true; - - dfirst = *d; - dsecond = *d; - dthird = *d; - - dfirst.perm[0] = (d->perm[0] & ~1); - dfirst.perm[1] = (d->perm[0] & ~1) + 1; - dfirst.perm[2] = (d->perm[2] & ~1); - dfirst.perm[3] = (d->perm[2] & ~1) + 1; - dsecond.perm[0] = (d->perm[1] & ~1); - dsecond.perm[1] = (d->perm[1] & ~1) + 1; - dsecond.perm[2] = (d->perm[3] & ~1); - dsecond.perm[3] = (d->perm[3] & ~1) + 1; - dthird.perm[0] = (d->perm[0] % 2); - dthird.perm[1] = (d->perm[1] % 2) + 4; - dthird.perm[2] = (d->perm[2] % 2) + 2; - dthird.perm[3] = (d->perm[3] % 2) + 6; - - dfirst.target = gen_reg_rtx (dfirst.vmode); - dsecond.target = gen_reg_rtx (dsecond.vmode); - dthird.op0 = dfirst.target; - dthird.op1 = dsecond.target; - dthird.one_operand_p = false; - - canonicalize_perm (&dfirst); - canonicalize_perm (&dsecond); - - ok = expand_vec_perm_1 (&dfirst) - && expand_vec_perm_1 (&dsecond) - && expand_vec_perm_1 (&dthird); + unsigned int length; + char *binder_name, *symbol_name, lazy_ptr_name[32]; + int label = ++current_machopic_label_num; - gcc_assert (ok); + /* For 64-bit we shouldn't get here. */ + gcc_assert (!TARGET_64BIT); - return true; -} + /* Lose our funky encoding stuff so it doesn't contaminate the stub. */ + symb = targetm.strip_name_encoding (symb); -/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word - permutation with two pshufb insns and an ior. We should have already - failed all two instruction sequences. 
*/ + length = strlen (stub); + binder_name = XALLOCAVEC (char, length + 32); + GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length); -static bool -expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) -{ - rtx rperm[2][16], vperm, l, h, op, m128; - unsigned int i, nelt, eltsz; + length = strlen (symb); + symbol_name = XALLOCAVEC (char, length + 32); + GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length); - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) - return false; - gcc_assert (!d->one_operand_p); + sprintf (lazy_ptr_name, "L%d$lz", label); - if (d->testing_p) - return true; + if (MACHOPIC_ATT_STUB) + switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]); + else if (MACHOPIC_PURE) + switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]); + else + switch_to_section (darwin_sections[machopic_symbol_stub_section]); - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); + fprintf (file, "%s:\n", stub); + fprintf (file, "\t.indirect_symbol %s\n", symbol_name); - /* Generate two permutation masks. If the required element is within - the given vector it is shuffled into the proper lane. If the required - element is in the other vector, force a zero into the lane by setting - bit 7 in the permutation mask. */ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) + if (MACHOPIC_ATT_STUB) { - unsigned j, e = d->perm[i]; - unsigned which = (e >= nelt); - if (e >= nelt) - e -= nelt; - - for (j = 0; j < eltsz; ++j) - { - rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); - rperm[1-which][i*eltsz + j] = m128; - } + fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n"); } - - vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); - vperm = force_reg (V16QImode, vperm); - - l = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op0); - emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); - - vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); - vperm = force_reg (V16QImode, vperm); - - h = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op1); - emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); - - op = d->target; - if (d->vmode != V16QImode) - op = gen_reg_rtx (V16QImode); - emit_insn (gen_iorv16qi3 (op, l, h)); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); - - return true; -} - -/* Implement arbitrary permutation of one V32QImode and V16QImode operand - with two vpshufb insns, vpermq and vpor. We should have already failed - all two or three instruction sequences. */ - -static bool -expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d) -{ - rtx rperm[2][32], vperm, l, h, hp, op, m128; - unsigned int i, nelt, eltsz; - - if (!TARGET_AVX2 - || !d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; - - if (d->testing_p) - return true; - - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate two permutation masks. If the required element is within - the same lane, it is shuffled in. If the required element from the - other lane, force a zero by setting bit 7 in the permutation mask. - In the other mask the mask has non-negative elements if element - is requested from the other lane, but also moved to the other lane, - so that the result of vpshufb can have the two V2TImode halves - swapped. 
*/ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) + else if (MACHOPIC_PURE) { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; - - for (j = 0; j < eltsz; ++j) - { - rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j); - rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128; - } + /* PIC stub. */ + /* 25-byte PIC stub using "CALL get_pc_thunk". */ + rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */); + output_set_got (tmp, NULL_RTX); /* "CALL ___.get_pc_thunk.cx". */ + fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", + label, lazy_ptr_name, label); + fprintf (file, "\tjmp\t*%%ecx\n"); } + else + fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name); - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); - vperm = force_reg (V32QImode, vperm); - - h = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + /* The AT&T-style ("self-modifying") stub is not lazily bound, thus + it needs no stub-binding-helper. */ + if (MACHOPIC_ATT_STUB) + return; - /* Swap the 128-byte lanes of h into hp. */ - hp = gen_reg_rtx (V4DImode); - op = gen_lowpart (V4DImode, h); - emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx, - const1_rtx)); + fprintf (file, "%s:\n", binder_name); - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); - vperm = force_reg (V32QImode, vperm); + if (MACHOPIC_PURE) + { + fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name); + fprintf (file, "\tpushl\t%%ecx\n"); + } + else + fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name); - l = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + fputs ("\tjmp\tdyld_stub_binding_helper\n", file); - op = d->target; - if (d->vmode != V32QImode) - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp))); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + /* N.B. Keep the correspondence of these + 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the + old-pic/new-pic/non-pic stubs; altering this will break + compatibility with existing dylibs. */ + if (MACHOPIC_PURE) + { + /* 25-byte PIC stub using "CALL get_pc_thunk". */ + switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]); + } + else + /* 16-byte -mdynamic-no-pic stub. */ + switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]); - return true; + fprintf (file, "%s:\n", lazy_ptr_name); + fprintf (file, "\t.indirect_symbol %s\n", symbol_name); + fprintf (file, ASM_LONG "%s\n", binder_name); } +#endif /* TARGET_MACHO */ -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V32QImode and V16QImode operand - with two vpshufb insns, vpor and vpermq. We should have already - failed all two or three instruction sequences. */ +/* Order the registers for register allocator. 
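+   Call-clobbered general registers are handed out first, then the call-saved ones; x87 registers are preferred over SSE registers only when SSE math is not enabled (see the TARGET_SSE_MATH checks below).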
*/ -static bool -expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) +void +x86_order_regs_for_local_alloc (void) { - rtx rperm[2][32], vperm, l, h, ior, op, m128; - unsigned int i, nelt, eltsz; - - if (!TARGET_AVX2 - || d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; - - for (i = 0; i < d->nelt; ++i) - if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2)) - return false; - - if (d->testing_p) - return true; + int pos = 0; + int i; - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate two permutation masks. In the first permutation mask - the first quarter will contain indexes for the first half - of the op0, the second quarter will contain bit 7 set, third quarter - will contain indexes for the second half of the op0 and the - last quarter bit 7 set. In the second permutation mask - the first quarter will contain bit 7 set, the second quarter - indexes for the first half of the op1, the third quarter bit 7 set - and last quarter indexes for the second half of the op1. - I.e. the first mask e.g. for V32QImode extract even will be: - 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128 - (all values masked with 0xf except for -128) and second mask - for extract even will be - -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned which = d->perm[i] >= nelt; - unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0; + /* First allocate the local general purpose registers. */ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i)) + reg_alloc_order [pos++] = i; - for (j = 0; j < eltsz; ++j) - { - rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j); - rperm[1 - which][(i * eltsz + j) ^ xorv] = m128; - } - } + /* Global general purpose registers. */ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (GENERAL_REGNO_P (i) && !call_used_or_fixed_reg_p (i)) + reg_alloc_order [pos++] = i; - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); - vperm = force_reg (V32QImode, vperm); + /* x87 registers come first in case we are doing FP math + using them. */ + if (!TARGET_SSE_MATH) + for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) + reg_alloc_order [pos++] = i; - l = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + /* SSE registers. */ + for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++) + reg_alloc_order [pos++] = i; + for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) + reg_alloc_order [pos++] = i; - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); - vperm = force_reg (V32QImode, vperm); + /* Extended REX SSE registers. */ + for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) + reg_alloc_order [pos++] = i; - h = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op1); - emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + /* Mask register. */ + for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) + reg_alloc_order [pos++] = i; - ior = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (ior, l, h)); + /* x87 registers. */ + if (TARGET_SSE_MATH) + for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) + reg_alloc_order [pos++] = i; - /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. 
*/ - op = gen_reg_rtx (V4DImode); - ior = gen_lowpart (V4DImode, ior); - emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++) + reg_alloc_order [pos++] = i; - return true; + /* Initialize the rest of array as we do not allocate some registers + at all. */ + while (pos < FIRST_PSEUDO_REGISTER) + reg_alloc_order [pos++] = 0; } -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands - with two "and" and "pack" or two "shift" and "pack" insns. We should - have already failed all two instruction sequences. */ - static bool -expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) +ix86_ms_bitfield_layout_p (const_tree record_type) { - rtx op, dop0, dop1, t; - unsigned i, odd, c, s, nelt = d->nelt; - bool end_perm = false; - machine_mode half_mode; - rtx (*gen_and) (rtx, rtx, rtx); - rtx (*gen_pack) (rtx, rtx, rtx); - rtx (*gen_shift) (rtx, rtx, rtx); + return ((TARGET_MS_BITFIELD_LAYOUT + && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type))) + || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type))); +} - if (d->one_operand_p) - return false; +/* Returns an expression indicating where the this parameter is + located on entry to the FUNCTION. */ - switch (d->vmode) - { - case E_V8HImode: - /* Required for "pack". */ - if (!TARGET_SSE4_1) - return false; - c = 0xffff; - s = 16; - half_mode = V4SImode; - gen_and = gen_andv4si3; - gen_pack = gen_sse4_1_packusdw; - gen_shift = gen_lshrv4si3; - break; - case E_V16QImode: - /* No check as all instructions are SSE2. */ - c = 0xff; - s = 8; - half_mode = V8HImode; - gen_and = gen_andv8hi3; - gen_pack = gen_sse2_packuswb; - gen_shift = gen_lshrv8hi3; - break; - case E_V16HImode: - if (!TARGET_AVX2) - return false; - c = 0xffff; - s = 16; - half_mode = V8SImode; - gen_and = gen_andv8si3; - gen_pack = gen_avx2_packusdw; - gen_shift = gen_lshrv8si3; - end_perm = true; - break; - case E_V32QImode: - if (!TARGET_AVX2) - return false; - c = 0xff; - s = 8; - half_mode = V16HImode; - gen_and = gen_andv16hi3; - gen_pack = gen_avx2_packuswb; - gen_shift = gen_lshrv16hi3; - end_perm = true; - break; - default: - /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than - general shuffles. */ - return false; - } +static rtx +x86_this_parameter (tree function) +{ + tree type = TREE_TYPE (function); + bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; + int nregs; - /* Check that permutation is even or odd. 
*/ - odd = d->perm[0]; - if (odd > 1) - return false; + if (TARGET_64BIT) + { + const int *parm_regs; - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; + if (ix86_function_type_abi (type) == MS_ABI) + parm_regs = x86_64_ms_abi_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + return gen_rtx_REG (Pmode, parm_regs[aggr]); + } - if (d->testing_p) - return true; + nregs = ix86_function_regparm (type, function); - dop0 = gen_reg_rtx (half_mode); - dop1 = gen_reg_rtx (half_mode); - if (odd == 0) - { - t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); - t = force_reg (half_mode, t); - emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); - emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); - } - else + if (nregs > 0 && !stdarg_p (type)) { - emit_insn (gen_shift (dop0, - gen_lowpart (half_mode, d->op0), - GEN_INT (s))); - emit_insn (gen_shift (dop1, - gen_lowpart (half_mode, d->op1), - GEN_INT (s))); - } - /* In AVX2 for 256 bit case we need to permute pack result. */ - if (TARGET_AVX2 && end_perm) - { - op = gen_reg_rtx (d->vmode); - t = gen_reg_rtx (V4DImode); - emit_insn (gen_pack (op, dop0, dop1)); - emit_insn (gen_avx2_permv4di_1 (t, - gen_lowpart (V4DImode, op), - const0_rtx, - const2_rtx, - const1_rtx, - GEN_INT (3))); - emit_move_insn (d->target, gen_lowpart (d->vmode, t)); + int regno; + unsigned int ccvt = ix86_get_callcvt (type); + + if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + regno = aggr ? DX_REG : CX_REG; + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + { + regno = CX_REG; + if (aggr) + return gen_rtx_MEM (SImode, + plus_constant (Pmode, stack_pointer_rtx, 4)); + } + else + { + regno = AX_REG; + if (aggr) + { + regno = DX_REG; + if (nregs == 1) + return gen_rtx_MEM (SImode, + plus_constant (Pmode, + stack_pointer_rtx, 4)); + } + } + return gen_rtx_REG (SImode, regno); } - else - emit_insn (gen_pack (d->target, dop0, dop1)); - return true; + return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx, + aggr ? 8 : 4)); } -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V64QI operands - with two "shifts", two "truncs" and one "concat" insns for "odd" - and two "truncs" and one concat insn for "even." - Have already failed all two instruction sequences. */ +/* Determine whether x86_output_mi_thunk can succeed. */ static bool -expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d) +x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset, + const_tree function) { - rtx t1, t2, t3, t4; - unsigned i, odd, nelt = d->nelt; - - if (!TARGET_AVX512BW - || d->one_operand_p - || d->vmode != V64QImode) - return false; - - /* Check that permutation is even or odd. */ - odd = d->perm[0]; - if (odd > 1) - return false; - - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; - - if (d->testing_p) + /* 64-bit can handle anything. */ + if (TARGET_64BIT) return true; + /* For 32-bit, everything's fine if we have one free register. */ + if (ix86_function_regparm (TREE_TYPE (function), function) < 3) + return true; - if (odd) - { - t1 = gen_reg_rtx (V32HImode); - t2 = gen_reg_rtx (V32HImode); - emit_insn (gen_lshrv32hi3 (t1, - gen_lowpart (V32HImode, d->op0), - GEN_INT (8))); - emit_insn (gen_lshrv32hi3 (t2, - gen_lowpart (V32HImode, d->op1), - GEN_INT (8))); - } - else - { - t1 = gen_lowpart (V32HImode, d->op0); - t2 = gen_lowpart (V32HImode, d->op1); - } + /* Need a free register for vcall_offset. 
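+     With regparm (3) in effect at this point, %eax, %edx and %ecx may all carry arguments, so no scratch register is left for the *(*this + vcall_offset) load.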
*/ + if (vcall_offset) + return false; - t3 = gen_reg_rtx (V32QImode); - t4 = gen_reg_rtx (V32QImode); - emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1)); - emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2)); - emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4)); + /* Need a free register for GOT references. */ + if (flag_pic && !targetm.binds_local_p (function)) + return false; + /* Otherwise ok. */ return true; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even - and extract-odd permutations. */ +/* Output the assembler code for a thunk function. THUNK_DECL is the + declaration for the thunk function itself, FUNCTION is the decl for + the target function. DELTA is an immediate constant offset to be + added to THIS. If VCALL_OFFSET is nonzero, the word at + *(*this + vcall_offset) should be added to THIS. */ -static bool -expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) +static void +x86_output_mi_thunk (FILE *file, tree thunk_fndecl, HOST_WIDE_INT delta, + HOST_WIDE_INT vcall_offset, tree function) { - rtx t1, t2, t3, t4, t5; + const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk_fndecl)); + rtx this_param = x86_this_parameter (function); + rtx this_reg, tmp, fnaddr; + unsigned int tmp_regno; + rtx_insn *insn; - switch (d->vmode) + if (TARGET_64BIT) + tmp_regno = R10_REG; + else { - case E_V4DFmode: - if (d->testing_p) - break; - t1 = gen_reg_rtx (V4DFmode); - t2 = gen_reg_rtx (V4DFmode); - - /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ - emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20))); - emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31))); - - /* Now an unpck[lh]pd will produce the result required. */ - if (odd) - t3 = gen_avx_unpckhpd256 (d->target, t1, t2); + unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function)); + if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + tmp_regno = AX_REG; + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + tmp_regno = DX_REG; else - t3 = gen_avx_unpcklpd256 (d->target, t1, t2); - emit_insn (t3); - break; + tmp_regno = CX_REG; + } - case E_V8SFmode: - { - int mask = odd ? 0xdd : 0x88; + emit_note (NOTE_INSN_PROLOGUE_END); - if (d->testing_p) - break; - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SFmode); - t3 = gen_reg_rtx (V8SFmode); - - /* Shuffle within the 128-bit lanes to produce: - { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */ - emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1, - GEN_INT (mask))); - - /* Shuffle the lanes around to produce: - { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */ - emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1, - GEN_INT (0x3))); - - /* Shuffle within the 128-bit lanes to produce: - { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */ - emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44))); - - /* Shuffle within the 128-bit lanes to produce: - { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */ - emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee))); - - /* Shuffle the lanes around to produce: - { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ - emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, - GEN_INT (0x20))); - } - break; + /* CET is enabled, insert EB instruction. */ + if ((flag_cf_protection & CF_BRANCH)) + emit_insn (gen_nop_endbr ()); - case E_V2DFmode: - case E_V4SFmode: - case E_V2DImode: - case E_V4SImode: - /* These are always directly implementable by expand_vec_perm_1. */ - gcc_unreachable (); + /* If VCALL_OFFSET, we'll need THIS in a register. 
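+     (The vtable adjustment below has to dereference it.)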
Might as well + pull it in now and let DELTA benefit. */ + if (REG_P (this_param)) + this_reg = this_param; + else if (vcall_offset) + { + /* Put the this parameter into %eax. */ + this_reg = gen_rtx_REG (Pmode, AX_REG); + emit_move_insn (this_reg, this_param); + } + else + this_reg = NULL_RTX; - case E_V8HImode: - if (TARGET_SSE4_1) - return expand_vec_perm_even_odd_pack (d); - else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) - return expand_vec_perm_pshufb2 (d); - else + /* Adjust the this parameter by a fixed constant. */ + if (delta) + { + rtx delta_rtx = GEN_INT (delta); + rtx delta_dst = this_reg ? this_reg : this_param; + + if (TARGET_64BIT) { - if (d->testing_p) - break; - /* We need 2*log2(N)-1 operations to achieve odd/even - with interleave. */ - t1 = gen_reg_rtx (V8HImode); - t2 = gen_reg_rtx (V8HImode); - emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1)); - emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1)); - emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1)); - emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1)); - if (odd) - t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2); - else - t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2); - emit_insn (t3); + if (!x86_64_general_operand (delta_rtx, Pmode)) + { + tmp = gen_rtx_REG (Pmode, tmp_regno); + emit_move_insn (tmp, delta_rtx); + delta_rtx = tmp; + } } - break; - case E_V16QImode: - return expand_vec_perm_even_odd_pack (d); + ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx); + } + + /* Adjust the this parameter by a value stored in the vtable. */ + if (vcall_offset) + { + rtx vcall_addr, vcall_mem, this_mem; - case E_V16HImode: - case E_V32QImode: - return expand_vec_perm_even_odd_pack (d); + tmp = gen_rtx_REG (Pmode, tmp_regno); - case E_V64QImode: - return expand_vec_perm_even_odd_trunc (d); + this_mem = gen_rtx_MEM (ptr_mode, this_reg); + if (Pmode != ptr_mode) + this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem); + emit_move_insn (tmp, this_mem); - case E_V4DImode: - if (!TARGET_AVX2) + /* Adjust the this parameter. */ + vcall_addr = plus_constant (Pmode, tmp, vcall_offset); + if (TARGET_64BIT + && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true)) { - struct expand_vec_perm_d d_copy = *d; - d_copy.vmode = V4DFmode; - if (d->testing_p) - d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); - else - d_copy.target = gen_reg_rtx (V4DFmode); - d_copy.op0 = gen_lowpart (V4DFmode, d->op0); - d_copy.op1 = gen_lowpart (V4DFmode, d->op1); - if (expand_vec_perm_even_odd_1 (&d_copy, odd)) - { - if (!d->testing_p) - emit_move_insn (d->target, - gen_lowpart (V4DImode, d_copy.target)); - return true; - } - return false; + rtx tmp2 = gen_rtx_REG (Pmode, R11_REG); + emit_move_insn (tmp2, GEN_INT (vcall_offset)); + vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2); } - if (d->testing_p) - break; - - t1 = gen_reg_rtx (V4DImode); - t2 = gen_reg_rtx (V4DImode); + vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr); + if (Pmode != ptr_mode) + emit_insn (gen_addsi_1_zext (this_reg, + gen_rtx_REG (ptr_mode, + REGNO (this_reg)), + vcall_mem)); + else + ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem); + } - /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ - emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20))); - emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31))); + /* If necessary, drop THIS back to its stack slot. 
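+     This happens when the incoming this pointer lives in memory but was copied into %eax above for the delta/vcall adjustments.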
*/ + if (this_reg && this_reg != this_param) + emit_move_insn (this_param, this_reg); - /* Now an vpunpck[lh]qdq will produce the result required. */ - if (odd) - t3 = gen_avx2_interleave_highv4di (d->target, t1, t2); + fnaddr = XEXP (DECL_RTL (function), 0); + if (TARGET_64BIT) + { + if (!flag_pic || targetm.binds_local_p (function) + || TARGET_PECOFF) + ; else - t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2); - emit_insn (t3); - break; - - case E_V8SImode: - if (!TARGET_AVX2) { - struct expand_vec_perm_d d_copy = *d; - d_copy.vmode = V8SFmode; - if (d->testing_p) - d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); - else - d_copy.target = gen_reg_rtx (V8SFmode); - d_copy.op0 = gen_lowpart (V8SFmode, d->op0); - d_copy.op1 = gen_lowpart (V8SFmode, d->op1); - if (expand_vec_perm_even_odd_1 (&d_copy, odd)) - { - if (!d->testing_p) - emit_move_insn (d->target, - gen_lowpart (V8SImode, d_copy.target)); - return true; - } - return false; + tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL); + tmp = gen_rtx_CONST (Pmode, tmp); + fnaddr = gen_const_mem (Pmode, tmp); + } + } + else + { + if (!flag_pic || targetm.binds_local_p (function)) + ; +#if TARGET_MACHO + else if (TARGET_MACHO) + { + fnaddr = machopic_indirect_call_target (DECL_RTL (function)); + fnaddr = XEXP (fnaddr, 0); } +#endif /* TARGET_MACHO */ + else + { + tmp = gen_rtx_REG (Pmode, CX_REG); + output_set_got (tmp, NULL_RTX); - if (d->testing_p) - break; + fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT); + fnaddr = gen_rtx_CONST (Pmode, fnaddr); + fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr); + fnaddr = gen_const_mem (Pmode, fnaddr); + } + } - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - t3 = gen_reg_rtx (V4DImode); - t4 = gen_reg_rtx (V4DImode); - t5 = gen_reg_rtx (V4DImode); - - /* Shuffle the lanes around into - { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */ - emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0), - gen_lowpart (V4DImode, d->op1), - GEN_INT (0x20))); - emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0), - gen_lowpart (V4DImode, d->op1), - GEN_INT (0x31))); - - /* Swap the 2nd and 3rd position in each lane into - { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ - emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3), - GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); - emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4), - GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); - - /* Now an vpunpck[lh]qdq will produce - { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */ - if (odd) - t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1), - gen_lowpart (V4DImode, t2)); + /* Our sibling call patterns do not allow memories, because we have no + predicate that can distinguish between frame and non-frame memory. + For our purposes here, we can get away with (ab)using a jump pattern, + because we're going to do no optimization. 
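+     If FNADDR already satisfies sibcall_insn_operand we emit the sibling call through it directly; otherwise we fall back to an indirect jump, as the code below does.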
*/ + if (MEM_P (fnaddr)) + { + if (sibcall_insn_operand (fnaddr, word_mode)) + { + fnaddr = XEXP (DECL_RTL (function), 0); + tmp = gen_rtx_MEM (QImode, fnaddr); + tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); + tmp = emit_call_insn (tmp); + SIBLING_CALL_P (tmp) = 1; + } else - t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1), - gen_lowpart (V4DImode, t2)); - emit_insn (t3); - emit_move_insn (d->target, gen_lowpart (V8SImode, t5)); - break; + emit_jump_insn (gen_indirect_jump (fnaddr)); + } + else + { + if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr)) + { + // CM_LARGE_PIC always uses pseudo PIC register which is + // uninitialized. Since FUNCTION is local and calling it + // doesn't go through PLT, we use scratch register %r11 as + // PIC register and initialize it here. + pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG); + ix86_init_large_pic_reg (tmp_regno); + fnaddr = legitimize_pic_address (fnaddr, + gen_rtx_REG (Pmode, tmp_regno)); + } - default: - gcc_unreachable (); + if (!sibcall_insn_operand (fnaddr, word_mode)) + { + tmp = gen_rtx_REG (word_mode, tmp_regno); + if (GET_MODE (fnaddr) != word_mode) + fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); + emit_move_insn (tmp, fnaddr); + fnaddr = tmp; + } + + tmp = gen_rtx_MEM (QImode, fnaddr); + tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); + tmp = emit_call_insn (tmp); + SIBLING_CALL_P (tmp) = 1; } + emit_barrier (); - return true; + /* Emit just enough of rest_of_compilation to get the insns emitted. + Note that use_thunk calls assemble_start_function et al. */ + insn = get_insns (); + shorten_branches (insn); + assemble_start_function (thunk_fndecl, fnname); + final_start_function (insn, file, 1); + final (insn, file, 1); + final_end_function (); + assemble_end_function (thunk_fndecl, fnname); } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match - extract-even and extract-odd permutations. */ +static void +x86_file_start (void) +{ + default_file_start (); + if (TARGET_16BIT) + fputs ("\t.code16gcc\n", asm_out_file); +#if TARGET_MACHO + darwin_file_start (); +#endif + if (X86_FILE_START_VERSION_DIRECTIVE) + fputs ("\t.version\t\"01.01\"\n", asm_out_file); + if (X86_FILE_START_FLTUSED) + fputs ("\t.global\t__fltused\n", asm_out_file); + if (ix86_asm_dialect == ASM_INTEL) + fputs ("\t.intel_syntax noprefix\n", asm_out_file); +} -static bool -expand_vec_perm_even_odd (struct expand_vec_perm_d *d) +int +x86_field_alignment (tree type, int computed) { - unsigned i, odd, nelt = d->nelt; + machine_mode mode; - odd = d->perm[0]; - if (odd != 0 && odd != 1) - return false; + if (TARGET_64BIT || TARGET_ALIGN_DOUBLE) + return computed; + if (TARGET_IAMCU) + return iamcu_alignment (type, computed); + mode = TYPE_MODE (strip_array_types (type)); + if (mode == DFmode || mode == DCmode + || GET_MODE_CLASS (mode) == MODE_INT + || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) + return MIN (32, computed); + return computed; +} - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; +/* Print call to TARGET to FILE. */ - return expand_vec_perm_even_odd_1 (d, odd); +static void +x86_print_call_or_nop (FILE *file, const char *target) +{ + if (flag_nop_mcount || !strcmp (target, "nop")) + /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ + fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); + else + fprintf (file, "1:\tcall\t%s\n", target); } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast - permutations. We assume that expand_vec_perm_1 has already failed. 
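The x86_field_alignment hook moved in above caps the alignment of double, long long and similar fields at 32 bits for 32-bit targets unless -malign-double or the IAMCU rules apply. A small layout check, sketched under the assumption of a 32-bit x86 target with the default ABI (on x86-64 the printed offset would be 8):

#include <stddef.h>
#include <stdio.h>

struct s { char c; double d; };

int
main (void)
{
  /* With -m32 and no -malign-double the double is expected at offset 4,
     because its field alignment is capped at 32 bits.  */
  printf ("offsetof (struct s, d) = %zu\n", offsetof (struct s, d));
  return 0;
}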
*/ - static bool -expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) +current_fentry_name (const char **name) { - unsigned elt = d->perm[0], nelt2 = d->nelt / 2; - machine_mode vmode = d->vmode; - unsigned char perm2[4]; - rtx op0 = d->op0, dest; - bool ok; - - switch (vmode) - { - case E_V4DFmode: - case E_V8SFmode: - /* These are special-cased in sse.md so that we can optionally - use the vbroadcast instruction. They expand to two insns - if the input happens to be in a register. */ - gcc_unreachable (); - - case E_V2DFmode: - case E_V2DImode: - case E_V4SFmode: - case E_V4SImode: - /* These are always implementable using standard shuffle patterns. */ - gcc_unreachable (); + tree attr = lookup_attribute ("fentry_name", + DECL_ATTRIBUTES (current_function_decl)); + if (!attr) + return false; + *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); + return true; +} - case E_V8HImode: - case E_V16QImode: - /* These can be implemented via interleave. We save one insn by - stopping once we have promoted to V4SImode and then use pshufd. */ - if (d->testing_p) - return true; - do - { - rtx dest; - rtx (*gen) (rtx, rtx, rtx) - = vmode == V16QImode ? gen_vec_interleave_lowv16qi - : gen_vec_interleave_lowv8hi; +static bool +current_fentry_section (const char **name) +{ + tree attr = lookup_attribute ("fentry_section", + DECL_ATTRIBUTES (current_function_decl)); + if (!attr) + return false; + *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); + return true; +} - if (elt >= nelt2) - { - gen = vmode == V16QImode ? gen_vec_interleave_highv16qi - : gen_vec_interleave_highv8hi; - elt -= nelt2; - } - nelt2 /= 2; +/* Output assembler code to FILE to increment profiler label # LABELNO + for profiling a function entry. */ +void +x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) +{ + if (cfun->machine->endbr_queued_at_entrance) + fprintf (file, "\t%s\n", TARGET_64BIT ? "endbr64" : "endbr32"); - dest = gen_reg_rtx (vmode); - emit_insn (gen (dest, op0, op0)); - vmode = get_mode_wider_vector (vmode); - op0 = gen_lowpart (vmode, dest); - } - while (vmode != V4SImode); + const char *mcount_name = MCOUNT_NAME; - memset (perm2, elt, 4); - dest = gen_reg_rtx (V4SImode); - ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); - gcc_assert (ok); - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); - return true; + if (current_fentry_name (&mcount_name)) + ; + else if (fentry_name) + mcount_name = fentry_name; + else if (flag_fentry) + mcount_name = MCOUNT_NAME_BEFORE_PROLOGUE; - case E_V64QImode: - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - /* For AVX2 broadcasts of the first element vpbroadcast* or - vpermq should be used by expand_vec_perm_1. 
*/ - gcc_assert (!TARGET_AVX2 || d->perm[0]); - return false; + if (TARGET_64BIT) + { +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno); +#endif - default: - gcc_unreachable (); + if (!TARGET_PECOFF && flag_pic) + fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name); + else + x86_print_call_or_nop (file, mcount_name); + } + else if (flag_pic) + { +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n", + LPREFIX, labelno); +#endif + fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name); + } + else + { +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n", + LPREFIX, labelno); +#endif + x86_print_call_or_nop (file, mcount_name); } -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match - broadcast permutations. */ -static bool -expand_vec_perm_broadcast (struct expand_vec_perm_d *d) -{ - unsigned i, elt, nelt = d->nelt; - - if (!d->one_operand_p) - return false; + if (flag_record_mcount + || lookup_attribute ("fentry_section", + DECL_ATTRIBUTES (current_function_decl))) + { + const char *sname = "__mcount_loc"; - elt = d->perm[0]; - for (i = 1; i < nelt; ++i) - if (d->perm[i] != elt) - return false; + if (current_fentry_section (&sname)) + ; + else if (fentry_section) + sname = fentry_section; - return expand_vec_perm_broadcast_1 (d); + fprintf (file, "\t.section %s, \"a\",@progbits\n", sname); + fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); + fprintf (file, "\t.previous\n"); + } } -/* Implement arbitrary permutations of two V64QImode operands - with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */ -static bool -expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d) +/* We don't have exact information about the insn sizes, but we may assume + quite safely that we are informed about all 1 byte insns and memory + address sizes. This is enough to eliminate unnecessary padding in + 99% of cases. */ + +int +ix86_min_insn_size (rtx_insn *insn) { - if (!TARGET_AVX512BW || !(d->vmode == V64QImode)) - return false; + int l = 0, len; - if (d->testing_p) - return true; + if (!INSN_P (insn) || !active_insn_p (insn)) + return 0; - struct expand_vec_perm_d ds[2]; - rtx rperm[128], vperm, target0, target1; - unsigned int i, nelt; - machine_mode vmode; + /* Discard alignments we've emit and jump instructions. */ + if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE + && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) + return 0; - nelt = d->nelt; - vmode = V64QImode; + /* Important case - calls are always 5 bytes. + It is common to have many calls in the row. */ + if (CALL_P (insn) + && symbolic_reference_mentioned_p (PATTERN (insn)) + && !SIBLING_CALL_P (insn)) + return 5; + len = get_attr_length (insn); + if (len <= 1) + return 1; - for (i = 0; i < 2; i++) + /* For normal instructions we rely on get_attr_length being exact, + with a few exceptions. */ + if (!JUMP_P (insn)) { - ds[i] = *d; - ds[i].vmode = V32HImode; - ds[i].nelt = 32; - ds[i].target = gen_reg_rtx (V32HImode); - ds[i].op0 = gen_lowpart (V32HImode, d->op0); - ds[i].op1 = gen_lowpart (V32HImode, d->op1); - } - - /* Prepare permutations such that the first one takes care of - putting the even bytes into the right positions or one higher - positions (ds[0]) and the second one takes care of - putting the odd bytes into the right positions or one below - (ds[1]). 
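x86_function_profiler above picks the profiler symbol from a fentry_name attribute if present, otherwise from the corresponding global option, otherwise the default MCOUNT_NAME, and records the call site in an __mcount_loc-style section when -mrecord-mcount or a fentry_section attribute is given. A usage sketch with -pg -mfentry; the tracer and section names below are invented for illustration:

/* Assumes this tree accepts the fentry_name/fentry_section attributes.  */
__attribute__ ((fentry_name ("__my_tracer"),
                fentry_section (".my_mcount_loc")))
void
traced (void)
{
  /* The entry gets "call __my_tracer", and the call site is recorded
     in section .my_mcount_loc instead of __mcount_loc.  */
}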
*/ + enum attr_type type = get_attr_type (insn); - for (i = 0; i < nelt; i++) - { - ds[i & 1].perm[i / 2] = d->perm[i] / 2; - if (i & 1) - { - rperm[i] = constm1_rtx; - rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1)); - } - else + switch (type) { - rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1)); - rperm[i + 64] = constm1_rtx; + case TYPE_MULTI: + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) + return 0; + break; + case TYPE_OTHER: + case TYPE_FCMP: + break; + default: + /* Otherwise trust get_attr_length. */ + return len; } - } - - bool ok = expand_vec_perm_1 (&ds[0]); - gcc_assert (ok); - ds[0].target = gen_lowpart (V64QImode, ds[0].target); - - ok = expand_vec_perm_1 (&ds[1]); - gcc_assert (ok); - ds[1].target = gen_lowpart (V64QImode, ds[1].target); - - vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm)); - vperm = force_reg (vmode, vperm); - target0 = gen_reg_rtx (V64QImode); - emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm)); - vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64)); - vperm = force_reg (vmode, vperm); - target1 = gen_reg_rtx (V64QImode); - emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm)); - - emit_insn (gen_iorv64qi3 (d->target, target0, target1)); - return true; + l = get_attr_length_address (insn); + if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) + l = 4; + } + if (l) + return 1+l; + else + return 2; } -/* Implement arbitrary permutation of two V32QImode and V16QImode operands - with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed - all the shorter instruction sequences. */ +#ifdef ASM_OUTPUT_MAX_SKIP_PAD -static bool -expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d) +/* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte + window. */ + +static void +ix86_avoid_jump_mispredicts (void) { - rtx rperm[4][32], vperm, l[2], h[2], op, m128; - unsigned int i, nelt, eltsz; - bool used[4]; + rtx_insn *insn, *start = get_insns (); + int nbytes = 0, njumps = 0; + bool isjump = false; - if (!TARGET_AVX2 - || d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; + /* Look for all minimal intervals of instructions containing 4 jumps. + The intervals are bounded by START and INSN. NBYTES is the total + size of instructions in the interval including INSN and not including + START. When the NBYTES is smaller than 16 bytes, it is possible + that the end of START and INSN ends up in the same 16byte page. - if (d->testing_p) - return true; + The smallest offset in the page INSN can start is the case where START + ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN). + We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN). - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate 4 permutation masks. If the required element is within - the same lane, it is shuffled in. If the required element from the - other lane, force a zero by setting bit 7 in the permutation mask. - In the other mask the mask has non-negative elements if element - is requested from the other lane, but also moved to the other lane, - so that the result of vpshufb can have the two V2TImode halves - swapped. 
*/ - m128 = GEN_INT (-128); - for (i = 0; i < 32; ++i) - { - rperm[0][i] = m128; - rperm[1][i] = m128; - rperm[2][i] = m128; - rperm[3][i] = m128; - } - used[0] = false; - used[1] = false; - used[2] = false; - used[3] = false; - for (i = 0; i < nelt; ++i) + Don't consider asm goto as jump, while it can contain a jump, it doesn't + have to, control transfer to label(s) can be performed through other + means, and also we estimate minimum length of all asm stmts as 0. */ + for (insn = start; insn; insn = NEXT_INSN (insn)) { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; - unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0); - - for (j = 0; j < eltsz; ++j) - rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j); - used[which] = true; - } + int min_size; - for (i = 0; i < 2; ++i) - { - if (!used[2 * i + 1]) + if (LABEL_P (insn)) { - h[i] = NULL_RTX; + align_flags alignment = label_to_alignment (insn); + int align = alignment.levels[0].log; + int max_skip = alignment.levels[0].maxskip; + + if (max_skip > 15) + max_skip = 15; + /* If align > 3, only up to 16 - max_skip - 1 bytes can be + already in the current 16 byte page, because otherwise + ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer + bytes to reach 16 byte boundary. */ + if (align <= 0 + || (align <= 3 && max_skip != (1 << align) - 1)) + max_skip = 0; + if (dump_file) + fprintf (dump_file, "Label %i with max_skip %i\n", + INSN_UID (insn), max_skip); + if (max_skip) + { + while (nbytes + max_skip >= 16) + { + start = NEXT_INSN (start); + if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) + || CALL_P (start)) + njumps--, isjump = true; + else + isjump = false; + nbytes -= ix86_min_insn_size (start); + } + } continue; } - vperm = gen_rtx_CONST_VECTOR (V32QImode, - gen_rtvec_v (32, rperm[2 * i + 1])); - vperm = force_reg (V32QImode, vperm); - h[i] = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm)); - } - /* Swap the 128-byte lanes of h[X]. */ - for (i = 0; i < 2; ++i) - { - if (h[i] == NULL_RTX) - continue; - op = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]), - const2_rtx, GEN_INT (3), const0_rtx, - const1_rtx)); - h[i] = gen_lowpart (V32QImode, op); - } + min_size = ix86_min_insn_size (insn); + nbytes += min_size; + if (dump_file) + fprintf (dump_file, "Insn %i estimated to %i bytes\n", + INSN_UID (insn), min_size); + if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0) + || CALL_P (insn)) + njumps++; + else + continue; - for (i = 0; i < 2; ++i) - { - if (!used[2 * i]) + while (njumps > 3) { - l[i] = NULL_RTX; - continue; + start = NEXT_INSN (start); + if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) + || CALL_P (start)) + njumps--, isjump = true; + else + isjump = false; + nbytes -= ix86_min_insn_size (start); } - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i])); - vperm = force_reg (V32QImode, vperm); - l[i] = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, i ? 
d->op1 : d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm)); - } + gcc_assert (njumps >= 0); + if (dump_file) + fprintf (dump_file, "Interval %i to %i has %i bytes\n", + INSN_UID (start), INSN_UID (insn), nbytes); - for (i = 0; i < 2; ++i) - { - if (h[i] && l[i]) + if (njumps == 3 && isjump && nbytes < 16) { - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l[i], h[i])); - l[i] = op; + int padsize = 15 - nbytes + ix86_min_insn_size (insn); + + if (dump_file) + fprintf (dump_file, "Padding insn %i by %i bytes!\n", + INSN_UID (insn), padsize); + emit_insn_before (gen_pad (GEN_INT (padsize)), insn); } - else if (h[i]) - l[i] = h[i]; } - - gcc_assert (l[0] && l[1]); - op = d->target; - if (d->vmode != V32QImode) - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l[0], l[1])); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); - return true; } +#endif -/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits - taken care of, perform the expansion in D and return true on success. */ - -static bool -ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) +/* AMD Athlon works faster + when RET is not destination of conditional jump or directly preceded + by other jump instruction. We avoid the penalty by inserting NOP just + before the RET instructions in such cases. */ +static void +ix86_pad_returns (void) { - /* Try a single instruction expansion. */ - if (expand_vec_perm_1 (d)) - return true; - - /* Try sequences of two instructions. */ - - if (expand_vec_perm_pshuflw_pshufhw (d)) - return true; - - if (expand_vec_perm_palignr (d, false)) - return true; - - if (expand_vec_perm_interleave2 (d)) - return true; - - if (expand_vec_perm_broadcast (d)) - return true; - - if (expand_vec_perm_vpermq_perm_1 (d)) - return true; - - if (expand_vec_perm_vperm2f128 (d)) - return true; - - if (expand_vec_perm_pblendv (d)) - return true; - - /* Try sequences of three instructions. */ - - if (expand_vec_perm_even_odd_pack (d)) - return true; - - if (expand_vec_perm_2vperm2f128_vshuf (d)) - return true; - - if (expand_vec_perm_pshufb2 (d)) - return true; - - if (expand_vec_perm_interleave3 (d)) - return true; - - if (expand_vec_perm_vperm2f128_vblend (d)) - return true; - - /* Try sequences of four instructions. */ - - if (expand_vec_perm_even_odd_trunc (d)) - return true; - if (expand_vec_perm_vpshufb2_vpermq (d)) - return true; - - if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) - return true; - - if (expand_vec_perm_vpermt2_vpshub2 (d)) - return true; + edge e; + edge_iterator ei; - /* ??? Look for narrow permutations whose element orderings would - allow the promotion to a wider mode. */ + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) + { + basic_block bb = e->src; + rtx_insn *ret = BB_END (bb); + rtx_insn *prev; + bool replace = false; - /* ??? Look for sequences of interleave or a wider permute that place - the data into the correct lanes for a half-vector shuffle like - pshuf[lh]w or vpermilps. */ + if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret)) + || optimize_bb_for_size_p (bb)) + continue; + for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) + if (active_insn_p (prev) || LABEL_P (prev)) + break; + if (prev && LABEL_P (prev)) + { + edge e; + edge_iterator ei; - /* ??? Look for sequences of interleave that produce the desired results. - The combinatorics of punpck[lh] get pretty ugly... 
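ix86_avoid_jump_mispredicts above keeps K8-class cores from seeing four branches in one 16-byte fetch window by padding before the fourth one. The sketch below loosely mirrors that sliding-window bookkeeping on an array of pre-computed minimum insn sizes; it is an illustration, not the pass itself:

#include <stdio.h>

static void
find_crowded_windows (const int *size, const int *is_jump, int n)
{
  int start = 0, nbytes = 0, njumps = 0, dropped_jump = 0;

  for (int i = 0; i < n; i++)
    {
      nbytes += size[i];
      if (!is_jump[i])
        continue;
      njumps++;

      /* Shrink from the front until at most three branches remain,
         remembering whether a branch was pushed out.  */
      while (njumps > 3)
        {
          dropped_jump = is_jump[start];
          njumps -= is_jump[start];
          nbytes -= size[start];
          start++;
        }

      if (njumps == 3 && dropped_jump && nbytes < 16)
        printf ("insn %d would be a 4th branch in a 16-byte window; "
                "pad it by %d byte(s)\n", i, 15 - nbytes + size[i]);
    }
}

int
main (void)
{
  int size[]    = { 2, 2, 5, 2, 2 };
  int is_jump[] = { 1, 1, 1, 0, 1 };
  find_crowded_windows (size, is_jump, 5);
  return 0;
}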
*/ + FOR_EACH_EDGE (e, ei, bb->preds) + if (EDGE_FREQUENCY (e) && e->src->index >= 0 + && !(e->flags & EDGE_FALLTHRU)) + { + replace = true; + break; + } + } + if (!replace) + { + prev = prev_active_insn (ret); + if (prev + && ((JUMP_P (prev) && any_condjump_p (prev)) + || CALL_P (prev))) + replace = true; + /* Empty functions get branch mispredict even when + the jump destination is not visible to us. */ + if (!prev && !optimize_function_for_size_p (cfun)) + replace = true; + } + if (replace) + { + emit_jump_insn_before (gen_simple_return_internal_long (), ret); + delete_insn (ret); + } + } +} - if (expand_vec_perm_even_odd (d)) - return true; +/* Count the minimum number of instructions in BB. Return 4 if the + number of instructions >= 4. */ - /* Even longer sequences. */ - if (expand_vec_perm_vpshufb4_vpermq2 (d)) - return true; +static int +ix86_count_insn_bb (basic_block bb) +{ + rtx_insn *insn; + int insn_count = 0; - /* See if we can get the same permutation in different vector integer - mode. */ - struct expand_vec_perm_d nd; - if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + /* Count number of instructions in this block. Return 4 if the number + of instructions >= 4. */ + FOR_BB_INSNS (bb, insn) { - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); - return true; + /* Only happen in exit blocks. */ + if (JUMP_P (insn) + && ANY_RETURN_P (PATTERN (insn))) + break; + + if (NONDEBUG_INSN_P (insn) + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + { + insn_count++; + if (insn_count >= 4) + return insn_count; + } } - return false; + return insn_count; } -/* If a permutation only uses one operand, make it clear. Returns true - if the permutation references both operands. */ -static bool -canonicalize_perm (struct expand_vec_perm_d *d) -{ - int i, which, nelt = d->nelt; +/* Count the minimum number of instructions in code path in BB. + Return 4 if the number of instructions >= 4. */ - for (i = which = 0; i < nelt; ++i) - which |= (d->perm[i] < nelt ? 1 : 2); +static int +ix86_count_insn (basic_block bb) +{ + edge e; + edge_iterator ei; + int min_prev_count; - d->one_operand_p = true; - switch (which) + /* Only bother counting instructions along paths with no + more than 2 basic blocks between entry and exit. Given + that BB has an edge to exit, determine if a predecessor + of BB has an edge from entry. If so, compute the number + of instructions in the predecessor block. If there + happen to be multiple such blocks, compute the minimum. */ + min_prev_count = 4; + FOR_EACH_EDGE (e, ei, bb->preds) { - default: - gcc_unreachable(); + edge prev_e; + edge_iterator prev_ei; - case 3: - if (!rtx_equal_p (d->op0, d->op1)) - { - d->one_operand_p = false; + if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) + { + min_prev_count = 0; break; - } - /* The elements of PERM do not suggest that only the first operand - is used, but both operands are identical. Allow easier matching - of the permutation by folding the permutation into the single - input vector. 
*/ - /* FALLTHRU */ - - case 2: - for (i = 0; i < nelt; ++i) - d->perm[i] &= nelt - 1; - d->op0 = d->op1; - break; - - case 1: - d->op1 = d->op0; - break; + } + FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds) + { + if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) + { + int count = ix86_count_insn_bb (e->src); + if (count < min_prev_count) + min_prev_count = count; + break; + } + } } - return (which == 3); + if (min_prev_count < 4) + min_prev_count += ix86_count_insn_bb (bb); + + return min_prev_count; } -/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ +/* Pad short function to 4 instructions. */ -static bool -ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) +static void +ix86_pad_short_function (void) { - struct expand_vec_perm_d d; - unsigned char perm[MAX_VECT_LEN]; - unsigned int i, nelt, which; - bool two_args; + edge e; + edge_iterator ei; - d.target = target; - d.op0 = op0; - d.op1 = op1; + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) + { + rtx_insn *ret = BB_END (e->src); + if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret))) + { + int insn_count = ix86_count_insn (e->src); - d.vmode = vmode; - gcc_assert (VECTOR_MODE_P (d.vmode)); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.testing_p = !target; + /* Pad short function. */ + if (insn_count < 4) + { + rtx_insn *insn = ret; - gcc_assert (sel.length () == nelt); - gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); + /* Find epilogue. */ + while (insn + && (!NOTE_P (insn) + || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)) + insn = PREV_INSN (insn); - /* Given sufficient ISA support we can just return true here - for selected vector modes. */ - switch (d.vmode) - { - case E_V16SFmode: - case E_V16SImode: - case E_V8DImode: - case E_V8DFmode: - if (!TARGET_AVX512F) - return false; - /* All implementable with a single vperm[it]2 insn. */ - if (d.testing_p) - return true; - break; - case E_V32HImode: - if (!TARGET_AVX512BW) - return false; - if (d.testing_p) - /* All implementable with a single vperm[it]2 insn. */ - return true; - break; - case E_V64QImode: - if (!TARGET_AVX512BW) - return false; - if (d.testing_p) - /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ - return true; - break; - case E_V8SImode: - case E_V8SFmode: - case E_V4DFmode: - case E_V4DImode: - if (!TARGET_AVX) - return false; - if (d.testing_p && TARGET_AVX512VL) - /* All implementable with a single vperm[it]2 insn. */ - return true; - break; - case E_V16HImode: - if (!TARGET_SSE2) - return false; - if (d.testing_p && TARGET_AVX2) - /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ - return true; - break; - case E_V32QImode: - if (!TARGET_SSE2) - return false; - if (d.testing_p && TARGET_AVX2) - /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ - return true; - break; - case E_V8HImode: - case E_V16QImode: - if (!TARGET_SSE2) - return false; - /* Fall through. */ - case E_V4SImode: - case E_V4SFmode: - if (!TARGET_SSE) - return false; - /* All implementable with a single vpperm insn. */ - if (d.testing_p && TARGET_XOP) - return true; - /* All implementable with 2 pshufb + 1 ior. */ - if (d.testing_p && TARGET_SSSE3) - return true; - break; - case E_V2DImode: - case E_V2DFmode: - if (!TARGET_SSE) - return false; - /* All implementable with shufpd or unpck[lh]pd. 
*/ - if (d.testing_p) - return true; - break; - default: - return false; - } + if (!insn) + insn = ret; - for (i = which = 0; i < nelt; ++i) - { - unsigned char e = sel[i]; - gcc_assert (e < 2 * nelt); - d.perm[i] = e; - perm[i] = e; - which |= (e < nelt ? 1 : 2); + /* Two NOPs count as one instruction. */ + insn_count = 2 * (4 - insn_count); + emit_insn_before (gen_nops (GEN_INT (insn_count)), insn); + } + } } +} - if (d.testing_p) - { - /* For all elements from second vector, fold the elements to first. */ - if (which == 2) - for (i = 0; i < nelt; ++i) - d.perm[i] -= nelt; - - /* Check whether the mask can be applied to the vector type. */ - d.one_operand_p = (which != 3); - - /* Implementable with shufps or pshufd. */ - if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode)) - return true; +/* Fix up a Windows system unwinder issue. If an EH region falls through into + the epilogue, the Windows system unwinder will apply epilogue logic and + produce incorrect offsets. This can be avoided by adding a nop between + the last insn that can throw and the first insn of the epilogue. */ - /* Otherwise we have to go through the motions and see if we can - figure out how to generate the requested permutation. */ - d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); - d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); - if (!d.one_operand_p) - d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); +static void +ix86_seh_fixup_eh_fallthru (void) +{ + edge e; + edge_iterator ei; - start_sequence (); - bool ret = ix86_expand_vec_perm_const_1 (&d); - end_sequence (); + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) + { + rtx_insn *insn, *next; - return ret; - } + /* Find the beginning of the epilogue. */ + for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn)) + if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG) + break; + if (insn == NULL) + continue; - two_args = canonicalize_perm (&d); + /* We only care about preceding insns that can throw. */ + insn = prev_active_insn (insn); + if (insn == NULL || !can_throw_internal (insn)) + continue; - if (ix86_expand_vec_perm_const_1 (&d)) - return true; + /* Do not separate calls from their debug information. */ + for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next)) + if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION) + insn = next; + else + break; - /* If the selector says both arguments are needed, but the operands are the - same, the above tried to expand with one_operand_p and flattened selector. - If that didn't work, retry without one_operand_p; we succeeded with that - during testing. */ - if (two_args && d.one_operand_p) - { - d.one_operand_p = false; - memcpy (d.perm, perm, sizeof (perm)); - return ix86_expand_vec_perm_const_1 (&d); + emit_insn_after (gen_nops (const1_rtx), insn); } - - return false; } -void -ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) +/* Implement machine specific optimizations. We implement padding of returns + for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ +static void +ix86_reorg (void) { - struct expand_vec_perm_d d; - unsigned i, nelt; - - d.target = targ; - d.op0 = op0; - d.op1 = op1; - d.vmode = GET_MODE (targ); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.one_operand_p = false; - d.testing_p = false; + /* We are freeing block_for_insn in the toplev to keep compatibility + with old MDEP_REORGS that are not CFG based. Recompute it now. 
*/ + compute_bb_for_insn (); - for (i = 0; i < nelt; ++i) - d.perm[i] = i * 2 + odd; + if (TARGET_SEH && current_function_has_exception_handlers ()) + ix86_seh_fixup_eh_fallthru (); - /* We'll either be able to implement the permutation directly... */ - if (expand_vec_perm_1 (&d)) - return; + if (optimize && optimize_function_for_speed_p (cfun)) + { + if (TARGET_PAD_SHORT_FUNCTION) + ix86_pad_short_function (); + else if (TARGET_PAD_RETURNS) + ix86_pad_returns (); +#ifdef ASM_OUTPUT_MAX_SKIP_PAD + if (TARGET_FOUR_JUMP_LIMIT) + ix86_avoid_jump_mispredicts (); +#endif + } +} - /* ... or we use the special-case patterns. */ - expand_vec_perm_even_odd_1 (&d, odd); +/* Return nonzero when QImode register that must be represented via REX prefix + is used. */ +bool +x86_extended_QIreg_mentioned_p (rtx_insn *insn) +{ + int i; + extract_insn_cached (insn); + for (i = 0; i < recog_data.n_operands; i++) + if (GENERAL_REG_P (recog_data.operand[i]) + && !QI_REGNO_P (REGNO (recog_data.operand[i]))) + return true; + return false; } -static void -ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) +/* Return true when INSN mentions register that must be encoded using REX + prefix. */ +bool +x86_extended_reg_mentioned_p (rtx insn) { - struct expand_vec_perm_d d; - unsigned i, nelt, base; - bool ok; - - d.target = targ; - d.op0 = op0; - d.op1 = op1; - d.vmode = GET_MODE (targ); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.one_operand_p = false; - d.testing_p = false; - - base = high_p ? nelt / 2 : 0; - for (i = 0; i < nelt / 2; ++i) + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST) { - d.perm[i * 2] = i + base; - d.perm[i * 2 + 1] = i + base + nelt; + const_rtx x = *iter; + if (REG_P (x) + && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x)))) + return true; } - - /* Note that for AVX this isn't one instruction. */ - ok = ix86_expand_vec_perm_const_1 (&d); - gcc_assert (ok); + return false; } +/* If profitable, negate (without causing overflow) integer constant + of mode MODE at location LOC. Return true in this case. */ +bool +x86_maybe_negate_const_int (rtx *loc, machine_mode mode) +{ + HOST_WIDE_INT val; -/* Expand a vector operation CODE for a V*QImode in terms of the - same operation on V*HImode. */ - -void -ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) -{ - machine_mode qimode = GET_MODE (dest); - machine_mode himode; - rtx (*gen_il) (rtx, rtx, rtx); - rtx (*gen_ih) (rtx, rtx, rtx); - rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h; - struct expand_vec_perm_d d; - bool ok, full_interleave; - bool uns_p = false; - int i; + if (!CONST_INT_P (*loc)) + return false; - switch (qimode) + switch (mode) { - case E_V16QImode: - himode = V8HImode; - gen_il = gen_vec_interleave_lowv16qi; - gen_ih = gen_vec_interleave_highv16qi; - break; - case E_V32QImode: - himode = V16HImode; - gen_il = gen_avx2_interleave_lowv32qi; - gen_ih = gen_avx2_interleave_highv32qi; - break; - case E_V64QImode: - himode = V32HImode; - gen_il = gen_avx512bw_interleave_lowv64qi; - gen_ih = gen_avx512bw_interleave_highv64qi; - break; - default: - gcc_unreachable (); - } + case E_DImode: + /* DImode x86_64 constants must fit in 32 bits. */ + gcc_assert (x86_64_immediate_operand (*loc, mode)); - op2_l = op2_h = op2; - switch (code) - { - case MULT: - /* Unpack data such that we've got a source byte in each low byte of - each word. We don't care what goes into the high byte of each word. 
- Rather than trying to get zero in there, most convenient is to let - it be a copy of the low byte. */ - op2_l = gen_reg_rtx (qimode); - op2_h = gen_reg_rtx (qimode); - emit_insn (gen_il (op2_l, op2, op2)); - emit_insn (gen_ih (op2_h, op2, op2)); - - op1_l = gen_reg_rtx (qimode); - op1_h = gen_reg_rtx (qimode); - emit_insn (gen_il (op1_l, op1, op1)); - emit_insn (gen_ih (op1_h, op1, op1)); - full_interleave = qimode == V16QImode; + mode = SImode; break; - case ASHIFT: - case LSHIFTRT: - uns_p = true; - /* FALLTHRU */ - case ASHIFTRT: - op1_l = gen_reg_rtx (himode); - op1_h = gen_reg_rtx (himode); - ix86_expand_sse_unpack (op1_l, op1, uns_p, false); - ix86_expand_sse_unpack (op1_h, op1, uns_p, true); - full_interleave = true; + case E_SImode: + case E_HImode: + case E_QImode: break; + default: gcc_unreachable (); } - /* Perform the operation. */ - res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, - 1, OPTAB_DIRECT); - res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, - 1, OPTAB_DIRECT); - gcc_assert (res_l && res_h); + /* Avoid overflows. */ + if (mode_signbit_p (mode, *loc)) + return false; - /* Merge the data back into the right place. */ - d.target = dest; - d.op0 = gen_lowpart (qimode, res_l); - d.op1 = gen_lowpart (qimode, res_h); - d.vmode = qimode; - d.nelt = GET_MODE_NUNITS (qimode); - d.one_operand_p = false; - d.testing_p = false; + val = INTVAL (*loc); - if (full_interleave) + /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. + Exceptions: -128 encodes smaller than 128, so swap sign and op. */ + if ((val < 0 && val != -128) + || val == 128) { - /* For SSE2, we used an full interleave, so the desired - results are in the even elements. */ - for (i = 0; i < d.nelt; ++i) - d.perm[i] = i * 2; + *loc = GEN_INT (-val); + return true; } - else - { - /* For AVX, the interleave used above was not cross-lane. So the - extraction is evens but with the second and third quarter swapped. - Happily, that is even one insn shorter than even extraction. - For AVX512BW we have 4 lanes. We extract evens from within a lane, - always first from the first and then from the second source operand, - the index bits above the low 4 bits remains the same. - Thus, for d.nelt == 32 we want permutation - 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 - and for d.nelt == 64 we want permutation - 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, - 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ - for (i = 0; i < d.nelt; ++i) - d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); - } - - ok = ix86_expand_vec_perm_const_1 (&d); - gcc_assert (ok); - set_unique_reg_note (get_last_insn (), REG_EQUAL, - gen_rtx_fmt_ee (code, qimode, op1, op2)); + return false; } -/* Helper function of ix86_expand_mul_widen_evenodd. Return true - if op is CONST_VECTOR with all odd elements equal to their - preceding element. */ - -static bool -const_vector_equal_evenodd_p (rtx op) -{ - machine_mode mode = GET_MODE (op); - int i, nunits = GET_MODE_NUNITS (mode); - if (GET_CODE (op) != CONST_VECTOR - || nunits != CONST_VECTOR_NUNITS (op)) - return false; - for (i = 0; i < nunits; i += 2) - if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) - return false; - return true; -} +/* Generate an unsigned DImode/SImode to FP conversion. This is the same code + optabs would emit if we didn't have TFmode patterns. 
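x86_maybe_negate_const_int above flips between add and sub immediates mostly for readability, but the -128/128 exception is about encoding size: x86 8-bit immediates are sign-extended, so -128 fits in one byte while +128 needs a 32-bit immediate. A tiny sketch of that size check (the helper name is invented):

#include <stdio.h>

/* An x86 imm8 is sign-extended, covering -128..127.  */
static int
fits_imm8 (long v)
{
  return v >= -128 && v <= 127;
}

int
main (void)
{
  printf ("-4:   imm8 %d; negated 4:   imm8 %d\n", fits_imm8 (-4), fits_imm8 (4));
  printf ("-128: imm8 %d; negated 128: imm8 %d\n", fits_imm8 (-128), fits_imm8 (128));
  return 0;
}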
*/ void -ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, - bool uns_p, bool odd_p) +x86_emit_floatuns (rtx operands[2]) { - machine_mode mode = GET_MODE (op1); - machine_mode wmode = GET_MODE (dest); - rtx x; - rtx orig_op1 = op1, orig_op2 = op2; - - if (!nonimmediate_operand (op1, mode)) - op1 = force_reg (mode, op1); - if (!nonimmediate_operand (op2, mode)) - op2 = force_reg (mode, op2); + rtx_code_label *neglab, *donelab; + rtx i0, i1, f0, in, out; + machine_mode mode, inmode; - /* We only play even/odd games with vectors of SImode. */ - gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode); + inmode = GET_MODE (operands[1]); + gcc_assert (inmode == SImode || inmode == DImode); - /* If we're looking for the odd results, shift those members down to - the even slots. For some cpus this is faster than a PSHUFD. */ - if (odd_p) - { - /* For XOP use vpmacsdqh, but only for smult, as it is only - signed. */ - if (TARGET_XOP && mode == V4SImode && !uns_p) - { - x = force_reg (wmode, CONST0_RTX (wmode)); - emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); - return; - } + out = operands[0]; + in = force_reg (inmode, operands[1]); + mode = GET_MODE (out); + neglab = gen_label_rtx (); + donelab = gen_label_rtx (); + f0 = gen_reg_rtx (mode); - x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); - if (!const_vector_equal_evenodd_p (orig_op1)) - op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), - x, NULL, 1, OPTAB_DIRECT); - if (!const_vector_equal_evenodd_p (orig_op2)) - op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), - x, NULL, 1, OPTAB_DIRECT); - op1 = gen_lowpart (mode, op1); - op2 = gen_lowpart (mode, op2); - } + emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); - if (mode == V16SImode) - { - if (uns_p) - x = gen_vec_widen_umult_even_v16si (dest, op1, op2); - else - x = gen_vec_widen_smult_even_v16si (dest, op1, op2); - } - else if (mode == V8SImode) - { - if (uns_p) - x = gen_vec_widen_umult_even_v8si (dest, op1, op2); - else - x = gen_vec_widen_smult_even_v8si (dest, op1, op2); - } - else if (uns_p) - x = gen_vec_widen_umult_even_v4si (dest, op1, op2); - else if (TARGET_SSE4_1) - x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); - else - { - rtx s1, s2, t0, t1, t2; + expand_float (out, in, 0); - /* The easiest way to implement this without PMULDQ is to go through - the motions as if we are performing a full 64-bit multiply. With - the exception that we need to do less shuffling of the elements. */ + emit_jump_insn (gen_jump (donelab)); + emit_barrier (); - /* Compute the sign-extension, aka highparts, of the two operands. */ - s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), - op1, pc_rtx, pc_rtx); - s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), - op2, pc_rtx, pc_rtx); + emit_label (neglab); - /* Multiply LO(A) * HI(B), and vice-versa. */ - t1 = gen_reg_rtx (wmode); - t2 = gen_reg_rtx (wmode); - emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2)); - emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1)); + i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); - /* Multiply LO(A) * LO(B). */ - t0 = gen_reg_rtx (wmode); - emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2)); + expand_float (f0, i0, 0); - /* Combine and shift the highparts into place. 
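x86_emit_floatuns above uses the classic halve-and-double sequence once the input's sign bit is set. The same trick written as plain C for a 64-bit input, as an illustration only (the function name is invented):

double
u64_to_double (unsigned long long x)
{
  if ((long long) x >= 0)
    return (double) (long long) x;            /* plain signed conversion */

  /* Fold the low bit back in so the final rounding matches the full value,
     then convert the halved value and double the result.  */
  unsigned long long half = (x >> 1) | (x & 1);
  double d = (double) (long long) half;
  return d + d;
}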
*/ - t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT); - t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1, - 1, OPTAB_DIRECT); + emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0))); - /* Combine high and low parts. */ - force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT); - return; - } - emit_insn (x); + emit_label (donelab); } - -void -ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2, - bool uns_p, bool high_p) + +/* Target hook for scalar_mode_supported_p. */ +static bool +ix86_scalar_mode_supported_p (scalar_mode mode) { - machine_mode wmode = GET_MODE (dest); - machine_mode mode = GET_MODE (op1); - rtx t1, t2, t3, t4, mask; - - switch (mode) - { - case E_V4SImode: - t1 = gen_reg_rtx (mode); - t2 = gen_reg_rtx (mode); - if (TARGET_XOP && !uns_p) - { - /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case, - shuffle the elements once so that all elements are in the right - place for immediate use: { A C B D }. */ - emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - } - else - { - /* Put the elements into place for the multiply. */ - ix86_expand_vec_interleave (t1, op1, op1, high_p); - ix86_expand_vec_interleave (t2, op2, op2, high_p); - high_p = false; - } - ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p); - break; - - case E_V8SImode: - /* Shuffle the elements between the lanes. After this we - have { A B E F | C D G H } for each operand. */ - t1 = gen_reg_rtx (V4DImode); - t2 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1), - const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2), - const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - - /* Shuffle the elements within the lanes. After this we - have { A A B B | C C D D } or { E E F F | G G H H }. */ - t3 = gen_reg_rtx (V8SImode); - t4 = gen_reg_rtx (V8SImode); - mask = GEN_INT (high_p - ? 2 + (2 << 2) + (3 << 4) + (3 << 6) - : 0 + (0 << 2) + (1 << 4) + (1 << 6)); - emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask)); - emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask)); - - ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false); - break; - - case E_V8HImode: - case E_V16HImode: - t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX, - uns_p, OPTAB_DIRECT); - t2 = expand_binop (mode, - uns_p ? umul_highpart_optab : smul_highpart_optab, - op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT); - gcc_assert (t1 && t2); - - t3 = gen_reg_rtx (mode); - ix86_expand_vec_interleave (t3, t1, t2, high_p); - emit_move_insn (dest, gen_lowpart (wmode, t3)); - break; - - case E_V16QImode: - case E_V32QImode: - case E_V32HImode: - case E_V16SImode: - case E_V64QImode: - t1 = gen_reg_rtx (wmode); - t2 = gen_reg_rtx (wmode); - ix86_expand_sse_unpack (t1, op1, uns_p, high_p); - ix86_expand_sse_unpack (t2, op2, uns_p, high_p); - - emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2))); - break; - - default: - gcc_unreachable (); - } + if (DECIMAL_FLOAT_MODE_P (mode)) + return default_decimal_float_supported_p (); + else if (mode == TFmode) + return true; + else + return default_scalar_mode_supported_p (mode); } -void -ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) +/* Implements target hook vector_mode_supported_p. 
*/ +static bool +ix86_vector_mode_supported_p (machine_mode mode) { - rtx res_1, res_2, res_3, res_4; - - res_1 = gen_reg_rtx (V4SImode); - res_2 = gen_reg_rtx (V4SImode); - res_3 = gen_reg_rtx (V2DImode); - res_4 = gen_reg_rtx (V2DImode); - ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false); - ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true); - - /* Move the results in element 2 down to element 1; we don't care - what goes in elements 2 and 3. Then we can merge the parts - back together with an interleave. - - Note that two other sequences were tried: - (1) Use interleaves at the start instead of psrldq, which allows - us to use a single shufps to merge things back at the end. - (2) Use shufps here to combine the two vectors, then pshufd to - put the elements in the correct order. - In both cases the cost of the reformatting stall was too high - and the overall sequence slower. */ - - emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3), - const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4), - const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2)); - - set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2)); + if (TARGET_SSE && VALID_SSE_REG_MODE (mode)) + return true; + if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) + return true; + if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) + return true; + if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) + return true; + if (TARGET_MMX && VALID_MMX_REG_MODE (mode)) + return true; + if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)) + return true; + return false; } -void -ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) +/* Target hook for c_mode_for_suffix. */ +static machine_mode +ix86_c_mode_for_suffix (char suffix) { - machine_mode mode = GET_MODE (op0); - rtx t1, t2, t3, t4, t5, t6; + if (suffix == 'q') + return TFmode; + if (suffix == 'w') + return XFmode; - if (TARGET_AVX512DQ && mode == V8DImode) - emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2)); - else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode) - emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2)); - else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode) - emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2)); - else if (TARGET_XOP && mode == V2DImode) - { - /* op1: A,B,C,D, op2: E,F,G,H */ - op1 = gen_lowpart (V4SImode, op1); - op2 = gen_lowpart (V4SImode, op2); + return VOIDmode; +} - t1 = gen_reg_rtx (V4SImode); - t2 = gen_reg_rtx (V4SImode); - t3 = gen_reg_rtx (V2DImode); - t4 = gen_reg_rtx (V2DImode); +/* Worker function for TARGET_MD_ASM_ADJUST. - /* t1: B,A,D,C */ - emit_insn (gen_sse2_pshufd_1 (t1, op1, - GEN_INT (1), - GEN_INT (0), - GEN_INT (3), - GEN_INT (2))); + We implement asm flag outputs, and maintain source compatibility + with the old cc0-based compiler. 
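The 'q' and 'w' cases in ix86_c_mode_for_suffix above are what let x86 floating constants carry the extended types directly; a two-line illustration:

__float128 q_pi = 3.141592653589793238462643383279502884q;  /* TFmode */
__float80  w_pi = 3.141592653589793238462643383279502884w;  /* XFmode */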
*/ - /* t2: (B*E),(A*F),(D*G),(C*H) */ - emit_insn (gen_mulv4si3 (t2, t1, op2)); +static rtx_insn * +ix86_md_asm_adjust (vec &outputs, vec &/*inputs*/, + vec &constraints, + vec &clobbers, HARD_REG_SET &clobbered_regs) +{ + bool saw_asm_flag = false; - /* t3: (B*E)+(A*F), (D*G)+(C*H) */ - emit_insn (gen_xop_phadddq (t3, t2)); + start_sequence (); + for (unsigned i = 0, n = outputs.length (); i < n; ++i) + { + const char *con = constraints[i]; + if (strncmp (con, "=@cc", 4) != 0) + continue; + con += 4; + if (strchr (con, ',') != NULL) + { + error ("alternatives not allowed in % flag output"); + continue; + } - /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ - emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32))); + bool invert = false; + if (con[0] == 'n') + invert = true, con++; - /* Multiply lower parts and add all */ - t5 = gen_reg_rtx (V2DImode); - emit_insn (gen_vec_widen_umult_even_v4si (t5, - gen_lowpart (V4SImode, op1), - gen_lowpart (V4SImode, op2))); - op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT); + machine_mode mode = CCmode; + rtx_code code = UNKNOWN; - } - else - { - machine_mode nmode; - rtx (*umul) (rtx, rtx, rtx); + switch (con[0]) + { + case 'a': + if (con[1] == 0) + mode = CCAmode, code = EQ; + else if (con[1] == 'e' && con[2] == 0) + mode = CCCmode, code = NE; + break; + case 'b': + if (con[1] == 0) + mode = CCCmode, code = EQ; + else if (con[1] == 'e' && con[2] == 0) + mode = CCAmode, code = NE; + break; + case 'c': + if (con[1] == 0) + mode = CCCmode, code = EQ; + break; + case 'e': + if (con[1] == 0) + mode = CCZmode, code = EQ; + break; + case 'g': + if (con[1] == 0) + mode = CCGCmode, code = GT; + else if (con[1] == 'e' && con[2] == 0) + mode = CCGCmode, code = GE; + break; + case 'l': + if (con[1] == 0) + mode = CCGCmode, code = LT; + else if (con[1] == 'e' && con[2] == 0) + mode = CCGCmode, code = LE; + break; + case 'o': + if (con[1] == 0) + mode = CCOmode, code = EQ; + break; + case 'p': + if (con[1] == 0) + mode = CCPmode, code = EQ; + break; + case 's': + if (con[1] == 0) + mode = CCSmode, code = EQ; + break; + case 'z': + if (con[1] == 0) + mode = CCZmode, code = EQ; + break; + } + if (code == UNKNOWN) + { + error ("unknown % flag output %qs", constraints[i]); + continue; + } + if (invert) + code = reverse_condition (code); - if (mode == V2DImode) + rtx dest = outputs[i]; + if (!saw_asm_flag) { - umul = gen_vec_widen_umult_even_v4si; - nmode = V4SImode; + /* This is the first asm flag output. Here we put the flags + register in as the real output and adjust the condition to + allow it. */ + constraints[i] = "=Bf"; + outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG); + saw_asm_flag = true; } - else if (mode == V4DImode) + else { - umul = gen_vec_widen_umult_even_v8si; - nmode = V8SImode; + /* We don't need the flags register as output twice. */ + constraints[i] = "=X"; + outputs[i] = gen_rtx_SCRATCH (SImode); } - else if (mode == V8DImode) + + rtx x = gen_rtx_REG (mode, FLAGS_REG); + x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx); + + machine_mode dest_mode = GET_MODE (dest); + if (!SCALAR_INT_MODE_P (dest_mode)) { - umul = gen_vec_widen_umult_even_v16si; - nmode = V16SImode; + error ("invalid type for % flag output"); + continue; } - else - gcc_unreachable (); + if (dest_mode == DImode && !TARGET_64BIT) + dest_mode = SImode; - /* Multiply low parts. 
*/ - t1 = gen_reg_rtx (mode); - emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2))); + if (dest_mode != QImode) + { + rtx destqi = gen_reg_rtx (QImode); + emit_insn (gen_rtx_SET (destqi, x)); - /* Shift input vectors right 32 bits so we can multiply high parts. */ - t6 = GEN_INT (32); - t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT); - t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT); + if (TARGET_ZERO_EXTEND_WITH_AND + && optimize_function_for_speed_p (cfun)) + { + x = force_reg (dest_mode, const0_rtx); - /* Multiply high parts by low parts. */ - t4 = gen_reg_rtx (mode); - t5 = gen_reg_rtx (mode); - emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2))); - emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1))); + emit_insn (gen_movstrictqi (gen_lowpart (QImode, x), destqi)); + } + else + { + x = gen_rtx_ZERO_EXTEND (dest_mode, destqi); + if (dest_mode == GET_MODE (dest) + && !register_operand (dest, GET_MODE (dest))) + x = force_reg (dest_mode, x); + } + } - /* Combine and shift the highparts back. */ - t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT); - t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT); + if (dest_mode != GET_MODE (dest)) + { + rtx tmp = gen_reg_rtx (SImode); - /* Combine high and low parts. */ - force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT); + emit_insn (gen_rtx_SET (tmp, x)); + emit_insn (gen_zero_extendsidi2 (dest, tmp)); + } + else + emit_insn (gen_rtx_SET (dest, x)); } + rtx_insn *seq = get_insns (); + end_sequence (); - set_unique_reg_note (get_last_insn (), REG_EQUAL, - gen_rtx_MULT (mode, op1, op2)); + if (saw_asm_flag) + return seq; + else + { + /* If we had no asm flag outputs, clobber the flags. */ + clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG)); + SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG); + return NULL; + } } -/* Return 1 if control tansfer instruction INSN - should be encoded with notrack prefix. */ +/* Implements target vector targetm.asm.encode_section_info. */ -static bool -ix86_notrack_prefixed_insn_p (rtx insn) +static void ATTRIBUTE_UNUSED +ix86_encode_section_info (tree decl, rtx rtl, int first) { - if (!insn || !((flag_cf_protection & CF_BRANCH))) - return false; - - if (CALL_P (insn)) - { - rtx call = get_call_rtx_from (insn); - gcc_assert (call != NULL_RTX); - rtx addr = XEXP (call, 0); + default_encode_section_info (decl, rtl, first); - /* Do not emit 'notrack' if it's not an indirect call. */ - if (MEM_P (addr) - && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF) - return false; - else - return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0); - } + if (ix86_in_large_data_p (decl)) + SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR; +} - if (JUMP_P (insn) && !flag_cet_switch) - { - rtx target = JUMP_LABEL (insn); - if (target == NULL_RTX || ANY_RETURN_P (target)) - return false; +/* Worker function for REVERSE_CONDITION. */ - /* Check the jump is a switch table. */ - rtx_insn *label = as_a (target); - rtx_insn *table = next_insn (label); - if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) - return false; - else - return true; - } - return false; +enum rtx_code +ix86_reverse_condition (enum rtx_code code, machine_mode mode) +{ + return (mode == CCFPmode + ? reverse_condition_maybe_unordered (code) + : reverse_condition (code)); } -/* Calculate integer abs() using only SSE2 instructions. */ +/* Output code to perform an x87 FP register move, from OPERANDS[1] + to OPERANDS[0]. 
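ix86_md_asm_adjust above implements the "=@cc<cond>" asm flag outputs. A usage sketch that captures the carry flag after an add without an explicit setcc in the asm template (illustration only):

int
add_carries (unsigned int a, unsigned int b, unsigned int *sum)
{
  int carry;
  __asm__ ("addl %2, %0"
           : "=r" (*sum), "=@ccc" (carry)
           : "r" (b), "0" (a));
  return carry;
}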
*/ -void -ix86_expand_sse2_abs (rtx target, rtx input) +const char * +output_387_reg_move (rtx_insn *insn, rtx *operands) { - machine_mode mode = GET_MODE (target); - rtx tmp0, tmp1, x; - - switch (mode) + if (REG_P (operands[0])) { - case E_V2DImode: - case E_V4DImode: - /* For 64-bit signed integer X, with SSE4.2 use - pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. - Otherwise handle it similarly to V4SImode, except use 64 as W instead of - 32 and use logical instead of arithmetic right shift (which is - unimplemented) and subtract. */ - if (TARGET_SSE4_2) - { - tmp0 = gen_reg_rtx (mode); - tmp1 = gen_reg_rtx (mode); - emit_move_insn (tmp1, CONST0_RTX (mode)); - if (mode == E_V2DImode) - emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); - else - emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); + if (REG_P (operands[1]) + && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + { + if (REGNO (operands[0]) == FIRST_STACK_REG) + return output_387_ffreep (operands, 0); + return "fstp\t%y0"; } + if (STACK_TOP_P (operands[0])) + return "fld%Z1\t%y1"; + return "fst\t%y0"; + } + else if (MEM_P (operands[0])) + { + gcc_assert (REG_P (operands[1])); + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + return "fstp%Z0\t%y0"; else { - tmp0 = expand_simple_binop (mode, LSHIFTRT, input, - GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - - 1), NULL, 0, OPTAB_DIRECT); - tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); + /* There is no non-popping store to memory for XFmode. + So if we need one, follow the store with a load. */ + if (GET_MODE (operands[0]) == XFmode) + return "fstp%Z0\t%y0\n\tfld%Z0\t%y0"; + else + return "fst%Z0\t%y0"; } - - tmp1 = expand_simple_binop (mode, XOR, tmp0, input, - NULL, 0, OPTAB_DIRECT); - x = expand_simple_binop (mode, MINUS, tmp1, tmp0, - target, 0, OPTAB_DIRECT); - break; - - case E_V4SImode: - /* For 32-bit signed integer X, the best way to calculate the absolute - value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ - tmp0 = expand_simple_binop (mode, ASHIFTRT, input, - GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), - NULL, 0, OPTAB_DIRECT); - tmp1 = expand_simple_binop (mode, XOR, tmp0, input, - NULL, 0, OPTAB_DIRECT); - x = expand_simple_binop (mode, MINUS, tmp1, tmp0, - target, 0, OPTAB_DIRECT); - break; - - case E_V8HImode: - /* For 16-bit signed integer X, the best way to calculate the absolute - value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); - - x = expand_simple_binop (mode, SMAX, tmp0, input, - target, 0, OPTAB_DIRECT); - break; - - case E_V16QImode: - /* For 8-bit signed integer X, the best way to calculate the absolute - value of X is min ((unsigned char) X, (unsigned char) (-X)), - as SSE2 provides the PMINUB insn. */ - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); - - x = expand_simple_binop (V16QImode, UMIN, tmp0, input, - target, 0, OPTAB_DIRECT); - break; - - default: - gcc_unreachable (); } - - if (x != target) - emit_move_insn (target, x); + else + gcc_unreachable(); } +#ifdef TARGET_SOLARIS +/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */ -/* Expand an extract from a vector register through pextr insn. - Return true if successful. 
*/ - -bool -ix86_expand_pextr (rtx *operands) +static void +i386_solaris_elf_named_section (const char *name, unsigned int flags, + tree decl) { - rtx dst = operands[0]; - rtx src = operands[1]; - - unsigned int size = INTVAL (operands[2]); - unsigned int pos = INTVAL (operands[3]); - - if (SUBREG_P (dst)) + /* With Binutils 2.15, the "@unwind" marker must be specified on + every occurrence of the ".eh_frame" section, not just the first + one. */ + if (TARGET_64BIT + && strcmp (name, ".eh_frame") == 0) { - /* Reject non-lowpart subregs. */ - if (SUBREG_BYTE (dst) > 0) - return false; - dst = SUBREG_REG (dst); + fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name, + flags & SECTION_WRITE ? "aw" : "a"); + return; } - - if (SUBREG_P (src)) + +#ifndef USE_GAS + if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE) { - pos += SUBREG_BYTE (src) * BITS_PER_UNIT; - src = SUBREG_REG (src); + solaris_elf_asm_comdat_section (name, flags, decl); + return; } - switch (GET_MODE (src)) + /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flags as the + SPARC assembler. One cannot mix single-letter flags and #exclude, so + only emit the latter here. */ + if (flags & SECTION_EXCLUDE) { - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V1TImode: - case E_TImode: - { - machine_mode srcmode, dstmode; - rtx d, pat; + fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name); + return; + } +#endif - if (!int_mode_for_size (size, 0).exists (&dstmode)) - return false; + default_elf_asm_named_section (name, flags, decl); +} +#endif /* TARGET_SOLARIS */ - switch (dstmode) - { - case E_QImode: - if (!TARGET_SSE4_1) - return false; - srcmode = V16QImode; - break; +/* Return the mangling of TYPE if it is an extended fundamental type. */ - case E_HImode: - if (!TARGET_SSE2) - return false; - srcmode = V8HImode; - break; +static const char * +ix86_mangle_type (const_tree type) +{ + type = TYPE_MAIN_VARIANT (type); - case E_SImode: - if (!TARGET_SSE4_1) - return false; - srcmode = V4SImode; - break; + if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE + && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE) + return NULL; - case E_DImode: - gcc_assert (TARGET_64BIT); - if (!TARGET_SSE4_1) - return false; - srcmode = V2DImode; - break; + switch (TYPE_MODE (type)) + { + case E_TFmode: + /* __float128 is "g". */ + return "g"; + case E_XFmode: + /* "long double" or __float80 is "e". */ + return "e"; + default: + return NULL; + } +} - default: - return false; - } +static GTY(()) tree ix86_tls_stack_chk_guard_decl; + +static tree +ix86_stack_protect_guard (void) +{ + if (TARGET_SSP_TLS_GUARD) + { + tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1); + int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg); + tree type = build_qualified_type (type_node, qual); + tree t; - /* Reject extractions from misaligned positions. */ - if (pos & (size-1)) - return false; + if (global_options_set.x_ix86_stack_protector_guard_symbol_str) + { + t = ix86_tls_stack_chk_guard_decl; - if (GET_MODE (dst) == dstmode) - d = dst; - else - d = gen_reg_rtx (dstmode); + if (t == NULL) + { + rtx x; - /* Construct insn pattern. 
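When ix86_stack_protect_guard above keeps the default TLS guard, the canary read by the -fstack-protector prologue lives at a fixed offset in the thread segment. A rough model for x86-64 glibc, where that slot is %fs:0x28 (%gs:0x14 on ia32); the offsets are glibc conventions, not something defined in this file:

static inline unsigned long
read_stack_guard (void)
{
  unsigned long guard;
  __asm__ ("mov %%fs:0x28, %0" : "=r" (guard));
  return guard;
}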
*/ - pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size))); - pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat); + t = build_decl + (UNKNOWN_LOCATION, VAR_DECL, + get_identifier (ix86_stack_protector_guard_symbol_str), + type); + TREE_STATIC (t) = 1; + TREE_PUBLIC (t) = 1; + DECL_EXTERNAL (t) = 1; + TREE_USED (t) = 1; + TREE_THIS_VOLATILE (t) = 1; + DECL_ARTIFICIAL (t) = 1; + DECL_IGNORED_P (t) = 1; - /* Let the rtl optimizers know about the zero extension performed. */ - if (dstmode == QImode || dstmode == HImode) - { - pat = gen_rtx_ZERO_EXTEND (SImode, pat); - d = gen_lowpart (SImode, d); - } + /* Do not share RTL as the declaration is visible outside of + current function. */ + x = DECL_RTL (t); + RTX_FLAG (x, used) = 1; - emit_insn (gen_rtx_SET (d, pat)); + ix86_tls_stack_chk_guard_decl = t; + } + } + else + { + tree asptrtype = build_pointer_type (type); - if (d != dst) - emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); - return true; - } + t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset); + t = build2 (MEM_REF, asptrtype, t, + build_int_cst (asptrtype, 0)); + TREE_THIS_VOLATILE (t) = 1; + } - default: - return false; + return t; } + + return default_stack_protect_guard (); } -/* Expand an insert into a vector register through pinsr insn. - Return true if successful. */ +/* For 32-bit code we can save PIC register setup by using + __stack_chk_fail_local hidden function instead of calling + __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC + register, so it is better to call __stack_chk_fail directly. */ -bool -ix86_expand_pinsr (rtx *operands) +static tree ATTRIBUTE_UNUSED +ix86_stack_protect_fail (void) { - rtx dst = operands[0]; - rtx src = operands[3]; + return TARGET_64BIT + ? default_external_stack_protect_fail () + : default_hidden_stack_protect_fail (); +} - unsigned int size = INTVAL (operands[1]); - unsigned int pos = INTVAL (operands[2]); +/* Select a format to encode pointers in exception handling data. CODE + is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is + true if the symbol may be affected by dynamic relocations. - if (SUBREG_P (dst)) + ??? All x86 object file formats are capable of representing this. + After all, the relocation needed is the same as for the call insn. + Whether or not a particular assembler allows us to enter such, I + guess we'll have to see. */ +int +asm_preferred_eh_data_format (int code, int global) +{ + if (flag_pic) + { + int type = DW_EH_PE_sdata8; + if (!TARGET_64BIT + || ix86_cmodel == CM_SMALL_PIC + || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) + type = DW_EH_PE_sdata4; + return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; + } + if (ix86_cmodel == CM_SMALL + || (ix86_cmodel == CM_MEDIUM && code)) + return DW_EH_PE_udata4; + return DW_EH_PE_absptr; +} + +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype, int) +{ + bool fp = false; + machine_mode mode = TImode; + int index; + if (vectype != NULL) { - pos += SUBREG_BYTE (dst) * BITS_PER_UNIT; - dst = SUBREG_REG (dst); + fp = FLOAT_TYPE_P (vectype); + mode = TYPE_MODE (vectype); } - switch (GET_MODE (dst)) + switch (type_of_cost) { - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V1TImode: - case E_TImode: - { - machine_mode srcmode, dstmode; - rtx (*pinsr)(rtx, rtx, rtx, rtx); - rtx d; + case scalar_stmt: + return fp ? 
ix86_cost->addss : COSTS_N_INSNS (1); - if (!int_mode_for_size (size, 0).exists (&srcmode)) - return false; + case scalar_load: + /* load/store costs are relative to register move which is 2. Recompute + it to COSTS_N_INSNS so everything have same base. */ + return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0] + : ix86_cost->int_load [2]) / 2; - switch (srcmode) - { - case E_QImode: - if (!TARGET_SSE4_1) - return false; - dstmode = V16QImode; - pinsr = gen_sse4_1_pinsrb; - break; + case scalar_store: + return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0] + : ix86_cost->int_store [2]) / 2; - case E_HImode: - if (!TARGET_SSE2) - return false; - dstmode = V8HImode; - pinsr = gen_sse2_pinsrw; - break; + case vector_stmt: + return ix86_vec_cost (mode, + fp ? ix86_cost->addss : ix86_cost->sse_op); - case E_SImode: - if (!TARGET_SSE4_1) - return false; - dstmode = V4SImode; - pinsr = gen_sse4_1_pinsrd; - break; + case vector_load: + index = sse_store_index (mode); + /* See PR82713 - we may end up being called on non-vector type. */ + if (index < 0) + index = 2; + return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2; - case E_DImode: - gcc_assert (TARGET_64BIT); - if (!TARGET_SSE4_1) - return false; - dstmode = V2DImode; - pinsr = gen_sse4_1_pinsrq; - break; + case vector_store: + index = sse_store_index (mode); + /* See PR82713 - we may end up being called on non-vector type. */ + if (index < 0) + index = 2; + return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2; - default: - return false; - } + case vec_to_scalar: + case scalar_to_vec: + return ix86_vec_cost (mode, ix86_cost->sse_op); - /* Reject insertions to misaligned positions. */ - if (pos & (size-1)) - return false; + /* We should have separate costs for unaligned loads and gather/scatter. + Do that incrementally. */ + case unaligned_load: + index = sse_store_index (mode); + /* See PR82713 - we may end up being called on non-vector type. */ + if (index < 0) + index = 2; + return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2; - if (SUBREG_P (src)) - { - unsigned int srcpos = SUBREG_BYTE (src); + case unaligned_store: + index = sse_store_index (mode); + /* See PR82713 - we may end up being called on non-vector type. 
*/ + if (index < 0) + index = 2; + return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2; - if (srcpos > 0) - { - rtx extr_ops[4]; + case vector_gather_load: + return ix86_vec_cost (mode, + COSTS_N_INSNS + (ix86_cost->gather_static + + ix86_cost->gather_per_elt + * TYPE_VECTOR_SUBPARTS (vectype)) / 2); - extr_ops[0] = gen_reg_rtx (srcmode); - extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src)); - extr_ops[2] = GEN_INT (size); - extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT); + case vector_scatter_store: + return ix86_vec_cost (mode, + COSTS_N_INSNS + (ix86_cost->scatter_static + + ix86_cost->scatter_per_elt + * TYPE_VECTOR_SUBPARTS (vectype)) / 2); - if (!ix86_expand_pextr (extr_ops)) - return false; + case cond_branch_taken: + return ix86_cost->cond_taken_branch_cost; - src = extr_ops[0]; - } - else - src = gen_lowpart (srcmode, SUBREG_REG (src)); - } + case cond_branch_not_taken: + return ix86_cost->cond_not_taken_branch_cost; - if (GET_MODE (dst) == dstmode) - d = dst; - else - d = gen_reg_rtx (dstmode); + case vec_perm: + case vec_promote_demote: + return ix86_vec_cost (mode, ix86_cost->sse_op); - emit_insn (pinsr (d, gen_lowpart (dstmode, dst), - gen_lowpart (srcmode, src), - GEN_INT (1 << (pos / size)))); - if (d != dst) - emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); - return true; - } + case vec_construct: + { + /* N element inserts into SSE vectors. */ + int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; + /* One vinserti128 for combining two SSE vectors for AVX256. */ + if (GET_MODE_BITSIZE (mode) == 256) + cost += ix86_vec_cost (mode, ix86_cost->addss); + /* One vinserti64x4 and two vinserti128 for combining SSE + and AVX256 vectors to AVX512. */ + else if (GET_MODE_BITSIZE (mode) == 512) + cost += 3 * ix86_vec_cost (mode, ix86_cost->addss); + return cost; + } - default: - return false; + default: + gcc_unreachable (); } } + /* This function returns the calling abi specific va_list type node. It returns the FNDECL specific va_list type. */ @@ -50192,39 +21332,6 @@ ix86_preferred_simd_mode (scalar_mode mode) } } -/* All CPUs prefer to avoid cross-lane operations so perform reductions - upper against lower halves up to SSE reg size. */ - -static machine_mode -ix86_split_reduction (machine_mode mode) -{ - /* Reduce lowpart against highpart until we reach SSE reg width to - avoid cross-lane operations. */ - switch (mode) - { - case E_V8DImode: - case E_V4DImode: - return V2DImode; - case E_V16SImode: - case E_V8SImode: - return V4SImode; - case E_V32HImode: - case E_V16HImode: - return V8HImode; - case E_V64QImode: - case E_V32QImode: - return V16QImode; - case E_V16SFmode: - case E_V8SFmode: - return V4SFmode; - case E_V8DFmode: - case E_V4DFmode: - return V2DFmode; - default: - return mode; - } -} - /* If AVX is enabled then try vectorizing with both 256bit and 128bit vectors. If AVX512F is enabled then try vectorizing with 512bit, 256bit and 128bit vectors. 
*/ @@ -50596,13 +21703,15 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val) if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong)) { warning (OPT_Winvalid_memory_model, - "HLE_ACQUIRE not used with ACQUIRE or stronger memory model"); + "% not used with % or stronger " + "memory model"); return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE; } if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong)) { warning (OPT_Winvalid_memory_model, - "HLE_RELEASE not used with RELEASE or stronger memory model"); + "% not used with % or stronger " + "memory model"); return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE; } return val; @@ -50760,50 +21869,6 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, return ret; } -/* Add target attribute to SIMD clone NODE if needed. */ - -static void -ix86_simd_clone_adjust (struct cgraph_node *node) -{ - const char *str = NULL; - - /* Attributes need to be adjusted for definitions, not declarations. */ - if (!node->definition) - return; - - gcc_assert (node->decl == cfun->decl); - switch (node->simdclone->vecsize_mangle) - { - case 'b': - if (!TARGET_SSE2) - str = "sse2"; - break; - case 'c': - if (!TARGET_AVX) - str = "avx"; - break; - case 'd': - if (!TARGET_AVX2) - str = "avx2"; - break; - case 'e': - if (!TARGET_AVX512F) - str = "avx512f"; - break; - default: - gcc_unreachable (); - } - if (str == NULL) - return; - push_cfun (NULL); - tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str)); - bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0); - gcc_assert (ok); - pop_cfun (); - ix86_reset_previous_fndecl (); - ix86_set_current_function (node->decl); -} - /* If SIMD clone NODE can't be used in a vectorized loop in current function, return -1, otherwise return a badness of using it (0 if it is most desirable from vecsize_mangle point of view, 1 @@ -50912,10 +21977,10 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) tree fenv_ptr = build_pointer_type (fenv_type); tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var); fenv_addr = fold_convert (ptr_type_node, fenv_addr); - tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV]; - tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV]; - tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW]; - tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX]; + tree fnstenv = get_ix86_builtin (IX86_BUILTIN_FNSTENV); + tree fldenv = get_ix86_builtin (IX86_BUILTIN_FLDENV); + tree fnstsw = get_ix86_builtin (IX86_BUILTIN_FNSTSW); + tree fnclex = get_ix86_builtin (IX86_BUILTIN_FNCLEX); tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr); tree hold_fnclex = build_call_expr (fnclex, 0); fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv, @@ -50939,8 +22004,8 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) { tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node); tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node); - tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR]; - tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR]; + tree stmxcsr = get_ix86_builtin (IX86_BUILTIN_STMXCSR); + tree ldmxcsr = get_ix86_builtin (IX86_BUILTIN_LDMXCSR); tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0); tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node, mxcsr_orig_var, stmxcsr_hold_call); @@ -51183,22 +22248,6 @@ ix86_init_libfuncs (void) #endif } -/* Generate call to __divmoddi4. 
*/ - -static void -ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, - rtx op0, rtx op1, - rtx *quot_p, rtx *rem_p) -{ - rtx rem = assign_386_stack_local (mode, SLOT_TEMP); - - rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL, - mode, op0, mode, op1, mode, - XEXP (rem, 0), Pmode); - *quot_p = quot; - *rem_p = rem; -} - /* Set the value of FLT_EVAL_METHOD in float.h. When using only the FPU, assume that the fpcw is set to extended precision; when using only SSE, rounding is correct; when using both SSE and the FPU, @@ -51970,9 +23019,6 @@ ix86_run_selftests (void) #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true -#undef TARGET_SETUP_INCOMING_VARARG_BOUNDS -#define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds - #undef TARGET_OFFLOAD_OPTIONS #define TARGET_OFFLOAD_OPTIONS \ ix86_offload_options diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 14e5a392f..187e52a5b 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1891,7 +1891,7 @@ typedef struct ix86_args { ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD) /* If a memory-to-memory move would take MOVE_RATIO or more simple - move-instruction pairs, we will do a movmem or libcall instead. + move-instruction pairs, we will do a cpymem or libcall instead. Increasing the value will always make code faster, but eventually incurs high cost in increased code size. @@ -2784,6 +2784,9 @@ struct GTY(()) machine_function { /* During SEH output, this is non-null. */ struct seh_frame_state * GTY((skip(""))) seh; }; + +extern GTY(()) tree sysv_va_list_type_node; +extern GTY(()) tree ms_va_list_type_node; #endif #define ix86_stack_locals (cfun->machine->stack_locals) @@ -2881,6 +2884,12 @@ extern void debug_dispatch_window (int); #define TARGET_SUPPORTS_WIDE_INT 1 +#if !defined(GENERATOR_FILE) && !defined(IN_LIBGCC2) +extern enum attr_cpu ix86_schedule; + +#define NUM_X86_64_MS_CLOBBERED_REGS 12 +#endif + /* Local variables: version-control: t diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 698c31a0a..861248899 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -16731,7 +16731,7 @@ (set_attr "length_immediate" "0") (set_attr "modrm" "0")]) -(define_expand "movmem" +(define_expand "cpymem" [(use (match_operand:BLK 0 "memory_operand")) (use (match_operand:BLK 1 "memory_operand")) (use (match_operand:SWI48 2 "nonmemory_operand")) @@ -16743,7 +16743,7 @@ (use (match_operand:SI 8 ""))] "" { - if (ix86_expand_set_or_movmem (operands[0], operands[1], + if (ix86_expand_set_or_cpymem (operands[0], operands[1], operands[2], NULL, operands[3], operands[4], operands[5], operands[6], operands[7], @@ -16958,7 +16958,7 @@ (use (match_operand:SI 8 ""))] "" { - if (ix86_expand_set_or_movmem (operands[0], NULL, + if (ix86_expand_set_or_cpymem (operands[0], NULL, operands[1], operands[2], operands[3], operands[4], operands[5], operands[6], diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 865947deb..4135159ac 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -683,7 +683,7 @@ if (GET_CODE (op) == PLUS && REG_P (XEXP (op, 0))) { int regno = REGNO (XEXP (op, 0)); - if (!HARD_REGISTER_NUM_P (regno) || call_used_regs[regno]) + if (!HARD_REGISTER_NUM_P (regno) || call_used_or_fixed_reg_p (regno)) { op = XEXP (op, 1); if (GOT32_symbol_operand (op, VOIDmode)) diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386 index 
0dac80fbc..50caf2c69 100644 --- a/gcc/config/i386/t-i386 +++ b/gcc/config/i386/t-i386 @@ -44,6 +44,22 @@ i386-d.o: $(srcdir)/config/i386/i386-d.c $(COMPILE) $< $(POSTCOMPILE) +i386-options.o: $(srcdir)/config/i386/i386-options.c + $(COMPILE) $< + $(POSTCOMPILE) + +i386-builtins.o: $(srcdir)/config/i386/i386-builtins.c + $(COMPILE) $< + $(POSTCOMPILE) + +i386-expand.o: $(srcdir)/config/i386/i386-expand.c + $(COMPILE) $< + $(POSTCOMPILE) + +i386-features.o: $(srcdir)/config/i386/i386-features.c + $(COMPILE) $< + $(POSTCOMPILE) + i386.o: i386-builtin-types.inc i386-builtin-types.inc: s-i386-bt ; @true diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c index e8d905e22..d09e49637 100644 --- a/gcc/config/ia64/ia64.c +++ b/gcc/config/ia64/ia64.c @@ -5147,7 +5147,7 @@ ia64_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, gimple_seq *post_p) { /* Variable sized types are passed by reference. */ - if (pass_by_reference (NULL, TYPE_MODE (type), type, false)) + if (pass_va_arg_by_reference (type)) { tree ptrtype = build_pointer_type (type); tree addr = std_gimplify_va_arg_expr (valist, ptrtype, pre_p, post_p); diff --git a/gcc/config/lm32/lm32.md b/gcc/config/lm32/lm32.md index c09052c62..91a5fe1e0 100644 --- a/gcc/config/lm32/lm32.md +++ b/gcc/config/lm32/lm32.md @@ -216,7 +216,7 @@ } }") -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "general_operand" "") (match_operand:BLK 1 "general_operand" "")) (use (match_operand:SI 2 "" "")) diff --git a/gcc/config/m32c/blkmov.md b/gcc/config/m32c/blkmov.md index d7da439c2..e5cdc801f 100644 --- a/gcc/config/m32c/blkmov.md +++ b/gcc/config/m32c/blkmov.md @@ -40,14 +40,14 @@ ;; 1 = source (mem:BLK ...) ;; 2 = count ;; 3 = alignment -(define_expand "movmemhi" +(define_expand "cpymemhi" [(match_operand 0 "ap_operand" "") (match_operand 1 "ap_operand" "") (match_operand 2 "m32c_r3_operand" "") (match_operand 3 "" "") ] "" - "if (m32c_expand_movmemhi(operands)) DONE; FAIL;" + "if (m32c_expand_cpymemhi(operands)) DONE; FAIL;" ) ;; We can't use mode iterators for these because M16C uses r1h to extend @@ -60,7 +60,7 @@ ;; 3 = dest (in) ;; 4 = src (in) ;; 5 = count (in) -(define_insn "movmemhi_bhi_op" +(define_insn "cpymemhi_bhi_op" [(set (mem:QI (match_operand:HI 3 "ap_operand" "0")) (mem:QI (match_operand:HI 4 "ap_operand" "1"))) (set (match_operand:HI 2 "m32c_r3_operand" "=R3w") @@ -75,7 +75,7 @@ "TARGET_A16" "mov.b:q\t#0,r1h\n\tsmovf.b\t; %0[0..%2-1]=r1h%1[]" ) -(define_insn "movmemhi_bpsi_op" +(define_insn "cpymemhi_bpsi_op" [(set (mem:QI (match_operand:PSI 3 "ap_operand" "0")) (mem:QI (match_operand:PSI 4 "ap_operand" "1"))) (set (match_operand:HI 2 "m32c_r3_operand" "=R3w") @@ -89,7 +89,7 @@ "TARGET_A24" "smovf.b\t; %0[0..%2-1]=%1[]" ) -(define_insn "movmemhi_whi_op" +(define_insn "cpymemhi_whi_op" [(set (mem:HI (match_operand:HI 3 "ap_operand" "0")) (mem:HI (match_operand:HI 4 "ap_operand" "1"))) (set (match_operand:HI 2 "m32c_r3_operand" "=R3w") @@ -104,7 +104,7 @@ "TARGET_A16" "mov.b:q\t#0,r1h\n\tsmovf.w\t; %0[0..%2-1]=r1h%1[]" ) -(define_insn "movmemhi_wpsi_op" +(define_insn "cpymemhi_wpsi_op" [(set (mem:HI (match_operand:PSI 3 "ap_operand" "0")) (mem:HI (match_operand:PSI 4 "ap_operand" "1"))) (set (match_operand:HI 2 "m32c_r3_operand" "=R3w") diff --git a/gcc/config/m32c/m32c-protos.h b/gcc/config/m32c/m32c-protos.h index 7d4d478fd..fe926fd50 100644 --- a/gcc/config/m32c/m32c-protos.h +++ b/gcc/config/m32c/m32c-protos.h @@ -43,7 +43,7 @@ void m32c_emit_eh_epilogue (rtx); int 
m32c_expand_cmpstr (rtx *); int m32c_expand_insv (rtx *); int m32c_expand_movcc (rtx *); -int m32c_expand_movmemhi (rtx *); +int m32c_expand_cpymemhi (rtx *); int m32c_expand_movstr (rtx *); void m32c_expand_neg_mulpsi3 (rtx *); int m32c_expand_setmemhi (rtx *); diff --git a/gcc/config/m32c/m32c.c b/gcc/config/m32c/m32c.c index 1a0d0c681..d0d24bb5f 100644 --- a/gcc/config/m32c/m32c.c +++ b/gcc/config/m32c/m32c.c @@ -3592,7 +3592,7 @@ m32c_expand_setmemhi(rtx *operands) addresses, not [mem] syntax. $0 is the destination (MEM:BLK), $1 is the source (MEM:BLK), and $2 the count (HI). */ int -m32c_expand_movmemhi(rtx *operands) +m32c_expand_cpymemhi(rtx *operands) { rtx desta, srca, count; rtx desto, srco, counto; @@ -3620,9 +3620,9 @@ m32c_expand_movmemhi(rtx *operands) { count = copy_to_mode_reg (HImode, GEN_INT (INTVAL (count) / 2)); if (TARGET_A16) - emit_insn (gen_movmemhi_whi_op (desto, srco, counto, desta, srca, count)); + emit_insn (gen_cpymemhi_whi_op (desto, srco, counto, desta, srca, count)); else - emit_insn (gen_movmemhi_wpsi_op (desto, srco, counto, desta, srca, count)); + emit_insn (gen_cpymemhi_wpsi_op (desto, srco, counto, desta, srca, count)); return 1; } @@ -3632,9 +3632,9 @@ m32c_expand_movmemhi(rtx *operands) count = copy_to_mode_reg (HImode, count); if (TARGET_A16) - emit_insn (gen_movmemhi_bhi_op (desto, srco, counto, desta, srca, count)); + emit_insn (gen_cpymemhi_bhi_op (desto, srco, counto, desta, srca, count)); else - emit_insn (gen_movmemhi_bpsi_op (desto, srco, counto, desta, srca, count)); + emit_insn (gen_cpymemhi_bpsi_op (desto, srco, counto, desta, srca, count)); return 1; } diff --git a/gcc/config/m32r/m32r.c b/gcc/config/m32r/m32r.c index 6e79b2aec..ac18aa286 100644 --- a/gcc/config/m32r/m32r.c +++ b/gcc/config/m32r/m32r.c @@ -2598,7 +2598,7 @@ m32r_expand_block_move (rtx operands[]) to the word after the end of the source block, and dst_reg to point to the last word of the destination block, provided that the block is MAX_MOVE_BYTES long. 
*/ - emit_insn (gen_movmemsi_internal (dst_reg, src_reg, at_a_time, + emit_insn (gen_cpymemsi_internal (dst_reg, src_reg, at_a_time, new_dst_reg, new_src_reg)); emit_move_insn (dst_reg, new_dst_reg); emit_move_insn (src_reg, new_src_reg); @@ -2612,7 +2612,7 @@ m32r_expand_block_move (rtx operands[]) } if (leftover) - emit_insn (gen_movmemsi_internal (dst_reg, src_reg, GEN_INT (leftover), + emit_insn (gen_cpymemsi_internal (dst_reg, src_reg, GEN_INT (leftover), gen_reg_rtx (SImode), gen_reg_rtx (SImode))); return 1; diff --git a/gcc/config/m32r/m32r.md b/gcc/config/m32r/m32r.md index be5739763..e944363fd 100644 --- a/gcc/config/m32r/m32r.md +++ b/gcc/config/m32r/m32r.md @@ -2195,7 +2195,7 @@ ;; Argument 2 is the length ;; Argument 3 is the alignment -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "general_operand" "") (match_operand:BLK 1 "general_operand" "")) (use (match_operand:SI 2 "immediate_operand" "")) @@ -2214,7 +2214,7 @@ ;; Insn generated by block moves -(define_insn "movmemsi_internal" +(define_insn "cpymemsi_internal" [(set (mem:BLK (match_operand:SI 0 "register_operand" "r")) ;; destination (mem:BLK (match_operand:SI 1 "register_operand" "r"))) ;; source (use (match_operand:SI 2 "m32r_block_immediate_operand" "J"));; # bytes to move diff --git a/gcc/config/mcore/mcore.md b/gcc/config/mcore/mcore.md index cc84e342b..c6893518d 100644 --- a/gcc/config/mcore/mcore.md +++ b/gcc/config/mcore/mcore.md @@ -2552,7 +2552,7 @@ ;; Block move - adapted from m88k.md ;; ------------------------------------------------------------------------ -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (mem:BLK (match_operand:BLK 0 "" "")) (mem:BLK (match_operand:BLK 1 "" ""))) (use (match_operand:SI 2 "general_operand" "")) diff --git a/gcc/config/microblaze/microblaze.c b/gcc/config/microblaze/microblaze.c index 55c1becf9..07dd0bc6f 100644 --- a/gcc/config/microblaze/microblaze.c +++ b/gcc/config/microblaze/microblaze.c @@ -1250,7 +1250,7 @@ microblaze_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length) microblaze_block_move_straight (dest, src, leftover); } -/* Expand a movmemsi instruction. */ +/* Expand a cpymemsi instruction. 
*/ bool microblaze_expand_block_move (rtx dest, rtx src, rtx length, rtx align_rtx) diff --git a/gcc/config/microblaze/microblaze.md b/gcc/config/microblaze/microblaze.md index 183afff37..1509e4318 100644 --- a/gcc/config/microblaze/microblaze.md +++ b/gcc/config/microblaze/microblaze.md @@ -1144,7 +1144,7 @@ ;; Argument 2 is the length ;; Argument 3 is the alignment -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "general_operand") (match_operand:BLK 1 "general_operand")) (use (match_operand:SI 2 "")) diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c index 100894720..3c95636bf 100644 --- a/gcc/config/mips/mips.c +++ b/gcc/config/mips/mips.c @@ -6780,7 +6780,7 @@ mips_std_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, unsigned HOST_WIDE_INT align, boundary; bool indirect; - indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); + indirect = pass_va_arg_by_reference (type); if (indirect) type = build_pointer_type (type); @@ -6867,7 +6867,7 @@ mips_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, tree addr; bool indirect_p; - indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, 0); + indirect_p = pass_va_arg_by_reference (type); if (indirect_p) type = build_pointer_type (type); @@ -7938,15 +7938,15 @@ mips_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, { if (op == STORE_BY_PIECES) return mips_store_by_pieces_p (size, align); - if (op == MOVE_BY_PIECES && HAVE_movmemsi) + if (op == MOVE_BY_PIECES && HAVE_cpymemsi) { - /* movmemsi is meant to generate code that is at least as good as - move_by_pieces. However, movmemsi effectively uses a by-pieces + /* cpymemsi is meant to generate code that is at least as good as + move_by_pieces. However, cpymemsi effectively uses a by-pieces implementation both for moves smaller than a word and for word-aligned moves of no more than MIPS_MAX_MOVE_BYTES_STRAIGHT bytes. We should allow the tree-level optimisers to do such moves by pieces, as it often exposes other optimization - opportunities. We might as well continue to use movmemsi at + opportunities. We might as well continue to use cpymemsi at the rtl level though, as it produces better code when scheduling is disabled (such as at -O). */ if (currently_expanding_to_rtl) @@ -8165,7 +8165,7 @@ mips_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, emit_insn (gen_nop ()); } -/* Expand a movmemsi instruction, which copies LENGTH bytes from +/* Expand a cpymemsi instruction, which copies LENGTH bytes from memory reference SRC to memory reference DEST. */ bool diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index 953d82e85..a5be7fa39 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -3099,12 +3099,12 @@ while (0) #define MIPS_MIN_MOVE_MEM_ALIGN 16 /* The maximum number of bytes that can be copied by one iteration of - a movmemsi loop; see mips_block_move_loop. */ + a cpymemsi loop; see mips_block_move_loop. */ #define MIPS_MAX_MOVE_BYTES_PER_LOOP_ITER \ (UNITS_PER_WORD * 4) /* The maximum number of bytes that can be copied by a straight-line - implementation of movmemsi; see mips_block_move_straight. We want + implementation of cpymemsi; see mips_block_move_straight. We want to make sure that any loop-based implementation will iterate at least twice. 
*/ #define MIPS_MAX_MOVE_BYTES_STRAIGHT \ @@ -3119,11 +3119,11 @@ while (0) #define MIPS_CALL_RATIO 8 -/* Any loop-based implementation of movmemsi will have at least +/* Any loop-based implementation of cpymemsi will have at least MIPS_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory moves, so allow individual copies of fewer elements. - When movmemsi is not available, use a value approximating + When cpymemsi is not available, use a value approximating the length of a memcpy call sequence, so that move_by_pieces will generate inline code if it is shorter than a function call. Since move_by_pieces_ninsns counts memory-to-memory moves, but @@ -3131,7 +3131,7 @@ while (0) value of MIPS_CALL_RATIO to take that into account. */ #define MOVE_RATIO(speed) \ - (HAVE_movmemsi \ + (HAVE_cpymemsi \ ? MIPS_MAX_MOVE_BYTES_STRAIGHT / MOVE_MAX \ : MIPS_CALL_RATIO / 2) diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md index 3cfb1a751..a9abb6fdd 100644 --- a/gcc/config/mips/mips.md +++ b/gcc/config/mips/mips.md @@ -5638,7 +5638,7 @@ ;; Argument 2 is the length ;; Argument 3 is the alignment -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "general_operand") (match_operand:BLK 1 "general_operand")) (use (match_operand:SI 2 "")) diff --git a/gcc/config/msp430/msp430.c b/gcc/config/msp430/msp430.c index 020e980b8..3ce649648 100644 --- a/gcc/config/msp430/msp430.c +++ b/gcc/config/msp430/msp430.c @@ -1457,7 +1457,7 @@ msp430_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, unsigned HOST_WIDE_INT align, boundary; bool indirect; - indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); + indirect = pass_va_arg_by_reference (type); if (indirect) type = build_pointer_type (type); diff --git a/gcc/config/nds32/nds32-memory-manipulation.c b/gcc/config/nds32/nds32-memory-manipulation.c index 71b75dca5..b3f2cd698 100644 --- a/gcc/config/nds32/nds32-memory-manipulation.c +++ b/gcc/config/nds32/nds32-memory-manipulation.c @@ -1,4 +1,4 @@ -/* Auxiliary functions for expand movmem, setmem, cmpmem, load_multiple +/* Auxiliary functions for expand cpymem, setmem, cmpmem, load_multiple and store_multiple pattern of Andes NDS32 cpu for GNU compiler Copyright (C) 2012-2019 Free Software Foundation, Inc. Contributed by Andes Technology Corporation. @@ -120,14 +120,14 @@ nds32_emit_mem_move_block (int base_regno, int count, /* ------------------------------------------------------------------------ */ -/* Auxiliary function for expand movmem pattern. */ +/* Auxiliary function for expand cpymem pattern. */ static bool -nds32_expand_movmemsi_loop_unknown_size (rtx dstmem, rtx srcmem, +nds32_expand_cpymemsi_loop_unknown_size (rtx dstmem, rtx srcmem, rtx size, rtx alignment) { - /* Emit loop version of movmem. + /* Emit loop version of cpymem. andi $size_least_3_bit, $size, #~7 add $dst_end, $dst, $size @@ -254,7 +254,7 @@ nds32_expand_movmemsi_loop_unknown_size (rtx dstmem, rtx srcmem, } static bool -nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, +nds32_expand_cpymemsi_loop_known_size (rtx dstmem, rtx srcmem, rtx size, rtx alignment) { rtx dst_base_reg, src_base_reg; @@ -288,7 +288,7 @@ nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, if (total_bytes < 8) { - /* Emit total_bytes less than 8 loop version of movmem. + /* Emit total_bytes less than 8 loop version of cpymem. 
add $dst_end, $dst, $size move $dst_itr, $dst .Lbyte_mode_loop: @@ -321,7 +321,7 @@ nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, } else if (total_bytes % 8 == 0) { - /* Emit multiple of 8 loop version of movmem. + /* Emit multiple of 8 loop version of cpymem. add $dst_end, $dst, $size move $dst_itr, $dst @@ -370,7 +370,7 @@ nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, else { /* Handle size greater than 8, and not a multiple of 8. */ - return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem, + return nds32_expand_cpymemsi_loop_unknown_size (dstmem, srcmem, size, alignment); } @@ -378,19 +378,19 @@ nds32_expand_movmemsi_loop_known_size (rtx dstmem, rtx srcmem, } static bool -nds32_expand_movmemsi_loop (rtx dstmem, rtx srcmem, +nds32_expand_cpymemsi_loop (rtx dstmem, rtx srcmem, rtx size, rtx alignment) { if (CONST_INT_P (size)) - return nds32_expand_movmemsi_loop_known_size (dstmem, srcmem, + return nds32_expand_cpymemsi_loop_known_size (dstmem, srcmem, size, alignment); else - return nds32_expand_movmemsi_loop_unknown_size (dstmem, srcmem, + return nds32_expand_cpymemsi_loop_unknown_size (dstmem, srcmem, size, alignment); } static bool -nds32_expand_movmemsi_unroll (rtx dstmem, rtx srcmem, +nds32_expand_cpymemsi_unroll (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment) { rtx dst_base_reg, src_base_reg; @@ -533,13 +533,13 @@ nds32_expand_movmemsi_unroll (rtx dstmem, rtx srcmem, This is auxiliary extern function to help create rtx template. Check nds32-multiple.md file for the patterns. */ bool -nds32_expand_movmemsi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment) +nds32_expand_cpymemsi (rtx dstmem, rtx srcmem, rtx total_bytes, rtx alignment) { - if (nds32_expand_movmemsi_unroll (dstmem, srcmem, total_bytes, alignment)) + if (nds32_expand_cpymemsi_unroll (dstmem, srcmem, total_bytes, alignment)) return true; if (!optimize_size && optimize > 2) - return nds32_expand_movmemsi_loop (dstmem, srcmem, total_bytes, alignment); + return nds32_expand_cpymemsi_loop (dstmem, srcmem, total_bytes, alignment); return false; } diff --git a/gcc/config/nds32/nds32-multiple.md b/gcc/config/nds32/nds32-multiple.md index a1e10c055..98d9508c0 100644 --- a/gcc/config/nds32/nds32-multiple.md +++ b/gcc/config/nds32/nds32-multiple.md @@ -3751,14 +3751,14 @@ ;; operands[3] is the known shared alignment. 
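For reference, the nds32 helpers above pick a copy strategy from the byte count: try a fully unrolled copy first, otherwise (only when not optimizing for size and at -O3 or higher) fall back to a loop, where a known count below 8 uses a byte-mode loop, a known multiple of 8 uses a double-word loop, and anything else uses the unknown-size loop. The stand-alone sketch below only models that decision; the enum and function names are illustrative and no GCC types are involved.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative model of the strategy choice made by nds32_expand_cpymemsi
   and nds32_expand_cpymemsi_loop_known_size above.  */
enum copy_plan { PLAN_NONE, PLAN_UNROLLED, PLAN_BYTE_LOOP,
                 PLAN_DWORD_LOOP, PLAN_GENERIC_LOOP };

static enum copy_plan
choose_copy_plan (bool unroll_ok, bool optimize_size, int optimize_level,
                  bool size_known, unsigned long total_bytes)
{
  if (unroll_ok)
    return PLAN_UNROLLED;              /* nds32_expand_cpymemsi_unroll */
  if (optimize_size || optimize_level <= 2)
    return PLAN_NONE;                  /* leave it to the generic code */
  if (!size_known)
    return PLAN_GENERIC_LOOP;          /* ..._loop_unknown_size */
  if (total_bytes < 8)
    return PLAN_BYTE_LOOP;             /* byte-mode loop */
  if (total_bytes % 8 == 0)
    return PLAN_DWORD_LOOP;            /* multiple-of-8 loop */
  return PLAN_GENERIC_LOOP;            /* > 8 and not a multiple of 8 */
}

int
main (void)
{
  printf ("%d %d %d\n",
          choose_copy_plan (false, false, 3, true, 5),
          choose_copy_plan (false, false, 3, true, 32),
          choose_copy_plan (false, false, 3, true, 13));
  return 0;
}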
-(define_expand "movmemsi" +(define_expand "cpymemsi" [(match_operand:BLK 0 "general_operand" "") (match_operand:BLK 1 "general_operand" "") (match_operand:SI 2 "nds32_reg_constant_operand" "") (match_operand:SI 3 "const_int_operand" "")] "" { - if (nds32_expand_movmemsi (operands[0], + if (nds32_expand_cpymemsi (operands[0], operands[1], operands[2], operands[3])) diff --git a/gcc/config/nds32/nds32-protos.h b/gcc/config/nds32/nds32-protos.h index aaa65d6f0..7ae1954d0 100644 --- a/gcc/config/nds32/nds32-protos.h +++ b/gcc/config/nds32/nds32-protos.h @@ -78,7 +78,7 @@ extern rtx nds32_di_low_part_subreg(rtx); extern rtx nds32_expand_load_multiple (int, int, rtx, rtx, bool, rtx *); extern rtx nds32_expand_store_multiple (int, int, rtx, rtx, bool, rtx *); -extern bool nds32_expand_movmemsi (rtx, rtx, rtx, rtx); +extern bool nds32_expand_cpymemsi (rtx, rtx, rtx, rtx); extern bool nds32_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx); extern bool nds32_expand_strlen (rtx, rtx, rtx, rtx); diff --git a/gcc/config/pa/pa.c b/gcc/config/pa/pa.c index 84a8cae22..73109c6f9 100644 --- a/gcc/config/pa/pa.c +++ b/gcc/config/pa/pa.c @@ -107,7 +107,7 @@ static int pa_can_combine_p (rtx_insn *, rtx_insn *, rtx_insn *, int, rtx, static bool forward_branch_p (rtx_insn *); static void compute_zdepwi_operands (unsigned HOST_WIDE_INT, unsigned *); static void compute_zdepdi_operands (unsigned HOST_WIDE_INT, unsigned *); -static int compute_movmem_length (rtx_insn *); +static int compute_cpymem_length (rtx_insn *); static int compute_clrmem_length (rtx_insn *); static bool pa_assemble_integer (rtx, unsigned int, int); static void remove_useless_addtr_insns (int); @@ -2986,7 +2986,7 @@ pa_output_block_move (rtx *operands, int size_is_constant ATTRIBUTE_UNUSED) count insns rather than emit them. */ static int -compute_movmem_length (rtx_insn *insn) +compute_cpymem_length (rtx_insn *insn) { rtx pat = PATTERN (insn); unsigned int align = INTVAL (XEXP (XVECEXP (pat, 0, 7), 0)); @@ -5061,7 +5061,7 @@ pa_adjust_insn_length (rtx_insn *insn, int length) && GET_CODE (XEXP (XVECEXP (pat, 0, 0), 1)) == MEM && GET_MODE (XEXP (XVECEXP (pat, 0, 0), 0)) == BLKmode && GET_MODE (XEXP (XVECEXP (pat, 0, 0), 1)) == BLKmode) - length += compute_movmem_length (insn) - 4; + length += compute_cpymem_length (insn) - 4; /* Block clear pattern. */ else if (NONJUMP_INSN_P (insn) && GET_CODE (pat) == PARALLEL @@ -6378,7 +6378,7 @@ hppa_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, unsigned int size, ofs; bool indirect; - indirect = pass_by_reference (NULL, TYPE_MODE (type), type, 0); + indirect = pass_va_arg_by_reference (type); if (indirect) { type = ptr; diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md index 18f8e127d..a37989032 100644 --- a/gcc/config/pa/pa.md +++ b/gcc/config/pa/pa.md @@ -3162,9 +3162,9 @@ ;; The definition of this insn does not really explain what it does, ;; but it should suffice that anything generated as this insn will be -;; recognized as a movmemsi operation, and that it will not successfully +;; recognized as a cpymemsi operation, and that it will not successfully ;; combine with anything. -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "" "") (match_operand:BLK 1 "" "")) (clobber (match_dup 4)) @@ -3244,7 +3244,7 @@ ;; operands 0 and 1 are both equivalent to symbolic MEMs. Thus, we are ;; forced to internally copy operands 0 and 1 to operands 7 and 8, ;; respectively. We then split or peephole optimize after reload. 
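The va_arg gimplifiers touched in these hunks (pa above, and the mips, msp430, s390, sparc, spu, tilegx, tilepro, visium and xtensa changes nearby) all replace the spelled-out four-argument pass_by_reference call with the new pass_va_arg_by_reference (type) helper. Judging only from the call sites shown here, the helper packages the repeated boilerplate (NULL cumulative-args pointer, the type's mode, named == false). The toy program below mimics that shape with stand-in types; fake_type, fake_pass_by_reference and fake_pass_va_arg_by_reference are invented names, not the compiler's own code.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for compiler-internal types; only the shape of the
   refactoring is modelled.  */
struct fake_type { int mode; bool dynamically_sized; };

static bool
fake_pass_by_reference (void *cum, int mode, const struct fake_type *type,
                        bool named)
{
  (void) cum; (void) mode; (void) named;
  /* E.g. dynamically sized objects are passed as pointers, per the
     spu/tilegx/tilepro comments in these hunks.  */
  return type->dynamically_sized;
}

/* What the new helper hides, as inferred from the replaced call sites.  */
static bool
fake_pass_va_arg_by_reference (const struct fake_type *type)
{
  return fake_pass_by_reference (NULL, type->mode, type, false);
}

int
main (void)
{
  struct fake_type vla = { 0, true }, word = { 0, false };
  printf ("%d %d\n",
          fake_pass_va_arg_by_reference (&vla),
          fake_pass_va_arg_by_reference (&word));
  return 0;
}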
-(define_insn "movmemsi_prereload" +(define_insn "cpymemsi_prereload" [(set (mem:BLK (match_operand:SI 0 "register_operand" "r,r")) (mem:BLK (match_operand:SI 1 "register_operand" "r,r"))) (clobber (match_operand:SI 2 "register_operand" "=&r,&r")) ;loop cnt/tmp @@ -3337,7 +3337,7 @@ } }") -(define_insn "movmemsi_postreload" +(define_insn "cpymemsi_postreload" [(set (mem:BLK (match_operand:SI 0 "register_operand" "+r,r")) (mem:BLK (match_operand:SI 1 "register_operand" "+r,r"))) (clobber (match_operand:SI 2 "register_operand" "=&r,&r")) ;loop cnt/tmp @@ -3352,7 +3352,7 @@ "* return pa_output_block_move (operands, !which_alternative);" [(set_attr "type" "multi,multi")]) -(define_expand "movmemdi" +(define_expand "cpymemdi" [(parallel [(set (match_operand:BLK 0 "" "") (match_operand:BLK 1 "" "")) (clobber (match_dup 4)) @@ -3432,7 +3432,7 @@ ;; operands 0 and 1 are both equivalent to symbolic MEMs. Thus, we are ;; forced to internally copy operands 0 and 1 to operands 7 and 8, ;; respectively. We then split or peephole optimize after reload. -(define_insn "movmemdi_prereload" +(define_insn "cpymemdi_prereload" [(set (mem:BLK (match_operand:DI 0 "register_operand" "r,r")) (mem:BLK (match_operand:DI 1 "register_operand" "r,r"))) (clobber (match_operand:DI 2 "register_operand" "=&r,&r")) ;loop cnt/tmp @@ -3525,7 +3525,7 @@ } }") -(define_insn "movmemdi_postreload" +(define_insn "cpymemdi_postreload" [(set (mem:BLK (match_operand:DI 0 "register_operand" "+r,r")) (mem:BLK (match_operand:DI 1 "register_operand" "+r,r"))) (clobber (match_operand:DI 2 "register_operand" "=&r,&r")) ;loop cnt/tmp diff --git a/gcc/config/pdp11/pdp11.md b/gcc/config/pdp11/pdp11.md index ce781db06..be5ddc4c3 100644 --- a/gcc/config/pdp11/pdp11.md +++ b/gcc/config/pdp11/pdp11.md @@ -26,7 +26,7 @@ UNSPECV_BLOCKAGE UNSPECV_SETD UNSPECV_SETI - UNSPECV_MOVMEM + UNSPECV_CPYMEM ]) (define_constants @@ -664,8 +664,8 @@ [(set_attr "length" "2,2,4,4,2")]) ;; Expand a block move. We turn this into a move loop. -(define_expand "movmemhi" - [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_MOVMEM) +(define_expand "cpymemhi" + [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_CPYMEM) (match_operand:BLK 0 "general_operand" "=g") (match_operand:BLK 1 "general_operand" "g") (match_operand:HI 2 "immediate_operand" "i") @@ -694,8 +694,8 @@ }") ;; Expand a block move. We turn this into a move loop. 
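The pdp11 cpymemhi expander above turns a block copy into a word-at-a-time move loop. As a purely illustrative, stand-alone C model of that kind of loop (not the RTL the backend emits, and copy_by_words is an invented name):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy 16-bit words while at least two bytes remain, then finish
   with a trailing byte if the count was odd.  */
static void
copy_by_words (void *dst, const void *src, size_t nbytes)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  for (; nbytes >= 2; nbytes -= 2, d += 2, s += 2)
    {
      uint16_t w;
      memcpy (&w, s, 2);   /* word load */
      memcpy (d, &w, 2);   /* word store */
    }
  if (nbytes)
    *d = *s;               /* odd trailing byte */
}

int
main (void)
{
  char src[] = "block move demo";
  char dst[sizeof src];
  copy_by_words (dst, src, sizeof src);
  puts (dst);
  return 0;
}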
-(define_insn_and_split "movmemhi1" - [(unspec_volatile [(const_int 0)] UNSPECV_MOVMEM) +(define_insn_and_split "cpymemhi1" + [(unspec_volatile [(const_int 0)] UNSPECV_CPYMEM) (match_operand:HI 0 "register_operand" "+r") (match_operand:HI 1 "register_operand" "+r") (match_operand:HI 2 "register_operand" "+r") @@ -707,7 +707,7 @@ "" "#" "reload_completed" - [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_MOVMEM) + [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_CPYMEM) (match_dup 0) (match_dup 1) (match_dup 2) @@ -719,8 +719,8 @@ (clobber (reg:CC CC_REGNUM))])] "") -(define_insn "movmemhi_nocc" - [(unspec_volatile [(const_int 0)] UNSPECV_MOVMEM) +(define_insn "cpymemhi_nocc" + [(unspec_volatile [(const_int 0)] UNSPECV_CPYMEM) (match_operand:HI 0 "register_operand" "+r") (match_operand:HI 1 "register_operand" "+r") (match_operand:HI 2 "register_operand" "+r") diff --git a/gcc/config/riscv/riscv.c b/gcc/config/riscv/riscv.c index b3297a381..49383d857 100644 --- a/gcc/config/riscv/riscv.c +++ b/gcc/config/riscv/riscv.c @@ -3024,7 +3024,7 @@ riscv_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, emit_insn(gen_nop ()); } -/* Expand a movmemsi instruction, which copies LENGTH bytes from +/* Expand a cpymemsi instruction, which copies LENGTH bytes from memory reference SRC to memory reference DEST. */ bool diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 5130dc826..7e3612641 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -829,20 +829,20 @@ while (0) #undef PTRDIFF_TYPE #define PTRDIFF_TYPE (POINTER_SIZE == 64 ? "long int" : "int") -/* The maximum number of bytes copied by one iteration of a movmemsi loop. */ +/* The maximum number of bytes copied by one iteration of a cpymemsi loop. */ #define RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4) /* The maximum number of bytes that can be copied by a straight-line - movmemsi implementation. */ + cpymemsi implementation. */ #define RISCV_MAX_MOVE_BYTES_STRAIGHT (RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER * 3) /* If a memory-to-memory move would take MOVE_RATIO or more simple - move-instruction pairs, we will do a movmem or libcall instead. + move-instruction pairs, we will do a cpymem or libcall instead. Do not use move_by_pieces at all when strict alignment is not in effect but the target has slow unaligned accesses; in this - case, movmem or libcall is more efficient. */ + case, cpymem or libcall is more efficient. */ #define MOVE_RATIO(speed) \ (!STRICT_ALIGNMENT && riscv_slow_unaligned_access_p ? 1 : \ diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index e40535c9e..cfb5fdd6a 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -1503,7 +1503,7 @@ DONE; }) -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "general_operand") (match_operand:BLK 1 "general_operand")) (use (match_operand:SI 2 "")) diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 8f046de42..ee07aa9df 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -33472,7 +33472,7 @@ get_prev_label (tree function_name) return NULL_TREE; } -/* Generate PIC and indirect symbol stubs. */ +/* Generate external symbol indirection stubs (PIC and non-PIC). 
*/ void machopic_output_stub (FILE *file, const char *symb, const char *stub) @@ -38392,7 +38392,8 @@ rs6000_call_darwin_1 (rtx value, rtx func_desc, rtx tlsarg, if ((cookie_val & CALL_LONG) != 0 && GET_CODE (func_desc) == SYMBOL_REF) { - /* FIXME: the longcall opt should not hang off picsymbol stubs. */ + /* FIXME: the longcall opt should not hang off this flag, it is most + likely incorrect for kernel-mode code-generation. */ if (darwin_symbol_stubs && TARGET_32BIT) make_island = true; /* Do nothing yet, retain the CALL_LONG flag. */ else diff --git a/gcc/config/rx/rx.md b/gcc/config/rx/rx.md index 2790882c9..9df73e6ef 100644 --- a/gcc/config/rx/rx.md +++ b/gcc/config/rx/rx.md @@ -46,7 +46,7 @@ (UNSPEC_CONST 13) (UNSPEC_MOVSTR 20) - (UNSPEC_MOVMEM 21) + (UNSPEC_CPYMEM 21) (UNSPEC_SETMEM 22) (UNSPEC_STRLEN 23) (UNSPEC_CMPSTRN 24) @@ -2449,13 +2449,13 @@ (set_attr "timings" "1111")] ;; The timing is a guesstimate. ) -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "memory_operand") ;; Dest (match_operand:BLK 1 "memory_operand")) ;; Source (use (match_operand:SI 2 "register_operand")) ;; Length in bytes (match_operand 3 "immediate_operand") ;; Align - (unspec_volatile:BLK [(reg:SI 1) (reg:SI 2) (reg:SI 3)] UNSPEC_MOVMEM)] + (unspec_volatile:BLK [(reg:SI 1) (reg:SI 2) (reg:SI 3)] UNSPEC_CPYMEM)] )] "rx_allow_string_insns" { @@ -2486,16 +2486,16 @@ emit_move_insn (len, force_operand (operands[2], NULL_RTX)); operands[0] = replace_equiv_address_nv (operands[0], addr1); operands[1] = replace_equiv_address_nv (operands[1], addr2); - emit_insn (gen_rx_movmem ()); + emit_insn (gen_rx_cpymem ()); DONE; } ) -(define_insn "rx_movmem" +(define_insn "rx_cpymem" [(set (mem:BLK (reg:SI 1)) (mem:BLK (reg:SI 2))) (use (reg:SI 3)) - (unspec_volatile:BLK [(reg:SI 1) (reg:SI 2) (reg:SI 3)] UNSPEC_MOVMEM) + (unspec_volatile:BLK [(reg:SI 1) (reg:SI 2) (reg:SI 3)] UNSPEC_CPYMEM) (clobber (reg:SI 1)) (clobber (reg:SI 2)) (clobber (reg:SI 3))] diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index aa04479ec..b162b26b3 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -104,7 +104,7 @@ extern void s390_reload_symref_address (rtx , rtx , rtx , bool); extern void s390_expand_plus_operand (rtx, rtx, rtx); extern void emit_symbolic_move (rtx *); extern void s390_load_address (rtx, rtx); -extern bool s390_expand_movmem (rtx, rtx, rtx); +extern bool s390_expand_cpymem (rtx, rtx, rtx); extern void s390_expand_setmem (rtx, rtx, rtx); extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx); extern void s390_expand_vec_strlen (rtx, rtx, rtx); diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index c35666dec..2959f6423 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -5400,7 +5400,7 @@ legitimize_reload_address (rtx ad, machine_mode mode ATTRIBUTE_UNUSED, /* Emit code to move LEN bytes from DST to SRC. */ bool -s390_expand_movmem (rtx dst, rtx src, rtx len) +s390_expand_cpymem (rtx dst, rtx src, rtx len) { /* When tuning for z10 or higher we rely on the Glibc functions to do the right thing. Only for constant lengths below 64k we will @@ -5425,14 +5425,14 @@ s390_expand_movmem (rtx dst, rtx src, rtx len) { rtx newdst = adjust_address (dst, BLKmode, o); rtx newsrc = adjust_address (src, BLKmode, o); - emit_insn (gen_movmem_short (newdst, newsrc, + emit_insn (gen_cpymem_short (newdst, newsrc, GEN_INT (l > 256 ? 
255 : l - 1))); } } else if (TARGET_MVCLE) { - emit_insn (gen_movmem_long (dst, src, convert_to_mode (Pmode, len, 1))); + emit_insn (gen_cpymem_long (dst, src, convert_to_mode (Pmode, len, 1))); } else @@ -5494,7 +5494,7 @@ s390_expand_movmem (rtx dst, rtx src, rtx len) emit_insn (prefetch); } - emit_insn (gen_movmem_short (dst, src, GEN_INT (255))); + emit_insn (gen_cpymem_short (dst, src, GEN_INT (255))); s390_load_address (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, GEN_INT (256))); s390_load_address (src_addr, @@ -5511,7 +5511,7 @@ s390_expand_movmem (rtx dst, rtx src, rtx len) emit_jump (loop_start_label); emit_label (loop_end_label); - emit_insn (gen_movmem_short (dst, src, + emit_insn (gen_cpymem_short (dst, src, convert_to_mode (Pmode, count, 1))); emit_label (end_label); } @@ -5563,7 +5563,7 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) if (l > 1) { rtx newdstp1 = adjust_address (dst, BLKmode, o + 1); - emit_insn (gen_movmem_short (newdstp1, newdst, + emit_insn (gen_cpymem_short (newdstp1, newdst, GEN_INT (l > 257 ? 255 : l - 2))); } } @@ -5670,7 +5670,7 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) /* Set the first byte in the block to the value and use an overlapping mvc for the block. */ emit_move_insn (adjust_address (dst, QImode, 0), val); - emit_insn (gen_movmem_short (dstp1, dst, GEN_INT (254))); + emit_insn (gen_cpymem_short (dstp1, dst, GEN_INT (254))); } s390_load_address (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, GEN_INT (256))); @@ -5694,7 +5694,7 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) emit_move_insn (adjust_address (dst, QImode, 0), val); /* execute only uses the lowest 8 bits of count that's exactly what we need here. */ - emit_insn (gen_movmem_short (dstp1, dst, + emit_insn (gen_cpymem_short (dstp1, dst, convert_to_mode (Pmode, count, 1))); } @@ -6336,7 +6336,7 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src) dest = adjust_address (dest, BLKmode, 0); set_mem_size (dest, size); - s390_expand_movmem (dest, src_mem, GEN_INT (size)); + s390_expand_cpymem (dest, src_mem, GEN_INT (size)); return true; } @@ -12408,7 +12408,7 @@ s390_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, s390_check_type_for_vector_abi (type, true, false); - if (pass_by_reference (NULL, TYPE_MODE (type), type, false)) + if (pass_va_arg_by_reference (type)) { if (TARGET_DEBUG_ARG) { diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 5a3496ac9..8dc3c12df 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -3196,17 +3196,17 @@ ; -; movmemM instruction pattern(s). +; cpymemM instruction pattern(s). ; -(define_expand "movmem" +(define_expand "cpymem" [(set (match_operand:BLK 0 "memory_operand" "") ; destination (match_operand:BLK 1 "memory_operand" "")) ; source (use (match_operand:GPR 2 "general_operand" "")) ; count (match_operand 3 "" "")] "" { - if (s390_expand_movmem (operands[0], operands[1], operands[2])) + if (s390_expand_cpymem (operands[0], operands[1], operands[2])) DONE; else FAIL; @@ -3215,7 +3215,7 @@ ; Move a block that is up to 256 bytes in length. ; The block length is taken as (operands[2] % 256) + 1. 
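The comment above spells out how cpymem_short reads its length operand: (operands[2] % 256) + 1, so an 8-bit value of 0..255 stands for a copy of 1..256 bytes, which is why the call sites earlier in this hunk clamp with GEN_INT (l > 256 ? 255 : l - 1). A trivial stand-alone check of that arithmetic (the helper name is illustrative):

#include <stdio.h>

/* Decode a cpymem_short length operand: 0..255 means 1..256 bytes.  */
static unsigned
cpymem_short_length (unsigned operand)
{
  return operand % 256 + 1;
}

int
main (void)
{
  printf ("%u %u %u\n",
          cpymem_short_length (0),     /* 1-byte copy */
          cpymem_short_length (63),    /* 64-byte copy */
          cpymem_short_length (255));  /* 256-byte copy */
  return 0;
}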
-(define_expand "movmem_short" +(define_expand "cpymem_short" [(parallel [(set (match_operand:BLK 0 "memory_operand" "") (match_operand:BLK 1 "memory_operand" "")) @@ -3225,7 +3225,7 @@ "" "operands[3] = gen_rtx_SCRATCH (Pmode);") -(define_insn "*movmem_short" +(define_insn "*cpymem_short" [(set (match_operand:BLK 0 "memory_operand" "=Q,Q,Q,Q") (match_operand:BLK 1 "memory_operand" "Q,Q,Q,Q")) (use (match_operand 2 "nonmemory_operand" "n,a,a,a")) @@ -3293,7 +3293,7 @@ ; Move a block of arbitrary length. -(define_expand "movmem_long" +(define_expand "cpymem_long" [(parallel [(clobber (match_dup 2)) (clobber (match_dup 3)) @@ -3327,7 +3327,7 @@ operands[3] = reg1; }) -(define_insn "*movmem_long" +(define_insn "*cpymem_long" [(clobber (match_operand: 0 "register_operand" "=d")) (clobber (match_operand: 1 "register_operand" "=d")) (set (mem:BLK (subreg:P (match_operand: 2 "register_operand" "0") 0)) @@ -3340,7 +3340,7 @@ [(set_attr "length" "8") (set_attr "type" "vs")]) -(define_insn "*movmem_long_31z" +(define_insn "*cpymem_long_31z" [(clobber (match_operand:TI 0 "register_operand" "=d")) (clobber (match_operand:TI 1 "register_operand" "=d")) (set (mem:BLK (subreg:SI (match_operand:TI 2 "register_operand" "0") 4)) diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md index fdb80d5d9..e687cf22a 100644 --- a/gcc/config/sh/sh.md +++ b/gcc/config/sh/sh.md @@ -8906,7 +8906,7 @@ ;; String/block move insn. -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (mem:BLK (match_operand:BLK 0)) (mem:BLK (match_operand:BLK 1))) (use (match_operand:SI 2 "nonmemory_operand")) diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c index a993aab76..02966fd03 100644 --- a/gcc/config/sparc/sparc.c +++ b/gcc/config/sparc/sparc.c @@ -7965,7 +7965,7 @@ sparc_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, bool indirect; tree ptrtype = build_pointer_type (type); - if (pass_by_reference (NULL, TYPE_MODE (type), type, false)) + if (pass_va_arg_by_reference (type)) { indirect = true; size = rsize = UNITS_PER_WORD; diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h index 4b09fc86b..8807a56f4 100644 --- a/gcc/config/sparc/sparc.h +++ b/gcc/config/sparc/sparc.h @@ -1419,7 +1419,7 @@ do { \ #define MOVE_MAX 8 /* If a memory-to-memory move would take MOVE_RATIO or more simple - move-instruction pairs, we will do a movmem or libcall instead. */ + move-instruction pairs, we will do a cpymem or libcall instead. */ #define MOVE_RATIO(speed) ((speed) ? 8 : 3) diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c index 8d7439e69..ecc767bfa 100644 --- a/gcc/config/spu/spu.c +++ b/gcc/config/spu/spu.c @@ -4053,8 +4053,7 @@ spu_gimplify_va_arg_expr (tree valist, tree type, gimple_seq * pre_p, /* if an object is dynamically sized, a pointer to it is passed instead of the object itself. */ - pass_by_reference_p = pass_by_reference (NULL, TYPE_MODE (type), type, - false); + pass_by_reference_p = pass_va_arg_by_reference (type); if (pass_by_reference_p) type = build_pointer_type (type); size = int_size_in_bytes (type); diff --git a/gcc/config/tilegx/tilegx.c b/gcc/config/tilegx/tilegx.c index 82226da3a..d12f1a99d 100644 --- a/gcc/config/tilegx/tilegx.c +++ b/gcc/config/tilegx/tilegx.c @@ -471,8 +471,7 @@ tilegx_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, /* If an object is dynamically sized, a pointer to it is passed instead of the object itself. 
*/ - pass_by_reference_p = pass_by_reference (NULL, TYPE_MODE (type), type, - false); + pass_by_reference_p = pass_va_arg_by_reference (type); if (pass_by_reference_p) type = build_pointer_type (type); diff --git a/gcc/config/tilepro/tilepro.c b/gcc/config/tilepro/tilepro.c index c8d69d32f..f1a0df0ad 100644 --- a/gcc/config/tilepro/tilepro.c +++ b/gcc/config/tilepro/tilepro.c @@ -419,8 +419,7 @@ tilepro_gimplify_va_arg_expr (tree valist, tree type, gimple_seq * pre_p, /* if an object is dynamically sized, a pointer to it is passed instead of the object itself. */ - pass_by_reference_p = pass_by_reference (NULL, TYPE_MODE (type), type, - false); + pass_by_reference_p = pass_va_arg_by_reference (type); if (pass_by_reference_p) type = build_pointer_type (type); diff --git a/gcc/config/vax/vax-protos.h b/gcc/config/vax/vax-protos.h index a76cf0239..a85cf3611 100644 --- a/gcc/config/vax/vax-protos.h +++ b/gcc/config/vax/vax-protos.h @@ -31,7 +31,6 @@ extern void vax_expand_addsub_di_operands (rtx *, enum rtx_code); extern const char * vax_output_int_move (rtx, rtx *, machine_mode); extern const char * vax_output_int_add (rtx_insn *, rtx *, machine_mode); extern const char * vax_output_int_subtract (rtx_insn *, rtx *, machine_mode); -extern const char * vax_output_movmemsi (rtx, rtx *); #endif /* RTX_CODE */ #ifdef REAL_VALUE_TYPE diff --git a/gcc/config/vax/vax.h b/gcc/config/vax/vax.h index a6a8227f7..e7137dc09 100644 --- a/gcc/config/vax/vax.h +++ b/gcc/config/vax/vax.h @@ -430,7 +430,7 @@ enum reg_class { NO_REGS, ALL_REGS, LIM_REG_CLASSES }; #define MOVE_MAX 8 /* If a memory-to-memory move would take MOVE_RATIO or more simple - move-instruction pairs, we will do a movmem or libcall instead. */ + move-instruction pairs, we will do a cpymem or libcall instead. */ #define MOVE_RATIO(speed) ((speed) ? 6 : 3) #define CLEAR_RATIO(speed) ((speed) ? 6 : 2) diff --git a/gcc/config/vax/vax.md b/gcc/config/vax/vax.md index bfeae7f80..298f3393d 100644 --- a/gcc/config/vax/vax.md +++ b/gcc/config/vax/vax.md @@ -206,8 +206,8 @@ }") ;; This is here to accept 4 arguments and pass the first 3 along -;; to the movmemhi1 pattern that really does the work. -(define_expand "movmemhi" +;; to the cpymemhi1 pattern that really does the work. +(define_expand "cpymemhi" [(set (match_operand:BLK 0 "general_operand" "=g") (match_operand:BLK 1 "general_operand" "g")) (use (match_operand:HI 2 "general_operand" "g")) @@ -215,7 +215,7 @@ "" " { - emit_insn (gen_movmemhi1 (operands[0], operands[1], operands[2])); + emit_insn (gen_cpymemhi1 (operands[0], operands[1], operands[2])); DONE; }") @@ -224,7 +224,7 @@ ;; that anything generated as this insn will be recognized as one ;; and that it won't successfully combine with anything. 
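Across all of the .md changes in this section the block-copy expanders are renamed from movmem* to cpymem*. As the new name suggests, these patterns have memcpy-style semantics: they are not required to handle overlapping source and destination, which is the memmove case. A stand-alone reminder of the distinction the name encodes:

#include <stdio.h>
#include <string.h>

int
main (void)
{
  char buf[] = "abcdef";

  /* Overlapping copy: memmove is well defined, memcpy would be
     undefined behaviour -- the cpymem patterns correspond to the
     non-overlapping memcpy case.  */
  memmove (buf + 2, buf, 4);
  printf ("%s\n", buf);   /* prints "ababcd" */
  return 0;
}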
-(define_insn "movmemhi1" +(define_insn "cpymemhi1" [(set (match_operand:BLK 0 "memory_operand" "=o") (match_operand:BLK 1 "memory_operand" "o")) (use (match_operand:HI 2 "general_operand" "g")) diff --git a/gcc/config/visium/visium.c b/gcc/config/visium/visium.c index 431f64cfc..4ff331362 100644 --- a/gcc/config/visium/visium.c +++ b/gcc/config/visium/visium.c @@ -1637,8 +1637,7 @@ visium_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, tree f_ovfl, f_gbase, f_fbase, f_gbytes, f_fbytes; tree ovfl, base, bytes; HOST_WIDE_INT size, rsize; - const bool by_reference_p - = pass_by_reference (NULL, TYPE_MODE (type), type, false); + const bool by_reference_p = pass_va_arg_by_reference (type); const bool float_reg_arg_p = (TARGET_FPU && !by_reference_p && ((GET_MODE_CLASS (TYPE_MODE (type)) == MODE_FLOAT diff --git a/gcc/config/visium/visium.h b/gcc/config/visium/visium.h index 817e7dc70..c9376b28f 100644 --- a/gcc/config/visium/visium.h +++ b/gcc/config/visium/visium.h @@ -1138,8 +1138,8 @@ do \ always make code faster, but eventually incurs high cost in increased code size. - Since we have a movmemsi pattern, the default MOVE_RATIO is 2, which - is too low given that movmemsi will invoke a libcall. */ + Since we have a cpymemsi pattern, the default MOVE_RATIO is 2, which + is too low given that cpymemsi will invoke a libcall. */ #define MOVE_RATIO(speed) ((speed) ? 9 : 3) /* `CLEAR_RATIO (SPEED)` diff --git a/gcc/config/visium/visium.md b/gcc/config/visium/visium.md index f53544134..e146b89d1 100644 --- a/gcc/config/visium/visium.md +++ b/gcc/config/visium/visium.md @@ -3006,7 +3006,7 @@ ;; Argument 2 is the length ;; Argument 3 is the alignment -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "memory_operand" "") (match_operand:BLK 1 "memory_operand" "")) (use (match_operand:SI 2 "general_operand" "")) diff --git a/gcc/config/xtensa/xtensa.c b/gcc/config/xtensa/xtensa.c index ee5612441..b275deafa 100644 --- a/gcc/config/xtensa/xtensa.c +++ b/gcc/config/xtensa/xtensa.c @@ -3252,7 +3252,7 @@ xtensa_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, tree lab_false, lab_over, lab_false2; bool indirect; - indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); + indirect = pass_va_arg_by_reference (type); if (indirect) type = build_pointer_type (type); diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md index 362e5ff3c..d1448a02f 100644 --- a/gcc/config/xtensa/xtensa.md +++ b/gcc/config/xtensa/xtensa.md @@ -1026,7 +1026,7 @@ ;; Block moves -(define_expand "movmemsi" +(define_expand "cpymemsi" [(parallel [(set (match_operand:BLK 0 "" "") (match_operand:BLK 1 "" "")) (use (match_operand:SI 2 "arith_operand" "")) diff --git a/gcc/coretypes.h b/gcc/coretypes.h index 2f6b8599d..88fe8a3f9 100644 --- a/gcc/coretypes.h +++ b/gcc/coretypes.h @@ -153,6 +153,14 @@ struct cl_option_handlers; struct diagnostic_context; struct pretty_printer; +template struct array_traits; + +/* Provides a read-only bitmap view of a single integer bitmask or an + array of integer bitmasks, or of a wrapper around such bitmasks. */ +template, + bool has_constant_size = Traits::has_constant_size> +struct bitmap_view; + /* Address space number for named address space support. */ typedef unsigned char addr_space_t; @@ -332,6 +340,7 @@ namespace gcc { } typedef std::pair tree_pair; +typedef std::pair string_int_pair; /* Define a name->value mapping. 
*/ template diff --git a/gcc/coverage.c b/gcc/coverage.c index 1ffefd5f4..a63cb94e9 100644 --- a/gcc/coverage.c +++ b/gcc/coverage.c @@ -643,7 +643,7 @@ coverage_begin_function (unsigned lineno_checksum, unsigned cfg_checksum) (DECL_ASSEMBLER_NAME (current_function_decl))); gcov_write_unsigned (DECL_ARTIFICIAL (current_function_decl) && !DECL_FUNCTION_VERSIONED (current_function_decl) - && !DECL_LAMBDA_FUNCTION (current_function_decl)); + && !DECL_LAMBDA_FUNCTION_P (current_function_decl)); gcov_write_filename (xloc.file); gcov_write_unsigned (xloc.line); gcov_write_unsigned (xloc.column); diff --git a/gcc/cp/call.c b/gcc/cp/call.c index 23a54f3c3..3a821de7a 100644 --- a/gcc/cp/call.c +++ b/gcc/cp/call.c @@ -9166,12 +9166,14 @@ maybe_warn_class_memaccess (location_t loc, tree fndecl, } /* Build and return a call to FN, using NARGS arguments in ARGARRAY. + If FN is the result of resolving an overloaded target built-in, + ORIG_FNDECL is the original function decl, otherwise it is null. This function performs no overload resolution, conversion, or other high-level operations. */ tree build_cxx_call (tree fn, int nargs, tree *argarray, - tsubst_flags_t complain) + tsubst_flags_t complain, tree orig_fndecl) { tree fndecl; @@ -9181,11 +9183,13 @@ build_cxx_call (tree fn, int nargs, tree *argarray, SET_EXPR_LOCATION (fn, loc); fndecl = get_callee_fndecl (fn); + if (!orig_fndecl) + orig_fndecl = fndecl; /* Check that arguments to builtin functions match the expectations. */ if (fndecl && !processing_template_decl - && fndecl_built_in_p (fndecl, BUILT_IN_NORMAL)) + && fndecl_built_in_p (fndecl)) { int i; @@ -9195,7 +9199,7 @@ build_cxx_call (tree fn, int nargs, tree *argarray, argarray[i] = maybe_constant_value (argarray[i]); if (!check_builtin_function_arguments (EXPR_LOCATION (fn), vNULL, fndecl, - nargs, argarray)) + orig_fndecl, nargs, argarray)) return error_mark_node; } diff --git a/gcc/cp/cp-objcp-common.h b/gcc/cp/cp-objcp-common.h index 89a889a7d..e5d34f180 100644 --- a/gcc/cp/cp-objcp-common.h +++ b/gcc/cp/cp-objcp-common.h @@ -35,6 +35,8 @@ extern tree cp_get_global_decls (); extern tree cp_pushdecl (tree); extern void cp_register_dumps (gcc::dump_manager *); extern tree cxx_make_type_hook (tree_code); +extern tree cxx_simulate_enum_decl (location_t, const char *, + vec); /* Lang hooks that are shared between C++ and ObjC++ are defined here. 
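The call.c hunk above threads a new ORIG_FNDECL argument through build_cxx_call so that, when a target hook has already replaced an overloaded built-in with a specific implementation, argument checking and diagnostics still refer to the declaration the user wrote. A hedged sketch of a caller; the function and variable names are illustrative, not taken from the patch:

/* Hypothetical resolver: FN is the implementation chosen by the target,
   ORIG is the user-visible overloaded built-in.  Passing ORIG keeps
   check_builtin_function_arguments pointed at the right decl.  */
static tree
call_resolved_target_builtin (tree fn, tree orig, int nargs, tree *args,
                              tsubst_flags_t complain)
{
  return build_cxx_call (fn, nargs, args, complain, /*orig_fndecl=*/orig);
}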
Hooks specific to C++ or ObjC++ go in cp/cp-lang.c and objcp/objcp-lang.c, @@ -100,6 +102,9 @@ extern tree cxx_make_type_hook (tree_code); #define LANG_HOOKS_BUILTIN_FUNCTION cxx_builtin_function #undef LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE #define LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE cxx_builtin_function_ext_scope +#undef LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL +#define LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL \ + cxx_simulate_builtin_function_decl #undef LANG_HOOKS_TYPE_HASH_EQ #define LANG_HOOKS_TYPE_HASH_EQ cxx_type_hash_eq #undef LANG_HOOKS_COPY_LANG_QUALIFIERS @@ -128,6 +133,8 @@ extern tree cxx_make_type_hook (tree_code); #undef LANG_HOOKS_MAKE_TYPE #define LANG_HOOKS_MAKE_TYPE cxx_make_type_hook +#undef LANG_HOOKS_SIMULATE_ENUM_DECL +#define LANG_HOOKS_SIMULATE_ENUM_DECL cxx_simulate_enum_decl #undef LANG_HOOKS_TYPE_FOR_MODE #define LANG_HOOKS_TYPE_FOR_MODE c_common_type_for_mode #undef LANG_HOOKS_TYPE_FOR_SIZE diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h index f7c3eea4c..4bba1887f 100644 --- a/gcc/cp/cp-tree.h +++ b/gcc/cp/cp-tree.h @@ -6245,7 +6245,8 @@ extern tree perform_direct_initialization_if_possible (tree, tree, bool, tsubst_flags_t); extern tree in_charge_arg_for_name (tree); extern tree build_cxx_call (tree, int, tree *, - tsubst_flags_t); + tsubst_flags_t, + tree = NULL_TREE); extern bool is_std_init_list (tree); extern bool is_list_ctor (tree); extern void validate_conversion_obstack (void); @@ -6451,6 +6452,7 @@ extern tmpl_spec_kind current_tmpl_spec_kind (int); extern tree cp_fname_init (const char *, tree *); extern tree cxx_builtin_function (tree decl); extern tree cxx_builtin_function_ext_scope (tree decl); +extern tree cxx_simulate_builtin_function_decl (tree); extern tree check_elaborated_type_specifier (enum tag_types, tree, bool); extern void warn_extern_redeclared_static (tree, tree); extern tree cxx_comdat_group (tree); @@ -7386,7 +7388,8 @@ extern tree get_member_function_from_ptrfunc (tree *, tree, tsubst_flags_t); extern tree cp_build_function_call_nary (tree, tsubst_flags_t, ...) ATTRIBUTE_SENTINEL; extern tree cp_build_function_call_vec (tree, vec **, - tsubst_flags_t); + tsubst_flags_t, + tree = NULL_TREE); extern tree build_x_binary_op (const op_location_t &, enum tree_code, tree, enum tree_code, tree, diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c index 5c82c2272..928ac3f21 100644 --- a/gcc/cp/decl.c +++ b/gcc/cp/decl.c @@ -2273,7 +2273,8 @@ next_arg:; DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (newdecl) |= DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (olddecl); DECL_NO_LIMIT_STACK (newdecl) |= DECL_NO_LIMIT_STACK (olddecl); - DECL_IS_OPERATOR_NEW (newdecl) |= DECL_IS_OPERATOR_NEW (olddecl); + if (DECL_IS_OPERATOR_NEW_P (olddecl)) + DECL_SET_IS_OPERATOR_NEW (newdecl, true); DECL_LOOPING_CONST_OR_PURE_P (newdecl) |= DECL_LOOPING_CONST_OR_PURE_P (olddecl); @@ -2520,8 +2521,7 @@ next_arg:; if (fndecl_built_in_p (olddecl) && (new_defines_function ? GNU_INLINE_P (newdecl) : types_match)) { - DECL_BUILT_IN_CLASS (newdecl) = DECL_BUILT_IN_CLASS (olddecl); - DECL_FUNCTION_CODE (newdecl) = DECL_FUNCTION_CODE (olddecl); + copy_decl_built_in_function (newdecl, olddecl); /* If we're keeping the built-in definition, keep the rtl, regardless of declaration matches. 
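duplicate_decls above now calls copy_decl_built_in_function instead of copying the built-in class and function code by hand, and DECL_IS_OPERATOR_NEW becomes a predicate/setter pair. A minimal sketch of what the helper condenses, reconstructed from the removed lines only (the real inline in tree.h may copy additional state):

/* Sketch only: the two assignments the C++ front end used to open-code.  */
static void
copy_decl_built_in_function_sketch (tree newdecl, tree olddecl)
{
  DECL_BUILT_IN_CLASS (newdecl) = DECL_BUILT_IN_CLASS (olddecl);
  DECL_FUNCTION_CODE (newdecl) = DECL_FUNCTION_CODE (olddecl);
}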
*/ COPY_DECL_RTL (olddecl, newdecl); @@ -4335,10 +4335,10 @@ cxx_init_decl_processing (void) deltype = build_exception_variant (deltype, empty_except_spec); tree opnew = push_cp_library_fn (NEW_EXPR, newtype, 0); DECL_IS_MALLOC (opnew) = 1; - DECL_IS_OPERATOR_NEW (opnew) = 1; + DECL_SET_IS_OPERATOR_NEW (opnew, true); opnew = push_cp_library_fn (VEC_NEW_EXPR, newtype, 0); DECL_IS_MALLOC (opnew) = 1; - DECL_IS_OPERATOR_NEW (opnew) = 1; + DECL_SET_IS_OPERATOR_NEW (opnew, true); push_cp_library_fn (DELETE_EXPR, deltype, ECF_NOTHROW); push_cp_library_fn (VEC_DELETE_EXPR, deltype, ECF_NOTHROW); if (flag_sized_deallocation) @@ -4371,10 +4371,10 @@ cxx_init_decl_processing (void) newtype = build_exception_variant (newtype, new_eh_spec); opnew = push_cp_library_fn (NEW_EXPR, newtype, 0); DECL_IS_MALLOC (opnew) = 1; - DECL_IS_OPERATOR_NEW (opnew) = 1; + DECL_SET_IS_OPERATOR_NEW (opnew, true); opnew = push_cp_library_fn (VEC_NEW_EXPR, newtype, 0); DECL_IS_MALLOC (opnew) = 1; - DECL_IS_OPERATOR_NEW (opnew) = 1; + DECL_SET_IS_OPERATOR_NEW (opnew, true); /* operator delete (void *, align_val_t); */ deltype = build_function_type_list (void_type_node, ptr_type_node, @@ -4614,6 +4614,19 @@ cxx_builtin_function_ext_scope (tree decl) return builtin_function_1 (decl, NULL_TREE, true); } +/* Implement LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL. */ + +tree +cxx_simulate_builtin_function_decl (tree decl) +{ + retrofit_lang_decl (decl); + + DECL_ARTIFICIAL (decl) = 1; + SET_DECL_LANGUAGE (decl, lang_cplusplus); + DECL_CONTEXT (decl) = FROB_CONTEXT (current_namespace); + return pushdecl (decl); +} + /* Generate a FUNCTION_DECL with the typical flags for a runtime library function. Not called directly. */ @@ -13570,7 +13583,7 @@ grok_op_properties (tree decl, bool complain) coerce_delete_type (decl, loc); else { - DECL_IS_OPERATOR_NEW (decl) = 1; + DECL_SET_IS_OPERATOR_NEW (decl, true); TREE_TYPE (decl) = coerce_new_type (TREE_TYPE (decl), loc); } @@ -15119,6 +15132,40 @@ lookup_enumerator (tree enumtype, tree name) return e? TREE_VALUE (e) : NULL_TREE; } +/* Implement LANG_HOOKS_SIMULATE_ENUM_DECL. */ + +tree +cxx_simulate_enum_decl (location_t loc, const char *name, + vec values) +{ + location_t saved_loc = input_location; + input_location = loc; + + tree enumtype = start_enum (get_identifier (name), NULL_TREE, NULL_TREE, + NULL_TREE, false, NULL); + if (!OPAQUE_ENUM_P (enumtype)) + { + error_at (loc, "multiple definition of %q#T", enumtype); + inform (DECL_SOURCE_LOCATION (TYPE_MAIN_DECL (enumtype)), + "previous definition here"); + return enumtype; + } + SET_OPAQUE_ENUM_P (enumtype, false); + DECL_SOURCE_LOCATION (TYPE_NAME (enumtype)) = loc; + + string_int_pair *value; + unsigned int i; + FOR_EACH_VEC_ELT (values, i, value) + build_enumerator (get_identifier (value->first), + build_int_cst (integer_type_node, value->second), + enumtype, NULL_TREE, loc); + + finish_enum_value_list (enumtype); + finish_enum (enumtype); + + input_location = saved_loc; + return enumtype; +} /* We're defining DECL. Make sure that its type is OK. */ diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 60fe58e03..6fc6ed4e3 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -10977,7 +10977,7 @@ cp_parser_lambda_declarator_opt (cp_parser* parser, tree lambda_expr) DECL_ARTIFICIAL (fco) = 1; /* Give the object parameter a different name. 
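cxx_simulate_enum_decl above lets code outside the parser, typically a target's built-in initialization, materialize an enum as if it had been written in source, consuming a value list like the one sketched after the coretypes.h hunk. A hypothetical caller; the lang-hook path (lang_hooks.types.simulate_enum_decl) and all names and values here are assumptions for illustration:

/* Hypothetical target code: simulate
     enum svpattern { SV_POW2 = 0, SV_ALL = 31 };  */
static tree
register_fake_enum (void)
{
  auto_vec<string_int_pair, 2> values;
  values.quick_push (string_int_pair ("SV_POW2", 0));
  values.quick_push (string_int_pair ("SV_ALL", 31));
  return lang_hooks.types.simulate_enum_decl (input_location, "svpattern",
                                              values);
}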
*/ DECL_NAME (DECL_ARGUMENTS (fco)) = closure_identifier; - DECL_LAMBDA_FUNCTION (fco) = 1; + DECL_SET_LAMBDA_FUNCTION (fco, true); } if (template_param_list) { diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c index ff7921533..bd6df79a4 100644 --- a/gcc/cp/pt.c +++ b/gcc/cp/pt.c @@ -28431,9 +28431,8 @@ declare_integer_pack (void) NULL_TREE), NULL_TREE, ECF_CONST); DECL_DECLARED_CONSTEXPR_P (ipfn) = true; - DECL_BUILT_IN_CLASS (ipfn) = BUILT_IN_FRONTEND; - DECL_FUNCTION_CODE (ipfn) - = (enum built_in_function) (int) CP_BUILT_IN_INTEGER_PACK; + set_decl_built_in_function (ipfn, BUILT_IN_FRONTEND, + CP_BUILT_IN_INTEGER_PACK); } /* Set up the hash tables for template instantiations. */ diff --git a/gcc/cp/typeck.c b/gcc/cp/typeck.c index c42fd731c..82f7bb0bd 100644 --- a/gcc/cp/typeck.c +++ b/gcc/cp/typeck.c @@ -3738,11 +3738,11 @@ build_function_call (location_t /*loc*/, tree build_function_call_vec (location_t /*loc*/, vec /*arg_loc*/, tree function, vec *params, - vec * /*origtypes*/) + vec * /*origtypes*/, tree orig_function) { vec *orig_params = params; tree ret = cp_build_function_call_vec (function, ¶ms, - tf_warning_or_error); + tf_warning_or_error, orig_function); /* cp_build_function_call_vec can reallocate PARAMS by adding default arguments. That should never happen here. Verify @@ -3787,13 +3787,15 @@ cp_build_function_call_nary (tree function, tsubst_flags_t complain, ...) return ret; } -/* Build a function call using a vector of arguments. PARAMS may be - NULL if there are no parameters. This changes the contents of - PARAMS. */ +/* Build a function call using a vector of arguments. + If FUNCTION is the result of resolving an overloaded target built-in, + ORIG_FNDECL is the original function decl, otherwise it is null. + PARAMS may be NULL if there are no parameters. This changes the + contents of PARAMS. */ tree cp_build_function_call_vec (tree function, vec **params, - tsubst_flags_t complain) + tsubst_flags_t complain, tree orig_fndecl) { tree fntype, fndecl; int is_method; @@ -3918,7 +3920,7 @@ cp_build_function_call_vec (tree function, vec **params, bool warned_p = check_function_arguments (input_location, fndecl, fntype, nargs, argarray, NULL); - ret = build_cxx_call (function, nargs, argarray, complain); + ret = build_cxx_call (function, nargs, argarray, complain, orig_fndecl); if (warned_p) { diff --git a/gcc/cse.c b/gcc/cse.c index 6c9cda16a..18eb8dfbb 100644 --- a/gcc/cse.c +++ b/gcc/cse.c @@ -559,7 +559,6 @@ static struct table_elt *insert_with_costs (rtx, struct table_elt *, unsigned, static struct table_elt *insert (rtx, struct table_elt *, unsigned, machine_mode); static void merge_equiv_classes (struct table_elt *, struct table_elt *); -static void invalidate_reg (rtx, bool); static void invalidate (rtx, machine_mode); static void remove_invalid_refs (unsigned int); static void remove_invalid_subreg_refs (unsigned int, poly_uint64, @@ -1821,12 +1820,10 @@ check_dependence (const_rtx x, rtx exp, machine_mode mode, rtx addr) } /* Remove from the hash table, or mark as invalid, all expressions whose - values could be altered by storing in register X. - - CLOBBER_HIGH is set if X was part of a CLOBBER_HIGH expression. */ + values could be altered by storing in register X. 
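declare_integer_pack in the pt.c hunk above switches to set_decl_built_in_function rather than poking DECL_BUILT_IN_CLASS and DECL_FUNCTION_CODE directly; the D front end makes the same change later in this series. A sketch of what the setter bundles, based only on the lines removed here (the real helper in tree.h may do extra checking):

/* Sketch only.  */
static void
set_decl_built_in_function_sketch (tree decl, built_in_class fclass,
                                   unsigned int fcode)
{
  DECL_BUILT_IN_CLASS (decl) = fclass;
  DECL_FUNCTION_CODE (decl) = (built_in_function) fcode;
}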
*/ static void -invalidate_reg (rtx x, bool clobber_high) +invalidate_reg (rtx x) { gcc_assert (GET_CODE (x) == REG); @@ -1851,10 +1848,7 @@ invalidate_reg (rtx x, bool clobber_high) SUBREG_TICKED (regno) = -1; if (regno >= FIRST_PSEUDO_REGISTER) - { - gcc_assert (!clobber_high); - remove_pseudo_from_table (x, hash); - } + remove_pseudo_from_table (x, hash); else { HOST_WIDE_INT in_table = TEST_HARD_REG_BIT (hard_regs_in_table, regno); @@ -1882,18 +1876,10 @@ invalidate_reg (rtx x, bool clobber_high) if (!REG_P (p->exp) || REGNO (p->exp) >= FIRST_PSEUDO_REGISTER) continue; - if (clobber_high) - { - if (reg_is_clobbered_by_clobber_high (p->exp, x)) - remove_from_table (p, hash); - } - else - { - unsigned int tregno = REGNO (p->exp); - unsigned int tendregno = END_REGNO (p->exp); - if (tendregno > regno && tregno < endregno) - remove_from_table (p, hash); - } + unsigned int tregno = REGNO (p->exp); + unsigned int tendregno = END_REGNO (p->exp); + if (tendregno > regno && tregno < endregno) + remove_from_table (p, hash); } } } @@ -1920,7 +1906,7 @@ invalidate (rtx x, machine_mode full_mode) switch (GET_CODE (x)) { case REG: - invalidate_reg (x, false); + invalidate_reg (x); return; case SUBREG: @@ -4420,8 +4406,6 @@ canonicalize_insn (rtx_insn *insn, struct set **psets, int n_sets) if (MEM_P (XEXP (x, 0))) canon_reg (XEXP (x, 0), insn); } - else if (GET_CODE (x) == CLOBBER_HIGH) - gcc_assert (REG_P (XEXP (x, 0))); else if (GET_CODE (x) == USE && ! (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) < FIRST_PSEUDO_REGISTER)) @@ -4453,8 +4437,6 @@ canonicalize_insn (rtx_insn *insn, struct set **psets, int n_sets) if (MEM_P (XEXP (y, 0))) canon_reg (XEXP (y, 0), insn); } - else if (GET_CODE (y) == CLOBBER_HIGH) - gcc_assert (REG_P (XEXP (y, 0))); else if (GET_CODE (y) == USE && ! (REG_P (XEXP (y, 0)) && REGNO (XEXP (y, 0)) < FIRST_PSEUDO_REGISTER)) @@ -6155,12 +6137,6 @@ invalidate_from_clobbers (rtx_insn *insn) invalidate (XEXP (ref, 0), GET_MODE (ref)); } } - if (GET_CODE (x) == CLOBBER_HIGH) - { - rtx ref = XEXP (x, 0); - gcc_assert (REG_P (ref)); - invalidate_reg (ref, true); - } else if (GET_CODE (x) == PARALLEL) { int i; @@ -6177,12 +6153,6 @@ invalidate_from_clobbers (rtx_insn *insn) || GET_CODE (ref) == ZERO_EXTRACT) invalidate (XEXP (ref, 0), GET_MODE (ref)); } - else if (GET_CODE (y) == CLOBBER_HIGH) - { - rtx ref = XEXP (y, 0); - gcc_assert (REG_P (ref)); - invalidate_reg (ref, true); - } } } } @@ -6204,12 +6174,6 @@ invalidate_from_sets_and_clobbers (rtx_insn *insn) rtx temx = XEXP (tem, 0); if (GET_CODE (temx) == CLOBBER) invalidate (SET_DEST (temx), VOIDmode); - else if (GET_CODE (temx) == CLOBBER_HIGH) - { - rtx temref = XEXP (temx, 0); - gcc_assert (REG_P (temref)); - invalidate_reg (temref, true); - } } } @@ -6237,12 +6201,6 @@ invalidate_from_sets_and_clobbers (rtx_insn *insn) || GET_CODE (clobbered) == ZERO_EXTRACT) invalidate (XEXP (clobbered, 0), GET_MODE (clobbered)); } - else if (GET_CODE (y) == CLOBBER_HIGH) - { - rtx ref = XEXP (y, 0); - gcc_assert (REG_P (ref)); - invalidate_reg (ref, true); - } else if (GET_CODE (y) == SET && GET_CODE (SET_SRC (y)) == CALL) invalidate (SET_DEST (y), VOIDmode); } @@ -6902,10 +6860,6 @@ count_reg_usage (rtx x, int *counts, rtx dest, int incr) count_reg_usage (XEXP (XEXP (x, 0), 0), counts, NULL_RTX, incr); return; - case CLOBBER_HIGH: - gcc_assert (REG_P ((XEXP (x, 0)))); - return; - case SET: /* Unless we are setting a REG, count everything in SET_DEST. 
*/ if (!REG_P (SET_DEST (x))) @@ -6958,8 +6912,7 @@ count_reg_usage (rtx x, int *counts, rtx dest, int incr) || (REG_NOTE_KIND (x) != REG_NONNEG && GET_CODE (XEXP (x,0)) == USE) /* FUNCTION_USAGE expression lists may include (CLOBBER (mem /u)), involving registers in the address. */ - || GET_CODE (XEXP (x, 0)) == CLOBBER - || GET_CODE (XEXP (x, 0)) == CLOBBER_HIGH) + || GET_CODE (XEXP (x, 0)) == CLOBBER) count_reg_usage (XEXP (x, 0), counts, NULL_RTX, incr); count_reg_usage (XEXP (x, 1), counts, NULL_RTX, incr); @@ -7043,9 +6996,7 @@ insn_live_p (rtx_insn *insn, int *counts) if (set_live_p (elt, insn, counts)) return true; } - else if (GET_CODE (elt) != CLOBBER - && GET_CODE (elt) != CLOBBER_HIGH - && GET_CODE (elt) != USE) + else if (GET_CODE (elt) != CLOBBER && GET_CODE (elt) != USE) return true; } return false; @@ -7158,7 +7109,7 @@ delete_trivially_dead_insns (rtx_insn *insns, int nreg) else if (INSN_P (insn)) { count_reg_usage (insn, counts, NULL_RTX, 1); - note_stores (PATTERN (insn), count_stores, counts + nreg * 2); + note_stores (insn, count_stores, counts + nreg * 2); } /* If there can be debug insns, COUNTS are 3 consecutive arrays. First one counts how many times each pseudo is used outside diff --git a/gcc/cselib.c b/gcc/cselib.c index 108b2588c..e3408bb38 100644 --- a/gcc/cselib.c +++ b/gcc/cselib.c @@ -32,6 +32,7 @@ along with GCC; see the file COPYING3. If not see #include "dumpfile.h" #include "cselib.h" #include "params.h" +#include "function-abi.h" /* A list of cselib_val structures. */ struct elt_list @@ -54,8 +55,7 @@ static unsigned int cselib_hash_rtx (rtx, int, machine_mode); static cselib_val *new_cselib_val (unsigned int, machine_mode, rtx); static void add_mem_for_addr (cselib_val *, cselib_val *, rtx); static cselib_val *cselib_lookup_mem (rtx, int); -static void cselib_invalidate_regno (unsigned int, machine_mode, - const_rtx = NULL); +static void cselib_invalidate_regno (unsigned int, machine_mode); static void cselib_invalidate_mem (rtx); static void cselib_record_set (rtx, cselib_val *, cselib_val *); static void cselib_record_sets (rtx_insn *); @@ -1662,7 +1662,6 @@ cselib_expand_value_rtx_1 (rtx orig, struct expand_value_data *evd, /* SCRATCH must be shared because they represent distinct values. */ return orig; case CLOBBER: - case CLOBBER_HIGH: if (REG_P (XEXP (orig, 0)) && HARD_REGISTER_NUM_P (REGNO (XEXP (orig, 0)))) return orig; break; @@ -2165,8 +2164,7 @@ cselib_lookup (rtx x, machine_mode mode, invalidating call clobbered registers across a call. */ static void -cselib_invalidate_regno (unsigned int regno, machine_mode mode, - const_rtx setter) +cselib_invalidate_regno (unsigned int regno, machine_mode mode) { unsigned int endregno; unsigned int i; @@ -2189,9 +2187,6 @@ cselib_invalidate_regno (unsigned int regno, machine_mode mode, i = regno - max_value_regs; endregno = end_hard_regno (mode, regno); - - if (setter && GET_CODE (setter) == CLOBBER_HIGH) - gcc_assert (endregno == regno + 1); } else { @@ -2224,19 +2219,6 @@ cselib_invalidate_regno (unsigned int regno, machine_mode mode, continue; } - /* Ignore if clobber high and the register isn't clobbered. */ - if (setter && GET_CODE (setter) == CLOBBER_HIGH) - { - gcc_assert (endregno == regno + 1); - const_rtx x = XEXP (setter, 0); - if (!reg_is_clobbered_by_clobber_high (i, GET_MODE (v->val_rtx), - x)) - { - l = &(*l)->next; - continue; - } - } - /* We have an overlap. 
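Several hunks in cse.c above and in dce.c, ddg.c and df-problems.c below rely on the reworked note_stores interface: the insn-taking overload replaces the old PATTERN (insn) form, while note_pattern_stores keeps the pattern-only behaviour for callers such as cselib_record_sets. A hedged sketch of the two call styles, assuming a note_stores-compatible callback like cse.c's count_stores and an opaque DATA pointer:

/* Sketch: the two interfaces this series distinguishes.  */
static void
scan_insn_stores (rtx_insn *insn, void *data)
{
  /* New interface: pass the insn itself; call-usage clobbers are seen too.  */
  note_stores (insn, count_stores, data);

  /* Pattern-only variant, kept where only the insn body must be scanned.  */
  note_pattern_stores (PATTERN (insn), count_stores, data);
}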
*/ if (*l == REG_VALUES (i)) { @@ -2371,10 +2353,10 @@ cselib_invalidate_mem (rtx mem_rtx) *vp = &dummy_val; } -/* Invalidate DEST, which is being assigned to or clobbered by SETTER. */ +/* Invalidate DEST. */ void -cselib_invalidate_rtx (rtx dest, const_rtx setter) +cselib_invalidate_rtx (rtx dest) { while (GET_CODE (dest) == SUBREG || GET_CODE (dest) == ZERO_EXTRACT @@ -2382,7 +2364,7 @@ cselib_invalidate_rtx (rtx dest, const_rtx setter) dest = XEXP (dest, 0); if (REG_P (dest)) - cselib_invalidate_regno (REGNO (dest), GET_MODE (dest), setter); + cselib_invalidate_regno (REGNO (dest), GET_MODE (dest)); else if (MEM_P (dest)) cselib_invalidate_mem (dest); } @@ -2390,10 +2372,10 @@ cselib_invalidate_rtx (rtx dest, const_rtx setter) /* A wrapper for cselib_invalidate_rtx to be called via note_stores. */ static void -cselib_invalidate_rtx_note_stores (rtx dest, const_rtx setter, +cselib_invalidate_rtx_note_stores (rtx dest, const_rtx, void *data ATTRIBUTE_UNUSED) { - cselib_invalidate_rtx (dest, setter); + cselib_invalidate_rtx (dest); } /* Record the result of a SET instruction. DEST is being set; the source @@ -2659,7 +2641,7 @@ cselib_record_sets (rtx_insn *insn) /* Invalidate all locations written by this insn. Note that the elts we looked up in the previous loop aren't affected, just some of their locations may go away. */ - note_stores (body, cselib_invalidate_rtx_note_stores, NULL); + note_pattern_stores (body, cselib_invalidate_rtx_note_stores, NULL); for (i = n_sets_before_autoinc; i < n_sets; i++) cselib_invalidate_rtx (sets[i].dest); @@ -2765,11 +2747,13 @@ cselib_process_insn (rtx_insn *insn) memory. */ if (CALL_P (insn)) { + function_abi callee_abi = insn_callee_abi (insn); for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (call_used_regs[i] + if (call_used_or_fixed_reg_p (i) || (REG_VALUES (i) && REG_VALUES (i)->elt && (targetm.hard_regno_call_part_clobbered - (insn, i, GET_MODE (REG_VALUES (i)->elt->val_rtx))))) + (callee_abi.id (), i, + GET_MODE (REG_VALUES (i)->elt->val_rtx))))) cselib_invalidate_regno (i, reg_raw_mode[i]); /* Since it is not clear how cselib is going to be used, be @@ -2794,11 +2778,9 @@ cselib_process_insn (rtx_insn *insn) if (CALL_P (insn)) { for (x = CALL_INSN_FUNCTION_USAGE (insn); x; x = XEXP (x, 1)) - { - gcc_assert (GET_CODE (XEXP (x, 0)) != CLOBBER_HIGH); - if (GET_CODE (XEXP (x, 0)) == CLOBBER) - cselib_invalidate_rtx (XEXP (XEXP (x, 0), 0)); - } + if (GET_CODE (XEXP (x, 0)) == CLOBBER) + cselib_invalidate_rtx (XEXP (XEXP (x, 0), 0)); + /* Flush everything on setjmp. */ if (cselib_preserve_constants && find_reg_note (insn, REG_SETJMP, NULL)) diff --git a/gcc/cselib.h b/gcc/cselib.h index 8b8d3e8d5..b5854aedc 100644 --- a/gcc/cselib.h +++ b/gcc/cselib.h @@ -92,7 +92,7 @@ extern bool cselib_dummy_expand_value_rtx_cb (rtx, bitmap, int, cselib_expand_callback, void *); extern rtx cselib_subst_to_values (rtx, machine_mode); extern rtx cselib_subst_to_values_from_insn (rtx, machine_mode, rtx_insn *); -extern void cselib_invalidate_rtx (rtx, const_rtx = NULL); +extern void cselib_invalidate_rtx (rtx); extern void cselib_reset_table (unsigned int); extern unsigned int cselib_get_next_uid (void); diff --git a/gcc/d/intrinsics.cc b/gcc/d/intrinsics.cc index 4bd321b2d..56eab522e 100644 --- a/gcc/d/intrinsics.cc +++ b/gcc/d/intrinsics.cc @@ -134,10 +134,7 @@ maybe_set_intrinsic (FuncDeclaration *decl) /* If there is no function body, then the implementation is always provided by the compiler. 
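The cselib_process_insn hunk above shows the pattern repeated throughout this series: instead of consulting the global call_used_regs table, code asks the specific call's ABI which registers it clobbers. A condensed sketch modelled on that hunk; the real code also checks partial clobbers via targetm.hard_regno_call_part_clobbered:

/* Sketch: invalidate everything the callee's ABI fully clobbers.  */
static void
invalidate_call_clobbers (rtx_insn *call_insn)
{
  function_abi callee_abi = insn_callee_abi (call_insn);
  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
    if (callee_abi.clobbers_full_reg_p (regno))
      cselib_invalidate_regno (regno, reg_raw_mode[regno]);
}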
*/ if (!decl->fbody) - { - DECL_BUILT_IN_CLASS (decl->csym) = BUILT_IN_FRONTEND; - DECL_FUNCTION_CODE (decl->csym) = (built_in_function) code; - } + set_decl_built_in_function (decl->csym, BUILT_IN_FRONTEND, code); /* Infer whether the intrinsic can be used for CTFE, let the front-end know that it can be evaluated at compile-time. */ diff --git a/gcc/dce.c b/gcc/dce.c index 68d3713b0..2894fa57b 100644 --- a/gcc/dce.c +++ b/gcc/dce.c @@ -174,7 +174,6 @@ deletable_insn_p (rtx_insn *insn, bool fast, bitmap arg_stores) return false; case CLOBBER: - case CLOBBER_HIGH: if (fast) { /* A CLOBBER of a dead pseudo register serves no purpose. @@ -244,10 +243,7 @@ static void mark_nonreg_stores_1 (rtx dest, const_rtx pattern, void *data) { if (GET_CODE (pattern) != CLOBBER && !REG_P (dest)) - { - gcc_checking_assert (GET_CODE (pattern) != CLOBBER_HIGH); - mark_insn ((rtx_insn *) data, true); - } + mark_insn ((rtx_insn *) data, true); } @@ -258,22 +254,19 @@ static void mark_nonreg_stores_2 (rtx dest, const_rtx pattern, void *data) { if (GET_CODE (pattern) != CLOBBER && !REG_P (dest)) - { - gcc_checking_assert (GET_CODE (pattern) != CLOBBER_HIGH); - mark_insn ((rtx_insn *) data, false); - } + mark_insn ((rtx_insn *) data, false); } -/* Mark INSN if BODY stores to a non-register destination. */ +/* Mark INSN if it stores to a non-register destination. */ static void -mark_nonreg_stores (rtx body, rtx_insn *insn, bool fast) +mark_nonreg_stores (rtx_insn *insn, bool fast) { if (fast) - note_stores (body, mark_nonreg_stores_1, insn); + note_stores (insn, mark_nonreg_stores_1, insn); else - note_stores (body, mark_nonreg_stores_2, insn); + note_stores (insn, mark_nonreg_stores_2, insn); } @@ -691,7 +684,7 @@ prescan_insns_for_dce (bool fast) if (arg_stores && bitmap_bit_p (arg_stores, INSN_UID (insn))) continue; if (deletable_insn_p (insn, fast, arg_stores)) - mark_nonreg_stores (PATTERN (insn), insn, fast); + mark_nonreg_stores (insn, fast); else mark_insn (insn, fast); } diff --git a/gcc/ddg.c b/gcc/ddg.c index 82554ed96..47a50d8ea 100644 --- a/gcc/ddg.c +++ b/gcc/ddg.c @@ -84,7 +84,7 @@ static bool mem_write_insn_p (rtx_insn *insn) { mem_ref_p = false; - note_stores (PATTERN (insn), mark_mem_store, NULL); + note_stores (insn, mark_mem_store, NULL); return mem_ref_p; } diff --git a/gcc/defaults.h b/gcc/defaults.h index b75342561..72d4fba11 100644 --- a/gcc/defaults.h +++ b/gcc/defaults.h @@ -1318,10 +1318,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #endif /* If a memory-to-memory move would take MOVE_RATIO or more simple - move-instruction sequences, we will do a movmem or libcall instead. */ + move-instruction sequences, we will do a cpymem or libcall instead. */ #ifndef MOVE_RATIO -#if defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti) +#if defined (HAVE_cpymemqi) || defined (HAVE_cpymemhi) || defined (HAVE_cpymemsi) || defined (HAVE_cpymemdi) || defined (HAVE_cpymemti) #define MOVE_RATIO(speed) 2 #else /* If we are optimizing for space (-Os), cut down the default move ratio. */ @@ -1342,7 +1342,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #endif /* If a memory set (to value other than zero) operation would take - SET_RATIO or more simple move-instruction sequences, we will do a movmem + SET_RATIO or more simple move-instruction sequences, we will do a setmem or libcall instead. 
*/ #ifndef SET_RATIO #define SET_RATIO(speed) MOVE_RATIO (speed) @@ -1459,4 +1459,18 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define DWARF_GNAT_ENCODINGS_DEFAULT DWARF_GNAT_ENCODINGS_GDB #endif +#ifndef USED_FOR_TARGET +/* Done this way to keep gengtype happy. */ +#if BITS_PER_UNIT == 8 +#define TARGET_UNIT uint8_t +#elif BITS_PER_UNIT == 16 +#define TARGET_UNIT uint16_t +#elif BITS_PER_UNIT == 32 +#define TARGET_UNIT uint32_t +#else +#error Unknown BITS_PER_UNIT +#endif +typedef TARGET_UNIT target_unit; +#endif + #endif /* ! GCC_DEFAULTS_H */ diff --git a/gcc/df-core.c b/gcc/df-core.c index b19ba289d..2181ff131 100644 --- a/gcc/df-core.c +++ b/gcc/df-core.c @@ -2052,7 +2052,7 @@ debug_regset (regset r) This is part of making a debugging dump. */ void -df_print_regset (FILE *file, bitmap r) +df_print_regset (FILE *file, const_bitmap r) { unsigned int i; bitmap_iterator bi; @@ -2077,7 +2077,7 @@ df_print_regset (FILE *file, bitmap r) debugging dump. */ void -df_print_word_regset (FILE *file, bitmap r) +df_print_word_regset (FILE *file, const_bitmap r) { unsigned int max_reg = max_reg_num (); diff --git a/gcc/df-problems.c b/gcc/df-problems.c index a9dfa6203..3c7aeceb2 100644 --- a/gcc/df-problems.c +++ b/gcc/df-problems.c @@ -388,7 +388,6 @@ df_rd_local_compute (bitmap all_blocks) { unsigned int bb_index; bitmap_iterator bi; - unsigned int regno; struct df_rd_problem_data *problem_data = (struct df_rd_problem_data *) df_rd->problem_data; bitmap sparse_invalidated = &problem_data->sparse_invalidated_by_call; @@ -405,10 +404,9 @@ df_rd_local_compute (bitmap all_blocks) } /* Set up the knockout bit vectors to be applied across EH_EDGES. */ - EXECUTE_IF_SET_IN_BITMAP (regs_invalidated_by_call_regset, 0, regno, bi) - { - if (! HARD_REGISTER_NUM_P (regno) - || !(df->changeable_flags & DF_NO_HARD_REGS)) + if (!(df->changeable_flags & DF_NO_HARD_REGS)) + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) + if (TEST_HARD_REG_BIT (regs_invalidated_by_call, regno)) { if (DF_DEFS_COUNT (regno) > DF_SPARSE_THRESHOLD) bitmap_set_bit (sparse_invalidated, regno); @@ -417,7 +415,6 @@ df_rd_local_compute (bitmap all_blocks) DF_DEFS_BEGIN (regno), DF_DEFS_COUNT (regno)); } - } bitmap_release (&seen_in_block); bitmap_release (&seen_in_insn); @@ -982,7 +979,10 @@ df_lr_confluence_n (edge e) /* ??? Abnormal call edges ignored for the moment, as this gets confused by sibling call edges, which crashes reg-stack. */ if (e->flags & EDGE_EH) - changed = bitmap_ior_and_compl_into (op1, op2, regs_invalidated_by_call_regset); + { + bitmap_view eh_kills (regs_invalidated_by_call); + changed = bitmap_ior_and_compl_into (op1, op2, eh_kills); + } else changed = bitmap_ior_into (op1, op2); @@ -4093,8 +4093,7 @@ can_move_insns_across (rtx_insn *from, rtx_insn *to, if (volatile_insn_p (PATTERN (insn))) return false; memrefs_in_across |= find_memory (insn); - note_stores (PATTERN (insn), find_memory_stores, - &mem_sets_in_across); + note_stores (insn, find_memory_stores, &mem_sets_in_across); /* This is used just to find sets of the stack pointer. */ memrefs_in_across |= mem_sets_in_across; trapping_insns_in_across |= may_trap_p (PATTERN (insn)); @@ -4173,7 +4172,7 @@ can_move_insns_across (rtx_insn *from, rtx_insn *to, { int mem_ref_flags = 0; int mem_set_flags = 0; - note_stores (PATTERN (insn), find_memory_stores, &mem_set_flags); + note_stores (insn, find_memory_stores, &mem_set_flags); mem_ref_flags = find_memory (insn); /* Catch sets of the stack pointer. 
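df_lr_confluence_n above now builds the EH kill set from the HARD_REG_SET regs_invalidated_by_call on the fly instead of keeping a parallel regset. The template argument of bitmap_view was dropped in this dump; assuming it is instantiated on HARD_REG_SET, the idiom is:

/* Sketch mirroring the hunk above; the template argument is assumed.
   bitmap_view exposes the register set as a read-only bitmap without
   copying it into an allocated regset.  */
static bool
eh_confluence_sketch (bitmap op1, const_bitmap op2)
{
  bitmap_view<HARD_REG_SET> eh_kills (regs_invalidated_by_call);
  return bitmap_ior_and_compl_into (op1, op2, eh_kills);
}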
*/ mem_ref_flags |= mem_set_flags; @@ -4635,8 +4634,10 @@ df_md_confluence_n (edge e) return false; if (e->flags & EDGE_EH) - return bitmap_ior_and_compl_into (op1, op2, - regs_invalidated_by_call_regset); + { + bitmap_view eh_kills (regs_invalidated_by_call); + return bitmap_ior_and_compl_into (op1, op2, eh_kills); + } else return bitmap_ior_into (op1, op2); } diff --git a/gcc/df-scan.c b/gcc/df-scan.c index 84c2e54c8..ea149c6cc 100644 --- a/gcc/df-scan.c +++ b/gcc/df-scan.c @@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see #include "emit-rtl.h" /* FIXME: Can go away once crtl is moved to rtl.h. */ #include "dumpfile.h" #include "calls.h" - +#include "function-abi.h" /* The set of hard registers in eliminables[i].from. */ @@ -312,7 +312,7 @@ df_scan_start_dump (FILE *file ATTRIBUTE_UNUSED) rtx_insn *insn; fprintf (file, ";; invalidated by call \t"); - df_print_regset (file, regs_invalidated_by_call_regset); + df_print_regset (file, bitmap_view (regs_invalidated_by_call)); fprintf (file, ";; hardware regs used \t"); df_print_regset (file, &df->hardware_regs_used); fprintf (file, ";; regular block artificial uses \t"); @@ -2773,7 +2773,6 @@ df_find_hard_reg_defs (rtx x, HARD_REG_SET *defs) break; case CLOBBER: - case CLOBBER_HIGH: df_find_hard_reg_defs_1 (XEXP (x, 0), defs); break; @@ -2833,10 +2832,6 @@ df_uses_record (struct df_collection_rec *collection_rec, /* If we're clobbering a REG then we have a def so ignore. */ return; - case CLOBBER_HIGH: - gcc_assert (REG_P (XEXP (x, 0))); - return; - case MEM: df_uses_record (collection_rec, &XEXP (x, 0), DF_REF_REG_MEM_LOAD, @@ -3087,13 +3082,11 @@ df_get_call_refs (struct df_collection_rec *collection_rec, bool is_sibling_call; unsigned int i; HARD_REG_SET defs_generated; - HARD_REG_SET fn_reg_set_usage; CLEAR_HARD_REG_SET (defs_generated); df_find_hard_reg_defs (PATTERN (insn_info->insn), &defs_generated); is_sibling_call = SIBLING_CALL_P (insn_info->insn); - get_call_reg_set_usage (insn_info->insn, &fn_reg_set_usage, - regs_invalidated_by_call); + function_abi callee_abi = insn_callee_abi (insn_info->insn); for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) { @@ -3117,7 +3110,7 @@ df_get_call_refs (struct df_collection_rec *collection_rec, NULL, bb, insn_info, DF_REF_REG_DEF, flags); } } - else if (TEST_HARD_REG_BIT (fn_reg_set_usage, i) + else if (callee_abi.clobbers_full_reg_p (i) /* no clobbers for regs that are the result of the call */ && !TEST_HARD_REG_BIT (defs_generated, i) && (!is_sibling_call @@ -3133,7 +3126,6 @@ df_get_call_refs (struct df_collection_rec *collection_rec, for (note = CALL_INSN_FUNCTION_USAGE (insn_info->insn); note; note = XEXP (note, 1)) { - gcc_assert (GET_CODE (XEXP (note, 0)) != CLOBBER_HIGH); if (GET_CODE (XEXP (note, 0)) == USE) df_uses_record (collection_rec, &XEXP (XEXP (note, 0), 0), DF_REF_REG_USE, bb, insn_info, flags); @@ -3499,7 +3491,9 @@ df_get_entry_block_def_set (bitmap entry_block_defs) /* Defs for the callee saved registers are inserted so that the pushes have some defining location. */ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if ((call_used_regs[i] == 0) && (df_regs_ever_live_p (i))) + if (!crtl->abi->clobbers_full_reg_p (i) + && !fixed_regs[i] + && df_regs_ever_live_p (i)) bitmap_set_bit (entry_block_defs, i); } @@ -3682,8 +3676,9 @@ df_get_exit_block_use_set (bitmap exit_block_uses) { /* Mark all call-saved registers that we actually used. 
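The df-scan.c hunks above replace raw call_used_regs and regs_invalidated_by_call tests with queries on the current function's ABI (crtl->abi). A hedged sketch of the callee-saved test those hunks encode:

/* Sketch: under the new scheme a hard register counts as callee-saved for
   the current function if its ABI does not clobber it and the register is
   not fixed.  */
static bool
callee_saved_for_current_function_p (unsigned int regno)
{
  return (!crtl->abi->clobbers_full_reg_p (regno)
          && !fixed_regs[regno]);
}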
*/ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (df_regs_ever_live_p (i) && !LOCAL_REGNO (i) - && !TEST_HARD_REG_BIT (regs_invalidated_by_call, i)) + if (df_regs_ever_live_p (i) + && !LOCAL_REGNO (i) + && !crtl->abi->clobbers_full_reg_p (i)) bitmap_set_bit (exit_block_uses, i); } diff --git a/gcc/df.h b/gcc/df.h index d76d31baa..241812235 100644 --- a/gcc/df.h +++ b/gcc/df.h @@ -984,8 +984,8 @@ extern bool df_reg_defined (rtx_insn *, rtx); extern df_ref df_find_use (rtx_insn *, rtx); extern bool df_reg_used (rtx_insn *, rtx); extern void df_worklist_dataflow (struct dataflow *,bitmap, int *, int); -extern void df_print_regset (FILE *file, bitmap r); -extern void df_print_word_regset (FILE *file, bitmap r); +extern void df_print_regset (FILE *file, const_bitmap r); +extern void df_print_word_regset (FILE *file, const_bitmap r); extern void df_dump (FILE *); extern void df_dump_region (FILE *); extern void df_dump_start (FILE *); diff --git a/gcc/diagnostic-color.c b/gcc/diagnostic-color.c index 69e759ff6..abc919f63 100644 --- a/gcc/diagnostic-color.c +++ b/gcc/diagnostic-color.c @@ -19,6 +19,7 @@ #include "config.h" #include "system.h" #include "diagnostic-color.h" +#include "diagnostic-url.h" #ifdef __MINGW32__ # include @@ -236,3 +237,22 @@ colorize_init (diagnostic_color_rule_t rule) gcc_unreachable (); } } + +/* Determine if URLs should be enabled, based on RULE. + This reuses the logic for colorization. */ + +bool +diagnostic_urls_enabled_p (diagnostic_url_rule_t rule) +{ + switch (rule) + { + case DIAGNOSTICS_URL_NO: + return false; + case DIAGNOSTICS_URL_YES: + return true; + case DIAGNOSTICS_URL_AUTO: + return should_colorize (); + default: + gcc_unreachable (); + } +} diff --git a/gcc/diagnostic-url.h b/gcc/diagnostic-url.h new file mode 100644 index 000000000..ce0de459f --- /dev/null +++ b/gcc/diagnostic-url.h @@ -0,0 +1,36 @@ +/* Copyright (C) 2019 Free Software Foundation, Inc. + Contributed by David Malcolm . + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_DIAGNOSTIC_URL_H +#define GCC_DIAGNOSTIC_URL_H + +/* Whether to add URLs to diagnostics: + - DIAGNOSTICS_URL_NO: never + - DIAGNOSTICS_URL_YES: always + - DIAGNOSTICS_URL_AUTO: depending on the output stream. */ +typedef enum +{ + DIAGNOSTICS_URL_NO = 0, + DIAGNOSTICS_URL_YES = 1, + DIAGNOSTICS_URL_AUTO = 2 +} diagnostic_url_rule_t; + +extern bool diagnostic_urls_enabled_p (diagnostic_url_rule_t); + +#endif /* ! GCC_DIAGNOSTIC_URL_H */ diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c index be6b65722..a9acda7cc 100644 --- a/gcc/diagnostic.c +++ b/gcc/diagnostic.c @@ -31,6 +31,7 @@ along with GCC; see the file COPYING3. 
If not see #include "backtrace.h" #include "diagnostic.h" #include "diagnostic-color.h" +#include "diagnostic-url.h" #include "edit-context.h" #include "selftest.h" #include "selftest-diagnostic.h" @@ -238,6 +239,18 @@ diagnostic_color_init (diagnostic_context *context, int value /*= -1 */) = colorize_init ((diagnostic_color_rule_t) value); } +/* Initialize URL support within CONTEXT based on VALUE, handling "auto". */ + +void +diagnostic_urls_init (diagnostic_context *context, int value /*= -1 */) +{ + if (value < 0) + value = DIAGNOSTICS_COLOR_DEFAULT; + + context->printer->show_urls + = diagnostic_urls_enabled_p ((diagnostic_url_rule_t) value); +} + /* Do any cleaning up required after the last diagnostic is emitted. */ void diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h index 46c3b50a5..5daf4f288 100644 --- a/gcc/diagnostic.h +++ b/gcc/diagnostic.h @@ -328,6 +328,7 @@ diagnostic_override_option_index (diagnostic_info *info, int optidx) /* Diagnostic related functions. */ extern void diagnostic_initialize (diagnostic_context *, int); extern void diagnostic_color_init (diagnostic_context *, int value = -1); +extern void diagnostic_urls_init (diagnostic_context *, int value = -1); extern void diagnostic_finish (diagnostic_context *); extern void diagnostic_report_current_module (diagnostic_context *, location_t); extern void diagnostic_show_locus (diagnostic_context *, diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 9c87792ff..e366ab923 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -271,6 +271,7 @@ Objective-C and Objective-C++ Dialects}. @gccoptlist{-fmessage-length=@var{n} @gol -fdiagnostics-show-location=@r{[}once@r{|}every-line@r{]} @gol -fdiagnostics-color=@r{[}auto@r{|}never@r{|}always@r{]} @gol +-fdiagnostics-urls=@r{[}auto@r{|}never@r{|}always@r{]} @gol -fdiagnostics-format=@r{[}text@r{|}json@r{]} @gol -fno-diagnostics-show-option -fno-diagnostics-show-caret @gol -fno-diagnostics-show-labels -fno-diagnostics-show-line-numbers @gol @@ -403,8 +404,7 @@ Objective-C and Objective-C++ Dialects}. -fallow-store-data-races @gol -fassociative-math -fauto-profile -fauto-profile[=@var{path}] @gol -fauto-inc-dec -fbranch-probabilities @gol --fbranch-target-load-optimize -fbranch-target-load-optimize2 @gol --fbtr-bb-exclusive -fcaller-saves @gol +-fcaller-saves @gol -fcombine-stack-adjustments -fconserve-stack @gol -fcompare-elim -fcprop-registers -fcrossjumping @gol -fcse-follow-jumps -fcse-skip-blocks -fcx-fortran-rules @gol @@ -636,11 +636,13 @@ Objective-C and Objective-C++ Dialects}. -mlow-precision-recip-sqrt -mlow-precision-sqrt -mlow-precision-div @gol -mpc-relative-literal-loads @gol -msign-return-address=@var{scope} @gol --mbranch-protection=@var{none}|@var{standard}|@var{pac-ret}[+@var{leaf}]|@var{bti} @gol +-mbranch-protection=@var{none}|@var{standard}|@var{pac-ret}[+@var{leaf} ++@var{b-key}]|@var{bti} @gol -march=@var{name} -mcpu=@var{name} -mtune=@var{name} @gol -moverride=@var{string} -mverbose-cost-dump @gol -mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{sysreg} @gol --mstack-protector-guard-offset=@var{offset} -mtrack-speculation } +-mstack-protector-guard-offset=@var{offset} -mtrack-speculation @gol +-moutline-atomics } @emph{Adapteva Epiphany Options} @gccoptlist{-mhalf-reg-file -mprefer-short-insn-regs @gol @@ -3885,6 +3887,18 @@ SGR substring for highlighting mismatching types within template arguments in the C++ frontend. 
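diagnostic_urls_init above mirrors the existing diagnostic_color_init wiring: it resolves the "auto" setting through the same should_colorize logic and records the result in the pretty-printer's show_urls flag. A hedged sketch of how a front end or driver would hook it up; the ordering and the zero option count are illustrative:

/* Hypothetical initialization sequence.  */
static void
init_diagnostics_sketch (void)
{
  diagnostic_initialize (global_dc, 0);
  diagnostic_color_init (global_dc);   /* existing colour handling */
  diagnostic_urls_init (global_dc);    /* new: sets printer->show_urls */
}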
@end table +@item -fdiagnostics-urls[=@var{WHEN}] +@opindex fdiagnostics-urls +@cindex urls +Use escape sequences to embed URLs in diagnostics. For example, when +@option{-fdiagnostics-show-option} emits text showing the command-line +option controlling a diagnostic, embed a URL for documentation of that +option. + +@var{WHEN} is @samp{never}, @samp{always}, or @samp{auto}. +The default is @samp{auto}, which means to use URL escape sequences only +when the standard error is a terminal. + @item -fno-diagnostics-show-option @opindex fno-diagnostics-show-option @opindex fdiagnostics-show-option @@ -8295,6 +8309,7 @@ also turns on the following optimization flags: -ffinite-loops @gol -fgcse -fgcse-lm @gol -fhoist-adjacent-loads @gol +-finline-functions @gol -finline-small-functions @gol -findirect-inlining @gol -fipa-bit-cp -fipa-cp -fipa-icf @gol @@ -8328,7 +8343,6 @@ by @option{-O2} and also turns on the following optimization flags: @c Please keep the following list alphabetized! @gccoptlist{-fgcse-after-reload @gol --finline-functions @gol -fipa-cp-clone -floop-interchange @gol -floop-unroll-and-jam @gol @@ -8386,10 +8400,10 @@ no effect. Otherwise @option{-Og} enables all @option{-O1} optimization flags except for those that may interfere with debugging: @gccoptlist{-fbranch-count-reg -fdelayed-branch @gol --fif-conversion -fif-conversion2 @gol +-fdse -fif-conversion -fif-conversion2 @gol -finline-functions-called-once @gol -fmove-loop-invariants -fssa-phiopt @gol --ftree-bit-ccp -ftree-pta -ftree-sra} +-ftree-bit-ccp -ftree-dse -ftree-pta -ftree-sra} @end table @@ -8508,7 +8522,7 @@ If all calls to a given function are integrated, and the function is declared @code{static}, then the function is normally not output as assembler code in its own right. -Enabled at levels @option{-O3}, @option{-Os}. Also enabled +Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. Also enabled by @option{-fprofile-use} and @option{-fauto-profile}. @item -finline-functions-called-once @@ -10986,24 +11000,6 @@ locations inside a translation unit since the locations are unknown until link time. An example of such an optimization is relaxing calls to short call instructions. -@item -fbranch-target-load-optimize -@opindex fbranch-target-load-optimize -Perform branch target register load optimization before prologue / epilogue -threading. -The use of target registers can typically be exposed only during reload, -thus hoisting loads out of loops and doing inter-block scheduling needs -a separate optimization pass. - -@item -fbranch-target-load-optimize2 -@opindex fbranch-target-load-optimize2 -Perform branch target register load optimization after prologue / epilogue -threading. - -@item -fbtr-bb-exclusive -@opindex fbtr-bb-exclusive -When performing branch target register load optimization, don't reuse -branch target registers within any basic block. - @item -fstdarg-opt @opindex fstdarg-opt Optimize the prologue of variadic argument functions with respect to usage of @@ -11154,19 +11150,30 @@ when modulo scheduling a loop. Larger values can exponentially increase compilation time. @item max-inline-insns-single -Several parameters control the tree inliner used in GCC@. -This number sets the maximum number of instructions (counted in GCC's -internal representation) in a single function that the tree inliner -considers for inlining. This only affects functions declared -inline and methods implemented in a class declaration (C++). 
+@item max-inline-insns-single-O2 +Several parameters control the tree inliner used in GCC@. This number sets the +maximum number of instructions (counted in GCC's internal representation) in a +single function that the tree inliner considers for inlining. This only +affects functions declared inline and methods implemented in a class +declaration (C++). + +For functions compiled with optimization levels +@option{-O3} and @option{-Ofast} parameter @option{max-inline-insns-single} is +applied. In other cases @option{max-inline-insns-single-O2} is applied. + @item max-inline-insns-auto +@item max-inline-insns-auto-O2 When you use @option{-finline-functions} (included in @option{-O3}), a lot of functions that would otherwise not be considered for inlining by the compiler are investigated. To those functions, a different (more restrictive) limit compared to functions declared inline can be applied. +For functions compiled with optimization levels +@option{-O3} and @option{-Ofast} parameter @option{max-inline-insns-auto} is +applied. In other cases @option{max-inline-insns-auto-O2} is applied. + @item max-inline-insns-small This is bound applied to calls which are considered relevant with @option{-finline-small-functions}. @@ -11189,11 +11196,16 @@ Same as @option{--param uninlined-function-insns} and @option{--param uninlined-function-time} but applied to function thunks @item inline-min-speedup +@item inline-min-speedup-O2 When estimated performance improvement of caller + callee runtime exceeds this threshold (in percent), the function can be inlined regardless of the limit on @option{--param max-inline-insns-single} and @option{--param max-inline-insns-auto}. +For functions compiled with optimization levels +@option{-O3} and @option{-Ofast} parameter @option{inline-min-speedup} is +applied. In other cases @option{inline-min-speedup-O2} is applied. + @item large-function-insns The limit specifying really large functions. For functions larger than this limit after inlining, inlining is constrained by @@ -11271,9 +11283,14 @@ via a given call expression. This parameter limits inlining only to call expressions whose probability exceeds the given threshold (in percents). @item early-inlining-insns +@item early-inlining-insns-O2 Specify growth that the early inliner can make. In effect it increases the amount of inlining for code having a large abstraction penalty. +For functions compiled with optimization levels +@option{-O3} and @option{-Ofast} parameter @option{early-inlining-insns} is +applied. In other cases @option{early-inlining-insns-O2} is applied. + @item max-early-inliner-iterations Limit of iterations of the early inliner. This basically bounds the number of nested indirect calls the early inliner can resolve. @@ -15816,31 +15833,38 @@ be used by the compiler when expanding calls to @code{__builtin_speculation_safe_copy} to permit a more efficient code sequence to be generated. +@item -moutline-atomics +@itemx -mno-outline-atomics +Enable or disable calls to out-of-line helpers to implement atomic operations. +These helpers will, at runtime, determine if the LSE instructions from +ARMv8.1-A can be used; if not, they will use the load/store-exclusive +instructions that are present in the base ARMv8.0 ISA. + +This option is only applicable when compiling for the base ARMv8.0 +instruction set. If using a later revision, e.g. @option{-march=armv8.1-a} +or @option{-march=armv8-a+lse}, the ARMv8.1-Atomics instructions will be +used directly. 
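To make the -moutline-atomics description above concrete: with the option enabled on a base Armv8.0 target, a compiler-generated atomic sequence becomes a call into a libgcc helper that tests for LSE at run time. A C-level illustration; the helper name is an assumption about the libgcc naming scheme, not quoted from the patch:

/* Compiled with -moutline-atomics for base Armv8.0, this is expected to
   call an out-of-line helper (something like __aarch64_cas4_acq_rel)
   instead of emitting LDXR/STXR or CASAL inline.  */
int
try_update (int *p, int expected, int desired)
{
  return __atomic_compare_exchange_n (p, &expected, desired,
                                      /*weak=*/0, __ATOMIC_ACQ_REL,
                                      __ATOMIC_ACQUIRE);
}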
The same applies when using @option{-mcpu=} when the +selected cpu supports the @samp{lse} feature. + @item -march=@var{name} @opindex march Specify the name of the target architecture and, optionally, one or more feature modifiers. This option has the form @option{-march=@var{arch}@r{@{}+@r{[}no@r{]}@var{feature}@r{@}*}}. -The permissible values for @var{arch} are @samp{armv8-a}, -@samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a}, -@samp{armv8.5-a} or @var{native}. - -The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler -support for the ARMv8.5-A architecture extensions. - -The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler -support for the ARMv8.4-A architecture extensions. - -The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler -support for the ARMv8.3-A architecture extensions. - -The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler -support for the ARMv8.2-A architecture extensions. - -The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler -support for the ARMv8.1-A architecture extension. In particular, it -enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features. +The table below summarizes the permissible values for @var{arch} +and the features that they enable by default: + +@multitable @columnfractions 0.20 0.20 0.60 +@headitem @var{arch} value @tab Architecture @tab Includes by default +@item @samp{armv8-a} @tab Armv8-A @tab @samp{+fp}, @samp{+simd} +@item @samp{armv8.1-a} @tab Armv8.1-A @tab @samp{armv8-a}, @samp{+crc}, @samp{+lse}, @samp{+rdma} +@item @samp{armv8.2-a} @tab Armv8.2-A @tab @samp{armv8.1-a} +@item @samp{armv8.3-a} @tab Armv8.3-A @tab @samp{armv8.2-a} +@item @samp{armv8.4-a} @tab Armv8.4-A @tab @samp{armv8.3-a}, @samp{+fp16fml}, @samp{+dotprod} +@item @samp{armv8.5-a} @tab Armv8.5-A @tab @samp{armv8.4-a}, @samp{+sb}, @samp{+ssbs}, @samp{+predres} +@item @samp{armv8.6-a} @tab Armv8.6-A @tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm} +@end multitable The value @samp{native} is available on native AArch64 GNU/Linux and causes the compiler to pick the architecture of the host system. This @@ -15864,7 +15888,9 @@ Specify the name of the target processor for which GCC should tune the performance of the code. Permissible values for this option are: @samp{generic}, @samp{cortex-a35}, @samp{cortex-a53}, @samp{cortex-a55}, @samp{cortex-a57}, @samp{cortex-a72}, @samp{cortex-a73}, @samp{cortex-a75}, -@samp{cortex-a76}, @samp{ares}, @samp{exynos-m1}, @samp{emag}, @samp{falkor}, +@samp{cortex-a76}, @samp{cortex-a76ae}, @samp{cortex-a77}, +@samp{cortex-a65}, @samp{cortex-a65ae}, @samp{cortex-a34}, +@samp{ares}, @samp{exynos-m1}, @samp{emag}, @samp{falkor}, @samp{neoverse-e1},@samp{neoverse-n1},@samp{qdf24xx}, @samp{saphira}, @samp{phecda}, @samp{xgene1}, @samp{vulcan}, @samp{octeontx}, @samp{octeontx81}, @samp{octeontx83}, @samp{thunderx}, @samp{thunderxt88}, @@ -15941,7 +15967,7 @@ functions, and @samp{all}, which enables pointer signing for all functions. The default value is @samp{none}. This option has been deprecated by -mbranch-protection. -@item -mbranch-protection=@var{none}|@var{standard}|@var{pac-ret}[+@var{leaf}]|@var{bti} +@item -mbranch-protection=@var{none}|@var{standard}|@var{pac-ret}[+@var{leaf}+@var{b-key}]|@var{bti} @opindex mbranch-protection Select the branch protection features to use. @samp{none} is the default and turns off all types of branch protection. @@ -15952,7 +15978,8 @@ level. 
level: signing functions that save the return address to memory (non-leaf functions will practically always do this) using the a-key. The optional argument @samp{leaf} can be used to extend the signing to include leaf -functions. +functions. The optional argument @samp{b-key} can be used to sign the functions +with the B-key instead of the A-key. @samp{bti} turns on branch target identification mechanism. @item -msve-vector-bits=@var{bits} @@ -16054,6 +16081,37 @@ Enable the Armv8-a Execution and Data Prediction Restriction instructions. This option is only to enable the extension at the assembler level and does not affect code generation. This option is enabled by default for @option{-march=armv8.5-a}. +@item sve2 +Enable the Armv8-a Scalable Vector Extension 2. This also enables SVE +instructions. +@item sve2-bitperm +Enable SVE2 bitperm instructions. This also enables SVE2 instructions. +@item sve2-sm4 +Enable SVE2 sm4 instructions. This also enables SVE2 instructions. +@item sve2-aes +Enable SVE2 aes instructions. This also enables SVE2 instructions. +@item sve2-sha3 +Enable SVE2 sha3 instructions. This also enables SVE2 instructions. +@item tme +Enable the Transactional Memory Extension. +@item i8mm +Enable 8-bit Integer Matrix Multiply instructions. This also enables +Advanced SIMD and floating-point instructions. This option is enabled by +default for @option{-march=armv8.6-a}. Use of this option with architectures +prior to Armv8.2-A is not supported. +@item f32mm +Enable 32-bit Floating point Matrix Multiply instructions. This also enables +SVE instructions. Use of this option with architectures prior to Armv8.2-A is +not supported. +@item f64mm +Enable 64-bit Floating point Matrix Multiply instructions. This also enables +SVE instructions. Use of this option with architectures prior to Armv8.2-A is +not supported. +@item bf16 +Enable brain half-precision floating-point instructions. This also enables +Advanced SIMD and floating-point instructions. This option is enabled by +default for @option{-march=armv8.6-a}. Use of this option with architectures +prior to Armv8.2-A is not supported. @end table @@ -28567,8 +28625,9 @@ By default GCC inlines string operations only when the destination is known to be aligned to least a 4-byte boundary. This enables more inlining and increases code size, but may improve performance of code that depends on fast -@code{memcpy}, @code{strlen}, -and @code{memset} for short lengths. +@code{memcpy} and @code{memset} for short lengths. +The option enables inline expansion of @code{strlen} for all +pointer alignments. @item -minline-stringops-dynamically @opindex minline-stringops-dynamically diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 50e13124b..75482d7a2 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -1748,6 +1748,12 @@ The stack pointer register (@code{SP}) @item w Floating point register, Advanced SIMD vector register or SVE vector register +@item x +Like @code{w}, but restricted to registers 0 to 15 inclusive. + +@item y +Like @code{w}, but restricted to registers 0 to 7 inclusive. + @item Upl One of the low eight SVE predicate registers (@code{P0} to @code{P7}) @@ -5470,6 +5476,11 @@ mode @var{m} and the scalars have the mode appropriate for one element of @var{m}. The operation is strictly in-order: there is no reassociation. 
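The fold_left_plus documentation above, and the masked variant added below it, describe a strictly in-order reduction. The scalar loop here shows the semantics the optab must preserve; the mask parameter is only there to illustrate the masked form, and dropping the condition gives plain fold_left_plus:

/* In-order reduction: no reassociation is allowed, so the vector pattern
   must produce the same result as this scalar loop.  */
double
mask_fold_left_plus (double res, const double *x,
                     const unsigned char *mask, int n)
{
  for (int i = 0; i < n; i++)
    if (mask[i])
      res += x[i];
  return res;
}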
+@cindex @code{mask_fold_left_plus_@var{m}} instruction pattern +@item @code{mask_fold_left_plus_@var{m}} +Like @samp{fold_left_plus_@var{m}}, but takes an additional mask operand +(operand 3) that specifies which elements of the source vector should be added. + @cindex @code{sdot_prod@var{m}} instruction pattern @item @samp{sdot_prod@var{m}} @cindex @code{udot_prod@var{m}} instruction pattern @@ -5499,6 +5510,44 @@ operand 1. Add operand 1 to operand 2 and place the widened result in operand 0. (This is used express accumulation of elements into an accumulator of a wider mode.) +@cindex @code{smulhs@var{m3}} instruction pattern +@item @samp{smulhs@var{m3}} +@cindex @code{umulhs@var{m3}} instruction pattern +@itemx @samp{umulhs@var{m3}} +Signed/unsigned multiply high with scale. This is equivalent to the C code: +@smallexample +narrow op0, op1, op2; +@dots{} +op0 = (narrow) (((wide) op1 * (wide) op2) >> (N / 2 - 1)); +@end smallexample +where the sign of @samp{narrow} determines whether this is a signed +or unsigned operation, and @var{N} is the size of @samp{wide} in bits. + +@cindex @code{smulhrs@var{m3}} instruction pattern +@item @samp{smulhrs@var{m3}} +@cindex @code{umulhrs@var{m3}} instruction pattern +@itemx @samp{umulhrs@var{m3}} +Signed/unsigned multiply high with round and scale. This is +equivalent to the C code: +@smallexample +narrow op0, op1, op2; +@dots{} +op0 = (narrow) (((((wide) op1 * (wide) op2) >> (N / 2 - 2)) + 1) >> 1); +@end smallexample +where the sign of @samp{narrow} determines whether this is a signed +or unsigned operation, and @var{N} is the size of @samp{wide} in bits. + +@cindex @code{sdiv_pow2@var{m3}} instruction pattern +@item @samp{sdiv_pow2@var{m3}} +@cindex @code{sdiv_pow2@var{m3}} instruction pattern +@itemx @samp{sdiv_pow2@var{m3}} +Signed division by power-of-2 immediate. Equivalent to: +@smallexample +signed op0, op1; +@dots{} +op0 = op1 / (1 << imm); +@end smallexample + @cindex @code{vec_shl_insert_@var{m}} instruction pattern @item @samp{vec_shl_insert_@var{m}} Shift the elements in vector input operand 1 left one element (i.e.@: @@ -6240,13 +6289,13 @@ This pattern is not allowed to @code{FAIL}. @item @samp{one_cmpl@var{m}2} Store the bitwise-complement of operand 1 into operand 0. -@cindex @code{movmem@var{m}} instruction pattern -@item @samp{movmem@var{m}} -Block move instruction. The destination and source blocks of memory +@cindex @code{cpymem@var{m}} instruction pattern +@item @samp{cpymem@var{m}} +Block copy instruction. The destination and source blocks of memory are the first two operands, and both are @code{mem:BLK}s with an address in mode @code{Pmode}. -The number of bytes to move is the third operand, in mode @var{m}. +The number of bytes to copy is the third operand, in mode @var{m}. Usually, you specify @code{Pmode} for @var{m}. However, if you can generate better code knowing the range of valid lengths is smaller than those representable in a full Pmode pointer, you should provide @@ -6266,14 +6315,16 @@ in a way that the blocks are not required to be aligned according to it in all cases. This expected alignment is also in bytes, just like operand 4. Expected size, when unknown, is set to @code{(const_int -1)}. -Descriptions of multiple @code{movmem@var{m}} patterns can only be +Descriptions of multiple @code{cpymem@var{m}} patterns can only be beneficial if the patterns for smaller modes have fewer restrictions on their first, second and fourth operands. 
Note that the mode @var{m} -in @code{movmem@var{m}} does not impose any restriction on the mode of -individually moved data units in the block. +in @code{cpymem@var{m}} does not impose any restriction on the mode of +individually copied data units in the block. -These patterns need not give special consideration to the possibility -that the source and destination strings might overlap. +The @code{cpymem@var{m}} patterns need not give special consideration +to the possibility that the source and destination strings might +overlap. These patterns are used to do inline expansion of +@code{__builtin_memcpy}. @cindex @code{movstr} instruction pattern @item @samp{movstr} @@ -6294,7 +6345,7 @@ given as a @code{mem:BLK} whose address is in mode @code{Pmode}. The number of bytes to set is the second operand, in mode @var{m}. The value to initialize the memory with is the third operand. Targets that only support the clearing of memory should reject any value that is not the constant 0. See -@samp{movmem@var{m}} for a discussion of the choice of mode. +@samp{cpymem@var{m}} for a discussion of the choice of mode. The fourth operand is the known alignment of the destination, in the form of a @code{const_int} rtx. Thus, if the compiler knows that the @@ -6312,13 +6363,13 @@ Operand 9 is the probable maximal size (i.e.@: we cannot rely on it for correctness, but it can be used for choosing proper code sequence for a given size). -The use for multiple @code{setmem@var{m}} is as for @code{movmem@var{m}}. +The use for multiple @code{setmem@var{m}} is as for @code{cpymem@var{m}}. @cindex @code{cmpstrn@var{m}} instruction pattern @item @samp{cmpstrn@var{m}} String compare instruction, with five operands. Operand 0 is the output; it has mode @var{m}. The remaining four operands are like the operands -of @samp{movmem@var{m}}. The two memory blocks specified are compared +of @samp{cpymem@var{m}}. The two memory blocks specified are compared byte by byte in lexicographic order starting at the beginning of each string. The instruction is not allowed to prefetch more than one byte at a time since either string may end in the first byte and reading past @@ -8537,6 +8588,119 @@ functionality as two separate @code{define_insn} and @code{define_split} patterns. It exists for compactness, and as a maintenance tool to prevent having to ensure the two patterns' templates match. +@findex define_insn_and_rewrite +It is sometimes useful to have a @code{define_insn_and_split} +that replaces specific operands of an instruction but leaves the +rest of the instruction pattern unchanged. You can do this directly +with a @code{define_insn_and_split}, but it requires a +@var{new-insn-pattern-1} that repeats most of the original @var{insn-pattern}. +There is also the complication that an implicit @code{parallel} in +@var{insn-pattern} must become an explicit @code{parallel} in +@var{new-insn-pattern-1}, which is easy to overlook. +A simpler alternative is to use @code{define_insn_and_rewrite}, which +is a form of @code{define_insn_and_split} that automatically generates +@var{new-insn-pattern-1} by replacing each @code{match_operand} +in @var{insn-pattern} with a corresponding @code{match_dup}, and each +@code{match_operator} in the pattern with a corresponding @code{match_op_dup}. 
+The arguments are otherwise identical to @code{define_insn_and_split}: + +@smallexample +(define_insn_and_rewrite + [@var{insn-pattern}] + "@var{condition}" + "@var{output-template}" + "@var{split-condition}" + "@var{preparation-statements}" + [@var{insn-attributes}]) +@end smallexample + +The @code{match_dup}s and @code{match_op_dup}s in the new +instruction pattern use any new operand values that the +@var{preparation-statements} store in the @code{operands} array, +as for a normal @code{define_insn_and_split}. @var{preparation-statements} +can also emit additional instructions before the new instruction. +They can even emit an entirely different sequence of instructions and +use @code{DONE} to avoid emitting a new form of the original +instruction. + +The split in a @code{define_insn_and_rewrite} is only intended +to apply to existing instructions that match @var{insn-pattern}. +@var{split-condition} must therefore start with @code{&&}, +so that the split condition applies on top of @var{condition}. + +Here is an example from the AArch64 SVE port, in which operand 1 is +known to be equivalent to an all-true constant and isn't used by the +output template: + +@smallexample +(define_insn_and_rewrite "*while_ult_cc" + [(set (reg:CC CC_REGNUM) + (compare:CC + (unspec:SI [(match_operand:PRED_ALL 1) + (unspec:PRED_ALL + [(match_operand:GPI 2 "aarch64_reg_or_zero" "rZ") + (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")] + UNSPEC_WHILE_LO)] + UNSPEC_PTEST_PTRUE) + (const_int 0))) + (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL [(match_dup 2) + (match_dup 3)] + UNSPEC_WHILE_LO))] + "TARGET_SVE" + "whilelo\t%0., %2, %3" + ;; Force the compiler to drop the unused predicate operand, so that we + ;; don't have an unnecessary PTRUE. + "&& !CONSTANT_P (operands[1])" + @{ + operands[1] = CONSTM1_RTX (mode); + @} +) +@end smallexample + +The splitter in this case simply replaces operand 1 with the constant +value that it is known to have. The equivalent @code{define_insn_and_split} +would be: + +@smallexample +(define_insn_and_split "*while_ult_cc" + [(set (reg:CC CC_REGNUM) + (compare:CC + (unspec:SI [(match_operand:PRED_ALL 1) + (unspec:PRED_ALL + [(match_operand:GPI 2 "aarch64_reg_or_zero" "rZ") + (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")] + UNSPEC_WHILE_LO)] + UNSPEC_PTEST_PTRUE) + (const_int 0))) + (set (match_operand:PRED_ALL 0 "register_operand" "=Upa") + (unspec:PRED_ALL [(match_dup 2) + (match_dup 3)] + UNSPEC_WHILE_LO))] + "TARGET_SVE" + "whilelo\t%0., %2, %3" + ;; Force the compiler to drop the unused predicate operand, so that we + ;; don't have an unnecessary PTRUE. + "&& !CONSTANT_P (operands[1])" + [(parallel + [(set (reg:CC CC_REGNUM) + (compare:CC + (unspec:SI [(match_dup 1) + (unspec:PRED_ALL [(match_dup 2) + (match_dup 3)] + UNSPEC_WHILE_LO)] + UNSPEC_PTEST_PTRUE) + (const_int 0))) + (set (match_dup 0) + (unspec:PRED_ALL [(match_dup 2) + (match_dup 3)] + UNSPEC_WHILE_LO))])] + @{ + operands[1] = CONSTM1_RTX (mode); + @} +) +@end smallexample + @end ifset @ifset INTERNALS @node Including Patterns @@ -10979,6 +11143,27 @@ Other attributes are defined using: (define_code_attr @var{name} [(@var{code1} "@var{value1}") @dots{} (@var{coden} "@var{valuen}")]) @end smallexample +Instruction patterns can use code attributes as rtx codes, which can be +useful if two sets of codes act in tandem. 
For example, the following +@code{define_insn} defines two patterns, one calculating a signed absolute +difference and another calculating an unsigned absolute difference: + +@smallexample +(define_code_iterator any_max [smax umax]) +(define_code_attr paired_min [(smax "smin") (umax "umin")]) +(define_insn @dots{} + [(set (match_operand:SI 0 @dots{}) + (minus:SI (any_max:SI (match_operand:SI 1 @dots{}) + (match_operand:SI 2 @dots{})) + (:SI (match_dup 1) (match_dup 2))))] + @dots{}) +@end smallexample + +The signed version of the instruction uses @code{smax} and @code{smin} +while the unsigned version uses @code{umax} and @code{umin}. There +are no versions that pair @code{smax} with @code{umin} or @code{umax} +with @code{smin}. + Here's an example of code iterators in action, taken from the MIPS port: @smallexample @@ -11249,4 +11434,13 @@ name and same types of iterator. For example: would produce a single set of functions that handles both @code{INTEGER_MODES} and @code{FLOAT_MODES}. +It is also possible for these @samp{@@} patterns to have different +numbers of operands from each other. For example, patterns with +a binary rtl code might take three operands (one output and two inputs) +while patterns with a ternary rtl code might take four operands (one +output and three inputs). This combination would produce separate +@samp{maybe_gen_@var{name}} and @samp{gen_@var{name}} functions for +each operand count, but it would still produce a single +@samp{maybe_code_for_@var{name}} and a single @samp{code_for_@var{name}}. + @end ifset diff --git a/gcc/doc/rtl.texi b/gcc/doc/rtl.texi index f5f2de756..3df798216 100644 --- a/gcc/doc/rtl.texi +++ b/gcc/doc/rtl.texi @@ -3295,18 +3295,6 @@ There is one other known use for clobbering a pseudo register in a clobbered by the insn. In this case, using the same pseudo register in the clobber and elsewhere in the insn produces the expected results. -@findex clobber_high -@item (clobber_high @var{x}) -Represents the storing or possible storing of an unpredictable, -undescribed value into the upper parts of @var{x}. The mode of the expression -represents the lower parts of the register which will not be overwritten. -@code{reg} must be a reg expression. - -One place this is used is when calling into functions where the registers are -preserved, but only up to a given number of bits. For example when using -Aarch64 SVE, calling a TLS descriptor will cause only the lower 128 bits of -each of the vector registers to be preserved. - @findex use @item (use @var{x}) Represents the use of the value of @var{x}. It indicates that the @@ -3341,7 +3329,7 @@ that the register is live. You should think twice before adding instead. The @code{use} RTX is most commonly useful to describe that a fixed register is implicitly used in an insn. It is also safe to use in patterns where the compiler knows for other reasons that the result -of the whole pattern is variable, such as @samp{movmem@var{m}} or +of the whole pattern is variable, such as @samp{cpymem@var{m}} or @samp{call} patterns. During the reload phase, an insn that has a @code{use} as pattern @@ -3360,8 +3348,7 @@ Represents several side effects performed in parallel. The square brackets stand for a vector; the operand of @code{parallel} is a vector of expressions. @var{x0}, @var{x1} and so on are individual side effect expressions---expressions of code @code{set}, @code{call}, -@code{return}, @code{simple_return}, @code{clobber} @code{use} or -@code{clobber_high}. 
+@code{return}, @code{simple_return}, @code{clobber} or @code{use}. ``In parallel'' means that first all the values used in the individual side-effects are computed, and second all the actual side-effects are diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index 546af7f72..62245c2b3 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -1439,6 +1439,14 @@ vector alignment. Target supports both signed and unsigned averaging operations on vectors of bytes. +@item vect_mulhrs_hi +Target supports both signed and unsigned multiply-high-with-round-and-scale +operations on vectors of half-words. + +@item vect_sdiv_pow2_si +Target supports signed division by constant power-of-2 operations +on vectors of 4-byte integers. + @item vect_condition Target supports vector conditional operations. @@ -1854,6 +1862,16 @@ ARM target supports extensions to generate the @code{VFMAL} and @code{VFMLS} half-precision floating-point instructions available from ARMv8.2-A and onwards. Some multilibs may be incompatible with these options. +@item arm_v8_2a_bf16_neon_ok +ARM target supports options to generate instructions from ARMv8.2-A with +the BFloat16 extension (bf16). Some multilibs may be incompatible with these +options. + +@item arm_v8_2a_i8mm_ok +ARM target supports options to generate instructions from ARMv8.2-A with +the 8-Bit Integer Matrix Multiply extension (i8mm). Some multilibs may be +incompatible with these options. + @item arm_prefer_ldrd_strd ARM target prefers @code{LDRD} and @code{STRD} instructions over @code{LDM} and @code{STM} instructions. @@ -2663,6 +2681,91 @@ assembly output. @item scan-not-hidden @var{symbol} [@{ target/xfail @var{selector} @}] Passes if @var{symbol} is not defined as a hidden symbol in the test's assembly output. + +@item check-function-bodies @var{prefix} @var{terminator} [@var{option} [@{ target/xfail @var{selector} @}]] +Looks through the source file for comments that give the expected assembly +output for selected functions. Each line of expected output starts with the +prefix string @var{prefix} and the expected output for a function as a whole +is followed by a line that starts with the string @var{terminator}. +Specifying an empty terminator is equivalent to specifying @samp{"*/"}. + +If @var{option} is specified, the test only applies to command lines +that contain @var{option}. This can be useful if a source file is compiled +both with and without optimization, since it is rarely useful to check the +assembly output for unoptimized code. + +The first line of the expected output for a function @var{fn} has the form: + +@smallexample +@var{prefix} @var{fn}: [@{ target/xfail @var{selector} @}] +@end smallexample + +Subsequent lines of the expected output also start with @var{prefix}. +In both cases, whitespace after @var{prefix} is not significant. + +The test discards assembly directives such as @code{.cfi_startproc} +and local label definitions such as @code{.LFB0} from the compiler's +assembly output. It then matches the result against the expected +output for a function as a single regular expression. This means that +later lines can use backslashes to refer back to @samp{(@dots{})} +captures on earlier lines. 
For example: + +@smallexample +/* @{ dg-final @{ check-function-bodies "**" "" "-DCHECK_ASM" @} @} */ +@dots{} +/* +** add_w0_s8_m: +** mov (z[0-9]+\.b), w0 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +svint8_t add_w0_s8_m (@dots{}) @{ @dots{} @} +@dots{} +/* +** add_b0_s8_m: +** mov (z[0-9]+\.b), b0 +** add z1\.b, p0/m, z1\.b, \1 +** ret +*/ +svint8_t add_b0_s8_m (@dots{}) @{ @dots{} @} +@end smallexample + +checks whether the implementations of @code{add_w0_s8_m} and +@code{add_b0_s8_m} match the regular expressions given. The test only +runs when @samp{-DCHECK_ASM} is passed on the command line. + +It is possible to create non-capturing multi-line regular expression +groups of the form @samp{(@var{a}|@var{b}|@dots{})} by putting the +@samp{(}, @samp{|} and @samp{)} on separate lines (each still using +@var{prefix}). For example: + +@smallexample +/* +** cmple_f16_tied: +** ( +** fcmge p0\.h, p0/z, z1\.h, z0\.h +** | +** fcmle p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +svbool_t cmple_f16_tied (@dots{}) @{ @dots{} @} +@end smallexample + +checks whether @code{cmple_f16_tied} is implemented by the +@code{fcmge} instruction followed by @code{ret} or by the +@code{fcmle} instruction followed by @code{ret}. The test is +still a single regular rexpression. + +A line containing just: + +@smallexample +@var{prefix} ... +@end smallexample + +stands for zero or more unmatched lines; the whitespace after +@var{prefix} is again not significant. + @end table @subsubsection Scan optimization dump files diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 73db70867..3f22bb1f6 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -1878,6 +1878,9 @@ function calls. If a register has 0 in @code{CALL_USED_REGISTERS}, the compiler automatically saves it on function entry and restores it on function exit, if the register is used within the function. + +Exactly one of @code{CALL_USED_REGISTERS} and @code{CALL_REALLY_USED_REGISTERS} +must be defined. Modern ports should define @code{CALL_REALLY_USED_REGISTERS}. @end defmac @defmac CALL_REALLY_USED_REGISTERS @@ -1887,48 +1890,55 @@ exit, if the register is used within the function. Like @code{CALL_USED_REGISTERS} except this macro doesn't require that the entire set of @code{FIXED_REGISTERS} be included. (@code{CALL_USED_REGISTERS} must be a superset of @code{FIXED_REGISTERS}). -This macro is optional. If not specified, it defaults to the value -of @code{CALL_USED_REGISTERS}. + +Exactly one of @code{CALL_USED_REGISTERS} and @code{CALL_REALLY_USED_REGISTERS} +must be defined. Modern ports should define @code{CALL_REALLY_USED_REGISTERS}. @end defmac @cindex call-used register @cindex call-clobbered register @cindex call-saved register -@deftypefn {Target Hook} bool TARGET_HARD_REGNO_CALL_PART_CLOBBERED (rtx_insn *@var{insn}, unsigned int @var{regno}, machine_mode @var{mode}) -This hook should return true if @var{regno} is partly call-saved and -partly call-clobbered, and if a value of mode @var{mode} would be partly -clobbered by call instruction @var{insn}. If @var{insn} is NULL then it -should return true if any call could partly clobber the register. -For example, if the low 32 bits of @var{regno} are preserved across a call -but higher bits are clobbered, this hook should return true for a 64-bit -mode but false for a 32-bit mode. - -The default implementation returns false, which is correct -for targets that don't have partly call-clobbered registers. 
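The insn-based hook removed above is superseded below by an abi_id-based query. As a hedged C sketch, a target whose default ABI preserves only the low 64 bits of a range of vector registers across calls might answer the new hook as follows (the FIRST_PART_SAVED_REG/LAST_PART_SAVED_REG names are hypothetical, not taken from the patch):

/* Return true if a call using ABI ABI_ID can clobber part of
   (reg:MODE REGNO) while preserving the rest.  Hypothetical port.  */
static bool
example_hard_regno_call_part_clobbered (unsigned int abi_id,
                                        unsigned int regno,
                                        machine_mode mode)
{
  /* In the base ABI (abi_id == 0), only the low 8 bytes of these
     registers survive a call.  */
  if (abi_id == 0
      && IN_RANGE (regno, FIRST_PART_SAVED_REG, LAST_PART_SAVED_REG))
    return maybe_gt (GET_MODE_SIZE (mode), 8);
  return false;
}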
+@deftypefn {Target Hook} {const predefined_function_abi &} TARGET_FNTYPE_ABI (const_tree @var{type}) +Return the ABI used by a function with type @var{type}; see the +definition of @code{predefined_function_abi} for details of the ABI +descriptor. Targets only need to define this hook if they support +interoperability between several ABIs in the same translation unit. @end deftypefn -@deftypefn {Target Hook} void TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS (rtx_insn *@var{insn}, HARD_REG_SET *@var{used_regs}) -This hook removes registers from the set of call-clobbered registers - in @var{used_regs} if, contrary to the default rules, something guarantees - that @samp{insn} preserves those registers. For example, some targets - support variant ABIs in which functions preserve more registers than - normal functions would. Removing those extra registers from @var{used_regs} - can lead to better register allocation. - - The default implementation does nothing, which is always safe. - Defining the hook is purely an optimization. +@deftypefn {Target Hook} {const predefined_function_abi &} TARGET_INSN_CALLEE_ABI (const rtx_insn *@var{insn}) +This hook returns a description of the ABI used by the target of +call instruction @var{insn}; see the definition of +@code{predefined_function_abi} for details of the ABI descriptor. +Only the global function @code{insn_callee_abi} should call this hook +directly. + +Targets only need to define this hook if they support +interoperability between several ABIs in the same translation unit. @end deftypefn -@deftypefn {Target Hook} {rtx_insn *} TARGET_RETURN_CALL_WITH_MAX_CLOBBERS (rtx_insn *@var{call_1}, rtx_insn *@var{call_2}) -This hook returns a pointer to the call that partially clobbers the -most registers. If a platform supports multiple ABIs where the registers -that are partially clobbered may vary, this function compares two -calls and returns a pointer to the one that clobbers the most registers. -If both calls clobber the same registers, @var{call_1} must be returned. +@cindex call-used register +@cindex call-clobbered register +@cindex call-saved register +@deftypefn {Target Hook} bool TARGET_HARD_REGNO_CALL_PART_CLOBBERED (unsigned int @var{abi_id}, unsigned int @var{regno}, machine_mode @var{mode}) +ABIs usually specify that calls must preserve the full contents +of a particular register, or that calls can alter any part of a +particular register. This information is captured by the target macro +@code{CALL_REALLY_USED_REGISTERS}. However, some ABIs specify that calls +must preserve certain bits of a particular register but can alter others. +This hook should return true if this applies to at least one of the +registers in @samp{(reg:@var{mode} @var{regno})}, and if as a result the +call would alter part of the @var{mode} value. For example, if a call +preserves the low 32 bits of a 64-bit hard register @var{regno} but can +clobber the upper 32 bits, this hook should return true for a 64-bit mode +but false for a 32-bit mode. + +The value of @var{abi_id} comes from the @code{predefined_function_abi} +structure that describes the ABI of the call; see the definition of the +structure for more details. If (as is usual) the target uses the same ABI +for all functions in a translation unit, @var{abi_id} is always 0. -The registers clobbered in different ABIs must be a proper subset or -superset of all other ABIs. @var{call_1} must always be a call insn, -call_2 may be NULL or a call insn. 
+The default implementation returns false, which is correct +for targets that don't have partly call-clobbered registers. @end deftypefn @deftypefn {Target Hook} {const char *} TARGET_GET_MULTILIB_ABI_NAME (void) @@ -3961,18 +3971,10 @@ This section describes the macros which let you control how various types of arguments are passed in registers or how they are arranged in the stack. -@deftypefn {Target Hook} rtx TARGET_FUNCTION_ARG (cumulative_args_t @var{ca}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) -Return an RTX indicating whether a function argument is passed in a -register and if so, which register. - -The arguments are @var{ca}, which summarizes all the previous -arguments; @var{mode}, the machine mode of the argument; @var{type}, -the data type of the argument as a tree node or 0 if that is not known -(which happens for C support library functions); and @var{named}, -which is @code{true} for an ordinary argument and @code{false} for -nameless arguments that correspond to @samp{@dots{}} in the called -function's prototype. @var{type} can be an incomplete type if a -syntax error has previously occurred. +@deftypefn {Target Hook} rtx TARGET_FUNCTION_ARG (cumulative_args_t @var{ca}, const function_arg_info @var{&arg}) +Return an RTX indicating whether function argument @var{arg} is passed +in a register and if so, which register. Argument @var{ca} summarizes all +the previous arguments. The return value is usually either a @code{reg} RTX for the hard register in which to pass the argument, or zero to pass the argument @@ -4020,14 +4022,14 @@ defined, the argument will be computed in the stack and then loaded into a register. @end deftypefn -@deftypefn {Target Hook} bool TARGET_MUST_PASS_IN_STACK (machine_mode @var{mode}, const_tree @var{type}) -This target hook should return @code{true} if we should not pass @var{type} +@deftypefn {Target Hook} bool TARGET_MUST_PASS_IN_STACK (const function_arg_info @var{&arg}) +This target hook should return @code{true} if we should not pass @var{arg} solely in registers. The file @file{expr.h} defines a definition that is usually appropriate, refer to @file{expr.h} for additional documentation. @end deftypefn -@deftypefn {Target Hook} rtx TARGET_FUNCTION_INCOMING_ARG (cumulative_args_t @var{ca}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) +@deftypefn {Target Hook} rtx TARGET_FUNCTION_INCOMING_ARG (cumulative_args_t @var{ca}, const function_arg_info @var{&arg}) Define this hook if the caller and callee on the target have different views of where arguments are passed. Also define this hook if there are functions that are never directly called, but are invoked by the hardware @@ -4057,7 +4059,7 @@ Perform a target dependent initialization of pic_offset_table_rtx. This hook is called at the start of register allocation. @end deftypefn -@deftypefn {Target Hook} int TARGET_ARG_PARTIAL_BYTES (cumulative_args_t @var{cum}, machine_mode @var{mode}, tree @var{type}, bool @var{named}) +@deftypefn {Target Hook} int TARGET_ARG_PARTIAL_BYTES (cumulative_args_t @var{cum}, const function_arg_info @var{&arg}) This target hook returns the number of bytes at the beginning of an argument that must be put in registers. The value must be zero for arguments that are passed entirely in registers or that are entirely @@ -4076,11 +4078,11 @@ register to be used by the caller for this argument; likewise @code{TARGET_FUNCTION_INCOMING_ARG}, for the called function. 
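The hook signature changes in this hunk all follow one pattern: the old (mode, type, named) triple is bundled into a single function_arg_info argument. A hedged sketch of how a port's TARGET_FUNCTION_ARG implementation adapts (the port name and the example_arg_register helper are hypothetical; arg.mode, arg.type and arg.named are the accessors the new interface provides):

/* New-style hook for a hypothetical port: the old separate mode, type
   and named parameters now arrive bundled in ARG.  */
static rtx
example_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  /* Unnamed stdarg arguments go on the stack in this example.  */
  if (!arg.named)
    return NULL_RTX;
  return example_arg_register (cum, arg.mode, arg.type); /* hypothetical */
}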
@end deftypefn -@deftypefn {Target Hook} bool TARGET_PASS_BY_REFERENCE (cumulative_args_t @var{cum}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) -This target hook should return @code{true} if an argument at the +@deftypefn {Target Hook} bool TARGET_PASS_BY_REFERENCE (cumulative_args_t @var{cum}, const function_arg_info @var{&arg}) +This target hook should return @code{true} if argument @var{arg} at the position indicated by @var{cum} should be passed by reference. This predicate is queried after target independent reasons for being -passed by reference, such as @code{TREE_ADDRESSABLE (type)}. +passed by reference, such as @code{TREE_ADDRESSABLE (@var{arg}.type)}. If the hook returns true, a copy of that argument is made in memory and a pointer to the argument is passed instead of the argument itself. @@ -4088,7 +4090,7 @@ The pointer is passed in whatever way is appropriate for passing a pointer to that type. @end deftypefn -@deftypefn {Target Hook} bool TARGET_CALLEE_COPIES (cumulative_args_t @var{cum}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) +@deftypefn {Target Hook} bool TARGET_CALLEE_COPIES (cumulative_args_t @var{cum}, const function_arg_info @var{&arg}) The function argument described by the parameters to this hook is known to be passed by reference. The hook should return true if the function argument should be copied by the callee instead of copied @@ -4167,10 +4169,9 @@ argument @var{libname} exists for symmetry with @c --mew 5feb93 i switched the order of the sentences. --mew 10feb93 @end defmac -@deftypefn {Target Hook} void TARGET_FUNCTION_ARG_ADVANCE (cumulative_args_t @var{ca}, machine_mode @var{mode}, const_tree @var{type}, bool @var{named}) +@deftypefn {Target Hook} void TARGET_FUNCTION_ARG_ADVANCE (cumulative_args_t @var{ca}, const function_arg_info @var{&arg}) This hook updates the summarizer variable pointed to by @var{ca} to -advance past an argument in the argument list. The values @var{mode}, -@var{type} and @var{named} describe that argument. Once this is done, +advance past argument @var{arg} in the argument list. Once this is done, the variable @var{cum} is suitable for analyzing the @emph{following} argument with @code{TARGET_FUNCTION_ARG}, etc. @@ -4331,6 +4332,27 @@ insns involving vector mode @var{mode}. At the very least, it must have move patterns for this mode. @end deftypefn +@deftypefn {Target Hook} bool TARGET_COMPATIBLE_VECTOR_TYPES_P (const_tree @var{type1}, const_tree @var{type2}) +Return true if there is no target-specific reason for treating +vector types @var{type1} and @var{type2} as distinct types. The caller +has already checked for target-independent reasons, meaning that the +types are known to have the same mode, to have the same number of elements, +and to have what the caller considers to be compatible element types. + +The main reason for defining this hook is to reject pairs of types +that are handled differently by the target's calling convention. +For example, when a new @var{N}-bit vector architecture is added +to a target, the target may want to handle normal @var{N}-bit +@code{VECTOR_TYPE} arguments and return values in the same way as +before, to maintain backwards compatibility. However, it may also +provide new, architecture-specific @code{VECTOR_TYPE}s that are passed +and returned in a more efficient way. It is then important to maintain +a distinction between the ``normal'' @code{VECTOR_TYPE}s and the new +architecture-specific ones. 
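A hedged C sketch of the kind of check TARGET_COMPATIBLE_VECTOR_TYPES_P describes: a hypothetical port marks its architecture-specific vector types with a type attribute and refuses to treat a marked and an unmarked type as the same type (the "arch-vector" attribute name is made up for illustration):

/* Return true if TYPE1 and TYPE2 can be treated as the same vector type.  */
static bool
example_compatible_vector_types_p (const_tree type1, const_tree type2)
{
  bool arch1 = lookup_attribute ("arch-vector",
                                 TYPE_ATTRIBUTES (type1)) != NULL_TREE;
  bool arch2 = lookup_attribute ("arch-vector",
                                 TYPE_ATTRIBUTES (type2)) != NULL_TREE;
  /* Mixing a "normal" vector with an architecture-specific one would
     change the calling convention, so keep them distinct.  */
  return arch1 == arch2;
}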
+ +The default implementation returns true, which is correct for most targets. +@end deftypefn + @deftypefn {Target Hook} opt_machine_mode TARGET_ARRAY_MODE (machine_mode @var{mode}, unsigned HOST_WIDE_INT @var{nelems}) Return the mode that GCC should use for an array that has @var{nelems} elements, with each element having mode @var{mode}. @@ -5202,7 +5224,7 @@ return value of this function should be an RTX that contains the value to use as the return of @code{__builtin_saveregs}. @end deftypefn -@deftypefn {Target Hook} void TARGET_SETUP_INCOMING_VARARGS (cumulative_args_t @var{args_so_far}, machine_mode @var{mode}, tree @var{type}, int *@var{pretend_args_size}, int @var{second_time}) +@deftypefn {Target Hook} void TARGET_SETUP_INCOMING_VARARGS (cumulative_args_t @var{args_so_far}, const function_arg_info @var{&arg}, int *@var{pretend_args_size}, int @var{second_time}) This target hook offers an alternative to using @code{__builtin_saveregs} and defining the hook @code{TARGET_EXPAND_BUILTIN_SAVEREGS}. Use it to store the anonymous @@ -5213,8 +5235,8 @@ pass all their arguments on the stack. The argument @var{args_so_far} points to the @code{CUMULATIVE_ARGS} data structure, containing the values that are obtained after processing the -named arguments. The arguments @var{mode} and @var{type} describe the -last named argument---its machine mode and its data type as a tree node. +named arguments. The argument @var{arg} describes the last of these named +arguments. The target hook should do two things: first, push onto the stack all the argument registers @emph{not} used for the named arguments, and second, @@ -5314,12 +5336,6 @@ This hook is used by expand pass to emit insn to store @var{bounds} returned by function call into @var{slot}. @end deftypefn -@deftypefn {Target Hook} void TARGET_SETUP_INCOMING_VARARG_BOUNDS (cumulative_args_t @var{args_so_far}, machine_mode @var{mode}, tree @var{type}, int *@var{pretend_args_size}, int @var{second_time}) -Use it to store bounds for anonymous register arguments stored -into the stack. Arguments meaning is similar to -@code{TARGET_SETUP_INCOMING_VARARGS}. -@end deftypefn - @node Trampolines @section Support for Nested Functions @cindex support for nested functions @@ -5967,18 +5983,6 @@ instruction pattern. There is no need for the hook to handle these two implementation approaches itself. @end deftypefn -@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_CONVERSION (unsigned @var{code}, tree @var{dest_type}, tree @var{src_type}) -This hook should return the DECL of a function that implements conversion of the -input vector of type @var{src_type} to type @var{dest_type}. -The value of @var{code} is one of the enumerators in @code{enum tree_code} and -specifies how the conversion is to be applied -(truncation, rounding, etc.). - -If this hook is defined, the autovectorizer will use the -@code{TARGET_VECTORIZE_BUILTIN_CONVERSION} target hook when vectorizing -conversion. Otherwise, it will return @code{NULL_TREE}. -@end deftypefn - @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in}) This hook should return the decl of a function that implements the vectorized variant of the function with the @code{combined_fn} code @@ -6698,7 +6702,7 @@ two areas of memory, or to set, clear or store to memory, for example when copying a @code{struct}. The @code{by_pieces} infrastructure implements such memory operations as a sequence of load, store or move insns. 
Alternate strategies are to expand the -@code{movmem} or @code{setmem} optabs, to emit a library call, or to emit +@code{cpymem} or @code{setmem} optabs, to emit a library call, or to emit unit-by-unit, loop-based operations. This target hook should return true if, for a memory operation with a @@ -6717,7 +6721,7 @@ optimized for speed rather than size. Returning true for higher values of @var{size} can improve code generation for speed if the target does not provide an implementation of the -@code{movmem} or @code{setmem} standard names, if the @code{movmem} or +@code{cpymem} or @code{setmem} standard names, if the @code{cpymem} or @code{setmem} implementation would be more expensive than a sequence of insns, or if the overhead of a library call would dominate that of the body of the memory operation. @@ -11607,6 +11611,21 @@ another @code{CALL_EXPR}. @var{arglist} really has type @samp{VEC(tree,gc)*} @end deftypefn +@deftypefn {Target Hook} bool TARGET_CHECK_BUILTIN_CALL (location_t @var{loc}, vec @var{arg_loc}, tree @var{fndecl}, tree @var{orig_fndecl}, unsigned int @var{nargs}, tree *@var{args}) +Perform semantic checking on a call to a machine-specific built-in +function after its arguments have been constrained to the function +signature. Return true if the call is valid, otherwise report an error +and return false. + +This hook is called after @code{TARGET_RESOLVE_OVERLOADED_BUILTIN}. +The call was originally to built-in function @var{orig_fndecl}, +but after the optional @code{TARGET_RESOLVE_OVERLOADED_BUILTIN} +step is now to built-in function @var{fndecl}. @var{loc} is the +location of the call and @var{args} is an array of function arguments, +of which there are @var{nargs}. @var{arg_loc} specifies the location +of each argument. +@end deftypefn + @deftypefn {Target Hook} tree TARGET_FOLD_BUILTIN (tree @var{fndecl}, int @var{n_args}, tree *@var{argp}, bool @var{ignore}) Fold a call to a machine specific built-in function that was set up by @samp{TARGET_INIT_BUILTINS}. @var{fndecl} is the declaration of the @@ -11791,28 +11810,6 @@ cannot_modify_jumps_past_reload_p () @end smallexample @end deftypefn -@deftypefn {Target Hook} reg_class_t TARGET_BRANCH_TARGET_REGISTER_CLASS (void) -This target hook returns a register class for which branch target register -optimizations should be applied. All registers in this class should be -usable interchangeably. After reload, registers in this class will be -re-allocated and loads will be hoisted out of loops and be subjected -to inter-block scheduling. -@end deftypefn - -@deftypefn {Target Hook} bool TARGET_BRANCH_TARGET_REGISTER_CALLEE_SAVED (bool @var{after_prologue_epilogue_gen}) -Branch target register optimization will by default exclude callee-saved -registers -that are not already live during the current function; if this target hook -returns true, they will be included. The target code must than make sure -that all target registers in the class returned by -@samp{TARGET_BRANCH_TARGET_REGISTER_CLASS} that might need saving are -saved. @var{after_prologue_epilogue_gen} indicates if prologues and -epilogues have already been generated. Note, even if you only return -true when @var{after_prologue_epilogue_gen} is false, you still are likely -to have to make special provisions in @code{INITIAL_ELIMINATION_OFFSET} -to reserve space for caller-saved target registers. -@end deftypefn - @deftypefn {Target Hook} bool TARGET_HAVE_CONDITIONAL_EXECUTION (void) This target hook returns true if the target supports conditional execution. 
This target hook is required only when the target has several different diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index bc362dca0..89cfb5253 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -1689,6 +1689,9 @@ function calls. If a register has 0 in @code{CALL_USED_REGISTERS}, the compiler automatically saves it on function entry and restores it on function exit, if the register is used within the function. + +Exactly one of @code{CALL_USED_REGISTERS} and @code{CALL_REALLY_USED_REGISTERS} +must be defined. Modern ports should define @code{CALL_REALLY_USED_REGISTERS}. @end defmac @defmac CALL_REALLY_USED_REGISTERS @@ -1698,18 +1701,22 @@ exit, if the register is used within the function. Like @code{CALL_USED_REGISTERS} except this macro doesn't require that the entire set of @code{FIXED_REGISTERS} be included. (@code{CALL_USED_REGISTERS} must be a superset of @code{FIXED_REGISTERS}). -This macro is optional. If not specified, it defaults to the value -of @code{CALL_USED_REGISTERS}. + +Exactly one of @code{CALL_USED_REGISTERS} and @code{CALL_REALLY_USED_REGISTERS} +must be defined. Modern ports should define @code{CALL_REALLY_USED_REGISTERS}. @end defmac @cindex call-used register @cindex call-clobbered register @cindex call-saved register -@hook TARGET_HARD_REGNO_CALL_PART_CLOBBERED +@hook TARGET_FNTYPE_ABI -@hook TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS +@hook TARGET_INSN_CALLEE_ABI -@hook TARGET_RETURN_CALL_WITH_MAX_CLOBBERS +@cindex call-used register +@cindex call-clobbered register +@cindex call-saved register +@hook TARGET_HARD_REGNO_CALL_PART_CLOBBERED @hook TARGET_GET_MULTILIB_ABI_NAME @@ -3362,6 +3369,8 @@ stack. @hook TARGET_VECTOR_MODE_SUPPORTED_P +@hook TARGET_COMPATIBLE_VECTOR_TYPES_P + @hook TARGET_ARRAY_MODE @hook TARGET_ARRAY_MODE_SUPPORTED_P @@ -3785,8 +3794,6 @@ These machine description macros help implement varargs: @hook TARGET_STORE_RETURNED_BOUNDS -@hook TARGET_SETUP_INCOMING_VARARG_BOUNDS - @node Trampolines @section Support for Nested Functions @cindex support for nested functions @@ -4160,8 +4167,6 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_VECTORIZE_VEC_PERM_CONST -@hook TARGET_VECTORIZE_BUILTIN_CONVERSION - @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION @@ -7934,6 +7939,8 @@ to by @var{ce_info}. @hook TARGET_RESOLVE_OVERLOADED_BUILTIN +@hook TARGET_CHECK_BUILTIN_CALL + @hook TARGET_FOLD_BUILTIN @hook TARGET_GIMPLE_FOLD_BUILTIN @@ -7999,10 +8006,6 @@ build_type_attribute_variant (@var{mdecl}, @hook TARGET_CANNOT_MODIFY_JUMPS_P -@hook TARGET_BRANCH_TARGET_REGISTER_CLASS - -@hook TARGET_BRANCH_TARGET_REGISTER_CALLEE_SAVED - @hook TARGET_HAVE_CONDITIONAL_EXECUTION @hook TARGET_GEN_CCMP_FIRST diff --git a/gcc/dse.c b/gcc/dse.c index 4becdcf1c..874ff507c 100644 --- a/gcc/dse.c +++ b/gcc/dse.c @@ -50,6 +50,7 @@ along with GCC; see the file COPYING3. If not see #include "params.h" #include "rtl-iter.h" #include "cfgcleanup.h" +#include "calls.h" /* This file contains three techniques for performing Dead Store Elimination (dse). 
@@ -819,7 +820,7 @@ emit_inc_dec_insn_before (rtx mem ATTRIBUTE_UNUSED, for (cur = new_insn; cur; cur = NEXT_INSN (cur)) { info.current = cur; - note_stores (PATTERN (cur), note_add_store, &info); + note_stores (cur, note_add_store, &info); } /* If a failure was flagged above, return 1 so that for_each_inc_dec will @@ -1976,7 +1977,7 @@ replace_read (store_info *store_info, insn_info_t store_insn, bitmap regs_set = BITMAP_ALLOC (®_obstack); for (this_insn = insns; this_insn != NULL_RTX; this_insn = NEXT_INSN (this_insn)) - note_stores (PATTERN (this_insn), look_for_hardregs, regs_set); + note_stores (this_insn, look_for_hardregs, regs_set); bitmap_and_into (regs_set, regs_live); if (!bitmap_empty_p (regs_set)) @@ -2341,7 +2342,8 @@ get_call_args (rtx call_insn, tree fn, rtx *args, int nargs) if (!is_int_mode (TYPE_MODE (TREE_VALUE (arg)), &mode)) return false; - reg = targetm.calls.function_arg (args_so_far, mode, NULL_TREE, true); + function_arg_info arg (mode, /*named=*/true); + reg = targetm.calls.function_arg (args_so_far, arg); if (!reg || !REG_P (reg) || GET_MODE (reg) != mode) return false; @@ -2373,7 +2375,7 @@ get_call_args (rtx call_insn, tree fn, rtx *args, int nargs) if (tmp) args[idx] = tmp; - targetm.calls.function_arg_advance (args_so_far, mode, NULL_TREE, true); + targetm.calls.function_arg_advance (args_so_far, arg); } if (arg != void_list_node || idx != nargs) return false; @@ -2388,7 +2390,7 @@ copy_fixed_regs (const_bitmap in) bitmap ret; ret = ALLOC_REG_SET (NULL); - bitmap_and (ret, in, fixed_reg_set_regset); + bitmap_and (ret, in, bitmap_view (fixed_reg_set)); return ret; } diff --git a/gcc/dwarf2out.c b/gcc/dwarf2out.c index 30c4c7007..a219d7fc3 100644 --- a/gcc/dwarf2out.c +++ b/gcc/dwarf2out.c @@ -16428,7 +16428,6 @@ mem_loc_descriptor (rtx rtl, machine_mode mode, case CONST_FIXED: case CLRSB: case CLOBBER: - case CLOBBER_HIGH: break; case CONST_STRING: @@ -18566,6 +18565,24 @@ loc_list_from_tree_1 (tree loc, int want_address, } break; + case POLY_INT_CST: + { + if (want_address) + { + expansion_failed (loc, NULL_RTX, + "constant address with a runtime component"); + return 0; + } + poly_int64 value; + if (!poly_int_tree_p (loc, &value)) + { + expansion_failed (loc, NULL_RTX, "constant too big"); + return 0; + } + ret = int_loc_descriptor (value); + } + break; + case CONSTRUCTOR: case REAL_CST: case STRING_CST: @@ -19682,6 +19699,7 @@ add_const_value_attribute (dw_die_ref die, rtx rtl) case MINUS: case SIGN_EXTEND: case ZERO_EXTEND: + case CONST_POLY_INT: return false; case MEM: diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c index 78104603c..d6636ccb0 100644 --- a/gcc/emit-rtl.c +++ b/gcc/emit-rtl.c @@ -2865,7 +2865,6 @@ verify_rtx_sharing (rtx orig, rtx insn) /* SCRATCH must be shared because they represent distinct values. */ return; case CLOBBER: - case CLOBBER_HIGH: /* Share clobbers of hard registers (like cc0), but do not share pseudo reg clobbers or clobbers of hard registers that originated as pseudos. This is needed to allow safe register renaming. */ @@ -3119,7 +3118,6 @@ repeat: /* SCRATCH must be shared because they represent distinct values. */ return; case CLOBBER: - case CLOBBER_HIGH: /* Share clobbers of hard registers (like cc0), but do not share pseudo reg clobbers or clobbers of hard registers that originated as pseudos. This is needed to allow safe register renaming. 
*/ @@ -5693,7 +5691,6 @@ copy_insn_1 (rtx orig) case SIMPLE_RETURN: return orig; case CLOBBER: - case CLOBBER_HIGH: /* Share clobbers of hard registers (like cc0), but do not share pseudo reg clobbers or clobbers of hard registers that originated as pseudos. This is needed to allow safe register renaming. */ @@ -6505,21 +6502,6 @@ gen_hard_reg_clobber (machine_mode mode, unsigned int regno) gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno))); } -static GTY((deletable)) rtx -hard_reg_clobbers_high[NUM_MACHINE_MODES][FIRST_PSEUDO_REGISTER]; - -/* Return a CLOBBER_HIGH expression for register REGNO that clobbers MODE, - caching into HARD_REG_CLOBBERS_HIGH. */ -rtx -gen_hard_reg_clobber_high (machine_mode mode, unsigned int regno) -{ - if (hard_reg_clobbers_high[mode][regno]) - return hard_reg_clobbers_high[mode][regno]; - else - return (hard_reg_clobbers_high[mode][regno] - = gen_rtx_CLOBBER_HIGH (VOIDmode, gen_rtx_REG (mode, regno))); -} - location_t prologue_location; location_t epilogue_location; diff --git a/gcc/emit-rtl.h b/gcc/emit-rtl.h index 7b1cecd3c..573140e84 100644 --- a/gcc/emit-rtl.h +++ b/gcc/emit-rtl.h @@ -22,6 +22,7 @@ along with GCC; see the file COPYING3. If not see struct temp_slot; typedef struct temp_slot *temp_slot_p; +struct predefined_function_abi; /* Information mainlined about RTL representation of incoming arguments. */ struct GTY(()) incoming_args { @@ -64,6 +65,14 @@ struct GTY(()) rtl_data { struct function_subsections subsections; struct rtl_eh eh; + /* The ABI of the function, i.e. the interface it presents to its callers. + This is the ABI that should be queried to see which registers the + function needs to save before it uses them. + + Other functions (including those called by this function) might use + different ABIs. */ + const predefined_function_abi *GTY((skip)) abi; + /* For function.c */ /* # of bytes of outgoing arguments. If ACCUMULATE_OUTGOING_ARGS is diff --git a/gcc/expr.c b/gcc/expr.c index 650be8dad..b77f0409e 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -73,7 +73,7 @@ along with GCC; see the file COPYING3. 
If not see int cse_not_expected; static bool block_move_libcall_safe_for_call_parm (void); -static bool emit_block_move_via_movmem (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT, +static bool emit_block_move_via_cpymem (rtx, rtx, rtx, unsigned, unsigned, HOST_WIDE_INT, unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT); static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned); @@ -1645,7 +1645,7 @@ emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method, if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align)) move_by_pieces (x, y, INTVAL (size), align, RETURN_BEGIN); - else if (emit_block_move_via_movmem (x, y, size, align, + else if (emit_block_move_via_cpymem (x, y, size, align, expected_align, expected_size, min_size, max_size, probable_max_size)) ; @@ -1723,31 +1723,28 @@ block_move_libcall_safe_for_call_parm (void) for ( ; arg != void_list_node ; arg = TREE_CHAIN (arg)) { machine_mode mode = TYPE_MODE (TREE_VALUE (arg)); - rtx tmp = targetm.calls.function_arg (args_so_far, mode, - NULL_TREE, true); + function_arg_info arg_info (mode, /*named=*/true); + rtx tmp = targetm.calls.function_arg (args_so_far, arg_info); if (!tmp || !REG_P (tmp)) return false; - if (targetm.calls.arg_partial_bytes (args_so_far, mode, NULL, 1)) + if (targetm.calls.arg_partial_bytes (args_so_far, arg_info)) return false; - targetm.calls.function_arg_advance (args_so_far, mode, - NULL_TREE, true); + targetm.calls.function_arg_advance (args_so_far, arg_info); } } return true; } -/* A subroutine of emit_block_move. Expand a movmem pattern; +/* A subroutine of emit_block_move. Expand a cpymem pattern; return true if successful. */ static bool -emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align, +emit_block_move_via_cpymem (rtx x, rtx y, rtx size, unsigned int align, unsigned int expected_align, HOST_WIDE_INT expected_size, unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, unsigned HOST_WIDE_INT probable_max_size) { - int save_volatile_ok = volatile_ok; - if (expected_align < align) expected_align = align; if (expected_size != -1) @@ -1759,7 +1756,7 @@ emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align, } /* Since this is a move insn, we don't care about volatility. 
*/ - volatile_ok = 1; + temporary_volatile_ok v (true); /* Try the most limited insn first, because there's no point including more than one in the machine description unless @@ -1769,7 +1766,7 @@ emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align, FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT) { scalar_int_mode mode = mode_iter.require (); - enum insn_code code = direct_optab_handler (movmem_optab, mode); + enum insn_code code = direct_optab_handler (cpymem_optab, mode); if (code != CODE_FOR_nothing /* We don't need MODE to be narrower than BITS_PER_HOST_WIDE_INT @@ -1823,14 +1820,10 @@ emit_block_move_via_movmem (rtx x, rtx y, rtx size, unsigned int align, create_fixed_operand (&ops[8], NULL); } if (maybe_expand_insn (code, nops, ops)) - { - volatile_ok = save_volatile_ok; - return true; - } + return true; } } - volatile_ok = save_volatile_ok; return false; } @@ -5841,7 +5834,8 @@ store_expr (tree exp, rtx target, int call_param_p, copy_blkmode_from_reg (target, temp, TREE_TYPE (exp)); else store_bit_field (target, - INTVAL (expr_size (exp)) * BITS_PER_UNIT, + rtx_to_poly_int64 (expr_size (exp)) + * BITS_PER_UNIT, 0, 0, 0, GET_MODE (temp), temp, reverse); } else diff --git a/gcc/final.c b/gcc/final.c index fefc4874b..7cf9ef1ef 100644 --- a/gcc/final.c +++ b/gcc/final.c @@ -81,6 +81,7 @@ along with GCC; see the file COPYING3. If not see #include "asan.h" #include "rtl-iter.h" #include "print-rtl.h" +#include "function-abi.h" #ifdef XCOFF_DEBUGGING_INFO #include "xcoffout.h" /* Needed for external data declarations. */ @@ -230,7 +231,6 @@ static int alter_cond (rtx); #endif static int align_fuzz (rtx, rtx, int, unsigned); static void collect_fn_hard_reg_usage (void); -static tree get_call_fndecl (rtx_insn *); /* Initialize data in final at the beginning of a compilation. */ @@ -4994,7 +4994,16 @@ collect_fn_hard_reg_usage (void) if (!targetm.call_fusage_contains_non_callee_clobbers) return; - CLEAR_HARD_REG_SET (function_used_regs); + /* Be conservative - mark fixed and global registers as used. */ + function_used_regs = fixed_reg_set; + +#ifdef STACK_REGS + /* Handle STACK_REGS conservatively, since the df-framework does not + provide accurate information for them. */ + + for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) + SET_HARD_REG_BIT (function_used_regs, i); +#endif for (insn = get_insns (); insn != NULL_RTX; insn = next_insn (insn)) { @@ -5005,97 +5014,23 @@ collect_fn_hard_reg_usage (void) if (CALL_P (insn) && !self_recursive_call_p (insn)) - { - if (!get_call_reg_set_usage (insn, &insn_used_regs, - call_used_reg_set)) - return; - - IOR_HARD_REG_SET (function_used_regs, insn_used_regs); - } + function_used_regs + |= insn_callee_abi (insn).full_and_partial_reg_clobbers (); find_all_hard_reg_sets (insn, &insn_used_regs, false); - IOR_HARD_REG_SET (function_used_regs, insn_used_regs); - } + function_used_regs |= insn_used_regs; - /* Be conservative - mark fixed and global registers as used. */ - IOR_HARD_REG_SET (function_used_regs, fixed_reg_set); - -#ifdef STACK_REGS - /* Handle STACK_REGS conservatively, since the df-framework does not - provide accurate information for them. */ - - for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) - SET_HARD_REG_BIT (function_used_regs, i); -#endif + if (hard_reg_set_subset_p (crtl->abi->full_and_partial_reg_clobbers (), + function_used_regs)) + return; + } - /* The information we have gathered is only interesting if it exposes a - register from the call_used_regs that is not used in this function. 
*/ - if (hard_reg_set_subset_p (call_used_reg_set, function_used_regs)) - return; + /* Mask out fully-saved registers, so that they don't affect equality + comparisons between function_abis. */ + function_used_regs &= crtl->abi->full_and_partial_reg_clobbers (); node = cgraph_node::rtl_info (current_function_decl); gcc_assert (node != NULL); - COPY_HARD_REG_SET (node->function_used_regs, function_used_regs); - node->function_used_regs_valid = 1; -} - -/* Get the declaration of the function called by INSN. */ - -static tree -get_call_fndecl (rtx_insn *insn) -{ - rtx note, datum; - - note = find_reg_note (insn, REG_CALL_DECL, NULL_RTX); - if (note == NULL_RTX) - return NULL_TREE; - - datum = XEXP (note, 0); - if (datum != NULL_RTX) - return SYMBOL_REF_DECL (datum); - - return NULL_TREE; -} - -/* Return the cgraph_rtl_info of the function called by INSN. Returns NULL for - call targets that can be overwritten. */ - -static struct cgraph_rtl_info * -get_call_cgraph_rtl_info (rtx_insn *insn) -{ - tree fndecl; - - if (insn == NULL_RTX) - return NULL; - - fndecl = get_call_fndecl (insn); - if (fndecl == NULL_TREE - || !decl_binds_to_current_def_p (fndecl)) - return NULL; - - return cgraph_node::rtl_info (fndecl); -} - -/* Find hard registers used by function call instruction INSN, and return them - in REG_SET. Return DEFAULT_SET in REG_SET if not found. */ - -bool -get_call_reg_set_usage (rtx_insn *insn, HARD_REG_SET *reg_set, - HARD_REG_SET default_set) -{ - if (flag_ipa_ra) - { - struct cgraph_rtl_info *node = get_call_cgraph_rtl_info (insn); - if (node != NULL - && node->function_used_regs_valid) - { - COPY_HARD_REG_SET (*reg_set, node->function_used_regs); - AND_HARD_REG_SET (*reg_set, default_set); - return true; - } - } - COPY_HARD_REG_SET (*reg_set, default_set); - targetm.remove_extra_call_preserved_regs (insn, reg_set); - return false; + node->function_used_regs = function_used_regs; } diff --git a/gcc/fold-const-call.c b/gcc/fold-const-call.c index 702c8b405..e21d8e110 100644 --- a/gcc/fold-const-call.c +++ b/gcc/fold-const-call.c @@ -689,6 +689,36 @@ fold_const_vec_convert (tree ret_type, tree arg) return elts.build (); } +/* Try to evaluate: + + IFN_WHILE_ULT (ARG0, ARG1, (TYPE) { ... }) + + Return the value on success and null on failure. 
*/ + +static tree +fold_while_ult (tree type, poly_uint64 arg0, poly_uint64 arg1) +{ + if (known_ge (arg0, arg1)) + return build_zero_cst (type); + + if (maybe_ge (arg0, arg1)) + return NULL_TREE; + + poly_uint64 diff = arg1 - arg0; + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); + if (known_ge (diff, nelts)) + return build_all_ones_cst (type); + + unsigned HOST_WIDE_INT const_diff; + if (known_le (diff, nelts) && diff.is_constant (&const_diff)) + { + tree minus_one = build_minus_one_cst (TREE_TYPE (type)); + tree zero = build_zero_cst (TREE_TYPE (type)); + return build_vector_a_then_b (type, const_diff, minus_one, zero); + } + return NULL_TREE; +} + /* Try to evaluate: *RESULT = FN (*ARG) @@ -1782,6 +1812,14 @@ fold_const_call (combined_fn fn, tree type, tree arg0, tree arg1, tree arg2) } return NULL_TREE; + case CFN_WHILE_ULT: + { + poly_uint64 parg0, parg1; + if (poly_int_tree_p (arg0, &parg0) && poly_int_tree_p (arg1, &parg1)) + return fold_while_ult (type, parg0, parg1); + return NULL_TREE; + } + default: return fold_const_call_1 (fn, type, arg0, arg1, arg2); } diff --git a/gcc/fold-const.c b/gcc/fold-const.c index c717f2450..ffc2669a7 100644 --- a/gcc/fold-const.c +++ b/gcc/fold-const.c @@ -3477,7 +3477,8 @@ operand_equal_p (const_tree arg0, const_tree arg1, unsigned int flags) return (TREE_CODE (arg0) == FUNCTION_DECL && fndecl_built_in_p (arg0) && fndecl_built_in_p (arg1) && DECL_BUILT_IN_CLASS (arg0) == DECL_BUILT_IN_CLASS (arg1) - && DECL_FUNCTION_CODE (arg0) == DECL_FUNCTION_CODE (arg1)); + && (DECL_UNCHECKED_FUNCTION_CODE (arg0) + == DECL_UNCHECKED_FUNCTION_CODE (arg1))); case tcc_exceptional: if (TREE_CODE (arg0) == CONSTRUCTOR) @@ -7380,22 +7381,18 @@ native_encode_complex (const_tree expr, unsigned char *ptr, int len, int off) return rsize + isize; } - -/* Subroutine of native_encode_expr. Encode the VECTOR_CST - specified by EXPR into the buffer PTR of length LEN bytes. - Return the number of bytes placed in the buffer, or zero - upon failure. */ +/* Like native_encode_vector, but only encode the first COUNT elements. + The other arguments are as for native_encode_vector. */ static int -native_encode_vector (const_tree expr, unsigned char *ptr, int len, int off) +native_encode_vector_part (const_tree expr, unsigned char *ptr, int len, + int off, unsigned HOST_WIDE_INT count) { - unsigned HOST_WIDE_INT i, count; + unsigned HOST_WIDE_INT i; int size, offset; tree itype, elem; offset = 0; - if (!VECTOR_CST_NELTS (expr).is_constant (&count)) - return 0; itype = TREE_TYPE (TREE_TYPE (expr)); size = GET_MODE_SIZE (SCALAR_TYPE_MODE (itype)); for (i = 0; i < count; i++) @@ -7419,6 +7416,20 @@ native_encode_vector (const_tree expr, unsigned char *ptr, int len, int off) return offset; } +/* Subroutine of native_encode_expr. Encode the VECTOR_CST + specified by EXPR into the buffer PTR of length LEN bytes. + Return the number of bytes placed in the buffer, or zero + upon failure. */ + +static int +native_encode_vector (const_tree expr, unsigned char *ptr, int len, int off) +{ + unsigned HOST_WIDE_INT count; + if (!VECTOR_CST_NELTS (expr).is_constant (&count)) + return 0; + return native_encode_vector_part (expr, ptr, len, off, count); +} + /* Subroutine of native_encode_expr. Encode the STRING_CST specified by EXPR into the buffer PTR of length LEN bytes. @@ -7714,6 +7725,113 @@ can_native_interpret_type_p (tree type) } } +/* Read a vector of type TYPE from the target memory image given by BYTES, + starting at byte FIRST_BYTE. 
The vector is known to be encodable using + NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each, + and BYTES is known to have enough bytes to supply NPATTERNS * + NELTS_PER_PATTERN vector elements. Each element of BYTES contains + BITS_PER_UNIT bits and the bytes are in target memory order. + + Return the vector on success, otherwise return null. */ + +static tree +native_decode_vector_tree (tree type, vec bytes, + unsigned int first_byte, unsigned int npatterns, + unsigned int nelts_per_pattern) +{ + tree_vector_builder builder (type, npatterns, nelts_per_pattern); + tree elt_type = TREE_TYPE (type); + unsigned int elt_bits = tree_to_uhwi (TYPE_SIZE (elt_type)); + if (VECTOR_BOOLEAN_TYPE_P (type) && elt_bits <= BITS_PER_UNIT) + { + /* This is the only case in which elements can be smaller than a byte. + Element 0 is always in the lsb of the containing byte. */ + elt_bits = TYPE_PRECISION (elt_type); + for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) + { + unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits; + unsigned int byte_index = bit_index / BITS_PER_UNIT; + unsigned int lsb = bit_index % BITS_PER_UNIT; + builder.quick_push (bytes[byte_index] & (1 << lsb) + ? build_all_ones_cst (elt_type) + : build_zero_cst (elt_type)); + } + } + else + { + unsigned int elt_bytes = elt_bits / BITS_PER_UNIT; + for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) + { + tree elt = native_interpret_expr (elt_type, &bytes[first_byte], + elt_bytes); + if (!elt) + return NULL_TREE; + builder.quick_push (elt); + first_byte += elt_bytes; + } + } + return builder.build (); +} + +/* Try to view-convert VECTOR_CST EXPR to VECTOR_TYPE TYPE by operating + directly on the VECTOR_CST encoding, in a way that works for variable- + length vectors. Return the resulting VECTOR_CST on success or null + on failure. */ + +static tree +fold_view_convert_vector_encoding (tree type, tree expr) +{ + tree expr_type = TREE_TYPE (expr); + poly_uint64 type_bits, expr_bits; + if (!poly_int_tree_p (TYPE_SIZE (type), &type_bits) + || !poly_int_tree_p (TYPE_SIZE (expr_type), &expr_bits)) + return NULL_TREE; + + poly_uint64 type_units = TYPE_VECTOR_SUBPARTS (type); + poly_uint64 expr_units = TYPE_VECTOR_SUBPARTS (expr_type); + unsigned int type_elt_bits = vector_element_size (type_bits, type_units); + unsigned int expr_elt_bits = vector_element_size (expr_bits, expr_units); + + /* We can only preserve the semantics of a stepped pattern if the new + vector element is an integer of the same size. */ + if (VECTOR_CST_STEPPED_P (expr) + && (!INTEGRAL_TYPE_P (type) || type_elt_bits != expr_elt_bits)) + return NULL_TREE; + + /* The number of bits needed to encode one element from every pattern + of the original vector. */ + unsigned int expr_sequence_bits + = VECTOR_CST_NPATTERNS (expr) * expr_elt_bits; + + /* The number of bits needed to encode one element from every pattern + of the result. */ + unsigned int type_sequence_bits + = least_common_multiple (expr_sequence_bits, type_elt_bits); + + /* Don't try to read more bytes than are available, which can happen + for constant-sized vectors if TYPE has larger elements than EXPR_TYPE. + The general VIEW_CONVERT handling can cope with that case, so there's + no point complicating things here. 
*/ + unsigned int nelts_per_pattern = VECTOR_CST_NELTS_PER_PATTERN (expr); + unsigned int buffer_bytes = CEIL (nelts_per_pattern * type_sequence_bits, + BITS_PER_UNIT); + unsigned int buffer_bits = buffer_bytes * BITS_PER_UNIT; + if (known_gt (buffer_bits, expr_bits)) + return NULL_TREE; + + /* Get enough bytes of EXPR to form the new encoding. */ + auto_vec buffer (buffer_bytes); + buffer.quick_grow (buffer_bytes); + if (native_encode_vector_part (expr, buffer.address (), buffer_bytes, 0, + buffer_bits / expr_elt_bits) + != (int) buffer_bytes) + return NULL_TREE; + + /* Reencode the bytes as TYPE. */ + unsigned int type_npatterns = type_sequence_bits / type_elt_bits; + return native_decode_vector_tree (type, buffer, 0, type_npatterns, + nelts_per_pattern); +} /* Fold a VIEW_CONVERT_EXPR of a constant expression EXPR to type TYPE at compile-time. If we're unable to perform the conversion @@ -7730,6 +7848,10 @@ fold_view_convert_expr (tree type, tree expr) if (CHAR_BIT != 8 || BITS_PER_UNIT != 8) return NULL_TREE; + if (VECTOR_TYPE_P (type) && TREE_CODE (expr) == VECTOR_CST) + if (tree res = fold_view_convert_vector_encoding (type, expr)) + return res; + len = native_encode_expr (expr, buffer, sizeof (buffer)); if (len == 0) return NULL_TREE; @@ -9030,7 +9152,7 @@ vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts) selector. Return the folded VECTOR_CST or CONSTRUCTOR if successful, NULL_TREE otherwise. */ -static tree +tree fold_vec_perm (tree type, tree arg0, tree arg1, const vec_perm_indices &sel) { unsigned int i; @@ -9254,7 +9376,7 @@ tree_expr_nonzero_warnv_p (tree t, bool *strict_overflow_p) tree fndecl = get_callee_fndecl (t); if (!fndecl) return false; if (flag_delete_null_pointer_checks && !flag_check_new - && DECL_IS_OPERATOR_NEW (fndecl) + && DECL_IS_OPERATOR_NEW_P (fndecl) && !TREE_NOTHROW (fndecl)) return true; if (flag_delete_null_pointer_checks @@ -11778,7 +11900,10 @@ fold_ternary_loc (location_t loc, enum tree_code code, tree type, return NULL_TREE; case VEC_PERM_EXPR: - if (TREE_CODE (arg2) == VECTOR_CST) + /* Perform constant folding of BIT_INSERT_EXPR. */ + if (TREE_CODE (arg2) == VECTOR_CST + && TREE_CODE (op0) == VECTOR_CST + && TREE_CODE (op1) == VECTOR_CST) { /* Build a vector of integers from the tree mask. */ vec_perm_builder builder; @@ -11789,61 +11914,7 @@ fold_ternary_loc (location_t loc, enum tree_code code, tree type, poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); bool single_arg = (op0 == op1); vec_perm_indices sel (builder, single_arg ? 1 : 2, nelts); - - /* Check for cases that fold to OP0 or OP1 in their original - element order. */ - if (sel.series_p (0, 1, 0, 1)) - return op0; - if (sel.series_p (0, 1, nelts, 1)) - return op1; - - if (!single_arg) - { - if (sel.all_from_input_p (0)) - op1 = op0; - else if (sel.all_from_input_p (1)) - { - op0 = op1; - sel.rotate_inputs (1); - } - } - - if ((TREE_CODE (op0) == VECTOR_CST - || TREE_CODE (op0) == CONSTRUCTOR) - && (TREE_CODE (op1) == VECTOR_CST - || TREE_CODE (op1) == CONSTRUCTOR)) - { - tree t = fold_vec_perm (type, op0, op1, sel); - if (t != NULL_TREE) - return t; - } - - bool changed = (op0 == op1 && !single_arg); - - /* Generate a canonical form of the selector. */ - if (arg2 == op2 && sel.encoding () != builder) - { - /* Some targets are deficient and fail to expand a single - argument permutation while still allowing an equivalent - 2-argument version. 
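To make the sequence-bit arithmetic above concrete, here is a small worked example with made-up sizes (2 patterns of 32-bit elements, 3 elements per pattern, converted to 16-bit elements); std::lcm (C++17) stands in for least_common_multiple:

    #include <cstdio>
    #include <numeric>   // std::lcm, C++17

    int
    main ()
    {
      // Worked example of the re-encoding arithmetic: EXPR has 2 patterns of
      // 32-bit elements and 3 elements per pattern; TYPE has 16-bit elements.
      unsigned int expr_npatterns = 2, expr_elt_bits = 32;
      unsigned int nelts_per_pattern = 3, type_elt_bits = 16;

      unsigned int expr_sequence_bits = expr_npatterns * expr_elt_bits;   // 64
      unsigned int type_sequence_bits = std::lcm (expr_sequence_bits,
                                                  type_elt_bits);         // 64
      unsigned int type_npatterns = type_sequence_bits / type_elt_bits;   // 4
      unsigned int buffer_bytes
        = (nelts_per_pattern * type_sequence_bits + 7) / 8;               // 24

      printf ("npatterns=%u buffer_bytes=%u\n", type_npatterns, buffer_bytes);
      return 0;
    }

The 24-byte buffer holds 192 bits, i.e. the 6 original elements (2 patterns times 3 elements per pattern) needed to re-encode the value with 4 patterns of the new element type.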
*/ - if (sel.ninputs () == 2 - || can_vec_perm_const_p (TYPE_MODE (type), sel, false)) - op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel); - else - { - vec_perm_indices sel2 (builder, 2, nelts); - if (can_vec_perm_const_p (TYPE_MODE (type), sel2, false)) - op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel2); - else - /* Not directly supported with either encoding, - so use the preferred form. */ - op2 = vec_perm_indices_to_tree (TREE_TYPE (arg2), sel); - } - changed = true; - } - - if (changed) - return build3_loc (loc, VEC_PERM_EXPR, type, op0, op1, op2); + return fold_vec_perm (type, op0, op1, sel); } return NULL_TREE; diff --git a/gcc/fold-const.h b/gcc/fold-const.h index e2e662463..1d94e2894 100644 --- a/gcc/fold-const.h +++ b/gcc/fold-const.h @@ -100,6 +100,9 @@ extern tree fold_bit_and_mask (tree, tree, enum tree_code, tree, enum tree_code, tree, tree, tree, enum tree_code, tree, tree, tree *); extern tree fold_read_from_constant_string (tree); +#if GCC_VEC_PERN_INDICES_H +extern tree fold_vec_perm (tree, tree, tree, const vec_perm_indices &); +#endif extern bool wide_int_binop (wide_int &res, enum tree_code, const wide_int &arg1, const wide_int &arg2, signop, wi::overflow_type *); diff --git a/gcc/function-abi.cc b/gcc/function-abi.cc new file mode 100644 index 000000000..b4a183963 --- /dev/null +++ b/gcc/function-abi.cc @@ -0,0 +1,260 @@ +/* Information about fuunction binary interfaces. + Copyright (C) 2019 Free Software Foundation, Inc. + +This file is part of GCC + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "tree.h" +#include "regs.h" +#include "function-abi.h" +#include "varasm.h" +#include "cgraph.h" + +target_function_abi_info default_target_function_abi_info; +#if SWITCHABLE_TARGET +target_function_abi_info *this_target_function_abi_info + = &default_target_function_abi_info; +#endif + +/* Initialize a predefined function ABI with the given values of + ID and FULL_REG_CLOBBERS. */ + +void +predefined_function_abi::initialize (unsigned int id, + const_hard_reg_set full_reg_clobbers) +{ + m_id = id; + m_initialized = true; + m_full_reg_clobbers = full_reg_clobbers; + + /* Set up the value of m_full_and_partial_reg_clobbers. + + If the ABI specifies that part of a hard register R is call-clobbered, + we should be able to find a single-register mode M for which + targetm.hard_regno_call_part_clobbered (m_id, R, M) is true. + In other words, it shouldn't be the case that R can hold all + single-register modes across a call, but can't hold part of + a multi-register mode. + + If that assumption doesn't hold for a future target, we would need + to change the interface of TARGET_HARD_REGNO_CALL_PART_CLOBBERED so + that it tells us which registers in a multi-register value are + actually clobbered. 
*/ + m_full_and_partial_reg_clobbers = full_reg_clobbers; + for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) + { + machine_mode mode = (machine_mode) i; + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) + if (targetm.hard_regno_mode_ok (regno, mode) + && hard_regno_nregs (regno, mode) == 1 + && targetm.hard_regno_call_part_clobbered (m_id, regno, mode)) + SET_HARD_REG_BIT (m_full_and_partial_reg_clobbers, regno); + } + + /* For each mode MODE, work out which registers are unable to hold + any part of a MODE value across a call, i.e. those for which no + overlapping call-preserved (reg:MODE REGNO) exists. + + We assume that this can be flipped around to say that a call + preserves (reg:MODE REGNO) unless the register overlaps this set. + The usual reason for this being true is that if (reg:MODE REGNO) + contains a part-clobbered register, that register would be + part-clobbered regardless of which part of MODE it holds. + For example, if (reg:M 2) occupies two registers and if the + register 3 portion of it is part-clobbered, (reg:M 3) is usually + either invalid or also part-clobbered. */ + for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) + { + machine_mode mode = (machine_mode) i; + m_mode_clobbers[i] = m_full_and_partial_reg_clobbers; + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) + if (targetm.hard_regno_mode_ok (regno, mode) + && !overlaps_hard_reg_set_p (m_full_reg_clobbers, mode, regno) + && !targetm.hard_regno_call_part_clobbered (m_id, regno, mode)) + remove_from_hard_reg_set (&m_mode_clobbers[i], mode, regno); + } + + /* Check that the assumptions above actually hold, i.e. that testing + for single-register modes makes sense, and that overlap tests for + mode_clobbers work as expected. */ + if (flag_checking) + for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) + { + machine_mode mode = (machine_mode) i; + const_hard_reg_set all_clobbers = m_full_and_partial_reg_clobbers; + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) + if (targetm.hard_regno_mode_ok (regno, mode) + && !overlaps_hard_reg_set_p (m_full_reg_clobbers, mode, regno) + && targetm.hard_regno_call_part_clobbered (m_id, regno, mode)) + gcc_assert (overlaps_hard_reg_set_p (all_clobbers, mode, regno) + && overlaps_hard_reg_set_p (m_mode_clobbers[i], + mode, regno)); + } +} + +/* If the ABI has been initialized, add REGNO to the set of registers + that can be completely altered by a call. */ + +void +predefined_function_abi::add_full_reg_clobber (unsigned int regno) +{ + if (!m_initialized) + return; + + SET_HARD_REG_BIT (m_full_reg_clobbers, regno); + SET_HARD_REG_BIT (m_full_and_partial_reg_clobbers, regno); + for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) + SET_HARD_REG_BIT (m_mode_clobbers[i], regno); +} + +/* Return the set of registers that the caller of the recorded functions must + save in order to honor the requirements of CALLER_ABI. */ + +HARD_REG_SET +function_abi_aggregator:: +caller_save_regs (const function_abi &caller_abi) const +{ + HARD_REG_SET result; + CLEAR_HARD_REG_SET (result); + for (unsigned int abi_id = 0; abi_id < NUM_ABI_IDS; ++abi_id) + { + const predefined_function_abi &callee_abi = function_abis[abi_id]; + + /* Skip cases that clearly aren't problematic. */ + if (abi_id == caller_abi.id () + || hard_reg_set_empty_p (m_abi_clobbers[abi_id])) + continue; + + /* Collect the set of registers that can be "more clobbered" by + CALLEE_ABI than by CALLER_ABI. 
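The three register sets built by the constructor above can be illustrated with a toy four-register ABI (entirely hypothetical: r0/r1 fully clobbered, r2 preserves only its low half, r3 preserved), using plain bit masks instead of HARD_REG_SET:

    #include <cstdio>
    #include <initializer_list>

    enum { R0 = 1 << 0, R1 = 1 << 1, R2 = 1 << 2, R3 = 1 << 3 };
    enum mode { NARROW, WIDE };   // NARROW fits in r2's preserved half

    // Stand-in for targetm.hard_regno_call_part_clobbered.
    static bool
    part_clobbered_p (unsigned int reg_bit, mode m)
    {
      return reg_bit == R2 && m == WIDE;
    }

    int
    main ()
    {
      unsigned int full = R0 | R1;              // m_full_reg_clobbers
      unsigned int full_and_partial = full;     // m_full_and_partial_reg_clobbers
      unsigned int mode_clobbers[2];

      // Add registers that are part-clobbered in some mode.
      for (unsigned int reg = R0; reg <= R3; reg <<= 1)
        for (mode m : { NARROW, WIDE })
          if (part_clobbered_p (reg, m))
            full_and_partial |= reg;

      // For each mode, drop registers that are fully preserved in that mode.
      for (mode m : { NARROW, WIDE })
        {
          mode_clobbers[m] = full_and_partial;
          for (unsigned int reg = R0; reg <= R3; reg <<= 1)
            if (!(full & reg) && !part_clobbered_p (reg, m))
              mode_clobbers[m] &= ~reg;
        }

      printf ("full=%#x full_and_partial=%#x narrow=%#x wide=%#x\n",
              full, full_and_partial, mode_clobbers[NARROW], mode_clobbers[WIDE]);
      return 0;
    }

It prints full=0x3 full_and_partial=0x7 narrow=0x3 wide=0x7: r2 only conflicts with values whose mode spills into its call-clobbered half.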
*/ + HARD_REG_SET extra_clobbers; + CLEAR_HARD_REG_SET (extra_clobbers); + for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i) + { + machine_mode mode = (machine_mode) i; + extra_clobbers |= (callee_abi.mode_clobbers (mode) + & ~caller_abi.mode_clobbers (mode)); + } + + /* Restrict it to the set of registers that we actually saw + clobbers for (e.g. taking -fipa-ra into account). */ + result |= (extra_clobbers & m_abi_clobbers[abi_id]); + } + return result; +} + +/* Return the set of registers that cannot be used to hold a value of + mode MODE across the calls in a region described by ABIS and MASK, where: + + * Bit ID of ABIS is set if the region contains a call with + function_abi identifier ID. + + * MASK contains all the registers that are fully or partially + clobbered by calls in the region. + + This is not quite as accurate as testing each individual call, + but it's a close and conservatively-correct approximation. + It's much better for some targets than just using MASK. */ + +HARD_REG_SET +call_clobbers_in_region (unsigned int abis, const_hard_reg_set mask, + machine_mode mode) +{ + HARD_REG_SET result; + CLEAR_HARD_REG_SET (result); + for (unsigned int id = 0; abis; abis >>= 1, ++id) + if (abis & 1) + result |= function_abis[id].mode_clobbers (mode); + return result & mask; +} + +/* Return the predefined ABI used by functions with type TYPE. */ + +const predefined_function_abi & +fntype_abi (const_tree type) +{ + gcc_assert (FUNC_OR_METHOD_TYPE_P (type)); + if (targetm.calls.fntype_abi) + return targetm.calls.fntype_abi (type); + return default_function_abi; +} + +/* Return the ABI of function decl FNDECL. */ + +function_abi +fndecl_abi (const_tree fndecl) +{ + gcc_assert (TREE_CODE (fndecl) == FUNCTION_DECL); + const predefined_function_abi &base_abi = fntype_abi (TREE_TYPE (fndecl)); + + if (flag_ipa_ra && decl_binds_to_current_def_p (fndecl)) + if (cgraph_rtl_info *info = cgraph_node::rtl_info (fndecl)) + return function_abi (base_abi, info->function_used_regs); + + return base_abi; +} + +/* Return the ABI of the function called by INSN. */ + +function_abi +insn_callee_abi (const rtx_insn *insn) +{ + gcc_assert (insn && CALL_P (insn)); + + if (flag_ipa_ra) + if (tree fndecl = get_call_fndecl (insn)) + return fndecl_abi (fndecl); + + if (targetm.calls.insn_callee_abi) + return targetm.calls.insn_callee_abi (insn); + + return default_function_abi; +} + +/* Return the ABI of the function called by CALL_EXPR EXP. Return the + default ABI for erroneous calls. */ + +function_abi +expr_callee_abi (const_tree exp) +{ + gcc_assert (TREE_CODE (exp) == CALL_EXPR); + + if (tree fndecl = get_callee_fndecl (exp)) + return fndecl_abi (fndecl); + + tree callee = CALL_EXPR_FN (exp); + if (callee == error_mark_node) + return default_function_abi; + + tree type = TREE_TYPE (callee); + if (type == error_mark_node) + return default_function_abi; + + if (POINTER_TYPE_P (type)) + { + type = TREE_TYPE (type); + if (type == error_mark_node) + return default_function_abi; + } + + return fntype_abi (type); +} diff --git a/gcc/function-abi.h b/gcc/function-abi.h new file mode 100644 index 000000000..96a49dfbe --- /dev/null +++ b/gcc/function-abi.h @@ -0,0 +1,320 @@ +/* Information about fuunction binary interfaces. + Copyright (C) 2019 Free Software Foundation, Inc. 
+ +This file is part of GCC + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_FUNCTION_ABI_H +#define GCC_FUNCTION_ABI_H + +/* Most targets use the same ABI for all functions in a translation + unit, but some targets support interoperability between several ABIs. + Each such ABI has a unique 0-based identifier, with 0 always being + the default choice of ABI. + + NUM_ABI_IDS is the maximum number of such ABIs that GCC can handle at once. + A bitfield with this number of bits can represent any combinaion of the + supported ABIs. */ +const size_t NUM_ABI_IDS = 8; + +/* Information about one of the target's predefined ABIs. */ +class predefined_function_abi +{ +public: + /* A target-specific identifier for this ABI. The value must be in + the range [0, NUM_ABI_IDS - 1]. */ + unsigned int id () const { return m_id; } + + /* True if this ABI has been initialized. */ + bool initialized_p () const { return m_initialized; } + + /* Return true if a function call is allowed to alter every bit of + register REGNO, so that the register contains an arbitrary value + on return. If so, the register cannot hold any part of a value + that is live across a call. */ + bool + clobbers_full_reg_p (unsigned int regno) const + { + return TEST_HARD_REG_BIT (m_full_reg_clobbers, regno); + } + + /* Return true if a function call is allowed to alter some or all bits + of register REGNO. + + This is true whenever clobbers_full_reg_p (REGNO) is true. It is + also true if, for example, the ABI says that a call must preserve the + low 32 or 64 bits of REGNO, but can clobber the upper bits of REGNO. + In the latter case, it is possible for REGNO to hold values that + are live across a call, provided that the value occupies only the + call-preserved part of the register. */ + bool + clobbers_at_least_part_of_reg_p (unsigned int regno) const + { + return TEST_HARD_REG_BIT (m_full_and_partial_reg_clobbers, regno); + } + + /* Return true if a function call is allowed to clobber at least part + of (reg:MODE REGNO). If so, it is not possible for the register + as a whole to be live across a call. */ + bool + clobbers_reg_p (machine_mode mode, unsigned int regno) const + { + return overlaps_hard_reg_set_p (m_mode_clobbers[mode], mode, regno); + } + + /* Return the set of registers that a function call is allowed to + alter completely, so that the registers contain arbitrary values + on return. This doesn't include registers that a call can only + partly clobber (as per TARGET_HARD_REGNO_CALL_PART_CLOBBERED). + + These registers cannot hold any part of a value that is live across + a call. */ + HARD_REG_SET full_reg_clobbers () const { return m_full_reg_clobbers; } + + /* Return the set of registers that a function call is allowed to alter + to some degree. For example, if an ABI says that a call must preserve + the low 32 or 64 bits of a register R, but can clobber the upper bits + of R, R would be in this set but not in full_reg_clobbers (). + + This set is a superset of full_reg_clobbers (). 
It is possible for a + register in full_and_partial_reg_clobbers () & ~full_reg_clobbers () + to contain values that are live across a call, provided that the live + value only occupies the call-preserved part of the register. */ + HARD_REG_SET + full_and_partial_reg_clobbers () const + { + return m_full_and_partial_reg_clobbers; + } + + /* Return the set of registers that cannot be used to hold a value of + mode MODE across a function call. That is: + + (reg:REGNO MODE) + + might be clobbered by a call whenever: + + overlaps_hard_reg_set (mode_clobbers (MODE), MODE, REGNO) + + In allocation terms, the registers in the returned set conflict + with any value of mode MODE that is live across a call. */ + HARD_REG_SET + mode_clobbers (machine_mode mode) const + { + return m_mode_clobbers[mode]; + } + + void initialize (unsigned int, const_hard_reg_set); + void add_full_reg_clobber (unsigned int); + +private: + unsigned int m_id : NUM_ABI_IDS; + unsigned int m_initialized : 1; + HARD_REG_SET m_full_reg_clobbers; + HARD_REG_SET m_full_and_partial_reg_clobbers; + HARD_REG_SET m_mode_clobbers[NUM_MACHINE_MODES]; +}; + +/* Describes either a predefined ABI or the ABI of a particular function. + In the latter case, the ABI might make use of extra function-specific + information, such as for -fipa-ra. */ +class function_abi +{ +public: + /* Initialize the structure for a general function with the given ABI. */ + function_abi (const predefined_function_abi &base_abi) + : m_base_abi (&base_abi), + m_mask (base_abi.full_and_partial_reg_clobbers ()) {} + + /* Initialize the structure for a function that has the given ABI and + that is known not to clobber registers outside MASK. */ + function_abi (const predefined_function_abi &base_abi, + const_hard_reg_set mask) + : m_base_abi (&base_abi), m_mask (mask) {} + + /* The predefined ABI from which this ABI is derived. */ + const predefined_function_abi &base_abi () const { return *m_base_abi; } + + /* The target-specific identifier of the predefined ABI. */ + unsigned int id () const { return m_base_abi->id (); } + + /* See the corresponding predefined_function_abi functions for + details about the following functions. */ + + HARD_REG_SET + full_reg_clobbers () const + { + return m_mask & m_base_abi->full_reg_clobbers (); + } + + HARD_REG_SET + full_and_partial_reg_clobbers () const + { + return m_mask & m_base_abi->full_and_partial_reg_clobbers (); + } + + HARD_REG_SET + mode_clobbers (machine_mode mode) const + { + return m_mask & m_base_abi->mode_clobbers (mode); + } + + bool + clobbers_full_reg_p (unsigned int regno) const + { + return (TEST_HARD_REG_BIT (m_mask, regno) + & m_base_abi->clobbers_full_reg_p (regno)); + } + + bool + clobbers_at_least_part_of_reg_p (unsigned int regno) const + { + return (TEST_HARD_REG_BIT (m_mask, regno) + & m_base_abi->clobbers_at_least_part_of_reg_p (regno)); + } + + bool + clobbers_reg_p (machine_mode mode, unsigned int regno) const + { + return overlaps_hard_reg_set_p (mode_clobbers (mode), mode, regno); + } + + bool + operator== (const function_abi &other) const + { + return m_base_abi == other.m_base_abi && m_mask == other.m_mask; + } + + bool + operator!= (const function_abi &other) const + { + return !operator== (other); + } + +protected: + const predefined_function_abi *m_base_abi; + HARD_REG_SET m_mask; +}; + +/* This class collects information about the ABIs of functions that are + called in a particular region of code. It is mostly intended to be + used as a local variable during an IR walk. 
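The wrapper class above only ever intersects the base ABI's answer with m_mask, which is how the -fipa-ra register-usage information is folded in; a stripped-down analogue with a plain bitmask (illustrative names, not GCC's):

    #include <cstdint>

    // Minimal analogue of the function_abi wrapper above: a base ABI's
    // clobber set, narrowed by a mask of registers the callee is known
    // to touch (e.g. from -fipa-ra summaries).
    struct toy_function_abi
    {
      uint64_t base_clobbers;   // predefined ABI's full-and-partial clobbers
      uint64_t mask;            // registers the callee can actually clobber

      uint64_t clobbers () const { return base_clobbers & mask; }
      bool clobbers_reg_p (unsigned int regno) const
      { return (clobbers () >> regno) & 1; }
    };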
*/ +class function_abi_aggregator +{ +public: + function_abi_aggregator () : m_abi_clobbers () {} + + /* Record that the code region calls a function with the given ABI. */ + void + note_callee_abi (const function_abi &abi) + { + m_abi_clobbers[abi.id ()] |= abi.full_and_partial_reg_clobbers (); + } + + HARD_REG_SET caller_save_regs (const function_abi &) const; + +private: + HARD_REG_SET m_abi_clobbers[NUM_ABI_IDS]; +}; + +struct target_function_abi_info +{ + /* An array of all the target ABIs that are available in this + translation unit. Not all entries are used for all targets, + but the structures are relatively small, and using a fixed-size + array avoids extra indirection. + + There are various ways of getting an ABI descriptor: + + * fndecl_abi (FNDECL) is the ABI of function FNDECL. + + * fntype_abi (FNTYPE) is the ABI of a function with type FNTYPE. + + * crtl->abi is the ABI of the function that we are currently + compiling to rtl. + + * insn_callee_abi (INSN) is the ABI used by the target of call insn INSN. + + * eh_edge_abi is the "ABI" used when taking an EH edge from an + exception-throwing statement to an exception handler. Catching + exceptions from calls can be treated as an abnormal return from + those calls, and this ABI therefore describes the ABI of functions + on such an abnormal return. Statements that throw non-call + exceptions can be treated as being implicitly wrapped in a call + that has such an abnormal return. + + At present, no target needs to support more than one EH ABI. + + * function_abis[N] is the ABI with identifier N. This can be useful + when referring back to ABIs that have been collected by number in + a bitmask, such as after walking function calls in a particular + region of code. + + * default_function_abi refers specifically to the target's default + choice of ABI, regardless of which (if any) functions actually + use it. This ABI and data derived from it do *not* provide + globally conservatively-correct information, so it is only + useful in very specific circumstances. */ + predefined_function_abi x_function_abis[NUM_ABI_IDS]; +}; + +extern target_function_abi_info default_target_function_abi_info; +#if SWITCHABLE_TARGET +extern target_function_abi_info *this_target_function_abi_info; +#else +#define this_target_function_abi_info (&default_target_function_abi_info) +#endif + +/* See the comment above x_function_abis for when these macros should be used. + At present, eh_edge_abi is always the default ABI, but that could change + in future if a target needs it to. */ +#define function_abis \ + (this_target_function_abi_info->x_function_abis) +#define default_function_abi \ + (this_target_function_abi_info->x_function_abis[0]) +#define eh_edge_abi default_function_abi + +extern HARD_REG_SET call_clobbers_in_region (unsigned int, const_hard_reg_set, + machine_mode mode); + +/* Return true if (reg:MODE REGNO) might be clobbered by one of the + calls in a region described by ABIS and MASK, where: + + * Bit ID of ABIS is set if the region contains a call with + function_abi identifier ID. + + * MASK contains all the registers that are fully or partially + clobbered by calls in the region. + + This is not quite as accurate as testing each individual call, + but it's a close and conservatively-correct approximation. + It's much better for some targets than: + + overlaps_hard_reg_set_p (MASK, MODE, REGNO). 
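The region-level query described above can be modelled in a few lines (one mode and plain 64-bit masks, hypothetical names); it mirrors call_clobbers_in_region followed by the overlap test:

    #include <cstdint>

    // Toy model of call_clobbered_in_region_p above: ABIS is a bitmask of
    // ABI identifiers seen in a region, MASK the registers seen clobbered,
    // and mode_clobbers[] a per-ABI clobber set (one mode only, for brevity).
    static bool
    toy_call_clobbered_in_region_p (unsigned int abis, uint64_t mask,
                                    const uint64_t *mode_clobbers,
                                    unsigned int regno)
    {
      uint64_t clobbers = 0;
      for (unsigned int id = 0; abis; abis >>= 1, ++id)
        if (abis & 1)
          clobbers |= mode_clobbers[id];
      return ((clobbers & mask) >> regno) & 1;
    }

A value can stay in register REGNO across the whole region only if this returns false for the mode it occupies.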
*/ + +inline bool +call_clobbered_in_region_p (unsigned int abis, const_hard_reg_set mask, + machine_mode mode, unsigned int regno) +{ + HARD_REG_SET clobbers = call_clobbers_in_region (abis, mask, mode); + return overlaps_hard_reg_set_p (clobbers, mode, regno); +} + +extern const predefined_function_abi &fntype_abi (const_tree); +extern function_abi fndecl_abi (const_tree); +extern function_abi insn_callee_abi (const rtx_insn *); +extern function_abi expr_callee_abi (const_tree); + +#endif diff --git a/gcc/function.c b/gcc/function.c index acf9f9e60..6d5574244 100644 --- a/gcc/function.c +++ b/gcc/function.c @@ -79,6 +79,7 @@ along with GCC; see the file COPYING3. If not see #include "attribs.h" #include "gimple.h" #include "options.h" +#include "function-abi.h" /* So we can assign to cfun in this file. */ #undef cfun @@ -2121,7 +2122,7 @@ aggregate_value_p (const_tree exp, const_tree fntype) regno = REGNO (reg); nregs = hard_regno_nregs (regno, TYPE_MODE (type)); for (i = 0; i < nregs; i++) - if (! call_used_regs[regno + i]) + if (! call_used_or_fixed_reg_p (regno + i)) return 1; return 0; @@ -2454,13 +2455,15 @@ assign_parm_find_data_types (struct assign_parm_data_all *all, tree parm, passed_type = TREE_TYPE (first_field (passed_type)); /* See if this arg was passed by invisible reference. */ - if (pass_by_reference (&all->args_so_far_v, passed_mode, - passed_type, data->named_arg)) - { - passed_type = nominal_type = build_pointer_type (passed_type); - data->passed_pointer = true; - passed_mode = nominal_mode = TYPE_MODE (nominal_type); - } + { + function_arg_info arg (passed_type, passed_mode, data->named_arg); + if (apply_pass_by_reference_rules (&all->args_so_far_v, arg)) + { + passed_type = nominal_type = arg.type; + data->passed_pointer = true; + passed_mode = nominal_mode = arg.mode; + } + } /* Find mode as it is passed by the ABI. */ unsignedp = TYPE_UNSIGNED (passed_type); @@ -2483,9 +2486,9 @@ assign_parms_setup_varargs (struct assign_parm_data_all *all, { int varargs_pretend_bytes = 0; - targetm.calls.setup_incoming_varargs (all->args_so_far, - data->promoted_mode, - data->passed_type, + function_arg_info last_named_arg (data->passed_type, data->promoted_mode, + /*named=*/true); + targetm.calls.setup_incoming_varargs (all->args_so_far, last_named_arg, &varargs_pretend_bytes, no_rtl); /* If the back-end has requested extra stack space, record how much is @@ -2515,11 +2518,9 @@ assign_parm_find_entry_rtl (struct assign_parm_data_all *all, targetm.calls.warn_parameter_passing_abi (all->args_so_far, data->passed_type); - entry_parm = targetm.calls.function_incoming_arg (all->args_so_far, - data->promoted_mode, - data->passed_type, - data->named_arg); - + function_arg_info arg (data->passed_type, data->promoted_mode, + data->named_arg); + entry_parm = targetm.calls.function_incoming_arg (all->args_so_far, arg); if (entry_parm == 0) data->promoted_mode = data->passed_mode; @@ -2542,27 +2543,26 @@ assign_parm_find_entry_rtl (struct assign_parm_data_all *all, if (targetm.calls.pretend_outgoing_varargs_named (all->args_so_far)) { rtx tem; + function_arg_info named_arg (data->passed_type, data->promoted_mode, + /*named=*/true); tem = targetm.calls.function_incoming_arg (all->args_so_far, - data->promoted_mode, - data->passed_type, true); + named_arg); in_regs = tem != NULL; } } /* If this parameter was passed both in registers and in the stack, use the copy on the stack. 
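The function.c hunks above replace the (type, mode, named) argument triples with a single function_arg_info object. Judging only from the two constructors visible in this diff, a minimal standalone stand-in could look like the following; the real GCC class has more members and helpers, and its two-argument constructor presumably derives the mode from the type:

    // Illustrative stand-in for the function_arg_info bundle used above;
    // the types below are placeholders, not GCC's tree/machine_mode.
    typedef void *tree_t;
    typedef int machine_mode_t;

    struct toy_function_arg_info
    {
      toy_function_arg_info (tree_t type, machine_mode_t mode, bool named)
        : type (type), mode (mode), named (named) {}
      // The real class fills the mode in from the type here (assumption).
      toy_function_arg_info (tree_t type, bool named)
        : type (type), mode (0), named (named) {}

      tree_t type;
      machine_mode_t mode;
      bool named;
    };

Bundling the three values keeps the target hooks (function_incoming_arg, function_arg_advance, arg_partial_bytes, must_pass_in_stack and so on) down to a single argument-description parameter, as the converted calls above show.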
*/ - if (targetm.calls.must_pass_in_stack (data->promoted_mode, - data->passed_type)) + if (targetm.calls.must_pass_in_stack (arg)) entry_parm = 0; if (entry_parm) { int partial; - partial = targetm.calls.arg_partial_bytes (all->args_so_far, - data->promoted_mode, - data->passed_type, - data->named_arg); + function_arg_info arg (data->passed_type, data->promoted_mode, + data->named_arg); + partial = targetm.calls.arg_partial_bytes (all->args_so_far, arg); data->partial = partial; /* The caller might already have allocated stack space for the @@ -3226,8 +3226,7 @@ assign_parm_setup_reg (struct assign_parm_data_all *all, tree parm, for (insn = insns; insn && moved; insn = NEXT_INSN (insn)) { if (INSN_P (insn)) - note_stores (PATTERN (insn), record_hard_reg_sets, - &hardregs); + note_stores (insn, record_hard_reg_sets, &hardregs); if (!hard_reg_set_empty_p (hardregs)) moved = false; } @@ -3647,8 +3646,9 @@ assign_parms (tree fndecl) assign_parms_setup_varargs (&all, &data, false); /* Update info on where next arg arrives in registers. */ - targetm.calls.function_arg_advance (all.args_so_far, data.promoted_mode, - data.passed_type, data.named_arg); + function_arg_info arg (data.passed_type, data.promoted_mode, + data.named_arg); + targetm.calls.function_arg_advance (all.args_so_far, arg); } if (targetm.calls.split_complex_arg) @@ -3835,8 +3835,9 @@ gimplify_parameters (gimple_seq *cleanup) continue; /* Update info on where next arg arrives in registers. */ - targetm.calls.function_arg_advance (all.args_so_far, data.promoted_mode, - data.passed_type, data.named_arg); + function_arg_info arg (data.passed_type, data.promoted_mode, + data.named_arg); + targetm.calls.function_arg_advance (all.args_so_far, arg); /* ??? Once upon a time variable_size stuffed parameter list SAVE_EXPRs (amongst others) onto a pending sizes list. 
This @@ -3854,8 +3855,8 @@ gimplify_parameters (gimple_seq *cleanup) if (data.passed_pointer) { tree type = TREE_TYPE (data.passed_type); - if (reference_callee_copied (&all.args_so_far_v, TYPE_MODE (type), - type, data.named_arg)) + function_arg_info orig_arg (type, data.named_arg); + if (reference_callee_copied (&all.args_so_far_v, orig_arg)) { tree local, t; @@ -4823,6 +4824,12 @@ static void prepare_function_start (void) { gcc_assert (!get_last_insn ()); + + if (in_dummy_function) + crtl->abi = &default_function_abi; + else + crtl->abi = &fndecl_abi (cfun->decl).base_abi (); + init_temp_slots (); init_emit (); init_varasm_status (); diff --git a/gcc/fwprop.c b/gcc/fwprop.c index f2966fada..e6f375271 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -740,7 +740,7 @@ propagate_rtx (rtx x, machine_mode mode, rtx old_rtx, rtx new_rtx, || CONSTANT_P (new_rtx) || (GET_CODE (new_rtx) == SUBREG && REG_P (SUBREG_REG (new_rtx)) - && !paradoxical_subreg_p (mode, GET_MODE (SUBREG_REG (new_rtx))))) + && !paradoxical_subreg_p (new_rtx))) flags |= PR_CAN_APPEAR; if (!varying_mem_p (new_rtx)) flags |= PR_HANDLE_MEM; diff --git a/gcc/gcc.c b/gcc/gcc.c index 4f57765b0..1a5ad7db3 100644 --- a/gcc/gcc.c +++ b/gcc/gcc.c @@ -4041,6 +4041,10 @@ driver_handle_option (struct gcc_options *opts, diagnostic_color_init (dc, value); break; + case OPT_fdiagnostics_urls_: + diagnostic_urls_init (dc, value); + break; + case OPT_fdiagnostics_format_: diagnostic_output_format_init (dc, (enum diagnostics_output_format)value); @@ -7438,6 +7442,7 @@ driver::global_initializations () diagnostic_initialize (global_dc, 0); diagnostic_color_init (global_dc); + diagnostic_urls_init (global_dc); #ifdef GCC_DRIVER_HOST_INITIALIZATION /* Perform host dependent initialization when needed. */ diff --git a/gcc/gcse-common.c b/gcc/gcse-common.c index e6e4b642b..55148623f 100644 --- a/gcc/gcse-common.c +++ b/gcc/gcse-common.c @@ -89,7 +89,7 @@ record_last_mem_set_info_common (rtx_insn *insn, struct gcse_note_stores_info data; data.insn = insn; data.canon_mem_list = canon_modify_mem_list; - note_stores (PATTERN (insn), canon_list_insert, (void*) &data); + note_stores (insn, canon_list_insert, (void*) &data); } } diff --git a/gcc/gcse.c b/gcc/gcse.c index 7fbdd6750..373ba7a16 100644 --- a/gcc/gcse.c +++ b/gcc/gcse.c @@ -1049,7 +1049,7 @@ load_killed_in_block_p (const_basic_block bb, int uid_limit, const_rtx x, note_stores to examine each hunk of memory that is modified. */ mci.mem = x; mci.conflict = false; - note_stores (PATTERN (setter), mems_conflict_for_gcse_p, &mci); + note_stores (setter, mems_conflict_for_gcse_p, &mci); if (mci.conflict) return 1; } @@ -1537,7 +1537,7 @@ compute_hash_table_work (struct gcse_hash_table_d *table) record_last_mem_set_info (insn); } - note_stores (PATTERN (insn), record_last_set_info, insn); + note_stores (insn, record_last_set_info, insn); } /* The next pass builds the hash table. */ @@ -2415,7 +2415,7 @@ single_set_gcse (rtx_insn *insn) s.insn = insn; s.nsets = 0; - note_stores (pattern, record_set_data, &s); + note_pattern_stores (pattern, record_set_data, &s); /* Considered invariant insns have exactly one set. 
*/ gcc_assert (s.nsets == 1); diff --git a/gcc/genconfig.c b/gcc/genconfig.c index 194fe950d..6f914b1e4 100644 --- a/gcc/genconfig.c +++ b/gcc/genconfig.c @@ -72,7 +72,6 @@ walk_insn_part (rtx part, int recog_p, int non_pc_set_src) switch (code) { case CLOBBER: - case CLOBBER_HIGH: clobbers_seen_this_insn++; break; diff --git a/gcc/genemit.c b/gcc/genemit.c index 83f86a35c..e03af01f2 100644 --- a/gcc/genemit.c +++ b/gcc/genemit.c @@ -169,15 +169,6 @@ gen_exp (rtx x, enum rtx_code subroutine_type, char *used, md_rtx_info *info) return; } break; - case CLOBBER_HIGH: - if (!REG_P (XEXP (x, 0))) - error ("CLOBBER_HIGH argument is not a register expr, at %s:%d", - info->loc.filename, info->loc.lineno); - printf ("gen_hard_reg_clobber_high (%smode, %i)", - GET_MODE_NAME (GET_MODE (XEXP (x, 0))), - REGNO (XEXP (x, 0))); - return; - break; case CC0: printf ("cc0_rtx"); return; @@ -343,8 +334,7 @@ gen_insn (md_rtx_info *info) for (i = XVECLEN (insn, 1) - 1; i > 0; i--) { - if (GET_CODE (XVECEXP (insn, 1, i)) != CLOBBER - && GET_CODE (XVECEXP (insn, 1, i)) != CLOBBER_HIGH) + if (GET_CODE (XVECEXP (insn, 1, i)) != CLOBBER) break; if (REG_P (XEXP (XVECEXP (insn, 1, i), 0))) @@ -811,42 +801,45 @@ handle_overloaded_code_for (overloaded_name *oname) static void handle_overloaded_gen (overloaded_name *oname) { + unsigned HOST_WIDE_INT seen = 0; /* All patterns must have the same number of operands. */ - pattern_stats stats; - get_pattern_stats (&stats, XVEC (oname->first_instance->insn, 1)); for (overloaded_instance *instance = oname->first_instance->next; instance; instance = instance->next) { - pattern_stats stats2; - get_pattern_stats (&stats2, XVEC (instance->insn, 1)); - if (stats.num_generator_args != stats2.num_generator_args) - fatal_at (get_file_location (instance->insn), - "inconsistent number of operands for '%s'; " - "this instance has %d, but previous instances had %d", - oname->name, stats2.num_generator_args, - stats.num_generator_args); + pattern_stats stats; + get_pattern_stats (&stats, XVEC (instance->insn, 1)); + unsigned HOST_WIDE_INT mask + = HOST_WIDE_INT_1U << stats.num_generator_args; + if (seen & mask) + continue; + + seen |= mask; + + /* Print the function prototype. */ + printf ("\nrtx\nmaybe_gen_%s (", oname->name); + print_overload_arguments (oname); + for (int i = 0; i < stats.num_generator_args; ++i) + printf (", rtx x%d", i); + printf (")\n{\n"); + + /* Use maybe_code_for_*, instead of duplicating the selection + logic here. */ + printf (" insn_code code = maybe_code_for_%s (", oname->name); + for (unsigned int i = 0; i < oname->arg_types.length (); ++i) + printf ("%sarg%d", i == 0 ? "" : ", ", i); + printf (");\n" + " if (code != CODE_FOR_nothing)\n" + " {\n" + " gcc_assert (insn_data[code].n_generator_args == %d);\n" + " return GEN_FCN (code) (", stats.num_generator_args); + for (int i = 0; i < stats.num_generator_args; ++i) + printf ("%sx%d", i == 0 ? "" : ", ", i); + printf (");\n" + " }\n" + " else\n" + " return NULL_RTX;\n" + "}\n"); } - - /* Print the function prototype. */ - printf ("\nrtx\nmaybe_gen_%s (", oname->name); - print_overload_arguments (oname); - for (int i = 0; i < stats.num_generator_args; ++i) - printf (", rtx x%d", i); - printf (")\n{\n"); - - /* Use maybe_code_for_*, instead of duplicating the selection logic here. */ - printf (" insn_code code = maybe_code_for_%s (", oname->name); - for (unsigned int i = 0; i < oname->arg_types.length (); ++i) - printf ("%sarg%d", i == 0 ? 
"" : ", ", i); - printf (");\n" - " if (code != CODE_FOR_nothing)\n" - " return GEN_FCN (code) ("); - for (int i = 0; i < stats.num_generator_args; ++i) - printf ("%sx%d", i == 0 ? "" : ", ", i); - printf (");\n" - " else\n" - " return NULL_RTX;\n" - "}\n"); } int diff --git a/gcc/generic-match-head.c b/gcc/generic-match-head.c index 3478cf59f..e9ef343c9 100644 --- a/gcc/generic-match-head.c +++ b/gcc/generic-match-head.c @@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. If not see #include "gimple.h" #include "ssa.h" #include "cgraph.h" +#include "vec-perm-indices.h" #include "fold-const.h" #include "fold-const-call.h" #include "stor-layout.h" diff --git a/gcc/genmodes.c b/gcc/genmodes.c index f33eefa24..95522d6b5 100644 --- a/gcc/genmodes.c +++ b/gcc/genmodes.c @@ -53,6 +53,7 @@ struct mode_data const char *name; /* printable mode name -- SI, not SImode */ enum mode_class cl; /* this mode class */ + unsigned int order; /* top-level sorting order */ unsigned int precision; /* size in bits, equiv to TYPE_PRECISION */ unsigned int bytesize; /* storage size in addressable units */ unsigned int ncomponents; /* number of subunits */ @@ -85,7 +86,7 @@ static struct mode_data *void_mode; static const struct mode_data blank_mode = { 0, "", MAX_MODE_CLASS, - -1U, -1U, -1U, -1U, + 0, -1U, -1U, -1U, -1U, 0, 0, 0, 0, 0, 0, "", 0, 0, 0, 0, false, false, 0 }; @@ -484,14 +485,15 @@ make_complex_modes (enum mode_class cl, } } -/* For all modes in class CL, construct vector modes of width - WIDTH, having as many components as necessary. */ -#define VECTOR_MODES_WITH_PREFIX(PREFIX, C, W) \ - make_vector_modes (MODE_##C, #PREFIX, W, __FILE__, __LINE__) -#define VECTOR_MODES(C, W) VECTOR_MODES_WITH_PREFIX (V, C, W) +/* For all modes in class CL, construct vector modes of width WIDTH, + having as many components as necessary. ORDER is the sorting order + of the mode, with smaller numbers indicating a higher priority. */ +#define VECTOR_MODES_WITH_PREFIX(PREFIX, C, W, ORDER) \ + make_vector_modes (MODE_##C, #PREFIX, W, ORDER, __FILE__, __LINE__) +#define VECTOR_MODES(C, W) VECTOR_MODES_WITH_PREFIX (V, C, W, 0) static void ATTRIBUTE_UNUSED make_vector_modes (enum mode_class cl, const char *prefix, unsigned int width, - const char *file, unsigned int line) + unsigned int order, const char *file, unsigned int line) { struct mode_data *m; struct mode_data *v; @@ -530,6 +532,7 @@ make_vector_modes (enum mode_class cl, const char *prefix, unsigned int width, } v = new_mode (vclass, xstrdup (buf), file, line); + v->order = order; v->component = m; v->ncomponents = ncomponents; } @@ -832,6 +835,11 @@ cmp_modes (const void *a, const void *b) const struct mode_data *const m = *(const struct mode_data *const*)a; const struct mode_data *const n = *(const struct mode_data *const*)b; + if (m->order > n->order) + return 1; + else if (m->order < n->order) + return -1; + if (m->bytesize > n->bytesize) return 1; else if (m->bytesize < n->bytesize) diff --git a/gcc/genopinit.c b/gcc/genopinit.c index ea4c3ce01..1dd1d82d0 100644 --- a/gcc/genopinit.c +++ b/gcc/genopinit.c @@ -134,31 +134,43 @@ handle_overloaded_code_for (FILE *file, overloaded_name *oname) static void handle_overloaded_gen (FILE *file, overloaded_name *oname) { - pattern_stats stats; - get_pattern_stats (&stats, XVEC (oname->first_instance->insn, 1)); - - fprintf (file, "\nextern rtx maybe_gen_%s (", oname->name); - for (unsigned int i = 0; i < oname->arg_types.length (); ++i) - fprintf (file, "%s%s", i == 0 ? 
"" : ", ", oname->arg_types[i]); - for (int i = 0; i < stats.num_generator_args; ++i) - fprintf (file, ", rtx"); - fprintf (file, ");\n"); - - fprintf (file, "inline rtx\ngen_%s (", oname->name); - for (unsigned int i = 0; i < oname->arg_types.length (); ++i) - fprintf (file, "%s%s arg%d", i == 0 ? "" : ", ", oname->arg_types[i], i); - for (int i = 0; i < stats.num_generator_args; ++i) - fprintf (file, ", rtx x%d", i); - fprintf (file, ")\n{\n rtx res = maybe_gen_%s (", oname->name); - for (unsigned int i = 0; i < oname->arg_types.length (); ++i) - fprintf (file, "%sarg%d", i == 0 ? "" : ", ", i); - for (int i = 0; i < stats.num_generator_args; ++i) - fprintf (file, ", x%d", i); - fprintf (file, - ");\n" - " gcc_assert (res);\n" - " return res;\n" - "}\n"); + unsigned HOST_WIDE_INT seen = 0; + for (overloaded_instance *instance = oname->first_instance->next; + instance; instance = instance->next) + { + pattern_stats stats; + get_pattern_stats (&stats, XVEC (instance->insn, 1)); + unsigned HOST_WIDE_INT mask + = HOST_WIDE_INT_1U << stats.num_generator_args; + if (seen & mask) + continue; + + seen |= mask; + + fprintf (file, "\nextern rtx maybe_gen_%s (", oname->name); + for (unsigned int i = 0; i < oname->arg_types.length (); ++i) + fprintf (file, "%s%s", i == 0 ? "" : ", ", oname->arg_types[i]); + for (int i = 0; i < stats.num_generator_args; ++i) + fprintf (file, ", rtx"); + fprintf (file, ");\n"); + + fprintf (file, "inline rtx\ngen_%s (", oname->name); + for (unsigned int i = 0; i < oname->arg_types.length (); ++i) + fprintf (file, "%s%s arg%d", i == 0 ? "" : ", ", + oname->arg_types[i], i); + for (int i = 0; i < stats.num_generator_args; ++i) + fprintf (file, ", rtx x%d", i); + fprintf (file, ")\n{\n rtx res = maybe_gen_%s (", oname->name); + for (unsigned int i = 0; i < oname->arg_types.length (); ++i) + fprintf (file, "%sarg%d", i == 0 ? "" : ", ", i); + for (int i = 0; i < stats.num_generator_args; ++i) + fprintf (file, ", x%d", i); + fprintf (file, + ");\n" + " gcc_assert (res);\n" + " return res;\n" + "}\n"); + } } int diff --git a/gcc/genrecog.c b/gcc/genrecog.c index 90e2508fa..ec921702a 100644 --- a/gcc/genrecog.c +++ b/gcc/genrecog.c @@ -718,7 +718,6 @@ validate_pattern (rtx pattern, md_rtx_info *info, rtx set, int set_code) } case CLOBBER: - case CLOBBER_HIGH: validate_pattern (SET_DEST (pattern), info, pattern, '='); return; @@ -5295,7 +5294,7 @@ remove_clobbers (acceptance_type *acceptance_ptr, rtx *pattern_ptr) for (i = XVECLEN (pattern, 0); i > 0; i--) { rtx x = XVECEXP (pattern, 0, i - 1); - if ((GET_CODE (x) != CLOBBER && GET_CODE (x) != CLOBBER_HIGH) + if (GET_CODE (x) != CLOBBER || (!REG_P (XEXP (x, 0)) && GET_CODE (XEXP (x, 0)) != MATCH_SCRATCH)) break; diff --git a/gcc/gensupport.c b/gcc/gensupport.c index 31a67d5ad..ab6a523dd 100644 --- a/gcc/gensupport.c +++ b/gcc/gensupport.c @@ -70,8 +70,8 @@ struct queue_elem rtx data; file_location loc; struct queue_elem *next; - /* In a DEFINE_INSN that came from a DEFINE_INSN_AND_SPLIT, SPLIT - points to the generated DEFINE_SPLIT. */ + /* In a DEFINE_INSN that came from a DEFINE_INSN_AND_SPLIT or + DEFINE_INSN_AND_REWRITE, SPLIT points to the generated DEFINE_SPLIT. */ struct queue_elem *split; }; @@ -485,6 +485,65 @@ remove_constraints (rtx part) } } +/* Recursively replace MATCH_OPERANDs with MATCH_DUPs and MATCH_OPERATORs + with MATCH_OP_DUPs in X. 
*/ + +static rtx +replace_operands_with_dups (rtx x) +{ + if (x == 0) + return x; + + rtx newx; + if (GET_CODE (x) == MATCH_OPERAND) + { + newx = rtx_alloc (MATCH_DUP); + XINT (newx, 0) = XINT (x, 0); + x = newx; + } + else if (GET_CODE (x) == MATCH_OPERATOR) + { + newx = rtx_alloc (MATCH_OP_DUP); + XINT (newx, 0) = XINT (x, 0); + XVEC (newx, 1) = XVEC (x, 2); + x = newx; + } + else + newx = shallow_copy_rtx (x); + + const char *format_ptr = GET_RTX_FORMAT (GET_CODE (x)); + for (int i = 0; i < GET_RTX_LENGTH (GET_CODE (x)); i++) + switch (*format_ptr++) + { + case 'e': + case 'u': + XEXP (newx, i) = replace_operands_with_dups (XEXP (x, i)); + break; + case 'E': + if (XVEC (x, i) != NULL) + { + XVEC (newx, i) = rtvec_alloc (XVECLEN (x, i)); + for (int j = 0; j < XVECLEN (x, i); j++) + XVECEXP (newx, i, j) + = replace_operands_with_dups (XVECEXP (x, i, j)); + } + break; + } + return newx; +} + +/* Convert matching pattern VEC from a DEFINE_INSN_AND_REWRITE into + a sequence that should be generated by the splitter. */ + +static rtvec +gen_rewrite_sequence (rtvec vec) +{ + rtvec new_vec = rtvec_alloc (1); + rtx x = add_implicit_parallel (vec); + RTVEC_ELT (new_vec, 0) = replace_operands_with_dups (x); + return new_vec; +} + /* Process a top level rtx in some way, queuing as appropriate. */ static void @@ -527,6 +586,7 @@ process_rtx (rtx desc, file_location loc) break; case DEFINE_INSN_AND_SPLIT: + case DEFINE_INSN_AND_REWRITE: { const char *split_cond; rtx split; @@ -534,6 +594,7 @@ process_rtx (rtx desc, file_location loc) int i; struct queue_elem *insn_elem; struct queue_elem *split_elem; + int split_code = (GET_CODE (desc) == DEFINE_INSN_AND_REWRITE ? 5 : 6); /* Create a split with values from the insn_and_split. */ split = rtx_alloc (DEFINE_SPLIT); @@ -555,12 +616,17 @@ process_rtx (rtx desc, file_location loc) split_cond = rtx_reader_ptr->join_c_conditions (XSTR (desc, 2), split_cond + 2); } + else if (GET_CODE (desc) == DEFINE_INSN_AND_REWRITE) + error_at (loc, "the rewrite condition must start with `&&'"); XSTR (split, 1) = split_cond; - XVEC (split, 2) = XVEC (desc, 5); - XSTR (split, 3) = XSTR (desc, 6); + if (GET_CODE (desc) == DEFINE_INSN_AND_REWRITE) + XVEC (split, 2) = gen_rewrite_sequence (XVEC (desc, 1)); + else + XVEC (split, 2) = XVEC (desc, 5); + XSTR (split, 3) = XSTR (desc, split_code); /* Fix up the DEFINE_INSN. */ - attr = XVEC (desc, 7); + attr = XVEC (desc, split_code + 1); PUT_CODE (desc, DEFINE_INSN); XVEC (desc, 4) = attr; diff --git a/gcc/gimple-expr.c b/gcc/gimple-expr.c index b0c9f9b67..4ba194ff4 100644 --- a/gcc/gimple-expr.c +++ b/gcc/gimple-expr.c @@ -37,6 +37,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-pass.h" #include "stringpool.h" #include "attribs.h" +#include "target.h" /* ----- Type related ----- */ @@ -147,10 +148,12 @@ useless_type_conversion_p (tree outer_type, tree inner_type) /* Recurse for vector types with the same number of subparts. 
*/ else if (TREE_CODE (inner_type) == VECTOR_TYPE - && TREE_CODE (outer_type) == VECTOR_TYPE - && TYPE_PRECISION (inner_type) == TYPE_PRECISION (outer_type)) - return useless_type_conversion_p (TREE_TYPE (outer_type), - TREE_TYPE (inner_type)); + && TREE_CODE (outer_type) == VECTOR_TYPE) + return (known_eq (TYPE_VECTOR_SUBPARTS (inner_type), + TYPE_VECTOR_SUBPARTS (outer_type)) + && useless_type_conversion_p (TREE_TYPE (outer_type), + TREE_TYPE (inner_type)) + && targetm.compatible_vector_types_p (inner_type, outer_type)); else if (TREE_CODE (inner_type) == ARRAY_TYPE && TREE_CODE (outer_type) == ARRAY_TYPE) diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c index d33d93242..bbee8eb46 100644 --- a/gcc/gimple-fold.c +++ b/gcc/gimple-fold.c @@ -631,14 +631,7 @@ replace_call_with_call_and_fold (gimple_stmt_iterator *gsi, gimple *repl) gimple *stmt = gsi_stmt (*gsi); gimple_call_set_lhs (repl, gimple_call_lhs (stmt)); gimple_set_location (repl, gimple_location (stmt)); - if (gimple_vdef (stmt) - && TREE_CODE (gimple_vdef (stmt)) == SSA_NAME) - { - gimple_set_vdef (repl, gimple_vdef (stmt)); - SSA_NAME_DEF_STMT (gimple_vdef (repl)) = repl; - } - if (gimple_vuse (stmt)) - gimple_set_vuse (repl, gimple_vuse (stmt)); + gimple_move_vops (repl, stmt); gsi_replace (gsi, repl, false); fold_stmt (gsi); } @@ -822,11 +815,7 @@ gimple_fold_builtin_memory_op (gimple_stmt_iterator *gsi, = gimple_build_assign (fold_build2 (MEM_REF, desttype, dest, off0), srcmem); - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); - if (gimple_vdef (new_stmt) - && TREE_CODE (gimple_vdef (new_stmt)) == SSA_NAME) - SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; + gimple_move_vops (new_stmt, stmt); if (!lhs) { gsi_replace (gsi, new_stmt, false); @@ -1087,11 +1076,7 @@ gimple_fold_builtin_memory_op (gimple_stmt_iterator *gsi, = gimple_build_assign (fold_build2 (MEM_REF, desttype, dest, off0), fold_build2 (MEM_REF, srctype, src, off0)); set_vop_and_replace: - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); - if (gimple_vdef (new_stmt) - && TREE_CODE (gimple_vdef (new_stmt)) == SSA_NAME) - SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; + gimple_move_vops (new_stmt, stmt); if (!lhs) { gsi_replace (gsi, new_stmt, false); @@ -1264,13 +1249,7 @@ gimple_fold_builtin_memset (gimple_stmt_iterator *gsi, tree c, tree len) var = fold_build2 (MEM_REF, etype, dest, build_int_cst (ptr_type_node, 0)); gimple *store = gimple_build_assign (var, build_int_cst_type (etype, cval)); - gimple_set_vuse (store, gimple_vuse (stmt)); - tree vdef = gimple_vdef (stmt); - if (vdef && TREE_CODE (vdef) == SSA_NAME) - { - gimple_set_vdef (store, gimple_vdef (stmt)); - SSA_NAME_DEF_STMT (gimple_vdef (stmt)) = store; - } + gimple_move_vops (store, stmt); gsi_insert_before (gsi, store, GSI_SAME_STMT); if (gimple_call_lhs (stmt)) { @@ -2979,11 +2958,7 @@ gimple_fold_builtin_stpcpy (gimple_stmt_iterator *gsi) tem, build_int_cst (size_type_node, 1)); gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); gcall *repl = gimple_build_call (fn, 3, dest, src, lenp1); - gimple_set_vuse (repl, gimple_vuse (stmt)); - gimple_set_vdef (repl, gimple_vdef (stmt)); - if (gimple_vdef (repl) - && TREE_CODE (gimple_vdef (repl)) == SSA_NAME) - SSA_NAME_DEF_STMT (gimple_vdef (repl)) = repl; + gimple_move_vops (repl, stmt); gsi_insert_before (gsi, repl, GSI_SAME_STMT); /* Replace the result with dest + len. 
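Every hunk above replaces the same hand-written vuse/vdef copying with gimple_move_vops. The detail that is easy to forget, and that the helper centralizes, is repointing the vdef's defining statement; a toy standalone model:

    #include <cassert>
    #include <cstddef>

    // Toy model of gimple_move_vops above: moving the virtual use/def
    // operands from OLD_STMT to NEW_STMT must also repoint the vdef's
    // "defining statement" link, otherwise the SSA web still points at
    // the statement that is about to be removed.
    struct toy_ssa_name { struct toy_stmt *def_stmt; };
    struct toy_stmt { toy_ssa_name *vuse; toy_ssa_name *vdef; };

    static void
    toy_move_vops (toy_stmt *new_stmt, toy_stmt *old_stmt)
    {
      new_stmt->vuse = old_stmt->vuse;
      new_stmt->vdef = old_stmt->vdef;
      if (new_stmt->vdef)
        new_stmt->vdef->def_stmt = new_stmt;   // SSA_NAME_DEF_STMT update
    }

    int
    main ()
    {
      toy_ssa_name vuse = { nullptr }, vdef = { nullptr };
      toy_stmt old_stmt = { &vuse, &vdef }, new_stmt = { nullptr, nullptr };
      vdef.def_stmt = &old_stmt;
      toy_move_vops (&new_stmt, &old_stmt);
      assert (vdef.def_stmt == &new_stmt && new_stmt.vuse == &vuse);
      return 0;
    }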
*/ stmts = NULL; @@ -4135,9 +4110,7 @@ fold_builtin_atomic_compare_exchange (gimple_stmt_iterator *gsi) gimple_call_arg (stmt, 5)); tree lhs = make_ssa_name (ctype); gimple_call_set_lhs (g, lhs); - gimple_set_vdef (g, gimple_vdef (stmt)); - gimple_set_vuse (g, gimple_vuse (stmt)); - SSA_NAME_DEF_STMT (gimple_vdef (g)) = g; + gimple_move_vops (g, stmt); tree oldlhs = gimple_call_lhs (stmt); if (stmt_can_throw_internal (cfun, stmt)) { @@ -4316,8 +4289,7 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace) SSA_NAME_DEF_STMT (lhs) = gimple_build_nop (); set_ssa_default_def (cfun, var, lhs); } - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); + gimple_move_vops (new_stmt, stmt); gsi_replace (gsi, new_stmt, false); return true; } diff --git a/gcc/gimple-match-head.c b/gcc/gimple-match-head.c index bbbc0f2c2..f83f22561 100644 --- a/gcc/gimple-match-head.c +++ b/gcc/gimple-match-head.c @@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. If not see #include "gimple.h" #include "ssa.h" #include "cgraph.h" +#include "vec-perm-indices.h" #include "fold-const.h" #include "fold-const-call.h" #include "stor-layout.h" diff --git a/gcc/gimple.c b/gcc/gimple.c index bf362dbe5..763c8e7e1 100644 --- a/gcc/gimple.c +++ b/gcc/gimple.c @@ -1564,7 +1564,7 @@ gimple_call_nonnull_result_p (gcall *call) if (!fndecl) return false; if (flag_delete_null_pointer_checks && !flag_check_new - && DECL_IS_OPERATOR_NEW (fndecl) + && DECL_IS_OPERATOR_NEW_P (fndecl) && !TREE_NOTHROW (fndecl)) return true; @@ -2034,6 +2034,18 @@ gimple_copy (gimple *stmt) return copy; } +/* Move OLD_STMT's vuse and vdef operands to NEW_STMT, on the assumption + that OLD_STMT is about to be removed. */ + +void +gimple_move_vops (gimple *new_stmt, gimple *old_stmt) +{ + tree vdef = gimple_vdef (old_stmt); + gimple_set_vuse (new_stmt, gimple_vuse (old_stmt)); + gimple_set_vdef (new_stmt, vdef); + if (vdef && TREE_CODE (vdef) == SSA_NAME) + SSA_NAME_DEF_STMT (vdef) = new_stmt; +} /* Return true if statement S has side-effects. We consider a statement to have side effects if: diff --git a/gcc/gimple.h b/gcc/gimple.h index 8b5c9e219..f91c6db4d 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -1509,6 +1509,7 @@ void gimple_assign_set_rhs_with_ops (gimple_stmt_iterator *, enum tree_code, tree gimple_get_lhs (const gimple *); void gimple_set_lhs (gimple *, tree); gimple *gimple_copy (gimple *); +void gimple_move_vops (gimple *, gimple *); bool gimple_has_side_effects (const gimple *); bool gimple_could_trap_p_1 (gimple *, bool, bool); bool gimple_could_trap_p (gimple *); diff --git a/gcc/gimplify.c b/gcc/gimplify.c index bd8bd6d7e..b23680f96 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -1699,11 +1699,12 @@ gimplify_decl_expr (tree *stmt_p, gimple_seq *seq_p) tree init = DECL_INITIAL (decl); bool is_vla = false; - if (TREE_CODE (DECL_SIZE_UNIT (decl)) != INTEGER_CST + poly_uint64 size; + if (!poly_int_tree_p (DECL_SIZE_UNIT (decl), &size) || (!TREE_STATIC (decl) && flag_stack_check == GENERIC_STACK_CHECK - && compare_tree_int (DECL_SIZE_UNIT (decl), - STACK_CHECK_MAX_VAR_SIZE) > 0)) + && maybe_gt (size, + (unsigned HOST_WIDE_INT) STACK_CHECK_MAX_VAR_SIZE))) { gimplify_vla_decl (decl, seq_p); is_vla = true; diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c index 5025aae42..33a77542a 100644 --- a/gcc/haifa-sched.c +++ b/gcc/haifa-sched.c @@ -529,9 +529,6 @@ haifa_classify_rtx (const_rtx x) /* Test if it is a 'store'. 
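The gimplify.c hunk above switches the stack-check test to poly_int predicates. A simplified model with one runtime indeterminate x >= 0 (roughly the SVE situation) shows how maybe_gt and known_gt differ from a plain comparison; this is only a sketch of the single-indeterminate case, not GCC's general poly_int machinery:

    #include <cassert>
    #include <cstdint>

    // Simplified model of a degree-1 poly value "a + b*x" with one runtime
    // indeterminate x >= 0.  maybe_gt is "greater for some x",
    // known_gt is "greater for every x".
    struct toy_poly { int64_t a, b; };

    static bool maybe_gt (toy_poly p, int64_t c)
    { return p.a > c || p.b > 0; }

    static bool known_gt (toy_poly p, int64_t c)
    { return p.a > c && p.b >= 0; }

    int
    main ()
    {
      toy_poly fixed = { 64, 0 };      // ordinary 64-byte object
      toy_poly scalable = { 16, 16 };  // e.g. 16 + 16x bytes
      assert (!maybe_gt (fixed, 100));     // never exceeds the limit
      assert (maybe_gt (scalable, 100));   // may exceed it for large x
      assert (!known_gt (scalable, 100));  // but is not guaranteed to
      return 0;
    }

In the patched code, maybe_gt (size, STACK_CHECK_MAX_VAR_SIZE) therefore forces the VLA path for any size that could exceed the limit at run time.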
*/ tmp_class = may_trap_exp (XEXP (x, 0), 1); break; - case CLOBBER_HIGH: - gcc_assert (REG_P (XEXP (x, 0))); - break; case SET: /* Test if it is a store. */ tmp_class = may_trap_exp (SET_DEST (x), 1); @@ -7207,7 +7204,7 @@ alloc_global_sched_pressure_data (void) fixed_regs_num[cl] = 0; for (int i = 0; i < ira_class_hard_regs_num[cl]; ++i) - if (!call_used_regs[ira_class_hard_regs[cl][i]]) + if (!call_used_or_fixed_reg_p (ira_class_hard_regs[cl][i])) ++call_saved_regs_num[cl]; else if (fixed_regs[ira_class_hard_regs[cl][i]]) ++fixed_regs_num[cl]; diff --git a/gcc/hard-reg-set.h b/gcc/hard-reg-set.h index a72819662..51c9e72bb 100644 --- a/gcc/hard-reg-set.h +++ b/gcc/hard-reg-set.h @@ -20,6 +20,8 @@ along with GCC; see the file COPYING3. If not see #ifndef GCC_HARD_REG_SET_H #define GCC_HARD_REG_SET_H +#include "array-traits.h" + /* Define the type of a set of hard registers. */ /* HARD_REG_ELT_TYPE is a typedef of the unsigned integral type which @@ -42,14 +44,88 @@ typedef unsigned HOST_WIDEST_FAST_INT HARD_REG_ELT_TYPE; #if FIRST_PSEUDO_REGISTER <= HOST_BITS_PER_WIDEST_FAST_INT -#define HARD_REG_SET HARD_REG_ELT_TYPE +typedef HARD_REG_ELT_TYPE HARD_REG_SET; +typedef const HARD_REG_SET const_hard_reg_set; #else #define HARD_REG_SET_LONGS \ ((FIRST_PSEUDO_REGISTER + HOST_BITS_PER_WIDEST_FAST_INT - 1) \ / HOST_BITS_PER_WIDEST_FAST_INT) -typedef HARD_REG_ELT_TYPE HARD_REG_SET[HARD_REG_SET_LONGS]; + +struct HARD_REG_SET +{ + HARD_REG_SET + operator~ () const + { + HARD_REG_SET res; + for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) + res.elts[i] = ~elts[i]; + return res; + } + + HARD_REG_SET + operator& (const HARD_REG_SET &other) const + { + HARD_REG_SET res; + for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) + res.elts[i] = elts[i] & other.elts[i]; + return res; + } + + HARD_REG_SET & + operator&= (const HARD_REG_SET &other) + { + for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) + elts[i] &= other.elts[i]; + return *this; + } + + HARD_REG_SET + operator| (const HARD_REG_SET &other) const + { + HARD_REG_SET res; + for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) + res.elts[i] = elts[i] | other.elts[i]; + return res; + } + + HARD_REG_SET & + operator|= (const HARD_REG_SET &other) + { + for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) + elts[i] |= other.elts[i]; + return *this; + } + + bool + operator== (const HARD_REG_SET &other) const + { + HARD_REG_ELT_TYPE bad = 0; + for (unsigned int i = 0; i < ARRAY_SIZE (elts); ++i) + bad |= (elts[i] ^ other.elts[i]); + return bad == 0; + } + + bool + operator!= (const HARD_REG_SET &other) const + { + return !operator== (other); + } + + HARD_REG_ELT_TYPE elts[HARD_REG_SET_LONGS]; +}; +typedef const HARD_REG_SET &const_hard_reg_set; + +template<> +struct array_traits +{ + typedef HARD_REG_ELT_TYPE element_type; + static const bool has_constant_size = true; + static const size_t constant_size = HARD_REG_SET_LONGS; + static const element_type *base (const HARD_REG_SET &x) { return x.elts; } + static size_t size (const HARD_REG_SET &) { return HARD_REG_SET_LONGS; } +}; #endif @@ -77,28 +153,15 @@ struct hard_reg_set_container CLEAR_HARD_REG_SET and SET_HARD_REG_SET. These take just one argument. - Also define macros for copying hard reg sets: - COPY_HARD_REG_SET and COMPL_HARD_REG_SET. - These take two arguments TO and FROM; they read from FROM - and store into TO. COMPL_HARD_REG_SET complements each bit. - - Also define macros for combining hard reg sets: - IOR_HARD_REG_SET and AND_HARD_REG_SET. 
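With the multi-word HARD_REG_SET turned into a class above, the removed COPY_/COMPL_/IOR_/AND_*_HARD_REG_SET macros become ordinary assignments and compound operators. A two-word toy version of the same pattern, with the macro-to-operator mapping in the comments (call sites are illustrative, not taken from the patch):

    #include <cstdint>

    // Minimal two-word analogue of the operator-based HARD_REG_SET above;
    // COPY_HARD_REG_SET (a, b) becomes a = b, COMPL_HARD_REG_SET (a, b)
    // becomes a = ~b, and the IOR/AND forms become a |= b, a &= b,
    // a |= ~b and a &= ~b.
    struct toy_reg_set
    {
      uint64_t elts[2];

      toy_reg_set operator~ () const
      { return { { ~elts[0], ~elts[1] } }; }
      toy_reg_set &operator|= (const toy_reg_set &o)
      { elts[0] |= o.elts[0]; elts[1] |= o.elts[1]; return *this; }
      toy_reg_set &operator&= (const toy_reg_set &o)
      { elts[0] &= o.elts[0]; elts[1] &= o.elts[1]; return *this; }
    };

    int
    main ()
    {
      toy_reg_set a = { { 0x0f, 0 } }, b = { { 0xf0, 1 } };
      a |= b;        // was IOR_HARD_REG_SET (a, b)
      a &= ~b;       // was AND_COMPL_HARD_REG_SET (a, b)
      return a.elts[0] == 0x0f ? 0 : 1;
    }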
- These take two arguments TO and FROM; they read from FROM - and combine bitwise into TO. Define also two variants - IOR_COMPL_HARD_REG_SET and AND_COMPL_HARD_REG_SET - which use the complement of the set FROM. - Also define: hard_reg_set_subset_p (X, Y), which returns true if X is a subset of Y. - hard_reg_set_equal_p (X, Y), which returns true if X and Y are equal. hard_reg_set_intersect_p (X, Y), which returns true if X and Y intersect. hard_reg_set_empty_p (X), which returns true if X is empty. */ #define UHOST_BITS_PER_WIDE_INT ((unsigned) HOST_BITS_PER_WIDEST_FAST_INT) -#ifdef HARD_REG_SET +#if FIRST_PSEUDO_REGISTER <= HOST_BITS_PER_WIDEST_FAST_INT #define SET_HARD_REG_BIT(SET, BIT) \ ((SET) |= HARD_CONST (1) << (BIT)) @@ -110,404 +173,87 @@ struct hard_reg_set_container #define CLEAR_HARD_REG_SET(TO) ((TO) = HARD_CONST (0)) #define SET_HARD_REG_SET(TO) ((TO) = ~ HARD_CONST (0)) -#define COPY_HARD_REG_SET(TO, FROM) ((TO) = (FROM)) -#define COMPL_HARD_REG_SET(TO, FROM) ((TO) = ~(FROM)) - -#define IOR_HARD_REG_SET(TO, FROM) ((TO) |= (FROM)) -#define IOR_COMPL_HARD_REG_SET(TO, FROM) ((TO) |= ~ (FROM)) -#define AND_HARD_REG_SET(TO, FROM) ((TO) &= (FROM)) -#define AND_COMPL_HARD_REG_SET(TO, FROM) ((TO) &= ~ (FROM)) - static inline bool -hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) +hard_reg_set_subset_p (const_hard_reg_set x, const_hard_reg_set y) { return (x & ~y) == HARD_CONST (0); } static inline bool -hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) -{ - return x == y; -} - -static inline bool -hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) +hard_reg_set_intersect_p (const_hard_reg_set x, const_hard_reg_set y) { return (x & y) != HARD_CONST (0); } static inline bool -hard_reg_set_empty_p (const HARD_REG_SET x) +hard_reg_set_empty_p (const_hard_reg_set x) { return x == HARD_CONST (0); } #else -#define SET_HARD_REG_BIT(SET, BIT) \ - ((SET)[(BIT) / UHOST_BITS_PER_WIDE_INT] \ - |= HARD_CONST (1) << ((BIT) % UHOST_BITS_PER_WIDE_INT)) - -#define CLEAR_HARD_REG_BIT(SET, BIT) \ - ((SET)[(BIT) / UHOST_BITS_PER_WIDE_INT] \ - &= ~(HARD_CONST (1) << ((BIT) % UHOST_BITS_PER_WIDE_INT))) - -#define TEST_HARD_REG_BIT(SET, BIT) \ - (!!((SET)[(BIT) / UHOST_BITS_PER_WIDE_INT] \ - & (HARD_CONST (1) << ((BIT) % UHOST_BITS_PER_WIDE_INT)))) - -#if FIRST_PSEUDO_REGISTER <= 2*HOST_BITS_PER_WIDEST_FAST_INT -#define CLEAR_HARD_REG_SET(TO) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - scan_tp_[0] = 0; \ - scan_tp_[1] = 0; } while (0) - -#define SET_HARD_REG_SET(TO) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - scan_tp_[0] = -1; \ - scan_tp_[1] = -1; } while (0) - -#define COPY_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] = scan_fp_[0]; \ - scan_tp_[1] = scan_fp_[1]; } while (0) - -#define COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] = ~ scan_fp_[0]; \ - scan_tp_[1] = ~ scan_fp_[1]; } while (0) - -#define AND_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] &= scan_fp_[0]; \ - scan_tp_[1] &= scan_fp_[1]; } while (0) - -#define AND_COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] &= ~ scan_fp_[0]; \ - scan_tp_[1] &= ~ scan_fp_[1]; } while (0) - -#define IOR_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE 
*scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] |= scan_fp_[0]; \ - scan_tp_[1] |= scan_fp_[1]; } while (0) - -#define IOR_COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] |= ~ scan_fp_[0]; \ - scan_tp_[1] |= ~ scan_fp_[1]; } while (0) - -static inline bool -hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) -{ - return (x[0] & ~y[0]) == 0 && (x[1] & ~y[1]) == 0; -} - -static inline bool -hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) +inline void +SET_HARD_REG_BIT (HARD_REG_SET &set, unsigned int bit) { - return x[0] == y[0] && x[1] == y[1]; + set.elts[bit / UHOST_BITS_PER_WIDE_INT] + |= HARD_CONST (1) << (bit % UHOST_BITS_PER_WIDE_INT); } -static inline bool -hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) +inline void +CLEAR_HARD_REG_BIT (HARD_REG_SET &set, unsigned int bit) { - return (x[0] & y[0]) != 0 || (x[1] & y[1]) != 0; + set.elts[bit / UHOST_BITS_PER_WIDE_INT] + &= ~(HARD_CONST (1) << (bit % UHOST_BITS_PER_WIDE_INT)); } -static inline bool -hard_reg_set_empty_p (const HARD_REG_SET x) +inline bool +TEST_HARD_REG_BIT (const_hard_reg_set set, unsigned int bit) { - return x[0] == 0 && x[1] == 0; + return (set.elts[bit / UHOST_BITS_PER_WIDE_INT] + & (HARD_CONST (1) << (bit % UHOST_BITS_PER_WIDE_INT))); } -#else -#if FIRST_PSEUDO_REGISTER <= 3*HOST_BITS_PER_WIDEST_FAST_INT -#define CLEAR_HARD_REG_SET(TO) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - scan_tp_[0] = 0; \ - scan_tp_[1] = 0; \ - scan_tp_[2] = 0; } while (0) - -#define SET_HARD_REG_SET(TO) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - scan_tp_[0] = -1; \ - scan_tp_[1] = -1; \ - scan_tp_[2] = -1; } while (0) - -#define COPY_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] = scan_fp_[0]; \ - scan_tp_[1] = scan_fp_[1]; \ - scan_tp_[2] = scan_fp_[2]; } while (0) - -#define COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] = ~ scan_fp_[0]; \ - scan_tp_[1] = ~ scan_fp_[1]; \ - scan_tp_[2] = ~ scan_fp_[2]; } while (0) - -#define AND_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] &= scan_fp_[0]; \ - scan_tp_[1] &= scan_fp_[1]; \ - scan_tp_[2] &= scan_fp_[2]; } while (0) - -#define AND_COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] &= ~ scan_fp_[0]; \ - scan_tp_[1] &= ~ scan_fp_[1]; \ - scan_tp_[2] &= ~ scan_fp_[2]; } while (0) - -#define IOR_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] |= scan_fp_[0]; \ - scan_tp_[1] |= scan_fp_[1]; \ - scan_tp_[2] |= scan_fp_[2]; } while (0) - -#define IOR_COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] |= ~ scan_fp_[0]; \ - scan_tp_[1] |= ~ scan_fp_[1]; \ - scan_tp_[2] |= ~ scan_fp_[2]; } while (0) - -static inline bool -hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) +inline void +CLEAR_HARD_REG_SET (HARD_REG_SET &set) { - return ((x[0] & ~y[0]) == 0 - && (x[1] & ~y[1]) == 0 - && (x[2] & ~y[2]) == 0); + for (unsigned int i = 0; i < ARRAY_SIZE (set.elts); ++i) + set.elts[i] = 0; } -static 
inline bool -hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) +inline void +SET_HARD_REG_SET (HARD_REG_SET &set) { - return x[0] == y[0] && x[1] == y[1] && x[2] == y[2]; + for (unsigned int i = 0; i < ARRAY_SIZE (set.elts); ++i) + set.elts[i] = -1; } static inline bool -hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) -{ - return ((x[0] & y[0]) != 0 - || (x[1] & y[1]) != 0 - || (x[2] & y[2]) != 0); -} - -static inline bool -hard_reg_set_empty_p (const HARD_REG_SET x) -{ - return x[0] == 0 && x[1] == 0 && x[2] == 0; -} - -#else -#if FIRST_PSEUDO_REGISTER <= 4*HOST_BITS_PER_WIDEST_FAST_INT -#define CLEAR_HARD_REG_SET(TO) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - scan_tp_[0] = 0; \ - scan_tp_[1] = 0; \ - scan_tp_[2] = 0; \ - scan_tp_[3] = 0; } while (0) - -#define SET_HARD_REG_SET(TO) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - scan_tp_[0] = -1; \ - scan_tp_[1] = -1; \ - scan_tp_[2] = -1; \ - scan_tp_[3] = -1; } while (0) - -#define COPY_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] = scan_fp_[0]; \ - scan_tp_[1] = scan_fp_[1]; \ - scan_tp_[2] = scan_fp_[2]; \ - scan_tp_[3] = scan_fp_[3]; } while (0) - -#define COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] = ~ scan_fp_[0]; \ - scan_tp_[1] = ~ scan_fp_[1]; \ - scan_tp_[2] = ~ scan_fp_[2]; \ - scan_tp_[3] = ~ scan_fp_[3]; } while (0) - -#define AND_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] &= scan_fp_[0]; \ - scan_tp_[1] &= scan_fp_[1]; \ - scan_tp_[2] &= scan_fp_[2]; \ - scan_tp_[3] &= scan_fp_[3]; } while (0) - -#define AND_COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] &= ~ scan_fp_[0]; \ - scan_tp_[1] &= ~ scan_fp_[1]; \ - scan_tp_[2] &= ~ scan_fp_[2]; \ - scan_tp_[3] &= ~ scan_fp_[3]; } while (0) - -#define IOR_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] |= scan_fp_[0]; \ - scan_tp_[1] |= scan_fp_[1]; \ - scan_tp_[2] |= scan_fp_[2]; \ - scan_tp_[3] |= scan_fp_[3]; } while (0) - -#define IOR_COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - scan_tp_[0] |= ~ scan_fp_[0]; \ - scan_tp_[1] |= ~ scan_fp_[1]; \ - scan_tp_[2] |= ~ scan_fp_[2]; \ - scan_tp_[3] |= ~ scan_fp_[3]; } while (0) - -static inline bool -hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) +hard_reg_set_subset_p (const_hard_reg_set x, const_hard_reg_set y) { - return ((x[0] & ~y[0]) == 0 - && (x[1] & ~y[1]) == 0 - && (x[2] & ~y[2]) == 0 - && (x[3] & ~y[3]) == 0); + HARD_REG_ELT_TYPE bad = 0; + for (unsigned int i = 0; i < ARRAY_SIZE (x.elts); ++i) + bad |= (x.elts[i] & ~y.elts[i]); + return bad == 0; } static inline bool -hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) +hard_reg_set_intersect_p (const_hard_reg_set x, const_hard_reg_set y) { - return x[0] == y[0] && x[1] == y[1] && x[2] == y[2] && x[3] == y[3]; + HARD_REG_ELT_TYPE good = 0; + for (unsigned int i = 0; i < ARRAY_SIZE (x.elts); ++i) + good |= (x.elts[i] & y.elts[i]); + return good != 0; } static inline bool -hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) +hard_reg_set_empty_p (const_hard_reg_set x) { - 
return ((x[0] & y[0]) != 0 - || (x[1] & y[1]) != 0 - || (x[2] & y[2]) != 0 - || (x[3] & y[3]) != 0); + HARD_REG_ELT_TYPE bad = 0; + for (unsigned int i = 0; i < ARRAY_SIZE (x.elts); ++i) + bad |= x.elts[i]; + return bad == 0; } - -static inline bool -hard_reg_set_empty_p (const HARD_REG_SET x) -{ - return x[0] == 0 && x[1] == 0 && x[2] == 0 && x[3] == 0; -} - -#else /* FIRST_PSEUDO_REGISTER > 4*HOST_BITS_PER_WIDEST_FAST_INT */ - -#define CLEAR_HARD_REG_SET(TO) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - int i; \ - for (i = 0; i < HARD_REG_SET_LONGS; i++) \ - *scan_tp_++ = 0; } while (0) - -#define SET_HARD_REG_SET(TO) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - int i; \ - for (i = 0; i < HARD_REG_SET_LONGS; i++) \ - *scan_tp_++ = -1; } while (0) - -#define COPY_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - int i; \ - for (i = 0; i < HARD_REG_SET_LONGS; i++) \ - *scan_tp_++ = *scan_fp_++; } while (0) - -#define COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - int i; \ - for (i = 0; i < HARD_REG_SET_LONGS; i++) \ - *scan_tp_++ = ~ *scan_fp_++; } while (0) - -#define AND_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - int i; \ - for (i = 0; i < HARD_REG_SET_LONGS; i++) \ - *scan_tp_++ &= *scan_fp_++; } while (0) - -#define AND_COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - int i; \ - for (i = 0; i < HARD_REG_SET_LONGS; i++) \ - *scan_tp_++ &= ~ *scan_fp_++; } while (0) - -#define IOR_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - int i; \ - for (i = 0; i < HARD_REG_SET_LONGS; i++) \ - *scan_tp_++ |= *scan_fp_++; } while (0) - -#define IOR_COMPL_HARD_REG_SET(TO, FROM) \ -do { HARD_REG_ELT_TYPE *scan_tp_ = (TO); \ - const HARD_REG_ELT_TYPE *scan_fp_ = (FROM); \ - int i; \ - for (i = 0; i < HARD_REG_SET_LONGS; i++) \ - *scan_tp_++ |= ~ *scan_fp_++; } while (0) - -static inline bool -hard_reg_set_subset_p (const HARD_REG_SET x, const HARD_REG_SET y) -{ - int i; - - for (i = 0; i < HARD_REG_SET_LONGS; i++) - if ((x[i] & ~y[i]) != 0) - return false; - return true; -} - -static inline bool -hard_reg_set_equal_p (const HARD_REG_SET x, const HARD_REG_SET y) -{ - int i; - - for (i = 0; i < HARD_REG_SET_LONGS; i++) - if (x[i] != y[i]) - return false; - return true; -} - -static inline bool -hard_reg_set_intersect_p (const HARD_REG_SET x, const HARD_REG_SET y) -{ - int i; - - for (i = 0; i < HARD_REG_SET_LONGS; i++) - if ((x[i] & y[i]) != 0) - return true; - return false; -} - -static inline bool -hard_reg_set_empty_p (const HARD_REG_SET x) -{ - int i; - - for (i = 0; i < HARD_REG_SET_LONGS; i++) - if (x[i] != 0) - return false; - return true; -} - -#endif -#endif -#endif #endif /* Iterator for hard register sets. */ @@ -515,7 +261,7 @@ hard_reg_set_empty_p (const HARD_REG_SET x) struct hard_reg_set_iterator { /* Pointer to the current element. */ - HARD_REG_ELT_TYPE *pelt; + const HARD_REG_ELT_TYPE *pelt; /* The length of the set. */ unsigned short length; @@ -534,11 +280,11 @@ struct hard_reg_set_iterator /* The implementation of the iterator functions is fully analogous to the bitmap iterators. 
*/ static inline void -hard_reg_set_iter_init (hard_reg_set_iterator *iter, HARD_REG_SET set, +hard_reg_set_iter_init (hard_reg_set_iterator *iter, const_hard_reg_set set, unsigned min, unsigned *regno) { #ifdef HARD_REG_SET_LONGS - iter->pelt = set; + iter->pelt = set.elts; iter->length = HARD_REG_SET_LONGS; #else iter->pelt = &set; @@ -649,16 +395,15 @@ struct target_hard_regs { a pseudo reg whose life crosses calls. */ char x_call_used_regs[FIRST_PSEUDO_REGISTER]; - char x_call_really_used_regs[FIRST_PSEUDO_REGISTER]; - - /* The same info as a HARD_REG_SET. */ - HARD_REG_SET x_call_used_reg_set; + /* For targets that use reload rather than LRA, this is the set + of registers that we are able to save and restore around calls + (i.e. those for which we know a suitable mode and set of + load/store instructions exist). For LRA targets it contains + all registers. - /* Contains registers that are fixed use -- i.e. in fixed_reg_set -- or - a function value return register or TARGET_STRUCT_VALUE_RTX or - STATIC_CHAIN_REGNUM. These are the registers that cannot hold quantities - across calls even if we are willing to save and restore them. */ - HARD_REG_SET x_call_fixed_reg_set; + This is legacy information and should be removed if all targets + switch to LRA. */ + HARD_REG_SET x_savable_regs; /* Contains registers that are fixed use -- i.e. in fixed_reg_set -- but only if they are not merely part of that set because they are global @@ -674,10 +419,6 @@ struct target_hard_regs { with the local stack frame are safe, but scant others. */ HARD_REG_SET x_regs_invalidated_by_call; - /* Call used hard registers which cannot be saved because there is no - insn for this. */ - HARD_REG_SET x_no_caller_save_reg_set; - /* Table of register numbers in the order in which to try to use them. */ int x_reg_alloc_order[FIRST_PSEUDO_REGISTER]; @@ -730,18 +471,16 @@ extern struct target_hard_regs *this_target_hard_regs; (this_target_hard_regs->x_fixed_reg_set) #define fixed_nonglobal_reg_set \ (this_target_hard_regs->x_fixed_nonglobal_reg_set) +#ifdef IN_TARGET_CODE #define call_used_regs \ (this_target_hard_regs->x_call_used_regs) -#define call_really_used_regs \ - (this_target_hard_regs->x_call_really_used_regs) -#define call_used_reg_set \ - (this_target_hard_regs->x_call_used_reg_set) -#define call_fixed_reg_set \ - (this_target_hard_regs->x_call_fixed_reg_set) +#endif +#define savable_regs \ + (this_target_hard_regs->x_savable_regs) #define regs_invalidated_by_call \ (this_target_hard_regs->x_regs_invalidated_by_call) -#define no_caller_save_reg_set \ - (this_target_hard_regs->x_no_caller_save_reg_set) +#define call_used_or_fixed_regs \ + (regs_invalidated_by_call | fixed_reg_set) #define reg_alloc_order \ (this_target_hard_regs->x_reg_alloc_order) #define inv_reg_alloc_order \ @@ -770,4 +509,13 @@ extern const char * reg_class_names[]; #define REG_CAN_CHANGE_MODE_P(REGN, FROM, TO) \ (targetm.can_change_mode_class (FROM, TO, REGNO_REG_CLASS (REGN))) +/* Return true if register REGNO is either fixed or call-used + (aka call-clobbered). */ + +inline bool +call_used_or_fixed_reg_p (unsigned int regno) +{ + return fixed_regs[regno] || this_target_hard_regs->x_call_used_regs[regno]; +} + #endif /* ! GCC_HARD_REG_SET_H */ diff --git a/gcc/hooks.c b/gcc/hooks.c index f95659b38..98038860e 100644 --- a/gcc/hooks.c +++ b/gcc/hooks.c @@ -140,9 +140,8 @@ hook_bool_puint64_puint64_true (poly_uint64, poly_uint64) return true; } -/* Generic hook that takes (unsigned int, machine_mode) and returns false. 
*/ bool -hook_bool_insn_uint_mode_false (rtx_insn *, unsigned int, machine_mode) +hook_bool_uint_uint_mode_false (unsigned int, unsigned int, machine_mode) { return false; } @@ -313,6 +312,12 @@ hook_bool_const_tree_false (const_tree) return false; } +bool +hook_bool_const_tree_const_tree_true (const_tree, const_tree) +{ + return true; +} + bool hook_bool_tree_true (tree) { diff --git a/gcc/hooks.h b/gcc/hooks.h index 0bc8117c2..b398d13ce 100644 --- a/gcc/hooks.h +++ b/gcc/hooks.h @@ -40,11 +40,12 @@ extern bool hook_bool_const_rtx_insn_const_rtx_insn_true (const rtx_insn *, extern bool hook_bool_mode_uhwi_false (machine_mode, unsigned HOST_WIDE_INT); extern bool hook_bool_puint64_puint64_true (poly_uint64, poly_uint64); -extern bool hook_bool_insn_uint_mode_false (rtx_insn *, unsigned int, +extern bool hook_bool_uint_uint_mode_false (unsigned int, unsigned int, machine_mode); extern bool hook_bool_uint_mode_true (unsigned int, machine_mode); extern bool hook_bool_tree_false (tree); extern bool hook_bool_const_tree_false (const_tree); +extern bool hook_bool_const_tree_const_tree_true (const_tree, const_tree); extern bool hook_bool_tree_true (tree); extern bool hook_bool_const_tree_true (const_tree); extern bool hook_bool_gsiptr_false (gimple_stmt_iterator *); diff --git a/gcc/hw-doloop.c b/gcc/hw-doloop.c index 2decece62..3ee0b4098 100644 --- a/gcc/hw-doloop.c +++ b/gcc/hw-doloop.c @@ -141,7 +141,7 @@ scan_loop (hwloop_info loop) CLEAR_HARD_REG_BIT (set_this_insn, REGNO (loop->iter_reg)); else if (reg_mentioned_p (loop->iter_reg, PATTERN (insn))) loop->iter_reg_used = true; - IOR_HARD_REG_SET (loop->regs_set_in_loop, set_this_insn); + loop->regs_set_in_loop |= set_this_insn; } } } @@ -581,7 +581,7 @@ optimize_loop (hwloop_info loop, struct hw_doloop_hooks *hooks) inner_depth = inner->depth; /* The set of registers may be changed while optimizing the inner loop. */ - IOR_HARD_REG_SET (loop->regs_set_in_loop, inner->regs_set_in_loop); + loop->regs_set_in_loop |= inner->regs_set_in_loop; } loop->depth = inner_depth + 1; diff --git a/gcc/int-vector-builder.h b/gcc/int-vector-builder.h index adf0904c5..dc9651021 100644 --- a/gcc/int-vector-builder.h +++ b/gcc/int-vector-builder.h @@ -26,10 +26,11 @@ along with GCC; see the file COPYING3. If not see encoding as tree and rtx constants. See vector_builder for more details. */ template -class int_vector_builder : public vector_builder > +class int_vector_builder : public vector_builder > { - typedef vector_builder parent; - friend class vector_builder; + typedef vector_builder parent; + friend class vector_builder; public: int_vector_builder () {} @@ -45,6 +46,8 @@ private: T apply_step (T, unsigned int, T) const; bool can_elide_p (T) const { return true; } void note_representative (T *, T) {} + + static poly_uint64 shape_nelts (poly_uint64 x) { return x; } }; /* Create a new builder for a vector with FULL_NELTS elements. 
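(For illustration only; not part of any backported patch.)  The hard-reg-set.h rework above turns HARD_REG_SET into a type with ordinary C++ operators and deletes the old COPY/COMPL/IOR/AND macro family together with hard_reg_set_equal_p.  Assuming the declarations from hard-reg-set.h above, the old idioms map onto the new operator forms as sketched below; this is the same mechanical rewrite that the hw-doloop.c hunk above and the ira-*.c hunks later in this patch perform.

static void
hard_reg_set_operator_sketch (void)
{
  HARD_REG_SET to, from;
  CLEAR_HARD_REG_SET (to);      /* helper kept by the patch */
  CLEAR_HARD_REG_SET (from);
  SET_HARD_REG_BIT (from, 0);   /* helper kept by the patch */

  to = from;      /* was COPY_HARD_REG_SET (to, from) */
  to = ~from;     /* was COMPL_HARD_REG_SET (to, from) */
  to |= from;     /* was IOR_HARD_REG_SET (to, from) */
  to |= ~from;    /* was IOR_COMPL_HARD_REG_SET (to, from) */
  to &= from;     /* was AND_HARD_REG_SET (to, from) */
  to &= ~from;    /* was AND_COMPL_HARD_REG_SET (to, from) */
  if (to == from) /* was hard_reg_set_equal_p (to, from) */
    CLEAR_HARD_REG_SET (to);
}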
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 21ecd5667..9753a12f3 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -117,6 +117,7 @@ init_internal_fns () #define while_direct { 0, 2, false } #define fold_extract_direct { 2, 2, false } #define fold_left_direct { 1, 1, false } +#define mask_fold_left_direct { 1, 1, false } const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = { #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct, @@ -3005,6 +3006,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_fold_left_optab_fn(FN, STMT, OPTAB) \ expand_direct_optab_fn (FN, STMT, OPTAB, 2) +#define expand_mask_fold_left_optab_fn(FN, STMT, OPTAB) \ + expand_direct_optab_fn (FN, STMT, OPTAB, 3) + /* RETURN_TYPE and ARGS are a return type and argument list that are in principle compatible with FN (which satisfies direct_internal_fn_p). Return the types that should be used to determine whether the @@ -3093,6 +3097,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_while_optab_supported_p convert_optab_supported_p #define direct_fold_extract_optab_supported_p direct_optab_supported_p #define direct_fold_left_optab_supported_p direct_optab_supported_p +#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p /* Return the optab used by internal function FN. */ @@ -3210,6 +3215,8 @@ first_commutative_argument (internal_fn fn) case IFN_FNMS: case IFN_AVG_FLOOR: case IFN_AVG_CEIL: + case IFN_MULHS: + case IFN_MULHRS: case IFN_FMIN: case IFN_FMAX: return 0; @@ -3286,7 +3293,9 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = { T (MAX_EXPR, IFN_COND_MAX) \ T (BIT_AND_EXPR, IFN_COND_AND) \ T (BIT_IOR_EXPR, IFN_COND_IOR) \ - T (BIT_XOR_EXPR, IFN_COND_XOR) + T (BIT_XOR_EXPR, IFN_COND_XOR) \ + T (LSHIFT_EXPR, IFN_COND_SHL) \ + T (RSHIFT_EXPR, IFN_COND_SHR) /* Return a function that only performs CODE when a certain condition is met and that uses a given fallback value otherwise. 
For example, if CODE is diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index e370eaa84..ae32fc7bd 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -140,6 +140,8 @@ DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW, vec_shl_insert, binary) +DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary) + DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary) DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) DEF_INTERNAL_OPTAB_FN (FNMS, ECF_CONST, fnms, ternary) @@ -149,6 +151,11 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_FLOOR, ECF_CONST | ECF_NOTHROW, first, DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_CEIL, ECF_CONST | ECF_NOTHROW, first, savg_ceil, uavg_ceil, binary) +DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first, + smulhs, umulhs, binary) +DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first, + smulhrs, umulhrs, binary) + DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary) DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary) DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary) @@ -167,6 +174,10 @@ DEF_INTERNAL_OPTAB_FN (COND_IOR, ECF_CONST | ECF_NOTHROW, cond_ior, cond_binary) DEF_INTERNAL_OPTAB_FN (COND_XOR, ECF_CONST | ECF_NOTHROW, cond_xor, cond_binary) +DEF_INTERNAL_OPTAB_FN (COND_SHL, ECF_CONST | ECF_NOTHROW, + cond_ashl, cond_binary) +DEF_INTERNAL_SIGNED_OPTAB_FN (COND_SHR, ECF_CONST | ECF_NOTHROW, first, + cond_ashr, cond_lshr, cond_binary) DEF_INTERNAL_OPTAB_FN (COND_FMA, ECF_CONST, cond_fma, cond_ternary) DEF_INTERNAL_OPTAB_FN (COND_FMS, ECF_CONST, cond_fms, cond_ternary) @@ -199,6 +210,9 @@ DEF_INTERNAL_OPTAB_FN (FOLD_EXTRACT_LAST, ECF_CONST | ECF_NOTHROW, DEF_INTERNAL_OPTAB_FN (FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW, fold_left_plus, fold_left) +DEF_INTERNAL_OPTAB_FN (MASK_FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW, + mask_fold_left_plus, mask_fold_left) + /* Unary math functions. 
*/ DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary) DEF_INTERNAL_FLT_FN (ACOSH, ECF_CONST, acosh, unary) @@ -217,6 +231,7 @@ DEF_INTERNAL_FLT_FN (LOG10, ECF_CONST, log10, unary) DEF_INTERNAL_FLT_FN (LOG1P, ECF_CONST, log1p, unary) DEF_INTERNAL_FLT_FN (LOG2, ECF_CONST, log2, unary) DEF_INTERNAL_FLT_FN (LOGB, ECF_CONST, logb, unary) +DEF_INTERNAL_FLT_FN (SIGNBIT, ECF_CONST, signbit, unary) DEF_INTERNAL_FLT_FN (SIGNIFICAND, ECF_CONST, significand, unary) DEF_INTERNAL_FLT_FN (SIN, ECF_CONST, sin, unary) DEF_INTERNAL_FLT_FN (SINH, ECF_CONST, sinh, unary) diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c index 8988a4e49..b9e2ef450 100644 --- a/gcc/ipa-cp.c +++ b/gcc/ipa-cp.c @@ -2862,8 +2862,7 @@ ipa_get_indirect_edge_target_1 (struct cgraph_edge *ie, if (can_refer) { if (!target - || (TREE_CODE (TREE_TYPE (target)) == FUNCTION_TYPE - && DECL_FUNCTION_CODE (target) == BUILT_IN_UNREACHABLE) + || fndecl_built_in_p (target, BUILT_IN_UNREACHABLE) || !possible_polymorphic_call_target_p (ie, cgraph_node::get (target))) { diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c index 2d8a0b383..df1ea21b4 100644 --- a/gcc/ipa-devirt.c +++ b/gcc/ipa-devirt.c @@ -3576,12 +3576,10 @@ possible_polymorphic_call_target_p (tree otr_type, { vec targets; unsigned int i; - enum built_in_function fcode; bool final; - if (TREE_CODE (TREE_TYPE (n->decl)) == FUNCTION_TYPE - && ((fcode = DECL_FUNCTION_CODE (n->decl)) == BUILT_IN_UNREACHABLE - || fcode == BUILT_IN_TRAP)) + if (fndecl_built_in_p (n->decl, BUILT_IN_UNREACHABLE) + || fndecl_built_in_p (n->decl, BUILT_IN_TRAP)) return true; if (is_cxa_pure_virtual_p (n->decl)) diff --git a/gcc/ipa-icf.c b/gcc/ipa-icf.c index 568c6a452..8b6961486 100644 --- a/gcc/ipa-icf.c +++ b/gcc/ipa-icf.c @@ -351,8 +351,8 @@ sem_item::compare_referenced_symbol_properties (symtab_node *used_by, return return_false_with_msg ("inline attributes are different"); } - if (DECL_IS_OPERATOR_NEW (n1->decl) - != DECL_IS_OPERATOR_NEW (n2->decl)) + if (DECL_IS_OPERATOR_NEW_P (n1->decl) + != DECL_IS_OPERATOR_NEW_P (n2->decl)) return return_false_with_msg ("operator new flags are different"); } @@ -416,7 +416,7 @@ sem_item::hash_referenced_symbol_properties (symtab_node *ref, hstate.add_flag (DECL_DISREGARD_INLINE_LIMITS (ref->decl)); hstate.add_flag (DECL_DECLARED_INLINE_P (ref->decl)); } - hstate.add_flag (DECL_IS_OPERATOR_NEW (ref->decl)); + hstate.add_flag (DECL_IS_OPERATOR_NEW_P (ref->decl)); } else if (is_a (ref)) { diff --git a/gcc/ipa-inline.c b/gcc/ipa-inline.c index a2fb20320..7c627eff8 100644 --- a/gcc/ipa-inline.c +++ b/gcc/ipa-inline.c @@ -390,6 +390,28 @@ can_inline_edge_p (struct cgraph_edge *e, bool report, return inlinable; } +/* Return inlining_insns_single limit for function N */ + +static int +inline_insns_single (cgraph_node *n) +{ + if (opt_for_fn (n->decl, optimize >= 3)) + return PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SINGLE); + else + return PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SINGLE_O2); +} + +/* Return inlining_insns_auto limit for function N */ + +static int +inline_insns_auto (cgraph_node *n) +{ + if (opt_for_fn (n->decl, optimize >= 3)) + return PARAM_VALUE (PARAM_MAX_INLINE_INSNS_AUTO); + else + return PARAM_VALUE (PARAM_MAX_INLINE_INSNS_AUTO_O2); +} + /* Decide if we can inline the edge and possibly update inline_failed reason. 
We check whether inlining is possible at all and whether @@ -532,8 +554,8 @@ can_inline_edge_by_limits_p (struct cgraph_edge *e, bool report, int growth = estimate_edge_growth (e); if (growth > PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SIZE) && (!DECL_DECLARED_INLINE_P (callee->decl) - && growth >= MAX (MAX_INLINE_INSNS_SINGLE, - MAX_INLINE_INSNS_AUTO))) + && growth >= MAX (inline_insns_single (caller), + inline_insns_auto (caller)))) { e->inline_failed = CIF_OPTIMIZATION_MISMATCH; inlinable = false; @@ -641,6 +663,10 @@ want_early_inline_function_p (struct cgraph_edge *e) { int growth = estimate_edge_growth (e); int n; + int early_inlining_insns = opt_for_fn (e->caller->decl, optimize) >= 3 + ? PARAM_VALUE (PARAM_EARLY_INLINING_INSNS) + : PARAM_VALUE (PARAM_EARLY_INLINING_INSNS_O2); + if (growth <= PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SIZE)) ; @@ -654,26 +680,28 @@ want_early_inline_function_p (struct cgraph_edge *e) growth); want_inline = false; } - else if (growth > PARAM_VALUE (PARAM_EARLY_INLINING_INSNS)) + else if (growth > early_inlining_insns) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt, " will not early inline: %C->%C, " - "growth %i exceeds --param early-inlining-insns\n", - e->caller, callee, - growth); + "growth %i exceeds --param early-inlining-insns%s\n", + e->caller, callee, growth, + opt_for_fn (e->caller->decl, optimize) >= 3 + ? "" : "-O2"); want_inline = false; } else if ((n = num_calls (callee)) != 0 - && growth * (n + 1) > PARAM_VALUE (PARAM_EARLY_INLINING_INSNS)) + && growth * (n + 1) > early_inlining_insns) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt, " will not early inline: %C->%C, " - "growth %i exceeds --param early-inlining-insns " + "growth %i exceeds --param early-inlining-insns%s " "divided by number of calls\n", - e->caller, callee, - growth); + e->caller, callee, growth, + opt_for_fn (e->caller->decl, optimize) >= 3 + ? "" : "-O2"); want_inline = false; } } @@ -739,9 +767,14 @@ big_speedup_p (struct cgraph_edge *e) sreal spec_time = estimate_edge_time (e, &unspec_time); sreal time = compute_uninlined_call_time (e, unspec_time); sreal inlined_time = compute_inlined_call_time (e, spec_time); + cgraph_node *caller = (e->caller->inlined_to + ? e->caller->inlined_to + : e->caller); + int limit = opt_for_fn (caller->decl, optimize) >= 3 + ? 
PARAM_VALUE (PARAM_INLINE_MIN_SPEEDUP) + : PARAM_VALUE (PARAM_INLINE_MIN_SPEEDUP_O2); - if ((time - inlined_time) * 100 - > (sreal) (time * PARAM_VALUE (PARAM_INLINE_MIN_SPEEDUP))) + if ((time - inlined_time) * 100 > time * limit) return true; return false; } @@ -775,20 +808,29 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report) && (!e->count.ipa ().initialized_p () || !e->maybe_hot_p ())) && ipa_fn_summaries->get (callee)->min_size - ipa_call_summaries->get (e)->call_stmt_size - > MAX (MAX_INLINE_INSNS_SINGLE, MAX_INLINE_INSNS_AUTO)) + > MAX (inline_insns_single (e->caller), + inline_insns_auto (e->caller))) { - e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT; + if (opt_for_fn (e->caller->decl, optimize) >= 3) + e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT; + else + e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_O2_LIMIT; want_inline = false; } else if ((DECL_DECLARED_INLINE_P (callee->decl) || e->count.ipa ().nonzero_p ()) && ipa_fn_summaries->get (callee)->min_size - ipa_call_summaries->get (e)->call_stmt_size - > 16 * MAX_INLINE_INSNS_SINGLE) + > 16 * inline_insns_single (e->caller)) { - e->inline_failed = (DECL_DECLARED_INLINE_P (callee->decl) - ? CIF_MAX_INLINE_INSNS_SINGLE_LIMIT - : CIF_MAX_INLINE_INSNS_AUTO_LIMIT); + if (opt_for_fn (e->caller->decl, optimize) >= 3) + e->inline_failed = (DECL_DECLARED_INLINE_P (callee->decl) + ? CIF_MAX_INLINE_INSNS_SINGLE_LIMIT + : CIF_MAX_INLINE_INSNS_AUTO_LIMIT); + else + e->inline_failed = (DECL_DECLARED_INLINE_P (callee->decl) + ? CIF_MAX_INLINE_INSNS_SINGLE_O2_LIMIT + : CIF_MAX_INLINE_INSNS_AUTO_O2_LIMIT); want_inline = false; } else @@ -802,15 +844,18 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report) /* Apply MAX_INLINE_INSNS_SINGLE limit. Do not do so when hints suggests that inlining given function is very profitable. */ else if (DECL_DECLARED_INLINE_P (callee->decl) - && growth >= MAX_INLINE_INSNS_SINGLE - && (growth >= MAX_INLINE_INSNS_SINGLE * 16 + && growth >= inline_insns_single (e->caller) + && (growth >= inline_insns_single (e->caller) * 16 || (!(hints & (INLINE_HINT_indirect_call | INLINE_HINT_known_hot | INLINE_HINT_loop_iterations | INLINE_HINT_loop_stride)) && !(big_speedup = big_speedup_p (e))))) { - e->inline_failed = CIF_MAX_INLINE_INSNS_SINGLE_LIMIT; + if (opt_for_fn (e->caller->decl, optimize) >= 3) + e->inline_failed = CIF_MAX_INLINE_INSNS_SINGLE_LIMIT; + else + e->inline_failed = CIF_MAX_INLINE_INSNS_SINGLE_O2_LIMIT; want_inline = false; } else if (!DECL_DECLARED_INLINE_P (callee->decl) @@ -818,7 +863,7 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report) && growth >= PARAM_VALUE (PARAM_MAX_INLINE_INSNS_SMALL)) { /* growth_likely_positive is expensive, always test it last. */ - if (growth >= MAX_INLINE_INSNS_SINGLE + if (growth >= inline_insns_single (e->caller) || growth_likely_positive (callee, growth)) { e->inline_failed = CIF_NOT_DECLARED_INLINED; @@ -833,22 +878,25 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report) && growth >= ((hints & (INLINE_HINT_indirect_call | INLINE_HINT_loop_iterations | INLINE_HINT_loop_stride)) - ? MAX (MAX_INLINE_INSNS_AUTO, - MAX_INLINE_INSNS_SINGLE) - : MAX_INLINE_INSNS_AUTO) + ? MAX (inline_insns_auto (e->caller), + inline_insns_single (e->caller)) + : inline_insns_auto (e->caller)) && !(big_speedup == -1 ? big_speedup_p (e) : big_speedup)) { /* growth_likely_positive is expensive, always test it last. 
*/ - if (growth >= MAX_INLINE_INSNS_SINGLE + if (growth >= inline_insns_single (e->caller) || growth_likely_positive (callee, growth)) { - e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT; + if (opt_for_fn (e->caller->decl, optimize) >= 3) + e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_LIMIT; + else + e->inline_failed = CIF_MAX_INLINE_INSNS_AUTO_O2_LIMIT; want_inline = false; } } /* If call is cold, do not inline when function body would grow. */ else if (!e->maybe_hot_p () - && (growth >= MAX_INLINE_INSNS_SINGLE + && (growth >= inline_insns_single (e->caller) || growth_likely_positive (callee, growth))) { e->inline_failed = CIF_UNLIKELY_CALL; @@ -1157,7 +1205,7 @@ edge_badness (struct cgraph_edge *edge, bool dump) && caller_info->inlinable && ipa_size_summaries->get (caller)->size < (DECL_DECLARED_INLINE_P (caller->decl) - ? MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO)) + ? inline_insns_single (caller) : inline_insns_auto (caller))) { if (dump) fprintf (dump_file, diff --git a/gcc/ipa-param-manipulation.c b/gcc/ipa-param-manipulation.c index 037253a87..1af6d050c 100644 --- a/gcc/ipa-param-manipulation.c +++ b/gcc/ipa-param-manipulation.c @@ -219,10 +219,7 @@ ipa_modify_formal_parameters (tree fndecl, ipa_parm_adjustment_vec adjustments) /* When signature changes, we need to clear builtin info. */ if (fndecl_built_in_p (fndecl)) - { - DECL_BUILT_IN_CLASS (fndecl) = NOT_BUILT_IN; - DECL_FUNCTION_CODE (fndecl) = (enum built_in_function) 0; - } + set_decl_built_in_function (fndecl, NOT_BUILT_IN, 0); TREE_TYPE (fndecl) = new_type; DECL_VIRTUAL_P (fndecl) = 0; @@ -452,14 +449,7 @@ ipa_modify_call_arguments (struct cgraph_edge *cs, gcall *stmt, gimple_call_set_chain (new_stmt, gimple_call_chain (stmt)); gimple_call_copy_flags (new_stmt, stmt); if (gimple_in_ssa_p (cfun)) - { - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - if (gimple_vdef (stmt)) - { - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); - SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; - } - } + gimple_move_vops (new_stmt, stmt); if (dump_file && (dump_flags & TDF_DETAILS)) { diff --git a/gcc/ipa-prop.c b/gcc/ipa-prop.c index 0439ce0c5..a70319505 100644 --- a/gcc/ipa-prop.c +++ b/gcc/ipa-prop.c @@ -3685,8 +3685,7 @@ try_make_edge_direct_virtual_call (struct cgraph_edge *ie, if (can_refer) { if (!t - || (TREE_CODE (TREE_TYPE (t)) == FUNCTION_TYPE - && DECL_FUNCTION_CODE (t) == BUILT_IN_UNREACHABLE) + || fndecl_built_in_p (t, BUILT_IN_UNREACHABLE) || !possible_polymorphic_call_target_p (ie, cgraph_node::get (t))) { diff --git a/gcc/ipa-split.c b/gcc/ipa-split.c index 5eaf8257f..aef2fa53c 100644 --- a/gcc/ipa-split.c +++ b/gcc/ipa-split.c @@ -1348,10 +1348,7 @@ split_function (basic_block return_bb, struct split_point *split_point, changes. For partial inlining we however cannot expect the part of builtin implementation to have same semantic as the whole. */ if (fndecl_built_in_p (node->decl)) - { - DECL_BUILT_IN_CLASS (node->decl) = NOT_BUILT_IN; - DECL_FUNCTION_CODE (node->decl) = (enum built_in_function) 0; - } + set_decl_built_in_function (node->decl, NOT_BUILT_IN, 0); /* If return_bb contains any clobbers that refer to SSA_NAMEs set in the split part, remove them. 
Also reset debug stmts that diff --git a/gcc/ira-build.c b/gcc/ira-build.c index 83caa3a8e..55c552679 100644 --- a/gcc/ira-build.c +++ b/gcc/ira-build.c @@ -456,12 +456,10 @@ ira_create_object (ira_allocno_t a, int subword) OBJECT_CONFLICT_VEC_P (obj) = false; OBJECT_CONFLICT_ARRAY (obj) = NULL; OBJECT_NUM_CONFLICTS (obj) = 0; - COPY_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), ira_no_alloc_regs); - COPY_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), ira_no_alloc_regs); - IOR_COMPL_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), - reg_class_contents[aclass]); - IOR_COMPL_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - reg_class_contents[aclass]); + OBJECT_CONFLICT_HARD_REGS (obj) = ira_no_alloc_regs; + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) = ira_no_alloc_regs; + OBJECT_CONFLICT_HARD_REGS (obj) |= ~reg_class_contents[aclass]; + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= ~reg_class_contents[aclass]; OBJECT_MIN (obj) = INT_MAX; OBJECT_MAX (obj) = -1; OBJECT_LIVE_RANGES (obj) = NULL; @@ -549,10 +547,8 @@ ira_set_allocno_class (ira_allocno_t a, enum reg_class aclass) ALLOCNO_CLASS (a) = aclass; FOR_EACH_ALLOCNO_OBJECT (a, obj, oi) { - IOR_COMPL_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), - reg_class_contents[aclass]); - IOR_COMPL_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - reg_class_contents[aclass]); + OBJECT_CONFLICT_HARD_REGS (obj) |= ~reg_class_contents[aclass]; + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= ~reg_class_contents[aclass]; } } @@ -602,10 +598,10 @@ merge_hard_reg_conflicts (ira_allocno_t from, ira_allocno_t to, ira_object_t to_obj = ALLOCNO_OBJECT (to, i); if (!total_only) - IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (to_obj), - OBJECT_CONFLICT_HARD_REGS (from_obj)); - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (to_obj), - OBJECT_TOTAL_CONFLICT_HARD_REGS (from_obj)); + OBJECT_CONFLICT_HARD_REGS (to_obj) + |= OBJECT_CONFLICT_HARD_REGS (from_obj); + OBJECT_TOTAL_CONFLICT_HARD_REGS (to_obj) + |= OBJECT_TOTAL_CONFLICT_HARD_REGS (from_obj); } #ifdef STACK_REGS if (!total_only && ALLOCNO_NO_STACK_REG_P (from)) @@ -618,15 +614,15 @@ merge_hard_reg_conflicts (ira_allocno_t from, ira_allocno_t to, /* Update hard register conflict information for all objects associated with A to include the regs in SET. 
*/ void -ior_hard_reg_conflicts (ira_allocno_t a, HARD_REG_SET *set) +ior_hard_reg_conflicts (ira_allocno_t a, const_hard_reg_set set) { ira_allocno_object_iterator i; ira_object_t obj; FOR_EACH_ALLOCNO_OBJECT (a, obj, i) { - IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), *set); - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), *set); + OBJECT_CONFLICT_HARD_REGS (obj) |= set; + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= set; } } @@ -907,8 +903,9 @@ create_cap_allocno (ira_allocno_t a) ALLOCNO_CALLS_CROSSED_NUM (cap) = ALLOCNO_CALLS_CROSSED_NUM (a); ALLOCNO_CHEAP_CALLS_CROSSED_NUM (cap) = ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); - IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (cap), - ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)); + ALLOCNO_CROSSED_CALLS_ABIS (cap) = ALLOCNO_CROSSED_CALLS_ABIS (a); + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (cap) + = ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a); if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL) { fprintf (ira_dump_file, " Creating cap "); @@ -1876,11 +1873,6 @@ create_insn_allocnos (rtx x, rtx outer, bool output_p) create_insn_allocnos (XEXP (x, 0), NULL, true); return; } - else if (code == CLOBBER_HIGH) - { - gcc_assert (REG_P (XEXP (x, 0)) && HARD_REGISTER_P (XEXP (x, 0))); - return; - } else if (code == MEM) { create_insn_allocnos (XEXP (x, 0), NULL, false); @@ -2036,8 +2028,10 @@ propagate_allocno_info (void) += ALLOCNO_CALLS_CROSSED_NUM (a); ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); - IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a), - ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)); + ALLOCNO_CROSSED_CALLS_ABIS (parent_a) + |= ALLOCNO_CROSSED_CALLS_ABIS (a); + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a) + |= ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a); ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (parent_a) += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); aclass = ALLOCNO_CLASS (a); @@ -2419,8 +2413,9 @@ propagate_some_info_from_allocno (ira_allocno_t a, ira_allocno_t from_a) ALLOCNO_CALLS_CROSSED_NUM (a) += ALLOCNO_CALLS_CROSSED_NUM (from_a); ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a) += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (from_a); - IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a), - ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (from_a)); + ALLOCNO_CROSSED_CALLS_ABIS (a) |= ALLOCNO_CROSSED_CALLS_ABIS (from_a); + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a) + |= ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (from_a); ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a) += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (from_a); @@ -2569,8 +2564,8 @@ remove_low_level_allocnos (void) ALLOCNO_NEXT_REGNO_ALLOCNO (a) = NULL; ALLOCNO_CAP_MEMBER (a) = NULL; FOR_EACH_ALLOCNO_OBJECT (a, obj, oi) - COPY_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), - OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); + OBJECT_CONFLICT_HARD_REGS (obj) + = OBJECT_TOTAL_CONFLICT_HARD_REGS (obj); #ifdef STACK_REGS if (ALLOCNO_TOTAL_NO_STACK_REG_P (a)) ALLOCNO_NO_STACK_REG_P (a) = true; @@ -3060,8 +3055,10 @@ copy_info_to_removed_store_destinations (int regno) += ALLOCNO_CALLS_CROSSED_NUM (a); ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); - IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a), - ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)); + ALLOCNO_CROSSED_CALLS_ABIS (parent_a) + |= ALLOCNO_CROSSED_CALLS_ABIS (a); + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a) + |= ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a); ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (parent_a) += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); merged_p = true; @@ 
-3108,8 +3105,8 @@ ira_flattening (int max_regno_before_emit, int ira_max_point_before_emit) flattening. */ continue; FOR_EACH_ALLOCNO_OBJECT (a, obj, oi) - COPY_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - OBJECT_CONFLICT_HARD_REGS (obj)); + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) + = OBJECT_CONFLICT_HARD_REGS (obj); #ifdef STACK_REGS ALLOCNO_TOTAL_NO_STACK_REG_P (a) = ALLOCNO_NO_STACK_REG_P (a); #endif @@ -3159,6 +3156,9 @@ ira_flattening (int max_regno_before_emit, int ira_max_point_before_emit) -= ALLOCNO_CALLS_CROSSED_NUM (a); ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) -= ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); + /* Assume that ALLOCNO_CROSSED_CALLS_ABIS and + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS stay the same. + We'd need to rebuild the IR to do better. */ ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (parent_a) -= ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); ira_assert (ALLOCNO_CALLS_CROSSED_NUM (parent_a) >= 0 @@ -3466,7 +3466,7 @@ ira_build (void) allocno crossing calls. */ FOR_EACH_ALLOCNO (a, ai) if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0) - ior_hard_reg_conflicts (a, &call_used_reg_set); + ior_hard_reg_conflicts (a, ira_need_caller_save_regs (a)); } if (internal_flag_ira_verbose > 2 && ira_dump_file != NULL) print_copies (ira_dump_file); diff --git a/gcc/ira-color.c b/gcc/ira-color.c index 8a90ae1b4..62499be91 100644 --- a/gcc/ira-color.c +++ b/gcc/ira-color.c @@ -218,7 +218,7 @@ inline bool allocno_hard_regs_hasher::equal (const allocno_hard_regs *hv1, const allocno_hard_regs *hv2) { - return hard_reg_set_equal_p (hv1->set, hv2->set); + return hv1->set == hv2->set; } /* Hash table of unique allocno hard registers. */ @@ -261,14 +261,14 @@ add_allocno_hard_regs (HARD_REG_SET set, int64_t cost) allocno_hard_regs_t hv; gcc_assert (! hard_reg_set_empty_p (set)); - COPY_HARD_REG_SET (temp.set, set); + temp.set = set; if ((hv = find_hard_regs (&temp)) != NULL) hv->cost += cost; else { hv = ((struct allocno_hard_regs *) ira_allocate (sizeof (struct allocno_hard_regs))); - COPY_HARD_REG_SET (hv->set, set); + hv->set = set; hv->cost = cost; allocno_hard_regs_vec.safe_push (hv); insert_hard_regs (hv); @@ -371,7 +371,7 @@ add_allocno_hard_regs_to_forest (allocno_hard_regs_node_t *roots, start = hard_regs_node_vec.length (); for (node = *roots; node != NULL; node = node->next) { - if (hard_reg_set_equal_p (hv->set, node->hard_regs->set)) + if (hv->set == node->hard_regs->set) return; if (hard_reg_set_subset_p (hv->set, node->hard_regs->set)) { @@ -382,8 +382,7 @@ add_allocno_hard_regs_to_forest (allocno_hard_regs_node_t *roots, hard_regs_node_vec.safe_push (node); else if (hard_reg_set_intersect_p (hv->set, node->hard_regs->set)) { - COPY_HARD_REG_SET (temp_set, hv->set); - AND_HARD_REG_SET (temp_set, node->hard_regs->set); + temp_set = hv->set & node->hard_regs->set; hv2 = add_allocno_hard_regs (temp_set, hv->cost); add_allocno_hard_regs_to_forest (&node->first, hv2); } @@ -398,7 +397,7 @@ add_allocno_hard_regs_to_forest (allocno_hard_regs_node_t *roots, i++) { node = hard_regs_node_vec[i]; - IOR_HARD_REG_SET (temp_set, node->hard_regs->set); + temp_set |= node->hard_regs->set; } hv = add_allocno_hard_regs (temp_set, hv->cost); new_node = create_new_allocno_hard_regs_node (hv); @@ -717,8 +716,7 @@ form_allocno_hard_regs_nodes_forest (void) (allocno_data->profitable_hard_regs, ALLOCNO_MEMORY_COST (a) - ALLOCNO_CLASS_COST (a))); } - SET_HARD_REG_SET (temp); - AND_COMPL_HARD_REG_SET (temp, ira_no_alloc_regs); + temp = ~ira_no_alloc_regs; add_allocno_hard_regs (temp, 0); qsort 
(allocno_hard_regs_vec.address () + start, allocno_hard_regs_vec.length () - start, @@ -833,10 +831,10 @@ setup_left_conflict_sizes_p (ira_allocno_t a) nobj = ALLOCNO_NUM_OBJECTS (a); data = ALLOCNO_COLOR_DATA (a); subnodes = allocno_hard_regs_subnodes + data->hard_regs_subnodes_start; - COPY_HARD_REG_SET (profitable_hard_regs, data->profitable_hard_regs); + profitable_hard_regs = data->profitable_hard_regs; node = data->hard_regs_node; node_preorder_num = node->preorder_num; - COPY_HARD_REG_SET (node_set, node->hard_regs->set); + node_set = node->hard_regs->set; node_check_tick++; for (k = 0; k < nobj; k++) { @@ -859,7 +857,7 @@ setup_left_conflict_sizes_p (ira_allocno_t a) ->profitable_hard_regs)) continue; conflict_node = conflict_data->hard_regs_node; - COPY_HARD_REG_SET (conflict_node_set, conflict_node->hard_regs->set); + conflict_node_set = conflict_node->hard_regs->set; if (hard_reg_set_subset_p (node_set, conflict_node_set)) temp_node = node; else @@ -897,8 +895,7 @@ setup_left_conflict_sizes_p (ira_allocno_t a) int j, n, hard_regno; enum reg_class aclass; - COPY_HARD_REG_SET (temp_set, temp_node->hard_regs->set); - AND_HARD_REG_SET (temp_set, profitable_hard_regs); + temp_set = temp_node->hard_regs->set & profitable_hard_regs; aclass = ALLOCNO_CLASS (a); for (n = 0, j = ira_class_hard_regs_num[aclass] - 1; j >= 0; j--) { @@ -1042,15 +1039,15 @@ setup_profitable_hard_regs (void) else { mode = ALLOCNO_MODE (a); - COPY_HARD_REG_SET (data->profitable_hard_regs, - ira_useful_class_mode_regs[aclass][mode]); + data->profitable_hard_regs + = ira_useful_class_mode_regs[aclass][mode]; nobj = ALLOCNO_NUM_OBJECTS (a); for (k = 0; k < nobj; k++) { ira_object_t obj = ALLOCNO_OBJECT (a, k); - AND_COMPL_HARD_REG_SET (data->profitable_hard_regs, - OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); + data->profitable_hard_regs + &= ~OBJECT_TOTAL_CONFLICT_HARD_REGS (obj); } } } @@ -1091,9 +1088,8 @@ setup_profitable_hard_regs (void) hard_regno + num); } else - AND_COMPL_HARD_REG_SET - (ALLOCNO_COLOR_DATA (conflict_a)->profitable_hard_regs, - ira_reg_mode_hard_regset[hard_regno][mode]); + ALLOCNO_COLOR_DATA (conflict_a)->profitable_hard_regs + &= ~ira_reg_mode_hard_regset[hard_regno][mode]; } } } @@ -1589,20 +1585,15 @@ get_conflict_and_start_profitable_regs (ira_allocno_t a, bool retry_p, for (i = 0; i < nwords; i++) { obj = ALLOCNO_OBJECT (a, i); - COPY_HARD_REG_SET (conflict_regs[i], - OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); + conflict_regs[i] = OBJECT_TOTAL_CONFLICT_HARD_REGS (obj); } if (retry_p) - { - COPY_HARD_REG_SET (*start_profitable_regs, - reg_class_contents[ALLOCNO_CLASS (a)]); - AND_COMPL_HARD_REG_SET (*start_profitable_regs, - ira_prohibited_class_mode_regs - [ALLOCNO_CLASS (a)][ALLOCNO_MODE (a)]); - } + *start_profitable_regs + = (reg_class_contents[ALLOCNO_CLASS (a)] + &~ (ira_prohibited_class_mode_regs + [ALLOCNO_CLASS (a)][ALLOCNO_MODE (a)])); else - COPY_HARD_REG_SET (*start_profitable_regs, - ALLOCNO_COLOR_DATA (a)->profitable_hard_regs); + *start_profitable_regs = ALLOCNO_COLOR_DATA (a)->profitable_hard_regs; } /* Return true if HARD_REGNO is ok for assigning to allocno A with @@ -1659,7 +1650,7 @@ calculate_saved_nregs (int hard_regno, machine_mode mode) ira_assert (hard_regno >= 0); for (i = hard_regno_nregs (hard_regno, mode) - 1; i >= 0; i--) if (!allocated_hardreg_p[hard_regno + i] - && !TEST_HARD_REG_BIT (call_used_reg_set, hard_regno + i) + && !crtl->abi->clobbers_full_reg_p (hard_regno + i) && !LOCAL_REGNO (hard_regno + i)) nregs++; return nregs; @@ -1803,9 +1794,8 @@ 
assign_hard_reg (ira_allocno_t a, bool retry_p) hard_regno + num); } else - IOR_HARD_REG_SET - (conflicting_regs[word], - ira_reg_mode_hard_regset[hard_regno][mode]); + conflicting_regs[word] + |= ira_reg_mode_hard_regset[hard_regno][mode]; if (hard_reg_set_subset_p (profitable_hard_regs, conflicting_regs[word])) goto fail; @@ -2698,8 +2688,7 @@ setup_allocno_available_regs_num (ira_allocno_t a) reg_class_names[aclass], ira_class_hard_regs_num[aclass], n); print_hard_reg_set (ira_dump_file, data->profitable_hard_regs, false); fprintf (ira_dump_file, ", %snode: ", - hard_reg_set_equal_p (data->profitable_hard_regs, - data->hard_regs_node->hard_regs->set) + data->profitable_hard_regs == data->hard_regs_node->hard_regs->set ? "" : "^"); print_hard_reg_set (ira_dump_file, data->hard_regs_node->hard_regs->set, false); @@ -4387,11 +4376,10 @@ allocno_reload_assign (ira_allocno_t a, HARD_REG_SET forbidden_regs) for (i = 0; i < n; i++) { ira_object_t obj = ALLOCNO_OBJECT (a, i); - COPY_HARD_REG_SET (saved[i], OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), forbidden_regs); + saved[i] = OBJECT_TOTAL_CONFLICT_HARD_REGS (obj); + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= forbidden_regs; if (! flag_caller_saves && ALLOCNO_CALLS_CROSSED_NUM (a) != 0) - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - call_used_reg_set); + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= ira_need_caller_save_regs (a); } ALLOCNO_ASSIGNED_P (a) = false; aclass = ALLOCNO_CLASS (a); @@ -4410,9 +4398,7 @@ allocno_reload_assign (ira_allocno_t a, HARD_REG_SET forbidden_regs) ? ALLOCNO_CLASS_COST (a) : ALLOCNO_HARD_REG_COSTS (a)[ira_class_hard_reg_index [aclass][hard_regno]])); - if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0 - && ira_hard_reg_set_intersection_p (hard_regno, ALLOCNO_MODE (a), - call_used_reg_set)) + if (ira_need_caller_save_p (a, regno)) { ira_assert (flag_caller_saves); caller_save_needed = 1; @@ -4434,7 +4420,7 @@ allocno_reload_assign (ira_allocno_t a, HARD_REG_SET forbidden_regs) for (i = 0; i < n; i++) { ira_object_t obj = ALLOCNO_OBJECT (a, i); - COPY_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), saved[i]); + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) = saved[i]; } return reg_renumber[regno] >= 0; } @@ -4519,9 +4505,9 @@ ira_reassign_pseudos (int *spilled_pseudo_regs, int num, for (i = 0; i < num; i++) { regno = spilled_pseudo_regs[i]; - COPY_HARD_REG_SET (forbidden_regs, bad_spill_regs); - IOR_HARD_REG_SET (forbidden_regs, pseudo_forbidden_regs[regno]); - IOR_HARD_REG_SET (forbidden_regs, pseudo_previous_regs[regno]); + forbidden_regs = (bad_spill_regs + | pseudo_forbidden_regs[regno] + | pseudo_previous_regs[regno]); gcc_assert (reg_renumber[regno] < 0); a = ira_regno_allocno_map[regno]; ira_mark_allocation_change (regno); @@ -4699,16 +4685,16 @@ ira_mark_new_stack_slot (rtx x, int regno, poly_uint64 total_size) given IN and OUT for INSN. Return also number points (through EXCESS_PRESSURE_LIVE_LENGTH) where the pseudo-register lives and the register pressure is high, number of references of the - pseudo-registers (through NREFS), number of callee-clobbered - hard-registers occupied by the pseudo-registers (through - CALL_USED_COUNT), and the first hard regno occupied by the + pseudo-registers (through NREFS), the number of psuedo registers + whose allocated register wouldn't need saving in the prologue + (through CALL_USED_COUNT), and the first hard regno occupied by the pseudo-registers (through FIRST_HARD_REGNO). 
*/ static int calculate_spill_cost (int *regnos, rtx in, rtx out, rtx_insn *insn, int *excess_pressure_live_length, int *nrefs, int *call_used_count, int *first_hard_regno) { - int i, cost, regno, hard_regno, j, count, saved_cost, nregs; + int i, cost, regno, hard_regno, count, saved_cost; bool in_p, out_p; int length; ira_allocno_t a; @@ -4725,11 +4711,8 @@ calculate_spill_cost (int *regnos, rtx in, rtx out, rtx_insn *insn, a = ira_regno_allocno_map[regno]; length += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a) / ALLOCNO_NUM_OBJECTS (a); cost += ALLOCNO_MEMORY_COST (a) - ALLOCNO_CLASS_COST (a); - nregs = hard_regno_nregs (hard_regno, ALLOCNO_MODE (a)); - for (j = 0; j < nregs; j++) - if (! TEST_HARD_REG_BIT (call_used_reg_set, hard_regno + j)) - break; - if (j == nregs) + if (in_hard_reg_set_p (crtl->abi->full_reg_clobbers (), + ALLOCNO_MODE (a), hard_regno)) count++; in_p = in && REG_P (in) && (int) REGNO (in) == hard_regno; out_p = out && REG_P (out) && (int) REGNO (out) == hard_regno; @@ -4886,11 +4869,10 @@ fast_allocation (void) for (l = 0; l < nr; l++) { ira_object_t obj = ALLOCNO_OBJECT (a, l); - IOR_HARD_REG_SET (conflict_hard_regs, - OBJECT_CONFLICT_HARD_REGS (obj)); + conflict_hard_regs |= OBJECT_CONFLICT_HARD_REGS (obj); for (r = OBJECT_LIVE_RANGES (obj); r != NULL; r = r->next) for (j = r->start; j <= r->finish; j++) - IOR_HARD_REG_SET (conflict_hard_regs, used_hard_regs[j]); + conflict_hard_regs |= used_hard_regs[j]; } aclass = ALLOCNO_CLASS (a); ALLOCNO_ASSIGNED_P (a) = true; @@ -4938,8 +4920,7 @@ fast_allocation (void) ira_object_t obj = ALLOCNO_OBJECT (a, l); for (r = OBJECT_LIVE_RANGES (obj); r != NULL; r = r->next) for (k = r->start; k <= r->finish; k++) - IOR_HARD_REG_SET (used_hard_regs[k], - ira_reg_mode_hard_regset[hard_regno][mode]); + used_hard_regs[k] |= ira_reg_mode_hard_regset[hard_regno][mode]; } } ira_free (sorted_allocnos); diff --git a/gcc/ira-conflicts.c b/gcc/ira-conflicts.c index 9a3e3811d..a0aefaa05 100644 --- a/gcc/ira-conflicts.c +++ b/gcc/ira-conflicts.c @@ -325,12 +325,37 @@ process_regs_for_copy (rtx reg1, rtx reg2, bool constraint_p, return true; } -/* Process all of the output registers of the current insn which are - not bound (BOUND_P) and the input register REG (its operand number +/* Return true if output operand OUTPUT and input operand INPUT of + INSN can use the same register class for at least one alternative. + INSN is already described in recog_data and recog_op_alt. */ +static bool +can_use_same_reg_p (rtx_insn *insn, int output, int input) +{ + alternative_mask preferred = get_preferred_alternatives (insn); + for (int nalt = 0; nalt < recog_data.n_alternatives; nalt++) + { + if (!TEST_BIT (preferred, nalt)) + continue; + + const operand_alternative *op_alt + = &recog_op_alt[nalt * recog_data.n_operands]; + if (op_alt[input].matches == output) + return true; + + if (ira_reg_class_intersect[op_alt[input].cl][op_alt[output].cl] + != NO_REGS) + return true; + } + return false; +} + +/* Process all of the output registers of the current insn (INSN) which + are not bound (BOUND_P) and the input register REG (its operand number OP_NUM) which dies in the insn as if there were a move insn between them with frequency FREQ. 
*/ static void -process_reg_shuffles (rtx reg, int op_num, int freq, bool *bound_p) +process_reg_shuffles (rtx_insn *insn, rtx reg, int op_num, int freq, + bool *bound_p) { int i; rtx another_reg; @@ -342,7 +367,13 @@ process_reg_shuffles (rtx reg, int op_num, int freq, bool *bound_p) if (!REG_SUBREG_P (another_reg) || op_num == i || recog_data.operand_type[i] != OP_OUT - || bound_p[i]) + || bound_p[i] + || (!can_use_same_reg_p (insn, i, op_num) + && (recog_data.constraints[op_num][0] != '%' + || !can_use_same_reg_p (insn, i, op_num + 1)) + && (op_num == 0 + || recog_data.constraints[op_num - 1][0] != '%' + || !can_use_same_reg_p (insn, i, op_num - 1)))) continue; process_regs_for_copy (reg, another_reg, false, NULL, freq); @@ -358,7 +389,7 @@ add_insn_allocno_copies (rtx_insn *insn) rtx set, operand, dup; bool bound_p[MAX_RECOG_OPERANDS]; int i, n, freq; - HARD_REG_SET alts; + alternative_mask alts; freq = REG_FREQ_FROM_BB (BLOCK_FOR_INSN (insn)); if (freq == 0) @@ -379,7 +410,7 @@ add_insn_allocno_copies (rtx_insn *insn) there are no dead registers, there will be no such copies. */ if (! find_reg_note (insn, REG_DEAD, NULL_RTX)) return; - ira_setup_alts (insn, alts); + alts = ira_setup_alts (insn); for (i = 0; i < recog_data.n_operands; i++) bound_p[i] = false; for (i = 0; i < recog_data.n_operands; i++) @@ -412,7 +443,8 @@ add_insn_allocno_copies (rtx_insn *insn) the corresponding allocno copies. The cost will not correspond to a real move insn cost, so make the frequency smaller. */ - process_reg_shuffles (operand, i, freq < 8 ? 1 : freq / 8, bound_p); + process_reg_shuffles (insn, operand, i, freq < 8 ? 1 : freq / 8, + bound_p); } } @@ -660,17 +692,15 @@ print_allocno_conflicts (FILE * file, bool reg_p, ira_allocno_t a) putc (')', file); } } - COPY_HARD_REG_SET (conflicting_hard_regs, OBJECT_TOTAL_CONFLICT_HARD_REGS (obj)); - AND_COMPL_HARD_REG_SET (conflicting_hard_regs, ira_no_alloc_regs); - AND_HARD_REG_SET (conflicting_hard_regs, - reg_class_contents[ALLOCNO_CLASS (a)]); + conflicting_hard_regs = (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) + & ~ira_no_alloc_regs + & reg_class_contents[ALLOCNO_CLASS (a)]); print_hard_reg_set (file, "\n;; total conflict hard regs:", conflicting_hard_regs); - COPY_HARD_REG_SET (conflicting_hard_regs, OBJECT_CONFLICT_HARD_REGS (obj)); - AND_COMPL_HARD_REG_SET (conflicting_hard_regs, ira_no_alloc_regs); - AND_HARD_REG_SET (conflicting_hard_regs, - reg_class_contents[ALLOCNO_CLASS (a)]); + conflicting_hard_regs = (OBJECT_CONFLICT_HARD_REGS (obj) + & ~ira_no_alloc_regs + & reg_class_contents[ALLOCNO_CLASS (a)]); print_hard_reg_set (file, ";; conflict hard regs:", conflicting_hard_regs); putc ('\n', file); @@ -740,11 +770,7 @@ ira_build_conflicts (void) if (! targetm.class_likely_spilled_p (base)) CLEAR_HARD_REG_SET (temp_hard_reg_set); else - { - COPY_HARD_REG_SET (temp_hard_reg_set, reg_class_contents[base]); - AND_COMPL_HARD_REG_SET (temp_hard_reg_set, ira_no_alloc_regs); - AND_HARD_REG_SET (temp_hard_reg_set, call_used_reg_set); - } + temp_hard_reg_set = reg_class_contents[base] & ~ira_no_alloc_regs; FOR_EACH_ALLOCNO (a, ai) { int i, n = ALLOCNO_NUM_OBJECTS (a); @@ -752,33 +778,28 @@ ira_build_conflicts (void) for (i = 0; i < n; i++) { ira_object_t obj = ALLOCNO_OBJECT (a, i); - machine_mode obj_mode = obj->allocno->mode; rtx allocno_reg = regno_reg_rtx [ALLOCNO_REGNO (a)]; - if ((! flag_caller_saves && ALLOCNO_CALLS_CROSSED_NUM (a) != 0) - /* For debugging purposes don't put user defined variables in - callee-clobbered registers. 
However, do allow parameters - in callee-clobbered registers to improve debugging. This - is a bit of a fragile hack. */ - || (optimize == 0 - && REG_USERVAR_P (allocno_reg) - && ! reg_is_parm_p (allocno_reg))) + /* For debugging purposes don't put user defined variables in + callee-clobbered registers. However, do allow parameters + in callee-clobbered registers to improve debugging. This + is a bit of a fragile hack. */ + if (optimize == 0 + && REG_USERVAR_P (allocno_reg) + && ! reg_is_parm_p (allocno_reg)) { - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - call_used_reg_set); - IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), - call_used_reg_set); + HARD_REG_SET new_conflict_regs = crtl->abi->full_reg_clobbers (); + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= new_conflict_regs; + OBJECT_CONFLICT_HARD_REGS (obj) |= new_conflict_regs; } - else if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0) + + if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0) { - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - no_caller_save_reg_set); - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - temp_hard_reg_set); - IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), - no_caller_save_reg_set); - IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), - temp_hard_reg_set); + HARD_REG_SET new_conflict_regs = ira_need_caller_save_regs (a); + if (flag_caller_saves) + new_conflict_regs &= (~savable_regs | temp_hard_reg_set); + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= new_conflict_regs; + OBJECT_CONFLICT_HARD_REGS (obj) |= new_conflict_regs; } /* Now we deal with paradoxical subreg cases where certain registers @@ -805,23 +826,6 @@ ira_build_conflicts (void) } } } - - if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0) - { - int regno; - - /* Allocnos bigger than the saved part of call saved - regs must conflict with them. */ - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (!TEST_HARD_REG_BIT (call_used_reg_set, regno) - && targetm.hard_regno_call_part_clobbered (NULL, regno, - obj_mode)) - { - SET_HARD_REG_BIT (OBJECT_CONFLICT_HARD_REGS (obj), regno); - SET_HARD_REG_BIT (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - regno); - } - } } } if (optimize && ira_conflicts_p diff --git a/gcc/ira-costs.c b/gcc/ira-costs.c index c7feaba37..baf7261dd 100644 --- a/gcc/ira-costs.c +++ b/gcc/ira-costs.c @@ -237,7 +237,7 @@ setup_cost_classes (cost_classes_t from) allocated. */ static cost_classes_t restrict_cost_classes (cost_classes_t full, machine_mode mode, - const HARD_REG_SET ®s) + const_hard_reg_set regs) { static struct cost_classes narrow; int map[N_REG_CLASSES]; @@ -254,12 +254,9 @@ restrict_cost_classes (cost_classes_t full, machine_mode mode, /* Calculate the set of registers in CL that belong to REGS and are valid for MODE. 
*/ - HARD_REG_SET valid_for_cl; - COPY_HARD_REG_SET (valid_for_cl, reg_class_contents[cl]); - AND_HARD_REG_SET (valid_for_cl, regs); - AND_COMPL_HARD_REG_SET (valid_for_cl, - ira_prohibited_class_mode_regs[cl][mode]); - AND_COMPL_HARD_REG_SET (valid_for_cl, ira_no_alloc_regs); + HARD_REG_SET valid_for_cl = reg_class_contents[cl] & regs; + valid_for_cl &= ~(ira_prohibited_class_mode_regs[cl][mode] + | ira_no_alloc_regs); if (hard_reg_set_empty_p (valid_for_cl)) continue; @@ -343,8 +340,7 @@ setup_regno_cost_classes_by_aclass (int regno, enum reg_class aclass) if ((classes_ptr = cost_classes_aclass_cache[aclass]) == NULL) { - COPY_HARD_REG_SET (temp, reg_class_contents[aclass]); - AND_COMPL_HARD_REG_SET (temp, ira_no_alloc_regs); + temp = reg_class_contents[aclass] & ~ira_no_alloc_regs; /* We exclude classes from consideration which are subsets of ACLASS only if ACLASS is an uniform class. */ exclude_p = ira_uniform_class_p[aclass]; @@ -356,8 +352,7 @@ setup_regno_cost_classes_by_aclass (int regno, enum reg_class aclass) { /* Exclude non-uniform classes which are subsets of ACLASS. */ - COPY_HARD_REG_SET (temp2, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp2, ira_no_alloc_regs); + temp2 = reg_class_contents[cl] & ~ira_no_alloc_regs; if (hard_reg_set_subset_p (temp2, temp) && cl != aclass) continue; } @@ -1482,13 +1477,6 @@ scan_one_insn (rtx_insn *insn) return insn; } - if (pat_code == CLOBBER_HIGH) - { - gcc_assert (REG_P (XEXP (PATTERN (insn), 0)) - && HARD_REGISTER_P (XEXP (PATTERN (insn), 0))); - return insn; - } - counted_mem = false; set = single_set (insn); extract_insn (insn); @@ -2345,7 +2333,6 @@ ira_tune_allocno_costs (void) ira_allocno_object_iterator oi; ira_object_t obj; bool skip_p; - HARD_REG_SET *crossed_calls_clobber_regs; FOR_EACH_ALLOCNO (a, ai) { @@ -2380,14 +2367,7 @@ ira_tune_allocno_costs (void) continue; rclass = REGNO_REG_CLASS (regno); cost = 0; - crossed_calls_clobber_regs - = &(ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)); - if (ira_hard_reg_set_intersection_p (regno, mode, - *crossed_calls_clobber_regs) - && (ira_hard_reg_set_intersection_p (regno, mode, - call_used_reg_set) - || targetm.hard_regno_call_part_clobbered (NULL, regno, - mode))) + if (ira_need_caller_save_p (a, regno)) cost += (ALLOCNO_CALL_FREQ (a) * (ira_memory_move_cost[mode][rclass][0] + ira_memory_move_cost[mode][rclass][1])); diff --git a/gcc/ira-emit.c b/gcc/ira-emit.c index 51bf9c8bc..f44a0d199 100644 --- a/gcc/ira-emit.c +++ b/gcc/ira-emit.c @@ -1115,8 +1115,8 @@ add_range_and_copies_from_move_list (move_t list, ira_loop_tree_node_t node, ira_allocate_object_conflicts (to_obj, n); } } - ior_hard_reg_conflicts (from, &hard_regs_live); - ior_hard_reg_conflicts (to, &hard_regs_live); + ior_hard_reg_conflicts (from, hard_regs_live); + ior_hard_reg_conflicts (to, hard_regs_live); update_costs (from, true, freq); update_costs (to, false, freq); diff --git a/gcc/ira-int.h b/gcc/ira-int.h index 3c7fe4e64..a2529ff81 100644 --- a/gcc/ira-int.h +++ b/gcc/ira-int.h @@ -22,6 +22,7 @@ along with GCC; see the file COPYING3. If not see #define GCC_IRA_INT_H #include "recog.h" +#include "function-abi.h" /* To provide consistency in naming, all IRA external variables, functions, common typedefs start with prefix ira_. */ @@ -287,6 +288,9 @@ struct ira_allocno /* Register class which should be used for allocation for given allocno. NO_REGS means that we should use memory. */ ENUM_BITFIELD (reg_class) aclass : 16; + /* A bitmask of the ABIs used by calls that occur while the allocno + is live. 
*/ + unsigned int crossed_calls_abis : NUM_ABI_IDS; /* During the reload, value TRUE means that we should not reassign a hard register to the allocno got memory earlier. It is set up when we removed memory-memory move insn before each iteration of @@ -423,6 +427,7 @@ struct ira_allocno #define ALLOCNO_CALL_FREQ(A) ((A)->call_freq) #define ALLOCNO_CALLS_CROSSED_NUM(A) ((A)->calls_crossed_num) #define ALLOCNO_CHEAP_CALLS_CROSSED_NUM(A) ((A)->cheap_calls_crossed_num) +#define ALLOCNO_CROSSED_CALLS_ABIS(A) ((A)->crossed_calls_abis) #define ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS(A) \ ((A)->crossed_calls_clobbered_regs) #define ALLOCNO_MEM_OPTIMIZED_DEST(A) ((A)->mem_optimized_dest) @@ -963,8 +968,8 @@ extern void ira_print_disposition (FILE *); extern void ira_debug_disposition (void); extern void ira_debug_allocno_classes (void); extern void ira_init_register_move_cost (machine_mode); -extern void ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts); -extern int ira_get_dup_out_num (int op_num, HARD_REG_SET &alts); +extern alternative_mask ira_setup_alts (rtx_insn *); +extern int ira_get_dup_out_num (int, alternative_mask); /* ira-build.c */ @@ -996,7 +1001,7 @@ extern void ira_set_allocno_class (ira_allocno_t, enum reg_class); extern bool ira_conflict_vector_profitable_p (ira_object_t, int); extern void ira_allocate_conflict_vec (ira_object_t, int); extern void ira_allocate_object_conflicts (ira_object_t, int); -extern void ior_hard_reg_conflicts (ira_allocno_t, HARD_REG_SET *); +extern void ior_hard_reg_conflicts (ira_allocno_t, const_hard_reg_set); extern void ira_print_expanded_allocno (ira_allocno_t); extern void ira_add_live_range_to_object (ira_object_t, int, int); extern live_range_t ira_create_live_range (ira_object_t, int, int, @@ -1508,4 +1513,28 @@ ira_allocate_and_set_or_copy_costs (int **vec, enum reg_class aclass, extern rtx ira_create_new_reg (rtx); extern int first_moveable_pseudo, last_moveable_pseudo; +/* Return the set of registers that would need a caller save if allocno A + overlapped them. */ + +inline HARD_REG_SET +ira_need_caller_save_regs (ira_allocno_t a) +{ + return call_clobbers_in_region (ALLOCNO_CROSSED_CALLS_ABIS (a), + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a), + ALLOCNO_MODE (a)); +} + +/* Return true if we would need to save allocno A around a call if we + assigned hard register REGNO. */ + +inline bool +ira_need_caller_save_p (ira_allocno_t a, unsigned int regno) +{ + if (ALLOCNO_CALLS_CROSSED_NUM (a) == 0) + return false; + return call_clobbered_in_region_p (ALLOCNO_CROSSED_CALLS_ABIS (a), + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a), + ALLOCNO_MODE (a), regno); +} + #endif /* GCC_IRA_INT_H */ diff --git a/gcc/ira-lives.c b/gcc/ira-lives.c index faadf08b0..b933dff16 100644 --- a/gcc/ira-lives.c +++ b/gcc/ira-lives.c @@ -33,6 +33,7 @@ along with GCC; see the file COPYING3. If not see #include "ira.h" #include "ira-int.h" #include "sparseset.h" +#include "function-abi.h" /* The code in this file is similar to one in global but the code works on the allocno basis and creates live ranges instead of @@ -80,8 +81,9 @@ static int last_call_num; /* The number of last call at which given allocno was saved. */ static int *allocno_saved_at_call; -/* The value of get_preferred_alternatives for the current instruction, - supplemental to recog_data. */ +/* The value returned by ira_setup_alts for the current instruction; + i.e. the set of alternatives that we should consider to be likely + candidates during reloading. 
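
Note (not part of any patch): the ira-int.h additions just above replace blanket call_used_reg_set tests with per-ABI information recorded on each allocno. A rough sketch of how the two sides fit together, using only the fields and helpers introduced above; the wrapper function is invented for illustration:

    /* Record one call INSN crossed by allocno A, then ask whether hard
       register REGNO would need a caller save.  */
    static bool
    crossed_call_needs_save_sketch (ira_allocno_t a, rtx_insn *insn,
                                    unsigned int regno)
    {
      function_abi callee_abi = insn_callee_abi (insn);
      ALLOCNO_CROSSED_CALLS_ABIS (a) |= 1 << callee_abi.id ();
      ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a)
        |= callee_abi.full_and_partial_reg_clobbers ();
      return ira_need_caller_save_p (a, regno);
    }

The ira-lives.c hunks below record exactly this pair of facts at each call, and the ira-costs.c hunk above and the ira.c setup_reg_renumber hunk below use ira_need_caller_save_p to decide whether an assignment implies save/restore code.
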
*/ static alternative_mask preferred_alternatives; /* If non-NULL, the source operand of a register to register copy for which @@ -187,8 +189,8 @@ make_object_dead (ira_object_t obj) } } - IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), hard_regs_live); - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), hard_regs_live); + OBJECT_CONFLICT_HARD_REGS (obj) |= hard_regs_live; + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= hard_regs_live; /* If IGNORE_REG_FOR_CONFLICTS did not already conflict with OBJ, make sure it still doesn't. */ @@ -989,10 +991,8 @@ process_single_reg_class_operands (bool in_p, int freq) /* We could increase costs of A instead of making it conflicting with the hard register. But it works worse because it will be spilled in reload in anyway. */ - IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), - reg_class_contents[cl]); - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - reg_class_contents[cl]); + OBJECT_CONFLICT_HARD_REGS (obj) |= reg_class_contents[cl]; + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) |= reg_class_contents[cl]; } } } @@ -1130,8 +1130,7 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) reg_live_out = df_get_live_out (bb); sparseset_clear (objects_live); REG_SET_TO_HARD_REG_SET (hard_regs_live, reg_live_out); - AND_COMPL_HARD_REG_SET (hard_regs_live, eliminable_regset); - AND_COMPL_HARD_REG_SET (hard_regs_live, ira_no_alloc_regs); + hard_regs_live &= ~(eliminable_regset | ira_no_alloc_regs); for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) if (TEST_HARD_REG_BIT (hard_regs_live, i)) { @@ -1236,9 +1235,7 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) } } - extract_insn (insn); - preferred_alternatives = get_preferred_alternatives (insn); - preprocess_constraints (insn); + preferred_alternatives = ira_setup_alts (insn); process_single_reg_class_operands (false, freq); /* See which defined values die here. */ @@ -1263,10 +1260,7 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) ira_object_t obj = ira_object_id_map[i]; a = OBJECT_ALLOCNO (obj); int num = ALLOCNO_NUM (a); - HARD_REG_SET this_call_used_reg_set; - - get_call_reg_set_usage (insn, &this_call_used_reg_set, - call_used_reg_set); + function_abi callee_abi = insn_callee_abi (insn); /* Don't allocate allocnos that cross setjmps or any call, if this function receives a nonlocal @@ -1281,10 +1275,10 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) } if (can_throw_internal (insn)) { - IOR_HARD_REG_SET (OBJECT_CONFLICT_HARD_REGS (obj), - this_call_used_reg_set); - IOR_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - this_call_used_reg_set); + OBJECT_CONFLICT_HARD_REGS (obj) + |= callee_abi.mode_clobbers (ALLOCNO_MODE (a)); + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) + |= callee_abi.mode_clobbers (ALLOCNO_MODE (a)); } if (sparseset_bit_p (allocnos_processed, num)) @@ -1301,8 +1295,9 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) /* Mark it as saved at the next call. 
*/ allocno_saved_at_call[num] = last_call_num + 1; ALLOCNO_CALLS_CROSSED_NUM (a)++; - IOR_HARD_REG_SET (ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a), - this_call_used_reg_set); + ALLOCNO_CROSSED_CALLS_ABIS (a) |= 1 << callee_abi.id (); + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a) + |= callee_abi.full_and_partial_reg_clobbers (); if (cheap_reg != NULL_RTX && ALLOCNO_REGNO (a) == (int) REGNO (cheap_reg)) ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a)++; @@ -1355,10 +1350,11 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) } /* Allocnos can't go in stack regs at the start of a basic block - that is reached by an abnormal edge. Likewise for call - clobbered regs, because caller-save, fixup_abnormal_edges and - possibly the table driven EH machinery are not quite ready to - handle such allocnos live across such edges. */ + that is reached by an abnormal edge. Likewise for registers + that are at least partly call clobbered, because caller-save, + fixup_abnormal_edges and possibly the table driven EH machinery + are not quite ready to handle such allocnos live across such + edges. */ if (bb_has_abnormal_pred (bb)) { #ifdef STACK_REGS @@ -1378,7 +1374,7 @@ process_bb_node_lives (ira_loop_tree_node_t loop_tree_node) if (!cfun->has_nonlocal_label && has_abnormal_call_or_eh_pred_edge_p (bb)) for (px = 0; px < FIRST_PSEUDO_REGISTER; px++) - if (call_used_regs[px] + if (eh_edge_abi.clobbers_at_least_part_of_reg_p (px) #ifdef REAL_PIC_OFFSET_TABLE_REGNUM /* We should create a conflict of PIC pseudo with PIC hard reg as PIC hard reg can have a wrong diff --git a/gcc/ira.c b/gcc/ira.c index 4262e5cf3..a985dddaf 100644 --- a/gcc/ira.c +++ b/gcc/ira.c @@ -471,8 +471,7 @@ setup_class_hard_regs (void) ira_assert (SHRT_MAX >= FIRST_PSEUDO_REGISTER); for (cl = (int) N_REG_CLASSES - 1; cl >= 0; cl--) { - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; CLEAR_HARD_REG_SET (processed_hard_reg_set); for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) { @@ -514,7 +513,7 @@ setup_alloc_regs (bool use_hard_frame_p) #ifdef ADJUST_REG_ALLOC_ORDER ADJUST_REG_ALLOC_ORDER; #endif - COPY_HARD_REG_SET (no_unit_alloc_regs, fixed_nonglobal_reg_set); + no_unit_alloc_regs = fixed_nonglobal_reg_set; if (! use_hard_frame_p) SET_HARD_REG_BIT (no_unit_alloc_regs, HARD_FRAME_POINTER_REGNUM); setup_class_hard_regs (); @@ -541,8 +540,7 @@ setup_reg_subclasses (void) if (i == (int) NO_REGS) continue; - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[i]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[i] & ~no_unit_alloc_regs; if (hard_reg_set_empty_p (temp_hard_regset)) continue; for (j = 0; j < N_REG_CLASSES; j++) @@ -550,8 +548,7 @@ setup_reg_subclasses (void) { enum reg_class *p; - COPY_HARD_REG_SET (temp_hard_regset2, reg_class_contents[j]); - AND_COMPL_HARD_REG_SET (temp_hard_regset2, no_unit_alloc_regs); + temp_hard_regset2 = reg_class_contents[j] & ~no_unit_alloc_regs; if (! 
hard_reg_set_subset_p (temp_hard_regset, temp_hard_regset2)) continue; @@ -605,10 +602,8 @@ setup_class_subset_and_memory_move_costs (void) for (cl = (int) N_REG_CLASSES - 1; cl >= 0; cl--) for (cl2 = (int) N_REG_CLASSES - 1; cl2 >= 0; cl2--) { - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); - COPY_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl2]); - AND_COMPL_HARD_REG_SET (temp_hard_regset2, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; + temp_hard_regset2 = reg_class_contents[cl2] & ~no_unit_alloc_regs; ira_class_subset_p[cl][cl2] = hard_reg_set_subset_p (temp_hard_regset, temp_hard_regset2); if (! hard_reg_set_empty_p (temp_hard_regset2) @@ -757,8 +752,7 @@ setup_stack_reg_pressure_class (void) for (i = 0; i < ira_pressure_classes_num; i++) { cl = ira_pressure_classes[i]; - COPY_HARD_REG_SET (temp_hard_regset2, temp_hard_regset); - AND_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl]); + temp_hard_regset2 = temp_hard_regset & reg_class_contents[cl]; size = hard_reg_set_size (temp_hard_regset2); if (best < size) { @@ -816,10 +810,10 @@ setup_pressure_classes (void) register pressure class. */ for (m = 0; m < NUM_MACHINE_MODES; m++) { - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); - AND_COMPL_HARD_REG_SET (temp_hard_regset, - ira_prohibited_class_mode_regs[cl][m]); + temp_hard_regset + = (reg_class_contents[cl] + & ~(no_unit_alloc_regs + | ira_prohibited_class_mode_regs[cl][m])); if (hard_reg_set_empty_p (temp_hard_regset)) continue; ira_init_register_move_cost_if_necessary ((machine_mode) m); @@ -833,8 +827,7 @@ setup_pressure_classes (void) } curr = 0; insert_p = true; - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; /* Remove so far added pressure classes which are subset of the current candidate class. Prefer GENERAL_REGS as a pressure register class to another class containing the same @@ -845,11 +838,10 @@ setup_pressure_classes (void) for (i = 0; i < n; i++) { cl2 = pressure_classes[i]; - COPY_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl2]); - AND_COMPL_HARD_REG_SET (temp_hard_regset2, no_unit_alloc_regs); + temp_hard_regset2 = (reg_class_contents[cl2] + & ~no_unit_alloc_regs); if (hard_reg_set_subset_p (temp_hard_regset, temp_hard_regset2) - && (! hard_reg_set_equal_p (temp_hard_regset, - temp_hard_regset2) + && (temp_hard_regset != temp_hard_regset2 || cl2 == (int) GENERAL_REGS)) { pressure_classes[curr++] = (enum reg_class) cl2; @@ -857,11 +849,10 @@ setup_pressure_classes (void) continue; } if (hard_reg_set_subset_p (temp_hard_regset2, temp_hard_regset) - && (! hard_reg_set_equal_p (temp_hard_regset2, - temp_hard_regset) + && (temp_hard_regset2 != temp_hard_regset || cl == (int) GENERAL_REGS)) continue; - if (hard_reg_set_equal_p (temp_hard_regset2, temp_hard_regset)) + if (temp_hard_regset2 == temp_hard_regset) insert_p = false; pressure_classes[curr++] = (enum reg_class) cl2; } @@ -882,7 +873,7 @@ setup_pressure_classes (void) registers available for the allocation. 
*/ CLEAR_HARD_REG_SET (temp_hard_regset); CLEAR_HARD_REG_SET (temp_hard_regset2); - COPY_HARD_REG_SET (ignore_hard_regs, no_unit_alloc_regs); + ignore_hard_regs = no_unit_alloc_regs; for (cl = 0; cl < LIM_REG_CLASSES; cl++) { /* For some targets (like MIPS with MD_REGS), there are some @@ -893,23 +884,23 @@ setup_pressure_classes (void) break; if (m >= NUM_MACHINE_MODES) { - IOR_HARD_REG_SET (ignore_hard_regs, reg_class_contents[cl]); + ignore_hard_regs |= reg_class_contents[cl]; continue; } for (i = 0; i < n; i++) if ((int) pressure_classes[i] == cl) break; - IOR_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl]); + temp_hard_regset2 |= reg_class_contents[cl]; if (i < n) - IOR_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); + temp_hard_regset |= reg_class_contents[cl]; } for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) /* Some targets (like SPARC with ICC reg) have allocatable regs for which no reg class is defined. */ if (REGNO_REG_CLASS (i) == NO_REGS) SET_HARD_REG_BIT (ignore_hard_regs, i); - AND_COMPL_HARD_REG_SET (temp_hard_regset, ignore_hard_regs); - AND_COMPL_HARD_REG_SET (temp_hard_regset2, ignore_hard_regs); + temp_hard_regset &= ~ignore_hard_regs; + temp_hard_regset2 &= ~ignore_hard_regs; ira_assert (hard_reg_set_subset_p (temp_hard_regset2, temp_hard_regset)); } #endif @@ -1001,16 +992,12 @@ setup_allocno_and_important_classes (void) same set of hard registers. */ for (i = 0; i < LIM_REG_CLASSES; i++) { - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[i]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[i] & ~no_unit_alloc_regs; for (j = 0; j < n; j++) { cl = classes[j]; - COPY_HARD_REG_SET (temp_hard_regset2, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp_hard_regset2, - no_unit_alloc_regs); - if (hard_reg_set_equal_p (temp_hard_regset, - temp_hard_regset2)) + temp_hard_regset2 = reg_class_contents[cl] & ~no_unit_alloc_regs; + if (temp_hard_regset == temp_hard_regset2) break; } if (j >= n || targetm.additional_allocno_class_p (i)) @@ -1037,14 +1024,12 @@ setup_allocno_and_important_classes (void) for (cl = 0; cl < N_REG_CLASSES; cl++) if (ira_class_hard_regs_num[cl] > 0) { - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; set_p = false; for (j = 0; j < ira_allocno_classes_num; j++) { - COPY_HARD_REG_SET (temp_hard_regset2, - reg_class_contents[ira_allocno_classes[j]]); - AND_COMPL_HARD_REG_SET (temp_hard_regset2, no_unit_alloc_regs); + temp_hard_regset2 = (reg_class_contents[ira_allocno_classes[j]] + & ~no_unit_alloc_regs); if ((enum reg_class) cl == ira_allocno_classes[j]) break; else if (hard_reg_set_subset_p (temp_hard_regset, @@ -1118,10 +1103,9 @@ setup_class_translate_array (enum reg_class *class_translate, for (i = 0; i < classes_num; i++) { aclass = classes[i]; - COPY_HARD_REG_SET (temp_hard_regset, - reg_class_contents[aclass]); - AND_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); + temp_hard_regset = (reg_class_contents[aclass] + & reg_class_contents[cl] + & ~no_unit_alloc_regs); if (! 
hard_reg_set_empty_p (temp_hard_regset)) { min_cost = INT_MAX; @@ -1223,10 +1207,8 @@ setup_reg_class_relations (void) ira_reg_classes_intersect_p[cl1][cl2] = false; ira_reg_class_intersect[cl1][cl2] = NO_REGS; ira_reg_class_subset[cl1][cl2] = NO_REGS; - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl1]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); - COPY_HARD_REG_SET (temp_set2, reg_class_contents[cl2]); - AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[cl1] & ~no_unit_alloc_regs; + temp_set2 = reg_class_contents[cl2] & ~no_unit_alloc_regs; if (hard_reg_set_empty_p (temp_hard_regset) && hard_reg_set_empty_p (temp_set2)) { @@ -1264,16 +1246,14 @@ setup_reg_class_relations (void) } ira_reg_class_subunion[cl1][cl2] = NO_REGS; ira_reg_class_superunion[cl1][cl2] = NO_REGS; - COPY_HARD_REG_SET (intersection_set, reg_class_contents[cl1]); - AND_HARD_REG_SET (intersection_set, reg_class_contents[cl2]); - AND_COMPL_HARD_REG_SET (intersection_set, no_unit_alloc_regs); - COPY_HARD_REG_SET (union_set, reg_class_contents[cl1]); - IOR_HARD_REG_SET (union_set, reg_class_contents[cl2]); - AND_COMPL_HARD_REG_SET (union_set, no_unit_alloc_regs); + intersection_set = (reg_class_contents[cl1] + & reg_class_contents[cl2] + & ~no_unit_alloc_regs); + union_set = ((reg_class_contents[cl1] | reg_class_contents[cl2]) + & ~no_unit_alloc_regs); for (cl3 = 0; cl3 < N_REG_CLASSES; cl3++) { - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl3]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[cl3] & ~no_unit_alloc_regs; if (hard_reg_set_subset_p (temp_hard_regset, intersection_set)) { /* CL3 allocatable hard register set is inside of @@ -1281,17 +1261,16 @@ setup_reg_class_relations (void) of CL1 and CL2. */ if (important_class_p[cl3]) { - COPY_HARD_REG_SET - (temp_set2, - reg_class_contents - [(int) ira_reg_class_intersect[cl1][cl2]]); - AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); + temp_set2 + = (reg_class_contents + [ira_reg_class_intersect[cl1][cl2]]); + temp_set2 &= ~no_unit_alloc_regs; if (! hard_reg_set_subset_p (temp_hard_regset, temp_set2) /* If the allocatable hard register sets are the same, prefer GENERAL_REGS or the smallest class for debugging purposes. */ - || (hard_reg_set_equal_p (temp_hard_regset, temp_set2) + || (temp_hard_regset == temp_set2 && (cl3 == GENERAL_REGS || ((ira_reg_class_intersect[cl1][cl2] != GENERAL_REGS) @@ -1302,14 +1281,13 @@ setup_reg_class_relations (void) ira_reg_class_intersect[cl1][cl2]]))))) ira_reg_class_intersect[cl1][cl2] = (enum reg_class) cl3; } - COPY_HARD_REG_SET - (temp_set2, - reg_class_contents[(int) ira_reg_class_subset[cl1][cl2]]); - AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); + temp_set2 + = (reg_class_contents[ira_reg_class_subset[cl1][cl2]] + & ~no_unit_alloc_regs); if (! hard_reg_set_subset_p (temp_hard_regset, temp_set2) /* Ignore unavailable hard registers and prefer smallest class for debugging purposes. */ - || (hard_reg_set_equal_p (temp_hard_regset, temp_set2) + || (temp_hard_regset == temp_set2 && hard_reg_set_subset_p (reg_class_contents[cl3], reg_class_contents @@ -1322,15 +1300,13 @@ setup_reg_class_relations (void) /* CL3 allocatable hard register set is inside of union of allocatable hard register sets of CL1 and CL2. 
*/ - COPY_HARD_REG_SET - (temp_set2, - reg_class_contents[(int) ira_reg_class_subunion[cl1][cl2]]); - AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); + temp_set2 + = (reg_class_contents[ira_reg_class_subunion[cl1][cl2]] + & ~no_unit_alloc_regs); if (ira_reg_class_subunion[cl1][cl2] == NO_REGS || (hard_reg_set_subset_p (temp_set2, temp_hard_regset) - && (! hard_reg_set_equal_p (temp_set2, - temp_hard_regset) + && (temp_set2 != temp_hard_regset || cl3 == GENERAL_REGS /* If the allocatable hard register sets are the same, prefer GENERAL_REGS or the smallest @@ -1347,15 +1323,13 @@ setup_reg_class_relations (void) /* CL3 allocatable hard register set contains union of allocatable hard register sets of CL1 and CL2. */ - COPY_HARD_REG_SET - (temp_set2, - reg_class_contents[(int) ira_reg_class_superunion[cl1][cl2]]); - AND_COMPL_HARD_REG_SET (temp_set2, no_unit_alloc_regs); + temp_set2 + = (reg_class_contents[ira_reg_class_superunion[cl1][cl2]] + & ~no_unit_alloc_regs); if (ira_reg_class_superunion[cl1][cl2] == NO_REGS || (hard_reg_set_subset_p (temp_hard_regset, temp_set2) - && (! hard_reg_set_equal_p (temp_set2, - temp_hard_regset) + && (temp_set2 != temp_hard_regset || cl3 == GENERAL_REGS /* If the allocatable hard register sets are the same, prefer GENERAL_REGS or the smallest @@ -1499,8 +1473,7 @@ setup_prohibited_class_mode_regs (void) for (cl = (int) N_REG_CLASSES - 1; cl >= 0; cl--) { - COPY_HARD_REG_SET (temp_hard_regset, reg_class_contents[cl]); - AND_COMPL_HARD_REG_SET (temp_hard_regset, no_unit_alloc_regs); + temp_hard_regset = reg_class_contents[cl] & ~no_unit_alloc_regs; for (j = 0; j < NUM_MACHINE_MODES; j++) { count = 0; @@ -1784,68 +1757,59 @@ setup_prohibited_mode_move_regs (void) -/* Setup possible alternatives in ALTS for INSN. */ -void -ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) +/* Extract INSN and return the set of alternatives that we should consider. + This excludes any alternatives whose constraints are obviously impossible + to meet (e.g. because the constraint requires a constant and the operand + is nonconstant). It also excludes alternatives that are bound to need + a spill or reload, as long as we have other alternatives that match + exactly. */ +alternative_mask +ira_setup_alts (rtx_insn *insn) { - /* MAP nalt * nop -> start of constraints for given operand and - alternative. */ - static vec insn_constraints; int nop, nalt; bool curr_swapped; const char *p; int commutative = -1; extract_insn (insn); + preprocess_constraints (insn); alternative_mask preferred = get_preferred_alternatives (insn); - CLEAR_HARD_REG_SET (alts); - insn_constraints.release (); - insn_constraints.safe_grow_cleared (recog_data.n_operands - * recog_data.n_alternatives + 1); + alternative_mask alts = 0; + alternative_mask exact_alts = 0; /* Check that the hard reg set is enough for holding all alternatives. It is hard to imagine the situation when the assertion is wrong. */ ira_assert (recog_data.n_alternatives <= (int) MAX (sizeof (HARD_REG_ELT_TYPE) * CHAR_BIT, FIRST_PSEUDO_REGISTER)); + for (nop = 0; nop < recog_data.n_operands; nop++) + if (recog_data.constraints[nop][0] == '%') + { + commutative = nop; + break; + } for (curr_swapped = false;; curr_swapped = true) { - /* Calculate some data common for all alternatives to speed up the - function. 
*/ - for (nop = 0; nop < recog_data.n_operands; nop++) - { - for (nalt = 0, p = recog_data.constraints[nop]; - nalt < recog_data.n_alternatives; - nalt++) - { - insn_constraints[nop * recog_data.n_alternatives + nalt] = p; - while (*p && *p != ',') - { - /* We only support one commutative marker, the first - one. We already set commutative above. */ - if (*p == '%' && commutative < 0) - commutative = nop; - p++; - } - if (*p) - p++; - } - } for (nalt = 0; nalt < recog_data.n_alternatives; nalt++) { - if (!TEST_BIT (preferred, nalt) - || TEST_HARD_REG_BIT (alts, nalt)) + if (!TEST_BIT (preferred, nalt) || TEST_BIT (exact_alts, nalt)) continue; + const operand_alternative *op_alt + = &recog_op_alt[nalt * recog_data.n_operands]; + int this_reject = 0; for (nop = 0; nop < recog_data.n_operands; nop++) { int c, len; + this_reject += op_alt[nop].reject; + rtx op = recog_data.operand[nop]; - p = insn_constraints[nop * recog_data.n_alternatives + nalt]; + p = op_alt[nop].constraint; if (*p == 0 || *p == ',') continue; - + + bool win_p = false; do switch (c = *p, len = CONSTRAINT_LEN (c, p), c) { @@ -1863,7 +1827,14 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - goto op_success; + { + rtx other = recog_data.operand[c - '0']; + if (MEM_P (other) + ? rtx_equal_p (other, op) + : REG_P (op) || SUBREG_P (op)) + goto op_success; + win_p = true; + } break; case 'g': @@ -1877,7 +1848,11 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) { case CT_REGISTER: if (reg_class_for_constraint (cn) != NO_REGS) - goto op_success; + { + if (REG_P (op) || SUBREG_P (op)) + goto op_success; + win_p = true; + } break; case CT_CONST_INT: @@ -1888,9 +1863,14 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) break; case CT_ADDRESS: + goto op_success; + case CT_MEMORY: case CT_SPECIAL_MEMORY: - goto op_success; + if (MEM_P (op)) + goto op_success; + win_p = true; + break; case CT_FIXED_FORM: if (constraint_satisfied_p (op, cn)) @@ -1901,12 +1881,22 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) } } while (p += len, c); - break; + if (!win_p) + break; + /* We can make the alternative match by spilling a register + to memory or loading something into a register. Count a + cost of one reload (the equivalent of the '?' constraint). */ + this_reject += 6; op_success: ; } + if (nop >= recog_data.n_operands) - SET_HARD_REG_BIT (alts, nalt); + { + alts |= ALTERNATIVE_BIT (nalt); + if (this_reject == 0) + exact_alts |= ALTERNATIVE_BIT (nalt); + } } if (commutative < 0) break; @@ -1916,14 +1906,15 @@ ira_setup_alts (rtx_insn *insn, HARD_REG_SET &alts) if (curr_swapped) break; } + return exact_alts ? exact_alts : alts; } /* Return the number of the output non-early clobber operand which should be the same in any case as operand with number OP_NUM (or - negative value if there is no such operand). The function takes - only really possible alternatives into consideration. */ + negative value if there is no such operand). ALTS is the mask + of alternatives that we should consider. 
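
Note (not part of any patch): ira_setup_alts now returns an alternative_mask instead of filling a HARD_REG_SET passed by reference. A minimal sketch of a caller, assuming the recog_data/ira-int.h context of the functions above; the loop body is illustrative only:

    /* Walk the alternatives that ira_setup_alts considers viable.  */
    static void
    walk_viable_alternatives_sketch (rtx_insn *insn)
    {
      alternative_mask alts = ira_setup_alts (insn);
      for (int nalt = 0; nalt < recog_data.n_alternatives; nalt++)
        if (TEST_BIT (alts, nalt))
          {
            /* Alternative NALT is worth considering; exact matches were
               flagged with ALTERNATIVE_BIT (nalt) inside ira_setup_alts.  */
          }
    }

ira_get_dup_out_num, whose updated definition follows, takes the same mask and skips alternatives whose bit is clear.
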
*/ int -ira_get_dup_out_num (int op_num, HARD_REG_SET &alts) +ira_get_dup_out_num (int op_num, alternative_mask alts) { int curr_alt, c, original, dup; bool ignore_p, use_commut_op_p; @@ -1940,7 +1931,7 @@ ira_get_dup_out_num (int op_num, HARD_REG_SET &alts) { rtx op = recog_data.operand[op_num]; - for (curr_alt = 0, ignore_p = !TEST_HARD_REG_BIT (alts, curr_alt), + for (curr_alt = 0, ignore_p = !TEST_BIT (alts, curr_alt), original = -1;;) { c = *str; @@ -1951,7 +1942,7 @@ ira_get_dup_out_num (int op_num, HARD_REG_SET &alts) else if (c == ',') { curr_alt++; - ignore_p = !TEST_HARD_REG_BIT (alts, curr_alt); + ignore_p = !TEST_BIT (alts, curr_alt); } else if (! ignore_p) switch (c) @@ -1981,26 +1972,8 @@ ira_get_dup_out_num (int op_num, HARD_REG_SET &alts) } if (original == -1) goto fail; - dup = -1; - for (ignore_p = false, str = recog_data.constraints[original - '0']; - *str != 0; - str++) - if (ignore_p) - { - if (*str == ',') - ignore_p = false; - } - else if (*str == '#') - ignore_p = true; - else if (! ignore_p) - { - if (*str == '=') - dup = original - '0'; - /* It is better ignore an alternative with early clobber. */ - else if (*str == '&') - goto fail; - } - if (dup >= 0) + dup = original - '0'; + if (recog_data.operand_type[dup] == OP_OUT) return dup; fail: if (use_commut_op_p) @@ -2305,7 +2278,7 @@ ira_setup_eliminable_regset (void) if (frame_pointer_needed) df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM, true); - COPY_HARD_REG_SET (ira_no_alloc_regs, no_unit_alloc_regs); + ira_no_alloc_regs = no_unit_alloc_regs; CLEAR_HARD_REG_SET (eliminable_regset); compute_regs_asm_clobbered (); @@ -2326,7 +2299,7 @@ ira_setup_eliminable_regset (void) SET_HARD_REG_BIT (ira_no_alloc_regs, eliminables[i].from); } else if (cannot_elim) - error ("%s cannot be used in asm here", + error ("%s cannot be used in % here", reg_names[eliminables[i].from]); else df_set_regs_ever_live (eliminables[i].from, true); @@ -2340,7 +2313,7 @@ ira_setup_eliminable_regset (void) SET_HARD_REG_BIT (ira_no_alloc_regs, HARD_FRAME_POINTER_REGNUM); } else if (frame_pointer_needed) - error ("%s cannot be used in asm here", + error ("%s cannot be used in % here", reg_names[HARD_FRAME_POINTER_REGNUM]); else df_set_regs_ever_live (HARD_FRAME_POINTER_REGNUM, true); @@ -2392,12 +2365,10 @@ setup_reg_renumber (void) for (i = 0; i < nwords; i++) { obj = ALLOCNO_OBJECT (a, i); - IOR_COMPL_HARD_REG_SET (OBJECT_TOTAL_CONFLICT_HARD_REGS (obj), - reg_class_contents[pclass]); + OBJECT_TOTAL_CONFLICT_HARD_REGS (obj) + |= ~reg_class_contents[pclass]; } - if (ALLOCNO_CALLS_CROSSED_NUM (a) != 0 - && ira_hard_reg_set_intersection_p (hard_regno, ALLOCNO_MODE (a), - call_used_reg_set)) + if (ira_need_caller_save_p (a, hard_regno)) { ira_assert (!optimize || flag_caller_saves || (ALLOCNO_CALLS_CROSSED_NUM (a) @@ -3004,7 +2975,7 @@ validate_equiv_mem (rtx_insn *start, rtx reg, rtx memref) return valid_none; } - note_stores (PATTERN (insn), validate_equiv_mem_from_store, &info); + note_stores (insn, validate_equiv_mem_from_store, &info); if (info.equiv_mem_modified) return valid_none; @@ -3092,7 +3063,6 @@ equiv_init_movable_p (rtx x, int regno) case CC0: case CLOBBER: - case CLOBBER_HIGH: return 0; case PRE_INC: @@ -3199,7 +3169,6 @@ memref_referenced_p (rtx memref, rtx x, bool read_p) return memref_referenced_p (memref, SET_SRC (x), true); case CLOBBER: - case CLOBBER_HIGH: if (process_set_for_memref_referenced_p (memref, XEXP (x, 0))) return true; @@ -3391,6 +3360,37 @@ def_dominates_uses (int regno) return true; } +/* Scan the 
instructions before update_equiv_regs. Record which registers + are referenced as paradoxical subregs. Also check for cases in which + the current function needs to save a register that one of its call + instructions clobbers. + + These things are logically unrelated, but it's more efficient to do + them together. */ + +static void +update_equiv_regs_prescan (void) +{ + basic_block bb; + rtx_insn *insn; + function_abi_aggregator callee_abis; + + FOR_EACH_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, insn) + if (NONDEBUG_INSN_P (insn)) + { + set_paradoxical_subreg (insn); + if (CALL_P (insn)) + callee_abis.note_callee_abi (insn_callee_abi (insn)); + } + + HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi); + if (!hard_reg_set_empty_p (extra_caller_saves)) + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno) + if (TEST_HARD_REG_BIT (extra_caller_saves, regno)) + df_set_regs_ever_live (regno, true); +} + /* Find registers that are equivalent to a single value throughout the compilation (either because they can be referenced in memory or are set once from a single constant). Lower their priority for a @@ -3407,15 +3407,6 @@ update_equiv_regs (void) rtx_insn *insn; basic_block bb; - /* Scan insns and set pdx_subregs if the reg is used in a - paradoxical subreg. Don't set such reg equivalent to a mem, - because lra will not substitute such equiv memory in order to - prevent access beyond allocated memory for paradoxical memory subreg. */ - FOR_EACH_BB_FN (bb, cfun) - FOR_BB_INSNS (bb, insn) - if (NONDEBUG_INSN_P (insn)) - set_paradoxical_subreg (insn); - /* Scan the insns and find which registers have equivalences. Do this in a separate scan of the insns because (due to -fcse-follow-jumps) a register can be set below its use. */ @@ -3447,7 +3438,7 @@ update_equiv_regs (void) if (set == NULL_RTX || side_effects_p (SET_SRC (set))) { - note_stores (PATTERN (insn), no_equiv, NULL); + note_pattern_stores (PATTERN (insn), no_equiv, NULL); continue; } else if (GET_CODE (PATTERN (insn)) == PARALLEL) @@ -3458,7 +3449,7 @@ update_equiv_regs (void) { rtx part = XVECEXP (PATTERN (insn), 0, i); if (part != set) - note_stores (part, no_equiv, NULL); + note_pattern_stores (part, no_equiv, NULL); } } @@ -3516,7 +3507,7 @@ update_equiv_regs (void) { /* This might be setting a SUBREG of a pseudo, a pseudo that is also set somewhere else to a constant. */ - note_stores (set, no_equiv, NULL); + note_pattern_stores (set, no_equiv, NULL); continue; } @@ -3524,7 +3515,7 @@ update_equiv_regs (void) equivalent to a mem. 
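
Note (not part of any patch): this series also switches callers from note_stores (PATTERN (insn), ...) to the insn-based note_stores, keeping the old pattern-based behaviour available as note_pattern_stores (see the update_equiv_regs hunks above and the loop-*.c hunks elsewhere in this backport). A small sketch of the two entry points; the callback name is invented:

    /* The insn-based walker sees everything the insn stores, while the
       pattern-based walker is limited to one given pattern.  */
    static void
    walk_stores_sketch (rtx_insn *insn,
                        void (*mark_fn) (rtx, const_rtx, void *),
                        void *data)
    {
      note_stores (insn, mark_fn, data);
      note_pattern_stores (PATTERN (insn), mark_fn, data);
    }
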
*/ if (MEM_P (src) && reg_equiv[regno].pdx_subregs) { - note_stores (set, no_equiv, NULL); + note_pattern_stores (set, no_equiv, NULL); continue; } @@ -4458,7 +4449,6 @@ rtx_moveable_p (rtx *loc, enum op_type type) && rtx_moveable_p (&XEXP (x, 2), OP_IN)); case CLOBBER: - case CLOBBER_HIGH: return rtx_moveable_p (&SET_DEST (x), OP_OUT); case UNSPEC_VOLATILE: @@ -4911,9 +4901,7 @@ interesting_dest_for_shprep (rtx_insn *insn, basic_block call_dom) for (int i = 0; i < XVECLEN (pat, 0); i++) { rtx sub = XVECEXP (pat, 0, i); - if (GET_CODE (sub) == USE - || GET_CODE (sub) == CLOBBER - || GET_CODE (sub) == CLOBBER_HIGH) + if (GET_CODE (sub) == USE || GET_CODE (sub) == CLOBBER) continue; if (GET_CODE (sub) != SET || side_effects_p (sub)) @@ -5305,6 +5293,7 @@ ira (FILE *f) init_alias_analysis (); loop_optimizer_init (AVOID_CFG_MODIFICATIONS); reg_equiv = XCNEWVEC (struct equivalence, max_reg_num ()); + update_equiv_regs_prescan (); update_equiv_regs (); /* Don't move insns if live range shrinkage or register @@ -5616,7 +5605,9 @@ do_reload (void) poly_int64 size = get_frame_size () + STACK_CHECK_FIXED_FRAME_SIZE; for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (df_regs_ever_live_p (i) && !fixed_regs[i] && call_used_regs[i]) + if (df_regs_ever_live_p (i) + && !fixed_regs[i] + && !crtl->abi->clobbers_full_reg_p (i)) size += UNITS_PER_WORD; if (constant_lower_bound (size) > STACK_CHECK_MAX_FRAME_SIZE) diff --git a/gcc/jit/jit-playback.c b/gcc/jit/jit-playback.c index b74495c58..8b16e81d5 100644 --- a/gcc/jit/jit-playback.c +++ b/gcc/jit/jit-playback.c @@ -399,12 +399,11 @@ new_function (location *loc, if (builtin_id) { - DECL_FUNCTION_CODE (fndecl) = builtin_id; gcc_assert (loc == NULL); DECL_SOURCE_LOCATION (fndecl) = BUILTINS_LOCATION; - DECL_BUILT_IN_CLASS (fndecl) = - builtins_manager::get_class (builtin_id); + built_in_class fclass = builtins_manager::get_class (builtin_id); + set_decl_built_in_function (fndecl, fclass, builtin_id); set_builtin_decl (builtin_id, fndecl, builtins_manager::implicit_p (builtin_id)); diff --git a/gcc/jump.c b/gcc/jump.c index ce5cee523..17642a95b 100644 --- a/gcc/jump.c +++ b/gcc/jump.c @@ -1094,7 +1094,6 @@ mark_jump_label_1 (rtx x, rtx_insn *insn, bool in_mem, bool is_target) case CC0: case REG: case CLOBBER: - case CLOBBER_HIGH: case CALL: return; diff --git a/gcc/langhooks-def.h b/gcc/langhooks-def.h index a059841b3..842f6a502 100644 --- a/gcc/langhooks-def.h +++ b/gcc/langhooks-def.h @@ -122,6 +122,7 @@ extern int lhd_type_dwarf_attribute (const_tree, int); #define LANG_HOOKS_TYPES_COMPATIBLE_P lhd_types_compatible_p #define LANG_HOOKS_BUILTIN_FUNCTION lhd_builtin_function #define LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE LANG_HOOKS_BUILTIN_FUNCTION +#define LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL LANG_HOOKS_BUILTIN_FUNCTION #define LANG_HOOKS_EXPR_TO_DECL lhd_expr_to_decl #define LANG_HOOKS_TO_TARGET_CHARSET lhd_to_target_charset #define LANG_HOOKS_INIT_TS lhd_do_nothing @@ -170,6 +171,7 @@ extern tree lhd_make_node (enum tree_code); extern tree lhd_unit_size_without_reusable_padding (tree); #define LANG_HOOKS_MAKE_TYPE lhd_make_node +#define LANG_HOOKS_SIMULATE_ENUM_DECL NULL #define LANG_HOOKS_CLASSIFY_RECORD NULL #define LANG_HOOKS_TYPE_FOR_SIZE lhd_type_for_size #define LANG_HOOKS_INCOMPLETE_TYPE_ERROR lhd_incomplete_type_error @@ -203,6 +205,7 @@ extern tree lhd_unit_size_without_reusable_padding (tree); #define LANG_HOOKS_FOR_TYPES_INITIALIZER { \ LANG_HOOKS_MAKE_TYPE, \ + LANG_HOOKS_SIMULATE_ENUM_DECL, \ LANG_HOOKS_CLASSIFY_RECORD, \ 
LANG_HOOKS_TYPE_FOR_MODE, \ LANG_HOOKS_TYPE_FOR_SIZE, \ @@ -338,6 +341,7 @@ extern void lhd_end_section (void); LANG_HOOKS_GIMPLIFY_EXPR, \ LANG_HOOKS_BUILTIN_FUNCTION, \ LANG_HOOKS_BUILTIN_FUNCTION_EXT_SCOPE, \ + LANG_HOOKS_SIMULATE_BUILTIN_FUNCTION_DECL, \ LANG_HOOKS_INIT_TS, \ LANG_HOOKS_EXPR_TO_DECL, \ LANG_HOOKS_EH_PERSONALITY, \ diff --git a/gcc/langhooks.c b/gcc/langhooks.c index 2df97f2b6..fd8f43312 100644 --- a/gcc/langhooks.c +++ b/gcc/langhooks.c @@ -599,28 +599,21 @@ lhd_omp_mappable_type (tree type) return true; } -/* Common function for add_builtin_function and - add_builtin_function_ext_scope. */ +/* Common function for add_builtin_function, add_builtin_function_ext_scope + and simulate_builtin_function_decl. */ + static tree -add_builtin_function_common (const char *name, - tree type, - int function_code, - enum built_in_class cl, - const char *library_name, - tree attrs, - tree (*hook) (tree)) +build_builtin_function (location_t location, const char *name, tree type, + int function_code, enum built_in_class cl, + const char *library_name, tree attrs) { tree id = get_identifier (name); - tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, id, type); + tree decl = build_decl (location, FUNCTION_DECL, id, type); TREE_PUBLIC (decl) = 1; DECL_EXTERNAL (decl) = 1; - DECL_BUILT_IN_CLASS (decl) = cl; - - DECL_FUNCTION_CODE (decl) = (enum built_in_function) function_code; - /* DECL_FUNCTION_CODE is a bitfield; verify that the value fits. */ - gcc_assert (DECL_FUNCTION_CODE (decl) == function_code); + set_decl_built_in_function (decl, cl, function_code); if (library_name) { @@ -636,8 +629,7 @@ add_builtin_function_common (const char *name, else decl_attributes (&decl, NULL_TREE, 0); - return hook (decl); - + return decl; } /* Create a builtin function. */ @@ -650,9 +642,9 @@ add_builtin_function (const char *name, const char *library_name, tree attrs) { - return add_builtin_function_common (name, type, function_code, cl, - library_name, attrs, - lang_hooks.builtin_function); + tree decl = build_builtin_function (BUILTINS_LOCATION, name, type, + function_code, cl, library_name, attrs); + return lang_hooks.builtin_function (decl); } /* Like add_builtin_function, but make sure the scope is the external scope. @@ -670,9 +662,40 @@ add_builtin_function_ext_scope (const char *name, const char *library_name, tree attrs) { - return add_builtin_function_common (name, type, function_code, cl, - library_name, attrs, - lang_hooks.builtin_function_ext_scope); + tree decl = build_builtin_function (BUILTINS_LOCATION, name, type, + function_code, cl, library_name, attrs); + return lang_hooks.builtin_function_ext_scope (decl); +} + +/* Simulate a declaration of a target-specific built-in function at + location LOCATION, as though it had been declared directly in the + source language. NAME is the name of the function, TYPE is its function + type, FUNCTION_CODE is the target-specific function code, LIBRARY_NAME + is the name of the underlying library function (NULL if none) and + ATTRS is a list of function attributes. + + Return the decl of the declared function. 
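
Note (not part of any patch): the new simulate_builtin_function_decl entry point, whose body follows below, lets a backend declare a target builtin as though the user had written the declaration in the source, for example while processing a pragma. A hypothetical use, with the builtin name, type and function code all stand-ins:

    /* Declare "__builtin_example" of type FNTYPE with md code CODE at LOC,
       visible in the global scope without an explicit prototype.  */
    static void
    register_pragma_builtin_sketch (location_t loc, tree fntype,
                                    unsigned int code)
    {
      tree decl = simulate_builtin_function_decl (loc, "__builtin_example",
                                                  fntype, code,
                                                  NULL, NULL_TREE);
      gcc_assert (decl != NULL_TREE);
    }
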
*/ + +tree +simulate_builtin_function_decl (location_t location, const char *name, + tree type, int function_code, + const char *library_name, tree attrs) +{ + tree decl = build_builtin_function (location, name, type, + function_code, BUILT_IN_MD, + library_name, attrs); + tree new_decl = lang_hooks.simulate_builtin_function_decl (decl); + + /* Give the front end a chance to create a new decl if necessary, + but if the front end discards the decl in favour of a conflicting + (erroneous) previous definition, return the decl that we tried but + failed to add. This allows the caller to process the returned decl + normally, even though the source code won't be able to use it. */ + if (TREE_CODE (new_decl) == FUNCTION_DECL + && fndecl_built_in_p (new_decl, function_code, BUILT_IN_MD)) + return new_decl; + + return decl; } tree diff --git a/gcc/langhooks.h b/gcc/langhooks.h index a45579b33..b8cee93f5 100644 --- a/gcc/langhooks.h +++ b/gcc/langhooks.h @@ -64,6 +64,10 @@ struct lang_hooks_for_types language-specific processing is required. */ tree (*make_type) (enum tree_code); + /* Make an enum type with the given name and values, associating + them all with the given source location. */ + tree (*simulate_enum_decl) (location_t, const char *, vec); + /* Return what kind of RECORD_TYPE this is, mainly for purposes of debug information. If not defined, record types are assumed to be structures. */ @@ -494,6 +498,15 @@ struct lang_hooks backend must add all of the builtins at program initialization time. */ tree (*builtin_function_ext_scope) (tree decl); + /* Do language-specific processing for target-specific built-in + function DECL, so that it is defined in the global scope (only) + and is available without needing to be explicitly declared. + + This is intended for targets that want to inject declarations of + built-in functions into the source language (such as in response + to a pragma) rather than providing them in the source language itself. */ + tree (*simulate_builtin_function_decl) (tree decl); + /* Used to set up the tree_contains_structure array for a frontend. */ void (*init_ts) (void); @@ -562,6 +575,8 @@ extern tree add_builtin_function_ext_scope (const char *name, tree type, enum built_in_class cl, const char *library_name, tree attrs); +extern tree simulate_builtin_function_decl (location_t, const char *, tree, + int, const char *, tree); extern tree add_builtin_type (const char *name, tree type); /* Language helper functions. */ diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c index 89714be76..732687dba 100644 --- a/gcc/loop-doloop.c +++ b/gcc/loop-doloop.c @@ -731,7 +731,7 @@ doloop_optimize (struct loop *loop) bitmap modified = BITMAP_ALLOC (NULL); for (rtx_insn *i = doloop_seq; i != NULL; i = NEXT_INSN (i)) - note_stores (PATTERN (i), record_reg_sets, modified); + note_stores (i, record_reg_sets, modified); basic_block loop_end = desc->out_edge->src; bool fail = bitmap_intersect_p (df_get_live_out (loop_end), modified); diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c index b880ead3d..1af88876c 100644 --- a/gcc/loop-invariant.c +++ b/gcc/loop-invariant.c @@ -2170,7 +2170,7 @@ calculate_loop_reg_pressure (void) mark_ref_regs (PATTERN (insn)); n_regs_set = 0; - note_stores (PATTERN (insn), mark_reg_clobber, NULL); + note_stores (insn, mark_reg_clobber, NULL); /* Mark any registers dead after INSN as dead now. */ @@ -2183,7 +2183,7 @@ calculate_loop_reg_pressure (void) Clobbers are processed again, so they conflict with the registers that are set. 
*/ - note_stores (PATTERN (insn), mark_reg_store, NULL); + note_stores (insn, mark_reg_store, NULL); if (AUTO_INC_DEC) for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) diff --git a/gcc/loop-iv.c b/gcc/loop-iv.c index 340045ce8..1dc3bc74d 100644 --- a/gcc/loop-iv.c +++ b/gcc/loop-iv.c @@ -1967,16 +1967,10 @@ simplify_using_initial_values (struct loop *loop, enum rtx_code op, rtx *expr) continue; CLEAR_REG_SET (this_altered); - note_stores (PATTERN (insn), mark_altered, this_altered); + note_stores (insn, mark_altered, this_altered); if (CALL_P (insn)) - { - /* Kill all call clobbered registers. */ - unsigned int i; - hard_reg_set_iterator hrsi; - EXECUTE_IF_SET_IN_HARD_REG_SET (regs_invalidated_by_call, - 0, i, hrsi) - SET_REGNO_REG_SET (this_altered, i); - } + /* Kill all call clobbered registers. */ + IOR_REG_SET_HRS (this_altered, regs_invalidated_by_call); if (suitable_set_for_replacement (insn, &dest, &src)) { diff --git a/gcc/lra-assigns.c b/gcc/lra-assigns.c index 5c5c73293..a35fc41ac 100644 --- a/gcc/lra-assigns.c +++ b/gcc/lra-assigns.c @@ -94,6 +94,7 @@ along with GCC; see the file COPYING3. If not see #include "params.h" #include "lra.h" #include "lra-int.h" +#include "function-abi.h" /* Current iteration number of the pass and current iteration number of the pass after the latest spill pass when any former reload @@ -493,18 +494,15 @@ find_hard_regno_for_1 (int regno, int *cost, int try_only_hard_regno, HARD_REG_SET impossible_start_hard_regs, available_regs; if (hard_reg_set_empty_p (regno_set)) - COPY_HARD_REG_SET (conflict_set, lra_no_alloc_regs); + conflict_set = lra_no_alloc_regs; else - { - COMPL_HARD_REG_SET (conflict_set, regno_set); - IOR_HARD_REG_SET (conflict_set, lra_no_alloc_regs); - } + conflict_set = ~regno_set | lra_no_alloc_regs; rclass = regno_allocno_class_array[regno]; rclass_intersect_p = ira_reg_classes_intersect_p[rclass]; curr_hard_regno_costs_check++; sparseset_clear (conflict_reload_and_inheritance_pseudos); sparseset_clear (live_range_hard_reg_pseudos); - IOR_HARD_REG_SET (conflict_set, lra_reg_info[regno].conflict_hard_regs); + conflict_set |= lra_reg_info[regno].conflict_hard_regs; biggest_mode = lra_reg_info[regno].biggest_mode; for (r = lra_reg_info[regno].live_ranges; r != NULL; r = r->next) { @@ -614,7 +612,7 @@ find_hard_regno_for_1 (int regno, int *cost, int try_only_hard_regno, } /* Make sure that all registers in a multi-word pseudo belong to the required class. */ - IOR_COMPL_HARD_REG_SET (conflict_set, reg_class_contents[rclass]); + conflict_set |= ~reg_class_contents[rclass]; lra_assert (rclass != NO_REGS); rclass_size = ira_class_hard_regs_num[rclass]; best_hard_regno = -1; @@ -622,8 +620,7 @@ find_hard_regno_for_1 (int regno, int *cost, int try_only_hard_regno, biggest_nregs = hard_regno_nregs (hard_regno, biggest_mode); nregs_diff = (biggest_nregs - hard_regno_nregs (hard_regno, PSEUDO_REGNO_MODE (regno))); - COPY_HARD_REG_SET (available_regs, reg_class_contents[rclass]); - AND_COMPL_HARD_REG_SET (available_regs, lra_no_alloc_regs); + available_regs = reg_class_contents[rclass] & ~lra_no_alloc_regs; for (i = 0; i < rclass_size; i++) { if (try_only_hard_regno >= 0) @@ -658,7 +655,7 @@ find_hard_regno_for_1 (int regno, int *cost, int try_only_hard_regno, for (j = 0; j < hard_regno_nregs (hard_regno, PSEUDO_REGNO_MODE (regno)); j++) - if (! TEST_HARD_REG_BIT (call_used_reg_set, hard_regno + j) + if (! crtl->abi->clobbers_full_reg_p (hard_regno + j) && ! df_regs_ever_live_p (hard_regno + j)) /* It needs save restore. 
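
Note (not part of any patch): the lra-assigns.c hunk above is one of several places where a test against the global call_used_reg_set becomes a query on an ABI object: crtl->abi describes the function being compiled, while insn_callee_abi (call_insn) describes one particular call. A rough sketch of the two queries, with the wrapper invented for illustration:

    /* Is REGNO fully clobbered by the current function's own ABI, or is a
       REGNO register of MODE clobbered by CALL_INSN's callee?  */
    static bool
    abi_clobber_queries_sketch (unsigned int regno, machine_mode mode,
                                rtx_insn *call_insn)
    {
      if (crtl->abi->clobbers_full_reg_p (regno))
        return true;
      return insn_callee_abi (call_insn).clobbers_reg_p (mode, regno);
    }
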
*/ hard_regno_costs[hard_regno] @@ -1219,8 +1216,8 @@ setup_live_pseudos_and_spill_after_risky_transforms (bitmap sparseset_set_bit (live_range_hard_reg_pseudos, r2->regno); } } - COPY_HARD_REG_SET (conflict_set, lra_no_alloc_regs); - IOR_HARD_REG_SET (conflict_set, lra_reg_info[regno].conflict_hard_regs); + conflict_set = lra_no_alloc_regs; + conflict_set |= lra_reg_info[regno].conflict_hard_regs; val = lra_reg_info[regno].val; offset = lra_reg_info[regno].offset; EXECUTE_IF_SET_IN_SPARSESET (live_range_hard_reg_pseudos, conflict_regno) @@ -1640,14 +1637,14 @@ lra_assign (bool &fails_p) bitmap_initialize (&all_spilled_pseudos, ®_obstack); create_live_range_start_chains (); setup_live_pseudos_and_spill_after_risky_transforms (&all_spilled_pseudos); - if (! lra_asm_error_p && flag_checking && !flag_ipa_ra) + if (! lra_asm_error_p && flag_checking) /* Check correctness of allocation for call-crossed pseudos but only when there are no asm errors as in the case of errors the asm is removed and it can result in incorrect allocation. */ for (i = FIRST_PSEUDO_REGISTER; i < max_regno; i++) - if (lra_reg_info[i].nrefs != 0 && reg_renumber[i] >= 0 - && lra_reg_info[i].call_insn - && overlaps_hard_reg_set_p (call_used_reg_set, + if (lra_reg_info[i].nrefs != 0 + && reg_renumber[i] >= 0 + && overlaps_hard_reg_set_p (lra_reg_info[i].conflict_hard_regs, PSEUDO_REGNO_MODE (i), reg_renumber[i])) gcc_unreachable (); /* Setup insns to process on the next constraint pass. */ diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c index f0a2f0491..b34aec227 100644 --- a/gcc/lra-constraints.c +++ b/gcc/lra-constraints.c @@ -131,6 +131,7 @@ #include "lra.h" #include "lra-int.h" #include "print-rtl.h" +#include "function-abi.h" /* Value of LRA_CURR_RELOAD_NUM at the beginning of BB of the current insn. Remember that LRA_CURR_RELOAD_NUM is the number of emitted @@ -394,11 +395,24 @@ address_eliminator::~address_eliminator () *m_index_loc = m_index_reg; } -/* Return true if the eliminated form of AD is a legitimate target address. */ +/* Return true if the eliminated form of AD is a legitimate target address. + If OP is a MEM, AD is the address within OP, otherwise OP should be + ignored. CONSTRAINT is one constraint that the operand may need + to meet. */ static bool -valid_address_p (struct address_info *ad) +valid_address_p (rtx op, struct address_info *ad, + enum constraint_num constraint) { address_eliminator eliminator (ad); + + /* Allow a memory OP if it matches CONSTRAINT, even if CONSTRAINT is more + forgiving than "m". */ + if (MEM_P (op) + && (insn_extra_memory_constraint (constraint) + || insn_extra_special_memory_constraint (constraint)) + && constraint_satisfied_p (op, constraint)) + return true; + return valid_address_p (ad->mode, *ad->outer, ad->as); } @@ -1888,8 +1902,7 @@ prohibited_class_reg_set_mode_p (enum reg_class rclass, HARD_REG_SET temp; lra_assert (hard_reg_set_subset_p (reg_class_contents[rclass], set)); - COPY_HARD_REG_SET (temp, set); - AND_COMPL_HARD_REG_SET (temp, lra_no_alloc_regs); + temp = set & ~lra_no_alloc_regs; return (hard_reg_set_subset_p (temp, ira_prohibited_class_mode_regs[rclass][mode])); } @@ -1900,11 +1913,12 @@ prohibited_class_reg_set_mode_p (enum reg_class rclass, alternative. */ static unsigned int curr_small_class_check = 0; -/* Update number of used inputs of class OP_CLASS for operand NOP. - Return true if we have more such class operands than the number of - available regs. 
*/ +/* Update number of used inputs of class OP_CLASS for operand NOP + of alternative NALT. Return true if we have more such class operands + than the number of available regs. */ static bool -update_and_check_small_class_inputs (int nop, enum reg_class op_class) +update_and_check_small_class_inputs (int nop, int nalt, + enum reg_class op_class) { static unsigned int small_class_check[LIM_REG_CLASSES]; static int small_class_input_nums[LIM_REG_CLASSES]; @@ -1915,7 +1929,7 @@ update_and_check_small_class_inputs (int nop, enum reg_class op_class) && hard_reg_set_intersect_p (reg_class_contents[op_class], ira_no_alloc_regs) && (curr_static_id->operand[nop].type != OP_OUT - || curr_static_id->operand[nop].early_clobber)) + || TEST_BIT (curr_static_id->operand[nop].early_clobber_alts, nalt))) { if (small_class_check[op_class] == curr_small_class_check) small_class_input_nums[op_class]++; @@ -2184,7 +2198,8 @@ process_alt_operands (int only_alternative) /* We should reject matching of an early clobber operand if the matching operand is not dying in the insn. */ - if (! curr_static_id->operand[m].early_clobber + if (!TEST_BIT (curr_static_id->operand[m] + .early_clobber_alts, nalt) || operand_reg[nop] == NULL_RTX || (find_regno_note (curr_insn, REG_DEAD, REGNO (op)) @@ -2251,7 +2266,8 @@ process_alt_operands (int only_alternative) it results in less hard regs required for the insn than a non-matching earlyclobber alternative. */ - if (curr_static_id->operand[m].early_clobber) + if (TEST_BIT (curr_static_id->operand[m] + .early_clobber_alts, nalt)) { if (lra_dump_file != NULL) fprintf @@ -2302,7 +2318,7 @@ process_alt_operands (int only_alternative) reloads. */ badop = false; this_alternative = curr_alt[m]; - COPY_HARD_REG_SET (this_alternative_set, curr_alt_set[m]); + this_alternative_set = curr_alt_set[m]; winreg = this_alternative != NO_REGS; break; } @@ -2387,14 +2403,12 @@ process_alt_operands (int only_alternative) if (mode == BLKmode) break; this_alternative = reg_class_subunion[this_alternative][cl]; - IOR_HARD_REG_SET (this_alternative_set, - reg_class_contents[cl]); + this_alternative_set |= reg_class_contents[cl]; if (costly_p) { this_costly_alternative = reg_class_subunion[this_costly_alternative][cl]; - IOR_HARD_REG_SET (this_costly_alternative_set, - reg_class_contents[cl]); + this_costly_alternative_set |= reg_class_contents[cl]; } winreg = true; if (REG_P (op)) @@ -2529,14 +2543,11 @@ process_alt_operands (int only_alternative) if (this_alternative != NO_REGS) { - HARD_REG_SET available_regs; - - COPY_HARD_REG_SET (available_regs, - reg_class_contents[this_alternative]); - AND_COMPL_HARD_REG_SET - (available_regs, - ira_prohibited_class_mode_regs[this_alternative][mode]); - AND_COMPL_HARD_REG_SET (available_regs, lra_no_alloc_regs); + HARD_REG_SET available_regs + = (reg_class_contents[this_alternative] + & ~((ira_prohibited_class_mode_regs + [this_alternative][mode]) + | lra_no_alloc_regs)); if (hard_reg_set_empty_p (available_regs)) { /* There are no hard regs holding a value of given @@ -2892,7 +2903,8 @@ process_alt_operands (int only_alternative) goto fail; } - if (update_and_check_small_class_inputs (nop, this_alternative)) + if (update_and_check_small_class_inputs (nop, nalt, + this_alternative)) { if (lra_dump_file != NULL) fprintf (lra_dump_file, @@ -2901,7 +2913,7 @@ process_alt_operands (int only_alternative) goto fail; } curr_alt[nop] = this_alternative; - COPY_HARD_REG_SET (curr_alt_set[nop], this_alternative_set); + curr_alt_set[nop] = this_alternative_set; 
curr_alt_win[nop] = this_alternative_win; curr_alt_match_win[nop] = this_alternative_match_win; curr_alt_offmemok[nop] = this_alternative_offmemok; @@ -3416,7 +3428,7 @@ process_address_1 (int nop, bool check_only_p, All these cases involve a non-autoinc address, so there is no point revalidating other types. */ - if (ad.autoinc_p || valid_address_p (&ad)) + if (ad.autoinc_p || valid_address_p (op, &ad, cn)) return change_p; /* Any index existed before LRA started, so we can assume that the @@ -3445,7 +3457,7 @@ process_address_1 (int nop, bool check_only_p, if (code >= 0) { *ad.inner = gen_rtx_LO_SUM (Pmode, new_reg, addr); - if (! valid_address_p (ad.mode, *ad.outer, ad.as)) + if (!valid_address_p (op, &ad, cn)) { /* Try to put lo_sum into register. */ insn = emit_insn (gen_rtx_SET @@ -3455,7 +3467,7 @@ process_address_1 (int nop, bool check_only_p, if (code >= 0) { *ad.inner = new_reg; - if (! valid_address_p (ad.mode, *ad.outer, ad.as)) + if (!valid_address_p (op, &ad, cn)) { *ad.inner = addr; code = -1; @@ -3550,7 +3562,7 @@ process_address_1 (int nop, bool check_only_p, && CONSTANT_P (XEXP (SET_SRC (set), 1))) { *ad.inner = SET_SRC (set); - if (valid_address_p (ad.mode, *ad.outer, ad.as)) + if (valid_address_p (op, &ad, cn)) { *ad.base_term = XEXP (SET_SRC (set), 0); *ad.disp_term = XEXP (SET_SRC (set), 1); @@ -4573,7 +4585,7 @@ contains_reg_p (rtx x, bool hard_reg_p, bool spilled_p) regno = lra_get_regno_hard_regno (regno); if (regno < 0) return false; - COMPL_HARD_REG_SET (alloc_regs, lra_no_alloc_regs); + alloc_regs = ~lra_no_alloc_regs; return overlaps_hard_reg_set_p (alloc_regs, GET_MODE (x), regno); } else @@ -5165,6 +5177,14 @@ static int reloads_num; /* Number of calls passed so far in current EBB. */ static int calls_num; +/* Index ID is the CALLS_NUM associated the last call we saw with + ABI identifier ID. */ +static int last_call_for_abi[NUM_ABI_IDS]; + +/* Which registers have been fully or partially clobbered by a call + since they were last used. */ +static HARD_REG_SET full_and_partial_call_clobbers; + /* Current reload pseudo check for validity of elements in USAGE_INSNS. */ static int curr_usage_insns_check; @@ -5208,6 +5228,10 @@ setup_next_usage_insn (int regno, rtx insn, int reloads_num, bool after_p) usage_insns[regno].reloads_num = reloads_num; usage_insns[regno].calls_num = calls_num; usage_insns[regno].after_p = after_p; + if (regno >= FIRST_PSEUDO_REGISTER && reg_renumber[regno] >= 0) + remove_from_hard_reg_set (&full_and_partial_call_clobbers, + PSEUDO_REGNO_MODE (regno), + reg_renumber[regno]); } /* The function is used to form list REGNO usages which consists of @@ -5453,16 +5477,19 @@ static inline bool need_for_call_save_p (int regno) { lra_assert (regno >= FIRST_PSEUDO_REGISTER && reg_renumber[regno] >= 0); - return (usage_insns[regno].calls_num < calls_num - && (overlaps_hard_reg_set_p - ((flag_ipa_ra && - ! hard_reg_set_empty_p (lra_reg_info[regno].actual_call_used_reg_set)) - ? 
lra_reg_info[regno].actual_call_used_reg_set - : call_used_reg_set, - PSEUDO_REGNO_MODE (regno), reg_renumber[regno]) - || (targetm.hard_regno_call_part_clobbered - (lra_reg_info[regno].call_insn, - reg_renumber[regno], PSEUDO_REGNO_MODE (regno))))); + if (usage_insns[regno].calls_num < calls_num) + { + unsigned int abis = 0; + for (unsigned int i = 0; i < NUM_ABI_IDS; ++i) + if (last_call_for_abi[i] > usage_insns[regno].calls_num) + abis |= 1 << i; + gcc_assert (abis); + if (call_clobbered_in_region_p (abis, full_and_partial_call_clobbers, + PSEUDO_REGNO_MODE (regno), + reg_renumber[regno])) + return true; + } + return false; } /* Global registers occurring in the current EBB. */ @@ -5502,8 +5529,7 @@ need_for_split_p (HARD_REG_SET potential_reload_hard_regs, int regno) true) the assign pass assumes that all pseudos living through calls are assigned to call saved hard regs. */ && (regno >= FIRST_PSEUDO_REGISTER - || ! TEST_HARD_REG_BIT (call_used_reg_set, regno) - || usage_insns[regno].calls_num == calls_num) + || !TEST_HARD_REG_BIT (full_and_partial_call_clobbers, regno)) /* We need at least 2 reloads to make pseudo splitting profitable. We should provide hard regno splitting in any case to solve 1st insn scheduling problem when @@ -6255,12 +6281,14 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) curr_usage_insns_check++; clear_invariants (); reloads_num = calls_num = 0; + for (unsigned int i = 0; i < NUM_ABI_IDS; ++i) + last_call_for_abi[i] = 0; + CLEAR_HARD_REG_SET (full_and_partial_call_clobbers); bitmap_clear (&check_only_regs); bitmap_clear (&invalid_invariant_regs); last_processed_bb = NULL; CLEAR_HARD_REG_SET (potential_reload_hard_regs); - COPY_HARD_REG_SET (live_hard_regs, eliminable_regset); - IOR_HARD_REG_SET (live_hard_regs, lra_no_alloc_regs); + live_hard_regs = eliminable_regset | lra_no_alloc_regs; /* We don't process new insns generated in the loop. */ for (curr_insn = tail; curr_insn != PREV_INSN (head); curr_insn = prev_insn) { @@ -6330,8 +6358,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) else setup_next_usage_insn (src_regno, curr_insn, reloads_num, false); if (hard_reg_set_subset_p (reg_class_contents[cl], live_hard_regs)) - IOR_HARD_REG_SET (potential_reload_hard_regs, - reg_class_contents[cl]); + potential_reload_hard_regs |= reg_class_contents[cl]; } else if (src_regno < 0 && dst_regno >= lra_constraint_new_regno_start @@ -6348,8 +6375,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) if (process_invariant_for_inheritance (SET_DEST (curr_set), SET_SRC (curr_set))) change_p = true; if (hard_reg_set_subset_p (reg_class_contents[cl], live_hard_regs)) - IOR_HARD_REG_SET (potential_reload_hard_regs, - reg_class_contents[cl]); + potential_reload_hard_regs |= reg_class_contents[cl]; } else if (src_regno >= lra_constraint_new_regno_start && dst_regno < lra_constraint_new_regno_start @@ -6371,8 +6397,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) /* Invalidate. 
*/ usage_insns[dst_regno].check = 0; if (hard_reg_set_subset_p (reg_class_contents[cl], live_hard_regs)) - IOR_HARD_REG_SET (potential_reload_hard_regs, - reg_class_contents[cl]); + potential_reload_hard_regs |= reg_class_contents[cl]; } else if (INSN_P (curr_insn)) { @@ -6427,8 +6452,8 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) else add_to_hard_reg_set (&s, PSEUDO_REGNO_MODE (dst_regno), reg_renumber[dst_regno]); - AND_COMPL_HARD_REG_SET (live_hard_regs, s); - AND_COMPL_HARD_REG_SET (potential_reload_hard_regs, s); + live_hard_regs &= ~s; + potential_reload_hard_regs &= ~s; } /* We should invalidate potential inheritance or splitting for the current insn usages to the next @@ -6472,6 +6497,10 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) int regno, hard_regno; calls_num++; + function_abi callee_abi = insn_callee_abi (curr_insn); + last_call_for_abi[callee_abi.id ()] = calls_num; + full_and_partial_call_clobbers + |= callee_abi.full_and_partial_reg_clobbers (); if ((cheap = find_reg_note (curr_insn, REG_RETURNED, NULL_RTX)) != NULL_RTX && ((cheap = XEXP (cheap, 0)), true) @@ -6481,7 +6510,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) /* If there are pending saves/restores, the optimization is not worth. */ && usage_insns[regno].calls_num == calls_num - 1 - && TEST_HARD_REG_BIT (call_used_reg_set, hard_regno)) + && callee_abi.clobbers_reg_p (GET_MODE (cheap), hard_regno)) { /* Restore the pseudo from the call result as REG_RETURNED note says that the pseudo value is @@ -6504,6 +6533,9 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) /* We don't need to save/restore of the pseudo from this call. */ usage_insns[regno].calls_num = calls_num; + remove_from_hard_reg_set + (&full_and_partial_call_clobbers, + GET_MODE (cheap), hard_regno); bitmap_set_bit (&check_only_regs, regno); } } @@ -6607,8 +6639,7 @@ inherit_in_ebb (rtx_insn *head, rtx_insn *tail) if (ira_class_hard_regs_num[cl] <= max_small_class_regs_num) reloads_num++; if (hard_reg_set_subset_p (reg_class_contents[cl], live_hard_regs)) - IOR_HARD_REG_SET (potential_reload_hard_regs, - reg_class_contents[cl]); + potential_reload_hard_regs |= reg_class_contents[cl]; } } if (NONDEBUG_INSN_P (curr_insn)) diff --git a/gcc/lra-eliminations.c b/gcc/lra-eliminations.c index 7a345a52a..9568c13cb 100644 --- a/gcc/lra-eliminations.c +++ b/gcc/lra-eliminations.c @@ -654,7 +654,6 @@ lra_eliminate_regs_1 (rtx_insn *insn, rtx x, machine_mode mem_mode, return x; case CLOBBER: - case CLOBBER_HIGH: case SET: gcc_unreachable (); @@ -807,16 +806,6 @@ mark_not_eliminable (rtx x, machine_mode mem_mode) setup_can_eliminate (ep, false); return; - case CLOBBER_HIGH: - gcc_assert (REG_P (XEXP (x, 0))); - gcc_assert (REGNO (XEXP (x, 0)) < FIRST_PSEUDO_REGISTER); - for (ep = reg_eliminate; - ep < ®_eliminate[NUM_ELIMINABLE_REGS]; - ep++) - if (reg_is_clobbered_by_clobber_high (ep->to_rtx, XEXP (x, 0))) - setup_can_eliminate (ep, false); - return; - case SET: if (SET_DEST (x) == stack_pointer_rtx && GET_CODE (SET_SRC (x)) == PLUS @@ -1180,7 +1169,7 @@ spill_pseudos (HARD_REG_SET set) reg_renumber[i] = -1; bitmap_ior_into (&to_process, &lra_reg_info[i].insn_bitmap); } - IOR_HARD_REG_SET (lra_no_alloc_regs, set); + lra_no_alloc_regs |= set; for (insn = get_insns (); insn != NULL_RTX; insn = NEXT_INSN (insn)) if (bitmap_bit_p (&to_process, INSN_UID (insn))) { @@ -1293,8 +1282,8 @@ update_reg_eliminate (bitmap insns_with_changed_offsets) result = true; } } - IOR_HARD_REG_SET (lra_no_alloc_regs, temp_hard_reg_set); - AND_COMPL_HARD_REG_SET 
(eliminable_regset, temp_hard_reg_set); + lra_no_alloc_regs |= temp_hard_reg_set; + eliminable_regset &= ~temp_hard_reg_set; spill_pseudos (temp_hard_reg_set); return result; } diff --git a/gcc/lra-int.h b/gcc/lra-int.h index 253ae1e6c..5671e2e65 100644 --- a/gcc/lra-int.h +++ b/gcc/lra-int.h @@ -72,10 +72,6 @@ struct lra_reg /* The following fields are defined only for pseudos. */ /* Hard registers with which the pseudo conflicts. */ HARD_REG_SET conflict_hard_regs; - /* Call used registers with which the pseudo conflicts, taking into account - the registers used by functions called from calls which cross the - pseudo. */ - HARD_REG_SET actual_call_used_reg_set; /* We assign hard registers to reload pseudos which can occur in few places. So two hard register preferences are enough for them. The following fields define the preferred hard registers. If @@ -103,8 +99,6 @@ struct lra_reg int val; /* Offset from relative eliminate register to pesudo reg. */ poly_int64 offset; - /* Call instruction, if any, that may affect this psuedo reg. */ - rtx_insn *call_insn; /* These members are set up in lra-lives.c and updated in lra-coalesce.c. */ /* The biggest size mode in which each pseudo reg is referred in @@ -141,10 +135,6 @@ struct lra_operand_data unsigned int strict_low : 1; /* True if the operand is an operator. */ unsigned int is_operator : 1; - /* True if there is an early clobber alternative for this operand. - This field is set up every time when corresponding - operand_alternative in lra_static_insn_data is set up. */ - unsigned int early_clobber : 1; /* True if the operand is an address. */ unsigned int is_address : 1; }; @@ -163,11 +153,6 @@ struct lra_insn_reg /* True if the reg is accessed through a subreg and the subreg is just a part of the register. */ unsigned int subreg_p : 1; - /* True if there is an early clobber alternative for this - operand. */ - unsigned int early_clobber : 1; - /* True if the reg is clobber highed by the operand. */ - unsigned int clobber_high : 1; /* The corresponding regno of the register. */ int regno; /* Next reg info of the same insn. */ diff --git a/gcc/lra-lives.c b/gcc/lra-lives.c index 55b2adc2a..bce123d73 100644 --- a/gcc/lra-lives.c +++ b/gcc/lra-lives.c @@ -43,6 +43,7 @@ along with GCC; see the file COPYING3. If not see #include "sparseset.h" #include "lra-int.h" #include "target.h" +#include "function-abi.h" /* Program points are enumerated by numbers from range 0..LRA_LIVE_MAX_POINT-1. There are approximately two times more @@ -327,7 +328,7 @@ static void mark_pseudo_dead (int regno) { lra_assert (!HARD_REGISTER_NUM_P (regno)); - IOR_HARD_REG_SET (lra_reg_info[regno].conflict_hard_regs, hard_regs_live); + lra_reg_info[regno].conflict_hard_regs |= hard_regs_live; if (!sparseset_bit_p (pseudos_live, regno)) return; @@ -574,41 +575,21 @@ lra_setup_reload_pseudo_preferenced_hard_reg (int regno, } } -/* Check that REGNO living through calls and setjumps, set up conflict - regs using LAST_CALL_USED_REG_SET, and clear corresponding bits in - PSEUDOS_LIVE_THROUGH_CALLS and PSEUDOS_LIVE_THROUGH_SETJUMPS. - CALL_INSN is a call that is representative of all calls in the region - described by the PSEUDOS_LIVE_THROUGH_* sets, in terms of the registers - that it preserves and clobbers. */ +/* Check whether REGNO lives through calls and setjmps and clear + the corresponding bits in PSEUDOS_LIVE_THROUGH_CALLS and + PSEUDOS_LIVE_THROUGH_SETJUMPS. All calls in the region described + by PSEUDOS_LIVE_THROUGH_CALLS have the given ABI. 
*/ static inline void -check_pseudos_live_through_calls (int regno, - HARD_REG_SET last_call_used_reg_set, - rtx_insn *call_insn) +check_pseudos_live_through_calls (int regno, const function_abi &abi) { - int hr; - rtx_insn *old_call_insn; - if (! sparseset_bit_p (pseudos_live_through_calls, regno)) return; - gcc_assert (call_insn && CALL_P (call_insn)); - old_call_insn = lra_reg_info[regno].call_insn; - if (!old_call_insn - || (targetm.return_call_with_max_clobbers - && targetm.return_call_with_max_clobbers (old_call_insn, call_insn) - == call_insn)) - lra_reg_info[regno].call_insn = call_insn; + machine_mode mode = PSEUDO_REGNO_MODE (regno); sparseset_clear_bit (pseudos_live_through_calls, regno); - IOR_HARD_REG_SET (lra_reg_info[regno].conflict_hard_regs, - last_call_used_reg_set); - - for (hr = 0; HARD_REGISTER_NUM_P (hr); hr++) - if (targetm.hard_regno_call_part_clobbered (call_insn, hr, - PSEUDO_REGNO_MODE (regno))) - add_to_hard_reg_set (&lra_reg_info[regno].conflict_hard_regs, - PSEUDO_REGNO_MODE (regno), hr); + lra_reg_info[regno].conflict_hard_regs |= abi.mode_clobbers (mode); if (! sparseset_bit_p (pseudos_live_through_setjumps, regno)) return; sparseset_clear_bit (pseudos_live_through_setjumps, regno); @@ -623,23 +604,10 @@ check_pseudos_live_through_calls (int regno, static inline bool reg_early_clobber_p (const struct lra_insn_reg *reg, int n_alt) { - return (reg->early_clobber - && (n_alt == LRA_UNKNOWN_ALT - || (n_alt != LRA_NON_CLOBBERED_ALT - && TEST_BIT (reg->early_clobber_alts, n_alt)))); -} - -/* Return true if call instructions CALL1 and CALL2 use ABIs that - preserve the same set of registers. */ - -static bool -calls_have_same_clobbers_p (rtx_insn *call1, rtx_insn *call2) -{ - if (!targetm.return_call_with_max_clobbers) - return false; - - return (targetm.return_call_with_max_clobbers (call1, call2) == call1 - && targetm.return_call_with_max_clobbers (call2, call1) == call2); + return (n_alt == LRA_UNKNOWN_ALT + ? reg->early_clobber_alts != 0 + : (n_alt != LRA_NON_CLOBBERED_ALT + && TEST_BIT (reg->early_clobber_alts, n_alt))); } /* Process insns of the basic block BB to update pseudo live ranges, @@ -661,17 +629,15 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) rtx_insn *next; rtx link, *link_loc; bool need_curr_point_incr; - HARD_REG_SET last_call_used_reg_set; - rtx_insn *call_insn = NULL; - rtx_insn *last_call_insn = NULL; + /* Only has a meaningful value once we've seen a call. 
*/ + function_abi last_call_abi = default_function_abi; reg_live_out = df_get_live_out (bb); sparseset_clear (pseudos_live); sparseset_clear (pseudos_live_through_calls); sparseset_clear (pseudos_live_through_setjumps); - CLEAR_HARD_REG_SET (last_call_used_reg_set); REG_SET_TO_HARD_REG_SET (hard_regs_live, reg_live_out); - AND_COMPL_HARD_REG_SET (hard_regs_live, eliminable_regset); + hard_regs_live &= ~eliminable_regset; EXECUTE_IF_SET_IN_BITMAP (reg_live_out, FIRST_PSEUDO_REGISTER, j, bi) { update_pseudo_point (j, curr_point, USE_POINT); @@ -701,7 +667,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) bool call_p; int n_alt, dst_regno, src_regno; rtx set; - struct lra_insn_reg *reg, *hr; + struct lra_insn_reg *reg; if (!NONDEBUG_INSN_P (curr_insn)) continue; @@ -733,7 +699,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) break; } for (reg = curr_static_id->hard_regs; reg != NULL; reg = reg->next) - if (reg->type != OP_IN && !reg->clobber_high) + if (reg->type != OP_IN) { remove_p = false; break; @@ -870,24 +836,13 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) unused values because they still conflict with quantities that are live at the time of the definition. */ for (reg = curr_id->regs; reg != NULL; reg = reg->next) - { - if (reg->type != OP_IN) - { - update_pseudo_point (reg->regno, curr_point, USE_POINT); - mark_regno_live (reg->regno, reg->biggest_mode); - check_pseudos_live_through_calls (reg->regno, - last_call_used_reg_set, - call_insn); - } - - if (!HARD_REGISTER_NUM_P (reg->regno)) - for (hr = curr_static_id->hard_regs; hr != NULL; hr = hr->next) - if (hr->clobber_high - && maybe_gt (GET_MODE_SIZE (PSEUDO_REGNO_MODE (reg->regno)), - GET_MODE_SIZE (hr->biggest_mode))) - SET_HARD_REG_BIT (lra_reg_info[reg->regno].conflict_hard_regs, - hr->regno); - } + if (reg->type != OP_IN) + { + update_pseudo_point (reg->regno, curr_point, USE_POINT); + mark_regno_live (reg->regno, reg->biggest_mode); + /* ??? Should be a no-op for unused registers. */ + check_pseudos_live_through_calls (reg->regno, last_call_abi); + } for (reg = curr_static_id->hard_regs; reg != NULL; reg = reg->next) if (reg->type != OP_IN) @@ -926,35 +881,13 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) if (call_p) { - call_insn = curr_insn; - if (! flag_ipa_ra && ! targetm.return_call_with_max_clobbers) - COPY_HARD_REG_SET(last_call_used_reg_set, call_used_reg_set); - else - { - HARD_REG_SET this_call_used_reg_set; - get_call_reg_set_usage (curr_insn, &this_call_used_reg_set, - call_used_reg_set); - - bool flush = (! hard_reg_set_empty_p (last_call_used_reg_set) - && ( ! hard_reg_set_equal_p (last_call_used_reg_set, - this_call_used_reg_set))) - || (last_call_insn && ! 
calls_have_same_clobbers_p - (call_insn, - last_call_insn)); - - EXECUTE_IF_SET_IN_SPARSESET (pseudos_live, j) - { - IOR_HARD_REG_SET (lra_reg_info[j].actual_call_used_reg_set, - this_call_used_reg_set); + function_abi call_abi = insn_callee_abi (curr_insn); - if (flush) - check_pseudos_live_through_calls (j, - last_call_used_reg_set, - last_call_insn); - } - COPY_HARD_REG_SET(last_call_used_reg_set, this_call_used_reg_set); - last_call_insn = call_insn; - } + if (last_call_abi != call_abi) + EXECUTE_IF_SET_IN_SPARSESET (pseudos_live, j) + check_pseudos_live_through_calls (j, last_call_abi); + + last_call_abi = call_abi; sparseset_ior (pseudos_live_through_calls, pseudos_live_through_calls, pseudos_live); @@ -992,9 +925,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) if (reg->type == OP_IN) update_pseudo_point (reg->regno, curr_point, USE_POINT); mark_regno_live (reg->regno, reg->biggest_mode); - check_pseudos_live_through_calls (reg->regno, - last_call_used_reg_set, - call_insn); + check_pseudos_live_through_calls (reg->regno, last_call_abi); } for (reg = curr_static_id->hard_regs; reg != NULL; reg = reg->next) @@ -1088,10 +1019,10 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) } /* Pseudos can't go in stack regs at the start of a basic block that - is reached by an abnormal edge. Likewise for call clobbered regs, - because caller-save, fixup_abnormal_edges and possibly the table - driven EH machinery are not quite ready to handle such pseudos - live across such edges. */ + is reached by an abnormal edge. Likewise for registers that are at + least partly call clobbered, because caller-save, fixup_abnormal_edges + and possibly the table driven EH machinery are not quite ready to + handle such pseudos live across such edges. */ if (bb_has_abnormal_pred (bb)) { #ifdef STACK_REGS @@ -1106,7 +1037,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) if (!cfun->has_nonlocal_label && has_abnormal_call_or_eh_pred_edge_p (bb)) for (px = 0; HARD_REGISTER_NUM_P (px); px++) - if (call_used_regs[px] + if (eh_edge_abi.clobbers_at_least_part_of_reg_p (px) #ifdef REAL_PIC_OFFSET_TABLE_REGNUM /* We should create a conflict of PIC pseudo with PIC hard reg as PIC hard reg can have a wrong value after @@ -1163,7 +1094,7 @@ process_bb_lives (basic_block bb, int &curr_point, bool dead_insn_p) if (sparseset_cardinality (pseudos_live_through_calls) == 0) break; if (sparseset_bit_p (pseudos_live_through_calls, j)) - check_pseudos_live_through_calls (j, last_call_used_reg_set, call_insn); + check_pseudos_live_through_calls (j, last_call_abi); } for (i = 0; HARD_REGISTER_NUM_P (i); ++i) @@ -1397,7 +1328,6 @@ lra_create_live_ranges_1 (bool all_p, bool dead_insn_p) lra_reg_info[i].biggest_mode = GET_MODE (regno_reg_rtx[i]); else lra_reg_info[i].biggest_mode = VOIDmode; - lra_reg_info[i].call_insn = NULL; if (!HARD_REGISTER_NUM_P (i) && lra_reg_info[i].nrefs != 0) { diff --git a/gcc/lra-remat.c b/gcc/lra-remat.c index 69209b2a1..914f5e2ce 100644 --- a/gcc/lra-remat.c +++ b/gcc/lra-remat.c @@ -65,16 +65,11 @@ along with GCC; see the file COPYING3. If not see #include "recog.h" #include "lra.h" #include "lra-int.h" +#include "function-abi.h" /* Number of candidates for rematerialization. */ static unsigned int cands_num; -/* The following is used for representation of call_used_reg_set in - form array whose elements are hard register numbers with nonzero bit - in CALL_USED_REG_SET. 
*/ -static int call_used_regs_arr_len; -static int call_used_regs_arr[FIRST_PSEUDO_REGISTER]; - /* Bitmap used for different calculations. */ static bitmap_head temp_bitmap; @@ -632,9 +627,12 @@ set_bb_regs (basic_block bb, rtx_insn *insn) bitmap_set_bit (&subreg_regs, regno); } if (CALL_P (insn)) - for (int i = 0; i < call_used_regs_arr_len; i++) - bitmap_set_bit (&get_remat_bb_data (bb)->dead_regs, - call_used_regs_arr[i]); + { + /* Partially-clobbered registers might still be live. */ + HARD_REG_SET clobbers = insn_callee_abi (insn).full_reg_clobbers (); + bitmap_ior_into (&get_remat_bb_data (bb)->dead_regs, + bitmap_view (clobbers)); + } } /* Calculate changed_regs and dead_regs for each BB. */ @@ -697,7 +695,7 @@ reg_overlap_for_remat_p (lra_insn_reg *reg, rtx_insn *insn) /* Return true if a call used register is an input operand of INSN. */ static bool -call_used_input_regno_present_p (rtx_insn *insn) +call_used_input_regno_present_p (const function_abi &abi, rtx_insn *insn) { int iter; lra_insn_recog_data_t id = lra_get_insn_recog_data (insn); @@ -708,8 +706,9 @@ call_used_input_regno_present_p (rtx_insn *insn) for (reg = (iter == 0 ? id->regs : static_id->hard_regs); reg != NULL; reg = reg->next) - if (reg->type == OP_IN && reg->regno < FIRST_PSEUDO_REGISTER - && TEST_HARD_REG_BIT (call_used_reg_set, reg->regno)) + if (reg->type == OP_IN + && reg->regno < FIRST_PSEUDO_REGISTER + && abi.clobbers_reg_p (reg->biggest_mode, reg->regno)) return true; return false; } @@ -798,18 +797,21 @@ calculate_gen_cands (void) } if (CALL_P (insn)) - EXECUTE_IF_SET_IN_BITMAP (gen_insns, 0, uid, bi) - { - rtx_insn *insn2 = lra_insn_recog_data[uid]->insn; + { + function_abi callee_abi = insn_callee_abi (insn); + EXECUTE_IF_SET_IN_BITMAP (gen_insns, 0, uid, bi) + { + rtx_insn *insn2 = lra_insn_recog_data[uid]->insn; - cand = insn_to_cand[INSN_UID (insn2)]; - gcc_assert (cand != NULL); - if (call_used_input_regno_present_p (insn2)) - { - bitmap_clear_bit (gen_cands, cand->index); - bitmap_set_bit (&temp_bitmap, uid); - } - } + cand = insn_to_cand[INSN_UID (insn2)]; + gcc_assert (cand != NULL); + if (call_used_input_regno_present_p (callee_abi, insn2)) + { + bitmap_clear_bit (gen_cands, cand->index); + bitmap_set_bit (&temp_bitmap, uid); + } + } + } bitmap_and_compl_into (gen_insns, &temp_bitmap); cand = insn_to_cand[INSN_UID (insn)]; @@ -1204,13 +1206,16 @@ do_remat (void) } if (CALL_P (insn)) - EXECUTE_IF_SET_IN_BITMAP (avail_cands, 0, cid, bi) - { - cand = all_cands[cid]; + { + function_abi callee_abi = insn_callee_abi (insn); + EXECUTE_IF_SET_IN_BITMAP (avail_cands, 0, cid, bi) + { + cand = all_cands[cid]; - if (call_used_input_regno_present_p (cand->insn)) - bitmap_set_bit (&temp_bitmap, cand->index); - } + if (call_used_input_regno_present_p (callee_abi, cand->insn)) + bitmap_set_bit (&temp_bitmap, cand->index); + } + } bitmap_and_compl_into (avail_cands, &temp_bitmap); @@ -1306,10 +1311,6 @@ lra_remat (void) insn_to_cand_activation = XCNEWVEC (cand_t, get_max_uid ()); regno_cands = XCNEWVEC (cand_t, max_regno); all_cands.create (8000); - call_used_regs_arr_len = 0; - for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (call_used_regs[i]) - call_used_regs_arr[call_used_regs_arr_len++] = i; initiate_cand_table (); create_remat_bb_data (); bitmap_initialize (&temp_bitmap, ®_obstack); diff --git a/gcc/lra-spills.c b/gcc/lra-spills.c index c0f61c119..d4163eb75 100644 --- a/gcc/lra-spills.c +++ b/gcc/lra-spills.c @@ -242,7 +242,7 @@ assign_spill_hard_regs (int *pseudo_regnos, int n) /* Set up reserved 
hard regs for every program point. */ reserved_hard_regs = XNEWVEC (HARD_REG_SET, lra_live_max_point); for (p = 0; p < lra_live_max_point; p++) - COPY_HARD_REG_SET (reserved_hard_regs[p], lra_no_alloc_regs); + reserved_hard_regs[p] = lra_no_alloc_regs; for (i = FIRST_PSEUDO_REGISTER; i < regs_num; i++) if (lra_reg_info[i].nrefs != 0 && (hard_regno = lra_get_regno_hard_regno (i)) >= 0) @@ -273,11 +273,10 @@ assign_spill_hard_regs (int *pseudo_regnos, int n) continue; } lra_assert (spill_class != NO_REGS); - COPY_HARD_REG_SET (conflict_hard_regs, - lra_reg_info[regno].conflict_hard_regs); + conflict_hard_regs = lra_reg_info[regno].conflict_hard_regs; for (r = lra_reg_info[regno].live_ranges; r != NULL; r = r->next) for (p = r->start; p <= r->finish; p++) - IOR_HARD_REG_SET (conflict_hard_regs, reserved_hard_regs[p]); + conflict_hard_regs |= reserved_hard_regs[p]; spill_class_size = ira_class_hard_regs_num[spill_class]; mode = lra_reg_info[regno].biggest_mode; for (k = 0; k < spill_class_size; k++) diff --git a/gcc/lra.c b/gcc/lra.c index 10b85340f..db2f82fb1 100644 --- a/gcc/lra.c +++ b/gcc/lra.c @@ -121,6 +121,7 @@ along with GCC; see the file COPYING3. If not see #include "lra.h" #include "lra-int.h" #include "print-rtl.h" +#include "function-abi.h" /* Dump bitmap SET with TITLE and BB INDEX. */ void @@ -536,18 +537,15 @@ object_allocator lra_insn_reg_pool ("insn regs"); /* Create LRA insn related info about a reference to REGNO in INSN with TYPE (in/out/inout), biggest reference mode MODE, flag that it - is reference through subreg (SUBREG_P), flag that is early - clobbered in the insn (EARLY_CLOBBER), and reference to the next + is reference through subreg (SUBREG_P), and reference to the next insn reg info (NEXT). If REGNO can be early clobbered, alternatives in which it can be early clobbered are given by - EARLY_CLOBBER_ALTS. CLOBBER_HIGH marks if reference is a clobber - high. */ + EARLY_CLOBBER_ALTS. */ static struct lra_insn_reg * new_insn_reg (rtx_insn *insn, int regno, enum op_type type, - machine_mode mode, - bool subreg_p, bool early_clobber, + machine_mode mode, bool subreg_p, alternative_mask early_clobber_alts, - struct lra_insn_reg *next, bool clobber_high) + struct lra_insn_reg *next) { lra_insn_reg *ir = lra_insn_reg_pool.allocate (); ir->type = type; @@ -556,9 +554,7 @@ new_insn_reg (rtx_insn *insn, int regno, enum op_type type, && partial_subreg_p (lra_reg_info[regno].biggest_mode, mode)) lra_reg_info[regno].biggest_mode = mode; ir->subreg_p = subreg_p; - ir->early_clobber = early_clobber; ir->early_clobber_alts = early_clobber_alts; - ir->clobber_high = clobber_high; ir->regno = regno; ir->next = next; return ir; @@ -605,7 +601,7 @@ static struct lra_operand_data debug_operand_data = 0, /* early_clobber_alts */ E_VOIDmode, /* We are not interesting in the operand mode. 
*/ OP_IN, - 0, 0, 0, 0 + 0, 0, 0 }; /* The following data are used as static insn data for all debug @@ -801,7 +797,6 @@ setup_operand_alternative (lra_insn_recog_data_t data, for (i = 0; i < nop; i++) { static_data->operand[i].early_clobber_alts = 0; - static_data->operand[i].early_clobber = false; static_data->operand[i].is_address = false; if (static_data->operand[i].constraint[0] == '%') { @@ -817,7 +812,6 @@ setup_operand_alternative (lra_insn_recog_data_t data, for (j = 0; j < nalt; j++) for (i = 0; i < nop; i++, op_alt++) { - static_data->operand[i].early_clobber |= op_alt->earlyclobber; if (op_alt->earlyclobber) static_data->operand[i].early_clobber_alts |= (alternative_mask) 1 << j; static_data->operand[i].is_address |= op_alt->is_address; @@ -828,13 +822,12 @@ setup_operand_alternative (lra_insn_recog_data_t data, not the insn operands, in X with TYPE (in/out/inout) and flag that it is early clobbered in the insn (EARLY_CLOBBER) and add the info to LIST. X is a part of insn given by DATA. Return the result - list. CLOBBER_HIGH marks if X is a clobber high. */ + list. */ static struct lra_insn_reg * collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, lra_insn_recog_data_t data, struct lra_insn_reg *list, - enum op_type type, bool early_clobber, - bool clobber_high) + enum op_type type, bool early_clobber) { int i, j, regno, last; bool subreg_p; @@ -878,10 +871,7 @@ collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, if (curr->type != type) curr->type = OP_INOUT; if (early_clobber) - { - curr->early_clobber = true; - curr->early_clobber_alts = ALL_ALTERNATIVES; - } + curr->early_clobber_alts = ALL_ALTERNATIVES; break; } if (curr == NULL) @@ -897,9 +887,7 @@ collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, && regno <= LAST_STACK_REG)); #endif list = new_insn_reg (data->insn, regno, type, mode, subreg_p, - early_clobber, - early_clobber ? ALL_ALTERNATIVES : 0, list, - clobber_high); + early_clobber ? ALL_ALTERNATIVES : 0, list); } } return list; @@ -908,31 +896,24 @@ collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, { case SET: list = collect_non_operand_hard_regs (insn, &SET_DEST (op), data, - list, OP_OUT, false, false); + list, OP_OUT, false); list = collect_non_operand_hard_regs (insn, &SET_SRC (op), data, - list, OP_IN, false, false); + list, OP_IN, false); break; case CLOBBER: /* We treat clobber of non-operand hard registers as early clobber. */ list = collect_non_operand_hard_regs (insn, &XEXP (op, 0), data, - list, OP_OUT, true, false); - break; - case CLOBBER_HIGH: - /* Clobber high should always span exactly one register. */ - gcc_assert (REG_NREGS (XEXP (op, 0)) == 1); - /* We treat clobber of non-operand hard registers as early clobber. 
*/ - list = collect_non_operand_hard_regs (insn, &XEXP (op, 0), data, - list, OP_OUT, true, true); + list, OP_OUT, true); break; case PRE_INC: case PRE_DEC: case POST_INC: case POST_DEC: list = collect_non_operand_hard_regs (insn, &XEXP (op, 0), data, - list, OP_INOUT, false, false); + list, OP_INOUT, false); break; case PRE_MODIFY: case POST_MODIFY: list = collect_non_operand_hard_regs (insn, &XEXP (op, 0), data, - list, OP_INOUT, false, false); + list, OP_INOUT, false); list = collect_non_operand_hard_regs (insn, &XEXP (op, 1), data, - list, OP_IN, false, false); + list, OP_IN, false); break; default: fmt = GET_RTX_FORMAT (code); @@ -940,12 +921,11 @@ collect_non_operand_hard_regs (rtx_insn *insn, rtx *x, { if (fmt[i] == 'e') list = collect_non_operand_hard_regs (insn, &XEXP (op, i), data, - list, OP_IN, false, false); + list, OP_IN, false); else if (fmt[i] == 'E') for (j = XVECLEN (op, i) - 1; j >= 0; j--) list = collect_non_operand_hard_regs (insn, &XVECEXP (op, i, j), - data, list, OP_IN, false, - false); + data, list, OP_IN, false); } } return list; @@ -1094,7 +1074,7 @@ lra_set_insn_recog_data (rtx_insn *insn) else insn_static_data->hard_regs = collect_non_operand_hard_regs (insn, &PATTERN (insn), data, - NULL, OP_IN, false, false); + NULL, OP_IN, false); data->arg_hard_regs = NULL; if (CALL_P (insn)) { @@ -1120,10 +1100,6 @@ lra_set_insn_recog_data (rtx_insn *insn) arg_hard_regs[n_hard_regs++] = regno + i + (use_p ? 0 : FIRST_PSEUDO_REGISTER); } - else if (GET_CODE (XEXP (link, 0)) == CLOBBER_HIGH) - /* We could support CLOBBER_HIGH and treat it in the same way as - HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that yet. */ - gcc_unreachable (); if (n_hard_regs != 0) { @@ -1332,7 +1308,6 @@ initialize_lra_reg_info_element (int i) lra_reg_info[i].no_stack_p = false; #endif CLEAR_HARD_REG_SET (lra_reg_info[i].conflict_hard_regs); - CLEAR_HARD_REG_SET (lra_reg_info[i].actual_call_used_reg_set); lra_reg_info[i].preferred_hard_regno1 = -1; lra_reg_info[i].preferred_hard_regno2 = -1; lra_reg_info[i].preferred_hard_regno_profit1 = 0; @@ -1345,7 +1320,6 @@ initialize_lra_reg_info_element (int i) lra_reg_info[i].val = get_new_reg_value (); lra_reg_info[i].offset = 0; lra_reg_info[i].copies = NULL; - lra_reg_info[i].call_insn = NULL; } /* Initialize common reg info and copies. */ @@ -1449,15 +1423,13 @@ lra_get_copy (int n) /* This page contains code dealing with info about registers in insns. */ -/* Process X of INSN recursively and add info (operand type is - given by TYPE, flag of that it is early clobber is EARLY_CLOBBER) - about registers in X to the insn DATA. If X can be early clobbered, - alternatives in which it can be early clobbered are given by - EARLY_CLOBBER_ALTS. */ +/* Process X of INSN recursively and add info (operand type is given + by TYPE) about registers in X to the insn DATA. If X can be early + clobbered, alternatives in which it can be early clobbered are given + by EARLY_CLOBBER_ALTS. 
*/ static void add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, - rtx_insn *insn, - enum op_type type, bool early_clobber, + rtx_insn *insn, enum op_type type, alternative_mask early_clobber_alts) { int i, j, regno; @@ -1487,8 +1459,7 @@ add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, if (bitmap_set_bit (&lra_reg_info[regno].insn_bitmap, INSN_UID (insn))) { data->regs = new_insn_reg (data->insn, regno, type, mode, subreg_p, - early_clobber, early_clobber_alts, - data->regs, false); + early_clobber_alts, data->regs); return; } else @@ -1500,15 +1471,12 @@ add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, /* The info cannot be integrated into the found structure. */ data->regs = new_insn_reg (data->insn, regno, type, mode, - subreg_p, early_clobber, - early_clobber_alts, data->regs, - false); + subreg_p, early_clobber_alts, + data->regs); else { if (curr->type != type) curr->type = OP_INOUT; - if (curr->early_clobber != early_clobber) - curr->early_clobber = true; curr->early_clobber_alts |= early_clobber_alts; } return; @@ -1520,23 +1488,21 @@ add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, switch (code) { case SET: - add_regs_to_insn_regno_info (data, SET_DEST (x), insn, OP_OUT, false, 0); - add_regs_to_insn_regno_info (data, SET_SRC (x), insn, OP_IN, false, 0); + add_regs_to_insn_regno_info (data, SET_DEST (x), insn, OP_OUT, 0); + add_regs_to_insn_regno_info (data, SET_SRC (x), insn, OP_IN, 0); break; case CLOBBER: /* We treat clobber of non-operand hard registers as early clobber. */ add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_OUT, - true, ALL_ALTERNATIVES); + ALL_ALTERNATIVES); break; - case CLOBBER_HIGH: - gcc_unreachable (); case PRE_INC: case PRE_DEC: case POST_INC: case POST_DEC: - add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_INOUT, false, 0); + add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_INOUT, 0); break; case PRE_MODIFY: case POST_MODIFY: - add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_INOUT, false, 0); - add_regs_to_insn_regno_info (data, XEXP (x, 1), insn, OP_IN, false, 0); + add_regs_to_insn_regno_info (data, XEXP (x, 0), insn, OP_INOUT, 0); + add_regs_to_insn_regno_info (data, XEXP (x, 1), insn, OP_IN, 0); break; default: if ((code != PARALLEL && code != EXPR_LIST) || type != OP_OUT) @@ -1557,12 +1523,12 @@ add_regs_to_insn_regno_info (lra_insn_recog_data_t data, rtx x, for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--) { if (fmt[i] == 'e') - add_regs_to_insn_regno_info (data, XEXP (x, i), insn, type, false, 0); + add_regs_to_insn_regno_info (data, XEXP (x, i), insn, type, 0); else if (fmt[i] == 'E') { for (j = XVECLEN (x, i) - 1; j >= 0; j--) add_regs_to_insn_regno_info (data, XVECEXP (x, i, j), insn, - type, false, 0); + type, 0); } } } @@ -1652,11 +1618,10 @@ lra_update_insn_regno_info (rtx_insn *insn) for (i = static_data->n_operands - 1; i >= 0; i--) add_regs_to_insn_regno_info (data, *data->operand_loc[i], insn, static_data->operand[i].type, - static_data->operand[i].early_clobber, static_data->operand[i].early_clobber_alts); if ((code = GET_CODE (PATTERN (insn))) == CLOBBER || code == USE) add_regs_to_insn_regno_info (data, XEXP (PATTERN (insn), 0), insn, - code == USE ? OP_IN : OP_OUT, false, 0); + code == USE ? OP_IN : OP_OUT, 0); if (CALL_P (insn)) /* On some targets call insns can refer to pseudos in memory in CALL_INSN_FUNCTION_USAGE list. 
Process them in order to @@ -1667,13 +1632,10 @@ lra_update_insn_regno_info (rtx_insn *insn) link = XEXP (link, 1)) { code = GET_CODE (XEXP (link, 0)); - /* We could support CLOBBER_HIGH and treat it in the same way as - HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that yet. */ - gcc_assert (code != CLOBBER_HIGH); if ((code == USE || code == CLOBBER) && MEM_P (XEXP (XEXP (link, 0), 0))) add_regs_to_insn_regno_info (data, XEXP (XEXP (link, 0), 0), insn, - code == USE ? OP_IN : OP_OUT, false, 0); + code == USE ? OP_IN : OP_OUT, 0); } if (NONDEBUG_INSN_P (insn)) setup_insn_reg_info (data, freq); @@ -2400,7 +2362,7 @@ lra (FILE *f) need it. */ emit_note (NOTE_INSN_DELETED); - COPY_HARD_REG_SET (lra_no_alloc_regs, ira_no_alloc_regs); + lra_no_alloc_regs = ira_no_alloc_regs; init_reg_info (); expand_reg_info (); @@ -2436,7 +2398,9 @@ lra (FILE *f) if (crtl->saves_all_registers) for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (! call_used_regs[i] && ! fixed_regs[i] && ! LOCAL_REGNO (i)) + if (!crtl->abi->clobbers_full_reg_p (i) + && !fixed_regs[i] + && !LOCAL_REGNO (i)) df_set_regs_ever_live (i, true); /* We don't DF from now and avoid its using because it is to @@ -2494,19 +2458,7 @@ lra (FILE *f) } /* Do inheritance only for regular algorithms. */ if (! lra_simple_p) - { - if (flag_ipa_ra) - { - if (live_p) - lra_clear_live_ranges (); - /* As a side-effect of lra_create_live_ranges, we calculate - actual_call_used_reg_set, which is needed during - lra_inheritance. */ - lra_create_live_ranges (true, true); - live_p = true; - } - lra_inheritance (); - } + lra_inheritance (); if (live_p) lra_clear_live_ranges (); bool fails_p; diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c index c1b160237..f47ac5b76 100644 --- a/gcc/lto-streamer-out.c +++ b/gcc/lto-streamer-out.c @@ -1122,12 +1122,12 @@ hash_tree (struct streamer_tree_cache_d *cache, hash_map *map, hstate.add_int (DECL_BUILT_IN_CLASS (t)); hstate.add_flag (DECL_STATIC_CONSTRUCTOR (t)); hstate.add_flag (DECL_STATIC_DESTRUCTOR (t)); + hstate.add_flag (FUNCTION_DECL_DECL_TYPE (t)); hstate.add_flag (DECL_UNINLINABLE (t)); hstate.add_flag (DECL_POSSIBLY_INLINED (t)); hstate.add_flag (DECL_IS_NOVOPS (t)); hstate.add_flag (DECL_IS_RETURNS_TWICE (t)); hstate.add_flag (DECL_IS_MALLOC (t)); - hstate.add_flag (DECL_IS_OPERATOR_NEW (t)); hstate.add_flag (DECL_DECLARED_INLINE_P (t)); hstate.add_flag (DECL_STATIC_CHAIN (t)); hstate.add_flag (DECL_NO_INLINE_WARNING_P (t)); @@ -1138,7 +1138,7 @@ hash_tree (struct streamer_tree_cache_d *cache, hash_map *map, hstate.add_flag (DECL_LOOPING_CONST_OR_PURE_P (t)); hstate.commit_flag (); if (DECL_BUILT_IN_CLASS (t) != NOT_BUILT_IN) - hstate.add_int (DECL_FUNCTION_CODE (t)); + hstate.add_int (DECL_UNCHECKED_FUNCTION_CODE (t)); } if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) diff --git a/gcc/lto/Make-lang.in b/gcc/lto/Make-lang.in index 1b856d6d4..b7ed96eac 100644 --- a/gcc/lto/Make-lang.in +++ b/gcc/lto/Make-lang.in @@ -22,7 +22,7 @@ # The name of the LTO compiler. LTO_EXE = lto1$(exeext) # The LTO-specific object files inclued in $(LTO_EXE). 
-LTO_OBJS = lto/lto-lang.o lto/lto.o lto/lto-object.o attribs.o lto/lto-partition.o lto/lto-symtab.o +LTO_OBJS = lto/lto-lang.o lto/lto.o lto/lto-object.o attribs.o lto/lto-partition.o lto/lto-symtab.o lto/lto-common.o lto_OBJS = $(LTO_OBJS) # this is only useful in a LTO bootstrap, but this does not work right diff --git a/gcc/lto/config-lang.in b/gcc/lto/config-lang.in index de9712504..07214365f 100644 --- a/gcc/lto/config-lang.in +++ b/gcc/lto/config-lang.in @@ -20,7 +20,7 @@ language="lto" compilers="lto1\$(exeext)" -gtfiles="\$(srcdir)/lto/lto-tree.h \$(srcdir)/lto/lto-lang.c \$(srcdir)/lto/lto.c \$(srcdir)/lto/lto.h" +gtfiles="\$(srcdir)/lto/lto-tree.h \$(srcdir)/lto/lto-lang.c \$(srcdir)/lto/lto.c \$(srcdir)/lto/lto.h \$(srcdir)/lto/lto-common.h \$(srcdir)/lto/lto-common.c" # LTO is a special front end. From a user's perspective it is not # really a language, but a middle end feature. However, the GIMPLE diff --git a/gcc/lto/lto-common.c b/gcc/lto/lto-common.c new file mode 100644 index 000000000..daf7f7b47 --- /dev/null +++ b/gcc/lto/lto-common.c @@ -0,0 +1,2837 @@ +/* Top-level LTO routines. + Copyright (C) 2009-2018 Free Software Foundation, Inc. + Contributed by CodeSourcery, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "function.h" +#include "bitmap.h" +#include "basic-block.h" +#include "tree.h" +#include "gimple.h" +#include "cfghooks.h" +#include "alloc-pool.h" +#include "tree-pass.h" +#include "tree-streamer.h" +#include "cgraph.h" +#include "opts.h" +#include "toplev.h" +#include "stor-layout.h" +#include "symbol-summary.h" +#include "tree-vrp.h" +#include "ipa-prop.h" +#include "common.h" +#include "debug.h" +#include "lto.h" +#include "lto-section-names.h" +#include "splay-tree.h" +#include "lto-partition.h" +#include "context.h" +#include "pass_manager.h" +#include "ipa-fnsummary.h" +#include "params.h" +#include "ipa-utils.h" +#include "gomp-constants.h" +#include "lto-symtab.h" +#include "stringpool.h" +#include "fold-const.h" +#include "attribs.h" +#include "builtins.h" +#include "lto-common.h" + +GTY(()) tree first_personality_decl; + +GTY(()) const unsigned char *lto_mode_identity_table; + +/* Returns a hash code for P. */ + +static hashval_t +hash_name (const void *p) +{ + const struct lto_section_slot *ds = (const struct lto_section_slot *) p; + return (hashval_t) htab_hash_string (ds->name); +} + + +/* Returns nonzero if P1 and P2 are equal. 
*/ + +static int +eq_name (const void *p1, const void *p2) +{ + const struct lto_section_slot *s1 = + (const struct lto_section_slot *) p1; + const struct lto_section_slot *s2 = + (const struct lto_section_slot *) p2; + + return strcmp (s1->name, s2->name) == 0; +} + +/* Free lto_section_slot */ + +static void +free_with_string (void *arg) +{ + struct lto_section_slot *s = (struct lto_section_slot *)arg; + + free (CONST_CAST (char *, s->name)); + free (arg); +} + +/* Create section hash table */ + +htab_t +lto_obj_create_section_hash_table (void) +{ + return htab_create (37, hash_name, eq_name, free_with_string); +} + +/* Delete an allocated integer KEY in the splay tree. */ + +static void +lto_splay_tree_delete_id (splay_tree_key key) +{ + free ((void *) key); +} + +/* Compare splay tree node ids A and B. */ + +static int +lto_splay_tree_compare_ids (splay_tree_key a, splay_tree_key b) +{ + unsigned HOST_WIDE_INT ai; + unsigned HOST_WIDE_INT bi; + + ai = *(unsigned HOST_WIDE_INT *) a; + bi = *(unsigned HOST_WIDE_INT *) b; + + if (ai < bi) + return -1; + else if (ai > bi) + return 1; + return 0; +} + +/* Look up splay tree node by ID in splay tree T. */ + +static splay_tree_node +lto_splay_tree_lookup (splay_tree t, unsigned HOST_WIDE_INT id) +{ + return splay_tree_lookup (t, (splay_tree_key) &id); +} + +/* Check if KEY has ID. */ + +static bool +lto_splay_tree_id_equal_p (splay_tree_key key, unsigned HOST_WIDE_INT id) +{ + return *(unsigned HOST_WIDE_INT *) key == id; +} + +/* Insert a splay tree node into tree T with ID as key and FILE_DATA as value. + The ID is allocated separately because we need HOST_WIDE_INTs which may + be wider than a splay_tree_key. */ + +static void +lto_splay_tree_insert (splay_tree t, unsigned HOST_WIDE_INT id, + struct lto_file_decl_data *file_data) +{ + unsigned HOST_WIDE_INT *idp = XCNEW (unsigned HOST_WIDE_INT); + *idp = id; + splay_tree_insert (t, (splay_tree_key) idp, (splay_tree_value) file_data); +} + +/* Create a splay tree. */ + +static splay_tree +lto_splay_tree_new (void) +{ + return splay_tree_new (lto_splay_tree_compare_ids, + lto_splay_tree_delete_id, + NULL); +} + +/* Decode the content of memory pointed to by DATA in the in decl + state object STATE. DATA_IN points to a data_in structure for + decoding. Return the address after the decoded object in the + input. */ + +static const uint32_t * +lto_read_in_decl_state (struct data_in *data_in, const uint32_t *data, + struct lto_in_decl_state *state) +{ + uint32_t ix; + tree decl; + uint32_t i, j; + + ix = *data++; + state->compressed = ix & 1; + ix /= 2; + decl = streamer_tree_cache_get_tree (data_in->reader_cache, ix); + if (!VAR_OR_FUNCTION_DECL_P (decl)) + { + gcc_assert (decl == void_type_node); + decl = NULL_TREE; + } + state->fn_decl = decl; + + for (i = 0; i < LTO_N_DECL_STREAMS; i++) + { + uint32_t size = *data++; + vec *decls = NULL; + vec_alloc (decls, size); + + for (j = 0; j < size; j++) + vec_safe_push (decls, + streamer_tree_cache_get_tree (data_in->reader_cache, + data[j])); + + state->streams[i] = decls; + data += size; + } + + return data; +} + + +/* Global canonical type table. 
*/ +static htab_t gimple_canonical_types; +static hash_map *canonical_type_hash_cache; +static unsigned long num_canonical_type_hash_entries; +static unsigned long num_canonical_type_hash_queries; + +static void iterative_hash_canonical_type (tree type, inchash::hash &hstate); +static hashval_t gimple_canonical_type_hash (const void *p); +static void gimple_register_canonical_type_1 (tree t, hashval_t hash); + +/* Returning a hash value for gimple type TYPE. + + The hash value returned is equal for types considered compatible + by gimple_canonical_types_compatible_p. */ + +static hashval_t +hash_canonical_type (tree type) +{ + inchash::hash hstate; + enum tree_code code; + + /* We compute alias sets only for types that needs them. + Be sure we do not recurse to something else as we cannot hash incomplete + types in a way they would have same hash value as compatible complete + types. */ + gcc_checking_assert (type_with_alias_set_p (type)); + + /* Combine a few common features of types so that types are grouped into + smaller sets; when searching for existing matching types to merge, + only existing types having the same features as the new type will be + checked. */ + code = tree_code_for_canonical_type_merging (TREE_CODE (type)); + hstate.add_int (code); + hstate.add_int (TYPE_MODE (type)); + + /* Incorporate common features of numerical types. */ + if (INTEGRAL_TYPE_P (type) + || SCALAR_FLOAT_TYPE_P (type) + || FIXED_POINT_TYPE_P (type) + || TREE_CODE (type) == OFFSET_TYPE + || POINTER_TYPE_P (type)) + { + hstate.add_int (TYPE_PRECISION (type)); + if (!type_with_interoperable_signedness (type)) + hstate.add_int (TYPE_UNSIGNED (type)); + } + + if (VECTOR_TYPE_P (type)) + { + hstate.add_poly_int (TYPE_VECTOR_SUBPARTS (type)); + hstate.add_int (TYPE_UNSIGNED (type)); + } + + if (TREE_CODE (type) == COMPLEX_TYPE) + hstate.add_int (TYPE_UNSIGNED (type)); + + /* Fortran's C_SIGNED_CHAR is !TYPE_STRING_FLAG but needs to be + interoperable with "signed char". Unless all frontends are revisited to + agree on these types, we must ignore the flag completely. */ + + /* Fortran standard define C_PTR type that is compatible with every + C pointer. For this reason we need to glob all pointers into one. + Still pointers in different address spaces are not compatible. */ + if (POINTER_TYPE_P (type)) + hstate.add_int (TYPE_ADDR_SPACE (TREE_TYPE (type))); + + /* For array types hash the domain bounds and the string flag. */ + if (TREE_CODE (type) == ARRAY_TYPE && TYPE_DOMAIN (type)) + { + hstate.add_int (TYPE_STRING_FLAG (type)); + /* OMP lowering can introduce error_mark_node in place of + random local decls in types. */ + if (TYPE_MIN_VALUE (TYPE_DOMAIN (type)) != error_mark_node) + inchash::add_expr (TYPE_MIN_VALUE (TYPE_DOMAIN (type)), hstate); + if (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) != error_mark_node) + inchash::add_expr (TYPE_MAX_VALUE (TYPE_DOMAIN (type)), hstate); + } + + /* Recurse for aggregates with a single element type. */ + if (TREE_CODE (type) == ARRAY_TYPE + || TREE_CODE (type) == COMPLEX_TYPE + || TREE_CODE (type) == VECTOR_TYPE) + iterative_hash_canonical_type (TREE_TYPE (type), hstate); + + /* Incorporate function return and argument types. 
*/ + if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) + { + unsigned na; + tree p; + + iterative_hash_canonical_type (TREE_TYPE (type), hstate); + + for (p = TYPE_ARG_TYPES (type), na = 0; p; p = TREE_CHAIN (p)) + { + iterative_hash_canonical_type (TREE_VALUE (p), hstate); + na++; + } + + hstate.add_int (na); + } + + if (RECORD_OR_UNION_TYPE_P (type)) + { + unsigned nf; + tree f; + + for (f = TYPE_FIELDS (type), nf = 0; f; f = TREE_CHAIN (f)) + if (TREE_CODE (f) == FIELD_DECL + && (! DECL_SIZE (f) + || ! integer_zerop (DECL_SIZE (f)))) + { + iterative_hash_canonical_type (TREE_TYPE (f), hstate); + nf++; + } + + hstate.add_int (nf); + } + + return hstate.end(); +} + +/* Returning a hash value for gimple type TYPE combined with VAL. */ + +static void +iterative_hash_canonical_type (tree type, inchash::hash &hstate) +{ + hashval_t v; + + /* All type variants have same TYPE_CANONICAL. */ + type = TYPE_MAIN_VARIANT (type); + + if (!canonical_type_used_p (type)) + v = hash_canonical_type (type); + /* An already processed type. */ + else if (TYPE_CANONICAL (type)) + { + type = TYPE_CANONICAL (type); + v = gimple_canonical_type_hash (type); + } + else + { + /* Canonical types should not be able to form SCCs by design, this + recursion is just because we do not register canonical types in + optimal order. To avoid quadratic behavior also register the + type here. */ + v = hash_canonical_type (type); + gimple_register_canonical_type_1 (type, v); + } + hstate.add_int (v); +} + +/* Returns the hash for a canonical type P. */ + +static hashval_t +gimple_canonical_type_hash (const void *p) +{ + num_canonical_type_hash_queries++; + hashval_t *slot = canonical_type_hash_cache->get ((const_tree) p); + gcc_assert (slot != NULL); + return *slot; +} + + + +/* Returns nonzero if P1 and P2 are equal. */ + +static int +gimple_canonical_type_eq (const void *p1, const void *p2) +{ + const_tree t1 = (const_tree) p1; + const_tree t2 = (const_tree) p2; + return gimple_canonical_types_compatible_p (CONST_CAST_TREE (t1), + CONST_CAST_TREE (t2)); +} + +/* Main worker for gimple_register_canonical_type. */ + +static void +gimple_register_canonical_type_1 (tree t, hashval_t hash) +{ + void **slot; + + gcc_checking_assert (TYPE_P (t) && !TYPE_CANONICAL (t) + && type_with_alias_set_p (t) + && canonical_type_used_p (t)); + + slot = htab_find_slot_with_hash (gimple_canonical_types, t, hash, INSERT); + if (*slot) + { + tree new_type = (tree)(*slot); + gcc_checking_assert (new_type != t); + TYPE_CANONICAL (t) = new_type; + } + else + { + TYPE_CANONICAL (t) = t; + *slot = (void *) t; + /* Cache the just computed hash value. */ + num_canonical_type_hash_entries++; + bool existed_p = canonical_type_hash_cache->put (t, hash); + gcc_assert (!existed_p); + } +} + +/* Register type T in the global type table gimple_types and set + TYPE_CANONICAL of T accordingly. + This is used by LTO to merge structurally equivalent types for + type-based aliasing purposes across different TUs and languages. + + ??? This merging does not exactly match how the tree.c middle-end + functions will assign TYPE_CANONICAL when new types are created + during optimization (which at least happens for pointer and array + types). */ + +static void +gimple_register_canonical_type (tree t) +{ + if (TYPE_CANONICAL (t) || !type_with_alias_set_p (t) + || !canonical_type_used_p (t)) + return; + + /* Canonical types are same among all complete variants. 
*/ + if (TYPE_CANONICAL (TYPE_MAIN_VARIANT (t))) + TYPE_CANONICAL (t) = TYPE_CANONICAL (TYPE_MAIN_VARIANT (t)); + else + { + gimple_register_canonical_type_1 (TYPE_MAIN_VARIANT (t), + hash_canonical_type (TYPE_MAIN_VARIANT (t))); + TYPE_CANONICAL (t) = TYPE_CANONICAL (TYPE_MAIN_VARIANT (t)); + } +} + +/* Re-compute TYPE_CANONICAL for NODE and related types. */ + +static void +lto_register_canonical_types (tree node, bool first_p) +{ + if (!node + || !TYPE_P (node)) + return; + + if (first_p) + TYPE_CANONICAL (node) = NULL_TREE; + + if (POINTER_TYPE_P (node) + || TREE_CODE (node) == COMPLEX_TYPE + || TREE_CODE (node) == ARRAY_TYPE) + lto_register_canonical_types (TREE_TYPE (node), first_p); + + if (!first_p) + gimple_register_canonical_type (node); +} + + +/* Remember trees that contains references to declarations. */ +vec *tree_with_vars; + +#define CHECK_VAR(tt) \ + do \ + { \ + if ((tt) && VAR_OR_FUNCTION_DECL_P (tt) \ + && (TREE_PUBLIC (tt) || DECL_EXTERNAL (tt))) \ + return true; \ + } while (0) + +#define CHECK_NO_VAR(tt) \ + gcc_checking_assert (!(tt) || !VAR_OR_FUNCTION_DECL_P (tt)) + +/* Check presence of pointers to decls in fields of a tree_typed T. */ + +static inline bool +mentions_vars_p_typed (tree t) +{ + CHECK_NO_VAR (TREE_TYPE (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a tree_common T. */ + +static inline bool +mentions_vars_p_common (tree t) +{ + if (mentions_vars_p_typed (t)) + return true; + CHECK_NO_VAR (TREE_CHAIN (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a decl_minimal T. */ + +static inline bool +mentions_vars_p_decl_minimal (tree t) +{ + if (mentions_vars_p_common (t)) + return true; + CHECK_NO_VAR (DECL_NAME (t)); + CHECK_VAR (DECL_CONTEXT (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a decl_common T. */ + +static inline bool +mentions_vars_p_decl_common (tree t) +{ + if (mentions_vars_p_decl_minimal (t)) + return true; + CHECK_VAR (DECL_SIZE (t)); + CHECK_VAR (DECL_SIZE_UNIT (t)); + CHECK_VAR (DECL_INITIAL (t)); + CHECK_NO_VAR (DECL_ATTRIBUTES (t)); + CHECK_VAR (DECL_ABSTRACT_ORIGIN (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a decl_with_vis T. */ + +static inline bool +mentions_vars_p_decl_with_vis (tree t) +{ + if (mentions_vars_p_decl_common (t)) + return true; + + /* Accessor macro has side-effects, use field-name here. */ + CHECK_NO_VAR (DECL_ASSEMBLER_NAME_RAW (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a decl_non_common T. */ + +static inline bool +mentions_vars_p_decl_non_common (tree t) +{ + if (mentions_vars_p_decl_with_vis (t)) + return true; + CHECK_NO_VAR (DECL_RESULT_FLD (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a decl_non_common T. */ + +static bool +mentions_vars_p_function (tree t) +{ + if (mentions_vars_p_decl_non_common (t)) + return true; + CHECK_NO_VAR (DECL_ARGUMENTS (t)); + CHECK_NO_VAR (DECL_VINDEX (t)); + CHECK_VAR (DECL_FUNCTION_PERSONALITY (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a field_decl T. */ + +static bool +mentions_vars_p_field_decl (tree t) +{ + if (mentions_vars_p_decl_common (t)) + return true; + CHECK_VAR (DECL_FIELD_OFFSET (t)); + CHECK_NO_VAR (DECL_BIT_FIELD_TYPE (t)); + CHECK_NO_VAR (DECL_QUALIFIER (t)); + CHECK_NO_VAR (DECL_FIELD_BIT_OFFSET (t)); + CHECK_NO_VAR (DECL_FCONTEXT (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a type T. 
*/ + +static bool +mentions_vars_p_type (tree t) +{ + if (mentions_vars_p_common (t)) + return true; + CHECK_NO_VAR (TYPE_CACHED_VALUES (t)); + CHECK_VAR (TYPE_SIZE (t)); + CHECK_VAR (TYPE_SIZE_UNIT (t)); + CHECK_NO_VAR (TYPE_ATTRIBUTES (t)); + CHECK_NO_VAR (TYPE_NAME (t)); + + CHECK_VAR (TYPE_MIN_VALUE_RAW (t)); + CHECK_VAR (TYPE_MAX_VALUE_RAW (t)); + + /* Accessor is for derived node types only. */ + CHECK_NO_VAR (TYPE_LANG_SLOT_1 (t)); + + CHECK_VAR (TYPE_CONTEXT (t)); + CHECK_NO_VAR (TYPE_CANONICAL (t)); + CHECK_NO_VAR (TYPE_MAIN_VARIANT (t)); + CHECK_NO_VAR (TYPE_NEXT_VARIANT (t)); + return false; +} + +/* Check presence of pointers to decls in fields of a BINFO T. */ + +static bool +mentions_vars_p_binfo (tree t) +{ + unsigned HOST_WIDE_INT i, n; + + if (mentions_vars_p_common (t)) + return true; + CHECK_VAR (BINFO_VTABLE (t)); + CHECK_NO_VAR (BINFO_OFFSET (t)); + CHECK_NO_VAR (BINFO_VIRTUALS (t)); + CHECK_NO_VAR (BINFO_VPTR_FIELD (t)); + n = vec_safe_length (BINFO_BASE_ACCESSES (t)); + for (i = 0; i < n; i++) + CHECK_NO_VAR (BINFO_BASE_ACCESS (t, i)); + /* Do not walk BINFO_INHERITANCE_CHAIN, BINFO_SUBVTT_INDEX + and BINFO_VPTR_INDEX; these are used by C++ FE only. */ + n = BINFO_N_BASE_BINFOS (t); + for (i = 0; i < n; i++) + CHECK_NO_VAR (BINFO_BASE_BINFO (t, i)); + return false; +} + +/* Check presence of pointers to decls in fields of a CONSTRUCTOR T. */ + +static bool +mentions_vars_p_constructor (tree t) +{ + unsigned HOST_WIDE_INT idx; + constructor_elt *ce; + + if (mentions_vars_p_typed (t)) + return true; + + for (idx = 0; vec_safe_iterate (CONSTRUCTOR_ELTS (t), idx, &ce); idx++) + { + CHECK_NO_VAR (ce->index); + CHECK_VAR (ce->value); + } + return false; +} + +/* Check presence of pointers to decls in fields of an expression tree T. */ + +static bool +mentions_vars_p_expr (tree t) +{ + int i; + if (mentions_vars_p_typed (t)) + return true; + for (i = TREE_OPERAND_LENGTH (t) - 1; i >= 0; --i) + CHECK_VAR (TREE_OPERAND (t, i)); + return false; +} + +/* Check presence of pointers to decls in fields of an OMP_CLAUSE T. */ + +static bool +mentions_vars_p_omp_clause (tree t) +{ + int i; + if (mentions_vars_p_common (t)) + return true; + for (i = omp_clause_num_ops[OMP_CLAUSE_CODE (t)] - 1; i >= 0; --i) + CHECK_VAR (OMP_CLAUSE_OPERAND (t, i)); + return false; +} + +/* Check presence of pointers to decls that needs later fixup in T. 
*/ + +static bool +mentions_vars_p (tree t) +{ + switch (TREE_CODE (t)) + { + case IDENTIFIER_NODE: + break; + + case TREE_LIST: + CHECK_VAR (TREE_VALUE (t)); + CHECK_VAR (TREE_PURPOSE (t)); + CHECK_NO_VAR (TREE_CHAIN (t)); + break; + + case FIELD_DECL: + return mentions_vars_p_field_decl (t); + + case LABEL_DECL: + case CONST_DECL: + case PARM_DECL: + case RESULT_DECL: + case IMPORTED_DECL: + case NAMESPACE_DECL: + case NAMELIST_DECL: + return mentions_vars_p_decl_common (t); + + case VAR_DECL: + return mentions_vars_p_decl_with_vis (t); + + case TYPE_DECL: + return mentions_vars_p_decl_non_common (t); + + case FUNCTION_DECL: + return mentions_vars_p_function (t); + + case TREE_BINFO: + return mentions_vars_p_binfo (t); + + case PLACEHOLDER_EXPR: + return mentions_vars_p_common (t); + + case BLOCK: + case TRANSLATION_UNIT_DECL: + case OPTIMIZATION_NODE: + case TARGET_OPTION_NODE: + break; + + case CONSTRUCTOR: + return mentions_vars_p_constructor (t); + + case OMP_CLAUSE: + return mentions_vars_p_omp_clause (t); + + default: + if (TYPE_P (t)) + { + if (mentions_vars_p_type (t)) + return true; + } + else if (EXPR_P (t)) + { + if (mentions_vars_p_expr (t)) + return true; + } + else if (CONSTANT_CLASS_P (t)) + CHECK_NO_VAR (TREE_TYPE (t)); + else + gcc_unreachable (); + } + return false; +} + + +/* Return the resolution for the decl with index INDEX from DATA_IN. */ + +static enum ld_plugin_symbol_resolution +get_resolution (struct data_in *data_in, unsigned index) +{ + if (data_in->globals_resolution.exists ()) + { + ld_plugin_symbol_resolution_t ret; + /* We can have references to not emitted functions in + DECL_FUNCTION_PERSONALITY at least. So we can and have + to indeed return LDPR_UNKNOWN in some cases. */ + if (data_in->globals_resolution.length () <= index) + return LDPR_UNKNOWN; + ret = data_in->globals_resolution[index]; + return ret; + } + else + /* Delay resolution finding until decl merging. */ + return LDPR_UNKNOWN; +} + +/* We need to record resolutions until symbol table is read. */ +static void +register_resolution (struct lto_file_decl_data *file_data, tree decl, + enum ld_plugin_symbol_resolution resolution) +{ + bool existed; + if (resolution == LDPR_UNKNOWN) + return; + if (!file_data->resolution_map) + file_data->resolution_map + = new hash_map; + ld_plugin_symbol_resolution_t &res + = file_data->resolution_map->get_or_insert (decl, &existed); + if (!existed + || resolution == LDPR_PREVAILING_DEF_IRONLY + || resolution == LDPR_PREVAILING_DEF + || resolution == LDPR_PREVAILING_DEF_IRONLY_EXP) + res = resolution; +} + +/* Register DECL with the global symbol table and change its + name if necessary to avoid name clashes for static globals across + different files. */ + +static void +lto_register_var_decl_in_symtab (struct data_in *data_in, tree decl, + unsigned ix) +{ + tree context; + + /* Variable has file scope, not local. */ + if (!TREE_PUBLIC (decl) + && !((context = decl_function_context (decl)) + && auto_var_in_fn_p (decl, context))) + rest_of_decl_compilation (decl, 1, 0); + + /* If this variable has already been declared, queue the + declaration for merging. */ + if (TREE_PUBLIC (decl)) + register_resolution (data_in->file_data, + decl, get_resolution (data_in, ix)); +} + + +/* Register DECL with the global symbol table and change its + name if necessary to avoid name clashes for static globals across + different files. DATA_IN contains descriptors and tables for the + file being read. 
*/ + +static void +lto_register_function_decl_in_symtab (struct data_in *data_in, tree decl, + unsigned ix) +{ + /* If this variable has already been declared, queue the + declaration for merging. */ + if (TREE_PUBLIC (decl) && !DECL_ABSTRACT_P (decl)) + register_resolution (data_in->file_data, + decl, get_resolution (data_in, ix)); +} + +/* Check if T is a decl and needs register its resolution info. */ + +static void +lto_maybe_register_decl (struct data_in *data_in, tree t, unsigned ix) +{ + if (TREE_CODE (t) == VAR_DECL) + lto_register_var_decl_in_symtab (data_in, t, ix); + else if (TREE_CODE (t) == FUNCTION_DECL + && !fndecl_built_in_p (t)) + lto_register_function_decl_in_symtab (data_in, t, ix); +} + + +/* For the type T re-materialize it in the type variant list and + the pointer/reference-to chains. */ + +static void +lto_fixup_prevailing_type (tree t) +{ + /* The following re-creates proper variant lists while fixing up + the variant leaders. We do not stream TYPE_NEXT_VARIANT so the + variant list state before fixup is broken. */ + + /* If we are not our own variant leader link us into our new leaders + variant list. */ + if (TYPE_MAIN_VARIANT (t) != t) + { + tree mv = TYPE_MAIN_VARIANT (t); + TYPE_NEXT_VARIANT (t) = TYPE_NEXT_VARIANT (mv); + TYPE_NEXT_VARIANT (mv) = t; + } + + /* The following reconstructs the pointer chains + of the new pointed-to type if we are a main variant. We do + not stream those so they are broken before fixup. */ + if (TREE_CODE (t) == POINTER_TYPE + && TYPE_MAIN_VARIANT (t) == t) + { + TYPE_NEXT_PTR_TO (t) = TYPE_POINTER_TO (TREE_TYPE (t)); + TYPE_POINTER_TO (TREE_TYPE (t)) = t; + } + else if (TREE_CODE (t) == REFERENCE_TYPE + && TYPE_MAIN_VARIANT (t) == t) + { + TYPE_NEXT_REF_TO (t) = TYPE_REFERENCE_TO (TREE_TYPE (t)); + TYPE_REFERENCE_TO (TREE_TYPE (t)) = t; + } +} + + +/* We keep prevailing tree SCCs in a hashtable with manual collision + handling (in case all hashes compare the same) and keep the colliding + entries in the tree_scc->next chain. */ + +struct tree_scc +{ + tree_scc *next; + /* Hash of the whole SCC. */ + hashval_t hash; + /* Number of trees in the SCC. */ + unsigned len; + /* Number of possible entries into the SCC (tree nodes [0..entry_len-1] + which share the same individual tree hash). */ + unsigned entry_len; + /* The members of the SCC. + We only need to remember the first entry node candidate for prevailing + SCCs (but of course have access to all entries for SCCs we are + processing). + ??? For prevailing SCCs we really only need hash and the first + entry candidate, but that's too awkward to implement. 
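[Editor's sketch] lto_fixup_prevailing_type above rebuilds the variant lists that are not streamed: a type that is not its own main variant is simply pushed onto the head of its leader's singly-linked chain. The following small sketch shows that head-insertion step with an invented node type; it is an illustration only, not the GCC tree representation.

#include <cstdio>

struct type_node
{
  const char *name;
  type_node *main_variant;   /* leader of the variant list (maybe itself) */
  type_node *next_variant;   /* intrusive chain rooted at the leader */
};

static void
relink_variant (type_node *t)
{
  if (t->main_variant != t)
    {
      type_node *mv = t->main_variant;
      t->next_variant = mv->next_variant;   /* splice in at the head */
      mv->next_variant = t;
    }
}

int
main ()
{
  type_node leader = { "T", &leader, NULL };
  type_node cv = { "const T", &leader, NULL };
  type_node vv = { "volatile T", &leader, NULL };

  relink_variant (&cv);
  relink_variant (&vv);

  for (type_node *v = leader.next_variant; v; v = v->next_variant)
    printf ("%s\n", v->name);   /* "volatile T" then "const T" */
  return 0;
}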
*/ + tree entries[1]; +}; + +struct tree_scc_hasher : nofree_ptr_hash +{ + static inline hashval_t hash (const tree_scc *); + static inline bool equal (const tree_scc *, const tree_scc *); +}; + +hashval_t +tree_scc_hasher::hash (const tree_scc *scc) +{ + return scc->hash; +} + +bool +tree_scc_hasher::equal (const tree_scc *scc1, const tree_scc *scc2) +{ + if (scc1->hash != scc2->hash + || scc1->len != scc2->len + || scc1->entry_len != scc2->entry_len) + return false; + return true; +} + +static hash_table *tree_scc_hash; +static struct obstack tree_scc_hash_obstack; + +static unsigned long num_merged_types; +static unsigned long num_prevailing_types; +static unsigned long num_type_scc_trees; +static unsigned long total_scc_size; +static unsigned long num_sccs_read; +static unsigned long total_scc_size_merged; +static unsigned long num_sccs_merged; +static unsigned long num_scc_compares; +static unsigned long num_scc_compare_collisions; + + +/* Compare the two entries T1 and T2 of two SCCs that are possibly equal, + recursing through in-SCC tree edges. Returns true if the SCCs entered + through T1 and T2 are equal and fills in *MAP with the pairs of + SCC entries we visited, starting with (*MAP)[0] = T1 and (*MAP)[1] = T2. */ + +static bool +compare_tree_sccs_1 (tree t1, tree t2, tree **map) +{ + enum tree_code code; + + /* Mark already visited nodes. */ + TREE_ASM_WRITTEN (t2) = 1; + + /* Push the pair onto map. */ + (*map)[0] = t1; + (*map)[1] = t2; + *map = *map + 2; + + /* Compare value-fields. */ +#define compare_values(X) \ + do { \ + if (X(t1) != X(t2)) \ + return false; \ + } while (0) + + compare_values (TREE_CODE); + code = TREE_CODE (t1); + + if (!TYPE_P (t1)) + { + compare_values (TREE_SIDE_EFFECTS); + compare_values (TREE_CONSTANT); + compare_values (TREE_READONLY); + compare_values (TREE_PUBLIC); + } + compare_values (TREE_ADDRESSABLE); + compare_values (TREE_THIS_VOLATILE); + if (DECL_P (t1)) + compare_values (DECL_UNSIGNED); + else if (TYPE_P (t1)) + compare_values (TYPE_UNSIGNED); + if (TYPE_P (t1)) + compare_values (TYPE_ARTIFICIAL); + else + compare_values (TREE_NO_WARNING); + compare_values (TREE_NOTHROW); + compare_values (TREE_STATIC); + if (code != TREE_BINFO) + compare_values (TREE_PRIVATE); + compare_values (TREE_PROTECTED); + compare_values (TREE_DEPRECATED); + if (TYPE_P (t1)) + { + if (AGGREGATE_TYPE_P (t1)) + compare_values (TYPE_REVERSE_STORAGE_ORDER); + else + compare_values (TYPE_SATURATING); + compare_values (TYPE_ADDR_SPACE); + } + else if (code == SSA_NAME) + compare_values (SSA_NAME_IS_DEFAULT_DEF); + + if (CODE_CONTAINS_STRUCT (code, TS_INT_CST)) + { + if (wi::to_wide (t1) != wi::to_wide (t2)) + return false; + } + + if (CODE_CONTAINS_STRUCT (code, TS_REAL_CST)) + { + /* ??? No suitable compare routine available. 
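[Editor's sketch] The `tree entries[1]` member of tree_scc uses the classic over-allocation idiom: the record is allocated with extra space so the trailing one-element array really holds `len` members, while the hash table only hashes and compares the cheap header fields and leaves deep equality to compare_tree_sccs. A freestanding sketch of that allocation pattern follows, with void pointers standing in for trees.

#include <cstdio>
#include <cstdlib>
#include <cstring>

struct scc_record
{
  scc_record *next;     /* collision chain of same-hash candidates */
  unsigned hash;        /* hash of the whole SCC */
  unsigned len;         /* number of members */
  void *entries[1];     /* really LEN entries; the struct is over-allocated */
};

static scc_record *
alloc_scc (unsigned hash, unsigned len, void **members)
{
  /* One entry is already part of the struct, so add LEN - 1 more.  */
  size_t sz = sizeof (scc_record) + (len - 1) * sizeof (void *);
  scc_record *scc = (scc_record *) malloc (sz);
  scc->next = nullptr;
  scc->hash = hash;
  scc->len = len;
  memcpy (scc->entries, members, len * sizeof (void *));
  return scc;
}

int
main ()
{
  int a, b, c;
  void *members[] = { &a, &b, &c };
  scc_record *scc = alloc_scc (0xdeadbeef, 3, members);
  printf ("hash=%x len=%u last=%p\n", scc->hash, scc->len, scc->entries[2]);
  free (scc);
  return 0;
}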
*/ + REAL_VALUE_TYPE r1 = TREE_REAL_CST (t1); + REAL_VALUE_TYPE r2 = TREE_REAL_CST (t2); + if (r1.cl != r2.cl + || r1.decimal != r2.decimal + || r1.sign != r2.sign + || r1.signalling != r2.signalling + || r1.canonical != r2.canonical + || r1.uexp != r2.uexp) + return false; + for (unsigned i = 0; i < SIGSZ; ++i) + if (r1.sig[i] != r2.sig[i]) + return false; + } + + if (CODE_CONTAINS_STRUCT (code, TS_FIXED_CST)) + if (!fixed_compare (EQ_EXPR, + TREE_FIXED_CST_PTR (t1), TREE_FIXED_CST_PTR (t2))) + return false; + + if (CODE_CONTAINS_STRUCT (code, TS_VECTOR)) + { + compare_values (VECTOR_CST_LOG2_NPATTERNS); + compare_values (VECTOR_CST_NELTS_PER_PATTERN); + } + + if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) + { + compare_values (DECL_MODE); + compare_values (DECL_NONLOCAL); + compare_values (DECL_VIRTUAL_P); + compare_values (DECL_IGNORED_P); + compare_values (DECL_ABSTRACT_P); + compare_values (DECL_ARTIFICIAL); + compare_values (DECL_USER_ALIGN); + compare_values (DECL_PRESERVE_P); + compare_values (DECL_EXTERNAL); + compare_values (DECL_GIMPLE_REG_P); + compare_values (DECL_ALIGN); + if (code == LABEL_DECL) + { + compare_values (EH_LANDING_PAD_NR); + compare_values (LABEL_DECL_UID); + } + else if (code == FIELD_DECL) + { + compare_values (DECL_PACKED); + compare_values (DECL_NONADDRESSABLE_P); + compare_values (DECL_PADDING_P); + compare_values (DECL_OFFSET_ALIGN); + } + else if (code == VAR_DECL) + { + compare_values (DECL_HAS_DEBUG_EXPR_P); + compare_values (DECL_NONLOCAL_FRAME); + } + if (code == RESULT_DECL + || code == PARM_DECL + || code == VAR_DECL) + { + compare_values (DECL_BY_REFERENCE); + if (code == VAR_DECL + || code == PARM_DECL) + compare_values (DECL_HAS_VALUE_EXPR_P); + } + } + + if (CODE_CONTAINS_STRUCT (code, TS_DECL_WRTL)) + compare_values (DECL_REGISTER); + + if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) + { + compare_values (DECL_COMMON); + compare_values (DECL_DLLIMPORT_P); + compare_values (DECL_WEAK); + compare_values (DECL_SEEN_IN_BIND_EXPR_P); + compare_values (DECL_COMDAT); + compare_values (DECL_VISIBILITY); + compare_values (DECL_VISIBILITY_SPECIFIED); + if (code == VAR_DECL) + { + compare_values (DECL_HARD_REGISTER); + /* DECL_IN_TEXT_SECTION is set during final asm output only. 
*/ + compare_values (DECL_IN_CONSTANT_POOL); + } + } + + if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) + { + compare_values (DECL_BUILT_IN_CLASS); + compare_values (DECL_STATIC_CONSTRUCTOR); + compare_values (DECL_STATIC_DESTRUCTOR); + compare_values (DECL_UNINLINABLE); + compare_values (DECL_POSSIBLY_INLINED); + compare_values (DECL_IS_NOVOPS); + compare_values (DECL_IS_RETURNS_TWICE); + compare_values (DECL_IS_MALLOC); + compare_values (DECL_IS_OPERATOR_NEW_P); + compare_values (DECL_DECLARED_INLINE_P); + compare_values (DECL_STATIC_CHAIN); + compare_values (DECL_NO_INLINE_WARNING_P); + compare_values (DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT); + compare_values (DECL_NO_LIMIT_STACK); + compare_values (DECL_DISREGARD_INLINE_LIMITS); + compare_values (DECL_PURE_P); + compare_values (DECL_LOOPING_CONST_OR_PURE_P); + compare_values (DECL_FINAL_P); + compare_values (DECL_CXX_CONSTRUCTOR_P); + compare_values (DECL_CXX_DESTRUCTOR_P); + if (DECL_BUILT_IN_CLASS (t1) != NOT_BUILT_IN) + compare_values (DECL_UNCHECKED_FUNCTION_CODE); + } + + if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) + { + compare_values (TYPE_MODE); + compare_values (TYPE_STRING_FLAG); + compare_values (TYPE_NEEDS_CONSTRUCTING); + if (RECORD_OR_UNION_TYPE_P (t1)) + { + compare_values (TYPE_TRANSPARENT_AGGR); + compare_values (TYPE_FINAL_P); + } + else if (code == ARRAY_TYPE) + compare_values (TYPE_NONALIASED_COMPONENT); + if (AGGREGATE_TYPE_P (t1)) + compare_values (TYPE_TYPELESS_STORAGE); + compare_values (TYPE_EMPTY_P); + compare_values (TYPE_PACKED); + compare_values (TYPE_RESTRICT); + compare_values (TYPE_USER_ALIGN); + compare_values (TYPE_READONLY); + compare_values (TYPE_PRECISION); + compare_values (TYPE_ALIGN); + /* Do not compare TYPE_ALIAS_SET. Doing so introduce ordering issues + with calls to get_alias_set which may initialize it for streamed + in types. */ + } + + /* We don't want to compare locations, so there is nothing do compare + for TS_EXP. */ + + /* BLOCKs are function local and we don't merge anything there, so + simply refuse to merge. 
*/ + if (CODE_CONTAINS_STRUCT (code, TS_BLOCK)) + return false; + + if (CODE_CONTAINS_STRUCT (code, TS_TRANSLATION_UNIT_DECL)) + if (strcmp (TRANSLATION_UNIT_LANGUAGE (t1), + TRANSLATION_UNIT_LANGUAGE (t2)) != 0) + return false; + + if (CODE_CONTAINS_STRUCT (code, TS_TARGET_OPTION)) + if (!cl_target_option_eq (TREE_TARGET_OPTION (t1), TREE_TARGET_OPTION (t2))) + return false; + + if (CODE_CONTAINS_STRUCT (code, TS_OPTIMIZATION)) + if (!cl_optimization_option_eq (TREE_OPTIMIZATION (t1), + TREE_OPTIMIZATION (t2))) + return false; + + if (CODE_CONTAINS_STRUCT (code, TS_BINFO)) + if (vec_safe_length (BINFO_BASE_ACCESSES (t1)) + != vec_safe_length (BINFO_BASE_ACCESSES (t2))) + return false; + + if (CODE_CONTAINS_STRUCT (code, TS_CONSTRUCTOR)) + compare_values (CONSTRUCTOR_NELTS); + + if (CODE_CONTAINS_STRUCT (code, TS_IDENTIFIER)) + if (IDENTIFIER_LENGTH (t1) != IDENTIFIER_LENGTH (t2) + || memcmp (IDENTIFIER_POINTER (t1), IDENTIFIER_POINTER (t2), + IDENTIFIER_LENGTH (t1)) != 0) + return false; + + if (CODE_CONTAINS_STRUCT (code, TS_STRING)) + if (TREE_STRING_LENGTH (t1) != TREE_STRING_LENGTH (t2) + || memcmp (TREE_STRING_POINTER (t1), TREE_STRING_POINTER (t2), + TREE_STRING_LENGTH (t1)) != 0) + return false; + + if (code == OMP_CLAUSE) + { + compare_values (OMP_CLAUSE_CODE); + switch (OMP_CLAUSE_CODE (t1)) + { + case OMP_CLAUSE_DEFAULT: + compare_values (OMP_CLAUSE_DEFAULT_KIND); + break; + case OMP_CLAUSE_SCHEDULE: + compare_values (OMP_CLAUSE_SCHEDULE_KIND); + break; + case OMP_CLAUSE_DEPEND: + compare_values (OMP_CLAUSE_DEPEND_KIND); + break; + case OMP_CLAUSE_MAP: + compare_values (OMP_CLAUSE_MAP_KIND); + break; + case OMP_CLAUSE_PROC_BIND: + compare_values (OMP_CLAUSE_PROC_BIND_KIND); + break; + case OMP_CLAUSE_REDUCTION: + compare_values (OMP_CLAUSE_REDUCTION_CODE); + compare_values (OMP_CLAUSE_REDUCTION_GIMPLE_INIT); + compare_values (OMP_CLAUSE_REDUCTION_GIMPLE_MERGE); + break; + default: + break; + } + } + +#undef compare_values + + + /* Compare pointer fields. */ + + /* Recurse. Search & Replaced from DFS_write_tree_body. + Folding the early checks into the compare_tree_edges recursion + macro makes debugging way quicker as you are able to break on + compare_tree_sccs_1 and simply finish until a call returns false + to spot the SCC members with the difference. */ +#define compare_tree_edges(E1, E2) \ + do { \ + tree t1_ = (E1), t2_ = (E2); \ + if (t1_ != t2_ \ + && (!t1_ || !t2_ \ + || !TREE_VISITED (t2_) \ + || (!TREE_ASM_WRITTEN (t2_) \ + && !compare_tree_sccs_1 (t1_, t2_, map)))) \ + return false; \ + /* Only non-NULL trees outside of the SCC may compare equal. */ \ + gcc_checking_assert (t1_ != t2_ || (!t2_ || !TREE_VISITED (t2_))); \ + } while (0) + + if (CODE_CONTAINS_STRUCT (code, TS_TYPED)) + { + if (code != IDENTIFIER_NODE) + compare_tree_edges (TREE_TYPE (t1), TREE_TYPE (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_VECTOR)) + { + /* Note that the number of elements for EXPR has already been emitted + in EXPR's header (see streamer_write_tree_header). */ + unsigned int count = vector_cst_encoded_nelts (t1); + for (unsigned int i = 0; i < count; ++i) + compare_tree_edges (VECTOR_CST_ENCODED_ELT (t1, i), + VECTOR_CST_ENCODED_ELT (t2, i)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_COMPLEX)) + { + compare_tree_edges (TREE_REALPART (t1), TREE_REALPART (t2)); + compare_tree_edges (TREE_IMAGPART (t1), TREE_IMAGPART (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_DECL_MINIMAL)) + { + compare_tree_edges (DECL_NAME (t1), DECL_NAME (t2)); + /* ??? 
Global decls from different TUs have non-matching + TRANSLATION_UNIT_DECLs. Only consider a small set of + decls equivalent, we should not end up merging others. */ + if ((code == TYPE_DECL + || code == NAMESPACE_DECL + || code == IMPORTED_DECL + || code == CONST_DECL + || (VAR_OR_FUNCTION_DECL_P (t1) + && (TREE_PUBLIC (t1) || DECL_EXTERNAL (t1)))) + && DECL_FILE_SCOPE_P (t1) && DECL_FILE_SCOPE_P (t2)) + ; + else + compare_tree_edges (DECL_CONTEXT (t1), DECL_CONTEXT (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) + { + compare_tree_edges (DECL_SIZE (t1), DECL_SIZE (t2)); + compare_tree_edges (DECL_SIZE_UNIT (t1), DECL_SIZE_UNIT (t2)); + compare_tree_edges (DECL_ATTRIBUTES (t1), DECL_ATTRIBUTES (t2)); + compare_tree_edges (DECL_ABSTRACT_ORIGIN (t1), DECL_ABSTRACT_ORIGIN (t2)); + if ((code == VAR_DECL + || code == PARM_DECL) + && DECL_HAS_VALUE_EXPR_P (t1)) + compare_tree_edges (DECL_VALUE_EXPR (t1), DECL_VALUE_EXPR (t2)); + if (code == VAR_DECL + && DECL_HAS_DEBUG_EXPR_P (t1)) + compare_tree_edges (DECL_DEBUG_EXPR (t1), DECL_DEBUG_EXPR (t2)); + /* LTO specific edges. */ + if (code != FUNCTION_DECL + && code != TRANSLATION_UNIT_DECL) + compare_tree_edges (DECL_INITIAL (t1), DECL_INITIAL (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_DECL_NON_COMMON)) + { + if (code == FUNCTION_DECL) + { + tree a1, a2; + for (a1 = DECL_ARGUMENTS (t1), a2 = DECL_ARGUMENTS (t2); + a1 || a2; + a1 = TREE_CHAIN (a1), a2 = TREE_CHAIN (a2)) + compare_tree_edges (a1, a2); + compare_tree_edges (DECL_RESULT (t1), DECL_RESULT (t2)); + } + else if (code == TYPE_DECL) + compare_tree_edges (DECL_ORIGINAL_TYPE (t1), DECL_ORIGINAL_TYPE (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) + { + /* Make sure we don't inadvertently set the assembler name. */ + if (DECL_ASSEMBLER_NAME_SET_P (t1)) + compare_tree_edges (DECL_ASSEMBLER_NAME (t1), + DECL_ASSEMBLER_NAME (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_FIELD_DECL)) + { + compare_tree_edges (DECL_FIELD_OFFSET (t1), DECL_FIELD_OFFSET (t2)); + compare_tree_edges (DECL_BIT_FIELD_TYPE (t1), DECL_BIT_FIELD_TYPE (t2)); + compare_tree_edges (DECL_BIT_FIELD_REPRESENTATIVE (t1), + DECL_BIT_FIELD_REPRESENTATIVE (t2)); + compare_tree_edges (DECL_FIELD_BIT_OFFSET (t1), + DECL_FIELD_BIT_OFFSET (t2)); + compare_tree_edges (DECL_FCONTEXT (t1), DECL_FCONTEXT (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) + { + compare_tree_edges (DECL_FUNCTION_PERSONALITY (t1), + DECL_FUNCTION_PERSONALITY (t2)); + compare_tree_edges (DECL_VINDEX (t1), DECL_VINDEX (t2)); + compare_tree_edges (DECL_FUNCTION_SPECIFIC_TARGET (t1), + DECL_FUNCTION_SPECIFIC_TARGET (t2)); + compare_tree_edges (DECL_FUNCTION_SPECIFIC_OPTIMIZATION (t1), + DECL_FUNCTION_SPECIFIC_OPTIMIZATION (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) + { + compare_tree_edges (TYPE_SIZE (t1), TYPE_SIZE (t2)); + compare_tree_edges (TYPE_SIZE_UNIT (t1), TYPE_SIZE_UNIT (t2)); + compare_tree_edges (TYPE_ATTRIBUTES (t1), TYPE_ATTRIBUTES (t2)); + compare_tree_edges (TYPE_NAME (t1), TYPE_NAME (t2)); + /* Do not compare TYPE_POINTER_TO or TYPE_REFERENCE_TO. They will be + reconstructed during fixup. */ + /* Do not compare TYPE_NEXT_VARIANT, we reconstruct the variant lists + during fixup. */ + compare_tree_edges (TYPE_MAIN_VARIANT (t1), TYPE_MAIN_VARIANT (t2)); + /* ??? Global types from different TUs have non-matching + TRANSLATION_UNIT_DECLs. Still merge them if they are otherwise + equal. 
*/ + if (TYPE_FILE_SCOPE_P (t1) && TYPE_FILE_SCOPE_P (t2)) + ; + else + compare_tree_edges (TYPE_CONTEXT (t1), TYPE_CONTEXT (t2)); + /* TYPE_CANONICAL is re-computed during type merging, so do not + compare it here. */ + compare_tree_edges (TYPE_STUB_DECL (t1), TYPE_STUB_DECL (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_TYPE_NON_COMMON)) + { + if (code == ENUMERAL_TYPE) + compare_tree_edges (TYPE_VALUES (t1), TYPE_VALUES (t2)); + else if (code == ARRAY_TYPE) + compare_tree_edges (TYPE_DOMAIN (t1), TYPE_DOMAIN (t2)); + else if (RECORD_OR_UNION_TYPE_P (t1)) + { + tree f1, f2; + for (f1 = TYPE_FIELDS (t1), f2 = TYPE_FIELDS (t2); + f1 || f2; + f1 = TREE_CHAIN (f1), f2 = TREE_CHAIN (f2)) + compare_tree_edges (f1, f2); + } + else if (code == FUNCTION_TYPE + || code == METHOD_TYPE) + compare_tree_edges (TYPE_ARG_TYPES (t1), TYPE_ARG_TYPES (t2)); + + if (!POINTER_TYPE_P (t1)) + compare_tree_edges (TYPE_MIN_VALUE_RAW (t1), TYPE_MIN_VALUE_RAW (t2)); + compare_tree_edges (TYPE_MAX_VALUE_RAW (t1), TYPE_MAX_VALUE_RAW (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_LIST)) + { + compare_tree_edges (TREE_PURPOSE (t1), TREE_PURPOSE (t2)); + compare_tree_edges (TREE_VALUE (t1), TREE_VALUE (t2)); + compare_tree_edges (TREE_CHAIN (t1), TREE_CHAIN (t2)); + } + + if (CODE_CONTAINS_STRUCT (code, TS_VEC)) + for (int i = 0; i < TREE_VEC_LENGTH (t1); i++) + compare_tree_edges (TREE_VEC_ELT (t1, i), TREE_VEC_ELT (t2, i)); + + if (CODE_CONTAINS_STRUCT (code, TS_EXP)) + { + for (int i = 0; i < TREE_OPERAND_LENGTH (t1); i++) + compare_tree_edges (TREE_OPERAND (t1, i), + TREE_OPERAND (t2, i)); + + /* BLOCKs are function local and we don't merge anything there. */ + if (TREE_BLOCK (t1) || TREE_BLOCK (t2)) + return false; + } + + if (CODE_CONTAINS_STRUCT (code, TS_BINFO)) + { + unsigned i; + tree t; + /* Lengths have already been compared above. */ + FOR_EACH_VEC_ELT (*BINFO_BASE_BINFOS (t1), i, t) + compare_tree_edges (t, BINFO_BASE_BINFO (t2, i)); + FOR_EACH_VEC_SAFE_ELT (BINFO_BASE_ACCESSES (t1), i, t) + compare_tree_edges (t, BINFO_BASE_ACCESS (t2, i)); + compare_tree_edges (BINFO_OFFSET (t1), BINFO_OFFSET (t2)); + compare_tree_edges (BINFO_VTABLE (t1), BINFO_VTABLE (t2)); + compare_tree_edges (BINFO_VPTR_FIELD (t1), BINFO_VPTR_FIELD (t2)); + /* Do not walk BINFO_INHERITANCE_CHAIN, BINFO_SUBVTT_INDEX + and BINFO_VPTR_INDEX; these are used by C++ FE only. */ + } + + if (CODE_CONTAINS_STRUCT (code, TS_CONSTRUCTOR)) + { + unsigned i; + tree index, value; + /* Lengths have already been compared above. */ + FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (t1), i, index, value) + { + compare_tree_edges (index, CONSTRUCTOR_ELT (t2, i)->index); + compare_tree_edges (value, CONSTRUCTOR_ELT (t2, i)->value); + } + } + + if (code == OMP_CLAUSE) + { + int i; + + for (i = 0; i < omp_clause_num_ops[OMP_CLAUSE_CODE (t1)]; i++) + compare_tree_edges (OMP_CLAUSE_OPERAND (t1, i), + OMP_CLAUSE_OPERAND (t2, i)); + compare_tree_edges (OMP_CLAUSE_CHAIN (t1), OMP_CLAUSE_CHAIN (t2)); + } + +#undef compare_tree_edges + + return true; +} + +/* Compare the tree scc SCC to the prevailing candidate PSCC, filling + out MAP if they are equal. */ + +static bool +compare_tree_sccs (tree_scc *pscc, tree_scc *scc, + tree *map) +{ + /* Assume SCC entry hashes are sorted after their cardinality. Which + means we can simply take the first n-tuple of equal hashes + (which is recorded as entry_len) and do n SCC entry candidate + comparisons. 
*/ + for (unsigned i = 0; i < pscc->entry_len; ++i) + { + tree *mapp = map; + num_scc_compare_collisions++; + if (compare_tree_sccs_1 (pscc->entries[0], scc->entries[i], &mapp)) + { + /* Equal - no need to reset TREE_VISITED or TREE_ASM_WRITTEN + on the scc as all trees will be freed. */ + return true; + } + /* Reset TREE_ASM_WRITTEN on scc for the next compare or in case + the SCC prevails. */ + for (unsigned j = 0; j < scc->len; ++j) + TREE_ASM_WRITTEN (scc->entries[j]) = 0; + } + + return false; +} + +/* QSort sort function to sort a map of two pointers after the 2nd + pointer. */ + +static int +cmp_tree (const void *p1_, const void *p2_) +{ + tree *p1 = (tree *)(const_cast(p1_)); + tree *p2 = (tree *)(const_cast(p2_)); + if (p1[1] == p2[1]) + return 0; + return ((uintptr_t)p1[1] < (uintptr_t)p2[1]) ? -1 : 1; +} + +/* Try to unify the SCC with nodes FROM to FROM + LEN in CACHE and + hash value SCC_HASH with an already recorded SCC. Return true if + that was successful, otherwise return false. */ + +static bool +unify_scc (struct data_in *data_in, unsigned from, + unsigned len, unsigned scc_entry_len, hashval_t scc_hash) +{ + bool unified_p = false; + struct streamer_tree_cache_d *cache = data_in->reader_cache; + tree_scc *scc + = (tree_scc *) alloca (sizeof (tree_scc) + (len - 1) * sizeof (tree)); + scc->next = NULL; + scc->hash = scc_hash; + scc->len = len; + scc->entry_len = scc_entry_len; + for (unsigned i = 0; i < len; ++i) + { + tree t = streamer_tree_cache_get_tree (cache, from + i); + scc->entries[i] = t; + /* Do not merge SCCs with local entities inside them. Also do + not merge TRANSLATION_UNIT_DECLs. */ + if (TREE_CODE (t) == TRANSLATION_UNIT_DECL + || (VAR_OR_FUNCTION_DECL_P (t) + && !(TREE_PUBLIC (t) || DECL_EXTERNAL (t))) + || TREE_CODE (t) == LABEL_DECL) + { + /* Avoid doing any work for these cases and do not worry to + record the SCCs for further merging. */ + return false; + } + } + + /* Look for the list of candidate SCCs to compare against. */ + tree_scc **slot; + slot = tree_scc_hash->find_slot_with_hash (scc, scc_hash, INSERT); + if (*slot) + { + /* Try unifying against each candidate. */ + num_scc_compares++; + + /* Set TREE_VISITED on the scc so we can easily identify tree nodes + outside of the scc when following tree edges. Make sure + that TREE_ASM_WRITTEN is unset so we can use it as 2nd bit + to track whether we visited the SCC member during the compare. + We cannot use TREE_VISITED on the pscc members as the extended + scc and pscc can overlap. */ + for (unsigned i = 0; i < scc->len; ++i) + { + TREE_VISITED (scc->entries[i]) = 1; + gcc_checking_assert (!TREE_ASM_WRITTEN (scc->entries[i])); + } + + tree *map = XALLOCAVEC (tree, 2 * len); + for (tree_scc *pscc = *slot; pscc; pscc = pscc->next) + { + if (!compare_tree_sccs (pscc, scc, map)) + continue; + + /* Found an equal SCC. */ + unified_p = true; + num_scc_compare_collisions--; + num_sccs_merged++; + total_scc_size_merged += len; + + if (flag_checking) + for (unsigned i = 0; i < len; ++i) + { + tree t = map[2*i+1]; + enum tree_code code = TREE_CODE (t); + /* IDENTIFIER_NODEs should be singletons and are merged by the + streamer. The others should be singletons, too, and we + should not merge them in any way. */ + gcc_assert (code != TRANSLATION_UNIT_DECL + && code != IDENTIFIER_NODE); + } + + /* Fixup the streamer cache with the prevailing nodes according + to the tree node mapping computed by compare_tree_sccs. 
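[Editor's sketch] cmp_tree above exists to line two pair arrays up: both unify_scc maps share the same set of second elements, so after sorting each array by that second pointer, entry i of one array corresponds to entry i of the other. A small standalone demonstration with plain integers in place of tree pointers:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

/* Compare two pairs by their second element, exactly like cmp_tree.  */
static int
cmp_pair (const void *p1_, const void *p2_)
{
  const intptr_t *p1 = (const intptr_t *) p1_;
  const intptr_t *p2 = (const intptr_t *) p2_;
  if (p1[1] == p2[1])
    return 0;
  return p1[1] < p2[1] ? -1 : 1;
}

int
main ()
{
  /* (prevailing, read) pairs and (cache-slot, read) pairs, keyed by the
     same "read" values but currently in different orders.  */
  intptr_t map[]  = { 100, 7,  200, 3,  300, 9 };
  intptr_t map2[] = { 0,   9,  1,   7,  2,   3 };

  qsort (map,  3, 2 * sizeof (intptr_t), cmp_pair);
  qsort (map2, 3, 2 * sizeof (intptr_t), cmp_pair);

  for (int i = 0; i < 3; i++)   /* rows now line up on the shared key */
    printf ("read=%ld: prevailing=%ld cache-slot=%ld\n",
            (long) map[2 * i + 1], (long) map[2 * i], (long) map2[2 * i]);
  return 0;
}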
*/ + if (len == 1) + { + /* If we got a debug reference queued, see if the prevailing + tree has a debug reference and if not, register the one + for the tree we are about to throw away. */ + if (dref_queue.length () == 1) + { + dref_entry e = dref_queue.pop (); + gcc_assert (e.decl + == streamer_tree_cache_get_tree (cache, from)); + const char *sym; + unsigned HOST_WIDE_INT off; + if (!debug_hooks->die_ref_for_decl (pscc->entries[0], &sym, + &off)) + debug_hooks->register_external_die (pscc->entries[0], + e.sym, e.off); + } + lto_maybe_register_decl (data_in, pscc->entries[0], from); + streamer_tree_cache_replace_tree (cache, pscc->entries[0], from); + } + else + { + tree *map2 = XALLOCAVEC (tree, 2 * len); + for (unsigned i = 0; i < len; ++i) + { + map2[i*2] = (tree)(uintptr_t)(from + i); + map2[i*2+1] = scc->entries[i]; + } + qsort (map2, len, 2 * sizeof (tree), cmp_tree); + qsort (map, len, 2 * sizeof (tree), cmp_tree); + for (unsigned i = 0; i < len; ++i) + { + lto_maybe_register_decl (data_in, map[2*i], + (uintptr_t)map2[2*i]); + streamer_tree_cache_replace_tree (cache, map[2*i], + (uintptr_t)map2[2*i]); + } + } + + /* Free the tree nodes from the read SCC. */ + data_in->location_cache.revert_location_cache (); + for (unsigned i = 0; i < len; ++i) + { + if (TYPE_P (scc->entries[i])) + num_merged_types++; + free_node (scc->entries[i]); + } + + /* Drop DIE references. + ??? Do as in the size-one SCC case which involves sorting + the queue. */ + dref_queue.truncate (0); + + break; + } + + /* Reset TREE_VISITED if we didn't unify the SCC with another. */ + if (!unified_p) + for (unsigned i = 0; i < scc->len; ++i) + TREE_VISITED (scc->entries[i]) = 0; + } + + /* If we didn't unify it to any candidate duplicate the relevant + pieces to permanent storage and link it into the chain. */ + if (!unified_p) + { + tree_scc *pscc + = XOBNEWVAR (&tree_scc_hash_obstack, tree_scc, sizeof (tree_scc)); + memcpy (pscc, scc, sizeof (tree_scc)); + pscc->next = (*slot); + *slot = pscc; + } + return unified_p; +} + + +/* Read all the symbols from buffer DATA, using descriptors in DECL_DATA. + RESOLUTIONS is the set of symbols picked by the linker (read from the + resolution file when the linker plugin is being used). */ + +static void +lto_read_decls (struct lto_file_decl_data *decl_data, const void *data, + vec resolutions) +{ + const struct lto_decl_header *header = (const struct lto_decl_header *) data; + const int decl_offset = sizeof (struct lto_decl_header); + const int main_offset = decl_offset + header->decl_state_size; + const int string_offset = main_offset + header->main_size; + struct data_in *data_in; + unsigned int i; + const uint32_t *data_ptr, *data_end; + uint32_t num_decl_states; + + lto_input_block ib_main ((const char *) data + main_offset, + header->main_size, decl_data->mode_table); + + data_in = lto_data_in_create (decl_data, (const char *) data + string_offset, + header->string_size, resolutions); + + /* We do not uniquify the pre-loaded cache entries, those are middle-end + internal types that should not be merged. */ + + /* Read the global declarations and types. */ + while (ib_main.p < ib_main.len) + { + tree t; + unsigned from = data_in->reader_cache->nodes.length (); + /* Read and uniquify SCCs as in the input stream. 
*/ + enum LTO_tags tag = streamer_read_record_start (&ib_main); + if (tag == LTO_tree_scc) + { + unsigned len_; + unsigned scc_entry_len; + hashval_t scc_hash = lto_input_scc (&ib_main, data_in, &len_, + &scc_entry_len); + unsigned len = data_in->reader_cache->nodes.length () - from; + gcc_assert (len == len_); + + total_scc_size += len; + num_sccs_read++; + + /* We have the special case of size-1 SCCs that are pre-merged + by means of identifier and string sharing for example. + ??? Maybe we should avoid streaming those as SCCs. */ + tree first = streamer_tree_cache_get_tree (data_in->reader_cache, + from); + if (len == 1 + && (TREE_CODE (first) == IDENTIFIER_NODE + || (TREE_CODE (first) == INTEGER_CST + && !TREE_OVERFLOW (first)))) + continue; + + /* Try to unify the SCC with already existing ones. */ + if (!flag_ltrans + && unify_scc (data_in, from, + len, scc_entry_len, scc_hash)) + continue; + + /* Tree merging failed, mark entries in location cache as + permanent. */ + data_in->location_cache.accept_location_cache (); + + bool seen_type = false; + for (unsigned i = 0; i < len; ++i) + { + tree t = streamer_tree_cache_get_tree (data_in->reader_cache, + from + i); + /* Reconstruct the type variant and pointer-to/reference-to + chains. */ + if (TYPE_P (t)) + { + seen_type = true; + num_prevailing_types++; + lto_fixup_prevailing_type (t); + + /* Compute the canonical type of all types. + Because SCC components are streamed in random (hash) order + we may have encountered the type before while registering + type canonical of a derived type in the same SCC. */ + if (!TYPE_CANONICAL (t)) + gimple_register_canonical_type (t); + if (TYPE_MAIN_VARIANT (t) == t && odr_type_p (t)) + register_odr_type (t); + } + /* Link shared INTEGER_CSTs into TYPE_CACHED_VALUEs of its + type which is also member of this SCC. */ + if (TREE_CODE (t) == INTEGER_CST + && !TREE_OVERFLOW (t)) + cache_integer_cst (t); + if (!flag_ltrans) + { + lto_maybe_register_decl (data_in, t, from + i); + /* Scan the tree for references to global functions or + variables and record those for later fixup. */ + if (mentions_vars_p (t)) + vec_safe_push (tree_with_vars, t); + } + } + + /* Register DECLs with the debuginfo machinery. */ + while (!dref_queue.is_empty ()) + { + dref_entry e = dref_queue.pop (); + debug_hooks->register_external_die (e.decl, e.sym, e.off); + } + + if (seen_type) + num_type_scc_trees += len; + } + else + { + /* Pickle stray references. */ + t = lto_input_tree_1 (&ib_main, data_in, tag, 0); + gcc_assert (t && data_in->reader_cache->nodes.length () == from); + } + } + data_in->location_cache.apply_location_cache (); + + /* Read in lto_in_decl_state objects. */ + data_ptr = (const uint32_t *) ((const char*) data + decl_offset); + data_end = + (const uint32_t *) ((const char*) data_ptr + header->decl_state_size); + num_decl_states = *data_ptr++; + + gcc_assert (num_decl_states > 0); + decl_data->global_decl_state = lto_new_in_decl_state (); + data_ptr = lto_read_in_decl_state (data_in, data_ptr, + decl_data->global_decl_state); + + /* Read in per-function decl states and enter them in hash table. 
*/ + decl_data->function_decl_states = + hash_table::create_ggc (37); + + for (i = 1; i < num_decl_states; i++) + { + struct lto_in_decl_state *state = lto_new_in_decl_state (); + + data_ptr = lto_read_in_decl_state (data_in, data_ptr, state); + lto_in_decl_state **slot + = decl_data->function_decl_states->find_slot (state, INSERT); + gcc_assert (*slot == NULL); + *slot = state; + } + + if (data_ptr != data_end) + internal_error ("bytecode stream: garbage at the end of symbols section"); + + /* Set the current decl state to be the global state. */ + decl_data->current_decl_state = decl_data->global_decl_state; + + lto_data_in_delete (data_in); +} + +/* Custom version of strtoll, which is not portable. */ + +static int64_t +lto_parse_hex (const char *p) +{ + int64_t ret = 0; + + for (; *p != '\0'; ++p) + { + char c = *p; + unsigned char part; + ret <<= 4; + if (c >= '0' && c <= '9') + part = c - '0'; + else if (c >= 'a' && c <= 'f') + part = c - 'a' + 10; + else if (c >= 'A' && c <= 'F') + part = c - 'A' + 10; + else + internal_error ("could not parse hex number"); + ret |= part; + } + + return ret; +} + +/* Read resolution for file named FILE_NAME. The resolution is read from + RESOLUTION. */ + +static void +lto_resolution_read (splay_tree file_ids, FILE *resolution, lto_file *file) +{ + /* We require that objects in the resolution file are in the same + order as the lto1 command line. */ + unsigned int name_len; + char *obj_name; + unsigned int num_symbols; + unsigned int i; + struct lto_file_decl_data *file_data; + splay_tree_node nd = NULL; + + if (!resolution) + return; + + name_len = strlen (file->filename); + obj_name = XNEWVEC (char, name_len + 1); + fscanf (resolution, " "); /* Read white space. */ + + fread (obj_name, sizeof (char), name_len, resolution); + obj_name[name_len] = '\0'; + if (filename_cmp (obj_name, file->filename) != 0) + internal_error ("unexpected file name %s in linker resolution file. " + "Expected %s", obj_name, file->filename); + if (file->offset != 0) + { + int t; + char offset_p[17]; + int64_t offset; + t = fscanf (resolution, "@0x%16s", offset_p); + if (t != 1) + internal_error ("could not parse file offset"); + offset = lto_parse_hex (offset_p); + if (offset != file->offset) + internal_error ("unexpected offset"); + } + + free (obj_name); + + fscanf (resolution, "%u", &num_symbols); + + for (i = 0; i < num_symbols; i++) + { + int t; + unsigned index; + unsigned HOST_WIDE_INT id; + char r_str[27]; + enum ld_plugin_symbol_resolution r = (enum ld_plugin_symbol_resolution) 0; + unsigned int j; + unsigned int lto_resolution_str_len = + sizeof (lto_resolution_str) / sizeof (char *); + res_pair rp; + + t = fscanf (resolution, "%u " HOST_WIDE_INT_PRINT_HEX_PURE " %26s %*[^\n]\n", + &index, &id, r_str); + if (t != 3) + internal_error ("invalid line in the resolution file"); + + for (j = 0; j < lto_resolution_str_len; j++) + { + if (strcmp (lto_resolution_str[j], r_str) == 0) + { + r = (enum ld_plugin_symbol_resolution) j; + break; + } + } + if (j == lto_resolution_str_len) + internal_error ("invalid resolution in the resolution file"); + + if (!(nd && lto_splay_tree_id_equal_p (nd->key, id))) + { + nd = lto_splay_tree_lookup (file_ids, id); + if (nd == NULL) + internal_error ("resolution sub id %wx not in object file", id); + } + + file_data = (struct lto_file_decl_data *)nd->value; + /* The indexes are very sparse. To save memory save them in a compact + format that is only unpacked later when the subfile is processed. 
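[Editor's sketch] lto_parse_hex above avoids strtoll for portability and accumulates the value one nibble at a time: shift the accumulator left by four bits and OR in the digit. A standalone copy of that loop with a worked input value:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

static int64_t
parse_hex_sketch (const char *p)
{
  int64_t ret = 0;
  for (; *p != '\0'; ++p)
    {
      char c = *p;
      unsigned char part;
      ret <<= 4;                      /* make room for the next nibble */
      if (c >= '0' && c <= '9')
        part = c - '0';
      else if (c >= 'a' && c <= 'f')
        part = c - 'a' + 10;
      else if (c >= 'A' && c <= 'F')
        part = c - 'A' + 10;
      else
        {
          fprintf (stderr, "could not parse hex number\n");
          exit (1);
        }
      ret |= part;
    }
  return ret;
}

int
main ()
{
  printf ("%lld\n", (long long) parse_hex_sketch ("1A2b"));  /* 6699 */
  return 0;
}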
*/ + rp.res = r; + rp.index = index; + file_data->respairs.safe_push (rp); + if (file_data->max_index < index) + file_data->max_index = index; + } +} + +/* List of file_decl_datas */ +struct file_data_list + { + struct lto_file_decl_data *first, *last; + }; + +/* Is the name for a id'ed LTO section? */ + +static int +lto_section_with_id (const char *name, unsigned HOST_WIDE_INT *id) +{ + const char *s; + + if (strncmp (name, section_name_prefix, strlen (section_name_prefix))) + return 0; + s = strrchr (name, '.'); + if (!s) + return 0; + /* If the section is not suffixed with an ID return. */ + if ((size_t)(s - name) == strlen (section_name_prefix)) + return 0; + return sscanf (s, "." HOST_WIDE_INT_PRINT_HEX_PURE, id) == 1; +} + +/* Create file_data of each sub file id */ + +static int +create_subid_section_table (struct lto_section_slot *ls, splay_tree file_ids, + struct file_data_list *list) +{ + struct lto_section_slot s_slot, *new_slot; + unsigned HOST_WIDE_INT id; + splay_tree_node nd; + void **hash_slot; + char *new_name; + struct lto_file_decl_data *file_data; + + if (!lto_section_with_id (ls->name, &id)) + return 1; + + /* Find hash table of sub module id */ + nd = lto_splay_tree_lookup (file_ids, id); + if (nd != NULL) + { + file_data = (struct lto_file_decl_data *)nd->value; + } + else + { + file_data = ggc_alloc (); + memset(file_data, 0, sizeof (struct lto_file_decl_data)); + file_data->id = id; + file_data->section_hash_table = lto_obj_create_section_hash_table (); + lto_splay_tree_insert (file_ids, id, file_data); + + /* Maintain list in linker order */ + if (!list->first) + list->first = file_data; + if (list->last) + list->last->next = file_data; + list->last = file_data; + } + + /* Copy section into sub module hash table */ + new_name = XDUPVEC (char, ls->name, strlen (ls->name) + 1); + s_slot.name = new_name; + hash_slot = htab_find_slot (file_data->section_hash_table, &s_slot, INSERT); + gcc_assert (*hash_slot == NULL); + + new_slot = XDUP (struct lto_section_slot, ls); + new_slot->name = new_name; + *hash_slot = new_slot; + return 1; +} + +/* Read declarations and other initializations for a FILE_DATA. */ + +static void +lto_file_finalize (struct lto_file_decl_data *file_data, lto_file *file) +{ + const char *data; + size_t len; + vec + resolutions = vNULL; + int i; + res_pair *rp; + + /* Create vector for fast access of resolution. We do this lazily + to save memory. */ + resolutions.safe_grow_cleared (file_data->max_index + 1); + for (i = 0; file_data->respairs.iterate (i, &rp); i++) + resolutions[rp->index] = rp->res; + file_data->respairs.release (); + + file_data->renaming_hash_table = lto_create_renaming_table (); + file_data->file_name = file->filename; +#ifdef ACCEL_COMPILER + lto_input_mode_table (file_data); +#else + file_data->mode_table = lto_mode_identity_table; +#endif + data = lto_get_section_data (file_data, LTO_section_decls, NULL, &len); + if (data == NULL) + { + internal_error ("cannot read LTO decls from %s", file_data->file_name); + return; + } + /* Frees resolutions */ + lto_read_decls (file_data, data, resolutions); + lto_free_section_data (file_data, LTO_section_decls, NULL, data, len); +} + +/* Finalize FILE_DATA in FILE and increase COUNT. 
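[Editor's sketch] lto_section_with_id above decides whether a section belongs to a sub-module by checking the section-name prefix and then reading a trailing ".<hex-id>" suffix. The standalone version below mirrors that test; the prefix string is an assumption chosen for the example and the function name is invented.

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstring>

static const char *section_prefix = ".gnu.lto_";   /* illustrative prefix */

static bool
section_with_id_sketch (const char *name, uint64_t *id)
{
  if (strncmp (name, section_prefix, strlen (section_prefix)) != 0)
    return false;
  const char *s = strrchr (name, '.');
  if (!s)
    return false;
  /* If the section is not suffixed with an ID, give up.  */
  if ((size_t) (s - name) == strlen (section_prefix))
    return false;
  return sscanf (s, ".%" SCNx64, id) == 1;
}

int
main ()
{
  uint64_t id;
  if (section_with_id_sketch (".gnu.lto_foo.1b2c", &id))
    printf ("sub id %" PRIx64 "\n", id);   /* prints "sub id 1b2c" */
  return 0;
}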
*/ + +static int +lto_create_files_from_ids (lto_file *file, struct lto_file_decl_data *file_data, + int *count) +{ + lto_file_finalize (file_data, file); + if (symtab->dump_file) + fprintf (symtab->dump_file, + "Creating file %s with sub id " HOST_WIDE_INT_PRINT_HEX "\n", + file_data->file_name, file_data->id); + (*count)++; + return 0; +} + +/* Generate a TREE representation for all types and external decls + entities in FILE. + + Read all of the globals out of the file. Then read the cgraph + and process the .o index into the cgraph nodes so that it can open + the .o file to load the functions and ipa information. */ + +static struct lto_file_decl_data * +lto_file_read (lto_file *file, FILE *resolution_file, int *count) +{ + struct lto_file_decl_data *file_data = NULL; + splay_tree file_ids; + htab_t section_hash_table; + struct lto_section_slot *section; + struct file_data_list file_list; + struct lto_section_list section_list; + + memset (§ion_list, 0, sizeof (struct lto_section_list)); + section_hash_table = lto_obj_build_section_table (file, §ion_list); + + /* Find all sub modules in the object and put their sections into new hash + tables in a splay tree. */ + file_ids = lto_splay_tree_new (); + memset (&file_list, 0, sizeof (struct file_data_list)); + for (section = section_list.first; section != NULL; section = section->next) + create_subid_section_table (section, file_ids, &file_list); + + /* Add resolutions to file ids */ + lto_resolution_read (file_ids, resolution_file, file); + + /* Finalize each lto file for each submodule in the merged object */ + for (file_data = file_list.first; file_data != NULL; file_data = file_data->next) + lto_create_files_from_ids (file, file_data, count); + + splay_tree_delete (file_ids); + htab_delete (section_hash_table); + + return file_list.first; +} + +#if HAVE_MMAP_FILE && HAVE_SYSCONF && defined _SC_PAGE_SIZE +#define LTO_MMAP_IO 1 +#endif + +#if LTO_MMAP_IO +/* Page size of machine is used for mmap and munmap calls. */ +static size_t page_mask; +#endif + +/* Get the section data of length LEN from FILENAME starting at + OFFSET. The data segment must be freed by the caller when the + caller is finished. Returns NULL if all was not well. */ + +static char * +lto_read_section_data (struct lto_file_decl_data *file_data, + intptr_t offset, size_t len) +{ + char *result; + static int fd = -1; + static char *fd_name; +#if LTO_MMAP_IO + intptr_t computed_len; + intptr_t computed_offset; + intptr_t diff; +#endif + + /* Keep a single-entry file-descriptor cache. The last file we + touched will get closed at exit. + ??? Eventually we want to add a more sophisticated larger cache + or rather fix function body streaming to not stream them in + practically random order. 
*/ + if (fd != -1 + && filename_cmp (fd_name, file_data->file_name) != 0) + { + free (fd_name); + close (fd); + fd = -1; + } + if (fd == -1) + { + fd = open (file_data->file_name, O_RDONLY|O_BINARY); + if (fd == -1) + { + fatal_error (input_location, "Cannot open %s", file_data->file_name); + return NULL; + } + fd_name = xstrdup (file_data->file_name); + } + +#if LTO_MMAP_IO + if (!page_mask) + { + size_t page_size = sysconf (_SC_PAGE_SIZE); + page_mask = ~(page_size - 1); + } + + computed_offset = offset & page_mask; + diff = offset - computed_offset; + computed_len = len + diff; + + result = (char *) mmap (NULL, computed_len, PROT_READ, MAP_PRIVATE, + fd, computed_offset); + if (result == MAP_FAILED) + { + fatal_error (input_location, "Cannot map %s", file_data->file_name); + return NULL; + } + + return result + diff; +#else + result = (char *) xmalloc (len); + if (lseek (fd, offset, SEEK_SET) != offset + || read (fd, result, len) != (ssize_t) len) + { + free (result); + fatal_error (input_location, "Cannot read %s", file_data->file_name); + result = NULL; + } +#ifdef __MINGW32__ + /* Native windows doesn't supports delayed unlink on opened file. So + we close file here again. This produces higher I/O load, but at least + it prevents to have dangling file handles preventing unlink. */ + free (fd_name); + fd_name = NULL; + close (fd); + fd = -1; +#endif + return result; +#endif +} + + +/* Get the section data from FILE_DATA of SECTION_TYPE with NAME. + NAME will be NULL unless the section type is for a function + body. */ + +static const char * +get_section_data (struct lto_file_decl_data *file_data, + enum lto_section_type section_type, + const char *name, + size_t *len) +{ + htab_t section_hash_table = file_data->section_hash_table; + struct lto_section_slot *f_slot; + struct lto_section_slot s_slot; + const char *section_name = lto_get_section_name (section_type, name, file_data); + char *data = NULL; + + *len = 0; + s_slot.name = section_name; + f_slot = (struct lto_section_slot *) htab_find (section_hash_table, &s_slot); + if (f_slot) + { + data = lto_read_section_data (file_data, f_slot->start, f_slot->len); + *len = f_slot->len; + } + + free (CONST_CAST (char *, section_name)); + return data; +} + + +/* Free the section data from FILE_DATA of SECTION_TYPE with NAME that + starts at OFFSET and has LEN bytes. */ + +static void +free_section_data (struct lto_file_decl_data *file_data ATTRIBUTE_UNUSED, + enum lto_section_type section_type ATTRIBUTE_UNUSED, + const char *name ATTRIBUTE_UNUSED, + const char *offset, size_t len ATTRIBUTE_UNUSED) +{ +#if LTO_MMAP_IO + intptr_t computed_len; + intptr_t computed_offset; + intptr_t diff; +#endif + +#if LTO_MMAP_IO + computed_offset = ((intptr_t) offset) & page_mask; + diff = (intptr_t) offset - computed_offset; + computed_len = len + diff; + + munmap ((caddr_t) computed_offset, computed_len); +#else + free (CONST_CAST(char *, offset)); +#endif +} + +static lto_file *current_lto_file; + +/* If TT is a variable or function decl replace it with its + prevailing variant. */ +#define LTO_SET_PREVAIL(tt) \ + do {\ + if ((tt) && VAR_OR_FUNCTION_DECL_P (tt) \ + && (TREE_PUBLIC (tt) || DECL_EXTERNAL (tt))) \ + { \ + tt = lto_symtab_prevailing_decl (tt); \ + fixed = true; \ + } \ + } while (0) + +/* Ensure that TT isn't a replacable var of function decl. 
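[Editor's sketch] The mmap path above cannot map at an arbitrary file offset: mmap requires a page-aligned offset, so the requested offset is rounded down to a page boundary, the mapping is enlarged by the rounding difference, and the caller's pointer is bumped back up by that difference. A worked, standalone example of just that arithmetic (no real file I/O, invented struct name):

#include <cstdint>
#include <cstdio>

struct mapping_request
{
  intptr_t map_offset;   /* page-aligned offset handed to mmap */
  intptr_t map_len;      /* enlarged length handed to mmap */
  intptr_t user_adjust;  /* add to mmap's return value to reach the data */
};

static mapping_request
align_for_mmap (intptr_t offset, intptr_t len, intptr_t page_size)
{
  intptr_t page_mask = ~(page_size - 1);
  mapping_request r;
  r.map_offset = offset & page_mask;        /* round down to page start */
  r.user_adjust = offset - r.map_offset;    /* slack bytes at the front */
  r.map_len = len + r.user_adjust;          /* cover the slack plus the data */
  return r;
}

int
main ()
{
  /* A 100-byte section at file offset 5000 with 4 KiB pages.  */
  mapping_request r = align_for_mmap (5000, 100, 4096);
  printf ("mmap offset %ld, length %ld, add %ld to the result\n",
          (long) r.map_offset, (long) r.map_len, (long) r.user_adjust);
  /* Prints: mmap offset 4096, length 1004, add 904 to the result.  */
  return 0;
}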
*/ +#define LTO_NO_PREVAIL(tt) \ + gcc_checking_assert (!(tt) || !VAR_OR_FUNCTION_DECL_P (tt)) + +/* Given a tree T replace all fields referring to variables or functions + with their prevailing variant. */ +static void +lto_fixup_prevailing_decls (tree t) +{ + enum tree_code code = TREE_CODE (t); + bool fixed = false; + + gcc_checking_assert (code != TREE_BINFO); + LTO_NO_PREVAIL (TREE_TYPE (t)); + if (CODE_CONTAINS_STRUCT (code, TS_COMMON) + /* lto_symtab_prevail_decl use TREE_CHAIN to link to the prevailing decl. + in the case T is a prevailed declaration we would ICE here. */ + && !VAR_OR_FUNCTION_DECL_P (t)) + LTO_NO_PREVAIL (TREE_CHAIN (t)); + if (DECL_P (t)) + { + LTO_NO_PREVAIL (DECL_NAME (t)); + LTO_SET_PREVAIL (DECL_CONTEXT (t)); + if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) + { + LTO_SET_PREVAIL (DECL_SIZE (t)); + LTO_SET_PREVAIL (DECL_SIZE_UNIT (t)); + LTO_SET_PREVAIL (DECL_INITIAL (t)); + LTO_NO_PREVAIL (DECL_ATTRIBUTES (t)); + LTO_SET_PREVAIL (DECL_ABSTRACT_ORIGIN (t)); + } + if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) + { + LTO_NO_PREVAIL (DECL_ASSEMBLER_NAME_RAW (t)); + } + if (CODE_CONTAINS_STRUCT (code, TS_DECL_NON_COMMON)) + { + LTO_NO_PREVAIL (DECL_RESULT_FLD (t)); + } + if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) + { + LTO_NO_PREVAIL (DECL_ARGUMENTS (t)); + LTO_SET_PREVAIL (DECL_FUNCTION_PERSONALITY (t)); + LTO_NO_PREVAIL (DECL_VINDEX (t)); + } + if (CODE_CONTAINS_STRUCT (code, TS_FIELD_DECL)) + { + LTO_SET_PREVAIL (DECL_FIELD_OFFSET (t)); + LTO_NO_PREVAIL (DECL_BIT_FIELD_TYPE (t)); + LTO_NO_PREVAIL (DECL_QUALIFIER (t)); + LTO_NO_PREVAIL (DECL_FIELD_BIT_OFFSET (t)); + LTO_NO_PREVAIL (DECL_FCONTEXT (t)); + } + } + else if (TYPE_P (t)) + { + LTO_NO_PREVAIL (TYPE_CACHED_VALUES (t)); + LTO_SET_PREVAIL (TYPE_SIZE (t)); + LTO_SET_PREVAIL (TYPE_SIZE_UNIT (t)); + LTO_NO_PREVAIL (TYPE_ATTRIBUTES (t)); + LTO_NO_PREVAIL (TYPE_NAME (t)); + + LTO_SET_PREVAIL (TYPE_MIN_VALUE_RAW (t)); + LTO_SET_PREVAIL (TYPE_MAX_VALUE_RAW (t)); + LTO_NO_PREVAIL (TYPE_LANG_SLOT_1 (t)); + + LTO_SET_PREVAIL (TYPE_CONTEXT (t)); + + LTO_NO_PREVAIL (TYPE_CANONICAL (t)); + LTO_NO_PREVAIL (TYPE_MAIN_VARIANT (t)); + LTO_NO_PREVAIL (TYPE_NEXT_VARIANT (t)); + } + else if (EXPR_P (t)) + { + int i; + for (i = TREE_OPERAND_LENGTH (t) - 1; i >= 0; --i) + LTO_SET_PREVAIL (TREE_OPERAND (t, i)); + } + else if (TREE_CODE (t) == CONSTRUCTOR) + { + unsigned i; + tree val; + FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (t), i, val) + LTO_SET_PREVAIL (val); + } + else + { + switch (code) + { + case TREE_LIST: + LTO_SET_PREVAIL (TREE_VALUE (t)); + LTO_SET_PREVAIL (TREE_PURPOSE (t)); + LTO_NO_PREVAIL (TREE_PURPOSE (t)); + break; + default: + gcc_unreachable (); + } + } + /* If we fixed nothing, then we missed something seen by + mentions_vars_p. */ + gcc_checking_assert (fixed); +} +#undef LTO_SET_PREVAIL +#undef LTO_NO_PREVAIL + +/* Helper function of lto_fixup_decls. Walks the var and fn streams in STATE, + replaces var and function decls with the corresponding prevailing def. */ + +static void +lto_fixup_state (struct lto_in_decl_state *state) +{ + unsigned i, si; + + /* Although we only want to replace FUNCTION_DECLs and VAR_DECLs, + we still need to walk from all DECLs to find the reachable + FUNCTION_DECLs and VAR_DECLs. 
*/ + for (si = 0; si < LTO_N_DECL_STREAMS; si++) + { + vec *trees = state->streams[si]; + for (i = 0; i < vec_safe_length (trees); i++) + { + tree t = (*trees)[i]; + if (flag_checking && TYPE_P (t)) + verify_type (t); + if (VAR_OR_FUNCTION_DECL_P (t) + && (TREE_PUBLIC (t) || DECL_EXTERNAL (t))) + (*trees)[i] = lto_symtab_prevailing_decl (t); + } + } +} + +/* Fix the decls from all FILES. Replaces each decl with the corresponding + prevailing one. */ + +static void +lto_fixup_decls (struct lto_file_decl_data **files) +{ + unsigned int i; + tree t; + + if (tree_with_vars) + FOR_EACH_VEC_ELT ((*tree_with_vars), i, t) + lto_fixup_prevailing_decls (t); + + for (i = 0; files[i]; i++) + { + struct lto_file_decl_data *file = files[i]; + struct lto_in_decl_state *state = file->global_decl_state; + lto_fixup_state (state); + + hash_table::iterator iter; + lto_in_decl_state *elt; + FOR_EACH_HASH_TABLE_ELEMENT (*file->function_decl_states, elt, + lto_in_decl_state *, iter) + lto_fixup_state (elt); + } +} + +static GTY((length ("lto_stats.num_input_files + 1"))) struct lto_file_decl_data **all_file_decl_data; + +/* Turn file datas for sub files into a single array, so that they look + like separate files for further passes. */ + +static void +lto_flatten_files (struct lto_file_decl_data **orig, int count, int last_file_ix) +{ + struct lto_file_decl_data *n, *next; + int i, k; + + lto_stats.num_input_files = count; + all_file_decl_data + = ggc_cleared_vec_alloc (count + 1); + /* Set the hooks so that all of the ipa passes can read in their data. */ + lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data); + for (i = 0, k = 0; i < last_file_ix; i++) + { + for (n = orig[i]; n != NULL; n = next) + { + all_file_decl_data[k++] = n; + next = n->next; + n->next = NULL; + } + } + all_file_decl_data[k] = NULL; + gcc_assert (k == count); +} + +/* Input file data before flattening (i.e. splitting them to subfiles to support + incremental linking. */ +static int real_file_count; +static GTY((length ("real_file_count + 1"))) struct lto_file_decl_data **real_file_decl_data; + +/* Read all the symbols from the input files FNAMES. NFILES is the + number of files requested in the command line. Instantiate a + global call graph by aggregating all the sub-graphs found in each + file. */ + +void +read_cgraph_and_symbols (unsigned nfiles, const char **fnames) +{ + unsigned int i, last_file_ix; + FILE *resolution; + int count = 0; + struct lto_file_decl_data **decl_data; + symtab_node *snode; + + symtab->initialize (); + + timevar_push (TV_IPA_LTO_DECL_IN); + +#ifdef ACCEL_COMPILER + section_name_prefix = OFFLOAD_SECTION_NAME_PREFIX; + lto_stream_offload_p = true; +#endif + + real_file_decl_data + = decl_data = ggc_cleared_vec_alloc (nfiles + 1); + real_file_count = nfiles; + + /* Read the resolution file. */ + resolution = NULL; + if (resolution_file_name) + { + int t; + unsigned num_objects; + + resolution = fopen (resolution_file_name, "r"); + if (resolution == NULL) + fatal_error (input_location, + "could not open symbol resolution file: %m"); + + t = fscanf (resolution, "%u", &num_objects); + gcc_assert (t == 1); + + /* True, since the plugin splits the archives. 
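[Editor's sketch] lto_flatten_files above turns per-object chains of sub-module records (linked through `next`) into one flat, NULL-terminated array so that later passes can treat every sub-module as an independent input file. The sketch below shows that flattening with an invented record type and a std::vector in place of the GC-allocated array.

#include <cassert>
#include <cstdio>
#include <vector>

struct file_decl_data
{
  const char *name;
  file_decl_data *next;   /* chain of sub-modules from the same object */
};

static std::vector<file_decl_data *>
flatten_files (file_decl_data **orig, int last_file_ix)
{
  std::vector<file_decl_data *> flat;
  for (int i = 0; i < last_file_ix; i++)
    for (file_decl_data *n = orig[i], *next; n != NULL; n = next)
      {
        next = n->next;
        n->next = NULL;          /* detach: each entry now stands alone */
        flat.push_back (n);
      }
  flat.push_back (NULL);         /* terminator, as in the real array */
  return flat;
}

int
main ()
{
  file_decl_data b = { "obj1:sub2", NULL };
  file_decl_data a = { "obj1:sub1", &b };
  file_decl_data c = { "obj2", NULL };
  file_decl_data *orig[] = { &a, &c };

  std::vector<file_decl_data *> flat = flatten_files (orig, 2);
  for (file_decl_data *f : flat)
    if (f)
      printf ("%s\n", f->name);
  assert (flat.size () == 4);    /* three entries plus the terminator */
  return 0;
}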
*/ + gcc_assert (num_objects == nfiles); + } + symtab->state = LTO_STREAMING; + + canonical_type_hash_cache = new hash_map (251); + gimple_canonical_types = htab_create (16381, gimple_canonical_type_hash, + gimple_canonical_type_eq, NULL); + gcc_obstack_init (&tree_scc_hash_obstack); + tree_scc_hash = new hash_table (4096); + + /* Register the common node types with the canonical type machinery so + we properly share alias-sets across languages and TUs. Do not + expose the common nodes as type merge target - those that should be + are already exposed so by pre-loading the LTO streamer caches. + Do two passes - first clear TYPE_CANONICAL and then re-compute it. */ + for (i = 0; i < itk_none; ++i) + lto_register_canonical_types (integer_types[i], true); + for (i = 0; i < stk_type_kind_last; ++i) + lto_register_canonical_types (sizetype_tab[i], true); + for (i = 0; i < TI_MAX; ++i) + lto_register_canonical_types (global_trees[i], true); + for (i = 0; i < itk_none; ++i) + lto_register_canonical_types (integer_types[i], false); + for (i = 0; i < stk_type_kind_last; ++i) + lto_register_canonical_types (sizetype_tab[i], false); + for (i = 0; i < TI_MAX; ++i) + lto_register_canonical_types (global_trees[i], false); + + if (!quiet_flag) + fprintf (stderr, "Reading object files:"); + + /* Read all of the object files specified on the command line. */ + for (i = 0, last_file_ix = 0; i < nfiles; ++i) + { + struct lto_file_decl_data *file_data = NULL; + if (!quiet_flag) + { + fprintf (stderr, " %s", fnames[i]); + fflush (stderr); + } + + current_lto_file = lto_obj_file_open (fnames[i], false); + if (!current_lto_file) + break; + + file_data = lto_file_read (current_lto_file, resolution, &count); + if (!file_data) + { + lto_obj_file_close (current_lto_file); + free (current_lto_file); + current_lto_file = NULL; + break; + } + + decl_data[last_file_ix++] = file_data; + + lto_obj_file_close (current_lto_file); + free (current_lto_file); + current_lto_file = NULL; + } + + lto_flatten_files (decl_data, count, last_file_ix); + lto_stats.num_input_files = count; + ggc_free(decl_data); + real_file_decl_data = NULL; + + if (resolution_file_name) + fclose (resolution); + + /* Show the LTO report before launching LTRANS. */ + if (flag_lto_report || (flag_wpa && flag_lto_report_wpa)) + print_lto_report_1 (); + + /* Free gimple type merging datastructures. */ + delete tree_scc_hash; + tree_scc_hash = NULL; + obstack_free (&tree_scc_hash_obstack, NULL); + htab_delete (gimple_canonical_types); + gimple_canonical_types = NULL; + delete canonical_type_hash_cache; + canonical_type_hash_cache = NULL; + + /* At this stage we know that majority of GGC memory is reachable. + Growing the limits prevents unnecesary invocation of GGC. */ + ggc_grow (); + ggc_collect (); + + /* Set the hooks so that all of the ipa passes can read in their data. */ + lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data); + + timevar_pop (TV_IPA_LTO_DECL_IN); + + if (!quiet_flag) + fprintf (stderr, "\nReading the callgraph\n"); + + timevar_push (TV_IPA_LTO_CGRAPH_IO); + /* Read the symtab. */ + input_symtab (); + + input_offload_tables (!flag_ltrans); + + /* Store resolutions into the symbol table. 
*/ + + FOR_EACH_SYMBOL (snode) + if (snode->externally_visible && snode->real_symbol_p () + && snode->lto_file_data && snode->lto_file_data->resolution_map + && !(TREE_CODE (snode->decl) == FUNCTION_DECL + && fndecl_built_in_p (snode->decl)) + && !(VAR_P (snode->decl) && DECL_HARD_REGISTER (snode->decl))) + { + ld_plugin_symbol_resolution_t *res; + + res = snode->lto_file_data->resolution_map->get (snode->decl); + if (!res || *res == LDPR_UNKNOWN) + { + if (snode->output_to_lto_symbol_table_p ()) + fatal_error (input_location, "missing resolution data for %s", + IDENTIFIER_POINTER + (DECL_ASSEMBLER_NAME (snode->decl))); + } + else + snode->resolution = *res; + } + for (i = 0; all_file_decl_data[i]; i++) + if (all_file_decl_data[i]->resolution_map) + { + delete all_file_decl_data[i]->resolution_map; + all_file_decl_data[i]->resolution_map = NULL; + } + + timevar_pop (TV_IPA_LTO_CGRAPH_IO); + + if (!quiet_flag) + fprintf (stderr, "Merging declarations\n"); + + timevar_push (TV_IPA_LTO_DECL_MERGE); + /* Merge global decls. In ltrans mode we read merged cgraph, we do not + need to care about resolving symbols again, we only need to replace + duplicated declarations read from the callgraph and from function + sections. */ + if (!flag_ltrans) + { + lto_symtab_merge_decls (); + + /* If there were errors during symbol merging bail out, we have no + good way to recover here. */ + if (seen_error ()) + fatal_error (input_location, + "errors during merging of translation units"); + + /* Fixup all decls. */ + lto_fixup_decls (all_file_decl_data); + } + if (tree_with_vars) + ggc_free (tree_with_vars); + tree_with_vars = NULL; + ggc_collect (); + + timevar_pop (TV_IPA_LTO_DECL_MERGE); + /* Each pass will set the appropriate timer. */ + + if (!quiet_flag) + fprintf (stderr, "Reading summaries\n"); + + /* Read the IPA summary data. */ + if (flag_ltrans) + ipa_read_optimization_summaries (); + else + ipa_read_summaries (); + + for (i = 0; all_file_decl_data[i]; i++) + { + gcc_assert (all_file_decl_data[i]->symtab_node_encoder); + lto_symtab_encoder_delete (all_file_decl_data[i]->symtab_node_encoder); + all_file_decl_data[i]->symtab_node_encoder = NULL; + lto_free_function_in_decl_state (all_file_decl_data[i]->global_decl_state); + all_file_decl_data[i]->global_decl_state = NULL; + all_file_decl_data[i]->current_decl_state = NULL; + } + + if (!flag_ltrans) + { + /* Finally merge the cgraph according to the decl merging decisions. */ + timevar_push (TV_IPA_LTO_CGRAPH_MERGE); + + gcc_assert (!dump_file); + dump_file = dump_begin (lto_link_dump_id, NULL); + + if (dump_file) + { + fprintf (dump_file, "Before merging:\n"); + symtab->dump (dump_file); + } + lto_symtab_merge_symbols (); + /* Removal of unreachable symbols is needed to make verify_symtab to pass; + we are still having duplicated comdat groups containing local statics. + We could also just remove them while merging. */ + symtab->remove_unreachable_nodes (dump_file); + ggc_collect (); + + if (dump_file) + dump_end (lto_link_dump_id, dump_file); + dump_file = NULL; + timevar_pop (TV_IPA_LTO_CGRAPH_MERGE); + } + symtab->state = IPA_SSA; + /* All node removals happening here are useless, because + WPA should not stream them. Still always perform remove_unreachable_nodes + because we may reshape clone tree, get rid of dead masters of inline + clones and remove symbol entries for read-only variables we keep around + only to be able to constant fold them. 
*/ + if (flag_ltrans) + { + if (symtab->dump_file) + symtab->dump (symtab->dump_file); + symtab->remove_unreachable_nodes (symtab->dump_file); + } + + /* Indicate that the cgraph is built and ready. */ + symtab->function_flags_ready = true; + + ggc_free (all_file_decl_data); + all_file_decl_data = NULL; +} + + + +/* Show various memory usage statistics related to LTO. */ +void +print_lto_report_1 (void) +{ + const char *pfx = (flag_lto) ? "LTO" : (flag_wpa) ? "WPA" : "LTRANS"; + fprintf (stderr, "%s statistics\n", pfx); + + fprintf (stderr, "[%s] read %lu SCCs of average size %f\n", + pfx, num_sccs_read, total_scc_size / (double)num_sccs_read); + fprintf (stderr, "[%s] %lu tree bodies read in total\n", pfx, total_scc_size); + if (flag_wpa && tree_scc_hash) + { + fprintf (stderr, "[%s] tree SCC table: size %ld, %ld elements, " + "collision ratio: %f\n", pfx, + (long) tree_scc_hash->size (), + (long) tree_scc_hash->elements (), + tree_scc_hash->collisions ()); + hash_table::iterator hiter; + tree_scc *scc, *max_scc = NULL; + unsigned max_length = 0; + FOR_EACH_HASH_TABLE_ELEMENT (*tree_scc_hash, scc, x, hiter) + { + unsigned length = 0; + tree_scc *s = scc; + for (; s; s = s->next) + length++; + if (length > max_length) + { + max_length = length; + max_scc = scc; + } + } + fprintf (stderr, "[%s] tree SCC max chain length %u (size %u)\n", + pfx, max_length, max_scc->len); + fprintf (stderr, "[%s] Compared %lu SCCs, %lu collisions (%f)\n", pfx, + num_scc_compares, num_scc_compare_collisions, + num_scc_compare_collisions / (double) num_scc_compares); + fprintf (stderr, "[%s] Merged %lu SCCs\n", pfx, num_sccs_merged); + fprintf (stderr, "[%s] Merged %lu tree bodies\n", pfx, + total_scc_size_merged); + fprintf (stderr, "[%s] Merged %lu types\n", pfx, num_merged_types); + fprintf (stderr, "[%s] %lu types prevailed (%lu associated trees)\n", + pfx, num_prevailing_types, num_type_scc_trees); + fprintf (stderr, "[%s] GIMPLE canonical type table: size %ld, " + "%ld elements, %ld searches, %ld collisions (ratio: %f)\n", pfx, + (long) htab_size (gimple_canonical_types), + (long) htab_elements (gimple_canonical_types), + (long) gimple_canonical_types->searches, + (long) gimple_canonical_types->collisions, + htab_collisions (gimple_canonical_types)); + fprintf (stderr, "[%s] GIMPLE canonical type pointer-map: " + "%lu elements, %ld searches\n", pfx, + num_canonical_type_hash_entries, + num_canonical_type_hash_queries); + } + + print_lto_report (pfx); +} + +GTY(()) tree lto_eh_personality_decl; + +/* Return the LTO personality function decl. */ + +tree +lto_eh_personality (void) +{ + if (!lto_eh_personality_decl) + { + /* Use the first personality DECL for our personality if we don't + support multiple ones. This ensures that we don't artificially + create the need for them in a single-language program. */ + if (first_personality_decl && !dwarf2out_do_cfi_asm ()) + lto_eh_personality_decl = first_personality_decl; + else + lto_eh_personality_decl = lhd_gcc_personality (); + } + + return lto_eh_personality_decl; +} + +/* Set the process name based on the LTO mode. */ + +static void +lto_process_name (void) +{ + if (flag_lto) + setproctitle (flag_incremental_link == INCREMENTAL_LINK_LTO + ? "lto1-inclink" : "lto1-lto"); + if (flag_wpa) + setproctitle ("lto1-wpa"); + if (flag_ltrans) + setproctitle ("lto1-ltrans"); +} + + +/* Initialize the LTO front end. 
*/
+
+void
+lto_fe_init (void)
+{
+  lto_process_name ();
+  lto_streamer_hooks_init ();
+  lto_reader_init ();
+  lto_set_in_hooks (NULL, get_section_data, free_section_data);
+  memset (&lto_stats, 0, sizeof (lto_stats));
+  bitmap_obstack_initialize (NULL);
+  gimple_register_cfg_hooks ();
+#ifndef ACCEL_COMPILER
+  unsigned char *table
+    = ggc_vec_alloc<unsigned char> (MAX_MACHINE_MODE);
+  for (int m = 0; m < MAX_MACHINE_MODE; m++)
+    table[m] = m;
+  lto_mode_identity_table = table;
+#endif
+}
+
+#include "gt-lto-lto-common.h"
diff --git a/gcc/lto/lto-common.h b/gcc/lto/lto-common.h
new file mode 100644
index 000000000..b1209a3a3
--- /dev/null
+++ b/gcc/lto/lto-common.h
@@ -0,0 +1,33 @@
+/* LTO common functions between lto.c and lto-dump.c header file.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef LTO_COMMON_H
+#define LTO_COMMON_H
+
+void lto_fe_init (void);
+void read_cgraph_and_symbols (unsigned, const char **);
+void print_lto_report_1 (void);
+
+extern tree lto_eh_personality_decl;
+extern GTY(()) vec<tree, va_gc> *tree_with_vars;
+extern const unsigned char *lto_mode_identity_table;
+extern tree first_personality_decl;
+
+#endif
+
diff --git a/gcc/lto/lto-lang.c b/gcc/lto/lto-lang.c
index 4ef228fcb..1d35db11e 100644
--- a/gcc/lto/lto-lang.c
+++ b/gcc/lto/lto-lang.c
@@ -34,6 +34,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "debug.h"
 #include "lto-tree.h"
 #include "lto.h"
+#include "lto-common.h"
 #include "stringpool.h"
 #include "attribs.h"
 
diff --git a/gcc/lto/lto-symtab.c b/gcc/lto/lto-symtab.c
index 63a633302..2fd5b1e8f 100644
--- a/gcc/lto/lto-symtab.c
+++ b/gcc/lto/lto-symtab.c
@@ -556,7 +556,8 @@ lto_symtab_merge_p (tree prevailing, tree decl)
     }
   if (fndecl_built_in_p (prevailing)
       && (DECL_BUILT_IN_CLASS (prevailing) != DECL_BUILT_IN_CLASS (decl)
-	  || DECL_FUNCTION_CODE (prevailing) != DECL_FUNCTION_CODE (decl)))
+	  || (DECL_UNCHECKED_FUNCTION_CODE (prevailing)
+	      != DECL_UNCHECKED_FUNCTION_CODE (decl))))
     {
       if (dump_file)
 	fprintf (dump_file, "Not merging decls; "
diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
index 4db156fdf..c44e034a2 100644
--- a/gcc/lto/lto.c
+++ b/gcc/lto/lto.c
@@ -38,7 +38,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "symbol-summary.h"
 #include "tree-vrp.h"
 #include "ipa-prop.h"
-#include "common.h"
 #include "debug.h"
 #include "lto.h"
 #include "lto-section-names.h"
@@ -55,122 +54,12 @@ along with GCC; see the file COPYING3.  If not see
 #include "fold-const.h"
 #include "attribs.h"
 #include "builtins.h"
+#include "lto-common.h"
 
 /* Number of parallel tasks to run, -1 if we want to use GNU Make jobserver.  */
 static int lto_parallelism;
 
-static GTY(()) tree first_personality_decl;
-
-static GTY(()) const unsigned char *lto_mode_identity_table;
-
-/* Returns a hash code for P.
*/ - -static hashval_t -hash_name (const void *p) -{ - const struct lto_section_slot *ds = (const struct lto_section_slot *) p; - return (hashval_t) htab_hash_string (ds->name); -} - - -/* Returns nonzero if P1 and P2 are equal. */ - -static int -eq_name (const void *p1, const void *p2) -{ - const struct lto_section_slot *s1 = - (const struct lto_section_slot *) p1; - const struct lto_section_slot *s2 = - (const struct lto_section_slot *) p2; - - return strcmp (s1->name, s2->name) == 0; -} - -/* Free lto_section_slot */ - -static void -free_with_string (void *arg) -{ - struct lto_section_slot *s = (struct lto_section_slot *)arg; - - free (CONST_CAST (char *, s->name)); - free (arg); -} - -/* Create section hash table */ - -htab_t -lto_obj_create_section_hash_table (void) -{ - return htab_create (37, hash_name, eq_name, free_with_string); -} - -/* Delete an allocated integer KEY in the splay tree. */ - -static void -lto_splay_tree_delete_id (splay_tree_key key) -{ - free ((void *) key); -} - -/* Compare splay tree node ids A and B. */ - -static int -lto_splay_tree_compare_ids (splay_tree_key a, splay_tree_key b) -{ - unsigned HOST_WIDE_INT ai; - unsigned HOST_WIDE_INT bi; - - ai = *(unsigned HOST_WIDE_INT *) a; - bi = *(unsigned HOST_WIDE_INT *) b; - - if (ai < bi) - return -1; - else if (ai > bi) - return 1; - return 0; -} - -/* Look up splay tree node by ID in splay tree T. */ - -static splay_tree_node -lto_splay_tree_lookup (splay_tree t, unsigned HOST_WIDE_INT id) -{ - return splay_tree_lookup (t, (splay_tree_key) &id); -} - -/* Check if KEY has ID. */ - -static bool -lto_splay_tree_id_equal_p (splay_tree_key key, unsigned HOST_WIDE_INT id) -{ - return *(unsigned HOST_WIDE_INT *) key == id; -} - -/* Insert a splay tree node into tree T with ID as key and FILE_DATA as value. - The ID is allocated separately because we need HOST_WIDE_INTs which may - be wider than a splay_tree_key. */ - -static void -lto_splay_tree_insert (splay_tree t, unsigned HOST_WIDE_INT id, - struct lto_file_decl_data *file_data) -{ - unsigned HOST_WIDE_INT *idp = XCNEW (unsigned HOST_WIDE_INT); - *idp = id; - splay_tree_insert (t, (splay_tree_key) idp, (splay_tree_value) file_data); -} - -/* Create a splay tree. */ - -static splay_tree -lto_splay_tree_new (void) -{ - return splay_tree_new (lto_splay_tree_compare_ids, - lto_splay_tree_delete_id, - NULL); -} - /* Return true when NODE has a clone that is analyzed (i.e. we need to load its body even if the node itself is not needed). */ @@ -224,2083 +113,45 @@ lto_materialize_function (struct cgraph_node *node) rest_of_decl_compilation (decl, 1, 0); } - -/* Decode the content of memory pointed to by DATA in the in decl - state object STATE. DATA_IN points to a data_in structure for - decoding. Return the address after the decoded object in the - input. 
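Side note (editorial sketch, not part of the patch): the splay-tree helpers above key the tree on pointers to separately allocated unsigned HOST_WIDE_INT values because a splay_tree_key may be narrower than a 64-bit section id. A minimal standalone C sketch of the same idea, using hypothetical names (id_key, make_id_key, compare_id_keys) and uint64_t/uintptr_t in place of the GCC types:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for splay_tree_key: pointer-sized, so it may be
   too narrow to hold a 64-bit id directly.  */
typedef uintptr_t id_key;

/* Allocate the 64-bit id on the heap and hand the tree a pointer to it,
   mirroring lto_splay_tree_insert.  */
static id_key
make_id_key (uint64_t id)
{
  uint64_t *idp = malloc (sizeof *idp);
  *idp = id;
  return (id_key) idp;
}

/* Compare two keys by the ids they point to, not by pointer value,
   mirroring lto_splay_tree_compare_ids.  */
static int
compare_id_keys (id_key a, id_key b)
{
  uint64_t ai = *(const uint64_t *) a;
  uint64_t bi = *(const uint64_t *) b;
  return (ai < bi) ? -1 : (ai > bi) ? 1 : 0;
}

int
main (void)
{
  id_key k1 = make_id_key (0x0123456789abcdefULL);
  id_key k2 = make_id_key (0xfedcba9876543210ULL);
  printf ("compare: %d\n", compare_id_keys (k1, k2));  /* prints -1 */
  free ((void *) k1);
  free ((void *) k2);
  return 0;
}

Keeping the id behind a heap pointer leaves the splay-tree API untouched while still supporting ids wider than a pointer-sized key.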
*/ - -static const uint32_t * -lto_read_in_decl_state (struct data_in *data_in, const uint32_t *data, - struct lto_in_decl_state *state) -{ - uint32_t ix; - tree decl; - uint32_t i, j; - - ix = *data++; - state->compressed = ix & 1; - ix /= 2; - decl = streamer_tree_cache_get_tree (data_in->reader_cache, ix); - if (!VAR_OR_FUNCTION_DECL_P (decl)) - { - gcc_assert (decl == void_type_node); - decl = NULL_TREE; - } - state->fn_decl = decl; - - for (i = 0; i < LTO_N_DECL_STREAMS; i++) - { - uint32_t size = *data++; - vec *decls = NULL; - vec_alloc (decls, size); - - for (j = 0; j < size; j++) - vec_safe_push (decls, - streamer_tree_cache_get_tree (data_in->reader_cache, - data[j])); - - state->streams[i] = decls; - data += size; - } - - return data; -} - - -/* Global canonical type table. */ -static htab_t gimple_canonical_types; -static hash_map *canonical_type_hash_cache; -static unsigned long num_canonical_type_hash_entries; -static unsigned long num_canonical_type_hash_queries; - -static void iterative_hash_canonical_type (tree type, inchash::hash &hstate); -static hashval_t gimple_canonical_type_hash (const void *p); -static void gimple_register_canonical_type_1 (tree t, hashval_t hash); - -/* Returning a hash value for gimple type TYPE. - - The hash value returned is equal for types considered compatible - by gimple_canonical_types_compatible_p. */ - -static hashval_t -hash_canonical_type (tree type) -{ - inchash::hash hstate; - enum tree_code code; - - /* We compute alias sets only for types that needs them. - Be sure we do not recurse to something else as we cannot hash incomplete - types in a way they would have same hash value as compatible complete - types. */ - gcc_checking_assert (type_with_alias_set_p (type)); - - /* Combine a few common features of types so that types are grouped into - smaller sets; when searching for existing matching types to merge, - only existing types having the same features as the new type will be - checked. */ - code = tree_code_for_canonical_type_merging (TREE_CODE (type)); - hstate.add_int (code); - hstate.add_int (TYPE_MODE (type)); - - /* Incorporate common features of numerical types. */ - if (INTEGRAL_TYPE_P (type) - || SCALAR_FLOAT_TYPE_P (type) - || FIXED_POINT_TYPE_P (type) - || TREE_CODE (type) == OFFSET_TYPE - || POINTER_TYPE_P (type)) - { - hstate.add_int (TYPE_PRECISION (type)); - if (!type_with_interoperable_signedness (type)) - hstate.add_int (TYPE_UNSIGNED (type)); - } - - if (VECTOR_TYPE_P (type)) - { - hstate.add_poly_int (TYPE_VECTOR_SUBPARTS (type)); - hstate.add_int (TYPE_UNSIGNED (type)); - } - - if (TREE_CODE (type) == COMPLEX_TYPE) - hstate.add_int (TYPE_UNSIGNED (type)); - - /* Fortran's C_SIGNED_CHAR is !TYPE_STRING_FLAG but needs to be - interoperable with "signed char". Unless all frontends are revisited to - agree on these types, we must ignore the flag completely. */ - - /* Fortran standard define C_PTR type that is compatible with every - C pointer. For this reason we need to glob all pointers into one. - Still pointers in different address spaces are not compatible. */ - if (POINTER_TYPE_P (type)) - hstate.add_int (TYPE_ADDR_SPACE (TREE_TYPE (type))); - - /* For array types hash the domain bounds and the string flag. */ - if (TREE_CODE (type) == ARRAY_TYPE && TYPE_DOMAIN (type)) - { - hstate.add_int (TYPE_STRING_FLAG (type)); - /* OMP lowering can introduce error_mark_node in place of - random local decls in types. 
*/ - if (TYPE_MIN_VALUE (TYPE_DOMAIN (type)) != error_mark_node) - inchash::add_expr (TYPE_MIN_VALUE (TYPE_DOMAIN (type)), hstate); - if (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) != error_mark_node) - inchash::add_expr (TYPE_MAX_VALUE (TYPE_DOMAIN (type)), hstate); - } - - /* Recurse for aggregates with a single element type. */ - if (TREE_CODE (type) == ARRAY_TYPE - || TREE_CODE (type) == COMPLEX_TYPE - || TREE_CODE (type) == VECTOR_TYPE) - iterative_hash_canonical_type (TREE_TYPE (type), hstate); - - /* Incorporate function return and argument types. */ - if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) - { - unsigned na; - tree p; - - iterative_hash_canonical_type (TREE_TYPE (type), hstate); - - for (p = TYPE_ARG_TYPES (type), na = 0; p; p = TREE_CHAIN (p)) - { - iterative_hash_canonical_type (TREE_VALUE (p), hstate); - na++; - } - - hstate.add_int (na); - } - - if (RECORD_OR_UNION_TYPE_P (type)) - { - unsigned nf; - tree f; - - for (f = TYPE_FIELDS (type), nf = 0; f; f = TREE_CHAIN (f)) - if (TREE_CODE (f) == FIELD_DECL - && (! DECL_SIZE (f) - || ! integer_zerop (DECL_SIZE (f)))) - { - iterative_hash_canonical_type (TREE_TYPE (f), hstate); - nf++; - } - - hstate.add_int (nf); - } - - return hstate.end(); -} - -/* Returning a hash value for gimple type TYPE combined with VAL. */ - -static void -iterative_hash_canonical_type (tree type, inchash::hash &hstate) -{ - hashval_t v; - - /* All type variants have same TYPE_CANONICAL. */ - type = TYPE_MAIN_VARIANT (type); - - if (!canonical_type_used_p (type)) - v = hash_canonical_type (type); - /* An already processed type. */ - else if (TYPE_CANONICAL (type)) - { - type = TYPE_CANONICAL (type); - v = gimple_canonical_type_hash (type); - } - else - { - /* Canonical types should not be able to form SCCs by design, this - recursion is just because we do not register canonical types in - optimal order. To avoid quadratic behavior also register the - type here. */ - v = hash_canonical_type (type); - gimple_register_canonical_type_1 (type, v); - } - hstate.add_int (v); -} - -/* Returns the hash for a canonical type P. */ - -static hashval_t -gimple_canonical_type_hash (const void *p) -{ - num_canonical_type_hash_queries++; - hashval_t *slot = canonical_type_hash_cache->get ((const_tree) p); - gcc_assert (slot != NULL); - return *slot; -} - - - -/* Returns nonzero if P1 and P2 are equal. */ - -static int -gimple_canonical_type_eq (const void *p1, const void *p2) -{ - const_tree t1 = (const_tree) p1; - const_tree t2 = (const_tree) p2; - return gimple_canonical_types_compatible_p (CONST_CAST_TREE (t1), - CONST_CAST_TREE (t2)); -} - -/* Main worker for gimple_register_canonical_type. */ - -static void -gimple_register_canonical_type_1 (tree t, hashval_t hash) -{ - void **slot; - - gcc_checking_assert (TYPE_P (t) && !TYPE_CANONICAL (t) - && type_with_alias_set_p (t) - && canonical_type_used_p (t)); - - slot = htab_find_slot_with_hash (gimple_canonical_types, t, hash, INSERT); - if (*slot) - { - tree new_type = (tree)(*slot); - gcc_checking_assert (new_type != t); - TYPE_CANONICAL (t) = new_type; - } - else - { - TYPE_CANONICAL (t) = t; - *slot = (void *) t; - /* Cache the just computed hash value. */ - num_canonical_type_hash_entries++; - bool existed_p = canonical_type_hash_cache->put (t, hash); - gcc_assert (!existed_p); - } -} - -/* Register type T in the global type table gimple_types and set - TYPE_CANONICAL of T accordingly. 
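Side note (editorial sketch, not part of the patch): hash_canonical_type above folds a few structural properties — tree code, mode, precision, signedness, and the types reachable through fields and argument lists — into one hash, so that structurally compatible types from different translation units land in the same gimple_canonical_types bucket. A rough standalone C sketch of that folding pattern, with hypothetical names (toy_type, toy_hash_canonical_type) and a trivial mixer standing in for inchash:

#include <stdint.h>
#include <stdio.h>

/* Trivial stand-in for inchash::hash: mix VAL into *H.  */
static void
hash_mix (uint32_t *h, uint32_t val)
{
  *h = (*h ^ val) * 0x9e3779b1u;
}

/* Hypothetical, much simplified type descriptor.  */
struct toy_type
{
  int code;                       /* INTEGER, POINTER, RECORD, ...  */
  int precision;
  int is_unsigned;
  int nfields;
  const struct toy_type **fields; /* element/field/argument types.  */
};

/* Fold the same kind of structural features hash_canonical_type uses
   into one value; the recursion mirrors the field/argument walk.  */
static uint32_t
toy_hash_canonical_type (const struct toy_type *t)
{
  uint32_t h = 0;
  hash_mix (&h, (uint32_t) t->code);
  hash_mix (&h, (uint32_t) t->precision);
  hash_mix (&h, (uint32_t) t->is_unsigned);
  for (int i = 0; i < t->nfields; i++)
    hash_mix (&h, toy_hash_canonical_type (t->fields[i]));
  hash_mix (&h, (uint32_t) t->nfields);
  return h;
}

int
main (void)
{
  /* Two structurally identical records hash the same; a scalar differs.  */
  struct toy_type int_t = { 1, 32, 0, 0, NULL };
  const struct toy_type *f[] = { &int_t };
  struct toy_type rec_a = { 2, 0, 0, 1, f };
  struct toy_type rec_b = { 2, 0, 0, 1, f };
  printf ("%u %u %u\n", (unsigned) toy_hash_canonical_type (&int_t),
          (unsigned) toy_hash_canonical_type (&rec_a),
          (unsigned) toy_hash_canonical_type (&rec_b));
  return 0;
}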
- This is used by LTO to merge structurally equivalent types for - type-based aliasing purposes across different TUs and languages. - - ??? This merging does not exactly match how the tree.c middle-end - functions will assign TYPE_CANONICAL when new types are created - during optimization (which at least happens for pointer and array - types). */ - -static void -gimple_register_canonical_type (tree t) -{ - if (TYPE_CANONICAL (t) || !type_with_alias_set_p (t) - || !canonical_type_used_p (t)) - return; - - /* Canonical types are same among all complete variants. */ - if (TYPE_CANONICAL (TYPE_MAIN_VARIANT (t))) - TYPE_CANONICAL (t) = TYPE_CANONICAL (TYPE_MAIN_VARIANT (t)); - else - { - gimple_register_canonical_type_1 (TYPE_MAIN_VARIANT (t), - hash_canonical_type (TYPE_MAIN_VARIANT (t))); - TYPE_CANONICAL (t) = TYPE_CANONICAL (TYPE_MAIN_VARIANT (t)); - } -} - -/* Re-compute TYPE_CANONICAL for NODE and related types. */ +/* Materialize all the bodies for all the nodes in the callgraph. */ static void -lto_register_canonical_types (tree node, bool first_p) -{ - if (!node - || !TYPE_P (node)) - return; - - if (first_p) - TYPE_CANONICAL (node) = NULL_TREE; - - if (POINTER_TYPE_P (node) - || TREE_CODE (node) == COMPLEX_TYPE - || TREE_CODE (node) == ARRAY_TYPE) - lto_register_canonical_types (TREE_TYPE (node), first_p); - - if (!first_p) - gimple_register_canonical_type (node); -} - - -/* Remember trees that contains references to declarations. */ -static GTY(()) vec *tree_with_vars; - -#define CHECK_VAR(tt) \ - do \ - { \ - if ((tt) && VAR_OR_FUNCTION_DECL_P (tt) \ - && (TREE_PUBLIC (tt) || DECL_EXTERNAL (tt))) \ - return true; \ - } while (0) - -#define CHECK_NO_VAR(tt) \ - gcc_checking_assert (!(tt) || !VAR_OR_FUNCTION_DECL_P (tt)) - -/* Check presence of pointers to decls in fields of a tree_typed T. */ - -static inline bool -mentions_vars_p_typed (tree t) -{ - CHECK_NO_VAR (TREE_TYPE (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a tree_common T. */ - -static inline bool -mentions_vars_p_common (tree t) -{ - if (mentions_vars_p_typed (t)) - return true; - CHECK_NO_VAR (TREE_CHAIN (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a decl_minimal T. */ - -static inline bool -mentions_vars_p_decl_minimal (tree t) -{ - if (mentions_vars_p_common (t)) - return true; - CHECK_NO_VAR (DECL_NAME (t)); - CHECK_VAR (DECL_CONTEXT (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a decl_common T. */ - -static inline bool -mentions_vars_p_decl_common (tree t) -{ - if (mentions_vars_p_decl_minimal (t)) - return true; - CHECK_VAR (DECL_SIZE (t)); - CHECK_VAR (DECL_SIZE_UNIT (t)); - CHECK_VAR (DECL_INITIAL (t)); - CHECK_NO_VAR (DECL_ATTRIBUTES (t)); - CHECK_VAR (DECL_ABSTRACT_ORIGIN (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a decl_with_vis T. */ - -static inline bool -mentions_vars_p_decl_with_vis (tree t) -{ - if (mentions_vars_p_decl_common (t)) - return true; - - /* Accessor macro has side-effects, use field-name here. */ - CHECK_NO_VAR (DECL_ASSEMBLER_NAME_RAW (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a decl_non_common T. */ - -static inline bool -mentions_vars_p_decl_non_common (tree t) -{ - if (mentions_vars_p_decl_with_vis (t)) - return true; - CHECK_NO_VAR (DECL_RESULT_FLD (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a decl_non_common T. 
*/ - -static bool -mentions_vars_p_function (tree t) -{ - if (mentions_vars_p_decl_non_common (t)) - return true; - CHECK_NO_VAR (DECL_ARGUMENTS (t)); - CHECK_NO_VAR (DECL_VINDEX (t)); - CHECK_VAR (DECL_FUNCTION_PERSONALITY (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a field_decl T. */ - -static bool -mentions_vars_p_field_decl (tree t) -{ - if (mentions_vars_p_decl_common (t)) - return true; - CHECK_VAR (DECL_FIELD_OFFSET (t)); - CHECK_NO_VAR (DECL_BIT_FIELD_TYPE (t)); - CHECK_NO_VAR (DECL_QUALIFIER (t)); - CHECK_NO_VAR (DECL_FIELD_BIT_OFFSET (t)); - CHECK_NO_VAR (DECL_FCONTEXT (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a type T. */ - -static bool -mentions_vars_p_type (tree t) -{ - if (mentions_vars_p_common (t)) - return true; - CHECK_NO_VAR (TYPE_CACHED_VALUES (t)); - CHECK_VAR (TYPE_SIZE (t)); - CHECK_VAR (TYPE_SIZE_UNIT (t)); - CHECK_NO_VAR (TYPE_ATTRIBUTES (t)); - CHECK_NO_VAR (TYPE_NAME (t)); - - CHECK_VAR (TYPE_MIN_VALUE_RAW (t)); - CHECK_VAR (TYPE_MAX_VALUE_RAW (t)); - - /* Accessor is for derived node types only. */ - CHECK_NO_VAR (TYPE_LANG_SLOT_1 (t)); - - CHECK_VAR (TYPE_CONTEXT (t)); - CHECK_NO_VAR (TYPE_CANONICAL (t)); - CHECK_NO_VAR (TYPE_MAIN_VARIANT (t)); - CHECK_NO_VAR (TYPE_NEXT_VARIANT (t)); - return false; -} - -/* Check presence of pointers to decls in fields of a BINFO T. */ - -static bool -mentions_vars_p_binfo (tree t) -{ - unsigned HOST_WIDE_INT i, n; - - if (mentions_vars_p_common (t)) - return true; - CHECK_VAR (BINFO_VTABLE (t)); - CHECK_NO_VAR (BINFO_OFFSET (t)); - CHECK_NO_VAR (BINFO_VIRTUALS (t)); - CHECK_NO_VAR (BINFO_VPTR_FIELD (t)); - n = vec_safe_length (BINFO_BASE_ACCESSES (t)); - for (i = 0; i < n; i++) - CHECK_NO_VAR (BINFO_BASE_ACCESS (t, i)); - /* Do not walk BINFO_INHERITANCE_CHAIN, BINFO_SUBVTT_INDEX - and BINFO_VPTR_INDEX; these are used by C++ FE only. */ - n = BINFO_N_BASE_BINFOS (t); - for (i = 0; i < n; i++) - CHECK_NO_VAR (BINFO_BASE_BINFO (t, i)); - return false; -} - -/* Check presence of pointers to decls in fields of a CONSTRUCTOR T. */ - -static bool -mentions_vars_p_constructor (tree t) -{ - unsigned HOST_WIDE_INT idx; - constructor_elt *ce; - - if (mentions_vars_p_typed (t)) - return true; - - for (idx = 0; vec_safe_iterate (CONSTRUCTOR_ELTS (t), idx, &ce); idx++) - { - CHECK_NO_VAR (ce->index); - CHECK_VAR (ce->value); - } - return false; -} - -/* Check presence of pointers to decls in fields of an expression tree T. */ - -static bool -mentions_vars_p_expr (tree t) -{ - int i; - if (mentions_vars_p_typed (t)) - return true; - for (i = TREE_OPERAND_LENGTH (t) - 1; i >= 0; --i) - CHECK_VAR (TREE_OPERAND (t, i)); - return false; -} - -/* Check presence of pointers to decls in fields of an OMP_CLAUSE T. */ - -static bool -mentions_vars_p_omp_clause (tree t) -{ - int i; - if (mentions_vars_p_common (t)) - return true; - for (i = omp_clause_num_ops[OMP_CLAUSE_CODE (t)] - 1; i >= 0; --i) - CHECK_VAR (OMP_CLAUSE_OPERAND (t, i)); - return false; -} - -/* Check presence of pointers to decls that needs later fixup in T. 
*/ - -static bool -mentions_vars_p (tree t) +materialize_cgraph (void) { - switch (TREE_CODE (t)) - { - case IDENTIFIER_NODE: - break; - - case TREE_LIST: - CHECK_VAR (TREE_VALUE (t)); - CHECK_VAR (TREE_PURPOSE (t)); - CHECK_NO_VAR (TREE_CHAIN (t)); - break; - - case FIELD_DECL: - return mentions_vars_p_field_decl (t); - - case LABEL_DECL: - case CONST_DECL: - case PARM_DECL: - case RESULT_DECL: - case IMPORTED_DECL: - case NAMESPACE_DECL: - case NAMELIST_DECL: - return mentions_vars_p_decl_common (t); - - case VAR_DECL: - return mentions_vars_p_decl_with_vis (t); - - case TYPE_DECL: - return mentions_vars_p_decl_non_common (t); - - case FUNCTION_DECL: - return mentions_vars_p_function (t); - - case TREE_BINFO: - return mentions_vars_p_binfo (t); - - case PLACEHOLDER_EXPR: - return mentions_vars_p_common (t); - - case BLOCK: - case TRANSLATION_UNIT_DECL: - case OPTIMIZATION_NODE: - case TARGET_OPTION_NODE: - break; - - case CONSTRUCTOR: - return mentions_vars_p_constructor (t); - - case OMP_CLAUSE: - return mentions_vars_p_omp_clause (t); - - default: - if (TYPE_P (t)) - { - if (mentions_vars_p_type (t)) - return true; - } - else if (EXPR_P (t)) - { - if (mentions_vars_p_expr (t)) - return true; - } - else if (CONSTANT_CLASS_P (t)) - CHECK_NO_VAR (TREE_TYPE (t)); - else - gcc_unreachable (); - } - return false; -} - - -/* Return the resolution for the decl with index INDEX from DATA_IN. */ - -static enum ld_plugin_symbol_resolution -get_resolution (struct data_in *data_in, unsigned index) -{ - if (data_in->globals_resolution.exists ()) - { - ld_plugin_symbol_resolution_t ret; - /* We can have references to not emitted functions in - DECL_FUNCTION_PERSONALITY at least. So we can and have - to indeed return LDPR_UNKNOWN in some cases. */ - if (data_in->globals_resolution.length () <= index) - return LDPR_UNKNOWN; - ret = data_in->globals_resolution[index]; - return ret; - } - else - /* Delay resolution finding until decl merging. */ - return LDPR_UNKNOWN; -} - -/* We need to record resolutions until symbol table is read. */ -static void -register_resolution (struct lto_file_decl_data *file_data, tree decl, - enum ld_plugin_symbol_resolution resolution) -{ - bool existed; - if (resolution == LDPR_UNKNOWN) - return; - if (!file_data->resolution_map) - file_data->resolution_map - = new hash_map; - ld_plugin_symbol_resolution_t &res - = file_data->resolution_map->get_or_insert (decl, &existed); - if (!existed - || resolution == LDPR_PREVAILING_DEF_IRONLY - || resolution == LDPR_PREVAILING_DEF - || resolution == LDPR_PREVAILING_DEF_IRONLY_EXP) - res = resolution; -} - -/* Register DECL with the global symbol table and change its - name if necessary to avoid name clashes for static globals across - different files. */ - -static void -lto_register_var_decl_in_symtab (struct data_in *data_in, tree decl, - unsigned ix) -{ - tree context; - - /* Variable has file scope, not local. */ - if (!TREE_PUBLIC (decl) - && !((context = decl_function_context (decl)) - && auto_var_in_fn_p (decl, context))) - rest_of_decl_compilation (decl, 1, 0); - - /* If this variable has already been declared, queue the - declaration for merging. */ - if (TREE_PUBLIC (decl)) - register_resolution (data_in->file_data, - decl, get_resolution (data_in, ix)); -} - - -/* Register DECL with the global symbol table and change its - name if necessary to avoid name clashes for static globals across - different files. DATA_IN contains descriptors and tables for the - file being read. 
*/ - -static void -lto_register_function_decl_in_symtab (struct data_in *data_in, tree decl, - unsigned ix) -{ - /* If this variable has already been declared, queue the - declaration for merging. */ - if (TREE_PUBLIC (decl) && !DECL_ABSTRACT_P (decl)) - register_resolution (data_in->file_data, - decl, get_resolution (data_in, ix)); -} - -/* Check if T is a decl and needs register its resolution info. */ - -static void -lto_maybe_register_decl (struct data_in *data_in, tree t, unsigned ix) -{ - if (TREE_CODE (t) == VAR_DECL) - lto_register_var_decl_in_symtab (data_in, t, ix); - else if (TREE_CODE (t) == FUNCTION_DECL - && !fndecl_built_in_p (t)) - lto_register_function_decl_in_symtab (data_in, t, ix); -} - - -/* For the type T re-materialize it in the type variant list and - the pointer/reference-to chains. */ - -static void -lto_fixup_prevailing_type (tree t) -{ - /* The following re-creates proper variant lists while fixing up - the variant leaders. We do not stream TYPE_NEXT_VARIANT so the - variant list state before fixup is broken. */ - - /* If we are not our own variant leader link us into our new leaders - variant list. */ - if (TYPE_MAIN_VARIANT (t) != t) - { - tree mv = TYPE_MAIN_VARIANT (t); - TYPE_NEXT_VARIANT (t) = TYPE_NEXT_VARIANT (mv); - TYPE_NEXT_VARIANT (mv) = t; - } - - /* The following reconstructs the pointer chains - of the new pointed-to type if we are a main variant. We do - not stream those so they are broken before fixup. */ - if (TREE_CODE (t) == POINTER_TYPE - && TYPE_MAIN_VARIANT (t) == t) - { - TYPE_NEXT_PTR_TO (t) = TYPE_POINTER_TO (TREE_TYPE (t)); - TYPE_POINTER_TO (TREE_TYPE (t)) = t; - } - else if (TREE_CODE (t) == REFERENCE_TYPE - && TYPE_MAIN_VARIANT (t) == t) - { - TYPE_NEXT_REF_TO (t) = TYPE_REFERENCE_TO (TREE_TYPE (t)); - TYPE_REFERENCE_TO (TREE_TYPE (t)) = t; - } -} - - -/* We keep prevailing tree SCCs in a hashtable with manual collision - handling (in case all hashes compare the same) and keep the colliding - entries in the tree_scc->next chain. */ - -struct tree_scc -{ - tree_scc *next; - /* Hash of the whole SCC. */ - hashval_t hash; - /* Number of trees in the SCC. */ - unsigned len; - /* Number of possible entries into the SCC (tree nodes [0..entry_len-1] - which share the same individual tree hash). */ - unsigned entry_len; - /* The members of the SCC. - We only need to remember the first entry node candidate for prevailing - SCCs (but of course have access to all entries for SCCs we are - processing). - ??? For prevailing SCCs we really only need hash and the first - entry candidate, but that's too awkward to implement. 
*/ - tree entries[1]; -}; - -struct tree_scc_hasher : nofree_ptr_hash -{ - static inline hashval_t hash (const tree_scc *); - static inline bool equal (const tree_scc *, const tree_scc *); -}; - -hashval_t -tree_scc_hasher::hash (const tree_scc *scc) -{ - return scc->hash; -} - -bool -tree_scc_hasher::equal (const tree_scc *scc1, const tree_scc *scc2) -{ - if (scc1->hash != scc2->hash - || scc1->len != scc2->len - || scc1->entry_len != scc2->entry_len) - return false; - return true; -} - -static hash_table *tree_scc_hash; -static struct obstack tree_scc_hash_obstack; - -static unsigned long num_merged_types; -static unsigned long num_prevailing_types; -static unsigned long num_type_scc_trees; -static unsigned long total_scc_size; -static unsigned long num_sccs_read; -static unsigned long total_scc_size_merged; -static unsigned long num_sccs_merged; -static unsigned long num_scc_compares; -static unsigned long num_scc_compare_collisions; - - -/* Compare the two entries T1 and T2 of two SCCs that are possibly equal, - recursing through in-SCC tree edges. Returns true if the SCCs entered - through T1 and T2 are equal and fills in *MAP with the pairs of - SCC entries we visited, starting with (*MAP)[0] = T1 and (*MAP)[1] = T2. */ - -static bool -compare_tree_sccs_1 (tree t1, tree t2, tree **map) -{ - enum tree_code code; - - /* Mark already visited nodes. */ - TREE_ASM_WRITTEN (t2) = 1; - - /* Push the pair onto map. */ - (*map)[0] = t1; - (*map)[1] = t2; - *map = *map + 2; - - /* Compare value-fields. */ -#define compare_values(X) \ - do { \ - if (X(t1) != X(t2)) \ - return false; \ - } while (0) - - compare_values (TREE_CODE); - code = TREE_CODE (t1); - - if (!TYPE_P (t1)) - { - compare_values (TREE_SIDE_EFFECTS); - compare_values (TREE_CONSTANT); - compare_values (TREE_READONLY); - compare_values (TREE_PUBLIC); - } - compare_values (TREE_ADDRESSABLE); - compare_values (TREE_THIS_VOLATILE); - if (DECL_P (t1)) - compare_values (DECL_UNSIGNED); - else if (TYPE_P (t1)) - compare_values (TYPE_UNSIGNED); - if (TYPE_P (t1)) - compare_values (TYPE_ARTIFICIAL); - else - compare_values (TREE_NO_WARNING); - compare_values (TREE_NOTHROW); - compare_values (TREE_STATIC); - if (code != TREE_BINFO) - compare_values (TREE_PRIVATE); - compare_values (TREE_PROTECTED); - compare_values (TREE_DEPRECATED); - if (TYPE_P (t1)) - { - if (AGGREGATE_TYPE_P (t1)) - compare_values (TYPE_REVERSE_STORAGE_ORDER); - else - compare_values (TYPE_SATURATING); - compare_values (TYPE_ADDR_SPACE); - } - else if (code == SSA_NAME) - compare_values (SSA_NAME_IS_DEFAULT_DEF); - - if (CODE_CONTAINS_STRUCT (code, TS_INT_CST)) - { - if (wi::to_wide (t1) != wi::to_wide (t2)) - return false; - } - - if (CODE_CONTAINS_STRUCT (code, TS_REAL_CST)) - { - /* ??? No suitable compare routine available. 
*/ - REAL_VALUE_TYPE r1 = TREE_REAL_CST (t1); - REAL_VALUE_TYPE r2 = TREE_REAL_CST (t2); - if (r1.cl != r2.cl - || r1.decimal != r2.decimal - || r1.sign != r2.sign - || r1.signalling != r2.signalling - || r1.canonical != r2.canonical - || r1.uexp != r2.uexp) - return false; - for (unsigned i = 0; i < SIGSZ; ++i) - if (r1.sig[i] != r2.sig[i]) - return false; - } - - if (CODE_CONTAINS_STRUCT (code, TS_FIXED_CST)) - if (!fixed_compare (EQ_EXPR, - TREE_FIXED_CST_PTR (t1), TREE_FIXED_CST_PTR (t2))) - return false; - - if (CODE_CONTAINS_STRUCT (code, TS_VECTOR)) - { - compare_values (VECTOR_CST_LOG2_NPATTERNS); - compare_values (VECTOR_CST_NELTS_PER_PATTERN); - } - - if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) - { - compare_values (DECL_MODE); - compare_values (DECL_NONLOCAL); - compare_values (DECL_VIRTUAL_P); - compare_values (DECL_IGNORED_P); - compare_values (DECL_ABSTRACT_P); - compare_values (DECL_ARTIFICIAL); - compare_values (DECL_USER_ALIGN); - compare_values (DECL_PRESERVE_P); - compare_values (DECL_EXTERNAL); - compare_values (DECL_GIMPLE_REG_P); - compare_values (DECL_ALIGN); - if (code == LABEL_DECL) - { - compare_values (EH_LANDING_PAD_NR); - compare_values (LABEL_DECL_UID); - } - else if (code == FIELD_DECL) - { - compare_values (DECL_PACKED); - compare_values (DECL_NONADDRESSABLE_P); - compare_values (DECL_PADDING_P); - compare_values (DECL_OFFSET_ALIGN); - } - else if (code == VAR_DECL) - { - compare_values (DECL_HAS_DEBUG_EXPR_P); - compare_values (DECL_NONLOCAL_FRAME); - } - if (code == RESULT_DECL - || code == PARM_DECL - || code == VAR_DECL) - { - compare_values (DECL_BY_REFERENCE); - if (code == VAR_DECL - || code == PARM_DECL) - compare_values (DECL_HAS_VALUE_EXPR_P); - } - } - - if (CODE_CONTAINS_STRUCT (code, TS_DECL_WRTL)) - compare_values (DECL_REGISTER); - - if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) - { - compare_values (DECL_COMMON); - compare_values (DECL_DLLIMPORT_P); - compare_values (DECL_WEAK); - compare_values (DECL_SEEN_IN_BIND_EXPR_P); - compare_values (DECL_COMDAT); - compare_values (DECL_VISIBILITY); - compare_values (DECL_VISIBILITY_SPECIFIED); - if (code == VAR_DECL) - { - compare_values (DECL_HARD_REGISTER); - /* DECL_IN_TEXT_SECTION is set during final asm output only. 
*/ - compare_values (DECL_IN_CONSTANT_POOL); - } - } - - if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) - { - compare_values (DECL_BUILT_IN_CLASS); - compare_values (DECL_STATIC_CONSTRUCTOR); - compare_values (DECL_STATIC_DESTRUCTOR); - compare_values (DECL_UNINLINABLE); - compare_values (DECL_POSSIBLY_INLINED); - compare_values (DECL_IS_NOVOPS); - compare_values (DECL_IS_RETURNS_TWICE); - compare_values (DECL_IS_MALLOC); - compare_values (DECL_IS_OPERATOR_NEW); - compare_values (DECL_DECLARED_INLINE_P); - compare_values (DECL_STATIC_CHAIN); - compare_values (DECL_NO_INLINE_WARNING_P); - compare_values (DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT); - compare_values (DECL_NO_LIMIT_STACK); - compare_values (DECL_DISREGARD_INLINE_LIMITS); - compare_values (DECL_PURE_P); - compare_values (DECL_LOOPING_CONST_OR_PURE_P); - compare_values (DECL_FINAL_P); - compare_values (DECL_CXX_CONSTRUCTOR_P); - compare_values (DECL_CXX_DESTRUCTOR_P); - if (DECL_BUILT_IN_CLASS (t1) != NOT_BUILT_IN) - compare_values (DECL_FUNCTION_CODE); - } - - if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) - { - compare_values (TYPE_MODE); - compare_values (TYPE_STRING_FLAG); - compare_values (TYPE_NEEDS_CONSTRUCTING); - if (RECORD_OR_UNION_TYPE_P (t1)) - { - compare_values (TYPE_TRANSPARENT_AGGR); - compare_values (TYPE_FINAL_P); - } - else if (code == ARRAY_TYPE) - compare_values (TYPE_NONALIASED_COMPONENT); - if (AGGREGATE_TYPE_P (t1)) - compare_values (TYPE_TYPELESS_STORAGE); - compare_values (TYPE_EMPTY_P); - compare_values (TYPE_PACKED); - compare_values (TYPE_RESTRICT); - compare_values (TYPE_USER_ALIGN); - compare_values (TYPE_READONLY); - compare_values (TYPE_PRECISION); - compare_values (TYPE_ALIGN); - /* Do not compare TYPE_ALIAS_SET. Doing so introduce ordering issues - with calls to get_alias_set which may initialize it for streamed - in types. */ - } - - /* We don't want to compare locations, so there is nothing do compare - for TS_EXP. */ - - /* BLOCKs are function local and we don't merge anything there, so - simply refuse to merge. 
*/ - if (CODE_CONTAINS_STRUCT (code, TS_BLOCK)) - return false; - - if (CODE_CONTAINS_STRUCT (code, TS_TRANSLATION_UNIT_DECL)) - if (strcmp (TRANSLATION_UNIT_LANGUAGE (t1), - TRANSLATION_UNIT_LANGUAGE (t2)) != 0) - return false; - - if (CODE_CONTAINS_STRUCT (code, TS_TARGET_OPTION)) - if (!cl_target_option_eq (TREE_TARGET_OPTION (t1), TREE_TARGET_OPTION (t2))) - return false; - - if (CODE_CONTAINS_STRUCT (code, TS_OPTIMIZATION)) - if (!cl_optimization_option_eq (TREE_OPTIMIZATION (t1), - TREE_OPTIMIZATION (t2))) - return false; - - if (CODE_CONTAINS_STRUCT (code, TS_BINFO)) - if (vec_safe_length (BINFO_BASE_ACCESSES (t1)) - != vec_safe_length (BINFO_BASE_ACCESSES (t2))) - return false; - - if (CODE_CONTAINS_STRUCT (code, TS_CONSTRUCTOR)) - compare_values (CONSTRUCTOR_NELTS); - - if (CODE_CONTAINS_STRUCT (code, TS_IDENTIFIER)) - if (IDENTIFIER_LENGTH (t1) != IDENTIFIER_LENGTH (t2) - || memcmp (IDENTIFIER_POINTER (t1), IDENTIFIER_POINTER (t2), - IDENTIFIER_LENGTH (t1)) != 0) - return false; - - if (CODE_CONTAINS_STRUCT (code, TS_STRING)) - if (TREE_STRING_LENGTH (t1) != TREE_STRING_LENGTH (t2) - || memcmp (TREE_STRING_POINTER (t1), TREE_STRING_POINTER (t2), - TREE_STRING_LENGTH (t1)) != 0) - return false; - - if (code == OMP_CLAUSE) - { - compare_values (OMP_CLAUSE_CODE); - switch (OMP_CLAUSE_CODE (t1)) - { - case OMP_CLAUSE_DEFAULT: - compare_values (OMP_CLAUSE_DEFAULT_KIND); - break; - case OMP_CLAUSE_SCHEDULE: - compare_values (OMP_CLAUSE_SCHEDULE_KIND); - break; - case OMP_CLAUSE_DEPEND: - compare_values (OMP_CLAUSE_DEPEND_KIND); - break; - case OMP_CLAUSE_MAP: - compare_values (OMP_CLAUSE_MAP_KIND); - break; - case OMP_CLAUSE_PROC_BIND: - compare_values (OMP_CLAUSE_PROC_BIND_KIND); - break; - case OMP_CLAUSE_REDUCTION: - compare_values (OMP_CLAUSE_REDUCTION_CODE); - compare_values (OMP_CLAUSE_REDUCTION_GIMPLE_INIT); - compare_values (OMP_CLAUSE_REDUCTION_GIMPLE_MERGE); - break; - default: - break; - } - } - -#undef compare_values - - - /* Compare pointer fields. */ - - /* Recurse. Search & Replaced from DFS_write_tree_body. - Folding the early checks into the compare_tree_edges recursion - macro makes debugging way quicker as you are able to break on - compare_tree_sccs_1 and simply finish until a call returns false - to spot the SCC members with the difference. */ -#define compare_tree_edges(E1, E2) \ - do { \ - tree t1_ = (E1), t2_ = (E2); \ - if (t1_ != t2_ \ - && (!t1_ || !t2_ \ - || !TREE_VISITED (t2_) \ - || (!TREE_ASM_WRITTEN (t2_) \ - && !compare_tree_sccs_1 (t1_, t2_, map)))) \ - return false; \ - /* Only non-NULL trees outside of the SCC may compare equal. */ \ - gcc_checking_assert (t1_ != t2_ || (!t2_ || !TREE_VISITED (t2_))); \ - } while (0) - - if (CODE_CONTAINS_STRUCT (code, TS_TYPED)) - { - if (code != IDENTIFIER_NODE) - compare_tree_edges (TREE_TYPE (t1), TREE_TYPE (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_VECTOR)) - { - /* Note that the number of elements for EXPR has already been emitted - in EXPR's header (see streamer_write_tree_header). */ - unsigned int count = vector_cst_encoded_nelts (t1); - for (unsigned int i = 0; i < count; ++i) - compare_tree_edges (VECTOR_CST_ENCODED_ELT (t1, i), - VECTOR_CST_ENCODED_ELT (t2, i)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_COMPLEX)) - { - compare_tree_edges (TREE_REALPART (t1), TREE_REALPART (t2)); - compare_tree_edges (TREE_IMAGPART (t1), TREE_IMAGPART (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_DECL_MINIMAL)) - { - compare_tree_edges (DECL_NAME (t1), DECL_NAME (t2)); - /* ??? 
Global decls from different TUs have non-matching - TRANSLATION_UNIT_DECLs. Only consider a small set of - decls equivalent, we should not end up merging others. */ - if ((code == TYPE_DECL - || code == NAMESPACE_DECL - || code == IMPORTED_DECL - || code == CONST_DECL - || (VAR_OR_FUNCTION_DECL_P (t1) - && (TREE_PUBLIC (t1) || DECL_EXTERNAL (t1)))) - && DECL_FILE_SCOPE_P (t1) && DECL_FILE_SCOPE_P (t2)) - ; - else - compare_tree_edges (DECL_CONTEXT (t1), DECL_CONTEXT (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) - { - compare_tree_edges (DECL_SIZE (t1), DECL_SIZE (t2)); - compare_tree_edges (DECL_SIZE_UNIT (t1), DECL_SIZE_UNIT (t2)); - compare_tree_edges (DECL_ATTRIBUTES (t1), DECL_ATTRIBUTES (t2)); - compare_tree_edges (DECL_ABSTRACT_ORIGIN (t1), DECL_ABSTRACT_ORIGIN (t2)); - if ((code == VAR_DECL - || code == PARM_DECL) - && DECL_HAS_VALUE_EXPR_P (t1)) - compare_tree_edges (DECL_VALUE_EXPR (t1), DECL_VALUE_EXPR (t2)); - if (code == VAR_DECL - && DECL_HAS_DEBUG_EXPR_P (t1)) - compare_tree_edges (DECL_DEBUG_EXPR (t1), DECL_DEBUG_EXPR (t2)); - /* LTO specific edges. */ - if (code != FUNCTION_DECL - && code != TRANSLATION_UNIT_DECL) - compare_tree_edges (DECL_INITIAL (t1), DECL_INITIAL (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_DECL_NON_COMMON)) - { - if (code == FUNCTION_DECL) - { - tree a1, a2; - for (a1 = DECL_ARGUMENTS (t1), a2 = DECL_ARGUMENTS (t2); - a1 || a2; - a1 = TREE_CHAIN (a1), a2 = TREE_CHAIN (a2)) - compare_tree_edges (a1, a2); - compare_tree_edges (DECL_RESULT (t1), DECL_RESULT (t2)); - } - else if (code == TYPE_DECL) - compare_tree_edges (DECL_ORIGINAL_TYPE (t1), DECL_ORIGINAL_TYPE (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) - { - /* Make sure we don't inadvertently set the assembler name. */ - if (DECL_ASSEMBLER_NAME_SET_P (t1)) - compare_tree_edges (DECL_ASSEMBLER_NAME (t1), - DECL_ASSEMBLER_NAME (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_FIELD_DECL)) - { - compare_tree_edges (DECL_FIELD_OFFSET (t1), DECL_FIELD_OFFSET (t2)); - compare_tree_edges (DECL_BIT_FIELD_TYPE (t1), DECL_BIT_FIELD_TYPE (t2)); - compare_tree_edges (DECL_BIT_FIELD_REPRESENTATIVE (t1), - DECL_BIT_FIELD_REPRESENTATIVE (t2)); - compare_tree_edges (DECL_FIELD_BIT_OFFSET (t1), - DECL_FIELD_BIT_OFFSET (t2)); - compare_tree_edges (DECL_FCONTEXT (t1), DECL_FCONTEXT (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) - { - compare_tree_edges (DECL_FUNCTION_PERSONALITY (t1), - DECL_FUNCTION_PERSONALITY (t2)); - compare_tree_edges (DECL_VINDEX (t1), DECL_VINDEX (t2)); - compare_tree_edges (DECL_FUNCTION_SPECIFIC_TARGET (t1), - DECL_FUNCTION_SPECIFIC_TARGET (t2)); - compare_tree_edges (DECL_FUNCTION_SPECIFIC_OPTIMIZATION (t1), - DECL_FUNCTION_SPECIFIC_OPTIMIZATION (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_TYPE_COMMON)) - { - compare_tree_edges (TYPE_SIZE (t1), TYPE_SIZE (t2)); - compare_tree_edges (TYPE_SIZE_UNIT (t1), TYPE_SIZE_UNIT (t2)); - compare_tree_edges (TYPE_ATTRIBUTES (t1), TYPE_ATTRIBUTES (t2)); - compare_tree_edges (TYPE_NAME (t1), TYPE_NAME (t2)); - /* Do not compare TYPE_POINTER_TO or TYPE_REFERENCE_TO. They will be - reconstructed during fixup. */ - /* Do not compare TYPE_NEXT_VARIANT, we reconstruct the variant lists - during fixup. */ - compare_tree_edges (TYPE_MAIN_VARIANT (t1), TYPE_MAIN_VARIANT (t2)); - /* ??? Global types from different TUs have non-matching - TRANSLATION_UNIT_DECLs. Still merge them if they are otherwise - equal. 
*/ - if (TYPE_FILE_SCOPE_P (t1) && TYPE_FILE_SCOPE_P (t2)) - ; - else - compare_tree_edges (TYPE_CONTEXT (t1), TYPE_CONTEXT (t2)); - /* TYPE_CANONICAL is re-computed during type merging, so do not - compare it here. */ - compare_tree_edges (TYPE_STUB_DECL (t1), TYPE_STUB_DECL (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_TYPE_NON_COMMON)) - { - if (code == ENUMERAL_TYPE) - compare_tree_edges (TYPE_VALUES (t1), TYPE_VALUES (t2)); - else if (code == ARRAY_TYPE) - compare_tree_edges (TYPE_DOMAIN (t1), TYPE_DOMAIN (t2)); - else if (RECORD_OR_UNION_TYPE_P (t1)) - { - tree f1, f2; - for (f1 = TYPE_FIELDS (t1), f2 = TYPE_FIELDS (t2); - f1 || f2; - f1 = TREE_CHAIN (f1), f2 = TREE_CHAIN (f2)) - compare_tree_edges (f1, f2); - } - else if (code == FUNCTION_TYPE - || code == METHOD_TYPE) - compare_tree_edges (TYPE_ARG_TYPES (t1), TYPE_ARG_TYPES (t2)); - - if (!POINTER_TYPE_P (t1)) - compare_tree_edges (TYPE_MIN_VALUE_RAW (t1), TYPE_MIN_VALUE_RAW (t2)); - compare_tree_edges (TYPE_MAX_VALUE_RAW (t1), TYPE_MAX_VALUE_RAW (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_LIST)) - { - compare_tree_edges (TREE_PURPOSE (t1), TREE_PURPOSE (t2)); - compare_tree_edges (TREE_VALUE (t1), TREE_VALUE (t2)); - compare_tree_edges (TREE_CHAIN (t1), TREE_CHAIN (t2)); - } - - if (CODE_CONTAINS_STRUCT (code, TS_VEC)) - for (int i = 0; i < TREE_VEC_LENGTH (t1); i++) - compare_tree_edges (TREE_VEC_ELT (t1, i), TREE_VEC_ELT (t2, i)); - - if (CODE_CONTAINS_STRUCT (code, TS_EXP)) - { - for (int i = 0; i < TREE_OPERAND_LENGTH (t1); i++) - compare_tree_edges (TREE_OPERAND (t1, i), - TREE_OPERAND (t2, i)); - - /* BLOCKs are function local and we don't merge anything there. */ - if (TREE_BLOCK (t1) || TREE_BLOCK (t2)) - return false; - } - - if (CODE_CONTAINS_STRUCT (code, TS_BINFO)) - { - unsigned i; - tree t; - /* Lengths have already been compared above. */ - FOR_EACH_VEC_ELT (*BINFO_BASE_BINFOS (t1), i, t) - compare_tree_edges (t, BINFO_BASE_BINFO (t2, i)); - FOR_EACH_VEC_SAFE_ELT (BINFO_BASE_ACCESSES (t1), i, t) - compare_tree_edges (t, BINFO_BASE_ACCESS (t2, i)); - compare_tree_edges (BINFO_OFFSET (t1), BINFO_OFFSET (t2)); - compare_tree_edges (BINFO_VTABLE (t1), BINFO_VTABLE (t2)); - compare_tree_edges (BINFO_VPTR_FIELD (t1), BINFO_VPTR_FIELD (t2)); - /* Do not walk BINFO_INHERITANCE_CHAIN, BINFO_SUBVTT_INDEX - and BINFO_VPTR_INDEX; these are used by C++ FE only. */ - } - - if (CODE_CONTAINS_STRUCT (code, TS_CONSTRUCTOR)) - { - unsigned i; - tree index, value; - /* Lengths have already been compared above. */ - FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (t1), i, index, value) - { - compare_tree_edges (index, CONSTRUCTOR_ELT (t2, i)->index); - compare_tree_edges (value, CONSTRUCTOR_ELT (t2, i)->value); - } - } - - if (code == OMP_CLAUSE) - { - int i; - - for (i = 0; i < omp_clause_num_ops[OMP_CLAUSE_CODE (t1)]; i++) - compare_tree_edges (OMP_CLAUSE_OPERAND (t1, i), - OMP_CLAUSE_OPERAND (t2, i)); - compare_tree_edges (OMP_CLAUSE_CHAIN (t1), OMP_CLAUSE_CHAIN (t2)); - } - -#undef compare_tree_edges - - return true; -} - -/* Compare the tree scc SCC to the prevailing candidate PSCC, filling - out MAP if they are equal. */ - -static bool -compare_tree_sccs (tree_scc *pscc, tree_scc *scc, - tree *map) -{ - /* Assume SCC entry hashes are sorted after their cardinality. Which - means we can simply take the first n-tuple of equal hashes - (which is recorded as entry_len) and do n SCC entry candidate - comparisons. 
*/ - for (unsigned i = 0; i < pscc->entry_len; ++i) - { - tree *mapp = map; - num_scc_compare_collisions++; - if (compare_tree_sccs_1 (pscc->entries[0], scc->entries[i], &mapp)) - { - /* Equal - no need to reset TREE_VISITED or TREE_ASM_WRITTEN - on the scc as all trees will be freed. */ - return true; - } - /* Reset TREE_ASM_WRITTEN on scc for the next compare or in case - the SCC prevails. */ - for (unsigned j = 0; j < scc->len; ++j) - TREE_ASM_WRITTEN (scc->entries[j]) = 0; - } - - return false; -} - -/* QSort sort function to sort a map of two pointers after the 2nd - pointer. */ - -static int -cmp_tree (const void *p1_, const void *p2_) -{ - tree *p1 = (tree *)(const_cast(p1_)); - tree *p2 = (tree *)(const_cast(p2_)); - if (p1[1] == p2[1]) - return 0; - return ((uintptr_t)p1[1] < (uintptr_t)p2[1]) ? -1 : 1; -} - -/* Try to unify the SCC with nodes FROM to FROM + LEN in CACHE and - hash value SCC_HASH with an already recorded SCC. Return true if - that was successful, otherwise return false. */ - -static bool -unify_scc (struct data_in *data_in, unsigned from, - unsigned len, unsigned scc_entry_len, hashval_t scc_hash) -{ - bool unified_p = false; - struct streamer_tree_cache_d *cache = data_in->reader_cache; - tree_scc *scc - = (tree_scc *) alloca (sizeof (tree_scc) + (len - 1) * sizeof (tree)); - scc->next = NULL; - scc->hash = scc_hash; - scc->len = len; - scc->entry_len = scc_entry_len; - for (unsigned i = 0; i < len; ++i) - { - tree t = streamer_tree_cache_get_tree (cache, from + i); - scc->entries[i] = t; - /* Do not merge SCCs with local entities inside them. Also do - not merge TRANSLATION_UNIT_DECLs. */ - if (TREE_CODE (t) == TRANSLATION_UNIT_DECL - || (VAR_OR_FUNCTION_DECL_P (t) - && !(TREE_PUBLIC (t) || DECL_EXTERNAL (t))) - || TREE_CODE (t) == LABEL_DECL) - { - /* Avoid doing any work for these cases and do not worry to - record the SCCs for further merging. */ - return false; - } - } - - /* Look for the list of candidate SCCs to compare against. */ - tree_scc **slot; - slot = tree_scc_hash->find_slot_with_hash (scc, scc_hash, INSERT); - if (*slot) - { - /* Try unifying against each candidate. */ - num_scc_compares++; - - /* Set TREE_VISITED on the scc so we can easily identify tree nodes - outside of the scc when following tree edges. Make sure - that TREE_ASM_WRITTEN is unset so we can use it as 2nd bit - to track whether we visited the SCC member during the compare. - We cannot use TREE_VISITED on the pscc members as the extended - scc and pscc can overlap. */ - for (unsigned i = 0; i < scc->len; ++i) - { - TREE_VISITED (scc->entries[i]) = 1; - gcc_checking_assert (!TREE_ASM_WRITTEN (scc->entries[i])); - } - - tree *map = XALLOCAVEC (tree, 2 * len); - for (tree_scc *pscc = *slot; pscc; pscc = pscc->next) - { - if (!compare_tree_sccs (pscc, scc, map)) - continue; - - /* Found an equal SCC. */ - unified_p = true; - num_scc_compare_collisions--; - num_sccs_merged++; - total_scc_size_merged += len; - - if (flag_checking) - for (unsigned i = 0; i < len; ++i) - { - tree t = map[2*i+1]; - enum tree_code code = TREE_CODE (t); - /* IDENTIFIER_NODEs should be singletons and are merged by the - streamer. The others should be singletons, too, and we - should not merge them in any way. */ - gcc_assert (code != TRANSLATION_UNIT_DECL - && code != IDENTIFIER_NODE); - } - - /* Fixup the streamer cache with the prevailing nodes according - to the tree node mapping computed by compare_tree_sccs. 
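Side note (editorial sketch, not part of the patch): when unify_scc above finds an equal prevailing SCC, it builds flat arrays of (cache slot, tree) pairs and qsorts both with cmp_tree on the second element of each pair so that corresponding entries line up before the streamer cache is patched. A small standalone C sketch of that flat-pair qsort idiom, with hypothetical names and integer payloads/keys in place of tree pointers:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Pairs are stored flat: element 2*i is the payload, 2*i+1 the sort key,
   the same layout cmp_tree expects.  */
static int
cmp_pair_by_second (const void *p1_, const void *p2_)
{
  const uintptr_t *p1 = (const uintptr_t *) p1_;
  const uintptr_t *p2 = (const uintptr_t *) p2_;
  if (p1[1] == p2[1])
    return 0;
  return (p1[1] < p2[1]) ? -1 : 1;
}

int
main (void)
{
  /* Three (payload, key) pairs flattened into one array.  */
  uintptr_t pairs[6] = { 100, 30, 200, 10, 300, 20 };
  qsort (pairs, 3, 2 * sizeof (uintptr_t), cmp_pair_by_second);
  for (int i = 0; i < 3; i++)   /* prints 200/10, 300/20, 100/30 */
    printf ("payload %ju key %ju\n",
            (uintmax_t) pairs[2 * i], (uintmax_t) pairs[2 * i + 1]);
  return 0;
}

Sorting both arrays by the same key is what lets unify_scc walk them in lockstep.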
*/ - if (len == 1) - { - /* If we got a debug reference queued, see if the prevailing - tree has a debug reference and if not, register the one - for the tree we are about to throw away. */ - if (dref_queue.length () == 1) - { - dref_entry e = dref_queue.pop (); - gcc_assert (e.decl - == streamer_tree_cache_get_tree (cache, from)); - const char *sym; - unsigned HOST_WIDE_INT off; - if (!debug_hooks->die_ref_for_decl (pscc->entries[0], &sym, - &off)) - debug_hooks->register_external_die (pscc->entries[0], - e.sym, e.off); - } - lto_maybe_register_decl (data_in, pscc->entries[0], from); - streamer_tree_cache_replace_tree (cache, pscc->entries[0], from); - } - else - { - tree *map2 = XALLOCAVEC (tree, 2 * len); - for (unsigned i = 0; i < len; ++i) - { - map2[i*2] = (tree)(uintptr_t)(from + i); - map2[i*2+1] = scc->entries[i]; - } - qsort (map2, len, 2 * sizeof (tree), cmp_tree); - qsort (map, len, 2 * sizeof (tree), cmp_tree); - for (unsigned i = 0; i < len; ++i) - { - lto_maybe_register_decl (data_in, map[2*i], - (uintptr_t)map2[2*i]); - streamer_tree_cache_replace_tree (cache, map[2*i], - (uintptr_t)map2[2*i]); - } - } - - /* Free the tree nodes from the read SCC. */ - data_in->location_cache.revert_location_cache (); - for (unsigned i = 0; i < len; ++i) - { - if (TYPE_P (scc->entries[i])) - num_merged_types++; - free_node (scc->entries[i]); - } - - /* Drop DIE references. - ??? Do as in the size-one SCC case which involves sorting - the queue. */ - dref_queue.truncate (0); - - break; - } - - /* Reset TREE_VISITED if we didn't unify the SCC with another. */ - if (!unified_p) - for (unsigned i = 0; i < scc->len; ++i) - TREE_VISITED (scc->entries[i]) = 0; - } - - /* If we didn't unify it to any candidate duplicate the relevant - pieces to permanent storage and link it into the chain. */ - if (!unified_p) - { - tree_scc *pscc - = XOBNEWVAR (&tree_scc_hash_obstack, tree_scc, sizeof (tree_scc)); - memcpy (pscc, scc, sizeof (tree_scc)); - pscc->next = (*slot); - *slot = pscc; - } - return unified_p; -} - - -/* Read all the symbols from buffer DATA, using descriptors in DECL_DATA. - RESOLUTIONS is the set of symbols picked by the linker (read from the - resolution file when the linker plugin is being used). */ - -static void -lto_read_decls (struct lto_file_decl_data *decl_data, const void *data, - vec resolutions) -{ - const struct lto_decl_header *header = (const struct lto_decl_header *) data; - const int decl_offset = sizeof (struct lto_decl_header); - const int main_offset = decl_offset + header->decl_state_size; - const int string_offset = main_offset + header->main_size; - struct data_in *data_in; - unsigned int i; - const uint32_t *data_ptr, *data_end; - uint32_t num_decl_states; - - lto_input_block ib_main ((const char *) data + main_offset, - header->main_size, decl_data->mode_table); - - data_in = lto_data_in_create (decl_data, (const char *) data + string_offset, - header->string_size, resolutions); - - /* We do not uniquify the pre-loaded cache entries, those are middle-end - internal types that should not be merged. */ - - /* Read the global declarations and types. */ - while (ib_main.p < ib_main.len) - { - tree t; - unsigned from = data_in->reader_cache->nodes.length (); - /* Read and uniquify SCCs as in the input stream. 
*/ - enum LTO_tags tag = streamer_read_record_start (&ib_main); - if (tag == LTO_tree_scc) - { - unsigned len_; - unsigned scc_entry_len; - hashval_t scc_hash = lto_input_scc (&ib_main, data_in, &len_, - &scc_entry_len); - unsigned len = data_in->reader_cache->nodes.length () - from; - gcc_assert (len == len_); - - total_scc_size += len; - num_sccs_read++; - - /* We have the special case of size-1 SCCs that are pre-merged - by means of identifier and string sharing for example. - ??? Maybe we should avoid streaming those as SCCs. */ - tree first = streamer_tree_cache_get_tree (data_in->reader_cache, - from); - if (len == 1 - && (TREE_CODE (first) == IDENTIFIER_NODE - || (TREE_CODE (first) == INTEGER_CST - && !TREE_OVERFLOW (first)))) - continue; - - /* Try to unify the SCC with already existing ones. */ - if (!flag_ltrans - && unify_scc (data_in, from, - len, scc_entry_len, scc_hash)) - continue; - - /* Tree merging failed, mark entries in location cache as - permanent. */ - data_in->location_cache.accept_location_cache (); - - bool seen_type = false; - for (unsigned i = 0; i < len; ++i) - { - tree t = streamer_tree_cache_get_tree (data_in->reader_cache, - from + i); - /* Reconstruct the type variant and pointer-to/reference-to - chains. */ - if (TYPE_P (t)) - { - seen_type = true; - num_prevailing_types++; - lto_fixup_prevailing_type (t); - - /* Compute the canonical type of all types. - Because SCC components are streamed in random (hash) order - we may have encountered the type before while registering - type canonical of a derived type in the same SCC. */ - if (!TYPE_CANONICAL (t)) - gimple_register_canonical_type (t); - if (TYPE_MAIN_VARIANT (t) == t && odr_type_p (t)) - register_odr_type (t); - } - /* Link shared INTEGER_CSTs into TYPE_CACHED_VALUEs of its - type which is also member of this SCC. */ - if (TREE_CODE (t) == INTEGER_CST - && !TREE_OVERFLOW (t)) - cache_integer_cst (t); - if (!flag_ltrans) - { - lto_maybe_register_decl (data_in, t, from + i); - /* Scan the tree for references to global functions or - variables and record those for later fixup. */ - if (mentions_vars_p (t)) - vec_safe_push (tree_with_vars, t); - } - } - - /* Register DECLs with the debuginfo machinery. */ - while (!dref_queue.is_empty ()) - { - dref_entry e = dref_queue.pop (); - debug_hooks->register_external_die (e.decl, e.sym, e.off); - } - - if (seen_type) - num_type_scc_trees += len; - } - else - { - /* Pickle stray references. */ - t = lto_input_tree_1 (&ib_main, data_in, tag, 0); - gcc_assert (t && data_in->reader_cache->nodes.length () == from); - } - } - data_in->location_cache.apply_location_cache (); - - /* Read in lto_in_decl_state objects. */ - data_ptr = (const uint32_t *) ((const char*) data + decl_offset); - data_end = - (const uint32_t *) ((const char*) data_ptr + header->decl_state_size); - num_decl_states = *data_ptr++; - - gcc_assert (num_decl_states > 0); - decl_data->global_decl_state = lto_new_in_decl_state (); - data_ptr = lto_read_in_decl_state (data_in, data_ptr, - decl_data->global_decl_state); - - /* Read in per-function decl states and enter them in hash table. 
*/ - decl_data->function_decl_states = - hash_table::create_ggc (37); - - for (i = 1; i < num_decl_states; i++) - { - struct lto_in_decl_state *state = lto_new_in_decl_state (); - - data_ptr = lto_read_in_decl_state (data_in, data_ptr, state); - lto_in_decl_state **slot - = decl_data->function_decl_states->find_slot (state, INSERT); - gcc_assert (*slot == NULL); - *slot = state; - } - - if (data_ptr != data_end) - internal_error ("bytecode stream: garbage at the end of symbols section"); - - /* Set the current decl state to be the global state. */ - decl_data->current_decl_state = decl_data->global_decl_state; - - lto_data_in_delete (data_in); -} - -/* Custom version of strtoll, which is not portable. */ - -static int64_t -lto_parse_hex (const char *p) -{ - int64_t ret = 0; - - for (; *p != '\0'; ++p) - { - char c = *p; - unsigned char part; - ret <<= 4; - if (c >= '0' && c <= '9') - part = c - '0'; - else if (c >= 'a' && c <= 'f') - part = c - 'a' + 10; - else if (c >= 'A' && c <= 'F') - part = c - 'A' + 10; - else - internal_error ("could not parse hex number"); - ret |= part; - } - - return ret; -} - -/* Read resolution for file named FILE_NAME. The resolution is read from - RESOLUTION. */ - -static void -lto_resolution_read (splay_tree file_ids, FILE *resolution, lto_file *file) -{ - /* We require that objects in the resolution file are in the same - order as the lto1 command line. */ - unsigned int name_len; - char *obj_name; - unsigned int num_symbols; - unsigned int i; - struct lto_file_decl_data *file_data; - splay_tree_node nd = NULL; - - if (!resolution) - return; - - name_len = strlen (file->filename); - obj_name = XNEWVEC (char, name_len + 1); - fscanf (resolution, " "); /* Read white space. */ - - fread (obj_name, sizeof (char), name_len, resolution); - obj_name[name_len] = '\0'; - if (filename_cmp (obj_name, file->filename) != 0) - internal_error ("unexpected file name %s in linker resolution file. " - "Expected %s", obj_name, file->filename); - if (file->offset != 0) - { - int t; - char offset_p[17]; - int64_t offset; - t = fscanf (resolution, "@0x%16s", offset_p); - if (t != 1) - internal_error ("could not parse file offset"); - offset = lto_parse_hex (offset_p); - if (offset != file->offset) - internal_error ("unexpected offset"); - } - - free (obj_name); - - fscanf (resolution, "%u", &num_symbols); - - for (i = 0; i < num_symbols; i++) - { - int t; - unsigned index; - unsigned HOST_WIDE_INT id; - char r_str[27]; - enum ld_plugin_symbol_resolution r = (enum ld_plugin_symbol_resolution) 0; - unsigned int j; - unsigned int lto_resolution_str_len = - sizeof (lto_resolution_str) / sizeof (char *); - res_pair rp; - - t = fscanf (resolution, "%u " HOST_WIDE_INT_PRINT_HEX_PURE " %26s %*[^\n]\n", - &index, &id, r_str); - if (t != 3) - internal_error ("invalid line in the resolution file"); - - for (j = 0; j < lto_resolution_str_len; j++) - { - if (strcmp (lto_resolution_str[j], r_str) == 0) - { - r = (enum ld_plugin_symbol_resolution) j; - break; - } - } - if (j == lto_resolution_str_len) - internal_error ("invalid resolution in the resolution file"); - - if (!(nd && lto_splay_tree_id_equal_p (nd->key, id))) - { - nd = lto_splay_tree_lookup (file_ids, id); - if (nd == NULL) - internal_error ("resolution sub id %wx not in object file", id); - } - - file_data = (struct lto_file_decl_data *)nd->value; - /* The indexes are very sparse. To save memory save them in a compact - format that is only unpacked later when the subfile is processed. 
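Side note (editorial sketch, not part of the patch): as the comment above says, symbol indexes in the resolution file are sparse, so lto_resolution_read records compact (index, resolution) pairs and lto_file_finalize below expands them into a dense vector only when the subfile is actually processed. A minimal standalone C sketch of that expand-on-demand scheme, with hypothetical names (pair, resolutions):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical compact pair, mirroring res_pair.  */
struct pair { unsigned index; int res; };

int
main (void)
{
  /* Sparse symbol indexes as they might appear in a resolution file.  */
  struct pair pairs[] = { { 3, 1 }, { 40000, 2 }, { 17, 3 } };
  unsigned max_index = 40000;

  /* Expand lazily into a dense array, as lto_file_finalize does with
     respairs and max_index.  */
  int *resolutions = calloc (max_index + 1, sizeof (int));
  for (size_t i = 0; i < sizeof (pairs) / sizeof (pairs[0]); i++)
    resolutions[pairs[i].index] = pairs[i].res;

  printf ("resolution of symbol 17: %d\n", resolutions[17]);  /* 3 */
  free (resolutions);
  return 0;
}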
*/ - rp.res = r; - rp.index = index; - file_data->respairs.safe_push (rp); - if (file_data->max_index < index) - file_data->max_index = index; - } -} - -/* List of file_decl_datas */ -struct file_data_list - { - struct lto_file_decl_data *first, *last; - }; - -/* Is the name for a id'ed LTO section? */ - -static int -lto_section_with_id (const char *name, unsigned HOST_WIDE_INT *id) -{ - const char *s; - - if (strncmp (name, section_name_prefix, strlen (section_name_prefix))) - return 0; - s = strrchr (name, '.'); - if (!s) - return 0; - /* If the section is not suffixed with an ID return. */ - if ((size_t)(s - name) == strlen (section_name_prefix)) - return 0; - return sscanf (s, "." HOST_WIDE_INT_PRINT_HEX_PURE, id) == 1; -} - -/* Create file_data of each sub file id */ - -static int -create_subid_section_table (struct lto_section_slot *ls, splay_tree file_ids, - struct file_data_list *list) -{ - struct lto_section_slot s_slot, *new_slot; - unsigned HOST_WIDE_INT id; - splay_tree_node nd; - void **hash_slot; - char *new_name; - struct lto_file_decl_data *file_data; - - if (!lto_section_with_id (ls->name, &id)) - return 1; - - /* Find hash table of sub module id */ - nd = lto_splay_tree_lookup (file_ids, id); - if (nd != NULL) - { - file_data = (struct lto_file_decl_data *)nd->value; - } - else - { - file_data = ggc_alloc (); - memset(file_data, 0, sizeof (struct lto_file_decl_data)); - file_data->id = id; - file_data->section_hash_table = lto_obj_create_section_hash_table (); - lto_splay_tree_insert (file_ids, id, file_data); - - /* Maintain list in linker order */ - if (!list->first) - list->first = file_data; - if (list->last) - list->last->next = file_data; - list->last = file_data; - } - - /* Copy section into sub module hash table */ - new_name = XDUPVEC (char, ls->name, strlen (ls->name) + 1); - s_slot.name = new_name; - hash_slot = htab_find_slot (file_data->section_hash_table, &s_slot, INSERT); - gcc_assert (*hash_slot == NULL); - - new_slot = XDUP (struct lto_section_slot, ls); - new_slot->name = new_name; - *hash_slot = new_slot; - return 1; -} - -/* Read declarations and other initializations for a FILE_DATA. */ - -static void -lto_file_finalize (struct lto_file_decl_data *file_data, lto_file *file) -{ - const char *data; - size_t len; - vec - resolutions = vNULL; - int i; - res_pair *rp; - - /* Create vector for fast access of resolution. We do this lazily - to save memory. */ - resolutions.safe_grow_cleared (file_data->max_index + 1); - for (i = 0; file_data->respairs.iterate (i, &rp); i++) - resolutions[rp->index] = rp->res; - file_data->respairs.release (); - - file_data->renaming_hash_table = lto_create_renaming_table (); - file_data->file_name = file->filename; -#ifdef ACCEL_COMPILER - lto_input_mode_table (file_data); -#else - file_data->mode_table = lto_mode_identity_table; -#endif - data = lto_get_section_data (file_data, LTO_section_decls, NULL, &len); - if (data == NULL) - { - internal_error ("cannot read LTO decls from %s", file_data->file_name); - return; - } - /* Frees resolutions */ - lto_read_decls (file_data, data, resolutions); - lto_free_section_data (file_data, LTO_section_decls, NULL, data, len); -} - -/* Finalize FILE_DATA in FILE and increase COUNT. 
*/
-
-static int
-lto_create_files_from_ids (lto_file *file, struct lto_file_decl_data *file_data,
-                           int *count)
-{
-  lto_file_finalize (file_data, file);
-  if (symtab->dump_file)
-    fprintf (symtab->dump_file,
-             "Creating file %s with sub id " HOST_WIDE_INT_PRINT_HEX "\n",
-             file_data->file_name, file_data->id);
-  (*count)++;
-  return 0;
-}
-
-/* Generate a TREE representation for all types and external decls
-   entities in FILE.
-
-   Read all of the globals out of the file. Then read the cgraph
-   and process the .o index into the cgraph nodes so that it can open
-   the .o file to load the functions and ipa information. */
-
-static struct lto_file_decl_data *
-lto_file_read (lto_file *file, FILE *resolution_file, int *count)
-{
-  struct lto_file_decl_data *file_data = NULL;
-  splay_tree file_ids;
-  htab_t section_hash_table;
-  struct lto_section_slot *section;
-  struct file_data_list file_list;
-  struct lto_section_list section_list;
-
-  memset (&section_list, 0, sizeof (struct lto_section_list));
-  section_hash_table = lto_obj_build_section_table (file, &section_list);
-
-  /* Find all sub modules in the object and put their sections into new hash
-     tables in a splay tree. */
-  file_ids = lto_splay_tree_new ();
-  memset (&file_list, 0, sizeof (struct file_data_list));
-  for (section = section_list.first; section != NULL; section = section->next)
-    create_subid_section_table (section, file_ids, &file_list);
-
-  /* Add resolutions to file ids */
-  lto_resolution_read (file_ids, resolution_file, file);
-
-  /* Finalize each lto file for each submodule in the merged object */
-  for (file_data = file_list.first; file_data != NULL; file_data = file_data->next)
-    lto_create_files_from_ids (file, file_data, count);
-
-  splay_tree_delete (file_ids);
-  htab_delete (section_hash_table);
-
-  return file_list.first;
-}
-
-#if HAVE_MMAP_FILE && HAVE_SYSCONF && defined _SC_PAGE_SIZE
-#define LTO_MMAP_IO 1
-#endif
-
-#if LTO_MMAP_IO
-/* Page size of machine is used for mmap and munmap calls. */
-static size_t page_mask;
-#endif
-
-/* Get the section data of length LEN from FILENAME starting at
-   OFFSET. The data segment must be freed by the caller when the
-   caller is finished. Returns NULL if all was not well. */
-
-static char *
-lto_read_section_data (struct lto_file_decl_data *file_data,
-                       intptr_t offset, size_t len)
-{
-  char *result;
-  static int fd = -1;
-  static char *fd_name;
-#if LTO_MMAP_IO
-  intptr_t computed_len;
-  intptr_t computed_offset;
-  intptr_t diff;
-#endif
-
-  /* Keep a single-entry file-descriptor cache. The last file we
-     touched will get closed at exit.
-     ??? Eventually we want to add a more sophisticated larger cache
-     or rather fix function body streaming to not stream them in
-     practically random order.
*/ - if (fd != -1 - && filename_cmp (fd_name, file_data->file_name) != 0) - { - free (fd_name); - close (fd); - fd = -1; - } - if (fd == -1) - { - fd = open (file_data->file_name, O_RDONLY|O_BINARY); - if (fd == -1) - { - fatal_error (input_location, "Cannot open %s", file_data->file_name); - return NULL; - } - fd_name = xstrdup (file_data->file_name); - } - -#if LTO_MMAP_IO - if (!page_mask) - { - size_t page_size = sysconf (_SC_PAGE_SIZE); - page_mask = ~(page_size - 1); - } - - computed_offset = offset & page_mask; - diff = offset - computed_offset; - computed_len = len + diff; - - result = (char *) mmap (NULL, computed_len, PROT_READ, MAP_PRIVATE, - fd, computed_offset); - if (result == MAP_FAILED) - { - fatal_error (input_location, "Cannot map %s", file_data->file_name); - return NULL; - } - - return result + diff; -#else - result = (char *) xmalloc (len); - if (lseek (fd, offset, SEEK_SET) != offset - || read (fd, result, len) != (ssize_t) len) - { - free (result); - fatal_error (input_location, "Cannot read %s", file_data->file_name); - result = NULL; - } -#ifdef __MINGW32__ - /* Native windows doesn't supports delayed unlink on opened file. So - we close file here again. This produces higher I/O load, but at least - it prevents to have dangling file handles preventing unlink. */ - free (fd_name); - fd_name = NULL; - close (fd); - fd = -1; -#endif - return result; -#endif -} + struct cgraph_node *node; + timevar_id_t lto_timer; + if (!quiet_flag) + fprintf (stderr, + flag_wpa ? "Materializing decls:" : "Reading function bodies:"); -/* Get the section data from FILE_DATA of SECTION_TYPE with NAME. - NAME will be NULL unless the section type is for a function - body. */ -static const char * -get_section_data (struct lto_file_decl_data *file_data, - enum lto_section_type section_type, - const char *name, - size_t *len) -{ - htab_t section_hash_table = file_data->section_hash_table; - struct lto_section_slot *f_slot; - struct lto_section_slot s_slot; - const char *section_name = lto_get_section_name (section_type, name, file_data); - char *data = NULL; - - *len = 0; - s_slot.name = section_name; - f_slot = (struct lto_section_slot *) htab_find (section_hash_table, &s_slot); - if (f_slot) + FOR_EACH_FUNCTION (node) { - data = lto_read_section_data (file_data, f_slot->start, f_slot->len); - *len = f_slot->len; + if (node->lto_file_data) + { + lto_materialize_function (node); + lto_stats.num_input_cgraph_nodes++; + } } - free (CONST_CAST (char *, section_name)); - return data; -} - -/* Free the section data from FILE_DATA of SECTION_TYPE with NAME that - starts at OFFSET and has LEN bytes. */ + /* Start the appropriate timer depending on the mode that we are + operating in. */ + lto_timer = (flag_wpa) ? TV_WHOPR_WPA + : (flag_ltrans) ? 
TV_WHOPR_LTRANS + : TV_LTO; + timevar_push (lto_timer); -static void -free_section_data (struct lto_file_decl_data *file_data ATTRIBUTE_UNUSED, - enum lto_section_type section_type ATTRIBUTE_UNUSED, - const char *name ATTRIBUTE_UNUSED, - const char *offset, size_t len ATTRIBUTE_UNUSED) -{ -#if LTO_MMAP_IO - intptr_t computed_len; - intptr_t computed_offset; - intptr_t diff; -#endif + current_function_decl = NULL; + set_cfun (NULL); -#if LTO_MMAP_IO - computed_offset = ((intptr_t) offset) & page_mask; - diff = (intptr_t) offset - computed_offset; - computed_len = len + diff; + if (!quiet_flag) + fprintf (stderr, "\n"); - munmap ((caddr_t) computed_offset, computed_len); -#else - free (CONST_CAST(char *, offset)); -#endif + timevar_pop (lto_timer); } -static lto_file *current_lto_file; - /* Actually stream out ENCODER into TEMP_FILENAME. */ static void @@ -2560,581 +411,6 @@ lto_wpa_write_files (void) timevar_pop (TV_WHOPR_WPA_IO); } - -/* If TT is a variable or function decl replace it with its - prevailing variant. */ -#define LTO_SET_PREVAIL(tt) \ - do {\ - if ((tt) && VAR_OR_FUNCTION_DECL_P (tt) \ - && (TREE_PUBLIC (tt) || DECL_EXTERNAL (tt))) \ - { \ - tt = lto_symtab_prevailing_decl (tt); \ - fixed = true; \ - } \ - } while (0) - -/* Ensure that TT isn't a replacable var of function decl. */ -#define LTO_NO_PREVAIL(tt) \ - gcc_checking_assert (!(tt) || !VAR_OR_FUNCTION_DECL_P (tt)) - -/* Given a tree T replace all fields referring to variables or functions - with their prevailing variant. */ -static void -lto_fixup_prevailing_decls (tree t) -{ - enum tree_code code = TREE_CODE (t); - bool fixed = false; - - gcc_checking_assert (code != TREE_BINFO); - LTO_NO_PREVAIL (TREE_TYPE (t)); - if (CODE_CONTAINS_STRUCT (code, TS_COMMON) - /* lto_symtab_prevail_decl use TREE_CHAIN to link to the prevailing decl. - in the case T is a prevailed declaration we would ICE here. 
*/ - && !VAR_OR_FUNCTION_DECL_P (t)) - LTO_NO_PREVAIL (TREE_CHAIN (t)); - if (DECL_P (t)) - { - LTO_NO_PREVAIL (DECL_NAME (t)); - LTO_SET_PREVAIL (DECL_CONTEXT (t)); - if (CODE_CONTAINS_STRUCT (code, TS_DECL_COMMON)) - { - LTO_SET_PREVAIL (DECL_SIZE (t)); - LTO_SET_PREVAIL (DECL_SIZE_UNIT (t)); - LTO_SET_PREVAIL (DECL_INITIAL (t)); - LTO_NO_PREVAIL (DECL_ATTRIBUTES (t)); - LTO_SET_PREVAIL (DECL_ABSTRACT_ORIGIN (t)); - } - if (CODE_CONTAINS_STRUCT (code, TS_DECL_WITH_VIS)) - { - LTO_NO_PREVAIL (DECL_ASSEMBLER_NAME_RAW (t)); - } - if (CODE_CONTAINS_STRUCT (code, TS_DECL_NON_COMMON)) - { - LTO_NO_PREVAIL (DECL_RESULT_FLD (t)); - } - if (CODE_CONTAINS_STRUCT (code, TS_FUNCTION_DECL)) - { - LTO_NO_PREVAIL (DECL_ARGUMENTS (t)); - LTO_SET_PREVAIL (DECL_FUNCTION_PERSONALITY (t)); - LTO_NO_PREVAIL (DECL_VINDEX (t)); - } - if (CODE_CONTAINS_STRUCT (code, TS_FIELD_DECL)) - { - LTO_SET_PREVAIL (DECL_FIELD_OFFSET (t)); - LTO_NO_PREVAIL (DECL_BIT_FIELD_TYPE (t)); - LTO_NO_PREVAIL (DECL_QUALIFIER (t)); - LTO_NO_PREVAIL (DECL_FIELD_BIT_OFFSET (t)); - LTO_NO_PREVAIL (DECL_FCONTEXT (t)); - } - } - else if (TYPE_P (t)) - { - LTO_NO_PREVAIL (TYPE_CACHED_VALUES (t)); - LTO_SET_PREVAIL (TYPE_SIZE (t)); - LTO_SET_PREVAIL (TYPE_SIZE_UNIT (t)); - LTO_NO_PREVAIL (TYPE_ATTRIBUTES (t)); - LTO_NO_PREVAIL (TYPE_NAME (t)); - - LTO_SET_PREVAIL (TYPE_MIN_VALUE_RAW (t)); - LTO_SET_PREVAIL (TYPE_MAX_VALUE_RAW (t)); - LTO_NO_PREVAIL (TYPE_LANG_SLOT_1 (t)); - - LTO_SET_PREVAIL (TYPE_CONTEXT (t)); - - LTO_NO_PREVAIL (TYPE_CANONICAL (t)); - LTO_NO_PREVAIL (TYPE_MAIN_VARIANT (t)); - LTO_NO_PREVAIL (TYPE_NEXT_VARIANT (t)); - } - else if (EXPR_P (t)) - { - int i; - for (i = TREE_OPERAND_LENGTH (t) - 1; i >= 0; --i) - LTO_SET_PREVAIL (TREE_OPERAND (t, i)); - } - else if (TREE_CODE (t) == CONSTRUCTOR) - { - unsigned i; - tree val; - FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (t), i, val) - LTO_SET_PREVAIL (val); - } - else - { - switch (code) - { - case TREE_LIST: - LTO_SET_PREVAIL (TREE_VALUE (t)); - LTO_SET_PREVAIL (TREE_PURPOSE (t)); - LTO_NO_PREVAIL (TREE_PURPOSE (t)); - break; - default: - gcc_unreachable (); - } - } - /* If we fixed nothing, then we missed something seen by - mentions_vars_p. */ - gcc_checking_assert (fixed); -} -#undef LTO_SET_PREVAIL -#undef LTO_NO_PREVAIL - -/* Helper function of lto_fixup_decls. Walks the var and fn streams in STATE, - replaces var and function decls with the corresponding prevailing def. */ - -static void -lto_fixup_state (struct lto_in_decl_state *state) -{ - unsigned i, si; - - /* Although we only want to replace FUNCTION_DECLs and VAR_DECLs, - we still need to walk from all DECLs to find the reachable - FUNCTION_DECLs and VAR_DECLs. */ - for (si = 0; si < LTO_N_DECL_STREAMS; si++) - { - vec *trees = state->streams[si]; - for (i = 0; i < vec_safe_length (trees); i++) - { - tree t = (*trees)[i]; - if (flag_checking && TYPE_P (t)) - verify_type (t); - if (VAR_OR_FUNCTION_DECL_P (t) - && (TREE_PUBLIC (t) || DECL_EXTERNAL (t))) - (*trees)[i] = lto_symtab_prevailing_decl (t); - } - } -} - -/* Fix the decls from all FILES. Replaces each decl with the corresponding - prevailing one. 
*/
-
-static void
-lto_fixup_decls (struct lto_file_decl_data **files)
-{
-  unsigned int i;
-  tree t;
-
-  if (tree_with_vars)
-    FOR_EACH_VEC_ELT ((*tree_with_vars), i, t)
-      lto_fixup_prevailing_decls (t);
-
-  for (i = 0; files[i]; i++)
-    {
-      struct lto_file_decl_data *file = files[i];
-      struct lto_in_decl_state *state = file->global_decl_state;
-      lto_fixup_state (state);
-
-      hash_table<decl_state_hasher>::iterator iter;
-      lto_in_decl_state *elt;
-      FOR_EACH_HASH_TABLE_ELEMENT (*file->function_decl_states, elt,
-                                   lto_in_decl_state *, iter)
-        lto_fixup_state (elt);
-    }
-}
-
-static GTY((length ("lto_stats.num_input_files + 1"))) struct lto_file_decl_data **all_file_decl_data;
-
-/* Turn file datas for sub files into a single array, so that they look
-   like separate files for further passes. */
-
-static void
-lto_flatten_files (struct lto_file_decl_data **orig, int count, int last_file_ix)
-{
-  struct lto_file_decl_data *n, *next;
-  int i, k;
-
-  lto_stats.num_input_files = count;
-  all_file_decl_data
-    = ggc_cleared_vec_alloc<lto_file_decl_data *> (count + 1);
-  /* Set the hooks so that all of the ipa passes can read in their data. */
-  lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data);
-  for (i = 0, k = 0; i < last_file_ix; i++)
-    {
-      for (n = orig[i]; n != NULL; n = next)
-        {
-          all_file_decl_data[k++] = n;
-          next = n->next;
-          n->next = NULL;
-        }
-    }
-  all_file_decl_data[k] = NULL;
-  gcc_assert (k == count);
-}
-
-/* Input file data before flattening (i.e. splitting them to subfiles to support
-   incremental linking. */
-static int real_file_count;
-static GTY((length ("real_file_count + 1"))) struct lto_file_decl_data **real_file_decl_data;
-
-static void print_lto_report_1 (void);
-
-/* Read all the symbols from the input files FNAMES. NFILES is the
-   number of files requested in the command line. Instantiate a
-   global call graph by aggregating all the sub-graphs found in each
-   file. */
-
-static void
-read_cgraph_and_symbols (unsigned nfiles, const char **fnames)
-{
-  unsigned int i, last_file_ix;
-  FILE *resolution;
-  int count = 0;
-  struct lto_file_decl_data **decl_data;
-  symtab_node *snode;
-
-  symtab->initialize ();
-
-  timevar_push (TV_IPA_LTO_DECL_IN);
-
-#ifdef ACCEL_COMPILER
-  section_name_prefix = OFFLOAD_SECTION_NAME_PREFIX;
-  lto_stream_offload_p = true;
-#endif
-
-  real_file_decl_data
-    = decl_data = ggc_cleared_vec_alloc<lto_file_decl_data *> (nfiles + 1);
-  real_file_count = nfiles;
-
-  /* Read the resolution file. */
-  resolution = NULL;
-  if (resolution_file_name)
-    {
-      int t;
-      unsigned num_objects;
-
-      resolution = fopen (resolution_file_name, "r");
-      if (resolution == NULL)
-        fatal_error (input_location,
-                     "could not open symbol resolution file: %m");
-
-      t = fscanf (resolution, "%u", &num_objects);
-      gcc_assert (t == 1);
-
-      /* True, since the plugin splits the archives. */
-      gcc_assert (num_objects == nfiles);
-    }
-  symtab->state = LTO_STREAMING;
-
-  canonical_type_hash_cache = new hash_map<const_tree, hashval_t> (251);
-  gimple_canonical_types = htab_create (16381, gimple_canonical_type_hash,
-                                        gimple_canonical_type_eq, NULL);
-  gcc_obstack_init (&tree_scc_hash_obstack);
-  tree_scc_hash = new hash_table<tree_scc_hasher> (4096);
-
-  /* Register the common node types with the canonical type machinery so
-     we properly share alias-sets across languages and TUs. Do not
-     expose the common nodes as type merge target - those that should be
-     are already exposed so by pre-loading the LTO streamer caches.
-     Do two passes - first clear TYPE_CANONICAL and then re-compute it.
*/ - for (i = 0; i < itk_none; ++i) - lto_register_canonical_types (integer_types[i], true); - for (i = 0; i < stk_type_kind_last; ++i) - lto_register_canonical_types (sizetype_tab[i], true); - for (i = 0; i < TI_MAX; ++i) - lto_register_canonical_types (global_trees[i], true); - for (i = 0; i < itk_none; ++i) - lto_register_canonical_types (integer_types[i], false); - for (i = 0; i < stk_type_kind_last; ++i) - lto_register_canonical_types (sizetype_tab[i], false); - for (i = 0; i < TI_MAX; ++i) - lto_register_canonical_types (global_trees[i], false); - - if (!quiet_flag) - fprintf (stderr, "Reading object files:"); - - /* Read all of the object files specified on the command line. */ - for (i = 0, last_file_ix = 0; i < nfiles; ++i) - { - struct lto_file_decl_data *file_data = NULL; - if (!quiet_flag) - { - fprintf (stderr, " %s", fnames[i]); - fflush (stderr); - } - - current_lto_file = lto_obj_file_open (fnames[i], false); - if (!current_lto_file) - break; - - file_data = lto_file_read (current_lto_file, resolution, &count); - if (!file_data) - { - lto_obj_file_close (current_lto_file); - free (current_lto_file); - current_lto_file = NULL; - break; - } - - decl_data[last_file_ix++] = file_data; - - lto_obj_file_close (current_lto_file); - free (current_lto_file); - current_lto_file = NULL; - } - - lto_flatten_files (decl_data, count, last_file_ix); - lto_stats.num_input_files = count; - ggc_free(decl_data); - real_file_decl_data = NULL; - - if (resolution_file_name) - fclose (resolution); - - /* Show the LTO report before launching LTRANS. */ - if (flag_lto_report || (flag_wpa && flag_lto_report_wpa)) - print_lto_report_1 (); - - /* Free gimple type merging datastructures. */ - delete tree_scc_hash; - tree_scc_hash = NULL; - obstack_free (&tree_scc_hash_obstack, NULL); - htab_delete (gimple_canonical_types); - gimple_canonical_types = NULL; - delete canonical_type_hash_cache; - canonical_type_hash_cache = NULL; - - /* At this stage we know that majority of GGC memory is reachable. - Growing the limits prevents unnecesary invocation of GGC. */ - ggc_grow (); - ggc_collect (); - - /* Set the hooks so that all of the ipa passes can read in their data. */ - lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data); - - timevar_pop (TV_IPA_LTO_DECL_IN); - - if (!quiet_flag) - fprintf (stderr, "\nReading the callgraph\n"); - - timevar_push (TV_IPA_LTO_CGRAPH_IO); - /* Read the symtab. */ - input_symtab (); - - input_offload_tables (!flag_ltrans); - - /* Store resolutions into the symbol table. 
*/ - - FOR_EACH_SYMBOL (snode) - if (snode->externally_visible && snode->real_symbol_p () - && snode->lto_file_data && snode->lto_file_data->resolution_map - && !(TREE_CODE (snode->decl) == FUNCTION_DECL - && fndecl_built_in_p (snode->decl)) - && !(VAR_P (snode->decl) && DECL_HARD_REGISTER (snode->decl))) - { - ld_plugin_symbol_resolution_t *res; - - res = snode->lto_file_data->resolution_map->get (snode->decl); - if (!res || *res == LDPR_UNKNOWN) - { - if (snode->output_to_lto_symbol_table_p ()) - fatal_error (input_location, "missing resolution data for %s", - IDENTIFIER_POINTER - (DECL_ASSEMBLER_NAME (snode->decl))); - } - else - snode->resolution = *res; - } - for (i = 0; all_file_decl_data[i]; i++) - if (all_file_decl_data[i]->resolution_map) - { - delete all_file_decl_data[i]->resolution_map; - all_file_decl_data[i]->resolution_map = NULL; - } - - timevar_pop (TV_IPA_LTO_CGRAPH_IO); - - if (!quiet_flag) - fprintf (stderr, "Merging declarations\n"); - - timevar_push (TV_IPA_LTO_DECL_MERGE); - /* Merge global decls. In ltrans mode we read merged cgraph, we do not - need to care about resolving symbols again, we only need to replace - duplicated declarations read from the callgraph and from function - sections. */ - if (!flag_ltrans) - { - lto_symtab_merge_decls (); - - /* If there were errors during symbol merging bail out, we have no - good way to recover here. */ - if (seen_error ()) - fatal_error (input_location, - "errors during merging of translation units"); - - /* Fixup all decls. */ - lto_fixup_decls (all_file_decl_data); - } - if (tree_with_vars) - ggc_free (tree_with_vars); - tree_with_vars = NULL; - ggc_collect (); - - timevar_pop (TV_IPA_LTO_DECL_MERGE); - /* Each pass will set the appropriate timer. */ - - if (!quiet_flag) - fprintf (stderr, "Reading summaries\n"); - - /* Read the IPA summary data. */ - if (flag_ltrans) - ipa_read_optimization_summaries (); - else - ipa_read_summaries (); - - for (i = 0; all_file_decl_data[i]; i++) - { - gcc_assert (all_file_decl_data[i]->symtab_node_encoder); - lto_symtab_encoder_delete (all_file_decl_data[i]->symtab_node_encoder); - all_file_decl_data[i]->symtab_node_encoder = NULL; - lto_free_function_in_decl_state (all_file_decl_data[i]->global_decl_state); - all_file_decl_data[i]->global_decl_state = NULL; - all_file_decl_data[i]->current_decl_state = NULL; - } - - if (!flag_ltrans) - { - /* Finally merge the cgraph according to the decl merging decisions. */ - timevar_push (TV_IPA_LTO_CGRAPH_MERGE); - - gcc_assert (!dump_file); - dump_file = dump_begin (lto_link_dump_id, NULL); - - if (dump_file) - { - fprintf (dump_file, "Before merging:\n"); - symtab->dump (dump_file); - } - lto_symtab_merge_symbols (); - /* Removal of unreachable symbols is needed to make verify_symtab to pass; - we are still having duplicated comdat groups containing local statics. - We could also just remove them while merging. */ - symtab->remove_unreachable_nodes (dump_file); - ggc_collect (); - - if (dump_file) - dump_end (lto_link_dump_id, dump_file); - dump_file = NULL; - timevar_pop (TV_IPA_LTO_CGRAPH_MERGE); - } - symtab->state = IPA_SSA; - /* All node removals happening here are useless, because - WPA should not stream them. Still always perform remove_unreachable_nodes - because we may reshape clone tree, get rid of dead masters of inline - clones and remove symbol entries for read-only variables we keep around - only to be able to constant fold them. 
*/ - if (flag_ltrans) - { - if (symtab->dump_file) - symtab->dump (symtab->dump_file); - symtab->remove_unreachable_nodes (symtab->dump_file); - } - - /* Indicate that the cgraph is built and ready. */ - symtab->function_flags_ready = true; - - ggc_free (all_file_decl_data); - all_file_decl_data = NULL; -} - - -/* Materialize all the bodies for all the nodes in the callgraph. */ - -static void -materialize_cgraph (void) -{ - struct cgraph_node *node; - timevar_id_t lto_timer; - - if (!quiet_flag) - fprintf (stderr, - flag_wpa ? "Materializing decls:" : "Reading function bodies:"); - - - FOR_EACH_FUNCTION (node) - { - if (node->lto_file_data) - { - lto_materialize_function (node); - lto_stats.num_input_cgraph_nodes++; - } - } - - - /* Start the appropriate timer depending on the mode that we are - operating in. */ - lto_timer = (flag_wpa) ? TV_WHOPR_WPA - : (flag_ltrans) ? TV_WHOPR_LTRANS - : TV_LTO; - timevar_push (lto_timer); - - current_function_decl = NULL; - set_cfun (NULL); - - if (!quiet_flag) - fprintf (stderr, "\n"); - - timevar_pop (lto_timer); -} - - -/* Show various memory usage statistics related to LTO. */ -static void -print_lto_report_1 (void) -{ - const char *pfx = (flag_lto) ? "LTO" : (flag_wpa) ? "WPA" : "LTRANS"; - fprintf (stderr, "%s statistics\n", pfx); - - fprintf (stderr, "[%s] read %lu SCCs of average size %f\n", - pfx, num_sccs_read, total_scc_size / (double)num_sccs_read); - fprintf (stderr, "[%s] %lu tree bodies read in total\n", pfx, total_scc_size); - if (flag_wpa && tree_scc_hash) - { - fprintf (stderr, "[%s] tree SCC table: size %ld, %ld elements, " - "collision ratio: %f\n", pfx, - (long) tree_scc_hash->size (), - (long) tree_scc_hash->elements (), - tree_scc_hash->collisions ()); - hash_table::iterator hiter; - tree_scc *scc, *max_scc = NULL; - unsigned max_length = 0; - FOR_EACH_HASH_TABLE_ELEMENT (*tree_scc_hash, scc, x, hiter) - { - unsigned length = 0; - tree_scc *s = scc; - for (; s; s = s->next) - length++; - if (length > max_length) - { - max_length = length; - max_scc = scc; - } - } - fprintf (stderr, "[%s] tree SCC max chain length %u (size %u)\n", - pfx, max_length, max_scc->len); - fprintf (stderr, "[%s] Compared %lu SCCs, %lu collisions (%f)\n", pfx, - num_scc_compares, num_scc_compare_collisions, - num_scc_compare_collisions / (double) num_scc_compares); - fprintf (stderr, "[%s] Merged %lu SCCs\n", pfx, num_sccs_merged); - fprintf (stderr, "[%s] Merged %lu tree bodies\n", pfx, - total_scc_size_merged); - fprintf (stderr, "[%s] Merged %lu types\n", pfx, num_merged_types); - fprintf (stderr, "[%s] %lu types prevailed (%lu associated trees)\n", - pfx, num_prevailing_types, num_type_scc_trees); - fprintf (stderr, "[%s] GIMPLE canonical type table: size %ld, " - "%ld elements, %ld searches, %ld collisions (ratio: %f)\n", pfx, - (long) htab_size (gimple_canonical_types), - (long) htab_elements (gimple_canonical_types), - (long) gimple_canonical_types->searches, - (long) gimple_canonical_types->collisions, - htab_collisions (gimple_canonical_types)); - fprintf (stderr, "[%s] GIMPLE canonical type pointer-map: " - "%lu elements, %ld searches\n", pfx, - num_canonical_type_hash_entries, - num_canonical_type_hash_queries); - } - - print_lto_report (pfx); -} - /* Perform whole program analysis (WPA) on the callgraph and write out the optimization plan. */ @@ -3262,64 +538,6 @@ do_whole_program_analysis (void) dump_memory_report (true); } - -static GTY(()) tree lto_eh_personality_decl; - -/* Return the LTO personality function decl. 
*/
-
-tree
-lto_eh_personality (void)
-{
-  if (!lto_eh_personality_decl)
-    {
-      /* Use the first personality DECL for our personality if we don't
-         support multiple ones. This ensures that we don't artificially
-         create the need for them in a single-language program. */
-      if (first_personality_decl && !dwarf2out_do_cfi_asm ())
-        lto_eh_personality_decl = first_personality_decl;
-      else
-        lto_eh_personality_decl = lhd_gcc_personality ();
-    }
-
-  return lto_eh_personality_decl;
-}
-
-/* Set the process name based on the LTO mode. */
-
-static void
-lto_process_name (void)
-{
-  if (flag_lto)
-    setproctitle (flag_incremental_link == INCREMENTAL_LINK_LTO
-                  ? "lto1-inclink" : "lto1-lto");
-  if (flag_wpa)
-    setproctitle ("lto1-wpa");
-  if (flag_ltrans)
-    setproctitle ("lto1-ltrans");
-}
-
-
-/* Initialize the LTO front end. */
-
-static void
-lto_init (void)
-{
-  lto_process_name ();
-  lto_streamer_hooks_init ();
-  lto_reader_init ();
-  lto_set_in_hooks (NULL, get_section_data, free_section_data);
-  memset (&lto_stats, 0, sizeof (lto_stats));
-  bitmap_obstack_initialize (NULL);
-  gimple_register_cfg_hooks ();
-#ifndef ACCEL_COMPILER
-  unsigned char *table
-    = ggc_vec_alloc<unsigned char> (MAX_MACHINE_MODE);
-  for (int m = 0; m < MAX_MACHINE_MODE; m++)
-    table[m] = m;
-  lto_mode_identity_table = table;
-#endif
-}
-
 /* Create artificial pointers for "omp declare target link" vars. */
 
 static void
@@ -3351,7 +569,6 @@ offload_handle_link_vars (void)
 #endif
 }
-
 /* Main entry point for the GIMPLE front end. This front end has three
    main personalities:
@@ -3386,7 +603,7 @@ lto_main (void)
   timevar_start (TV_PHASE_SETUP);
 
   /* Initialize the LTO front end. */
-  lto_init ();
+  lto_fe_init ();
 
   timevar_stop (TV_PHASE_SETUP);
 
   timevar_start (TV_PHASE_STREAM_IN);
@@ -3439,5 +656,3 @@ lto_main (void)
   timevar_start (TV_PHASE_PARSING);
   timevar_push (TV_PARSE_GLOBAL);
 }
-
-#include "gt-lto-lto.h"
diff --git a/gcc/machmode.h b/gcc/machmode.h
index d564f9c64..a507ed66c 100644
--- a/gcc/machmode.h
+++ b/gcc/machmode.h
@@ -244,14 +244,15 @@ class opt_mode
 public:
   enum from_int { dummy = MAX_MACHINE_MODE };
 
-  ALWAYS_INLINE opt_mode () : m_mode (E_VOIDmode) {}
-  ALWAYS_INLINE opt_mode (const T &m) : m_mode (m) {}
+  ALWAYS_INLINE CONSTEXPR opt_mode () : m_mode (E_VOIDmode) {}
+  ALWAYS_INLINE CONSTEXPR opt_mode (const T &m) : m_mode (m) {}
   template<typename U>
-  ALWAYS_INLINE opt_mode (const U &m) : m_mode (T (m)) {}
-  ALWAYS_INLINE opt_mode (from_int m) : m_mode (machine_mode (m)) {}
+  ALWAYS_INLINE CONSTEXPR opt_mode (const U &m) : m_mode (T (m)) {}
+  ALWAYS_INLINE CONSTEXPR opt_mode (from_int m) : m_mode (machine_mode (m)) {}
 
   machine_mode else_void () const;
-  machine_mode else_blk () const;
+  machine_mode else_blk () const { return else_mode (BLKmode); }
+  machine_mode else_mode (machine_mode) const;
   T require () const;
 
   bool exists () const;
@@ -274,13 +275,13 @@ opt_mode<T>::else_void () const
   return m_mode;
 }
 
-/* If the T exists, return its enum value, otherwise return E_BLKmode. */
+/* If the T exists, return its enum value, otherwise return FALLBACK. */
 
 template<typename T>
 inline machine_mode
-opt_mode<T>::else_blk () const
+opt_mode<T>::else_mode (machine_mode fallback) const
 {
-  return m_mode == E_VOIDmode ? E_BLKmode : m_mode;
+  return m_mode == E_VOIDmode ? fallback : m_mode;
 }
 
 /* Assert that the object contains a T and return it.
*/ @@ -326,8 +327,12 @@ struct pod_mode typedef typename T::measurement_type measurement_type; machine_mode m_mode; - ALWAYS_INLINE operator machine_mode () const { return m_mode; } - ALWAYS_INLINE operator T () const { return from_int (m_mode); } + ALWAYS_INLINE CONSTEXPR + operator machine_mode () const { return m_mode; } + + ALWAYS_INLINE CONSTEXPR + operator T () const { return from_int (m_mode); } + ALWAYS_INLINE pod_mode &operator = (const T &m) { m_mode = m; return *this; } }; @@ -405,8 +410,11 @@ public: typedef unsigned short measurement_type; ALWAYS_INLINE scalar_int_mode () {} - ALWAYS_INLINE scalar_int_mode (from_int m) : m_mode (machine_mode (m)) {} - ALWAYS_INLINE operator machine_mode () const { return m_mode; } + + ALWAYS_INLINE CONSTEXPR + scalar_int_mode (from_int m) : m_mode (machine_mode (m)) {} + + ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } static bool includes_p (machine_mode); @@ -430,8 +438,11 @@ public: typedef unsigned short measurement_type; ALWAYS_INLINE scalar_float_mode () {} - ALWAYS_INLINE scalar_float_mode (from_int m) : m_mode (machine_mode (m)) {} - ALWAYS_INLINE operator machine_mode () const { return m_mode; } + + ALWAYS_INLINE CONSTEXPR + scalar_float_mode (from_int m) : m_mode (machine_mode (m)) {} + + ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } static bool includes_p (machine_mode); @@ -455,11 +466,20 @@ public: typedef unsigned short measurement_type; ALWAYS_INLINE scalar_mode () {} - ALWAYS_INLINE scalar_mode (from_int m) : m_mode (machine_mode (m)) {} - ALWAYS_INLINE scalar_mode (const scalar_int_mode &m) : m_mode (m) {} - ALWAYS_INLINE scalar_mode (const scalar_float_mode &m) : m_mode (m) {} - ALWAYS_INLINE scalar_mode (const scalar_int_mode_pod &m) : m_mode (m) {} - ALWAYS_INLINE operator machine_mode () const { return m_mode; } + + ALWAYS_INLINE CONSTEXPR + scalar_mode (from_int m) : m_mode (machine_mode (m)) {} + + ALWAYS_INLINE CONSTEXPR + scalar_mode (const scalar_int_mode &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR + scalar_mode (const scalar_float_mode &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR + scalar_mode (const scalar_int_mode_pod &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } static bool includes_p (machine_mode); @@ -496,8 +516,11 @@ public: typedef unsigned short measurement_type; ALWAYS_INLINE complex_mode () {} - ALWAYS_INLINE complex_mode (from_int m) : m_mode (machine_mode (m)) {} - ALWAYS_INLINE operator machine_mode () const { return m_mode; } + + ALWAYS_INLINE CONSTEXPR + complex_mode (from_int m) : m_mode (machine_mode (m)) {} + + ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } static bool includes_p (machine_mode); @@ -766,14 +789,29 @@ public: typedef unsigned short measurement_type; ALWAYS_INLINE fixed_size_mode () {} - ALWAYS_INLINE fixed_size_mode (from_int m) : m_mode (machine_mode (m)) {} - ALWAYS_INLINE fixed_size_mode (const scalar_mode &m) : m_mode (m) {} - ALWAYS_INLINE fixed_size_mode (const scalar_int_mode &m) : m_mode (m) {} - ALWAYS_INLINE fixed_size_mode (const scalar_float_mode &m) : m_mode (m) {} - ALWAYS_INLINE fixed_size_mode (const scalar_mode_pod &m) : m_mode (m) {} - ALWAYS_INLINE fixed_size_mode (const scalar_int_mode_pod &m) : m_mode (m) {} - ALWAYS_INLINE fixed_size_mode (const complex_mode &m) : m_mode (m) {} - ALWAYS_INLINE operator machine_mode () const { return m_mode; } + + ALWAYS_INLINE CONSTEXPR + fixed_size_mode (from_int m) : m_mode (machine_mode 
(m)) {} + + ALWAYS_INLINE CONSTEXPR + fixed_size_mode (const scalar_mode &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR + fixed_size_mode (const scalar_int_mode &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR + fixed_size_mode (const scalar_float_mode &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR + fixed_size_mode (const scalar_mode_pod &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR + fixed_size_mode (const scalar_int_mode_pod &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR + fixed_size_mode (const complex_mode &m) : m_mode (m) {} + + ALWAYS_INLINE CONSTEXPR operator machine_mode () const { return m_mode; } static bool includes_p (machine_mode); diff --git a/gcc/match.pd b/gcc/match.pd index f7e192d9b..facc43387 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -82,12 +82,14 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) plus minus mult trunc_div trunc_mod rdiv min max - bit_and bit_ior bit_xor) + bit_and bit_ior bit_xor + lshift rshift) (define_operator_list COND_BINARY IFN_COND_ADD IFN_COND_SUB IFN_COND_MUL IFN_COND_DIV IFN_COND_MOD IFN_COND_RDIV IFN_COND_MIN IFN_COND_MAX - IFN_COND_AND IFN_COND_IOR IFN_COND_XOR) + IFN_COND_AND IFN_COND_IOR IFN_COND_XOR + IFN_COND_SHL IFN_COND_SHR) /* Same for ternary operations. */ (define_operator_list UNCOND_TERNARY @@ -5378,3 +5380,86 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (bit_and:elt_type (BIT_FIELD_REF:elt_type @0 { size; } { pos; }) { elt; }))))))) + +(simplify + (vec_perm @0 @1 VECTOR_CST@2) + (with + { + tree op0 = @0, op1 = @1, op2 = @2; + + /* Build a vector of integers from the tree mask. */ + vec_perm_builder builder; + if (!tree_to_vec_perm_builder (&builder, op2)) + return NULL_TREE; + + /* Create a vec_perm_indices for the integer vector. */ + poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type); + bool single_arg = (op0 == op1); + vec_perm_indices sel (builder, single_arg ? 1 : 2, nelts); + } + (if (sel.series_p (0, 1, 0, 1)) + { op0; } + (if (sel.series_p (0, 1, nelts, 1)) + { op1; } + (with + { + if (!single_arg) + { + if (sel.all_from_input_p (0)) + op1 = op0; + else if (sel.all_from_input_p (1)) + { + op0 = op1; + sel.rotate_inputs (1); + } + } + gassign *def; + tree cop0 = op0, cop1 = op1; + if (TREE_CODE (op0) == SSA_NAME + && (def = dyn_cast (SSA_NAME_DEF_STMT (op0))) + && gimple_assign_rhs_code (def) == CONSTRUCTOR) + cop0 = gimple_assign_rhs1 (def); + if (TREE_CODE (op1) == SSA_NAME + && (def = dyn_cast (SSA_NAME_DEF_STMT (op1))) + && gimple_assign_rhs_code (def) == CONSTRUCTOR) + cop1 = gimple_assign_rhs1 (def); + + tree t; + } + (if ((TREE_CODE (cop0) == VECTOR_CST + || TREE_CODE (cop0) == CONSTRUCTOR) + && (TREE_CODE (cop1) == VECTOR_CST + || TREE_CODE (cop1) == CONSTRUCTOR) + && (t = fold_vec_perm (type, cop0, cop1, sel))) + { t; } + (with + { + bool changed = (op0 == op1 && !single_arg); + + /* Generate a canonical form of the selector. */ + if (sel.encoding () != builder) + { + /* Some targets are deficient and fail to expand a single + argument permutation while still allowing an equivalent + 2-argument version. */ + tree oldop2 = op2; + if (sel.ninputs () == 2 + || can_vec_perm_const_p (TYPE_MODE (type), sel, false)) + op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel); + else + { + vec_perm_indices sel2 (builder, 2, nelts); + if (can_vec_perm_const_p (TYPE_MODE (type), sel2, false)) + op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel2); + else + /* Not directly supported with either encoding, + so use the preferred form. 
*/ + op2 = vec_perm_indices_to_tree (TREE_TYPE (op2), sel); + } + /* Differences in the encoder do not necessarily mean + differences in the resulting vector. */ + changed = !operand_equal_p (op2, oldop2, 0); + } + } + (if (changed) + (vec_perm { op0; } { op1; } { op2; }))))))))) diff --git a/gcc/mode-switching.c b/gcc/mode-switching.c index 2ff21a400..4a34d4a2b 100644 --- a/gcc/mode-switching.c +++ b/gcc/mode-switching.c @@ -165,7 +165,7 @@ new_seginfo (int mode, rtx_insn *insn, int bb, HARD_REG_SET regs_live) ptr->insn_ptr = insn; ptr->bbnum = bb; ptr->next = NULL; - COPY_HARD_REG_SET (ptr->regs_live, regs_live); + ptr->regs_live = regs_live; return ptr; } @@ -637,7 +637,7 @@ optimize_mode_switching (void) if (REG_NOTE_KIND (link) == REG_DEAD) reg_dies (XEXP (link, 0), &live_now); - note_stores (PATTERN (insn), reg_becomes_live, &live_now); + note_stores (insn, reg_becomes_live, &live_now); for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) if (REG_NOTE_KIND (link) == REG_UNUSED) reg_dies (XEXP (link, 0), &live_now); diff --git a/gcc/omp-simd-clone.c b/gcc/omp-simd-clone.c index 10490f34f..d884514cc 100644 --- a/gcc/omp-simd-clone.c +++ b/gcc/omp-simd-clone.c @@ -461,8 +461,7 @@ simd_clone_create (struct cgraph_node *old_node) if (new_node == NULL) return new_node; - DECL_BUILT_IN_CLASS (new_node->decl) = NOT_BUILT_IN; - DECL_FUNCTION_CODE (new_node->decl) = (enum built_in_function) 0; + set_decl_built_in_function (new_node->decl, NOT_BUILT_IN, 0); TREE_PUBLIC (new_node->decl) = TREE_PUBLIC (old_node->decl); DECL_COMDAT (new_node->decl) = DECL_COMDAT (old_node->decl); DECL_WEAK (new_node->decl) = DECL_WEAK (old_node->decl); diff --git a/gcc/opt-suggestions.c b/gcc/opt-suggestions.c index a820c78ff..1ec94203c 100644 --- a/gcc/opt-suggestions.c +++ b/gcc/opt-suggestions.c @@ -307,7 +307,6 @@ test_completion_valid_options (option_proposer &proposer) "-Wassign-intercept", "-Wno-format-security", "-fno-sched-stalled-insns", - "-fbtr-bb-exclusive", "-fno-tree-tail-merge", "-Wlong-long", "-Wno-unused-but-set-parameter", diff --git a/gcc/optabs-tree.c b/gcc/optabs-tree.c index 341e02bd5..7bad9c87b 100644 --- a/gcc/optabs-tree.c +++ b/gcc/optabs-tree.c @@ -267,20 +267,16 @@ optab_for_tree_code (enum tree_code code, const_tree type, Convert operations we currently support directly are FIX_TRUNC and FLOAT. This function checks if these operations are supported - by the target platform either directly (via vector tree-codes), or via - target builtins. + by the target platform directly (via vector tree-codes). Output: - CODE1 is code of vector operation to be used when - vectorizing the operation, if available. - - DECL is decl of target builtin functions to be used - when vectorizing the operation, if available. In this case, - CODE1 is CALL_EXPR. */ + vectorizing the operation, if available. */ bool supportable_convert_operation (enum tree_code code, tree vectype_out, tree vectype_in, - tree *decl, enum tree_code *code1) + enum tree_code *code1) { machine_mode m1,m2; bool truncp; @@ -314,15 +310,6 @@ supportable_convert_operation (enum tree_code code, return true; } - /* Now check for builtin. 
*/ - if (targetm.vectorize.builtin_conversion - && targetm.vectorize.builtin_conversion (code, vectype_out, vectype_in)) - { - *code1 = CALL_EXPR; - *decl = targetm.vectorize.builtin_conversion (code, vectype_out, - vectype_in); - return true; - } return false; } diff --git a/gcc/optabs-tree.h b/gcc/optabs-tree.h index 5e4848997..dac350142 100644 --- a/gcc/optabs-tree.h +++ b/gcc/optabs-tree.h @@ -36,7 +36,7 @@ enum optab_subtype the second argument. The third argument distinguishes between the types of vector shifts and rotates. */ optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype); -bool supportable_convert_operation (enum tree_code, tree, tree, tree *, +bool supportable_convert_operation (enum tree_code, tree, tree, enum tree_code *); bool expand_vec_cmp_expr_p (tree, tree, enum tree_code); bool expand_vec_cond_expr_p (tree, tree, enum tree_code); diff --git a/gcc/optabs.c b/gcc/optabs.c index c2c1274eb..d9788d248 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -3727,7 +3727,7 @@ emit_libcall_block_1 (rtx_insn *insns, rtx target, rtx result, rtx equiv, data.first = insns; data.insn = insn; data.must_stay = 0; - note_stores (PATTERN (insn), no_conflict_move_test, &data); + note_stores (insn, no_conflict_move_test, &data); if (! data.must_stay) { if (PREV_INSN (insn)) @@ -6428,7 +6428,7 @@ expand_atomic_compare_and_swap (rtx *ptarget_bool, rtx *ptarget_oval, /* Otherwise, work out if the compare-and-swap succeeded. */ cc_reg = NULL_RTX; if (have_insn_for (COMPARE, CCmode)) - note_stores (PATTERN (get_last_insn ()), find_cc_set, &cc_reg); + note_stores (get_last_insn (), find_cc_set, &cc_reg); if (cc_reg) { target_bool = emit_store_flag_force (target_bool, EQ, cc_reg, @@ -7181,18 +7181,16 @@ static bool maybe_legitimize_operand (enum insn_code icode, unsigned int opno, struct expand_operand *op) { - machine_mode mode, imode; - bool old_volatile_ok, result; + machine_mode mode, imode, tmode; mode = op->mode; switch (op->type) { case EXPAND_FIXED: - old_volatile_ok = volatile_ok; - volatile_ok = true; - result = maybe_legitimize_operand_same_code (icode, opno, op); - volatile_ok = old_volatile_ok; - return result; + { + temporary_volatile_ok v (true); + return maybe_legitimize_operand_same_code (icode, opno, op); + } case EXPAND_OUTPUT: gcc_assert (mode != VOIDmode); @@ -7230,9 +7228,17 @@ maybe_legitimize_operand (enum insn_code icode, unsigned int opno, gcc_assert (mode != VOIDmode); imode = insn_data[(int) icode].operand[opno].mode; + tmode = (VECTOR_MODE_P (imode) && !VECTOR_MODE_P (mode) + ? 
GET_MODE_INNER (imode) : imode); + if (tmode != VOIDmode && tmode != mode) + { + op->value = convert_modes (tmode, mode, op->value, op->unsigned_p); + mode = tmode; + } if (imode != VOIDmode && imode != mode) { - op->value = convert_modes (imode, mode, op->value, op->unsigned_p); + gcc_assert (VECTOR_MODE_P (imode) && !VECTOR_MODE_P (mode)); + op->value = expand_vector_broadcast (imode, op->value); mode = imode; } goto input; diff --git a/gcc/optabs.def b/gcc/optabs.def index 8af3a2f43..912766656 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -230,6 +230,9 @@ OPTAB_D (cond_umod_optab, "cond_umod$a") OPTAB_D (cond_and_optab, "cond_and$a") OPTAB_D (cond_ior_optab, "cond_ior$a") OPTAB_D (cond_xor_optab, "cond_xor$a") +OPTAB_D (cond_ashl_optab, "cond_ashl$a") +OPTAB_D (cond_ashr_optab, "cond_ashr$a") +OPTAB_D (cond_lshr_optab, "cond_lshr$a") OPTAB_D (cond_smin_optab, "cond_smin$a") OPTAB_D (cond_smax_optab, "cond_smax$a") OPTAB_D (cond_umin_optab, "cond_umin$a") @@ -256,7 +259,7 @@ OPTAB_D (umul_highpart_optab, "umul$a3_highpart") OPTAB_D (cmpmem_optab, "cmpmem$a") OPTAB_D (cmpstr_optab, "cmpstr$a") OPTAB_D (cmpstrn_optab, "cmpstrn$a") -OPTAB_D (movmem_optab, "movmem$a") +OPTAB_D (cpymem_optab, "cpymem$a") OPTAB_D (setmem_optab, "setmem$a") OPTAB_D (strlen_optab, "strlen$a") @@ -323,6 +326,7 @@ OPTAB_D (reduc_and_scal_optab, "reduc_and_scal_$a") OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a") OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a") OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a") +OPTAB_D (mask_fold_left_plus_optab, "mask_fold_left_plus_$a") OPTAB_D (extract_last_optab, "extract_last_$a") OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a") @@ -337,6 +341,11 @@ OPTAB_D (udot_prod_optab, "udot_prod$I$a") OPTAB_D (usum_widen_optab, "widen_usum$I$a3") OPTAB_D (usad_optab, "usad$I$a") OPTAB_D (ssad_optab, "ssad$I$a") +OPTAB_D (smulhs_optab, "smulhs$a3") +OPTAB_D (smulhrs_optab, "smulhrs$a3") +OPTAB_D (umulhs_optab, "umulhs$a3") +OPTAB_D (umulhrs_optab, "umulhrs$a3") +OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3") OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a") OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a") OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a") diff --git a/gcc/optabs.h b/gcc/optabs.h index 17b5dfb67..18dec50f5 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -128,7 +128,11 @@ create_convert_operand_to (struct expand_operand *op, rtx value, /* Make OP describe an input operand that should have the same value as VALUE, after any mode conversion that the backend might request. If VALUE is a CONST_INT, it should be treated as having mode MODE. - UNSIGNED_P says whether VALUE is unsigned. */ + UNSIGNED_P says whether VALUE is unsigned. + + The conversion of VALUE can include a combination of numerical + conversion (as for convert_modes) and duplicating a scalar to fill + a vector (if VALUE is a scalar but the operand is a vector). */ static inline void create_convert_operand_from (struct expand_operand *op, rtx value, diff --git a/gcc/opts-global.c b/gcc/opts-global.c index 4f8aac7e9..6e4f2d528 100644 --- a/gcc/opts-global.c +++ b/gcc/opts-global.c @@ -255,6 +255,7 @@ init_options_once (void) construct their pretty-printers means that all previous settings are overriden. 
*/ diagnostic_color_init (global_dc); + diagnostic_urls_init (global_dc); } /* Decode command-line options to an array, like diff --git a/gcc/opts.c b/gcc/opts.c index 494be7a9f..a8db491b5 100644 --- a/gcc/opts.c +++ b/gcc/opts.c @@ -465,7 +465,6 @@ static const struct default_options default_options_table[] = { OPT_LEVELS_1_PLUS, OPT_ftree_copy_prop, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_dce, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_dominator_opts, NULL, 1 }, - { OPT_LEVELS_1_PLUS, OPT_ftree_dse, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_fre, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_sink, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_slsr, NULL, 1 }, @@ -476,14 +475,16 @@ static const struct default_options default_options_table[] = #if DELAY_SLOTS { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fdelayed_branch, NULL, 1 }, #endif + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fdse, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fif_conversion, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fif_conversion2, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_finline_functions_called_once, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fmove_loop_invariants, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fssa_phiopt, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_bit_ccp, NULL, 1 }, - { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_sra, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_dse, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_pta, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_sra, NULL, 1 }, /* -O2 and -Os optimizations. */ { OPT_LEVELS_2_PLUS, OPT_fcaller_saves, NULL, 1 }, @@ -521,6 +522,7 @@ static const struct default_options default_options_table[] = { OPT_LEVELS_2_PLUS, OPT_ftree_tail_merge, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_vrp, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_CHEAP }, + { OPT_LEVELS_2_PLUS, OPT_finline_functions, NULL, 1 }, /* -O2 and -Os optimizations. */ { OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_falign_functions, NULL, 1 }, @@ -536,9 +538,6 @@ static const struct default_options default_options_table[] = #endif /* -O3 and -Os optimizations. */ - /* Inlining of functions reducing size is a good idea with -Os - regardless of them being declared inline. */ - { OPT_LEVELS_3_PLUS_AND_SIZE, OPT_finline_functions, NULL, 1 }, /* -O3 optimizations. */ { OPT_LEVELS_3_PLUS, OPT_fgcse_after_reload, NULL, 1 }, @@ -2400,6 +2399,10 @@ common_handle_option (struct gcc_options *opts, diagnostic_color_init (dc, value); break; + case OPT_fdiagnostics_urls_: + diagnostic_urls_init (dc, value); + break; + case OPT_fdiagnostics_format_: diagnostic_output_format_init (dc, (enum diagnostics_output_format)value); diff --git a/gcc/params.def b/gcc/params.def index 08c709636..0ef092214 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -61,8 +61,13 @@ DEFPARAM (PARAM_PREDICTABLE_BRANCH_OUTCOME, DEFPARAM (PARAM_INLINE_MIN_SPEEDUP, "inline-min-speedup", + "The minimal estimated speedup allowing inliner to ignore inline-insns-single and inline-insns-auto with -O3 and -Ofast.", + 15, 0, 100) + +DEFPARAM (PARAM_INLINE_MIN_SPEEDUP_O2, + "inline-min-speedup-O2", "The minimal estimated speedup allowing inliner to ignore inline-insns-single and inline-insns-auto.", - 15, 0, 0) + 30, 0, 100) /* The single function inlining limit. This is the maximum size of a function counted in internal gcc instructions (not in @@ -77,9 +82,14 @@ DEFPARAM (PARAM_INLINE_MIN_SPEEDUP, gets decreased. 
*/ DEFPARAM (PARAM_MAX_INLINE_INSNS_SINGLE, "max-inline-insns-single", - "The maximum number of instructions in a single function eligible for inlining.", + "The maximum number of instructions in a single function eligible for inlining with -O3 and -Ofast.", 200, 0, 0) +DEFPARAM (PARAM_MAX_INLINE_INSNS_SINGLE_O2, + "max-inline-insns-single-O2", + "The maximum number of instructions in a single function eligible for inlining.", + 30, 0, 0) + /* The single function inlining limit for functions that are inlined by virtue of -finline-functions (-O3). This limit should be chosen to be below or equal to the limit @@ -89,9 +99,14 @@ DEFPARAM (PARAM_MAX_INLINE_INSNS_SINGLE, The default value is 30. */ DEFPARAM (PARAM_MAX_INLINE_INSNS_AUTO, "max-inline-insns-auto", - "The maximum number of instructions when automatically inlining.", + "The maximum number of instructions when automatically inlining with -O3 and -Ofast.", 30, 0, 0) +DEFPARAM (PARAM_MAX_INLINE_INSNS_AUTO_O2, + "max-inline-insns-auto-O2", + "The maximum number of instructions when automatically inlining.", + 15, 0, 0) + DEFPARAM (PARAM_MAX_INLINE_INSNS_SMALL, "max-inline-insns-small", "The maximum number of instructions when automatically inlining small functions.", @@ -243,8 +258,12 @@ DEFPARAM(PARAM_IPCP_UNIT_GROWTH, 10, 0, 0) DEFPARAM(PARAM_EARLY_INLINING_INSNS, "early-inlining-insns", - "Maximal estimated growth of function body caused by early inlining of single call.", + "Maximal estimated growth of function body caused by early inlining of single call with -O3 and -Ofast.", 14, 0, 0) +DEFPARAM(PARAM_EARLY_INLINING_INSNS_O2, + "early-inlining-insns-O2", + "Maximal estimated growth of function body caused by early inlining of single call with -O1 and -O2.", + 6, 0, 0) DEFPARAM(PARAM_LARGE_STACK_FRAME, "large-stack-frame", "The size of stack frame to be considered large.", diff --git a/gcc/passes.def b/gcc/passes.def index 901dbef93..a03685500 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -459,7 +459,6 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_split_after_reload); NEXT_PASS (pass_ree); NEXT_PASS (pass_compare_elim_after_reload); - NEXT_PASS (pass_branch_target_load_optimize1); NEXT_PASS (pass_thread_prologue_and_epilogue); NEXT_PASS (pass_rtl_dse2); NEXT_PASS (pass_stack_adjustments); @@ -472,7 +471,6 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_cprop_hardreg); NEXT_PASS (pass_fast_rtl_dce); NEXT_PASS (pass_reorder_blocks); - NEXT_PASS (pass_branch_target_load_optimize2); NEXT_PASS (pass_leaf_regs); NEXT_PASS (pass_split_before_sched2); NEXT_PASS (pass_sched2); diff --git a/gcc/postreload-gcse.c b/gcc/postreload-gcse.c index a165351ca..bc2e8fc91 100644 --- a/gcc/postreload-gcse.c +++ b/gcc/postreload-gcse.c @@ -672,7 +672,7 @@ load_killed_in_block_p (int uid_limit, rtx x, bool after_insn) It will set mems_conflict_p to nonzero if there may be a conflict between X and SETTER. */ mems_conflict_p = 0; - note_stores (PATTERN (setter), find_mem_conflicts, x); + note_stores (setter, find_mem_conflicts, x); if (mems_conflict_p) return 1; @@ -774,7 +774,7 @@ record_opr_changes (rtx_insn *insn) rtx note; /* Find all stores and record them. */ - note_stores (PATTERN (insn), record_last_set_info, insn); + note_stores (insn, record_last_set_info, insn); /* Also record autoincremented REGs for this insn as changed. 
*/ for (note = REG_NOTES (insn); note; note = XEXP (note, 1)) @@ -785,25 +785,10 @@ record_opr_changes (rtx_insn *insn) if (CALL_P (insn)) { unsigned int regno; - rtx link, x; hard_reg_set_iterator hrsi; EXECUTE_IF_SET_IN_HARD_REG_SET (regs_invalidated_by_call, 0, regno, hrsi) record_last_reg_set_info_regno (insn, regno); - for (link = CALL_INSN_FUNCTION_USAGE (insn); link; link = XEXP (link, 1)) - { - gcc_assert (GET_CODE (XEXP (link, 0)) != CLOBBER_HIGH); - if (GET_CODE (XEXP (link, 0)) == CLOBBER) - { - x = XEXP (XEXP (link, 0), 0); - if (REG_P (x)) - { - gcc_assert (HARD_REGISTER_P (x)); - record_last_reg_set_info (insn, x); - } - } - } - if (! RTL_CONST_OR_PURE_CALL_P (insn)) record_last_mem_set_info (insn); } diff --git a/gcc/postreload.c b/gcc/postreload.c index b76c7b0b7..ee0dc6ae8 100644 --- a/gcc/postreload.c +++ b/gcc/postreload.c @@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see #include "cselib.h" #include "tree-pass.h" #include "dbgcnt.h" +#include "function-abi.h" static int reload_cse_noop_set_p (rtx); static bool reload_cse_simplify (rtx_insn *, rtx); @@ -133,8 +134,6 @@ reload_cse_simplify (rtx_insn *insn, rtx testreg) for (i = XVECLEN (body, 0) - 1; i >= 0; --i) { rtx part = XVECEXP (body, 0, i); - /* asms can only have full clobbers, not clobber_highs. */ - gcc_assert (GET_CODE (part) != CLOBBER_HIGH); if (GET_CODE (part) == CLOBBER && REG_P (XEXP (part, 0))) cselib_invalidate_rtx (XEXP (part, 0)); } @@ -157,9 +156,7 @@ reload_cse_simplify (rtx_insn *insn, rtx testreg) value = SET_DEST (part); } } - else if (GET_CODE (part) != CLOBBER - && GET_CODE (part) != CLOBBER_HIGH - && GET_CODE (part) != USE) + else if (GET_CODE (part) != CLOBBER && GET_CODE (part) != USE) break; } @@ -1139,7 +1136,7 @@ reload_combine_recognize_pattern (rtx_insn *insn) if (TEST_HARD_REG_BIT (reg_class_contents[INDEX_REG_CLASS], i) && reg_state[i].use_index == RELOAD_COMBINE_MAX_USES && reg_state[i].store_ruid <= reg_state[regno].use_ruid - && (call_used_regs[i] || df_regs_ever_live_p (i)) + && (call_used_or_fixed_reg_p (i) || df_regs_ever_live_p (i)) && (!frame_pointer_needed || i != HARD_FRAME_POINTER_REGNUM) && !fixed_regs[i] && !global_regs[i] && hard_regno_nregs (i, GET_MODE (reg)) == 1 @@ -1271,8 +1268,8 @@ reload_combine (void) REG_SET_TO_HARD_REG_SET (live, live_in); compute_use_by_pseudos (&live, live_in); - COPY_HARD_REG_SET (LABEL_LIVE (insn), live); - IOR_HARD_REG_SET (ever_live_at_start, live); + LABEL_LIVE (insn) = live; + ever_live_at_start |= live; } } @@ -1329,14 +1326,15 @@ reload_combine (void) || reload_combine_recognize_pattern (insn)) continue; - note_stores (PATTERN (insn), reload_combine_note_store, NULL); + note_stores (insn, reload_combine_note_store, NULL); if (CALL_P (insn)) { rtx link; - HARD_REG_SET used_regs; - - get_call_reg_set_usage (insn, &used_regs, call_used_reg_set); + HARD_REG_SET used_regs = insn_callee_abi (insn).full_reg_clobbers (); + /* ??? This preserves traditional behavior; it might not be + needed. */ + used_regs |= fixed_reg_set; for (r = 0; r < FIRST_PSEUDO_REGISTER; r++) if (TEST_HARD_REG_BIT (used_regs, r)) @@ -1350,22 +1348,12 @@ reload_combine (void) { rtx setuse = XEXP (link, 0); rtx usage_rtx = XEXP (setuse, 0); - /* We could support CLOBBER_HIGH and treat it in the same way as - HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that yet. 
*/ - gcc_assert (GET_CODE (setuse) != CLOBBER_HIGH); - if ((GET_CODE (setuse) == USE || GET_CODE (setuse) == CLOBBER) - && REG_P (usage_rtx)) + if (GET_CODE (setuse) == USE && REG_P (usage_rtx)) { unsigned int end_regno = END_REGNO (usage_rtx); for (unsigned int i = REGNO (usage_rtx); i < end_regno; ++i) - if (GET_CODE (XEXP (link, 0)) == CLOBBER) - { - reg_state[i].use_index = RELOAD_COMBINE_MAX_USES; - reg_state[i].store_ruid = reload_combine_ruid; - } - else - reg_state[i].use_index = -1; + reg_state[i].use_index = -1; } } } @@ -1529,10 +1517,6 @@ reload_combine_note_use (rtx *xp, rtx_insn *insn, int ruid, rtx containing_mem) } break; - case CLOBBER_HIGH: - gcc_assert (REG_P (SET_DEST (x))); - return; - case PLUS: /* We are interested in (plus (reg) (const_int)) . */ if (!REG_P (XEXP (x, 0)) @@ -2108,7 +2092,7 @@ reload_cse_move2add (rtx_insn *first) } } } - note_stores (PATTERN (insn), move2add_note_store, insn); + note_stores (insn, move2add_note_store, insn); /* If INSN is a conditional branch, we try to extract an implicit set out of it. */ @@ -2138,32 +2122,12 @@ reload_cse_move2add (rtx_insn *first) unknown values. */ if (CALL_P (insn)) { - rtx link; - for (i = FIRST_PSEUDO_REGISTER - 1; i >= 0; i--) { - if (call_used_regs[i]) + if (call_used_or_fixed_reg_p (i)) /* Reset the information about this register. */ reg_mode[i] = VOIDmode; } - - for (link = CALL_INSN_FUNCTION_USAGE (insn); link; - link = XEXP (link, 1)) - { - rtx setuse = XEXP (link, 0); - rtx usage_rtx = XEXP (setuse, 0); - /* CALL_INSN_FUNCTION_USAGEs can only have full clobbers, not - clobber_highs. */ - gcc_assert (GET_CODE (setuse) != CLOBBER_HIGH); - if (GET_CODE (setuse) == CLOBBER - && REG_P (usage_rtx)) - { - unsigned int end_regno = END_REGNO (usage_rtx); - for (unsigned int r = REGNO (usage_rtx); r < end_regno; ++r) - /* Reset the information about this register. */ - reg_mode[r] = VOIDmode; - } - } } } return changed; @@ -2317,13 +2281,6 @@ move2add_note_store (rtx dst, const_rtx set, void *data) move2add_record_mode (dst); } - else if (GET_CODE (set) == CLOBBER_HIGH) - { - /* Only invalidate if actually clobbered. */ - if (reg_mode[regno] == BLKmode - || reg_is_clobbered_by_clobber_high (regno, reg_mode[regno], dst)) - goto invalidate; - } else { invalidate: diff --git a/gcc/predict.c b/gcc/predict.c index eaab47f99..03dd4ddfa 100644 --- a/gcc/predict.c +++ b/gcc/predict.c @@ -2450,7 +2450,7 @@ expr_expected_value_1 (tree type, tree op0, enum tree_code code, return NULL; } - if (DECL_IS_MALLOC (decl) || DECL_IS_OPERATOR_NEW (decl)) + if (DECL_IS_MALLOC (decl) || DECL_IS_OPERATOR_NEW_P (decl)) { if (predictor) *predictor = PRED_MALLOC_NONNULL; diff --git a/gcc/pretty-print.c b/gcc/pretty-print.c index 6948971ce..5af7ca764 100644 --- a/gcc/pretty-print.c +++ b/gcc/pretty-print.c @@ -1579,7 +1579,8 @@ pretty_printer::pretty_printer (int maximum_length) emitted_prefix (), need_newline (), translate_identifiers (true), - show_color () + show_color (), + show_urls (false) { pp_line_cutoff (this) = maximum_length; /* By default, we emit prefixes once per message. */ @@ -2028,6 +2029,41 @@ identifier_to_locale (const char *ident) } } +/* Support for encoding URLs. + See egmontkob/Hyperlinks_in_Terminal_Emulators.md + ( https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda ). + + > A hyperlink is opened upon encountering an OSC 8 escape sequence with + > the target URI. 
The syntax is + > + > OSC 8 ; params ; URI ST + > + > A hyperlink is closed with the same escape sequence, omitting the + > parameters and the URI but keeping the separators: + > + > OSC 8 ; ; ST + > + > OSC (operating system command) is typically ESC ]. */ + +/* If URL-printing is enabled, write an "open URL" escape sequence to PP + for the given URL. */ + +void +pp_begin_url (pretty_printer *pp, const char *url) +{ + if (pp->show_urls) + pp_printf (pp, "\33]8;;%s\33\\", url); +} + +/* If URL-printing is enabled, write a "close URL" escape sequence to PP. */ + +void +pp_end_url (pretty_printer *pp) +{ + if (pp->show_urls) + pp_string (pp, "\33]8;;\33\\"); +} + #if CHECKING_P namespace selftest { @@ -2312,6 +2348,32 @@ test_prefixes_and_wrapping () } +/* Verify that URL-printing works as expected. */ + +void +test_urls () +{ + { + pretty_printer pp; + pp.show_urls = false; + pp_begin_url (&pp, "http://example.com"); + pp_string (&pp, "This is a link"); + pp_end_url (&pp); + ASSERT_STREQ ("This is a link", + pp_formatted_text (&pp)); + } + + { + pretty_printer pp; + pp.show_urls = true; + pp_begin_url (&pp, "http://example.com"); + pp_string (&pp, "This is a link"); + pp_end_url (&pp); + ASSERT_STREQ ("\33]8;;http://example.com\33\\This is a link\33]8;;\33\\", + pp_formatted_text (&pp)); + } +} + /* Run all of the selftests within this file. */ void @@ -2320,6 +2382,7 @@ pretty_print_c_tests () test_basic_printing (); test_pp_format (); test_prefixes_and_wrapping (); + test_urls (); } } // namespace selftest diff --git a/gcc/pretty-print.h b/gcc/pretty-print.h index e4df65907..07cd39176 100644 --- a/gcc/pretty-print.h +++ b/gcc/pretty-print.h @@ -271,6 +271,9 @@ struct pretty_printer /* Nonzero means that text should be colorized. */ bool show_color; + + /* Nonzero means that URLs should be emitted. */ + bool show_urls; }; static inline const char * @@ -391,6 +394,9 @@ extern void pp_maybe_space (pretty_printer *); extern void pp_begin_quote (pretty_printer *, bool); extern void pp_end_quote (pretty_printer *, bool); +extern void pp_begin_url (pretty_printer *pp, const char *url); +extern void pp_end_url (pretty_printer *pp); + /* Switch into verbatim mode and return the old mode. 
*/ static inline pp_wrapping_mode_t pp_set_verbatim_wrapping_ (pretty_printer *pp) diff --git a/gcc/print-rtl.c b/gcc/print-rtl.c index fbb108568..01f281604 100644 --- a/gcc/print-rtl.c +++ b/gcc/print-rtl.c @@ -1756,7 +1756,6 @@ print_pattern (pretty_printer *pp, const_rtx x, int verbose) print_exp (pp, x, verbose); break; case CLOBBER: - case CLOBBER_HIGH: case USE: pp_printf (pp, "%s ", GET_RTX_NAME (GET_CODE (x))); print_value (pp, XEXP (x, 0), verbose); diff --git a/gcc/print-tree.c b/gcc/print-tree.c index 81b66a189..7c0d05548 100644 --- a/gcc/print-tree.c +++ b/gcc/print-tree.c @@ -517,7 +517,11 @@ print_node (FILE *file, const char *prefix, tree node, int indent, if (code == FUNCTION_DECL && fndecl_built_in_p (node)) { if (DECL_BUILT_IN_CLASS (node) == BUILT_IN_MD) - fprintf (file, " built-in: BUILT_IN_MD:%d", DECL_FUNCTION_CODE (node)); + fprintf (file, " built-in: BUILT_IN_MD:%d", + DECL_MD_FUNCTION_CODE (node)); + else if (DECL_BUILT_IN_CLASS (node) == BUILT_IN_FRONTEND) + fprintf (file, " built-in: BUILT_IN_FRONTEND:%d", + DECL_FE_FUNCTION_CODE (node)); else fprintf (file, " built-in: %s:%s", built_in_class_names[(int) DECL_BUILT_IN_CLASS (node)], diff --git a/gcc/read-md.h b/gcc/read-md.h index 18426f71d..327f378ea 100644 --- a/gcc/read-md.h +++ b/gcc/read-md.h @@ -337,6 +337,7 @@ class rtx_reader : public md_reader ~rtx_reader (); bool read_rtx (const char *rtx_name, vec *rtxen); + rtx rtx_alloc_for_name (const char *); rtx read_rtx_code (const char *code_name); virtual rtx read_rtx_operand (rtx return_rtx, int idx); rtx read_nested_rtx (); diff --git a/gcc/read-rtl-function.c b/gcc/read-rtl-function.c index 53f7a94c1..ded407737 100644 --- a/gcc/read-rtl-function.c +++ b/gcc/read-rtl-function.c @@ -41,6 +41,8 @@ along with GCC; see the file COPYING3. If not see #include "read-rtl-function.h" #include "selftest.h" #include "selftest-rtl.h" +#include "regs.h" +#include "function-abi.h" /* Forward decls. */ class function_reader; @@ -1610,6 +1612,7 @@ bool read_rtl_function_body (const char *path) { initialize_rtl (); + crtl->abi = &default_function_abi; init_emit (); init_varasm_status (); @@ -1643,6 +1646,7 @@ read_rtl_function_body_from_file_range (location_t start_loc, } initialize_rtl (); + crtl->abi = &fndecl_abi (cfun->decl).base_abi (); init_emit (); init_varasm_status (); diff --git a/gcc/read-rtl.c b/gcc/read-rtl.c index 1af51f686..6b1b811cb 100644 --- a/gcc/read-rtl.c +++ b/gcc/read-rtl.c @@ -194,22 +194,31 @@ static const compact_insn_name compact_insn_names[] = { { NOTE, "cnote" } }; -/* Implementations of the iterator_group callbacks for codes. */ +/* Return the rtx code for NAME, or UNKNOWN if NAME isn't a valid rtx code. */ -static int -find_code (const char *name) +static rtx_code +maybe_find_code (const char *name) { - int i; - - for (i = 0; i < NUM_RTX_CODE; i++) + for (int i = 0; i < NUM_RTX_CODE; i++) if (strcmp (GET_RTX_NAME (i), name) == 0) - return i; + return (rtx_code) i; - for (i = 0; i < (signed)ARRAY_SIZE (compact_insn_names); i++) + for (int i = 0; i < (signed)ARRAY_SIZE (compact_insn_names); i++) if (strcmp (compact_insn_names[i].name, name) == 0) return compact_insn_names[i].code; - fatal_with_file_and_line ("unknown rtx code `%s'", name); + return UNKNOWN; +} + +/* Implementations of the iterator_group callbacks for codes. 
*/ + +static int +find_code (const char *name) +{ + rtx_code code = maybe_find_code (name); + if (code == UNKNOWN) + fatal_with_file_and_line ("unknown rtx code `%s'", name); + return code; } static void @@ -277,9 +286,11 @@ apply_subst_iterator (rtx rt, unsigned int, int value) return; gcc_assert (GET_CODE (rt) == DEFINE_INSN || GET_CODE (rt) == DEFINE_INSN_AND_SPLIT + || GET_CODE (rt) == DEFINE_INSN_AND_REWRITE || GET_CODE (rt) == DEFINE_EXPAND); - int attrs = GET_CODE (rt) == DEFINE_INSN_AND_SPLIT ? 7 : 4; + int attrs = (GET_CODE (rt) == DEFINE_INSN_AND_SPLIT ? 7 + : GET_CODE (rt) == DEFINE_INSN_AND_REWRITE ? 6 : 4); attrs_vec = XVEC (rt, attrs); /* If we've already added attribute 'current_iterator_name', then we @@ -540,6 +551,7 @@ add_condition_to_rtx (rtx x, const char *extra) break; case DEFINE_INSN_AND_SPLIT: + case DEFINE_INSN_AND_REWRITE: XSTR (x, 2) = add_condition_to_string (XSTR (x, 2), extra); XSTR (x, 4) = add_condition_to_string (XSTR (x, 4), extra); break; @@ -623,6 +635,7 @@ named_rtx_p (rtx x) case DEFINE_EXPAND: case DEFINE_INSN: case DEFINE_INSN_AND_SPLIT: + case DEFINE_INSN_AND_REWRITE: return true; default: @@ -1306,7 +1319,37 @@ check_code_iterator (struct mapping *iterator) for (v = iterator->values->next; v != 0; v = v->next) if (strcmp (GET_RTX_FORMAT (bellwether), GET_RTX_FORMAT (v->number)) != 0) fatal_with_file_and_line ("code iterator `%s' combines " - "different rtx formats", iterator->name); + "`%s' and `%s', which have different " + "rtx formats", iterator->name, + GET_RTX_NAME (bellwether), + GET_RTX_NAME (v->number)); +} + +/* Check that all values of attribute ATTR are rtx codes that have a + consistent format. Return a representative code. */ + +static rtx_code +check_code_attribute (mapping *attr) +{ + rtx_code bellwether = UNKNOWN; + for (map_value *v = attr->values; v != 0; v = v->next) + { + rtx_code code = maybe_find_code (v->string); + if (code == UNKNOWN) + fatal_with_file_and_line ("code attribute `%s' contains " + "unrecognized rtx code `%s'", + attr->name, v->string); + if (bellwether == UNKNOWN) + bellwether = code; + else if (strcmp (GET_RTX_FORMAT (bellwether), + GET_RTX_FORMAT (code)) != 0) + fatal_with_file_and_line ("code attribute `%s' combines " + "`%s' and `%s', which have different " + "rtx formats", attr->name, + GET_RTX_NAME (bellwether), + GET_RTX_NAME (code)); + } + return bellwether; } /* Read an rtx-related declaration from the MD file, given that it @@ -1467,6 +1510,54 @@ parse_reg_note_name (const char *string) fatal_with_file_and_line ("unrecognized REG_NOTE name: `%s'", string); } +/* Allocate an rtx for code NAME. If NAME is a code iterator or code + attribute, record its use for later and use one of its possible + values as an interim rtx code. */ + +rtx +rtx_reader::rtx_alloc_for_name (const char *name) +{ +#ifdef GENERATOR_FILE + size_t len = strlen (name); + if (name[0] == '<' && name[len - 1] == '>') + { + /* Copy the attribute string into permanent storage, without the + angle brackets around it. */ + obstack *strings = get_string_obstack (); + obstack_grow0 (strings, name + 1, len - 2); + char *deferred_name = XOBFINISH (strings, char *); + + /* Find the name of the attribute. */ + const char *attr = strchr (deferred_name, ':'); + if (!attr) + attr = deferred_name; + + /* Find the attribute itself. 
*/ + mapping *m = (mapping *) htab_find (codes.attrs, &attr); + if (!m) + fatal_with_file_and_line ("unknown code attribute `%s'", attr); + + /* Pick the first possible code for now, and record the attribute + use for later. */ + rtx x = rtx_alloc (check_code_attribute (m)); + record_attribute_use (&codes, x, 0, deferred_name); + return x; + } + + mapping *iterator = (mapping *) htab_find (codes.iterators, &name); + if (iterator != 0) + { + /* Pick the first possible code for now, and record the iterator + use for later. */ + rtx x = rtx_alloc (rtx_code (iterator->values->number)); + record_iterator_use (iterator, x, 0); + return x; + } +#endif + + return rtx_alloc (rtx_code (codes.find_builtin (name))); +} + /* Subroutine of read_rtx and read_nested_rtx. CODE_NAME is the name of either an rtx code or a code iterator. Parse the rest of the rtx and return it. */ @@ -1475,7 +1566,6 @@ rtx rtx_reader::read_rtx_code (const char *code_name) { RTX_CODE code; - struct mapping *iterator = NULL; const char *format_ptr; struct md_name name; rtx return_rtx; @@ -1509,20 +1599,9 @@ rtx_reader::read_rtx_code (const char *code_name) return return_rtx; } - /* If this code is an iterator, build the rtx using the iterator's - first value. */ -#ifdef GENERATOR_FILE - iterator = (struct mapping *) htab_find (codes.iterators, &code_name); - if (iterator != 0) - code = (enum rtx_code) iterator->values->number; - else - code = (enum rtx_code) codes.find_builtin (code_name); -#else - code = (enum rtx_code) codes.find_builtin (code_name); -#endif - /* If we end up with an insn expression then we free this space below. */ - return_rtx = rtx_alloc (code); + return_rtx = rtx_alloc_for_name (code_name); + code = GET_CODE (return_rtx); format_ptr = GET_RTX_FORMAT (code); memset (return_rtx, 0, RTX_CODE_SIZE (code)); PUT_CODE (return_rtx, code); @@ -1534,9 +1613,6 @@ rtx_reader::read_rtx_code (const char *code_name) m_reuse_rtx_by_id[reuse_id] = return_rtx; } - if (iterator) - record_iterator_use (iterator, return_rtx, 0); - /* Check for flags. */ read_flags (return_rtx); @@ -1765,8 +1841,8 @@ rtx_reader::read_rtx_operand (rtx return_rtx, int idx) break; } - /* The output template slot of a DEFINE_INSN, - DEFINE_INSN_AND_SPLIT, or DEFINE_PEEPHOLE automatically + /* The output template slot of a DEFINE_INSN, DEFINE_INSN_AND_SPLIT, + DEFINE_INSN_AND_REWRITE or DEFINE_PEEPHOLE automatically gets a star inserted as its first character, if it is written with a brace block instead of a string constant. */ star_if_braced = (format_ptr[idx] == 'T'); @@ -1783,7 +1859,8 @@ rtx_reader::read_rtx_operand (rtx return_rtx, int idx) if (*stringbuf == '\0' && idx == 0 && (GET_CODE (return_rtx) == DEFINE_INSN - || GET_CODE (return_rtx) == DEFINE_INSN_AND_SPLIT)) + || GET_CODE (return_rtx) == DEFINE_INSN_AND_SPLIT + || GET_CODE (return_rtx) == DEFINE_INSN_AND_REWRITE)) { struct obstack *string_obstack = get_string_obstack (); char line_name[20]; diff --git a/gcc/real.c b/gcc/real.c index 0164f097a..a2bd37a9e 100644 --- a/gcc/real.c +++ b/gcc/real.c @@ -4799,6 +4799,116 @@ decode_ieee_half (const struct real_format *fmt, REAL_VALUE_TYPE *r, } } +/* Encode arm_bfloat types. 
*/ +static void +encode_arm_bfloat_half (const struct real_format *fmt, long *buf, + const REAL_VALUE_TYPE *r) +{ + unsigned long image, sig, exp; + unsigned long sign = r->sign; + bool denormal = (r->sig[SIGSZ-1] & SIG_MSB) == 0; + + image = sign << 15; + sig = (r->sig[SIGSZ-1] >> (HOST_BITS_PER_LONG - 8)) & 0x7f; + + switch (r->cl) + { + case rvc_zero: + break; + + case rvc_inf: + if (fmt->has_inf) + image |= 255 << 7; + else + image |= 0x7fff; + break; + + case rvc_nan: + if (fmt->has_nans) + { + if (r->canonical) + sig = (fmt->canonical_nan_lsbs_set ? (1 << 6) - 1 : 0); + if (r->signalling == fmt->qnan_msb_set) + sig &= ~(1 << 6); + else + sig |= 1 << 6; + if (sig == 0) + sig = 1 << 5; + + image |= 255 << 7; + image |= sig; + } + else + image |= 0x7fff; + break; + + case rvc_normal: + if (denormal) + exp = 0; + else + exp = REAL_EXP (r) + 127 - 1; + image |= exp << 7; + image |= sig; + break; + + default: + gcc_unreachable (); + } + + buf[0] = image; +} + +/* Decode arm_bfloat types. */ +static void +decode_arm_bfloat_half (const struct real_format *fmt, REAL_VALUE_TYPE *r, + const long *buf) +{ + unsigned long image = buf[0] & 0xffff; + bool sign = (image >> 15) & 1; + int exp = (image >> 7) & 0xff; + + memset (r, 0, sizeof (*r)); + image <<= HOST_BITS_PER_LONG - 8; + image &= ~SIG_MSB; + + if (exp == 0) + { + if (image && fmt->has_denorm) + { + r->cl = rvc_normal; + r->sign = sign; + SET_REAL_EXP (r, -126); + r->sig[SIGSZ-1] = image << 1; + normalize (r); + } + else if (fmt->has_signed_zero) + r->sign = sign; + } + else if (exp == 255 && (fmt->has_nans || fmt->has_inf)) + { + if (image) + { + r->cl = rvc_nan; + r->sign = sign; + r->signalling = (((image >> (HOST_BITS_PER_LONG - 2)) & 1) + ^ fmt->qnan_msb_set); + r->sig[SIGSZ-1] = image; + } + else + { + r->cl = rvc_inf; + r->sign = sign; + } + } + else + { + r->cl = rvc_normal; + r->sign = sign; + SET_REAL_EXP (r, exp - 127 + 1); + r->sig[SIGSZ-1] = image | SIG_MSB; + } +} + /* Half-precision format, as specified in IEEE 754R. */ const struct real_format ieee_half_format = { @@ -4848,6 +4958,33 @@ const struct real_format arm_half_format = false, "arm_half" }; + +/* ARM Bfloat half-precision format. This format resembles a truncated + (16-bit) version of the 32-bit IEEE 754 single-precision floating-point + format. */ +const struct real_format arm_bfloat_half_format = + { + encode_arm_bfloat_half, + decode_arm_bfloat_half, + 2, + 8, + 8, + -125, + 128, + 15, + 15, + 0, + false, + true, + true, + true, + true, + true, + true, + false, + "arm_bfloat_half" + }; + /* A synthetic "format" for internal arithmetic. It's the size of the internal significand minus the two bits needed for proper rounding. diff --git a/gcc/real.h b/gcc/real.h index 95b9db83d..d1b79f804 100644 --- a/gcc/real.h +++ b/gcc/real.h @@ -361,6 +361,7 @@ extern const struct real_format decimal_double_format; extern const struct real_format decimal_quad_format; extern const struct real_format ieee_half_format; extern const struct real_format arm_half_format; +extern const struct real_format arm_bfloat_half_format; /* ====================================================================== */ diff --git a/gcc/recog.c b/gcc/recog.c index a9f584bc0..b12eba33a 100644 --- a/gcc/recog.c +++ b/gcc/recog.c @@ -3227,7 +3227,8 @@ peep2_find_free_register (int from, int to, const char *class_str, break; } /* And that we don't create an extra save/restore. */ - if (! call_used_regs[regno + j] && ! df_regs_ever_live_p (regno + j)) + if (! call_used_or_fixed_reg_p (regno + j) + && ! 
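A small standalone illustration (assuming the host uses IEEE single precision) of the layout that arm_bfloat_half_format above describes: bfloat16 keeps the sign bit, the 8 exponent bits and the top 7 fraction bits of a 32-bit IEEE float, so a simple truncating conversion is just the high 16 bits of the float's bit pattern:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Truncating float -> bfloat16 conversion (no rounding), for illustration.  */
static uint16_t
float_to_bfloat16 (float f)
{
  uint32_t bits;
  memcpy (&bits, &f, sizeof bits);  /* bit-copy the IEEE single */
  return (uint16_t) (bits >> 16);   /* keep sign + exponent + top 7 mantissa bits */
}

int
main (void)
{
  printf ("%#06x\n", (unsigned) float_to_bfloat16 (1.0f));   /* prints 0x3f80 */
  printf ("%#06x\n", (unsigned) float_to_bfloat16 (-2.0f));  /* prints 0xc000 */
  return 0;
}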
df_regs_ever_live_p (regno + j)) { success = 0; break; @@ -3724,8 +3725,7 @@ store_data_bypass_p_1 (rtx_insn *out_insn, rtx in_set) { rtx out_exp = XVECEXP (out_pat, 0, i); - if (GET_CODE (out_exp) == CLOBBER || GET_CODE (out_exp) == USE - || GET_CODE (out_exp) == CLOBBER_HIGH) + if (GET_CODE (out_exp) == CLOBBER || GET_CODE (out_exp) == USE) continue; gcc_assert (GET_CODE (out_exp) == SET); @@ -3756,8 +3756,7 @@ store_data_bypass_p (rtx_insn *out_insn, rtx_insn *in_insn) { rtx in_exp = XVECEXP (in_pat, 0, i); - if (GET_CODE (in_exp) == CLOBBER || GET_CODE (in_exp) == USE - || GET_CODE (in_exp) == CLOBBER_HIGH) + if (GET_CODE (in_exp) == CLOBBER || GET_CODE (in_exp) == USE) continue; gcc_assert (GET_CODE (in_exp) == SET); @@ -3809,7 +3808,7 @@ if_test_bypass_p (rtx_insn *out_insn, rtx_insn *in_insn) { rtx exp = XVECEXP (out_pat, 0, i); - if (GET_CODE (exp) == CLOBBER || GET_CODE (exp) == CLOBBER_HIGH) + if (GET_CODE (exp) == CLOBBER) continue; gcc_assert (GET_CODE (exp) == SET); diff --git a/gcc/recog.h b/gcc/recog.h index 75cbbdc10..71d88e3e3 100644 --- a/gcc/recog.h +++ b/gcc/recog.h @@ -142,7 +142,7 @@ extern void preprocess_constraints (rtx_insn *); extern rtx_insn *peep2_next_insn (int); extern int peep2_regno_dead_p (int, int); extern int peep2_reg_dead_p (int, rtx); -#ifdef CLEAR_HARD_REG_SET +#ifdef HARD_CONST extern rtx peep2_find_free_register (int, int, const char *, machine_mode, HARD_REG_SET *); #endif @@ -186,6 +186,23 @@ skip_alternative (const char *p) /* Nonzero means volatile operands are recognized. */ extern int volatile_ok; +/* RAII class for temporarily setting volatile_ok. */ + +class temporary_volatile_ok +{ +public: + temporary_volatile_ok (int value) : save_volatile_ok (volatile_ok) + { + volatile_ok = value; + } + + ~temporary_volatile_ok () { volatile_ok = save_volatile_ok; } + +private: + temporary_volatile_ok (const temporary_volatile_ok &); + int save_volatile_ok; +}; + /* Set by constrain_operands to the number of the alternative that matched. */ extern int which_alternative; diff --git a/gcc/reg-stack.c b/gcc/reg-stack.c index 033c978a1..b464f493f 100644 --- a/gcc/reg-stack.c +++ b/gcc/reg-stack.c @@ -368,7 +368,7 @@ straighten_stack (rtx_insn *insn, stack_ptr regstack) if (regstack->top <= 0) return; - COPY_HARD_REG_SET (temp_stack.reg_set, regstack->reg_set); + temp_stack.reg_set = regstack->reg_set; for (top = temp_stack.top = regstack->top; top >= 0; top--) temp_stack.reg[top] = FIRST_STACK_REG + temp_stack.top - top; @@ -568,7 +568,7 @@ check_asm_stack_operands (rtx_insn *insn) if (i != LAST_STACK_REG + 1) { - error_for_asm (insn, "output regs must be grouped at top of stack"); + error_for_asm (insn, "output registers must be grouped at top of stack"); malformed_asm = 1; } @@ -625,7 +625,8 @@ check_asm_stack_operands (rtx_insn *insn) if (i != LAST_STACK_REG + 1) { error_for_asm (insn, - "explicitly used regs must be grouped at top of stack"); + "explicitly used registers must be grouped " + "at top of stack"); malformed_asm = 1; } @@ -2640,7 +2641,7 @@ change_stack (rtx_insn *insn, stack_ptr old, stack_ptr new_stack, /* By now, the only difference should be the order of the stack, not their depth or liveliness. 
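A usage sketch for the temporary_volatile_ok RAII helper added to recog.h above. The enclosing function is hypothetical and extract_insn is only assumed here as a typical caller context; the point is that the saved volatile_ok value is restored automatically when the object goes out of scope, even on early returns:

/* Recognize INSN while temporarily accepting volatile memory operands.  */
static void
recog_with_volatile_mems (rtx_insn *insn)   /* hypothetical caller */
{
  temporary_volatile_ok v (1);   /* volatile_ok = 1 for this scope */
  extract_insn (insn);
}                                /* destructor restores the old volatile_ok */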
*/ - gcc_assert (hard_reg_set_equal_p (old->reg_set, new_stack->reg_set)); + gcc_assert (old->reg_set == new_stack->reg_set); gcc_assert (old->top == new_stack->top); /* If the stack is not empty (new_stack->top != -1), loop here emitting @@ -3154,8 +3155,7 @@ convert_regs_1 (basic_block block) asms, we zapped the instruction itself, but that didn't produce the same pattern of register kills as before. */ - gcc_assert (hard_reg_set_equal_p (regstack.reg_set, bi->out_reg_set) - || any_malformed_asm); + gcc_assert (regstack.reg_set == bi->out_reg_set || any_malformed_asm); bi->stack_out = regstack; bi->done = true; diff --git a/gcc/regcprop.c b/gcc/regcprop.c index 4842ce922..675111db8 100644 --- a/gcc/regcprop.c +++ b/gcc/regcprop.c @@ -35,6 +35,7 @@ #include "rtl-iter.h" #include "cfgrtl.h" #include "target.h" +#include "function-abi.h" /* The following code does forward propagation of hard register copies. The object is to eliminate as many dependencies as possible, so that @@ -237,11 +238,8 @@ static void kill_clobbered_value (rtx x, const_rtx set, void *data) { struct value_data *const vd = (struct value_data *) data; - gcc_assert (GET_CODE (set) != CLOBBER_HIGH || REG_P (x)); - if (GET_CODE (set) == CLOBBER - || (GET_CODE (set) == CLOBBER_HIGH - && reg_is_clobbered_by_clobber_high (x, XEXP (set, 0)))) + if (GET_CODE (set) == CLOBBER) kill_value (x, vd); } @@ -262,8 +260,7 @@ kill_set_value (rtx x, const_rtx set, void *data) if (rtx_equal_p (x, ksvd->ignore_set_reg)) return; - gcc_assert (GET_CODE (set) != CLOBBER_HIGH || REG_P (x)); - if (GET_CODE (set) != CLOBBER && GET_CODE (set) != CLOBBER_HIGH) + if (GET_CODE (set) != CLOBBER) { kill_value (x, ksvd->vd); if (REG_P (x)) @@ -728,19 +725,7 @@ cprop_find_used_regs (rtx *loc, void *data) static void kill_clobbered_values (rtx_insn *insn, struct value_data *vd) { - note_stores (PATTERN (insn), kill_clobbered_value, vd); - - if (CALL_P (insn)) - { - rtx exp; - - for (exp = CALL_INSN_FUNCTION_USAGE (insn); exp; exp = XEXP (exp, 1)) - { - rtx x = XEXP (exp, 0); - if (GET_CODE (x) == CLOBBER) - kill_value (SET_DEST (x), vd); - } - } + note_stores (insn, kill_clobbered_value, vd); } /* Perform the forward copy propagation on basic block BB. */ @@ -1047,7 +1032,6 @@ copyprop_hardreg_forward_1 (basic_block bb, struct value_data *vd) unsigned int set_nregs = 0; unsigned int regno; rtx exp; - HARD_REG_SET regs_invalidated_by_this_call; for (exp = CALL_INSN_FUNCTION_USAGE (insn); exp; exp = XEXP (exp, 1)) { @@ -1065,13 +1049,11 @@ copyprop_hardreg_forward_1 (basic_block bb, struct value_data *vd) } } - get_call_reg_set_usage (insn, - ®s_invalidated_by_this_call, - regs_invalidated_by_call); + function_abi callee_abi = insn_callee_abi (insn); for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if ((TEST_HARD_REG_BIT (regs_invalidated_by_this_call, regno) + if ((callee_abi.clobbers_full_reg_p (regno) || (targetm.hard_regno_call_part_clobbered - (insn, regno, vd->e[regno].mode))) + (callee_abi.id (), regno, vd->e[regno].mode))) && (regno < set_regno || regno >= set_regno + set_nregs)) kill_value_regno (regno, 1, vd); @@ -1109,7 +1091,7 @@ copyprop_hardreg_forward_1 (basic_block bb, struct value_data *vd) if (!noop_p) { /* Notice stores. */ - note_stores (PATTERN (insn), kill_set_value, &ksvd); + note_stores (insn, kill_set_value, &ksvd); /* Notice copies. */ if (copy_p) diff --git a/gcc/reginfo.c b/gcc/reginfo.c index 315c5ecab..4f07e968e 100644 --- a/gcc/reginfo.c +++ b/gcc/reginfo.c @@ -43,6 +43,7 @@ along with GCC; see the file COPYING3. 
If not see #include "reload.h" #include "output.h" #include "tree-pass.h" +#include "function-abi.h" /* Maximum register number used in this function, plus one. */ @@ -65,21 +66,20 @@ struct target_hard_regs *this_target_hard_regs = &default_target_hard_regs; struct target_regs *this_target_regs = &default_target_regs; #endif +#define call_used_regs \ + (this_target_hard_regs->x_call_used_regs) + /* Data for initializing fixed_regs. */ static const char initial_fixed_regs[] = FIXED_REGISTERS; /* Data for initializing call_used_regs. */ -static const char initial_call_used_regs[] = CALL_USED_REGISTERS; - #ifdef CALL_REALLY_USED_REGISTERS -/* Data for initializing call_really_used_regs. */ -static const char initial_call_really_used_regs[] = CALL_REALLY_USED_REGISTERS; +#ifdef CALL_USED_REGISTERS +#error CALL_USED_REGISTERS and CALL_REALLY_USED_REGISTERS are both defined #endif - -#ifdef CALL_REALLY_USED_REGISTERS -#define CALL_REALLY_USED_REGNO_P(X) call_really_used_regs[X] +static const char initial_call_used_regs[] = CALL_REALLY_USED_REGISTERS; #else -#define CALL_REALLY_USED_REGNO_P(X) call_used_regs[X] +static const char initial_call_used_regs[] = CALL_USED_REGISTERS; #endif /* Indexed by hard register number, contains 1 for registers @@ -91,17 +91,6 @@ char global_regs[FIRST_PSEUDO_REGISTER]; /* Declaration for the global register. */ tree global_regs_decl[FIRST_PSEUDO_REGISTER]; -/* Same information as REGS_INVALIDATED_BY_CALL but in regset form to be used - in dataflow more conveniently. */ -regset regs_invalidated_by_call_regset; - -/* Same information as FIXED_REG_SET but in regset form. */ -regset fixed_reg_set_regset; - -/* The bitmap_obstack is used to hold some static variables that - should not be reset after each function is compiled. */ -static bitmap_obstack persistent_obstack; - /* Used to initialize reg_alloc_order. */ #ifdef REG_ALLOC_ORDER static int initial_reg_alloc_order[FIRST_PSEUDO_REGISTER] = REG_ALLOC_ORDER; @@ -171,10 +160,6 @@ init_reg_sets (void) CALL_USED_REGISTERS had the right number of initializers. */ gcc_assert (sizeof fixed_regs == sizeof initial_fixed_regs); gcc_assert (sizeof call_used_regs == sizeof initial_call_used_regs); -#ifdef CALL_REALLY_USED_REGISTERS - gcc_assert (sizeof call_really_used_regs - == sizeof initial_call_really_used_regs); -#endif #ifdef REG_ALLOC_ORDER gcc_assert (sizeof reg_alloc_order == sizeof initial_reg_alloc_order); #endif @@ -182,10 +167,6 @@ init_reg_sets (void) memcpy (fixed_regs, initial_fixed_regs, sizeof fixed_regs); memcpy (call_used_regs, initial_call_used_regs, sizeof call_used_regs); -#ifdef CALL_REALLY_USED_REGISTERS - memcpy (call_really_used_regs, initial_call_really_used_regs, - sizeof call_really_used_regs); -#endif #ifdef REG_ALLOC_ORDER memcpy (reg_alloc_order, initial_reg_alloc_order, sizeof reg_alloc_order); #endif @@ -200,9 +181,6 @@ init_reg_sets (void) subsequent back-end reinitialization. */ static char saved_fixed_regs[FIRST_PSEUDO_REGISTER]; static char saved_call_used_regs[FIRST_PSEUDO_REGISTER]; -#ifdef CALL_REALLY_USED_REGISTERS -static char saved_call_really_used_regs[FIRST_PSEUDO_REGISTER]; -#endif static const char *saved_reg_names[FIRST_PSEUDO_REGISTER]; static HARD_REG_SET saved_accessible_reg_set; static HARD_REG_SET saved_operand_reg_set; @@ -218,19 +196,11 @@ save_register_info (void) memcpy (saved_fixed_regs, fixed_regs, sizeof fixed_regs); memcpy (saved_call_used_regs, call_used_regs, sizeof call_used_regs); - /* Likewise for call_really_used_regs. 
*/ -#ifdef CALL_REALLY_USED_REGISTERS - gcc_assert (sizeof call_really_used_regs - == sizeof saved_call_really_used_regs); - memcpy (saved_call_really_used_regs, call_really_used_regs, - sizeof call_really_used_regs); -#endif - /* And similarly for reg_names. */ gcc_assert (sizeof reg_names == sizeof saved_reg_names); memcpy (saved_reg_names, reg_names, sizeof reg_names); - COPY_HARD_REG_SET (saved_accessible_reg_set, accessible_reg_set); - COPY_HARD_REG_SET (saved_operand_reg_set, operand_reg_set); + saved_accessible_reg_set = accessible_reg_set; + saved_operand_reg_set = operand_reg_set; } /* Restore the register information. */ @@ -240,14 +210,9 @@ restore_register_info (void) memcpy (fixed_regs, saved_fixed_regs, sizeof fixed_regs); memcpy (call_used_regs, saved_call_used_regs, sizeof call_used_regs); -#ifdef CALL_REALLY_USED_REGISTERS - memcpy (call_really_used_regs, saved_call_really_used_regs, - sizeof call_really_used_regs); -#endif - memcpy (reg_names, saved_reg_names, sizeof reg_names); - COPY_HARD_REG_SET (accessible_reg_set, saved_accessible_reg_set); - COPY_HARD_REG_SET (operand_reg_set, saved_operand_reg_set); + accessible_reg_set = saved_accessible_reg_set; + operand_reg_set = saved_operand_reg_set; } /* After switches have been processed, which perhaps alter @@ -297,8 +262,7 @@ init_reg_sets_1 (void) HARD_REG_SET c; int k; - COPY_HARD_REG_SET (c, reg_class_contents[i]); - IOR_HARD_REG_SET (c, reg_class_contents[j]); + c = reg_class_contents[i] | reg_class_contents[j]; for (k = 0; k < N_REG_CLASSES; k++) if (hard_reg_set_subset_p (reg_class_contents[k], c) && !hard_reg_set_subset_p (reg_class_contents[k], @@ -320,8 +284,7 @@ init_reg_sets_1 (void) HARD_REG_SET c; int k; - COPY_HARD_REG_SET (c, reg_class_contents[i]); - IOR_HARD_REG_SET (c, reg_class_contents[j]); + c = reg_class_contents[i] | reg_class_contents[j]; for (k = 0; k < N_REG_CLASSES; k++) if (hard_reg_set_subset_p (c, reg_class_contents[k])) break; @@ -362,22 +325,9 @@ init_reg_sets_1 (void) /* Initialize "constant" tables. */ CLEAR_HARD_REG_SET (fixed_reg_set); - CLEAR_HARD_REG_SET (call_used_reg_set); - CLEAR_HARD_REG_SET (call_fixed_reg_set); CLEAR_HARD_REG_SET (regs_invalidated_by_call); - if (!regs_invalidated_by_call_regset) - { - bitmap_obstack_initialize (&persistent_obstack); - regs_invalidated_by_call_regset = ALLOC_REG_SET (&persistent_obstack); - } - else - CLEAR_REG_SET (regs_invalidated_by_call_regset); - if (!fixed_reg_set_regset) - fixed_reg_set_regset = ALLOC_REG_SET (&persistent_obstack); - else - CLEAR_REG_SET (fixed_reg_set_regset); - AND_HARD_REG_SET (operand_reg_set, accessible_reg_set); + operand_reg_set &= accessible_reg_set; for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) { /* As a special exception, registers whose class is NO_REGS are @@ -393,26 +343,10 @@ init_reg_sets_1 (void) /* If a register is too limited to be treated as a register operand, then it should never be allocated to a pseudo. */ if (!TEST_HARD_REG_BIT (operand_reg_set, i)) - { - fixed_regs[i] = 1; - call_used_regs[i] = 1; - } - - /* call_used_regs must include fixed_regs. */ - gcc_assert (!fixed_regs[i] || call_used_regs[i]); -#ifdef CALL_REALLY_USED_REGISTERS - /* call_used_regs must include call_really_used_regs. 
*/ - gcc_assert (!call_really_used_regs[i] || call_used_regs[i]); -#endif + fixed_regs[i] = 1; if (fixed_regs[i]) - { - SET_HARD_REG_BIT (fixed_reg_set, i); - SET_REGNO_REG_SET (fixed_reg_set_regset, i); - } - - if (call_used_regs[i]) - SET_HARD_REG_BIT (call_used_reg_set, i); + SET_HARD_REG_BIT (fixed_reg_set, i); /* There are a couple of fixed registers that we know are safe to exclude from being clobbered by calls: @@ -427,10 +361,7 @@ init_reg_sets_1 (void) if (i == STACK_POINTER_REGNUM) ; else if (global_regs[i]) - { - SET_HARD_REG_BIT (regs_invalidated_by_call, i); - SET_REGNO_REG_SET (regs_invalidated_by_call_regset, i); - } + SET_HARD_REG_BIT (regs_invalidated_by_call, i); else if (i == FRAME_POINTER_REGNUM) ; else if (!HARD_FRAME_POINTER_IS_FRAME_POINTER @@ -442,15 +373,12 @@ init_reg_sets_1 (void) else if (!PIC_OFFSET_TABLE_REG_CALL_CLOBBERED && i == (unsigned) PIC_OFFSET_TABLE_REGNUM && fixed_regs[i]) ; - else if (CALL_REALLY_USED_REGNO_P (i)) - { - SET_HARD_REG_BIT (regs_invalidated_by_call, i); - SET_REGNO_REG_SET (regs_invalidated_by_call_regset, i); - } + else if (call_used_regs[i]) + SET_HARD_REG_BIT (regs_invalidated_by_call, i); } - COPY_HARD_REG_SET (call_fixed_reg_set, fixed_reg_set); - COPY_HARD_REG_SET (fixed_nonglobal_reg_set, fixed_reg_set); + SET_HARD_REG_SET (savable_regs); + fixed_nonglobal_reg_set = fixed_reg_set; /* Preserve global registers if called more than once. */ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) @@ -459,8 +387,6 @@ init_reg_sets_1 (void) { fixed_regs[i] = call_used_regs[i] = 1; SET_HARD_REG_BIT (fixed_reg_set, i); - SET_HARD_REG_BIT (call_used_reg_set, i); - SET_HARD_REG_BIT (call_fixed_reg_set, i); } } @@ -493,6 +419,8 @@ init_reg_sets_1 (void) } } } + + default_function_abi.initialize (0, regs_invalidated_by_call); } /* Compute the table of register modes. 
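The reginfo.c hunks above follow the same mechanical pattern seen throughout this backport: HARD_REG_SET values are now combined with C++ operators instead of the COPY_/IOR_/AND_COMPL_HARD_REG_SET macros. A side-by-side sketch of the idiom (names taken from the hunks above; not compilable on its own):

  /* Old macro style: */
  COPY_HARD_REG_SET (c, reg_class_contents[i]);
  IOR_HARD_REG_SET (c, reg_class_contents[j]);
  AND_COMPL_HARD_REG_SET (c, fixed_reg_set);

  /* Equivalent operator style used by the new code: */
  c = (reg_class_contents[i] | reg_class_contents[j]) & ~fixed_reg_set;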
@@ -639,7 +567,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, if (hard_regno_nregs (regno, mode) == nregs && targetm.hard_regno_mode_ok (regno, mode) && (!call_saved - || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) + || !targetm.hard_regno_call_part_clobbered (0, regno, mode)) && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (found_mode))) found_mode = mode; @@ -647,7 +575,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, if (hard_regno_nregs (regno, mode) == nregs && targetm.hard_regno_mode_ok (regno, mode) && (!call_saved - || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) + || !targetm.hard_regno_call_part_clobbered (0, regno, mode)) && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (found_mode))) found_mode = mode; @@ -655,7 +583,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, if (hard_regno_nregs (regno, mode) == nregs && targetm.hard_regno_mode_ok (regno, mode) && (!call_saved - || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) + || !targetm.hard_regno_call_part_clobbered (0, regno, mode)) && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (found_mode))) found_mode = mode; @@ -663,7 +591,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, if (hard_regno_nregs (regno, mode) == nregs && targetm.hard_regno_mode_ok (regno, mode) && (!call_saved - || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) + || !targetm.hard_regno_call_part_clobbered (0, regno, mode)) && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (found_mode))) found_mode = mode; @@ -677,7 +605,7 @@ choose_hard_reg_mode (unsigned int regno ATTRIBUTE_UNUSED, if (hard_regno_nregs (regno, mode) == nregs && targetm.hard_regno_mode_ok (regno, mode) && (!call_saved - || !targetm.hard_regno_call_part_clobbered (NULL, regno, mode))) + || !targetm.hard_regno_call_part_clobbered (0, regno, mode))) return mode; } @@ -749,10 +677,11 @@ fix_register (const char *name, int fixed, int call_used) else { fixed_regs[i] = fixed; - call_used_regs[i] = call_used; #ifdef CALL_REALLY_USED_REGISTERS if (fixed == 0) - call_really_used_regs[i] = call_used; + call_used_regs[i] = call_used; +#else + call_used_regs[i] = call_used; #endif } } @@ -803,7 +732,8 @@ globalize_reg (tree decl, int i) if (i != STACK_POINTER_REGNUM) { SET_HARD_REG_BIT (regs_invalidated_by_call, i); - SET_REGNO_REG_SET (regs_invalidated_by_call_regset, i); + for (unsigned int i = 0; i < NUM_ABI_IDS; ++i) + function_abis[i].add_full_reg_clobber (i); } /* If already fixed, nothing else to do. */ @@ -811,13 +741,8 @@ globalize_reg (tree decl, int i) return; fixed_regs[i] = call_used_regs[i] = 1; -#ifdef CALL_REALLY_USED_REGISTERS - call_really_used_regs[i] = 1; -#endif SET_HARD_REG_BIT (fixed_reg_set, i); - SET_HARD_REG_BIT (call_used_reg_set, i); - SET_HARD_REG_BIT (call_fixed_reg_set, i); reinit_regs (); } @@ -1101,10 +1026,6 @@ reg_scan_mark_refs (rtx x, rtx_insn *insn) reg_scan_mark_refs (XEXP (XEXP (x, 0), 0), insn); break; - case CLOBBER_HIGH: - gcc_assert (!(MEM_P (XEXP (x, 0)))); - break; - case SET: /* Count a set of the destination if it is a register. 
*/ for (dest = SET_DEST (x); @@ -1316,14 +1237,12 @@ record_subregs_of_mode (rtx subreg, bool partial_def) } if (valid_mode_changes[regno]) - AND_HARD_REG_SET (*valid_mode_changes[regno], - simplifiable_subregs (shape)); + *valid_mode_changes[regno] &= simplifiable_subregs (shape); else { valid_mode_changes[regno] = XOBNEW (&valid_mode_changes_obstack, HARD_REG_SET); - COPY_HARD_REG_SET (*valid_mode_changes[regno], - simplifiable_subregs (shape)); + *valid_mode_changes[regno] = simplifiable_subregs (shape); } } diff --git a/gcc/regrename.c b/gcc/regrename.c index 5259d565e..6f7fe0a6d 100644 --- a/gcc/regrename.c +++ b/gcc/regrename.c @@ -33,6 +33,7 @@ #include "addresses.h" #include "cfganal.h" #include "tree-pass.h" +#include "function-abi.h" #include "regrename.h" /* This file implements the RTL register renaming pass of the compiler. It is @@ -253,7 +254,7 @@ create_new_chain (unsigned this_regno, unsigned this_nregs, rtx *loc, CLEAR_HARD_REG_BIT (live_hard_regs, head->regno + nregs); } - COPY_HARD_REG_SET (head->hard_conflicts, live_hard_regs); + head->hard_conflicts = live_hard_regs; bitmap_set_bit (&open_chains_set, head->id); open_chains = head; @@ -292,7 +293,7 @@ merge_overlapping_regs (HARD_REG_SET *pset, struct du_head *head) { bitmap_iterator bi; unsigned i; - IOR_HARD_REG_SET (*pset, head->hard_conflicts); + *pset |= head->hard_conflicts; EXECUTE_IF_SET_IN_BITMAP (&head->conflicts, 0, i, bi) { du_head_p other = regrename_chain_from_id (i); @@ -303,6 +304,18 @@ merge_overlapping_regs (HARD_REG_SET *pset, struct du_head *head) } } +/* Return true if (reg:MODE REGNO) would be clobbered by a call covered + by THIS_HEAD. */ + +static bool +call_clobbered_in_chain_p (du_head *this_head, machine_mode mode, + unsigned int regno) +{ + return call_clobbered_in_region_p (this_head->call_abis, + this_head->call_clobber_mask, + mode, regno); +} + /* Check if NEW_REG can be the candidate register to rename for REG in THIS_HEAD chain. THIS_UNAVAILABLE is a set of unavailable hard registers. */ @@ -322,7 +335,7 @@ check_new_reg_p (int reg ATTRIBUTE_UNUSED, int new_reg, || global_regs[new_reg + i] /* Can't use regs which aren't saved by the prologue. */ || (! df_regs_ever_live_p (new_reg + i) - && ! call_used_regs[new_reg + i]) + && ! crtl->abi->clobbers_full_reg_p (new_reg + i)) #ifdef LEAF_REGISTERS /* We can't use a non-leaf register if we're in a leaf function. */ @@ -337,11 +350,8 @@ check_new_reg_p (int reg ATTRIBUTE_UNUSED, int new_reg, for (tmp = this_head->first; tmp; tmp = tmp->next_use) if ((!targetm.hard_regno_mode_ok (new_reg, GET_MODE (*tmp->loc)) && ! DEBUG_INSN_P (tmp->insn)) - || (this_head->need_caller_save_reg - && ! (targetm.hard_regno_call_part_clobbered - (NULL, reg, GET_MODE (*tmp->loc))) - && (targetm.hard_regno_call_part_clobbered - (NULL, new_reg, GET_MODE (*tmp->loc))))) + || call_clobbered_in_chain_p (this_head, GET_MODE (*tmp->loc), + new_reg)) return false; return true; @@ -363,12 +373,6 @@ find_rename_reg (du_head_p this_head, enum reg_class super_class, int pass; int best_new_reg = old_reg; - /* Further narrow the set of registers we can use for renaming. - If the chain needs a call-saved register, mark the call-used - registers as unavailable. */ - if (this_head->need_caller_save_reg) - IOR_HARD_REG_SET (*unavailable, call_used_reg_set); - /* Mark registers that overlap this chain's lifetime as unavailable. 
*/ merge_overlapping_regs (unavailable, this_head); @@ -441,8 +445,7 @@ regrename_find_superclass (du_head_p head, int *pn_uses, if (DEBUG_INSN_P (tmp->insn)) continue; n_uses++; - IOR_COMPL_HARD_REG_SET (*punavailable, - reg_class_contents[tmp->cl]); + *punavailable |= ~reg_class_contents[tmp->cl]; super_class = reg_class_superunion[(int) super_class][(int) tmp->cl]; } @@ -486,7 +489,7 @@ rename_chains (void) && reg == FRAME_POINTER_REGNUM)) continue; - COPY_HARD_REG_SET (this_unavailable, unavailable); + this_unavailable = unavailable; reg_class super_class = regrename_find_superclass (this_head, &n_uses, &this_unavailable); @@ -500,7 +503,7 @@ rename_chains (void) { fprintf (dump_file, "Register %s in insn %d", reg_names[reg], INSN_UID (this_head->first->insn)); - if (this_head->need_caller_save_reg) + if (this_head->call_abis) fprintf (dump_file, " crosses a call"); } @@ -677,10 +680,11 @@ merge_chains (du_head_p c1, du_head_p c2) c2->first = c2->last = NULL; c2->id = c1->id; - IOR_HARD_REG_SET (c1->hard_conflicts, c2->hard_conflicts); + c1->hard_conflicts |= c2->hard_conflicts; bitmap_ior_into (&c1->conflicts, &c2->conflicts); - c1->need_caller_save_reg |= c2->need_caller_save_reg; + c1->call_clobber_mask |= c2->call_clobber_mask; + c1->call_abis |= c2->call_abis; c1->cannot_rename |= c2->cannot_rename; } @@ -1740,7 +1744,7 @@ build_def_use (basic_block bb) outside an operand, as live. */ hide_operands (n_ops, old_operands, old_dups, untracked_operands, false); - note_stores (PATTERN (insn), note_sets_clobbers, &clobber_code); + note_stores (insn, note_sets_clobbers, &clobber_code); restore_operands (insn, n_ops, old_operands, old_dups); /* Step 1b: Begin new chains for earlyclobbered writes inside @@ -1834,9 +1838,15 @@ build_def_use (basic_block bb) requires a caller-saved reg. */ if (CALL_P (insn)) { + function_abi callee_abi = insn_callee_abi (insn); struct du_head *p; for (p = open_chains; p; p = p->next_chain) - p->need_caller_save_reg = 1; + { + p->call_abis |= (1 << callee_abi.id ()); + p->call_clobber_mask + |= callee_abi.full_and_partial_reg_clobbers (); + p->hard_conflicts |= callee_abi.full_reg_clobbers (); + } } /* Step 5: Close open chains that overlap writes. Similar to @@ -1856,7 +1866,7 @@ build_def_use (basic_block bb) outside an operand, as live. */ hide_operands (n_ops, old_operands, old_dups, untracked_operands, false); - note_stores (PATTERN (insn), note_sets_clobbers, &set_code); + note_stores (insn, note_sets_clobbers, &set_code); restore_operands (insn, n_ops, old_operands, old_dups); /* Step 6b: Begin new chains for writes inside operands. */ diff --git a/gcc/regrename.h b/gcc/regrename.h index 37f5e398d..1bbf78fda 100644 --- a/gcc/regrename.h +++ b/gcc/regrename.h @@ -40,9 +40,12 @@ struct du_head bitmap_head conflicts; /* Conflicts with untracked hard registers. */ HARD_REG_SET hard_conflicts; + /* Which registers are fully or partially clobbered by the calls that + the chain crosses. */ + HARD_REG_SET call_clobber_mask; - /* Nonzero if the chain crosses a call. */ - unsigned int need_caller_save_reg:1; + /* A bitmask of ABIs used by the calls that the chain crosses. */ + unsigned int call_abis : NUM_ABI_IDS; /* Nonzero if the register is used in a way that prevents renaming, such as the SET_DEST of a CALL_INSN or an asm operand that used to be a hard register. 
*/ diff --git a/gcc/regs.h b/gcc/regs.h index 48b2e7081..821979ec6 100644 --- a/gcc/regs.h +++ b/gcc/regs.h @@ -298,7 +298,7 @@ remove_from_hard_reg_set (HARD_REG_SET *regs, machine_mode mode, /* Return true if REGS contains the whole of (reg:MODE REGNO). */ static inline bool -in_hard_reg_set_p (const HARD_REG_SET regs, machine_mode mode, +in_hard_reg_set_p (const_hard_reg_set regs, machine_mode mode, unsigned int regno) { unsigned int end_regno; @@ -323,7 +323,7 @@ in_hard_reg_set_p (const HARD_REG_SET regs, machine_mode mode, /* Return true if (reg:MODE REGNO) includes an element of REGS. */ static inline bool -overlaps_hard_reg_set_p (const HARD_REG_SET regs, machine_mode mode, +overlaps_hard_reg_set_p (const_hard_reg_set regs, machine_mode mode, unsigned int regno) { unsigned int end_regno; @@ -363,7 +363,7 @@ remove_range_from_hard_reg_set (HARD_REG_SET *regs, unsigned int regno, /* Like overlaps_hard_reg_set_p, but use a REGNO/NREGS range instead of REGNO and MODE. */ static inline bool -range_overlaps_hard_reg_set_p (const HARD_REG_SET set, unsigned regno, +range_overlaps_hard_reg_set_p (const_hard_reg_set set, unsigned regno, int nregs) { while (nregs-- > 0) @@ -375,7 +375,7 @@ range_overlaps_hard_reg_set_p (const HARD_REG_SET set, unsigned regno, /* Like in_hard_reg_set_p, but use a REGNO/NREGS range instead of REGNO and MODE. */ static inline bool -range_in_hard_reg_set_p (const HARD_REG_SET set, unsigned regno, int nregs) +range_in_hard_reg_set_p (const_hard_reg_set set, unsigned regno, int nregs) { while (nregs-- > 0) if (!TEST_HARD_REG_BIT (set, regno + nregs)) @@ -383,8 +383,4 @@ range_in_hard_reg_set_p (const HARD_REG_SET set, unsigned regno, int nregs) return true; } -/* Get registers used by given function call instruction. */ -extern bool get_call_reg_set_usage (rtx_insn *insn, HARD_REG_SET *reg_set, - HARD_REG_SET default_set); - #endif /* GCC_REGS_H */ diff --git a/gcc/regset.h b/gcc/regset.h index 34a9eb457..72ff45891 100644 --- a/gcc/regset.h +++ b/gcc/regset.h @@ -64,6 +64,10 @@ typedef bitmap regset; /* Inclusive or a register set with a second register set. */ #define IOR_REG_SET(TO, FROM) bitmap_ior_into (TO, FROM) +/* Same, but with FROM being a HARD_REG_SET. */ +#define IOR_REG_SET_HRS(TO, FROM) \ + bitmap_ior_into (TO, bitmap_view (FROM)) + /* Exclusive or a register set with a second register set. */ #define XOR_REG_SET(TO, FROM) bitmap_xor_into (TO, FROM) @@ -107,14 +111,6 @@ typedef bitmap_iterator reg_set_iterator; #define EXECUTE_IF_AND_IN_REG_SET(REGSET1, REGSET2, MIN, REGNUM, RSI) \ EXECUTE_IF_AND_IN_BITMAP (REGSET1, REGSET2, MIN, REGNUM, RSI) \ -/* Same information as REGS_INVALIDATED_BY_CALL but in regset form to be used - in dataflow more conveniently. */ - -extern regset regs_invalidated_by_call_regset; - -/* Same information as FIXED_REG_SET but in regset form. */ -extern regset fixed_reg_set_regset; - /* An obstack for regsets. 
*/ extern bitmap_obstack reg_obstack; diff --git a/gcc/reload.c b/gcc/reload.c index 72cc38a0e..b7601307f 100644 --- a/gcc/reload.c +++ b/gcc/reload.c @@ -6911,15 +6911,15 @@ find_equiv_reg (rtx goal, rtx_insn *insn, enum reg_class rclass, int other, if (regno >= 0 && regno < FIRST_PSEUDO_REGISTER) for (i = 0; i < nregs; ++i) - if (call_used_regs[regno + i] - || targetm.hard_regno_call_part_clobbered (NULL, regno + i, + if (call_used_or_fixed_reg_p (regno + i) + || targetm.hard_regno_call_part_clobbered (0, regno + i, mode)) return 0; if (valueno >= 0 && valueno < FIRST_PSEUDO_REGISTER) for (i = 0; i < valuenregs; ++i) - if (call_used_regs[valueno + i] - || targetm.hard_regno_call_part_clobbered (NULL, valueno + i, + if (call_used_or_fixed_reg_p (valueno + i) + || targetm.hard_regno_call_part_clobbered (0, valueno + i, mode)) return 0; } diff --git a/gcc/reload.h b/gcc/reload.h index 813075b6f..fef6aa9da 100644 --- a/gcc/reload.h +++ b/gcc/reload.h @@ -274,7 +274,7 @@ extern int reload_first_uid; extern int num_not_at_initial_offset; -#if defined SET_HARD_REG_BIT && defined CLEAR_REG_SET +#if defined HARD_CONST && defined CLEAR_REG_SET /* This structure describes instructions which are relevant for reload. Apart from all regular insns, this also includes CODE_LABELs, since they must be examined for register elimination. */ @@ -325,7 +325,7 @@ extern struct insn_chain *reload_insn_chain; extern struct insn_chain *new_insn_chain (void); #endif -#if defined SET_HARD_REG_BIT +#if defined HARD_CONST extern void compute_use_by_pseudos (HARD_REG_SET *, bitmap); #endif diff --git a/gcc/reload1.c b/gcc/reload1.c index bb112d817..d36ebec60 100644 --- a/gcc/reload1.c +++ b/gcc/reload1.c @@ -795,7 +795,9 @@ reload (rtx_insn *first, int global) if (crtl->saves_all_registers) for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (! call_used_regs[i] && ! fixed_regs[i] && ! LOCAL_REGNO (i)) + if (! call_used_or_fixed_reg_p (i) + && ! fixed_regs[i] + && ! LOCAL_REGNO (i)) df_set_regs_ever_live (i, true); /* Find all the pseudo registers that didn't get hard regs @@ -843,7 +845,7 @@ reload (rtx_insn *first, int global) cannot be done. */ for (insn = first; insn && num_eliminable; insn = NEXT_INSN (insn)) if (INSN_P (insn)) - note_stores (PATTERN (insn), mark_not_eliminable, NULL); + note_pattern_stores (PATTERN (insn), mark_not_eliminable, NULL); maybe_fix_stack_asms (); @@ -1339,8 +1341,6 @@ maybe_fix_stack_asms (void) rtx t = XVECEXP (pat, 0, i); if (GET_CODE (t) == CLOBBER && STACK_REG_P (XEXP (t, 0))) SET_HARD_REG_BIT (clobbered, REGNO (XEXP (t, 0))); - /* CLOBBER_HIGH is only supported for LRA. */ - gcc_assert (GET_CODE (t) != CLOBBER_HIGH); } /* Get the operand values and constraints out of the insn. */ @@ -1364,7 +1364,7 @@ maybe_fix_stack_asms (void) { /* End of one alternative - mark the regs in the current class, and reset the class. */ - IOR_HARD_REG_SET (allowed, reg_class_contents[cls]); + allowed |= reg_class_contents[cls]; cls = NO_REGS; p++; if (c == '#') @@ -1399,7 +1399,7 @@ maybe_fix_stack_asms (void) /* Those of the registers which are clobbered, but allowed by the constraints, must be usable as reload registers. So clear them out of the life information. 
*/ - AND_HARD_REG_SET (allowed, clobbered); + allowed &= clobbered; for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) if (TEST_HARD_REG_BIT (allowed, i)) { @@ -1732,7 +1732,7 @@ order_regs_for_reload (struct insn_chain *chain) HARD_REG_SET used_by_pseudos2; reg_set_iterator rsi; - COPY_HARD_REG_SET (bad_spill_regs, fixed_reg_set); + bad_spill_regs = fixed_reg_set; memset (spill_cost, 0, sizeof spill_cost); memset (spill_add_cost, 0, sizeof spill_add_cost); @@ -1745,8 +1745,8 @@ order_regs_for_reload (struct insn_chain *chain) REG_SET_TO_HARD_REG_SET (used_by_pseudos, &chain->live_throughout); REG_SET_TO_HARD_REG_SET (used_by_pseudos2, &chain->dead_or_set); - IOR_HARD_REG_SET (bad_spill_regs, used_by_pseudos); - IOR_HARD_REG_SET (bad_spill_regs, used_by_pseudos2); + bad_spill_regs |= used_by_pseudos; + bad_spill_regs |= used_by_pseudos2; /* Now find out which pseudos are allocated to it, and update hard_reg_n_uses. */ @@ -1823,9 +1823,9 @@ find_reg (struct insn_chain *chain, int order) static int regno_pseudo_regs[FIRST_PSEUDO_REGISTER]; static int best_regno_pseudo_regs[FIRST_PSEUDO_REGISTER]; - COPY_HARD_REG_SET (not_usable, bad_spill_regs); - IOR_HARD_REG_SET (not_usable, bad_spill_regs_global); - IOR_COMPL_HARD_REG_SET (not_usable, reg_class_contents[rl->rclass]); + not_usable = (bad_spill_regs + | bad_spill_regs_global + | ~reg_class_contents[rl->rclass]); CLEAR_HARD_REG_SET (used_by_other_reload); for (k = 0; k < order; k++) @@ -1906,8 +1906,8 @@ find_reg (struct insn_chain *chain, int order) && (inv_reg_alloc_order[regno] < inv_reg_alloc_order[best_reg]) #else - && call_used_regs[regno] - && ! call_used_regs[best_reg] + && call_used_or_fixed_reg_p (regno) + && ! call_used_or_fixed_reg_p (best_reg) #endif )) { @@ -2007,8 +2007,8 @@ find_reload_regs (struct insn_chain *chain) } } - COPY_HARD_REG_SET (chain->used_spill_regs, used_spill_regs_local); - IOR_HARD_REG_SET (used_spill_regs, used_spill_regs_local); + chain->used_spill_regs = used_spill_regs_local; + used_spill_regs |= used_spill_regs_local; memcpy (chain->rld, rld, n_reloads * sizeof (struct reload)); } @@ -2881,7 +2881,6 @@ eliminate_regs_1 (rtx x, machine_mode mem_mode, rtx insn, return x; case CLOBBER: - case CLOBBER_HIGH: case ASM_OPERANDS: gcc_assert (insn && DEBUG_INSN_P (insn)); break; @@ -3092,10 +3091,6 @@ elimination_effects (rtx x, machine_mode mem_mode) elimination_effects (XEXP (x, 0), mem_mode); return; - case CLOBBER_HIGH: - /* CLOBBER_HIGH is only supported for LRA. */ - return; - case SET: /* Check for setting a register that we know about. */ if (REG_P (SET_DEST (x))) @@ -3817,9 +3812,6 @@ mark_not_eliminable (rtx dest, const_rtx x, void *data ATTRIBUTE_UNUSED) if (dest == hard_frame_pointer_rtx) return; - /* CLOBBER_HIGH is only supported for LRA. 
*/ - gcc_assert (GET_CODE (x) != CLOBBER_HIGH); - for (i = 0; i < NUM_ELIMINABLE_REGS; i++) if (reg_eliminate[i].can_eliminate && dest == reg_eliminate[i].to_rtx && (GET_CODE (x) != SET @@ -4020,7 +4012,7 @@ update_eliminables_and_spill (void) HARD_REG_SET to_spill; CLEAR_HARD_REG_SET (to_spill); update_eliminables (&to_spill); - AND_COMPL_HARD_REG_SET (used_spill_regs, to_spill); + used_spill_regs &= ~to_spill; for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) if (TEST_HARD_REG_BIT (to_spill, i)) @@ -4346,14 +4338,12 @@ finish_spills (int global) EXECUTE_IF_SET_IN_REG_SET (&chain->live_throughout, FIRST_PSEUDO_REGISTER, i, rsi) { - IOR_HARD_REG_SET (pseudo_forbidden_regs[i], - chain->used_spill_regs); + pseudo_forbidden_regs[i] |= chain->used_spill_regs; } EXECUTE_IF_SET_IN_REG_SET (&chain->dead_or_set, FIRST_PSEUDO_REGISTER, i, rsi) { - IOR_HARD_REG_SET (pseudo_forbidden_regs[i], - chain->used_spill_regs); + pseudo_forbidden_regs[i] |= chain->used_spill_regs; } } @@ -4397,7 +4387,7 @@ finish_spills (int global) { REG_SET_TO_HARD_REG_SET (used_by_pseudos, &chain->live_throughout); REG_SET_TO_HARD_REG_SET (used_by_pseudos2, &chain->dead_or_set); - IOR_HARD_REG_SET (used_by_pseudos, used_by_pseudos2); + used_by_pseudos |= used_by_pseudos2; compute_use_by_pseudos (&used_by_pseudos, &chain->live_throughout); compute_use_by_pseudos (&used_by_pseudos, &chain->dead_or_set); @@ -4405,8 +4395,7 @@ finish_spills (int global) may be not included in the value calculated here because of possible removing caller-saves insns (see function delete_caller_save_insns. */ - COMPL_HARD_REG_SET (chain->used_spill_regs, used_by_pseudos); - AND_HARD_REG_SET (chain->used_spill_regs, used_spill_regs); + chain->used_spill_regs = ~used_by_pseudos & used_spill_regs; } } @@ -4455,7 +4444,6 @@ scan_paradoxical_subregs (rtx x) case PC: case USE: case CLOBBER: - case CLOBBER_HIGH: return; case SUBREG: @@ -4589,7 +4577,7 @@ reload_as_needed (int live_known) { regset_head regs_to_forget; INIT_REG_SET (®s_to_forget); - note_stores (PATTERN (insn), forget_old_reloads_1, ®s_to_forget); + note_stores (insn, forget_old_reloads_1, ®s_to_forget); /* If this is a USE and CLOBBER of a MEM, ensure that any references to eliminable registers have been removed. */ @@ -4716,7 +4704,7 @@ reload_as_needed (int live_known) between INSN and NEXT and use them to forget old reloads. */ for (rtx_insn *x = NEXT_INSN (insn); x != old_next; x = NEXT_INSN (x)) if (NONJUMP_INSN_P (x) && GET_CODE (PATTERN (x)) == CLOBBER) - note_stores (PATTERN (x), forget_old_reloads_1, NULL); + note_stores (x, forget_old_reloads_1, NULL); #if AUTO_INC_DEC /* Likewise for regs altered by auto-increment in this insn. @@ -4882,8 +4870,8 @@ reload_as_needed (int live_known) be partially clobbered by the call. */ else if (CALL_P (insn)) { - AND_COMPL_HARD_REG_SET (reg_reloaded_valid, call_used_reg_set); - AND_COMPL_HARD_REG_SET (reg_reloaded_valid, reg_reloaded_call_part_clobbered); + reg_reloaded_valid &= ~(call_used_or_fixed_regs + | reg_reloaded_call_part_clobbered); /* If this is a call to a setjmp-type function, we must not reuse any reload reg contents across the call; that will @@ -4910,8 +4898,7 @@ reload_as_needed (int live_known) to be forgotten later. */ static void -forget_old_reloads_1 (rtx x, const_rtx setter, - void *data) +forget_old_reloads_1 (rtx x, const_rtx, void *data) { unsigned int regno; unsigned int nr; @@ -4930,9 +4917,6 @@ forget_old_reloads_1 (rtx x, const_rtx setter, if (!REG_P (x)) return; - /* CLOBBER_HIGH is only supported for LRA. 
*/ - gcc_assert (setter == NULL_RTX || GET_CODE (setter) != CLOBBER_HIGH); - regno = REGNO (x); if (regno >= FIRST_PSEUDO_REGISTER) @@ -6335,9 +6319,9 @@ choose_reload_regs_init (struct insn_chain *chain, rtx *save_reload_reg_rtx) { HARD_REG_SET tmp; REG_SET_TO_HARD_REG_SET (tmp, &chain->live_throughout); - IOR_HARD_REG_SET (reg_used_in_insn, tmp); + reg_used_in_insn |= tmp; REG_SET_TO_HARD_REG_SET (tmp, &chain->dead_or_set); - IOR_HARD_REG_SET (reg_used_in_insn, tmp); + reg_used_in_insn |= tmp; compute_use_by_pseudos (®_used_in_insn, &chain->live_throughout); compute_use_by_pseudos (®_used_in_insn, &chain->dead_or_set); } @@ -6352,7 +6336,7 @@ choose_reload_regs_init (struct insn_chain *chain, rtx *save_reload_reg_rtx) CLEAR_HARD_REG_SET (reload_reg_used_in_outaddr_addr[i]); } - COMPL_HARD_REG_SET (reload_reg_unavailable, chain->used_spill_regs); + reload_reg_unavailable = ~chain->used_spill_regs; CLEAR_HARD_REG_SET (reload_reg_used_for_inherit); @@ -7797,7 +7781,7 @@ emit_output_reload_insns (struct insn_chain *chain, struct reload *rl, clear any memory of reloaded copies of the pseudo reg. If this output reload comes from a spill reg, reg_has_output_reload will make this do nothing. */ - note_stores (pat, forget_old_reloads_1, NULL); + note_stores (p, forget_old_reloads_1, NULL); if (reg_mentioned_p (rl_reg_rtx, pat)) { @@ -8289,8 +8273,7 @@ emit_reload_insns (struct insn_chain *chain) : out_regno + k); reg_reloaded_insn[regno + k] = insn; SET_HARD_REG_BIT (reg_reloaded_valid, regno + k); - if (targetm.hard_regno_call_part_clobbered (NULL, - regno + k, + if (targetm.hard_regno_call_part_clobbered (0, regno + k, mode)) SET_HARD_REG_BIT (reg_reloaded_call_part_clobbered, regno + k); @@ -8370,8 +8353,7 @@ emit_reload_insns (struct insn_chain *chain) : in_regno + k); reg_reloaded_insn[regno + k] = insn; SET_HARD_REG_BIT (reg_reloaded_valid, regno + k); - if (targetm.hard_regno_call_part_clobbered (NULL, - regno + k, + if (targetm.hard_regno_call_part_clobbered (0, regno + k, mode)) SET_HARD_REG_BIT (reg_reloaded_call_part_clobbered, regno + k); @@ -8487,7 +8469,7 @@ emit_reload_insns (struct insn_chain *chain) CLEAR_HARD_REG_BIT (reg_reloaded_dead, src_regno + k); SET_HARD_REG_BIT (reg_reloaded_valid, src_regno + k); if (targetm.hard_regno_call_part_clobbered - (NULL, src_regno + k, mode)) + (0, src_regno + k, mode)) SET_HARD_REG_BIT (reg_reloaded_call_part_clobbered, src_regno + k); else @@ -8516,7 +8498,7 @@ emit_reload_insns (struct insn_chain *chain) } } } - IOR_HARD_REG_SET (reg_reloaded_dead, reg_reloaded_died); + reg_reloaded_dead |= reg_reloaded_died; } /* Go through the motions to emit INSN and test if it is strictly valid. diff --git a/gcc/reorg.c b/gcc/reorg.c index bdfcf8851..cba183e9c 100644 --- a/gcc/reorg.c +++ b/gcc/reorg.c @@ -410,8 +410,7 @@ find_end_label (rtx kind) while (NOTE_P (insn) || (NONJUMP_INSN_P (insn) && (GET_CODE (PATTERN (insn)) == USE - || GET_CODE (PATTERN (insn)) == CLOBBER - || GET_CODE (PATTERN (insn)) == CLOBBER_HIGH))) + || GET_CODE (PATTERN (insn)) == CLOBBER))) insn = PREV_INSN (insn); /* When a target threads its epilogue we might already have a @@ -1311,8 +1310,7 @@ try_merge_delay_insns (rtx_insn *insn, rtx_insn *thread) /* TRIAL must be a CALL_INSN or INSN. Skip USE and CLOBBER. 
*/ if (NONJUMP_INSN_P (trial) - && (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER - || GET_CODE (pat) == CLOBBER_HIGH)) + && (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER)) continue; if (GET_CODE (next_to_match) == GET_CODE (trial) @@ -1506,8 +1504,7 @@ redundant_insn (rtx insn, rtx_insn *target, const vec &delay_list) --insns_to_search; pat = PATTERN (trial); - if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER - || GET_CODE (pat) == CLOBBER_HIGH) + if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) continue; if (GET_CODE (trial) == DEBUG_INSN) @@ -1575,7 +1572,7 @@ redundant_insn (rtx insn, rtx_insn *target, const vec &delay_list) /* Insns we pass may not set either NEEDED or SET, so merge them for simpler tests. */ needed.memory |= set.memory; - IOR_HARD_REG_SET (needed.regs, set.regs); + needed.regs |= set.regs; /* This insn isn't redundant if it conflicts with an insn that either is or will be in a delay slot of TARGET. */ @@ -1605,8 +1602,7 @@ redundant_insn (rtx insn, rtx_insn *target, const vec &delay_list) --insns_to_search; pat = PATTERN (trial); - if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER - || GET_CODE (pat) == CLOBBER_HIGH) + if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) continue; if (GET_CODE (trial) == DEBUG_INSN) @@ -1718,8 +1714,7 @@ own_thread_p (rtx thread, rtx label, int allow_fallthrough) || LABEL_P (insn) || (NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) != USE - && GET_CODE (PATTERN (insn)) != CLOBBER - && GET_CODE (PATTERN (insn)) != CLOBBER_HIGH)) + && GET_CODE (PATTERN (insn)) != CLOBBER)) return 0; return 1; @@ -2042,8 +2037,7 @@ fill_simple_delay_slots (int non_jumps_p) pat = PATTERN (trial); /* Stand-alone USE and CLOBBER are just for flow. */ - if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER - || GET_CODE (pat) == CLOBBER_HIGH) + if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) continue; /* And DEBUG_INSNs never go into delay slots. */ @@ -2169,8 +2163,7 @@ fill_simple_delay_slots (int non_jumps_p) pat = PATTERN (trial); /* Stand-alone USE and CLOBBER are just for flow. */ - if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER - || GET_CODE (pat) == CLOBBER_HIGH) + if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) continue; /* And DEBUG_INSNs do not go in delay slots. */ @@ -2438,8 +2431,7 @@ fill_slots_from_thread (rtx_jump_insn *insn, rtx condition, } pat = PATTERN (trial); - if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER - || GET_CODE (pat) == CLOBBER_HIGH) + if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) continue; if (GET_CODE (trial) == DEBUG_INSN) @@ -3833,8 +3825,7 @@ dbr_schedule (rtx_insn *first) if (! insn->deleted () && NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) != USE - && GET_CODE (PATTERN (insn)) != CLOBBER - && GET_CODE (PATTERN (insn)) != CLOBBER_HIGH) + && GET_CODE (PATTERN (insn)) != CLOBBER) { if (GET_CODE (PATTERN (insn)) == SEQUENCE) { diff --git a/gcc/resource.c b/gcc/resource.c index c4bcfd7dc..bf2d6beaf 100644 --- a/gcc/resource.c +++ b/gcc/resource.c @@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see #include "resource.h" #include "insn-attr.h" #include "params.h" +#include "function-abi.h" /* This structure is used to record liveness information at the targets or fallthrough insns of branches. 
We will most likely need the information @@ -108,11 +109,6 @@ update_live_status (rtx dest, const_rtx x, void *data ATTRIBUTE_UNUSED) if (GET_CODE (x) == CLOBBER) for (i = first_regno; i < last_regno; i++) CLEAR_HARD_REG_BIT (current_live_regs, i); - else if (GET_CODE (x) == CLOBBER_HIGH) - /* No current target supports both branch delay slots and CLOBBER_HIGH. - We'd need more elaborate liveness tracking to handle that - combination. */ - gcc_unreachable (); else for (i = first_regno; i < last_regno; i++) { @@ -298,7 +294,6 @@ mark_referenced_resources (rtx x, struct resources *res, return; case CLOBBER: - case CLOBBER_HIGH: return; case CALL_INSN: @@ -450,8 +445,8 @@ find_dead_or_set_registers (rtx_insn *target, struct resources *res, case CODE_LABEL: /* After a label, any pending dead registers that weren't yet used can be made dead. */ - AND_COMPL_HARD_REG_SET (pending_dead_regs, needed.regs); - AND_COMPL_HARD_REG_SET (res->regs, pending_dead_regs); + pending_dead_regs &= ~needed.regs; + res->regs &= ~pending_dead_regs; CLEAR_HARD_REG_SET (pending_dead_regs); continue; @@ -565,14 +560,12 @@ find_dead_or_set_registers (rtx_insn *target, struct resources *res, } target_res = *res; - COPY_HARD_REG_SET (scratch, target_set.regs); - AND_COMPL_HARD_REG_SET (scratch, needed.regs); - AND_COMPL_HARD_REG_SET (target_res.regs, scratch); + scratch = target_set.regs & ~needed.regs; + target_res.regs &= ~scratch; fallthrough_res = *res; - COPY_HARD_REG_SET (scratch, set.regs); - AND_COMPL_HARD_REG_SET (scratch, needed.regs); - AND_COMPL_HARD_REG_SET (fallthrough_res.regs, scratch); + scratch = set.regs & ~needed.regs; + fallthrough_res.regs &= ~scratch; if (!ANY_RETURN_P (this_jump_insn->jump_label ())) find_dead_or_set_registers @@ -581,8 +574,8 @@ find_dead_or_set_registers (rtx_insn *target, struct resources *res, find_dead_or_set_registers (next_insn, &fallthrough_res, 0, jump_count, set, needed); - IOR_HARD_REG_SET (fallthrough_res.regs, target_res.regs); - AND_HARD_REG_SET (res->regs, fallthrough_res.regs); + fallthrough_res.regs |= target_res.regs; + res->regs &= fallthrough_res.regs; break; } else @@ -601,9 +594,8 @@ find_dead_or_set_registers (rtx_insn *target, struct resources *res, mark_referenced_resources (insn, &needed, true); mark_set_resources (insn, &set, 0, MARK_SRC_DEST_CALL); - COPY_HARD_REG_SET (scratch, set.regs); - AND_COMPL_HARD_REG_SET (scratch, needed.regs); - AND_COMPL_HARD_REG_SET (res->regs, scratch); + scratch = set.regs & ~needed.regs; + res->regs &= ~scratch; } return jump_insn; @@ -665,24 +657,16 @@ mark_set_resources (rtx x, struct resources *res, int in_dest, { rtx_call_insn *call_insn = as_a (x); rtx link; - HARD_REG_SET regs; res->cc = res->memory = 1; - get_call_reg_set_usage (call_insn, ®s, regs_invalidated_by_call); - IOR_HARD_REG_SET (res->regs, regs); + res->regs |= insn_callee_abi (call_insn).full_reg_clobbers (); for (link = CALL_INSN_FUNCTION_USAGE (call_insn); link; link = XEXP (link, 1)) - { - /* We could support CLOBBER_HIGH and treat it in the same way as - HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that - yet. */ - gcc_assert (GET_CODE (XEXP (link, 0)) != CLOBBER_HIGH); - if (GET_CODE (XEXP (link, 0)) == CLOBBER) - mark_set_resources (SET_DEST (XEXP (link, 0)), res, 1, - MARK_SRC_DEST); - } + if (GET_CODE (XEXP (link, 0)) == CLOBBER) + mark_set_resources (SET_DEST (XEXP (link, 0)), res, 1, + MARK_SRC_DEST); /* Check for a REG_SETJMP. If it exists, then we must assume that this call can clobber any register. 
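Many of the hunks above and below are mechanical: IOR_HARD_REG_SET, AND_HARD_REG_SET, AND_COMPL_HARD_REG_SET and COPY_HARD_REG_SET become the C++ operators |=, &=, &= ~ and plain assignment. The following is a minimal standalone sketch of why that rewrite works, assuming a fixed-width register-set wrapper with overloaded operators; it is an illustration only, not GCC's actual hard-reg-set.h:

  // Standalone sketch: a register set wide enough for a hypothetical
  // 128-register target, with the operators that replace the old macros.
  #include <cstdint>
  #include <cstdio>

  struct hard_reg_set
  {
    uint64_t elts[2];

    hard_reg_set &operator|= (const hard_reg_set &other)
    {
      for (int i = 0; i < 2; i++) elts[i] |= other.elts[i];
      return *this;
    }
    hard_reg_set &operator&= (const hard_reg_set &other)
    {
      for (int i = 0; i < 2; i++) elts[i] &= other.elts[i];
      return *this;
    }
    hard_reg_set operator~ () const
    {
      hard_reg_set r;
      for (int i = 0; i < 2; i++) r.elts[i] = ~elts[i];
      return r;
    }
  };

  int main ()
  {
    hard_reg_set used = {{0xff, 0}}, clobbered = {{0x0f, 0}};
    used &= ~clobbered;   // old spelling: AND_COMPL_HARD_REG_SET (used, clobbered)
    used |= clobbered;    // old spelling: IOR_HARD_REG_SET (used, clobbered)
    printf ("%llx\n", (unsigned long long) used.elts[0]);   // prints ff
    return 0;
  }

In the real headers the set is a plain integer type when the target has few enough hard registers, so the same expressions come for free; the struct form above roughly corresponds to the multi-word case.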
*/ @@ -725,12 +709,6 @@ mark_set_resources (rtx x, struct resources *res, int in_dest, mark_set_resources (XEXP (x, 0), res, 1, MARK_SRC_DEST); return; - case CLOBBER_HIGH: - /* No current target supports both branch delay slots and CLOBBER_HIGH. - We'd need more elaborate liveness tracking to handle that - combination. */ - gcc_unreachable (); - case SEQUENCE: { rtx_sequence *seq = as_a (x); @@ -960,7 +938,7 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource update it below. */ if (b == tinfo->block && b != -1 && tinfo->bb_tick == bb_ticks[b]) { - COPY_HARD_REG_SET (res->regs, tinfo->live_regs); + res->regs = tinfo->live_regs; return; } } @@ -1041,15 +1019,12 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource predicated instruction, or if the CALL is NORETURN. */ if (GET_CODE (PATTERN (real_insn)) != COND_EXEC) { - HARD_REG_SET regs_invalidated_by_this_call; - get_call_reg_set_usage (real_insn, - ®s_invalidated_by_this_call, - regs_invalidated_by_call); + HARD_REG_SET regs_invalidated_by_this_call + = insn_callee_abi (real_insn).full_reg_clobbers (); /* CALL clobbers all call-used regs that aren't fixed except sp, ap, and fp. Do this before setting the result of the call live. */ - AND_COMPL_HARD_REG_SET (current_live_regs, - regs_invalidated_by_this_call); + current_live_regs &= ~regs_invalidated_by_this_call; } /* A CALL_INSN sets any global register live, since it may @@ -1078,7 +1053,7 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource GET_MODE (XEXP (link, 0)), REGNO (XEXP (link, 0))); - note_stores (PATTERN (real_insn), update_live_status, NULL); + note_stores (real_insn, update_live_status, NULL); /* If any registers were unused after this insn, kill them. These notes will always be accurate. */ @@ -1097,7 +1072,7 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource /* A label clobbers the pending dead registers since neither reload nor jump will propagate a value across a label. */ - AND_COMPL_HARD_REG_SET (current_live_regs, pending_dead_regs); + current_live_regs &= ~pending_dead_regs; CLEAR_HARD_REG_SET (pending_dead_regs); /* We must conservatively assume that all registers that used @@ -1109,7 +1084,7 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource HARD_REG_SET extra_live; REG_SET_TO_HARD_REG_SET (extra_live, DF_LR_IN (bb)); - IOR_HARD_REG_SET (current_live_regs, extra_live); + current_live_regs |= extra_live; } } @@ -1118,10 +1093,10 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource are implicitly required at that point. 
*/ else if (NOTE_P (real_insn) && NOTE_KIND (real_insn) == NOTE_INSN_EPILOGUE_BEG) - IOR_HARD_REG_SET (current_live_regs, start_of_epilogue_needs.regs); + current_live_regs |= start_of_epilogue_needs.regs; } - COPY_HARD_REG_SET (res->regs, current_live_regs); + res->regs = current_live_regs; if (tinfo != NULL) { tinfo->block = b; @@ -1160,20 +1135,17 @@ mark_target_live_regs (rtx_insn *insns, rtx target_maybe_return, struct resource { mark_referenced_resources (insn, &needed, true); - COPY_HARD_REG_SET (scratch, needed.regs); - AND_COMPL_HARD_REG_SET (scratch, set.regs); - IOR_HARD_REG_SET (new_resources.regs, scratch); + scratch = needed.regs & ~set.regs; + new_resources.regs |= scratch; mark_set_resources (insn, &set, 0, MARK_SRC_DEST_CALL); } - IOR_HARD_REG_SET (res->regs, new_resources.regs); + res->regs |= new_resources.regs; } if (tinfo != NULL) - { - COPY_HARD_REG_SET (tinfo->live_regs, res->regs); - } + tinfo->live_regs = res->regs; } /* Initialize the resources required by mark_target_live_regs (). diff --git a/gcc/rtl.c b/gcc/rtl.c index d7b8e9877..ec65fbb37 100644 --- a/gcc/rtl.c +++ b/gcc/rtl.c @@ -315,10 +315,6 @@ copy_rtx (rtx orig) return orig; break; - case CLOBBER_HIGH: - gcc_assert (REG_P (XEXP (orig, 0))); - return orig; - case CONST: if (shared_const_p (orig)) return orig; diff --git a/gcc/rtl.def b/gcc/rtl.def index f4c9d946c..edb34c5ac 100644 --- a/gcc/rtl.def +++ b/gcc/rtl.def @@ -312,16 +312,6 @@ DEF_RTL_EXPR(USE, "use", "e", RTX_EXTRA) is considered undeletable before reload. */ DEF_RTL_EXPR(CLOBBER, "clobber", "e", RTX_EXTRA) -/* Indicate that the upper parts of something are clobbered in a way that we - don't want to explain. The MODE references the lower bits that will be - preserved. Anything above that size will be clobbered. - - CLOBBER_HIGH only occurs as the operand of a PARALLEL rtx. It cannot appear - in other contexts, and unlike CLOBBER, it cannot appear on its own. - CLOBBER_HIGH can only be used with fixed register rtxes. */ - -DEF_RTL_EXPR(CLOBBER_HIGH, "clobber_high", "e", RTX_EXTRA) - /* Call a subroutine. Operand 1 is the address to call. Operand 2 is the number of arguments. */ @@ -936,6 +926,12 @@ DEF_RTL_EXPR(DEFINE_SPLIT, "define_split", "EsES", RTX_EXTRA) 7: optionally, a vector of attributes for this insn. */ DEF_RTL_EXPR(DEFINE_INSN_AND_SPLIT, "define_insn_and_split", "sEsTsESV", RTX_EXTRA) +/* A form of define_insn_and_split in which the split insn pattern (operand 5) + is determined automatically by replacing match_operands with match_dups + and match_operators with match_op_dups. The operands are the same as + define_insn_and_split but with operand 5 removed. */ +DEF_RTL_EXPR(DEFINE_INSN_AND_REWRITE, "define_insn_and_rewrite", "sEsTsSV", RTX_EXTRA) + /* Definition of an RTL peephole operation. Follows the same arguments as define_split. */ DEF_RTL_EXPR(DEFINE_PEEPHOLE2, "define_peephole2", "EsES", RTX_EXTRA) diff --git a/gcc/rtl.h b/gcc/rtl.h index b4a906f91..6093d42c0 100644 --- a/gcc/rtl.h +++ b/gcc/rtl.h @@ -1623,11 +1623,17 @@ extern const char * const reg_note_name[]; #define GET_REG_NOTE_NAME(MODE) (reg_note_name[(int) (MODE)]) /* This field is only present on CALL_INSNs. It holds a chain of EXPR_LIST of - USE and CLOBBER expressions. + USE, CLOBBER and SET expressions. USE expressions list the registers filled with arguments that are passed to the function. CLOBBER expressions document the registers explicitly clobbered by this CALL_INSN. 
+ SET expressions say that the return value of the call (the SET_DEST) + is equivalent to a value available before the call (the SET_SRC). + This kind of SET is used when the return value is predictable in + advance. It is purely an optimisation hint; unlike USEs and CLOBBERs, + it does not affect register liveness. + Pseudo registers cannot be mentioned in this list. */ #define CALL_INSN_FUNCTION_USAGE(INSN) XEXP(INSN, 7) @@ -2392,12 +2398,30 @@ extern int rtx_cost (rtx, machine_mode, enum rtx_code, int, bool); extern int address_cost (rtx, machine_mode, addr_space_t, bool); extern void get_full_rtx_cost (rtx, machine_mode, enum rtx_code, int, struct full_rtx_costs *); +extern bool native_encode_rtx (machine_mode, rtx, vec &, + unsigned int, unsigned int); +extern rtx native_decode_rtx (machine_mode, vec, + unsigned int); +extern rtx native_decode_vector_rtx (machine_mode, vec, + unsigned int, unsigned int, unsigned int); extern poly_uint64 subreg_lsb (const_rtx); -extern poly_uint64 subreg_lsb_1 (machine_mode, machine_mode, poly_uint64); +extern poly_uint64 subreg_size_lsb (poly_uint64, poly_uint64, poly_uint64); extern poly_uint64 subreg_size_offset_from_lsb (poly_uint64, poly_uint64, poly_uint64); extern bool read_modify_subreg_p (const_rtx); +/* Given a subreg's OUTER_MODE, INNER_MODE, and SUBREG_BYTE, return the + bit offset at which the subreg begins (counting from the least significant + bit of the operand). */ + +inline poly_uint64 +subreg_lsb_1 (machine_mode outer_mode, machine_mode inner_mode, + poly_uint64 subreg_byte) +{ + return subreg_size_lsb (GET_MODE_SIZE (outer_mode), + GET_MODE_SIZE (inner_mode), subreg_byte); +} + /* Return the subreg byte offset for a subreg whose outer mode is OUTER_MODE, whose inner mode is INNER_MODE, and where there are LSB_SHIFT *bits* between the lsb of the outer value and the lsb of @@ -2645,7 +2669,7 @@ do { \ /* For a SET rtx, SET_DEST is the place that is set and SET_SRC is the value it is set to. 
*/ -#define SET_DEST(RTX) XC3EXP (RTX, 0, SET, CLOBBER, CLOBBER_HIGH) +#define SET_DEST(RTX) XC2EXP (RTX, 0, SET, CLOBBER) #define SET_SRC(RTX) XCEXP (RTX, 1, SET) #define SET_IS_RETURN_P(RTX) \ (RTL_FLAG_CHECK1 ("SET_IS_RETURN_P", (RTX), SET)->jump) @@ -3369,8 +3393,7 @@ extern bool val_signbit_known_clear_p (machine_mode, unsigned HOST_WIDE_INT); /* In reginfo.c */ -extern machine_mode choose_hard_reg_mode (unsigned int, unsigned int, - bool); +extern machine_mode choose_hard_reg_mode (unsigned int, unsigned int, bool); extern const HARD_REG_SET &simplifiable_subregs (const subreg_shape &); /* In emit-rtl.c */ @@ -3407,6 +3430,7 @@ extern int rtx_unstable_p (const_rtx); extern bool rtx_varies_p (const_rtx, bool); extern bool rtx_addr_varies_p (const_rtx, bool); extern rtx get_call_rtx_from (rtx); +extern tree get_call_fndecl (const rtx_insn *); extern HOST_WIDE_INT get_integer_term (const_rtx); extern rtx get_related_value (const_rtx); extern bool offset_within_block_p (const_rtx, HOST_WIDE_INT); @@ -3435,7 +3459,10 @@ extern void record_hard_reg_sets (rtx, const_rtx, void *); extern void record_hard_reg_uses (rtx *, void *); extern void find_all_hard_regs (const_rtx, HARD_REG_SET *); extern void find_all_hard_reg_sets (const rtx_insn *, HARD_REG_SET *, bool); -extern void note_stores (const_rtx, void (*) (rtx, const_rtx, void *), void *); +extern void note_pattern_stores (const_rtx, + void (*) (rtx, const_rtx, void *), void *); +extern void note_stores (const rtx_insn *, + void (*) (rtx, const_rtx, void *), void *); extern void note_uses (rtx *, void (*) (rtx *, void *), void *); extern int dead_or_set_p (const rtx_insn *, const_rtx); extern int dead_or_set_regno_p (const rtx_insn *, unsigned int); @@ -3476,16 +3503,6 @@ extern bool tablejump_p (const rtx_insn *, rtx_insn **, rtx_jump_table_data **); extern int computed_jump_p (const rtx_insn *); extern bool tls_referenced_p (const_rtx); extern bool contains_mem_rtx_p (rtx x); -extern bool reg_is_clobbered_by_clobber_high (unsigned int, machine_mode, - const_rtx); - -/* Convenient wrapper for reg_is_clobbered_by_clobber_high. */ -inline bool -reg_is_clobbered_by_clobber_high (const_rtx x, const_rtx clobber_high_op) -{ - return reg_is_clobbered_by_clobber_high (REGNO (x), GET_MODE (x), - clobber_high_op); -} /* Overload for refers_to_regno_p for checking a single register. */ inline bool @@ -4279,7 +4296,6 @@ extern void vt_equate_reg_base_value (const_rtx, const_rtx); extern bool memory_modified_in_insn_p (const_rtx, const_rtx); extern bool may_be_sp_based_p (rtx); extern rtx gen_hard_reg_clobber (machine_mode, unsigned int); -extern rtx gen_hard_reg_clobber_high (machine_mode, unsigned int); extern rtx get_reg_known_value (unsigned int); extern bool get_reg_known_equiv_p (unsigned int); extern rtx get_reg_base_value (unsigned int); @@ -4353,14 +4369,11 @@ extern tree GTY(()) global_regs_decl[FIRST_PSEUDO_REGISTER]; Available only for functions that has been already assembled. */ struct GTY(()) cgraph_rtl_info { - unsigned int preferred_incoming_stack_boundary; + unsigned int preferred_incoming_stack_boundary; - /* Call unsaved hard registers really used by the corresponding - function (including ones used by functions called by the - function). */ + /* Which registers the function clobbers, either directly or by + calling another function. */ HARD_REG_SET function_used_regs; - /* Set if function_used_regs is valid. 
*/ - unsigned function_used_regs_valid: 1; }; /* If loads from memories of mode MODE always sign or zero extend, diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index 01af063a2..553d71c1c 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -823,6 +823,24 @@ get_call_rtx_from (rtx x) return x; return NULL_RTX; } + +/* Get the declaration of the function called by INSN. */ + +tree +get_call_fndecl (const rtx_insn *insn) +{ + rtx note, datum; + + note = find_reg_note (insn, REG_CALL_DECL, NULL_RTX); + if (note == NULL_RTX) + return NULL_TREE; + + datum = XEXP (note, 0); + if (datum != NULL_RTX) + return SYMBOL_REF_DECL (datum); + + return NULL_TREE; +} /* Return the value of the integer term in X, if one is apparent; otherwise return 0. @@ -1198,10 +1216,6 @@ reg_referenced_p (const_rtx x, const_rtx body) return 1; return 0; - case CLOBBER_HIGH: - gcc_assert (REG_P (XEXP (body, 0))); - return 0; - case COND_EXEC: if (reg_overlap_mentioned_p (x, COND_EXEC_TEST (body))) return 1; @@ -1424,11 +1438,7 @@ set_of_1 (rtx x, const_rtx pat, void *data1) { struct set_of_data *const data = (struct set_of_data *) (data1); if (rtx_equal_p (x, data->pat) - || (GET_CODE (pat) == CLOBBER_HIGH - && REGNO(data->pat) == REGNO(XEXP (pat, 0)) - && reg_is_clobbered_by_clobber_high (data->pat, XEXP (pat, 0))) - || (GET_CODE (pat) != CLOBBER_HIGH && !MEM_P (x) - && reg_overlap_mentioned_p (data->pat, x))) + || (!MEM_P (x) && reg_overlap_mentioned_p (data->pat, x))) data->found = pat; } @@ -1440,7 +1450,7 @@ set_of (const_rtx pat, const_rtx insn) struct set_of_data data; data.found = NULL_RTX; data.pat = pat; - note_stores (INSN_P (insn) ? PATTERN (insn) : insn, set_of_1, &data); + note_pattern_stores (INSN_P (insn) ? PATTERN (insn) : insn, set_of_1, &data); return data.found; } @@ -1476,15 +1486,9 @@ find_all_hard_reg_sets (const rtx_insn *insn, HARD_REG_SET *pset, bool implicit) rtx link; CLEAR_HARD_REG_SET (*pset); - note_stores (PATTERN (insn), record_hard_reg_sets, pset); - if (CALL_P (insn)) - { - if (implicit) - IOR_HARD_REG_SET (*pset, call_used_reg_set); - - for (link = CALL_INSN_FUNCTION_USAGE (insn); link; link = XEXP (link, 1)) - record_hard_reg_sets (XEXP (link, 0), NULL, pset); - } + note_stores (insn, record_hard_reg_sets, pset); + if (CALL_P (insn) && implicit) + *pset |= call_used_or_fixed_regs; for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) if (REG_NOTE_KIND (link) == REG_INC) record_hard_reg_sets (XEXP (link, 0), NULL, pset); @@ -1517,7 +1521,6 @@ single_set_2 (const rtx_insn *insn, const_rtx pat) { case USE: case CLOBBER: - case CLOBBER_HIGH: break; case SET: @@ -1671,9 +1674,7 @@ noop_move_p (const rtx_insn *insn) { rtx tem = XVECEXP (pat, 0, i); - if (GET_CODE (tem) == USE - || GET_CODE (tem) == CLOBBER - || GET_CODE (tem) == CLOBBER_HIGH) + if (GET_CODE (tem) == USE || GET_CODE (tem) == CLOBBER) continue; if (GET_CODE (tem) != SET || ! set_noop_p (tem)) @@ -1899,16 +1900,15 @@ reg_overlap_mentioned_p (const_rtx x, const_rtx in) the SUBREG will be passed. 
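The rtlanal.c hunk just below renames the pattern walker to note_pattern_stores and adds an insn-level note_stores that, for calls, also reports the CLOBBERs recorded in CALL_INSN_FUNCTION_USAGE before delegating to the pattern walker; that is why callers throughout these patches switch from note_stores (PATTERN (insn), ...) to note_stores (insn, ...). Below is a toy sketch of that wrapper split, with invented types standing in for rtx_insn and the callback signature (illustrative only):

  #include <cstdio>
  #include <vector>
  #include <functional>

  struct toy_insn
  {
    std::vector<int> pattern_stores;       // regs stored by the insn pattern
    bool is_call;
    std::vector<int> call_usage_clobbers;  // regs clobbered per call usage
  };

  // Walk only the stores visible in the pattern itself.
  static void
  note_pattern_stores (const toy_insn &insn,
                       const std::function<void (int)> &fun)
  {
    for (int reg : insn.pattern_stores)
      fun (reg);
  }

  // Insn-level entry point: for calls, also report call-usage clobbers.
  static void
  note_stores (const toy_insn &insn, const std::function<void (int)> &fun)
  {
    if (insn.is_call)
      for (int reg : insn.call_usage_clobbers)
        fun (reg);
    note_pattern_stores (insn, fun);
  }

  int main ()
  {
    toy_insn call = {{0}, true, {30}};
    note_stores (call, [] (int reg) { printf ("stores r%d\n", reg); });
    return 0;
  }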
*/ void -note_stores (const_rtx x, void (*fun) (rtx, const_rtx, void *), void *data) +note_pattern_stores (const_rtx x, + void (*fun) (rtx, const_rtx, void *), void *data) { int i; if (GET_CODE (x) == COND_EXEC) x = COND_EXEC_CODE (x); - if (GET_CODE (x) == SET - || GET_CODE (x) == CLOBBER - || GET_CODE (x) == CLOBBER_HIGH) + if (GET_CODE (x) == SET || GET_CODE (x) == CLOBBER) { rtx dest = SET_DEST (x); @@ -1933,7 +1933,22 @@ note_stores (const_rtx x, void (*fun) (rtx, const_rtx, void *), void *data) else if (GET_CODE (x) == PARALLEL) for (i = XVECLEN (x, 0) - 1; i >= 0; i--) - note_stores (XVECEXP (x, 0, i), fun, data); + note_pattern_stores (XVECEXP (x, 0, i), fun, data); +} + +/* Same, but for an instruction. If the instruction is a call, include + any CLOBBERs in its CALL_INSN_FUNCTION_USAGE. */ + +void +note_stores (const rtx_insn *insn, + void (*fun) (rtx, const_rtx, void *), void *data) +{ + if (CALL_P (insn)) + for (rtx link = CALL_INSN_FUNCTION_USAGE (insn); + link; link = XEXP (link, 1)) + if (GET_CODE (XEXP (link, 0)) == CLOBBER) + note_pattern_stores (XEXP (link, 0), fun, data); + note_pattern_stores (PATTERN (insn), fun, data); } /* Like notes_stores, but call FUN for each expression that is being @@ -3611,23 +3626,31 @@ loc_mentioned_in_p (rtx *loc, const_rtx in) return 0; } -/* Helper function for subreg_lsb. Given a subreg's OUTER_MODE, INNER_MODE, - and SUBREG_BYTE, return the bit offset where the subreg begins - (counting from the least significant bit of the operand). */ +/* Reinterpret a subreg as a bit extraction from an integer and return + the position of the least significant bit of the extracted value. + In other words, if the extraction were performed as a shift right + and mask, return the number of bits to shift right. + + The outer value of the subreg has OUTER_BYTES bytes and starts at + byte offset SUBREG_BYTE within an inner value of INNER_BYTES bytes. */ poly_uint64 -subreg_lsb_1 (machine_mode outer_mode, - machine_mode inner_mode, - poly_uint64 subreg_byte) +subreg_size_lsb (poly_uint64 outer_bytes, + poly_uint64 inner_bytes, + poly_uint64 subreg_byte) { poly_uint64 subreg_end, trailing_bytes, byte_pos; /* A paradoxical subreg begins at bit position 0. */ - if (paradoxical_subreg_p (outer_mode, inner_mode)) - return 0; + gcc_checking_assert (ordered_p (outer_bytes, inner_bytes)); + if (maybe_gt (outer_bytes, inner_bytes)) + { + gcc_checking_assert (known_eq (subreg_byte, 0U)); + return 0; + } - subreg_end = subreg_byte + GET_MODE_SIZE (outer_mode); - trailing_bytes = GET_MODE_SIZE (inner_mode) - subreg_end; + subreg_end = subreg_byte + outer_bytes; + trailing_bytes = inner_bytes - subreg_end; if (WORDS_BIG_ENDIAN && BYTES_BIG_ENDIAN) byte_pos = trailing_bytes; else if (!WORDS_BIG_ENDIAN && !BYTES_BIG_ENDIAN) @@ -4123,7 +4146,7 @@ find_first_parameter_load (rtx_insn *call_insn, rtx_insn *boundary) if (INSN_P (before)) { int nregs_old = parm.nregs; - note_stores (PATTERN (before), parms_set, &parm); + note_stores (before, parms_set, &parm); /* If we found something that did not set a parameter reg, we're done. Do not keep going, as that might result in hoisting an insn before the setting of a pseudo @@ -6601,32 +6624,3 @@ tls_referenced_p (const_rtx x) return true; return false; } - -/* Return true if reg REGNO with mode REG_MODE would be clobbered by the - clobber_high operand in CLOBBER_HIGH_OP. 
*/ - -bool -reg_is_clobbered_by_clobber_high (unsigned int regno, machine_mode reg_mode, - const_rtx clobber_high_op) -{ - unsigned int clobber_regno = REGNO (clobber_high_op); - machine_mode clobber_mode = GET_MODE (clobber_high_op); - unsigned char regno_nregs = hard_regno_nregs (regno, reg_mode); - - /* Clobber high should always span exactly one register. */ - gcc_assert (REG_NREGS (clobber_high_op) == 1); - - /* Clobber high needs to match with one of the registers in X. */ - if (clobber_regno < regno || clobber_regno >= regno + regno_nregs) - return false; - - gcc_assert (reg_mode != BLKmode && clobber_mode != BLKmode); - - if (reg_mode == VOIDmode) - return clobber_mode != VOIDmode; - - /* Clobber high will clobber if its size might be greater than the size of - register regno. */ - return maybe_gt (exact_div (GET_MODE_SIZE (reg_mode), regno_nregs), - GET_MODE_SIZE (clobber_mode)); -} diff --git a/gcc/rtx-vector-builder.h b/gcc/rtx-vector-builder.h index d5950e2b8..08b55dd36 100644 --- a/gcc/rtx-vector-builder.h +++ b/gcc/rtx-vector-builder.h @@ -24,10 +24,11 @@ along with GCC; see the file COPYING3. If not see /* This class is used to build VECTOR_CSTs from a sequence of elements. See vector_builder for more details. */ -class rtx_vector_builder : public vector_builder +class rtx_vector_builder : public vector_builder { - typedef vector_builder parent; - friend class vector_builder; + typedef vector_builder parent; + friend class vector_builder; public: rtx_vector_builder () : m_mode (VOIDmode) {} @@ -48,6 +49,15 @@ private: bool can_elide_p (rtx) const { return true; } void note_representative (rtx *, rtx) {} + static poly_uint64 shape_nelts (machine_mode mode) + { return GET_MODE_NUNITS (mode); } + static poly_uint64 nelts_of (const_rtx x) + { return CONST_VECTOR_NUNITS (x); } + static unsigned int npatterns_of (const_rtx x) + { return CONST_VECTOR_NPATTERNS (x); } + static unsigned int nelts_per_pattern_of (const_rtx x) + { return CONST_VECTOR_NELTS_PER_PATTERN (x); } + rtx find_cached_value (); machine_mode m_mode; diff --git a/gcc/sched-deps.c b/gcc/sched-deps.c index 28b9d38ab..fe447d16a 100644 --- a/gcc/sched-deps.c +++ b/gcc/sched-deps.c @@ -38,6 +38,7 @@ along with GCC; see the file COPYING3. If not see #include "sched-int.h" #include "params.h" #include "cselib.h" +#include "function-abi.h" #ifdef INSN_SCHEDULING @@ -2203,9 +2204,9 @@ init_insn_reg_pressure_info (rtx_insn *insn) reg_pressure_info[cl].change = 0; } - note_stores (PATTERN (insn), mark_insn_reg_clobber, insn); + note_stores (insn, mark_insn_reg_clobber, insn); - note_stores (PATTERN (insn), mark_insn_reg_store, insn); + note_stores (insn, mark_insn_reg_store, insn); if (AUTO_INC_DEC) for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) @@ -2319,13 +2320,6 @@ sched_analyze_reg (struct deps_desc *deps, int regno, machine_mode mode, while (--i >= 0) note_reg_use (regno + i); } - else if (ref == CLOBBER_HIGH) - { - gcc_assert (i == 1); - /* We don't know the current state of the register, so have to treat - the clobber high as a full clobber. */ - note_reg_clobber (regno); - } else { while (--i >= 0) @@ -2349,8 +2343,6 @@ sched_analyze_reg (struct deps_desc *deps, int regno, machine_mode mode, else if (ref == USE) note_reg_use (regno); else - /* For CLOBBER_HIGH, we don't know the current state of the register, - so have to treat it as a full clobber. 
*/ note_reg_clobber (regno); /* Pseudos that are REG_EQUIV to something may be replaced @@ -2885,7 +2877,7 @@ get_implicit_reg_pending_clobbers (HARD_REG_SET *temp, rtx_insn *insn) preprocess_constraints (insn); alternative_mask preferred = get_preferred_alternatives (insn); ira_implicitly_set_insn_hard_regs (temp, preferred); - AND_COMPL_HARD_REG_SET (*temp, ira_no_alloc_regs); + *temp &= ~ira_no_alloc_regs; } /* Analyze an INSN with pattern X to find all dependencies. */ @@ -2901,7 +2893,7 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn) { HARD_REG_SET temp; get_implicit_reg_pending_clobbers (&temp, insn); - IOR_HARD_REG_SET (implicit_reg_pending_clobbers, temp); + implicit_reg_pending_clobbers |= temp; } can_start_lhs_rhs_p = (NONJUMP_INSN_P (insn) @@ -2973,7 +2965,7 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn) sub = COND_EXEC_CODE (sub); code = GET_CODE (sub); } - else if (code == SET || code == CLOBBER || code == CLOBBER_HIGH) + else if (code == SET || code == CLOBBER) sched_analyze_1 (deps, sub, insn); else sched_analyze_2 (deps, sub, insn); @@ -2989,10 +2981,6 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn) { if (GET_CODE (XEXP (link, 0)) == CLOBBER) sched_analyze_1 (deps, XEXP (link, 0), insn); - else if (GET_CODE (XEXP (link, 0)) == CLOBBER_HIGH) - /* We could support CLOBBER_HIGH and treat it in the same way as - HARD_REGNO_CALL_PART_CLOBBERED, but no port needs that yet. */ - gcc_unreachable (); else if (GET_CODE (XEXP (link, 0)) != SET) sched_analyze_2 (deps, XEXP (link, 0), insn); } @@ -3332,10 +3320,9 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, rtx_insn *insn) IOR_REG_SET (&deps->reg_last_in_use, reg_pending_uses); IOR_REG_SET (&deps->reg_last_in_use, reg_pending_clobbers); IOR_REG_SET (&deps->reg_last_in_use, reg_pending_sets); - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (TEST_HARD_REG_BIT (implicit_reg_pending_uses, i) - || TEST_HARD_REG_BIT (implicit_reg_pending_clobbers, i)) - SET_REGNO_REG_SET (&deps->reg_last_in_use, i); + IOR_REG_SET_HRS (&deps->reg_last_in_use, + implicit_reg_pending_uses + | implicit_reg_pending_clobbers); /* Set up the pending barrier found. */ deps->last_reg_pending_barrier = reg_pending_barrier; @@ -3724,6 +3711,7 @@ deps_analyze_insn (struct deps_desc *deps, rtx_insn *insn) } else { + function_abi callee_abi = insn_callee_abi (insn); for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) /* A call may read and modify global register variables. */ if (global_regs[i]) @@ -3735,8 +3723,8 @@ deps_analyze_insn (struct deps_desc *deps, rtx_insn *insn) Since we only have a choice between 'might be clobbered' and 'definitely not clobbered', we must include all partly call-clobbered registers here. 
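The scheduler change continuing below is part of the same function-abi rework: targetm.hard_regno_call_part_clobbered now takes an ABI identifier rather than an insn, and callers obtain the callee's ABI with insn_callee_abi (insn), whose full_reg_clobbers () replaces get_call_reg_set_usage. As a rough illustration of the shape of that interface (the names and register ranges here are invented for the example, not taken from function-abi.h):

  #include <cstdio>
  #include <bitset>

  enum mode_kind { SI_MODE, WIDE_VEC_MODE };

  struct toy_abi
  {
    unsigned id;                     // index into a table of known ABIs
    std::bitset<64> full_clobbers;   // registers the call clobbers outright

    // Registers whose low part survives the call but whose high part does
    // not (the case CLOBBER_HIGH used to describe) count as clobbered only
    // for sufficiently wide modes.  The range is purely illustrative.
    bool part_clobbered_p (unsigned regno, mode_kind mode) const
    {
      return mode == WIDE_VEC_MODE && regno >= 8 && regno < 24;
    }
  };

  int main ()
  {
    toy_abi base_abi = {0, std::bitset<64> (0xffffull)};
    unsigned clobbered = 0;
    for (unsigned r = 0; r < 32; ++r)
      if (base_abi.full_clobbers[r]
          || base_abi.part_clobbered_p (r, WIDE_VEC_MODE))
        ++clobbered;   // a scheduler-like client would record r here
    printf ("abi %u: %u of 32 regs treated as clobbered for wide modes\n",
            base_abi.id, clobbered);
    return 0;
  }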
*/ - else if (targetm.hard_regno_call_part_clobbered (insn, i, - reg_raw_mode[i]) + else if (targetm.hard_regno_call_part_clobbered + (callee_abi.id (), i, reg_raw_mode[i]) || TEST_HARD_REG_BIT (regs_invalidated_by_call, i)) SET_REGNO_REG_SET (reg_pending_clobbers, i); /* We don't know what set of fixed registers might be used diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c index 83688b3c9..c5ee33bf5 100644 --- a/gcc/sched-rgn.c +++ b/gcc/sched-rgn.c @@ -2409,7 +2409,7 @@ static bool sets_likely_spilled (rtx pat) { bool ret = false; - note_stores (pat, sets_likely_spilled_1, &ret); + note_pattern_stores (pat, sets_likely_spilled_1, &ret); return ret; } diff --git a/gcc/sel-sched-ir.c b/gcc/sel-sched-ir.c index 6dec1beaa..f8f1d8238 100644 --- a/gcc/sel-sched-ir.c +++ b/gcc/sel-sched-ir.c @@ -2661,12 +2661,9 @@ setup_id_implicit_regs (idata_t id, insn_t insn) return; HARD_REG_SET temp; - unsigned regno; - hard_reg_set_iterator hrsi; get_implicit_reg_pending_clobbers (&temp, insn); - EXECUTE_IF_SET_IN_HARD_REG_SET (temp, 0, regno, hrsi) - SET_REGNO_REG_SET (IDATA_REG_SETS (id), regno); + IOR_REG_SET_HRS (IDATA_REG_SETS (id), temp); } /* Setup register sets describing INSN in ID. */ diff --git a/gcc/sel-sched.c b/gcc/sel-sched.c index f127ff745..bf370b5a5 100644 --- a/gcc/sel-sched.c +++ b/gcc/sel-sched.c @@ -1102,7 +1102,7 @@ init_regs_for_mode (machine_mode mode) if (i >= 0) continue; - if (targetm.hard_regno_call_part_clobbered (NULL, cur_reg, mode)) + if (targetm.hard_regno_call_part_clobbered (0, cur_reg, mode)) SET_HARD_REG_BIT (sel_hrd.regs_for_call_clobbered[mode], cur_reg); @@ -1123,7 +1123,7 @@ init_hard_regs_data (void) CLEAR_HARD_REG_SET (sel_hrd.regs_ever_used); for (cur_reg = 0; cur_reg < FIRST_PSEUDO_REGISTER; cur_reg++) - if (df_regs_ever_live_p (cur_reg) || call_used_regs[cur_reg]) + if (df_regs_ever_live_p (cur_reg) || call_used_or_fixed_reg_p (cur_reg)) SET_HARD_REG_BIT (sel_hrd.regs_ever_used, cur_reg); /* Initialize registers that are valid based on mode when this is @@ -1221,15 +1221,13 @@ mark_unavailable_hard_regs (def_t def, struct reg_rename *reg_rename_p, The HARD_REGNO_RENAME_OK covers other cases in condition below. */ if (IN_RANGE (REGNO (orig_dest), FIRST_STACK_REG, LAST_STACK_REG) && REGNO_REG_SET_P (used_regs, FIRST_STACK_REG)) - IOR_HARD_REG_SET (reg_rename_p->unavailable_hard_regs, - sel_hrd.stack_regs); + reg_rename_p->unavailable_hard_regs |= sel_hrd.stack_regs; #endif - /* If there's a call on this path, make regs from call_used_reg_set + /* If there's a call on this path, make regs from call_used_or_fixed_regs unavailable. */ if (def->crosses_call) - IOR_HARD_REG_SET (reg_rename_p->unavailable_hard_regs, - call_used_reg_set); + reg_rename_p->unavailable_hard_regs |= call_used_or_fixed_regs; /* Stop here before reload: we need FRAME_REGS, STACK_REGS, and crosses_call, but not register classes. */ @@ -1238,22 +1236,20 @@ mark_unavailable_hard_regs (def_t def, struct reg_rename *reg_rename_p, /* Leave regs as 'available' only from the current register class. */ - COPY_HARD_REG_SET (reg_rename_p->available_for_renaming, - reg_class_contents[cl]); + reg_rename_p->available_for_renaming = reg_class_contents[cl]; mode = GET_MODE (orig_dest); /* Leave only registers available for this mode. 
*/ if (!sel_hrd.regs_for_mode_ok[mode]) init_regs_for_mode (mode); - AND_HARD_REG_SET (reg_rename_p->available_for_renaming, - sel_hrd.regs_for_mode[mode]); + reg_rename_p->available_for_renaming &= sel_hrd.regs_for_mode[mode]; /* Exclude registers that are partially call clobbered. */ if (def->crosses_call - && !targetm.hard_regno_call_part_clobbered (NULL, regno, mode)) - AND_COMPL_HARD_REG_SET (reg_rename_p->available_for_renaming, - sel_hrd.regs_for_call_clobbered[mode]); + && !targetm.hard_regno_call_part_clobbered (0, regno, mode)) + reg_rename_p->available_for_renaming + &= ~sel_hrd.regs_for_call_clobbered[mode]; /* Leave only those that are ok to rename. */ EXECUTE_IF_SET_IN_HARD_REG_SET (reg_rename_p->available_for_renaming, @@ -1274,8 +1270,7 @@ mark_unavailable_hard_regs (def_t def, struct reg_rename *reg_rename_p, cur_reg); } - AND_COMPL_HARD_REG_SET (reg_rename_p->available_for_renaming, - reg_rename_p->unavailable_hard_regs); + reg_rename_p->available_for_renaming &= ~reg_rename_p->unavailable_hard_regs; /* Regno is always ok from the renaming part of view, but it really could be in *unavailable_hard_regs already, so set it here instead @@ -1686,8 +1681,7 @@ find_best_reg_for_expr (expr_t expr, blist_t bnds, bool *is_orig_reg_p) /* Join hard registers unavailable due to register class restrictions and live range intersection. */ - IOR_HARD_REG_SET (hard_regs_used, - reg_rename_data.unavailable_hard_regs); + hard_regs_used |= reg_rename_data.unavailable_hard_regs; best_reg = choose_best_reg (hard_regs_used, ®_rename_data, original_insns, is_orig_reg_p); @@ -2110,7 +2104,7 @@ implicit_clobber_conflict_p (insn_t through_insn, expr_t expr) preprocess_constraints (insn); alternative_mask prefrred = get_preferred_alternatives (insn); ira_implicitly_set_insn_hard_regs (&temp, prefrred); - AND_COMPL_HARD_REG_SET (temp, ira_no_alloc_regs); + temp &= ~ira_no_alloc_regs; /* If any implicit clobber registers intersect with regular ones in through_insn, we have a dependency and thus bail out. 
*/ diff --git a/gcc/shrink-wrap.c b/gcc/shrink-wrap.c index 57124db92..018696637 100644 --- a/gcc/shrink-wrap.c +++ b/gcc/shrink-wrap.c @@ -76,7 +76,7 @@ requires_stack_frame_p (rtx_insn *insn, HARD_REG_SET prologue_used, } if (hard_reg_set_intersect_p (hardregs, prologue_used)) return true; - AND_COMPL_HARD_REG_SET (hardregs, call_used_reg_set); + hardregs &= ~call_used_or_fixed_regs; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (TEST_HARD_REG_BIT (hardregs, regno) && df_regs_ever_live_p (regno)) @@ -151,8 +151,8 @@ live_edge_for_reg (basic_block bb, int regno, int end_regno) static bool move_insn_for_shrink_wrap (basic_block bb, rtx_insn *insn, - const HARD_REG_SET uses, - const HARD_REG_SET defs, + const_hard_reg_set uses, + const_hard_reg_set defs, bool *split_p, struct dead_debug_local *debug) { @@ -687,9 +687,9 @@ try_shrink_wrapping (edge *entry_edge, rtx_insn *prologue_seq) HARD_REG_SET this_used; CLEAR_HARD_REG_SET (this_used); note_uses (&PATTERN (insn), record_hard_reg_uses, &this_used); - AND_COMPL_HARD_REG_SET (this_used, prologue_clobbered); - IOR_HARD_REG_SET (prologue_used, this_used); - note_stores (PATTERN (insn), record_hard_reg_sets, &prologue_clobbered); + this_used &= ~prologue_clobbered; + prologue_used |= this_used; + note_stores (insn, record_hard_reg_sets, &prologue_clobbered); } CLEAR_HARD_REG_BIT (prologue_clobbered, STACK_POINTER_REGNUM); if (frame_pointer_needed) diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c index bdbd1b98e..612d21b72 100644 --- a/gcc/simplify-rtx.c +++ b/gcc/simplify-rtx.c @@ -35,6 +35,7 @@ along with GCC; see the file COPYING3. If not see #include "flags.h" #include "selftest.h" #include "selftest-rtl.h" +#include "rtx-vector-builder.h" /* Simplification and canonicalization of RTL. */ @@ -45,7 +46,6 @@ along with GCC; see the file COPYING3. If not see #define HWI_SIGN_EXTEND(low) \ ((((HOST_WIDE_INT) low) < 0) ? HOST_WIDE_INT_M1 : HOST_WIDE_INT_0) -static rtx neg_const_int (machine_mode, const_rtx); static bool plus_minus_operand_p (const_rtx); static rtx simplify_plus_minus (enum rtx_code, machine_mode, rtx, rtx); static rtx simplify_associative_operation (enum rtx_code, machine_mode, @@ -56,17 +56,12 @@ static rtx simplify_unary_operation_1 (enum rtx_code, machine_mode, rtx); static rtx simplify_binary_operation_1 (enum rtx_code, machine_mode, rtx, rtx, rtx, rtx); -/* Negate a CONST_INT rtx. */ +/* Negate I, which satisfies poly_int_rtx_p. MODE is the mode of I. 
*/ + static rtx -neg_const_int (machine_mode mode, const_rtx i) +neg_poly_int_rtx (machine_mode mode, const_rtx i) { - unsigned HOST_WIDE_INT val = -UINTVAL (i); - - if (!HWI_COMPUTABLE_MODE_P (mode) - && val == UINTVAL (i)) - return simplify_const_unary_operation (NEG, mode, CONST_CAST_RTX (i), - mode); - return gen_int_mode (val, mode); + return immed_wide_int_const (-wi::to_poly_wide (i, mode), mode); } /* Test whether expression, X, is an immediate constant that represents @@ -1504,12 +1499,12 @@ simplify_unary_operation_1 (enum rtx_code code, machine_mode mode, rtx op) && CONST_INT_P (XEXP (op, 1)) && XEXP (XEXP (op, 0), 1) == XEXP (op, 1) && (op_mode = as_a (GET_MODE (op)), - GET_MODE_BITSIZE (op_mode) > INTVAL (XEXP (op, 1)))) + GET_MODE_PRECISION (op_mode) > INTVAL (XEXP (op, 1)))) { scalar_int_mode tmode; - gcc_assert (GET_MODE_BITSIZE (int_mode) - > GET_MODE_BITSIZE (op_mode)); - if (int_mode_for_size (GET_MODE_BITSIZE (op_mode) + gcc_assert (GET_MODE_PRECISION (int_mode) + > GET_MODE_PRECISION (op_mode)); + if (int_mode_for_size (GET_MODE_PRECISION (op_mode) - INTVAL (XEXP (op, 1)), 1).exists (&tmode)) { rtx inner = @@ -1735,45 +1730,42 @@ simplify_const_unary_operation (enum rtx_code code, machine_mode mode, } if (CONST_SCALAR_INT_P (op) || CONST_DOUBLE_AS_FLOAT_P (op)) return gen_const_vec_duplicate (mode, op); - unsigned int n_elts; if (GET_CODE (op) == CONST_VECTOR - && GET_MODE_NUNITS (mode).is_constant (&n_elts)) - { - /* This must be constant if we're duplicating it to a constant - number of elements. */ - unsigned int in_n_elts = CONST_VECTOR_NUNITS (op).to_constant (); - gcc_assert (in_n_elts < n_elts); - gcc_assert ((n_elts % in_n_elts) == 0); - rtvec v = rtvec_alloc (n_elts); - for (unsigned i = 0; i < n_elts; i++) - RTVEC_ELT (v, i) = CONST_VECTOR_ELT (op, i % in_n_elts); - return gen_rtx_CONST_VECTOR (mode, v); + && (CONST_VECTOR_DUPLICATE_P (op) + || CONST_VECTOR_NUNITS (op).is_constant ())) + { + unsigned int npatterns = (CONST_VECTOR_DUPLICATE_P (op) + ? 
CONST_VECTOR_NPATTERNS (op) + : CONST_VECTOR_NUNITS (op).to_constant ()); + gcc_assert (multiple_p (GET_MODE_NUNITS (mode), npatterns)); + rtx_vector_builder builder (mode, npatterns, 1); + for (unsigned i = 0; i < npatterns; i++) + builder.quick_push (CONST_VECTOR_ELT (op, i)); + return builder.build (); } } - if (VECTOR_MODE_P (mode) && GET_CODE (op) == CONST_VECTOR) + if (VECTOR_MODE_P (mode) + && GET_CODE (op) == CONST_VECTOR + && known_eq (GET_MODE_NUNITS (mode), CONST_VECTOR_NUNITS (op))) { - unsigned int n_elts; - if (!CONST_VECTOR_NUNITS (op).is_constant (&n_elts)) - return NULL_RTX; + gcc_assert (GET_MODE (op) == op_mode); - machine_mode opmode = GET_MODE (op); - gcc_assert (known_eq (GET_MODE_NUNITS (mode), n_elts)); - gcc_assert (known_eq (GET_MODE_NUNITS (opmode), n_elts)); - - rtvec v = rtvec_alloc (n_elts); - unsigned int i; + rtx_vector_builder builder; + if (!builder.new_unary_operation (mode, op, false)) + return 0; - for (i = 0; i < n_elts; i++) + unsigned int count = builder.encoded_nelts (); + for (unsigned int i = 0; i < count; i++) { rtx x = simplify_unary_operation (code, GET_MODE_INNER (mode), CONST_VECTOR_ELT (op, i), - GET_MODE_INNER (opmode)); + GET_MODE_INNER (op_mode)); if (!x || !valid_for_const_vector_p (mode, x)) return 0; - RTVEC_ELT (v, i) = x; + builder.quick_push (x); } - return gen_rtx_CONST_VECTOR (mode, v); + return builder.build (); } /* The order of these tests is critical so that, for example, we don't @@ -2549,10 +2541,10 @@ simplify_binary_operation_1 (enum rtx_code code, machine_mode mode, return plus_constant (mode, op0, trunc_int_for_mode (-offset, mode)); /* Don't let a relocatable value get a negative coeff. */ - if (CONST_INT_P (op1) && GET_MODE (op0) != VOIDmode) + if (poly_int_rtx_p (op1) && GET_MODE (op0) != VOIDmode) return simplify_gen_binary (PLUS, mode, op0, - neg_const_int (mode, op1)); + neg_poly_int_rtx (mode, op1)); /* (x - (x & y)) -> (x & ~y) */ if (INTEGRAL_MODE_P (mode) && GET_CODE (op1) == AND) @@ -4071,6 +4063,27 @@ simplify_binary_operation_1 (enum rtx_code code, machine_mode mode, return 0; } +/* Return true if binary operation OP distributes over addition in operand + OPNO, with the other operand being held constant. OPNO counts from 1. */ + +static bool +distributes_over_addition_p (rtx_code op, int opno) +{ + switch (op) + { + case PLUS: + case MINUS: + case MULT: + return true; + + case ASHIFT: + return opno == 1; + + default: + return false; + } +} + rtx simplify_const_binary_operation (enum rtx_code code, machine_mode mode, rtx op0, rtx op1) @@ -4080,26 +4093,45 @@ simplify_const_binary_operation (enum rtx_code code, machine_mode mode, && GET_CODE (op0) == CONST_VECTOR && GET_CODE (op1) == CONST_VECTOR) { - unsigned int n_elts; - if (!CONST_VECTOR_NUNITS (op0).is_constant (&n_elts)) - return NULL_RTX; - - gcc_assert (known_eq (n_elts, CONST_VECTOR_NUNITS (op1))); - gcc_assert (known_eq (n_elts, GET_MODE_NUNITS (mode))); - rtvec v = rtvec_alloc (n_elts); - unsigned int i; + bool step_ok_p; + if (CONST_VECTOR_STEPPED_P (op0) + && CONST_VECTOR_STEPPED_P (op1)) + /* We can operate directly on the encoding if: + + a3 - a2 == a2 - a1 && b3 - b2 == b2 - b1 + implies + (a3 op b3) - (a2 op b2) == (a2 op b2) - (a1 op b1) + + Addition and subtraction are the supported operators + for which this is true. 
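To see why distributes_over_addition_p accepts MULT but only operand 1 of ASHIFT, take the stepped series 1, 3, 5 (step 2). Multiplying every element by c = 4 gives 4, 12, 20, still an affine series (step 8), and shifting every element left by c = 2 also gives 4, 12, 20, so both cases can be computed on the encoded elements alone. Shifting c = 2 left by each element instead gives 4, 16, 64, whose successive differences 12 and 48 are unequal, so ASHIFT with the stepped vector as operand 2, like any code not listed in the switch, has to fall back to an element-by-element encoding (or give up when the number of elements is not known at compile time).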
*/ + step_ok_p = (code == PLUS || code == MINUS); + else if (CONST_VECTOR_STEPPED_P (op0)) + /* We can operate directly on stepped encodings if: + + a3 - a2 == a2 - a1 + implies: + (a3 op c) - (a2 op c) == (a2 op c) - (a1 op c) + + which is true if (x -> x op c) distributes over addition. */ + step_ok_p = distributes_over_addition_p (code, 1); + else + /* Similarly in reverse. */ + step_ok_p = distributes_over_addition_p (code, 2); + rtx_vector_builder builder; + if (!builder.new_binary_operation (mode, op0, op1, step_ok_p)) + return 0; - for (i = 0; i < n_elts; i++) + unsigned int count = builder.encoded_nelts (); + for (unsigned int i = 0; i < count; i++) { rtx x = simplify_binary_operation (code, GET_MODE_INNER (mode), CONST_VECTOR_ELT (op0, i), CONST_VECTOR_ELT (op1, i)); if (!x || !valid_for_const_vector_p (mode, x)) return 0; - RTVEC_ELT (v, i) = x; + builder.quick_push (x); } - - return gen_rtx_CONST_VECTOR (mode, v); + return builder.build (); } if (VECTOR_MODE_P (mode) @@ -4593,11 +4625,12 @@ simplify_plus_minus (enum rtx_code code, machine_mode mode, rtx op0, } break; - case CONST_INT: + CASE_CONST_SCALAR_INT: + case CONST_POLY_INT: n_constants++; if (this_neg) { - ops[i].op = neg_const_int (mode, this_op); + ops[i].op = neg_poly_int_rtx (mode, this_op); ops[i].neg = 0; changed = 1; canonicalized = 1; @@ -4722,8 +4755,8 @@ simplify_plus_minus (enum rtx_code code, machine_mode mode, rtx op0, lneg &= rneg; if (GET_CODE (tem) == NEG) tem = XEXP (tem, 0), lneg = !lneg; - if (CONST_INT_P (tem) && lneg) - tem = neg_const_int (mode, tem), lneg = 0; + if (poly_int_rtx_p (tem) && lneg) + tem = neg_poly_int_rtx (mode, tem), lneg = 0; ops[i].op = tem; ops[i].neg = lneg; @@ -4782,12 +4815,12 @@ simplify_plus_minus (enum rtx_code code, machine_mode mode, rtx op0, in the array and that any other constant will be next-to-last. */ if (n_ops > 1 - && CONST_INT_P (ops[n_ops - 1].op) + && poly_int_rtx_p (ops[n_ops - 1].op) && CONSTANT_P (ops[n_ops - 2].op)) { rtx value = ops[n_ops - 1].op; if (ops[n_ops - 1].neg ^ ops[n_ops - 2].neg) - value = neg_const_int (mode, value); + value = neg_poly_int_rtx (mode, value); if (CONST_INT_P (value)) { ops[n_ops - 2].op = plus_constant (mode, ops[n_ops - 2].op, @@ -6104,342 +6137,466 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode, return 0; } -/* Evaluate a SUBREG of a CONST_INT or CONST_WIDE_INT or CONST_DOUBLE - or CONST_FIXED or CONST_VECTOR, returning another CONST_INT or - CONST_WIDE_INT or CONST_DOUBLE or CONST_FIXED or CONST_VECTOR. +/* Try to calculate NUM_BYTES bytes of the target memory image of X, + starting at byte FIRST_BYTE. Return true on success and add the + bytes to BYTES, such that each byte has BITS_PER_UNIT bits and such + that the bytes follow target memory order. Leave BYTES unmodified + on failure. - Works by unpacking INNER_BYTES bytes of OP into a collection of 8-bit values - represented as a little-endian array of 'unsigned char', selecting by BYTE, - and then repacking them again for OUTERMODE. If OP is a CONST_VECTOR, - FIRST_ELEM is the number of the first element to extract, otherwise - FIRST_ELEM is ignored. */ + MODE is the mode of X. The caller must reserve NUM_BYTES bytes in + BYTES before calling this function. 
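native_encode_rtx and its native_decode_rtx / native_decode_vector_rtx counterparts, declared in the rtl.h hunk earlier and implemented just below, turn a constant into its target memory image one byte at a time and back again, using subreg_size_lsb to find which bits of the value each byte holds. The following standalone sketch runs the same round trip for a plain 32-bit integer, with a single big-endian flag standing in for the BYTES_BIG_ENDIAN / WORDS_BIG_ENDIAN bookkeeping; it illustrates only the byte-to-lsb mapping and is not the GCC implementation:

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Bit position of the least significant bit of byte BYTE within an
  // NBYTES-byte integer, for a simple single-word layout.
  static unsigned
  lsb_of_byte (unsigned byte, unsigned nbytes, bool big_endian)
  {
    unsigned le_byte = big_endian ? nbytes - 1 - byte : byte;
    return le_byte * 8;
  }

  static std::vector<uint8_t>
  encode (uint32_t value, bool big_endian)
  {
    std::vector<uint8_t> bytes;
    for (unsigned byte = 0; byte < 4; ++byte)
      bytes.push_back (value >> lsb_of_byte (byte, 4, big_endian));
    return bytes;
  }

  static uint32_t
  decode (const std::vector<uint8_t> &bytes, bool big_endian)
  {
    uint32_t value = 0;
    for (unsigned byte = 0; byte < 4; ++byte)
      value |= (uint32_t) bytes[byte] << lsb_of_byte (byte, 4, big_endian);
    return value;
  }

  int main ()
  {
    uint32_t x = 0x12345678;
    for (bool be : {false, true})
      printf ("big_endian=%d: %08x\n", be, decode (encode (x, be), be));
    return 0;   // both lines print 12345678
  }

The real routines additionally cope with CONST_WIDE_INT, CONST_DOUBLE, CONST_FIXED and compressed CONST_VECTORs, and push target_unit values into a caller-reserved vec rather than returning a fresh container.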
*/ -static rtx -simplify_immed_subreg (fixed_size_mode outermode, rtx op, - machine_mode innermode, unsigned int byte, - unsigned int first_elem, unsigned int inner_bytes) +bool +native_encode_rtx (machine_mode mode, rtx x, vec &bytes, + unsigned int first_byte, unsigned int num_bytes) { - enum { - value_bit = 8, - value_mask = (1 << value_bit) - 1 - }; - unsigned char value[MAX_BITSIZE_MODE_ANY_MODE / value_bit]; - int value_start; - int i; - int elem; - - int num_elem; - rtx * elems; - int elem_bitsize; - rtx result_s = NULL; - rtvec result_v = NULL; - enum mode_class outer_class; - scalar_mode outer_submode; - int max_bitsize; + /* Check the mode is sensible. */ + gcc_assert (GET_MODE (x) == VOIDmode + ? is_a (mode) + : mode == GET_MODE (x)); - /* Some ports misuse CCmode. */ - if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (op)) - return op; + if (GET_CODE (x) == CONST_VECTOR) + { + /* CONST_VECTOR_ELT follows target memory order, so no shuffling + is necessary. The only complication is that MODE_VECTOR_BOOL + vectors can have several elements per byte. */ + unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), + GET_MODE_NUNITS (mode)); + unsigned int elt = first_byte * BITS_PER_UNIT / elt_bits; + if (elt_bits < BITS_PER_UNIT) + { + /* This is the only case in which elements can be smaller than + a byte. */ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); + for (unsigned int i = 0; i < num_bytes; ++i) + { + target_unit value = 0; + for (unsigned int j = 0; j < BITS_PER_UNIT; j += elt_bits) + { + value |= (INTVAL (CONST_VECTOR_ELT (x, elt)) & 1) << j; + elt += 1; + } + bytes.quick_push (value); + } + return true; + } - /* We have no way to represent a complex constant at the rtl level. */ - if (COMPLEX_MODE_P (outermode)) - return NULL_RTX; + unsigned int start = bytes.length (); + unsigned int elt_bytes = GET_MODE_UNIT_SIZE (mode); + /* Make FIRST_BYTE relative to ELT. */ + first_byte %= elt_bytes; + while (num_bytes > 0) + { + /* Work out how many bytes we want from element ELT. */ + unsigned int chunk_bytes = MIN (num_bytes, elt_bytes - first_byte); + if (!native_encode_rtx (GET_MODE_INNER (mode), + CONST_VECTOR_ELT (x, elt), bytes, + first_byte, chunk_bytes)) + { + bytes.truncate (start); + return false; + } + elt += 1; + first_byte = 0; + num_bytes -= chunk_bytes; + } + return true; + } - /* We support any size mode. */ - max_bitsize = MAX (GET_MODE_BITSIZE (outermode), - inner_bytes * BITS_PER_UNIT); + /* All subsequent cases are limited to scalars. */ + scalar_mode smode; + if (!is_a (mode, &smode)) + return false; - /* Unpack the value. */ + /* Make sure that the region is in range. */ + unsigned int end_byte = first_byte + num_bytes; + unsigned int mode_bytes = GET_MODE_SIZE (smode); + gcc_assert (end_byte <= mode_bytes); - if (GET_CODE (op) == CONST_VECTOR) + if (CONST_SCALAR_INT_P (x)) { - num_elem = CEIL (inner_bytes, GET_MODE_UNIT_SIZE (innermode)); - elem_bitsize = GET_MODE_UNIT_BITSIZE (innermode); + /* The target memory layout is affected by both BYTES_BIG_ENDIAN + and WORDS_BIG_ENDIAN. Use the subreg machinery to get the lsb + position of each byte. */ + rtx_mode_t value (x, smode); + wide_int_ref value_wi (value); + for (unsigned int byte = first_byte; byte < end_byte; ++byte) + { + /* Always constant because the inputs are. 
*/ + unsigned int lsb + = subreg_size_lsb (1, mode_bytes, byte).to_constant (); + /* Operate directly on the encoding rather than using + wi::extract_uhwi, so that we preserve the sign or zero + extension for modes that are not a whole number of bits in + size. (Zero extension is only used for the combination of + innermode == BImode && STORE_FLAG_VALUE == 1). */ + unsigned int elt = lsb / HOST_BITS_PER_WIDE_INT; + unsigned int shift = lsb % HOST_BITS_PER_WIDE_INT; + unsigned HOST_WIDE_INT uhwi = value_wi.elt (elt); + bytes.quick_push (uhwi >> shift); + } + return true; } - else + + if (CONST_DOUBLE_P (x)) { - num_elem = 1; - elem_bitsize = max_bitsize; + /* real_to_target produces an array of integers in target memory order. + All integers before the last one have 32 bits; the last one may + have 32 bits or fewer, depending on whether the mode bitsize + is divisible by 32. Each of these integers is then laid out + in target memory as any other integer would be. */ + long el32[MAX_BITSIZE_MODE_ANY_MODE / 32]; + real_to_target (el32, CONST_DOUBLE_REAL_VALUE (x), smode); + + /* The (maximum) number of target bytes per element of el32. */ + unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT; + gcc_assert (bytes_per_el32 != 0); + + /* Build up the integers in a similar way to the CONST_SCALAR_INT_P + handling above. */ + for (unsigned int byte = first_byte; byte < end_byte; ++byte) + { + unsigned int index = byte / bytes_per_el32; + unsigned int subbyte = byte % bytes_per_el32; + unsigned int int_bytes = MIN (bytes_per_el32, + mode_bytes - index * bytes_per_el32); + /* Always constant because the inputs are. */ + unsigned int lsb + = subreg_size_lsb (1, int_bytes, subbyte).to_constant (); + bytes.quick_push ((unsigned long) el32[index] >> lsb); + } + return true; } - /* If this asserts, it is too complicated; reducing value_bit may help. */ - gcc_assert (BITS_PER_UNIT % value_bit == 0); - /* I don't know how to handle endianness of sub-units. */ - gcc_assert (elem_bitsize % BITS_PER_UNIT == 0); - for (elem = 0; elem < num_elem; elem++) + if (GET_CODE (x) == CONST_FIXED) { - unsigned char * vp; - rtx el = (GET_CODE (op) == CONST_VECTOR - ? CONST_VECTOR_ELT (op, first_elem + elem) - : op); + for (unsigned int byte = first_byte; byte < end_byte; ++byte) + { + /* Always constant because the inputs are. */ + unsigned int lsb + = subreg_size_lsb (1, mode_bytes, byte).to_constant (); + unsigned HOST_WIDE_INT piece = CONST_FIXED_VALUE_LOW (x); + if (lsb >= HOST_BITS_PER_WIDE_INT) + { + lsb -= HOST_BITS_PER_WIDE_INT; + piece = CONST_FIXED_VALUE_HIGH (x); + } + bytes.quick_push (piece >> lsb); + } + return true; + } - /* Vectors are kept in target memory order. (This is probably - a mistake.) */ - { - unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT; - unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize) - / BITS_PER_UNIT); - unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; - unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte; - unsigned bytele = (subword_byte % UNITS_PER_WORD - + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); - vp = value + (bytele * BITS_PER_UNIT) / value_bit; - } + return false; +} - switch (GET_CODE (el)) - { - case CONST_INT: - for (i = 0; - i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize; - i += value_bit) - *vp++ = INTVAL (el) >> i; - /* CONST_INTs are always logically sign-extended. */ - for (; i < elem_bitsize; i += value_bit) - *vp++ = INTVAL (el) < 0 ? 
-1 : 0; - break; +/* Read a vector of mode MODE from the target memory image given by BYTES, + starting at byte FIRST_BYTE. The vector is known to be encodable using + NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each, + and BYTES is known to have enough bytes to supply NPATTERNS * + NELTS_PER_PATTERN vector elements. Each element of BYTES contains + BITS_PER_UNIT bits and the bytes are in target memory order. - case CONST_WIDE_INT: - { - rtx_mode_t val = rtx_mode_t (el, GET_MODE_INNER (innermode)); - unsigned char extend = wi::sign_mask (val); - int prec = wi::get_precision (val); - - for (i = 0; i < prec && i < elem_bitsize; i += value_bit) - *vp++ = wi::extract_uhwi (val, i, value_bit); - for (; i < elem_bitsize; i += value_bit) - *vp++ = extend; - } - break; + Return the vector on success, otherwise return NULL_RTX. */ - case CONST_DOUBLE: - if (TARGET_SUPPORTS_WIDE_INT == 0 && GET_MODE (el) == VOIDmode) - { - unsigned char extend = 0; - /* If this triggers, someone should have generated a - CONST_INT instead. */ - gcc_assert (elem_bitsize > HOST_BITS_PER_WIDE_INT); - - for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit) - *vp++ = CONST_DOUBLE_LOW (el) >> i; - while (i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize) - { - *vp++ - = CONST_DOUBLE_HIGH (el) >> (i - HOST_BITS_PER_WIDE_INT); - i += value_bit; - } +rtx +native_decode_vector_rtx (machine_mode mode, vec bytes, + unsigned int first_byte, unsigned int npatterns, + unsigned int nelts_per_pattern) +{ + rtx_vector_builder builder (mode, npatterns, nelts_per_pattern); - if (CONST_DOUBLE_HIGH (el) >> (HOST_BITS_PER_WIDE_INT - 1)) - extend = -1; - for (; i < elem_bitsize; i += value_bit) - *vp++ = extend; - } - else - { - /* This is big enough for anything on the platform. */ - long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32]; - scalar_float_mode el_mode; + unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), + GET_MODE_NUNITS (mode)); + if (elt_bits < BITS_PER_UNIT) + { + /* This is the only case in which elements can be smaller than a byte. + Element 0 is always in the lsb of the containing byte. */ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); + for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) + { + unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits; + unsigned int byte_index = bit_index / BITS_PER_UNIT; + unsigned int lsb = bit_index % BITS_PER_UNIT; + builder.quick_push (bytes[byte_index] & (1 << lsb) + ? CONST1_RTX (BImode) + : CONST0_RTX (BImode)); + } + } + else + { + for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) + { + rtx x = native_decode_rtx (GET_MODE_INNER (mode), bytes, first_byte); + if (!x) + return NULL_RTX; + builder.quick_push (x); + first_byte += elt_bits / BITS_PER_UNIT; + } + } + return builder.build (); +} - el_mode = as_a (GET_MODE (el)); - int bitsize = GET_MODE_BITSIZE (el_mode); +/* Read an rtx of mode MODE from the target memory image given by BYTES, + starting at byte FIRST_BYTE. Each element of BYTES contains BITS_PER_UNIT + bits and the bytes are in target memory order. The image has enough + values to specify all bytes of MODE. - gcc_assert (bitsize <= elem_bitsize); - gcc_assert (bitsize % value_bit == 0); + Return the rtx on success, otherwise return NULL_RTX. 
*/ - real_to_target (tmp, CONST_DOUBLE_REAL_VALUE (el), - GET_MODE (el)); +rtx +native_decode_rtx (machine_mode mode, vec bytes, + unsigned int first_byte) +{ + if (VECTOR_MODE_P (mode)) + { + /* If we know at compile time how many elements there are, + pull each element directly from BYTES. */ + unsigned int nelts; + if (GET_MODE_NUNITS (mode).is_constant (&nelts)) + return native_decode_vector_rtx (mode, bytes, first_byte, nelts, 1); + return NULL_RTX; + } - /* real_to_target produces its result in words affected by - FLOAT_WORDS_BIG_ENDIAN. However, we ignore this, - and use WORDS_BIG_ENDIAN instead; see the documentation - of SUBREG in rtl.texi. */ - for (i = 0; i < bitsize; i += value_bit) - { - int ibase; - if (WORDS_BIG_ENDIAN) - ibase = bitsize - 1 - i; - else - ibase = i; - *vp++ = tmp[ibase / 32] >> i % 32; - } + scalar_int_mode imode; + if (is_a (mode, &imode) + && GET_MODE_PRECISION (imode) <= MAX_BITSIZE_MODE_ANY_INT) + { + /* Pull the bytes msb first, so that we can use simple + shift-and-insert wide_int operations. */ + unsigned int size = GET_MODE_SIZE (imode); + wide_int result (wi::zero (GET_MODE_PRECISION (imode))); + for (unsigned int i = 0; i < size; ++i) + { + unsigned int lsb = (size - i - 1) * BITS_PER_UNIT; + /* Always constant because the inputs are. */ + unsigned int subbyte + = subreg_size_offset_from_lsb (1, size, lsb).to_constant (); + result <<= BITS_PER_UNIT; + result |= bytes[first_byte + subbyte]; + } + return immed_wide_int_const (result, imode); + } - /* It shouldn't matter what's done here, so fill it with - zero. */ - for (; i < elem_bitsize; i += value_bit) - *vp++ = 0; - } - break; + scalar_float_mode fmode; + if (is_a (mode, &fmode)) + { + /* We need to build an array of integers in target memory order. + All integers before the last one have 32 bits; the last one may + have 32 bits or fewer, depending on whether the mode bitsize + is divisible by 32. */ + long el32[MAX_BITSIZE_MODE_ANY_MODE / 32]; + unsigned int num_el32 = CEIL (GET_MODE_BITSIZE (fmode), 32); + memset (el32, 0, num_el32 * sizeof (long)); + + /* The (maximum) number of target bytes per element of el32. */ + unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT; + gcc_assert (bytes_per_el32 != 0); + + unsigned int mode_bytes = GET_MODE_SIZE (fmode); + for (unsigned int byte = 0; byte < mode_bytes; ++byte) + { + unsigned int index = byte / bytes_per_el32; + unsigned int subbyte = byte % bytes_per_el32; + unsigned int int_bytes = MIN (bytes_per_el32, + mode_bytes - index * bytes_per_el32); + /* Always constant because the inputs are. */ + unsigned int lsb + = subreg_size_lsb (1, int_bytes, subbyte).to_constant (); + el32[index] |= (unsigned long) bytes[first_byte + byte] << lsb; + } + REAL_VALUE_TYPE r; + real_from_target (&r, el32, fmode); + return const_double_from_real_value (r, fmode); + } - case CONST_FIXED: - if (elem_bitsize <= HOST_BITS_PER_WIDE_INT) - { - for (i = 0; i < elem_bitsize; i += value_bit) - *vp++ = CONST_FIXED_VALUE_LOW (el) >> i; - } + if (ALL_SCALAR_FIXED_POINT_MODE_P (mode)) + { + scalar_mode smode = as_a (mode); + FIXED_VALUE_TYPE f; + f.data.low = 0; + f.data.high = 0; + f.mode = smode; + + unsigned int mode_bytes = GET_MODE_SIZE (smode); + for (unsigned int byte = 0; byte < mode_bytes; ++byte) + { + /* Always constant because the inputs are. 
*/ + unsigned int lsb + = subreg_size_lsb (1, mode_bytes, byte).to_constant (); + unsigned HOST_WIDE_INT unit = bytes[first_byte + byte]; + if (lsb >= HOST_BITS_PER_WIDE_INT) + f.data.high |= unit << (lsb - HOST_BITS_PER_WIDE_INT); else - { - for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit) - *vp++ = CONST_FIXED_VALUE_LOW (el) >> i; - for (; i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize; - i += value_bit) - *vp++ = CONST_FIXED_VALUE_HIGH (el) - >> (i - HOST_BITS_PER_WIDE_INT); - for (; i < elem_bitsize; i += value_bit) - *vp++ = 0; - } - break; - - default: - gcc_unreachable (); + f.data.low |= unit << lsb; } + return CONST_FIXED_FROM_FIXED_VALUE (f, mode); } - /* Now, pick the right byte to start with. */ - /* Renumber BYTE so that the least-significant byte is byte 0. A special - case is paradoxical SUBREGs, which shouldn't be adjusted since they - will already have offset 0. */ - if (inner_bytes >= GET_MODE_SIZE (outermode)) + return NULL_RTX; +} + +/* Simplify a byte offset BYTE into CONST_VECTOR X. The main purpose + is to convert a runtime BYTE value into a constant one. */ + +static poly_uint64 +simplify_const_vector_byte_offset (rtx x, poly_uint64 byte) +{ + /* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes. */ + machine_mode mode = GET_MODE (x); + unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), + GET_MODE_NUNITS (mode)); + /* The number of bits needed to encode one element from each pattern. */ + unsigned int sequence_bits = CONST_VECTOR_NPATTERNS (x) * elt_bits; + + /* Identify the start point in terms of a sequence number and a byte offset + within that sequence. */ + poly_uint64 first_sequence; + unsigned HOST_WIDE_INT subbit; + if (can_div_trunc_p (byte * BITS_PER_UNIT, sequence_bits, + &first_sequence, &subbit)) { - unsigned ibyte = inner_bytes - GET_MODE_SIZE (outermode) - byte; - unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; - unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte; - byte = (subword_byte % UNITS_PER_WORD - + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); + unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); + if (nelts_per_pattern == 1) + /* This is a duplicated vector, so the value of FIRST_SEQUENCE + doesn't matter. */ + byte = subbit / BITS_PER_UNIT; + else if (nelts_per_pattern == 2 && known_gt (first_sequence, 0U)) + { + /* The subreg drops the first element from each pattern and + only uses the second element. Find the first sequence + that starts on a byte boundary. */ + subbit += least_common_multiple (sequence_bits, BITS_PER_UNIT); + byte = subbit / BITS_PER_UNIT; + } } + return byte; +} + +/* Subroutine of simplify_subreg in which: - /* BYTE should still be inside OP. (Note that BYTE is unsigned, - so if it's become negative it will instead be very large.) */ - gcc_assert (byte < inner_bytes); + - X is known to be a CONST_VECTOR + - OUTERMODE is known to be a vector mode - /* Convert from bytes to chunks of size value_bit. */ - value_start = byte * (BITS_PER_UNIT / value_bit); + Try to handle the subreg by operating on the CONST_VECTOR encoding + rather than on each individual element of the CONST_VECTOR. - /* Re-pack the value. */ - num_elem = GET_MODE_NUNITS (outermode); + Return the simplified subreg on success, otherwise return NULL_RTX. */ - if (VECTOR_MODE_P (outermode)) +static rtx +simplify_const_vector_subreg (machine_mode outermode, rtx x, + machine_mode innermode, unsigned int first_byte) +{ + /* Paradoxical subregs of vectors have dubious semantics. 
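/* Editorial aside, not part of the patch: simplify_const_vector_byte_offset
   above splits a bit offset into "which repetition of the encoded pattern
   sequence" and "which bit within that repetition".  A self-contained
   sketch of that arithmetic (the real code uses can_div_trunc_p because
   the values involved are poly_ints):  */

static void
split_bit_offset (unsigned int bit, unsigned int sequence_bits,
                  unsigned int *sequence, unsigned int *subbit)
{
  *sequence = bit / sequence_bits;
  *subbit = bit % sequence_bits;
}

/* With 4 patterns of 2-bit elements, sequence_bits is 8, so byte offset 3
   (bit 24) maps to sequence 3, bit 0 within that sequence.  */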
*/ + if (paradoxical_subreg_p (outermode, innermode)) + return NULL_RTX; + + /* We can only preserve the semantics of a stepped pattern if the new + vector element is the same as the original one. */ + if (CONST_VECTOR_STEPPED_P (x) + && GET_MODE_INNER (outermode) != GET_MODE_INNER (innermode)) + return NULL_RTX; + + /* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes. */ + unsigned int x_elt_bits + = vector_element_size (GET_MODE_BITSIZE (innermode), + GET_MODE_NUNITS (innermode)); + unsigned int out_elt_bits + = vector_element_size (GET_MODE_BITSIZE (outermode), + GET_MODE_NUNITS (outermode)); + + /* The number of bits needed to encode one element from every pattern + of the original vector. */ + unsigned int x_sequence_bits = CONST_VECTOR_NPATTERNS (x) * x_elt_bits; + + /* The number of bits needed to encode one element from every pattern + of the result. */ + unsigned int out_sequence_bits + = least_common_multiple (x_sequence_bits, out_elt_bits); + + /* Work out the number of interleaved patterns in the output vector + and the number of encoded elements per pattern. */ + unsigned int out_npatterns = out_sequence_bits / out_elt_bits; + unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); + + /* The encoding scheme requires the number of elements to be a multiple + of the number of patterns, so that each pattern appears at least once + and so that the same number of elements appear from each pattern. */ + bool ok_p = multiple_p (GET_MODE_NUNITS (outermode), out_npatterns); + unsigned int const_nunits; + if (GET_MODE_NUNITS (outermode).is_constant (&const_nunits) + && (!ok_p || out_npatterns * nelts_per_pattern > const_nunits)) { - result_v = rtvec_alloc (num_elem); - elems = &RTVEC_ELT (result_v, 0); + /* Either the encoding is invalid, or applying it would give us + more elements than we need. Just encode each element directly. */ + out_npatterns = const_nunits; + nelts_per_pattern = 1; } - else - elems = &result_s; + else if (!ok_p) + return NULL_RTX; - outer_submode = GET_MODE_INNER (outermode); - outer_class = GET_MODE_CLASS (outer_submode); - elem_bitsize = GET_MODE_BITSIZE (outer_submode); + /* Get enough bytes of X to form the new encoding. */ + unsigned int buffer_bits = out_npatterns * nelts_per_pattern * out_elt_bits; + unsigned int buffer_bytes = CEIL (buffer_bits, BITS_PER_UNIT); + auto_vec buffer (buffer_bytes); + if (!native_encode_rtx (innermode, x, buffer, first_byte, buffer_bytes)) + return NULL_RTX; - gcc_assert (elem_bitsize % value_bit == 0); - gcc_assert (elem_bitsize + value_start * value_bit <= max_bitsize); + /* Reencode the bytes as OUTERMODE. */ + return native_decode_vector_rtx (outermode, buffer, 0, out_npatterns, + nelts_per_pattern); +} - for (elem = 0; elem < num_elem; elem++) - { - unsigned char *vp; +/* Try to simplify a subreg of a constant by encoding the subreg region + as a sequence of target bytes and reading them back in the new mode. + Return the new value on success, otherwise return null. - /* Vectors are stored in target memory order. (This is probably - a mistake.) */ - { - unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT; - unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize) - / BITS_PER_UNIT); - unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; - unsigned subword_byte = BYTES_BIG_ENDIAN ? 
ibyte : byte; - unsigned bytele = (subword_byte % UNITS_PER_WORD - + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); - vp = value + value_start + (bytele * BITS_PER_UNIT) / value_bit; - } + The subreg has outer mode OUTERMODE, inner mode INNERMODE, inner value X + and byte offset FIRST_BYTE. */ - switch (outer_class) - { - case MODE_INT: - case MODE_PARTIAL_INT: - { - int u; - int base = 0; - int units - = (GET_MODE_BITSIZE (outer_submode) + HOST_BITS_PER_WIDE_INT - 1) - / HOST_BITS_PER_WIDE_INT; - HOST_WIDE_INT tmp[MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_WIDE_INT]; - wide_int r; - - if (GET_MODE_PRECISION (outer_submode) > MAX_BITSIZE_MODE_ANY_INT) - return NULL_RTX; - for (u = 0; u < units; u++) - { - unsigned HOST_WIDE_INT buf = 0; - for (i = 0; - i < HOST_BITS_PER_WIDE_INT && base + i < elem_bitsize; - i += value_bit) - buf |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i; - - tmp[u] = buf; - base += HOST_BITS_PER_WIDE_INT; - } - r = wide_int::from_array (tmp, units, - GET_MODE_PRECISION (outer_submode)); -#if TARGET_SUPPORTS_WIDE_INT == 0 - /* Make sure r will fit into CONST_INT or CONST_DOUBLE. */ - if (wi::min_precision (r, SIGNED) > HOST_BITS_PER_DOUBLE_INT) - return NULL_RTX; -#endif - elems[elem] = immed_wide_int_const (r, outer_submode); - } - break; +static rtx +simplify_immed_subreg (fixed_size_mode outermode, rtx x, + machine_mode innermode, unsigned int first_byte) +{ + unsigned int buffer_bytes = GET_MODE_SIZE (outermode); + auto_vec buffer (buffer_bytes); - case MODE_FLOAT: - case MODE_DECIMAL_FLOAT: - { - REAL_VALUE_TYPE r; - long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32] = { 0 }; - - /* real_from_target wants its input in words affected by - FLOAT_WORDS_BIG_ENDIAN. However, we ignore this, - and use WORDS_BIG_ENDIAN instead; see the documentation - of SUBREG in rtl.texi. */ - for (i = 0; i < elem_bitsize; i += value_bit) - { - int ibase; - if (WORDS_BIG_ENDIAN) - ibase = elem_bitsize - 1 - i; - else - ibase = i; - tmp[ibase / 32] |= (*vp++ & value_mask) << i % 32; - } + /* Some ports misuse CCmode. */ + if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (x)) + return x; - real_from_target (&r, tmp, outer_submode); - elems[elem] = const_double_from_real_value (r, outer_submode); - } - break; + /* Paradoxical subregs read undefined values for bytes outside of the + inner value. However, we have traditionally always sign-extended + integer constants and zero-extended others. */ + unsigned int inner_bytes = buffer_bytes; + if (paradoxical_subreg_p (outermode, innermode)) + { + if (!GET_MODE_SIZE (innermode).is_constant (&inner_bytes)) + return NULL_RTX; - case MODE_FRACT: - case MODE_UFRACT: - case MODE_ACCUM: - case MODE_UACCUM: - { - FIXED_VALUE_TYPE f; - f.data.low = 0; - f.data.high = 0; - f.mode = outer_submode; - - for (i = 0; - i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize; - i += value_bit) - f.data.low |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i; - for (; i < elem_bitsize; i += value_bit) - f.data.high |= ((unsigned HOST_WIDE_INT)(*vp++ & value_mask) - << (i - HOST_BITS_PER_WIDE_INT)); - - elems[elem] = CONST_FIXED_FROM_FIXED_VALUE (f, outer_submode); - } - break; + target_unit filler = 0; + if (CONST_SCALAR_INT_P (x) && wi::neg_p (rtx_mode_t (x, innermode))) + filler = -1; - default: - gcc_unreachable (); - } + /* Add any leading bytes due to big-endian layout. The number of + bytes must be constant because both modes have constant size. 
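/* Editorial aside, not part of the patch: the paradoxical-subreg handling
   above keeps the traditional behaviour of sign-extending integer
   constants and zero-extending everything else when filling the extra
   bytes.  A minimal sketch of the filler choice:  */

static unsigned char
example_paradoxical_filler (bool is_scalar_int_constant, bool is_negative)
{
  if (is_scalar_int_constant && is_negative)
    return 0xff;  /* Sign extension: pad with all-ones bytes.  */
  return 0x00;    /* Zero extension: pad with zero bytes.  */
}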
*/ + unsigned int leading_bytes + = -byte_lowpart_offset (outermode, innermode).to_constant (); + for (unsigned int i = 0; i < leading_bytes; ++i) + buffer.quick_push (filler); + + if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes)) + return NULL_RTX; + + /* Add any trailing bytes due to little-endian layout. */ + while (buffer.length () < buffer_bytes) + buffer.quick_push (filler); } - if (VECTOR_MODE_P (outermode)) - return gen_rtx_CONST_VECTOR (outermode, result_v); else - return result_s; + { + if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes)) + return NULL_RTX; + } + return native_decode_rtx (outermode, buffer, 0); } /* Simplify SUBREG:OUTERMODE(OP:INNERMODE, BYTE) @@ -6468,6 +6625,9 @@ simplify_subreg (machine_mode outermode, rtx op, if (outermode == innermode && known_eq (byte, 0U)) return op; + if (GET_CODE (op) == CONST_VECTOR) + byte = simplify_const_vector_byte_offset (op, byte); + if (multiple_p (byte, GET_MODE_UNIT_SIZE (innermode))) { rtx elt; @@ -6487,30 +6647,21 @@ simplify_subreg (machine_mode outermode, rtx op, || CONST_FIXED_P (op) || GET_CODE (op) == CONST_VECTOR) { - /* simplify_immed_subreg deconstructs OP into bytes and constructs - the result from bytes, so it only works if the sizes of the modes - and the value of the offset are known at compile time. Cases that - that apply to general modes and offsets should be handled here - before calling simplify_immed_subreg. */ - fixed_size_mode fs_outermode, fs_innermode; unsigned HOST_WIDE_INT cbyte; - if (is_a (outermode, &fs_outermode) - && is_a (innermode, &fs_innermode) - && byte.is_constant (&cbyte)) - return simplify_immed_subreg (fs_outermode, op, fs_innermode, cbyte, - 0, GET_MODE_SIZE (fs_innermode)); - - /* Handle constant-sized outer modes and variable-sized inner modes. */ - unsigned HOST_WIDE_INT first_elem; - if (GET_CODE (op) == CONST_VECTOR - && is_a (outermode, &fs_outermode) - && constant_multiple_p (byte, GET_MODE_UNIT_SIZE (innermode), - &first_elem)) - return simplify_immed_subreg (fs_outermode, op, innermode, 0, - first_elem, - GET_MODE_SIZE (fs_outermode)); + if (byte.is_constant (&cbyte)) + { + if (GET_CODE (op) == CONST_VECTOR && VECTOR_MODE_P (outermode)) + { + rtx tmp = simplify_const_vector_subreg (outermode, op, + innermode, cbyte); + if (tmp) + return tmp; + } - return NULL_RTX; + fixed_size_mode fs_outermode; + if (is_a (outermode, &fs_outermode)) + return simplify_immed_subreg (fs_outermode, op, innermode, cbyte); + } } /* Changing mode twice with SUBREG => just change it once, @@ -6952,6 +7103,18 @@ test_vector_ops_duplicate (machine_mode mode, rtx scalar_reg) && mode_for_vector (inner_mode, 2).exists (&narrower_mode) && VECTOR_MODE_P (narrower_mode)) { + /* Test VEC_DUPLICATE of a vector. */ + rtx_vector_builder nbuilder (narrower_mode, 2, 1); + nbuilder.quick_push (const0_rtx); + nbuilder.quick_push (const1_rtx); + rtx_vector_builder builder (mode, 2, 1); + builder.quick_push (const0_rtx); + builder.quick_push (const1_rtx); + ASSERT_RTX_EQ (builder.build (), + simplify_unary_operation (VEC_DUPLICATE, mode, + nbuilder.build (), + narrower_mode)); + /* Test VEC_SELECT of a vector. */ rtx vec_par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, const1_rtx, const0_rtx)); @@ -7024,6 +7187,58 @@ test_vector_ops_series (machine_mode mode, rtx scalar_reg) ASSERT_RTX_EQ (series_0_m1, simplify_binary_operation (VEC_SERIES, mode, const0_rtx, constm1_rtx)); + + /* Test NEG on constant vector series. 
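/* Editorial aside, not part of the patch: the new VEC_DUPLICATE test above
   checks that duplicating the two-element vector {0, 1} across a wider
   vector gives the repeating constant {0, 1, 0, 1, ...}.  The same
   expansion in plain C++ (duplicate_vector is an illustrative helper):  */

#include <vector>

static std::vector<int>
duplicate_vector (const std::vector<int> &part, unsigned int copies)
{
  std::vector<int> result;
  for (unsigned int i = 0; i < copies; ++i)
    result.insert (result.end (), part.begin (), part.end ());
  return result;
}

/* duplicate_vector ({0, 1}, 2) yields {0, 1, 0, 1}.  */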
*/ + ASSERT_RTX_EQ (series_0_m1, + simplify_unary_operation (NEG, mode, series_0_1, mode)); + ASSERT_RTX_EQ (series_0_1, + simplify_unary_operation (NEG, mode, series_0_m1, mode)); + + /* Test PLUS and MINUS on constant vector series. */ + rtx scalar2 = gen_int_mode (2, inner_mode); + rtx scalar3 = gen_int_mode (3, inner_mode); + rtx series_1_1 = gen_const_vec_series (mode, const1_rtx, const1_rtx); + rtx series_0_2 = gen_const_vec_series (mode, const0_rtx, scalar2); + rtx series_1_3 = gen_const_vec_series (mode, const1_rtx, scalar3); + ASSERT_RTX_EQ (series_1_1, + simplify_binary_operation (PLUS, mode, series_0_1, + CONST1_RTX (mode))); + ASSERT_RTX_EQ (series_0_m1, + simplify_binary_operation (PLUS, mode, CONST0_RTX (mode), + series_0_m1)); + ASSERT_RTX_EQ (series_1_3, + simplify_binary_operation (PLUS, mode, series_1_1, + series_0_2)); + ASSERT_RTX_EQ (series_0_1, + simplify_binary_operation (MINUS, mode, series_1_1, + CONST1_RTX (mode))); + ASSERT_RTX_EQ (series_1_1, + simplify_binary_operation (MINUS, mode, CONST1_RTX (mode), + series_0_m1)); + ASSERT_RTX_EQ (series_1_1, + simplify_binary_operation (MINUS, mode, series_1_3, + series_0_2)); + + /* Test MULT between constant vectors. */ + rtx vec2 = gen_const_vec_duplicate (mode, scalar2); + rtx vec3 = gen_const_vec_duplicate (mode, scalar3); + rtx scalar9 = gen_int_mode (9, inner_mode); + rtx series_3_9 = gen_const_vec_series (mode, scalar3, scalar9); + ASSERT_RTX_EQ (series_0_2, + simplify_binary_operation (MULT, mode, series_0_1, vec2)); + ASSERT_RTX_EQ (series_3_9, + simplify_binary_operation (MULT, mode, vec3, series_1_3)); + if (!GET_MODE_NUNITS (mode).is_constant ()) + ASSERT_FALSE (simplify_binary_operation (MULT, mode, series_0_1, + series_0_1)); + + /* Test ASHIFT between constant vectors. */ + ASSERT_RTX_EQ (series_0_2, + simplify_binary_operation (ASHIFT, mode, series_0_1, + CONST1_RTX (mode))); + if (!GET_MODE_NUNITS (mode).is_constant ()) + ASSERT_FALSE (simplify_binary_operation (ASHIFT, mode, CONST1_RTX (mode), + series_0_1)); } /* Verify simplify_merge_mask works correctly. */ @@ -7089,6 +7304,165 @@ test_vec_merge (machine_mode mode) simplify_rtx (nvm)); } +/* Test subregs of integer vector constant X, trying elements in + the range [ELT_BIAS, ELT_BIAS + constant_lower_bound (NELTS)), + where NELTS is the number of elements in X. Subregs involving + elements [ELT_BIAS, ELT_BIAS + FIRST_VALID) are expected to fail. */ + +static void +test_vector_subregs_modes (rtx x, poly_uint64 elt_bias = 0, + unsigned int first_valid = 0) +{ + machine_mode inner_mode = GET_MODE (x); + scalar_mode int_mode = GET_MODE_INNER (inner_mode); + + for (unsigned int modei = 0; modei < NUM_MACHINE_MODES; ++modei) + { + machine_mode outer_mode = (machine_mode) modei; + if (!VECTOR_MODE_P (outer_mode)) + continue; + + unsigned int outer_nunits; + if (GET_MODE_INNER (outer_mode) == int_mode + && GET_MODE_NUNITS (outer_mode).is_constant (&outer_nunits) + && multiple_p (GET_MODE_NUNITS (inner_mode), outer_nunits)) + { + /* Test subregs in which the outer mode is a smaller, + constant-sized vector of the same element type. 
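/* Editorial aside, not part of the patch: element I of
   VEC_SERIES (BASE, STEP) is BASE + I * STEP, so the assertions above
   check identities that hold element-wise, for example
   -series (0, 1) == series (0, -1) and
   series (1, 1) + series (0, 2) == series (1, 3).  A quick
   self-contained check:  */

#include <cassert>

static long
series_elt (long base, long step, unsigned int i)
{
  return base + (long) i * step;
}

int
main ()
{
  for (unsigned int i = 0; i < 8; ++i)
    {
      assert (-series_elt (0, 1, i) == series_elt (0, -1, i));
      assert (series_elt (1, 1, i) + series_elt (0, 2, i)
              == series_elt (1, 3, i));
    }
  return 0;
}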
*/ + unsigned int limit + = constant_lower_bound (GET_MODE_NUNITS (inner_mode)); + for (unsigned int elt = 0; elt < limit; elt += outer_nunits) + { + rtx expected = NULL_RTX; + if (elt >= first_valid) + { + rtx_vector_builder builder (outer_mode, outer_nunits, 1); + for (unsigned int i = 0; i < outer_nunits; ++i) + builder.quick_push (CONST_VECTOR_ELT (x, elt + i)); + expected = builder.build (); + } + poly_uint64 byte = (elt_bias + elt) * GET_MODE_SIZE (int_mode); + ASSERT_RTX_EQ (expected, + simplify_subreg (outer_mode, x, + inner_mode, byte)); + } + } + else if (known_eq (GET_MODE_SIZE (outer_mode), + GET_MODE_SIZE (inner_mode)) + && known_eq (elt_bias, 0U) + && (GET_MODE_CLASS (outer_mode) != MODE_VECTOR_BOOL + || known_eq (GET_MODE_BITSIZE (outer_mode), + GET_MODE_NUNITS (outer_mode))) + && (!FLOAT_MODE_P (outer_mode) + || (FLOAT_MODE_FORMAT (outer_mode)->ieee_bits + == GET_MODE_UNIT_PRECISION (outer_mode))) + && (GET_MODE_SIZE (inner_mode).is_constant () + || !CONST_VECTOR_STEPPED_P (x))) + { + /* Try converting to OUTER_MODE and back. */ + rtx outer_x = simplify_subreg (outer_mode, x, inner_mode, 0); + ASSERT_TRUE (outer_x != NULL_RTX); + ASSERT_RTX_EQ (x, simplify_subreg (inner_mode, outer_x, + outer_mode, 0)); + } + } + + if (BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN) + { + /* Test each byte in the element range. */ + unsigned int limit + = constant_lower_bound (GET_MODE_SIZE (inner_mode)); + for (unsigned int i = 0; i < limit; ++i) + { + unsigned int elt = i / GET_MODE_SIZE (int_mode); + rtx expected = NULL_RTX; + if (elt >= first_valid) + { + unsigned int byte_shift = i % GET_MODE_SIZE (int_mode); + if (BYTES_BIG_ENDIAN) + byte_shift = GET_MODE_SIZE (int_mode) - byte_shift - 1; + rtx_mode_t vec_elt (CONST_VECTOR_ELT (x, elt), int_mode); + wide_int shifted_elt + = wi::lrshift (vec_elt, byte_shift * BITS_PER_UNIT); + expected = immed_wide_int_const (shifted_elt, QImode); + } + poly_uint64 byte = elt_bias * GET_MODE_SIZE (int_mode) + i; + ASSERT_RTX_EQ (expected, + simplify_subreg (QImode, x, inner_mode, byte)); + } + } +} + +/* Test constant subregs of integer vector mode INNER_MODE, using 1 + element per pattern. */ + +static void +test_vector_subregs_repeating (machine_mode inner_mode) +{ + poly_uint64 nunits = GET_MODE_NUNITS (inner_mode); + unsigned int min_nunits = constant_lower_bound (nunits); + scalar_mode int_mode = GET_MODE_INNER (inner_mode); + unsigned int count = gcd (min_nunits, 8); + + rtx_vector_builder builder (inner_mode, count, 1); + for (unsigned int i = 0; i < count; ++i) + builder.quick_push (gen_int_mode (8 - i, int_mode)); + rtx x = builder.build (); + + test_vector_subregs_modes (x); + if (!nunits.is_constant ()) + test_vector_subregs_modes (x, nunits - min_nunits); +} + +/* Test constant subregs of integer vector mode INNER_MODE, using 2 + elements per pattern. 
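/* Editorial aside, not part of the patch: the byte-level checks above
   expect a QImode subreg at byte I of the vector to return byte
   (I % element_size) of element (I / element_size), counted from the
   least-significant end on little-endian targets (big-endian targets
   mirror the byte index).  In plain C++ terms:  */

static unsigned char
example_element_byte (unsigned long long elt, unsigned int byte_shift)
{
  return (unsigned char) (elt >> (byte_shift * 8));
}

/* example_element_byte (0x1234, 0) == 0x34 and
   example_element_byte (0x1234, 1) == 0x12.  */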
*/ + +static void +test_vector_subregs_fore_back (machine_mode inner_mode) +{ + poly_uint64 nunits = GET_MODE_NUNITS (inner_mode); + unsigned int min_nunits = constant_lower_bound (nunits); + scalar_mode int_mode = GET_MODE_INNER (inner_mode); + unsigned int count = gcd (min_nunits, 4); + + rtx_vector_builder builder (inner_mode, count, 2); + for (unsigned int i = 0; i < count; ++i) + builder.quick_push (gen_int_mode (i, int_mode)); + for (unsigned int i = 0; i < count; ++i) + builder.quick_push (gen_int_mode (-(int) i, int_mode)); + rtx x = builder.build (); + + test_vector_subregs_modes (x); + if (!nunits.is_constant ()) + test_vector_subregs_modes (x, nunits - min_nunits, count); +} + +/* Test constant subregs of integer vector mode INNER_MODE, using 3 + elements per pattern. */ + +static void +test_vector_subregs_stepped (machine_mode inner_mode) +{ + /* Build { 0, 1, 2, 3, ... }. */ + scalar_mode int_mode = GET_MODE_INNER (inner_mode); + rtx_vector_builder builder (inner_mode, 1, 3); + for (unsigned int i = 0; i < 3; ++i) + builder.quick_push (gen_int_mode (i, int_mode)); + rtx x = builder.build (); + + test_vector_subregs_modes (x); +} + +/* Test constant subregs of integer vector mode INNER_MODE. */ + +static void +test_vector_subregs (machine_mode inner_mode) +{ + test_vector_subregs_repeating (inner_mode); + test_vector_subregs_fore_back (inner_mode); + test_vector_subregs_stepped (inner_mode); +} + /* Verify some simplifications involving vectors. */ static void @@ -7103,7 +7477,10 @@ test_vector_ops () test_vector_ops_duplicate (mode, scalar_reg); if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT && maybe_gt (GET_MODE_NUNITS (mode), 2)) - test_vector_ops_series (mode, scalar_reg); + { + test_vector_ops_series (mode, scalar_reg); + test_vector_subregs (mode); + } test_vec_merge (mode); } } diff --git a/gcc/stack-ptr-mod.c b/gcc/stack-ptr-mod.c index a10d59b61..5cb95e712 100644 --- a/gcc/stack-ptr-mod.c +++ b/gcc/stack-ptr-mod.c @@ -91,9 +91,7 @@ pass_stack_ptr_mod::execute (function *fun) if (INSN_P (insn)) { /* Check if insn modifies the stack pointer. */ - note_stores (PATTERN (insn), - notice_stack_pointer_modification_1, - NULL); + note_stores (insn, notice_stack_pointer_modification_1, NULL); if (! crtl->sp_is_unchanging) return 0; } diff --git a/gcc/stor-layout.c b/gcc/stor-layout.c index a054b7887..7d1917f82 100644 --- a/gcc/stor-layout.c +++ b/gcc/stor-layout.c @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3. If not see #include "gimplify.h" #include "attribs.h" #include "debug.h" +#include "calls.h" /* Data type for the expressions representing sizes of data types. It is the first integer type laid out. */ @@ -1835,7 +1836,8 @@ compute_record_mode (tree type) line. */ SET_TYPE_MODE (type, BLKmode); - if (! tree_fits_uhwi_p (TYPE_SIZE (type))) + poly_uint64 type_size; + if (!poly_int_tree_p (TYPE_SIZE (type), &type_size)) return; /* A record which has any BLKmode members must itself be @@ -1846,20 +1848,21 @@ compute_record_mode (tree type) if (TREE_CODE (field) != FIELD_DECL) continue; + poly_uint64 field_size; if (TREE_CODE (TREE_TYPE (field)) == ERROR_MARK || (TYPE_MODE (TREE_TYPE (field)) == BLKmode && ! TYPE_NO_FORCE_BLK (TREE_TYPE (field)) && !(TYPE_SIZE (TREE_TYPE (field)) != 0 && integer_zerop (TYPE_SIZE (TREE_TYPE (field))))) - || ! tree_fits_uhwi_p (bit_position (field)) + || !tree_fits_poly_uint64_p (bit_position (field)) || DECL_SIZE (field) == 0 - || ! 
tree_fits_uhwi_p (DECL_SIZE (field))) + || !poly_int_tree_p (DECL_SIZE (field), &field_size)) return; /* If this field is the whole struct, remember its mode so that, say, we can put a double in a class into a DF register instead of forcing it to live in the stack. */ - if (simple_cst_equal (TYPE_SIZE (type), DECL_SIZE (field)) + if (known_eq (field_size, type_size) /* Partial int types (e.g. __int20) may have TYPE_SIZE equal to wider types (e.g. int32), despite precision being less. Ensure that the TYPE_MODE of the struct does not get set to the partial @@ -1879,15 +1882,14 @@ compute_record_mode (tree type) For UNION_TYPE, if the widest field is MODE_INT then use that mode. If the widest field is MODE_PARTIAL_INT, and the union will be passed by reference, then use that mode. */ - poly_uint64 type_size; if ((TREE_CODE (type) == RECORD_TYPE || (TREE_CODE (type) == UNION_TYPE && (GET_MODE_CLASS (mode) == MODE_INT || (GET_MODE_CLASS (mode) == MODE_PARTIAL_INT - && targetm.calls.pass_by_reference (pack_cumulative_args (0), - mode, type, 0))))) + && (targetm.calls.pass_by_reference + (pack_cumulative_args (0), + function_arg_info (type, mode, /*named=*/false))))))) && mode != VOIDmode - && poly_int_tree_p (TYPE_SIZE (type), &type_size) && known_eq (GET_MODE_BITSIZE (mode), type_size)) ; else diff --git a/gcc/target-globals.c b/gcc/target-globals.c index 94a465c91..00bbda69c 100644 --- a/gcc/target-globals.c +++ b/gcc/target-globals.c @@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see #include "gcse.h" #include "bb-reorder.h" #include "lower-subreg.h" +#include "function-abi.h" #if SWITCHABLE_TARGET struct target_globals default_target_globals = { @@ -48,6 +49,7 @@ struct target_globals default_target_globals = { &default_target_rtl, &default_target_recog, &default_target_hard_regs, + &default_target_function_abi_info, &default_target_reload, &default_target_expmed, &default_target_optabs, @@ -70,6 +72,7 @@ save_target_globals (void) g->rtl = ggc_cleared_alloc (); g->recog = XCNEW (struct target_recog); g->hard_regs = XCNEW (struct target_hard_regs); + g->function_abi_info = XCNEW (struct target_function_abi_info); g->reload = XCNEW (struct target_reload); g->expmed = XCNEW (struct target_expmed); g->optabs = XCNEW (struct target_optabs); @@ -127,6 +130,7 @@ target_globals::~target_globals () XDELETE (regs); XDELETE (recog); XDELETE (hard_regs); + XDELETE (function_abi_info); XDELETE (reload); XDELETE (expmed); XDELETE (optabs); diff --git a/gcc/target-globals.h b/gcc/target-globals.h index 5af846c9f..f21580be6 100644 --- a/gcc/target-globals.h +++ b/gcc/target-globals.h @@ -26,6 +26,7 @@ extern struct target_regs *this_target_regs; extern struct target_rtl *this_target_rtl; extern struct target_recog *this_target_recog; extern struct target_hard_regs *this_target_hard_regs; +extern struct target_function_abi_info *this_target_function_abi_info; extern struct target_reload *this_target_reload; extern struct target_expmed *this_target_expmed; extern struct target_optabs *this_target_optabs; @@ -47,6 +48,7 @@ struct GTY(()) target_globals { struct target_rtl *rtl; struct target_recog *GTY((skip)) recog; struct target_hard_regs *GTY((skip)) hard_regs; + struct target_function_abi_info *GTY((skip)) function_abi_info; struct target_reload *GTY((skip)) reload; struct target_expmed *GTY((skip)) expmed; struct target_optabs *GTY((skip)) optabs; @@ -74,6 +76,7 @@ restore_target_globals (struct target_globals *g) this_target_rtl = g->rtl; this_target_recog = g->recog; 
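/* Editorial aside, not part of the patch: the compute_record_mode change
   above replaces tree_fits_uhwi_p/simple_cst_equal with poly_int queries
   so that sizes known only as a runtime multiple of a vector length still
   compare correctly.  A condensed sketch of the new comparison, assuming
   the usual GCC tree headers (the helper name is illustrative):  */

static bool
example_field_covers_type_p (tree type, tree field)
{
  poly_uint64 type_size, field_size;
  return (poly_int_tree_p (TYPE_SIZE (type), &type_size)
          && poly_int_tree_p (DECL_SIZE (field), &field_size)
          && known_eq (field_size, type_size));
}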
this_target_hard_regs = g->hard_regs; + this_target_function_abi_info = g->function_abi_info; this_target_reload = g->reload; this_target_expmed = g->expmed; this_target_optabs = g->optabs; diff --git a/gcc/target.def b/gcc/target.def index f998470ff..05389cdd1 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1782,22 +1782,6 @@ return type of the vectorized function shall be of vector type\n\ tree, (tree fndecl, tree vec_type_out, tree vec_type_in), default_builtin_md_vectorized_function) -/* Returns a function declaration for a builtin that realizes the - vector conversion, or NULL_TREE if not available. */ -DEFHOOK -(builtin_conversion, - "This hook should return the DECL of a function that implements conversion of the\n\ -input vector of type @var{src_type} to type @var{dest_type}.\n\ -The value of @var{code} is one of the enumerators in @code{enum tree_code} and\n\ -specifies how the conversion is to be applied\n\ -(truncation, rounding, etc.).\n\ -\n\ -If this hook is defined, the autovectorizer will use the\n\ -@code{TARGET_VECTORIZE_BUILTIN_CONVERSION} target hook when vectorizing\n\ -conversion. Otherwise, it will return @code{NULL_TREE}.", - tree, (unsigned code, tree dest_type, tree src_type), - default_builtin_vectorized_conversion) - /* Cost of different vector/scalar statements in vectorization cost model. In case of misaligned vector loads and stores the cost depends on the data type and misalignment value. */ @@ -2431,6 +2415,24 @@ another @code{CALL_EXPR}.\n\ @var{arglist} really has type @samp{VEC(tree,gc)*}", tree, (unsigned int /*location_t*/ loc, tree fndecl, void *arglist), NULL) +DEFHOOK +(check_builtin_call, + "Perform semantic checking on a call to a machine-specific built-in\n\ +function after its arguments have been constrained to the function\n\ +signature. Return true if the call is valid, otherwise report an error\n\ +and return false.\n\ +\n\ +This hook is called after @code{TARGET_RESOLVE_OVERLOADED_BUILTIN}.\n\ +The call was originally to built-in function @var{orig_fndecl},\n\ +but after the optional @code{TARGET_RESOLVE_OVERLOADED_BUILTIN}\n\ +step is now to built-in function @var{fndecl}. @var{loc} is the\n\ +location of the call and @var{args} is an array of function arguments,\n\ +of which there are @var{nargs}. @var{arg_loc} specifies the location\n\ +of each argument.", + bool, (location_t loc, vec arg_loc, tree fndecl, + tree orig_fndecl, unsigned int nargs, tree *args), + NULL) + /* Fold a target-specific builtin to a tree valid for both GIMPLE and GENERIC. */ DEFHOOK @@ -2624,38 +2626,6 @@ DEFHOOK bool, (const rtx_insn *follower, const rtx_insn *followee), hook_bool_const_rtx_insn_const_rtx_insn_true) -/* Return a register class for which branch target register - optimizations should be applied. */ -DEFHOOK -(branch_target_register_class, - "This target hook returns a register class for which branch target register\n\ -optimizations should be applied. All registers in this class should be\n\ -usable interchangeably. After reload, registers in this class will be\n\ -re-allocated and loads will be hoisted out of loops and be subjected\n\ -to inter-block scheduling.", - reg_class_t, (void), - default_branch_target_register_class) - -/* Return true if branch target register optimizations should include - callee-saved registers that are not already live during the current - function. AFTER_PE_GEN is true if prologues and epilogues have - already been generated. 
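/* Editorial aside, not part of the patch: a minimal sketch of how a target
   might implement the new TARGET_CHECK_BUILTIN_CALL hook documented above,
   assuming the usual backend headers.  A real implementation would
   dispatch on the built-in's function code; here the only (illustrative)
   rule is that the first argument must be an integer constant.  */

static bool
example_check_builtin_call (location_t loc, vec<location_t> /*arg_loc*/,
                            tree /*fndecl*/, tree /*orig_fndecl*/,
                            unsigned int nargs, tree *args)
{
  if (nargs >= 1 && TREE_CODE (args[0]) != INTEGER_CST)
    {
      error_at (loc, "argument 1 must be an integer constant");
      return false;
    }
  return true;
}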
*/ -DEFHOOK -(branch_target_register_callee_saved, - "Branch target register optimization will by default exclude callee-saved\n\ -registers\n\ -that are not already live during the current function; if this target hook\n\ -returns true, they will be included. The target code must than make sure\n\ -that all target registers in the class returned by\n\ -@samp{TARGET_BRANCH_TARGET_REGISTER_CLASS} that might need saving are\n\ -saved. @var{after_prologue_epilogue_gen} indicates if prologues and\n\ -epilogues have already been generated. Note, even if you only return\n\ -true when @var{after_prologue_epilogue_gen} is false, you still are likely\n\ -to have to make special provisions in @code{INITIAL_ELIMINATION_OFFSET}\n\ -to reserve space for caller-saved target registers.", - bool, (bool after_prologue_epilogue_gen), - hook_bool_bool_false) - /* Return true if the target supports conditional execution. */ DEFHOOK (have_conditional_execution, @@ -3407,6 +3377,29 @@ must have move patterns for this mode.", bool, (machine_mode mode), hook_bool_mode_false) +DEFHOOK +(compatible_vector_types_p, + "Return true if there is no target-specific reason for treating\n\ +vector types @var{type1} and @var{type2} as distinct types. The caller\n\ +has already checked for target-independent reasons, meaning that the\n\ +types are known to have the same mode, to have the same number of elements,\n\ +and to have what the caller considers to be compatible element types.\n\ +\n\ +The main reason for defining this hook is to reject pairs of types\n\ +that are handled differently by the target's calling convention.\n\ +For example, when a new @var{N}-bit vector architecture is added\n\ +to a target, the target may want to handle normal @var{N}-bit\n\ +@code{VECTOR_TYPE} arguments and return values in the same way as\n\ +before, to maintain backwards compatibility. However, it may also\n\ +provide new, architecture-specific @code{VECTOR_TYPE}s that are passed\n\ +and returned in a more efficient way. It is then important to maintain\n\ +a distinction between the ``normal'' @code{VECTOR_TYPE}s and the new\n\ +architecture-specific ones.\n\ +\n\ +The default implementation returns true, which is correct for most targets.", + bool, (const_tree type1, const_tree type2), + hook_bool_const_tree_const_tree_true) + DEFHOOK (vector_alignment, "This hook can be used to define the alignment for a vector of type\n\ @@ -3569,7 +3562,7 @@ two areas of memory, or to set, clear or store to memory, for example\n\ when copying a @code{struct}. The @code{by_pieces} infrastructure\n\ implements such memory operations as a sequence of load, store or move\n\ insns. 
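/* Editorial aside, not part of the patch: one way a target could implement
   the new TARGET_COMPATIBLE_VECTOR_TYPES_P hook documented above is to
   keep its architecture-specific vector types distinct from ordinary GNU
   vector types via a type attribute.  The "special_vector" attribute name
   below is hypothetical.  */

static bool
example_compatible_vector_types_p (const_tree type1, const_tree type2)
{
  /* "special_vector" is a made-up attribute name for illustration.  */
  tree attrs1 = TYPE_ATTRIBUTES (type1);
  tree attrs2 = TYPE_ATTRIBUTES (type2);
  bool special1 = lookup_attribute ("special_vector", attrs1) != NULL_TREE;
  bool special2 = lookup_attribute ("special_vector", attrs2) != NULL_TREE;
  return special1 == special2;
}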
Alternate strategies are to expand the\n\ -@code{movmem} or @code{setmem} optabs, to emit a library call, or to emit\n\ +@code{cpymem} or @code{setmem} optabs, to emit a library call, or to emit\n\ unit-by-unit, loop-based operations.\n\ \n\ This target hook should return true if, for a memory operation with a\n\ @@ -3588,7 +3581,7 @@ optimized for speed rather than size.\n\ \n\ Returning true for higher values of @var{size} can improve code generation\n\ for speed if the target does not provide an implementation of the\n\ -@code{movmem} or @code{setmem} standard names, if the @code{movmem} or\n\ +@code{cpymem} or @code{setmem} standard names, if the @code{cpymem} or\n\ @code{setmem} implementation would be more expensive than a sequence of\n\ insns, or if the overhead of a library call would dominate that of\n\ the body of the memory operation.\n\ @@ -4479,18 +4472,18 @@ or 3-byte structure is returned at the most significant end of a\n\ from __builtin_va_arg. */ DEFHOOK (pass_by_reference, - "This target hook should return @code{true} if an argument at the\n\ + "This target hook should return @code{true} if argument @var{arg} at the\n\ position indicated by @var{cum} should be passed by reference. This\n\ predicate is queried after target independent reasons for being\n\ -passed by reference, such as @code{TREE_ADDRESSABLE (type)}.\n\ +passed by reference, such as @code{TREE_ADDRESSABLE (@var{arg}.type)}.\n\ \n\ If the hook returns true, a copy of that argument is made in memory and a\n\ pointer to the argument is passed instead of the argument itself.\n\ The pointer is passed in whatever way is appropriate for passing a pointer\n\ to that type.", bool, - (cumulative_args_t cum, machine_mode mode, const_tree type, bool named), - hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false) + (cumulative_args_t cum, const function_arg_info &arg), + hook_bool_CUMULATIVE_ARGS_arg_info_false) DEFHOOK (expand_builtin_saveregs, @@ -4515,8 +4508,8 @@ pass all their arguments on the stack.\n\ \n\ The argument @var{args_so_far} points to the @code{CUMULATIVE_ARGS} data\n\ structure, containing the values that are obtained after processing the\n\ -named arguments. The arguments @var{mode} and @var{type} describe the\n\ -last named argument---its machine mode and its data type as a tree node.\n\ +named arguments. The argument @var{arg} describes the last of these named\n\ +arguments.\n\ \n\ The target hook should do two things: first, push onto the stack all the\n\ argument registers @emph{not} used for the named arguments, and second,\n\ @@ -4536,7 +4529,7 @@ arguments of the function are being analyzed for the second time. This\n\ happens for an inline function, which is not actually compiled until the\n\ end of the source file. The hook @code{TARGET_SETUP_INCOMING_VARARGS} should\n\ not generate any instructions in this case.", - void, (cumulative_args_t args_so_far, machine_mode mode, tree type, + void, (cumulative_args_t args_so_far, const function_arg_info &arg, int *pretend_args_size, int second_time), default_setup_incoming_varargs) @@ -4579,15 +4572,6 @@ returned by function call into @var{slot}.", void, (rtx slot, rtx bounds), default_store_returned_bounds) -DEFHOOK -(setup_incoming_vararg_bounds, - "Use it to store bounds for anonymous register arguments stored\n\ -into the stack. 
Arguments meaning is similar to\n\ -@code{TARGET_SETUP_INCOMING_VARARGS}.", - void, (cumulative_args_t args_so_far, machine_mode mode, tree type, - int *pretend_args_size, int second_time), - default_setup_incoming_vararg_bounds) - DEFHOOK (call_args, "While generating RTL for a function call, this target hook is invoked once\n\ @@ -4668,11 +4652,11 @@ false.", Need audit to verify that this is the case. */ DEFHOOK (must_pass_in_stack, - "This target hook should return @code{true} if we should not pass @var{type}\n\ + "This target hook should return @code{true} if we should not pass @var{arg}\n\ solely in registers. The file @file{expr.h} defines a\n\ definition that is usually appropriate, refer to @file{expr.h} for additional\n\ documentation.", - bool, (machine_mode mode, const_tree type), + bool, (const function_arg_info &arg), must_pass_in_stack_var_size_or_pad) /* Return true if type TYPE, mode MODE, which is passed by reference, @@ -4691,8 +4675,8 @@ not be generated.\n\ \n\ The default version of this hook always returns false.", bool, - (cumulative_args_t cum, machine_mode mode, const_tree type, bool named), - hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false) + (cumulative_args_t cum, const function_arg_info &arg), + hook_bool_CUMULATIVE_ARGS_arg_info_false) /* Return zero for arguments passed entirely on the stack or entirely in registers. If passed in both, return the number of bytes passed @@ -4715,8 +4699,8 @@ compiler when this occurs, and how many bytes should go in registers.\n\ @code{TARGET_FUNCTION_ARG} for these arguments should return the first\n\ register to be used by the caller for this argument; likewise\n\ @code{TARGET_FUNCTION_INCOMING_ARG}, for the called function.", - int, (cumulative_args_t cum, machine_mode mode, tree type, bool named), - hook_int_CUMULATIVE_ARGS_mode_tree_bool_0) + int, (cumulative_args_t cum, const function_arg_info &arg), + hook_int_CUMULATIVE_ARGS_arg_info_0) /* Update the state in CA to advance past an argument in the argument list. The values MODE, TYPE, and NAMED describe that @@ -4724,8 +4708,7 @@ register to be used by the caller for this argument; likewise\n\ DEFHOOK (function_arg_advance, "This hook updates the summarizer variable pointed to by @var{ca} to\n\ -advance past an argument in the argument list. The values @var{mode},\n\ -@var{type} and @var{named} describe that argument. Once this is done,\n\ +advance past argument @var{arg} in the argument list. Once this is done,\n\ the variable @var{cum} is suitable for analyzing the @emph{following}\n\ argument with @code{TARGET_FUNCTION_ARG}, etc.\n\ \n\ @@ -4733,7 +4716,7 @@ This hook need not do anything if the argument in question was passed\n\ on the stack. The compiler knows how to track the amount of stack space\n\ used for arguments without any special help.", void, - (cumulative_args_t ca, machine_mode mode, const_tree type, bool named), + (cumulative_args_t ca, const function_arg_info &arg), default_function_arg_advance) DEFHOOK @@ -4770,17 +4753,9 @@ constant size shorter than an @code{int}, and upward otherwise.", argument. 
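/* Editorial aside, not part of the patch: after this conversion the
   argument-passing hooks read the argument description from a single
   function_arg_info object instead of separate mode/type/named
   parameters.  A minimal sketch of a converted TARGET_PASS_BY_REFERENCE,
   assuming the usual backend headers; the 16-byte cut-off is purely
   illustrative.  */

static bool
example_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  /* Pass aggregates larger than 16 bytes (or of variable size) by
     reference; everything else by value.  */
  if (arg.type && AGGREGATE_TYPE_P (arg.type))
    {
      HOST_WIDE_INT size = int_size_in_bytes (arg.type);
      return size < 0 || size > 16;
    }
  return false;
}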
*/ DEFHOOK (function_arg, - "Return an RTX indicating whether a function argument is passed in a\n\ -register and if so, which register.\n\ -\n\ -The arguments are @var{ca}, which summarizes all the previous\n\ -arguments; @var{mode}, the machine mode of the argument; @var{type},\n\ -the data type of the argument as a tree node or 0 if that is not known\n\ -(which happens for C support library functions); and @var{named},\n\ -which is @code{true} for an ordinary argument and @code{false} for\n\ -nameless arguments that correspond to @samp{@dots{}} in the called\n\ -function's prototype. @var{type} can be an incomplete type if a\n\ -syntax error has previously occurred.\n\ + "Return an RTX indicating whether function argument @var{arg} is passed\n\ +in a register and if so, which register. Argument @var{ca} summarizes all\n\ +the previous arguments.\n\ \n\ The return value is usually either a @code{reg} RTX for the hard\n\ register in which to pass the argument, or zero to pass the argument\n\ @@ -4826,8 +4801,7 @@ is not defined and @code{TARGET_FUNCTION_ARG} returns nonzero for such an\n\ argument, the compiler will abort. If @code{REG_PARM_STACK_SPACE} is\n\ defined, the argument will be computed in the stack and then loaded into\n\ a register.", - rtx, (cumulative_args_t ca, machine_mode mode, const_tree type, - bool named), + rtx, (cumulative_args_t ca, const function_arg_info &arg), default_function_arg) DEFHOOK @@ -4849,8 +4823,7 @@ so that it can be used to pass special arguments.\n\ \n\ If @code{TARGET_FUNCTION_INCOMING_ARG} is not defined,\n\ @code{TARGET_FUNCTION_ARG} serves both purposes.", - rtx, (cumulative_args_t ca, machine_mode mode, const_tree type, - bool named), + rtx, (cumulative_args_t ca, const function_arg_info &arg), default_function_incoming_arg) DEFHOOK @@ -4962,6 +4935,28 @@ If this hook is not defined, then FUNCTION_VALUE_REGNO_P will be used.", bool, (const unsigned int regno), default_function_value_regno_p) +DEFHOOK +(fntype_abi, + "Return the ABI used by a function with type @var{type}; see the\n\ +definition of @code{predefined_function_abi} for details of the ABI\n\ +descriptor. Targets only need to define this hook if they support\n\ +interoperability between several ABIs in the same translation unit.", + const predefined_function_abi &, (const_tree type), + NULL) + +DEFHOOK +(insn_callee_abi, + "This hook returns a description of the ABI used by the target of\n\ +call instruction @var{insn}; see the definition of\n\ +@code{predefined_function_abi} for details of the ABI descriptor.\n\ +Only the global function @code{insn_callee_abi} should call this hook\n\ +directly.\n\ +\n\ +Targets only need to define this hook if they support\n\ +interoperability between several ABIs in the same translation unit.", + const predefined_function_abi &, (const rtx_insn *insn), + NULL) + /* ??? Documenting this hook requires a GFDL license grant. */ DEFHOOK_UNDOC (internal_arg_pointer, @@ -5811,32 +5806,27 @@ The default version of this hook always returns @code{true}.", DEFHOOK (hard_regno_call_part_clobbered, - "This hook should return true if @var{regno} is partly call-saved and\n\ -partly call-clobbered, and if a value of mode @var{mode} would be partly\n\ -clobbered by call instruction @var{insn}. 
If @var{insn} is NULL then it\n\ -should return true if any call could partly clobber the register.\n\ -For example, if the low 32 bits of @var{regno} are preserved across a call\n\ -but higher bits are clobbered, this hook should return true for a 64-bit\n\ -mode but false for a 32-bit mode.\n\ + "ABIs usually specify that calls must preserve the full contents\n\ +of a particular register, or that calls can alter any part of a\n\ +particular register. This information is captured by the target macro\n\ +@code{CALL_REALLY_USED_REGISTERS}. However, some ABIs specify that calls\n\ +must preserve certain bits of a particular register but can alter others.\n\ +This hook should return true if this applies to at least one of the\n\ +registers in @samp{(reg:@var{mode} @var{regno})}, and if as a result the\n\ +call would alter part of the @var{mode} value. For example, if a call\n\ +preserves the low 32 bits of a 64-bit hard register @var{regno} but can\n\ +clobber the upper 32 bits, this hook should return true for a 64-bit mode\n\ +but false for a 32-bit mode.\n\ +\n\ +The value of @var{abi_id} comes from the @code{predefined_function_abi}\n\ +structure that describes the ABI of the call; see the definition of the\n\ +structure for more details. If (as is usual) the target uses the same ABI\n\ +for all functions in a translation unit, @var{abi_id} is always 0.\n\ \n\ The default implementation returns false, which is correct\n\ for targets that don't have partly call-clobbered registers.", - bool, (rtx_insn *insn, unsigned int regno, machine_mode mode), - hook_bool_insn_uint_mode_false) - -DEFHOOK -(return_call_with_max_clobbers, - "This hook returns a pointer to the call that partially clobbers the\n\ -most registers. If a platform supports multiple ABIs where the registers\n\ -that are partially clobbered may vary, this function compares two\n\ -calls and returns a pointer to the one that clobbers the most registers.\n\ -If both calls clobber the same registers, @var{call_1} must be returned.\n\ -\n\ -The registers clobbered in different ABIs must be a proper subset or\n\ -superset of all other ABIs. @var{call_1} must always be a call insn,\n\ -call_2 may be NULL or a call insn.", - rtx_insn *, (rtx_insn *call_1, rtx_insn *call_2), - NULL) + bool, (unsigned int abi_id, unsigned int regno, machine_mode mode), + hook_bool_uint_uint_mode_false) DEFHOOK (get_multilib_abi_name, @@ -5844,20 +5834,6 @@ DEFHOOK const char *, (void), hook_constcharptr_void_null) -DEFHOOK -(remove_extra_call_preserved_regs, - "This hook removes registers from the set of call-clobbered registers\n\ - in @var{used_regs} if, contrary to the default rules, something guarantees\n\ - that @samp{insn} preserves those registers. For example, some targets\n\ - support variant ABIs in which functions preserve more registers than\n\ - normal functions would. Removing those extra registers from @var{used_regs}\n\ - can lead to better register allocation.\n\ - \n\ - The default implementation does nothing, which is always safe.\n\ - Defining the hook is purely an optimization.", - void, (rtx_insn *insn, HARD_REG_SET *used_regs), - default_remove_extra_call_preserved_regs) - /* Return the smallest number of different values for which it is best to use a jump-table instead of a tree of conditional branches. */ DEFHOOK diff --git a/gcc/target.h b/gcc/target.h index 057e6ae87..964629669 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -149,6 +149,12 @@ struct ao_ref; /* This is defined in tree-vectorizer.h. 
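/* Editorial aside, not part of the patch: with the new signature the hook
   receives an ABI identifier rather than a call insn.  A sketch for a
   target on which, under the base ABI (id 0), calls preserve only the low
   8 bytes of any register that can hold MODE; the rule is purely
   illustrative.  */

static bool
example_hard_regno_call_part_clobbered (unsigned int abi_id,
                                        unsigned int /*regno*/,
                                        machine_mode mode)
{
  /* Modes wider than 8 bytes lose their upper bytes across a call.  */
  return abi_id == 0 && maybe_gt (GET_MODE_SIZE (mode), 8);
}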
*/ struct _stmt_vec_info; +/* This is defined in calls.h. */ +struct function_arg_info; + +/* This is defined in function-abi.h. */ +struct predefined_function_abi; + /* These are defined in tree-vect-stmts.c. */ extern tree stmt_vectype (struct _stmt_vec_info *); extern bool stmt_in_inner_loop_p (struct _stmt_vec_info *); diff --git a/gcc/targhooks.c b/gcc/targhooks.c index 6396f6f4b..6f54de0d5 100644 --- a/gcc/targhooks.c +++ b/gcc/targhooks.c @@ -193,11 +193,8 @@ default_expand_builtin_saveregs (void) } void -default_setup_incoming_varargs (cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - tree type ATTRIBUTE_UNUSED, - int *pretend_arg_size ATTRIBUTE_UNUSED, - int second_time ATTRIBUTE_UNUSED) +default_setup_incoming_varargs (cumulative_args_t, + const function_arg_info &, int *, int) { } @@ -323,22 +320,19 @@ default_cxx_get_cookie_size (tree type) of the TARGET_PASS_BY_REFERENCE hook uses just MUST_PASS_IN_STACK. */ bool -hook_pass_by_reference_must_pass_in_stack (cumulative_args_t c ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, const_tree type ATTRIBUTE_UNUSED, - bool named_arg ATTRIBUTE_UNUSED) +hook_pass_by_reference_must_pass_in_stack (cumulative_args_t, + const function_arg_info &arg) { - return targetm.calls.must_pass_in_stack (mode, type); + return targetm.calls.must_pass_in_stack (arg); } /* Return true if a parameter follows callee copies conventions. This version of the hook is true for all named arguments. */ bool -hook_callee_copies_named (cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - const_tree type ATTRIBUTE_UNUSED, bool named) +hook_callee_copies_named (cumulative_args_t, const function_arg_info &arg) { - return named; + return arg.named; } /* Emit to STREAM the assembler syntax for insn operand X. */ @@ -681,16 +675,6 @@ default_builtin_md_vectorized_function (tree, tree, tree) return NULL_TREE; } -/* Vectorized conversion. */ - -tree -default_builtin_vectorized_conversion (unsigned int code ATTRIBUTE_UNUSED, - tree dest_type ATTRIBUTE_UNUSED, - tree src_type ATTRIBUTE_UNUSED) -{ - return NULL_TREE; -} - /* Default vectorizer cost model values. 
*/ int @@ -737,28 +721,22 @@ default_builtin_reciprocal (tree) } bool -hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false ( - cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - const_tree type ATTRIBUTE_UNUSED, bool named ATTRIBUTE_UNUSED) +hook_bool_CUMULATIVE_ARGS_arg_info_false (cumulative_args_t, + const function_arg_info &) { return false; } bool -hook_bool_CUMULATIVE_ARGS_mode_tree_bool_true ( - cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - const_tree type ATTRIBUTE_UNUSED, bool named ATTRIBUTE_UNUSED) +hook_bool_CUMULATIVE_ARGS_arg_info_true (cumulative_args_t, + const function_arg_info &) { return true; } int -hook_int_CUMULATIVE_ARGS_mode_tree_bool_0 ( - cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - tree type ATTRIBUTE_UNUSED, bool named ATTRIBUTE_UNUSED) +hook_int_CUMULATIVE_ARGS_arg_info_0 (cumulative_args_t, + const function_arg_info &) { return 0; } @@ -770,10 +748,7 @@ hook_void_CUMULATIVE_ARGS_tree (cumulative_args_t ca ATTRIBUTE_UNUSED, } void -default_function_arg_advance (cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - const_tree type ATTRIBUTE_UNUSED, - bool named ATTRIBUTE_UNUSED) +default_function_arg_advance (cumulative_args_t, const function_arg_info &) { gcc_unreachable (); } @@ -814,19 +789,13 @@ default_function_arg_padding (machine_mode mode, const_tree type) } rtx -default_function_arg (cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - const_tree type ATTRIBUTE_UNUSED, - bool named ATTRIBUTE_UNUSED) +default_function_arg (cumulative_args_t, const function_arg_info &) { gcc_unreachable (); } rtx -default_function_incoming_arg (cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - const_tree type ATTRIBUTE_UNUSED, - bool named ATTRIBUTE_UNUSED) +default_function_incoming_arg (cumulative_args_t, const function_arg_info &) { gcc_unreachable (); } @@ -1061,12 +1030,6 @@ default_return_pops_args (tree, tree, poly_int64) return 0; } -reg_class_t -default_branch_target_register_class (void) -{ - return NO_REGS; -} - reg_class_t default_ira_change_pseudo_allocno_class (int regno ATTRIBUTE_UNUSED, reg_class_t cl, @@ -1732,9 +1695,9 @@ get_move_ratio (bool speed_p ATTRIBUTE_UNUSED) #ifdef MOVE_RATIO move_ratio = (unsigned int) MOVE_RATIO (speed_p); #else -#if defined (HAVE_movmemqi) || defined (HAVE_movmemhi) || defined (HAVE_movmemsi) || defined (HAVE_movmemdi) || defined (HAVE_movmemti) +#if defined (HAVE_cpymemqi) || defined (HAVE_cpymemhi) || defined (HAVE_cpymemsi) || defined (HAVE_cpymemdi) || defined (HAVE_cpymemti) move_ratio = 2; -#else /* No movmem patterns, pick a default. */ +#else /* No cpymem patterns, pick a default. */ move_ratio = ((speed_p) ? 15 : 3); #endif #endif @@ -1742,7 +1705,7 @@ get_move_ratio (bool speed_p ATTRIBUTE_UNUSED) } /* Return TRUE if the move_by_pieces/set_by_pieces infrastructure should be - used; return FALSE if the movmem/setmem optab should be expanded, or + used; return FALSE if the cpymem/setmem optab should be expanded, or a call to memcpy emitted. 
*/ bool @@ -1941,7 +1904,7 @@ default_dwarf_frame_reg_mode (int regno) { machine_mode save_mode = reg_raw_mode[regno]; - if (targetm.hard_regno_call_part_clobbered (NULL, regno, save_mode)) + if (targetm.hard_regno_call_part_clobbered (0, regno, save_mode)) save_mode = choose_hard_reg_mode (regno, 1, true); return save_mode; } @@ -2163,7 +2126,7 @@ std_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, if (ARGS_GROW_DOWNWARD) gcc_unreachable (); - indirect = pass_by_reference (NULL, TYPE_MODE (type), type, false); + indirect = pass_va_arg_by_reference (type); if (indirect) type = build_pointer_type (type); @@ -2260,15 +2223,6 @@ std_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, return build_va_arg_indirect_ref (addr); } -void -default_setup_incoming_vararg_bounds (cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - tree type ATTRIBUTE_UNUSED, - int *pretend_arg_size ATTRIBUTE_UNUSED, - int second_time ATTRIBUTE_UNUSED) -{ -} - /* An implementation of TARGET_CAN_USE_DOLOOP_P for targets that do not support nested low-overhead loops. */ @@ -2385,9 +2339,4 @@ default_speculation_safe_value (machine_mode mode ATTRIBUTE_UNUSED, return result; } -void -default_remove_extra_call_preserved_regs (rtx_insn *, HARD_REG_SET *) -{ -} - #include "gt-targhooks.h" diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 2d5991908..e5e803c33 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -40,7 +40,9 @@ extern machine_mode default_cc_modes_compatible (machine_mode, extern bool default_return_in_memory (const_tree, const_tree); extern rtx default_expand_builtin_saveregs (void); -extern void default_setup_incoming_varargs (cumulative_args_t, machine_mode, tree, int *, int); +extern void default_setup_incoming_varargs (cumulative_args_t, + const function_arg_info &, + int *, int); extern rtx default_builtin_setjmp_frame_value (void); extern bool default_pretend_outgoing_varargs_named (cumulative_args_t); @@ -63,9 +65,9 @@ extern tree default_cxx_guard_type (void); extern tree default_cxx_get_cookie_size (tree); extern bool hook_pass_by_reference_must_pass_in_stack - (cumulative_args_t, machine_mode mode, const_tree, bool); + (cumulative_args_t, const function_arg_info &); extern bool hook_callee_copies_named - (cumulative_args_t ca, machine_mode, const_tree, bool); + (cumulative_args_t ca, const function_arg_info &); extern void default_print_operand (FILE *, rtx, int); extern void default_print_operand_address (FILE *, machine_mode, rtx); @@ -90,8 +92,6 @@ extern const char * default_invalid_within_doloop (const rtx_insn *); extern tree default_builtin_vectorized_function (unsigned int, tree, tree); extern tree default_builtin_md_vectorized_function (tree, tree, tree); -extern tree default_builtin_vectorized_conversion (unsigned int, tree, tree); - extern int default_builtin_vectorization_cost (enum vect_cost_for_stmt, tree, int); extern tree default_builtin_reciprocal (tree); @@ -135,24 +135,23 @@ extern void default_goacc_reduction (gcall *); extern bool hook_bool_CUMULATIVE_ARGS_false (cumulative_args_t); extern bool hook_bool_CUMULATIVE_ARGS_true (cumulative_args_t); -extern bool hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false - (cumulative_args_t, machine_mode, const_tree, bool); -extern bool hook_bool_CUMULATIVE_ARGS_mode_tree_bool_true - (cumulative_args_t, machine_mode, const_tree, bool); -extern int hook_int_CUMULATIVE_ARGS_mode_tree_bool_0 - (cumulative_args_t, machine_mode, tree, bool); +extern bool 
hook_bool_CUMULATIVE_ARGS_arg_info_false + (cumulative_args_t, const function_arg_info &); +extern bool hook_bool_CUMULATIVE_ARGS_arg_info_true + (cumulative_args_t, const function_arg_info &); +extern int hook_int_CUMULATIVE_ARGS_arg_info_0 + (cumulative_args_t, const function_arg_info &); extern void hook_void_CUMULATIVE_ARGS_tree (cumulative_args_t, tree); extern const char *hook_invalid_arg_for_unprototyped_fn (const_tree, const_tree, const_tree); extern void default_function_arg_advance - (cumulative_args_t, machine_mode, const_tree, bool); + (cumulative_args_t, const function_arg_info &); extern HOST_WIDE_INT default_function_arg_offset (machine_mode, const_tree); extern pad_direction default_function_arg_padding (machine_mode, const_tree); -extern rtx default_function_arg - (cumulative_args_t, machine_mode, const_tree, bool); -extern rtx default_function_incoming_arg - (cumulative_args_t, machine_mode, const_tree, bool); +extern rtx default_function_arg (cumulative_args_t, const function_arg_info &); +extern rtx default_function_incoming_arg (cumulative_args_t, + const function_arg_info &); extern unsigned int default_function_arg_boundary (machine_mode, const_tree); extern unsigned int default_function_arg_round_boundary (machine_mode, @@ -165,7 +164,6 @@ extern rtx default_internal_arg_pointer (void); extern rtx default_static_chain (const_tree, bool); extern void default_trampoline_init (rtx, tree, rtx); extern poly_int64 default_return_pops_args (tree, tree, poly_int64); -extern reg_class_t default_branch_target_register_class (void); extern reg_class_t default_ira_change_pseudo_allocno_class (int, reg_class_t, reg_class_t); extern bool default_lra_p (void); @@ -266,11 +264,6 @@ extern rtx default_load_bounds_for_arg (rtx, rtx, rtx); extern void default_store_bounds_for_arg (rtx, rtx, rtx, rtx); extern rtx default_load_returned_bounds (rtx); extern void default_store_returned_bounds (rtx,rtx); -extern void default_setup_incoming_vararg_bounds (cumulative_args_t ca ATTRIBUTE_UNUSED, - machine_mode mode ATTRIBUTE_UNUSED, - tree type ATTRIBUTE_UNUSED, - int *pretend_arg_size ATTRIBUTE_UNUSED, - int second_time ATTRIBUTE_UNUSED); extern bool default_optab_supported_p (int, machine_mode, machine_mode, optimization_type); extern unsigned int default_max_noce_ifcvt_seq_cost (edge); @@ -287,7 +280,5 @@ extern tree default_preferred_else_value (unsigned, tree, unsigned, tree *); extern bool default_have_speculation_safe_value (bool); extern bool speculation_safe_value_not_needed (bool); extern rtx default_speculation_safe_value (machine_mode, rtx, rtx, rtx); -extern void default_remove_extra_call_preserved_regs (rtx_insn *, - HARD_REG_SET *); #endif /* GCC_TARGHOOKS_H */ diff --git a/gcc/testsuite/c-c++-common/guality/Og-dce-1.c b/gcc/testsuite/c-c++-common/guality/Og-dce-1.c new file mode 100644 index 000000000..a859e3252 --- /dev/null +++ b/gcc/testsuite/c-c++-common/guality/Og-dce-1.c @@ -0,0 +1,14 @@ +/* { dg-do run } */ +/* { dg-options "-g" } */ + +int *__attribute__((noipa)) consume (int *ptr) { return ptr; } + +int +main (void) +{ + int x; + int *volatile ptr = consume (&x); + x = 0; + x = 1; /* { dg-final { gdb-test . "*ptr" "0" } } */ + return 0; /* { dg-final { gdb-test . 
"*ptr" "1" } } */ +} diff --git a/gcc/testsuite/c-c++-common/guality/Og-dce-2.c b/gcc/testsuite/c-c++-common/guality/Og-dce-2.c new file mode 100644 index 000000000..3df2c7921 --- /dev/null +++ b/gcc/testsuite/c-c++-common/guality/Og-dce-2.c @@ -0,0 +1,19 @@ +/* { dg-do run } */ +/* { dg-options "-g" } */ + +struct s { int a, b, c, d; }; + +struct s gs1 = { 1, 2, 3, 4 }; +struct s gs2 = { 5, 6, 7, 8 }; + +struct s *__attribute__((noipa)) consume (struct s *ptr) { return ptr; } + +int +main (void) +{ + struct s x; + struct s *volatile ptr = consume (&x); + x = gs1; + x = gs2; /* { dg-final { gdb-test . "ptr->a" "1" } } */ + return 0; /* { dg-final { gdb-test . "ptr->a" "5" } } */ +} diff --git a/gcc/testsuite/c-c++-common/guality/Og-dce-3.c b/gcc/testsuite/c-c++-common/guality/Og-dce-3.c new file mode 100644 index 000000000..fa6186a73 --- /dev/null +++ b/gcc/testsuite/c-c++-common/guality/Og-dce-3.c @@ -0,0 +1,29 @@ +/* { dg-do run } */ +/* { dg-options "-g" } */ + +volatile int amount = 10; + +void __attribute__((noipa)) +do_something (int *ptr) +{ + *ptr += 10; +} + +int __attribute__((noipa)) +foo (int count) +{ + int x = 1; + for (int i = 0; i < count; ++i) + do_something (&x); /* { dg-final { gdb-test . "x" "1" } } */ + int res = x; /* { dg-final { gdb-test . "x" "101" } } */ + x = res + 1; + return res; /* { dg-final { gdb-test . "x" "102" } } */ + +} + +int +main (void) +{ + foo (10); + return 0; +} diff --git a/gcc/testsuite/c-c++-common/guality/Og-global-dse-1.c b/gcc/testsuite/c-c++-common/guality/Og-global-dse-1.c new file mode 100644 index 000000000..3d4b4e60e --- /dev/null +++ b/gcc/testsuite/c-c++-common/guality/Og-global-dse-1.c @@ -0,0 +1,17 @@ +/* { dg-do run } */ +/* { dg-options "-g" } */ + +struct s { int i, j; }; +struct s gs1, gs2 = { 3, 4 }; + +void __attribute__((noipa)) consume (void) {}; + +int +main (void) +{ + gs1.i = 1; + gs1.j = 2; /* { dg-final { gdb-test . "gs1.i" "1" } } */ + gs1 = gs2; /* { dg-final { gdb-test . "gs1.j" "2" } } */ + consume (); /* { dg-final { gdb-test . "gs1.i" "3" } } */ + return 0; /* { dg-final { gdb-test . "gs1.j" "4" } } */ +} diff --git a/gcc/testsuite/c-c++-common/guality/Og-static-wo-1.c b/gcc/testsuite/c-c++-common/guality/Og-static-wo-1.c new file mode 100644 index 000000000..a4c7f3067 --- /dev/null +++ b/gcc/testsuite/c-c++-common/guality/Og-static-wo-1.c @@ -0,0 +1,15 @@ +/* { dg-do run } */ +/* { dg-options "-g" } */ + +#include "../../gcc.dg/nop.h" + +static int x = 0; + +int +main (void) +{ + asm volatile (NOP); /* { dg-final { gdb-test . "x" "0" } } */ + x = 1; + asm volatile (NOP); /* { dg-final { gdb-test . 
"x" "1" } } */ + return 0; +} diff --git a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C index 5740c0281..50c1452ed 100644 --- a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C +++ b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C @@ -14,6 +14,7 @@ void f4 (uint16x4_t a) {} void f5 (uint32x2_t a) {} void f23 (uint64x1_t a) {} void f61 (float16x4_t a) {} +void f62 (bfloat16x4_t a) {} void f6 (float32x2_t a) {} void f7 (poly8x8_t a) {} void f8 (poly16x4_t a) {} @@ -27,6 +28,7 @@ void f14 (uint16x8_t a) {} void f15 (uint32x4_t a) {} void f16 (uint64x2_t a) {} void f171 (float16x8_t a) {} +void f172 (bfloat16x8_t a) {} void f17 (float32x4_t a) {} void f18 (float64x2_t a) {} void f19 (poly8x16_t a) {} @@ -45,6 +47,7 @@ void g1 (int8x16_t, int8x16_t) {} // { dg-final { scan-assembler "_Z2f512__Uint32x2_t:" } } // { dg-final { scan-assembler "_Z3f2312__Uint64x1_t:" } } // { dg-final { scan-assembler "_Z3f6113__Float16x4_t:" } } +// { dg-final { scan-assembler "_Z3f6214__Bfloat16x4_t:" } } // { dg-final { scan-assembler "_Z2f613__Float32x2_t:" } } // { dg-final { scan-assembler "_Z2f711__Poly8x8_t:" } } // { dg-final { scan-assembler "_Z2f812__Poly16x4_t:" } } @@ -57,6 +60,7 @@ void g1 (int8x16_t, int8x16_t) {} // { dg-final { scan-assembler "_Z3f1512__Uint32x4_t:" } } // { dg-final { scan-assembler "_Z3f1612__Uint64x2_t:" } } // { dg-final { scan-assembler "_Z4f17113__Float16x8_t:" } } +// { dg-final { scan-assembler "_Z4f17214__Bfloat16x8_t:" } } // { dg-final { scan-assembler "_Z3f1713__Float32x4_t:" } } // { dg-final { scan-assembler "_Z3f1813__Float64x2_t:" } } // { dg-final { scan-assembler "_Z3f1912__Poly8x16_t:" } } diff --git a/gcc/testsuite/g++.dg/diagnostic/aka4.C b/gcc/testsuite/g++.dg/diagnostic/aka4.C new file mode 100644 index 000000000..da8c57964 --- /dev/null +++ b/gcc/testsuite/g++.dg/diagnostic/aka4.C @@ -0,0 +1,9 @@ +typedef unsigned int myvec __attribute__((vector_size (16))); + +void f (float x) +{ + myvec y = x; // { dg-error {cannot convert 'float' to 'myvec' {aka '__vector\([48]\) unsigned int'} in initialization} } + myvec *ptr = &x; // { dg-error {cannot convert 'float\*' to 'myvec\*' {aka '__vector\([48]\) unsigned int\*'} in initialization} } + const myvec *const_ptr = &x; // { dg-error {cannot convert 'float\*' to 'const myvec\*' {aka 'const __vector\([48]\) unsigned int\*'} in initialization} } + volatile myvec *volatile_ptr = &x; // { dg-error {cannot convert 'float\*' to 'volatile myvec\*' {aka 'volatile __vector\([48]\) unsigned int\*'} in initialization} } +} diff --git a/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C new file mode 100644 index 000000000..5426a1814 --- /dev/null +++ b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C @@ -0,0 +1,13 @@ +/* { dg-do compile { target aarch64*-*-* } } */ + +/* Test mangling */ + +/* { dg-final { scan-assembler "\t.global\t_Z1fPu6__bf16" } } */ +void f (__bf16 *x) { } + +/* { dg-final { scan-assembler "\t.global\t_Z1gPu6__bf16S_" } } */ +void g (__bf16 *x, __bf16 *y) { } + +/* { dg-final { scan-assembler "\t.global\t_ZN1SIu6__bf16u6__bf16E1iE" } } */ +template struct S { static int i; }; +template <> int S<__bf16, __bf16>::i = 3; diff --git a/gcc/testsuite/g++.dg/guality/guality.exp b/gcc/testsuite/g++.dg/guality/guality.exp index 757b20b61..33571f1f2 100644 --- a/gcc/testsuite/g++.dg/guality/guality.exp +++ b/gcc/testsuite/g++.dg/guality/guality.exp @@ -65,8 +65,22 @@ if {[check_guality " 
return 0; } "]} { - gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.C]] "" "" - gcc-dg-runtest [lsort [glob $srcdir/c-c++-common/guality/*.c]] "" "" + set general [list] + set Og [list] + foreach file [lsort [glob $srcdir/c-c++-common/guality/*.c]] { + switch -glob -- [file tail $file] { + Og-* { lappend Og $file } + * { lappend general $file } + } + } + + gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.C]] "" "" + gcc-dg-runtest $general "" "" + set-torture-options \ + [list "-O0" "-Og"] \ + [list {}] \ + [list "-Og -flto"] + gcc-dg-runtest $Og "" "" } if [info exists guality_gdb_name] { diff --git a/gcc/testsuite/g++.dg/ipa/pr93763.C b/gcc/testsuite/g++.dg/ipa/pr93763.C index 61117108e..13ab2d57f 100644 --- a/gcc/testsuite/g++.dg/ipa/pr93763.C +++ b/gcc/testsuite/g++.dg/ipa/pr93763.C @@ -1,4 +1,4 @@ -/* { dg-do compile } */ +/* { dg-do compile { target c++11 } } */ /* { dg-options "-O3" } */ struct search_param { diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr53844.C b/gcc/testsuite/g++.dg/tree-ssa/pr53844.C index 954cc71b4..ab9879f6a 100644 --- a/gcc/testsuite/g++.dg/tree-ssa/pr53844.C +++ b/gcc/testsuite/g++.dg/tree-ssa/pr53844.C @@ -1,5 +1,5 @@ // { dg-do compile } -// { dg-options "-O2 -fdump-tree-optimized-vops" } +// { dg-options "-O2 -fdump-tree-optimized-vops -fno-inline-functions --param max-inline-insns-single-O2=200" } struct VBase; diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C index 870b23721..2e3dfecac 100644 --- a/gcc/testsuite/g++.dg/tree-ssa/pr61034.C +++ b/gcc/testsuite/g++.dg/tree-ssa/pr61034.C @@ -1,5 +1,5 @@ // { dg-do compile } -// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized -fdelete-null-pointer-checks" } +// { dg-options "-O2 -fdump-tree-fre3 -fdump-tree-optimized -fdelete-null-pointer-checks --param early-inlining-insns-O2=14" } #define assume(x) if(!(x))__builtin_unreachable() diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr8781.C b/gcc/testsuite/g++.dg/tree-ssa/pr8781.C index 1f115b2b2..5bc1ef035 100644 --- a/gcc/testsuite/g++.dg/tree-ssa/pr8781.C +++ b/gcc/testsuite/g++.dg/tree-ssa/pr8781.C @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O -fno-tree-sra -fdump-tree-fre1" } */ +/* { dg-options "-O -fno-tree-sra -fdump-tree-fre1 --param early-inlining-insns-O2=14" } */ int f(); diff --git a/gcc/testsuite/g++.dg/warn/Wstringop-truncation-1.C b/gcc/testsuite/g++.dg/warn/Wstringop-truncation-1.C index 830660197..49dde0a65 100644 --- a/gcc/testsuite/g++.dg/warn/Wstringop-truncation-1.C +++ b/gcc/testsuite/g++.dg/warn/Wstringop-truncation-1.C @@ -1,7 +1,7 @@ /* PR/tree-optimization/84480 - bogus -Wstringop-truncation despite assignment with an inlined string literal { dg-do compile } - { dg-options "-O2 -Wstringop-truncation" } */ + { dg-options "-O2 -Wstringop-truncation --param early-inlining-insns-O2=14" } */ #include diff --git a/gcc/testsuite/g++.target/aarch64/bfloat_cpp_typecheck.C b/gcc/testsuite/g++.target/aarch64/bfloat_cpp_typecheck.C new file mode 100644 index 000000000..9203d91f8 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/bfloat_cpp_typecheck.C @@ -0,0 +1,14 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O3 --save-temps" } */ + +#include + +void foo (void) +{ + bfloat16_t (); /* { dg-bogus {invalid conversion to type 'bfloat16_t'} "" { xfail *-*-* } } */ + bfloat16_t a = bfloat16_t(); /* { dg-bogus {invalid conversion to type 
'bfloat16_t'} "" { xfail *-*-* } } */ + bfloat16_t (0x1234); /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t (0.1); /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp new file mode 100644 index 000000000..e9d624ff8 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp @@ -0,0 +1,83 @@ +# Assembly-based regression-test driver for the SVE ACLE +# Copyright (C) 2009-2019 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . */ + +# GCC testsuite that uses the `dg.exp' driver. + +# Exit immediately if this isn't an AArch64 target. +if { ![istarget aarch64*-*-*] } { + return +} + +# Load support procs. +load_lib g++-dg.exp + +# Initialize `dg'. +dg-init + +# Force SVE if we're not testing it already. +if { [check_effective_target_aarch64_sve] } { + set sve_flags "" +} else { + set sve_flags "-march=armv8.2-a+sve" +} + +global gcc_runtest_parallelize_limit_minor +if { [info exists gcc_runtest_parallelize_limit_minor] } { + set old_limit_minor $gcc_runtest_parallelize_limit_minor + set gcc_runtest_parallelize_limit_minor 1 +} + +torture-init +set-torture-options { + "-std=c++98 -O0 -g" + "-std=c++98 -O1 -g" + "-std=c++11 -O2 -g" + "-std=c++14 -O3 -g" + "-std=c++17 -Og -g" + "-std=c++2a -Os -g" + "-std=gnu++98 -O2 -fno-schedule-insns -DCHECK_ASM --save-temps" + "-std=gnu++11 -Ofast -g" + "-std=gnu++17 -O3 -g" + "-std=gnu++2a -O0 -g" +} { + "-DTEST_FULL" + "-DTEST_OVERLOADS" +} + +# Main loop. +set gcc_subdir [string replace $subdir 0 2 gcc] +set files [glob -nocomplain $srcdir/$gcc_subdir/asm/*.c] +set save-dg-do-what-default ${dg-do-what-default} +if { [check_effective_target_aarch64_asm_sve_ok] + && [check_effective_target_aarch64_variant_pcs] } { + set dg-do-what-default assemble +} else { + set dg-do-what-default compile +} +gcc-dg-runtest [lsort $files] "" "$sve_flags -fno-ipa-icf" +set dg-do-what-default ${save-dg-do-what-default} + +torture-finish + +if { [info exists gcc_runtest_parallelize_limit_minor] } { + set gcc_runtest_parallelize_limit_minor $old_limit_minor +} + +# All done. +dg-finish diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp new file mode 100644 index 000000000..54c43a3ac --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/aarch64-sve-acle.exp @@ -0,0 +1,55 @@ +# Specific regression driver for AArch64 SVE. +# Copyright (C) 2009-2019 Free Software Foundation, Inc. +# Contributed by ARM Ltd. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. 
+# +# GCC is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . */ + +# GCC testsuite that uses the `dg.exp' driver. + +# Exit immediately if this isn't an AArch64 target. +if {![istarget aarch64*-*-*] } { + return +} + +# Load support procs. +load_lib g++-dg.exp + +# If a testcase doesn't have special options, use these. +global DEFAULT_CXXFLAGS +if ![info exists DEFAULT_CXXFLAGS] then { + set DEFAULT_CXXFLAGS " -pedantic-errors -Wno-long-long" +} + +# Initialize `dg'. +dg-init + +# Force SVE if we're not testing it already. +if { [check_effective_target_aarch64_sve] } { + set sve_flags "" +} else { + set sve_flags "-march=armv8.2-a+sve" +} + +# Main loop. +set gcc_subdir [string replace $subdir 0 2 gcc] +set files [glob -nocomplain \ + "$srcdir/$gcc_subdir/general/*.c" \ + "$srcdir/$subdir/general-c++/*.\[cC\]"] +dg-runtest [lsort $files] "$sve_flags" $DEFAULT_CXXFLAGS + +# All done. +dg-finish diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.C new file mode 100644 index 000000000..44aa10e20 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.C @@ -0,0 +1,9 @@ +/* { dg-do compile } */ + +#include "add_1.h" + +svuint8_t +f1 (svbool_t pg, svuint8_t x, svint8_t y) +{ + return svadd_u8_x (pg, x, y); /* { dg-error "cannot convert 'svint8_t' to 'svuint8_t'" } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.h b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.h new file mode 100644 index 000000000..d441328a3 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_1.h @@ -0,0 +1,2 @@ +#pragma GCC system_header +#pragma GCC aarch64 "arm_sve.h" /* { dg-message "initializing argument 3" } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.C new file mode 100644 index 000000000..fcfb0f489 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.C @@ -0,0 +1,14 @@ +/* { dg-do compile } */ + +#include "add_2.h" + +void +f1 (svbool_t pg, svuint8_t x, svint8_t y) +{ + svadd_x (pg, x); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, svuint8_t&\)'} } */ + svadd_x (pg, x, x, x); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, svuint8_t&, svuint8_t&, svuint8_t&\)'} } */ + svadd_x (x, x, x); /* { dg-error {no matching function for call to 'svadd_x\(svuint8_t&, svuint8_t&, svuint8_t&\)'} } */ + svadd_x (pg, pg, pg); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, svbool_t&, svbool_t&\)'} } */ + svadd_x (pg, 1, x); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, int, svuint8_t&\)'} } */ + svadd_x (pg, x, y); /* { dg-error {no matching function for call to 'svadd_x\(svbool_t&, svuint8_t&, svint8_t&\)'} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.h b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.h new file mode 100644 index 000000000..2b3a520d3 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_2.h @@ -0,0 +1,9 @@ +#pragma GCC system_header +#pragma GCC aarch64 
"arm_sve.h" +/* { dg-message {note: candidate: 'svfloat16_t svadd_x\(svbool_t, svfloat16_t, svfloat16_t\)'} "" { target *-*-* } 3 } */ +/* { dg-message {note: *candidate expects 3 arguments, 2 provided} "" { target *-*-* } 3 } */ +/* { dg-message {note: *candidate expects 3 arguments, 4 provided} "" { target *-*-* } 3 } */ +/* { dg-message {note: *no known conversion for argument 1 from 'svuint8_t' to 'svbool_t'} "" { target *-*-* } 3 } */ +/* { dg-message {note: *no known conversion for argument 2 from 'svbool_t' to 'svfloat16_t'} "" { target *-*-* } 3 } */ +/* { dg-message {note: *no known conversion for argument 2 from 'int' to 'svfloat16_t'} "" { target *-*-* } 3 } */ +/* { dg-message {note: *no known conversion for argument 2 from 'svuint8_t' to 'svfloat16_t'} "" { target *-*-* } 3 } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_3.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_3.C new file mode 100644 index 000000000..1d811fc76 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/add_3.C @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized -fnon-call-exceptions" } */ + +#include + +svint8_t +foo (svbool_t pg, svint8_t a, svint8_t b) +{ + try + { + a = svadd_m (pg, a, b); + } + catch (...) + { + a = b; + } + return a; +} + +/* { dg-final { scan-tree-dump-not {__cxa_begin_catch} "optimized" } } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_1.C new file mode 100644 index 000000000..a73934f56 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_1.C @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_add (uint64_t a, uint64_t b) { return a + b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +void +f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svint16_t s16, + svint32_t s32, svint64_t s64, int x) +{ + const int one = 1; + u8 = svasrd_x (pg, u8, 1); /* { dg-error {no matching function for call to 'svasrd_x\(svbool_t&, svuint8_t&, [^)]*\)'} } */ + s8 = svasrd_x (pg, s8, x); /* { dg-error "argument 3 of 'svasrd_x' must be an integer constant expression" } */ + s8 = svasrd_x (pg, s8, one); + s8 = svasrd_x (pg, s8, 0.4); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_x (pg, s8, 1.0); + s8 = svasrd_x (pg, s8, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_x (pg, s8, 1); + s8 = svasrd_x (pg, s8, 1 + 1); + s8 = svasrd_x (pg, s8, const_add (1, 1)); + s8 = svasrd_x (pg, s8, add (1, 1)); /* { dg-error "argument 3 of 'svasrd_x' must be an integer constant expression" } */ + s8 = svasrd_x (pg, s8, 8); + s8 = svasrd_x (pg, s8, 9); /* { dg-error {passing 9 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_x (pg, s8, (uint64_t (1) << 62) + 1); /* { dg-error {passing [^ ]* to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ + s16 = svasrd_x (pg, s16, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} } */ + s16 = svasrd_x (pg, s16, 1); + s16 = svasrd_x (pg, s16, 16); + s16 = svasrd_x (pg, s16, 17); /* { dg-error {passing 17 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} } */ + s32 = svasrd_x (pg, s32, 
0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} } */ + s32 = svasrd_x (pg, s32, 1); + s32 = svasrd_x (pg, s32, 32); + s32 = svasrd_x (pg, s32, 33); /* { dg-error {passing 33 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} } */ + s64 = svasrd_x (pg, s64, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} } */ + s64 = svasrd_x (pg, s64, 1); + s64 = svasrd_x (pg, s64, 64); + s64 = svasrd_x (pg, s64, 65); /* { dg-error {passing 65 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_2.C new file mode 100644 index 000000000..bbe7ba72b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_2.C @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_add (uint64_t a, uint64_t b) { return a + b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +void +f1 (svbool_t pg, svint8_t s8, svint16_t s16, svint32_t s32, svint64_t s64, + int x) +{ + const int one = 1; + s8 = svasrd_n_s8_x (pg, s8, x); /* { dg-error "argument 3 of 'svasrd_n_s8_x' must be an integer constant expression" } */ + s8 = svasrd_n_s8_x (pg, s8, one); + s8 = svasrd_n_s8_x (pg, s8, 0.4); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_n_s8_x (pg, s8, 1.0); + s8 = svasrd_n_s8_x (pg, s8, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_n_s8_x (pg, s8, 1); + s8 = svasrd_n_s8_x (pg, s8, 1 + 1); + s8 = svasrd_n_s8_x (pg, s8, const_add (1, 1)); + s8 = svasrd_n_s8_x (pg, s8, add (1, 1)); /* { dg-error "argument 3 of 'svasrd_n_s8_x' must be an integer constant expression" } */ + s8 = svasrd_n_s8_x (pg, s8, 8); + s8 = svasrd_n_s8_x (pg, s8, 9); /* { dg-error {passing 9 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_n_s8_x (pg, s8, (uint64_t (1) << 62) + 1); /* { dg-error {passing [^ ]* to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ + s16 = svasrd_n_s16_x (pg, s16, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s16_x', which expects a value in the range \[1, 16\]} } */ + s16 = svasrd_n_s16_x (pg, s16, 1); + s16 = svasrd_n_s16_x (pg, s16, 16); + s16 = svasrd_n_s16_x (pg, s16, 17); /* { dg-error {passing 17 to argument 3 of 'svasrd_n_s16_x', which expects a value in the range \[1, 16\]} } */ + s32 = svasrd_n_s32_x (pg, s32, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s32_x', which expects a value in the range \[1, 32\]} } */ + s32 = svasrd_n_s32_x (pg, s32, 1); + s32 = svasrd_n_s32_x (pg, s32, 32); + s32 = svasrd_n_s32_x (pg, s32, 33); /* { dg-error {passing 33 to argument 3 of 'svasrd_n_s32_x', which expects a value in the range \[1, 32\]} } */ + s64 = svasrd_n_s64_x (pg, s64, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s64_x', which expects a value in the range \[1, 64\]} } */ + s64 = svasrd_n_s64_x (pg, s64, 1); + s64 = svasrd_n_s64_x (pg, s64, 64); + s64 = svasrd_n_s64_x (pg, s64, 65); /* { dg-error {passing 65 to argument 3 of 'svasrd_n_s64_x', which expects a value in the range \[1, 64\]} } */ +} diff --git 
a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_3.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_3.C new file mode 100644 index 000000000..5ebd770b2 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/asrd_3.C @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_add (uint64_t a, uint64_t b) { return a + b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +template +T shift (svbool_t pg, T v) { return svasrd_x (pg, v, N); } +/* { dg-error {no matching function for call to 'svasrd_x\(svbool_t&,} "" { target *-*-* } .-1 } */ +/* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} "" { target *-*-* } .-2 } */ +/* { dg-error {passing 9 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} "" { target *-*-* } .-3 } */ +/* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} "" { target *-*-* } .-4 } */ +/* { dg-error {passing 17 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} "" { target *-*-* } .-5 } */ +/* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} "" { target *-*-* } .-6 } */ +/* { dg-error {passing 33 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} "" { target *-*-* } .-7 } */ +/* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} "" { target *-*-* } .-8 } */ +/* { dg-error {passing 65 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} "" { target *-*-* } .-9 } */ + +template +T shift1 (svbool_t pg, T v, uint64_t n) { return svasrd_x (pg, v, n); } + +template +T shift2 (svbool_t pg, T v, uint64_t n) { return svasrd_x (pg, v, n); } +/* { dg-error {argument 3 of 'svasrd_x' must be an integer constant expression} "" { target *-*-* } .-1 } */ + +void +f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svint16_t s16, + svint32_t s32, svint64_t s64) +{ + u8 = shift <1> (pg, u8); + s8 = shift <0> (pg, s8); + s8 = shift <1> (pg, s8); + s8 = shift <8> (pg, s8); + s8 = shift <9> (pg, s8); + s16 = shift <0> (pg, s16); + s16 = shift <1> (pg, s16); + s16 = shift <16> (pg, s16); + s16 = shift <17> (pg, s16); + s32 = shift <0> (pg, s32); + s32 = shift <1> (pg, s32); + s32 = shift <32> (pg, s32); + s32 = shift <33> (pg, s32); + s64 = shift <0> (pg, s64); + s64 = shift <1> (pg, s64); + s64 = shift <64> (pg, s64); + s64 = shift <65> (pg, s64); + + s8 = shift2 (pg, s8, 1); +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/cntb_pat.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/cntb_pat.c new file mode 100644 index 000000000..bbc9f9010 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/cntb_pat.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ + +#include + +void +test (svpattern pat, int i) +{ + svcntb_pat (pat); /* { dg-error "argument 1 of 'svcntb_pat' must be an integer constant expression" } */ + svcntb_pat (i); /* { dg-error "invalid conversion from 'int' to 'svpattern'" } */ + /* { dg-error "argument 1 of 'svcntb_pat' must be an integer constant expression" "" { target *-*-* } .-1 } */ + svcntb_pat ((svpattern) -1); /* { dg-error "passing 4294967295 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 0); + svcntb_pat ((svpattern) 1); + svcntb_pat ((svpattern) 2); + svcntb_pat 
((svpattern) 3); + svcntb_pat ((svpattern) 4); + svcntb_pat ((svpattern) 5); + svcntb_pat ((svpattern) 6); + svcntb_pat ((svpattern) 7); + svcntb_pat ((svpattern) 8); + svcntb_pat ((svpattern) 9); + svcntb_pat ((svpattern) 10); + svcntb_pat ((svpattern) 11); + svcntb_pat ((svpattern) 12); + svcntb_pat ((svpattern) 13); + svcntb_pat ((svpattern) 14); /* { dg-error "passing 14 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 15); /* { dg-error "passing 15 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 16); /* { dg-error "passing 16 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 17); /* { dg-error "passing 17 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 18); /* { dg-error "passing 18 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 19); /* { dg-error "passing 19 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 20); /* { dg-error "passing 20 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 21); /* { dg-error "passing 21 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 22); /* { dg-error "passing 22 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 23); /* { dg-error "passing 23 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 24); /* { dg-error "passing 24 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 25); /* { dg-error "passing 25 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 26); /* { dg-error "passing 26 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 27); /* { dg-error "passing 27 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 28); /* { dg-error "passing 28 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ + svcntb_pat ((svpattern) 29); + svcntb_pat ((svpattern) 30); + svcntb_pat ((svpattern) 31); + svcntb_pat ((svpattern) 32); /* { dg-error "passing 32 to argument 1 of 'svcntb_pat', which expects a valid 'svpattern' value" } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/conversion_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/conversion_1.C new file mode 100644 index 000000000..1b939cdf7 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/conversion_1.C @@ -0,0 +1,20 @@ +/* { dg-do compile } */ + +#include + +template +struct S +{ + S(T); + operator T() const; + void *base; +}; + +void f(svbool_t pg, const S &u8a, const S &u8b, + const S &s8a) +{ + svadd_x(pg, u8a, u8b); + svadd_x(pg, u8a, 1); + svadd_x(pg, s8a, u8b); // { dg-error "no matching function for call" } + svadd_x(pg, s8a, 1); +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_1.C new file mode 100644 index 000000000..247fd85ec --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_1.C @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-Wall 
-Wextra" } */ + +#include + +void +f1 (svuint8x2_t *ptr, svbool_t pg, svuint8_t u8, svfloat64_t f64, + svuint8x2_t u8x2) +{ + *ptr = svcreate2 (u8); /* { dg-error {no matching function for call to 'svcreate2\(svuint8_t\&\)'} } */ + *ptr = svcreate2 (u8, u8, u8); /* { dg-error {no matching function for call to 'svcreate2\(svuint8_t\&, svuint8_t\&, svuint8_t\&\)'} } */ + *ptr = svcreate2 (u8x2, u8x2); /* { dg-error {no matching function for call to 'svcreate2\(svuint8x2_t\&, svuint8x2_t\&\)'} } */ + *ptr = svcreate2 (u8, f64); /* { dg-error {no matching function for call to 'svcreate2\(svuint8_t\&, svfloat64_t\&\)'} } */ + *ptr = svcreate2 (u8, pg); /* { dg-error {no matching function for call to 'svcreate2\(svuint8_t\&, svbool_t\&\)'} } */ + *ptr = svcreate2 (u8, u8); + *ptr = svcreate2 (f64, f64); /* { dg-error {cannot convert 'svfloat64x2_t' to 'svuint8x2_t' in assignment} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_2.C new file mode 100644 index 000000000..10f3231fa --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create2_2.C @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-Wall -Wextra" } */ + +#include + +void +f1 (svuint8x2_t *ptr, svbool_t pg, svuint8_t u8, svfloat64_t f64, + svuint8x2_t u8x2) +{ + *ptr = svcreate2_u8 (u8); /* { dg-error {too few arguments to function '[^']*'} } */ + *ptr = svcreate2_u8 (u8, u8, u8); /* { dg-error {too many arguments to function '[^']*'} } */ + *ptr = svcreate2_u8 (u8x2, u8x2); /* { dg-error {cannot convert 'svuint8x2_t' to 'svuint8_t'} } */ + *ptr = svcreate2_u8 (u8, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svuint8_t'} } */ + *ptr = svcreate2_u8 (pg, u8); /* { dg-error {cannot convert 'svbool_t' to 'svuint8_t'} } */ + *ptr = svcreate2_u8 (u8, u8); + *ptr = svcreate2_f64 (f64, f64); /* { dg-error {cannot convert 'svfloat64x2_t' to 'svuint8x2_t' in assignment} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_1.C new file mode 100644 index 000000000..ff013634d --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_1.C @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-Wall -Wextra" } */ + +#include + +void +f1 (svfloat16x3_t *ptr, svbool_t pg, svfloat16_t f16, svfloat64_t f64, + svfloat16x3_t f16x3) +{ + *ptr = svcreate3 (f16); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&\)'} } */ + *ptr = svcreate3 (f16, f16); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&, svfloat16_t\&\)'} } */ + *ptr = svcreate3 (f16, f16, f16, f16); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&, svfloat16_t\&, svfloat16_t\&, svfloat16_t\&\)'} } */ + *ptr = svcreate3 (f16x3, f16x3, f16x3); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16x3_t\&, svfloat16x3_t\&, svfloat16x3_t\&\)'} } */ + *ptr = svcreate3 (f16, f16, f64); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&, svfloat16_t\&, svfloat64_t\&\)'} } */ + *ptr = svcreate3 (f16, pg, f16); /* { dg-error {no matching function for call to 'svcreate3\(svfloat16_t\&, svbool_t\&, svfloat16_t\&\)'} } */ + *ptr = svcreate3 (f16, f16, f16); + *ptr = svcreate3 (f64, f64, f64); /* { dg-error {cannot convert 'svfloat64x3_t' to 'svfloat16x3_t' in assignment} } */ +} diff --git 
a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_2.C new file mode 100644 index 000000000..07a72b1e2 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create3_2.C @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-Wall -Wextra" } */ + +#include + +void +f1 (svfloat16x3_t *ptr, svbool_t pg, svfloat16_t f16, svfloat64_t f64, + svfloat16x3_t f16x3) +{ + *ptr = svcreate3_f16 (f16); /* { dg-error {too few arguments to function '[^']*'} } */ + *ptr = svcreate3_f16 (f16, f16); /* { dg-error {too few arguments to function '[^']*'} } */ + *ptr = svcreate3_f16 (f16, f16, f16, f16); /* { dg-error {too many arguments to function '[^']*'} } */ + *ptr = svcreate3_f16 (f16x3, f16x3, f16x3); /* { dg-error {cannot convert 'svfloat16x3_t' to 'svfloat16_t'} } */ + *ptr = svcreate3_f16 (f16, f16, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svfloat16_t'} } */ + *ptr = svcreate3_f16 (f16, pg, f16); /* { dg-error {cannot convert 'svbool_t' to 'svfloat16_t'} } */ + *ptr = svcreate3_f16 (f16, f16, f16); + *ptr = svcreate3_f64 (f64, f64, f64); /* { dg-error {cannot convert 'svfloat64x3_t' to 'svfloat16x3_t' in assignment} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_1.C new file mode 100644 index 000000000..2785d9011 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_1.C @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-Wall -Wextra" } */ + +#include + +void +f1 (svint32x4_t *ptr, svbool_t pg, svint32_t s32, svfloat64_t f64, + svint32x4_t s32x4) +{ + *ptr = svcreate4 (s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&\)'} } */ + *ptr = svcreate4 (s32, s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svint32_t\&\)'} } */ + *ptr = svcreate4 (s32, s32, s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svint32_t\&, svint32_t\&\)'} } */ + *ptr = svcreate4 (s32, s32, s32, s32, s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svint32_t\&, svint32_t\&, svint32_t\&, svint32_t\&\)'} } */ + *ptr = svcreate4 (s32x4, s32x4, s32x4, s32x4); /* { dg-error {no matching function for call to 'svcreate4\(svint32x4_t\&, svint32x4_t\&, svint32x4_t\&, svint32x4_t\&\)'} } */ + *ptr = svcreate4 (s32, s32, s32, f64); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svint32_t\&, svint32_t\&, svfloat64_t\&\)'} } */ + *ptr = svcreate4 (s32, pg, s32, s32); /* { dg-error {no matching function for call to 'svcreate4\(svint32_t\&, svbool_t\&, svint32_t\&, svint32_t\&\)'} } */ + *ptr = svcreate4 (s32, s32, s32, s32); + *ptr = svcreate4 (f64, f64, f64, f64); /* { dg-error {cannot convert 'svfloat64x4_t' to 'svint32x4_t' in assignment} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_2.C new file mode 100644 index 000000000..68f21a1d4 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/create4_2.C @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-Wall -Wextra" } */ + +#include + +void +f1 (svint32x4_t *ptr, svbool_t pg, svint32_t s32, svfloat64_t f64, + svint32x4_t s32x4) +{ + *ptr = svcreate4_s32 (s32); /* { dg-error {too few arguments to function '[^']*'} } */ + *ptr = 
svcreate4_s32 (s32, s32); /* { dg-error {too few arguments to function '[^']*'} } */ + *ptr = svcreate4_s32 (s32, s32, s32); /* { dg-error {too few arguments to function '[^']*'} } */ + *ptr = svcreate4_s32 (s32, s32, s32, s32, s32); /* { dg-error {too many arguments to function '[^']*'} } */ + *ptr = svcreate4_s32 (s32x4, s32x4, s32x4, s32x4); /* { dg-error {cannot convert 'svint32x4_t' to 'svint32_t'} } */ + *ptr = svcreate4_s32 (s32, s32, s32, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svint32_t'} } */ + *ptr = svcreate4_s32 (s32, pg, s32, s32); /* { dg-error {cannot convert 'svbool_t' to 'svint32_t'} } */ + *ptr = svcreate4_s32 (s32, s32, s32, s32); + *ptr = svcreate4_f64 (f64, f64, f64, f64); /* { dg-error {cannot convert 'svfloat64x4_t' to 'svint32x4_t' in assignment} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.C new file mode 100644 index 000000000..93397c82f --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.C @@ -0,0 +1,9 @@ +/* { dg-do compile } */ + +#include "dot_1.h" + +svuint32_t +f1 (svuint32_t x, svint8_t y, svuint8_t z) +{ + return svdot_u32 (x, y, z); /* { dg-error "cannot convert 'svint8_t' to 'svuint8_t'" } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.h b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.h new file mode 100644 index 000000000..aef02f20b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_1.h @@ -0,0 +1,2 @@ +#pragma GCC system_header +#pragma GCC aarch64 "arm_sve.h" /* { dg-message "initializing argument 2" } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.C new file mode 100644 index 000000000..2084ed828 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.C @@ -0,0 +1,12 @@ +/* { dg-do compile } */ + +#include "dot_2.h" + +void +f1 (svuint32_t x, svint8_t y, svuint8_t z) +{ + svdot (x, y); /* { dg-error {no matching function for call to 'svdot\(svuint32_t&, svint8_t&\)'} } */ + svdot (x, x, x); /* { dg-error {no matching function for call to 'svdot\(svuint32_t&, svuint32_t&, svuint32_t&\)'} } */ + svdot (1, z, z); /* { dg-error {no matching function for call to 'svdot\(int, svuint8_t&, svuint8_t&\)'} } */ + svdot (x, y, z); /* { dg-error {no matching function for call to 'svdot\(svuint32_t&, svint8_t&, svuint8_t&\)'} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.h b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.h new file mode 100644 index 000000000..3e4a9c794 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/dot_2.h @@ -0,0 +1,7 @@ +#pragma GCC system_header +#pragma GCC aarch64 "arm_sve.h" +/* { dg-message {note: candidate: 'svuint32_t svdot\(svuint32_t, svuint8_t, svuint8_t\)'} "" { target *-*-* } 3 } */ +/* { dg-message {note: *candidate expects 3 arguments, 2 provided} "" { target *-*-* } 3 } */ +/* { dg-message {note: *no known conversion for argument 2 from 'svuint32_t' to 'svuint8_t'} "" { target *-*-* } 3 } */ +/* { dg-message {note: *no known conversion for argument 1 from 'int' to 'svuint32_t'} "" { target *-*-* } 3 } */ +/* { dg-message {note: *no known conversion for argument 2 from 'svint8_t' to 'svuint8_t'} "" { target *-*-* } 3 } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_1.c 
b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_1.c new file mode 100644 index 000000000..8f18810c0 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_1.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svadd_n_u8_x; /* { dg-message "note: previous declaration 'int svadd_n_u8_x'" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svuint8_t svadd_n_u8_x\(svbool_t, svuint8_t, [^)\n]*\)' redeclared as different kind of entity} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_2.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_2.c new file mode 100644 index 000000000..a67f9f756 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_2.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svadd_n_u8_x = 1; /* { dg-message "note: previous declaration 'int svadd_n_u8_x'" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svuint8_t svadd_n_u8_x\(svbool_t, svuint8_t, [^)\n]*\)' redeclared as different kind of entity} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_3.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_3.c new file mode 100644 index 000000000..74b820fe6 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_3.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ + +/* Although not supported, there's nothing to stop the user overloading + the sv* functions. */ +extern __SVInt8_t svadd_u8_x (__SVBool_t, __SVInt8_t, __SVInt8_t); + +#pragma GCC aarch64 "arm_sve.h" diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_4.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_4.c new file mode 100644 index 000000000..9591e3d01 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_4.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ + +/* Although somewhat suspect, this isn't actively wrong, and doesn't need + to be diagnosed. Any attempt to call the function before including + arm_sve.h will lead to a link failure. (Same for taking its address, + etc.) 
*/ +extern __SVUint8_t svadd_u8_x (__SVBool_t, __SVUint8_t, __SVUint8_t); + +#pragma GCC aarch64 "arm_sve.h" diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_5.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_5.c new file mode 100644 index 000000000..f87201984 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_5.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ + +__SVUint8_t +svadd_u8_x (__SVBool_t pg, __SVUint8_t x, __SVUint8_t y) +{ + return x; +} + +#pragma GCC aarch64 "arm_sve.h" + +svuint8_t +f (svbool_t pg, svuint8_t x, svuint8_t y) +{ + return svadd_u8_x (pg, x, y); +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_6.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_6.c new file mode 100644 index 000000000..a65e0d65c --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_6.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +typedef int svadd_u8_x; /* { dg-message "note: previous declaration 'typedef int svadd_u8_x'" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svuint8_t svadd_u8_x\(svbool_t, svuint8_t, svuint8_t\)' redeclared as different kind of entity} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_7.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_7.c new file mode 100644 index 000000000..1f2e4bf66 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/func_redef_7.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ + +__SVUint8_t +svadd_x (__SVBool_t pg, __SVUint8_t x, __SVUint8_t y) +{ + return x; +} + +#pragma GCC aarch64 "arm_sve.h" + +svuint8_t +f (svbool_t pg, svuint8_t x, svuint8_t y) +{ + return svadd_x (pg, x, y); +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_1.C new file mode 100644 index 000000000..8d6bb2307 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_1.C @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svuint8x3_t u8x3, int x) +{ + const int one = 1; + svfloat64_t f64; + + u8 = svget2 (u8x2); /* { dg-error {no matching function for call to 'svget2\(svuint8x2_t\&\)'} } */ + u8 = svget2 (u8x2, 1, 2); /* { dg-error {no matching function for call to 'svget2\(svuint8x2_t\&, int, int\)'} } */ + u8 = svget2 (u8, 0); /* { dg-error {no matching function for call to 'svget2\(svuint8_t\&, int\)'} } */ + u8 = svget2 (u8x3, 0); /* { dg-error {no matching function for call to 'svget2\(svuint8x3_t\&, int\)'} } */ + u8 = svget2 (pg, 0); /* { dg-error {no matching function for call to 'svget2\(svbool_t\&, int\)'} } */ + u8 = svget2 (u8x2, x); /* { dg-error "argument 2 of 'svget2' must be an integer constant expression" } */ + u8 = svget2 (u8x2, 0); + f64 = svget2 (u8x2, 0); /* { dg-error "cannot convert 'svuint8_t' to 'svfloat64_t' in assignment" } */ + u8 = svget2 (u8x2, 1); + u8 = svget2 (u8x2, 2); /* { dg-error {passing 2 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, 3); /* { dg-error {passing 3 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, 4); /* { dg-error {passing 4 
to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, 5); /* { dg-error {passing 5 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, one); + u8 = svget2 (u8x2, 3 - 2); + u8 = svget2 (u8x2, 1.0); + u8 = svget2 (u8x2, const_sub (5, 4)); + u8 = svget2 (u8x2, const_sub (6, 4)); /* { dg-error {passing 2 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, const_sub (7, 4)); /* { dg-error {passing 3 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, add (0, 0)); /* { dg-error "argument 2 of 'svget2' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_2.C new file mode 100644 index 000000000..9c7674be1 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get2_2.C @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svuint8x3_t u8x3, int x) +{ + const int one = 1; + svfloat64_t f64; + + u8 = svget2_u8 (u8x2); /* { dg-error {too few arguments to function '[^']*'} } */ + u8 = svget2_u8 (u8x2, 1, 2); /* { dg-error {too many arguments to function '[^']*'} } */ + u8 = svget2_u8 (u8, 0); /* { dg-error {cannot convert 'svuint8_t' to 'svuint8x2_t'} } */ + u8 = svget2_u8 (u8x3, 0); /* { dg-error {cannot convert 'svuint8x3_t' to 'svuint8x2_t'} } */ + u8 = svget2_u8 (pg, 0); /* { dg-error {cannot convert 'svbool_t' to 'svuint8x2_t'} } */ + u8 = svget2_u8 (u8x2, x); /* { dg-error "argument 2 of 'svget2_u8' must be an integer constant expression" } */ + u8 = svget2_u8 (u8x2, 0); + f64 = svget2_u8 (u8x2, 0); /* { dg-error "cannot convert 'svuint8_t' to 'svfloat64_t' in assignment" } */ + u8 = svget2_u8 (u8x2, 1); + u8 = svget2_u8 (u8x2, 2); /* { dg-error {passing 2 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, 3); /* { dg-error {passing 3 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, 4); /* { dg-error {passing 4 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, 5); /* { dg-error {passing 5 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, one); + u8 = svget2_u8 (u8x2, 3 - 2); + u8 = svget2_u8 (u8x2, 1.0); + u8 = svget2_u8 (u8x2, const_sub (5, 4)); + u8 = svget2_u8 (u8x2, const_sub (6, 4)); /* { dg-error {passing 2 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, const_sub (7, 4)); /* { dg-error {passing 3 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, const_sub (8, 4)); /* { 
dg-error {passing 4 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, add (0, 0)); /* { dg-error "argument 2 of 'svget2_u8' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_1.C new file mode 100644 index 000000000..bd8808a8b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_1.C @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat16x4_t f16x4, + int x) +{ + const int one = 1; + svfloat64_t f64; + + f16 = svget3 (f16x3); /* { dg-error {no matching function for call to 'svget3\(svfloat16x3_t\&\)'} } */ + f16 = svget3 (f16x3, 1, 2); /* { dg-error {no matching function for call to 'svget3\(svfloat16x3_t\&, int, int\)'} } */ + f16 = svget3 (f16, 0); /* { dg-error {no matching function for call to 'svget3\(svfloat16_t\&, int\)'} } */ + f16 = svget3 (f16x4, 0); /* { dg-error {no matching function for call to 'svget3\(svfloat16x4_t\&, int\)'} } */ + f16 = svget3 (pg, 0); /* { dg-error {no matching function for call to 'svget3\(svbool_t\&, int\)'} } */ + f16 = svget3 (f16x3, x); /* { dg-error "argument 2 of 'svget3' must be an integer constant expression" } */ + f16 = svget3 (f16x3, 0); + f64 = svget3 (f16x3, 0); /* { dg-error "cannot convert 'svfloat16_t' to 'svfloat64_t' in assignment" } */ + f16 = svget3 (f16x3, 1); + f16 = svget3 (f16x3, 2); + f16 = svget3 (f16x3, 3); /* { dg-error {passing 3 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, 4); /* { dg-error {passing 4 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, 5); /* { dg-error {passing 5 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, one); + f16 = svget3 (f16x3, 3 - 2); + f16 = svget3 (f16x3, 1.0); + f16 = svget3 (f16x3, const_sub (5, 4)); + f16 = svget3 (f16x3, const_sub (6, 4)); + f16 = svget3 (f16x3, const_sub (7, 4)); /* { dg-error {passing 3 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, add (0, 0)); /* { dg-error "argument 2 of 'svget3' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_2.C new file mode 100644 index 000000000..d526947d1 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get3_2.C @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat16x4_t f16x4, + int x) +{ + const int one = 1; + svfloat64_t f64; + + f16 
= svget3_f16 (f16x3); /* { dg-error {too few arguments to function '[^']*'} } */ + f16 = svget3_f16 (f16x3, 1, 2); /* { dg-error {too many arguments to function '[^']*'} } */ + f16 = svget3_f16 (f16, 0); /* { dg-error {cannot convert 'svfloat16_t' to 'svfloat16x3_t'} } */ + f16 = svget3_f16 (f16x4, 0); /* { dg-error {cannot convert 'svfloat16x4_t' to 'svfloat16x3_t'} } */ + f16 = svget3_f16 (pg, 0); /* { dg-error {cannot convert 'svbool_t' to 'svfloat16x3_t'} } */ + f16 = svget3_f16 (f16x3, x); /* { dg-error "argument 2 of 'svget3_f16' must be an integer constant expression" } */ + f16 = svget3_f16 (f16x3, 0); + f64 = svget3_f16 (f16x3, 0); /* { dg-error "cannot convert 'svfloat16_t' to 'svfloat64_t' in assignment" } */ + f16 = svget3_f16 (f16x3, 1); + f16 = svget3_f16 (f16x3, 2); + f16 = svget3_f16 (f16x3, 3); /* { dg-error {passing 3 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, 4); /* { dg-error {passing 4 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, 5); /* { dg-error {passing 5 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, one); + f16 = svget3_f16 (f16x3, 3 - 2); + f16 = svget3_f16 (f16x3, 1.0); + f16 = svget3_f16 (f16x3, const_sub (5, 4)); + f16 = svget3_f16 (f16x3, const_sub (6, 4)); + f16 = svget3_f16 (f16x3, const_sub (7, 4)); /* { dg-error {passing 3 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, add (0, 0)); /* { dg-error "argument 2 of 'svget3_f16' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_1.C new file mode 100644 index 000000000..19853dece --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_1.C @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svint32x2_t s32x2, int x) +{ + const int one = 1; + svfloat64_t f64; + + s32 = svget4 (s32x4); /* { dg-error {no matching function for call to 'svget4\(svint32x4_t\&\)'} } */ + s32 = svget4 (s32x4, 1, 2); /* { dg-error {no matching function for call to 'svget4\(svint32x4_t\&, int, int\)'} } */ + s32 = svget4 (s32, 0); /* { dg-error {no matching function for call to 'svget4\(svint32_t\&, int\)'} } */ + s32 = svget4 (s32x2, 0); /* { dg-error {no matching function for call to 'svget4\(svint32x2_t\&, int\)'} } */ + s32 = svget4 (pg, 0); /* { dg-error {no matching function for call to 'svget4\(svbool_t\&, int\)'} } */ + s32 = svget4 (s32x4, x); /* { dg-error "argument 2 of 'svget4' must be an integer constant expression" } */ + s32 = svget4 (s32x4, 0); + f64 = svget4 (s32x4, 0); /* { dg-error "cannot convert 'svint32_t' to 'svfloat64_t' in assignment" } */ + s32 = svget4 (s32x4, 1); + s32 = svget4 (s32x4, 2); + s32 = svget4 (s32x4, 3); + s32 = svget4 (s32x4, 4); /* { 
dg-error {passing 4 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ + s32 = svget4 (s32x4, 5); /* { dg-error {passing 5 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ + s32 = svget4 (s32x4, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ + s32 = svget4 (s32x4, one); + s32 = svget4 (s32x4, 3 - 2); + s32 = svget4 (s32x4, 1.0); + s32 = svget4 (s32x4, const_sub (5, 4)); + s32 = svget4 (s32x4, const_sub (6, 4)); + s32 = svget4 (s32x4, const_sub (7, 4)); + s32 = svget4 (s32x4, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ + s32 = svget4 (s32x4, add (0, 0)); /* { dg-error "argument 2 of 'svget4' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_2.C new file mode 100644 index 000000000..7a0979225 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/get4_2.C @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svint32x2_t s32x2, int x) +{ + const int one = 1; + svfloat64_t f64; + + s32 = svget4_s32 (s32x4); /* { dg-error {too few arguments to function '[^']*'} } */ + s32 = svget4_s32 (s32x4, 1, 2); /* { dg-error {too many arguments to function '[^']*'} } */ + s32 = svget4_s32 (s32, 0); /* { dg-error {cannot convert 'svint32_t' to 'svint32x4_t'} } */ + s32 = svget4_s32 (s32x2, 0); /* { dg-error {cannot convert 'svint32x2_t' to 'svint32x4_t'} } */ + s32 = svget4_s32 (pg, 0); /* { dg-error {cannot convert 'svbool_t' to 'svint32x4_t'} } */ + s32 = svget4_s32 (s32x4, x); /* { dg-error "argument 2 of 'svget4_s32' must be an integer constant expression" } */ + s32 = svget4_s32 (s32x4, 0); + f64 = svget4_s32 (s32x4, 0); /* { dg-error "cannot convert 'svint32_t' to 'svfloat64_t' in assignment" } */ + s32 = svget4_s32 (s32x4, 1); + s32 = svget4_s32 (s32x4, 2); + s32 = svget4_s32 (s32x4, 3); + s32 = svget4_s32 (s32x4, 4); /* { dg-error {passing 4 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ + s32 = svget4_s32 (s32x4, 5); /* { dg-error {passing 5 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ + s32 = svget4_s32 (s32x4, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ + s32 = svget4_s32 (s32x4, one); + s32 = svget4_s32 (s32x4, 3 - 2); + s32 = svget4_s32 (s32x4, 1.0); + s32 = svget4_s32 (s32x4, const_sub (5, 4)); + s32 = svget4_s32 (s32x4, const_sub (6, 4)); + s32 = svget4_s32 (s32x4, const_sub (7, 4)); + s32 = svget4_s32 (s32x4, const_sub (8, 4)); /* { dg-error {passing 4 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ + s32 = svget4_s32 (s32x4, add (0, 0)); /* { dg-error "argument 2 of 'svget4_s32' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_1.C new file mode 100644 index 000000000..fb31e947d --- /dev/null +++ 
b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_1.C @@ -0,0 +1,12 @@ +/* { dg-do compile } */ + +#include + +svuint8_t +f1 (svbool_t pg, svuint8_t x, svint8_t w, svuint64_t y) +{ + svlsl_wide_u8_x (pg, x, x); /* { dg-error "cannot convert 'svuint8_t' to 'svuint64_t'" } */ + svlsl_wide_u8_x (pg, x); /* { dg-error {too few arguments to function 'svuint8_t svlsl_wide_u8_x\(svbool_t, svuint8_t, svuint64_t\)'} } */ + svlsl_wide_u8_x (pg, x, y, x); /* { dg-error {too many arguments to function 'svuint8_t svlsl_wide_u8_x\(svbool_t, svuint8_t, svuint64_t\)'} } */ + return svlsl_wide_s8_x (pg, w, y); /* { dg-error {cannot convert 'svint8_t' to 'svuint8_t' in return} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_2.C new file mode 100644 index 000000000..95d341dc5 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/lsl_wide_2.C @@ -0,0 +1,14 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" + +void +f1 (svbool_t pg, svuint8_t x, svuint64_t y) +{ + svlsl_wide_x (pg, x); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, svuint8_t&\)'} } */ + svlsl_wide_x (pg, x, x, x, x); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, svuint8_t&, svuint8_t&, svuint8_t&, svuint8_t&\)'} } */ + svlsl_wide_x (x, x, y); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svuint8_t&, svuint8_t&, svuint64_t&\)'} } */ + svlsl_wide_x (pg, 1, y); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, int, svuint64_t&\)'} } */ + svlsl_wide_x (pg, x, x); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, svuint8_t&, svuint8_t&\)'} } */ + svlsl_wide_x (pg, y, y); /* { dg-error {no matching function for call to 'svlsl_wide_x\(svbool_t&, svuint64_t&, svuint64_t&\)'} } */ +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C new file mode 100644 index 000000000..1a1712485 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_1.C @@ -0,0 +1,31 @@ +/* { dg-do compile } */ + +#include + +void f1(svbool_t) {} +void f2(svint8_t) {} +void f3(svint16_t) {} +void f4(svint32_t) {} +void f5(svint64_t) {} +void f6(svuint8_t) {} +void f7(svuint16_t) {} +void f8(svuint32_t) {} +void f9(svuint64_t) {} +void f10(svfloat16_t) {} +void f11(svfloat32_t) {} +void f12(svfloat64_t) {} +void f13(svbfloat16_t) {} + +/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */ +/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */ +/* { dg-final { scan-assembler "_Z2f311__SVInt16_t:" } } */ +/* { dg-final { scan-assembler "_Z2f411__SVInt32_t:" } } */ +/* { dg-final { scan-assembler "_Z2f511__SVInt64_t:" } } */ +/* { dg-final { scan-assembler "_Z2f611__SVUint8_t:" } } */ +/* { dg-final { scan-assembler "_Z2f712__SVUint16_t:" } } */ +/* { dg-final { scan-assembler "_Z2f812__SVUint32_t:" } } */ +/* { dg-final { scan-assembler "_Z2f912__SVUint64_t:" } } */ +/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */ +/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */ +/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */ +/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C new file mode 100644 index 
000000000..6792b8a31
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_2.C
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+
+void f1(__SVBool_t) {}
+void f2(__SVInt8_t) {}
+void f3(__SVInt16_t) {}
+void f4(__SVInt32_t) {}
+void f5(__SVInt64_t) {}
+void f6(__SVUint8_t) {}
+void f7(__SVUint16_t) {}
+void f8(__SVUint32_t) {}
+void f9(__SVUint64_t) {}
+void f10(__SVFloat16_t) {}
+void f11(__SVFloat32_t) {}
+void f12(__SVFloat64_t) {}
+void f13(__SVBfloat16_t) {}
+
+/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */
+/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */
+/* { dg-final { scan-assembler "_Z2f311__SVInt16_t:" } } */
+/* { dg-final { scan-assembler "_Z2f411__SVInt32_t:" } } */
+/* { dg-final { scan-assembler "_Z2f511__SVInt64_t:" } } */
+/* { dg-final { scan-assembler "_Z2f611__SVUint8_t:" } } */
+/* { dg-final { scan-assembler "_Z2f712__SVUint16_t:" } } */
+/* { dg-final { scan-assembler "_Z2f812__SVUint32_t:" } } */
+/* { dg-final { scan-assembler "_Z2f912__SVUint64_t:" } } */
+/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */
+/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */
+/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */
+/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_3.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_3.C
new file mode 100644
index 000000000..8f64f7c2e
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_3.C
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-msve-vector-bits=256" } */
+
+#include <arm_sve.h>
+
+typedef __SVInt8_t t1;
+typedef svint8_t t2;
+/* Distinct from svint8_t, but compatible with it.
*/ +typedef int8_t t3 __attribute__((vector_size(32))); + +void f1(t1) {} +void f2(t2) {} +void f3(t3) {} +void f4(t1 &a, t2 &b, t3 &c) { a = b = c; } + +/* { dg-final { scan-assembler "_Z2f110__SVInt8_t:" } } */ +/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */ +/* { dg-final { scan-assembler "_Z2f3Dv32_a:" } } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_4.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_4.C new file mode 100644 index 000000000..7cdc6cb0c --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/mangle_4.C @@ -0,0 +1,75 @@ +/* { dg-do compile } */ + +#include + +void f1(svint8x2_t) {} +void f2(svint16x2_t) {} +void f3(svint32x2_t) {} +void f4(svint64x2_t) {} +void f5(svuint8x2_t) {} +void f6(svuint16x2_t) {} +void f7(svuint32x2_t) {} +void f8(svuint64x2_t) {} +void f9(svfloat16x2_t) {} +void f10(svfloat32x2_t) {} +void f11(svfloat64x2_t) {} + +void g1(svint8x3_t) {} +void g2(svint16x3_t) {} +void g3(svint32x3_t) {} +void g4(svint64x3_t) {} +void g5(svuint8x3_t) {} +void g6(svuint16x3_t) {} +void g7(svuint32x3_t) {} +void g8(svuint64x3_t) {} +void g9(svfloat16x3_t) {} +void g10(svfloat32x3_t) {} +void g11(svfloat64x3_t) {} + +void h1(svint8x4_t) {} +void h2(svint16x4_t) {} +void h3(svint32x4_t) {} +void h4(svint64x4_t) {} +void h5(svuint8x4_t) {} +void h6(svuint16x4_t) {} +void h7(svuint32x4_t) {} +void h8(svuint64x4_t) {} +void h9(svfloat16x4_t) {} +void h10(svfloat32x4_t) {} +void h11(svfloat64x4_t) {} + +/* { dg-final { scan-assembler "_Z2f110svint8x2_t:" } } */ +/* { dg-final { scan-assembler "_Z2f211svint16x2_t:" } } */ +/* { dg-final { scan-assembler "_Z2f311svint32x2_t:" } } */ +/* { dg-final { scan-assembler "_Z2f411svint64x2_t:" } } */ +/* { dg-final { scan-assembler "_Z2f511svuint8x2_t:" } } */ +/* { dg-final { scan-assembler "_Z2f612svuint16x2_t:" } } */ +/* { dg-final { scan-assembler "_Z2f712svuint32x2_t:" } } */ +/* { dg-final { scan-assembler "_Z2f812svuint64x2_t:" } } */ +/* { dg-final { scan-assembler "_Z2f913svfloat16x2_t:" } } */ +/* { dg-final { scan-assembler "_Z3f1013svfloat32x2_t:" } } */ +/* { dg-final { scan-assembler "_Z3f1113svfloat64x2_t:" } } */ + +/* { dg-final { scan-assembler "_Z2g110svint8x3_t:" } } */ +/* { dg-final { scan-assembler "_Z2g211svint16x3_t:" } } */ +/* { dg-final { scan-assembler "_Z2g311svint32x3_t:" } } */ +/* { dg-final { scan-assembler "_Z2g411svint64x3_t:" } } */ +/* { dg-final { scan-assembler "_Z2g511svuint8x3_t:" } } */ +/* { dg-final { scan-assembler "_Z2g612svuint16x3_t:" } } */ +/* { dg-final { scan-assembler "_Z2g712svuint32x3_t:" } } */ +/* { dg-final { scan-assembler "_Z2g812svuint64x3_t:" } } */ +/* { dg-final { scan-assembler "_Z2g913svfloat16x3_t:" } } */ +/* { dg-final { scan-assembler "_Z3g1013svfloat32x3_t:" } } */ +/* { dg-final { scan-assembler "_Z3g1113svfloat64x3_t:" } } */ + +/* { dg-final { scan-assembler "_Z2h110svint8x4_t:" } } */ +/* { dg-final { scan-assembler "_Z2h211svint16x4_t:" } } */ +/* { dg-final { scan-assembler "_Z2h311svint32x4_t:" } } */ +/* { dg-final { scan-assembler "_Z2h411svint64x4_t:" } } */ +/* { dg-final { scan-assembler "_Z2h511svuint8x4_t:" } } */ +/* { dg-final { scan-assembler "_Z2h612svuint16x4_t:" } } */ +/* { dg-final { scan-assembler "_Z2h712svuint32x4_t:" } } */ +/* { dg-final { scan-assembler "_Z2h812svuint64x4_t:" } } */ +/* { dg-final { scan-assembler "_Z2h913svfloat16x4_t:" } } */ +/* { dg-final { scan-assembler "_Z3h1013svfloat32x4_t:" } } */ +/* { dg-final { scan-assembler 
"_Z3h1113svfloat64x4_t:" } } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_1.C new file mode 100644 index 000000000..80c3ad74f --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_1.C @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svint8x2_t s8x2, + svuint8x3_t u8x3, int x) +{ + const int one = 1; + svfloat64_t f64; + + u8x2 = svset2 (u8x2); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&\)'} } */ + u8x2 = svset2 (u8x2, 1); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int\)'} } */ + u8x2 = svset2 (u8x2, 1, u8, 2); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int, svuint8_t\&, int\)'} } */ + u8x2 = svset2 (u8, 0, u8); /* { dg-error {no matching function for call to 'svset2\(svuint8_t\&, int, svuint8_t\&\)'} } */ + u8x2 = svset2 (s8x2, 0, u8); /* { dg-error {no matching function for call to 'svset2\(svint8x2_t\&, int, svuint8_t\&\)'} } */ + u8x2 = svset2 (u8x3, 0, u8); /* { dg-error {no matching function for call to 'svset2\(svuint8x3_t\&, int, svuint8_t\&\)'} } */ + u8x2 = svset2 (pg, 0, u8); /* { dg-error {no matching function for call to 'svset2\(svbool_t\&, int, svuint8_t\&\)'} } */ + u8x2 = svset2 (u8x2, 0, f64); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int, svfloat64_t\&\)'} } */ + u8x2 = svset2 (u8x2, 0, u8x2); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int, svuint8x2_t\&\)'} } */ + u8x2 = svset2 (u8x2, 0, pg); /* { dg-error {no matching function for call to 'svset2\(svuint8x2_t\&, int, svbool_t\&\)'} } */ + u8x2 = svset2 (u8x2, x, u8); /* { dg-error "argument 2 of 'svset2' must be an integer constant expression" } */ + u8x2 = svset2 (u8x2, 0, u8); + s8x2 = svset2 (u8x2, 0, u8); /* { dg-error {cannot convert 'svuint8x2_t' to 'svint8x2_t' in assignment} } */ + u8x2 = svset2 (u8x2, 1, u8); + u8x2 = svset2 (u8x2, 2, u8); /* { dg-error {passing 2 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, 3, u8); /* { dg-error {passing 3 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, 4, u8); /* { dg-error {passing 4 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, 5, u8); /* { dg-error {passing 5 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, ~0U, u8); /* { dg-error {passing [^ ]* to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, one, u8); + u8x2 = svset2 (u8x2, 3 - 2, u8); + u8x2 = svset2 (u8x2, 1.0, u8); + u8x2 = svset2 (u8x2, const_sub (5, 4), u8); + u8x2 = svset2 (u8x2, const_sub (6, 4), u8); /* { dg-error {passing 2 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, const_sub (7, 4), u8); /* { dg-error {passing 3 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, const_sub (8, 4), u8); /* { dg-error {passing 4 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, add (0, 0), u8); /* { 
dg-error "argument 2 of 'svset2' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_2.C new file mode 100644 index 000000000..1433b78ba --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set2_2.C @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svint8x2_t s8x2, + svuint8x3_t u8x3, int x) +{ + const int one = 1; + svfloat64_t f64; + + u8x2 = svset2_u8 (u8x2); /* { dg-error {too few arguments to function '[^']*'} } */ + u8x2 = svset2_u8 (u8x2, 1); /* { dg-error {too few arguments to function '[^']*'} } */ + u8x2 = svset2_u8 (u8x2, 1, u8, 2); /* { dg-error {too many arguments to function '[^']*'} } */ + u8x2 = svset2_u8 (u8, 0, u8); /* { dg-error {cannot convert 'svuint8_t' to 'svuint8x2_t'} } */ + u8x2 = svset2_u8 (s8x2, 0, u8); /* { dg-error {cannot convert 'svint8x2_t' to 'svuint8x2_t'} } */ + u8x2 = svset2_u8 (u8x3, 0, u8); /* { dg-error {cannot convert 'svuint8x3_t' to 'svuint8x2_t'} } */ + u8x2 = svset2_u8 (pg, 0, u8); /* { dg-error {cannot convert 'svbool_t' to 'svuint8x2_t'} } */ + u8x2 = svset2_u8 (u8x2, 0, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svuint8_t'} } */ + u8x2 = svset2_u8 (u8x2, 0, u8x2); /* { dg-error {cannot convert 'svuint8x2_t' to 'svuint8_t'} } */ + u8x2 = svset2_u8 (u8x2, 0, pg); /* { dg-error {cannot convert 'svbool_t' to 'svuint8_t'} } */ + u8x2 = svset2_u8 (u8x2, x, u8); /* { dg-error "argument 2 of 'svset2_u8' must be an integer constant expression" } */ + u8x2 = svset2_u8 (u8x2, 0, u8); + s8x2 = svset2_u8 (u8x2, 0, u8); /* { dg-error {cannot convert 'svuint8x2_t' to 'svint8x2_t' in assignment} } */ + u8x2 = svset2_u8 (u8x2, 1, u8); + u8x2 = svset2_u8 (u8x2, 2, u8); /* { dg-error {passing 2 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, 3, u8); /* { dg-error {passing 3 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, 4, u8); /* { dg-error {passing 4 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, 5, u8); /* { dg-error {passing 5 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, ~0U, u8); /* { dg-error {passing [^ ]* to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, one, u8); + u8x2 = svset2_u8 (u8x2, 3 - 2, u8); + u8x2 = svset2_u8 (u8x2, 1.0, u8); + u8x2 = svset2_u8 (u8x2, const_sub (5, 4), u8); + u8x2 = svset2_u8 (u8x2, const_sub (6, 4), u8); /* { dg-error {passing 2 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, const_sub (7, 4), u8); /* { dg-error {passing 3 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, const_sub (8, 4), u8); /* { dg-error {passing 4 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, add (0, 0), u8); /* { dg-error "argument 2 of 'svset2_u8' must be an integer constant expression" } */ + + return f64; +} diff --git 
a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_1.C new file mode 100644 index 000000000..9bb4f7a04 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_1.C @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svuint16x3_t u16x3, + svfloat16x4_t f16x4, int x) +{ + const int one = 1; + svfloat64_t f64; + + f16x3 = svset3 (f16x3); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&\)'} } */ + f16x3 = svset3 (f16x3, 1); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int\)'} } */ + f16x3 = svset3 (f16x3, 1, f16, 2); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int, svfloat16_t\&, int\)'} } */ + f16x3 = svset3 (f16, 0, f16); /* { dg-error {no matching function for call to 'svset3\(svfloat16_t\&, int, svfloat16_t\&\)'} } */ + f16x3 = svset3 (u16x3, 0, f16); /* { dg-error {no matching function for call to 'svset3\(svuint16x3_t\&, int, svfloat16_t\&\)'} } */ + f16x3 = svset3 (f16x4, 0, f16); /* { dg-error {no matching function for call to 'svset3\(svfloat16x4_t\&, int, svfloat16_t\&\)'} } */ + f16x3 = svset3 (pg, 0, f16); /* { dg-error {no matching function for call to 'svset3\(svbool_t\&, int, svfloat16_t\&\)'} } */ + f16x3 = svset3 (f16x3, 0, f64); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int, svfloat64_t\&\)'} } */ + f16x3 = svset3 (f16x3, 0, f16x3); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int, svfloat16x3_t\&\)'} } */ + f16x3 = svset3 (f16x3, 0, pg); /* { dg-error {no matching function for call to 'svset3\(svfloat16x3_t\&, int, svbool_t\&\)'} } */ + f16x3 = svset3 (f16x3, x, f16); /* { dg-error "argument 2 of 'svset3' must be an integer constant expression" } */ + f16x3 = svset3 (f16x3, 0, f16); + u16x3 = svset3 (f16x3, 0, f16); /* { dg-error {cannot convert 'svfloat16x3_t' to 'svuint16x3_t' in assignment} } */ + f16x3 = svset3 (f16x3, 1, f16); + f16x3 = svset3 (f16x3, 2, f16); + f16x3 = svset3 (f16x3, 3, f16); /* { dg-error {passing 3 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, 4, f16); /* { dg-error {passing 4 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, 5, f16); /* { dg-error {passing 5 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, ~0U, f16); /* { dg-error {passing [^ ]* to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, one, f16); + f16x3 = svset3 (f16x3, 3 - 2, f16); + f16x3 = svset3 (f16x3, 1.0, f16); + f16x3 = svset3 (f16x3, const_sub (5, 4), f16); + f16x3 = svset3 (f16x3, const_sub (6, 4), f16); + f16x3 = svset3 (f16x3, const_sub (7, 4), f16); /* { dg-error {passing 3 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, const_sub (8, 4), f16); /* { dg-error {passing 4 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, add (0, 0), f16); /* { dg-error "argument 2 of 'svset3' must be an integer constant expression" } */ + + return f64; +} diff --git 
a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_2.C new file mode 100644 index 000000000..0bb604924 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set3_2.C @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svuint16x3_t u16x3, + svfloat16x4_t f16x4, int x) +{ + const int one = 1; + svfloat64_t f64; + + f16x3 = svset3_f16 (f16x3); /* { dg-error {too few arguments to function '[^']*'} } */ + f16x3 = svset3_f16 (f16x3, 1); /* { dg-error {too few arguments to function '[^']*'} } */ + f16x3 = svset3_f16 (f16x3, 1, f16, 2); /* { dg-error {too many arguments to function '[^']*'} } */ + f16x3 = svset3_f16 (f16, 0, f16); /* { dg-error {cannot convert 'svfloat16_t' to 'svfloat16x3_t'} } */ + f16x3 = svset3_f16 (u16x3, 0, f16); /* { dg-error {cannot convert 'svuint16x3_t' to 'svfloat16x3_t'} } */ + f16x3 = svset3_f16 (f16x4, 0, f16); /* { dg-error {cannot convert 'svfloat16x4_t' to 'svfloat16x3_t'} } */ + f16x3 = svset3_f16 (pg, 0, f16); /* { dg-error {cannot convert 'svbool_t' to 'svfloat16x3_t'} } */ + f16x3 = svset3_f16 (f16x3, 0, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svfloat16_t'} } */ + f16x3 = svset3_f16 (f16x3, 0, f16x3); /* { dg-error {cannot convert 'svfloat16x3_t' to 'svfloat16_t'} } */ + f16x3 = svset3_f16 (f16x3, 0, pg); /* { dg-error {cannot convert 'svbool_t' to 'svfloat16_t'} } */ + f16x3 = svset3_f16 (f16x3, x, f16); /* { dg-error "argument 2 of 'svset3_f16' must be an integer constant expression" } */ + f16x3 = svset3_f16 (f16x3, 0, f16); + u16x3 = svset3_f16 (f16x3, 0, f16); /* { dg-error {cannot convert 'svfloat16x3_t' to 'svuint16x3_t' in assignment} } */ + f16x3 = svset3_f16 (f16x3, 1, f16); + f16x3 = svset3_f16 (f16x3, 2, f16); + f16x3 = svset3_f16 (f16x3, 3, f16); /* { dg-error {passing 3 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, 4, f16); /* { dg-error {passing 4 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, 5, f16); /* { dg-error {passing 5 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, ~0U, f16); /* { dg-error {passing [^ ]* to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, one, f16); + f16x3 = svset3_f16 (f16x3, 3 - 2, f16); + f16x3 = svset3_f16 (f16x3, 1.0, f16); + f16x3 = svset3_f16 (f16x3, const_sub (5, 4), f16); + f16x3 = svset3_f16 (f16x3, const_sub (6, 4), f16); + f16x3 = svset3_f16 (f16x3, const_sub (7, 4), f16); /* { dg-error {passing 3 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, const_sub (8, 4), f16); /* { dg-error {passing 4 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, add (0, 0), f16); /* { dg-error "argument 2 of 'svset3_f16' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_1.C new file mode 100644 index 000000000..dc5dae872 --- /dev/null +++ 
b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_1.C @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } +uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svfloat32x4_t f32x4, + svint32x2_t s32x2, int x) +{ + const int one = 1; + svfloat64_t f64; + + s32x4 = svset4 (s32x4); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&\)'} } */ + s32x4 = svset4 (s32x4, 1); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int\)'} } */ + s32x4 = svset4 (s32x4, 1, s32, 2); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int, svint32_t\&, int\)'} } */ + s32x4 = svset4 (s32, 0, s32); /* { dg-error {no matching function for call to 'svset4\(svint32_t\&, int, svint32_t\&\)'} } */ + s32x4 = svset4 (f32x4, 0, s32); /* { dg-error {no matching function for call to 'svset4\(svfloat32x4_t\&, int, svint32_t\&\)'} } */ + s32x4 = svset4 (s32x2, 0, s32); /* { dg-error {no matching function for call to 'svset4\(svint32x2_t\&, int, svint32_t\&\)'} } */ + s32x4 = svset4 (pg, 0, s32); /* { dg-error {no matching function for call to 'svset4\(svbool_t\&, int, svint32_t\&\)'} } */ + s32x4 = svset4 (s32x4, 0, f64); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int, svfloat64_t\&\)'} } */ + s32x4 = svset4 (s32x4, 0, s32x4); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int, svint32x4_t\&\)'} } */ + s32x4 = svset4 (s32x4, 0, pg); /* { dg-error {no matching function for call to 'svset4\(svint32x4_t\&, int, svbool_t\&\)'} } */ + s32x4 = svset4 (s32x4, x, s32); /* { dg-error "argument 2 of 'svset4' must be an integer constant expression" } */ + s32x4 = svset4 (s32x4, 0, s32); + f32x4 = svset4 (s32x4, 0, s32); /* { dg-error {cannot convert 'svint32x4_t' to 'svfloat32x4_t' in assignment} } */ + s32x4 = svset4 (s32x4, 1, s32); + s32x4 = svset4 (s32x4, 2, s32); + s32x4 = svset4 (s32x4, 3, s32); + s32x4 = svset4 (s32x4, 4, s32); /* { dg-error {passing 4 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4 (s32x4, 5, s32); /* { dg-error {passing 5 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4 (s32x4, ~0U, s32); /* { dg-error {passing [^ ]* to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4 (s32x4, one, s32); + s32x4 = svset4 (s32x4, 3 - 2, s32); + s32x4 = svset4 (s32x4, 1.0, s32); + s32x4 = svset4 (s32x4, const_sub (5, 4), s32); + s32x4 = svset4 (s32x4, const_sub (6, 4), s32); + s32x4 = svset4 (s32x4, const_sub (7, 4), s32); + s32x4 = svset4 (s32x4, const_sub (8, 4), s32); /* { dg-error {passing 4 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4 (s32x4, add (0, 0), s32); /* { dg-error "argument 2 of 'svset4' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_2.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_2.C new file mode 100644 index 000000000..762a6db74 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/set4_2.C @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c++11 -Wall -Wextra" } */ + +#include + +constexpr uint64_t const_sub (uint64_t a, uint64_t b) { return a - b; } 
+uint64_t add (uint64_t a, uint64_t b) { return a + b; } + +svfloat64_t +f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svfloat32x4_t f32x4, + svint32x2_t s32x2, int x) +{ + const int one = 1; + svfloat64_t f64; + + s32x4 = svset4_s32 (s32x4); /* { dg-error {too few arguments to function '[^']*'} } */ + s32x4 = svset4_s32 (s32x4, 1); /* { dg-error {too few arguments to function '[^']*'} } */ + s32x4 = svset4_s32 (s32x4, 1, s32, 2); /* { dg-error {too many arguments to function '[^']*'} } */ + s32x4 = svset4_s32 (s32, 0, s32); /* { dg-error {cannot convert 'svint32_t' to 'svint32x4_t'} } */ + s32x4 = svset4_s32 (f32x4, 0, s32); /* { dg-error {cannot convert 'svfloat32x4_t' to 'svint32x4_t'} } */ + s32x4 = svset4_s32 (s32x2, 0, s32); /* { dg-error {cannot convert 'svint32x2_t' to 'svint32x4_t'} } */ + s32x4 = svset4_s32 (pg, 0, s32); /* { dg-error {cannot convert 'svbool_t' to 'svint32x4_t'} } */ + s32x4 = svset4_s32 (s32x4, 0, f64); /* { dg-error {cannot convert 'svfloat64_t' to 'svint32_t'} } */ + s32x4 = svset4_s32 (s32x4, 0, s32x4); /* { dg-error {cannot convert 'svint32x4_t' to 'svint32_t'} } */ + s32x4 = svset4_s32 (s32x4, 0, pg); /* { dg-error {cannot convert 'svbool_t' to 'svint32_t'} } */ + s32x4 = svset4_s32 (s32x4, x, s32); /* { dg-error "argument 2 of 'svset4_s32' must be an integer constant expression" } */ + s32x4 = svset4_s32 (s32x4, 0, s32); + f32x4 = svset4_s32 (s32x4, 0, s32); /* { dg-error {cannot convert 'svint32x4_t' to 'svfloat32x4_t' in assignment} } */ + s32x4 = svset4_s32 (s32x4, 1, s32); + s32x4 = svset4_s32 (s32x4, 2, s32); + s32x4 = svset4_s32 (s32x4, 3, s32); + s32x4 = svset4_s32 (s32x4, 4, s32); /* { dg-error {passing 4 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4_s32 (s32x4, 5, s32); /* { dg-error {passing 5 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4_s32 (s32x4, ~0U, s32); /* { dg-error {passing [^ ]* to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4_s32 (s32x4, one, s32); + s32x4 = svset4_s32 (s32x4, 3 - 2, s32); + s32x4 = svset4_s32 (s32x4, 1.0, s32); + s32x4 = svset4_s32 (s32x4, const_sub (5, 4), s32); + s32x4 = svset4_s32 (s32x4, const_sub (6, 4), s32); + s32x4 = svset4_s32 (s32x4, const_sub (7, 4), s32); + s32x4 = svset4_s32 (s32x4, const_sub (8, 4), s32); /* { dg-error {passing 4 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4_s32 (s32x4, add (0, 0), s32); /* { dg-error "argument 2 of 'svset4_s32' must be an integer constant expression" } */ + + return f64; +} diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_1.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_1.c new file mode 100644 index 000000000..ff2590032 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_1.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svbool_t; /* { dg-message "note: previous declaration 'int svbool_t'" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef [^'\n]* svbool_t' redeclared as different kind of entity} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_10.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_10.c new file mode 100644 index 000000000..86d87fa37 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_10.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +typedef int svint8x2_t; + +#pragma 
GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_11.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_11.c new file mode 100644 index 000000000..741d10eaf --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_11.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ + +struct svint8x2_t; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ + +svint8_t f (svint8x2_t x) { return x.__val[0]; } /* { dg-error {'x' has incomplete type} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_12.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_12.c new file mode 100644 index 000000000..fc6a07ac6 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_12.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ + +typedef struct svint8x2_t svint8x2_t; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ + +svint8_t f (svint8x2_t x) { return x.__val[0]; } /* { dg-error {'x' has incomplete type} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_13.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_13.c new file mode 100644 index 000000000..161aacb7b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_13.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +struct svint8x2_t {}; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_14.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_14.c new file mode 100644 index 000000000..83191118f --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_14.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +enum svpattern { FOO }; /* { dg-message "note: previous definition here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error "multiple definition of 'enum svpattern'" } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_15.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_15.c new file mode 100644 index 000000000..71e35a4eb --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_15.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" + +enum svpattern { FOO }; /* { dg-error "multiple definition of 'enum svpattern'" } */ +enum foo { SV_ALL }; /* { dg-error "'SV_ALL' conflicts with a previous declaration" } */ +typedef int SV_POW2; /* { dg-error "'typedef int SV_POW2' redeclared as different kind of entity" } */ +int SV_VL3; /* { dg-error "'int SV_VL3' redeclared as different kind of entity" } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_16.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_16.c new file mode 100644 index 000000000..277064d31 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_16.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +struct svpattern { int x; }; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error "'svpattern' referred to as enum" } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_17.c 
b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_17.c new file mode 100644 index 000000000..e4bcda6fb --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_17.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" + +struct svpattern { int x; }; /* { dg-error "'svpattern' referred to as 'struct'" } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_18.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_18.c new file mode 100644 index 000000000..b6706150b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_18.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svpattern; /* OK in C. */ + +#pragma GCC aarch64 "arm_sve.h" diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_19.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_19.c new file mode 100644 index 000000000..c6379f762 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_19.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" + +int svpattern; /* OK in C. */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_2.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_2.c new file mode 100644 index 000000000..5baf59932 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_2.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svint8_t; /* { dg-message "note: previous declaration 'int svint8_t" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef [^'\n]* svint8_t' redeclared as different kind of entity} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_20.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_20.c new file mode 100644 index 000000000..3ba19f596 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_20.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ + +enum foo { SV_VL4 }; +typedef int SV_POW2; +int SV_ALL; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error "'SV_VL4' conflicts with a previous declaration" } */ +/* { dg-error "'SV_POW2' redeclared as different kind of entity" "" { target *-*-* } .-1 } */ +/* { dg-error "'SV_ALL' redeclared as different kind of entity" "" { target *-*-* } .-2 } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_3.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_3.c new file mode 100644 index 000000000..a8d7bdcc7 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_3.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svuint16_t; /* { dg-message "note: previous declaration 'int svuint16_t'" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef [^'\n]* svuint16_t' redeclared as different kind of entity} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_4.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_4.c new file mode 100644 index 000000000..c0563d0ee --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_4.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svfloat32_t; /* { dg-message "note: previous declaration 'int svfloat32_t'" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef [^'\n]* svfloat32_t' redeclared as different kind of entity} } */ diff --git 
a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_5.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_5.c new file mode 100644 index 000000000..ee28e9527 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_5.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +typedef int svbool_t; /* { dg-message "note: previous declaration as 'typedef int svbool_t'" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration '[^'\n]* svbool_t'} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_6.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_6.c new file mode 100644 index 000000000..85c17eab6 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_6.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ + +typedef __SVBool_t svbool_t; + +#pragma GCC aarch64 "arm_sve.h" + diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_7.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_7.c new file mode 100644 index 000000000..3a0dfb1c0 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_7.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ + +int svint8x2_t; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'typedef struct svint8x2_t svint8x2_t' redeclared as different kind of entity} } */ + +void f (struct svint8x2_t) {} /* { dg-error {incomplete type} } */ +void g () { int &x = svint8x2_t; } diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_8.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_8.c new file mode 100644 index 000000000..9b0df9137 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_8.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ + +struct svint8x2_t; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting declaration 'typedef struct svint8x2_t svint8x2_t'} } */ + +void f (svint8x2_t) {} /* { dg-error {incomplete type} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_9.c b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_9.c new file mode 100644 index 000000000..43068da78 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/type_redef_9.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" + +int svint8x2_t; /* { dg-error {'int svint8x2_t' redeclared as different kind of entity} } */ + +void f (struct svint8x2_t) {} /* { dg-error {using typedef-name 'svint8x2_t' after 'struct'} } */ +void g () { int &x = svint8x2_t; } /* { dg-error {expected primary-expression before ';' token} } */ diff --git a/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/whilele_1.C b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/whilele_1.C new file mode 100644 index 000000000..9571e668b --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/acle/general-c++/whilele_1.C @@ -0,0 +1,81 @@ +// { dg-do compile } + +#include + +enum foo { A, B }; + +void +test (int8_t s8, int16_t s16, int32_t s32, int64_t s64, + uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64, + bool b, foo e, int *ptr, float f32, svbool_t pg, + svint32_t vec) +{ + svwhilele_b8 (s32); // { dg-error {no matching function for call to 'svwhilele_b8\(int32_t&\)'} } + svwhilele_b8 (s32, s32, s32); // { dg-error {no matching function for call to 'svwhilele_b8\(int32_t&, int32_t&, int32_t&\)'} } + + svwhilele_b8 (b, b); + svwhilele_b8 (e, e); + svwhilele_b8 (s8, 
s8); + svwhilele_b8 (u8, u8); + svwhilele_b8 (s16, s16); + svwhilele_b8 (u16, u16); + svwhilele_b8 (ptr, ptr); // { dg-error {no matching function for call to 'svwhilele_b8\(int\*&, int\*&\)'} } + // { dg-error {invalid conversion from 'int\*' to '[^']*'} "" { target *-*-* } .-1 } + svwhilele_b8 (f32, f32); // { dg-error {call of overloaded 'svwhilele_b8\(float&, float&\)' is ambiguous} } + svwhilele_b8 (pg, pg); // { dg-error {no matching function for call to 'svwhilele_b8\(svbool_t&, svbool_t&\)'} } + svwhilele_b8 (vec, vec); // { dg-error {no matching function for call to 'svwhilele_b8\(svint32_t&, svint32_t&\)'} } + + svwhilele_b8 (s32, b); + svwhilele_b8 (s32, e); + svwhilele_b8 (s32, s8); + svwhilele_b8 (s32, u8); + svwhilele_b8 (s32, s16); + svwhilele_b8 (s32, u16); + + svwhilele_b8 (u32, b); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, bool&\)' is ambiguous} } + svwhilele_b8 (u32, e); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, foo&\)' is ambiguous} } + svwhilele_b8 (u32, s8); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int8_t&\)' is ambiguous} } + svwhilele_b8 (u32, u8); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, uint8_t&\)' is ambiguous} } + svwhilele_b8 (u32, s16); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int16_t&\)' is ambiguous} } + svwhilele_b8 (u32, u16); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, uint16_t&\)' is ambiguous} } + + svwhilele_b8 (s32, s32); + svwhilele_b8 (s32, u32); // { dg-error {call of overloaded 'svwhilele_b8\(int32_t&, uint32_t&\)' is ambiguous} } + svwhilele_b8 (s32, s64); // { dg-error {call of overloaded 'svwhilele_b8\(int32_t&, int64_t&\)' is ambiguous} } + svwhilele_b8 (s32, u64); // { dg-error {call of overloaded 'svwhilele_b8\(int32_t&, uint64_t&\)' is ambiguous} } + + svwhilele_b8 (u32, s32); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int32_t&\)' is ambiguous} } + svwhilele_b8 (u32, u32); + svwhilele_b8 (u32, s64); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int64_t&\)' is ambiguous} } + svwhilele_b8 (u32, u64); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, uint64_t&\)' is ambiguous} } + + svwhilele_b8 (s64, s32); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, int32_t&\)' is ambiguous} } + svwhilele_b8 (s64, u32); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, uint32_t&\)' is ambiguous} } + svwhilele_b8 (s64, s64); + svwhilele_b8 (s64, u64); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, uint64_t&\)' is ambiguous} } + + svwhilele_b8 (u64, s32); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, int32_t&\)' is ambiguous} } + svwhilele_b8 (u64, u32); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, uint32_t&\)' is ambiguous} } + svwhilele_b8 (u64, s64); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, int64_t&\)' is ambiguous} } + svwhilele_b8 (u64, u64); + + svwhilele_b8 (0, s32); + svwhilele_b8 (0, u32); // { dg-error {call of overloaded 'svwhilele_b8\(int, uint32_t&\)' is ambiguous} } + svwhilele_b8 (0, s64); // { dg-error {call of overloaded 'svwhilele_b8\(int, int64_t&\)' is ambiguous} } + svwhilele_b8 (0, u64); // { dg-error {call of overloaded 'svwhilele_b8\(int, uint64_t&\)' is ambiguous} } + + svwhilele_b8 (s32, 0); + svwhilele_b8 (u32, 0); // { dg-error {call of overloaded 'svwhilele_b8\(uint32_t&, int\)' is ambiguous} } + svwhilele_b8 (s64, 0); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, int\)' is ambiguous} } + 
svwhilele_b8 (u64, 0); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, int\)' is ambiguous} }
+
+ svwhilele_b8 (0U, s32); // { dg-error {call of overloaded 'svwhilele_b8\(unsigned int, int32_t&\)' is ambiguous} }
+ svwhilele_b8 (0U, u32);
+ svwhilele_b8 (0U, s64); // { dg-error {call of overloaded 'svwhilele_b8\(unsigned int, int64_t&\)' is ambiguous} }
+ svwhilele_b8 (0U, u64); // { dg-error {call of overloaded 'svwhilele_b8\(unsigned int, uint64_t&\)' is ambiguous} }
+
+ svwhilele_b8 (s32, 0U); // { dg-error {call of overloaded 'svwhilele_b8\(int32_t&, unsigned int\)' is ambiguous} }
+ svwhilele_b8 (u32, 0U);
+ svwhilele_b8 (s64, 0U); // { dg-error {call of overloaded 'svwhilele_b8\(int64_t&, unsigned int\)' is ambiguous} }
+ svwhilele_b8 (u64, 0U); // { dg-error {call of overloaded 'svwhilele_b8\(uint64_t&, unsigned int\)' is ambiguous} }
+}
diff --git a/gcc/testsuite/g++.target/aarch64/sve/catch_7.C b/gcc/testsuite/g++.target/aarch64/sve/catch_7.C
new file mode 100644
index 000000000..ac10b6984
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/catch_7.C
@@ -0,0 +1,38 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O" } */
+
+#include <arm_sve.h>
+
+void __attribute__ ((noipa))
+f1 (void)
+{
+ throw 1;
+}
+
+void __attribute__ ((noipa))
+f2 (svbool_t)
+{
+ register svint8_t z8 asm ("z8") = svindex_s8 (11, 1);
+ asm volatile ("" :: "w" (z8));
+ f1 ();
+}
+
+void __attribute__ ((noipa))
+f3 (int n)
+{
+ register double d8 asm ("v8") = 42.0;
+ for (int i = 0; i < n; ++i)
+ {
+ asm volatile ("" : "=w" (d8) : "w" (d8));
+ try { f2 (svptrue_b8 ()); } catch (int) { break; }
+ }
+ if (d8 != 42.0)
+ __builtin_abort ();
+}
+
+int
+main (void)
+{
+ f3 (100);
+ return 0;
+}
diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_1.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_1.C
new file mode 100644
index 000000000..a59862cf9
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_1.C
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size(32)));
+
+void
+foo (int32_t val)
+{
+ register vnx4si x asm ("z0");
+ register vnx4si y asm ("z0");
+ asm volatile ("" : "=w" (y));
+ val += 1;
+ vnx4si z = { val, val, val, val, val, val, val, val };
+ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? z : y;
+ asm volatile ("" :: "w" (x));
+}
+
+/* { dg-final { scan-assembler {\tmov\tz0\.s, p[0-7]/m, w[0-9]+\n} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_2.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_2.C
new file mode 100644
index 000000000..47aad2d58
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_2.C
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size(32)));
+
+void
+foo (int32_t val)
+{
+ register vnx4si x asm ("z0");
+ register vnx4si y asm ("z1");
+ asm volatile ("" : "=w" (y));
+ val += 1;
+ vnx4si z = { val, val, val, val, val, val, val, val };
+ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? z : y;
+ asm volatile ("" :: "w" (x));
+}
+
+/* { dg-final { scan-assembler {\tmovprfx\tz0, z1\n\tmov\tz0\.s, p[0-7]/m, w[0-9]+\n} } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_3.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_3.C
new file mode 100644
index 000000000..e8ec6f8b4
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_3.C
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size(32)));
+typedef float vnx4sf __attribute__((vector_size(32)));
+
+void
+foo (float val)
+{
+ register vnx4sf x asm ("z0");
+ register vnx4sf y asm ("z0");
+ asm volatile ("" : "=w" (y));
+ vnx4sf z = { val, val, val, val, val, val, val, val };
+ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? z : y;
+ asm volatile ("" :: "w" (x));
+}
+
+/* { dg-final { scan-assembler {\tmov\tz0\.s, p[0-7]/m, s[0-9]+\n} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_4.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_4.C
new file mode 100644
index 000000000..32ca59439
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_4.C
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size(32)));
+typedef float vnx4sf __attribute__((vector_size(32)));
+
+void
+foo (float val)
+{
+ register vnx4sf x asm ("z0");
+ register vnx4sf y asm ("z1");
+ asm volatile ("" : "=w" (y));
+ vnx4sf z = { val, val, val, val, val, val, val, val };
+ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? z : y;
+ asm volatile ("" :: "w" (x));
+}
+
+/* { dg-final { scan-assembler {\tmovprfx\tz0, z1\n\tmov\tz0\.s, p[0-7]/m, s[0-9]+\n} } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_5.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_5.C
new file mode 100644
index 000000000..2fb903a91
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_5.C
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size(32)));
+
+void
+foo (int32_t val)
+{
+ register vnx4si x asm ("z0");
+ val += 1;
+ vnx4si y = { val, val, val, val, val, val, val, val };
+ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ? y : (vnx4si) { 0 };
+ asm volatile ("" :: "w" (x));
+}
+
+/* { dg-final { scan-assembler {\tmovprfx\tz0\.s, p[0-7]/z, z0\.s\n\tmov\tz0\.s, p[0-7]/m, w[0-9]+\n} } } */
diff --git a/gcc/testsuite/g++.target/aarch64/sve/dup_sel_6.C b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_6.C
new file mode 100644
index 000000000..f2b0181bb
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/dup_sel_6.C
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=256" } */
+
+#include <stdint.h>
+
+typedef int32_t vnx4si __attribute__((vector_size(32)));
+typedef float vnx4sf __attribute__((vector_size(32)));
+
+void
+foo (float val)
+{
+ register vnx4sf x asm ("z0");
+ vnx4sf y = { val, val, val, val, val, val, val, val };
+ x = (vnx4si) { -1, 0, 0, -1, 0, -1, 0, -1 } ?
y : (vnx4sf) { 0 }; + asm volatile ("" :: "w" (x)); +} + +/* { dg-final { scan-assembler {\tmovprfx\tz0\.s, p[0-7]/z, z0\.s\n\tmov\tz0\.s, p[0-7]/m, s[0-9]+\n} } } */ diff --git a/gcc/testsuite/gcc.c-torture/execute/builtins/builtins.exp b/gcc/testsuite/gcc.c-torture/execute/builtins/builtins.exp index acb9eacb4..3560a1ff2 100644 --- a/gcc/testsuite/gcc.c-torture/execute/builtins/builtins.exp +++ b/gcc/testsuite/gcc.c-torture/execute/builtins/builtins.exp @@ -37,7 +37,7 @@ load_lib c-torture.exp torture-init set-torture-options $C_TORTURE_OPTIONS {{}} $LTO_TORTURE_OPTIONS -set additional_flags "-fno-tree-loop-distribute-patterns -fno-tracer -fno-ipa-ra" +set additional_flags "-fno-tree-loop-distribute-patterns -fno-tracer -fno-ipa-ra -fno-inline-functions" if [istarget "powerpc-*-darwin*"] { lappend additional_flags "-Wl,-multiply_defined,suppress" } diff --git a/gcc/testsuite/gcc.dg/diag-aka-3.c b/gcc/testsuite/gcc.dg/diag-aka-3.c new file mode 100644 index 000000000..a3778ed7d --- /dev/null +++ b/gcc/testsuite/gcc.dg/diag-aka-3.c @@ -0,0 +1,9 @@ +typedef unsigned int myvec __attribute__((vector_size (16))); + +void f (float x) +{ + myvec y = x; /* { dg-error {incompatible types when initializing type 'myvec' {aka '__vector\([48]\) unsigned int'} using type 'float'} } */ + myvec *ptr = &x; /* { dg-error {initialization of 'myvec \*' {aka '__vector\([48]\) unsigned int \*'} from incompatible pointer type 'float \*'} } */ + const myvec *const_ptr = &x; /* { dg-error {initialization of 'const myvec \*' {aka 'const __vector\([48]\) unsigned int \*'} from incompatible pointer type 'float \*'} } */ + volatile myvec *volatile_ptr = &x; /* { dg-error {initialization of 'volatile myvec \*' {aka 'volatile __vector\([48]\) unsigned int \*'} from incompatible pointer type 'float \*'} } */ +} diff --git a/gcc/testsuite/gcc.dg/enum-redef-1.c b/gcc/testsuite/gcc.dg/enum-redef-1.c new file mode 100644 index 000000000..b3fa6cbf8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/enum-redef-1.c @@ -0,0 +1,29 @@ +enum a { A }; +enum a { B }; /* { dg-bogus "nested redefinition" } */ +/* { dg-error "redeclaration of 'enum a'" "" { target *-*-* } .-1 } */ + +enum empty {}; /* { dg-error "empty enum is invalid" } */ +enum empty {}; /* { dg-bogus "nested redefinition" } */ +/* { dg-error "empty enum is invalid" "" { target *-*-* } .-1 } */ + +enum nested_first { + C1 = sizeof(enum nested_first { C1a }), /* { dg-error "nested redefinition of 'enum nested_first" } */ + C2 = sizeof(enum nested_first { C2a }) /* { dg-error "redeclaration of 'enum nested_first'" "" } */ +}; + +enum nested_second { + D1, + D2 = sizeof(enum nested_second { D2a }), /* { dg-error "nested redefinition of 'enum nested_second" } */ + D3 = sizeof(enum nested_second { D3a }) /* { dg-error "redeclaration of 'enum nested_second'" "" } */ +}; + +enum nested_repeat { E }; +enum nested_repeat { /* { dg-error "redeclaration of 'enum nested_repeat'" "" } */ + F = sizeof(enum nested_repeat { Fa }) /* { dg-error "nested redefinition of 'enum nested_repeat" } */ +}; + +enum nested_empty { + G1 = sizeof(enum nested_empty {}), /* { dg-error "nested redefinition of 'enum nested_empty" } */ + /* { dg-error "empty enum is invalid" "" { target *-*-* } .-1 } */ + G2 = sizeof(enum nested_empty { G2a }) +}; diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-1.c b/gcc/testsuite/gcc.dg/graphite/interchange-1.c index b65d4861e..65a569e71 100644 --- a/gcc/testsuite/gcc.dg/graphite/interchange-1.c +++ b/gcc/testsuite/gcc.dg/graphite/interchange-1.c @@ -48,10 +48,3 @@ main 
(void) return 0; } - -/*FIXME: Between isl 0.12 and isl 0.15 the schedule optimizer needs to print -something canonical so that it can be checked in the test. The final code -generated by both are same in this case but the messaged printed are -not consistent. */ - -/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-10.c b/gcc/testsuite/gcc.dg/graphite/interchange-10.c index a955644de..45c248db8 100644 --- a/gcc/testsuite/gcc.dg/graphite/interchange-10.c +++ b/gcc/testsuite/gcc.dg/graphite/interchange-10.c @@ -45,5 +45,3 @@ main (void) return 0; } - -/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-11.c b/gcc/testsuite/gcc.dg/graphite/interchange-11.c index 61028225f..6ba6907a5 100644 --- a/gcc/testsuite/gcc.dg/graphite/interchange-11.c +++ b/gcc/testsuite/gcc.dg/graphite/interchange-11.c @@ -45,5 +45,3 @@ main (void) return 0; } - -/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-3.c b/gcc/testsuite/gcc.dg/graphite/interchange-3.c index 4aec82418..e8539e2d3 100644 --- a/gcc/testsuite/gcc.dg/graphite/interchange-3.c +++ b/gcc/testsuite/gcc.dg/graphite/interchange-3.c @@ -46,5 +46,3 @@ main (void) return 0; } - -/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-4.c b/gcc/testsuite/gcc.dg/graphite/interchange-4.c index 463ecb5a6..1370d5f9d 100644 --- a/gcc/testsuite/gcc.dg/graphite/interchange-4.c +++ b/gcc/testsuite/gcc.dg/graphite/interchange-4.c @@ -45,5 +45,3 @@ main (void) return 0; } - -/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-7.c b/gcc/testsuite/gcc.dg/graphite/interchange-7.c index 50f7dd7f8..b2696dbec 100644 --- a/gcc/testsuite/gcc.dg/graphite/interchange-7.c +++ b/gcc/testsuite/gcc.dg/graphite/interchange-7.c @@ -46,5 +46,3 @@ main (void) return 0; } - -/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ diff --git a/gcc/testsuite/gcc.dg/graphite/interchange-9.c b/gcc/testsuite/gcc.dg/graphite/interchange-9.c index 88a357893..506b5001f 100644 --- a/gcc/testsuite/gcc.dg/graphite/interchange-9.c +++ b/gcc/testsuite/gcc.dg/graphite/interchange-9.c @@ -43,5 +43,3 @@ main (void) return 0; } - -/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ diff --git a/gcc/testsuite/gcc.dg/graphite/uns-interchange-9.c b/gcc/testsuite/gcc.dg/graphite/uns-interchange-9.c index cc108c2bb..a89578032 100644 --- a/gcc/testsuite/gcc.dg/graphite/uns-interchange-9.c +++ b/gcc/testsuite/gcc.dg/graphite/uns-interchange-9.c @@ -44,5 +44,3 @@ main (void) return 0; } - -/* { dg-final { scan-tree-dump "tiled" "graphite" } } */ diff --git a/gcc/testsuite/gcc.dg/guality/guality.exp b/gcc/testsuite/gcc.dg/guality/guality.exp index ca77a446f..89cd896d0 100644 --- a/gcc/testsuite/gcc.dg/guality/guality.exp +++ b/gcc/testsuite/gcc.dg/guality/guality.exp @@ -80,8 +80,22 @@ if {[check_guality " return 0; } "]} { - gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] "" "" - gcc-dg-runtest [lsort [glob $srcdir/c-c++-common/guality/*.c]] "" "-Wc++-compat" + set general [list] + set Og [list] + foreach file [lsort [glob $srcdir/c-c++-common/guality/*.c]] { + switch -glob -- [file tail $file] { + Og-* { lappend Og $file } + * { lappend general $file } + } + } + + gcc-dg-runtest [lsort [glob $srcdir/$subdir/*.c]] "" "" + gcc-dg-runtest $general "" "-Wc++-compat" + set-torture-options \ + [list "-O0" 
"-Og"] \ + [list {}] \ + [list "-Og -flto"] + gcc-dg-runtest $Og "" "-Wc++-compat" } if [info exists guality_gdb_name] { diff --git a/gcc/testsuite/gcc.dg/guality/pr59776.c b/gcc/testsuite/gcc.dg/guality/pr59776.c index 382abb622..6c1c8165b 100644 --- a/gcc/testsuite/gcc.dg/guality/pr59776.c +++ b/gcc/testsuite/gcc.dg/guality/pr59776.c @@ -12,11 +12,11 @@ foo (struct S *p) struct S s1, s2; /* { dg-final { gdb-test pr59776.c:17 "s1.f" "5.0" } } */ s1 = *p; /* { dg-final { gdb-test pr59776.c:17 "s1.g" "6.0" } } */ s2 = s1; /* { dg-final { gdb-test pr59776.c:17 "s2.f" "0.0" } } */ - *(int *) &s2.f = 0; /* { dg-final { gdb-test pr59776.c:17 "s2.g" "6.0" } } */ + *(int *) &s2.f = 0; /* { dg-final { gdb-test pr59776.c:17 "s2.g" "6.0" { xfail *-*-* } } } */ asm volatile (NOP : : : "memory"); /* { dg-final { gdb-test pr59776.c:20 "s1.f" "5.0" } } */ asm volatile (NOP : : : "memory"); /* { dg-final { gdb-test pr59776.c:20 "s1.g" "6.0" } } */ s2 = s1; /* { dg-final { gdb-test pr59776.c:20 "s2.f" "5.0" } } */ - asm volatile (NOP : : : "memory"); /* { dg-final { gdb-test pr59776.c:20 "s2.g" "6.0" } } */ + asm volatile (NOP : : : "memory"); /* { dg-final { gdb-test pr59776.c:20 "s2.g" "6.0" { xfail *-*-* } } } */ asm volatile (NOP : : : "memory"); } diff --git a/gcc/testsuite/gcc.dg/ipa/inline-7.c b/gcc/testsuite/gcc.dg/ipa/inline-7.c index 7dabb14f6..7c6491141 100644 --- a/gcc/testsuite/gcc.dg/ipa/inline-7.c +++ b/gcc/testsuite/gcc.dg/ipa/inline-7.c @@ -1,6 +1,6 @@ /* Check that early inliner works out that a is empty of parameter 0. */ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-einline-optimized -fopt-info-inline -fno-partial-inlining" } */ +/* { dg-options "-O2 -fdump-tree-einline-optimized -fopt-info-inline -fno-partial-inlining -fno-inline-functions" } */ void t(void); int a (int b) { diff --git a/gcc/testsuite/gcc.dg/ipa/pr63416.c b/gcc/testsuite/gcc.dg/ipa/pr63416.c index b5374c51f..5873954fb 100644 --- a/gcc/testsuite/gcc.dg/ipa/pr63416.c +++ b/gcc/testsuite/gcc.dg/ipa/pr63416.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-optimized" } */ +/* { dg-options "-O2 -fdump-tree-optimized --param early-inlining-insns-O2=14" } */ #define _UNUSED_ __attribute__((__unused__)) typedef int TEST_F30 (int *v); diff --git a/gcc/testsuite/gcc.dg/ipa/pr93763.c b/gcc/testsuite/gcc.dg/ipa/pr93763.c index d11705932..aa2e60c5f 100644 --- a/gcc/testsuite/gcc.dg/ipa/pr93763.c +++ b/gcc/testsuite/gcc.dg/ipa/pr93763.c @@ -3,44 +3,48 @@ typedef struct a a; struct a { - a *b + a *b; } d; -e, k, ah, al; -f(aa) { +int e, k, ah, al; +void h(void); +void +f(aa) int aa; { if (aa & 1) goto g; f(aa | 2); g: h(); } +void i(); +void l() { - { f(072); i(e, d, 92); - } } +void ag() { - { i(e, d, 36); } + i(e, d, 36); } +void j(); +void ai(a *m, a *n, unsigned aa) { f(aa); j(k, l, ah, 1); } +void j(int c, a m, int aj, int aa) { int ak = aa; - { i(e, d, ak); } + i(e, d, ak); } +void i(int c, a *m, unsigned aa) { - { - { i(c, (*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*( + i(c, (*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*( *(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*( 
*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*(*m).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) .b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) .b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) .b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b) .b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b).b, 0); - } - } int am = aa; - ai(ag, al, am); + ai((a *) (void *) ag, (a *) (__INTPTR_TYPE__) al, am); } diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-5.c b/gcc/testsuite/gcc.dg/optimize-bswapsi-5.c index 5819fd719..b4d8b9a8d 100644 --- a/gcc/testsuite/gcc.dg/optimize-bswapsi-5.c +++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-5.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-require-effective-target bswap } */ -/* { dg-options "-O2 -fdump-tree-bswap" } */ +/* { dg-options "-O2 -fdump-tree-bswap -fno-inline-functions" } */ /* { dg-additional-options "-march=z900" { target s390-*-* } } */ struct L { unsigned int l[2]; }; diff --git a/gcc/testsuite/gcc.dg/pr79983.c b/gcc/testsuite/gcc.dg/pr79983.c index 84aae6913..1e292d421 100644 --- a/gcc/testsuite/gcc.dg/pr79983.c +++ b/gcc/testsuite/gcc.dg/pr79983.c @@ -8,7 +8,7 @@ struct S { int i, j; }; /* { dg-error "redefinition of 'struct S'" } */ enum E; enum E { A, B, C }; /* { dg-message "originally defined here" } */ -enum E { D, F }; /* { dg-error "nested redefinition of 'enum E'|redeclaration of 'enum E'" } */ +enum E { D, F }; /* { dg-error "redeclaration of 'enum E'" } */ union U; union U { int i; }; /* { dg-message "originally defined here" } */ diff --git a/gcc/testsuite/gcc.dg/struct-ret-1.c b/gcc/testsuite/gcc.dg/struct-ret-1.c index 23c9e9813..330c76ab8 100644 --- a/gcc/testsuite/gcc.dg/struct-ret-1.c +++ b/gcc/testsuite/gcc.dg/struct-ret-1.c @@ -1,5 +1,5 @@ -/* { dg-do run { target hppa*-*-* } } */ -/* { dg-options { -O2 } { target hppa*-*-* } } */ +/* { dg-do run } */ +/* { dg-options { -O2 } } */ extern void abort (void); extern void exit (int); typedef struct { diff --git a/gcc/testsuite/gcc.dg/torture/pr90395.c b/gcc/testsuite/gcc.dg/torture/pr90395.c new file mode 100644 index 000000000..eba8750ef --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr90395.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-fexceptions -fnon-call-exceptions" } */ + +typedef int v16si __attribute__ ((__vector_size__ (64))); + +void +rl (int uq) +{ + v16si qw[1]; + + qw[uq] = (v16si) { uq }; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr92690.c b/gcc/testsuite/gcc.dg/torture/pr92690.c new file mode 100644 index 000000000..b49f184fc --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr92690.c @@ -0,0 +1,38 @@ +/* { dg-do run { target *-*-*gnu* } } */ +/* { dg-additional-options "-D_GNU_SOURCE" } */ +/* { dg-require-effective-target fenv_exceptions } */ + +#include + +typedef int v4si __attribute__((vector_size(16))); +typedef float v4sf __attribute__((vector_size(16))); + +void __attribute__((noipa)) +foo (v4si *dstp, v4sf *srcp) +{ + 
v4sf src = *srcp; + *dstp = (v4si) { src[0], src[1], 3, 4 }; +} + +void __attribute__((noipa)) +bar (v4sf *dstp, v4si *srcp) +{ + v4si src = *srcp; + *dstp = (v4sf) { src[0], src[1], 3.5, 4.5 }; +} + +int +main() +{ + feenableexcept (FE_INVALID|FE_INEXACT); + v4sf x = (v4sf) { 1, 2, __builtin_nanf (""), 3.5 }; + v4si y; + foo (&y, &x); + if (y[0] != 1 || y[1] != 2 || y[2] != 3 || y[3] != 4) + __builtin_abort (); + y = (v4si) { 0, 1, __INT_MAX__, -__INT_MAX__ }; + bar (&x, &y); + if (x[0] != 0 || x[1] != 1 || x[2] != 3.5 || x[3] != 4.5) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr92715.c b/gcc/testsuite/gcc.dg/torture/pr92715.c new file mode 100644 index 000000000..170179c20 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr92715.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */ + +typedef double v4si __attribute__((vector_size(32))); +typedef double v2si __attribute__((vector_size(16))); + +void foo (v4si *dstp, v2si *srcp) +{ + v2si src = *srcp; + *dstp = (v4si) { src[0], src[1], src[0], src[1] }; +} + +void bar (v4si *dstp, v2si *srcp) +{ + v2si src = *srcp; + *dstp = (v4si) { src[0], src[0], src[0], src[0] }; +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c b/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c new file mode 100644 index 000000000..ba90b56fe --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/alias-access-path-1.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-fre3" } */ +struct foo +{ + int val; +} *fooptr; +struct bar +{ + struct foo foo; + int val2; +} *barptr; +int +test () +{ + struct foo foo = { 0 }; + barptr->val2 = 123; + *fooptr = foo; + return barptr->val2; +} + +/* { dg-final { scan-tree-dump-times "return 123" 1 "fre3"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-35.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-35.c index d55197bce..24e633869 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-35.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-35.c @@ -16,4 +16,5 @@ v4sf vec_cast_perm(v4si f) } /* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 1 "cddce1" { target { i?86-*-* x86_64-*-* } } } } */ -/* { dg-final { scan-tree-dump-times "\\\(v4sf\\\) " 2 "cddce1" { target { i?86-*-* x86_64-*-* } } } } */ +/* Catch (v4sf) and (vector(4) float). 
*/ +/* { dg-final { scan-tree-dump-times " = \\\(v" 2 "cddce1" { target { i?86-*-* x86_64-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr92706-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr92706-2.c new file mode 100644 index 000000000..37ab9765d --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr92706-2.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-esra" } */ + +typedef __UINT64_TYPE__ uint64_t; +typedef __UINT32_TYPE__ uint32_t; +struct S { uint32_t i[2]; } __attribute__((aligned(__alignof__(uint64_t)))); +typedef uint64_t my_int64 __attribute__((may_alias)); +uint64_t load (void *p) +{ + struct S u, v, w; + uint64_t tem; + tem = *(my_int64 *)p; + *(my_int64 *)&v = tem; + u = v; + w = u; + return *(my_int64 *)&w; +} + +/* { dg-final { scan-tree-dump "Created a replacement for v" "esra" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c index 32d63899b..836a8092a 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-26.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dse1-details -fno-short-enums" } */ +/* { dg-options "-O2 -fdump-tree-dse1-details -fno-short-enums -fno-tree-fre" } */ /* { dg-skip-if "temporary variable for constraint_expr is never used" { msp430-*-* } } */ enum constraint_expr_type diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-31.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-31.c index 6402c81e6..3d429ab15 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-31.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-31.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O -fdump-tree-fre1-details" } */ +/* { dg-options "-O -fdump-tree-fre1-details -fno-tree-forwprop" } */ /* { dg-additional-options "-fno-common" { target hppa*-*-hpux* } } */ typedef double d128 __attribute__((vector_size(16))); diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-12.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-12.c index 67526762f..fff731e8c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-12.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-thread-12.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-thread2-details -fdump-tree-thread3-details -fdump-tree-thread4-details -fno-finite-loops" } */ +/* { dg-options "-O2 -fdump-tree-thread2-details -fdump-tree-thread3-details -fdump-tree-thread4-details -fno-finite-loops --param early-inlining-insns-O2=14 -fno-inline-functions" } */ /* { dg-final { scan-tree-dump "FSM" "thread2" } } */ /* { dg-final { scan-tree-dump "FSM" "thread3" } } */ /* { dg-final { scan-tree-dump "FSM" "thread4" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/pr66142.c b/gcc/testsuite/gcc.dg/vect/pr66142.c index 8c79f2907..a0316f1f0 100644 --- a/gcc/testsuite/gcc.dg/vect/pr66142.c +++ b/gcc/testsuite/gcc.dg/vect/pr66142.c @@ -1,6 +1,6 @@ /* PR middle-end/66142 */ /* { dg-do compile } */ -/* { dg-additional-options "-ffast-math -fopenmp-simd" } */ +/* { dg-additional-options "-ffast-math -fopenmp-simd --param early-inlining-insns-O2=14" } */ /* { dg-additional-options "-mavx" { target avx_runtime } } */ struct A { float x, y; }; diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-arith-7.c b/gcc/testsuite/gcc.dg/vect/vect-cond-arith-7.c new file mode 100644 index 000000000..739b98f59 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-arith-7.c @@ -0,0 +1,60 @@ +/* { dg-require-effective-target scalar_all_fma } */ +/* { dg-additional-options "-fdump-tree-optimized 
-ffp-contract=fast" } */ + +#include "tree-vect.h" + +#define N (VECTOR_BITS * 11 / 64 + 3) + +#define DEF(INV) \ + void __attribute__ ((noipa)) \ + f_##INV (double *restrict a, double *restrict b, \ + double *restrict c, double *restrict d) \ + { \ + for (int i = 0; i < N; ++i) \ + { \ + double mb = (INV & 1 ? -b[i] : b[i]); \ + double mc = c[i]; \ + double md = (INV & 2 ? -d[i] : d[i]); \ + a[i] = b[i] < 10 ? mb * mc + md : 10.0; \ + } \ + } + +#define TEST(INV) \ + { \ + f_##INV (a, b, c, d); \ + for (int i = 0; i < N; ++i) \ + { \ + double mb = (INV & 1 ? -b[i] : b[i]); \ + double mc = c[i]; \ + double md = (INV & 2 ? -d[i] : d[i]); \ + double fma = __builtin_fma (mb, mc, md); \ + if (a[i] != (i % 17 < 10 ? fma : 10.0)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +#define FOR_EACH_INV(T) \ + T (0) T (1) T (2) T (3) + +FOR_EACH_INV (DEF) + +int +main (void) +{ + double a[N], b[N], c[N], d[N]; + for (int i = 0; i < N; ++i) + { + b[i] = i % 17; + c[i] = i % 9 + 11; + d[i] = i % 13 + 14; + asm volatile ("" ::: "memory"); + } + FOR_EACH_INV (TEST) + return 0; +} + +/* { dg-final { scan-tree-dump-times { = \.COND_FMA } 1 "optimized" { target vect_double_cond_arith } } } */ +/* { dg-final { scan-tree-dump-times { = \.COND_FMS } 1 "optimized" { target vect_double_cond_arith } } } */ +/* { dg-final { scan-tree-dump-times { = \.COND_FNMA } 1 "optimized" { target vect_double_cond_arith } } } */ +/* { dg-final { scan-tree-dump-times { = \.COND_FNMS } 1 "optimized" { target vect_double_cond_arith } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c new file mode 100644 index 000000000..8e46ff6b0 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-1.c @@ -0,0 +1,49 @@ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" +#ifndef SIGNEDNESS +#define SIGNEDNESS signed +#endif +#ifndef BIAS +#define BIAS 0 +#endif + +#define HRS(x) ((((x) >> (15 - BIAS)) + BIAS) >> BIAS) + +void __attribute__ ((noipa)) +f (SIGNEDNESS short *restrict a, SIGNEDNESS short *restrict b, + SIGNEDNESS short *restrict c, __INTPTR_TYPE__ n) +{ + for (__INTPTR_TYPE__ i = 0; i < n; ++i) + a[i] = HRS((SIGNEDNESS int) b[i] * (SIGNEDNESS int) c[i]); +} + +#define N 50 +#define BASE1 ((SIGNEDNESS int) -1 < 0 ? -126 : 4) +#define BASE2 ((SIGNEDNESS int) -1 < 0 ? 
-101 : 26) +#define CONST1 0x01AB +#define CONST2 0x01CD + +int +main (void) +{ + check_vect (); + + SIGNEDNESS short a[N], b[N], c[N]; + for (int i = 0; i < N; ++i) + { + b[i] = BASE1 + i * CONST1; + c[i] = BASE2 + i * CONST2; + asm volatile ("" ::: "memory"); + } + f (a, b, c, N); + for (int i = 0; i < N; ++i) + if (a[i] != HRS(BASE1 * BASE2 + i * i * (CONST1 * CONST2) + + i * (BASE1 * CONST2 + BASE2 * CONST1))) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c new file mode 100644 index 000000000..a16e71c6a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-2.c @@ -0,0 +1,9 @@ +/* { dg-require-effective-target vect_int } */ + +#define SIGNEDNESS unsigned + +#include "vect-mulhrs-1.c" + +/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c new file mode 100644 index 000000000..e7d44d75d --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-3.c @@ -0,0 +1,9 @@ +/* { dg-require-effective-target vect_int } */ + +#define BIAS 1 + +#include "vect-mulhrs-1.c" + +/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c new file mode 100644 index 000000000..e12176335 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-mulhrs-4.c @@ -0,0 +1,10 @@ +/* { dg-require-effective-target vect_int } */ + +#define SIGNEDNESS unsigned +#define BIAS 1 + +#include "vect-mulhrs-1.c" + +/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */ +/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-sdiv-pow2-1.c b/gcc/testsuite/gcc.dg/vect/vect-sdiv-pow2-1.c new file mode 100644 index 000000000..be70bc6c4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-sdiv-pow2-1.c @@ -0,0 +1,79 @@ +/* { dg-require-effective-target vect_int } */ + +#include "tree-vect.h" + +#define DIV(x,y) ((x)/(y)) +#define MOD(x,y) ((x)%(y)) + +#define TEMPLATE(PO2,OP) \ +void __attribute__ ((noipa)) \ +f_##PO2##_##OP (int *restrict a, int *restrict b, __INTPTR_TYPE__ n) \ +{ \ + for (__INTPTR_TYPE__ i = 0; i < n; ++i) \ + a[i] = OP (b[i], (1 << PO2)); \ +} +#define TEMPLATES(PO2) \ +TEMPLATE (PO2,DIV); \ +TEMPLATE (PO2,MOD); + +TEMPLATES (1); +TEMPLATES (2); +TEMPLATES (3); +TEMPLATES (7); +TEMPLATES (8); +TEMPLATES (10); +TEMPLATES (15); +TEMPLATES (16); +TEMPLATES (20); + +typedef void (*func_t) (int *, int *, __INTPTR_TYPE__); +typedef struct { + int po2; + func_t div; + func_t mod; +} fn_t; +const fn_t fns[] = { +#define FN_PAIR(PO2) { PO2, f_##PO2##_DIV, f_##PO2##_MOD } + FN_PAIR 
(1), + FN_PAIR (2), + FN_PAIR (3), + FN_PAIR (7), + FN_PAIR (8), + FN_PAIR (10), + FN_PAIR (15), + FN_PAIR (16), + FN_PAIR (20), +}; + +int __attribute__ ((noipa, noinline)) +power2 (int x) +{ + return 1 << x; +} + +#define N 50 + +int +main (void) +{ + int a[N], b[N], c[N]; + + for (int i = 0; i < (sizeof(fns)/sizeof(fns[0])); i++) + { + int p = power2 (fns[i].po2); + for (int j = 0; j < N; j++) + a[j] = ((p << 4) * j) / (N - 1) - (p << 5); + + fns[i].div (b, a, N); + fns[i].mod (c, a, N); + + for (int j = 0; j < N; j++) + if (a[j] != (b[j] * p + c[j])) + __builtin_abort (); + } + + return 0; +} + +/* { dg-final { scan-tree-dump {\.DIV_POW2} "vect" { target vect_sdiv_pow2_si } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 18 "vect" { target vect_sdiv_pow2_si } } } */ diff --git a/gcc/testsuite/gcc.dg/winline-3.c b/gcc/testsuite/gcc.dg/winline-3.c index 7b7c8c5b9..7043a2760 100644 --- a/gcc/testsuite/gcc.dg/winline-3.c +++ b/gcc/testsuite/gcc.dg/winline-3.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-Winline -O2 --param max-inline-insns-single=1 --param inline-min-speedup=100 -fgnu89-inline" } */ +/* { dg-options "-Winline -O2 --param max-inline-insns-single-O2=1 --param inline-min-speedup-O2=100 -fgnu89-inline" } */ void big (void); inline int q(void) /* { dg-warning "max-inline-insns-single" } */ diff --git a/gcc/testsuite/gcc.target/aarch64/acle/jcvt_1.c b/gcc/testsuite/gcc.target/aarch64/acle/jcvt_1.c new file mode 100644 index 000000000..0c900b1b5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/jcvt_1.c @@ -0,0 +1,15 @@ +/* Test the __jcvt ACLE intrinsic. */ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv8.3-a" } */ + +#include + +#ifdef __ARM_FEATURE_JCVT +int32_t +test_jcvt (double a) +{ + return __jcvt (a); +} +#endif + +/* { dg-final { scan-assembler-times "fjcvtzs\tw\[0-9\]+, d\[0-9\]+\n" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/acle/rintnzx_1.c b/gcc/testsuite/gcc.target/aarch64/acle/rintnzx_1.c new file mode 100644 index 000000000..125720848 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/rintnzx_1.c @@ -0,0 +1,73 @@ +/* Test the __rint[32,64][z,x] intrinsics. 
*/ + +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv8.5-a" } */ + +#include + +#ifdef __ARM_FEATURE_FRINT +float +foo_32z_f32_scal (float a) +{ + return __rint32zf (a); +} + +/* { dg-final { scan-assembler-times "frint32z\ts\[0-9\]+, s\[0-9\]+\n" 1 } } */ + +double +foo_32z_f64_scal (double a) +{ + return __rint32z (a); +} + +/* { dg-final { scan-assembler-times "frint32z\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ + +float +foo_32x_f32_scal (float a) +{ + return __rint32xf (a); +} + +/* { dg-final { scan-assembler-times "frint32x\ts\[0-9\]+, s\[0-9\]+\n" 1 } } */ + +double +foo_32x_f64_scal (double a) +{ + return __rint32x (a); +} + +/* { dg-final { scan-assembler-times "frint32x\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ + +float +foo_64z_f32_scal (float a) +{ + return __rint64zf (a); +} + +/* { dg-final { scan-assembler-times "frint64z\ts\[0-9\]+, s\[0-9\]+\n" 1 } } */ + +double +foo_64z_f64_scal (double a) +{ + return __rint64z (a); +} + +/* { dg-final { scan-assembler-times "frint64z\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ + +float +foo_64x_f32_scal (float a) +{ + return __rint64xf (a); +} + +/* { dg-final { scan-assembler-times "frint64x\ts\[0-9\]+, s\[0-9\]+\n" 1 } } */ + +double +foo_64x_f64_scal (double a) +{ + return __rint64x (a); +} + +/* { dg-final { scan-assembler-times "frint64x\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ + +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/acle/rng_1.c b/gcc/testsuite/gcc.target/aarch64/acle/rng_1.c new file mode 100644 index 000000000..1fbdb6276 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/rng_1.c @@ -0,0 +1,53 @@ +/* Test the __rndr ACLE intrinsic. */ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv8.5-a+rng" } */ + +#include + +#ifdef __ARM_FEATURE_RNG +/* Check that instruction is generated when status result is unused. */ +uint64_t +test_rndr_no_stat (void) +{ + uint64_t res; + __rndr (&res); + return res; +} + +/* Check that instruction is generated when random number result + is unused. */ +int +test_rndr_error_check (void) +{ + uint64_t res; + int fail = __rndr (&res); + if (fail) + return 0; + return -1; +} + +/* { dg-final { scan-assembler-times "mrs\tx..?, RNDR\n" 2 } } */ + +/* Check that instruction is generated when status result is unused. */ +uint64_t +test_rndrrs_no_stat (void) +{ + uint64_t res; + __rndrrs (&res); + return res; +} + +/* Check that instruction is generated when random number result + is unused. */ +int +test_rndrrs_error_check (void) +{ + uint64_t res; + int fail = __rndrrs (&res); + if (fail) + return 0; + return -1; +} + +/* { dg-final { scan-assembler-times "mrs\tx..?, RNDRRS\n" 2 } } */ +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/acle/tme.c b/gcc/testsuite/gcc.target/aarch64/acle/tme.c new file mode 100644 index 000000000..5df93b1dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/tme.c @@ -0,0 +1,34 @@ +/* Test the TME intrinsics. 
*/ + +/* { dg-do compile } */ +/* { dg-options "-save-temps -O2 -march=armv8-a+tme" } */ + +#include "arm_acle.h" + +#define tcancel_reason 0x234 + +unsigned +check_tme (void) +{ + unsigned status = __tstart (); + if (status == 0) + { + if (__ttest () == 2) + { + __tcancel (tcancel_reason & _TMFAILURE_REASON); + return tcancel_reason; + } + + __tcommit (); + return 0; + } + else if (status & _TMFAILURE_NEST) + return _TMFAILURE_NEST; + else if (status & _TMFAILURE_TRIVIAL) + return _TMFAILURE_TRIVIAL; +} + +/* { dg-final { scan-assembler "tstart\tx..?\n" } } */ +/* { dg-final { scan-assembler "tcancel\t#564\n" } } */ +/* { dg-final { scan-assembler "ttest\tx..?\n" } } */ +/* { dg-final { scan-assembler "tcommit\n" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c new file mode 100644 index 000000000..c42c7acbb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c @@ -0,0 +1,85 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ + +#include + +float32x2_t test_vcreate (float32x2_t r, uint64_t a, uint64_t b) +{ + bfloat16x4_t _a = vcreate_bf16(a); + bfloat16x4_t _b = vcreate_bf16(b); + + return vbfdot_f32 (r, _a, _b); +} +/* { dg-final { scan-assembler {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} } } */ + +bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b) +{ + return vset_lane_bf16 (a, b, 3); +} + +bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b) +{ + return vsetq_lane_bf16 (a, b, 7); +} +/* { dg-final { scan-assembler-times "ins\\t" 2 } } */ + +bfloat16x4_t vdup_test (bfloat16_t a) +{ + return vdup_n_bf16 (a); +} +/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+.h\\\[0\\\]" } } */ + +bfloat16x8_t vdupq_test (bfloat16_t a) +{ + return vdupq_n_bf16 (a); +} + +bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a) +{ + return vdupq_lane_bf16 (a, 1); +} +/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8h, v\[0-9\]+.h\\\[0\\\]" 2 } } */ + +bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a) +{ + return vget_lane_bf16 (a, 1); +} +/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[1\\\]" 2 } } */ + +bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a) +{ + return vdup_lane_bf16 (a, 1); +} +/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" } } */ + +bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a) +{ + return vdup_laneq_bf16 (a, 7); +} +/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[7\\\]" } } */ + +bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a) +{ + return vdupq_laneq_bf16 (a, 5); +} +/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[5\\\]" } } */ + +bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a) +{ + return vduph_lane_bf16 (a, 3); +} +/* { dg-final { scan-assembler "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[3\\\]" } } */ + +bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a) +{ + return vgetq_lane_bf16 (a, 7); +} + +bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a) +{ + return vduph_laneq_bf16 (a, 7); +} +/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c new file mode 100644 
index 000000000..f5adf40c6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c @@ -0,0 +1,466 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ + +#include + +float32x2_t +test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s8(a); + bfloat16x4_t _b = vreinterpret_bf16_s8(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s16(a); + bfloat16x4_t _b = vreinterpret_bf16_s16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s32(a); + bfloat16x4_t _b = vreinterpret_bf16_s32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_s64(a); + bfloat16x4_t _b = vreinterpret_bf16_s64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u8(a); + bfloat16x4_t _b = vreinterpret_bf16_u8(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u16(a); + bfloat16x4_t _b = vreinterpret_bf16_u16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u32(a); + bfloat16x4_t _b = vreinterpret_bf16_u32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_u64(a); + bfloat16x4_t _b = vreinterpret_bf16_u64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p8(a); + bfloat16x4_t _b = vreinterpret_bf16_p8(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p16(a); + bfloat16x4_t _b = vreinterpret_bf16_p16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_p64(a); + bfloat16x4_t _b = vreinterpret_bf16_p64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_f16(a); + bfloat16x4_t _b = vreinterpret_bf16_f16(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_f32(a); + bfloat16x4_t _b = vreinterpret_bf16_f32(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x2_t +test_vbfdot_f32_f64 (float32x2_t r, float64x1_t a, float64x1_t b) +{ + bfloat16x4_t _a = vreinterpret_bf16_f64(a); + bfloat16x4_t _b = vreinterpret_bf16_f64(b); + + return vbfdot_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s8(a); + bfloat16x8_t _b = vreinterpretq_bf16_s8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t 
+test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s16(a); + bfloat16x8_t _b = vreinterpretq_bf16_s16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s32(a); + bfloat16x8_t _b = vreinterpretq_bf16_s32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_s64(a); + bfloat16x8_t _b = vreinterpretq_bf16_s64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u8(a); + bfloat16x8_t _b = vreinterpretq_bf16_u8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u16(a); + bfloat16x8_t _b = vreinterpretq_bf16_u16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u32(a); + bfloat16x8_t _b = vreinterpretq_bf16_u32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_u64(a); + bfloat16x8_t _b = vreinterpretq_bf16_u64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p8(a); + bfloat16x8_t _b = vreinterpretq_bf16_p8(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p16(a); + bfloat16x8_t _b = vreinterpretq_bf16_p16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p64(a); + bfloat16x8_t _b = vreinterpretq_bf16_p64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_p128(a); + bfloat16x8_t _b = vreinterpretq_bf16_p128(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_f16(a); + bfloat16x8_t _b = vreinterpretq_bf16_f16(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_f32(a); + bfloat16x8_t _b = vreinterpretq_bf16_f32(b); + + return vbfdotq_f32 (r, _a, _b); +} + +float32x4_t +test_vbfdotq_f32_f64 (float32x4_t r, float64x2_t a, float64x2_t b) +{ + bfloat16x8_t _a = vreinterpretq_bf16_f64(a); + bfloat16x8_t _b = vreinterpretq_bf16_f64(b); + + return vbfdotq_f32 (r, _a, _b); +} + +/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} 14 } } */ +/* { dg-final { scan-assembler-times {bfdot\tv[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h} 15 } } */ + +int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b) +{ + int8x8_t _a = vreinterpret_s8_bf16 (a); + return vadd_s8 (_a, b); +} + +int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b) +{ + int16x4_t _a = vreinterpret_s16_bf16 (a); + return vadd_s16 (_a, b); +} + +int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, 
int32x2_t b) +{ + int32x2_t _a = vreinterpret_s32_bf16 (a); + return vadd_s32 (_a, b); +} + +int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b) +{ + int64x1_t _a = vreinterpret_s64_bf16 (a); + return vrshl_s64 (_a, b); +} + +uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b) +{ + uint8x8_t _a = vreinterpret_u8_bf16 (a); + return vadd_u8 (_a, b); +} + +uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b) +{ + uint16x4_t _a = vreinterpret_u16_bf16 (a); + return vadd_u16 (_a, b); +} + +uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b) +{ + uint32x2_t _a = vreinterpret_u32_bf16 (a); + return vadd_u32 (_a, b); +} + +uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b) +{ + uint64x1_t _a = vreinterpret_u64_bf16 (a); + return vrshl_u64 (_a, b); +} + +poly8x8_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b) +{ + poly8x8_t _a = vreinterpret_p8_bf16 (a); + return vzip1_p8 (_a, b); +} + +poly16x4_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b) +{ + poly16x4_t _a = vreinterpret_p16_bf16 (a); + return vzip1_p16 (_a, b); +} + +poly64x1_t test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b) +{ + poly64x1_t _a = vreinterpret_p64_bf16 (a); + return vsli_n_p64 (_a, b, 3); +} + +float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b) +{ + float32x2_t _a = vreinterpret_f32_bf16 (a); + return vsub_f32 (_a, b); +} + +float64x1_t test_vreinterpret_f64_bf16 (bfloat16x4_t a, float64x1_t b) +{ + float64x1_t _a = vreinterpret_f64_bf16 (a); + return vsub_f64 (_a, b); +} + +int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b) +{ + int8x16_t _a = vreinterpretq_s8_bf16 (a); + return vaddq_s8 (_a, b); +} + +int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b) +{ + int16x8_t _a = vreinterpretq_s16_bf16 (a); + return vaddq_s16 (_a, b); +} + +int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b) +{ + int32x4_t _a = vreinterpretq_s32_bf16 (a); + return vaddq_s32 (_a, b); +} + +int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b) +{ + int64x2_t _a = vreinterpretq_s64_bf16 (a); + return vaddq_s64 (_a, b); +} + +uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b) +{ + uint8x16_t _a = vreinterpretq_u8_bf16 (a); + return vaddq_u8 (_a, b); +} + +uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b) +{ + uint16x8_t _a = vreinterpretq_u16_bf16 (a); + return vaddq_u16 (_a, b); +} + +uint32x4_t test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b) +{ + uint32x4_t _a = vreinterpretq_u32_bf16 (a); + return vaddq_u32 (_a, b); +} + +uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b) +{ + uint64x2_t _a = vreinterpretq_u64_bf16 (a); + return vaddq_u64 (_a, b); +} + +poly8x16_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b) +{ + poly8x16_t _a = vreinterpretq_p8_bf16 (a); + return vzip1q_p8 (_a, b); +} + +poly16x8_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b) +{ + poly16x8_t _a = vreinterpretq_p16_bf16 (a); + return vzip1q_p16 (_a, b); +} + +poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b) +{ + poly64x2_t _a = vreinterpretq_p64_bf16 (a); + return vsliq_n_p64 (_a, b, 3); +} + +poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b) +{ + poly128_t _a = vreinterpretq_p128_bf16 (a); + return _a; +} + +float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b) +{ + float32x4_t _a = vreinterpretq_f32_bf16 (a); + return vsubq_f32 (_a, b); +} + 
+float64x2_t test_vreinterpretq_f64_bf16 (bfloat16x8_t a, float64x2_t b) +{ + float64x2_t _a = vreinterpretq_f64_bf16 (a); + return vsubq_f64 (_a, b); +} + +float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a) +{ + return vreinterpret_f16_bf16 (a); +} + +float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a) +{ + return vreinterpretq_f16_bf16 (a); +} + +/* { dg-final { scan-assembler-times {add\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} 2 } } */ +/* { dg-final { scan-assembler-times {add\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} 2 } } */ +/* { dg-final { scan-assembler-times {add\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} 2 } } */ + +/* { dg-final { scan-assembler-times {add\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} 2 } } */ +/* { dg-final { scan-assembler-times {add\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} 2 } } */ +/* { dg-final { scan-assembler-times {add\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} 2 } } */ + +/* { dg-final { scan-assembler {fsub\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s} } } */ +/* { dg-final { scan-assembler {fsub\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s} } } */ +/* { dg-final { scan-assembler {fsub\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d} } } */ +/* { dg-final { scan-assembler {fsub\td[0-9]+, d[0-9]+, d[0-9]+} } } */ + +/* { dg-final { scan-assembler {zip1\tv[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b} } } */ +/* { dg-final { scan-assembler {zip1\tv[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b} } } */ +/* { dg-final { scan-assembler {zip1\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h} } } */ +/* { dg-final { scan-assembler {zip1\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h} } } */ + +/* { dg-final { scan-assembler {sli\tv[0-9]+.2d, v[0-9]+.2d, 3} } } */ +/* { dg-final { scan-assembler {sli\td[0-9]+, d[0-9]+, 3} } } */ + +/* { dg-final { scan-assembler {urshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */ +/* { dg-final { scan-assembler {srshl\td[0-9]+, d[0-9]+, d[0-9]+} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c new file mode 100644 index 000000000..cf245091a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c @@ -0,0 +1,150 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include + +bfloat16x4_t +test_vld1_dup_bf16 (bfloat16_t * ptr) +{ + return vld1_dup_bf16 (ptr); +} + +bfloat16x8_t +test_vld1q_dup_bf16 (bfloat16_t * ptr) +{ + return vld1q_dup_bf16 (ptr); +} + +bfloat16x4_t +test_vld1_lane_bf16 (bfloat16_t * ptr, bfloat16x4_t src) +{ + return vld1_lane_bf16 (ptr, src, 3); +} + +bfloat16x8_t +test_vld1q_lane_bf16 (bfloat16_t * ptr, bfloat16x8_t src) +{ + return vld1q_lane_bf16 (ptr, src, 7); +} + +bfloat16x4_t +test_vld1_bf16 (bfloat16_t * ptr) +{ + return vld1_bf16 (ptr); +} + +bfloat16x8_t +test_vld1q_bf16 (bfloat16_t * ptr) +{ + return vld1q_bf16 (ptr); +} + +bfloat16x4x2_t +test_vld1_bf16_x2 (bfloat16_t * ptr) +{ + return vld1_bf16_x2 (ptr); +} + +bfloat16x8x2_t +test_vld1q_bf16_x2 (bfloat16_t * ptr) +{ + return vld1q_bf16_x2 (ptr); +} + +bfloat16x4x3_t +test_vld1_bf16_x3 (bfloat16_t * ptr) +{ + return vld1_bf16_x3 (ptr); +} + +bfloat16x8x3_t +test_vld1q_bf16_x3 (bfloat16_t * ptr) +{ + return vld1q_bf16_x3 (ptr); +} + +bfloat16x4x4_t +test_vld1_bf16_x4 (bfloat16_t * ptr) +{ + return vld1_bf16_x4 (ptr); +} + +bfloat16x8x4_t +test_vld1q_bf16_x4 (bfloat16_t * ptr) +{ + return vld1q_bf16_x4 (ptr); +} + +bfloat16x4x2_t +test_vld2_bf16 
(bfloat16_t * ptr) +{ + return vld2_bf16 (ptr); +} + +bfloat16x8x2_t +test_vld2q_bf16 (bfloat16_t * ptr) +{ + return vld2q_bf16 (ptr); +} + +bfloat16x4x2_t +test_vld2_dup_bf16 (bfloat16_t * ptr) +{ + return vld2_dup_bf16 (ptr); +} + +bfloat16x8x2_t +test_vld2q_dup_bf16 (bfloat16_t * ptr) +{ + return vld2q_dup_bf16 (ptr); +} + +bfloat16x4x3_t +test_vld3_bf16 (bfloat16_t * ptr) +{ + return vld3_bf16 (ptr); +} + +bfloat16x8x3_t +test_vld3q_bf16 (bfloat16_t * ptr) +{ + return vld3q_bf16 (ptr); +} + +bfloat16x4x3_t +test_vld3_dup_bf16 (bfloat16_t * ptr) +{ + return vld3_dup_bf16 (ptr); +} + +bfloat16x8x3_t +test_vld3q_dup_bf16 (bfloat16_t * ptr) +{ + return vld3q_dup_bf16 (ptr); +} + +bfloat16x4x4_t +test_vld4_bf16 (bfloat16_t * ptr) +{ + return vld4_bf16 (ptr); +} + +bfloat16x8x4_t +test_vld4q_bf16 (bfloat16_t * ptr) +{ + return vld4q_bf16 (ptr); +} + +bfloat16x4x4_t +test_vld4_dup_bf16 (bfloat16_t * ptr) +{ + return vld4_dup_bf16 (ptr); +} + +bfloat16x8x4_t +test_vld4q_dup_bf16 (bfloat16_t * ptr) +{ + return vld4q_dup_bf16 (ptr); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c new file mode 100644 index 000000000..162b3ee36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c @@ -0,0 +1,107 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include + +void +test_vst1_bf16_x2 (bfloat16_t *ptr, bfloat16x4x2_t val) +{ + vst1_bf16_x2 (ptr, val); +} + +void +test_vst1q_bf16_x2 (bfloat16_t *ptr, bfloat16x8x2_t val) +{ + vst1q_bf16_x2 (ptr, val); +} + +void +test_vst1_bf16_x3 (bfloat16_t *ptr, bfloat16x4x3_t val) +{ + vst1_bf16_x3 (ptr, val); +} + +void +test_vst1q_bf16_x3 (bfloat16_t *ptr, bfloat16x8x3_t val) +{ + vst1q_bf16_x3 (ptr, val); +} + +void +test_vst1_bf16_x4 (bfloat16_t *ptr, bfloat16x4x4_t val) +{ + vst1_bf16_x4 (ptr, val); +} + +void +test_vst1q_bf16_x4 (bfloat16_t *ptr, bfloat16x8x4_t val) +{ + vst1q_bf16_x4 (ptr, val); +} + +void +test_vst1_lane_bf16 (bfloat16_t *ptr, bfloat16x4_t val) +{ + vst1_lane_bf16 (ptr, val, 3); +} + +void +test_vst1q_lane_bf16 (bfloat16_t *ptr, bfloat16x8_t val) +{ + vst1q_lane_bf16 (ptr, val, 7); +} + +void +test_vst1_bf16 (bfloat16_t *ptr, bfloat16x4_t val) +{ + vst1_bf16 (ptr, val); +} + +void +test_vst1q_bf16 (bfloat16_t *ptr, bfloat16x8_t val) +{ + vst1q_bf16 (ptr, val); +} + +void +test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val) +{ + vst2_bf16 (ptr, val); +} + +void +test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val) +{ + vst2q_bf16 (ptr, val); +} + +void +test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val) +{ + vst3_bf16 (ptr, val); +} + +void +test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val) +{ + vst3q_bf16 (ptr, val); +} + +void +test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val) +{ + vst4_bf16 (ptr, val); +} + +void +test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val) +{ + vst4q_bf16 (ptr, val); +} + +int main() +{ + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c new file mode 100644 index 000000000..bbea630b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c @@ -0,0 +1,48 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target 
arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +/* +**test_bfcvtn: +** bfcvtn v0.4h, v0.4s +** ret +*/ +bfloat16x4_t test_bfcvtn (float32x4_t a) +{ + return vcvt_bf16_f32 (a); +} + +/* +**test_bfcvtnq: +** bfcvtn v0.4h, v0.4s +** ret +*/ +bfloat16x8_t test_bfcvtnq (float32x4_t a) +{ + return vcvtq_low_bf16_f32 (a); +} + +/* +**test_bfcvtnq2: +** bfcvtn2 v0.8h, v1.4s +** ret +*/ +bfloat16x8_t test_bfcvtnq2 (bfloat16x8_t inactive, float32x4_t a) +{ + return vcvtq_high_bf16_f32 (inactive, a); +} + +/* +**test_bfcvt: +** bfcvt h0, s0 +** ret +*/ +bfloat16_t test_bfcvt (float32_t a) +{ + return vcvth_bf16_f32 (a); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nobf16.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nobf16.c new file mode 100644 index 000000000..9904d65f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nobf16.c @@ -0,0 +1,10 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-additional-options "-march=armv8.2-a+nobf16" } */ + +#include + +bfloat16_t test_bfcvt (float32_t a) +{ + /* { dg-error "inlining failed .* 'vcvth_bf16_f32" "" { target *-*-* } 0 } */ + return vcvth_bf16_f32 (a); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c new file mode 100644 index 000000000..a91468093 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-nosimd.c @@ -0,0 +1,17 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-additional-options "-save-temps -march=armv8.2-a+bf16+nosimd" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ + +#include + +/* +**test_bfcvt: +** bfcvt h0, s0 +** ret +*/ +bfloat16_t test_bfcvt (float32_t a) +{ + return vcvth_bf16_f32 (a); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtn-nobf16.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtn-nobf16.c new file mode 100644 index 000000000..b3b6db123 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtn-nobf16.c @@ -0,0 +1,10 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-additional-options "-march=armv8.2-a+nobf16" } */ + +#include + +bfloat16x4_t test_bfcvtn (float32x4_t a) +{ + /* { dg-error "inlining failed .* 'vcvt_bf16_f32" "" { target *-*-* } 0 } */ + return vcvt_bf16_f32 (a); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtnq2-untied.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtnq2-untied.c new file mode 100644 index 000000000..4b730e39d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvtnq2-untied.c @@ -0,0 +1,20 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +/* +**test_bfcvtnq2_untied: +** mov v0.16b, v1.16b +** bfcvtn2 v0.8h, v2.4s +** ret +*/ +bfloat16x8_t test_bfcvtnq2_untied (bfloat16x8_t unused, 
bfloat16x8_t inactive, + float32x4_t a) +{ + return vcvtq_high_bf16_f32 (inactive, a); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c new file mode 100755 index 000000000..ad5150773 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c @@ -0,0 +1,91 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +/* +**ufoo: +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) +** ret +*/ +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq: +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) +** ret +*/ +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_f32 (r, x, y); +} + +/* +**ufoo_lane: +** bfdot v0.2s, v1.4h, v2.2h\[0\] +** ret +*/ +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, 0); +} + +/* +**ufooq_laneq: +** bfdot v0.4s, v1.8h, v2.2h\[2\] +** ret +*/ +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, 2); +} + +/* +**ufoo_laneq: +** bfdot v0.2s, v1.4h, v2.2h\[3\] +** ret +*/ +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 3); +} + +/* +**ufooq_lane: +** bfdot v0.4s, v1.8h, v2.2h\[1\] +** ret +*/ +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + +/* +**ufoo_untied: +** mov v0.8b, v1.8b +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) +** ret +*/ +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq_lane_untied: +** mov v0.16b, v1.16b +** bfdot v0.4s, v2.8h, v3.2h\[1\] +** ret +*/ +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c new file mode 100755 index 000000000..58bdee5ac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c @@ -0,0 +1,91 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-mbig-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +/* +**ufoo: +** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h) +** ret +*/ +float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq: +** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h) +** ret +*/ +float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_f32 (r, x, y); +} + +/* +**ufoo_lane: +** bfdot v0.2s, v1.4h, v2.2h\[0\] +** ret +*/ +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, 0); +} + +/* +**ufooq_laneq: +** bfdot v0.4s, v1.8h, v2.2h\[2\] +** ret +*/ +float32x4_t 
ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, 2); +} + +/* +**ufoo_laneq: +** bfdot v0.2s, v1.4h, v2.2h\[3\] +** ret +*/ +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 3); +} + +/* +**ufooq_lane: +** bfdot v0.4s, v1.8h, v2.2h\[1\] +** ret +*/ +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + +/* +**ufoo_untied: +** mov v0.8b, v1.8b +** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h) +** ret +*/ +float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_f32 (r, x, y); +} + +/* +**ufooq_lane_untied: +** mov v0.16b, v1.16b +** bfdot v0.4s, v2.8h, v3.2h\[1\] +** ret +*/ +float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 1); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c new file mode 100755 index 000000000..607126203 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c @@ -0,0 +1,28 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "--save-temps" } */ + +#include + +float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y) +{ + return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */ +} + +float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */ +} + +float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y) +{ + return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */ +} + +float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y) +{ + return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */ +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c new file mode 100644 index 000000000..9810e4ba3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c @@ -0,0 +1,67 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include + +/* +**test_bfmlalb: +** bfmlalb v0.4s, v1.8h, v2.8h +** ret +*/ +float32x4_t test_bfmlalb (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlalbq_f32 (r, a, b); +} + +/* +**test_bfmlalt: +** bfmlalt v0.4s, v1.8h, v2.8h +** ret +*/ +float32x4_t test_bfmlalt (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlaltq_f32 (r, a, b); +} + +/* +**test_bfmlalb_lane: +** bfmlalb v0.4s, v1.8h, v2.h[0] +** ret +*/ +float32x4_t test_bfmlalb_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return vbfmlalbq_lane_f32 (r, a, b, 0); +} + +/* +**test_bfmlalt_lane: +** bfmlalt v0.4s, v1.8h, v2.h[2] +** ret +*/ +float32x4_t test_bfmlalt_lane 
(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + return vbfmlaltq_lane_f32 (r, a, b, 2); +} + +/* +**test_bfmlalb_laneq: +** bfmlalb v0.4s, v1.8h, v2.h[4] +** ret +*/ +float32x4_t test_bfmlalb_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlalbq_laneq_f32 (r, a, b, 4); +} + +/* +**test_bfmlalt_laneq: +** bfmlalt v0.4s, v1.8h, v2.h[7] +** ret +*/ +float32x4_t test_bfmlalt_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + return vbfmlaltq_laneq_f32 (r, a, b, 7); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c new file mode 100644 index 000000000..0aaa69f00 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c @@ -0,0 +1,18 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include + + +/* +**test_bfmmla: +** bfmmla v0.4s, v1.8h, v2.8h +** ret +*/ +float32x4_t test_bfmmla (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y) +{ + return vbfmmlaq_f32 (r, x, y); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c new file mode 100644 index 000000000..4d50ba3a3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c @@ -0,0 +1,46 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include + +void +f_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + vbfmlaltq_lane_f32 (r, a, b, -1); + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + vbfmlaltq_lane_f32 (r, a, b, 4); + return; +} + +void +f_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */ + vbfmlaltq_laneq_f32 (r, a, b, -1); + /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */ + vbfmlaltq_laneq_f32 (r, a, b, 8); + return; +} + +void +f_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* { dg-error "lane -2 out of range 0 - 3" "" { target *-*-* } 0 } */ + vbfmlalbq_lane_f32 (r, a, b, -2); + /* { dg-error "lane 5 out of range 0 - 3" "" { target *-*-* } 0 } */ + vbfmlalbq_lane_f32 (r, a, b, 5); + return; +} + +void +f_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + /* { dg-error "lane -2 out of range 0 - 7" "" { target *-*-* } 0 } */ + vbfmlalbq_laneq_f32 (r, a, b, -2); + /* { dg-error "lane 9 out of range 0 - 7" "" { target *-*-* } 0 } */ + vbfmlalbq_laneq_f32 (r, a, b, 9); + return; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c new file mode 100755 index 000000000..ac4f821e7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c @@ -0,0 +1,136 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ +/* { dg-add-options arm_v8_2a_i8mm } */ +/* { 
dg-additional-options "-save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +/* Unsigned-Signed Dot Product instructions. */ + +/* +**ufoo: +** usdot v0\.2s, v1\.8b, v2\.8b +** ret +*/ +int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y) +{ + return vusdot_s32 (r, x, y); +} + +/* +**ufooq: +** usdot v0\.4s, v1\.16b, v2\.16b +** ret +*/ +int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_s32 (r, x, y); +} + +/* +**ufoo_lane: +** usdot v0\.2s, v1\.8b, v2\.4b\[0\] +** ret +*/ +int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) +{ + return vusdot_lane_s32 (r, x, y, 0); +} + +/* +**ufoo_laneq: +** usdot v0\.2s, v1\.8b, v2\.4b\[2\] +** ret +*/ +int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) +{ + return vusdot_laneq_s32 (r, x, y, 2); +} + +/* +**ufooq_lane: +** usdot v0\.4s, v1\.16b, v2\.4b\[1\] +** ret +*/ +int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) +{ + return vusdotq_lane_s32 (r, x, y, 1); +} + +/* +**ufooq_laneq: +** usdot v0\.4s, v1\.16b, v2\.4b\[3\] +** ret +*/ +int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_laneq_s32 (r, x, y, 3); +} + + +/* Signed-Unsigned Dot Product instructions. */ + +/* +**sfoo_lane: +** sudot v0\.2s, v1\.8b, v2\.4b\[0\] +** ret +*/ +int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) +{ + return vsudot_lane_s32 (r, x, y, 0); +} + +/* +**sfoo_laneq: +** sudot v0\.2s, v1\.8b, v2\.4b\[2\] +** ret +*/ +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) +{ + return vsudot_laneq_s32 (r, x, y, 2); +} + +/* +**sfooq_lane: +** sudot v0\.4s, v1\.16b, v2\.4b\[1\] +** ret +*/ +int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) +{ + return vsudotq_lane_s32 (r, x, y, 1); +} + +/* +**sfooq_laneq: +** sudot v0\.4s, v1\.16b, v2\.4b\[3\] +** ret +*/ +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) +{ + return vsudotq_laneq_s32 (r, x, y, 3); +} + +/* +**ufoo_untied: +** mov v0\.8b, v1\.8b +** usdot v0\.2s, v2\.8b, v3\.8b +** ret +*/ +int32x2_t ufoo_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_t y) +{ + return vusdot_s32 (r, x, y); +} + +/* +**ufooq_laneq_untied: +** mov v0\.16b, v1\.16b +** usdot v0\.4s, v2\.16b, v3\.4b\[3\] +** ret +*/ +int32x4_t ufooq_laneq_untied (int32x2_t unused, int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_laneq_s32 (r, x, y, 3); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c new file mode 100755 index 000000000..96bca2356 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c @@ -0,0 +1,137 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ +/* { dg-add-options arm_v8_2a_i8mm } */ +/* { dg-additional-options "-mbig-endian -save-temps" } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +/* Unsigned-Signed Dot Product instructions. 
*/ + +/* +**ufoo: +** usdot v0\.2s, v1\.8b, v2\.8b +** ret +*/ +int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y) +{ + return vusdot_s32 (r, x, y); +} + +/* +**ufooq: +** usdot v0\.4s, v1\.16b, v2\.16b +** ret +*/ +int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_s32 (r, x, y); +} + +/* +**ufoo_lane: +** usdot v0\.2s, v1\.8b, v2\.4b\[0\] +** ret +*/ +int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) +{ + return vusdot_lane_s32 (r, x, y, 0); +} + +/* +**ufoo_laneq: +** usdot v0\.2s, v1\.8b, v2\.4b\[2\] +** ret +*/ +int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) +{ + return vusdot_laneq_s32 (r, x, y, 2); +} + +/* +**ufooq_lane: +** usdot v0\.4s, v1\.16b, v2\.4b\[1\] +** ret +*/ +int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) +{ + return vusdotq_lane_s32 (r, x, y, 1); +} + +/* +**ufooq_laneq: +** usdot v0\.4s, v1\.16b, v2\.4b\[3\] +** ret +*/ +int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_laneq_s32 (r, x, y, 3); +} + + +/* Signed-Unsigned Dot Product instructions. */ + +/* +**sfoo_lane: +** sudot v0\.2s, v1\.8b, v2\.4b\[0\] +** ret +*/ +int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) +{ + return vsudot_lane_s32 (r, x, y, 0); +} + +/* +**sfoo_laneq: +** sudot v0\.2s, v1\.8b, v2\.4b\[2\] +** ret +*/ +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) +{ + return vsudot_laneq_s32 (r, x, y, 2); +} + +/* +**sfooq_lane: +** sudot v0\.4s, v1\.16b, v2\.4b\[1\] +** ret +*/ +int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) +{ + return vsudotq_lane_s32 (r, x, y, 1); +} + +/* +**sfooq_laneq: +** sudot v0\.4s, v1\.16b, v2\.4b\[3\] +** ret +*/ +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) +{ + return vsudotq_laneq_s32 (r, x, y, 3); +} + +/* +**ufoo_untied: +** mov v0\.8b, v1\.8b +** usdot v0\.2s, v2\.8b, v3\.8b +** ret +*/ +int32x2_t ufoo_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_t y) +{ + return vusdot_s32 (r, x, y); +} + +/* +**ufooq_laneq_untied: +** mov v0\.16b, v1\.16b +** usdot v0\.4s, v2\.16b, v3\.4b\[3\] +** ret +*/ +int32x4_t ufooq_laneq_untied (int32x2_t unused, int32x4_t r, uint8x16_t x, int8x16_t y) +{ + return vusdotq_laneq_s32 (r, x, y, 3); +} + + diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c new file mode 100755 index 000000000..18ecabef8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c @@ -0,0 +1,31 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ +/* { dg-add-options arm_v8_2a_i8mm } */ +/* { dg-additional-options "--save-temps" } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y) +{ + /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */ + return vusdot_lane_s32 (r, x, y, -1); +} + +int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y) +{ + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vusdot_laneq_s32 (r, x, y, -1); +} + +int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y) +{ + /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */ + return vusdotq_lane_s32 (r, x, y, 2); +} + +int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y) +{ + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vusdotq_laneq_s32 (r, x, y, 4); +} diff 
--git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c new file mode 100755 index 000000000..66c87d486 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c @@ -0,0 +1,31 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ +/* { dg-add-options arm_v8_2a_i8mm } */ +/* { dg-additional-options "--save-temps" } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include + +int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y) +{ + /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */ + return vsudot_lane_s32 (r, x, y, -1); +} + +int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y) +{ + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vsudot_laneq_s32 (r, x, y, -1); +} + +int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y) +{ + /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */ + return vsudotq_lane_s32 (r, x, y, 2); +} + +int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y) +{ + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vsudotq_laneq_s32 (r, x, y, 4); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c new file mode 100644 index 000000000..451a0afc6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1x4.c @@ -0,0 +1,83 @@ +/* We haven't implemented these intrinsics for arm yet. */ +/* { dg-xfail-if "" { arm*-*-* } } */ +/* { dg-do run } */ +/* { dg-options "-O3" } */ + +#include +#include "arm-neon-ref.h" + +extern void abort (void); + +#define TESTMETH(BASE, ELTS, SUFFIX) \ +int __attribute__ ((noinline)) \ +test_vld1##SUFFIX##_x4 () \ +{ \ + BASE##_t data[ELTS * 4]; \ + BASE##_t temp[ELTS * 4]; \ + BASE##x##ELTS##x##4##_t vectors; \ + int i,j; \ + for (i = 0; i < ELTS * 4; i++) \ + data [i] = (BASE##_t) 4*i; \ + asm volatile ("" : : : "memory"); \ + vectors = vld1##SUFFIX##_x4 (data); \ + vst1##SUFFIX (temp, vectors.val[0]); \ + vst1##SUFFIX (&temp[ELTS], vectors.val[1]); \ + vst1##SUFFIX (&temp[ELTS * 2], vectors.val[2]); \ + vst1##SUFFIX (&temp[ELTS * 3], vectors.val[3]); \ + asm volatile ("" : : : "memory"); \ + for (j = 0; j < ELTS * 4; j++) \ + if (temp[j] != data[j]) \ + return 1; \ + return 0; \ +} + +#define VARIANTS_1(VARIANT) \ +VARIANT (uint8, 8, _u8) \ +VARIANT (uint16, 4, _u16) \ +VARIANT (uint32, 2, _u32) \ +VARIANT (uint64, 1, _u64) \ +VARIANT (int8, 8, _s8) \ +VARIANT (int16, 4, _s16) \ +VARIANT (int32, 2, _s32) \ +VARIANT (int64, 1, _s64) \ +VARIANT (poly8, 8, _p8) \ +VARIANT (poly16, 4, _p16) \ +VARIANT (poly64, 1, _p64) \ +VARIANT (float16, 4, _f16) \ +VARIANT (float32, 2, _f32) \ +VARIANT (uint8, 16, q_u8) \ +VARIANT (uint16, 8, q_u16) \ +VARIANT (uint32, 4, q_u32) \ +VARIANT (uint64, 2, q_u64) \ +VARIANT (int8, 16, q_s8) \ +VARIANT (int16, 8, q_s16) \ +VARIANT (int32, 4, q_s32) \ +VARIANT (int64, 2, q_s64) \ +VARIANT (poly8, 16, q_p8) \ +VARIANT (poly16, 8, q_p16) \ +VARIANT (poly64, 2, q_p64) \ +VARIANT (float16, 8, q_f16) \ +VARIANT (float32, 4, q_f32) + +#ifdef __aarch64__ +#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ +VARIANT (float64, 1, _f64) \ +VARIANT (float64, 2, q_f64) +#else +#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) +#endif + +/* Tests of vld1_x4 and vld1q_x4. 
*/ +VARIANTS (TESTMETH) + +#define CHECKS(BASE, ELTS, SUFFIX) \ + if (test_vld1##SUFFIX##_x4 () != 0) \ + fprintf (stderr, "test_vld1##SUFFIX##_x4"); + +int +main (int argc, char **argv) +{ + VARIANTS (CHECKS) + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c new file mode 100644 index 000000000..1f17b5342 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1x4.c @@ -0,0 +1,83 @@ +/* We haven't implemented these intrinsics for arm yet. */ +/* { dg-xfail-if "" { arm*-*-* } } */ +/* { dg-do run } */ +/* { dg-options "-O3" } */ + +#include +#include "arm-neon-ref.h" + +extern void abort (void); + +#define TESTMETH(BASE, ELTS, SUFFIX) \ +int __attribute__ ((noinline)) \ +test_vst1##SUFFIX##_x4 () \ +{ \ + BASE##_t data[ELTS * 4]; \ + BASE##_t temp[ELTS * 4]; \ + BASE##x##ELTS##x##4##_t vectors; \ + int i,j; \ + for (i = 0; i < ELTS * 4; i++) \ + data [i] = (BASE##_t) 4*i; \ + asm volatile ("" : : : "memory"); \ + vectors.val[0] = vld1##SUFFIX (data); \ + vectors.val[1] = vld1##SUFFIX (&data[ELTS]); \ + vectors.val[2] = vld1##SUFFIX (&data[ELTS * 2]); \ + vectors.val[3] = vld1##SUFFIX (&data[ELTS * 3]); \ + vst1##SUFFIX##_x4 (temp, vectors); \ + asm volatile ("" : : : "memory"); \ + for (j = 0; j < ELTS * 4; j++) \ + if (temp[j] != data[j]) \ + return 1; \ + return 0; \ +} + +#define VARIANTS_1(VARIANT) \ +VARIANT (uint8, 8, _u8) \ +VARIANT (uint16, 4, _u16) \ +VARIANT (uint32, 2, _u32) \ +VARIANT (uint64, 1, _u64) \ +VARIANT (int8, 8, _s8) \ +VARIANT (int16, 4, _s16) \ +VARIANT (int32, 2, _s32) \ +VARIANT (int64, 1, _s64) \ +VARIANT (poly8, 8, _p8) \ +VARIANT (poly16, 4, _p16) \ +VARIANT (poly64, 1, _p64) \ +VARIANT (float16, 4, _f16) \ +VARIANT (float32, 2, _f32) \ +VARIANT (uint8, 16, q_u8) \ +VARIANT (uint16, 8, q_u16) \ +VARIANT (uint32, 4, q_u32) \ +VARIANT (uint64, 2, q_u64) \ +VARIANT (int8, 16, q_s8) \ +VARIANT (int16, 8, q_s16) \ +VARIANT (int32, 4, q_s32) \ +VARIANT (int64, 2, q_s64) \ +VARIANT (poly8, 16, q_p8) \ +VARIANT (poly16, 8, q_p16) \ +VARIANT (poly64, 2, q_p64) \ +VARIANT (float16, 8, q_f16) \ +VARIANT (float32, 4, q_f32) + +#ifdef __aarch64__ +#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) \ +VARIANT (float64, 1, _f64) \ +VARIANT (float64, 2, q_f64) +#else +#define VARIANTS(VARIANT) VARIANTS_1(VARIANT) +#endif + +/* Tests of vst1_x4 and vst1q_x4. 
*/ +VARIANTS (TESTMETH) + +#define CHECKS(BASE, ELTS, SUFFIX) \ + if (test_vst1##SUFFIX##_x4 () != 0) \ + fprintf (stderr, "test_vst1##SUFFIX##_x4"); + +int +main (int argc, char **argv) +{ + VARIANTS (CHECKS) + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c b/gcc/testsuite/gcc.target/aarch64/aes-fuse-1.c similarity index 51% rename from gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c rename to gcc/testsuite/gcc.target/aarch64/aes-fuse-1.c index d8adc8946..d7b4f8991 100644 --- a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c +++ b/gcc/testsuite/gcc.target/aarch64/aes-fuse-1.c @@ -1,45 +1,66 @@ /* { dg-do compile } */ /* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */ +/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/ #include #define AESE(r, v, key) (r = vaeseq_u8 ((v), (key))); #define AESMC(r, i) (r = vaesmcq_u8 (i)) +const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + uint8x16_t dummy; uint8x16_t a; uint8x16_t b; uint8x16_t c; uint8x16_t d; -uint8x16_t e; +uint8x16_t x; +uint8x16_t y; +uint8x16_t k; + +void foo (void) -void -foo (void) { - AESE (a, a, e); + AESE (a, a, k); dummy = vaddq_u8 (dummy, dummy); dummy = vaddq_u8 (dummy, dummy); - AESE (b, b, e); + AESE (b, b, k); dummy = vaddq_u8 (dummy, dummy); dummy = vaddq_u8 (dummy, dummy); - AESE (c, c, e); + AESE (c, c, k); dummy = vaddq_u8 (dummy, dummy); dummy = vaddq_u8 (dummy, dummy); - AESE (d, d, e); + AESE (d, d, k); dummy = vaddq_u8 (dummy, dummy); dummy = vaddq_u8 (dummy, dummy); - AESMC (a, a); + x = x ^ k; + AESE (x, x, zero); dummy = vaddq_u8 (dummy, dummy); dummy = vaddq_u8 (dummy, dummy); - AESMC (b, b); + y = y ^ k; + AESE (y, y, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESMC (d, d); dummy = vaddq_u8 (dummy, dummy); dummy = vaddq_u8 (dummy, dummy); AESMC (c, c); dummy = vaddq_u8 (dummy, dummy); dummy = vaddq_u8 (dummy, dummy); - AESMC (d, d); -} + AESMC (b, b); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (a, a); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); -/* { dg-final { scan-assembler-times "crypto_aese_fused" 4 } } */ + AESMC (y, y); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (x, x); +} +/* { dg-final { scan-assembler-times "crypto_aese_fused" 6 } } */ +/* { dg-final { scan-assembler-not "veor" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/aes-fuse-2.c b/gcc/testsuite/gcc.target/aarch64/aes-fuse-2.c new file mode 100644 index 000000000..dfe01b03a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/aes-fuse-2.c @@ -0,0 +1,65 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */ +/* { dg-additional-options "-march=armv8-a+crypto" { target { aarch64*-*-* } } }*/ + +#include + +#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key))); +#define AESIMC(r, i) (r = vaesimcq_u8 (i)) + +const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +uint8x16_t dummy; +uint8x16_t a; +uint8x16_t b; +uint8x16_t c; +uint8x16_t d; +uint8x16_t x; +uint8x16_t y; +uint8x16_t k; + +void foo (void) +{ + AESD (a, a, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (b, b, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (c, c, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (d, d, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + x = x ^ k; + AESD 
(x, x, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + y = y ^ k; + AESD (y, y, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESIMC (d, d); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (c, c); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (b, b); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (a, a); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESIMC (y, y); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (x, x); +} + +/* { dg-final { scan-assembler-times "crypto_aesd_fused" 6 } } */ +/* { dg-final { scan-assembler-not "veor" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/asm-x-constraint-1.c b/gcc/testsuite/gcc.target/aarch64/asm-x-constraint-1.c new file mode 100644 index 000000000..a71043be5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/asm-x-constraint-1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +void +f (void) +{ + register float s0 asm ("s0"); + register float s7 asm ("s7"); + register float s8 asm ("s8"); + register float s15 asm ("s15"); + register float s16 asm ("s16"); + register float s31 asm ("s31"); + asm volatile ("// s0 out: %s0" : "=w" (s0)); + asm volatile ("// s0 in: %s0" :: "x" (s0)); + asm volatile ("// s7 out: %s0" : "=w" (s7)); + asm volatile ("// s7 in: %s0" :: "x" (s7)); + asm volatile ("// s8 out: %s0" : "=w" (s8)); + asm volatile ("// s8 in: %s0" :: "x" (s8)); + asm volatile ("// s15 out: %s0" : "=w" (s15)); + asm volatile ("// s15 in: %s0" :: "x" (s15)); + asm volatile ("// s16 out: %s0" : "=w" (s16)); + asm volatile ("// s16 in: %s0" :: "x" (s16)); + asm volatile ("// s31 out: %s0" : "=w" (s31)); + asm volatile ("// s31 in: %s0" :: "x" (s31)); +} + +/* { dg-final { scan-assembler {\t// s0 out: s0\n.*[/]/ s0 in: s0\n} } } */ +/* { dg-final { scan-assembler {\t// s7 out: s7\n.*[/]/ s7 in: s7\n} } } */ +/* { dg-final { scan-assembler {\t// s8 out: s8\n.*[/]/ s8 in: s8\n} } } */ +/* { dg-final { scan-assembler {\t// s15 out: s15\n.*[/]/ s15 in: s15\n} } } */ +/* { dg-final { scan-assembler {\t// s16 out: s16\n.*\tfmov\t(s[0-7]), s16\n.*[/]/ s16 in: \1\n} } } */ +/* { dg-final { scan-assembler {\t// s31 out: s31\n.*\tfmov\t(s[0-7]), s31\n.*[/]/ s31 in: \1\n} } } */ +/* { dg-final { scan-assembler-not {\t// s16 in: s16\n} } } */ +/* { dg-final { scan-assembler-not {\t// s31 in: s31\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/asm-y-constraint-1.c b/gcc/testsuite/gcc.target/aarch64/asm-y-constraint-1.c new file mode 100644 index 000000000..4a3fcac56 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/asm-y-constraint-1.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +void +f (void) +{ + register float s0 asm ("s0"); + register float s7 asm ("s7"); + register float s8 asm ("s8"); + register float s15 asm ("s15"); + register float s16 asm ("s16"); + register float s31 asm ("s31"); + asm volatile ("// s0 out: %s0" : "=w" (s0)); + asm volatile ("// s0 in: %s0" :: "y" (s0)); + asm volatile ("// s7 out: %s0" : "=w" (s7)); + asm volatile ("// s7 in: %s0" :: "y" (s7)); + asm volatile ("// s8 out: %s0" : "=w" (s8)); + asm volatile ("// s8 in: %s0" :: "y" (s8)); + asm volatile ("// s15 out: %s0" : "=w" (s15)); + asm volatile ("// s15 in: %s0" :: "y" (s15)); + asm volatile ("// s16 out: %s0" : "=w" (s16)); + asm volatile ("// s16 in: %s0" :: "y" (s16)); + asm volatile ("// s31 
out: %s0" : "=w" (s31)); + asm volatile ("// s31 in: %s0" :: "y" (s31)); +} + +/* { dg-final { scan-assembler {\t// s0 out: s0\n.*[/]/ s0 in: s0\n} } } */ +/* { dg-final { scan-assembler {\t// s7 out: s7\n.*[/]/ s7 in: s7\n} } } */ +/* { dg-final { scan-assembler {\t// s8 out: s8\n.*\tfmov\t(s[0-7]), s8\n.*[/]/ s8 in: \1\n} } } */ +/* { dg-final { scan-assembler {\t// s15 out: s15\n.*\tfmov\t(s[0-7]), s15\n.*[/]/ s15 in: \1\n} } } */ +/* { dg-final { scan-assembler {\t// s16 out: s16\n.*\tfmov\t(s[0-7]), s16\n.*[/]/ s16 in: \1\n} } } */ +/* { dg-final { scan-assembler {\t// s31 out: s31\n.*\tfmov\t(s[0-7]), s31\n.*[/]/ s31 in: \1\n} } } */ +/* { dg-final { scan-assembler-not {\t// s8 in: s8\n} } } */ +/* { dg-final { scan-assembler-not {\t// s15 in: s15\n} } } */ +/* { dg-final { scan-assembler-not {\t// s16 in: s16\n} } } */ +/* { dg-final { scan-assembler-not {\t// s31 in: s31\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c b/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c index 49ca5d0d0..a828a72aa 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -mno-outline-atomics" } */ #include "atomic-comp-swap-release-acquire.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c index 74f26348e..6823ce381 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-acq_rel.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c index 66c1b1efe..87937de37 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-acquire.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c index c09d0434e..60955e57d 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-char.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c index 5783ab84f..16cb11aee 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-consume.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c index 18b8f0b04..bcab4e481 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c +++ 
b/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ int v = 0; diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c index 8520f0839..040e4a8d1 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-int.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c index d011f8c5c..fc88b92cd 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ long v = 0; diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c index ed96bfdb9..503d62b02 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-relaxed.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c index fc4be17de..efe14aea7 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-release.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c index 613000fe4..09973bf82 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-seq_cst.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c index e82c8118e..e1dcebb0f 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "atomic-op-short.x" diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c index f2a21ddf2..29246979b 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -march=armv8-a+nolse" } */ +/* { dg-options "-O2 -march=armv8-a+nolse -mno-outline-atomics" } */ /* { dg-skip-if "" { *-*-* } { "-mcpu=*" } { "" } } */ int diff --git 
a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c index 8d2ae67df..6daf9b08f 100644 --- a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c +++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -march=armv8-a+nolse" } */ +/* { dg-options "-O2 -march=armv8-a+nolse -mno-outline-atomics" } */ /* { dg-skip-if "" { *-*-* } { "-mcpu=*" } { "" } } */ int diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c new file mode 100644 index 000000000..ef4376649 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c @@ -0,0 +1,102 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O3 --save-temps -std=gnu90" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +**stacktest1: +** sub sp, sp, #16 +** str h0, \[sp, 14\] +** ldr h0, \[sp, 14\] +** add sp, sp, 16 +** ret +*/ +bfloat16_t stacktest1 (bfloat16_t __a) +{ + volatile bfloat16_t b = __a; + return b; +} + +/* +**bfloat_mov_ww: +** mov v1.h\[0\], v2.h\[0\] +** ret +*/ +void bfloat_mov_ww (void) +{ + register bfloat16_t x asm ("h2"); + register bfloat16_t y asm ("h1"); + asm volatile ("" : "=w" (x)); + y = x; + asm volatile ("" :: "w" (y)); +} + +/* +**bfloat_mov_rw: +** dup v1.4h, w1 +** ret +*/ +void bfloat_mov_rw (void) +{ + register bfloat16_t x asm ("w1"); + register bfloat16_t y asm ("h1"); + asm volatile ("" : "=r" (x)); + y = x; + asm volatile ("" :: "w" (y)); +} + +/* +**bfloat_mov_wr: +** umov w1, v1.h\[0\] +** ret +*/ +void bfloat_mov_wr (void) +{ + register bfloat16_t x asm ("h1"); + register bfloat16_t y asm ("w1"); + asm volatile ("" : "=w" (x)); + y = x; + asm volatile ("" :: "r" (y)); +} + +/* +**bfloat_mov_rr: +** mov w1, w2 +** ret +*/ +void bfloat_mov_rr (void) +{ + register bfloat16_t x asm ("w2"); + register bfloat16_t y asm ("w1"); + asm volatile ("" : "=r" (x)); + y = x; + asm volatile ("" :: "r" (y)); +} + +/* +**bfloat_mov_rm: +** strh w2, \[x0\] +** ret +*/ +void bfloat_mov_rm (bfloat16_t *ptr) +{ + register bfloat16_t x asm ("w2"); + asm volatile ("" : "=r" (x)); + *ptr = x; +} + +/* +**bfloat_mov_mr: +** ldrh w2, \[x0\] +** ret +*/ +void bfloat_mov_mr (bfloat16_t *ptr) +{ + register bfloat16_t y asm ("w2"); + y = *ptr; + asm volatile ("" :: "r" (y)); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c new file mode 100644 index 000000000..df8e7518c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c @@ -0,0 +1,106 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +#pragma GCC push_options +#pragma GCC target ("+bf16") + +/* +**stacktest1: +** sub sp, sp, #16 +** str h0, \[sp, 14\] +** ldr h0, \[sp, 14\] +** add sp, sp, 16 +** ret +*/ +bfloat16_t stacktest1 (bfloat16_t __a) +{ + volatile bfloat16_t b = __a; + return b; +} + +/* +**bfloat_mov_ww: +** mov v1.h\[0\], v2.h\[0\] +** ret +*/ +void bfloat_mov_ww (void) +{ + register bfloat16_t x asm ("h2"); + register bfloat16_t y asm 
("h1"); + asm volatile ("" : "=w" (x)); + y = x; + asm volatile ("" :: "w" (y)); +} + +/* +**bfloat_mov_rw: +** dup v1.4h, w1 +** ret +*/ +void bfloat_mov_rw (void) +{ + register bfloat16_t x asm ("w1"); + register bfloat16_t y asm ("h1"); + asm volatile ("" : "=r" (x)); + y = x; + asm volatile ("" :: "w" (y)); +} + +/* +**bfloat_mov_wr: +** umov w1, v1.h\[0\] +** ret +*/ +void bfloat_mov_wr (void) +{ + register bfloat16_t x asm ("h1"); + register bfloat16_t y asm ("w1"); + asm volatile ("" : "=w" (x)); + y = x; + asm volatile ("" :: "r" (y)); +} + +/* +**bfloat_mov_rr: +** mov w1, w2 +** ret +*/ +void bfloat_mov_rr (void) +{ + register bfloat16_t x asm ("w2"); + register bfloat16_t y asm ("w1"); + asm volatile ("" : "=r" (x)); + y = x; + asm volatile ("" :: "r" (y)); +} + +/* +**bfloat_mov_rm: +** strh w2, \[x0\] +** ret +*/ +void bfloat_mov_rm (bfloat16_t *ptr) +{ + register bfloat16_t x asm ("w2"); + asm volatile ("" : "=r" (x)); + *ptr = x; +} + +/* +**bfloat_mov_mr: +** ldrh w2, \[x0\] +** ret +*/ +void bfloat_mov_mr (bfloat16_t *ptr) +{ + register bfloat16_t y asm ("w2"); + y = *ptr; + asm volatile ("" :: "r" (y)); +} + +#pragma GCC pop_options + diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c new file mode 100644 index 000000000..5d7a4317c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c @@ -0,0 +1,101 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +**stacktest1: +** sub sp, sp, #16 +** str h0, \[sp, 14\] +** ldr h0, \[sp, 14\] +** add sp, sp, 16 +** ret +*/ +bfloat16_t stacktest1 (bfloat16_t __a) +{ + volatile bfloat16_t b = __a; + return b; +} + +/* +**bfloat_mov_ww: +** mov v1.h\[0\], v2.h\[0\] +** ret +*/ +void bfloat_mov_ww (void) +{ + register bfloat16_t x asm ("h2"); + register bfloat16_t y asm ("h1"); + asm volatile ("" : "=w" (x)); + y = x; + asm volatile ("" :: "w" (y)); +} + +/* +**bfloat_mov_rw: +** dup v1.4h, w1 +** ret +*/ +void bfloat_mov_rw (void) +{ + register bfloat16_t x asm ("w1"); + register bfloat16_t y asm ("h1"); + asm volatile ("" : "=r" (x)); + y = x; + asm volatile ("" :: "w" (y)); +} + +/* +**bfloat_mov_wr: +** umov w1, v1.h\[0\] +** ret +*/ +void bfloat_mov_wr (void) +{ + register bfloat16_t x asm ("h1"); + register bfloat16_t y asm ("w1"); + asm volatile ("" : "=w" (x)); + y = x; + asm volatile ("" :: "r" (y)); +} + +/* +**bfloat_mov_rr: +** mov w1, w2 +** ret +*/ +void bfloat_mov_rr (void) +{ + register bfloat16_t x asm ("w2"); + register bfloat16_t y asm ("w1"); + asm volatile ("" : "=r" (x)); + y = x; + asm volatile ("" :: "r" (y)); +} + +/* +**bfloat_mov_rm: +** strh w2, \[x0\] +** ret +*/ +void bfloat_mov_rm (bfloat16_t *ptr) +{ + register bfloat16_t x asm ("w2"); + asm volatile ("" : "=r" (x)); + *ptr = x; +} + +/* +**bfloat_mov_mr: +** ldrh w2, \[x0\] +** ret +*/ +void bfloat_mov_mr (bfloat16_t *ptr) +{ + register bfloat16_t y asm ("w2"); + y = *ptr; + asm volatile ("" :: "r" (y)); +} + diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c new file mode 100644 index 000000000..b812011c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c @@ -0,0 +1,16 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target 
arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-std=c99 -pedantic-errors -O3 --save-temps" } */ + +#include + +_Complex bfloat16_t stacktest1 (_Complex bfloat16_t __a) +{ + volatile _Complex bfloat16_t b = __a; + return b; +} + +/* { dg-error {ISO C does not support plain 'complex' meaning 'double complex'} "" { target *-*-* } 8 } */ +/* { dg-error {expected '=', ',', ';', 'asm' or '__attribute__' before 'stacktest1'} "" { target *-*-* } 8 } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_typecheck.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_typecheck.c new file mode 100644 index 000000000..7c9188cf2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_typecheck.c @@ -0,0 +1,219 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-Wno-pedantic -O3 --save-temps" } */ + +#include + +bfloat16_t glob_bfloat; + +int is_an_int; +short is_a_short_int; +float is_a_float; +float is_a_float16; +double is_a_double; + +float *float_ptr; + +bfloat16_t foo1 (void) { return (bfloat16_t) 0x1234; } /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ +bfloat16_t foo2 (void) { return (bfloat16_t) (short) 0x1234; } /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + +bfloat16_t footest (bfloat16_t scalar0) +{ + + /* Initialisation */ + + bfloat16_t scalar1_1; + bfloat16_t scalar1_2 = glob_bfloat; + bfloat16_t scalar1_3 = 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar1_4 = 0.1; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar1_5 = is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar1_6 = is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar1_7 = is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar1_8 = is_a_double; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar1_9 = is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + + int initi_1_1 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float initi_1_2 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float16_t initi_1_3 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + short initi_1_4 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + double initi_1_5 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + + bfloat16_t scalar2_1 = {}; /* { dg-error {empty scalar initializer} } */ + bfloat16_t scalar2_2 = { glob_bfloat }; + bfloat16_t scalar2_3 = { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar2_4 = { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar2_5 = { is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar2_6 = { is_an_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar2_7 = { is_a_float16 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar2_8 = { is_a_double }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16_t scalar2_9 = { is_a_short_int }; /* { 
dg-error {invalid conversion to type 'bfloat16_t'} } */ + + int initi_2_1 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float initi_2_2 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float16_t initi_2_3 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + short initi_2_4 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + double initi_2_5 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + + /* Assignments. */ + + glob_bfloat = glob_bfloat; + glob_bfloat = 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + glob_bfloat = 0.1; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + glob_bfloat = is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + glob_bfloat = is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + glob_bfloat = is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + glob_bfloat = is_a_double; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + glob_bfloat = is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + + is_an_int = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_float = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_float16 = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_double = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_short_int = glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + + /* Casting. */ + + (void) glob_bfloat; + (bfloat16_t) glob_bfloat; + + (int) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + (float) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + (float16_t) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + (double) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + (short) glob_bfloat; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + + (bfloat16_t) is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) is_a_double; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + + /* Compound literals. 
*/ + + (bfloat16_t) {}; /* { dg-error {empty scalar initializer} } */ + (bfloat16_t) { glob_bfloat }; + (bfloat16_t) { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) { is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) { is_an_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) { is_a_float16 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) { is_a_double }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16_t) { is_a_short_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + + (int) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + (float) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + (float16_t) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + (double) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + (short) { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + + /* Arrays and Structs. */ + + typedef bfloat16_t array_type[2]; + extern bfloat16_t extern_array[]; + + bfloat16_t array[2]; + bfloat16_t zero_length_array[0]; + bfloat16_t empty_init_array[] = {}; + typedef bfloat16_t some_other_type[is_an_int]; + + struct struct1 { + bfloat16_t a; + }; + + union union1 { + bfloat16_t a; + }; + + /* Addressing and dereferencing. */ + + bfloat16_t *bfloat_ptr = &scalar0; + scalar0 = *bfloat_ptr; + + /* Pointer assignment. */ + + bfloat16_t *bfloat_ptr2 = bfloat_ptr; + bfloat16_t *bfloat_ptr3 = array; + + /* Pointer arithmetic. */ + + ++bfloat_ptr; + --bfloat_ptr; + bfloat_ptr++; + bfloat_ptr--; + bfloat_ptr += 1; + bfloat_ptr -= 1; + bfloat_ptr - bfloat_ptr2; + bfloat_ptr = &bfloat_ptr3[0]; + bfloat_ptr = &bfloat_ptr3[1]; + + /* Simple comparison. */ + scalar0 > glob_bfloat; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + glob_bfloat == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 > is_a_float; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + is_a_float == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 > 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + 0 == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 > 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + 0.1 == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 > is_an_int; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + is_an_int == scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + /* Pointer comparison. */ + + bfloat_ptr == &scalar0; + bfloat_ptr != &scalar0; + bfloat_ptr < &scalar0; + bfloat_ptr <= &scalar0; + bfloat_ptr > &scalar0; + bfloat_ptr >= &scalar0; + bfloat_ptr == bfloat_ptr2; + bfloat_ptr != bfloat_ptr2; + bfloat_ptr < bfloat_ptr2; + bfloat_ptr <= bfloat_ptr2; + bfloat_ptr > bfloat_ptr2; + bfloat_ptr >= bfloat_ptr2; + + /* Conditional expressions. */ + + 0 ? scalar0 : scalar0; + 0 ? scalar0 : is_a_float; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + 0 ? is_a_float : scalar0; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + 0 ? scalar0 : 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + 0 ? 
0 : scalar0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + 0 ? 0.1 : scalar0; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + 0 ? scalar0 : 0.1; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + 0 ? bfloat_ptr : bfloat_ptr2; + 0 ? bfloat_ptr : float_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ + 0 ? float_ptr : bfloat_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ + + scalar0 ? scalar0 : scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 ? is_a_float : scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 ? scalar0 : is_a_float; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 ? is_a_float : is_a_float; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + /* Unary operators. */ + + +scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + -scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + ~scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + !scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + *scalar0; /* { dg-error {invalid type argument of unary '\*'} } */ + __real scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + __imag scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + ++scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + --scalar0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0++; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0--; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + /* Binary arithmetic operations. */ + + scalar0 = glob_bfloat + *bfloat_ptr; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 = glob_bfloat + 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 = glob_bfloat + 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + scalar0 = glob_bfloat + is_a_float; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + return scalar0; +} + diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c new file mode 100644 index 000000000..6cad557eb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c @@ -0,0 +1,93 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O3 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +**stacktest1: +** sub sp, sp, #16 +** str h0, \[sp, 14\] +** ldr h0, \[sp, 14\] +** add sp, sp, 16 +** ret +*/ +bfloat16_t stacktest1 (bfloat16_t __a) +{ + volatile bfloat16_t b = __a; + return b; +} + +/* +**stacktest2: +** sub sp, sp, #16 +** str d0, \[sp, 8\] +** ldr d0, \[sp, 8\] +** add sp, sp, 16 +** ret +*/ +bfloat16x4_t stacktest2 (bfloat16x4_t __a) +{ + volatile bfloat16x4_t b = __a; + return b; +} + +/* +**stacktest3: +** sub sp, sp, #16 +** str q0, \[sp\] +** ldr q0, \[sp\] +** add sp, sp, 16 +** ret +*/ +bfloat16x8_t stacktest3 (bfloat16x8_t __a) +{ + volatile bfloat16x8_t b = __a; + return b; +} + +/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats. 
*/ +typedef bfloat16_t v8bf __attribute__((vector_size(16))); +typedef bfloat16_t v16bf __attribute__((vector_size(32))); +typedef bfloat16_t v32bf __attribute__((vector_size(64))); +typedef bfloat16_t v64bf __attribute__((vector_size(128))); +typedef bfloat16_t v128bf __attribute__((vector_size(256))); + +v8bf stacktest4 (v8bf __a) +{ + volatile v8bf b = __a; + return b; +} + +v16bf stacktest5 (v16bf __a) +{ + volatile v16bf b = __a; + return b; +} + +v32bf stacktest6 (v32bf __a) +{ + volatile v32bf b = __a; + return b; +} + +v64bf stacktest7 (v64bf __a) +{ + volatile v64bf b = __a; + return b; +} + +v128bf stacktest8 (v128bf __a) +{ + volatile v128bf b = __a; + return b; +} + +/* Test use of constant values to assign values to vectors. */ + +typedef bfloat16_t v2bf __attribute__((vector_size(4))); +v2bf c2 (void) { return (v2bf) 0x12345678; } + +bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; } diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c new file mode 100644 index 000000000..3891dcfc9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c @@ -0,0 +1,97 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +#pragma GCC push_options +#pragma GCC target ("+bf16") + +/* +**stacktest1: +** sub sp, sp, #16 +** str h0, \[sp, 14\] +** ldr h0, \[sp, 14\] +** add sp, sp, 16 +** ret +*/ +bfloat16_t stacktest1 (bfloat16_t __a) +{ + volatile bfloat16_t b = __a; + return b; +} + +/* +**stacktest2: +** sub sp, sp, #16 +** str d0, \[sp, 8\] +** ldr d0, \[sp, 8\] +** add sp, sp, 16 +** ret +*/ +bfloat16x4_t stacktest2 (bfloat16x4_t __a) +{ + volatile bfloat16x4_t b = __a; + return b; +} + +/* +**stacktest3: +** sub sp, sp, #16 +** str q0, \[sp\] +** ldr q0, \[sp\] +** add sp, sp, 16 +** ret +*/ +bfloat16x8_t stacktest3 (bfloat16x8_t __a) +{ + volatile bfloat16x8_t b = __a; + return b; +} + +/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats. */ +typedef bfloat16_t v8bf __attribute__((vector_size(16))); +typedef bfloat16_t v16bf __attribute__((vector_size(32))); +typedef bfloat16_t v32bf __attribute__((vector_size(64))); +typedef bfloat16_t v64bf __attribute__((vector_size(128))); +typedef bfloat16_t v128bf __attribute__((vector_size(256))); + +v8bf stacktest4 (v8bf __a) +{ + volatile v8bf b = __a; + return b; +} + +v16bf stacktest5 (v16bf __a) +{ + volatile v16bf b = __a; + return b; +} + +v32bf stacktest6 (v32bf __a) +{ + volatile v32bf b = __a; + return b; +} + +v64bf stacktest7 (v64bf __a) +{ + volatile v64bf b = __a; + return b; +} + +v128bf stacktest8 (v128bf __a) +{ + volatile v128bf b = __a; + return b; +} + +/* Test use of constant values to assign values to vectors. 
*/ + +typedef bfloat16_t v2bf __attribute__((vector_size(4))); +v2bf c2 (void) { return (v2bf) 0x12345678; } + +bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; } + +#pragma GCC pop_options diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c new file mode 100644 index 000000000..b35f5e527 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c @@ -0,0 +1,92 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +**stacktest1: +** sub sp, sp, #16 +** str h0, \[sp, 14\] +** ldr h0, \[sp, 14\] +** add sp, sp, 16 +** ret +*/ +bfloat16_t stacktest1 (bfloat16_t __a) +{ + volatile bfloat16_t b = __a; + return b; +} + +/* +**stacktest2: +** sub sp, sp, #16 +** str d0, \[sp, 8\] +** ldr d0, \[sp, 8\] +** add sp, sp, 16 +** ret +*/ +bfloat16x4_t stacktest2 (bfloat16x4_t __a) +{ + volatile bfloat16x4_t b = __a; + return b; +} + +/* +**stacktest3: +** sub sp, sp, #16 +** str q0, \[sp\] +** ldr q0, \[sp\] +** add sp, sp, 16 +** ret +*/ +bfloat16x8_t stacktest3 (bfloat16x8_t __a) +{ + volatile bfloat16x8_t b = __a; + return b; +} + +/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats. */ +typedef bfloat16_t v8bf __attribute__((vector_size(16))); +typedef bfloat16_t v16bf __attribute__((vector_size(32))); +typedef bfloat16_t v32bf __attribute__((vector_size(64))); +typedef bfloat16_t v64bf __attribute__((vector_size(128))); +typedef bfloat16_t v128bf __attribute__((vector_size(256))); + +v8bf stacktest4 (v8bf __a) +{ + volatile v8bf b = __a; + return b; +} + +v16bf stacktest5 (v16bf __a) +{ + volatile v16bf b = __a; + return b; +} + +v32bf stacktest6 (v32bf __a) +{ + volatile v32bf b = __a; + return b; +} + +v64bf stacktest7 (v64bf __a) +{ + volatile v64bf b = __a; + return b; +} + +v128bf stacktest8 (v128bf __a) +{ + volatile v128bf b = __a; + return b; +} + +/* Test use of constant values to assign values to vectors. */ + +typedef bfloat16_t v2bf __attribute__((vector_size(4))); +v2bf c2 (void) { return (v2bf) 0x12345678; } + +bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; } diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_1.c new file mode 100644 index 000000000..4af3d295f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_1.c @@ -0,0 +1,262 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O3 --save-temps -Wno-pedantic" } */ +#include + +bfloat16_t glob_bfloat; +bfloat16x4_t glob_bfloat_vec; + +float32x4_t is_a_float_vec; +float32x2_t is_a_float_pair; + +float16x4_t *float_ptr; +float16x4_t is_a_float16_vec; + +int32x4_t is_an_int_vec; +int32x2_t is_an_int_pair; +int16x4_t is_a_short_vec; + +int is_an_int; +short is_a_short_int; +float is_a_float; +float is_a_float16; +double is_a_double; + +/* Create a vector of 2 bfloat16_t. 
*/ +typedef bfloat16_t v2bf __attribute__((vector_size(4))); +v2bf foo1 (void) { return (v2bf) 0x12345678; } +bfloat16x4_t foo2 (void) { return (bfloat16x4_t) 0x1234567812345678; } + +bfloat16x4_t footest (bfloat16x4_t vector0) +{ + /* Initialisation */ + + bfloat16x4_t vector1_1; + bfloat16x4_t vector1_2 = glob_bfloat_vec; + bfloat16x4_t vector1_3 = is_a_float_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'float32x4_t'} } */ + bfloat16x4_t vector1_4 = is_an_int_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'int32x4_t'} } */ + bfloat16x4_t vector1_5 = is_a_float16_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'float16x4_t'} } */ + bfloat16x4_t vector1_6 = is_a_float_pair; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'float32x2_t'} } */ + bfloat16x4_t vector1_7 = is_an_int_pair; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'int32x2_t'} } */ + bfloat16x4_t vector1_8 = is_a_short_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x4_t' using type 'int16x4_t'} } */ + + int32x4_t initi_1_1 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int32x4_t' using type 'bfloat16x4_t'} } */ + float32x4_t initi_1_2 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float32x4_t' using type 'bfloat16x4_t'} } */ + float16x4_t initi_1_3 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float16x4_t' using type 'bfloat16x4_t'} } */ + float32x2_t initi_1_4 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float32x2_t' using type 'bfloat16x4_t'} } */ + int32x2_t initi_1_5 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int32x2_t' using type 'bfloat16x4_t'} } */ + int16x4_t initi_1_6 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int16x4_t' using type 'bfloat16x4_t'} } */ + + bfloat16x4_t vector2_1 = {}; + bfloat16x4_t vector2_2 = { glob_bfloat }; + bfloat16x4_t vector2_3 = { glob_bfloat, glob_bfloat, glob_bfloat, glob_bfloat }; + bfloat16x4_t vector2_4 = { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x4_t vector2_5 = { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x4_t vector2_6 = { is_a_float16 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x4_t vector2_7 = { is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x4_t vector2_8 = { is_an_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x4_t vector2_9 = { is_a_short_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x4_t vector2_10 = { 0.0, 0, is_a_short_int, is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + + int32x4_t initi_2_1 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float32x4_t initi_2_2 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float16x4_t initi_2_3 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float32x2_t initi_2_4 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + int32x2_t initi_2_5 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + int16x4_t initi_2_6 = { glob_bfloat }; /* { dg-error 
{invalid conversion from type 'bfloat16_t'} } */ + + /* Assignments to/from vectors. */ + + glob_bfloat_vec = glob_bfloat_vec; + glob_bfloat_vec = 0; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'int'} } */ + glob_bfloat_vec = 0.1; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'double'} } */ + glob_bfloat_vec = is_a_float_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'float32x4_t'} } */ + glob_bfloat_vec = is_an_int_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'int32x4_t'} } */ + glob_bfloat_vec = is_a_float16_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'float16x4_t'} } */ + glob_bfloat_vec = is_a_float_pair; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'float32x2_t'} } */ + glob_bfloat_vec = is_an_int_pair; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'int32x2_t'} } */ + glob_bfloat_vec = is_a_short_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x4_t' from type 'int16x4_t'} } */ + + is_an_int_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'int32x4_t' from type 'bfloat16x4_t'} } */ + is_a_float_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float32x4_t' from type 'bfloat16x4_t'} } */ + is_a_float16_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float16x4_t' from type 'bfloat16x4_t'} } */ + is_a_float_pair = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float32x2_t' from type 'bfloat16x4_t'} } */ + is_an_int_pair = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'int32x2_t' from type 'bfloat16x4_t'} } */ + is_a_short_vec = glob_bfloat_vec;/* { dg-error {incompatible types when assigning to type 'int16x4_t' from type 'bfloat16x4_t'} } */ + + /* Assignments to/from elements. */ + + vector2_3[0] = glob_bfloat; + vector2_3[0] = is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = 0.1; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + + glob_bfloat = vector2_3[0]; + is_an_int = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_short_int = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_float = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_float16 = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + + /* Compound literals. 
*/ + + (bfloat16x4_t) {}; + + (bfloat16x4_t) { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16x4_t) { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16x4_t) { is_a_float_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float32x4_t'} } */ + (bfloat16x4_t) { is_an_int_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int32x4_t'} } */ + (bfloat16x4_t) { is_a_float_pair }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float32x2_t'} } */ + (bfloat16x4_t) { is_an_int_pair }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int32x2_t'} } */ + (bfloat16x4_t) { is_a_float16_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float16x4_t'} } */ + (bfloat16x4_t) { is_a_short_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int16x4_t'} } */ + + (bfloat16x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'bfloat16x4_t'} } */ + (int32x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'int' using type 'bfloat16x4_t'} } */ + (float32x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'float' using type 'bfloat16x4_t'} } */ + (int32x2_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'int' using type 'bfloat16x4_t'} } */ + (float16x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type '__fp16' using type 'bfloat16x4_t'} } */ + (int16x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'short int' using type 'bfloat16x4_t'} } */ + + /* Casting. */ + + (void) glob_bfloat_vec; + (bfloat16x4_t) glob_bfloat_vec; + + (bfloat16_t) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ + (short) glob_bfloat_vec; /* { dg-error {can't convert a vector of type 'bfloat16x4_t' to type 'short int' which has different size} } */ + (int) glob_bfloat_vec; /* { dg-error {can't convert a vector of type 'bfloat16x4_t' to type 'int' which has different size} } */ + (float16_t) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ + (float) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ + (double) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ + + (int32x4_t) glob_bfloat_vec; /* { dg-error {can't convert a value of type 'bfloat16x4_t' to vector type '__Int32x4_t' which has different size} } */ + (float32x4_t) glob_bfloat_vec; /* { dg-error {can't convert a value of type 'bfloat16x4_t' to vector type '__Float32x4_t' which has different size} } */ + (float16x4_t) glob_bfloat_vec; + (int32x2_t) glob_bfloat_vec; + (float32x2_t) glob_bfloat_vec; + (int16x4_t) glob_bfloat_vec; + + (bfloat16x4_t) is_an_int_vec; /* { dg-error {can't convert a value of type 'int32x4_t' to vector type '__Bfloat16x4_t' which has different size} } */ + (bfloat16x4_t) is_a_float_vec; /* { dg-error {can't convert a value of type 'float32x4_t' to vector type '__Bfloat16x4_t' which has different size} } */ + (bfloat16x4_t) is_a_float16_vec; + (bfloat16x4_t) is_an_int_pair; + (bfloat16x4_t) is_a_float_pair; + (bfloat16x4_t) is_a_short_vec; + (bfloat16x4_t) is_a_double; /* { dg-error {can't convert value to a vector} } */ + + /* Arrays and Structs. 
*/ + + typedef bfloat16x4_t array_type[2]; + extern bfloat16x4_t extern_array[]; + + bfloat16x4_t array[2]; + bfloat16x4_t zero_length_array[0]; + bfloat16x4_t empty_init_array[] = {}; + typedef bfloat16x4_t some_other_type[is_an_int]; + + struct struct1 { + bfloat16x4_t a; + }; + + union union1 { + bfloat16x4_t a; + }; + + /* Addressing and dereferencing. */ + + bfloat16x4_t *bfloat_ptr = &vector0; + vector0 = *bfloat_ptr; + + /* Pointer assignment. */ + + bfloat16x4_t *bfloat_ptr2 = bfloat_ptr; + bfloat16x4_t *bfloat_ptr3 = array; + + /* Pointer arithmetic. */ + + ++bfloat_ptr; + --bfloat_ptr; + bfloat_ptr++; + bfloat_ptr--; + bfloat_ptr += 1; + bfloat_ptr -= 1; + bfloat_ptr - bfloat_ptr2; + bfloat_ptr = &bfloat_ptr3[0]; + bfloat_ptr = &bfloat_ptr3[1]; + + /* Simple comparison. */ + vector0 > glob_bfloat_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + glob_bfloat_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 > is_a_float_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + is_a_float_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 > 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + 0 == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 > 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + 0.1 == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 > is_an_int_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + is_an_int_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + /* Pointer comparison. */ + + bfloat_ptr == &vector0; + bfloat_ptr != &vector0; + bfloat_ptr < &vector0; + bfloat_ptr <= &vector0; + bfloat_ptr > &vector0; + bfloat_ptr >= &vector0; + bfloat_ptr == bfloat_ptr2; + bfloat_ptr != bfloat_ptr2; + bfloat_ptr < bfloat_ptr2; + bfloat_ptr <= bfloat_ptr2; + bfloat_ptr > bfloat_ptr2; + bfloat_ptr >= bfloat_ptr2; + + /* Conditional expressions. */ + + 0 ? vector0 : vector0; + 0 ? vector0 : is_a_float_vec; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? is_a_float_vec : vector0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? vector0 : is_a_float16_vec; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? is_a_float16_vec : vector0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? vector0 : 0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? 0 : vector0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? 0.1 : vector0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? vector0 : 0.1; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? bfloat_ptr : bfloat_ptr2; + 0 ? bfloat_ptr : float_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ + 0 ? float_ptr : bfloat_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ + + vector0 ? vector0 : vector0; /* { dg-error {used vector type where scalar is required} } */ + vector0 ? is_a_float16_vec : vector0; /* { dg-error {used vector type where scalar is required} } */ + vector0 ? vector0 : is_a_float16_vec; /* { dg-error {used vector type where scalar is required} } */ + vector0 ? is_a_float16_vec : is_a_float16_vec; /* { dg-error {used vector type where scalar is required} } */ + + /* Unary operators. 
*/ + + +vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + -vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + ~vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + !vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + *vector0; /* { dg-error {invalid type argument of unary '\*'} } */ + __real vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + __imag vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + ++vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + --vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0++; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0--; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + /* Binary arithmetic operations. */ + + vector0 = glob_bfloat_vec + *bfloat_ptr; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 = glob_bfloat_vec + 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 = glob_bfloat_vec + 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 = glob_bfloat_vec + is_a_float_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + return vector0; +} + diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_2.c new file mode 100644 index 000000000..99c499ce8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_vector_typecheck_2.c @@ -0,0 +1,260 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-O3 --save-temps -Wno-pedantic" } */ +#include + +bfloat16_t glob_bfloat; +bfloat16x8_t glob_bfloat_vec; + +float32x4_t is_a_float_vec; +float64x2_t is_a_double_pair; + +float16x8_t *float_ptr; +float16x8_t is_a_float16_vec; + +int32x4_t is_an_int_vec; +int64x2_t is_a_long_int_pair; +int16x8_t is_a_short_vec; + +int is_an_int; +short is_a_short_int; +float is_a_float; +float is_a_float16; +double is_a_double; + +bfloat16x8_t foo3 (void) { return (bfloat16x8_t) 0x12345678123456781234567812345678; } + /* { dg-error {integer constant is too large for its type} "" {target *-*-*} 27 } */ + /* { dg-error {can't convert a value of type 'long int' to vector type '__Bfloat16x8_t' which has different size} "" {target *-*-*} 27 } */ + +bfloat16x8_t footest (bfloat16x8_t vector0) +{ + /* Initialisation */ + + bfloat16x8_t vector1_1; + bfloat16x8_t vector1_2 = glob_bfloat_vec; + bfloat16x8_t vector1_3 = is_a_float_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'float32x4_t'} } */ + bfloat16x8_t vector1_4 = is_an_int_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'int32x4_t'} } */ + bfloat16x8_t vector1_5 = is_a_float16_vec; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'float16x8_t'} } */ + bfloat16x8_t vector1_6 = is_a_double_pair; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'float64x2_t'} } */ + bfloat16x8_t vector1_7 = is_a_long_int_pair; /* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'int64x2_t'} } */ + bfloat16x8_t vector1_8 = is_a_short_vec; 
/* { dg-error {incompatible types when initializing type 'bfloat16x8_t' using type 'int16x8_t'} } */ + + int32x4_t initi_1_1 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int32x4_t' using type 'bfloat16x8_t'} } */ + float32x4_t initi_1_2 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float32x4_t' using type 'bfloat16x8_t'} } */ + float16x8_t initi_1_3 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float16x8_t' using type 'bfloat16x8_t'} } */ + float64x2_t initi_1_4 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'float64x2_t' using type 'bfloat16x8_t'} } */ + int64x2_t initi_1_5 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int64x2_t' using type 'bfloat16x8_t'} } */ + int16x8_t initi_1_6 = glob_bfloat_vec; /* { dg-error {incompatible types when initializing type 'int16x8_t' using type 'bfloat16x8_t'} } */ + + bfloat16x8_t vector2_1 = {}; + bfloat16x8_t vector2_2 = { glob_bfloat }; + bfloat16x8_t vector2_3 = { glob_bfloat, glob_bfloat, glob_bfloat, glob_bfloat }; + bfloat16x8_t vector2_4 = { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x8_t vector2_5 = { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x8_t vector2_6 = { is_a_float16 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x8_t vector2_7 = { is_a_float }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x8_t vector2_8 = { is_an_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x8_t vector2_9 = { is_a_short_int }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + bfloat16x8_t vector2_10 = { 0.0, 0, is_a_short_int, is_a_float }; /* { dg-error "invalid conversion to type 'bfloat16_t'" } */ + + int32x4_t initi_2_1 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float32x4_t initi_2_2 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float16x8_t initi_2_3 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + float64x2_t initi_2_4 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + int64x2_t initi_2_5 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + int16x8_t initi_2_6 = { glob_bfloat }; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + + /* Assignments to/from vectors. 
*/ + + glob_bfloat_vec = glob_bfloat_vec; + glob_bfloat_vec = 0; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'int'} } */ + glob_bfloat_vec = 0.1; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'double'} } */ + glob_bfloat_vec = is_a_float_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'float32x4_t'} } */ + glob_bfloat_vec = is_an_int_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'int32x4_t'} } */ + glob_bfloat_vec = is_a_float16_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'float16x8_t'} } */ + glob_bfloat_vec = is_a_double_pair; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'float64x2_t'} } */ + glob_bfloat_vec = is_a_long_int_pair; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'int64x2_t'} } */ + glob_bfloat_vec = is_a_short_vec; /* { dg-error {incompatible types when assigning to type 'bfloat16x8_t' from type 'int16x8_t'} } */ + + is_an_int_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'int32x4_t' from type 'bfloat16x8_t'} } */ + is_a_float_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float32x4_t' from type 'bfloat16x8_t'} } */ + is_a_float16_vec = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float16x8_t' from type 'bfloat16x8_t'} } */ + is_a_double_pair = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'float64x2_t' from type 'bfloat16x8_t'} } */ + is_a_long_int_pair = glob_bfloat_vec; /* { dg-error {incompatible types when assigning to type 'int64x2_t' from type 'bfloat16x8_t'} } */ + is_a_short_vec = glob_bfloat_vec;/* { dg-error {incompatible types when assigning to type 'int16x8_t' from type 'bfloat16x8_t'} } */ + + /* Assignments to/from elements. */ + + vector2_3[0] = glob_bfloat; + vector2_3[0] = is_an_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = is_a_short_int; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = is_a_float; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = is_a_float16; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = 0; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + vector2_3[0] = 0.1; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + + glob_bfloat = vector2_3[0]; + is_an_int = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_short_int = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_float = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + is_a_float16 = vector2_3[0]; /* { dg-error {invalid conversion from type 'bfloat16_t'} } */ + + /* Compound literals. 
*/ + + (bfloat16x8_t) {}; + + (bfloat16x8_t) { 0 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16x8_t) { 0.1 }; /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + (bfloat16x8_t) { is_a_float_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float32x4_t'} } */ + (bfloat16x8_t) { is_an_int_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int32x4_t'} } */ + (bfloat16x8_t) { is_a_double_pair }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float64x2_t'} } */ + (bfloat16x8_t) { is_a_long_int_pair }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int64x2_t'} } */ + (bfloat16x8_t) { is_a_float16_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'float16x8_t'} } */ + (bfloat16x8_t) { is_a_short_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'int16x8_t'} } */ + + (bfloat16x8_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type '__bf16' using type 'bfloat16x8_t'} } */ + (int32x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'int' using type 'bfloat16x8_t'} } */ + (float32x4_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'float' using type 'bfloat16x8_t'} } */ + (int64x2_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'long int' using type 'bfloat16x8_t'} } */ + (float16x8_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type '__fp16' using type 'bfloat16x8_t'} } */ + (int16x8_t) { glob_bfloat_vec }; /* { dg-error {incompatible types when initializing type 'short int' using type 'bfloat16x8_t'} } */ + + /* Casting. */ + + (void) glob_bfloat_vec; + (bfloat16x8_t) glob_bfloat_vec; + + (bfloat16_t) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ + (short) glob_bfloat_vec; /* { dg-error {can't convert a vector of type 'bfloat16x8_t' to type 'short int' which has different size} } */ + (int) glob_bfloat_vec; /* { dg-error {can't convert a vector of type 'bfloat16x8_t' to type 'int' which has different size} } */ + (float16_t) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ + (float) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ + (double) glob_bfloat_vec; /* { dg-error {aggregate value used where a float was expected} } */ + + (int32x4_t) glob_bfloat_vec; + (float32x4_t) glob_bfloat_vec; + (float16x8_t) glob_bfloat_vec; + (int64x2_t) glob_bfloat_vec; + (float64x2_t) glob_bfloat_vec; + (int16x8_t) glob_bfloat_vec; + + (bfloat16x8_t) is_an_int_vec; + (bfloat16x8_t) is_a_float_vec; + (bfloat16x8_t) is_a_float16_vec; + (bfloat16x8_t) is_a_long_int_pair; + (bfloat16x8_t) is_a_double_pair; + (bfloat16x8_t) is_a_short_vec; + + /* Arrays and Structs. */ + + typedef bfloat16x8_t array_type[2]; + extern bfloat16x8_t extern_array[]; + + bfloat16x8_t array[2]; + bfloat16x8_t zero_length_array[0]; + bfloat16x8_t empty_init_array[] = {}; + typedef bfloat16x8_t some_other_type[is_an_int]; + + struct struct1 { + bfloat16x8_t a; + }; + + union union1 { + bfloat16x8_t a; + }; + + /* Addressing and dereferencing. */ + + bfloat16x8_t *bfloat_ptr = &vector0; + vector0 = *bfloat_ptr; + + /* Pointer assignment. */ + + bfloat16x8_t *bfloat_ptr2 = bfloat_ptr; + bfloat16x8_t *bfloat_ptr3 = array; + + /* Pointer arithmetic. 
*/ + + ++bfloat_ptr; + --bfloat_ptr; + bfloat_ptr++; + bfloat_ptr--; + bfloat_ptr += 1; + bfloat_ptr -= 1; + bfloat_ptr - bfloat_ptr2; + bfloat_ptr = &bfloat_ptr3[0]; + bfloat_ptr = &bfloat_ptr3[1]; + + /* Simple comparison. */ + vector0 > glob_bfloat_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + glob_bfloat_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 > is_a_float_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + is_a_float_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 > 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + 0 == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 > 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + 0.1 == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 > is_an_int_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + is_an_int_vec == vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + /* Pointer comparison. */ + + bfloat_ptr == &vector0; + bfloat_ptr != &vector0; + bfloat_ptr < &vector0; + bfloat_ptr <= &vector0; + bfloat_ptr > &vector0; + bfloat_ptr >= &vector0; + bfloat_ptr == bfloat_ptr2; + bfloat_ptr != bfloat_ptr2; + bfloat_ptr < bfloat_ptr2; + bfloat_ptr <= bfloat_ptr2; + bfloat_ptr > bfloat_ptr2; + bfloat_ptr >= bfloat_ptr2; + + /* Conditional expressions. */ + + 0 ? vector0 : vector0; + 0 ? vector0 : is_a_float_vec; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? is_a_float_vec : vector0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? vector0 : is_a_float16_vec; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? is_a_float16_vec : vector0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? vector0 : 0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? 0 : vector0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? 0.1 : vector0; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? vector0 : 0.1; /* { dg-error {type mismatch in conditional expression} } */ + 0 ? bfloat_ptr : bfloat_ptr2; + 0 ? bfloat_ptr : float_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ + 0 ? float_ptr : bfloat_ptr; /* { dg-error {pointer type mismatch in conditional expression} } */ + + vector0 ? vector0 : vector0; /* { dg-error {used vector type where scalar is required} } */ + vector0 ? is_a_float16_vec : vector0; /* { dg-error {used vector type where scalar is required} } */ + vector0 ? vector0 : is_a_float16_vec; /* { dg-error {used vector type where scalar is required} } */ + vector0 ? is_a_float16_vec : is_a_float16_vec; /* { dg-error {used vector type where scalar is required} } */ + + /* Unary operators. 
*/ + + +vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + -vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + ~vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + !vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + *vector0; /* { dg-error {invalid type argument of unary '\*'} } */ + __real vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + __imag vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + ++vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + --vector0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0++; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0--; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + /* Binary arithmetic operations. */ + + vector0 = glob_bfloat_vec + *bfloat_ptr; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 = glob_bfloat_vec + 0.1; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 = glob_bfloat_vec + 0; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + vector0 = glob_bfloat_vec + is_a_float_vec; /* { dg-error {operation not permitted on type 'bfloat16_t'} } */ + + return vector0; +} + diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c deleted file mode 100644 index b12df2d3e..000000000 --- a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c +++ /dev/null @@ -1,45 +0,0 @@ -/* { dg-do compile } */ -/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */ - -#include - -#define AESE(r, v, key) (r = vaesdq_u8 ((v), (key))); -#define AESMC(r, i) (r = vaesimcq_u8 (i)) - -uint8x16_t dummy; -uint8x16_t a; -uint8x16_t b; -uint8x16_t c; -uint8x16_t d; -uint8x16_t e; - -void -foo (void) -{ - AESE (a, a, e); - dummy = vaddq_u8 (dummy, dummy); - dummy = vaddq_u8 (dummy, dummy); - AESE (b, b, e); - dummy = vaddq_u8 (dummy, dummy); - dummy = vaddq_u8 (dummy, dummy); - AESE (c, c, e); - dummy = vaddq_u8 (dummy, dummy); - dummy = vaddq_u8 (dummy, dummy); - AESE (d, d, e); - dummy = vaddq_u8 (dummy, dummy); - dummy = vaddq_u8 (dummy, dummy); - - AESMC (a, a); - dummy = vaddq_u8 (dummy, dummy); - dummy = vaddq_u8 (dummy, dummy); - AESMC (b, b); - dummy = vaddq_u8 (dummy, dummy); - dummy = vaddq_u8 (dummy, dummy); - AESMC (c, c); - dummy = vaddq_u8 (dummy, dummy); - dummy = vaddq_u8 (dummy, dummy); - AESMC (d, d); -} - -/* { dg-final { scan-assembler-times "crypto_aesd_fused" 4 } } */ - diff --git a/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c b/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c new file mode 100644 index 000000000..59e24f48b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/diag_aka_1.c @@ -0,0 +1,14 @@ +#include + +typedef int16x4_t myvec; + +void f (float x) +{ + __Int8x8_t y1 = x; /* { dg-error {incompatible types when initializing type '__Int8x8_t' using type 'float'} } */ + __Int8x8_t *ptr1 = &x; /* { dg-error {initialization of '__Int8x8_t \*' from incompatible pointer type 'float \*'} } */ + int8x8_t y2 = x; /* { dg-error {incompatible types when initializing type 'int8x8_t' using type 'float'} } */ + int8x8_t *ptr2 = &x; /* { dg-error {initialization of 'int8x8_t \*' from incompatible pointer type 'float \*'} } */ + /* ??? For these it would be better to print an aka for 'int16x4_t'. 
*/ + myvec y3 = x; /* { dg-error {incompatible types when initializing type 'myvec' using type 'float'} } */ + myvec *ptr3 = &x; /* { dg-error {initialization of 'myvec \*' from incompatible pointer type 'float \*'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c b/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c new file mode 100644 index 000000000..8bfe06ac3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/fmul_scvtf_1.c @@ -0,0 +1,140 @@ +/* { dg-do run } */ +/* { dg-options "-save-temps -O2 -fno-inline" } */ + +#define FUNC_DEFS(__a) \ +float \ +fsfoo##__a (int x) \ +{ \ + return ((float) x)/(1lu << __a); \ +} \ +float \ +fusfoo##__a (unsigned int x) \ +{ \ + return ((float) x)/(1lu << __a); \ +} \ +float \ +fslfoo##__a (long long x) \ +{ \ + return ((float) x)/(1lu << __a); \ +} \ +float \ +fulfoo##__a (unsigned long long x) \ +{ \ + return ((float) x)/(1lu << __a); \ +} \ + +#define FUNC_DEFD(__a) \ +double \ +dsfoo##__a (int x) \ +{ \ + return ((double) x)/(1lu << __a); \ +} \ +double \ +dusfoo##__a (unsigned int x) \ +{ \ + return ((double) x)/(1lu << __a); \ +} \ +double \ +dslfoo##__a (long long x) \ +{ \ + return ((double) x)/(1lu << __a); \ +} \ +double \ +dulfoo##__a (unsigned long long x) \ +{ \ + return ((double) x)/(1lu << __a); \ +} + +FUNC_DEFS (4) + /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#4" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#4" 1 } } */ + /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#4" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#4" 1 } } */ + +FUNC_DEFD (4) + /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#4" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#4" 1 } } */ + /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#4" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#4" 1 } } */ + +FUNC_DEFS (8) + /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#8" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#8" 1 } } */ + /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#8" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#8" 1 } } */ + +FUNC_DEFD (8) + /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#8" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#8" 1 } } */ + /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#8" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#8" 1 } } */ + +FUNC_DEFS (16) + /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#16" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#16" 1 } } */ + /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#16" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#16" 1 } } */ + +FUNC_DEFD (16) + /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#16" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#16" 1 } } */ + /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#16" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#16" 1 } } */ + +FUNC_DEFS (32) + /* { dg-final { scan-assembler-times "scvtf\ts\[0-9\], w\[0-9\]*.*#32" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], w\[0-9\]*.*#32" 1 } } */ + /* { dg-final { 
scan-assembler-times "scvtf\ts\[0-9\], x\[0-9\]*.*#32" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\ts\[0-9\], x\[0-9\]*.*#32" 1 } } */ + +FUNC_DEFD (32) + /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], w\[0-9\]*.*#32" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], w\[0-9\]*.*#32" 1 } } */ + /* { dg-final { scan-assembler-times "scvtf\td\[0-9\], x\[0-9\]*.*#32" 1 } } */ + /* { dg-final { scan-assembler-times "ucvtf\td\[0-9\], x\[0-9\]*.*#32" 1 } } */ + +#define FUNC_TESTS(__a, __b) \ +do \ +{ \ + if (fsfoo##__a (__b) != ((int) i) * (1.0f/(1lu << __a)) ) \ + __builtin_abort (); \ + if (fusfoo##__a (__b) != ((int) i) * (1.0f/(1lu << __a)) ) \ + __builtin_abort (); \ + if (fslfoo##__a (__b) != ((int) i) * (1.0f/(1lu << __a)) ) \ + __builtin_abort (); \ + if (fulfoo##__a (__b) != ((int) i) * (1.0f/(1lu << __a)) ) \ + __builtin_abort (); \ +} while (0) + +#define FUNC_TESTD(__a, __b) \ +do \ +{ \ + if (dsfoo##__a (__b) != ((int) i) * (1.0d/(1lu << __a)) ) \ + __builtin_abort (); \ + if (dusfoo##__a (__b) != ((int) i) * (1.0d/(1lu << __a)) ) \ + __builtin_abort (); \ + if (dslfoo##__a (__b) != ((int) i) * (1.0d/(1lu << __a)) ) \ + __builtin_abort (); \ + if (dulfoo##__a (__b) != ((int) i) * (1.0d/(1lu << __a)) ) \ + __builtin_abort (); \ +} while (0) + +int +main (void) +{ + int i; + + for (i = 0; i < 32; i ++) + { + FUNC_TESTS (4, i); + FUNC_TESTS (8, i); + FUNC_TESTS (16, i); + FUNC_TESTS (32, i); + + FUNC_TESTD (4, i); + FUNC_TESTD (8, i); + FUNC_TESTD (16, i); + FUNC_TESTD (32, i); + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/pr88834.c b/gcc/testsuite/gcc.target/aarch64/pr88834.c new file mode 100644 index 000000000..ea00967ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr88834.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-S -O3 -march=armv8.2-a+sve" } */ + +void +f (int *restrict x, int *restrict y, int *restrict z, int n) +{ + for (int i = 0; i < n; i += 2) + { + x[i] = y[i] + z[i]; + x[i + 1] = y[i + 1] - z[i + 1]; + } +} + +/* { dg-final { scan-assembler-times {\tld2w\t{z[0-9]+.s - z[0-9]+.s}, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tst2w\t{z[0-9]+.s - z[0-9]+.s}, p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c new file mode 100644 index 000000000..fa2267598 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c @@ -0,0 +1,215 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#pragma GCC push_options +#pragma GCC target ("arch=armv8-a") + +#pragma GCC push_options +#pragma GCC target ("arch=armv8-a+tme") +#ifndef __ARM_FEATURE_TME +#error "__ARM_FEATURE_TME is not defined but should be!" +#endif + +#pragma GCC pop_options + +#ifdef __ARM_FEATURE_TME +#error "__ARM_FEATURE_TME is defined but should not be!" +#endif + +/* Test Armv8.6-A features. */ + +#ifdef __ARM_FEATURE_MATMUL_INT8 +#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 +#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 +#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 +#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" 
+#endif + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.6-a") +#ifndef __ARM_FEATURE_MATMUL_INT8 +#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!" +#endif +#ifdef __ARM_FEATURE_SVE +#error "__ARM_FEATURE_SVE is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 +#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 +#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 +#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" +#endif +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.6-a+sve") +#ifndef __ARM_FEATURE_MATMUL_INT8 +#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!" +#endif +#ifndef __ARM_FEATURE_SVE +#error "__ARM_FEATURE_SVE is not defined but should be!" +#endif +#ifndef __ARM_FEATURE_SVE_MATMUL_INT8 +#error "__ARM_FEATURE_SVE_MATMUL_INT8 is not defined but should be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 +#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 +#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" +#endif +#pragma GCC pop_pragma + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+i8mm") +#ifndef __ARM_FEATURE_MATMUL_INT8 +#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!" +#endif +#ifdef __ARM_FEATURE_SVE +#error "__ARM_FEATURE_SVE is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 +#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" +#endif +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+i8mm+sve") +#ifndef __ARM_FEATURE_MATMUL_INT8 +#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!" +#endif +#ifndef __ARM_FEATURE_SVE +#error "__ARM_FEATURE_SVE is not defined but should be!" +#endif +#ifndef __ARM_FEATURE_SVE_MATMUL_INT8 +#error "__ARM_FEATURE_SVE_MATMUL_INT8 is not defined but should be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 +#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 +#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" +#endif +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+f32mm") +#ifndef __ARM_FEATURE_SVE +#error "__ARM_FEATURE_SVE is not defined but should be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 +#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" +#endif +#ifndef __ARM_FEATURE_SVE_MATMUL_FP32 +#error "__ARM_FEATURE_SVE_MATMUL_FP32 is not defined but should be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 +#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" +#endif +#pragma GCC pop_pragma + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+f64mm") +#ifndef __ARM_FEATURE_SVE +#error "__ARM_FEATURE_SVE is not defined but should be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_INT8 +#error "__ARM_FEATURE_SVE_MATMUL_INT8 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 +#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" +#endif +#ifndef __ARM_FEATURE_SVE_MATMUL_FP64 +#error "__ARM_FEATURE_SVE_MATMUL_FP64 is not defined but should be!" 
+#endif +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.6-a+nosimd") +#ifdef __ARM_FEATURE_MATMUL_INT8 +#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 +#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 +#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" +#endif +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.6-a+nofp") +#ifdef __ARM_FEATURE_MATMUL_INT8 +#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP32 +#error "__ARM_FEATURE_SVE_MATMUL_FP32 is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_SVE_MATMUL_FP64 +#error "__ARM_FEATURE_SVE_MATMUL_FP64 is defined but should not be!" +#endif +#pragma GCC pop_options + +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!" +#endif + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.6-a") +#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!" +#endif +#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!" +#endif +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") +#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!" +#endif +#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!" +#endif +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16+nosimd") +#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!" +#endif +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!" +#endif +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.6-a+nofp") +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!" +#endif +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!" 
+#endif +#pragma GCC pop_options + +#pragma GCC pop_options + +int +foo (int a) +{ + return a; +} diff --git a/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c b/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c new file mode 100644 index 000000000..2587bfedd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O3 --save-temps" } */ + +extern void abort (); + +#define N 8 +float in[N] = {1.0, -1.0, -2.0, 3.0, -5.0, -8.0, 13.0, 21.0}; +int out[N]; + +void +foo (int *i, float *f) +{ + i[0] = __builtin_signbit (f[0]); + i[1] = __builtin_signbit (f[1]); +} + +/* { dg-final { scan-assembler-not {-2147483648} } } */ +/* { dg-final { scan-assembler {\tushr\tv[0-9]+.2s, v[0-9]+.2s, 31} } } */ + +int +main () +{ + int i; + + foo (out, in); + foo (out + 2, in + 2); + foo (out + 4, in + 4); + foo (out + 6, in + 6); + + for (i = 0; i < N; i++) + { + if (in[i] >= 0.0 && out[i]) + abort (); + if (in[i] < 0.0 && !out[i]) + abort (); + } + + return 0; +} + diff --git a/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c new file mode 100644 index 000000000..18cffdc7d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O3 --save-temps" } */ + +extern void abort (); + +#define N 1024 +float in[N] = {1.0, -1.0, -2.0, 3.0, -5.0, -8.0, 13.0, 21.0}; +int out[N]; + +void +foo () +{ + int i; + for (i = 0; i < N; i++) + out[i] = __builtin_signbit (in[i]); +} + +/* { dg-final { scan-assembler-not {-2147483648} } } */ +/* { dg-final { scan-assembler {\tushr\tv[0-9]+.4s, v[0-9]+.4s, 31} } } */ + +int +main () +{ + int i; + + foo (); + + for (i = 0; i < N; i++) + { + if (in[i] >= 0.0 && out[i]) + abort (); + if (in[i] < 0.0 && !out[i]) + abort (); + } + + return 0; +} + diff --git a/gcc/testsuite/gcc.target/aarch64/simd/ssra.c b/gcc/testsuite/gcc.target/aarch64/simd/ssra.c new file mode 100644 index 000000000..e9c2e04c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/ssra.c @@ -0,0 +1,36 @@ +/* { dg-do compile { target aarch64*-*-* } } */ +/* { dg-options "-O3" } */ +/* { dg-skip-if "" { *-*-* } {"*sve*"} {""} } */ + +#include + +#define SSRA(func, vtype, n) \ + void func () \ + { \ + int i; \ + for (i = 0; i < n; i++) \ + { \ + s1##vtype[i] += s2##vtype[i] >> 2; \ + } \ + } + +#define TEST_VDQ_I_MODES(FUNC) \ + FUNC (test_v8qi_v16qi, _char, 16) \ + FUNC (test_v4hi_v8h1, _short, 8) \ + FUNC (test_v2si_v4si, _int, 4) \ + FUNC (test_v2di, _ll, 2) \ + +int8_t s1_char[16], s2_char[16]; +int16_t s1_short[8], s2_short[8]; +int32_t s1_int[4], s2_int[4]; +int64_t s1_ll[2], s2_ll[2]; + +TEST_VDQ_I_MODES(SSRA) + +/* { dg-final { scan-assembler "ssra" } } */ +/* { dg-final { scan-assembler-not "sshr" } } */ + +/* { dg-final { scan-assembler-times {ssra\tv[0-9]+\.16b, v[0-9]+\.16b, [0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {ssra\tv[0-9]+\.8h, v[0-9]+\.8h, [0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {ssra\tv[0-9]+\.4s, v[0-9]+\.4s, [0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {ssra\tv[0-9]+\.2d, v[0-9]+\.2d, [0-9]+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/usra.c b/gcc/testsuite/gcc.target/aarch64/simd/usra.c new file mode 100644 index 000000000..4e7446dfa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/usra.c @@ -0,0 +1,36 @@ +/* { dg-do compile { target aarch64*-*-* } } */ +/* { dg-options "-O3" } */ +/* { dg-skip-if "" { *-*-* } {"*sve*"} {""} } */ + 
+#include + +#define USRA(func, vtype, n) \ + void func () \ + { \ + int i; \ + for (i = 0; i < n; i++) \ + { \ + u1##vtype[i] += u2##vtype[i] >> 2; \ + } \ + } + +#define TEST_VDQ_I_MODES(FUNC) \ + FUNC (test_v8qi_v16qi, _char, 16) \ + FUNC (test_v4hi_v8h1, _short, 8) \ + FUNC (test_v2si_v4si, _int, 4) \ + FUNC (test_v2di, _ll, 2) \ + +uint8_t u1_char[16], u2_char[16]; +uint16_t u1_short[8], u2_short[8]; +uint32_t u1_int[4], u2_int[4]; +uint64_t u1_ll[2], u2_ll[2]; + +TEST_VDQ_I_MODES(USRA) + +/* { dg-final { scan-assembler "usra" } } */ +/* { dg-final { scan-assembler-not "ushr" } } */ + +/* { dg-final { scan-assembler-times {usra\tv[0-9]+\.16b, v[0-9]+\.16b, [0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {usra\tv[0-9]+\.8h, v[0-9]+\.8h, [0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {usra\tv[0-9]+\.4s, v[0-9]+\.4s, [0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {usra\tv[0-9]+\.2d, v[0-9]+\.2d, [0-9]+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c new file mode 100644 index 000000000..5eec2b5cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vmmla.c @@ -0,0 +1,27 @@ +/* { dg-do assemble} */ +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+i8mm" } */ + +#include "arm_neon.h" + +int32x4_t +test_vmmlaq_s32 (int32x4_t r, int8x16_t a, int8x16_t b) +{ + return vmmlaq_s32 (r, a, b); +} + +uint32x4_t +test_vmmlaq_u32 (uint32x4_t r, uint8x16_t a, uint8x16_t b) +{ + return vmmlaq_u32 (r, a, b); +} + +int32x4_t +test_vusmmlaq_s32 (int32x4_t r, uint8x16_t a, int8x16_t b) +{ + return vusmmlaq_s32 (r, a, b); +} + +/* { dg-final { scan-assembler-times {\tsmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */ +/* { dg-final { scan-assembler-times {\tummla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */ +/* { dg-final { scan-assembler-times {\tusmmla\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vrndnzx_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vrndnzx_1.c new file mode 100644 index 000000000..0399b838d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vrndnzx_1.c @@ -0,0 +1,137 @@ +/* Test the vrnd[32,64][z,x] intrinsics. 
*/ + +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv8.5-a" } */ + +#include "arm_neon.h" + +#ifdef __ARM_FEATURE_FRINT + +float32x2_t +foo_32z (float32x2_t a) +{ + return vrnd32z_f32 (a); +} + +/* { dg-final { scan-assembler-times "frint32z\tv\[0-9\]+\.2s, v\[0-9\]+\.2s\n" 1 } } */ + +float32x4_t +foo_32z_q (float32x4_t a) +{ + return vrnd32zq_f32 (a); +} + +/* { dg-final { scan-assembler-times "frint32z\tv\[0-9\]+\.4s, v\[0-9\]+\.4s\n" 1 } } */ + +float64x1_t +foo_32z_f64 (float64x1_t a) +{ + return vrnd32z_f64 (a); +} + +/* { dg-final { scan-assembler-times "frint32z\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ + +float64x2_t +foo_32z_q_f64 (float64x2_t a) +{ + return vrnd32zq_f64 (a); +} + +/* { dg-final { scan-assembler-times "frint32z\tv\[0-9\]+\.2d, v\[0-9\]+\.2d\n" 1 } } */ + +float32x2_t +foo_32x (float32x2_t a) +{ + return vrnd32x_f32 (a); +} + +/* { dg-final { scan-assembler-times "frint32x\tv\[0-9\]+\.2s, v\[0-9\]+\.2s\n" 1 } } */ + +float32x4_t +foo_32x_q (float32x4_t a) +{ + return vrnd32xq_f32 (a); +} + +/* { dg-final { scan-assembler-times "frint32x\tv\[0-9\]+\.4s, v\[0-9\]+\.4s\n" 1 } } */ + +float64x1_t +foo_32x_f64 (float64x1_t a) +{ + return vrnd32x_f64 (a); +} + +/* { dg-final { scan-assembler-times "frint32x\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ + +float64x2_t +foo_32x_q_f64 (float64x2_t a) +{ + return vrnd32xq_f64 (a); +} + +/* { dg-final { scan-assembler-times "frint32x\tv\[0-9\]+\.2d, v\[0-9\]+\.2d\n" 1 } } */ + +float32x2_t +foo_64z (float32x2_t a) +{ + return vrnd64z_f32 (a); +} + +/* { dg-final { scan-assembler-times "frint64z\tv\[0-9\]+\.2s, v\[0-9\]+\.2s\n" 1 } } */ + +float32x4_t +foo_64z_q (float32x4_t a) +{ + return vrnd64zq_f32 (a); +} + +/* { dg-final { scan-assembler-times "frint64z\tv\[0-9\]+\.4s, v\[0-9\]+\.4s\n" 1 } } */ + +float64x1_t +foo_64z_f64 (float64x1_t a) +{ + return vrnd64z_f64 (a); +} + +/* { dg-final { scan-assembler-times "frint64z\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ + +float64x2_t +foo_64z_q_f64 (float64x2_t a) +{ + return vrnd64zq_f64 (a); +} + +/* { dg-final { scan-assembler-times "frint64z\tv\[0-9\]+\.2d, v\[0-9\]+\.2d\n" 1 } } */ + +float32x2_t +foo_64x (float32x2_t a) +{ + return vrnd64x_f32 (a); +} + +/* { dg-final { scan-assembler-times "frint64x\tv\[0-9\]+\.2s, v\[0-9\]+\.2s\n" 1 } } */ + +float32x4_t +foo_64x_q (float32x4_t a) +{ + return vrnd64xq_f32 (a); +} + +/* { dg-final { scan-assembler-times "frint64x\tv\[0-9\]+\.4s, v\[0-9\]+\.4s\n" 1 } } */ + +float64x1_t +foo_64x_f64 (float64x1_t a) +{ + return vrnd64x_f64 (a); +} + +/* { dg-final { scan-assembler-times "frint64x\td\[0-9\]+, d\[0-9\]+\n" 1 } } */ + +float64x2_t +foo_64x_q_f64 (float64x2_t a) +{ + return vrnd64xq_f64 (a); +} + +/* { dg-final { scan-assembler-times "frint64x\tv\[0-9\]+\.2d, v\[0-9\]+\.2d\n" 1 } } */ +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c new file mode 100644 index 000000000..08b6831cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi-dotprod.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ +/* { dg-additional-options "-O3" } */ + +#pragma GCC target "+nosve" + +#define N 1024 + +signed char pix1[N], pix2[N]; + +int foo (void) +{ + int i_sum = 0; + int i; + + for (i = 0; i < N; i++) + i_sum += __builtin_abs (pix1[i] - pix2[i]); + + return i_sum; +} + +/* { dg-final { scan-assembler-not {\tsshll\t} } } */ +/* { dg-final { scan-assembler-not {\tsshll2\t} } 
} */ +/* { dg-final { scan-assembler-not {\tssubl\t} } } */ +/* { dg-final { scan-assembler-not {\tssubl2\t} } } */ +/* { dg-final { scan-assembler-not {\tabs\t} } } */ + +/* { dg-final { scan-assembler {\tsabd\t} } } */ +/* { dg-final { scan-assembler {\tudot\t} } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c index 40b288436..85a867a11 100644 --- a/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c +++ b/gcc/testsuite/gcc.target/aarch64/ssadv16qi.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-options "-O3" } */ -#pragma GCC target "+nosve" +#pragma GCC target "+nosve+nodotprod" #define N 1024 diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp new file mode 100644 index 000000000..7ce85a414 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle-asm.exp @@ -0,0 +1,79 @@ +# Assembly-based regression-test driver for the SVE ACLE +# Copyright (C) 2009-2019 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . */ + +# GCC testsuite that uses the `dg.exp' driver. + +# Exit immediately if this isn't an AArch64 target. +if {![istarget aarch64*-*-*] } { + return +} + +# Load support procs. +load_lib gcc-dg.exp + +# Initialize `dg'. +dg-init + +# Force SVE if we're not testing it already. +if { [check_effective_target_aarch64_sve] } { + set sve_flags "" +} else { + set sve_flags "-march=armv8.2-a+sve" +} + +global gcc_runtest_parallelize_limit_minor +if { [info exists gcc_runtest_parallelize_limit_minor] } { + set old_limit_minor $gcc_runtest_parallelize_limit_minor + set gcc_runtest_parallelize_limit_minor 1 +} + +torture-init +set-torture-options { + "-std=c90 -O0 -g" + "-std=c90 -O1 -g" + "-std=c99 -O2 -g" + "-std=c11 -O3 -g" + "-std=gnu90 -O2 -fno-schedule-insns -DCHECK_ASM --save-temps" + "-std=gnu99 -Ofast -g" + "-std=gnu11 -Os -g" +} { + "-DTEST_FULL" + "-DTEST_OVERLOADS" +} + +# Main loop. +set files [glob -nocomplain $srcdir/$subdir/asm/*.c] +set save-dg-do-what-default ${dg-do-what-default} +if { [check_effective_target_aarch64_asm_sve_ok] + && [check_effective_target_aarch64_variant_pcs] } { + set dg-do-what-default assemble +} else { + set dg-do-what-default compile +} +gcc-dg-runtest [lsort $files] "" "$sve_flags -fno-ipa-icf" +set dg-do-what-default ${save-dg-do-what-default} + +torture-finish + +if { [info exists gcc_runtest_parallelize_limit_minor] } { + set gcc_runtest_parallelize_limit_minor $old_limit_minor +} + +# All done. +dg-finish diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp new file mode 100644 index 000000000..34d9dfd43 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/aarch64-sve-acle.exp @@ -0,0 +1,54 @@ +# Specific regression driver for AArch64 SVE. 
+# Copyright (C) 2009-2019 Free Software Foundation, Inc. +# Contributed by ARM Ltd. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . */ + +# GCC testsuite that uses the `dg.exp' driver. + +# Exit immediately if this isn't an AArch64 target. +if {![istarget aarch64*-*-*] } { + return +} + +# Load support procs. +load_lib gcc-dg.exp + +# If a testcase doesn't have special options, use these. +global DEFAULT_CFLAGS +if ![info exists DEFAULT_CFLAGS] then { + set DEFAULT_CFLAGS " -ansi -pedantic-errors" +} + +# Initialize `dg'. +dg-init + +# Force SVE if we're not testing it already. +if { [check_effective_target_aarch64_sve] } { + set sve_flags "" +} else { + set sve_flags "-march=armv8.2-a+sve" +} + +# Main loop. +# FIXME: This should include general/*.c too, but leave that until the +# C frontend allows initialization of SVE vectors. +set files [glob -nocomplain $srcdir/$subdir/general-c/*.c] +dg-runtest [lsort $files] "$sve_flags" $DEFAULT_CFLAGS + +# All done. +dg-finish diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f16.c new file mode 100644 index 000000000..c019f248d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f16.c @@ -0,0 +1,552 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_f16_m_tied1: +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_f16_m_tied1, svfloat16_t, + z0 = svabd_f16_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fabd z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_f16_m_tied2, svfloat16_t, + z0 = svabd_f16_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_f16_m_untied: +** movprfx z0, z1 +** fabd z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (abd_f16_m_untied, svfloat16_t, + z0 = svabd_f16_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svabd_n_f16_m (p0, z0, d4), + z0 = svabd_m (p0, z0, d4)) + +/* +** abd_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svabd_n_f16_m (p0, z1, d4), + z0 = svabd_m (p0, z1, d4)) + +/* +** abd_1_f16_m_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f16_m_tied1, svfloat16_t, + z0 = svabd_n_f16_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f16_m_untied, svfloat16_t, + z0 = svabd_n_f16_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_f16_z_tied1, svfloat16_t, + z0 = svabd_f16_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_f16_z_tied2, svfloat16_t, + z0 = svabd_f16_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fabd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_f16_z_untied, svfloat16_t, + z0 = svabd_f16_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svabd_n_f16_z (p0, z0, d4), + z0 = svabd_z (p0, z0, d4)) + +/* +** abd_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (abd_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svabd_n_f16_z (p0, z1, d4), + z0 = svabd_z (p0, z1, d4)) + +/* +** abd_1_f16_z_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f16_z_tied1, svfloat16_t, + z0 = svabd_n_f16_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_f16_z_untied: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** ( +** movprfx z0\.h, p0/z, z1\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_f16_z_untied, svfloat16_t, + z0 = svabd_n_f16_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_0p5_f16_z_tied1: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** movprfx z0\.h, p0/z, z0\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f16_z_tied1, svfloat16_t, + z0 = svabd_n_f16_z (p0, z0, 0.5), + z0 = svabd_z (p0, z0, 0.5)) + +/* +** abd_0p5_f16_z_untied: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.h, p0/z, z1\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f16_z_untied, svfloat16_t, + z0 = svabd_n_f16_z (p0, z1, 0.5), + z0 = svabd_z (p0, z1, 0.5)) + +/* +** abd_m1_f16_z_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f16_z_tied1, svfloat16_t, + z0 = svabd_n_f16_z (p0, z0, -1), + z0 = svabd_z (p0, z0, -1)) + +/* +** abd_m1_f16_z_untied: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f16_z_untied, svfloat16_t, + z0 = svabd_n_f16_z (p0, z1, -1), + z0 = svabd_z (p0, z1, -1)) + +/* +** abd_m0p5_f16_z_tied1: +** fmov (z[0-9]+\.h), #-(?:0\.5|5\.0e-1) +** movprfx z0\.h, p0/z, z0\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f16_z_tied1, svfloat16_t, + z0 = svabd_n_f16_z (p0, z0, -0.5), + z0 = svabd_z (p0, z0, -0.5)) + +/* +** abd_m0p5_f16_z_untied: +** fmov (z[0-9]+\.h), #-(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.h, p0/z, z1\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f16_z_untied, svfloat16_t, + z0 = svabd_n_f16_z (p0, z1, -0.5), + z0 = svabd_z (p0, z1, -0.5)) + +/* +** abd_m2_f16_z: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m2_f16_z, svfloat16_t, + z0 = svabd_n_f16_z (p0, z0, -2), + z0 = svabd_z (p0, z0, -2)) + +/* +** abd_f16_x_tied1: +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_f16_x_tied1, svfloat16_t, + z0 = svabd_f16_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_f16_x_tied2: +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_f16_x_tied2, svfloat16_t, + z0 = svabd_f16_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_f16_x_untied: +** ( +** movprfx z0, z1 +** fabd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_f16_x_untied, svfloat16_t, + z0 = svabd_f16_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svabd_n_f16_x (p0, z0, d4), + z0 = svabd_x (p0, z0, d4)) + +/* +** abd_h4_f16_x_untied: +** mov z0\.h, h4 +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (abd_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svabd_n_f16_x (p0, z1, d4), + z0 = svabd_x (p0, z1, d4)) + +/* +** abd_1_f16_x_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_f16_x_untied: +** fmov z0\.h, #1\.0(?:e\+0)? +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_1_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) + +/* +** abd_0p5_f16_x_tied1: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (p0, z0, 0.5), + z0 = svabd_x (p0, z0, 0.5)) + +/* +** abd_0p5_f16_x_untied: +** fmov z0\.h, #(?:0\.5|5\.0e-1) +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (p0, z1, 0.5), + z0 = svabd_x (p0, z1, 0.5)) + +/* +** abd_m1_f16_x_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (p0, z0, -1), + z0 = svabd_x (p0, z0, -1)) + +/* +** abd_m1_f16_x_untied: +** fmov z0\.h, #-1\.0(?:e\+0)? 
+** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (p0, z1, -1), + z0 = svabd_x (p0, z1, -1)) + +/* +** abd_m0p5_f16_x_tied1: +** fmov (z[0-9]+\.h), #-(?:0\.5|5\.0e-1) +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (p0, z0, -0.5), + z0 = svabd_x (p0, z0, -0.5)) + +/* +** abd_m0p5_f16_x_untied: +** fmov z0\.h, #-(?:0\.5|5\.0e-1) +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (p0, z1, -0.5), + z0 = svabd_x (p0, z1, -0.5)) + +/* +** abd_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_2_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (p0, z0, 2), + z0 = svabd_x (p0, z0, 2)) + +/* +** abd_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_2_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (p0, z1, 2), + z0 = svabd_x (p0, z1, 2)) + +/* +** ptrue_abd_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f16_x_tied1, svfloat16_t, + z0 = svabd_f16_x (svptrue_b16 (), z0, z1), + z0 = svabd_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_abd_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f16_x_tied2, svfloat16_t, + z0 = svabd_f16_x (svptrue_b16 (), z1, z0), + z0 = svabd_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_abd_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f16_x_untied, svfloat16_t, + z0 = svabd_f16_x (svptrue_b16 (), z1, z2), + z0 = svabd_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_abd_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_1_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svabd_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_abd_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_1_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svabd_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_abd_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_0p5_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svabd_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_abd_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_0p5_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svabd_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_abd_m1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m1_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z0, -1), + z0 = svabd_x (svptrue_b16 (), z0, -1)) + +/* +** ptrue_abd_m1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m1_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z1, -1), + z0 = svabd_x (svptrue_b16 (), z1, -1)) + +/* +** ptrue_abd_m0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m0p5_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z0, -0.5), + z0 = svabd_x (svptrue_b16 (), z0, -0.5)) + +/* +** ptrue_abd_m0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m0p5_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z1, -0.5), + z0 = svabd_x (svptrue_b16 (), z1, -0.5)) + +/* +** ptrue_abd_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_2_f16_x_tied1, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svabd_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_abd_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_2_f16_x_untied, svfloat16_t, + z0 = svabd_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svabd_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f32.c new file mode 100644 index 000000000..bff37580c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f32.c @@ -0,0 +1,552 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_f32_m_tied1: +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_f32_m_tied1, svfloat32_t, + z0 = svabd_f32_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fabd z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_f32_m_tied2, svfloat32_t, + z0 = svabd_f32_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_f32_m_untied: +** movprfx z0, z1 +** fabd z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (abd_f32_m_untied, svfloat32_t, + z0 = svabd_f32_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_s4_f32_m_tied1, svfloat32_t, float, + z0 = svabd_n_f32_m (p0, z0, d4), + z0 = svabd_m (p0, z0, d4)) + +/* +** abd_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_s4_f32_m_untied, svfloat32_t, float, + z0 = svabd_n_f32_m (p0, z1, d4), + z0 = svabd_m (p0, z1, d4)) + +/* +** abd_1_f32_m_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f32_m_tied1, svfloat32_t, + z0 = svabd_n_f32_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f32_m_untied, svfloat32_t, + z0 = svabd_n_f32_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_f32_z_tied1, svfloat32_t, + z0 = svabd_f32_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_f32_z_tied2, svfloat32_t, + z0 = svabd_f32_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fabd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_f32_z_untied, svfloat32_t, + z0 = svabd_f32_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_s4_f32_z_tied1, svfloat32_t, float, + z0 = svabd_n_f32_z (p0, z0, d4), + z0 = svabd_z (p0, z0, d4)) + +/* +** abd_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (abd_s4_f32_z_untied, svfloat32_t, float, + z0 = svabd_n_f32_z (p0, z1, d4), + z0 = svabd_z (p0, z1, d4)) + +/* +** abd_1_f32_z_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f32_z_tied1, svfloat32_t, + z0 = svabd_n_f32_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_f32_z_untied: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** ( +** movprfx z0\.s, p0/z, z1\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_f32_z_untied, svfloat32_t, + z0 = svabd_n_f32_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_0p5_f32_z_tied1: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** movprfx z0\.s, p0/z, z0\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f32_z_tied1, svfloat32_t, + z0 = svabd_n_f32_z (p0, z0, 0.5), + z0 = svabd_z (p0, z0, 0.5)) + +/* +** abd_0p5_f32_z_untied: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.s, p0/z, z1\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f32_z_untied, svfloat32_t, + z0 = svabd_n_f32_z (p0, z1, 0.5), + z0 = svabd_z (p0, z1, 0.5)) + +/* +** abd_m1_f32_z_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f32_z_tied1, svfloat32_t, + z0 = svabd_n_f32_z (p0, z0, -1), + z0 = svabd_z (p0, z0, -1)) + +/* +** abd_m1_f32_z_untied: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f32_z_untied, svfloat32_t, + z0 = svabd_n_f32_z (p0, z1, -1), + z0 = svabd_z (p0, z1, -1)) + +/* +** abd_m0p5_f32_z_tied1: +** fmov (z[0-9]+\.s), #-(?:0\.5|5\.0e-1) +** movprfx z0\.s, p0/z, z0\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f32_z_tied1, svfloat32_t, + z0 = svabd_n_f32_z (p0, z0, -0.5), + z0 = svabd_z (p0, z0, -0.5)) + +/* +** abd_m0p5_f32_z_untied: +** fmov (z[0-9]+\.s), #-(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.s, p0/z, z1\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f32_z_untied, svfloat32_t, + z0 = svabd_n_f32_z (p0, z1, -0.5), + z0 = svabd_z (p0, z1, -0.5)) + +/* +** abd_m2_f32_z: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m2_f32_z, svfloat32_t, + z0 = svabd_n_f32_z (p0, z0, -2), + z0 = svabd_z (p0, z0, -2)) + +/* +** abd_f32_x_tied1: +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_f32_x_tied1, svfloat32_t, + z0 = svabd_f32_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_f32_x_tied2: +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_f32_x_tied2, svfloat32_t, + z0 = svabd_f32_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_f32_x_untied: +** ( +** movprfx z0, z1 +** fabd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_f32_x_untied, svfloat32_t, + z0 = svabd_f32_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_s4_f32_x_tied1, svfloat32_t, float, + z0 = svabd_n_f32_x (p0, z0, d4), + z0 = svabd_x (p0, z0, d4)) + +/* +** abd_s4_f32_x_untied: +** mov z0\.s, s4 +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (abd_s4_f32_x_untied, svfloat32_t, float, + z0 = svabd_n_f32_x (p0, z1, d4), + z0 = svabd_x (p0, z1, d4)) + +/* +** abd_1_f32_x_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_f32_x_untied: +** fmov z0\.s, #1\.0(?:e\+0)? +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_1_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) + +/* +** abd_0p5_f32_x_tied1: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (p0, z0, 0.5), + z0 = svabd_x (p0, z0, 0.5)) + +/* +** abd_0p5_f32_x_untied: +** fmov z0\.s, #(?:0\.5|5\.0e-1) +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (p0, z1, 0.5), + z0 = svabd_x (p0, z1, 0.5)) + +/* +** abd_m1_f32_x_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (p0, z0, -1), + z0 = svabd_x (p0, z0, -1)) + +/* +** abd_m1_f32_x_untied: +** fmov z0\.s, #-1\.0(?:e\+0)? 
+** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (p0, z1, -1), + z0 = svabd_x (p0, z1, -1)) + +/* +** abd_m0p5_f32_x_tied1: +** fmov (z[0-9]+\.s), #-(?:0\.5|5\.0e-1) +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (p0, z0, -0.5), + z0 = svabd_x (p0, z0, -0.5)) + +/* +** abd_m0p5_f32_x_untied: +** fmov z0\.s, #-(?:0\.5|5\.0e-1) +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (p0, z1, -0.5), + z0 = svabd_x (p0, z1, -0.5)) + +/* +** abd_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_2_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (p0, z0, 2), + z0 = svabd_x (p0, z0, 2)) + +/* +** abd_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_2_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (p0, z1, 2), + z0 = svabd_x (p0, z1, 2)) + +/* +** ptrue_abd_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f32_x_tied1, svfloat32_t, + z0 = svabd_f32_x (svptrue_b32 (), z0, z1), + z0 = svabd_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_abd_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f32_x_tied2, svfloat32_t, + z0 = svabd_f32_x (svptrue_b32 (), z1, z0), + z0 = svabd_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_abd_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f32_x_untied, svfloat32_t, + z0 = svabd_f32_x (svptrue_b32 (), z1, z2), + z0 = svabd_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_abd_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_1_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svabd_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_abd_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_1_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svabd_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_abd_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_0p5_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svabd_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_abd_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_0p5_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svabd_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_abd_m1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m1_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z0, -1), + z0 = svabd_x (svptrue_b32 (), z0, -1)) + +/* +** ptrue_abd_m1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m1_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z1, -1), + z0 = svabd_x (svptrue_b32 (), z1, -1)) + +/* +** ptrue_abd_m0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m0p5_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z0, -0.5), + z0 = svabd_x (svptrue_b32 (), z0, -0.5)) + +/* +** ptrue_abd_m0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m0p5_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z1, -0.5), + z0 = svabd_x (svptrue_b32 (), z1, -0.5)) + +/* +** ptrue_abd_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_2_f32_x_tied1, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svabd_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_abd_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_2_f32_x_untied, svfloat32_t, + z0 = svabd_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svabd_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f64.c new file mode 100644 index 000000000..c1e5f14e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_f64.c @@ -0,0 +1,552 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_f64_m_tied1: +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_f64_m_tied1, svfloat64_t, + z0 = svabd_f64_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_f64_m_tied2, svfloat64_t, + z0 = svabd_f64_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_f64_m_untied: +** movprfx z0, z1 +** fabd z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (abd_f64_m_untied, svfloat64_t, + z0 = svabd_f64_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_d4_f64_m_tied1, svfloat64_t, double, + z0 = svabd_n_f64_m (p0, z0, d4), + z0 = svabd_m (p0, z0, d4)) + +/* +** abd_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_d4_f64_m_untied, svfloat64_t, double, + z0 = svabd_n_f64_m (p0, z1, d4), + z0 = svabd_m (p0, z1, d4)) + +/* +** abd_1_f64_m_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f64_m_tied1, svfloat64_t, + z0 = svabd_n_f64_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f64_m_untied, svfloat64_t, + z0 = svabd_n_f64_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_f64_z_tied1, svfloat64_t, + z0 = svabd_f64_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_f64_z_tied2, svfloat64_t, + z0 = svabd_f64_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fabd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_f64_z_untied, svfloat64_t, + z0 = svabd_f64_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_d4_f64_z_tied1, svfloat64_t, double, + z0 = svabd_n_f64_z (p0, z0, d4), + z0 = svabd_z (p0, z0, d4)) + +/* +** abd_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (abd_d4_f64_z_untied, svfloat64_t, double, + z0 = svabd_n_f64_z (p0, z1, d4), + z0 = svabd_z (p0, z1, d4)) + +/* +** abd_1_f64_z_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f64_z_tied1, svfloat64_t, + z0 = svabd_n_f64_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_f64_z_untied: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** ( +** movprfx z0\.d, p0/z, z1\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_f64_z_untied, svfloat64_t, + z0 = svabd_n_f64_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_0p5_f64_z_tied1: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** movprfx z0\.d, p0/z, z0\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f64_z_tied1, svfloat64_t, + z0 = svabd_n_f64_z (p0, z0, 0.5), + z0 = svabd_z (p0, z0, 0.5)) + +/* +** abd_0p5_f64_z_untied: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.d, p0/z, z1\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f64_z_untied, svfloat64_t, + z0 = svabd_n_f64_z (p0, z1, 0.5), + z0 = svabd_z (p0, z1, 0.5)) + +/* +** abd_m1_f64_z_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f64_z_tied1, svfloat64_t, + z0 = svabd_n_f64_z (p0, z0, -1), + z0 = svabd_z (p0, z0, -1)) + +/* +** abd_m1_f64_z_untied: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f64_z_untied, svfloat64_t, + z0 = svabd_n_f64_z (p0, z1, -1), + z0 = svabd_z (p0, z1, -1)) + +/* +** abd_m0p5_f64_z_tied1: +** fmov (z[0-9]+\.d), #-(?:0\.5|5\.0e-1) +** movprfx z0\.d, p0/z, z0\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f64_z_tied1, svfloat64_t, + z0 = svabd_n_f64_z (p0, z0, -0.5), + z0 = svabd_z (p0, z0, -0.5)) + +/* +** abd_m0p5_f64_z_untied: +** fmov (z[0-9]+\.d), #-(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.d, p0/z, z1\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f64_z_untied, svfloat64_t, + z0 = svabd_n_f64_z (p0, z1, -0.5), + z0 = svabd_z (p0, z1, -0.5)) + +/* +** abd_m2_f64_z: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m2_f64_z, svfloat64_t, + z0 = svabd_n_f64_z (p0, z0, -2), + z0 = svabd_z (p0, z0, -2)) + +/* +** abd_f64_x_tied1: +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_f64_x_tied1, svfloat64_t, + z0 = svabd_f64_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_f64_x_tied2: +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_f64_x_tied2, svfloat64_t, + z0 = svabd_f64_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_f64_x_untied: +** ( +** movprfx z0, z1 +** fabd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_f64_x_untied, svfloat64_t, + z0 = svabd_f64_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (abd_d4_f64_x_tied1, svfloat64_t, double, + z0 = svabd_n_f64_x (p0, z0, d4), + z0 = svabd_x (p0, z0, d4)) + +/* +** abd_d4_f64_x_untied: +** mov z0\.d, d4 +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (abd_d4_f64_x_untied, svfloat64_t, double, + z0 = svabd_n_f64_x (p0, z1, d4), + z0 = svabd_x (p0, z1, d4)) + +/* +** abd_1_f64_x_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_f64_x_untied: +** fmov z0\.d, #1\.0(?:e\+0)? +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_1_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) + +/* +** abd_0p5_f64_x_tied1: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (p0, z0, 0.5), + z0 = svabd_x (p0, z0, 0.5)) + +/* +** abd_0p5_f64_x_untied: +** fmov z0\.d, #(?:0\.5|5\.0e-1) +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_0p5_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (p0, z1, 0.5), + z0 = svabd_x (p0, z1, 0.5)) + +/* +** abd_m1_f64_x_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (p0, z0, -1), + z0 = svabd_x (p0, z0, -1)) + +/* +** abd_m1_f64_x_untied: +** fmov z0\.d, #-1\.0(?:e\+0)? 
+** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_m1_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (p0, z1, -1), + z0 = svabd_x (p0, z1, -1)) + +/* +** abd_m0p5_f64_x_tied1: +** fmov (z[0-9]+\.d), #-(?:0\.5|5\.0e-1) +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (p0, z0, -0.5), + z0 = svabd_x (p0, z0, -0.5)) + +/* +** abd_m0p5_f64_x_untied: +** fmov z0\.d, #-(?:0\.5|5\.0e-1) +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_m0p5_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (p0, z1, -0.5), + z0 = svabd_x (p0, z1, -0.5)) + +/* +** abd_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_2_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (p0, z0, 2), + z0 = svabd_x (p0, z0, 2)) + +/* +** abd_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_2_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (p0, z1, 2), + z0 = svabd_x (p0, z1, 2)) + +/* +** ptrue_abd_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f64_x_tied1, svfloat64_t, + z0 = svabd_f64_x (svptrue_b64 (), z0, z1), + z0 = svabd_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_abd_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f64_x_tied2, svfloat64_t, + z0 = svabd_f64_x (svptrue_b64 (), z1, z0), + z0 = svabd_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_abd_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_f64_x_untied, svfloat64_t, + z0 = svabd_f64_x (svptrue_b64 (), z1, z2), + z0 = svabd_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_abd_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_1_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svabd_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_abd_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_1_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svabd_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_abd_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_0p5_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svabd_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_abd_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_0p5_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svabd_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_abd_m1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m1_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z0, -1), + z0 = svabd_x (svptrue_b64 (), z0, -1)) + +/* +** ptrue_abd_m1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m1_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z1, -1), + z0 = svabd_x (svptrue_b64 (), z1, -1)) + +/* +** ptrue_abd_m0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m0p5_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z0, -0.5), + z0 = svabd_x (svptrue_b64 (), z0, -0.5)) + +/* +** ptrue_abd_m0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_m0p5_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z1, -0.5), + z0 = svabd_x (svptrue_b64 (), z1, -0.5)) + +/* +** ptrue_abd_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_2_f64_x_tied1, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svabd_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_abd_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abd_2_f64_x_untied, svfloat64_t, + z0 = svabd_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svabd_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s16.c new file mode 100644 index 000000000..e2d0c0fb7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s16.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_s16_m_tied1: +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_s16_m_tied1, svint16_t, + z0 = svabd_s16_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sabd z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_s16_m_tied2, svint16_t, + z0 = svabd_s16_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_s16_m_untied: +** movprfx z0, z1 +** sabd z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (abd_s16_m_untied, svint16_t, + z0 = svabd_s16_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** sabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svabd_n_s16_m (p0, z0, x0), + z0 = svabd_m (p0, z0, x0)) + +/* +** abd_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** sabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s16_m_untied, svint16_t, int16_t, + z0 = svabd_n_s16_m (p0, z1, x0), + z0 = svabd_m (p0, z1, x0)) + +/* +** abd_1_s16_m_tied1: +** mov (z[0-9]+\.h), #1 +** sabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s16_m_tied1, svint16_t, + z0 = svabd_n_s16_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** sabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s16_m_untied, svint16_t, + z0 = svabd_n_s16_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_s16_z_tied1, svint16_t, + z0 = svabd_s16_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_s16_z_tied2, svint16_t, + z0 = svabd_s16_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** sabd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_s16_z_untied, svint16_t, + z0 = svabd_s16_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** sabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svabd_n_s16_z (p0, z0, x0), + z0 = svabd_z (p0, z0, x0)) + +/* +** abd_w0_s16_z_untied: +** mov 
(z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** sabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s16_z_untied, svint16_t, int16_t, + z0 = svabd_n_s16_z (p0, z1, x0), + z0 = svabd_z (p0, z1, x0)) + +/* +** abd_1_s16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** sabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s16_z_tied1, svint16_t, + z0 = svabd_n_s16_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_s16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** sabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_s16_z_untied, svint16_t, + z0 = svabd_n_s16_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_s16_x_tied1: +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_s16_x_tied1, svint16_t, + z0 = svabd_s16_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_s16_x_tied2: +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_s16_x_tied2, svint16_t, + z0 = svabd_s16_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_s16_x_untied: +** ( +** movprfx z0, z1 +** sabd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_s16_x_untied, svint16_t, + z0 = svabd_s16_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** sabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svabd_n_s16_x (p0, z0, x0), + z0 = svabd_x (p0, z0, x0)) + +/* +** abd_w0_s16_x_untied: +** mov z0\.h, w0 +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s16_x_untied, svint16_t, int16_t, + z0 = svabd_n_s16_x (p0, z1, x0), + z0 = svabd_x (p0, z1, x0)) + +/* +** abd_1_s16_x_tied1: +** mov (z[0-9]+\.h), #1 +** sabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s16_x_tied1, svint16_t, + z0 = svabd_n_s16_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_s16_x_untied: +** mov z0\.h, #1 +** sabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_1_s16_x_untied, svint16_t, + z0 = svabd_n_s16_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s32.c new file mode 100644 index 000000000..5c95ec04d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s32.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_s32_m_tied1: +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_s32_m_tied1, svint32_t, + z0 = svabd_s32_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sabd z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_s32_m_tied2, svint32_t, + z0 = svabd_s32_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_s32_m_untied: +** movprfx z0, z1 +** sabd z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (abd_s32_m_untied, svint32_t, + z0 = svabd_s32_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** sabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svabd_n_s32_m (p0, z0, x0), + z0 = svabd_m (p0, z0, x0)) + +/* +** 
abd_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** sabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s32_m_untied, svint32_t, int32_t, + z0 = svabd_n_s32_m (p0, z1, x0), + z0 = svabd_m (p0, z1, x0)) + +/* +** abd_1_s32_m_tied1: +** mov (z[0-9]+\.s), #1 +** sabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s32_m_tied1, svint32_t, + z0 = svabd_n_s32_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** sabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s32_m_untied, svint32_t, + z0 = svabd_n_s32_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_s32_z_tied1, svint32_t, + z0 = svabd_s32_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_s32_z_tied2, svint32_t, + z0 = svabd_s32_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** sabd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_s32_z_untied, svint32_t, + z0 = svabd_s32_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** sabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svabd_n_s32_z (p0, z0, x0), + z0 = svabd_z (p0, z0, x0)) + +/* +** abd_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** sabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s32_z_untied, svint32_t, int32_t, + z0 = svabd_n_s32_z (p0, z1, x0), + z0 = svabd_z (p0, z1, x0)) + +/* +** abd_1_s32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** sabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s32_z_tied1, svint32_t, + z0 = svabd_n_s32_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_s32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** sabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_s32_z_untied, svint32_t, + z0 = svabd_n_s32_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_s32_x_tied1: +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_s32_x_tied1, svint32_t, + z0 = svabd_s32_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_s32_x_tied2: +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_s32_x_tied2, svint32_t, + z0 = svabd_s32_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_s32_x_untied: +** ( +** movprfx z0, z1 +** sabd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_s32_x_untied, svint32_t, + z0 = svabd_s32_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** sabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svabd_n_s32_x (p0, z0, x0), + z0 = svabd_x (p0, z0, x0)) + +/* +** abd_w0_s32_x_untied: +** mov z0\.s, w0 +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s32_x_untied, svint32_t, 
int32_t, + z0 = svabd_n_s32_x (p0, z1, x0), + z0 = svabd_x (p0, z1, x0)) + +/* +** abd_1_s32_x_tied1: +** mov (z[0-9]+\.s), #1 +** sabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s32_x_tied1, svint32_t, + z0 = svabd_n_s32_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_s32_x_untied: +** mov z0\.s, #1 +** sabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_1_s32_x_untied, svint32_t, + z0 = svabd_n_s32_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s64.c new file mode 100644 index 000000000..2402ecf29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s64.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_s64_m_tied1: +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_s64_m_tied1, svint64_t, + z0 = svabd_s64_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_s64_m_tied2, svint64_t, + z0 = svabd_s64_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_s64_m_untied: +** movprfx z0, z1 +** sabd z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (abd_s64_m_untied, svint64_t, + z0 = svabd_s64_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svabd_n_s64_m (p0, z0, x0), + z0 = svabd_m (p0, z0, x0)) + +/* +** abd_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_s64_m_untied, svint64_t, int64_t, + z0 = svabd_n_s64_m (p0, z1, x0), + z0 = svabd_m (p0, z1, x0)) + +/* +** abd_1_s64_m_tied1: +** mov (z[0-9]+\.d), #1 +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s64_m_tied1, svint64_t, + z0 = svabd_n_s64_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s64_m_untied, svint64_t, + z0 = svabd_n_s64_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_s64_z_tied1, svint64_t, + z0 = svabd_s64_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_s64_z_tied2, svint64_t, + z0 = svabd_s64_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** sabd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_s64_z_untied, svint64_t, + z0 = svabd_s64_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svabd_n_s64_z (p0, z0, x0), + z0 = svabd_z (p0, z0, x0)) + +/* +** abd_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** sabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** 
ret +*/ +TEST_UNIFORM_ZX (abd_x0_s64_z_untied, svint64_t, int64_t, + z0 = svabd_n_s64_z (p0, z1, x0), + z0 = svabd_z (p0, z1, x0)) + +/* +** abd_1_s64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s64_z_tied1, svint64_t, + z0 = svabd_n_s64_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_s64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** sabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_s64_z_untied, svint64_t, + z0 = svabd_n_s64_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_s64_x_tied1: +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_s64_x_tied1, svint64_t, + z0 = svabd_s64_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_s64_x_tied2: +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_s64_x_tied2, svint64_t, + z0 = svabd_s64_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_s64_x_untied: +** ( +** movprfx z0, z1 +** sabd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_s64_x_untied, svint64_t, + z0 = svabd_s64_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svabd_n_s64_x (p0, z0, x0), + z0 = svabd_x (p0, z0, x0)) + +/* +** abd_x0_s64_x_untied: +** mov z0\.d, x0 +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_s64_x_untied, svint64_t, int64_t, + z0 = svabd_n_s64_x (p0, z1, x0), + z0 = svabd_x (p0, z1, x0)) + +/* +** abd_1_s64_x_tied1: +** mov (z[0-9]+\.d), #1 +** sabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s64_x_tied1, svint64_t, + z0 = svabd_n_s64_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_s64_x_untied: +** mov z0\.d, #1 +** sabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_1_s64_x_untied, svint64_t, + z0 = svabd_n_s64_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s8.c new file mode 100644 index 000000000..49a2cc388 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_s8.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_s8_m_tied1: +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_s8_m_tied1, svint8_t, + z0 = svabd_s8_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sabd z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_s8_m_tied2, svint8_t, + z0 = svabd_s8_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_s8_m_untied: +** movprfx z0, z1 +** sabd z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (abd_s8_m_untied, svint8_t, + z0 = svabd_s8_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** sabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svabd_n_s8_m (p0, z0, x0), + z0 = svabd_m (p0, z0, x0)) + +/* +** abd_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** sabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s8_m_untied, svint8_t, int8_t, + z0 = 
svabd_n_s8_m (p0, z1, x0), + z0 = svabd_m (p0, z1, x0)) + +/* +** abd_1_s8_m_tied1: +** mov (z[0-9]+\.b), #1 +** sabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s8_m_tied1, svint8_t, + z0 = svabd_n_s8_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** sabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s8_m_untied, svint8_t, + z0 = svabd_n_s8_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_s8_z_tied1, svint8_t, + z0 = svabd_s8_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_s8_z_tied2, svint8_t, + z0 = svabd_s8_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** sabd z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_s8_z_untied, svint8_t, + z0 = svabd_s8_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** sabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svabd_n_s8_z (p0, z0, x0), + z0 = svabd_z (p0, z0, x0)) + +/* +** abd_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** sabd z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s8_z_untied, svint8_t, int8_t, + z0 = svabd_n_s8_z (p0, z1, x0), + z0 = svabd_z (p0, z1, x0)) + +/* +** abd_1_s8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** sabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s8_z_tied1, svint8_t, + z0 = svabd_n_s8_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_s8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** sabd z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_s8_z_untied, svint8_t, + z0 = svabd_n_s8_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_s8_x_tied1: +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_s8_x_tied1, svint8_t, + z0 = svabd_s8_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_s8_x_tied2: +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_s8_x_tied2, svint8_t, + z0 = svabd_s8_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_s8_x_untied: +** ( +** movprfx z0, z1 +** sabd z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_s8_x_untied, svint8_t, + z0 = svabd_s8_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** sabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svabd_n_s8_x (p0, z0, x0), + z0 = svabd_x (p0, z0, x0)) + +/* +** abd_w0_s8_x_untied: +** mov z0\.b, w0 +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_s8_x_untied, svint8_t, int8_t, + z0 = svabd_n_s8_x (p0, z1, x0), + z0 = svabd_x (p0, z1, x0)) + +/* +** abd_1_s8_x_tied1: +** mov (z[0-9]+\.b), #1 +** sabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_s8_x_tied1, svint8_t, + z0 = svabd_n_s8_x 
(p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_s8_x_untied: +** mov z0\.b, #1 +** sabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_1_s8_x_untied, svint8_t, + z0 = svabd_n_s8_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u16.c new file mode 100644 index 000000000..60aa9429e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u16.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_u16_m_tied1: +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_u16_m_tied1, svuint16_t, + z0 = svabd_u16_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** uabd z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_u16_m_tied2, svuint16_t, + z0 = svabd_u16_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_u16_m_untied: +** movprfx z0, z1 +** uabd z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (abd_u16_m_untied, svuint16_t, + z0 = svabd_u16_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** uabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svabd_n_u16_m (p0, z0, x0), + z0 = svabd_m (p0, z0, x0)) + +/* +** abd_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** uabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svabd_n_u16_m (p0, z1, x0), + z0 = svabd_m (p0, z1, x0)) + +/* +** abd_1_u16_m_tied1: +** mov (z[0-9]+\.h), #1 +** uabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u16_m_tied1, svuint16_t, + z0 = svabd_n_u16_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** uabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u16_m_untied, svuint16_t, + z0 = svabd_n_u16_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_u16_z_tied1, svuint16_t, + z0 = svabd_u16_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_u16_z_tied2, svuint16_t, + z0 = svabd_u16_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** uabd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_u16_z_untied, svuint16_t, + z0 = svabd_u16_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** uabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svabd_n_u16_z (p0, z0, x0), + z0 = svabd_z (p0, z0, x0)) + +/* +** abd_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** uabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svabd_n_u16_z (p0, z1, x0), + z0 = svabd_z (p0, z1, x0)) + +/* +** abd_1_u16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, 
z0\.h +** uabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u16_z_tied1, svuint16_t, + z0 = svabd_n_u16_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_u16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** uabd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_u16_z_untied, svuint16_t, + z0 = svabd_n_u16_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_u16_x_tied1: +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_u16_x_tied1, svuint16_t, + z0 = svabd_u16_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_u16_x_tied2: +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_u16_x_tied2, svuint16_t, + z0 = svabd_u16_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_u16_x_untied: +** ( +** movprfx z0, z1 +** uabd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_u16_x_untied, svuint16_t, + z0 = svabd_u16_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** uabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svabd_n_u16_x (p0, z0, x0), + z0 = svabd_x (p0, z0, x0)) + +/* +** abd_w0_u16_x_untied: +** mov z0\.h, w0 +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svabd_n_u16_x (p0, z1, x0), + z0 = svabd_x (p0, z1, x0)) + +/* +** abd_1_u16_x_tied1: +** mov (z[0-9]+\.h), #1 +** uabd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u16_x_tied1, svuint16_t, + z0 = svabd_n_u16_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_u16_x_untied: +** mov z0\.h, #1 +** uabd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abd_1_u16_x_untied, svuint16_t, + z0 = svabd_n_u16_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u32.c new file mode 100644 index 000000000..bc2410783 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u32.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_u32_m_tied1: +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_u32_m_tied1, svuint32_t, + z0 = svabd_u32_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** uabd z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_u32_m_tied2, svuint32_t, + z0 = svabd_u32_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_u32_m_untied: +** movprfx z0, z1 +** uabd z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (abd_u32_m_untied, svuint32_t, + z0 = svabd_u32_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** uabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svabd_n_u32_m (p0, z0, x0), + z0 = svabd_m (p0, z0, x0)) + +/* +** abd_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** uabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svabd_n_u32_m (p0, z1, x0), + z0 = svabd_m (p0, z1, x0)) + +/* +** abd_1_u32_m_tied1: +** mov (z[0-9]+\.s), #1 +** uabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z 
(abd_1_u32_m_tied1, svuint32_t, + z0 = svabd_n_u32_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** uabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u32_m_untied, svuint32_t, + z0 = svabd_n_u32_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_u32_z_tied1, svuint32_t, + z0 = svabd_u32_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_u32_z_tied2, svuint32_t, + z0 = svabd_u32_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** uabd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_u32_z_untied, svuint32_t, + z0 = svabd_u32_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** uabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svabd_n_u32_z (p0, z0, x0), + z0 = svabd_z (p0, z0, x0)) + +/* +** abd_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** uabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svabd_n_u32_z (p0, z1, x0), + z0 = svabd_z (p0, z1, x0)) + +/* +** abd_1_u32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** uabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u32_z_tied1, svuint32_t, + z0 = svabd_n_u32_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_u32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** uabd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_u32_z_untied, svuint32_t, + z0 = svabd_n_u32_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_u32_x_tied1: +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_u32_x_tied1, svuint32_t, + z0 = svabd_u32_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_u32_x_tied2: +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_u32_x_tied2, svuint32_t, + z0 = svabd_u32_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_u32_x_untied: +** ( +** movprfx z0, z1 +** uabd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_u32_x_untied, svuint32_t, + z0 = svabd_u32_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** uabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svabd_n_u32_x (p0, z0, x0), + z0 = svabd_x (p0, z0, x0)) + +/* +** abd_w0_u32_x_untied: +** mov z0\.s, w0 +** uabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svabd_n_u32_x (p0, z1, x0), + z0 = svabd_x (p0, z1, x0)) + +/* +** abd_1_u32_x_tied1: +** mov (z[0-9]+\.s), #1 +** uabd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u32_x_tied1, svuint32_t, + z0 = svabd_n_u32_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_u32_x_untied: +** mov z0\.s, #1 
+** uabd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abd_1_u32_x_untied, svuint32_t, + z0 = svabd_n_u32_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u64.c new file mode 100644 index 000000000..d2cdaa06a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u64.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_u64_m_tied1: +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_u64_m_tied1, svuint64_t, + z0 = svabd_u64_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_u64_m_tied2, svuint64_t, + z0 = svabd_u64_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_u64_m_untied: +** movprfx z0, z1 +** uabd z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (abd_u64_m_untied, svuint64_t, + z0 = svabd_u64_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svabd_n_u64_m (p0, z0, x0), + z0 = svabd_m (p0, z0, x0)) + +/* +** abd_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svabd_n_u64_m (p0, z1, x0), + z0 = svabd_m (p0, z1, x0)) + +/* +** abd_1_u64_m_tied1: +** mov (z[0-9]+\.d), #1 +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u64_m_tied1, svuint64_t, + z0 = svabd_n_u64_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u64_m_untied, svuint64_t, + z0 = svabd_n_u64_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_u64_z_tied1, svuint64_t, + z0 = svabd_u64_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_u64_z_tied2, svuint64_t, + z0 = svabd_u64_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** uabd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_u64_z_untied, svuint64_t, + z0 = svabd_u64_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svabd_n_u64_z (p0, z0, x0), + z0 = svabd_z (p0, z0, x0)) + +/* +** abd_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** uabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svabd_n_u64_z (p0, z1, x0), + z0 = svabd_z (p0, z1, x0)) + +/* +** abd_1_u64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u64_z_tied1, svuint64_t, + z0 
= svabd_n_u64_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_u64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** uabd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_u64_z_untied, svuint64_t, + z0 = svabd_n_u64_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_u64_x_tied1: +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_u64_x_tied1, svuint64_t, + z0 = svabd_u64_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_u64_x_tied2: +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_u64_x_tied2, svuint64_t, + z0 = svabd_u64_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_u64_x_untied: +** ( +** movprfx z0, z1 +** uabd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_u64_x_untied, svuint64_t, + z0 = svabd_u64_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svabd_n_u64_x (p0, z0, x0), + z0 = svabd_x (p0, z0, x0)) + +/* +** abd_x0_u64_x_untied: +** mov z0\.d, x0 +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (abd_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svabd_n_u64_x (p0, z1, x0), + z0 = svabd_x (p0, z1, x0)) + +/* +** abd_1_u64_x_tied1: +** mov (z[0-9]+\.d), #1 +** uabd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u64_x_tied1, svuint64_t, + z0 = svabd_n_u64_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_u64_x_untied: +** mov z0\.d, #1 +** uabd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abd_1_u64_x_untied, svuint64_t, + z0 = svabd_n_u64_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u8.c new file mode 100644 index 000000000..454ef153c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abd_u8.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abd_u8_m_tied1: +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_u8_m_tied1, svuint8_t, + z0 = svabd_u8_m (p0, z0, z1), + z0 = svabd_m (p0, z0, z1)) + +/* +** abd_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** uabd z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_u8_m_tied2, svuint8_t, + z0 = svabd_u8_m (p0, z1, z0), + z0 = svabd_m (p0, z1, z0)) + +/* +** abd_u8_m_untied: +** movprfx z0, z1 +** uabd z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (abd_u8_m_untied, svuint8_t, + z0 = svabd_u8_m (p0, z1, z2), + z0 = svabd_m (p0, z1, z2)) + +/* +** abd_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** uabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svabd_n_u8_m (p0, z0, x0), + z0 = svabd_m (p0, z0, x0)) + +/* +** abd_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** uabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svabd_n_u8_m (p0, z1, x0), + z0 = svabd_m (p0, z1, x0)) + +/* +** abd_1_u8_m_tied1: +** mov (z[0-9]+\.b), #1 +** uabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u8_m_tied1, svuint8_t, + z0 = svabd_n_u8_m (p0, z0, 1), + z0 = svabd_m (p0, z0, 1)) + +/* +** abd_1_u8_m_untied: { 
xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** uabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u8_m_untied, svuint8_t, + z0 = svabd_n_u8_m (p0, z1, 1), + z0 = svabd_m (p0, z1, 1)) + +/* +** abd_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_u8_z_tied1, svuint8_t, + z0 = svabd_u8_z (p0, z0, z1), + z0 = svabd_z (p0, z0, z1)) + +/* +** abd_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_u8_z_tied2, svuint8_t, + z0 = svabd_u8_z (p0, z1, z0), + z0 = svabd_z (p0, z1, z0)) + +/* +** abd_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** uabd z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_u8_z_untied, svuint8_t, + z0 = svabd_u8_z (p0, z1, z2), + z0 = svabd_z (p0, z1, z2)) + +/* +** abd_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** uabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svabd_n_u8_z (p0, z0, x0), + z0 = svabd_z (p0, z0, x0)) + +/* +** abd_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** uabd z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svabd_n_u8_z (p0, z1, x0), + z0 = svabd_z (p0, z1, x0)) + +/* +** abd_1_u8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** uabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u8_z_tied1, svuint8_t, + z0 = svabd_n_u8_z (p0, z0, 1), + z0 = svabd_z (p0, z0, 1)) + +/* +** abd_1_u8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** uabd z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_1_u8_z_untied, svuint8_t, + z0 = svabd_n_u8_z (p0, z1, 1), + z0 = svabd_z (p0, z1, 1)) + +/* +** abd_u8_x_tied1: +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_u8_x_tied1, svuint8_t, + z0 = svabd_u8_x (p0, z0, z1), + z0 = svabd_x (p0, z0, z1)) + +/* +** abd_u8_x_tied2: +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_u8_x_tied2, svuint8_t, + z0 = svabd_u8_x (p0, z1, z0), + z0 = svabd_x (p0, z1, z0)) + +/* +** abd_u8_x_untied: +** ( +** movprfx z0, z1 +** uabd z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (abd_u8_x_untied, svuint8_t, + z0 = svabd_u8_x (p0, z1, z2), + z0 = svabd_x (p0, z1, z2)) + +/* +** abd_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** uabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svabd_n_u8_x (p0, z0, x0), + z0 = svabd_x (p0, z0, x0)) + +/* +** abd_w0_u8_x_untied: +** mov z0\.b, w0 +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (abd_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svabd_n_u8_x (p0, z1, x0), + z0 = svabd_x (p0, z1, x0)) + +/* +** abd_1_u8_x_tied1: +** mov (z[0-9]+\.b), #1 +** uabd z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (abd_1_u8_x_tied1, svuint8_t, + z0 = svabd_n_u8_x (p0, z0, 1), + z0 = svabd_x (p0, z0, 1)) + +/* +** abd_1_u8_x_untied: +** mov z0\.b, #1 +** uabd z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abd_1_u8_x_untied, svuint8_t, + z0 = svabd_n_u8_x (p0, z1, 1), + z0 = svabd_x (p0, z1, 1)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f16.c new file mode 100644 index 000000000..2aa8736e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abs_f16_m_tied12: +** fabs z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (abs_f16_m_tied12, svfloat16_t, + z0 = svabs_f16_m (z0, p0, z0), + z0 = svabs_m (z0, p0, z0)) + +/* +** abs_f16_m_tied1: +** fabs z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_f16_m_tied1, svfloat16_t, + z0 = svabs_f16_m (z0, p0, z1), + z0 = svabs_m (z0, p0, z1)) + +/* +** abs_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fabs z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_f16_m_tied2, svfloat16_t, + z0 = svabs_f16_m (z1, p0, z0), + z0 = svabs_m (z1, p0, z0)) + +/* +** abs_f16_m_untied: +** movprfx z0, z2 +** fabs z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_f16_m_untied, svfloat16_t, + z0 = svabs_f16_m (z2, p0, z1), + z0 = svabs_m (z2, p0, z1)) + +/* +** abs_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** fabs z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_f16_z_tied1, svfloat16_t, + z0 = svabs_f16_z (p0, z0), + z0 = svabs_z (p0, z0)) + +/* +** abs_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fabs z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_f16_z_untied, svfloat16_t, + z0 = svabs_f16_z (p0, z1), + z0 = svabs_z (p0, z1)) + +/* +** abs_f16_x_tied1: +** fabs z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (abs_f16_x_tied1, svfloat16_t, + z0 = svabs_f16_x (p0, z0), + z0 = svabs_x (p0, z0)) + +/* +** abs_f16_x_untied: +** fabs z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_f16_x_untied, svfloat16_t, + z0 = svabs_f16_x (p0, z1), + z0 = svabs_x (p0, z1)) + +/* +** ptrue_abs_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abs_f16_x_tied1, svfloat16_t, + z0 = svabs_f16_x (svptrue_b16 (), z0), + z0 = svabs_x (svptrue_b16 (), z0)) + +/* +** ptrue_abs_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_abs_f16_x_untied, svfloat16_t, + z0 = svabs_f16_x (svptrue_b16 (), z1), + z0 = svabs_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f32.c new file mode 100644 index 000000000..30286afc7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abs_f32_m_tied12: +** fabs z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (abs_f32_m_tied12, svfloat32_t, + z0 = svabs_f32_m (z0, p0, z0), + z0 = svabs_m (z0, p0, z0)) + +/* +** abs_f32_m_tied1: +** fabs z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_f32_m_tied1, svfloat32_t, + z0 = svabs_f32_m (z0, p0, z1), + z0 = svabs_m (z0, p0, z1)) + +/* +** abs_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fabs z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_f32_m_tied2, svfloat32_t, + z0 = svabs_f32_m (z1, p0, z0), + z0 = svabs_m (z1, p0, z0)) + +/* +** abs_f32_m_untied: +** movprfx z0, z2 +** fabs z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_f32_m_untied, svfloat32_t, + z0 = svabs_f32_m (z2, p0, z1), + z0 = svabs_m (z2, p0, z1)) + +/* +** abs_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fabs z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_f32_z_tied1, svfloat32_t, + z0 = svabs_f32_z (p0, z0), + z0 = svabs_z (p0, z0)) + +/* +** abs_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fabs z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_f32_z_untied, svfloat32_t, + z0 = svabs_f32_z (p0, z1), + z0 = svabs_z (p0, z1)) + +/* +** abs_f32_x_tied1: +** fabs z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (abs_f32_x_tied1, svfloat32_t, + z0 = svabs_f32_x (p0, z0), + z0 = svabs_x (p0, z0)) + +/* +** abs_f32_x_untied: +** fabs z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_f32_x_untied, svfloat32_t, + z0 = svabs_f32_x (p0, z1), + z0 = svabs_x (p0, z1)) + +/* +** ptrue_abs_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abs_f32_x_tied1, svfloat32_t, + z0 = svabs_f32_x (svptrue_b32 (), z0), + z0 = svabs_x (svptrue_b32 (), z0)) + +/* +** ptrue_abs_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_abs_f32_x_untied, svfloat32_t, + z0 = svabs_f32_x (svptrue_b32 (), z1), + z0 = svabs_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f64.c new file mode 100644 index 000000000..28ef9fbba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abs_f64_m_tied12: +** fabs z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (abs_f64_m_tied12, svfloat64_t, + z0 = svabs_f64_m (z0, p0, z0), + z0 = svabs_m (z0, p0, z0)) + +/* +** abs_f64_m_tied1: +** fabs z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abs_f64_m_tied1, svfloat64_t, + z0 = svabs_f64_m (z0, p0, z1), + z0 = svabs_m (z0, p0, z1)) + +/* +** abs_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fabs z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (abs_f64_m_tied2, svfloat64_t, + z0 = svabs_f64_m (z1, p0, z0), + z0 = svabs_m (z1, p0, z0)) + +/* +** abs_f64_m_untied: +** movprfx z0, z2 +** fabs z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abs_f64_m_untied, svfloat64_t, + z0 = svabs_f64_m (z2, p0, z1), + z0 = svabs_m (z2, p0, z1)) + +/* +** abs_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** fabs z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (abs_f64_z_tied1, svfloat64_t, + z0 = svabs_f64_z (p0, z0), + z0 = svabs_z (p0, z0)) + +/* +** abs_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fabs z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abs_f64_z_untied, svfloat64_t, + z0 = svabs_f64_z (p0, z1), + z0 = svabs_z (p0, z1)) + +/* +** abs_f64_x_tied1: +** fabs z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (abs_f64_x_tied1, svfloat64_t, + z0 = svabs_f64_x (p0, z0), + z0 = svabs_x (p0, z0)) + +/* +** abs_f64_x_untied: +** fabs z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abs_f64_x_untied, svfloat64_t, + z0 = svabs_f64_x (p0, z1), + z0 = svabs_x (p0, z1)) + +/* +** ptrue_abs_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_abs_f64_x_tied1, svfloat64_t, + z0 = svabs_f64_x (svptrue_b64 (), z0), + z0 = svabs_x (svptrue_b64 (), z0)) + +/* +** ptrue_abs_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_abs_f64_x_untied, svfloat64_t, + z0 = svabs_f64_x (svptrue_b64 (), z1), + z0 = svabs_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s16.c new file mode 100644 index 000000000..3b16a9c4f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abs_s16_m_tied12: +** abs z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (abs_s16_m_tied12, svint16_t, + z0 = svabs_s16_m (z0, p0, z0), + z0 = svabs_m (z0, p0, z0)) + +/* +** abs_s16_m_tied1: +** abs z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_s16_m_tied1, svint16_t, + z0 = svabs_s16_m (z0, p0, z1), + z0 = svabs_m (z0, p0, z1)) + +/* +** abs_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** abs z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_s16_m_tied2, svint16_t, + z0 = svabs_s16_m (z1, p0, z0), + z0 = svabs_m (z1, p0, z0)) + +/* +** abs_s16_m_untied: +** movprfx z0, z2 +** abs z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_s16_m_untied, svint16_t, + z0 = svabs_s16_m (z2, p0, z1), + z0 = svabs_m (z2, p0, z1)) + +/* +** abs_s16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** abs z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_s16_z_tied1, svint16_t, + z0 = svabs_s16_z (p0, z0), + z0 = svabs_z (p0, z0)) + +/* +** abs_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** abs z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_s16_z_untied, svint16_t, + z0 = svabs_s16_z (p0, z1), + z0 = svabs_z (p0, z1)) + +/* +** abs_s16_x_tied1: +** abs z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (abs_s16_x_tied1, svint16_t, + z0 = svabs_s16_x (p0, z0), + z0 = svabs_x (p0, z0)) + +/* +** abs_s16_x_untied: +** abs z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (abs_s16_x_untied, svint16_t, + z0 = svabs_s16_x (p0, z1), + z0 = svabs_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s32.c new file mode 100644 index 000000000..14bcbd50c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abs_s32_m_tied12: +** abs z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (abs_s32_m_tied12, svint32_t, + z0 = svabs_s32_m (z0, p0, z0), + z0 = svabs_m (z0, p0, z0)) + +/* +** abs_s32_m_tied1: +** abs z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_s32_m_tied1, svint32_t, + z0 = svabs_s32_m (z0, p0, z1), + z0 = svabs_m (z0, p0, z1)) + +/* +** abs_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** abs z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_s32_m_tied2, svint32_t, + z0 = svabs_s32_m (z1, p0, z0), + z0 = svabs_m (z1, p0, z0)) + +/* +** abs_s32_m_untied: +** movprfx z0, z2 +** abs z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_s32_m_untied, svint32_t, + z0 = svabs_s32_m (z2, p0, z1), + z0 = svabs_m (z2, p0, z1)) + +/* +** abs_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** abs z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_s32_z_tied1, svint32_t, + z0 = svabs_s32_z (p0, z0), + z0 = svabs_z (p0, z0)) + +/* +** abs_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** abs z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_s32_z_untied, svint32_t, + z0 = svabs_s32_z (p0, z1), + 
z0 = svabs_z (p0, z1)) + +/* +** abs_s32_x_tied1: +** abs z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (abs_s32_x_tied1, svint32_t, + z0 = svabs_s32_x (p0, z0), + z0 = svabs_x (p0, z0)) + +/* +** abs_s32_x_untied: +** abs z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (abs_s32_x_untied, svint32_t, + z0 = svabs_s32_x (p0, z1), + z0 = svabs_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s64.c new file mode 100644 index 000000000..c7b60ff48 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abs_s64_m_tied12: +** abs z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (abs_s64_m_tied12, svint64_t, + z0 = svabs_s64_m (z0, p0, z0), + z0 = svabs_m (z0, p0, z0)) + +/* +** abs_s64_m_tied1: +** abs z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abs_s64_m_tied1, svint64_t, + z0 = svabs_s64_m (z0, p0, z1), + z0 = svabs_m (z0, p0, z1)) + +/* +** abs_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** abs z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (abs_s64_m_tied2, svint64_t, + z0 = svabs_s64_m (z1, p0, z0), + z0 = svabs_m (z1, p0, z0)) + +/* +** abs_s64_m_untied: +** movprfx z0, z2 +** abs z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abs_s64_m_untied, svint64_t, + z0 = svabs_s64_m (z2, p0, z1), + z0 = svabs_m (z2, p0, z1)) + +/* +** abs_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** abs z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (abs_s64_z_tied1, svint64_t, + z0 = svabs_s64_z (p0, z0), + z0 = svabs_z (p0, z0)) + +/* +** abs_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** abs z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abs_s64_z_untied, svint64_t, + z0 = svabs_s64_z (p0, z1), + z0 = svabs_z (p0, z1)) + +/* +** abs_s64_x_tied1: +** abs z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (abs_s64_x_tied1, svint64_t, + z0 = svabs_s64_x (p0, z0), + z0 = svabs_x (p0, z0)) + +/* +** abs_s64_x_untied: +** abs z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (abs_s64_x_untied, svint64_t, + z0 = svabs_s64_x (p0, z1), + z0 = svabs_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s8.c new file mode 100644 index 000000000..0bc64c078 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/abs_s8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** abs_s8_m_tied12: +** abs z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (abs_s8_m_tied12, svint8_t, + z0 = svabs_s8_m (z0, p0, z0), + z0 = svabs_m (z0, p0, z0)) + +/* +** abs_s8_m_tied1: +** abs z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abs_s8_m_tied1, svint8_t, + z0 = svabs_s8_m (z0, p0, z1), + z0 = svabs_m (z0, p0, z1)) + +/* +** abs_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** abs z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (abs_s8_m_tied2, svint8_t, + z0 = svabs_s8_m (z1, p0, z0), + z0 = svabs_m (z1, p0, z0)) + +/* +** abs_s8_m_untied: +** movprfx z0, z2 +** abs z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abs_s8_m_untied, svint8_t, + z0 = svabs_s8_m (z2, p0, z1), + z0 = svabs_m (z2, p0, z1)) + +/* +** abs_s8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** abs z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (abs_s8_z_tied1, svint8_t, + z0 = svabs_s8_z (p0, z0), + z0 = svabs_z (p0, 
z0)) + +/* +** abs_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** abs z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abs_s8_z_untied, svint8_t, + z0 = svabs_s8_z (p0, z1), + z0 = svabs_z (p0, z1)) + +/* +** abs_s8_x_tied1: +** abs z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (abs_s8_x_tied1, svint8_t, + z0 = svabs_s8_x (p0, z0), + z0 = svabs_x (p0, z0)) + +/* +** abs_s8_x_untied: +** abs z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (abs_s8_x_untied, svint8_t, + z0 = svabs_s8_x (p0, z1), + z0 = svabs_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f16.c new file mode 100644 index 000000000..acef17309 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f16.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acge_f16_tied: +** ( +** facge p0\.h, p0/z, z0\.h, z1\.h +** | +** facle p0\.h, p0/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acge_f16_tied, svfloat16_t, + p0 = svacge_f16 (p0, z0, z1), + p0 = svacge (p0, z0, z1)) + +/* +** acge_f16_untied: +** ( +** facge p0\.h, p1/z, z0\.h, z1\.h +** | +** facle p0\.h, p1/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acge_f16_untied, svfloat16_t, + p0 = svacge_f16 (p1, z0, z1), + p0 = svacge (p1, z0, z1)) + +/* +** acge_h4_f16: +** mov (z[0-9]+\.h), h4 +** ( +** facge p0\.h, p1/z, z0\.h, \1 +** | +** facle p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_ZD (acge_h4_f16, svfloat16_t, float16_t, + p0 = svacge_n_f16 (p1, z0, d4), + p0 = svacge (p1, z0, d4)) + +/* +** acge_0_f16: +** mov (z[0-9]+\.h), #0 +** ( +** facge p0\.h, p1/z, z0\.h, \1 +** | +** facle p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acge_0_f16, svfloat16_t, + p0 = svacge_n_f16 (p1, z0, 0), + p0 = svacge (p1, z0, 0)) + +/* +** acge_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** ( +** facge p0\.h, p1/z, z0\.h, \1 +** | +** facle p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acge_1_f16, svfloat16_t, + p0 = svacge_n_f16 (p1, z0, 1), + p0 = svacge (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f32.c new file mode 100644 index 000000000..c3d195ab8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f32.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acge_f32_tied: +** ( +** facge p0\.s, p0/z, z0\.s, z1\.s +** | +** facle p0\.s, p0/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acge_f32_tied, svfloat32_t, + p0 = svacge_f32 (p0, z0, z1), + p0 = svacge (p0, z0, z1)) + +/* +** acge_f32_untied: +** ( +** facge p0\.s, p1/z, z0\.s, z1\.s +** | +** facle p0\.s, p1/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acge_f32_untied, svfloat32_t, + p0 = svacge_f32 (p1, z0, z1), + p0 = svacge (p1, z0, z1)) + +/* +** acge_s4_f32: +** mov (z[0-9]+\.s), s4 +** ( +** facge p0\.s, p1/z, z0\.s, \1 +** | +** facle p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_ZD (acge_s4_f32, svfloat32_t, float32_t, + p0 = svacge_n_f32 (p1, z0, d4), + p0 = svacge (p1, z0, d4)) + +/* +** acge_0_f32: +** mov (z[0-9]+\.s), #0 +** ( +** facge p0\.s, p1/z, z0\.s, \1 +** | +** facle p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acge_0_f32, svfloat32_t, + p0 = svacge_n_f32 (p1, z0, 0), + p0 = svacge (p1, z0, 0)) + +/* +** acge_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** ( +** facge p0\.s, p1/z, z0\.s, \1 +** | +** facle p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acge_1_f32, svfloat32_t, + p0 = svacge_n_f32 (p1, z0, 1), + p0 = svacge (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f64.c new file mode 100644 index 000000000..207ce93a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acge_f64.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acge_f64_tied: +** ( +** facge p0\.d, p0/z, z0\.d, z1\.d +** | +** facle p0\.d, p0/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acge_f64_tied, svfloat64_t, + p0 = svacge_f64 (p0, z0, z1), + p0 = svacge (p0, z0, z1)) + +/* +** acge_f64_untied: +** ( +** facge p0\.d, p1/z, z0\.d, z1\.d +** | +** facle p0\.d, p1/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acge_f64_untied, svfloat64_t, + p0 = svacge_f64 (p1, z0, z1), + p0 = svacge (p1, z0, z1)) + +/* +** acge_d4_f64: +** mov (z[0-9]+\.d), d4 +** ( +** facge p0\.d, p1/z, z0\.d, \1 +** | +** facle p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_ZD (acge_d4_f64, svfloat64_t, float64_t, + p0 = svacge_n_f64 (p1, z0, d4), + p0 = svacge (p1, z0, d4)) + +/* +** acge_0_f64: +** mov (z[0-9]+\.d), #0 +** ( +** facge p0\.d, p1/z, z0\.d, \1 +** | +** facle p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acge_0_f64, svfloat64_t, + p0 = svacge_n_f64 (p1, z0, 0), + p0 = svacge (p1, z0, 0)) + +/* +** acge_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** ( +** facge p0\.d, p1/z, z0\.d, \1 +** | +** facle p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acge_1_f64, svfloat64_t, + p0 = svacge_n_f64 (p1, z0, 1), + p0 = svacge (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f16.c new file mode 100644 index 000000000..53c63351c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f16.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acgt_f16_tied: +** ( +** facgt p0\.h, p0/z, z0\.h, z1\.h +** | +** faclt p0\.h, p0/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_f16_tied, svfloat16_t, + p0 = svacgt_f16 (p0, z0, z1), + p0 = svacgt (p0, z0, z1)) + +/* +** acgt_f16_untied: +** ( +** facgt p0\.h, p1/z, z0\.h, z1\.h +** | +** faclt p0\.h, p1/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_f16_untied, svfloat16_t, + p0 = svacgt_f16 (p1, z0, z1), + p0 = svacgt (p1, z0, z1)) + +/* +** acgt_h4_f16: +** mov (z[0-9]+\.h), h4 +** ( +** facgt p0\.h, p1/z, z0\.h, \1 +** | +** faclt p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_ZD (acgt_h4_f16, svfloat16_t, float16_t, + p0 = svacgt_n_f16 (p1, z0, d4), + p0 = svacgt (p1, z0, d4)) + +/* +** acgt_0_f16: +** mov (z[0-9]+\.h), #0 +** ( +** facgt p0\.h, p1/z, z0\.h, \1 +** | +** faclt p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_0_f16, svfloat16_t, + p0 = svacgt_n_f16 (p1, z0, 0), + p0 = svacgt (p1, z0, 0)) + +/* +** acgt_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** ( +** facgt p0\.h, p1/z, z0\.h, \1 +** | +** faclt p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_1_f16, svfloat16_t, + p0 = svacgt_n_f16 (p1, z0, 1), + p0 = svacgt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f32.c new file mode 100644 index 000000000..d71c84ea6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f32.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acgt_f32_tied: +** ( +** facgt p0\.s, p0/z, z0\.s, z1\.s +** | +** faclt p0\.s, p0/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_f32_tied, svfloat32_t, + p0 = svacgt_f32 (p0, z0, z1), + p0 = svacgt (p0, z0, z1)) + +/* +** acgt_f32_untied: +** ( +** facgt p0\.s, p1/z, z0\.s, z1\.s +** | +** faclt p0\.s, p1/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_f32_untied, svfloat32_t, + p0 = svacgt_f32 (p1, z0, z1), + p0 = svacgt (p1, z0, z1)) + +/* +** acgt_s4_f32: +** mov (z[0-9]+\.s), s4 +** ( +** facgt p0\.s, p1/z, z0\.s, \1 +** | +** faclt p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_ZD (acgt_s4_f32, svfloat32_t, float32_t, + p0 = svacgt_n_f32 (p1, z0, d4), + p0 = svacgt (p1, z0, d4)) + +/* +** acgt_0_f32: +** mov (z[0-9]+\.s), #0 +** ( +** facgt p0\.s, p1/z, z0\.s, \1 +** | +** faclt p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_0_f32, svfloat32_t, + p0 = svacgt_n_f32 (p1, z0, 0), + p0 = svacgt (p1, z0, 0)) + +/* +** acgt_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** ( +** facgt p0\.s, p1/z, z0\.s, \1 +** | +** faclt p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_1_f32, svfloat32_t, + p0 = svacgt_n_f32 (p1, z0, 1), + p0 = svacgt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f64.c new file mode 100644 index 000000000..15d549e18 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acgt_f64.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acgt_f64_tied: +** ( +** facgt p0\.d, p0/z, z0\.d, z1\.d +** | +** faclt p0\.d, p0/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_f64_tied, svfloat64_t, + p0 = svacgt_f64 (p0, z0, z1), + p0 = svacgt (p0, z0, z1)) + +/* +** acgt_f64_untied: +** ( +** facgt p0\.d, p1/z, z0\.d, z1\.d +** | +** faclt p0\.d, p1/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_f64_untied, svfloat64_t, + p0 = svacgt_f64 (p1, z0, z1), + p0 = svacgt (p1, z0, z1)) + +/* +** acgt_d4_f64: +** mov (z[0-9]+\.d), d4 +** ( +** facgt p0\.d, p1/z, z0\.d, \1 +** | +** faclt p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_ZD (acgt_d4_f64, svfloat64_t, float64_t, + p0 = svacgt_n_f64 (p1, z0, d4), + p0 = svacgt (p1, z0, d4)) + +/* +** acgt_0_f64: +** mov (z[0-9]+\.d), #0 +** ( +** facgt p0\.d, p1/z, z0\.d, \1 +** | +** faclt p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_0_f64, svfloat64_t, + p0 = svacgt_n_f64 (p1, z0, 0), + p0 = svacgt (p1, z0, 0)) + +/* +** acgt_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** ( +** facgt p0\.d, p1/z, z0\.d, \1 +** | +** faclt p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acgt_1_f64, svfloat64_t, + p0 = svacgt_n_f64 (p1, z0, 1), + p0 = svacgt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f16.c new file mode 100644 index 000000000..ed6721d57 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f16.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acle_f16_tied: +** ( +** facge p0\.h, p0/z, z1\.h, z0\.h +** | +** facle p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acle_f16_tied, svfloat16_t, + p0 = svacle_f16 (p0, z0, z1), + p0 = svacle (p0, z0, z1)) + +/* +** acle_f16_untied: +** ( +** facge p0\.h, p1/z, z1\.h, z0\.h +** | +** facle p0\.h, p1/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (acle_f16_untied, svfloat16_t, + p0 = svacle_f16 (p1, z0, z1), + p0 = svacle (p1, z0, z1)) + +/* +** acle_h4_f16: +** mov (z[0-9]+\.h), h4 +** ( +** facge p0\.h, p1/z, \1, z0\.h +** | +** facle p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (acle_h4_f16, svfloat16_t, float16_t, + p0 = svacle_n_f16 (p1, z0, d4), + p0 = svacle (p1, z0, d4)) + +/* +** acle_0_f16: +** mov (z[0-9]+\.h), #0 +** ( +** facge p0\.h, p1/z, \1, z0\.h +** | +** facle p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (acle_0_f16, svfloat16_t, + p0 = svacle_n_f16 (p1, z0, 0), + p0 = svacle (p1, z0, 0)) + +/* +** acle_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** ( +** facge p0\.h, p1/z, \1, z0\.h +** | +** facle p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (acle_1_f16, svfloat16_t, + p0 = svacle_n_f16 (p1, z0, 1), + p0 = svacle (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f32.c new file mode 100644 index 000000000..7fc9da701 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f32.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acle_f32_tied: +** ( +** facge p0\.s, p0/z, z1\.s, z0\.s +** | +** facle p0\.s, p0/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acle_f32_tied, svfloat32_t, + p0 = svacle_f32 (p0, z0, z1), + p0 = svacle (p0, z0, z1)) + +/* +** acle_f32_untied: +** ( +** facge p0\.s, p1/z, z1\.s, z0\.s +** | +** facle p0\.s, p1/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (acle_f32_untied, svfloat32_t, + p0 = svacle_f32 (p1, z0, z1), + p0 = svacle (p1, z0, z1)) + +/* +** acle_s4_f32: +** mov (z[0-9]+\.s), s4 +** ( +** facge p0\.s, p1/z, \1, z0\.s +** | +** facle p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (acle_s4_f32, svfloat32_t, float32_t, + p0 = svacle_n_f32 (p1, z0, d4), + p0 = svacle (p1, z0, d4)) + +/* +** acle_0_f32: +** mov (z[0-9]+\.s), #0 +** ( +** facge p0\.s, p1/z, \1, z0\.s +** | +** facle p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (acle_0_f32, svfloat32_t, + p0 = svacle_n_f32 (p1, z0, 0), + p0 = svacle (p1, z0, 0)) + +/* +** acle_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** ( +** facge p0\.s, p1/z, \1, z0\.s +** | +** facle p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (acle_1_f32, svfloat32_t, + p0 = svacle_n_f32 (p1, z0, 1), + p0 = svacle (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f64.c new file mode 100644 index 000000000..ecbb8e500 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/acle_f64.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** acle_f64_tied: +** ( +** facge p0\.d, p0/z, z1\.d, z0\.d +** | +** facle p0\.d, p0/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acle_f64_tied, svfloat64_t, + p0 = svacle_f64 (p0, z0, z1), + p0 = svacle (p0, z0, z1)) + +/* +** acle_f64_untied: +** ( +** facge p0\.d, p1/z, z1\.d, z0\.d +** | +** facle p0\.d, p1/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (acle_f64_untied, svfloat64_t, + p0 = svacle_f64 (p1, z0, z1), + p0 = svacle (p1, z0, z1)) + +/* +** acle_d4_f64: +** mov (z[0-9]+\.d), d4 +** ( +** facge p0\.d, p1/z, \1, z0\.d +** | +** facle p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (acle_d4_f64, svfloat64_t, float64_t, + p0 = svacle_n_f64 (p1, z0, d4), + p0 = svacle (p1, z0, d4)) + +/* +** acle_0_f64: +** mov (z[0-9]+\.d), #0 +** ( +** facge p0\.d, p1/z, \1, z0\.d +** | +** facle p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (acle_0_f64, svfloat64_t, + p0 = svacle_n_f64 (p1, z0, 0), + p0 = svacle (p1, z0, 0)) + +/* +** acle_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** ( +** facge p0\.d, p1/z, \1, z0\.d +** | +** facle p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (acle_1_f64, svfloat64_t, + p0 = svacle_n_f64 (p1, z0, 1), + p0 = svacle (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f16.c new file mode 100644 index 000000000..e5f5040c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f16.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** aclt_f16_tied: +** ( +** facgt p0\.h, p0/z, z1\.h, z0\.h +** | +** faclt p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_f16_tied, svfloat16_t, + p0 = svaclt_f16 (p0, z0, z1), + p0 = svaclt (p0, z0, z1)) + +/* +** aclt_f16_untied: +** ( +** facgt p0\.h, p1/z, z1\.h, z0\.h +** | +** faclt p0\.h, p1/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_f16_untied, svfloat16_t, + p0 = svaclt_f16 (p1, z0, z1), + p0 = svaclt (p1, z0, z1)) + +/* +** aclt_h4_f16: +** mov (z[0-9]+\.h), h4 +** ( +** facgt p0\.h, p1/z, \1, z0\.h +** | +** faclt p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (aclt_h4_f16, svfloat16_t, float16_t, + p0 = svaclt_n_f16 (p1, z0, d4), + p0 = svaclt (p1, z0, d4)) + +/* +** aclt_0_f16: +** mov (z[0-9]+\.h), #0 +** ( +** facgt p0\.h, p1/z, \1, z0\.h +** | +** faclt p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_0_f16, svfloat16_t, + p0 = svaclt_n_f16 (p1, z0, 0), + p0 = svaclt (p1, z0, 0)) + +/* +** aclt_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** ( +** facgt p0\.h, p1/z, \1, z0\.h +** | +** faclt p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_1_f16, svfloat16_t, + p0 = svaclt_n_f16 (p1, z0, 1), + p0 = svaclt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f32.c new file mode 100644 index 000000000..f40826445 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f32.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** aclt_f32_tied: +** ( +** facgt p0\.s, p0/z, z1\.s, z0\.s +** | +** faclt p0\.s, p0/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_f32_tied, svfloat32_t, + p0 = svaclt_f32 (p0, z0, z1), + p0 = svaclt (p0, z0, z1)) + +/* +** aclt_f32_untied: +** ( +** facgt p0\.s, p1/z, z1\.s, z0\.s +** | +** faclt p0\.s, p1/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_f32_untied, svfloat32_t, + p0 = svaclt_f32 (p1, z0, z1), + p0 = svaclt (p1, z0, z1)) + +/* +** aclt_s4_f32: +** mov (z[0-9]+\.s), s4 +** ( +** facgt p0\.s, p1/z, \1, z0\.s +** | +** faclt p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (aclt_s4_f32, svfloat32_t, float32_t, + p0 = svaclt_n_f32 (p1, z0, d4), + p0 = svaclt (p1, z0, d4)) + +/* +** aclt_0_f32: +** mov (z[0-9]+\.s), #0 +** ( +** facgt p0\.s, p1/z, \1, z0\.s +** | +** faclt p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_0_f32, svfloat32_t, + p0 = svaclt_n_f32 (p1, z0, 0), + p0 = svaclt (p1, z0, 0)) + +/* +** aclt_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** ( +** facgt p0\.s, p1/z, \1, z0\.s +** | +** faclt p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_1_f32, svfloat32_t, + p0 = svaclt_n_f32 (p1, z0, 1), + p0 = svaclt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f64.c new file mode 100644 index 000000000..0170b3307 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/aclt_f64.c @@ -0,0 +1,71 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** aclt_f64_tied: +** ( +** facgt p0\.d, p0/z, z1\.d, z0\.d +** | +** faclt p0\.d, p0/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_f64_tied, svfloat64_t, + p0 = svaclt_f64 (p0, z0, z1), + p0 = svaclt (p0, z0, z1)) + +/* +** aclt_f64_untied: +** ( +** facgt p0\.d, p1/z, z1\.d, z0\.d +** | +** faclt p0\.d, p1/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_f64_untied, svfloat64_t, + p0 = svaclt_f64 (p1, z0, z1), + p0 = svaclt (p1, z0, z1)) + +/* +** aclt_d4_f64: +** mov (z[0-9]+\.d), d4 +** ( +** facgt p0\.d, p1/z, \1, z0\.d +** | +** faclt p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (aclt_d4_f64, svfloat64_t, float64_t, + p0 = svaclt_n_f64 (p1, z0, d4), + p0 = svaclt (p1, z0, d4)) + +/* +** aclt_0_f64: +** mov (z[0-9]+\.d), #0 +** ( +** facgt p0\.d, p1/z, \1, z0\.d +** | +** faclt p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_0_f64, svfloat64_t, + p0 = svaclt_n_f64 (p1, z0, 0), + p0 = svaclt (p1, z0, 0)) + +/* +** aclt_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** ( +** facgt p0\.d, p1/z, \1, z0\.d +** | +** faclt p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (aclt_1_f64, svfloat64_t, + p0 = svaclt_n_f64 (p1, z0, 1), + p0 = svaclt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16.c new file mode 100644 index 000000000..7228e5dd5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16.c @@ -0,0 +1,577 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_m_tied1, svfloat16_t, + z0 = svadd_f16_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_m_tied2, svfloat16_t, + z0 = svadd_f16_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_m_untied, svfloat16_t, + z0 = svadd_f16_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svadd_n_f16_m (p0, z0, d4), + z0 = svadd_m (p0, z0, d4)) + +/* +** add_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svadd_n_f16_m (p0, z1, d4), + z0 = svadd_m (p0, z1, d4)) + +/* +** add_1_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_m_tied1, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_m_untied, svfloat16_t, + z0 = svadd_n_f16_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_0p5_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_m_tied1, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, 0.5), + z0 = svadd_m (p0, z0, 0.5)) + +/* +** add_0p5_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_m_untied, svfloat16_t, + z0 = svadd_n_f16_m (p0, z1, 0.5), + z0 = svadd_m (p0, z1, 0.5)) + +/* +** add_m1_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_m_tied1, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, -1), + z0 = svadd_m (p0, z0, -1)) + +/* +** add_m1_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_m_untied, svfloat16_t, + z0 = svadd_n_f16_m (p0, z1, -1), + z0 = svadd_m (p0, z1, -1)) + +/* +** add_m0p5_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_m_tied1, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, -0.5), + z0 = svadd_m (p0, z0, -0.5)) + +/* +** add_m0p5_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_m_untied, svfloat16_t, + z0 = svadd_n_f16_m (p0, z1, -0.5), + z0 = svadd_m (p0, z1, -0.5)) + +/* +** add_m2_f16_m: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
+** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f16_m, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_z_tied1, svfloat16_t, + z0 = svadd_f16_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_z_tied2, svfloat16_t, + z0 = svadd_f16_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f16_z_untied, svfloat16_t, + z0 = svadd_f16_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svadd_n_f16_z (p0, z0, d4), + z0 = svadd_z (p0, z0, d4)) + +/* +** add_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svadd_n_f16_z (p0, z1, d4), + z0 = svadd_z (p0, z1, d4)) + +/* +** add_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_z_tied1, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_z_untied, svfloat16_t, + z0 = svadd_n_f16_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_z_tied1, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, 0.5), + z0 = svadd_z (p0, z0, 0.5)) + +/* +** add_0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_z_untied, svfloat16_t, + z0 = svadd_n_f16_z (p0, z1, 0.5), + z0 = svadd_z (p0, z1, 0.5)) + +/* +** add_m1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_z_tied1, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, -1), + z0 = svadd_z (p0, z0, -1)) + +/* +** add_m1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_z_untied, svfloat16_t, + z0 = svadd_n_f16_z (p0, z1, -1), + z0 = svadd_z (p0, z1, -1)) + +/* +** add_m0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_z_tied1, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, -0.5), + z0 = svadd_z (p0, z0, -0.5)) + +/* +** add_m0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_z_untied, svfloat16_t, + z0 = svadd_n_f16_z (p0, z1, -0.5), + z0 = svadd_z (p0, z1, -0.5)) + +/* +** add_m2_f16_z: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f16_z, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, -2), + z0 = svadd_z (p0, z0, -2)) + +/* +** add_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_x_tied1, svfloat16_t, + z0 = svadd_f16_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_f16_x_tied2: +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_x_tied2, svfloat16_t, + z0 = svadd_f16_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_f16_x_untied: +** ( +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f16_x_untied, svfloat16_t, + z0 = svadd_f16_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svadd_n_f16_x (p0, z0, d4), + z0 = svadd_x (p0, z0, d4)) + +/* +** add_h4_f16_x_untied: +** mov z0\.h, h4 +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svadd_n_f16_x (p0, z1, d4), + z0 = svadd_x (p0, z1, d4)) + +/* +** add_1_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_f16_x_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_0p5_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, 0.5), + z0 = svadd_x (p0, z0, 0.5)) + +/* +** add_0p5_f16_x_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, 0.5), + z0 = svadd_x (p0, z1, 0.5)) + +/* +** add_m1_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m1_f16_x_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, -1), + z0 = svadd_x (p0, z1, -1)) + +/* +** add_m0p5_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, -0.5), + z0 = svadd_x (p0, z0, -0.5)) + +/* +** add_m0p5_f16_x_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, -0.5), + z0 = svadd_x (p0, z1, -0.5)) + +/* +** add_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_2_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, 2), + z0 = svadd_x (p0, z0, 2)) + +/* +** add_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? 
+** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_2_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, 2), + z0 = svadd_x (p0, z1, 2)) + +/* +** ptrue_add_f16_x_tied1: +** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f16_x_tied1, svfloat16_t, + z0 = svadd_f16_x (svptrue_b16 (), z0, z1), + z0 = svadd_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_add_f16_x_tied2: +** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f16_x_tied2, svfloat16_t, + z0 = svadd_f16_x (svptrue_b16 (), z1, z0), + z0 = svadd_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_add_f16_x_untied: +** fadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f16_x_untied, svfloat16_t, + z0 = svadd_f16_x (svptrue_b16 (), z1, z2), + z0 = svadd_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_add_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svadd_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_add_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svadd_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_add_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svadd_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_add_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svadd_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_add_m1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, -1), + z0 = svadd_x (svptrue_b16 (), z0, -1)) + +/* +** ptrue_add_m1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, -1), + z0 = svadd_x (svptrue_b16 (), z1, -1)) + +/* +** ptrue_add_m0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, -0.5), + z0 = svadd_x (svptrue_b16 (), z0, -0.5)) + +/* +** ptrue_add_m0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, -0.5), + z0 = svadd_x (svptrue_b16 (), z1, -0.5)) + +/* +** ptrue_add_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svadd_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_add_2_f16_x_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** fadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svadd_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16_notrap.c new file mode 100644 index 000000000..f6330acee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f16_notrap.c @@ -0,0 +1,572 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_m_tied1, svfloat16_t, + z0 = svadd_f16_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_m_tied2, svfloat16_t, + z0 = svadd_f16_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_m_untied, svfloat16_t, + z0 = svadd_f16_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svadd_n_f16_m (p0, z0, d4), + z0 = svadd_m (p0, z0, d4)) + +/* +** add_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svadd_n_f16_m (p0, z1, d4), + z0 = svadd_m (p0, z1, d4)) + +/* +** add_1_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_m_tied1, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_m_untied, svfloat16_t, + z0 = svadd_n_f16_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_0p5_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_m_tied1, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, 0.5), + z0 = svadd_m (p0, z0, 0.5)) + +/* +** add_0p5_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_m_untied, svfloat16_t, + z0 = svadd_n_f16_m (p0, z1, 0.5), + z0 = svadd_m (p0, z1, 0.5)) + +/* +** add_m1_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_m_tied1, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, -1), + z0 = svadd_m (p0, z0, -1)) + +/* +** add_m1_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_m_untied, svfloat16_t, + z0 = svadd_n_f16_m (p0, z1, -1), + z0 = svadd_m (p0, z1, -1)) + +/* +** add_m0p5_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_m_tied1, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, -0.5), + z0 = svadd_m (p0, z0, -0.5)) + +/* +** add_m0p5_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_m_untied, svfloat16_t, + z0 = svadd_n_f16_m (p0, z1, -0.5), + z0 = svadd_m (p0, z1, -0.5)) + +/* +** add_m2_f16_m: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
+** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f16_m, svfloat16_t, + z0 = svadd_n_f16_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_z_tied1, svfloat16_t, + z0 = svadd_f16_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_f16_z_tied2, svfloat16_t, + z0 = svadd_f16_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f16_z_untied, svfloat16_t, + z0 = svadd_f16_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svadd_n_f16_z (p0, z0, d4), + z0 = svadd_z (p0, z0, d4)) + +/* +** add_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fadd z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svadd_n_f16_z (p0, z1, d4), + z0 = svadd_z (p0, z1, d4)) + +/* +** add_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_z_tied1, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_z_untied, svfloat16_t, + z0 = svadd_n_f16_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_z_tied1, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, 0.5), + z0 = svadd_z (p0, z0, 0.5)) + +/* +** add_0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_z_untied, svfloat16_t, + z0 = svadd_n_f16_z (p0, z1, 0.5), + z0 = svadd_z (p0, z1, 0.5)) + +/* +** add_m1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_z_tied1, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, -1), + z0 = svadd_z (p0, z0, -1)) + +/* +** add_m1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_z_untied, svfloat16_t, + z0 = svadd_n_f16_z (p0, z1, -1), + z0 = svadd_z (p0, z1, -1)) + +/* +** add_m0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_z_tied1, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, -0.5), + z0 = svadd_z (p0, z0, -0.5)) + +/* +** add_m0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_z_untied, svfloat16_t, + z0 = svadd_n_f16_z (p0, z1, -0.5), + z0 = svadd_z (p0, z1, -0.5)) + +/* +** add_m2_f16_z: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f16_z, svfloat16_t, + z0 = svadd_n_f16_z (p0, z0, -2), + z0 = svadd_z (p0, z0, -2)) + +/* +** add_f16_x_tied1: +** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_f16_x_tied1, svfloat16_t, + z0 = svadd_f16_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_f16_x_tied2: +** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_f16_x_tied2, svfloat16_t, + z0 = svadd_f16_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_f16_x_untied: +** fadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (add_f16_x_untied, svfloat16_t, + z0 = svadd_f16_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svadd_n_f16_x (p0, z0, d4), + z0 = svadd_x (p0, z0, d4)) + +/* +** add_h4_f16_x_untied: +** mov (z[0-9]+\.h), h4 +** fadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_ZD (add_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svadd_n_f16_x (p0, z1, d4), + z0 = svadd_x (p0, z1, d4)) + +/* +** add_1_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_f16_x_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_0p5_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, 0.5), + z0 = svadd_x (p0, z0, 0.5)) + +/* +** add_0p5_f16_x_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, 0.5), + z0 = svadd_x (p0, z1, 0.5)) + +/* +** add_m1_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m1_f16_x_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, -1), + z0 = svadd_x (p0, z1, -1)) + +/* +** add_m0p5_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, -0.5), + z0 = svadd_x (p0, z0, -0.5)) + +/* +** add_m0p5_f16_x_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, -0.5), + z0 = svadd_x (p0, z1, -0.5)) + +/* +** add_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_2_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (p0, z0, 2), + z0 = svadd_x (p0, z0, 2)) + +/* +** add_2_f16_x_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** fadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (add_2_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (p0, z1, 2), + z0 = svadd_x (p0, z1, 2)) + +/* +** ptrue_add_f16_x_tied1: +** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f16_x_tied1, svfloat16_t, + z0 = svadd_f16_x (svptrue_b16 (), z0, z1), + z0 = svadd_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_add_f16_x_tied2: +** fadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f16_x_tied2, svfloat16_t, + z0 = svadd_f16_x (svptrue_b16 (), z1, z0), + z0 = svadd_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_add_f16_x_untied: +** fadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f16_x_untied, svfloat16_t, + z0 = svadd_f16_x (svptrue_b16 (), z1, z2), + z0 = svadd_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_add_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svadd_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_add_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svadd_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_add_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svadd_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_add_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svadd_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_add_m1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, -1), + z0 = svadd_x (svptrue_b16 (), z0, -1)) + +/* +** ptrue_add_m1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, -1), + z0 = svadd_x (svptrue_b16 (), z1, -1)) + +/* +** ptrue_add_m0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, -0.5), + z0 = svadd_x (svptrue_b16 (), z0, -0.5)) + +/* +** ptrue_add_m0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, -0.5), + z0 = svadd_x (svptrue_b16 (), z1, -0.5)) + +/* +** ptrue_add_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f16_x_tied1, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svadd_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_add_2_f16_x_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** fadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f16_x_untied, svfloat16_t, + z0 = svadd_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svadd_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32.c new file mode 100644 index 000000000..b5f4e9623 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32.c @@ -0,0 +1,577 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_m_tied1, svfloat32_t, + z0 = svadd_f32_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_m_tied2, svfloat32_t, + z0 = svadd_f32_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_m_untied, svfloat32_t, + z0 = svadd_f32_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_m_tied1, svfloat32_t, float, + z0 = svadd_n_f32_m (p0, z0, d4), + z0 = svadd_m (p0, z0, d4)) + +/* +** add_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_m_untied, svfloat32_t, float, + z0 = svadd_n_f32_m (p0, z1, d4), + z0 = svadd_m (p0, z1, d4)) + +/* +** add_1_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_m_tied1, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_m_untied, svfloat32_t, + z0 = svadd_n_f32_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_0p5_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_m_tied1, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, 0.5), + z0 = svadd_m (p0, z0, 0.5)) + +/* +** add_0p5_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_m_untied, svfloat32_t, + z0 = svadd_n_f32_m (p0, z1, 0.5), + z0 = svadd_m (p0, z1, 0.5)) + +/* +** add_m1_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_m_tied1, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, -1), + z0 = svadd_m (p0, z0, -1)) + +/* +** add_m1_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_m_untied, svfloat32_t, + z0 = svadd_n_f32_m (p0, z1, -1), + z0 = svadd_m (p0, z1, -1)) + +/* +** add_m0p5_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_m_tied1, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, -0.5), + z0 = svadd_m (p0, z0, -0.5)) + +/* +** add_m0p5_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_m_untied, svfloat32_t, + z0 = svadd_n_f32_m (p0, z1, -0.5), + z0 = svadd_m (p0, z1, -0.5)) + +/* +** add_m2_f32_m: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
+** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f32_m, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_z_tied1, svfloat32_t, + z0 = svadd_f32_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_z_tied2, svfloat32_t, + z0 = svadd_f32_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f32_z_untied, svfloat32_t, + z0 = svadd_f32_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_z_tied1, svfloat32_t, float, + z0 = svadd_n_f32_z (p0, z0, d4), + z0 = svadd_z (p0, z0, d4)) + +/* +** add_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_z_untied, svfloat32_t, float, + z0 = svadd_n_f32_z (p0, z1, d4), + z0 = svadd_z (p0, z1, d4)) + +/* +** add_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_z_tied1, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_z_untied, svfloat32_t, + z0 = svadd_n_f32_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_z_tied1, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, 0.5), + z0 = svadd_z (p0, z0, 0.5)) + +/* +** add_0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_z_untied, svfloat32_t, + z0 = svadd_n_f32_z (p0, z1, 0.5), + z0 = svadd_z (p0, z1, 0.5)) + +/* +** add_m1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_z_tied1, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, -1), + z0 = svadd_z (p0, z0, -1)) + +/* +** add_m1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_z_untied, svfloat32_t, + z0 = svadd_n_f32_z (p0, z1, -1), + z0 = svadd_z (p0, z1, -1)) + +/* +** add_m0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_z_tied1, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, -0.5), + z0 = svadd_z (p0, z0, -0.5)) + +/* +** add_m0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_z_untied, svfloat32_t, + z0 = svadd_n_f32_z (p0, z1, -0.5), + z0 = svadd_z (p0, z1, -0.5)) + +/* +** add_m2_f32_z: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f32_z, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, -2), + z0 = svadd_z (p0, z0, -2)) + +/* +** add_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_x_tied1, svfloat32_t, + z0 = svadd_f32_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_f32_x_tied2: +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_x_tied2, svfloat32_t, + z0 = svadd_f32_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_f32_x_untied: +** ( +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f32_x_untied, svfloat32_t, + z0 = svadd_f32_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_x_tied1, svfloat32_t, float, + z0 = svadd_n_f32_x (p0, z0, d4), + z0 = svadd_x (p0, z0, d4)) + +/* +** add_s4_f32_x_untied: +** mov z0\.s, s4 +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_x_untied, svfloat32_t, float, + z0 = svadd_n_f32_x (p0, z1, d4), + z0 = svadd_x (p0, z1, d4)) + +/* +** add_1_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_f32_x_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_0p5_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, 0.5), + z0 = svadd_x (p0, z0, 0.5)) + +/* +** add_0p5_f32_x_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, 0.5), + z0 = svadd_x (p0, z1, 0.5)) + +/* +** add_m1_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m1_f32_x_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, -1), + z0 = svadd_x (p0, z1, -1)) + +/* +** add_m0p5_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, -0.5), + z0 = svadd_x (p0, z0, -0.5)) + +/* +** add_m0p5_f32_x_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, -0.5), + z0 = svadd_x (p0, z1, -0.5)) + +/* +** add_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_2_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, 2), + z0 = svadd_x (p0, z0, 2)) + +/* +** add_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? 
+** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_2_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, 2), + z0 = svadd_x (p0, z1, 2)) + +/* +** ptrue_add_f32_x_tied1: +** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f32_x_tied1, svfloat32_t, + z0 = svadd_f32_x (svptrue_b32 (), z0, z1), + z0 = svadd_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_add_f32_x_tied2: +** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f32_x_tied2, svfloat32_t, + z0 = svadd_f32_x (svptrue_b32 (), z1, z0), + z0 = svadd_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_add_f32_x_untied: +** fadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f32_x_untied, svfloat32_t, + z0 = svadd_f32_x (svptrue_b32 (), z1, z2), + z0 = svadd_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_add_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svadd_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_add_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svadd_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_add_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svadd_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_add_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svadd_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_add_m1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, -1), + z0 = svadd_x (svptrue_b32 (), z0, -1)) + +/* +** ptrue_add_m1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, -1), + z0 = svadd_x (svptrue_b32 (), z1, -1)) + +/* +** ptrue_add_m0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, -0.5), + z0 = svadd_x (svptrue_b32 (), z0, -0.5)) + +/* +** ptrue_add_m0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, -0.5), + z0 = svadd_x (svptrue_b32 (), z1, -0.5)) + +/* +** ptrue_add_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svadd_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_add_2_f32_x_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** fadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svadd_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32_notrap.c new file mode 100644 index 000000000..062e5fd67 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f32_notrap.c @@ -0,0 +1,572 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_m_tied1, svfloat32_t, + z0 = svadd_f32_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_m_tied2, svfloat32_t, + z0 = svadd_f32_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_m_untied, svfloat32_t, + z0 = svadd_f32_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_m_tied1, svfloat32_t, float, + z0 = svadd_n_f32_m (p0, z0, d4), + z0 = svadd_m (p0, z0, d4)) + +/* +** add_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_m_untied, svfloat32_t, float, + z0 = svadd_n_f32_m (p0, z1, d4), + z0 = svadd_m (p0, z1, d4)) + +/* +** add_1_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_m_tied1, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_m_untied, svfloat32_t, + z0 = svadd_n_f32_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_0p5_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_m_tied1, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, 0.5), + z0 = svadd_m (p0, z0, 0.5)) + +/* +** add_0p5_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_m_untied, svfloat32_t, + z0 = svadd_n_f32_m (p0, z1, 0.5), + z0 = svadd_m (p0, z1, 0.5)) + +/* +** add_m1_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_m_tied1, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, -1), + z0 = svadd_m (p0, z0, -1)) + +/* +** add_m1_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_m_untied, svfloat32_t, + z0 = svadd_n_f32_m (p0, z1, -1), + z0 = svadd_m (p0, z1, -1)) + +/* +** add_m0p5_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_m_tied1, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, -0.5), + z0 = svadd_m (p0, z0, -0.5)) + +/* +** add_m0p5_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_m_untied, svfloat32_t, + z0 = svadd_n_f32_m (p0, z1, -0.5), + z0 = svadd_m (p0, z1, -0.5)) + +/* +** add_m2_f32_m: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
+** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f32_m, svfloat32_t, + z0 = svadd_n_f32_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_z_tied1, svfloat32_t, + z0 = svadd_f32_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_f32_z_tied2, svfloat32_t, + z0 = svadd_f32_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f32_z_untied, svfloat32_t, + z0 = svadd_f32_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_z_tied1, svfloat32_t, float, + z0 = svadd_n_f32_z (p0, z0, d4), + z0 = svadd_z (p0, z0, d4)) + +/* +** add_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fadd z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_z_untied, svfloat32_t, float, + z0 = svadd_n_f32_z (p0, z1, d4), + z0 = svadd_z (p0, z1, d4)) + +/* +** add_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_z_tied1, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_z_untied, svfloat32_t, + z0 = svadd_n_f32_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_z_tied1, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, 0.5), + z0 = svadd_z (p0, z0, 0.5)) + +/* +** add_0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_z_untied, svfloat32_t, + z0 = svadd_n_f32_z (p0, z1, 0.5), + z0 = svadd_z (p0, z1, 0.5)) + +/* +** add_m1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_z_tied1, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, -1), + z0 = svadd_z (p0, z0, -1)) + +/* +** add_m1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_z_untied, svfloat32_t, + z0 = svadd_n_f32_z (p0, z1, -1), + z0 = svadd_z (p0, z1, -1)) + +/* +** add_m0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_z_tied1, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, -0.5), + z0 = svadd_z (p0, z0, -0.5)) + +/* +** add_m0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_z_untied, svfloat32_t, + z0 = svadd_n_f32_z (p0, z1, -0.5), + z0 = svadd_z (p0, z1, -0.5)) + +/* +** add_m2_f32_z: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f32_z, svfloat32_t, + z0 = svadd_n_f32_z (p0, z0, -2), + z0 = svadd_z (p0, z0, -2)) + +/* +** add_f32_x_tied1: +** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_f32_x_tied1, svfloat32_t, + z0 = svadd_f32_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_f32_x_tied2: +** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_f32_x_tied2, svfloat32_t, + z0 = svadd_f32_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_f32_x_untied: +** fadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (add_f32_x_untied, svfloat32_t, + z0 = svadd_f32_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_x_tied1, svfloat32_t, float, + z0 = svadd_n_f32_x (p0, z0, d4), + z0 = svadd_x (p0, z0, d4)) + +/* +** add_s4_f32_x_untied: +** mov (z[0-9]+\.s), s4 +** fadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_ZD (add_s4_f32_x_untied, svfloat32_t, float, + z0 = svadd_n_f32_x (p0, z1, d4), + z0 = svadd_x (p0, z1, d4)) + +/* +** add_1_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_f32_x_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_0p5_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, 0.5), + z0 = svadd_x (p0, z0, 0.5)) + +/* +** add_0p5_f32_x_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, 0.5), + z0 = svadd_x (p0, z1, 0.5)) + +/* +** add_m1_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m1_f32_x_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, -1), + z0 = svadd_x (p0, z1, -1)) + +/* +** add_m0p5_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, -0.5), + z0 = svadd_x (p0, z0, -0.5)) + +/* +** add_m0p5_f32_x_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, -0.5), + z0 = svadd_x (p0, z1, -0.5)) + +/* +** add_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_2_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (p0, z0, 2), + z0 = svadd_x (p0, z0, 2)) + +/* +** add_2_f32_x_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** fadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (add_2_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (p0, z1, 2), + z0 = svadd_x (p0, z1, 2)) + +/* +** ptrue_add_f32_x_tied1: +** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f32_x_tied1, svfloat32_t, + z0 = svadd_f32_x (svptrue_b32 (), z0, z1), + z0 = svadd_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_add_f32_x_tied2: +** fadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f32_x_tied2, svfloat32_t, + z0 = svadd_f32_x (svptrue_b32 (), z1, z0), + z0 = svadd_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_add_f32_x_untied: +** fadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f32_x_untied, svfloat32_t, + z0 = svadd_f32_x (svptrue_b32 (), z1, z2), + z0 = svadd_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_add_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svadd_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_add_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svadd_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_add_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svadd_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_add_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svadd_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_add_m1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, -1), + z0 = svadd_x (svptrue_b32 (), z0, -1)) + +/* +** ptrue_add_m1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, -1), + z0 = svadd_x (svptrue_b32 (), z1, -1)) + +/* +** ptrue_add_m0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, -0.5), + z0 = svadd_x (svptrue_b32 (), z0, -0.5)) + +/* +** ptrue_add_m0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, -0.5), + z0 = svadd_x (svptrue_b32 (), z1, -0.5)) + +/* +** ptrue_add_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f32_x_tied1, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svadd_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_add_2_f32_x_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** fadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f32_x_untied, svfloat32_t, + z0 = svadd_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svadd_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64.c new file mode 100644 index 000000000..7185f3acf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64.c @@ -0,0 +1,577 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_m_tied1, svfloat64_t, + z0 = svadd_f64_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_f64_m_tied2, svfloat64_t, + z0 = svadd_f64_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_m_untied, svfloat64_t, + z0 = svadd_f64_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_m_tied1, svfloat64_t, double, + z0 = svadd_n_f64_m (p0, z0, d4), + z0 = svadd_m (p0, z0, d4)) + +/* +** add_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_m_untied, svfloat64_t, double, + z0 = svadd_n_f64_m (p0, z1, d4), + z0 = svadd_m (p0, z1, d4)) + +/* +** add_1_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_m_tied1, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_m_untied, svfloat64_t, + z0 = svadd_n_f64_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_0p5_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_m_tied1, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, 0.5), + z0 = svadd_m (p0, z0, 0.5)) + +/* +** add_0p5_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_m_untied, svfloat64_t, + z0 = svadd_n_f64_m (p0, z1, 0.5), + z0 = svadd_m (p0, z1, 0.5)) + +/* +** add_m1_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_m_tied1, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, -1), + z0 = svadd_m (p0, z0, -1)) + +/* +** add_m1_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_m_untied, svfloat64_t, + z0 = svadd_n_f64_m (p0, z1, -1), + z0 = svadd_m (p0, z1, -1)) + +/* +** add_m0p5_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_m_tied1, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, -0.5), + z0 = svadd_m (p0, z0, -0.5)) + +/* +** add_m0p5_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_m_untied, svfloat64_t, + z0 = svadd_n_f64_m (p0, z1, -0.5), + z0 = svadd_m (p0, z1, -0.5)) + +/* +** add_m2_f64_m: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
+** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f64_m, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_z_tied1, svfloat64_t, + z0 = svadd_f64_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_z_tied2, svfloat64_t, + z0 = svadd_f64_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f64_z_untied, svfloat64_t, + z0 = svadd_f64_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_z_tied1, svfloat64_t, double, + z0 = svadd_n_f64_z (p0, z0, d4), + z0 = svadd_z (p0, z0, d4)) + +/* +** add_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_z_untied, svfloat64_t, double, + z0 = svadd_n_f64_z (p0, z1, d4), + z0 = svadd_z (p0, z1, d4)) + +/* +** add_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_z_tied1, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_z_untied, svfloat64_t, + z0 = svadd_n_f64_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_z_tied1, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, 0.5), + z0 = svadd_z (p0, z0, 0.5)) + +/* +** add_0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_z_untied, svfloat64_t, + z0 = svadd_n_f64_z (p0, z1, 0.5), + z0 = svadd_z (p0, z1, 0.5)) + +/* +** add_m1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_z_tied1, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, -1), + z0 = svadd_z (p0, z0, -1)) + +/* +** add_m1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_z_untied, svfloat64_t, + z0 = svadd_n_f64_z (p0, z1, -1), + z0 = svadd_z (p0, z1, -1)) + +/* +** add_m0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_z_tied1, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, -0.5), + z0 = svadd_z (p0, z0, -0.5)) + +/* +** add_m0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_z_untied, svfloat64_t, + z0 = svadd_n_f64_z (p0, z1, -0.5), + z0 = svadd_z (p0, z1, -0.5)) + +/* +** add_m2_f64_z: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f64_z, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, -2), + z0 = svadd_z (p0, z0, -2)) + +/* +** add_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_x_tied1, svfloat64_t, + z0 = svadd_f64_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_f64_x_tied2: +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_x_tied2, svfloat64_t, + z0 = svadd_f64_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_f64_x_untied: +** ( +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f64_x_untied, svfloat64_t, + z0 = svadd_f64_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_x_tied1, svfloat64_t, double, + z0 = svadd_n_f64_x (p0, z0, d4), + z0 = svadd_x (p0, z0, d4)) + +/* +** add_d4_f64_x_untied: +** mov z0\.d, d4 +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_x_untied, svfloat64_t, double, + z0 = svadd_n_f64_x (p0, z1, d4), + z0 = svadd_x (p0, z1, d4)) + +/* +** add_1_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_f64_x_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_0p5_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, 0.5), + z0 = svadd_x (p0, z0, 0.5)) + +/* +** add_0p5_f64_x_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, 0.5), + z0 = svadd_x (p0, z1, 0.5)) + +/* +** add_m1_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m1_f64_x_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, -1), + z0 = svadd_x (p0, z1, -1)) + +/* +** add_m0p5_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, -0.5), + z0 = svadd_x (p0, z0, -0.5)) + +/* +** add_m0p5_f64_x_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, -0.5), + z0 = svadd_x (p0, z1, -0.5)) + +/* +** add_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_2_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, 2), + z0 = svadd_x (p0, z0, 2)) + +/* +** add_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? 
+** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_2_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, 2), + z0 = svadd_x (p0, z1, 2)) + +/* +** ptrue_add_f64_x_tied1: +** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f64_x_tied1, svfloat64_t, + z0 = svadd_f64_x (svptrue_b64 (), z0, z1), + z0 = svadd_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_add_f64_x_tied2: +** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f64_x_tied2, svfloat64_t, + z0 = svadd_f64_x (svptrue_b64 (), z1, z0), + z0 = svadd_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_add_f64_x_untied: +** fadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f64_x_untied, svfloat64_t, + z0 = svadd_f64_x (svptrue_b64 (), z1, z2), + z0 = svadd_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_add_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svadd_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_add_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svadd_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_add_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svadd_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_add_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svadd_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_add_m1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, -1), + z0 = svadd_x (svptrue_b64 (), z0, -1)) + +/* +** ptrue_add_m1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, -1), + z0 = svadd_x (svptrue_b64 (), z1, -1)) + +/* +** ptrue_add_m0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, -0.5), + z0 = svadd_x (svptrue_b64 (), z0, -0.5)) + +/* +** ptrue_add_m0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, -0.5), + z0 = svadd_x (svptrue_b64 (), z1, -0.5)) + +/* +** ptrue_add_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svadd_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_add_2_f64_x_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** fadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svadd_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64_notrap.c new file mode 100644 index 000000000..6d095b507 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_f64_notrap.c @@ -0,0 +1,572 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_m_tied1, svfloat64_t, + z0 = svadd_f64_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_f64_m_tied2, svfloat64_t, + z0 = svadd_f64_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_m_untied, svfloat64_t, + z0 = svadd_f64_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_m_tied1, svfloat64_t, double, + z0 = svadd_n_f64_m (p0, z0, d4), + z0 = svadd_m (p0, z0, d4)) + +/* +** add_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_m_untied, svfloat64_t, double, + z0 = svadd_n_f64_m (p0, z1, d4), + z0 = svadd_m (p0, z1, d4)) + +/* +** add_1_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_m_tied1, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_m_untied, svfloat64_t, + z0 = svadd_n_f64_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_0p5_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_m_tied1, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, 0.5), + z0 = svadd_m (p0, z0, 0.5)) + +/* +** add_0p5_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_m_untied, svfloat64_t, + z0 = svadd_n_f64_m (p0, z1, 0.5), + z0 = svadd_m (p0, z1, 0.5)) + +/* +** add_m1_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_m_tied1, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, -1), + z0 = svadd_m (p0, z0, -1)) + +/* +** add_m1_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_m_untied, svfloat64_t, + z0 = svadd_n_f64_m (p0, z1, -1), + z0 = svadd_m (p0, z1, -1)) + +/* +** add_m0p5_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_m_tied1, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, -0.5), + z0 = svadd_m (p0, z0, -0.5)) + +/* +** add_m0p5_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_m_untied, svfloat64_t, + z0 = svadd_n_f64_m (p0, z1, -0.5), + z0 = svadd_m (p0, z1, -0.5)) + +/* +** add_m2_f64_m: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
+** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f64_m, svfloat64_t, + z0 = svadd_n_f64_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_z_tied1, svfloat64_t, + z0 = svadd_f64_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_f64_z_tied2, svfloat64_t, + z0 = svadd_f64_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (add_f64_z_untied, svfloat64_t, + z0 = svadd_f64_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_z_tied1, svfloat64_t, double, + z0 = svadd_n_f64_z (p0, z0, d4), + z0 = svadd_z (p0, z0, d4)) + +/* +** add_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fadd z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_z_untied, svfloat64_t, double, + z0 = svadd_n_f64_z (p0, z1, d4), + z0 = svadd_z (p0, z1, d4)) + +/* +** add_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_z_tied1, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_z_untied, svfloat64_t, + z0 = svadd_n_f64_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_z_tied1, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, 0.5), + z0 = svadd_z (p0, z0, 0.5)) + +/* +** add_0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_z_untied, svfloat64_t, + z0 = svadd_n_f64_z (p0, z1, 0.5), + z0 = svadd_z (p0, z1, 0.5)) + +/* +** add_m1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_z_tied1, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, -1), + z0 = svadd_z (p0, z0, -1)) + +/* +** add_m1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_z_untied, svfloat64_t, + z0 = svadd_n_f64_z (p0, z1, -1), + z0 = svadd_z (p0, z1, -1)) + +/* +** add_m0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_z_tied1, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, -0.5), + z0 = svadd_z (p0, z0, -0.5)) + +/* +** add_m0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_z_untied, svfloat64_t, + z0 = svadd_n_f64_z (p0, z1, -0.5), + z0 = svadd_z (p0, z1, -0.5)) + +/* +** add_m2_f64_z: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_f64_z, svfloat64_t, + z0 = svadd_n_f64_z (p0, z0, -2), + z0 = svadd_z (p0, z0, -2)) + +/* +** add_f64_x_tied1: +** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_f64_x_tied1, svfloat64_t, + z0 = svadd_f64_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_f64_x_tied2: +** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_f64_x_tied2, svfloat64_t, + z0 = svadd_f64_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_f64_x_untied: +** fadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (add_f64_x_untied, svfloat64_t, + z0 = svadd_f64_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_x_tied1, svfloat64_t, double, + z0 = svadd_n_f64_x (p0, z0, d4), + z0 = svadd_x (p0, z0, d4)) + +/* +** add_d4_f64_x_untied: +** mov (z[0-9]+\.d), d4 +** fadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZD (add_d4_f64_x_untied, svfloat64_t, double, + z0 = svadd_n_f64_x (p0, z1, d4), + z0 = svadd_x (p0, z1, d4)) + +/* +** add_1_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_f64_x_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_1_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_0p5_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, 0.5), + z0 = svadd_x (p0, z0, 0.5)) + +/* +** add_0p5_f64_x_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_0p5_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, 0.5), + z0 = svadd_x (p0, z1, 0.5)) + +/* +** add_m1_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m1_f64_x_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (add_m1_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, -1), + z0 = svadd_x (p0, z1, -1)) + +/* +** add_m0p5_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, -0.5), + z0 = svadd_x (p0, z0, -0.5)) + +/* +** add_m0p5_f64_x_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (add_m0p5_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, -0.5), + z0 = svadd_x (p0, z1, -0.5)) + +/* +** add_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_2_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (p0, z0, 2), + z0 = svadd_x (p0, z0, 2)) + +/* +** add_2_f64_x_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** fadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (add_2_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (p0, z1, 2), + z0 = svadd_x (p0, z1, 2)) + +/* +** ptrue_add_f64_x_tied1: +** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f64_x_tied1, svfloat64_t, + z0 = svadd_f64_x (svptrue_b64 (), z0, z1), + z0 = svadd_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_add_f64_x_tied2: +** fadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f64_x_tied2, svfloat64_t, + z0 = svadd_f64_x (svptrue_b64 (), z1, z0), + z0 = svadd_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_add_f64_x_untied: +** fadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_f64_x_untied, svfloat64_t, + z0 = svadd_f64_x (svptrue_b64 (), z1, z2), + z0 = svadd_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_add_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svadd_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_add_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_1_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svadd_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_add_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svadd_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_add_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_0p5_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svadd_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_add_m1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, -1), + z0 = svadd_x (svptrue_b64 (), z0, -1)) + +/* +** ptrue_add_m1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m1_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, -1), + z0 = svadd_x (svptrue_b64 (), z1, -1)) + +/* +** ptrue_add_m0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, -0.5), + z0 = svadd_x (svptrue_b64 (), z0, -0.5)) + +/* +** ptrue_add_m0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_m0p5_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, -0.5), + z0 = svadd_x (svptrue_b64 (), z1, -0.5)) + +/* +** ptrue_add_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f64_x_tied1, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svadd_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_add_2_f64_x_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** fadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_add_2_f64_x_untied, svfloat64_t, + z0 = svadd_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svadd_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s16.c new file mode 100644 index 000000000..c0883edf9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s16.c @@ -0,0 +1,377 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_s16_m_tied1: +** add z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_s16_m_tied1, svint16_t, + z0 = svadd_s16_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (add_s16_m_tied2, svint16_t, + z0 = svadd_s16_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_s16_m_untied: +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (add_s16_m_untied, svint16_t, + z0 = svadd_s16_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svadd_n_s16_m (p0, z0, x0), + z0 = svadd_m (p0, z0, x0)) + +/* +** add_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s16_m_untied, svint16_t, int16_t, + z0 = svadd_n_s16_m (p0, z1, x0), + z0 = svadd_m (p0, z1, x0)) + +/* +** add_1_s16_m_tied1: +** mov (z[0-9]+\.h), #1 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s16_m_tied1, svint16_t, + z0 = svadd_n_s16_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s16_m_untied, svint16_t, + z0 = svadd_n_s16_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_m2_s16_m: +** mov (z[0-9]+\.h), #-2 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_s16_m, svint16_t, + z0 = svadd_n_s16_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_s16_z_tied1, svint16_t, + z0 = svadd_s16_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_s16_z_tied2, svint16_t, + z0 = svadd_s16_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** add z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** add z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (add_s16_z_untied, svint16_t, + z0 = svadd_s16_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svadd_n_s16_z (p0, z0, x0), + z0 = svadd_z (p0, z0, x0)) + +/* +** add_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** add z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** add z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s16_z_untied, svint16_t, int16_t, + z0 = svadd_n_s16_z 
(p0, z1, x0), + z0 = svadd_z (p0, z1, x0)) + +/* +** add_1_s16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s16_z_tied1, svint16_t, + z0 = svadd_n_s16_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_s16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** add z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** add z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (add_1_s16_z_untied, svint16_t, + z0 = svadd_n_s16_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_s16_x_tied1: +** add z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_s16_x_tied1, svint16_t, + z0 = svadd_s16_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_s16_x_tied2: +** add z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_s16_x_tied2, svint16_t, + z0 = svadd_s16_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_s16_x_untied: +** add z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (add_s16_x_untied, svint16_t, + z0 = svadd_s16_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** add z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svadd_n_s16_x (p0, z0, x0), + z0 = svadd_x (p0, z0, x0)) + +/* +** add_w0_s16_x_untied: +** mov (z[0-9]+\.h), w0 +** add z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s16_x_untied, svint16_t, int16_t, + z0 = svadd_n_s16_x (p0, z1, x0), + z0 = svadd_x (p0, z1, x0)) + +/* +** add_1_s16_x_tied1: +** add z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s16_x_tied1, svint16_t, + z0 = svadd_n_s16_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_s16_x_untied: +** movprfx z0, z1 +** add z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s16_x_untied, svint16_t, + z0 = svadd_n_s16_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_127_s16_x: +** add z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (add_127_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, 127), + z0 = svadd_x (p0, z0, 127)) + +/* +** add_128_s16_x: +** add z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (add_128_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, 128), + z0 = svadd_x (p0, z0, 128)) + +/* +** add_255_s16_x: +** add z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (add_255_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, 255), + z0 = svadd_x (p0, z0, 255)) + +/* +** add_256_s16_x: +** add z0\.h, z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (add_256_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, 256), + z0 = svadd_x (p0, z0, 256)) + +/* +** add_257_s16_x: +** mov (z[0-9]+)\.b, #1 +** add z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_257_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, 257), + z0 = svadd_x (p0, z0, 257)) + +/* +** add_512_s16_x: +** add z0\.h, z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (add_512_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, 512), + z0 = svadd_x (p0, z0, 512)) + +/* +** add_65280_s16_x: +** add z0\.h, z0\.h, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_65280_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, 0xff00), + z0 = svadd_x (p0, z0, 0xff00)) + +/* +** add_m1_s16_x: +** sub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (add_m1_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m127_s16_x: +** sub z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (add_m127_s16_x, svint16_t, + z0 = svadd_n_s16_x 
(p0, z0, -127), + z0 = svadd_x (p0, z0, -127)) + +/* +** add_m128_s16_x: +** sub z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (add_m128_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, -128), + z0 = svadd_x (p0, z0, -128)) + +/* +** add_m255_s16_x: +** sub z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (add_m255_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, -255), + z0 = svadd_x (p0, z0, -255)) + +/* +** add_m256_s16_x: +** add z0\.h, z0\.h, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_m256_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, -256), + z0 = svadd_x (p0, z0, -256)) + +/* +** add_m257_s16_x: +** mov (z[0-9]+\.h), #-257 +** add z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_m257_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, -257), + z0 = svadd_x (p0, z0, -257)) + +/* +** add_m512_s16_x: +** add z0\.h, z0\.h, #65024 +** ret +*/ +TEST_UNIFORM_Z (add_m512_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, -512), + z0 = svadd_x (p0, z0, -512)) + +/* +** add_m32768_s16_x: +** add z0\.h, z0\.h, #32768 +** ret +*/ +TEST_UNIFORM_Z (add_m32768_s16_x, svint16_t, + z0 = svadd_n_s16_x (p0, z0, -0x8000), + z0 = svadd_x (p0, z0, -0x8000)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s32.c new file mode 100644 index 000000000..887038ba3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s32.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_s32_m_tied1: +** add z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_s32_m_tied1, svint32_t, + z0 = svadd_s32_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (add_s32_m_tied2, svint32_t, + z0 = svadd_s32_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_s32_m_untied: +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (add_s32_m_untied, svint32_t, + z0 = svadd_s32_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svadd_n_s32_m (p0, z0, x0), + z0 = svadd_m (p0, z0, x0)) + +/* +** add_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s32_m_untied, svint32_t, int32_t, + z0 = svadd_n_s32_m (p0, z1, x0), + z0 = svadd_m (p0, z1, x0)) + +/* +** add_1_s32_m_tied1: +** mov (z[0-9]+\.s), #1 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s32_m_tied1, svint32_t, + z0 = svadd_n_s32_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s32_m_untied, svint32_t, + z0 = svadd_n_s32_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_m2_s32_m: +** mov (z[0-9]+\.s), #-2 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_s32_m, svint32_t, + z0 = svadd_n_s32_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_s32_z_tied1, svint32_t, + z0 = svadd_s32_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, z1\.s +** 
ret +*/ +TEST_UNIFORM_Z (add_s32_z_tied2, svint32_t, + z0 = svadd_s32_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** add z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** add z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (add_s32_z_untied, svint32_t, + z0 = svadd_s32_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svadd_n_s32_z (p0, z0, x0), + z0 = svadd_z (p0, z0, x0)) + +/* +** add_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** add z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** add z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s32_z_untied, svint32_t, int32_t, + z0 = svadd_n_s32_z (p0, z1, x0), + z0 = svadd_z (p0, z1, x0)) + +/* +** add_1_s32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s32_z_tied1, svint32_t, + z0 = svadd_n_s32_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_s32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** add z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** add z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (add_1_s32_z_untied, svint32_t, + z0 = svadd_n_s32_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_s32_x_tied1: +** add z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_s32_x_tied1, svint32_t, + z0 = svadd_s32_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_s32_x_tied2: +** add z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_s32_x_tied2, svint32_t, + z0 = svadd_s32_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_s32_x_untied: +** add z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (add_s32_x_untied, svint32_t, + z0 = svadd_s32_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svadd_n_s32_x (p0, z0, x0), + z0 = svadd_x (p0, z0, x0)) + +/* +** add_w0_s32_x_untied: +** mov (z[0-9]+\.s), w0 +** add z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s32_x_untied, svint32_t, int32_t, + z0 = svadd_n_s32_x (p0, z1, x0), + z0 = svadd_x (p0, z1, x0)) + +/* +** add_1_s32_x_tied1: +** add z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s32_x_tied1, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_s32_x_untied: +** movprfx z0, z1 +** add z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s32_x_untied, svint32_t, + z0 = svadd_n_s32_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_127_s32_x: +** add z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (add_127_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 127), + z0 = svadd_x (p0, z0, 127)) + +/* +** add_128_s32_x: +** add z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (add_128_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 128), + z0 = svadd_x (p0, z0, 128)) + +/* +** add_255_s32_x: +** add z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (add_255_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 255), + z0 = svadd_x (p0, z0, 255)) + +/* +** add_256_s32_x: +** add z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z 
(add_256_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 256), + z0 = svadd_x (p0, z0, 256)) + +/* +** add_511_s32_x: +** mov (z[0-9]+\.s), #511 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_511_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 511), + z0 = svadd_x (p0, z0, 511)) + +/* +** add_512_s32_x: +** add z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (add_512_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 512), + z0 = svadd_x (p0, z0, 512)) + +/* +** add_65280_s32_x: +** add z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_65280_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 0xff00), + z0 = svadd_x (p0, z0, 0xff00)) + +/* +** add_65535_s32_x: +** mov (z[0-9]+\.s), #65535 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_65535_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 65535), + z0 = svadd_x (p0, z0, 65535)) + +/* +** add_65536_s32_x: +** mov (z[0-9]+\.s), #65536 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_65536_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, 65536), + z0 = svadd_x (p0, z0, 65536)) + +/* +** add_m1_s32_x: +** sub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (add_m1_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m127_s32_x: +** sub z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (add_m127_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -127), + z0 = svadd_x (p0, z0, -127)) + +/* +** add_m128_s32_x: +** sub z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (add_m128_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -128), + z0 = svadd_x (p0, z0, -128)) + +/* +** add_m255_s32_x: +** sub z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (add_m255_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -255), + z0 = svadd_x (p0, z0, -255)) + +/* +** add_m256_s32_x: +** sub z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (add_m256_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -256), + z0 = svadd_x (p0, z0, -256)) + +/* +** add_m511_s32_x: +** mov (z[0-9]+\.s), #-511 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_m511_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -511), + z0 = svadd_x (p0, z0, -511)) + +/* +** add_m512_s32_x: +** sub z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (add_m512_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -512), + z0 = svadd_x (p0, z0, -512)) + +/* +** add_m32768_s32_x: +** sub z0\.s, z0\.s, #32768 +** ret +*/ +TEST_UNIFORM_Z (add_m32768_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -0x8000), + z0 = svadd_x (p0, z0, -0x8000)) + +/* +** add_m65280_s32_x: +** sub z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_m65280_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -0xff00), + z0 = svadd_x (p0, z0, -0xff00)) + +/* +** add_m65535_s32_x: +** mov (z[0-9]+\.s), #-65535 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_m65535_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -65535), + z0 = svadd_x (p0, z0, -65535)) + +/* +** add_m65536_s32_x: +** mov (z[0-9]+\.s), #-65536 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_m65536_s32_x, svint32_t, + z0 = svadd_n_s32_x (p0, z0, -65536), + z0 = svadd_x (p0, z0, -65536)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s64.c new file mode 100644 index 000000000..aab63ef62 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s64.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include 
"test_sve_acle.h" + +/* +** add_s64_m_tied1: +** add z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_s64_m_tied1, svint64_t, + z0 = svadd_s64_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_s64_m_tied2, svint64_t, + z0 = svadd_s64_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_s64_m_untied: +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (add_s64_m_untied, svint64_t, + z0 = svadd_s64_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svadd_n_s64_m (p0, z0, x0), + z0 = svadd_m (p0, z0, x0)) + +/* +** add_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_x0_s64_m_untied, svint64_t, int64_t, + z0 = svadd_n_s64_m (p0, z1, x0), + z0 = svadd_m (p0, z1, x0)) + +/* +** add_1_s64_m_tied1: +** mov (z[0-9]+\.d), #1 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s64_m_tied1, svint64_t, + z0 = svadd_n_s64_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s64_m_untied, svint64_t, + z0 = svadd_n_s64_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_m2_s64_m: +** mov (z[0-9]+\.d), #-2 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_s64_m, svint64_t, + z0 = svadd_n_s64_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_s64_z_tied1, svint64_t, + z0 = svadd_s64_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_s64_z_tied2, svint64_t, + z0 = svadd_s64_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** add z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** add z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (add_s64_z_untied, svint64_t, + z0 = svadd_s64_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svadd_n_s64_z (p0, z0, x0), + z0 = svadd_z (p0, z0, x0)) + +/* +** add_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** add z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** add z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (add_x0_s64_z_untied, svint64_t, int64_t, + z0 = svadd_n_s64_z (p0, z1, x0), + z0 = svadd_z (p0, z1, x0)) + +/* +** add_1_s64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s64_z_tied1, svint64_t, + z0 = svadd_n_s64_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_s64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** add z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** add z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (add_1_s64_z_untied, svint64_t, + z0 = svadd_n_s64_z (p0, 
z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_s64_x_tied1: +** add z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_s64_x_tied1, svint64_t, + z0 = svadd_s64_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_s64_x_tied2: +** add z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_s64_x_tied2, svint64_t, + z0 = svadd_s64_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_s64_x_untied: +** add z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (add_s64_x_untied, svint64_t, + z0 = svadd_s64_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (add_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svadd_n_s64_x (p0, z0, x0), + z0 = svadd_x (p0, z0, x0)) + +/* +** add_x0_s64_x_untied: +** mov (z[0-9]+\.d), x0 +** add z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (add_x0_s64_x_untied, svint64_t, int64_t, + z0 = svadd_n_s64_x (p0, z1, x0), + z0 = svadd_x (p0, z1, x0)) + +/* +** add_1_s64_x_tied1: +** add z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s64_x_tied1, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_s64_x_untied: +** movprfx z0, z1 +** add z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s64_x_untied, svint64_t, + z0 = svadd_n_s64_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_127_s64_x: +** add z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (add_127_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 127), + z0 = svadd_x (p0, z0, 127)) + +/* +** add_128_s64_x: +** add z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (add_128_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 128), + z0 = svadd_x (p0, z0, 128)) + +/* +** add_255_s64_x: +** add z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (add_255_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 255), + z0 = svadd_x (p0, z0, 255)) + +/* +** add_256_s64_x: +** add z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (add_256_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 256), + z0 = svadd_x (p0, z0, 256)) + +/* +** add_511_s64_x: +** mov (z[0-9]+\.d), #511 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_511_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 511), + z0 = svadd_x (p0, z0, 511)) + +/* +** add_512_s64_x: +** add z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (add_512_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 512), + z0 = svadd_x (p0, z0, 512)) + +/* +** add_65280_s64_x: +** add z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_65280_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 0xff00), + z0 = svadd_x (p0, z0, 0xff00)) + +/* +** add_65535_s64_x: +** mov (z[0-9]+\.d), #65535 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_65535_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 65535), + z0 = svadd_x (p0, z0, 65535)) + +/* +** add_65536_s64_x: +** mov (z[0-9]+\.d), #65536 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_65536_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, 65536), + z0 = svadd_x (p0, z0, 65536)) + +/* +** add_m1_s64_x: +** sub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (add_m1_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m127_s64_x: +** sub z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (add_m127_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -127), + z0 = svadd_x (p0, z0, -127)) + +/* +** add_m128_s64_x: +** sub z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z 
(add_m128_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -128), + z0 = svadd_x (p0, z0, -128)) + +/* +** add_m255_s64_x: +** sub z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (add_m255_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -255), + z0 = svadd_x (p0, z0, -255)) + +/* +** add_m256_s64_x: +** sub z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (add_m256_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -256), + z0 = svadd_x (p0, z0, -256)) + +/* +** add_m511_s64_x: +** mov (z[0-9]+\.d), #-511 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_m511_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -511), + z0 = svadd_x (p0, z0, -511)) + +/* +** add_m512_s64_x: +** sub z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (add_m512_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -512), + z0 = svadd_x (p0, z0, -512)) + +/* +** add_m32768_s64_x: +** sub z0\.d, z0\.d, #32768 +** ret +*/ +TEST_UNIFORM_Z (add_m32768_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -0x8000), + z0 = svadd_x (p0, z0, -0x8000)) + +/* +** add_m65280_s64_x: +** sub z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_m65280_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -0xff00), + z0 = svadd_x (p0, z0, -0xff00)) + +/* +** add_m65535_s64_x: +** mov (z[0-9]+\.d), #-65535 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_m65535_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -65535), + z0 = svadd_x (p0, z0, -65535)) + +/* +** add_m65536_s64_x: +** mov (z[0-9]+\.d), #-65536 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_m65536_s64_x, svint64_t, + z0 = svadd_n_s64_x (p0, z0, -65536), + z0 = svadd_x (p0, z0, -65536)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s8.c new file mode 100644 index 000000000..0889c189d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_s8.c @@ -0,0 +1,294 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_s8_m_tied1: +** add z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (add_s8_m_tied1, svint8_t, + z0 = svadd_s8_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (add_s8_m_tied2, svint8_t, + z0 = svadd_s8_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_s8_m_untied: +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (add_s8_m_untied, svint8_t, + z0 = svadd_s8_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svadd_n_s8_m (p0, z0, x0), + z0 = svadd_m (p0, z0, x0)) + +/* +** add_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s8_m_untied, svint8_t, int8_t, + z0 = svadd_n_s8_m (p0, z1, x0), + z0 = svadd_m (p0, z1, x0)) + +/* +** add_1_s8_m_tied1: +** mov (z[0-9]+\.b), #1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s8_m_tied1, svint8_t, + z0 = svadd_n_s8_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s8_m_untied, svint8_t, + z0 = svadd_n_s8_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** 
add_m1_s8_m: +** mov (z[0-9]+\.b), #-1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m1_s8_m, svint8_t, + z0 = svadd_n_s8_m (p0, z0, -1), + z0 = svadd_m (p0, z0, -1)) + +/* +** add_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (add_s8_z_tied1, svint8_t, + z0 = svadd_s8_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (add_s8_z_tied2, svint8_t, + z0 = svadd_s8_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** add z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** add z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (add_s8_z_untied, svint8_t, + z0 = svadd_s8_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svadd_n_s8_z (p0, z0, x0), + z0 = svadd_z (p0, z0, x0)) + +/* +** add_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** add z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** add z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s8_z_untied, svint8_t, int8_t, + z0 = svadd_n_s8_z (p0, z1, x0), + z0 = svadd_z (p0, z1, x0)) + +/* +** add_1_s8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s8_z_tied1, svint8_t, + z0 = svadd_n_s8_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_s8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** add z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** add z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (add_1_s8_z_untied, svint8_t, + z0 = svadd_n_s8_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_s8_x_tied1: +** add z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (add_s8_x_tied1, svint8_t, + z0 = svadd_s8_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_s8_x_tied2: +** add z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (add_s8_x_tied2, svint8_t, + z0 = svadd_s8_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_s8_x_untied: +** add z0\.b, (z1\.b, z2\.b|z2\.b, z1\.b) +** ret +*/ +TEST_UNIFORM_Z (add_s8_x_untied, svint8_t, + z0 = svadd_s8_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** add z0\.b, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svadd_n_s8_x (p0, z0, x0), + z0 = svadd_x (p0, z0, x0)) + +/* +** add_w0_s8_x_untied: +** mov (z[0-9]+\.b), w0 +** add z0\.b, (z1\.b, \1|\1, z1\.b) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_s8_x_untied, svint8_t, int8_t, + z0 = svadd_n_s8_x (p0, z1, x0), + z0 = svadd_x (p0, z1, x0)) + +/* +** add_1_s8_x_tied1: +** add z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s8_x_tied1, svint8_t, + z0 = svadd_n_s8_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_s8_x_untied: +** movprfx z0, z1 +** add z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_s8_x_untied, svint8_t, + z0 = svadd_n_s8_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_127_s8_x: +** add z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (add_127_s8_x, svint8_t, + z0 = svadd_n_s8_x (p0, z0, 127), + z0 = svadd_x (p0, z0, 127)) + +/* 
+** add_128_s8_x: +** add z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (add_128_s8_x, svint8_t, + z0 = svadd_n_s8_x (p0, z0, 128), + z0 = svadd_x (p0, z0, 128)) + +/* +** add_255_s8_x: +** add z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (add_255_s8_x, svint8_t, + z0 = svadd_n_s8_x (p0, z0, 255), + z0 = svadd_x (p0, z0, 255)) + +/* +** add_m1_s8_x: +** add z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (add_m1_s8_x, svint8_t, + z0 = svadd_n_s8_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m127_s8_x: +** add z0\.b, z0\.b, #129 +** ret +*/ +TEST_UNIFORM_Z (add_m127_s8_x, svint8_t, + z0 = svadd_n_s8_x (p0, z0, -127), + z0 = svadd_x (p0, z0, -127)) + +/* +** add_m128_s8_x: +** add z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (add_m128_s8_x, svint8_t, + z0 = svadd_n_s8_x (p0, z0, -128), + z0 = svadd_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u16.c new file mode 100644 index 000000000..25cb90353 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u16.c @@ -0,0 +1,377 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_u16_m_tied1: +** add z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_u16_m_tied1, svuint16_t, + z0 = svadd_u16_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (add_u16_m_tied2, svuint16_t, + z0 = svadd_u16_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_u16_m_untied: +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (add_u16_m_untied, svuint16_t, + z0 = svadd_u16_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svadd_n_u16_m (p0, z0, x0), + z0 = svadd_m (p0, z0, x0)) + +/* +** add_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svadd_n_u16_m (p0, z1, x0), + z0 = svadd_m (p0, z1, x0)) + +/* +** add_1_u16_m_tied1: +** mov (z[0-9]+\.h), #1 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u16_m_tied1, svuint16_t, + z0 = svadd_n_u16_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u16_m_untied, svuint16_t, + z0 = svadd_n_u16_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_m2_u16_m: +** mov (z[0-9]+\.h), #-2 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_u16_m, svuint16_t, + z0 = svadd_n_u16_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_u16_z_tied1, svuint16_t, + z0 = svadd_u16_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (add_u16_z_tied2, svuint16_t, + z0 = svadd_u16_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** add z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** add z0\.h, p0/m, z0\.h, z1\.h +** ) 
+** ret +*/ +TEST_UNIFORM_Z (add_u16_z_untied, svuint16_t, + z0 = svadd_u16_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svadd_n_u16_z (p0, z0, x0), + z0 = svadd_z (p0, z0, x0)) + +/* +** add_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** add z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** add z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svadd_n_u16_z (p0, z1, x0), + z0 = svadd_z (p0, z1, x0)) + +/* +** add_1_u16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u16_z_tied1, svuint16_t, + z0 = svadd_n_u16_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_u16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** add z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** add z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (add_1_u16_z_untied, svuint16_t, + z0 = svadd_n_u16_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_u16_x_tied1: +** add z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_u16_x_tied1, svuint16_t, + z0 = svadd_u16_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_u16_x_tied2: +** add z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_u16_x_tied2, svuint16_t, + z0 = svadd_u16_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_u16_x_untied: +** add z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (add_u16_x_untied, svuint16_t, + z0 = svadd_u16_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** add z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svadd_n_u16_x (p0, z0, x0), + z0 = svadd_x (p0, z0, x0)) + +/* +** add_w0_u16_x_untied: +** mov (z[0-9]+\.h), w0 +** add z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svadd_n_u16_x (p0, z1, x0), + z0 = svadd_x (p0, z1, x0)) + +/* +** add_1_u16_x_tied1: +** add z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u16_x_tied1, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_u16_x_untied: +** movprfx z0, z1 +** add z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u16_x_untied, svuint16_t, + z0 = svadd_n_u16_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_127_u16_x: +** add z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (add_127_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, 127), + z0 = svadd_x (p0, z0, 127)) + +/* +** add_128_u16_x: +** add z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (add_128_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, 128), + z0 = svadd_x (p0, z0, 128)) + +/* +** add_255_u16_x: +** add z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (add_255_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, 255), + z0 = svadd_x (p0, z0, 255)) + +/* +** add_256_u16_x: +** add z0\.h, z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (add_256_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, 256), + z0 = svadd_x (p0, z0, 256)) + +/* +** add_257_u16_x: +** mov (z[0-9]+)\.b, #1 +** add z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_257_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, 257), + 
z0 = svadd_x (p0, z0, 257)) + +/* +** add_512_u16_x: +** add z0\.h, z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (add_512_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, 512), + z0 = svadd_x (p0, z0, 512)) + +/* +** add_65280_u16_x: +** add z0\.h, z0\.h, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_65280_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, 0xff00), + z0 = svadd_x (p0, z0, 0xff00)) + +/* +** add_m1_u16_x: +** sub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (add_m1_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m127_u16_x: +** sub z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (add_m127_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, -127), + z0 = svadd_x (p0, z0, -127)) + +/* +** add_m128_u16_x: +** sub z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (add_m128_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, -128), + z0 = svadd_x (p0, z0, -128)) + +/* +** add_m255_u16_x: +** sub z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (add_m255_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, -255), + z0 = svadd_x (p0, z0, -255)) + +/* +** add_m256_u16_x: +** add z0\.h, z0\.h, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_m256_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, -256), + z0 = svadd_x (p0, z0, -256)) + +/* +** add_m257_u16_x: +** mov (z[0-9]+\.h), #-257 +** add z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (add_m257_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, -257), + z0 = svadd_x (p0, z0, -257)) + +/* +** add_m512_u16_x: +** add z0\.h, z0\.h, #65024 +** ret +*/ +TEST_UNIFORM_Z (add_m512_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, -512), + z0 = svadd_x (p0, z0, -512)) + +/* +** add_m32768_u16_x: +** add z0\.h, z0\.h, #32768 +** ret +*/ +TEST_UNIFORM_Z (add_m32768_u16_x, svuint16_t, + z0 = svadd_n_u16_x (p0, z0, -0x8000), + z0 = svadd_x (p0, z0, -0x8000)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u32.c new file mode 100644 index 000000000..ee979489b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u32.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_u32_m_tied1: +** add z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_u32_m_tied1, svuint32_t, + z0 = svadd_u32_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (add_u32_m_tied2, svuint32_t, + z0 = svadd_u32_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_u32_m_untied: +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (add_u32_m_untied, svuint32_t, + z0 = svadd_u32_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svadd_n_u32_m (p0, z0, x0), + z0 = svadd_m (p0, z0, x0)) + +/* +** add_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svadd_n_u32_m (p0, z1, x0), + z0 = svadd_m (p0, z1, x0)) + +/* +** add_1_u32_m_tied1: +** mov (z[0-9]+\.s), #1 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u32_m_tied1, svuint32_t, + z0 = svadd_n_u32_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_u32_m_untied: { xfail *-*-* } 
+** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u32_m_untied, svuint32_t, + z0 = svadd_n_u32_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_m2_u32_m: +** mov (z[0-9]+\.s), #-2 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_u32_m, svuint32_t, + z0 = svadd_n_u32_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_u32_z_tied1, svuint32_t, + z0 = svadd_u32_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (add_u32_z_tied2, svuint32_t, + z0 = svadd_u32_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** add z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** add z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (add_u32_z_untied, svuint32_t, + z0 = svadd_u32_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svadd_n_u32_z (p0, z0, x0), + z0 = svadd_z (p0, z0, x0)) + +/* +** add_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** add z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** add z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svadd_n_u32_z (p0, z1, x0), + z0 = svadd_z (p0, z1, x0)) + +/* +** add_1_u32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u32_z_tied1, svuint32_t, + z0 = svadd_n_u32_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_u32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** add z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** add z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (add_1_u32_z_untied, svuint32_t, + z0 = svadd_n_u32_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_u32_x_tied1: +** add z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_u32_x_tied1, svuint32_t, + z0 = svadd_u32_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_u32_x_tied2: +** add z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_u32_x_tied2, svuint32_t, + z0 = svadd_u32_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_u32_x_untied: +** add z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (add_u32_x_untied, svuint32_t, + z0 = svadd_u32_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svadd_n_u32_x (p0, z0, x0), + z0 = svadd_x (p0, z0, x0)) + +/* +** add_w0_u32_x_untied: +** mov (z[0-9]+\.s), w0 +** add z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svadd_n_u32_x (p0, z1, x0), + z0 = svadd_x (p0, z1, x0)) + +/* +** add_1_u32_x_tied1: +** add z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u32_x_tied1, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_u32_x_untied: +** movprfx z0, z1 +** add z0\.s, z0\.s, #1 +** ret 
+*/ +TEST_UNIFORM_Z (add_1_u32_x_untied, svuint32_t, + z0 = svadd_n_u32_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_127_u32_x: +** add z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (add_127_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 127), + z0 = svadd_x (p0, z0, 127)) + +/* +** add_128_u32_x: +** add z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (add_128_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 128), + z0 = svadd_x (p0, z0, 128)) + +/* +** add_255_u32_x: +** add z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (add_255_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 255), + z0 = svadd_x (p0, z0, 255)) + +/* +** add_256_u32_x: +** add z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (add_256_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 256), + z0 = svadd_x (p0, z0, 256)) + +/* +** add_511_u32_x: +** mov (z[0-9]+\.s), #511 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_511_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 511), + z0 = svadd_x (p0, z0, 511)) + +/* +** add_512_u32_x: +** add z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (add_512_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 512), + z0 = svadd_x (p0, z0, 512)) + +/* +** add_65280_u32_x: +** add z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_65280_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 0xff00), + z0 = svadd_x (p0, z0, 0xff00)) + +/* +** add_65535_u32_x: +** mov (z[0-9]+\.s), #65535 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_65535_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 65535), + z0 = svadd_x (p0, z0, 65535)) + +/* +** add_65536_u32_x: +** mov (z[0-9]+\.s), #65536 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_65536_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, 65536), + z0 = svadd_x (p0, z0, 65536)) + +/* +** add_m1_u32_x: +** sub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (add_m1_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m127_u32_x: +** sub z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (add_m127_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -127), + z0 = svadd_x (p0, z0, -127)) + +/* +** add_m128_u32_x: +** sub z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (add_m128_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -128), + z0 = svadd_x (p0, z0, -128)) + +/* +** add_m255_u32_x: +** sub z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (add_m255_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -255), + z0 = svadd_x (p0, z0, -255)) + +/* +** add_m256_u32_x: +** sub z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (add_m256_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -256), + z0 = svadd_x (p0, z0, -256)) + +/* +** add_m511_u32_x: +** mov (z[0-9]+\.s), #-511 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_m511_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -511), + z0 = svadd_x (p0, z0, -511)) + +/* +** add_m512_u32_x: +** sub z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (add_m512_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -512), + z0 = svadd_x (p0, z0, -512)) + +/* +** add_m32768_u32_x: +** sub z0\.s, z0\.s, #32768 +** ret +*/ +TEST_UNIFORM_Z (add_m32768_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -0x8000), + z0 = svadd_x (p0, z0, -0x8000)) + +/* +** add_m65280_u32_x: +** sub z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_m65280_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -0xff00), + z0 = svadd_x (p0, z0, -0xff00)) + +/* +** add_m65535_u32_x: +** mov (z[0-9]+\.s), #-65535 +** add z0\.s, 
(z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_m65535_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -65535), + z0 = svadd_x (p0, z0, -65535)) + +/* +** add_m65536_u32_x: +** mov (z[0-9]+\.s), #-65536 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (add_m65536_u32_x, svuint32_t, + z0 = svadd_n_u32_x (p0, z0, -65536), + z0 = svadd_x (p0, z0, -65536)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u64.c new file mode 100644 index 000000000..25d2972a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u64.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_u64_m_tied1: +** add z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_u64_m_tied1, svuint64_t, + z0 = svadd_u64_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_u64_m_tied2, svuint64_t, + z0 = svadd_u64_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_u64_m_untied: +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (add_u64_m_untied, svuint64_t, + z0 = svadd_u64_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svadd_n_u64_m (p0, z0, x0), + z0 = svadd_m (p0, z0, x0)) + +/* +** add_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svadd_n_u64_m (p0, z1, x0), + z0 = svadd_m (p0, z1, x0)) + +/* +** add_1_u64_m_tied1: +** mov (z[0-9]+\.d), #1 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u64_m_tied1, svuint64_t, + z0 = svadd_n_u64_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u64_m_untied, svuint64_t, + z0 = svadd_n_u64_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_m2_u64_m: +** mov (z[0-9]+\.d), #-2 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m2_u64_m, svuint64_t, + z0 = svadd_n_u64_m (p0, z0, -2), + z0 = svadd_m (p0, z0, -2)) + +/* +** add_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_u64_z_tied1, svuint64_t, + z0 = svadd_u64_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (add_u64_z_tied2, svuint64_t, + z0 = svadd_u64_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** add z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** add z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (add_u64_z_untied, svuint64_t, + z0 = svadd_u64_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svadd_n_u64_z (p0, z0, x0), + z0 = svadd_z (p0, z0, x0)) + +/* +** add_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** add z0\.d, 
p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** add z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (add_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svadd_n_u64_z (p0, z1, x0), + z0 = svadd_z (p0, z1, x0)) + +/* +** add_1_u64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u64_z_tied1, svuint64_t, + z0 = svadd_n_u64_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_u64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** add z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** add z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (add_1_u64_z_untied, svuint64_t, + z0 = svadd_n_u64_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_u64_x_tied1: +** add z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_u64_x_tied1, svuint64_t, + z0 = svadd_u64_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_u64_x_tied2: +** add z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_u64_x_tied2, svuint64_t, + z0 = svadd_u64_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_u64_x_untied: +** add z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (add_u64_x_untied, svuint64_t, + z0 = svadd_u64_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (add_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svadd_n_u64_x (p0, z0, x0), + z0 = svadd_x (p0, z0, x0)) + +/* +** add_x0_u64_x_untied: +** mov (z[0-9]+\.d), x0 +** add z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (add_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svadd_n_u64_x (p0, z1, x0), + z0 = svadd_x (p0, z1, x0)) + +/* +** add_1_u64_x_tied1: +** add z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u64_x_tied1, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_u64_x_untied: +** movprfx z0, z1 +** add z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u64_x_untied, svuint64_t, + z0 = svadd_n_u64_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_127_u64_x: +** add z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (add_127_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 127), + z0 = svadd_x (p0, z0, 127)) + +/* +** add_128_u64_x: +** add z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (add_128_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 128), + z0 = svadd_x (p0, z0, 128)) + +/* +** add_255_u64_x: +** add z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (add_255_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 255), + z0 = svadd_x (p0, z0, 255)) + +/* +** add_256_u64_x: +** add z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (add_256_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 256), + z0 = svadd_x (p0, z0, 256)) + +/* +** add_511_u64_x: +** mov (z[0-9]+\.d), #511 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_511_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 511), + z0 = svadd_x (p0, z0, 511)) + +/* +** add_512_u64_x: +** add z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (add_512_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 512), + z0 = svadd_x (p0, z0, 512)) + +/* +** add_65280_u64_x: +** add z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_65280_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 0xff00), + z0 = svadd_x (p0, z0, 0xff00)) + +/* +** add_65535_u64_x: +** mov (z[0-9]+\.d), #65535 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** 
ret +*/ +TEST_UNIFORM_Z (add_65535_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 65535), + z0 = svadd_x (p0, z0, 65535)) + +/* +** add_65536_u64_x: +** mov (z[0-9]+\.d), #65536 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_65536_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, 65536), + z0 = svadd_x (p0, z0, 65536)) + +/* +** add_m1_u64_x: +** sub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (add_m1_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m127_u64_x: +** sub z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (add_m127_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -127), + z0 = svadd_x (p0, z0, -127)) + +/* +** add_m128_u64_x: +** sub z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (add_m128_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -128), + z0 = svadd_x (p0, z0, -128)) + +/* +** add_m255_u64_x: +** sub z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (add_m255_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -255), + z0 = svadd_x (p0, z0, -255)) + +/* +** add_m256_u64_x: +** sub z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (add_m256_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -256), + z0 = svadd_x (p0, z0, -256)) + +/* +** add_m511_u64_x: +** mov (z[0-9]+\.d), #-511 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_m511_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -511), + z0 = svadd_x (p0, z0, -511)) + +/* +** add_m512_u64_x: +** sub z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (add_m512_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -512), + z0 = svadd_x (p0, z0, -512)) + +/* +** add_m32768_u64_x: +** sub z0\.d, z0\.d, #32768 +** ret +*/ +TEST_UNIFORM_Z (add_m32768_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -0x8000), + z0 = svadd_x (p0, z0, -0x8000)) + +/* +** add_m65280_u64_x: +** sub z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (add_m65280_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -0xff00), + z0 = svadd_x (p0, z0, -0xff00)) + +/* +** add_m65535_u64_x: +** mov (z[0-9]+\.d), #-65535 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_m65535_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -65535), + z0 = svadd_x (p0, z0, -65535)) + +/* +** add_m65536_u64_x: +** mov (z[0-9]+\.d), #-65536 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (add_m65536_u64_x, svuint64_t, + z0 = svadd_n_u64_x (p0, z0, -65536), + z0 = svadd_x (p0, z0, -65536)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u8.c new file mode 100644 index 000000000..06b68c97c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/add_u8.c @@ -0,0 +1,294 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** add_u8_m_tied1: +** add z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (add_u8_m_tied1, svuint8_t, + z0 = svadd_u8_m (p0, z0, z1), + z0 = svadd_m (p0, z0, z1)) + +/* +** add_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (add_u8_m_tied2, svuint8_t, + z0 = svadd_u8_m (p0, z1, z0), + z0 = svadd_m (p0, z1, z0)) + +/* +** add_u8_m_untied: +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (add_u8_m_untied, svuint8_t, + z0 = svadd_u8_m (p0, z1, z2), + z0 = svadd_m (p0, z1, z2)) + +/* +** add_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u8_m_tied1, 
svuint8_t, uint8_t, + z0 = svadd_n_u8_m (p0, z0, x0), + z0 = svadd_m (p0, z0, x0)) + +/* +** add_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svadd_n_u8_m (p0, z1, x0), + z0 = svadd_m (p0, z1, x0)) + +/* +** add_1_u8_m_tied1: +** mov (z[0-9]+\.b), #1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u8_m_tied1, svuint8_t, + z0 = svadd_n_u8_m (p0, z0, 1), + z0 = svadd_m (p0, z0, 1)) + +/* +** add_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u8_m_untied, svuint8_t, + z0 = svadd_n_u8_m (p0, z1, 1), + z0 = svadd_m (p0, z1, 1)) + +/* +** add_m1_u8_m: +** mov (z[0-9]+\.b), #-1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (add_m1_u8_m, svuint8_t, + z0 = svadd_n_u8_m (p0, z0, -1), + z0 = svadd_m (p0, z0, -1)) + +/* +** add_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (add_u8_z_tied1, svuint8_t, + z0 = svadd_u8_z (p0, z0, z1), + z0 = svadd_z (p0, z0, z1)) + +/* +** add_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (add_u8_z_tied2, svuint8_t, + z0 = svadd_u8_z (p0, z1, z0), + z0 = svadd_z (p0, z1, z0)) + +/* +** add_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** add z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** add z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (add_u8_z_untied, svuint8_t, + z0 = svadd_u8_z (p0, z1, z2), + z0 = svadd_z (p0, z1, z2)) + +/* +** add_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svadd_n_u8_z (p0, z0, x0), + z0 = svadd_z (p0, z0, x0)) + +/* +** add_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** add z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** add z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svadd_n_u8_z (p0, z1, x0), + z0 = svadd_z (p0, z1, x0)) + +/* +** add_1_u8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u8_z_tied1, svuint8_t, + z0 = svadd_n_u8_z (p0, z0, 1), + z0 = svadd_z (p0, z0, 1)) + +/* +** add_1_u8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** add z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** add z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (add_1_u8_z_untied, svuint8_t, + z0 = svadd_n_u8_z (p0, z1, 1), + z0 = svadd_z (p0, z1, 1)) + +/* +** add_u8_x_tied1: +** add z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (add_u8_x_tied1, svuint8_t, + z0 = svadd_u8_x (p0, z0, z1), + z0 = svadd_x (p0, z0, z1)) + +/* +** add_u8_x_tied2: +** add z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (add_u8_x_tied2, svuint8_t, + z0 = svadd_u8_x (p0, z1, z0), + z0 = svadd_x (p0, z1, z0)) + +/* +** add_u8_x_untied: +** add z0\.b, (z1\.b, z2\.b|z2\.b, z1\.b) +** ret +*/ +TEST_UNIFORM_Z (add_u8_x_untied, svuint8_t, + z0 = svadd_u8_x (p0, z1, z2), + z0 = svadd_x (p0, z1, z2)) + +/* +** add_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** add z0\.b, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svadd_n_u8_x 
(p0, z0, x0), + z0 = svadd_x (p0, z0, x0)) + +/* +** add_w0_u8_x_untied: +** mov (z[0-9]+\.b), w0 +** add z0\.b, (z1\.b, \1|\1, z1\.b) +** ret +*/ +TEST_UNIFORM_ZX (add_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svadd_n_u8_x (p0, z1, x0), + z0 = svadd_x (p0, z1, x0)) + +/* +** add_1_u8_x_tied1: +** add z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u8_x_tied1, svuint8_t, + z0 = svadd_n_u8_x (p0, z0, 1), + z0 = svadd_x (p0, z0, 1)) + +/* +** add_1_u8_x_untied: +** movprfx z0, z1 +** add z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (add_1_u8_x_untied, svuint8_t, + z0 = svadd_n_u8_x (p0, z1, 1), + z0 = svadd_x (p0, z1, 1)) + +/* +** add_127_u8_x: +** add z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (add_127_u8_x, svuint8_t, + z0 = svadd_n_u8_x (p0, z0, 127), + z0 = svadd_x (p0, z0, 127)) + +/* +** add_128_u8_x: +** add z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (add_128_u8_x, svuint8_t, + z0 = svadd_n_u8_x (p0, z0, 128), + z0 = svadd_x (p0, z0, 128)) + +/* +** add_255_u8_x: +** add z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (add_255_u8_x, svuint8_t, + z0 = svadd_n_u8_x (p0, z0, 255), + z0 = svadd_x (p0, z0, 255)) + +/* +** add_m1_u8_x: +** add z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (add_m1_u8_x, svuint8_t, + z0 = svadd_n_u8_x (p0, z0, -1), + z0 = svadd_x (p0, z0, -1)) + +/* +** add_m127_u8_x: +** add z0\.b, z0\.b, #129 +** ret +*/ +TEST_UNIFORM_Z (add_m127_u8_x, svuint8_t, + z0 = svadd_n_u8_x (p0, z0, -127), + z0 = svadd_x (p0, z0, -127)) + +/* +** add_m128_u8_x: +** add z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (add_m128_u8_x, svuint8_t, + z0 = svadd_n_u8_x (p0, z0, -128), + z0 = svadd_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f16.c new file mode 100644 index 000000000..6c6bfa1c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f16.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** adda_d0_f16: +** fadda h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (adda_d0_f16, float16_t, svfloat16_t, + d0 = svadda_f16 (p0, d0, z2), + d0 = svadda (p0, d0, z2)) + +/* +** adda_d1_f16: +** mov v0\.h\[0\], v1\.h\[0\] +** fadda h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (adda_d1_f16, float16_t, svfloat16_t, + d0 = svadda_f16 (p0, d1, z2), + d0 = svadda (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f32.c new file mode 100644 index 000000000..8b2a1dd1c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f32.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** adda_d0_f32: +** fadda s0, p0, s0, z2\.s +** ret +*/ +TEST_FOLD_LEFT_D (adda_d0_f32, float32_t, svfloat32_t, + d0 = svadda_f32 (p0, d0, z2), + d0 = svadda (p0, d0, z2)) + +/* +** adda_d1_f32: +** fmov s0, s1 +** fadda s0, p0, s0, z2\.s +** ret +*/ +TEST_FOLD_LEFT_D (adda_d1_f32, float32_t, svfloat32_t, + d0 = svadda_f32 (p0, d1, z2), + d0 = svadda (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f64.c new file mode 100644 index 000000000..90a56420a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adda_f64.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** adda_d0_f64: +** 
fadda d0, p0, d0, z2\.d +** ret +*/ +TEST_FOLD_LEFT_D (adda_d0_f64, float64_t, svfloat64_t, + d0 = svadda_f64 (p0, d0, z2), + d0 = svadda (p0, d0, z2)) + +/* +** adda_d1_f64: +** fmov d0, d1 +** fadda d0, p0, d0, z2\.d +** ret +*/ +TEST_FOLD_LEFT_D (adda_d1_f64, float64_t, svfloat64_t, + d0 = svadda_f64 (p0, d1, z2), + d0 = svadda (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f16.c new file mode 100644 index 000000000..7bb0c1de4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_d0_f16_tied: +** faddv h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (addv_d0_f16_tied, float16_t, svfloat16_t, + d0 = svaddv_f16 (p0, z0), + d0 = svaddv (p0, z0)) + +/* +** addv_d0_f16_untied: +** faddv h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (addv_d0_f16_untied, float16_t, svfloat16_t, + d0 = svaddv_f16 (p0, z1), + d0 = svaddv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f32.c new file mode 100644 index 000000000..51c621910 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_d0_f32_tied: +** faddv s0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_D (addv_d0_f32_tied, float32_t, svfloat32_t, + d0 = svaddv_f32 (p0, z0), + d0 = svaddv (p0, z0)) + +/* +** addv_d0_f32_untied: +** faddv s0, p0, z1\.s +** ret +*/ +TEST_REDUCTION_D (addv_d0_f32_untied, float32_t, svfloat32_t, + d0 = svaddv_f32 (p0, z1), + d0 = svaddv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f64.c new file mode 100644 index 000000000..882866210 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_d0_f64_tied: +** faddv d0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_D (addv_d0_f64_tied, float64_t, svfloat64_t, + d0 = svaddv_f64 (p0, z0), + d0 = svaddv (p0, z0)) + +/* +** addv_d0_f64_untied: +** faddv d0, p0, z1\.d +** ret +*/ +TEST_REDUCTION_D (addv_d0_f64_untied, float64_t, svfloat64_t, + d0 = svaddv_f64 (p0, z1), + d0 = svaddv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s16.c new file mode 100644 index 000000000..05429a47e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_x0_s16: +** saddv (d[0-9]+), p0, z0\.h +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (addv_x0_s16, int64_t, svint16_t, + x0 = svaddv_s16 (p0, z0), + x0 = svaddv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s32.c new file mode 100644 index 000000000..5f7789a9a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_x0_s32: +** saddv (d[0-9]+), p0, z0\.s +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X 
(addv_x0_s32, int64_t, svint32_t, + x0 = svaddv_s32 (p0, z0), + x0 = svaddv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s64.c new file mode 100644 index 000000000..76c480091 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_x0_s64: +** uaddv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (addv_x0_s64, int64_t, svint64_t, + x0 = svaddv_s64 (p0, z0), + x0 = svaddv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s8.c new file mode 100644 index 000000000..8ccb2bf4f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_s8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_x0_s8: +** saddv (d[0-9]+), p0, z0\.b +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (addv_x0_s8, int64_t, svint8_t, + x0 = svaddv_s8 (p0, z0), + x0 = svaddv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u16.c new file mode 100644 index 000000000..6371921fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_x0_u16: +** uaddv (d[0-9]+), p0, z0\.h +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (addv_x0_u16, uint64_t, svuint16_t, + x0 = svaddv_u16 (p0, z0), + x0 = svaddv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u32.c new file mode 100644 index 000000000..bdd0ed1f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_x0_u32: +** uaddv (d[0-9]+), p0, z0\.s +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (addv_x0_u32, uint64_t, svuint32_t, + x0 = svaddv_u32 (p0, z0), + x0 = svaddv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u64.c new file mode 100644 index 000000000..7b1995d3f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_x0_u64: +** uaddv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (addv_x0_u64, uint64_t, svuint64_t, + x0 = svaddv_u64 (p0, z0), + x0 = svaddv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u8.c new file mode 100644 index 000000000..0e972f093 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/addv_u8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** addv_x0_u8: +** uaddv (d[0-9]+), p0, z0\.b +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (addv_x0_u8, uint64_t, svuint8_t, + x0 = svaddv_u8 (p0, z0), + x0 = svaddv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrb.c new file mode 100644 
index 000000000..a61eec971 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrb.c @@ -0,0 +1,57 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** adrb_u32base_s32offset: +** adr z0\.s, \[z0\.s, z1\.s\] +** ret +*/ +TEST_ADR (adrb_u32base_s32offset, svuint32_t, svint32_t, + z0 = svadrb_u32base_s32offset (z0, z1), + z0 = svadrb_offset (z0, z1)) + +/* +** adrb_u32base_u32offset: +** adr z0\.s, \[z0\.s, z1\.s\] +** ret +*/ +TEST_ADR (adrb_u32base_u32offset, svuint32_t, svuint32_t, + z0 = svadrb_u32base_u32offset (z0, z1), + z0 = svadrb_offset (z0, z1)) + +/* +** adrb_u64base_s64offset: +** adr z0\.d, \[z0\.d, z1\.d\] +** ret +*/ +TEST_ADR (adrb_u64base_s64offset, svuint64_t, svint64_t, + z0 = svadrb_u64base_s64offset (z0, z1), + z0 = svadrb_offset (z0, z1)) + +/* +** adrb_ext_u64base_s64offset: +** adr z0\.d, \[z0\.d, z1\.d, sxtw\] +** ret +*/ +TEST_ADR (adrb_ext_u64base_s64offset, svuint64_t, svint64_t, + z0 = svadrb_u64base_s64offset (z0, svextw_s64_x (svptrue_b64 (), z1)), + z0 = svadrb_offset (z0, svextw_x (svptrue_b64 (), z1))) + +/* +** adrb_u64base_u64offset: +** adr z0\.d, \[z0\.d, z1\.d\] +** ret +*/ +TEST_ADR (adrb_u64base_u64offset, svuint64_t, svuint64_t, + z0 = svadrb_u64base_u64offset (z0, z1), + z0 = svadrb_offset (z0, z1)) + +/* +** adrb_ext_u64base_u64offset: +** adr z0\.d, \[z0\.d, z1\.d, uxtw\] +** ret +*/ +TEST_ADR (adrb_ext_u64base_u64offset, svuint64_t, svuint64_t, + z0 = svadrb_u64base_u64offset (z0, svextw_u64_x (svptrue_b64 (), z1)), + z0 = svadrb_offset (z0, svextw_x (svptrue_b64 (), z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrd.c new file mode 100644 index 000000000..970485bd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrd.c @@ -0,0 +1,57 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** adrd_u32base_s32index: +** adr z0\.s, \[z0\.s, z1\.s, lsl 3\] +** ret +*/ +TEST_ADR (adrd_u32base_s32index, svuint32_t, svint32_t, + z0 = svadrd_u32base_s32index (z0, z1), + z0 = svadrd_index (z0, z1)) + +/* +** adrd_u32base_u32index: +** adr z0\.s, \[z0\.s, z1\.s, lsl 3\] +** ret +*/ +TEST_ADR (adrd_u32base_u32index, svuint32_t, svuint32_t, + z0 = svadrd_u32base_u32index (z0, z1), + z0 = svadrd_index (z0, z1)) + +/* +** adrd_u64base_s64index: +** adr z0\.d, \[z0\.d, z1\.d, lsl 3\] +** ret +*/ +TEST_ADR (adrd_u64base_s64index, svuint64_t, svint64_t, + z0 = svadrd_u64base_s64index (z0, z1), + z0 = svadrd_index (z0, z1)) + +/* +** adrd_ext_u64base_s64index: +** adr z0\.d, \[z0\.d, z1\.d, sxtw 3\] +** ret +*/ +TEST_ADR (adrd_ext_u64base_s64index, svuint64_t, svint64_t, + z0 = svadrd_u64base_s64index (z0, svextw_s64_x (svptrue_b64 (), z1)), + z0 = svadrd_index (z0, svextw_x (svptrue_b64 (), z1))) + +/* +** adrd_u64base_u64index: +** adr z0\.d, \[z0\.d, z1\.d, lsl 3\] +** ret +*/ +TEST_ADR (adrd_u64base_u64index, svuint64_t, svuint64_t, + z0 = svadrd_u64base_u64index (z0, z1), + z0 = svadrd_index (z0, z1)) + +/* +** adrd_ext_u64base_u64index: +** adr z0\.d, \[z0\.d, z1\.d, uxtw 3\] +** ret +*/ +TEST_ADR (adrd_ext_u64base_u64index, svuint64_t, svuint64_t, + z0 = svadrd_u64base_u64index (z0, svextw_u64_x (svptrue_b64 (), z1)), + z0 = svadrd_index (z0, svextw_x (svptrue_b64 (), z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrh.c new file mode 100644 index 000000000..d06f51fe3 --- 
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrh.c @@ -0,0 +1,57 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** adrh_u32base_s32index: +** adr z0\.s, \[z0\.s, z1\.s, lsl 1\] +** ret +*/ +TEST_ADR (adrh_u32base_s32index, svuint32_t, svint32_t, + z0 = svadrh_u32base_s32index (z0, z1), + z0 = svadrh_index (z0, z1)) + +/* +** adrh_u32base_u32index: +** adr z0\.s, \[z0\.s, z1\.s, lsl 1\] +** ret +*/ +TEST_ADR (adrh_u32base_u32index, svuint32_t, svuint32_t, + z0 = svadrh_u32base_u32index (z0, z1), + z0 = svadrh_index (z0, z1)) + +/* +** adrh_u64base_s64index: +** adr z0\.d, \[z0\.d, z1\.d, lsl 1\] +** ret +*/ +TEST_ADR (adrh_u64base_s64index, svuint64_t, svint64_t, + z0 = svadrh_u64base_s64index (z0, z1), + z0 = svadrh_index (z0, z1)) + +/* +** adrh_ext_u64base_s64index: +** adr z0\.d, \[z0\.d, z1\.d, sxtw 1\] +** ret +*/ +TEST_ADR (adrh_ext_u64base_s64index, svuint64_t, svint64_t, + z0 = svadrh_u64base_s64index (z0, svextw_s64_x (svptrue_b64 (), z1)), + z0 = svadrh_index (z0, svextw_x (svptrue_b64 (), z1))) + +/* +** adrh_u64base_u64index: +** adr z0\.d, \[z0\.d, z1\.d, lsl 1\] +** ret +*/ +TEST_ADR (adrh_u64base_u64index, svuint64_t, svuint64_t, + z0 = svadrh_u64base_u64index (z0, z1), + z0 = svadrh_index (z0, z1)) + +/* +** adrh_ext_u64base_u64index: +** adr z0\.d, \[z0\.d, z1\.d, uxtw 1\] +** ret +*/ +TEST_ADR (adrh_ext_u64base_u64index, svuint64_t, svuint64_t, + z0 = svadrh_u64base_u64index (z0, svextw_u64_x (svptrue_b64 (), z1)), + z0 = svadrh_index (z0, svextw_x (svptrue_b64 (), z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrw.c new file mode 100644 index 000000000..b23f25a11 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/adrw.c @@ -0,0 +1,57 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** adrw_u32base_s32index: +** adr z0\.s, \[z0\.s, z1\.s, lsl 2\] +** ret +*/ +TEST_ADR (adrw_u32base_s32index, svuint32_t, svint32_t, + z0 = svadrw_u32base_s32index (z0, z1), + z0 = svadrw_index (z0, z1)) + +/* +** adrw_u32base_u32index: +** adr z0\.s, \[z0\.s, z1\.s, lsl 2\] +** ret +*/ +TEST_ADR (adrw_u32base_u32index, svuint32_t, svuint32_t, + z0 = svadrw_u32base_u32index (z0, z1), + z0 = svadrw_index (z0, z1)) + +/* +** adrw_u64base_s64index: +** adr z0\.d, \[z0\.d, z1\.d, lsl 2\] +** ret +*/ +TEST_ADR (adrw_u64base_s64index, svuint64_t, svint64_t, + z0 = svadrw_u64base_s64index (z0, z1), + z0 = svadrw_index (z0, z1)) + +/* +** adrw_ext_u64base_s64index: +** adr z0\.d, \[z0\.d, z1\.d, sxtw 2\] +** ret +*/ +TEST_ADR (adrw_ext_u64base_s64index, svuint64_t, svint64_t, + z0 = svadrw_u64base_s64index (z0, svextw_s64_x (svptrue_b64 (), z1)), + z0 = svadrw_index (z0, svextw_x (svptrue_b64 (), z1))) + +/* +** adrw_u64base_u64index: +** adr z0\.d, \[z0\.d, z1\.d, lsl 2\] +** ret +*/ +TEST_ADR (adrw_u64base_u64index, svuint64_t, svuint64_t, + z0 = svadrw_u64base_u64index (z0, z1), + z0 = svadrw_index (z0, z1)) + +/* +** adrw_ext_u64base_u64index: +** adr z0\.d, \[z0\.d, z1\.d, uxtw 2\] +** ret +*/ +TEST_ADR (adrw_ext_u64base_u64index, svuint64_t, svuint64_t, + z0 = svadrw_u64base_u64index (z0, svextw_u64_x (svptrue_b64 (), z1)), + z0 = svadrw_index (z0, svextw_x (svptrue_b64 (), z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_b.c new file mode 100644 index 000000000..f0c4ff1b1 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_b_z_tied1: +** and p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) +** ret +*/ +TEST_UNIFORM_P (and_b_z_tied1, + p0 = svand_b_z (p3, p0, p1), + p0 = svand_z (p3, p0, p1)) + +/* +** and_b_z_tied2: +** and p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) +** ret +*/ +TEST_UNIFORM_P (and_b_z_tied2, + p0 = svand_b_z (p3, p1, p0), + p0 = svand_z (p3, p1, p0)) + +/* +** and_b_z_untied: +** and p0\.b, p3/z, (p1\.b, p2\.b|p2\.b, p1\.b) +** ret +*/ +TEST_UNIFORM_P (and_b_z_untied, + p0 = svand_b_z (p3, p1, p2), + p0 = svand_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s16.c new file mode 100644 index 000000000..d54613e91 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s16.c @@ -0,0 +1,422 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_s16_m_tied1: +** and z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_s16_m_tied1, svint16_t, + z0 = svand_s16_m (p0, z0, z1), + z0 = svand_m (p0, z0, z1)) + +/* +** and_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (and_s16_m_tied2, svint16_t, + z0 = svand_s16_m (p0, z1, z0), + z0 = svand_m (p0, z1, z0)) + +/* +** and_s16_m_untied: +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (and_s16_m_untied, svint16_t, + z0 = svand_s16_m (p0, z1, z2), + z0 = svand_m (p0, z1, z2)) + +/* +** and_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svand_n_s16_m (p0, z0, x0), + z0 = svand_m (p0, z0, x0)) + +/* +** and_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s16_m_untied, svint16_t, int16_t, + z0 = svand_n_s16_m (p0, z1, x0), + z0 = svand_m (p0, z1, x0)) + +/* +** and_1_s16_m_tied1: +** mov (z[0-9]+\.h), #1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s16_m_tied1, svint16_t, + z0 = svand_n_s16_m (p0, z0, 1), + z0 = svand_m (p0, z0, 1)) + +/* +** and_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s16_m_untied, svint16_t, + z0 = svand_n_s16_m (p0, z1, 1), + z0 = svand_m (p0, z1, 1)) + +/* +** and_m2_s16_m: +** mov (z[0-9]+\.h), #-2 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (and_m2_s16_m, svint16_t, + z0 = svand_n_s16_m (p0, z0, -2), + z0 = svand_m (p0, z0, -2)) + +/* +** and_255_s16_m_tied1: +** uxtb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (and_255_s16_m_tied1, svint16_t, + z0 = svand_n_s16_m (p0, z0, 255), + z0 = svand_m (p0, z0, 255)) + +/* +** and_255_s16_m_untied: +** movprfx z0, z1 +** uxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_255_s16_m_untied, svint16_t, + z0 = svand_n_s16_m (p0, z1, 255), + z0 = svand_m (p0, z1, 255)) + +/* +** and_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_s16_z_tied1, svint16_t, + z0 = svand_s16_z (p0, z0, z1), + z0 = svand_z (p0, z0, z1)) + +/* +** and_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_s16_z_tied2, 
svint16_t, + z0 = svand_s16_z (p0, z1, z0), + z0 = svand_z (p0, z1, z0)) + +/* +** and_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** and z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** and z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (and_s16_z_untied, svint16_t, + z0 = svand_s16_z (p0, z1, z2), + z0 = svand_z (p0, z1, z2)) + +/* +** and_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svand_n_s16_z (p0, z0, x0), + z0 = svand_z (p0, z0, x0)) + +/* +** and_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** and z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** and z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s16_z_untied, svint16_t, int16_t, + z0 = svand_n_s16_z (p0, z1, x0), + z0 = svand_z (p0, z1, x0)) + +/* +** and_1_s16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s16_z_tied1, svint16_t, + z0 = svand_n_s16_z (p0, z0, 1), + z0 = svand_z (p0, z0, 1)) + +/* +** and_1_s16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** and z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** and z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (and_1_s16_z_untied, svint16_t, + z0 = svand_n_s16_z (p0, z1, 1), + z0 = svand_z (p0, z1, 1)) + +/* +** and_255_s16_z_tied1: +** ( +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** uxtb z0\.h, p0/m, \1\.h +** | +** mov (z[0-9]+\.h), #255 +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_255_s16_z_tied1, svint16_t, + z0 = svand_n_s16_z (p0, z0, 255), + z0 = svand_z (p0, z0, 255)) + +/* +** and_255_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** uxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_255_s16_z_untied, svint16_t, + z0 = svand_n_s16_z (p0, z1, 255), + z0 = svand_z (p0, z1, 255)) + +/* +** and_s16_x_tied1: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s16_x_tied1, svint16_t, + z0 = svand_s16_x (p0, z0, z1), + z0 = svand_x (p0, z0, z1)) + +/* +** and_s16_x_tied2: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s16_x_tied2, svint16_t, + z0 = svand_s16_x (p0, z1, z0), + z0 = svand_x (p0, z1, z0)) + +/* +** and_s16_x_untied: +** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s16_x_untied, svint16_t, + z0 = svand_s16_x (p0, z1, z2), + z0 = svand_x (p0, z1, z2)) + +/* +** and_w0_s16_x_tied1: +** mov (z[0-9]+)\.h, w0 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svand_n_s16_x (p0, z0, x0), + z0 = svand_x (p0, z0, x0)) + +/* +** and_w0_s16_x_untied: +** mov (z[0-9]+)\.h, w0 +** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s16_x_untied, svint16_t, int16_t, + z0 = svand_n_s16_x (p0, z1, x0), + z0 = svand_x (p0, z1, x0)) + +/* +** and_1_s16_x_tied1: +** and z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s16_x_tied1, svint16_t, + z0 = svand_n_s16_x (p0, z0, 1), + z0 = svand_x (p0, z0, 1)) + +/* +** and_1_s16_x_untied: +** movprfx z0, z1 +** and z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s16_x_untied, svint16_t, + z0 = svand_n_s16_x (p0, z1, 1), + z0 = svand_x (p0, z1, 1)) + +/* +** and_127_s16_x: +** and z0\.h, z0\.h, #0x7f +** ret +*/ 
+TEST_UNIFORM_Z (and_127_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, 127), + z0 = svand_x (p0, z0, 127)) + +/* +** and_128_s16_x: +** and z0\.h, z0\.h, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_128_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, 128), + z0 = svand_x (p0, z0, 128)) + +/* +** and_255_s16_x: +** and z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (and_255_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, 255), + z0 = svand_x (p0, z0, 255)) + +/* +** and_256_s16_x: +** and z0\.h, z0\.h, #0x100 +** ret +*/ +TEST_UNIFORM_Z (and_256_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, 256), + z0 = svand_x (p0, z0, 256)) + +/* +** and_257_s16_x: +** and z0\.h, z0\.h, #0x101 +** ret +*/ +TEST_UNIFORM_Z (and_257_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, 257), + z0 = svand_x (p0, z0, 257)) + +/* +** and_512_s16_x: +** and z0\.h, z0\.h, #0x200 +** ret +*/ +TEST_UNIFORM_Z (and_512_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, 512), + z0 = svand_x (p0, z0, 512)) + +/* +** and_65280_s16_x: +** and z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (and_65280_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, 0xff00), + z0 = svand_x (p0, z0, 0xff00)) + +/* +** and_m127_s16_x: +** and z0\.h, z0\.h, #0xff81 +** ret +*/ +TEST_UNIFORM_Z (and_m127_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, -127), + z0 = svand_x (p0, z0, -127)) + +/* +** and_m128_s16_x: +** and z0\.h, z0\.h, #0xff80 +** ret +*/ +TEST_UNIFORM_Z (and_m128_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, -128), + z0 = svand_x (p0, z0, -128)) + +/* +** and_m255_s16_x: +** and z0\.h, z0\.h, #0xff01 +** ret +*/ +TEST_UNIFORM_Z (and_m255_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, -255), + z0 = svand_x (p0, z0, -255)) + +/* +** and_m256_s16_x: +** and z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (and_m256_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, -256), + z0 = svand_x (p0, z0, -256)) + +/* +** and_m257_s16_x: +** and z0\.h, z0\.h, #0xfeff +** ret +*/ +TEST_UNIFORM_Z (and_m257_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, -257), + z0 = svand_x (p0, z0, -257)) + +/* +** and_m512_s16_x: +** and z0\.h, z0\.h, #0xfe00 +** ret +*/ +TEST_UNIFORM_Z (and_m512_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, -512), + z0 = svand_x (p0, z0, -512)) + +/* +** and_m32768_s16_x: +** and z0\.h, z0\.h, #0x8000 +** ret +*/ +TEST_UNIFORM_Z (and_m32768_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, -0x8000), + z0 = svand_x (p0, z0, -0x8000)) + +/* +** and_5_s16_x: +** mov (z[0-9]+)\.h, #5 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_5_s16_x, svint16_t, + z0 = svand_n_s16_x (p0, z0, 5), + z0 = svand_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s32.c new file mode 100644 index 000000000..7f4082b32 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s32.c @@ -0,0 +1,464 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_s32_m_tied1: +** and z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_s32_m_tied1, svint32_t, + z0 = svand_s32_m (p0, z0, z1), + z0 = svand_m (p0, z0, z1)) + +/* +** and_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (and_s32_m_tied2, svint32_t, + z0 = svand_s32_m (p0, z1, z0), + z0 = svand_m (p0, z1, z0)) + +/* +** and_s32_m_untied: +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (and_s32_m_untied, 
svint32_t, + z0 = svand_s32_m (p0, z1, z2), + z0 = svand_m (p0, z1, z2)) + +/* +** and_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svand_n_s32_m (p0, z0, x0), + z0 = svand_m (p0, z0, x0)) + +/* +** and_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s32_m_untied, svint32_t, int32_t, + z0 = svand_n_s32_m (p0, z1, x0), + z0 = svand_m (p0, z1, x0)) + +/* +** and_1_s32_m_tied1: +** mov (z[0-9]+\.s), #1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s32_m_tied1, svint32_t, + z0 = svand_n_s32_m (p0, z0, 1), + z0 = svand_m (p0, z0, 1)) + +/* +** and_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s32_m_untied, svint32_t, + z0 = svand_n_s32_m (p0, z1, 1), + z0 = svand_m (p0, z1, 1)) + +/* +** and_m2_s32_m: +** mov (z[0-9]+\.s), #-2 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (and_m2_s32_m, svint32_t, + z0 = svand_n_s32_m (p0, z0, -2), + z0 = svand_m (p0, z0, -2)) + +/* +** and_255_s32_m_tied1: +** uxtb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (and_255_s32_m_tied1, svint32_t, + z0 = svand_n_s32_m (p0, z0, 255), + z0 = svand_m (p0, z0, 255)) + +/* +** and_255_s32_m_untied: +** movprfx z0, z1 +** uxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_255_s32_m_untied, svint32_t, + z0 = svand_n_s32_m (p0, z1, 255), + z0 = svand_m (p0, z1, 255)) + +/* +** and_65535_s32_m_tied1: +** uxth z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (and_65535_s32_m_tied1, svint32_t, + z0 = svand_n_s32_m (p0, z0, 65535), + z0 = svand_m (p0, z0, 65535)) + +/* +** and_65535_s32_m_untied: +** movprfx z0, z1 +** uxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_65535_s32_m_untied, svint32_t, + z0 = svand_n_s32_m (p0, z1, 65535), + z0 = svand_m (p0, z1, 65535)) + +/* +** and_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_s32_z_tied1, svint32_t, + z0 = svand_s32_z (p0, z0, z1), + z0 = svand_z (p0, z0, z1)) + +/* +** and_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_s32_z_tied2, svint32_t, + z0 = svand_s32_z (p0, z1, z0), + z0 = svand_z (p0, z1, z0)) + +/* +** and_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** and z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** and z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (and_s32_z_untied, svint32_t, + z0 = svand_s32_z (p0, z1, z2), + z0 = svand_z (p0, z1, z2)) + +/* +** and_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svand_n_s32_z (p0, z0, x0), + z0 = svand_z (p0, z0, x0)) + +/* +** and_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** and z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** and z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s32_z_untied, svint32_t, int32_t, + z0 = svand_n_s32_z (p0, z1, x0), + z0 = svand_z (p0, z1, x0)) + +/* +** and_1_s32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s32_z_tied1, svint32_t, + z0 = svand_n_s32_z (p0, z0, 1), + z0 = svand_z (p0, z0, 1)) + +/* +** and_1_s32_z_untied: 
+** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** and z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** and z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (and_1_s32_z_untied, svint32_t, + z0 = svand_n_s32_z (p0, z1, 1), + z0 = svand_z (p0, z1, 1)) + +/* +** and_255_s32_z_tied1: +** ( +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** uxtb z0\.s, p0/m, \1\.s +** | +** mov (z[0-9]+\.s), #255 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_255_s32_z_tied1, svint32_t, + z0 = svand_n_s32_z (p0, z0, 255), + z0 = svand_z (p0, z0, 255)) + +/* +** and_255_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** uxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_255_s32_z_untied, svint32_t, + z0 = svand_n_s32_z (p0, z1, 255), + z0 = svand_z (p0, z1, 255)) + +/* +** and_65535_s32_z_tied1: +** ( +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** uxth z0\.s, p0/m, \1\.s +** | +** mov (z[0-9]+\.s), #65535 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_65535_s32_z_tied1, svint32_t, + z0 = svand_n_s32_z (p0, z0, 65535), + z0 = svand_z (p0, z0, 65535)) + +/* +** and_65535_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** uxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_65535_s32_z_untied, svint32_t, + z0 = svand_n_s32_z (p0, z1, 65535), + z0 = svand_z (p0, z1, 65535)) + +/* +** and_s32_x_tied1: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s32_x_tied1, svint32_t, + z0 = svand_s32_x (p0, z0, z1), + z0 = svand_x (p0, z0, z1)) + +/* +** and_s32_x_tied2: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s32_x_tied2, svint32_t, + z0 = svand_s32_x (p0, z1, z0), + z0 = svand_x (p0, z1, z0)) + +/* +** and_s32_x_untied: +** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s32_x_untied, svint32_t, + z0 = svand_s32_x (p0, z1, z2), + z0 = svand_x (p0, z1, z2)) + +/* +** and_w0_s32_x_tied1: +** mov (z[0-9]+)\.s, w0 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svand_n_s32_x (p0, z0, x0), + z0 = svand_x (p0, z0, x0)) + +/* +** and_w0_s32_x_untied: +** mov (z[0-9]+)\.s, w0 +** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s32_x_untied, svint32_t, int32_t, + z0 = svand_n_s32_x (p0, z1, x0), + z0 = svand_x (p0, z1, x0)) + +/* +** and_1_s32_x_tied1: +** and z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s32_x_tied1, svint32_t, + z0 = svand_n_s32_x (p0, z0, 1), + z0 = svand_x (p0, z0, 1)) + +/* +** and_1_s32_x_untied: +** movprfx z0, z1 +** and z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s32_x_untied, svint32_t, + z0 = svand_n_s32_x (p0, z1, 1), + z0 = svand_x (p0, z1, 1)) + +/* +** and_127_s32_x: +** and z0\.s, z0\.s, #0x7f +** ret +*/ +TEST_UNIFORM_Z (and_127_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, 127), + z0 = svand_x (p0, z0, 127)) + +/* +** and_128_s32_x: +** and z0\.s, z0\.s, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_128_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, 128), + z0 = svand_x (p0, z0, 128)) + +/* +** and_255_s32_x: +** and z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (and_255_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, 255), + z0 = svand_x (p0, z0, 255)) + +/* +** and_256_s32_x: +** and z0\.s, z0\.s, #0x100 +** ret +*/ +TEST_UNIFORM_Z (and_256_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, 256), + z0 = svand_x (p0, z0, 
256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (and_257_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, 257), + z0 = svand_x (p0, z0, 257)) + +/* +** and_512_s32_x: +** and z0\.s, z0\.s, #0x200 +** ret +*/ +TEST_UNIFORM_Z (and_512_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, 512), + z0 = svand_x (p0, z0, 512)) + +/* +** and_65280_s32_x: +** and z0\.s, z0\.s, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (and_65280_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, 0xff00), + z0 = svand_x (p0, z0, 0xff00)) + +/* +** and_m127_s32_x: +** and z0\.s, z0\.s, #0xffffff81 +** ret +*/ +TEST_UNIFORM_Z (and_m127_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, -127), + z0 = svand_x (p0, z0, -127)) + +/* +** and_m128_s32_x: +** and z0\.s, z0\.s, #0xffffff80 +** ret +*/ +TEST_UNIFORM_Z (and_m128_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, -128), + z0 = svand_x (p0, z0, -128)) + +/* +** and_m255_s32_x: +** and z0\.s, z0\.s, #0xffffff01 +** ret +*/ +TEST_UNIFORM_Z (and_m255_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, -255), + z0 = svand_x (p0, z0, -255)) + +/* +** and_m256_s32_x: +** and z0\.s, z0\.s, #0xffffff00 +** ret +*/ +TEST_UNIFORM_Z (and_m256_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, -256), + z0 = svand_x (p0, z0, -256)) + +/* +** and_m257_s32_x: +** and z0\.s, z0\.s, #0xfffffeff +** ret +*/ +TEST_UNIFORM_Z (and_m257_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, -257), + z0 = svand_x (p0, z0, -257)) + +/* +** and_m512_s32_x: +** and z0\.s, z0\.s, #0xfffffe00 +** ret +*/ +TEST_UNIFORM_Z (and_m512_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, -512), + z0 = svand_x (p0, z0, -512)) + +/* +** and_m32768_s32_x: +** and z0\.s, z0\.s, #0xffff8000 +** ret +*/ +TEST_UNIFORM_Z (and_m32768_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, -0x8000), + z0 = svand_x (p0, z0, -0x8000)) + +/* +** and_5_s32_x: +** mov (z[0-9]+)\.s, #5 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_5_s32_x, svint32_t, + z0 = svand_n_s32_x (p0, z0, 5), + z0 = svand_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s64.c new file mode 100644 index 000000000..8868258dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s64.c @@ -0,0 +1,510 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_s64_m_tied1: +** and z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_s64_m_tied1, svint64_t, + z0 = svand_s64_m (p0, z0, z1), + z0 = svand_m (p0, z0, z1)) + +/* +** and_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_s64_m_tied2, svint64_t, + z0 = svand_s64_m (p0, z1, z0), + z0 = svand_m (p0, z1, z0)) + +/* +** and_s64_m_untied: +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (and_s64_m_untied, svint64_t, + z0 = svand_s64_m (p0, z1, z2), + z0 = svand_m (p0, z1, z2)) + +/* +** and_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svand_n_s64_m (p0, z0, x0), + z0 = svand_m (p0, z0, x0)) + +/* +** and_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_x0_s64_m_untied, svint64_t, int64_t, + z0 = svand_n_s64_m (p0, z1, x0), + z0 = svand_m (p0, z1, x0)) + +/* +** and_1_s64_m_tied1: +** mov (z[0-9]+\.d), #1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ 
+TEST_UNIFORM_Z (and_1_s64_m_tied1, svint64_t, + z0 = svand_n_s64_m (p0, z0, 1), + z0 = svand_m (p0, z0, 1)) + +/* +** and_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s64_m_untied, svint64_t, + z0 = svand_n_s64_m (p0, z1, 1), + z0 = svand_m (p0, z1, 1)) + +/* +** and_m2_s64_m: +** mov (z[0-9]+\.d), #-2 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_m2_s64_m, svint64_t, + z0 = svand_n_s64_m (p0, z0, -2), + z0 = svand_m (p0, z0, -2)) + +/* +** and_255_s64_m_tied1: +** uxtb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (and_255_s64_m_tied1, svint64_t, + z0 = svand_n_s64_m (p0, z0, 255), + z0 = svand_m (p0, z0, 255)) + +/* +** and_255_s64_m_untied: +** movprfx z0, z1 +** uxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_255_s64_m_untied, svint64_t, + z0 = svand_n_s64_m (p0, z1, 255), + z0 = svand_m (p0, z1, 255)) + +/* +** and_65535_s64_m_tied1: +** uxth z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (and_65535_s64_m_tied1, svint64_t, + z0 = svand_n_s64_m (p0, z0, 65535), + z0 = svand_m (p0, z0, 65535)) + +/* +** and_65535_s64_m_untied: +** movprfx z0, z1 +** uxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_65535_s64_m_untied, svint64_t, + z0 = svand_n_s64_m (p0, z1, 65535), + z0 = svand_m (p0, z1, 65535)) + +/* +** and_0xffffffff_s64_m_tied1: +** uxtw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (and_0xffffffff_s64_m_tied1, svint64_t, + z0 = svand_n_s64_m (p0, z0, 0xffffffff), + z0 = svand_m (p0, z0, 0xffffffff)) + +/* +** and_0xffffffff_s64_m_untied: +** movprfx z0, z1 +** uxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_0xffffffff_s64_m_untied, svint64_t, + z0 = svand_n_s64_m (p0, z1, 0xffffffff), + z0 = svand_m (p0, z1, 0xffffffff)) + +/* +** and_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_s64_z_tied1, svint64_t, + z0 = svand_s64_z (p0, z0, z1), + z0 = svand_z (p0, z0, z1)) + +/* +** and_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_s64_z_tied2, svint64_t, + z0 = svand_s64_z (p0, z1, z0), + z0 = svand_z (p0, z1, z0)) + +/* +** and_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** and z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** and z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (and_s64_z_untied, svint64_t, + z0 = svand_s64_z (p0, z1, z2), + z0 = svand_z (p0, z1, z2)) + +/* +** and_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svand_n_s64_z (p0, z0, x0), + z0 = svand_z (p0, z0, x0)) + +/* +** and_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** and z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** and z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (and_x0_s64_z_untied, svint64_t, int64_t, + z0 = svand_n_s64_z (p0, z1, x0), + z0 = svand_z (p0, z1, x0)) + +/* +** and_1_s64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s64_z_tied1, svint64_t, + z0 = svand_n_s64_z (p0, z0, 1), + z0 = svand_z (p0, z0, 1)) + +/* +** and_1_s64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** and z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** and z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ 
+TEST_UNIFORM_Z (and_1_s64_z_untied, svint64_t, + z0 = svand_n_s64_z (p0, z1, 1), + z0 = svand_z (p0, z1, 1)) + +/* +** and_255_s64_z_tied1: +** ( +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxtb z0\.d, p0/m, \1 +** | +** mov (z[0-9]+\.d), #255 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_255_s64_z_tied1, svint64_t, + z0 = svand_n_s64_z (p0, z0, 255), + z0 = svand_z (p0, z0, 255)) + +/* +** and_255_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_255_s64_z_untied, svint64_t, + z0 = svand_n_s64_z (p0, z1, 255), + z0 = svand_z (p0, z1, 255)) + +/* +** and_65535_s64_z_tied1: +** ( +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxth z0\.d, p0/m, \1 +** | +** mov (z[0-9]+\.d), #65535 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_65535_s64_z_tied1, svint64_t, + z0 = svand_n_s64_z (p0, z0, 65535), + z0 = svand_z (p0, z0, 65535)) + +/* +** and_65535_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_65535_s64_z_untied, svint64_t, + z0 = svand_n_s64_z (p0, z1, 65535), + z0 = svand_z (p0, z1, 65535)) + +/* +** and_0xffffffff_s64_z_tied1: +** ( +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxtw z0\.d, p0/m, \1 +** | +** mov (z[0-9]+\.d), #4294967295 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_0xffffffff_s64_z_tied1, svint64_t, + z0 = svand_n_s64_z (p0, z0, 0xffffffff), + z0 = svand_z (p0, z0, 0xffffffff)) + +/* +** and_0xffffffff_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_0xffffffff_s64_z_untied, svint64_t, + z0 = svand_n_s64_z (p0, z1, 0xffffffff), + z0 = svand_z (p0, z1, 0xffffffff)) + +/* +** and_s64_x_tied1: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s64_x_tied1, svint64_t, + z0 = svand_s64_x (p0, z0, z1), + z0 = svand_x (p0, z0, z1)) + +/* +** and_s64_x_tied2: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s64_x_tied2, svint64_t, + z0 = svand_s64_x (p0, z1, z0), + z0 = svand_x (p0, z1, z0)) + +/* +** and_s64_x_untied: +** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s64_x_untied, svint64_t, + z0 = svand_s64_x (p0, z1, z2), + z0 = svand_x (p0, z1, z2)) + +/* +** and_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** and z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svand_n_s64_x (p0, z0, x0), + z0 = svand_x (p0, z0, x0)) + +/* +** and_x0_s64_x_untied: +** mov (z[0-9]+\.d), x0 +** and z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_x0_s64_x_untied, svint64_t, int64_t, + z0 = svand_n_s64_x (p0, z1, x0), + z0 = svand_x (p0, z1, x0)) + +/* +** and_1_s64_x_tied1: +** and z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s64_x_tied1, svint64_t, + z0 = svand_n_s64_x (p0, z0, 1), + z0 = svand_x (p0, z0, 1)) + +/* +** and_1_s64_x_untied: +** movprfx z0, z1 +** and z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s64_x_untied, svint64_t, + z0 = svand_n_s64_x (p0, z1, 1), + z0 = svand_x (p0, z1, 1)) + +/* +** and_127_s64_x: +** and z0\.d, z0\.d, #0x7f +** ret +*/ +TEST_UNIFORM_Z (and_127_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, 127), + z0 = svand_x (p0, z0, 127)) + +/* +** and_128_s64_x: +** and z0\.d, z0\.d, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_128_s64_x, 
svint64_t, + z0 = svand_n_s64_x (p0, z0, 128), + z0 = svand_x (p0, z0, 128)) + +/* +** and_255_s64_x: +** and z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (and_255_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, 255), + z0 = svand_x (p0, z0, 255)) + +/* +** and_256_s64_x: +** and z0\.d, z0\.d, #0x100 +** ret +*/ +TEST_UNIFORM_Z (and_256_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, 256), + z0 = svand_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (and_257_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, 257), + z0 = svand_x (p0, z0, 257)) + +/* +** and_512_s64_x: +** and z0\.d, z0\.d, #0x200 +** ret +*/ +TEST_UNIFORM_Z (and_512_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, 512), + z0 = svand_x (p0, z0, 512)) + +/* +** and_65280_s64_x: +** and z0\.d, z0\.d, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (and_65280_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, 0xff00), + z0 = svand_x (p0, z0, 0xff00)) + +/* +** and_m127_s64_x: +** and z0\.d, z0\.d, #0xffffffffffffff81 +** ret +*/ +TEST_UNIFORM_Z (and_m127_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, -127), + z0 = svand_x (p0, z0, -127)) + +/* +** and_m128_s64_x: +** and z0\.d, z0\.d, #0xffffffffffffff80 +** ret +*/ +TEST_UNIFORM_Z (and_m128_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, -128), + z0 = svand_x (p0, z0, -128)) + +/* +** and_m255_s64_x: +** and z0\.d, z0\.d, #0xffffffffffffff01 +** ret +*/ +TEST_UNIFORM_Z (and_m255_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, -255), + z0 = svand_x (p0, z0, -255)) + +/* +** and_m256_s64_x: +** and z0\.d, z0\.d, #0xffffffffffffff00 +** ret +*/ +TEST_UNIFORM_Z (and_m256_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, -256), + z0 = svand_x (p0, z0, -256)) + +/* +** and_m257_s64_x: +** and z0\.d, z0\.d, #0xfffffffffffffeff +** ret +*/ +TEST_UNIFORM_Z (and_m257_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, -257), + z0 = svand_x (p0, z0, -257)) + +/* +** and_m512_s64_x: +** and z0\.d, z0\.d, #0xfffffffffffffe00 +** ret +*/ +TEST_UNIFORM_Z (and_m512_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, -512), + z0 = svand_x (p0, z0, -512)) + +/* +** and_m32768_s64_x: +** and z0\.d, z0\.d, #0xffffffffffff8000 +** ret +*/ +TEST_UNIFORM_Z (and_m32768_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, -0x8000), + z0 = svand_x (p0, z0, -0x8000)) + +/* +** and_5_s64_x: +** mov (z[0-9]+\.d), #5 +** and z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_5_s64_x, svint64_t, + z0 = svand_n_s64_x (p0, z0, 5), + z0 = svand_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s8.c new file mode 100644 index 000000000..61d168d3f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_s8.c @@ -0,0 +1,294 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_s8_m_tied1: +** and z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (and_s8_m_tied1, svint8_t, + z0 = svand_s8_m (p0, z0, z1), + z0 = svand_m (p0, z0, z1)) + +/* +** and_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (and_s8_m_tied2, svint8_t, + z0 = svand_s8_m (p0, z1, z0), + z0 = svand_m (p0, z1, z0)) + +/* +** and_s8_m_untied: +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (and_s8_m_untied, svint8_t, + z0 = svand_s8_m (p0, z1, z2), + z0 = svand_m (p0, z1, z2)) + +/* +** and_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ 
+TEST_UNIFORM_ZX (and_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svand_n_s8_m (p0, z0, x0), + z0 = svand_m (p0, z0, x0)) + +/* +** and_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s8_m_untied, svint8_t, int8_t, + z0 = svand_n_s8_m (p0, z1, x0), + z0 = svand_m (p0, z1, x0)) + +/* +** and_1_s8_m_tied1: +** mov (z[0-9]+\.b), #1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s8_m_tied1, svint8_t, + z0 = svand_n_s8_m (p0, z0, 1), + z0 = svand_m (p0, z0, 1)) + +/* +** and_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s8_m_untied, svint8_t, + z0 = svand_n_s8_m (p0, z1, 1), + z0 = svand_m (p0, z1, 1)) + +/* +** and_m2_s8_m: +** mov (z[0-9]+\.b), #-2 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (and_m2_s8_m, svint8_t, + z0 = svand_n_s8_m (p0, z0, -2), + z0 = svand_m (p0, z0, -2)) + +/* +** and_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (and_s8_z_tied1, svint8_t, + z0 = svand_s8_z (p0, z0, z1), + z0 = svand_z (p0, z0, z1)) + +/* +** and_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (and_s8_z_tied2, svint8_t, + z0 = svand_s8_z (p0, z1, z0), + z0 = svand_z (p0, z1, z0)) + +/* +** and_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** and z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** and z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (and_s8_z_untied, svint8_t, + z0 = svand_s8_z (p0, z1, z2), + z0 = svand_z (p0, z1, z2)) + +/* +** and_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svand_n_s8_z (p0, z0, x0), + z0 = svand_z (p0, z0, x0)) + +/* +** and_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** and z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** and z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s8_z_untied, svint8_t, int8_t, + z0 = svand_n_s8_z (p0, z1, x0), + z0 = svand_z (p0, z1, x0)) + +/* +** and_1_s8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s8_z_tied1, svint8_t, + z0 = svand_n_s8_z (p0, z0, 1), + z0 = svand_z (p0, z0, 1)) + +/* +** and_1_s8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** and z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** and z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (and_1_s8_z_untied, svint8_t, + z0 = svand_n_s8_z (p0, z1, 1), + z0 = svand_z (p0, z1, 1)) + +/* +** and_s8_x_tied1: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s8_x_tied1, svint8_t, + z0 = svand_s8_x (p0, z0, z1), + z0 = svand_x (p0, z0, z1)) + +/* +** and_s8_x_tied2: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s8_x_tied2, svint8_t, + z0 = svand_s8_x (p0, z1, z0), + z0 = svand_x (p0, z1, z0)) + +/* +** and_s8_x_untied: +** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (and_s8_x_untied, svint8_t, + z0 = svand_s8_x (p0, z1, z2), + z0 = svand_x (p0, z1, z2)) + +/* +** and_w0_s8_x_tied1: +** mov (z[0-9]+)\.b, w0 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s8_x_tied1, svint8_t, int8_t, 
+ z0 = svand_n_s8_x (p0, z0, x0), + z0 = svand_x (p0, z0, x0)) + +/* +** and_w0_s8_x_untied: +** mov (z[0-9]+)\.b, w0 +** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_s8_x_untied, svint8_t, int8_t, + z0 = svand_n_s8_x (p0, z1, x0), + z0 = svand_x (p0, z1, x0)) + +/* +** and_1_s8_x_tied1: +** and z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s8_x_tied1, svint8_t, + z0 = svand_n_s8_x (p0, z0, 1), + z0 = svand_x (p0, z0, 1)) + +/* +** and_1_s8_x_untied: +** movprfx z0, z1 +** and z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_s8_x_untied, svint8_t, + z0 = svand_n_s8_x (p0, z1, 1), + z0 = svand_x (p0, z1, 1)) + +/* +** and_127_s8_x: +** and z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (and_127_s8_x, svint8_t, + z0 = svand_n_s8_x (p0, z0, 127), + z0 = svand_x (p0, z0, 127)) + +/* +** and_128_s8_x: +** and z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_128_s8_x, svint8_t, + z0 = svand_n_s8_x (p0, z0, 128), + z0 = svand_x (p0, z0, 128)) + +/* +** and_255_s8_x: +** ret +*/ +TEST_UNIFORM_Z (and_255_s8_x, svint8_t, + z0 = svand_n_s8_x (p0, z0, 255), + z0 = svand_x (p0, z0, 255)) + +/* +** and_m127_s8_x: +** and z0\.b, z0\.b, #0x81 +** ret +*/ +TEST_UNIFORM_Z (and_m127_s8_x, svint8_t, + z0 = svand_n_s8_x (p0, z0, -127), + z0 = svand_x (p0, z0, -127)) + +/* +** and_m128_s8_x: +** and z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_m128_s8_x, svint8_t, + z0 = svand_n_s8_x (p0, z0, -128), + z0 = svand_x (p0, z0, -128)) + +/* +** and_5_s8_x: +** mov (z[0-9]+)\.b, #5 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_5_s8_x, svint8_t, + z0 = svand_n_s8_x (p0, z0, 5), + z0 = svand_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u16.c new file mode 100644 index 000000000..875a08d71 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u16.c @@ -0,0 +1,422 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_u16_m_tied1: +** and z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_u16_m_tied1, svuint16_t, + z0 = svand_u16_m (p0, z0, z1), + z0 = svand_m (p0, z0, z1)) + +/* +** and_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (and_u16_m_tied2, svuint16_t, + z0 = svand_u16_m (p0, z1, z0), + z0 = svand_m (p0, z1, z0)) + +/* +** and_u16_m_untied: +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (and_u16_m_untied, svuint16_t, + z0 = svand_u16_m (p0, z1, z2), + z0 = svand_m (p0, z1, z2)) + +/* +** and_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svand_n_u16_m (p0, z0, x0), + z0 = svand_m (p0, z0, x0)) + +/* +** and_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svand_n_u16_m (p0, z1, x0), + z0 = svand_m (p0, z1, x0)) + +/* +** and_1_u16_m_tied1: +** mov (z[0-9]+\.h), #1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u16_m_tied1, svuint16_t, + z0 = svand_n_u16_m (p0, z0, 1), + z0 = svand_m (p0, z0, 1)) + +/* +** and_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u16_m_untied, svuint16_t, + z0 = 
svand_n_u16_m (p0, z1, 1), + z0 = svand_m (p0, z1, 1)) + +/* +** and_m2_u16_m: +** mov (z[0-9]+\.h), #-2 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (and_m2_u16_m, svuint16_t, + z0 = svand_n_u16_m (p0, z0, -2), + z0 = svand_m (p0, z0, -2)) + +/* +** and_255_u16_m_tied1: +** uxtb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (and_255_u16_m_tied1, svuint16_t, + z0 = svand_n_u16_m (p0, z0, 255), + z0 = svand_m (p0, z0, 255)) + +/* +** and_255_u16_m_untied: +** movprfx z0, z1 +** uxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_255_u16_m_untied, svuint16_t, + z0 = svand_n_u16_m (p0, z1, 255), + z0 = svand_m (p0, z1, 255)) + +/* +** and_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_u16_z_tied1, svuint16_t, + z0 = svand_u16_z (p0, z0, z1), + z0 = svand_z (p0, z0, z1)) + +/* +** and_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_u16_z_tied2, svuint16_t, + z0 = svand_u16_z (p0, z1, z0), + z0 = svand_z (p0, z1, z0)) + +/* +** and_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** and z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** and z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (and_u16_z_untied, svuint16_t, + z0 = svand_u16_z (p0, z1, z2), + z0 = svand_z (p0, z1, z2)) + +/* +** and_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svand_n_u16_z (p0, z0, x0), + z0 = svand_z (p0, z0, x0)) + +/* +** and_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** and z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** and z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svand_n_u16_z (p0, z1, x0), + z0 = svand_z (p0, z1, x0)) + +/* +** and_1_u16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u16_z_tied1, svuint16_t, + z0 = svand_n_u16_z (p0, z0, 1), + z0 = svand_z (p0, z0, 1)) + +/* +** and_1_u16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** and z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** and z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (and_1_u16_z_untied, svuint16_t, + z0 = svand_n_u16_z (p0, z1, 1), + z0 = svand_z (p0, z1, 1)) + +/* +** and_255_u16_z_tied1: +** ( +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** uxtb z0\.h, p0/m, \1\.h +** | +** mov (z[0-9]+\.h), #255 +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_255_u16_z_tied1, svuint16_t, + z0 = svand_n_u16_z (p0, z0, 255), + z0 = svand_z (p0, z0, 255)) + +/* +** and_255_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** uxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (and_255_u16_z_untied, svuint16_t, + z0 = svand_n_u16_z (p0, z1, 255), + z0 = svand_z (p0, z1, 255)) + +/* +** and_u16_x_tied1: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u16_x_tied1, svuint16_t, + z0 = svand_u16_x (p0, z0, z1), + z0 = svand_x (p0, z0, z1)) + +/* +** and_u16_x_tied2: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u16_x_tied2, svuint16_t, + z0 = svand_u16_x (p0, z1, z0), + z0 = svand_x (p0, z1, z0)) + +/* +** and_u16_x_untied: +** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ 
+TEST_UNIFORM_Z (and_u16_x_untied, svuint16_t, + z0 = svand_u16_x (p0, z1, z2), + z0 = svand_x (p0, z1, z2)) + +/* +** and_w0_u16_x_tied1: +** mov (z[0-9]+)\.h, w0 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svand_n_u16_x (p0, z0, x0), + z0 = svand_x (p0, z0, x0)) + +/* +** and_w0_u16_x_untied: +** mov (z[0-9]+)\.h, w0 +** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svand_n_u16_x (p0, z1, x0), + z0 = svand_x (p0, z1, x0)) + +/* +** and_1_u16_x_tied1: +** and z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u16_x_tied1, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 1), + z0 = svand_x (p0, z0, 1)) + +/* +** and_1_u16_x_untied: +** movprfx z0, z1 +** and z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u16_x_untied, svuint16_t, + z0 = svand_n_u16_x (p0, z1, 1), + z0 = svand_x (p0, z1, 1)) + +/* +** and_127_u16_x: +** and z0\.h, z0\.h, #0x7f +** ret +*/ +TEST_UNIFORM_Z (and_127_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 127), + z0 = svand_x (p0, z0, 127)) + +/* +** and_128_u16_x: +** and z0\.h, z0\.h, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_128_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 128), + z0 = svand_x (p0, z0, 128)) + +/* +** and_255_u16_x: +** and z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (and_255_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 255), + z0 = svand_x (p0, z0, 255)) + +/* +** and_256_u16_x: +** and z0\.h, z0\.h, #0x100 +** ret +*/ +TEST_UNIFORM_Z (and_256_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 256), + z0 = svand_x (p0, z0, 256)) + +/* +** and_257_u16_x: +** and z0\.h, z0\.h, #0x101 +** ret +*/ +TEST_UNIFORM_Z (and_257_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 257), + z0 = svand_x (p0, z0, 257)) + +/* +** and_512_u16_x: +** and z0\.h, z0\.h, #0x200 +** ret +*/ +TEST_UNIFORM_Z (and_512_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 512), + z0 = svand_x (p0, z0, 512)) + +/* +** and_65280_u16_x: +** and z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (and_65280_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 0xff00), + z0 = svand_x (p0, z0, 0xff00)) + +/* +** and_m127_u16_x: +** and z0\.h, z0\.h, #0xff81 +** ret +*/ +TEST_UNIFORM_Z (and_m127_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, -127), + z0 = svand_x (p0, z0, -127)) + +/* +** and_m128_u16_x: +** and z0\.h, z0\.h, #0xff80 +** ret +*/ +TEST_UNIFORM_Z (and_m128_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, -128), + z0 = svand_x (p0, z0, -128)) + +/* +** and_m255_u16_x: +** and z0\.h, z0\.h, #0xff01 +** ret +*/ +TEST_UNIFORM_Z (and_m255_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, -255), + z0 = svand_x (p0, z0, -255)) + +/* +** and_m256_u16_x: +** and z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (and_m256_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, -256), + z0 = svand_x (p0, z0, -256)) + +/* +** and_m257_u16_x: +** and z0\.h, z0\.h, #0xfeff +** ret +*/ +TEST_UNIFORM_Z (and_m257_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, -257), + z0 = svand_x (p0, z0, -257)) + +/* +** and_m512_u16_x: +** and z0\.h, z0\.h, #0xfe00 +** ret +*/ +TEST_UNIFORM_Z (and_m512_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, -512), + z0 = svand_x (p0, z0, -512)) + +/* +** and_m32768_u16_x: +** and z0\.h, z0\.h, #0x8000 +** ret +*/ +TEST_UNIFORM_Z (and_m32768_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, -0x8000), + z0 = svand_x (p0, z0, -0x8000)) + +/* +** and_5_u16_x: +** mov (z[0-9]+)\.h, #5 +** and z0\.d, (z0\.d, 
\1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_5_u16_x, svuint16_t, + z0 = svand_n_u16_x (p0, z0, 5), + z0 = svand_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u32.c new file mode 100644 index 000000000..80ff50396 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u32.c @@ -0,0 +1,464 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_u32_m_tied1: +** and z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_u32_m_tied1, svuint32_t, + z0 = svand_u32_m (p0, z0, z1), + z0 = svand_m (p0, z0, z1)) + +/* +** and_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (and_u32_m_tied2, svuint32_t, + z0 = svand_u32_m (p0, z1, z0), + z0 = svand_m (p0, z1, z0)) + +/* +** and_u32_m_untied: +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (and_u32_m_untied, svuint32_t, + z0 = svand_u32_m (p0, z1, z2), + z0 = svand_m (p0, z1, z2)) + +/* +** and_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svand_n_u32_m (p0, z0, x0), + z0 = svand_m (p0, z0, x0)) + +/* +** and_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svand_n_u32_m (p0, z1, x0), + z0 = svand_m (p0, z1, x0)) + +/* +** and_1_u32_m_tied1: +** mov (z[0-9]+\.s), #1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u32_m_tied1, svuint32_t, + z0 = svand_n_u32_m (p0, z0, 1), + z0 = svand_m (p0, z0, 1)) + +/* +** and_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u32_m_untied, svuint32_t, + z0 = svand_n_u32_m (p0, z1, 1), + z0 = svand_m (p0, z1, 1)) + +/* +** and_m2_u32_m: +** mov (z[0-9]+\.s), #-2 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (and_m2_u32_m, svuint32_t, + z0 = svand_n_u32_m (p0, z0, -2), + z0 = svand_m (p0, z0, -2)) + +/* +** and_255_u32_m_tied1: +** uxtb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (and_255_u32_m_tied1, svuint32_t, + z0 = svand_n_u32_m (p0, z0, 255), + z0 = svand_m (p0, z0, 255)) + +/* +** and_255_u32_m_untied: +** movprfx z0, z1 +** uxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_255_u32_m_untied, svuint32_t, + z0 = svand_n_u32_m (p0, z1, 255), + z0 = svand_m (p0, z1, 255)) + +/* +** and_65535_u32_m_tied1: +** uxth z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (and_65535_u32_m_tied1, svuint32_t, + z0 = svand_n_u32_m (p0, z0, 65535), + z0 = svand_m (p0, z0, 65535)) + +/* +** and_65535_u32_m_untied: +** movprfx z0, z1 +** uxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_65535_u32_m_untied, svuint32_t, + z0 = svand_n_u32_m (p0, z1, 65535), + z0 = svand_m (p0, z1, 65535)) + +/* +** and_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_u32_z_tied1, svuint32_t, + z0 = svand_u32_z (p0, z0, z1), + z0 = svand_z (p0, z0, z1)) + +/* +** and_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_u32_z_tied2, svuint32_t, + z0 = svand_u32_z (p0, z1, z0), + z0 = svand_z (p0, z1, z0)) + +/* +** and_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** and z0\.s, p0/m, z0\.s, 
z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** and z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (and_u32_z_untied, svuint32_t, + z0 = svand_u32_z (p0, z1, z2), + z0 = svand_z (p0, z1, z2)) + +/* +** and_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svand_n_u32_z (p0, z0, x0), + z0 = svand_z (p0, z0, x0)) + +/* +** and_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** and z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** and z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svand_n_u32_z (p0, z1, x0), + z0 = svand_z (p0, z1, x0)) + +/* +** and_1_u32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u32_z_tied1, svuint32_t, + z0 = svand_n_u32_z (p0, z0, 1), + z0 = svand_z (p0, z0, 1)) + +/* +** and_1_u32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** and z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** and z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (and_1_u32_z_untied, svuint32_t, + z0 = svand_n_u32_z (p0, z1, 1), + z0 = svand_z (p0, z1, 1)) + +/* +** and_255_u32_z_tied1: +** ( +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** uxtb z0\.s, p0/m, \1\.s +** | +** mov (z[0-9]+\.s), #255 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_255_u32_z_tied1, svuint32_t, + z0 = svand_n_u32_z (p0, z0, 255), + z0 = svand_z (p0, z0, 255)) + +/* +** and_255_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** uxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_255_u32_z_untied, svuint32_t, + z0 = svand_n_u32_z (p0, z1, 255), + z0 = svand_z (p0, z1, 255)) + +/* +** and_65535_u32_z_tied1: +** ( +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** uxth z0\.s, p0/m, \1\.s +** | +** mov (z[0-9]+\.s), #65535 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_65535_u32_z_tied1, svuint32_t, + z0 = svand_n_u32_z (p0, z0, 65535), + z0 = svand_z (p0, z0, 65535)) + +/* +** and_65535_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** uxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (and_65535_u32_z_untied, svuint32_t, + z0 = svand_n_u32_z (p0, z1, 65535), + z0 = svand_z (p0, z1, 65535)) + +/* +** and_u32_x_tied1: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u32_x_tied1, svuint32_t, + z0 = svand_u32_x (p0, z0, z1), + z0 = svand_x (p0, z0, z1)) + +/* +** and_u32_x_tied2: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u32_x_tied2, svuint32_t, + z0 = svand_u32_x (p0, z1, z0), + z0 = svand_x (p0, z1, z0)) + +/* +** and_u32_x_untied: +** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u32_x_untied, svuint32_t, + z0 = svand_u32_x (p0, z1, z2), + z0 = svand_x (p0, z1, z2)) + +/* +** and_w0_u32_x_tied1: +** mov (z[0-9]+)\.s, w0 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svand_n_u32_x (p0, z0, x0), + z0 = svand_x (p0, z0, x0)) + +/* +** and_w0_u32_x_untied: +** mov (z[0-9]+)\.s, w0 +** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svand_n_u32_x (p0, z1, x0), + z0 = svand_x (p0, z1, 
x0)) + +/* +** and_1_u32_x_tied1: +** and z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u32_x_tied1, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 1), + z0 = svand_x (p0, z0, 1)) + +/* +** and_1_u32_x_untied: +** movprfx z0, z1 +** and z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u32_x_untied, svuint32_t, + z0 = svand_n_u32_x (p0, z1, 1), + z0 = svand_x (p0, z1, 1)) + +/* +** and_127_u32_x: +** and z0\.s, z0\.s, #0x7f +** ret +*/ +TEST_UNIFORM_Z (and_127_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 127), + z0 = svand_x (p0, z0, 127)) + +/* +** and_128_u32_x: +** and z0\.s, z0\.s, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_128_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 128), + z0 = svand_x (p0, z0, 128)) + +/* +** and_255_u32_x: +** and z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (and_255_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 255), + z0 = svand_x (p0, z0, 255)) + +/* +** and_256_u32_x: +** and z0\.s, z0\.s, #0x100 +** ret +*/ +TEST_UNIFORM_Z (and_256_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 256), + z0 = svand_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (and_257_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 257), + z0 = svand_x (p0, z0, 257)) + +/* +** and_512_u32_x: +** and z0\.s, z0\.s, #0x200 +** ret +*/ +TEST_UNIFORM_Z (and_512_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 512), + z0 = svand_x (p0, z0, 512)) + +/* +** and_65280_u32_x: +** and z0\.s, z0\.s, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (and_65280_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 0xff00), + z0 = svand_x (p0, z0, 0xff00)) + +/* +** and_m127_u32_x: +** and z0\.s, z0\.s, #0xffffff81 +** ret +*/ +TEST_UNIFORM_Z (and_m127_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, -127), + z0 = svand_x (p0, z0, -127)) + +/* +** and_m128_u32_x: +** and z0\.s, z0\.s, #0xffffff80 +** ret +*/ +TEST_UNIFORM_Z (and_m128_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, -128), + z0 = svand_x (p0, z0, -128)) + +/* +** and_m255_u32_x: +** and z0\.s, z0\.s, #0xffffff01 +** ret +*/ +TEST_UNIFORM_Z (and_m255_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, -255), + z0 = svand_x (p0, z0, -255)) + +/* +** and_m256_u32_x: +** and z0\.s, z0\.s, #0xffffff00 +** ret +*/ +TEST_UNIFORM_Z (and_m256_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, -256), + z0 = svand_x (p0, z0, -256)) + +/* +** and_m257_u32_x: +** and z0\.s, z0\.s, #0xfffffeff +** ret +*/ +TEST_UNIFORM_Z (and_m257_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, -257), + z0 = svand_x (p0, z0, -257)) + +/* +** and_m512_u32_x: +** and z0\.s, z0\.s, #0xfffffe00 +** ret +*/ +TEST_UNIFORM_Z (and_m512_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, -512), + z0 = svand_x (p0, z0, -512)) + +/* +** and_m32768_u32_x: +** and z0\.s, z0\.s, #0xffff8000 +** ret +*/ +TEST_UNIFORM_Z (and_m32768_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, -0x8000), + z0 = svand_x (p0, z0, -0x8000)) + +/* +** and_5_u32_x: +** mov (z[0-9]+)\.s, #5 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_5_u32_x, svuint32_t, + z0 = svand_n_u32_x (p0, z0, 5), + z0 = svand_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u64.c new file mode 100644 index 000000000..906b19c37 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u64.c @@ -0,0 +1,510 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_u64_m_tied1: +** and z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ 
+TEST_UNIFORM_Z (and_u64_m_tied1, svuint64_t, + z0 = svand_u64_m (p0, z0, z1), + z0 = svand_m (p0, z0, z1)) + +/* +** and_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_u64_m_tied2, svuint64_t, + z0 = svand_u64_m (p0, z1, z0), + z0 = svand_m (p0, z1, z0)) + +/* +** and_u64_m_untied: +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (and_u64_m_untied, svuint64_t, + z0 = svand_u64_m (p0, z1, z2), + z0 = svand_m (p0, z1, z2)) + +/* +** and_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svand_n_u64_m (p0, z0, x0), + z0 = svand_m (p0, z0, x0)) + +/* +** and_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svand_n_u64_m (p0, z1, x0), + z0 = svand_m (p0, z1, x0)) + +/* +** and_1_u64_m_tied1: +** mov (z[0-9]+\.d), #1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u64_m_tied1, svuint64_t, + z0 = svand_n_u64_m (p0, z0, 1), + z0 = svand_m (p0, z0, 1)) + +/* +** and_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u64_m_untied, svuint64_t, + z0 = svand_n_u64_m (p0, z1, 1), + z0 = svand_m (p0, z1, 1)) + +/* +** and_m2_u64_m: +** mov (z[0-9]+\.d), #-2 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_m2_u64_m, svuint64_t, + z0 = svand_n_u64_m (p0, z0, -2), + z0 = svand_m (p0, z0, -2)) + +/* +** and_255_u64_m_tied1: +** uxtb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (and_255_u64_m_tied1, svuint64_t, + z0 = svand_n_u64_m (p0, z0, 255), + z0 = svand_m (p0, z0, 255)) + +/* +** and_255_u64_m_untied: +** movprfx z0, z1 +** uxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_255_u64_m_untied, svuint64_t, + z0 = svand_n_u64_m (p0, z1, 255), + z0 = svand_m (p0, z1, 255)) + +/* +** and_65535_u64_m_tied1: +** uxth z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (and_65535_u64_m_tied1, svuint64_t, + z0 = svand_n_u64_m (p0, z0, 65535), + z0 = svand_m (p0, z0, 65535)) + +/* +** and_65535_u64_m_untied: +** movprfx z0, z1 +** uxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_65535_u64_m_untied, svuint64_t, + z0 = svand_n_u64_m (p0, z1, 65535), + z0 = svand_m (p0, z1, 65535)) + +/* +** and_0xffffffff_u64_m_tied1: +** uxtw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (and_0xffffffff_u64_m_tied1, svuint64_t, + z0 = svand_n_u64_m (p0, z0, 0xffffffff), + z0 = svand_m (p0, z0, 0xffffffff)) + +/* +** and_0xffffffff_u64_m_untied: +** movprfx z0, z1 +** uxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_0xffffffff_u64_m_untied, svuint64_t, + z0 = svand_n_u64_m (p0, z1, 0xffffffff), + z0 = svand_m (p0, z1, 0xffffffff)) + +/* +** and_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_u64_z_tied1, svuint64_t, + z0 = svand_u64_z (p0, z0, z1), + z0 = svand_z (p0, z0, z1)) + +/* +** and_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_u64_z_tied2, svuint64_t, + z0 = svand_u64_z (p0, z1, z0), + z0 = svand_z (p0, z1, z0)) + +/* +** and_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** and z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** and z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (and_u64_z_untied, 
svuint64_t, + z0 = svand_u64_z (p0, z1, z2), + z0 = svand_z (p0, z1, z2)) + +/* +** and_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svand_n_u64_z (p0, z0, x0), + z0 = svand_z (p0, z0, x0)) + +/* +** and_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** and z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** and z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (and_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svand_n_u64_z (p0, z1, x0), + z0 = svand_z (p0, z1, x0)) + +/* +** and_1_u64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u64_z_tied1, svuint64_t, + z0 = svand_n_u64_z (p0, z0, 1), + z0 = svand_z (p0, z0, 1)) + +/* +** and_1_u64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** and z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** and z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (and_1_u64_z_untied, svuint64_t, + z0 = svand_n_u64_z (p0, z1, 1), + z0 = svand_z (p0, z1, 1)) + +/* +** and_255_u64_z_tied1: +** ( +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxtb z0\.d, p0/m, \1 +** | +** mov (z[0-9]+\.d), #255 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_255_u64_z_tied1, svuint64_t, + z0 = svand_n_u64_z (p0, z0, 255), + z0 = svand_z (p0, z0, 255)) + +/* +** and_255_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_255_u64_z_untied, svuint64_t, + z0 = svand_n_u64_z (p0, z1, 255), + z0 = svand_z (p0, z1, 255)) + +/* +** and_65535_u64_z_tied1: +** ( +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxth z0\.d, p0/m, \1 +** | +** mov (z[0-9]+\.d), #65535 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_65535_u64_z_tied1, svuint64_t, + z0 = svand_n_u64_z (p0, z0, 65535), + z0 = svand_z (p0, z0, 65535)) + +/* +** and_65535_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_65535_u64_z_untied, svuint64_t, + z0 = svand_n_u64_z (p0, z1, 65535), + z0 = svand_z (p0, z1, 65535)) + +/* +** and_0xffffffff_u64_z_tied1: +** ( +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxtw z0\.d, p0/m, \1 +** | +** mov (z[0-9]+\.d), #4294967295 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ) +** ret +*/ +TEST_UNIFORM_Z (and_0xffffffff_u64_z_tied1, svuint64_t, + z0 = svand_n_u64_z (p0, z0, 0xffffffff), + z0 = svand_z (p0, z0, 0xffffffff)) + +/* +** and_0xffffffff_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (and_0xffffffff_u64_z_untied, svuint64_t, + z0 = svand_n_u64_z (p0, z1, 0xffffffff), + z0 = svand_z (p0, z1, 0xffffffff)) + +/* +** and_u64_x_tied1: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u64_x_tied1, svuint64_t, + z0 = svand_u64_x (p0, z0, z1), + z0 = svand_x (p0, z0, z1)) + +/* +** and_u64_x_tied2: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u64_x_tied2, svuint64_t, + z0 = svand_u64_x (p0, z1, z0), + z0 = svand_x (p0, z1, z0)) + +/* +** and_u64_x_untied: +** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u64_x_untied, svuint64_t, + z0 = svand_u64_x (p0, z1, z2), + z0 = 
svand_x (p0, z1, z2)) + +/* +** and_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** and z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svand_n_u64_x (p0, z0, x0), + z0 = svand_x (p0, z0, x0)) + +/* +** and_x0_u64_x_untied: +** mov (z[0-9]+\.d), x0 +** and z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svand_n_u64_x (p0, z1, x0), + z0 = svand_x (p0, z1, x0)) + +/* +** and_1_u64_x_tied1: +** and z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u64_x_tied1, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 1), + z0 = svand_x (p0, z0, 1)) + +/* +** and_1_u64_x_untied: +** movprfx z0, z1 +** and z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u64_x_untied, svuint64_t, + z0 = svand_n_u64_x (p0, z1, 1), + z0 = svand_x (p0, z1, 1)) + +/* +** and_127_u64_x: +** and z0\.d, z0\.d, #0x7f +** ret +*/ +TEST_UNIFORM_Z (and_127_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 127), + z0 = svand_x (p0, z0, 127)) + +/* +** and_128_u64_x: +** and z0\.d, z0\.d, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_128_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 128), + z0 = svand_x (p0, z0, 128)) + +/* +** and_255_u64_x: +** and z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (and_255_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 255), + z0 = svand_x (p0, z0, 255)) + +/* +** and_256_u64_x: +** and z0\.d, z0\.d, #0x100 +** ret +*/ +TEST_UNIFORM_Z (and_256_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 256), + z0 = svand_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (and_257_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 257), + z0 = svand_x (p0, z0, 257)) + +/* +** and_512_u64_x: +** and z0\.d, z0\.d, #0x200 +** ret +*/ +TEST_UNIFORM_Z (and_512_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 512), + z0 = svand_x (p0, z0, 512)) + +/* +** and_65280_u64_x: +** and z0\.d, z0\.d, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (and_65280_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 0xff00), + z0 = svand_x (p0, z0, 0xff00)) + +/* +** and_m127_u64_x: +** and z0\.d, z0\.d, #0xffffffffffffff81 +** ret +*/ +TEST_UNIFORM_Z (and_m127_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, -127), + z0 = svand_x (p0, z0, -127)) + +/* +** and_m128_u64_x: +** and z0\.d, z0\.d, #0xffffffffffffff80 +** ret +*/ +TEST_UNIFORM_Z (and_m128_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, -128), + z0 = svand_x (p0, z0, -128)) + +/* +** and_m255_u64_x: +** and z0\.d, z0\.d, #0xffffffffffffff01 +** ret +*/ +TEST_UNIFORM_Z (and_m255_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, -255), + z0 = svand_x (p0, z0, -255)) + +/* +** and_m256_u64_x: +** and z0\.d, z0\.d, #0xffffffffffffff00 +** ret +*/ +TEST_UNIFORM_Z (and_m256_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, -256), + z0 = svand_x (p0, z0, -256)) + +/* +** and_m257_u64_x: +** and z0\.d, z0\.d, #0xfffffffffffffeff +** ret +*/ +TEST_UNIFORM_Z (and_m257_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, -257), + z0 = svand_x (p0, z0, -257)) + +/* +** and_m512_u64_x: +** and z0\.d, z0\.d, #0xfffffffffffffe00 +** ret +*/ +TEST_UNIFORM_Z (and_m512_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, -512), + z0 = svand_x (p0, z0, -512)) + +/* +** and_m32768_u64_x: +** and z0\.d, z0\.d, #0xffffffffffff8000 +** ret +*/ +TEST_UNIFORM_Z (and_m32768_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, -0x8000), + z0 = svand_x (p0, z0, -0x8000)) + +/* +** and_5_u64_x: +** mov (z[0-9]+\.d), #5 +** and z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z 
(and_5_u64_x, svuint64_t, + z0 = svand_n_u64_x (p0, z0, 5), + z0 = svand_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u8.c new file mode 100644 index 000000000..b0f1c9529 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/and_u8.c @@ -0,0 +1,294 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** and_u8_m_tied1: +** and z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (and_u8_m_tied1, svuint8_t, + z0 = svand_u8_m (p0, z0, z1), + z0 = svand_m (p0, z0, z1)) + +/* +** and_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (and_u8_m_tied2, svuint8_t, + z0 = svand_u8_m (p0, z1, z0), + z0 = svand_m (p0, z1, z0)) + +/* +** and_u8_m_untied: +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (and_u8_m_untied, svuint8_t, + z0 = svand_u8_m (p0, z1, z2), + z0 = svand_m (p0, z1, z2)) + +/* +** and_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svand_n_u8_m (p0, z0, x0), + z0 = svand_m (p0, z0, x0)) + +/* +** and_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svand_n_u8_m (p0, z1, x0), + z0 = svand_m (p0, z1, x0)) + +/* +** and_1_u8_m_tied1: +** mov (z[0-9]+\.b), #1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u8_m_tied1, svuint8_t, + z0 = svand_n_u8_m (p0, z0, 1), + z0 = svand_m (p0, z0, 1)) + +/* +** and_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u8_m_untied, svuint8_t, + z0 = svand_n_u8_m (p0, z1, 1), + z0 = svand_m (p0, z1, 1)) + +/* +** and_m2_u8_m: +** mov (z[0-9]+\.b), #-2 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (and_m2_u8_m, svuint8_t, + z0 = svand_n_u8_m (p0, z0, -2), + z0 = svand_m (p0, z0, -2)) + +/* +** and_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (and_u8_z_tied1, svuint8_t, + z0 = svand_u8_z (p0, z0, z1), + z0 = svand_z (p0, z0, z1)) + +/* +** and_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (and_u8_z_tied2, svuint8_t, + z0 = svand_u8_z (p0, z1, z0), + z0 = svand_z (p0, z1, z0)) + +/* +** and_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** and z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** and z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (and_u8_z_untied, svuint8_t, + z0 = svand_u8_z (p0, z1, z2), + z0 = svand_z (p0, z1, z2)) + +/* +** and_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svand_n_u8_z (p0, z0, x0), + z0 = svand_z (p0, z0, x0)) + +/* +** and_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** and z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** and z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svand_n_u8_z (p0, z1, x0), + z0 = svand_z (p0, z1, x0)) + +/* +** and_1_u8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, 
z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u8_z_tied1, svuint8_t, + z0 = svand_n_u8_z (p0, z0, 1), + z0 = svand_z (p0, z0, 1)) + +/* +** and_1_u8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** and z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** and z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (and_1_u8_z_untied, svuint8_t, + z0 = svand_n_u8_z (p0, z1, 1), + z0 = svand_z (p0, z1, 1)) + +/* +** and_u8_x_tied1: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u8_x_tied1, svuint8_t, + z0 = svand_u8_x (p0, z0, z1), + z0 = svand_x (p0, z0, z1)) + +/* +** and_u8_x_tied2: +** and z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u8_x_tied2, svuint8_t, + z0 = svand_u8_x (p0, z1, z0), + z0 = svand_x (p0, z1, z0)) + +/* +** and_u8_x_untied: +** and z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (and_u8_x_untied, svuint8_t, + z0 = svand_u8_x (p0, z1, z2), + z0 = svand_x (p0, z1, z2)) + +/* +** and_w0_u8_x_tied1: +** mov (z[0-9]+)\.b, w0 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svand_n_u8_x (p0, z0, x0), + z0 = svand_x (p0, z0, x0)) + +/* +** and_w0_u8_x_untied: +** mov (z[0-9]+)\.b, w0 +** and z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (and_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svand_n_u8_x (p0, z1, x0), + z0 = svand_x (p0, z1, x0)) + +/* +** and_1_u8_x_tied1: +** and z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u8_x_tied1, svuint8_t, + z0 = svand_n_u8_x (p0, z0, 1), + z0 = svand_x (p0, z0, 1)) + +/* +** and_1_u8_x_untied: +** movprfx z0, z1 +** and z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (and_1_u8_x_untied, svuint8_t, + z0 = svand_n_u8_x (p0, z1, 1), + z0 = svand_x (p0, z1, 1)) + +/* +** and_127_u8_x: +** and z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (and_127_u8_x, svuint8_t, + z0 = svand_n_u8_x (p0, z0, 127), + z0 = svand_x (p0, z0, 127)) + +/* +** and_128_u8_x: +** and z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_128_u8_x, svuint8_t, + z0 = svand_n_u8_x (p0, z0, 128), + z0 = svand_x (p0, z0, 128)) + +/* +** and_255_u8_x: +** ret +*/ +TEST_UNIFORM_Z (and_255_u8_x, svuint8_t, + z0 = svand_n_u8_x (p0, z0, 255), + z0 = svand_x (p0, z0, 255)) + +/* +** and_m127_u8_x: +** and z0\.b, z0\.b, #0x81 +** ret +*/ +TEST_UNIFORM_Z (and_m127_u8_x, svuint8_t, + z0 = svand_n_u8_x (p0, z0, -127), + z0 = svand_x (p0, z0, -127)) + +/* +** and_m128_u8_x: +** and z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (and_m128_u8_x, svuint8_t, + z0 = svand_n_u8_x (p0, z0, -128), + z0 = svand_x (p0, z0, -128)) + +/* +** and_5_u8_x: +** mov (z[0-9]+)\.b, #5 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (and_5_u8_x, svuint8_t, + z0 = svand_n_u8_x (p0, z0, 5), + z0 = svand_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s16.c new file mode 100644 index 000000000..16761b823 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** andv_x0_s16: +** andv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (andv_x0_s16, int16_t, svint16_t, + x0 = svandv_s16 (p0, z0), + x0 = svandv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s32.c new file mode 100644 index 000000000..bccc91e21 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** andv_x0_s32: +** andv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (andv_x0_s32, int32_t, svint32_t, + x0 = svandv_s32 (p0, z0), + x0 = svandv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s64.c new file mode 100644 index 000000000..53488b6e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** andv_x0_s64: +** andv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (andv_x0_s64, int64_t, svint64_t, + x0 = svandv_s64 (p0, z0), + x0 = svandv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s8.c new file mode 100644 index 000000000..052f74c7f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_s8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** andv_x0_s8: +** andv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (andv_x0_s8, int8_t, svint8_t, + x0 = svandv_s8 (p0, z0), + x0 = svandv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u16.c new file mode 100644 index 000000000..03328022d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** andv_x0_u16: +** andv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (andv_x0_u16, uint16_t, svuint16_t, + x0 = svandv_u16 (p0, z0), + x0 = svandv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u32.c new file mode 100644 index 000000000..a1677e703 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** andv_x0_u32: +** andv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (andv_x0_u32, uint32_t, svuint32_t, + x0 = svandv_u32 (p0, z0), + x0 = svandv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u64.c new file mode 100644 index 000000000..d45422693 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** andv_x0_u64: +** andv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (andv_x0_u64, uint64_t, svuint64_t, + x0 = svandv_u64 (p0, z0), + x0 = svandv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u8.c new file mode 100644 index 000000000..b07f6b6e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/andv_u8.c @@ -0,0 +1,13 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** andv_x0_u8: +** andv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (andv_x0_u8, uint8_t, svuint8_t, + x0 = svandv_u8 (p0, z0), + x0 = svandv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s16.c new file mode 100644 index 000000000..877bf1068 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s16.c @@ -0,0 +1,340 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asr_s16_m_tied1: +** asr z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (asr_s16_m_tied1, svint16_t, svuint16_t, + z0 = svasr_s16_m (p0, z0, z4), + z0 = svasr_m (p0, z0, z4)) + +/* +** asr_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** asr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (asr_s16_m_tied2, svint16_t, svuint16_t, + z0_res = svasr_s16_m (p0, z4, z0), + z0_res = svasr_m (p0, z4, z0)) + +/* +** asr_s16_m_untied: +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (asr_s16_m_untied, svint16_t, svuint16_t, + z0 = svasr_s16_m (p0, z1, z4), + z0 = svasr_m (p0, z1, z4)) + +/* +** asr_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s16_m_tied1, svint16_t, uint16_t, + z0 = svasr_n_s16_m (p0, z0, x0), + z0 = svasr_m (p0, z0, x0)) + +/* +** asr_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s16_m_untied, svint16_t, uint16_t, + z0 = svasr_n_s16_m (p0, z1, x0), + z0 = svasr_m (p0, z1, x0)) + +/* +** asr_1_s16_m_tied1: +** asr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s16_m_tied1, svint16_t, + z0 = svasr_n_s16_m (p0, z0, 1), + z0 = svasr_m (p0, z0, 1)) + +/* +** asr_1_s16_m_untied: +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s16_m_untied, svint16_t, + z0 = svasr_n_s16_m (p0, z1, 1), + z0 = svasr_m (p0, z1, 1)) + +/* +** asr_15_s16_m_tied1: +** asr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_15_s16_m_tied1, svint16_t, + z0 = svasr_n_s16_m (p0, z0, 15), + z0 = svasr_m (p0, z0, 15)) + +/* +** asr_15_s16_m_untied: +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_15_s16_m_untied, svint16_t, + z0 = svasr_n_s16_m (p0, z1, 15), + z0 = svasr_m (p0, z1, 15)) + +/* +** asr_16_s16_m_tied1: +** asr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_16_s16_m_tied1, svint16_t, + z0 = svasr_n_s16_m (p0, z0, 16), + z0 = svasr_m (p0, z0, 16)) + +/* +** asr_16_s16_m_untied: +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_16_s16_m_untied, svint16_t, + z0 = svasr_n_s16_m (p0, z1, 16), + z0 = svasr_m (p0, z1, 16)) + +/* +** asr_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (asr_s16_z_tied1, svint16_t, svuint16_t, + z0 = svasr_s16_z (p0, z0, z4), + z0 = svasr_z (p0, z0, z4)) + +/* +** asr_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** asrr z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z_REV (asr_s16_z_tied2, svint16_t, svuint16_t, + z0_res = svasr_s16_z (p0, z4, z0), + z0_res = svasr_z (p0, z4, z0)) + +/* +** asr_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, z4\.h +** | +** movprfx z0\.h, p0/z, z4\.h +** asrr z0\.h, p0/m, z0\.h, 
z1\.h +** ) +** ret +*/ +TEST_DUAL_Z (asr_s16_z_untied, svint16_t, svuint16_t, + z0 = svasr_s16_z (p0, z1, z4), + z0 = svasr_z (p0, z1, z4)) + +/* +** asr_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s16_z_tied1, svint16_t, uint16_t, + z0 = svasr_n_s16_z (p0, z0, x0), + z0 = svasr_z (p0, z0, x0)) + +/* +** asr_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** asrr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s16_z_untied, svint16_t, uint16_t, + z0 = svasr_n_s16_z (p0, z1, x0), + z0 = svasr_z (p0, z1, x0)) + +/* +** asr_1_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s16_z_tied1, svint16_t, + z0 = svasr_n_s16_z (p0, z0, 1), + z0 = svasr_z (p0, z0, 1)) + +/* +** asr_1_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s16_z_untied, svint16_t, + z0 = svasr_n_s16_z (p0, z1, 1), + z0 = svasr_z (p0, z1, 1)) + +/* +** asr_15_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_15_s16_z_tied1, svint16_t, + z0 = svasr_n_s16_z (p0, z0, 15), + z0 = svasr_z (p0, z0, 15)) + +/* +** asr_15_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_15_s16_z_untied, svint16_t, + z0 = svasr_n_s16_z (p0, z1, 15), + z0 = svasr_z (p0, z1, 15)) + +/* +** asr_16_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_16_s16_z_tied1, svint16_t, + z0 = svasr_n_s16_z (p0, z0, 16), + z0 = svasr_z (p0, z0, 16)) + +/* +** asr_16_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_16_s16_z_untied, svint16_t, + z0 = svasr_n_s16_z (p0, z1, 16), + z0 = svasr_z (p0, z1, 16)) + +/* +** asr_s16_x_tied1: +** asr z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (asr_s16_x_tied1, svint16_t, svuint16_t, + z0 = svasr_s16_x (p0, z0, z4), + z0 = svasr_x (p0, z0, z4)) + +/* +** asr_s16_x_tied2: +** asrr z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z_REV (asr_s16_x_tied2, svint16_t, svuint16_t, + z0_res = svasr_s16_x (p0, z4, z0), + z0_res = svasr_x (p0, z4, z0)) + +/* +** asr_s16_x_untied: +** ( +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, z4\.h +** | +** movprfx z0, z4 +** asrr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_DUAL_Z (asr_s16_x_untied, svint16_t, svuint16_t, + z0 = svasr_s16_x (p0, z1, z4), + z0 = svasr_x (p0, z1, z4)) + +/* +** asr_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s16_x_tied1, svint16_t, uint16_t, + z0 = svasr_n_s16_x (p0, z0, x0), + z0 = svasr_x (p0, z0, x0)) + +/* +** asr_w0_s16_x_untied: +** mov z0\.h, w0 +** asrr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s16_x_untied, svint16_t, uint16_t, + z0 = svasr_n_s16_x (p0, z1, x0), + z0 = svasr_x (p0, z1, x0)) + +/* +** asr_1_s16_x_tied1: +** asr z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s16_x_tied1, svint16_t, + z0 = svasr_n_s16_x (p0, z0, 1), + z0 = svasr_x (p0, z0, 1)) + +/* +** asr_1_s16_x_untied: +** asr z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s16_x_untied, svint16_t, + z0 = svasr_n_s16_x (p0, z1, 1), + z0 = svasr_x (p0, z1, 1)) + +/* +** asr_15_s16_x_tied1: +** asr z0\.h, z0\.h, #15 +** 
ret +*/ +TEST_UNIFORM_Z (asr_15_s16_x_tied1, svint16_t, + z0 = svasr_n_s16_x (p0, z0, 15), + z0 = svasr_x (p0, z0, 15)) + +/* +** asr_15_s16_x_untied: +** asr z0\.h, z1\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_15_s16_x_untied, svint16_t, + z0 = svasr_n_s16_x (p0, z1, 15), + z0 = svasr_x (p0, z1, 15)) + +/* +** asr_16_s16_x_tied1: +** asr z0\.h, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_16_s16_x_tied1, svint16_t, + z0 = svasr_n_s16_x (p0, z0, 16), + z0 = svasr_x (p0, z0, 16)) + +/* +** asr_16_s16_x_untied: +** asr z0\.h, z1\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_16_s16_x_untied, svint16_t, + z0 = svasr_n_s16_x (p0, z1, 16), + z0 = svasr_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s32.c new file mode 100644 index 000000000..0f5a37372 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s32.c @@ -0,0 +1,340 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asr_s32_m_tied1: +** asr z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (asr_s32_m_tied1, svint32_t, svuint32_t, + z0 = svasr_s32_m (p0, z0, z4), + z0 = svasr_m (p0, z0, z4)) + +/* +** asr_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** asr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (asr_s32_m_tied2, svint32_t, svuint32_t, + z0_res = svasr_s32_m (p0, z4, z0), + z0_res = svasr_m (p0, z4, z0)) + +/* +** asr_s32_m_untied: +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (asr_s32_m_untied, svint32_t, svuint32_t, + z0 = svasr_s32_m (p0, z1, z4), + z0 = svasr_m (p0, z1, z4)) + +/* +** asr_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s32_m_tied1, svint32_t, uint32_t, + z0 = svasr_n_s32_m (p0, z0, x0), + z0 = svasr_m (p0, z0, x0)) + +/* +** asr_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s32_m_untied, svint32_t, uint32_t, + z0 = svasr_n_s32_m (p0, z1, x0), + z0 = svasr_m (p0, z1, x0)) + +/* +** asr_1_s32_m_tied1: +** asr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s32_m_tied1, svint32_t, + z0 = svasr_n_s32_m (p0, z0, 1), + z0 = svasr_m (p0, z0, 1)) + +/* +** asr_1_s32_m_untied: +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s32_m_untied, svint32_t, + z0 = svasr_n_s32_m (p0, z1, 1), + z0 = svasr_m (p0, z1, 1)) + +/* +** asr_31_s32_m_tied1: +** asr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_31_s32_m_tied1, svint32_t, + z0 = svasr_n_s32_m (p0, z0, 31), + z0 = svasr_m (p0, z0, 31)) + +/* +** asr_31_s32_m_untied: +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_31_s32_m_untied, svint32_t, + z0 = svasr_n_s32_m (p0, z1, 31), + z0 = svasr_m (p0, z1, 31)) + +/* +** asr_32_s32_m_tied1: +** asr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_32_s32_m_tied1, svint32_t, + z0 = svasr_n_s32_m (p0, z0, 32), + z0 = svasr_m (p0, z0, 32)) + +/* +** asr_32_s32_m_untied: +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_32_s32_m_untied, svint32_t, + z0 = svasr_n_s32_m (p0, z1, 32), + z0 = svasr_m (p0, z1, 32)) + +/* +** asr_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (asr_s32_z_tied1, svint32_t, svuint32_t, + z0 = svasr_s32_z (p0, z0, z4), + z0 = svasr_z (p0, z0, z4)) + +/* +** 
asr_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** asrr z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z_REV (asr_s32_z_tied2, svint32_t, svuint32_t, + z0_res = svasr_s32_z (p0, z4, z0), + z0_res = svasr_z (p0, z4, z0)) + +/* +** asr_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, z4\.s +** | +** movprfx z0\.s, p0/z, z4\.s +** asrr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_DUAL_Z (asr_s32_z_untied, svint32_t, svuint32_t, + z0 = svasr_s32_z (p0, z1, z4), + z0 = svasr_z (p0, z1, z4)) + +/* +** asr_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s32_z_tied1, svint32_t, uint32_t, + z0 = svasr_n_s32_z (p0, z0, x0), + z0 = svasr_z (p0, z0, x0)) + +/* +** asr_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** asrr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s32_z_untied, svint32_t, uint32_t, + z0 = svasr_n_s32_z (p0, z1, x0), + z0 = svasr_z (p0, z1, x0)) + +/* +** asr_1_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s32_z_tied1, svint32_t, + z0 = svasr_n_s32_z (p0, z0, 1), + z0 = svasr_z (p0, z0, 1)) + +/* +** asr_1_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s32_z_untied, svint32_t, + z0 = svasr_n_s32_z (p0, z1, 1), + z0 = svasr_z (p0, z1, 1)) + +/* +** asr_31_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_31_s32_z_tied1, svint32_t, + z0 = svasr_n_s32_z (p0, z0, 31), + z0 = svasr_z (p0, z0, 31)) + +/* +** asr_31_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_31_s32_z_untied, svint32_t, + z0 = svasr_n_s32_z (p0, z1, 31), + z0 = svasr_z (p0, z1, 31)) + +/* +** asr_32_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_32_s32_z_tied1, svint32_t, + z0 = svasr_n_s32_z (p0, z0, 32), + z0 = svasr_z (p0, z0, 32)) + +/* +** asr_32_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_32_s32_z_untied, svint32_t, + z0 = svasr_n_s32_z (p0, z1, 32), + z0 = svasr_z (p0, z1, 32)) + +/* +** asr_s32_x_tied1: +** asr z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (asr_s32_x_tied1, svint32_t, svuint32_t, + z0 = svasr_s32_x (p0, z0, z4), + z0 = svasr_x (p0, z0, z4)) + +/* +** asr_s32_x_tied2: +** asrr z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z_REV (asr_s32_x_tied2, svint32_t, svuint32_t, + z0_res = svasr_s32_x (p0, z4, z0), + z0_res = svasr_x (p0, z4, z0)) + +/* +** asr_s32_x_untied: +** ( +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, z4\.s +** | +** movprfx z0, z4 +** asrr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_DUAL_Z (asr_s32_x_untied, svint32_t, svuint32_t, + z0 = svasr_s32_x (p0, z1, z4), + z0 = svasr_x (p0, z1, z4)) + +/* +** asr_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s32_x_tied1, svint32_t, uint32_t, + z0 = svasr_n_s32_x (p0, z0, x0), + z0 = svasr_x (p0, z0, x0)) + +/* +** asr_w0_s32_x_untied: +** mov z0\.s, w0 +** asrr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s32_x_untied, svint32_t, uint32_t, + z0 = svasr_n_s32_x (p0, z1, x0), + z0 = svasr_x (p0, z1, x0)) + +/* +** 
asr_1_s32_x_tied1: +** asr z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s32_x_tied1, svint32_t, + z0 = svasr_n_s32_x (p0, z0, 1), + z0 = svasr_x (p0, z0, 1)) + +/* +** asr_1_s32_x_untied: +** asr z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s32_x_untied, svint32_t, + z0 = svasr_n_s32_x (p0, z1, 1), + z0 = svasr_x (p0, z1, 1)) + +/* +** asr_31_s32_x_tied1: +** asr z0\.s, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_31_s32_x_tied1, svint32_t, + z0 = svasr_n_s32_x (p0, z0, 31), + z0 = svasr_x (p0, z0, 31)) + +/* +** asr_31_s32_x_untied: +** asr z0\.s, z1\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_31_s32_x_untied, svint32_t, + z0 = svasr_n_s32_x (p0, z1, 31), + z0 = svasr_x (p0, z1, 31)) + +/* +** asr_32_s32_x_tied1: +** asr z0\.s, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_32_s32_x_tied1, svint32_t, + z0 = svasr_n_s32_x (p0, z0, 32), + z0 = svasr_x (p0, z0, 32)) + +/* +** asr_32_s32_x_untied: +** asr z0\.s, z1\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_32_s32_x_untied, svint32_t, + z0 = svasr_n_s32_x (p0, z1, 32), + z0 = svasr_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s64.c new file mode 100644 index 000000000..80cae07c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s64.c @@ -0,0 +1,340 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asr_s64_m_tied1: +** asr z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_s64_m_tied1, svint64_t, svuint64_t, + z0 = svasr_s64_m (p0, z0, z4), + z0 = svasr_m (p0, z0, z4)) + +/* +** asr_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** asr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_DUAL_Z_REV (asr_s64_m_tied2, svint64_t, svuint64_t, + z0_res = svasr_s64_m (p0, z4, z0), + z0_res = svasr_m (p0, z4, z0)) + +/* +** asr_s64_m_untied: +** movprfx z0, z1 +** asr z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_s64_m_untied, svint64_t, svuint64_t, + z0 = svasr_s64_m (p0, z1, z4), + z0 = svasr_m (p0, z1, z4)) + +/* +** asr_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** asr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_x0_s64_m_tied1, svint64_t, uint64_t, + z0 = svasr_n_s64_m (p0, z0, x0), + z0 = svasr_m (p0, z0, x0)) + +/* +** asr_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** asr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_x0_s64_m_untied, svint64_t, uint64_t, + z0 = svasr_n_s64_m (p0, z1, x0), + z0 = svasr_m (p0, z1, x0)) + +/* +** asr_1_s64_m_tied1: +** asr z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s64_m_tied1, svint64_t, + z0 = svasr_n_s64_m (p0, z0, 1), + z0 = svasr_m (p0, z0, 1)) + +/* +** asr_1_s64_m_untied: +** movprfx z0, z1 +** asr z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s64_m_untied, svint64_t, + z0 = svasr_n_s64_m (p0, z1, 1), + z0 = svasr_m (p0, z1, 1)) + +/* +** asr_63_s64_m_tied1: +** asr z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (asr_63_s64_m_tied1, svint64_t, + z0 = svasr_n_s64_m (p0, z0, 63), + z0 = svasr_m (p0, z0, 63)) + +/* +** asr_63_s64_m_untied: +** movprfx z0, z1 +** asr z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (asr_63_s64_m_untied, svint64_t, + z0 = svasr_n_s64_m (p0, z1, 63), + z0 = svasr_m (p0, z1, 63)) + +/* +** asr_64_s64_m_tied1: +** asr z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asr_64_s64_m_tied1, svint64_t, + z0 = svasr_n_s64_m (p0, z0, 64), + z0 = svasr_m (p0, z0, 64)) + +/* +** asr_64_s64_m_untied: +** movprfx z0, z1 
+** asr z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asr_64_s64_m_untied, svint64_t, + z0 = svasr_n_s64_m (p0, z1, 64), + z0 = svasr_m (p0, z1, 64)) + +/* +** asr_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** asr z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_s64_z_tied1, svint64_t, svuint64_t, + z0 = svasr_s64_z (p0, z0, z4), + z0 = svasr_z (p0, z0, z4)) + +/* +** asr_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** asrr z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z_REV (asr_s64_z_tied2, svint64_t, svuint64_t, + z0_res = svasr_s64_z (p0, z4, z0), + z0_res = svasr_z (p0, z4, z0)) + +/* +** asr_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** asr z0\.d, p0/m, z0\.d, z4\.d +** | +** movprfx z0\.d, p0/z, z4\.d +** asrr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_DUAL_Z (asr_s64_z_untied, svint64_t, svuint64_t, + z0 = svasr_s64_z (p0, z1, z4), + z0 = svasr_z (p0, z1, z4)) + +/* +** asr_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** asr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_x0_s64_z_tied1, svint64_t, uint64_t, + z0 = svasr_n_s64_z (p0, z0, x0), + z0 = svasr_z (p0, z0, x0)) + +/* +** asr_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** asr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** asrr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (asr_x0_s64_z_untied, svint64_t, uint64_t, + z0 = svasr_n_s64_z (p0, z1, x0), + z0 = svasr_z (p0, z1, x0)) + +/* +** asr_1_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** asr z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s64_z_tied1, svint64_t, + z0 = svasr_n_s64_z (p0, z0, 1), + z0 = svasr_z (p0, z0, 1)) + +/* +** asr_1_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** asr z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s64_z_untied, svint64_t, + z0 = svasr_n_s64_z (p0, z1, 1), + z0 = svasr_z (p0, z1, 1)) + +/* +** asr_63_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** asr z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (asr_63_s64_z_tied1, svint64_t, + z0 = svasr_n_s64_z (p0, z0, 63), + z0 = svasr_z (p0, z0, 63)) + +/* +** asr_63_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** asr z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (asr_63_s64_z_untied, svint64_t, + z0 = svasr_n_s64_z (p0, z1, 63), + z0 = svasr_z (p0, z1, 63)) + +/* +** asr_64_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** asr z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asr_64_s64_z_tied1, svint64_t, + z0 = svasr_n_s64_z (p0, z0, 64), + z0 = svasr_z (p0, z0, 64)) + +/* +** asr_64_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** asr z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asr_64_s64_z_untied, svint64_t, + z0 = svasr_n_s64_z (p0, z1, 64), + z0 = svasr_z (p0, z1, 64)) + +/* +** asr_s64_x_tied1: +** asr z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_s64_x_tied1, svint64_t, svuint64_t, + z0 = svasr_s64_x (p0, z0, z4), + z0 = svasr_x (p0, z0, z4)) + +/* +** asr_s64_x_tied2: +** asrr z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z_REV (asr_s64_x_tied2, svint64_t, svuint64_t, + z0_res = svasr_s64_x (p0, z4, z0), + z0_res = svasr_x (p0, z4, z0)) + +/* +** asr_s64_x_untied: +** ( +** movprfx z0, z1 +** asr z0\.d, p0/m, z0\.d, z4\.d +** | +** movprfx z0, z4 +** asrr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_DUAL_Z (asr_s64_x_untied, svint64_t, svuint64_t, + z0 = svasr_s64_x (p0, z1, z4), + z0 = svasr_x (p0, z1, z4)) + +/* +** asr_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** asr z0\.d, p0/m, 
z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_x0_s64_x_tied1, svint64_t, uint64_t, + z0 = svasr_n_s64_x (p0, z0, x0), + z0 = svasr_x (p0, z0, x0)) + +/* +** asr_x0_s64_x_untied: +** mov z0\.d, x0 +** asrr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (asr_x0_s64_x_untied, svint64_t, uint64_t, + z0 = svasr_n_s64_x (p0, z1, x0), + z0 = svasr_x (p0, z1, x0)) + +/* +** asr_1_s64_x_tied1: +** asr z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s64_x_tied1, svint64_t, + z0 = svasr_n_s64_x (p0, z0, 1), + z0 = svasr_x (p0, z0, 1)) + +/* +** asr_1_s64_x_untied: +** asr z0\.d, z1\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s64_x_untied, svint64_t, + z0 = svasr_n_s64_x (p0, z1, 1), + z0 = svasr_x (p0, z1, 1)) + +/* +** asr_63_s64_x_tied1: +** asr z0\.d, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (asr_63_s64_x_tied1, svint64_t, + z0 = svasr_n_s64_x (p0, z0, 63), + z0 = svasr_x (p0, z0, 63)) + +/* +** asr_63_s64_x_untied: +** asr z0\.d, z1\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (asr_63_s64_x_untied, svint64_t, + z0 = svasr_n_s64_x (p0, z1, 63), + z0 = svasr_x (p0, z1, 63)) + +/* +** asr_64_s64_x_tied1: +** asr z0\.d, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asr_64_s64_x_tied1, svint64_t, + z0 = svasr_n_s64_x (p0, z0, 64), + z0 = svasr_x (p0, z0, 64)) + +/* +** asr_64_s64_x_untied: +** asr z0\.d, z1\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asr_64_s64_x_untied, svint64_t, + z0 = svasr_n_s64_x (p0, z1, 64), + z0 = svasr_x (p0, z1, 64)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s8.c new file mode 100644 index 000000000..992e93fde --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_s8.c @@ -0,0 +1,340 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asr_s8_m_tied1: +** asr z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (asr_s8_m_tied1, svint8_t, svuint8_t, + z0 = svasr_s8_m (p0, z0, z4), + z0 = svasr_m (p0, z0, z4)) + +/* +** asr_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** asr z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_DUAL_Z_REV (asr_s8_m_tied2, svint8_t, svuint8_t, + z0_res = svasr_s8_m (p0, z4, z0), + z0_res = svasr_m (p0, z4, z0)) + +/* +** asr_s8_m_untied: +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (asr_s8_m_untied, svint8_t, svuint8_t, + z0 = svasr_s8_m (p0, z1, z4), + z0 = svasr_m (p0, z1, z4)) + +/* +** asr_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s8_m_tied1, svint8_t, uint8_t, + z0 = svasr_n_s8_m (p0, z0, x0), + z0 = svasr_m (p0, z0, x0)) + +/* +** asr_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s8_m_untied, svint8_t, uint8_t, + z0 = svasr_n_s8_m (p0, z1, x0), + z0 = svasr_m (p0, z1, x0)) + +/* +** asr_1_s8_m_tied1: +** asr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s8_m_tied1, svint8_t, + z0 = svasr_n_s8_m (p0, z0, 1), + z0 = svasr_m (p0, z0, 1)) + +/* +** asr_1_s8_m_untied: +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s8_m_untied, svint8_t, + z0 = svasr_n_s8_m (p0, z1, 1), + z0 = svasr_m (p0, z1, 1)) + +/* +** asr_7_s8_m_tied1: +** asr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_7_s8_m_tied1, svint8_t, + z0 = svasr_n_s8_m (p0, z0, 7), + z0 = svasr_m (p0, z0, 7)) + +/* +** asr_7_s8_m_untied: +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, #7 +** ret +*/ 
+TEST_UNIFORM_Z (asr_7_s8_m_untied, svint8_t, + z0 = svasr_n_s8_m (p0, z1, 7), + z0 = svasr_m (p0, z1, 7)) + +/* +** asr_8_s8_m_tied1: +** asr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_8_s8_m_tied1, svint8_t, + z0 = svasr_n_s8_m (p0, z0, 8), + z0 = svasr_m (p0, z0, 8)) + +/* +** asr_8_s8_m_untied: +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_8_s8_m_untied, svint8_t, + z0 = svasr_n_s8_m (p0, z1, 8), + z0 = svasr_m (p0, z1, 8)) + +/* +** asr_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (asr_s8_z_tied1, svint8_t, svuint8_t, + z0 = svasr_s8_z (p0, z0, z4), + z0 = svasr_z (p0, z0, z4)) + +/* +** asr_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** asrr z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z_REV (asr_s8_z_tied2, svint8_t, svuint8_t, + z0_res = svasr_s8_z (p0, z4, z0), + z0_res = svasr_z (p0, z4, z0)) + +/* +** asr_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, z4\.b +** | +** movprfx z0\.b, p0/z, z4\.b +** asrr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_DUAL_Z (asr_s8_z_untied, svint8_t, svuint8_t, + z0 = svasr_s8_z (p0, z1, z4), + z0 = svasr_z (p0, z1, z4)) + +/* +** asr_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s8_z_tied1, svint8_t, uint8_t, + z0 = svasr_n_s8_z (p0, z0, x0), + z0 = svasr_z (p0, z0, x0)) + +/* +** asr_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** asrr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s8_z_untied, svint8_t, uint8_t, + z0 = svasr_n_s8_z (p0, z1, x0), + z0 = svasr_z (p0, z1, x0)) + +/* +** asr_1_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s8_z_tied1, svint8_t, + z0 = svasr_n_s8_z (p0, z0, 1), + z0 = svasr_z (p0, z0, 1)) + +/* +** asr_1_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s8_z_untied, svint8_t, + z0 = svasr_n_s8_z (p0, z1, 1), + z0 = svasr_z (p0, z1, 1)) + +/* +** asr_7_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_7_s8_z_tied1, svint8_t, + z0 = svasr_n_s8_z (p0, z0, 7), + z0 = svasr_z (p0, z0, 7)) + +/* +** asr_7_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_7_s8_z_untied, svint8_t, + z0 = svasr_n_s8_z (p0, z1, 7), + z0 = svasr_z (p0, z1, 7)) + +/* +** asr_8_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_8_s8_z_tied1, svint8_t, + z0 = svasr_n_s8_z (p0, z0, 8), + z0 = svasr_z (p0, z0, 8)) + +/* +** asr_8_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_8_s8_z_untied, svint8_t, + z0 = svasr_n_s8_z (p0, z1, 8), + z0 = svasr_z (p0, z1, 8)) + +/* +** asr_s8_x_tied1: +** asr z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (asr_s8_x_tied1, svint8_t, svuint8_t, + z0 = svasr_s8_x (p0, z0, z4), + z0 = svasr_x (p0, z0, z4)) + +/* +** asr_s8_x_tied2: +** asrr z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z_REV (asr_s8_x_tied2, svint8_t, svuint8_t, + z0_res = svasr_s8_x (p0, z4, z0), + z0_res = svasr_x (p0, z4, z0)) + +/* +** asr_s8_x_untied: +** ( +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, z4\.b +** | +** movprfx z0, z4 +** 
asrr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_DUAL_Z (asr_s8_x_untied, svint8_t, svuint8_t, + z0 = svasr_s8_x (p0, z1, z4), + z0 = svasr_x (p0, z1, z4)) + +/* +** asr_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s8_x_tied1, svint8_t, uint8_t, + z0 = svasr_n_s8_x (p0, z0, x0), + z0 = svasr_x (p0, z0, x0)) + +/* +** asr_w0_s8_x_untied: +** mov z0\.b, w0 +** asrr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (asr_w0_s8_x_untied, svint8_t, uint8_t, + z0 = svasr_n_s8_x (p0, z1, x0), + z0 = svasr_x (p0, z1, x0)) + +/* +** asr_1_s8_x_tied1: +** asr z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s8_x_tied1, svint8_t, + z0 = svasr_n_s8_x (p0, z0, 1), + z0 = svasr_x (p0, z0, 1)) + +/* +** asr_1_s8_x_untied: +** asr z0\.b, z1\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_1_s8_x_untied, svint8_t, + z0 = svasr_n_s8_x (p0, z1, 1), + z0 = svasr_x (p0, z1, 1)) + +/* +** asr_7_s8_x_tied1: +** asr z0\.b, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_7_s8_x_tied1, svint8_t, + z0 = svasr_n_s8_x (p0, z0, 7), + z0 = svasr_x (p0, z0, 7)) + +/* +** asr_7_s8_x_untied: +** asr z0\.b, z1\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_7_s8_x_untied, svint8_t, + z0 = svasr_n_s8_x (p0, z1, 7), + z0 = svasr_x (p0, z1, 7)) + +/* +** asr_8_s8_x_tied1: +** asr z0\.b, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_8_s8_x_tied1, svint8_t, + z0 = svasr_n_s8_x (p0, z0, 8), + z0 = svasr_x (p0, z0, 8)) + +/* +** asr_8_s8_x_untied: +** asr z0\.b, z1\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_8_s8_x_untied, svint8_t, + z0 = svasr_n_s8_x (p0, z1, 8), + z0 = svasr_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s16.c new file mode 100644 index 000000000..b74ae33e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s16.c @@ -0,0 +1,325 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asr_wide_s16_m_tied1: +** asr z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s16_m_tied1, svint16_t, svuint64_t, + z0 = svasr_wide_s16_m (p0, z0, z4), + z0 = svasr_wide_m (p0, z0, z4)) + +/* +** asr_wide_s16_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s16_m_tied2, svint16_t, svuint64_t, + z0_res = svasr_wide_s16_m (p0, z4, z0), + z0_res = svasr_wide_m (p0, z4, z0)) + +/* +** asr_wide_s16_m_untied: +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s16_m_untied, svint16_t, svuint64_t, + z0 = svasr_wide_s16_m (p0, z1, z4), + z0 = svasr_wide_m (p0, z1, z4)) + +/* +** asr_wide_x0_s16_m_tied1: +** mov (z[0-9]+\.d), x0 +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s16_m_tied1, svint16_t, uint64_t, + z0 = svasr_wide_n_s16_m (p0, z0, x0), + z0 = svasr_wide_m (p0, z0, x0)) + +/* +** asr_wide_x0_s16_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s16_m_untied, svint16_t, uint64_t, + z0 = svasr_wide_n_s16_m (p0, z1, x0), + z0 = svasr_wide_m (p0, z1, x0)) + +/* +** asr_wide_1_s16_m_tied1: +** asr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s16_m_tied1, svint16_t, + z0 = svasr_wide_n_s16_m (p0, z0, 1), + z0 = svasr_wide_m (p0, z0, 1)) + +/* +** asr_wide_1_s16_m_untied: +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s16_m_untied, 
svint16_t, + z0 = svasr_wide_n_s16_m (p0, z1, 1), + z0 = svasr_wide_m (p0, z1, 1)) + +/* +** asr_wide_15_s16_m_tied1: +** asr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_15_s16_m_tied1, svint16_t, + z0 = svasr_wide_n_s16_m (p0, z0, 15), + z0 = svasr_wide_m (p0, z0, 15)) + +/* +** asr_wide_15_s16_m_untied: +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_15_s16_m_untied, svint16_t, + z0 = svasr_wide_n_s16_m (p0, z1, 15), + z0 = svasr_wide_m (p0, z1, 15)) + +/* +** asr_wide_16_s16_m_tied1: +** asr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_16_s16_m_tied1, svint16_t, + z0 = svasr_wide_n_s16_m (p0, z0, 16), + z0 = svasr_wide_m (p0, z0, 16)) + +/* +** asr_wide_16_s16_m_untied: +** movprfx z0, z1 +** asr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_16_s16_m_untied, svint16_t, + z0 = svasr_wide_n_s16_m (p0, z1, 16), + z0 = svasr_wide_m (p0, z1, 16)) + +/* +** asr_wide_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s16_z_tied1, svint16_t, svuint64_t, + z0 = svasr_wide_s16_z (p0, z0, z4), + z0 = svasr_wide_z (p0, z0, z4)) + +/* +** asr_wide_s16_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.h, p0/z, z4\.h +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s16_z_tied2, svint16_t, svuint64_t, + z0_res = svasr_wide_s16_z (p0, z4, z0), + z0_res = svasr_wide_z (p0, z4, z0)) + +/* +** asr_wide_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s16_z_untied, svint16_t, svuint64_t, + z0 = svasr_wide_s16_z (p0, z1, z4), + z0 = svasr_wide_z (p0, z1, z4)) + +/* +** asr_wide_x0_s16_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s16_z_tied1, svint16_t, uint64_t, + z0 = svasr_wide_n_s16_z (p0, z0, x0), + z0 = svasr_wide_z (p0, z0, x0)) + +/* +** asr_wide_x0_s16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s16_z_untied, svint16_t, uint64_t, + z0 = svasr_wide_n_s16_z (p0, z1, x0), + z0 = svasr_wide_z (p0, z1, x0)) + +/* +** asr_wide_1_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s16_z_tied1, svint16_t, + z0 = svasr_wide_n_s16_z (p0, z0, 1), + z0 = svasr_wide_z (p0, z0, 1)) + +/* +** asr_wide_1_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s16_z_untied, svint16_t, + z0 = svasr_wide_n_s16_z (p0, z1, 1), + z0 = svasr_wide_z (p0, z1, 1)) + +/* +** asr_wide_15_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_15_s16_z_tied1, svint16_t, + z0 = svasr_wide_n_s16_z (p0, z0, 15), + z0 = svasr_wide_z (p0, z0, 15)) + +/* +** asr_wide_15_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_15_s16_z_untied, svint16_t, + z0 = svasr_wide_n_s16_z (p0, z1, 15), + z0 = svasr_wide_z (p0, z1, 15)) + +/* +** asr_wide_16_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_16_s16_z_tied1, svint16_t, + z0 = svasr_wide_n_s16_z (p0, z0, 16), + z0 = svasr_wide_z (p0, z0, 16)) + +/* +** asr_wide_16_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asr z0\.h, p0/m, z0\.h, #16 +** 
ret +*/ +TEST_UNIFORM_Z (asr_wide_16_s16_z_untied, svint16_t, + z0 = svasr_wide_n_s16_z (p0, z1, 16), + z0 = svasr_wide_z (p0, z1, 16)) + +/* +** asr_wide_s16_x_tied1: +** asr z0\.h, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s16_x_tied1, svint16_t, svuint64_t, + z0 = svasr_wide_s16_x (p0, z0, z4), + z0 = svasr_wide_x (p0, z0, z4)) + +/* +** asr_wide_s16_x_tied2: +** asr z0\.h, z4\.h, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s16_x_tied2, svint16_t, svuint64_t, + z0_res = svasr_wide_s16_x (p0, z4, z0), + z0_res = svasr_wide_x (p0, z4, z0)) + +/* +** asr_wide_s16_x_untied: +** asr z0\.h, z1\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s16_x_untied, svint16_t, svuint64_t, + z0 = svasr_wide_s16_x (p0, z1, z4), + z0 = svasr_wide_x (p0, z1, z4)) + +/* +** asr_wide_x0_s16_x_tied1: +** mov (z[0-9]+\.d), x0 +** asr z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s16_x_tied1, svint16_t, uint64_t, + z0 = svasr_wide_n_s16_x (p0, z0, x0), + z0 = svasr_wide_x (p0, z0, x0)) + +/* +** asr_wide_x0_s16_x_untied: +** mov (z[0-9]+\.d), x0 +** asr z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s16_x_untied, svint16_t, uint64_t, + z0 = svasr_wide_n_s16_x (p0, z1, x0), + z0 = svasr_wide_x (p0, z1, x0)) + +/* +** asr_wide_1_s16_x_tied1: +** asr z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s16_x_tied1, svint16_t, + z0 = svasr_wide_n_s16_x (p0, z0, 1), + z0 = svasr_wide_x (p0, z0, 1)) + +/* +** asr_wide_1_s16_x_untied: +** asr z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s16_x_untied, svint16_t, + z0 = svasr_wide_n_s16_x (p0, z1, 1), + z0 = svasr_wide_x (p0, z1, 1)) + +/* +** asr_wide_15_s16_x_tied1: +** asr z0\.h, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_15_s16_x_tied1, svint16_t, + z0 = svasr_wide_n_s16_x (p0, z0, 15), + z0 = svasr_wide_x (p0, z0, 15)) + +/* +** asr_wide_15_s16_x_untied: +** asr z0\.h, z1\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_15_s16_x_untied, svint16_t, + z0 = svasr_wide_n_s16_x (p0, z1, 15), + z0 = svasr_wide_x (p0, z1, 15)) + +/* +** asr_wide_16_s16_x_tied1: +** asr z0\.h, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_16_s16_x_tied1, svint16_t, + z0 = svasr_wide_n_s16_x (p0, z0, 16), + z0 = svasr_wide_x (p0, z0, 16)) + +/* +** asr_wide_16_s16_x_untied: +** asr z0\.h, z1\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_16_s16_x_untied, svint16_t, + z0 = svasr_wide_n_s16_x (p0, z1, 16), + z0 = svasr_wide_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s32.c new file mode 100644 index 000000000..8698aef26 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s32.c @@ -0,0 +1,325 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asr_wide_s32_m_tied1: +** asr z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s32_m_tied1, svint32_t, svuint64_t, + z0 = svasr_wide_s32_m (p0, z0, z4), + z0 = svasr_wide_m (p0, z0, z4)) + +/* +** asr_wide_s32_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s32_m_tied2, svint32_t, svuint64_t, + z0_res = svasr_wide_s32_m (p0, z4, z0), + z0_res = svasr_wide_m (p0, z4, z0)) + +/* +** asr_wide_s32_m_untied: +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s32_m_untied, svint32_t, svuint64_t, + z0 = svasr_wide_s32_m (p0, z1, z4), + z0 = svasr_wide_m (p0, z1, z4)) + +/* +** asr_wide_x0_s32_m_tied1: +** 
mov (z[0-9]+\.d), x0 +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s32_m_tied1, svint32_t, uint64_t, + z0 = svasr_wide_n_s32_m (p0, z0, x0), + z0 = svasr_wide_m (p0, z0, x0)) + +/* +** asr_wide_x0_s32_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s32_m_untied, svint32_t, uint64_t, + z0 = svasr_wide_n_s32_m (p0, z1, x0), + z0 = svasr_wide_m (p0, z1, x0)) + +/* +** asr_wide_1_s32_m_tied1: +** asr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s32_m_tied1, svint32_t, + z0 = svasr_wide_n_s32_m (p0, z0, 1), + z0 = svasr_wide_m (p0, z0, 1)) + +/* +** asr_wide_1_s32_m_untied: +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s32_m_untied, svint32_t, + z0 = svasr_wide_n_s32_m (p0, z1, 1), + z0 = svasr_wide_m (p0, z1, 1)) + +/* +** asr_wide_31_s32_m_tied1: +** asr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_31_s32_m_tied1, svint32_t, + z0 = svasr_wide_n_s32_m (p0, z0, 31), + z0 = svasr_wide_m (p0, z0, 31)) + +/* +** asr_wide_31_s32_m_untied: +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_31_s32_m_untied, svint32_t, + z0 = svasr_wide_n_s32_m (p0, z1, 31), + z0 = svasr_wide_m (p0, z1, 31)) + +/* +** asr_wide_32_s32_m_tied1: +** asr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_32_s32_m_tied1, svint32_t, + z0 = svasr_wide_n_s32_m (p0, z0, 32), + z0 = svasr_wide_m (p0, z0, 32)) + +/* +** asr_wide_32_s32_m_untied: +** movprfx z0, z1 +** asr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_32_s32_m_untied, svint32_t, + z0 = svasr_wide_n_s32_m (p0, z1, 32), + z0 = svasr_wide_m (p0, z1, 32)) + +/* +** asr_wide_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s32_z_tied1, svint32_t, svuint64_t, + z0 = svasr_wide_s32_z (p0, z0, z4), + z0 = svasr_wide_z (p0, z0, z4)) + +/* +** asr_wide_s32_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.s, p0/z, z4\.s +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s32_z_tied2, svint32_t, svuint64_t, + z0_res = svasr_wide_s32_z (p0, z4, z0), + z0_res = svasr_wide_z (p0, z4, z0)) + +/* +** asr_wide_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s32_z_untied, svint32_t, svuint64_t, + z0 = svasr_wide_s32_z (p0, z1, z4), + z0 = svasr_wide_z (p0, z1, z4)) + +/* +** asr_wide_x0_s32_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s32_z_tied1, svint32_t, uint64_t, + z0 = svasr_wide_n_s32_z (p0, z0, x0), + z0 = svasr_wide_z (p0, z0, x0)) + +/* +** asr_wide_x0_s32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s32_z_untied, svint32_t, uint64_t, + z0 = svasr_wide_n_s32_z (p0, z1, x0), + z0 = svasr_wide_z (p0, z1, x0)) + +/* +** asr_wide_1_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s32_z_tied1, svint32_t, + z0 = svasr_wide_n_s32_z (p0, z0, 1), + z0 = svasr_wide_z (p0, z0, 1)) + +/* +** asr_wide_1_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s32_z_untied, svint32_t, + z0 = svasr_wide_n_s32_z (p0, z1, 1), + z0 = svasr_wide_z (p0, z1, 1)) + +/* +** 
asr_wide_31_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_31_s32_z_tied1, svint32_t, + z0 = svasr_wide_n_s32_z (p0, z0, 31), + z0 = svasr_wide_z (p0, z0, 31)) + +/* +** asr_wide_31_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_31_s32_z_untied, svint32_t, + z0 = svasr_wide_n_s32_z (p0, z1, 31), + z0 = svasr_wide_z (p0, z1, 31)) + +/* +** asr_wide_32_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_32_s32_z_tied1, svint32_t, + z0 = svasr_wide_n_s32_z (p0, z0, 32), + z0 = svasr_wide_z (p0, z0, 32)) + +/* +** asr_wide_32_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_32_s32_z_untied, svint32_t, + z0 = svasr_wide_n_s32_z (p0, z1, 32), + z0 = svasr_wide_z (p0, z1, 32)) + +/* +** asr_wide_s32_x_tied1: +** asr z0\.s, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s32_x_tied1, svint32_t, svuint64_t, + z0 = svasr_wide_s32_x (p0, z0, z4), + z0 = svasr_wide_x (p0, z0, z4)) + +/* +** asr_wide_s32_x_tied2: +** asr z0\.s, z4\.s, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s32_x_tied2, svint32_t, svuint64_t, + z0_res = svasr_wide_s32_x (p0, z4, z0), + z0_res = svasr_wide_x (p0, z4, z0)) + +/* +** asr_wide_s32_x_untied: +** asr z0\.s, z1\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s32_x_untied, svint32_t, svuint64_t, + z0 = svasr_wide_s32_x (p0, z1, z4), + z0 = svasr_wide_x (p0, z1, z4)) + +/* +** asr_wide_x0_s32_x_tied1: +** mov (z[0-9]+\.d), x0 +** asr z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s32_x_tied1, svint32_t, uint64_t, + z0 = svasr_wide_n_s32_x (p0, z0, x0), + z0 = svasr_wide_x (p0, z0, x0)) + +/* +** asr_wide_x0_s32_x_untied: +** mov (z[0-9]+\.d), x0 +** asr z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s32_x_untied, svint32_t, uint64_t, + z0 = svasr_wide_n_s32_x (p0, z1, x0), + z0 = svasr_wide_x (p0, z1, x0)) + +/* +** asr_wide_1_s32_x_tied1: +** asr z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s32_x_tied1, svint32_t, + z0 = svasr_wide_n_s32_x (p0, z0, 1), + z0 = svasr_wide_x (p0, z0, 1)) + +/* +** asr_wide_1_s32_x_untied: +** asr z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s32_x_untied, svint32_t, + z0 = svasr_wide_n_s32_x (p0, z1, 1), + z0 = svasr_wide_x (p0, z1, 1)) + +/* +** asr_wide_31_s32_x_tied1: +** asr z0\.s, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_31_s32_x_tied1, svint32_t, + z0 = svasr_wide_n_s32_x (p0, z0, 31), + z0 = svasr_wide_x (p0, z0, 31)) + +/* +** asr_wide_31_s32_x_untied: +** asr z0\.s, z1\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_31_s32_x_untied, svint32_t, + z0 = svasr_wide_n_s32_x (p0, z1, 31), + z0 = svasr_wide_x (p0, z1, 31)) + +/* +** asr_wide_32_s32_x_tied1: +** asr z0\.s, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_32_s32_x_tied1, svint32_t, + z0 = svasr_wide_n_s32_x (p0, z0, 32), + z0 = svasr_wide_x (p0, z0, 32)) + +/* +** asr_wide_32_s32_x_untied: +** asr z0\.s, z1\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_32_s32_x_untied, svint32_t, + z0 = svasr_wide_n_s32_x (p0, z1, 32), + z0 = svasr_wide_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s8.c new file mode 100644 index 000000000..77b166939 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asr_wide_s8.c @@ -0,0 +1,325 @@ +/* { dg-final { check-function-bodies 
"**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asr_wide_s8_m_tied1: +** asr z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s8_m_tied1, svint8_t, svuint64_t, + z0 = svasr_wide_s8_m (p0, z0, z4), + z0 = svasr_wide_m (p0, z0, z4)) + +/* +** asr_wide_s8_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s8_m_tied2, svint8_t, svuint64_t, + z0_res = svasr_wide_s8_m (p0, z4, z0), + z0_res = svasr_wide_m (p0, z4, z0)) + +/* +** asr_wide_s8_m_untied: +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s8_m_untied, svint8_t, svuint64_t, + z0 = svasr_wide_s8_m (p0, z1, z4), + z0 = svasr_wide_m (p0, z1, z4)) + +/* +** asr_wide_x0_s8_m_tied1: +** mov (z[0-9]+\.d), x0 +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s8_m_tied1, svint8_t, uint64_t, + z0 = svasr_wide_n_s8_m (p0, z0, x0), + z0 = svasr_wide_m (p0, z0, x0)) + +/* +** asr_wide_x0_s8_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s8_m_untied, svint8_t, uint64_t, + z0 = svasr_wide_n_s8_m (p0, z1, x0), + z0 = svasr_wide_m (p0, z1, x0)) + +/* +** asr_wide_1_s8_m_tied1: +** asr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s8_m_tied1, svint8_t, + z0 = svasr_wide_n_s8_m (p0, z0, 1), + z0 = svasr_wide_m (p0, z0, 1)) + +/* +** asr_wide_1_s8_m_untied: +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s8_m_untied, svint8_t, + z0 = svasr_wide_n_s8_m (p0, z1, 1), + z0 = svasr_wide_m (p0, z1, 1)) + +/* +** asr_wide_7_s8_m_tied1: +** asr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_7_s8_m_tied1, svint8_t, + z0 = svasr_wide_n_s8_m (p0, z0, 7), + z0 = svasr_wide_m (p0, z0, 7)) + +/* +** asr_wide_7_s8_m_untied: +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_7_s8_m_untied, svint8_t, + z0 = svasr_wide_n_s8_m (p0, z1, 7), + z0 = svasr_wide_m (p0, z1, 7)) + +/* +** asr_wide_8_s8_m_tied1: +** asr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_8_s8_m_tied1, svint8_t, + z0 = svasr_wide_n_s8_m (p0, z0, 8), + z0 = svasr_wide_m (p0, z0, 8)) + +/* +** asr_wide_8_s8_m_untied: +** movprfx z0, z1 +** asr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_8_s8_m_untied, svint8_t, + z0 = svasr_wide_n_s8_m (p0, z1, 8), + z0 = svasr_wide_m (p0, z1, 8)) + +/* +** asr_wide_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s8_z_tied1, svint8_t, svuint64_t, + z0 = svasr_wide_s8_z (p0, z0, z4), + z0 = svasr_wide_z (p0, z0, z4)) + +/* +** asr_wide_s8_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.b, p0/z, z4\.b +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s8_z_tied2, svint8_t, svuint64_t, + z0_res = svasr_wide_s8_z (p0, z4, z0), + z0_res = svasr_wide_z (p0, z4, z0)) + +/* +** asr_wide_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s8_z_untied, svint8_t, svuint64_t, + z0 = svasr_wide_s8_z (p0, z1, z4), + z0 = svasr_wide_z (p0, z1, z4)) + +/* +** asr_wide_x0_s8_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s8_z_tied1, svint8_t, uint64_t, + z0 = svasr_wide_n_s8_z (p0, z0, x0), + z0 = svasr_wide_z (p0, z0, x0)) + +/* +** asr_wide_x0_s8_z_untied: { xfail *-*-* 
} +** mov (z[0-9]+\.d), x0 +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s8_z_untied, svint8_t, uint64_t, + z0 = svasr_wide_n_s8_z (p0, z1, x0), + z0 = svasr_wide_z (p0, z1, x0)) + +/* +** asr_wide_1_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s8_z_tied1, svint8_t, + z0 = svasr_wide_n_s8_z (p0, z0, 1), + z0 = svasr_wide_z (p0, z0, 1)) + +/* +** asr_wide_1_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s8_z_untied, svint8_t, + z0 = svasr_wide_n_s8_z (p0, z1, 1), + z0 = svasr_wide_z (p0, z1, 1)) + +/* +** asr_wide_7_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_7_s8_z_tied1, svint8_t, + z0 = svasr_wide_n_s8_z (p0, z0, 7), + z0 = svasr_wide_z (p0, z0, 7)) + +/* +** asr_wide_7_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_7_s8_z_untied, svint8_t, + z0 = svasr_wide_n_s8_z (p0, z1, 7), + z0 = svasr_wide_z (p0, z1, 7)) + +/* +** asr_wide_8_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_8_s8_z_tied1, svint8_t, + z0 = svasr_wide_n_s8_z (p0, z0, 8), + z0 = svasr_wide_z (p0, z0, 8)) + +/* +** asr_wide_8_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_8_s8_z_untied, svint8_t, + z0 = svasr_wide_n_s8_z (p0, z1, 8), + z0 = svasr_wide_z (p0, z1, 8)) + +/* +** asr_wide_s8_x_tied1: +** asr z0\.b, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s8_x_tied1, svint8_t, svuint64_t, + z0 = svasr_wide_s8_x (p0, z0, z4), + z0 = svasr_wide_x (p0, z0, z4)) + +/* +** asr_wide_s8_x_tied2: +** asr z0\.b, z4\.b, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (asr_wide_s8_x_tied2, svint8_t, svuint64_t, + z0_res = svasr_wide_s8_x (p0, z4, z0), + z0_res = svasr_wide_x (p0, z4, z0)) + +/* +** asr_wide_s8_x_untied: +** asr z0\.b, z1\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (asr_wide_s8_x_untied, svint8_t, svuint64_t, + z0 = svasr_wide_s8_x (p0, z1, z4), + z0 = svasr_wide_x (p0, z1, z4)) + +/* +** asr_wide_x0_s8_x_tied1: +** mov (z[0-9]+\.d), x0 +** asr z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s8_x_tied1, svint8_t, uint64_t, + z0 = svasr_wide_n_s8_x (p0, z0, x0), + z0 = svasr_wide_x (p0, z0, x0)) + +/* +** asr_wide_x0_s8_x_untied: +** mov (z[0-9]+\.d), x0 +** asr z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (asr_wide_x0_s8_x_untied, svint8_t, uint64_t, + z0 = svasr_wide_n_s8_x (p0, z1, x0), + z0 = svasr_wide_x (p0, z1, x0)) + +/* +** asr_wide_1_s8_x_tied1: +** asr z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s8_x_tied1, svint8_t, + z0 = svasr_wide_n_s8_x (p0, z0, 1), + z0 = svasr_wide_x (p0, z0, 1)) + +/* +** asr_wide_1_s8_x_untied: +** asr z0\.b, z1\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_1_s8_x_untied, svint8_t, + z0 = svasr_wide_n_s8_x (p0, z1, 1), + z0 = svasr_wide_x (p0, z1, 1)) + +/* +** asr_wide_7_s8_x_tied1: +** asr z0\.b, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_7_s8_x_tied1, svint8_t, + z0 = svasr_wide_n_s8_x (p0, z0, 7), + z0 = svasr_wide_x (p0, z0, 7)) + +/* +** asr_wide_7_s8_x_untied: +** asr z0\.b, z1\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_7_s8_x_untied, svint8_t, + z0 = svasr_wide_n_s8_x (p0, z1, 7), + z0 = svasr_wide_x (p0, z1, 7)) + +/* +** asr_wide_8_s8_x_tied1: +** asr z0\.b, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z 
(asr_wide_8_s8_x_tied1, svint8_t, + z0 = svasr_wide_n_s8_x (p0, z0, 8), + z0 = svasr_wide_x (p0, z0, 8)) + +/* +** asr_wide_8_s8_x_untied: +** asr z0\.b, z1\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asr_wide_8_s8_x_untied, svint8_t, + z0 = svasr_wide_n_s8_x (p0, z1, 8), + z0 = svasr_wide_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s16.c new file mode 100644 index 000000000..40bbce042 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s16.c @@ -0,0 +1,177 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asrd_1_s16_m_tied1: +** asrd z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s16_m_tied1, svint16_t, + z0 = svasrd_n_s16_m (p0, z0, 1), + z0 = svasrd_m (p0, z0, 1)) + +/* +** asrd_1_s16_m_untied: +** movprfx z0, z1 +** asrd z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s16_m_untied, svint16_t, + z0 = svasrd_n_s16_m (p0, z1, 1), + z0 = svasrd_m (p0, z1, 1)) + +/* +** asrd_2_s16_m_tied1: +** asrd z0\.h, p0/m, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s16_m_tied1, svint16_t, + z0 = svasrd_n_s16_m (p0, z0, 2), + z0 = svasrd_m (p0, z0, 2)) + +/* +** asrd_2_s16_m_untied: +** movprfx z0, z1 +** asrd z0\.h, p0/m, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s16_m_untied, svint16_t, + z0 = svasrd_n_s16_m (p0, z1, 2), + z0 = svasrd_m (p0, z1, 2)) + +/* +** asrd_16_s16_m_tied1: +** asrd z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asrd_16_s16_m_tied1, svint16_t, + z0 = svasrd_n_s16_m (p0, z0, 16), + z0 = svasrd_m (p0, z0, 16)) + +/* +** asrd_16_s16_m_untied: +** movprfx z0, z1 +** asrd z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asrd_16_s16_m_untied, svint16_t, + z0 = svasrd_n_s16_m (p0, z1, 16), + z0 = svasrd_m (p0, z1, 16)) + +/* +** asrd_1_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asrd z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s16_z_tied1, svint16_t, + z0 = svasrd_n_s16_z (p0, z0, 1), + z0 = svasrd_z (p0, z0, 1)) + +/* +** asrd_1_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asrd z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s16_z_untied, svint16_t, + z0 = svasrd_n_s16_z (p0, z1, 1), + z0 = svasrd_z (p0, z1, 1)) + +/* +** asrd_2_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asrd z0\.h, p0/m, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s16_z_tied1, svint16_t, + z0 = svasrd_n_s16_z (p0, z0, 2), + z0 = svasrd_z (p0, z0, 2)) + +/* +** asrd_2_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asrd z0\.h, p0/m, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s16_z_untied, svint16_t, + z0 = svasrd_n_s16_z (p0, z1, 2), + z0 = svasrd_z (p0, z1, 2)) + +/* +** asrd_16_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** asrd z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asrd_16_s16_z_tied1, svint16_t, + z0 = svasrd_n_s16_z (p0, z0, 16), + z0 = svasrd_z (p0, z0, 16)) + +/* +** asrd_16_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** asrd z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asrd_16_s16_z_untied, svint16_t, + z0 = svasrd_n_s16_z (p0, z1, 16), + z0 = svasrd_z (p0, z1, 16)) + +/* +** asrd_1_s16_x_tied1: +** asrd z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s16_x_tied1, svint16_t, + z0 = svasrd_n_s16_x (p0, z0, 1), + z0 = svasrd_x (p0, z0, 1)) + +/* +** asrd_1_s16_x_untied: +** movprfx z0, z1 +** asrd z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s16_x_untied, svint16_t, + z0 = svasrd_n_s16_x (p0, z1, 1), + z0 = svasrd_x 
(p0, z1, 1)) + +/* +** asrd_2_s16_x_tied1: +** asrd z0\.h, p0/m, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s16_x_tied1, svint16_t, + z0 = svasrd_n_s16_x (p0, z0, 2), + z0 = svasrd_x (p0, z0, 2)) + +/* +** asrd_2_s16_x_untied: +** movprfx z0, z1 +** asrd z0\.h, p0/m, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s16_x_untied, svint16_t, + z0 = svasrd_n_s16_x (p0, z1, 2), + z0 = svasrd_x (p0, z1, 2)) + +/* +** asrd_16_s16_x_tied1: +** asrd z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asrd_16_s16_x_tied1, svint16_t, + z0 = svasrd_n_s16_x (p0, z0, 16), + z0 = svasrd_x (p0, z0, 16)) + +/* +** asrd_16_s16_x_untied: +** movprfx z0, z1 +** asrd z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (asrd_16_s16_x_untied, svint16_t, + z0 = svasrd_n_s16_x (p0, z1, 16), + z0 = svasrd_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s32.c new file mode 100644 index 000000000..0760b03de --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s32.c @@ -0,0 +1,177 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asrd_1_s32_m_tied1: +** asrd z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s32_m_tied1, svint32_t, + z0 = svasrd_n_s32_m (p0, z0, 1), + z0 = svasrd_m (p0, z0, 1)) + +/* +** asrd_1_s32_m_untied: +** movprfx z0, z1 +** asrd z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s32_m_untied, svint32_t, + z0 = svasrd_n_s32_m (p0, z1, 1), + z0 = svasrd_m (p0, z1, 1)) + +/* +** asrd_2_s32_m_tied1: +** asrd z0\.s, p0/m, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s32_m_tied1, svint32_t, + z0 = svasrd_n_s32_m (p0, z0, 2), + z0 = svasrd_m (p0, z0, 2)) + +/* +** asrd_2_s32_m_untied: +** movprfx z0, z1 +** asrd z0\.s, p0/m, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s32_m_untied, svint32_t, + z0 = svasrd_n_s32_m (p0, z1, 2), + z0 = svasrd_m (p0, z1, 2)) + +/* +** asrd_32_s32_m_tied1: +** asrd z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asrd_32_s32_m_tied1, svint32_t, + z0 = svasrd_n_s32_m (p0, z0, 32), + z0 = svasrd_m (p0, z0, 32)) + +/* +** asrd_32_s32_m_untied: +** movprfx z0, z1 +** asrd z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asrd_32_s32_m_untied, svint32_t, + z0 = svasrd_n_s32_m (p0, z1, 32), + z0 = svasrd_m (p0, z1, 32)) + +/* +** asrd_1_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asrd z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s32_z_tied1, svint32_t, + z0 = svasrd_n_s32_z (p0, z0, 1), + z0 = svasrd_z (p0, z0, 1)) + +/* +** asrd_1_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asrd z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s32_z_untied, svint32_t, + z0 = svasrd_n_s32_z (p0, z1, 1), + z0 = svasrd_z (p0, z1, 1)) + +/* +** asrd_2_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asrd z0\.s, p0/m, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s32_z_tied1, svint32_t, + z0 = svasrd_n_s32_z (p0, z0, 2), + z0 = svasrd_z (p0, z0, 2)) + +/* +** asrd_2_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asrd z0\.s, p0/m, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s32_z_untied, svint32_t, + z0 = svasrd_n_s32_z (p0, z1, 2), + z0 = svasrd_z (p0, z1, 2)) + +/* +** asrd_32_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** asrd z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asrd_32_s32_z_tied1, svint32_t, + z0 = svasrd_n_s32_z (p0, z0, 32), + z0 = svasrd_z (p0, z0, 32)) + +/* +** asrd_32_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** asrd z0\.s, p0/m, z0\.s, 
#32 +** ret +*/ +TEST_UNIFORM_Z (asrd_32_s32_z_untied, svint32_t, + z0 = svasrd_n_s32_z (p0, z1, 32), + z0 = svasrd_z (p0, z1, 32)) + +/* +** asrd_1_s32_x_tied1: +** asrd z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s32_x_tied1, svint32_t, + z0 = svasrd_n_s32_x (p0, z0, 1), + z0 = svasrd_x (p0, z0, 1)) + +/* +** asrd_1_s32_x_untied: +** movprfx z0, z1 +** asrd z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s32_x_untied, svint32_t, + z0 = svasrd_n_s32_x (p0, z1, 1), + z0 = svasrd_x (p0, z1, 1)) + +/* +** asrd_2_s32_x_tied1: +** asrd z0\.s, p0/m, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s32_x_tied1, svint32_t, + z0 = svasrd_n_s32_x (p0, z0, 2), + z0 = svasrd_x (p0, z0, 2)) + +/* +** asrd_2_s32_x_untied: +** movprfx z0, z1 +** asrd z0\.s, p0/m, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s32_x_untied, svint32_t, + z0 = svasrd_n_s32_x (p0, z1, 2), + z0 = svasrd_x (p0, z1, 2)) + +/* +** asrd_32_s32_x_tied1: +** asrd z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asrd_32_s32_x_tied1, svint32_t, + z0 = svasrd_n_s32_x (p0, z0, 32), + z0 = svasrd_x (p0, z0, 32)) + +/* +** asrd_32_s32_x_untied: +** movprfx z0, z1 +** asrd z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (asrd_32_s32_x_untied, svint32_t, + z0 = svasrd_n_s32_x (p0, z1, 32), + z0 = svasrd_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s64.c new file mode 100644 index 000000000..0ef26c9fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s64.c @@ -0,0 +1,177 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asrd_1_s64_m_tied1: +** asrd z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s64_m_tied1, svint64_t, + z0 = svasrd_n_s64_m (p0, z0, 1), + z0 = svasrd_m (p0, z0, 1)) + +/* +** asrd_1_s64_m_untied: +** movprfx z0, z1 +** asrd z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s64_m_untied, svint64_t, + z0 = svasrd_n_s64_m (p0, z1, 1), + z0 = svasrd_m (p0, z1, 1)) + +/* +** asrd_2_s64_m_tied1: +** asrd z0\.d, p0/m, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s64_m_tied1, svint64_t, + z0 = svasrd_n_s64_m (p0, z0, 2), + z0 = svasrd_m (p0, z0, 2)) + +/* +** asrd_2_s64_m_untied: +** movprfx z0, z1 +** asrd z0\.d, p0/m, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s64_m_untied, svint64_t, + z0 = svasrd_n_s64_m (p0, z1, 2), + z0 = svasrd_m (p0, z1, 2)) + +/* +** asrd_64_s64_m_tied1: +** asrd z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asrd_64_s64_m_tied1, svint64_t, + z0 = svasrd_n_s64_m (p0, z0, 64), + z0 = svasrd_m (p0, z0, 64)) + +/* +** asrd_64_s64_m_untied: +** movprfx z0, z1 +** asrd z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asrd_64_s64_m_untied, svint64_t, + z0 = svasrd_n_s64_m (p0, z1, 64), + z0 = svasrd_m (p0, z1, 64)) + +/* +** asrd_1_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** asrd z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s64_z_tied1, svint64_t, + z0 = svasrd_n_s64_z (p0, z0, 1), + z0 = svasrd_z (p0, z0, 1)) + +/* +** asrd_1_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** asrd z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s64_z_untied, svint64_t, + z0 = svasrd_n_s64_z (p0, z1, 1), + z0 = svasrd_z (p0, z1, 1)) + +/* +** asrd_2_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** asrd z0\.d, p0/m, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s64_z_tied1, svint64_t, + z0 = svasrd_n_s64_z (p0, z0, 2), + z0 = svasrd_z (p0, z0, 2)) + +/* +** 
asrd_2_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** asrd z0\.d, p0/m, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s64_z_untied, svint64_t, + z0 = svasrd_n_s64_z (p0, z1, 2), + z0 = svasrd_z (p0, z1, 2)) + +/* +** asrd_64_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** asrd z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asrd_64_s64_z_tied1, svint64_t, + z0 = svasrd_n_s64_z (p0, z0, 64), + z0 = svasrd_z (p0, z0, 64)) + +/* +** asrd_64_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** asrd z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asrd_64_s64_z_untied, svint64_t, + z0 = svasrd_n_s64_z (p0, z1, 64), + z0 = svasrd_z (p0, z1, 64)) + +/* +** asrd_1_s64_x_tied1: +** asrd z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s64_x_tied1, svint64_t, + z0 = svasrd_n_s64_x (p0, z0, 1), + z0 = svasrd_x (p0, z0, 1)) + +/* +** asrd_1_s64_x_untied: +** movprfx z0, z1 +** asrd z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s64_x_untied, svint64_t, + z0 = svasrd_n_s64_x (p0, z1, 1), + z0 = svasrd_x (p0, z1, 1)) + +/* +** asrd_2_s64_x_tied1: +** asrd z0\.d, p0/m, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s64_x_tied1, svint64_t, + z0 = svasrd_n_s64_x (p0, z0, 2), + z0 = svasrd_x (p0, z0, 2)) + +/* +** asrd_2_s64_x_untied: +** movprfx z0, z1 +** asrd z0\.d, p0/m, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s64_x_untied, svint64_t, + z0 = svasrd_n_s64_x (p0, z1, 2), + z0 = svasrd_x (p0, z1, 2)) + +/* +** asrd_64_s64_x_tied1: +** asrd z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asrd_64_s64_x_tied1, svint64_t, + z0 = svasrd_n_s64_x (p0, z0, 64), + z0 = svasrd_x (p0, z0, 64)) + +/* +** asrd_64_s64_x_untied: +** movprfx z0, z1 +** asrd z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (asrd_64_s64_x_untied, svint64_t, + z0 = svasrd_n_s64_x (p0, z1, 64), + z0 = svasrd_x (p0, z1, 64)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s8.c new file mode 100644 index 000000000..9249ffbcb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/asrd_s8.c @@ -0,0 +1,177 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** asrd_1_s8_m_tied1: +** asrd z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s8_m_tied1, svint8_t, + z0 = svasrd_n_s8_m (p0, z0, 1), + z0 = svasrd_m (p0, z0, 1)) + +/* +** asrd_1_s8_m_untied: +** movprfx z0, z1 +** asrd z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s8_m_untied, svint8_t, + z0 = svasrd_n_s8_m (p0, z1, 1), + z0 = svasrd_m (p0, z1, 1)) + +/* +** asrd_2_s8_m_tied1: +** asrd z0\.b, p0/m, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s8_m_tied1, svint8_t, + z0 = svasrd_n_s8_m (p0, z0, 2), + z0 = svasrd_m (p0, z0, 2)) + +/* +** asrd_2_s8_m_untied: +** movprfx z0, z1 +** asrd z0\.b, p0/m, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s8_m_untied, svint8_t, + z0 = svasrd_n_s8_m (p0, z1, 2), + z0 = svasrd_m (p0, z1, 2)) + +/* +** asrd_8_s8_m_tied1: +** asrd z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asrd_8_s8_m_tied1, svint8_t, + z0 = svasrd_n_s8_m (p0, z0, 8), + z0 = svasrd_m (p0, z0, 8)) + +/* +** asrd_8_s8_m_untied: +** movprfx z0, z1 +** asrd z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asrd_8_s8_m_untied, svint8_t, + z0 = svasrd_n_s8_m (p0, z1, 8), + z0 = svasrd_m (p0, z1, 8)) + +/* +** asrd_1_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asrd z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s8_z_tied1, svint8_t, + z0 = svasrd_n_s8_z (p0, z0, 1), + z0 
= svasrd_z (p0, z0, 1)) + +/* +** asrd_1_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asrd z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s8_z_untied, svint8_t, + z0 = svasrd_n_s8_z (p0, z1, 1), + z0 = svasrd_z (p0, z1, 1)) + +/* +** asrd_2_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asrd z0\.b, p0/m, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s8_z_tied1, svint8_t, + z0 = svasrd_n_s8_z (p0, z0, 2), + z0 = svasrd_z (p0, z0, 2)) + +/* +** asrd_2_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asrd z0\.b, p0/m, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s8_z_untied, svint8_t, + z0 = svasrd_n_s8_z (p0, z1, 2), + z0 = svasrd_z (p0, z1, 2)) + +/* +** asrd_8_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** asrd z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asrd_8_s8_z_tied1, svint8_t, + z0 = svasrd_n_s8_z (p0, z0, 8), + z0 = svasrd_z (p0, z0, 8)) + +/* +** asrd_8_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** asrd z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asrd_8_s8_z_untied, svint8_t, + z0 = svasrd_n_s8_z (p0, z1, 8), + z0 = svasrd_z (p0, z1, 8)) + +/* +** asrd_1_s8_x_tied1: +** asrd z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s8_x_tied1, svint8_t, + z0 = svasrd_n_s8_x (p0, z0, 1), + z0 = svasrd_x (p0, z0, 1)) + +/* +** asrd_1_s8_x_untied: +** movprfx z0, z1 +** asrd z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (asrd_1_s8_x_untied, svint8_t, + z0 = svasrd_n_s8_x (p0, z1, 1), + z0 = svasrd_x (p0, z1, 1)) + +/* +** asrd_2_s8_x_tied1: +** asrd z0\.b, p0/m, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s8_x_tied1, svint8_t, + z0 = svasrd_n_s8_x (p0, z0, 2), + z0 = svasrd_x (p0, z0, 2)) + +/* +** asrd_2_s8_x_untied: +** movprfx z0, z1 +** asrd z0\.b, p0/m, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (asrd_2_s8_x_untied, svint8_t, + z0 = svasrd_n_s8_x (p0, z1, 2), + z0 = svasrd_x (p0, z1, 2)) + +/* +** asrd_8_s8_x_tied1: +** asrd z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asrd_8_s8_x_tied1, svint8_t, + z0 = svasrd_n_s8_x (p0, z0, 8), + z0 = svasrd_x (p0, z0, 8)) + +/* +** asrd_8_s8_x_untied: +** movprfx z0, z1 +** asrd z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (asrd_8_s8_x_untied, svint8_t, + z0 = svasrd_n_s8_x (p0, z1, 8), + z0 = svasrd_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_f32.c new file mode 100644 index 000000000..376622da0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_f32.c @@ -0,0 +1,67 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bfdot_f32_tied1: +** bfdot z0\.s, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (bfdot_f32_tied1, svfloat32_t, svbfloat16_t, + z0 = svbfdot_f32 (z0, z4, z5), + z0 = svbfdot (z0, z4, z5)) + +/* +** bfdot_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfdot z0\.s, \1\.h, z1\.h +** ret +*/ +TEST_DUAL_Z_REV (bfdot_f32_tied2, svfloat32_t, svbfloat16_t, + z0_res = svbfdot_f32 (z4, z0, z1), + z0_res = svbfdot (z4, z0, z1)) + +/* +** bfdot_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfdot z0\.s, z1\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (bfdot_f32_tied3, svfloat32_t, svbfloat16_t, + z0_res = svbfdot_f32 (z4, z1, z0), + z0_res = svbfdot (z4, z1, z0)) + +/* +** bfdot_f32_untied: +** movprfx z0, z1 +** bfdot z0\.s, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (bfdot_f32_untied, 
svfloat32_t, svbfloat16_t, + z0 = svbfdot_f32 (z1, z4, z5), + z0 = svbfdot (z1, z4, z5)) + +/* +** bfdot_h7_f32_tied1: +** mov (z[0-9]+\.h), h7 +** bfdot z0\.s, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZD (bfdot_h7_f32_tied1, svfloat32_t, svbfloat16_t, bfloat16_t, + z0 = svbfdot_n_f32 (z0, z4, d7), + z0 = svbfdot (z0, z4, d7)) + +/* +** bfdot_h7_f32_untied: +** mov (z[0-9]+\.h), h7 +** movprfx z0, z1 +** bfdot z0\.s, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZD (bfdot_h7_f32_untied, svfloat32_t, svbfloat16_t, bfloat16_t, + z0 = svbfdot_n_f32 (z1, z4, d7), + z0 = svbfdot (z1, z4, d7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_lane_f32.c new file mode 100644 index 000000000..0f624fe9f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfdot_lane_f32.c @@ -0,0 +1,86 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bfdot_lane_0_f32_tied1: +** bfdot z0\.s, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (bfdot_lane_0_f32_tied1, svfloat32_t, svbfloat16_t, + z0 = svbfdot_lane_f32 (z0, z4, z5, 0), + z0 = svbfdot_lane (z0, z4, z5, 0)) + +/* +** bfdot_lane_0_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfdot z0\.s, \1\.h, z1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (bfdot_lane_0_f32_tied2, svfloat32_t, svbfloat16_t, + z0_res = svbfdot_lane_f32 (z4, z0, z1, 0), + z0_res = svbfdot_lane (z4, z0, z1, 0)) + +/* +** bfdot_lane_0_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfdot z0\.s, z1\.h, \1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (bfdot_lane_0_f32_tied3, svfloat32_t, svbfloat16_t, + z0_res = svbfdot_lane_f32 (z4, z1, z0, 0), + z0_res = svbfdot_lane (z4, z1, z0, 0)) + +/* +** bfdot_lane_0_f32_untied: +** movprfx z0, z1 +** bfdot z0\.s, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (bfdot_lane_0_f32_untied, svfloat32_t, svbfloat16_t, + z0 = svbfdot_lane_f32 (z1, z4, z5, 0), + z0 = svbfdot_lane (z1, z4, z5, 0)) + +/* +** bfdot_lane_1_f32: +** bfdot z0\.s, z4\.h, z5\.h\[1\] +** ret +*/ +TEST_DUAL_Z (bfdot_lane_1_f32, svfloat32_t, svbfloat16_t, + z0 = svbfdot_lane_f32 (z0, z4, z5, 1), + z0 = svbfdot_lane (z0, z4, z5, 1)) + +/* +** bfdot_lane_3_f32: +** bfdot z0\.s, z4\.h, z5\.h\[3\] +** ret +*/ +TEST_DUAL_Z (bfdot_lane_3_f32, svfloat32_t, svbfloat16_t, + z0 = svbfdot_lane_f32 (z0, z4, z5, 3), + z0 = svbfdot_lane (z0, z4, z5, 3)) + +/* +** bfdot_lane_z8_f32: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** bfdot z0\.s, z1\.h, \1\.h\[1\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (bfdot_lane_z8_f32, svfloat32_t, svbfloat16_t, z8, + z0 = svbfdot_lane_f32 (z0, z1, z8, 1), + z0 = svbfdot_lane (z0, z1, z8, 1)) + +/* +** bfdot_lane_z16_f32: +** mov (z[0-7])\.d, z16\.d +** bfdot z0\.s, z1\.h, \1\.h\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (bfdot_lane_z16_f32, svfloat32_t, svbfloat16_t, z16, + z0 = svbfdot_lane_f32 (z0, z1, z16, 1), + z0 = svbfdot_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_f32.c new file mode 100644 index 000000000..0f810116c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_f32.c @@ -0,0 +1,67 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bfmlalb_f32_tied1: +** bfmlalb z0\.s, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (bfmlalb_f32_tied1, svfloat32_t, svbfloat16_t, + z0 = svbfmlalb_f32 (z0, z4, z5), + z0 = svbfmlalb (z0, z4, z5)) + +/* +** bfmlalb_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmlalb z0\.s, \1\.h, z1\.h +** ret +*/ +TEST_DUAL_Z_REV (bfmlalb_f32_tied2, svfloat32_t, svbfloat16_t, + z0_res = svbfmlalb_f32 (z4, z0, z1), + z0_res = svbfmlalb (z4, z0, z1)) + +/* +** bfmlalb_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmlalb z0\.s, z1\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (bfmlalb_f32_tied3, svfloat32_t, svbfloat16_t, + z0_res = svbfmlalb_f32 (z4, z1, z0), + z0_res = svbfmlalb (z4, z1, z0)) + +/* +** bfmlalb_f32_untied: +** movprfx z0, z1 +** bfmlalb z0\.s, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (bfmlalb_f32_untied, svfloat32_t, svbfloat16_t, + z0 = svbfmlalb_f32 (z1, z4, z5), + z0 = svbfmlalb (z1, z4, z5)) + +/* +** bfmlalb_h7_f32_tied1: +** mov (z[0-9]+\.h), h7 +** bfmlalb z0\.s, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZD (bfmlalb_h7_f32_tied1, svfloat32_t, svbfloat16_t, bfloat16_t, + z0 = svbfmlalb_n_f32 (z0, z4, d7), + z0 = svbfmlalb (z0, z4, d7)) + +/* +** bfmlalb_h7_f32_untied: +** mov (z[0-9]+\.h), h7 +** movprfx z0, z1 +** bfmlalb z0\.s, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZD (bfmlalb_h7_f32_untied, svfloat32_t, svbfloat16_t, bfloat16_t, + z0 = svbfmlalb_n_f32 (z1, z4, d7), + z0 = svbfmlalb (z1, z4, d7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_lane_f32.c new file mode 100644 index 000000000..b0ec0881d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalb_lane_f32.c @@ -0,0 +1,86 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bfmlalb_lane_0_f32_tied1: +** bfmlalb z0\.s, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (bfmlalb_lane_0_f32_tied1, svfloat32_t, svbfloat16_t, + z0 = svbfmlalb_lane_f32 (z0, z4, z5, 0), + z0 = svbfmlalb_lane (z0, z4, z5, 0)) + +/* +** bfmlalb_lane_0_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmlalb z0\.s, \1\.h, z1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (bfmlalb_lane_0_f32_tied2, svfloat32_t, svbfloat16_t, + z0_res = svbfmlalb_lane_f32 (z4, z0, z1, 0), + z0_res = svbfmlalb_lane (z4, z0, z1, 0)) + +/* +** bfmlalb_lane_0_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** 
movprfx z0, z4 +** bfmlalb z0\.s, z1\.h, \1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (bfmlalb_lane_0_f32_tied3, svfloat32_t, svbfloat16_t, + z0_res = svbfmlalb_lane_f32 (z4, z1, z0, 0), + z0_res = svbfmlalb_lane (z4, z1, z0, 0)) + +/* +** bfmlalb_lane_0_f32_untied: +** movprfx z0, z1 +** bfmlalb z0\.s, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (bfmlalb_lane_0_f32_untied, svfloat32_t, svbfloat16_t, + z0 = svbfmlalb_lane_f32 (z1, z4, z5, 0), + z0 = svbfmlalb_lane (z1, z4, z5, 0)) + +/* +** bfmlalb_lane_1_f32: +** bfmlalb z0\.s, z4\.h, z5\.h\[1\] +** ret +*/ +TEST_DUAL_Z (bfmlalb_lane_1_f32, svfloat32_t, svbfloat16_t, + z0 = svbfmlalb_lane_f32 (z0, z4, z5, 1), + z0 = svbfmlalb_lane (z0, z4, z5, 1)) + +/* +** bfmlalb_lane_7_f32: +** bfmlalb z0\.s, z4\.h, z5\.h\[7\] +** ret +*/ +TEST_DUAL_Z (bfmlalb_lane_7_f32, svfloat32_t, svbfloat16_t, + z0 = svbfmlalb_lane_f32 (z0, z4, z5, 7), + z0 = svbfmlalb_lane (z0, z4, z5, 7)) + +/* +** bfmlalb_lane_z8_f32: +** str d8, \[sp, -16\]! +** mov (z[0-7])\.d, z8\.d +** bfmlalb z0\.s, z1\.h, \1\.h\[1\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (bfmlalb_lane_z8_f32, svfloat32_t, svbfloat16_t, z8, + z0 = svbfmlalb_lane_f32 (z0, z1, z8, 1), + z0 = svbfmlalb_lane (z0, z1, z8, 1)) + +/* +** bfmlalb_lane_z16_f32: +** mov (z[0-7])\.d, z16\.d +** bfmlalb z0\.s, z1\.h, \1\.h\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (bfmlalb_lane_z16_f32, svfloat32_t, svbfloat16_t, z16, + z0 = svbfmlalb_lane_f32 (z0, z1, z16, 1), + z0 = svbfmlalb_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_f32.c new file mode 100644 index 000000000..2a583fa4a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_f32.c @@ -0,0 +1,67 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bfmlalt_f32_tied1: +** bfmlalt z0\.s, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (bfmlalt_f32_tied1, svfloat32_t, svbfloat16_t, + z0 = svbfmlalt_f32 (z0, z4, z5), + z0 = svbfmlalt (z0, z4, z5)) + +/* +** bfmlalt_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmlalt z0\.s, \1\.h, z1\.h +** ret +*/ +TEST_DUAL_Z_REV (bfmlalt_f32_tied2, svfloat32_t, svbfloat16_t, + z0_res = svbfmlalt_f32 (z4, z0, z1), + z0_res = svbfmlalt (z4, z0, z1)) + +/* +** bfmlalt_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmlalt z0\.s, z1\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (bfmlalt_f32_tied3, svfloat32_t, svbfloat16_t, + z0_res = svbfmlalt_f32 (z4, z1, z0), + z0_res = svbfmlalt (z4, z1, z0)) + +/* +** bfmlalt_f32_untied: +** movprfx z0, z1 +** bfmlalt z0\.s, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (bfmlalt_f32_untied, svfloat32_t, svbfloat16_t, + z0 = svbfmlalt_f32 (z1, z4, z5), + z0 = svbfmlalt (z1, z4, z5)) + +/* +** bfmlalt_h7_f32_tied1: +** mov (z[0-9]+\.h), h7 +** bfmlalt z0\.s, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZD (bfmlalt_h7_f32_tied1, svfloat32_t, svbfloat16_t, bfloat16_t, + z0 = svbfmlalt_n_f32 (z0, z4, d7), + z0 = svbfmlalt (z0, z4, d7)) + +/* +** bfmlalt_h7_f32_untied: +** mov (z[0-9]+\.h), h7 +** movprfx z0, z1 +** bfmlalt z0\.s, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZD (bfmlalt_h7_f32_untied, svfloat32_t, svbfloat16_t, bfloat16_t, + z0 = svbfmlalt_n_f32 (z1, z4, d7), + z0 = svbfmlalt (z1, z4, d7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_lane_f32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_lane_f32.c new file mode 100644 index 000000000..3af3997e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmlalt_lane_f32.c @@ -0,0 +1,86 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bfmlalt_lane_0_f32_tied1: +** bfmlalt z0\.s, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (bfmlalt_lane_0_f32_tied1, svfloat32_t, svbfloat16_t, + z0 = svbfmlalt_lane_f32 (z0, z4, z5, 0), + z0 = svbfmlalt_lane (z0, z4, z5, 0)) + +/* +** bfmlalt_lane_0_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmlalt z0\.s, \1\.h, z1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (bfmlalt_lane_0_f32_tied2, svfloat32_t, svbfloat16_t, + z0_res = svbfmlalt_lane_f32 (z4, z0, z1, 0), + z0_res = svbfmlalt_lane (z4, z0, z1, 0)) + +/* +** bfmlalt_lane_0_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmlalt z0\.s, z1\.h, \1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (bfmlalt_lane_0_f32_tied3, svfloat32_t, svbfloat16_t, + z0_res = svbfmlalt_lane_f32 (z4, z1, z0, 0), + z0_res = svbfmlalt_lane (z4, z1, z0, 0)) + +/* +** bfmlalt_lane_0_f32_untied: +** movprfx z0, z1 +** bfmlalt z0\.s, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (bfmlalt_lane_0_f32_untied, svfloat32_t, svbfloat16_t, + z0 = svbfmlalt_lane_f32 (z1, z4, z5, 0), + z0 = svbfmlalt_lane (z1, z4, z5, 0)) + +/* +** bfmlalt_lane_1_f32: +** bfmlalt z0\.s, z4\.h, z5\.h\[1\] +** ret +*/ +TEST_DUAL_Z (bfmlalt_lane_1_f32, svfloat32_t, svbfloat16_t, + z0 = svbfmlalt_lane_f32 (z0, z4, z5, 1), + z0 = svbfmlalt_lane (z0, z4, z5, 1)) + +/* +** bfmlalt_lane_7_f32: +** bfmlalt z0\.s, z4\.h, z5\.h\[7\] +** ret +*/ +TEST_DUAL_Z (bfmlalt_lane_7_f32, svfloat32_t, svbfloat16_t, + z0 = svbfmlalt_lane_f32 (z0, z4, z5, 7), + z0 = svbfmlalt_lane (z0, z4, z5, 7)) + +/* +** bfmlalt_lane_z8_f32: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** bfmlalt z0\.s, z1\.h, \1\.h\[1\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (bfmlalt_lane_z8_f32, svfloat32_t, svbfloat16_t, z8, + z0 = svbfmlalt_lane_f32 (z0, z1, z8, 1), + z0 = svbfmlalt_lane (z0, z1, z8, 1)) + +/* +** bfmlalt_lane_z16_f32: +** mov (z[0-7])\.d, z16\.d +** bfmlalt z0\.s, z1\.h, \1\.h\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (bfmlalt_lane_z16_f32, svfloat32_t, svbfloat16_t, z16, + z0 = svbfmlalt_lane_f32 (z0, z1, z16, 1), + z0 = svbfmlalt_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmmla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmmla_f32.c new file mode 100644 index 000000000..b1d98fbf5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bfmmla_f32.c @@ -0,0 +1,46 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bfmmla_f32_tied1: +** bfmmla z0\.s, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (bfmmla_f32_tied1, svfloat32_t, svbfloat16_t, + z0 = svbfmmla_f32 (z0, z4, z5), + z0 = svbfmmla (z0, z4, z5)) + +/* +** bfmmla_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmmla z0\.s, \1\.h, z1\.h +** ret +*/ +TEST_DUAL_Z_REV (bfmmla_f32_tied2, svfloat32_t, svbfloat16_t, + z0_res = svbfmmla_f32 (z4, z0, z1), + z0_res = svbfmmla (z4, z0, z1)) + +/* +** bfmmla_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfmmla z0\.s, z1\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (bfmmla_f32_tied3, svfloat32_t, svbfloat16_t, + z0_res = svbfmmla_f32 (z4, z1, z0), + z0_res = svbfmmla (z4, z1, z0)) + +/* +** bfmmla_f32_untied: +** movprfx z0, z1 +** bfmmla z0\.s, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (bfmmla_f32_untied, svfloat32_t, svbfloat16_t, + z0 = svbfmmla_f32 (z1, z4, z5), + z0 = svbfmmla (z1, z4, z5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_b.c new file mode 100644 index 000000000..9d41aeaa2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_b_z_tied1: +** bic p0\.b, p3/z, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (bic_b_z_tied1, + p0 = svbic_b_z (p3, p0, p1), + p0 = svbic_z (p3, p0, p1)) + +/* +** bic_b_z_tied2: +** bic p0\.b, p3/z, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (bic_b_z_tied2, + p0 = svbic_b_z (p3, p1, p0), + p0 = svbic_z (p3, p1, p0)) + +/* +** bic_b_z_untied: +** bic p0\.b, p3/z, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (bic_b_z_untied, + p0 = svbic_b_z (p3, p1, p2), + p0 = svbic_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s16.c new file mode 100644 index 000000000..c80f5697f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s16.c @@ -0,0 +1,367 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_s16_m_tied1: +** bic z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (bic_s16_m_tied1, svint16_t, + z0 = svbic_s16_m (p0, z0, z1), + z0 = svbic_m (p0, z0, z1)) + +/* +** bic_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** bic z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (bic_s16_m_tied2, svint16_t, + z0 = svbic_s16_m (p0, z1, z0), + z0 = svbic_m (p0, z1, 
z0)) + +/* +** bic_s16_m_untied: +** movprfx z0, z1 +** bic z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (bic_s16_m_untied, svint16_t, + z0 = svbic_s16_m (p0, z1, z2), + z0 = svbic_m (p0, z1, z2)) + +/* +** bic_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** bic z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svbic_n_s16_m (p0, z0, x0), + z0 = svbic_m (p0, z0, x0)) + +/* +** bic_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** bic z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s16_m_untied, svint16_t, int16_t, + z0 = svbic_n_s16_m (p0, z1, x0), + z0 = svbic_m (p0, z1, x0)) + +/* +** bic_1_s16_m_tied1: +** mov (z[0-9]+\.h), #-2 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s16_m_tied1, svint16_t, + z0 = svbic_n_s16_m (p0, z0, 1), + z0 = svbic_m (p0, z0, 1)) + +/* +** bic_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #-2 +** movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s16_m_untied, svint16_t, + z0 = svbic_n_s16_m (p0, z1, 1), + z0 = svbic_m (p0, z1, 1)) + +/* +** bic_m2_s16_m: +** mov (z[0-9]+\.h), #1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_m2_s16_m, svint16_t, + z0 = svbic_n_s16_m (p0, z0, -2), + z0 = svbic_m (p0, z0, -2)) + +/* +** bic_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** bic z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (bic_s16_z_tied1, svint16_t, + z0 = svbic_s16_z (p0, z0, z1), + z0 = svbic_z (p0, z0, z1)) + +/* +** bic_s16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** bic z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (bic_s16_z_tied2, svint16_t, + z0 = svbic_s16_z (p0, z1, z0), + z0 = svbic_z (p0, z1, z0)) + +/* +** bic_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** bic z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (bic_s16_z_untied, svint16_t, + z0 = svbic_s16_z (p0, z1, z2), + z0 = svbic_z (p0, z1, z2)) + +/* +** bic_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** bic z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svbic_n_s16_z (p0, z0, x0), + z0 = svbic_z (p0, z0, x0)) + +/* +** bic_w0_s16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z1\.h +** bic z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s16_z_untied, svint16_t, int16_t, + z0 = svbic_n_s16_z (p0, z1, x0), + z0 = svbic_z (p0, z1, x0)) + +/* +** bic_1_s16_z_tied1: +** mov (z[0-9]+\.h), #-2 +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s16_z_tied1, svint16_t, + z0 = svbic_n_s16_z (p0, z0, 1), + z0 = svbic_z (p0, z0, 1)) + +/* +** bic_1_s16_z_untied: +** mov (z[0-9]+\.h), #-2 +** ( +** movprfx z0\.h, p0/z, z1\.h +** and z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** and z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (bic_1_s16_z_untied, svint16_t, + z0 = svbic_n_s16_z (p0, z1, 1), + z0 = svbic_z (p0, z1, 1)) + +/* +** bic_s16_x_tied1: +** bic z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s16_x_tied1, svint16_t, + z0 = svbic_s16_x (p0, z0, z1), + z0 = svbic_x (p0, z0, z1)) + +/* +** bic_s16_x_tied2: +** bic z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s16_x_tied2, svint16_t, + z0 = svbic_s16_x (p0, z1, z0), + z0 = svbic_x (p0, z1, z0)) + +/* +** bic_s16_x_untied: +** bic z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s16_x_untied, svint16_t, + z0 = svbic_s16_x 
(p0, z1, z2), + z0 = svbic_x (p0, z1, z2)) + +/* +** bic_w0_s16_x_tied1: +** mov (z[0-9]+)\.h, w0 +** bic z0\.d, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svbic_n_s16_x (p0, z0, x0), + z0 = svbic_x (p0, z0, x0)) + +/* +** bic_w0_s16_x_untied: +** mov (z[0-9]+)\.h, w0 +** bic z0\.d, z1\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s16_x_untied, svint16_t, int16_t, + z0 = svbic_n_s16_x (p0, z1, x0), + z0 = svbic_x (p0, z1, x0)) + +/* +** bic_1_s16_x_tied1: +** and z0\.h, z0\.h, #0xfffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_s16_x_tied1, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 1), + z0 = svbic_x (p0, z0, 1)) + +/* +** bic_1_s16_x_untied: +** movprfx z0, z1 +** and z0\.h, z0\.h, #0xfffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_s16_x_untied, svint16_t, + z0 = svbic_n_s16_x (p0, z1, 1), + z0 = svbic_x (p0, z1, 1)) + +/* +** bic_127_s16_x: +** and z0\.h, z0\.h, #0xff80 +** ret +*/ +TEST_UNIFORM_Z (bic_127_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 127), + z0 = svbic_x (p0, z0, 127)) + +/* +** bic_128_s16_x: +** and z0\.h, z0\.h, #0xff7f +** ret +*/ +TEST_UNIFORM_Z (bic_128_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 128), + z0 = svbic_x (p0, z0, 128)) + +/* +** bic_255_s16_x: +** and z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (bic_255_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 255), + z0 = svbic_x (p0, z0, 255)) + +/* +** bic_256_s16_x: +** and z0\.h, z0\.h, #0xfeff +** ret +*/ +TEST_UNIFORM_Z (bic_256_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 256), + z0 = svbic_x (p0, z0, 256)) + +/* +** bic_257_s16_x: +** and z0\.h, z0\.h, #0xfefe +** ret +*/ +TEST_UNIFORM_Z (bic_257_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 257), + z0 = svbic_x (p0, z0, 257)) + +/* +** bic_512_s16_x: +** and z0\.h, z0\.h, #0xfdff +** ret +*/ +TEST_UNIFORM_Z (bic_512_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 512), + z0 = svbic_x (p0, z0, 512)) + +/* +** bic_65280_s16_x: +** and z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (bic_65280_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 0xff00), + z0 = svbic_x (p0, z0, 0xff00)) + +/* +** bic_m127_s16_x: +** and z0\.h, z0\.h, #0x7e +** ret +*/ +TEST_UNIFORM_Z (bic_m127_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, -127), + z0 = svbic_x (p0, z0, -127)) + +/* +** bic_m128_s16_x: +** and z0\.h, z0\.h, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_m128_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, -128), + z0 = svbic_x (p0, z0, -128)) + +/* +** bic_m255_s16_x: +** and z0\.h, z0\.h, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_m255_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, -255), + z0 = svbic_x (p0, z0, -255)) + +/* +** bic_m256_s16_x: +** and z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (bic_m256_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, -256), + z0 = svbic_x (p0, z0, -256)) + +/* +** bic_m257_s16_x: +** and z0\.h, z0\.h, #0x100 +** ret +*/ +TEST_UNIFORM_Z (bic_m257_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, -257), + z0 = svbic_x (p0, z0, -257)) + +/* +** bic_m512_s16_x: +** and z0\.h, z0\.h, #0x1ff +** ret +*/ +TEST_UNIFORM_Z (bic_m512_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, -512), + z0 = svbic_x (p0, z0, -512)) + +/* +** bic_m32768_s16_x: +** and z0\.h, z0\.h, #0x7fff +** ret +*/ +TEST_UNIFORM_Z (bic_m32768_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, -0x8000), + z0 = svbic_x (p0, z0, -0x8000)) + +/* +** bic_5_s16_x: +** mov (z[0-9]+)\.h, #-6 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (bic_5_s16_x, svint16_t, + z0 = svbic_n_s16_x (p0, z0, 5), + z0 = svbic_x 
(p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s32.c new file mode 100644 index 000000000..9e388e499 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s32.c @@ -0,0 +1,363 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_s32_m_tied1: +** bic z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (bic_s32_m_tied1, svint32_t, + z0 = svbic_s32_m (p0, z0, z1), + z0 = svbic_m (p0, z0, z1)) + +/* +** bic_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** bic z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (bic_s32_m_tied2, svint32_t, + z0 = svbic_s32_m (p0, z1, z0), + z0 = svbic_m (p0, z1, z0)) + +/* +** bic_s32_m_untied: +** movprfx z0, z1 +** bic z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (bic_s32_m_untied, svint32_t, + z0 = svbic_s32_m (p0, z1, z2), + z0 = svbic_m (p0, z1, z2)) + +/* +** bic_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** bic z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svbic_n_s32_m (p0, z0, x0), + z0 = svbic_m (p0, z0, x0)) + +/* +** bic_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** bic z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s32_m_untied, svint32_t, int32_t, + z0 = svbic_n_s32_m (p0, z1, x0), + z0 = svbic_m (p0, z1, x0)) + +/* +** bic_1_s32_m_tied1: +** mov (z[0-9]+\.s), #-2 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s32_m_tied1, svint32_t, + z0 = svbic_n_s32_m (p0, z0, 1), + z0 = svbic_m (p0, z0, 1)) + +/* +** bic_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #-2 +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s32_m_untied, svint32_t, + z0 = svbic_n_s32_m (p0, z1, 1), + z0 = svbic_m (p0, z1, 1)) + +/* +** bic_m2_s32_m: +** mov (z[0-9]+\.s), #1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_m2_s32_m, svint32_t, + z0 = svbic_n_s32_m (p0, z0, -2), + z0 = svbic_m (p0, z0, -2)) + +/* +** bic_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** bic z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (bic_s32_z_tied1, svint32_t, + z0 = svbic_s32_z (p0, z0, z1), + z0 = svbic_z (p0, z0, z1)) + +/* +** bic_s32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** bic z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (bic_s32_z_tied2, svint32_t, + z0 = svbic_s32_z (p0, z1, z0), + z0 = svbic_z (p0, z1, z0)) + +/* +** bic_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** bic z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (bic_s32_z_untied, svint32_t, + z0 = svbic_s32_z (p0, z1, z2), + z0 = svbic_z (p0, z1, z2)) + +/* +** bic_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** bic z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svbic_n_s32_z (p0, z0, x0), + z0 = svbic_z (p0, z0, x0)) + +/* +** bic_w0_s32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z1\.s +** bic z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s32_z_untied, svint32_t, int32_t, + z0 = svbic_n_s32_z (p0, z1, x0), + z0 = svbic_z (p0, z1, x0)) + +/* +** bic_1_s32_z_tied1: +** mov (z[0-9]+\.s), #-2 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s32_z_tied1, svint32_t, + z0 = svbic_n_s32_z (p0, z0, 1), + z0 = svbic_z (p0, z0, 1)) + +/* +** bic_1_s32_z_untied: +** 
mov (z[0-9]+\.s), #-2 +** ( +** movprfx z0\.s, p0/z, z1\.s +** and z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** and z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (bic_1_s32_z_untied, svint32_t, + z0 = svbic_n_s32_z (p0, z1, 1), + z0 = svbic_z (p0, z1, 1)) + +/* +** bic_s32_x_tied1: +** bic z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s32_x_tied1, svint32_t, + z0 = svbic_s32_x (p0, z0, z1), + z0 = svbic_x (p0, z0, z1)) + +/* +** bic_s32_x_tied2: +** bic z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s32_x_tied2, svint32_t, + z0 = svbic_s32_x (p0, z1, z0), + z0 = svbic_x (p0, z1, z0)) + +/* +** bic_s32_x_untied: +** bic z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s32_x_untied, svint32_t, + z0 = svbic_s32_x (p0, z1, z2), + z0 = svbic_x (p0, z1, z2)) + +/* +** bic_w0_s32_x_tied1: +** mov (z[0-9]+)\.s, w0 +** bic z0\.d, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svbic_n_s32_x (p0, z0, x0), + z0 = svbic_x (p0, z0, x0)) + +/* +** bic_w0_s32_x_untied: +** mov (z[0-9]+)\.s, w0 +** bic z0\.d, z1\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s32_x_untied, svint32_t, int32_t, + z0 = svbic_n_s32_x (p0, z1, x0), + z0 = svbic_x (p0, z1, x0)) + +/* +** bic_1_s32_x_tied1: +** and z0\.s, z0\.s, #0xfffffffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_s32_x_tied1, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 1), + z0 = svbic_x (p0, z0, 1)) + +/* +** bic_1_s32_x_untied: +** movprfx z0, z1 +** and z0\.s, z0\.s, #0xfffffffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_s32_x_untied, svint32_t, + z0 = svbic_n_s32_x (p0, z1, 1), + z0 = svbic_x (p0, z1, 1)) + +/* +** bic_127_s32_x: +** and z0\.s, z0\.s, #0xffffff80 +** ret +*/ +TEST_UNIFORM_Z (bic_127_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 127), + z0 = svbic_x (p0, z0, 127)) + +/* +** bic_128_s32_x: +** and z0\.s, z0\.s, #0xffffff7f +** ret +*/ +TEST_UNIFORM_Z (bic_128_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 128), + z0 = svbic_x (p0, z0, 128)) + +/* +** bic_255_s32_x: +** and z0\.s, z0\.s, #0xffffff00 +** ret +*/ +TEST_UNIFORM_Z (bic_255_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 255), + z0 = svbic_x (p0, z0, 255)) + +/* +** bic_256_s32_x: +** and z0\.s, z0\.s, #0xfffffeff +** ret +*/ +TEST_UNIFORM_Z (bic_256_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 256), + z0 = svbic_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (bic_257_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 257), + z0 = svbic_x (p0, z0, 257)) + +/* +** bic_512_s32_x: +** and z0\.s, z0\.s, #0xfffffdff +** ret +*/ +TEST_UNIFORM_Z (bic_512_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 512), + z0 = svbic_x (p0, z0, 512)) + +/* +** bic_65280_s32_x: +** and z0\.s, z0\.s, #0xffff00ff +** ret +*/ +TEST_UNIFORM_Z (bic_65280_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 0xff00), + z0 = svbic_x (p0, z0, 0xff00)) + +/* +** bic_m127_s32_x: +** and z0\.s, z0\.s, #0x7e +** ret +*/ +TEST_UNIFORM_Z (bic_m127_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, -127), + z0 = svbic_x (p0, z0, -127)) + +/* +** bic_m128_s32_x: +** and z0\.s, z0\.s, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_m128_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, -128), + z0 = svbic_x (p0, z0, -128)) + +/* +** bic_m255_s32_x: +** and z0\.s, z0\.s, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_m255_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, -255), + z0 = svbic_x (p0, z0, -255)) + +/* +** bic_m256_s32_x: +** and z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (bic_m256_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, -256), + z0 = svbic_x (p0, z0, -256)) + +/* +** bic_m257_s32_x: +** and z0\.s, z0\.s, #0x100 +** ret +*/ +TEST_UNIFORM_Z (bic_m257_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, -257), + z0 = svbic_x (p0, z0, -257)) + +/* +** bic_m512_s32_x: +** and z0\.s, z0\.s, #0x1ff +** ret +*/ +TEST_UNIFORM_Z (bic_m512_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, -512), + z0 = svbic_x (p0, z0, -512)) + +/* +** bic_m32768_s32_x: +** and z0\.s, z0\.s, #0x7fff +** ret +*/ +TEST_UNIFORM_Z (bic_m32768_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, -0x8000), + z0 = svbic_x (p0, z0, -0x8000)) + +/* +** bic_5_s32_x: +** mov (z[0-9]+)\.s, #-6 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (bic_5_s32_x, svint32_t, + z0 = svbic_n_s32_x (p0, z0, 5), + z0 = svbic_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s64.c new file mode 100644 index 000000000..bf9536815 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s64.c @@ -0,0 +1,363 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_s64_m_tied1: +** bic z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s64_m_tied1, svint64_t, + z0 = svbic_s64_m (p0, z0, z1), + z0 = svbic_m (p0, z0, z1)) + +/* +** bic_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_s64_m_tied2, svint64_t, + z0 = svbic_s64_m (p0, z1, z0), + z0 = svbic_m (p0, z1, z0)) + +/* +** bic_s64_m_untied: +** movprfx z0, z1 +** bic z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s64_m_untied, svint64_t, + z0 = svbic_s64_m (p0, z1, z2), + z0 = svbic_m (p0, z1, z2)) + +/* +** bic_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svbic_n_s64_m (p0, z0, x0), + z0 = svbic_m (p0, z0, x0)) + +/* +** bic_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_s64_m_untied, svint64_t, int64_t, + z0 = svbic_n_s64_m (p0, z1, x0), + z0 = svbic_m (p0, z1, x0)) + +/* +** bic_1_s64_m_tied1: +** mov (z[0-9]+\.d), #-2 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s64_m_tied1, svint64_t, + z0 = svbic_n_s64_m (p0, 
z0, 1), + z0 = svbic_m (p0, z0, 1)) + +/* +** bic_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #-2 +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s64_m_untied, svint64_t, + z0 = svbic_n_s64_m (p0, z1, 1), + z0 = svbic_m (p0, z1, 1)) + +/* +** bic_m2_s64_m: +** mov (z[0-9]+\.d), #1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_m2_s64_m, svint64_t, + z0 = svbic_n_s64_m (p0, z0, -2), + z0 = svbic_m (p0, z0, -2)) + +/* +** bic_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** bic z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s64_z_tied1, svint64_t, + z0 = svbic_s64_z (p0, z0, z1), + z0 = svbic_z (p0, z0, z1)) + +/* +** bic_s64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_s64_z_tied2, svint64_t, + z0 = svbic_s64_z (p0, z1, z0), + z0 = svbic_z (p0, z1, z0)) + +/* +** bic_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** bic z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s64_z_untied, svint64_t, + z0 = svbic_s64_z (p0, z1, z2), + z0 = svbic_z (p0, z1, z2)) + +/* +** bic_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svbic_n_s64_z (p0, z0, x0), + z0 = svbic_z (p0, z0, x0)) + +/* +** bic_x0_s64_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z1\.d +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_s64_z_untied, svint64_t, int64_t, + z0 = svbic_n_s64_z (p0, z1, x0), + z0 = svbic_z (p0, z1, x0)) + +/* +** bic_1_s64_z_tied1: +** mov (z[0-9]+\.d), #-2 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s64_z_tied1, svint64_t, + z0 = svbic_n_s64_z (p0, z0, 1), + z0 = svbic_z (p0, z0, 1)) + +/* +** bic_1_s64_z_untied: +** mov (z[0-9]+\.d), #-2 +** ( +** movprfx z0\.d, p0/z, z1\.d +** and z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** and z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (bic_1_s64_z_untied, svint64_t, + z0 = svbic_n_s64_z (p0, z1, 1), + z0 = svbic_z (p0, z1, 1)) + +/* +** bic_s64_x_tied1: +** bic z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s64_x_tied1, svint64_t, + z0 = svbic_s64_x (p0, z0, z1), + z0 = svbic_x (p0, z0, z1)) + +/* +** bic_s64_x_tied2: +** bic z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s64_x_tied2, svint64_t, + z0 = svbic_s64_x (p0, z1, z0), + z0 = svbic_x (p0, z1, z0)) + +/* +** bic_s64_x_untied: +** bic z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s64_x_untied, svint64_t, + z0 = svbic_s64_x (p0, z1, z2), + z0 = svbic_x (p0, z1, z2)) + +/* +** bic_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** bic z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svbic_n_s64_x (p0, z0, x0), + z0 = svbic_x (p0, z0, x0)) + +/* +** bic_x0_s64_x_untied: +** mov (z[0-9]+\.d), x0 +** bic z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_s64_x_untied, svint64_t, int64_t, + z0 = svbic_n_s64_x (p0, z1, x0), + z0 = svbic_x (p0, z1, x0)) + +/* +** bic_1_s64_x_tied1: +** and z0\.d, z0\.d, #0xfffffffffffffffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_s64_x_tied1, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 1), + z0 = svbic_x (p0, z0, 1)) + +/* +** bic_1_s64_x_untied: +** movprfx z0, z1 +** and z0\.d, z0\.d, #0xfffffffffffffffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_s64_x_untied, svint64_t, + z0 = svbic_n_s64_x (p0, z1, 1), + z0 = 
svbic_x (p0, z1, 1)) + +/* +** bic_127_s64_x: +** and z0\.d, z0\.d, #0xffffffffffffff80 +** ret +*/ +TEST_UNIFORM_Z (bic_127_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 127), + z0 = svbic_x (p0, z0, 127)) + +/* +** bic_128_s64_x: +** and z0\.d, z0\.d, #0xffffffffffffff7f +** ret +*/ +TEST_UNIFORM_Z (bic_128_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 128), + z0 = svbic_x (p0, z0, 128)) + +/* +** bic_255_s64_x: +** and z0\.d, z0\.d, #0xffffffffffffff00 +** ret +*/ +TEST_UNIFORM_Z (bic_255_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 255), + z0 = svbic_x (p0, z0, 255)) + +/* +** bic_256_s64_x: +** and z0\.d, z0\.d, #0xfffffffffffffeff +** ret +*/ +TEST_UNIFORM_Z (bic_256_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 256), + z0 = svbic_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (bic_257_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 257), + z0 = svbic_x (p0, z0, 257)) + +/* +** bic_512_s64_x: +** and z0\.d, z0\.d, #0xfffffffffffffdff +** ret +*/ +TEST_UNIFORM_Z (bic_512_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 512), + z0 = svbic_x (p0, z0, 512)) + +/* +** bic_65280_s64_x: +** and z0\.d, z0\.d, #0xffffffffffff00ff +** ret +*/ +TEST_UNIFORM_Z (bic_65280_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 0xff00), + z0 = svbic_x (p0, z0, 0xff00)) + +/* +** bic_m127_s64_x: +** and z0\.d, z0\.d, #0x7e +** ret +*/ +TEST_UNIFORM_Z (bic_m127_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, -127), + z0 = svbic_x (p0, z0, -127)) + +/* +** bic_m128_s64_x: +** and z0\.d, z0\.d, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_m128_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, -128), + z0 = svbic_x (p0, z0, -128)) + +/* +** bic_m255_s64_x: +** and z0\.d, z0\.d, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_m255_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, -255), + z0 = svbic_x (p0, z0, -255)) + +/* +** bic_m256_s64_x: +** and z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (bic_m256_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, -256), + z0 = svbic_x (p0, z0, -256)) + +/* +** bic_m257_s64_x: +** and z0\.d, z0\.d, #0x100 +** ret +*/ +TEST_UNIFORM_Z (bic_m257_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, -257), + z0 = svbic_x (p0, z0, -257)) + +/* +** bic_m512_s64_x: +** and z0\.d, z0\.d, #0x1ff +** ret +*/ +TEST_UNIFORM_Z (bic_m512_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, -512), + z0 = svbic_x (p0, z0, -512)) + +/* +** bic_m32768_s64_x: +** and z0\.d, z0\.d, #0x7fff +** ret +*/ +TEST_UNIFORM_Z (bic_m32768_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, -0x8000), + z0 = svbic_x (p0, z0, -0x8000)) + +/* +** bic_5_s64_x: +** mov (z[0-9]+\.d), #-6 +** and z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (bic_5_s64_x, svint64_t, + z0 = svbic_n_s64_x (p0, z0, 5), + z0 = svbic_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s8.c new file mode 100644 index 000000000..0958a3403 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_s8.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_s8_m_tied1: +** bic z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (bic_s8_m_tied1, svint8_t, + z0 = svbic_s8_m (p0, z0, z1), + z0 = svbic_m (p0, z0, z1)) + +/* +** bic_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** bic z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (bic_s8_m_tied2, svint8_t, + z0 = svbic_s8_m (p0, z1, z0), + z0 = svbic_m (p0, z1, z0)) + +/* +** 
bic_s8_m_untied: +** movprfx z0, z1 +** bic z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (bic_s8_m_untied, svint8_t, + z0 = svbic_s8_m (p0, z1, z2), + z0 = svbic_m (p0, z1, z2)) + +/* +** bic_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** bic z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svbic_n_s8_m (p0, z0, x0), + z0 = svbic_m (p0, z0, x0)) + +/* +** bic_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** bic z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s8_m_untied, svint8_t, int8_t, + z0 = svbic_n_s8_m (p0, z1, x0), + z0 = svbic_m (p0, z1, x0)) + +/* +** bic_1_s8_m_tied1: +** mov (z[0-9]+\.b), #-2 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s8_m_tied1, svint8_t, + z0 = svbic_n_s8_m (p0, z0, 1), + z0 = svbic_m (p0, z0, 1)) + +/* +** bic_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #-2 +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s8_m_untied, svint8_t, + z0 = svbic_n_s8_m (p0, z1, 1), + z0 = svbic_m (p0, z1, 1)) + +/* +** bic_m2_s8_m: +** mov (z[0-9]+\.b), #1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_m2_s8_m, svint8_t, + z0 = svbic_n_s8_m (p0, z0, -2), + z0 = svbic_m (p0, z0, -2)) + +/* +** bic_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** bic z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (bic_s8_z_tied1, svint8_t, + z0 = svbic_s8_z (p0, z0, z1), + z0 = svbic_z (p0, z0, z1)) + +/* +** bic_s8_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, z1\.b +** bic z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (bic_s8_z_tied2, svint8_t, + z0 = svbic_s8_z (p0, z1, z0), + z0 = svbic_z (p0, z1, z0)) + +/* +** bic_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** bic z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (bic_s8_z_untied, svint8_t, + z0 = svbic_s8_z (p0, z1, z2), + z0 = svbic_z (p0, z1, z2)) + +/* +** bic_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** bic z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svbic_n_s8_z (p0, z0, x0), + z0 = svbic_z (p0, z0, x0)) + +/* +** bic_w0_s8_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z1\.b +** bic z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s8_z_untied, svint8_t, int8_t, + z0 = svbic_n_s8_z (p0, z1, x0), + z0 = svbic_z (p0, z1, x0)) + +/* +** bic_1_s8_z_tied1: +** mov (z[0-9]+\.b), #-2 +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_s8_z_tied1, svint8_t, + z0 = svbic_n_s8_z (p0, z0, 1), + z0 = svbic_z (p0, z0, 1)) + +/* +** bic_1_s8_z_untied: +** mov (z[0-9]+\.b), #-2 +** ( +** movprfx z0\.b, p0/z, z1\.b +** and z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** and z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (bic_1_s8_z_untied, svint8_t, + z0 = svbic_n_s8_z (p0, z1, 1), + z0 = svbic_z (p0, z1, 1)) + +/* +** bic_s8_x_tied1: +** bic z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s8_x_tied1, svint8_t, + z0 = svbic_s8_x (p0, z0, z1), + z0 = svbic_x (p0, z0, z1)) + +/* +** bic_s8_x_tied2: +** bic z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s8_x_tied2, svint8_t, + z0 = svbic_s8_x (p0, z1, z0), + z0 = svbic_x (p0, z1, z0)) + +/* +** bic_s8_x_untied: +** bic z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_s8_x_untied, svint8_t, + z0 = svbic_s8_x (p0, z1, z2), + z0 = svbic_x (p0, z1, z2)) + +/* +** bic_w0_s8_x_tied1: +** mov 
(z[0-9]+)\.b, w0 +** bic z0\.d, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svbic_n_s8_x (p0, z0, x0), + z0 = svbic_x (p0, z0, x0)) + +/* +** bic_w0_s8_x_untied: +** mov (z[0-9]+)\.b, w0 +** bic z0\.d, z1\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_s8_x_untied, svint8_t, int8_t, + z0 = svbic_n_s8_x (p0, z1, x0), + z0 = svbic_x (p0, z1, x0)) + +/* +** bic_1_s8_x_tied1: +** and z0\.b, z0\.b, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_1_s8_x_tied1, svint8_t, + z0 = svbic_n_s8_x (p0, z0, 1), + z0 = svbic_x (p0, z0, 1)) + +/* +** bic_1_s8_x_untied: +** movprfx z0, z1 +** and z0\.b, z0\.b, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_1_s8_x_untied, svint8_t, + z0 = svbic_n_s8_x (p0, z1, 1), + z0 = svbic_x (p0, z1, 1)) + +/* +** bic_127_s8_x: +** and z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (bic_127_s8_x, svint8_t, + z0 = svbic_n_s8_x (p0, z0, 127), + z0 = svbic_x (p0, z0, 127)) + +/* +** bic_128_s8_x: +** and z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_128_s8_x, svint8_t, + z0 = svbic_n_s8_x (p0, z0, 128), + z0 = svbic_x (p0, z0, 128)) + +/* +** bic_255_s8_x: +** mov z0\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (bic_255_s8_x, svint8_t, + z0 = svbic_n_s8_x (p0, z0, 255), + z0 = svbic_x (p0, z0, 255)) + +/* +** bic_m127_s8_x: +** and z0\.b, z0\.b, #0x7e +** ret +*/ +TEST_UNIFORM_Z (bic_m127_s8_x, svint8_t, + z0 = svbic_n_s8_x (p0, z0, -127), + z0 = svbic_x (p0, z0, -127)) + +/* +** bic_m128_s8_x: +** and z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_m128_s8_x, svint8_t, + z0 = svbic_n_s8_x (p0, z0, -128), + z0 = svbic_x (p0, z0, -128)) + +/* +** bic_5_s8_x: +** mov (z[0-9]+)\.b, #-6 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (bic_5_s8_x, svint8_t, + z0 = svbic_n_s8_x (p0, z0, 5), + z0 = svbic_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u16.c new file mode 100644 index 000000000..30209ffb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u16.c @@ -0,0 +1,367 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_u16_m_tied1: +** bic z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (bic_u16_m_tied1, svuint16_t, + z0 = svbic_u16_m (p0, z0, z1), + z0 = svbic_m (p0, z0, z1)) + +/* +** bic_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** bic z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (bic_u16_m_tied2, svuint16_t, + z0 = svbic_u16_m (p0, z1, z0), + z0 = svbic_m (p0, z1, z0)) + +/* +** bic_u16_m_untied: +** movprfx z0, z1 +** bic z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (bic_u16_m_untied, svuint16_t, + z0 = svbic_u16_m (p0, z1, z2), + z0 = svbic_m (p0, z1, z2)) + +/* +** bic_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** bic z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svbic_n_u16_m (p0, z0, x0), + z0 = svbic_m (p0, z0, x0)) + +/* +** bic_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** bic z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svbic_n_u16_m (p0, z1, x0), + z0 = svbic_m (p0, z1, x0)) + +/* +** bic_1_u16_m_tied1: +** mov (z[0-9]+\.h), #-2 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u16_m_tied1, svuint16_t, + z0 = svbic_n_u16_m (p0, z0, 1), + z0 = svbic_m (p0, z0, 1)) + +/* +** bic_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #-2 +** 
movprfx z0, z1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u16_m_untied, svuint16_t, + z0 = svbic_n_u16_m (p0, z1, 1), + z0 = svbic_m (p0, z1, 1)) + +/* +** bic_m2_u16_m: +** mov (z[0-9]+\.h), #1 +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_m2_u16_m, svuint16_t, + z0 = svbic_n_u16_m (p0, z0, -2), + z0 = svbic_m (p0, z0, -2)) + +/* +** bic_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** bic z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (bic_u16_z_tied1, svuint16_t, + z0 = svbic_u16_z (p0, z0, z1), + z0 = svbic_z (p0, z0, z1)) + +/* +** bic_u16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** bic z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (bic_u16_z_tied2, svuint16_t, + z0 = svbic_u16_z (p0, z1, z0), + z0 = svbic_z (p0, z1, z0)) + +/* +** bic_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** bic z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (bic_u16_z_untied, svuint16_t, + z0 = svbic_u16_z (p0, z1, z2), + z0 = svbic_z (p0, z1, z2)) + +/* +** bic_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** bic z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svbic_n_u16_z (p0, z0, x0), + z0 = svbic_z (p0, z0, x0)) + +/* +** bic_w0_u16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z1\.h +** bic z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svbic_n_u16_z (p0, z1, x0), + z0 = svbic_z (p0, z1, x0)) + +/* +** bic_1_u16_z_tied1: +** mov (z[0-9]+\.h), #-2 +** movprfx z0\.h, p0/z, z0\.h +** and z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u16_z_tied1, svuint16_t, + z0 = svbic_n_u16_z (p0, z0, 1), + z0 = svbic_z (p0, z0, 1)) + +/* +** bic_1_u16_z_untied: +** mov (z[0-9]+\.h), #-2 +** ( +** movprfx z0\.h, p0/z, z1\.h +** and z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** and z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (bic_1_u16_z_untied, svuint16_t, + z0 = svbic_n_u16_z (p0, z1, 1), + z0 = svbic_z (p0, z1, 1)) + +/* +** bic_u16_x_tied1: +** bic z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u16_x_tied1, svuint16_t, + z0 = svbic_u16_x (p0, z0, z1), + z0 = svbic_x (p0, z0, z1)) + +/* +** bic_u16_x_tied2: +** bic z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u16_x_tied2, svuint16_t, + z0 = svbic_u16_x (p0, z1, z0), + z0 = svbic_x (p0, z1, z0)) + +/* +** bic_u16_x_untied: +** bic z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u16_x_untied, svuint16_t, + z0 = svbic_u16_x (p0, z1, z2), + z0 = svbic_x (p0, z1, z2)) + +/* +** bic_w0_u16_x_tied1: +** mov (z[0-9]+)\.h, w0 +** bic z0\.d, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svbic_n_u16_x (p0, z0, x0), + z0 = svbic_x (p0, z0, x0)) + +/* +** bic_w0_u16_x_untied: +** mov (z[0-9]+)\.h, w0 +** bic z0\.d, z1\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svbic_n_u16_x (p0, z1, x0), + z0 = svbic_x (p0, z1, x0)) + +/* +** bic_1_u16_x_tied1: +** and z0\.h, z0\.h, #0xfffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_u16_x_tied1, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 1), + z0 = svbic_x (p0, z0, 1)) + +/* +** bic_1_u16_x_untied: +** movprfx z0, z1 +** and z0\.h, z0\.h, #0xfffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_u16_x_untied, svuint16_t, + z0 = svbic_n_u16_x (p0, z1, 1), + z0 = svbic_x (p0, z1, 1)) + +/* +** bic_127_u16_x: +** and z0\.h, z0\.h, #0xff80 +** ret +*/ +TEST_UNIFORM_Z 
(bic_127_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 127), + z0 = svbic_x (p0, z0, 127)) + +/* +** bic_128_u16_x: +** and z0\.h, z0\.h, #0xff7f +** ret +*/ +TEST_UNIFORM_Z (bic_128_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 128), + z0 = svbic_x (p0, z0, 128)) + +/* +** bic_255_u16_x: +** and z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (bic_255_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 255), + z0 = svbic_x (p0, z0, 255)) + +/* +** bic_256_u16_x: +** and z0\.h, z0\.h, #0xfeff +** ret +*/ +TEST_UNIFORM_Z (bic_256_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 256), + z0 = svbic_x (p0, z0, 256)) + +/* +** bic_257_u16_x: +** and z0\.h, z0\.h, #0xfefe +** ret +*/ +TEST_UNIFORM_Z (bic_257_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 257), + z0 = svbic_x (p0, z0, 257)) + +/* +** bic_512_u16_x: +** and z0\.h, z0\.h, #0xfdff +** ret +*/ +TEST_UNIFORM_Z (bic_512_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 512), + z0 = svbic_x (p0, z0, 512)) + +/* +** bic_65280_u16_x: +** and z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (bic_65280_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 0xff00), + z0 = svbic_x (p0, z0, 0xff00)) + +/* +** bic_m127_u16_x: +** and z0\.h, z0\.h, #0x7e +** ret +*/ +TEST_UNIFORM_Z (bic_m127_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, -127), + z0 = svbic_x (p0, z0, -127)) + +/* +** bic_m128_u16_x: +** and z0\.h, z0\.h, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_m128_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, -128), + z0 = svbic_x (p0, z0, -128)) + +/* +** bic_m255_u16_x: +** and z0\.h, z0\.h, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_m255_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, -255), + z0 = svbic_x (p0, z0, -255)) + +/* +** bic_m256_u16_x: +** and z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (bic_m256_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, -256), + z0 = svbic_x (p0, z0, -256)) + +/* +** bic_m257_u16_x: +** and z0\.h, z0\.h, #0x100 +** ret +*/ +TEST_UNIFORM_Z (bic_m257_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, -257), + z0 = svbic_x (p0, z0, -257)) + +/* +** bic_m512_u16_x: +** and z0\.h, z0\.h, #0x1ff +** ret +*/ +TEST_UNIFORM_Z (bic_m512_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, -512), + z0 = svbic_x (p0, z0, -512)) + +/* +** bic_m32768_u16_x: +** and z0\.h, z0\.h, #0x7fff +** ret +*/ +TEST_UNIFORM_Z (bic_m32768_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, -0x8000), + z0 = svbic_x (p0, z0, -0x8000)) + +/* +** bic_5_u16_x: +** mov (z[0-9]+)\.h, #-6 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (bic_5_u16_x, svuint16_t, + z0 = svbic_n_u16_x (p0, z0, 5), + z0 = svbic_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u32.c new file mode 100644 index 000000000..b308b599b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u32.c @@ -0,0 +1,363 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_u32_m_tied1: +** bic z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (bic_u32_m_tied1, svuint32_t, + z0 = svbic_u32_m (p0, z0, z1), + z0 = svbic_m (p0, z0, z1)) + +/* +** bic_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** bic z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (bic_u32_m_tied2, svuint32_t, + z0 = svbic_u32_m (p0, z1, z0), + z0 = svbic_m (p0, z1, z0)) + +/* +** bic_u32_m_untied: +** movprfx z0, z1 +** bic z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (bic_u32_m_untied, svuint32_t, 
+ z0 = svbic_u32_m (p0, z1, z2), + z0 = svbic_m (p0, z1, z2)) + +/* +** bic_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** bic z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svbic_n_u32_m (p0, z0, x0), + z0 = svbic_m (p0, z0, x0)) + +/* +** bic_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** bic z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svbic_n_u32_m (p0, z1, x0), + z0 = svbic_m (p0, z1, x0)) + +/* +** bic_1_u32_m_tied1: +** mov (z[0-9]+\.s), #-2 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u32_m_tied1, svuint32_t, + z0 = svbic_n_u32_m (p0, z0, 1), + z0 = svbic_m (p0, z0, 1)) + +/* +** bic_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #-2 +** movprfx z0, z1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u32_m_untied, svuint32_t, + z0 = svbic_n_u32_m (p0, z1, 1), + z0 = svbic_m (p0, z1, 1)) + +/* +** bic_m2_u32_m: +** mov (z[0-9]+\.s), #1 +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_m2_u32_m, svuint32_t, + z0 = svbic_n_u32_m (p0, z0, -2), + z0 = svbic_m (p0, z0, -2)) + +/* +** bic_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** bic z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (bic_u32_z_tied1, svuint32_t, + z0 = svbic_u32_z (p0, z0, z1), + z0 = svbic_z (p0, z0, z1)) + +/* +** bic_u32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** bic z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (bic_u32_z_tied2, svuint32_t, + z0 = svbic_u32_z (p0, z1, z0), + z0 = svbic_z (p0, z1, z0)) + +/* +** bic_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** bic z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (bic_u32_z_untied, svuint32_t, + z0 = svbic_u32_z (p0, z1, z2), + z0 = svbic_z (p0, z1, z2)) + +/* +** bic_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** bic z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svbic_n_u32_z (p0, z0, x0), + z0 = svbic_z (p0, z0, x0)) + +/* +** bic_w0_u32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z1\.s +** bic z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svbic_n_u32_z (p0, z1, x0), + z0 = svbic_z (p0, z1, x0)) + +/* +** bic_1_u32_z_tied1: +** mov (z[0-9]+\.s), #-2 +** movprfx z0\.s, p0/z, z0\.s +** and z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u32_z_tied1, svuint32_t, + z0 = svbic_n_u32_z (p0, z0, 1), + z0 = svbic_z (p0, z0, 1)) + +/* +** bic_1_u32_z_untied: +** mov (z[0-9]+\.s), #-2 +** ( +** movprfx z0\.s, p0/z, z1\.s +** and z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** and z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (bic_1_u32_z_untied, svuint32_t, + z0 = svbic_n_u32_z (p0, z1, 1), + z0 = svbic_z (p0, z1, 1)) + +/* +** bic_u32_x_tied1: +** bic z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u32_x_tied1, svuint32_t, + z0 = svbic_u32_x (p0, z0, z1), + z0 = svbic_x (p0, z0, z1)) + +/* +** bic_u32_x_tied2: +** bic z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u32_x_tied2, svuint32_t, + z0 = svbic_u32_x (p0, z1, z0), + z0 = svbic_x (p0, z1, z0)) + +/* +** bic_u32_x_untied: +** bic z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u32_x_untied, svuint32_t, + z0 = svbic_u32_x (p0, z1, z2), + z0 = svbic_x (p0, z1, z2)) + +/* +** bic_w0_u32_x_tied1: +** mov (z[0-9]+)\.s, w0 +** bic z0\.d, z0\.d, \1\.d +** ret +*/ 
+TEST_UNIFORM_ZX (bic_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svbic_n_u32_x (p0, z0, x0), + z0 = svbic_x (p0, z0, x0)) + +/* +** bic_w0_u32_x_untied: +** mov (z[0-9]+)\.s, w0 +** bic z0\.d, z1\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svbic_n_u32_x (p0, z1, x0), + z0 = svbic_x (p0, z1, x0)) + +/* +** bic_1_u32_x_tied1: +** and z0\.s, z0\.s, #0xfffffffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_u32_x_tied1, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 1), + z0 = svbic_x (p0, z0, 1)) + +/* +** bic_1_u32_x_untied: +** movprfx z0, z1 +** and z0\.s, z0\.s, #0xfffffffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_u32_x_untied, svuint32_t, + z0 = svbic_n_u32_x (p0, z1, 1), + z0 = svbic_x (p0, z1, 1)) + +/* +** bic_127_u32_x: +** and z0\.s, z0\.s, #0xffffff80 +** ret +*/ +TEST_UNIFORM_Z (bic_127_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 127), + z0 = svbic_x (p0, z0, 127)) + +/* +** bic_128_u32_x: +** and z0\.s, z0\.s, #0xffffff7f +** ret +*/ +TEST_UNIFORM_Z (bic_128_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 128), + z0 = svbic_x (p0, z0, 128)) + +/* +** bic_255_u32_x: +** and z0\.s, z0\.s, #0xffffff00 +** ret +*/ +TEST_UNIFORM_Z (bic_255_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 255), + z0 = svbic_x (p0, z0, 255)) + +/* +** bic_256_u32_x: +** and z0\.s, z0\.s, #0xfffffeff +** ret +*/ +TEST_UNIFORM_Z (bic_256_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 256), + z0 = svbic_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (bic_257_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 257), + z0 = svbic_x (p0, z0, 257)) + +/* +** bic_512_u32_x: +** and z0\.s, z0\.s, #0xfffffdff +** ret +*/ +TEST_UNIFORM_Z (bic_512_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 512), + z0 = svbic_x (p0, z0, 512)) + +/* +** bic_65280_u32_x: +** and z0\.s, z0\.s, #0xffff00ff +** ret +*/ +TEST_UNIFORM_Z (bic_65280_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 0xff00), + z0 = svbic_x (p0, z0, 0xff00)) + +/* +** bic_m127_u32_x: +** and z0\.s, z0\.s, #0x7e +** ret +*/ +TEST_UNIFORM_Z (bic_m127_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, -127), + z0 = svbic_x (p0, z0, -127)) + +/* +** bic_m128_u32_x: +** and z0\.s, z0\.s, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_m128_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, -128), + z0 = svbic_x (p0, z0, -128)) + +/* +** bic_m255_u32_x: +** and z0\.s, z0\.s, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_m255_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, -255), + z0 = svbic_x (p0, z0, -255)) + +/* +** bic_m256_u32_x: +** and z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (bic_m256_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, -256), + z0 = svbic_x (p0, z0, -256)) + +/* +** bic_m257_u32_x: +** and z0\.s, z0\.s, #0x100 +** ret +*/ +TEST_UNIFORM_Z (bic_m257_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, -257), + z0 = svbic_x (p0, z0, -257)) + +/* +** bic_m512_u32_x: +** and z0\.s, z0\.s, #0x1ff +** ret +*/ +TEST_UNIFORM_Z (bic_m512_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, -512), + z0 = svbic_x (p0, z0, -512)) + +/* +** bic_m32768_u32_x: +** and z0\.s, z0\.s, #0x7fff +** ret +*/ +TEST_UNIFORM_Z (bic_m32768_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, -0x8000), + z0 = svbic_x (p0, z0, -0x8000)) + +/* +** bic_5_u32_x: +** mov (z[0-9]+)\.s, #-6 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (bic_5_u32_x, svuint32_t, + z0 = svbic_n_u32_x (p0, z0, 5), + z0 = svbic_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u64.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u64.c new file mode 100644 index 000000000..e82db1e94 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u64.c @@ -0,0 +1,363 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_u64_m_tied1: +** bic z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u64_m_tied1, svuint64_t, + z0 = svbic_u64_m (p0, z0, z1), + z0 = svbic_m (p0, z0, z1)) + +/* +** bic_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_u64_m_tied2, svuint64_t, + z0 = svbic_u64_m (p0, z1, z0), + z0 = svbic_m (p0, z1, z0)) + +/* +** bic_u64_m_untied: +** movprfx z0, z1 +** bic z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u64_m_untied, svuint64_t, + z0 = svbic_u64_m (p0, z1, z2), + z0 = svbic_m (p0, z1, z2)) + +/* +** bic_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svbic_n_u64_m (p0, z0, x0), + z0 = svbic_m (p0, z0, x0)) + +/* +** bic_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svbic_n_u64_m (p0, z1, x0), + z0 = svbic_m (p0, z1, x0)) + +/* +** bic_1_u64_m_tied1: +** mov (z[0-9]+\.d), #-2 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u64_m_tied1, svuint64_t, + z0 = svbic_n_u64_m (p0, z0, 1), + z0 = svbic_m (p0, z0, 1)) + +/* +** bic_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #-2 +** movprfx z0, z1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u64_m_untied, svuint64_t, + z0 = svbic_n_u64_m (p0, z1, 1), + z0 = svbic_m (p0, z1, 1)) + +/* +** bic_m2_u64_m: +** mov (z[0-9]+\.d), #1 +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_m2_u64_m, svuint64_t, + z0 = svbic_n_u64_m (p0, z0, -2), + z0 = svbic_m (p0, z0, -2)) + +/* +** bic_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** bic z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u64_z_tied1, svuint64_t, + z0 = svbic_u64_z (p0, z0, z1), + z0 = svbic_z (p0, z0, z1)) + +/* +** bic_u64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_u64_z_tied2, svuint64_t, + z0 = svbic_u64_z (p0, z1, z0), + z0 = svbic_z (p0, z1, z0)) + +/* +** bic_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** bic z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u64_z_untied, svuint64_t, + z0 = svbic_u64_z (p0, z1, z2), + z0 = svbic_z (p0, z1, z2)) + +/* +** bic_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svbic_n_u64_z (p0, z0, x0), + z0 = svbic_z (p0, z0, x0)) + +/* +** bic_x0_u64_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z1\.d +** bic z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svbic_n_u64_z (p0, z1, x0), + z0 = svbic_z (p0, z1, x0)) + +/* +** bic_1_u64_z_tied1: +** mov (z[0-9]+\.d), #-2 +** movprfx z0\.d, p0/z, z0\.d +** and z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u64_z_tied1, svuint64_t, + z0 = svbic_n_u64_z (p0, z0, 1), + z0 = svbic_z (p0, z0, 1)) + +/* +** bic_1_u64_z_untied: +** mov (z[0-9]+\.d), #-2 +** ( +** movprfx z0\.d, p0/z, z1\.d +** and 
z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** and z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (bic_1_u64_z_untied, svuint64_t, + z0 = svbic_n_u64_z (p0, z1, 1), + z0 = svbic_z (p0, z1, 1)) + +/* +** bic_u64_x_tied1: +** bic z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u64_x_tied1, svuint64_t, + z0 = svbic_u64_x (p0, z0, z1), + z0 = svbic_x (p0, z0, z1)) + +/* +** bic_u64_x_tied2: +** bic z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u64_x_tied2, svuint64_t, + z0 = svbic_u64_x (p0, z1, z0), + z0 = svbic_x (p0, z1, z0)) + +/* +** bic_u64_x_untied: +** bic z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u64_x_untied, svuint64_t, + z0 = svbic_u64_x (p0, z1, z2), + z0 = svbic_x (p0, z1, z2)) + +/* +** bic_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** bic z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svbic_n_u64_x (p0, z0, x0), + z0 = svbic_x (p0, z0, x0)) + +/* +** bic_x0_u64_x_untied: +** mov (z[0-9]+\.d), x0 +** bic z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svbic_n_u64_x (p0, z1, x0), + z0 = svbic_x (p0, z1, x0)) + +/* +** bic_1_u64_x_tied1: +** and z0\.d, z0\.d, #0xfffffffffffffffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_u64_x_tied1, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 1), + z0 = svbic_x (p0, z0, 1)) + +/* +** bic_1_u64_x_untied: +** movprfx z0, z1 +** and z0\.d, z0\.d, #0xfffffffffffffffe +** ret +*/ +TEST_UNIFORM_Z (bic_1_u64_x_untied, svuint64_t, + z0 = svbic_n_u64_x (p0, z1, 1), + z0 = svbic_x (p0, z1, 1)) + +/* +** bic_127_u64_x: +** and z0\.d, z0\.d, #0xffffffffffffff80 +** ret +*/ +TEST_UNIFORM_Z (bic_127_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 127), + z0 = svbic_x (p0, z0, 127)) + +/* +** bic_128_u64_x: +** and z0\.d, z0\.d, #0xffffffffffffff7f +** ret +*/ +TEST_UNIFORM_Z (bic_128_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 128), + z0 = svbic_x (p0, z0, 128)) + +/* +** bic_255_u64_x: +** and z0\.d, z0\.d, #0xffffffffffffff00 +** ret +*/ +TEST_UNIFORM_Z (bic_255_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 255), + z0 = svbic_x (p0, z0, 255)) + +/* +** bic_256_u64_x: +** and z0\.d, z0\.d, #0xfffffffffffffeff +** ret +*/ +TEST_UNIFORM_Z (bic_256_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 256), + z0 = svbic_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (bic_257_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 257), + z0 = svbic_x (p0, z0, 257)) + +/* +** bic_512_u64_x: +** and z0\.d, z0\.d, #0xfffffffffffffdff +** ret +*/ +TEST_UNIFORM_Z (bic_512_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 512), + z0 = svbic_x (p0, z0, 512)) + +/* +** bic_65280_u64_x: +** and z0\.d, z0\.d, #0xffffffffffff00ff +** ret +*/ +TEST_UNIFORM_Z (bic_65280_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 0xff00), + z0 = svbic_x (p0, z0, 0xff00)) + +/* +** bic_m127_u64_x: +** and z0\.d, z0\.d, #0x7e +** ret +*/ +TEST_UNIFORM_Z (bic_m127_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, -127), + z0 = svbic_x (p0, z0, -127)) + +/* +** bic_m128_u64_x: +** and z0\.d, z0\.d, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_m128_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, -128), + z0 = svbic_x (p0, z0, -128)) + +/* +** bic_m255_u64_x: +** and z0\.d, z0\.d, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_m255_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, -255), + z0 = svbic_x (p0, z0, -255)) + +/* +** bic_m256_u64_x: +** and z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (bic_m256_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, -256), + z0 = svbic_x (p0, z0, -256)) + +/* +** bic_m257_u64_x: +** and z0\.d, z0\.d, #0x100 +** ret +*/ +TEST_UNIFORM_Z (bic_m257_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, -257), + z0 = svbic_x (p0, z0, -257)) + +/* +** bic_m512_u64_x: +** and z0\.d, z0\.d, #0x1ff +** ret +*/ +TEST_UNIFORM_Z (bic_m512_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, -512), + z0 = svbic_x (p0, z0, -512)) + +/* +** bic_m32768_u64_x: +** and z0\.d, z0\.d, #0x7fff +** ret +*/ +TEST_UNIFORM_Z (bic_m32768_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, -0x8000), + z0 = svbic_x (p0, z0, -0x8000)) + +/* +** bic_5_u64_x: +** mov (z[0-9]+\.d), #-6 +** and z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (bic_5_u64_x, svuint64_t, + z0 = svbic_n_u64_x (p0, z0, 5), + z0 = svbic_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u8.c new file mode 100644 index 000000000..80c489b9c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/bic_u8.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** bic_u8_m_tied1: +** bic z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (bic_u8_m_tied1, svuint8_t, + z0 = svbic_u8_m (p0, z0, z1), + z0 = svbic_m (p0, z0, z1)) + +/* +** bic_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** bic z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (bic_u8_m_tied2, svuint8_t, + z0 = svbic_u8_m (p0, z1, z0), + z0 = svbic_m (p0, z1, z0)) + +/* +** bic_u8_m_untied: +** movprfx z0, z1 +** bic z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (bic_u8_m_untied, svuint8_t, + z0 = svbic_u8_m (p0, z1, z2), + z0 = svbic_m (p0, z1, z2)) + +/* +** bic_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** bic z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svbic_n_u8_m (p0, z0, x0), + z0 = svbic_m (p0, z0, x0)) + +/* +** bic_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** bic z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svbic_n_u8_m (p0, z1, x0), + z0 = svbic_m (p0, z1, x0)) + +/* +** bic_1_u8_m_tied1: +** mov (z[0-9]+\.b), #-2 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u8_m_tied1, svuint8_t, + z0 = 
svbic_n_u8_m (p0, z0, 1), + z0 = svbic_m (p0, z0, 1)) + +/* +** bic_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #-2 +** movprfx z0, z1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u8_m_untied, svuint8_t, + z0 = svbic_n_u8_m (p0, z1, 1), + z0 = svbic_m (p0, z1, 1)) + +/* +** bic_m2_u8_m: +** mov (z[0-9]+\.b), #1 +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_m2_u8_m, svuint8_t, + z0 = svbic_n_u8_m (p0, z0, -2), + z0 = svbic_m (p0, z0, -2)) + +/* +** bic_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** bic z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (bic_u8_z_tied1, svuint8_t, + z0 = svbic_u8_z (p0, z0, z1), + z0 = svbic_z (p0, z0, z1)) + +/* +** bic_u8_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, z1\.b +** bic z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (bic_u8_z_tied2, svuint8_t, + z0 = svbic_u8_z (p0, z1, z0), + z0 = svbic_z (p0, z1, z0)) + +/* +** bic_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** bic z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (bic_u8_z_untied, svuint8_t, + z0 = svbic_u8_z (p0, z1, z2), + z0 = svbic_z (p0, z1, z2)) + +/* +** bic_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** bic z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svbic_n_u8_z (p0, z0, x0), + z0 = svbic_z (p0, z0, x0)) + +/* +** bic_w0_u8_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z1\.b +** bic z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svbic_n_u8_z (p0, z1, x0), + z0 = svbic_z (p0, z1, x0)) + +/* +** bic_1_u8_z_tied1: +** mov (z[0-9]+\.b), #-2 +** movprfx z0\.b, p0/z, z0\.b +** and z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (bic_1_u8_z_tied1, svuint8_t, + z0 = svbic_n_u8_z (p0, z0, 1), + z0 = svbic_z (p0, z0, 1)) + +/* +** bic_1_u8_z_untied: +** mov (z[0-9]+\.b), #-2 +** ( +** movprfx z0\.b, p0/z, z1\.b +** and z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** and z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (bic_1_u8_z_untied, svuint8_t, + z0 = svbic_n_u8_z (p0, z1, 1), + z0 = svbic_z (p0, z1, 1)) + +/* +** bic_u8_x_tied1: +** bic z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u8_x_tied1, svuint8_t, + z0 = svbic_u8_x (p0, z0, z1), + z0 = svbic_x (p0, z0, z1)) + +/* +** bic_u8_x_tied2: +** bic z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u8_x_tied2, svuint8_t, + z0 = svbic_u8_x (p0, z1, z0), + z0 = svbic_x (p0, z1, z0)) + +/* +** bic_u8_x_untied: +** bic z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (bic_u8_x_untied, svuint8_t, + z0 = svbic_u8_x (p0, z1, z2), + z0 = svbic_x (p0, z1, z2)) + +/* +** bic_w0_u8_x_tied1: +** mov (z[0-9]+)\.b, w0 +** bic z0\.d, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svbic_n_u8_x (p0, z0, x0), + z0 = svbic_x (p0, z0, x0)) + +/* +** bic_w0_u8_x_untied: +** mov (z[0-9]+)\.b, w0 +** bic z0\.d, z1\.d, \1\.d +** ret +*/ +TEST_UNIFORM_ZX (bic_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svbic_n_u8_x (p0, z1, x0), + z0 = svbic_x (p0, z1, x0)) + +/* +** bic_1_u8_x_tied1: +** and z0\.b, z0\.b, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_1_u8_x_tied1, svuint8_t, + z0 = svbic_n_u8_x (p0, z0, 1), + z0 = svbic_x (p0, z0, 1)) + +/* +** bic_1_u8_x_untied: +** movprfx z0, z1 +** and z0\.b, z0\.b, #0xfe +** ret +*/ +TEST_UNIFORM_Z (bic_1_u8_x_untied, svuint8_t, + z0 = svbic_n_u8_x (p0, z1, 1), + z0 = svbic_x (p0, z1, 1)) + +/* +** bic_127_u8_x: +** 
and z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (bic_127_u8_x, svuint8_t, + z0 = svbic_n_u8_x (p0, z0, 127), + z0 = svbic_x (p0, z0, 127)) + +/* +** bic_128_u8_x: +** and z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_128_u8_x, svuint8_t, + z0 = svbic_n_u8_x (p0, z0, 128), + z0 = svbic_x (p0, z0, 128)) + +/* +** bic_255_u8_x: +** mov z0\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (bic_255_u8_x, svuint8_t, + z0 = svbic_n_u8_x (p0, z0, 255), + z0 = svbic_x (p0, z0, 255)) + +/* +** bic_m127_u8_x: +** and z0\.b, z0\.b, #0x7e +** ret +*/ +TEST_UNIFORM_Z (bic_m127_u8_x, svuint8_t, + z0 = svbic_n_u8_x (p0, z0, -127), + z0 = svbic_x (p0, z0, -127)) + +/* +** bic_m128_u8_x: +** and z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (bic_m128_u8_x, svuint8_t, + z0 = svbic_n_u8_x (p0, z0, -128), + z0 = svbic_x (p0, z0, -128)) + +/* +** bic_5_u8_x: +** mov (z[0-9]+)\.b, #-6 +** and z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (bic_5_u8_x, svuint8_t, + z0 = svbic_n_u8_x (p0, z0, 5), + z0 = svbic_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brka_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brka_b.c new file mode 100644 index 000000000..63426cf94 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brka_b.c @@ -0,0 +1,54 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** brka_b_m_tied12: +** brka p0\.b, p3/m, p0\.b +** ret +*/ +TEST_UNIFORM_P (brka_b_m_tied12, + p0 = svbrka_b_m (p0, p3, p0), + p0 = svbrka_m (p0, p3, p0)) + +/* +** brka_b_m_tied1: +** brka p0\.b, p3/m, p1\.b +** ret +*/ +TEST_UNIFORM_P (brka_b_m_tied1, + p0 = svbrka_b_m (p0, p3, p1), + p0 = svbrka_m (p0, p3, p1)) + +/* Bad RA choice: no preferred output sequence. */ +TEST_UNIFORM_P (brka_b_m_tied2, + p0 = svbrka_b_m (p1, p3, p0), + p0 = svbrka_m (p1, p3, p0)) + +/* +** brka_b_m_untied: +** mov p0\.b, p2\.b +** brka p0\.b, p3/m, p1\.b +** ret +*/ +TEST_UNIFORM_P (brka_b_m_untied, + p0 = svbrka_b_m (p2, p3, p1), + p0 = svbrka_m (p2, p3, p1)) + +/* +** brka_b_z_tied1: +** brka p0\.b, p3/z, p0\.b +** ret +*/ +TEST_UNIFORM_P (brka_b_z_tied1, + p0 = svbrka_b_z (p3, p0), + p0 = svbrka_z (p3, p0)) + +/* +** brka_b_z_untied: +** brka p0\.b, p3/z, p1\.b +** ret +*/ +TEST_UNIFORM_P (brka_b_z_untied, + p0 = svbrka_b_z (p3, p1), + p0 = svbrka_z (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkb_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkb_b.c new file mode 100644 index 000000000..4f9a2c2d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkb_b.c @@ -0,0 +1,54 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** brkb_b_m_tied12: +** brkb p0\.b, p3/m, p0\.b +** ret +*/ +TEST_UNIFORM_P (brkb_b_m_tied12, + p0 = svbrkb_b_m (p0, p3, p0), + p0 = svbrkb_m (p0, p3, p0)) + +/* +** brkb_b_m_tied1: +** brkb p0\.b, p3/m, p1\.b +** ret +*/ +TEST_UNIFORM_P (brkb_b_m_tied1, + p0 = svbrkb_b_m (p0, p3, p1), + p0 = svbrkb_m (p0, p3, p1)) + +/* Bad RA choice: no preferred output sequence. 
*/ +TEST_UNIFORM_P (brkb_b_m_tied2, + p0 = svbrkb_b_m (p1, p3, p0), + p0 = svbrkb_m (p1, p3, p0)) + +/* +** brkb_b_m_untied: +** mov p0\.b, p2\.b +** brkb p0\.b, p3/m, p1\.b +** ret +*/ +TEST_UNIFORM_P (brkb_b_m_untied, + p0 = svbrkb_b_m (p2, p3, p1), + p0 = svbrkb_m (p2, p3, p1)) + +/* +** brkb_b_z_tied1: +** brkb p0\.b, p3/z, p0\.b +** ret +*/ +TEST_UNIFORM_P (brkb_b_z_tied1, + p0 = svbrkb_b_z (p3, p0), + p0 = svbrkb_z (p3, p0)) + +/* +** brkb_b_z_untied: +** brkb p0\.b, p3/z, p1\.b +** ret +*/ +TEST_UNIFORM_P (brkb_b_z_untied, + p0 = svbrkb_b_z (p3, p1), + p0 = svbrkb_z (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkn_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkn_b.c new file mode 100644 index 000000000..229a5fff9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkn_b.c @@ -0,0 +1,27 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* Bad RA choice: no preferred output sequence. */ +TEST_UNIFORM_P (brkn_b_z_tied1, + p0 = svbrkn_b_z (p3, p0, p1), + p0 = svbrkn_z (p3, p0, p1)) + +/* +** brkn_b_z_tied2: +** brkn p0\.b, p3/z, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (brkn_b_z_tied2, + p0 = svbrkn_b_z (p3, p1, p0), + p0 = svbrkn_z (p3, p1, p0)) + +/* +** brkn_b_z_untied: +** mov p0\.b, p2\.b +** brkn p0\.b, p3/z, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (brkn_b_z_untied, + p0 = svbrkn_b_z (p3, p1, p2), + p0 = svbrkn_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpa_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpa_b.c new file mode 100644 index 000000000..2c074e389 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpa_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** brkpa_b_z_tied1: +** brkpa p0\.b, p3/z, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (brkpa_b_z_tied1, + p0 = svbrkpa_b_z (p3, p0, p1), + p0 = svbrkpa_z (p3, p0, p1)) + +/* +** brkpa_b_z_tied2: +** brkpa p0\.b, p3/z, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (brkpa_b_z_tied2, + p0 = svbrkpa_b_z (p3, p1, p0), + p0 = svbrkpa_z (p3, p1, p0)) + +/* +** brkpa_b_z_untied: +** brkpa p0\.b, p3/z, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (brkpa_b_z_untied, + p0 = svbrkpa_b_z (p3, p1, p2), + p0 = svbrkpa_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpb_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpb_b.c new file mode 100644 index 000000000..b41797ee1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/brkpb_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** brkpb_b_z_tied1: +** brkpb p0\.b, p3/z, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (brkpb_b_z_tied1, + p0 = svbrkpb_b_z (p3, p0, p1), + p0 = svbrkpb_z (p3, p0, p1)) + +/* +** brkpb_b_z_tied2: +** brkpb p0\.b, p3/z, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (brkpb_b_z_tied2, + p0 = svbrkpb_b_z (p3, p1, p0), + p0 = svbrkpb_z (p3, p1, p0)) + +/* +** brkpb_b_z_untied: +** brkpb p0\.b, p3/z, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (brkpb_b_z_untied, + p0 = svbrkpb_b_z (p3, p1, p2), + p0 = svbrkpb_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f16.c new file mode 100644 index 000000000..e89c78455 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f16.c @@ -0,0 +1,251 @@ +/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cadd_90_f16_m_tied1: +** fcadd z0\.h, p0/m, z0\.h, z1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_m_tied1, svfloat16_t, + z0 = svcadd_f16_m (p0, z0, z1, 90), + z0 = svcadd_m (p0, z0, z1, 90)) + +/* +** cadd_90_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcadd z0\.h, p0/m, z0\.h, \1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_m_tied2, svfloat16_t, + z0 = svcadd_f16_m (p0, z1, z0, 90), + z0 = svcadd_m (p0, z1, z0, 90)) + +/* +** cadd_90_f16_m_untied: +** movprfx z0, z1 +** fcadd z0\.h, p0/m, z0\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_m_untied, svfloat16_t, + z0 = svcadd_f16_m (p0, z1, z2, 90), + z0 = svcadd_m (p0, z1, z2, 90)) + +/* +** cadd_270_f16_m_tied1: +** fcadd z0\.h, p0/m, z0\.h, z1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_m_tied1, svfloat16_t, + z0 = svcadd_f16_m (p0, z0, z1, 270), + z0 = svcadd_m (p0, z0, z1, 270)) + +/* +** cadd_270_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcadd z0\.h, p0/m, z0\.h, \1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_m_tied2, svfloat16_t, + z0 = svcadd_f16_m (p0, z1, z0, 270), + z0 = svcadd_m (p0, z1, z0, 270)) + +/* +** cadd_270_f16_m_untied: +** movprfx z0, z1 +** fcadd z0\.h, p0/m, z0\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_m_untied, svfloat16_t, + z0 = svcadd_f16_m (p0, z1, z2, 270), + z0 = svcadd_m (p0, z1, z2, 270)) + +/* +** cadd_90_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fcadd z0\.h, p0/m, z0\.h, z1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_z_tied1, svfloat16_t, + z0 = svcadd_f16_z (p0, z0, z1, 90), + z0 = svcadd_z (p0, z0, z1, 90)) + +/* +** cadd_90_f16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcadd z0\.h, p0/m, z0\.h, \1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_z_tied2, svfloat16_t, + z0 = svcadd_f16_z (p0, z1, z0, 90), + z0 = svcadd_z (p0, z1, z0, 90)) + +/* +** cadd_90_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fcadd z0\.h, p0/m, z0\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_z_untied, svfloat16_t, + z0 = svcadd_f16_z (p0, z1, z2, 90), + z0 = svcadd_z (p0, z1, z2, 90)) + +/* +** cadd_270_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fcadd z0\.h, p0/m, z0\.h, z1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_z_tied1, svfloat16_t, + z0 = svcadd_f16_z (p0, z0, z1, 270), + z0 = svcadd_z (p0, z0, z1, 270)) + +/* +** cadd_270_f16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcadd z0\.h, p0/m, z0\.h, \1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_z_tied2, svfloat16_t, + z0 = svcadd_f16_z (p0, z1, z0, 270), + z0 = svcadd_z (p0, z1, z0, 270)) + +/* +** cadd_270_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fcadd z0\.h, p0/m, z0\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_z_untied, svfloat16_t, + z0 = svcadd_f16_z (p0, z1, z2, 270), + z0 = svcadd_z (p0, z1, z2, 270)) + +/* +** cadd_90_f16_x_tied1: +** fcadd z0\.h, p0/m, z0\.h, z1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_x_tied1, svfloat16_t, + z0 = svcadd_f16_x (p0, z0, z1, 90), + z0 = svcadd_x (p0, z0, z1, 90)) + +/* +** cadd_90_f16_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcadd z0\.h, p0/m, z0\.h, \1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_x_tied2, svfloat16_t, + z0 = svcadd_f16_x (p0, z1, z0, 90), + z0 = svcadd_x (p0, z1, z0, 90)) + +/* +** cadd_90_f16_x_untied: +** movprfx z0, z1 +** fcadd z0\.h, p0/m, z0\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f16_x_untied, 
svfloat16_t, + z0 = svcadd_f16_x (p0, z1, z2, 90), + z0 = svcadd_x (p0, z1, z2, 90)) + +/* +** cadd_270_f16_x_tied1: +** fcadd z0\.h, p0/m, z0\.h, z1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_x_tied1, svfloat16_t, + z0 = svcadd_f16_x (p0, z0, z1, 270), + z0 = svcadd_x (p0, z0, z1, 270)) + +/* +** cadd_270_f16_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcadd z0\.h, p0/m, z0\.h, \1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_x_tied2, svfloat16_t, + z0 = svcadd_f16_x (p0, z1, z0, 270), + z0 = svcadd_x (p0, z1, z0, 270)) + +/* +** cadd_270_f16_x_untied: +** movprfx z0, z1 +** fcadd z0\.h, p0/m, z0\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f16_x_untied, svfloat16_t, + z0 = svcadd_f16_x (p0, z1, z2, 270), + z0 = svcadd_x (p0, z1, z2, 270)) + +/* +** ptrue_cadd_90_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f16_x_tied1, svfloat16_t, + z0 = svcadd_f16_x (svptrue_b16 (), z0, z1, 90), + z0 = svcadd_x (svptrue_b16 (), z0, z1, 90)) + +/* +** ptrue_cadd_90_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f16_x_tied2, svfloat16_t, + z0 = svcadd_f16_x (svptrue_b16 (), z1, z0, 90), + z0 = svcadd_x (svptrue_b16 (), z1, z0, 90)) + +/* +** ptrue_cadd_90_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f16_x_untied, svfloat16_t, + z0 = svcadd_f16_x (svptrue_b16 (), z1, z2, 90), + z0 = svcadd_x (svptrue_b16 (), z1, z2, 90)) + +/* +** ptrue_cadd_270_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f16_x_tied1, svfloat16_t, + z0 = svcadd_f16_x (svptrue_b16 (), z0, z1, 270), + z0 = svcadd_x (svptrue_b16 (), z0, z1, 270)) + +/* +** ptrue_cadd_270_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f16_x_tied2, svfloat16_t, + z0 = svcadd_f16_x (svptrue_b16 (), z1, z0, 270), + z0 = svcadd_x (svptrue_b16 (), z1, z0, 270)) + +/* +** ptrue_cadd_270_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f16_x_untied, svfloat16_t, + z0 = svcadd_f16_x (svptrue_b16 (), z1, z2, 270), + z0 = svcadd_x (svptrue_b16 (), z1, z2, 270)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f32.c new file mode 100644 index 000000000..ed5c16ff3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f32.c @@ -0,0 +1,251 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cadd_90_f32_m_tied1: +** fcadd z0\.s, p0/m, z0\.s, z1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f32_m_tied1, svfloat32_t, + z0 = svcadd_f32_m (p0, z0, z1, 90), + z0 = svcadd_m (p0, z0, z1, 90)) + +/* +** cadd_90_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcadd z0\.s, p0/m, z0\.s, \1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f32_m_tied2, svfloat32_t, + z0 = svcadd_f32_m (p0, z1, z0, 90), + z0 = svcadd_m (p0, z1, z0, 90)) + +/* +** cadd_90_f32_m_untied: +** movprfx z0, z1 +** fcadd z0\.s, p0/m, z0\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f32_m_untied, svfloat32_t, + z0 = svcadd_f32_m (p0, z1, z2, 90), + z0 = svcadd_m (p0, z1, z2, 90)) + +/* +** cadd_270_f32_m_tied1: +** fcadd z0\.s, p0/m, z0\.s, z1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_m_tied1, svfloat32_t, + z0 = svcadd_f32_m (p0, z0, z1, 270), + z0 = svcadd_m (p0, z0, z1, 270)) + +/* +** cadd_270_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcadd z0\.s, p0/m, z0\.s, \1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_m_tied2, svfloat32_t, + z0 = svcadd_f32_m (p0, z1, z0, 270), + z0 = svcadd_m (p0, z1, z0, 270)) + +/* +** cadd_270_f32_m_untied: +** movprfx z0, z1 +** fcadd z0\.s, p0/m, z0\.s, z2\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_m_untied, svfloat32_t, + z0 = svcadd_f32_m (p0, z1, z2, 270), + z0 = svcadd_m (p0, z1, z2, 270)) + +/* +** cadd_90_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fcadd z0\.s, p0/m, z0\.s, z1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f32_z_tied1, svfloat32_t, + z0 = svcadd_f32_z (p0, z0, z1, 90), + z0 = svcadd_z (p0, z0, z1, 90)) + +/* +** cadd_90_f32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcadd z0\.s, p0/m, z0\.s, \1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f32_z_tied2, svfloat32_t, + z0 = svcadd_f32_z (p0, z1, z0, 90), + z0 = svcadd_z (p0, z1, z0, 90)) + +/* +** cadd_90_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fcadd z0\.s, p0/m, z0\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f32_z_untied, svfloat32_t, + z0 = svcadd_f32_z (p0, z1, z2, 90), + z0 = svcadd_z (p0, z1, z2, 90)) + +/* +** cadd_270_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fcadd z0\.s, p0/m, z0\.s, z1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_z_tied1, svfloat32_t, + z0 = svcadd_f32_z (p0, z0, z1, 270), + z0 = svcadd_z (p0, z0, z1, 270)) + +/* +** cadd_270_f32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcadd z0\.s, p0/m, z0\.s, \1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_z_tied2, svfloat32_t, + z0 = svcadd_f32_z (p0, z1, z0, 270), + z0 = svcadd_z (p0, z1, z0, 270)) + +/* +** cadd_270_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fcadd z0\.s, p0/m, z0\.s, z2\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_z_untied, svfloat32_t, + z0 = svcadd_f32_z (p0, z1, z2, 270), + z0 = svcadd_z (p0, z1, z2, 270)) + +/* +** cadd_90_f32_x_tied1: +** fcadd z0\.s, p0/m, z0\.s, z1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z 
(cadd_90_f32_x_tied1, svfloat32_t, + z0 = svcadd_f32_x (p0, z0, z1, 90), + z0 = svcadd_x (p0, z0, z1, 90)) + +/* +** cadd_90_f32_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcadd z0\.s, p0/m, z0\.s, \1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f32_x_tied2, svfloat32_t, + z0 = svcadd_f32_x (p0, z1, z0, 90), + z0 = svcadd_x (p0, z1, z0, 90)) + +/* +** cadd_90_f32_x_untied: +** movprfx z0, z1 +** fcadd z0\.s, p0/m, z0\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f32_x_untied, svfloat32_t, + z0 = svcadd_f32_x (p0, z1, z2, 90), + z0 = svcadd_x (p0, z1, z2, 90)) + +/* +** cadd_270_f32_x_tied1: +** fcadd z0\.s, p0/m, z0\.s, z1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_x_tied1, svfloat32_t, + z0 = svcadd_f32_x (p0, z0, z1, 270), + z0 = svcadd_x (p0, z0, z1, 270)) + +/* +** cadd_270_f32_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcadd z0\.s, p0/m, z0\.s, \1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_x_tied2, svfloat32_t, + z0 = svcadd_f32_x (p0, z1, z0, 270), + z0 = svcadd_x (p0, z1, z0, 270)) + +/* +** cadd_270_f32_x_untied: +** movprfx z0, z1 +** fcadd z0\.s, p0/m, z0\.s, z2\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f32_x_untied, svfloat32_t, + z0 = svcadd_f32_x (p0, z1, z2, 270), + z0 = svcadd_x (p0, z1, z2, 270)) + +/* +** ptrue_cadd_90_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f32_x_tied1, svfloat32_t, + z0 = svcadd_f32_x (svptrue_b32 (), z0, z1, 90), + z0 = svcadd_x (svptrue_b32 (), z0, z1, 90)) + +/* +** ptrue_cadd_90_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f32_x_tied2, svfloat32_t, + z0 = svcadd_f32_x (svptrue_b32 (), z1, z0, 90), + z0 = svcadd_x (svptrue_b32 (), z1, z0, 90)) + +/* +** ptrue_cadd_90_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f32_x_untied, svfloat32_t, + z0 = svcadd_f32_x (svptrue_b32 (), z1, z2, 90), + z0 = svcadd_x (svptrue_b32 (), z1, z2, 90)) + +/* +** ptrue_cadd_270_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f32_x_tied1, svfloat32_t, + z0 = svcadd_f32_x (svptrue_b32 (), z0, z1, 270), + z0 = svcadd_x (svptrue_b32 (), z0, z1, 270)) + +/* +** ptrue_cadd_270_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f32_x_tied2, svfloat32_t, + z0 = svcadd_f32_x (svptrue_b32 (), z1, z0, 270), + z0 = svcadd_x (svptrue_b32 (), z1, z0, 270)) + +/* +** ptrue_cadd_270_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f32_x_untied, svfloat32_t, + z0 = svcadd_f32_x (svptrue_b32 (), z1, z2, 270), + z0 = svcadd_x (svptrue_b32 (), z1, z2, 270)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f64.c new file mode 100644 index 000000000..0ada881c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cadd_f64.c @@ -0,0 +1,251 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cadd_90_f64_m_tied1: +** fcadd z0\.d, p0/m, z0\.d, z1\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_m_tied1, svfloat64_t, + z0 = svcadd_f64_m (p0, z0, z1, 90), + z0 = svcadd_m (p0, z0, z1, 90)) + +/* +** cadd_90_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcadd z0\.d, p0/m, z0\.d, \1, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_m_tied2, svfloat64_t, + z0 = svcadd_f64_m (p0, z1, z0, 90), + z0 = svcadd_m (p0, z1, z0, 90)) + +/* +** cadd_90_f64_m_untied: +** movprfx z0, z1 +** fcadd z0\.d, p0/m, z0\.d, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_m_untied, svfloat64_t, + z0 = svcadd_f64_m (p0, z1, z2, 90), + z0 = svcadd_m (p0, z1, z2, 90)) + +/* +** cadd_270_f64_m_tied1: +** fcadd z0\.d, p0/m, z0\.d, z1\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_m_tied1, svfloat64_t, + z0 = svcadd_f64_m (p0, z0, z1, 270), + z0 = svcadd_m (p0, z0, z1, 270)) + +/* +** cadd_270_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcadd z0\.d, p0/m, z0\.d, \1, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_m_tied2, svfloat64_t, + z0 = svcadd_f64_m (p0, z1, z0, 270), + z0 = svcadd_m (p0, z1, z0, 270)) + +/* +** cadd_270_f64_m_untied: +** movprfx z0, z1 +** fcadd z0\.d, p0/m, z0\.d, z2\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_m_untied, svfloat64_t, + z0 = svcadd_f64_m (p0, z1, z2, 270), + z0 = svcadd_m (p0, z1, z2, 270)) + +/* +** cadd_90_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fcadd z0\.d, p0/m, z0\.d, z1\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_z_tied1, svfloat64_t, + z0 = svcadd_f64_z (p0, z0, z1, 90), + z0 = svcadd_z (p0, z0, z1, 90)) + +/* +** cadd_90_f64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcadd z0\.d, p0/m, z0\.d, \1, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_z_tied2, svfloat64_t, + z0 = svcadd_f64_z (p0, z1, z0, 90), + z0 = svcadd_z (p0, z1, z0, 90)) + +/* +** cadd_90_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fcadd z0\.d, p0/m, z0\.d, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_z_untied, svfloat64_t, + z0 = svcadd_f64_z (p0, z1, z2, 90), + z0 = svcadd_z (p0, z1, z2, 90)) + +/* +** cadd_270_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fcadd z0\.d, p0/m, z0\.d, z1\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_z_tied1, svfloat64_t, + z0 = svcadd_f64_z (p0, z0, z1, 270), + z0 = svcadd_z (p0, z0, z1, 270)) + +/* +** cadd_270_f64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcadd z0\.d, p0/m, z0\.d, \1, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_z_tied2, svfloat64_t, + z0 = svcadd_f64_z (p0, z1, z0, 270), + z0 = svcadd_z (p0, z1, z0, 270)) + +/* +** cadd_270_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fcadd z0\.d, p0/m, z0\.d, z2\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_z_untied, svfloat64_t, + z0 = svcadd_f64_z (p0, z1, z2, 270), + z0 = svcadd_z (p0, z1, z2, 270)) + +/* +** cadd_90_f64_x_tied1: +** fcadd z0\.d, p0/m, z0\.d, z1\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_x_tied1, 
svfloat64_t, + z0 = svcadd_f64_x (p0, z0, z1, 90), + z0 = svcadd_x (p0, z0, z1, 90)) + +/* +** cadd_90_f64_x_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcadd z0\.d, p0/m, z0\.d, \1, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_x_tied2, svfloat64_t, + z0 = svcadd_f64_x (p0, z1, z0, 90), + z0 = svcadd_x (p0, z1, z0, 90)) + +/* +** cadd_90_f64_x_untied: +** movprfx z0, z1 +** fcadd z0\.d, p0/m, z0\.d, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cadd_90_f64_x_untied, svfloat64_t, + z0 = svcadd_f64_x (p0, z1, z2, 90), + z0 = svcadd_x (p0, z1, z2, 90)) + +/* +** cadd_270_f64_x_tied1: +** fcadd z0\.d, p0/m, z0\.d, z1\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_x_tied1, svfloat64_t, + z0 = svcadd_f64_x (p0, z0, z1, 270), + z0 = svcadd_x (p0, z0, z1, 270)) + +/* +** cadd_270_f64_x_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcadd z0\.d, p0/m, z0\.d, \1, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_x_tied2, svfloat64_t, + z0 = svcadd_f64_x (p0, z1, z0, 270), + z0 = svcadd_x (p0, z1, z0, 270)) + +/* +** cadd_270_f64_x_untied: +** movprfx z0, z1 +** fcadd z0\.d, p0/m, z0\.d, z2\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cadd_270_f64_x_untied, svfloat64_t, + z0 = svcadd_f64_x (p0, z1, z2, 270), + z0 = svcadd_x (p0, z1, z2, 270)) + +/* +** ptrue_cadd_90_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f64_x_tied1, svfloat64_t, + z0 = svcadd_f64_x (svptrue_b64 (), z0, z1, 90), + z0 = svcadd_x (svptrue_b64 (), z0, z1, 90)) + +/* +** ptrue_cadd_90_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f64_x_tied2, svfloat64_t, + z0 = svcadd_f64_x (svptrue_b64 (), z1, z0, 90), + z0 = svcadd_x (svptrue_b64 (), z1, z0, 90)) + +/* +** ptrue_cadd_90_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_90_f64_x_untied, svfloat64_t, + z0 = svcadd_f64_x (svptrue_b64 (), z1, z2, 90), + z0 = svcadd_x (svptrue_b64 (), z1, z2, 90)) + +/* +** ptrue_cadd_270_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f64_x_tied1, svfloat64_t, + z0 = svcadd_f64_x (svptrue_b64 (), z0, z1, 270), + z0 = svcadd_x (svptrue_b64 (), z0, z1, 270)) + +/* +** ptrue_cadd_270_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f64_x_tied2, svfloat64_t, + z0 = svcadd_f64_x (svptrue_b64 (), z1, z0, 270), + z0 = svcadd_x (svptrue_b64 (), z1, z0, 270)) + +/* +** ptrue_cadd_270_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cadd_270_f64_x_untied, svfloat64_t, + z0 = svcadd_f64_x (svptrue_b64 (), z1, z2, 270), + z0 = svcadd_x (svptrue_b64 (), z1, z2, 270)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_bf16.c new file mode 100644 index 000000000..a15e34400 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_bf16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_bf16_tied1: +** clasta z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_bf16_tied1, svbfloat16_t, + z0 = svclasta_bf16 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_bf16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_bf16_tied2, svbfloat16_t, + z0 = svclasta_bf16 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_bf16_untied: +** movprfx z0, z1 +** clasta z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_bf16_untied, svbfloat16_t, + z0 = svclasta_bf16 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_d0_bf16: +** clasta h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (clasta_d0_bf16, bfloat16_t, svbfloat16_t, + d0 = svclasta_n_bf16 (p0, d0, z2), + d0 = svclasta (p0, d0, z2)) + +/* +** clasta_d1_bf16: +** mov v0\.h\[0\], v1\.h\[0\] +** clasta h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (clasta_d1_bf16, bfloat16_t, svbfloat16_t, + d0 = svclasta_n_bf16 (p0, d1, z2), + d0 = svclasta (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f16.c new file mode 100644 index 000000000..d9a980f60 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_f16_tied1: +** clasta z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_f16_tied1, svfloat16_t, + z0 = svclasta_f16 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_f16_tied2, svfloat16_t, + z0 = svclasta_f16 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_f16_untied: +** movprfx z0, z1 +** clasta z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_f16_untied, svfloat16_t, + z0 = svclasta_f16 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_d0_f16: +** clasta h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (clasta_d0_f16, float16_t, svfloat16_t, + d0 = svclasta_n_f16 (p0, d0, z2), + d0 = svclasta (p0, d0, z2)) + +/* +** clasta_d1_f16: +** mov v0\.h\[0\], v1\.h\[0\] +** clasta h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (clasta_d1_f16, float16_t, svfloat16_t, + d0 = svclasta_n_f16 (p0, d1, z2), + d0 = svclasta (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f32.c new file mode 100644 index 000000000..cac01fa6d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f32.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_f32_tied1: +** clasta z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_f32_tied1, svfloat32_t, + z0 = svclasta_f32 (p0, z0, 
z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_f32_tied2, svfloat32_t, + z0 = svclasta_f32 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_f32_untied: +** movprfx z0, z1 +** clasta z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_f32_untied, svfloat32_t, + z0 = svclasta_f32 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_d0_f32: +** clasta s0, p0, s0, z2\.s +** ret +*/ +TEST_FOLD_LEFT_D (clasta_d0_f32, float32_t, svfloat32_t, + d0 = svclasta_n_f32 (p0, d0, z2), + d0 = svclasta (p0, d0, z2)) + +/* +** clasta_d1_f32: +** fmov s0, s1 +** clasta s0, p0, s0, z2\.s +** ret +*/ +TEST_FOLD_LEFT_D (clasta_d1_f32, float32_t, svfloat32_t, + d0 = svclasta_n_f32 (p0, d1, z2), + d0 = svclasta (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f64.c new file mode 100644 index 000000000..43b93553b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_f64.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_f64_tied1: +** clasta z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clasta_f64_tied1, svfloat64_t, + z0 = svclasta_f64 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_f64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** clasta z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (clasta_f64_tied2, svfloat64_t, + z0 = svclasta_f64 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_f64_untied: +** movprfx z0, z1 +** clasta z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (clasta_f64_untied, svfloat64_t, + z0 = svclasta_f64 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_d0_f64: +** clasta d0, p0, d0, z2\.d +** ret +*/ +TEST_FOLD_LEFT_D (clasta_d0_f64, float64_t, svfloat64_t, + d0 = svclasta_n_f64 (p0, d0, z2), + d0 = svclasta (p0, d0, z2)) + +/* +** clasta_d1_f64: +** fmov d0, d1 +** clasta d0, p0, d0, z2\.d +** ret +*/ +TEST_FOLD_LEFT_D (clasta_d1_f64, float64_t, svfloat64_t, + d0 = svclasta_n_f64 (p0, d1, z2), + d0 = svclasta (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s16.c new file mode 100644 index 000000000..f5e4f85ce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_s16_tied1: +** clasta z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_s16_tied1, svint16_t, + z0 = svclasta_s16 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_s16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_s16_tied2, svint16_t, + z0 = svclasta_s16 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_s16_untied: +** movprfx z0, z1 +** clasta z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_s16_untied, svint16_t, + z0 = svclasta_s16 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_x0_s16: +** clasta w0, p0, w0, z0\.h +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x0_s16, int16_t, svint16_t, + x0 = svclasta_n_s16 (p0, x0, z0), + x0 = svclasta (p0, x0, z0)) + +/* +** clasta_x1_s16: +** mov w0, w1 +** clasta w0, p0, w0, z0\.h +** ret +*/ +TEST_FOLD_LEFT_X 
(clasta_x1_s16, int16_t, svint16_t, + x0 = svclasta_n_s16 (p0, x1, z0), + x0 = svclasta (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s32.c new file mode 100644 index 000000000..fbd82e778 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s32.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_s32_tied1: +** clasta z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_s32_tied1, svint32_t, + z0 = svclasta_s32 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_s32_tied2, svint32_t, + z0 = svclasta_s32 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_s32_untied: +** movprfx z0, z1 +** clasta z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_s32_untied, svint32_t, + z0 = svclasta_s32 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_x0_s32: +** clasta w0, p0, w0, z0\.s +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x0_s32, int32_t, svint32_t, + x0 = svclasta_n_s32 (p0, x0, z0), + x0 = svclasta (p0, x0, z0)) + +/* +** clasta_x1_s32: +** mov w0, w1 +** clasta w0, p0, w0, z0\.s +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x1_s32, int32_t, svint32_t, + x0 = svclasta_n_s32 (p0, x1, z0), + x0 = svclasta (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s64.c new file mode 100644 index 000000000..08edf157b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s64.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_s64_tied1: +** clasta z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clasta_s64_tied1, svint64_t, + z0 = svclasta_s64 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_s64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** clasta z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (clasta_s64_tied2, svint64_t, + z0 = svclasta_s64 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_s64_untied: +** movprfx z0, z1 +** clasta z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (clasta_s64_untied, svint64_t, + z0 = svclasta_s64 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_x0_s64: +** clasta x0, p0, x0, z0\.d +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x0_s64, int64_t, svint64_t, + x0 = svclasta_n_s64 (p0, x0, z0), + x0 = svclasta (p0, x0, z0)) + +/* +** clasta_x1_s64: +** mov x0, x1 +** clasta x0, p0, x0, z0\.d +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x1_s64, int64_t, svint64_t, + x0 = svclasta_n_s64 (p0, x1, z0), + x0 = svclasta (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s8.c new file mode 100644 index 000000000..286f16a9d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_s8.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_s8_tied1: +** clasta z0\.b, p0, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (clasta_s8_tied1, svint8_t, + z0 = svclasta_s8 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_s8_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.b, p0, z0\.b, \1\.b +** ret +*/ 
+TEST_UNIFORM_Z (clasta_s8_tied2, svint8_t, + z0 = svclasta_s8 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_s8_untied: +** movprfx z0, z1 +** clasta z0\.b, p0, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (clasta_s8_untied, svint8_t, + z0 = svclasta_s8 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_x0_s8: +** clasta w0, p0, w0, z0\.b +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x0_s8, int8_t, svint8_t, + x0 = svclasta_n_s8 (p0, x0, z0), + x0 = svclasta (p0, x0, z0)) + +/* +** clasta_x1_s8: +** mov w0, w1 +** clasta w0, p0, w0, z0\.b +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x1_s8, int8_t, svint8_t, + x0 = svclasta_n_s8 (p0, x1, z0), + x0 = svclasta (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u16.c new file mode 100644 index 000000000..40c6dca90 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_u16_tied1: +** clasta z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_u16_tied1, svuint16_t, + z0 = svclasta_u16 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_u16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_u16_tied2, svuint16_t, + z0 = svclasta_u16 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_u16_untied: +** movprfx z0, z1 +** clasta z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (clasta_u16_untied, svuint16_t, + z0 = svclasta_u16 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_x0_u16: +** clasta w0, p0, w0, z0\.h +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x0_u16, uint16_t, svuint16_t, + x0 = svclasta_n_u16 (p0, x0, z0), + x0 = svclasta (p0, x0, z0)) + +/* +** clasta_x1_u16: +** mov w0, w1 +** clasta w0, p0, w0, z0\.h +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x1_u16, uint16_t, svuint16_t, + x0 = svclasta_n_u16 (p0, x1, z0), + x0 = svclasta (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u32.c new file mode 100644 index 000000000..6c46e13cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u32.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_u32_tied1: +** clasta z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_u32_tied1, svuint32_t, + z0 = svclasta_u32 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_u32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_u32_tied2, svuint32_t, + z0 = svclasta_u32 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_u32_untied: +** movprfx z0, z1 +** clasta z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (clasta_u32_untied, svuint32_t, + z0 = svclasta_u32 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_x0_u32: +** clasta w0, p0, w0, z0\.s +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x0_u32, uint32_t, svuint32_t, + x0 = svclasta_n_u32 (p0, x0, z0), + x0 = svclasta (p0, x0, z0)) + +/* +** clasta_x1_u32: +** mov w0, w1 +** clasta w0, p0, w0, z0\.s +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x1_u32, uint32_t, svuint32_t, + x0 = svclasta_n_u32 (p0, x1, z0), + x0 = svclasta (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u64.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u64.c new file mode 100644 index 000000000..99ad41e50 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u64.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_u64_tied1: +** clasta z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clasta_u64_tied1, svuint64_t, + z0 = svclasta_u64 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_u64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** clasta z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (clasta_u64_tied2, svuint64_t, + z0 = svclasta_u64 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_u64_untied: +** movprfx z0, z1 +** clasta z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (clasta_u64_untied, svuint64_t, + z0 = svclasta_u64 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_x0_u64: +** clasta x0, p0, x0, z0\.d +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x0_u64, uint64_t, svuint64_t, + x0 = svclasta_n_u64 (p0, x0, z0), + x0 = svclasta (p0, x0, z0)) + +/* +** clasta_x1_u64: +** mov x0, x1 +** clasta x0, p0, x0, z0\.d +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x1_u64, uint64_t, svuint64_t, + x0 = svclasta_n_u64 (p0, x1, z0), + x0 = svclasta (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u8.c new file mode 100644 index 000000000..eb438f4ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clasta_u8.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clasta_u8_tied1: +** clasta z0\.b, p0, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (clasta_u8_tied1, svuint8_t, + z0 = svclasta_u8 (p0, z0, z1), + z0 = svclasta (p0, z0, z1)) + +/* +** clasta_u8_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clasta z0\.b, p0, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (clasta_u8_tied2, svuint8_t, + z0 = svclasta_u8 (p0, z1, z0), + z0 = svclasta (p0, z1, z0)) + +/* +** clasta_u8_untied: +** movprfx z0, z1 +** clasta z0\.b, p0, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (clasta_u8_untied, svuint8_t, + z0 = svclasta_u8 (p0, z1, z2), + z0 = svclasta (p0, z1, z2)) + +/* +** clasta_x0_u8: +** clasta w0, p0, w0, z0\.b +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x0_u8, uint8_t, svuint8_t, + x0 = svclasta_n_u8 (p0, x0, z0), + x0 = svclasta (p0, x0, z0)) + +/* +** clasta_x1_u8: +** mov w0, w1 +** clasta w0, p0, w0, z0\.b +** ret +*/ +TEST_FOLD_LEFT_X (clasta_x1_u8, uint8_t, svuint8_t, + x0 = svclasta_n_u8 (p0, x1, z0), + x0 = svclasta (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_bf16.c new file mode 100644 index 000000000..235fd1b4e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_bf16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_bf16_tied1: +** clastb z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_bf16_tied1, svbfloat16_t, + z0 = svclastb_bf16 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_bf16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_bf16_tied2, svbfloat16_t, + z0 = svclastb_bf16 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_bf16_untied: +** movprfx z0, z1 +** clastb 
z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_bf16_untied, svbfloat16_t, + z0 = svclastb_bf16 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_d0_bf16: +** clastb h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (clastb_d0_bf16, bfloat16_t, svbfloat16_t, + d0 = svclastb_n_bf16 (p0, d0, z2), + d0 = svclastb (p0, d0, z2)) + +/* +** clastb_d1_bf16: +** mov v0\.h\[0\], v1\.h\[0\] +** clastb h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (clastb_d1_bf16, bfloat16_t, svbfloat16_t, + d0 = svclastb_n_bf16 (p0, d1, z2), + d0 = svclastb (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f16.c new file mode 100644 index 000000000..e56d7688a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_f16_tied1: +** clastb z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_f16_tied1, svfloat16_t, + z0 = svclastb_f16 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_f16_tied2, svfloat16_t, + z0 = svclastb_f16 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_f16_untied: +** movprfx z0, z1 +** clastb z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_f16_untied, svfloat16_t, + z0 = svclastb_f16 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_d0_f16: +** clastb h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (clastb_d0_f16, float16_t, svfloat16_t, + d0 = svclastb_n_f16 (p0, d0, z2), + d0 = svclastb (p0, d0, z2)) + +/* +** clastb_d1_f16: +** mov v0\.h\[0\], v1\.h\[0\] +** clastb h0, p0, h0, z2\.h +** ret +*/ +TEST_FOLD_LEFT_D (clastb_d1_f16, float16_t, svfloat16_t, + d0 = svclastb_n_f16 (p0, d1, z2), + d0 = svclastb (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f32.c new file mode 100644 index 000000000..c580d1306 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f32.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_f32_tied1: +** clastb z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_f32_tied1, svfloat32_t, + z0 = svclastb_f32 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_f32_tied2, svfloat32_t, + z0 = svclastb_f32 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_f32_untied: +** movprfx z0, z1 +** clastb z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_f32_untied, svfloat32_t, + z0 = svclastb_f32 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_d0_f32: +** clastb s0, p0, s0, z2\.s +** ret +*/ +TEST_FOLD_LEFT_D (clastb_d0_f32, float32_t, svfloat32_t, + d0 = svclastb_n_f32 (p0, d0, z2), + d0 = svclastb (p0, d0, z2)) + +/* +** clastb_d1_f32: +** fmov s0, s1 +** clastb s0, p0, s0, z2\.s +** ret +*/ +TEST_FOLD_LEFT_D (clastb_d1_f32, float32_t, svfloat32_t, + d0 = svclastb_n_f32 (p0, d1, z2), + d0 = svclastb (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f64.c new file mode 100644 index 
000000000..217a76f51 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_f64.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_f64_tied1: +** clastb z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clastb_f64_tied1, svfloat64_t, + z0 = svclastb_f64 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_f64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** clastb z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (clastb_f64_tied2, svfloat64_t, + z0 = svclastb_f64 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_f64_untied: +** movprfx z0, z1 +** clastb z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (clastb_f64_untied, svfloat64_t, + z0 = svclastb_f64 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_d0_f64: +** clastb d0, p0, d0, z2\.d +** ret +*/ +TEST_FOLD_LEFT_D (clastb_d0_f64, float64_t, svfloat64_t, + d0 = svclastb_n_f64 (p0, d0, z2), + d0 = svclastb (p0, d0, z2)) + +/* +** clastb_d1_f64: +** fmov d0, d1 +** clastb d0, p0, d0, z2\.d +** ret +*/ +TEST_FOLD_LEFT_D (clastb_d1_f64, float64_t, svfloat64_t, + d0 = svclastb_n_f64 (p0, d1, z2), + d0 = svclastb (p0, d1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s16.c new file mode 100644 index 000000000..37be28040 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_s16_tied1: +** clastb z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_s16_tied1, svint16_t, + z0 = svclastb_s16 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_s16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_s16_tied2, svint16_t, + z0 = svclastb_s16 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_s16_untied: +** movprfx z0, z1 +** clastb z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_s16_untied, svint16_t, + z0 = svclastb_s16 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_x0_s16: +** clastb w0, p0, w0, z0\.h +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x0_s16, int16_t, svint16_t, + x0 = svclastb_n_s16 (p0, x0, z0), + x0 = svclastb (p0, x0, z0)) + +/* +** clastb_x1_s16: +** mov w0, w1 +** clastb w0, p0, w0, z0\.h +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x1_s16, int16_t, svint16_t, + x0 = svclastb_n_s16 (p0, x1, z0), + x0 = svclastb (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s32.c new file mode 100644 index 000000000..2e56c5a8f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s32.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_s32_tied1: +** clastb z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_s32_tied1, svint32_t, + z0 = svclastb_s32 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_s32_tied2, svint32_t, + z0 = svclastb_s32 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_s32_untied: +** movprfx z0, z1 +** clastb z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_s32_untied, 
svint32_t, + z0 = svclastb_s32 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_x0_s32: +** clastb w0, p0, w0, z0\.s +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x0_s32, int32_t, svint32_t, + x0 = svclastb_n_s32 (p0, x0, z0), + x0 = svclastb (p0, x0, z0)) + +/* +** clastb_x1_s32: +** mov w0, w1 +** clastb w0, p0, w0, z0\.s +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x1_s32, int32_t, svint32_t, + x0 = svclastb_n_s32 (p0, x1, z0), + x0 = svclastb (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s64.c new file mode 100644 index 000000000..9ce210aae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s64.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_s64_tied1: +** clastb z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clastb_s64_tied1, svint64_t, + z0 = svclastb_s64 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_s64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** clastb z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (clastb_s64_tied2, svint64_t, + z0 = svclastb_s64 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_s64_untied: +** movprfx z0, z1 +** clastb z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (clastb_s64_untied, svint64_t, + z0 = svclastb_s64 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_x0_s64: +** clastb x0, p0, x0, z0\.d +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x0_s64, int64_t, svint64_t, + x0 = svclastb_n_s64 (p0, x0, z0), + x0 = svclastb (p0, x0, z0)) + +/* +** clastb_x1_s64: +** mov x0, x1 +** clastb x0, p0, x0, z0\.d +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x1_s64, int64_t, svint64_t, + x0 = svclastb_n_s64 (p0, x1, z0), + x0 = svclastb (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s8.c new file mode 100644 index 000000000..eb76c22cd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_s8.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_s8_tied1: +** clastb z0\.b, p0, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (clastb_s8_tied1, svint8_t, + z0 = svclastb_s8 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_s8_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.b, p0, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (clastb_s8_tied2, svint8_t, + z0 = svclastb_s8 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_s8_untied: +** movprfx z0, z1 +** clastb z0\.b, p0, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (clastb_s8_untied, svint8_t, + z0 = svclastb_s8 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_x0_s8: +** clastb w0, p0, w0, z0\.b +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x0_s8, int8_t, svint8_t, + x0 = svclastb_n_s8 (p0, x0, z0), + x0 = svclastb (p0, x0, z0)) + +/* +** clastb_x1_s8: +** mov w0, w1 +** clastb w0, p0, w0, z0\.b +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x1_s8, int8_t, svint8_t, + x0 = svclastb_n_s8 (p0, x1, z0), + x0 = svclastb (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u16.c new file mode 100644 index 000000000..5aea9c7bd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + 
+#include "test_sve_acle.h" + +/* +** clastb_u16_tied1: +** clastb z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_u16_tied1, svuint16_t, + z0 = svclastb_u16 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_u16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_u16_tied2, svuint16_t, + z0 = svclastb_u16 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_u16_untied: +** movprfx z0, z1 +** clastb z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (clastb_u16_untied, svuint16_t, + z0 = svclastb_u16 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_x0_u16: +** clastb w0, p0, w0, z0\.h +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x0_u16, uint16_t, svuint16_t, + x0 = svclastb_n_u16 (p0, x0, z0), + x0 = svclastb (p0, x0, z0)) + +/* +** clastb_x1_u16: +** mov w0, w1 +** clastb w0, p0, w0, z0\.h +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x1_u16, uint16_t, svuint16_t, + x0 = svclastb_n_u16 (p0, x1, z0), + x0 = svclastb (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u32.c new file mode 100644 index 000000000..47fcf4f27 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u32.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_u32_tied1: +** clastb z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_u32_tied1, svuint32_t, + z0 = svclastb_u32 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_u32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_u32_tied2, svuint32_t, + z0 = svclastb_u32 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_u32_untied: +** movprfx z0, z1 +** clastb z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (clastb_u32_untied, svuint32_t, + z0 = svclastb_u32 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_x0_u32: +** clastb w0, p0, w0, z0\.s +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x0_u32, uint32_t, svuint32_t, + x0 = svclastb_n_u32 (p0, x0, z0), + x0 = svclastb (p0, x0, z0)) + +/* +** clastb_x1_u32: +** mov w0, w1 +** clastb w0, p0, w0, z0\.s +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x1_u32, uint32_t, svuint32_t, + x0 = svclastb_n_u32 (p0, x1, z0), + x0 = svclastb (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u64.c new file mode 100644 index 000000000..fb57afe85 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u64.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_u64_tied1: +** clastb z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clastb_u64_tied1, svuint64_t, + z0 = svclastb_u64 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_u64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** clastb z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (clastb_u64_tied2, svuint64_t, + z0 = svclastb_u64 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_u64_untied: +** movprfx z0, z1 +** clastb z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (clastb_u64_untied, svuint64_t, + z0 = svclastb_u64 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_x0_u64: +** clastb x0, p0, x0, z0\.d +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x0_u64, uint64_t, 
svuint64_t, + x0 = svclastb_n_u64 (p0, x0, z0), + x0 = svclastb (p0, x0, z0)) + +/* +** clastb_x1_u64: +** mov x0, x1 +** clastb x0, p0, x0, z0\.d +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x1_u64, uint64_t, svuint64_t, + x0 = svclastb_n_u64 (p0, x1, z0), + x0 = svclastb (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u8.c new file mode 100644 index 000000000..f3ca84920 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clastb_u8.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clastb_u8_tied1: +** clastb z0\.b, p0, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (clastb_u8_tied1, svuint8_t, + z0 = svclastb_u8 (p0, z0, z1), + z0 = svclastb (p0, z0, z1)) + +/* +** clastb_u8_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clastb z0\.b, p0, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (clastb_u8_tied2, svuint8_t, + z0 = svclastb_u8 (p0, z1, z0), + z0 = svclastb (p0, z1, z0)) + +/* +** clastb_u8_untied: +** movprfx z0, z1 +** clastb z0\.b, p0, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (clastb_u8_untied, svuint8_t, + z0 = svclastb_u8 (p0, z1, z2), + z0 = svclastb (p0, z1, z2)) + +/* +** clastb_x0_u8: +** clastb w0, p0, w0, z0\.b +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x0_u8, uint8_t, svuint8_t, + x0 = svclastb_n_u8 (p0, x0, z0), + x0 = svclastb (p0, x0, z0)) + +/* +** clastb_x1_u8: +** mov w0, w1 +** clastb w0, p0, w0, z0\.b +** ret +*/ +TEST_FOLD_LEFT_X (clastb_x1_u8, uint8_t, svuint8_t, + x0 = svclastb_n_u8 (p0, x1, z0), + x0 = svclastb (p0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s16.c new file mode 100644 index 000000000..7af312397 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s16.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cls_s16_m_tied1: +** cls z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cls_s16_m_tied1, svuint16_t, svint16_t, + z0 = svcls_s16_m (z0, p0, z4), + z0 = svcls_m (z0, p0, z4)) + +/* +** cls_s16_m_untied: +** movprfx z0, z1 +** cls z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cls_s16_m_untied, svuint16_t, svint16_t, + z0 = svcls_s16_m (z1, p0, z4), + z0 = svcls_m (z1, p0, z4)) + +/* +** cls_s16_z: +** movprfx z0\.h, p0/z, z4\.h +** cls z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cls_s16_z, svuint16_t, svint16_t, + z0 = svcls_s16_z (p0, z4), + z0 = svcls_z (p0, z4)) + +/* +** cls_s16_x: +** cls z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cls_s16_x, svuint16_t, svint16_t, + z0 = svcls_s16_x (p0, z4), + z0 = svcls_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s32.c new file mode 100644 index 000000000..813876f68 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s32.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cls_s32_m_tied1: +** cls z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cls_s32_m_tied1, svuint32_t, svint32_t, + z0 = svcls_s32_m (z0, p0, z4), + z0 = svcls_m (z0, p0, z4)) + +/* +** cls_s32_m_untied: +** movprfx z0, z1 +** cls z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cls_s32_m_untied, svuint32_t, svint32_t, + z0 = svcls_s32_m (z1, p0, z4), + z0 = svcls_m (z1, p0, z4)) + +/* +** cls_s32_z: +** movprfx z0\.s, p0/z, 
z4\.s +** cls z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cls_s32_z, svuint32_t, svint32_t, + z0 = svcls_s32_z (p0, z4), + z0 = svcls_z (p0, z4)) + +/* +** cls_s32_x: +** cls z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cls_s32_x, svuint32_t, svint32_t, + z0 = svcls_s32_x (p0, z4), + z0 = svcls_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s64.c new file mode 100644 index 000000000..660a20556 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s64.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cls_s64_m_tied1: +** cls z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cls_s64_m_tied1, svuint64_t, svint64_t, + z0 = svcls_s64_m (z0, p0, z4), + z0 = svcls_m (z0, p0, z4)) + +/* +** cls_s64_m_untied: +** movprfx z0, z1 +** cls z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cls_s64_m_untied, svuint64_t, svint64_t, + z0 = svcls_s64_m (z1, p0, z4), + z0 = svcls_m (z1, p0, z4)) + +/* +** cls_s64_z: +** movprfx z0\.d, p0/z, z4\.d +** cls z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cls_s64_z, svuint64_t, svint64_t, + z0 = svcls_s64_z (p0, z4), + z0 = svcls_z (p0, z4)) + +/* +** cls_s64_x: +** cls z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cls_s64_x, svuint64_t, svint64_t, + z0 = svcls_s64_x (p0, z4), + z0 = svcls_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s8.c new file mode 100644 index 000000000..56f5c2608 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cls_s8.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cls_s8_m_tied1: +** cls z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (cls_s8_m_tied1, svuint8_t, svint8_t, + z0 = svcls_s8_m (z0, p0, z4), + z0 = svcls_m (z0, p0, z4)) + +/* +** cls_s8_m_untied: +** movprfx z0, z1 +** cls z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (cls_s8_m_untied, svuint8_t, svint8_t, + z0 = svcls_s8_m (z1, p0, z4), + z0 = svcls_m (z1, p0, z4)) + +/* +** cls_s8_z: +** movprfx z0\.b, p0/z, z4\.b +** cls z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (cls_s8_z, svuint8_t, svint8_t, + z0 = svcls_s8_z (p0, z4), + z0 = svcls_z (p0, z4)) + +/* +** cls_s8_x: +** cls z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (cls_s8_x, svuint8_t, svint8_t, + z0 = svcls_s8_x (p0, z4), + z0 = svcls_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s16.c new file mode 100644 index 000000000..58f89005c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s16.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clz_s16_m_tied1: +** clz z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (clz_s16_m_tied1, svuint16_t, svint16_t, + z0 = svclz_s16_m (z0, p0, z4), + z0 = svclz_m (z0, p0, z4)) + +/* +** clz_s16_m_untied: +** movprfx z0, z1 +** clz z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (clz_s16_m_untied, svuint16_t, svint16_t, + z0 = svclz_s16_m (z1, p0, z4), + z0 = svclz_m (z1, p0, z4)) + +/* +** clz_s16_z: +** movprfx z0\.h, p0/z, z4\.h +** clz z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (clz_s16_z, svuint16_t, svint16_t, + z0 = svclz_s16_z (p0, z4), + z0 = svclz_z (p0, z4)) + +/* +** clz_s16_x: +** clz z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (clz_s16_x, svuint16_t, svint16_t, + z0 
= svclz_s16_x (p0, z4), + z0 = svclz_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s32.c new file mode 100644 index 000000000..a9198070b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s32.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clz_s32_m_tied1: +** clz z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (clz_s32_m_tied1, svuint32_t, svint32_t, + z0 = svclz_s32_m (z0, p0, z4), + z0 = svclz_m (z0, p0, z4)) + +/* +** clz_s32_m_untied: +** movprfx z0, z1 +** clz z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (clz_s32_m_untied, svuint32_t, svint32_t, + z0 = svclz_s32_m (z1, p0, z4), + z0 = svclz_m (z1, p0, z4)) + +/* +** clz_s32_z: +** movprfx z0\.s, p0/z, z4\.s +** clz z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (clz_s32_z, svuint32_t, svint32_t, + z0 = svclz_s32_z (p0, z4), + z0 = svclz_z (p0, z4)) + +/* +** clz_s32_x: +** clz z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (clz_s32_x, svuint32_t, svint32_t, + z0 = svclz_s32_x (p0, z4), + z0 = svclz_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s64.c new file mode 100644 index 000000000..02c0c993e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s64.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clz_s64_m_tied1: +** clz z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (clz_s64_m_tied1, svuint64_t, svint64_t, + z0 = svclz_s64_m (z0, p0, z4), + z0 = svclz_m (z0, p0, z4)) + +/* +** clz_s64_m_untied: +** movprfx z0, z1 +** clz z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (clz_s64_m_untied, svuint64_t, svint64_t, + z0 = svclz_s64_m (z1, p0, z4), + z0 = svclz_m (z1, p0, z4)) + +/* +** clz_s64_z: +** movprfx z0\.d, p0/z, z4\.d +** clz z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (clz_s64_z, svuint64_t, svint64_t, + z0 = svclz_s64_z (p0, z4), + z0 = svclz_z (p0, z4)) + +/* +** clz_s64_x: +** clz z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (clz_s64_x, svuint64_t, svint64_t, + z0 = svclz_s64_x (p0, z4), + z0 = svclz_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s8.c new file mode 100644 index 000000000..642d298c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_s8.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clz_s8_m_tied1: +** clz z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (clz_s8_m_tied1, svuint8_t, svint8_t, + z0 = svclz_s8_m (z0, p0, z4), + z0 = svclz_m (z0, p0, z4)) + +/* +** clz_s8_m_untied: +** movprfx z0, z1 +** clz z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (clz_s8_m_untied, svuint8_t, svint8_t, + z0 = svclz_s8_m (z1, p0, z4), + z0 = svclz_m (z1, p0, z4)) + +/* +** clz_s8_z: +** movprfx z0\.b, p0/z, z4\.b +** clz z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (clz_s8_z, svuint8_t, svint8_t, + z0 = svclz_s8_z (p0, z4), + z0 = svclz_z (p0, z4)) + +/* +** clz_s8_x: +** clz z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (clz_s8_x, svuint8_t, svint8_t, + z0 = svclz_s8_x (p0, z4), + z0 = svclz_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u16.c new file mode 100644 index 000000000..f08723017 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clz_u16_m_tied12: +** clz z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (clz_u16_m_tied12, svuint16_t, + z0 = svclz_u16_m (z0, p0, z0), + z0 = svclz_m (z0, p0, z0)) + +/* +** clz_u16_m_tied1: +** clz z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clz_u16_m_tied1, svuint16_t, + z0 = svclz_u16_m (z0, p0, z1), + z0 = svclz_m (z0, p0, z1)) + +/* +** clz_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clz z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clz_u16_m_tied2, svuint16_t, + z0 = svclz_u16_m (z1, p0, z0), + z0 = svclz_m (z1, p0, z0)) + +/* +** clz_u16_m_untied: +** movprfx z0, z2 +** clz z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clz_u16_m_untied, svuint16_t, + z0 = svclz_u16_m (z2, p0, z1), + z0 = svclz_m (z2, p0, z1)) + +/* +** clz_u16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** clz z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (clz_u16_z_tied1, svuint16_t, + z0 = svclz_u16_z (p0, z0), + z0 = svclz_z (p0, z0)) + +/* +** clz_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** clz z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clz_u16_z_untied, svuint16_t, + z0 = svclz_u16_z (p0, z1), + z0 = svclz_z (p0, z1)) + +/* +** clz_u16_x_tied1: +** clz z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (clz_u16_x_tied1, svuint16_t, + z0 = svclz_u16_x (p0, z0), + z0 = svclz_x (p0, z0)) + +/* +** clz_u16_x_untied: +** clz z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (clz_u16_x_untied, svuint16_t, + z0 = svclz_u16_x (p0, z1), + z0 = svclz_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u32.c new file mode 100644 index 000000000..e00424131 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clz_u32_m_tied12: +** clz z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (clz_u32_m_tied12, svuint32_t, + z0 = svclz_u32_m (z0, p0, z0), + z0 = svclz_m (z0, p0, z0)) + +/* +** clz_u32_m_tied1: +** clz z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clz_u32_m_tied1, svuint32_t, + z0 = svclz_u32_m (z0, p0, z1), + z0 = svclz_m (z0, p0, z1)) + +/* +** clz_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clz z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (clz_u32_m_tied2, svuint32_t, + z0 = svclz_u32_m (z1, p0, z0), + z0 = svclz_m (z1, p0, z0)) + +/* +** clz_u32_m_untied: +** movprfx z0, z2 +** clz z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clz_u32_m_untied, svuint32_t, + z0 = svclz_u32_m (z2, p0, z1), + z0 = svclz_m (z2, p0, z1)) + +/* +** clz_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** clz z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (clz_u32_z_tied1, svuint32_t, + z0 = svclz_u32_z (p0, z0), + z0 = svclz_z (p0, z0)) + +/* +** clz_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** clz z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clz_u32_z_untied, svuint32_t, + z0 = svclz_u32_z (p0, z1), + z0 = svclz_z (p0, z1)) + +/* +** clz_u32_x_tied1: +** clz z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (clz_u32_x_tied1, svuint32_t, + z0 = svclz_u32_x (p0, z0), + z0 = svclz_x (p0, z0)) + +/* +** clz_u32_x_untied: +** clz z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (clz_u32_x_untied, svuint32_t, + z0 = svclz_u32_x 
(p0, z1), + z0 = svclz_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u64.c new file mode 100644 index 000000000..e879e1b9a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clz_u64_m_tied12: +** clz z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (clz_u64_m_tied12, svuint64_t, + z0 = svclz_u64_m (z0, p0, z0), + z0 = svclz_m (z0, p0, z0)) + +/* +** clz_u64_m_tied1: +** clz z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clz_u64_m_tied1, svuint64_t, + z0 = svclz_u64_m (z0, p0, z1), + z0 = svclz_m (z0, p0, z1)) + +/* +** clz_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** clz z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (clz_u64_m_tied2, svuint64_t, + z0 = svclz_u64_m (z1, p0, z0), + z0 = svclz_m (z1, p0, z0)) + +/* +** clz_u64_m_untied: +** movprfx z0, z2 +** clz z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clz_u64_m_untied, svuint64_t, + z0 = svclz_u64_m (z2, p0, z1), + z0 = svclz_m (z2, p0, z1)) + +/* +** clz_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** clz z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (clz_u64_z_tied1, svuint64_t, + z0 = svclz_u64_z (p0, z0), + z0 = svclz_z (p0, z0)) + +/* +** clz_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** clz z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clz_u64_z_untied, svuint64_t, + z0 = svclz_u64_z (p0, z1), + z0 = svclz_z (p0, z1)) + +/* +** clz_u64_x_tied1: +** clz z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (clz_u64_x_tied1, svuint64_t, + z0 = svclz_u64_x (p0, z0), + z0 = svclz_x (p0, z0)) + +/* +** clz_u64_x_untied: +** clz z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (clz_u64_x_untied, svuint64_t, + z0 = svclz_u64_x (p0, z1), + z0 = svclz_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u8.c new file mode 100644 index 000000000..ce6cb8f45 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/clz_u8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** clz_u8_m_tied12: +** clz z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (clz_u8_m_tied12, svuint8_t, + z0 = svclz_u8_m (z0, p0, z0), + z0 = svclz_m (z0, p0, z0)) + +/* +** clz_u8_m_tied1: +** clz z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (clz_u8_m_tied1, svuint8_t, + z0 = svclz_u8_m (z0, p0, z1), + z0 = svclz_m (z0, p0, z1)) + +/* +** clz_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** clz z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (clz_u8_m_tied2, svuint8_t, + z0 = svclz_u8_m (z1, p0, z0), + z0 = svclz_m (z1, p0, z0)) + +/* +** clz_u8_m_untied: +** movprfx z0, z2 +** clz z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (clz_u8_m_untied, svuint8_t, + z0 = svclz_u8_m (z2, p0, z1), + z0 = svclz_m (z2, p0, z1)) + +/* +** clz_u8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** clz z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (clz_u8_z_tied1, svuint8_t, + z0 = svclz_u8_z (p0, z0), + z0 = svclz_z (p0, z0)) + +/* +** clz_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** clz z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (clz_u8_z_untied, svuint8_t, + z0 = svclz_u8_z (p0, z1), + z0 = svclz_z (p0, z1)) + +/* +** clz_u8_x_tied1: +** clz z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (clz_u8_x_tied1, svuint8_t, 
+ z0 = svclz_u8_x (p0, z0), + z0 = svclz_x (p0, z0)) + +/* +** clz_u8_x_untied: +** clz z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (clz_u8_x_untied, svuint8_t, + z0 = svclz_u8_x (p0, z1), + z0 = svclz_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f16.c new file mode 100644 index 000000000..3bf44a59f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f16.c @@ -0,0 +1,675 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmla_0_f16_m_tied1: +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_m_tied1, svfloat16_t, + z0 = svcmla_f16_m (p0, z0, z1, z2, 0), + z0 = svcmla_m (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_m_tied2, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z0, z2, 0), + z0 = svcmla_m (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_m_tied3, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z2, z0, 0), + z0 = svcmla_m (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f16_m_untied: +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_m_untied, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z2, z3, 0), + z0 = svcmla_m (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f16_m_tied1: +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_m_tied1, svfloat16_t, + z0 = svcmla_f16_m (p0, z0, z1, z2, 90), + z0 = svcmla_m (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_m_tied2, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z0, z2, 90), + z0 = svcmla_m (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_m_tied3, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z2, z0, 90), + z0 = svcmla_m (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f16_m_untied: +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_m_untied, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z2, z3, 90), + z0 = svcmla_m (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f16_m_tied1: +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_m_tied1, svfloat16_t, + z0 = svcmla_f16_m (p0, z0, z1, z2, 180), + z0 = svcmla_m (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_m_tied2, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z0, z2, 180), + z0 = svcmla_m (p0, z1, z0, z2, 180)) + +/* +** cmla_180_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_m_tied3, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z2, z0, 180), + z0 = svcmla_m (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f16_m_untied: +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_m_untied, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z2, z3, 180), + z0 = svcmla_m (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f16_m_tied1: +** fcmla 
z0\.h, p0/m, z1\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_m_tied1, svfloat16_t, + z0 = svcmla_f16_m (p0, z0, z1, z2, 270), + z0 = svcmla_m (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_m_tied2, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z0, z2, 270), + z0 = svcmla_m (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_m_tied3, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z2, z0, 270), + z0 = svcmla_m (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f16_m_untied: +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_m_untied, svfloat16_t, + z0 = svcmla_f16_m (p0, z1, z2, z3, 270), + z0 = svcmla_m (p0, z1, z2, z3, 270)) + +/* +** cmla_0_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_z_tied1, svfloat16_t, + z0 = svcmla_f16_z (p0, z0, z1, z2, 0), + z0 = svcmla_z (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_z_tied2, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z0, z2, 0), + z0 = svcmla_z (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f16_z_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_z_tied3, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z2, z0, 0), + z0 = svcmla_z (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_z_untied, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z2, z3, 0), + z0 = svcmla_z (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_z_tied1, svfloat16_t, + z0 = svcmla_f16_z (p0, z0, z1, z2, 90), + z0 = svcmla_z (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_z_tied2, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z0, z2, 90), + z0 = svcmla_z (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f16_z_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_z_tied3, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z2, z0, 90), + z0 = svcmla_z (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_z_untied, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z2, z3, 90), + z0 = svcmla_z (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_z_tied1, svfloat16_t, + z0 = svcmla_f16_z (p0, z0, z1, z2, 180), + z0 = svcmla_z (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_z_tied2, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z0, z2, 180), + z0 = svcmla_z (p0, z1, z0, z2, 180)) + +/* 
+** cmla_180_f16_z_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_z_tied3, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z2, z0, 180), + z0 = svcmla_z (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_z_untied, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z2, z3, 180), + z0 = svcmla_z (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_z_tied1, svfloat16_t, + z0 = svcmla_f16_z (p0, z0, z1, z2, 270), + z0 = svcmla_z (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_z_tied2, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z0, z2, 270), + z0 = svcmla_z (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f16_z_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_z_tied3, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z2, z0, 270), + z0 = svcmla_z (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_z_untied, svfloat16_t, + z0 = svcmla_f16_z (p0, z1, z2, z3, 270), + z0 = svcmla_z (p0, z1, z2, z3, 270)) + +/* +** cmla_0_f16_x_tied1: +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_x_tied1, svfloat16_t, + z0 = svcmla_f16_x (p0, z0, z1, z2, 0), + z0 = svcmla_x (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f16_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_x_tied2, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z0, z2, 0), + z0 = svcmla_x (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f16_x_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_x_tied3, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z2, z0, 0), + z0 = svcmla_x (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f16_x_untied: +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f16_x_untied, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z2, z3, 0), + z0 = svcmla_x (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f16_x_tied1: +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_x_tied1, svfloat16_t, + z0 = svcmla_f16_x (p0, z0, z1, z2, 90), + z0 = svcmla_x (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f16_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_x_tied2, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z0, z2, 90), + z0 = svcmla_x (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f16_x_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_x_tied3, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z2, z0, 90), + z0 = svcmla_x (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f16_x_untied: +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f16_x_untied, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z2, z3, 90), + z0 = svcmla_x (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f16_x_tied1: +** 
fcmla z0\.h, p0/m, z1\.h, z2\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_x_tied1, svfloat16_t, + z0 = svcmla_f16_x (p0, z0, z1, z2, 180), + z0 = svcmla_x (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f16_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_x_tied2, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z0, z2, 180), + z0 = svcmla_x (p0, z1, z0, z2, 180)) + +/* +** cmla_180_f16_x_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_x_tied3, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z2, z0, 180), + z0 = svcmla_x (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f16_x_untied: +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f16_x_untied, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z2, z3, 180), + z0 = svcmla_x (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f16_x_tied1: +** fcmla z0\.h, p0/m, z1\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_x_tied1, svfloat16_t, + z0 = svcmla_f16_x (p0, z0, z1, z2, 270), + z0 = svcmla_x (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f16_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, \1\.h, z2\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_x_tied2, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z0, z2, 270), + z0 = svcmla_x (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f16_x_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, \1\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_x_tied3, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z2, z0, 270), + z0 = svcmla_x (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f16_x_untied: +** movprfx z0, z1 +** fcmla z0\.h, p0/m, z2\.h, z3\.h, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f16_x_untied, svfloat16_t, + z0 = svcmla_f16_x (p0, z1, z2, z3, 270), + z0 = svcmla_x (p0, z1, z2, z3, 270)) + +/* +** ptrue_cmla_0_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f16_x_tied1, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z0, z1, z2, 0), + z0 = svcmla_x (svptrue_b16 (), z0, z1, z2, 0)) + +/* +** ptrue_cmla_0_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f16_x_tied2, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z0, z2, 0), + z0 = svcmla_x (svptrue_b16 (), z1, z0, z2, 0)) + +/* +** ptrue_cmla_0_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f16_x_tied3, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z0, 0), + z0 = svcmla_x (svptrue_b16 (), z1, z2, z0, 0)) + +/* +** ptrue_cmla_0_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f16_x_untied, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z3, 0), + z0 = svcmla_x (svptrue_b16 (), z1, z2, z3, 0)) + +/* +** ptrue_cmla_90_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f16_x_tied1, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z0, z1, z2, 90), + z0 = svcmla_x (svptrue_b16 (), z0, z1, z2, 90)) + +/* +** ptrue_cmla_90_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f16_x_tied2, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z0, z2, 90), + z0 = svcmla_x (svptrue_b16 (), z1, z0, z2, 90)) + +/* +** ptrue_cmla_90_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f16_x_tied3, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z0, 90), + z0 = svcmla_x (svptrue_b16 (), z1, z2, z0, 90)) + +/* +** ptrue_cmla_90_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f16_x_untied, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z3, 90), + z0 = svcmla_x (svptrue_b16 (), z1, z2, z3, 90)) + +/* +** ptrue_cmla_180_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f16_x_tied1, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z0, z1, z2, 180), + z0 = svcmla_x (svptrue_b16 (), z0, z1, z2, 180)) + +/* +** ptrue_cmla_180_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f16_x_tied2, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z0, z2, 180), + z0 = svcmla_x (svptrue_b16 (), z1, z0, z2, 180)) + +/* +** ptrue_cmla_180_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f16_x_tied3, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z0, 180), + z0 = svcmla_x (svptrue_b16 (), z1, z2, z0, 180)) + +/* +** ptrue_cmla_180_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f16_x_untied, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z3, 180), + z0 = svcmla_x (svptrue_b16 (), z1, z2, z3, 180)) + +/* +** ptrue_cmla_270_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f16_x_tied1, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z0, z1, z2, 270), + z0 = svcmla_x (svptrue_b16 (), z0, z1, z2, 270)) + +/* +** ptrue_cmla_270_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f16_x_tied2, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z0, z2, 270), + z0 = svcmla_x (svptrue_b16 (), z1, z0, z2, 270)) + +/* +** ptrue_cmla_270_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f16_x_tied3, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z0, 270), + z0 = svcmla_x (svptrue_b16 (), z1, z2, z0, 270)) + +/* +** ptrue_cmla_270_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f16_x_untied, svfloat16_t, + z0 = svcmla_f16_x (svptrue_b16 (), z1, z2, z3, 270), + z0 = svcmla_x (svptrue_b16 (), z1, z2, z3, 270)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f32.c new file mode 100644 index 000000000..b266738b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f32.c @@ -0,0 +1,675 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmla_0_f32_m_tied1: +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_m_tied1, svfloat32_t, + z0 = svcmla_f32_m (p0, z0, z1, z2, 0), + z0 = svcmla_m (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_m_tied2, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z0, z2, 0), + z0 = svcmla_m (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_m_tied3, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z2, z0, 0), + z0 = svcmla_m (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f32_m_untied: +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_m_untied, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z2, z3, 0), + z0 = svcmla_m (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f32_m_tied1: +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_m_tied1, svfloat32_t, + z0 = svcmla_f32_m (p0, z0, z1, z2, 90), + z0 = svcmla_m (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_m_tied2, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z0, z2, 90), + z0 = svcmla_m (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_m_tied3, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z2, z0, 90), + z0 = svcmla_m (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f32_m_untied: +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_m_untied, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z2, z3, 90), + z0 = svcmla_m (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f32_m_tied1: +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_m_tied1, svfloat32_t, + z0 = svcmla_f32_m (p0, z0, z1, z2, 180), + z0 = svcmla_m (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_m_tied2, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z0, z2, 180), + z0 = svcmla_m (p0, z1, z0, z2, 180)) + +/* +** cmla_180_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_m_tied3, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z2, z0, 180), + z0 = svcmla_m (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f32_m_untied: +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_m_untied, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z2, z3, 180), + z0 = svcmla_m (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f32_m_tied1: +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #270 +** ret 
+*/ +TEST_UNIFORM_Z (cmla_270_f32_m_tied1, svfloat32_t, + z0 = svcmla_f32_m (p0, z0, z1, z2, 270), + z0 = svcmla_m (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_m_tied2, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z0, z2, 270), + z0 = svcmla_m (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_m_tied3, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z2, z0, 270), + z0 = svcmla_m (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f32_m_untied: +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_m_untied, svfloat32_t, + z0 = svcmla_f32_m (p0, z1, z2, z3, 270), + z0 = svcmla_m (p0, z1, z2, z3, 270)) + +/* +** cmla_0_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_z_tied1, svfloat32_t, + z0 = svcmla_f32_z (p0, z0, z1, z2, 0), + z0 = svcmla_z (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_z_tied2, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z0, z2, 0), + z0 = svcmla_z (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f32_z_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_z_tied3, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z2, z0, 0), + z0 = svcmla_z (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_z_untied, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z2, z3, 0), + z0 = svcmla_z (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_z_tied1, svfloat32_t, + z0 = svcmla_f32_z (p0, z0, z1, z2, 90), + z0 = svcmla_z (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_z_tied2, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z0, z2, 90), + z0 = svcmla_z (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f32_z_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_z_tied3, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z2, z0, 90), + z0 = svcmla_z (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_z_untied, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z2, z3, 90), + z0 = svcmla_z (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_z_tied1, svfloat32_t, + z0 = svcmla_f32_z (p0, z0, z1, z2, 180), + z0 = svcmla_z (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_z_tied2, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z0, z2, 180), + z0 = svcmla_z (p0, z1, z0, z2, 180)) + +/* +** cmla_180_f32_z_tied3: +** mov 
(z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_z_tied3, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z2, z0, 180), + z0 = svcmla_z (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_z_untied, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z2, z3, 180), + z0 = svcmla_z (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_z_tied1, svfloat32_t, + z0 = svcmla_f32_z (p0, z0, z1, z2, 270), + z0 = svcmla_z (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_z_tied2, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z0, z2, 270), + z0 = svcmla_z (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f32_z_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_z_tied3, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z2, z0, 270), + z0 = svcmla_z (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_z_untied, svfloat32_t, + z0 = svcmla_f32_z (p0, z1, z2, z3, 270), + z0 = svcmla_z (p0, z1, z2, z3, 270)) + +/* +** cmla_0_f32_x_tied1: +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_x_tied1, svfloat32_t, + z0 = svcmla_f32_x (p0, z0, z1, z2, 0), + z0 = svcmla_x (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f32_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_x_tied2, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z0, z2, 0), + z0 = svcmla_x (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f32_x_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_x_tied3, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z2, z0, 0), + z0 = svcmla_x (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f32_x_untied: +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f32_x_untied, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z2, z3, 0), + z0 = svcmla_x (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f32_x_tied1: +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_x_tied1, svfloat32_t, + z0 = svcmla_f32_x (p0, z0, z1, z2, 90), + z0 = svcmla_x (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f32_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_x_tied2, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z0, z2, 90), + z0 = svcmla_x (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f32_x_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_x_tied3, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z2, z0, 90), + z0 = svcmla_x (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f32_x_untied: +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f32_x_untied, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z2, z3, 90), + z0 = svcmla_x (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f32_x_tied1: +** fcmla z0\.s, p0/m, z1\.s, z2\.s, 
#180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_x_tied1, svfloat32_t, + z0 = svcmla_f32_x (p0, z0, z1, z2, 180), + z0 = svcmla_x (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f32_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_x_tied2, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z0, z2, 180), + z0 = svcmla_x (p0, z1, z0, z2, 180)) + +/* +** cmla_180_f32_x_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_x_tied3, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z2, z0, 180), + z0 = svcmla_x (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f32_x_untied: +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f32_x_untied, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z2, z3, 180), + z0 = svcmla_x (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f32_x_tied1: +** fcmla z0\.s, p0/m, z1\.s, z2\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_x_tied1, svfloat32_t, + z0 = svcmla_f32_x (p0, z0, z1, z2, 270), + z0 = svcmla_x (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f32_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, \1\.s, z2\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_x_tied2, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z0, z2, 270), + z0 = svcmla_x (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f32_x_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, \1\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_x_tied3, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z2, z0, 270), + z0 = svcmla_x (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f32_x_untied: +** movprfx z0, z1 +** fcmla z0\.s, p0/m, z2\.s, z3\.s, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f32_x_untied, svfloat32_t, + z0 = svcmla_f32_x (p0, z1, z2, z3, 270), + z0 = svcmla_x (p0, z1, z2, z3, 270)) + +/* +** ptrue_cmla_0_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f32_x_tied1, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z0, z1, z2, 0), + z0 = svcmla_x (svptrue_b32 (), z0, z1, z2, 0)) + +/* +** ptrue_cmla_0_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f32_x_tied2, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z0, z2, 0), + z0 = svcmla_x (svptrue_b32 (), z1, z0, z2, 0)) + +/* +** ptrue_cmla_0_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f32_x_tied3, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z0, 0), + z0 = svcmla_x (svptrue_b32 (), z1, z2, z0, 0)) + +/* +** ptrue_cmla_0_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f32_x_untied, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z3, 0), + z0 = svcmla_x (svptrue_b32 (), z1, z2, z3, 0)) + +/* +** ptrue_cmla_90_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f32_x_tied1, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z0, z1, z2, 90), + z0 = svcmla_x (svptrue_b32 (), z0, z1, z2, 90)) + +/* +** ptrue_cmla_90_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f32_x_tied2, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z0, z2, 90), + z0 = svcmla_x (svptrue_b32 (), z1, z0, z2, 90)) + +/* +** ptrue_cmla_90_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f32_x_tied3, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z0, 90), + z0 = svcmla_x (svptrue_b32 (), z1, z2, z0, 90)) + +/* +** ptrue_cmla_90_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f32_x_untied, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z3, 90), + z0 = svcmla_x (svptrue_b32 (), z1, z2, z3, 90)) + +/* +** ptrue_cmla_180_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f32_x_tied1, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z0, z1, z2, 180), + z0 = svcmla_x (svptrue_b32 (), z0, z1, z2, 180)) + +/* +** ptrue_cmla_180_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f32_x_tied2, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z0, z2, 180), + z0 = svcmla_x (svptrue_b32 (), z1, z0, z2, 180)) + +/* +** ptrue_cmla_180_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f32_x_tied3, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z0, 180), + z0 = svcmla_x (svptrue_b32 (), z1, z2, z0, 180)) + +/* +** ptrue_cmla_180_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f32_x_untied, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z3, 180), + z0 = svcmla_x (svptrue_b32 (), z1, z2, z3, 180)) + +/* +** ptrue_cmla_270_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f32_x_tied1, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z0, z1, z2, 270), + z0 = svcmla_x (svptrue_b32 (), z0, z1, z2, 270)) + +/* +** ptrue_cmla_270_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f32_x_tied2, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z0, z2, 270), + z0 = svcmla_x (svptrue_b32 (), z1, z0, z2, 270)) + +/* +** ptrue_cmla_270_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f32_x_tied3, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z0, 270), + z0 = svcmla_x (svptrue_b32 (), z1, z2, z0, 270)) + +/* +** ptrue_cmla_270_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f32_x_untied, svfloat32_t, + z0 = svcmla_f32_x (svptrue_b32 (), z1, z2, z3, 270), + z0 = svcmla_x (svptrue_b32 (), z1, z2, z3, 270)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f64.c new file mode 100644 index 000000000..024ae5ce3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_f64.c @@ -0,0 +1,675 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmla_0_f64_m_tied1: +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_m_tied1, svfloat64_t, + z0 = svcmla_f64_m (p0, z0, z1, z2, 0), + z0 = svcmla_m (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, \1, z2\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_m_tied2, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z0, z2, 0), + z0 = svcmla_m (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, \1, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_m_tied3, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z2, z0, 0), + z0 = svcmla_m (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f64_m_untied: +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_m_untied, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z2, z3, 0), + z0 = svcmla_m (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f64_m_tied1: +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_m_tied1, svfloat64_t, + z0 = svcmla_f64_m (p0, z0, z1, z2, 90), + z0 = svcmla_m (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, \1, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_m_tied2, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z0, z2, 90), + z0 = svcmla_m (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, \1, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_m_tied3, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z2, z0, 90), + z0 = svcmla_m (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f64_m_untied: +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_m_untied, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z2, z3, 90), + z0 = svcmla_m (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f64_m_tied1: +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_m_tied1, svfloat64_t, + z0 = svcmla_f64_m (p0, z0, z1, z2, 180), + z0 = svcmla_m (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, \1, z2\.d, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_m_tied2, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z0, z2, 180), + z0 = svcmla_m (p0, z1, z0, z2, 180)) + +/* +** cmla_180_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, \1, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_m_tied3, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z2, z0, 180), + z0 = svcmla_m (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f64_m_untied: +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_m_untied, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z2, z3, 180), + z0 = svcmla_m (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f64_m_tied1: +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #270 +** ret +*/ 
+TEST_UNIFORM_Z (cmla_270_f64_m_tied1, svfloat64_t, + z0 = svcmla_f64_m (p0, z0, z1, z2, 270), + z0 = svcmla_m (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, \1, z2\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_m_tied2, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z0, z2, 270), + z0 = svcmla_m (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, \1, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_m_tied3, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z2, z0, 270), + z0 = svcmla_m (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f64_m_untied: +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_m_untied, svfloat64_t, + z0 = svcmla_f64_m (p0, z1, z2, z3, 270), + z0 = svcmla_m (p0, z1, z2, z3, 270)) + +/* +** cmla_0_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_z_tied1, svfloat64_t, + z0 = svcmla_f64_z (p0, z0, z1, z2, 0), + z0 = svcmla_z (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, \1, z2\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_z_tied2, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z0, z2, 0), + z0 = svcmla_z (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f64_z_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, z2\.d, \1, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_z_tied3, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z2, z0, 0), + z0 = svcmla_z (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_z_untied, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z2, z3, 0), + z0 = svcmla_z (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_z_tied1, svfloat64_t, + z0 = svcmla_f64_z (p0, z0, z1, z2, 90), + z0 = svcmla_z (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, \1, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_z_tied2, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z0, z2, 90), + z0 = svcmla_z (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f64_z_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, z2\.d, \1, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_z_tied3, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z2, z0, 90), + z0 = svcmla_z (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_z_untied, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z2, z3, 90), + z0 = svcmla_z (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_z_tied1, svfloat64_t, + z0 = svcmla_f64_z (p0, z0, z1, z2, 180), + z0 = svcmla_z (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, \1, z2\.d, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_z_tied2, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z0, z2, 180), + z0 = svcmla_z (p0, z1, z0, z2, 180)) + +/* +** cmla_180_f64_z_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx 
z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, z2\.d, \1, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_z_tied3, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z2, z0, 180), + z0 = svcmla_z (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_z_untied, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z2, z3, 180), + z0 = svcmla_z (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_z_tied1, svfloat64_t, + z0 = svcmla_f64_z (p0, z0, z1, z2, 270), + z0 = svcmla_z (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, \1, z2\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_z_tied2, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z0, z2, 270), + z0 = svcmla_z (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f64_z_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, z2\.d, \1, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_z_tied3, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z2, z0, 270), + z0 = svcmla_z (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_z_untied, svfloat64_t, + z0 = svcmla_f64_z (p0, z1, z2, z3, 270), + z0 = svcmla_z (p0, z1, z2, z3, 270)) + +/* +** cmla_0_f64_x_tied1: +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_x_tied1, svfloat64_t, + z0 = svcmla_f64_x (p0, z0, z1, z2, 0), + z0 = svcmla_x (p0, z0, z1, z2, 0)) + +/* +** cmla_0_f64_x_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, \1, z2\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_x_tied2, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z0, z2, 0), + z0 = svcmla_x (p0, z1, z0, z2, 0)) + +/* +** cmla_0_f64_x_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, \1, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_x_tied3, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z2, z0, 0), + z0 = svcmla_x (p0, z1, z2, z0, 0)) + +/* +** cmla_0_f64_x_untied: +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_0_f64_x_untied, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z2, z3, 0), + z0 = svcmla_x (p0, z1, z2, z3, 0)) + +/* +** cmla_90_f64_x_tied1: +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_x_tied1, svfloat64_t, + z0 = svcmla_f64_x (p0, z0, z1, z2, 90), + z0 = svcmla_x (p0, z0, z1, z2, 90)) + +/* +** cmla_90_f64_x_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, \1, z2\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_x_tied2, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z0, z2, 90), + z0 = svcmla_x (p0, z1, z0, z2, 90)) + +/* +** cmla_90_f64_x_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, \1, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_x_tied3, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z2, z0, 90), + z0 = svcmla_x (p0, z1, z2, z0, 90)) + +/* +** cmla_90_f64_x_untied: +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_90_f64_x_untied, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z2, z3, 90), + z0 = svcmla_x (p0, z1, z2, z3, 90)) + +/* +** cmla_180_f64_x_tied1: +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #180 +** ret +*/ +TEST_UNIFORM_Z 
(cmla_180_f64_x_tied1, svfloat64_t, + z0 = svcmla_f64_x (p0, z0, z1, z2, 180), + z0 = svcmla_x (p0, z0, z1, z2, 180)) + +/* +** cmla_180_f64_x_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, \1, z2\.d, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_x_tied2, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z0, z2, 180), + z0 = svcmla_x (p0, z1, z0, z2, 180)) + +/* +** cmla_180_f64_x_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, \1, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_x_tied3, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z2, z0, 180), + z0 = svcmla_x (p0, z1, z2, z0, 180)) + +/* +** cmla_180_f64_x_untied: +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_180_f64_x_untied, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z2, z3, 180), + z0 = svcmla_x (p0, z1, z2, z3, 180)) + +/* +** cmla_270_f64_x_tied1: +** fcmla z0\.d, p0/m, z1\.d, z2\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_x_tied1, svfloat64_t, + z0 = svcmla_f64_x (p0, z0, z1, z2, 270), + z0 = svcmla_x (p0, z0, z1, z2, 270)) + +/* +** cmla_270_f64_x_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, \1, z2\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_x_tied2, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z0, z2, 270), + z0 = svcmla_x (p0, z1, z0, z2, 270)) + +/* +** cmla_270_f64_x_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, \1, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_x_tied3, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z2, z0, 270), + z0 = svcmla_x (p0, z1, z2, z0, 270)) + +/* +** cmla_270_f64_x_untied: +** movprfx z0, z1 +** fcmla z0\.d, p0/m, z2\.d, z3\.d, #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_270_f64_x_untied, svfloat64_t, + z0 = svcmla_f64_x (p0, z1, z2, z3, 270), + z0 = svcmla_x (p0, z1, z2, z3, 270)) + +/* +** ptrue_cmla_0_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f64_x_tied1, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z0, z1, z2, 0), + z0 = svcmla_x (svptrue_b64 (), z0, z1, z2, 0)) + +/* +** ptrue_cmla_0_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f64_x_tied2, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z0, z2, 0), + z0 = svcmla_x (svptrue_b64 (), z1, z0, z2, 0)) + +/* +** ptrue_cmla_0_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f64_x_tied3, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z0, 0), + z0 = svcmla_x (svptrue_b64 (), z1, z2, z0, 0)) + +/* +** ptrue_cmla_0_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_0_f64_x_untied, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z3, 0), + z0 = svcmla_x (svptrue_b64 (), z1, z2, z3, 0)) + +/* +** ptrue_cmla_90_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f64_x_tied1, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z0, z1, z2, 90), + z0 = svcmla_x (svptrue_b64 (), z0, z1, z2, 90)) + +/* +** ptrue_cmla_90_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f64_x_tied2, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z0, z2, 90), + z0 = svcmla_x (svptrue_b64 (), z1, z0, z2, 90)) + +/* +** ptrue_cmla_90_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f64_x_tied3, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z0, 90), + z0 = svcmla_x (svptrue_b64 (), z1, z2, z0, 90)) + +/* +** ptrue_cmla_90_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_90_f64_x_untied, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z3, 90), + z0 = svcmla_x (svptrue_b64 (), z1, z2, z3, 90)) + +/* +** ptrue_cmla_180_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f64_x_tied1, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z0, z1, z2, 180), + z0 = svcmla_x (svptrue_b64 (), z0, z1, z2, 180)) + +/* +** ptrue_cmla_180_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f64_x_tied2, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z0, z2, 180), + z0 = svcmla_x (svptrue_b64 (), z1, z0, z2, 180)) + +/* +** ptrue_cmla_180_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f64_x_tied3, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z0, 180), + z0 = svcmla_x (svptrue_b64 (), z1, z2, z0, 180)) + +/* +** ptrue_cmla_180_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_180_f64_x_untied, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z3, 180), + z0 = svcmla_x (svptrue_b64 (), z1, z2, z3, 180)) + +/* +** ptrue_cmla_270_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f64_x_tied1, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z0, z1, z2, 270), + z0 = svcmla_x (svptrue_b64 (), z0, z1, z2, 270)) + +/* +** ptrue_cmla_270_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f64_x_tied2, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z0, z2, 270), + z0 = svcmla_x (svptrue_b64 (), z1, z0, z2, 270)) + +/* +** ptrue_cmla_270_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f64_x_tied3, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z0, 270), + z0 = svcmla_x (svptrue_b64 (), z1, z2, z0, 270)) + +/* +** ptrue_cmla_270_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_cmla_270_f64_x_untied, svfloat64_t, + z0 = svcmla_f64_x (svptrue_b64 (), z1, z2, z3, 270), + z0 = svcmla_x (svptrue_b64 (), z1, z2, z3, 270)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f16.c new file mode 100644 index 000000000..16f1b77ad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f16.c @@ -0,0 +1,194 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmla_lane_0_0_f16_tied1: +** fcmla z0\.h, z1\.h, z2\.h\[0\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_0_f16_tied1, svfloat16_t, + z0 = svcmla_lane_f16 (z0, z1, z2, 0, 0), + z0 = svcmla_lane (z0, z1, z2, 0, 0)) + +/* +** cmla_lane_0_0_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, \1\.h, z2\.h\[0\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_0_f16_tied2, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z0, z2, 0, 0), + z0 = svcmla_lane (z1, z0, z2, 0, 0)) + +/* +** cmla_lane_0_0_f16_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, z2\.h, \1\.h\[0\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_0_f16_tied3, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z2, z0, 0, 0), + z0 = svcmla_lane (z1, z2, z0, 0, 0)) + +/* +** cmla_lane_0_0_f16_untied: +** movprfx z0, z1 +** fcmla z0\.h, z2\.h, z3\.h\[0\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_0_f16_untied, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z2, z3, 0, 0), + z0 = svcmla_lane (z1, z2, z3, 0, 0)) + +/* +** cmla_lane_0_90_f16_tied1: +** fcmla z0\.h, z1\.h, z2\.h\[0\], #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_90_f16_tied1, svfloat16_t, + z0 = svcmla_lane_f16 (z0, z1, z2, 0, 90), + z0 = svcmla_lane (z0, z1, z2, 0, 90)) + +/* +** cmla_lane_0_90_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, \1\.h, z2\.h\[0\], #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_90_f16_tied2, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z0, z2, 0, 90), + z0 = svcmla_lane (z1, z0, z2, 0, 90)) + +/* +** cmla_lane_0_90_f16_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, z2\.h, \1\.h\[0\], #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_90_f16_tied3, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z2, z0, 0, 90), + z0 = svcmla_lane (z1, z2, z0, 0, 90)) + +/* +** cmla_lane_0_90_f16_untied: +** movprfx z0, z1 +** fcmla z0\.h, z2\.h, z3\.h\[0\], #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_90_f16_untied, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z2, z3, 0, 90), + z0 = svcmla_lane (z1, z2, z3, 0, 90)) + +/* +** cmla_lane_0_180_f16_tied1: +** fcmla z0\.h, z1\.h, z2\.h\[0\], #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_180_f16_tied1, svfloat16_t, + z0 = svcmla_lane_f16 (z0, z1, z2, 0, 180), + z0 = svcmla_lane (z0, z1, z2, 0, 180)) + +/* +** cmla_lane_0_180_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, \1\.h, z2\.h\[0\], #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_180_f16_tied2, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z0, z2, 0, 180), + z0 = svcmla_lane (z1, z0, z2, 0, 180)) + +/* +** cmla_lane_0_180_f16_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, z2\.h, \1\.h\[0\], #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_180_f16_tied3, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z2, z0, 0, 180), + z0 = svcmla_lane (z1, z2, z0, 0, 180)) + +/* +** cmla_lane_0_180_f16_untied: +** movprfx z0, z1 +** fcmla z0\.h, z2\.h, z3\.h\[0\], #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_180_f16_untied, 
svfloat16_t, + z0 = svcmla_lane_f16 (z1, z2, z3, 0, 180), + z0 = svcmla_lane (z1, z2, z3, 0, 180)) + +/* +** cmla_lane_0_270_f16_tied1: +** fcmla z0\.h, z1\.h, z2\.h\[0\], #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_270_f16_tied1, svfloat16_t, + z0 = svcmla_lane_f16 (z0, z1, z2, 0, 270), + z0 = svcmla_lane (z0, z1, z2, 0, 270)) + +/* +** cmla_lane_0_270_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, \1\.h, z2\.h\[0\], #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_270_f16_tied2, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z0, z2, 0, 270), + z0 = svcmla_lane (z1, z0, z2, 0, 270)) + +/* +** cmla_lane_0_270_f16_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.h, z2\.h, \1\.h\[0\], #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_270_f16_tied3, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z2, z0, 0, 270), + z0 = svcmla_lane (z1, z2, z0, 0, 270)) + +/* +** cmla_lane_0_270_f16_untied: +** movprfx z0, z1 +** fcmla z0\.h, z2\.h, z3\.h\[0\], #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_270_f16_untied, svfloat16_t, + z0 = svcmla_lane_f16 (z1, z2, z3, 0, 270), + z0 = svcmla_lane (z1, z2, z3, 0, 270)) + +/* +** cmla_lane_1_f16: +** fcmla z0\.h, z1\.h, z2\.h\[1\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_1_f16, svfloat16_t, + z0 = svcmla_lane_f16 (z0, z1, z2, 1, 0), + z0 = svcmla_lane (z0, z1, z2, 1, 0)) + +/* +** cmla_lane_2_f16: +** fcmla z0\.h, z1\.h, z2\.h\[2\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_2_f16, svfloat16_t, + z0 = svcmla_lane_f16 (z0, z1, z2, 2, 0), + z0 = svcmla_lane (z0, z1, z2, 2, 0)) + +/* +** cmla_lane_3_f16: +** fcmla z0\.h, z1\.h, z2\.h\[3\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_3_f16, svfloat16_t, + z0 = svcmla_lane_f16 (z0, z1, z2, 3, 0), + z0 = svcmla_lane (z0, z1, z2, 3, 0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f32.c new file mode 100644 index 000000000..85bff68fd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmla_lane_f32.c @@ -0,0 +1,176 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmla_lane_0_0_f32_tied1: +** fcmla z0\.s, z1\.s, z2\.s\[0\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_0_f32_tied1, svfloat32_t, + z0 = svcmla_lane_f32 (z0, z1, z2, 0, 0), + z0 = svcmla_lane (z0, z1, z2, 0, 0)) + +/* +** cmla_lane_0_0_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, \1\.s, z2\.s\[0\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_0_f32_tied2, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z0, z2, 0, 0), + z0 = svcmla_lane (z1, z0, z2, 0, 0)) + +/* +** cmla_lane_0_0_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, z2\.s, \1\.s\[0\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_0_f32_tied3, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z2, z0, 0, 0), + z0 = svcmla_lane (z1, z2, z0, 0, 0)) + +/* +** cmla_lane_0_0_f32_untied: +** movprfx z0, z1 +** fcmla z0\.s, z2\.s, z3\.s\[0\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_0_f32_untied, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z2, z3, 0, 0), + z0 = svcmla_lane (z1, z2, z3, 0, 0)) + +/* +** cmla_lane_0_90_f32_tied1: +** fcmla z0\.s, z1\.s, z2\.s\[0\], #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_90_f32_tied1, svfloat32_t, + z0 = svcmla_lane_f32 (z0, z1, z2, 0, 90), + z0 = svcmla_lane (z0, z1, z2, 0, 90)) + +/* +** cmla_lane_0_90_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, \1\.s, z2\.s\[0\], #90 +** ret +*/ +TEST_UNIFORM_Z 
(cmla_lane_0_90_f32_tied2, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z0, z2, 0, 90), + z0 = svcmla_lane (z1, z0, z2, 0, 90)) + +/* +** cmla_lane_0_90_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, z2\.s, \1\.s\[0\], #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_90_f32_tied3, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z2, z0, 0, 90), + z0 = svcmla_lane (z1, z2, z0, 0, 90)) + +/* +** cmla_lane_0_90_f32_untied: +** movprfx z0, z1 +** fcmla z0\.s, z2\.s, z3\.s\[0\], #90 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_90_f32_untied, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z2, z3, 0, 90), + z0 = svcmla_lane (z1, z2, z3, 0, 90)) + +/* +** cmla_lane_0_180_f32_tied1: +** fcmla z0\.s, z1\.s, z2\.s\[0\], #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_180_f32_tied1, svfloat32_t, + z0 = svcmla_lane_f32 (z0, z1, z2, 0, 180), + z0 = svcmla_lane (z0, z1, z2, 0, 180)) + +/* +** cmla_lane_0_180_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, \1\.s, z2\.s\[0\], #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_180_f32_tied2, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z0, z2, 0, 180), + z0 = svcmla_lane (z1, z0, z2, 0, 180)) + +/* +** cmla_lane_0_180_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, z2\.s, \1\.s\[0\], #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_180_f32_tied3, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z2, z0, 0, 180), + z0 = svcmla_lane (z1, z2, z0, 0, 180)) + +/* +** cmla_lane_0_180_f32_untied: +** movprfx z0, z1 +** fcmla z0\.s, z2\.s, z3\.s\[0\], #180 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_180_f32_untied, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z2, z3, 0, 180), + z0 = svcmla_lane (z1, z2, z3, 0, 180)) + +/* +** cmla_lane_0_270_f32_tied1: +** fcmla z0\.s, z1\.s, z2\.s\[0\], #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_270_f32_tied1, svfloat32_t, + z0 = svcmla_lane_f32 (z0, z1, z2, 0, 270), + z0 = svcmla_lane (z0, z1, z2, 0, 270)) + +/* +** cmla_lane_0_270_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, \1\.s, z2\.s\[0\], #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_270_f32_tied2, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z0, z2, 0, 270), + z0 = svcmla_lane (z1, z0, z2, 0, 270)) + +/* +** cmla_lane_0_270_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fcmla z0\.s, z2\.s, \1\.s\[0\], #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_270_f32_tied3, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z2, z0, 0, 270), + z0 = svcmla_lane (z1, z2, z0, 0, 270)) + +/* +** cmla_lane_0_270_f32_untied: +** movprfx z0, z1 +** fcmla z0\.s, z2\.s, z3\.s\[0\], #270 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_0_270_f32_untied, svfloat32_t, + z0 = svcmla_lane_f32 (z1, z2, z3, 0, 270), + z0 = svcmla_lane (z1, z2, z3, 0, 270)) + +/* +** cmla_lane_1_f32: +** fcmla z0\.s, z1\.s, z2\.s\[1\], #0 +** ret +*/ +TEST_UNIFORM_Z (cmla_lane_1_f32, svfloat32_t, + z0 = svcmla_lane_f32 (z0, z1, z2, 1, 0), + z0 = svcmla_lane (z0, z1, z2, 1, 0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f16.c new file mode 100644 index 000000000..7149ad300 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f16.c @@ -0,0 +1,50 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_f16_tied: +** fcmeq p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_f16_tied, svfloat16_t, + p0 = svcmpeq_f16 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_f16_untied: +** 
fcmeq p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_f16_untied, svfloat16_t, + p0 = svcmpeq_f16 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_h4_f16: +** mov (z[0-9]+\.h), h4 +** fcmeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_ZD (cmpeq_h4_f16, svfloat16_t, float16_t, + p0 = svcmpeq_n_f16 (p1, z0, d4), + p0 = svcmpeq (p1, z0, d4)) + +/* +** cmpeq_0_f16: +** fcmeq p0\.h, p1/z, z0\.h, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_f16, svfloat16_t, + p0 = svcmpeq_n_f16 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fcmeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_f16, svfloat16_t, + p0 = svcmpeq_n_f16 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f32.c new file mode 100644 index 000000000..05910bc50 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f32.c @@ -0,0 +1,50 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_f32_tied: +** fcmeq p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_f32_tied, svfloat32_t, + p0 = svcmpeq_f32 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_f32_untied: +** fcmeq p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_f32_untied, svfloat32_t, + p0 = svcmpeq_f32 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_s4_f32: +** mov (z[0-9]+\.s), s4 +** fcmeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_ZD (cmpeq_s4_f32, svfloat32_t, float32_t, + p0 = svcmpeq_n_f32 (p1, z0, d4), + p0 = svcmpeq (p1, z0, d4)) + +/* +** cmpeq_0_f32: +** fcmeq p0\.s, p1/z, z0\.s, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_f32, svfloat32_t, + p0 = svcmpeq_n_f32 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fcmeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_f32, svfloat32_t, + p0 = svcmpeq_n_f32 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f64.c new file mode 100644 index 000000000..f94bdfe27 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_f64.c @@ -0,0 +1,50 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_f64_tied: +** fcmeq p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_f64_tied, svfloat64_t, + p0 = svcmpeq_f64 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_f64_untied: +** fcmeq p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_f64_untied, svfloat64_t, + p0 = svcmpeq_f64 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_d4_f64: +** mov (z[0-9]+\.d), d4 +** fcmeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_ZD (cmpeq_d4_f64, svfloat64_t, float64_t, + p0 = svcmpeq_n_f64 (p1, z0, d4), + p0 = svcmpeq (p1, z0, d4)) + +/* +** cmpeq_0_f64: +** fcmeq p0\.d, p1/z, z0\.d, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_f64, svfloat64_t, + p0 = svcmpeq_n_f64 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** fcmeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_f64, svfloat64_t, + p0 = svcmpeq_n_f64 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s16.c new file mode 100644 index 000000000..b0befcb77 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_s16_tied: +** cmpeq p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_s16_tied, svint16_t, + p0 = svcmpeq_s16 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_s16_untied: +** cmpeq p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_s16_untied, svint16_t, + p0 = svcmpeq_s16 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_w0_s16: +** mov (z[0-9]+\.h), w0 +** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_ZX (cmpeq_w0_s16, svint16_t, int16_t, + p0 = svcmpeq_n_s16 (p1, z0, x0), + p0 = svcmpeq (p1, z0, x0)) + +/* +** cmpeq_0_s16: +** cmpeq p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_s16, svint16_t, + p0 = svcmpeq_n_s16 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_s16: +** cmpeq p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_s16, svint16_t, + p0 = svcmpeq_n_s16 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) + +/* +** cmpeq_15_s16: +** cmpeq p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_15_s16, svint16_t, + p0 = svcmpeq_n_s16 (p1, z0, 15), + p0 = svcmpeq (p1, z0, 15)) + +/* +** cmpeq_16_s16: +** mov (z[0-9]+\.h), #16 +** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_16_s16, svint16_t, + p0 = svcmpeq_n_s16 (p1, z0, 16), + p0 = svcmpeq (p1, z0, 16)) + +/* +** cmpeq_m1_s16: +** cmpeq p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m1_s16, svint16_t, + p0 = svcmpeq_n_s16 (p1, z0, -1), + p0 = svcmpeq (p1, z0, -1)) + +/* +** cmpeq_m16_s16: +** cmpeq p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m16_s16, svint16_t, + p0 = svcmpeq_n_s16 (p1, z0, -16), + p0 = svcmpeq (p1, z0, -16)) + +/* +** cmpeq_m17_s16: +** mov (z[0-9]+\.h), #-17 +** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_m17_s16, svint16_t, + p0 = svcmpeq_n_s16 (p1, z0, -17), + p0 = svcmpeq (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s32.c new file mode 100644 index 000000000..de48a2c38 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_s32_tied: +** cmpeq p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_s32_tied, svint32_t, + p0 = svcmpeq_s32 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_s32_untied: +** cmpeq p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_s32_untied, svint32_t, + p0 = svcmpeq_s32 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_w0_s32: +** mov (z[0-9]+\.s), w0 +** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_ZX (cmpeq_w0_s32, svint32_t, int32_t, + p0 = svcmpeq_n_s32 (p1, z0, x0), + p0 = svcmpeq (p1, z0, x0)) + +/* +** cmpeq_0_s32: +** cmpeq p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_s32, svint32_t, + 
p0 = svcmpeq_n_s32 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_s32: +** cmpeq p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_s32, svint32_t, + p0 = svcmpeq_n_s32 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) + +/* +** cmpeq_15_s32: +** cmpeq p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_15_s32, svint32_t, + p0 = svcmpeq_n_s32 (p1, z0, 15), + p0 = svcmpeq (p1, z0, 15)) + +/* +** cmpeq_16_s32: +** mov (z[0-9]+\.s), #16 +** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_16_s32, svint32_t, + p0 = svcmpeq_n_s32 (p1, z0, 16), + p0 = svcmpeq (p1, z0, 16)) + +/* +** cmpeq_m1_s32: +** cmpeq p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m1_s32, svint32_t, + p0 = svcmpeq_n_s32 (p1, z0, -1), + p0 = svcmpeq (p1, z0, -1)) + +/* +** cmpeq_m16_s32: +** cmpeq p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m16_s32, svint32_t, + p0 = svcmpeq_n_s32 (p1, z0, -16), + p0 = svcmpeq (p1, z0, -16)) + +/* +** cmpeq_m17_s32: +** mov (z[0-9]+\.s), #-17 +** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_m17_s32, svint32_t, + p0 = svcmpeq_n_s32 (p1, z0, -17), + p0 = svcmpeq (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s64.c new file mode 100644 index 000000000..ff976712a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s64.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_s64_tied: +** cmpeq p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_s64_tied, svint64_t, + p0 = svcmpeq_s64 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_s64_untied: +** cmpeq p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_s64_untied, svint64_t, + p0 = svcmpeq_s64 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_x0_s64: +** mov (z[0-9]+\.d), x0 +** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_ZX (cmpeq_x0_s64, svint64_t, int64_t, + p0 = svcmpeq_n_s64 (p1, z0, x0), + p0 = svcmpeq (p1, z0, x0)) + +/* +** cmpeq_0_s64: +** cmpeq p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_s64, svint64_t, + p0 = svcmpeq_n_s64 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_s64: +** cmpeq p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_s64, svint64_t, + p0 = svcmpeq_n_s64 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) + +/* +** cmpeq_15_s64: +** cmpeq p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_15_s64, svint64_t, + p0 = svcmpeq_n_s64 (p1, z0, 15), + p0 = svcmpeq (p1, z0, 15)) + +/* +** cmpeq_16_s64: +** mov (z[0-9]+\.d), #16 +** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_16_s64, svint64_t, + p0 = svcmpeq_n_s64 (p1, z0, 16), + p0 = svcmpeq (p1, z0, 16)) + +/* +** cmpeq_m1_s64: +** cmpeq p0\.d, p1/z, z0\.d, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m1_s64, svint64_t, + p0 = svcmpeq_n_s64 (p1, z0, -1), + p0 = svcmpeq (p1, z0, -1)) + +/* +** cmpeq_m16_s64: +** cmpeq p0\.d, p1/z, z0\.d, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m16_s64, svint64_t, + p0 = svcmpeq_n_s64 (p1, z0, -16), + p0 = svcmpeq (p1, z0, -16)) + +/* +** cmpeq_m17_s64: +** mov (z[0-9]+\.d), #-17 +** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_m17_s64, svint64_t, + p0 = svcmpeq_n_s64 (p1, z0, -17), + p0 = svcmpeq (p1, z0, -17)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s8.c new file mode 100644 index 000000000..1325755a8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_s8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_s8_tied: +** cmpeq p0\.b, p0/z, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpeq_s8_tied, svint8_t, + p0 = svcmpeq_s8 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_s8_untied: +** cmpeq p0\.b, p1/z, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpeq_s8_untied, svint8_t, + p0 = svcmpeq_s8 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_w0_s8: +** mov (z[0-9]+\.b), w0 +** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_ZX (cmpeq_w0_s8, svint8_t, int8_t, + p0 = svcmpeq_n_s8 (p1, z0, x0), + p0 = svcmpeq (p1, z0, x0)) + +/* +** cmpeq_0_s8: +** cmpeq p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_s8, svint8_t, + p0 = svcmpeq_n_s8 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_s8: +** cmpeq p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_s8, svint8_t, + p0 = svcmpeq_n_s8 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) + +/* +** cmpeq_15_s8: +** cmpeq p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_15_s8, svint8_t, + p0 = svcmpeq_n_s8 (p1, z0, 15), + p0 = svcmpeq (p1, z0, 15)) + +/* +** cmpeq_16_s8: +** mov (z[0-9]+\.b), #16 +** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpeq_16_s8, svint8_t, + p0 = svcmpeq_n_s8 (p1, z0, 16), + p0 = svcmpeq (p1, z0, 16)) + +/* +** cmpeq_m1_s8: +** cmpeq p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m1_s8, svint8_t, + p0 = svcmpeq_n_s8 (p1, z0, -1), + p0 = svcmpeq (p1, z0, -1)) + +/* +** cmpeq_m16_s8: +** cmpeq p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m16_s8, svint8_t, + p0 = svcmpeq_n_s8 (p1, z0, -16), + p0 = svcmpeq (p1, z0, -16)) + +/* +** cmpeq_m17_s8: +** mov (z[0-9]+\.b), #-17 +** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpeq_m17_s8, svint8_t, + p0 = svcmpeq_n_s8 (p1, z0, -17), + p0 = svcmpeq (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u16.c new file mode 100644 index 000000000..91004692c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_u16_tied: +** cmpeq p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_u16_tied, svuint16_t, + p0 = svcmpeq_u16 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_u16_untied: +** cmpeq p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_u16_untied, svuint16_t, + p0 = svcmpeq_u16 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_w0_u16: +** mov (z[0-9]+\.h), w0 +** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_ZX (cmpeq_w0_u16, svuint16_t, uint16_t, + p0 = svcmpeq_n_u16 (p1, z0, x0), + p0 = svcmpeq (p1, z0, x0)) + +/* +** cmpeq_0_u16: +** cmpeq p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_u16, svuint16_t, + p0 = svcmpeq_n_u16 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_u16: +** cmpeq p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_u16, svuint16_t, + p0 = svcmpeq_n_u16 (p1, z0, 1), + p0 = 
svcmpeq (p1, z0, 1)) + +/* +** cmpeq_15_u16: +** cmpeq p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_15_u16, svuint16_t, + p0 = svcmpeq_n_u16 (p1, z0, 15), + p0 = svcmpeq (p1, z0, 15)) + +/* +** cmpeq_16_u16: +** mov (z[0-9]+\.h), #16 +** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_16_u16, svuint16_t, + p0 = svcmpeq_n_u16 (p1, z0, 16), + p0 = svcmpeq (p1, z0, 16)) + +/* +** cmpeq_m1_u16: +** cmpeq p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m1_u16, svuint16_t, + p0 = svcmpeq_n_u16 (p1, z0, -1), + p0 = svcmpeq (p1, z0, -1)) + +/* +** cmpeq_m16_u16: +** cmpeq p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m16_u16, svuint16_t, + p0 = svcmpeq_n_u16 (p1, z0, -16), + p0 = svcmpeq (p1, z0, -16)) + +/* +** cmpeq_m17_u16: +** mov (z[0-9]+\.h), #-17 +** cmpeq p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpeq_m17_u16, svuint16_t, + p0 = svcmpeq_n_u16 (p1, z0, -17), + p0 = svcmpeq (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u32.c new file mode 100644 index 000000000..2cff56eb6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_u32_tied: +** cmpeq p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_u32_tied, svuint32_t, + p0 = svcmpeq_u32 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_u32_untied: +** cmpeq p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_u32_untied, svuint32_t, + p0 = svcmpeq_u32 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_w0_u32: +** mov (z[0-9]+\.s), w0 +** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_ZX (cmpeq_w0_u32, svuint32_t, uint32_t, + p0 = svcmpeq_n_u32 (p1, z0, x0), + p0 = svcmpeq (p1, z0, x0)) + +/* +** cmpeq_0_u32: +** cmpeq p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_u32, svuint32_t, + p0 = svcmpeq_n_u32 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_u32: +** cmpeq p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_u32, svuint32_t, + p0 = svcmpeq_n_u32 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) + +/* +** cmpeq_15_u32: +** cmpeq p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_15_u32, svuint32_t, + p0 = svcmpeq_n_u32 (p1, z0, 15), + p0 = svcmpeq (p1, z0, 15)) + +/* +** cmpeq_16_u32: +** mov (z[0-9]+\.s), #16 +** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_16_u32, svuint32_t, + p0 = svcmpeq_n_u32 (p1, z0, 16), + p0 = svcmpeq (p1, z0, 16)) + +/* +** cmpeq_m1_u32: +** cmpeq p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m1_u32, svuint32_t, + p0 = svcmpeq_n_u32 (p1, z0, -1), + p0 = svcmpeq (p1, z0, -1)) + +/* +** cmpeq_m16_u32: +** cmpeq p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m16_u32, svuint32_t, + p0 = svcmpeq_n_u32 (p1, z0, -16), + p0 = svcmpeq (p1, z0, -16)) + +/* +** cmpeq_m17_u32: +** mov (z[0-9]+\.s), #-17 +** cmpeq p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpeq_m17_u32, svuint32_t, + p0 = svcmpeq_n_u32 (p1, z0, -17), + p0 = svcmpeq (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u64.c new file mode 100644 index 000000000..0f02c9988 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u64.c @@ 
-0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_u64_tied: +** cmpeq p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_u64_tied, svuint64_t, + p0 = svcmpeq_u64 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_u64_untied: +** cmpeq p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_u64_untied, svuint64_t, + p0 = svcmpeq_u64 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_x0_u64: +** mov (z[0-9]+\.d), x0 +** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_ZX (cmpeq_x0_u64, svuint64_t, uint64_t, + p0 = svcmpeq_n_u64 (p1, z0, x0), + p0 = svcmpeq (p1, z0, x0)) + +/* +** cmpeq_0_u64: +** cmpeq p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_u64, svuint64_t, + p0 = svcmpeq_n_u64 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_u64: +** cmpeq p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_u64, svuint64_t, + p0 = svcmpeq_n_u64 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) + +/* +** cmpeq_15_u64: +** cmpeq p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_15_u64, svuint64_t, + p0 = svcmpeq_n_u64 (p1, z0, 15), + p0 = svcmpeq (p1, z0, 15)) + +/* +** cmpeq_16_u64: +** mov (z[0-9]+\.d), #16 +** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_16_u64, svuint64_t, + p0 = svcmpeq_n_u64 (p1, z0, 16), + p0 = svcmpeq (p1, z0, 16)) + +/* +** cmpeq_m1_u64: +** cmpeq p0\.d, p1/z, z0\.d, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m1_u64, svuint64_t, + p0 = svcmpeq_n_u64 (p1, z0, -1), + p0 = svcmpeq (p1, z0, -1)) + +/* +** cmpeq_m16_u64: +** cmpeq p0\.d, p1/z, z0\.d, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m16_u64, svuint64_t, + p0 = svcmpeq_n_u64 (p1, z0, -16), + p0 = svcmpeq (p1, z0, -16)) + +/* +** cmpeq_m17_u64: +** mov (z[0-9]+\.d), #-17 +** cmpeq p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpeq_m17_u64, svuint64_t, + p0 = svcmpeq_n_u64 (p1, z0, -17), + p0 = svcmpeq (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u8.c new file mode 100644 index 000000000..ccd9a61c6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_u8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_u8_tied: +** cmpeq p0\.b, p0/z, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpeq_u8_tied, svuint8_t, + p0 = svcmpeq_u8 (p0, z0, z1), + p0 = svcmpeq (p0, z0, z1)) + +/* +** cmpeq_u8_untied: +** cmpeq p0\.b, p1/z, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpeq_u8_untied, svuint8_t, + p0 = svcmpeq_u8 (p1, z0, z1), + p0 = svcmpeq (p1, z0, z1)) + +/* +** cmpeq_w0_u8: +** mov (z[0-9]+\.b), w0 +** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_ZX (cmpeq_w0_u8, svuint8_t, uint8_t, + p0 = svcmpeq_n_u8 (p1, z0, x0), + p0 = svcmpeq (p1, z0, x0)) + +/* +** cmpeq_0_u8: +** cmpeq p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_0_u8, svuint8_t, + p0 = svcmpeq_n_u8 (p1, z0, 0), + p0 = svcmpeq (p1, z0, 0)) + +/* +** cmpeq_1_u8: +** cmpeq p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_1_u8, svuint8_t, + p0 = svcmpeq_n_u8 (p1, z0, 1), + p0 = svcmpeq (p1, z0, 1)) + +/* +** cmpeq_15_u8: +** cmpeq p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_15_u8, svuint8_t, + p0 = svcmpeq_n_u8 (p1, z0, 15), + p0 = svcmpeq (p1, z0, 15)) + +/* +** cmpeq_16_u8: 
+** mov (z[0-9]+\.b), #16 +** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpeq_16_u8, svuint8_t, + p0 = svcmpeq_n_u8 (p1, z0, 16), + p0 = svcmpeq (p1, z0, 16)) + +/* +** cmpeq_m1_u8: +** cmpeq p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m1_u8, svuint8_t, + p0 = svcmpeq_n_u8 (p1, z0, -1), + p0 = svcmpeq (p1, z0, -1)) + +/* +** cmpeq_m16_u8: +** cmpeq p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_m16_u8, svuint8_t, + p0 = svcmpeq_n_u8 (p1, z0, -16), + p0 = svcmpeq (p1, z0, -16)) + +/* +** cmpeq_m17_u8: +** mov (z[0-9]+\.b), #-17 +** cmpeq p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpeq_m17_u8, svuint8_t, + p0 = svcmpeq_n_u8 (p1, z0, -17), + p0 = svcmpeq (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s16.c new file mode 100644 index 000000000..c9712b3b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_wide_s16_tied: +** cmpeq p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpeq_wide_s16_tied, svint16_t, svint64_t, + p0 = svcmpeq_wide_s16 (p0, z0, z1), + p0 = svcmpeq_wide (p0, z0, z1)) + +/* +** cmpeq_wide_s16_untied: +** cmpeq p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpeq_wide_s16_untied, svint16_t, svint64_t, + p0 = svcmpeq_wide_s16 (p1, z0, z1), + p0 = svcmpeq_wide (p1, z0, z1)) + +/* +** cmpeq_wide_x0_s16: +** mov (z[0-9]+\.d), x0 +** cmpeq p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpeq_wide_x0_s16, svint16_t, int64_t, + p0 = svcmpeq_wide_n_s16 (p1, z0, x0), + p0 = svcmpeq_wide (p1, z0, x0)) + +/* +** cmpeq_wide_0_s16: +** cmpeq p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_0_s16, svint16_t, + p0 = svcmpeq_wide_n_s16 (p1, z0, 0), + p0 = svcmpeq_wide (p1, z0, 0)) + +/* +** cmpeq_wide_1_s16: +** cmpeq p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_1_s16, svint16_t, + p0 = svcmpeq_wide_n_s16 (p1, z0, 1), + p0 = svcmpeq_wide (p1, z0, 1)) + +/* +** cmpeq_wide_15_s16: +** cmpeq p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_15_s16, svint16_t, + p0 = svcmpeq_wide_n_s16 (p1, z0, 15), + p0 = svcmpeq_wide (p1, z0, 15)) + +/* +** cmpeq_wide_16_s16: +** mov (z[0-9]+\.d), #16 +** cmpeq p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_16_s16, svint16_t, + p0 = svcmpeq_wide_n_s16 (p1, z0, 16), + p0 = svcmpeq_wide (p1, z0, 16)) + +/* +** cmpeq_wide_m1_s16: +** cmpeq p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m1_s16, svint16_t, + p0 = svcmpeq_wide_n_s16 (p1, z0, -1), + p0 = svcmpeq_wide (p1, z0, -1)) + +/* +** cmpeq_wide_m16_s16: +** cmpeq p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m16_s16, svint16_t, + p0 = svcmpeq_wide_n_s16 (p1, z0, -16), + p0 = svcmpeq_wide (p1, z0, -16)) + +/* +** cmpeq_wide_m17_s16: +** mov (z[0-9]+\.d), #-17 +** cmpeq p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m17_s16, svint16_t, + p0 = svcmpeq_wide_n_s16 (p1, z0, -17), + p0 = svcmpeq_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s32.c new file mode 100644 index 000000000..22bd99f57 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s32.c @@ -0,0 +1,96 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_wide_s32_tied: +** cmpeq p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpeq_wide_s32_tied, svint32_t, svint64_t, + p0 = svcmpeq_wide_s32 (p0, z0, z1), + p0 = svcmpeq_wide (p0, z0, z1)) + +/* +** cmpeq_wide_s32_untied: +** cmpeq p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpeq_wide_s32_untied, svint32_t, svint64_t, + p0 = svcmpeq_wide_s32 (p1, z0, z1), + p0 = svcmpeq_wide (p1, z0, z1)) + +/* +** cmpeq_wide_x0_s32: +** mov (z[0-9]+\.d), x0 +** cmpeq p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpeq_wide_x0_s32, svint32_t, int64_t, + p0 = svcmpeq_wide_n_s32 (p1, z0, x0), + p0 = svcmpeq_wide (p1, z0, x0)) + +/* +** cmpeq_wide_0_s32: +** cmpeq p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_0_s32, svint32_t, + p0 = svcmpeq_wide_n_s32 (p1, z0, 0), + p0 = svcmpeq_wide (p1, z0, 0)) + +/* +** cmpeq_wide_1_s32: +** cmpeq p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_1_s32, svint32_t, + p0 = svcmpeq_wide_n_s32 (p1, z0, 1), + p0 = svcmpeq_wide (p1, z0, 1)) + +/* +** cmpeq_wide_15_s32: +** cmpeq p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_15_s32, svint32_t, + p0 = svcmpeq_wide_n_s32 (p1, z0, 15), + p0 = svcmpeq_wide (p1, z0, 15)) + +/* +** cmpeq_wide_16_s32: +** mov (z[0-9]+\.d), #16 +** cmpeq p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_16_s32, svint32_t, + p0 = svcmpeq_wide_n_s32 (p1, z0, 16), + p0 = svcmpeq_wide (p1, z0, 16)) + +/* +** cmpeq_wide_m1_s32: +** cmpeq p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m1_s32, svint32_t, + p0 = svcmpeq_wide_n_s32 (p1, z0, -1), + p0 = svcmpeq_wide (p1, z0, -1)) + +/* +** cmpeq_wide_m16_s32: +** cmpeq p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m16_s32, svint32_t, + p0 = svcmpeq_wide_n_s32 (p1, z0, -16), + p0 = svcmpeq_wide (p1, z0, -16)) + +/* +** cmpeq_wide_m17_s32: +** mov (z[0-9]+\.d), #-17 +** cmpeq p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m17_s32, svint32_t, + p0 = svcmpeq_wide_n_s32 (p1, z0, -17), + p0 = svcmpeq_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s8.c new file mode 100644 index 000000000..a9e9a0bf5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpeq_wide_s8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpeq_wide_s8_tied: +** cmpeq p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpeq_wide_s8_tied, svint8_t, svint64_t, + p0 = svcmpeq_wide_s8 (p0, z0, z1), + p0 = svcmpeq_wide (p0, z0, z1)) + +/* +** cmpeq_wide_s8_untied: +** cmpeq p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpeq_wide_s8_untied, svint8_t, svint64_t, + p0 = svcmpeq_wide_s8 (p1, z0, z1), + p0 = svcmpeq_wide (p1, z0, z1)) + +/* +** cmpeq_wide_x0_s8: +** mov (z[0-9]+\.d), x0 +** cmpeq p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpeq_wide_x0_s8, svint8_t, int64_t, + p0 = svcmpeq_wide_n_s8 (p1, z0, x0), + p0 = svcmpeq_wide (p1, z0, x0)) + +/* +** cmpeq_wide_0_s8: +** cmpeq p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_0_s8, svint8_t, + p0 = svcmpeq_wide_n_s8 (p1, z0, 0), + p0 = svcmpeq_wide (p1, z0, 0)) + +/* +** cmpeq_wide_1_s8: +** cmpeq p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_1_s8, svint8_t, + p0 = svcmpeq_wide_n_s8 (p1, z0, 1), + p0 = 
svcmpeq_wide (p1, z0, 1)) + +/* +** cmpeq_wide_15_s8: +** cmpeq p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_15_s8, svint8_t, + p0 = svcmpeq_wide_n_s8 (p1, z0, 15), + p0 = svcmpeq_wide (p1, z0, 15)) + +/* +** cmpeq_wide_16_s8: +** mov (z[0-9]+\.d), #16 +** cmpeq p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_16_s8, svint8_t, + p0 = svcmpeq_wide_n_s8 (p1, z0, 16), + p0 = svcmpeq_wide (p1, z0, 16)) + +/* +** cmpeq_wide_m1_s8: +** cmpeq p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m1_s8, svint8_t, + p0 = svcmpeq_wide_n_s8 (p1, z0, -1), + p0 = svcmpeq_wide (p1, z0, -1)) + +/* +** cmpeq_wide_m16_s8: +** cmpeq p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m16_s8, svint8_t, + p0 = svcmpeq_wide_n_s8 (p1, z0, -16), + p0 = svcmpeq_wide (p1, z0, -16)) + +/* +** cmpeq_wide_m17_s8: +** mov (z[0-9]+\.d), #-17 +** cmpeq p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpeq_wide_m17_s8, svint8_t, + p0 = svcmpeq_wide_n_s8 (p1, z0, -17), + p0 = svcmpeq_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f16.c new file mode 100644 index 000000000..a6db8c16a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f16.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_f16_tied: +** ( +** fcmge p0\.h, p0/z, z0\.h, z1\.h +** | +** fcmle p0\.h, p0/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_f16_tied, svfloat16_t, + p0 = svcmpge_f16 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_f16_untied: +** ( +** fcmge p0\.h, p1/z, z0\.h, z1\.h +** | +** fcmle p0\.h, p1/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_f16_untied, svfloat16_t, + p0 = svcmpge_f16 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_h4_f16: +** mov (z[0-9]+\.h), h4 +** ( +** fcmge p0\.h, p1/z, z0\.h, \1 +** | +** fcmle p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_ZD (cmpge_h4_f16, svfloat16_t, float16_t, + p0 = svcmpge_n_f16 (p1, z0, d4), + p0 = svcmpge (p1, z0, d4)) + +/* +** cmpge_0_f16: +** fcmge p0\.h, p1/z, z0\.h, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_f16, svfloat16_t, + p0 = svcmpge_n_f16 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** ( +** fcmge p0\.h, p1/z, z0\.h, \1 +** | +** fcmle p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_1_f16, svfloat16_t, + p0 = svcmpge_n_f16 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f32.c new file mode 100644 index 000000000..ee2976e58 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f32.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_f32_tied: +** ( +** fcmge p0\.s, p0/z, z0\.s, z1\.s +** | +** fcmle p0\.s, p0/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_f32_tied, svfloat32_t, + p0 = svcmpge_f32 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_f32_untied: +** ( +** fcmge p0\.s, p1/z, z0\.s, z1\.s +** | +** fcmle p0\.s, p1/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_f32_untied, svfloat32_t, + p0 = svcmpge_f32 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_s4_f32: +** mov (z[0-9]+\.s), s4 +** ( +** fcmge p0\.s, p1/z, z0\.s, \1 +** | +** fcmle p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_ZD (cmpge_s4_f32, svfloat32_t, float32_t, + p0 = svcmpge_n_f32 (p1, z0, d4), + p0 = svcmpge (p1, z0, d4)) + +/* +** cmpge_0_f32: +** fcmge p0\.s, p1/z, z0\.s, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_f32, svfloat32_t, + p0 = svcmpge_n_f32 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** ( +** fcmge p0\.s, p1/z, z0\.s, \1 +** | +** fcmle p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_1_f32, svfloat32_t, + p0 = svcmpge_n_f32 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f64.c new file mode 100644 index 000000000..ceea0afe3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_f64.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_f64_tied: +** ( +** fcmge p0\.d, p0/z, z0\.d, z1\.d +** | +** fcmle p0\.d, p0/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_f64_tied, svfloat64_t, + p0 = svcmpge_f64 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_f64_untied: +** ( +** fcmge p0\.d, p1/z, z0\.d, z1\.d +** | +** fcmle p0\.d, p1/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_f64_untied, svfloat64_t, + p0 = svcmpge_f64 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_d4_f64: +** mov (z[0-9]+\.d), d4 +** ( +** fcmge p0\.d, p1/z, z0\.d, \1 +** | +** fcmle p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_ZD (cmpge_d4_f64, svfloat64_t, float64_t, + p0 = svcmpge_n_f64 (p1, z0, d4), + p0 = svcmpge (p1, z0, d4)) + +/* +** cmpge_0_f64: +** fcmge p0\.d, p1/z, z0\.d, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_f64, svfloat64_t, + p0 = svcmpge_n_f64 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** ( +** fcmge p0\.d, p1/z, z0\.d, \1 +** | +** fcmle p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_1_f64, svfloat64_t, + p0 = svcmpge_n_f64 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s16.c new file mode 100644 index 000000000..de9180b84 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s16.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_s16_tied: +** ( +** cmpge p0\.h, p0/z, z0\.h, z1\.h +** | +** cmple p0\.h, p0/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_s16_tied, svint16_t, + p0 = svcmpge_s16 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_s16_untied: +** ( +** cmpge p0\.h, p1/z, z0\.h, z1\.h +** | +** cmple p0\.h, p1/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_s16_untied, svint16_t, + p0 = svcmpge_s16 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_w0_s16: +** mov (z[0-9]+\.h), w0 +** ( +** cmpge p0\.h, p1/z, z0\.h, \1 +** | +** cmple p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpge_w0_s16, svint16_t, int16_t, + p0 = svcmpge_n_s16 (p1, z0, x0), + p0 = svcmpge (p1, z0, x0)) + +/* +** cmpge_0_s16: +** cmpge p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_s16, svint16_t, + p0 = svcmpge_n_s16 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_s16: +** cmpge p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_1_s16, svint16_t, + p0 = svcmpge_n_s16 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) + +/* +** cmpge_15_s16: +** cmpge p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_15_s16, svint16_t, + p0 = svcmpge_n_s16 (p1, z0, 15), + p0 = svcmpge (p1, z0, 15)) + +/* +** cmpge_16_s16: +** mov (z[0-9]+\.h), #16 +** ( +** cmpge p0\.h, p1/z, z0\.h, \1 +** | +** cmple p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_16_s16, svint16_t, + p0 = svcmpge_n_s16 (p1, z0, 16), + p0 = svcmpge (p1, z0, 16)) + +/* +** cmpge_m1_s16: +** cmpge p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpge_m1_s16, svint16_t, + p0 = svcmpge_n_s16 (p1, z0, -1), + p0 = svcmpge (p1, z0, -1)) + +/* +** cmpge_m16_s16: +** cmpge p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpge_m16_s16, svint16_t, + p0 = svcmpge_n_s16 (p1, z0, -16), + p0 = svcmpge (p1, z0, -16)) + +/* +** cmpge_m17_s16: +** mov (z[0-9]+\.h), #-17 +** ( +** cmpge p0\.h, p1/z, z0\.h, \1 +** | +** cmple p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_m17_s16, svint16_t, + p0 = svcmpge_n_s16 (p1, z0, -17), + p0 = svcmpge (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s32.c new file mode 100644 index 000000000..67286b1fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s32.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_s32_tied: +** ( +** cmpge p0\.s, p0/z, z0\.s, z1\.s +** | +** cmple p0\.s, p0/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_s32_tied, svint32_t, + p0 = svcmpge_s32 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_s32_untied: +** ( +** cmpge p0\.s, p1/z, z0\.s, z1\.s +** | +** cmple p0\.s, p1/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_s32_untied, svint32_t, + p0 = svcmpge_s32 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** 
cmpge_w0_s32: +** mov (z[0-9]+\.s), w0 +** ( +** cmpge p0\.s, p1/z, z0\.s, \1 +** | +** cmple p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpge_w0_s32, svint32_t, int32_t, + p0 = svcmpge_n_s32 (p1, z0, x0), + p0 = svcmpge (p1, z0, x0)) + +/* +** cmpge_0_s32: +** cmpge p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_s32, svint32_t, + p0 = svcmpge_n_s32 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_s32: +** cmpge p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_1_s32, svint32_t, + p0 = svcmpge_n_s32 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) + +/* +** cmpge_15_s32: +** cmpge p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_15_s32, svint32_t, + p0 = svcmpge_n_s32 (p1, z0, 15), + p0 = svcmpge (p1, z0, 15)) + +/* +** cmpge_16_s32: +** mov (z[0-9]+\.s), #16 +** ( +** cmpge p0\.s, p1/z, z0\.s, \1 +** | +** cmple p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_16_s32, svint32_t, + p0 = svcmpge_n_s32 (p1, z0, 16), + p0 = svcmpge (p1, z0, 16)) + +/* +** cmpge_m1_s32: +** cmpge p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpge_m1_s32, svint32_t, + p0 = svcmpge_n_s32 (p1, z0, -1), + p0 = svcmpge (p1, z0, -1)) + +/* +** cmpge_m16_s32: +** cmpge p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpge_m16_s32, svint32_t, + p0 = svcmpge_n_s32 (p1, z0, -16), + p0 = svcmpge (p1, z0, -16)) + +/* +** cmpge_m17_s32: +** mov (z[0-9]+\.s), #-17 +** ( +** cmpge p0\.s, p1/z, z0\.s, \1 +** | +** cmple p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_m17_s32, svint32_t, + p0 = svcmpge_n_s32 (p1, z0, -17), + p0 = svcmpge (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s64.c new file mode 100644 index 000000000..02e3ac07a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s64.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_s64_tied: +** ( +** cmpge p0\.d, p0/z, z0\.d, z1\.d +** | +** cmple p0\.d, p0/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_s64_tied, svint64_t, + p0 = svcmpge_s64 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_s64_untied: +** ( +** cmpge p0\.d, p1/z, z0\.d, z1\.d +** | +** cmple p0\.d, p1/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_s64_untied, svint64_t, + p0 = svcmpge_s64 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_x0_s64: +** mov (z[0-9]+\.d), x0 +** ( +** cmpge p0\.d, p1/z, z0\.d, \1 +** | +** cmple p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpge_x0_s64, svint64_t, int64_t, + p0 = svcmpge_n_s64 (p1, z0, x0), + p0 = svcmpge (p1, z0, x0)) + +/* +** cmpge_0_s64: +** cmpge p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_s64, svint64_t, + p0 = svcmpge_n_s64 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_s64: +** cmpge p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_1_s64, svint64_t, + p0 = svcmpge_n_s64 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) + +/* +** cmpge_15_s64: +** cmpge p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_15_s64, svint64_t, + p0 = svcmpge_n_s64 (p1, z0, 15), + p0 = svcmpge (p1, z0, 15)) + +/* +** cmpge_16_s64: +** mov (z[0-9]+\.d), #16 +** ( +** cmpge p0\.d, p1/z, z0\.d, \1 +** | +** cmple p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_16_s64, svint64_t, + p0 = svcmpge_n_s64 (p1, z0, 16), + p0 = svcmpge (p1, z0, 16)) + +/* +** cmpge_m1_s64: +** cmpge p0\.d, 
p1/z, z0\.d, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpge_m1_s64, svint64_t, + p0 = svcmpge_n_s64 (p1, z0, -1), + p0 = svcmpge (p1, z0, -1)) + +/* +** cmpge_m16_s64: +** cmpge p0\.d, p1/z, z0\.d, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpge_m16_s64, svint64_t, + p0 = svcmpge_n_s64 (p1, z0, -16), + p0 = svcmpge (p1, z0, -16)) + +/* +** cmpge_m17_s64: +** mov (z[0-9]+\.d), #-17 +** ( +** cmpge p0\.d, p1/z, z0\.d, \1 +** | +** cmple p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_m17_s64, svint64_t, + p0 = svcmpge_n_s64 (p1, z0, -17), + p0 = svcmpge (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s8.c new file mode 100644 index 000000000..45c9c5f10 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_s8.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_s8_tied: +** ( +** cmpge p0\.b, p0/z, z0\.b, z1\.b +** | +** cmple p0\.b, p0/z, z1\.b, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_s8_tied, svint8_t, + p0 = svcmpge_s8 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_s8_untied: +** ( +** cmpge p0\.b, p1/z, z0\.b, z1\.b +** | +** cmple p0\.b, p1/z, z1\.b, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_s8_untied, svint8_t, + p0 = svcmpge_s8 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_w0_s8: +** mov (z[0-9]+\.b), w0 +** ( +** cmpge p0\.b, p1/z, z0\.b, \1 +** | +** cmple p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpge_w0_s8, svint8_t, int8_t, + p0 = svcmpge_n_s8 (p1, z0, x0), + p0 = svcmpge (p1, z0, x0)) + +/* +** cmpge_0_s8: +** cmpge p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_s8, svint8_t, + p0 = svcmpge_n_s8 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_s8: +** cmpge p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_1_s8, svint8_t, + p0 = svcmpge_n_s8 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) + +/* +** cmpge_15_s8: +** cmpge p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_15_s8, svint8_t, + p0 = svcmpge_n_s8 (p1, z0, 15), + p0 = svcmpge (p1, z0, 15)) + +/* +** cmpge_16_s8: +** mov (z[0-9]+\.b), #16 +** ( +** cmpge p0\.b, p1/z, z0\.b, \1 +** | +** cmple p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_16_s8, svint8_t, + p0 = svcmpge_n_s8 (p1, z0, 16), + p0 = svcmpge (p1, z0, 16)) + +/* +** cmpge_m1_s8: +** cmpge p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpge_m1_s8, svint8_t, + p0 = svcmpge_n_s8 (p1, z0, -1), + p0 = svcmpge (p1, z0, -1)) + +/* +** cmpge_m16_s8: +** cmpge p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpge_m16_s8, svint8_t, + p0 = svcmpge_n_s8 (p1, z0, -16), + p0 = svcmpge (p1, z0, -16)) + +/* +** cmpge_m17_s8: +** mov (z[0-9]+\.b), #-17 +** ( +** cmpge p0\.b, p1/z, z0\.b, \1 +** | +** cmple p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_m17_s8, svint8_t, + p0 = svcmpge_n_s8 (p1, z0, -17), + p0 = svcmpge (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u16.c new file mode 100644 index 000000000..7c7d2b307 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u16.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_u16_tied: +** ( +** cmphs p0\.h, p0/z, z0\.h, z1\.h +** | +** cmpls p0\.h, p0/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_u16_tied, 
svuint16_t, + p0 = svcmpge_u16 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_u16_untied: +** ( +** cmphs p0\.h, p1/z, z0\.h, z1\.h +** | +** cmpls p0\.h, p1/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_u16_untied, svuint16_t, + p0 = svcmpge_u16 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_w0_u16: +** mov (z[0-9]+\.h), w0 +** ( +** cmphs p0\.h, p1/z, z0\.h, \1 +** | +** cmpls p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpge_w0_u16, svuint16_t, uint16_t, + p0 = svcmpge_n_u16 (p1, z0, x0), + p0 = svcmpge (p1, z0, x0)) + +/* +** cmpge_0_u16: +** cmphs p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_u16, svuint16_t, + p0 = svcmpge_n_u16 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_u16: +** cmphs p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_1_u16, svuint16_t, + p0 = svcmpge_n_u16 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) + +/* +** cmpge_15_u16: +** cmphs p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_15_u16, svuint16_t, + p0 = svcmpge_n_u16 (p1, z0, 15), + p0 = svcmpge (p1, z0, 15)) + +/* +** cmpge_16_u16: +** cmphs p0\.h, p1/z, z0\.h, #16 +** ret +*/ +TEST_COMPARE_Z (cmpge_16_u16, svuint16_t, + p0 = svcmpge_n_u16 (p1, z0, 16), + p0 = svcmpge (p1, z0, 16)) + +/* +** cmpge_127_u16: +** cmphs p0\.h, p1/z, z0\.h, #127 +** ret +*/ +TEST_COMPARE_Z (cmpge_127_u16, svuint16_t, + p0 = svcmpge_n_u16 (p1, z0, 127), + p0 = svcmpge (p1, z0, 127)) + +/* +** cmpge_128_u16: +** mov (z[0-9]+\.h), #128 +** ( +** cmphs p0\.h, p1/z, z0\.h, \1 +** | +** cmpls p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_128_u16, svuint16_t, + p0 = svcmpge_n_u16 (p1, z0, 128), + p0 = svcmpge (p1, z0, 128)) + +/* +** cmpge_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphs p0\.h, p1/z, z0\.h, \1\.h +** | +** cmpls p0\.h, p1/z, \1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_m1_u16, svuint16_t, + p0 = svcmpge_n_u16 (p1, z0, -1), + p0 = svcmpge (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u32.c new file mode 100644 index 000000000..a2021ef50 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u32.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_u32_tied: +** ( +** cmphs p0\.s, p0/z, z0\.s, z1\.s +** | +** cmpls p0\.s, p0/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_u32_tied, svuint32_t, + p0 = svcmpge_u32 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_u32_untied: +** ( +** cmphs p0\.s, p1/z, z0\.s, z1\.s +** | +** cmpls p0\.s, p1/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_u32_untied, svuint32_t, + p0 = svcmpge_u32 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_w0_u32: +** mov (z[0-9]+\.s), w0 +** ( +** cmphs p0\.s, p1/z, z0\.s, \1 +** | +** cmpls p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpge_w0_u32, svuint32_t, uint32_t, + p0 = svcmpge_n_u32 (p1, z0, x0), + p0 = svcmpge (p1, z0, x0)) + +/* +** cmpge_0_u32: +** cmphs p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_u32, svuint32_t, + p0 = svcmpge_n_u32 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_u32: +** cmphs p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_1_u32, svuint32_t, + p0 = svcmpge_n_u32 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) + +/* +** cmpge_15_u32: +** cmphs p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_15_u32, svuint32_t, + p0 = 
svcmpge_n_u32 (p1, z0, 15), + p0 = svcmpge (p1, z0, 15)) + +/* +** cmpge_16_u32: +** cmphs p0\.s, p1/z, z0\.s, #16 +** ret +*/ +TEST_COMPARE_Z (cmpge_16_u32, svuint32_t, + p0 = svcmpge_n_u32 (p1, z0, 16), + p0 = svcmpge (p1, z0, 16)) + +/* +** cmpge_127_u32: +** cmphs p0\.s, p1/z, z0\.s, #127 +** ret +*/ +TEST_COMPARE_Z (cmpge_127_u32, svuint32_t, + p0 = svcmpge_n_u32 (p1, z0, 127), + p0 = svcmpge (p1, z0, 127)) + +/* +** cmpge_128_u32: +** mov (z[0-9]+\.s), #128 +** ( +** cmphs p0\.s, p1/z, z0\.s, \1 +** | +** cmpls p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_128_u32, svuint32_t, + p0 = svcmpge_n_u32 (p1, z0, 128), + p0 = svcmpge (p1, z0, 128)) + +/* +** cmpge_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphs p0\.s, p1/z, z0\.s, \1\.s +** | +** cmpls p0\.s, p1/z, \1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_m1_u32, svuint32_t, + p0 = svcmpge_n_u32 (p1, z0, -1), + p0 = svcmpge (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u64.c new file mode 100644 index 000000000..0f9159590 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u64.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_u64_tied: +** ( +** cmphs p0\.d, p0/z, z0\.d, z1\.d +** | +** cmpls p0\.d, p0/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_u64_tied, svuint64_t, + p0 = svcmpge_u64 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_u64_untied: +** ( +** cmphs p0\.d, p1/z, z0\.d, z1\.d +** | +** cmpls p0\.d, p1/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_u64_untied, svuint64_t, + p0 = svcmpge_u64 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_x0_u64: +** mov (z[0-9]+\.d), x0 +** ( +** cmphs p0\.d, p1/z, z0\.d, \1 +** | +** cmpls p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpge_x0_u64, svuint64_t, uint64_t, + p0 = svcmpge_n_u64 (p1, z0, x0), + p0 = svcmpge (p1, z0, x0)) + +/* +** cmpge_0_u64: +** cmphs p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_u64, svuint64_t, + p0 = svcmpge_n_u64 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_u64: +** cmphs p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_1_u64, svuint64_t, + p0 = svcmpge_n_u64 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) + +/* +** cmpge_15_u64: +** cmphs p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_15_u64, svuint64_t, + p0 = svcmpge_n_u64 (p1, z0, 15), + p0 = svcmpge (p1, z0, 15)) + +/* +** cmpge_16_u64: +** cmphs p0\.d, p1/z, z0\.d, #16 +** ret +*/ +TEST_COMPARE_Z (cmpge_16_u64, svuint64_t, + p0 = svcmpge_n_u64 (p1, z0, 16), + p0 = svcmpge (p1, z0, 16)) + +/* +** cmpge_127_u64: +** cmphs p0\.d, p1/z, z0\.d, #127 +** ret +*/ +TEST_COMPARE_Z (cmpge_127_u64, svuint64_t, + p0 = svcmpge_n_u64 (p1, z0, 127), + p0 = svcmpge (p1, z0, 127)) + +/* +** cmpge_128_u64: +** mov (z[0-9]+\.d), #128 +** ( +** cmphs p0\.d, p1/z, z0\.d, \1 +** | +** cmpls p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_128_u64, svuint64_t, + p0 = svcmpge_n_u64 (p1, z0, 128), + p0 = svcmpge (p1, z0, 128)) + +/* +** cmpge_m1_u64: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphs p0\.d, p1/z, z0\.d, \1\.d +** | +** cmpls p0\.d, p1/z, \1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_m1_u64, svuint64_t, + p0 = svcmpge_n_u64 (p1, z0, -1), + p0 = svcmpge (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u8.c new file mode 100644 index 000000000..39f988d01 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_u8.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_u8_tied: +** ( +** cmphs p0\.b, p0/z, z0\.b, z1\.b +** | +** cmpls p0\.b, p0/z, z1\.b, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_u8_tied, svuint8_t, + p0 = svcmpge_u8 (p0, z0, z1), + p0 = svcmpge (p0, z0, z1)) + +/* +** cmpge_u8_untied: +** ( +** cmphs p0\.b, p1/z, z0\.b, z1\.b +** | +** cmpls p0\.b, p1/z, z1\.b, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_u8_untied, svuint8_t, + p0 = svcmpge_u8 (p1, z0, z1), + p0 = svcmpge (p1, z0, z1)) + +/* +** cmpge_w0_u8: +** mov (z[0-9]+\.b), w0 +** ( +** cmphs p0\.b, p1/z, z0\.b, \1 +** | +** cmpls p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpge_w0_u8, svuint8_t, uint8_t, + p0 = svcmpge_n_u8 (p1, z0, x0), + p0 = svcmpge (p1, z0, x0)) + +/* +** cmpge_0_u8: +** cmphs p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_0_u8, svuint8_t, + p0 = svcmpge_n_u8 (p1, z0, 0), + p0 = svcmpge (p1, z0, 0)) + +/* +** cmpge_1_u8: +** cmphs p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_1_u8, svuint8_t, + p0 = svcmpge_n_u8 (p1, z0, 1), + p0 = svcmpge (p1, z0, 1)) + +/* +** cmpge_15_u8: +** cmphs p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_15_u8, svuint8_t, + p0 = svcmpge_n_u8 (p1, z0, 15), + p0 = svcmpge (p1, z0, 15)) + +/* +** cmpge_16_u8: +** cmphs p0\.b, p1/z, z0\.b, #16 +** ret +*/ +TEST_COMPARE_Z (cmpge_16_u8, svuint8_t, + p0 = svcmpge_n_u8 (p1, z0, 16), + p0 = svcmpge (p1, z0, 16)) + +/* +** cmpge_127_u8: +** cmphs p0\.b, p1/z, z0\.b, #127 +** ret +*/ +TEST_COMPARE_Z (cmpge_127_u8, svuint8_t, + p0 = svcmpge_n_u8 (p1, z0, 127), + p0 = svcmpge (p1, z0, 127)) + +/* +** cmpge_128_u8: +** mov (z[0-9]+\.b), #-128 +** ( +** cmphs p0\.b, p1/z, z0\.b, \1 +** | +** cmpls p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_128_u8, svuint8_t, + p0 = svcmpge_n_u8 (p1, z0, 128), + p0 = svcmpge (p1, z0, 128)) + +/* +** cmpge_m1_u8: +** mov (z[0-9]+\.b), #-1 +** ( +** cmphs p0\.b, p1/z, z0\.b, \1 +** | +** cmpls p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpge_m1_u8, svuint8_t, + p0 = svcmpge_n_u8 (p1, z0, -1), + p0 = svcmpge (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s16.c new file mode 100644 index 000000000..0400d7871 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_wide_s16_tied: +** cmpge p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_s16_tied, svint16_t, svint64_t, + p0 = svcmpge_wide_s16 (p0, z0, z1), + p0 = svcmpge_wide (p0, z0, z1)) + +/* +** cmpge_wide_s16_untied: +** cmpge p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_s16_untied, svint16_t, svint64_t, + p0 = svcmpge_wide_s16 (p1, z0, z1), + p0 = svcmpge_wide (p1, z0, z1)) + +/* +** cmpge_wide_x0_s16: +** mov (z[0-9]+\.d), x0 +** cmpge p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpge_wide_x0_s16, svint16_t, int64_t, + p0 = svcmpge_wide_n_s16 (p1, z0, x0), + p0 = svcmpge_wide (p1, z0, x0)) + +/* +** cmpge_wide_0_s16: +** cmpge p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_0_s16, 
svint16_t, + p0 = svcmpge_wide_n_s16 (p1, z0, 0), + p0 = svcmpge_wide (p1, z0, 0)) + +/* +** cmpge_wide_1_s16: +** cmpge p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_1_s16, svint16_t, + p0 = svcmpge_wide_n_s16 (p1, z0, 1), + p0 = svcmpge_wide (p1, z0, 1)) + +/* +** cmpge_wide_15_s16: +** cmpge p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_15_s16, svint16_t, + p0 = svcmpge_wide_n_s16 (p1, z0, 15), + p0 = svcmpge_wide (p1, z0, 15)) + +/* +** cmpge_wide_16_s16: +** mov (z[0-9]+\.d), #16 +** cmpge p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_16_s16, svint16_t, + p0 = svcmpge_wide_n_s16 (p1, z0, 16), + p0 = svcmpge_wide (p1, z0, 16)) + +/* +** cmpge_wide_m1_s16: +** cmpge p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m1_s16, svint16_t, + p0 = svcmpge_wide_n_s16 (p1, z0, -1), + p0 = svcmpge_wide (p1, z0, -1)) + +/* +** cmpge_wide_m16_s16: +** cmpge p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m16_s16, svint16_t, + p0 = svcmpge_wide_n_s16 (p1, z0, -16), + p0 = svcmpge_wide (p1, z0, -16)) + +/* +** cmpge_wide_m17_s16: +** mov (z[0-9]+\.d), #-17 +** cmpge p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m17_s16, svint16_t, + p0 = svcmpge_wide_n_s16 (p1, z0, -17), + p0 = svcmpge_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s32.c new file mode 100644 index 000000000..ad7b9c55b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_wide_s32_tied: +** cmpge p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_s32_tied, svint32_t, svint64_t, + p0 = svcmpge_wide_s32 (p0, z0, z1), + p0 = svcmpge_wide (p0, z0, z1)) + +/* +** cmpge_wide_s32_untied: +** cmpge p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_s32_untied, svint32_t, svint64_t, + p0 = svcmpge_wide_s32 (p1, z0, z1), + p0 = svcmpge_wide (p1, z0, z1)) + +/* +** cmpge_wide_x0_s32: +** mov (z[0-9]+\.d), x0 +** cmpge p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpge_wide_x0_s32, svint32_t, int64_t, + p0 = svcmpge_wide_n_s32 (p1, z0, x0), + p0 = svcmpge_wide (p1, z0, x0)) + +/* +** cmpge_wide_0_s32: +** cmpge p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_0_s32, svint32_t, + p0 = svcmpge_wide_n_s32 (p1, z0, 0), + p0 = svcmpge_wide (p1, z0, 0)) + +/* +** cmpge_wide_1_s32: +** cmpge p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_1_s32, svint32_t, + p0 = svcmpge_wide_n_s32 (p1, z0, 1), + p0 = svcmpge_wide (p1, z0, 1)) + +/* +** cmpge_wide_15_s32: +** cmpge p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_15_s32, svint32_t, + p0 = svcmpge_wide_n_s32 (p1, z0, 15), + p0 = svcmpge_wide (p1, z0, 15)) + +/* +** cmpge_wide_16_s32: +** mov (z[0-9]+\.d), #16 +** cmpge p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_16_s32, svint32_t, + p0 = svcmpge_wide_n_s32 (p1, z0, 16), + p0 = svcmpge_wide (p1, z0, 16)) + +/* +** cmpge_wide_m1_s32: +** cmpge p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m1_s32, svint32_t, + p0 = svcmpge_wide_n_s32 (p1, z0, -1), + p0 = svcmpge_wide (p1, z0, -1)) + +/* +** cmpge_wide_m16_s32: +** cmpge p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m16_s32, svint32_t, + p0 = svcmpge_wide_n_s32 (p1, z0, -16), + p0 = svcmpge_wide (p1, 
z0, -16)) + +/* +** cmpge_wide_m17_s32: +** mov (z[0-9]+\.d), #-17 +** cmpge p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m17_s32, svint32_t, + p0 = svcmpge_wide_n_s32 (p1, z0, -17), + p0 = svcmpge_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s8.c new file mode 100644 index 000000000..b03a42488 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_s8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_wide_s8_tied: +** cmpge p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_s8_tied, svint8_t, svint64_t, + p0 = svcmpge_wide_s8 (p0, z0, z1), + p0 = svcmpge_wide (p0, z0, z1)) + +/* +** cmpge_wide_s8_untied: +** cmpge p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_s8_untied, svint8_t, svint64_t, + p0 = svcmpge_wide_s8 (p1, z0, z1), + p0 = svcmpge_wide (p1, z0, z1)) + +/* +** cmpge_wide_x0_s8: +** mov (z[0-9]+\.d), x0 +** cmpge p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpge_wide_x0_s8, svint8_t, int64_t, + p0 = svcmpge_wide_n_s8 (p1, z0, x0), + p0 = svcmpge_wide (p1, z0, x0)) + +/* +** cmpge_wide_0_s8: +** cmpge p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_0_s8, svint8_t, + p0 = svcmpge_wide_n_s8 (p1, z0, 0), + p0 = svcmpge_wide (p1, z0, 0)) + +/* +** cmpge_wide_1_s8: +** cmpge p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_1_s8, svint8_t, + p0 = svcmpge_wide_n_s8 (p1, z0, 1), + p0 = svcmpge_wide (p1, z0, 1)) + +/* +** cmpge_wide_15_s8: +** cmpge p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_15_s8, svint8_t, + p0 = svcmpge_wide_n_s8 (p1, z0, 15), + p0 = svcmpge_wide (p1, z0, 15)) + +/* +** cmpge_wide_16_s8: +** mov (z[0-9]+\.d), #16 +** cmpge p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_16_s8, svint8_t, + p0 = svcmpge_wide_n_s8 (p1, z0, 16), + p0 = svcmpge_wide (p1, z0, 16)) + +/* +** cmpge_wide_m1_s8: +** cmpge p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m1_s8, svint8_t, + p0 = svcmpge_wide_n_s8 (p1, z0, -1), + p0 = svcmpge_wide (p1, z0, -1)) + +/* +** cmpge_wide_m16_s8: +** cmpge p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m16_s8, svint8_t, + p0 = svcmpge_wide_n_s8 (p1, z0, -16), + p0 = svcmpge_wide (p1, z0, -16)) + +/* +** cmpge_wide_m17_s8: +** mov (z[0-9]+\.d), #-17 +** cmpge p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m17_s8, svint8_t, + p0 = svcmpge_wide_n_s8 (p1, z0, -17), + p0 = svcmpge_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u16.c new file mode 100644 index 000000000..966b1e554 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_wide_u16_tied: +** cmphs p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_u16_tied, svuint16_t, svuint64_t, + p0 = svcmpge_wide_u16 (p0, z0, z1), + p0 = svcmpge_wide (p0, z0, z1)) + +/* +** cmpge_wide_u16_untied: +** cmphs p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_u16_untied, svuint16_t, svuint64_t, + p0 = svcmpge_wide_u16 (p1, z0, z1), + p0 = svcmpge_wide (p1, z0, z1)) + +/* +** cmpge_wide_x0_u16: +** mov (z[0-9]+\.d), x0 +** 
cmphs p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpge_wide_x0_u16, svuint16_t, uint64_t, + p0 = svcmpge_wide_n_u16 (p1, z0, x0), + p0 = svcmpge_wide (p1, z0, x0)) + +/* +** cmpge_wide_0_u16: +** cmphs p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_0_u16, svuint16_t, + p0 = svcmpge_wide_n_u16 (p1, z0, 0), + p0 = svcmpge_wide (p1, z0, 0)) + +/* +** cmpge_wide_1_u16: +** cmphs p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_1_u16, svuint16_t, + p0 = svcmpge_wide_n_u16 (p1, z0, 1), + p0 = svcmpge_wide (p1, z0, 1)) + +/* +** cmpge_wide_15_u16: +** cmphs p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_15_u16, svuint16_t, + p0 = svcmpge_wide_n_u16 (p1, z0, 15), + p0 = svcmpge_wide (p1, z0, 15)) + +/* +** cmpge_wide_16_u16: +** cmphs p0\.h, p1/z, z0\.h, #16 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_16_u16, svuint16_t, + p0 = svcmpge_wide_n_u16 (p1, z0, 16), + p0 = svcmpge_wide (p1, z0, 16)) + +/* +** cmpge_wide_127_u16: +** cmphs p0\.h, p1/z, z0\.h, #127 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_127_u16, svuint16_t, + p0 = svcmpge_wide_n_u16 (p1, z0, 127), + p0 = svcmpge_wide (p1, z0, 127)) + +/* +** cmpge_wide_128_u16: +** mov (z[0-9]+\.d), #128 +** cmphs p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_128_u16, svuint16_t, + p0 = svcmpge_wide_n_u16 (p1, z0, 128), + p0 = svcmpge_wide (p1, z0, 128)) + +/* +** cmpge_wide_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** cmphs p0\.h, p1/z, z0\.h, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m1_u16, svuint16_t, + p0 = svcmpge_wide_n_u16 (p1, z0, -1), + p0 = svcmpge_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u32.c new file mode 100644 index 000000000..fdeb53a46 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_wide_u32_tied: +** cmphs p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_u32_tied, svuint32_t, svuint64_t, + p0 = svcmpge_wide_u32 (p0, z0, z1), + p0 = svcmpge_wide (p0, z0, z1)) + +/* +** cmpge_wide_u32_untied: +** cmphs p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_u32_untied, svuint32_t, svuint64_t, + p0 = svcmpge_wide_u32 (p1, z0, z1), + p0 = svcmpge_wide (p1, z0, z1)) + +/* +** cmpge_wide_x0_u32: +** mov (z[0-9]+\.d), x0 +** cmphs p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpge_wide_x0_u32, svuint32_t, uint64_t, + p0 = svcmpge_wide_n_u32 (p1, z0, x0), + p0 = svcmpge_wide (p1, z0, x0)) + +/* +** cmpge_wide_0_u32: +** cmphs p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_0_u32, svuint32_t, + p0 = svcmpge_wide_n_u32 (p1, z0, 0), + p0 = svcmpge_wide (p1, z0, 0)) + +/* +** cmpge_wide_1_u32: +** cmphs p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_1_u32, svuint32_t, + p0 = svcmpge_wide_n_u32 (p1, z0, 1), + p0 = svcmpge_wide (p1, z0, 1)) + +/* +** cmpge_wide_15_u32: +** cmphs p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_15_u32, svuint32_t, + p0 = svcmpge_wide_n_u32 (p1, z0, 15), + p0 = svcmpge_wide (p1, z0, 15)) + +/* +** cmpge_wide_16_u32: +** cmphs p0\.s, p1/z, z0\.s, #16 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_16_u32, svuint32_t, + p0 = svcmpge_wide_n_u32 (p1, z0, 16), + p0 = svcmpge_wide (p1, z0, 16)) + +/* +** cmpge_wide_127_u32: +** cmphs p0\.s, p1/z, z0\.s, #127 +** ret +*/ +TEST_COMPARE_Z 
(cmpge_wide_127_u32, svuint32_t, + p0 = svcmpge_wide_n_u32 (p1, z0, 127), + p0 = svcmpge_wide (p1, z0, 127)) + +/* +** cmpge_wide_128_u32: +** mov (z[0-9]+\.d), #128 +** cmphs p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_128_u32, svuint32_t, + p0 = svcmpge_wide_n_u32 (p1, z0, 128), + p0 = svcmpge_wide (p1, z0, 128)) + +/* +** cmpge_wide_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** cmphs p0\.s, p1/z, z0\.s, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m1_u32, svuint32_t, + p0 = svcmpge_wide_n_u32 (p1, z0, -1), + p0 = svcmpge_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u8.c new file mode 100644 index 000000000..565093120 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpge_wide_u8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpge_wide_u8_tied: +** cmphs p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_u8_tied, svuint8_t, svuint64_t, + p0 = svcmpge_wide_u8 (p0, z0, z1), + p0 = svcmpge_wide (p0, z0, z1)) + +/* +** cmpge_wide_u8_untied: +** cmphs p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpge_wide_u8_untied, svuint8_t, svuint64_t, + p0 = svcmpge_wide_u8 (p1, z0, z1), + p0 = svcmpge_wide (p1, z0, z1)) + +/* +** cmpge_wide_x0_u8: +** mov (z[0-9]+\.d), x0 +** cmphs p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpge_wide_x0_u8, svuint8_t, uint64_t, + p0 = svcmpge_wide_n_u8 (p1, z0, x0), + p0 = svcmpge_wide (p1, z0, x0)) + +/* +** cmpge_wide_0_u8: +** cmphs p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_0_u8, svuint8_t, + p0 = svcmpge_wide_n_u8 (p1, z0, 0), + p0 = svcmpge_wide (p1, z0, 0)) + +/* +** cmpge_wide_1_u8: +** cmphs p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_1_u8, svuint8_t, + p0 = svcmpge_wide_n_u8 (p1, z0, 1), + p0 = svcmpge_wide (p1, z0, 1)) + +/* +** cmpge_wide_15_u8: +** cmphs p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_15_u8, svuint8_t, + p0 = svcmpge_wide_n_u8 (p1, z0, 15), + p0 = svcmpge_wide (p1, z0, 15)) + +/* +** cmpge_wide_16_u8: +** cmphs p0\.b, p1/z, z0\.b, #16 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_16_u8, svuint8_t, + p0 = svcmpge_wide_n_u8 (p1, z0, 16), + p0 = svcmpge_wide (p1, z0, 16)) + +/* +** cmpge_wide_127_u8: +** cmphs p0\.b, p1/z, z0\.b, #127 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_127_u8, svuint8_t, + p0 = svcmpge_wide_n_u8 (p1, z0, 127), + p0 = svcmpge_wide (p1, z0, 127)) + +/* +** cmpge_wide_128_u8: +** mov (z[0-9]+\.d), #128 +** cmphs p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_128_u8, svuint8_t, + p0 = svcmpge_wide_n_u8 (p1, z0, 128), + p0 = svcmpge_wide (p1, z0, 128)) + +/* +** cmpge_wide_m1_u8: +** mov (z[0-9]+)\.b, #-1 +** cmphs p0\.b, p1/z, z0\.b, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmpge_wide_m1_u8, svuint8_t, + p0 = svcmpge_wide_n_u8 (p1, z0, -1), + p0 = svcmpge_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f16.c new file mode 100644 index 000000000..69b015794 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f16.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_f16_tied: +** ( +** fcmgt p0\.h, p0/z, z0\.h, z1\.h +** | +** fcmlt p0\.h, p0/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_f16_tied, svfloat16_t, 
+ p0 = svcmpgt_f16 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_f16_untied: +** ( +** fcmgt p0\.h, p1/z, z0\.h, z1\.h +** | +** fcmlt p0\.h, p1/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_f16_untied, svfloat16_t, + p0 = svcmpgt_f16 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_h4_f16: +** mov (z[0-9]+\.h), h4 +** ( +** fcmgt p0\.h, p1/z, z0\.h, \1 +** | +** fcmlt p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_ZD (cmpgt_h4_f16, svfloat16_t, float16_t, + p0 = svcmpgt_n_f16 (p1, z0, d4), + p0 = svcmpgt (p1, z0, d4)) + +/* +** cmpgt_0_f16: +** fcmgt p0\.h, p1/z, z0\.h, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_f16, svfloat16_t, + p0 = svcmpgt_n_f16 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** ( +** fcmgt p0\.h, p1/z, z0\.h, \1 +** | +** fcmlt p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_f16, svfloat16_t, + p0 = svcmpgt_n_f16 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f32.c new file mode 100644 index 000000000..7d66b67c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f32.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_f32_tied: +** ( +** fcmgt p0\.s, p0/z, z0\.s, z1\.s +** | +** fcmlt p0\.s, p0/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_f32_tied, svfloat32_t, + p0 = svcmpgt_f32 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_f32_untied: +** ( +** fcmgt p0\.s, p1/z, z0\.s, z1\.s +** | +** fcmlt p0\.s, p1/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_f32_untied, svfloat32_t, + p0 = svcmpgt_f32 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_s4_f32: +** mov (z[0-9]+\.s), s4 +** ( +** fcmgt p0\.s, p1/z, z0\.s, \1 +** | +** fcmlt p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_ZD (cmpgt_s4_f32, svfloat32_t, float32_t, + p0 = svcmpgt_n_f32 (p1, z0, d4), + p0 = svcmpgt (p1, z0, d4)) + +/* +** cmpgt_0_f32: +** fcmgt p0\.s, p1/z, z0\.s, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_f32, svfloat32_t, + p0 = svcmpgt_n_f32 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** ( +** fcmgt p0\.s, p1/z, z0\.s, \1 +** | +** fcmlt p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_f32, svfloat32_t, + p0 = svcmpgt_n_f32 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f64.c new file mode 100644 index 000000000..f3a155476 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_f64.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_f64_tied: +** ( +** fcmgt p0\.d, p0/z, z0\.d, z1\.d +** | +** fcmlt p0\.d, p0/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_f64_tied, svfloat64_t, + p0 = svcmpgt_f64 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_f64_untied: +** ( +** fcmgt p0\.d, p1/z, z0\.d, z1\.d +** | +** fcmlt p0\.d, p1/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_f64_untied, svfloat64_t, + p0 = svcmpgt_f64 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_d4_f64: +** mov (z[0-9]+\.d), d4 +** ( +** fcmgt p0\.d, p1/z, z0\.d, \1 +** | +** fcmlt p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_ZD (cmpgt_d4_f64, svfloat64_t, float64_t, + p0 = svcmpgt_n_f64 (p1, z0, d4), + p0 = svcmpgt (p1, z0, d4)) + +/* +** cmpgt_0_f64: +** fcmgt p0\.d, p1/z, z0\.d, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_f64, svfloat64_t, + p0 = svcmpgt_n_f64 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** ( +** fcmgt p0\.d, p1/z, z0\.d, \1 +** | +** fcmlt p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_f64, svfloat64_t, + p0 = svcmpgt_n_f64 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s16.c new file mode 100644 index 000000000..cc86c0c00 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s16.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_s16_tied: +** ( +** cmpgt p0\.h, p0/z, z0\.h, z1\.h +** | +** cmplt p0\.h, p0/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_s16_tied, svint16_t, + p0 = svcmpgt_s16 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_s16_untied: +** ( +** cmpgt p0\.h, p1/z, z0\.h, z1\.h +** | +** cmplt p0\.h, p1/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_s16_untied, svint16_t, + p0 = svcmpgt_s16 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_w0_s16: +** mov (z[0-9]+\.h), w0 +** ( +** cmpgt p0\.h, p1/z, z0\.h, \1 +** | +** cmplt p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpgt_w0_s16, svint16_t, int16_t, + p0 = svcmpgt_n_s16 (p1, z0, x0), + p0 = svcmpgt (p1, z0, x0)) + +/* +** cmpgt_0_s16: +** cmpgt p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_s16, svint16_t, + p0 = svcmpgt_n_s16 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_s16: +** cmpgt p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_s16, svint16_t, + p0 = svcmpgt_n_s16 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) + +/* +** cmpgt_15_s16: +** cmpgt p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_15_s16, svint16_t, + p0 = svcmpgt_n_s16 (p1, z0, 15), + p0 = svcmpgt (p1, z0, 15)) + +/* +** cmpgt_16_s16: +** mov (z[0-9]+\.h), #16 +** ( +** cmpgt p0\.h, p1/z, z0\.h, \1 +** | +** cmplt p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_16_s16, 
svint16_t, + p0 = svcmpgt_n_s16 (p1, z0, 16), + p0 = svcmpgt (p1, z0, 16)) + +/* +** cmpgt_m1_s16: +** cmpgt p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_m1_s16, svint16_t, + p0 = svcmpgt_n_s16 (p1, z0, -1), + p0 = svcmpgt (p1, z0, -1)) + +/* +** cmpgt_m16_s16: +** cmpgt p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_m16_s16, svint16_t, + p0 = svcmpgt_n_s16 (p1, z0, -16), + p0 = svcmpgt (p1, z0, -16)) + +/* +** cmpgt_m17_s16: +** mov (z[0-9]+\.h), #-17 +** ( +** cmpgt p0\.h, p1/z, z0\.h, \1 +** | +** cmplt p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_m17_s16, svint16_t, + p0 = svcmpgt_n_s16 (p1, z0, -17), + p0 = svcmpgt (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s32.c new file mode 100644 index 000000000..75f0cc737 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s32.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_s32_tied: +** ( +** cmpgt p0\.s, p0/z, z0\.s, z1\.s +** | +** cmplt p0\.s, p0/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_s32_tied, svint32_t, + p0 = svcmpgt_s32 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_s32_untied: +** ( +** cmpgt p0\.s, p1/z, z0\.s, z1\.s +** | +** cmplt p0\.s, p1/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_s32_untied, svint32_t, + p0 = svcmpgt_s32 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_w0_s32: +** mov (z[0-9]+\.s), w0 +** ( +** cmpgt p0\.s, p1/z, z0\.s, \1 +** | +** cmplt p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpgt_w0_s32, svint32_t, int32_t, + p0 = svcmpgt_n_s32 (p1, z0, x0), + p0 = svcmpgt (p1, z0, x0)) + +/* +** cmpgt_0_s32: +** cmpgt p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_s32, svint32_t, + p0 = svcmpgt_n_s32 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_s32: +** cmpgt p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_s32, svint32_t, + p0 = svcmpgt_n_s32 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) + +/* +** cmpgt_15_s32: +** cmpgt p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_15_s32, svint32_t, + p0 = svcmpgt_n_s32 (p1, z0, 15), + p0 = svcmpgt (p1, z0, 15)) + +/* +** cmpgt_16_s32: +** mov (z[0-9]+\.s), #16 +** ( +** cmpgt p0\.s, p1/z, z0\.s, \1 +** | +** cmplt p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_16_s32, svint32_t, + p0 = svcmpgt_n_s32 (p1, z0, 16), + p0 = svcmpgt (p1, z0, 16)) + +/* +** cmpgt_m1_s32: +** cmpgt p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_m1_s32, svint32_t, + p0 = svcmpgt_n_s32 (p1, z0, -1), + p0 = svcmpgt (p1, z0, -1)) + +/* +** cmpgt_m16_s32: +** cmpgt p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_m16_s32, svint32_t, + p0 = svcmpgt_n_s32 (p1, z0, -16), + p0 = svcmpgt (p1, z0, -16)) + +/* +** cmpgt_m17_s32: +** mov (z[0-9]+\.s), #-17 +** ( +** cmpgt p0\.s, p1/z, z0\.s, \1 +** | +** cmplt p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_m17_s32, svint32_t, + p0 = svcmpgt_n_s32 (p1, z0, -17), + p0 = svcmpgt (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s64.c new file mode 100644 index 000000000..dbfd55e6f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s64.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + 
+/* +** cmpgt_s64_tied: +** ( +** cmpgt p0\.d, p0/z, z0\.d, z1\.d +** | +** cmplt p0\.d, p0/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_s64_tied, svint64_t, + p0 = svcmpgt_s64 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_s64_untied: +** ( +** cmpgt p0\.d, p1/z, z0\.d, z1\.d +** | +** cmplt p0\.d, p1/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_s64_untied, svint64_t, + p0 = svcmpgt_s64 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_x0_s64: +** mov (z[0-9]+\.d), x0 +** ( +** cmpgt p0\.d, p1/z, z0\.d, \1 +** | +** cmplt p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpgt_x0_s64, svint64_t, int64_t, + p0 = svcmpgt_n_s64 (p1, z0, x0), + p0 = svcmpgt (p1, z0, x0)) + +/* +** cmpgt_0_s64: +** cmpgt p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_s64, svint64_t, + p0 = svcmpgt_n_s64 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_s64: +** cmpgt p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_s64, svint64_t, + p0 = svcmpgt_n_s64 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) + +/* +** cmpgt_15_s64: +** cmpgt p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_15_s64, svint64_t, + p0 = svcmpgt_n_s64 (p1, z0, 15), + p0 = svcmpgt (p1, z0, 15)) + +/* +** cmpgt_16_s64: +** mov (z[0-9]+\.d), #16 +** ( +** cmpgt p0\.d, p1/z, z0\.d, \1 +** | +** cmplt p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_16_s64, svint64_t, + p0 = svcmpgt_n_s64 (p1, z0, 16), + p0 = svcmpgt (p1, z0, 16)) + +/* +** cmpgt_m1_s64: +** cmpgt p0\.d, p1/z, z0\.d, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_m1_s64, svint64_t, + p0 = svcmpgt_n_s64 (p1, z0, -1), + p0 = svcmpgt (p1, z0, -1)) + +/* +** cmpgt_m16_s64: +** cmpgt p0\.d, p1/z, z0\.d, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_m16_s64, svint64_t, + p0 = svcmpgt_n_s64 (p1, z0, -16), + p0 = svcmpgt (p1, z0, -16)) + +/* +** cmpgt_m17_s64: +** mov (z[0-9]+\.d), #-17 +** ( +** cmpgt p0\.d, p1/z, z0\.d, \1 +** | +** cmplt p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_m17_s64, svint64_t, + p0 = svcmpgt_n_s64 (p1, z0, -17), + p0 = svcmpgt (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s8.c new file mode 100644 index 000000000..710c2e602 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_s8.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_s8_tied: +** ( +** cmpgt p0\.b, p0/z, z0\.b, z1\.b +** | +** cmplt p0\.b, p0/z, z1\.b, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_s8_tied, svint8_t, + p0 = svcmpgt_s8 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_s8_untied: +** ( +** cmpgt p0\.b, p1/z, z0\.b, z1\.b +** | +** cmplt p0\.b, p1/z, z1\.b, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_s8_untied, svint8_t, + p0 = svcmpgt_s8 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_w0_s8: +** mov (z[0-9]+\.b), w0 +** ( +** cmpgt p0\.b, p1/z, z0\.b, \1 +** | +** cmplt p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpgt_w0_s8, svint8_t, int8_t, + p0 = svcmpgt_n_s8 (p1, z0, x0), + p0 = svcmpgt (p1, z0, x0)) + +/* +** cmpgt_0_s8: +** cmpgt p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_s8, svint8_t, + p0 = svcmpgt_n_s8 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_s8: +** cmpgt p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_s8, svint8_t, + p0 = svcmpgt_n_s8 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) + +/* +** 
cmpgt_15_s8: +** cmpgt p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_15_s8, svint8_t, + p0 = svcmpgt_n_s8 (p1, z0, 15), + p0 = svcmpgt (p1, z0, 15)) + +/* +** cmpgt_16_s8: +** mov (z[0-9]+\.b), #16 +** ( +** cmpgt p0\.b, p1/z, z0\.b, \1 +** | +** cmplt p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_16_s8, svint8_t, + p0 = svcmpgt_n_s8 (p1, z0, 16), + p0 = svcmpgt (p1, z0, 16)) + +/* +** cmpgt_m1_s8: +** cmpgt p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_m1_s8, svint8_t, + p0 = svcmpgt_n_s8 (p1, z0, -1), + p0 = svcmpgt (p1, z0, -1)) + +/* +** cmpgt_m16_s8: +** cmpgt p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_m16_s8, svint8_t, + p0 = svcmpgt_n_s8 (p1, z0, -16), + p0 = svcmpgt (p1, z0, -16)) + +/* +** cmpgt_m17_s8: +** mov (z[0-9]+\.b), #-17 +** ( +** cmpgt p0\.b, p1/z, z0\.b, \1 +** | +** cmplt p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_m17_s8, svint8_t, + p0 = svcmpgt_n_s8 (p1, z0, -17), + p0 = svcmpgt (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u16.c new file mode 100644 index 000000000..48e99c72c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u16.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_u16_tied: +** ( +** cmphi p0\.h, p0/z, z0\.h, z1\.h +** | +** cmplo p0\.h, p0/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_u16_tied, svuint16_t, + p0 = svcmpgt_u16 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_u16_untied: +** ( +** cmphi p0\.h, p1/z, z0\.h, z1\.h +** | +** cmplo p0\.h, p1/z, z1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_u16_untied, svuint16_t, + p0 = svcmpgt_u16 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_w0_u16: +** mov (z[0-9]+\.h), w0 +** ( +** cmphi p0\.h, p1/z, z0\.h, \1 +** | +** cmplo p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpgt_w0_u16, svuint16_t, uint16_t, + p0 = svcmpgt_n_u16 (p1, z0, x0), + p0 = svcmpgt (p1, z0, x0)) + +/* +** cmpgt_0_u16: +** cmphi p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_u16, svuint16_t, + p0 = svcmpgt_n_u16 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_u16: +** cmphi p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_u16, svuint16_t, + p0 = svcmpgt_n_u16 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) + +/* +** cmpgt_15_u16: +** cmphi p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_15_u16, svuint16_t, + p0 = svcmpgt_n_u16 (p1, z0, 15), + p0 = svcmpgt (p1, z0, 15)) + +/* +** cmpgt_16_u16: +** cmphi p0\.h, p1/z, z0\.h, #16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_16_u16, svuint16_t, + p0 = svcmpgt_n_u16 (p1, z0, 16), + p0 = svcmpgt (p1, z0, 16)) + +/* +** cmpgt_127_u16: +** cmphi p0\.h, p1/z, z0\.h, #127 +** ret +*/ +TEST_COMPARE_Z (cmpgt_127_u16, svuint16_t, + p0 = svcmpgt_n_u16 (p1, z0, 127), + p0 = svcmpgt (p1, z0, 127)) + +/* +** cmpgt_128_u16: +** mov (z[0-9]+\.h), #128 +** ( +** cmphi p0\.h, p1/z, z0\.h, \1 +** | +** cmplo p0\.h, p1/z, \1, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_128_u16, svuint16_t, + p0 = svcmpgt_n_u16 (p1, z0, 128), + p0 = svcmpgt (p1, z0, 128)) + +/* +** cmpgt_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphi p0\.h, p1/z, z0\.h, \1\.h +** | +** cmplo p0\.h, p1/z, \1\.h, z0\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_m1_u16, svuint16_t, + p0 = svcmpgt_n_u16 (p1, z0, -1), + p0 = svcmpgt (p1, z0, -1)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u32.c new file mode 100644 index 000000000..408037d72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u32.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_u32_tied: +** ( +** cmphi p0\.s, p0/z, z0\.s, z1\.s +** | +** cmplo p0\.s, p0/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_u32_tied, svuint32_t, + p0 = svcmpgt_u32 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_u32_untied: +** ( +** cmphi p0\.s, p1/z, z0\.s, z1\.s +** | +** cmplo p0\.s, p1/z, z1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_u32_untied, svuint32_t, + p0 = svcmpgt_u32 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_w0_u32: +** mov (z[0-9]+\.s), w0 +** ( +** cmphi p0\.s, p1/z, z0\.s, \1 +** | +** cmplo p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpgt_w0_u32, svuint32_t, uint32_t, + p0 = svcmpgt_n_u32 (p1, z0, x0), + p0 = svcmpgt (p1, z0, x0)) + +/* +** cmpgt_0_u32: +** cmphi p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_u32, svuint32_t, + p0 = svcmpgt_n_u32 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_u32: +** cmphi p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_u32, svuint32_t, + p0 = svcmpgt_n_u32 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) + +/* +** cmpgt_15_u32: +** cmphi p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_15_u32, svuint32_t, + p0 = svcmpgt_n_u32 (p1, z0, 15), + p0 = svcmpgt (p1, z0, 15)) + +/* +** cmpgt_16_u32: +** cmphi p0\.s, p1/z, z0\.s, #16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_16_u32, svuint32_t, + p0 = svcmpgt_n_u32 (p1, z0, 16), + p0 = svcmpgt (p1, z0, 16)) + +/* +** cmpgt_127_u32: +** cmphi p0\.s, p1/z, z0\.s, #127 +** ret +*/ +TEST_COMPARE_Z (cmpgt_127_u32, svuint32_t, + p0 = svcmpgt_n_u32 (p1, z0, 127), + p0 = svcmpgt (p1, z0, 127)) + +/* +** cmpgt_128_u32: +** mov (z[0-9]+\.s), #128 +** ( +** cmphi p0\.s, p1/z, z0\.s, \1 +** | +** cmplo p0\.s, p1/z, \1, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_128_u32, svuint32_t, + p0 = svcmpgt_n_u32 (p1, z0, 128), + p0 = svcmpgt (p1, z0, 128)) + +/* +** cmpgt_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphi p0\.s, p1/z, z0\.s, \1\.s +** | +** cmplo p0\.s, p1/z, \1\.s, z0\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_m1_u32, svuint32_t, + p0 = svcmpgt_n_u32 (p1, z0, -1), + p0 = svcmpgt (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u64.c new file mode 100644 index 000000000..f76a23e49 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u64.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_u64_tied: +** ( +** cmphi p0\.d, p0/z, z0\.d, z1\.d +** | +** cmplo p0\.d, p0/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_u64_tied, svuint64_t, + p0 = svcmpgt_u64 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_u64_untied: +** ( +** cmphi p0\.d, p1/z, z0\.d, z1\.d +** | +** cmplo p0\.d, p1/z, z1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_u64_untied, svuint64_t, + p0 = svcmpgt_u64 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_x0_u64: +** mov (z[0-9]+\.d), x0 +** ( +** cmphi p0\.d, p1/z, z0\.d, \1 +** | +** cmplo p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpgt_x0_u64, svuint64_t, uint64_t, + p0 = 
svcmpgt_n_u64 (p1, z0, x0), + p0 = svcmpgt (p1, z0, x0)) + +/* +** cmpgt_0_u64: +** cmphi p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_u64, svuint64_t, + p0 = svcmpgt_n_u64 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_u64: +** cmphi p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_u64, svuint64_t, + p0 = svcmpgt_n_u64 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) + +/* +** cmpgt_15_u64: +** cmphi p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_15_u64, svuint64_t, + p0 = svcmpgt_n_u64 (p1, z0, 15), + p0 = svcmpgt (p1, z0, 15)) + +/* +** cmpgt_16_u64: +** cmphi p0\.d, p1/z, z0\.d, #16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_16_u64, svuint64_t, + p0 = svcmpgt_n_u64 (p1, z0, 16), + p0 = svcmpgt (p1, z0, 16)) + +/* +** cmpgt_127_u64: +** cmphi p0\.d, p1/z, z0\.d, #127 +** ret +*/ +TEST_COMPARE_Z (cmpgt_127_u64, svuint64_t, + p0 = svcmpgt_n_u64 (p1, z0, 127), + p0 = svcmpgt (p1, z0, 127)) + +/* +** cmpgt_128_u64: +** mov (z[0-9]+\.d), #128 +** ( +** cmphi p0\.d, p1/z, z0\.d, \1 +** | +** cmplo p0\.d, p1/z, \1, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_128_u64, svuint64_t, + p0 = svcmpgt_n_u64 (p1, z0, 128), + p0 = svcmpgt (p1, z0, 128)) + +/* +** cmpgt_m1_u64: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphi p0\.d, p1/z, z0\.d, \1\.d +** | +** cmplo p0\.d, p1/z, \1\.d, z0\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_m1_u64, svuint64_t, + p0 = svcmpgt_n_u64 (p1, z0, -1), + p0 = svcmpgt (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u8.c new file mode 100644 index 000000000..4f28331f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_u8.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_u8_tied: +** ( +** cmphi p0\.b, p0/z, z0\.b, z1\.b +** | +** cmplo p0\.b, p0/z, z1\.b, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_u8_tied, svuint8_t, + p0 = svcmpgt_u8 (p0, z0, z1), + p0 = svcmpgt (p0, z0, z1)) + +/* +** cmpgt_u8_untied: +** ( +** cmphi p0\.b, p1/z, z0\.b, z1\.b +** | +** cmplo p0\.b, p1/z, z1\.b, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_u8_untied, svuint8_t, + p0 = svcmpgt_u8 (p1, z0, z1), + p0 = svcmpgt (p1, z0, z1)) + +/* +** cmpgt_w0_u8: +** mov (z[0-9]+\.b), w0 +** ( +** cmphi p0\.b, p1/z, z0\.b, \1 +** | +** cmplo p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_ZX (cmpgt_w0_u8, svuint8_t, uint8_t, + p0 = svcmpgt_n_u8 (p1, z0, x0), + p0 = svcmpgt (p1, z0, x0)) + +/* +** cmpgt_0_u8: +** cmphi p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_0_u8, svuint8_t, + p0 = svcmpgt_n_u8 (p1, z0, 0), + p0 = svcmpgt (p1, z0, 0)) + +/* +** cmpgt_1_u8: +** cmphi p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_1_u8, svuint8_t, + p0 = svcmpgt_n_u8 (p1, z0, 1), + p0 = svcmpgt (p1, z0, 1)) + +/* +** cmpgt_15_u8: +** cmphi p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_15_u8, svuint8_t, + p0 = svcmpgt_n_u8 (p1, z0, 15), + p0 = svcmpgt (p1, z0, 15)) + +/* +** cmpgt_16_u8: +** cmphi p0\.b, p1/z, z0\.b, #16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_16_u8, svuint8_t, + p0 = svcmpgt_n_u8 (p1, z0, 16), + p0 = svcmpgt (p1, z0, 16)) + +/* +** cmpgt_127_u8: +** cmphi p0\.b, p1/z, z0\.b, #127 +** ret +*/ +TEST_COMPARE_Z (cmpgt_127_u8, svuint8_t, + p0 = svcmpgt_n_u8 (p1, z0, 127), + p0 = svcmpgt (p1, z0, 127)) + +/* +** cmpgt_128_u8: +** mov (z[0-9]+\.b), #-128 +** ( +** cmphi p0\.b, p1/z, z0\.b, \1 +** | +** cmplo p0\.b, p1/z, \1, z0\.b +** ) +** ret 
+*/ +TEST_COMPARE_Z (cmpgt_128_u8, svuint8_t, + p0 = svcmpgt_n_u8 (p1, z0, 128), + p0 = svcmpgt (p1, z0, 128)) + +/* +** cmpgt_m1_u8: +** mov (z[0-9]+\.b), #-1 +** ( +** cmphi p0\.b, p1/z, z0\.b, \1 +** | +** cmplo p0\.b, p1/z, \1, z0\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmpgt_m1_u8, svuint8_t, + p0 = svcmpgt_n_u8 (p1, z0, -1), + p0 = svcmpgt (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s16.c new file mode 100644 index 000000000..07d3bbbd9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_wide_s16_tied: +** cmpgt p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_s16_tied, svint16_t, svint64_t, + p0 = svcmpgt_wide_s16 (p0, z0, z1), + p0 = svcmpgt_wide (p0, z0, z1)) + +/* +** cmpgt_wide_s16_untied: +** cmpgt p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_s16_untied, svint16_t, svint64_t, + p0 = svcmpgt_wide_s16 (p1, z0, z1), + p0 = svcmpgt_wide (p1, z0, z1)) + +/* +** cmpgt_wide_x0_s16: +** mov (z[0-9]+\.d), x0 +** cmpgt p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpgt_wide_x0_s16, svint16_t, int64_t, + p0 = svcmpgt_wide_n_s16 (p1, z0, x0), + p0 = svcmpgt_wide (p1, z0, x0)) + +/* +** cmpgt_wide_0_s16: +** cmpgt p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_0_s16, svint16_t, + p0 = svcmpgt_wide_n_s16 (p1, z0, 0), + p0 = svcmpgt_wide (p1, z0, 0)) + +/* +** cmpgt_wide_1_s16: +** cmpgt p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_1_s16, svint16_t, + p0 = svcmpgt_wide_n_s16 (p1, z0, 1), + p0 = svcmpgt_wide (p1, z0, 1)) + +/* +** cmpgt_wide_15_s16: +** cmpgt p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_15_s16, svint16_t, + p0 = svcmpgt_wide_n_s16 (p1, z0, 15), + p0 = svcmpgt_wide (p1, z0, 15)) + +/* +** cmpgt_wide_16_s16: +** mov (z[0-9]+\.d), #16 +** cmpgt p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_16_s16, svint16_t, + p0 = svcmpgt_wide_n_s16 (p1, z0, 16), + p0 = svcmpgt_wide (p1, z0, 16)) + +/* +** cmpgt_wide_m1_s16: +** cmpgt p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m1_s16, svint16_t, + p0 = svcmpgt_wide_n_s16 (p1, z0, -1), + p0 = svcmpgt_wide (p1, z0, -1)) + +/* +** cmpgt_wide_m16_s16: +** cmpgt p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m16_s16, svint16_t, + p0 = svcmpgt_wide_n_s16 (p1, z0, -16), + p0 = svcmpgt_wide (p1, z0, -16)) + +/* +** cmpgt_wide_m17_s16: +** mov (z[0-9]+\.d), #-17 +** cmpgt p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m17_s16, svint16_t, + p0 = svcmpgt_wide_n_s16 (p1, z0, -17), + p0 = svcmpgt_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s32.c new file mode 100644 index 000000000..f984362e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_wide_s32_tied: +** cmpgt p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_s32_tied, svint32_t, svint64_t, + p0 = svcmpgt_wide_s32 (p0, z0, z1), + p0 = svcmpgt_wide (p0, z0, z1)) + +/* +** cmpgt_wide_s32_untied: +** cmpgt p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z 
(cmpgt_wide_s32_untied, svint32_t, svint64_t, + p0 = svcmpgt_wide_s32 (p1, z0, z1), + p0 = svcmpgt_wide (p1, z0, z1)) + +/* +** cmpgt_wide_x0_s32: +** mov (z[0-9]+\.d), x0 +** cmpgt p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpgt_wide_x0_s32, svint32_t, int64_t, + p0 = svcmpgt_wide_n_s32 (p1, z0, x0), + p0 = svcmpgt_wide (p1, z0, x0)) + +/* +** cmpgt_wide_0_s32: +** cmpgt p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_0_s32, svint32_t, + p0 = svcmpgt_wide_n_s32 (p1, z0, 0), + p0 = svcmpgt_wide (p1, z0, 0)) + +/* +** cmpgt_wide_1_s32: +** cmpgt p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_1_s32, svint32_t, + p0 = svcmpgt_wide_n_s32 (p1, z0, 1), + p0 = svcmpgt_wide (p1, z0, 1)) + +/* +** cmpgt_wide_15_s32: +** cmpgt p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_15_s32, svint32_t, + p0 = svcmpgt_wide_n_s32 (p1, z0, 15), + p0 = svcmpgt_wide (p1, z0, 15)) + +/* +** cmpgt_wide_16_s32: +** mov (z[0-9]+\.d), #16 +** cmpgt p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_16_s32, svint32_t, + p0 = svcmpgt_wide_n_s32 (p1, z0, 16), + p0 = svcmpgt_wide (p1, z0, 16)) + +/* +** cmpgt_wide_m1_s32: +** cmpgt p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m1_s32, svint32_t, + p0 = svcmpgt_wide_n_s32 (p1, z0, -1), + p0 = svcmpgt_wide (p1, z0, -1)) + +/* +** cmpgt_wide_m16_s32: +** cmpgt p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m16_s32, svint32_t, + p0 = svcmpgt_wide_n_s32 (p1, z0, -16), + p0 = svcmpgt_wide (p1, z0, -16)) + +/* +** cmpgt_wide_m17_s32: +** mov (z[0-9]+\.d), #-17 +** cmpgt p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m17_s32, svint32_t, + p0 = svcmpgt_wide_n_s32 (p1, z0, -17), + p0 = svcmpgt_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s8.c new file mode 100644 index 000000000..07047a315 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_s8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_wide_s8_tied: +** cmpgt p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_s8_tied, svint8_t, svint64_t, + p0 = svcmpgt_wide_s8 (p0, z0, z1), + p0 = svcmpgt_wide (p0, z0, z1)) + +/* +** cmpgt_wide_s8_untied: +** cmpgt p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_s8_untied, svint8_t, svint64_t, + p0 = svcmpgt_wide_s8 (p1, z0, z1), + p0 = svcmpgt_wide (p1, z0, z1)) + +/* +** cmpgt_wide_x0_s8: +** mov (z[0-9]+\.d), x0 +** cmpgt p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpgt_wide_x0_s8, svint8_t, int64_t, + p0 = svcmpgt_wide_n_s8 (p1, z0, x0), + p0 = svcmpgt_wide (p1, z0, x0)) + +/* +** cmpgt_wide_0_s8: +** cmpgt p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_0_s8, svint8_t, + p0 = svcmpgt_wide_n_s8 (p1, z0, 0), + p0 = svcmpgt_wide (p1, z0, 0)) + +/* +** cmpgt_wide_1_s8: +** cmpgt p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_1_s8, svint8_t, + p0 = svcmpgt_wide_n_s8 (p1, z0, 1), + p0 = svcmpgt_wide (p1, z0, 1)) + +/* +** cmpgt_wide_15_s8: +** cmpgt p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_15_s8, svint8_t, + p0 = svcmpgt_wide_n_s8 (p1, z0, 15), + p0 = svcmpgt_wide (p1, z0, 15)) + +/* +** cmpgt_wide_16_s8: +** mov (z[0-9]+\.d), #16 +** cmpgt p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_16_s8, svint8_t, + p0 = svcmpgt_wide_n_s8 (p1, z0, 
16), + p0 = svcmpgt_wide (p1, z0, 16)) + +/* +** cmpgt_wide_m1_s8: +** cmpgt p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m1_s8, svint8_t, + p0 = svcmpgt_wide_n_s8 (p1, z0, -1), + p0 = svcmpgt_wide (p1, z0, -1)) + +/* +** cmpgt_wide_m16_s8: +** cmpgt p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m16_s8, svint8_t, + p0 = svcmpgt_wide_n_s8 (p1, z0, -16), + p0 = svcmpgt_wide (p1, z0, -16)) + +/* +** cmpgt_wide_m17_s8: +** mov (z[0-9]+\.d), #-17 +** cmpgt p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m17_s8, svint8_t, + p0 = svcmpgt_wide_n_s8 (p1, z0, -17), + p0 = svcmpgt_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u16.c new file mode 100644 index 000000000..bcffb88c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_wide_u16_tied: +** cmphi p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_u16_tied, svuint16_t, svuint64_t, + p0 = svcmpgt_wide_u16 (p0, z0, z1), + p0 = svcmpgt_wide (p0, z0, z1)) + +/* +** cmpgt_wide_u16_untied: +** cmphi p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_u16_untied, svuint16_t, svuint64_t, + p0 = svcmpgt_wide_u16 (p1, z0, z1), + p0 = svcmpgt_wide (p1, z0, z1)) + +/* +** cmpgt_wide_x0_u16: +** mov (z[0-9]+\.d), x0 +** cmphi p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpgt_wide_x0_u16, svuint16_t, uint64_t, + p0 = svcmpgt_wide_n_u16 (p1, z0, x0), + p0 = svcmpgt_wide (p1, z0, x0)) + +/* +** cmpgt_wide_0_u16: +** cmphi p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_0_u16, svuint16_t, + p0 = svcmpgt_wide_n_u16 (p1, z0, 0), + p0 = svcmpgt_wide (p1, z0, 0)) + +/* +** cmpgt_wide_1_u16: +** cmphi p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_1_u16, svuint16_t, + p0 = svcmpgt_wide_n_u16 (p1, z0, 1), + p0 = svcmpgt_wide (p1, z0, 1)) + +/* +** cmpgt_wide_15_u16: +** cmphi p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_15_u16, svuint16_t, + p0 = svcmpgt_wide_n_u16 (p1, z0, 15), + p0 = svcmpgt_wide (p1, z0, 15)) + +/* +** cmpgt_wide_16_u16: +** cmphi p0\.h, p1/z, z0\.h, #16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_16_u16, svuint16_t, + p0 = svcmpgt_wide_n_u16 (p1, z0, 16), + p0 = svcmpgt_wide (p1, z0, 16)) + +/* +** cmpgt_wide_127_u16: +** cmphi p0\.h, p1/z, z0\.h, #127 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_127_u16, svuint16_t, + p0 = svcmpgt_wide_n_u16 (p1, z0, 127), + p0 = svcmpgt_wide (p1, z0, 127)) + +/* +** cmpgt_wide_128_u16: +** mov (z[0-9]+\.d), #128 +** cmphi p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_128_u16, svuint16_t, + p0 = svcmpgt_wide_n_u16 (p1, z0, 128), + p0 = svcmpgt_wide (p1, z0, 128)) + +/* +** cmpgt_wide_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** cmphi p0\.h, p1/z, z0\.h, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m1_u16, svuint16_t, + p0 = svcmpgt_wide_n_u16 (p1, z0, -1), + p0 = svcmpgt_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u32.c new file mode 100644 index 000000000..65c0231e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** 
cmpgt_wide_u32_tied: +** cmphi p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_u32_tied, svuint32_t, svuint64_t, + p0 = svcmpgt_wide_u32 (p0, z0, z1), + p0 = svcmpgt_wide (p0, z0, z1)) + +/* +** cmpgt_wide_u32_untied: +** cmphi p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_u32_untied, svuint32_t, svuint64_t, + p0 = svcmpgt_wide_u32 (p1, z0, z1), + p0 = svcmpgt_wide (p1, z0, z1)) + +/* +** cmpgt_wide_x0_u32: +** mov (z[0-9]+\.d), x0 +** cmphi p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpgt_wide_x0_u32, svuint32_t, uint64_t, + p0 = svcmpgt_wide_n_u32 (p1, z0, x0), + p0 = svcmpgt_wide (p1, z0, x0)) + +/* +** cmpgt_wide_0_u32: +** cmphi p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_0_u32, svuint32_t, + p0 = svcmpgt_wide_n_u32 (p1, z0, 0), + p0 = svcmpgt_wide (p1, z0, 0)) + +/* +** cmpgt_wide_1_u32: +** cmphi p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_1_u32, svuint32_t, + p0 = svcmpgt_wide_n_u32 (p1, z0, 1), + p0 = svcmpgt_wide (p1, z0, 1)) + +/* +** cmpgt_wide_15_u32: +** cmphi p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_15_u32, svuint32_t, + p0 = svcmpgt_wide_n_u32 (p1, z0, 15), + p0 = svcmpgt_wide (p1, z0, 15)) + +/* +** cmpgt_wide_16_u32: +** cmphi p0\.s, p1/z, z0\.s, #16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_16_u32, svuint32_t, + p0 = svcmpgt_wide_n_u32 (p1, z0, 16), + p0 = svcmpgt_wide (p1, z0, 16)) + +/* +** cmpgt_wide_127_u32: +** cmphi p0\.s, p1/z, z0\.s, #127 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_127_u32, svuint32_t, + p0 = svcmpgt_wide_n_u32 (p1, z0, 127), + p0 = svcmpgt_wide (p1, z0, 127)) + +/* +** cmpgt_wide_128_u32: +** mov (z[0-9]+\.d), #128 +** cmphi p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_128_u32, svuint32_t, + p0 = svcmpgt_wide_n_u32 (p1, z0, 128), + p0 = svcmpgt_wide (p1, z0, 128)) + +/* +** cmpgt_wide_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** cmphi p0\.s, p1/z, z0\.s, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m1_u32, svuint32_t, + p0 = svcmpgt_wide_n_u32 (p1, z0, -1), + p0 = svcmpgt_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u8.c new file mode 100644 index 000000000..0d1142f27 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpgt_wide_u8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpgt_wide_u8_tied: +** cmphi p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_u8_tied, svuint8_t, svuint64_t, + p0 = svcmpgt_wide_u8 (p0, z0, z1), + p0 = svcmpgt_wide (p0, z0, z1)) + +/* +** cmpgt_wide_u8_untied: +** cmphi p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpgt_wide_u8_untied, svuint8_t, svuint64_t, + p0 = svcmpgt_wide_u8 (p1, z0, z1), + p0 = svcmpgt_wide (p1, z0, z1)) + +/* +** cmpgt_wide_x0_u8: +** mov (z[0-9]+\.d), x0 +** cmphi p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpgt_wide_x0_u8, svuint8_t, uint64_t, + p0 = svcmpgt_wide_n_u8 (p1, z0, x0), + p0 = svcmpgt_wide (p1, z0, x0)) + +/* +** cmpgt_wide_0_u8: +** cmphi p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_0_u8, svuint8_t, + p0 = svcmpgt_wide_n_u8 (p1, z0, 0), + p0 = svcmpgt_wide (p1, z0, 0)) + +/* +** cmpgt_wide_1_u8: +** cmphi p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_1_u8, svuint8_t, + p0 = svcmpgt_wide_n_u8 (p1, z0, 1), + p0 = svcmpgt_wide (p1, z0, 1)) + +/* +** cmpgt_wide_15_u8: +** cmphi 
p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_15_u8, svuint8_t, + p0 = svcmpgt_wide_n_u8 (p1, z0, 15), + p0 = svcmpgt_wide (p1, z0, 15)) + +/* +** cmpgt_wide_16_u8: +** cmphi p0\.b, p1/z, z0\.b, #16 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_16_u8, svuint8_t, + p0 = svcmpgt_wide_n_u8 (p1, z0, 16), + p0 = svcmpgt_wide (p1, z0, 16)) + +/* +** cmpgt_wide_127_u8: +** cmphi p0\.b, p1/z, z0\.b, #127 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_127_u8, svuint8_t, + p0 = svcmpgt_wide_n_u8 (p1, z0, 127), + p0 = svcmpgt_wide (p1, z0, 127)) + +/* +** cmpgt_wide_128_u8: +** mov (z[0-9]+\.d), #128 +** cmphi p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_128_u8, svuint8_t, + p0 = svcmpgt_wide_n_u8 (p1, z0, 128), + p0 = svcmpgt_wide (p1, z0, 128)) + +/* +** cmpgt_wide_m1_u8: +** mov (z[0-9]+)\.b, #-1 +** cmphi p0\.b, p1/z, z0\.b, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmpgt_wide_m1_u8, svuint8_t, + p0 = svcmpgt_wide_n_u8 (p1, z0, -1), + p0 = svcmpgt_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f16.c new file mode 100644 index 000000000..7d500590f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f16.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_f16_tied: +** ( +** fcmge p0\.h, p0/z, z1\.h, z0\.h +** | +** fcmle p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_f16_tied, svfloat16_t, + p0 = svcmple_f16 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_f16_untied: +** ( +** fcmge p0\.h, p1/z, z1\.h, z0\.h +** | +** fcmle p0\.h, p1/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_f16_untied, svfloat16_t, + p0 = svcmple_f16 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_h4_f16: +** mov (z[0-9]+\.h), h4 +** ( +** fcmge p0\.h, p1/z, \1, z0\.h +** | +** fcmle p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (cmple_h4_f16, svfloat16_t, float16_t, + p0 = svcmple_n_f16 (p1, z0, d4), + p0 = svcmple (p1, z0, d4)) + +/* +** cmple_0_f16: +** fcmle p0\.h, p1/z, z0\.h, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_f16, svfloat16_t, + p0 = svcmple_n_f16 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** ( +** fcmge p0\.h, p1/z, \1, z0\.h +** | +** fcmle p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_1_f16, svfloat16_t, + p0 = svcmple_n_f16 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f32.c new file mode 100644 index 000000000..3df63fef7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f32.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_f32_tied: +** ( +** fcmge p0\.s, p0/z, z1\.s, z0\.s +** | +** fcmle p0\.s, p0/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_f32_tied, svfloat32_t, + p0 = svcmple_f32 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_f32_untied: +** ( +** fcmge p0\.s, p1/z, z1\.s, z0\.s +** | +** fcmle p0\.s, p1/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_f32_untied, svfloat32_t, + p0 = svcmple_f32 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_s4_f32: +** mov (z[0-9]+\.s), s4 +** ( +** fcmge p0\.s, p1/z, \1, z0\.s +** | +** fcmle p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (cmple_s4_f32, svfloat32_t, float32_t, + p0 = svcmple_n_f32 (p1, z0, d4), + p0 = svcmple (p1, z0, d4)) + +/* +** cmple_0_f32: +** fcmle p0\.s, p1/z, z0\.s, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_f32, svfloat32_t, + p0 = svcmple_n_f32 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** ( +** fcmge p0\.s, p1/z, \1, z0\.s +** | +** fcmle p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_1_f32, svfloat32_t, + p0 = svcmple_n_f32 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f64.c new file mode 100644 index 000000000..5946a1b3a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_f64.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_f64_tied: +** ( +** fcmge p0\.d, p0/z, z1\.d, z0\.d +** | +** fcmle p0\.d, p0/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_f64_tied, svfloat64_t, + p0 = svcmple_f64 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_f64_untied: +** ( +** fcmge p0\.d, p1/z, z1\.d, z0\.d +** | +** fcmle p0\.d, p1/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_f64_untied, svfloat64_t, + p0 = svcmple_f64 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_d4_f64: +** mov (z[0-9]+\.d), d4 +** ( +** fcmge p0\.d, p1/z, \1, z0\.d +** | +** fcmle p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (cmple_d4_f64, svfloat64_t, float64_t, + p0 = svcmple_n_f64 (p1, z0, d4), + p0 = svcmple (p1, z0, d4)) + +/* +** cmple_0_f64: +** fcmle p0\.d, p1/z, z0\.d, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_f64, svfloat64_t, + p0 = svcmple_n_f64 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** ( +** fcmge p0\.d, p1/z, \1, z0\.d +** | +** fcmle p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_1_f64, svfloat64_t, + p0 = svcmple_n_f64 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s16.c new file mode 100644 index 000000000..9b221bb4c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s16.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_s16_tied: +** ( +** cmpge p0\.h, p0/z, z1\.h, z0\.h +** | +** cmple p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_s16_tied, svint16_t, + p0 = svcmple_s16 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_s16_untied: +** ( +** cmpge p0\.h, p1/z, z1\.h, z0\.h +** | +** cmple p0\.h, p1/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_s16_untied, svint16_t, + p0 = svcmple_s16 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_w0_s16: +** mov (z[0-9]+\.h), w0 +** ( +** cmpge p0\.h, p1/z, \1, z0\.h +** | +** cmple p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmple_w0_s16, svint16_t, int16_t, + p0 = svcmple_n_s16 (p1, z0, x0), + p0 = svcmple (p1, z0, x0)) + +/* +** cmple_0_s16: +** cmple p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_s16, svint16_t, + p0 = svcmple_n_s16 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_s16: +** cmple p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_1_s16, svint16_t, + p0 = svcmple_n_s16 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) + +/* +** cmple_15_s16: +** cmple p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_15_s16, svint16_t, + p0 = svcmple_n_s16 (p1, z0, 15), + p0 = svcmple (p1, z0, 15)) + +/* +** cmple_16_s16: +** mov (z[0-9]+\.h), #16 +** ( +** cmpge p0\.h, p1/z, \1, z0\.h +** | +** cmple p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_16_s16, svint16_t, + p0 = svcmple_n_s16 (p1, z0, 16), + p0 = svcmple (p1, z0, 16)) + +/* +** cmple_m1_s16: +** cmple p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmple_m1_s16, svint16_t, + p0 = svcmple_n_s16 (p1, z0, -1), + p0 = svcmple (p1, z0, -1)) + +/* +** cmple_m16_s16: +** cmple p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmple_m16_s16, svint16_t, + p0 = svcmple_n_s16 (p1, z0, -16), + p0 = svcmple (p1, z0, -16)) + +/* +** cmple_m17_s16: +** mov (z[0-9]+\.h), #-17 +** ( +** cmpge p0\.h, p1/z, \1, z0\.h +** | +** cmple p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_m17_s16, svint16_t, + p0 = svcmple_n_s16 (p1, z0, -17), + p0 = svcmple (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s32.c new file mode 100644 index 000000000..b0c8367e2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s32.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_s32_tied: +** ( +** cmpge p0\.s, p0/z, z1\.s, z0\.s +** | +** cmple p0\.s, p0/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_s32_tied, svint32_t, + p0 = svcmple_s32 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_s32_untied: +** ( +** cmpge p0\.s, p1/z, z1\.s, z0\.s +** | +** cmple p0\.s, p1/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_s32_untied, svint32_t, + p0 = svcmple_s32 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** 
cmple_w0_s32: +** mov (z[0-9]+\.s), w0 +** ( +** cmpge p0\.s, p1/z, \1, z0\.s +** | +** cmple p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmple_w0_s32, svint32_t, int32_t, + p0 = svcmple_n_s32 (p1, z0, x0), + p0 = svcmple (p1, z0, x0)) + +/* +** cmple_0_s32: +** cmple p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_s32, svint32_t, + p0 = svcmple_n_s32 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_s32: +** cmple p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_1_s32, svint32_t, + p0 = svcmple_n_s32 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) + +/* +** cmple_15_s32: +** cmple p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_15_s32, svint32_t, + p0 = svcmple_n_s32 (p1, z0, 15), + p0 = svcmple (p1, z0, 15)) + +/* +** cmple_16_s32: +** mov (z[0-9]+\.s), #16 +** ( +** cmpge p0\.s, p1/z, \1, z0\.s +** | +** cmple p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_16_s32, svint32_t, + p0 = svcmple_n_s32 (p1, z0, 16), + p0 = svcmple (p1, z0, 16)) + +/* +** cmple_m1_s32: +** cmple p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmple_m1_s32, svint32_t, + p0 = svcmple_n_s32 (p1, z0, -1), + p0 = svcmple (p1, z0, -1)) + +/* +** cmple_m16_s32: +** cmple p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmple_m16_s32, svint32_t, + p0 = svcmple_n_s32 (p1, z0, -16), + p0 = svcmple (p1, z0, -16)) + +/* +** cmple_m17_s32: +** mov (z[0-9]+\.s), #-17 +** ( +** cmpge p0\.s, p1/z, \1, z0\.s +** | +** cmple p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_m17_s32, svint32_t, + p0 = svcmple_n_s32 (p1, z0, -17), + p0 = svcmple (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s64.c new file mode 100644 index 000000000..faaa87614 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s64.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_s64_tied: +** ( +** cmpge p0\.d, p0/z, z1\.d, z0\.d +** | +** cmple p0\.d, p0/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_s64_tied, svint64_t, + p0 = svcmple_s64 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_s64_untied: +** ( +** cmpge p0\.d, p1/z, z1\.d, z0\.d +** | +** cmple p0\.d, p1/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_s64_untied, svint64_t, + p0 = svcmple_s64 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_x0_s64: +** mov (z[0-9]+\.d), x0 +** ( +** cmpge p0\.d, p1/z, \1, z0\.d +** | +** cmple p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmple_x0_s64, svint64_t, int64_t, + p0 = svcmple_n_s64 (p1, z0, x0), + p0 = svcmple (p1, z0, x0)) + +/* +** cmple_0_s64: +** cmple p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_s64, svint64_t, + p0 = svcmple_n_s64 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_s64: +** cmple p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_1_s64, svint64_t, + p0 = svcmple_n_s64 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) + +/* +** cmple_15_s64: +** cmple p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_15_s64, svint64_t, + p0 = svcmple_n_s64 (p1, z0, 15), + p0 = svcmple (p1, z0, 15)) + +/* +** cmple_16_s64: +** mov (z[0-9]+\.d), #16 +** ( +** cmpge p0\.d, p1/z, \1, z0\.d +** | +** cmple p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_16_s64, svint64_t, + p0 = svcmple_n_s64 (p1, z0, 16), + p0 = svcmple (p1, z0, 16)) + +/* +** cmple_m1_s64: +** cmple p0\.d, 
p1/z, z0\.d, #-1 +** ret +*/ +TEST_COMPARE_Z (cmple_m1_s64, svint64_t, + p0 = svcmple_n_s64 (p1, z0, -1), + p0 = svcmple (p1, z0, -1)) + +/* +** cmple_m16_s64: +** cmple p0\.d, p1/z, z0\.d, #-16 +** ret +*/ +TEST_COMPARE_Z (cmple_m16_s64, svint64_t, + p0 = svcmple_n_s64 (p1, z0, -16), + p0 = svcmple (p1, z0, -16)) + +/* +** cmple_m17_s64: +** mov (z[0-9]+\.d), #-17 +** ( +** cmpge p0\.d, p1/z, \1, z0\.d +** | +** cmple p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_m17_s64, svint64_t, + p0 = svcmple_n_s64 (p1, z0, -17), + p0 = svcmple (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s8.c new file mode 100644 index 000000000..222487d75 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_s8.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_s8_tied: +** ( +** cmpge p0\.b, p0/z, z1\.b, z0\.b +** | +** cmple p0\.b, p0/z, z0\.b, z1\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_s8_tied, svint8_t, + p0 = svcmple_s8 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_s8_untied: +** ( +** cmpge p0\.b, p1/z, z1\.b, z0\.b +** | +** cmple p0\.b, p1/z, z0\.b, z1\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_s8_untied, svint8_t, + p0 = svcmple_s8 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_w0_s8: +** mov (z[0-9]+\.b), w0 +** ( +** cmpge p0\.b, p1/z, \1, z0\.b +** | +** cmple p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmple_w0_s8, svint8_t, int8_t, + p0 = svcmple_n_s8 (p1, z0, x0), + p0 = svcmple (p1, z0, x0)) + +/* +** cmple_0_s8: +** cmple p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_s8, svint8_t, + p0 = svcmple_n_s8 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_s8: +** cmple p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_1_s8, svint8_t, + p0 = svcmple_n_s8 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) + +/* +** cmple_15_s8: +** cmple p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_15_s8, svint8_t, + p0 = svcmple_n_s8 (p1, z0, 15), + p0 = svcmple (p1, z0, 15)) + +/* +** cmple_16_s8: +** mov (z[0-9]+\.b), #16 +** ( +** cmpge p0\.b, p1/z, \1, z0\.b +** | +** cmple p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_16_s8, svint8_t, + p0 = svcmple_n_s8 (p1, z0, 16), + p0 = svcmple (p1, z0, 16)) + +/* +** cmple_m1_s8: +** cmple p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmple_m1_s8, svint8_t, + p0 = svcmple_n_s8 (p1, z0, -1), + p0 = svcmple (p1, z0, -1)) + +/* +** cmple_m16_s8: +** cmple p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmple_m16_s8, svint8_t, + p0 = svcmple_n_s8 (p1, z0, -16), + p0 = svcmple (p1, z0, -16)) + +/* +** cmple_m17_s8: +** mov (z[0-9]+\.b), #-17 +** ( +** cmpge p0\.b, p1/z, \1, z0\.b +** | +** cmple p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_m17_s8, svint8_t, + p0 = svcmple_n_s8 (p1, z0, -17), + p0 = svcmple (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u16.c new file mode 100644 index 000000000..26af06e52 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u16.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_u16_tied: +** ( +** cmphs p0\.h, p0/z, z1\.h, z0\.h +** | +** cmpls p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_u16_tied, 
svuint16_t, + p0 = svcmple_u16 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_u16_untied: +** ( +** cmphs p0\.h, p1/z, z1\.h, z0\.h +** | +** cmpls p0\.h, p1/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_u16_untied, svuint16_t, + p0 = svcmple_u16 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_w0_u16: +** mov (z[0-9]+\.h), w0 +** ( +** cmphs p0\.h, p1/z, \1, z0\.h +** | +** cmpls p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmple_w0_u16, svuint16_t, uint16_t, + p0 = svcmple_n_u16 (p1, z0, x0), + p0 = svcmple (p1, z0, x0)) + +/* +** cmple_0_u16: +** cmpls p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_u16, svuint16_t, + p0 = svcmple_n_u16 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_u16: +** cmpls p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_1_u16, svuint16_t, + p0 = svcmple_n_u16 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) + +/* +** cmple_15_u16: +** cmpls p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_15_u16, svuint16_t, + p0 = svcmple_n_u16 (p1, z0, 15), + p0 = svcmple (p1, z0, 15)) + +/* +** cmple_16_u16: +** cmpls p0\.h, p1/z, z0\.h, #16 +** ret +*/ +TEST_COMPARE_Z (cmple_16_u16, svuint16_t, + p0 = svcmple_n_u16 (p1, z0, 16), + p0 = svcmple (p1, z0, 16)) + +/* +** cmple_127_u16: +** cmpls p0\.h, p1/z, z0\.h, #127 +** ret +*/ +TEST_COMPARE_Z (cmple_127_u16, svuint16_t, + p0 = svcmple_n_u16 (p1, z0, 127), + p0 = svcmple (p1, z0, 127)) + +/* +** cmple_128_u16: +** mov (z[0-9]+\.h), #128 +** ( +** cmphs p0\.h, p1/z, \1, z0\.h +** | +** cmpls p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_128_u16, svuint16_t, + p0 = svcmple_n_u16 (p1, z0, 128), + p0 = svcmple (p1, z0, 128)) + +/* +** cmple_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphs p0\.h, p1/z, \1\.h, z0\.h +** | +** cmpls p0\.h, p1/z, z0\.h, \1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_m1_u16, svuint16_t, + p0 = svcmple_n_u16 (p1, z0, -1), + p0 = svcmple (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u32.c new file mode 100644 index 000000000..cee2d14c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u32.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_u32_tied: +** ( +** cmphs p0\.s, p0/z, z1\.s, z0\.s +** | +** cmpls p0\.s, p0/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_u32_tied, svuint32_t, + p0 = svcmple_u32 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_u32_untied: +** ( +** cmphs p0\.s, p1/z, z1\.s, z0\.s +** | +** cmpls p0\.s, p1/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_u32_untied, svuint32_t, + p0 = svcmple_u32 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_w0_u32: +** mov (z[0-9]+\.s), w0 +** ( +** cmphs p0\.s, p1/z, \1, z0\.s +** | +** cmpls p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmple_w0_u32, svuint32_t, uint32_t, + p0 = svcmple_n_u32 (p1, z0, x0), + p0 = svcmple (p1, z0, x0)) + +/* +** cmple_0_u32: +** cmpls p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_u32, svuint32_t, + p0 = svcmple_n_u32 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_u32: +** cmpls p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_1_u32, svuint32_t, + p0 = svcmple_n_u32 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) + +/* +** cmple_15_u32: +** cmpls p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_15_u32, svuint32_t, + p0 = 
svcmple_n_u32 (p1, z0, 15), + p0 = svcmple (p1, z0, 15)) + +/* +** cmple_16_u32: +** cmpls p0\.s, p1/z, z0\.s, #16 +** ret +*/ +TEST_COMPARE_Z (cmple_16_u32, svuint32_t, + p0 = svcmple_n_u32 (p1, z0, 16), + p0 = svcmple (p1, z0, 16)) + +/* +** cmple_127_u32: +** cmpls p0\.s, p1/z, z0\.s, #127 +** ret +*/ +TEST_COMPARE_Z (cmple_127_u32, svuint32_t, + p0 = svcmple_n_u32 (p1, z0, 127), + p0 = svcmple (p1, z0, 127)) + +/* +** cmple_128_u32: +** mov (z[0-9]+\.s), #128 +** ( +** cmphs p0\.s, p1/z, \1, z0\.s +** | +** cmpls p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_128_u32, svuint32_t, + p0 = svcmple_n_u32 (p1, z0, 128), + p0 = svcmple (p1, z0, 128)) + +/* +** cmple_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphs p0\.s, p1/z, \1\.s, z0\.s +** | +** cmpls p0\.s, p1/z, z0\.s, \1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_m1_u32, svuint32_t, + p0 = svcmple_n_u32 (p1, z0, -1), + p0 = svcmple (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u64.c new file mode 100644 index 000000000..b8388bca8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u64.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_u64_tied: +** ( +** cmphs p0\.d, p0/z, z1\.d, z0\.d +** | +** cmpls p0\.d, p0/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_u64_tied, svuint64_t, + p0 = svcmple_u64 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_u64_untied: +** ( +** cmphs p0\.d, p1/z, z1\.d, z0\.d +** | +** cmpls p0\.d, p1/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_u64_untied, svuint64_t, + p0 = svcmple_u64 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_x0_u64: +** mov (z[0-9]+\.d), x0 +** ( +** cmphs p0\.d, p1/z, \1, z0\.d +** | +** cmpls p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmple_x0_u64, svuint64_t, uint64_t, + p0 = svcmple_n_u64 (p1, z0, x0), + p0 = svcmple (p1, z0, x0)) + +/* +** cmple_0_u64: +** cmpls p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_u64, svuint64_t, + p0 = svcmple_n_u64 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_u64: +** cmpls p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_1_u64, svuint64_t, + p0 = svcmple_n_u64 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) + +/* +** cmple_15_u64: +** cmpls p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_15_u64, svuint64_t, + p0 = svcmple_n_u64 (p1, z0, 15), + p0 = svcmple (p1, z0, 15)) + +/* +** cmple_16_u64: +** cmpls p0\.d, p1/z, z0\.d, #16 +** ret +*/ +TEST_COMPARE_Z (cmple_16_u64, svuint64_t, + p0 = svcmple_n_u64 (p1, z0, 16), + p0 = svcmple (p1, z0, 16)) + +/* +** cmple_127_u64: +** cmpls p0\.d, p1/z, z0\.d, #127 +** ret +*/ +TEST_COMPARE_Z (cmple_127_u64, svuint64_t, + p0 = svcmple_n_u64 (p1, z0, 127), + p0 = svcmple (p1, z0, 127)) + +/* +** cmple_128_u64: +** mov (z[0-9]+\.d), #128 +** ( +** cmphs p0\.d, p1/z, \1, z0\.d +** | +** cmpls p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_128_u64, svuint64_t, + p0 = svcmple_n_u64 (p1, z0, 128), + p0 = svcmple (p1, z0, 128)) + +/* +** cmple_m1_u64: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphs p0\.d, p1/z, \1\.d, z0\.d +** | +** cmpls p0\.d, p1/z, z0\.d, \1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_m1_u64, svuint64_t, + p0 = svcmple_n_u64 (p1, z0, -1), + p0 = svcmple (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u8.c new file mode 100644 index 000000000..55a8d4f40 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_u8.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_u8_tied: +** ( +** cmphs p0\.b, p0/z, z1\.b, z0\.b +** | +** cmpls p0\.b, p0/z, z0\.b, z1\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_u8_tied, svuint8_t, + p0 = svcmple_u8 (p0, z0, z1), + p0 = svcmple (p0, z0, z1)) + +/* +** cmple_u8_untied: +** ( +** cmphs p0\.b, p1/z, z1\.b, z0\.b +** | +** cmpls p0\.b, p1/z, z0\.b, z1\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_u8_untied, svuint8_t, + p0 = svcmple_u8 (p1, z0, z1), + p0 = svcmple (p1, z0, z1)) + +/* +** cmple_w0_u8: +** mov (z[0-9]+\.b), w0 +** ( +** cmphs p0\.b, p1/z, \1, z0\.b +** | +** cmpls p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmple_w0_u8, svuint8_t, uint8_t, + p0 = svcmple_n_u8 (p1, z0, x0), + p0 = svcmple (p1, z0, x0)) + +/* +** cmple_0_u8: +** cmpls p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_0_u8, svuint8_t, + p0 = svcmple_n_u8 (p1, z0, 0), + p0 = svcmple (p1, z0, 0)) + +/* +** cmple_1_u8: +** cmpls p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_1_u8, svuint8_t, + p0 = svcmple_n_u8 (p1, z0, 1), + p0 = svcmple (p1, z0, 1)) + +/* +** cmple_15_u8: +** cmpls p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_15_u8, svuint8_t, + p0 = svcmple_n_u8 (p1, z0, 15), + p0 = svcmple (p1, z0, 15)) + +/* +** cmple_16_u8: +** cmpls p0\.b, p1/z, z0\.b, #16 +** ret +*/ +TEST_COMPARE_Z (cmple_16_u8, svuint8_t, + p0 = svcmple_n_u8 (p1, z0, 16), + p0 = svcmple (p1, z0, 16)) + +/* +** cmple_127_u8: +** cmpls p0\.b, p1/z, z0\.b, #127 +** ret +*/ +TEST_COMPARE_Z (cmple_127_u8, svuint8_t, + p0 = svcmple_n_u8 (p1, z0, 127), + p0 = svcmple (p1, z0, 127)) + +/* +** cmple_128_u8: +** mov (z[0-9]+\.b), #-128 +** ( +** cmphs p0\.b, p1/z, \1, z0\.b +** | +** cmpls p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_128_u8, svuint8_t, + p0 = svcmple_n_u8 (p1, z0, 128), + p0 = svcmple (p1, z0, 128)) + +/* +** cmple_m1_u8: +** mov (z[0-9]+\.b), #-1 +** ( +** cmphs p0\.b, p1/z, \1, z0\.b +** | +** cmpls p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmple_m1_u8, svuint8_t, + p0 = svcmple_n_u8 (p1, z0, -1), + p0 = svcmple (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s16.c new file mode 100644 index 000000000..f1f0b2ed6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_wide_s16_tied: +** cmple p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_s16_tied, svint16_t, svint64_t, + p0 = svcmple_wide_s16 (p0, z0, z1), + p0 = svcmple_wide (p0, z0, z1)) + +/* +** cmple_wide_s16_untied: +** cmple p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_s16_untied, svint16_t, svint64_t, + p0 = svcmple_wide_s16 (p1, z0, z1), + p0 = svcmple_wide (p1, z0, z1)) + +/* +** cmple_wide_x0_s16: +** mov (z[0-9]+\.d), x0 +** cmple p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmple_wide_x0_s16, svint16_t, int64_t, + p0 = svcmple_wide_n_s16 (p1, z0, x0), + p0 = svcmple_wide (p1, z0, x0)) + +/* +** cmple_wide_0_s16: +** cmple p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_0_s16, 
svint16_t, + p0 = svcmple_wide_n_s16 (p1, z0, 0), + p0 = svcmple_wide (p1, z0, 0)) + +/* +** cmple_wide_1_s16: +** cmple p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_1_s16, svint16_t, + p0 = svcmple_wide_n_s16 (p1, z0, 1), + p0 = svcmple_wide (p1, z0, 1)) + +/* +** cmple_wide_15_s16: +** cmple p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_15_s16, svint16_t, + p0 = svcmple_wide_n_s16 (p1, z0, 15), + p0 = svcmple_wide (p1, z0, 15)) + +/* +** cmple_wide_16_s16: +** mov (z[0-9]+\.d), #16 +** cmple p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_16_s16, svint16_t, + p0 = svcmple_wide_n_s16 (p1, z0, 16), + p0 = svcmple_wide (p1, z0, 16)) + +/* +** cmple_wide_m1_s16: +** cmple p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m1_s16, svint16_t, + p0 = svcmple_wide_n_s16 (p1, z0, -1), + p0 = svcmple_wide (p1, z0, -1)) + +/* +** cmple_wide_m16_s16: +** cmple p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m16_s16, svint16_t, + p0 = svcmple_wide_n_s16 (p1, z0, -16), + p0 = svcmple_wide (p1, z0, -16)) + +/* +** cmple_wide_m17_s16: +** mov (z[0-9]+\.d), #-17 +** cmple p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m17_s16, svint16_t, + p0 = svcmple_wide_n_s16 (p1, z0, -17), + p0 = svcmple_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s32.c new file mode 100644 index 000000000..edc5513b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_wide_s32_tied: +** cmple p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_s32_tied, svint32_t, svint64_t, + p0 = svcmple_wide_s32 (p0, z0, z1), + p0 = svcmple_wide (p0, z0, z1)) + +/* +** cmple_wide_s32_untied: +** cmple p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_s32_untied, svint32_t, svint64_t, + p0 = svcmple_wide_s32 (p1, z0, z1), + p0 = svcmple_wide (p1, z0, z1)) + +/* +** cmple_wide_x0_s32: +** mov (z[0-9]+\.d), x0 +** cmple p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmple_wide_x0_s32, svint32_t, int64_t, + p0 = svcmple_wide_n_s32 (p1, z0, x0), + p0 = svcmple_wide (p1, z0, x0)) + +/* +** cmple_wide_0_s32: +** cmple p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_0_s32, svint32_t, + p0 = svcmple_wide_n_s32 (p1, z0, 0), + p0 = svcmple_wide (p1, z0, 0)) + +/* +** cmple_wide_1_s32: +** cmple p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_1_s32, svint32_t, + p0 = svcmple_wide_n_s32 (p1, z0, 1), + p0 = svcmple_wide (p1, z0, 1)) + +/* +** cmple_wide_15_s32: +** cmple p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_15_s32, svint32_t, + p0 = svcmple_wide_n_s32 (p1, z0, 15), + p0 = svcmple_wide (p1, z0, 15)) + +/* +** cmple_wide_16_s32: +** mov (z[0-9]+\.d), #16 +** cmple p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_16_s32, svint32_t, + p0 = svcmple_wide_n_s32 (p1, z0, 16), + p0 = svcmple_wide (p1, z0, 16)) + +/* +** cmple_wide_m1_s32: +** cmple p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m1_s32, svint32_t, + p0 = svcmple_wide_n_s32 (p1, z0, -1), + p0 = svcmple_wide (p1, z0, -1)) + +/* +** cmple_wide_m16_s32: +** cmple p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m16_s32, svint32_t, + p0 = svcmple_wide_n_s32 (p1, z0, -16), + p0 = svcmple_wide (p1, 
z0, -16)) + +/* +** cmple_wide_m17_s32: +** mov (z[0-9]+\.d), #-17 +** cmple p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m17_s32, svint32_t, + p0 = svcmple_wide_n_s32 (p1, z0, -17), + p0 = svcmple_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s8.c new file mode 100644 index 000000000..984044460 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_s8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_wide_s8_tied: +** cmple p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_s8_tied, svint8_t, svint64_t, + p0 = svcmple_wide_s8 (p0, z0, z1), + p0 = svcmple_wide (p0, z0, z1)) + +/* +** cmple_wide_s8_untied: +** cmple p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_s8_untied, svint8_t, svint64_t, + p0 = svcmple_wide_s8 (p1, z0, z1), + p0 = svcmple_wide (p1, z0, z1)) + +/* +** cmple_wide_x0_s8: +** mov (z[0-9]+\.d), x0 +** cmple p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmple_wide_x0_s8, svint8_t, int64_t, + p0 = svcmple_wide_n_s8 (p1, z0, x0), + p0 = svcmple_wide (p1, z0, x0)) + +/* +** cmple_wide_0_s8: +** cmple p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_0_s8, svint8_t, + p0 = svcmple_wide_n_s8 (p1, z0, 0), + p0 = svcmple_wide (p1, z0, 0)) + +/* +** cmple_wide_1_s8: +** cmple p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_1_s8, svint8_t, + p0 = svcmple_wide_n_s8 (p1, z0, 1), + p0 = svcmple_wide (p1, z0, 1)) + +/* +** cmple_wide_15_s8: +** cmple p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_15_s8, svint8_t, + p0 = svcmple_wide_n_s8 (p1, z0, 15), + p0 = svcmple_wide (p1, z0, 15)) + +/* +** cmple_wide_16_s8: +** mov (z[0-9]+\.d), #16 +** cmple p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_16_s8, svint8_t, + p0 = svcmple_wide_n_s8 (p1, z0, 16), + p0 = svcmple_wide (p1, z0, 16)) + +/* +** cmple_wide_m1_s8: +** cmple p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m1_s8, svint8_t, + p0 = svcmple_wide_n_s8 (p1, z0, -1), + p0 = svcmple_wide (p1, z0, -1)) + +/* +** cmple_wide_m16_s8: +** cmple p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m16_s8, svint8_t, + p0 = svcmple_wide_n_s8 (p1, z0, -16), + p0 = svcmple_wide (p1, z0, -16)) + +/* +** cmple_wide_m17_s8: +** mov (z[0-9]+\.d), #-17 +** cmple p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m17_s8, svint8_t, + p0 = svcmple_wide_n_s8 (p1, z0, -17), + p0 = svcmple_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u16.c new file mode 100644 index 000000000..a39a1aad5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_wide_u16_tied: +** cmpls p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_u16_tied, svuint16_t, svuint64_t, + p0 = svcmple_wide_u16 (p0, z0, z1), + p0 = svcmple_wide (p0, z0, z1)) + +/* +** cmple_wide_u16_untied: +** cmpls p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_u16_untied, svuint16_t, svuint64_t, + p0 = svcmple_wide_u16 (p1, z0, z1), + p0 = svcmple_wide (p1, z0, z1)) + +/* +** cmple_wide_x0_u16: +** mov (z[0-9]+\.d), x0 +** 
cmpls p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmple_wide_x0_u16, svuint16_t, uint64_t, + p0 = svcmple_wide_n_u16 (p1, z0, x0), + p0 = svcmple_wide (p1, z0, x0)) + +/* +** cmple_wide_0_u16: +** cmpls p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_0_u16, svuint16_t, + p0 = svcmple_wide_n_u16 (p1, z0, 0), + p0 = svcmple_wide (p1, z0, 0)) + +/* +** cmple_wide_1_u16: +** cmpls p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_1_u16, svuint16_t, + p0 = svcmple_wide_n_u16 (p1, z0, 1), + p0 = svcmple_wide (p1, z0, 1)) + +/* +** cmple_wide_15_u16: +** cmpls p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_15_u16, svuint16_t, + p0 = svcmple_wide_n_u16 (p1, z0, 15), + p0 = svcmple_wide (p1, z0, 15)) + +/* +** cmple_wide_16_u16: +** cmpls p0\.h, p1/z, z0\.h, #16 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_16_u16, svuint16_t, + p0 = svcmple_wide_n_u16 (p1, z0, 16), + p0 = svcmple_wide (p1, z0, 16)) + +/* +** cmple_wide_127_u16: +** cmpls p0\.h, p1/z, z0\.h, #127 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_127_u16, svuint16_t, + p0 = svcmple_wide_n_u16 (p1, z0, 127), + p0 = svcmple_wide (p1, z0, 127)) + +/* +** cmple_wide_128_u16: +** mov (z[0-9]+\.d), #128 +** cmpls p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_128_u16, svuint16_t, + p0 = svcmple_wide_n_u16 (p1, z0, 128), + p0 = svcmple_wide (p1, z0, 128)) + +/* +** cmple_wide_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** cmpls p0\.h, p1/z, z0\.h, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m1_u16, svuint16_t, + p0 = svcmple_wide_n_u16 (p1, z0, -1), + p0 = svcmple_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u32.c new file mode 100644 index 000000000..fe682c9e8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_wide_u32_tied: +** cmpls p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_u32_tied, svuint32_t, svuint64_t, + p0 = svcmple_wide_u32 (p0, z0, z1), + p0 = svcmple_wide (p0, z0, z1)) + +/* +** cmple_wide_u32_untied: +** cmpls p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_u32_untied, svuint32_t, svuint64_t, + p0 = svcmple_wide_u32 (p1, z0, z1), + p0 = svcmple_wide (p1, z0, z1)) + +/* +** cmple_wide_x0_u32: +** mov (z[0-9]+\.d), x0 +** cmpls p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmple_wide_x0_u32, svuint32_t, uint64_t, + p0 = svcmple_wide_n_u32 (p1, z0, x0), + p0 = svcmple_wide (p1, z0, x0)) + +/* +** cmple_wide_0_u32: +** cmpls p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_0_u32, svuint32_t, + p0 = svcmple_wide_n_u32 (p1, z0, 0), + p0 = svcmple_wide (p1, z0, 0)) + +/* +** cmple_wide_1_u32: +** cmpls p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_1_u32, svuint32_t, + p0 = svcmple_wide_n_u32 (p1, z0, 1), + p0 = svcmple_wide (p1, z0, 1)) + +/* +** cmple_wide_15_u32: +** cmpls p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_15_u32, svuint32_t, + p0 = svcmple_wide_n_u32 (p1, z0, 15), + p0 = svcmple_wide (p1, z0, 15)) + +/* +** cmple_wide_16_u32: +** cmpls p0\.s, p1/z, z0\.s, #16 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_16_u32, svuint32_t, + p0 = svcmple_wide_n_u32 (p1, z0, 16), + p0 = svcmple_wide (p1, z0, 16)) + +/* +** cmple_wide_127_u32: +** cmpls p0\.s, p1/z, z0\.s, #127 +** ret +*/ +TEST_COMPARE_Z 
(cmple_wide_127_u32, svuint32_t, + p0 = svcmple_wide_n_u32 (p1, z0, 127), + p0 = svcmple_wide (p1, z0, 127)) + +/* +** cmple_wide_128_u32: +** mov (z[0-9]+\.d), #128 +** cmpls p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_128_u32, svuint32_t, + p0 = svcmple_wide_n_u32 (p1, z0, 128), + p0 = svcmple_wide (p1, z0, 128)) + +/* +** cmple_wide_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** cmpls p0\.s, p1/z, z0\.s, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m1_u32, svuint32_t, + p0 = svcmple_wide_n_u32 (p1, z0, -1), + p0 = svcmple_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u8.c new file mode 100644 index 000000000..893dfa627 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmple_wide_u8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmple_wide_u8_tied: +** cmpls p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_u8_tied, svuint8_t, svuint64_t, + p0 = svcmple_wide_u8 (p0, z0, z1), + p0 = svcmple_wide (p0, z0, z1)) + +/* +** cmple_wide_u8_untied: +** cmpls p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmple_wide_u8_untied, svuint8_t, svuint64_t, + p0 = svcmple_wide_u8 (p1, z0, z1), + p0 = svcmple_wide (p1, z0, z1)) + +/* +** cmple_wide_x0_u8: +** mov (z[0-9]+\.d), x0 +** cmpls p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmple_wide_x0_u8, svuint8_t, uint64_t, + p0 = svcmple_wide_n_u8 (p1, z0, x0), + p0 = svcmple_wide (p1, z0, x0)) + +/* +** cmple_wide_0_u8: +** cmpls p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_0_u8, svuint8_t, + p0 = svcmple_wide_n_u8 (p1, z0, 0), + p0 = svcmple_wide (p1, z0, 0)) + +/* +** cmple_wide_1_u8: +** cmpls p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_1_u8, svuint8_t, + p0 = svcmple_wide_n_u8 (p1, z0, 1), + p0 = svcmple_wide (p1, z0, 1)) + +/* +** cmple_wide_15_u8: +** cmpls p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_15_u8, svuint8_t, + p0 = svcmple_wide_n_u8 (p1, z0, 15), + p0 = svcmple_wide (p1, z0, 15)) + +/* +** cmple_wide_16_u8: +** cmpls p0\.b, p1/z, z0\.b, #16 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_16_u8, svuint8_t, + p0 = svcmple_wide_n_u8 (p1, z0, 16), + p0 = svcmple_wide (p1, z0, 16)) + +/* +** cmple_wide_127_u8: +** cmpls p0\.b, p1/z, z0\.b, #127 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_127_u8, svuint8_t, + p0 = svcmple_wide_n_u8 (p1, z0, 127), + p0 = svcmple_wide (p1, z0, 127)) + +/* +** cmple_wide_128_u8: +** mov (z[0-9]+\.d), #128 +** cmpls p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmple_wide_128_u8, svuint8_t, + p0 = svcmple_wide_n_u8 (p1, z0, 128), + p0 = svcmple_wide (p1, z0, 128)) + +/* +** cmple_wide_m1_u8: +** mov (z[0-9]+)\.b, #-1 +** cmpls p0\.b, p1/z, z0\.b, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmple_wide_m1_u8, svuint8_t, + p0 = svcmple_wide_n_u8 (p1, z0, -1), + p0 = svcmple_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f16.c new file mode 100644 index 000000000..598f673a8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f16.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_f16_tied: +** ( +** fcmgt p0\.h, p0/z, z1\.h, z0\.h +** | +** fcmlt p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_f16_tied, svfloat16_t, 
+ p0 = svcmplt_f16 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_f16_untied: +** ( +** fcmgt p0\.h, p1/z, z1\.h, z0\.h +** | +** fcmlt p0\.h, p1/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_f16_untied, svfloat16_t, + p0 = svcmplt_f16 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_h4_f16: +** mov (z[0-9]+\.h), h4 +** ( +** fcmgt p0\.h, p1/z, \1, z0\.h +** | +** fcmlt p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (cmplt_h4_f16, svfloat16_t, float16_t, + p0 = svcmplt_n_f16 (p1, z0, d4), + p0 = svcmplt (p1, z0, d4)) + +/* +** cmplt_0_f16: +** fcmlt p0\.h, p1/z, z0\.h, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_f16, svfloat16_t, + p0 = svcmplt_n_f16 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** ( +** fcmgt p0\.h, p1/z, \1, z0\.h +** | +** fcmlt p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_1_f16, svfloat16_t, + p0 = svcmplt_n_f16 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f32.c new file mode 100644 index 000000000..f9dea3665 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f32.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_f32_tied: +** ( +** fcmgt p0\.s, p0/z, z1\.s, z0\.s +** | +** fcmlt p0\.s, p0/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_f32_tied, svfloat32_t, + p0 = svcmplt_f32 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_f32_untied: +** ( +** fcmgt p0\.s, p1/z, z1\.s, z0\.s +** | +** fcmlt p0\.s, p1/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_f32_untied, svfloat32_t, + p0 = svcmplt_f32 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_s4_f32: +** mov (z[0-9]+\.s), s4 +** ( +** fcmgt p0\.s, p1/z, \1, z0\.s +** | +** fcmlt p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (cmplt_s4_f32, svfloat32_t, float32_t, + p0 = svcmplt_n_f32 (p1, z0, d4), + p0 = svcmplt (p1, z0, d4)) + +/* +** cmplt_0_f32: +** fcmlt p0\.s, p1/z, z0\.s, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_f32, svfloat32_t, + p0 = svcmplt_n_f32 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** ( +** fcmgt p0\.s, p1/z, \1, z0\.s +** | +** fcmlt p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_1_f32, svfloat32_t, + p0 = svcmplt_n_f32 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f64.c new file mode 100644 index 000000000..6f251db4f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_f64.c @@ -0,0 +1,66 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_f64_tied: +** ( +** fcmgt p0\.d, p0/z, z1\.d, z0\.d +** | +** fcmlt p0\.d, p0/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_f64_tied, svfloat64_t, + p0 = svcmplt_f64 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_f64_untied: +** ( +** fcmgt p0\.d, p1/z, z1\.d, z0\.d +** | +** fcmlt p0\.d, p1/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_f64_untied, svfloat64_t, + p0 = svcmplt_f64 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_d4_f64: +** mov (z[0-9]+\.d), d4 +** ( +** fcmgt p0\.d, p1/z, \1, z0\.d +** | +** fcmlt p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_ZD (cmplt_d4_f64, svfloat64_t, float64_t, + p0 = svcmplt_n_f64 (p1, z0, d4), + p0 = svcmplt (p1, z0, d4)) + +/* +** cmplt_0_f64: +** fcmlt p0\.d, p1/z, z0\.d, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_f64, svfloat64_t, + p0 = svcmplt_n_f64 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** ( +** fcmgt p0\.d, p1/z, \1, z0\.d +** | +** fcmlt p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_1_f64, svfloat64_t, + p0 = svcmplt_n_f64 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s16.c new file mode 100644 index 000000000..1e2bf9dde --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s16.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_s16_tied: +** ( +** cmpgt p0\.h, p0/z, z1\.h, z0\.h +** | +** cmplt p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_s16_tied, svint16_t, + p0 = svcmplt_s16 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_s16_untied: +** ( +** cmpgt p0\.h, p1/z, z1\.h, z0\.h +** | +** cmplt p0\.h, p1/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_s16_untied, svint16_t, + p0 = svcmplt_s16 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_w0_s16: +** mov (z[0-9]+\.h), w0 +** ( +** cmpgt p0\.h, p1/z, \1, z0\.h +** | +** cmplt p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmplt_w0_s16, svint16_t, int16_t, + p0 = svcmplt_n_s16 (p1, z0, x0), + p0 = svcmplt (p1, z0, x0)) + +/* +** cmplt_0_s16: +** cmplt p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_s16, svint16_t, + p0 = svcmplt_n_s16 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_s16: +** cmplt p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_1_s16, svint16_t, + p0 = svcmplt_n_s16 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) + +/* +** cmplt_15_s16: +** cmplt p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_15_s16, svint16_t, + p0 = svcmplt_n_s16 (p1, z0, 15), + p0 = svcmplt (p1, z0, 15)) + +/* +** cmplt_16_s16: +** mov (z[0-9]+\.h), #16 +** ( +** cmpgt p0\.h, p1/z, \1, z0\.h +** | +** cmplt p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_16_s16, 
svint16_t, + p0 = svcmplt_n_s16 (p1, z0, 16), + p0 = svcmplt (p1, z0, 16)) + +/* +** cmplt_m1_s16: +** cmplt p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmplt_m1_s16, svint16_t, + p0 = svcmplt_n_s16 (p1, z0, -1), + p0 = svcmplt (p1, z0, -1)) + +/* +** cmplt_m16_s16: +** cmplt p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmplt_m16_s16, svint16_t, + p0 = svcmplt_n_s16 (p1, z0, -16), + p0 = svcmplt (p1, z0, -16)) + +/* +** cmplt_m17_s16: +** mov (z[0-9]+\.h), #-17 +** ( +** cmpgt p0\.h, p1/z, \1, z0\.h +** | +** cmplt p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_m17_s16, svint16_t, + p0 = svcmplt_n_s16 (p1, z0, -17), + p0 = svcmplt (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s32.c new file mode 100644 index 000000000..8e2c02c4d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s32.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_s32_tied: +** ( +** cmpgt p0\.s, p0/z, z1\.s, z0\.s +** | +** cmplt p0\.s, p0/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_s32_tied, svint32_t, + p0 = svcmplt_s32 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_s32_untied: +** ( +** cmpgt p0\.s, p1/z, z1\.s, z0\.s +** | +** cmplt p0\.s, p1/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_s32_untied, svint32_t, + p0 = svcmplt_s32 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_w0_s32: +** mov (z[0-9]+\.s), w0 +** ( +** cmpgt p0\.s, p1/z, \1, z0\.s +** | +** cmplt p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmplt_w0_s32, svint32_t, int32_t, + p0 = svcmplt_n_s32 (p1, z0, x0), + p0 = svcmplt (p1, z0, x0)) + +/* +** cmplt_0_s32: +** cmplt p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_s32, svint32_t, + p0 = svcmplt_n_s32 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_s32: +** cmplt p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_1_s32, svint32_t, + p0 = svcmplt_n_s32 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) + +/* +** cmplt_15_s32: +** cmplt p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_15_s32, svint32_t, + p0 = svcmplt_n_s32 (p1, z0, 15), + p0 = svcmplt (p1, z0, 15)) + +/* +** cmplt_16_s32: +** mov (z[0-9]+\.s), #16 +** ( +** cmpgt p0\.s, p1/z, \1, z0\.s +** | +** cmplt p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_16_s32, svint32_t, + p0 = svcmplt_n_s32 (p1, z0, 16), + p0 = svcmplt (p1, z0, 16)) + +/* +** cmplt_m1_s32: +** cmplt p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmplt_m1_s32, svint32_t, + p0 = svcmplt_n_s32 (p1, z0, -1), + p0 = svcmplt (p1, z0, -1)) + +/* +** cmplt_m16_s32: +** cmplt p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmplt_m16_s32, svint32_t, + p0 = svcmplt_n_s32 (p1, z0, -16), + p0 = svcmplt (p1, z0, -16)) + +/* +** cmplt_m17_s32: +** mov (z[0-9]+\.s), #-17 +** ( +** cmpgt p0\.s, p1/z, \1, z0\.s +** | +** cmplt p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_m17_s32, svint32_t, + p0 = svcmplt_n_s32 (p1, z0, -17), + p0 = svcmplt (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s64.c new file mode 100644 index 000000000..818c9fba9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s64.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + 
+/* +** cmplt_s64_tied: +** ( +** cmpgt p0\.d, p0/z, z1\.d, z0\.d +** | +** cmplt p0\.d, p0/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_s64_tied, svint64_t, + p0 = svcmplt_s64 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_s64_untied: +** ( +** cmpgt p0\.d, p1/z, z1\.d, z0\.d +** | +** cmplt p0\.d, p1/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_s64_untied, svint64_t, + p0 = svcmplt_s64 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_x0_s64: +** mov (z[0-9]+\.d), x0 +** ( +** cmpgt p0\.d, p1/z, \1, z0\.d +** | +** cmplt p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmplt_x0_s64, svint64_t, int64_t, + p0 = svcmplt_n_s64 (p1, z0, x0), + p0 = svcmplt (p1, z0, x0)) + +/* +** cmplt_0_s64: +** cmplt p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_s64, svint64_t, + p0 = svcmplt_n_s64 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_s64: +** cmplt p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_1_s64, svint64_t, + p0 = svcmplt_n_s64 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) + +/* +** cmplt_15_s64: +** cmplt p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_15_s64, svint64_t, + p0 = svcmplt_n_s64 (p1, z0, 15), + p0 = svcmplt (p1, z0, 15)) + +/* +** cmplt_16_s64: +** mov (z[0-9]+\.d), #16 +** ( +** cmpgt p0\.d, p1/z, \1, z0\.d +** | +** cmplt p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_16_s64, svint64_t, + p0 = svcmplt_n_s64 (p1, z0, 16), + p0 = svcmplt (p1, z0, 16)) + +/* +** cmplt_m1_s64: +** cmplt p0\.d, p1/z, z0\.d, #-1 +** ret +*/ +TEST_COMPARE_Z (cmplt_m1_s64, svint64_t, + p0 = svcmplt_n_s64 (p1, z0, -1), + p0 = svcmplt (p1, z0, -1)) + +/* +** cmplt_m16_s64: +** cmplt p0\.d, p1/z, z0\.d, #-16 +** ret +*/ +TEST_COMPARE_Z (cmplt_m16_s64, svint64_t, + p0 = svcmplt_n_s64 (p1, z0, -16), + p0 = svcmplt (p1, z0, -16)) + +/* +** cmplt_m17_s64: +** mov (z[0-9]+\.d), #-17 +** ( +** cmpgt p0\.d, p1/z, \1, z0\.d +** | +** cmplt p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_m17_s64, svint64_t, + p0 = svcmplt_n_s64 (p1, z0, -17), + p0 = svcmplt (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s8.c new file mode 100644 index 000000000..54b8dc408 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_s8.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_s8_tied: +** ( +** cmpgt p0\.b, p0/z, z1\.b, z0\.b +** | +** cmplt p0\.b, p0/z, z0\.b, z1\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_s8_tied, svint8_t, + p0 = svcmplt_s8 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_s8_untied: +** ( +** cmpgt p0\.b, p1/z, z1\.b, z0\.b +** | +** cmplt p0\.b, p1/z, z0\.b, z1\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_s8_untied, svint8_t, + p0 = svcmplt_s8 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_w0_s8: +** mov (z[0-9]+\.b), w0 +** ( +** cmpgt p0\.b, p1/z, \1, z0\.b +** | +** cmplt p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmplt_w0_s8, svint8_t, int8_t, + p0 = svcmplt_n_s8 (p1, z0, x0), + p0 = svcmplt (p1, z0, x0)) + +/* +** cmplt_0_s8: +** cmplt p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_s8, svint8_t, + p0 = svcmplt_n_s8 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_s8: +** cmplt p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_1_s8, svint8_t, + p0 = svcmplt_n_s8 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) + +/* +** 
cmplt_15_s8: +** cmplt p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_15_s8, svint8_t, + p0 = svcmplt_n_s8 (p1, z0, 15), + p0 = svcmplt (p1, z0, 15)) + +/* +** cmplt_16_s8: +** mov (z[0-9]+\.b), #16 +** ( +** cmpgt p0\.b, p1/z, \1, z0\.b +** | +** cmplt p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_16_s8, svint8_t, + p0 = svcmplt_n_s8 (p1, z0, 16), + p0 = svcmplt (p1, z0, 16)) + +/* +** cmplt_m1_s8: +** cmplt p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmplt_m1_s8, svint8_t, + p0 = svcmplt_n_s8 (p1, z0, -1), + p0 = svcmplt (p1, z0, -1)) + +/* +** cmplt_m16_s8: +** cmplt p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmplt_m16_s8, svint8_t, + p0 = svcmplt_n_s8 (p1, z0, -16), + p0 = svcmplt (p1, z0, -16)) + +/* +** cmplt_m17_s8: +** mov (z[0-9]+\.b), #-17 +** ( +** cmpgt p0\.b, p1/z, \1, z0\.b +** | +** cmplt p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_m17_s8, svint8_t, + p0 = svcmplt_n_s8 (p1, z0, -17), + p0 = svcmplt (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u16.c new file mode 100644 index 000000000..c0f2a0550 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u16.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_u16_tied: +** ( +** cmphi p0\.h, p0/z, z1\.h, z0\.h +** | +** cmplo p0\.h, p0/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_u16_tied, svuint16_t, + p0 = svcmplt_u16 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_u16_untied: +** ( +** cmphi p0\.h, p1/z, z1\.h, z0\.h +** | +** cmplo p0\.h, p1/z, z0\.h, z1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_u16_untied, svuint16_t, + p0 = svcmplt_u16 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_w0_u16: +** mov (z[0-9]+\.h), w0 +** ( +** cmphi p0\.h, p1/z, \1, z0\.h +** | +** cmplo p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmplt_w0_u16, svuint16_t, uint16_t, + p0 = svcmplt_n_u16 (p1, z0, x0), + p0 = svcmplt (p1, z0, x0)) + +/* +** cmplt_0_u16: +** cmplo p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_u16, svuint16_t, + p0 = svcmplt_n_u16 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_u16: +** cmplo p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_1_u16, svuint16_t, + p0 = svcmplt_n_u16 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) + +/* +** cmplt_15_u16: +** cmplo p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_15_u16, svuint16_t, + p0 = svcmplt_n_u16 (p1, z0, 15), + p0 = svcmplt (p1, z0, 15)) + +/* +** cmplt_16_u16: +** cmplo p0\.h, p1/z, z0\.h, #16 +** ret +*/ +TEST_COMPARE_Z (cmplt_16_u16, svuint16_t, + p0 = svcmplt_n_u16 (p1, z0, 16), + p0 = svcmplt (p1, z0, 16)) + +/* +** cmplt_127_u16: +** cmplo p0\.h, p1/z, z0\.h, #127 +** ret +*/ +TEST_COMPARE_Z (cmplt_127_u16, svuint16_t, + p0 = svcmplt_n_u16 (p1, z0, 127), + p0 = svcmplt (p1, z0, 127)) + +/* +** cmplt_128_u16: +** mov (z[0-9]+\.h), #128 +** ( +** cmphi p0\.h, p1/z, \1, z0\.h +** | +** cmplo p0\.h, p1/z, z0\.h, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_128_u16, svuint16_t, + p0 = svcmplt_n_u16 (p1, z0, 128), + p0 = svcmplt (p1, z0, 128)) + +/* +** cmplt_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphi p0\.h, p1/z, \1\.h, z0\.h +** | +** cmplo p0\.h, p1/z, z0\.h, \1\.h +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_m1_u16, svuint16_t, + p0 = svcmplt_n_u16 (p1, z0, -1), + p0 = svcmplt (p1, z0, -1)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u32.c new file mode 100644 index 000000000..3bb0b1464 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u32.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_u32_tied: +** ( +** cmphi p0\.s, p0/z, z1\.s, z0\.s +** | +** cmplo p0\.s, p0/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_u32_tied, svuint32_t, + p0 = svcmplt_u32 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_u32_untied: +** ( +** cmphi p0\.s, p1/z, z1\.s, z0\.s +** | +** cmplo p0\.s, p1/z, z0\.s, z1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_u32_untied, svuint32_t, + p0 = svcmplt_u32 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_w0_u32: +** mov (z[0-9]+\.s), w0 +** ( +** cmphi p0\.s, p1/z, \1, z0\.s +** | +** cmplo p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmplt_w0_u32, svuint32_t, uint32_t, + p0 = svcmplt_n_u32 (p1, z0, x0), + p0 = svcmplt (p1, z0, x0)) + +/* +** cmplt_0_u32: +** cmplo p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_u32, svuint32_t, + p0 = svcmplt_n_u32 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_u32: +** cmplo p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_1_u32, svuint32_t, + p0 = svcmplt_n_u32 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) + +/* +** cmplt_15_u32: +** cmplo p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_15_u32, svuint32_t, + p0 = svcmplt_n_u32 (p1, z0, 15), + p0 = svcmplt (p1, z0, 15)) + +/* +** cmplt_16_u32: +** cmplo p0\.s, p1/z, z0\.s, #16 +** ret +*/ +TEST_COMPARE_Z (cmplt_16_u32, svuint32_t, + p0 = svcmplt_n_u32 (p1, z0, 16), + p0 = svcmplt (p1, z0, 16)) + +/* +** cmplt_127_u32: +** cmplo p0\.s, p1/z, z0\.s, #127 +** ret +*/ +TEST_COMPARE_Z (cmplt_127_u32, svuint32_t, + p0 = svcmplt_n_u32 (p1, z0, 127), + p0 = svcmplt (p1, z0, 127)) + +/* +** cmplt_128_u32: +** mov (z[0-9]+\.s), #128 +** ( +** cmphi p0\.s, p1/z, \1, z0\.s +** | +** cmplo p0\.s, p1/z, z0\.s, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_128_u32, svuint32_t, + p0 = svcmplt_n_u32 (p1, z0, 128), + p0 = svcmplt (p1, z0, 128)) + +/* +** cmplt_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphi p0\.s, p1/z, \1\.s, z0\.s +** | +** cmplo p0\.s, p1/z, z0\.s, \1\.s +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_m1_u32, svuint32_t, + p0 = svcmplt_n_u32 (p1, z0, -1), + p0 = svcmplt (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u64.c new file mode 100644 index 000000000..d9de5add2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u64.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_u64_tied: +** ( +** cmphi p0\.d, p0/z, z1\.d, z0\.d +** | +** cmplo p0\.d, p0/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_u64_tied, svuint64_t, + p0 = svcmplt_u64 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_u64_untied: +** ( +** cmphi p0\.d, p1/z, z1\.d, z0\.d +** | +** cmplo p0\.d, p1/z, z0\.d, z1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_u64_untied, svuint64_t, + p0 = svcmplt_u64 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_x0_u64: +** mov (z[0-9]+\.d), x0 +** ( +** cmphi p0\.d, p1/z, \1, z0\.d +** | +** cmplo p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmplt_x0_u64, svuint64_t, uint64_t, + p0 = 
svcmplt_n_u64 (p1, z0, x0), + p0 = svcmplt (p1, z0, x0)) + +/* +** cmplt_0_u64: +** cmplo p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_u64, svuint64_t, + p0 = svcmplt_n_u64 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_u64: +** cmplo p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_1_u64, svuint64_t, + p0 = svcmplt_n_u64 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) + +/* +** cmplt_15_u64: +** cmplo p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_15_u64, svuint64_t, + p0 = svcmplt_n_u64 (p1, z0, 15), + p0 = svcmplt (p1, z0, 15)) + +/* +** cmplt_16_u64: +** cmplo p0\.d, p1/z, z0\.d, #16 +** ret +*/ +TEST_COMPARE_Z (cmplt_16_u64, svuint64_t, + p0 = svcmplt_n_u64 (p1, z0, 16), + p0 = svcmplt (p1, z0, 16)) + +/* +** cmplt_127_u64: +** cmplo p0\.d, p1/z, z0\.d, #127 +** ret +*/ +TEST_COMPARE_Z (cmplt_127_u64, svuint64_t, + p0 = svcmplt_n_u64 (p1, z0, 127), + p0 = svcmplt (p1, z0, 127)) + +/* +** cmplt_128_u64: +** mov (z[0-9]+\.d), #128 +** ( +** cmphi p0\.d, p1/z, \1, z0\.d +** | +** cmplo p0\.d, p1/z, z0\.d, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_128_u64, svuint64_t, + p0 = svcmplt_n_u64 (p1, z0, 128), + p0 = svcmplt (p1, z0, 128)) + +/* +** cmplt_m1_u64: +** mov (z[0-9]+)\.b, #-1 +** ( +** cmphi p0\.d, p1/z, \1\.d, z0\.d +** | +** cmplo p0\.d, p1/z, z0\.d, \1\.d +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_m1_u64, svuint64_t, + p0 = svcmplt_n_u64 (p1, z0, -1), + p0 = svcmplt (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u8.c new file mode 100644 index 000000000..42d5ad868 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_u8.c @@ -0,0 +1,116 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_u8_tied: +** ( +** cmphi p0\.b, p0/z, z1\.b, z0\.b +** | +** cmplo p0\.b, p0/z, z0\.b, z1\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_u8_tied, svuint8_t, + p0 = svcmplt_u8 (p0, z0, z1), + p0 = svcmplt (p0, z0, z1)) + +/* +** cmplt_u8_untied: +** ( +** cmphi p0\.b, p1/z, z1\.b, z0\.b +** | +** cmplo p0\.b, p1/z, z0\.b, z1\.b +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_u8_untied, svuint8_t, + p0 = svcmplt_u8 (p1, z0, z1), + p0 = svcmplt (p1, z0, z1)) + +/* +** cmplt_w0_u8: +** mov (z[0-9]+\.b), w0 +** ( +** cmphi p0\.b, p1/z, \1, z0\.b +** | +** cmplo p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_ZX (cmplt_w0_u8, svuint8_t, uint8_t, + p0 = svcmplt_n_u8 (p1, z0, x0), + p0 = svcmplt (p1, z0, x0)) + +/* +** cmplt_0_u8: +** cmplo p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_0_u8, svuint8_t, + p0 = svcmplt_n_u8 (p1, z0, 0), + p0 = svcmplt (p1, z0, 0)) + +/* +** cmplt_1_u8: +** cmplo p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_1_u8, svuint8_t, + p0 = svcmplt_n_u8 (p1, z0, 1), + p0 = svcmplt (p1, z0, 1)) + +/* +** cmplt_15_u8: +** cmplo p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_15_u8, svuint8_t, + p0 = svcmplt_n_u8 (p1, z0, 15), + p0 = svcmplt (p1, z0, 15)) + +/* +** cmplt_16_u8: +** cmplo p0\.b, p1/z, z0\.b, #16 +** ret +*/ +TEST_COMPARE_Z (cmplt_16_u8, svuint8_t, + p0 = svcmplt_n_u8 (p1, z0, 16), + p0 = svcmplt (p1, z0, 16)) + +/* +** cmplt_127_u8: +** cmplo p0\.b, p1/z, z0\.b, #127 +** ret +*/ +TEST_COMPARE_Z (cmplt_127_u8, svuint8_t, + p0 = svcmplt_n_u8 (p1, z0, 127), + p0 = svcmplt (p1, z0, 127)) + +/* +** cmplt_128_u8: +** mov (z[0-9]+\.b), #-128 +** ( +** cmphi p0\.b, p1/z, \1, z0\.b +** | +** cmplo p0\.b, p1/z, z0\.b, \1 +** ) +** ret 
+*/ +TEST_COMPARE_Z (cmplt_128_u8, svuint8_t, + p0 = svcmplt_n_u8 (p1, z0, 128), + p0 = svcmplt (p1, z0, 128)) + +/* +** cmplt_m1_u8: +** mov (z[0-9]+\.b), #-1 +** ( +** cmphi p0\.b, p1/z, \1, z0\.b +** | +** cmplo p0\.b, p1/z, z0\.b, \1 +** ) +** ret +*/ +TEST_COMPARE_Z (cmplt_m1_u8, svuint8_t, + p0 = svcmplt_n_u8 (p1, z0, -1), + p0 = svcmplt (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s16.c new file mode 100644 index 000000000..a3c8942ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_wide_s16_tied: +** cmplt p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_s16_tied, svint16_t, svint64_t, + p0 = svcmplt_wide_s16 (p0, z0, z1), + p0 = svcmplt_wide (p0, z0, z1)) + +/* +** cmplt_wide_s16_untied: +** cmplt p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_s16_untied, svint16_t, svint64_t, + p0 = svcmplt_wide_s16 (p1, z0, z1), + p0 = svcmplt_wide (p1, z0, z1)) + +/* +** cmplt_wide_x0_s16: +** mov (z[0-9]+\.d), x0 +** cmplt p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmplt_wide_x0_s16, svint16_t, int64_t, + p0 = svcmplt_wide_n_s16 (p1, z0, x0), + p0 = svcmplt_wide (p1, z0, x0)) + +/* +** cmplt_wide_0_s16: +** cmplt p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_0_s16, svint16_t, + p0 = svcmplt_wide_n_s16 (p1, z0, 0), + p0 = svcmplt_wide (p1, z0, 0)) + +/* +** cmplt_wide_1_s16: +** cmplt p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_1_s16, svint16_t, + p0 = svcmplt_wide_n_s16 (p1, z0, 1), + p0 = svcmplt_wide (p1, z0, 1)) + +/* +** cmplt_wide_15_s16: +** cmplt p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_15_s16, svint16_t, + p0 = svcmplt_wide_n_s16 (p1, z0, 15), + p0 = svcmplt_wide (p1, z0, 15)) + +/* +** cmplt_wide_16_s16: +** mov (z[0-9]+\.d), #16 +** cmplt p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_16_s16, svint16_t, + p0 = svcmplt_wide_n_s16 (p1, z0, 16), + p0 = svcmplt_wide (p1, z0, 16)) + +/* +** cmplt_wide_m1_s16: +** cmplt p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m1_s16, svint16_t, + p0 = svcmplt_wide_n_s16 (p1, z0, -1), + p0 = svcmplt_wide (p1, z0, -1)) + +/* +** cmplt_wide_m16_s16: +** cmplt p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m16_s16, svint16_t, + p0 = svcmplt_wide_n_s16 (p1, z0, -16), + p0 = svcmplt_wide (p1, z0, -16)) + +/* +** cmplt_wide_m17_s16: +** mov (z[0-9]+\.d), #-17 +** cmplt p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m17_s16, svint16_t, + p0 = svcmplt_wide_n_s16 (p1, z0, -17), + p0 = svcmplt_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s32.c new file mode 100644 index 000000000..b2cad6773 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_wide_s32_tied: +** cmplt p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_s32_tied, svint32_t, svint64_t, + p0 = svcmplt_wide_s32 (p0, z0, z1), + p0 = svcmplt_wide (p0, z0, z1)) + +/* +** cmplt_wide_s32_untied: +** cmplt p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z 
(cmplt_wide_s32_untied, svint32_t, svint64_t, + p0 = svcmplt_wide_s32 (p1, z0, z1), + p0 = svcmplt_wide (p1, z0, z1)) + +/* +** cmplt_wide_x0_s32: +** mov (z[0-9]+\.d), x0 +** cmplt p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmplt_wide_x0_s32, svint32_t, int64_t, + p0 = svcmplt_wide_n_s32 (p1, z0, x0), + p0 = svcmplt_wide (p1, z0, x0)) + +/* +** cmplt_wide_0_s32: +** cmplt p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_0_s32, svint32_t, + p0 = svcmplt_wide_n_s32 (p1, z0, 0), + p0 = svcmplt_wide (p1, z0, 0)) + +/* +** cmplt_wide_1_s32: +** cmplt p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_1_s32, svint32_t, + p0 = svcmplt_wide_n_s32 (p1, z0, 1), + p0 = svcmplt_wide (p1, z0, 1)) + +/* +** cmplt_wide_15_s32: +** cmplt p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_15_s32, svint32_t, + p0 = svcmplt_wide_n_s32 (p1, z0, 15), + p0 = svcmplt_wide (p1, z0, 15)) + +/* +** cmplt_wide_16_s32: +** mov (z[0-9]+\.d), #16 +** cmplt p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_16_s32, svint32_t, + p0 = svcmplt_wide_n_s32 (p1, z0, 16), + p0 = svcmplt_wide (p1, z0, 16)) + +/* +** cmplt_wide_m1_s32: +** cmplt p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m1_s32, svint32_t, + p0 = svcmplt_wide_n_s32 (p1, z0, -1), + p0 = svcmplt_wide (p1, z0, -1)) + +/* +** cmplt_wide_m16_s32: +** cmplt p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m16_s32, svint32_t, + p0 = svcmplt_wide_n_s32 (p1, z0, -16), + p0 = svcmplt_wide (p1, z0, -16)) + +/* +** cmplt_wide_m17_s32: +** mov (z[0-9]+\.d), #-17 +** cmplt p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m17_s32, svint32_t, + p0 = svcmplt_wide_n_s32 (p1, z0, -17), + p0 = svcmplt_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s8.c new file mode 100644 index 000000000..1015fe309 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_s8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_wide_s8_tied: +** cmplt p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_s8_tied, svint8_t, svint64_t, + p0 = svcmplt_wide_s8 (p0, z0, z1), + p0 = svcmplt_wide (p0, z0, z1)) + +/* +** cmplt_wide_s8_untied: +** cmplt p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_s8_untied, svint8_t, svint64_t, + p0 = svcmplt_wide_s8 (p1, z0, z1), + p0 = svcmplt_wide (p1, z0, z1)) + +/* +** cmplt_wide_x0_s8: +** mov (z[0-9]+\.d), x0 +** cmplt p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmplt_wide_x0_s8, svint8_t, int64_t, + p0 = svcmplt_wide_n_s8 (p1, z0, x0), + p0 = svcmplt_wide (p1, z0, x0)) + +/* +** cmplt_wide_0_s8: +** cmplt p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_0_s8, svint8_t, + p0 = svcmplt_wide_n_s8 (p1, z0, 0), + p0 = svcmplt_wide (p1, z0, 0)) + +/* +** cmplt_wide_1_s8: +** cmplt p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_1_s8, svint8_t, + p0 = svcmplt_wide_n_s8 (p1, z0, 1), + p0 = svcmplt_wide (p1, z0, 1)) + +/* +** cmplt_wide_15_s8: +** cmplt p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_15_s8, svint8_t, + p0 = svcmplt_wide_n_s8 (p1, z0, 15), + p0 = svcmplt_wide (p1, z0, 15)) + +/* +** cmplt_wide_16_s8: +** mov (z[0-9]+\.d), #16 +** cmplt p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_16_s8, svint8_t, + p0 = svcmplt_wide_n_s8 (p1, z0, 
16), + p0 = svcmplt_wide (p1, z0, 16)) + +/* +** cmplt_wide_m1_s8: +** cmplt p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m1_s8, svint8_t, + p0 = svcmplt_wide_n_s8 (p1, z0, -1), + p0 = svcmplt_wide (p1, z0, -1)) + +/* +** cmplt_wide_m16_s8: +** cmplt p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m16_s8, svint8_t, + p0 = svcmplt_wide_n_s8 (p1, z0, -16), + p0 = svcmplt_wide (p1, z0, -16)) + +/* +** cmplt_wide_m17_s8: +** mov (z[0-9]+\.d), #-17 +** cmplt p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m17_s8, svint8_t, + p0 = svcmplt_wide_n_s8 (p1, z0, -17), + p0 = svcmplt_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u16.c new file mode 100644 index 000000000..851400d36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_wide_u16_tied: +** cmplo p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_u16_tied, svuint16_t, svuint64_t, + p0 = svcmplt_wide_u16 (p0, z0, z1), + p0 = svcmplt_wide (p0, z0, z1)) + +/* +** cmplt_wide_u16_untied: +** cmplo p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_u16_untied, svuint16_t, svuint64_t, + p0 = svcmplt_wide_u16 (p1, z0, z1), + p0 = svcmplt_wide (p1, z0, z1)) + +/* +** cmplt_wide_x0_u16: +** mov (z[0-9]+\.d), x0 +** cmplo p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmplt_wide_x0_u16, svuint16_t, uint64_t, + p0 = svcmplt_wide_n_u16 (p1, z0, x0), + p0 = svcmplt_wide (p1, z0, x0)) + +/* +** cmplt_wide_0_u16: +** cmplo p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_0_u16, svuint16_t, + p0 = svcmplt_wide_n_u16 (p1, z0, 0), + p0 = svcmplt_wide (p1, z0, 0)) + +/* +** cmplt_wide_1_u16: +** cmplo p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_1_u16, svuint16_t, + p0 = svcmplt_wide_n_u16 (p1, z0, 1), + p0 = svcmplt_wide (p1, z0, 1)) + +/* +** cmplt_wide_15_u16: +** cmplo p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_15_u16, svuint16_t, + p0 = svcmplt_wide_n_u16 (p1, z0, 15), + p0 = svcmplt_wide (p1, z0, 15)) + +/* +** cmplt_wide_16_u16: +** cmplo p0\.h, p1/z, z0\.h, #16 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_16_u16, svuint16_t, + p0 = svcmplt_wide_n_u16 (p1, z0, 16), + p0 = svcmplt_wide (p1, z0, 16)) + +/* +** cmplt_wide_127_u16: +** cmplo p0\.h, p1/z, z0\.h, #127 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_127_u16, svuint16_t, + p0 = svcmplt_wide_n_u16 (p1, z0, 127), + p0 = svcmplt_wide (p1, z0, 127)) + +/* +** cmplt_wide_128_u16: +** mov (z[0-9]+\.d), #128 +** cmplo p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_128_u16, svuint16_t, + p0 = svcmplt_wide_n_u16 (p1, z0, 128), + p0 = svcmplt_wide (p1, z0, 128)) + +/* +** cmplt_wide_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** cmplo p0\.h, p1/z, z0\.h, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m1_u16, svuint16_t, + p0 = svcmplt_wide_n_u16 (p1, z0, -1), + p0 = svcmplt_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u32.c new file mode 100644 index 000000000..1f9652def --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** 
cmplt_wide_u32_tied: +** cmplo p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_u32_tied, svuint32_t, svuint64_t, + p0 = svcmplt_wide_u32 (p0, z0, z1), + p0 = svcmplt_wide (p0, z0, z1)) + +/* +** cmplt_wide_u32_untied: +** cmplo p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_u32_untied, svuint32_t, svuint64_t, + p0 = svcmplt_wide_u32 (p1, z0, z1), + p0 = svcmplt_wide (p1, z0, z1)) + +/* +** cmplt_wide_x0_u32: +** mov (z[0-9]+\.d), x0 +** cmplo p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmplt_wide_x0_u32, svuint32_t, uint64_t, + p0 = svcmplt_wide_n_u32 (p1, z0, x0), + p0 = svcmplt_wide (p1, z0, x0)) + +/* +** cmplt_wide_0_u32: +** cmplo p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_0_u32, svuint32_t, + p0 = svcmplt_wide_n_u32 (p1, z0, 0), + p0 = svcmplt_wide (p1, z0, 0)) + +/* +** cmplt_wide_1_u32: +** cmplo p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_1_u32, svuint32_t, + p0 = svcmplt_wide_n_u32 (p1, z0, 1), + p0 = svcmplt_wide (p1, z0, 1)) + +/* +** cmplt_wide_15_u32: +** cmplo p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_15_u32, svuint32_t, + p0 = svcmplt_wide_n_u32 (p1, z0, 15), + p0 = svcmplt_wide (p1, z0, 15)) + +/* +** cmplt_wide_16_u32: +** cmplo p0\.s, p1/z, z0\.s, #16 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_16_u32, svuint32_t, + p0 = svcmplt_wide_n_u32 (p1, z0, 16), + p0 = svcmplt_wide (p1, z0, 16)) + +/* +** cmplt_wide_127_u32: +** cmplo p0\.s, p1/z, z0\.s, #127 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_127_u32, svuint32_t, + p0 = svcmplt_wide_n_u32 (p1, z0, 127), + p0 = svcmplt_wide (p1, z0, 127)) + +/* +** cmplt_wide_128_u32: +** mov (z[0-9]+\.d), #128 +** cmplo p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_128_u32, svuint32_t, + p0 = svcmplt_wide_n_u32 (p1, z0, 128), + p0 = svcmplt_wide (p1, z0, 128)) + +/* +** cmplt_wide_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** cmplo p0\.s, p1/z, z0\.s, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m1_u32, svuint32_t, + p0 = svcmplt_wide_n_u32 (p1, z0, -1), + p0 = svcmplt_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u8.c new file mode 100644 index 000000000..95ef3cf16 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmplt_wide_u8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmplt_wide_u8_tied: +** cmplo p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_u8_tied, svuint8_t, svuint64_t, + p0 = svcmplt_wide_u8 (p0, z0, z1), + p0 = svcmplt_wide (p0, z0, z1)) + +/* +** cmplt_wide_u8_untied: +** cmplo p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmplt_wide_u8_untied, svuint8_t, svuint64_t, + p0 = svcmplt_wide_u8 (p1, z0, z1), + p0 = svcmplt_wide (p1, z0, z1)) + +/* +** cmplt_wide_x0_u8: +** mov (z[0-9]+\.d), x0 +** cmplo p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmplt_wide_x0_u8, svuint8_t, uint64_t, + p0 = svcmplt_wide_n_u8 (p1, z0, x0), + p0 = svcmplt_wide (p1, z0, x0)) + +/* +** cmplt_wide_0_u8: +** cmplo p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_0_u8, svuint8_t, + p0 = svcmplt_wide_n_u8 (p1, z0, 0), + p0 = svcmplt_wide (p1, z0, 0)) + +/* +** cmplt_wide_1_u8: +** cmplo p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_1_u8, svuint8_t, + p0 = svcmplt_wide_n_u8 (p1, z0, 1), + p0 = svcmplt_wide (p1, z0, 1)) + +/* +** cmplt_wide_15_u8: +** cmplo 
p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_15_u8, svuint8_t, + p0 = svcmplt_wide_n_u8 (p1, z0, 15), + p0 = svcmplt_wide (p1, z0, 15)) + +/* +** cmplt_wide_16_u8: +** cmplo p0\.b, p1/z, z0\.b, #16 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_16_u8, svuint8_t, + p0 = svcmplt_wide_n_u8 (p1, z0, 16), + p0 = svcmplt_wide (p1, z0, 16)) + +/* +** cmplt_wide_127_u8: +** cmplo p0\.b, p1/z, z0\.b, #127 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_127_u8, svuint8_t, + p0 = svcmplt_wide_n_u8 (p1, z0, 127), + p0 = svcmplt_wide (p1, z0, 127)) + +/* +** cmplt_wide_128_u8: +** mov (z[0-9]+\.d), #128 +** cmplo p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_128_u8, svuint8_t, + p0 = svcmplt_wide_n_u8 (p1, z0, 128), + p0 = svcmplt_wide (p1, z0, 128)) + +/* +** cmplt_wide_m1_u8: +** mov (z[0-9]+)\.b, #-1 +** cmplo p0\.b, p1/z, z0\.b, \1\.d +** ret +*/ +TEST_COMPARE_Z (cmplt_wide_m1_u8, svuint8_t, + p0 = svcmplt_wide_n_u8 (p1, z0, -1), + p0 = svcmplt_wide (p1, z0, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f16.c new file mode 100644 index 000000000..63e203b09 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f16.c @@ -0,0 +1,50 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_f16_tied: +** fcmne p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_f16_tied, svfloat16_t, + p0 = svcmpne_f16 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_f16_untied: +** fcmne p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_f16_untied, svfloat16_t, + p0 = svcmpne_f16 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_h4_f16: +** mov (z[0-9]+\.h), h4 +** fcmne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_ZD (cmpne_h4_f16, svfloat16_t, float16_t, + p0 = svcmpne_n_f16 (p1, z0, d4), + p0 = svcmpne (p1, z0, d4)) + +/* +** cmpne_0_f16: +** fcmne p0\.h, p1/z, z0\.h, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_f16, svfloat16_t, + p0 = svcmpne_n_f16 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** fcmne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_1_f16, svfloat16_t, + p0 = svcmpne_n_f16 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f32.c new file mode 100644 index 000000000..f81e2da51 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f32.c @@ -0,0 +1,50 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_f32_tied: +** fcmne p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_f32_tied, svfloat32_t, + p0 = svcmpne_f32 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_f32_untied: +** fcmne p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_f32_untied, svfloat32_t, + p0 = svcmpne_f32 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_s4_f32: +** mov (z[0-9]+\.s), s4 +** fcmne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_ZD (cmpne_s4_f32, svfloat32_t, float32_t, + p0 = svcmpne_n_f32 (p1, z0, d4), + p0 = svcmpne (p1, z0, d4)) + +/* +** cmpne_0_f32: +** fcmne p0\.s, p1/z, z0\.s, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_f32, svfloat32_t, + p0 = svcmpne_n_f32 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fcmne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_1_f32, svfloat32_t, + p0 = svcmpne_n_f32 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f64.c new file mode 100644 index 000000000..22e4eeef4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_f64.c @@ -0,0 +1,50 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_f64_tied: +** fcmne p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_f64_tied, svfloat64_t, + p0 = svcmpne_f64 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_f64_untied: +** fcmne p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_f64_untied, svfloat64_t, + p0 = svcmpne_f64 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_d4_f64: +** mov (z[0-9]+\.d), d4 +** fcmne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_ZD (cmpne_d4_f64, svfloat64_t, float64_t, + p0 = svcmpne_n_f64 (p1, z0, d4), + p0 = svcmpne (p1, z0, d4)) + +/* +** cmpne_0_f64: +** fcmne p0\.d, p1/z, z0\.d, #0\.0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_f64, svfloat64_t, + p0 = svcmpne_n_f64 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** fcmne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_1_f64, svfloat64_t, + p0 = svcmpne_n_f64 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s16.c new file mode 100644 index 000000000..d8c743f8b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_s16_tied: +** cmpne p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_s16_tied, svint16_t, + p0 = svcmpne_s16 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_s16_untied: +** cmpne p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_s16_untied, svint16_t, + p0 = svcmpne_s16 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_w0_s16: +** mov (z[0-9]+\.h), w0 +** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_ZX (cmpne_w0_s16, svint16_t, int16_t, + p0 = svcmpne_n_s16 (p1, z0, x0), + p0 = svcmpne (p1, z0, x0)) + +/* +** cmpne_0_s16: +** cmpne p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_s16, svint16_t, + p0 = svcmpne_n_s16 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_s16: +** cmpne p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_1_s16, svint16_t, + p0 = svcmpne_n_s16 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) + +/* +** cmpne_15_s16: +** cmpne p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_15_s16, svint16_t, + p0 = svcmpne_n_s16 (p1, z0, 15), + p0 = svcmpne (p1, z0, 15)) + +/* +** cmpne_16_s16: +** mov (z[0-9]+\.h), #16 +** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_16_s16, svint16_t, + p0 = svcmpne_n_s16 (p1, z0, 16), + p0 = svcmpne (p1, z0, 16)) + +/* +** cmpne_m1_s16: +** cmpne p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_m1_s16, svint16_t, + p0 = svcmpne_n_s16 (p1, z0, -1), + p0 = svcmpne (p1, z0, -1)) + +/* +** cmpne_m16_s16: +** cmpne p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_m16_s16, svint16_t, + p0 = svcmpne_n_s16 (p1, z0, -16), + p0 = svcmpne (p1, z0, -16)) + +/* +** cmpne_m17_s16: +** mov (z[0-9]+\.h), #-17 +** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_m17_s16, svint16_t, + p0 = svcmpne_n_s16 (p1, z0, -17), + p0 = svcmpne (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s32.c new file mode 100644 index 000000000..0d3c35111 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_s32_tied: +** cmpne p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_s32_tied, svint32_t, + p0 = svcmpne_s32 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_s32_untied: +** cmpne p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_s32_untied, svint32_t, + p0 = svcmpne_s32 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_w0_s32: +** mov (z[0-9]+\.s), w0 +** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_ZX (cmpne_w0_s32, svint32_t, int32_t, + p0 = svcmpne_n_s32 (p1, z0, x0), + p0 = svcmpne (p1, z0, x0)) + +/* +** cmpne_0_s32: +** cmpne p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_s32, svint32_t, + 
p0 = svcmpne_n_s32 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_s32: +** cmpne p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_1_s32, svint32_t, + p0 = svcmpne_n_s32 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) + +/* +** cmpne_15_s32: +** cmpne p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_15_s32, svint32_t, + p0 = svcmpne_n_s32 (p1, z0, 15), + p0 = svcmpne (p1, z0, 15)) + +/* +** cmpne_16_s32: +** mov (z[0-9]+\.s), #16 +** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_16_s32, svint32_t, + p0 = svcmpne_n_s32 (p1, z0, 16), + p0 = svcmpne (p1, z0, 16)) + +/* +** cmpne_m1_s32: +** cmpne p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_m1_s32, svint32_t, + p0 = svcmpne_n_s32 (p1, z0, -1), + p0 = svcmpne (p1, z0, -1)) + +/* +** cmpne_m16_s32: +** cmpne p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_m16_s32, svint32_t, + p0 = svcmpne_n_s32 (p1, z0, -16), + p0 = svcmpne (p1, z0, -16)) + +/* +** cmpne_m17_s32: +** mov (z[0-9]+\.s), #-17 +** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_m17_s32, svint32_t, + p0 = svcmpne_n_s32 (p1, z0, -17), + p0 = svcmpne (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s64.c new file mode 100644 index 000000000..4cf78f2dd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s64.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_s64_tied: +** cmpne p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_s64_tied, svint64_t, + p0 = svcmpne_s64 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_s64_untied: +** cmpne p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_s64_untied, svint64_t, + p0 = svcmpne_s64 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_x0_s64: +** mov (z[0-9]+\.d), x0 +** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_ZX (cmpne_x0_s64, svint64_t, int64_t, + p0 = svcmpne_n_s64 (p1, z0, x0), + p0 = svcmpne (p1, z0, x0)) + +/* +** cmpne_0_s64: +** cmpne p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_s64, svint64_t, + p0 = svcmpne_n_s64 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_s64: +** cmpne p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_1_s64, svint64_t, + p0 = svcmpne_n_s64 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) + +/* +** cmpne_15_s64: +** cmpne p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_15_s64, svint64_t, + p0 = svcmpne_n_s64 (p1, z0, 15), + p0 = svcmpne (p1, z0, 15)) + +/* +** cmpne_16_s64: +** mov (z[0-9]+\.d), #16 +** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_16_s64, svint64_t, + p0 = svcmpne_n_s64 (p1, z0, 16), + p0 = svcmpne (p1, z0, 16)) + +/* +** cmpne_m1_s64: +** cmpne p0\.d, p1/z, z0\.d, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_m1_s64, svint64_t, + p0 = svcmpne_n_s64 (p1, z0, -1), + p0 = svcmpne (p1, z0, -1)) + +/* +** cmpne_m16_s64: +** cmpne p0\.d, p1/z, z0\.d, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_m16_s64, svint64_t, + p0 = svcmpne_n_s64 (p1, z0, -16), + p0 = svcmpne (p1, z0, -16)) + +/* +** cmpne_m17_s64: +** mov (z[0-9]+\.d), #-17 +** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_m17_s64, svint64_t, + p0 = svcmpne_n_s64 (p1, z0, -17), + p0 = svcmpne (p1, z0, -17)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s8.c new file mode 100644 index 000000000..6409ecdd4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_s8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_s8_tied: +** cmpne p0\.b, p0/z, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpne_s8_tied, svint8_t, + p0 = svcmpne_s8 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_s8_untied: +** cmpne p0\.b, p1/z, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpne_s8_untied, svint8_t, + p0 = svcmpne_s8 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_w0_s8: +** mov (z[0-9]+\.b), w0 +** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_ZX (cmpne_w0_s8, svint8_t, int8_t, + p0 = svcmpne_n_s8 (p1, z0, x0), + p0 = svcmpne (p1, z0, x0)) + +/* +** cmpne_0_s8: +** cmpne p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_s8, svint8_t, + p0 = svcmpne_n_s8 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_s8: +** cmpne p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_1_s8, svint8_t, + p0 = svcmpne_n_s8 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) + +/* +** cmpne_15_s8: +** cmpne p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_15_s8, svint8_t, + p0 = svcmpne_n_s8 (p1, z0, 15), + p0 = svcmpne (p1, z0, 15)) + +/* +** cmpne_16_s8: +** mov (z[0-9]+\.b), #16 +** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpne_16_s8, svint8_t, + p0 = svcmpne_n_s8 (p1, z0, 16), + p0 = svcmpne (p1, z0, 16)) + +/* +** cmpne_m1_s8: +** cmpne p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_m1_s8, svint8_t, + p0 = svcmpne_n_s8 (p1, z0, -1), + p0 = svcmpne (p1, z0, -1)) + +/* +** cmpne_m16_s8: +** cmpne p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_m16_s8, svint8_t, + p0 = svcmpne_n_s8 (p1, z0, -16), + p0 = svcmpne (p1, z0, -16)) + +/* +** cmpne_m17_s8: +** mov (z[0-9]+\.b), #-17 +** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpne_m17_s8, svint8_t, + p0 = svcmpne_n_s8 (p1, z0, -17), + p0 = svcmpne (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u16.c new file mode 100644 index 000000000..4d22bc7d3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_u16_tied: +** cmpne p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_u16_tied, svuint16_t, + p0 = svcmpne_u16 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_u16_untied: +** cmpne p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_u16_untied, svuint16_t, + p0 = svcmpne_u16 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_w0_u16: +** mov (z[0-9]+\.h), w0 +** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_ZX (cmpne_w0_u16, svuint16_t, uint16_t, + p0 = svcmpne_n_u16 (p1, z0, x0), + p0 = svcmpne (p1, z0, x0)) + +/* +** cmpne_0_u16: +** cmpne p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_u16, svuint16_t, + p0 = svcmpne_n_u16 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_u16: +** cmpne p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_1_u16, svuint16_t, + p0 = svcmpne_n_u16 (p1, z0, 1), + p0 = 
svcmpne (p1, z0, 1)) + +/* +** cmpne_15_u16: +** cmpne p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_15_u16, svuint16_t, + p0 = svcmpne_n_u16 (p1, z0, 15), + p0 = svcmpne (p1, z0, 15)) + +/* +** cmpne_16_u16: +** mov (z[0-9]+\.h), #16 +** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_16_u16, svuint16_t, + p0 = svcmpne_n_u16 (p1, z0, 16), + p0 = svcmpne (p1, z0, 16)) + +/* +** cmpne_m1_u16: +** cmpne p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_m1_u16, svuint16_t, + p0 = svcmpne_n_u16 (p1, z0, -1), + p0 = svcmpne (p1, z0, -1)) + +/* +** cmpne_m16_u16: +** cmpne p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_m16_u16, svuint16_t, + p0 = svcmpne_n_u16 (p1, z0, -16), + p0 = svcmpne (p1, z0, -16)) + +/* +** cmpne_m17_u16: +** mov (z[0-9]+\.h), #-17 +** cmpne p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpne_m17_u16, svuint16_t, + p0 = svcmpne_n_u16 (p1, z0, -17), + p0 = svcmpne (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u32.c new file mode 100644 index 000000000..b7ca94a69 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_u32_tied: +** cmpne p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_u32_tied, svuint32_t, + p0 = svcmpne_u32 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_u32_untied: +** cmpne p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_u32_untied, svuint32_t, + p0 = svcmpne_u32 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_w0_u32: +** mov (z[0-9]+\.s), w0 +** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_ZX (cmpne_w0_u32, svuint32_t, uint32_t, + p0 = svcmpne_n_u32 (p1, z0, x0), + p0 = svcmpne (p1, z0, x0)) + +/* +** cmpne_0_u32: +** cmpne p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_u32, svuint32_t, + p0 = svcmpne_n_u32 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_u32: +** cmpne p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_1_u32, svuint32_t, + p0 = svcmpne_n_u32 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) + +/* +** cmpne_15_u32: +** cmpne p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_15_u32, svuint32_t, + p0 = svcmpne_n_u32 (p1, z0, 15), + p0 = svcmpne (p1, z0, 15)) + +/* +** cmpne_16_u32: +** mov (z[0-9]+\.s), #16 +** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_16_u32, svuint32_t, + p0 = svcmpne_n_u32 (p1, z0, 16), + p0 = svcmpne (p1, z0, 16)) + +/* +** cmpne_m1_u32: +** cmpne p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_m1_u32, svuint32_t, + p0 = svcmpne_n_u32 (p1, z0, -1), + p0 = svcmpne (p1, z0, -1)) + +/* +** cmpne_m16_u32: +** cmpne p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_m16_u32, svuint32_t, + p0 = svcmpne_n_u32 (p1, z0, -16), + p0 = svcmpne (p1, z0, -16)) + +/* +** cmpne_m17_u32: +** mov (z[0-9]+\.s), #-17 +** cmpne p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpne_m17_u32, svuint32_t, + p0 = svcmpne_n_u32 (p1, z0, -17), + p0 = svcmpne (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u64.c new file mode 100644 index 000000000..960ac85b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u64.c @@ 
-0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_u64_tied: +** cmpne p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_u64_tied, svuint64_t, + p0 = svcmpne_u64 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_u64_untied: +** cmpne p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_u64_untied, svuint64_t, + p0 = svcmpne_u64 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_x0_u64: +** mov (z[0-9]+\.d), x0 +** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_ZX (cmpne_x0_u64, svuint64_t, uint64_t, + p0 = svcmpne_n_u64 (p1, z0, x0), + p0 = svcmpne (p1, z0, x0)) + +/* +** cmpne_0_u64: +** cmpne p0\.d, p1/z, z0\.d, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_u64, svuint64_t, + p0 = svcmpne_n_u64 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_u64: +** cmpne p0\.d, p1/z, z0\.d, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_1_u64, svuint64_t, + p0 = svcmpne_n_u64 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) + +/* +** cmpne_15_u64: +** cmpne p0\.d, p1/z, z0\.d, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_15_u64, svuint64_t, + p0 = svcmpne_n_u64 (p1, z0, 15), + p0 = svcmpne (p1, z0, 15)) + +/* +** cmpne_16_u64: +** mov (z[0-9]+\.d), #16 +** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_16_u64, svuint64_t, + p0 = svcmpne_n_u64 (p1, z0, 16), + p0 = svcmpne (p1, z0, 16)) + +/* +** cmpne_m1_u64: +** cmpne p0\.d, p1/z, z0\.d, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_m1_u64, svuint64_t, + p0 = svcmpne_n_u64 (p1, z0, -1), + p0 = svcmpne (p1, z0, -1)) + +/* +** cmpne_m16_u64: +** cmpne p0\.d, p1/z, z0\.d, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_m16_u64, svuint64_t, + p0 = svcmpne_n_u64 (p1, z0, -16), + p0 = svcmpne (p1, z0, -16)) + +/* +** cmpne_m17_u64: +** mov (z[0-9]+\.d), #-17 +** cmpne p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpne_m17_u64, svuint64_t, + p0 = svcmpne_n_u64 (p1, z0, -17), + p0 = svcmpne (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u8.c new file mode 100644 index 000000000..cb8496eab --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_u8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_u8_tied: +** cmpne p0\.b, p0/z, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpne_u8_tied, svuint8_t, + p0 = svcmpne_u8 (p0, z0, z1), + p0 = svcmpne (p0, z0, z1)) + +/* +** cmpne_u8_untied: +** cmpne p0\.b, p1/z, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpne_u8_untied, svuint8_t, + p0 = svcmpne_u8 (p1, z0, z1), + p0 = svcmpne (p1, z0, z1)) + +/* +** cmpne_w0_u8: +** mov (z[0-9]+\.b), w0 +** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_ZX (cmpne_w0_u8, svuint8_t, uint8_t, + p0 = svcmpne_n_u8 (p1, z0, x0), + p0 = svcmpne (p1, z0, x0)) + +/* +** cmpne_0_u8: +** cmpne p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_0_u8, svuint8_t, + p0 = svcmpne_n_u8 (p1, z0, 0), + p0 = svcmpne (p1, z0, 0)) + +/* +** cmpne_1_u8: +** cmpne p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_1_u8, svuint8_t, + p0 = svcmpne_n_u8 (p1, z0, 1), + p0 = svcmpne (p1, z0, 1)) + +/* +** cmpne_15_u8: +** cmpne p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_15_u8, svuint8_t, + p0 = svcmpne_n_u8 (p1, z0, 15), + p0 = svcmpne (p1, z0, 15)) + +/* +** cmpne_16_u8: 
+** mov (z[0-9]+\.b), #16 +** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpne_16_u8, svuint8_t, + p0 = svcmpne_n_u8 (p1, z0, 16), + p0 = svcmpne (p1, z0, 16)) + +/* +** cmpne_m1_u8: +** cmpne p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_m1_u8, svuint8_t, + p0 = svcmpne_n_u8 (p1, z0, -1), + p0 = svcmpne (p1, z0, -1)) + +/* +** cmpne_m16_u8: +** cmpne p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_m16_u8, svuint8_t, + p0 = svcmpne_n_u8 (p1, z0, -16), + p0 = svcmpne (p1, z0, -16)) + +/* +** cmpne_m17_u8: +** mov (z[0-9]+\.b), #-17 +** cmpne p0\.b, p1/z, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_COMPARE_Z (cmpne_m17_u8, svuint8_t, + p0 = svcmpne_n_u8 (p1, z0, -17), + p0 = svcmpne (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s16.c new file mode 100644 index 000000000..4cb7586c9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_wide_s16_tied: +** cmpne p0\.h, p0/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpne_wide_s16_tied, svint16_t, svint64_t, + p0 = svcmpne_wide_s16 (p0, z0, z1), + p0 = svcmpne_wide (p0, z0, z1)) + +/* +** cmpne_wide_s16_untied: +** cmpne p0\.h, p1/z, z0\.h, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpne_wide_s16_untied, svint16_t, svint64_t, + p0 = svcmpne_wide_s16 (p1, z0, z1), + p0 = svcmpne_wide (p1, z0, z1)) + +/* +** cmpne_wide_x0_s16: +** mov (z[0-9]+\.d), x0 +** cmpne p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpne_wide_x0_s16, svint16_t, int64_t, + p0 = svcmpne_wide_n_s16 (p1, z0, x0), + p0 = svcmpne_wide (p1, z0, x0)) + +/* +** cmpne_wide_0_s16: +** cmpne p0\.h, p1/z, z0\.h, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_0_s16, svint16_t, + p0 = svcmpne_wide_n_s16 (p1, z0, 0), + p0 = svcmpne_wide (p1, z0, 0)) + +/* +** cmpne_wide_1_s16: +** cmpne p0\.h, p1/z, z0\.h, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_1_s16, svint16_t, + p0 = svcmpne_wide_n_s16 (p1, z0, 1), + p0 = svcmpne_wide (p1, z0, 1)) + +/* +** cmpne_wide_15_s16: +** cmpne p0\.h, p1/z, z0\.h, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_15_s16, svint16_t, + p0 = svcmpne_wide_n_s16 (p1, z0, 15), + p0 = svcmpne_wide (p1, z0, 15)) + +/* +** cmpne_wide_16_s16: +** mov (z[0-9]+\.d), #16 +** cmpne p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_16_s16, svint16_t, + p0 = svcmpne_wide_n_s16 (p1, z0, 16), + p0 = svcmpne_wide (p1, z0, 16)) + +/* +** cmpne_wide_m1_s16: +** cmpne p0\.h, p1/z, z0\.h, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m1_s16, svint16_t, + p0 = svcmpne_wide_n_s16 (p1, z0, -1), + p0 = svcmpne_wide (p1, z0, -1)) + +/* +** cmpne_wide_m16_s16: +** cmpne p0\.h, p1/z, z0\.h, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m16_s16, svint16_t, + p0 = svcmpne_wide_n_s16 (p1, z0, -16), + p0 = svcmpne_wide (p1, z0, -16)) + +/* +** cmpne_wide_m17_s16: +** mov (z[0-9]+\.d), #-17 +** cmpne p0\.h, p1/z, z0\.h, \1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m17_s16, svint16_t, + p0 = svcmpne_wide_n_s16 (p1, z0, -17), + p0 = svcmpne_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s32.c new file mode 100644 index 000000000..633994ed3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s32.c @@ -0,0 +1,96 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_wide_s32_tied: +** cmpne p0\.s, p0/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpne_wide_s32_tied, svint32_t, svint64_t, + p0 = svcmpne_wide_s32 (p0, z0, z1), + p0 = svcmpne_wide (p0, z0, z1)) + +/* +** cmpne_wide_s32_untied: +** cmpne p0\.s, p1/z, z0\.s, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpne_wide_s32_untied, svint32_t, svint64_t, + p0 = svcmpne_wide_s32 (p1, z0, z1), + p0 = svcmpne_wide (p1, z0, z1)) + +/* +** cmpne_wide_x0_s32: +** mov (z[0-9]+\.d), x0 +** cmpne p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpne_wide_x0_s32, svint32_t, int64_t, + p0 = svcmpne_wide_n_s32 (p1, z0, x0), + p0 = svcmpne_wide (p1, z0, x0)) + +/* +** cmpne_wide_0_s32: +** cmpne p0\.s, p1/z, z0\.s, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_0_s32, svint32_t, + p0 = svcmpne_wide_n_s32 (p1, z0, 0), + p0 = svcmpne_wide (p1, z0, 0)) + +/* +** cmpne_wide_1_s32: +** cmpne p0\.s, p1/z, z0\.s, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_1_s32, svint32_t, + p0 = svcmpne_wide_n_s32 (p1, z0, 1), + p0 = svcmpne_wide (p1, z0, 1)) + +/* +** cmpne_wide_15_s32: +** cmpne p0\.s, p1/z, z0\.s, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_15_s32, svint32_t, + p0 = svcmpne_wide_n_s32 (p1, z0, 15), + p0 = svcmpne_wide (p1, z0, 15)) + +/* +** cmpne_wide_16_s32: +** mov (z[0-9]+\.d), #16 +** cmpne p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_16_s32, svint32_t, + p0 = svcmpne_wide_n_s32 (p1, z0, 16), + p0 = svcmpne_wide (p1, z0, 16)) + +/* +** cmpne_wide_m1_s32: +** cmpne p0\.s, p1/z, z0\.s, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m1_s32, svint32_t, + p0 = svcmpne_wide_n_s32 (p1, z0, -1), + p0 = svcmpne_wide (p1, z0, -1)) + +/* +** cmpne_wide_m16_s32: +** cmpne p0\.s, p1/z, z0\.s, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m16_s32, svint32_t, + p0 = svcmpne_wide_n_s32 (p1, z0, -16), + p0 = svcmpne_wide (p1, z0, -16)) + +/* +** cmpne_wide_m17_s32: +** mov (z[0-9]+\.d), #-17 +** cmpne p0\.s, p1/z, z0\.s, \1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m17_s32, svint32_t, + p0 = svcmpne_wide_n_s32 (p1, z0, -17), + p0 = svcmpne_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s8.c new file mode 100644 index 000000000..de343f4cc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpne_wide_s8.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpne_wide_s8_tied: +** cmpne p0\.b, p0/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpne_wide_s8_tied, svint8_t, svint64_t, + p0 = svcmpne_wide_s8 (p0, z0, z1), + p0 = svcmpne_wide (p0, z0, z1)) + +/* +** cmpne_wide_s8_untied: +** cmpne p0\.b, p1/z, z0\.b, z1\.d +** ret +*/ +TEST_COMPARE_DUAL_Z (cmpne_wide_s8_untied, svint8_t, svint64_t, + p0 = svcmpne_wide_s8 (p1, z0, z1), + p0 = svcmpne_wide (p1, z0, z1)) + +/* +** cmpne_wide_x0_s8: +** mov (z[0-9]+\.d), x0 +** cmpne p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_ZX (cmpne_wide_x0_s8, svint8_t, int64_t, + p0 = svcmpne_wide_n_s8 (p1, z0, x0), + p0 = svcmpne_wide (p1, z0, x0)) + +/* +** cmpne_wide_0_s8: +** cmpne p0\.b, p1/z, z0\.b, #0 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_0_s8, svint8_t, + p0 = svcmpne_wide_n_s8 (p1, z0, 0), + p0 = svcmpne_wide (p1, z0, 0)) + +/* +** cmpne_wide_1_s8: +** cmpne p0\.b, p1/z, z0\.b, #1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_1_s8, svint8_t, + p0 = svcmpne_wide_n_s8 (p1, z0, 1), + p0 = 
svcmpne_wide (p1, z0, 1)) + +/* +** cmpne_wide_15_s8: +** cmpne p0\.b, p1/z, z0\.b, #15 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_15_s8, svint8_t, + p0 = svcmpne_wide_n_s8 (p1, z0, 15), + p0 = svcmpne_wide (p1, z0, 15)) + +/* +** cmpne_wide_16_s8: +** mov (z[0-9]+\.d), #16 +** cmpne p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_16_s8, svint8_t, + p0 = svcmpne_wide_n_s8 (p1, z0, 16), + p0 = svcmpne_wide (p1, z0, 16)) + +/* +** cmpne_wide_m1_s8: +** cmpne p0\.b, p1/z, z0\.b, #-1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m1_s8, svint8_t, + p0 = svcmpne_wide_n_s8 (p1, z0, -1), + p0 = svcmpne_wide (p1, z0, -1)) + +/* +** cmpne_wide_m16_s8: +** cmpne p0\.b, p1/z, z0\.b, #-16 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m16_s8, svint8_t, + p0 = svcmpne_wide_n_s8 (p1, z0, -16), + p0 = svcmpne_wide (p1, z0, -16)) + +/* +** cmpne_wide_m17_s8: +** mov (z[0-9]+\.d), #-17 +** cmpne p0\.b, p1/z, z0\.b, \1 +** ret +*/ +TEST_COMPARE_Z (cmpne_wide_m17_s8, svint8_t, + p0 = svcmpne_wide_n_s8 (p1, z0, -17), + p0 = svcmpne_wide (p1, z0, -17)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f16.c new file mode 100644 index 000000000..8f702cdde --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f16.c @@ -0,0 +1,51 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpuo_f16_tied: +** fcmuo p0\.h, p0/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpuo_f16_tied, svfloat16_t, + p0 = svcmpuo_f16 (p0, z0, z1), + p0 = svcmpuo (p0, z0, z1)) + +/* +** cmpuo_f16_untied: +** fcmuo p0\.h, p1/z, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpuo_f16_untied, svfloat16_t, + p0 = svcmpuo_f16 (p1, z0, z1), + p0 = svcmpuo (p1, z0, z1)) + +/* +** cmpuo_h4_f16: +** mov (z[0-9]+\.h), h4 +** fcmuo p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_ZD (cmpuo_h4_f16, svfloat16_t, float16_t, + p0 = svcmpuo_n_f16 (p1, z0, d4), + p0 = svcmpuo (p1, z0, d4)) + +/* +** cmpuo_0_f16: +** mov (z[0-9]+\.h), #0 +** fcmuo p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpuo_0_f16, svfloat16_t, + p0 = svcmpuo_n_f16 (p1, z0, 0), + p0 = svcmpuo (p1, z0, 0)) + +/* +** cmpuo_1_f16: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** fcmuo p0\.h, p1/z, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_COMPARE_Z (cmpuo_1_f16, svfloat16_t, + p0 = svcmpuo_n_f16 (p1, z0, 1), + p0 = svcmpuo (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f32.c new file mode 100644 index 000000000..8827604aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f32.c @@ -0,0 +1,51 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpuo_f32_tied: +** fcmuo p0\.s, p0/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpuo_f32_tied, svfloat32_t, + p0 = svcmpuo_f32 (p0, z0, z1), + p0 = svcmpuo (p0, z0, z1)) + +/* +** cmpuo_f32_untied: +** fcmuo p0\.s, p1/z, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpuo_f32_untied, svfloat32_t, + p0 = svcmpuo_f32 (p1, z0, z1), + p0 = svcmpuo (p1, z0, z1)) + +/* +** cmpuo_s4_f32: +** mov (z[0-9]+\.s), s4 +** fcmuo p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_ZD (cmpuo_s4_f32, svfloat32_t, float32_t, + p0 = svcmpuo_n_f32 (p1, z0, d4), + p0 = svcmpuo (p1, z0, d4)) + +/* +** cmpuo_0_f32: +** mov (z[0-9]+\.s), #0 +** fcmuo p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpuo_0_f32, svfloat32_t, + p0 = svcmpuo_n_f32 (p1, z0, 0), + p0 = svcmpuo (p1, z0, 0)) + +/* +** cmpuo_1_f32: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fcmuo p0\.s, p1/z, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_COMPARE_Z (cmpuo_1_f32, svfloat32_t, + p0 = svcmpuo_n_f32 (p1, z0, 1), + p0 = svcmpuo (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f64.c new file mode 100644 index 000000000..d7a71eca4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cmpuo_f64.c @@ -0,0 +1,51 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cmpuo_f64_tied: +** fcmuo p0\.d, p0/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpuo_f64_tied, svfloat64_t, + p0 = svcmpuo_f64 (p0, z0, z1), + p0 = svcmpuo (p0, z0, z1)) + +/* +** cmpuo_f64_untied: +** fcmuo p0\.d, p1/z, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpuo_f64_untied, svfloat64_t, + p0 = svcmpuo_f64 (p1, z0, z1), + p0 = svcmpuo (p1, z0, z1)) + +/* +** cmpuo_d4_f64: +** mov (z[0-9]+\.d), d4 +** fcmuo p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_ZD (cmpuo_d4_f64, svfloat64_t, float64_t, + p0 = svcmpuo_n_f64 (p1, z0, d4), + p0 = svcmpuo (p1, z0, d4)) + +/* +** cmpuo_0_f64: +** mov (z[0-9]+\.d), #0 +** fcmuo p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpuo_0_f64, svfloat64_t, + p0 = svcmpuo_n_f64 (p1, z0, 0), + p0 = svcmpuo (p1, z0, 0)) + +/* +** cmpuo_1_f64: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** fcmuo p0\.d, p1/z, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_COMPARE_Z (cmpuo_1_f64, svfloat64_t, + p0 = svcmpuo_n_f64 (p1, z0, 1), + p0 = svcmpuo (p1, z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s16.c new file mode 100644 index 000000000..19d46be68 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnot_s16_m_tied12: +** cnot z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_s16_m_tied12, svint16_t, + z0 = svcnot_s16_m (z0, p0, z0), + z0 = svcnot_m (z0, p0, z0)) + +/* +** cnot_s16_m_tied1: +** cnot z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_s16_m_tied1, svint16_t, + z0 = svcnot_s16_m (z0, p0, z1), + z0 = svcnot_m (z0, p0, z1)) + +/* +** cnot_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnot z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_s16_m_tied2, svint16_t, + z0 = svcnot_s16_m (z1, p0, z0), + z0 = svcnot_m (z1, p0, z0)) + +/* +** cnot_s16_m_untied: +** movprfx z0, z2 +** cnot z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_s16_m_untied, svint16_t, + z0 = svcnot_s16_m (z2, p0, z1), + z0 = svcnot_m (z2, p0, z1)) + +/* +** cnot_s16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** cnot z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_s16_z_tied1, svint16_t, + z0 = svcnot_s16_z (p0, z0), + z0 = svcnot_z (p0, z0)) + +/* +** cnot_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** cnot z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_s16_z_untied, svint16_t, + z0 = svcnot_s16_z (p0, z1), + z0 = svcnot_z (p0, z1)) + +/* +** cnot_s16_x_tied1: +** cnot z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_s16_x_tied1, svint16_t, + z0 = svcnot_s16_x (p0, z0), + z0 = svcnot_x (p0, z0)) + +/* +** cnot_s16_x_untied: +** cnot z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_s16_x_untied, svint16_t, + z0 = svcnot_s16_x (p0, z1), + z0 = svcnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s32.c new file mode 100644 index 000000000..041b59a04 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnot_s32_m_tied12: +** cnot z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_s32_m_tied12, svint32_t, + z0 = svcnot_s32_m (z0, p0, z0), + z0 = svcnot_m (z0, p0, z0)) + +/* +** cnot_s32_m_tied1: +** cnot z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_s32_m_tied1, svint32_t, + z0 = svcnot_s32_m (z0, p0, z1), + z0 = svcnot_m (z0, p0, z1)) + +/* +** cnot_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnot z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_s32_m_tied2, svint32_t, + z0 = svcnot_s32_m (z1, p0, z0), + z0 = svcnot_m (z1, p0, z0)) + +/* +** cnot_s32_m_untied: +** movprfx z0, z2 +** cnot z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_s32_m_untied, svint32_t, + z0 = svcnot_s32_m (z2, p0, z1), + z0 = svcnot_m (z2, p0, z1)) + +/* +** cnot_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** cnot z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_s32_z_tied1, svint32_t, + z0 = svcnot_s32_z (p0, z0), + z0 = svcnot_z (p0, z0)) + +/* +** cnot_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** cnot z0\.s, p0/m, z1\.s 
+** ret +*/ +TEST_UNIFORM_Z (cnot_s32_z_untied, svint32_t, + z0 = svcnot_s32_z (p0, z1), + z0 = svcnot_z (p0, z1)) + +/* +** cnot_s32_x_tied1: +** cnot z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_s32_x_tied1, svint32_t, + z0 = svcnot_s32_x (p0, z0), + z0 = svcnot_x (p0, z0)) + +/* +** cnot_s32_x_untied: +** cnot z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_s32_x_untied, svint32_t, + z0 = svcnot_s32_x (p0, z1), + z0 = svcnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s64.c new file mode 100644 index 000000000..c7135cb95 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnot_s64_m_tied12: +** cnot z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_s64_m_tied12, svint64_t, + z0 = svcnot_s64_m (z0, p0, z0), + z0 = svcnot_m (z0, p0, z0)) + +/* +** cnot_s64_m_tied1: +** cnot z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_s64_m_tied1, svint64_t, + z0 = svcnot_s64_m (z0, p0, z1), + z0 = svcnot_m (z0, p0, z1)) + +/* +** cnot_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** cnot z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (cnot_s64_m_tied2, svint64_t, + z0 = svcnot_s64_m (z1, p0, z0), + z0 = svcnot_m (z1, p0, z0)) + +/* +** cnot_s64_m_untied: +** movprfx z0, z2 +** cnot z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_s64_m_untied, svint64_t, + z0 = svcnot_s64_m (z2, p0, z1), + z0 = svcnot_m (z2, p0, z1)) + +/* +** cnot_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** cnot z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (cnot_s64_z_tied1, svint64_t, + z0 = svcnot_s64_z (p0, z0), + z0 = svcnot_z (p0, z0)) + +/* +** cnot_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** cnot z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_s64_z_untied, svint64_t, + z0 = svcnot_s64_z (p0, z1), + z0 = svcnot_z (p0, z1)) + +/* +** cnot_s64_x_tied1: +** cnot z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_s64_x_tied1, svint64_t, + z0 = svcnot_s64_x (p0, z0), + z0 = svcnot_x (p0, z0)) + +/* +** cnot_s64_x_untied: +** cnot z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_s64_x_untied, svint64_t, + z0 = svcnot_s64_x (p0, z1), + z0 = svcnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s8.c new file mode 100644 index 000000000..0560f9751 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_s8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnot_s8_m_tied12: +** cnot z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_s8_m_tied12, svint8_t, + z0 = svcnot_s8_m (z0, p0, z0), + z0 = svcnot_m (z0, p0, z0)) + +/* +** cnot_s8_m_tied1: +** cnot z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_s8_m_tied1, svint8_t, + z0 = svcnot_s8_m (z0, p0, z1), + z0 = svcnot_m (z0, p0, z1)) + +/* +** cnot_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnot z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_s8_m_tied2, svint8_t, + z0 = svcnot_s8_m (z1, p0, z0), + z0 = svcnot_m (z1, p0, z0)) + +/* +** cnot_s8_m_untied: +** movprfx z0, z2 +** cnot z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_s8_m_untied, svint8_t, + z0 = svcnot_s8_m (z2, p0, z1), + z0 = svcnot_m (z2, p0, z1)) + +/* +** cnot_s8_z_tied1: +** mov (z[0-9]+)\.d, 
z0\.d +** movprfx z0\.b, p0/z, \1\.b +** cnot z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_s8_z_tied1, svint8_t, + z0 = svcnot_s8_z (p0, z0), + z0 = svcnot_z (p0, z0)) + +/* +** cnot_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** cnot z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_s8_z_untied, svint8_t, + z0 = svcnot_s8_z (p0, z1), + z0 = svcnot_z (p0, z1)) + +/* +** cnot_s8_x_tied1: +** cnot z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_s8_x_tied1, svint8_t, + z0 = svcnot_s8_x (p0, z0), + z0 = svcnot_x (p0, z0)) + +/* +** cnot_s8_x_untied: +** cnot z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_s8_x_untied, svint8_t, + z0 = svcnot_s8_x (p0, z1), + z0 = svcnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u16.c new file mode 100644 index 000000000..7ea9ff71d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnot_u16_m_tied12: +** cnot z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_u16_m_tied12, svuint16_t, + z0 = svcnot_u16_m (z0, p0, z0), + z0 = svcnot_m (z0, p0, z0)) + +/* +** cnot_u16_m_tied1: +** cnot z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_u16_m_tied1, svuint16_t, + z0 = svcnot_u16_m (z0, p0, z1), + z0 = svcnot_m (z0, p0, z1)) + +/* +** cnot_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnot z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_u16_m_tied2, svuint16_t, + z0 = svcnot_u16_m (z1, p0, z0), + z0 = svcnot_m (z1, p0, z0)) + +/* +** cnot_u16_m_untied: +** movprfx z0, z2 +** cnot z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_u16_m_untied, svuint16_t, + z0 = svcnot_u16_m (z2, p0, z1), + z0 = svcnot_m (z2, p0, z1)) + +/* +** cnot_u16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** cnot z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_u16_z_tied1, svuint16_t, + z0 = svcnot_u16_z (p0, z0), + z0 = svcnot_z (p0, z0)) + +/* +** cnot_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** cnot z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_u16_z_untied, svuint16_t, + z0 = svcnot_u16_z (p0, z1), + z0 = svcnot_z (p0, z1)) + +/* +** cnot_u16_x_tied1: +** cnot z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_u16_x_tied1, svuint16_t, + z0 = svcnot_u16_x (p0, z0), + z0 = svcnot_x (p0, z0)) + +/* +** cnot_u16_x_untied: +** cnot z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnot_u16_x_untied, svuint16_t, + z0 = svcnot_u16_x (p0, z1), + z0 = svcnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u32.c new file mode 100644 index 000000000..972c7751e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnot_u32_m_tied12: +** cnot z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_u32_m_tied12, svuint32_t, + z0 = svcnot_u32_m (z0, p0, z0), + z0 = svcnot_m (z0, p0, z0)) + +/* +** cnot_u32_m_tied1: +** cnot z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_u32_m_tied1, svuint32_t, + z0 = svcnot_u32_m (z0, p0, z1), + z0 = svcnot_m (z0, p0, z1)) + +/* +** cnot_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnot z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_u32_m_tied2, svuint32_t, + z0 = svcnot_u32_m 
(z1, p0, z0), + z0 = svcnot_m (z1, p0, z0)) + +/* +** cnot_u32_m_untied: +** movprfx z0, z2 +** cnot z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_u32_m_untied, svuint32_t, + z0 = svcnot_u32_m (z2, p0, z1), + z0 = svcnot_m (z2, p0, z1)) + +/* +** cnot_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** cnot z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_u32_z_tied1, svuint32_t, + z0 = svcnot_u32_z (p0, z0), + z0 = svcnot_z (p0, z0)) + +/* +** cnot_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** cnot z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_u32_z_untied, svuint32_t, + z0 = svcnot_u32_z (p0, z1), + z0 = svcnot_z (p0, z1)) + +/* +** cnot_u32_x_tied1: +** cnot z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_u32_x_tied1, svuint32_t, + z0 = svcnot_u32_x (p0, z0), + z0 = svcnot_x (p0, z0)) + +/* +** cnot_u32_x_untied: +** cnot z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnot_u32_x_untied, svuint32_t, + z0 = svcnot_u32_x (p0, z1), + z0 = svcnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u64.c new file mode 100644 index 000000000..f25e001c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnot_u64_m_tied12: +** cnot z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_u64_m_tied12, svuint64_t, + z0 = svcnot_u64_m (z0, p0, z0), + z0 = svcnot_m (z0, p0, z0)) + +/* +** cnot_u64_m_tied1: +** cnot z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_u64_m_tied1, svuint64_t, + z0 = svcnot_u64_m (z0, p0, z1), + z0 = svcnot_m (z0, p0, z1)) + +/* +** cnot_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** cnot z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (cnot_u64_m_tied2, svuint64_t, + z0 = svcnot_u64_m (z1, p0, z0), + z0 = svcnot_m (z1, p0, z0)) + +/* +** cnot_u64_m_untied: +** movprfx z0, z2 +** cnot z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_u64_m_untied, svuint64_t, + z0 = svcnot_u64_m (z2, p0, z1), + z0 = svcnot_m (z2, p0, z1)) + +/* +** cnot_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** cnot z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (cnot_u64_z_tied1, svuint64_t, + z0 = svcnot_u64_z (p0, z0), + z0 = svcnot_z (p0, z0)) + +/* +** cnot_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** cnot z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_u64_z_untied, svuint64_t, + z0 = svcnot_u64_z (p0, z1), + z0 = svcnot_z (p0, z1)) + +/* +** cnot_u64_x_tied1: +** cnot z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_u64_x_tied1, svuint64_t, + z0 = svcnot_u64_x (p0, z0), + z0 = svcnot_x (p0, z0)) + +/* +** cnot_u64_x_untied: +** cnot z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnot_u64_x_untied, svuint64_t, + z0 = svcnot_u64_x (p0, z1), + z0 = svcnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u8.c new file mode 100644 index 000000000..e135a7295 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnot_u8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnot_u8_m_tied12: +** cnot z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_u8_m_tied12, svuint8_t, + z0 = svcnot_u8_m (z0, p0, z0), + z0 = svcnot_m (z0, p0, z0)) + +/* +** cnot_u8_m_tied1: +** cnot z0\.b, p0/m, z1\.b +** ret 
+*/ +TEST_UNIFORM_Z (cnot_u8_m_tied1, svuint8_t, + z0 = svcnot_u8_m (z0, p0, z1), + z0 = svcnot_m (z0, p0, z1)) + +/* +** cnot_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnot z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_u8_m_tied2, svuint8_t, + z0 = svcnot_u8_m (z1, p0, z0), + z0 = svcnot_m (z1, p0, z0)) + +/* +** cnot_u8_m_untied: +** movprfx z0, z2 +** cnot z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_u8_m_untied, svuint8_t, + z0 = svcnot_u8_m (z2, p0, z1), + z0 = svcnot_m (z2, p0, z1)) + +/* +** cnot_u8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** cnot z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_u8_z_tied1, svuint8_t, + z0 = svcnot_u8_z (p0, z0), + z0 = svcnot_z (p0, z0)) + +/* +** cnot_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** cnot z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_u8_z_untied, svuint8_t, + z0 = svcnot_u8_z (p0, z1), + z0 = svcnot_z (p0, z1)) + +/* +** cnot_u8_x_tied1: +** cnot z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_u8_x_tied1, svuint8_t, + z0 = svcnot_u8_x (p0, z0), + z0 = svcnot_x (p0, z0)) + +/* +** cnot_u8_x_untied: +** cnot z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnot_u8_x_untied, svuint8_t, + z0 = svcnot_u8_x (p0, z1), + z0 = svcnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_bf16.c new file mode 100644 index 000000000..d92fbc157 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_bf16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_bf16_m_tied1: +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_bf16_m_tied1, svuint16_t, svbfloat16_t, + z0 = svcnt_bf16_m (z0, p0, z4), + z0 = svcnt_m (z0, p0, z4)) + +/* +** cnt_bf16_m_untied: +** movprfx z0, z1 +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_bf16_m_untied, svuint16_t, svbfloat16_t, + z0 = svcnt_bf16_m (z1, p0, z4), + z0 = svcnt_m (z1, p0, z4)) + +/* +** cnt_bf16_z: +** movprfx z0\.h, p0/z, z4\.h +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_bf16_z, svuint16_t, svbfloat16_t, + z0 = svcnt_bf16_z (p0, z4), + z0 = svcnt_z (p0, z4)) + +/* +** cnt_bf16_x: +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_bf16_x, svuint16_t, svbfloat16_t, + z0 = svcnt_bf16_x (p0, z4), + z0 = svcnt_x (p0, z4)) + +/* +** ptrue_cnt_bf16_x: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z (ptrue_cnt_bf16_x, svuint16_t, svbfloat16_t, + z0 = svcnt_bf16_x (svptrue_b16 (), z4), + z0 = svcnt_x (svptrue_b16 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f16.c new file mode 100644 index 000000000..b8061bb80 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f16.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_f16_m_tied1: +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_f16_m_tied1, svuint16_t, svfloat16_t, + z0 = svcnt_f16_m (z0, p0, z4), + z0 = svcnt_m (z0, p0, z4)) + +/* +** cnt_f16_m_untied: +** movprfx z0, z1 +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_f16_m_untied, svuint16_t, svfloat16_t, + z0 = svcnt_f16_m (z1, p0, z4), + z0 = svcnt_m (z1, p0, z4)) + +/* +** cnt_f16_z: +** movprfx z0\.h, p0/z, z4\.h +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_f16_z, svuint16_t, svfloat16_t, + z0 = svcnt_f16_z (p0, z4), + z0 = svcnt_z (p0, z4)) + +/* +** cnt_f16_x: +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_f16_x, svuint16_t, svfloat16_t, + z0 = svcnt_f16_x (p0, z4), + z0 = svcnt_x (p0, z4)) + +/* +** ptrue_cnt_f16_x: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cnt_f16_x, svuint16_t, svfloat16_t, + z0 = svcnt_f16_x (svptrue_b16 (), z4), + z0 = svcnt_x (svptrue_b16 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f32.c new file mode 100644 index 000000000..b9292c977 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f32.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_f32_m_tied1: +** cnt z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cnt_f32_m_tied1, svuint32_t, svfloat32_t, + z0 = svcnt_f32_m (z0, p0, z4), + z0 = svcnt_m (z0, p0, z4)) + +/* +** cnt_f32_m_untied: +** movprfx z0, z1 +** cnt z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cnt_f32_m_untied, svuint32_t, svfloat32_t, + z0 = svcnt_f32_m (z1, p0, z4), + z0 = svcnt_m (z1, p0, z4)) + +/* +** cnt_f32_z: +** movprfx z0\.s, p0/z, z4\.s +** cnt z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cnt_f32_z, svuint32_t, svfloat32_t, + z0 = svcnt_f32_z (p0, z4), + z0 = svcnt_z (p0, z4)) + +/* +** cnt_f32_x: +** cnt z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cnt_f32_x, svuint32_t, svfloat32_t, + z0 = svcnt_f32_x (p0, z4), + z0 = svcnt_x (p0, z4)) + +/* +** ptrue_cnt_f32_x: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z (ptrue_cnt_f32_x, svuint32_t, svfloat32_t, + z0 = svcnt_f32_x (svptrue_b32 (), z4), + z0 = svcnt_x (svptrue_b32 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f64.c new file mode 100644 index 000000000..4976ee467 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_f64.c @@ -0,0 +1,52 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_f64_m_tied1: +** cnt z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cnt_f64_m_tied1, svuint64_t, svfloat64_t, + z0 = svcnt_f64_m (z0, p0, z4), + z0 = svcnt_m (z0, p0, z4)) + +/* +** cnt_f64_m_untied: +** movprfx z0, z1 +** cnt z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cnt_f64_m_untied, svuint64_t, svfloat64_t, + z0 = svcnt_f64_m (z1, p0, z4), + z0 = svcnt_m (z1, p0, z4)) + +/* +** cnt_f64_z: +** movprfx z0\.d, p0/z, z4\.d +** cnt z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cnt_f64_z, svuint64_t, svfloat64_t, + z0 = svcnt_f64_z (p0, z4), + z0 = svcnt_z (p0, z4)) + +/* +** cnt_f64_x: +** cnt z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cnt_f64_x, svuint64_t, svfloat64_t, + z0 = svcnt_f64_x (p0, z4), + z0 = svcnt_x (p0, z4)) + +/* +** ptrue_cnt_f64_x: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cnt_f64_x, svuint64_t, svfloat64_t, + z0 = svcnt_f64_x (svptrue_b64 (), z4), + z0 = svcnt_x (svptrue_b64 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s16.c new file mode 100644 index 000000000..a8ff8f3d2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s16.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_s16_m_tied1: +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_s16_m_tied1, svuint16_t, svint16_t, + z0 = svcnt_s16_m (z0, p0, z4), + z0 = svcnt_m (z0, p0, z4)) + +/* +** cnt_s16_m_untied: +** movprfx z0, z1 +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_s16_m_untied, svuint16_t, svint16_t, + z0 = svcnt_s16_m (z1, p0, z4), + z0 = svcnt_m (z1, p0, z4)) + +/* +** cnt_s16_z: +** movprfx z0\.h, p0/z, z4\.h +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_s16_z, svuint16_t, svint16_t, + z0 = svcnt_s16_z (p0, z4), + z0 = svcnt_z (p0, z4)) + +/* +** cnt_s16_x: +** cnt z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cnt_s16_x, svuint16_t, svint16_t, + z0 = svcnt_s16_x (p0, z4), + z0 = svcnt_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s32.c new file mode 100644 index 000000000..3d16041f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s32.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_s32_m_tied1: +** cnt z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cnt_s32_m_tied1, svuint32_t, svint32_t, + z0 = svcnt_s32_m (z0, p0, z4), + z0 = svcnt_m (z0, p0, z4)) + +/* +** cnt_s32_m_untied: +** movprfx z0, z1 +** cnt z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cnt_s32_m_untied, svuint32_t, svint32_t, + z0 = svcnt_s32_m (z1, p0, z4), + z0 = svcnt_m (z1, p0, z4)) + +/* +** cnt_s32_z: +** movprfx z0\.s, p0/z, z4\.s +** cnt z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cnt_s32_z, svuint32_t, svint32_t, + z0 = svcnt_s32_z (p0, z4), + z0 = svcnt_z (p0, z4)) + +/* +** cnt_s32_x: +** cnt 
z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cnt_s32_x, svuint32_t, svint32_t, + z0 = svcnt_s32_x (p0, z4), + z0 = svcnt_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s64.c new file mode 100644 index 000000000..8c8871ba5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s64.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_s64_m_tied1: +** cnt z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cnt_s64_m_tied1, svuint64_t, svint64_t, + z0 = svcnt_s64_m (z0, p0, z4), + z0 = svcnt_m (z0, p0, z4)) + +/* +** cnt_s64_m_untied: +** movprfx z0, z1 +** cnt z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cnt_s64_m_untied, svuint64_t, svint64_t, + z0 = svcnt_s64_m (z1, p0, z4), + z0 = svcnt_m (z1, p0, z4)) + +/* +** cnt_s64_z: +** movprfx z0\.d, p0/z, z4\.d +** cnt z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cnt_s64_z, svuint64_t, svint64_t, + z0 = svcnt_s64_z (p0, z4), + z0 = svcnt_z (p0, z4)) + +/* +** cnt_s64_x: +** cnt z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cnt_s64_x, svuint64_t, svint64_t, + z0 = svcnt_s64_x (p0, z4), + z0 = svcnt_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s8.c new file mode 100644 index 000000000..8d85c8e51 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_s8.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_s8_m_tied1: +** cnt z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (cnt_s8_m_tied1, svuint8_t, svint8_t, + z0 = svcnt_s8_m (z0, p0, z4), + z0 = svcnt_m (z0, p0, z4)) + +/* +** cnt_s8_m_untied: +** movprfx z0, z1 +** cnt z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (cnt_s8_m_untied, svuint8_t, svint8_t, + z0 = svcnt_s8_m (z1, p0, z4), + z0 = svcnt_m (z1, p0, z4)) + +/* +** cnt_s8_z: +** movprfx z0\.b, p0/z, z4\.b +** cnt z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (cnt_s8_z, svuint8_t, svint8_t, + z0 = svcnt_s8_z (p0, z4), + z0 = svcnt_z (p0, z4)) + +/* +** cnt_s8_x: +** cnt z0\.b, p0/m, z4\.b +** ret +*/ +TEST_DUAL_Z (cnt_s8_x, svuint8_t, svint8_t, + z0 = svcnt_s8_x (p0, z4), + z0 = svcnt_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u16.c new file mode 100644 index 000000000..f173d3108 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_u16_m_tied12: +** cnt z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (cnt_u16_m_tied12, svuint16_t, + z0 = svcnt_u16_m (z0, p0, z0), + z0 = svcnt_m (z0, p0, z0)) + +/* +** cnt_u16_m_tied1: +** cnt z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnt_u16_m_tied1, svuint16_t, + z0 = svcnt_u16_m (z0, p0, z1), + z0 = svcnt_m (z0, p0, z1)) + +/* +** cnt_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnt z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (cnt_u16_m_tied2, svuint16_t, + z0 = svcnt_u16_m (z1, p0, z0), + z0 = svcnt_m (z1, p0, z0)) + +/* +** cnt_u16_m_untied: +** movprfx z0, z2 +** cnt z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnt_u16_m_untied, svuint16_t, + z0 = svcnt_u16_m (z2, p0, z1), + z0 = svcnt_m (z2, p0, z1)) + +/* +** cnt_u16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** cnt z0\.h, 
p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (cnt_u16_z_tied1, svuint16_t, + z0 = svcnt_u16_z (p0, z0), + z0 = svcnt_z (p0, z0)) + +/* +** cnt_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** cnt z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnt_u16_z_untied, svuint16_t, + z0 = svcnt_u16_z (p0, z1), + z0 = svcnt_z (p0, z1)) + +/* +** cnt_u16_x_tied1: +** cnt z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (cnt_u16_x_tied1, svuint16_t, + z0 = svcnt_u16_x (p0, z0), + z0 = svcnt_x (p0, z0)) + +/* +** cnt_u16_x_untied: +** cnt z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (cnt_u16_x_untied, svuint16_t, + z0 = svcnt_u16_x (p0, z1), + z0 = svcnt_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u32.c new file mode 100644 index 000000000..11969a6b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_u32_m_tied12: +** cnt z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (cnt_u32_m_tied12, svuint32_t, + z0 = svcnt_u32_m (z0, p0, z0), + z0 = svcnt_m (z0, p0, z0)) + +/* +** cnt_u32_m_tied1: +** cnt z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnt_u32_m_tied1, svuint32_t, + z0 = svcnt_u32_m (z0, p0, z1), + z0 = svcnt_m (z0, p0, z1)) + +/* +** cnt_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnt z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (cnt_u32_m_tied2, svuint32_t, + z0 = svcnt_u32_m (z1, p0, z0), + z0 = svcnt_m (z1, p0, z0)) + +/* +** cnt_u32_m_untied: +** movprfx z0, z2 +** cnt z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnt_u32_m_untied, svuint32_t, + z0 = svcnt_u32_m (z2, p0, z1), + z0 = svcnt_m (z2, p0, z1)) + +/* +** cnt_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** cnt z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (cnt_u32_z_tied1, svuint32_t, + z0 = svcnt_u32_z (p0, z0), + z0 = svcnt_z (p0, z0)) + +/* +** cnt_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** cnt z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnt_u32_z_untied, svuint32_t, + z0 = svcnt_u32_z (p0, z1), + z0 = svcnt_z (p0, z1)) + +/* +** cnt_u32_x_tied1: +** cnt z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (cnt_u32_x_tied1, svuint32_t, + z0 = svcnt_u32_x (p0, z0), + z0 = svcnt_x (p0, z0)) + +/* +** cnt_u32_x_untied: +** cnt z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (cnt_u32_x_untied, svuint32_t, + z0 = svcnt_u32_x (p0, z1), + z0 = svcnt_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u64.c new file mode 100644 index 000000000..4eb69ea84 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_u64_m_tied12: +** cnt z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (cnt_u64_m_tied12, svuint64_t, + z0 = svcnt_u64_m (z0, p0, z0), + z0 = svcnt_m (z0, p0, z0)) + +/* +** cnt_u64_m_tied1: +** cnt z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnt_u64_m_tied1, svuint64_t, + z0 = svcnt_u64_m (z0, p0, z1), + z0 = svcnt_m (z0, p0, z1)) + +/* +** cnt_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** cnt z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (cnt_u64_m_tied2, svuint64_t, + z0 = svcnt_u64_m (z1, p0, z0), + z0 = svcnt_m (z1, p0, z0)) + +/* +** cnt_u64_m_untied: +** movprfx z0, z2 +** cnt z0\.d, p0/m, z1\.d 
+** ret +*/ +TEST_UNIFORM_Z (cnt_u64_m_untied, svuint64_t, + z0 = svcnt_u64_m (z2, p0, z1), + z0 = svcnt_m (z2, p0, z1)) + +/* +** cnt_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** cnt z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (cnt_u64_z_tied1, svuint64_t, + z0 = svcnt_u64_z (p0, z0), + z0 = svcnt_z (p0, z0)) + +/* +** cnt_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** cnt z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnt_u64_z_untied, svuint64_t, + z0 = svcnt_u64_z (p0, z1), + z0 = svcnt_z (p0, z1)) + +/* +** cnt_u64_x_tied1: +** cnt z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (cnt_u64_x_tied1, svuint64_t, + z0 = svcnt_u64_x (p0, z0), + z0 = svcnt_x (p0, z0)) + +/* +** cnt_u64_x_untied: +** cnt z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (cnt_u64_x_untied, svuint64_t, + z0 = svcnt_u64_x (p0, z1), + z0 = svcnt_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u8.c new file mode 100644 index 000000000..30e798302 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnt_u8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnt_u8_m_tied12: +** cnt z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (cnt_u8_m_tied12, svuint8_t, + z0 = svcnt_u8_m (z0, p0, z0), + z0 = svcnt_m (z0, p0, z0)) + +/* +** cnt_u8_m_tied1: +** cnt z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnt_u8_m_tied1, svuint8_t, + z0 = svcnt_u8_m (z0, p0, z1), + z0 = svcnt_m (z0, p0, z1)) + +/* +** cnt_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** cnt z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (cnt_u8_m_tied2, svuint8_t, + z0 = svcnt_u8_m (z1, p0, z0), + z0 = svcnt_m (z1, p0, z0)) + +/* +** cnt_u8_m_untied: +** movprfx z0, z2 +** cnt z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnt_u8_m_untied, svuint8_t, + z0 = svcnt_u8_m (z2, p0, z1), + z0 = svcnt_m (z2, p0, z1)) + +/* +** cnt_u8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** cnt z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (cnt_u8_z_tied1, svuint8_t, + z0 = svcnt_u8_z (p0, z0), + z0 = svcnt_z (p0, z0)) + +/* +** cnt_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** cnt z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnt_u8_z_untied, svuint8_t, + z0 = svcnt_u8_z (p0, z1), + z0 = svcnt_z (p0, z1)) + +/* +** cnt_u8_x_tied1: +** cnt z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (cnt_u8_x_tied1, svuint8_t, + z0 = svcnt_u8_x (p0, z0), + z0 = svcnt_x (p0, z0)) + +/* +** cnt_u8_x_untied: +** cnt z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (cnt_u8_x_untied, svuint8_t, + z0 = svcnt_u8_x (p0, z1), + z0 = svcnt_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c new file mode 100644 index 000000000..8b8fe8e4f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c @@ -0,0 +1,280 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cntb_1: +** cntb x0 +** ret +*/ +PROTO (cntb_1, uint64_t, ()) { return svcntb (); } + +/* +** cntb_2: +** cntb x0, all, mul #2 +** ret +*/ +PROTO (cntb_2, uint64_t, ()) { return svcntb () * 2; } + +/* +** cntb_3: +** cntb x0, all, mul #3 +** ret +*/ +PROTO (cntb_3, uint64_t, ()) { return svcntb () * 3; } + +/* +** cntb_4: +** cntb x0, all, mul #4 +** ret +*/ +PROTO (cntb_4, uint64_t, ()) { return svcntb () * 4; } + +/* +** cntb_8: +** cntb x0, all, mul 
#8 +** ret +*/ +PROTO (cntb_8, uint64_t, ()) { return svcntb () * 8; } + +/* +** cntb_15: +** cntb x0, all, mul #15 +** ret +*/ +PROTO (cntb_15, uint64_t, ()) { return svcntb () * 15; } + +/* +** cntb_16: +** cntb x0, all, mul #16 +** ret +*/ +PROTO (cntb_16, uint64_t, ()) { return svcntb () * 16; } + +/* Other sequences would be OK. */ +/* +** cntb_17: +** cntb x0, all, mul #16 +** incb x0 +** ret +*/ +PROTO (cntb_17, uint64_t, ()) { return svcntb () * 17; } + +/* +** cntb_32: +** cntd (x[0-9]+) +** lsl x0, \1, 8 +** ret +*/ +PROTO (cntb_32, uint64_t, ()) { return svcntb () * 32; } + +/* Other sequences would be OK. */ +/* +** cntb_33: +** cntb (x[0-9]+) +** lsl x0, \1, 5 +** incb x0 +** ret +*/ +PROTO (cntb_33, uint64_t, ()) { return svcntb () * 33; } + +/* +** cntb_64: +** cntd (x[0-9]+) +** lsl x0, \1, 9 +** ret +*/ +PROTO (cntb_64, uint64_t, ()) { return svcntb () * 64; } + +/* +** cntb_128: +** cntd (x[0-9]+) +** lsl x0, \1, 10 +** ret +*/ +PROTO (cntb_128, uint64_t, ()) { return svcntb () * 128; } + +/* Other sequences would be OK. */ +/* +** cntb_129: +** cntb (x[0-9]+) +** lsl x0, \1, 7 +** incb x0 +** ret +*/ +PROTO (cntb_129, uint64_t, ()) { return svcntb () * 129; } + +/* +** cntb_m1: +** cntb (x[0-9]+) +** neg x0, \1 +** ret +*/ +PROTO (cntb_m1, uint64_t, ()) { return -svcntb (); } + +/* +** cntb_m13: +** cntb (x[0-9]+), all, mul #13 +** neg x0, \1 +** ret +*/ +PROTO (cntb_m13, uint64_t, ()) { return -svcntb () * 13; } + +/* +** cntb_m15: +** cntb (x[0-9]+), all, mul #15 +** neg x0, \1 +** ret +*/ +PROTO (cntb_m15, uint64_t, ()) { return -svcntb () * 15; } + +/* +** cntb_m16: +** cntb (x[0-9]+), all, mul #16 +** neg x0, \1 +** ret +*/ +PROTO (cntb_m16, uint64_t, ()) { return -svcntb () * 16; } + +/* Other sequences would be OK. */ +/* +** cntb_m17: +** cntb x0, all, mul #16 +** incb x0 +** neg x0, x0 +** ret +*/ +PROTO (cntb_m17, uint64_t, ()) { return -svcntb () * 17; } + +/* +** incb_1: +** incb x0 +** ret +*/ +PROTO (incb_1, uint64_t, (uint64_t x0)) { return x0 + svcntb (); } + +/* +** incb_2: +** incb x0, all, mul #2 +** ret +*/ +PROTO (incb_2, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 2; } + +/* +** incb_3: +** incb x0, all, mul #3 +** ret +*/ +PROTO (incb_3, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 3; } + +/* +** incb_4: +** incb x0, all, mul #4 +** ret +*/ +PROTO (incb_4, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 4; } + +/* +** incb_8: +** incb x0, all, mul #8 +** ret +*/ +PROTO (incb_8, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 8; } + +/* +** incb_15: +** incb x0, all, mul #15 +** ret +*/ +PROTO (incb_15, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 15; } + +/* +** incb_16: +** incb x0, all, mul #16 +** ret +*/ +PROTO (incb_16, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 16; } + +/* +** incb_17: +** addvl x0, x0, #17 +** ret +*/ +PROTO (incb_17, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 17; } + +/* +** incb_31: +** addvl x0, x0, #31 +** ret +*/ +PROTO (incb_31, uint64_t, (uint64_t x0)) { return x0 + svcntb () * 31; } + +/* +** decb_1: +** decb x0 +** ret +*/ +PROTO (decb_1, uint64_t, (uint64_t x0)) { return x0 - svcntb (); } + +/* +** decb_2: +** decb x0, all, mul #2 +** ret +*/ +PROTO (decb_2, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 2; } + +/* +** decb_3: +** decb x0, all, mul #3 +** ret +*/ +PROTO (decb_3, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 3; } + +/* +** decb_4: +** decb x0, all, mul #4 +** ret +*/ +PROTO (decb_4, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 4; } + +/* +** 
decb_8: +** decb x0, all, mul #8 +** ret +*/ +PROTO (decb_8, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 8; } + +/* +** decb_15: +** decb x0, all, mul #15 +** ret +*/ +PROTO (decb_15, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 15; } + +/* +** decb_16: +** decb x0, all, mul #16 +** ret +*/ +PROTO (decb_16, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 16; } + +/* +** decb_17: +** addvl x0, x0, #-17 +** ret +*/ +PROTO (decb_17, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 17; } + +/* +** decb_31: +** addvl x0, x0, #-31 +** ret +*/ +PROTO (decb_31, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 31; } + +/* +** decb_32: +** addvl x0, x0, #-32 +** ret +*/ +PROTO (decb_32, uint64_t, (uint64_t x0)) { return x0 - svcntb () * 32; } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb_pat.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb_pat.c new file mode 100644 index 000000000..effc5668d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb_pat.c @@ -0,0 +1,432 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cntb_pow2: +** cntb x0, pow2 +** ret +*/ +PROTO (cntb_pow2, uint64_t, ()) { return svcntb_pat (SV_POW2); } + +/* +** cntb_vl1: +** mov x0, #?1 +** ret +*/ +PROTO (cntb_vl1, uint64_t, ()) { return svcntb_pat (SV_VL1); } + +/* +** cntb_vl2: +** mov x0, #?2 +** ret +*/ +PROTO (cntb_vl2, uint64_t, ()) { return svcntb_pat (SV_VL2); } + +/* +** cntb_vl3: +** mov x0, #?3 +** ret +*/ +PROTO (cntb_vl3, uint64_t, ()) { return svcntb_pat (SV_VL3); } + +/* +** cntb_vl4: +** mov x0, #?4 +** ret +*/ +PROTO (cntb_vl4, uint64_t, ()) { return svcntb_pat (SV_VL4); } + +/* +** cntb_vl5: +** mov x0, #?5 +** ret +*/ +PROTO (cntb_vl5, uint64_t, ()) { return svcntb_pat (SV_VL5); } + +/* +** cntb_vl6: +** mov x0, #?6 +** ret +*/ +PROTO (cntb_vl6, uint64_t, ()) { return svcntb_pat (SV_VL6); } + +/* +** cntb_vl7: +** mov x0, #?7 +** ret +*/ +PROTO (cntb_vl7, uint64_t, ()) { return svcntb_pat (SV_VL7); } + +/* +** cntb_vl8: +** mov x0, #?8 +** ret +*/ +PROTO (cntb_vl8, uint64_t, ()) { return svcntb_pat (SV_VL8); } + +/* +** cntb_vl16: +** mov x0, #?16 +** ret +*/ +PROTO (cntb_vl16, uint64_t, ()) { return svcntb_pat (SV_VL16); } + +/* +** cntb_vl32: +** cntb x0, vl32 +** ret +*/ +PROTO (cntb_vl32, uint64_t, ()) { return svcntb_pat (SV_VL32); } + +/* +** cntb_vl64: +** cntb x0, vl64 +** ret +*/ +PROTO (cntb_vl64, uint64_t, ()) { return svcntb_pat (SV_VL64); } + +/* +** cntb_vl128: +** cntb x0, vl128 +** ret +*/ +PROTO (cntb_vl128, uint64_t, ()) { return svcntb_pat (SV_VL128); } + +/* +** cntb_vl256: +** cntb x0, vl256 +** ret +*/ +PROTO (cntb_vl256, uint64_t, ()) { return svcntb_pat (SV_VL256); } + +/* +** cntb_mul3: +** cntb x0, mul3 +** ret +*/ +PROTO (cntb_mul3, uint64_t, ()) { return svcntb_pat (SV_MUL3); } + +/* +** cntb_mul4: +** cntb x0, mul4 +** ret +*/ +PROTO (cntb_mul4, uint64_t, ()) { return svcntb_pat (SV_MUL4); } + +/* +** cntb_all: +** cntb x0 +** ret +*/ +PROTO (cntb_all, uint64_t, ()) { return svcntb_pat (SV_ALL); } + +/* +** incb_32_pow2: +** incb x0, pow2 +** ret +*/ +PROTO (incb_32_pow2, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_POW2); } + +/* +** incb_32_vl1: +** add w0, w0, #?1 +** ret +*/ +PROTO (incb_32_vl1, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL1); } + +/* +** incb_32_vl2: +** add w0, w0, #?2 +** ret +*/ +PROTO (incb_32_vl2, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL2); } + +/* +** incb_32_vl3: +** add w0, w0, #?3 +** ret +*/ 
+PROTO (incb_32_vl3, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL3); } + +/* +** incb_32_vl4: +** add w0, w0, #?4 +** ret +*/ +PROTO (incb_32_vl4, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL4); } + +/* +** incb_32_vl5: +** add w0, w0, #?5 +** ret +*/ +PROTO (incb_32_vl5, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL5); } + +/* +** incb_32_vl6: +** add w0, w0, #?6 +** ret +*/ +PROTO (incb_32_vl6, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL6); } + +/* +** incb_32_vl7: +** add w0, w0, #?7 +** ret +*/ +PROTO (incb_32_vl7, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL7); } + +/* +** incb_32_vl8: +** add w0, w0, #?8 +** ret +*/ +PROTO (incb_32_vl8, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL8); } + +/* +** incb_32_vl16: +** add w0, w0, #?16 +** ret +*/ +PROTO (incb_32_vl16, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL16); } + +/* +** incb_32_vl32: +** incb x0, vl32 +** ret +*/ +PROTO (incb_32_vl32, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL32); } + +/* +** incb_32_vl64: +** incb x0, vl64 +** ret +*/ +PROTO (incb_32_vl64, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL64); } + +/* +** incb_32_vl128: +** incb x0, vl128 +** ret +*/ +PROTO (incb_32_vl128, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL128); } + +/* +** incb_32_vl256: +** incb x0, vl256 +** ret +*/ +PROTO (incb_32_vl256, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_VL256); } + +/* +** incb_32_mul3: +** incb x0, mul3 +** ret +*/ +PROTO (incb_32_mul3, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_MUL3); } + +/* +** incb_32_mul4: +** incb x0, mul4 +** ret +*/ +PROTO (incb_32_mul4, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_MUL4); } + +/* +** incb_32_all: +** incb x0 +** ret +*/ +PROTO (incb_32_all, uint32_t, (uint32_t w0)) { return w0 + svcntb_pat (SV_ALL); } + +/* +** incb_64_pow2: +** incb x0, pow2 +** ret +*/ +PROTO (incb_64_pow2, uint64_t, (uint64_t x0)) { return x0 + svcntb_pat (SV_POW2); } + +/* +** incb_64_all: +** incb x0 +** ret +*/ +PROTO (incb_64_all, uint64_t, (uint64_t x0)) { return x0 + svcntb_pat (SV_ALL); } + +/* +** decb_32_pow2: +** decb x0, pow2 +** ret +*/ +PROTO (decb_32_pow2, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_POW2); } + +/* +** decb_32_vl1: +** sub w0, w0, #?1 +** ret +*/ +PROTO (decb_32_vl1, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL1); } + +/* +** decb_32_vl2: +** sub w0, w0, #?2 +** ret +*/ +PROTO (decb_32_vl2, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL2); } + +/* +** decb_32_vl3: +** sub w0, w0, #?3 +** ret +*/ +PROTO (decb_32_vl3, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL3); } + +/* +** decb_32_vl4: +** sub w0, w0, #?4 +** ret +*/ +PROTO (decb_32_vl4, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL4); } + +/* +** decb_32_vl5: +** sub w0, w0, #?5 +** ret +*/ +PROTO (decb_32_vl5, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL5); } + +/* +** decb_32_vl6: +** sub w0, w0, #?6 +** ret +*/ +PROTO (decb_32_vl6, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL6); } + +/* +** decb_32_vl7: +** sub w0, w0, #?7 +** ret +*/ +PROTO (decb_32_vl7, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL7); } + +/* +** decb_32_vl8: +** sub w0, w0, #?8 +** ret +*/ +PROTO (decb_32_vl8, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL8); } + +/* +** decb_32_vl16: +** sub w0, w0, #?16 +** ret +*/ +PROTO (decb_32_vl16, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL16); } + +/* +** 
decb_32_vl32: +** decb x0, vl32 +** ret +*/ +PROTO (decb_32_vl32, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL32); } + +/* +** decb_32_vl64: +** decb x0, vl64 +** ret +*/ +PROTO (decb_32_vl64, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL64); } + +/* +** decb_32_vl128: +** decb x0, vl128 +** ret +*/ +PROTO (decb_32_vl128, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL128); } + +/* +** decb_32_vl256: +** decb x0, vl256 +** ret +*/ +PROTO (decb_32_vl256, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_VL256); } + +/* +** decb_32_mul3: +** decb x0, mul3 +** ret +*/ +PROTO (decb_32_mul3, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_MUL3); } + +/* +** decb_32_mul4: +** decb x0, mul4 +** ret +*/ +PROTO (decb_32_mul4, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_MUL4); } + +/* +** decb_32_all: +** decb x0 +** ret +*/ +PROTO (decb_32_all, uint32_t, (uint32_t w0)) { return w0 - svcntb_pat (SV_ALL); } + +/* +** decb_64_pow2: +** decb x0, pow2 +** ret +*/ +PROTO (decb_64_pow2, uint64_t, (uint64_t x0)) { return x0 - svcntb_pat (SV_POW2); } + +/* +** decb_64_all: +** decb x0 +** ret +*/ +PROTO (decb_64_all, uint64_t, (uint64_t x0)) { return x0 - svcntb_pat (SV_ALL); } + +/* +** incb_s8_pow2_z0: +** cntb x([0-9]+), pow2 +** mov (z[0-9]+\.b), w\1 +** add z0\.b, (z0\.b, \2|\2, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (incb_s8_pow2_z0, svint8_t, + z0 = svadd_n_s8_x (svptrue_b8 (), z0, svcntb_pat (SV_POW2)), + z0 = svadd_x (svptrue_b8 (), z0, svcntb_pat (SV_POW2))); + +/* +** incb_s8_pow2_z1: +** cntb x([0-9]+), pow2 +** mov (z[0-9]+\.b), w\1 +** add z0\.b, (z1\.b, \2|\2, z1\.b) +** ret +*/ +TEST_UNIFORM_Z (incb_s8_pow2_z1, svint8_t, + z0 = svadd_n_s8_x (svptrue_b8 (), z1, svcntb_pat (SV_POW2)), + z0 = svadd_x (svptrue_b8 (), z1, svcntb_pat (SV_POW2))); + +/* +** decb_s8_pow2_z0: +** cntb x([0-9]+), pow2 +** mov (z[0-9]+\.b), w\1 +** sub z0\.b, z0\.b, \2 +** ret +*/ +TEST_UNIFORM_Z (decb_s8_pow2_z0, svint8_t, + z0 = svsub_n_s8_x (svptrue_b8 (), z0, svcntb_pat (SV_POW2)), + z0 = svsub_x (svptrue_b8 (), z0, svcntb_pat (SV_POW2))); + +/* +** decb_s8_pow2_z1: +** cntb x([0-9]+), pow2 +** mov (z[0-9]+\.b), w\1 +** sub z0\.b, z1\.b, \2 +** ret +*/ +TEST_UNIFORM_Z (decb_s8_pow2_z1, svint8_t, + z0 = svsub_n_s8_x (svptrue_b8 (), z1, svcntb_pat (SV_POW2)), + z0 = svsub_x (svptrue_b8 (), z1, svcntb_pat (SV_POW2))); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c new file mode 100644 index 000000000..0d0ed4849 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c @@ -0,0 +1,278 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cntd_1: +** cntd x0 +** ret +*/ +PROTO (cntd_1, uint64_t, ()) { return svcntd (); } + +/* +** cntd_2: +** cntw x0 +** ret +*/ +PROTO (cntd_2, uint64_t, ()) { return svcntd () * 2; } + +/* +** cntd_3: +** cntd x0, all, mul #3 +** ret +*/ +PROTO (cntd_3, uint64_t, ()) { return svcntd () * 3; } + +/* +** cntd_4: +** cnth x0 +** ret +*/ +PROTO (cntd_4, uint64_t, ()) { return svcntd () * 4; } + +/* +** cntd_8: +** cntb x0 +** ret +*/ +PROTO (cntd_8, uint64_t, ()) { return svcntd () * 8; } + +/* +** cntd_15: +** cntd x0, all, mul #15 +** ret +*/ +PROTO (cntd_15, uint64_t, ()) { return svcntd () * 15; } + +/* +** cntd_16: +** cntb x0, all, mul #2 +** ret +*/ +PROTO (cntd_16, uint64_t, ()) { return svcntd () * 16; } + +/* Other sequences would be OK. 
*/ +/* +** cntd_17: +** cntb x0, all, mul #2 +** incd x0 +** ret +*/ +PROTO (cntd_17, uint64_t, ()) { return svcntd () * 17; } + +/* +** cntd_32: +** cntb x0, all, mul #4 +** ret +*/ +PROTO (cntd_32, uint64_t, ()) { return svcntd () * 32; } + +/* +** cntd_64: +** cntb x0, all, mul #8 +** ret +*/ +PROTO (cntd_64, uint64_t, ()) { return svcntd () * 64; } + +/* +** cntd_128: +** cntb x0, all, mul #16 +** ret +*/ +PROTO (cntd_128, uint64_t, ()) { return svcntd () * 128; } + +/* +** cntd_m1: +** cntd (x[0-9]+) +** neg x0, \1 +** ret +*/ +PROTO (cntd_m1, uint64_t, ()) { return -svcntd (); } + +/* +** cntd_m13: +** cntd (x[0-9]+), all, mul #13 +** neg x0, \1 +** ret +*/ +PROTO (cntd_m13, uint64_t, ()) { return -svcntd () * 13; } + +/* +** cntd_m15: +** cntd (x[0-9]+), all, mul #15 +** neg x0, \1 +** ret +*/ +PROTO (cntd_m15, uint64_t, ()) { return -svcntd () * 15; } + +/* +** cntd_m16: +** cntb (x[0-9]+), all, mul #2 +** neg x0, \1 +** ret +*/ +PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; } + +/* Other sequences would be OK. */ +/* +** cntd_m17: +** cntb x0, all, mul #2 +** incd x0 +** neg x0, x0 +** ret +*/ +PROTO (cntd_m17, uint64_t, ()) { return -svcntd () * 17; } + +/* +** incd_1: +** incd x0 +** ret +*/ +PROTO (incd_1, uint64_t, (uint64_t x0)) { return x0 + svcntd (); } + +/* +** incd_2: +** incw x0 +** ret +*/ +PROTO (incd_2, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 2; } + +/* +** incd_3: +** incd x0, all, mul #3 +** ret +*/ +PROTO (incd_3, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 3; } + +/* +** incd_4: +** inch x0 +** ret +*/ +PROTO (incd_4, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 4; } + +/* +** incd_7: +** incd x0, all, mul #7 +** ret +*/ +PROTO (incd_7, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 7; } + +/* +** incd_8: +** incb x0 +** ret +*/ +PROTO (incd_8, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 8; } + +/* +** incd_9: +** incd x0, all, mul #9 +** ret +*/ +PROTO (incd_9, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 9; } + +/* +** incd_15: +** incd x0, all, mul #15 +** ret +*/ +PROTO (incd_15, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 15; } + +/* +** incd_16: +** incb x0, all, mul #2 +** ret +*/ +PROTO (incd_16, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 16; } + +/* +** incd_18: +** incw x0, all, mul #9 +** ret +*/ +PROTO (incd_18, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 18; } + +/* +** incd_30: +** incw x0, all, mul #15 +** ret +*/ +PROTO (incd_30, uint64_t, (uint64_t x0)) { return x0 + svcntd () * 30; } + +/* +** decd_1: +** decd x0 +** ret +*/ +PROTO (decd_1, uint64_t, (uint64_t x0)) { return x0 - svcntd (); } + +/* +** decd_2: +** decw x0 +** ret +*/ +PROTO (decd_2, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 2; } + +/* +** decd_3: +** decd x0, all, mul #3 +** ret +*/ +PROTO (decd_3, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 3; } + +/* +** decd_4: +** dech x0 +** ret +*/ +PROTO (decd_4, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 4; } + +/* +** decd_7: +** decd x0, all, mul #7 +** ret +*/ +PROTO (decd_7, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 7; } + +/* +** decd_8: +** decb x0 +** ret +*/ +PROTO (decd_8, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 8; } + +/* +** decd_9: +** decd x0, all, mul #9 +** ret +*/ +PROTO (decd_9, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 9; } + +/* +** decd_15: +** decd x0, all, mul #15 +** ret +*/ +PROTO (decd_15, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 15; } + +/* +** decd_16: +** decb x0, all, mul #2 
+** ret +*/ +PROTO (decd_16, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 16; } + +/* +** decd_18: +** decw x0, all, mul #9 +** ret +*/ +PROTO (decd_18, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 18; } + +/* +** decd_30: +** decw x0, all, mul #15 +** ret +*/ +PROTO (decd_30, uint64_t, (uint64_t x0)) { return x0 - svcntd () * 30; } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd_pat.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd_pat.c new file mode 100644 index 000000000..31ecde7ae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd_pat.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cntd_pow2: +** cntd x0, pow2 +** ret +*/ +PROTO (cntd_pow2, uint64_t, ()) { return svcntd_pat (SV_POW2); } + +/* +** cntd_vl1: +** mov x0, #?1 +** ret +*/ +PROTO (cntd_vl1, uint64_t, ()) { return svcntd_pat (SV_VL1); } + +/* +** cntd_vl2: +** mov x0, #?2 +** ret +*/ +PROTO (cntd_vl2, uint64_t, ()) { return svcntd_pat (SV_VL2); } + +/* +** cntd_vl3: +** cntd x0, vl3 +** ret +*/ +PROTO (cntd_vl3, uint64_t, ()) { return svcntd_pat (SV_VL3); } + +/* +** cntd_vl4: +** cntd x0, vl4 +** ret +*/ +PROTO (cntd_vl4, uint64_t, ()) { return svcntd_pat (SV_VL4); } + +/* +** cntd_vl5: +** cntd x0, vl5 +** ret +*/ +PROTO (cntd_vl5, uint64_t, ()) { return svcntd_pat (SV_VL5); } + +/* +** cntd_vl6: +** cntd x0, vl6 +** ret +*/ +PROTO (cntd_vl6, uint64_t, ()) { return svcntd_pat (SV_VL6); } + +/* +** cntd_vl7: +** cntd x0, vl7 +** ret +*/ +PROTO (cntd_vl7, uint64_t, ()) { return svcntd_pat (SV_VL7); } + +/* +** cntd_vl8: +** cntd x0, vl8 +** ret +*/ +PROTO (cntd_vl8, uint64_t, ()) { return svcntd_pat (SV_VL8); } + +/* +** cntd_vl16: +** cntd x0, vl16 +** ret +*/ +PROTO (cntd_vl16, uint64_t, ()) { return svcntd_pat (SV_VL16); } + +/* +** cntd_vl32: +** cntd x0, vl32 +** ret +*/ +PROTO (cntd_vl32, uint64_t, ()) { return svcntd_pat (SV_VL32); } + +/* +** cntd_vl64: +** cntd x0, vl64 +** ret +*/ +PROTO (cntd_vl64, uint64_t, ()) { return svcntd_pat (SV_VL64); } + +/* +** cntd_vl128: +** cntd x0, vl128 +** ret +*/ +PROTO (cntd_vl128, uint64_t, ()) { return svcntd_pat (SV_VL128); } + +/* +** cntd_vl256: +** cntd x0, vl256 +** ret +*/ +PROTO (cntd_vl256, uint64_t, ()) { return svcntd_pat (SV_VL256); } + +/* +** cntd_mul3: +** cntd x0, mul3 +** ret +*/ +PROTO (cntd_mul3, uint64_t, ()) { return svcntd_pat (SV_MUL3); } + +/* +** cntd_mul4: +** cntd x0, mul4 +** ret +*/ +PROTO (cntd_mul4, uint64_t, ()) { return svcntd_pat (SV_MUL4); } + +/* +** cntd_all: +** cntd x0 +** ret +*/ +PROTO (cntd_all, uint64_t, ()) { return svcntd_pat (SV_ALL); } + +/* +** incd_32_pow2: +** incd x0, pow2 +** ret +*/ +PROTO (incd_32_pow2, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_POW2); } + +/* +** incd_32_vl1: +** add w0, w0, #?1 +** ret +*/ +PROTO (incd_32_vl1, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL1); } + +/* +** incd_32_vl2: +** add w0, w0, #?2 +** ret +*/ +PROTO (incd_32_vl2, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL2); } + +/* +** incd_32_vl3: +** incd x0, vl3 +** ret +*/ +PROTO (incd_32_vl3, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL3); } + +/* +** incd_32_vl4: +** incd x0, vl4 +** ret +*/ +PROTO (incd_32_vl4, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL4); } + +/* +** incd_32_vl5: +** incd x0, vl5 +** ret +*/ +PROTO (incd_32_vl5, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL5); } + +/* +** incd_32_vl6: +** incd x0, vl6 +** ret +*/ +PROTO 
(incd_32_vl6, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL6); } + +/* +** incd_32_vl7: +** incd x0, vl7 +** ret +*/ +PROTO (incd_32_vl7, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL7); } + +/* +** incd_32_vl8: +** incd x0, vl8 +** ret +*/ +PROTO (incd_32_vl8, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL8); } + +/* +** incd_32_vl16: +** incd x0, vl16 +** ret +*/ +PROTO (incd_32_vl16, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL16); } + +/* +** incd_32_vl32: +** incd x0, vl32 +** ret +*/ +PROTO (incd_32_vl32, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL32); } + +/* +** incd_32_vl64: +** incd x0, vl64 +** ret +*/ +PROTO (incd_32_vl64, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL64); } + +/* +** incd_32_vl128: +** incd x0, vl128 +** ret +*/ +PROTO (incd_32_vl128, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL128); } + +/* +** incd_32_vl256: +** incd x0, vl256 +** ret +*/ +PROTO (incd_32_vl256, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_VL256); } + +/* +** incd_32_mul3: +** incd x0, mul3 +** ret +*/ +PROTO (incd_32_mul3, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_MUL3); } + +/* +** incd_32_mul4: +** incd x0, mul4 +** ret +*/ +PROTO (incd_32_mul4, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_MUL4); } + +/* +** incd_32_all: +** incd x0 +** ret +*/ +PROTO (incd_32_all, uint32_t, (uint32_t w0)) { return w0 + svcntd_pat (SV_ALL); } + +/* +** incd_64_pow2: +** incd x0, pow2 +** ret +*/ +PROTO (incd_64_pow2, uint64_t, (uint64_t x0)) { return x0 + svcntd_pat (SV_POW2); } + +/* +** incd_64_all: +** incd x0 +** ret +*/ +PROTO (incd_64_all, uint64_t, (uint64_t x0)) { return x0 + svcntd_pat (SV_ALL); } + +/* +** decd_32_pow2: +** decd x0, pow2 +** ret +*/ +PROTO (decd_32_pow2, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_POW2); } + +/* +** decd_32_vl1: +** sub w0, w0, #?1 +** ret +*/ +PROTO (decd_32_vl1, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL1); } + +/* +** decd_32_vl2: +** sub w0, w0, #?2 +** ret +*/ +PROTO (decd_32_vl2, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL2); } + +/* +** decd_32_vl3: +** decd x0, vl3 +** ret +*/ +PROTO (decd_32_vl3, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL3); } + +/* +** decd_32_vl4: +** decd x0, vl4 +** ret +*/ +PROTO (decd_32_vl4, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL4); } + +/* +** decd_32_vl5: +** decd x0, vl5 +** ret +*/ +PROTO (decd_32_vl5, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL5); } + +/* +** decd_32_vl6: +** decd x0, vl6 +** ret +*/ +PROTO (decd_32_vl6, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL6); } + +/* +** decd_32_vl7: +** decd x0, vl7 +** ret +*/ +PROTO (decd_32_vl7, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL7); } + +/* +** decd_32_vl8: +** decd x0, vl8 +** ret +*/ +PROTO (decd_32_vl8, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL8); } + +/* +** decd_32_vl16: +** decd x0, vl16 +** ret +*/ +PROTO (decd_32_vl16, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL16); } + +/* +** decd_32_vl32: +** decd x0, vl32 +** ret +*/ +PROTO (decd_32_vl32, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL32); } + +/* +** decd_32_vl64: +** decd x0, vl64 +** ret +*/ +PROTO (decd_32_vl64, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL64); } + +/* +** decd_32_vl128: +** decd x0, vl128 +** ret +*/ +PROTO (decd_32_vl128, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL128); } + +/* +** decd_32_vl256: +** decd x0, vl256 
+** ret +*/ +PROTO (decd_32_vl256, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_VL256); } + +/* +** decd_32_mul3: +** decd x0, mul3 +** ret +*/ +PROTO (decd_32_mul3, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_MUL3); } + +/* +** decd_32_mul4: +** decd x0, mul4 +** ret +*/ +PROTO (decd_32_mul4, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_MUL4); } + +/* +** decd_32_all: +** decd x0 +** ret +*/ +PROTO (decd_32_all, uint32_t, (uint32_t w0)) { return w0 - svcntd_pat (SV_ALL); } + +/* +** decd_64_pow2: +** decd x0, pow2 +** ret +*/ +PROTO (decd_64_pow2, uint64_t, (uint64_t x0)) { return x0 - svcntd_pat (SV_POW2); } + +/* +** decd_64_all: +** decd x0 +** ret +*/ +PROTO (decd_64_all, uint64_t, (uint64_t x0)) { return x0 - svcntd_pat (SV_ALL); } + +/* +** incd_s64_pow2_z0: +** incd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (incd_s64_pow2_z0, svint64_t, + z0 = svadd_n_s64_x (svptrue_b64 (), z0, svcntd_pat (SV_POW2)), + z0 = svadd_x (svptrue_b64 (), z0, svcntd_pat (SV_POW2))); + +/* +** incd_s64_pow2_z1: +** movprfx z0, z1 +** incd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (incd_s64_pow2_z1, svint64_t, + z0 = svadd_n_s64_x (svptrue_b64 (), z1, svcntd_pat (SV_POW2)), + z0 = svadd_x (svptrue_b64 (), z1, svcntd_pat (SV_POW2))); + +/* +** decd_s64_pow2_z0: +** decd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (decd_s64_pow2_z0, svint64_t, + z0 = svsub_n_s64_x (svptrue_b64 (), z0, svcntd_pat (SV_POW2)), + z0 = svsub_x (svptrue_b64 (), z0, svcntd_pat (SV_POW2))); + +/* +** decd_s64_pow2_z1: +** movprfx z0, z1 +** decd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (decd_s64_pow2_z1, svint64_t, + z0 = svsub_n_s64_x (svptrue_b64 (), z1, svcntd_pat (SV_POW2)), + z0 = svsub_x (svptrue_b64 (), z1, svcntd_pat (SV_POW2))); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c new file mode 100644 index 000000000..c29930f15 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c @@ -0,0 +1,280 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnth_1: +** cnth x0 +** ret +*/ +PROTO (cnth_1, uint64_t, ()) { return svcnth (); } + +/* +** cnth_2: +** cntb x0 +** ret +*/ +PROTO (cnth_2, uint64_t, ()) { return svcnth () * 2; } + +/* +** cnth_3: +** cnth x0, all, mul #3 +** ret +*/ +PROTO (cnth_3, uint64_t, ()) { return svcnth () * 3; } + +/* +** cnth_4: +** cntb x0, all, mul #2 +** ret +*/ +PROTO (cnth_4, uint64_t, ()) { return svcnth () * 4; } + +/* +** cnth_8: +** cntb x0, all, mul #4 +** ret +*/ +PROTO (cnth_8, uint64_t, ()) { return svcnth () * 8; } + +/* +** cnth_15: +** cnth x0, all, mul #15 +** ret +*/ +PROTO (cnth_15, uint64_t, ()) { return svcnth () * 15; } + +/* +** cnth_16: +** cntb x0, all, mul #8 +** ret +*/ +PROTO (cnth_16, uint64_t, ()) { return svcnth () * 16; } + +/* Other sequences would be OK. 
*/ +/* +** cnth_17: +** cntb x0, all, mul #8 +** inch x0 +** ret +*/ +PROTO (cnth_17, uint64_t, ()) { return svcnth () * 17; } + +/* +** cnth_32: +** cntb x0, all, mul #16 +** ret +*/ +PROTO (cnth_32, uint64_t, ()) { return svcnth () * 32; } + +/* +** cnth_64: +** cntd (x[0-9]+) +** lsl x0, \1, 8 +** ret +*/ +PROTO (cnth_64, uint64_t, ()) { return svcnth () * 64; } + +/* +** cnth_128: +** cntd (x[0-9]+) +** lsl x0, \1, 9 +** ret +*/ +PROTO (cnth_128, uint64_t, ()) { return svcnth () * 128; } + +/* +** cnth_m1: +** cnth (x[0-9]+) +** neg x0, \1 +** ret +*/ +PROTO (cnth_m1, uint64_t, ()) { return -svcnth (); } + +/* +** cnth_m13: +** cnth (x[0-9]+), all, mul #13 +** neg x0, \1 +** ret +*/ +PROTO (cnth_m13, uint64_t, ()) { return -svcnth () * 13; } + +/* +** cnth_m15: +** cnth (x[0-9]+), all, mul #15 +** neg x0, \1 +** ret +*/ +PROTO (cnth_m15, uint64_t, ()) { return -svcnth () * 15; } + +/* +** cnth_m16: +** cntb (x[0-9]+), all, mul #8 +** neg x0, \1 +** ret +*/ +PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; } + +/* Other sequences would be OK. */ +/* +** cnth_m17: +** cntb x0, all, mul #8 +** inch x0 +** neg x0, x0 +** ret +*/ +PROTO (cnth_m17, uint64_t, ()) { return -svcnth () * 17; } + +/* +** inch_1: +** inch x0 +** ret +*/ +PROTO (inch_1, uint64_t, (uint64_t x0)) { return x0 + svcnth (); } + +/* +** inch_2: +** incb x0 +** ret +*/ +PROTO (inch_2, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 2; } + +/* +** inch_3: +** inch x0, all, mul #3 +** ret +*/ +PROTO (inch_3, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 3; } + +/* +** inch_4: +** incb x0, all, mul #2 +** ret +*/ +PROTO (inch_4, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 4; } + +/* +** inch_7: +** inch x0, all, mul #7 +** ret +*/ +PROTO (inch_7, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 7; } + +/* +** inch_8: +** incb x0, all, mul #4 +** ret +*/ +PROTO (inch_8, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 8; } + +/* +** inch_9: +** inch x0, all, mul #9 +** ret +*/ +PROTO (inch_9, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 9; } + +/* +** inch_15: +** inch x0, all, mul #15 +** ret +*/ +PROTO (inch_15, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 15; } + +/* +** inch_16: +** incb x0, all, mul #8 +** ret +*/ +PROTO (inch_16, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 16; } + +/* +** inch_18: +** incb x0, all, mul #9 +** ret +*/ +PROTO (inch_18, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 18; } + +/* +** inch_30: +** incb x0, all, mul #15 +** ret +*/ +PROTO (inch_30, uint64_t, (uint64_t x0)) { return x0 + svcnth () * 30; } + +/* +** dech_1: +** dech x0 +** ret +*/ +PROTO (dech_1, uint64_t, (uint64_t x0)) { return x0 - svcnth (); } + +/* +** dech_2: +** decb x0 +** ret +*/ +PROTO (dech_2, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 2; } + +/* +** dech_3: +** dech x0, all, mul #3 +** ret +*/ +PROTO (dech_3, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 3; } + +/* +** dech_4: +** decb x0, all, mul #2 +** ret +*/ +PROTO (dech_4, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 4; } + +/* +** dech_7: +** dech x0, all, mul #7 +** ret +*/ +PROTO (dech_7, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 7; } + +/* +** dech_8: +** decb x0, all, mul #4 +** ret +*/ +PROTO (dech_8, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 8; } + +/* +** dech_9: +** dech x0, all, mul #9 +** ret +*/ +PROTO (dech_9, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 9; } + +/* +** dech_15: +** dech x0, all, mul #15 +** ret +*/ +PROTO (dech_15, uint64_t, (uint64_t x0)) 
{ return x0 - svcnth () * 15; } + +/* +** dech_16: +** decb x0, all, mul #8 +** ret +*/ +PROTO (dech_16, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 16; } + +/* +** dech_18: +** decb x0, all, mul #9 +** ret +*/ +PROTO (dech_18, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 18; } + +/* +** dech_30: +** decb x0, all, mul #15 +** ret +*/ +PROTO (dech_30, uint64_t, (uint64_t x0)) { return x0 - svcnth () * 30; } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth_pat.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth_pat.c new file mode 100644 index 000000000..7a42e7ad9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth_pat.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cnth_pow2: +** cnth x0, pow2 +** ret +*/ +PROTO (cnth_pow2, uint64_t, ()) { return svcnth_pat (SV_POW2); } + +/* +** cnth_vl1: +** mov x0, #?1 +** ret +*/ +PROTO (cnth_vl1, uint64_t, ()) { return svcnth_pat (SV_VL1); } + +/* +** cnth_vl2: +** mov x0, #?2 +** ret +*/ +PROTO (cnth_vl2, uint64_t, ()) { return svcnth_pat (SV_VL2); } + +/* +** cnth_vl3: +** mov x0, #?3 +** ret +*/ +PROTO (cnth_vl3, uint64_t, ()) { return svcnth_pat (SV_VL3); } + +/* +** cnth_vl4: +** mov x0, #?4 +** ret +*/ +PROTO (cnth_vl4, uint64_t, ()) { return svcnth_pat (SV_VL4); } + +/* +** cnth_vl5: +** mov x0, #?5 +** ret +*/ +PROTO (cnth_vl5, uint64_t, ()) { return svcnth_pat (SV_VL5); } + +/* +** cnth_vl6: +** mov x0, #?6 +** ret +*/ +PROTO (cnth_vl6, uint64_t, ()) { return svcnth_pat (SV_VL6); } + +/* +** cnth_vl7: +** mov x0, #?7 +** ret +*/ +PROTO (cnth_vl7, uint64_t, ()) { return svcnth_pat (SV_VL7); } + +/* +** cnth_vl8: +** mov x0, #?8 +** ret +*/ +PROTO (cnth_vl8, uint64_t, ()) { return svcnth_pat (SV_VL8); } + +/* +** cnth_vl16: +** cnth x0, vl16 +** ret +*/ +PROTO (cnth_vl16, uint64_t, ()) { return svcnth_pat (SV_VL16); } + +/* +** cnth_vl32: +** cnth x0, vl32 +** ret +*/ +PROTO (cnth_vl32, uint64_t, ()) { return svcnth_pat (SV_VL32); } + +/* +** cnth_vl64: +** cnth x0, vl64 +** ret +*/ +PROTO (cnth_vl64, uint64_t, ()) { return svcnth_pat (SV_VL64); } + +/* +** cnth_vl128: +** cnth x0, vl128 +** ret +*/ +PROTO (cnth_vl128, uint64_t, ()) { return svcnth_pat (SV_VL128); } + +/* +** cnth_vl256: +** cnth x0, vl256 +** ret +*/ +PROTO (cnth_vl256, uint64_t, ()) { return svcnth_pat (SV_VL256); } + +/* +** cnth_mul3: +** cnth x0, mul3 +** ret +*/ +PROTO (cnth_mul3, uint64_t, ()) { return svcnth_pat (SV_MUL3); } + +/* +** cnth_mul4: +** cnth x0, mul4 +** ret +*/ +PROTO (cnth_mul4, uint64_t, ()) { return svcnth_pat (SV_MUL4); } + +/* +** cnth_all: +** cnth x0 +** ret +*/ +PROTO (cnth_all, uint64_t, ()) { return svcnth_pat (SV_ALL); } + +/* +** inch_32_pow2: +** inch x0, pow2 +** ret +*/ +PROTO (inch_32_pow2, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_POW2); } + +/* +** inch_32_vl1: +** add w0, w0, #?1 +** ret +*/ +PROTO (inch_32_vl1, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL1); } + +/* +** inch_32_vl2: +** add w0, w0, #?2 +** ret +*/ +PROTO (inch_32_vl2, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL2); } + +/* +** inch_32_vl3: +** add w0, w0, #?3 +** ret +*/ +PROTO (inch_32_vl3, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL3); } + +/* +** inch_32_vl4: +** add w0, w0, #?4 +** ret +*/ +PROTO (inch_32_vl4, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL4); } + +/* +** inch_32_vl5: +** add w0, w0, #?5 +** ret +*/ +PROTO (inch_32_vl5, uint32_t, (uint32_t w0)) { return w0 + 
svcnth_pat (SV_VL5); } + +/* +** inch_32_vl6: +** add w0, w0, #?6 +** ret +*/ +PROTO (inch_32_vl6, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL6); } + +/* +** inch_32_vl7: +** add w0, w0, #?7 +** ret +*/ +PROTO (inch_32_vl7, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL7); } + +/* +** inch_32_vl8: +** add w0, w0, #?8 +** ret +*/ +PROTO (inch_32_vl8, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL8); } + +/* +** inch_32_vl16: +** inch x0, vl16 +** ret +*/ +PROTO (inch_32_vl16, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL16); } + +/* +** inch_32_vl32: +** inch x0, vl32 +** ret +*/ +PROTO (inch_32_vl32, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL32); } + +/* +** inch_32_vl64: +** inch x0, vl64 +** ret +*/ +PROTO (inch_32_vl64, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL64); } + +/* +** inch_32_vl128: +** inch x0, vl128 +** ret +*/ +PROTO (inch_32_vl128, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL128); } + +/* +** inch_32_vl256: +** inch x0, vl256 +** ret +*/ +PROTO (inch_32_vl256, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_VL256); } + +/* +** inch_32_mul3: +** inch x0, mul3 +** ret +*/ +PROTO (inch_32_mul3, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_MUL3); } + +/* +** inch_32_mul4: +** inch x0, mul4 +** ret +*/ +PROTO (inch_32_mul4, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_MUL4); } + +/* +** inch_32_all: +** inch x0 +** ret +*/ +PROTO (inch_32_all, uint32_t, (uint32_t w0)) { return w0 + svcnth_pat (SV_ALL); } + +/* +** inch_64_pow2: +** inch x0, pow2 +** ret +*/ +PROTO (inch_64_pow2, uint64_t, (uint64_t x0)) { return x0 + svcnth_pat (SV_POW2); } + +/* +** inch_64_all: +** inch x0 +** ret +*/ +PROTO (inch_64_all, uint64_t, (uint64_t x0)) { return x0 + svcnth_pat (SV_ALL); } + +/* +** dech_32_pow2: +** dech x0, pow2 +** ret +*/ +PROTO (dech_32_pow2, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_POW2); } + +/* +** dech_32_vl1: +** sub w0, w0, #?1 +** ret +*/ +PROTO (dech_32_vl1, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL1); } + +/* +** dech_32_vl2: +** sub w0, w0, #?2 +** ret +*/ +PROTO (dech_32_vl2, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL2); } + +/* +** dech_32_vl3: +** sub w0, w0, #?3 +** ret +*/ +PROTO (dech_32_vl3, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL3); } + +/* +** dech_32_vl4: +** sub w0, w0, #?4 +** ret +*/ +PROTO (dech_32_vl4, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL4); } + +/* +** dech_32_vl5: +** sub w0, w0, #?5 +** ret +*/ +PROTO (dech_32_vl5, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL5); } + +/* +** dech_32_vl6: +** sub w0, w0, #?6 +** ret +*/ +PROTO (dech_32_vl6, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL6); } + +/* +** dech_32_vl7: +** sub w0, w0, #?7 +** ret +*/ +PROTO (dech_32_vl7, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL7); } + +/* +** dech_32_vl8: +** sub w0, w0, #?8 +** ret +*/ +PROTO (dech_32_vl8, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL8); } + +/* +** dech_32_vl16: +** dech x0, vl16 +** ret +*/ +PROTO (dech_32_vl16, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL16); } + +/* +** dech_32_vl32: +** dech x0, vl32 +** ret +*/ +PROTO (dech_32_vl32, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL32); } + +/* +** dech_32_vl64: +** dech x0, vl64 +** ret +*/ +PROTO (dech_32_vl64, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL64); } + +/* +** dech_32_vl128: +** dech x0, vl128 +** ret +*/ +PROTO 
(dech_32_vl128, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL128); } + +/* +** dech_32_vl256: +** dech x0, vl256 +** ret +*/ +PROTO (dech_32_vl256, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_VL256); } + +/* +** dech_32_mul3: +** dech x0, mul3 +** ret +*/ +PROTO (dech_32_mul3, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_MUL3); } + +/* +** dech_32_mul4: +** dech x0, mul4 +** ret +*/ +PROTO (dech_32_mul4, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_MUL4); } + +/* +** dech_32_all: +** dech x0 +** ret +*/ +PROTO (dech_32_all, uint32_t, (uint32_t w0)) { return w0 - svcnth_pat (SV_ALL); } + +/* +** dech_64_pow2: +** dech x0, pow2 +** ret +*/ +PROTO (dech_64_pow2, uint64_t, (uint64_t x0)) { return x0 - svcnth_pat (SV_POW2); } + +/* +** dech_64_all: +** dech x0 +** ret +*/ +PROTO (dech_64_all, uint64_t, (uint64_t x0)) { return x0 - svcnth_pat (SV_ALL); } + +/* +** inch_s16_pow2_z0: +** inch z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (inch_s16_pow2_z0, svint16_t, + z0 = svadd_n_s16_x (svptrue_b16 (), z0, svcnth_pat (SV_POW2)), + z0 = svadd_x (svptrue_b16 (), z0, svcnth_pat (SV_POW2))); + +/* +** inch_s16_pow2_z1: +** movprfx z0, z1 +** inch z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (inch_s16_pow2_z1, svint16_t, + z0 = svadd_n_s16_x (svptrue_b16 (), z1, svcnth_pat (SV_POW2)), + z0 = svadd_x (svptrue_b16 (), z1, svcnth_pat (SV_POW2))); + +/* +** dech_s16_pow2_z0: +** dech z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (dech_s16_pow2_z0, svint16_t, + z0 = svsub_n_s16_x (svptrue_b16 (), z0, svcnth_pat (SV_POW2)), + z0 = svsub_x (svptrue_b16 (), z0, svcnth_pat (SV_POW2))); + +/* +** dech_s16_pow2_z1: +** movprfx z0, z1 +** dech z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (dech_s16_pow2_z1, svint16_t, + z0 = svsub_n_s16_x (svptrue_b16 (), z1, svcnth_pat (SV_POW2)), + z0 = svsub_x (svptrue_b16 (), z1, svcnth_pat (SV_POW2))); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b16.c new file mode 100644 index 000000000..d88b9e5f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b16.c @@ -0,0 +1,243 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" +#include + +/* +** cnt_b16_32: +** cntp x0, p0, p1\.h +** ret +*/ +TEST_PTEST (cnt_b16_32, uint32_t, + x0 = svcntp_b16 (p0, p1)); + +/* +** cnt_b16_64: +** cntp x0, p0, p1\.h +** ret +*/ +TEST_PTEST (cnt_b16_64, uint64_t, + x0 = svcntp_b16 (p0, p1)); + +/* +** inc_b16_32_general_x0: +** cntp x([0-9]+), p0, p1\.h +** add w0, (w0, w\1|w\1, w0) +** ret +*/ +TEST_PTEST (inc_b16_32_general_x0, uint32_t, + x0 += svcntp_b16 (p0, p1)); + +/* +** inc_b16_32_general_x1: +** cntp x([0-9]+), p0, p1\.h +** add w0, (w1, w\1|w\1, w1) +** ret +*/ +TEST_PTEST (inc_b16_32_general_x1, uint32_t, + x0 = x1 + svcntp_b16 (p0, p1)); + +/* +** inc_b16_32_ptrue_x0: +** incp x0, p1\.h +** ret +*/ +TEST_PTEST (inc_b16_32_ptrue_x0, uint32_t, + x0 += svcntp_b16 (svptrue_b16 (), p1)); + +/* +** inc_b16_32_ptrue_x1: +** mov w0, w1 +** incp x0, p1\.h +** ret +*/ +TEST_PTEST (inc_b16_32_ptrue_x1, uint32_t, + x0 = x1 + svcntp_b16 (svptrue_b16 (), p1)); + +/* +** inc_b16_64_general_x0: +** cntp (x[0-9]+), p0, p1\.h +** add x0, (x0, \1|\1, x0) +** ret +*/ +TEST_PTEST (inc_b16_64_general_x0, uint64_t, + x0 += svcntp_b16 (p0, p1)); + +/* +** inc_b16_64_general_x1: +** cntp (x[0-9]+), p0, p1\.h +** add x0, (x1, \1|\1, x1) +** ret +*/ +TEST_PTEST (inc_b16_64_general_x1, 
uint64_t, + x0 = x1 + svcntp_b16 (p0, p1)); + +/* +** inc_b16_64_ptrue_x0: +** incp x0, p1\.h +** ret +*/ +TEST_PTEST (inc_b16_64_ptrue_x0, uint64_t, + x0 += svcntp_b16 (svptrue_b16 (), p1)); + +/* +** inc_b16_64_ptrue_x1: +** mov x0, x1 +** incp x0, p1\.h +** ret +*/ +TEST_PTEST (inc_b16_64_ptrue_x1, uint64_t, + x0 = x1 + svcntp_b16 (svptrue_b16 (), p1)); + +/* +** dec_b16_32_general_x0: +** cntp x([0-9]+), p0, p1\.h +** sub w0, w0, w\1 +** ret +*/ +TEST_PTEST (dec_b16_32_general_x0, uint32_t, + x0 -= svcntp_b16 (p0, p1)); + +/* +** dec_b16_32_general_x1: +** cntp x([0-9]+), p0, p1\.h +** sub w0, w1, w\1 +** ret +*/ +TEST_PTEST (dec_b16_32_general_x1, uint32_t, + x0 = x1 - svcntp_b16 (p0, p1)); + +/* +** dec_b16_32_ptrue_x0: +** decp x0, p1\.h +** ret +*/ +TEST_PTEST (dec_b16_32_ptrue_x0, uint32_t, + x0 -= svcntp_b16 (svptrue_b16 (), p1)); + +/* +** dec_b16_32_ptrue_x1: +** mov w0, w1 +** decp x0, p1\.h +** ret +*/ +TEST_PTEST (dec_b16_32_ptrue_x1, uint32_t, + x0 = x1 - svcntp_b16 (svptrue_b16 (), p1)); + +/* +** dec_b16_64_general_x0: +** cntp (x[0-9]+), p0, p1\.h +** sub x0, x0, \1 +** ret +*/ +TEST_PTEST (dec_b16_64_general_x0, uint64_t, + x0 -= svcntp_b16 (p0, p1)); + +/* +** dec_b16_64_general_x1: +** cntp (x[0-9]+), p0, p1\.h +** sub x0, x1, \1 +** ret +*/ +TEST_PTEST (dec_b16_64_general_x1, uint64_t, + x0 = x1 - svcntp_b16 (p0, p1)); + +/* +** dec_b16_64_ptrue_x0: +** decp x0, p1\.h +** ret +*/ +TEST_PTEST (dec_b16_64_ptrue_x0, uint64_t, + x0 -= svcntp_b16 (svptrue_b16 (), p1)); + +/* +** dec_b16_64_ptrue_x1: +** mov x0, x1 +** decp x0, p1\.h +** ret +*/ +TEST_PTEST (dec_b16_64_ptrue_x1, uint64_t, + x0 = x1 - svcntp_b16 (svptrue_b16 (), p1)); + +/* +** inc_b16_u16_general_z0: +** cntp x([0-9]+), p0, p1\.h +** mov (z[0-9]+\.h), w\1 +** add z0\.h, (z0\.h, \2|\2, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (inc_b16_u16_general_z0, svuint16_t, + z0 = svadd_n_u16_x (svptrue_b16 (), z0, svcntp_b16 (p0, p1)), + z0 = svadd_x (svptrue_b16 (), z0, svcntp_b16 (p0, p1))); + +/* +** inc_b16_u16_general_z1: +** cntp x([0-9]+), p0, p1\.h +** mov (z[0-9]+\.h), w\1 +** add z0\.h, (z1\.h, \2|\2, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (inc_b16_u16_general_z1, svuint16_t, + z0 = svadd_n_u16_x (svptrue_b16 (), z1, svcntp_b16 (p0, p1)), + z0 = svadd_x (svptrue_b16 (), z1, svcntp_b16 (p0, p1))); + +/* +** inc_b16_u16_ptrue_z0: +** incp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (inc_b16_u16_ptrue_z0, svuint16_t, + z0 = svadd_n_u16_x (svptrue_b16 (), z0, svcntp_b16 (svptrue_b16 (), p0)), + z0 = svadd_x (svptrue_b16 (), z0, svcntp_b16 (svptrue_b16 (), p0))); + +/* +** inc_b16_u16_ptrue_z1: +** movprfx z0, z1 +** incp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (inc_b16_u16_ptrue_z1, svuint16_t, + z0 = svadd_n_u16_x (svptrue_b16 (), z1, svcntp_b16 (svptrue_b16 (), p0)), + z0 = svadd_x (svptrue_b16 (), z1, svcntp_b16 (svptrue_b16 (), p0))); + +/* +** dec_b16_u16_general_z0: +** cntp x([0-9]+), p0, p1\.h +** mov (z[0-9]+\.h), w\1 +** sub z0\.h, z0\.h, \2 +** ret +*/ +TEST_UNIFORM_Z (dec_b16_u16_general_z0, svuint16_t, + z0 = svsub_n_u16_x (svptrue_b16 (), z0, svcntp_b16 (p0, p1)), + z0 = svsub_x (svptrue_b16 (), z0, svcntp_b16 (p0, p1))); + +/* +** dec_b16_u16_general_z1: +** cntp x([0-9]+), p0, p1\.h +** mov (z[0-9]+\.h), w\1 +** sub z0\.h, z1\.h, \2 +** ret +*/ +TEST_UNIFORM_Z (dec_b16_u16_general_z1, svuint16_t, + z0 = svsub_n_u16_x (svptrue_b16 (), z1, svcntp_b16 (p0, p1)), + z0 = svsub_x (svptrue_b16 (), z1, svcntp_b16 (p0, p1))); + +/* +** dec_b16_u16_ptrue_z0: +** decp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z 
(dec_b16_u16_ptrue_z0, svuint16_t, + z0 = svsub_n_u16_x (svptrue_b16 (), z0, svcntp_b16 (svptrue_b16 (), p0)), + z0 = svsub_x (svptrue_b16 (), z0, svcntp_b16 (svptrue_b16 (), p0))); + +/* +** dec_b16_u16_ptrue_z1: +** movprfx z0, z1 +** decp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (dec_b16_u16_ptrue_z1, svuint16_t, + z0 = svsub_n_u16_x (svptrue_b16 (), z1, svcntp_b16 (svptrue_b16 (), p0)), + z0 = svsub_x (svptrue_b16 (), z1, svcntp_b16 (svptrue_b16 (), p0))); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b32.c new file mode 100644 index 000000000..0da818895 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b32.c @@ -0,0 +1,243 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" +#include + +/* +** cnt_b32_32: +** cntp x0, p0, p1\.s +** ret +*/ +TEST_PTEST (cnt_b32_32, uint32_t, + x0 = svcntp_b32 (p0, p1)); + +/* +** cnt_b32_64: +** cntp x0, p0, p1\.s +** ret +*/ +TEST_PTEST (cnt_b32_64, uint64_t, + x0 = svcntp_b32 (p0, p1)); + +/* +** inc_b32_32_general_x0: +** cntp x([0-9]+), p0, p1\.s +** add w0, (w0, w\1|w\1, w0) +** ret +*/ +TEST_PTEST (inc_b32_32_general_x0, uint32_t, + x0 += svcntp_b32 (p0, p1)); + +/* +** inc_b32_32_general_x1: +** cntp x([0-9]+), p0, p1\.s +** add w0, (w1, w\1|w\1, w1) +** ret +*/ +TEST_PTEST (inc_b32_32_general_x1, uint32_t, + x0 = x1 + svcntp_b32 (p0, p1)); + +/* +** inc_b32_32_ptrue_x0: +** incp x0, p1\.s +** ret +*/ +TEST_PTEST (inc_b32_32_ptrue_x0, uint32_t, + x0 += svcntp_b32 (svptrue_b32 (), p1)); + +/* +** inc_b32_32_ptrue_x1: +** mov w0, w1 +** incp x0, p1\.s +** ret +*/ +TEST_PTEST (inc_b32_32_ptrue_x1, uint32_t, + x0 = x1 + svcntp_b32 (svptrue_b32 (), p1)); + +/* +** inc_b32_64_general_x0: +** cntp (x[0-9]+), p0, p1\.s +** add x0, (x0, \1|\1, x0) +** ret +*/ +TEST_PTEST (inc_b32_64_general_x0, uint64_t, + x0 += svcntp_b32 (p0, p1)); + +/* +** inc_b32_64_general_x1: +** cntp (x[0-9]+), p0, p1\.s +** add x0, (x1, \1|\1, x1) +** ret +*/ +TEST_PTEST (inc_b32_64_general_x1, uint64_t, + x0 = x1 + svcntp_b32 (p0, p1)); + +/* +** inc_b32_64_ptrue_x0: +** incp x0, p1\.s +** ret +*/ +TEST_PTEST (inc_b32_64_ptrue_x0, uint64_t, + x0 += svcntp_b32 (svptrue_b32 (), p1)); + +/* +** inc_b32_64_ptrue_x1: +** mov x0, x1 +** incp x0, p1\.s +** ret +*/ +TEST_PTEST (inc_b32_64_ptrue_x1, uint64_t, + x0 = x1 + svcntp_b32 (svptrue_b32 (), p1)); + +/* +** dec_b32_32_general_x0: +** cntp x([0-9]+), p0, p1\.s +** sub w0, w0, w\1 +** ret +*/ +TEST_PTEST (dec_b32_32_general_x0, uint32_t, + x0 -= svcntp_b32 (p0, p1)); + +/* +** dec_b32_32_general_x1: +** cntp x([0-9]+), p0, p1\.s +** sub w0, w1, w\1 +** ret +*/ +TEST_PTEST (dec_b32_32_general_x1, uint32_t, + x0 = x1 - svcntp_b32 (p0, p1)); + +/* +** dec_b32_32_ptrue_x0: +** decp x0, p1\.s +** ret +*/ +TEST_PTEST (dec_b32_32_ptrue_x0, uint32_t, + x0 -= svcntp_b32 (svptrue_b32 (), p1)); + +/* +** dec_b32_32_ptrue_x1: +** mov w0, w1 +** decp x0, p1\.s +** ret +*/ +TEST_PTEST (dec_b32_32_ptrue_x1, uint32_t, + x0 = x1 - svcntp_b32 (svptrue_b32 (), p1)); + +/* +** dec_b32_64_general_x0: +** cntp (x[0-9]+), p0, p1\.s +** sub x0, x0, \1 +** ret +*/ +TEST_PTEST (dec_b32_64_general_x0, uint64_t, + x0 -= svcntp_b32 (p0, p1)); + +/* +** dec_b32_64_general_x1: +** cntp (x[0-9]+), p0, p1\.s +** sub x0, x1, \1 +** ret +*/ +TEST_PTEST (dec_b32_64_general_x1, uint64_t, + x0 = x1 - svcntp_b32 (p0, p1)); + +/* +** dec_b32_64_ptrue_x0: +** decp x0, 
p1\.s +** ret +*/ +TEST_PTEST (dec_b32_64_ptrue_x0, uint64_t, + x0 -= svcntp_b32 (svptrue_b32 (), p1)); + +/* +** dec_b32_64_ptrue_x1: +** mov x0, x1 +** decp x0, p1\.s +** ret +*/ +TEST_PTEST (dec_b32_64_ptrue_x1, uint64_t, + x0 = x1 - svcntp_b32 (svptrue_b32 (), p1)); + +/* +** inc_b32_s32_general_z0: +** cntp x([0-9]+), p0, p1\.s +** mov (z[0-9]+\.s), w\1 +** add z0\.s, (z0\.s, \2|\2, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (inc_b32_s32_general_z0, svint32_t, + z0 = svadd_n_s32_x (svptrue_b32 (), z0, svcntp_b32 (p0, p1)), + z0 = svadd_x (svptrue_b32 (), z0, svcntp_b32 (p0, p1))); + +/* +** inc_b32_s32_general_z1: +** cntp x([0-9]+), p0, p1\.s +** mov (z[0-9]+\.s), w\1 +** add z0\.s, (z1\.s, \2|\2, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (inc_b32_s32_general_z1, svint32_t, + z0 = svadd_n_s32_x (svptrue_b32 (), z1, svcntp_b32 (p0, p1)), + z0 = svadd_x (svptrue_b32 (), z1, svcntp_b32 (p0, p1))); + +/* +** inc_b32_s32_ptrue_z0: +** incp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (inc_b32_s32_ptrue_z0, svint32_t, + z0 = svadd_n_s32_x (svptrue_b32 (), z0, svcntp_b32 (svptrue_b32 (), p0)), + z0 = svadd_x (svptrue_b32 (), z0, svcntp_b32 (svptrue_b32 (), p0))); + +/* +** inc_b32_s32_ptrue_z1: +** movprfx z0, z1 +** incp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (inc_b32_s32_ptrue_z1, svint32_t, + z0 = svadd_n_s32_x (svptrue_b32 (), z1, svcntp_b32 (svptrue_b32 (), p0)), + z0 = svadd_x (svptrue_b32 (), z1, svcntp_b32 (svptrue_b32 (), p0))); + +/* +** dec_b32_s32_general_z0: +** cntp x([0-9]+), p0, p1\.s +** mov (z[0-9]+\.s), w\1 +** sub z0\.s, z0\.s, \2 +** ret +*/ +TEST_UNIFORM_Z (dec_b32_s32_general_z0, svint32_t, + z0 = svsub_n_s32_x (svptrue_b32 (), z0, svcntp_b32 (p0, p1)), + z0 = svsub_x (svptrue_b32 (), z0, svcntp_b32 (p0, p1))); + +/* +** dec_b32_s32_general_z1: +** cntp x([0-9]+), p0, p1\.s +** mov (z[0-9]+\.s), w\1 +** sub z0\.s, z1\.s, \2 +** ret +*/ +TEST_UNIFORM_Z (dec_b32_s32_general_z1, svint32_t, + z0 = svsub_n_s32_x (svptrue_b32 (), z1, svcntp_b32 (p0, p1)), + z0 = svsub_x (svptrue_b32 (), z1, svcntp_b32 (p0, p1))); + +/* +** dec_b32_s32_ptrue_z0: +** decp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (dec_b32_s32_ptrue_z0, svint32_t, + z0 = svsub_n_s32_x (svptrue_b32 (), z0, svcntp_b32 (svptrue_b32 (), p0)), + z0 = svsub_x (svptrue_b32 (), z0, svcntp_b32 (svptrue_b32 (), p0))); + +/* +** dec_b32_s32_ptrue_z1: +** movprfx z0, z1 +** decp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (dec_b32_s32_ptrue_z1, svint32_t, + z0 = svsub_n_s32_x (svptrue_b32 (), z1, svcntp_b32 (svptrue_b32 (), p0)), + z0 = svsub_x (svptrue_b32 (), z1, svcntp_b32 (svptrue_b32 (), p0))); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b64.c new file mode 100644 index 000000000..6ddbaef5a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b64.c @@ -0,0 +1,243 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" +#include + +/* +** cnt_b64_32: +** cntp x0, p0, p1\.d +** ret +*/ +TEST_PTEST (cnt_b64_32, uint32_t, + x0 = svcntp_b64 (p0, p1)); + +/* +** cnt_b64_64: +** cntp x0, p0, p1\.d +** ret +*/ +TEST_PTEST (cnt_b64_64, uint64_t, + x0 = svcntp_b64 (p0, p1)); + +/* +** inc_b64_32_general_x0: +** cntp x([0-9]+), p0, p1\.d +** add w0, (w0, w\1|w\1, w0) +** ret +*/ +TEST_PTEST (inc_b64_32_general_x0, uint32_t, + x0 += svcntp_b64 (p0, p1)); + +/* +** inc_b64_32_general_x1: +** cntp x([0-9]+), p0, p1\.d +** add w0, (w1, w\1|w\1, w1) +** ret +*/ 
+TEST_PTEST (inc_b64_32_general_x1, uint32_t, + x0 = x1 + svcntp_b64 (p0, p1)); + +/* +** inc_b64_32_ptrue_x0: +** incp x0, p1\.d +** ret +*/ +TEST_PTEST (inc_b64_32_ptrue_x0, uint32_t, + x0 += svcntp_b64 (svptrue_b64 (), p1)); + +/* +** inc_b64_32_ptrue_x1: +** mov w0, w1 +** incp x0, p1\.d +** ret +*/ +TEST_PTEST (inc_b64_32_ptrue_x1, uint32_t, + x0 = x1 + svcntp_b64 (svptrue_b64 (), p1)); + +/* +** inc_b64_64_general_x0: +** cntp (x[0-9]+), p0, p1\.d +** add x0, (x0, \1|\1, x0) +** ret +*/ +TEST_PTEST (inc_b64_64_general_x0, uint64_t, + x0 += svcntp_b64 (p0, p1)); + +/* +** inc_b64_64_general_x1: +** cntp (x[0-9]+), p0, p1\.d +** add x0, (x1, \1|\1, x1) +** ret +*/ +TEST_PTEST (inc_b64_64_general_x1, uint64_t, + x0 = x1 + svcntp_b64 (p0, p1)); + +/* +** inc_b64_64_ptrue_x0: +** incp x0, p1\.d +** ret +*/ +TEST_PTEST (inc_b64_64_ptrue_x0, uint64_t, + x0 += svcntp_b64 (svptrue_b64 (), p1)); + +/* +** inc_b64_64_ptrue_x1: +** mov x0, x1 +** incp x0, p1\.d +** ret +*/ +TEST_PTEST (inc_b64_64_ptrue_x1, uint64_t, + x0 = x1 + svcntp_b64 (svptrue_b64 (), p1)); + +/* +** dec_b64_32_general_x0: +** cntp x([0-9]+), p0, p1\.d +** sub w0, w0, w\1 +** ret +*/ +TEST_PTEST (dec_b64_32_general_x0, uint32_t, + x0 -= svcntp_b64 (p0, p1)); + +/* +** dec_b64_32_general_x1: +** cntp x([0-9]+), p0, p1\.d +** sub w0, w1, w\1 +** ret +*/ +TEST_PTEST (dec_b64_32_general_x1, uint32_t, + x0 = x1 - svcntp_b64 (p0, p1)); + +/* +** dec_b64_32_ptrue_x0: +** decp x0, p1\.d +** ret +*/ +TEST_PTEST (dec_b64_32_ptrue_x0, uint32_t, + x0 -= svcntp_b64 (svptrue_b64 (), p1)); + +/* +** dec_b64_32_ptrue_x1: +** mov w0, w1 +** decp x0, p1\.d +** ret +*/ +TEST_PTEST (dec_b64_32_ptrue_x1, uint32_t, + x0 = x1 - svcntp_b64 (svptrue_b64 (), p1)); + +/* +** dec_b64_64_general_x0: +** cntp (x[0-9]+), p0, p1\.d +** sub x0, x0, \1 +** ret +*/ +TEST_PTEST (dec_b64_64_general_x0, uint64_t, + x0 -= svcntp_b64 (p0, p1)); + +/* +** dec_b64_64_general_x1: +** cntp (x[0-9]+), p0, p1\.d +** sub x0, x1, \1 +** ret +*/ +TEST_PTEST (dec_b64_64_general_x1, uint64_t, + x0 = x1 - svcntp_b64 (p0, p1)); + +/* +** dec_b64_64_ptrue_x0: +** decp x0, p1\.d +** ret +*/ +TEST_PTEST (dec_b64_64_ptrue_x0, uint64_t, + x0 -= svcntp_b64 (svptrue_b64 (), p1)); + +/* +** dec_b64_64_ptrue_x1: +** mov x0, x1 +** decp x0, p1\.d +** ret +*/ +TEST_PTEST (dec_b64_64_ptrue_x1, uint64_t, + x0 = x1 - svcntp_b64 (svptrue_b64 (), p1)); + +/* +** inc_b64_u64_general_z0: +** cntp (x[0-9]+), p0, p1\.d +** mov (z[0-9]+\.d), \1 +** add z0\.d, (z0\.d, \2|\2, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (inc_b64_u64_general_z0, svuint64_t, + z0 = svadd_n_u64_x (svptrue_b64 (), z0, svcntp_b64 (p0, p1)), + z0 = svadd_x (svptrue_b64 (), z0, svcntp_b64 (p0, p1))); + +/* +** inc_b64_u64_general_z1: +** cntp (x[0-9]+), p0, p1\.d +** mov (z[0-9]+\.d), \1 +** add z0\.d, (z1\.d, \2|\2, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (inc_b64_u64_general_z1, svuint64_t, + z0 = svadd_n_u64_x (svptrue_b64 (), z1, svcntp_b64 (p0, p1)), + z0 = svadd_x (svptrue_b64 (), z1, svcntp_b64 (p0, p1))); + +/* +** inc_b64_u64_ptrue_z0: +** incp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (inc_b64_u64_ptrue_z0, svuint64_t, + z0 = svadd_n_u64_x (svptrue_b64 (), z0, svcntp_b64 (svptrue_b64 (), p0)), + z0 = svadd_x (svptrue_b64 (), z0, svcntp_b64 (svptrue_b64 (), p0))); + +/* +** inc_b64_u64_ptrue_z1: +** movprfx z0, z1 +** incp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (inc_b64_u64_ptrue_z1, svuint64_t, + z0 = svadd_n_u64_x (svptrue_b64 (), z1, svcntp_b64 (svptrue_b64 (), p0)), + z0 = svadd_x (svptrue_b64 (), z1, svcntp_b64 (svptrue_b64 (), 
p0))); + +/* +** dec_b64_u64_general_z0: +** cntp (x[0-9]+), p0, p1\.d +** mov (z[0-9]+\.d), \1 +** sub z0\.d, z0\.d, \2 +** ret +*/ +TEST_UNIFORM_Z (dec_b64_u64_general_z0, svuint64_t, + z0 = svsub_n_u64_x (svptrue_b64 (), z0, svcntp_b64 (p0, p1)), + z0 = svsub_x (svptrue_b64 (), z0, svcntp_b64 (p0, p1))); + +/* +** dec_b64_u64_general_z1: +** cntp (x[0-9]+), p0, p1\.d +** mov (z[0-9]+\.d), \1 +** sub z0\.d, z1\.d, \2 +** ret +*/ +TEST_UNIFORM_Z (dec_b64_u64_general_z1, svuint64_t, + z0 = svsub_n_u64_x (svptrue_b64 (), z1, svcntp_b64 (p0, p1)), + z0 = svsub_x (svptrue_b64 (), z1, svcntp_b64 (p0, p1))); + +/* +** dec_b64_u64_ptrue_z0: +** decp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (dec_b64_u64_ptrue_z0, svuint64_t, + z0 = svsub_n_u64_x (svptrue_b64 (), z0, svcntp_b64 (svptrue_b64 (), p0)), + z0 = svsub_x (svptrue_b64 (), z0, svcntp_b64 (svptrue_b64 (), p0))); + +/* +** dec_b64_u64_ptrue_z1: +** movprfx z0, z1 +** decp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (dec_b64_u64_ptrue_z1, svuint64_t, + z0 = svsub_n_u64_x (svptrue_b64 (), z1, svcntp_b64 (svptrue_b64 (), p0)), + z0 = svsub_x (svptrue_b64 (), z1, svcntp_b64 (svptrue_b64 (), p0))); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b8.c new file mode 100644 index 000000000..e02c02cd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntp_b8.c @@ -0,0 +1,253 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" +#include + +/* +** cnt_b8_32: +** cntp x0, p0, p1\.b +** ret +*/ +TEST_PTEST (cnt_b8_32, uint32_t, + x0 = svcntp_b8 (p0, p1)); + +/* +** cnt_b8_64: +** cntp x0, p0, p1\.b +** ret +*/ +TEST_PTEST (cnt_b8_64, uint64_t, + x0 = svcntp_b8 (p0, p1)); + +/* +** inc_b8_32_general_x0: +** cntp x([0-9]+), p0, p1\.b +** add w0, (w0, w\1|w\1, w0) +** ret +*/ +TEST_PTEST (inc_b8_32_general_x0, uint32_t, + x0 += svcntp_b8 (p0, p1)); + +/* +** inc_b8_32_general_x1: +** cntp x([0-9]+), p0, p1\.b +** add w0, (w1, w\1|w\1, w1) +** ret +*/ +TEST_PTEST (inc_b8_32_general_x1, uint32_t, + x0 = x1 + svcntp_b8 (p0, p1)); + +/* +** inc_b8_32_ptrue_x0: +** incp x0, p1\.b +** ret +*/ +TEST_PTEST (inc_b8_32_ptrue_x0, uint32_t, + x0 += svcntp_b8 (svptrue_b8 (), p1)); + +/* +** inc_b8_32_ptrue_x1: +** mov w0, w1 +** incp x0, p1\.b +** ret +*/ +TEST_PTEST (inc_b8_32_ptrue_x1, uint32_t, + x0 = x1 + svcntp_b8 (svptrue_b8 (), p1)); + +/* +** inc_b8_64_general_x0: +** cntp (x[0-9]+), p0, p1\.b +** add x0, (x0, \1|\1, x0) +** ret +*/ +TEST_PTEST (inc_b8_64_general_x0, uint64_t, + x0 += svcntp_b8 (p0, p1)); + +/* +** inc_b8_64_general_x1: +** cntp (x[0-9]+), p0, p1\.b +** add x0, (x1, \1|\1, x1) +** ret +*/ +TEST_PTEST (inc_b8_64_general_x1, uint64_t, + x0 = x1 + svcntp_b8 (p0, p1)); + +/* +** inc_b8_64_ptrue_x0: +** incp x0, p1\.b +** ret +*/ +TEST_PTEST (inc_b8_64_ptrue_x0, uint64_t, + x0 += svcntp_b8 (svptrue_b8 (), p1)); + +/* +** inc_b8_64_ptrue_x1: +** mov x0, x1 +** incp x0, p1\.b +** ret +*/ +TEST_PTEST (inc_b8_64_ptrue_x1, uint64_t, + x0 = x1 + svcntp_b8 (svptrue_b8 (), p1)); + +/* +** dec_b8_32_general_x0: +** cntp x([0-9]+), p0, p1\.b +** sub w0, w0, w\1 +** ret +*/ +TEST_PTEST (dec_b8_32_general_x0, uint32_t, + x0 -= svcntp_b8 (p0, p1)); + +/* +** dec_b8_32_general_x1: +** cntp x([0-9]+), p0, p1\.b +** sub w0, w1, w\1 +** ret +*/ +TEST_PTEST (dec_b8_32_general_x1, uint32_t, + x0 = x1 - svcntp_b8 (p0, p1)); + +/* +** dec_b8_32_ptrue_x0: +** decp x0, p1\.b +** ret 
+*/ +TEST_PTEST (dec_b8_32_ptrue_x0, uint32_t, + x0 -= svcntp_b8 (svptrue_b8 (), p1)); + +/* +** dec_b8_32_ptrue_x1: +** mov w0, w1 +** decp x0, p1\.b +** ret +*/ +TEST_PTEST (dec_b8_32_ptrue_x1, uint32_t, + x0 = x1 - svcntp_b8 (svptrue_b8 (), p1)); + +/* +** dec_b8_64_general_x0: +** cntp (x[0-9]+), p0, p1\.b +** sub x0, x0, \1 +** ret +*/ +TEST_PTEST (dec_b8_64_general_x0, uint64_t, + x0 -= svcntp_b8 (p0, p1)); + +/* +** dec_b8_64_general_x1: +** cntp (x[0-9]+), p0, p1\.b +** sub x0, x1, \1 +** ret +*/ +TEST_PTEST (dec_b8_64_general_x1, uint64_t, + x0 = x1 - svcntp_b8 (p0, p1)); + +/* +** dec_b8_64_ptrue_x0: +** decp x0, p1\.b +** ret +*/ +TEST_PTEST (dec_b8_64_ptrue_x0, uint64_t, + x0 -= svcntp_b8 (svptrue_b8 (), p1)); + +/* +** dec_b8_64_ptrue_x1: +** mov x0, x1 +** decp x0, p1\.b +** ret +*/ +TEST_PTEST (dec_b8_64_ptrue_x1, uint64_t, + x0 = x1 - svcntp_b8 (svptrue_b8 (), p1)); + +/* +** inc_b8_s8_general_z0: +** cntp x([0-9]+), p0, p1\.b +** mov (z[0-9]+\.b), w\1 +** add z0\.b, (z0\.b, \2|\2, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (inc_b8_s8_general_z0, svint8_t, + z0 = svadd_n_s8_x (svptrue_b8 (), z0, svcntp_b8 (p0, p1)), + z0 = svadd_x (svptrue_b8 (), z0, svcntp_b8 (p0, p1))); + +/* +** inc_b8_s8_general_z1: +** cntp x([0-9]+), p0, p1\.b +** mov (z[0-9]+\.b), w\1 +** add z0\.b, (z1\.b, \2|\2, z1\.b) +** ret +*/ +TEST_UNIFORM_Z (inc_b8_s8_general_z1, svint8_t, + z0 = svadd_n_s8_x (svptrue_b8 (), z1, svcntp_b8 (p0, p1)), + z0 = svadd_x (svptrue_b8 (), z1, svcntp_b8 (p0, p1))); + +/* +** inc_b8_s8_ptrue_z0: +** ptrue (p[0-7])\.b, all +** cntp x([0-9]+), \1, p0\.b +** mov (z[0-9]+\.b), w\2 +** add z0\.b, (z0\.b, \3|\3, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (inc_b8_s8_ptrue_z0, svint8_t, + z0 = svadd_n_s8_x (svptrue_b8 (), z0, svcntp_b8 (svptrue_b8 (), p0)), + z0 = svadd_x (svptrue_b8 (), z0, svcntp_b8 (svptrue_b8 (), p0))); + +/* +** inc_b8_s8_ptrue_z1: +** ptrue (p[0-7])\.b, all +** cntp x([0-9]+), \1, p0\.b +** mov (z[0-9]+\.b), w\2 +** add z0\.b, (z1\.b, \3|\3, z1\.b) +** ret +*/ +TEST_UNIFORM_Z (inc_b8_s8_ptrue_z1, svint8_t, + z0 = svadd_n_s8_x (svptrue_b8 (), z1, svcntp_b8 (svptrue_b8 (), p0)), + z0 = svadd_x (svptrue_b8 (), z1, svcntp_b8 (svptrue_b8 (), p0))); + +/* +** dec_b8_s8_general_z0: +** cntp x([0-9]+), p0, p1\.b +** mov (z[0-9]+\.b), w\1 +** sub z0\.b, z0\.b, \2 +** ret +*/ +TEST_UNIFORM_Z (dec_b8_s8_general_z0, svint8_t, + z0 = svsub_n_s8_x (svptrue_b8 (), z0, svcntp_b8 (p0, p1)), + z0 = svsub_x (svptrue_b8 (), z0, svcntp_b8 (p0, p1))); + +/* +** dec_b8_s8_general_z1: +** cntp x([0-9]+), p0, p1\.b +** mov (z[0-9]+\.b), w\1 +** sub z0\.b, z1\.b, \2 +** ret +*/ +TEST_UNIFORM_Z (dec_b8_s8_general_z1, svint8_t, + z0 = svsub_n_s8_x (svptrue_b8 (), z1, svcntp_b8 (p0, p1)), + z0 = svsub_x (svptrue_b8 (), z1, svcntp_b8 (p0, p1))); + +/* +** dec_b8_s8_ptrue_z0: +** ptrue (p[0-7])\.b, all +** cntp x([0-9]+), \1, p0\.b +** mov (z[0-9]+\.b), w\2 +** sub z0\.b, z0\.b, \3 +** ret +*/ +TEST_UNIFORM_Z (dec_b8_s8_ptrue_z0, svint8_t, + z0 = svsub_n_s8_x (svptrue_b8 (), z0, svcntp_b8 (svptrue_b8 (), p0)), + z0 = svsub_x (svptrue_b8 (), z0, svcntp_b8 (svptrue_b8 (), p0))); + +/* +** dec_b8_s8_ptrue_z1: +** ptrue (p[0-7])\.b, all +** cntp x([0-9]+), \1, p0\.b +** mov (z[0-9]+\.b), w\2 +** sub z0\.b, z1\.b, \3 +** ret +*/ +TEST_UNIFORM_Z (dec_b8_s8_ptrue_z1, svint8_t, + z0 = svsub_n_s8_x (svptrue_b8 (), z1, svcntp_b8 (svptrue_b8 (), p0)), + z0 = svsub_x (svptrue_b8 (), z1, svcntp_b8 (svptrue_b8 (), p0))); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c new file mode 100644 index 000000000..e26cc67a4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c @@ -0,0 +1,279 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cntw_1: +** cntw x0 +** ret +*/ +PROTO (cntw_1, uint64_t, ()) { return svcntw (); } + +/* +** cntw_2: +** cnth x0 +** ret +*/ +PROTO (cntw_2, uint64_t, ()) { return svcntw () * 2; } + +/* +** cntw_3: +** cntw x0, all, mul #3 +** ret +*/ +PROTO (cntw_3, uint64_t, ()) { return svcntw () * 3; } + +/* +** cntw_4: +** cntb x0 +** ret +*/ +PROTO (cntw_4, uint64_t, ()) { return svcntw () * 4; } + +/* +** cntw_8: +** cntb x0, all, mul #2 +** ret +*/ +PROTO (cntw_8, uint64_t, ()) { return svcntw () * 8; } + +/* +** cntw_15: +** cntw x0, all, mul #15 +** ret +*/ +PROTO (cntw_15, uint64_t, ()) { return svcntw () * 15; } + +/* +** cntw_16: +** cntb x0, all, mul #4 +** ret +*/ +PROTO (cntw_16, uint64_t, ()) { return svcntw () * 16; } + +/* Other sequences would be OK. */ +/* +** cntw_17: +** cntb x0, all, mul #4 +** incw x0 +** ret +*/ +PROTO (cntw_17, uint64_t, ()) { return svcntw () * 17; } + +/* +** cntw_32: +** cntb x0, all, mul #8 +** ret +*/ +PROTO (cntw_32, uint64_t, ()) { return svcntw () * 32; } + +/* +** cntw_64: +** cntb x0, all, mul #16 +** ret +*/ +PROTO (cntw_64, uint64_t, ()) { return svcntw () * 64; } + +/* +** cntw_128: +** cntd (x[0-9]+) +** lsl x0, \1, 8 +** ret +*/ +PROTO (cntw_128, uint64_t, ()) { return svcntw () * 128; } + +/* +** cntw_m1: +** cntw (x[0-9]+) +** neg x0, \1 +** ret +*/ +PROTO (cntw_m1, uint64_t, ()) { return -svcntw (); } + +/* +** cntw_m13: +** cntw (x[0-9]+), all, mul #13 +** neg x0, \1 +** ret +*/ +PROTO (cntw_m13, uint64_t, ()) { return -svcntw () * 13; } + +/* +** cntw_m15: +** cntw (x[0-9]+), all, mul #15 +** neg x0, \1 +** ret +*/ +PROTO (cntw_m15, uint64_t, ()) { return -svcntw () * 15; } + +/* +** cntw_m16: +** cntb (x[0-9]+), all, mul #4 +** neg x0, \1 +** ret +*/ +PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; } + +/* Other sequences would be OK. 
*/ +/* +** cntw_m17: +** cntb x0, all, mul #4 +** incw x0 +** neg x0, x0 +** ret +*/ +PROTO (cntw_m17, uint64_t, ()) { return -svcntw () * 17; } + +/* +** incw_1: +** incw x0 +** ret +*/ +PROTO (incw_1, uint64_t, (uint64_t x0)) { return x0 + svcntw (); } + +/* +** incw_2: +** inch x0 +** ret +*/ +PROTO (incw_2, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 2; } + +/* +** incw_3: +** incw x0, all, mul #3 +** ret +*/ +PROTO (incw_3, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 3; } + +/* +** incw_4: +** incb x0 +** ret +*/ +PROTO (incw_4, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 4; } + +/* +** incw_7: +** incw x0, all, mul #7 +** ret +*/ +PROTO (incw_7, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 7; } + +/* +** incw_8: +** incb x0, all, mul #2 +** ret +*/ +PROTO (incw_8, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 8; } + +/* +** incw_9: +** incw x0, all, mul #9 +** ret +*/ +PROTO (incw_9, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 9; } + +/* +** incw_15: +** incw x0, all, mul #15 +** ret +*/ +PROTO (incw_15, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 15; } + +/* +** incw_16: +** incb x0, all, mul #4 +** ret +*/ +PROTO (incw_16, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 16; } + +/* +** incw_18: +** inch x0, all, mul #9 +** ret +*/ +PROTO (incw_18, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 18; } + +/* +** incw_30: +** inch x0, all, mul #15 +** ret +*/ +PROTO (incw_30, uint64_t, (uint64_t x0)) { return x0 + svcntw () * 30; } + +/* +** decw_1: +** decw x0 +** ret +*/ +PROTO (decw_1, uint64_t, (uint64_t x0)) { return x0 - svcntw (); } + +/* +** decw_2: +** dech x0 +** ret +*/ +PROTO (decw_2, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 2; } + +/* +** decw_3: +** decw x0, all, mul #3 +** ret +*/ +PROTO (decw_3, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 3; } + +/* +** decw_4: +** decb x0 +** ret +*/ +PROTO (decw_4, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 4; } + +/* +** decw_7: +** decw x0, all, mul #7 +** ret +*/ +PROTO (decw_7, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 7; } + +/* +** decw_8: +** decb x0, all, mul #2 +** ret +*/ +PROTO (decw_8, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 8; } + +/* +** decw_9: +** decw x0, all, mul #9 +** ret +*/ +PROTO (decw_9, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 9; } + +/* +** decw_15: +** decw x0, all, mul #15 +** ret +*/ +PROTO (decw_15, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 15; } + +/* +** decw_16: +** decb x0, all, mul #4 +** ret +*/ +PROTO (decw_16, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 16; } + +/* +** decw_18: +** dech x0, all, mul #9 +** ret +*/ +PROTO (decw_18, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 18; } + +/* +** decw_30: +** dech x0, all, mul #15 +** ret +*/ +PROTO (decw_30, uint64_t, (uint64_t x0)) { return x0 - svcntw () * 30; } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw_pat.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw_pat.c new file mode 100644 index 000000000..ff6b7d882 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw_pat.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cntw_pow2: +** cntw x0, pow2 +** ret +*/ +PROTO (cntw_pow2, uint64_t, ()) { return svcntw_pat (SV_POW2); } + +/* +** cntw_vl1: +** mov x0, #?1 +** ret +*/ +PROTO (cntw_vl1, uint64_t, ()) { return svcntw_pat (SV_VL1); } + +/* +** cntw_vl2: +** mov x0, #?2 +** ret +*/ +PROTO (cntw_vl2, 
uint64_t, ()) { return svcntw_pat (SV_VL2); } + +/* +** cntw_vl3: +** mov x0, #?3 +** ret +*/ +PROTO (cntw_vl3, uint64_t, ()) { return svcntw_pat (SV_VL3); } + +/* +** cntw_vl4: +** mov x0, #?4 +** ret +*/ +PROTO (cntw_vl4, uint64_t, ()) { return svcntw_pat (SV_VL4); } + +/* +** cntw_vl5: +** cntw x0, vl5 +** ret +*/ +PROTO (cntw_vl5, uint64_t, ()) { return svcntw_pat (SV_VL5); } + +/* +** cntw_vl6: +** cntw x0, vl6 +** ret +*/ +PROTO (cntw_vl6, uint64_t, ()) { return svcntw_pat (SV_VL6); } + +/* +** cntw_vl7: +** cntw x0, vl7 +** ret +*/ +PROTO (cntw_vl7, uint64_t, ()) { return svcntw_pat (SV_VL7); } + +/* +** cntw_vl8: +** cntw x0, vl8 +** ret +*/ +PROTO (cntw_vl8, uint64_t, ()) { return svcntw_pat (SV_VL8); } + +/* +** cntw_vl16: +** cntw x0, vl16 +** ret +*/ +PROTO (cntw_vl16, uint64_t, ()) { return svcntw_pat (SV_VL16); } + +/* +** cntw_vl32: +** cntw x0, vl32 +** ret +*/ +PROTO (cntw_vl32, uint64_t, ()) { return svcntw_pat (SV_VL32); } + +/* +** cntw_vl64: +** cntw x0, vl64 +** ret +*/ +PROTO (cntw_vl64, uint64_t, ()) { return svcntw_pat (SV_VL64); } + +/* +** cntw_vl128: +** cntw x0, vl128 +** ret +*/ +PROTO (cntw_vl128, uint64_t, ()) { return svcntw_pat (SV_VL128); } + +/* +** cntw_vl256: +** cntw x0, vl256 +** ret +*/ +PROTO (cntw_vl256, uint64_t, ()) { return svcntw_pat (SV_VL256); } + +/* +** cntw_mul3: +** cntw x0, mul3 +** ret +*/ +PROTO (cntw_mul3, uint64_t, ()) { return svcntw_pat (SV_MUL3); } + +/* +** cntw_mul4: +** cntw x0, mul4 +** ret +*/ +PROTO (cntw_mul4, uint64_t, ()) { return svcntw_pat (SV_MUL4); } + +/* +** cntw_all: +** cntw x0 +** ret +*/ +PROTO (cntw_all, uint64_t, ()) { return svcntw_pat (SV_ALL); } + +/* +** incw_32_pow2: +** incw x0, pow2 +** ret +*/ +PROTO (incw_32_pow2, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_POW2); } + +/* +** incw_32_vl1: +** add w0, w0, #?1 +** ret +*/ +PROTO (incw_32_vl1, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL1); } + +/* +** incw_32_vl2: +** add w0, w0, #?2 +** ret +*/ +PROTO (incw_32_vl2, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL2); } + +/* +** incw_32_vl3: +** add w0, w0, #?3 +** ret +*/ +PROTO (incw_32_vl3, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL3); } + +/* +** incw_32_vl4: +** add w0, w0, #?4 +** ret +*/ +PROTO (incw_32_vl4, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL4); } + +/* +** incw_32_vl5: +** incw x0, vl5 +** ret +*/ +PROTO (incw_32_vl5, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL5); } + +/* +** incw_32_vl6: +** incw x0, vl6 +** ret +*/ +PROTO (incw_32_vl6, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL6); } + +/* +** incw_32_vl7: +** incw x0, vl7 +** ret +*/ +PROTO (incw_32_vl7, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL7); } + +/* +** incw_32_vl8: +** incw x0, vl8 +** ret +*/ +PROTO (incw_32_vl8, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL8); } + +/* +** incw_32_vl16: +** incw x0, vl16 +** ret +*/ +PROTO (incw_32_vl16, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL16); } + +/* +** incw_32_vl32: +** incw x0, vl32 +** ret +*/ +PROTO (incw_32_vl32, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL32); } + +/* +** incw_32_vl64: +** incw x0, vl64 +** ret +*/ +PROTO (incw_32_vl64, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL64); } + +/* +** incw_32_vl128: +** incw x0, vl128 +** ret +*/ +PROTO (incw_32_vl128, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_VL128); } + +/* +** incw_32_vl256: +** incw x0, vl256 +** ret +*/ +PROTO (incw_32_vl256, uint32_t, (uint32_t w0)) { 
return w0 + svcntw_pat (SV_VL256); } + +/* +** incw_32_mul3: +** incw x0, mul3 +** ret +*/ +PROTO (incw_32_mul3, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_MUL3); } + +/* +** incw_32_mul4: +** incw x0, mul4 +** ret +*/ +PROTO (incw_32_mul4, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_MUL4); } + +/* +** incw_32_all: +** incw x0 +** ret +*/ +PROTO (incw_32_all, uint32_t, (uint32_t w0)) { return w0 + svcntw_pat (SV_ALL); } + +/* +** incw_64_pow2: +** incw x0, pow2 +** ret +*/ +PROTO (incw_64_pow2, uint64_t, (uint64_t x0)) { return x0 + svcntw_pat (SV_POW2); } + +/* +** incw_64_all: +** incw x0 +** ret +*/ +PROTO (incw_64_all, uint64_t, (uint64_t x0)) { return x0 + svcntw_pat (SV_ALL); } + +/* +** decw_32_pow2: +** decw x0, pow2 +** ret +*/ +PROTO (decw_32_pow2, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_POW2); } + +/* +** decw_32_vl1: +** sub w0, w0, #?1 +** ret +*/ +PROTO (decw_32_vl1, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL1); } + +/* +** decw_32_vl2: +** sub w0, w0, #?2 +** ret +*/ +PROTO (decw_32_vl2, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL2); } + +/* +** decw_32_vl3: +** sub w0, w0, #?3 +** ret +*/ +PROTO (decw_32_vl3, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL3); } + +/* +** decw_32_vl4: +** sub w0, w0, #?4 +** ret +*/ +PROTO (decw_32_vl4, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL4); } + +/* +** decw_32_vl5: +** decw x0, vl5 +** ret +*/ +PROTO (decw_32_vl5, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL5); } + +/* +** decw_32_vl6: +** decw x0, vl6 +** ret +*/ +PROTO (decw_32_vl6, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL6); } + +/* +** decw_32_vl7: +** decw x0, vl7 +** ret +*/ +PROTO (decw_32_vl7, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL7); } + +/* +** decw_32_vl8: +** decw x0, vl8 +** ret +*/ +PROTO (decw_32_vl8, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL8); } + +/* +** decw_32_vl16: +** decw x0, vl16 +** ret +*/ +PROTO (decw_32_vl16, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL16); } + +/* +** decw_32_vl32: +** decw x0, vl32 +** ret +*/ +PROTO (decw_32_vl32, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL32); } + +/* +** decw_32_vl64: +** decw x0, vl64 +** ret +*/ +PROTO (decw_32_vl64, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL64); } + +/* +** decw_32_vl128: +** decw x0, vl128 +** ret +*/ +PROTO (decw_32_vl128, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL128); } + +/* +** decw_32_vl256: +** decw x0, vl256 +** ret +*/ +PROTO (decw_32_vl256, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_VL256); } + +/* +** decw_32_mul3: +** decw x0, mul3 +** ret +*/ +PROTO (decw_32_mul3, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_MUL3); } + +/* +** decw_32_mul4: +** decw x0, mul4 +** ret +*/ +PROTO (decw_32_mul4, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_MUL4); } + +/* +** decw_32_all: +** decw x0 +** ret +*/ +PROTO (decw_32_all, uint32_t, (uint32_t w0)) { return w0 - svcntw_pat (SV_ALL); } + +/* +** decw_64_pow2: +** decw x0, pow2 +** ret +*/ +PROTO (decw_64_pow2, uint64_t, (uint64_t x0)) { return x0 - svcntw_pat (SV_POW2); } + +/* +** decw_64_all: +** decw x0 +** ret +*/ +PROTO (decw_64_all, uint64_t, (uint64_t x0)) { return x0 - svcntw_pat (SV_ALL); } + +/* +** incw_s32_pow2_z0: +** incw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (incw_s32_pow2_z0, svint32_t, + z0 = svadd_n_s32_x (svptrue_b32 (), z0, svcntw_pat (SV_POW2)), + z0 = svadd_x (svptrue_b32 (), z0, svcntw_pat 
(SV_POW2))); + +/* +** incw_s32_pow2_z1: +** movprfx z0, z1 +** incw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (incw_s32_pow2_z1, svint32_t, + z0 = svadd_n_s32_x (svptrue_b32 (), z1, svcntw_pat (SV_POW2)), + z0 = svadd_x (svptrue_b32 (), z1, svcntw_pat (SV_POW2))); + +/* +** decw_s32_pow2_z0: +** decw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (decw_s32_pow2_z0, svint32_t, + z0 = svsub_n_s32_x (svptrue_b32 (), z0, svcntw_pat (SV_POW2)), + z0 = svsub_x (svptrue_b32 (), z0, svcntw_pat (SV_POW2))); + +/* +** decw_s32_pow2_z1: +** movprfx z0, z1 +** decw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (decw_s32_pow2_z1, svint32_t, + z0 = svsub_n_s32_x (svptrue_b32 (), z1, svcntw_pat (SV_POW2)), + z0 = svsub_x (svptrue_b32 (), z1, svcntw_pat (SV_POW2))); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f32.c new file mode 100644 index 000000000..2e80d6830 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** compact_f32_tied1: +** compact z0\.s, p0, z0\.s +** ret +*/ +TEST_UNIFORM_Z (compact_f32_tied1, svfloat32_t, + z0 = svcompact_f32 (p0, z0), + z0 = svcompact (p0, z0)) + +/* +** compact_f32_untied: +** compact z0\.s, p0, z1\.s +** ret +*/ +TEST_UNIFORM_Z (compact_f32_untied, svfloat32_t, + z0 = svcompact_f32 (p0, z1), + z0 = svcompact (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f64.c new file mode 100644 index 000000000..e0bc33efe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** compact_f64_tied1: +** compact z0\.d, p0, z0\.d +** ret +*/ +TEST_UNIFORM_Z (compact_f64_tied1, svfloat64_t, + z0 = svcompact_f64 (p0, z0), + z0 = svcompact (p0, z0)) + +/* +** compact_f64_untied: +** compact z0\.d, p0, z1\.d +** ret +*/ +TEST_UNIFORM_Z (compact_f64_untied, svfloat64_t, + z0 = svcompact_f64 (p0, z1), + z0 = svcompact (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s32.c new file mode 100644 index 000000000..e4634982b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** compact_s32_tied1: +** compact z0\.s, p0, z0\.s +** ret +*/ +TEST_UNIFORM_Z (compact_s32_tied1, svint32_t, + z0 = svcompact_s32 (p0, z0), + z0 = svcompact (p0, z0)) + +/* +** compact_s32_untied: +** compact z0\.s, p0, z1\.s +** ret +*/ +TEST_UNIFORM_Z (compact_s32_untied, svint32_t, + z0 = svcompact_s32 (p0, z1), + z0 = svcompact (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s64.c new file mode 100644 index 000000000..71cb97b8a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_s64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** compact_s64_tied1: +** compact z0\.d, p0, z0\.d +** ret +*/ +TEST_UNIFORM_Z (compact_s64_tied1, svint64_t, + z0 = svcompact_s64 (p0, z0), + z0 = svcompact (p0, z0)) + +/* +** compact_s64_untied: +** compact z0\.d, p0, 
z1\.d +** ret +*/ +TEST_UNIFORM_Z (compact_s64_untied, svint64_t, + z0 = svcompact_s64 (p0, z1), + z0 = svcompact (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u32.c new file mode 100644 index 000000000..954329a0b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** compact_u32_tied1: +** compact z0\.s, p0, z0\.s +** ret +*/ +TEST_UNIFORM_Z (compact_u32_tied1, svuint32_t, + z0 = svcompact_u32 (p0, z0), + z0 = svcompact (p0, z0)) + +/* +** compact_u32_untied: +** compact z0\.s, p0, z1\.s +** ret +*/ +TEST_UNIFORM_Z (compact_u32_untied, svuint32_t, + z0 = svcompact_u32 (p0, z1), + z0 = svcompact (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u64.c new file mode 100644 index 000000000..ec664845f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/compact_u64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** compact_u64_tied1: +** compact z0\.d, p0, z0\.d +** ret +*/ +TEST_UNIFORM_Z (compact_u64_tied1, svuint64_t, + z0 = svcompact_u64 (p0, z0), + z0 = svcompact (p0, z0)) + +/* +** compact_u64_untied: +** compact z0\.d, p0, z1\.d +** ret +*/ +TEST_UNIFORM_Z (compact_u64_untied, svuint64_t, + z0 = svcompact_u64 (p0, z1), + z0 = svcompact (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create2_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create2_1.c new file mode 100644 index 000000000..e9158ed8a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create2_1.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** create2_s8: +** mov z0\.d, z6\.d +** mov z1\.d, z4\.d +** ret +*/ +TEST_CREATE (create2_s8, svint8x2_t, svint8_t, + z0 = svcreate2_s8 (z6, z4), + z0 = svcreate2 (z6, z4)) + +/* +** create2_u8: +** mov z0\.d, z4\.d +** mov z1\.d, z6\.d +** ret +*/ +TEST_CREATE (create2_u8, svuint8x2_t, svuint8_t, + z0 = svcreate2_u8 (z4, z6), + z0 = svcreate2 (z4, z6)) + +/* +** create2_s16: +** mov z0\.d, z6\.d +** mov z1\.d, z4\.d +** ret +*/ +TEST_CREATE (create2_s16, svint16x2_t, svint16_t, + z0 = svcreate2_s16 (z6, z4), + z0 = svcreate2 (z6, z4)) + +/* +** create2_u16: +** mov z0\.d, z6\.d +** mov z1\.d, z5\.d +** ret +*/ +TEST_CREATE (create2_u16, svuint16x2_t, svuint16_t, + z0 = svcreate2_u16 (z6, z5), + z0 = svcreate2 (z6, z5)) + +/* +** create2_bf16: +** mov z0\.d, z4\.d +** mov z1\.d, z5\.d +** ret +*/ +TEST_CREATE (create2_bf16, svbfloat16x2_t, svbfloat16_t, + z0 = svcreate2_bf16 (z4, z5), + z0 = svcreate2 (z4, z5)) + +/* +** create2_f16: +** mov z0\.d, z4\.d +** mov z1\.d, z5\.d +** ret +*/ +TEST_CREATE (create2_f16, svfloat16x2_t, svfloat16_t, + z0 = svcreate2_f16 (z4, z5), + z0 = svcreate2 (z4, z5)) + +/* +** create2_s32: +** mov z0\.d, z6\.d +** mov z1\.d, z7\.d +** ret +*/ +TEST_CREATE (create2_s32, svint32x2_t, svint32_t, + z0 = svcreate2_s32 (z6, z7), + z0 = svcreate2 (z6, z7)) + +/* +** create2_u32: +** mov z0\.d, z7\.d +** mov z1\.d, z5\.d +** ret +*/ +TEST_CREATE (create2_u32, svuint32x2_t, svuint32_t, + z0 = svcreate2_u32 (z7, z5), + z0 = svcreate2 (z7, z5)) + +/* +** create2_f32: +** mov z0\.d, z7\.d +** mov z1\.d, z4\.d +** ret +*/ +TEST_CREATE 
(create2_f32, svfloat32x2_t, svfloat32_t, + z0 = svcreate2_f32 (z7, z4), + z0 = svcreate2 (z7, z4)) + +/* +** create2_s64: +** mov z0\.d, z5\.d +** mov z1\.d, z7\.d +** ret +*/ +TEST_CREATE (create2_s64, svint64x2_t, svint64_t, + z0 = svcreate2_s64 (z5, z7), + z0 = svcreate2 (z5, z7)) + +/* +** create2_u64: +** mov z0\.d, z7\.d +** mov z1\.d, z6\.d +** ret +*/ +TEST_CREATE (create2_u64, svuint64x2_t, svuint64_t, + z0 = svcreate2_u64 (z7, z6), + z0 = svcreate2 (z7, z6)) + +/* +** create2_f64: +** mov z0\.d, z5\.d +** mov z1\.d, z4\.d +** ret +*/ +TEST_CREATE (create2_f64, svfloat64x2_t, svfloat64_t, + z0 = svcreate2_f64 (z5, z4), + z0 = svcreate2 (z5, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create3_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create3_1.c new file mode 100644 index 000000000..6f1afb772 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create3_1.c @@ -0,0 +1,135 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** create3_s8: +** mov z0\.d, z6\.d +** mov z1\.d, z4\.d +** mov z2\.d, z7\.d +** ret +*/ +TEST_CREATE (create3_s8, svint8x3_t, svint8_t, + z0 = svcreate3_s8 (z6, z4, z7), + z0 = svcreate3 (z6, z4, z7)) + +/* +** create3_u8: +** mov z0\.d, z4\.d +** mov z1\.d, z6\.d +** mov z2\.d, z5\.d +** ret +*/ +TEST_CREATE (create3_u8, svuint8x3_t, svuint8_t, + z0 = svcreate3_u8 (z4, z6, z5), + z0 = svcreate3 (z4, z6, z5)) + +/* +** create3_s16: +** mov z0\.d, z6\.d +** mov z1\.d, z4\.d +** mov z2\.d, z5\.d +** ret +*/ +TEST_CREATE (create3_s16, svint16x3_t, svint16_t, + z0 = svcreate3_s16 (z6, z4, z5), + z0 = svcreate3 (z6, z4, z5)) + +/* +** create3_u16: +** mov z0\.d, z6\.d +** mov z1\.d, z5\.d +** mov z2\.d, z4\.d +** ret +*/ +TEST_CREATE (create3_u16, svuint16x3_t, svuint16_t, + z0 = svcreate3_u16 (z6, z5, z4), + z0 = svcreate3 (z6, z5, z4)) + +/* +** create3_bf16: +** mov z0\.d, z4\.d +** mov z1\.d, z5\.d +** mov z2\.d, z6\.d +** ret +*/ +TEST_CREATE (create3_bf16, svbfloat16x3_t, svbfloat16_t, + z0 = svcreate3_bf16 (z4, z5, z6), + z0 = svcreate3 (z4, z5, z6)) + +/* +** create3_f16: +** mov z0\.d, z4\.d +** mov z1\.d, z5\.d +** mov z2\.d, z6\.d +** ret +*/ +TEST_CREATE (create3_f16, svfloat16x3_t, svfloat16_t, + z0 = svcreate3_f16 (z4, z5, z6), + z0 = svcreate3 (z4, z5, z6)) + +/* +** create3_s32: +** mov z0\.d, z6\.d +** mov z1\.d, z7\.d +** mov z2\.d, z4\.d +** ret +*/ +TEST_CREATE (create3_s32, svint32x3_t, svint32_t, + z0 = svcreate3_s32 (z6, z7, z4), + z0 = svcreate3 (z6, z7, z4)) + +/* +** create3_u32: +** mov z0\.d, z7\.d +** mov z1\.d, z5\.d +** mov z2\.d, z6\.d +** ret +*/ +TEST_CREATE (create3_u32, svuint32x3_t, svuint32_t, + z0 = svcreate3_u32 (z7, z5, z6), + z0 = svcreate3 (z7, z5, z6)) + +/* +** create3_f32: +** mov z0\.d, z7\.d +** mov z1\.d, z4\.d +** mov z2\.d, z6\.d +** ret +*/ +TEST_CREATE (create3_f32, svfloat32x3_t, svfloat32_t, + z0 = svcreate3_f32 (z7, z4, z6), + z0 = svcreate3 (z7, z4, z6)) + +/* +** create3_s64: +** mov z0\.d, z5\.d +** mov z1\.d, z7\.d +** mov z2\.d, z6\.d +** ret +*/ +TEST_CREATE (create3_s64, svint64x3_t, svint64_t, + z0 = svcreate3_s64 (z5, z7, z6), + z0 = svcreate3 (z5, z7, z6)) + +/* +** create3_u64: +** mov z0\.d, z7\.d +** mov z1\.d, z6\.d +** mov z2\.d, z4\.d +** ret +*/ +TEST_CREATE (create3_u64, svuint64x3_t, svuint64_t, + z0 = svcreate3_u64 (z7, z6, z4), + z0 = svcreate3 (z7, z6, z4)) + +/* +** create3_f64: +** mov z0\.d, z5\.d +** mov z1\.d, z4\.d +** mov z2\.d, z7\.d +** ret +*/ +TEST_CREATE (create3_f64, 
svfloat64x3_t, svfloat64_t, + z0 = svcreate3_f64 (z5, z4, z7), + z0 = svcreate3 (z5, z4, z7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create4_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create4_1.c new file mode 100644 index 000000000..a3866286e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/create4_1.c @@ -0,0 +1,147 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** create4_s8: +** mov z0\.d, z6\.d +** mov z1\.d, z4\.d +** mov z2\.d, z7\.d +** mov z3\.d, z5\.d +** ret +*/ +TEST_CREATE (create4_s8, svint8x4_t, svint8_t, + z0 = svcreate4_s8 (z6, z4, z7, z5), + z0 = svcreate4 (z6, z4, z7, z5)) + +/* +** create4_u8: +** mov z0\.d, z4\.d +** mov z1\.d, z6\.d +** mov z2\.d, z5\.d +** mov z3\.d, z7\.d +** ret +*/ +TEST_CREATE (create4_u8, svuint8x4_t, svuint8_t, + z0 = svcreate4_u8 (z4, z6, z5, z7), + z0 = svcreate4 (z4, z6, z5, z7)) + +/* +** create4_s16: +** mov z0\.d, z6\.d +** mov z1\.d, z4\.d +** mov z2\.d, z5\.d +** mov z3\.d, z7\.d +** ret +*/ +TEST_CREATE (create4_s16, svint16x4_t, svint16_t, + z0 = svcreate4_s16 (z6, z4, z5, z7), + z0 = svcreate4 (z6, z4, z5, z7)) + +/* +** create4_u16: +** mov z0\.d, z6\.d +** mov z1\.d, z5\.d +** mov z2\.d, z4\.d +** mov z3\.d, z7\.d +** ret +*/ +TEST_CREATE (create4_u16, svuint16x4_t, svuint16_t, + z0 = svcreate4_u16 (z6, z5, z4, z7), + z0 = svcreate4 (z6, z5, z4, z7)) + +/* +** create4_bf16: +** mov z0\.d, z4\.d +** mov z1\.d, z5\.d +** mov z2\.d, z6\.d +** mov z3\.d, z7\.d +** ret +*/ +TEST_CREATE (create4_bf16, svbfloat16x4_t, svbfloat16_t, + z0 = svcreate4_bf16 (z4, z5, z6, z7), + z0 = svcreate4 (z4, z5, z6, z7)) + +/* +** create4_f16: +** mov z0\.d, z4\.d +** mov z1\.d, z5\.d +** mov z2\.d, z6\.d +** mov z3\.d, z7\.d +** ret +*/ +TEST_CREATE (create4_f16, svfloat16x4_t, svfloat16_t, + z0 = svcreate4_f16 (z4, z5, z6, z7), + z0 = svcreate4 (z4, z5, z6, z7)) + +/* +** create4_s32: +** mov z0\.d, z6\.d +** mov z1\.d, z7\.d +** mov z2\.d, z4\.d +** mov z3\.d, z5\.d +** ret +*/ +TEST_CREATE (create4_s32, svint32x4_t, svint32_t, + z0 = svcreate4_s32 (z6, z7, z4, z5), + z0 = svcreate4 (z6, z7, z4, z5)) + +/* +** create4_u32: +** mov z0\.d, z7\.d +** mov z1\.d, z5\.d +** mov z2\.d, z6\.d +** mov z3\.d, z7\.d +** ret +*/ +TEST_CREATE (create4_u32, svuint32x4_t, svuint32_t, + z0 = svcreate4_u32 (z7, z5, z6, z7), + z0 = svcreate4 (z7, z5, z6, z7)) + +/* +** create4_f32: +** mov z0\.d, z7\.d +** mov z1\.d, z4\.d +** mov z2\.d, z6\.d +** mov z3\.d, z4\.d +** ret +*/ +TEST_CREATE (create4_f32, svfloat32x4_t, svfloat32_t, + z0 = svcreate4_f32 (z7, z4, z6, z4), + z0 = svcreate4 (z7, z4, z6, z4)) + +/* +** create4_s64: +** mov z0\.d, z5\.d +** mov z1\.d, z7\.d +** mov z2\.d, z6\.d +** mov z3\.d, z6\.d +** ret +*/ +TEST_CREATE (create4_s64, svint64x4_t, svint64_t, + z0 = svcreate4_s64 (z5, z7, z6, z6), + z0 = svcreate4 (z5, z7, z6, z6)) + +/* +** create4_u64: +** mov z0\.d, z7\.d +** mov z1\.d, z6\.d +** mov z2\.d, z4\.d +** mov z3\.d, z5\.d +** ret +*/ +TEST_CREATE (create4_u64, svuint64x4_t, svuint64_t, + z0 = svcreate4_u64 (z7, z6, z4, z5), + z0 = svcreate4 (z7, z6, z4, z5)) + +/* +** create4_f64: +** mov z0\.d, z5\.d +** mov z1\.d, z4\.d +** mov z2\.d, z7\.d +** mov z3\.d, z6\.d +** ret +*/ +TEST_CREATE (create4_f64, svfloat64x4_t, svfloat64_t, + z0 = svcreate4_f64 (z5, z4, z7, z6), + z0 = svcreate4 (z5, z4, z7, z6)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_bf16.c new file 
mode 100644 index 000000000..52baa1f58 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_bf16.c @@ -0,0 +1,96 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_bf16_f32_m_tied1: +** bfcvt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_bf16_f32_m_tied1, svbfloat16_t, svfloat32_t, + z0 = svcvt_bf16_f32_m (z0, p0, z4), + z0 = svcvt_bf16_m (z0, p0, z4)) + +/* +** cvt_bf16_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** bfcvt z0\.h, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_bf16_f32_m_tied2, svbfloat16_t, svfloat32_t, + z0_res = svcvt_bf16_f32_m (z4, p0, z0), + z0_res = svcvt_bf16_m (z4, p0, z0)) + +/* +** cvt_bf16_f32_m_untied: +** movprfx z0, z1 +** bfcvt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_bf16_f32_m_untied, svbfloat16_t, svfloat32_t, + z0 = svcvt_bf16_f32_m (z1, p0, z4), + z0 = svcvt_bf16_m (z1, p0, z4)) + +/* +** cvt_bf16_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** bfcvt z0\.h, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_bf16_f32_z_tied1, svbfloat16_t, svfloat32_t, + z0_res = svcvt_bf16_f32_z (p0, z0), + z0_res = svcvt_bf16_z (p0, z0)) + +/* +** cvt_bf16_f32_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** bfcvt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_bf16_f32_z_untied, svbfloat16_t, svfloat32_t, + z0 = svcvt_bf16_f32_z (p0, z4), + z0 = svcvt_bf16_z (p0, z4)) + +/* +** cvt_bf16_f32_x_tied1: +** bfcvt z0\.h, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_bf16_f32_x_tied1, svbfloat16_t, svfloat32_t, + z0_res = svcvt_bf16_f32_x (p0, z0), + z0_res = svcvt_bf16_x (p0, z0)) + +/* +** cvt_bf16_f32_x_untied: +** bfcvt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_bf16_f32_x_untied, svbfloat16_t, svfloat32_t, + z0 = svcvt_bf16_f32_x (p0, z4), + z0 = svcvt_bf16_x (p0, z4)) + +/* +** ptrue_cvt_bf16_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_bf16_f32_x_tied1, svbfloat16_t, svfloat32_t, + z0_res = svcvt_bf16_f32_x (svptrue_b32 (), z0), + z0_res = svcvt_bf16_x (svptrue_b32 (), z0)) + +/* +** ptrue_cvt_bf16_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z (ptrue_cvt_bf16_f32_x_untied, svbfloat16_t, svfloat32_t, + z0 = svcvt_bf16_f32_x (svptrue_b32 (), z4), + z0 = svcvt_bf16_x (svptrue_b32 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f16.c new file mode 100644 index 000000000..5dcd48046 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f16.c @@ -0,0 +1,731 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_f16_f32_m_tied1: +** fcvt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_f32_m_tied1, svfloat16_t, svfloat32_t, + z0 = svcvt_f16_f32_m (z0, p0, z4), + z0 = svcvt_f16_m (z0, p0, z4)) + +/* +** cvt_f16_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvt z0\.h, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_f32_m_tied2, svfloat16_t, svfloat32_t, + z0_res = svcvt_f16_f32_m (z4, p0, z0), + z0_res = svcvt_f16_m (z4, p0, z0)) + +/* +** cvt_f16_f32_m_untied: +** movprfx z0, z1 +** fcvt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_f32_m_untied, svfloat16_t, svfloat32_t, + z0 = svcvt_f16_f32_m (z1, p0, z4), + z0 = svcvt_f16_m (z1, p0, z4)) + +/* +** cvt_f16_f64_m_tied1: +** fcvt z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_f64_m_tied1, svfloat16_t, svfloat64_t, + z0 = svcvt_f16_f64_m (z0, p0, z4), + z0 = svcvt_f16_m (z0, p0, z4)) + +/* +** cvt_f16_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fcvt z0\.h, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_f64_m_tied2, svfloat16_t, svfloat64_t, + z0_res = svcvt_f16_f64_m (z4, p0, z0), + z0_res = svcvt_f16_m (z4, p0, z0)) + +/* +** cvt_f16_f64_m_untied: +** movprfx z0, z1 +** fcvt z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_f64_m_untied, svfloat16_t, svfloat64_t, + z0 = svcvt_f16_f64_m (z1, p0, z4), + z0 = svcvt_f16_m (z1, p0, z4)) + +/* +** cvt_f16_s16_m_tied1: +** scvtf z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f16_s16_m_tied1, svfloat16_t, svint16_t, + z0 = svcvt_f16_s16_m (z0, p0, z4), + z0 = svcvt_f16_m (z0, p0, z4)) + +/* +** cvt_f16_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** scvtf z0\.h, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s16_m_tied2, svfloat16_t, svint16_t, + z0_res = svcvt_f16_s16_m (z4, p0, z0), + z0_res = svcvt_f16_m (z4, p0, z0)) + +/* +** cvt_f16_s16_m_untied: +** movprfx z0, z1 +** scvtf z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f16_s16_m_untied, svfloat16_t, svint16_t, + z0 = svcvt_f16_s16_m (z1, p0, z4), + z0 = svcvt_f16_m (z1, p0, z4)) + +/* +** cvt_f16_s32_m_tied1: +** scvtf z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_s32_m_tied1, svfloat16_t, svint32_t, + z0 = svcvt_f16_s32_m (z0, p0, z4), + z0 = svcvt_f16_m (z0, p0, z4)) + +/* +** cvt_f16_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** scvtf z0\.h, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s32_m_tied2, svfloat16_t, svint32_t, + z0_res = svcvt_f16_s32_m (z4, p0, z0), + z0_res = svcvt_f16_m (z4, p0, z0)) + +/* +** cvt_f16_s32_m_untied: +** movprfx z0, z1 +** scvtf z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_s32_m_untied, svfloat16_t, svint32_t, + z0 = svcvt_f16_s32_m (z1, p0, z4), + z0 = svcvt_f16_m (z1, p0, z4)) + +/* +** cvt_f16_s64_m_tied1: +** scvtf z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_s64_m_tied1, svfloat16_t, svint64_t, + z0 = svcvt_f16_s64_m (z0, p0, z4), + z0 = svcvt_f16_m (z0, p0, z4)) + +/* +** cvt_f16_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** 
scvtf z0\.h, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s64_m_tied2, svfloat16_t, svint64_t, + z0_res = svcvt_f16_s64_m (z4, p0, z0), + z0_res = svcvt_f16_m (z4, p0, z0)) + +/* +** cvt_f16_s64_m_untied: +** movprfx z0, z1 +** scvtf z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_s64_m_untied, svfloat16_t, svint64_t, + z0 = svcvt_f16_s64_m (z1, p0, z4), + z0 = svcvt_f16_m (z1, p0, z4)) + +/* +** cvt_f16_u16_m_tied1: +** ucvtf z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f16_u16_m_tied1, svfloat16_t, svuint16_t, + z0 = svcvt_f16_u16_m (z0, p0, z4), + z0 = svcvt_f16_m (z0, p0, z4)) + +/* +** cvt_f16_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** ucvtf z0\.h, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u16_m_tied2, svfloat16_t, svuint16_t, + z0_res = svcvt_f16_u16_m (z4, p0, z0), + z0_res = svcvt_f16_m (z4, p0, z0)) + +/* +** cvt_f16_u16_m_untied: +** movprfx z0, z1 +** ucvtf z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f16_u16_m_untied, svfloat16_t, svuint16_t, + z0 = svcvt_f16_u16_m (z1, p0, z4), + z0 = svcvt_f16_m (z1, p0, z4)) + +/* +** cvt_f16_u32_m_tied1: +** ucvtf z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_u32_m_tied1, svfloat16_t, svuint32_t, + z0 = svcvt_f16_u32_m (z0, p0, z4), + z0 = svcvt_f16_m (z0, p0, z4)) + +/* +** cvt_f16_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** ucvtf z0\.h, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u32_m_tied2, svfloat16_t, svuint32_t, + z0_res = svcvt_f16_u32_m (z4, p0, z0), + z0_res = svcvt_f16_m (z4, p0, z0)) + +/* +** cvt_f16_u32_m_untied: +** movprfx z0, z1 +** ucvtf z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_u32_m_untied, svfloat16_t, svuint32_t, + z0 = svcvt_f16_u32_m (z1, p0, z4), + z0 = svcvt_f16_m (z1, p0, z4)) + +/* +** cvt_f16_u64_m_tied1: +** ucvtf z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_u64_m_tied1, svfloat16_t, svuint64_t, + z0 = svcvt_f16_u64_m (z0, p0, z4), + z0 = svcvt_f16_m (z0, p0, z4)) + +/* +** cvt_f16_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** ucvtf z0\.h, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u64_m_tied2, svfloat16_t, svuint64_t, + z0_res = svcvt_f16_u64_m (z4, p0, z0), + z0_res = svcvt_f16_m (z4, p0, z0)) + +/* +** cvt_f16_u64_m_untied: +** movprfx z0, z1 +** ucvtf z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_u64_m_untied, svfloat16_t, svuint64_t, + z0 = svcvt_f16_u64_m (z1, p0, z4), + z0 = svcvt_f16_m (z1, p0, z4)) + +/* +** cvt_f16_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fcvt z0\.h, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_f32_z_tied1, svfloat16_t, svfloat32_t, + z0_res = svcvt_f16_f32_z (p0, z0), + z0_res = svcvt_f16_z (p0, z0)) + +/* +** cvt_f16_f32_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** fcvt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_f32_z_untied, svfloat16_t, svfloat32_t, + z0 = svcvt_f16_f32_z (p0, z4), + z0 = svcvt_f16_z (p0, z4)) + +/* +** cvt_f16_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** fcvt z0\.h, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_f64_z_tied1, svfloat16_t, svfloat64_t, + z0_res = svcvt_f16_f64_z (p0, z0), + z0_res = svcvt_f16_z (p0, z0)) + +/* +** cvt_f16_f64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvt z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_f64_z_untied, svfloat16_t, svfloat64_t, + z0 = svcvt_f16_f64_z (p0, z4), + z0 = svcvt_f16_z (p0, z4)) + +/* +** cvt_f16_s16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** scvtf z0\.h, p0/m, \1\.h +** 
ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s16_z_tied1, svfloat16_t, svint16_t, + z0_res = svcvt_f16_s16_z (p0, z0), + z0_res = svcvt_f16_z (p0, z0)) + +/* +** cvt_f16_s16_z_untied: +** movprfx z0\.h, p0/z, z4\.h +** scvtf z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f16_s16_z_untied, svfloat16_t, svint16_t, + z0 = svcvt_f16_s16_z (p0, z4), + z0 = svcvt_f16_z (p0, z4)) + +/* +** cvt_f16_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** scvtf z0\.h, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s32_z_tied1, svfloat16_t, svint32_t, + z0_res = svcvt_f16_s32_z (p0, z0), + z0_res = svcvt_f16_z (p0, z0)) + +/* +** cvt_f16_s32_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** scvtf z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_s32_z_untied, svfloat16_t, svint32_t, + z0 = svcvt_f16_s32_z (p0, z4), + z0 = svcvt_f16_z (p0, z4)) + +/* +** cvt_f16_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** scvtf z0\.h, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s64_z_tied1, svfloat16_t, svint64_t, + z0_res = svcvt_f16_s64_z (p0, z0), + z0_res = svcvt_f16_z (p0, z0)) + +/* +** cvt_f16_s64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** scvtf z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_s64_z_untied, svfloat16_t, svint64_t, + z0 = svcvt_f16_s64_z (p0, z4), + z0 = svcvt_f16_z (p0, z4)) + +/* +** cvt_f16_u16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** ucvtf z0\.h, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u16_z_tied1, svfloat16_t, svuint16_t, + z0_res = svcvt_f16_u16_z (p0, z0), + z0_res = svcvt_f16_z (p0, z0)) + +/* +** cvt_f16_u16_z_untied: +** movprfx z0\.h, p0/z, z4\.h +** ucvtf z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f16_u16_z_untied, svfloat16_t, svuint16_t, + z0 = svcvt_f16_u16_z (p0, z4), + z0 = svcvt_f16_z (p0, z4)) + +/* +** cvt_f16_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** ucvtf z0\.h, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u32_z_tied1, svfloat16_t, svuint32_t, + z0_res = svcvt_f16_u32_z (p0, z0), + z0_res = svcvt_f16_z (p0, z0)) + +/* +** cvt_f16_u32_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** ucvtf z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_u32_z_untied, svfloat16_t, svuint32_t, + z0 = svcvt_f16_u32_z (p0, z4), + z0 = svcvt_f16_z (p0, z4)) + +/* +** cvt_f16_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** ucvtf z0\.h, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u64_z_tied1, svfloat16_t, svuint64_t, + z0_res = svcvt_f16_u64_z (p0, z0), + z0_res = svcvt_f16_z (p0, z0)) + +/* +** cvt_f16_u64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** ucvtf z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_u64_z_untied, svfloat16_t, svuint64_t, + z0 = svcvt_f16_u64_z (p0, z4), + z0 = svcvt_f16_z (p0, z4)) + +/* +** cvt_f16_f32_x_tied1: +** fcvt z0\.h, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_f32_x_tied1, svfloat16_t, svfloat32_t, + z0_res = svcvt_f16_f32_x (p0, z0), + z0_res = svcvt_f16_x (p0, z0)) + +/* +** cvt_f16_f32_x_untied: +** fcvt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_f32_x_untied, svfloat16_t, svfloat32_t, + z0 = svcvt_f16_f32_x (p0, z4), + z0 = svcvt_f16_x (p0, z4)) + +/* +** cvt_f16_f64_x_tied1: +** fcvt z0\.h, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_f64_x_tied1, svfloat16_t, svfloat64_t, + z0_res = svcvt_f16_f64_x (p0, z0), + z0_res = svcvt_f16_x (p0, z0)) + +/* +** cvt_f16_f64_x_untied: +** fcvt z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_f64_x_untied, svfloat16_t, svfloat64_t, + 
z0 = svcvt_f16_f64_x (p0, z4), + z0 = svcvt_f16_x (p0, z4)) + +/* +** cvt_f16_s16_x_tied1: +** scvtf z0\.h, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s16_x_tied1, svfloat16_t, svint16_t, + z0_res = svcvt_f16_s16_x (p0, z0), + z0_res = svcvt_f16_x (p0, z0)) + +/* +** cvt_f16_s16_x_untied: +** scvtf z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f16_s16_x_untied, svfloat16_t, svint16_t, + z0 = svcvt_f16_s16_x (p0, z4), + z0 = svcvt_f16_x (p0, z4)) + +/* +** cvt_f16_s32_x_tied1: +** scvtf z0\.h, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s32_x_tied1, svfloat16_t, svint32_t, + z0_res = svcvt_f16_s32_x (p0, z0), + z0_res = svcvt_f16_x (p0, z0)) + +/* +** cvt_f16_s32_x_untied: +** scvtf z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_s32_x_untied, svfloat16_t, svint32_t, + z0 = svcvt_f16_s32_x (p0, z4), + z0 = svcvt_f16_x (p0, z4)) + +/* +** cvt_f16_s64_x_tied1: +** scvtf z0\.h, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_s64_x_tied1, svfloat16_t, svint64_t, + z0_res = svcvt_f16_s64_x (p0, z0), + z0_res = svcvt_f16_x (p0, z0)) + +/* +** cvt_f16_s64_x_untied: +** scvtf z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_s64_x_untied, svfloat16_t, svint64_t, + z0 = svcvt_f16_s64_x (p0, z4), + z0 = svcvt_f16_x (p0, z4)) + +/* +** cvt_f16_u16_x_tied1: +** ucvtf z0\.h, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u16_x_tied1, svfloat16_t, svuint16_t, + z0_res = svcvt_f16_u16_x (p0, z0), + z0_res = svcvt_f16_x (p0, z0)) + +/* +** cvt_f16_u16_x_untied: +** ucvtf z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f16_u16_x_untied, svfloat16_t, svuint16_t, + z0 = svcvt_f16_u16_x (p0, z4), + z0 = svcvt_f16_x (p0, z4)) + +/* +** cvt_f16_u32_x_tied1: +** ucvtf z0\.h, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u32_x_tied1, svfloat16_t, svuint32_t, + z0_res = svcvt_f16_u32_x (p0, z0), + z0_res = svcvt_f16_x (p0, z0)) + +/* +** cvt_f16_u32_x_untied: +** ucvtf z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f16_u32_x_untied, svfloat16_t, svuint32_t, + z0 = svcvt_f16_u32_x (p0, z4), + z0 = svcvt_f16_x (p0, z4)) + +/* +** cvt_f16_u64_x_tied1: +** ucvtf z0\.h, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_f16_u64_x_tied1, svfloat16_t, svuint64_t, + z0_res = svcvt_f16_u64_x (p0, z0), + z0_res = svcvt_f16_x (p0, z0)) + +/* +** cvt_f16_u64_x_untied: +** ucvtf z0\.h, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f16_u64_x_untied, svfloat16_t, svuint64_t, + z0 = svcvt_f16_u64_x (p0, z4), + z0 = svcvt_f16_x (p0, z4)) + +/* +** ptrue_cvt_f16_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f16_f32_x_tied1, svfloat16_t, svfloat32_t, + z0_res = svcvt_f16_f32_x (svptrue_b32 (), z0), + z0_res = svcvt_f16_x (svptrue_b32 (), z0)) + +/* +** ptrue_cvt_f16_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f16_f32_x_untied, svfloat16_t, svfloat32_t, + z0 = svcvt_f16_f32_x (svptrue_b32 (), z4), + z0 = svcvt_f16_x (svptrue_b32 (), z4)) + +/* +** ptrue_cvt_f16_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f16_f64_x_tied1, svfloat16_t, svfloat64_t, + z0_res = svcvt_f16_f64_x (svptrue_b64 (), z0), + z0_res = svcvt_f16_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f16_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f16_f64_x_untied, svfloat16_t, svfloat64_t, + z0 = svcvt_f16_f64_x (svptrue_b64 (), z4), + z0 = svcvt_f16_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f16_s16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f16_s16_x_tied1, svfloat16_t, svint16_t, + z0_res = svcvt_f16_s16_x (svptrue_b16 (), z0), + z0_res = svcvt_f16_x (svptrue_b16 (), z0)) + +/* +** ptrue_cvt_f16_s16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f16_s16_x_untied, svfloat16_t, svint16_t, + z0 = svcvt_f16_s16_x (svptrue_b16 (), z4), + z0 = svcvt_f16_x (svptrue_b16 (), z4)) + +/* +** ptrue_cvt_f16_s32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f16_s32_x_tied1, svfloat16_t, svint32_t, + z0_res = svcvt_f16_s32_x (svptrue_b32 (), z0), + z0_res = svcvt_f16_x (svptrue_b32 (), z0)) + +/* +** ptrue_cvt_f16_s32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f16_s32_x_untied, svfloat16_t, svint32_t, + z0 = svcvt_f16_s32_x (svptrue_b32 (), z4), + z0 = svcvt_f16_x (svptrue_b32 (), z4)) + +/* +** ptrue_cvt_f16_s64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f16_s64_x_tied1, svfloat16_t, svint64_t, + z0_res = svcvt_f16_s64_x (svptrue_b64 (), z0), + z0_res = svcvt_f16_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f16_s64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f16_s64_x_untied, svfloat16_t, svint64_t, + z0 = svcvt_f16_s64_x (svptrue_b64 (), z4), + z0 = svcvt_f16_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f16_u16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f16_u16_x_tied1, svfloat16_t, svuint16_t, + z0_res = svcvt_f16_u16_x (svptrue_b16 (), z0), + z0_res = svcvt_f16_x (svptrue_b16 (), z0)) + +/* +** ptrue_cvt_f16_u16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f16_u16_x_untied, svfloat16_t, svuint16_t, + z0 = svcvt_f16_u16_x (svptrue_b16 (), z4), + z0 = svcvt_f16_x (svptrue_b16 (), z4)) + +/* +** ptrue_cvt_f16_u32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f16_u32_x_tied1, svfloat16_t, svuint32_t, + z0_res = svcvt_f16_u32_x (svptrue_b32 (), z0), + z0_res = svcvt_f16_x (svptrue_b32 (), z0)) + +/* +** ptrue_cvt_f16_u32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f16_u32_x_untied, svfloat16_t, svuint32_t, + z0 = svcvt_f16_u32_x (svptrue_b32 (), z4), + z0 = svcvt_f16_x (svptrue_b32 (), z4)) + +/* +** ptrue_cvt_f16_u64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f16_u64_x_tied1, svfloat16_t, svuint64_t, + z0_res = svcvt_f16_u64_x (svptrue_b64 (), z0), + z0_res = svcvt_f16_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f16_u64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f16_u64_x_untied, svfloat16_t, svuint64_t, + z0 = svcvt_f16_u64_x (svptrue_b64 (), z4), + z0 = svcvt_f16_x (svptrue_b64 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f32.c new file mode 100644 index 000000000..c16469939 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f32.c @@ -0,0 +1,549 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_f32_f16_m_tied1: +** fcvt z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f32_f16_m_tied1, svfloat32_t, svfloat16_t, + z0 = svcvt_f32_f16_m (z0, p0, z4), + z0 = svcvt_f32_m (z0, p0, z4)) + +/* +** cvt_f32_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvt z0\.s, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_f16_m_tied2, svfloat32_t, svfloat16_t, + z0_res = svcvt_f32_f16_m (z4, p0, z0), + z0_res = svcvt_f32_m (z4, p0, z0)) + +/* +** cvt_f32_f16_m_untied: +** movprfx z0, z1 +** fcvt z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f32_f16_m_untied, svfloat32_t, svfloat16_t, + z0 = svcvt_f32_f16_m (z1, p0, z4), + z0 = svcvt_f32_m (z1, p0, z4)) + +/* +** cvt_f32_f64_m_tied1: +** fcvt z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_f64_m_tied1, svfloat32_t, svfloat64_t, + z0 = svcvt_f32_f64_m (z0, p0, z4), + z0 = svcvt_f32_m (z0, p0, z4)) + +/* +** cvt_f32_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fcvt z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_f64_m_tied2, svfloat32_t, svfloat64_t, + z0_res = svcvt_f32_f64_m (z4, p0, z0), + z0_res = svcvt_f32_m (z4, p0, z0)) + +/* +** cvt_f32_f64_m_untied: +** movprfx z0, z1 +** fcvt z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_f64_m_untied, svfloat32_t, svfloat64_t, + z0 = svcvt_f32_f64_m (z1, p0, z4), + z0 = svcvt_f32_m (z1, p0, z4)) + +/* +** cvt_f32_s32_m_tied1: +** scvtf z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f32_s32_m_tied1, svfloat32_t, svint32_t, + z0 = svcvt_f32_s32_m (z0, p0, z4), + z0 = svcvt_f32_m (z0, p0, z4)) + +/* +** cvt_f32_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** scvtf z0\.s, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_s32_m_tied2, svfloat32_t, svint32_t, + z0_res = svcvt_f32_s32_m (z4, p0, z0), + z0_res = svcvt_f32_m (z4, p0, z0)) + +/* +** cvt_f32_s32_m_untied: +** movprfx z0, z1 +** scvtf z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f32_s32_m_untied, svfloat32_t, svint32_t, + z0 = svcvt_f32_s32_m (z1, p0, z4), + z0 = svcvt_f32_m (z1, p0, z4)) + +/* +** cvt_f32_s64_m_tied1: +** scvtf z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_s64_m_tied1, svfloat32_t, svint64_t, + z0 = svcvt_f32_s64_m (z0, p0, z4), + z0 = svcvt_f32_m (z0, p0, z4)) + +/* +** cvt_f32_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** scvtf z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_s64_m_tied2, svfloat32_t, svint64_t, + z0_res = svcvt_f32_s64_m (z4, p0, z0), + z0_res = svcvt_f32_m (z4, p0, z0)) + +/* +** cvt_f32_s64_m_untied: +** movprfx z0, z1 +** scvtf z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_s64_m_untied, svfloat32_t, svint64_t, + z0 = svcvt_f32_s64_m (z1, p0, z4), + z0 = svcvt_f32_m (z1, p0, z4)) + +/* +** cvt_f32_u32_m_tied1: +** ucvtf z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f32_u32_m_tied1, svfloat32_t, svuint32_t, + z0 = svcvt_f32_u32_m (z0, p0, z4), + z0 = svcvt_f32_m (z0, p0, z4)) + +/* +** cvt_f32_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** ucvtf 
z0\.s, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_u32_m_tied2, svfloat32_t, svuint32_t, + z0_res = svcvt_f32_u32_m (z4, p0, z0), + z0_res = svcvt_f32_m (z4, p0, z0)) + +/* +** cvt_f32_u32_m_untied: +** movprfx z0, z1 +** ucvtf z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f32_u32_m_untied, svfloat32_t, svuint32_t, + z0 = svcvt_f32_u32_m (z1, p0, z4), + z0 = svcvt_f32_m (z1, p0, z4)) + +/* +** cvt_f32_u64_m_tied1: +** ucvtf z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_u64_m_tied1, svfloat32_t, svuint64_t, + z0 = svcvt_f32_u64_m (z0, p0, z4), + z0 = svcvt_f32_m (z0, p0, z4)) + +/* +** cvt_f32_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** ucvtf z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_u64_m_tied2, svfloat32_t, svuint64_t, + z0_res = svcvt_f32_u64_m (z4, p0, z0), + z0_res = svcvt_f32_m (z4, p0, z0)) + +/* +** cvt_f32_u64_m_untied: +** movprfx z0, z1 +** ucvtf z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_u64_m_untied, svfloat32_t, svuint64_t, + z0 = svcvt_f32_u64_m (z1, p0, z4), + z0 = svcvt_f32_m (z1, p0, z4)) + +/* +** cvt_f32_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fcvt z0\.s, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_f16_z_tied1, svfloat32_t, svfloat16_t, + z0_res = svcvt_f32_f16_z (p0, z0), + z0_res = svcvt_f32_z (p0, z0)) + +/* +** cvt_f32_f16_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** fcvt z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f32_f16_z_untied, svfloat32_t, svfloat16_t, + z0 = svcvt_f32_f16_z (p0, z4), + z0 = svcvt_f32_z (p0, z4)) + +/* +** cvt_f32_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** fcvt z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_f64_z_tied1, svfloat32_t, svfloat64_t, + z0_res = svcvt_f32_f64_z (p0, z0), + z0_res = svcvt_f32_z (p0, z0)) + +/* +** cvt_f32_f64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvt z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_f64_z_untied, svfloat32_t, svfloat64_t, + z0 = svcvt_f32_f64_z (p0, z4), + z0 = svcvt_f32_z (p0, z4)) + +/* +** cvt_f32_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** scvtf z0\.s, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_s32_z_tied1, svfloat32_t, svint32_t, + z0_res = svcvt_f32_s32_z (p0, z0), + z0_res = svcvt_f32_z (p0, z0)) + +/* +** cvt_f32_s32_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** scvtf z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f32_s32_z_untied, svfloat32_t, svint32_t, + z0 = svcvt_f32_s32_z (p0, z4), + z0 = svcvt_f32_z (p0, z4)) + +/* +** cvt_f32_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** scvtf z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_s64_z_tied1, svfloat32_t, svint64_t, + z0_res = svcvt_f32_s64_z (p0, z0), + z0_res = svcvt_f32_z (p0, z0)) + +/* +** cvt_f32_s64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** scvtf z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_s64_z_untied, svfloat32_t, svint64_t, + z0 = svcvt_f32_s64_z (p0, z4), + z0 = svcvt_f32_z (p0, z4)) + +/* +** cvt_f32_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** ucvtf z0\.s, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_u32_z_tied1, svfloat32_t, svuint32_t, + z0_res = svcvt_f32_u32_z (p0, z0), + z0_res = svcvt_f32_z (p0, z0)) + +/* +** cvt_f32_u32_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** ucvtf z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f32_u32_z_untied, svfloat32_t, svuint32_t, + z0 = svcvt_f32_u32_z (p0, z4), + z0 = svcvt_f32_z (p0, z4)) + +/* +** cvt_f32_u64_z_tied1: 
+** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** ucvtf z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_u64_z_tied1, svfloat32_t, svuint64_t, + z0_res = svcvt_f32_u64_z (p0, z0), + z0_res = svcvt_f32_z (p0, z0)) + +/* +** cvt_f32_u64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** ucvtf z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_u64_z_untied, svfloat32_t, svuint64_t, + z0 = svcvt_f32_u64_z (p0, z4), + z0 = svcvt_f32_z (p0, z4)) + +/* +** cvt_f32_f16_x_tied1: +** fcvt z0\.s, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_f16_x_tied1, svfloat32_t, svfloat16_t, + z0_res = svcvt_f32_f16_x (p0, z0), + z0_res = svcvt_f32_x (p0, z0)) + +/* +** cvt_f32_f16_x_untied: +** fcvt z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f32_f16_x_untied, svfloat32_t, svfloat16_t, + z0 = svcvt_f32_f16_x (p0, z4), + z0 = svcvt_f32_x (p0, z4)) + +/* +** cvt_f32_f64_x_tied1: +** fcvt z0\.s, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_f64_x_tied1, svfloat32_t, svfloat64_t, + z0_res = svcvt_f32_f64_x (p0, z0), + z0_res = svcvt_f32_x (p0, z0)) + +/* +** cvt_f32_f64_x_untied: +** fcvt z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_f64_x_untied, svfloat32_t, svfloat64_t, + z0 = svcvt_f32_f64_x (p0, z4), + z0 = svcvt_f32_x (p0, z4)) + +/* +** cvt_f32_s32_x_tied1: +** scvtf z0\.s, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_s32_x_tied1, svfloat32_t, svint32_t, + z0_res = svcvt_f32_s32_x (p0, z0), + z0_res = svcvt_f32_x (p0, z0)) + +/* +** cvt_f32_s32_x_untied: +** scvtf z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f32_s32_x_untied, svfloat32_t, svint32_t, + z0 = svcvt_f32_s32_x (p0, z4), + z0 = svcvt_f32_x (p0, z4)) + +/* +** cvt_f32_s64_x_tied1: +** scvtf z0\.s, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_s64_x_tied1, svfloat32_t, svint64_t, + z0_res = svcvt_f32_s64_x (p0, z0), + z0_res = svcvt_f32_x (p0, z0)) + +/* +** cvt_f32_s64_x_untied: +** scvtf z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_s64_x_untied, svfloat32_t, svint64_t, + z0 = svcvt_f32_s64_x (p0, z4), + z0 = svcvt_f32_x (p0, z4)) + +/* +** cvt_f32_u32_x_tied1: +** ucvtf z0\.s, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_u32_x_tied1, svfloat32_t, svuint32_t, + z0_res = svcvt_f32_u32_x (p0, z0), + z0_res = svcvt_f32_x (p0, z0)) + +/* +** cvt_f32_u32_x_untied: +** ucvtf z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f32_u32_x_untied, svfloat32_t, svuint32_t, + z0 = svcvt_f32_u32_x (p0, z4), + z0 = svcvt_f32_x (p0, z4)) + +/* +** cvt_f32_u64_x_tied1: +** ucvtf z0\.s, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_f32_u64_x_tied1, svfloat32_t, svuint64_t, + z0_res = svcvt_f32_u64_x (p0, z0), + z0_res = svcvt_f32_x (p0, z0)) + +/* +** cvt_f32_u64_x_untied: +** ucvtf z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f32_u64_x_untied, svfloat32_t, svuint64_t, + z0 = svcvt_f32_u64_x (p0, z4), + z0 = svcvt_f32_x (p0, z4)) + +/* +** ptrue_cvt_f32_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f32_f16_x_tied1, svfloat32_t, svfloat16_t, + z0_res = svcvt_f32_f16_x (svptrue_b32 (), z0), + z0_res = svcvt_f32_x (svptrue_b32 (), z0)) + +/* +** ptrue_cvt_f32_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f32_f16_x_untied, svfloat32_t, svfloat16_t, + z0 = svcvt_f32_f16_x (svptrue_b32 (), z4), + z0 = svcvt_f32_x (svptrue_b32 (), z4)) + +/* +** ptrue_cvt_f32_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f32_f64_x_tied1, svfloat32_t, svfloat64_t, + z0_res = svcvt_f32_f64_x (svptrue_b64 (), z0), + z0_res = svcvt_f32_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f32_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f32_f64_x_untied, svfloat32_t, svfloat64_t, + z0 = svcvt_f32_f64_x (svptrue_b64 (), z4), + z0 = svcvt_f32_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f32_s32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f32_s32_x_tied1, svfloat32_t, svint32_t, + z0_res = svcvt_f32_s32_x (svptrue_b32 (), z0), + z0_res = svcvt_f32_x (svptrue_b32 (), z0)) + +/* +** ptrue_cvt_f32_s32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f32_s32_x_untied, svfloat32_t, svint32_t, + z0 = svcvt_f32_s32_x (svptrue_b32 (), z4), + z0 = svcvt_f32_x (svptrue_b32 (), z4)) + +/* +** ptrue_cvt_f32_s64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f32_s64_x_tied1, svfloat32_t, svint64_t, + z0_res = svcvt_f32_s64_x (svptrue_b64 (), z0), + z0_res = svcvt_f32_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f32_s64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f32_s64_x_untied, svfloat32_t, svint64_t, + z0 = svcvt_f32_s64_x (svptrue_b64 (), z4), + z0 = svcvt_f32_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f32_u32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f32_u32_x_tied1, svfloat32_t, svuint32_t, + z0_res = svcvt_f32_u32_x (svptrue_b32 (), z0), + z0_res = svcvt_f32_x (svptrue_b32 (), z0)) + +/* +** ptrue_cvt_f32_u32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f32_u32_x_untied, svfloat32_t, svuint32_t, + z0 = svcvt_f32_u32_x (svptrue_b32 (), z4), + z0 = svcvt_f32_x (svptrue_b32 (), z4)) + +/* +** ptrue_cvt_f32_u64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f32_u64_x_tied1, svfloat32_t, svuint64_t, + z0_res = svcvt_f32_u64_x (svptrue_b64 (), z0), + z0_res = svcvt_f32_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f32_u64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f32_u64_x_untied, svfloat32_t, svuint64_t, + z0 = svcvt_f32_u64_x (svptrue_b64 (), z4), + z0 = svcvt_f32_x (svptrue_b64 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f64.c new file mode 100644 index 000000000..1d08e6ec5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_f64.c @@ -0,0 +1,549 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_f64_f16_m_tied1: +** fcvt z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f64_f16_m_tied1, svfloat64_t, svfloat16_t, + z0 = svcvt_f64_f16_m (z0, p0, z4), + z0 = svcvt_f64_m (z0, p0, z4)) + +/* +** cvt_f64_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvt z0\.d, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_f16_m_tied2, svfloat64_t, svfloat16_t, + z0_res = svcvt_f64_f16_m (z4, p0, z0), + z0_res = svcvt_f64_m (z4, p0, z0)) + +/* +** cvt_f64_f16_m_untied: +** movprfx z0, z1 +** fcvt z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f64_f16_m_untied, svfloat64_t, svfloat16_t, + z0 = svcvt_f64_f16_m (z1, p0, z4), + z0 = svcvt_f64_m (z1, p0, z4)) + +/* +** cvt_f64_f32_m_tied1: +** fcvt z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_f32_m_tied1, svfloat64_t, svfloat32_t, + z0 = svcvt_f64_f32_m (z0, p0, z4), + z0 = svcvt_f64_m (z0, p0, z4)) + +/* +** cvt_f64_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvt z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_f32_m_tied2, svfloat64_t, svfloat32_t, + z0_res = svcvt_f64_f32_m (z4, p0, z0), + z0_res = svcvt_f64_m (z4, p0, z0)) + +/* +** cvt_f64_f32_m_untied: +** movprfx z0, z1 +** fcvt z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_f32_m_untied, svfloat64_t, svfloat32_t, + z0 = svcvt_f64_f32_m (z1, p0, z4), + z0 = svcvt_f64_m (z1, p0, z4)) + +/* +** cvt_f64_s32_m_tied1: +** scvtf z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_s32_m_tied1, svfloat64_t, svint32_t, + z0 = svcvt_f64_s32_m (z0, p0, z4), + z0 = svcvt_f64_m (z0, p0, z4)) + +/* +** cvt_f64_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** scvtf z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_s32_m_tied2, svfloat64_t, svint32_t, + z0_res = svcvt_f64_s32_m (z4, p0, z0), + z0_res = svcvt_f64_m (z4, p0, z0)) + +/* +** cvt_f64_s32_m_untied: +** movprfx z0, z1 +** scvtf z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_s32_m_untied, svfloat64_t, svint32_t, + z0 = svcvt_f64_s32_m (z1, p0, z4), + z0 = svcvt_f64_m (z1, p0, z4)) + +/* +** cvt_f64_s64_m_tied1: +** scvtf z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f64_s64_m_tied1, svfloat64_t, svint64_t, + z0 = svcvt_f64_s64_m (z0, p0, z4), + z0 = svcvt_f64_m (z0, p0, z4)) + +/* +** cvt_f64_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** scvtf z0\.d, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_s64_m_tied2, svfloat64_t, svint64_t, + z0_res = svcvt_f64_s64_m (z4, p0, z0), + z0_res = svcvt_f64_m (z4, p0, z0)) + +/* +** cvt_f64_s64_m_untied: +** movprfx z0, z1 +** scvtf z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f64_s64_m_untied, svfloat64_t, svint64_t, + z0 = svcvt_f64_s64_m (z1, p0, z4), + z0 = svcvt_f64_m (z1, p0, z4)) + +/* +** cvt_f64_u32_m_tied1: +** ucvtf z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_u32_m_tied1, svfloat64_t, svuint32_t, + z0 = svcvt_f64_u32_m (z0, p0, z4), + z0 = svcvt_f64_m (z0, p0, z4)) + +/* +** cvt_f64_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** 
ucvtf z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_u32_m_tied2, svfloat64_t, svuint32_t, + z0_res = svcvt_f64_u32_m (z4, p0, z0), + z0_res = svcvt_f64_m (z4, p0, z0)) + +/* +** cvt_f64_u32_m_untied: +** movprfx z0, z1 +** ucvtf z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_u32_m_untied, svfloat64_t, svuint32_t, + z0 = svcvt_f64_u32_m (z1, p0, z4), + z0 = svcvt_f64_m (z1, p0, z4)) + +/* +** cvt_f64_u64_m_tied1: +** ucvtf z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f64_u64_m_tied1, svfloat64_t, svuint64_t, + z0 = svcvt_f64_u64_m (z0, p0, z4), + z0 = svcvt_f64_m (z0, p0, z4)) + +/* +** cvt_f64_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** ucvtf z0\.d, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_u64_m_tied2, svfloat64_t, svuint64_t, + z0_res = svcvt_f64_u64_m (z4, p0, z0), + z0_res = svcvt_f64_m (z4, p0, z0)) + +/* +** cvt_f64_u64_m_untied: +** movprfx z0, z1 +** ucvtf z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f64_u64_m_untied, svfloat64_t, svuint64_t, + z0 = svcvt_f64_u64_m (z1, p0, z4), + z0 = svcvt_f64_m (z1, p0, z4)) + +/* +** cvt_f64_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.d, p0/z, \1\.d +** fcvt z0\.d, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_f16_z_tied1, svfloat64_t, svfloat16_t, + z0_res = svcvt_f64_f16_z (p0, z0), + z0_res = svcvt_f64_z (p0, z0)) + +/* +** cvt_f64_f16_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvt z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f64_f16_z_untied, svfloat64_t, svfloat16_t, + z0 = svcvt_f64_f16_z (p0, z4), + z0 = svcvt_f64_z (p0, z4)) + +/* +** cvt_f64_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.d, p0/z, \1\.d +** fcvt z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_f32_z_tied1, svfloat64_t, svfloat32_t, + z0_res = svcvt_f64_f32_z (p0, z0), + z0_res = svcvt_f64_z (p0, z0)) + +/* +** cvt_f64_f32_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvt z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_f32_z_untied, svfloat64_t, svfloat32_t, + z0 = svcvt_f64_f32_z (p0, z4), + z0 = svcvt_f64_z (p0, z4)) + +/* +** cvt_f64_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.d, p0/z, \1\.d +** scvtf z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_s32_z_tied1, svfloat64_t, svint32_t, + z0_res = svcvt_f64_s32_z (p0, z0), + z0_res = svcvt_f64_z (p0, z0)) + +/* +** cvt_f64_s32_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** scvtf z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_s32_z_untied, svfloat64_t, svint32_t, + z0 = svcvt_f64_s32_z (p0, z4), + z0 = svcvt_f64_z (p0, z4)) + +/* +** cvt_f64_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** scvtf z0\.d, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_s64_z_tied1, svfloat64_t, svint64_t, + z0_res = svcvt_f64_s64_z (p0, z0), + z0_res = svcvt_f64_z (p0, z0)) + +/* +** cvt_f64_s64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** scvtf z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f64_s64_z_untied, svfloat64_t, svint64_t, + z0 = svcvt_f64_s64_z (p0, z4), + z0 = svcvt_f64_z (p0, z4)) + +/* +** cvt_f64_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.d, p0/z, \1\.d +** ucvtf z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_u32_z_tied1, svfloat64_t, svuint32_t, + z0_res = svcvt_f64_u32_z (p0, z0), + z0_res = svcvt_f64_z (p0, z0)) + +/* +** cvt_f64_u32_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** ucvtf z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_u32_z_untied, svfloat64_t, svuint32_t, + z0 = svcvt_f64_u32_z (p0, z4), + z0 = svcvt_f64_z (p0, z4)) + +/* +** 
cvt_f64_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** ucvtf z0\.d, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_u64_z_tied1, svfloat64_t, svuint64_t, + z0_res = svcvt_f64_u64_z (p0, z0), + z0_res = svcvt_f64_z (p0, z0)) + +/* +** cvt_f64_u64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** ucvtf z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f64_u64_z_untied, svfloat64_t, svuint64_t, + z0 = svcvt_f64_u64_z (p0, z4), + z0 = svcvt_f64_z (p0, z4)) + +/* +** cvt_f64_f16_x_tied1: +** fcvt z0\.d, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_f16_x_tied1, svfloat64_t, svfloat16_t, + z0_res = svcvt_f64_f16_x (p0, z0), + z0_res = svcvt_f64_x (p0, z0)) + +/* +** cvt_f64_f16_x_untied: +** fcvt z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_f64_f16_x_untied, svfloat64_t, svfloat16_t, + z0 = svcvt_f64_f16_x (p0, z4), + z0 = svcvt_f64_x (p0, z4)) + +/* +** cvt_f64_f32_x_tied1: +** fcvt z0\.d, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_f32_x_tied1, svfloat64_t, svfloat32_t, + z0_res = svcvt_f64_f32_x (p0, z0), + z0_res = svcvt_f64_x (p0, z0)) + +/* +** cvt_f64_f32_x_untied: +** fcvt z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_f32_x_untied, svfloat64_t, svfloat32_t, + z0 = svcvt_f64_f32_x (p0, z4), + z0 = svcvt_f64_x (p0, z4)) + +/* +** cvt_f64_s32_x_tied1: +** scvtf z0\.d, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_s32_x_tied1, svfloat64_t, svint32_t, + z0_res = svcvt_f64_s32_x (p0, z0), + z0_res = svcvt_f64_x (p0, z0)) + +/* +** cvt_f64_s32_x_untied: +** scvtf z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_s32_x_untied, svfloat64_t, svint32_t, + z0 = svcvt_f64_s32_x (p0, z4), + z0 = svcvt_f64_x (p0, z4)) + +/* +** cvt_f64_s64_x_tied1: +** scvtf z0\.d, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_s64_x_tied1, svfloat64_t, svint64_t, + z0_res = svcvt_f64_s64_x (p0, z0), + z0_res = svcvt_f64_x (p0, z0)) + +/* +** cvt_f64_s64_x_untied: +** scvtf z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f64_s64_x_untied, svfloat64_t, svint64_t, + z0 = svcvt_f64_s64_x (p0, z4), + z0 = svcvt_f64_x (p0, z4)) + +/* +** cvt_f64_u32_x_tied1: +** ucvtf z0\.d, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_u32_x_tied1, svfloat64_t, svuint32_t, + z0_res = svcvt_f64_u32_x (p0, z0), + z0_res = svcvt_f64_x (p0, z0)) + +/* +** cvt_f64_u32_x_untied: +** ucvtf z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_f64_u32_x_untied, svfloat64_t, svuint32_t, + z0 = svcvt_f64_u32_x (p0, z4), + z0 = svcvt_f64_x (p0, z4)) + +/* +** cvt_f64_u64_x_tied1: +** ucvtf z0\.d, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_f64_u64_x_tied1, svfloat64_t, svuint64_t, + z0_res = svcvt_f64_u64_x (p0, z0), + z0_res = svcvt_f64_x (p0, z0)) + +/* +** cvt_f64_u64_x_untied: +** ucvtf z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_f64_u64_x_untied, svfloat64_t, svuint64_t, + z0 = svcvt_f64_u64_x (p0, z4), + z0 = svcvt_f64_x (p0, z4)) + +/* +** ptrue_cvt_f64_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f64_f16_x_tied1, svfloat64_t, svfloat16_t, + z0_res = svcvt_f64_f16_x (svptrue_b64 (), z0), + z0_res = svcvt_f64_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f64_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f64_f16_x_untied, svfloat64_t, svfloat16_t, + z0 = svcvt_f64_f16_x (svptrue_b64 (), z4), + z0 = svcvt_f64_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f64_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f64_f32_x_tied1, svfloat64_t, svfloat32_t, + z0_res = svcvt_f64_f32_x (svptrue_b64 (), z0), + z0_res = svcvt_f64_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f64_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f64_f32_x_untied, svfloat64_t, svfloat32_t, + z0 = svcvt_f64_f32_x (svptrue_b64 (), z4), + z0 = svcvt_f64_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f64_s32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f64_s32_x_tied1, svfloat64_t, svint32_t, + z0_res = svcvt_f64_s32_x (svptrue_b64 (), z0), + z0_res = svcvt_f64_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f64_s32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f64_s32_x_untied, svfloat64_t, svint32_t, + z0 = svcvt_f64_s32_x (svptrue_b64 (), z4), + z0 = svcvt_f64_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f64_s64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f64_s64_x_tied1, svfloat64_t, svint64_t, + z0_res = svcvt_f64_s64_x (svptrue_b64 (), z0), + z0_res = svcvt_f64_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f64_s64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f64_s64_x_untied, svfloat64_t, svint64_t, + z0 = svcvt_f64_s64_x (svptrue_b64 (), z4), + z0 = svcvt_f64_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f64_u32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f64_u32_x_tied1, svfloat64_t, svuint32_t, + z0_res = svcvt_f64_u32_x (svptrue_b64 (), z0), + z0_res = svcvt_f64_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f64_u32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f64_u32_x_untied, svfloat64_t, svuint32_t, + z0 = svcvt_f64_u32_x (svptrue_b64 (), z4), + z0 = svcvt_f64_x (svptrue_b64 (), z4)) + +/* +** ptrue_cvt_f64_u64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_cvt_f64_u64_x_tied1, svfloat64_t, svuint64_t, + z0_res = svcvt_f64_u64_x (svptrue_b64 (), z0), + z0_res = svcvt_f64_x (svptrue_b64 (), z0)) + +/* +** ptrue_cvt_f64_u64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z (ptrue_cvt_f64_u64_x_untied, svfloat64_t, svuint64_t, + z0 = svcvt_f64_u64_x (svptrue_b64 (), z4), + z0 = svcvt_f64_x (svptrue_b64 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s16.c new file mode 100644 index 000000000..81761ab09 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s16.c @@ -0,0 +1,72 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_s16_f16_m_tied1: +** fcvtzs z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s16_f16_m_tied1, svint16_t, svfloat16_t, + z0 = svcvt_s16_f16_m (z0, p0, z4), + z0 = svcvt_s16_m (z0, p0, z4)) + +/* +** cvt_s16_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzs z0\.h, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s16_f16_m_tied2, svint16_t, svfloat16_t, + z0_res = svcvt_s16_f16_m (z4, p0, z0), + z0_res = svcvt_s16_m (z4, p0, z0)) + +/* +** cvt_s16_f16_m_untied: +** movprfx z0, z1 +** fcvtzs z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s16_f16_m_untied, svint16_t, svfloat16_t, + z0 = svcvt_s16_f16_m (z1, p0, z4), + z0 = svcvt_s16_m (z1, p0, z4)) + +/* +** cvt_s16_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** fcvtzs z0\.h, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s16_f16_z_tied1, svint16_t, svfloat16_t, + z0_res = svcvt_s16_f16_z (p0, z0), + z0_res = svcvt_s16_z (p0, z0)) + +/* +** cvt_s16_f16_z_untied: +** movprfx z0\.h, p0/z, z4\.h +** fcvtzs z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s16_f16_z_untied, svint16_t, svfloat16_t, + z0 = svcvt_s16_f16_z (p0, z4), + z0 = svcvt_s16_z (p0, z4)) + +/* +** cvt_s16_f16_x_tied1: +** fcvtzs z0\.h, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s16_f16_x_tied1, svint16_t, svfloat16_t, + z0_res = svcvt_s16_f16_x (p0, z0), + z0_res = svcvt_s16_x (p0, z0)) + +/* +** cvt_s16_f16_x_untied: +** fcvtzs z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s16_f16_x_untied, svint16_t, svfloat16_t, + z0 = svcvt_s16_f16_x (p0, z4), + z0 = svcvt_s16_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s32.c new file mode 100644 index 000000000..d30da5cc5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s32.c @@ -0,0 +1,210 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_s32_f16_m_tied1: +** fcvtzs z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s32_f16_m_tied1, svint32_t, svfloat16_t, + z0 = svcvt_s32_f16_m (z0, p0, z4), + z0 = svcvt_s32_m (z0, p0, z4)) + +/* +** cvt_s32_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzs z0\.s, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f16_m_tied2, svint32_t, svfloat16_t, + z0_res = svcvt_s32_f16_m (z4, p0, z0), + z0_res = svcvt_s32_m (z4, p0, z0)) + +/* +** cvt_s32_f16_m_untied: +** movprfx z0, z1 +** fcvtzs z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s32_f16_m_untied, svint32_t, svfloat16_t, + z0 = svcvt_s32_f16_m (z1, p0, z4), + z0 = svcvt_s32_m (z1, p0, z4)) + +/* +** cvt_s32_f32_m_tied1: +** fcvtzs z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_s32_f32_m_tied1, svint32_t, svfloat32_t, + z0 = svcvt_s32_f32_m (z0, p0, z4), + z0 = svcvt_s32_m (z0, p0, z4)) + +/* +** cvt_s32_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzs z0\.s, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f32_m_tied2, svint32_t, svfloat32_t, + 
z0_res = svcvt_s32_f32_m (z4, p0, z0), + z0_res = svcvt_s32_m (z4, p0, z0)) + +/* +** cvt_s32_f32_m_untied: +** movprfx z0, z1 +** fcvtzs z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_s32_f32_m_untied, svint32_t, svfloat32_t, + z0 = svcvt_s32_f32_m (z1, p0, z4), + z0 = svcvt_s32_m (z1, p0, z4)) + +/* +** cvt_s32_f64_m_tied1: +** fcvtzs z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_s32_f64_m_tied1, svint32_t, svfloat64_t, + z0 = svcvt_s32_f64_m (z0, p0, z4), + z0 = svcvt_s32_m (z0, p0, z4)) + +/* +** cvt_s32_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fcvtzs z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f64_m_tied2, svint32_t, svfloat64_t, + z0_res = svcvt_s32_f64_m (z4, p0, z0), + z0_res = svcvt_s32_m (z4, p0, z0)) + +/* +** cvt_s32_f64_m_untied: +** movprfx z0, z1 +** fcvtzs z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_s32_f64_m_untied, svint32_t, svfloat64_t, + z0 = svcvt_s32_f64_m (z1, p0, z4), + z0 = svcvt_s32_m (z1, p0, z4)) + +/* +** cvt_s32_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fcvtzs z0\.s, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f16_z_tied1, svint32_t, svfloat16_t, + z0_res = svcvt_s32_f16_z (p0, z0), + z0_res = svcvt_s32_z (p0, z0)) + +/* +** cvt_s32_f16_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** fcvtzs z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s32_f16_z_untied, svint32_t, svfloat16_t, + z0 = svcvt_s32_f16_z (p0, z4), + z0 = svcvt_s32_z (p0, z4)) + +/* +** cvt_s32_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fcvtzs z0\.s, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f32_z_tied1, svint32_t, svfloat32_t, + z0_res = svcvt_s32_f32_z (p0, z0), + z0_res = svcvt_s32_z (p0, z0)) + +/* +** cvt_s32_f32_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** fcvtzs z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_s32_f32_z_untied, svint32_t, svfloat32_t, + z0 = svcvt_s32_f32_z (p0, z4), + z0 = svcvt_s32_z (p0, z4)) + +/* +** cvt_s32_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** fcvtzs z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f64_z_tied1, svint32_t, svfloat64_t, + z0_res = svcvt_s32_f64_z (p0, z0), + z0_res = svcvt_s32_z (p0, z0)) + +/* +** cvt_s32_f64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvtzs z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_s32_f64_z_untied, svint32_t, svfloat64_t, + z0 = svcvt_s32_f64_z (p0, z4), + z0 = svcvt_s32_z (p0, z4)) + +/* +** cvt_s32_f16_x_tied1: +** fcvtzs z0\.s, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f16_x_tied1, svint32_t, svfloat16_t, + z0_res = svcvt_s32_f16_x (p0, z0), + z0_res = svcvt_s32_x (p0, z0)) + +/* +** cvt_s32_f16_x_untied: +** fcvtzs z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s32_f16_x_untied, svint32_t, svfloat16_t, + z0 = svcvt_s32_f16_x (p0, z4), + z0 = svcvt_s32_x (p0, z4)) + +/* +** cvt_s32_f32_x_tied1: +** fcvtzs z0\.s, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f32_x_tied1, svint32_t, svfloat32_t, + z0_res = svcvt_s32_f32_x (p0, z0), + z0_res = svcvt_s32_x (p0, z0)) + +/* +** cvt_s32_f32_x_untied: +** fcvtzs z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_s32_f32_x_untied, svint32_t, svfloat32_t, + z0 = svcvt_s32_f32_x (p0, z4), + z0 = svcvt_s32_x (p0, z4)) + +/* +** cvt_s32_f64_x_tied1: +** fcvtzs z0\.s, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_s32_f64_x_tied1, svint32_t, svfloat64_t, + z0_res = svcvt_s32_f64_x (p0, z0), + z0_res = svcvt_s32_x (p0, z0)) + +/* +** cvt_s32_f64_x_untied: +** fcvtzs z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z 
(cvt_s32_f64_x_untied, svint32_t, svfloat64_t, + z0 = svcvt_s32_f64_x (p0, z4), + z0 = svcvt_s32_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s64.c new file mode 100644 index 000000000..68cd80784 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_s64.c @@ -0,0 +1,210 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_s64_f16_m_tied1: +** fcvtzs z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s64_f16_m_tied1, svint64_t, svfloat16_t, + z0 = svcvt_s64_f16_m (z0, p0, z4), + z0 = svcvt_s64_m (z0, p0, z4)) + +/* +** cvt_s64_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzs z0\.d, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f16_m_tied2, svint64_t, svfloat16_t, + z0_res = svcvt_s64_f16_m (z4, p0, z0), + z0_res = svcvt_s64_m (z4, p0, z0)) + +/* +** cvt_s64_f16_m_untied: +** movprfx z0, z1 +** fcvtzs z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s64_f16_m_untied, svint64_t, svfloat16_t, + z0 = svcvt_s64_f16_m (z1, p0, z4), + z0 = svcvt_s64_m (z1, p0, z4)) + +/* +** cvt_s64_f32_m_tied1: +** fcvtzs z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_s64_f32_m_tied1, svint64_t, svfloat32_t, + z0 = svcvt_s64_f32_m (z0, p0, z4), + z0 = svcvt_s64_m (z0, p0, z4)) + +/* +** cvt_s64_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzs z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f32_m_tied2, svint64_t, svfloat32_t, + z0_res = svcvt_s64_f32_m (z4, p0, z0), + z0_res = svcvt_s64_m (z4, p0, z0)) + +/* +** cvt_s64_f32_m_untied: +** movprfx z0, z1 +** fcvtzs z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_s64_f32_m_untied, svint64_t, svfloat32_t, + z0 = svcvt_s64_f32_m (z1, p0, z4), + z0 = svcvt_s64_m (z1, p0, z4)) + +/* +** cvt_s64_f64_m_tied1: +** fcvtzs z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_s64_f64_m_tied1, svint64_t, svfloat64_t, + z0 = svcvt_s64_f64_m (z0, p0, z4), + z0 = svcvt_s64_m (z0, p0, z4)) + +/* +** cvt_s64_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fcvtzs z0\.d, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f64_m_tied2, svint64_t, svfloat64_t, + z0_res = svcvt_s64_f64_m (z4, p0, z0), + z0_res = svcvt_s64_m (z4, p0, z0)) + +/* +** cvt_s64_f64_m_untied: +** movprfx z0, z1 +** fcvtzs z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_s64_f64_m_untied, svint64_t, svfloat64_t, + z0 = svcvt_s64_f64_m (z1, p0, z4), + z0 = svcvt_s64_m (z1, p0, z4)) + +/* +** cvt_s64_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.d, p0/z, \1\.d +** fcvtzs z0\.d, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f16_z_tied1, svint64_t, svfloat16_t, + z0_res = svcvt_s64_f16_z (p0, z0), + z0_res = svcvt_s64_z (p0, z0)) + +/* +** cvt_s64_f16_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvtzs z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s64_f16_z_untied, svint64_t, svfloat16_t, + z0 = svcvt_s64_f16_z (p0, z4), + z0 = svcvt_s64_z (p0, z4)) + +/* +** cvt_s64_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.d, p0/z, \1\.d +** fcvtzs z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f32_z_tied1, svint64_t, svfloat32_t, + z0_res = svcvt_s64_f32_z (p0, z0), + z0_res = svcvt_s64_z (p0, z0)) + +/* +** cvt_s64_f32_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvtzs z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_s64_f32_z_untied, svint64_t, svfloat32_t, + z0 = svcvt_s64_f32_z (p0, z4), + z0 = svcvt_s64_z (p0, z4)) + +/* +** cvt_s64_f64_z_tied1: +** mov 
(z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** fcvtzs z0\.d, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f64_z_tied1, svint64_t, svfloat64_t, + z0_res = svcvt_s64_f64_z (p0, z0), + z0_res = svcvt_s64_z (p0, z0)) + +/* +** cvt_s64_f64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvtzs z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_s64_f64_z_untied, svint64_t, svfloat64_t, + z0 = svcvt_s64_f64_z (p0, z4), + z0 = svcvt_s64_z (p0, z4)) + +/* +** cvt_s64_f16_x_tied1: +** fcvtzs z0\.d, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f16_x_tied1, svint64_t, svfloat16_t, + z0_res = svcvt_s64_f16_x (p0, z0), + z0_res = svcvt_s64_x (p0, z0)) + +/* +** cvt_s64_f16_x_untied: +** fcvtzs z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_s64_f16_x_untied, svint64_t, svfloat16_t, + z0 = svcvt_s64_f16_x (p0, z4), + z0 = svcvt_s64_x (p0, z4)) + +/* +** cvt_s64_f32_x_tied1: +** fcvtzs z0\.d, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f32_x_tied1, svint64_t, svfloat32_t, + z0_res = svcvt_s64_f32_x (p0, z0), + z0_res = svcvt_s64_x (p0, z0)) + +/* +** cvt_s64_f32_x_untied: +** fcvtzs z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_s64_f32_x_untied, svint64_t, svfloat32_t, + z0 = svcvt_s64_f32_x (p0, z4), + z0 = svcvt_s64_x (p0, z4)) + +/* +** cvt_s64_f64_x_tied1: +** fcvtzs z0\.d, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_s64_f64_x_tied1, svint64_t, svfloat64_t, + z0_res = svcvt_s64_f64_x (p0, z0), + z0_res = svcvt_s64_x (p0, z0)) + +/* +** cvt_s64_f64_x_untied: +** fcvtzs z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_s64_f64_x_untied, svint64_t, svfloat64_t, + z0 = svcvt_s64_f64_x (p0, z4), + z0 = svcvt_s64_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u16.c new file mode 100644 index 000000000..4db0dffdd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u16.c @@ -0,0 +1,72 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_u16_f16_m_tied1: +** fcvtzu z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u16_f16_m_tied1, svuint16_t, svfloat16_t, + z0 = svcvt_u16_f16_m (z0, p0, z4), + z0 = svcvt_u16_m (z0, p0, z4)) + +/* +** cvt_u16_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzu z0\.h, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u16_f16_m_tied2, svuint16_t, svfloat16_t, + z0_res = svcvt_u16_f16_m (z4, p0, z0), + z0_res = svcvt_u16_m (z4, p0, z0)) + +/* +** cvt_u16_f16_m_untied: +** movprfx z0, z1 +** fcvtzu z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u16_f16_m_untied, svuint16_t, svfloat16_t, + z0 = svcvt_u16_f16_m (z1, p0, z4), + z0 = svcvt_u16_m (z1, p0, z4)) + +/* +** cvt_u16_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** fcvtzu z0\.h, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u16_f16_z_tied1, svuint16_t, svfloat16_t, + z0_res = svcvt_u16_f16_z (p0, z0), + z0_res = svcvt_u16_z (p0, z0)) + +/* +** cvt_u16_f16_z_untied: +** movprfx z0\.h, p0/z, z4\.h +** fcvtzu z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u16_f16_z_untied, svuint16_t, svfloat16_t, + z0 = svcvt_u16_f16_z (p0, z4), + z0 = svcvt_u16_z (p0, z4)) + +/* +** cvt_u16_f16_x_tied1: +** fcvtzu z0\.h, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u16_f16_x_tied1, svuint16_t, svfloat16_t, + z0_res = svcvt_u16_f16_x (p0, z0), + z0_res = svcvt_u16_x (p0, z0)) + +/* +** cvt_u16_f16_x_untied: +** fcvtzu z0\.h, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u16_f16_x_untied, svuint16_t, svfloat16_t, + z0 = 
svcvt_u16_f16_x (p0, z4), + z0 = svcvt_u16_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u32.c new file mode 100644 index 000000000..52ef49fcf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u32.c @@ -0,0 +1,210 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_u32_f16_m_tied1: +** fcvtzu z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u32_f16_m_tied1, svuint32_t, svfloat16_t, + z0 = svcvt_u32_f16_m (z0, p0, z4), + z0 = svcvt_u32_m (z0, p0, z4)) + +/* +** cvt_u32_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzu z0\.s, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f16_m_tied2, svuint32_t, svfloat16_t, + z0_res = svcvt_u32_f16_m (z4, p0, z0), + z0_res = svcvt_u32_m (z4, p0, z0)) + +/* +** cvt_u32_f16_m_untied: +** movprfx z0, z1 +** fcvtzu z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u32_f16_m_untied, svuint32_t, svfloat16_t, + z0 = svcvt_u32_f16_m (z1, p0, z4), + z0 = svcvt_u32_m (z1, p0, z4)) + +/* +** cvt_u32_f32_m_tied1: +** fcvtzu z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_u32_f32_m_tied1, svuint32_t, svfloat32_t, + z0 = svcvt_u32_f32_m (z0, p0, z4), + z0 = svcvt_u32_m (z0, p0, z4)) + +/* +** cvt_u32_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzu z0\.s, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f32_m_tied2, svuint32_t, svfloat32_t, + z0_res = svcvt_u32_f32_m (z4, p0, z0), + z0_res = svcvt_u32_m (z4, p0, z0)) + +/* +** cvt_u32_f32_m_untied: +** movprfx z0, z1 +** fcvtzu z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_u32_f32_m_untied, svuint32_t, svfloat32_t, + z0 = svcvt_u32_f32_m (z1, p0, z4), + z0 = svcvt_u32_m (z1, p0, z4)) + +/* +** cvt_u32_f64_m_tied1: +** fcvtzu z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_u32_f64_m_tied1, svuint32_t, svfloat64_t, + z0 = svcvt_u32_f64_m (z0, p0, z4), + z0 = svcvt_u32_m (z0, p0, z4)) + +/* +** cvt_u32_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fcvtzu z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f64_m_tied2, svuint32_t, svfloat64_t, + z0_res = svcvt_u32_f64_m (z4, p0, z0), + z0_res = svcvt_u32_m (z4, p0, z0)) + +/* +** cvt_u32_f64_m_untied: +** movprfx z0, z1 +** fcvtzu z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_u32_f64_m_untied, svuint32_t, svfloat64_t, + z0 = svcvt_u32_f64_m (z1, p0, z4), + z0 = svcvt_u32_m (z1, p0, z4)) + +/* +** cvt_u32_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fcvtzu z0\.s, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f16_z_tied1, svuint32_t, svfloat16_t, + z0_res = svcvt_u32_f16_z (p0, z0), + z0_res = svcvt_u32_z (p0, z0)) + +/* +** cvt_u32_f16_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** fcvtzu z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u32_f16_z_untied, svuint32_t, svfloat16_t, + z0 = svcvt_u32_f16_z (p0, z4), + z0 = svcvt_u32_z (p0, z4)) + +/* +** cvt_u32_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fcvtzu z0\.s, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f32_z_tied1, svuint32_t, svfloat32_t, + z0_res = svcvt_u32_f32_z (p0, z0), + z0_res = svcvt_u32_z (p0, z0)) + +/* +** cvt_u32_f32_z_untied: +** movprfx z0\.s, p0/z, z4\.s +** fcvtzu z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_u32_f32_z_untied, svuint32_t, svfloat32_t, + z0 = svcvt_u32_f32_z (p0, z4), + z0 = svcvt_u32_z (p0, z4)) + +/* +** cvt_u32_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, 
p0/z, \1 +** fcvtzu z0\.s, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f64_z_tied1, svuint32_t, svfloat64_t, + z0_res = svcvt_u32_f64_z (p0, z0), + z0_res = svcvt_u32_z (p0, z0)) + +/* +** cvt_u32_f64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvtzu z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_u32_f64_z_untied, svuint32_t, svfloat64_t, + z0 = svcvt_u32_f64_z (p0, z4), + z0 = svcvt_u32_z (p0, z4)) + +/* +** cvt_u32_f16_x_tied1: +** fcvtzu z0\.s, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f16_x_tied1, svuint32_t, svfloat16_t, + z0_res = svcvt_u32_f16_x (p0, z0), + z0_res = svcvt_u32_x (p0, z0)) + +/* +** cvt_u32_f16_x_untied: +** fcvtzu z0\.s, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u32_f16_x_untied, svuint32_t, svfloat16_t, + z0 = svcvt_u32_f16_x (p0, z4), + z0 = svcvt_u32_x (p0, z4)) + +/* +** cvt_u32_f32_x_tied1: +** fcvtzu z0\.s, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f32_x_tied1, svuint32_t, svfloat32_t, + z0_res = svcvt_u32_f32_x (p0, z0), + z0_res = svcvt_u32_x (p0, z0)) + +/* +** cvt_u32_f32_x_untied: +** fcvtzu z0\.s, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_u32_f32_x_untied, svuint32_t, svfloat32_t, + z0 = svcvt_u32_f32_x (p0, z4), + z0 = svcvt_u32_x (p0, z4)) + +/* +** cvt_u32_f64_x_tied1: +** fcvtzu z0\.s, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_u32_f64_x_tied1, svuint32_t, svfloat64_t, + z0_res = svcvt_u32_f64_x (p0, z0), + z0_res = svcvt_u32_x (p0, z0)) + +/* +** cvt_u32_f64_x_untied: +** fcvtzu z0\.s, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_u32_f64_x_untied, svuint32_t, svfloat64_t, + z0 = svcvt_u32_f64_x (p0, z4), + z0 = svcvt_u32_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u64.c new file mode 100644 index 000000000..0c43758ae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvt_u64.c @@ -0,0 +1,210 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvt_u64_f16_m_tied1: +** fcvtzu z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u64_f16_m_tied1, svuint64_t, svfloat16_t, + z0 = svcvt_u64_f16_m (z0, p0, z4), + z0 = svcvt_u64_m (z0, p0, z4)) + +/* +** cvt_u64_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzu z0\.d, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f16_m_tied2, svuint64_t, svfloat16_t, + z0_res = svcvt_u64_f16_m (z4, p0, z0), + z0_res = svcvt_u64_m (z4, p0, z0)) + +/* +** cvt_u64_f16_m_untied: +** movprfx z0, z1 +** fcvtzu z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u64_f16_m_untied, svuint64_t, svfloat16_t, + z0 = svcvt_u64_f16_m (z1, p0, z4), + z0 = svcvt_u64_m (z1, p0, z4)) + +/* +** cvt_u64_f32_m_tied1: +** fcvtzu z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_u64_f32_m_tied1, svuint64_t, svfloat32_t, + z0 = svcvt_u64_f32_m (z0, p0, z4), + z0 = svcvt_u64_m (z0, p0, z4)) + +/* +** cvt_u64_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fcvtzu z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f32_m_tied2, svuint64_t, svfloat32_t, + z0_res = svcvt_u64_f32_m (z4, p0, z0), + z0_res = svcvt_u64_m (z4, p0, z0)) + +/* +** cvt_u64_f32_m_untied: +** movprfx z0, z1 +** fcvtzu z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_u64_f32_m_untied, svuint64_t, svfloat32_t, + z0 = svcvt_u64_f32_m (z1, p0, z4), + z0 = svcvt_u64_m (z1, p0, z4)) + +/* +** cvt_u64_f64_m_tied1: +** fcvtzu z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_u64_f64_m_tied1, svuint64_t, svfloat64_t, + z0 = svcvt_u64_f64_m (z0, p0, z4), + z0 = 
svcvt_u64_m (z0, p0, z4)) + +/* +** cvt_u64_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fcvtzu z0\.d, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f64_m_tied2, svuint64_t, svfloat64_t, + z0_res = svcvt_u64_f64_m (z4, p0, z0), + z0_res = svcvt_u64_m (z4, p0, z0)) + +/* +** cvt_u64_f64_m_untied: +** movprfx z0, z1 +** fcvtzu z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_u64_f64_m_untied, svuint64_t, svfloat64_t, + z0 = svcvt_u64_f64_m (z1, p0, z4), + z0 = svcvt_u64_m (z1, p0, z4)) + +/* +** cvt_u64_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.d, p0/z, \1\.d +** fcvtzu z0\.d, p0/m, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f16_z_tied1, svuint64_t, svfloat16_t, + z0_res = svcvt_u64_f16_z (p0, z0), + z0_res = svcvt_u64_z (p0, z0)) + +/* +** cvt_u64_f16_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvtzu z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u64_f16_z_untied, svuint64_t, svfloat16_t, + z0 = svcvt_u64_f16_z (p0, z4), + z0 = svcvt_u64_z (p0, z4)) + +/* +** cvt_u64_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.d, p0/z, \1\.d +** fcvtzu z0\.d, p0/m, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f32_z_tied1, svuint64_t, svfloat32_t, + z0_res = svcvt_u64_f32_z (p0, z0), + z0_res = svcvt_u64_z (p0, z0)) + +/* +** cvt_u64_f32_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvtzu z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_u64_f32_z_untied, svuint64_t, svfloat32_t, + z0 = svcvt_u64_f32_z (p0, z4), + z0 = svcvt_u64_z (p0, z4)) + +/* +** cvt_u64_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** fcvtzu z0\.d, p0/m, \1 +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f64_z_tied1, svuint64_t, svfloat64_t, + z0_res = svcvt_u64_f64_z (p0, z0), + z0_res = svcvt_u64_z (p0, z0)) + +/* +** cvt_u64_f64_z_untied: +** movprfx z0\.d, p0/z, z4\.d +** fcvtzu z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_u64_f64_z_untied, svuint64_t, svfloat64_t, + z0 = svcvt_u64_f64_z (p0, z4), + z0 = svcvt_u64_z (p0, z4)) + +/* +** cvt_u64_f16_x_tied1: +** fcvtzu z0\.d, p0/m, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f16_x_tied1, svuint64_t, svfloat16_t, + z0_res = svcvt_u64_f16_x (p0, z0), + z0_res = svcvt_u64_x (p0, z0)) + +/* +** cvt_u64_f16_x_untied: +** fcvtzu z0\.d, p0/m, z4\.h +** ret +*/ +TEST_DUAL_Z (cvt_u64_f16_x_untied, svuint64_t, svfloat16_t, + z0 = svcvt_u64_f16_x (p0, z4), + z0 = svcvt_u64_x (p0, z4)) + +/* +** cvt_u64_f32_x_tied1: +** fcvtzu z0\.d, p0/m, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f32_x_tied1, svuint64_t, svfloat32_t, + z0_res = svcvt_u64_f32_x (p0, z0), + z0_res = svcvt_u64_x (p0, z0)) + +/* +** cvt_u64_f32_x_untied: +** fcvtzu z0\.d, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvt_u64_f32_x_untied, svuint64_t, svfloat32_t, + z0 = svcvt_u64_f32_x (p0, z4), + z0 = svcvt_u64_x (p0, z4)) + +/* +** cvt_u64_f64_x_tied1: +** fcvtzu z0\.d, p0/m, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (cvt_u64_f64_x_tied1, svuint64_t, svfloat64_t, + z0_res = svcvt_u64_f64_x (p0, z0), + z0_res = svcvt_u64_x (p0, z0)) + +/* +** cvt_u64_f64_x_untied: +** fcvtzu z0\.d, p0/m, z4\.d +** ret +*/ +TEST_DUAL_Z (cvt_u64_f64_x_untied, svuint64_t, svfloat64_t, + z0 = svcvt_u64_f64_x (p0, z4), + z0 = svcvt_u64_x (p0, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvtnt_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvtnt_bf16.c new file mode 100644 index 000000000..54614c95d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cvtnt_bf16.c @@ -0,0 +1,90 @@ +/* { dg-additional-options "-march=armv8.2-a+sve+bf16" } */ +/* { 
dg-require-effective-target aarch64_asm_bf16_ok } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** cvtnt_bf16_f32_m_tied1: +** bfcvtnt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvtnt_bf16_f32_m_tied1, svbfloat16_t, svfloat32_t, + z0 = svcvtnt_bf16_f32_m (z0, p0, z4), + z0 = svcvtnt_bf16_m (z0, p0, z4)) + +/* Bad RA choice: no preferred output sequence. */ +TEST_DUAL_Z_REV (cvtnt_bf16_f32_m_tied2, svbfloat16_t, svfloat32_t, + z0_res = svcvtnt_bf16_f32_m (z4, p0, z0), + z0_res = svcvtnt_bf16_m (z4, p0, z0)) + +/* +** cvtnt_bf16_f32_m_untied: +** ( +** mov z0\.d, z1\.d +** bfcvtnt z0\.h, p0/m, z4\.s +** | +** bfcvtnt z1\.h, p0/m, z4\.s +** mov z0\.d, z1\.d +** ) +** ret +*/ +TEST_DUAL_Z (cvtnt_bf16_f32_m_untied, svbfloat16_t, svfloat32_t, + z0 = svcvtnt_bf16_f32_m (z1, p0, z4), + z0 = svcvtnt_bf16_m (z1, p0, z4)) + +/* +** cvtnt_bf16_f32_x_tied1: +** bfcvtnt z0\.h, p0/m, z4\.s +** ret +*/ +TEST_DUAL_Z (cvtnt_bf16_f32_x_tied1, svbfloat16_t, svfloat32_t, + z0 = svcvtnt_bf16_f32_x (z0, p0, z4), + z0 = svcvtnt_bf16_x (z0, p0, z4)) + +/* Bad RA choice: no preferred output sequence. */ +TEST_DUAL_Z_REV (cvtnt_bf16_f32_x_tied2, svbfloat16_t, svfloat32_t, + z0_res = svcvtnt_bf16_f32_x (z4, p0, z0), + z0_res = svcvtnt_bf16_x (z4, p0, z0)) + +/* +** cvtnt_bf16_f32_x_untied: +** ( +** mov z0\.d, z1\.d +** bfcvtnt z0\.h, p0/m, z4\.s +** | +** bfcvtnt z1\.h, p0/m, z4\.s +** mov z0\.d, z1\.d +** ) +** ret +*/ +TEST_DUAL_Z (cvtnt_bf16_f32_x_untied, svbfloat16_t, svfloat32_t, + z0 = svcvtnt_bf16_f32_x (z1, p0, z4), + z0 = svcvtnt_bf16_x (z1, p0, z4)) + +/* +** ptrue_cvtnt_bf16_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_cvtnt_bf16_f32_x_tied1, svbfloat16_t, svfloat32_t, + z0 = svcvtnt_bf16_f32_x (z0, svptrue_b32 (), z4), + z0 = svcvtnt_bf16_x (z0, svptrue_b32 (), z4)) + +/* Bad RA choice: no preferred output sequence. */ +TEST_DUAL_Z_REV (ptrue_cvtnt_bf16_f32_x_tied2, svbfloat16_t, svfloat32_t, + z0_res = svcvtnt_bf16_f32_x (z4, svptrue_b32 (), z0), + z0_res = svcvtnt_bf16_x (z4, svptrue_b32 (), z0)) + +/* +** ptrue_cvtnt_bf16_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_DUAL_Z (ptrue_cvtnt_bf16_f32_x_untied, svbfloat16_t, svfloat32_t, + z0 = svcvtnt_bf16_f32_x (z1, svptrue_b32 (), z4), + z0 = svcvtnt_bf16_x (z1, svptrue_b32 (), z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f16.c new file mode 100644 index 000000000..35f5c1589 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f16.c @@ -0,0 +1,303 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** div_f16_m_tied1: +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (div_f16_m_tied1, svfloat16_t, + z0 = svdiv_f16_m (p0, z0, z1), + z0 = svdiv_m (p0, z0, z1)) + +/* +** div_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fdiv z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (div_f16_m_tied2, svfloat16_t, + z0 = svdiv_f16_m (p0, z1, z0), + z0 = svdiv_m (p0, z1, z0)) + +/* +** div_f16_m_untied: +** movprfx z0, z1 +** fdiv z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (div_f16_m_untied, svfloat16_t, + z0 = svdiv_f16_m (p0, z1, z2), + z0 = svdiv_m (p0, z1, z2)) + +/* +** div_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svdiv_n_f16_m (p0, z0, d4), + z0 = svdiv_m (p0, z0, d4)) + +/* +** div_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svdiv_n_f16_m (p0, z1, d4), + z0 = svdiv_m (p0, z1, d4)) + +/* +** div_1_f16_m_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f16_m_tied1, svfloat16_t, + z0 = svdiv_n_f16_m (p0, z0, 1), + z0 = svdiv_m (p0, z0, 1)) + +/* +** div_1_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** movprfx z0, z1 +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f16_m_untied, svfloat16_t, + z0 = svdiv_n_f16_m (p0, z1, 1), + z0 = svdiv_m (p0, z1, 1)) + +/* +** div_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (div_f16_z_tied1, svfloat16_t, + z0 = svdiv_f16_z (p0, z0, z1), + z0 = svdiv_z (p0, z0, z1)) + +/* +** div_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (div_f16_z_tied2, svfloat16_t, + z0 = svdiv_f16_z (p0, z1, z0), + z0 = svdiv_z (p0, z1, z0)) + +/* +** div_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fdiv z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (div_f16_z_untied, svfloat16_t, + z0 = svdiv_f16_z (p0, z1, z2), + z0 = svdiv_z (p0, z1, z2)) + +/* +** div_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svdiv_n_f16_z (p0, z0, d4), + z0 = svdiv_z (p0, z0, d4)) + +/* +** div_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fdiv z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (div_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svdiv_n_f16_z (p0, z1, d4), + z0 = svdiv_z (p0, z1, d4)) + +/* +** div_1_f16_z_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f16_z_tied1, svfloat16_t, + z0 = svdiv_n_f16_z (p0, z0, 1), + z0 = svdiv_z (p0, z0, 1)) + +/* +** div_1_f16_z_untied: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** ( +** movprfx z0\.h, p0/z, z1\.h +** fdiv z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (div_1_f16_z_untied, svfloat16_t, + z0 = svdiv_n_f16_z (p0, z1, 1), + z0 = svdiv_z (p0, z1, 1)) + +/* +** div_0p5_f16_z: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** movprfx z0\.h, p0/z, z0\.h +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (div_0p5_f16_z, svfloat16_t, + z0 = svdiv_n_f16_z (p0, z0, 0.5), + z0 = svdiv_z (p0, z0, 0.5)) + +/* +** div_f16_x_tied1: +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (div_f16_x_tied1, svfloat16_t, + z0 = svdiv_f16_x (p0, z0, z1), + z0 = svdiv_x (p0, z0, z1)) + +/* +** div_f16_x_tied2: +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (div_f16_x_tied2, svfloat16_t, + z0 = svdiv_f16_x (p0, z1, z0), + z0 = svdiv_x (p0, z1, z0)) + +/* +** div_f16_x_untied: +** ( +** movprfx z0, z1 +** fdiv z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (div_f16_x_untied, svfloat16_t, + z0 = svdiv_f16_x (p0, z1, z2), + z0 = svdiv_x (p0, z1, z2)) + +/* +** div_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svdiv_n_f16_x (p0, z0, d4), + z0 = svdiv_x (p0, z0, d4)) + +/* +** div_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (div_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svdiv_n_f16_x (p0, z1, d4), + z0 = svdiv_x (p0, z1, d4)) + +/* +** div_1_f16_x_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fdiv z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f16_x_tied1, svfloat16_t, + z0 = svdiv_n_f16_x (p0, z0, 1), + z0 = svdiv_x (p0, z0, 1)) + +/* +** div_1_f16_x_untied: +** fmov z0\.h, #1\.0(?:e\+0)? +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (div_1_f16_x_untied, svfloat16_t, + z0 = svdiv_n_f16_x (p0, z1, 1), + z0 = svdiv_x (p0, z1, 1)) + +/* +** ptrue_div_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f16_x_tied1, svfloat16_t, + z0 = svdiv_f16_x (svptrue_b16 (), z0, z1), + z0 = svdiv_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_div_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f16_x_tied2, svfloat16_t, + z0 = svdiv_f16_x (svptrue_b16 (), z1, z0), + z0 = svdiv_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_div_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f16_x_untied, svfloat16_t, + z0 = svdiv_f16_x (svptrue_b16 (), z1, z2), + z0 = svdiv_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_div_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_1_f16_x_tied1, svfloat16_t, + z0 = svdiv_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svdiv_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_div_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_div_1_f16_x_untied, svfloat16_t, + z0 = svdiv_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svdiv_x (svptrue_b16 (), z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f32.c new file mode 100644 index 000000000..40cc203da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f32.c @@ -0,0 +1,303 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** div_f32_m_tied1: +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_f32_m_tied1, svfloat32_t, + z0 = svdiv_f32_m (p0, z0, z1), + z0 = svdiv_m (p0, z0, z1)) + +/* +** div_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fdiv z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (div_f32_m_tied2, svfloat32_t, + z0 = svdiv_f32_m (p0, z1, z0), + z0 = svdiv_m (p0, z1, z0)) + +/* +** div_f32_m_untied: +** movprfx z0, z1 +** fdiv z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (div_f32_m_untied, svfloat32_t, + z0 = svdiv_f32_m (p0, z1, z2), + z0 = svdiv_m (p0, z1, z2)) + +/* +** div_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_s4_f32_m_tied1, svfloat32_t, float, + z0 = svdiv_n_f32_m (p0, z0, d4), + z0 = svdiv_m (p0, z0, d4)) + +/* +** div_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_s4_f32_m_untied, svfloat32_t, float, + z0 = svdiv_n_f32_m (p0, z1, d4), + z0 = svdiv_m (p0, z1, d4)) + +/* +** div_1_f32_m_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f32_m_tied1, svfloat32_t, + z0 = svdiv_n_f32_m (p0, z0, 1), + z0 = svdiv_m (p0, z0, 1)) + +/* +** div_1_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** movprfx z0, z1 +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f32_m_untied, svfloat32_t, + z0 = svdiv_n_f32_m (p0, z1, 1), + z0 = svdiv_m (p0, z1, 1)) + +/* +** div_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_f32_z_tied1, svfloat32_t, + z0 = svdiv_f32_z (p0, z0, z1), + z0 = svdiv_z (p0, z0, z1)) + +/* +** div_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_f32_z_tied2, svfloat32_t, + z0 = svdiv_f32_z (p0, z1, z0), + z0 = svdiv_z (p0, z1, z0)) + +/* +** div_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fdiv z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_f32_z_untied, svfloat32_t, + z0 = svdiv_f32_z (p0, z1, z2), + z0 = svdiv_z (p0, z1, z2)) + +/* +** div_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_s4_f32_z_tied1, svfloat32_t, float, + z0 = svdiv_n_f32_z (p0, z0, d4), + z0 = svdiv_z (p0, z0, d4)) + +/* +** div_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fdiv z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (div_s4_f32_z_untied, svfloat32_t, float, + z0 = svdiv_n_f32_z (p0, z1, d4), + z0 = svdiv_z (p0, z1, d4)) + +/* +** div_1_f32_z_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f32_z_tied1, svfloat32_t, + z0 = svdiv_n_f32_z (p0, z0, 1), + z0 = svdiv_z (p0, z0, 1)) + +/* +** div_1_f32_z_untied: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** ( +** movprfx z0\.s, p0/z, z1\.s +** fdiv z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_1_f32_z_untied, svfloat32_t, + z0 = svdiv_n_f32_z (p0, z1, 1), + z0 = svdiv_z (p0, z1, 1)) + +/* +** div_0p5_f32_z: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** movprfx z0\.s, p0/z, z0\.s +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_0p5_f32_z, svfloat32_t, + z0 = svdiv_n_f32_z (p0, z0, 0.5), + z0 = svdiv_z (p0, z0, 0.5)) + +/* +** div_f32_x_tied1: +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_f32_x_tied1, svfloat32_t, + z0 = svdiv_f32_x (p0, z0, z1), + z0 = svdiv_x (p0, z0, z1)) + +/* +** div_f32_x_tied2: +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_f32_x_tied2, svfloat32_t, + z0 = svdiv_f32_x (p0, z1, z0), + z0 = svdiv_x (p0, z1, z0)) + +/* +** div_f32_x_untied: +** ( +** movprfx z0, z1 +** fdiv z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_f32_x_untied, svfloat32_t, + z0 = svdiv_f32_x (p0, z1, z2), + z0 = svdiv_x (p0, z1, z2)) + +/* +** div_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_s4_f32_x_tied1, svfloat32_t, float, + z0 = svdiv_n_f32_x (p0, z0, d4), + z0 = svdiv_x (p0, z0, d4)) + +/* +** div_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (div_s4_f32_x_untied, svfloat32_t, float, + z0 = svdiv_n_f32_x (p0, z1, d4), + z0 = svdiv_x (p0, z1, d4)) + +/* +** div_1_f32_x_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f32_x_tied1, svfloat32_t, + z0 = svdiv_n_f32_x (p0, z0, 1), + z0 = svdiv_x (p0, z0, 1)) + +/* +** div_1_f32_x_untied: +** fmov z0\.s, #1\.0(?:e\+0)? +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_1_f32_x_untied, svfloat32_t, + z0 = svdiv_n_f32_x (p0, z1, 1), + z0 = svdiv_x (p0, z1, 1)) + +/* +** ptrue_div_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f32_x_tied1, svfloat32_t, + z0 = svdiv_f32_x (svptrue_b32 (), z0, z1), + z0 = svdiv_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_div_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f32_x_tied2, svfloat32_t, + z0 = svdiv_f32_x (svptrue_b32 (), z1, z0), + z0 = svdiv_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_div_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f32_x_untied, svfloat32_t, + z0 = svdiv_f32_x (svptrue_b32 (), z1, z2), + z0 = svdiv_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_div_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_1_f32_x_tied1, svfloat32_t, + z0 = svdiv_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svdiv_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_div_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_div_1_f32_x_untied, svfloat32_t, + z0 = svdiv_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svdiv_x (svptrue_b32 (), z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f64.c new file mode 100644 index 000000000..56acbbe95 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_f64.c @@ -0,0 +1,303 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** div_f64_m_tied1: +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_f64_m_tied1, svfloat64_t, + z0 = svdiv_f64_m (p0, z0, z1), + z0 = svdiv_m (p0, z0, z1)) + +/* +** div_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_f64_m_tied2, svfloat64_t, + z0 = svdiv_f64_m (p0, z1, z0), + z0 = svdiv_m (p0, z1, z0)) + +/* +** div_f64_m_untied: +** movprfx z0, z1 +** fdiv z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (div_f64_m_untied, svfloat64_t, + z0 = svdiv_f64_m (p0, z1, z2), + z0 = svdiv_m (p0, z1, z2)) + +/* +** div_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_d4_f64_m_tied1, svfloat64_t, double, + z0 = svdiv_n_f64_m (p0, z0, d4), + z0 = svdiv_m (p0, z0, d4)) + +/* +** div_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_d4_f64_m_untied, svfloat64_t, double, + z0 = svdiv_n_f64_m (p0, z1, d4), + z0 = svdiv_m (p0, z1, d4)) + +/* +** div_1_f64_m_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f64_m_tied1, svfloat64_t, + z0 = svdiv_n_f64_m (p0, z0, 1), + z0 = svdiv_m (p0, z0, 1)) + +/* +** div_1_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** movprfx z0, z1 +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f64_m_untied, svfloat64_t, + z0 = svdiv_n_f64_m (p0, z1, 1), + z0 = svdiv_m (p0, z1, 1)) + +/* +** div_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_f64_z_tied1, svfloat64_t, + z0 = svdiv_f64_z (p0, z0, z1), + z0 = svdiv_z (p0, z0, z1)) + +/* +** div_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_f64_z_tied2, svfloat64_t, + z0 = svdiv_f64_z (p0, z1, z0), + z0 = svdiv_z (p0, z1, z0)) + +/* +** div_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fdiv z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_f64_z_untied, svfloat64_t, + z0 = svdiv_f64_z (p0, z1, z2), + z0 = svdiv_z (p0, z1, z2)) + +/* +** div_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_d4_f64_z_tied1, svfloat64_t, double, + z0 = svdiv_n_f64_z (p0, z0, d4), + z0 = svdiv_z (p0, z0, d4)) + +/* +** div_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fdiv z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (div_d4_f64_z_untied, svfloat64_t, double, + z0 = svdiv_n_f64_z (p0, z1, d4), + z0 = svdiv_z (p0, z1, d4)) + +/* +** div_1_f64_z_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f64_z_tied1, svfloat64_t, + z0 = svdiv_n_f64_z (p0, z0, 1), + z0 = svdiv_z (p0, z0, 1)) + +/* +** div_1_f64_z_untied: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** ( +** movprfx z0\.d, p0/z, z1\.d +** fdiv z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_1_f64_z_untied, svfloat64_t, + z0 = svdiv_n_f64_z (p0, z1, 1), + z0 = svdiv_z (p0, z1, 1)) + +/* +** div_0p5_f64_z: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** movprfx z0\.d, p0/z, z0\.d +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_0p5_f64_z, svfloat64_t, + z0 = svdiv_n_f64_z (p0, z0, 0.5), + z0 = svdiv_z (p0, z0, 0.5)) + +/* +** div_f64_x_tied1: +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_f64_x_tied1, svfloat64_t, + z0 = svdiv_f64_x (p0, z0, z1), + z0 = svdiv_x (p0, z0, z1)) + +/* +** div_f64_x_tied2: +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_f64_x_tied2, svfloat64_t, + z0 = svdiv_f64_x (p0, z1, z0), + z0 = svdiv_x (p0, z1, z0)) + +/* +** div_f64_x_untied: +** ( +** movprfx z0, z1 +** fdiv z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_f64_x_untied, svfloat64_t, + z0 = svdiv_f64_x (p0, z1, z2), + z0 = svdiv_x (p0, z1, z2)) + +/* +** div_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (div_d4_f64_x_tied1, svfloat64_t, double, + z0 = svdiv_n_f64_x (p0, z0, d4), + z0 = svdiv_x (p0, z0, d4)) + +/* +** div_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (div_d4_f64_x_untied, svfloat64_t, double, + z0 = svdiv_n_f64_x (p0, z1, d4), + z0 = svdiv_x (p0, z1, d4)) + +/* +** div_1_f64_x_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_1_f64_x_tied1, svfloat64_t, + z0 = svdiv_n_f64_x (p0, z0, 1), + z0 = svdiv_x (p0, z0, 1)) + +/* +** div_1_f64_x_untied: +** fmov z0\.d, #1\.0(?:e\+0)? +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_1_f64_x_untied, svfloat64_t, + z0 = svdiv_n_f64_x (p0, z1, 1), + z0 = svdiv_x (p0, z1, 1)) + +/* +** ptrue_div_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f64_x_tied1, svfloat64_t, + z0 = svdiv_f64_x (svptrue_b64 (), z0, z1), + z0 = svdiv_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_div_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f64_x_tied2, svfloat64_t, + z0 = svdiv_f64_x (svptrue_b64 (), z1, z0), + z0 = svdiv_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_div_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_f64_x_untied, svfloat64_t, + z0 = svdiv_f64_x (svptrue_b64 (), z1, z2), + z0 = svdiv_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_div_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_div_1_f64_x_tied1, svfloat64_t, + z0 = svdiv_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svdiv_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_div_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_div_1_f64_x_untied, svfloat64_t, + z0 = svdiv_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svdiv_x (svptrue_b64 (), z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c new file mode 100644 index 000000000..8e70ae797 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** div_s32_m_tied1: +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_s32_m_tied1, svint32_t, + z0 = svdiv_s32_m (p0, z0, z1), + z0 = svdiv_m (p0, z0, z1)) + +/* +** div_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sdiv z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (div_s32_m_tied2, svint32_t, + z0 = svdiv_s32_m (p0, z1, z0), + z0 = svdiv_m (p0, z1, z0)) + +/* +** div_s32_m_untied: +** movprfx z0, z1 +** sdiv z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (div_s32_m_untied, svint32_t, + z0 = svdiv_s32_m (p0, z1, z2), + z0 = svdiv_m (p0, z1, z2)) + +/* +** div_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** sdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svdiv_n_s32_m (p0, z0, x0), + z0 = svdiv_m (p0, z0, x0)) + +/* +** div_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** sdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t, + z0 = svdiv_n_s32_m (p0, z1, x0), + z0 = svdiv_m (p0, z1, x0)) + +/* +** div_2_s32_m_tied1: +** mov (z[0-9]+\.s), #2 +** sdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_s32_m_tied1, svint32_t, + z0 = svdiv_n_s32_m (p0, z0, 2), + z0 = svdiv_m (p0, z0, 2)) + +/* +** div_2_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #2 +** movprfx z0, z1 +** sdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_s32_m_untied, svint32_t, + z0 = svdiv_n_s32_m (p0, z1, 2), + z0 = svdiv_m (p0, z1, 2)) + +/* +** div_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_s32_z_tied1, svint32_t, + z0 = svdiv_s32_z (p0, z0, z1), + z0 = svdiv_z (p0, z0, z1)) + +/* +** div_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_s32_z_tied2, svint32_t, + z0 = svdiv_s32_z (p0, z1, z0), + z0 = svdiv_z (p0, z1, z0)) + +/* +** div_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** sdiv z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_s32_z_untied, svint32_t, + z0 = svdiv_s32_z (p0, z1, z2), + z0 = svdiv_z (p0, z1, z2)) + +/* +** div_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** sdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svdiv_n_s32_z (p0, z0, x0), + z0 = svdiv_z (p0, z0, x0)) + +/* +** div_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** sdiv z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (div_w0_s32_z_untied, svint32_t, int32_t, + z0 = svdiv_n_s32_z (p0, z1, x0), + z0 = svdiv_z (p0, z1, x0)) + +/* +** div_2_s32_z_tied1: +** mov (z[0-9]+\.s), #2 +** movprfx z0\.s, p0/z, z0\.s +** sdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_s32_z_tied1, svint32_t, + z0 = svdiv_n_s32_z 
(p0, z0, 2), + z0 = svdiv_z (p0, z0, 2)) + +/* +** div_2_s32_z_untied: +** mov (z[0-9]+\.s), #2 +** ( +** movprfx z0\.s, p0/z, z1\.s +** sdiv z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_2_s32_z_untied, svint32_t, + z0 = svdiv_n_s32_z (p0, z1, 2), + z0 = svdiv_z (p0, z1, 2)) + +/* +** div_s32_x_tied1: +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_s32_x_tied1, svint32_t, + z0 = svdiv_s32_x (p0, z0, z1), + z0 = svdiv_x (p0, z0, z1)) + +/* +** div_s32_x_tied2: +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_s32_x_tied2, svint32_t, + z0 = svdiv_s32_x (p0, z1, z0), + z0 = svdiv_x (p0, z1, z0)) + +/* +** div_s32_x_untied: +** ( +** movprfx z0, z1 +** sdiv z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_s32_x_untied, svint32_t, + z0 = svdiv_s32_x (p0, z1, z2), + z0 = svdiv_x (p0, z1, z2)) + +/* +** div_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** sdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svdiv_n_s32_x (p0, z0, x0), + z0 = svdiv_x (p0, z0, x0)) + +/* +** div_w0_s32_x_untied: +** mov z0\.s, w0 +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (div_w0_s32_x_untied, svint32_t, int32_t, + z0 = svdiv_n_s32_x (p0, z1, x0), + z0 = svdiv_x (p0, z1, x0)) + +/* +** div_2_s32_x_tied1: +** mov (z[0-9]+\.s), #2 +** sdiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_s32_x_tied1, svint32_t, + z0 = svdiv_n_s32_x (p0, z0, 2), + z0 = svdiv_x (p0, z0, 2)) + +/* +** div_2_s32_x_untied: +** mov z0\.s, #2 +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_2_s32_x_untied, svint32_t, + z0 = svdiv_n_s32_x (p0, z1, 2), + z0 = svdiv_x (p0, z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c new file mode 100644 index 000000000..439da1f57 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s64.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** div_s64_m_tied1: +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_s64_m_tied1, svint64_t, + z0 = svdiv_s64_m (p0, z0, z1), + z0 = svdiv_m (p0, z0, z1)) + +/* +** div_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_s64_m_tied2, svint64_t, + z0 = svdiv_s64_m (p0, z1, z0), + z0 = svdiv_m (p0, z1, z0)) + +/* +** div_s64_m_untied: +** movprfx z0, z1 +** sdiv z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (div_s64_m_untied, svint64_t, + z0 = svdiv_s64_m (p0, z1, z2), + z0 = svdiv_m (p0, z1, z2)) + +/* +** div_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svdiv_n_s64_m (p0, z0, x0), + z0 = svdiv_m (p0, z0, x0)) + +/* +** div_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_x0_s64_m_untied, svint64_t, int64_t, + z0 = svdiv_n_s64_m (p0, z1, x0), + z0 = svdiv_m (p0, z1, x0)) + +/* +** div_2_s64_m_tied1: +** mov (z[0-9]+\.d), #2 +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_s64_m_tied1, svint64_t, + z0 = svdiv_n_s64_m (p0, z0, 2), + z0 = svdiv_m (p0, z0, 2)) + +/* +** div_2_s64_m_untied: { xfail *-*-* } +** mov 
(z[0-9]+\.d), #2 +** movprfx z0, z1 +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_s64_m_untied, svint64_t, + z0 = svdiv_n_s64_m (p0, z1, 2), + z0 = svdiv_m (p0, z1, 2)) + +/* +** div_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_s64_z_tied1, svint64_t, + z0 = svdiv_s64_z (p0, z0, z1), + z0 = svdiv_z (p0, z0, z1)) + +/* +** div_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_s64_z_tied2, svint64_t, + z0 = svdiv_s64_z (p0, z1, z0), + z0 = svdiv_z (p0, z1, z0)) + +/* +** div_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** sdiv z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_s64_z_untied, svint64_t, + z0 = svdiv_s64_z (p0, z1, z2), + z0 = svdiv_z (p0, z1, z2)) + +/* +** div_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svdiv_n_s64_z (p0, z0, x0), + z0 = svdiv_z (p0, z0, x0)) + +/* +** div_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** sdiv z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (div_x0_s64_z_untied, svint64_t, int64_t, + z0 = svdiv_n_s64_z (p0, z1, x0), + z0 = svdiv_z (p0, z1, x0)) + +/* +** div_2_s64_z_tied1: +** mov (z[0-9]+\.d), #2 +** movprfx z0\.d, p0/z, z0\.d +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_s64_z_tied1, svint64_t, + z0 = svdiv_n_s64_z (p0, z0, 2), + z0 = svdiv_z (p0, z0, 2)) + +/* +** div_2_s64_z_untied: +** mov (z[0-9]+\.d), #2 +** ( +** movprfx z0\.d, p0/z, z1\.d +** sdiv z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_2_s64_z_untied, svint64_t, + z0 = svdiv_n_s64_z (p0, z1, 2), + z0 = svdiv_z (p0, z1, 2)) + +/* +** div_s64_x_tied1: +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_s64_x_tied1, svint64_t, + z0 = svdiv_s64_x (p0, z0, z1), + z0 = svdiv_x (p0, z0, z1)) + +/* +** div_s64_x_tied2: +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_s64_x_tied2, svint64_t, + z0 = svdiv_s64_x (p0, z1, z0), + z0 = svdiv_x (p0, z1, z0)) + +/* +** div_s64_x_untied: +** ( +** movprfx z0, z1 +** sdiv z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_s64_x_untied, svint64_t, + z0 = svdiv_s64_x (p0, z1, z2), + z0 = svdiv_x (p0, z1, z2)) + +/* +** div_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svdiv_n_s64_x (p0, z0, x0), + z0 = svdiv_x (p0, z0, x0)) + +/* +** div_x0_s64_x_untied: +** mov z0\.d, x0 +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (div_x0_s64_x_untied, svint64_t, int64_t, + z0 = svdiv_n_s64_x (p0, z1, x0), + z0 = svdiv_x (p0, z1, x0)) + +/* +** div_2_s64_x_tied1: +** mov (z[0-9]+\.d), #2 +** sdiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_s64_x_tied1, svint64_t, + z0 = svdiv_n_s64_x (p0, z0, 2), + z0 = svdiv_x (p0, z0, 2)) + +/* +** div_2_s64_x_untied: +** mov z0\.d, #2 +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_2_s64_x_untied, svint64_t, + z0 = svdiv_n_s64_x (p0, z1, 2), + z0 = svdiv_x (p0, z1, 2)) 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c new file mode 100644 index 000000000..8e8e464b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u32.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** div_u32_m_tied1: +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_u32_m_tied1, svuint32_t, + z0 = svdiv_u32_m (p0, z0, z1), + z0 = svdiv_m (p0, z0, z1)) + +/* +** div_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** udiv z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (div_u32_m_tied2, svuint32_t, + z0 = svdiv_u32_m (p0, z1, z0), + z0 = svdiv_m (p0, z1, z0)) + +/* +** div_u32_m_untied: +** movprfx z0, z1 +** udiv z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (div_u32_m_untied, svuint32_t, + z0 = svdiv_u32_m (p0, z1, z2), + z0 = svdiv_m (p0, z1, z2)) + +/* +** div_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** udiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svdiv_n_u32_m (p0, z0, x0), + z0 = svdiv_m (p0, z0, x0)) + +/* +** div_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** udiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svdiv_n_u32_m (p0, z1, x0), + z0 = svdiv_m (p0, z1, x0)) + +/* +** div_2_u32_m_tied1: +** mov (z[0-9]+\.s), #2 +** udiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_u32_m_tied1, svuint32_t, + z0 = svdiv_n_u32_m (p0, z0, 2), + z0 = svdiv_m (p0, z0, 2)) + +/* +** div_2_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #2 +** movprfx z0, z1 +** udiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_u32_m_untied, svuint32_t, + z0 = svdiv_n_u32_m (p0, z1, 2), + z0 = svdiv_m (p0, z1, 2)) + +/* +** div_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_u32_z_tied1, svuint32_t, + z0 = svdiv_u32_z (p0, z0, z1), + z0 = svdiv_z (p0, z0, z1)) + +/* +** div_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_u32_z_tied2, svuint32_t, + z0 = svdiv_u32_z (p0, z1, z0), + z0 = svdiv_z (p0, z1, z0)) + +/* +** div_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** udiv z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_u32_z_untied, svuint32_t, + z0 = svdiv_u32_z (p0, z1, z2), + z0 = svdiv_z (p0, z1, z2)) + +/* +** div_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** udiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svdiv_n_u32_z (p0, z0, x0), + z0 = svdiv_z (p0, z0, x0)) + +/* +** div_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** udiv z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (div_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svdiv_n_u32_z (p0, z1, x0), + z0 = svdiv_z (p0, z1, x0)) + +/* +** div_2_u32_z_tied1: +** mov (z[0-9]+\.s), #2 +** movprfx z0\.s, p0/z, z0\.s +** udiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_u32_z_tied1, svuint32_t, + z0 = svdiv_n_u32_z (p0, z0, 2), + z0 = svdiv_z (p0, z0, 2)) + +/* +** div_2_u32_z_untied: +** mov (z[0-9]+\.s), #2 +** ( +** movprfx z0\.s, p0/z, z1\.s +** 
udiv z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_2_u32_z_untied, svuint32_t, + z0 = svdiv_n_u32_z (p0, z1, 2), + z0 = svdiv_z (p0, z1, 2)) + +/* +** div_u32_x_tied1: +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_u32_x_tied1, svuint32_t, + z0 = svdiv_u32_x (p0, z0, z1), + z0 = svdiv_x (p0, z0, z1)) + +/* +** div_u32_x_tied2: +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_u32_x_tied2, svuint32_t, + z0 = svdiv_u32_x (p0, z1, z0), + z0 = svdiv_x (p0, z1, z0)) + +/* +** div_u32_x_untied: +** ( +** movprfx z0, z1 +** udiv z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (div_u32_x_untied, svuint32_t, + z0 = svdiv_u32_x (p0, z1, z2), + z0 = svdiv_x (p0, z1, z2)) + +/* +** div_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** udiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svdiv_n_u32_x (p0, z0, x0), + z0 = svdiv_x (p0, z0, x0)) + +/* +** div_w0_u32_x_untied: +** mov z0\.s, w0 +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (div_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svdiv_n_u32_x (p0, z1, x0), + z0 = svdiv_x (p0, z1, x0)) + +/* +** div_2_u32_x_tied1: +** mov (z[0-9]+\.s), #2 +** udiv z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_u32_x_tied1, svuint32_t, + z0 = svdiv_n_u32_x (p0, z0, 2), + z0 = svdiv_x (p0, z0, 2)) + +/* +** div_2_u32_x_untied: +** mov z0\.s, #2 +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (div_2_u32_x_untied, svuint32_t, + z0 = svdiv_n_u32_x (p0, z1, 2), + z0 = svdiv_x (p0, z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c new file mode 100644 index 000000000..fc152e8e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_u64.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** div_u64_m_tied1: +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_u64_m_tied1, svuint64_t, + z0 = svdiv_u64_m (p0, z0, z1), + z0 = svdiv_m (p0, z0, z1)) + +/* +** div_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_u64_m_tied2, svuint64_t, + z0 = svdiv_u64_m (p0, z1, z0), + z0 = svdiv_m (p0, z1, z0)) + +/* +** div_u64_m_untied: +** movprfx z0, z1 +** udiv z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (div_u64_m_untied, svuint64_t, + z0 = svdiv_u64_m (p0, z1, z2), + z0 = svdiv_m (p0, z1, z2)) + +/* +** div_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svdiv_n_u64_m (p0, z0, x0), + z0 = svdiv_m (p0, z0, x0)) + +/* +** div_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svdiv_n_u64_m (p0, z1, x0), + z0 = svdiv_m (p0, z1, x0)) + +/* +** div_2_u64_m_tied1: +** mov (z[0-9]+\.d), #2 +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_u64_m_tied1, svuint64_t, + z0 = svdiv_n_u64_m (p0, z0, 2), + z0 = svdiv_m (p0, z0, 2)) + +/* +** div_2_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #2 +** movprfx z0, z1 +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_u64_m_untied, 
svuint64_t, + z0 = svdiv_n_u64_m (p0, z1, 2), + z0 = svdiv_m (p0, z1, 2)) + +/* +** div_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_u64_z_tied1, svuint64_t, + z0 = svdiv_u64_z (p0, z0, z1), + z0 = svdiv_z (p0, z0, z1)) + +/* +** div_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_u64_z_tied2, svuint64_t, + z0 = svdiv_u64_z (p0, z1, z0), + z0 = svdiv_z (p0, z1, z0)) + +/* +** div_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** udiv z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_u64_z_untied, svuint64_t, + z0 = svdiv_u64_z (p0, z1, z2), + z0 = svdiv_z (p0, z1, z2)) + +/* +** div_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svdiv_n_u64_z (p0, z0, x0), + z0 = svdiv_z (p0, z0, x0)) + +/* +** div_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** udiv z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (div_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svdiv_n_u64_z (p0, z1, x0), + z0 = svdiv_z (p0, z1, x0)) + +/* +** div_2_u64_z_tied1: +** mov (z[0-9]+\.d), #2 +** movprfx z0\.d, p0/z, z0\.d +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_u64_z_tied1, svuint64_t, + z0 = svdiv_n_u64_z (p0, z0, 2), + z0 = svdiv_z (p0, z0, 2)) + +/* +** div_2_u64_z_untied: +** mov (z[0-9]+\.d), #2 +** ( +** movprfx z0\.d, p0/z, z1\.d +** udiv z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_2_u64_z_untied, svuint64_t, + z0 = svdiv_n_u64_z (p0, z1, 2), + z0 = svdiv_z (p0, z1, 2)) + +/* +** div_u64_x_tied1: +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_u64_x_tied1, svuint64_t, + z0 = svdiv_u64_x (p0, z0, z1), + z0 = svdiv_x (p0, z0, z1)) + +/* +** div_u64_x_tied2: +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_u64_x_tied2, svuint64_t, + z0 = svdiv_u64_x (p0, z1, z0), + z0 = svdiv_x (p0, z1, z0)) + +/* +** div_u64_x_untied: +** ( +** movprfx z0, z1 +** udiv z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (div_u64_x_untied, svuint64_t, + z0 = svdiv_u64_x (p0, z1, z2), + z0 = svdiv_x (p0, z1, z2)) + +/* +** div_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (div_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svdiv_n_u64_x (p0, z0, x0), + z0 = svdiv_x (p0, z0, x0)) + +/* +** div_x0_u64_x_untied: +** mov z0\.d, x0 +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (div_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svdiv_n_u64_x (p0, z1, x0), + z0 = svdiv_x (p0, z1, x0)) + +/* +** div_2_u64_x_tied1: +** mov (z[0-9]+\.d), #2 +** udiv z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (div_2_u64_x_tied1, svuint64_t, + z0 = svdiv_n_u64_x (p0, z0, 2), + z0 = svdiv_x (p0, z0, 2)) + +/* +** div_2_u64_x_untied: +** mov z0\.d, #2 +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (div_2_u64_x_untied, svuint64_t, + z0 = svdiv_n_u64_x (p0, z1, 2), + z0 = svdiv_x (p0, z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f16.c new file mode 100644 index 000000000..03cc0343b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f16.c @@ -0,0 +1,324 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** divr_f16_m_tied1: +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (divr_f16_m_tied1, svfloat16_t, + z0 = svdivr_f16_m (p0, z0, z1), + z0 = svdivr_m (p0, z0, z1)) + +/* +** divr_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fdivr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (divr_f16_m_tied2, svfloat16_t, + z0 = svdivr_f16_m (p0, z1, z0), + z0 = svdivr_m (p0, z1, z0)) + +/* +** divr_f16_m_untied: +** movprfx z0, z1 +** fdivr z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (divr_f16_m_untied, svfloat16_t, + z0 = svdivr_f16_m (p0, z1, z2), + z0 = svdivr_m (p0, z1, z2)) + +/* +** divr_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svdivr_n_f16_m (p0, z0, d4), + z0 = svdivr_m (p0, z0, d4)) + +/* +** divr_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svdivr_n_f16_m (p0, z1, d4), + z0 = svdivr_m (p0, z1, d4)) + +/* +** divr_1_f16_m_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f16_m_tied1, svfloat16_t, + z0 = svdivr_n_f16_m (p0, z0, 1), + z0 = svdivr_m (p0, z0, 1)) + +/* +** divr_1_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** movprfx z0, z1 +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f16_m_untied, svfloat16_t, + z0 = svdivr_n_f16_m (p0, z1, 1), + z0 = svdivr_m (p0, z1, 1)) + +/* +** divr_0p5_f16_m_tied1: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f16_m_tied1, svfloat16_t, + z0 = svdivr_n_f16_m (p0, z0, 0.5), + z0 = svdivr_m (p0, z0, 0.5)) + +/* +** divr_0p5_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** movprfx z0, z1 +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f16_m_untied, svfloat16_t, + z0 = svdivr_n_f16_m (p0, z1, 0.5), + z0 = svdivr_m (p0, z1, 0.5)) + +/* +** divr_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (divr_f16_z_tied1, svfloat16_t, + z0 = svdivr_f16_z (p0, z0, z1), + z0 = svdivr_z (p0, z0, z1)) + +/* +** divr_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (divr_f16_z_tied2, svfloat16_t, + z0 = svdivr_f16_z (p0, z1, z0), + z0 = svdivr_z (p0, z1, z0)) + +/* +** divr_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fdivr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_f16_z_untied, svfloat16_t, + z0 = svdivr_f16_z (p0, z1, z2), + z0 = svdivr_z (p0, z1, z2)) + +/* +** divr_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svdivr_n_f16_z (p0, z0, d4), + z0 = svdivr_z (p0, z0, d4)) + +/* +** divr_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fdivr z0\.h, p0/m, z0\.h, \1 +** | +** 
movprfx z0\.h, p0/z, \1 +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (divr_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svdivr_n_f16_z (p0, z1, d4), + z0 = svdivr_z (p0, z1, d4)) + +/* +** divr_1_f16_z: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f16_z, svfloat16_t, + z0 = svdivr_n_f16_z (p0, z0, 1), + z0 = svdivr_z (p0, z0, 1)) + +/* +** divr_0p5_f16_z_tied1: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** movprfx z0\.h, p0/z, z0\.h +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f16_z_tied1, svfloat16_t, + z0 = svdivr_n_f16_z (p0, z0, 0.5), + z0 = svdivr_z (p0, z0, 0.5)) + +/* +** divr_0p5_f16_z_untied: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.h, p0/z, z1\.h +** fdivr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f16_z_untied, svfloat16_t, + z0 = svdivr_n_f16_z (p0, z1, 0.5), + z0 = svdivr_z (p0, z1, 0.5)) + +/* +** divr_f16_x_tied1: +** fdivr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (divr_f16_x_tied1, svfloat16_t, + z0 = svdivr_f16_x (p0, z0, z1), + z0 = svdivr_x (p0, z0, z1)) + +/* +** divr_f16_x_tied2: +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (divr_f16_x_tied2, svfloat16_t, + z0 = svdivr_f16_x (p0, z1, z0), + z0 = svdivr_x (p0, z1, z0)) + +/* +** divr_f16_x_untied: +** ( +** movprfx z0, z1 +** fdivr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_f16_x_untied, svfloat16_t, + z0 = svdivr_f16_x (p0, z1, z2), + z0 = svdivr_x (p0, z1, z2)) + +/* +** divr_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svdivr_n_f16_x (p0, z0, d4), + z0 = svdivr_x (p0, z0, d4)) + +/* +** divr_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (divr_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svdivr_n_f16_x (p0, z1, d4), + z0 = svdivr_x (p0, z1, d4)) + +/* +** divr_1_f16_x_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fdivr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f16_x_tied1, svfloat16_t, + z0 = svdivr_n_f16_x (p0, z0, 1), + z0 = svdivr_x (p0, z0, 1)) + +/* +** divr_1_f16_x_untied: +** fmov z0\.h, #1\.0(?:e\+0)? +** fdiv z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (divr_1_f16_x_untied, svfloat16_t, + z0 = svdivr_n_f16_x (p0, z1, 1), + z0 = svdivr_x (p0, z1, 1)) + +/* +** ptrue_divr_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f16_x_tied1, svfloat16_t, + z0 = svdivr_f16_x (svptrue_b16 (), z0, z1), + z0 = svdivr_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_divr_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f16_x_tied2, svfloat16_t, + z0 = svdivr_f16_x (svptrue_b16 (), z1, z0), + z0 = svdivr_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_divr_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f16_x_untied, svfloat16_t, + z0 = svdivr_f16_x (svptrue_b16 (), z1, z2), + z0 = svdivr_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_divr_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_1_f16_x_tied1, svfloat16_t, + z0 = svdivr_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svdivr_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_divr_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_1_f16_x_untied, svfloat16_t, + z0 = svdivr_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svdivr_x (svptrue_b16 (), z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f32.c new file mode 100644 index 000000000..c2b65fc33 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f32.c @@ -0,0 +1,324 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** divr_f32_m_tied1: +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_f32_m_tied1, svfloat32_t, + z0 = svdivr_f32_m (p0, z0, z1), + z0 = svdivr_m (p0, z0, z1)) + +/* +** divr_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fdivr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_f32_m_tied2, svfloat32_t, + z0 = svdivr_f32_m (p0, z1, z0), + z0 = svdivr_m (p0, z1, z0)) + +/* +** divr_f32_m_untied: +** movprfx z0, z1 +** fdivr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (divr_f32_m_untied, svfloat32_t, + z0 = svdivr_f32_m (p0, z1, z2), + z0 = svdivr_m (p0, z1, z2)) + +/* +** divr_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_s4_f32_m_tied1, svfloat32_t, float, + z0 = svdivr_n_f32_m (p0, z0, d4), + z0 = svdivr_m (p0, z0, d4)) + +/* +** divr_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_s4_f32_m_untied, svfloat32_t, float, + z0 = svdivr_n_f32_m (p0, z1, d4), + z0 = svdivr_m (p0, z1, d4)) + +/* +** divr_1_f32_m_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f32_m_tied1, svfloat32_t, + z0 = svdivr_n_f32_m (p0, z0, 1), + z0 = svdivr_m (p0, z0, 1)) + +/* +** divr_1_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f32_m_untied, svfloat32_t, + z0 = svdivr_n_f32_m (p0, z1, 1), + z0 = svdivr_m (p0, z1, 1)) + +/* +** divr_0p5_f32_m_tied1: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f32_m_tied1, svfloat32_t, + z0 = svdivr_n_f32_m (p0, z0, 0.5), + z0 = svdivr_m (p0, z0, 0.5)) + +/* +** divr_0p5_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** movprfx z0, z1 +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f32_m_untied, svfloat32_t, + z0 = svdivr_n_f32_m (p0, z1, 0.5), + z0 = svdivr_m (p0, z1, 0.5)) + +/* +** divr_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_f32_z_tied1, svfloat32_t, + z0 = svdivr_f32_z (p0, z0, z1), + z0 = svdivr_z (p0, z0, z1)) + +/* +** divr_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_f32_z_tied2, svfloat32_t, + z0 = svdivr_f32_z (p0, z1, z0), + z0 = svdivr_z (p0, z1, z0)) + +/* +** divr_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fdivr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_f32_z_untied, svfloat32_t, + z0 = svdivr_f32_z (p0, z1, z2), + z0 = svdivr_z (p0, z1, z2)) + +/* +** divr_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_s4_f32_z_tied1, svfloat32_t, float, + z0 = svdivr_n_f32_z (p0, z0, d4), + z0 = svdivr_z (p0, z0, d4)) + +/* +** divr_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fdivr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (divr_s4_f32_z_untied, svfloat32_t, float, + z0 = svdivr_n_f32_z (p0, z1, d4), + z0 = svdivr_z (p0, z1, d4)) + +/* +** divr_1_f32_z: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f32_z, svfloat32_t, + z0 = svdivr_n_f32_z (p0, z0, 1), + z0 = svdivr_z (p0, z0, 1)) + +/* +** divr_0p5_f32_z_tied1: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** movprfx z0\.s, p0/z, z0\.s +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f32_z_tied1, svfloat32_t, + z0 = svdivr_n_f32_z (p0, z0, 0.5), + z0 = svdivr_z (p0, z0, 0.5)) + +/* +** divr_0p5_f32_z_untied: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.s, p0/z, z1\.s +** fdivr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f32_z_untied, svfloat32_t, + z0 = svdivr_n_f32_z (p0, z1, 0.5), + z0 = svdivr_z (p0, z1, 0.5)) + +/* +** divr_f32_x_tied1: +** fdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_f32_x_tied1, svfloat32_t, + z0 = svdivr_f32_x (p0, z0, z1), + z0 = svdivr_x (p0, z0, z1)) + +/* +** divr_f32_x_tied2: +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_f32_x_tied2, svfloat32_t, + z0 = svdivr_f32_x (p0, z1, z0), + z0 = svdivr_x (p0, z1, z0)) + +/* +** divr_f32_x_untied: +** ( +** movprfx z0, z1 +** fdivr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_f32_x_untied, svfloat32_t, + z0 = svdivr_f32_x (p0, z1, z2), + z0 = svdivr_x (p0, z1, z2)) + +/* +** divr_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_s4_f32_x_tied1, svfloat32_t, float, + z0 = svdivr_n_f32_x (p0, z0, d4), + z0 = svdivr_x (p0, z0, d4)) + +/* +** divr_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (divr_s4_f32_x_untied, svfloat32_t, float, + z0 = svdivr_n_f32_x (p0, z1, d4), + z0 = svdivr_x (p0, z1, d4)) + +/* +** divr_1_f32_x_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f32_x_tied1, svfloat32_t, + z0 = svdivr_n_f32_x (p0, z0, 1), + z0 = svdivr_x (p0, z0, 1)) + +/* +** divr_1_f32_x_untied: +** fmov z0\.s, #1\.0(?:e\+0)? +** fdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_1_f32_x_untied, svfloat32_t, + z0 = svdivr_n_f32_x (p0, z1, 1), + z0 = svdivr_x (p0, z1, 1)) + +/* +** ptrue_divr_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f32_x_tied1, svfloat32_t, + z0 = svdivr_f32_x (svptrue_b32 (), z0, z1), + z0 = svdivr_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_divr_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f32_x_tied2, svfloat32_t, + z0 = svdivr_f32_x (svptrue_b32 (), z1, z0), + z0 = svdivr_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_divr_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f32_x_untied, svfloat32_t, + z0 = svdivr_f32_x (svptrue_b32 (), z1, z2), + z0 = svdivr_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_divr_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_1_f32_x_tied1, svfloat32_t, + z0 = svdivr_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svdivr_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_divr_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_1_f32_x_untied, svfloat32_t, + z0 = svdivr_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svdivr_x (svptrue_b32 (), z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f64.c new file mode 100644 index 000000000..0a72a37b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_f64.c @@ -0,0 +1,324 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** divr_f64_m_tied1: +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_f64_m_tied1, svfloat64_t, + z0 = svdivr_f64_m (p0, z0, z1), + z0 = svdivr_m (p0, z0, z1)) + +/* +** divr_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_f64_m_tied2, svfloat64_t, + z0 = svdivr_f64_m (p0, z1, z0), + z0 = svdivr_m (p0, z1, z0)) + +/* +** divr_f64_m_untied: +** movprfx z0, z1 +** fdivr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (divr_f64_m_untied, svfloat64_t, + z0 = svdivr_f64_m (p0, z1, z2), + z0 = svdivr_m (p0, z1, z2)) + +/* +** divr_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_d4_f64_m_tied1, svfloat64_t, double, + z0 = svdivr_n_f64_m (p0, z0, d4), + z0 = svdivr_m (p0, z0, d4)) + +/* +** divr_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_d4_f64_m_untied, svfloat64_t, double, + z0 = svdivr_n_f64_m (p0, z1, d4), + z0 = svdivr_m (p0, z1, d4)) + +/* +** divr_1_f64_m_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f64_m_tied1, svfloat64_t, + z0 = svdivr_n_f64_m (p0, z0, 1), + z0 = svdivr_m (p0, z0, 1)) + +/* +** divr_1_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f64_m_untied, svfloat64_t, + z0 = svdivr_n_f64_m (p0, z1, 1), + z0 = svdivr_m (p0, z1, 1)) + +/* +** divr_0p5_f64_m_tied1: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f64_m_tied1, svfloat64_t, + z0 = svdivr_n_f64_m (p0, z0, 0.5), + z0 = svdivr_m (p0, z0, 0.5)) + +/* +** divr_0p5_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** movprfx z0, z1 +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f64_m_untied, svfloat64_t, + z0 = svdivr_n_f64_m (p0, z1, 0.5), + z0 = svdivr_m (p0, z1, 0.5)) + +/* +** divr_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_f64_z_tied1, svfloat64_t, + z0 = svdivr_f64_z (p0, z0, z1), + z0 = svdivr_z (p0, z0, z1)) + +/* +** divr_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_f64_z_tied2, svfloat64_t, + z0 = svdivr_f64_z (p0, z1, z0), + z0 = svdivr_z (p0, z1, z0)) + +/* +** divr_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fdivr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_f64_z_untied, svfloat64_t, + z0 = svdivr_f64_z (p0, z1, z2), + z0 = svdivr_z (p0, z1, z2)) + +/* +** divr_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_d4_f64_z_tied1, svfloat64_t, double, + z0 = svdivr_n_f64_z (p0, z0, d4), + z0 = svdivr_z (p0, z0, d4)) + +/* +** divr_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fdivr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (divr_d4_f64_z_untied, svfloat64_t, double, + z0 = svdivr_n_f64_z (p0, z1, d4), + z0 = svdivr_z (p0, z1, d4)) + +/* +** divr_1_f64_z: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f64_z, svfloat64_t, + z0 = svdivr_n_f64_z (p0, z0, 1), + z0 = svdivr_z (p0, z0, 1)) + +/* +** divr_0p5_f64_z_tied1: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** movprfx z0\.d, p0/z, z0\.d +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f64_z_tied1, svfloat64_t, + z0 = svdivr_n_f64_z (p0, z0, 0.5), + z0 = svdivr_z (p0, z0, 0.5)) + +/* +** divr_0p5_f64_z_untied: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.d, p0/z, z1\.d +** fdivr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_0p5_f64_z_untied, svfloat64_t, + z0 = svdivr_n_f64_z (p0, z1, 0.5), + z0 = svdivr_z (p0, z1, 0.5)) + +/* +** divr_f64_x_tied1: +** fdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_f64_x_tied1, svfloat64_t, + z0 = svdivr_f64_x (p0, z0, z1), + z0 = svdivr_x (p0, z0, z1)) + +/* +** divr_f64_x_tied2: +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_f64_x_tied2, svfloat64_t, + z0 = svdivr_f64_x (p0, z1, z0), + z0 = svdivr_x (p0, z1, z0)) + +/* +** divr_f64_x_untied: +** ( +** movprfx z0, z1 +** fdivr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_f64_x_untied, svfloat64_t, + z0 = svdivr_f64_x (p0, z1, z2), + z0 = svdivr_x (p0, z1, z2)) + +/* +** divr_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (divr_d4_f64_x_tied1, svfloat64_t, double, + z0 = svdivr_n_f64_x (p0, z0, d4), + z0 = svdivr_x (p0, z0, d4)) + +/* +** divr_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (divr_d4_f64_x_untied, svfloat64_t, double, + z0 = svdivr_n_f64_x (p0, z1, d4), + z0 = svdivr_x (p0, z1, d4)) + +/* +** divr_1_f64_x_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_1_f64_x_tied1, svfloat64_t, + z0 = svdivr_n_f64_x (p0, z0, 1), + z0 = svdivr_x (p0, z0, 1)) + +/* +** divr_1_f64_x_untied: +** fmov z0\.d, #1\.0(?:e\+0)? +** fdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_1_f64_x_untied, svfloat64_t, + z0 = svdivr_n_f64_x (p0, z1, 1), + z0 = svdivr_x (p0, z1, 1)) + +/* +** ptrue_divr_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f64_x_tied1, svfloat64_t, + z0 = svdivr_f64_x (svptrue_b64 (), z0, z1), + z0 = svdivr_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_divr_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f64_x_tied2, svfloat64_t, + z0 = svdivr_f64_x (svptrue_b64 (), z1, z0), + z0 = svdivr_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_divr_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_f64_x_untied, svfloat64_t, + z0 = svdivr_f64_x (svptrue_b64 (), z1, z2), + z0 = svdivr_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_divr_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_1_f64_x_tied1, svfloat64_t, + z0 = svdivr_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svdivr_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_divr_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_divr_1_f64_x_untied, svfloat64_t, + z0 = svdivr_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svdivr_x (svptrue_b64 (), z1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s32.c new file mode 100644 index 000000000..75a6c1d97 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s32.c @@ -0,0 +1,247 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** divr_s32_m_tied1: +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_s32_m_tied1, svint32_t, + z0 = svdivr_s32_m (p0, z0, z1), + z0 = svdivr_m (p0, z0, z1)) + +/* +** divr_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sdivr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_s32_m_tied2, svint32_t, + z0 = svdivr_s32_m (p0, z1, z0), + z0 = svdivr_m (p0, z1, z0)) + +/* +** divr_s32_m_untied: +** movprfx z0, z1 +** sdivr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (divr_s32_m_untied, svint32_t, + z0 = svdivr_s32_m (p0, z1, z2), + z0 = svdivr_m (p0, z1, z2)) + +/* +** divr_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** sdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svdivr_n_s32_m (p0, z0, x0), + z0 = svdivr_m (p0, z0, x0)) + +/* +** divr_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** sdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_s32_m_untied, svint32_t, int32_t, + z0 = svdivr_n_s32_m (p0, z1, x0), + z0 = svdivr_m (p0, z1, x0)) + +/* +** divr_2_s32_m_tied1: +** mov (z[0-9]+\.s), #2 +** sdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_s32_m_tied1, svint32_t, + z0 = svdivr_n_s32_m (p0, z0, 2), + z0 = svdivr_m (p0, z0, 2)) + +/* +** divr_2_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #2 +** movprfx z0, z1 +** sdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_s32_m_untied, svint32_t, + z0 = svdivr_n_s32_m (p0, z1, 2), + z0 = svdivr_m (p0, z1, 2)) + +/* +** divr_m1_s32_m: +** mov (z[0-9]+)\.b, #-1 +** sdivr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_m1_s32_m, svint32_t, + z0 = svdivr_n_s32_m (p0, z0, -1), + z0 = svdivr_m (p0, z0, -1)) + +/* +** divr_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_s32_z_tied1, svint32_t, + z0 = svdivr_s32_z (p0, z0, z1), + z0 = svdivr_z (p0, z0, z1)) + +/* +** divr_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_s32_z_tied2, svint32_t, + z0 = svdivr_s32_z (p0, z1, z0), + z0 = svdivr_z (p0, z1, z0)) + +/* +** divr_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** sdivr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_s32_z_untied, svint32_t, + z0 = svdivr_s32_z (p0, z1, z2), + z0 = svdivr_z (p0, z1, z2)) + +/* +** divr_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** sdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svdivr_n_s32_z (p0, z0, x0), + z0 = svdivr_z (p0, z0, x0)) + +/* +** divr_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** sdivr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_s32_z_untied, svint32_t, 
int32_t, + z0 = svdivr_n_s32_z (p0, z1, x0), + z0 = svdivr_z (p0, z1, x0)) + +/* +** divr_2_s32_z_tied1: +** mov (z[0-9]+\.s), #2 +** movprfx z0\.s, p0/z, z0\.s +** sdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_s32_z_tied1, svint32_t, + z0 = svdivr_n_s32_z (p0, z0, 2), + z0 = svdivr_z (p0, z0, 2)) + +/* +** divr_2_s32_z_untied: +** mov (z[0-9]+\.s), #2 +** ( +** movprfx z0\.s, p0/z, z1\.s +** sdivr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_2_s32_z_untied, svint32_t, + z0 = svdivr_n_s32_z (p0, z1, 2), + z0 = svdivr_z (p0, z1, 2)) + +/* +** divr_s32_x_tied1: +** sdivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_s32_x_tied1, svint32_t, + z0 = svdivr_s32_x (p0, z0, z1), + z0 = svdivr_x (p0, z0, z1)) + +/* +** divr_s32_x_tied2: +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_s32_x_tied2, svint32_t, + z0 = svdivr_s32_x (p0, z1, z0), + z0 = svdivr_x (p0, z1, z0)) + +/* +** divr_s32_x_untied: +** ( +** movprfx z0, z1 +** sdivr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_s32_x_untied, svint32_t, + z0 = svdivr_s32_x (p0, z1, z2), + z0 = svdivr_x (p0, z1, z2)) + +/* +** divr_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** sdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svdivr_n_s32_x (p0, z0, x0), + z0 = svdivr_x (p0, z0, x0)) + +/* +** divr_w0_s32_x_untied: +** mov z0\.s, w0 +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_s32_x_untied, svint32_t, int32_t, + z0 = svdivr_n_s32_x (p0, z1, x0), + z0 = svdivr_x (p0, z1, x0)) + +/* +** divr_2_s32_x_tied1: +** mov (z[0-9]+\.s), #2 +** sdivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_s32_x_tied1, svint32_t, + z0 = svdivr_n_s32_x (p0, z0, 2), + z0 = svdivr_x (p0, z0, 2)) + +/* +** divr_2_s32_x_untied: +** mov z0\.s, #2 +** sdiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_2_s32_x_untied, svint32_t, + z0 = svdivr_n_s32_x (p0, z1, 2), + z0 = svdivr_x (p0, z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s64.c new file mode 100644 index 000000000..8f4939a91 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_s64.c @@ -0,0 +1,247 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** divr_s64_m_tied1: +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_s64_m_tied1, svint64_t, + z0 = svdivr_s64_m (p0, z0, z1), + z0 = svdivr_m (p0, z0, z1)) + +/* +** divr_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_s64_m_tied2, svint64_t, + z0 = svdivr_s64_m (p0, z1, z0), + z0 = svdivr_m (p0, z1, z0)) + +/* +** divr_s64_m_untied: +** movprfx z0, z1 +** sdivr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (divr_s64_m_untied, svint64_t, + z0 = svdivr_s64_m (p0, z1, z2), + z0 = svdivr_m (p0, z1, z2)) + +/* +** divr_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svdivr_n_s64_m (p0, z0, x0), + z0 = svdivr_m (p0, z0, x0)) + +/* +** divr_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_s64_m_untied, svint64_t, 
int64_t, + z0 = svdivr_n_s64_m (p0, z1, x0), + z0 = svdivr_m (p0, z1, x0)) + +/* +** divr_2_s64_m_tied1: +** mov (z[0-9]+\.d), #2 +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_s64_m_tied1, svint64_t, + z0 = svdivr_n_s64_m (p0, z0, 2), + z0 = svdivr_m (p0, z0, 2)) + +/* +** divr_2_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #2 +** movprfx z0, z1 +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_s64_m_untied, svint64_t, + z0 = svdivr_n_s64_m (p0, z1, 2), + z0 = svdivr_m (p0, z1, 2)) + +/* +** divr_m1_s64_m: +** mov (z[0-9]+)\.b, #-1 +** sdivr z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_m1_s64_m, svint64_t, + z0 = svdivr_n_s64_m (p0, z0, -1), + z0 = svdivr_m (p0, z0, -1)) + +/* +** divr_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_s64_z_tied1, svint64_t, + z0 = svdivr_s64_z (p0, z0, z1), + z0 = svdivr_z (p0, z0, z1)) + +/* +** divr_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_s64_z_tied2, svint64_t, + z0 = svdivr_s64_z (p0, z1, z0), + z0 = svdivr_z (p0, z1, z0)) + +/* +** divr_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** sdivr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_s64_z_untied, svint64_t, + z0 = svdivr_s64_z (p0, z1, z2), + z0 = svdivr_z (p0, z1, z2)) + +/* +** divr_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svdivr_n_s64_z (p0, z0, x0), + z0 = svdivr_z (p0, z0, x0)) + +/* +** divr_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** sdivr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_s64_z_untied, svint64_t, int64_t, + z0 = svdivr_n_s64_z (p0, z1, x0), + z0 = svdivr_z (p0, z1, x0)) + +/* +** divr_2_s64_z_tied1: +** mov (z[0-9]+\.d), #2 +** movprfx z0\.d, p0/z, z0\.d +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_s64_z_tied1, svint64_t, + z0 = svdivr_n_s64_z (p0, z0, 2), + z0 = svdivr_z (p0, z0, 2)) + +/* +** divr_2_s64_z_untied: +** mov (z[0-9]+\.d), #2 +** ( +** movprfx z0\.d, p0/z, z1\.d +** sdivr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_2_s64_z_untied, svint64_t, + z0 = svdivr_n_s64_z (p0, z1, 2), + z0 = svdivr_z (p0, z1, 2)) + +/* +** divr_s64_x_tied1: +** sdivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_s64_x_tied1, svint64_t, + z0 = svdivr_s64_x (p0, z0, z1), + z0 = svdivr_x (p0, z0, z1)) + +/* +** divr_s64_x_tied2: +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_s64_x_tied2, svint64_t, + z0 = svdivr_s64_x (p0, z1, z0), + z0 = svdivr_x (p0, z1, z0)) + +/* +** divr_s64_x_untied: +** ( +** movprfx z0, z1 +** sdivr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_s64_x_untied, svint64_t, + z0 = svdivr_s64_x (p0, z1, z2), + z0 = svdivr_x (p0, z1, z2)) + +/* +** divr_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svdivr_n_s64_x (p0, z0, x0), + z0 = svdivr_x (p0, z0, x0)) + +/* +** divr_x0_s64_x_untied: 
+** mov z0\.d, x0 +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_s64_x_untied, svint64_t, int64_t, + z0 = svdivr_n_s64_x (p0, z1, x0), + z0 = svdivr_x (p0, z1, x0)) + +/* +** divr_2_s64_x_tied1: +** mov (z[0-9]+\.d), #2 +** sdivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_s64_x_tied1, svint64_t, + z0 = svdivr_n_s64_x (p0, z0, 2), + z0 = svdivr_x (p0, z0, 2)) + +/* +** divr_2_s64_x_untied: +** mov z0\.d, #2 +** sdiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_2_s64_x_untied, svint64_t, + z0 = svdivr_n_s64_x (p0, z1, 2), + z0 = svdivr_x (p0, z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u32.c new file mode 100644 index 000000000..84c243b44 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u32.c @@ -0,0 +1,247 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** divr_u32_m_tied1: +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_u32_m_tied1, svuint32_t, + z0 = svdivr_u32_m (p0, z0, z1), + z0 = svdivr_m (p0, z0, z1)) + +/* +** divr_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** udivr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_u32_m_tied2, svuint32_t, + z0 = svdivr_u32_m (p0, z1, z0), + z0 = svdivr_m (p0, z1, z0)) + +/* +** divr_u32_m_untied: +** movprfx z0, z1 +** udivr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (divr_u32_m_untied, svuint32_t, + z0 = svdivr_u32_m (p0, z1, z2), + z0 = svdivr_m (p0, z1, z2)) + +/* +** divr_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** udivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svdivr_n_u32_m (p0, z0, x0), + z0 = svdivr_m (p0, z0, x0)) + +/* +** divr_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** udivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svdivr_n_u32_m (p0, z1, x0), + z0 = svdivr_m (p0, z1, x0)) + +/* +** divr_2_u32_m_tied1: +** mov (z[0-9]+\.s), #2 +** udivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_u32_m_tied1, svuint32_t, + z0 = svdivr_n_u32_m (p0, z0, 2), + z0 = svdivr_m (p0, z0, 2)) + +/* +** divr_2_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #2 +** movprfx z0, z1 +** udivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_u32_m_untied, svuint32_t, + z0 = svdivr_n_u32_m (p0, z1, 2), + z0 = svdivr_m (p0, z1, 2)) + +/* +** divr_m1_u32_m: +** mov (z[0-9]+)\.b, #-1 +** udivr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_m1_u32_m, svuint32_t, + z0 = svdivr_n_u32_m (p0, z0, -1), + z0 = svdivr_m (p0, z0, -1)) + +/* +** divr_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_u32_z_tied1, svuint32_t, + z0 = svdivr_u32_z (p0, z0, z1), + z0 = svdivr_z (p0, z0, z1)) + +/* +** divr_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_u32_z_tied2, svuint32_t, + z0 = svdivr_u32_z (p0, z1, z0), + z0 = svdivr_z (p0, z1, z0)) + +/* +** divr_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** udivr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_u32_z_untied, svuint32_t, + z0 = svdivr_u32_z (p0, z1, z2), + z0 = svdivr_z (p0, z1, z2)) + +/* +** divr_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 
+** movprfx z0\.s, p0/z, z0\.s +** udivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svdivr_n_u32_z (p0, z0, x0), + z0 = svdivr_z (p0, z0, x0)) + +/* +** divr_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** udivr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svdivr_n_u32_z (p0, z1, x0), + z0 = svdivr_z (p0, z1, x0)) + +/* +** divr_2_u32_z_tied1: +** mov (z[0-9]+\.s), #2 +** movprfx z0\.s, p0/z, z0\.s +** udivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_u32_z_tied1, svuint32_t, + z0 = svdivr_n_u32_z (p0, z0, 2), + z0 = svdivr_z (p0, z0, 2)) + +/* +** divr_2_u32_z_untied: +** mov (z[0-9]+\.s), #2 +** ( +** movprfx z0\.s, p0/z, z1\.s +** udivr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_2_u32_z_untied, svuint32_t, + z0 = svdivr_n_u32_z (p0, z1, 2), + z0 = svdivr_z (p0, z1, 2)) + +/* +** divr_u32_x_tied1: +** udivr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_u32_x_tied1, svuint32_t, + z0 = svdivr_u32_x (p0, z0, z1), + z0 = svdivr_x (p0, z0, z1)) + +/* +** divr_u32_x_tied2: +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_u32_x_tied2, svuint32_t, + z0 = svdivr_u32_x (p0, z1, z0), + z0 = svdivr_x (p0, z1, z0)) + +/* +** divr_u32_x_untied: +** ( +** movprfx z0, z1 +** udivr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_u32_x_untied, svuint32_t, + z0 = svdivr_u32_x (p0, z1, z2), + z0 = svdivr_x (p0, z1, z2)) + +/* +** divr_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** udivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svdivr_n_u32_x (p0, z0, x0), + z0 = svdivr_x (p0, z0, x0)) + +/* +** divr_w0_u32_x_untied: +** mov z0\.s, w0 +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (divr_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svdivr_n_u32_x (p0, z1, x0), + z0 = svdivr_x (p0, z1, x0)) + +/* +** divr_2_u32_x_tied1: +** mov (z[0-9]+\.s), #2 +** udivr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_u32_x_tied1, svuint32_t, + z0 = svdivr_n_u32_x (p0, z0, 2), + z0 = svdivr_x (p0, z0, 2)) + +/* +** divr_2_u32_x_untied: +** mov z0\.s, #2 +** udiv z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (divr_2_u32_x_untied, svuint32_t, + z0 = svdivr_n_u32_x (p0, z1, 2), + z0 = svdivr_x (p0, z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u64.c new file mode 100644 index 000000000..03bb62472 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/divr_u64.c @@ -0,0 +1,247 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** divr_u64_m_tied1: +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_u64_m_tied1, svuint64_t, + z0 = svdivr_u64_m (p0, z0, z1), + z0 = svdivr_m (p0, z0, z1)) + +/* +** divr_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_u64_m_tied2, svuint64_t, + z0 = svdivr_u64_m (p0, z1, z0), + z0 = svdivr_m (p0, z1, z0)) + +/* +** divr_u64_m_untied: +** movprfx z0, z1 +** udivr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z 
(divr_u64_m_untied, svuint64_t, + z0 = svdivr_u64_m (p0, z1, z2), + z0 = svdivr_m (p0, z1, z2)) + +/* +** divr_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svdivr_n_u64_m (p0, z0, x0), + z0 = svdivr_m (p0, z0, x0)) + +/* +** divr_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svdivr_n_u64_m (p0, z1, x0), + z0 = svdivr_m (p0, z1, x0)) + +/* +** divr_2_u64_m_tied1: +** mov (z[0-9]+\.d), #2 +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_u64_m_tied1, svuint64_t, + z0 = svdivr_n_u64_m (p0, z0, 2), + z0 = svdivr_m (p0, z0, 2)) + +/* +** divr_2_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #2 +** movprfx z0, z1 +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_u64_m_untied, svuint64_t, + z0 = svdivr_n_u64_m (p0, z1, 2), + z0 = svdivr_m (p0, z1, 2)) + +/* +** divr_m1_u64_m: +** mov (z[0-9]+)\.b, #-1 +** udivr z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_m1_u64_m, svuint64_t, + z0 = svdivr_n_u64_m (p0, z0, -1), + z0 = svdivr_m (p0, z0, -1)) + +/* +** divr_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_u64_z_tied1, svuint64_t, + z0 = svdivr_u64_z (p0, z0, z1), + z0 = svdivr_z (p0, z0, z1)) + +/* +** divr_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_u64_z_tied2, svuint64_t, + z0 = svdivr_u64_z (p0, z1, z0), + z0 = svdivr_z (p0, z1, z0)) + +/* +** divr_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** udivr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_u64_z_untied, svuint64_t, + z0 = svdivr_u64_z (p0, z1, z2), + z0 = svdivr_z (p0, z1, z2)) + +/* +** divr_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svdivr_n_u64_z (p0, z0, x0), + z0 = svdivr_z (p0, z0, x0)) + +/* +** divr_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** udivr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svdivr_n_u64_z (p0, z1, x0), + z0 = svdivr_z (p0, z1, x0)) + +/* +** divr_2_u64_z_tied1: +** mov (z[0-9]+\.d), #2 +** movprfx z0\.d, p0/z, z0\.d +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_u64_z_tied1, svuint64_t, + z0 = svdivr_n_u64_z (p0, z0, 2), + z0 = svdivr_z (p0, z0, 2)) + +/* +** divr_2_u64_z_untied: +** mov (z[0-9]+\.d), #2 +** ( +** movprfx z0\.d, p0/z, z1\.d +** udivr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_2_u64_z_untied, svuint64_t, + z0 = svdivr_n_u64_z (p0, z1, 2), + z0 = svdivr_z (p0, z1, 2)) + +/* +** divr_u64_x_tied1: +** udivr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_u64_x_tied1, svuint64_t, + z0 = svdivr_u64_x (p0, z0, z1), + z0 = svdivr_x (p0, z0, z1)) + +/* +** divr_u64_x_tied2: +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_u64_x_tied2, svuint64_t, + z0 = svdivr_u64_x (p0, z1, z0), + z0 = svdivr_x (p0, z1, z0)) + +/* +** 
divr_u64_x_untied: +** ( +** movprfx z0, z1 +** udivr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (divr_u64_x_untied, svuint64_t, + z0 = svdivr_u64_x (p0, z1, z2), + z0 = svdivr_x (p0, z1, z2)) + +/* +** divr_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svdivr_n_u64_x (p0, z0, x0), + z0 = svdivr_x (p0, z0, x0)) + +/* +** divr_x0_u64_x_untied: +** mov z0\.d, x0 +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (divr_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svdivr_n_u64_x (p0, z1, x0), + z0 = svdivr_x (p0, z1, x0)) + +/* +** divr_2_u64_x_tied1: +** mov (z[0-9]+\.d), #2 +** udivr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (divr_2_u64_x_tied1, svuint64_t, + z0 = svdivr_n_u64_x (p0, z0, 2), + z0 = svdivr_x (p0, z0, 2)) + +/* +** divr_2_u64_x_untied: +** mov z0\.d, #2 +** udiv z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (divr_2_u64_x_untied, svuint64_t, + z0 = svdivr_n_u64_x (p0, z1, 2), + z0 = svdivr_x (p0, z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s32.c new file mode 100644 index 000000000..a4d713e29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s32.c @@ -0,0 +1,93 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dot_lane_0_s32_tied1: +** sdot z0\.s, z4\.b, z5\.b\[0\] +** ret +*/ +TEST_DUAL_Z (dot_lane_0_s32_tied1, svint32_t, svint8_t, + z0 = svdot_lane_s32 (z0, z4, z5, 0), + z0 = svdot_lane (z0, z4, z5, 0)) + +/* +** dot_lane_0_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sdot z0\.s, \1\.b, z1\.b\[0\] +** ret +*/ +TEST_DUAL_Z_REV (dot_lane_0_s32_tied2, svint32_t, svint8_t, + z0_res = svdot_lane_s32 (z4, z0, z1, 0), + z0_res = svdot_lane (z4, z0, z1, 0)) + +/* +** dot_lane_0_s32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sdot z0\.s, z1\.b, \1\.b\[0\] +** ret +*/ +TEST_DUAL_Z_REV (dot_lane_0_s32_tied3, svint32_t, svint8_t, + z0_res = svdot_lane_s32 (z4, z1, z0, 0), + z0_res = svdot_lane (z4, z1, z0, 0)) + +/* +** dot_lane_0_s32_untied: +** movprfx z0, z1 +** sdot z0\.s, z4\.b, z5\.b\[0\] +** ret +*/ +TEST_DUAL_Z (dot_lane_0_s32_untied, svint32_t, svint8_t, + z0 = svdot_lane_s32 (z1, z4, z5, 0), + z0 = svdot_lane (z1, z4, z5, 0)) + +/* +** dot_lane_1_s32: +** sdot z0\.s, z4\.b, z5\.b\[1\] +** ret +*/ +TEST_DUAL_Z (dot_lane_1_s32, svint32_t, svint8_t, + z0 = svdot_lane_s32 (z0, z4, z5, 1), + z0 = svdot_lane (z0, z4, z5, 1)) + +/* +** dot_lane_2_s32: +** sdot z0\.s, z4\.b, z5\.b\[2\] +** ret +*/ +TEST_DUAL_Z (dot_lane_2_s32, svint32_t, svint8_t, + z0 = svdot_lane_s32 (z0, z4, z5, 2), + z0 = svdot_lane (z0, z4, z5, 2)) + +/* +** dot_lane_3_s32: +** sdot z0\.s, z4\.b, z5\.b\[3\] +** ret +*/ +TEST_DUAL_Z (dot_lane_3_s32, svint32_t, svint8_t, + z0 = svdot_lane_s32 (z0, z4, z5, 3), + z0 = svdot_lane (z0, z4, z5, 3)) + +/* +** dot_lane_z8_s32: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** sdot z0\.s, z1\.b, \1\.b\[1\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (dot_lane_z8_s32, svint32_t, svint8_t, z8, + z0 = svdot_lane_s32 (z0, z1, z8, 1), + z0 = svdot_lane (z0, z1, z8, 1)) + +/* +** dot_lane_z16_s32: +** mov (z[0-7])\.d, z16\.d +** sdot z0\.s, z1\.b, \1\.b\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (dot_lane_z16_s32, svint32_t, svint8_t, z16, + z0 = svdot_lane_s32 (z0, z1, z16, 1), + z0 = svdot_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s64.c new file mode 100644 index 000000000..daee74091 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_s64.c @@ -0,0 +1,74 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dot_lane_0_s64_tied1: +** sdot z0\.d, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (dot_lane_0_s64_tied1, svint64_t, svint16_t, + z0 = svdot_lane_s64 (z0, z4, z5, 0), + z0 = svdot_lane (z0, z4, z5, 0)) + +/* +** dot_lane_0_s64_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sdot z0\.d, \1\.h, z1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (dot_lane_0_s64_tied2, svint64_t, svint16_t, + z0_res = svdot_lane_s64 (z4, z0, z1, 0), + z0_res = svdot_lane (z4, z0, z1, 0)) + +/* +** dot_lane_0_s64_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sdot z0\.d, z1\.h, \1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (dot_lane_0_s64_tied3, svint64_t, svint16_t, + z0_res = svdot_lane_s64 (z4, z1, z0, 0), + z0_res = svdot_lane (z4, z1, z0, 0)) + +/* +** dot_lane_0_s64_untied: +** movprfx z0, z1 +** sdot z0\.d, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (dot_lane_0_s64_untied, svint64_t, svint16_t, + z0 = svdot_lane_s64 (z1, z4, z5, 0), + z0 = svdot_lane (z1, z4, z5, 0)) + +/* +** dot_lane_1_s64: +** sdot z0\.d, z4\.h, z5\.h\[1\] +** ret +*/ +TEST_DUAL_Z (dot_lane_1_s64, svint64_t, svint16_t, + z0 = svdot_lane_s64 (z0, z4, z5, 1), + z0 = svdot_lane (z0, z4, z5, 1)) + +/* +** dot_lane_z15_s64: +** str d15, \[sp, -16\]! 
+** sdot z0\.d, z1\.h, z15\.h\[1\] +** ldr d15, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (dot_lane_z15_s64, svint64_t, svint16_t, z15, + z0 = svdot_lane_s64 (z0, z1, z15, 1), + z0 = svdot_lane (z0, z1, z15, 1)) + +/* +** dot_lane_z16_s64: +** mov (z[0-9]|z1[0-5])\.d, z16\.d +** sdot z0\.d, z1\.h, \1\.h\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (dot_lane_z16_s64, svint64_t, svint16_t, z16, + z0 = svdot_lane_s64 (z0, z1, z16, 1), + z0 = svdot_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u32.c new file mode 100644 index 000000000..6d69df76d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u32.c @@ -0,0 +1,93 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dot_lane_0_u32_tied1: +** udot z0\.s, z4\.b, z5\.b\[0\] +** ret +*/ +TEST_DUAL_Z (dot_lane_0_u32_tied1, svuint32_t, svuint8_t, + z0 = svdot_lane_u32 (z0, z4, z5, 0), + z0 = svdot_lane (z0, z4, z5, 0)) + +/* +** dot_lane_0_u32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** udot z0\.s, \1\.b, z1\.b\[0\] +** ret +*/ +TEST_DUAL_Z_REV (dot_lane_0_u32_tied2, svuint32_t, svuint8_t, + z0_res = svdot_lane_u32 (z4, z0, z1, 0), + z0_res = svdot_lane (z4, z0, z1, 0)) + +/* +** dot_lane_0_u32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** udot z0\.s, z1\.b, \1\.b\[0\] +** ret +*/ +TEST_DUAL_Z_REV (dot_lane_0_u32_tied3, svuint32_t, svuint8_t, + z0_res = svdot_lane_u32 (z4, z1, z0, 0), + z0_res = svdot_lane (z4, z1, z0, 0)) + +/* +** dot_lane_0_u32_untied: +** movprfx z0, z1 +** udot z0\.s, z4\.b, z5\.b\[0\] +** ret +*/ +TEST_DUAL_Z (dot_lane_0_u32_untied, svuint32_t, svuint8_t, + z0 = svdot_lane_u32 (z1, z4, z5, 0), + z0 = svdot_lane (z1, z4, z5, 0)) + +/* +** dot_lane_1_u32: +** udot z0\.s, z4\.b, z5\.b\[1\] +** ret +*/ +TEST_DUAL_Z (dot_lane_1_u32, svuint32_t, svuint8_t, + z0 = svdot_lane_u32 (z0, z4, z5, 1), + z0 = svdot_lane (z0, z4, z5, 1)) + +/* +** dot_lane_2_u32: +** udot z0\.s, z4\.b, z5\.b\[2\] +** ret +*/ +TEST_DUAL_Z (dot_lane_2_u32, svuint32_t, svuint8_t, + z0 = svdot_lane_u32 (z0, z4, z5, 2), + z0 = svdot_lane (z0, z4, z5, 2)) + +/* +** dot_lane_3_u32: +** udot z0\.s, z4\.b, z5\.b\[3\] +** ret +*/ +TEST_DUAL_Z (dot_lane_3_u32, svuint32_t, svuint8_t, + z0 = svdot_lane_u32 (z0, z4, z5, 3), + z0 = svdot_lane (z0, z4, z5, 3)) + +/* +** dot_lane_z8_u32: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** udot z0\.s, z1\.b, \1\.b\[1\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (dot_lane_z8_u32, svuint32_t, svuint8_t, z8, + z0 = svdot_lane_u32 (z0, z1, z8, 1), + z0 = svdot_lane (z0, z1, z8, 1)) + +/* +** dot_lane_z16_u32: +** mov (z[0-7])\.d, z16\.d +** udot z0\.s, z1\.b, \1\.b\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (dot_lane_z16_u32, svuint32_t, svuint8_t, z16, + z0 = svdot_lane_u32 (z0, z1, z16, 1), + z0 = svdot_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u64.c new file mode 100644 index 000000000..242e21c78 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_lane_u64.c @@ -0,0 +1,74 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dot_lane_0_u64_tied1: +** udot z0\.d, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (dot_lane_0_u64_tied1, svuint64_t, svuint16_t, + z0 = svdot_lane_u64 (z0, z4, z5, 0), + z0 = svdot_lane (z0, z4, z5, 0)) + +/* +** dot_lane_0_u64_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** udot z0\.d, \1\.h, z1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (dot_lane_0_u64_tied2, svuint64_t, svuint16_t, + z0_res = svdot_lane_u64 (z4, z0, z1, 0), + z0_res = svdot_lane (z4, z0, z1, 0)) + +/* +** dot_lane_0_u64_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** udot z0\.d, z1\.h, \1\.h\[0\] +** ret +*/ +TEST_DUAL_Z_REV (dot_lane_0_u64_tied3, svuint64_t, svuint16_t, + z0_res = svdot_lane_u64 (z4, z1, z0, 0), + z0_res = svdot_lane (z4, z1, z0, 0)) + +/* +** dot_lane_0_u64_untied: +** movprfx z0, z1 +** udot z0\.d, z4\.h, z5\.h\[0\] +** ret +*/ +TEST_DUAL_Z (dot_lane_0_u64_untied, svuint64_t, svuint16_t, + z0 = svdot_lane_u64 (z1, z4, z5, 0), + z0 = svdot_lane (z1, z4, z5, 0)) + +/* +** dot_lane_1_u64: +** udot z0\.d, z4\.h, z5\.h\[1\] +** ret +*/ +TEST_DUAL_Z (dot_lane_1_u64, svuint64_t, svuint16_t, + z0 = svdot_lane_u64 (z0, z4, z5, 1), + z0 = svdot_lane (z0, z4, z5, 1)) + +/* +** dot_lane_z15_u64: +** str d15, \[sp, -16\]! 
+** udot z0\.d, z1\.h, z15\.h\[1\] +** ldr d15, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (dot_lane_z15_u64, svuint64_t, svuint16_t, z15, + z0 = svdot_lane_u64 (z0, z1, z15, 1), + z0 = svdot_lane (z0, z1, z15, 1)) + +/* +** dot_lane_z16_u64: +** mov (z[0-9]|z1[0-5])\.d, z16\.d +** udot z0\.d, z1\.h, \1\.h\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (dot_lane_z16_u64, svuint64_t, svuint16_t, z16, + z0 = svdot_lane_u64 (z0, z1, z16, 1), + z0 = svdot_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s32.c new file mode 100644 index 000000000..605bd1b30 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s32.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dot_s32_tied1: +** sdot z0\.s, z4\.b, z5\.b +** ret +*/ +TEST_DUAL_Z (dot_s32_tied1, svint32_t, svint8_t, + z0 = svdot_s32 (z0, z4, z5), + z0 = svdot (z0, z4, z5)) + +/* +** dot_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sdot z0\.s, \1\.b, z1\.b +** ret +*/ +TEST_DUAL_Z_REV (dot_s32_tied2, svint32_t, svint8_t, + z0_res = svdot_s32 (z4, z0, z1), + z0_res = svdot (z4, z0, z1)) + +/* +** dot_s32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sdot z0\.s, z1\.b, \1\.b +** ret +*/ +TEST_DUAL_Z_REV (dot_s32_tied3, svint32_t, svint8_t, + z0_res = svdot_s32 (z4, z1, z0), + z0_res = svdot (z4, z1, z0)) + +/* +** dot_s32_untied: +** movprfx z0, z1 +** sdot z0\.s, z4\.b, z5\.b +** ret +*/ +TEST_DUAL_Z (dot_s32_untied, svint32_t, svint8_t, + z0 = svdot_s32 (z1, z4, z5), + z0 = svdot (z1, z4, z5)) + +/* +** dot_w0_s32_tied1: +** mov (z[0-9]+\.b), w0 +** sdot z0\.s, z4\.b, \1 +** ret +*/ +TEST_DUAL_ZX (dot_w0_s32_tied1, svint32_t, svint8_t, int8_t, + z0 = svdot_n_s32 (z0, z4, x0), + z0 = svdot (z0, z4, x0)) + +/* +** dot_w0_s32_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** sdot z0\.s, z4\.b, \1 +** ret +*/ +TEST_DUAL_ZX (dot_w0_s32_untied, svint32_t, svint8_t, int8_t, + z0 = svdot_n_s32 (z1, z4, x0), + z0 = svdot (z1, z4, x0)) + +/* +** dot_9_s32_tied1: +** mov (z[0-9]+\.b), #9 +** sdot z0\.s, z4\.b, \1 +** ret +*/ +TEST_DUAL_Z (dot_9_s32_tied1, svint32_t, svint8_t, + z0 = svdot_n_s32 (z0, z4, 9), + z0 = svdot (z0, z4, 9)) + +/* +** dot_9_s32_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #9 +** movprfx z0, z1 +** sdot z0\.s, z4\.b, \1 +** ret +*/ +TEST_DUAL_Z (dot_9_s32_untied, svint32_t, svint8_t, + z0 = svdot_n_s32 (z1, z4, 9), + z0 = svdot (z1, z4, 9)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s64.c new file mode 100644 index 000000000..b6574740b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_s64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dot_s64_tied1: +** sdot z0\.d, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (dot_s64_tied1, svint64_t, svint16_t, + z0 = svdot_s64 (z0, z4, z5), + z0 = svdot (z0, z4, z5)) + +/* +** dot_s64_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sdot z0\.d, \1\.h, z1\.h +** ret +*/ +TEST_DUAL_Z_REV (dot_s64_tied2, svint64_t, svint16_t, + z0_res = svdot_s64 (z4, z0, z1), + z0_res = svdot (z4, z0, z1)) + +/* +** dot_s64_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sdot z0\.d, z1\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (dot_s64_tied3, svint64_t, svint16_t, + z0_res = svdot_s64 (z4, z1, z0), + 
z0_res = svdot (z4, z1, z0)) + +/* +** dot_s64_untied: +** movprfx z0, z1 +** sdot z0\.d, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (dot_s64_untied, svint64_t, svint16_t, + z0 = svdot_s64 (z1, z4, z5), + z0 = svdot (z1, z4, z5)) + +/* +** dot_w0_s64_tied1: +** mov (z[0-9]+\.h), w0 +** sdot z0\.d, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZX (dot_w0_s64_tied1, svint64_t, svint16_t, int16_t, + z0 = svdot_n_s64 (z0, z4, x0), + z0 = svdot (z0, z4, x0)) + +/* +** dot_w0_s64_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** sdot z0\.d, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZX (dot_w0_s64_untied, svint64_t, svint16_t, int16_t, + z0 = svdot_n_s64 (z1, z4, x0), + z0 = svdot (z1, z4, x0)) + +/* +** dot_9_s64_tied1: +** mov (z[0-9]+\.h), #9 +** sdot z0\.d, z4\.h, \1 +** ret +*/ +TEST_DUAL_Z (dot_9_s64_tied1, svint64_t, svint16_t, + z0 = svdot_n_s64 (z0, z4, 9), + z0 = svdot (z0, z4, 9)) + +/* +** dot_9_s64_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #9 +** movprfx z0, z1 +** sdot z0\.d, z4\.h, \1 +** ret +*/ +TEST_DUAL_Z (dot_9_s64_untied, svint64_t, svint16_t, + z0 = svdot_n_s64 (z1, z4, 9), + z0 = svdot (z1, z4, 9)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u32.c new file mode 100644 index 000000000..541e71cc2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u32.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dot_u32_tied1: +** udot z0\.s, z4\.b, z5\.b +** ret +*/ +TEST_DUAL_Z (dot_u32_tied1, svuint32_t, svuint8_t, + z0 = svdot_u32 (z0, z4, z5), + z0 = svdot (z0, z4, z5)) + +/* +** dot_u32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** udot z0\.s, \1\.b, z1\.b +** ret +*/ +TEST_DUAL_Z_REV (dot_u32_tied2, svuint32_t, svuint8_t, + z0_res = svdot_u32 (z4, z0, z1), + z0_res = svdot (z4, z0, z1)) + +/* +** dot_u32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** udot z0\.s, z1\.b, \1\.b +** ret +*/ +TEST_DUAL_Z_REV (dot_u32_tied3, svuint32_t, svuint8_t, + z0_res = svdot_u32 (z4, z1, z0), + z0_res = svdot (z4, z1, z0)) + +/* +** dot_u32_untied: +** movprfx z0, z1 +** udot z0\.s, z4\.b, z5\.b +** ret +*/ +TEST_DUAL_Z (dot_u32_untied, svuint32_t, svuint8_t, + z0 = svdot_u32 (z1, z4, z5), + z0 = svdot (z1, z4, z5)) + +/* +** dot_w0_u32_tied1: +** mov (z[0-9]+\.b), w0 +** udot z0\.s, z4\.b, \1 +** ret +*/ +TEST_DUAL_ZX (dot_w0_u32_tied1, svuint32_t, svuint8_t, uint8_t, + z0 = svdot_n_u32 (z0, z4, x0), + z0 = svdot (z0, z4, x0)) + +/* +** dot_w0_u32_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** udot z0\.s, z4\.b, \1 +** ret +*/ +TEST_DUAL_ZX (dot_w0_u32_untied, svuint32_t, svuint8_t, uint8_t, + z0 = svdot_n_u32 (z1, z4, x0), + z0 = svdot (z1, z4, x0)) + +/* +** dot_9_u32_tied1: +** mov (z[0-9]+\.b), #9 +** udot z0\.s, z4\.b, \1 +** ret +*/ +TEST_DUAL_Z (dot_9_u32_tied1, svuint32_t, svuint8_t, + z0 = svdot_n_u32 (z0, z4, 9), + z0 = svdot (z0, z4, 9)) + +/* +** dot_9_u32_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #9 +** movprfx z0, z1 +** udot z0\.s, z4\.b, \1 +** ret +*/ +TEST_DUAL_Z (dot_9_u32_untied, svuint32_t, svuint8_t, + z0 = svdot_n_u32 (z1, z4, 9), + z0 = svdot (z1, z4, 9)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u64.c new file mode 100644 index 000000000..cc0e85373 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dot_u64.c @@ -0,0 +1,86 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dot_u64_tied1: +** udot z0\.d, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (dot_u64_tied1, svuint64_t, svuint16_t, + z0 = svdot_u64 (z0, z4, z5), + z0 = svdot (z0, z4, z5)) + +/* +** dot_u64_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** udot z0\.d, \1\.h, z1\.h +** ret +*/ +TEST_DUAL_Z_REV (dot_u64_tied2, svuint64_t, svuint16_t, + z0_res = svdot_u64 (z4, z0, z1), + z0_res = svdot (z4, z0, z1)) + +/* +** dot_u64_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** udot z0\.d, z1\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (dot_u64_tied3, svuint64_t, svuint16_t, + z0_res = svdot_u64 (z4, z1, z0), + z0_res = svdot (z4, z1, z0)) + +/* +** dot_u64_untied: +** movprfx z0, z1 +** udot z0\.d, z4\.h, z5\.h +** ret +*/ +TEST_DUAL_Z (dot_u64_untied, svuint64_t, svuint16_t, + z0 = svdot_u64 (z1, z4, z5), + z0 = svdot (z1, z4, z5)) + +/* +** dot_w0_u64_tied1: +** mov (z[0-9]+\.h), w0 +** udot z0\.d, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZX (dot_w0_u64_tied1, svuint64_t, svuint16_t, uint16_t, + z0 = svdot_n_u64 (z0, z4, x0), + z0 = svdot (z0, z4, x0)) + +/* +** dot_w0_u64_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** udot z0\.d, z4\.h, \1 +** ret +*/ +TEST_DUAL_ZX (dot_w0_u64_untied, svuint64_t, svuint16_t, uint16_t, + z0 = svdot_n_u64 (z1, z4, x0), + z0 = svdot (z1, z4, x0)) + +/* +** dot_9_u64_tied1: +** mov (z[0-9]+\.h), #9 +** udot z0\.d, z4\.h, \1 +** ret +*/ +TEST_DUAL_Z (dot_9_u64_tied1, svuint64_t, svuint16_t, + z0 = svdot_n_u64 (z0, z4, 9), + z0 = svdot (z0, z4, 9)) + +/* +** dot_9_u64_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #9 +** movprfx z0, z1 +** udot z0\.d, z4\.h, \1 +** ret +*/ +TEST_DUAL_Z (dot_9_u64_untied, svuint64_t, svuint16_t, + z0 = svdot_n_u64 (z1, z4, 9), + z0 = svdot (z1, z4, 9)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b16.c new file mode 100644 index 000000000..785832ab3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b16.c @@ -0,0 +1,32 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include +#include "test_sve_acle.h" + +/* +** dup_false_b16: +** pfalse p0\.b +** ret +*/ +TEST_UNIFORM_P (dup_false_b16, + p0 = svdup_n_b16 (false), + p0 = svdup_b16 (false)) + +/* +** dup_true_b16: +** ptrue p0\.h, all +** ret +*/ +TEST_UNIFORM_P (dup_true_b16, + p0 = svdup_n_b16 (true), + p0 = svdup_b16 (true)) + +/* +** dup_w0_b16: +** lsl (x[0-9]+), x0, 63 +** whilelo p0\.h, xzr, \1 +** ret +*/ +TEST_UNIFORM_PS (dup_w0_b16, + p0 = svdup_n_b16 (x0), + p0 = svdup_b16 (x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b32.c new file mode 100644 index 000000000..6e9d91eaf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b32.c @@ -0,0 +1,32 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include +#include "test_sve_acle.h" + +/* +** dup_false_b32: +** pfalse p0\.b +** ret +*/ +TEST_UNIFORM_P (dup_false_b32, + p0 = svdup_n_b32 (false), + p0 = svdup_b32 (false)) + +/* +** dup_true_b32: +** ptrue p0\.s, all +** ret +*/ +TEST_UNIFORM_P (dup_true_b32, + p0 = svdup_n_b32 (true), + p0 = svdup_b32 (true)) + +/* +** dup_w0_b32: +** lsl (x[0-9]+), x0, 63 +** whilelo p0\.s, xzr, \1 +** ret +*/ +TEST_UNIFORM_PS (dup_w0_b32, + p0 = svdup_n_b32 (x0), + p0 = svdup_b32 (x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b64.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b64.c new file mode 100644 index 000000000..ed69896c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b64.c @@ -0,0 +1,32 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include +#include "test_sve_acle.h" + +/* +** dup_false_b64: +** pfalse p0\.b +** ret +*/ +TEST_UNIFORM_P (dup_false_b64, + p0 = svdup_n_b64 (false), + p0 = svdup_b64 (false)) + +/* +** dup_true_b64: +** ptrue p0\.d, all +** ret +*/ +TEST_UNIFORM_P (dup_true_b64, + p0 = svdup_n_b64 (true), + p0 = svdup_b64 (true)) + +/* +** dup_w0_b64: +** lsl (x[0-9]+), x0, 63 +** whilelo p0\.d, xzr, \1 +** ret +*/ +TEST_UNIFORM_PS (dup_w0_b64, + p0 = svdup_n_b64 (x0), + p0 = svdup_b64 (x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b8.c new file mode 100644 index 000000000..a99ab552a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_b8.c @@ -0,0 +1,32 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include +#include "test_sve_acle.h" + +/* +** dup_false_b8: +** pfalse p0\.b +** ret +*/ +TEST_UNIFORM_P (dup_false_b8, + p0 = svdup_n_b8 (false), + p0 = svdup_b8 (false)) + +/* +** dup_true_b8: +** ptrue p0\.b, all +** ret +*/ +TEST_UNIFORM_P (dup_true_b8, + p0 = svdup_n_b8 (true), + p0 = svdup_b8 (true)) + +/* +** dup_w0_b8: +** lsl (x[0-9]+), x0, 63 +** whilelo p0\.b, xzr, \1 +** ret +*/ +TEST_UNIFORM_PS (dup_w0_b8, + p0 = svdup_n_b8 (x0), + p0 = svdup_b8 (x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_bf16.c new file mode 100644 index 000000000..db47d849c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_bf16.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_h4_bf16: +** mov z0\.h, h4 +** ret +*/ +TEST_UNIFORM_ZD (dup_h4_bf16, svbfloat16_t, __bf16, + z0 = svdup_n_bf16 (d4), + z0 = svdup_bf16 (d4)) + +/* +** dup_h4_bf16_m: +** movprfx z0, z1 +** mov z0\.h, p0/m, h4 +** ret +*/ +TEST_UNIFORM_ZD (dup_h4_bf16_m, svbfloat16_t, __bf16, + z0 = svdup_n_bf16_m (z1, p0, d4), + z0 = svdup_bf16_m (z1, p0, d4)) + +/* +** dup_h4_bf16_z: +** movprfx z0\.h, p0/z, z0\.h +** mov z0\.h, p0/m, h4 +** ret +*/ +TEST_UNIFORM_ZD (dup_h4_bf16_z, svbfloat16_t, __bf16, + z0 = svdup_n_bf16_z (p0, d4), + z0 = svdup_bf16_z (p0, d4)) + +/* +** dup_h4_bf16_x: +** mov z0\.h, h4 +** ret +*/ +TEST_UNIFORM_ZD (dup_h4_bf16_x, svbfloat16_t, __bf16, + z0 = svdup_n_bf16_x (p0, d4), + z0 = svdup_bf16_x (p0, d4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f16.c new file mode 100644 index 000000000..2d48b9a3d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f16.c @@ -0,0 +1,215 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_f16: +** fmov z0\.h, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f16, svfloat16_t, + z0 = svdup_n_f16 (1), + z0 = svdup_f16 (1)) + +/* +** dup_0_f16: +** mov z0\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f16, svfloat16_t, + z0 = svdup_n_f16 (0), + z0 = svdup_f16 (0)) + +/* +** dup_8_f16: +** fmov z0\.h, #8\.0(?:e\+0)? 
+** ret +*/ +TEST_UNIFORM_Z (dup_8_f16, svfloat16_t, + z0 = svdup_n_f16 (8), + z0 = svdup_f16 (8)) + +/* +** dup_512_f16: +** mov z0\.h, #24576 +** ret +*/ +TEST_UNIFORM_Z (dup_512_f16, svfloat16_t, + z0 = svdup_n_f16 (512), + z0 = svdup_f16 (512)) + +/* +** dup_513_f16: +** mov (w[0-7]+), 24578 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_513_f16, svfloat16_t, + z0 = svdup_n_f16 (513), + z0 = svdup_f16 (513)) + +/* +** dup_h4_f16: +** mov z0\.h, h4 +** ret +*/ +TEST_UNIFORM_ZD (dup_h4_f16, svfloat16_t, __fp16, + z0 = svdup_n_f16 (d4), + z0 = svdup_f16 (d4)) + +/* +** dup_1_f16_m: +** mov z0\.h, p0/m, #15360 +** ret +*/ +TEST_UNIFORM_Z (dup_1_f16_m, svfloat16_t, + z0 = svdup_n_f16_m (z0, p0, 1), + z0 = svdup_f16_m (z0, p0, 1)) + +/* +** dup_0_f16_m: +** mov z0\.h, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f16_m, svfloat16_t, + z0 = svdup_n_f16_m (z0, p0, 0), + z0 = svdup_f16_m (z0, p0, 0)) + +/* +** dup_8_f16_m: +** mov z0\.h, p0/m, #18432 +** ret +*/ +TEST_UNIFORM_Z (dup_8_f16_m, svfloat16_t, + z0 = svdup_n_f16_m (z0, p0, 8), + z0 = svdup_f16_m (z0, p0, 8)) + +/* +** dup_512_f16_m: +** mov z0\.h, p0/m, #24576 +** ret +*/ +TEST_UNIFORM_Z (dup_512_f16_m, svfloat16_t, + z0 = svdup_n_f16_m (z0, p0, 512), + z0 = svdup_f16_m (z0, p0, 512)) + + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_513_f16_m, svfloat16_t, + z0 = svdup_n_f16_m (z0, p0, 513), + z0 = svdup_f16_m (z0, p0, 513)) + +/* +** dup_h4_f16_m: +** movprfx z0, z1 +** mov z0\.h, p0/m, h4 +** ret +*/ +TEST_UNIFORM_ZD (dup_h4_f16_m, svfloat16_t, __fp16, + z0 = svdup_n_f16_m (z1, p0, d4), + z0 = svdup_f16_m (z1, p0, d4)) + +/* +** dup_1_f16_z: +** mov z0\.h, p0/z, #15360 +** ret +*/ +TEST_UNIFORM_Z (dup_1_f16_z, svfloat16_t, + z0 = svdup_n_f16_z (p0, 1), + z0 = svdup_f16_z (p0, 1)) + +/* +** dup_0_f16_z: +** mov z0\.h, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f16_z, svfloat16_t, + z0 = svdup_n_f16_z (p0, 0), + z0 = svdup_f16_z (p0, 0)) + +/* +** dup_8_f16_z: +** mov z0\.h, p0/z, #18432 +** ret +*/ +TEST_UNIFORM_Z (dup_8_f16_z, svfloat16_t, + z0 = svdup_n_f16_z (p0, 8), + z0 = svdup_f16_z (p0, 8)) + +/* +** dup_512_f16_z: +** mov z0\.h, p0/z, #24576 +** ret +*/ +TEST_UNIFORM_Z (dup_512_f16_z, svfloat16_t, + z0 = svdup_n_f16_z (p0, 512), + z0 = svdup_f16_z (p0, 512)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_513_f16_z, svfloat16_t, + z0 = svdup_n_f16_z (p0, 513), + z0 = svdup_f16_z (p0, 513)) +/* +** dup_h4_f16_z: +** movprfx z0\.h, p0/z, z0\.h +** mov z0\.h, p0/m, h4 +** ret +*/ +TEST_UNIFORM_ZD (dup_h4_f16_z, svfloat16_t, __fp16, + z0 = svdup_n_f16_z (p0, d4), + z0 = svdup_f16_z (p0, d4)) + +/* +** dup_1_f16_x: +** fmov z0\.h, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f16_x, svfloat16_t, + z0 = svdup_n_f16_x (p0, 1), + z0 = svdup_f16_x (p0, 1)) + +/* +** dup_0_f16_x: +** mov z0\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f16_x, svfloat16_t, + z0 = svdup_n_f16_x (p0, 0), + z0 = svdup_f16_x (p0, 0)) + +/* +** dup_8_f16_x: +** fmov z0\.h, #8\.0(?:e\+0)? 
+** ret +*/ +TEST_UNIFORM_Z (dup_8_f16_x, svfloat16_t, + z0 = svdup_n_f16_x (p0, 8), + z0 = svdup_f16_x (p0, 8)) + +/* +** dup_512_f16_x: +** mov z0\.h, #24576 +** ret +*/ +TEST_UNIFORM_Z (dup_512_f16_x, svfloat16_t, + z0 = svdup_n_f16_x (p0, 512), + z0 = svdup_f16_x (p0, 512)) + +/* +** dup_513_f16_x: +** mov (w[0-7]+), 24578 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_513_f16_x, svfloat16_t, + z0 = svdup_n_f16_x (p0, 513), + z0 = svdup_f16_x (p0, 513)) + +/* +** dup_h4_f16_x: +** mov z0\.h, h4 +** ret +*/ +TEST_UNIFORM_ZD (dup_h4_f16_x, svfloat16_t, __fp16, + z0 = svdup_n_f16_x (p0, d4), + z0 = svdup_f16_x (p0, d4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f32.c new file mode 100644 index 000000000..f997b7a7d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f32.c @@ -0,0 +1,212 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_f32: +** fmov z0\.s, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f32, svfloat32_t, + z0 = svdup_n_f32 (1), + z0 = svdup_f32 (1)) + +/* +** dup_0_f32: +** mov z0\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f32, svfloat32_t, + z0 = svdup_n_f32 (0), + z0 = svdup_f32 (0)) + +/* +** dup_8_f32: +** fmov z0\.s, #8\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_8_f32, svfloat32_t, + z0 = svdup_n_f32 (8), + z0 = svdup_f32 (8)) + +/* +** dup_512_f32: +** movi v([0-9]+).4s, 0x44, lsl 24 +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_512_f32, svfloat32_t, + z0 = svdup_n_f32 (512), + z0 = svdup_f32 (512)) + +/* +** dup_513_f32: +** ... +** ld1rw z0\.s, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dup_513_f32, svfloat32_t, + z0 = svdup_n_f32 (513), + z0 = svdup_f32 (513)) + +/* +** dup_s4_f32: +** mov z0\.s, s4 +** ret +*/ +TEST_UNIFORM_ZD (dup_s4_f32, svfloat32_t, float, + z0 = svdup_n_f32 (d4), + z0 = svdup_f32 (d4)) + +/* +** dup_1_f32_m: +** fmov z0\.s, p0/m, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f32_m, svfloat32_t, + z0 = svdup_n_f32_m (z0, p0, 1), + z0 = svdup_f32_m (z0, p0, 1)) + +/* +** dup_0_f32_m: +** mov z0\.s, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f32_m, svfloat32_t, + z0 = svdup_n_f32_m (z0, p0, 0), + z0 = svdup_f32_m (z0, p0, 0)) + +/* +** dup_8_f32_m: +** fmov z0\.s, p0/m, #8\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_8_f32_m, svfloat32_t, + z0 = svdup_n_f32_m (z0, p0, 8), + z0 = svdup_f32_m (z0, p0, 8)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_512_f32_m, svfloat32_t, + z0 = svdup_n_f32_m (z0, p0, 512), + z0 = svdup_f32_m (z0, p0, 512)) + + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_513_f32_m, svfloat32_t, + z0 = svdup_n_f32_m (z0, p0, 513), + z0 = svdup_f32_m (z0, p0, 513)) + +/* +** dup_s4_f32_m: +** movprfx z0, z1 +** mov z0\.s, p0/m, s4 +** ret +*/ +TEST_UNIFORM_ZD (dup_s4_f32_m, svfloat32_t, float, + z0 = svdup_n_f32_m (z1, p0, d4), + z0 = svdup_f32_m (z1, p0, d4)) + +/* +** dup_1_f32_z: +** movprfx z0\.s, p0/z, z0\.s +** fmov z0\.s, p0/m, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f32_z, svfloat32_t, + z0 = svdup_n_f32_z (p0, 1), + z0 = svdup_f32_z (p0, 1)) + +/* +** dup_0_f32_z: +** mov z0\.s, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f32_z, svfloat32_t, + z0 = svdup_n_f32_z (p0, 0), + z0 = svdup_f32_z (p0, 0)) + +/* +** dup_8_f32_z: +** movprfx z0\.s, p0/z, z0\.s +** fmov z0\.s, p0/m, #8\.0(?:e\+0)? 
+** ret +*/ +TEST_UNIFORM_Z (dup_8_f32_z, svfloat32_t, + z0 = svdup_n_f32_z (p0, 8), + z0 = svdup_f32_z (p0, 8)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_512_f32_z, svfloat32_t, + z0 = svdup_n_f32_z (p0, 512), + z0 = svdup_f32_z (p0, 512)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_513_f32_z, svfloat32_t, + z0 = svdup_n_f32_z (p0, 513), + z0 = svdup_f32_z (p0, 513)) + +/* +** dup_s4_f32_z: +** movprfx z0\.s, p0/z, z0\.s +** mov z0\.s, p0/m, s4 +** ret +*/ +TEST_UNIFORM_ZD (dup_s4_f32_z, svfloat32_t, float, + z0 = svdup_n_f32_z (p0, d4), + z0 = svdup_f32_z (p0, d4)) + +/* +** dup_1_f32_x: +** fmov z0\.s, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f32_x, svfloat32_t, + z0 = svdup_n_f32_x (p0, 1), + z0 = svdup_f32_x (p0, 1)) + +/* +** dup_0_f32_x: +** mov z0\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f32_x, svfloat32_t, + z0 = svdup_n_f32_x (p0, 0), + z0 = svdup_f32_x (p0, 0)) + +/* +** dup_8_f32_x: +** fmov z0\.s, #8\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_8_f32_x, svfloat32_t, + z0 = svdup_n_f32_x (p0, 8), + z0 = svdup_f32_x (p0, 8)) + +/* +** dup_512_f32_x: +** movi v([0-9]+).4s, 0x44, lsl 24 +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_512_f32_x, svfloat32_t, + z0 = svdup_n_f32_x (p0, 512), + z0 = svdup_f32_x (p0, 512)) + +/* +** dup_513_f32_x: +** ... +** ld1rw z0\.s, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dup_513_f32_x, svfloat32_t, + z0 = svdup_n_f32_x (p0, 513), + z0 = svdup_f32_x (p0, 513)) + +/* +** dup_s4_f32_x: +** mov z0\.s, s4 +** ret +*/ +TEST_UNIFORM_ZD (dup_s4_f32_x, svfloat32_t, float, + z0 = svdup_n_f32_x (p0, d4), + z0 = svdup_f32_x (p0, d4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f64.c new file mode 100644 index 000000000..e177d9108 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_f64.c @@ -0,0 +1,212 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_f64: +** fmov z0\.d, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f64, svfloat64_t, + z0 = svdup_n_f64 (1), + z0 = svdup_f64 (1)) + +/* +** dup_0_f64: +** mov z0\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f64, svfloat64_t, + z0 = svdup_n_f64 (0), + z0 = svdup_f64 (0)) + +/* +** dup_8_f64: +** fmov z0\.d, #8\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_8_f64, svfloat64_t, + z0 = svdup_n_f64 (8), + z0 = svdup_f64 (8)) + +/* +** dup_512_f64: +** mov (x[0-9]+), 4647714815446351872 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_512_f64, svfloat64_t, + z0 = svdup_n_f64 (512), + z0 = svdup_f64 (512)) + +/* +** dup_513_f64: +** ... +** ld1rd z0\.d, p[0-7]/z, \[x[0-9+]\] +** ret +*/ +TEST_UNIFORM_Z (dup_513_f64, svfloat64_t, + z0 = svdup_n_f64 (513), + z0 = svdup_f64 (513)) + +/* +** dup_d4_f64: +** mov z0\.d, d4 +** ret +*/ +TEST_UNIFORM_ZD (dup_d4_f64, svfloat64_t, double, + z0 = svdup_n_f64 (d4), + z0 = svdup_f64 (d4)) + +/* +** dup_1_f64_m: +** fmov z0\.d, p0/m, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f64_m, svfloat64_t, + z0 = svdup_n_f64_m (z0, p0, 1), + z0 = svdup_f64_m (z0, p0, 1)) + +/* +** dup_0_f64_m: +** mov z0\.d, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f64_m, svfloat64_t, + z0 = svdup_n_f64_m (z0, p0, 0), + z0 = svdup_f64_m (z0, p0, 0)) + +/* +** dup_8_f64_m: +** fmov z0\.d, p0/m, #8\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_8_f64_m, svfloat64_t, + z0 = svdup_n_f64_m (z0, p0, 8), + z0 = svdup_f64_m (z0, p0, 8)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_512_f64_m, svfloat64_t, + z0 = svdup_n_f64_m (z0, p0, 512), + z0 = svdup_f64_m (z0, p0, 512)) + + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_513_f64_m, svfloat64_t, + z0 = svdup_n_f64_m (z0, p0, 513), + z0 = svdup_f64_m (z0, p0, 513)) + +/* +** dup_d4_f64_m: +** movprfx z0, z1 +** mov z0\.d, p0/m, d4 +** ret +*/ +TEST_UNIFORM_ZD (dup_d4_f64_m, svfloat64_t, double, + z0 = svdup_n_f64_m (z1, p0, d4), + z0 = svdup_f64_m (z1, p0, d4)) + +/* +** dup_1_f64_z: +** movprfx z0\.d, p0/z, z0\.d +** fmov z0\.d, p0/m, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f64_z, svfloat64_t, + z0 = svdup_n_f64_z (p0, 1), + z0 = svdup_f64_z (p0, 1)) + +/* +** dup_0_f64_z: +** mov z0\.d, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f64_z, svfloat64_t, + z0 = svdup_n_f64_z (p0, 0), + z0 = svdup_f64_z (p0, 0)) + +/* +** dup_8_f64_z: +** movprfx z0\.d, p0/z, z0\.d +** fmov z0\.d, p0/m, #8\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_8_f64_z, svfloat64_t, + z0 = svdup_n_f64_z (p0, 8), + z0 = svdup_f64_z (p0, 8)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_512_f64_z, svfloat64_t, + z0 = svdup_n_f64_z (p0, 512), + z0 = svdup_f64_z (p0, 512)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_513_f64_z, svfloat64_t, + z0 = svdup_n_f64_z (p0, 513), + z0 = svdup_f64_z (p0, 513)) + +/* +** dup_d4_f64_z: +** movprfx z0\.d, p0/z, z0\.d +** mov z0\.d, p0/m, d4 +** ret +*/ +TEST_UNIFORM_ZD (dup_d4_f64_z, svfloat64_t, double, + z0 = svdup_n_f64_z (p0, d4), + z0 = svdup_f64_z (p0, d4)) + +/* +** dup_1_f64_x: +** fmov z0\.d, #1\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_1_f64_x, svfloat64_t, + z0 = svdup_n_f64_x (p0, 1), + z0 = svdup_f64_x (p0, 1)) + +/* +** dup_0_f64_x: +** mov z0\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_f64_x, svfloat64_t, + z0 = svdup_n_f64_x (p0, 0), + z0 = svdup_f64_x (p0, 0)) + +/* +** dup_8_f64_x: +** fmov z0\.d, #8\.0(?:e\+0)? +** ret +*/ +TEST_UNIFORM_Z (dup_8_f64_x, svfloat64_t, + z0 = svdup_n_f64_x (p0, 8), + z0 = svdup_f64_x (p0, 8)) + +/* +** dup_512_f64_x: +** mov (x[0-9]+), 4647714815446351872 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_512_f64_x, svfloat64_t, + z0 = svdup_n_f64_x (p0, 512), + z0 = svdup_f64_x (p0, 512)) + +/* +** dup_513_f64_x: +** ... 
+** ld1rd z0\.d, p[0-7]/z, \[x[0-9+]\] +** ret +*/ +TEST_UNIFORM_Z (dup_513_f64_x, svfloat64_t, + z0 = svdup_n_f64_x (p0, 513), + z0 = svdup_f64_x (p0, 513)) + +/* +** dup_d4_f64_x: +** mov z0\.d, d4 +** ret +*/ +TEST_UNIFORM_ZD (dup_d4_f64_x, svfloat64_t, double, + z0 = svdup_n_f64_x (p0, d4), + z0 = svdup_f64_x (p0, d4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_bf16.c new file mode 100644 index 000000000..d05ad5adb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_bf16.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_bf16_tied1: +** mov (z[0-9]+\.h), w0 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_bf16_tied1, svbfloat16_t, uint16_t, + z0 = svdup_lane_bf16 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_bf16_untied: +** mov (z[0-9]+\.h), w0 +** tbl z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_bf16_untied, svbfloat16_t, uint16_t, + z0 = svdup_lane_bf16 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_bf16_tied1: +** dup z0\.h, z0\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_bf16_tied1, svbfloat16_t, + z0 = svdup_lane_bf16 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_bf16_untied: +** dup z0\.h, z1\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_bf16_untied, svbfloat16_t, + z0 = svdup_lane_bf16 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_15_bf16: +** dup z0\.h, z0\.h\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_bf16, svbfloat16_t, + z0 = svdup_lane_bf16 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_bf16: +** dup z0\.h, z0\.h\[16\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_bf16, svbfloat16_t, + z0 = svdup_lane_bf16 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_bf16: +** dup z0\.h, z0\.h\[31\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_bf16, svbfloat16_t, + z0 = svdup_lane_bf16 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_bf16: +** mov (z[0-9]+\.h), #32 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_bf16, svbfloat16_t, + z0 = svdup_lane_bf16 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_bf16: +** mov (z[0-9]+\.h), #63 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_bf16, svbfloat16_t, + z0 = svdup_lane_bf16 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_bf16: +** mov (z[0-9]+\.h), #64 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_bf16, svbfloat16_t, + z0 = svdup_lane_bf16 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_bf16: +** mov (z[0-9]+\.h), #255 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_bf16, svbfloat16_t, + z0 = svdup_lane_bf16 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f16.c new file mode 100644 index 000000000..142afbb24 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f16.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_f16_tied1: +** mov (z[0-9]+\.h), w0 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_f16_tied1, svfloat16_t, uint16_t, + z0 = svdup_lane_f16 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_f16_untied: +** mov (z[0-9]+\.h), w0 +** tbl z0\.h, z1\.h, \1 
+** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_f16_untied, svfloat16_t, uint16_t, + z0 = svdup_lane_f16 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_f16_tied1: +** dup z0\.h, z0\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_f16_tied1, svfloat16_t, + z0 = svdup_lane_f16 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_f16_untied: +** dup z0\.h, z1\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_f16_untied, svfloat16_t, + z0 = svdup_lane_f16 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_15_f16: +** dup z0\.h, z0\.h\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_f16, svfloat16_t, + z0 = svdup_lane_f16 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_f16: +** dup z0\.h, z0\.h\[16\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_f16, svfloat16_t, + z0 = svdup_lane_f16 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_f16: +** dup z0\.h, z0\.h\[31\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_f16, svfloat16_t, + z0 = svdup_lane_f16 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_f16: +** mov (z[0-9]+\.h), #32 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_f16, svfloat16_t, + z0 = svdup_lane_f16 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_f16: +** mov (z[0-9]+\.h), #63 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_f16, svfloat16_t, + z0 = svdup_lane_f16 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_f16: +** mov (z[0-9]+\.h), #64 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_f16, svfloat16_t, + z0 = svdup_lane_f16 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_f16: +** mov (z[0-9]+\.h), #255 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_f16, svfloat16_t, + z0 = svdup_lane_f16 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f32.c new file mode 100644 index 000000000..b32068a37 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f32.c @@ -0,0 +1,110 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_f32_tied1: +** mov (z[0-9]+\.s), w0 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_f32_tied1, svfloat32_t, uint32_t, + z0 = svdup_lane_f32 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_f32_untied: +** mov (z[0-9]+\.s), w0 +** tbl z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_f32_untied, svfloat32_t, uint32_t, + z0 = svdup_lane_f32 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_f32_tied1: +** dup z0\.s, z0\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_f32_tied1, svfloat32_t, + z0 = svdup_lane_f32 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_f32_untied: +** dup z0\.s, z1\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_f32_untied, svfloat32_t, + z0 = svdup_lane_f32 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_15_f32: +** dup z0\.s, z0\.s\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_f32, svfloat32_t, + z0 = svdup_lane_f32 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_f32: +** mov (z[0-9]+\.s), #16 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_f32, svfloat32_t, + z0 = svdup_lane_f32 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_f32: +** mov (z[0-9]+\.s), #31 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_f32, svfloat32_t, + z0 = svdup_lane_f32 (z0, 31), + z0 = svdup_lane 
(z0, 31)) + +/* +** dup_lane_32_f32: +** mov (z[0-9]+\.s), #32 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_f32, svfloat32_t, + z0 = svdup_lane_f32 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_f32: +** mov (z[0-9]+\.s), #63 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_f32, svfloat32_t, + z0 = svdup_lane_f32 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_f32: +** mov (z[0-9]+\.s), #64 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_f32, svfloat32_t, + z0 = svdup_lane_f32 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_f32: +** mov (z[0-9]+\.s), #255 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_f32, svfloat32_t, + z0 = svdup_lane_f32 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f64.c new file mode 100644 index 000000000..64af50d0c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_f64.c @@ -0,0 +1,111 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_x0_f64_tied1: +** mov (z[0-9]+\.d), x0 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_x0_f64_tied1, svfloat64_t, uint64_t, + z0 = svdup_lane_f64 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_x0_f64_untied: +** mov (z[0-9]+\.d), x0 +** tbl z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_x0_f64_untied, svfloat64_t, uint64_t, + z0 = svdup_lane_f64 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_f64_tied1: +** dup z0\.d, z0\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_f64_tied1, svfloat64_t, + z0 = svdup_lane_f64 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_f64_untied: +** dup z0\.d, z1\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_f64_untied, svfloat64_t, + z0 = svdup_lane_f64 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_15_f64: +** mov (z[0-9]+\.d), #15 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_f64, svfloat64_t, + z0 = svdup_lane_f64 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_f64: +** mov (z[0-9]+\.d), #16 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_f64, svfloat64_t, + z0 = svdup_lane_f64 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_f64: +** mov (z[0-9]+\.d), #31 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_f64, svfloat64_t, + z0 = svdup_lane_f64 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_f64: +** mov (z[0-9]+\.d), #32 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_f64, svfloat64_t, + z0 = svdup_lane_f64 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_f64: +** mov (z[0-9]+\.d), #63 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_f64, svfloat64_t, + z0 = svdup_lane_f64 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_f64: +** mov (z[0-9]+\.d), #64 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_f64, svfloat64_t, + z0 = svdup_lane_f64 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_f64: +** mov (z[0-9]+\.d), #255 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_f64, svfloat64_t, + z0 = svdup_lane_f64 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s16.c new file mode 100644 index 
000000000..3b6f20696 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s16.c @@ -0,0 +1,126 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_s16_tied1: +** mov (z[0-9]+\.h), w0 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_s16_tied1, svint16_t, uint16_t, + z0 = svdup_lane_s16 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_s16_untied: +** mov (z[0-9]+\.h), w0 +** tbl z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_s16_untied, svint16_t, uint16_t, + z0 = svdup_lane_s16 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_s16_tied1: +** dup z0\.h, z0\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_s16_tied1, svint16_t, + z0 = svdup_lane_s16 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_s16_untied: +** dup z0\.h, z1\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_s16_untied, svint16_t, + z0 = svdup_lane_s16 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_7_s16: +** dup z0\.h, z0\.h\[7\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_7_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 7), + z0 = svdup_lane (z0, 7)) + +/* +** dup_lane_8_s16: +** dup z0\.h, z0\.h\[8\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_8_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 8), + z0 = svdup_lane (z0, 8)) + +/* +** dup_lane_15_s16: +** dup z0\.h, z0\.h\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_s16: +** dup z0\.h, z0\.h\[16\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_s16: +** dup z0\.h, z0\.h\[31\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_s16: +** mov (z[0-9]+\.h), #32 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_s16: +** mov (z[0-9]+\.h), #63 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_s16: +** mov (z[0-9]+\.h), #64 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_s16: +** mov (z[0-9]+\.h), #255 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_s16, svint16_t, + z0 = svdup_lane_s16 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s32.c new file mode 100644 index 000000000..bf597fdf6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s32.c @@ -0,0 +1,128 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_s32_tied1: +** mov (z[0-9]+\.s), w0 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_s32_tied1, svint32_t, uint32_t, + z0 = svdup_lane_s32 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_s32_untied: +** mov (z[0-9]+\.s), w0 +** tbl z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_s32_untied, svint32_t, uint32_t, + z0 = svdup_lane_s32 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_s32_tied1: +** dup z0\.s, z0\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z 
(dup_lane_0_s32_tied1, svint32_t, + z0 = svdup_lane_s32 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_s32_untied: +** dup z0\.s, z1\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_s32_untied, svint32_t, + z0 = svdup_lane_s32 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_7_s32: +** dup z0\.s, z0\.s\[7\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_7_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 7), + z0 = svdup_lane (z0, 7)) + +/* +** dup_lane_8_s32: +** dup z0\.s, z0\.s\[8\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_8_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 8), + z0 = svdup_lane (z0, 8)) + +/* +** dup_lane_15_s32: +** dup z0\.s, z0\.s\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_s32: +** mov (z[0-9]+\.s), #16 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_s32: +** mov (z[0-9]+\.s), #31 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_s32: +** mov (z[0-9]+\.s), #32 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_s32: +** mov (z[0-9]+\.s), #63 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_s32: +** mov (z[0-9]+\.s), #64 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_s32: +** mov (z[0-9]+\.s), #255 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_s32, svint32_t, + z0 = svdup_lane_s32 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s64.c new file mode 100644 index 000000000..f2f3a1770 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s64.c @@ -0,0 +1,130 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_x0_s64_tied1: +** mov (z[0-9]+\.d), x0 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_x0_s64_tied1, svint64_t, uint64_t, + z0 = svdup_lane_s64 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_x0_s64_untied: +** mov (z[0-9]+\.d), x0 +** tbl z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_x0_s64_untied, svint64_t, uint64_t, + z0 = svdup_lane_s64 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_s64_tied1: +** dup z0\.d, z0\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_s64_tied1, svint64_t, + z0 = svdup_lane_s64 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_s64_untied: +** dup z0\.d, z1\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_s64_untied, svint64_t, + z0 = svdup_lane_s64 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_7_s64: +** dup z0\.d, z0\.d\[7\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_7_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 7), + z0 = svdup_lane (z0, 7)) + +/* +** dup_lane_8_s64: +** mov (z[0-9]+\.d), #8 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_8_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 8), + z0 = svdup_lane (z0, 8)) + +/* +** dup_lane_15_s64: +** mov (z[0-9]+\.d), #15 +** tbl z0\.d, 
z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_s64: +** mov (z[0-9]+\.d), #16 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_s64: +** mov (z[0-9]+\.d), #31 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_s64: +** mov (z[0-9]+\.d), #32 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_s64: +** mov (z[0-9]+\.d), #63 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_s64: +** mov (z[0-9]+\.d), #64 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_s64: +** mov (z[0-9]+\.d), #255 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_s64, svint64_t, + z0 = svdup_lane_s64 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s8.c new file mode 100644 index 000000000..f5a07e9f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_s8.c @@ -0,0 +1,124 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_s8_tied1: +** mov (z[0-9]+\.b), w0 +** tbl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_s8_tied1, svint8_t, uint8_t, + z0 = svdup_lane_s8 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_s8_untied: +** mov (z[0-9]+\.b), w0 +** tbl z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_s8_untied, svint8_t, uint8_t, + z0 = svdup_lane_s8 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_s8_tied1: +** dup z0\.b, z0\.b\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_s8_tied1, svint8_t, + z0 = svdup_lane_s8 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_s8_untied: +** dup z0\.b, z1\.b\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_s8_untied, svint8_t, + z0 = svdup_lane_s8 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_7_s8: +** dup z0\.b, z0\.b\[7\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_7_s8, svint8_t, + z0 = svdup_lane_s8 (z0, 7), + z0 = svdup_lane (z0, 7)) + +/* +** dup_lane_8_s8: +** dup z0\.b, z0\.b\[8\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_8_s8, svint8_t, + z0 = svdup_lane_s8 (z0, 8), + z0 = svdup_lane (z0, 8)) + +/* +** dup_lane_15_s8: +** dup z0\.b, z0\.b\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_s8, svint8_t, + z0 = svdup_lane_s8 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_s8: +** dup z0\.b, z0\.b\[16\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_s8, svint8_t, + z0 = svdup_lane_s8 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_s8: +** dup z0\.b, z0\.b\[31\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_s8, svint8_t, + z0 = svdup_lane_s8 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_s8: +** dup z0\.b, z0\.b\[32\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_s8, svint8_t, + z0 = svdup_lane_s8 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_s8: +** dup z0\.b, z0\.b\[63\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_s8, 
svint8_t, + z0 = svdup_lane_s8 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_s8: +** mov (z[0-9]+\.b), #64 +** tbl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_s8, svint8_t, + z0 = svdup_lane_s8 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_s8: +** mov (z[0-9]+\.b), #-1 +** tbl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_s8, svint8_t, + z0 = svdup_lane_s8 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u16.c new file mode 100644 index 000000000..e5135caa5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u16.c @@ -0,0 +1,126 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_u16_tied1: +** mov (z[0-9]+\.h), w0 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_u16_tied1, svuint16_t, uint16_t, + z0 = svdup_lane_u16 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_u16_untied: +** mov (z[0-9]+\.h), w0 +** tbl z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_u16_untied, svuint16_t, uint16_t, + z0 = svdup_lane_u16 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_u16_tied1: +** dup z0\.h, z0\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_u16_tied1, svuint16_t, + z0 = svdup_lane_u16 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_u16_untied: +** dup z0\.h, z1\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_u16_untied, svuint16_t, + z0 = svdup_lane_u16 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_7_u16: +** dup z0\.h, z0\.h\[7\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_7_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 7), + z0 = svdup_lane (z0, 7)) + +/* +** dup_lane_8_u16: +** dup z0\.h, z0\.h\[8\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_8_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 8), + z0 = svdup_lane (z0, 8)) + +/* +** dup_lane_15_u16: +** dup z0\.h, z0\.h\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_u16: +** dup z0\.h, z0\.h\[16\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_u16: +** dup z0\.h, z0\.h\[31\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_u16: +** mov (z[0-9]+\.h), #32 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_u16: +** mov (z[0-9]+\.h), #63 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_u16: +** mov (z[0-9]+\.h), #64 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_u16: +** mov (z[0-9]+\.h), #255 +** tbl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_u16, svuint16_t, + z0 = svdup_lane_u16 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u32.c new file mode 100644 index 000000000..7e972aca7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u32.c @@ -0,0 +1,128 
@@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_u32_tied1: +** mov (z[0-9]+\.s), w0 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_u32_tied1, svuint32_t, uint32_t, + z0 = svdup_lane_u32 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_u32_untied: +** mov (z[0-9]+\.s), w0 +** tbl z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_u32_untied, svuint32_t, uint32_t, + z0 = svdup_lane_u32 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_u32_tied1: +** dup z0\.s, z0\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_u32_tied1, svuint32_t, + z0 = svdup_lane_u32 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_u32_untied: +** dup z0\.s, z1\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_u32_untied, svuint32_t, + z0 = svdup_lane_u32 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_7_u32: +** dup z0\.s, z0\.s\[7\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_7_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 7), + z0 = svdup_lane (z0, 7)) + +/* +** dup_lane_8_u32: +** dup z0\.s, z0\.s\[8\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_8_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 8), + z0 = svdup_lane (z0, 8)) + +/* +** dup_lane_15_u32: +** dup z0\.s, z0\.s\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_u32: +** mov (z[0-9]+\.s), #16 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_u32: +** mov (z[0-9]+\.s), #31 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_u32: +** mov (z[0-9]+\.s), #32 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_u32: +** mov (z[0-9]+\.s), #63 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_u32: +** mov (z[0-9]+\.s), #64 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_u32: +** mov (z[0-9]+\.s), #255 +** tbl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_u32, svuint32_t, + z0 = svdup_lane_u32 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u64.c new file mode 100644 index 000000000..5097b7e96 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u64.c @@ -0,0 +1,130 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_x0_u64_tied1: +** mov (z[0-9]+\.d), x0 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_x0_u64_tied1, svuint64_t, uint64_t, + z0 = svdup_lane_u64 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_x0_u64_untied: +** mov (z[0-9]+\.d), x0 +** tbl z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_x0_u64_untied, svuint64_t, uint64_t, + z0 = svdup_lane_u64 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_u64_tied1: +** dup z0\.d, z0\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_u64_tied1, svuint64_t, + z0 = svdup_lane_u64 (z0, 
0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_u64_untied: +** dup z0\.d, z1\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_u64_untied, svuint64_t, + z0 = svdup_lane_u64 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_7_u64: +** dup z0\.d, z0\.d\[7\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_7_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 7), + z0 = svdup_lane (z0, 7)) + +/* +** dup_lane_8_u64: +** mov (z[0-9]+\.d), #8 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_8_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 8), + z0 = svdup_lane (z0, 8)) + +/* +** dup_lane_15_u64: +** mov (z[0-9]+\.d), #15 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_u64: +** mov (z[0-9]+\.d), #16 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_u64: +** mov (z[0-9]+\.d), #31 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_u64: +** mov (z[0-9]+\.d), #32 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_u64: +** mov (z[0-9]+\.d), #63 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_u64: +** mov (z[0-9]+\.d), #64 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_u64: +** mov (z[0-9]+\.d), #255 +** tbl z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_u64, svuint64_t, + z0 = svdup_lane_u64 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u8.c new file mode 100644 index 000000000..25fdf0acb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_lane_u8.c @@ -0,0 +1,124 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_lane_w0_u8_tied1: +** mov (z[0-9]+\.b), w0 +** tbl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_u8_tied1, svuint8_t, uint8_t, + z0 = svdup_lane_u8 (z0, x0), + z0 = svdup_lane (z0, x0)) + +/* +** dup_lane_w0_u8_untied: +** mov (z[0-9]+\.b), w0 +** tbl z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (dup_lane_w0_u8_untied, svuint8_t, uint8_t, + z0 = svdup_lane_u8 (z1, x0), + z0 = svdup_lane (z1, x0)) + +/* +** dup_lane_0_u8_tied1: +** dup z0\.b, z0\.b\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_u8_tied1, svuint8_t, + z0 = svdup_lane_u8 (z0, 0), + z0 = svdup_lane (z0, 0)) + +/* +** dup_lane_0_u8_untied: +** dup z0\.b, z1\.b\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_0_u8_untied, svuint8_t, + z0 = svdup_lane_u8 (z1, 0), + z0 = svdup_lane (z1, 0)) + +/* +** dup_lane_7_u8: +** dup z0\.b, z0\.b\[7\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_7_u8, svuint8_t, + z0 = svdup_lane_u8 (z0, 7), + z0 = svdup_lane (z0, 7)) + +/* +** dup_lane_8_u8: +** dup z0\.b, z0\.b\[8\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_8_u8, svuint8_t, + z0 = svdup_lane_u8 (z0, 8), + z0 = svdup_lane (z0, 8)) + +/* +** dup_lane_15_u8: +** dup z0\.b, z0\.b\[15\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_15_u8, svuint8_t, + z0 = 
svdup_lane_u8 (z0, 15), + z0 = svdup_lane (z0, 15)) + +/* +** dup_lane_16_u8: +** dup z0\.b, z0\.b\[16\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_16_u8, svuint8_t, + z0 = svdup_lane_u8 (z0, 16), + z0 = svdup_lane (z0, 16)) + +/* +** dup_lane_31_u8: +** dup z0\.b, z0\.b\[31\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_31_u8, svuint8_t, + z0 = svdup_lane_u8 (z0, 31), + z0 = svdup_lane (z0, 31)) + +/* +** dup_lane_32_u8: +** dup z0\.b, z0\.b\[32\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_32_u8, svuint8_t, + z0 = svdup_lane_u8 (z0, 32), + z0 = svdup_lane (z0, 32)) + +/* +** dup_lane_63_u8: +** dup z0\.b, z0\.b\[63\] +** ret +*/ +TEST_UNIFORM_Z (dup_lane_63_u8, svuint8_t, + z0 = svdup_lane_u8 (z0, 63), + z0 = svdup_lane (z0, 63)) + +/* +** dup_lane_64_u8: +** mov (z[0-9]+\.b), #64 +** tbl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_64_u8, svuint8_t, + z0 = svdup_lane_u8 (z0, 64), + z0 = svdup_lane (z0, 64)) + +/* +** dup_lane_255_u8: +** mov (z[0-9]+\.b), #-1 +** tbl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_lane_255_u8, svuint8_t, + z0 = svdup_lane_u8 (z0, 255), + z0 = svdup_lane (z0, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s16.c new file mode 100644 index 000000000..876f36db7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s16.c @@ -0,0 +1,1193 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_s16: +** mov z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s16, svint16_t, + z0 = svdup_n_s16 (1), + z0 = svdup_s16 (1)) + +/* +** dup_127_s16: +** mov z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s16, svint16_t, + z0 = svdup_n_s16 (127), + z0 = svdup_s16 (127)) + +/* +** dup_128_s16: +** mov z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s16, svint16_t, + z0 = svdup_n_s16 (128), + z0 = svdup_s16 (128)) + +/* +** dup_129_s16: +** movi v([0-9]+)\.8h, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_129_s16, svint16_t, + z0 = svdup_n_s16 (129), + z0 = svdup_s16 (129)) + +/* +** dup_253_s16: +** movi v([0-9]+)\.8h, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_253_s16, svint16_t, + z0 = svdup_n_s16 (253), + z0 = svdup_s16 (253)) + +/* +** dup_254_s16: +** mov z0\.h, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s16, svint16_t, + z0 = svdup_n_s16 (254), + z0 = svdup_s16 (254)) + +/* +** dup_255_s16: +** mov z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s16, svint16_t, + z0 = svdup_n_s16 (255), + z0 = svdup_s16 (255)) + +/* +** dup_256_s16: +** mov z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s16, svint16_t, + z0 = svdup_n_s16 (256), + z0 = svdup_s16 (256)) + +/* +** dup_257_s16: +** mov z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_s16, svint16_t, + z0 = svdup_n_s16 (257), + z0 = svdup_s16 (257)) + +/* +** dup_512_s16: +** mov z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s16, svint16_t, + z0 = svdup_n_s16 (512), + z0 = svdup_s16 (512)) + +/* +** dup_7f00_s16: +** mov z0\.h, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s16, svint16_t, + z0 = svdup_n_s16 (0x7f00), + z0 = svdup_s16 (0x7f00)) + +/* +** dup_7f01_s16: +** mov (w[0-9]+), 32513 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_s16, svint16_t, + z0 = svdup_n_s16 (0x7f01), + z0 = svdup_s16 (0x7f01)) + +/* +** dup_7ffd_s16: +** mov (w[0-9]+), 32765 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_s16, svint16_t, + z0 = svdup_n_s16 (0x7ffd), + z0 = svdup_s16 (0x7ffd)) + +/* +** dup_7ffe_s16: +** mov z0\.h, 
#32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s16, svint16_t, + z0 = svdup_n_s16 (0x7ffe), + z0 = svdup_s16 (0x7ffe)) + +/* +** dup_7fff_s16: +** mov z0\.h, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s16, svint16_t, + z0 = svdup_n_s16 (0x7fff), + z0 = svdup_s16 (0x7fff)) + +/* +** dup_m1_s16: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s16, svint16_t, + z0 = svdup_n_s16 (-1), + z0 = svdup_s16 (-1)) + +/* +** dup_m128_s16: +** mov z0\.h, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s16, svint16_t, + z0 = svdup_n_s16 (-128), + z0 = svdup_s16 (-128)) + +/* +** dup_m129_s16: +** mov z0\.h, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s16, svint16_t, + z0 = svdup_n_s16 (-129), + z0 = svdup_s16 (-129)) + +/* +** dup_m130_s16: +** mvni v([0-9]+)\.8h, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m130_s16, svint16_t, + z0 = svdup_n_s16 (-130), + z0 = svdup_s16 (-130)) + +/* +** dup_m254_s16: +** mvni v([0-9]+)\.8h, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m254_s16, svint16_t, + z0 = svdup_n_s16 (-254), + z0 = svdup_s16 (-254)) + +/* +** dup_m255_s16: +** mov z0\.h, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s16, svint16_t, + z0 = svdup_n_s16 (-255), + z0 = svdup_s16 (-255)) + +/* +** dup_m256_s16: +** mov z0\.h, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s16, svint16_t, + z0 = svdup_n_s16 (-256), + z0 = svdup_s16 (-256)) + +/* +** dup_m257_s16: +** mov z0\.h, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s16, svint16_t, + z0 = svdup_n_s16 (-257), + z0 = svdup_s16 (-257)) + +/* +** dup_m258_s16: +** mov z0\.b, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_s16, svint16_t, + z0 = svdup_n_s16 (-258), + z0 = svdup_s16 (-258)) + +/* +** dup_m259_s16: +** mov (w[0-9]+), -259 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_s16, svint16_t, + z0 = svdup_n_s16 (-259), + z0 = svdup_s16 (-259)) + +/* +** dup_m512_s16: +** mov z0\.h, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s16, svint16_t, + z0 = svdup_n_s16 (-512), + z0 = svdup_s16 (-512)) + +/* +** dup_m7f00_s16: +** mov z0\.h, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s16, svint16_t, + z0 = svdup_n_s16 (-0x7f00), + z0 = svdup_s16 (-0x7f00)) + +/* +** dup_m7f01_s16: +** mov z0\.h, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s16, svint16_t, + z0 = svdup_n_s16 (-0x7f01), + z0 = svdup_s16 (-0x7f01)) + +/* +** dup_m7f02_s16: +** mov (w[0-9]+), -32514 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_s16, svint16_t, + z0 = svdup_n_s16 (-0x7f02), + z0 = svdup_s16 (-0x7f02)) + +/* +** dup_m7ffe_s16: +** mov (w[0-9]+), -32766 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_s16, svint16_t, + z0 = svdup_n_s16 (-0x7ffe), + z0 = svdup_s16 (-0x7ffe)) + +/* +** dup_m7fff_s16: +** mov z0\.h, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s16, svint16_t, + z0 = svdup_n_s16 (-0x7fff), + z0 = svdup_s16 (-0x7fff)) + +/* +** dup_m8000_s16: +** mov z0\.h, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s16, svint16_t, + z0 = svdup_n_s16 (-0x8000), + z0 = svdup_s16 (-0x8000)) + +/* +** dup_w0_s16: +** mov z0\.h, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s16, svint16_t, int16_t, + z0 = svdup_n_s16 (x0), + z0 = svdup_s16 (x0)) + +/* +** dup_1_s16_m: +** mov z0\.h, p0/m, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 1), + z0 = svdup_s16_m (z0, p0, 1)) + +/* +** dup_127_s16_m: +** mov z0\.h, p0/m, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 127), + z0 = svdup_s16_m (z0, p0, 127)) + +/* +** dup_128_s16_m: +** mov 
(z[0-9]+\.h), #128 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_128_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 128), + z0 = svdup_s16_m (z0, p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 129), + z0 = svdup_s16_m (z0, p0, 129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_253_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 253), + z0 = svdup_s16_m (z0, p0, 253)) + +/* +** dup_254_s16_m: +** mov (z[0-9]+\.h), #254 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_254_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 254), + z0 = svdup_s16_m (z0, p0, 254)) + +/* +** dup_255_s16_m: +** mov (z[0-9]+\.h), #255 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_255_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 255), + z0 = svdup_s16_m (z0, p0, 255)) + +/* +** dup_256_s16_m: +** mov z0\.h, p0/m, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 256), + z0 = svdup_s16_m (z0, p0, 256)) + +/* +** dup_257_s16_m: +** mov (z[0-9]+)\.b, #1 +** sel z0\.h, p0, \1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_257_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 257), + z0 = svdup_s16_m (z0, p0, 257)) + +/* +** dup_512_s16_m: +** mov z0\.h, p0/m, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 512), + z0 = svdup_s16_m (z0, p0, 512)) + +/* +** dup_7f00_s16_m: +** mov z0\.h, p0/m, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 0x7f00), + z0 = svdup_s16_m (z0, p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 0x7f01), + z0 = svdup_s16_m (z0, p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 0x7ffd), + z0 = svdup_s16_m (z0, p0, 0x7ffd)) + +/* +** dup_7ffe_s16_m: +** mov (z[0-9]+\.h), #32766 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 0x7ffe), + z0 = svdup_s16_m (z0, p0, 0x7ffe)) + +/* +** dup_7fff_s16_m: +** mov (z[0-9]+\.h), #32767 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 0x7fff), + z0 = svdup_s16_m (z0, p0, 0x7fff)) + +/* +** dup_m1_s16_m: +** mov z0\.h, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -1), + z0 = svdup_s16_m (z0, p0, -1)) + +/* +** dup_m128_s16_m: +** mov z0\.h, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -128), + z0 = svdup_s16_m (z0, p0, -128)) + +/* +** dup_m129_s16_m: +** mov (z[0-9]+\.h), #-129 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -129), + z0 = svdup_s16_m (z0, p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -130), + z0 = svdup_s16_m (z0, p0, -130)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m254_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -254), + z0 = svdup_s16_m (z0, p0, -254)) + +/* +** dup_m255_s16_m: +** mov (z[0-9]+\.h), #-255 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -255), + z0 = svdup_s16_m (z0, p0, -255)) + +/* +** dup_m256_s16_m: +** mov z0\.h, p0/m, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -256), + z0 = svdup_s16_m (z0, p0, -256)) + +/* +** dup_m257_s16_m: +** mov (z[0-9]+\.h), #-257 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -257), + z0 = svdup_s16_m (z0, p0, -257)) + +/* +** dup_m258_s16_m: +** mov (z[0-9]+)\.b, #-2 +** sel z0\.h, p0, \1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m258_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -258), + z0 = svdup_s16_m (z0, p0, -258)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m259_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -259), + z0 = svdup_s16_m (z0, p0, -259)) + +/* +** dup_m512_s16_m: +** mov z0\.h, p0/m, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -512), + z0 = svdup_s16_m (z0, p0, -512)) + +/* +** dup_m7f00_s16_m: +** mov z0\.h, p0/m, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -0x7f00), + z0 = svdup_s16_m (z0, p0, -0x7f00)) + +/* +** dup_m7f01_s16_m: +** mov (z[0-9]+\.h), #-32513 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -0x7f01), + z0 = svdup_s16_m (z0, p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -0x7f02), + z0 = svdup_s16_m (z0, p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -0x7ffe), + z0 = svdup_s16_m (z0, p0, -0x7ffe)) + +/* +** dup_m7fff_s16_m: +** mov (z[0-9]+\.h), #-32767 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -0x7fff), + z0 = svdup_s16_m (z0, p0, -0x7fff)) + +/* +** dup_m8000_s16_m: +** mov z0\.h, p0/m, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, -0x8000), + z0 = svdup_s16_m (z0, p0, -0x8000)) + +/* +** dup_0_s16_m: +** mov z0\.h, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_s16_m, svint16_t, + z0 = svdup_n_s16_m (z0, p0, 0), + z0 = svdup_s16_m (z0, p0, 0)) + +/* +** dup_w0_s16_m: +** movprfx z0, z1 +** mov z0\.h, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s16_m, svint16_t, int16_t, + z0 = svdup_n_s16_m (z1, p0, x0), + z0 = svdup_s16_m (z1, p0, x0)) + +/* +** dup_1_s16_z: +** mov z0\.h, p0/z, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 1), + z0 = svdup_s16_z (p0, 1)) + +/* +** dup_127_s16_z: +** mov z0\.h, p0/z, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 127), + z0 = svdup_s16_z (p0, 127)) + +/* +** dup_128_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #128 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_128_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 128), + z0 = svdup_s16_z (p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 129), + z0 = svdup_s16_z (p0, 129)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_253_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 253), + z0 = svdup_s16_z (p0, 253)) + +/* +** dup_254_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #254 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_254_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 254), + z0 = svdup_s16_z (p0, 254)) + +/* +** dup_255_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #255 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_255_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 255), + z0 = svdup_s16_z (p0, 255)) + +/* +** dup_256_s16_z: +** mov z0\.h, p0/z, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 256), + z0 = svdup_s16_z (p0, 256)) + +/* +** dup_257_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+)\.b, #1 +** sel z0\.h, p0, \2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_257_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 257), + z0 = svdup_s16_z (p0, 257)) + +/* +** dup_512_s16_z: +** mov z0\.h, p0/z, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 512), + z0 = svdup_s16_z (p0, 512)) + +/* +** dup_7f00_s16_z: +** mov z0\.h, p0/z, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 0x7f00), + z0 = svdup_s16_z (p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 0x7f01), + z0 = svdup_s16_z (p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 0x7ffd), + z0 = svdup_s16_z (p0, 0x7ffd)) + +/* +** dup_7ffe_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #32766 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 0x7ffe), + z0 = svdup_s16_z (p0, 0x7ffe)) + +/* +** dup_7fff_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #32767 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 0x7fff), + z0 = svdup_s16_z (p0, 0x7fff)) + +/* +** dup_m1_s16_z: +** mov z0\.h, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -1), + z0 = svdup_s16_z (p0, -1)) + +/* +** dup_m128_s16_z: +** mov z0\.h, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -128), + z0 = svdup_s16_z (p0, -128)) + +/* +** dup_m129_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-129 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -129), + z0 = svdup_s16_z (p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -130), + z0 = svdup_s16_z (p0, -130)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m254_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -254), + z0 = svdup_s16_z (p0, -254)) + +/* +** dup_m255_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-255 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -255), + z0 = svdup_s16_z (p0, -255)) + +/* +** dup_m256_s16_z: +** mov z0\.h, p0/z, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -256), + z0 = svdup_s16_z (p0, -256)) + +/* +** dup_m257_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-257 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -257), + z0 = svdup_s16_z (p0, -257)) + +/* +** dup_m258_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+)\.b, #-2 +** sel z0\.h, p0, \2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m258_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -258), + z0 = svdup_s16_z (p0, -258)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m259_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -259), + z0 = svdup_s16_z (p0, -259)) + +/* +** dup_m512_s16_z: +** mov z0\.h, p0/z, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -512), + z0 = svdup_s16_z (p0, -512)) + +/* +** dup_m7f00_s16_z: +** mov z0\.h, p0/z, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -0x7f00), + z0 = svdup_s16_z (p0, -0x7f00)) + +/* +** dup_m7f01_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-32513 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -0x7f01), + z0 = svdup_s16_z (p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -0x7f02), + z0 = svdup_s16_z (p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m7ffe_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -0x7ffe), + z0 = svdup_s16_z (p0, -0x7ffe)) + +/* +** dup_m7fff_s16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-32767 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -0x7fff), + z0 = svdup_s16_z (p0, -0x7fff)) + +/* +** dup_m8000_s16_z: +** mov z0\.h, p0/z, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, -0x8000), + z0 = svdup_s16_z (p0, -0x8000)) + +/* +** dup_0_s16_z: +** mov z0\.h, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_s16_z, svint16_t, + z0 = svdup_n_s16_z (p0, 0), + z0 = svdup_s16_z (p0, 0)) + +/* +** dup_w0_s16_z: +** movprfx z0\.h, p0/z, z0\.h +** mov z0\.h, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s16_z, svint16_t, int16_t, + z0 = svdup_n_s16_z (p0, x0), + z0 = svdup_s16_z (p0, x0)) + +/* +** dup_1_s16_x: +** mov z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 1), + z0 = svdup_s16_x (p0, 1)) + +/* +** dup_127_s16_x: +** mov z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 127), + z0 = svdup_s16_x (p0, 127)) + +/* +** dup_128_s16_x: +** mov z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 128), + z0 = svdup_s16_x (p0, 128)) + +/* +** dup_129_s16_x: +** movi v([0-9]+)\.8h, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_129_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 129), + z0 = svdup_s16_x (p0, 129)) + +/* +** dup_253_s16_x: +** movi v([0-9]+)\.8h, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_253_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 253), + z0 = svdup_s16_x (p0, 253)) + +/* +** dup_254_s16_x: +** mov z0\.h, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 254), + z0 = svdup_s16_x (p0, 254)) + +/* +** dup_255_s16_x: +** mov z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 255), + z0 = svdup_s16_x (p0, 255)) + +/* +** dup_256_s16_x: +** mov z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 256), + z0 = svdup_s16_x (p0, 256)) + +/* +** dup_257_s16_x: +** mov z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 257), + z0 = svdup_s16_x (p0, 257)) + +/* +** dup_512_s16_x: +** mov z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 512), + z0 = svdup_s16_x (p0, 512)) + +/* +** dup_7f00_s16_x: +** mov z0\.h, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 0x7f00), + z0 = svdup_s16_x (p0, 0x7f00)) + +/* +** dup_7f01_s16_x: +** mov (w[0-9]+), 32513 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 0x7f01), + z0 = svdup_s16_x (p0, 0x7f01)) + +/* +** dup_7ffd_s16_x: +** mov (w[0-9]+), 32765 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 0x7ffd), + z0 = svdup_s16_x (p0, 0x7ffd)) + +/* +** dup_7ffe_s16_x: +** mov z0\.h, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 0x7ffe), + z0 = svdup_s16_x (p0, 0x7ffe)) + +/* +** dup_7fff_s16_x: +** mov z0\.h, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, 0x7fff), + z0 = svdup_s16_x (p0, 0x7fff)) + +/* +** dup_m1_s16_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z 
(dup_m1_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -1), + z0 = svdup_s16_x (p0, -1)) + +/* +** dup_m128_s16_x: +** mov z0\.h, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -128), + z0 = svdup_s16_x (p0, -128)) + +/* +** dup_m129_s16_x: +** mov z0\.h, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -129), + z0 = svdup_s16_x (p0, -129)) + +/* +** dup_m130_s16_x: +** mvni v([0-9]+)\.8h, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m130_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -130), + z0 = svdup_s16_x (p0, -130)) + +/* +** dup_m254_s16_x: +** mvni v([0-9]+)\.8h, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m254_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -254), + z0 = svdup_s16_x (p0, -254)) + +/* +** dup_m255_s16_x: +** mov z0\.h, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -255), + z0 = svdup_s16_x (p0, -255)) + +/* +** dup_m256_s16_x: +** mov z0\.h, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -256), + z0 = svdup_s16_x (p0, -256)) + +/* +** dup_m257_s16_x: +** mov z0\.h, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -257), + z0 = svdup_s16_x (p0, -257)) + +/* +** dup_m258_s16_x: +** mov z0\.b, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -258), + z0 = svdup_s16_x (p0, -258)) + +/* +** dup_m259_s16_x: +** mov (w[0-9]+), -259 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -259), + z0 = svdup_s16_x (p0, -259)) + +/* +** dup_m512_s16_x: +** mov z0\.h, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -512), + z0 = svdup_s16_x (p0, -512)) + +/* +** dup_m7f00_s16_x: +** mov z0\.h, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -0x7f00), + z0 = svdup_s16_x (p0, -0x7f00)) + +/* +** dup_m7f01_s16_x: +** mov z0\.h, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -0x7f01), + z0 = svdup_s16_x (p0, -0x7f01)) + +/* +** dup_m7f02_s16_x: +** mov (w[0-9]+), -32514 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -0x7f02), + z0 = svdup_s16_x (p0, -0x7f02)) + +/* +** dup_m7ffe_s16_x: +** mov (w[0-9]+), -32766 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -0x7ffe), + z0 = svdup_s16_x (p0, -0x7ffe)) + +/* +** dup_m7fff_s16_x: +** mov z0\.h, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -0x7fff), + z0 = svdup_s16_x (p0, -0x7fff)) + +/* +** dup_m8000_s16_x: +** mov z0\.h, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s16_x, svint16_t, + z0 = svdup_n_s16_x (p0, -0x8000), + z0 = svdup_s16_x (p0, -0x8000)) + +/* +** dup_w0_s16_x: +** mov z0\.h, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s16_x, svint16_t, int16_t, + z0 = svdup_n_s16_x (p0, x0), + z0 = svdup_s16_x (p0, x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s32.c new file mode 100644 index 000000000..0b396dbeb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s32.c @@ -0,0 +1,1175 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_s32: +** mov z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z 
(dup_1_s32, svint32_t, + z0 = svdup_n_s32 (1), + z0 = svdup_s32 (1)) + +/* +** dup_127_s32: +** mov z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s32, svint32_t, + z0 = svdup_n_s32 (127), + z0 = svdup_s32 (127)) + +/* +** dup_128_s32: +** mov z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s32, svint32_t, + z0 = svdup_n_s32 (128), + z0 = svdup_s32 (128)) + +/* +** dup_129_s32: +** movi v([0-9]+)\.4s, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_129_s32, svint32_t, + z0 = svdup_n_s32 (129), + z0 = svdup_s32 (129)) + +/* +** dup_253_s32: +** movi v([0-9]+)\.4s, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_253_s32, svint32_t, + z0 = svdup_n_s32 (253), + z0 = svdup_s32 (253)) + +/* +** dup_254_s32: +** mov z0\.s, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s32, svint32_t, + z0 = svdup_n_s32 (254), + z0 = svdup_s32 (254)) + +/* +** dup_255_s32: +** mov z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s32, svint32_t, + z0 = svdup_n_s32 (255), + z0 = svdup_s32 (255)) + +/* +** dup_256_s32: +** mov z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s32, svint32_t, + z0 = svdup_n_s32 (256), + z0 = svdup_s32 (256)) + +/* +** dup_257_s32: +** mov (w[0-9]+), 257 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_s32, svint32_t, + z0 = svdup_n_s32 (257), + z0 = svdup_s32 (257)) + +/* +** dup_512_s32: +** mov z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s32, svint32_t, + z0 = svdup_n_s32 (512), + z0 = svdup_s32 (512)) + +/* +** dup_7f00_s32: +** mov z0\.s, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s32, svint32_t, + z0 = svdup_n_s32 (0x7f00), + z0 = svdup_s32 (0x7f00)) + +/* +** dup_7f01_s32: +** mov (w[0-9]+), 32513 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_s32, svint32_t, + z0 = svdup_n_s32 (0x7f01), + z0 = svdup_s32 (0x7f01)) + +/* +** dup_7ffd_s32: +** mov (w[0-9]+), 32765 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_s32, svint32_t, + z0 = svdup_n_s32 (0x7ffd), + z0 = svdup_s32 (0x7ffd)) + +/* +** dup_7ffe_s32: +** mov z0\.s, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s32, svint32_t, + z0 = svdup_n_s32 (0x7ffe), + z0 = svdup_s32 (0x7ffe)) + +/* +** dup_7fff_s32: +** mov z0\.s, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s32, svint32_t, + z0 = svdup_n_s32 (0x7fff), + z0 = svdup_s32 (0x7fff)) + +/* +** dup_m1_s32: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s32, svint32_t, + z0 = svdup_n_s32 (-1), + z0 = svdup_s32 (-1)) + +/* +** dup_m128_s32: +** mov z0\.s, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s32, svint32_t, + z0 = svdup_n_s32 (-128), + z0 = svdup_s32 (-128)) + +/* +** dup_m129_s32: +** mov z0\.s, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s32, svint32_t, + z0 = svdup_n_s32 (-129), + z0 = svdup_s32 (-129)) + +/* +** dup_m130_s32: +** mvni v([0-9]+)\.4s, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m130_s32, svint32_t, + z0 = svdup_n_s32 (-130), + z0 = svdup_s32 (-130)) + +/* +** dup_m254_s32: +** mvni v([0-9]+)\.4s, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m254_s32, svint32_t, + z0 = svdup_n_s32 (-254), + z0 = svdup_s32 (-254)) + +/* +** dup_m255_s32: +** mov z0\.s, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s32, svint32_t, + z0 = svdup_n_s32 (-255), + z0 = svdup_s32 (-255)) + +/* +** dup_m256_s32: +** mov z0\.s, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s32, svint32_t, + z0 = svdup_n_s32 (-256), + z0 = svdup_s32 (-256)) + +/* +** dup_m257_s32: +** mov z0\.s, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s32, svint32_t, + z0 = svdup_n_s32 (-257), + z0 = 
svdup_s32 (-257)) + +/* +** dup_m258_s32: +** mov (w[0-9]+), -258 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_s32, svint32_t, + z0 = svdup_n_s32 (-258), + z0 = svdup_s32 (-258)) + +/* +** dup_m259_s32: +** mov (w[0-9]+), -259 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_s32, svint32_t, + z0 = svdup_n_s32 (-259), + z0 = svdup_s32 (-259)) + +/* +** dup_m512_s32: +** mov z0\.s, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s32, svint32_t, + z0 = svdup_n_s32 (-512), + z0 = svdup_s32 (-512)) + +/* +** dup_m7f00_s32: +** mov z0\.s, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s32, svint32_t, + z0 = svdup_n_s32 (-0x7f00), + z0 = svdup_s32 (-0x7f00)) + +/* +** dup_m7f01_s32: +** mov z0\.s, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s32, svint32_t, + z0 = svdup_n_s32 (-0x7f01), + z0 = svdup_s32 (-0x7f01)) + +/* +** dup_m7f02_s32: +** mov (w[0-9]+), -32514 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_s32, svint32_t, + z0 = svdup_n_s32 (-0x7f02), + z0 = svdup_s32 (-0x7f02)) + +/* +** dup_m7ffe_s32: +** mov (w[0-9]+), -32766 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_s32, svint32_t, + z0 = svdup_n_s32 (-0x7ffe), + z0 = svdup_s32 (-0x7ffe)) + +/* +** dup_m7fff_s32: +** mov z0\.s, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s32, svint32_t, + z0 = svdup_n_s32 (-0x7fff), + z0 = svdup_s32 (-0x7fff)) + +/* +** dup_m8000_s32: +** mov z0\.s, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s32, svint32_t, + z0 = svdup_n_s32 (-0x8000), + z0 = svdup_s32 (-0x8000)) + +/* +** dup_w0_s32: +** mov z0\.s, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s32, svint32_t, int32_t, + z0 = svdup_n_s32 (x0), + z0 = svdup_s32 (x0)) + +/* +** dup_1_s32_m: +** mov z0\.s, p0/m, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 1), + z0 = svdup_s32_m (z0, p0, 1)) + +/* +** dup_127_s32_m: +** mov z0\.s, p0/m, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 127), + z0 = svdup_s32_m (z0, p0, 127)) + +/* +** dup_128_s32_m: +** mov (z[0-9]+\.s), #128 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_128_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 128), + z0 = svdup_s32_m (z0, p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 129), + z0 = svdup_s32_m (z0, p0, 129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_253_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 253), + z0 = svdup_s32_m (z0, p0, 253)) + +/* +** dup_254_s32_m: +** mov (z[0-9]+\.s), #254 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_254_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 254), + z0 = svdup_s32_m (z0, p0, 254)) + +/* +** dup_255_s32_m: +** mov (z[0-9]+\.s), #255 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_255_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 255), + z0 = svdup_s32_m (z0, p0, 255)) + +/* +** dup_256_s32_m: +** mov z0\.s, p0/m, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 256), + z0 = svdup_s32_m (z0, p0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_257_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 257), + z0 = svdup_s32_m (z0, p0, 257)) + +/* +** dup_512_s32_m: +** mov z0\.s, p0/m, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 512), + z0 = svdup_s32_m (z0, p0, 512)) + +/* +** dup_7f00_s32_m: +** mov z0\.s, p0/m, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 0x7f00), + z0 = svdup_s32_m (z0, p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 0x7f01), + z0 = svdup_s32_m (z0, p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 0x7ffd), + z0 = svdup_s32_m (z0, p0, 0x7ffd)) + +/* +** dup_7ffe_s32_m: +** mov (z[0-9]+\.s), #32766 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 0x7ffe), + z0 = svdup_s32_m (z0, p0, 0x7ffe)) + +/* +** dup_7fff_s32_m: +** mov (z[0-9]+\.s), #32767 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 0x7fff), + z0 = svdup_s32_m (z0, p0, 0x7fff)) + +/* +** dup_m1_s32_m: +** mov z0\.s, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -1), + z0 = svdup_s32_m (z0, p0, -1)) + +/* +** dup_m128_s32_m: +** mov z0\.s, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -128), + z0 = svdup_s32_m (z0, p0, -128)) + +/* +** dup_m129_s32_m: +** mov (z[0-9]+\.s), #-129 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -129), + z0 = svdup_s32_m (z0, p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -130), + z0 = svdup_s32_m (z0, p0, -130)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m254_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -254), + z0 = svdup_s32_m (z0, p0, -254)) + +/* +** dup_m255_s32_m: +** mov (z[0-9]+\.s), #-255 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -255), + z0 = svdup_s32_m (z0, p0, -255)) + +/* +** dup_m256_s32_m: +** mov z0\.s, p0/m, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -256), + z0 = svdup_s32_m (z0, p0, -256)) + +/* +** dup_m257_s32_m: +** mov (z[0-9]+\.s), #-257 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -257), + z0 = svdup_s32_m (z0, p0, -257)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m258_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -258), + z0 = svdup_s32_m (z0, p0, -258)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m259_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -259), + z0 = svdup_s32_m (z0, p0, -259)) + +/* +** dup_m512_s32_m: +** mov z0\.s, p0/m, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -512), + z0 = svdup_s32_m (z0, p0, -512)) + +/* +** dup_m7f00_s32_m: +** mov z0\.s, p0/m, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -0x7f00), + z0 = svdup_s32_m (z0, p0, -0x7f00)) + +/* +** dup_m7f01_s32_m: +** mov (z[0-9]+\.s), #-32513 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -0x7f01), + z0 = svdup_s32_m (z0, p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -0x7f02), + z0 = svdup_s32_m (z0, p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -0x7ffe), + z0 = svdup_s32_m (z0, p0, -0x7ffe)) + +/* +** dup_m7fff_s32_m: +** mov (z[0-9]+\.s), #-32767 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -0x7fff), + z0 = svdup_s32_m (z0, p0, -0x7fff)) + +/* +** dup_m8000_s32_m: +** mov z0\.s, p0/m, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, -0x8000), + z0 = svdup_s32_m (z0, p0, -0x8000)) + +/* +** dup_0_s32_m: +** mov z0\.s, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_s32_m, svint32_t, + z0 = svdup_n_s32_m (z0, p0, 0), + z0 = svdup_s32_m (z0, p0, 0)) + +/* +** dup_w0_s32_m: +** movprfx z0, z1 +** mov z0\.s, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s32_m, svint32_t, int32_t, + z0 = svdup_n_s32_m (z1, p0, x0), + z0 = svdup_s32_m (z1, p0, x0)) + +/* +** dup_1_s32_z: +** mov z0\.s, p0/z, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 1), + z0 = svdup_s32_z (p0, 1)) + +/* +** dup_127_s32_z: +** mov z0\.s, p0/z, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 127), + z0 = svdup_s32_z (p0, 127)) + +/* +** dup_128_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #128 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_128_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 128), + z0 = svdup_s32_z (p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 129), + z0 = svdup_s32_z (p0, 129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_253_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 253), + z0 = svdup_s32_z (p0, 253)) + +/* +** dup_254_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #254 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_254_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 254), + z0 = svdup_s32_z (p0, 254)) + +/* +** dup_255_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #255 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_255_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 255), + z0 = svdup_s32_z (p0, 255)) + +/* +** dup_256_s32_z: +** mov z0\.s, p0/z, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 256), + z0 = svdup_s32_z (p0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_257_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 257), + z0 = svdup_s32_z (p0, 257)) + +/* +** dup_512_s32_z: +** mov z0\.s, p0/z, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 512), + z0 = svdup_s32_z (p0, 512)) + +/* +** dup_7f00_s32_z: +** mov z0\.s, p0/z, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 0x7f00), + z0 = svdup_s32_z (p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 0x7f01), + z0 = svdup_s32_z (p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 0x7ffd), + z0 = svdup_s32_z (p0, 0x7ffd)) + +/* +** dup_7ffe_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #32766 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 0x7ffe), + z0 = svdup_s32_z (p0, 0x7ffe)) + +/* +** dup_7fff_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #32767 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 0x7fff), + z0 = svdup_s32_z (p0, 0x7fff)) + +/* +** dup_m1_s32_z: +** mov z0\.s, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -1), + z0 = svdup_s32_z (p0, -1)) + +/* +** dup_m128_s32_z: +** mov z0\.s, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -128), + z0 = svdup_s32_z (p0, -128)) + +/* +** dup_m129_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-129 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -129), + z0 = svdup_s32_z (p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -130), + z0 = svdup_s32_z (p0, -130)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m254_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -254), + z0 = svdup_s32_z (p0, -254)) + +/* +** dup_m255_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-255 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -255), + z0 = svdup_s32_z (p0, -255)) + +/* +** dup_m256_s32_z: +** mov z0\.s, p0/z, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -256), + z0 = svdup_s32_z (p0, -256)) + +/* +** dup_m257_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-257 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -257), + z0 = svdup_s32_z (p0, -257)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m258_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -258), + z0 = svdup_s32_z (p0, -258)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m259_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -259), + z0 = svdup_s32_z (p0, -259)) + +/* +** dup_m512_s32_z: +** mov z0\.s, p0/z, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -512), + z0 = svdup_s32_z (p0, -512)) + +/* +** dup_m7f00_s32_z: +** mov z0\.s, p0/z, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -0x7f00), + z0 = svdup_s32_z (p0, -0x7f00)) + +/* +** dup_m7f01_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-32513 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -0x7f01), + z0 = svdup_s32_z (p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -0x7f02), + z0 = svdup_s32_z (p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -0x7ffe), + z0 = svdup_s32_z (p0, -0x7ffe)) + +/* +** dup_m7fff_s32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-32767 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -0x7fff), + z0 = svdup_s32_z (p0, -0x7fff)) + +/* +** dup_m8000_s32_z: +** mov z0\.s, p0/z, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, -0x8000), + z0 = svdup_s32_z (p0, -0x8000)) + +/* +** dup_0_s32_z: +** mov z0\.s, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_s32_z, svint32_t, + z0 = svdup_n_s32_z (p0, 0), + z0 = svdup_s32_z (p0, 0)) + +/* +** dup_w0_s32_z: +** movprfx z0\.s, p0/z, z0\.s +** mov z0\.s, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s32_z, svint32_t, int32_t, + z0 = svdup_n_s32_z (p0, x0), + z0 = svdup_s32_z (p0, x0)) + +/* +** dup_1_s32_x: +** mov z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 1), + z0 = svdup_s32_x (p0, 1)) + +/* +** dup_127_s32_x: +** mov z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 127), + z0 = svdup_s32_x (p0, 127)) + +/* +** dup_128_s32_x: +** mov z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 128), + z0 = svdup_s32_x (p0, 128)) + +/* +** dup_129_s32_x: +** movi v([0-9]+)\.4s, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_129_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 129), + z0 = svdup_s32_x (p0, 129)) + +/* +** dup_253_s32_x: +** movi v([0-9]+)\.4s, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_253_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 253), + z0 = svdup_s32_x (p0, 253)) + +/* +** dup_254_s32_x: +** mov z0\.s, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 254), + z0 = svdup_s32_x (p0, 254)) + +/* +** dup_255_s32_x: +** mov z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 255), + z0 = svdup_s32_x (p0, 255)) + +/* +** dup_256_s32_x: +** mov z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 256), + z0 = svdup_s32_x (p0, 256)) + +/* +** dup_257_s32_x: +** mov (w[0-9]+), 257 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 257), + z0 = svdup_s32_x (p0, 257)) + +/* +** dup_512_s32_x: +** mov z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 512), + z0 = svdup_s32_x (p0, 512)) + +/* +** dup_7f00_s32_x: +** mov z0\.s, #32512 
+** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 0x7f00), + z0 = svdup_s32_x (p0, 0x7f00)) + +/* +** dup_7f01_s32_x: +** mov (w[0-9]+), 32513 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 0x7f01), + z0 = svdup_s32_x (p0, 0x7f01)) + +/* +** dup_7ffd_s32_x: +** mov (w[0-9]+), 32765 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 0x7ffd), + z0 = svdup_s32_x (p0, 0x7ffd)) + +/* +** dup_7ffe_s32_x: +** mov z0\.s, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 0x7ffe), + z0 = svdup_s32_x (p0, 0x7ffe)) + +/* +** dup_7fff_s32_x: +** mov z0\.s, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, 0x7fff), + z0 = svdup_s32_x (p0, 0x7fff)) + +/* +** dup_m1_s32_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -1), + z0 = svdup_s32_x (p0, -1)) + +/* +** dup_m128_s32_x: +** mov z0\.s, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -128), + z0 = svdup_s32_x (p0, -128)) + +/* +** dup_m129_s32_x: +** mov z0\.s, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -129), + z0 = svdup_s32_x (p0, -129)) + +/* +** dup_m130_s32_x: +** mvni v([0-9]+)\.4s, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m130_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -130), + z0 = svdup_s32_x (p0, -130)) + +/* +** dup_m254_s32_x: +** mvni v([0-9]+)\.4s, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m254_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -254), + z0 = svdup_s32_x (p0, -254)) + +/* +** dup_m255_s32_x: +** mov z0\.s, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -255), + z0 = svdup_s32_x (p0, -255)) + +/* +** dup_m256_s32_x: +** mov z0\.s, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -256), + z0 = svdup_s32_x (p0, -256)) + +/* +** dup_m257_s32_x: +** mov z0\.s, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -257), + z0 = svdup_s32_x (p0, -257)) + +/* +** dup_m258_s32_x: +** mov (w[0-9]+), -258 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -258), + z0 = svdup_s32_x (p0, -258)) + +/* +** dup_m259_s32_x: +** mov (w[0-9]+), -259 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -259), + z0 = svdup_s32_x (p0, -259)) + +/* +** dup_m512_s32_x: +** mov z0\.s, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -512), + z0 = svdup_s32_x (p0, -512)) + +/* +** dup_m7f00_s32_x: +** mov z0\.s, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -0x7f00), + z0 = svdup_s32_x (p0, -0x7f00)) + +/* +** dup_m7f01_s32_x: +** mov z0\.s, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -0x7f01), + z0 = svdup_s32_x (p0, -0x7f01)) + +/* +** dup_m7f02_s32_x: +** mov (w[0-9]+), -32514 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -0x7f02), + z0 = svdup_s32_x (p0, -0x7f02)) + +/* +** dup_m7ffe_s32_x: +** mov (w[0-9]+), -32766 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -0x7ffe), + z0 = svdup_s32_x (p0, -0x7ffe)) + +/* +** 
dup_m7fff_s32_x: +** mov z0\.s, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -0x7fff), + z0 = svdup_s32_x (p0, -0x7fff)) + +/* +** dup_m8000_s32_x: +** mov z0\.s, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s32_x, svint32_t, + z0 = svdup_n_s32_x (p0, -0x8000), + z0 = svdup_s32_x (p0, -0x8000)) + +/* +** dup_w0_s32_x: +** mov z0\.s, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s32_x, svint32_t, int32_t, + z0 = svdup_n_s32_x (p0, x0), + z0 = svdup_s32_x (p0, x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s64.c new file mode 100644 index 000000000..6259b7fb5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s64.c @@ -0,0 +1,1175 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_s64: +** mov z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s64, svint64_t, + z0 = svdup_n_s64 (1), + z0 = svdup_s64 (1)) + +/* +** dup_127_s64: +** mov z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s64, svint64_t, + z0 = svdup_n_s64 (127), + z0 = svdup_s64 (127)) + +/* +** dup_128_s64: +** mov z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s64, svint64_t, + z0 = svdup_n_s64 (128), + z0 = svdup_s64 (128)) + +/* +** dup_129_s64: +** mov (x[0-9]+), 129 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_129_s64, svint64_t, + z0 = svdup_n_s64 (129), + z0 = svdup_s64 (129)) + +/* +** dup_253_s64: +** mov (x[0-9]+), 253 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_253_s64, svint64_t, + z0 = svdup_n_s64 (253), + z0 = svdup_s64 (253)) + +/* +** dup_254_s64: +** mov z0\.d, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s64, svint64_t, + z0 = svdup_n_s64 (254), + z0 = svdup_s64 (254)) + +/* +** dup_255_s64: +** mov z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s64, svint64_t, + z0 = svdup_n_s64 (255), + z0 = svdup_s64 (255)) + +/* +** dup_256_s64: +** mov z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s64, svint64_t, + z0 = svdup_n_s64 (256), + z0 = svdup_s64 (256)) + +/* +** dup_257_s64: +** mov (x[0-9]+), 257 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_s64, svint64_t, + z0 = svdup_n_s64 (257), + z0 = svdup_s64 (257)) + +/* +** dup_512_s64: +** mov z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s64, svint64_t, + z0 = svdup_n_s64 (512), + z0 = svdup_s64 (512)) + +/* +** dup_7f00_s64: +** mov z0\.d, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s64, svint64_t, + z0 = svdup_n_s64 (0x7f00), + z0 = svdup_s64 (0x7f00)) + +/* +** dup_7f01_s64: +** mov (x[0-9]+), 32513 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_s64, svint64_t, + z0 = svdup_n_s64 (0x7f01), + z0 = svdup_s64 (0x7f01)) + +/* +** dup_7ffd_s64: +** mov (x[0-9]+), 32765 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_s64, svint64_t, + z0 = svdup_n_s64 (0x7ffd), + z0 = svdup_s64 (0x7ffd)) + +/* +** dup_7ffe_s64: +** mov z0\.d, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s64, svint64_t, + z0 = svdup_n_s64 (0x7ffe), + z0 = svdup_s64 (0x7ffe)) + +/* +** dup_7fff_s64: +** mov z0\.d, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s64, svint64_t, + z0 = svdup_n_s64 (0x7fff), + z0 = svdup_s64 (0x7fff)) + +/* +** dup_m1_s64: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s64, svint64_t, + z0 = svdup_n_s64 (-1), + z0 = svdup_s64 (-1)) + +/* +** dup_m128_s64: +** mov z0\.d, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s64, svint64_t, + z0 = svdup_n_s64 (-128), + z0 = svdup_s64 (-128)) + +/* +** dup_m129_s64: +** mov z0\.d, #-129 +** 
ret +*/ +TEST_UNIFORM_Z (dup_m129_s64, svint64_t, + z0 = svdup_n_s64 (-129), + z0 = svdup_s64 (-129)) + +/* +** dup_m130_s64: +** mov (x[0-9]+), -130 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m130_s64, svint64_t, + z0 = svdup_n_s64 (-130), + z0 = svdup_s64 (-130)) + +/* +** dup_m254_s64: +** mov (x[0-9]+), -254 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m254_s64, svint64_t, + z0 = svdup_n_s64 (-254), + z0 = svdup_s64 (-254)) + +/* +** dup_m255_s64: +** mov z0\.d, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s64, svint64_t, + z0 = svdup_n_s64 (-255), + z0 = svdup_s64 (-255)) + +/* +** dup_m256_s64: +** mov z0\.d, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s64, svint64_t, + z0 = svdup_n_s64 (-256), + z0 = svdup_s64 (-256)) + +/* +** dup_m257_s64: +** mov z0\.d, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s64, svint64_t, + z0 = svdup_n_s64 (-257), + z0 = svdup_s64 (-257)) + +/* +** dup_m258_s64: +** mov (x[0-9]+), -258 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_s64, svint64_t, + z0 = svdup_n_s64 (-258), + z0 = svdup_s64 (-258)) + +/* +** dup_m259_s64: +** mov (x[0-9]+), -259 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_s64, svint64_t, + z0 = svdup_n_s64 (-259), + z0 = svdup_s64 (-259)) + +/* +** dup_m512_s64: +** mov z0\.d, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s64, svint64_t, + z0 = svdup_n_s64 (-512), + z0 = svdup_s64 (-512)) + +/* +** dup_m7f00_s64: +** mov z0\.d, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s64, svint64_t, + z0 = svdup_n_s64 (-0x7f00), + z0 = svdup_s64 (-0x7f00)) + +/* +** dup_m7f01_s64: +** mov z0\.d, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s64, svint64_t, + z0 = svdup_n_s64 (-0x7f01), + z0 = svdup_s64 (-0x7f01)) + +/* +** dup_m7f02_s64: +** mov (x[0-9]+), -32514 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_s64, svint64_t, + z0 = svdup_n_s64 (-0x7f02), + z0 = svdup_s64 (-0x7f02)) + +/* +** dup_m7ffe_s64: +** mov (x[0-9]+), -32766 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_s64, svint64_t, + z0 = svdup_n_s64 (-0x7ffe), + z0 = svdup_s64 (-0x7ffe)) + +/* +** dup_m7fff_s64: +** mov z0\.d, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s64, svint64_t, + z0 = svdup_n_s64 (-0x7fff), + z0 = svdup_s64 (-0x7fff)) + +/* +** dup_m8000_s64: +** mov z0\.d, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s64, svint64_t, + z0 = svdup_n_s64 (-0x8000), + z0 = svdup_s64 (-0x8000)) + +/* +** dup_x0_s64: +** mov z0\.d, x0 +** ret +*/ +TEST_UNIFORM_ZX (dup_x0_s64, svint64_t, int64_t, + z0 = svdup_n_s64 (x0), + z0 = svdup_s64 (x0)) + +/* +** dup_1_s64_m: +** mov z0\.d, p0/m, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 1), + z0 = svdup_s64_m (z0, p0, 1)) + +/* +** dup_127_s64_m: +** mov z0\.d, p0/m, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 127), + z0 = svdup_s64_m (z0, p0, 127)) + +/* +** dup_128_s64_m: +** mov (z[0-9]+\.d), #128 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_128_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 128), + z0 = svdup_s64_m (z0, p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 129), + z0 = svdup_s64_m (z0, p0, 129)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_253_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 253), + z0 = svdup_s64_m (z0, p0, 253)) + +/* +** dup_254_s64_m: +** mov (z[0-9]+\.d), #254 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_254_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 254), + z0 = svdup_s64_m (z0, p0, 254)) + +/* +** dup_255_s64_m: +** mov (z[0-9]+\.d), #255 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_255_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 255), + z0 = svdup_s64_m (z0, p0, 255)) + +/* +** dup_256_s64_m: +** mov z0\.d, p0/m, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 256), + z0 = svdup_s64_m (z0, p0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_257_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 257), + z0 = svdup_s64_m (z0, p0, 257)) + +/* +** dup_512_s64_m: +** mov z0\.d, p0/m, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 512), + z0 = svdup_s64_m (z0, p0, 512)) + +/* +** dup_7f00_s64_m: +** mov z0\.d, p0/m, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 0x7f00), + z0 = svdup_s64_m (z0, p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 0x7f01), + z0 = svdup_s64_m (z0, p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 0x7ffd), + z0 = svdup_s64_m (z0, p0, 0x7ffd)) + +/* +** dup_7ffe_s64_m: +** mov (z[0-9]+\.d), #32766 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 0x7ffe), + z0 = svdup_s64_m (z0, p0, 0x7ffe)) + +/* +** dup_7fff_s64_m: +** mov (z[0-9]+\.d), #32767 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 0x7fff), + z0 = svdup_s64_m (z0, p0, 0x7fff)) + +/* +** dup_m1_s64_m: +** mov z0\.d, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -1), + z0 = svdup_s64_m (z0, p0, -1)) + +/* +** dup_m128_s64_m: +** mov z0\.d, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -128), + z0 = svdup_s64_m (z0, p0, -128)) + +/* +** dup_m129_s64_m: +** mov (z[0-9]+\.d), #-129 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -129), + z0 = svdup_s64_m (z0, p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -130), + z0 = svdup_s64_m (z0, p0, -130)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m254_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -254), + z0 = svdup_s64_m (z0, p0, -254)) + +/* +** dup_m255_s64_m: +** mov (z[0-9]+\.d), #-255 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -255), + z0 = svdup_s64_m (z0, p0, -255)) + +/* +** dup_m256_s64_m: +** mov z0\.d, p0/m, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -256), + z0 = svdup_s64_m (z0, p0, -256)) + +/* +** dup_m257_s64_m: +** mov (z[0-9]+\.d), #-257 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -257), + z0 = svdup_s64_m (z0, p0, -257)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m258_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -258), + z0 = svdup_s64_m (z0, p0, -258)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m259_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -259), + z0 = svdup_s64_m (z0, p0, -259)) + +/* +** dup_m512_s64_m: +** mov z0\.d, p0/m, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -512), + z0 = svdup_s64_m (z0, p0, -512)) + +/* +** dup_m7f00_s64_m: +** mov z0\.d, p0/m, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -0x7f00), + z0 = svdup_s64_m (z0, p0, -0x7f00)) + +/* +** dup_m7f01_s64_m: +** mov (z[0-9]+\.d), #-32513 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -0x7f01), + z0 = svdup_s64_m (z0, p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -0x7f02), + z0 = svdup_s64_m (z0, p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -0x7ffe), + z0 = svdup_s64_m (z0, p0, -0x7ffe)) + +/* +** dup_m7fff_s64_m: +** mov (z[0-9]+\.d), #-32767 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -0x7fff), + z0 = svdup_s64_m (z0, p0, -0x7fff)) + +/* +** dup_m8000_s64_m: +** mov z0\.d, p0/m, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, -0x8000), + z0 = svdup_s64_m (z0, p0, -0x8000)) + +/* +** dup_0_s64_m: +** mov z0\.d, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_s64_m, svint64_t, + z0 = svdup_n_s64_m (z0, p0, 0), + z0 = svdup_s64_m (z0, p0, 0)) + +/* +** dup_x0_s64_m: +** movprfx z0, z1 +** mov z0\.d, p0/m, x0 +** ret +*/ +TEST_UNIFORM_ZX (dup_x0_s64_m, svint64_t, int64_t, + z0 = svdup_n_s64_m (z1, p0, x0), + z0 = svdup_s64_m (z1, p0, x0)) + +/* +** dup_1_s64_z: +** mov z0\.d, p0/z, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 1), + z0 = svdup_s64_z (p0, 1)) + +/* +** dup_127_s64_z: +** mov z0\.d, p0/z, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 127), + z0 = svdup_s64_z (p0, 127)) + +/* +** dup_128_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #128 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_128_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 128), + z0 = svdup_s64_z (p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 129), + z0 = svdup_s64_z (p0, 129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_253_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 253), + z0 = svdup_s64_z (p0, 253)) + +/* +** dup_254_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #254 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_254_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 254), + z0 = svdup_s64_z (p0, 254)) + +/* +** dup_255_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #255 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_255_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 255), + z0 = svdup_s64_z (p0, 255)) + +/* +** dup_256_s64_z: +** mov z0\.d, p0/z, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 256), + z0 = svdup_s64_z (p0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_257_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 257), + z0 = svdup_s64_z (p0, 257)) + +/* +** dup_512_s64_z: +** mov z0\.d, p0/z, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 512), + z0 = svdup_s64_z (p0, 512)) + +/* +** dup_7f00_s64_z: +** mov z0\.d, p0/z, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 0x7f00), + z0 = svdup_s64_z (p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 0x7f01), + z0 = svdup_s64_z (p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 0x7ffd), + z0 = svdup_s64_z (p0, 0x7ffd)) + +/* +** dup_7ffe_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #32766 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 0x7ffe), + z0 = svdup_s64_z (p0, 0x7ffe)) + +/* +** dup_7fff_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #32767 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 0x7fff), + z0 = svdup_s64_z (p0, 0x7fff)) + +/* +** dup_m1_s64_z: +** mov z0\.d, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -1), + z0 = svdup_s64_z (p0, -1)) + +/* +** dup_m128_s64_z: +** mov z0\.d, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -128), + z0 = svdup_s64_z (p0, -128)) + +/* +** dup_m129_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-129 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -129), + z0 = svdup_s64_z (p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -130), + z0 = svdup_s64_z (p0, -130)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m254_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -254), + z0 = svdup_s64_z (p0, -254)) + +/* +** dup_m255_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-255 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -255), + z0 = svdup_s64_z (p0, -255)) + +/* +** dup_m256_s64_z: +** mov z0\.d, p0/z, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -256), + z0 = svdup_s64_z (p0, -256)) + +/* +** dup_m257_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-257 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -257), + z0 = svdup_s64_z (p0, -257)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m258_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -258), + z0 = svdup_s64_z (p0, -258)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m259_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -259), + z0 = svdup_s64_z (p0, -259)) + +/* +** dup_m512_s64_z: +** mov z0\.d, p0/z, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -512), + z0 = svdup_s64_z (p0, -512)) + +/* +** dup_m7f00_s64_z: +** mov z0\.d, p0/z, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -0x7f00), + z0 = svdup_s64_z (p0, -0x7f00)) + +/* +** dup_m7f01_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-32513 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -0x7f01), + z0 = svdup_s64_z (p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -0x7f02), + z0 = svdup_s64_z (p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -0x7ffe), + z0 = svdup_s64_z (p0, -0x7ffe)) + +/* +** dup_m7fff_s64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-32767 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -0x7fff), + z0 = svdup_s64_z (p0, -0x7fff)) + +/* +** dup_m8000_s64_z: +** mov z0\.d, p0/z, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, -0x8000), + z0 = svdup_s64_z (p0, -0x8000)) + +/* +** dup_0_s64_z: +** mov z0\.d, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_s64_z, svint64_t, + z0 = svdup_n_s64_z (p0, 0), + z0 = svdup_s64_z (p0, 0)) + +/* +** dup_x0_s64_z: +** movprfx z0\.d, p0/z, z0\.d +** mov z0\.d, p0/m, x0 +** ret +*/ +TEST_UNIFORM_ZX (dup_x0_s64_z, svint64_t, int64_t, + z0 = svdup_n_s64_z (p0, x0), + z0 = svdup_s64_z (p0, x0)) + +/* +** dup_1_s64_x: +** mov z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 1), + z0 = svdup_s64_x (p0, 1)) + +/* +** dup_127_s64_x: +** mov z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 127), + z0 = svdup_s64_x (p0, 127)) + +/* +** dup_128_s64_x: +** mov z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 128), + z0 = svdup_s64_x (p0, 128)) + +/* +** dup_129_s64_x: +** mov (x[0-9]+), 129 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_129_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 129), + z0 = svdup_s64_x (p0, 129)) + +/* +** dup_253_s64_x: +** mov (x[0-9]+), 253 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_253_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 253), + z0 = svdup_s64_x (p0, 253)) + +/* +** dup_254_s64_x: +** mov z0\.d, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 254), + z0 = svdup_s64_x (p0, 254)) + +/* +** dup_255_s64_x: +** mov z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 255), + z0 = svdup_s64_x (p0, 255)) + +/* +** dup_256_s64_x: +** mov z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 256), + z0 = svdup_s64_x (p0, 256)) + +/* +** dup_257_s64_x: +** mov (x[0-9]+), 257 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 257), + z0 = svdup_s64_x (p0, 257)) + +/* +** dup_512_s64_x: +** mov z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 512), + z0 = svdup_s64_x (p0, 512)) + +/* +** dup_7f00_s64_x: +** mov z0\.d, #32512 +** ret +*/ +TEST_UNIFORM_Z 
(dup_7f00_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 0x7f00), + z0 = svdup_s64_x (p0, 0x7f00)) + +/* +** dup_7f01_s64_x: +** mov (x[0-9]+), 32513 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 0x7f01), + z0 = svdup_s64_x (p0, 0x7f01)) + +/* +** dup_7ffd_s64_x: +** mov (x[0-9]+), 32765 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 0x7ffd), + z0 = svdup_s64_x (p0, 0x7ffd)) + +/* +** dup_7ffe_s64_x: +** mov z0\.d, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 0x7ffe), + z0 = svdup_s64_x (p0, 0x7ffe)) + +/* +** dup_7fff_s64_x: +** mov z0\.d, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, 0x7fff), + z0 = svdup_s64_x (p0, 0x7fff)) + +/* +** dup_m1_s64_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -1), + z0 = svdup_s64_x (p0, -1)) + +/* +** dup_m128_s64_x: +** mov z0\.d, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -128), + z0 = svdup_s64_x (p0, -128)) + +/* +** dup_m129_s64_x: +** mov z0\.d, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -129), + z0 = svdup_s64_x (p0, -129)) + +/* +** dup_m130_s64_x: +** mov (x[0-9]+), -130 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m130_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -130), + z0 = svdup_s64_x (p0, -130)) + +/* +** dup_m254_s64_x: +** mov (x[0-9]+), -254 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m254_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -254), + z0 = svdup_s64_x (p0, -254)) + +/* +** dup_m255_s64_x: +** mov z0\.d, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -255), + z0 = svdup_s64_x (p0, -255)) + +/* +** dup_m256_s64_x: +** mov z0\.d, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -256), + z0 = svdup_s64_x (p0, -256)) + +/* +** dup_m257_s64_x: +** mov z0\.d, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -257), + z0 = svdup_s64_x (p0, -257)) + +/* +** dup_m258_s64_x: +** mov (x[0-9]+), -258 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -258), + z0 = svdup_s64_x (p0, -258)) + +/* +** dup_m259_s64_x: +** mov (x[0-9]+), -259 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -259), + z0 = svdup_s64_x (p0, -259)) + +/* +** dup_m512_s64_x: +** mov z0\.d, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -512), + z0 = svdup_s64_x (p0, -512)) + +/* +** dup_m7f00_s64_x: +** mov z0\.d, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -0x7f00), + z0 = svdup_s64_x (p0, -0x7f00)) + +/* +** dup_m7f01_s64_x: +** mov z0\.d, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -0x7f01), + z0 = svdup_s64_x (p0, -0x7f01)) + +/* +** dup_m7f02_s64_x: +** mov (x[0-9]+), -32514 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -0x7f02), + z0 = svdup_s64_x (p0, -0x7f02)) + +/* +** dup_m7ffe_s64_x: +** mov (x[0-9]+), -32766 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -0x7ffe), + z0 = svdup_s64_x (p0, -0x7ffe)) + +/* +** dup_m7fff_s64_x: +** mov z0\.d, #-32767 +** ret +*/ 
+TEST_UNIFORM_Z (dup_m7fff_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -0x7fff), + z0 = svdup_s64_x (p0, -0x7fff)) + +/* +** dup_m8000_s64_x: +** mov z0\.d, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_s64_x, svint64_t, + z0 = svdup_n_s64_x (p0, -0x8000), + z0 = svdup_s64_x (p0, -0x8000)) + +/* +** dup_x0_s64_x: +** mov z0\.d, x0 +** ret +*/ +TEST_UNIFORM_ZX (dup_x0_s64_x, svint64_t, int64_t, + z0 = svdup_n_s64_x (p0, x0), + z0 = svdup_s64_x (p0, x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s8.c new file mode 100644 index 000000000..96fc5fa64 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_s8.c @@ -0,0 +1,383 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_s8: +** mov z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s8, svint8_t, + z0 = svdup_n_s8 (1), + z0 = svdup_s8 (1)) + +/* +** dup_127_s8: +** mov z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s8, svint8_t, + z0 = svdup_n_s8 (127), + z0 = svdup_s8 (127)) + +/* +** dup_128_s8: +** mov z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s8, svint8_t, + z0 = svdup_n_s8 (128), + z0 = svdup_s8 (128)) + +/* +** dup_129_s8: +** mov z0\.b, #-127 +** ret +*/ +TEST_UNIFORM_Z (dup_129_s8, svint8_t, + z0 = svdup_n_s8 (129), + z0 = svdup_s8 (129)) + +/* +** dup_253_s8: +** mov z0\.b, #-3 +** ret +*/ +TEST_UNIFORM_Z (dup_253_s8, svint8_t, + z0 = svdup_n_s8 (253), + z0 = svdup_s8 (253)) + +/* +** dup_254_s8: +** mov z0\.b, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s8, svint8_t, + z0 = svdup_n_s8 (254), + z0 = svdup_s8 (254)) + +/* +** dup_255_s8: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s8, svint8_t, + z0 = svdup_n_s8 (255), + z0 = svdup_s8 (255)) + +/* +** dup_m1_s8: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s8, svint8_t, + z0 = svdup_n_s8 (-1), + z0 = svdup_s8 (-1)) + +/* +** dup_m128_s8: +** mov z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s8, svint8_t, + z0 = svdup_n_s8 (-128), + z0 = svdup_s8 (-128)) + +/* +** dup_w0_s8: +** mov z0\.b, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s8, svint8_t, int8_t, + z0 = svdup_n_s8 (x0), + z0 = svdup_s8 (x0)) + +/* +** dup_1_s8_m: +** mov z0\.b, p0/m, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, 1), + z0 = svdup_s8_m (z0, p0, 1)) + +/* +** dup_127_s8_m: +** mov z0\.b, p0/m, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, 127), + z0 = svdup_s8_m (z0, p0, 127)) + +/* +** dup_128_s8_m: +** mov z0\.b, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, 128), + z0 = svdup_s8_m (z0, p0, 128)) + +/* +** dup_129_s8_m: +** mov z0\.b, p0/m, #-127 +** ret +*/ +TEST_UNIFORM_Z (dup_129_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, 129), + z0 = svdup_s8_m (z0, p0, 129)) + +/* +** dup_253_s8_m: +** mov z0\.b, p0/m, #-3 +** ret +*/ +TEST_UNIFORM_Z (dup_253_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, 253), + z0 = svdup_s8_m (z0, p0, 253)) + +/* +** dup_254_s8_m: +** mov z0\.b, p0/m, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, 254), + z0 = svdup_s8_m (z0, p0, 254)) + +/* +** dup_255_s8_m: +** mov z0\.b, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, 255), + z0 = svdup_s8_m (z0, p0, 255)) + +/* +** dup_m1_s8_m: +** mov z0\.b, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, -1), + z0 = 
svdup_s8_m (z0, p0, -1)) + +/* +** dup_m128_s8_m: +** mov z0\.b, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, -128), + z0 = svdup_s8_m (z0, p0, -128)) + +/* +** dup_0_s8_m: +** mov z0\.b, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_s8_m, svint8_t, + z0 = svdup_n_s8_m (z0, p0, 0), + z0 = svdup_s8_m (z0, p0, 0)) + +/* +** dup_w0_s8_m: +** movprfx z0, z1 +** mov z0\.b, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s8_m, svint8_t, int8_t, + z0 = svdup_n_s8_m (z1, p0, x0), + z0 = svdup_s8_m (z1, p0, x0)) + +/* +** dup_1_s8_z: +** mov z0\.b, p0/z, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, 1), + z0 = svdup_s8_z (p0, 1)) + +/* +** dup_127_s8_z: +** mov z0\.b, p0/z, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, 127), + z0 = svdup_s8_z (p0, 127)) + +/* +** dup_128_s8_z: +** mov z0\.b, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, 128), + z0 = svdup_s8_z (p0, 128)) + +/* +** dup_129_s8_z: +** mov z0\.b, p0/z, #-127 +** ret +*/ +TEST_UNIFORM_Z (dup_129_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, 129), + z0 = svdup_s8_z (p0, 129)) + +/* +** dup_253_s8_z: +** mov z0\.b, p0/z, #-3 +** ret +*/ +TEST_UNIFORM_Z (dup_253_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, 253), + z0 = svdup_s8_z (p0, 253)) + +/* +** dup_254_s8_z: +** mov z0\.b, p0/z, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, 254), + z0 = svdup_s8_z (p0, 254)) + +/* +** dup_255_s8_z: +** mov z0\.b, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, 255), + z0 = svdup_s8_z (p0, 255)) + +/* +** dup_m1_s8_z: +** mov z0\.b, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, -1), + z0 = svdup_s8_z (p0, -1)) + +/* +** dup_m128_s8_z: +** mov z0\.b, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, -128), + z0 = svdup_s8_z (p0, -128)) + +/* +** dup_0_s8_z: +** mov z0\.b, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_s8_z, svint8_t, + z0 = svdup_n_s8_z (p0, 0), + z0 = svdup_s8_z (p0, 0)) + +/* +** dup_w0_s8_z: +** movprfx z0\.b, p0/z, z0\.b +** mov z0\.b, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s8_z, svint8_t, int8_t, + z0 = svdup_n_s8_z (p0, x0), + z0 = svdup_s8_z (p0, x0)) + +/* +** dup_1_s8_x: +** mov z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_s8_x, svint8_t, + z0 = svdup_n_s8_x (p0, 1), + z0 = svdup_s8_x (p0, 1)) + +/* +** dup_127_s8_x: +** mov z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_s8_x, svint8_t, + z0 = svdup_n_s8_x (p0, 127), + z0 = svdup_s8_x (p0, 127)) + +/* +** dup_128_s8_x: +** mov z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_s8_x, svint8_t, + z0 = svdup_n_s8_x (p0, 128), + z0 = svdup_s8_x (p0, 128)) + +/* +** dup_129_s8_x: +** mov z0\.b, #-127 +** ret +*/ +TEST_UNIFORM_Z (dup_129_s8_x, svint8_t, + z0 = svdup_n_s8_x (p0, 129), + z0 = svdup_s8_x (p0, 129)) + +/* +** dup_253_s8_x: +** mov z0\.b, #-3 +** ret +*/ +TEST_UNIFORM_Z (dup_253_s8_x, svint8_t, + z0 = svdup_n_s8_x (p0, 253), + z0 = svdup_s8_x (p0, 253)) + +/* +** dup_254_s8_x: +** mov z0\.b, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_254_s8_x, svint8_t, + z0 = svdup_n_s8_x (p0, 254), + z0 = svdup_s8_x (p0, 254)) + +/* +** dup_255_s8_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_255_s8_x, svint8_t, + z0 = svdup_n_s8_x (p0, 255), + z0 = svdup_s8_x (p0, 255)) + +/* +** dup_m1_s8_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_s8_x, svint8_t, + z0 = 
svdup_n_s8_x (p0, -1), + z0 = svdup_s8_x (p0, -1)) + +/* +** dup_m128_s8_x: +** mov z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_s8_x, svint8_t, + z0 = svdup_n_s8_x (p0, -128), + z0 = svdup_s8_x (p0, -128)) + +/* +** dup_w0_s8_x: +** mov z0\.b, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_s8_x, svint8_t, int8_t, + z0 = svdup_n_s8_x (p0, x0), + z0 = svdup_s8_x (p0, x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u16.c new file mode 100644 index 000000000..263eafef0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u16.c @@ -0,0 +1,1193 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_u16: +** mov z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u16, svuint16_t, + z0 = svdup_n_u16 (1), + z0 = svdup_u16 (1)) + +/* +** dup_127_u16: +** mov z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u16, svuint16_t, + z0 = svdup_n_u16 (127), + z0 = svdup_u16 (127)) + +/* +** dup_128_u16: +** mov z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u16, svuint16_t, + z0 = svdup_n_u16 (128), + z0 = svdup_u16 (128)) + +/* +** dup_129_u16: +** movi v([0-9]+)\.8h, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_129_u16, svuint16_t, + z0 = svdup_n_u16 (129), + z0 = svdup_u16 (129)) + +/* +** dup_253_u16: +** movi v([0-9]+)\.8h, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_253_u16, svuint16_t, + z0 = svdup_n_u16 (253), + z0 = svdup_u16 (253)) + +/* +** dup_254_u16: +** mov z0\.h, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u16, svuint16_t, + z0 = svdup_n_u16 (254), + z0 = svdup_u16 (254)) + +/* +** dup_255_u16: +** mov z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u16, svuint16_t, + z0 = svdup_n_u16 (255), + z0 = svdup_u16 (255)) + +/* +** dup_256_u16: +** mov z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u16, svuint16_t, + z0 = svdup_n_u16 (256), + z0 = svdup_u16 (256)) + +/* +** dup_257_u16: +** mov z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_u16, svuint16_t, + z0 = svdup_n_u16 (257), + z0 = svdup_u16 (257)) + +/* +** dup_512_u16: +** mov z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u16, svuint16_t, + z0 = svdup_n_u16 (512), + z0 = svdup_u16 (512)) + +/* +** dup_7f00_u16: +** mov z0\.h, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u16, svuint16_t, + z0 = svdup_n_u16 (0x7f00), + z0 = svdup_u16 (0x7f00)) + +/* +** dup_7f01_u16: +** mov (w[0-9]+), 32513 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_u16, svuint16_t, + z0 = svdup_n_u16 (0x7f01), + z0 = svdup_u16 (0x7f01)) + +/* +** dup_7ffd_u16: +** mov (w[0-9]+), 32765 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_u16, svuint16_t, + z0 = svdup_n_u16 (0x7ffd), + z0 = svdup_u16 (0x7ffd)) + +/* +** dup_7ffe_u16: +** mov z0\.h, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u16, svuint16_t, + z0 = svdup_n_u16 (0x7ffe), + z0 = svdup_u16 (0x7ffe)) + +/* +** dup_7fff_u16: +** mov z0\.h, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u16, svuint16_t, + z0 = svdup_n_u16 (0x7fff), + z0 = svdup_u16 (0x7fff)) + +/* +** dup_m1_u16: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u16, svuint16_t, + z0 = svdup_n_u16 (-1), + z0 = svdup_u16 (-1)) + +/* +** dup_m128_u16: +** mov z0\.h, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u16, svuint16_t, + z0 = svdup_n_u16 (-128), + z0 = svdup_u16 (-128)) + +/* +** dup_m129_u16: +** mov z0\.h, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u16, svuint16_t, + z0 = svdup_n_u16 (-129), + z0 = svdup_u16 (-129)) + +/* +** 
dup_m130_u16: +** mvni v([0-9]+)\.8h, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m130_u16, svuint16_t, + z0 = svdup_n_u16 (-130), + z0 = svdup_u16 (-130)) + +/* +** dup_m254_u16: +** mvni v([0-9]+)\.8h, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m254_u16, svuint16_t, + z0 = svdup_n_u16 (-254), + z0 = svdup_u16 (-254)) + +/* +** dup_m255_u16: +** mov z0\.h, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u16, svuint16_t, + z0 = svdup_n_u16 (-255), + z0 = svdup_u16 (-255)) + +/* +** dup_m256_u16: +** mov z0\.h, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u16, svuint16_t, + z0 = svdup_n_u16 (-256), + z0 = svdup_u16 (-256)) + +/* +** dup_m257_u16: +** mov z0\.h, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u16, svuint16_t, + z0 = svdup_n_u16 (-257), + z0 = svdup_u16 (-257)) + +/* +** dup_m258_u16: +** mov z0\.b, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_u16, svuint16_t, + z0 = svdup_n_u16 (-258), + z0 = svdup_u16 (-258)) + +/* +** dup_m259_u16: +** mov (w[0-9]+), -259 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_u16, svuint16_t, + z0 = svdup_n_u16 (-259), + z0 = svdup_u16 (-259)) + +/* +** dup_m512_u16: +** mov z0\.h, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u16, svuint16_t, + z0 = svdup_n_u16 (-512), + z0 = svdup_u16 (-512)) + +/* +** dup_m7f00_u16: +** mov z0\.h, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u16, svuint16_t, + z0 = svdup_n_u16 (-0x7f00), + z0 = svdup_u16 (-0x7f00)) + +/* +** dup_m7f01_u16: +** mov z0\.h, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u16, svuint16_t, + z0 = svdup_n_u16 (-0x7f01), + z0 = svdup_u16 (-0x7f01)) + +/* +** dup_m7f02_u16: +** mov (w[0-9]+), -32514 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_u16, svuint16_t, + z0 = svdup_n_u16 (-0x7f02), + z0 = svdup_u16 (-0x7f02)) + +/* +** dup_m7ffe_u16: +** mov (w[0-9]+), -32766 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_u16, svuint16_t, + z0 = svdup_n_u16 (-0x7ffe), + z0 = svdup_u16 (-0x7ffe)) + +/* +** dup_m7fff_u16: +** mov z0\.h, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u16, svuint16_t, + z0 = svdup_n_u16 (-0x7fff), + z0 = svdup_u16 (-0x7fff)) + +/* +** dup_m8000_u16: +** mov z0\.h, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u16, svuint16_t, + z0 = svdup_n_u16 (-0x8000), + z0 = svdup_u16 (-0x8000)) + +/* +** dup_w0_u16: +** mov z0\.h, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u16, svuint16_t, uint16_t, + z0 = svdup_n_u16 (x0), + z0 = svdup_u16 (x0)) + +/* +** dup_1_u16_m: +** mov z0\.h, p0/m, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 1), + z0 = svdup_u16_m (z0, p0, 1)) + +/* +** dup_127_u16_m: +** mov z0\.h, p0/m, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 127), + z0 = svdup_u16_m (z0, p0, 127)) + +/* +** dup_128_u16_m: +** mov (z[0-9]+\.h), #128 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_128_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 128), + z0 = svdup_u16_m (z0, p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 129), + z0 = svdup_u16_m (z0, p0, 129)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_253_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 253), + z0 = svdup_u16_m (z0, p0, 253)) + +/* +** dup_254_u16_m: +** mov (z[0-9]+\.h), #254 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_254_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 254), + z0 = svdup_u16_m (z0, p0, 254)) + +/* +** dup_255_u16_m: +** mov (z[0-9]+\.h), #255 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_255_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 255), + z0 = svdup_u16_m (z0, p0, 255)) + +/* +** dup_256_u16_m: +** mov z0\.h, p0/m, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 256), + z0 = svdup_u16_m (z0, p0, 256)) + +/* +** dup_257_u16_m: +** mov (z[0-9]+)\.b, #1 +** sel z0\.h, p0, \1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_257_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 257), + z0 = svdup_u16_m (z0, p0, 257)) + +/* +** dup_512_u16_m: +** mov z0\.h, p0/m, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 512), + z0 = svdup_u16_m (z0, p0, 512)) + +/* +** dup_7f00_u16_m: +** mov z0\.h, p0/m, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 0x7f00), + z0 = svdup_u16_m (z0, p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 0x7f01), + z0 = svdup_u16_m (z0, p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 0x7ffd), + z0 = svdup_u16_m (z0, p0, 0x7ffd)) + +/* +** dup_7ffe_u16_m: +** mov (z[0-9]+\.h), #32766 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 0x7ffe), + z0 = svdup_u16_m (z0, p0, 0x7ffe)) + +/* +** dup_7fff_u16_m: +** mov (z[0-9]+\.h), #32767 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 0x7fff), + z0 = svdup_u16_m (z0, p0, 0x7fff)) + +/* +** dup_m1_u16_m: +** mov z0\.h, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -1), + z0 = svdup_u16_m (z0, p0, -1)) + +/* +** dup_m128_u16_m: +** mov z0\.h, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -128), + z0 = svdup_u16_m (z0, p0, -128)) + +/* +** dup_m129_u16_m: +** mov (z[0-9]+\.h), #-129 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -129), + z0 = svdup_u16_m (z0, p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -130), + z0 = svdup_u16_m (z0, p0, -130)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m254_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -254), + z0 = svdup_u16_m (z0, p0, -254)) + +/* +** dup_m255_u16_m: +** mov (z[0-9]+\.h), #-255 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -255), + z0 = svdup_u16_m (z0, p0, -255)) + +/* +** dup_m256_u16_m: +** mov z0\.h, p0/m, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -256), + z0 = svdup_u16_m (z0, p0, -256)) + +/* +** dup_m257_u16_m: +** mov (z[0-9]+\.h), #-257 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -257), + z0 = svdup_u16_m (z0, p0, -257)) + +/* +** dup_m258_u16_m: +** mov (z[0-9]+)\.b, #-2 +** sel z0\.h, p0, \1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m258_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -258), + z0 = svdup_u16_m (z0, p0, -258)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m259_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -259), + z0 = svdup_u16_m (z0, p0, -259)) + +/* +** dup_m512_u16_m: +** mov z0\.h, p0/m, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -512), + z0 = svdup_u16_m (z0, p0, -512)) + +/* +** dup_m7f00_u16_m: +** mov z0\.h, p0/m, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -0x7f00), + z0 = svdup_u16_m (z0, p0, -0x7f00)) + +/* +** dup_m7f01_u16_m: +** mov (z[0-9]+\.h), #-32513 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -0x7f01), + z0 = svdup_u16_m (z0, p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -0x7f02), + z0 = svdup_u16_m (z0, p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -0x7ffe), + z0 = svdup_u16_m (z0, p0, -0x7ffe)) + +/* +** dup_m7fff_u16_m: +** mov (z[0-9]+\.h), #-32767 +** sel z0\.h, p0, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -0x7fff), + z0 = svdup_u16_m (z0, p0, -0x7fff)) + +/* +** dup_m8000_u16_m: +** mov z0\.h, p0/m, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, -0x8000), + z0 = svdup_u16_m (z0, p0, -0x8000)) + +/* +** dup_0_u16_m: +** mov z0\.h, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_u16_m, svuint16_t, + z0 = svdup_n_u16_m (z0, p0, 0), + z0 = svdup_u16_m (z0, p0, 0)) + +/* +** dup_w0_u16_m: +** movprfx z0, z1 +** mov z0\.h, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u16_m, svuint16_t, uint16_t, + z0 = svdup_n_u16_m (z1, p0, x0), + z0 = svdup_u16_m (z1, p0, x0)) + +/* +** dup_1_u16_z: +** mov z0\.h, p0/z, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 1), + z0 = svdup_u16_z (p0, 1)) + +/* +** dup_127_u16_z: +** mov z0\.h, p0/z, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 127), + z0 = svdup_u16_z (p0, 127)) + +/* +** dup_128_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #128 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_128_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 128), + z0 = svdup_u16_z (p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 129), + z0 = svdup_u16_z (p0, 129)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_253_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 253), + z0 = svdup_u16_z (p0, 253)) + +/* +** dup_254_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #254 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_254_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 254), + z0 = svdup_u16_z (p0, 254)) + +/* +** dup_255_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #255 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_255_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 255), + z0 = svdup_u16_z (p0, 255)) + +/* +** dup_256_u16_z: +** mov z0\.h, p0/z, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 256), + z0 = svdup_u16_z (p0, 256)) + +/* +** dup_257_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+)\.b, #1 +** sel z0\.h, p0, \2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_257_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 257), + z0 = svdup_u16_z (p0, 257)) + +/* +** dup_512_u16_z: +** mov z0\.h, p0/z, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 512), + z0 = svdup_u16_z (p0, 512)) + +/* +** dup_7f00_u16_z: +** mov z0\.h, p0/z, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 0x7f00), + z0 = svdup_u16_z (p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 0x7f01), + z0 = svdup_u16_z (p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 0x7ffd), + z0 = svdup_u16_z (p0, 0x7ffd)) + +/* +** dup_7ffe_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #32766 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 0x7ffe), + z0 = svdup_u16_z (p0, 0x7ffe)) + +/* +** dup_7fff_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #32767 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 0x7fff), + z0 = svdup_u16_z (p0, 0x7fff)) + +/* +** dup_m1_u16_z: +** mov z0\.h, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -1), + z0 = svdup_u16_z (p0, -1)) + +/* +** dup_m128_u16_z: +** mov z0\.h, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -128), + z0 = svdup_u16_z (p0, -128)) + +/* +** dup_m129_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-129 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -129), + z0 = svdup_u16_z (p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -130), + z0 = svdup_u16_z (p0, -130)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m254_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -254), + z0 = svdup_u16_z (p0, -254)) + +/* +** dup_m255_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-255 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -255), + z0 = svdup_u16_z (p0, -255)) + +/* +** dup_m256_u16_z: +** mov z0\.h, p0/z, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -256), + z0 = svdup_u16_z (p0, -256)) + +/* +** dup_m257_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-257 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -257), + z0 = svdup_u16_z (p0, -257)) + +/* +** dup_m258_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+)\.b, #-2 +** sel z0\.h, p0, \2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m258_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -258), + z0 = svdup_u16_z (p0, -258)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m259_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -259), + z0 = svdup_u16_z (p0, -259)) + +/* +** dup_m512_u16_z: +** mov z0\.h, p0/z, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -512), + z0 = svdup_u16_z (p0, -512)) + +/* +** dup_m7f00_u16_z: +** mov z0\.h, p0/z, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -0x7f00), + z0 = svdup_u16_z (p0, -0x7f00)) + +/* +** dup_m7f01_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-32513 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -0x7f01), + z0 = svdup_u16_z (p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -0x7f02), + z0 = svdup_u16_z (p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m7ffe_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -0x7ffe), + z0 = svdup_u16_z (p0, -0x7ffe)) + +/* +** dup_m7fff_u16_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.h), #-32767 +** sel z0\.h, p0, \2, \1\.h +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -0x7fff), + z0 = svdup_u16_z (p0, -0x7fff)) + +/* +** dup_m8000_u16_z: +** mov z0\.h, p0/z, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, -0x8000), + z0 = svdup_u16_z (p0, -0x8000)) + +/* +** dup_0_u16_z: +** mov z0\.h, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_u16_z, svuint16_t, + z0 = svdup_n_u16_z (p0, 0), + z0 = svdup_u16_z (p0, 0)) + +/* +** dup_w0_u16_z: +** movprfx z0\.h, p0/z, z0\.h +** mov z0\.h, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u16_z, svuint16_t, uint16_t, + z0 = svdup_n_u16_z (p0, x0), + z0 = svdup_u16_z (p0, x0)) + +/* +** dup_1_u16_x: +** mov z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 1), + z0 = svdup_u16_x (p0, 1)) + +/* +** dup_127_u16_x: +** mov z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 127), + z0 = svdup_u16_x (p0, 127)) + +/* +** dup_128_u16_x: +** mov z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 128), + z0 = svdup_u16_x (p0, 128)) + +/* +** dup_129_u16_x: +** movi v([0-9]+)\.8h, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_129_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 129), + z0 = svdup_u16_x (p0, 129)) + +/* +** dup_253_u16_x: +** movi v([0-9]+)\.8h, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_253_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 253), + z0 = svdup_u16_x (p0, 253)) + +/* +** dup_254_u16_x: +** mov z0\.h, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 254), + z0 = svdup_u16_x (p0, 254)) + +/* +** dup_255_u16_x: +** mov z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 255), + z0 = svdup_u16_x (p0, 255)) + +/* +** dup_256_u16_x: +** mov z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 256), + z0 = svdup_u16_x (p0, 256)) + +/* +** dup_257_u16_x: +** mov z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 257), + z0 = svdup_u16_x (p0, 257)) + +/* +** dup_512_u16_x: +** mov z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 512), + z0 = svdup_u16_x (p0, 512)) + +/* +** dup_7f00_u16_x: +** mov z0\.h, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 0x7f00), + z0 = svdup_u16_x (p0, 0x7f00)) + +/* +** dup_7f01_u16_x: +** mov (w[0-9]+), 32513 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 0x7f01), + z0 = svdup_u16_x (p0, 0x7f01)) + +/* +** dup_7ffd_u16_x: +** mov (w[0-9]+), 32765 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 0x7ffd), + z0 = svdup_u16_x (p0, 0x7ffd)) + +/* +** dup_7ffe_u16_x: +** mov z0\.h, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 0x7ffe), + z0 = svdup_u16_x (p0, 0x7ffe)) + +/* +** dup_7fff_u16_x: +** mov z0\.h, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, 0x7fff), + z0 = svdup_u16_x (p0, 0x7fff)) + +/* +** dup_m1_u16_x: +** mov z0\.b, #-1 +** ret 
+*/ +TEST_UNIFORM_Z (dup_m1_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -1), + z0 = svdup_u16_x (p0, -1)) + +/* +** dup_m128_u16_x: +** mov z0\.h, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -128), + z0 = svdup_u16_x (p0, -128)) + +/* +** dup_m129_u16_x: +** mov z0\.h, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -129), + z0 = svdup_u16_x (p0, -129)) + +/* +** dup_m130_u16_x: +** mvni v([0-9]+)\.8h, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m130_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -130), + z0 = svdup_u16_x (p0, -130)) + +/* +** dup_m254_u16_x: +** mvni v([0-9]+)\.8h, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m254_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -254), + z0 = svdup_u16_x (p0, -254)) + +/* +** dup_m255_u16_x: +** mov z0\.h, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -255), + z0 = svdup_u16_x (p0, -255)) + +/* +** dup_m256_u16_x: +** mov z0\.h, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -256), + z0 = svdup_u16_x (p0, -256)) + +/* +** dup_m257_u16_x: +** mov z0\.h, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -257), + z0 = svdup_u16_x (p0, -257)) + +/* +** dup_m258_u16_x: +** mov z0\.b, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -258), + z0 = svdup_u16_x (p0, -258)) + +/* +** dup_m259_u16_x: +** mov (w[0-9]+), -259 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -259), + z0 = svdup_u16_x (p0, -259)) + +/* +** dup_m512_u16_x: +** mov z0\.h, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -512), + z0 = svdup_u16_x (p0, -512)) + +/* +** dup_m7f00_u16_x: +** mov z0\.h, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -0x7f00), + z0 = svdup_u16_x (p0, -0x7f00)) + +/* +** dup_m7f01_u16_x: +** mov z0\.h, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -0x7f01), + z0 = svdup_u16_x (p0, -0x7f01)) + +/* +** dup_m7f02_u16_x: +** mov (w[0-9]+), -32514 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -0x7f02), + z0 = svdup_u16_x (p0, -0x7f02)) + +/* +** dup_m7ffe_u16_x: +** mov (w[0-9]+), -32766 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -0x7ffe), + z0 = svdup_u16_x (p0, -0x7ffe)) + +/* +** dup_m7fff_u16_x: +** mov z0\.h, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -0x7fff), + z0 = svdup_u16_x (p0, -0x7fff)) + +/* +** dup_m8000_u16_x: +** mov z0\.h, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u16_x, svuint16_t, + z0 = svdup_n_u16_x (p0, -0x8000), + z0 = svdup_u16_x (p0, -0x8000)) + +/* +** dup_w0_u16_x: +** mov z0\.h, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u16_x, svuint16_t, uint16_t, + z0 = svdup_n_u16_x (p0, x0), + z0 = svdup_u16_x (p0, x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u32.c new file mode 100644 index 000000000..667feea64 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u32.c @@ -0,0 +1,1175 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_u32: +** mov 
z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u32, svuint32_t, + z0 = svdup_n_u32 (1), + z0 = svdup_u32 (1)) + +/* +** dup_127_u32: +** mov z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u32, svuint32_t, + z0 = svdup_n_u32 (127), + z0 = svdup_u32 (127)) + +/* +** dup_128_u32: +** mov z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u32, svuint32_t, + z0 = svdup_n_u32 (128), + z0 = svdup_u32 (128)) + +/* +** dup_129_u32: +** movi v([0-9]+)\.4s, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_129_u32, svuint32_t, + z0 = svdup_n_u32 (129), + z0 = svdup_u32 (129)) + +/* +** dup_253_u32: +** movi v([0-9]+)\.4s, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_253_u32, svuint32_t, + z0 = svdup_n_u32 (253), + z0 = svdup_u32 (253)) + +/* +** dup_254_u32: +** mov z0\.s, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u32, svuint32_t, + z0 = svdup_n_u32 (254), + z0 = svdup_u32 (254)) + +/* +** dup_255_u32: +** mov z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u32, svuint32_t, + z0 = svdup_n_u32 (255), + z0 = svdup_u32 (255)) + +/* +** dup_256_u32: +** mov z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u32, svuint32_t, + z0 = svdup_n_u32 (256), + z0 = svdup_u32 (256)) + +/* +** dup_257_u32: +** mov (w[0-9]+), 257 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_u32, svuint32_t, + z0 = svdup_n_u32 (257), + z0 = svdup_u32 (257)) + +/* +** dup_512_u32: +** mov z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u32, svuint32_t, + z0 = svdup_n_u32 (512), + z0 = svdup_u32 (512)) + +/* +** dup_7f00_u32: +** mov z0\.s, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u32, svuint32_t, + z0 = svdup_n_u32 (0x7f00), + z0 = svdup_u32 (0x7f00)) + +/* +** dup_7f01_u32: +** mov (w[0-9]+), 32513 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_u32, svuint32_t, + z0 = svdup_n_u32 (0x7f01), + z0 = svdup_u32 (0x7f01)) + +/* +** dup_7ffd_u32: +** mov (w[0-9]+), 32765 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_u32, svuint32_t, + z0 = svdup_n_u32 (0x7ffd), + z0 = svdup_u32 (0x7ffd)) + +/* +** dup_7ffe_u32: +** mov z0\.s, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u32, svuint32_t, + z0 = svdup_n_u32 (0x7ffe), + z0 = svdup_u32 (0x7ffe)) + +/* +** dup_7fff_u32: +** mov z0\.s, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u32, svuint32_t, + z0 = svdup_n_u32 (0x7fff), + z0 = svdup_u32 (0x7fff)) + +/* +** dup_m1_u32: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u32, svuint32_t, + z0 = svdup_n_u32 (-1), + z0 = svdup_u32 (-1)) + +/* +** dup_m128_u32: +** mov z0\.s, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u32, svuint32_t, + z0 = svdup_n_u32 (-128), + z0 = svdup_u32 (-128)) + +/* +** dup_m129_u32: +** mov z0\.s, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u32, svuint32_t, + z0 = svdup_n_u32 (-129), + z0 = svdup_u32 (-129)) + +/* +** dup_m130_u32: +** mvni v([0-9]+)\.4s, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m130_u32, svuint32_t, + z0 = svdup_n_u32 (-130), + z0 = svdup_u32 (-130)) + +/* +** dup_m254_u32: +** mvni v([0-9]+)\.4s, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m254_u32, svuint32_t, + z0 = svdup_n_u32 (-254), + z0 = svdup_u32 (-254)) + +/* +** dup_m255_u32: +** mov z0\.s, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u32, svuint32_t, + z0 = svdup_n_u32 (-255), + z0 = svdup_u32 (-255)) + +/* +** dup_m256_u32: +** mov z0\.s, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u32, svuint32_t, + z0 = svdup_n_u32 (-256), + z0 = svdup_u32 (-256)) + +/* +** dup_m257_u32: +** mov z0\.s, #-257 +** ret +*/ +TEST_UNIFORM_Z 
(dup_m257_u32, svuint32_t, + z0 = svdup_n_u32 (-257), + z0 = svdup_u32 (-257)) + +/* +** dup_m258_u32: +** mov (w[0-9]+), -258 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_u32, svuint32_t, + z0 = svdup_n_u32 (-258), + z0 = svdup_u32 (-258)) + +/* +** dup_m259_u32: +** mov (w[0-9]+), -259 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_u32, svuint32_t, + z0 = svdup_n_u32 (-259), + z0 = svdup_u32 (-259)) + +/* +** dup_m512_u32: +** mov z0\.s, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u32, svuint32_t, + z0 = svdup_n_u32 (-512), + z0 = svdup_u32 (-512)) + +/* +** dup_m7f00_u32: +** mov z0\.s, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u32, svuint32_t, + z0 = svdup_n_u32 (-0x7f00), + z0 = svdup_u32 (-0x7f00)) + +/* +** dup_m7f01_u32: +** mov z0\.s, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u32, svuint32_t, + z0 = svdup_n_u32 (-0x7f01), + z0 = svdup_u32 (-0x7f01)) + +/* +** dup_m7f02_u32: +** mov (w[0-9]+), -32514 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_u32, svuint32_t, + z0 = svdup_n_u32 (-0x7f02), + z0 = svdup_u32 (-0x7f02)) + +/* +** dup_m7ffe_u32: +** mov (w[0-9]+), -32766 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_u32, svuint32_t, + z0 = svdup_n_u32 (-0x7ffe), + z0 = svdup_u32 (-0x7ffe)) + +/* +** dup_m7fff_u32: +** mov z0\.s, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u32, svuint32_t, + z0 = svdup_n_u32 (-0x7fff), + z0 = svdup_u32 (-0x7fff)) + +/* +** dup_m8000_u32: +** mov z0\.s, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u32, svuint32_t, + z0 = svdup_n_u32 (-0x8000), + z0 = svdup_u32 (-0x8000)) + +/* +** dup_w0_u32: +** mov z0\.s, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u32, svuint32_t, uint32_t, + z0 = svdup_n_u32 (x0), + z0 = svdup_u32 (x0)) + +/* +** dup_1_u32_m: +** mov z0\.s, p0/m, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 1), + z0 = svdup_u32_m (z0, p0, 1)) + +/* +** dup_127_u32_m: +** mov z0\.s, p0/m, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 127), + z0 = svdup_u32_m (z0, p0, 127)) + +/* +** dup_128_u32_m: +** mov (z[0-9]+\.s), #128 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_128_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 128), + z0 = svdup_u32_m (z0, p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 129), + z0 = svdup_u32_m (z0, p0, 129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_253_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 253), + z0 = svdup_u32_m (z0, p0, 253)) + +/* +** dup_254_u32_m: +** mov (z[0-9]+\.s), #254 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_254_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 254), + z0 = svdup_u32_m (z0, p0, 254)) + +/* +** dup_255_u32_m: +** mov (z[0-9]+\.s), #255 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_255_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 255), + z0 = svdup_u32_m (z0, p0, 255)) + +/* +** dup_256_u32_m: +** mov z0\.s, p0/m, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 256), + z0 = svdup_u32_m (z0, p0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_257_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 257), + z0 = svdup_u32_m (z0, p0, 257)) + +/* +** dup_512_u32_m: +** mov z0\.s, p0/m, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 512), + z0 = svdup_u32_m (z0, p0, 512)) + +/* +** dup_7f00_u32_m: +** mov z0\.s, p0/m, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 0x7f00), + z0 = svdup_u32_m (z0, p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 0x7f01), + z0 = svdup_u32_m (z0, p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 0x7ffd), + z0 = svdup_u32_m (z0, p0, 0x7ffd)) + +/* +** dup_7ffe_u32_m: +** mov (z[0-9]+\.s), #32766 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 0x7ffe), + z0 = svdup_u32_m (z0, p0, 0x7ffe)) + +/* +** dup_7fff_u32_m: +** mov (z[0-9]+\.s), #32767 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 0x7fff), + z0 = svdup_u32_m (z0, p0, 0x7fff)) + +/* +** dup_m1_u32_m: +** mov z0\.s, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -1), + z0 = svdup_u32_m (z0, p0, -1)) + +/* +** dup_m128_u32_m: +** mov z0\.s, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -128), + z0 = svdup_u32_m (z0, p0, -128)) + +/* +** dup_m129_u32_m: +** mov (z[0-9]+\.s), #-129 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -129), + z0 = svdup_u32_m (z0, p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -130), + z0 = svdup_u32_m (z0, p0, -130)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m254_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -254), + z0 = svdup_u32_m (z0, p0, -254)) + +/* +** dup_m255_u32_m: +** mov (z[0-9]+\.s), #-255 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -255), + z0 = svdup_u32_m (z0, p0, -255)) + +/* +** dup_m256_u32_m: +** mov z0\.s, p0/m, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -256), + z0 = svdup_u32_m (z0, p0, -256)) + +/* +** dup_m257_u32_m: +** mov (z[0-9]+\.s), #-257 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -257), + z0 = svdup_u32_m (z0, p0, -257)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m258_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -258), + z0 = svdup_u32_m (z0, p0, -258)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m259_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -259), + z0 = svdup_u32_m (z0, p0, -259)) + +/* +** dup_m512_u32_m: +** mov z0\.s, p0/m, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -512), + z0 = svdup_u32_m (z0, p0, -512)) + +/* +** dup_m7f00_u32_m: +** mov z0\.s, p0/m, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -0x7f00), + z0 = svdup_u32_m (z0, p0, -0x7f00)) + +/* +** dup_m7f01_u32_m: +** mov (z[0-9]+\.s), #-32513 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -0x7f01), + z0 = svdup_u32_m (z0, p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -0x7f02), + z0 = svdup_u32_m (z0, p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -0x7ffe), + z0 = svdup_u32_m (z0, p0, -0x7ffe)) + +/* +** dup_m7fff_u32_m: +** mov (z[0-9]+\.s), #-32767 +** sel z0\.s, p0, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -0x7fff), + z0 = svdup_u32_m (z0, p0, -0x7fff)) + +/* +** dup_m8000_u32_m: +** mov z0\.s, p0/m, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, -0x8000), + z0 = svdup_u32_m (z0, p0, -0x8000)) + +/* +** dup_0_u32_m: +** mov z0\.s, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_u32_m, svuint32_t, + z0 = svdup_n_u32_m (z0, p0, 0), + z0 = svdup_u32_m (z0, p0, 0)) + +/* +** dup_w0_u32_m: +** movprfx z0, z1 +** mov z0\.s, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u32_m, svuint32_t, uint32_t, + z0 = svdup_n_u32_m (z1, p0, x0), + z0 = svdup_u32_m (z1, p0, x0)) + +/* +** dup_1_u32_z: +** mov z0\.s, p0/z, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 1), + z0 = svdup_u32_z (p0, 1)) + +/* +** dup_127_u32_z: +** mov z0\.s, p0/z, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 127), + z0 = svdup_u32_z (p0, 127)) + +/* +** dup_128_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #128 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_128_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 128), + z0 = svdup_u32_z (p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 129), + z0 = svdup_u32_z (p0, 129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_253_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 253), + z0 = svdup_u32_z (p0, 253)) + +/* +** dup_254_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #254 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_254_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 254), + z0 = svdup_u32_z (p0, 254)) + +/* +** dup_255_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #255 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_255_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 255), + z0 = svdup_u32_z (p0, 255)) + +/* +** dup_256_u32_z: +** mov z0\.s, p0/z, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 256), + z0 = svdup_u32_z (p0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_257_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 257), + z0 = svdup_u32_z (p0, 257)) + +/* +** dup_512_u32_z: +** mov z0\.s, p0/z, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 512), + z0 = svdup_u32_z (p0, 512)) + +/* +** dup_7f00_u32_z: +** mov z0\.s, p0/z, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 0x7f00), + z0 = svdup_u32_z (p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 0x7f01), + z0 = svdup_u32_z (p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 0x7ffd), + z0 = svdup_u32_z (p0, 0x7ffd)) + +/* +** dup_7ffe_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #32766 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 0x7ffe), + z0 = svdup_u32_z (p0, 0x7ffe)) + +/* +** dup_7fff_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #32767 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 0x7fff), + z0 = svdup_u32_z (p0, 0x7fff)) + +/* +** dup_m1_u32_z: +** mov z0\.s, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -1), + z0 = svdup_u32_z (p0, -1)) + +/* +** dup_m128_u32_z: +** mov z0\.s, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -128), + z0 = svdup_u32_z (p0, -128)) + +/* +** dup_m129_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-129 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -129), + z0 = svdup_u32_z (p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -130), + z0 = svdup_u32_z (p0, -130)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m254_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -254), + z0 = svdup_u32_z (p0, -254)) + +/* +** dup_m255_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-255 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -255), + z0 = svdup_u32_z (p0, -255)) + +/* +** dup_m256_u32_z: +** mov z0\.s, p0/z, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -256), + z0 = svdup_u32_z (p0, -256)) + +/* +** dup_m257_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-257 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -257), + z0 = svdup_u32_z (p0, -257)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m258_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -258), + z0 = svdup_u32_z (p0, -258)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m259_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -259), + z0 = svdup_u32_z (p0, -259)) + +/* +** dup_m512_u32_z: +** mov z0\.s, p0/z, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -512), + z0 = svdup_u32_z (p0, -512)) + +/* +** dup_m7f00_u32_z: +** mov z0\.s, p0/z, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -0x7f00), + z0 = svdup_u32_z (p0, -0x7f00)) + +/* +** dup_m7f01_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-32513 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -0x7f01), + z0 = svdup_u32_z (p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -0x7f02), + z0 = svdup_u32_z (p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -0x7ffe), + z0 = svdup_u32_z (p0, -0x7ffe)) + +/* +** dup_m7fff_u32_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.s), #-32767 +** sel z0\.s, p0, \2, \1\.s +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -0x7fff), + z0 = svdup_u32_z (p0, -0x7fff)) + +/* +** dup_m8000_u32_z: +** mov z0\.s, p0/z, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, -0x8000), + z0 = svdup_u32_z (p0, -0x8000)) + +/* +** dup_0_u32_z: +** mov z0\.s, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_u32_z, svuint32_t, + z0 = svdup_n_u32_z (p0, 0), + z0 = svdup_u32_z (p0, 0)) + +/* +** dup_w0_u32_z: +** movprfx z0\.s, p0/z, z0\.s +** mov z0\.s, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u32_z, svuint32_t, uint32_t, + z0 = svdup_n_u32_z (p0, x0), + z0 = svdup_u32_z (p0, x0)) + +/* +** dup_1_u32_x: +** mov z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 1), + z0 = svdup_u32_x (p0, 1)) + +/* +** dup_127_u32_x: +** mov z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 127), + z0 = svdup_u32_x (p0, 127)) + +/* +** dup_128_u32_x: +** mov z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 128), + z0 = svdup_u32_x (p0, 128)) + +/* +** dup_129_u32_x: +** movi v([0-9]+)\.4s, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_129_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 129), + z0 = svdup_u32_x (p0, 129)) + +/* +** dup_253_u32_x: +** movi v([0-9]+)\.4s, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_253_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 253), + z0 = svdup_u32_x (p0, 253)) + +/* +** dup_254_u32_x: +** mov z0\.s, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 254), + z0 = svdup_u32_x (p0, 254)) + +/* +** dup_255_u32_x: +** mov z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 255), + z0 = svdup_u32_x (p0, 255)) + +/* +** dup_256_u32_x: +** mov z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 256), + z0 = svdup_u32_x (p0, 256)) + +/* +** dup_257_u32_x: +** mov (w[0-9]+), 257 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 257), + z0 = svdup_u32_x (p0, 257)) + +/* +** dup_512_u32_x: +** mov z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 512), + z0 = svdup_u32_x (p0, 512)) + +/* +** dup_7f00_u32_x: +** 
mov z0\.s, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 0x7f00), + z0 = svdup_u32_x (p0, 0x7f00)) + +/* +** dup_7f01_u32_x: +** mov (w[0-9]+), 32513 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 0x7f01), + z0 = svdup_u32_x (p0, 0x7f01)) + +/* +** dup_7ffd_u32_x: +** mov (w[0-9]+), 32765 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 0x7ffd), + z0 = svdup_u32_x (p0, 0x7ffd)) + +/* +** dup_7ffe_u32_x: +** mov z0\.s, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 0x7ffe), + z0 = svdup_u32_x (p0, 0x7ffe)) + +/* +** dup_7fff_u32_x: +** mov z0\.s, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, 0x7fff), + z0 = svdup_u32_x (p0, 0x7fff)) + +/* +** dup_m1_u32_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -1), + z0 = svdup_u32_x (p0, -1)) + +/* +** dup_m128_u32_x: +** mov z0\.s, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -128), + z0 = svdup_u32_x (p0, -128)) + +/* +** dup_m129_u32_x: +** mov z0\.s, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -129), + z0 = svdup_u32_x (p0, -129)) + +/* +** dup_m130_u32_x: +** mvni v([0-9]+)\.4s, 0x81 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m130_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -130), + z0 = svdup_u32_x (p0, -130)) + +/* +** dup_m254_u32_x: +** mvni v([0-9]+)\.4s, 0xfd +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dup_m254_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -254), + z0 = svdup_u32_x (p0, -254)) + +/* +** dup_m255_u32_x: +** mov z0\.s, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -255), + z0 = svdup_u32_x (p0, -255)) + +/* +** dup_m256_u32_x: +** mov z0\.s, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -256), + z0 = svdup_u32_x (p0, -256)) + +/* +** dup_m257_u32_x: +** mov z0\.s, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -257), + z0 = svdup_u32_x (p0, -257)) + +/* +** dup_m258_u32_x: +** mov (w[0-9]+), -258 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -258), + z0 = svdup_u32_x (p0, -258)) + +/* +** dup_m259_u32_x: +** mov (w[0-9]+), -259 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -259), + z0 = svdup_u32_x (p0, -259)) + +/* +** dup_m512_u32_x: +** mov z0\.s, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -512), + z0 = svdup_u32_x (p0, -512)) + +/* +** dup_m7f00_u32_x: +** mov z0\.s, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -0x7f00), + z0 = svdup_u32_x (p0, -0x7f00)) + +/* +** dup_m7f01_u32_x: +** mov z0\.s, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -0x7f01), + z0 = svdup_u32_x (p0, -0x7f01)) + +/* +** dup_m7f02_u32_x: +** mov (w[0-9]+), -32514 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -0x7f02), + z0 = svdup_u32_x (p0, -0x7f02)) + +/* +** dup_m7ffe_u32_x: +** mov (w[0-9]+), -32766 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -0x7ffe), + z0 = 
svdup_u32_x (p0, -0x7ffe)) + +/* +** dup_m7fff_u32_x: +** mov z0\.s, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -0x7fff), + z0 = svdup_u32_x (p0, -0x7fff)) + +/* +** dup_m8000_u32_x: +** mov z0\.s, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u32_x, svuint32_t, + z0 = svdup_n_u32_x (p0, -0x8000), + z0 = svdup_u32_x (p0, -0x8000)) + +/* +** dup_w0_u32_x: +** mov z0\.s, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u32_x, svuint32_t, uint32_t, + z0 = svdup_n_u32_x (p0, x0), + z0 = svdup_u32_x (p0, x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u64.c new file mode 100644 index 000000000..a7cca7af0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u64.c @@ -0,0 +1,1175 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_u64: +** mov z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u64, svuint64_t, + z0 = svdup_n_u64 (1), + z0 = svdup_u64 (1)) + +/* +** dup_127_u64: +** mov z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u64, svuint64_t, + z0 = svdup_n_u64 (127), + z0 = svdup_u64 (127)) + +/* +** dup_128_u64: +** mov z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u64, svuint64_t, + z0 = svdup_n_u64 (128), + z0 = svdup_u64 (128)) + +/* +** dup_129_u64: +** mov (x[0-9]+), 129 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_129_u64, svuint64_t, + z0 = svdup_n_u64 (129), + z0 = svdup_u64 (129)) + +/* +** dup_253_u64: +** mov (x[0-9]+), 253 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_253_u64, svuint64_t, + z0 = svdup_n_u64 (253), + z0 = svdup_u64 (253)) + +/* +** dup_254_u64: +** mov z0\.d, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u64, svuint64_t, + z0 = svdup_n_u64 (254), + z0 = svdup_u64 (254)) + +/* +** dup_255_u64: +** mov z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u64, svuint64_t, + z0 = svdup_n_u64 (255), + z0 = svdup_u64 (255)) + +/* +** dup_256_u64: +** mov z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u64, svuint64_t, + z0 = svdup_n_u64 (256), + z0 = svdup_u64 (256)) + +/* +** dup_257_u64: +** mov (x[0-9]+), 257 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_u64, svuint64_t, + z0 = svdup_n_u64 (257), + z0 = svdup_u64 (257)) + +/* +** dup_512_u64: +** mov z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u64, svuint64_t, + z0 = svdup_n_u64 (512), + z0 = svdup_u64 (512)) + +/* +** dup_7f00_u64: +** mov z0\.d, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u64, svuint64_t, + z0 = svdup_n_u64 (0x7f00), + z0 = svdup_u64 (0x7f00)) + +/* +** dup_7f01_u64: +** mov (x[0-9]+), 32513 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_u64, svuint64_t, + z0 = svdup_n_u64 (0x7f01), + z0 = svdup_u64 (0x7f01)) + +/* +** dup_7ffd_u64: +** mov (x[0-9]+), 32765 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_u64, svuint64_t, + z0 = svdup_n_u64 (0x7ffd), + z0 = svdup_u64 (0x7ffd)) + +/* +** dup_7ffe_u64: +** mov z0\.d, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u64, svuint64_t, + z0 = svdup_n_u64 (0x7ffe), + z0 = svdup_u64 (0x7ffe)) + +/* +** dup_7fff_u64: +** mov z0\.d, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u64, svuint64_t, + z0 = svdup_n_u64 (0x7fff), + z0 = svdup_u64 (0x7fff)) + +/* +** dup_m1_u64: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u64, svuint64_t, + z0 = svdup_n_u64 (-1), + z0 = svdup_u64 (-1)) + +/* +** dup_m128_u64: +** mov z0\.d, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u64, svuint64_t, + z0 = svdup_n_u64 (-128), + z0 = svdup_u64 
(-128)) + +/* +** dup_m129_u64: +** mov z0\.d, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u64, svuint64_t, + z0 = svdup_n_u64 (-129), + z0 = svdup_u64 (-129)) + +/* +** dup_m130_u64: +** mov (x[0-9]+), -130 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m130_u64, svuint64_t, + z0 = svdup_n_u64 (-130), + z0 = svdup_u64 (-130)) + +/* +** dup_m254_u64: +** mov (x[0-9]+), -254 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m254_u64, svuint64_t, + z0 = svdup_n_u64 (-254), + z0 = svdup_u64 (-254)) + +/* +** dup_m255_u64: +** mov z0\.d, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u64, svuint64_t, + z0 = svdup_n_u64 (-255), + z0 = svdup_u64 (-255)) + +/* +** dup_m256_u64: +** mov z0\.d, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u64, svuint64_t, + z0 = svdup_n_u64 (-256), + z0 = svdup_u64 (-256)) + +/* +** dup_m257_u64: +** mov z0\.d, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u64, svuint64_t, + z0 = svdup_n_u64 (-257), + z0 = svdup_u64 (-257)) + +/* +** dup_m258_u64: +** mov (x[0-9]+), -258 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_u64, svuint64_t, + z0 = svdup_n_u64 (-258), + z0 = svdup_u64 (-258)) + +/* +** dup_m259_u64: +** mov (x[0-9]+), -259 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_u64, svuint64_t, + z0 = svdup_n_u64 (-259), + z0 = svdup_u64 (-259)) + +/* +** dup_m512_u64: +** mov z0\.d, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u64, svuint64_t, + z0 = svdup_n_u64 (-512), + z0 = svdup_u64 (-512)) + +/* +** dup_m7f00_u64: +** mov z0\.d, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u64, svuint64_t, + z0 = svdup_n_u64 (-0x7f00), + z0 = svdup_u64 (-0x7f00)) + +/* +** dup_m7f01_u64: +** mov z0\.d, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u64, svuint64_t, + z0 = svdup_n_u64 (-0x7f01), + z0 = svdup_u64 (-0x7f01)) + +/* +** dup_m7f02_u64: +** mov (x[0-9]+), -32514 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_u64, svuint64_t, + z0 = svdup_n_u64 (-0x7f02), + z0 = svdup_u64 (-0x7f02)) + +/* +** dup_m7ffe_u64: +** mov (x[0-9]+), -32766 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_u64, svuint64_t, + z0 = svdup_n_u64 (-0x7ffe), + z0 = svdup_u64 (-0x7ffe)) + +/* +** dup_m7fff_u64: +** mov z0\.d, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u64, svuint64_t, + z0 = svdup_n_u64 (-0x7fff), + z0 = svdup_u64 (-0x7fff)) + +/* +** dup_m8000_u64: +** mov z0\.d, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u64, svuint64_t, + z0 = svdup_n_u64 (-0x8000), + z0 = svdup_u64 (-0x8000)) + +/* +** dup_x0_u64: +** mov z0\.d, x0 +** ret +*/ +TEST_UNIFORM_ZX (dup_x0_u64, svuint64_t, uint64_t, + z0 = svdup_n_u64 (x0), + z0 = svdup_u64 (x0)) + +/* +** dup_1_u64_m: +** mov z0\.d, p0/m, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 1), + z0 = svdup_u64_m (z0, p0, 1)) + +/* +** dup_127_u64_m: +** mov z0\.d, p0/m, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 127), + z0 = svdup_u64_m (z0, p0, 127)) + +/* +** dup_128_u64_m: +** mov (z[0-9]+\.d), #128 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_128_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 128), + z0 = svdup_u64_m (z0, p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 129), + z0 = svdup_u64_m (z0, p0, 129)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_253_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 253), + z0 = svdup_u64_m (z0, p0, 253)) + +/* +** dup_254_u64_m: +** mov (z[0-9]+\.d), #254 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_254_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 254), + z0 = svdup_u64_m (z0, p0, 254)) + +/* +** dup_255_u64_m: +** mov (z[0-9]+\.d), #255 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_255_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 255), + z0 = svdup_u64_m (z0, p0, 255)) + +/* +** dup_256_u64_m: +** mov z0\.d, p0/m, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 256), + z0 = svdup_u64_m (z0, p0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_257_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 257), + z0 = svdup_u64_m (z0, p0, 257)) + +/* +** dup_512_u64_m: +** mov z0\.d, p0/m, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 512), + z0 = svdup_u64_m (z0, p0, 512)) + +/* +** dup_7f00_u64_m: +** mov z0\.d, p0/m, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 0x7f00), + z0 = svdup_u64_m (z0, p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 0x7f01), + z0 = svdup_u64_m (z0, p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 0x7ffd), + z0 = svdup_u64_m (z0, p0, 0x7ffd)) + +/* +** dup_7ffe_u64_m: +** mov (z[0-9]+\.d), #32766 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 0x7ffe), + z0 = svdup_u64_m (z0, p0, 0x7ffe)) + +/* +** dup_7fff_u64_m: +** mov (z[0-9]+\.d), #32767 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 0x7fff), + z0 = svdup_u64_m (z0, p0, 0x7fff)) + +/* +** dup_m1_u64_m: +** mov z0\.d, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -1), + z0 = svdup_u64_m (z0, p0, -1)) + +/* +** dup_m128_u64_m: +** mov z0\.d, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -128), + z0 = svdup_u64_m (z0, p0, -128)) + +/* +** dup_m129_u64_m: +** mov (z[0-9]+\.d), #-129 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -129), + z0 = svdup_u64_m (z0, p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -130), + z0 = svdup_u64_m (z0, p0, -130)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m254_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -254), + z0 = svdup_u64_m (z0, p0, -254)) + +/* +** dup_m255_u64_m: +** mov (z[0-9]+\.d), #-255 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -255), + z0 = svdup_u64_m (z0, p0, -255)) + +/* +** dup_m256_u64_m: +** mov z0\.d, p0/m, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -256), + z0 = svdup_u64_m (z0, p0, -256)) + +/* +** dup_m257_u64_m: +** mov (z[0-9]+\.d), #-257 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -257), + z0 = svdup_u64_m (z0, p0, -257)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m258_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -258), + z0 = svdup_u64_m (z0, p0, -258)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m259_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -259), + z0 = svdup_u64_m (z0, p0, -259)) + +/* +** dup_m512_u64_m: +** mov z0\.d, p0/m, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -512), + z0 = svdup_u64_m (z0, p0, -512)) + +/* +** dup_m7f00_u64_m: +** mov z0\.d, p0/m, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -0x7f00), + z0 = svdup_u64_m (z0, p0, -0x7f00)) + +/* +** dup_m7f01_u64_m: +** mov (z[0-9]+\.d), #-32513 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -0x7f01), + z0 = svdup_u64_m (z0, p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -0x7f02), + z0 = svdup_u64_m (z0, p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -0x7ffe), + z0 = svdup_u64_m (z0, p0, -0x7ffe)) + +/* +** dup_m7fff_u64_m: +** mov (z[0-9]+\.d), #-32767 +** sel z0\.d, p0, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -0x7fff), + z0 = svdup_u64_m (z0, p0, -0x7fff)) + +/* +** dup_m8000_u64_m: +** mov z0\.d, p0/m, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, -0x8000), + z0 = svdup_u64_m (z0, p0, -0x8000)) + +/* +** dup_0_u64_m: +** mov z0\.d, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_u64_m, svuint64_t, + z0 = svdup_n_u64_m (z0, p0, 0), + z0 = svdup_u64_m (z0, p0, 0)) + +/* +** dup_x0_u64_m: +** movprfx z0, z1 +** mov z0\.d, p0/m, x0 +** ret +*/ +TEST_UNIFORM_ZX (dup_x0_u64_m, svuint64_t, uint64_t, + z0 = svdup_n_u64_m (z1, p0, x0), + z0 = svdup_u64_m (z1, p0, x0)) + +/* +** dup_1_u64_z: +** mov z0\.d, p0/z, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 1), + z0 = svdup_u64_z (p0, 1)) + +/* +** dup_127_u64_z: +** mov z0\.d, p0/z, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 127), + z0 = svdup_u64_z (p0, 127)) + +/* +** dup_128_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #128 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_128_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 128), + z0 = svdup_u64_z (p0, 128)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_129_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 129), + z0 = svdup_u64_z (p0, 129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_253_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 253), + z0 = svdup_u64_z (p0, 253)) + +/* +** dup_254_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #254 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_254_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 254), + z0 = svdup_u64_z (p0, 254)) + +/* +** dup_255_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #255 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_255_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 255), + z0 = svdup_u64_z (p0, 255)) + +/* +** dup_256_u64_z: +** mov z0\.d, p0/z, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 256), + z0 = svdup_u64_z (p0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_257_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 257), + z0 = svdup_u64_z (p0, 257)) + +/* +** dup_512_u64_z: +** mov z0\.d, p0/z, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 512), + z0 = svdup_u64_z (p0, 512)) + +/* +** dup_7f00_u64_z: +** mov z0\.d, p0/z, #32512 +** ret +*/ +TEST_UNIFORM_Z (dup_7f00_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 0x7f00), + z0 = svdup_u64_z (p0, 0x7f00)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7f01_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 0x7f01), + z0 = svdup_u64_z (p0, 0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_7ffd_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 0x7ffd), + z0 = svdup_u64_z (p0, 0x7ffd)) + +/* +** dup_7ffe_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #32766 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 0x7ffe), + z0 = svdup_u64_z (p0, 0x7ffe)) + +/* +** dup_7fff_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #32767 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 0x7fff), + z0 = svdup_u64_z (p0, 0x7fff)) + +/* +** dup_m1_u64_z: +** mov z0\.d, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -1), + z0 = svdup_u64_z (p0, -1)) + +/* +** dup_m128_u64_z: +** mov z0\.d, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -128), + z0 = svdup_u64_z (p0, -128)) + +/* +** dup_m129_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-129 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -129), + z0 = svdup_u64_z (p0, -129)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m130_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -130), + z0 = svdup_u64_z (p0, -130)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m254_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -254), + z0 = svdup_u64_z (p0, -254)) + +/* +** dup_m255_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-255 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -255), + z0 = svdup_u64_z (p0, -255)) + +/* +** dup_m256_u64_z: +** mov z0\.d, p0/z, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -256), + z0 = svdup_u64_z (p0, -256)) + +/* +** dup_m257_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-257 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -257), + z0 = svdup_u64_z (p0, -257)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m258_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -258), + z0 = svdup_u64_z (p0, -258)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (dup_m259_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -259), + z0 = svdup_u64_z (p0, -259)) + +/* +** dup_m512_u64_z: +** mov z0\.d, p0/z, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -512), + z0 = svdup_u64_z (p0, -512)) + +/* +** dup_m7f00_u64_z: +** mov z0\.d, p0/z, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -0x7f00), + z0 = svdup_u64_z (p0, -0x7f00)) + +/* +** dup_m7f01_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-32513 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -0x7f01), + z0 = svdup_u64_z (p0, -0x7f01)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7f02_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -0x7f02), + z0 = svdup_u64_z (p0, -0x7f02)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (dup_m7ffe_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -0x7ffe), + z0 = svdup_u64_z (p0, -0x7ffe)) + +/* +** dup_m7fff_u64_z: +** mov (z[0-9]+)\.b, #0 +** mov (z[0-9]+\.d), #-32767 +** sel z0\.d, p0, \2, \1\.d +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -0x7fff), + z0 = svdup_u64_z (p0, -0x7fff)) + +/* +** dup_m8000_u64_z: +** mov z0\.d, p0/z, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, -0x8000), + z0 = svdup_u64_z (p0, -0x8000)) + +/* +** dup_0_u64_z: +** mov z0\.d, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_u64_z, svuint64_t, + z0 = svdup_n_u64_z (p0, 0), + z0 = svdup_u64_z (p0, 0)) + +/* +** dup_x0_u64_z: +** movprfx z0\.d, p0/z, z0\.d +** mov z0\.d, p0/m, x0 +** ret +*/ +TEST_UNIFORM_ZX (dup_x0_u64_z, svuint64_t, uint64_t, + z0 = svdup_n_u64_z (p0, x0), + z0 = svdup_u64_z (p0, x0)) + +/* +** dup_1_u64_x: +** mov z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 1), + z0 = svdup_u64_x (p0, 1)) + +/* +** dup_127_u64_x: +** mov z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 127), + z0 = svdup_u64_x (p0, 127)) + +/* +** dup_128_u64_x: +** mov z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 128), + z0 = svdup_u64_x (p0, 128)) + +/* +** dup_129_u64_x: +** mov (x[0-9]+), 129 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_129_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 129), + z0 = svdup_u64_x (p0, 129)) + +/* +** dup_253_u64_x: +** mov (x[0-9]+), 253 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_253_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 253), + z0 = svdup_u64_x (p0, 253)) + +/* +** dup_254_u64_x: +** mov z0\.d, #254 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 254), + z0 = svdup_u64_x (p0, 254)) + +/* +** dup_255_u64_x: +** mov z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 255), + z0 = svdup_u64_x (p0, 255)) + +/* +** dup_256_u64_x: +** mov z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (dup_256_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 256), + z0 = svdup_u64_x (p0, 256)) + +/* +** dup_257_u64_x: +** mov (x[0-9]+), 257 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_257_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 257), + z0 = svdup_u64_x (p0, 257)) + +/* +** dup_512_u64_x: +** mov z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (dup_512_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 512), + z0 = svdup_u64_x (p0, 512)) + +/* +** dup_7f00_u64_x: +** mov z0\.d, #32512 +** ret +*/ 
+TEST_UNIFORM_Z (dup_7f00_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 0x7f00), + z0 = svdup_u64_x (p0, 0x7f00)) + +/* +** dup_7f01_u64_x: +** mov (x[0-9]+), 32513 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7f01_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 0x7f01), + z0 = svdup_u64_x (p0, 0x7f01)) + +/* +** dup_7ffd_u64_x: +** mov (x[0-9]+), 32765 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffd_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 0x7ffd), + z0 = svdup_u64_x (p0, 0x7ffd)) + +/* +** dup_7ffe_u64_x: +** mov z0\.d, #32766 +** ret +*/ +TEST_UNIFORM_Z (dup_7ffe_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 0x7ffe), + z0 = svdup_u64_x (p0, 0x7ffe)) + +/* +** dup_7fff_u64_x: +** mov z0\.d, #32767 +** ret +*/ +TEST_UNIFORM_Z (dup_7fff_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, 0x7fff), + z0 = svdup_u64_x (p0, 0x7fff)) + +/* +** dup_m1_u64_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -1), + z0 = svdup_u64_x (p0, -1)) + +/* +** dup_m128_u64_x: +** mov z0\.d, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -128), + z0 = svdup_u64_x (p0, -128)) + +/* +** dup_m129_u64_x: +** mov z0\.d, #-129 +** ret +*/ +TEST_UNIFORM_Z (dup_m129_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -129), + z0 = svdup_u64_x (p0, -129)) + +/* +** dup_m130_u64_x: +** mov (x[0-9]+), -130 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m130_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -130), + z0 = svdup_u64_x (p0, -130)) + +/* +** dup_m254_u64_x: +** mov (x[0-9]+), -254 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m254_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -254), + z0 = svdup_u64_x (p0, -254)) + +/* +** dup_m255_u64_x: +** mov z0\.d, #-255 +** ret +*/ +TEST_UNIFORM_Z (dup_m255_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -255), + z0 = svdup_u64_x (p0, -255)) + +/* +** dup_m256_u64_x: +** mov z0\.d, #-256 +** ret +*/ +TEST_UNIFORM_Z (dup_m256_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -256), + z0 = svdup_u64_x (p0, -256)) + +/* +** dup_m257_u64_x: +** mov z0\.d, #-257 +** ret +*/ +TEST_UNIFORM_Z (dup_m257_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -257), + z0 = svdup_u64_x (p0, -257)) + +/* +** dup_m258_u64_x: +** mov (x[0-9]+), -258 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m258_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -258), + z0 = svdup_u64_x (p0, -258)) + +/* +** dup_m259_u64_x: +** mov (x[0-9]+), -259 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m259_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -259), + z0 = svdup_u64_x (p0, -259)) + +/* +** dup_m512_u64_x: +** mov z0\.d, #-512 +** ret +*/ +TEST_UNIFORM_Z (dup_m512_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -512), + z0 = svdup_u64_x (p0, -512)) + +/* +** dup_m7f00_u64_x: +** mov z0\.d, #-32512 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f00_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -0x7f00), + z0 = svdup_u64_x (p0, -0x7f00)) + +/* +** dup_m7f01_u64_x: +** mov z0\.d, #-32513 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f01_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -0x7f01), + z0 = svdup_u64_x (p0, -0x7f01)) + +/* +** dup_m7f02_u64_x: +** mov (x[0-9]+), -32514 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7f02_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -0x7f02), + z0 = svdup_u64_x (p0, -0x7f02)) + +/* +** dup_m7ffe_u64_x: +** mov (x[0-9]+), -32766 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dup_m7ffe_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -0x7ffe), + z0 = svdup_u64_x (p0, -0x7ffe)) + +/* +** dup_m7fff_u64_x: +** 
mov z0\.d, #-32767 +** ret +*/ +TEST_UNIFORM_Z (dup_m7fff_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -0x7fff), + z0 = svdup_u64_x (p0, -0x7fff)) + +/* +** dup_m8000_u64_x: +** mov z0\.d, #-32768 +** ret +*/ +TEST_UNIFORM_Z (dup_m8000_u64_x, svuint64_t, + z0 = svdup_n_u64_x (p0, -0x8000), + z0 = svdup_u64_x (p0, -0x8000)) + +/* +** dup_x0_u64_x: +** mov z0\.d, x0 +** ret +*/ +TEST_UNIFORM_ZX (dup_x0_u64_x, svuint64_t, uint64_t, + z0 = svdup_n_u64_x (p0, x0), + z0 = svdup_u64_x (p0, x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u8.c new file mode 100644 index 000000000..d27f4bba9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dup_u8.c @@ -0,0 +1,383 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dup_1_u8: +** mov z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u8, svuint8_t, + z0 = svdup_n_u8 (1), + z0 = svdup_u8 (1)) + +/* +** dup_127_u8: +** mov z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u8, svuint8_t, + z0 = svdup_n_u8 (127), + z0 = svdup_u8 (127)) + +/* +** dup_128_u8: +** mov z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u8, svuint8_t, + z0 = svdup_n_u8 (128), + z0 = svdup_u8 (128)) + +/* +** dup_129_u8: +** mov z0\.b, #-127 +** ret +*/ +TEST_UNIFORM_Z (dup_129_u8, svuint8_t, + z0 = svdup_n_u8 (129), + z0 = svdup_u8 (129)) + +/* +** dup_253_u8: +** mov z0\.b, #-3 +** ret +*/ +TEST_UNIFORM_Z (dup_253_u8, svuint8_t, + z0 = svdup_n_u8 (253), + z0 = svdup_u8 (253)) + +/* +** dup_254_u8: +** mov z0\.b, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u8, svuint8_t, + z0 = svdup_n_u8 (254), + z0 = svdup_u8 (254)) + +/* +** dup_255_u8: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u8, svuint8_t, + z0 = svdup_n_u8 (255), + z0 = svdup_u8 (255)) + +/* +** dup_m1_u8: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u8, svuint8_t, + z0 = svdup_n_u8 (-1), + z0 = svdup_u8 (-1)) + +/* +** dup_m128_u8: +** mov z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u8, svuint8_t, + z0 = svdup_n_u8 (-128), + z0 = svdup_u8 (-128)) + +/* +** dup_w0_u8: +** mov z0\.b, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u8, svuint8_t, uint8_t, + z0 = svdup_n_u8 (x0), + z0 = svdup_u8 (x0)) + +/* +** dup_1_u8_m: +** mov z0\.b, p0/m, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, 1), + z0 = svdup_u8_m (z0, p0, 1)) + +/* +** dup_127_u8_m: +** mov z0\.b, p0/m, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, 127), + z0 = svdup_u8_m (z0, p0, 127)) + +/* +** dup_128_u8_m: +** mov z0\.b, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, 128), + z0 = svdup_u8_m (z0, p0, 128)) + +/* +** dup_129_u8_m: +** mov z0\.b, p0/m, #-127 +** ret +*/ +TEST_UNIFORM_Z (dup_129_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, 129), + z0 = svdup_u8_m (z0, p0, 129)) + +/* +** dup_253_u8_m: +** mov z0\.b, p0/m, #-3 +** ret +*/ +TEST_UNIFORM_Z (dup_253_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, 253), + z0 = svdup_u8_m (z0, p0, 253)) + +/* +** dup_254_u8_m: +** mov z0\.b, p0/m, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, 254), + z0 = svdup_u8_m (z0, p0, 254)) + +/* +** dup_255_u8_m: +** mov z0\.b, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, 255), + z0 = svdup_u8_m (z0, p0, 255)) + +/* +** dup_m1_u8_m: +** mov z0\.b, p0/m, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u8_m, 
svuint8_t, + z0 = svdup_n_u8_m (z0, p0, -1), + z0 = svdup_u8_m (z0, p0, -1)) + +/* +** dup_m128_u8_m: +** mov z0\.b, p0/m, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, -128), + z0 = svdup_u8_m (z0, p0, -128)) + +/* +** dup_0_u8_m: +** mov z0\.b, p0/m, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_u8_m, svuint8_t, + z0 = svdup_n_u8_m (z0, p0, 0), + z0 = svdup_u8_m (z0, p0, 0)) + +/* +** dup_w0_u8_m: +** movprfx z0, z1 +** mov z0\.b, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u8_m, svuint8_t, uint8_t, + z0 = svdup_n_u8_m (z1, p0, x0), + z0 = svdup_u8_m (z1, p0, x0)) + +/* +** dup_1_u8_z: +** mov z0\.b, p0/z, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, 1), + z0 = svdup_u8_z (p0, 1)) + +/* +** dup_127_u8_z: +** mov z0\.b, p0/z, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, 127), + z0 = svdup_u8_z (p0, 127)) + +/* +** dup_128_u8_z: +** mov z0\.b, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, 128), + z0 = svdup_u8_z (p0, 128)) + +/* +** dup_129_u8_z: +** mov z0\.b, p0/z, #-127 +** ret +*/ +TEST_UNIFORM_Z (dup_129_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, 129), + z0 = svdup_u8_z (p0, 129)) + +/* +** dup_253_u8_z: +** mov z0\.b, p0/z, #-3 +** ret +*/ +TEST_UNIFORM_Z (dup_253_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, 253), + z0 = svdup_u8_z (p0, 253)) + +/* +** dup_254_u8_z: +** mov z0\.b, p0/z, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, 254), + z0 = svdup_u8_z (p0, 254)) + +/* +** dup_255_u8_z: +** mov z0\.b, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, 255), + z0 = svdup_u8_z (p0, 255)) + +/* +** dup_m1_u8_z: +** mov z0\.b, p0/z, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, -1), + z0 = svdup_u8_z (p0, -1)) + +/* +** dup_m128_u8_z: +** mov z0\.b, p0/z, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, -128), + z0 = svdup_u8_z (p0, -128)) + +/* +** dup_0_u8_z: +** mov z0\.b, p0/z, #0 +** ret +*/ +TEST_UNIFORM_Z (dup_0_u8_z, svuint8_t, + z0 = svdup_n_u8_z (p0, 0), + z0 = svdup_u8_z (p0, 0)) + +/* +** dup_w0_u8_z: +** movprfx z0\.b, p0/z, z0\.b +** mov z0\.b, p0/m, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u8_z, svuint8_t, uint8_t, + z0 = svdup_n_u8_z (p0, x0), + z0 = svdup_u8_z (p0, x0)) + +/* +** dup_1_u8_x: +** mov z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (dup_1_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, 1), + z0 = svdup_u8_x (p0, 1)) + +/* +** dup_127_u8_x: +** mov z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (dup_127_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, 127), + z0 = svdup_u8_x (p0, 127)) + +/* +** dup_128_u8_x: +** mov z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_128_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, 128), + z0 = svdup_u8_x (p0, 128)) + +/* +** dup_129_u8_x: +** mov z0\.b, #-127 +** ret +*/ +TEST_UNIFORM_Z (dup_129_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, 129), + z0 = svdup_u8_x (p0, 129)) + +/* +** dup_253_u8_x: +** mov z0\.b, #-3 +** ret +*/ +TEST_UNIFORM_Z (dup_253_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, 253), + z0 = svdup_u8_x (p0, 253)) + +/* +** dup_254_u8_x: +** mov z0\.b, #-2 +** ret +*/ +TEST_UNIFORM_Z (dup_254_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, 254), + z0 = svdup_u8_x (p0, 254)) + +/* +** dup_255_u8_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_255_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, 255), + z0 = svdup_u8_x (p0, 255)) + +/* +** dup_m1_u8_x: +** mov 
z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (dup_m1_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, -1), + z0 = svdup_u8_x (p0, -1)) + +/* +** dup_m128_u8_x: +** mov z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (dup_m128_u8_x, svuint8_t, + z0 = svdup_n_u8_x (p0, -128), + z0 = svdup_u8_x (p0, -128)) + +/* +** dup_w0_u8_x: +** mov z0\.b, w0 +** ret +*/ +TEST_UNIFORM_ZX (dup_w0_u8_x, svuint8_t, uint8_t, + z0 = svdup_n_u8_x (p0, x0), + z0 = svdup_u8_x (p0, x0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b16.c new file mode 100644 index 000000000..ecbacd7e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b16.c @@ -0,0 +1,276 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_00_b16: +** pfalse p0\.b +** ret +*/ +TEST_UNIFORM_P (dupq_00_b16, + p0 = svdupq_n_b16 (0, 0, 0, 0, 0, 0, 0, 0), + p0 = svdupq_b16 (0, 0, 0, 0, 0, 0, 0, 0)) + +/* +** dupq_11_b16: +** ptrue p0\.d, all +** ret +*/ +TEST_UNIFORM_P (dupq_11_b16, + p0 = svdupq_n_b16 (1, 0, 0, 0, 1, 0, 0, 0), + p0 = svdupq_b16 (1, 0, 0, 0, 1, 0, 0, 0)) + +/* +** dupq_22_b16: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.d, all +** trn1 p0\.h, \1\.h, \2\.h +** | +** ptrue (p[0-7])\.d, all +** pfalse (p[0-7])\.b +** trn1 p0\.h, \4\.h, \3\.h +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_22_b16, + p0 = svdupq_n_b16 (0, 1, 0, 0, 0, 1, 0, 0), + p0 = svdupq_b16 (0, 1, 0, 0, 0, 1, 0, 0)) + +/* +** dupq_33_b16: +** ptrue (p[0-7])\.d, all +** trn1 p0\.h, \1\.h, \1\.h +** ret +*/ +TEST_UNIFORM_P (dupq_33_b16, + p0 = svdupq_n_b16 (1, 1, 0, 0, 1, 1, 0, 0), + p0 = svdupq_b16 (1, 1, 0, 0, 1, 1, 0, 0)) + +/* +** dupq_44_b16: +** ( +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.s, all +** not p0\.b, \2/z, \1\.b +** | +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.d, all +** not p0\.b, \3/z, \4\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_44_b16, + p0 = svdupq_n_b16 (0, 0, 1, 0, 0, 0, 1, 0), + p0 = svdupq_b16 (0, 0, 1, 0, 0, 0, 1, 0)) + +/* +** dupq_55_b16: +** ptrue p0\.s, all +** ret +*/ +TEST_UNIFORM_P (dupq_55_b16, + p0 = svdupq_n_b16 (1, 0, 1, 0, 1, 0, 1, 0), + p0 = svdupq_b16 (1, 0, 1, 0, 1, 0, 1, 0)) + +/* +** dupq_66_b16: +** ... +** cmpne p0\.b, p[0-7]/z, z[0-9]+\.b, #0 +** ret +*/ +TEST_UNIFORM_P (dupq_66_b16, + p0 = svdupq_n_b16 (0, 1, 1, 0, 0, 1, 1, 0), + p0 = svdupq_b16 (0, 1, 1, 0, 0, 1, 1, 0)) + +/* +** dupq_77_b16: +** ( +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.[hs], all +** trn1 p0\.h, \2\.h, \1\.h +** | +** ptrue (p[0-7])\.[hs], all +** ptrue (p[0-7])\.s, all +** trn1 p0\.h, \3\.h, \4\.h +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_77_b16, + p0 = svdupq_n_b16 (1, 1, 1, 0, 1, 1, 1, 0), + p0 = svdupq_b16 (1, 1, 1, 0, 1, 1, 1, 0)) + +/* +** dupq_88_b16: +** ( +** mov (z[0-9]+)\.d, #71776119061217280 +** ptrue (p[0-7])\.b, all +** cmpne p0\.b, \2/z, \1\.b, #0 +** | +** ptrue (p[0-7])\.b, all +** mov (z[0-9]+)\.d, #71776119061217280 +** cmpne p0\.b, \3/z, \4\.b, #0 +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_88_b16, + p0 = svdupq_n_b16 (0, 0, 0, 1, 0, 0, 0, 1), + p0 = svdupq_b16 (0, 0, 0, 1, 0, 0, 0, 1)) + +/* +** dupq_99_b16: +** ... 
+** cmpne p0\.b, p[0-7]/z, z[0-9]+\.b, #0 +** ret +*/ +TEST_UNIFORM_P (dupq_99_b16, + p0 = svdupq_n_b16 (1, 0, 0, 1, 1, 0, 0, 1), + p0 = svdupq_b16 (1, 0, 0, 1, 1, 0, 0, 1)) + +/* +** dupq_aa_b16: +** ( +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.h, all +** not p0\.b, \2/z, \1\.b +** | +** ptrue (p[0-7])\.h, all +** ptrue (p[0-7])\.s, all +** not p0\.b, \3/z, \4\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_aa_b16, + p0 = svdupq_n_b16 (0, 1, 0, 1, 0, 1, 0, 1), + p0 = svdupq_b16 (0, 1, 0, 1, 0, 1, 0, 1)) + +/* +** dupq_bb_b16: +** ( +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.[hs], all +** trn1 p0\.h, \1\.h, \2\.h +** | +** ptrue (p[0-7])\.[hs], all +** ptrue (p[0-7])\.d, all +** trn1 p0\.h, \4\.h, \3\.h +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_bb_b16, + p0 = svdupq_n_b16 (1, 1, 0, 1, 1, 1, 0, 1), + p0 = svdupq_b16 (1, 1, 0, 1, 1, 1, 0, 1)) + +/* +** dupq_cc_b16: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.h, all +** trn1 p0\.s, \1\.s, \2\.s +** | +** ptrue (p[0-7])\.h, all +** pfalse (p[0-7])\.b +** trn1 p0\.s, \4\.s, \3\.s +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_cc_b16, + p0 = svdupq_n_b16 (0, 0, 1, 1, 0, 0, 1, 1), + p0 = svdupq_b16 (0, 0, 1, 1, 0, 0, 1, 1)) + +/* +** dupq_dd_b16: +** ( +** ptrue (p[0-7])\.[sd], all +** ptrue (p[0-7])\.h, all +** trn1 p0\.s, \1\.s, \2\.s +** | +** ptrue (p[0-7])\.h, all +** ptrue (p[0-7])\.[sd], all +** trn1 p0\.s, \4\.s, \3\.s +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_dd_b16, + p0 = svdupq_n_b16 (1, 0, 1, 1, 1, 0, 1, 1), + p0 = svdupq_b16 (1, 0, 1, 1, 1, 0, 1, 1)) + +/* +** dupq_ee_b16: +** ( +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.h, all +** not p0\.b, \2/z, \1\.b +** | +** ptrue (p[0-7])\.h, all +** ptrue (p[0-7])\.d, all +** not p0\.b, \3/z, \4\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_ee_b16, + p0 = svdupq_n_b16 (0, 1, 1, 1, 0, 1, 1, 1), + p0 = svdupq_b16 (0, 1, 1, 1, 0, 1, 1, 1)) + +/* +** dupq_ff_b16: +** ptrue p0\.h, all +** ret +*/ +TEST_UNIFORM_P (dupq_ff_b16, + p0 = svdupq_n_b16 (1, 1, 1, 1, 1, 1, 1, 1), + p0 = svdupq_b16 (1, 1, 1, 1, 1, 1, 1, 1)) + +/* +** dupq_01_b16: +** ( +** ptrue (p[0-7])\.d, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \1\.d, \2\.d +** | +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.d, all +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_01_b16, + p0 = svdupq_n_b16 (1, 0, 0, 0, 0, 0, 0, 0), + p0 = svdupq_b16 (1, 0, 0, 0, 0, 0, 0, 0)) + +/* +** dupq_03_b16: +** ... +** cmpne p0\.b, p[0-7]/z, z[0-9]+\.b, #0 +** ret +*/ +TEST_UNIFORM_P (dupq_03_b16, + p0 = svdupq_n_b16 (1, 1, 0, 0, 0, 0, 0, 0), + p0 = svdupq_b16 (1, 1, 0, 0, 0, 0, 0, 0)) + +/* +** dupq_0f_b16: +** ( +** ptrue (p[0-7])\.h, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \1\.d, \2\.d +** | +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.h, all +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_0f_b16, + p0 = svdupq_n_b16 (1, 1, 1, 1, 0, 0, 0, 0), + p0 = svdupq_b16 (1, 1, 1, 1, 0, 0, 0, 0)) + +/* +** dupq_3f_b16: +** ... 
+** cmpne p0\.b, p[0-7]/z, z[0-9]+\.b, #0 +** ret +*/ +TEST_UNIFORM_P (dupq_3f_b16, + p0 = svdupq_n_b16 (1, 1, 1, 1, 1, 1, 0, 0), + p0 = svdupq_b16 (1, 1, 1, 1, 1, 1, 0, 0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b32.c new file mode 100644 index 000000000..39719a76d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b32.c @@ -0,0 +1,132 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_0_b32: +** pfalse p0\.b +** ret +*/ +TEST_UNIFORM_P (dupq_0_b32, + p0 = svdupq_n_b32 (0, 0, 0, 0), + p0 = svdupq_b32 (0, 0, 0, 0)) + +/* +** dupq_1_b32: +** ( +** ptrue (p[0-7])\.d, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \1\.d, \2\.d +** | +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.d, all +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_1_b32, + p0 = svdupq_n_b32 (1, 0, 0, 0), + p0 = svdupq_b32 (1, 0, 0, 0)) + +/* +** dupq_3_b32: +** ( +** ptrue (p[0-7])\.s, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \1\.d, \2\.d +** | +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.s, all +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_3_b32, + p0 = svdupq_n_b32 (1, 1, 0, 0), + p0 = svdupq_b32 (1, 1, 0, 0)) + +/* +** dupq_4_b32: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.d, all +** trn1 p0\.d, \1\.d, \2\.d +** | +** ptrue (p[0-7])\.d, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_4_b32, + p0 = svdupq_n_b32 (0, 0, 1, 0), + p0 = svdupq_b32 (0, 0, 1, 0)) + +/* +** dupq_5_b32: +** ptrue p0\.d, all +** ret +*/ +TEST_UNIFORM_P (dupq_5_b32, + p0 = svdupq_n_b32 (1, 0, 1, 0), + p0 = svdupq_b32 (1, 0, 1, 0)) + +/* +** dupq_7_b32: +** ( +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.d, all +** trn1 p0\.d, \1\.d, \2\.d +** | +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.s, all +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_7_b32, + p0 = svdupq_n_b32 (1, 1, 1, 0), + p0 = svdupq_b32 (1, 1, 1, 0)) + +/* +** dupq_a_b32: +** ( +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.s, all +** not p0\.b, \2/z, \1\.b +** | +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.d, all +** not p0\.b, \3/z, \4\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_a_b32, + p0 = svdupq_n_b32 (0, 1, 0, 1), + p0 = svdupq_b32 (0, 1, 0, 1)) + +/* +** dupq_e_b32: +** ( +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.s, all +** trn1 p0\.d, \1\.d, \2\.d +** | +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.d, all +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_e_b32, + p0 = svdupq_n_b32 (1, 0, 1, 1), + p0 = svdupq_b32 (1, 0, 1, 1)) + +/* +** dupq_f_b32: +** ptrue p0\.s, all +** ret +*/ +TEST_UNIFORM_P (dupq_f_b32, + p0 = svdupq_n_b32 (1, 1, 1, 1), + p0 = svdupq_b32 (1, 1, 1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b64.c new file mode 100644 index 000000000..820ace431 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b64.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_0_b64: +** pfalse p0\.b +** ret +*/ +TEST_UNIFORM_P (dupq_0_b64, + p0 = svdupq_n_b64 (0, 0), + p0 = svdupq_b64 (0, 0)) + +/* +** dupq_1_b64: +** ( +** ptrue (p[0-7])\.d, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \1\.d, \2\.d +** | +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.d, all +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ 
+TEST_UNIFORM_P (dupq_1_b64, + p0 = svdupq_n_b64 (1, 0), + p0 = svdupq_b64 (1, 0)) + +/* +** dupq_2_b64: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.d, all +** trn1 p0\.d, \1\.d, \2\.d +** | +** ptrue (p[0-7])\.d, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \4\.d, \3\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_2_b64, + p0 = svdupq_n_b64 (0, 1), + p0 = svdupq_b64 (0, 1)) + +/* +** dupq_3_b64: +** ptrue p0\.d, all +** ret +*/ +TEST_UNIFORM_P (dupq_3_b64, + p0 = svdupq_n_b64 (1, 1), + p0 = svdupq_b64 (1, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b8.c new file mode 100644 index 000000000..4762f950b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_b8.c @@ -0,0 +1,413 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_0000_b8: +** pfalse p0\.b +** ret +*/ +TEST_UNIFORM_P (dupq_0000_b8, + p0 = svdupq_n_b8 (0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0), + p0 = svdupq_b8 (0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0)) + +/* +** dupq_1111_b8: +** ptrue p0\.s, all +** ret +*/ +TEST_UNIFORM_P (dupq_1111_b8, + p0 = svdupq_n_b8 (1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0), + p0 = svdupq_b8 (1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 1, 0, 0, 0)) + +/* +** dupq_2222_b8: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.s, all +** trn1 p0\.b, \1\.b, \2\.b +** | +** ptrue (p[0-7])\.s, all +** pfalse (p[0-7])\.b +** trn1 p0\.b, \4\.b, \3\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_2222_b8, + p0 = svdupq_n_b8 (0, 1, 0, 0, 0, 1, 0, 0, + 0, 1, 0, 0, 0, 1, 0, 0), + p0 = svdupq_b8 (0, 1, 0, 0, 0, 1, 0, 0, + 0, 1, 0, 0, 0, 1, 0, 0)) + +/* +** dupq_3333_b8: +** ptrue (p[0-7])\.s, all +** trn1 p0\.b, \1\.b, \1\.b +** ret +*/ +TEST_UNIFORM_P (dupq_3333_b8, + p0 = svdupq_n_b8 (1, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 0, 0, 1, 1, 0, 0), + p0 = svdupq_b8 (1, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 0, 0, 1, 1, 0, 0)) + +/* +** dupq_4444_b8: +** ( +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.h, all +** not p0\.b, \2/z, \1\.b +** | +** ptrue (p[0-7])\.h, all +** ptrue (p[0-7])\.s, all +** not p0\.b, \3/z, \4\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_4444_b8, + p0 = svdupq_n_b8 (0, 0, 1, 0, 0, 0, 1, 0, + 0, 0, 1, 0, 0, 0, 1, 0), + p0 = svdupq_b8 (0, 0, 1, 0, 0, 0, 1, 0, + 0, 0, 1, 0, 0, 0, 1, 0)) + +/* +** dupq_5555_b8: +** ptrue p0\.h, all +** ret +*/ +TEST_UNIFORM_P (dupq_5555_b8, + p0 = svdupq_n_b8 (1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0), + p0 = svdupq_b8 (1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0)) + +/* +** dupq_6666_b8: +** ( +** mov (z[0-9]+)\.s, #16776960 +** ptrue (p[0-7])\.b, all +** cmpne p0\.b, \2/z, \1\.b, #0 +** | +** ptrue (p[0-7])\.b, all +** mov (z[0-9]+)\.s, #16776960 +** cmpne p0\.b, \3/z, \4\.b, #0 +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_6666_b8, + p0 = svdupq_n_b8 (0, 1, 1, 0, 0, 1, 1, 0, + 0, 1, 1, 0, 0, 1, 1, 0), + p0 = svdupq_b8 (0, 1, 1, 0, 0, 1, 1, 0, + 0, 1, 1, 0, 0, 1, 1, 0)) + +/* +** dupq_7777_b8: +** ( +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.[bh], all +** trn1 p0\.b, \2\.b, \1\.b +** | +** ptrue (p[0-7])\.[bh], all +** ptrue (p[0-7])\.s, all +** trn1 p0\.b, \3\.b, \4\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_7777_b8, + p0 = svdupq_n_b8 (1, 1, 1, 0, 1, 1, 1, 0, + 1, 1, 1, 0, 1, 1, 1, 0), + p0 = svdupq_b8 (1, 1, 1, 0, 1, 1, 1, 0, + 1, 1, 1, 0, 1, 1, 1, 0)) + +/* +** dupq_8888_b8: +** ( +** mov (z[0-9]+)\.s, #-16777216 +** ptrue (p[0-7])\.b, all +** cmpne p0\.b, \2/z, \1\.b, #0 +** | +** ptrue (p[0-7])\.b, 
all +** mov (z[0-9]+)\.s, #-16777216 +** cmpne p0\.b, \3/z, \4\.b, #0 +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_8888_b8, + p0 = svdupq_n_b8 (0, 0, 0, 1, 0, 0, 0, 1, + 0, 0, 0, 1, 0, 0, 0, 1), + p0 = svdupq_b8 (0, 0, 0, 1, 0, 0, 0, 1, + 0, 0, 0, 1, 0, 0, 0, 1)) + +/* +** dupq_9999_b8: +** ( +** mov (z[0-9]+)\.s, #-16776961 +** ptrue (p[0-7])\.b, all +** cmpne p0\.b, \2/z, \1\.b, #0 +** | +** ptrue (p[0-7])\.b, all +** mov (z[0-9]+)\.s, #-16776961 +** cmpne p0\.b, \3/z, \4\.b, #0 +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_9999_b8, + p0 = svdupq_n_b8 (1, 0, 0, 1, 1, 0, 0, 1, + 1, 0, 0, 1, 1, 0, 0, 1), + p0 = svdupq_b8 (1, 0, 0, 1, 1, 0, 0, 1, + 1, 0, 0, 1, 1, 0, 0, 1)) + +/* +** dupq_aaaa_b8: +** ( +** ptrue (p[0-7])\.h, all +** ptrue (p[0-7])\.b, all +** not p0\.b, \2/z, \1\.b +** | +** ptrue (p[0-7])\.b, all +** ptrue (p[0-7])\.h, all +** not p0\.b, \3/z, \4\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_aaaa_b8, + p0 = svdupq_n_b8 (0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1), + p0 = svdupq_b8 (0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1)) + +/* +** dupq_bbbb_b8: +** ( +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.[bh], all +** trn1 p0\.b, \1\.b, \2\.b +** | +** ptrue (p[0-7])\.[bh], all +** ptrue (p[0-7])\.s, all +** trn1 p0\.b, \4\.b, \3\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_bbbb_b8, + p0 = svdupq_n_b8 (1, 1, 0, 1, 1, 1, 0, 1, + 1, 1, 0, 1, 1, 1, 0, 1), + p0 = svdupq_b8 (1, 1, 0, 1, 1, 1, 0, 1, + 1, 1, 0, 1, 1, 1, 0, 1)) + +/* +** dupq_cccc_b8: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.b, all +** trn1 p0\.h, \1\.h, \2\.h +** | +** ptrue (p[0-7])\.b, all +** pfalse (p[0-7])\.b +** trn1 p0\.h, \4\.h, \3\.h +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_cccc_b8, + p0 = svdupq_n_b8 (0, 0, 1, 1, 0, 0, 1, 1, + 0, 0, 1, 1, 0, 0, 1, 1), + p0 = svdupq_b8 (0, 0, 1, 1, 0, 0, 1, 1, + 0, 0, 1, 1, 0, 0, 1, 1)) + +/* +** dupq_dddd_b8: +** ( +** ptrue (p[0-7])\.[hs], all +** ptrue (p[0-7])\.b, all +** trn1 p0\.h, \1\.h, \2\.h +** | +** ptrue (p[0-7])\.b, all +** ptrue (p[0-7])\.[hs], all +** trn1 p0\.h, \4\.h, \3\.h +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_dddd_b8, + p0 = svdupq_n_b8 (1, 0, 1, 1, 1, 0, 1, 1, + 1, 0, 1, 1, 1, 0, 1, 1), + p0 = svdupq_b8 (1, 0, 1, 1, 1, 0, 1, 1, + 1, 0, 1, 1, 1, 0, 1, 1)) + +/* +** dupq_eeee_b8: +** ( +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.b, all +** not p0\.b, \2/z, \1\.b +** | +** ptrue (p[0-7])\.b, all +** ptrue (p[0-7])\.s, all +** not p0\.b, \3/z, \4\.b +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_eeee_b8, + p0 = svdupq_n_b8 (0, 1, 1, 1, 0, 1, 1, 1, + 0, 1, 1, 1, 0, 1, 1, 1), + p0 = svdupq_b8 (0, 1, 1, 1, 0, 1, 1, 1, + 0, 1, 1, 1, 0, 1, 1, 1)) + +/* +** dupq_ffff_b8: +** ptrue p0\.b, all +** ret +*/ +TEST_UNIFORM_P (dupq_ffff_b8, + p0 = svdupq_n_b8 (1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1), + p0 = svdupq_b8 (1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1)) + +/* +** dupq_5f5f_b8: +** ( +** ptrue (p[0-7])\.h, all +** ptrue (p[0-7])\.b, all +** trn1 p0\.s, \2\.s, \1\.s +** | +** ptrue (p[0-7])\.b, all +** ptrue (p[0-7])\.h, all +** trn1 p0\.s, \3\.s, \4\.s +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_5f5f_b8, + p0 = svdupq_n_b8 (1, 1, 1, 1, 1, 0, 1, 0, + 1, 1, 1, 1, 1, 0, 1, 0), + p0 = svdupq_b8 (1, 1, 1, 1, 1, 0, 1, 0, + 1, 1, 1, 1, 1, 0, 1, 0)) + +/* +** dupq_1f1f_b8: +** ( +** ptrue (p[0-7])\.[sd], all +** ptrue (p[0-7])\.b, all +** trn1 p0\.s, \2\.s, \1\.s +** | +** ptrue (p[0-7])\.b, all +** ptrue (p[0-7])\.[sd], all +** trn1 p0\.s, \3\.s, \4\.s +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_1f1f_b8, + p0 = svdupq_n_b8 (1, 1, 1, 1, 1, 0, 0, 0, + 1, 1, 1, 1, 
1, 0, 0, 0), + p0 = svdupq_b8 (1, 1, 1, 1, 1, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0)) + +/* +** dupq_1515_b8: +** ( +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.[hs], all +** trn1 p0\.h, \2\.h, \1\.h +** | +** ptrue (p[0-7])\.[hs], all +** ptrue (p[0-7])\.d, all +** trn1 p0\.h, \3\.h, \4\.h +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_1515_b8, + p0 = svdupq_n_b8 (1, 0, 1, 0, 1, 0, 0, 0, + 1, 0, 1, 0, 1, 0, 0, 0), + p0 = svdupq_b8 (1, 0, 1, 0, 1, 0, 0, 0, + 1, 0, 1, 0, 1, 0, 0, 0)) + +/* +** dupq_0505_b8: +** ptrue (p[0-7])\.d, all +** trn1 p0\.h, \1\.h, \1\.h +** ret +*/ +TEST_UNIFORM_P (dupq_0505_b8, + p0 = svdupq_n_b8 (1, 0, 1, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 0, 0, 0, 0), + p0 = svdupq_b8 (1, 0, 1, 0, 0, 0, 0, 0, + 1, 0, 1, 0, 0, 0, 0, 0)) + +/* +** dupq_00ff_b8: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.b, all +** trn1 p0\.d, \2\.d, \1\.d +** | +** ptrue (p[0-7])\.b, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \3\.d, \4\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_00ff_b8, + p0 = svdupq_n_b8 (1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0), + p0 = svdupq_b8 (1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0)) + +/* +** dupq_0055_b8: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.h, all +** trn1 p0\.d, \2\.d, \1\.d +** | +** ptrue (p[0-7])\.h, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \3\.d, \4\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_0055_b8, + p0 = svdupq_n_b8 (1, 0, 1, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0), + p0 = svdupq_b8 (1, 0, 1, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0)) + +/* +** dupq_0011_b8: +** ( +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.s, all +** trn1 p0\.d, \2\.d, \1\.d +** | +** ptrue (p[0-7])\.s, all +** pfalse (p[0-7])\.b +** trn1 p0\.d, \3\.d, \4\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_0011_b8, + p0 = svdupq_n_b8 (1, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0), + p0 = svdupq_b8 (1, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0)) + +/* +** dupq_0111_b8: +** ( +** ptrue (p[0-7])\.d, all +** ptrue (p[0-7])\.s, all +** trn1 p0\.d, \2\.d, \1\.d +** | +** ptrue (p[0-7])\.s, all +** ptrue (p[0-7])\.d, all +** trn1 p0\.d, \3\.d, \4\.d +** ) +** ret +*/ +TEST_UNIFORM_P (dupq_0111_b8, + p0 = svdupq_n_b8 (1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0), + p0 = svdupq_b8 (1, 0, 0, 0, 1, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f16.c new file mode 100644 index 000000000..91de8344c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f16.c @@ -0,0 +1,53 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_1c_f16: +** mov z0\.s, #15360 +** ret +*/ +TEST_UNIFORM_Z (dupq_1c_f16, svfloat16_t, + z0 = svdupq_n_f16 (1.0, 0, 1.0, 0, 1.0, 0, 1.0, 0), + z0 = svdupq_f16 (1.0, 0, 1.0, 0, 1.0, 0, 1.0, 0)); + +/* +** dupq_5ic_f16: +** movi v([0-9]+)\.4s, 0x45, lsl 24 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_5ic_f16, svfloat16_t, + z0 = svdupq_n_f16 (0, 5.0, 0, 5.0, 0, 5.0, 0, 5.0), + z0 = svdupq_f16 (0, 5.0, 0, 5.0, 0, 5.0, 0, 5.0)); + + +/* +** dupq_m1c_f16: +** movi v([0-9]+)\.4s, 0xbc, lsl 8 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_m1c_f16, svfloat16_t, + z0 = svdupq_n_f16 (-1.0, 0, -1.0, 0, -1.0, 0, -1.0, 0), + z0 = svdupq_f16 (-1.0, 0, -1.0, 0, -1.0, 0, -1.0, 0)); + +/* +** dupq_40p5c_f16: +** mov (w[0-9]+), 20752 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_40p5c_f16, svfloat16_t, + z0 = svdupq_n_f16 (40.5, 0, 40.5, 0, 40.5, 0, 40.5, 0), 
+ z0 = svdupq_f16 (40.5, 0, 40.5, 0, 40.5, 0, 40.5, 0)); + +/* +** dupq_pool_f16: +** ... +** ld1rqh z0\.h, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_f16, svfloat16_t, + z0 = svdupq_n_f16 (4.75, 1.0, 9, 77, 5.25, 22, 19, 50), + z0 = svdupq_f16 (4.75, 1.0, 9, 77, 5.25, 22, 19, 50)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f32.c new file mode 100644 index 000000000..4f9c04f1a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f32.c @@ -0,0 +1,53 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_1c_f32: +** mov z0\.d, #1065353216 +** ret +*/ +TEST_UNIFORM_Z (dupq_1c_f32, svfloat32_t, + z0 = svdupq_n_f32 (1.0, 0, 1.0, 0), + z0 = svdupq_f32 (1.0, 0, 1.0, 0)); + +/* +** dupq_5ic_f32: +** mov (x[0-9]+), 4656722014701092864 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_5ic_f32, svfloat32_t, + z0 = svdupq_n_f32 (0, 5.0, 0, 5.0), + z0 = svdupq_f32 (0, 5.0, 0, 5.0)); + + +/* +** dupq_m1c_f32: +** mov (x[0-9]+), 3212836864 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_m1c_f32, svfloat32_t, + z0 = svdupq_n_f32 (-1.0, 0, -1.0, 0), + z0 = svdupq_f32 (-1.0, 0, -1.0, 0)); + +/* +** dupq_40p5c_f32: +** mov (x[0-9]+), 1109524480 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_40p5c_f32, svfloat32_t, + z0 = svdupq_n_f32 (40.5, 0, 40.5, 0), + z0 = svdupq_f32 (40.5, 0, 40.5, 0)); + +/* +** dupq_pool_f32: +** ... +** ld1rqw z0\.s, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_f32, svfloat32_t, + z0 = svdupq_n_f32 (4.5, 10.1, 7.3, 11.8), + z0 = svdupq_f32 (4.5, 10.1, 7.3, 11.8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f64.c new file mode 100644 index 000000000..27d14480e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_f64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_pool_f64: +** ... 
+** ld1rqd z0\.d, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_f64, svfloat64_t, + z0 = svdupq_n_f64 (4.5, 10.1), + z0 = svdupq_f64 (4.5, 10.1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_bf16.c new file mode 100644 index 000000000..89ae4a4c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_bf16.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_bf16_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_bf16_tied, svbfloat16_t, + z0 = svdupq_lane_bf16 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_bf16_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_bf16_untied, svbfloat16_t, + z0 = svdupq_lane_bf16 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_bf16: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_bf16, svbfloat16_t, + z0 = svdupq_lane_bf16 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_bf16: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_bf16, svbfloat16_t, + z0 = svdupq_lane_bf16 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_bf16: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_bf16, svbfloat16_t, + z0 = svdupq_lane_bf16 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f16.c new file mode 100644 index 000000000..6fa97ca3a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f16.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_f16_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_f16_tied, svfloat16_t, + z0 = svdupq_lane_f16 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_f16_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_f16_untied, svfloat16_t, + z0 = svdupq_lane_f16 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_f16: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_f16, svfloat16_t, + z0 = svdupq_lane_f16 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_f16: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_f16, svfloat16_t, + z0 = svdupq_lane_f16 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_f16: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_f16, svfloat16_t, + z0 = svdupq_lane_f16 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f32.c new file mode 100644 index 000000000..69ce5452e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f32.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_f32_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_f32_tied, svfloat32_t, + z0 = svdupq_lane_f32 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_f32_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_f32_untied, svfloat32_t, + z0 = svdupq_lane_f32 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_f32: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z 
(dupq_lane_1_f32, svfloat32_t, + z0 = svdupq_lane_f32 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_f32: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_f32, svfloat32_t, + z0 = svdupq_lane_f32 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_f32: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_f32, svfloat32_t, + z0 = svdupq_lane_f32 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f64.c new file mode 100644 index 000000000..51a8d9f2d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_f64.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_f64_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_f64_tied, svfloat64_t, + z0 = svdupq_lane_f64 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_f64_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_f64_untied, svfloat64_t, + z0 = svdupq_lane_f64 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_f64: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_f64, svfloat64_t, + z0 = svdupq_lane_f64 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_f64: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_f64, svfloat64_t, + z0 = svdupq_lane_f64 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_f64: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_f64, svfloat64_t, + z0 = svdupq_lane_f64 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s16.c new file mode 100644 index 000000000..08a0510be --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s16.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_s16_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_s16_tied, svint16_t, + z0 = svdupq_lane_s16 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_s16_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_s16_untied, svint16_t, + z0 = svdupq_lane_s16 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_s16: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_s16, svint16_t, + z0 = svdupq_lane_s16 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_s16: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_s16, svint16_t, + z0 = svdupq_lane_s16 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_s16: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_s16, svint16_t, + z0 = svdupq_lane_s16 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s32.c new file mode 100644 index 000000000..e9a9c9a60 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s32.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_s32_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_s32_tied, svint32_t, + z0 = svdupq_lane_s32 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** 
dupq_lane_0_s32_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_s32_untied, svint32_t, + z0 = svdupq_lane_s32 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_s32: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_s32, svint32_t, + z0 = svdupq_lane_s32 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_s32: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_s32, svint32_t, + z0 = svdupq_lane_s32 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_s32: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_s32, svint32_t, + z0 = svdupq_lane_s32 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s64.c new file mode 100644 index 000000000..2c6342149 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s64.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_s64_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_s64_tied, svint64_t, + z0 = svdupq_lane_s64 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_s64_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_s64_untied, svint64_t, + z0 = svdupq_lane_s64 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_s64: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_s64, svint64_t, + z0 = svdupq_lane_s64 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_s64: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_s64, svint64_t, + z0 = svdupq_lane_s64 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_s64: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_s64, svint64_t, + z0 = svdupq_lane_s64 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s8.c new file mode 100644 index 000000000..2c2e6ee72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_s8.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_s8_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_s8_tied, svint8_t, + z0 = svdupq_lane_s8 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_s8_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_s8_untied, svint8_t, + z0 = svdupq_lane_s8 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_s8: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_s8, svint8_t, + z0 = svdupq_lane_s8 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_s8: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_s8, svint8_t, + z0 = svdupq_lane_s8 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_s8: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_s8, svint8_t, + z0 = svdupq_lane_s8 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u16.c new file mode 100644 index 000000000..e5fba592f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u16.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include 
"test_sve_acle.h" + +/* +** dupq_lane_0_u16_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_u16_tied, svuint16_t, + z0 = svdupq_lane_u16 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_u16_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_u16_untied, svuint16_t, + z0 = svdupq_lane_u16 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_u16: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_u16, svuint16_t, + z0 = svdupq_lane_u16 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_u16: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_u16, svuint16_t, + z0 = svdupq_lane_u16 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_u16: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_u16, svuint16_t, + z0 = svdupq_lane_u16 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u32.c new file mode 100644 index 000000000..fb3346e45 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u32.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_u32_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_u32_tied, svuint32_t, + z0 = svdupq_lane_u32 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_u32_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_u32_untied, svuint32_t, + z0 = svdupq_lane_u32 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_u32: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_u32, svuint32_t, + z0 = svdupq_lane_u32 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_u32: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_u32, svuint32_t, + z0 = svdupq_lane_u32 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_u32: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_u32, svuint32_t, + z0 = svdupq_lane_u32 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u64.c new file mode 100644 index 000000000..22f1d5d55 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u64.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_u64_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_u64_tied, svuint64_t, + z0 = svdupq_lane_u64 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_u64_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_u64_untied, svuint64_t, + z0 = svdupq_lane_u64 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_u64: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_u64, svuint64_t, + z0 = svdupq_lane_u64 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_u64: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_u64, svuint64_t, + z0 = svdupq_lane_u64 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_u64: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_u64, svuint64_t, + z0 = svdupq_lane_u64 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u8.c new file mode 100644 index 000000000..ba16f836a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_lane_u8.c @@ -0,0 +1,48 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_lane_0_u8_tied: +** dup z0\.q, z0\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_u8_tied, svuint8_t, + z0 = svdupq_lane_u8 (z0, 0), + z0 = svdupq_lane (z0, 0)) + +/* +** dupq_lane_0_u8_untied: +** dup z0\.q, z1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_0_u8_untied, svuint8_t, + z0 = svdupq_lane_u8 (z1, 0), + z0 = svdupq_lane (z1, 0)) + +/* +** dupq_lane_1_u8: +** dup z0\.q, z0\.q\[1\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_1_u8, svuint8_t, + z0 = svdupq_lane_u8 (z0, 1), + z0 = svdupq_lane (z0, 1)) + +/* +** dupq_lane_2_u8: +** dup z0\.q, z0\.q\[2\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_2_u8, svuint8_t, + z0 = svdupq_lane_u8 (z0, 2), + z0 = svdupq_lane (z0, 2)) + +/* +** dupq_lane_3_u8: +** dup z0\.q, z0\.q\[3\] +** ret +*/ +TEST_UNIFORM_Z (dupq_lane_3_u8, svuint8_t, + z0 = svdupq_lane_u8 (z0, 3), + z0 = svdupq_lane (z0, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s16.c new file mode 100644 index 000000000..5a9a53b2d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s16.c @@ -0,0 +1,70 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_25600s_s16: +** mov z0\.s, #25600 +** ret +*/ +TEST_UNIFORM_Z (dupq_25600s_s16, svint16_t, + z0 = svdupq_n_s16 (25600, 0, 25600, 0, 25600, 0, 25600, 0), + z0 = svdupq_s16 (25600, 0, 25600, 0, 25600, 0, 25600, 0)) + +/* +** dupq_7ff00s_s16: +** mov z0\.s, #524032 +** ret +*/ +TEST_UNIFORM_Z (dupq_7ff00s_s16, svint16_t, + z0 = svdupq_n_s16 (0xff00, 7, 0xff00, 7, 0xff00, 7, 0xff00, 7), + z0 = svdupq_s16 (0xff00, 7, 0xff00, 7, 0xff00, 7, 0xff00, 7)) + +/* +** dupq_65536d_s16: +** mov z0\.d, #65536 +** ret +*/ +TEST_UNIFORM_Z (dupq_65536d_s16, svint16_t, + z0 = svdupq_n_s16 (0, 1, 0, 0, 0, 1, 0, 0), + z0 = svdupq_s16 (0, 1, 0, 0, 0, 1, 0, 0)) + +/* +** dupq_m2d_s16: +** mov z0\.d, #-2 +** ret +*/ +TEST_UNIFORM_Z (dupq_m2d_s16, svint16_t, + z0 = svdupq_n_s16 (-2, -1, -1, -1, -2, -1, -1, -1), + z0 = svdupq_s16 (-2, -1, -1, -1, -2, -1, -1, -1)) + +/* +** dupq_4ddb_s16: +** movi v([0-9]+)\.2d, 0xff0000ffff00ff +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_4ddb_s16, svint16_t, + z0 = svdupq_n_s16 (0xff, -1, 0, 0xff, 0xff, -1, 0, 0xff), + z0 = svdupq_s16 (0xff, -1, 0, 0xff, 0xff, -1, 0, 0xff)) + + +/* +** dupq_a093s_s16: +** mov (w[0-9]+), 41107 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_a093s_s16, svint16_t, + z0 = svdupq_n_s16 (0xa093, 0, 0xa093, 0, 0xa093, 0, 0xa093, 0), + z0 = svdupq_s16 (0xa093, 0, 0xa093, 0, 0xa093, 0, 0xa093, 0)); + +/* +** dupq_pool_s16: +** ... 
+** ld1rqh z0\.h, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_s16, svint16_t, + z0 = svdupq_n_s16 (4, 10, 9, 77, 52, 22, 19, 50), + z0 = svdupq_s16 (4, 10, 9, 77, 52, 22, 19, 50)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s32.c new file mode 100644 index 000000000..13b24c0db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s32.c @@ -0,0 +1,61 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_12800d_s32: +** mov z0\.d, #12800 +** ret +*/ +TEST_UNIFORM_Z (dupq_12800d_s32, svint32_t, + z0 = svdupq_n_s32 (12800, 0, 12800, 0), + z0 = svdupq_s32 (12800, 0, 12800, 0)) + +/* +** dupq_fffffffed_s32: +** mov z0\.d, #4294967294 +** ret +*/ +TEST_UNIFORM_Z (dupq_fffffffed_s32, svint32_t, + z0 = svdupq_n_s32 (-2, 0, -2, 0), + z0 = svdupq_s32 (-2, 0, -2, 0)) + +/* +** dupq_ff00ffffff00d_s32: +** movi v([0-9]+)\.2d, 0xff00ffffff00 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_ff00ffffff00d_s32, svint32_t, + z0 = svdupq_n_s32 (-256, 0xff00, -256, 0xff00), + z0 = svdupq_s32 (-256, 0xff00, -256, 0xff00)) + +/* +** dupq_fedcd_s32: +** mov (x[0-9]+), 65244 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_fedcd_s32, svint32_t, + z0 = svdupq_n_s32 (0xfedc, 0, 0xfedc, 0), + z0 = svdupq_s32 (0xfedc, 0, 0xfedc, 0)) + +/* +** dupq_1357ud_s32: +** mov (x[0-9]+), 21264383082496 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_1357ud_s32, svint32_t, + z0 = svdupq_n_s32 (0, 0x1357, 0, 0x1357), + z0 = svdupq_s32 (0, 0x1357, 0, 0x1357)) + +/* +** dupq_pool_s32: +** ... +** ld1rqw z0\.s, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_s32, svint32_t, + z0 = svdupq_n_s32 (4, 10, 9, 77), + z0 = svdupq_s32 (4, 10, 9, 77)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s64.c new file mode 100644 index 000000000..d2689fa5c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_pool_s64: +** ... 
+** ld1rqd z0\.d, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_s64, svint64_t, + z0 = svdupq_n_s64 (4, 10), + z0 = svdupq_s64 (4, 10)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s8.c new file mode 100644 index 000000000..30b36c162 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_s8.c @@ -0,0 +1,99 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_54h_s8: +** mov z0\.h, #54 +** ret +*/ +TEST_UNIFORM_Z (dupq_54h_s8, svint8_t, + z0 = svdupq_n_s8 (54, 0, 54, 0, 54, 0, 54, 0, + 54, 0, 54, 0, 54, 0, 54, 0), + z0 = svdupq_s8 (54, 0, 54, 0, 54, 0, 54, 0, + 54, 0, 54, 0, 54, 0, 54, 0)) + +/* +** dupq_2560h_s8: +** mov z0\.h, #2560 +** ret +*/ +TEST_UNIFORM_Z (dupq_2560h_s8, svint8_t, + z0 = svdupq_n_s8 (0, 10, 0, 10, 0, 10, 0, 10, + 0, 10, 0, 10, 0, 10, 0, 10), + z0 = svdupq_s8 (0, 10, 0, 10, 0, 10, 0, 10, + 0, 10, 0, 10, 0, 10, 0, 10)) + +/* +** dupq_5120s_s8: +** mov z0\.s, #5120 +** ret +*/ +TEST_UNIFORM_Z (dupq_5120s_s8, svint8_t, + z0 = svdupq_n_s8 (0, 20, 0, 0, 0, 20, 0, 0, + 0, 20, 0, 0, 0, 20, 0, 0), + z0 = svdupq_s8 (0, 20, 0, 0, 0, 20, 0, 0, + 0, 20, 0, 0, 0, 20, 0, 0)) + +/* +** dupq_1ff00s_s8: +** mov z0\.s, #130816 +** ret +*/ +TEST_UNIFORM_Z (dupq_1ff00s_s8, svint8_t, + z0 = svdupq_n_s8 (0, -1, 1, 0, 0, -1, 1, 0, + 0, -1, 1, 0, 0, -1, 1, 0), + z0 = svdupq_s8 (0, -1, 1, 0, 0, -1, 1, 0, + 0, -1, 1, 0, 0, -1, 1, 0)) + +/* +** dupq_96db_s8: +** movi v([0-9]+)\.2d, 0xff0000ff00ffff00 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_96db_s8, svint8_t, + z0 = svdupq_n_s8 (0, -1, -1, 0, -1, 0, 0, -1, + 0, -1, -1, 0, -1, 0, 0, -1), + z0 = svdupq_s8 (0, -1, -1, 0, -1, 0, 0, -1, + 0, -1, -1, 0, -1, 0, 0, -1)) + +/* +** dupq_7755h_s8: +** mov (w[0-9]+), 21879 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_7755h_s8, svint8_t, + z0 = svdupq_n_s8 (0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55), + z0 = svdupq_s8 (0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55)) + +/* +** dupq_729a0000s_s8: +** mov (w[0-9]+), 1922695168 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_729a0000s_s8, svint8_t, + z0 = svdupq_n_s8 (0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72, + 0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72), + z0 = svdupq_s8 (0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72, + 0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72)) + +/* +** dupq_pool_s8: +** ... 
+** ld1rqb z0\.b, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_s8, svint8_t, + z0 = svdupq_n_s8 (4, 10, 9, 77, 52, 22, 19, 50, + -1, 32, 44, 17, 23, 99, 53, 39), + z0 = svdupq_s8 (4, 10, 9, 77, 52, 22, 19, 50, + -1, 32, 44, 17, 23, 99, 53, 39)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u16.c new file mode 100644 index 000000000..6ca13222d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u16.c @@ -0,0 +1,70 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_25600s_u16: +** mov z0\.s, #25600 +** ret +*/ +TEST_UNIFORM_Z (dupq_25600s_u16, svuint16_t, + z0 = svdupq_n_u16 (25600, 0, 25600, 0, 25600, 0, 25600, 0), + z0 = svdupq_u16 (25600, 0, 25600, 0, 25600, 0, 25600, 0)) + +/* +** dupq_7ff00s_u16: +** mov z0\.s, #524032 +** ret +*/ +TEST_UNIFORM_Z (dupq_7ff00s_u16, svuint16_t, + z0 = svdupq_n_u16 (0xff00, 7, 0xff00, 7, 0xff00, 7, 0xff00, 7), + z0 = svdupq_u16 (0xff00, 7, 0xff00, 7, 0xff00, 7, 0xff00, 7)) + +/* +** dupq_65536d_u16: +** mov z0\.d, #65536 +** ret +*/ +TEST_UNIFORM_Z (dupq_65536d_u16, svuint16_t, + z0 = svdupq_n_u16 (0, 1, 0, 0, 0, 1, 0, 0), + z0 = svdupq_u16 (0, 1, 0, 0, 0, 1, 0, 0)) + +/* +** dupq_m2d_u16: +** mov z0\.d, #-2 +** ret +*/ +TEST_UNIFORM_Z (dupq_m2d_u16, svuint16_t, + z0 = svdupq_n_u16 (-2, -1, -1, -1, -2, -1, -1, -1), + z0 = svdupq_u16 (-2, -1, -1, -1, -2, -1, -1, -1)) + +/* +** dupq_4ddb_u16: +** movi v([0-9]+)\.2d, 0xff0000ffff00ff +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_4ddb_u16, svuint16_t, + z0 = svdupq_n_u16 (0xff, -1, 0, 0xff, 0xff, -1, 0, 0xff), + z0 = svdupq_u16 (0xff, -1, 0, 0xff, 0xff, -1, 0, 0xff)) + + +/* +** dupq_a093s_u16: +** mov (w[0-9]+), 41107 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_a093s_u16, svuint16_t, + z0 = svdupq_n_u16 (0xa093, 0, 0xa093, 0, 0xa093, 0, 0xa093, 0), + z0 = svdupq_u16 (0xa093, 0, 0xa093, 0, 0xa093, 0, 0xa093, 0)); + +/* +** dupq_pool_u16: +** ... 
+** ld1rqh z0\.h, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_u16, svuint16_t, + z0 = svdupq_n_u16 (4, 10, 9, 77, 52, 22, 19, 50), + z0 = svdupq_u16 (4, 10, 9, 77, 52, 22, 19, 50)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u32.c new file mode 100644 index 000000000..3669bf8a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u32.c @@ -0,0 +1,61 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_12800d_u32: +** mov z0\.d, #12800 +** ret +*/ +TEST_UNIFORM_Z (dupq_12800d_u32, svuint32_t, + z0 = svdupq_n_u32 (12800, 0, 12800, 0), + z0 = svdupq_u32 (12800, 0, 12800, 0)) + +/* +** dupq_fffffffed_u32: +** mov z0\.d, #4294967294 +** ret +*/ +TEST_UNIFORM_Z (dupq_fffffffed_u32, svuint32_t, + z0 = svdupq_n_u32 (-2, 0, -2, 0), + z0 = svdupq_u32 (-2, 0, -2, 0)) + +/* +** dupq_ff00ffffff00d_u32: +** movi v([0-9]+)\.2d, 0xff00ffffff00 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_ff00ffffff00d_u32, svuint32_t, + z0 = svdupq_n_u32 (-256, 0xff00, -256, 0xff00), + z0 = svdupq_u32 (-256, 0xff00, -256, 0xff00)) + +/* +** dupq_fedcd_u32: +** mov (x[0-9]+), 65244 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_fedcd_u32, svuint32_t, + z0 = svdupq_n_u32 (0xfedc, 0, 0xfedc, 0), + z0 = svdupq_u32 (0xfedc, 0, 0xfedc, 0)) + +/* +** dupq_1357ud_u32: +** mov (x[0-9]+), 21264383082496 +** mov z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_1357ud_u32, svuint32_t, + z0 = svdupq_n_u32 (0, 0x1357, 0, 0x1357), + z0 = svdupq_u32 (0, 0x1357, 0, 0x1357)) + +/* +** dupq_pool_u32: +** ... +** ld1rqw z0\.s, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_u32, svuint32_t, + z0 = svdupq_n_u32 (4, 10, 9, 77), + z0 = svdupq_u32 (4, 10, 9, 77)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u64.c new file mode 100644 index 000000000..cb655a15a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_pool_u64: +** ... 
+** ld1rqd z0\.d, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_u64, svuint64_t, + z0 = svdupq_n_u64 (4, 10), + z0 = svdupq_u64 (4, 10)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u8.c new file mode 100644 index 000000000..8b40c2b41 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/dupq_u8.c @@ -0,0 +1,99 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** dupq_54h_u8: +** mov z0\.h, #54 +** ret +*/ +TEST_UNIFORM_Z (dupq_54h_u8, svuint8_t, + z0 = svdupq_n_u8 (54, 0, 54, 0, 54, 0, 54, 0, + 54, 0, 54, 0, 54, 0, 54, 0), + z0 = svdupq_u8 (54, 0, 54, 0, 54, 0, 54, 0, + 54, 0, 54, 0, 54, 0, 54, 0)) + +/* +** dupq_2560h_u8: +** mov z0\.h, #2560 +** ret +*/ +TEST_UNIFORM_Z (dupq_2560h_u8, svuint8_t, + z0 = svdupq_n_u8 (0, 10, 0, 10, 0, 10, 0, 10, + 0, 10, 0, 10, 0, 10, 0, 10), + z0 = svdupq_u8 (0, 10, 0, 10, 0, 10, 0, 10, + 0, 10, 0, 10, 0, 10, 0, 10)) + +/* +** dupq_5120s_u8: +** mov z0\.s, #5120 +** ret +*/ +TEST_UNIFORM_Z (dupq_5120s_u8, svuint8_t, + z0 = svdupq_n_u8 (0, 20, 0, 0, 0, 20, 0, 0, + 0, 20, 0, 0, 0, 20, 0, 0), + z0 = svdupq_u8 (0, 20, 0, 0, 0, 20, 0, 0, + 0, 20, 0, 0, 0, 20, 0, 0)) + +/* +** dupq_1ff00s_u8: +** mov z0\.s, #130816 +** ret +*/ +TEST_UNIFORM_Z (dupq_1ff00s_u8, svuint8_t, + z0 = svdupq_n_u8 (0, -1, 1, 0, 0, -1, 1, 0, + 0, -1, 1, 0, 0, -1, 1, 0), + z0 = svdupq_u8 (0, -1, 1, 0, 0, -1, 1, 0, + 0, -1, 1, 0, 0, -1, 1, 0)) + +/* +** dupq_96db_u8: +** movi v([0-9]+)\.2d, 0xff0000ff00ffff00 +** dup z0\.q, z\1\.q\[0\] +** ret +*/ +TEST_UNIFORM_Z (dupq_96db_u8, svuint8_t, + z0 = svdupq_n_u8 (0, -1, -1, 0, -1, 0, 0, -1, + 0, -1, -1, 0, -1, 0, 0, -1), + z0 = svdupq_u8 (0, -1, -1, 0, -1, 0, 0, -1, + 0, -1, -1, 0, -1, 0, 0, -1)) + +/* +** dupq_7755h_u8: +** mov (w[0-9]+), 21879 +** mov z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_7755h_u8, svuint8_t, + z0 = svdupq_n_u8 (0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55), + z0 = svdupq_u8 (0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55, + 0x77, 0x55, 0x77, 0x55)) + +/* +** dupq_729a0000s_u8: +** mov (w[0-9]+), 1922695168 +** mov z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (dupq_729a0000s_u8, svuint8_t, + z0 = svdupq_n_u8 (0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72, + 0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72), + z0 = svdupq_u8 (0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72, + 0, 0, 0x9a, 0x72, 0, 0, 0x9a, 0x72)) + +/* +** dupq_pool_u8: +** ... 
+** ld1rqb z0\.b, p[0-7]/z, \[x[0-9]+\] +** ret +*/ +TEST_UNIFORM_Z (dupq_pool_u8, svuint8_t, + z0 = svdupq_n_u8 (4, 10, 9, 77, 52, 22, 19, 50, + -1, 32, 44, 17, 23, 99, 53, 39), + z0 = svdupq_u8 (4, 10, 9, 77, 52, 22, 19, 50, + -1, 32, 44, 17, 23, 99, 53, 39)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_b.c new file mode 100644 index 000000000..961ae84c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_b_z_tied1: +** eor p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) +** ret +*/ +TEST_UNIFORM_P (eor_b_z_tied1, + p0 = sveor_b_z (p3, p0, p1), + p0 = sveor_z (p3, p0, p1)) + +/* +** eor_b_z_tied2: +** eor p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) +** ret +*/ +TEST_UNIFORM_P (eor_b_z_tied2, + p0 = sveor_b_z (p3, p1, p0), + p0 = sveor_z (p3, p1, p0)) + +/* +** eor_b_z_untied: +** eor p0\.b, p3/z, (p1\.b, p2\.b|p2\.b, p1\.b) +** ret +*/ +TEST_UNIFORM_P (eor_b_z_untied, + p0 = sveor_b_z (p3, p1, p2), + p0 = sveor_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s16.c new file mode 100644 index 000000000..7cf73609a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s16.c @@ -0,0 +1,376 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_s16_m_tied1: +** eor z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (eor_s16_m_tied1, svint16_t, + z0 = sveor_s16_m (p0, z0, z1), + z0 = sveor_m (p0, z0, z1)) + +/* +** eor_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** eor z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (eor_s16_m_tied2, svint16_t, + z0 = sveor_s16_m (p0, z1, z0), + z0 = sveor_m (p0, z1, z0)) + +/* +** eor_s16_m_untied: +** movprfx z0, z1 +** eor z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (eor_s16_m_untied, svint16_t, + z0 = sveor_s16_m (p0, z1, z2), + z0 = sveor_m (p0, z1, z2)) + +/* +** eor_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s16_m_tied1, svint16_t, int16_t, + z0 = sveor_n_s16_m (p0, z0, x0), + z0 = sveor_m (p0, z0, x0)) + +/* +** eor_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s16_m_untied, svint16_t, int16_t, + z0 = sveor_n_s16_m (p0, z1, x0), + z0 = sveor_m (p0, z1, x0)) + +/* +** eor_1_s16_m_tied1: +** mov (z[0-9]+\.h), #1 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s16_m_tied1, svint16_t, + z0 = sveor_n_s16_m (p0, z0, 1), + z0 = sveor_m (p0, z0, 1)) + +/* +** eor_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s16_m_untied, svint16_t, + z0 = sveor_n_s16_m (p0, z1, 1), + z0 = sveor_m (p0, z1, 1)) + +/* +** eor_m2_s16_m: +** mov (z[0-9]+\.h), #-2 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_m2_s16_m, svint16_t, + z0 = sveor_n_s16_m (p0, z0, -2), + z0 = sveor_m (p0, z0, -2)) + +/* +** eor_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** eor z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (eor_s16_z_tied1, svint16_t, + z0 = sveor_s16_z (p0, z0, z1), + z0 = sveor_z (p0, z0, z1)) + +/* +** eor_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** eor z0\.h, p0/m, z0\.h, 
z1\.h +** ret +*/ +TEST_UNIFORM_Z (eor_s16_z_tied2, svint16_t, + z0 = sveor_s16_z (p0, z1, z0), + z0 = sveor_z (p0, z1, z0)) + +/* +** eor_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** eor z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** eor z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_s16_z_untied, svint16_t, + z0 = sveor_s16_z (p0, z1, z2), + z0 = sveor_z (p0, z1, z2)) + +/* +** eor_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s16_z_tied1, svint16_t, int16_t, + z0 = sveor_n_s16_z (p0, z0, x0), + z0 = sveor_z (p0, z0, x0)) + +/* +** eor_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** eor z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** eor z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s16_z_untied, svint16_t, int16_t, + z0 = sveor_n_s16_z (p0, z1, x0), + z0 = sveor_z (p0, z1, x0)) + +/* +** eor_1_s16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s16_z_tied1, svint16_t, + z0 = sveor_n_s16_z (p0, z0, 1), + z0 = sveor_z (p0, z0, 1)) + +/* +** eor_1_s16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** eor z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** eor z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_1_s16_z_untied, svint16_t, + z0 = sveor_n_s16_z (p0, z1, 1), + z0 = sveor_z (p0, z1, 1)) + +/* +** eor_s16_x_tied1: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s16_x_tied1, svint16_t, + z0 = sveor_s16_x (p0, z0, z1), + z0 = sveor_x (p0, z0, z1)) + +/* +** eor_s16_x_tied2: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s16_x_tied2, svint16_t, + z0 = sveor_s16_x (p0, z1, z0), + z0 = sveor_x (p0, z1, z0)) + +/* +** eor_s16_x_untied: +** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s16_x_untied, svint16_t, + z0 = sveor_s16_x (p0, z1, z2), + z0 = sveor_x (p0, z1, z2)) + +/* +** eor_w0_s16_x_tied1: +** mov (z[0-9]+)\.h, w0 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s16_x_tied1, svint16_t, int16_t, + z0 = sveor_n_s16_x (p0, z0, x0), + z0 = sveor_x (p0, z0, x0)) + +/* +** eor_w0_s16_x_untied: +** mov (z[0-9]+)\.h, w0 +** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s16_x_untied, svint16_t, int16_t, + z0 = sveor_n_s16_x (p0, z1, x0), + z0 = sveor_x (p0, z1, x0)) + +/* +** eor_1_s16_x_tied1: +** eor z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s16_x_tied1, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 1), + z0 = sveor_x (p0, z0, 1)) + +/* +** eor_1_s16_x_untied: +** movprfx z0, z1 +** eor z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s16_x_untied, svint16_t, + z0 = sveor_n_s16_x (p0, z1, 1), + z0 = sveor_x (p0, z1, 1)) + +/* +** eor_127_s16_x: +** eor z0\.h, z0\.h, #0x7f +** ret +*/ +TEST_UNIFORM_Z (eor_127_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 127), + z0 = sveor_x (p0, z0, 127)) + +/* +** eor_128_s16_x: +** eor z0\.h, z0\.h, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_128_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 128), + z0 = sveor_x (p0, z0, 128)) + +/* +** eor_255_s16_x: +** eor z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (eor_255_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 255), + z0 = sveor_x (p0, z0, 255)) + +/* +** eor_256_s16_x: +** eor z0\.h, z0\.h, #0x100 +** ret 
+*/ +TEST_UNIFORM_Z (eor_256_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 256), + z0 = sveor_x (p0, z0, 256)) + +/* +** eor_257_s16_x: +** eor z0\.h, z0\.h, #0x101 +** ret +*/ +TEST_UNIFORM_Z (eor_257_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 257), + z0 = sveor_x (p0, z0, 257)) + +/* +** eor_512_s16_x: +** eor z0\.h, z0\.h, #0x200 +** ret +*/ +TEST_UNIFORM_Z (eor_512_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 512), + z0 = sveor_x (p0, z0, 512)) + +/* +** eor_65280_s16_x: +** eor z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (eor_65280_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 0xff00), + z0 = sveor_x (p0, z0, 0xff00)) + +/* +** eor_m127_s16_x: +** eor z0\.h, z0\.h, #0xff81 +** ret +*/ +TEST_UNIFORM_Z (eor_m127_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, -127), + z0 = sveor_x (p0, z0, -127)) + +/* +** eor_m128_s16_x: +** eor z0\.h, z0\.h, #0xff80 +** ret +*/ +TEST_UNIFORM_Z (eor_m128_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, -128), + z0 = sveor_x (p0, z0, -128)) + +/* +** eor_m255_s16_x: +** eor z0\.h, z0\.h, #0xff01 +** ret +*/ +TEST_UNIFORM_Z (eor_m255_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, -255), + z0 = sveor_x (p0, z0, -255)) + +/* +** eor_m256_s16_x: +** eor z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (eor_m256_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, -256), + z0 = sveor_x (p0, z0, -256)) + +/* +** eor_m257_s16_x: +** eor z0\.h, z0\.h, #0xfeff +** ret +*/ +TEST_UNIFORM_Z (eor_m257_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, -257), + z0 = sveor_x (p0, z0, -257)) + +/* +** eor_m512_s16_x: +** eor z0\.h, z0\.h, #0xfe00 +** ret +*/ +TEST_UNIFORM_Z (eor_m512_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, -512), + z0 = sveor_x (p0, z0, -512)) + +/* +** eor_m32768_s16_x: +** eor z0\.h, z0\.h, #0x8000 +** ret +*/ +TEST_UNIFORM_Z (eor_m32768_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, -0x8000), + z0 = sveor_x (p0, z0, -0x8000)) + +/* +** eor_5_s16_x: +** mov (z[0-9]+)\.h, #5 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_5_s16_x, svint16_t, + z0 = sveor_n_s16_x (p0, z0, 5), + z0 = sveor_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s32.c new file mode 100644 index 000000000..d5aecb201 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s32.c @@ -0,0 +1,372 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_s32_m_tied1: +** eor z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (eor_s32_m_tied1, svint32_t, + z0 = sveor_s32_m (p0, z0, z1), + z0 = sveor_m (p0, z0, z1)) + +/* +** eor_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** eor z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (eor_s32_m_tied2, svint32_t, + z0 = sveor_s32_m (p0, z1, z0), + z0 = sveor_m (p0, z1, z0)) + +/* +** eor_s32_m_untied: +** movprfx z0, z1 +** eor z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (eor_s32_m_untied, svint32_t, + z0 = sveor_s32_m (p0, z1, z2), + z0 = sveor_m (p0, z1, z2)) + +/* +** eor_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s32_m_tied1, svint32_t, int32_t, + z0 = sveor_n_s32_m (p0, z0, x0), + z0 = sveor_m (p0, z0, x0)) + +/* +** eor_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s32_m_untied, svint32_t, int32_t, + z0 = sveor_n_s32_m (p0, z1, x0), + z0 = sveor_m (p0, 
z1, x0)) + +/* +** eor_1_s32_m_tied1: +** mov (z[0-9]+\.s), #1 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s32_m_tied1, svint32_t, + z0 = sveor_n_s32_m (p0, z0, 1), + z0 = sveor_m (p0, z0, 1)) + +/* +** eor_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s32_m_untied, svint32_t, + z0 = sveor_n_s32_m (p0, z1, 1), + z0 = sveor_m (p0, z1, 1)) + +/* +** eor_m2_s32_m: +** mov (z[0-9]+\.s), #-2 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_m2_s32_m, svint32_t, + z0 = sveor_n_s32_m (p0, z0, -2), + z0 = sveor_m (p0, z0, -2)) + +/* +** eor_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** eor z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (eor_s32_z_tied1, svint32_t, + z0 = sveor_s32_z (p0, z0, z1), + z0 = sveor_z (p0, z0, z1)) + +/* +** eor_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** eor z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (eor_s32_z_tied2, svint32_t, + z0 = sveor_s32_z (p0, z1, z0), + z0 = sveor_z (p0, z1, z0)) + +/* +** eor_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** eor z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** eor z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_s32_z_untied, svint32_t, + z0 = sveor_s32_z (p0, z1, z2), + z0 = sveor_z (p0, z1, z2)) + +/* +** eor_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s32_z_tied1, svint32_t, int32_t, + z0 = sveor_n_s32_z (p0, z0, x0), + z0 = sveor_z (p0, z0, x0)) + +/* +** eor_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** eor z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** eor z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s32_z_untied, svint32_t, int32_t, + z0 = sveor_n_s32_z (p0, z1, x0), + z0 = sveor_z (p0, z1, x0)) + +/* +** eor_1_s32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s32_z_tied1, svint32_t, + z0 = sveor_n_s32_z (p0, z0, 1), + z0 = sveor_z (p0, z0, 1)) + +/* +** eor_1_s32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** eor z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** eor z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_1_s32_z_untied, svint32_t, + z0 = sveor_n_s32_z (p0, z1, 1), + z0 = sveor_z (p0, z1, 1)) + +/* +** eor_s32_x_tied1: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s32_x_tied1, svint32_t, + z0 = sveor_s32_x (p0, z0, z1), + z0 = sveor_x (p0, z0, z1)) + +/* +** eor_s32_x_tied2: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s32_x_tied2, svint32_t, + z0 = sveor_s32_x (p0, z1, z0), + z0 = sveor_x (p0, z1, z0)) + +/* +** eor_s32_x_untied: +** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s32_x_untied, svint32_t, + z0 = sveor_s32_x (p0, z1, z2), + z0 = sveor_x (p0, z1, z2)) + +/* +** eor_w0_s32_x_tied1: +** mov (z[0-9]+)\.s, w0 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s32_x_tied1, svint32_t, int32_t, + z0 = sveor_n_s32_x (p0, z0, x0), + z0 = sveor_x (p0, z0, x0)) + +/* +** eor_w0_s32_x_untied: +** mov (z[0-9]+)\.s, w0 +** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s32_x_untied, svint32_t, int32_t, + z0 = sveor_n_s32_x (p0, z1, x0), + z0 = sveor_x (p0, z1, x0)) + 
+/* +** eor_1_s32_x_tied1: +** eor z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s32_x_tied1, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 1), + z0 = sveor_x (p0, z0, 1)) + +/* +** eor_1_s32_x_untied: +** movprfx z0, z1 +** eor z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s32_x_untied, svint32_t, + z0 = sveor_n_s32_x (p0, z1, 1), + z0 = sveor_x (p0, z1, 1)) + +/* +** eor_127_s32_x: +** eor z0\.s, z0\.s, #0x7f +** ret +*/ +TEST_UNIFORM_Z (eor_127_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 127), + z0 = sveor_x (p0, z0, 127)) + +/* +** eor_128_s32_x: +** eor z0\.s, z0\.s, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_128_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 128), + z0 = sveor_x (p0, z0, 128)) + +/* +** eor_255_s32_x: +** eor z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (eor_255_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 255), + z0 = sveor_x (p0, z0, 255)) + +/* +** eor_256_s32_x: +** eor z0\.s, z0\.s, #0x100 +** ret +*/ +TEST_UNIFORM_Z (eor_256_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 256), + z0 = sveor_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (eor_257_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 257), + z0 = sveor_x (p0, z0, 257)) + +/* +** eor_512_s32_x: +** eor z0\.s, z0\.s, #0x200 +** ret +*/ +TEST_UNIFORM_Z (eor_512_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 512), + z0 = sveor_x (p0, z0, 512)) + +/* +** eor_65280_s32_x: +** eor z0\.s, z0\.s, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (eor_65280_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 0xff00), + z0 = sveor_x (p0, z0, 0xff00)) + +/* +** eor_m127_s32_x: +** eor z0\.s, z0\.s, #0xffffff81 +** ret +*/ +TEST_UNIFORM_Z (eor_m127_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, -127), + z0 = sveor_x (p0, z0, -127)) + +/* +** eor_m128_s32_x: +** eor z0\.s, z0\.s, #0xffffff80 +** ret +*/ +TEST_UNIFORM_Z (eor_m128_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, -128), + z0 = sveor_x (p0, z0, -128)) + +/* +** eor_m255_s32_x: +** eor z0\.s, z0\.s, #0xffffff01 +** ret +*/ +TEST_UNIFORM_Z (eor_m255_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, -255), + z0 = sveor_x (p0, z0, -255)) + +/* +** eor_m256_s32_x: +** eor z0\.s, z0\.s, #0xffffff00 +** ret +*/ +TEST_UNIFORM_Z (eor_m256_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, -256), + z0 = sveor_x (p0, z0, -256)) + +/* +** eor_m257_s32_x: +** eor z0\.s, z0\.s, #0xfffffeff +** ret +*/ +TEST_UNIFORM_Z (eor_m257_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, -257), + z0 = sveor_x (p0, z0, -257)) + +/* +** eor_m512_s32_x: +** eor z0\.s, z0\.s, #0xfffffe00 +** ret +*/ +TEST_UNIFORM_Z (eor_m512_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, -512), + z0 = sveor_x (p0, z0, -512)) + +/* +** eor_m32768_s32_x: +** eor z0\.s, z0\.s, #0xffff8000 +** ret +*/ +TEST_UNIFORM_Z (eor_m32768_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, -0x8000), + z0 = sveor_x (p0, z0, -0x8000)) + +/* +** eor_5_s32_x: +** mov (z[0-9]+)\.s, #5 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_5_s32_x, svint32_t, + z0 = sveor_n_s32_x (p0, z0, 5), + z0 = sveor_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s64.c new file mode 100644 index 000000000..157128974 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s64.c @@ -0,0 +1,372 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_s64_m_tied1: +** eor z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z 
(eor_s64_m_tied1, svint64_t, + z0 = sveor_s64_m (p0, z0, z1), + z0 = sveor_m (p0, z0, z1)) + +/* +** eor_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_s64_m_tied2, svint64_t, + z0 = sveor_s64_m (p0, z1, z0), + z0 = sveor_m (p0, z1, z0)) + +/* +** eor_s64_m_untied: +** movprfx z0, z1 +** eor z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (eor_s64_m_untied, svint64_t, + z0 = sveor_s64_m (p0, z1, z2), + z0 = sveor_m (p0, z1, z2)) + +/* +** eor_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_s64_m_tied1, svint64_t, int64_t, + z0 = sveor_n_s64_m (p0, z0, x0), + z0 = sveor_m (p0, z0, x0)) + +/* +** eor_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_s64_m_untied, svint64_t, int64_t, + z0 = sveor_n_s64_m (p0, z1, x0), + z0 = sveor_m (p0, z1, x0)) + +/* +** eor_1_s64_m_tied1: +** mov (z[0-9]+\.d), #1 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s64_m_tied1, svint64_t, + z0 = sveor_n_s64_m (p0, z0, 1), + z0 = sveor_m (p0, z0, 1)) + +/* +** eor_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s64_m_untied, svint64_t, + z0 = sveor_n_s64_m (p0, z1, 1), + z0 = sveor_m (p0, z1, 1)) + +/* +** eor_m2_s64_m: +** mov (z[0-9]+\.d), #-2 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_m2_s64_m, svint64_t, + z0 = sveor_n_s64_m (p0, z0, -2), + z0 = sveor_m (p0, z0, -2)) + +/* +** eor_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** eor z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (eor_s64_z_tied1, svint64_t, + z0 = sveor_s64_z (p0, z0, z1), + z0 = sveor_z (p0, z0, z1)) + +/* +** eor_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** eor z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (eor_s64_z_tied2, svint64_t, + z0 = sveor_s64_z (p0, z1, z0), + z0 = sveor_z (p0, z1, z0)) + +/* +** eor_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** eor z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** eor z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_s64_z_untied, svint64_t, + z0 = sveor_s64_z (p0, z1, z2), + z0 = sveor_z (p0, z1, z2)) + +/* +** eor_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_s64_z_tied1, svint64_t, int64_t, + z0 = sveor_n_s64_z (p0, z0, x0), + z0 = sveor_z (p0, z0, x0)) + +/* +** eor_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** eor z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** eor z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_s64_z_untied, svint64_t, int64_t, + z0 = sveor_n_s64_z (p0, z1, x0), + z0 = sveor_z (p0, z1, x0)) + +/* +** eor_1_s64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s64_z_tied1, svint64_t, + z0 = sveor_n_s64_z (p0, z0, 1), + z0 = sveor_z (p0, z0, 1)) + +/* +** eor_1_s64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** eor z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** eor z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_1_s64_z_untied, svint64_t, + z0 = sveor_n_s64_z (p0, z1, 1), + z0 = sveor_z (p0, z1, 1)) + +/* +** eor_s64_x_tied1: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) 
+** ret +*/ +TEST_UNIFORM_Z (eor_s64_x_tied1, svint64_t, + z0 = sveor_s64_x (p0, z0, z1), + z0 = sveor_x (p0, z0, z1)) + +/* +** eor_s64_x_tied2: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s64_x_tied2, svint64_t, + z0 = sveor_s64_x (p0, z1, z0), + z0 = sveor_x (p0, z1, z0)) + +/* +** eor_s64_x_untied: +** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s64_x_untied, svint64_t, + z0 = sveor_s64_x (p0, z1, z2), + z0 = sveor_x (p0, z1, z2)) + +/* +** eor_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** eor z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_s64_x_tied1, svint64_t, int64_t, + z0 = sveor_n_s64_x (p0, z0, x0), + z0 = sveor_x (p0, z0, x0)) + +/* +** eor_x0_s64_x_untied: +** mov (z[0-9]+\.d), x0 +** eor z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_s64_x_untied, svint64_t, int64_t, + z0 = sveor_n_s64_x (p0, z1, x0), + z0 = sveor_x (p0, z1, x0)) + +/* +** eor_1_s64_x_tied1: +** eor z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s64_x_tied1, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 1), + z0 = sveor_x (p0, z0, 1)) + +/* +** eor_1_s64_x_untied: +** movprfx z0, z1 +** eor z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s64_x_untied, svint64_t, + z0 = sveor_n_s64_x (p0, z1, 1), + z0 = sveor_x (p0, z1, 1)) + +/* +** eor_127_s64_x: +** eor z0\.d, z0\.d, #0x7f +** ret +*/ +TEST_UNIFORM_Z (eor_127_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 127), + z0 = sveor_x (p0, z0, 127)) + +/* +** eor_128_s64_x: +** eor z0\.d, z0\.d, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_128_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 128), + z0 = sveor_x (p0, z0, 128)) + +/* +** eor_255_s64_x: +** eor z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (eor_255_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 255), + z0 = sveor_x (p0, z0, 255)) + +/* +** eor_256_s64_x: +** eor z0\.d, z0\.d, #0x100 +** ret +*/ +TEST_UNIFORM_Z (eor_256_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 256), + z0 = sveor_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (eor_257_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 257), + z0 = sveor_x (p0, z0, 257)) + +/* +** eor_512_s64_x: +** eor z0\.d, z0\.d, #0x200 +** ret +*/ +TEST_UNIFORM_Z (eor_512_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 512), + z0 = sveor_x (p0, z0, 512)) + +/* +** eor_65280_s64_x: +** eor z0\.d, z0\.d, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (eor_65280_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 0xff00), + z0 = sveor_x (p0, z0, 0xff00)) + +/* +** eor_m127_s64_x: +** eor z0\.d, z0\.d, #0xffffffffffffff81 +** ret +*/ +TEST_UNIFORM_Z (eor_m127_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, -127), + z0 = sveor_x (p0, z0, -127)) + +/* +** eor_m128_s64_x: +** eor z0\.d, z0\.d, #0xffffffffffffff80 +** ret +*/ +TEST_UNIFORM_Z (eor_m128_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, -128), + z0 = sveor_x (p0, z0, -128)) + +/* +** eor_m255_s64_x: +** eor z0\.d, z0\.d, #0xffffffffffffff01 +** ret +*/ +TEST_UNIFORM_Z (eor_m255_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, -255), + z0 = sveor_x (p0, z0, -255)) + +/* +** eor_m256_s64_x: +** eor z0\.d, z0\.d, #0xffffffffffffff00 +** ret +*/ +TEST_UNIFORM_Z (eor_m256_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, -256), + z0 = sveor_x (p0, z0, -256)) + +/* +** eor_m257_s64_x: +** eor z0\.d, z0\.d, #0xfffffffffffffeff +** ret +*/ +TEST_UNIFORM_Z (eor_m257_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, -257), + z0 = sveor_x (p0, z0, -257)) + +/* +** eor_m512_s64_x: +** eor z0\.d, z0\.d, #0xfffffffffffffe00 +** ret +*/ +TEST_UNIFORM_Z (eor_m512_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, -512), + z0 = sveor_x (p0, z0, -512)) + +/* +** eor_m32768_s64_x: +** eor z0\.d, z0\.d, #0xffffffffffff8000 +** ret +*/ +TEST_UNIFORM_Z (eor_m32768_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, -0x8000), + z0 = sveor_x (p0, z0, -0x8000)) + +/* +** eor_5_s64_x: +** mov (z[0-9]+\.d), #5 +** eor z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_5_s64_x, svint64_t, + z0 = sveor_n_s64_x (p0, z0, 5), + z0 = sveor_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s8.c new file mode 100644 index 000000000..083ac2dde --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_s8.c @@ -0,0 +1,296 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_s8_m_tied1: +** eor z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (eor_s8_m_tied1, svint8_t, + z0 = sveor_s8_m (p0, z0, z1), + z0 = sveor_m (p0, z0, z1)) + +/* +** eor_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** eor z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (eor_s8_m_tied2, svint8_t, + z0 = sveor_s8_m (p0, z1, z0), + z0 = sveor_m (p0, z1, z0)) + +/* +** eor_s8_m_untied: +** movprfx z0, z1 +** eor z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (eor_s8_m_untied, svint8_t, + z0 = sveor_s8_m (p0, z1, z2), + z0 = sveor_m (p0, z1, z2)) + +/* +** eor_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s8_m_tied1, svint8_t, int8_t, + z0 = sveor_n_s8_m (p0, z0, x0), + z0 = sveor_m (p0, z0, x0)) + +/* +** eor_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s8_m_untied, svint8_t, int8_t, + z0 = sveor_n_s8_m (p0, z1, x0), + z0 = sveor_m (p0, z1, x0)) + +/* +** eor_1_s8_m_tied1: +** mov (z[0-9]+\.b), #1 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ 
+TEST_UNIFORM_Z (eor_1_s8_m_tied1, svint8_t, + z0 = sveor_n_s8_m (p0, z0, 1), + z0 = sveor_m (p0, z0, 1)) + +/* +** eor_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s8_m_untied, svint8_t, + z0 = sveor_n_s8_m (p0, z1, 1), + z0 = sveor_m (p0, z1, 1)) + +/* +** eor_m2_s8_m: +** mov (z[0-9]+\.b), #-2 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_m2_s8_m, svint8_t, + z0 = sveor_n_s8_m (p0, z0, -2), + z0 = sveor_m (p0, z0, -2)) + +/* +** eor_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** eor z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (eor_s8_z_tied1, svint8_t, + z0 = sveor_s8_z (p0, z0, z1), + z0 = sveor_z (p0, z0, z1)) + +/* +** eor_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** eor z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (eor_s8_z_tied2, svint8_t, + z0 = sveor_s8_z (p0, z1, z0), + z0 = sveor_z (p0, z1, z0)) + +/* +** eor_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** eor z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** eor z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_s8_z_untied, svint8_t, + z0 = sveor_s8_z (p0, z1, z2), + z0 = sveor_z (p0, z1, z2)) + +/* +** eor_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s8_z_tied1, svint8_t, int8_t, + z0 = sveor_n_s8_z (p0, z0, x0), + z0 = sveor_z (p0, z0, x0)) + +/* +** eor_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** eor z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** eor z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s8_z_untied, svint8_t, int8_t, + z0 = sveor_n_s8_z (p0, z1, x0), + z0 = sveor_z (p0, z1, x0)) + +/* +** eor_1_s8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s8_z_tied1, svint8_t, + z0 = sveor_n_s8_z (p0, z0, 1), + z0 = sveor_z (p0, z0, 1)) + +/* +** eor_1_s8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** eor z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** eor z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_1_s8_z_untied, svint8_t, + z0 = sveor_n_s8_z (p0, z1, 1), + z0 = sveor_z (p0, z1, 1)) + +/* +** eor_s8_x_tied1: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s8_x_tied1, svint8_t, + z0 = sveor_s8_x (p0, z0, z1), + z0 = sveor_x (p0, z0, z1)) + +/* +** eor_s8_x_tied2: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s8_x_tied2, svint8_t, + z0 = sveor_s8_x (p0, z1, z0), + z0 = sveor_x (p0, z1, z0)) + +/* +** eor_s8_x_untied: +** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_s8_x_untied, svint8_t, + z0 = sveor_s8_x (p0, z1, z2), + z0 = sveor_x (p0, z1, z2)) + +/* +** eor_w0_s8_x_tied1: +** mov (z[0-9]+)\.b, w0 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s8_x_tied1, svint8_t, int8_t, + z0 = sveor_n_s8_x (p0, z0, x0), + z0 = sveor_x (p0, z0, x0)) + +/* +** eor_w0_s8_x_untied: +** mov (z[0-9]+)\.b, w0 +** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_s8_x_untied, svint8_t, int8_t, + z0 = sveor_n_s8_x (p0, z1, x0), + z0 = sveor_x (p0, z1, x0)) + +/* +** eor_1_s8_x_tied1: +** eor z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s8_x_tied1, svint8_t, + z0 = sveor_n_s8_x (p0, z0, 1), + z0 = sveor_x (p0, z0, 1)) 
+ +/* +** eor_1_s8_x_untied: +** movprfx z0, z1 +** eor z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_s8_x_untied, svint8_t, + z0 = sveor_n_s8_x (p0, z1, 1), + z0 = sveor_x (p0, z1, 1)) + +/* +** eor_127_s8_x: +** eor z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (eor_127_s8_x, svint8_t, + z0 = sveor_n_s8_x (p0, z0, 127), + z0 = sveor_x (p0, z0, 127)) + +/* +** eor_128_s8_x: +** eor z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_128_s8_x, svint8_t, + z0 = sveor_n_s8_x (p0, z0, 128), + z0 = sveor_x (p0, z0, 128)) + +/* +** eor_255_s8_x: +** mov (z[0-9]+)\.b, #-1 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_255_s8_x, svint8_t, + z0 = sveor_n_s8_x (p0, z0, 255), + z0 = sveor_x (p0, z0, 255)) + +/* +** eor_m127_s8_x: +** eor z0\.b, z0\.b, #0x81 +** ret +*/ +TEST_UNIFORM_Z (eor_m127_s8_x, svint8_t, + z0 = sveor_n_s8_x (p0, z0, -127), + z0 = sveor_x (p0, z0, -127)) + +/* +** eor_m128_s8_x: +** eor z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_m128_s8_x, svint8_t, + z0 = sveor_n_s8_x (p0, z0, -128), + z0 = sveor_x (p0, z0, -128)) + +/* +** eor_5_s8_x: +** mov (z[0-9]+)\.b, #5 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_5_s8_x, svint8_t, + z0 = sveor_n_s8_x (p0, z0, 5), + z0 = sveor_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u16.c new file mode 100644 index 000000000..40b43a5f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u16.c @@ -0,0 +1,376 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_u16_m_tied1: +** eor z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (eor_u16_m_tied1, svuint16_t, + z0 = sveor_u16_m (p0, z0, z1), + z0 = sveor_m (p0, z0, z1)) + +/* +** eor_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** eor z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (eor_u16_m_tied2, svuint16_t, + z0 = sveor_u16_m (p0, z1, z0), + z0 = sveor_m (p0, z1, z0)) + +/* +** eor_u16_m_untied: +** movprfx z0, z1 +** eor z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (eor_u16_m_untied, svuint16_t, + z0 = sveor_u16_m (p0, z1, z2), + z0 = sveor_m (p0, z1, z2)) + +/* +** eor_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = sveor_n_u16_m (p0, z0, x0), + z0 = sveor_m (p0, z0, x0)) + +/* +** eor_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = sveor_n_u16_m (p0, z1, x0), + z0 = sveor_m (p0, z1, x0)) + +/* +** eor_1_u16_m_tied1: +** mov (z[0-9]+\.h), #1 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u16_m_tied1, svuint16_t, + z0 = sveor_n_u16_m (p0, z0, 1), + z0 = sveor_m (p0, z0, 1)) + +/* +** eor_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u16_m_untied, svuint16_t, + z0 = sveor_n_u16_m (p0, z1, 1), + z0 = sveor_m (p0, z1, 1)) + +/* +** eor_m2_u16_m: +** mov (z[0-9]+\.h), #-2 +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_m2_u16_m, svuint16_t, + z0 = sveor_n_u16_m (p0, z0, -2), + z0 = sveor_m (p0, z0, -2)) + +/* +** eor_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** eor z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (eor_u16_z_tied1, svuint16_t, 
+ z0 = sveor_u16_z (p0, z0, z1), + z0 = sveor_z (p0, z0, z1)) + +/* +** eor_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** eor z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (eor_u16_z_tied2, svuint16_t, + z0 = sveor_u16_z (p0, z1, z0), + z0 = sveor_z (p0, z1, z0)) + +/* +** eor_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** eor z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** eor z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_u16_z_untied, svuint16_t, + z0 = sveor_u16_z (p0, z1, z2), + z0 = sveor_z (p0, z1, z2)) + +/* +** eor_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = sveor_n_u16_z (p0, z0, x0), + z0 = sveor_z (p0, z0, x0)) + +/* +** eor_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** eor z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** eor z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = sveor_n_u16_z (p0, z1, x0), + z0 = sveor_z (p0, z1, x0)) + +/* +** eor_1_u16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** eor z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u16_z_tied1, svuint16_t, + z0 = sveor_n_u16_z (p0, z0, 1), + z0 = sveor_z (p0, z0, 1)) + +/* +** eor_1_u16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** eor z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** eor z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_1_u16_z_untied, svuint16_t, + z0 = sveor_n_u16_z (p0, z1, 1), + z0 = sveor_z (p0, z1, 1)) + +/* +** eor_u16_x_tied1: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u16_x_tied1, svuint16_t, + z0 = sveor_u16_x (p0, z0, z1), + z0 = sveor_x (p0, z0, z1)) + +/* +** eor_u16_x_tied2: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u16_x_tied2, svuint16_t, + z0 = sveor_u16_x (p0, z1, z0), + z0 = sveor_x (p0, z1, z0)) + +/* +** eor_u16_x_untied: +** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u16_x_untied, svuint16_t, + z0 = sveor_u16_x (p0, z1, z2), + z0 = sveor_x (p0, z1, z2)) + +/* +** eor_w0_u16_x_tied1: +** mov (z[0-9]+)\.h, w0 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = sveor_n_u16_x (p0, z0, x0), + z0 = sveor_x (p0, z0, x0)) + +/* +** eor_w0_u16_x_untied: +** mov (z[0-9]+)\.h, w0 +** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = sveor_n_u16_x (p0, z1, x0), + z0 = sveor_x (p0, z1, x0)) + +/* +** eor_1_u16_x_tied1: +** eor z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u16_x_tied1, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 1), + z0 = sveor_x (p0, z0, 1)) + +/* +** eor_1_u16_x_untied: +** movprfx z0, z1 +** eor z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u16_x_untied, svuint16_t, + z0 = sveor_n_u16_x (p0, z1, 1), + z0 = sveor_x (p0, z1, 1)) + +/* +** eor_127_u16_x: +** eor z0\.h, z0\.h, #0x7f +** ret +*/ +TEST_UNIFORM_Z (eor_127_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 127), + z0 = sveor_x (p0, z0, 127)) + +/* +** eor_128_u16_x: +** eor z0\.h, z0\.h, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_128_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 128), + z0 = sveor_x (p0, z0, 128)) + +/* +** eor_255_u16_x: +** eor z0\.h, z0\.h, #0xff +** ret +*/ 
+TEST_UNIFORM_Z (eor_255_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 255), + z0 = sveor_x (p0, z0, 255)) + +/* +** eor_256_u16_x: +** eor z0\.h, z0\.h, #0x100 +** ret +*/ +TEST_UNIFORM_Z (eor_256_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 256), + z0 = sveor_x (p0, z0, 256)) + +/* +** eor_257_u16_x: +** eor z0\.h, z0\.h, #0x101 +** ret +*/ +TEST_UNIFORM_Z (eor_257_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 257), + z0 = sveor_x (p0, z0, 257)) + +/* +** eor_512_u16_x: +** eor z0\.h, z0\.h, #0x200 +** ret +*/ +TEST_UNIFORM_Z (eor_512_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 512), + z0 = sveor_x (p0, z0, 512)) + +/* +** eor_65280_u16_x: +** eor z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (eor_65280_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 0xff00), + z0 = sveor_x (p0, z0, 0xff00)) + +/* +** eor_m127_u16_x: +** eor z0\.h, z0\.h, #0xff81 +** ret +*/ +TEST_UNIFORM_Z (eor_m127_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, -127), + z0 = sveor_x (p0, z0, -127)) + +/* +** eor_m128_u16_x: +** eor z0\.h, z0\.h, #0xff80 +** ret +*/ +TEST_UNIFORM_Z (eor_m128_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, -128), + z0 = sveor_x (p0, z0, -128)) + +/* +** eor_m255_u16_x: +** eor z0\.h, z0\.h, #0xff01 +** ret +*/ +TEST_UNIFORM_Z (eor_m255_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, -255), + z0 = sveor_x (p0, z0, -255)) + +/* +** eor_m256_u16_x: +** eor z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (eor_m256_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, -256), + z0 = sveor_x (p0, z0, -256)) + +/* +** eor_m257_u16_x: +** eor z0\.h, z0\.h, #0xfeff +** ret +*/ +TEST_UNIFORM_Z (eor_m257_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, -257), + z0 = sveor_x (p0, z0, -257)) + +/* +** eor_m512_u16_x: +** eor z0\.h, z0\.h, #0xfe00 +** ret +*/ +TEST_UNIFORM_Z (eor_m512_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, -512), + z0 = sveor_x (p0, z0, -512)) + +/* +** eor_m32768_u16_x: +** eor z0\.h, z0\.h, #0x8000 +** ret +*/ +TEST_UNIFORM_Z (eor_m32768_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, -0x8000), + z0 = sveor_x (p0, z0, -0x8000)) + +/* +** eor_5_u16_x: +** mov (z[0-9]+)\.h, #5 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_5_u16_x, svuint16_t, + z0 = sveor_n_u16_x (p0, z0, 5), + z0 = sveor_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u32.c new file mode 100644 index 000000000..8e46d08ca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u32.c @@ -0,0 +1,372 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_u32_m_tied1: +** eor z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (eor_u32_m_tied1, svuint32_t, + z0 = sveor_u32_m (p0, z0, z1), + z0 = sveor_m (p0, z0, z1)) + +/* +** eor_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** eor z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (eor_u32_m_tied2, svuint32_t, + z0 = sveor_u32_m (p0, z1, z0), + z0 = sveor_m (p0, z1, z0)) + +/* +** eor_u32_m_untied: +** movprfx z0, z1 +** eor z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (eor_u32_m_untied, svuint32_t, + z0 = sveor_u32_m (p0, z1, z2), + z0 = sveor_m (p0, z1, z2)) + +/* +** eor_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = sveor_n_u32_m (p0, z0, x0), + z0 = sveor_m (p0, z0, x0)) + +/* +** eor_w0_u32_m_untied: +** mov 
(z[0-9]+\.s), w0 +** movprfx z0, z1 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = sveor_n_u32_m (p0, z1, x0), + z0 = sveor_m (p0, z1, x0)) + +/* +** eor_1_u32_m_tied1: +** mov (z[0-9]+\.s), #1 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u32_m_tied1, svuint32_t, + z0 = sveor_n_u32_m (p0, z0, 1), + z0 = sveor_m (p0, z0, 1)) + +/* +** eor_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u32_m_untied, svuint32_t, + z0 = sveor_n_u32_m (p0, z1, 1), + z0 = sveor_m (p0, z1, 1)) + +/* +** eor_m2_u32_m: +** mov (z[0-9]+\.s), #-2 +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_m2_u32_m, svuint32_t, + z0 = sveor_n_u32_m (p0, z0, -2), + z0 = sveor_m (p0, z0, -2)) + +/* +** eor_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** eor z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (eor_u32_z_tied1, svuint32_t, + z0 = sveor_u32_z (p0, z0, z1), + z0 = sveor_z (p0, z0, z1)) + +/* +** eor_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** eor z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (eor_u32_z_tied2, svuint32_t, + z0 = sveor_u32_z (p0, z1, z0), + z0 = sveor_z (p0, z1, z0)) + +/* +** eor_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** eor z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** eor z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_u32_z_untied, svuint32_t, + z0 = sveor_u32_z (p0, z1, z2), + z0 = sveor_z (p0, z1, z2)) + +/* +** eor_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = sveor_n_u32_z (p0, z0, x0), + z0 = sveor_z (p0, z0, x0)) + +/* +** eor_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** eor z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** eor z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = sveor_n_u32_z (p0, z1, x0), + z0 = sveor_z (p0, z1, x0)) + +/* +** eor_1_u32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** eor z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u32_z_tied1, svuint32_t, + z0 = sveor_n_u32_z (p0, z0, 1), + z0 = sveor_z (p0, z0, 1)) + +/* +** eor_1_u32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** eor z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** eor z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_1_u32_z_untied, svuint32_t, + z0 = sveor_n_u32_z (p0, z1, 1), + z0 = sveor_z (p0, z1, 1)) + +/* +** eor_u32_x_tied1: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u32_x_tied1, svuint32_t, + z0 = sveor_u32_x (p0, z0, z1), + z0 = sveor_x (p0, z0, z1)) + +/* +** eor_u32_x_tied2: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u32_x_tied2, svuint32_t, + z0 = sveor_u32_x (p0, z1, z0), + z0 = sveor_x (p0, z1, z0)) + +/* +** eor_u32_x_untied: +** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u32_x_untied, svuint32_t, + z0 = sveor_u32_x (p0, z1, z2), + z0 = sveor_x (p0, z1, z2)) + +/* +** eor_w0_u32_x_tied1: +** mov (z[0-9]+)\.s, w0 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = sveor_n_u32_x (p0, z0, x0), + z0 = sveor_x (p0, z0, x0)) + +/* +** 
eor_w0_u32_x_untied: +** mov (z[0-9]+)\.s, w0 +** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = sveor_n_u32_x (p0, z1, x0), + z0 = sveor_x (p0, z1, x0)) + +/* +** eor_1_u32_x_tied1: +** eor z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u32_x_tied1, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 1), + z0 = sveor_x (p0, z0, 1)) + +/* +** eor_1_u32_x_untied: +** movprfx z0, z1 +** eor z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u32_x_untied, svuint32_t, + z0 = sveor_n_u32_x (p0, z1, 1), + z0 = sveor_x (p0, z1, 1)) + +/* +** eor_127_u32_x: +** eor z0\.s, z0\.s, #0x7f +** ret +*/ +TEST_UNIFORM_Z (eor_127_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 127), + z0 = sveor_x (p0, z0, 127)) + +/* +** eor_128_u32_x: +** eor z0\.s, z0\.s, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_128_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 128), + z0 = sveor_x (p0, z0, 128)) + +/* +** eor_255_u32_x: +** eor z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (eor_255_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 255), + z0 = sveor_x (p0, z0, 255)) + +/* +** eor_256_u32_x: +** eor z0\.s, z0\.s, #0x100 +** ret +*/ +TEST_UNIFORM_Z (eor_256_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 256), + z0 = sveor_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (eor_257_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 257), + z0 = sveor_x (p0, z0, 257)) + +/* +** eor_512_u32_x: +** eor z0\.s, z0\.s, #0x200 +** ret +*/ +TEST_UNIFORM_Z (eor_512_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 512), + z0 = sveor_x (p0, z0, 512)) + +/* +** eor_65280_u32_x: +** eor z0\.s, z0\.s, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (eor_65280_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 0xff00), + z0 = sveor_x (p0, z0, 0xff00)) + +/* +** eor_m127_u32_x: +** eor z0\.s, z0\.s, #0xffffff81 +** ret +*/ +TEST_UNIFORM_Z (eor_m127_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, -127), + z0 = sveor_x (p0, z0, -127)) + +/* +** eor_m128_u32_x: +** eor z0\.s, z0\.s, #0xffffff80 +** ret +*/ +TEST_UNIFORM_Z (eor_m128_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, -128), + z0 = sveor_x (p0, z0, -128)) + +/* +** eor_m255_u32_x: +** eor z0\.s, z0\.s, #0xffffff01 +** ret +*/ +TEST_UNIFORM_Z (eor_m255_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, -255), + z0 = sveor_x (p0, z0, -255)) + +/* +** eor_m256_u32_x: +** eor z0\.s, z0\.s, #0xffffff00 +** ret +*/ +TEST_UNIFORM_Z (eor_m256_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, -256), + z0 = sveor_x (p0, z0, -256)) + +/* +** eor_m257_u32_x: +** eor z0\.s, z0\.s, #0xfffffeff +** ret +*/ +TEST_UNIFORM_Z (eor_m257_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, -257), + z0 = sveor_x (p0, z0, -257)) + +/* +** eor_m512_u32_x: +** eor z0\.s, z0\.s, #0xfffffe00 +** ret +*/ +TEST_UNIFORM_Z (eor_m512_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, -512), + z0 = sveor_x (p0, z0, -512)) + +/* +** eor_m32768_u32_x: +** eor z0\.s, z0\.s, #0xffff8000 +** ret +*/ +TEST_UNIFORM_Z (eor_m32768_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, -0x8000), + z0 = sveor_x (p0, z0, -0x8000)) + +/* +** eor_5_u32_x: +** mov (z[0-9]+)\.s, #5 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_5_u32_x, svuint32_t, + z0 = sveor_n_u32_x (p0, z0, 5), + z0 = sveor_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u64.c new file mode 100644 index 000000000..a82398f91 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u64.c @@ -0,0 +1,372 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_u64_m_tied1: +** eor z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (eor_u64_m_tied1, svuint64_t, + z0 = sveor_u64_m (p0, z0, z1), + z0 = sveor_m (p0, z0, z1)) + +/* +** eor_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_u64_m_tied2, svuint64_t, + z0 = sveor_u64_m (p0, z1, z0), + z0 = sveor_m (p0, z1, z0)) + +/* +** eor_u64_m_untied: +** movprfx z0, z1 +** eor z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (eor_u64_m_untied, svuint64_t, + z0 = sveor_u64_m (p0, z1, z2), + z0 = sveor_m (p0, z1, z2)) + +/* +** eor_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = sveor_n_u64_m (p0, z0, x0), + z0 = sveor_m (p0, z0, x0)) + +/* +** eor_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = sveor_n_u64_m (p0, z1, x0), + z0 = sveor_m (p0, z1, x0)) + +/* +** eor_1_u64_m_tied1: +** mov (z[0-9]+\.d), #1 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u64_m_tied1, svuint64_t, + z0 = sveor_n_u64_m (p0, z0, 1), + z0 = sveor_m (p0, z0, 1)) + +/* +** eor_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u64_m_untied, svuint64_t, + z0 = sveor_n_u64_m (p0, z1, 1), + z0 = sveor_m (p0, z1, 1)) + +/* +** eor_m2_u64_m: +** mov (z[0-9]+\.d), #-2 +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_m2_u64_m, svuint64_t, + z0 = sveor_n_u64_m (p0, z0, -2), + z0 = sveor_m (p0, z0, -2)) + +/* +** eor_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** eor z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (eor_u64_z_tied1, svuint64_t, + z0 = sveor_u64_z (p0, z0, z1), + z0 = sveor_z (p0, z0, z1)) + +/* +** eor_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** eor z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (eor_u64_z_tied2, svuint64_t, + z0 = sveor_u64_z (p0, z1, z0), + z0 = sveor_z (p0, z1, z0)) + +/* +** eor_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** eor z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** eor z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_u64_z_untied, svuint64_t, + z0 = sveor_u64_z (p0, z1, z2), + z0 = sveor_z (p0, z1, z2)) + +/* +** eor_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = sveor_n_u64_z (p0, z0, x0), + z0 = sveor_z (p0, z0, x0)) + +/* +** eor_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** eor z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** eor z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = sveor_n_u64_z (p0, z1, x0), + z0 = sveor_z (p0, z1, x0)) + +/* +** eor_1_u64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** eor z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u64_z_tied1, svuint64_t, + z0 = sveor_n_u64_z (p0, z0, 1), + z0 = sveor_z (p0, z0, 1)) + +/* +** eor_1_u64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** eor z0\.d, 
p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** eor z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_1_u64_z_untied, svuint64_t, + z0 = sveor_n_u64_z (p0, z1, 1), + z0 = sveor_z (p0, z1, 1)) + +/* +** eor_u64_x_tied1: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u64_x_tied1, svuint64_t, + z0 = sveor_u64_x (p0, z0, z1), + z0 = sveor_x (p0, z0, z1)) + +/* +** eor_u64_x_tied2: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u64_x_tied2, svuint64_t, + z0 = sveor_u64_x (p0, z1, z0), + z0 = sveor_x (p0, z1, z0)) + +/* +** eor_u64_x_untied: +** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u64_x_untied, svuint64_t, + z0 = sveor_u64_x (p0, z1, z2), + z0 = sveor_x (p0, z1, z2)) + +/* +** eor_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** eor z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = sveor_n_u64_x (p0, z0, x0), + z0 = sveor_x (p0, z0, x0)) + +/* +** eor_x0_u64_x_untied: +** mov (z[0-9]+\.d), x0 +** eor z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = sveor_n_u64_x (p0, z1, x0), + z0 = sveor_x (p0, z1, x0)) + +/* +** eor_1_u64_x_tied1: +** eor z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u64_x_tied1, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 1), + z0 = sveor_x (p0, z0, 1)) + +/* +** eor_1_u64_x_untied: +** movprfx z0, z1 +** eor z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u64_x_untied, svuint64_t, + z0 = sveor_n_u64_x (p0, z1, 1), + z0 = sveor_x (p0, z1, 1)) + +/* +** eor_127_u64_x: +** eor z0\.d, z0\.d, #0x7f +** ret +*/ +TEST_UNIFORM_Z (eor_127_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 127), + z0 = sveor_x (p0, z0, 127)) + +/* +** eor_128_u64_x: +** eor z0\.d, z0\.d, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_128_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 128), + z0 = sveor_x (p0, z0, 128)) + +/* +** eor_255_u64_x: +** eor z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (eor_255_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 255), + z0 = sveor_x (p0, z0, 255)) + +/* +** eor_256_u64_x: +** eor z0\.d, z0\.d, #0x100 +** ret +*/ +TEST_UNIFORM_Z (eor_256_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 256), + z0 = sveor_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (eor_257_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 257), + z0 = sveor_x (p0, z0, 257)) + +/* +** eor_512_u64_x: +** eor z0\.d, z0\.d, #0x200 +** ret +*/ +TEST_UNIFORM_Z (eor_512_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 512), + z0 = sveor_x (p0, z0, 512)) + +/* +** eor_65280_u64_x: +** eor z0\.d, z0\.d, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (eor_65280_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 0xff00), + z0 = sveor_x (p0, z0, 0xff00)) + +/* +** eor_m127_u64_x: +** eor z0\.d, z0\.d, #0xffffffffffffff81 +** ret +*/ +TEST_UNIFORM_Z (eor_m127_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, -127), + z0 = sveor_x (p0, z0, -127)) + +/* +** eor_m128_u64_x: +** eor z0\.d, z0\.d, #0xffffffffffffff80 +** ret +*/ +TEST_UNIFORM_Z (eor_m128_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, -128), + z0 = sveor_x (p0, z0, -128)) + +/* +** eor_m255_u64_x: +** eor z0\.d, z0\.d, #0xffffffffffffff01 +** ret +*/ +TEST_UNIFORM_Z (eor_m255_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, -255), + z0 = sveor_x (p0, z0, -255)) + +/* +** eor_m256_u64_x: +** eor z0\.d, z0\.d, #0xffffffffffffff00 +** ret +*/ +TEST_UNIFORM_Z (eor_m256_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, -256), + z0 = sveor_x (p0, z0, -256)) + +/* +** eor_m257_u64_x: +** eor z0\.d, z0\.d, #0xfffffffffffffeff +** ret +*/ +TEST_UNIFORM_Z (eor_m257_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, -257), + z0 = sveor_x (p0, z0, -257)) + +/* +** eor_m512_u64_x: +** eor z0\.d, z0\.d, #0xfffffffffffffe00 +** ret +*/ +TEST_UNIFORM_Z (eor_m512_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, -512), + z0 = sveor_x (p0, z0, -512)) + +/* +** eor_m32768_u64_x: +** eor z0\.d, z0\.d, #0xffffffffffff8000 +** ret +*/ +TEST_UNIFORM_Z (eor_m32768_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, -0x8000), + z0 = sveor_x (p0, z0, -0x8000)) + +/* +** eor_5_u64_x: +** mov (z[0-9]+\.d), #5 +** eor z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_5_u64_x, svuint64_t, + z0 = sveor_n_u64_x (p0, z0, 5), + z0 = sveor_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u8.c new file mode 100644 index 000000000..006637699 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eor_u8.c @@ -0,0 +1,296 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eor_u8_m_tied1: +** eor z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (eor_u8_m_tied1, svuint8_t, + z0 = sveor_u8_m (p0, z0, z1), + z0 = sveor_m (p0, z0, z1)) + +/* +** eor_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** eor z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (eor_u8_m_tied2, svuint8_t, + z0 = sveor_u8_m (p0, z1, z0), + z0 = sveor_m (p0, z1, z0)) + +/* +** eor_u8_m_untied: +** movprfx z0, z1 +** eor z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (eor_u8_m_untied, svuint8_t, + z0 = sveor_u8_m (p0, z1, z2), + z0 = sveor_m (p0, z1, z2)) + +/* +** eor_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = sveor_n_u8_m (p0, z0, x0), + z0 = sveor_m (p0, z0, x0)) + +/* +** eor_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = sveor_n_u8_m (p0, z1, x0), + z0 = sveor_m (p0, z1, x0)) + +/* +** eor_1_u8_m_tied1: +** mov (z[0-9]+\.b), #1 +** eor z0\.b, p0/m, z0\.b, 
\1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u8_m_tied1, svuint8_t, + z0 = sveor_n_u8_m (p0, z0, 1), + z0 = sveor_m (p0, z0, 1)) + +/* +** eor_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u8_m_untied, svuint8_t, + z0 = sveor_n_u8_m (p0, z1, 1), + z0 = sveor_m (p0, z1, 1)) + +/* +** eor_m2_u8_m: +** mov (z[0-9]+\.b), #-2 +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_m2_u8_m, svuint8_t, + z0 = sveor_n_u8_m (p0, z0, -2), + z0 = sveor_m (p0, z0, -2)) + +/* +** eor_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** eor z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (eor_u8_z_tied1, svuint8_t, + z0 = sveor_u8_z (p0, z0, z1), + z0 = sveor_z (p0, z0, z1)) + +/* +** eor_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** eor z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (eor_u8_z_tied2, svuint8_t, + z0 = sveor_u8_z (p0, z1, z0), + z0 = sveor_z (p0, z1, z0)) + +/* +** eor_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** eor z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** eor z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_u8_z_untied, svuint8_t, + z0 = sveor_u8_z (p0, z1, z2), + z0 = sveor_z (p0, z1, z2)) + +/* +** eor_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = sveor_n_u8_z (p0, z0, x0), + z0 = sveor_z (p0, z0, x0)) + +/* +** eor_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** eor z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** eor z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = sveor_n_u8_z (p0, z1, x0), + z0 = sveor_z (p0, z1, x0)) + +/* +** eor_1_u8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** eor z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u8_z_tied1, svuint8_t, + z0 = sveor_n_u8_z (p0, z0, 1), + z0 = sveor_z (p0, z0, 1)) + +/* +** eor_1_u8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** eor z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** eor z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (eor_1_u8_z_untied, svuint8_t, + z0 = sveor_n_u8_z (p0, z1, 1), + z0 = sveor_z (p0, z1, 1)) + +/* +** eor_u8_x_tied1: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u8_x_tied1, svuint8_t, + z0 = sveor_u8_x (p0, z0, z1), + z0 = sveor_x (p0, z0, z1)) + +/* +** eor_u8_x_tied2: +** eor z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u8_x_tied2, svuint8_t, + z0 = sveor_u8_x (p0, z1, z0), + z0 = sveor_x (p0, z1, z0)) + +/* +** eor_u8_x_untied: +** eor z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_u8_x_untied, svuint8_t, + z0 = sveor_u8_x (p0, z1, z2), + z0 = sveor_x (p0, z1, z2)) + +/* +** eor_w0_u8_x_tied1: +** mov (z[0-9]+)\.b, w0 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = sveor_n_u8_x (p0, z0, x0), + z0 = sveor_x (p0, z0, x0)) + +/* +** eor_w0_u8_x_untied: +** mov (z[0-9]+)\.b, w0 +** eor z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (eor_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = sveor_n_u8_x (p0, z1, x0), + z0 = sveor_x (p0, z1, x0)) + +/* +** eor_1_u8_x_tied1: +** eor z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u8_x_tied1, svuint8_t, + z0 = sveor_n_u8_x (p0, 
z0, 1), + z0 = sveor_x (p0, z0, 1)) + +/* +** eor_1_u8_x_untied: +** movprfx z0, z1 +** eor z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (eor_1_u8_x_untied, svuint8_t, + z0 = sveor_n_u8_x (p0, z1, 1), + z0 = sveor_x (p0, z1, 1)) + +/* +** eor_127_u8_x: +** eor z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (eor_127_u8_x, svuint8_t, + z0 = sveor_n_u8_x (p0, z0, 127), + z0 = sveor_x (p0, z0, 127)) + +/* +** eor_128_u8_x: +** eor z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_128_u8_x, svuint8_t, + z0 = sveor_n_u8_x (p0, z0, 128), + z0 = sveor_x (p0, z0, 128)) + +/* +** eor_255_u8_x: +** mov (z[0-9]+)\.b, #-1 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_255_u8_x, svuint8_t, + z0 = sveor_n_u8_x (p0, z0, 255), + z0 = sveor_x (p0, z0, 255)) + +/* +** eor_m127_u8_x: +** eor z0\.b, z0\.b, #0x81 +** ret +*/ +TEST_UNIFORM_Z (eor_m127_u8_x, svuint8_t, + z0 = sveor_n_u8_x (p0, z0, -127), + z0 = sveor_x (p0, z0, -127)) + +/* +** eor_m128_u8_x: +** eor z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (eor_m128_u8_x, svuint8_t, + z0 = sveor_n_u8_x (p0, z0, -128), + z0 = sveor_x (p0, z0, -128)) + +/* +** eor_5_u8_x: +** mov (z[0-9]+)\.b, #5 +** eor z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (eor_5_u8_x, svuint8_t, + z0 = sveor_n_u8_x (p0, z0, 5), + z0 = sveor_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s16.c new file mode 100644 index 000000000..0675d7ed9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eorv_x0_s16: +** eorv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (eorv_x0_s16, int16_t, svint16_t, + x0 = sveorv_s16 (p0, z0), + x0 = sveorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s32.c new file mode 100644 index 000000000..9c0c1089f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eorv_x0_s32: +** eorv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (eorv_x0_s32, int32_t, svint32_t, + x0 = sveorv_s32 (p0, z0), + x0 = sveorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s64.c new file mode 100644 index 000000000..7a474556c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eorv_x0_s64: +** eorv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (eorv_x0_s64, int64_t, svint64_t, + x0 = sveorv_s64 (p0, z0), + x0 = sveorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s8.c new file mode 100644 index 000000000..43f056d3a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_s8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eorv_x0_s8: +** eorv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (eorv_x0_s8, int8_t, svint8_t, + x0 = sveorv_s8 (p0, z0), + x0 = sveorv 
(p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u16.c new file mode 100644 index 000000000..5f7836db4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eorv_x0_u16: +** eorv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (eorv_x0_u16, uint16_t, svuint16_t, + x0 = sveorv_u16 (p0, z0), + x0 = sveorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u32.c new file mode 100644 index 000000000..f112a0dc2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eorv_x0_u32: +** eorv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (eorv_x0_u32, uint32_t, svuint32_t, + x0 = sveorv_u32 (p0, z0), + x0 = sveorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u64.c new file mode 100644 index 000000000..5f8b8f86b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eorv_x0_u64: +** eorv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (eorv_x0_u64, uint64_t, svuint64_t, + x0 = sveorv_u64 (p0, z0), + x0 = sveorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u8.c new file mode 100644 index 000000000..eed4d4915 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/eorv_u8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** eorv_x0_u8: +** eorv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (eorv_x0_u8, uint8_t, svuint8_t, + x0 = sveorv_u8 (p0, z0), + x0 = sveorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f16.c new file mode 100644 index 000000000..5a5411e46 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** expa_f16_tied1: +** fexpa z0\.h, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (expa_f16_tied1, svfloat16_t, svuint16_t, + z0_res = svexpa_f16 (z0), + z0_res = svexpa (z0)) + +/* +** expa_f16_untied: +** fexpa z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (expa_f16_untied, svfloat16_t, svuint16_t, + z0 = svexpa_f16 (z4), + z0 = svexpa (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f32.c new file mode 100644 index 000000000..4ded1c575 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** expa_f32_tied1: +** fexpa z0\.s, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (expa_f32_tied1, svfloat32_t, svuint32_t, + z0_res = svexpa_f32 (z0), + z0_res = svexpa (z0)) + +/* +** expa_f32_untied: +** fexpa z0\.s, z4\.s +** ret +*/ 
+TEST_DUAL_Z (expa_f32_untied, svfloat32_t, svuint32_t, + z0 = svexpa_f32 (z4), + z0 = svexpa (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f64.c new file mode 100644 index 000000000..c31f9ccb5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/expa_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** expa_f64_tied1: +** fexpa z0\.d, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (expa_f64_tied1, svfloat64_t, svuint64_t, + z0_res = svexpa_f64 (z0), + z0_res = svexpa (z0)) + +/* +** expa_f64_untied: +** fexpa z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (expa_f64_untied, svfloat64_t, svuint64_t, + z0 = svexpa_f64 (z4), + z0 = svexpa (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_bf16.c new file mode 100644 index 000000000..f982873c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_bf16.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_bf16_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_bf16_tied1, svbfloat16_t, + z0 = svext_bf16 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_bf16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_bf16_tied2, svbfloat16_t, + z0 = svext_bf16 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_bf16_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_bf16_untied, svbfloat16_t, + z0 = svext_bf16 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_bf16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (ext_1_bf16, svbfloat16_t, + z0 = svext_bf16 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_bf16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #4 +** ret +*/ +TEST_UNIFORM_Z (ext_2_bf16, svbfloat16_t, + z0 = svext_bf16 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_bf16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #6 +** ret +*/ +TEST_UNIFORM_Z (ext_3_bf16, svbfloat16_t, + z0 = svext_bf16 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_127_bf16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #254 +** ret +*/ +TEST_UNIFORM_Z (ext_127_bf16, svbfloat16_t, + z0 = svext_bf16 (z1, z2, 127), + z0 = svext (z1, z2, 127)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f16.c new file mode 100644 index 000000000..d8edccb9f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f16.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_f16_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f16_tied1, svfloat16_t, + z0 = svext_f16 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f16_tied2, svfloat16_t, + z0 = svext_f16 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_f16_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f16_untied, svfloat16_t, + z0 = svext_f16 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_f16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #2 
+** ret +*/ +TEST_UNIFORM_Z (ext_1_f16, svfloat16_t, + z0 = svext_f16 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_f16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #4 +** ret +*/ +TEST_UNIFORM_Z (ext_2_f16, svfloat16_t, + z0 = svext_f16 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_f16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #6 +** ret +*/ +TEST_UNIFORM_Z (ext_3_f16, svfloat16_t, + z0 = svext_f16 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_127_f16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #254 +** ret +*/ +TEST_UNIFORM_Z (ext_127_f16, svfloat16_t, + z0 = svext_f16 (z1, z2, 127), + z0 = svext (z1, z2, 127)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f32.c new file mode 100644 index 000000000..c00ea06fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f32.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_f32_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f32_tied1, svfloat32_t, + z0 = svext_f32 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f32_tied2, svfloat32_t, + z0 = svext_f32 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_f32_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f32_untied, svfloat32_t, + z0 = svext_f32 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_f32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #4 +** ret +*/ +TEST_UNIFORM_Z (ext_1_f32, svfloat32_t, + z0 = svext_f32 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_f32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (ext_2_f32, svfloat32_t, + z0 = svext_f32 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_f32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #12 +** ret +*/ +TEST_UNIFORM_Z (ext_3_f32, svfloat32_t, + z0 = svext_f32 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_63_f32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #252 +** ret +*/ +TEST_UNIFORM_Z (ext_63_f32, svfloat32_t, + z0 = svext_f32 (z1, z2, 63), + z0 = svext (z1, z2, 63)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f64.c new file mode 100644 index 000000000..af72870ca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_f64.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_f64_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f64_tied1, svfloat64_t, + z0 = svext_f64 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_f64_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f64_tied2, svfloat64_t, + z0 = svext_f64 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_f64_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_f64_untied, svfloat64_t, + z0 = svext_f64 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_f64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (ext_1_f64, svfloat64_t, + z0 = svext_f64 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_f64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, 
#16 +** ret +*/ +TEST_UNIFORM_Z (ext_2_f64, svfloat64_t, + z0 = svext_f64 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_f64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #24 +** ret +*/ +TEST_UNIFORM_Z (ext_3_f64, svfloat64_t, + z0 = svext_f64 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_31_f64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #248 +** ret +*/ +TEST_UNIFORM_Z (ext_31_f64, svfloat64_t, + z0 = svext_f64 (z1, z2, 31), + z0 = svext (z1, z2, 31)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s16.c new file mode 100644 index 000000000..a7c4484ac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s16.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_s16_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s16_tied1, svint16_t, + z0 = svext_s16 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_s16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s16_tied2, svint16_t, + z0 = svext_s16 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_s16_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s16_untied, svint16_t, + z0 = svext_s16 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_s16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (ext_1_s16, svint16_t, + z0 = svext_s16 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_s16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #4 +** ret +*/ +TEST_UNIFORM_Z (ext_2_s16, svint16_t, + z0 = svext_s16 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_s16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #6 +** ret +*/ +TEST_UNIFORM_Z (ext_3_s16, svint16_t, + z0 = svext_s16 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_127_s16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #254 +** ret +*/ +TEST_UNIFORM_Z (ext_127_s16, svint16_t, + z0 = svext_s16 (z1, z2, 127), + z0 = svext (z1, z2, 127)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s32.c new file mode 100644 index 000000000..68242a9ec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s32.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_s32_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s32_tied1, svint32_t, + z0 = svext_s32 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s32_tied2, svint32_t, + z0 = svext_s32 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_s32_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s32_untied, svint32_t, + z0 = svext_s32 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_s32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #4 +** ret +*/ +TEST_UNIFORM_Z (ext_1_s32, svint32_t, + z0 = svext_s32 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_s32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (ext_2_s32, svint32_t, + z0 = svext_s32 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_s32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #12 +** ret +*/ 
+TEST_UNIFORM_Z (ext_3_s32, svint32_t, + z0 = svext_s32 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_63_s32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #252 +** ret +*/ +TEST_UNIFORM_Z (ext_63_s32, svint32_t, + z0 = svext_s32 (z1, z2, 63), + z0 = svext (z1, z2, 63)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s64.c new file mode 100644 index 000000000..8bdbd0561 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s64.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_s64_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s64_tied1, svint64_t, + z0 = svext_s64 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_s64_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s64_tied2, svint64_t, + z0 = svext_s64 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_s64_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s64_untied, svint64_t, + z0 = svext_s64 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_s64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (ext_1_s64, svint64_t, + z0 = svext_s64 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_s64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #16 +** ret +*/ +TEST_UNIFORM_Z (ext_2_s64, svint64_t, + z0 = svext_s64 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_s64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #24 +** ret +*/ +TEST_UNIFORM_Z (ext_3_s64, svint64_t, + z0 = svext_s64 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_31_s64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #248 +** ret +*/ +TEST_UNIFORM_Z (ext_31_s64, svint64_t, + z0 = svext_s64 (z1, z2, 31), + z0 = svext (z1, z2, 31)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s8.c new file mode 100644 index 000000000..52490f00e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_s8.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_s8_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s8_tied1, svint8_t, + z0 = svext_s8 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_s8_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s8_tied2, svint8_t, + z0 = svext_s8 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_s8_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_s8_untied, svint8_t, + z0 = svext_s8 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_s8: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (ext_1_s8, svint8_t, + z0 = svext_s8 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_s8: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (ext_2_s8, svint8_t, + z0 = svext_s8 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_s8: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #3 +** ret +*/ +TEST_UNIFORM_Z (ext_3_s8, svint8_t, + z0 = svext_s8 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_255_s8: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (ext_255_s8, svint8_t, + z0 = 
svext_s8 (z1, z2, 255), + z0 = svext (z1, z2, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u16.c new file mode 100644 index 000000000..dc7574ffa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u16.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_u16_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u16_tied1, svuint16_t, + z0 = svext_u16 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_u16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u16_tied2, svuint16_t, + z0 = svext_u16 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_u16_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u16_untied, svuint16_t, + z0 = svext_u16 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_u16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (ext_1_u16, svuint16_t, + z0 = svext_u16 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_u16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #4 +** ret +*/ +TEST_UNIFORM_Z (ext_2_u16, svuint16_t, + z0 = svext_u16 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_u16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #6 +** ret +*/ +TEST_UNIFORM_Z (ext_3_u16, svuint16_t, + z0 = svext_u16 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_127_u16: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #254 +** ret +*/ +TEST_UNIFORM_Z (ext_127_u16, svuint16_t, + z0 = svext_u16 (z1, z2, 127), + z0 = svext (z1, z2, 127)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u32.c new file mode 100644 index 000000000..0d417fc43 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u32.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_u32_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u32_tied1, svuint32_t, + z0 = svext_u32 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_u32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u32_tied2, svuint32_t, + z0 = svext_u32 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_u32_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u32_untied, svuint32_t, + z0 = svext_u32 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_u32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #4 +** ret +*/ +TEST_UNIFORM_Z (ext_1_u32, svuint32_t, + z0 = svext_u32 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_u32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (ext_2_u32, svuint32_t, + z0 = svext_u32 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_u32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #12 +** ret +*/ +TEST_UNIFORM_Z (ext_3_u32, svuint32_t, + z0 = svext_u32 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_63_u32: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #252 +** ret +*/ +TEST_UNIFORM_Z (ext_63_u32, svuint32_t, + z0 = svext_u32 (z1, z2, 63), + z0 = svext (z1, z2, 63)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u64.c new file 
mode 100644 index 000000000..ed81f811e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u64.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_u64_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u64_tied1, svuint64_t, + z0 = svext_u64 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_u64_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u64_tied2, svuint64_t, + z0 = svext_u64 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_u64_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u64_untied, svuint64_t, + z0 = svext_u64 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_u64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (ext_1_u64, svuint64_t, + z0 = svext_u64 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_u64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #16 +** ret +*/ +TEST_UNIFORM_Z (ext_2_u64, svuint64_t, + z0 = svext_u64 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_u64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #24 +** ret +*/ +TEST_UNIFORM_Z (ext_3_u64, svuint64_t, + z0 = svext_u64 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_31_u64: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #248 +** ret +*/ +TEST_UNIFORM_Z (ext_31_u64, svuint64_t, + z0 = svext_u64 (z1, z2, 31), + z0 = svext (z1, z2, 31)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u8.c new file mode 100644 index 000000000..6c061406b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ext_u8.c @@ -0,0 +1,73 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ext_0_u8_tied1: +** ext z0\.b, z0\.b, z1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u8_tied1, svuint8_t, + z0 = svext_u8 (z0, z1, 0), + z0 = svext (z0, z1, 0)) + +/* +** ext_0_u8_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ext z0\.b, z0\.b, \1\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u8_tied2, svuint8_t, + z0 = svext_u8 (z1, z0, 0), + z0 = svext (z1, z0, 0)) + +/* +** ext_0_u8_untied: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #0 +** ret +*/ +TEST_UNIFORM_Z (ext_0_u8_untied, svuint8_t, + z0 = svext_u8 (z1, z2, 0), + z0 = svext (z1, z2, 0)) + +/* +** ext_1_u8: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (ext_1_u8, svuint8_t, + z0 = svext_u8 (z1, z2, 1), + z0 = svext (z1, z2, 1)) + +/* +** ext_2_u8: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (ext_2_u8, svuint8_t, + z0 = svext_u8 (z1, z2, 2), + z0 = svext (z1, z2, 2)) + +/* +** ext_3_u8: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #3 +** ret +*/ +TEST_UNIFORM_Z (ext_3_u8, svuint8_t, + z0 = svext_u8 (z1, z2, 3), + z0 = svext (z1, z2, 3)) + +/* +** ext_255_u8: +** movprfx z0, z1 +** ext z0\.b, z0\.b, z2\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (ext_255_u8, svuint8_t, + z0 = svext_u8 (z1, z2, 255), + z0 = svext (z1, z2, 255)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s16.c new file mode 100644 index 000000000..32e836f01 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include 
"test_sve_acle.h" + +/* +** extb_s16_m_tied12: +** sxtb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (extb_s16_m_tied12, svint16_t, + z0 = svextb_s16_m (z0, p0, z0), + z0 = svextb_m (z0, p0, z0)) + +/* +** extb_s16_m_tied1: +** sxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_s16_m_tied1, svint16_t, + z0 = svextb_s16_m (z0, p0, z1), + z0 = svextb_m (z0, p0, z1)) + +/* +** extb_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sxtb z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_s16_m_tied2, svint16_t, + z0 = svextb_s16_m (z1, p0, z0), + z0 = svextb_m (z1, p0, z0)) + +/* +** extb_s16_m_untied: +** movprfx z0, z2 +** sxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_s16_m_untied, svint16_t, + z0 = svextb_s16_m (z2, p0, z1), + z0 = svextb_m (z2, p0, z1)) + +/* +** extb_s16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** sxtb z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_s16_z_tied1, svint16_t, + z0 = svextb_s16_z (p0, z0), + z0 = svextb_z (p0, z0)) + +/* +** extb_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** sxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_s16_z_untied, svint16_t, + z0 = svextb_s16_z (p0, z1), + z0 = svextb_z (p0, z1)) + +/* +** extb_s16_x_tied1: +** sxtb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (extb_s16_x_tied1, svint16_t, + z0 = svextb_s16_x (p0, z0), + z0 = svextb_x (p0, z0)) + +/* +** extb_s16_x_untied: +** sxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_s16_x_untied, svint16_t, + z0 = svextb_s16_x (p0, z1), + z0 = svextb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s32.c new file mode 100644 index 000000000..e2f13f41c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** extb_s32_m_tied12: +** sxtb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (extb_s32_m_tied12, svint32_t, + z0 = svextb_s32_m (z0, p0, z0), + z0 = svextb_m (z0, p0, z0)) + +/* +** extb_s32_m_tied1: +** sxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_s32_m_tied1, svint32_t, + z0 = svextb_s32_m (z0, p0, z1), + z0 = svextb_m (z0, p0, z1)) + +/* +** extb_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sxtb z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_s32_m_tied2, svint32_t, + z0 = svextb_s32_m (z1, p0, z0), + z0 = svextb_m (z1, p0, z0)) + +/* +** extb_s32_m_untied: +** movprfx z0, z2 +** sxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_s32_m_untied, svint32_t, + z0 = svextb_s32_m (z2, p0, z1), + z0 = svextb_m (z2, p0, z1)) + +/* +** extb_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** sxtb z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_s32_z_tied1, svint32_t, + z0 = svextb_s32_z (p0, z0), + z0 = svextb_z (p0, z0)) + +/* +** extb_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** sxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_s32_z_untied, svint32_t, + z0 = svextb_s32_z (p0, z1), + z0 = svextb_z (p0, z1)) + +/* +** extb_s32_x_tied1: +** sxtb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (extb_s32_x_tied1, svint32_t, + z0 = svextb_s32_x (p0, z0), + z0 = svextb_x (p0, z0)) + +/* +** extb_s32_x_untied: +** sxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_s32_x_untied, svint32_t, + z0 = svextb_s32_x (p0, z1), + z0 = svextb_x (p0, z1)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s64.c new file mode 100644 index 000000000..83363efdb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** extb_s64_m_tied12: +** sxtb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (extb_s64_m_tied12, svint64_t, + z0 = svextb_s64_m (z0, p0, z0), + z0 = svextb_m (z0, p0, z0)) + +/* +** extb_s64_m_tied1: +** sxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extb_s64_m_tied1, svint64_t, + z0 = svextb_s64_m (z0, p0, z1), + z0 = svextb_m (z0, p0, z1)) + +/* +** extb_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** sxtb z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (extb_s64_m_tied2, svint64_t, + z0 = svextb_s64_m (z1, p0, z0), + z0 = svextb_m (z1, p0, z0)) + +/* +** extb_s64_m_untied: +** movprfx z0, z2 +** sxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extb_s64_m_untied, svint64_t, + z0 = svextb_s64_m (z2, p0, z1), + z0 = svextb_m (z2, p0, z1)) + +/* +** extb_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** sxtb z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (extb_s64_z_tied1, svint64_t, + z0 = svextb_s64_z (p0, z0), + z0 = svextb_z (p0, z0)) + +/* +** extb_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** sxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extb_s64_z_untied, svint64_t, + z0 = svextb_s64_z (p0, z1), + z0 = svextb_z (p0, z1)) + +/* +** extb_s64_x_tied1: +** sxtb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (extb_s64_x_tied1, svint64_t, + z0 = svextb_s64_x (p0, z0), + z0 = svextb_x (p0, z0)) + +/* +** extb_s64_x_untied: +** sxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extb_s64_x_untied, svint64_t, + z0 = svextb_s64_x (p0, z1), + z0 = svextb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u16.c new file mode 100644 index 000000000..d806edfaa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u16.c @@ -0,0 +1,82 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** extb_u16_m_tied12: +** uxtb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (extb_u16_m_tied12, svuint16_t, + z0 = svextb_u16_m (z0, p0, z0), + z0 = svextb_m (z0, p0, z0)) + +/* +** extb_u16_m_tied1: +** uxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_u16_m_tied1, svuint16_t, + z0 = svextb_u16_m (z0, p0, z1), + z0 = svextb_m (z0, p0, z1)) + +/* +** extb_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** uxtb z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_u16_m_tied2, svuint16_t, + z0 = svextb_u16_m (z1, p0, z0), + z0 = svextb_m (z1, p0, z0)) + +/* +** extb_u16_m_untied: +** movprfx z0, z2 +** uxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_u16_m_untied, svuint16_t, + z0 = svextb_u16_m (z2, p0, z1), + z0 = svextb_m (z2, p0, z1)) + +/* +** extb_u16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** uxtb z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_u16_z_tied1, svuint16_t, + z0 = svextb_u16_z (p0, z0), + z0 = svextb_z (p0, z0)) + +/* +** extb_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** uxtb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (extb_u16_z_untied, svuint16_t, + z0 = svextb_u16_z (p0, z1), + z0 = svextb_z (p0, z1)) + +/* +** extb_u16_x_tied1: +** and z0\.h, z0\.h, #0xff +** 
ret +*/ +TEST_UNIFORM_Z (extb_u16_x_tied1, svuint16_t, + z0 = svextb_u16_x (p0, z0), + z0 = svextb_x (p0, z0)) + +/* +** extb_u16_x_untied: +** movprfx z0, z1 +** and z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (extb_u16_x_untied, svuint16_t, + z0 = svextb_u16_x (p0, z1), + z0 = svextb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u32.c new file mode 100644 index 000000000..274656dbd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u32.c @@ -0,0 +1,82 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** extb_u32_m_tied12: +** uxtb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (extb_u32_m_tied12, svuint32_t, + z0 = svextb_u32_m (z0, p0, z0), + z0 = svextb_m (z0, p0, z0)) + +/* +** extb_u32_m_tied1: +** uxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_u32_m_tied1, svuint32_t, + z0 = svextb_u32_m (z0, p0, z1), + z0 = svextb_m (z0, p0, z1)) + +/* +** extb_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** uxtb z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_u32_m_tied2, svuint32_t, + z0 = svextb_u32_m (z1, p0, z0), + z0 = svextb_m (z1, p0, z0)) + +/* +** extb_u32_m_untied: +** movprfx z0, z2 +** uxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_u32_m_untied, svuint32_t, + z0 = svextb_u32_m (z2, p0, z1), + z0 = svextb_m (z2, p0, z1)) + +/* +** extb_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** uxtb z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_u32_z_tied1, svuint32_t, + z0 = svextb_u32_z (p0, z0), + z0 = svextb_z (p0, z0)) + +/* +** extb_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** uxtb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (extb_u32_z_untied, svuint32_t, + z0 = svextb_u32_z (p0, z1), + z0 = svextb_z (p0, z1)) + +/* +** extb_u32_x_tied1: +** and z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (extb_u32_x_tied1, svuint32_t, + z0 = svextb_u32_x (p0, z0), + z0 = svextb_x (p0, z0)) + +/* +** extb_u32_x_untied: +** movprfx z0, z1 +** and z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (extb_u32_x_untied, svuint32_t, + z0 = svextb_u32_x (p0, z1), + z0 = svextb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u64.c new file mode 100644 index 000000000..de24cc605 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extb_u64.c @@ -0,0 +1,82 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** extb_u64_m_tied12: +** uxtb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (extb_u64_m_tied12, svuint64_t, + z0 = svextb_u64_m (z0, p0, z0), + z0 = svextb_m (z0, p0, z0)) + +/* +** extb_u64_m_tied1: +** uxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extb_u64_m_tied1, svuint64_t, + z0 = svextb_u64_m (z0, p0, z1), + z0 = svextb_m (z0, p0, z1)) + +/* +** extb_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** uxtb z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (extb_u64_m_tied2, svuint64_t, + z0 = svextb_u64_m (z1, p0, z0), + z0 = svextb_m (z1, p0, z0)) + +/* +** extb_u64_m_untied: +** movprfx z0, z2 +** uxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extb_u64_m_untied, svuint64_t, + z0 = svextb_u64_m (z2, p0, z1), + z0 = svextb_m (z2, p0, z1)) + +/* +** extb_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxtb z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z 
(extb_u64_z_tied1, svuint64_t, + z0 = svextb_u64_z (p0, z0), + z0 = svextb_z (p0, z0)) + +/* +** extb_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxtb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extb_u64_z_untied, svuint64_t, + z0 = svextb_u64_z (p0, z1), + z0 = svextb_z (p0, z1)) + +/* +** extb_u64_x_tied1: +** and z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (extb_u64_x_tied1, svuint64_t, + z0 = svextb_u64_x (p0, z0), + z0 = svextb_x (p0, z0)) + +/* +** extb_u64_x_untied: +** movprfx z0, z1 +** and z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (extb_u64_x_untied, svuint64_t, + z0 = svextb_u64_x (p0, z1), + z0 = svextb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s32.c new file mode 100644 index 000000000..3bb0bf31f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** exth_s32_m_tied12: +** sxth z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (exth_s32_m_tied12, svint32_t, + z0 = svexth_s32_m (z0, p0, z0), + z0 = svexth_m (z0, p0, z0)) + +/* +** exth_s32_m_tied1: +** sxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_s32_m_tied1, svint32_t, + z0 = svexth_s32_m (z0, p0, z1), + z0 = svexth_m (z0, p0, z1)) + +/* +** exth_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sxth z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_s32_m_tied2, svint32_t, + z0 = svexth_s32_m (z1, p0, z0), + z0 = svexth_m (z1, p0, z0)) + +/* +** exth_s32_m_untied: +** movprfx z0, z2 +** sxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_s32_m_untied, svint32_t, + z0 = svexth_s32_m (z2, p0, z1), + z0 = svexth_m (z2, p0, z1)) + +/* +** exth_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** sxth z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_s32_z_tied1, svint32_t, + z0 = svexth_s32_z (p0, z0), + z0 = svexth_z (p0, z0)) + +/* +** exth_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** sxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_s32_z_untied, svint32_t, + z0 = svexth_s32_z (p0, z1), + z0 = svexth_z (p0, z1)) + +/* +** exth_s32_x_tied1: +** sxth z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (exth_s32_x_tied1, svint32_t, + z0 = svexth_s32_x (p0, z0), + z0 = svexth_x (p0, z0)) + +/* +** exth_s32_x_untied: +** sxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_s32_x_untied, svint32_t, + z0 = svexth_s32_x (p0, z1), + z0 = svexth_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s64.c new file mode 100644 index 000000000..0718b67ad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** exth_s64_m_tied12: +** sxth z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (exth_s64_m_tied12, svint64_t, + z0 = svexth_s64_m (z0, p0, z0), + z0 = svexth_m (z0, p0, z0)) + +/* +** exth_s64_m_tied1: +** sxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (exth_s64_m_tied1, svint64_t, + z0 = svexth_s64_m (z0, p0, z1), + z0 = svexth_m (z0, p0, z1)) + +/* +** exth_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** sxth z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (exth_s64_m_tied2, svint64_t, + z0 = svexth_s64_m (z1, p0, z0), + z0 = svexth_m (z1, p0, z0)) + +/* +** 
exth_s64_m_untied: +** movprfx z0, z2 +** sxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (exth_s64_m_untied, svint64_t, + z0 = svexth_s64_m (z2, p0, z1), + z0 = svexth_m (z2, p0, z1)) + +/* +** exth_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** sxth z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (exth_s64_z_tied1, svint64_t, + z0 = svexth_s64_z (p0, z0), + z0 = svexth_z (p0, z0)) + +/* +** exth_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** sxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (exth_s64_z_untied, svint64_t, + z0 = svexth_s64_z (p0, z1), + z0 = svexth_z (p0, z1)) + +/* +** exth_s64_x_tied1: +** sxth z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (exth_s64_x_tied1, svint64_t, + z0 = svexth_s64_x (p0, z0), + z0 = svexth_x (p0, z0)) + +/* +** exth_s64_x_untied: +** sxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (exth_s64_x_untied, svint64_t, + z0 = svexth_s64_x (p0, z1), + z0 = svexth_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u32.c new file mode 100644 index 000000000..1ba7fc8c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u32.c @@ -0,0 +1,82 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** exth_u32_m_tied12: +** uxth z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (exth_u32_m_tied12, svuint32_t, + z0 = svexth_u32_m (z0, p0, z0), + z0 = svexth_m (z0, p0, z0)) + +/* +** exth_u32_m_tied1: +** uxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_u32_m_tied1, svuint32_t, + z0 = svexth_u32_m (z0, p0, z1), + z0 = svexth_m (z0, p0, z1)) + +/* +** exth_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** uxth z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_u32_m_tied2, svuint32_t, + z0 = svexth_u32_m (z1, p0, z0), + z0 = svexth_m (z1, p0, z0)) + +/* +** exth_u32_m_untied: +** movprfx z0, z2 +** uxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_u32_m_untied, svuint32_t, + z0 = svexth_u32_m (z2, p0, z1), + z0 = svexth_m (z2, p0, z1)) + +/* +** exth_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** uxth z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_u32_z_tied1, svuint32_t, + z0 = svexth_u32_z (p0, z0), + z0 = svexth_z (p0, z0)) + +/* +** exth_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** uxth z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (exth_u32_z_untied, svuint32_t, + z0 = svexth_u32_z (p0, z1), + z0 = svexth_z (p0, z1)) + +/* +** exth_u32_x_tied1: +** and z0\.s, z0\.s, #0xffff +** ret +*/ +TEST_UNIFORM_Z (exth_u32_x_tied1, svuint32_t, + z0 = svexth_u32_x (p0, z0), + z0 = svexth_x (p0, z0)) + +/* +** exth_u32_x_untied: +** movprfx z0, z1 +** and z0\.s, z0\.s, #0xffff +** ret +*/ +TEST_UNIFORM_Z (exth_u32_x_untied, svuint32_t, + z0 = svexth_u32_x (p0, z1), + z0 = svexth_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u64.c new file mode 100644 index 000000000..1555cf0b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/exth_u64.c @@ -0,0 +1,82 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** exth_u64_m_tied12: +** uxth z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (exth_u64_m_tied12, svuint64_t, + z0 = svexth_u64_m (z0, p0, z0), + z0 = svexth_m (z0, p0, z0)) + +/* +** exth_u64_m_tied1: +** uxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z 
(exth_u64_m_tied1, svuint64_t, + z0 = svexth_u64_m (z0, p0, z1), + z0 = svexth_m (z0, p0, z1)) + +/* +** exth_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** uxth z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (exth_u64_m_tied2, svuint64_t, + z0 = svexth_u64_m (z1, p0, z0), + z0 = svexth_m (z1, p0, z0)) + +/* +** exth_u64_m_untied: +** movprfx z0, z2 +** uxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (exth_u64_m_untied, svuint64_t, + z0 = svexth_u64_m (z2, p0, z1), + z0 = svexth_m (z2, p0, z1)) + +/* +** exth_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxth z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (exth_u64_z_tied1, svuint64_t, + z0 = svexth_u64_z (p0, z0), + z0 = svexth_z (p0, z0)) + +/* +** exth_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxth z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (exth_u64_z_untied, svuint64_t, + z0 = svexth_u64_z (p0, z1), + z0 = svexth_z (p0, z1)) + +/* +** exth_u64_x_tied1: +** and z0\.d, z0\.d, #0xffff +** ret +*/ +TEST_UNIFORM_Z (exth_u64_x_tied1, svuint64_t, + z0 = svexth_u64_x (p0, z0), + z0 = svexth_x (p0, z0)) + +/* +** exth_u64_x_untied: +** movprfx z0, z1 +** and z0\.d, z0\.d, #0xffff +** ret +*/ +TEST_UNIFORM_Z (exth_u64_x_untied, svuint64_t, + z0 = svexth_u64_x (p0, z1), + z0 = svexth_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_s64.c new file mode 100644 index 000000000..a6edadfa7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** extw_s64_m_tied12: +** sxtw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (extw_s64_m_tied12, svint64_t, + z0 = svextw_s64_m (z0, p0, z0), + z0 = svextw_m (z0, p0, z0)) + +/* +** extw_s64_m_tied1: +** sxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extw_s64_m_tied1, svint64_t, + z0 = svextw_s64_m (z0, p0, z1), + z0 = svextw_m (z0, p0, z1)) + +/* +** extw_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** sxtw z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (extw_s64_m_tied2, svint64_t, + z0 = svextw_s64_m (z1, p0, z0), + z0 = svextw_m (z1, p0, z0)) + +/* +** extw_s64_m_untied: +** movprfx z0, z2 +** sxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extw_s64_m_untied, svint64_t, + z0 = svextw_s64_m (z2, p0, z1), + z0 = svextw_m (z2, p0, z1)) + +/* +** extw_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** sxtw z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (extw_s64_z_tied1, svint64_t, + z0 = svextw_s64_z (p0, z0), + z0 = svextw_z (p0, z0)) + +/* +** extw_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** sxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extw_s64_z_untied, svint64_t, + z0 = svextw_s64_z (p0, z1), + z0 = svextw_z (p0, z1)) + +/* +** extw_s64_x_tied1: +** sxtw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (extw_s64_x_tied1, svint64_t, + z0 = svextw_s64_x (p0, z0), + z0 = svextw_x (p0, z0)) + +/* +** extw_s64_x_untied: +** sxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extw_s64_x_untied, svint64_t, + z0 = svextw_s64_x (p0, z1), + z0 = svextw_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_u64.c new file mode 100644 index 000000000..880a287f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/extw_u64.c @@ -0,0 +1,82 @@ +/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** extw_u64_m_tied12: +** uxtw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (extw_u64_m_tied12, svuint64_t, + z0 = svextw_u64_m (z0, p0, z0), + z0 = svextw_m (z0, p0, z0)) + +/* +** extw_u64_m_tied1: +** uxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extw_u64_m_tied1, svuint64_t, + z0 = svextw_u64_m (z0, p0, z1), + z0 = svextw_m (z0, p0, z1)) + +/* +** extw_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** uxtw z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (extw_u64_m_tied2, svuint64_t, + z0 = svextw_u64_m (z1, p0, z0), + z0 = svextw_m (z1, p0, z0)) + +/* +** extw_u64_m_untied: +** movprfx z0, z2 +** uxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extw_u64_m_untied, svuint64_t, + z0 = svextw_u64_m (z2, p0, z1), + z0 = svextw_m (z2, p0, z1)) + +/* +** extw_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** uxtw z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (extw_u64_z_tied1, svuint64_t, + z0 = svextw_u64_z (p0, z0), + z0 = svextw_z (p0, z0)) + +/* +** extw_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** uxtw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (extw_u64_z_untied, svuint64_t, + z0 = svextw_u64_z (p0, z1), + z0 = svextw_z (p0, z1)) + +/* +** extw_u64_x_tied1: +** and z0\.d, z0\.d, #0xffffffff +** ret +*/ +TEST_UNIFORM_Z (extw_u64_x_tied1, svuint64_t, + z0 = svextw_u64_x (p0, z0), + z0 = svextw_x (p0, z0)) + +/* +** extw_u64_x_untied: +** movprfx z0, z1 +** and z0\.d, z0\.d, #0xffffffff +** ret +*/ +TEST_UNIFORM_Z (extw_u64_x_untied, svuint64_t, + z0 = svextw_u64_x (p0, z1), + z0 = svextw_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_bf16.c new file mode 100644 index 000000000..6e5c773b5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_bf16.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_bf16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_bf16_z0_0, svbfloat16x2_t, svbfloat16_t, + z0 = svget2_bf16 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_bf16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_bf16_z0_1, svbfloat16x2_t, svbfloat16_t, + z0 = svget2_bf16 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_bf16_z4_0: +** ret +*/ +TEST_GET (get2_bf16_z4_0, svbfloat16x2_t, svbfloat16_t, + z4_res = svget2_bf16 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_bf16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_bf16_z4_1, svbfloat16x2_t, svbfloat16_t, + z4_res = svget2_bf16 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_bf16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_bf16_z5_0, svbfloat16x2_t, svbfloat16_t, + z5_res = svget2_bf16 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_bf16_z5_1: +** ret +*/ +TEST_GET (get2_bf16_z5_1, svbfloat16x2_t, svbfloat16_t, + z5_res = svget2_bf16 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f16.c new file mode 100644 index 000000000..9b6379e0b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f16.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_f16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_f16_z0_0, svfloat16x2_t, svfloat16_t, + z0 = svget2_f16 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** 
get2_f16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_f16_z0_1, svfloat16x2_t, svfloat16_t, + z0 = svget2_f16 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_f16_z4_0: +** ret +*/ +TEST_GET (get2_f16_z4_0, svfloat16x2_t, svfloat16_t, + z4_res = svget2_f16 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_f16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_f16_z4_1, svfloat16x2_t, svfloat16_t, + z4_res = svget2_f16 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_f16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_f16_z5_0, svfloat16x2_t, svfloat16_t, + z5_res = svget2_f16 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_f16_z5_1: +** ret +*/ +TEST_GET (get2_f16_z5_1, svfloat16x2_t, svfloat16_t, + z5_res = svget2_f16 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f32.c new file mode 100644 index 000000000..76080dc66 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f32.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_f32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_f32_z0_0, svfloat32x2_t, svfloat32_t, + z0 = svget2_f32 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_f32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_f32_z0_1, svfloat32x2_t, svfloat32_t, + z0 = svget2_f32 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_f32_z4_0: +** ret +*/ +TEST_GET (get2_f32_z4_0, svfloat32x2_t, svfloat32_t, + z4_res = svget2_f32 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_f32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_f32_z4_1, svfloat32x2_t, svfloat32_t, + z4_res = svget2_f32 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_f32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_f32_z5_0, svfloat32x2_t, svfloat32_t, + z5_res = svget2_f32 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_f32_z5_1: +** ret +*/ +TEST_GET (get2_f32_z5_1, svfloat32x2_t, svfloat32_t, + z5_res = svget2_f32 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f64.c new file mode 100644 index 000000000..cabe6e7de --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_f64.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_f64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_f64_z0_0, svfloat64x2_t, svfloat64_t, + z0 = svget2_f64 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_f64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_f64_z0_1, svfloat64x2_t, svfloat64_t, + z0 = svget2_f64 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_f64_z4_0: +** ret +*/ +TEST_GET (get2_f64_z4_0, svfloat64x2_t, svfloat64_t, + z4_res = svget2_f64 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_f64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_f64_z4_1, svfloat64x2_t, svfloat64_t, + z4_res = svget2_f64 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_f64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_f64_z5_0, svfloat64x2_t, svfloat64_t, + z5_res = svget2_f64 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_f64_z5_1: +** ret +*/ +TEST_GET (get2_f64_z5_1, svfloat64x2_t, svfloat64_t, + z5_res = svget2_f64 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s16.c new file mode 100644 index 000000000..387e6daad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s16.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_s16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_s16_z0_0, svint16x2_t, svint16_t, + z0 = svget2_s16 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_s16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_s16_z0_1, svint16x2_t, svint16_t, + z0 = svget2_s16 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_s16_z4_0: +** ret +*/ +TEST_GET (get2_s16_z4_0, svint16x2_t, svint16_t, + z4_res = svget2_s16 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_s16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_s16_z4_1, svint16x2_t, svint16_t, + z4_res = svget2_s16 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_s16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_s16_z5_0, svint16x2_t, svint16_t, + z5_res = svget2_s16 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_s16_z5_1: +** ret +*/ +TEST_GET (get2_s16_z5_1, svint16x2_t, svint16_t, + z5_res = svget2_s16 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s32.c new file mode 100644 index 000000000..5c47286e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s32.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_s32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_s32_z0_0, svint32x2_t, svint32_t, + z0 = svget2_s32 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_s32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_s32_z0_1, svint32x2_t, svint32_t, + z0 = svget2_s32 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_s32_z4_0: +** ret +*/ +TEST_GET (get2_s32_z4_0, svint32x2_t, svint32_t, + z4_res = svget2_s32 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_s32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_s32_z4_1, svint32x2_t, svint32_t, + z4_res = svget2_s32 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_s32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_s32_z5_0, svint32x2_t, svint32_t, + z5_res = svget2_s32 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_s32_z5_1: +** ret +*/ +TEST_GET (get2_s32_z5_1, svint32x2_t, svint32_t, + z5_res = svget2_s32 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s64.c new file mode 100644 index 000000000..18f930d4c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s64.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_s64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_s64_z0_0, svint64x2_t, svint64_t, + z0 = svget2_s64 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_s64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_s64_z0_1, svint64x2_t, svint64_t, + z0 = svget2_s64 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_s64_z4_0: +** ret +*/ +TEST_GET (get2_s64_z4_0, svint64x2_t, svint64_t, + z4_res = svget2_s64 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_s64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_s64_z4_1, svint64x2_t, svint64_t, + z4_res = svget2_s64 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** 
get2_s64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_s64_z5_0, svint64x2_t, svint64_t, + z5_res = svget2_s64 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_s64_z5_1: +** ret +*/ +TEST_GET (get2_s64_z5_1, svint64x2_t, svint64_t, + z5_res = svget2_s64 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s8.c new file mode 100644 index 000000000..27e2cfafb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_s8.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_s8_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_s8_z0_0, svint8x2_t, svint8_t, + z0 = svget2_s8 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_s8_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_s8_z0_1, svint8x2_t, svint8_t, + z0 = svget2_s8 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_s8_z4_0: +** ret +*/ +TEST_GET (get2_s8_z4_0, svint8x2_t, svint8_t, + z4_res = svget2_s8 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_s8_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_s8_z4_1, svint8x2_t, svint8_t, + z4_res = svget2_s8 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_s8_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_s8_z5_0, svint8x2_t, svint8_t, + z5_res = svget2_s8 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_s8_z5_1: +** ret +*/ +TEST_GET (get2_s8_z5_1, svint8x2_t, svint8_t, + z5_res = svget2_s8 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u16.c new file mode 100644 index 000000000..1804900cc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u16.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_u16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_u16_z0_0, svuint16x2_t, svuint16_t, + z0 = svget2_u16 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_u16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_u16_z0_1, svuint16x2_t, svuint16_t, + z0 = svget2_u16 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_u16_z4_0: +** ret +*/ +TEST_GET (get2_u16_z4_0, svuint16x2_t, svuint16_t, + z4_res = svget2_u16 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_u16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_u16_z4_1, svuint16x2_t, svuint16_t, + z4_res = svget2_u16 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_u16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_u16_z5_0, svuint16x2_t, svuint16_t, + z5_res = svget2_u16 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_u16_z5_1: +** ret +*/ +TEST_GET (get2_u16_z5_1, svuint16x2_t, svuint16_t, + z5_res = svget2_u16 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u32.c new file mode 100644 index 000000000..5c14de6aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u32.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_u32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_u32_z0_0, svuint32x2_t, svuint32_t, + z0 = svget2_u32 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_u32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_u32_z0_1, svuint32x2_t, svuint32_t, + z0 = svget2_u32 
(z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_u32_z4_0: +** ret +*/ +TEST_GET (get2_u32_z4_0, svuint32x2_t, svuint32_t, + z4_res = svget2_u32 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_u32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_u32_z4_1, svuint32x2_t, svuint32_t, + z4_res = svget2_u32 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_u32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_u32_z5_0, svuint32x2_t, svuint32_t, + z5_res = svget2_u32 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_u32_z5_1: +** ret +*/ +TEST_GET (get2_u32_z5_1, svuint32x2_t, svuint32_t, + z5_res = svget2_u32 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u64.c new file mode 100644 index 000000000..fd389a01e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u64.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_u64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_u64_z0_0, svuint64x2_t, svuint64_t, + z0 = svget2_u64 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_u64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_u64_z0_1, svuint64x2_t, svuint64_t, + z0 = svget2_u64 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_u64_z4_0: +** ret +*/ +TEST_GET (get2_u64_z4_0, svuint64x2_t, svuint64_t, + z4_res = svget2_u64 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_u64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_u64_z4_1, svuint64x2_t, svuint64_t, + z4_res = svget2_u64 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_u64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_u64_z5_0, svuint64x2_t, svuint64_t, + z5_res = svget2_u64 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_u64_z5_1: +** ret +*/ +TEST_GET (get2_u64_z5_1, svuint64x2_t, svuint64_t, + z5_res = svget2_u64 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u8.c new file mode 100644 index 000000000..42ffb0344 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get2_u8.c @@ -0,0 +1,55 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get2_u8_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get2_u8_z0_0, svuint8x2_t, svuint8_t, + z0 = svget2_u8 (z4, 0), + z0 = svget2 (z4, 0)) + +/* +** get2_u8_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get2_u8_z0_1, svuint8x2_t, svuint8_t, + z0 = svget2_u8 (z4, 1), + z0 = svget2 (z4, 1)) + +/* +** get2_u8_z4_0: +** ret +*/ +TEST_GET (get2_u8_z4_0, svuint8x2_t, svuint8_t, + z4_res = svget2_u8 (z4, 0), + z4_res = svget2 (z4, 0)) + +/* +** get2_u8_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get2_u8_z4_1, svuint8x2_t, svuint8_t, + z4_res = svget2_u8 (z4, 1), + z4_res = svget2 (z4, 1)) + +/* +** get2_u8_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get2_u8_z5_0, svuint8x2_t, svuint8_t, + z5_res = svget2_u8 (z4, 0), + z5_res = svget2 (z4, 0)) + +/* +** get2_u8_z5_1: +** ret +*/ +TEST_GET (get2_u8_z5_1, svuint8x2_t, svuint8_t, + z5_res = svget2_u8 (z4, 1), + z5_res = svget2 (z4, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_bf16.c new file mode 100644 index 000000000..292f02a12 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_bf16.c @@ -0,0 +1,108 @@ +/* { 
dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_bf16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_bf16_z0_0, svbfloat16x3_t, svbfloat16_t, + z0 = svget3_bf16 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_bf16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_bf16_z0_1, svbfloat16x3_t, svbfloat16_t, + z0 = svget3_bf16 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_bf16_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_bf16_z0_2, svbfloat16x3_t, svbfloat16_t, + z0 = svget3_bf16 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_bf16_z4_0: +** ret +*/ +TEST_GET (get3_bf16_z4_0, svbfloat16x3_t, svbfloat16_t, + z4_res = svget3_bf16 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_bf16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_bf16_z4_1, svbfloat16x3_t, svbfloat16_t, + z4_res = svget3_bf16 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_bf16_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_bf16_z4_2, svbfloat16x3_t, svbfloat16_t, + z4_res = svget3_bf16 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_bf16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_bf16_z5_0, svbfloat16x3_t, svbfloat16_t, + z5_res = svget3_bf16 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_bf16_z5_1: +** ret +*/ +TEST_GET (get3_bf16_z5_1, svbfloat16x3_t, svbfloat16_t, + z5_res = svget3_bf16 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_bf16_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_bf16_z5_2, svbfloat16x3_t, svbfloat16_t, + z5_res = svget3_bf16 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_bf16_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_bf16_z6_0, svbfloat16x3_t, svbfloat16_t, + z6_res = svget3_bf16 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_bf16_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_bf16_z6_1, svbfloat16x3_t, svbfloat16_t, + z6_res = svget3_bf16 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_bf16_z6_2: +** ret +*/ +TEST_GET (get3_bf16_z6_2, svbfloat16x3_t, svbfloat16_t, + z6_res = svget3_bf16 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f16.c new file mode 100644 index 000000000..8bea03bc5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f16.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_f16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_f16_z0_0, svfloat16x3_t, svfloat16_t, + z0 = svget3_f16 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_f16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_f16_z0_1, svfloat16x3_t, svfloat16_t, + z0 = svget3_f16 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_f16_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_f16_z0_2, svfloat16x3_t, svfloat16_t, + z0 = svget3_f16 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_f16_z4_0: +** ret +*/ +TEST_GET (get3_f16_z4_0, svfloat16x3_t, svfloat16_t, + z4_res = svget3_f16 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_f16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_f16_z4_1, svfloat16x3_t, svfloat16_t, + z4_res = svget3_f16 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_f16_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_f16_z4_2, svfloat16x3_t, svfloat16_t, + z4_res = svget3_f16 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_f16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_f16_z5_0, svfloat16x3_t, 
svfloat16_t, + z5_res = svget3_f16 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_f16_z5_1: +** ret +*/ +TEST_GET (get3_f16_z5_1, svfloat16x3_t, svfloat16_t, + z5_res = svget3_f16 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_f16_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_f16_z5_2, svfloat16x3_t, svfloat16_t, + z5_res = svget3_f16 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_f16_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_f16_z6_0, svfloat16x3_t, svfloat16_t, + z6_res = svget3_f16 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_f16_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_f16_z6_1, svfloat16x3_t, svfloat16_t, + z6_res = svget3_f16 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_f16_z6_2: +** ret +*/ +TEST_GET (get3_f16_z6_2, svfloat16x3_t, svfloat16_t, + z6_res = svget3_f16 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f32.c new file mode 100644 index 000000000..246679584 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f32.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_f32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_f32_z0_0, svfloat32x3_t, svfloat32_t, + z0 = svget3_f32 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_f32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_f32_z0_1, svfloat32x3_t, svfloat32_t, + z0 = svget3_f32 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_f32_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_f32_z0_2, svfloat32x3_t, svfloat32_t, + z0 = svget3_f32 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_f32_z4_0: +** ret +*/ +TEST_GET (get3_f32_z4_0, svfloat32x3_t, svfloat32_t, + z4_res = svget3_f32 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_f32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_f32_z4_1, svfloat32x3_t, svfloat32_t, + z4_res = svget3_f32 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_f32_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_f32_z4_2, svfloat32x3_t, svfloat32_t, + z4_res = svget3_f32 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_f32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_f32_z5_0, svfloat32x3_t, svfloat32_t, + z5_res = svget3_f32 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_f32_z5_1: +** ret +*/ +TEST_GET (get3_f32_z5_1, svfloat32x3_t, svfloat32_t, + z5_res = svget3_f32 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_f32_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_f32_z5_2, svfloat32x3_t, svfloat32_t, + z5_res = svget3_f32 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_f32_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_f32_z6_0, svfloat32x3_t, svfloat32_t, + z6_res = svget3_f32 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_f32_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_f32_z6_1, svfloat32x3_t, svfloat32_t, + z6_res = svget3_f32 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_f32_z6_2: +** ret +*/ +TEST_GET (get3_f32_z6_2, svfloat32x3_t, svfloat32_t, + z6_res = svget3_f32 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f64.c new file mode 100644 index 000000000..e44eb15fd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_f64.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include 
"test_sve_acle.h" + +/* +** get3_f64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_f64_z0_0, svfloat64x3_t, svfloat64_t, + z0 = svget3_f64 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_f64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_f64_z0_1, svfloat64x3_t, svfloat64_t, + z0 = svget3_f64 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_f64_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_f64_z0_2, svfloat64x3_t, svfloat64_t, + z0 = svget3_f64 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_f64_z4_0: +** ret +*/ +TEST_GET (get3_f64_z4_0, svfloat64x3_t, svfloat64_t, + z4_res = svget3_f64 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_f64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_f64_z4_1, svfloat64x3_t, svfloat64_t, + z4_res = svget3_f64 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_f64_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_f64_z4_2, svfloat64x3_t, svfloat64_t, + z4_res = svget3_f64 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_f64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_f64_z5_0, svfloat64x3_t, svfloat64_t, + z5_res = svget3_f64 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_f64_z5_1: +** ret +*/ +TEST_GET (get3_f64_z5_1, svfloat64x3_t, svfloat64_t, + z5_res = svget3_f64 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_f64_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_f64_z5_2, svfloat64x3_t, svfloat64_t, + z5_res = svget3_f64 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_f64_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_f64_z6_0, svfloat64x3_t, svfloat64_t, + z6_res = svget3_f64 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_f64_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_f64_z6_1, svfloat64x3_t, svfloat64_t, + z6_res = svget3_f64 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_f64_z6_2: +** ret +*/ +TEST_GET (get3_f64_z6_2, svfloat64x3_t, svfloat64_t, + z6_res = svget3_f64 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s16.c new file mode 100644 index 000000000..88f7e4986 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s16.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_s16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_s16_z0_0, svint16x3_t, svint16_t, + z0 = svget3_s16 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_s16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_s16_z0_1, svint16x3_t, svint16_t, + z0 = svget3_s16 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_s16_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_s16_z0_2, svint16x3_t, svint16_t, + z0 = svget3_s16 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_s16_z4_0: +** ret +*/ +TEST_GET (get3_s16_z4_0, svint16x3_t, svint16_t, + z4_res = svget3_s16 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_s16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_s16_z4_1, svint16x3_t, svint16_t, + z4_res = svget3_s16 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_s16_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_s16_z4_2, svint16x3_t, svint16_t, + z4_res = svget3_s16 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_s16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_s16_z5_0, svint16x3_t, svint16_t, + z5_res = svget3_s16 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_s16_z5_1: +** ret +*/ +TEST_GET (get3_s16_z5_1, svint16x3_t, svint16_t, + z5_res 
= svget3_s16 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_s16_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_s16_z5_2, svint16x3_t, svint16_t, + z5_res = svget3_s16 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_s16_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_s16_z6_0, svint16x3_t, svint16_t, + z6_res = svget3_s16 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_s16_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_s16_z6_1, svint16x3_t, svint16_t, + z6_res = svget3_s16 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_s16_z6_2: +** ret +*/ +TEST_GET (get3_s16_z6_2, svint16x3_t, svint16_t, + z6_res = svget3_s16 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s32.c new file mode 100644 index 000000000..f0f7785c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s32.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_s32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_s32_z0_0, svint32x3_t, svint32_t, + z0 = svget3_s32 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_s32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_s32_z0_1, svint32x3_t, svint32_t, + z0 = svget3_s32 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_s32_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_s32_z0_2, svint32x3_t, svint32_t, + z0 = svget3_s32 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_s32_z4_0: +** ret +*/ +TEST_GET (get3_s32_z4_0, svint32x3_t, svint32_t, + z4_res = svget3_s32 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_s32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_s32_z4_1, svint32x3_t, svint32_t, + z4_res = svget3_s32 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_s32_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_s32_z4_2, svint32x3_t, svint32_t, + z4_res = svget3_s32 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_s32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_s32_z5_0, svint32x3_t, svint32_t, + z5_res = svget3_s32 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_s32_z5_1: +** ret +*/ +TEST_GET (get3_s32_z5_1, svint32x3_t, svint32_t, + z5_res = svget3_s32 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_s32_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_s32_z5_2, svint32x3_t, svint32_t, + z5_res = svget3_s32 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_s32_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_s32_z6_0, svint32x3_t, svint32_t, + z6_res = svget3_s32 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_s32_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_s32_z6_1, svint32x3_t, svint32_t, + z6_res = svget3_s32 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_s32_z6_2: +** ret +*/ +TEST_GET (get3_s32_z6_2, svint32x3_t, svint32_t, + z6_res = svget3_s32 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s64.c new file mode 100644 index 000000000..92500bfdf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s64.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_s64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_s64_z0_0, svint64x3_t, svint64_t, + z0 = svget3_s64 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_s64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ 
+TEST_GET (get3_s64_z0_1, svint64x3_t, svint64_t, + z0 = svget3_s64 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_s64_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_s64_z0_2, svint64x3_t, svint64_t, + z0 = svget3_s64 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_s64_z4_0: +** ret +*/ +TEST_GET (get3_s64_z4_0, svint64x3_t, svint64_t, + z4_res = svget3_s64 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_s64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_s64_z4_1, svint64x3_t, svint64_t, + z4_res = svget3_s64 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_s64_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_s64_z4_2, svint64x3_t, svint64_t, + z4_res = svget3_s64 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_s64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_s64_z5_0, svint64x3_t, svint64_t, + z5_res = svget3_s64 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_s64_z5_1: +** ret +*/ +TEST_GET (get3_s64_z5_1, svint64x3_t, svint64_t, + z5_res = svget3_s64 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_s64_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_s64_z5_2, svint64x3_t, svint64_t, + z5_res = svget3_s64 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_s64_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_s64_z6_0, svint64x3_t, svint64_t, + z6_res = svget3_s64 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_s64_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_s64_z6_1, svint64x3_t, svint64_t, + z6_res = svget3_s64 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_s64_z6_2: +** ret +*/ +TEST_GET (get3_s64_z6_2, svint64x3_t, svint64_t, + z6_res = svget3_s64 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s8.c new file mode 100644 index 000000000..edf225ba5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_s8.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_s8_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_s8_z0_0, svint8x3_t, svint8_t, + z0 = svget3_s8 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_s8_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_s8_z0_1, svint8x3_t, svint8_t, + z0 = svget3_s8 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_s8_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_s8_z0_2, svint8x3_t, svint8_t, + z0 = svget3_s8 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_s8_z4_0: +** ret +*/ +TEST_GET (get3_s8_z4_0, svint8x3_t, svint8_t, + z4_res = svget3_s8 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_s8_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_s8_z4_1, svint8x3_t, svint8_t, + z4_res = svget3_s8 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_s8_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_s8_z4_2, svint8x3_t, svint8_t, + z4_res = svget3_s8 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_s8_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_s8_z5_0, svint8x3_t, svint8_t, + z5_res = svget3_s8 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_s8_z5_1: +** ret +*/ +TEST_GET (get3_s8_z5_1, svint8x3_t, svint8_t, + z5_res = svget3_s8 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_s8_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_s8_z5_2, svint8x3_t, svint8_t, + z5_res = svget3_s8 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_s8_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_s8_z6_0, svint8x3_t, svint8_t, + z6_res = 
svget3_s8 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_s8_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_s8_z6_1, svint8x3_t, svint8_t, + z6_res = svget3_s8 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_s8_z6_2: +** ret +*/ +TEST_GET (get3_s8_z6_2, svint8x3_t, svint8_t, + z6_res = svget3_s8 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u16.c new file mode 100644 index 000000000..1fa7c63c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u16.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_u16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_u16_z0_0, svuint16x3_t, svuint16_t, + z0 = svget3_u16 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_u16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_u16_z0_1, svuint16x3_t, svuint16_t, + z0 = svget3_u16 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_u16_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_u16_z0_2, svuint16x3_t, svuint16_t, + z0 = svget3_u16 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_u16_z4_0: +** ret +*/ +TEST_GET (get3_u16_z4_0, svuint16x3_t, svuint16_t, + z4_res = svget3_u16 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_u16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_u16_z4_1, svuint16x3_t, svuint16_t, + z4_res = svget3_u16 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_u16_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_u16_z4_2, svuint16x3_t, svuint16_t, + z4_res = svget3_u16 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_u16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_u16_z5_0, svuint16x3_t, svuint16_t, + z5_res = svget3_u16 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_u16_z5_1: +** ret +*/ +TEST_GET (get3_u16_z5_1, svuint16x3_t, svuint16_t, + z5_res = svget3_u16 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_u16_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_u16_z5_2, svuint16x3_t, svuint16_t, + z5_res = svget3_u16 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_u16_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_u16_z6_0, svuint16x3_t, svuint16_t, + z6_res = svget3_u16 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_u16_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_u16_z6_1, svuint16x3_t, svuint16_t, + z6_res = svget3_u16 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_u16_z6_2: +** ret +*/ +TEST_GET (get3_u16_z6_2, svuint16x3_t, svuint16_t, + z6_res = svget3_u16 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u32.c new file mode 100644 index 000000000..03b5f2616 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u32.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_u32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_u32_z0_0, svuint32x3_t, svuint32_t, + z0 = svget3_u32 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_u32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_u32_z0_1, svuint32x3_t, svuint32_t, + z0 = svget3_u32 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_u32_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_u32_z0_2, svuint32x3_t, svuint32_t, + z0 = svget3_u32 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_u32_z4_0: +** ret +*/ +TEST_GET 
(get3_u32_z4_0, svuint32x3_t, svuint32_t, + z4_res = svget3_u32 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_u32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_u32_z4_1, svuint32x3_t, svuint32_t, + z4_res = svget3_u32 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_u32_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_u32_z4_2, svuint32x3_t, svuint32_t, + z4_res = svget3_u32 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_u32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_u32_z5_0, svuint32x3_t, svuint32_t, + z5_res = svget3_u32 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_u32_z5_1: +** ret +*/ +TEST_GET (get3_u32_z5_1, svuint32x3_t, svuint32_t, + z5_res = svget3_u32 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_u32_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_u32_z5_2, svuint32x3_t, svuint32_t, + z5_res = svget3_u32 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_u32_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_u32_z6_0, svuint32x3_t, svuint32_t, + z6_res = svget3_u32 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_u32_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_u32_z6_1, svuint32x3_t, svuint32_t, + z6_res = svget3_u32 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_u32_z6_2: +** ret +*/ +TEST_GET (get3_u32_z6_2, svuint32x3_t, svuint32_t, + z6_res = svget3_u32 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u64.c new file mode 100644 index 000000000..ae4ef0024 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u64.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_u64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_u64_z0_0, svuint64x3_t, svuint64_t, + z0 = svget3_u64 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_u64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_u64_z0_1, svuint64x3_t, svuint64_t, + z0 = svget3_u64 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_u64_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_u64_z0_2, svuint64x3_t, svuint64_t, + z0 = svget3_u64 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_u64_z4_0: +** ret +*/ +TEST_GET (get3_u64_z4_0, svuint64x3_t, svuint64_t, + z4_res = svget3_u64 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_u64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_u64_z4_1, svuint64x3_t, svuint64_t, + z4_res = svget3_u64 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_u64_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_u64_z4_2, svuint64x3_t, svuint64_t, + z4_res = svget3_u64 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_u64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_u64_z5_0, svuint64x3_t, svuint64_t, + z5_res = svget3_u64 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_u64_z5_1: +** ret +*/ +TEST_GET (get3_u64_z5_1, svuint64x3_t, svuint64_t, + z5_res = svget3_u64 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_u64_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_u64_z5_2, svuint64x3_t, svuint64_t, + z5_res = svget3_u64 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_u64_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_u64_z6_0, svuint64x3_t, svuint64_t, + z6_res = svget3_u64 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_u64_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_u64_z6_1, svuint64x3_t, svuint64_t, + z6_res = svget3_u64 (z4, 1), + z6_res = svget3 (z4, 1)) 
+ +/* +** get3_u64_z6_2: +** ret +*/ +TEST_GET (get3_u64_z6_2, svuint64x3_t, svuint64_t, + z6_res = svget3_u64 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u8.c new file mode 100644 index 000000000..497dcbbae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get3_u8.c @@ -0,0 +1,108 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get3_u8_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get3_u8_z0_0, svuint8x3_t, svuint8_t, + z0 = svget3_u8 (z4, 0), + z0 = svget3 (z4, 0)) + +/* +** get3_u8_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get3_u8_z0_1, svuint8x3_t, svuint8_t, + z0 = svget3_u8 (z4, 1), + z0 = svget3 (z4, 1)) + +/* +** get3_u8_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get3_u8_z0_2, svuint8x3_t, svuint8_t, + z0 = svget3_u8 (z4, 2), + z0 = svget3 (z4, 2)) + +/* +** get3_u8_z4_0: +** ret +*/ +TEST_GET (get3_u8_z4_0, svuint8x3_t, svuint8_t, + z4_res = svget3_u8 (z4, 0), + z4_res = svget3 (z4, 0)) + +/* +** get3_u8_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get3_u8_z4_1, svuint8x3_t, svuint8_t, + z4_res = svget3_u8 (z4, 1), + z4_res = svget3 (z4, 1)) + +/* +** get3_u8_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get3_u8_z4_2, svuint8x3_t, svuint8_t, + z4_res = svget3_u8 (z4, 2), + z4_res = svget3 (z4, 2)) + +/* +** get3_u8_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get3_u8_z5_0, svuint8x3_t, svuint8_t, + z5_res = svget3_u8 (z4, 0), + z5_res = svget3 (z4, 0)) + +/* +** get3_u8_z5_1: +** ret +*/ +TEST_GET (get3_u8_z5_1, svuint8x3_t, svuint8_t, + z5_res = svget3_u8 (z4, 1), + z5_res = svget3 (z4, 1)) + +/* +** get3_u8_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get3_u8_z5_2, svuint8x3_t, svuint8_t, + z5_res = svget3_u8 (z4, 2), + z5_res = svget3 (z4, 2)) + +/* +** get3_u8_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get3_u8_z6_0, svuint8x3_t, svuint8_t, + z6_res = svget3_u8 (z4, 0), + z6_res = svget3 (z4, 0)) + +/* +** get3_u8_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get3_u8_z6_1, svuint8x3_t, svuint8_t, + z6_res = svget3_u8 (z4, 1), + z6_res = svget3 (z4, 1)) + +/* +** get3_u8_z6_2: +** ret +*/ +TEST_GET (get3_u8_z6_2, svuint8x3_t, svuint8_t, + z6_res = svget3_u8 (z4, 2), + z6_res = svget3 (z4, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_bf16.c new file mode 100644 index 000000000..f751fc147 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_bf16.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_bf16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_bf16_z0_0, svbfloat16x4_t, svbfloat16_t, + z0 = svget4_bf16 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_bf16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_bf16_z0_1, svbfloat16x4_t, svbfloat16_t, + z0 = svget4_bf16 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_bf16_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_bf16_z0_2, svbfloat16x4_t, svbfloat16_t, + z0 = svget4_bf16 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_bf16_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_bf16_z0_3, svbfloat16x4_t, svbfloat16_t, + z0 = svget4_bf16 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_bf16_z4_0: +** ret +*/ +TEST_GET (get4_bf16_z4_0, svbfloat16x4_t, svbfloat16_t, + z4_res = svget4_bf16 (z4, 0), + 
z4_res = svget4 (z4, 0)) + +/* +** get4_bf16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_bf16_z4_1, svbfloat16x4_t, svbfloat16_t, + z4_res = svget4_bf16 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_bf16_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_bf16_z4_2, svbfloat16x4_t, svbfloat16_t, + z4_res = svget4_bf16 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_bf16_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_bf16_z4_3, svbfloat16x4_t, svbfloat16_t, + z4_res = svget4_bf16 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_bf16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_bf16_z5_0, svbfloat16x4_t, svbfloat16_t, + z5_res = svget4_bf16 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_bf16_z5_1: +** ret +*/ +TEST_GET (get4_bf16_z5_1, svbfloat16x4_t, svbfloat16_t, + z5_res = svget4_bf16 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_bf16_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_bf16_z5_2, svbfloat16x4_t, svbfloat16_t, + z5_res = svget4_bf16 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_bf16_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_bf16_z5_3, svbfloat16x4_t, svbfloat16_t, + z5_res = svget4_bf16 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_bf16_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_bf16_z6_0, svbfloat16x4_t, svbfloat16_t, + z6_res = svget4_bf16 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_bf16_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_bf16_z6_1, svbfloat16x4_t, svbfloat16_t, + z6_res = svget4_bf16 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_bf16_z6_2: +** ret +*/ +TEST_GET (get4_bf16_z6_2, svbfloat16x4_t, svbfloat16_t, + z6_res = svget4_bf16 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_bf16_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_bf16_z6_3, svbfloat16x4_t, svbfloat16_t, + z6_res = svget4_bf16 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_bf16_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_bf16_z7_0, svbfloat16x4_t, svbfloat16_t, + z7_res = svget4_bf16 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_bf16_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_bf16_z7_1, svbfloat16x4_t, svbfloat16_t, + z7_res = svget4_bf16 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_bf16_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_bf16_z7_2, svbfloat16x4_t, svbfloat16_t, + z7_res = svget4_bf16 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_bf16_z7_3: +** ret +*/ +TEST_GET (get4_bf16_z7_3, svbfloat16x4_t, svbfloat16_t, + z7_res = svget4_bf16 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f16.c new file mode 100644 index 000000000..7871f6f4e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f16.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_f16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_f16_z0_0, svfloat16x4_t, svfloat16_t, + z0 = svget4_f16 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_f16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_f16_z0_1, svfloat16x4_t, svfloat16_t, + z0 = svget4_f16 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_f16_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_f16_z0_2, svfloat16x4_t, svfloat16_t, + z0 = svget4_f16 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_f16_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_f16_z0_3, svfloat16x4_t, svfloat16_t, 
+ z0 = svget4_f16 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_f16_z4_0: +** ret +*/ +TEST_GET (get4_f16_z4_0, svfloat16x4_t, svfloat16_t, + z4_res = svget4_f16 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_f16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_f16_z4_1, svfloat16x4_t, svfloat16_t, + z4_res = svget4_f16 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_f16_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_f16_z4_2, svfloat16x4_t, svfloat16_t, + z4_res = svget4_f16 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_f16_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_f16_z4_3, svfloat16x4_t, svfloat16_t, + z4_res = svget4_f16 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_f16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_f16_z5_0, svfloat16x4_t, svfloat16_t, + z5_res = svget4_f16 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_f16_z5_1: +** ret +*/ +TEST_GET (get4_f16_z5_1, svfloat16x4_t, svfloat16_t, + z5_res = svget4_f16 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_f16_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_f16_z5_2, svfloat16x4_t, svfloat16_t, + z5_res = svget4_f16 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_f16_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_f16_z5_3, svfloat16x4_t, svfloat16_t, + z5_res = svget4_f16 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_f16_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_f16_z6_0, svfloat16x4_t, svfloat16_t, + z6_res = svget4_f16 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_f16_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_f16_z6_1, svfloat16x4_t, svfloat16_t, + z6_res = svget4_f16 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_f16_z6_2: +** ret +*/ +TEST_GET (get4_f16_z6_2, svfloat16x4_t, svfloat16_t, + z6_res = svget4_f16 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_f16_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_f16_z6_3, svfloat16x4_t, svfloat16_t, + z6_res = svget4_f16 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_f16_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_f16_z7_0, svfloat16x4_t, svfloat16_t, + z7_res = svget4_f16 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_f16_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_f16_z7_1, svfloat16x4_t, svfloat16_t, + z7_res = svget4_f16 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_f16_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_f16_z7_2, svfloat16x4_t, svfloat16_t, + z7_res = svget4_f16 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_f16_z7_3: +** ret +*/ +TEST_GET (get4_f16_z7_3, svfloat16x4_t, svfloat16_t, + z7_res = svget4_f16 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f32.c new file mode 100644 index 000000000..a290e026d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f32.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_f32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_f32_z0_0, svfloat32x4_t, svfloat32_t, + z0 = svget4_f32 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_f32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_f32_z0_1, svfloat32x4_t, svfloat32_t, + z0 = svget4_f32 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_f32_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_f32_z0_2, svfloat32x4_t, svfloat32_t, + z0 = svget4_f32 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** 
get4_f32_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_f32_z0_3, svfloat32x4_t, svfloat32_t, + z0 = svget4_f32 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_f32_z4_0: +** ret +*/ +TEST_GET (get4_f32_z4_0, svfloat32x4_t, svfloat32_t, + z4_res = svget4_f32 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_f32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_f32_z4_1, svfloat32x4_t, svfloat32_t, + z4_res = svget4_f32 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_f32_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_f32_z4_2, svfloat32x4_t, svfloat32_t, + z4_res = svget4_f32 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_f32_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_f32_z4_3, svfloat32x4_t, svfloat32_t, + z4_res = svget4_f32 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_f32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_f32_z5_0, svfloat32x4_t, svfloat32_t, + z5_res = svget4_f32 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_f32_z5_1: +** ret +*/ +TEST_GET (get4_f32_z5_1, svfloat32x4_t, svfloat32_t, + z5_res = svget4_f32 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_f32_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_f32_z5_2, svfloat32x4_t, svfloat32_t, + z5_res = svget4_f32 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_f32_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_f32_z5_3, svfloat32x4_t, svfloat32_t, + z5_res = svget4_f32 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_f32_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_f32_z6_0, svfloat32x4_t, svfloat32_t, + z6_res = svget4_f32 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_f32_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_f32_z6_1, svfloat32x4_t, svfloat32_t, + z6_res = svget4_f32 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_f32_z6_2: +** ret +*/ +TEST_GET (get4_f32_z6_2, svfloat32x4_t, svfloat32_t, + z6_res = svget4_f32 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_f32_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_f32_z6_3, svfloat32x4_t, svfloat32_t, + z6_res = svget4_f32 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_f32_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_f32_z7_0, svfloat32x4_t, svfloat32_t, + z7_res = svget4_f32 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_f32_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_f32_z7_1, svfloat32x4_t, svfloat32_t, + z7_res = svget4_f32 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_f32_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_f32_z7_2, svfloat32x4_t, svfloat32_t, + z7_res = svget4_f32 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_f32_z7_3: +** ret +*/ +TEST_GET (get4_f32_z7_3, svfloat32x4_t, svfloat32_t, + z7_res = svget4_f32 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f64.c new file mode 100644 index 000000000..2c34dfef1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_f64.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_f64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_f64_z0_0, svfloat64x4_t, svfloat64_t, + z0 = svget4_f64 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_f64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_f64_z0_1, svfloat64x4_t, svfloat64_t, + z0 = svget4_f64 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_f64_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET 
(get4_f64_z0_2, svfloat64x4_t, svfloat64_t, + z0 = svget4_f64 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_f64_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_f64_z0_3, svfloat64x4_t, svfloat64_t, + z0 = svget4_f64 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_f64_z4_0: +** ret +*/ +TEST_GET (get4_f64_z4_0, svfloat64x4_t, svfloat64_t, + z4_res = svget4_f64 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_f64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_f64_z4_1, svfloat64x4_t, svfloat64_t, + z4_res = svget4_f64 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_f64_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_f64_z4_2, svfloat64x4_t, svfloat64_t, + z4_res = svget4_f64 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_f64_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_f64_z4_3, svfloat64x4_t, svfloat64_t, + z4_res = svget4_f64 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_f64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_f64_z5_0, svfloat64x4_t, svfloat64_t, + z5_res = svget4_f64 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_f64_z5_1: +** ret +*/ +TEST_GET (get4_f64_z5_1, svfloat64x4_t, svfloat64_t, + z5_res = svget4_f64 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_f64_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_f64_z5_2, svfloat64x4_t, svfloat64_t, + z5_res = svget4_f64 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_f64_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_f64_z5_3, svfloat64x4_t, svfloat64_t, + z5_res = svget4_f64 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_f64_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_f64_z6_0, svfloat64x4_t, svfloat64_t, + z6_res = svget4_f64 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_f64_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_f64_z6_1, svfloat64x4_t, svfloat64_t, + z6_res = svget4_f64 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_f64_z6_2: +** ret +*/ +TEST_GET (get4_f64_z6_2, svfloat64x4_t, svfloat64_t, + z6_res = svget4_f64 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_f64_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_f64_z6_3, svfloat64x4_t, svfloat64_t, + z6_res = svget4_f64 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_f64_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_f64_z7_0, svfloat64x4_t, svfloat64_t, + z7_res = svget4_f64 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_f64_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_f64_z7_1, svfloat64x4_t, svfloat64_t, + z7_res = svget4_f64 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_f64_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_f64_z7_2, svfloat64x4_t, svfloat64_t, + z7_res = svget4_f64 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_f64_z7_3: +** ret +*/ +TEST_GET (get4_f64_z7_3, svfloat64x4_t, svfloat64_t, + z7_res = svget4_f64 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s16.c new file mode 100644 index 000000000..6a2280fea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s16.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_s16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_s16_z0_0, svint16x4_t, svint16_t, + z0 = svget4_s16 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_s16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_s16_z0_1, svint16x4_t, svint16_t, + z0 = svget4_s16 (z4, 1), 
+ z0 = svget4 (z4, 1)) + +/* +** get4_s16_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_s16_z0_2, svint16x4_t, svint16_t, + z0 = svget4_s16 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_s16_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_s16_z0_3, svint16x4_t, svint16_t, + z0 = svget4_s16 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_s16_z4_0: +** ret +*/ +TEST_GET (get4_s16_z4_0, svint16x4_t, svint16_t, + z4_res = svget4_s16 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_s16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_s16_z4_1, svint16x4_t, svint16_t, + z4_res = svget4_s16 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_s16_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_s16_z4_2, svint16x4_t, svint16_t, + z4_res = svget4_s16 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_s16_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_s16_z4_3, svint16x4_t, svint16_t, + z4_res = svget4_s16 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_s16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_s16_z5_0, svint16x4_t, svint16_t, + z5_res = svget4_s16 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_s16_z5_1: +** ret +*/ +TEST_GET (get4_s16_z5_1, svint16x4_t, svint16_t, + z5_res = svget4_s16 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_s16_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_s16_z5_2, svint16x4_t, svint16_t, + z5_res = svget4_s16 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_s16_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_s16_z5_3, svint16x4_t, svint16_t, + z5_res = svget4_s16 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_s16_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_s16_z6_0, svint16x4_t, svint16_t, + z6_res = svget4_s16 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_s16_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_s16_z6_1, svint16x4_t, svint16_t, + z6_res = svget4_s16 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_s16_z6_2: +** ret +*/ +TEST_GET (get4_s16_z6_2, svint16x4_t, svint16_t, + z6_res = svget4_s16 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_s16_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_s16_z6_3, svint16x4_t, svint16_t, + z6_res = svget4_s16 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_s16_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_s16_z7_0, svint16x4_t, svint16_t, + z7_res = svget4_s16 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_s16_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_s16_z7_1, svint16x4_t, svint16_t, + z7_res = svget4_s16 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_s16_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_s16_z7_2, svint16x4_t, svint16_t, + z7_res = svget4_s16 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_s16_z7_3: +** ret +*/ +TEST_GET (get4_s16_z7_3, svint16x4_t, svint16_t, + z7_res = svget4_s16 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s32.c new file mode 100644 index 000000000..41aca09d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s32.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_s32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_s32_z0_0, svint32x4_t, svint32_t, + z0 = svget4_s32 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_s32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_s32_z0_1, svint32x4_t, svint32_t, + z0 = 
svget4_s32 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_s32_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_s32_z0_2, svint32x4_t, svint32_t, + z0 = svget4_s32 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_s32_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_s32_z0_3, svint32x4_t, svint32_t, + z0 = svget4_s32 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_s32_z4_0: +** ret +*/ +TEST_GET (get4_s32_z4_0, svint32x4_t, svint32_t, + z4_res = svget4_s32 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_s32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_s32_z4_1, svint32x4_t, svint32_t, + z4_res = svget4_s32 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_s32_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_s32_z4_2, svint32x4_t, svint32_t, + z4_res = svget4_s32 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_s32_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_s32_z4_3, svint32x4_t, svint32_t, + z4_res = svget4_s32 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_s32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_s32_z5_0, svint32x4_t, svint32_t, + z5_res = svget4_s32 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_s32_z5_1: +** ret +*/ +TEST_GET (get4_s32_z5_1, svint32x4_t, svint32_t, + z5_res = svget4_s32 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_s32_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_s32_z5_2, svint32x4_t, svint32_t, + z5_res = svget4_s32 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_s32_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_s32_z5_3, svint32x4_t, svint32_t, + z5_res = svget4_s32 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_s32_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_s32_z6_0, svint32x4_t, svint32_t, + z6_res = svget4_s32 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_s32_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_s32_z6_1, svint32x4_t, svint32_t, + z6_res = svget4_s32 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_s32_z6_2: +** ret +*/ +TEST_GET (get4_s32_z6_2, svint32x4_t, svint32_t, + z6_res = svget4_s32 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_s32_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_s32_z6_3, svint32x4_t, svint32_t, + z6_res = svget4_s32 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_s32_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_s32_z7_0, svint32x4_t, svint32_t, + z7_res = svget4_s32 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_s32_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_s32_z7_1, svint32x4_t, svint32_t, + z7_res = svget4_s32 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_s32_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_s32_z7_2, svint32x4_t, svint32_t, + z7_res = svget4_s32 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_s32_z7_3: +** ret +*/ +TEST_GET (get4_s32_z7_3, svint32x4_t, svint32_t, + z7_res = svget4_s32 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s64.c new file mode 100644 index 000000000..a17e2779c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s64.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_s64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_s64_z0_0, svint64x4_t, svint64_t, + z0 = svget4_s64 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_s64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_s64_z0_1, 
svint64x4_t, svint64_t, + z0 = svget4_s64 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_s64_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_s64_z0_2, svint64x4_t, svint64_t, + z0 = svget4_s64 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_s64_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_s64_z0_3, svint64x4_t, svint64_t, + z0 = svget4_s64 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_s64_z4_0: +** ret +*/ +TEST_GET (get4_s64_z4_0, svint64x4_t, svint64_t, + z4_res = svget4_s64 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_s64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_s64_z4_1, svint64x4_t, svint64_t, + z4_res = svget4_s64 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_s64_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_s64_z4_2, svint64x4_t, svint64_t, + z4_res = svget4_s64 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_s64_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_s64_z4_3, svint64x4_t, svint64_t, + z4_res = svget4_s64 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_s64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_s64_z5_0, svint64x4_t, svint64_t, + z5_res = svget4_s64 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_s64_z5_1: +** ret +*/ +TEST_GET (get4_s64_z5_1, svint64x4_t, svint64_t, + z5_res = svget4_s64 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_s64_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_s64_z5_2, svint64x4_t, svint64_t, + z5_res = svget4_s64 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_s64_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_s64_z5_3, svint64x4_t, svint64_t, + z5_res = svget4_s64 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_s64_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_s64_z6_0, svint64x4_t, svint64_t, + z6_res = svget4_s64 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_s64_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_s64_z6_1, svint64x4_t, svint64_t, + z6_res = svget4_s64 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_s64_z6_2: +** ret +*/ +TEST_GET (get4_s64_z6_2, svint64x4_t, svint64_t, + z6_res = svget4_s64 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_s64_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_s64_z6_3, svint64x4_t, svint64_t, + z6_res = svget4_s64 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_s64_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_s64_z7_0, svint64x4_t, svint64_t, + z7_res = svget4_s64 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_s64_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_s64_z7_1, svint64x4_t, svint64_t, + z7_res = svget4_s64 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_s64_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_s64_z7_2, svint64x4_t, svint64_t, + z7_res = svget4_s64 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_s64_z7_3: +** ret +*/ +TEST_GET (get4_s64_z7_3, svint64x4_t, svint64_t, + z7_res = svget4_s64 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s8.c new file mode 100644 index 000000000..9fa159597 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_s8.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_s8_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_s8_z0_0, svint8x4_t, svint8_t, + z0 = svget4_s8 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_s8_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET 
(get4_s8_z0_1, svint8x4_t, svint8_t, + z0 = svget4_s8 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_s8_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_s8_z0_2, svint8x4_t, svint8_t, + z0 = svget4_s8 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_s8_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_s8_z0_3, svint8x4_t, svint8_t, + z0 = svget4_s8 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_s8_z4_0: +** ret +*/ +TEST_GET (get4_s8_z4_0, svint8x4_t, svint8_t, + z4_res = svget4_s8 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_s8_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_s8_z4_1, svint8x4_t, svint8_t, + z4_res = svget4_s8 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_s8_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_s8_z4_2, svint8x4_t, svint8_t, + z4_res = svget4_s8 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_s8_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_s8_z4_3, svint8x4_t, svint8_t, + z4_res = svget4_s8 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_s8_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_s8_z5_0, svint8x4_t, svint8_t, + z5_res = svget4_s8 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_s8_z5_1: +** ret +*/ +TEST_GET (get4_s8_z5_1, svint8x4_t, svint8_t, + z5_res = svget4_s8 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_s8_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_s8_z5_2, svint8x4_t, svint8_t, + z5_res = svget4_s8 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_s8_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_s8_z5_3, svint8x4_t, svint8_t, + z5_res = svget4_s8 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_s8_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_s8_z6_0, svint8x4_t, svint8_t, + z6_res = svget4_s8 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_s8_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_s8_z6_1, svint8x4_t, svint8_t, + z6_res = svget4_s8 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_s8_z6_2: +** ret +*/ +TEST_GET (get4_s8_z6_2, svint8x4_t, svint8_t, + z6_res = svget4_s8 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_s8_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_s8_z6_3, svint8x4_t, svint8_t, + z6_res = svget4_s8 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_s8_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_s8_z7_0, svint8x4_t, svint8_t, + z7_res = svget4_s8 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_s8_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_s8_z7_1, svint8x4_t, svint8_t, + z7_res = svget4_s8 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_s8_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_s8_z7_2, svint8x4_t, svint8_t, + z7_res = svget4_s8 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_s8_z7_3: +** ret +*/ +TEST_GET (get4_s8_z7_3, svint8x4_t, svint8_t, + z7_res = svget4_s8 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u16.c new file mode 100644 index 000000000..8f17ad213 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u16.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_u16_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_u16_z0_0, svuint16x4_t, svuint16_t, + z0 = svget4_u16 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_u16_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_u16_z0_1, svuint16x4_t, svuint16_t, + z0 = svget4_u16 (z4, 1), + 
z0 = svget4 (z4, 1)) + +/* +** get4_u16_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_u16_z0_2, svuint16x4_t, svuint16_t, + z0 = svget4_u16 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_u16_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_u16_z0_3, svuint16x4_t, svuint16_t, + z0 = svget4_u16 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_u16_z4_0: +** ret +*/ +TEST_GET (get4_u16_z4_0, svuint16x4_t, svuint16_t, + z4_res = svget4_u16 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_u16_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_u16_z4_1, svuint16x4_t, svuint16_t, + z4_res = svget4_u16 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_u16_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_u16_z4_2, svuint16x4_t, svuint16_t, + z4_res = svget4_u16 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_u16_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_u16_z4_3, svuint16x4_t, svuint16_t, + z4_res = svget4_u16 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_u16_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_u16_z5_0, svuint16x4_t, svuint16_t, + z5_res = svget4_u16 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_u16_z5_1: +** ret +*/ +TEST_GET (get4_u16_z5_1, svuint16x4_t, svuint16_t, + z5_res = svget4_u16 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_u16_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_u16_z5_2, svuint16x4_t, svuint16_t, + z5_res = svget4_u16 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_u16_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_u16_z5_3, svuint16x4_t, svuint16_t, + z5_res = svget4_u16 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_u16_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_u16_z6_0, svuint16x4_t, svuint16_t, + z6_res = svget4_u16 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_u16_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_u16_z6_1, svuint16x4_t, svuint16_t, + z6_res = svget4_u16 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_u16_z6_2: +** ret +*/ +TEST_GET (get4_u16_z6_2, svuint16x4_t, svuint16_t, + z6_res = svget4_u16 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_u16_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_u16_z6_3, svuint16x4_t, svuint16_t, + z6_res = svget4_u16 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_u16_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_u16_z7_0, svuint16x4_t, svuint16_t, + z7_res = svget4_u16 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_u16_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_u16_z7_1, svuint16x4_t, svuint16_t, + z7_res = svget4_u16 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_u16_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_u16_z7_2, svuint16x4_t, svuint16_t, + z7_res = svget4_u16 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_u16_z7_3: +** ret +*/ +TEST_GET (get4_u16_z7_3, svuint16x4_t, svuint16_t, + z7_res = svget4_u16 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u32.c new file mode 100644 index 000000000..e6c94b39d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u32.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_u32_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_u32_z0_0, svuint32x4_t, svuint32_t, + z0 = svget4_u32 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_u32_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET 
(get4_u32_z0_1, svuint32x4_t, svuint32_t, + z0 = svget4_u32 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_u32_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_u32_z0_2, svuint32x4_t, svuint32_t, + z0 = svget4_u32 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_u32_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_u32_z0_3, svuint32x4_t, svuint32_t, + z0 = svget4_u32 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_u32_z4_0: +** ret +*/ +TEST_GET (get4_u32_z4_0, svuint32x4_t, svuint32_t, + z4_res = svget4_u32 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_u32_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_u32_z4_1, svuint32x4_t, svuint32_t, + z4_res = svget4_u32 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_u32_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_u32_z4_2, svuint32x4_t, svuint32_t, + z4_res = svget4_u32 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_u32_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_u32_z4_3, svuint32x4_t, svuint32_t, + z4_res = svget4_u32 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_u32_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_u32_z5_0, svuint32x4_t, svuint32_t, + z5_res = svget4_u32 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_u32_z5_1: +** ret +*/ +TEST_GET (get4_u32_z5_1, svuint32x4_t, svuint32_t, + z5_res = svget4_u32 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_u32_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_u32_z5_2, svuint32x4_t, svuint32_t, + z5_res = svget4_u32 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_u32_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_u32_z5_3, svuint32x4_t, svuint32_t, + z5_res = svget4_u32 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_u32_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_u32_z6_0, svuint32x4_t, svuint32_t, + z6_res = svget4_u32 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_u32_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_u32_z6_1, svuint32x4_t, svuint32_t, + z6_res = svget4_u32 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_u32_z6_2: +** ret +*/ +TEST_GET (get4_u32_z6_2, svuint32x4_t, svuint32_t, + z6_res = svget4_u32 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_u32_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_u32_z6_3, svuint32x4_t, svuint32_t, + z6_res = svget4_u32 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_u32_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_u32_z7_0, svuint32x4_t, svuint32_t, + z7_res = svget4_u32 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_u32_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_u32_z7_1, svuint32x4_t, svuint32_t, + z7_res = svget4_u32 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_u32_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_u32_z7_2, svuint32x4_t, svuint32_t, + z7_res = svget4_u32 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_u32_z7_3: +** ret +*/ +TEST_GET (get4_u32_z7_3, svuint32x4_t, svuint32_t, + z7_res = svget4_u32 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u64.c new file mode 100644 index 000000000..79c293a2c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u64.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_u64_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_u64_z0_0, svuint64x4_t, svuint64_t, + z0 = svget4_u64 (z4, 0), + z0 = svget4 (z4, 0)) + +/* 
+** get4_u64_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_u64_z0_1, svuint64x4_t, svuint64_t, + z0 = svget4_u64 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_u64_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_u64_z0_2, svuint64x4_t, svuint64_t, + z0 = svget4_u64 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_u64_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_u64_z0_3, svuint64x4_t, svuint64_t, + z0 = svget4_u64 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_u64_z4_0: +** ret +*/ +TEST_GET (get4_u64_z4_0, svuint64x4_t, svuint64_t, + z4_res = svget4_u64 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_u64_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_u64_z4_1, svuint64x4_t, svuint64_t, + z4_res = svget4_u64 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_u64_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_u64_z4_2, svuint64x4_t, svuint64_t, + z4_res = svget4_u64 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_u64_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_u64_z4_3, svuint64x4_t, svuint64_t, + z4_res = svget4_u64 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_u64_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_u64_z5_0, svuint64x4_t, svuint64_t, + z5_res = svget4_u64 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_u64_z5_1: +** ret +*/ +TEST_GET (get4_u64_z5_1, svuint64x4_t, svuint64_t, + z5_res = svget4_u64 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_u64_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_u64_z5_2, svuint64x4_t, svuint64_t, + z5_res = svget4_u64 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_u64_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_u64_z5_3, svuint64x4_t, svuint64_t, + z5_res = svget4_u64 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_u64_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_u64_z6_0, svuint64x4_t, svuint64_t, + z6_res = svget4_u64 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_u64_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_u64_z6_1, svuint64x4_t, svuint64_t, + z6_res = svget4_u64 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_u64_z6_2: +** ret +*/ +TEST_GET (get4_u64_z6_2, svuint64x4_t, svuint64_t, + z6_res = svget4_u64 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_u64_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_u64_z6_3, svuint64x4_t, svuint64_t, + z6_res = svget4_u64 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_u64_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_u64_z7_0, svuint64x4_t, svuint64_t, + z7_res = svget4_u64 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_u64_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_u64_z7_1, svuint64x4_t, svuint64_t, + z7_res = svget4_u64 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_u64_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_u64_z7_2, svuint64x4_t, svuint64_t, + z7_res = svget4_u64 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_u64_z7_3: +** ret +*/ +TEST_GET (get4_u64_z7_3, svuint64x4_t, svuint64_t, + z7_res = svget4_u64 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u8.c new file mode 100644 index 000000000..f3ad9a85b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/get4_u8.c @@ -0,0 +1,179 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** get4_u8_z0_0: +** mov z0\.d, z4\.d +** ret +*/ +TEST_GET (get4_u8_z0_0, svuint8x4_t, svuint8_t, + 
z0 = svget4_u8 (z4, 0), + z0 = svget4 (z4, 0)) + +/* +** get4_u8_z0_1: +** mov z0\.d, z5\.d +** ret +*/ +TEST_GET (get4_u8_z0_1, svuint8x4_t, svuint8_t, + z0 = svget4_u8 (z4, 1), + z0 = svget4 (z4, 1)) + +/* +** get4_u8_z0_2: +** mov z0\.d, z6\.d +** ret +*/ +TEST_GET (get4_u8_z0_2, svuint8x4_t, svuint8_t, + z0 = svget4_u8 (z4, 2), + z0 = svget4 (z4, 2)) + +/* +** get4_u8_z0_3: +** mov z0\.d, z7\.d +** ret +*/ +TEST_GET (get4_u8_z0_3, svuint8x4_t, svuint8_t, + z0 = svget4_u8 (z4, 3), + z0 = svget4 (z4, 3)) + +/* +** get4_u8_z4_0: +** ret +*/ +TEST_GET (get4_u8_z4_0, svuint8x4_t, svuint8_t, + z4_res = svget4_u8 (z4, 0), + z4_res = svget4 (z4, 0)) + +/* +** get4_u8_z4_1: +** mov z4\.d, z5\.d +** ret +*/ +TEST_GET (get4_u8_z4_1, svuint8x4_t, svuint8_t, + z4_res = svget4_u8 (z4, 1), + z4_res = svget4 (z4, 1)) + +/* +** get4_u8_z4_2: +** mov z4\.d, z6\.d +** ret +*/ +TEST_GET (get4_u8_z4_2, svuint8x4_t, svuint8_t, + z4_res = svget4_u8 (z4, 2), + z4_res = svget4 (z4, 2)) + +/* +** get4_u8_z4_3: +** mov z4\.d, z7\.d +** ret +*/ +TEST_GET (get4_u8_z4_3, svuint8x4_t, svuint8_t, + z4_res = svget4_u8 (z4, 3), + z4_res = svget4 (z4, 3)) + +/* +** get4_u8_z5_0: +** mov z5\.d, z4\.d +** ret +*/ +TEST_GET (get4_u8_z5_0, svuint8x4_t, svuint8_t, + z5_res = svget4_u8 (z4, 0), + z5_res = svget4 (z4, 0)) + +/* +** get4_u8_z5_1: +** ret +*/ +TEST_GET (get4_u8_z5_1, svuint8x4_t, svuint8_t, + z5_res = svget4_u8 (z4, 1), + z5_res = svget4 (z4, 1)) + +/* +** get4_u8_z5_2: +** mov z5\.d, z6\.d +** ret +*/ +TEST_GET (get4_u8_z5_2, svuint8x4_t, svuint8_t, + z5_res = svget4_u8 (z4, 2), + z5_res = svget4 (z4, 2)) + +/* +** get4_u8_z5_3: +** mov z5\.d, z7\.d +** ret +*/ +TEST_GET (get4_u8_z5_3, svuint8x4_t, svuint8_t, + z5_res = svget4_u8 (z4, 3), + z5_res = svget4 (z4, 3)) + +/* +** get4_u8_z6_0: +** mov z6\.d, z4\.d +** ret +*/ +TEST_GET (get4_u8_z6_0, svuint8x4_t, svuint8_t, + z6_res = svget4_u8 (z4, 0), + z6_res = svget4 (z4, 0)) + +/* +** get4_u8_z6_1: +** mov z6\.d, z5\.d +** ret +*/ +TEST_GET (get4_u8_z6_1, svuint8x4_t, svuint8_t, + z6_res = svget4_u8 (z4, 1), + z6_res = svget4 (z4, 1)) + +/* +** get4_u8_z6_2: +** ret +*/ +TEST_GET (get4_u8_z6_2, svuint8x4_t, svuint8_t, + z6_res = svget4_u8 (z4, 2), + z6_res = svget4 (z4, 2)) + +/* +** get4_u8_z6_3: +** mov z6\.d, z7\.d +** ret +*/ +TEST_GET (get4_u8_z6_3, svuint8x4_t, svuint8_t, + z6_res = svget4_u8 (z4, 3), + z6_res = svget4 (z4, 3)) + +/* +** get4_u8_z7_0: +** mov z7\.d, z4\.d +** ret +*/ +TEST_GET (get4_u8_z7_0, svuint8x4_t, svuint8_t, + z7_res = svget4_u8 (z4, 0), + z7_res = svget4 (z4, 0)) + +/* +** get4_u8_z7_1: +** mov z7\.d, z5\.d +** ret +*/ +TEST_GET (get4_u8_z7_1, svuint8x4_t, svuint8_t, + z7_res = svget4_u8 (z4, 1), + z7_res = svget4 (z4, 1)) + +/* +** get4_u8_z7_2: +** mov z7\.d, z6\.d +** ret +*/ +TEST_GET (get4_u8_z7_2, svuint8x4_t, svuint8_t, + z7_res = svget4_u8 (z4, 2), + z7_res = svget4 (z4, 2)) + +/* +** get4_u8_z7_3: +** ret +*/ +TEST_GET (get4_u8_z7_3, svuint8x4_t, svuint8_t, + z7_res = svget4_u8 (z4, 3), + z7_res = svget4 (z4, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s16.c new file mode 100644 index 000000000..90a1434f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s16.c @@ -0,0 +1,220 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** index_s16_w0_w1: +** index z0\.h, w0, w1 +** ret +*/ +TEST_S (index_s16_w0_w1, svint16_t, int16_t, + z0 = svindex_s16 (x0, x1)) + +/* 
+** index_s16_w0_2: +** index z0\.h, w0, #2 +** ret +*/ +TEST_S (index_s16_w0_2, svint16_t, int16_t, + z0 = svindex_s16 (x0, 2)) + +/* +** index_s16_50_2: +** mov (w[0-9]+), 50 +** index z0\.h, \1, #2 +** ret +*/ +TEST_S (index_s16_50_2, svint16_t, int16_t, + z0 = svindex_s16 (50, 2)) + +/* +** index_s16_0_m17: +** mov (w[0-9]+), -17 +** index z0\.h, #0, \1 +** ret +*/ +TEST_S (index_s16_0_m17, svint16_t, int16_t, + z0 = svindex_s16 (0, -17)) + +/* +** index_s16_0_m16: +** index z0\.h, #0, #-16 +** ret +*/ +TEST_S (index_s16_0_m16, svint16_t, int16_t, + z0 = svindex_s16 (0, -16)) + +/* +** index_s16_0_1: +** index z0\.h, #0, #1 +** ret +*/ +TEST_S (index_s16_0_1, svint16_t, int16_t, + z0 = svindex_s16 (0, 1)) + +/* +** index_s16_0_15: +** index z0\.h, #0, #15 +** ret +*/ +TEST_S (index_s16_0_15, svint16_t, int16_t, + z0 = svindex_s16 (0, 15)) + +/* +** index_s16_0_16: +** mov (w[0-9]+), 16 +** index z0\.h, #0, \1 +** ret +*/ +TEST_S (index_s16_0_16, svint16_t, int16_t, + z0 = svindex_s16 (0, 16)) + +/* +** index_s16_m17_1: +** mov (w[0-9]+), -17 +** index z0\.h, \1, #1 +** ret +*/ +TEST_S (index_s16_m17_1, svint16_t, int16_t, + z0 = svindex_s16 (-17, 1)) + +/* +** index_s16_m16_1: +** index z0\.h, #-16, #1 +** ret +*/ +TEST_S (index_s16_m16_1, svint16_t, int16_t, + z0 = svindex_s16 (-16, 1)) + +/* +** index_s16_m1_1: +** index z0\.h, #-1, #1 +** ret +*/ +TEST_S (index_s16_m1_1, svint16_t, int16_t, + z0 = svindex_s16 (-1, 1)) + +/* +** index_s16_1_1: +** index z0\.h, #1, #1 +** ret +*/ +TEST_S (index_s16_1_1, svint16_t, int16_t, + z0 = svindex_s16 (1, 1)) + +/* +** index_s16_15_1: +** index z0\.h, #15, #1 +** ret +*/ +TEST_S (index_s16_15_1, svint16_t, int16_t, + z0 = svindex_s16 (15, 1)) + +/* +** index_s16_16_1: +** mov (w[0-9]+), 16 +** index z0\.h, \1, #1 +** ret +*/ +TEST_S (index_s16_16_1, svint16_t, int16_t, + z0 = svindex_s16 (16, 1)) + +/* +** index_s16_m17_x0: +** mov (w[0-9]+), -17 +** index z0\.h, \1, w0 +** ret +*/ +TEST_S (index_s16_m17_x0, svint16_t, int16_t, + z0 = svindex_s16 (-17, x0)) + +/* +** index_s16_m16_x0: +** index z0\.h, #-16, w0 +** ret +*/ +TEST_S (index_s16_m16_x0, svint16_t, int16_t, + z0 = svindex_s16 (-16, x0)) + +/* +** index_s16_m1_x0: +** index z0\.h, #-1, w0 +** ret +*/ +TEST_S (index_s16_m1_x0, svint16_t, int16_t, + z0 = svindex_s16 (-1, x0)) + +/* +** index_s16_0_x0: +** index z0\.h, #0, w0 +** ret +*/ +TEST_S (index_s16_0_x0, svint16_t, int16_t, + z0 = svindex_s16 (0, x0)) + +/* +** index_s16_1_x0: +** index z0\.h, #1, w0 +** ret +*/ +TEST_S (index_s16_1_x0, svint16_t, int16_t, + z0 = svindex_s16 (1, x0)) + +/* +** index_s16_15_x0: +** index z0\.h, #15, w0 +** ret +*/ +TEST_S (index_s16_15_x0, svint16_t, int16_t, + z0 = svindex_s16 (15, x0)) + +/* +** index_s16_16_x0: +** mov (w[0-9]+), 16 +** index z0\.h, \1, w0 +** ret +*/ +TEST_S (index_s16_16_x0, svint16_t, int16_t, + z0 = svindex_s16 (16, x0)) + +/* +** index_s16_x0_m17: +** mov (w[0-9]+), -17 +** index z0\.h, w0, \1 +** ret +*/ +TEST_S (index_s16_x0_m17, svint16_t, int16_t, + z0 = svindex_s16 (x0, -17)) + +/* +** index_s16_x0_m16: +** index z0\.h, w0, #-16 +** ret +*/ +TEST_S (index_s16_x0_m16, svint16_t, int16_t, + z0 = svindex_s16 (x0, -16)) + +/* +** index_s16_x0_1: +** index z0\.h, w0, #1 +** ret +*/ +TEST_S (index_s16_x0_1, svint16_t, int16_t, + z0 = svindex_s16 (x0, 1)) + +/* +** index_s16_x0_15: +** index z0\.h, w0, #15 +** ret +*/ +TEST_S (index_s16_x0_15, svint16_t, int16_t, + z0 = svindex_s16 (x0, 15)) + +/* +** index_s16_x0_16: +** mov (w[0-9]+), 16 +** index z0\.h, w0, \1 +** ret +*/ 
+TEST_S (index_s16_x0_16, svint16_t, int16_t, + z0 = svindex_s16 (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s32.c new file mode 100644 index 000000000..18afedac0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s32.c @@ -0,0 +1,220 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** index_s32_w0_w1: +** index z0\.s, w0, w1 +** ret +*/ +TEST_S (index_s32_w0_w1, svint32_t, int32_t, + z0 = svindex_s32 (x0, x1)) + +/* +** index_s32_w0_2: +** index z0\.s, w0, #2 +** ret +*/ +TEST_S (index_s32_w0_2, svint32_t, int32_t, + z0 = svindex_s32 (x0, 2)) + +/* +** index_s32_50_2: +** mov (w[0-9]+), 50 +** index z0\.s, \1, #2 +** ret +*/ +TEST_S (index_s32_50_2, svint32_t, int32_t, + z0 = svindex_s32 (50, 2)) + +/* +** index_s32_0_m17: +** mov (w[0-9]+), -17 +** index z0\.s, #0, \1 +** ret +*/ +TEST_S (index_s32_0_m17, svint32_t, int32_t, + z0 = svindex_s32 (0, -17)) + +/* +** index_s32_0_m16: +** index z0\.s, #0, #-16 +** ret +*/ +TEST_S (index_s32_0_m16, svint32_t, int32_t, + z0 = svindex_s32 (0, -16)) + +/* +** index_s32_0_1: +** index z0\.s, #0, #1 +** ret +*/ +TEST_S (index_s32_0_1, svint32_t, int32_t, + z0 = svindex_s32 (0, 1)) + +/* +** index_s32_0_15: +** index z0\.s, #0, #15 +** ret +*/ +TEST_S (index_s32_0_15, svint32_t, int32_t, + z0 = svindex_s32 (0, 15)) + +/* +** index_s32_0_16: +** mov (w[0-9]+), 16 +** index z0\.s, #0, \1 +** ret +*/ +TEST_S (index_s32_0_16, svint32_t, int32_t, + z0 = svindex_s32 (0, 16)) + +/* +** index_s32_m17_1: +** mov (w[0-9]+), -17 +** index z0\.s, \1, #1 +** ret +*/ +TEST_S (index_s32_m17_1, svint32_t, int32_t, + z0 = svindex_s32 (-17, 1)) + +/* +** index_s32_m16_1: +** index z0\.s, #-16, #1 +** ret +*/ +TEST_S (index_s32_m16_1, svint32_t, int32_t, + z0 = svindex_s32 (-16, 1)) + +/* +** index_s32_m1_1: +** index z0\.s, #-1, #1 +** ret +*/ +TEST_S (index_s32_m1_1, svint32_t, int32_t, + z0 = svindex_s32 (-1, 1)) + +/* +** index_s32_1_1: +** index z0\.s, #1, #1 +** ret +*/ +TEST_S (index_s32_1_1, svint32_t, int32_t, + z0 = svindex_s32 (1, 1)) + +/* +** index_s32_15_1: +** index z0\.s, #15, #1 +** ret +*/ +TEST_S (index_s32_15_1, svint32_t, int32_t, + z0 = svindex_s32 (15, 1)) + +/* +** index_s32_16_1: +** mov (w[0-9]+), 16 +** index z0\.s, \1, #1 +** ret +*/ +TEST_S (index_s32_16_1, svint32_t, int32_t, + z0 = svindex_s32 (16, 1)) + +/* +** index_s32_m17_x0: +** mov (w[0-9]+), -17 +** index z0\.s, \1, w0 +** ret +*/ +TEST_S (index_s32_m17_x0, svint32_t, int32_t, + z0 = svindex_s32 (-17, x0)) + +/* +** index_s32_m16_x0: +** index z0\.s, #-16, w0 +** ret +*/ +TEST_S (index_s32_m16_x0, svint32_t, int32_t, + z0 = svindex_s32 (-16, x0)) + +/* +** index_s32_m1_x0: +** index z0\.s, #-1, w0 +** ret +*/ +TEST_S (index_s32_m1_x0, svint32_t, int32_t, + z0 = svindex_s32 (-1, x0)) + +/* +** index_s32_0_x0: +** index z0\.s, #0, w0 +** ret +*/ +TEST_S (index_s32_0_x0, svint32_t, int32_t, + z0 = svindex_s32 (0, x0)) + +/* +** index_s32_1_x0: +** index z0\.s, #1, w0 +** ret +*/ +TEST_S (index_s32_1_x0, svint32_t, int32_t, + z0 = svindex_s32 (1, x0)) + +/* +** index_s32_15_x0: +** index z0\.s, #15, w0 +** ret +*/ +TEST_S (index_s32_15_x0, svint32_t, int32_t, + z0 = svindex_s32 (15, x0)) + +/* +** index_s32_16_x0: +** mov (w[0-9]+), 16 +** index z0\.s, \1, w0 +** ret +*/ +TEST_S (index_s32_16_x0, svint32_t, int32_t, + z0 = svindex_s32 (16, x0)) + +/* +** index_s32_x0_m17: +** mov (w[0-9]+), -17 +** index z0\.s, w0, 
\1 +** ret +*/ +TEST_S (index_s32_x0_m17, svint32_t, int32_t, + z0 = svindex_s32 (x0, -17)) + +/* +** index_s32_x0_m16: +** index z0\.s, w0, #-16 +** ret +*/ +TEST_S (index_s32_x0_m16, svint32_t, int32_t, + z0 = svindex_s32 (x0, -16)) + +/* +** index_s32_x0_1: +** index z0\.s, w0, #1 +** ret +*/ +TEST_S (index_s32_x0_1, svint32_t, int32_t, + z0 = svindex_s32 (x0, 1)) + +/* +** index_s32_x0_15: +** index z0\.s, w0, #15 +** ret +*/ +TEST_S (index_s32_x0_15, svint32_t, int32_t, + z0 = svindex_s32 (x0, 15)) + +/* +** index_s32_x0_16: +** mov (w[0-9]+), 16 +** index z0\.s, w0, \1 +** ret +*/ +TEST_S (index_s32_x0_16, svint32_t, int32_t, + z0 = svindex_s32 (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s64.c new file mode 100644 index 000000000..298eec9ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s64.c @@ -0,0 +1,220 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** index_s64_x0_x1: +** index z0\.d, x0, x1 +** ret +*/ +TEST_S (index_s64_x0_x1, svint64_t, int64_t, + z0 = svindex_s64 (x0, x1)) + +/* +** index_s64_x0_2: +** index z0\.d, x0, #2 +** ret +*/ +TEST_S (index_s64_x0_2, svint64_t, int64_t, + z0 = svindex_s64 (x0, 2)) + +/* +** index_s64_50_2: +** mov (x[0-9]+), 50 +** index z0\.d, \1, #2 +** ret +*/ +TEST_S (index_s64_50_2, svint64_t, int64_t, + z0 = svindex_s64 (50, 2)) + +/* +** index_s64_0_m17: +** mov (x[0-9]+), -17 +** index z0\.d, #0, \1 +** ret +*/ +TEST_S (index_s64_0_m17, svint64_t, int64_t, + z0 = svindex_s64 (0, -17)) + +/* +** index_s64_0_m16: +** index z0\.d, #0, #-16 +** ret +*/ +TEST_S (index_s64_0_m16, svint64_t, int64_t, + z0 = svindex_s64 (0, -16)) + +/* +** index_s64_0_1: +** index z0\.d, #0, #1 +** ret +*/ +TEST_S (index_s64_0_1, svint64_t, int64_t, + z0 = svindex_s64 (0, 1)) + +/* +** index_s64_0_15: +** index z0\.d, #0, #15 +** ret +*/ +TEST_S (index_s64_0_15, svint64_t, int64_t, + z0 = svindex_s64 (0, 15)) + +/* +** index_s64_0_16: +** mov (x[0-9]+), 16 +** index z0\.d, #0, \1 +** ret +*/ +TEST_S (index_s64_0_16, svint64_t, int64_t, + z0 = svindex_s64 (0, 16)) + +/* +** index_s64_m17_1: +** mov (x[0-9]+), -17 +** index z0\.d, \1, #1 +** ret +*/ +TEST_S (index_s64_m17_1, svint64_t, int64_t, + z0 = svindex_s64 (-17, 1)) + +/* +** index_s64_m16_1: +** index z0\.d, #-16, #1 +** ret +*/ +TEST_S (index_s64_m16_1, svint64_t, int64_t, + z0 = svindex_s64 (-16, 1)) + +/* +** index_s64_m1_1: +** index z0\.d, #-1, #1 +** ret +*/ +TEST_S (index_s64_m1_1, svint64_t, int64_t, + z0 = svindex_s64 (-1, 1)) + +/* +** index_s64_1_1: +** index z0\.d, #1, #1 +** ret +*/ +TEST_S (index_s64_1_1, svint64_t, int64_t, + z0 = svindex_s64 (1, 1)) + +/* +** index_s64_15_1: +** index z0\.d, #15, #1 +** ret +*/ +TEST_S (index_s64_15_1, svint64_t, int64_t, + z0 = svindex_s64 (15, 1)) + +/* +** index_s64_16_1: +** mov (x[0-9]+), 16 +** index z0\.d, \1, #1 +** ret +*/ +TEST_S (index_s64_16_1, svint64_t, int64_t, + z0 = svindex_s64 (16, 1)) + +/* +** index_s64_m17_x0: +** mov (x[0-9]+), -17 +** index z0\.d, \1, x0 +** ret +*/ +TEST_S (index_s64_m17_x0, svint64_t, int64_t, + z0 = svindex_s64 (-17, x0)) + +/* +** index_s64_m16_x0: +** index z0\.d, #-16, x0 +** ret +*/ +TEST_S (index_s64_m16_x0, svint64_t, int64_t, + z0 = svindex_s64 (-16, x0)) + +/* +** index_s64_m1_x0: +** index z0\.d, #-1, x0 +** ret +*/ +TEST_S (index_s64_m1_x0, svint64_t, int64_t, + z0 = svindex_s64 (-1, x0)) + +/* +** index_s64_0_x0: +** index z0\.d, #0, 
x0 +** ret +*/ +TEST_S (index_s64_0_x0, svint64_t, int64_t, + z0 = svindex_s64 (0, x0)) + +/* +** index_s64_1_x0: +** index z0\.d, #1, x0 +** ret +*/ +TEST_S (index_s64_1_x0, svint64_t, int64_t, + z0 = svindex_s64 (1, x0)) + +/* +** index_s64_15_x0: +** index z0\.d, #15, x0 +** ret +*/ +TEST_S (index_s64_15_x0, svint64_t, int64_t, + z0 = svindex_s64 (15, x0)) + +/* +** index_s64_16_x0: +** mov (x[0-9]+), 16 +** index z0\.d, \1, x0 +** ret +*/ +TEST_S (index_s64_16_x0, svint64_t, int64_t, + z0 = svindex_s64 (16, x0)) + +/* +** index_s64_x0_m17: +** mov (x[0-9]+), -17 +** index z0\.d, x0, \1 +** ret +*/ +TEST_S (index_s64_x0_m17, svint64_t, int64_t, + z0 = svindex_s64 (x0, -17)) + +/* +** index_s64_x0_m16: +** index z0\.d, x0, #-16 +** ret +*/ +TEST_S (index_s64_x0_m16, svint64_t, int64_t, + z0 = svindex_s64 (x0, -16)) + +/* +** index_s64_x0_1: +** index z0\.d, x0, #1 +** ret +*/ +TEST_S (index_s64_x0_1, svint64_t, int64_t, + z0 = svindex_s64 (x0, 1)) + +/* +** index_s64_x0_15: +** index z0\.d, x0, #15 +** ret +*/ +TEST_S (index_s64_x0_15, svint64_t, int64_t, + z0 = svindex_s64 (x0, 15)) + +/* +** index_s64_x0_16: +** mov (x[0-9]+), 16 +** index z0\.d, x0, \1 +** ret +*/ +TEST_S (index_s64_x0_16, svint64_t, int64_t, + z0 = svindex_s64 (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s8.c new file mode 100644 index 000000000..8a1f14f50 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_s8.c @@ -0,0 +1,220 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** index_s8_w0_w1: +** index z0\.b, w0, w1 +** ret +*/ +TEST_S (index_s8_w0_w1, svint8_t, int8_t, + z0 = svindex_s8 (x0, x1)) + +/* +** index_s8_w0_2: +** index z0\.b, w0, #2 +** ret +*/ +TEST_S (index_s8_w0_2, svint8_t, int8_t, + z0 = svindex_s8 (x0, 2)) + +/* +** index_s8_50_2: +** mov (w[0-9]+), 50 +** index z0\.b, \1, #2 +** ret +*/ +TEST_S (index_s8_50_2, svint8_t, int8_t, + z0 = svindex_s8 (50, 2)) + +/* +** index_s8_0_m17: +** mov (w[0-9]+), -17 +** index z0\.b, #0, \1 +** ret +*/ +TEST_S (index_s8_0_m17, svint8_t, int8_t, + z0 = svindex_s8 (0, -17)) + +/* +** index_s8_0_m16: +** index z0\.b, #0, #-16 +** ret +*/ +TEST_S (index_s8_0_m16, svint8_t, int8_t, + z0 = svindex_s8 (0, -16)) + +/* +** index_s8_0_1: +** index z0\.b, #0, #1 +** ret +*/ +TEST_S (index_s8_0_1, svint8_t, int8_t, + z0 = svindex_s8 (0, 1)) + +/* +** index_s8_0_15: +** index z0\.b, #0, #15 +** ret +*/ +TEST_S (index_s8_0_15, svint8_t, int8_t, + z0 = svindex_s8 (0, 15)) + +/* +** index_s8_0_16: +** mov (w[0-9]+), 16 +** index z0\.b, #0, \1 +** ret +*/ +TEST_S (index_s8_0_16, svint8_t, int8_t, + z0 = svindex_s8 (0, 16)) + +/* +** index_s8_m17_1: +** mov (w[0-9]+), -17 +** index z0\.b, \1, #1 +** ret +*/ +TEST_S (index_s8_m17_1, svint8_t, int8_t, + z0 = svindex_s8 (-17, 1)) + +/* +** index_s8_m16_1: +** index z0\.b, #-16, #1 +** ret +*/ +TEST_S (index_s8_m16_1, svint8_t, int8_t, + z0 = svindex_s8 (-16, 1)) + +/* +** index_s8_m1_1: +** index z0\.b, #-1, #1 +** ret +*/ +TEST_S (index_s8_m1_1, svint8_t, int8_t, + z0 = svindex_s8 (-1, 1)) + +/* +** index_s8_1_1: +** index z0\.b, #1, #1 +** ret +*/ +TEST_S (index_s8_1_1, svint8_t, int8_t, + z0 = svindex_s8 (1, 1)) + +/* +** index_s8_15_1: +** index z0\.b, #15, #1 +** ret +*/ +TEST_S (index_s8_15_1, svint8_t, int8_t, + z0 = svindex_s8 (15, 1)) + +/* +** index_s8_16_1: +** mov (w[0-9]+), 16 +** index z0\.b, \1, #1 +** ret +*/ +TEST_S (index_s8_16_1, svint8_t, int8_t, 
+ z0 = svindex_s8 (16, 1)) + +/* +** index_s8_m17_x0: +** mov (w[0-9]+), -17 +** index z0\.b, \1, w0 +** ret +*/ +TEST_S (index_s8_m17_x0, svint8_t, int8_t, + z0 = svindex_s8 (-17, x0)) + +/* +** index_s8_m16_x0: +** index z0\.b, #-16, w0 +** ret +*/ +TEST_S (index_s8_m16_x0, svint8_t, int8_t, + z0 = svindex_s8 (-16, x0)) + +/* +** index_s8_m1_x0: +** index z0\.b, #-1, w0 +** ret +*/ +TEST_S (index_s8_m1_x0, svint8_t, int8_t, + z0 = svindex_s8 (-1, x0)) + +/* +** index_s8_0_x0: +** index z0\.b, #0, w0 +** ret +*/ +TEST_S (index_s8_0_x0, svint8_t, int8_t, + z0 = svindex_s8 (0, x0)) + +/* +** index_s8_1_x0: +** index z0\.b, #1, w0 +** ret +*/ +TEST_S (index_s8_1_x0, svint8_t, int8_t, + z0 = svindex_s8 (1, x0)) + +/* +** index_s8_15_x0: +** index z0\.b, #15, w0 +** ret +*/ +TEST_S (index_s8_15_x0, svint8_t, int8_t, + z0 = svindex_s8 (15, x0)) + +/* +** index_s8_16_x0: +** mov (w[0-9]+), 16 +** index z0\.b, \1, w0 +** ret +*/ +TEST_S (index_s8_16_x0, svint8_t, int8_t, + z0 = svindex_s8 (16, x0)) + +/* +** index_s8_x0_m17: +** mov (w[0-9]+), -17 +** index z0\.b, w0, \1 +** ret +*/ +TEST_S (index_s8_x0_m17, svint8_t, int8_t, + z0 = svindex_s8 (x0, -17)) + +/* +** index_s8_x0_m16: +** index z0\.b, w0, #-16 +** ret +*/ +TEST_S (index_s8_x0_m16, svint8_t, int8_t, + z0 = svindex_s8 (x0, -16)) + +/* +** index_s8_x0_1: +** index z0\.b, w0, #1 +** ret +*/ +TEST_S (index_s8_x0_1, svint8_t, int8_t, + z0 = svindex_s8 (x0, 1)) + +/* +** index_s8_x0_15: +** index z0\.b, w0, #15 +** ret +*/ +TEST_S (index_s8_x0_15, svint8_t, int8_t, + z0 = svindex_s8 (x0, 15)) + +/* +** index_s8_x0_16: +** mov (w[0-9]+), 16 +** index z0\.b, w0, \1 +** ret +*/ +TEST_S (index_s8_x0_16, svint8_t, int8_t, + z0 = svindex_s8 (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u16.c new file mode 100644 index 000000000..1c6631088 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u16.c @@ -0,0 +1,220 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** index_u16_w0_w1: +** index z0\.h, w0, w1 +** ret +*/ +TEST_S (index_u16_w0_w1, svuint16_t, uint16_t, + z0 = svindex_u16 (x0, x1)) + +/* +** index_u16_w0_2: +** index z0\.h, w0, #2 +** ret +*/ +TEST_S (index_u16_w0_2, svuint16_t, uint16_t, + z0 = svindex_u16 (x0, 2)) + +/* +** index_u16_50_2: +** mov (w[0-9]+), 50 +** index z0\.h, \1, #2 +** ret +*/ +TEST_S (index_u16_50_2, svuint16_t, uint16_t, + z0 = svindex_u16 (50, 2)) + +/* +** index_u16_0_m17: +** mov (w[0-9]+), -17 +** index z0\.h, #0, \1 +** ret +*/ +TEST_S (index_u16_0_m17, svuint16_t, uint16_t, + z0 = svindex_u16 (0, -17)) + +/* +** index_u16_0_m16: +** index z0\.h, #0, #-16 +** ret +*/ +TEST_S (index_u16_0_m16, svuint16_t, uint16_t, + z0 = svindex_u16 (0, -16)) + +/* +** index_u16_0_1: +** index z0\.h, #0, #1 +** ret +*/ +TEST_S (index_u16_0_1, svuint16_t, uint16_t, + z0 = svindex_u16 (0, 1)) + +/* +** index_u16_0_15: +** index z0\.h, #0, #15 +** ret +*/ +TEST_S (index_u16_0_15, svuint16_t, uint16_t, + z0 = svindex_u16 (0, 15)) + +/* +** index_u16_0_16: +** mov (w[0-9]+), 16 +** index z0\.h, #0, \1 +** ret +*/ +TEST_S (index_u16_0_16, svuint16_t, uint16_t, + z0 = svindex_u16 (0, 16)) + +/* +** index_u16_m17_1: +** mov (w[0-9]+), -17 +** index z0\.h, \1, #1 +** ret +*/ +TEST_S (index_u16_m17_1, svuint16_t, uint16_t, + z0 = svindex_u16 (-17, 1)) + +/* +** index_u16_m16_1: +** index z0\.h, #-16, #1 +** ret +*/ +TEST_S (index_u16_m16_1, svuint16_t, uint16_t, + z0 = 
svindex_u16 (-16, 1)) + +/* +** index_u16_m1_1: +** index z0\.h, #-1, #1 +** ret +*/ +TEST_S (index_u16_m1_1, svuint16_t, uint16_t, + z0 = svindex_u16 (-1, 1)) + +/* +** index_u16_1_1: +** index z0\.h, #1, #1 +** ret +*/ +TEST_S (index_u16_1_1, svuint16_t, uint16_t, + z0 = svindex_u16 (1, 1)) + +/* +** index_u16_15_1: +** index z0\.h, #15, #1 +** ret +*/ +TEST_S (index_u16_15_1, svuint16_t, uint16_t, + z0 = svindex_u16 (15, 1)) + +/* +** index_u16_16_1: +** mov (w[0-9]+), 16 +** index z0\.h, \1, #1 +** ret +*/ +TEST_S (index_u16_16_1, svuint16_t, uint16_t, + z0 = svindex_u16 (16, 1)) + +/* +** index_u16_m17_x0: +** mov (w[0-9]+), -17 +** index z0\.h, \1, w0 +** ret +*/ +TEST_S (index_u16_m17_x0, svuint16_t, uint16_t, + z0 = svindex_u16 (-17, x0)) + +/* +** index_u16_m16_x0: +** index z0\.h, #-16, w0 +** ret +*/ +TEST_S (index_u16_m16_x0, svuint16_t, uint16_t, + z0 = svindex_u16 (-16, x0)) + +/* +** index_u16_m1_x0: +** index z0\.h, #-1, w0 +** ret +*/ +TEST_S (index_u16_m1_x0, svuint16_t, uint16_t, + z0 = svindex_u16 (-1, x0)) + +/* +** index_u16_0_x0: +** index z0\.h, #0, w0 +** ret +*/ +TEST_S (index_u16_0_x0, svuint16_t, uint16_t, + z0 = svindex_u16 (0, x0)) + +/* +** index_u16_1_x0: +** index z0\.h, #1, w0 +** ret +*/ +TEST_S (index_u16_1_x0, svuint16_t, uint16_t, + z0 = svindex_u16 (1, x0)) + +/* +** index_u16_15_x0: +** index z0\.h, #15, w0 +** ret +*/ +TEST_S (index_u16_15_x0, svuint16_t, uint16_t, + z0 = svindex_u16 (15, x0)) + +/* +** index_u16_16_x0: +** mov (w[0-9]+), 16 +** index z0\.h, \1, w0 +** ret +*/ +TEST_S (index_u16_16_x0, svuint16_t, uint16_t, + z0 = svindex_u16 (16, x0)) + +/* +** index_u16_x0_m17: +** mov (w[0-9]+), -17 +** index z0\.h, w0, \1 +** ret +*/ +TEST_S (index_u16_x0_m17, svuint16_t, uint16_t, + z0 = svindex_u16 (x0, -17)) + +/* +** index_u16_x0_m16: +** index z0\.h, w0, #-16 +** ret +*/ +TEST_S (index_u16_x0_m16, svuint16_t, uint16_t, + z0 = svindex_u16 (x0, -16)) + +/* +** index_u16_x0_1: +** index z0\.h, w0, #1 +** ret +*/ +TEST_S (index_u16_x0_1, svuint16_t, uint16_t, + z0 = svindex_u16 (x0, 1)) + +/* +** index_u16_x0_15: +** index z0\.h, w0, #15 +** ret +*/ +TEST_S (index_u16_x0_15, svuint16_t, uint16_t, + z0 = svindex_u16 (x0, 15)) + +/* +** index_u16_x0_16: +** mov (w[0-9]+), 16 +** index z0\.h, w0, \1 +** ret +*/ +TEST_S (index_u16_x0_16, svuint16_t, uint16_t, + z0 = svindex_u16 (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u32.c new file mode 100644 index 000000000..c2badb05e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u32.c @@ -0,0 +1,220 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** index_u32_w0_w1: +** index z0\.s, w0, w1 +** ret +*/ +TEST_S (index_u32_w0_w1, svuint32_t, uint32_t, + z0 = svindex_u32 (x0, x1)) + +/* +** index_u32_w0_2: +** index z0\.s, w0, #2 +** ret +*/ +TEST_S (index_u32_w0_2, svuint32_t, uint32_t, + z0 = svindex_u32 (x0, 2)) + +/* +** index_u32_50_2: +** mov (w[0-9]+), 50 +** index z0\.s, \1, #2 +** ret +*/ +TEST_S (index_u32_50_2, svuint32_t, uint32_t, + z0 = svindex_u32 (50, 2)) + +/* +** index_u32_0_m17: +** mov (w[0-9]+), -17 +** index z0\.s, #0, \1 +** ret +*/ +TEST_S (index_u32_0_m17, svuint32_t, uint32_t, + z0 = svindex_u32 (0, -17)) + +/* +** index_u32_0_m16: +** index z0\.s, #0, #-16 +** ret +*/ +TEST_S (index_u32_0_m16, svuint32_t, uint32_t, + z0 = svindex_u32 (0, -16)) + +/* +** index_u32_0_1: +** index z0\.s, #0, #1 +** ret +*/ +TEST_S 
(index_u32_0_1, svuint32_t, uint32_t, + z0 = svindex_u32 (0, 1)) + +/* +** index_u32_0_15: +** index z0\.s, #0, #15 +** ret +*/ +TEST_S (index_u32_0_15, svuint32_t, uint32_t, + z0 = svindex_u32 (0, 15)) + +/* +** index_u32_0_16: +** mov (w[0-9]+), 16 +** index z0\.s, #0, \1 +** ret +*/ +TEST_S (index_u32_0_16, svuint32_t, uint32_t, + z0 = svindex_u32 (0, 16)) + +/* +** index_u32_m17_1: +** mov (w[0-9]+), -17 +** index z0\.s, \1, #1 +** ret +*/ +TEST_S (index_u32_m17_1, svuint32_t, uint32_t, + z0 = svindex_u32 (-17, 1)) + +/* +** index_u32_m16_1: +** index z0\.s, #-16, #1 +** ret +*/ +TEST_S (index_u32_m16_1, svuint32_t, uint32_t, + z0 = svindex_u32 (-16, 1)) + +/* +** index_u32_m1_1: +** index z0\.s, #-1, #1 +** ret +*/ +TEST_S (index_u32_m1_1, svuint32_t, uint32_t, + z0 = svindex_u32 (-1, 1)) + +/* +** index_u32_1_1: +** index z0\.s, #1, #1 +** ret +*/ +TEST_S (index_u32_1_1, svuint32_t, uint32_t, + z0 = svindex_u32 (1, 1)) + +/* +** index_u32_15_1: +** index z0\.s, #15, #1 +** ret +*/ +TEST_S (index_u32_15_1, svuint32_t, uint32_t, + z0 = svindex_u32 (15, 1)) + +/* +** index_u32_16_1: +** mov (w[0-9]+), 16 +** index z0\.s, \1, #1 +** ret +*/ +TEST_S (index_u32_16_1, svuint32_t, uint32_t, + z0 = svindex_u32 (16, 1)) + +/* +** index_u32_m17_x0: +** mov (w[0-9]+), -17 +** index z0\.s, \1, w0 +** ret +*/ +TEST_S (index_u32_m17_x0, svuint32_t, uint32_t, + z0 = svindex_u32 (-17, x0)) + +/* +** index_u32_m16_x0: +** index z0\.s, #-16, w0 +** ret +*/ +TEST_S (index_u32_m16_x0, svuint32_t, uint32_t, + z0 = svindex_u32 (-16, x0)) + +/* +** index_u32_m1_x0: +** index z0\.s, #-1, w0 +** ret +*/ +TEST_S (index_u32_m1_x0, svuint32_t, uint32_t, + z0 = svindex_u32 (-1, x0)) + +/* +** index_u32_0_x0: +** index z0\.s, #0, w0 +** ret +*/ +TEST_S (index_u32_0_x0, svuint32_t, uint32_t, + z0 = svindex_u32 (0, x0)) + +/* +** index_u32_1_x0: +** index z0\.s, #1, w0 +** ret +*/ +TEST_S (index_u32_1_x0, svuint32_t, uint32_t, + z0 = svindex_u32 (1, x0)) + +/* +** index_u32_15_x0: +** index z0\.s, #15, w0 +** ret +*/ +TEST_S (index_u32_15_x0, svuint32_t, uint32_t, + z0 = svindex_u32 (15, x0)) + +/* +** index_u32_16_x0: +** mov (w[0-9]+), 16 +** index z0\.s, \1, w0 +** ret +*/ +TEST_S (index_u32_16_x0, svuint32_t, uint32_t, + z0 = svindex_u32 (16, x0)) + +/* +** index_u32_x0_m17: +** mov (w[0-9]+), -17 +** index z0\.s, w0, \1 +** ret +*/ +TEST_S (index_u32_x0_m17, svuint32_t, uint32_t, + z0 = svindex_u32 (x0, -17)) + +/* +** index_u32_x0_m16: +** index z0\.s, w0, #-16 +** ret +*/ +TEST_S (index_u32_x0_m16, svuint32_t, uint32_t, + z0 = svindex_u32 (x0, -16)) + +/* +** index_u32_x0_1: +** index z0\.s, w0, #1 +** ret +*/ +TEST_S (index_u32_x0_1, svuint32_t, uint32_t, + z0 = svindex_u32 (x0, 1)) + +/* +** index_u32_x0_15: +** index z0\.s, w0, #15 +** ret +*/ +TEST_S (index_u32_x0_15, svuint32_t, uint32_t, + z0 = svindex_u32 (x0, 15)) + +/* +** index_u32_x0_16: +** mov (w[0-9]+), 16 +** index z0\.s, w0, \1 +** ret +*/ +TEST_S (index_u32_x0_16, svuint32_t, uint32_t, + z0 = svindex_u32 (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u64.c new file mode 100644 index 000000000..526c5e80a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u64.c @@ -0,0 +1,220 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** index_u64_x0_x1: +** index z0\.d, x0, x1 +** ret +*/ +TEST_S (index_u64_x0_x1, svuint64_t, uint64_t, + z0 = svindex_u64 (x0, x1)) + +/* +** index_u64_x0_2: 
+** index z0\.d, x0, #2 +** ret +*/ +TEST_S (index_u64_x0_2, svuint64_t, uint64_t, + z0 = svindex_u64 (x0, 2)) + +/* +** index_u64_50_2: +** mov (x[0-9]+), 50 +** index z0\.d, \1, #2 +** ret +*/ +TEST_S (index_u64_50_2, svuint64_t, uint64_t, + z0 = svindex_u64 (50, 2)) + +/* +** index_u64_0_m17: +** mov (x[0-9]+), -17 +** index z0\.d, #0, \1 +** ret +*/ +TEST_S (index_u64_0_m17, svuint64_t, uint64_t, + z0 = svindex_u64 (0, -17)) + +/* +** index_u64_0_m16: +** index z0\.d, #0, #-16 +** ret +*/ +TEST_S (index_u64_0_m16, svuint64_t, uint64_t, + z0 = svindex_u64 (0, -16)) + +/* +** index_u64_0_1: +** index z0\.d, #0, #1 +** ret +*/ +TEST_S (index_u64_0_1, svuint64_t, uint64_t, + z0 = svindex_u64 (0, 1)) + +/* +** index_u64_0_15: +** index z0\.d, #0, #15 +** ret +*/ +TEST_S (index_u64_0_15, svuint64_t, uint64_t, + z0 = svindex_u64 (0, 15)) + +/* +** index_u64_0_16: +** mov (x[0-9]+), 16 +** index z0\.d, #0, \1 +** ret +*/ +TEST_S (index_u64_0_16, svuint64_t, uint64_t, + z0 = svindex_u64 (0, 16)) + +/* +** index_u64_m17_1: +** mov (x[0-9]+), -17 +** index z0\.d, \1, #1 +** ret +*/ +TEST_S (index_u64_m17_1, svuint64_t, uint64_t, + z0 = svindex_u64 (-17, 1)) + +/* +** index_u64_m16_1: +** index z0\.d, #-16, #1 +** ret +*/ +TEST_S (index_u64_m16_1, svuint64_t, uint64_t, + z0 = svindex_u64 (-16, 1)) + +/* +** index_u64_m1_1: +** index z0\.d, #-1, #1 +** ret +*/ +TEST_S (index_u64_m1_1, svuint64_t, uint64_t, + z0 = svindex_u64 (-1, 1)) + +/* +** index_u64_1_1: +** index z0\.d, #1, #1 +** ret +*/ +TEST_S (index_u64_1_1, svuint64_t, uint64_t, + z0 = svindex_u64 (1, 1)) + +/* +** index_u64_15_1: +** index z0\.d, #15, #1 +** ret +*/ +TEST_S (index_u64_15_1, svuint64_t, uint64_t, + z0 = svindex_u64 (15, 1)) + +/* +** index_u64_16_1: +** mov (x[0-9]+), 16 +** index z0\.d, \1, #1 +** ret +*/ +TEST_S (index_u64_16_1, svuint64_t, uint64_t, + z0 = svindex_u64 (16, 1)) + +/* +** index_u64_m17_x0: +** mov (x[0-9]+), -17 +** index z0\.d, \1, x0 +** ret +*/ +TEST_S (index_u64_m17_x0, svuint64_t, uint64_t, + z0 = svindex_u64 (-17, x0)) + +/* +** index_u64_m16_x0: +** index z0\.d, #-16, x0 +** ret +*/ +TEST_S (index_u64_m16_x0, svuint64_t, uint64_t, + z0 = svindex_u64 (-16, x0)) + +/* +** index_u64_m1_x0: +** index z0\.d, #-1, x0 +** ret +*/ +TEST_S (index_u64_m1_x0, svuint64_t, uint64_t, + z0 = svindex_u64 (-1, x0)) + +/* +** index_u64_0_x0: +** index z0\.d, #0, x0 +** ret +*/ +TEST_S (index_u64_0_x0, svuint64_t, uint64_t, + z0 = svindex_u64 (0, x0)) + +/* +** index_u64_1_x0: +** index z0\.d, #1, x0 +** ret +*/ +TEST_S (index_u64_1_x0, svuint64_t, uint64_t, + z0 = svindex_u64 (1, x0)) + +/* +** index_u64_15_x0: +** index z0\.d, #15, x0 +** ret +*/ +TEST_S (index_u64_15_x0, svuint64_t, uint64_t, + z0 = svindex_u64 (15, x0)) + +/* +** index_u64_16_x0: +** mov (x[0-9]+), 16 +** index z0\.d, \1, x0 +** ret +*/ +TEST_S (index_u64_16_x0, svuint64_t, uint64_t, + z0 = svindex_u64 (16, x0)) + +/* +** index_u64_x0_m17: +** mov (x[0-9]+), -17 +** index z0\.d, x0, \1 +** ret +*/ +TEST_S (index_u64_x0_m17, svuint64_t, uint64_t, + z0 = svindex_u64 (x0, -17)) + +/* +** index_u64_x0_m16: +** index z0\.d, x0, #-16 +** ret +*/ +TEST_S (index_u64_x0_m16, svuint64_t, uint64_t, + z0 = svindex_u64 (x0, -16)) + +/* +** index_u64_x0_1: +** index z0\.d, x0, #1 +** ret +*/ +TEST_S (index_u64_x0_1, svuint64_t, uint64_t, + z0 = svindex_u64 (x0, 1)) + +/* +** index_u64_x0_15: +** index z0\.d, x0, #15 +** ret +*/ +TEST_S (index_u64_x0_15, svuint64_t, uint64_t, + z0 = svindex_u64 (x0, 15)) + +/* +** index_u64_x0_16: +** mov (x[0-9]+), 16 +** 
index z0\.d, x0, \1 +** ret +*/ +TEST_S (index_u64_x0_16, svuint64_t, uint64_t, + z0 = svindex_u64 (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u8.c new file mode 100644 index 000000000..c6ce12ec8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/index_u8.c @@ -0,0 +1,220 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** index_u8_w0_w1: +** index z0\.b, w0, w1 +** ret +*/ +TEST_S (index_u8_w0_w1, svuint8_t, uint8_t, + z0 = svindex_u8 (x0, x1)) + +/* +** index_u8_w0_2: +** index z0\.b, w0, #2 +** ret +*/ +TEST_S (index_u8_w0_2, svuint8_t, uint8_t, + z0 = svindex_u8 (x0, 2)) + +/* +** index_u8_50_2: +** mov (w[0-9]+), 50 +** index z0\.b, \1, #2 +** ret +*/ +TEST_S (index_u8_50_2, svuint8_t, uint8_t, + z0 = svindex_u8 (50, 2)) + +/* +** index_u8_0_m17: +** mov (w[0-9]+), -17 +** index z0\.b, #0, \1 +** ret +*/ +TEST_S (index_u8_0_m17, svuint8_t, uint8_t, + z0 = svindex_u8 (0, -17)) + +/* +** index_u8_0_m16: +** index z0\.b, #0, #-16 +** ret +*/ +TEST_S (index_u8_0_m16, svuint8_t, uint8_t, + z0 = svindex_u8 (0, -16)) + +/* +** index_u8_0_1: +** index z0\.b, #0, #1 +** ret +*/ +TEST_S (index_u8_0_1, svuint8_t, uint8_t, + z0 = svindex_u8 (0, 1)) + +/* +** index_u8_0_15: +** index z0\.b, #0, #15 +** ret +*/ +TEST_S (index_u8_0_15, svuint8_t, uint8_t, + z0 = svindex_u8 (0, 15)) + +/* +** index_u8_0_16: +** mov (w[0-9]+), 16 +** index z0\.b, #0, \1 +** ret +*/ +TEST_S (index_u8_0_16, svuint8_t, uint8_t, + z0 = svindex_u8 (0, 16)) + +/* +** index_u8_m17_1: +** mov (w[0-9]+), -17 +** index z0\.b, \1, #1 +** ret +*/ +TEST_S (index_u8_m17_1, svuint8_t, uint8_t, + z0 = svindex_u8 (-17, 1)) + +/* +** index_u8_m16_1: +** index z0\.b, #-16, #1 +** ret +*/ +TEST_S (index_u8_m16_1, svuint8_t, uint8_t, + z0 = svindex_u8 (-16, 1)) + +/* +** index_u8_m1_1: +** index z0\.b, #-1, #1 +** ret +*/ +TEST_S (index_u8_m1_1, svuint8_t, uint8_t, + z0 = svindex_u8 (-1, 1)) + +/* +** index_u8_1_1: +** index z0\.b, #1, #1 +** ret +*/ +TEST_S (index_u8_1_1, svuint8_t, uint8_t, + z0 = svindex_u8 (1, 1)) + +/* +** index_u8_15_1: +** index z0\.b, #15, #1 +** ret +*/ +TEST_S (index_u8_15_1, svuint8_t, uint8_t, + z0 = svindex_u8 (15, 1)) + +/* +** index_u8_16_1: +** mov (w[0-9]+), 16 +** index z0\.b, \1, #1 +** ret +*/ +TEST_S (index_u8_16_1, svuint8_t, uint8_t, + z0 = svindex_u8 (16, 1)) + +/* +** index_u8_m17_x0: +** mov (w[0-9]+), -17 +** index z0\.b, \1, w0 +** ret +*/ +TEST_S (index_u8_m17_x0, svuint8_t, uint8_t, + z0 = svindex_u8 (-17, x0)) + +/* +** index_u8_m16_x0: +** index z0\.b, #-16, w0 +** ret +*/ +TEST_S (index_u8_m16_x0, svuint8_t, uint8_t, + z0 = svindex_u8 (-16, x0)) + +/* +** index_u8_m1_x0: +** index z0\.b, #-1, w0 +** ret +*/ +TEST_S (index_u8_m1_x0, svuint8_t, uint8_t, + z0 = svindex_u8 (-1, x0)) + +/* +** index_u8_0_x0: +** index z0\.b, #0, w0 +** ret +*/ +TEST_S (index_u8_0_x0, svuint8_t, uint8_t, + z0 = svindex_u8 (0, x0)) + +/* +** index_u8_1_x0: +** index z0\.b, #1, w0 +** ret +*/ +TEST_S (index_u8_1_x0, svuint8_t, uint8_t, + z0 = svindex_u8 (1, x0)) + +/* +** index_u8_15_x0: +** index z0\.b, #15, w0 +** ret +*/ +TEST_S (index_u8_15_x0, svuint8_t, uint8_t, + z0 = svindex_u8 (15, x0)) + +/* +** index_u8_16_x0: +** mov (w[0-9]+), 16 +** index z0\.b, \1, w0 +** ret +*/ +TEST_S (index_u8_16_x0, svuint8_t, uint8_t, + z0 = svindex_u8 (16, x0)) + +/* +** index_u8_x0_m17: +** mov (w[0-9]+), -17 +** index z0\.b, w0, \1 +** ret +*/ +TEST_S 
(index_u8_x0_m17, svuint8_t, uint8_t, + z0 = svindex_u8 (x0, -17)) + +/* +** index_u8_x0_m16: +** index z0\.b, w0, #-16 +** ret +*/ +TEST_S (index_u8_x0_m16, svuint8_t, uint8_t, + z0 = svindex_u8 (x0, -16)) + +/* +** index_u8_x0_1: +** index z0\.b, w0, #1 +** ret +*/ +TEST_S (index_u8_x0_1, svuint8_t, uint8_t, + z0 = svindex_u8 (x0, 1)) + +/* +** index_u8_x0_15: +** index z0\.b, w0, #15 +** ret +*/ +TEST_S (index_u8_x0_15, svuint8_t, uint8_t, + z0 = svindex_u8 (x0, 15)) + +/* +** index_u8_x0_16: +** mov (w[0-9]+), 16 +** index z0\.b, w0, \1 +** ret +*/ +TEST_S (index_u8_x0_16, svuint8_t, uint8_t, + z0 = svindex_u8 (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_bf16.c new file mode 100644 index 000000000..55afdba62 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_bf16.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_h4_bf16_tied1: +** insr z0\.h, h4 +** ret +*/ +TEST_UNIFORM_ZD (insr_h4_bf16_tied1, svbfloat16_t, bfloat16_t, + z0 = svinsr_n_bf16 (z0, d4), + z0 = svinsr (z0, d4)) + +/* +** insr_h4_bf16_untied: +** movprfx z0, z1 +** insr z0\.h, h4 +** ret +*/ +TEST_UNIFORM_ZD (insr_h4_bf16_untied, svbfloat16_t, bfloat16_t, + z0 = svinsr_n_bf16 (z1, d4), + z0 = svinsr (z1, d4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f16.c new file mode 100644 index 000000000..f01a36189 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f16.c @@ -0,0 +1,51 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_h4_f16_tied1: +** insr z0\.h, h4 +** ret +*/ +TEST_UNIFORM_ZD (insr_h4_f16_tied1, svfloat16_t, __fp16, + z0 = svinsr_n_f16 (z0, d4), + z0 = svinsr (z0, d4)) + +/* +** insr_h4_f16_untied: +** movprfx z0, z1 +** insr z0\.h, h4 +** ret +*/ +TEST_UNIFORM_ZD (insr_h4_f16_untied, svfloat16_t, __fp16, + z0 = svinsr_n_f16 (z1, d4), + z0 = svinsr (z1, d4)) + +/* +** insr_0_f16_tied1: +** insr z0\.h, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_f16_tied1, svfloat16_t, + z0 = svinsr_n_f16 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_f16_untied: +** movprfx z0, z1 +** insr z0\.h, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_f16_untied, svfloat16_t, + z0 = svinsr_n_f16 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_f16: +** fmov (h[0-9]+), #?1\.0(?:e\+0)? 
+** insr z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (insr_1_f16, svfloat16_t, + z0 = svinsr_n_f16 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f32.c new file mode 100644 index 000000000..e339727b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f32.c @@ -0,0 +1,51 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_s4_f32_tied1: +** insr z0\.s, s4 +** ret +*/ +TEST_UNIFORM_ZD (insr_s4_f32_tied1, svfloat32_t, float, + z0 = svinsr_n_f32 (z0, d4), + z0 = svinsr (z0, d4)) + +/* +** insr_s4_f32_untied: +** movprfx z0, z1 +** insr z0\.s, s4 +** ret +*/ +TEST_UNIFORM_ZD (insr_s4_f32_untied, svfloat32_t, float, + z0 = svinsr_n_f32 (z1, d4), + z0 = svinsr (z1, d4)) + +/* +** insr_0_f32_tied1: +** insr z0\.s, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_f32_tied1, svfloat32_t, + z0 = svinsr_n_f32 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_f32_untied: +** movprfx z0, z1 +** insr z0\.s, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_f32_untied, svfloat32_t, + z0 = svinsr_n_f32 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_f32: +** fmov (s[0-9]+), #?1\.0(?:e\+0)? +** insr z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (insr_1_f32, svfloat32_t, + z0 = svinsr_n_f32 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f64.c new file mode 100644 index 000000000..9400225a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_f64.c @@ -0,0 +1,51 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_d4_f64_tied1: +** insr z0\.d, d4 +** ret +*/ +TEST_UNIFORM_ZD (insr_d4_f64_tied1, svfloat64_t, double, + z0 = svinsr_n_f64 (z0, d4), + z0 = svinsr (z0, d4)) + +/* +** insr_d4_f64_untied: +** movprfx z0, z1 +** insr z0\.d, d4 +** ret +*/ +TEST_UNIFORM_ZD (insr_d4_f64_untied, svfloat64_t, double, + z0 = svinsr_n_f64 (z1, d4), + z0 = svinsr (z1, d4)) + +/* +** insr_0_f64_tied1: +** insr z0\.d, xzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_f64_tied1, svfloat64_t, + z0 = svinsr_n_f64 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_f64_untied: +** movprfx z0, z1 +** insr z0\.d, xzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_f64_untied, svfloat64_t, + z0 = svinsr_n_f64 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_f64: +** fmov (d[0-9]+), #?1\.0(?:e\+0)? 
+** insr z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (insr_1_f64, svfloat64_t, + z0 = svinsr_n_f64 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s16.c new file mode 100644 index 000000000..651977a9d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s16.c @@ -0,0 +1,56 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_w0_s16_tied1: +** insr z0\.h, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_s16_tied1, svint16_t, int16_t, + z0 = svinsr_n_s16 (z0, x0), + z0 = svinsr (z0, x0)) + +/* +** insr_w0_s16_untied: +** movprfx z0, z1 +** insr z0\.h, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_s16_untied, svint16_t, int16_t, + z0 = svinsr_n_s16 (z1, x0), + z0 = svinsr (z1, x0)) + +/* +** insr_0_s16_tied1: +** insr z0\.h, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_s16_tied1, svint16_t, + z0 = svinsr_n_s16 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_s16_untied: +** movprfx z0, z1 +** insr z0\.h, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_s16_untied, svint16_t, + z0 = svinsr_n_s16 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_s16: +** ( +** mov (w[0-9]+), #?1 +** insr z0\.h, \1 +** | +** movi v([0-9]+)\.4h, 0x1 +** insr z0\.h, h\2 +** ) +** ret +*/ +TEST_UNIFORM_Z (insr_1_s16, svint16_t, + z0 = svinsr_n_s16 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s32.c new file mode 100644 index 000000000..a1dcfc090 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s32.c @@ -0,0 +1,56 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_w0_s32_tied1: +** insr z0\.s, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_s32_tied1, svint32_t, int32_t, + z0 = svinsr_n_s32 (z0, x0), + z0 = svinsr (z0, x0)) + +/* +** insr_w0_s32_untied: +** movprfx z0, z1 +** insr z0\.s, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_s32_untied, svint32_t, int32_t, + z0 = svinsr_n_s32 (z1, x0), + z0 = svinsr (z1, x0)) + +/* +** insr_0_s32_tied1: +** insr z0\.s, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_s32_tied1, svint32_t, + z0 = svinsr_n_s32 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_s32_untied: +** movprfx z0, z1 +** insr z0\.s, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_s32_untied, svint32_t, + z0 = svinsr_n_s32 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_s32: +** ( +** mov (w[0-9]+), #?1 +** insr z0\.s, \1 +** | +** movi v([0-9]+)\.2s, 0x1 +** insr z0\.s, s\2 +** ) +** ret +*/ +TEST_UNIFORM_Z (insr_1_s32, svint32_t, + z0 = svinsr_n_s32 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s64.c new file mode 100644 index 000000000..32cdc8263 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s64.c @@ -0,0 +1,56 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_x0_s64_tied1: +** insr z0\.d, x0 +** ret +*/ +TEST_UNIFORM_ZX (insr_x0_s64_tied1, svint64_t, int64_t, + z0 = svinsr_n_s64 (z0, x0), + z0 = svinsr (z0, x0)) + +/* +** insr_x0_s64_untied: +** movprfx z0, z1 +** insr z0\.d, x0 +** ret +*/ +TEST_UNIFORM_ZX (insr_x0_s64_untied, svint64_t, int64_t, + z0 = svinsr_n_s64 (z1, x0), + z0 = svinsr (z1, x0)) + +/* +** insr_0_s64_tied1: +** insr z0\.d, xzr +** ret +*/ +TEST_UNIFORM_Z 
(insr_0_s64_tied1, svint64_t, + z0 = svinsr_n_s64 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_s64_untied: +** movprfx z0, z1 +** insr z0\.d, xzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_s64_untied, svint64_t, + z0 = svinsr_n_s64 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_s64: +** ( +** mov (x[0-9]+), #?1 +** insr z0\.d, \1 +** | +** movi v([0-9]+)\.2d, 0x1 +** insr z0\.d, d\2 +** ) +** ret +*/ +TEST_UNIFORM_Z (insr_1_s64, svint64_t, + z0 = svinsr_n_s64 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s8.c new file mode 100644 index 000000000..cb69b09fa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_s8.c @@ -0,0 +1,56 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_w0_s8_tied1: +** insr z0\.b, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_s8_tied1, svint8_t, int8_t, + z0 = svinsr_n_s8 (z0, x0), + z0 = svinsr (z0, x0)) + +/* +** insr_w0_s8_untied: +** movprfx z0, z1 +** insr z0\.b, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_s8_untied, svint8_t, int8_t, + z0 = svinsr_n_s8 (z1, x0), + z0 = svinsr (z1, x0)) + +/* +** insr_0_s8_tied1: +** insr z0\.b, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_s8_tied1, svint8_t, + z0 = svinsr_n_s8 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_s8_untied: +** movprfx z0, z1 +** insr z0\.b, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_s8_untied, svint8_t, + z0 = svinsr_n_s8 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_s8: +** ( +** mov (w[0-9]+), #?1 +** insr z0\.b, \1 +** | +** movi v([0-9]+)\.8b, 0x1 +** insr z0\.b, b\2 +** ) +** ret +*/ +TEST_UNIFORM_Z (insr_1_s8, svint8_t, + z0 = svinsr_n_s8 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u16.c new file mode 100644 index 000000000..35af77402 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u16.c @@ -0,0 +1,56 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_w0_u16_tied1: +** insr z0\.h, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_u16_tied1, svuint16_t, uint16_t, + z0 = svinsr_n_u16 (z0, x0), + z0 = svinsr (z0, x0)) + +/* +** insr_w0_u16_untied: +** movprfx z0, z1 +** insr z0\.h, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_u16_untied, svuint16_t, uint16_t, + z0 = svinsr_n_u16 (z1, x0), + z0 = svinsr (z1, x0)) + +/* +** insr_0_u16_tied1: +** insr z0\.h, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_u16_tied1, svuint16_t, + z0 = svinsr_n_u16 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_u16_untied: +** movprfx z0, z1 +** insr z0\.h, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_u16_untied, svuint16_t, + z0 = svinsr_n_u16 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_u16: +** ( +** mov (w[0-9]+), #?1 +** insr z0\.h, \1 +** | +** movi v([0-9]+)\.4h, 0x1 +** insr z0\.h, h\2 +** ) +** ret +*/ +TEST_UNIFORM_Z (insr_1_u16, svuint16_t, + z0 = svinsr_n_u16 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u32.c new file mode 100644 index 000000000..8a72e7f2a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u32.c @@ -0,0 +1,56 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_w0_u32_tied1: +** insr z0\.s, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_u32_tied1, 
svuint32_t, uint32_t, + z0 = svinsr_n_u32 (z0, x0), + z0 = svinsr (z0, x0)) + +/* +** insr_w0_u32_untied: +** movprfx z0, z1 +** insr z0\.s, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_u32_untied, svuint32_t, uint32_t, + z0 = svinsr_n_u32 (z1, x0), + z0 = svinsr (z1, x0)) + +/* +** insr_0_u32_tied1: +** insr z0\.s, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_u32_tied1, svuint32_t, + z0 = svinsr_n_u32 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_u32_untied: +** movprfx z0, z1 +** insr z0\.s, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_u32_untied, svuint32_t, + z0 = svinsr_n_u32 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_u32: +** ( +** mov (w[0-9]+), #?1 +** insr z0\.s, \1 +** | +** movi v([0-9]+)\.2s, 0x1 +** insr z0\.s, s\2 +** ) +** ret +*/ +TEST_UNIFORM_Z (insr_1_u32, svuint32_t, + z0 = svinsr_n_u32 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u64.c new file mode 100644 index 000000000..ab23f677d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u64.c @@ -0,0 +1,56 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_x0_u64_tied1: +** insr z0\.d, x0 +** ret +*/ +TEST_UNIFORM_ZX (insr_x0_u64_tied1, svuint64_t, uint64_t, + z0 = svinsr_n_u64 (z0, x0), + z0 = svinsr (z0, x0)) + +/* +** insr_x0_u64_untied: +** movprfx z0, z1 +** insr z0\.d, x0 +** ret +*/ +TEST_UNIFORM_ZX (insr_x0_u64_untied, svuint64_t, uint64_t, + z0 = svinsr_n_u64 (z1, x0), + z0 = svinsr (z1, x0)) + +/* +** insr_0_u64_tied1: +** insr z0\.d, xzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_u64_tied1, svuint64_t, + z0 = svinsr_n_u64 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_u64_untied: +** movprfx z0, z1 +** insr z0\.d, xzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_u64_untied, svuint64_t, + z0 = svinsr_n_u64 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_u64: +** ( +** mov (x[0-9]+), #?1 +** insr z0\.d, \1 +** | +** movi v([0-9]+)\.2d, 0x1 +** insr z0\.d, d\2 +** ) +** ret +*/ +TEST_UNIFORM_Z (insr_1_u64, svuint64_t, + z0 = svinsr_n_u64 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u8.c new file mode 100644 index 000000000..549d71882 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/insr_u8.c @@ -0,0 +1,56 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** insr_w0_u8_tied1: +** insr z0\.b, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_u8_tied1, svuint8_t, uint8_t, + z0 = svinsr_n_u8 (z0, x0), + z0 = svinsr (z0, x0)) + +/* +** insr_w0_u8_untied: +** movprfx z0, z1 +** insr z0\.b, w0 +** ret +*/ +TEST_UNIFORM_ZX (insr_w0_u8_untied, svuint8_t, uint8_t, + z0 = svinsr_n_u8 (z1, x0), + z0 = svinsr (z1, x0)) + +/* +** insr_0_u8_tied1: +** insr z0\.b, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_u8_tied1, svuint8_t, + z0 = svinsr_n_u8 (z0, 0), + z0 = svinsr (z0, 0)) + +/* +** insr_0_u8_untied: +** movprfx z0, z1 +** insr z0\.b, wzr +** ret +*/ +TEST_UNIFORM_Z (insr_0_u8_untied, svuint8_t, + z0 = svinsr_n_u8 (z1, 0), + z0 = svinsr (z1, 0)) + +/* +** insr_1_u8: +** ( +** mov (w[0-9]+), #?1 +** insr z0\.b, \1 +** | +** movi v([0-9]+)\.8b, 0x1 +** insr z0\.b, b\2 +** ) +** ret +*/ +TEST_UNIFORM_Z (insr_1_u8, svuint8_t, + z0 = svinsr_n_u8 (z0, 1), + z0 = svinsr (z0, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_bf16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_bf16.c new file mode 100644 index 000000000..da30e05e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_bf16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_d0_bf16_tied: +** lasta h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (lasta_d0_bf16_tied, bfloat16_t, svbfloat16_t, + d0 = svlasta_bf16 (p0, z0), + d0 = svlasta (p0, z0)) + +/* +** lasta_d0_bf16_untied: +** lasta h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (lasta_d0_bf16_untied, bfloat16_t, svbfloat16_t, + d0 = svlasta_bf16 (p0, z1), + d0 = svlasta (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f16.c new file mode 100644 index 000000000..972b55ab6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_d0_f16_tied: +** lasta h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (lasta_d0_f16_tied, float16_t, svfloat16_t, + d0 = svlasta_f16 (p0, z0), + d0 = svlasta (p0, z0)) + +/* +** lasta_d0_f16_untied: +** lasta h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (lasta_d0_f16_untied, float16_t, svfloat16_t, + d0 = svlasta_f16 (p0, z1), + d0 = svlasta (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f32.c new file mode 100644 index 000000000..cfb537f2f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_d0_f32_tied: +** lasta s0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_D (lasta_d0_f32_tied, float32_t, svfloat32_t, + d0 = svlasta_f32 (p0, z0), + d0 = svlasta (p0, z0)) + +/* +** lasta_d0_f32_untied: +** lasta s0, p0, z1\.s +** ret +*/ +TEST_REDUCTION_D (lasta_d0_f32_untied, float32_t, svfloat32_t, + d0 = svlasta_f32 (p0, z1), + d0 = svlasta (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f64.c new file mode 100644 index 000000000..a4a8a74c9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_d0_f64_tied: +** lasta d0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_D (lasta_d0_f64_tied, float64_t, svfloat64_t, + d0 = svlasta_f64 (p0, z0), + d0 = svlasta (p0, z0)) + +/* +** lasta_d0_f64_untied: +** lasta d0, p0, z1\.d +** ret +*/ +TEST_REDUCTION_D (lasta_d0_f64_untied, float64_t, svfloat64_t, + d0 = svlasta_f64 (p0, z1), + d0 = svlasta (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s16.c new file mode 100644 index 000000000..54bd0248f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s16.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_x0_s16: +** lasta w0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_X (lasta_x0_s16, int16_t, svint16_t, + x0 = svlasta_s16 (p0, z0), + x0 = svlasta (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s32.c new file mode 100644 index 000000000..18f852f94 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s32.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_x0_s32: +** lasta w0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_X (lasta_x0_s32, int32_t, svint32_t, + x0 = svlasta_s32 (p0, z0), + x0 = svlasta (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s64.c new file mode 100644 index 000000000..6e45af3d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s64.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_x0_s64: +** lasta x0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_X (lasta_x0_s64, int64_t, svint64_t, + x0 = svlasta_s64 (p0, z0), + x0 = svlasta (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s8.c new file mode 100644 index 000000000..58e574f30 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_s8.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_x0_s8: +** lasta w0, p0, z0\.b +** ret +*/ +TEST_REDUCTION_X (lasta_x0_s8, int8_t, svint8_t, + x0 = svlasta_s8 (p0, z0), + x0 = svlasta (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u16.c new file mode 100644 index 000000000..a0e14eca4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u16.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_x0_u16: +** lasta w0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_X (lasta_x0_u16, uint16_t, svuint16_t, + x0 = svlasta_u16 (p0, z0), + x0 = svlasta (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u32.c new file mode 100644 index 000000000..dab37c36a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u32.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_x0_u32: +** lasta w0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_X (lasta_x0_u32, uint32_t, svuint32_t, + x0 = svlasta_u32 (p0, z0), + x0 = svlasta (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u64.c new file mode 100644 index 000000000..c766f36ec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u64.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_x0_u64: +** lasta x0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_X (lasta_x0_u64, uint64_t, svuint64_t, + x0 = svlasta_u64 (p0, z0), + x0 = svlasta (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u8.c new file mode 100644 index 000000000..a83f25fe4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lasta_u8.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lasta_x0_u8: 
+** lasta w0, p0, z0\.b +** ret +*/ +TEST_REDUCTION_X (lasta_x0_u8, uint8_t, svuint8_t, + x0 = svlasta_u8 (p0, z0), + x0 = svlasta (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_bf16.c new file mode 100644 index 000000000..01ba39a02 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_bf16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_d0_bf16_tied: +** lastb h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (lastb_d0_bf16_tied, bfloat16_t, svbfloat16_t, + d0 = svlastb_bf16 (p0, z0), + d0 = svlastb (p0, z0)) + +/* +** lastb_d0_bf16_untied: +** lastb h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (lastb_d0_bf16_untied, bfloat16_t, svbfloat16_t, + d0 = svlastb_bf16 (p0, z1), + d0 = svlastb (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f16.c new file mode 100644 index 000000000..0bc7e9ef4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_d0_f16_tied: +** lastb h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (lastb_d0_f16_tied, float16_t, svfloat16_t, + d0 = svlastb_f16 (p0, z0), + d0 = svlastb (p0, z0)) + +/* +** lastb_d0_f16_untied: +** lastb h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (lastb_d0_f16_untied, float16_t, svfloat16_t, + d0 = svlastb_f16 (p0, z1), + d0 = svlastb (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f32.c new file mode 100644 index 000000000..b33d61eee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_d0_f32_tied: +** lastb s0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_D (lastb_d0_f32_tied, float32_t, svfloat32_t, + d0 = svlastb_f32 (p0, z0), + d0 = svlastb (p0, z0)) + +/* +** lastb_d0_f32_untied: +** lastb s0, p0, z1\.s +** ret +*/ +TEST_REDUCTION_D (lastb_d0_f32_untied, float32_t, svfloat32_t, + d0 = svlastb_f32 (p0, z1), + d0 = svlastb (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f64.c new file mode 100644 index 000000000..9fa7de706 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_d0_f64_tied: +** lastb d0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_D (lastb_d0_f64_tied, float64_t, svfloat64_t, + d0 = svlastb_f64 (p0, z0), + d0 = svlastb (p0, z0)) + +/* +** lastb_d0_f64_untied: +** lastb d0, p0, z1\.d +** ret +*/ +TEST_REDUCTION_D (lastb_d0_f64_untied, float64_t, svfloat64_t, + d0 = svlastb_f64 (p0, z1), + d0 = svlastb (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s16.c new file mode 100644 index 000000000..6575f21cd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s16.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_x0_s16: +** lastb w0, p0, z0\.h +** ret +*/ 
+TEST_REDUCTION_X (lastb_x0_s16, int16_t, svint16_t, + x0 = svlastb_s16 (p0, z0), + x0 = svlastb (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s32.c new file mode 100644 index 000000000..856e5bdc8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s32.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_x0_s32: +** lastb w0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_X (lastb_x0_s32, int32_t, svint32_t, + x0 = svlastb_s32 (p0, z0), + x0 = svlastb (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s64.c new file mode 100644 index 000000000..bd7de2ab2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s64.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_x0_s64: +** lastb x0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_X (lastb_x0_s64, int64_t, svint64_t, + x0 = svlastb_s64 (p0, z0), + x0 = svlastb (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s8.c new file mode 100644 index 000000000..4c343a705 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_s8.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_x0_s8: +** lastb w0, p0, z0\.b +** ret +*/ +TEST_REDUCTION_X (lastb_x0_s8, int8_t, svint8_t, + x0 = svlastb_s8 (p0, z0), + x0 = svlastb (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u16.c new file mode 100644 index 000000000..7f3db1bb1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u16.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_x0_u16: +** lastb w0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_X (lastb_x0_u16, uint16_t, svuint16_t, + x0 = svlastb_u16 (p0, z0), + x0 = svlastb (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u32.c new file mode 100644 index 000000000..c2eeacba0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u32.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_x0_u32: +** lastb w0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_X (lastb_x0_u32, uint32_t, svuint32_t, + x0 = svlastb_u32 (p0, z0), + x0 = svlastb (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u64.c new file mode 100644 index 000000000..1496ffa0e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u64.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_x0_u64: +** lastb x0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_X (lastb_x0_u64, uint64_t, svuint64_t, + x0 = svlastb_u64 (p0, z0), + x0 = svlastb (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u8.c new file mode 100644 index 000000000..25f036063 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lastb_u8.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lastb_x0_u8: +** lastb w0, p0, z0\.b +** ret +*/ +TEST_REDUCTION_X (lastb_x0_u8, uint8_t, svuint8_t, + x0 = svlastb_u8 (p0, z0), + x0 = svlastb (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_bf16.c new file mode 100644 index 000000000..07891de04 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_bf16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_bf16_base: +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_bf16_base, svbfloat16_t, bfloat16_t, + z0 = svld1_bf16 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_bf16_index: +** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1_bf16_index, svbfloat16_t, bfloat16_t, + z0 = svld1_bf16 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_bf16_1: +** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_bf16_1, svbfloat16_t, bfloat16_t, + z0 = svld1_bf16 (p0, x0 + svcnth ()), + z0 = svld1 (p0, x0 + svcnth ())) + +/* +** ld1_bf16_7: +** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_bf16_7, svbfloat16_t, bfloat16_t, + z0 = svld1_bf16 (p0, x0 + svcnth () * 7), + z0 = svld1 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_bf16_8: +** incb x0, all, mul #8 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_bf16_8, svbfloat16_t, bfloat16_t, + z0 = svld1_bf16 (p0, x0 + svcnth () * 8), + z0 = svld1 (p0, x0 + svcnth () * 8)) + +/* +** ld1_bf16_m1: +** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_bf16_m1, svbfloat16_t, bfloat16_t, + z0 = svld1_bf16 (p0, x0 - svcnth ()), + z0 = svld1 (p0, x0 - svcnth ())) + +/* +** ld1_bf16_m8: +** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_bf16_m8, svbfloat16_t, bfloat16_t, + z0 = svld1_bf16 (p0, x0 - svcnth () * 8), + z0 = svld1 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_bf16_m9: +** decb x0, all, mul #9 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_bf16_m9, svbfloat16_t, bfloat16_t, + z0 = svld1_bf16 (p0, x0 - svcnth () * 9), + z0 = svld1 (p0, x0 - svcnth () * 9)) + +/* +** ld1_vnum_bf16_0: +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_bf16_0, svbfloat16_t, bfloat16_t, + z0 = svld1_vnum_bf16 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_bf16_1: +** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_bf16_1, svbfloat16_t, bfloat16_t, + z0 = svld1_vnum_bf16 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_bf16_7: +** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_bf16_7, svbfloat16_t, bfloat16_t, + z0 = svld1_vnum_bf16 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_vnum_bf16_8: +** incb x0, all, mul #8 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_bf16_8, svbfloat16_t, bfloat16_t, + z0 = svld1_vnum_bf16 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_bf16_m1: +** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, + z0 = svld1_vnum_bf16 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_bf16_m8: +** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, + z0 = svld1_vnum_bf16 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_bf16_m9: +** decb x0, all, mul #9 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, + z0 = svld1_vnum_bf16 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, + z0 = svld1_vnum_bf16 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f16.c new file mode 100644 index 000000000..c3552bfbd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_f16_base: +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f16_base, svfloat16_t, float16_t, + z0 = svld1_f16 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_f16_index: +** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1_f16_index, svfloat16_t, float16_t, + z0 = svld1_f16 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_f16_1: +** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f16_1, svfloat16_t, float16_t, + z0 = svld1_f16 (p0, x0 + svcnth ()), + z0 = svld1 (p0, x0 + svcnth ())) + +/* +** ld1_f16_7: +** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f16_7, svfloat16_t, float16_t, + z0 = svld1_f16 (p0, x0 + svcnth () * 7), + z0 = svld1 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_f16_8: +** incb x0, all, mul #8 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f16_8, svfloat16_t, float16_t, + z0 = svld1_f16 (p0, x0 + svcnth () * 8), + z0 = svld1 (p0, x0 + svcnth () * 8)) + +/* +** ld1_f16_m1: +** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f16_m1, svfloat16_t, float16_t, + z0 = svld1_f16 (p0, x0 - svcnth ()), + z0 = svld1 (p0, x0 - svcnth ())) + +/* +** ld1_f16_m8: +** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f16_m8, svfloat16_t, float16_t, + z0 = svld1_f16 (p0, x0 - svcnth () * 8), + z0 = svld1 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_f16_m9: +** decb x0, all, mul #9 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f16_m9, svfloat16_t, float16_t, + z0 = svld1_f16 (p0, x0 - svcnth () * 9), + z0 = svld1 (p0, x0 - svcnth () * 9)) + +/* +** ld1_vnum_f16_0: +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f16_0, svfloat16_t, float16_t, + z0 = svld1_vnum_f16 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_f16_1: +** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f16_1, svfloat16_t, float16_t, + z0 = svld1_vnum_f16 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_f16_7: +** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f16_7, svfloat16_t, float16_t, + z0 = svld1_vnum_f16 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_f16_8: +** incb x0, all, mul #8 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f16_8, svfloat16_t, float16_t, + z0 = svld1_vnum_f16 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_f16_m1: +** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f16_m1, svfloat16_t, float16_t, + z0 = svld1_vnum_f16 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_f16_m8: +** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f16_m8, svfloat16_t, float16_t, + z0 = svld1_vnum_f16 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_f16_m9: +** decb x0, all, mul #9 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f16_m9, svfloat16_t, float16_t, + z0 = svld1_vnum_f16 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_f16_x1, svfloat16_t, float16_t, + z0 = svld1_vnum_f16 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f32.c new file mode 100644 index 000000000..8990f48d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_f32_base: +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f32_base, svfloat32_t, float32_t, + z0 = svld1_f32 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_f32_index: +** ld1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1_f32_index, svfloat32_t, float32_t, + z0 = svld1_f32 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_f32_1: +** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f32_1, svfloat32_t, float32_t, + z0 = svld1_f32 (p0, x0 + svcntw ()), + z0 = svld1 (p0, x0 + svcntw ())) + +/* +** ld1_f32_7: +** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f32_7, svfloat32_t, float32_t, + z0 = svld1_f32 (p0, x0 + svcntw () * 7), + z0 = svld1 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_f32_8: +** incb x0, all, mul #8 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f32_8, svfloat32_t, float32_t, + z0 = svld1_f32 (p0, x0 + svcntw () * 8), + z0 = svld1 (p0, x0 + svcntw () * 8)) + +/* +** ld1_f32_m1: +** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f32_m1, svfloat32_t, float32_t, + z0 = svld1_f32 (p0, x0 - svcntw ()), + z0 = svld1 (p0, x0 - svcntw ())) + +/* +** ld1_f32_m8: +** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f32_m8, svfloat32_t, float32_t, + z0 = svld1_f32 (p0, x0 - svcntw () * 8), + z0 = svld1 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_f32_m9: +** decb x0, all, mul #9 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f32_m9, svfloat32_t, float32_t, + z0 = svld1_f32 (p0, x0 - svcntw () * 9), + z0 = svld1 (p0, x0 - svcntw () * 9)) + +/* +** ld1_vnum_f32_0: +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f32_0, svfloat32_t, float32_t, + z0 = svld1_vnum_f32 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_f32_1: +** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f32_1, svfloat32_t, float32_t, + z0 = svld1_vnum_f32 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_f32_7: +** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f32_7, svfloat32_t, float32_t, + z0 = svld1_vnum_f32 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_f32_8: +** incb x0, all, mul #8 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f32_8, svfloat32_t, float32_t, + z0 = svld1_vnum_f32 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_f32_m1: +** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f32_m1, svfloat32_t, float32_t, + z0 = svld1_vnum_f32 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_f32_m8: +** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f32_m8, svfloat32_t, float32_t, + z0 = svld1_vnum_f32 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_f32_m9: +** decb x0, all, mul #9 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f32_m9, svfloat32_t, float32_t, + z0 = svld1_vnum_f32 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_f32_x1, svfloat32_t, float32_t, + z0 = svld1_vnum_f32 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f64.c new file mode 100644 index 000000000..eb28687fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_f64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_f64_base: +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f64_base, svfloat64_t, float64_t, + z0 = svld1_f64 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_f64_index: +** ld1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1_f64_index, svfloat64_t, float64_t, + z0 = svld1_f64 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_f64_1: +** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f64_1, svfloat64_t, float64_t, + z0 = svld1_f64 (p0, x0 + svcntd ()), + z0 = svld1 (p0, x0 + svcntd ())) + +/* +** ld1_f64_7: +** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f64_7, svfloat64_t, float64_t, + z0 = svld1_f64 (p0, x0 + svcntd () * 7), + z0 = svld1 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_f64_8: +** incb x0, all, mul #8 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f64_8, svfloat64_t, float64_t, + z0 = svld1_f64 (p0, x0 + svcntd () * 8), + z0 = svld1 (p0, x0 + svcntd () * 8)) + +/* +** ld1_f64_m1: +** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f64_m1, svfloat64_t, float64_t, + z0 = svld1_f64 (p0, x0 - svcntd ()), + z0 = svld1 (p0, x0 - svcntd ())) + +/* +** ld1_f64_m8: +** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_f64_m8, svfloat64_t, float64_t, + z0 = svld1_f64 (p0, x0 - svcntd () * 8), + z0 = svld1 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_f64_m9: +** decb x0, all, mul #9 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_f64_m9, svfloat64_t, float64_t, + z0 = svld1_f64 (p0, x0 - svcntd () * 9), + z0 = svld1 (p0, x0 - svcntd () * 9)) + +/* +** ld1_vnum_f64_0: +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f64_0, svfloat64_t, float64_t, + z0 = svld1_vnum_f64 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_f64_1: +** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f64_1, svfloat64_t, float64_t, + z0 = svld1_vnum_f64 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_f64_7: +** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f64_7, svfloat64_t, float64_t, + z0 = svld1_vnum_f64 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_f64_8: +** incb x0, all, mul #8 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f64_8, svfloat64_t, float64_t, + z0 = svld1_vnum_f64 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_f64_m1: +** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f64_m1, svfloat64_t, float64_t, + z0 = svld1_vnum_f64 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_f64_m8: +** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_f64_m8, svfloat64_t, float64_t, + z0 = svld1_vnum_f64 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_f64_m9: +** decb x0, all, mul #9 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_f64_m9, svfloat64_t, float64_t, + z0 = svld1_vnum_f64 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld1_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_f64_x1, svfloat64_t, float64_t, + z0 = svld1_vnum_f64 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f32.c new file mode 100644 index 000000000..00b68ff29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f32.c @@ -0,0 +1,272 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_gather_f32_tied1: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_f32_tied1, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_f32 (p0, z0), + z0_res = svld1_gather_f32 (p0, z0)) + +/* +** ld1_gather_f32_untied: +** ld1w z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_f32_untied, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_f32 (p0, z1), + z0_res = svld1_gather_f32 (p0, z1)) + +/* +** ld1_gather_x0_f32_offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_f32_offset, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_f32 (p0, z0, x0), + z0_res = svld1_gather_offset_f32 (p0, z0, x0)) + +/* +** ld1_gather_m4_f32_offset: +** mov (x[0-9]+), #?-4 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m4_f32_offset, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_f32 (p0, z0, -4), + z0_res = svld1_gather_offset_f32 (p0, z0, -4)) + +/* +** ld1_gather_0_f32_offset: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_f32_offset, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 0), + z0_res = svld1_gather_offset_f32 (p0, z0, 0)) + +/* +** ld1_gather_5_f32_offset: +** mov (x[0-9]+), #?5 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_f32_offset, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 5), + z0_res = svld1_gather_offset_f32 (p0, z0, 5)) + +/* +** ld1_gather_6_f32_offset: +** mov (x[0-9]+), #?6 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_6_f32_offset, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 6), + z0_res = svld1_gather_offset_f32 (p0, z0, 6)) + +/* +** ld1_gather_7_f32_offset: +** mov (x[0-9]+), #?7 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_7_f32_offset, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 7), + z0_res = svld1_gather_offset_f32 (p0, z0, 7)) + +/* +** ld1_gather_8_f32_offset: +** ld1w z0\.s, p0/z, \[z0\.s, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_8_f32_offset, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 8), + z0_res = svld1_gather_offset_f32 (p0, z0, 8)) + +/* +** ld1_gather_124_f32_offset: +** ld1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_124_f32_offset, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_f32 (p0, z0, 124), + z0_res = svld1_gather_offset_f32 (p0, z0, 124)) + +/* +** ld1_gather_128_f32_offset: +** mov (x[0-9]+), #?128 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_128_f32_offset, svfloat32_t, svuint32_t, + z0_res = 
svld1_gather_u32base_offset_f32 (p0, z0, 128), + z0_res = svld1_gather_offset_f32 (p0, z0, 128)) + +/* +** ld1_gather_x0_f32_index: +** lsl (x[0-9]+), x0, #?2 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_f32_index, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_f32 (p0, z0, x0), + z0_res = svld1_gather_index_f32 (p0, z0, x0)) + +/* +** ld1_gather_m1_f32_index: +** mov (x[0-9]+), #?-4 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m1_f32_index, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_f32 (p0, z0, -1), + z0_res = svld1_gather_index_f32 (p0, z0, -1)) + +/* +** ld1_gather_0_f32_index: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_f32_index, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_f32 (p0, z0, 0), + z0_res = svld1_gather_index_f32 (p0, z0, 0)) + +/* +** ld1_gather_5_f32_index: +** ld1w z0\.s, p0/z, \[z0\.s, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_f32_index, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_f32 (p0, z0, 5), + z0_res = svld1_gather_index_f32 (p0, z0, 5)) + +/* +** ld1_gather_31_f32_index: +** ld1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_31_f32_index, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_f32 (p0, z0, 31), + z0_res = svld1_gather_index_f32 (p0, z0, 31)) + +/* +** ld1_gather_32_f32_index: +** mov (x[0-9]+), #?128 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_32_f32_index, svfloat32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_f32 (p0, z0, 32), + z0_res = svld1_gather_index_f32 (p0, z0, 32)) + +/* +** ld1_gather_x0_f32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_f32_s32offset, svfloat32_t, float32_t, svint32_t, + z0_res = svld1_gather_s32offset_f32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_f32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f32_s32offset, svfloat32_t, float32_t, svint32_t, + z0_res = svld1_gather_s32offset_f32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_f32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_f32_s32offset, svfloat32_t, float32_t, svint32_t, + z0_res = svld1_gather_s32offset_f32 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_x0_f32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_f32_u32offset, svfloat32_t, float32_t, svuint32_t, + z0_res = svld1_gather_u32offset_f32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_f32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f32_u32offset, svfloat32_t, float32_t, svuint32_t, + z0_res = svld1_gather_u32offset_f32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_f32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_f32_u32offset, svfloat32_t, float32_t, svuint32_t, + z0_res = svld1_gather_u32offset_f32 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_x0_f32_s32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ 
(ld1_gather_x0_f32_s32index, svfloat32_t, float32_t, svint32_t, + z0_res = svld1_gather_s32index_f32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_f32_s32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f32_s32index, svfloat32_t, float32_t, svint32_t, + z0_res = svld1_gather_s32index_f32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_f32_s32index: +** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_f32_s32index, svfloat32_t, float32_t, svint32_t, + z0_res = svld1_gather_s32index_f32 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_x0_f32_u32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_f32_u32index, svfloat32_t, float32_t, svuint32_t, + z0_res = svld1_gather_u32index_f32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_f32_u32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f32_u32index, svfloat32_t, float32_t, svuint32_t, + z0_res = svld1_gather_u32index_f32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_f32_u32index: +** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_f32_u32index, svfloat32_t, float32_t, svuint32_t, + z0_res = svld1_gather_u32index_f32 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f64.c new file mode 100644 index 000000000..47127960c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_f64.c @@ -0,0 +1,348 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_gather_f64_tied1: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_f64_tied1, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_f64 (p0, z0), + z0_res = svld1_gather_f64 (p0, z0)) + +/* +** ld1_gather_f64_untied: +** ld1d z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_f64_untied, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_f64 (p0, z1), + z0_res = svld1_gather_f64 (p0, z1)) + +/* +** ld1_gather_x0_f64_offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, x0), + z0_res = svld1_gather_offset_f64 (p0, z0, x0)) + +/* +** ld1_gather_m8_f64_offset: +** mov (x[0-9]+), #?-8 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m8_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, -8), + z0_res = svld1_gather_offset_f64 (p0, z0, -8)) + +/* +** ld1_gather_0_f64_offset: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 0), + z0_res = svld1_gather_offset_f64 (p0, z0, 0)) + +/* +** ld1_gather_9_f64_offset: +** mov (x[0-9]+), #?9 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_9_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 9), + z0_res = svld1_gather_offset_f64 (p0, z0, 9)) + +/* +** ld1_gather_10_f64_offset: +** mov (x[0-9]+), #?10 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_10_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 10), + z0_res = svld1_gather_offset_f64 (p0, z0, 10)) + +/* +** ld1_gather_11_f64_offset: +** mov (x[0-9]+), #?11 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_11_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 11), + z0_res = svld1_gather_offset_f64 (p0, z0, 11)) + +/* +** ld1_gather_12_f64_offset: +** mov (x[0-9]+), #?12 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_12_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 12), + z0_res = svld1_gather_offset_f64 (p0, z0, 12)) + +/* +** ld1_gather_13_f64_offset: +** mov (x[0-9]+), #?13 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_13_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 13), + z0_res = svld1_gather_offset_f64 (p0, z0, 13)) + +/* +** ld1_gather_14_f64_offset: +** mov (x[0-9]+), #?14 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_14_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 14), + z0_res = svld1_gather_offset_f64 (p0, z0, 14)) + +/* +** ld1_gather_15_f64_offset: +** mov (x[0-9]+), #?15 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_15_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 15), + z0_res = svld1_gather_offset_f64 (p0, z0, 15)) + +/* +** ld1_gather_16_f64_offset: +** ld1d z0\.d, p0/z, \[z0\.d, #16\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_16_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 16), + z0_res = 
svld1_gather_offset_f64 (p0, z0, 16)) + +/* +** ld1_gather_248_f64_offset: +** ld1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_248_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 248), + z0_res = svld1_gather_offset_f64 (p0, z0, 248)) + +/* +** ld1_gather_256_f64_offset: +** mov (x[0-9]+), #?256 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_256_f64_offset, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_f64 (p0, z0, 256), + z0_res = svld1_gather_offset_f64 (p0, z0, 256)) + +/* +** ld1_gather_x0_f64_index: +** lsl (x[0-9]+), x0, #?3 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_f64_index, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_f64 (p0, z0, x0), + z0_res = svld1_gather_index_f64 (p0, z0, x0)) + +/* +** ld1_gather_m1_f64_index: +** mov (x[0-9]+), #?-8 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m1_f64_index, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_f64 (p0, z0, -1), + z0_res = svld1_gather_index_f64 (p0, z0, -1)) + +/* +** ld1_gather_0_f64_index: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_f64_index, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_f64 (p0, z0, 0), + z0_res = svld1_gather_index_f64 (p0, z0, 0)) + +/* +** ld1_gather_5_f64_index: +** ld1d z0\.d, p0/z, \[z0\.d, #40\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_f64_index, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_f64 (p0, z0, 5), + z0_res = svld1_gather_index_f64 (p0, z0, 5)) + +/* +** ld1_gather_31_f64_index: +** ld1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_31_f64_index, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_f64 (p0, z0, 31), + z0_res = svld1_gather_index_f64 (p0, z0, 31)) + +/* +** ld1_gather_32_f64_index: +** mov (x[0-9]+), #?256 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_32_f64_index, svfloat64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_f64 (p0, z0, 32), + z0_res = svld1_gather_index_f64 (p0, z0, 32)) + +/* +** ld1_gather_x0_f64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_f64_s64offset, svfloat64_t, float64_t, svint64_t, + z0_res = svld1_gather_s64offset_f64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_f64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f64_s64offset, svfloat64_t, float64_t, svint64_t, + z0_res = svld1_gather_s64offset_f64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_f64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_f64_s64offset, svfloat64_t, float64_t, svint64_t, + z0_res = svld1_gather_s64offset_f64 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_ext_f64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_f64_s64offset, svfloat64_t, float64_t, svint64_t, + z0_res = svld1_gather_s64offset_f64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_f64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + z0_res = 
svld1_gather_u64offset_f64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_f64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + z0_res = svld1_gather_u64offset_f64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_f64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + z0_res = svld1_gather_u64offset_f64 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_ext_f64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + z0_res = svld1_gather_u64offset_f64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_f64_s64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_f64_s64index, svfloat64_t, float64_t, svint64_t, + z0_res = svld1_gather_s64index_f64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_f64_s64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f64_s64index, svfloat64_t, float64_t, svint64_t, + z0_res = svld1_gather_s64index_f64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_f64_s64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_f64_s64index, svfloat64_t, float64_t, svint64_t, + z0_res = svld1_gather_s64index_f64 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_ext_f64_s64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_f64_s64index, svfloat64_t, float64_t, svint64_t, + z0_res = svld1_gather_s64index_f64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_f64_u64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_f64_u64index, svfloat64_t, float64_t, svuint64_t, + z0_res = svld1_gather_u64index_f64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_f64_u64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_f64_u64index, svfloat64_t, float64_t, svuint64_t, + z0_res = svld1_gather_u64index_f64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_f64_u64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_f64_u64index, svfloat64_t, float64_t, svuint64_t, + z0_res = svld1_gather_u64index_f64 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_ext_f64_u64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_f64_u64index, svfloat64_t, float64_t, svuint64_t, + z0_res = svld1_gather_u64index_f64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s32.c new file mode 100644 index 000000000..9b6335547 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s32.c @@ -0,0 +1,272 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_gather_s32_tied1: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_s32 (p0, z0), + z0_res = svld1_gather_s32 (p0, z0)) + +/* +** ld1_gather_s32_untied: +** ld1w z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_s32 (p0, z1), + z0_res = svld1_gather_s32 (p0, z1)) + +/* +** ld1_gather_x0_s32_offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svld1_gather_offset_s32 (p0, z0, x0)) + +/* +** ld1_gather_m4_s32_offset: +** mov (x[0-9]+), #?-4 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m4_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, -4), + z0_res = svld1_gather_offset_s32 (p0, z0, -4)) + +/* +** ld1_gather_0_s32_offset: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svld1_gather_offset_s32 (p0, z0, 0)) + +/* +** ld1_gather_5_s32_offset: +** mov (x[0-9]+), #?5 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svld1_gather_offset_s32 (p0, z0, 5)) + +/* +** ld1_gather_6_s32_offset: +** mov (x[0-9]+), #?6 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_6_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 6), + z0_res = svld1_gather_offset_s32 (p0, z0, 6)) + +/* +** ld1_gather_7_s32_offset: +** mov (x[0-9]+), #?7 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_7_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 7), + z0_res = svld1_gather_offset_s32 (p0, z0, 7)) + +/* +** ld1_gather_8_s32_offset: +** ld1w z0\.s, p0/z, \[z0\.s, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_8_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 8), + z0_res = svld1_gather_offset_s32 (p0, z0, 8)) + +/* +** ld1_gather_124_s32_offset: +** ld1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_124_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 124), + z0_res = svld1_gather_offset_s32 (p0, z0, 124)) + +/* +** ld1_gather_128_s32_offset: +** mov (x[0-9]+), #?128 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_128_s32_offset, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_s32 (p0, z0, 128), + z0_res = svld1_gather_offset_s32 (p0, z0, 128)) + +/* +** ld1_gather_x0_s32_index: +** lsl (x[0-9]+), x0, #?2 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_s32_index, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_s32 (p0, z0, x0), + z0_res = svld1_gather_index_s32 (p0, z0, x0)) + +/* +** ld1_gather_m1_s32_index: +** mov (x[0-9]+), #?-4 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m1_s32_index, svint32_t, svuint32_t, + z0_res = 
svld1_gather_u32base_index_s32 (p0, z0, -1), + z0_res = svld1_gather_index_s32 (p0, z0, -1)) + +/* +** ld1_gather_0_s32_index: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_s32_index, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_s32 (p0, z0, 0), + z0_res = svld1_gather_index_s32 (p0, z0, 0)) + +/* +** ld1_gather_5_s32_index: +** ld1w z0\.s, p0/z, \[z0\.s, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_s32_index, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_s32 (p0, z0, 5), + z0_res = svld1_gather_index_s32 (p0, z0, 5)) + +/* +** ld1_gather_31_s32_index: +** ld1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_31_s32_index, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_s32 (p0, z0, 31), + z0_res = svld1_gather_index_s32 (p0, z0, 31)) + +/* +** ld1_gather_32_s32_index: +** mov (x[0-9]+), #?128 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_32_s32_index, svint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_s32 (p0, z0, 32), + z0_res = svld1_gather_index_s32 (p0, z0, 32)) + +/* +** ld1_gather_x0_s32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_s32_s32offset, svint32_t, int32_t, svint32_t, + z0_res = svld1_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_s32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s32_s32offset, svint32_t, int32_t, svint32_t, + z0_res = svld1_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_s32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_s32_s32offset, svint32_t, int32_t, svint32_t, + z0_res = svld1_gather_s32offset_s32 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_x0_s32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_s32_u32offset, svint32_t, int32_t, svuint32_t, + z0_res = svld1_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_s32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s32_u32offset, svint32_t, int32_t, svuint32_t, + z0_res = svld1_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_s32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_s32_u32offset, svint32_t, int32_t, svuint32_t, + z0_res = svld1_gather_u32offset_s32 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_x0_s32_s32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_s32_s32index, svint32_t, int32_t, svint32_t, + z0_res = svld1_gather_s32index_s32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_s32_s32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s32_s32index, svint32_t, int32_t, svint32_t, + z0_res = svld1_gather_s32index_s32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_s32_s32index: +** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_s32_s32index, svint32_t, int32_t, svint32_t, + z0_res = 
svld1_gather_s32index_s32 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_x0_s32_u32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_s32_u32index, svint32_t, int32_t, svuint32_t, + z0_res = svld1_gather_u32index_s32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_s32_u32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s32_u32index, svint32_t, int32_t, svuint32_t, + z0_res = svld1_gather_u32index_s32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_s32_u32index: +** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_s32_u32index, svint32_t, int32_t, svuint32_t, + z0_res = svld1_gather_u32index_s32 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s64.c new file mode 100644 index 000000000..c9cea3ad8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_s64.c @@ -0,0 +1,348 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_gather_s64_tied1: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_s64 (p0, z0), + z0_res = svld1_gather_s64 (p0, z0)) + +/* +** ld1_gather_s64_untied: +** ld1d z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_s64 (p0, z1), + z0_res = svld1_gather_s64 (p0, z1)) + +/* +** ld1_gather_x0_s64_offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svld1_gather_offset_s64 (p0, z0, x0)) + +/* +** ld1_gather_m8_s64_offset: +** mov (x[0-9]+), #?-8 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m8_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, -8), + z0_res = svld1_gather_offset_s64 (p0, z0, -8)) + +/* +** ld1_gather_0_s64_offset: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svld1_gather_offset_s64 (p0, z0, 0)) + +/* +** ld1_gather_9_s64_offset: +** mov (x[0-9]+), #?9 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_9_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 9), + z0_res = svld1_gather_offset_s64 (p0, z0, 9)) + +/* +** ld1_gather_10_s64_offset: +** mov (x[0-9]+), #?10 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_10_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 10), + z0_res = svld1_gather_offset_s64 (p0, z0, 10)) + +/* +** ld1_gather_11_s64_offset: +** mov (x[0-9]+), #?11 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_11_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 11), + z0_res = svld1_gather_offset_s64 (p0, z0, 11)) + +/* +** ld1_gather_12_s64_offset: +** mov (x[0-9]+), #?12 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ 
+TEST_LOAD_GATHER_ZS (ld1_gather_12_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 12), + z0_res = svld1_gather_offset_s64 (p0, z0, 12)) + +/* +** ld1_gather_13_s64_offset: +** mov (x[0-9]+), #?13 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_13_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 13), + z0_res = svld1_gather_offset_s64 (p0, z0, 13)) + +/* +** ld1_gather_14_s64_offset: +** mov (x[0-9]+), #?14 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_14_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 14), + z0_res = svld1_gather_offset_s64 (p0, z0, 14)) + +/* +** ld1_gather_15_s64_offset: +** mov (x[0-9]+), #?15 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_15_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 15), + z0_res = svld1_gather_offset_s64 (p0, z0, 15)) + +/* +** ld1_gather_16_s64_offset: +** ld1d z0\.d, p0/z, \[z0\.d, #16\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_16_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 16), + z0_res = svld1_gather_offset_s64 (p0, z0, 16)) + +/* +** ld1_gather_248_s64_offset: +** ld1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_248_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 248), + z0_res = svld1_gather_offset_s64 (p0, z0, 248)) + +/* +** ld1_gather_256_s64_offset: +** mov (x[0-9]+), #?256 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_256_s64_offset, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_s64 (p0, z0, 256), + z0_res = svld1_gather_offset_s64 (p0, z0, 256)) + +/* +** ld1_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?3 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svld1_gather_index_s64 (p0, z0, x0)) + +/* +** ld1_gather_m1_s64_index: +** mov (x[0-9]+), #?-8 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svld1_gather_index_s64 (p0, z0, -1)) + +/* +** ld1_gather_0_s64_index: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svld1_gather_index_s64 (p0, z0, 0)) + +/* +** ld1_gather_5_s64_index: +** ld1d z0\.d, p0/z, \[z0\.d, #40\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svld1_gather_index_s64 (p0, z0, 5)) + +/* +** ld1_gather_31_s64_index: +** ld1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svld1_gather_index_s64 (p0, z0, 31)) + +/* +** ld1_gather_32_s64_index: +** mov (x[0-9]+), #?256 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svld1_gather_index_s64 (p0, z0, 32)) + +/* +** ld1_gather_x0_s64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ 
+TEST_LOAD_GATHER_SZ (ld1_gather_x0_s64_s64offset, svint64_t, int64_t, svint64_t, + z0_res = svld1_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_s64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s64_s64offset, svint64_t, int64_t, svint64_t, + z0_res = svld1_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_s64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_s64_s64offset, svint64_t, int64_t, svint64_t, + z0_res = svld1_gather_s64offset_s64 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_ext_s64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_s64_s64offset, svint64_t, int64_t, svint64_t, + z0_res = svld1_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_s64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_s64_u64offset, svint64_t, int64_t, svuint64_t, + z0_res = svld1_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_s64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s64_u64offset, svint64_t, int64_t, svuint64_t, + z0_res = svld1_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_s64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_s64_u64offset, svint64_t, int64_t, svuint64_t, + z0_res = svld1_gather_u64offset_s64 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_ext_s64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_s64_u64offset, svint64_t, int64_t, svuint64_t, + z0_res = svld1_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_s64_s64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_s64_s64index, svint64_t, int64_t, svint64_t, + z0_res = svld1_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_s64_s64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s64_s64index, svint64_t, int64_t, svint64_t, + z0_res = svld1_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_s64_s64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_s64_s64index, svint64_t, int64_t, svint64_t, + z0_res = svld1_gather_s64index_s64 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_ext_s64_s64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_s64_s64index, svint64_t, int64_t, svint64_t, + z0_res = svld1_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_s64_u64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_s64_u64index, svint64_t, int64_t, svuint64_t, + z0_res = svld1_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) 
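[Editorial sketch, not part of the backported patch: the svld1_gather_*index* intrinsics exercised by the surrounding ld1_gather_s64.c tests take a scalar base pointer plus a vector of 64-bit element indices. The hypothetical helper below shows one plausible stand-alone use of svld1_gather_s64index_s64, assuming an AArch64 toolchain with SVE enabled (e.g. -march=armv8.2-a+sve) and <arm_sve.h>; names such as gather_s64 are illustrative only.]

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only: gather table[idx[i]] for every active lane,
   mirroring the svld1_gather_s64index_s64 calls tested above.  */
static void
gather_s64 (int64_t *dst, const int64_t *table,
            const int64_t *idx, size_t n)
{
  for (size_t i = 0; i < n; i += svcntd ())
    {
      /* Predicate covering the lanes that are still in range.  */
      svbool_t pg = svwhilelt_b64_u64 ((uint64_t) i, (uint64_t) n);
      /* Load a vector of element indices.  */
      svint64_t indices = svld1_s64 (pg, idx + i);
      /* Index form: each lane loads table[indices[lane]], i.e. the index
         is scaled by sizeof (int64_t), matching the "lsl 3" addressing
         mode checked in the tests.  */
      svint64_t vals = svld1_gather_s64index_s64 (pg, table, indices);
      svst1_s64 (pg, dst + i, vals);
    }
}
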
+ +/* +** ld1_gather_tied1_s64_u64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_s64_u64index, svint64_t, int64_t, svuint64_t, + z0_res = svld1_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_s64_u64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_s64_u64index, svint64_t, int64_t, svuint64_t, + z0_res = svld1_gather_u64index_s64 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_ext_s64_u64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_s64_u64index, svint64_t, int64_t, svuint64_t, + z0_res = svld1_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u32.c new file mode 100644 index 000000000..2cccc8d49 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u32.c @@ -0,0 +1,272 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_gather_u32_tied1: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_u32 (p0, z0), + z0_res = svld1_gather_u32 (p0, z0)) + +/* +** ld1_gather_u32_untied: +** ld1w z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_u32 (p0, z1), + z0_res = svld1_gather_u32 (p0, z1)) + +/* +** ld1_gather_x0_u32_offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svld1_gather_offset_u32 (p0, z0, x0)) + +/* +** ld1_gather_m4_u32_offset: +** mov (x[0-9]+), #?-4 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m4_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, -4), + z0_res = svld1_gather_offset_u32 (p0, z0, -4)) + +/* +** ld1_gather_0_u32_offset: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svld1_gather_offset_u32 (p0, z0, 0)) + +/* +** ld1_gather_5_u32_offset: +** mov (x[0-9]+), #?5 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svld1_gather_offset_u32 (p0, z0, 5)) + +/* +** ld1_gather_6_u32_offset: +** mov (x[0-9]+), #?6 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_6_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 6), + z0_res = svld1_gather_offset_u32 (p0, z0, 6)) + +/* +** ld1_gather_7_u32_offset: +** mov (x[0-9]+), #?7 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_7_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 7), + z0_res = svld1_gather_offset_u32 (p0, z0, 7)) + +/* +** ld1_gather_8_u32_offset: +** ld1w z0\.s, p0/z, \[z0\.s, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_8_u32_offset, svuint32_t, 
svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 8), + z0_res = svld1_gather_offset_u32 (p0, z0, 8)) + +/* +** ld1_gather_124_u32_offset: +** ld1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_124_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 124), + z0_res = svld1_gather_offset_u32 (p0, z0, 124)) + +/* +** ld1_gather_128_u32_offset: +** mov (x[0-9]+), #?128 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_128_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_offset_u32 (p0, z0, 128), + z0_res = svld1_gather_offset_u32 (p0, z0, 128)) + +/* +** ld1_gather_x0_u32_index: +** lsl (x[0-9]+), x0, #?2 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_u32_index, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_u32 (p0, z0, x0), + z0_res = svld1_gather_index_u32 (p0, z0, x0)) + +/* +** ld1_gather_m1_u32_index: +** mov (x[0-9]+), #?-4 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m1_u32_index, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_u32 (p0, z0, -1), + z0_res = svld1_gather_index_u32 (p0, z0, -1)) + +/* +** ld1_gather_0_u32_index: +** ld1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_u32_index, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_u32 (p0, z0, 0), + z0_res = svld1_gather_index_u32 (p0, z0, 0)) + +/* +** ld1_gather_5_u32_index: +** ld1w z0\.s, p0/z, \[z0\.s, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_u32_index, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_u32 (p0, z0, 5), + z0_res = svld1_gather_index_u32 (p0, z0, 5)) + +/* +** ld1_gather_31_u32_index: +** ld1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_31_u32_index, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_u32 (p0, z0, 31), + z0_res = svld1_gather_index_u32 (p0, z0, 31)) + +/* +** ld1_gather_32_u32_index: +** mov (x[0-9]+), #?128 +** ld1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_32_u32_index, svuint32_t, svuint32_t, + z0_res = svld1_gather_u32base_index_u32 (p0, z0, 32), + z0_res = svld1_gather_index_u32 (p0, z0, 32)) + +/* +** ld1_gather_x0_u32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_u32_s32offset, svuint32_t, uint32_t, svint32_t, + z0_res = svld1_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_u32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u32_s32offset, svuint32_t, uint32_t, svint32_t, + z0_res = svld1_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_u32_s32offset: +** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_u32_s32offset, svuint32_t, uint32_t, svint32_t, + z0_res = svld1_gather_s32offset_u32 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_x0_u32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_u32_u32offset, svuint32_t, uint32_t, svuint32_t, + z0_res = svld1_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_u32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ 
(ld1_gather_tied1_u32_u32offset, svuint32_t, uint32_t, svuint32_t, + z0_res = svld1_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_u32_u32offset: +** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_u32_u32offset, svuint32_t, uint32_t, svuint32_t, + z0_res = svld1_gather_u32offset_u32 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_x0_u32_s32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_u32_s32index, svuint32_t, uint32_t, svint32_t, + z0_res = svld1_gather_s32index_u32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_u32_s32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u32_s32index, svuint32_t, uint32_t, svint32_t, + z0_res = svld1_gather_s32index_u32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_u32_s32index: +** ld1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_u32_s32index, svuint32_t, uint32_t, svint32_t, + z0_res = svld1_gather_s32index_u32 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_x0_u32_u32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_u32_u32index, svuint32_t, uint32_t, svuint32_t, + z0_res = svld1_gather_u32index_u32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_u32_u32index: +** ld1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u32_u32index, svuint32_t, uint32_t, svuint32_t, + z0_res = svld1_gather_u32index_u32 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_u32_u32index: +** ld1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_u32_u32index, svuint32_t, uint32_t, svuint32_t, + z0_res = svld1_gather_u32index_u32 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u64.c new file mode 100644 index 000000000..6ee1d48ab --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_gather_u64.c @@ -0,0 +1,348 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_gather_u64_tied1: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_u64 (p0, z0), + z0_res = svld1_gather_u64 (p0, z0)) + +/* +** ld1_gather_u64_untied: +** ld1d z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_u64 (p0, z1), + z0_res = svld1_gather_u64 (p0, z1)) + +/* +** ld1_gather_x0_u64_offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svld1_gather_offset_u64 (p0, z0, x0)) + +/* +** ld1_gather_m8_u64_offset: +** mov (x[0-9]+), #?-8 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m8_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, -8), + z0_res = svld1_gather_offset_u64 (p0, z0, -8)) + +/* +** ld1_gather_0_u64_offset: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svld1_gather_offset_u64 (p0, z0, 0)) + +/* +** ld1_gather_9_u64_offset: +** mov (x[0-9]+), #?9 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_9_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 9), + z0_res = svld1_gather_offset_u64 (p0, z0, 9)) + +/* +** ld1_gather_10_u64_offset: +** mov (x[0-9]+), #?10 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_10_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 10), + z0_res = svld1_gather_offset_u64 (p0, z0, 10)) + +/* +** ld1_gather_11_u64_offset: +** mov (x[0-9]+), #?11 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_11_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 11), + z0_res = svld1_gather_offset_u64 (p0, z0, 11)) + +/* +** ld1_gather_12_u64_offset: +** mov (x[0-9]+), #?12 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_12_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 12), + z0_res = svld1_gather_offset_u64 (p0, z0, 12)) + +/* +** ld1_gather_13_u64_offset: +** mov (x[0-9]+), #?13 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_13_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 13), + z0_res = svld1_gather_offset_u64 (p0, z0, 13)) + +/* +** ld1_gather_14_u64_offset: +** mov (x[0-9]+), #?14 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_14_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 14), + z0_res = svld1_gather_offset_u64 (p0, z0, 14)) + +/* +** ld1_gather_15_u64_offset: +** mov (x[0-9]+), #?15 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_15_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 15), + z0_res = svld1_gather_offset_u64 (p0, z0, 15)) + +/* +** ld1_gather_16_u64_offset: +** ld1d z0\.d, p0/z, \[z0\.d, #16\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_16_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 16), + z0_res = svld1_gather_offset_u64 
(p0, z0, 16)) + +/* +** ld1_gather_248_u64_offset: +** ld1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_248_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 248), + z0_res = svld1_gather_offset_u64 (p0, z0, 248)) + +/* +** ld1_gather_256_u64_offset: +** mov (x[0-9]+), #?256 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_256_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_offset_u64 (p0, z0, 256), + z0_res = svld1_gather_offset_u64 (p0, z0, 256)) + +/* +** ld1_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?3 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svld1_gather_index_u64 (p0, z0, x0)) + +/* +** ld1_gather_m1_u64_index: +** mov (x[0-9]+), #?-8 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svld1_gather_index_u64 (p0, z0, -1)) + +/* +** ld1_gather_0_u64_index: +** ld1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svld1_gather_index_u64 (p0, z0, 0)) + +/* +** ld1_gather_5_u64_index: +** ld1d z0\.d, p0/z, \[z0\.d, #40\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svld1_gather_index_u64 (p0, z0, 5)) + +/* +** ld1_gather_31_u64_index: +** ld1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svld1_gather_index_u64 (p0, z0, 31)) + +/* +** ld1_gather_32_u64_index: +** mov (x[0-9]+), #?256 +** ld1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svld1_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svld1_gather_index_u64 (p0, z0, 32)) + +/* +** ld1_gather_x0_u64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_u64_s64offset, svuint64_t, uint64_t, svint64_t, + z0_res = svld1_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_u64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u64_s64offset, svuint64_t, uint64_t, svint64_t, + z0_res = svld1_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_u64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_u64_s64offset, svuint64_t, uint64_t, svint64_t, + z0_res = svld1_gather_s64offset_u64 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_ext_u64_s64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_u64_s64offset, svuint64_t, uint64_t, svint64_t, + z0_res = svld1_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_u64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + z0_res = svld1_gather_u64offset_u64 (p0, x0, z0), + z0_res = 
svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_tied1_u64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + z0_res = svld1_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1_gather_offset (p0, x0, z0)) + +/* +** ld1_gather_untied_u64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + z0_res = svld1_gather_u64offset_u64 (p0, x0, z1), + z0_res = svld1_gather_offset (p0, x0, z1)) + +/* +** ld1_gather_ext_u64_u64offset: +** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + z0_res = svld1_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_u64_s64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_u64_s64index, svuint64_t, uint64_t, svint64_t, + z0_res = svld1_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_u64_s64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u64_s64index, svuint64_t, uint64_t, svint64_t, + z0_res = svld1_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_u64_s64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_u64_s64index, svuint64_t, uint64_t, svint64_t, + z0_res = svld1_gather_s64index_u64 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_ext_u64_s64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_u64_s64index, svuint64_t, uint64_t, svint64_t, + z0_res = svld1_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) + +/* +** ld1_gather_x0_u64_u64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_x0_u64_u64index, svuint64_t, uint64_t, svuint64_t, + z0_res = svld1_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_tied1_u64_u64index: +** ld1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_tied1_u64_u64index, svuint64_t, uint64_t, svuint64_t, + z0_res = svld1_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1_gather_index (p0, x0, z0)) + +/* +** ld1_gather_untied_u64_u64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_untied_u64_u64index, svuint64_t, uint64_t, svuint64_t, + z0_res = svld1_gather_u64index_u64 (p0, x0, z1), + z0_res = svld1_gather_index (p0, x0, z1)) + +/* +** ld1_gather_ext_u64_u64index: +** ld1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1_gather_ext_u64_u64index, svuint64_t, uint64_t, svuint64_t, + z0_res = svld1_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1_gather_index (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s16.c new file mode 100644 index 000000000..d86b49a73 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_s16_base: +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s16_base, svint16_t, int16_t, + z0 = svld1_s16 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_s16_index: +** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1_s16_index, svint16_t, int16_t, + z0 = svld1_s16 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_s16_1: +** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s16_1, svint16_t, int16_t, + z0 = svld1_s16 (p0, x0 + svcnth ()), + z0 = svld1 (p0, x0 + svcnth ())) + +/* +** ld1_s16_7: +** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s16_7, svint16_t, int16_t, + z0 = svld1_s16 (p0, x0 + svcnth () * 7), + z0 = svld1 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_s16_8: +** incb x0, all, mul #8 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s16_8, svint16_t, int16_t, + z0 = svld1_s16 (p0, x0 + svcnth () * 8), + z0 = svld1 (p0, x0 + svcnth () * 8)) + +/* +** ld1_s16_m1: +** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s16_m1, svint16_t, int16_t, + z0 = svld1_s16 (p0, x0 - svcnth ()), + z0 = svld1 (p0, x0 - svcnth ())) + +/* +** ld1_s16_m8: +** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s16_m8, svint16_t, int16_t, + z0 = svld1_s16 (p0, x0 - svcnth () * 8), + z0 = svld1 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_s16_m9: +** decb x0, all, mul #9 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s16_m9, svint16_t, int16_t, + z0 = svld1_s16 (p0, x0 - svcnth () * 9), + z0 = svld1 (p0, x0 - svcnth () * 9)) + +/* +** ld1_vnum_s16_0: +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s16_0, svint16_t, int16_t, + z0 = svld1_vnum_s16 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_s16_1: +** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s16_1, svint16_t, int16_t, + z0 = svld1_vnum_s16 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_s16_7: +** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s16_7, svint16_t, int16_t, + z0 = svld1_vnum_s16 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_s16_8: +** incb x0, all, mul #8 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s16_8, svint16_t, int16_t, + z0 = svld1_vnum_s16 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_s16_m1: +** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s16_m1, svint16_t, int16_t, + z0 = svld1_vnum_s16 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_s16_m8: +** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s16_m8, svint16_t, int16_t, + z0 = svld1_vnum_s16 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_s16_m9: +** decb x0, all, mul #9 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s16_m9, svint16_t, int16_t, + z0 = svld1_vnum_s16 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld1_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_s16_x1, svint16_t, int16_t, + z0 = svld1_vnum_s16 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s32.c new file mode 100644 index 000000000..5b692e510 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_s32_base: +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s32_base, svint32_t, int32_t, + z0 = svld1_s32 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_s32_index: +** ld1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1_s32_index, svint32_t, int32_t, + z0 = svld1_s32 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_s32_1: +** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s32_1, svint32_t, int32_t, + z0 = svld1_s32 (p0, x0 + svcntw ()), + z0 = svld1 (p0, x0 + svcntw ())) + +/* +** ld1_s32_7: +** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s32_7, svint32_t, int32_t, + z0 = svld1_s32 (p0, x0 + svcntw () * 7), + z0 = svld1 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_s32_8: +** incb x0, all, mul #8 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s32_8, svint32_t, int32_t, + z0 = svld1_s32 (p0, x0 + svcntw () * 8), + z0 = svld1 (p0, x0 + svcntw () * 8)) + +/* +** ld1_s32_m1: +** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s32_m1, svint32_t, int32_t, + z0 = svld1_s32 (p0, x0 - svcntw ()), + z0 = svld1 (p0, x0 - svcntw ())) + +/* +** ld1_s32_m8: +** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s32_m8, svint32_t, int32_t, + z0 = svld1_s32 (p0, x0 - svcntw () * 8), + z0 = svld1 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_s32_m9: +** decb x0, all, mul #9 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s32_m9, svint32_t, int32_t, + z0 = svld1_s32 (p0, x0 - svcntw () * 9), + z0 = svld1 (p0, x0 - svcntw () * 9)) + +/* +** ld1_vnum_s32_0: +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s32_0, svint32_t, int32_t, + z0 = svld1_vnum_s32 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_s32_1: +** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s32_1, svint32_t, int32_t, + z0 = svld1_vnum_s32 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_s32_7: +** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s32_7, svint32_t, int32_t, + z0 = svld1_vnum_s32 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_vnum_s32_8: +** incb x0, all, mul #8 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s32_8, svint32_t, int32_t, + z0 = svld1_vnum_s32 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_s32_m1: +** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s32_m1, svint32_t, int32_t, + z0 = svld1_vnum_s32 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_s32_m8: +** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s32_m8, svint32_t, int32_t, + z0 = svld1_vnum_s32 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_s32_m9: +** decb x0, all, mul #9 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s32_m9, svint32_t, int32_t, + z0 = svld1_vnum_s32 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_s32_x1, svint32_t, int32_t, + z0 = svld1_vnum_s32 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s64.c new file mode 100644 index 000000000..15ee29bba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_s64_base: +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s64_base, svint64_t, int64_t, + z0 = svld1_s64 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_s64_index: +** ld1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1_s64_index, svint64_t, int64_t, + z0 = svld1_s64 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_s64_1: +** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s64_1, svint64_t, int64_t, + z0 = svld1_s64 (p0, x0 + svcntd ()), + z0 = svld1 (p0, x0 + svcntd ())) + +/* +** ld1_s64_7: +** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s64_7, svint64_t, int64_t, + z0 = svld1_s64 (p0, x0 + svcntd () * 7), + z0 = svld1 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_s64_8: +** incb x0, all, mul #8 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s64_8, svint64_t, int64_t, + z0 = svld1_s64 (p0, x0 + svcntd () * 8), + z0 = svld1 (p0, x0 + svcntd () * 8)) + +/* +** ld1_s64_m1: +** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s64_m1, svint64_t, int64_t, + z0 = svld1_s64 (p0, x0 - svcntd ()), + z0 = svld1 (p0, x0 - svcntd ())) + +/* +** ld1_s64_m8: +** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s64_m8, svint64_t, int64_t, + z0 = svld1_s64 (p0, x0 - svcntd () * 8), + z0 = svld1 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_s64_m9: +** decb x0, all, mul #9 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s64_m9, svint64_t, int64_t, + z0 = svld1_s64 (p0, x0 - svcntd () * 9), + z0 = svld1 (p0, x0 - svcntd () * 9)) + +/* +** ld1_vnum_s64_0: +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s64_0, svint64_t, int64_t, + z0 = svld1_vnum_s64 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_s64_1: +** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s64_1, svint64_t, int64_t, + z0 = svld1_vnum_s64 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_s64_7: +** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s64_7, svint64_t, int64_t, + z0 = svld1_vnum_s64 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_s64_8: +** incb x0, all, mul #8 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s64_8, svint64_t, int64_t, + z0 = svld1_vnum_s64 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_s64_m1: +** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s64_m1, svint64_t, int64_t, + z0 = svld1_vnum_s64 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_s64_m8: +** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s64_m8, svint64_t, int64_t, + z0 = svld1_vnum_s64 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_s64_m9: +** decb x0, all, mul #9 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s64_m9, svint64_t, int64_t, + z0 = svld1_vnum_s64 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_s64_x1, svint64_t, int64_t, + z0 = svld1_vnum_s64 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s8.c new file mode 100644 index 000000000..036fb3d41 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_s8.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_s8_base: +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s8_base, svint8_t, int8_t, + z0 = svld1_s8 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_s8_index: +** ld1b z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1_s8_index, svint8_t, int8_t, + z0 = svld1_s8 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_s8_1: +** ld1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s8_1, svint8_t, int8_t, + z0 = svld1_s8 (p0, x0 + svcntb ()), + z0 = svld1 (p0, x0 + svcntb ())) + +/* +** ld1_s8_7: +** ld1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s8_7, svint8_t, int8_t, + z0 = svld1_s8 (p0, x0 + svcntb () * 7), + z0 = svld1 (p0, x0 + svcntb () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_s8_8: +** incb x0, all, mul #8 +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s8_8, svint8_t, int8_t, + z0 = svld1_s8 (p0, x0 + svcntb () * 8), + z0 = svld1 (p0, x0 + svcntb () * 8)) + +/* +** ld1_s8_m1: +** ld1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s8_m1, svint8_t, int8_t, + z0 = svld1_s8 (p0, x0 - svcntb ()), + z0 = svld1 (p0, x0 - svcntb ())) + +/* +** ld1_s8_m8: +** ld1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_s8_m8, svint8_t, int8_t, + z0 = svld1_s8 (p0, x0 - svcntb () * 8), + z0 = svld1 (p0, x0 - svcntb () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_s8_m9: +** decb x0, all, mul #9 +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_s8_m9, svint8_t, int8_t, + z0 = svld1_s8 (p0, x0 - svcntb () * 9), + z0 = svld1 (p0, x0 - svcntb () * 9)) + +/* +** ld1_vnum_s8_0: +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s8_0, svint8_t, int8_t, + z0 = svld1_vnum_s8 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_s8_1: +** ld1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s8_1, svint8_t, int8_t, + z0 = svld1_vnum_s8 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_s8_7: +** ld1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s8_7, svint8_t, int8_t, + z0 = svld1_vnum_s8 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_s8_8: +** incb x0, all, mul #8 +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s8_8, svint8_t, int8_t, + z0 = svld1_vnum_s8 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_s8_m1: +** ld1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s8_m1, svint8_t, int8_t, + z0 = svld1_vnum_s8 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_s8_m8: +** ld1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_s8_m8, svint8_t, int8_t, + z0 = svld1_vnum_s8 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_s8_m9: +** decb x0, all, mul #9 +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_s8_m9, svint8_t, int8_t, + z0 = svld1_vnum_s8 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* +** ld1_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1b z0\.b, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1b z0\.b, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1_vnum_s8_x1, svint8_t, int8_t, + z0 = svld1_vnum_s8 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u16.c new file mode 100644 index 000000000..ee25b9e37 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_u16_base: +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u16_base, svuint16_t, uint16_t, + z0 = svld1_u16 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_u16_index: +** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1_u16_index, svuint16_t, uint16_t, + z0 = svld1_u16 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_u16_1: +** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u16_1, svuint16_t, uint16_t, + z0 = svld1_u16 (p0, x0 + svcnth ()), + z0 = svld1 (p0, x0 + svcnth ())) + +/* +** ld1_u16_7: +** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u16_7, svuint16_t, uint16_t, + z0 = svld1_u16 (p0, x0 + svcnth () * 7), + z0 = svld1 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_u16_8: +** incb x0, all, mul #8 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u16_8, svuint16_t, uint16_t, + z0 = svld1_u16 (p0, x0 + svcnth () * 8), + z0 = svld1 (p0, x0 + svcnth () * 8)) + +/* +** ld1_u16_m1: +** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u16_m1, svuint16_t, uint16_t, + z0 = svld1_u16 (p0, x0 - svcnth ()), + z0 = svld1 (p0, x0 - svcnth ())) + +/* +** ld1_u16_m8: +** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u16_m8, svuint16_t, uint16_t, + z0 = svld1_u16 (p0, x0 - svcnth () * 8), + z0 = svld1 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_u16_m9: +** decb x0, all, mul #9 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u16_m9, svuint16_t, uint16_t, + z0 = svld1_u16 (p0, x0 - svcnth () * 9), + z0 = svld1 (p0, x0 - svcnth () * 9)) + +/* +** ld1_vnum_u16_0: +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u16_0, svuint16_t, uint16_t, + z0 = svld1_vnum_u16 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_u16_1: +** ld1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u16_1, svuint16_t, uint16_t, + z0 = svld1_vnum_u16 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_u16_7: +** ld1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u16_7, svuint16_t, uint16_t, + z0 = svld1_vnum_u16 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_u16_8: +** incb x0, all, mul #8 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u16_8, svuint16_t, uint16_t, + z0 = svld1_vnum_u16 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_u16_m1: +** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u16_m1, svuint16_t, uint16_t, + z0 = svld1_vnum_u16 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_u16_m8: +** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u16_m8, svuint16_t, uint16_t, + z0 = svld1_vnum_u16 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_u16_m9: +** decb x0, all, mul #9 +** ld1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u16_m9, svuint16_t, uint16_t, + z0 = svld1_vnum_u16 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld1_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_u16_x1, svuint16_t, uint16_t, + z0 = svld1_vnum_u16 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u32.c new file mode 100644 index 000000000..bcd304126 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_u32_base: +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u32_base, svuint32_t, uint32_t, + z0 = svld1_u32 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_u32_index: +** ld1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1_u32_index, svuint32_t, uint32_t, + z0 = svld1_u32 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_u32_1: +** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u32_1, svuint32_t, uint32_t, + z0 = svld1_u32 (p0, x0 + svcntw ()), + z0 = svld1 (p0, x0 + svcntw ())) + +/* +** ld1_u32_7: +** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u32_7, svuint32_t, uint32_t, + z0 = svld1_u32 (p0, x0 + svcntw () * 7), + z0 = svld1 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_u32_8: +** incb x0, all, mul #8 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u32_8, svuint32_t, uint32_t, + z0 = svld1_u32 (p0, x0 + svcntw () * 8), + z0 = svld1 (p0, x0 + svcntw () * 8)) + +/* +** ld1_u32_m1: +** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u32_m1, svuint32_t, uint32_t, + z0 = svld1_u32 (p0, x0 - svcntw ()), + z0 = svld1 (p0, x0 - svcntw ())) + +/* +** ld1_u32_m8: +** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u32_m8, svuint32_t, uint32_t, + z0 = svld1_u32 (p0, x0 - svcntw () * 8), + z0 = svld1 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_u32_m9: +** decb x0, all, mul #9 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u32_m9, svuint32_t, uint32_t, + z0 = svld1_u32 (p0, x0 - svcntw () * 9), + z0 = svld1 (p0, x0 - svcntw () * 9)) + +/* +** ld1_vnum_u32_0: +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u32_0, svuint32_t, uint32_t, + z0 = svld1_vnum_u32 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_u32_1: +** ld1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u32_1, svuint32_t, uint32_t, + z0 = svld1_vnum_u32 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_u32_7: +** ld1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u32_7, svuint32_t, uint32_t, + z0 = svld1_vnum_u32 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_vnum_u32_8: +** incb x0, all, mul #8 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u32_8, svuint32_t, uint32_t, + z0 = svld1_vnum_u32 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_u32_m1: +** ld1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u32_m1, svuint32_t, uint32_t, + z0 = svld1_vnum_u32 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_u32_m8: +** ld1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u32_m8, svuint32_t, uint32_t, + z0 = svld1_vnum_u32 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_u32_m9: +** decb x0, all, mul #9 +** ld1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u32_m9, svuint32_t, uint32_t, + z0 = svld1_vnum_u32 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_u32_x1, svuint32_t, uint32_t, + z0 = svld1_vnum_u32 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u64.c new file mode 100644 index 000000000..ebb874720 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_u64_base: +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u64_base, svuint64_t, uint64_t, + z0 = svld1_u64 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_u64_index: +** ld1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1_u64_index, svuint64_t, uint64_t, + z0 = svld1_u64 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_u64_1: +** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u64_1, svuint64_t, uint64_t, + z0 = svld1_u64 (p0, x0 + svcntd ()), + z0 = svld1 (p0, x0 + svcntd ())) + +/* +** ld1_u64_7: +** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u64_7, svuint64_t, uint64_t, + z0 = svld1_u64 (p0, x0 + svcntd () * 7), + z0 = svld1 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_u64_8: +** incb x0, all, mul #8 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u64_8, svuint64_t, uint64_t, + z0 = svld1_u64 (p0, x0 + svcntd () * 8), + z0 = svld1 (p0, x0 + svcntd () * 8)) + +/* +** ld1_u64_m1: +** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u64_m1, svuint64_t, uint64_t, + z0 = svld1_u64 (p0, x0 - svcntd ()), + z0 = svld1 (p0, x0 - svcntd ())) + +/* +** ld1_u64_m8: +** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u64_m8, svuint64_t, uint64_t, + z0 = svld1_u64 (p0, x0 - svcntd () * 8), + z0 = svld1 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_u64_m9: +** decb x0, all, mul #9 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u64_m9, svuint64_t, uint64_t, + z0 = svld1_u64 (p0, x0 - svcntd () * 9), + z0 = svld1 (p0, x0 - svcntd () * 9)) + +/* +** ld1_vnum_u64_0: +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u64_0, svuint64_t, uint64_t, + z0 = svld1_vnum_u64 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_u64_1: +** ld1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u64_1, svuint64_t, uint64_t, + z0 = svld1_vnum_u64 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_u64_7: +** ld1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u64_7, svuint64_t, uint64_t, + z0 = svld1_vnum_u64 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_u64_8: +** incb x0, all, mul #8 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u64_8, svuint64_t, uint64_t, + z0 = svld1_vnum_u64 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_u64_m1: +** ld1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u64_m1, svuint64_t, uint64_t, + z0 = svld1_vnum_u64 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_u64_m8: +** ld1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u64_m8, svuint64_t, uint64_t, + z0 = svld1_vnum_u64 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_u64_m9: +** decb x0, all, mul #9 +** ld1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u64_m9, svuint64_t, uint64_t, + z0 = svld1_vnum_u64 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1_vnum_u64_x1, svuint64_t, uint64_t, + z0 = svld1_vnum_u64 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u8.c new file mode 100644 index 000000000..12f42bd92 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1_u8.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1_u8_base: +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u8_base, svuint8_t, uint8_t, + z0 = svld1_u8 (p0, x0), + z0 = svld1 (p0, x0)) + +/* +** ld1_u8_index: +** ld1b z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1_u8_index, svuint8_t, uint8_t, + z0 = svld1_u8 (p0, x0 + x1), + z0 = svld1 (p0, x0 + x1)) + +/* +** ld1_u8_1: +** ld1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u8_1, svuint8_t, uint8_t, + z0 = svld1_u8 (p0, x0 + svcntb ()), + z0 = svld1 (p0, x0 + svcntb ())) + +/* +** ld1_u8_7: +** ld1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u8_7, svuint8_t, uint8_t, + z0 = svld1_u8 (p0, x0 + svcntb () * 7), + z0 = svld1 (p0, x0 + svcntb () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1_u8_8: +** incb x0, all, mul #8 +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u8_8, svuint8_t, uint8_t, + z0 = svld1_u8 (p0, x0 + svcntb () * 8), + z0 = svld1 (p0, x0 + svcntb () * 8)) + +/* +** ld1_u8_m1: +** ld1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u8_m1, svuint8_t, uint8_t, + z0 = svld1_u8 (p0, x0 - svcntb ()), + z0 = svld1 (p0, x0 - svcntb ())) + +/* +** ld1_u8_m8: +** ld1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_u8_m8, svuint8_t, uint8_t, + z0 = svld1_u8 (p0, x0 - svcntb () * 8), + z0 = svld1 (p0, x0 - svcntb () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_u8_m9: +** decb x0, all, mul #9 +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_u8_m9, svuint8_t, uint8_t, + z0 = svld1_u8 (p0, x0 - svcntb () * 9), + z0 = svld1 (p0, x0 - svcntb () * 9)) + +/* +** ld1_vnum_u8_0: +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u8_0, svuint8_t, uint8_t, + z0 = svld1_vnum_u8 (p0, x0, 0), + z0 = svld1_vnum (p0, x0, 0)) + +/* +** ld1_vnum_u8_1: +** ld1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u8_1, svuint8_t, uint8_t, + z0 = svld1_vnum_u8 (p0, x0, 1), + z0 = svld1_vnum (p0, x0, 1)) + +/* +** ld1_vnum_u8_7: +** ld1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u8_7, svuint8_t, uint8_t, + z0 = svld1_vnum_u8 (p0, x0, 7), + z0 = svld1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_u8_8: +** incb x0, all, mul #8 +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u8_8, svuint8_t, uint8_t, + z0 = svld1_vnum_u8 (p0, x0, 8), + z0 = svld1_vnum (p0, x0, 8)) + +/* +** ld1_vnum_u8_m1: +** ld1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u8_m1, svuint8_t, uint8_t, + z0 = svld1_vnum_u8 (p0, x0, -1), + z0 = svld1_vnum (p0, x0, -1)) + +/* +** ld1_vnum_u8_m8: +** ld1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1_vnum_u8_m8, svuint8_t, uint8_t, + z0 = svld1_vnum_u8 (p0, x0, -8), + z0 = svld1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1_vnum_u8_m9: +** decb x0, all, mul #9 +** ld1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1_vnum_u8_m9, svuint8_t, uint8_t, + z0 = svld1_vnum_u8 (p0, x0, -9), + z0 = svld1_vnum (p0, x0, -9)) + +/* +** ld1_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1b z0\.b, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1b z0\.b, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1_vnum_u8_x1, svuint8_t, uint8_t, + z0 = svld1_vnum_u8 (p0, x0, x1), + z0 = svld1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_bf16.c new file mode 100644 index 000000000..cb1801778 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_bf16.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_bf16_base: +** ld1roh z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_base, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_bf16_index: +** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_index, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_bf16_1: +** add (x[0-9]+), x0, #?2 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_1, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_bf16_8: +** add (x[0-9]+), x0, #?16 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_8, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 + 8), + z0 = svld1ro (p0, x0 + 8)) + +/* +** ld1ro_bf16_128: +** add (x[0-9]+), x0, #?256 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_128, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 + 128), + z0 = svld1ro (p0, x0 + 128)) + +/* +** ld1ro_bf16_m1: +** sub (x[0-9]+), x0, #?2 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_m1, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_bf16_m8: +** sub (x[0-9]+), x0, #?16 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_m8, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 - 8), + z0 = svld1ro (p0, x0 - 8)) + +/* +** ld1ro_bf16_m144: +** sub (x[0-9]+), x0, #?288 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_m144, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 - 144), + z0 = svld1ro (p0, x0 - 144)) + +/* +** ld1ro_bf16_16: +** ld1roh z0\.h, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_16, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 + 16), + z0 = svld1ro (p0, x0 + 16)) + +/* +** ld1ro_bf16_112: +** ld1roh z0\.h, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_112, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 + 112), + z0 = svld1ro (p0, x0 + 112)) + +/* +** ld1ro_bf16_m16: +** ld1roh z0\.h, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_m16, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 - 16), + z0 = svld1ro (p0, x0 - 16)) + +/* +** ld1ro_bf16_m128: +** ld1roh z0\.h, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_bf16_m128, svbfloat16_t, bfloat16_t, + z0 = svld1ro_bf16 (p0, x0 - 128), + z0 = svld1ro (p0, x0 - 128)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f16.c new file mode 100644 index 000000000..86081edbd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f16.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_f16_base: +** ld1roh z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_f16_base, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_f16_index: +** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1ro_f16_index, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_f16_1: +** add (x[0-9]+), x0, #?2 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f16_1, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_f16_8: +** add (x[0-9]+), x0, #?16 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f16_8, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 + 8), + z0 = svld1ro (p0, x0 + 8)) + +/* +** ld1ro_f16_128: +** add (x[0-9]+), x0, #?256 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f16_128, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 + 128), + z0 = svld1ro (p0, x0 + 128)) + +/* +** ld1ro_f16_m1: +** sub (x[0-9]+), x0, #?2 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f16_m1, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_f16_m8: +** sub (x[0-9]+), x0, #?16 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f16_m8, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 - 8), + z0 = svld1ro (p0, x0 - 8)) + +/* +** ld1ro_f16_m144: +** sub (x[0-9]+), x0, #?288 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f16_m144, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 - 144), + z0 = svld1ro (p0, x0 - 144)) + +/* +** ld1ro_f16_16: +** ld1roh z0\.h, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_f16_16, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 + 16), + z0 = svld1ro (p0, x0 + 16)) + +/* +** ld1ro_f16_112: +** ld1roh z0\.h, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_f16_112, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 + 112), + z0 = svld1ro (p0, x0 + 112)) + +/* +** ld1ro_f16_m16: +** ld1roh z0\.h, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_f16_m16, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 - 16), + z0 = svld1ro (p0, x0 - 16)) + +/* +** ld1ro_f16_m128: +** ld1roh z0\.h, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_f16_m128, svfloat16_t, float16_t, + z0 = svld1ro_f16 (p0, x0 - 128), + z0 = svld1ro (p0, x0 - 128)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f32.c new file mode 100644 index 000000000..c8df00f8a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f32.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_f32_base: +** ld1row z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_f32_base, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_f32_index: +** ld1row z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1ro_f32_index, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_f32_1: +** add (x[0-9]+), x0, #?4 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f32_1, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_f32_4: +** add (x[0-9]+), x0, #?16 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f32_4, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 + 4), + z0 = svld1ro (p0, x0 + 4)) + +/* +** ld1ro_f32_64: +** add (x[0-9]+), x0, #?256 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f32_64, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 + 64), + z0 = svld1ro (p0, x0 + 64)) + +/* +** ld1ro_f32_m1: +** sub (x[0-9]+), x0, #?4 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f32_m1, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_f32_m4: +** sub (x[0-9]+), x0, #?16 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f32_m4, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 - 4), + z0 = svld1ro (p0, x0 - 4)) + +/* +** ld1ro_f32_m72: +** sub (x[0-9]+), x0, #?288 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f32_m72, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 - 72), + z0 = svld1ro (p0, x0 - 72)) + +/* +** ld1ro_f32_8: +** ld1row z0\.s, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_f32_8, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 + 8), + z0 = svld1ro (p0, x0 + 8)) + +/* +** ld1ro_f32_56: +** ld1row z0\.s, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_f32_56, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 + 56), + z0 = svld1ro (p0, x0 + 56)) + +/* +** ld1ro_f32_m8: +** ld1row z0\.s, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_f32_m8, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 - 8), + z0 = svld1ro (p0, x0 - 8)) + +/* +** ld1ro_f32_m64: +** ld1row z0\.s, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_f32_m64, svfloat32_t, float32_t, + z0 = svld1ro_f32 (p0, x0 - 64), + z0 = svld1ro (p0, x0 - 64)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f64.c new file mode 100644 index 000000000..2fb9d5b74 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_f64.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_f64_base: +** ld1rod z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_f64_base, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_f64_index: +** ld1rod z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1ro_f64_index, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_f64_1: +** add (x[0-9]+), x0, #?8 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f64_1, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_f64_2: +** add (x[0-9]+), x0, #?16 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f64_2, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 + 2), + z0 = svld1ro (p0, x0 + 2)) + +/* +** ld1ro_f64_32: +** add (x[0-9]+), x0, #?256 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f64_32, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 + 32), + z0 = svld1ro (p0, x0 + 32)) + +/* +** ld1ro_f64_m1: +** sub (x[0-9]+), x0, #?8 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f64_m1, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_f64_m2: +** sub (x[0-9]+), x0, #?16 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f64_m2, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 - 2), + z0 = svld1ro (p0, x0 - 2)) + +/* +** ld1ro_f64_m36: +** sub (x[0-9]+), x0, #?288 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_f64_m36, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 - 36), + z0 = svld1ro (p0, x0 - 36)) + +/* +** ld1ro_f64_4: +** ld1rod z0\.d, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_f64_4, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 + 4), + z0 = svld1ro (p0, x0 + 4)) + +/* +** ld1ro_f64_28: +** ld1rod z0\.d, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_f64_28, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 + 28), + z0 = svld1ro (p0, x0 + 28)) + +/* +** ld1ro_f64_m4: +** ld1rod z0\.d, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_f64_m4, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 - 4), + z0 = svld1ro (p0, x0 - 4)) + +/* +** ld1ro_f64_m32: +** ld1rod z0\.d, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_f64_m32, svfloat64_t, float64_t, + z0 = svld1ro_f64 (p0, x0 - 32), + z0 = svld1ro (p0, x0 - 32)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s16.c new file mode 100644 index 000000000..3cd211b16 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s16.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_s16_base: +** ld1roh z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_s16_base, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_s16_index: +** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1ro_s16_index, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_s16_1: +** add (x[0-9]+), x0, #?2 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s16_1, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_s16_8: +** add (x[0-9]+), x0, #?16 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s16_8, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 + 8), + z0 = svld1ro (p0, x0 + 8)) + +/* +** ld1ro_s16_128: +** add (x[0-9]+), x0, #?256 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s16_128, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 + 128), + z0 = svld1ro (p0, x0 + 128)) + +/* +** ld1ro_s16_m1: +** sub (x[0-9]+), x0, #?2 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s16_m1, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_s16_m8: +** sub (x[0-9]+), x0, #?16 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s16_m8, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 - 8), + z0 = svld1ro (p0, x0 - 8)) + +/* +** ld1ro_s16_m144: +** sub (x[0-9]+), x0, #?288 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s16_m144, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 - 144), + z0 = svld1ro (p0, x0 - 144)) + +/* +** ld1ro_s16_16: +** ld1roh z0\.h, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_s16_16, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 + 16), + z0 = svld1ro (p0, x0 + 16)) + +/* +** ld1ro_s16_112: +** ld1roh z0\.h, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_s16_112, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 + 112), + z0 = svld1ro (p0, x0 + 112)) + +/* +** ld1ro_s16_m16: +** ld1roh z0\.h, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_s16_m16, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 - 16), + z0 = svld1ro (p0, x0 - 16)) + +/* +** ld1ro_s16_m128: +** ld1roh z0\.h, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_s16_m128, svint16_t, int16_t, + z0 = svld1ro_s16 (p0, x0 - 128), + z0 = svld1ro (p0, x0 - 128)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s32.c new file mode 100644 index 000000000..44b16ed5f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s32.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_s32_base: +** ld1row z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_s32_base, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_s32_index: +** ld1row z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1ro_s32_index, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_s32_1: +** add (x[0-9]+), x0, #?4 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s32_1, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_s32_4: +** add (x[0-9]+), x0, #?16 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s32_4, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 + 4), + z0 = svld1ro (p0, x0 + 4)) + +/* +** ld1ro_s32_64: +** add (x[0-9]+), x0, #?256 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s32_64, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 + 64), + z0 = svld1ro (p0, x0 + 64)) + +/* +** ld1ro_s32_m1: +** sub (x[0-9]+), x0, #?4 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s32_m1, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_s32_m4: +** sub (x[0-9]+), x0, #?16 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s32_m4, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 - 4), + z0 = svld1ro (p0, x0 - 4)) + +/* +** ld1ro_s32_m72: +** sub (x[0-9]+), x0, #?288 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s32_m72, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 - 72), + z0 = svld1ro (p0, x0 - 72)) + +/* +** ld1ro_s32_8: +** ld1row z0\.s, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_s32_8, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 + 8), + z0 = svld1ro (p0, x0 + 8)) + +/* +** ld1ro_s32_56: +** ld1row z0\.s, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_s32_56, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 + 56), + z0 = svld1ro (p0, x0 + 56)) + +/* +** ld1ro_s32_m8: +** ld1row z0\.s, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_s32_m8, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 - 8), + z0 = svld1ro (p0, x0 - 8)) + +/* +** ld1ro_s32_m64: +** ld1row z0\.s, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_s32_m64, svint32_t, int32_t, + z0 = svld1ro_s32 (p0, x0 - 64), + z0 = svld1ro (p0, x0 - 64)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s64.c new file mode 100644 index 000000000..3aa9a15ee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s64.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_s64_base: +** ld1rod z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_s64_base, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_s64_index: +** ld1rod z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1ro_s64_index, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_s64_1: +** add (x[0-9]+), x0, #?8 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s64_1, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_s64_2: +** add (x[0-9]+), x0, #?16 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s64_2, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 + 2), + z0 = svld1ro (p0, x0 + 2)) + +/* +** ld1ro_s64_32: +** add (x[0-9]+), x0, #?256 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s64_32, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 + 32), + z0 = svld1ro (p0, x0 + 32)) + +/* +** ld1ro_s64_m1: +** sub (x[0-9]+), x0, #?8 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s64_m1, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_s64_m2: +** sub (x[0-9]+), x0, #?16 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s64_m2, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 - 2), + z0 = svld1ro (p0, x0 - 2)) + +/* +** ld1ro_s64_m36: +** sub (x[0-9]+), x0, #?288 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s64_m36, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 - 36), + z0 = svld1ro (p0, x0 - 36)) + +/* +** ld1ro_s64_4: +** ld1rod z0\.d, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_s64_4, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 + 4), + z0 = svld1ro (p0, x0 + 4)) + +/* +** ld1ro_s64_28: +** ld1rod z0\.d, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_s64_28, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 + 28), + z0 = svld1ro (p0, x0 + 28)) + +/* +** ld1ro_s64_m4: +** ld1rod z0\.d, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_s64_m4, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 - 4), + z0 = svld1ro (p0, x0 - 4)) + +/* +** ld1ro_s64_m32: +** ld1rod z0\.d, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_s64_m32, svint64_t, int64_t, + z0 = svld1ro_s64 (p0, x0 - 32), + z0 = svld1ro (p0, x0 - 32)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s8.c new file mode 100644 index 000000000..49aff5146 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_s8.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_s8_base: +** ld1rob z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_s8_base, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_s8_index: +** ld1rob z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1ro_s8_index, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_s8_1: +** add (x[0-9]+), x0, #?1 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s8_1, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_s8_16: +** add (x[0-9]+), x0, #?16 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s8_16, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 + 16), + z0 = svld1ro (p0, x0 + 16)) + +/* +** ld1ro_s8_256: +** add (x[0-9]+), x0, #?256 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s8_256, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 + 256), + z0 = svld1ro (p0, x0 + 256)) + +/* +** ld1ro_s8_m1: +** sub (x[0-9]+), x0, #?1 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s8_m1, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_s8_m16: +** sub (x[0-9]+), x0, #?16 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s8_m16, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 - 16), + z0 = svld1ro (p0, x0 - 16)) + +/* +** ld1ro_s8_m288: +** sub (x[0-9]+), x0, #?288 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_s8_m288, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 - 288), + z0 = svld1ro (p0, x0 - 288)) + +/* +** ld1ro_s8_32: +** ld1rob z0\.b, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_s8_32, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 + 32), + z0 = svld1ro (p0, x0 + 32)) + +/* +** ld1ro_s8_224: +** ld1rob z0\.b, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_s8_224, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 + 224), + z0 = svld1ro (p0, x0 + 224)) + +/* +** ld1ro_s8_m32: +** ld1rob z0\.b, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_s8_m32, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 - 32), + z0 = svld1ro (p0, x0 - 32)) + +/* +** ld1ro_s8_m256: +** ld1rob z0\.b, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_s8_m256, svint8_t, int8_t, + z0 = svld1ro_s8 (p0, x0 - 256), + z0 = svld1ro (p0, x0 - 256)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u16.c new file mode 100644 index 000000000..00bf9e129 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u16.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_u16_base: +** ld1roh z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_u16_base, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_u16_index: +** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1ro_u16_index, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_u16_1: +** add (x[0-9]+), x0, #?2 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u16_1, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_u16_8: +** add (x[0-9]+), x0, #?16 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u16_8, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 + 8), + z0 = svld1ro (p0, x0 + 8)) + +/* +** ld1ro_u16_128: +** add (x[0-9]+), x0, #?256 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u16_128, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 + 128), + z0 = svld1ro (p0, x0 + 128)) + +/* +** ld1ro_u16_m1: +** sub (x[0-9]+), x0, #?2 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u16_m1, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_u16_m8: +** sub (x[0-9]+), x0, #?16 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u16_m8, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 - 8), + z0 = svld1ro (p0, x0 - 8)) + +/* +** ld1ro_u16_m144: +** sub (x[0-9]+), x0, #?288 +** ld1roh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u16_m144, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 - 144), + z0 = svld1ro (p0, x0 - 144)) + +/* +** ld1ro_u16_16: +** ld1roh z0\.h, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_u16_16, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 + 16), + z0 = svld1ro (p0, x0 + 16)) + +/* +** ld1ro_u16_112: +** ld1roh z0\.h, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_u16_112, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 + 112), + z0 = svld1ro (p0, x0 + 112)) + +/* +** ld1ro_u16_m16: +** ld1roh z0\.h, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_u16_m16, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 - 16), + z0 = svld1ro (p0, x0 - 16)) + +/* +** ld1ro_u16_m128: +** ld1roh z0\.h, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_u16_m128, svuint16_t, uint16_t, + z0 = svld1ro_u16 (p0, x0 - 128), + z0 = svld1ro (p0, x0 - 128)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u32.c new file mode 100644 index 000000000..9e9b3290a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u32.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_u32_base: +** ld1row z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_u32_base, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_u32_index: +** ld1row z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1ro_u32_index, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_u32_1: +** add (x[0-9]+), x0, #?4 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u32_1, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_u32_4: +** add (x[0-9]+), x0, #?16 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u32_4, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 + 4), + z0 = svld1ro (p0, x0 + 4)) + +/* +** ld1ro_u32_64: +** add (x[0-9]+), x0, #?256 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u32_64, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 + 64), + z0 = svld1ro (p0, x0 + 64)) + +/* +** ld1ro_u32_m1: +** sub (x[0-9]+), x0, #?4 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u32_m1, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_u32_m4: +** sub (x[0-9]+), x0, #?16 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u32_m4, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 - 4), + z0 = svld1ro (p0, x0 - 4)) + +/* +** ld1ro_u32_m72: +** sub (x[0-9]+), x0, #?288 +** ld1row z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u32_m72, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 - 72), + z0 = svld1ro (p0, x0 - 72)) + +/* +** ld1ro_u32_8: +** ld1row z0\.s, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_u32_8, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 + 8), + z0 = svld1ro (p0, x0 + 8)) + +/* +** ld1ro_u32_56: +** ld1row z0\.s, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_u32_56, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 + 56), + z0 = svld1ro (p0, x0 + 56)) + +/* +** ld1ro_u32_m8: +** ld1row z0\.s, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_u32_m8, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 - 8), + z0 = svld1ro (p0, x0 - 8)) + +/* +** ld1ro_u32_m64: +** ld1row z0\.s, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_u32_m64, svuint32_t, uint32_t, + z0 = svld1ro_u32 (p0, x0 - 64), + z0 = svld1ro (p0, x0 - 64)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u64.c new file mode 100644 index 000000000..64ec62871 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u64.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_u64_base: +** ld1rod z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_u64_base, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_u64_index: +** ld1rod z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1ro_u64_index, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_u64_1: +** add (x[0-9]+), x0, #?8 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u64_1, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_u64_2: +** add (x[0-9]+), x0, #?16 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u64_2, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 + 2), + z0 = svld1ro (p0, x0 + 2)) + +/* +** ld1ro_u64_32: +** add (x[0-9]+), x0, #?256 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u64_32, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 + 32), + z0 = svld1ro (p0, x0 + 32)) + +/* +** ld1ro_u64_m1: +** sub (x[0-9]+), x0, #?8 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u64_m1, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_u64_m2: +** sub (x[0-9]+), x0, #?16 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u64_m2, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 - 2), + z0 = svld1ro (p0, x0 - 2)) + +/* +** ld1ro_u64_m36: +** sub (x[0-9]+), x0, #?288 +** ld1rod z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u64_m36, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 - 36), + z0 = svld1ro (p0, x0 - 36)) + +/* +** ld1ro_u64_4: +** ld1rod z0\.d, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_u64_4, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 + 4), + z0 = svld1ro (p0, x0 + 4)) + +/* +** ld1ro_u64_28: +** ld1rod z0\.d, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_u64_28, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 + 28), + z0 = svld1ro (p0, x0 + 28)) + +/* +** ld1ro_u64_m4: +** ld1rod z0\.d, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_u64_m4, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 - 4), + z0 = svld1ro (p0, x0 - 4)) + +/* +** ld1ro_u64_m32: +** ld1rod z0\.d, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_u64_m32, svuint64_t, uint64_t, + z0 = svld1ro_u64 (p0, x0 - 32), + z0 = svld1ro (p0, x0 - 32)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u8.c new file mode 100644 index 000000000..22701320b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ro_u8.c @@ -0,0 +1,120 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ +/* { dg-additional-options "-march=armv8.6-a+f64mm" } */ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ + +#include "test_sve_acle.h" + +/* +** ld1ro_u8_base: +** ld1rob z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ro_u8_base, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0), + z0 = svld1ro (p0, x0)) + +/* +** ld1ro_u8_index: +** ld1rob z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1ro_u8_index, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 + x1), + z0 = svld1ro (p0, x0 + x1)) + +/* +** ld1ro_u8_1: +** add (x[0-9]+), x0, #?1 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u8_1, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 + 1), + z0 = svld1ro (p0, x0 + 1)) + +/* +** ld1ro_u8_16: +** add (x[0-9]+), x0, #?16 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u8_16, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 + 16), + z0 = svld1ro (p0, x0 + 16)) + +/* +** ld1ro_u8_256: +** add (x[0-9]+), x0, #?256 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u8_256, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 + 256), + z0 = svld1ro (p0, x0 + 256)) + +/* +** ld1ro_u8_m1: +** sub (x[0-9]+), x0, #?1 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u8_m1, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 - 1), + z0 = svld1ro (p0, x0 - 1)) + +/* +** ld1ro_u8_m16: +** sub (x[0-9]+), x0, #?16 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u8_m16, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 - 16), + z0 = svld1ro (p0, x0 - 16)) + +/* +** ld1ro_u8_m288: +** sub (x[0-9]+), x0, #?288 +** ld1rob z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1ro_u8_m288, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 - 288), + z0 = svld1ro (p0, x0 - 288)) + +/* +** ld1ro_u8_32: +** ld1rob z0\.b, p0/z, \[x0, #?32\] +** ret +*/ +TEST_LOAD (ld1ro_u8_32, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 + 32), + z0 = svld1ro (p0, x0 + 32)) + +/* +** ld1ro_u8_224: +** ld1rob z0\.b, p0/z, \[x0, #?224\] +** ret +*/ +TEST_LOAD (ld1ro_u8_224, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 + 224), + z0 = svld1ro (p0, x0 + 224)) + +/* +** ld1ro_u8_m32: +** ld1rob z0\.b, p0/z, \[x0, #?-32\] +** ret +*/ +TEST_LOAD (ld1ro_u8_m32, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 - 32), + z0 = svld1ro (p0, x0 - 32)) + +/* +** ld1ro_u8_m256: +** ld1rob z0\.b, p0/z, \[x0, #?-256\] +** ret +*/ +TEST_LOAD (ld1ro_u8_m256, svuint8_t, uint8_t, + z0 = svld1ro_u8 (p0, x0 - 256), + z0 = svld1ro (p0, x0 - 256)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_bf16.c new file mode 100644 index 000000000..54c69a1db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_bf16.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_bf16_base: +** ld1rqh z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_base, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_bf16_index: +** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_index, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_bf16_1: +** add (x[0-9]+), x0, #?2 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_1, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_bf16_4: +** add (x[0-9]+), x0, #?8 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_4, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 + 4), + z0 = svld1rq (p0, x0 + 4)) + +/* +** ld1rq_bf16_7: +** add (x[0-9]+), x0, #?14 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_7, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 + 7), + z0 = svld1rq (p0, x0 + 7)) + +/* +** ld1rq_bf16_8: +** ld1rqh z0\.h, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_8, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 + 8), + z0 = svld1rq (p0, x0 + 8)) + +/* +** ld1rq_bf16_56: +** ld1rqh z0\.h, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_56, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 + 56), + z0 = svld1rq (p0, x0 + 56)) + +/* +** ld1rq_bf16_64: +** add (x[0-9]+), x0, #?128 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_64, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 + 64), + z0 = svld1rq (p0, x0 + 64)) + +/* +** ld1rq_bf16_m1: +** sub (x[0-9]+), x0, #?2 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_m1, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_bf16_m4: +** sub (x[0-9]+), x0, #?8 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_m4, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 - 4), + z0 = svld1rq (p0, x0 - 4)) + +/* +** ld1rq_bf16_m7: +** sub (x[0-9]+), x0, #?14 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_m7, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 - 7), + z0 = svld1rq (p0, x0 - 7)) + +/* +** ld1rq_bf16_m8: +** ld1rqh z0\.h, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_m8, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 - 8), + z0 = svld1rq (p0, x0 - 8)) + +/* +** ld1rq_bf16_m64: +** ld1rqh z0\.h, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_m64, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 - 64), + z0 = svld1rq (p0, x0 - 64)) + +/* +** ld1rq_bf16_m72: +** sub (x[0-9]+), x0, #?144 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_bf16_m72, svbfloat16_t, bfloat16_t, + z0 = svld1rq_bf16 (p0, x0 - 72), + z0 = svld1rq (p0, x0 - 72)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f16.c new file mode 100644 index 000000000..7536236f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f16.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_f16_base: +** ld1rqh z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_f16_base, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_f16_index: +** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_index, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_f16_1: +** add (x[0-9]+), x0, #?2 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_1, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_f16_4: +** add (x[0-9]+), x0, #?8 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_4, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 + 4), + z0 = svld1rq (p0, x0 + 4)) + +/* +** ld1rq_f16_7: +** add (x[0-9]+), x0, #?14 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_7, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 + 7), + z0 = svld1rq (p0, x0 + 7)) + +/* +** ld1rq_f16_8: +** ld1rqh z0\.h, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_f16_8, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 + 8), + z0 = svld1rq (p0, x0 + 8)) + +/* +** ld1rq_f16_56: +** ld1rqh z0\.h, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_f16_56, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 + 56), + z0 = svld1rq (p0, x0 + 56)) + +/* +** ld1rq_f16_64: +** add (x[0-9]+), x0, #?128 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_64, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 + 64), + z0 = svld1rq (p0, x0 + 64)) + +/* +** ld1rq_f16_m1: +** sub (x[0-9]+), x0, #?2 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_m1, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_f16_m4: +** sub (x[0-9]+), x0, #?8 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_m4, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 - 4), + z0 = svld1rq (p0, x0 - 4)) + +/* +** ld1rq_f16_m7: +** sub (x[0-9]+), x0, #?14 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_m7, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 - 7), + z0 = svld1rq (p0, x0 - 7)) + +/* +** ld1rq_f16_m8: +** ld1rqh z0\.h, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_f16_m8, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 - 8), + z0 = svld1rq (p0, x0 - 8)) + +/* +** ld1rq_f16_m64: +** ld1rqh z0\.h, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_f16_m64, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 - 64), + z0 = svld1rq (p0, x0 - 64)) + +/* +** ld1rq_f16_m72: +** sub (x[0-9]+), x0, #?144 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f16_m72, svfloat16_t, float16_t, + z0 = svld1rq_f16 (p0, x0 - 72), + z0 = svld1rq (p0, x0 - 72)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f32.c new file mode 100644 index 000000000..9be2b7412 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f32.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_f32_base: +** ld1rqw z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_f32_base, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_f32_index: +** ld1rqw z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1rq_f32_index, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_f32_1: +** add (x[0-9]+), x0, #?4 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f32_1, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_f32_2: +** add (x[0-9]+), x0, #?8 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f32_2, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 + 2), + z0 = svld1rq (p0, x0 + 2)) + +/* +** ld1rq_f32_3: +** add (x[0-9]+), x0, #?12 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f32_3, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 + 3), + z0 = svld1rq (p0, x0 + 3)) + +/* +** ld1rq_f32_4: +** ld1rqw z0\.s, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_f32_4, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 + 4), + z0 = svld1rq (p0, x0 + 4)) + +/* +** ld1rq_f32_28: +** ld1rqw z0\.s, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_f32_28, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 + 28), + z0 = svld1rq (p0, x0 + 28)) + +/* +** ld1rq_f32_32: +** add (x[0-9]+), x0, #?128 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f32_32, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 + 32), + z0 = svld1rq (p0, x0 + 32)) + +/* +** ld1rq_f32_m1: +** sub (x[0-9]+), x0, #?4 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f32_m1, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_f32_m2: +** sub (x[0-9]+), x0, #?8 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f32_m2, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 - 2), + z0 = svld1rq (p0, x0 - 2)) + +/* +** ld1rq_f32_m3: +** sub (x[0-9]+), x0, #?12 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f32_m3, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 - 3), + z0 = svld1rq (p0, x0 - 3)) + +/* +** ld1rq_f32_m4: +** ld1rqw z0\.s, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_f32_m4, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 - 4), + z0 = svld1rq (p0, x0 - 4)) + +/* +** ld1rq_f32_m32: +** ld1rqw z0\.s, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_f32_m32, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 - 32), + z0 = svld1rq (p0, x0 - 32)) + +/* +** ld1rq_f32_m36: +** sub (x[0-9]+), x0, #?144 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f32_m36, svfloat32_t, float32_t, + z0 = svld1rq_f32 (p0, x0 - 36), + z0 = svld1rq (p0, x0 - 36)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f64.c new file mode 100644 index 000000000..32105af17 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_f64.c @@ -0,0 +1,97 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_f64_base: +** ld1rqd z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_f64_base, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_f64_index: +** ld1rqd z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1rq_f64_index, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_f64_1: +** add (x[0-9]+), x0, #?8 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f64_1, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_f64_2: +** ld1rqd z0\.d, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_f64_2, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 + 2), + z0 = svld1rq (p0, x0 + 2)) + +/* +** ld1rq_f64_14: +** ld1rqd z0\.d, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_f64_14, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 + 14), + z0 = svld1rq (p0, x0 + 14)) + +/* +** ld1rq_f64_16: +** add (x[0-9]+), x0, #?128 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f64_16, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 + 16), + z0 = svld1rq (p0, x0 + 16)) + +/* +** ld1rq_f64_m1: +** sub (x[0-9]+), x0, #?8 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f64_m1, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_f64_m2: +** ld1rqd z0\.d, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_f64_m2, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 - 2), + z0 = svld1rq (p0, x0 - 2)) + +/* +** ld1rq_f64_m16: +** ld1rqd z0\.d, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_f64_m16, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 - 16), + z0 = svld1rq (p0, x0 - 16)) + +/* +** ld1rq_f64_m18: +** sub (x[0-9]+), x0, #?144 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_f64_m18, svfloat64_t, float64_t, + z0 = svld1rq_f64 (p0, x0 - 18), + z0 = svld1rq (p0, x0 - 18)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s16.c new file mode 100644 index 000000000..8903b96a3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s16.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_s16_base: +** ld1rqh z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_s16_base, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_s16_index: +** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_index, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_s16_1: +** add (x[0-9]+), x0, #?2 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_1, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_s16_4: +** add (x[0-9]+), x0, #?8 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_4, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 + 4), + z0 = svld1rq (p0, x0 + 4)) + +/* +** ld1rq_s16_7: +** add (x[0-9]+), x0, #?14 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_7, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 + 7), + z0 = svld1rq (p0, x0 + 7)) + +/* +** ld1rq_s16_8: +** ld1rqh z0\.h, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_s16_8, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 + 8), + z0 = svld1rq (p0, x0 + 8)) + +/* +** ld1rq_s16_56: +** ld1rqh z0\.h, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_s16_56, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 + 56), + z0 = svld1rq (p0, x0 + 56)) + +/* +** ld1rq_s16_64: +** add (x[0-9]+), x0, #?128 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_64, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 + 64), + z0 = svld1rq (p0, x0 + 64)) + +/* +** ld1rq_s16_m1: +** sub (x[0-9]+), x0, #?2 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_m1, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_s16_m4: +** sub (x[0-9]+), x0, #?8 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_m4, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 - 4), + z0 = svld1rq (p0, x0 - 4)) + +/* +** ld1rq_s16_m7: +** sub (x[0-9]+), x0, #?14 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_m7, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 - 7), + z0 = svld1rq (p0, x0 - 7)) + +/* +** ld1rq_s16_m8: +** ld1rqh z0\.h, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_s16_m8, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 - 8), + z0 = svld1rq (p0, x0 - 8)) + +/* +** ld1rq_s16_m64: +** ld1rqh z0\.h, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_s16_m64, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 - 64), + z0 = svld1rq (p0, x0 - 64)) + +/* +** ld1rq_s16_m72: +** sub (x[0-9]+), x0, #?144 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s16_m72, svint16_t, int16_t, + z0 = svld1rq_s16 (p0, x0 - 72), + z0 = svld1rq (p0, x0 - 72)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s32.c new file mode 100644 index 000000000..a428b4350 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s32.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_s32_base: +** ld1rqw z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_s32_base, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_s32_index: +** ld1rqw z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1rq_s32_index, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_s32_1: +** add (x[0-9]+), x0, #?4 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s32_1, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_s32_2: +** add (x[0-9]+), x0, #?8 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s32_2, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 + 2), + z0 = svld1rq (p0, x0 + 2)) + +/* +** ld1rq_s32_3: +** add (x[0-9]+), x0, #?12 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s32_3, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 + 3), + z0 = svld1rq (p0, x0 + 3)) + +/* +** ld1rq_s32_4: +** ld1rqw z0\.s, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_s32_4, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 + 4), + z0 = svld1rq (p0, x0 + 4)) + +/* +** ld1rq_s32_28: +** ld1rqw z0\.s, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_s32_28, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 + 28), + z0 = svld1rq (p0, x0 + 28)) + +/* +** ld1rq_s32_32: +** add (x[0-9]+), x0, #?128 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s32_32, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 + 32), + z0 = svld1rq (p0, x0 + 32)) + +/* +** ld1rq_s32_m1: +** sub (x[0-9]+), x0, #?4 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s32_m1, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_s32_m2: +** sub (x[0-9]+), x0, #?8 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s32_m2, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 - 2), + z0 = svld1rq (p0, x0 - 2)) + +/* +** ld1rq_s32_m3: +** sub (x[0-9]+), x0, #?12 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s32_m3, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 - 3), + z0 = svld1rq (p0, x0 - 3)) + +/* +** ld1rq_s32_m4: +** ld1rqw z0\.s, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_s32_m4, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 - 4), + z0 = svld1rq (p0, x0 - 4)) + +/* +** ld1rq_s32_m32: +** ld1rqw z0\.s, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_s32_m32, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 - 32), + z0 = svld1rq (p0, x0 - 32)) + +/* +** ld1rq_s32_m36: +** sub (x[0-9]+), x0, #?144 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s32_m36, svint32_t, int32_t, + z0 = svld1rq_s32 (p0, x0 - 36), + z0 = svld1rq (p0, x0 - 36)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s64.c new file mode 100644 index 000000000..efc0e740f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s64.c @@ -0,0 +1,97 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_s64_base: +** ld1rqd z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_s64_base, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_s64_index: +** ld1rqd z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1rq_s64_index, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_s64_1: +** add (x[0-9]+), x0, #?8 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s64_1, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_s64_2: +** ld1rqd z0\.d, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_s64_2, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 + 2), + z0 = svld1rq (p0, x0 + 2)) + +/* +** ld1rq_s64_14: +** ld1rqd z0\.d, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_s64_14, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 + 14), + z0 = svld1rq (p0, x0 + 14)) + +/* +** ld1rq_s64_16: +** add (x[0-9]+), x0, #?128 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s64_16, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 + 16), + z0 = svld1rq (p0, x0 + 16)) + +/* +** ld1rq_s64_m1: +** sub (x[0-9]+), x0, #?8 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s64_m1, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_s64_m2: +** ld1rqd z0\.d, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_s64_m2, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 - 2), + z0 = svld1rq (p0, x0 - 2)) + +/* +** ld1rq_s64_m16: +** ld1rqd z0\.d, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_s64_m16, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 - 16), + z0 = svld1rq (p0, x0 - 16)) + +/* +** ld1rq_s64_m18: +** sub (x[0-9]+), x0, #?144 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s64_m18, svint64_t, int64_t, + z0 = svld1rq_s64 (p0, x0 - 18), + z0 = svld1rq (p0, x0 - 18)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s8.c new file mode 100644 index 000000000..e183e472f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_s8.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_s8_base: +** ld1rqb z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_s8_base, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_s8_index: +** ld1rqb z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_index, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_s8_1: +** add (x[0-9]+), x0, #?1 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_1, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_s8_8: +** add (x[0-9]+), x0, #?8 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_8, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 + 8), + z0 = svld1rq (p0, x0 + 8)) + +/* +** ld1rq_s8_15: +** add (x[0-9]+), x0, #?15 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_15, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 + 15), + z0 = svld1rq (p0, x0 + 15)) + +/* +** ld1rq_s8_16: +** ld1rqb z0\.b, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_s8_16, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 + 16), + z0 = svld1rq (p0, x0 + 16)) + +/* +** ld1rq_s8_112: +** ld1rqb z0\.b, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_s8_112, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 + 112), + z0 = svld1rq (p0, x0 + 112)) + +/* +** ld1rq_s8_128: +** add (x[0-9]+), x0, #?128 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_128, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 + 128), + z0 = svld1rq (p0, x0 + 128)) + +/* +** ld1rq_s8_m1: +** sub (x[0-9]+), x0, #?1 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_m1, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_s8_m8: +** sub (x[0-9]+), x0, #?8 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_m8, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 - 8), + z0 = svld1rq (p0, x0 - 8)) + +/* +** ld1rq_s8_m15: +** sub (x[0-9]+), x0, #?15 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_m15, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 - 15), + z0 = svld1rq (p0, x0 - 15)) + +/* +** ld1rq_s8_m16: +** ld1rqb z0\.b, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_s8_m16, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 - 16), + z0 = svld1rq (p0, x0 - 16)) + +/* +** ld1rq_s8_m128: +** ld1rqb z0\.b, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_s8_m128, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 - 128), + z0 = svld1rq (p0, x0 - 128)) + +/* +** ld1rq_s8_m144: +** sub (x[0-9]+), x0, #?144 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_s8_m144, svint8_t, int8_t, + z0 = svld1rq_s8 (p0, x0 - 144), + z0 = svld1rq (p0, x0 - 144)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u16.c new file mode 100644 index 000000000..c24ab680a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u16.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_u16_base: +** ld1rqh z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_u16_base, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_u16_index: +** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_index, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_u16_1: +** add (x[0-9]+), x0, #?2 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_1, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_u16_4: +** add (x[0-9]+), x0, #?8 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_4, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 + 4), + z0 = svld1rq (p0, x0 + 4)) + +/* +** ld1rq_u16_7: +** add (x[0-9]+), x0, #?14 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_7, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 + 7), + z0 = svld1rq (p0, x0 + 7)) + +/* +** ld1rq_u16_8: +** ld1rqh z0\.h, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_u16_8, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 + 8), + z0 = svld1rq (p0, x0 + 8)) + +/* +** ld1rq_u16_56: +** ld1rqh z0\.h, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_u16_56, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 + 56), + z0 = svld1rq (p0, x0 + 56)) + +/* +** ld1rq_u16_64: +** add (x[0-9]+), x0, #?128 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_64, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 + 64), + z0 = svld1rq (p0, x0 + 64)) + +/* +** ld1rq_u16_m1: +** sub (x[0-9]+), x0, #?2 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_m1, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_u16_m4: +** sub (x[0-9]+), x0, #?8 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_m4, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 - 4), + z0 = svld1rq (p0, x0 - 4)) + +/* +** ld1rq_u16_m7: +** sub (x[0-9]+), x0, #?14 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_m7, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 - 7), + z0 = svld1rq (p0, x0 - 7)) + +/* +** ld1rq_u16_m8: +** ld1rqh z0\.h, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_u16_m8, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 - 8), + z0 = svld1rq (p0, x0 - 8)) + +/* +** ld1rq_u16_m64: +** ld1rqh z0\.h, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_u16_m64, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 - 64), + z0 = svld1rq (p0, x0 - 64)) + +/* +** ld1rq_u16_m72: +** sub (x[0-9]+), x0, #?144 +** ld1rqh z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u16_m72, svuint16_t, uint16_t, + z0 = svld1rq_u16 (p0, x0 - 72), + z0 = svld1rq (p0, x0 - 72)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u32.c new file mode 100644 index 000000000..722e34db3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u32.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_u32_base: +** ld1rqw z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_u32_base, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_u32_index: +** ld1rqw z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1rq_u32_index, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_u32_1: +** add (x[0-9]+), x0, #?4 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u32_1, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_u32_2: +** add (x[0-9]+), x0, #?8 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u32_2, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 + 2), + z0 = svld1rq (p0, x0 + 2)) + +/* +** ld1rq_u32_3: +** add (x[0-9]+), x0, #?12 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u32_3, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 + 3), + z0 = svld1rq (p0, x0 + 3)) + +/* +** ld1rq_u32_4: +** ld1rqw z0\.s, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_u32_4, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 + 4), + z0 = svld1rq (p0, x0 + 4)) + +/* +** ld1rq_u32_28: +** ld1rqw z0\.s, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_u32_28, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 + 28), + z0 = svld1rq (p0, x0 + 28)) + +/* +** ld1rq_u32_32: +** add (x[0-9]+), x0, #?128 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u32_32, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 + 32), + z0 = svld1rq (p0, x0 + 32)) + +/* +** ld1rq_u32_m1: +** sub (x[0-9]+), x0, #?4 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u32_m1, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_u32_m2: +** sub (x[0-9]+), x0, #?8 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u32_m2, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 - 2), + z0 = svld1rq (p0, x0 - 2)) + +/* +** ld1rq_u32_m3: +** sub (x[0-9]+), x0, #?12 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u32_m3, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 - 3), + z0 = svld1rq (p0, x0 - 3)) + +/* +** ld1rq_u32_m4: +** ld1rqw z0\.s, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_u32_m4, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 - 4), + z0 = svld1rq (p0, x0 - 4)) + +/* +** ld1rq_u32_m32: +** ld1rqw z0\.s, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_u32_m32, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 - 32), + z0 = svld1rq (p0, x0 - 32)) + +/* +** ld1rq_u32_m36: +** sub (x[0-9]+), x0, #?144 +** ld1rqw z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u32_m36, svuint32_t, uint32_t, + z0 = svld1rq_u32 (p0, x0 - 36), + z0 = svld1rq (p0, x0 - 36)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u64.c new file mode 100644 index 000000000..a116b7fd9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u64.c @@ -0,0 +1,97 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_u64_base: +** ld1rqd z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_u64_base, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_u64_index: +** ld1rqd z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld1rq_u64_index, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_u64_1: +** add (x[0-9]+), x0, #?8 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u64_1, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_u64_2: +** ld1rqd z0\.d, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_u64_2, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 + 2), + z0 = svld1rq (p0, x0 + 2)) + +/* +** ld1rq_u64_14: +** ld1rqd z0\.d, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_u64_14, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 + 14), + z0 = svld1rq (p0, x0 + 14)) + +/* +** ld1rq_u64_16: +** add (x[0-9]+), x0, #?128 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u64_16, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 + 16), + z0 = svld1rq (p0, x0 + 16)) + +/* +** ld1rq_u64_m1: +** sub (x[0-9]+), x0, #?8 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u64_m1, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_u64_m2: +** ld1rqd z0\.d, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_u64_m2, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 - 2), + z0 = svld1rq (p0, x0 - 2)) + +/* +** ld1rq_u64_m16: +** ld1rqd z0\.d, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_u64_m16, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 - 16), + z0 = svld1rq (p0, x0 - 16)) + +/* +** ld1rq_u64_m18: +** sub (x[0-9]+), x0, #?144 +** ld1rqd z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u64_m18, svuint64_t, uint64_t, + z0 = svld1rq_u64 (p0, x0 - 18), + z0 = svld1rq (p0, x0 - 18)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u8.c new file mode 100644 index 000000000..74b72530e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1rq_u8.c @@ -0,0 +1,137 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1rq_u8_base: +** ld1rqb z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1rq_u8_base, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0), + z0 = svld1rq (p0, x0)) + +/* +** ld1rq_u8_index: +** ld1rqb z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_index, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 + x1), + z0 = svld1rq (p0, x0 + x1)) + +/* +** ld1rq_u8_1: +** add (x[0-9]+), x0, #?1 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_1, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 + 1), + z0 = svld1rq (p0, x0 + 1)) + +/* +** ld1rq_u8_8: +** add (x[0-9]+), x0, #?8 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_8, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 + 8), + z0 = svld1rq (p0, x0 + 8)) + +/* +** ld1rq_u8_15: +** add (x[0-9]+), x0, #?15 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_15, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 + 15), + z0 = svld1rq (p0, x0 + 15)) + +/* +** ld1rq_u8_16: +** ld1rqb z0\.b, p0/z, \[x0, #?16\] +** ret +*/ +TEST_LOAD (ld1rq_u8_16, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 + 16), + z0 = svld1rq (p0, x0 + 16)) + +/* +** ld1rq_u8_112: +** ld1rqb z0\.b, p0/z, \[x0, #?112\] +** ret +*/ +TEST_LOAD (ld1rq_u8_112, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 + 112), + z0 = svld1rq (p0, x0 + 112)) + +/* +** ld1rq_u8_128: +** add (x[0-9]+), x0, #?128 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_128, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 + 128), + z0 = svld1rq (p0, x0 + 128)) + +/* +** ld1rq_u8_m1: +** sub (x[0-9]+), x0, #?1 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_m1, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 - 1), + z0 = svld1rq (p0, x0 - 1)) + +/* +** ld1rq_u8_m8: +** sub (x[0-9]+), x0, #?8 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_m8, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 - 8), + z0 = svld1rq (p0, x0 - 8)) + +/* +** ld1rq_u8_m15: +** sub (x[0-9]+), x0, #?15 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_m15, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 - 15), + z0 = svld1rq (p0, x0 - 15)) + +/* +** ld1rq_u8_m16: +** ld1rqb z0\.b, p0/z, \[x0, #?-16\] +** ret +*/ +TEST_LOAD (ld1rq_u8_m16, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 - 16), + z0 = svld1rq (p0, x0 - 16)) + +/* +** ld1rq_u8_m128: +** ld1rqb z0\.b, p0/z, \[x0, #?-128\] +** ret +*/ +TEST_LOAD (ld1rq_u8_m128, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 - 128), + z0 = svld1rq (p0, x0 - 128)) + +/* +** ld1rq_u8_m144: +** sub (x[0-9]+), x0, #?144 +** ld1rqb z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld1rq_u8_m144, svuint8_t, uint8_t, + z0 = svld1rq_u8 (p0, x0 - 144), + z0 = svld1rq (p0, x0 - 144)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s32.c new file mode 100644 index 000000000..16a5316a9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s32.c @@ -0,0 +1,131 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_gather_s32_tied1: +** ld1sb z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_s32 (p0, z0), + z0_res = svld1sb_gather_s32 (p0, z0)) + +/* +** ld1sb_gather_s32_untied: +** ld1sb z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_s32 (p0, z1), + z0_res = svld1sb_gather_s32 (p0, z1)) + +/* +** ld1sb_gather_x0_s32_offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svld1sb_gather_offset_s32 (p0, z0, x0)) + +/* +** ld1sb_gather_m1_s32_offset: +** mov (x[0-9]+), #?-1 +** ld1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_m1_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, -1), + z0_res = svld1sb_gather_offset_s32 (p0, z0, -1)) + +/* +** ld1sb_gather_0_s32_offset: +** ld1sb z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svld1sb_gather_offset_s32 (p0, z0, 0)) + +/* +** ld1sb_gather_5_s32_offset: +** ld1sb z0\.s, p0/z, \[z0\.s, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svld1sb_gather_offset_s32 (p0, z0, 5)) + +/* +** ld1sb_gather_31_s32_offset: +** ld1sb z0\.s, p0/z, \[z0\.s, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_31_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, 31), + z0_res = svld1sb_gather_offset_s32 (p0, z0, 31)) + +/* +** ld1sb_gather_32_s32_offset: +** mov (x[0-9]+), #?32 +** ld1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_32_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_s32 (p0, z0, 32), + z0_res = svld1sb_gather_offset_s32 (p0, z0, 32)) + +/* +** ld1sb_gather_x0_s32_s32offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_s32_s32offset, svint32_t, int8_t, svint32_t, + z0_res = svld1sb_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1sb_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1sb_gather_tied1_s32_s32offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_s32_s32offset, svint32_t, int8_t, svint32_t, + z0_res = svld1sb_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1sb_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1sb_gather_untied_s32_s32offset: +** ld1sb z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_s32_s32offset, svint32_t, int8_t, svint32_t, + z0_res = svld1sb_gather_s32offset_s32 (p0, x0, z1), + z0_res = svld1sb_gather_offset_s32 (p0, x0, z1)) + +/* +** ld1sb_gather_x0_s32_u32offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_s32_u32offset, svint32_t, int8_t, svuint32_t, + z0_res = svld1sb_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1sb_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1sb_gather_tied1_s32_u32offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_s32_u32offset, svint32_t, int8_t, svuint32_t, + z0_res = 
svld1sb_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1sb_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1sb_gather_untied_s32_u32offset: +** ld1sb z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_s32_u32offset, svint32_t, int8_t, svuint32_t, + z0_res = svld1sb_gather_u32offset_s32 (p0, x0, z1), + z0_res = svld1sb_gather_offset_s32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s64.c new file mode 100644 index 000000000..3f953247e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_s64.c @@ -0,0 +1,149 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_gather_s64_tied1: +** ld1sb z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_s64 (p0, z0), + z0_res = svld1sb_gather_s64 (p0, z0)) + +/* +** ld1sb_gather_s64_untied: +** ld1sb z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_s64 (p0, z1), + z0_res = svld1sb_gather_s64 (p0, z1)) + +/* +** ld1sb_gather_x0_s64_offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svld1sb_gather_offset_s64 (p0, z0, x0)) + +/* +** ld1sb_gather_m1_s64_offset: +** mov (x[0-9]+), #?-1 +** ld1sb z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_m1_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, -1), + z0_res = svld1sb_gather_offset_s64 (p0, z0, -1)) + +/* +** ld1sb_gather_0_s64_offset: +** ld1sb z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svld1sb_gather_offset_s64 (p0, z0, 0)) + +/* +** ld1sb_gather_5_s64_offset: +** ld1sb z0\.d, p0/z, \[z0\.d, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svld1sb_gather_offset_s64 (p0, z0, 5)) + +/* +** ld1sb_gather_31_s64_offset: +** ld1sb z0\.d, p0/z, \[z0\.d, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_31_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, 31), + z0_res = svld1sb_gather_offset_s64 (p0, z0, 31)) + +/* +** ld1sb_gather_32_s64_offset: +** mov (x[0-9]+), #?32 +** ld1sb z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_32_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_s64 (p0, z0, 32), + z0_res = svld1sb_gather_offset_s64 (p0, z0, 32)) + +/* +** ld1sb_gather_x0_s64_s64offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_s64_s64offset, svint64_t, int8_t, svint64_t, + z0_res = svld1sb_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1sb_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sb_gather_tied1_s64_s64offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_s64_s64offset, svint64_t, int8_t, svint64_t, + z0_res = svld1sb_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1sb_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sb_gather_untied_s64_s64offset: +** ld1sb 
z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_s64_s64offset, svint64_t, int8_t, svint64_t, + z0_res = svld1sb_gather_s64offset_s64 (p0, x0, z1), + z0_res = svld1sb_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1sb_gather_ext_s64_s64offset: +** ld1sb z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_ext_s64_s64offset, svint64_t, int8_t, svint64_t, + z0_res = svld1sb_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sb_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sb_gather_x0_s64_u64offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_s64_u64offset, svint64_t, int8_t, svuint64_t, + z0_res = svld1sb_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1sb_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sb_gather_tied1_s64_u64offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_s64_u64offset, svint64_t, int8_t, svuint64_t, + z0_res = svld1sb_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1sb_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sb_gather_untied_s64_u64offset: +** ld1sb z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_s64_u64offset, svint64_t, int8_t, svuint64_t, + z0_res = svld1sb_gather_u64offset_s64 (p0, x0, z1), + z0_res = svld1sb_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1sb_gather_ext_s64_u64offset: +** ld1sb z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_ext_s64_u64offset, svint64_t, int8_t, svuint64_t, + z0_res = svld1sb_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sb_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u32.c new file mode 100644 index 000000000..424de65a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u32.c @@ -0,0 +1,131 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_gather_u32_tied1: +** ld1sb z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_u32 (p0, z0), + z0_res = svld1sb_gather_u32 (p0, z0)) + +/* +** ld1sb_gather_u32_untied: +** ld1sb z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_u32 (p0, z1), + z0_res = svld1sb_gather_u32 (p0, z1)) + +/* +** ld1sb_gather_x0_u32_offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svld1sb_gather_offset_u32 (p0, z0, x0)) + +/* +** ld1sb_gather_m1_u32_offset: +** mov (x[0-9]+), #?-1 +** ld1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_m1_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, -1), + z0_res = svld1sb_gather_offset_u32 (p0, z0, -1)) + +/* +** ld1sb_gather_0_u32_offset: +** ld1sb z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svld1sb_gather_offset_u32 (p0, z0, 0)) + +/* +** ld1sb_gather_5_u32_offset: +** ld1sb z0\.s, p0/z, \[z0\.s, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svld1sb_gather_offset_u32 (p0, z0, 5)) + +/* +** ld1sb_gather_31_u32_offset: +** ld1sb z0\.s, p0/z, \[z0\.s, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_31_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, 31), + z0_res = svld1sb_gather_offset_u32 (p0, z0, 31)) + +/* +** ld1sb_gather_32_u32_offset: +** mov (x[0-9]+), #?32 +** ld1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_32_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sb_gather_u32base_offset_u32 (p0, z0, 32), + z0_res = svld1sb_gather_offset_u32 (p0, z0, 32)) + +/* +** ld1sb_gather_x0_u32_s32offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_u32_s32offset, svuint32_t, int8_t, svint32_t, + z0_res = svld1sb_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1sb_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1sb_gather_tied1_u32_s32offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_u32_s32offset, svuint32_t, int8_t, svint32_t, + z0_res = svld1sb_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1sb_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1sb_gather_untied_u32_s32offset: +** ld1sb z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_u32_s32offset, svuint32_t, int8_t, svint32_t, + z0_res = svld1sb_gather_s32offset_u32 (p0, x0, z1), + z0_res = svld1sb_gather_offset_u32 (p0, x0, z1)) + +/* +** ld1sb_gather_x0_u32_u32offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_u32_u32offset, svuint32_t, int8_t, svuint32_t, + z0_res = svld1sb_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1sb_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1sb_gather_tied1_u32_u32offset: +** ld1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_u32_u32offset, svuint32_t, int8_t, svuint32_t, + z0_res 
= svld1sb_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1sb_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1sb_gather_untied_u32_u32offset: +** ld1sb z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_u32_u32offset, svuint32_t, int8_t, svuint32_t, + z0_res = svld1sb_gather_u32offset_u32 (p0, x0, z1), + z0_res = svld1sb_gather_offset_u32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u64.c new file mode 100644 index 000000000..aa375bea2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_gather_u64.c @@ -0,0 +1,149 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_gather_u64_tied1: +** ld1sb z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_u64 (p0, z0), + z0_res = svld1sb_gather_u64 (p0, z0)) + +/* +** ld1sb_gather_u64_untied: +** ld1sb z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_u64 (p0, z1), + z0_res = svld1sb_gather_u64 (p0, z1)) + +/* +** ld1sb_gather_x0_u64_offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svld1sb_gather_offset_u64 (p0, z0, x0)) + +/* +** ld1sb_gather_m1_u64_offset: +** mov (x[0-9]+), #?-1 +** ld1sb z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_m1_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, -1), + z0_res = svld1sb_gather_offset_u64 (p0, z0, -1)) + +/* +** ld1sb_gather_0_u64_offset: +** ld1sb z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svld1sb_gather_offset_u64 (p0, z0, 0)) + +/* +** ld1sb_gather_5_u64_offset: +** ld1sb z0\.d, p0/z, \[z0\.d, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svld1sb_gather_offset_u64 (p0, z0, 5)) + +/* +** ld1sb_gather_31_u64_offset: +** ld1sb z0\.d, p0/z, \[z0\.d, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_31_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, 31), + z0_res = svld1sb_gather_offset_u64 (p0, z0, 31)) + +/* +** ld1sb_gather_32_u64_offset: +** mov (x[0-9]+), #?32 +** ld1sb z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sb_gather_32_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sb_gather_u64base_offset_u64 (p0, z0, 32), + z0_res = svld1sb_gather_offset_u64 (p0, z0, 32)) + +/* +** ld1sb_gather_x0_u64_s64offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_u64_s64offset, svuint64_t, int8_t, svint64_t, + z0_res = svld1sb_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1sb_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sb_gather_tied1_u64_s64offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_u64_s64offset, svuint64_t, int8_t, svint64_t, + z0_res = svld1sb_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1sb_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sb_gather_untied_u64_s64offset: 
+** ld1sb z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_u64_s64offset, svuint64_t, int8_t, svint64_t, + z0_res = svld1sb_gather_s64offset_u64 (p0, x0, z1), + z0_res = svld1sb_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1sb_gather_ext_u64_s64offset: +** ld1sb z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_ext_u64_s64offset, svuint64_t, int8_t, svint64_t, + z0_res = svld1sb_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sb_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sb_gather_x0_u64_u64offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_x0_u64_u64offset, svuint64_t, int8_t, svuint64_t, + z0_res = svld1sb_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1sb_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sb_gather_tied1_u64_u64offset: +** ld1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_tied1_u64_u64offset, svuint64_t, int8_t, svuint64_t, + z0_res = svld1sb_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1sb_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sb_gather_untied_u64_u64offset: +** ld1sb z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_untied_u64_u64offset, svuint64_t, int8_t, svuint64_t, + z0_res = svld1sb_gather_u64offset_u64 (p0, x0, z1), + z0_res = svld1sb_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1sb_gather_ext_u64_u64offset: +** ld1sb z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sb_gather_ext_u64_u64offset, svuint64_t, int8_t, svuint64_t, + z0_res = svld1sb_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sb_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s16.c new file mode 100644 index 000000000..70a793c14 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s16.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_s16_base: +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s16_base, svint16_t, int8_t, + z0 = svld1sb_s16 (p0, x0), + z0 = svld1sb_s16 (p0, x0)) + +/* +** ld1sb_s16_index: +** ld1sb z0\.h, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1sb_s16_index, svint16_t, int8_t, + z0 = svld1sb_s16 (p0, x0 + x1), + z0 = svld1sb_s16 (p0, x0 + x1)) + +/* +** ld1sb_s16_1: +** ld1sb z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s16_1, svint16_t, int8_t, + z0 = svld1sb_s16 (p0, x0 + svcnth ()), + z0 = svld1sb_s16 (p0, x0 + svcnth ())) + +/* +** ld1sb_s16_7: +** ld1sb z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s16_7, svint16_t, int8_t, + z0 = svld1sb_s16 (p0, x0 + svcnth () * 7), + z0 = svld1sb_s16 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sb_s16_8: +** incb x0, all, mul #4 +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s16_8, svint16_t, int8_t, + z0 = svld1sb_s16 (p0, x0 + svcnth () * 8), + z0 = svld1sb_s16 (p0, x0 + svcnth () * 8)) + +/* +** ld1sb_s16_m1: +** ld1sb z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s16_m1, svint16_t, int8_t, + z0 = svld1sb_s16 (p0, x0 - svcnth ()), + z0 = svld1sb_s16 (p0, x0 - svcnth ())) + +/* +** ld1sb_s16_m8: +** ld1sb z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s16_m8, svint16_t, int8_t, + z0 = svld1sb_s16 (p0, x0 - svcnth () * 8), + z0 = svld1sb_s16 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_s16_m9: +** dech x0, all, mul #9 +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s16_m9, svint16_t, int8_t, + z0 = svld1sb_s16 (p0, x0 - svcnth () * 9), + z0 = svld1sb_s16 (p0, x0 - svcnth () * 9)) + +/* +** ld1sb_vnum_s16_0: +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s16_0, svint16_t, int8_t, + z0 = svld1sb_vnum_s16 (p0, x0, 0), + z0 = svld1sb_vnum_s16 (p0, x0, 0)) + +/* +** ld1sb_vnum_s16_1: +** ld1sb z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s16_1, svint16_t, int8_t, + z0 = svld1sb_vnum_s16 (p0, x0, 1), + z0 = svld1sb_vnum_s16 (p0, x0, 1)) + +/* +** ld1sb_vnum_s16_7: +** ld1sb z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s16_7, svint16_t, int8_t, + z0 = svld1sb_vnum_s16 (p0, x0, 7), + z0 = svld1sb_vnum_s16 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_s16_8: +** incb x0, all, mul #4 +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s16_8, svint16_t, int8_t, + z0 = svld1sb_vnum_s16 (p0, x0, 8), + z0 = svld1sb_vnum_s16 (p0, x0, 8)) + +/* +** ld1sb_vnum_s16_m1: +** ld1sb z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s16_m1, svint16_t, int8_t, + z0 = svld1sb_vnum_s16 (p0, x0, -1), + z0 = svld1sb_vnum_s16 (p0, x0, -1)) + +/* +** ld1sb_vnum_s16_m8: +** ld1sb z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s16_m8, svint16_t, int8_t, + z0 = svld1sb_vnum_s16 (p0, x0, -8), + z0 = svld1sb_vnum_s16 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_s16_m9: +** dech x0, all, mul #9 +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s16_m9, svint16_t, int8_t, + z0 = svld1sb_vnum_s16 (p0, x0, -9), + z0 = svld1sb_vnum_s16 (p0, x0, -9)) + +/* +** ld1sb_vnum_s16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1sb z0\.h, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1sb z0\.h, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1sb_vnum_s16_x1, svint16_t, int8_t, + z0 = svld1sb_vnum_s16 (p0, x0, x1), + z0 = svld1sb_vnum_s16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s32.c new file mode 100644 index 000000000..74b3a321b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s32.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_s32_base: +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s32_base, svint32_t, int8_t, + z0 = svld1sb_s32 (p0, x0), + z0 = svld1sb_s32 (p0, x0)) + +/* +** ld1sb_s32_index: +** ld1sb z0\.s, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1sb_s32_index, svint32_t, int8_t, + z0 = svld1sb_s32 (p0, x0 + x1), + z0 = svld1sb_s32 (p0, x0 + x1)) + +/* +** ld1sb_s32_1: +** ld1sb z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s32_1, svint32_t, int8_t, + z0 = svld1sb_s32 (p0, x0 + svcntw ()), + z0 = svld1sb_s32 (p0, x0 + svcntw ())) + +/* +** ld1sb_s32_7: +** ld1sb z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s32_7, svint32_t, int8_t, + z0 = svld1sb_s32 (p0, x0 + svcntw () * 7), + z0 = svld1sb_s32 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_s32_8: +** incb x0, all, mul #2 +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s32_8, svint32_t, int8_t, + z0 = svld1sb_s32 (p0, x0 + svcntw () * 8), + z0 = svld1sb_s32 (p0, x0 + svcntw () * 8)) + +/* +** ld1sb_s32_m1: +** ld1sb z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s32_m1, svint32_t, int8_t, + z0 = svld1sb_s32 (p0, x0 - svcntw ()), + z0 = svld1sb_s32 (p0, x0 - svcntw ())) + +/* +** ld1sb_s32_m8: +** ld1sb z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s32_m8, svint32_t, int8_t, + z0 = svld1sb_s32 (p0, x0 - svcntw () * 8), + z0 = svld1sb_s32 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_s32_m9: +** decw x0, all, mul #9 +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s32_m9, svint32_t, int8_t, + z0 = svld1sb_s32 (p0, x0 - svcntw () * 9), + z0 = svld1sb_s32 (p0, x0 - svcntw () * 9)) + +/* +** ld1sb_vnum_s32_0: +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s32_0, svint32_t, int8_t, + z0 = svld1sb_vnum_s32 (p0, x0, 0), + z0 = svld1sb_vnum_s32 (p0, x0, 0)) + +/* +** ld1sb_vnum_s32_1: +** ld1sb z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s32_1, svint32_t, int8_t, + z0 = svld1sb_vnum_s32 (p0, x0, 1), + z0 = svld1sb_vnum_s32 (p0, x0, 1)) + +/* +** ld1sb_vnum_s32_7: +** ld1sb z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s32_7, svint32_t, int8_t, + z0 = svld1sb_vnum_s32 (p0, x0, 7), + z0 = svld1sb_vnum_s32 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_s32_8: +** incb x0, all, mul #2 +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s32_8, svint32_t, int8_t, + z0 = svld1sb_vnum_s32 (p0, x0, 8), + z0 = svld1sb_vnum_s32 (p0, x0, 8)) + +/* +** ld1sb_vnum_s32_m1: +** ld1sb z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s32_m1, svint32_t, int8_t, + z0 = svld1sb_vnum_s32 (p0, x0, -1), + z0 = svld1sb_vnum_s32 (p0, x0, -1)) + +/* +** ld1sb_vnum_s32_m8: +** ld1sb z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s32_m8, svint32_t, int8_t, + z0 = svld1sb_vnum_s32 (p0, x0, -8), + z0 = svld1sb_vnum_s32 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sb_vnum_s32_m9: +** decw x0, all, mul #9 +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s32_m9, svint32_t, int8_t, + z0 = svld1sb_vnum_s32 (p0, x0, -9), + z0 = svld1sb_vnum_s32 (p0, x0, -9)) + +/* +** ld1sb_vnum_s32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1sb z0\.s, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1sb z0\.s, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1sb_vnum_s32_x1, svint32_t, int8_t, + z0 = svld1sb_vnum_s32 (p0, x0, x1), + z0 = svld1sb_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s64.c new file mode 100644 index 000000000..1984e1956 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_s64.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_s64_base: +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s64_base, svint64_t, int8_t, + z0 = svld1sb_s64 (p0, x0), + z0 = svld1sb_s64 (p0, x0)) + +/* +** ld1sb_s64_index: +** ld1sb z0\.d, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1sb_s64_index, svint64_t, int8_t, + z0 = svld1sb_s64 (p0, x0 + x1), + z0 = svld1sb_s64 (p0, x0 + x1)) + +/* +** ld1sb_s64_1: +** ld1sb z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s64_1, svint64_t, int8_t, + z0 = svld1sb_s64 (p0, x0 + svcntd ()), + z0 = svld1sb_s64 (p0, x0 + svcntd ())) + +/* +** ld1sb_s64_7: +** ld1sb z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s64_7, svint64_t, int8_t, + z0 = svld1sb_s64 (p0, x0 + svcntd () * 7), + z0 = svld1sb_s64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_s64_8: +** incb x0 +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s64_8, svint64_t, int8_t, + z0 = svld1sb_s64 (p0, x0 + svcntd () * 8), + z0 = svld1sb_s64 (p0, x0 + svcntd () * 8)) + +/* +** ld1sb_s64_m1: +** ld1sb z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s64_m1, svint64_t, int8_t, + z0 = svld1sb_s64 (p0, x0 - svcntd ()), + z0 = svld1sb_s64 (p0, x0 - svcntd ())) + +/* +** ld1sb_s64_m8: +** ld1sb z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_s64_m8, svint64_t, int8_t, + z0 = svld1sb_s64 (p0, x0 - svcntd () * 8), + z0 = svld1sb_s64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_s64_m9: +** decd x0, all, mul #9 +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_s64_m9, svint64_t, int8_t, + z0 = svld1sb_s64 (p0, x0 - svcntd () * 9), + z0 = svld1sb_s64 (p0, x0 - svcntd () * 9)) + +/* +** ld1sb_vnum_s64_0: +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s64_0, svint64_t, int8_t, + z0 = svld1sb_vnum_s64 (p0, x0, 0), + z0 = svld1sb_vnum_s64 (p0, x0, 0)) + +/* +** ld1sb_vnum_s64_1: +** ld1sb z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s64_1, svint64_t, int8_t, + z0 = svld1sb_vnum_s64 (p0, x0, 1), + z0 = svld1sb_vnum_s64 (p0, x0, 1)) + +/* +** ld1sb_vnum_s64_7: +** ld1sb z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s64_7, svint64_t, int8_t, + z0 = svld1sb_vnum_s64 (p0, x0, 7), + z0 = svld1sb_vnum_s64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sb_vnum_s64_8: +** incb x0 +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s64_8, svint64_t, int8_t, + z0 = svld1sb_vnum_s64 (p0, x0, 8), + z0 = svld1sb_vnum_s64 (p0, x0, 8)) + +/* +** ld1sb_vnum_s64_m1: +** ld1sb z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s64_m1, svint64_t, int8_t, + z0 = svld1sb_vnum_s64 (p0, x0, -1), + z0 = svld1sb_vnum_s64 (p0, x0, -1)) + +/* +** ld1sb_vnum_s64_m8: +** ld1sb z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s64_m8, svint64_t, int8_t, + z0 = svld1sb_vnum_s64 (p0, x0, -8), + z0 = svld1sb_vnum_s64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_s64_m9: +** decd x0, all, mul #9 +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_s64_m9, svint64_t, int8_t, + z0 = svld1sb_vnum_s64 (p0, x0, -9), + z0 = svld1sb_vnum_s64 (p0, x0, -9)) + +/* +** ld1sb_vnum_s64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1sb z0\.d, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1sb z0\.d, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1sb_vnum_s64_x1, svint64_t, int8_t, + z0 = svld1sb_vnum_s64 (p0, x0, x1), + z0 = svld1sb_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u16.c new file mode 100644 index 000000000..cfa616251 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u16.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_u16_base: +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u16_base, svuint16_t, int8_t, + z0 = svld1sb_u16 (p0, x0), + z0 = svld1sb_u16 (p0, x0)) + +/* +** ld1sb_u16_index: +** ld1sb z0\.h, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1sb_u16_index, svuint16_t, int8_t, + z0 = svld1sb_u16 (p0, x0 + x1), + z0 = svld1sb_u16 (p0, x0 + x1)) + +/* +** ld1sb_u16_1: +** ld1sb z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u16_1, svuint16_t, int8_t, + z0 = svld1sb_u16 (p0, x0 + svcnth ()), + z0 = svld1sb_u16 (p0, x0 + svcnth ())) + +/* +** ld1sb_u16_7: +** ld1sb z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u16_7, svuint16_t, int8_t, + z0 = svld1sb_u16 (p0, x0 + svcnth () * 7), + z0 = svld1sb_u16 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_u16_8: +** incb x0, all, mul #4 +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u16_8, svuint16_t, int8_t, + z0 = svld1sb_u16 (p0, x0 + svcnth () * 8), + z0 = svld1sb_u16 (p0, x0 + svcnth () * 8)) + +/* +** ld1sb_u16_m1: +** ld1sb z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u16_m1, svuint16_t, int8_t, + z0 = svld1sb_u16 (p0, x0 - svcnth ()), + z0 = svld1sb_u16 (p0, x0 - svcnth ())) + +/* +** ld1sb_u16_m8: +** ld1sb z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u16_m8, svuint16_t, int8_t, + z0 = svld1sb_u16 (p0, x0 - svcnth () * 8), + z0 = svld1sb_u16 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sb_u16_m9: +** dech x0, all, mul #9 +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u16_m9, svuint16_t, int8_t, + z0 = svld1sb_u16 (p0, x0 - svcnth () * 9), + z0 = svld1sb_u16 (p0, x0 - svcnth () * 9)) + +/* +** ld1sb_vnum_u16_0: +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u16_0, svuint16_t, int8_t, + z0 = svld1sb_vnum_u16 (p0, x0, 0), + z0 = svld1sb_vnum_u16 (p0, x0, 0)) + +/* +** ld1sb_vnum_u16_1: +** ld1sb z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u16_1, svuint16_t, int8_t, + z0 = svld1sb_vnum_u16 (p0, x0, 1), + z0 = svld1sb_vnum_u16 (p0, x0, 1)) + +/* +** ld1sb_vnum_u16_7: +** ld1sb z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u16_7, svuint16_t, int8_t, + z0 = svld1sb_vnum_u16 (p0, x0, 7), + z0 = svld1sb_vnum_u16 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_u16_8: +** incb x0, all, mul #4 +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u16_8, svuint16_t, int8_t, + z0 = svld1sb_vnum_u16 (p0, x0, 8), + z0 = svld1sb_vnum_u16 (p0, x0, 8)) + +/* +** ld1sb_vnum_u16_m1: +** ld1sb z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u16_m1, svuint16_t, int8_t, + z0 = svld1sb_vnum_u16 (p0, x0, -1), + z0 = svld1sb_vnum_u16 (p0, x0, -1)) + +/* +** ld1sb_vnum_u16_m8: +** ld1sb z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u16_m8, svuint16_t, int8_t, + z0 = svld1sb_vnum_u16 (p0, x0, -8), + z0 = svld1sb_vnum_u16 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_u16_m9: +** dech x0, all, mul #9 +** ld1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u16_m9, svuint16_t, int8_t, + z0 = svld1sb_vnum_u16 (p0, x0, -9), + z0 = svld1sb_vnum_u16 (p0, x0, -9)) + +/* +** ld1sb_vnum_u16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1sb z0\.h, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1sb z0\.h, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1sb_vnum_u16_x1, svuint16_t, int8_t, + z0 = svld1sb_vnum_u16 (p0, x0, x1), + z0 = svld1sb_vnum_u16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u32.c new file mode 100644 index 000000000..990ae5e1b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u32.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_u32_base: +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u32_base, svuint32_t, int8_t, + z0 = svld1sb_u32 (p0, x0), + z0 = svld1sb_u32 (p0, x0)) + +/* +** ld1sb_u32_index: +** ld1sb z0\.s, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1sb_u32_index, svuint32_t, int8_t, + z0 = svld1sb_u32 (p0, x0 + x1), + z0 = svld1sb_u32 (p0, x0 + x1)) + +/* +** ld1sb_u32_1: +** ld1sb z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u32_1, svuint32_t, int8_t, + z0 = svld1sb_u32 (p0, x0 + svcntw ()), + z0 = svld1sb_u32 (p0, x0 + svcntw ())) + +/* +** ld1sb_u32_7: +** ld1sb z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u32_7, svuint32_t, int8_t, + z0 = svld1sb_u32 (p0, x0 + svcntw () * 7), + z0 = svld1sb_u32 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sb_u32_8: +** incb x0, all, mul #2 +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u32_8, svuint32_t, int8_t, + z0 = svld1sb_u32 (p0, x0 + svcntw () * 8), + z0 = svld1sb_u32 (p0, x0 + svcntw () * 8)) + +/* +** ld1sb_u32_m1: +** ld1sb z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u32_m1, svuint32_t, int8_t, + z0 = svld1sb_u32 (p0, x0 - svcntw ()), + z0 = svld1sb_u32 (p0, x0 - svcntw ())) + +/* +** ld1sb_u32_m8: +** ld1sb z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u32_m8, svuint32_t, int8_t, + z0 = svld1sb_u32 (p0, x0 - svcntw () * 8), + z0 = svld1sb_u32 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_u32_m9: +** decw x0, all, mul #9 +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u32_m9, svuint32_t, int8_t, + z0 = svld1sb_u32 (p0, x0 - svcntw () * 9), + z0 = svld1sb_u32 (p0, x0 - svcntw () * 9)) + +/* +** ld1sb_vnum_u32_0: +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u32_0, svuint32_t, int8_t, + z0 = svld1sb_vnum_u32 (p0, x0, 0), + z0 = svld1sb_vnum_u32 (p0, x0, 0)) + +/* +** ld1sb_vnum_u32_1: +** ld1sb z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u32_1, svuint32_t, int8_t, + z0 = svld1sb_vnum_u32 (p0, x0, 1), + z0 = svld1sb_vnum_u32 (p0, x0, 1)) + +/* +** ld1sb_vnum_u32_7: +** ld1sb z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u32_7, svuint32_t, int8_t, + z0 = svld1sb_vnum_u32 (p0, x0, 7), + z0 = svld1sb_vnum_u32 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_u32_8: +** incb x0, all, mul #2 +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u32_8, svuint32_t, int8_t, + z0 = svld1sb_vnum_u32 (p0, x0, 8), + z0 = svld1sb_vnum_u32 (p0, x0, 8)) + +/* +** ld1sb_vnum_u32_m1: +** ld1sb z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u32_m1, svuint32_t, int8_t, + z0 = svld1sb_vnum_u32 (p0, x0, -1), + z0 = svld1sb_vnum_u32 (p0, x0, -1)) + +/* +** ld1sb_vnum_u32_m8: +** ld1sb z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u32_m8, svuint32_t, int8_t, + z0 = svld1sb_vnum_u32 (p0, x0, -8), + z0 = svld1sb_vnum_u32 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_u32_m9: +** decw x0, all, mul #9 +** ld1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u32_m9, svuint32_t, int8_t, + z0 = svld1sb_vnum_u32 (p0, x0, -9), + z0 = svld1sb_vnum_u32 (p0, x0, -9)) + +/* +** ld1sb_vnum_u32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1sb z0\.s, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1sb z0\.s, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1sb_vnum_u32_x1, svuint32_t, int8_t, + z0 = svld1sb_vnum_u32 (p0, x0, x1), + z0 = svld1sb_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u64.c new file mode 100644 index 000000000..8051bf140 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sb_u64.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sb_u64_base: +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u64_base, svuint64_t, int8_t, + z0 = svld1sb_u64 (p0, x0), + z0 = svld1sb_u64 (p0, x0)) + +/* +** ld1sb_u64_index: +** ld1sb z0\.d, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1sb_u64_index, svuint64_t, int8_t, + z0 = svld1sb_u64 (p0, x0 + x1), + z0 = svld1sb_u64 (p0, x0 + x1)) + +/* +** ld1sb_u64_1: +** ld1sb z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u64_1, svuint64_t, int8_t, + z0 = svld1sb_u64 (p0, x0 + svcntd ()), + z0 = svld1sb_u64 (p0, x0 + svcntd ())) + +/* +** ld1sb_u64_7: +** ld1sb z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u64_7, svuint64_t, int8_t, + z0 = svld1sb_u64 (p0, x0 + svcntd () * 7), + z0 = svld1sb_u64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_u64_8: +** incb x0 +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u64_8, svuint64_t, int8_t, + z0 = svld1sb_u64 (p0, x0 + svcntd () * 8), + z0 = svld1sb_u64 (p0, x0 + svcntd () * 8)) + +/* +** ld1sb_u64_m1: +** ld1sb z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u64_m1, svuint64_t, int8_t, + z0 = svld1sb_u64 (p0, x0 - svcntd ()), + z0 = svld1sb_u64 (p0, x0 - svcntd ())) + +/* +** ld1sb_u64_m8: +** ld1sb z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_u64_m8, svuint64_t, int8_t, + z0 = svld1sb_u64 (p0, x0 - svcntd () * 8), + z0 = svld1sb_u64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_u64_m9: +** decd x0, all, mul #9 +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_u64_m9, svuint64_t, int8_t, + z0 = svld1sb_u64 (p0, x0 - svcntd () * 9), + z0 = svld1sb_u64 (p0, x0 - svcntd () * 9)) + +/* +** ld1sb_vnum_u64_0: +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u64_0, svuint64_t, int8_t, + z0 = svld1sb_vnum_u64 (p0, x0, 0), + z0 = svld1sb_vnum_u64 (p0, x0, 0)) + +/* +** ld1sb_vnum_u64_1: +** ld1sb z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u64_1, svuint64_t, int8_t, + z0 = svld1sb_vnum_u64 (p0, x0, 1), + z0 = svld1sb_vnum_u64 (p0, x0, 1)) + +/* +** ld1sb_vnum_u64_7: +** ld1sb z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u64_7, svuint64_t, int8_t, + z0 = svld1sb_vnum_u64 (p0, x0, 7), + z0 = svld1sb_vnum_u64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sb_vnum_u64_8: +** incb x0 +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u64_8, svuint64_t, int8_t, + z0 = svld1sb_vnum_u64 (p0, x0, 8), + z0 = svld1sb_vnum_u64 (p0, x0, 8)) + +/* +** ld1sb_vnum_u64_m1: +** ld1sb z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u64_m1, svuint64_t, int8_t, + z0 = svld1sb_vnum_u64 (p0, x0, -1), + z0 = svld1sb_vnum_u64 (p0, x0, -1)) + +/* +** ld1sb_vnum_u64_m8: +** ld1sb z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u64_m8, svuint64_t, int8_t, + z0 = svld1sb_vnum_u64 (p0, x0, -8), + z0 = svld1sb_vnum_u64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sb_vnum_u64_m9: +** decd x0, all, mul #9 +** ld1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sb_vnum_u64_m9, svuint64_t, int8_t, + z0 = svld1sb_vnum_u64 (p0, x0, -9), + z0 = svld1sb_vnum_u64 (p0, x0, -9)) + +/* +** ld1sb_vnum_u64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1sb z0\.d, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1sb z0\.d, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1sb_vnum_u64_x1, svuint64_t, int8_t, + z0 = svld1sb_vnum_u64 (p0, x0, x1), + z0 = svld1sb_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s32.c new file mode 100644 index 000000000..ed07b4dfc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s32.c @@ -0,0 +1,252 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sh_gather_s32_tied1: +** ld1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_s32 (p0, z0), + z0_res = svld1sh_gather_s32 (p0, z0)) + +/* +** ld1sh_gather_s32_untied: +** ld1sh z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_s32 (p0, z1), + z0_res = svld1sh_gather_s32 (p0, z1)) + +/* +** ld1sh_gather_x0_s32_offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svld1sh_gather_offset_s32 (p0, z0, x0)) + +/* +** ld1sh_gather_m2_s32_offset: +** mov (x[0-9]+), #?-2 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_m2_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, -2), + z0_res = svld1sh_gather_offset_s32 (p0, z0, -2)) + +/* +** ld1sh_gather_0_s32_offset: +** ld1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svld1sh_gather_offset_s32 (p0, z0, 0)) + +/* +** ld1sh_gather_5_s32_offset: +** mov (x[0-9]+), #?5 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svld1sh_gather_offset_s32 (p0, z0, 5)) + +/* +** ld1sh_gather_6_s32_offset: +** ld1sh z0\.s, p0/z, \[z0\.s, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_6_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 6), + z0_res = svld1sh_gather_offset_s32 (p0, z0, 6)) + +/* +** ld1sh_gather_62_s32_offset: +** ld1sh z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_62_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 62), + z0_res = svld1sh_gather_offset_s32 (p0, z0, 62)) + +/* +** ld1sh_gather_64_s32_offset: +** mov (x[0-9]+), #?64 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_64_s32_offset, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_s32 (p0, z0, 64), + z0_res = svld1sh_gather_offset_s32 (p0, z0, 64)) + +/* +** ld1sh_gather_x0_s32_index: +** lsl (x[0-9]+), x0, #?1 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ 
+TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_s32_index, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, x0), + z0_res = svld1sh_gather_index_s32 (p0, z0, x0)) + +/* +** ld1sh_gather_m1_s32_index: +** mov (x[0-9]+), #?-2 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_m1_s32_index, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, -1), + z0_res = svld1sh_gather_index_s32 (p0, z0, -1)) + +/* +** ld1sh_gather_0_s32_index: +** ld1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_0_s32_index, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, 0), + z0_res = svld1sh_gather_index_s32 (p0, z0, 0)) + +/* +** ld1sh_gather_5_s32_index: +** ld1sh z0\.s, p0/z, \[z0\.s, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_5_s32_index, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, 5), + z0_res = svld1sh_gather_index_s32 (p0, z0, 5)) + +/* +** ld1sh_gather_31_s32_index: +** ld1sh z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_31_s32_index, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, 31), + z0_res = svld1sh_gather_index_s32 (p0, z0, 31)) + +/* +** ld1sh_gather_32_s32_index: +** mov (x[0-9]+), #?64 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_32_s32_index, svint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_s32 (p0, z0, 32), + z0_res = svld1sh_gather_index_s32 (p0, z0, 32)) + +/* +** ld1sh_gather_x0_s32_s32offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s32_s32offset, svint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1sh_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_s32_s32offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s32_s32offset, svint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1sh_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_s32_s32offset: +** ld1sh z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s32_s32offset, svint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32offset_s32 (p0, x0, z1), + z0_res = svld1sh_gather_offset_s32 (p0, x0, z1)) + +/* +** ld1sh_gather_x0_s32_u32offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s32_u32offset, svint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1sh_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_s32_u32offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s32_u32offset, svint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1sh_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_s32_u32offset: +** ld1sh z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s32_u32offset, svint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32offset_s32 (p0, x0, z1), + z0_res = svld1sh_gather_offset_s32 (p0, x0, z1)) + +/* +** ld1sh_gather_x0_s32_s32index: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s32_s32index, svint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32index_s32 (p0, x0, z0), + z0_res = 
svld1sh_gather_index_s32 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_s32_s32index: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s32_s32index, svint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32index_s32 (p0, x0, z0), + z0_res = svld1sh_gather_index_s32 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_s32_s32index: +** ld1sh z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s32_s32index, svint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32index_s32 (p0, x0, z1), + z0_res = svld1sh_gather_index_s32 (p0, x0, z1)) + +/* +** ld1sh_gather_x0_s32_u32index: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s32_u32index, svint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32index_s32 (p0, x0, z0), + z0_res = svld1sh_gather_index_s32 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_s32_u32index: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s32_u32index, svint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32index_s32 (p0, x0, z0), + z0_res = svld1sh_gather_index_s32 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_s32_u32index: +** ld1sh z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s32_u32index, svint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32index_s32 (p0, x0, z1), + z0_res = svld1sh_gather_index_s32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s64.c new file mode 100644 index 000000000..20ca42720 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_s64.c @@ -0,0 +1,288 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sh_gather_s64_tied1: +** ld1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_s64 (p0, z0), + z0_res = svld1sh_gather_s64 (p0, z0)) + +/* +** ld1sh_gather_s64_untied: +** ld1sh z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_s64 (p0, z1), + z0_res = svld1sh_gather_s64 (p0, z1)) + +/* +** ld1sh_gather_x0_s64_offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svld1sh_gather_offset_s64 (p0, z0, x0)) + +/* +** ld1sh_gather_m2_s64_offset: +** mov (x[0-9]+), #?-2 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_m2_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, -2), + z0_res = svld1sh_gather_offset_s64 (p0, z0, -2)) + +/* +** ld1sh_gather_0_s64_offset: +** ld1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svld1sh_gather_offset_s64 (p0, z0, 0)) + +/* +** ld1sh_gather_5_s64_offset: +** mov (x[0-9]+), #?5 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svld1sh_gather_offset_s64 (p0, z0, 5)) + +/* +** ld1sh_gather_6_s64_offset: +** ld1sh z0\.d, p0/z, \[z0\.d, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_6_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 6), + z0_res = svld1sh_gather_offset_s64 (p0, z0, 6)) + +/* +** ld1sh_gather_62_s64_offset: +** ld1sh z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_62_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 62), + z0_res = svld1sh_gather_offset_s64 (p0, z0, 62)) + +/* +** ld1sh_gather_64_s64_offset: +** mov (x[0-9]+), #?64 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_64_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_s64 (p0, z0, 64), + z0_res = svld1sh_gather_offset_s64 (p0, z0, 64)) + +/* +** ld1sh_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?1 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svld1sh_gather_index_s64 (p0, z0, x0)) + +/* +** ld1sh_gather_m1_s64_index: +** mov (x[0-9]+), #?-2 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svld1sh_gather_index_s64 (p0, z0, -1)) + +/* +** ld1sh_gather_0_s64_index: +** ld1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svld1sh_gather_index_s64 (p0, z0, 0)) + +/* +** ld1sh_gather_5_s64_index: +** ld1sh z0\.d, p0/z, \[z0\.d, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, 5), + z0_res = 
svld1sh_gather_index_s64 (p0, z0, 5)) + +/* +** ld1sh_gather_31_s64_index: +** ld1sh z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svld1sh_gather_index_s64 (p0, z0, 31)) + +/* +** ld1sh_gather_32_s64_index: +** mov (x[0-9]+), #?64 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svld1sh_gather_index_s64 (p0, z0, 32)) + +/* +** ld1sh_gather_x0_s64_s64offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s64_s64offset, svint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1sh_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_s64_s64offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s64_s64offset, svint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1sh_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_s64_s64offset: +** ld1sh z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s64_s64offset, svint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64offset_s64 (p0, x0, z1), + z0_res = svld1sh_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1sh_gather_ext_s64_s64offset: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_s64_s64offset, svint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sh_gather_x0_s64_u64offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s64_u64offset, svint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1sh_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_s64_u64offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s64_u64offset, svint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1sh_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_s64_u64offset: +** ld1sh z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s64_u64offset, svint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64offset_s64 (p0, x0, z1), + z0_res = svld1sh_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1sh_gather_ext_s64_u64offset: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_s64_u64offset, svint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sh_gather_x0_s64_s64index: +** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s64_s64index, svint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1sh_gather_index_s64 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_s64_s64index: +** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s64_s64index, svint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1sh_gather_index_s64 (p0, x0, z0)) + +/* +** 
ld1sh_gather_untied_s64_s64index: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s64_s64index, svint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64index_s64 (p0, x0, z1), + z0_res = svld1sh_gather_index_s64 (p0, x0, z1)) + +/* +** ld1sh_gather_ext_s64_s64index: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_s64_s64index, svint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sh_gather_x0_s64_u64index: +** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_s64_u64index, svint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1sh_gather_index_s64 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_s64_u64index: +** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_s64_u64index, svint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1sh_gather_index_s64 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_s64_u64index: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_s64_u64index, svint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64index_s64 (p0, x0, z1), + z0_res = svld1sh_gather_index_s64 (p0, x0, z1)) + +/* +** ld1sh_gather_ext_s64_u64index: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_s64_u64index, svint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u32.c new file mode 100644 index 000000000..e3a85a23f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u32.c @@ -0,0 +1,252 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sh_gather_u32_tied1: +** ld1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_u32 (p0, z0), + z0_res = svld1sh_gather_u32 (p0, z0)) + +/* +** ld1sh_gather_u32_untied: +** ld1sh z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_u32 (p0, z1), + z0_res = svld1sh_gather_u32 (p0, z1)) + +/* +** ld1sh_gather_x0_u32_offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svld1sh_gather_offset_u32 (p0, z0, x0)) + +/* +** ld1sh_gather_m2_u32_offset: +** mov (x[0-9]+), #?-2 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_m2_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, -2), + z0_res = svld1sh_gather_offset_u32 (p0, z0, -2)) + +/* +** ld1sh_gather_0_u32_offset: +** ld1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svld1sh_gather_offset_u32 (p0, z0, 0)) + +/* +** ld1sh_gather_5_u32_offset: +** mov (x[0-9]+), #?5 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svld1sh_gather_offset_u32 (p0, z0, 5)) + +/* +** ld1sh_gather_6_u32_offset: +** ld1sh z0\.s, p0/z, \[z0\.s, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_6_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 6), + z0_res = svld1sh_gather_offset_u32 (p0, z0, 6)) + +/* +** ld1sh_gather_62_u32_offset: +** ld1sh z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_62_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 62), + z0_res = svld1sh_gather_offset_u32 (p0, z0, 62)) + +/* +** ld1sh_gather_64_u32_offset: +** mov (x[0-9]+), #?64 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_64_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_offset_u32 (p0, z0, 64), + z0_res = svld1sh_gather_offset_u32 (p0, z0, 64)) + +/* +** ld1sh_gather_x0_u32_index: +** lsl (x[0-9]+), x0, #?1 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_u32_index, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, x0), + z0_res = svld1sh_gather_index_u32 (p0, z0, x0)) + +/* +** ld1sh_gather_m1_u32_index: +** mov (x[0-9]+), #?-2 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_m1_u32_index, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, -1), + z0_res = svld1sh_gather_index_u32 (p0, z0, -1)) + +/* +** ld1sh_gather_0_u32_index: +** ld1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_0_u32_index, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, 0), + z0_res = svld1sh_gather_index_u32 (p0, z0, 0)) + +/* +** ld1sh_gather_5_u32_index: +** ld1sh z0\.s, p0/z, \[z0\.s, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_5_u32_index, svuint32_t, svuint32_t, + z0_res = 
svld1sh_gather_u32base_index_u32 (p0, z0, 5), + z0_res = svld1sh_gather_index_u32 (p0, z0, 5)) + +/* +** ld1sh_gather_31_u32_index: +** ld1sh z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_31_u32_index, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, 31), + z0_res = svld1sh_gather_index_u32 (p0, z0, 31)) + +/* +** ld1sh_gather_32_u32_index: +** mov (x[0-9]+), #?64 +** ld1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_32_u32_index, svuint32_t, svuint32_t, + z0_res = svld1sh_gather_u32base_index_u32 (p0, z0, 32), + z0_res = svld1sh_gather_index_u32 (p0, z0, 32)) + +/* +** ld1sh_gather_x0_u32_s32offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u32_s32offset, svuint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1sh_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_u32_s32offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u32_s32offset, svuint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1sh_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_u32_s32offset: +** ld1sh z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u32_s32offset, svuint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32offset_u32 (p0, x0, z1), + z0_res = svld1sh_gather_offset_u32 (p0, x0, z1)) + +/* +** ld1sh_gather_x0_u32_u32offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u32_u32offset, svuint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1sh_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_u32_u32offset: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u32_u32offset, svuint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1sh_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_u32_u32offset: +** ld1sh z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u32_u32offset, svuint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32offset_u32 (p0, x0, z1), + z0_res = svld1sh_gather_offset_u32 (p0, x0, z1)) + +/* +** ld1sh_gather_x0_u32_s32index: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u32_s32index, svuint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32index_u32 (p0, x0, z0), + z0_res = svld1sh_gather_index_u32 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_u32_s32index: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u32_s32index, svuint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32index_u32 (p0, x0, z0), + z0_res = svld1sh_gather_index_u32 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_u32_s32index: +** ld1sh z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u32_s32index, svuint32_t, int16_t, svint32_t, + z0_res = svld1sh_gather_s32index_u32 (p0, x0, z1), + z0_res = svld1sh_gather_index_u32 (p0, x0, z1)) + +/* +** ld1sh_gather_x0_u32_u32index: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u32_u32index, svuint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32index_u32 (p0, x0, z0), + z0_res = svld1sh_gather_index_u32 (p0, x0, 
z0)) + +/* +** ld1sh_gather_tied1_u32_u32index: +** ld1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u32_u32index, svuint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32index_u32 (p0, x0, z0), + z0_res = svld1sh_gather_index_u32 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_u32_u32index: +** ld1sh z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u32_u32index, svuint32_t, int16_t, svuint32_t, + z0_res = svld1sh_gather_u32index_u32 (p0, x0, z1), + z0_res = svld1sh_gather_index_u32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u64.c new file mode 100644 index 000000000..3a0094fba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_gather_u64.c @@ -0,0 +1,288 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sh_gather_u64_tied1: +** ld1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_u64 (p0, z0), + z0_res = svld1sh_gather_u64 (p0, z0)) + +/* +** ld1sh_gather_u64_untied: +** ld1sh z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_u64 (p0, z1), + z0_res = svld1sh_gather_u64 (p0, z1)) + +/* +** ld1sh_gather_x0_u64_offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svld1sh_gather_offset_u64 (p0, z0, x0)) + +/* +** ld1sh_gather_m2_u64_offset: +** mov (x[0-9]+), #?-2 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_m2_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, -2), + z0_res = svld1sh_gather_offset_u64 (p0, z0, -2)) + +/* +** ld1sh_gather_0_u64_offset: +** ld1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svld1sh_gather_offset_u64 (p0, z0, 0)) + +/* +** ld1sh_gather_5_u64_offset: +** mov (x[0-9]+), #?5 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svld1sh_gather_offset_u64 (p0, z0, 5)) + +/* +** ld1sh_gather_6_u64_offset: +** ld1sh z0\.d, p0/z, \[z0\.d, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_6_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 6), + z0_res = svld1sh_gather_offset_u64 (p0, z0, 6)) + +/* +** ld1sh_gather_62_u64_offset: +** ld1sh z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_62_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 62), + z0_res = svld1sh_gather_offset_u64 (p0, z0, 62)) + +/* +** ld1sh_gather_64_u64_offset: +** mov (x[0-9]+), #?64 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_64_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_offset_u64 (p0, z0, 64), + z0_res = svld1sh_gather_offset_u64 (p0, z0, 64)) + +/* +** ld1sh_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?1 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ 
+TEST_LOAD_GATHER_ZS (ld1sh_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svld1sh_gather_index_u64 (p0, z0, x0)) + +/* +** ld1sh_gather_m1_u64_index: +** mov (x[0-9]+), #?-2 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svld1sh_gather_index_u64 (p0, z0, -1)) + +/* +** ld1sh_gather_0_u64_index: +** ld1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svld1sh_gather_index_u64 (p0, z0, 0)) + +/* +** ld1sh_gather_5_u64_index: +** ld1sh z0\.d, p0/z, \[z0\.d, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svld1sh_gather_index_u64 (p0, z0, 5)) + +/* +** ld1sh_gather_31_u64_index: +** ld1sh z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svld1sh_gather_index_u64 (p0, z0, 31)) + +/* +** ld1sh_gather_32_u64_index: +** mov (x[0-9]+), #?64 +** ld1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sh_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sh_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svld1sh_gather_index_u64 (p0, z0, 32)) + +/* +** ld1sh_gather_x0_u64_s64offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u64_s64offset, svuint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1sh_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_u64_s64offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u64_s64offset, svuint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1sh_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_u64_s64offset: +** ld1sh z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u64_s64offset, svuint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64offset_u64 (p0, x0, z1), + z0_res = svld1sh_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1sh_gather_ext_u64_s64offset: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_u64_s64offset, svuint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sh_gather_x0_u64_u64offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u64_u64offset, svuint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1sh_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_u64_u64offset: +** ld1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u64_u64offset, svuint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1sh_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_u64_u64offset: +** ld1sh z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u64_u64offset, svuint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64offset_u64 (p0, x0, z1), + z0_res = 
svld1sh_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1sh_gather_ext_u64_u64offset: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_u64_u64offset, svuint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sh_gather_x0_u64_s64index: +** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u64_s64index, svuint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1sh_gather_index_u64 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_u64_s64index: +** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u64_s64index, svuint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1sh_gather_index_u64 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_u64_s64index: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u64_s64index, svuint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64index_u64 (p0, x0, z1), + z0_res = svld1sh_gather_index_u64 (p0, x0, z1)) + +/* +** ld1sh_gather_ext_u64_s64index: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_u64_s64index, svuint64_t, int16_t, svint64_t, + z0_res = svld1sh_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sh_gather_x0_u64_u64index: +** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_x0_u64_u64index, svuint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1sh_gather_index_u64 (p0, x0, z0)) + +/* +** ld1sh_gather_tied1_u64_u64index: +** ld1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_tied1_u64_u64index, svuint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1sh_gather_index_u64 (p0, x0, z0)) + +/* +** ld1sh_gather_untied_u64_u64index: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_untied_u64_u64index, svuint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64index_u64 (p0, x0, z1), + z0_res = svld1sh_gather_index_u64 (p0, x0, z1)) + +/* +** ld1sh_gather_ext_u64_u64index: +** ld1sh z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sh_gather_ext_u64_u64index, svuint64_t, int16_t, svuint64_t, + z0_res = svld1sh_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s32.c new file mode 100644 index 000000000..8614f52c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sh_s32_base: +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_s32_base, svint32_t, int16_t, + z0 = svld1sh_s32 (p0, x0), + z0 = svld1sh_s32 (p0, x0)) + +/* +** ld1sh_s32_index: +** ld1sh z0\.s, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1sh_s32_index, svint32_t, int16_t, + z0 = svld1sh_s32 (p0, x0 + x1), + z0 = svld1sh_s32 (p0, x0 + x1)) + +/* +** ld1sh_s32_1: +** ld1sh z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_s32_1, svint32_t, int16_t, + z0 = svld1sh_s32 (p0, x0 + svcntw ()), + z0 = svld1sh_s32 (p0, x0 + svcntw ())) + +/* +** ld1sh_s32_7: +** ld1sh z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_s32_7, svint32_t, int16_t, + z0 = svld1sh_s32 (p0, x0 + svcntw () * 7), + z0 = svld1sh_s32 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_s32_8: +** incb x0, all, mul #4 +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_s32_8, svint32_t, int16_t, + z0 = svld1sh_s32 (p0, x0 + svcntw () * 8), + z0 = svld1sh_s32 (p0, x0 + svcntw () * 8)) + +/* +** ld1sh_s32_m1: +** ld1sh z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_s32_m1, svint32_t, int16_t, + z0 = svld1sh_s32 (p0, x0 - svcntw ()), + z0 = svld1sh_s32 (p0, x0 - svcntw ())) + +/* +** ld1sh_s32_m8: +** ld1sh z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_s32_m8, svint32_t, int16_t, + z0 = svld1sh_s32 (p0, x0 - svcntw () * 8), + z0 = svld1sh_s32 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_s32_m9: +** dech x0, all, mul #9 +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_s32_m9, svint32_t, int16_t, + z0 = svld1sh_s32 (p0, x0 - svcntw () * 9), + z0 = svld1sh_s32 (p0, x0 - svcntw () * 9)) + +/* +** ld1sh_vnum_s32_0: +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s32_0, svint32_t, int16_t, + z0 = svld1sh_vnum_s32 (p0, x0, 0), + z0 = svld1sh_vnum_s32 (p0, x0, 0)) + +/* +** ld1sh_vnum_s32_1: +** ld1sh z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s32_1, svint32_t, int16_t, + z0 = svld1sh_vnum_s32 (p0, x0, 1), + z0 = svld1sh_vnum_s32 (p0, x0, 1)) + +/* +** ld1sh_vnum_s32_7: +** ld1sh z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s32_7, svint32_t, int16_t, + z0 = svld1sh_vnum_s32 (p0, x0, 7), + z0 = svld1sh_vnum_s32 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_vnum_s32_8: +** incb x0, all, mul #4 +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s32_8, svint32_t, int16_t, + z0 = svld1sh_vnum_s32 (p0, x0, 8), + z0 = svld1sh_vnum_s32 (p0, x0, 8)) + +/* +** ld1sh_vnum_s32_m1: +** ld1sh z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s32_m1, svint32_t, int16_t, + z0 = svld1sh_vnum_s32 (p0, x0, -1), + z0 = svld1sh_vnum_s32 (p0, x0, -1)) + +/* +** ld1sh_vnum_s32_m8: +** ld1sh z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s32_m8, svint32_t, int16_t, + z0 = svld1sh_vnum_s32 (p0, x0, -8), + z0 = svld1sh_vnum_s32 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_vnum_s32_m9: +** dech x0, all, mul #9 +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s32_m9, svint32_t, int16_t, + z0 = svld1sh_vnum_s32 (p0, x0, -9), + z0 = svld1sh_vnum_s32 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld1sh_vnum_s32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1sh z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s32_x1, svint32_t, int16_t, + z0 = svld1sh_vnum_s32 (p0, x0, x1), + z0 = svld1sh_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s64.c new file mode 100644 index 000000000..c02b40a76 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sh_s64_base: +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_s64_base, svint64_t, int16_t, + z0 = svld1sh_s64 (p0, x0), + z0 = svld1sh_s64 (p0, x0)) + +/* +** ld1sh_s64_index: +** ld1sh z0\.d, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1sh_s64_index, svint64_t, int16_t, + z0 = svld1sh_s64 (p0, x0 + x1), + z0 = svld1sh_s64 (p0, x0 + x1)) + +/* +** ld1sh_s64_1: +** ld1sh z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_s64_1, svint64_t, int16_t, + z0 = svld1sh_s64 (p0, x0 + svcntd ()), + z0 = svld1sh_s64 (p0, x0 + svcntd ())) + +/* +** ld1sh_s64_7: +** ld1sh z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_s64_7, svint64_t, int16_t, + z0 = svld1sh_s64 (p0, x0 + svcntd () * 7), + z0 = svld1sh_s64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_s64_8: +** incb x0, all, mul #2 +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_s64_8, svint64_t, int16_t, + z0 = svld1sh_s64 (p0, x0 + svcntd () * 8), + z0 = svld1sh_s64 (p0, x0 + svcntd () * 8)) + +/* +** ld1sh_s64_m1: +** ld1sh z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_s64_m1, svint64_t, int16_t, + z0 = svld1sh_s64 (p0, x0 - svcntd ()), + z0 = svld1sh_s64 (p0, x0 - svcntd ())) + +/* +** ld1sh_s64_m8: +** ld1sh z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_s64_m8, svint64_t, int16_t, + z0 = svld1sh_s64 (p0, x0 - svcntd () * 8), + z0 = svld1sh_s64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_s64_m9: +** decw x0, all, mul #9 +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_s64_m9, svint64_t, int16_t, + z0 = svld1sh_s64 (p0, x0 - svcntd () * 9), + z0 = svld1sh_s64 (p0, x0 - svcntd () * 9)) + +/* +** ld1sh_vnum_s64_0: +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s64_0, svint64_t, int16_t, + z0 = svld1sh_vnum_s64 (p0, x0, 0), + z0 = svld1sh_vnum_s64 (p0, x0, 0)) + +/* +** ld1sh_vnum_s64_1: +** ld1sh z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s64_1, svint64_t, int16_t, + z0 = svld1sh_vnum_s64 (p0, x0, 1), + z0 = svld1sh_vnum_s64 (p0, x0, 1)) + +/* +** ld1sh_vnum_s64_7: +** ld1sh z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s64_7, svint64_t, int16_t, + z0 = svld1sh_vnum_s64 (p0, x0, 7), + z0 = svld1sh_vnum_s64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sh_vnum_s64_8: +** incb x0, all, mul #2 +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s64_8, svint64_t, int16_t, + z0 = svld1sh_vnum_s64 (p0, x0, 8), + z0 = svld1sh_vnum_s64 (p0, x0, 8)) + +/* +** ld1sh_vnum_s64_m1: +** ld1sh z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s64_m1, svint64_t, int16_t, + z0 = svld1sh_vnum_s64 (p0, x0, -1), + z0 = svld1sh_vnum_s64 (p0, x0, -1)) + +/* +** ld1sh_vnum_s64_m8: +** ld1sh z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s64_m8, svint64_t, int16_t, + z0 = svld1sh_vnum_s64 (p0, x0, -8), + z0 = svld1sh_vnum_s64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_vnum_s64_m9: +** decw x0, all, mul #9 +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s64_m9, svint64_t, int16_t, + z0 = svld1sh_vnum_s64 (p0, x0, -9), + z0 = svld1sh_vnum_s64 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1sh_vnum_s64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1sh z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_s64_x1, svint64_t, int16_t, + z0 = svld1sh_vnum_s64 (p0, x0, x1), + z0 = svld1sh_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u32.c new file mode 100644 index 000000000..ead96174a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sh_u32_base: +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_u32_base, svuint32_t, int16_t, + z0 = svld1sh_u32 (p0, x0), + z0 = svld1sh_u32 (p0, x0)) + +/* +** ld1sh_u32_index: +** ld1sh z0\.s, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1sh_u32_index, svuint32_t, int16_t, + z0 = svld1sh_u32 (p0, x0 + x1), + z0 = svld1sh_u32 (p0, x0 + x1)) + +/* +** ld1sh_u32_1: +** ld1sh z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_u32_1, svuint32_t, int16_t, + z0 = svld1sh_u32 (p0, x0 + svcntw ()), + z0 = svld1sh_u32 (p0, x0 + svcntw ())) + +/* +** ld1sh_u32_7: +** ld1sh z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_u32_7, svuint32_t, int16_t, + z0 = svld1sh_u32 (p0, x0 + svcntw () * 7), + z0 = svld1sh_u32 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_u32_8: +** incb x0, all, mul #4 +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_u32_8, svuint32_t, int16_t, + z0 = svld1sh_u32 (p0, x0 + svcntw () * 8), + z0 = svld1sh_u32 (p0, x0 + svcntw () * 8)) + +/* +** ld1sh_u32_m1: +** ld1sh z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_u32_m1, svuint32_t, int16_t, + z0 = svld1sh_u32 (p0, x0 - svcntw ()), + z0 = svld1sh_u32 (p0, x0 - svcntw ())) + +/* +** ld1sh_u32_m8: +** ld1sh z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_u32_m8, svuint32_t, int16_t, + z0 = svld1sh_u32 (p0, x0 - svcntw () * 8), + z0 = svld1sh_u32 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sh_u32_m9: +** dech x0, all, mul #9 +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_u32_m9, svuint32_t, int16_t, + z0 = svld1sh_u32 (p0, x0 - svcntw () * 9), + z0 = svld1sh_u32 (p0, x0 - svcntw () * 9)) + +/* +** ld1sh_vnum_u32_0: +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u32_0, svuint32_t, int16_t, + z0 = svld1sh_vnum_u32 (p0, x0, 0), + z0 = svld1sh_vnum_u32 (p0, x0, 0)) + +/* +** ld1sh_vnum_u32_1: +** ld1sh z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u32_1, svuint32_t, int16_t, + z0 = svld1sh_vnum_u32 (p0, x0, 1), + z0 = svld1sh_vnum_u32 (p0, x0, 1)) + +/* +** ld1sh_vnum_u32_7: +** ld1sh z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u32_7, svuint32_t, int16_t, + z0 = svld1sh_vnum_u32 (p0, x0, 7), + z0 = svld1sh_vnum_u32 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_vnum_u32_8: +** incb x0, all, mul #4 +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u32_8, svuint32_t, int16_t, + z0 = svld1sh_vnum_u32 (p0, x0, 8), + z0 = svld1sh_vnum_u32 (p0, x0, 8)) + +/* +** ld1sh_vnum_u32_m1: +** ld1sh z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u32_m1, svuint32_t, int16_t, + z0 = svld1sh_vnum_u32 (p0, x0, -1), + z0 = svld1sh_vnum_u32 (p0, x0, -1)) + +/* +** ld1sh_vnum_u32_m8: +** ld1sh z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u32_m8, svuint32_t, int16_t, + z0 = svld1sh_vnum_u32 (p0, x0, -8), + z0 = svld1sh_vnum_u32 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_vnum_u32_m9: +** dech x0, all, mul #9 +** ld1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u32_m9, svuint32_t, int16_t, + z0 = svld1sh_vnum_u32 (p0, x0, -9), + z0 = svld1sh_vnum_u32 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1sh_vnum_u32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1sh z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u32_x1, svuint32_t, int16_t, + z0 = svld1sh_vnum_u32 (p0, x0, x1), + z0 = svld1sh_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u64.c new file mode 100644 index 000000000..e407a08a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sh_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sh_u64_base: +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_u64_base, svuint64_t, int16_t, + z0 = svld1sh_u64 (p0, x0), + z0 = svld1sh_u64 (p0, x0)) + +/* +** ld1sh_u64_index: +** ld1sh z0\.d, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1sh_u64_index, svuint64_t, int16_t, + z0 = svld1sh_u64 (p0, x0 + x1), + z0 = svld1sh_u64 (p0, x0 + x1)) + +/* +** ld1sh_u64_1: +** ld1sh z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_u64_1, svuint64_t, int16_t, + z0 = svld1sh_u64 (p0, x0 + svcntd ()), + z0 = svld1sh_u64 (p0, x0 + svcntd ())) + +/* +** ld1sh_u64_7: +** ld1sh z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_u64_7, svuint64_t, int16_t, + z0 = svld1sh_u64 (p0, x0 + svcntd () * 7), + z0 = svld1sh_u64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sh_u64_8: +** incb x0, all, mul #2 +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_u64_8, svuint64_t, int16_t, + z0 = svld1sh_u64 (p0, x0 + svcntd () * 8), + z0 = svld1sh_u64 (p0, x0 + svcntd () * 8)) + +/* +** ld1sh_u64_m1: +** ld1sh z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_u64_m1, svuint64_t, int16_t, + z0 = svld1sh_u64 (p0, x0 - svcntd ()), + z0 = svld1sh_u64 (p0, x0 - svcntd ())) + +/* +** ld1sh_u64_m8: +** ld1sh z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_u64_m8, svuint64_t, int16_t, + z0 = svld1sh_u64 (p0, x0 - svcntd () * 8), + z0 = svld1sh_u64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_u64_m9: +** decw x0, all, mul #9 +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_u64_m9, svuint64_t, int16_t, + z0 = svld1sh_u64 (p0, x0 - svcntd () * 9), + z0 = svld1sh_u64 (p0, x0 - svcntd () * 9)) + +/* +** ld1sh_vnum_u64_0: +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u64_0, svuint64_t, int16_t, + z0 = svld1sh_vnum_u64 (p0, x0, 0), + z0 = svld1sh_vnum_u64 (p0, x0, 0)) + +/* +** ld1sh_vnum_u64_1: +** ld1sh z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u64_1, svuint64_t, int16_t, + z0 = svld1sh_vnum_u64 (p0, x0, 1), + z0 = svld1sh_vnum_u64 (p0, x0, 1)) + +/* +** ld1sh_vnum_u64_7: +** ld1sh z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u64_7, svuint64_t, int16_t, + z0 = svld1sh_vnum_u64 (p0, x0, 7), + z0 = svld1sh_vnum_u64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_vnum_u64_8: +** incb x0, all, mul #2 +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u64_8, svuint64_t, int16_t, + z0 = svld1sh_vnum_u64 (p0, x0, 8), + z0 = svld1sh_vnum_u64 (p0, x0, 8)) + +/* +** ld1sh_vnum_u64_m1: +** ld1sh z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u64_m1, svuint64_t, int16_t, + z0 = svld1sh_vnum_u64 (p0, x0, -1), + z0 = svld1sh_vnum_u64 (p0, x0, -1)) + +/* +** ld1sh_vnum_u64_m8: +** ld1sh z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u64_m8, svuint64_t, int16_t, + z0 = svld1sh_vnum_u64 (p0, x0, -8), + z0 = svld1sh_vnum_u64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sh_vnum_u64_m9: +** decw x0, all, mul #9 +** ld1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u64_m9, svuint64_t, int16_t, + z0 = svld1sh_vnum_u64 (p0, x0, -9), + z0 = svld1sh_vnum_u64 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1sh_vnum_u64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1sh z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1sh_vnum_u64_x1, svuint64_t, int16_t, + z0 = svld1sh_vnum_u64 (p0, x0, x1), + z0 = svld1sh_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_s64.c new file mode 100644 index 000000000..4d076b486 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_s64.c @@ -0,0 +1,308 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sw_gather_s64_tied1: +** ld1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_s64 (p0, z0), + z0_res = svld1sw_gather_s64 (p0, z0)) + +/* +** ld1sw_gather_s64_untied: +** ld1sw z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_s64 (p0, z1), + z0_res = svld1sw_gather_s64 (p0, z1)) + +/* +** ld1sw_gather_x0_s64_offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svld1sw_gather_offset_s64 (p0, z0, x0)) + +/* +** ld1sw_gather_m4_s64_offset: +** mov (x[0-9]+), #?-4 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_m4_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, -4), + z0_res = svld1sw_gather_offset_s64 (p0, z0, -4)) + +/* +** ld1sw_gather_0_s64_offset: +** ld1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svld1sw_gather_offset_s64 (p0, z0, 0)) + +/* +** ld1sw_gather_5_s64_offset: +** mov (x[0-9]+), #?5 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svld1sw_gather_offset_s64 (p0, z0, 5)) + +/* +** ld1sw_gather_6_s64_offset: +** mov (x[0-9]+), #?6 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_6_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 6), + z0_res = svld1sw_gather_offset_s64 (p0, z0, 6)) + +/* +** ld1sw_gather_7_s64_offset: +** mov (x[0-9]+), #?7 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_7_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 7), + z0_res = svld1sw_gather_offset_s64 (p0, z0, 7)) + +/* +** ld1sw_gather_8_s64_offset: +** ld1sw z0\.d, p0/z, \[z0\.d, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_8_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 8), + z0_res = svld1sw_gather_offset_s64 (p0, z0, 8)) + +/* +** ld1sw_gather_124_s64_offset: +** ld1sw z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_124_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 124), + z0_res = svld1sw_gather_offset_s64 (p0, z0, 124)) + +/* +** ld1sw_gather_128_s64_offset: +** mov (x[0-9]+), #?128 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_128_s64_offset, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_s64 (p0, z0, 128), + z0_res = svld1sw_gather_offset_s64 (p0, z0, 128)) + +/* +** ld1sw_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?2 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svld1sw_gather_index_s64 (p0, z0, x0)) + +/* +** ld1sw_gather_m1_s64_index: +** mov (x[0-9]+), #?-4 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = 
svld1sw_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svld1sw_gather_index_s64 (p0, z0, -1)) + +/* +** ld1sw_gather_0_s64_index: +** ld1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svld1sw_gather_index_s64 (p0, z0, 0)) + +/* +** ld1sw_gather_5_s64_index: +** ld1sw z0\.d, p0/z, \[z0\.d, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svld1sw_gather_index_s64 (p0, z0, 5)) + +/* +** ld1sw_gather_31_s64_index: +** ld1sw z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svld1sw_gather_index_s64 (p0, z0, 31)) + +/* +** ld1sw_gather_32_s64_index: +** mov (x[0-9]+), #?128 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svld1sw_gather_index_s64 (p0, z0, 32)) + +/* +** ld1sw_gather_x0_s64_s64offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_s64_s64offset, svint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1sw_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sw_gather_tied1_s64_s64offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_s64_s64offset, svint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1sw_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sw_gather_untied_s64_s64offset: +** ld1sw z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_s64_s64offset, svint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64offset_s64 (p0, x0, z1), + z0_res = svld1sw_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1sw_gather_ext_s64_s64offset: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_s64_s64offset, svint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sw_gather_x0_s64_u64offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_s64_u64offset, svint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1sw_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sw_gather_tied1_s64_u64offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_s64_u64offset, svint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1sw_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1sw_gather_untied_s64_u64offset: +** ld1sw z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_s64_u64offset, svint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64offset_s64 (p0, x0, z1), + z0_res = svld1sw_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1sw_gather_ext_s64_u64offset: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_s64_u64offset, svint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** 
ld1sw_gather_x0_s64_s64index: +** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_s64_s64index, svint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1sw_gather_index_s64 (p0, x0, z0)) + +/* +** ld1sw_gather_tied1_s64_s64index: +** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_s64_s64index, svint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1sw_gather_index_s64 (p0, x0, z0)) + +/* +** ld1sw_gather_untied_s64_s64index: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_s64_s64index, svint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64index_s64 (p0, x0, z1), + z0_res = svld1sw_gather_index_s64 (p0, x0, z1)) + +/* +** ld1sw_gather_ext_s64_s64index: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_s64_s64index, svint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sw_gather_x0_s64_u64index: +** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_s64_u64index, svint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1sw_gather_index_s64 (p0, x0, z0)) + +/* +** ld1sw_gather_tied1_s64_u64index: +** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_s64_u64index, svint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1sw_gather_index_s64 (p0, x0, z0)) + +/* +** ld1sw_gather_untied_s64_u64index: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_s64_u64index, svint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64index_s64 (p0, x0, z1), + z0_res = svld1sw_gather_index_s64 (p0, x0, z1)) + +/* +** ld1sw_gather_ext_s64_u64index: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_s64_u64index, svint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_u64.c new file mode 100644 index 000000000..ffa85eb3e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_gather_u64.c @@ -0,0 +1,308 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sw_gather_u64_tied1: +** ld1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_u64 (p0, z0), + z0_res = svld1sw_gather_u64 (p0, z0)) + +/* +** ld1sw_gather_u64_untied: +** ld1sw z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_u64 (p0, z1), + z0_res = svld1sw_gather_u64 (p0, z1)) + +/* +** ld1sw_gather_x0_u64_offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svld1sw_gather_offset_u64 (p0, z0, x0)) + +/* +** ld1sw_gather_m4_u64_offset: +** mov (x[0-9]+), #?-4 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_m4_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, -4), + z0_res = svld1sw_gather_offset_u64 (p0, z0, -4)) + +/* +** ld1sw_gather_0_u64_offset: +** ld1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svld1sw_gather_offset_u64 (p0, z0, 0)) + +/* +** ld1sw_gather_5_u64_offset: +** mov (x[0-9]+), #?5 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svld1sw_gather_offset_u64 (p0, z0, 5)) + +/* +** ld1sw_gather_6_u64_offset: +** mov (x[0-9]+), #?6 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_6_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 6), + z0_res = svld1sw_gather_offset_u64 (p0, z0, 6)) + +/* +** ld1sw_gather_7_u64_offset: +** mov (x[0-9]+), #?7 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_7_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 7), + z0_res = svld1sw_gather_offset_u64 (p0, z0, 7)) + +/* +** ld1sw_gather_8_u64_offset: +** ld1sw z0\.d, p0/z, \[z0\.d, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_8_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 8), + z0_res = svld1sw_gather_offset_u64 (p0, z0, 8)) + +/* +** ld1sw_gather_124_u64_offset: +** ld1sw z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_124_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 124), + z0_res = svld1sw_gather_offset_u64 (p0, z0, 124)) + +/* +** ld1sw_gather_128_u64_offset: +** mov (x[0-9]+), #?128 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_128_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_offset_u64 (p0, z0, 128), + z0_res = svld1sw_gather_offset_u64 (p0, z0, 128)) + +/* +** ld1sw_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?2 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svld1sw_gather_index_u64 (p0, z0, x0)) + +/* +** ld1sw_gather_m1_u64_index: +** mov (x[0-9]+), #?-4 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = 
svld1sw_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svld1sw_gather_index_u64 (p0, z0, -1)) + +/* +** ld1sw_gather_0_u64_index: +** ld1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svld1sw_gather_index_u64 (p0, z0, 0)) + +/* +** ld1sw_gather_5_u64_index: +** ld1sw z0\.d, p0/z, \[z0\.d, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svld1sw_gather_index_u64 (p0, z0, 5)) + +/* +** ld1sw_gather_31_u64_index: +** ld1sw z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svld1sw_gather_index_u64 (p0, z0, 31)) + +/* +** ld1sw_gather_32_u64_index: +** mov (x[0-9]+), #?128 +** ld1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1sw_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svld1sw_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svld1sw_gather_index_u64 (p0, z0, 32)) + +/* +** ld1sw_gather_x0_u64_s64offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_u64_s64offset, svuint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1sw_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sw_gather_tied1_u64_s64offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_u64_s64offset, svuint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1sw_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sw_gather_untied_u64_s64offset: +** ld1sw z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_u64_s64offset, svuint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64offset_u64 (p0, x0, z1), + z0_res = svld1sw_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1sw_gather_ext_u64_s64offset: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_u64_s64offset, svuint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sw_gather_x0_u64_u64offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_u64_u64offset, svuint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1sw_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sw_gather_tied1_u64_u64offset: +** ld1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_u64_u64offset, svuint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1sw_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1sw_gather_untied_u64_u64offset: +** ld1sw z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_u64_u64offset, svuint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64offset_u64 (p0, x0, z1), + z0_res = svld1sw_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1sw_gather_ext_u64_u64offset: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_u64_u64offset, svuint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** 
ld1sw_gather_x0_u64_s64index: +** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_u64_s64index, svuint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1sw_gather_index_u64 (p0, x0, z0)) + +/* +** ld1sw_gather_tied1_u64_s64index: +** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_u64_s64index, svuint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1sw_gather_index_u64 (p0, x0, z0)) + +/* +** ld1sw_gather_untied_u64_s64index: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_u64_s64index, svuint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64index_u64 (p0, x0, z1), + z0_res = svld1sw_gather_index_u64 (p0, x0, z1)) + +/* +** ld1sw_gather_ext_u64_s64index: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_u64_s64index, svuint64_t, int32_t, svint64_t, + z0_res = svld1sw_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1sw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1sw_gather_x0_u64_u64index: +** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_x0_u64_u64index, svuint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1sw_gather_index_u64 (p0, x0, z0)) + +/* +** ld1sw_gather_tied1_u64_u64index: +** ld1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_tied1_u64_u64index, svuint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1sw_gather_index_u64 (p0, x0, z0)) + +/* +** ld1sw_gather_untied_u64_u64index: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_untied_u64_u64index, svuint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64index_u64 (p0, x0, z1), + z0_res = svld1sw_gather_index_u64 (p0, x0, z1)) + +/* +** ld1sw_gather_ext_u64_u64index: +** ld1sw z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1sw_gather_ext_u64_u64index, svuint64_t, int32_t, svuint64_t, + z0_res = svld1sw_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1sw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_s64.c new file mode 100644 index 000000000..019a12b20 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sw_s64_base: +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_s64_base, svint64_t, int32_t, + z0 = svld1sw_s64 (p0, x0), + z0 = svld1sw_s64 (p0, x0)) + +/* +** ld1sw_s64_index: +** ld1sw z0\.d, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1sw_s64_index, svint64_t, int32_t, + z0 = svld1sw_s64 (p0, x0 + x1), + z0 = svld1sw_s64 (p0, x0 + x1)) + +/* +** ld1sw_s64_1: +** ld1sw z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_s64_1, svint64_t, int32_t, + z0 = svld1sw_s64 (p0, x0 + svcntd ()), + z0 = svld1sw_s64 (p0, x0 + svcntd ())) + +/* +** ld1sw_s64_7: +** ld1sw z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_s64_7, svint64_t, int32_t, + z0 = svld1sw_s64 (p0, x0 + svcntd () * 7), + z0 = svld1sw_s64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sw_s64_8: +** incb x0, all, mul #4 +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_s64_8, svint64_t, int32_t, + z0 = svld1sw_s64 (p0, x0 + svcntd () * 8), + z0 = svld1sw_s64 (p0, x0 + svcntd () * 8)) + +/* +** ld1sw_s64_m1: +** ld1sw z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_s64_m1, svint64_t, int32_t, + z0 = svld1sw_s64 (p0, x0 - svcntd ()), + z0 = svld1sw_s64 (p0, x0 - svcntd ())) + +/* +** ld1sw_s64_m8: +** ld1sw z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_s64_m8, svint64_t, int32_t, + z0 = svld1sw_s64 (p0, x0 - svcntd () * 8), + z0 = svld1sw_s64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sw_s64_m9: +** dech x0, all, mul #9 +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_s64_m9, svint64_t, int32_t, + z0 = svld1sw_s64 (p0, x0 - svcntd () * 9), + z0 = svld1sw_s64 (p0, x0 - svcntd () * 9)) + +/* +** ld1sw_vnum_s64_0: +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_s64_0, svint64_t, int32_t, + z0 = svld1sw_vnum_s64 (p0, x0, 0), + z0 = svld1sw_vnum_s64 (p0, x0, 0)) + +/* +** ld1sw_vnum_s64_1: +** ld1sw z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_s64_1, svint64_t, int32_t, + z0 = svld1sw_vnum_s64 (p0, x0, 1), + z0 = svld1sw_vnum_s64 (p0, x0, 1)) + +/* +** ld1sw_vnum_s64_7: +** ld1sw z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_s64_7, svint64_t, int32_t, + z0 = svld1sw_vnum_s64 (p0, x0, 7), + z0 = svld1sw_vnum_s64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sw_vnum_s64_8: +** incb x0, all, mul #4 +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_s64_8, svint64_t, int32_t, + z0 = svld1sw_vnum_s64 (p0, x0, 8), + z0 = svld1sw_vnum_s64 (p0, x0, 8)) + +/* +** ld1sw_vnum_s64_m1: +** ld1sw z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_s64_m1, svint64_t, int32_t, + z0 = svld1sw_vnum_s64 (p0, x0, -1), + z0 = svld1sw_vnum_s64 (p0, x0, -1)) + +/* +** ld1sw_vnum_s64_m8: +** ld1sw z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_s64_m8, svint64_t, int32_t, + z0 = svld1sw_vnum_s64 (p0, x0, -8), + z0 = svld1sw_vnum_s64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sw_vnum_s64_m9: +** dech x0, all, mul #9 +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_s64_m9, svint64_t, int32_t, + z0 = svld1sw_vnum_s64 (p0, x0, -9), + z0 = svld1sw_vnum_s64 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld1sw_vnum_s64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1sw z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_s64_x1, svint64_t, int32_t, + z0 = svld1sw_vnum_s64 (p0, x0, x1), + z0 = svld1sw_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_u64.c new file mode 100644 index 000000000..4c291c243 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1sw_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1sw_u64_base: +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_u64_base, svuint64_t, int32_t, + z0 = svld1sw_u64 (p0, x0), + z0 = svld1sw_u64 (p0, x0)) + +/* +** ld1sw_u64_index: +** ld1sw z0\.d, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1sw_u64_index, svuint64_t, int32_t, + z0 = svld1sw_u64 (p0, x0 + x1), + z0 = svld1sw_u64 (p0, x0 + x1)) + +/* +** ld1sw_u64_1: +** ld1sw z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_u64_1, svuint64_t, int32_t, + z0 = svld1sw_u64 (p0, x0 + svcntd ()), + z0 = svld1sw_u64 (p0, x0 + svcntd ())) + +/* +** ld1sw_u64_7: +** ld1sw z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_u64_7, svuint64_t, int32_t, + z0 = svld1sw_u64 (p0, x0 + svcntd () * 7), + z0 = svld1sw_u64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sw_u64_8: +** incb x0, all, mul #4 +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_u64_8, svuint64_t, int32_t, + z0 = svld1sw_u64 (p0, x0 + svcntd () * 8), + z0 = svld1sw_u64 (p0, x0 + svcntd () * 8)) + +/* +** ld1sw_u64_m1: +** ld1sw z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_u64_m1, svuint64_t, int32_t, + z0 = svld1sw_u64 (p0, x0 - svcntd ()), + z0 = svld1sw_u64 (p0, x0 - svcntd ())) + +/* +** ld1sw_u64_m8: +** ld1sw z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_u64_m8, svuint64_t, int32_t, + z0 = svld1sw_u64 (p0, x0 - svcntd () * 8), + z0 = svld1sw_u64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sw_u64_m9: +** dech x0, all, mul #9 +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_u64_m9, svuint64_t, int32_t, + z0 = svld1sw_u64 (p0, x0 - svcntd () * 9), + z0 = svld1sw_u64 (p0, x0 - svcntd () * 9)) + +/* +** ld1sw_vnum_u64_0: +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_u64_0, svuint64_t, int32_t, + z0 = svld1sw_vnum_u64 (p0, x0, 0), + z0 = svld1sw_vnum_u64 (p0, x0, 0)) + +/* +** ld1sw_vnum_u64_1: +** ld1sw z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_u64_1, svuint64_t, int32_t, + z0 = svld1sw_vnum_u64 (p0, x0, 1), + z0 = svld1sw_vnum_u64 (p0, x0, 1)) + +/* +** ld1sw_vnum_u64_7: +** ld1sw z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_u64_7, svuint64_t, int32_t, + z0 = svld1sw_vnum_u64 (p0, x0, 7), + z0 = svld1sw_vnum_u64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1sw_vnum_u64_8: +** incb x0, all, mul #4 +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_u64_8, svuint64_t, int32_t, + z0 = svld1sw_vnum_u64 (p0, x0, 8), + z0 = svld1sw_vnum_u64 (p0, x0, 8)) + +/* +** ld1sw_vnum_u64_m1: +** ld1sw z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_u64_m1, svuint64_t, int32_t, + z0 = svld1sw_vnum_u64 (p0, x0, -1), + z0 = svld1sw_vnum_u64 (p0, x0, -1)) + +/* +** ld1sw_vnum_u64_m8: +** ld1sw z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_u64_m8, svuint64_t, int32_t, + z0 = svld1sw_vnum_u64 (p0, x0, -8), + z0 = svld1sw_vnum_u64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1sw_vnum_u64_m9: +** dech x0, all, mul #9 +** ld1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_u64_m9, svuint64_t, int32_t, + z0 = svld1sw_vnum_u64 (p0, x0, -9), + z0 = svld1sw_vnum_u64 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1sw_vnum_u64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1sw z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1sw_vnum_u64_x1, svuint64_t, int32_t, + z0 = svld1sw_vnum_u64 (p0, x0, x1), + z0 = svld1sw_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s32.c new file mode 100644 index 000000000..a9c418265 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s32.c @@ -0,0 +1,131 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_gather_s32_tied1: +** ld1b z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_s32 (p0, z0), + z0_res = svld1ub_gather_s32 (p0, z0)) + +/* +** ld1ub_gather_s32_untied: +** ld1b z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_s32 (p0, z1), + z0_res = svld1ub_gather_s32 (p0, z1)) + +/* +** ld1ub_gather_x0_s32_offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svld1ub_gather_offset_s32 (p0, z0, x0)) + +/* +** ld1ub_gather_m1_s32_offset: +** mov (x[0-9]+), #?-1 +** ld1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_m1_s32_offset, svint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, -1), + z0_res = svld1ub_gather_offset_s32 (p0, z0, -1)) + +/* +** ld1ub_gather_0_s32_offset: +** ld1b z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svld1ub_gather_offset_s32 (p0, z0, 0)) + +/* +** ld1ub_gather_5_s32_offset: +** ld1b z0\.s, p0/z, \[z0\.s, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svld1ub_gather_offset_s32 (p0, z0, 5)) + +/* +** ld1ub_gather_31_s32_offset: +** ld1b z0\.s, p0/z, \[z0\.s, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_31_s32_offset, svint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, 31), + z0_res = svld1ub_gather_offset_s32 (p0, z0, 31)) + +/* +** ld1ub_gather_32_s32_offset: +** mov 
(x[0-9]+), #?32 +** ld1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_32_s32_offset, svint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_s32 (p0, z0, 32), + z0_res = svld1ub_gather_offset_s32 (p0, z0, 32)) + +/* +** ld1ub_gather_x0_s32_s32offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_s32_s32offset, svint32_t, uint8_t, svint32_t, + z0_res = svld1ub_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1ub_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1ub_gather_tied1_s32_s32offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_s32_s32offset, svint32_t, uint8_t, svint32_t, + z0_res = svld1ub_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1ub_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1ub_gather_untied_s32_s32offset: +** ld1b z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_s32_s32offset, svint32_t, uint8_t, svint32_t, + z0_res = svld1ub_gather_s32offset_s32 (p0, x0, z1), + z0_res = svld1ub_gather_offset_s32 (p0, x0, z1)) + +/* +** ld1ub_gather_x0_s32_u32offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_s32_u32offset, svint32_t, uint8_t, svuint32_t, + z0_res = svld1ub_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1ub_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1ub_gather_tied1_s32_u32offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_s32_u32offset, svint32_t, uint8_t, svuint32_t, + z0_res = svld1ub_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1ub_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1ub_gather_untied_s32_u32offset: +** ld1b z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_s32_u32offset, svint32_t, uint8_t, svuint32_t, + z0_res = svld1ub_gather_u32offset_s32 (p0, x0, z1), + z0_res = svld1ub_gather_offset_s32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s64.c new file mode 100644 index 000000000..99af86ddf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_s64.c @@ -0,0 +1,149 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_gather_s64_tied1: +** ld1b z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_s64 (p0, z0), + z0_res = svld1ub_gather_s64 (p0, z0)) + +/* +** ld1ub_gather_s64_untied: +** ld1b z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_s64 (p0, z1), + z0_res = svld1ub_gather_s64 (p0, z1)) + +/* +** ld1ub_gather_x0_s64_offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svld1ub_gather_offset_s64 (p0, z0, x0)) + +/* +** ld1ub_gather_m1_s64_offset: +** mov (x[0-9]+), #?-1 +** ld1b z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_m1_s64_offset, svint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, -1), + z0_res = svld1ub_gather_offset_s64 (p0, z0, -1)) + +/* +** ld1ub_gather_0_s64_offset: +** ld1b z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svld1ub_gather_offset_s64 (p0, z0, 0)) + +/* +** ld1ub_gather_5_s64_offset: +** ld1b z0\.d, p0/z, \[z0\.d, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svld1ub_gather_offset_s64 (p0, z0, 5)) + +/* +** ld1ub_gather_31_s64_offset: +** ld1b z0\.d, p0/z, \[z0\.d, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_31_s64_offset, svint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, 31), + z0_res = svld1ub_gather_offset_s64 (p0, z0, 31)) + +/* +** ld1ub_gather_32_s64_offset: +** mov (x[0-9]+), #?32 +** ld1b z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_32_s64_offset, svint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_s64 (p0, z0, 32), + z0_res = svld1ub_gather_offset_s64 (p0, z0, 32)) + +/* +** ld1ub_gather_x0_s64_s64offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_s64_s64offset, svint64_t, uint8_t, svint64_t, + z0_res = svld1ub_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1ub_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1ub_gather_tied1_s64_s64offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_s64_s64offset, svint64_t, uint8_t, svint64_t, + z0_res = svld1ub_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1ub_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1ub_gather_untied_s64_s64offset: +** ld1b z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_s64_s64offset, svint64_t, uint8_t, svint64_t, + z0_res = svld1ub_gather_s64offset_s64 (p0, x0, z1), + z0_res = svld1ub_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1ub_gather_ext_s64_s64offset: +** ld1b z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_ext_s64_s64offset, svint64_t, uint8_t, svint64_t, + z0_res = svld1ub_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1ub_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1ub_gather_x0_s64_u64offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_s64_u64offset, svint64_t, uint8_t, svuint64_t, + z0_res = svld1ub_gather_u64offset_s64 
(p0, x0, z0), + z0_res = svld1ub_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1ub_gather_tied1_s64_u64offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_s64_u64offset, svint64_t, uint8_t, svuint64_t, + z0_res = svld1ub_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1ub_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1ub_gather_untied_s64_u64offset: +** ld1b z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_s64_u64offset, svint64_t, uint8_t, svuint64_t, + z0_res = svld1ub_gather_u64offset_s64 (p0, x0, z1), + z0_res = svld1ub_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1ub_gather_ext_s64_u64offset: +** ld1b z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_ext_s64_u64offset, svint64_t, uint8_t, svuint64_t, + z0_res = svld1ub_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1ub_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u32.c new file mode 100644 index 000000000..77c7e0a2d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u32.c @@ -0,0 +1,131 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_gather_u32_tied1: +** ld1b z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_u32 (p0, z0), + z0_res = svld1ub_gather_u32 (p0, z0)) + +/* +** ld1ub_gather_u32_untied: +** ld1b z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_u32 (p0, z1), + z0_res = svld1ub_gather_u32 (p0, z1)) + +/* +** ld1ub_gather_x0_u32_offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svld1ub_gather_offset_u32 (p0, z0, x0)) + +/* +** ld1ub_gather_m1_u32_offset: +** mov (x[0-9]+), #?-1 +** ld1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_m1_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, -1), + z0_res = svld1ub_gather_offset_u32 (p0, z0, -1)) + +/* +** ld1ub_gather_0_u32_offset: +** ld1b z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svld1ub_gather_offset_u32 (p0, z0, 0)) + +/* +** ld1ub_gather_5_u32_offset: +** ld1b z0\.s, p0/z, \[z0\.s, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svld1ub_gather_offset_u32 (p0, z0, 5)) + +/* +** ld1ub_gather_31_u32_offset: +** ld1b z0\.s, p0/z, \[z0\.s, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_31_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, 31), + z0_res = svld1ub_gather_offset_u32 (p0, z0, 31)) + +/* +** ld1ub_gather_32_u32_offset: +** mov (x[0-9]+), #?32 +** ld1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_32_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1ub_gather_u32base_offset_u32 (p0, z0, 32), + z0_res = svld1ub_gather_offset_u32 (p0, z0, 32)) + +/* +** 
ld1ub_gather_x0_u32_s32offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_u32_s32offset, svuint32_t, uint8_t, svint32_t, + z0_res = svld1ub_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1ub_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1ub_gather_tied1_u32_s32offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_u32_s32offset, svuint32_t, uint8_t, svint32_t, + z0_res = svld1ub_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1ub_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1ub_gather_untied_u32_s32offset: +** ld1b z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_u32_s32offset, svuint32_t, uint8_t, svint32_t, + z0_res = svld1ub_gather_s32offset_u32 (p0, x0, z1), + z0_res = svld1ub_gather_offset_u32 (p0, x0, z1)) + +/* +** ld1ub_gather_x0_u32_u32offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_u32_u32offset, svuint32_t, uint8_t, svuint32_t, + z0_res = svld1ub_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1ub_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1ub_gather_tied1_u32_u32offset: +** ld1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_u32_u32offset, svuint32_t, uint8_t, svuint32_t, + z0_res = svld1ub_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1ub_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1ub_gather_untied_u32_u32offset: +** ld1b z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_u32_u32offset, svuint32_t, uint8_t, svuint32_t, + z0_res = svld1ub_gather_u32offset_u32 (p0, x0, z1), + z0_res = svld1ub_gather_offset_u32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u64.c new file mode 100644 index 000000000..b605f8b67 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_gather_u64.c @@ -0,0 +1,149 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_gather_u64_tied1: +** ld1b z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_u64 (p0, z0), + z0_res = svld1ub_gather_u64 (p0, z0)) + +/* +** ld1ub_gather_u64_untied: +** ld1b z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_u64 (p0, z1), + z0_res = svld1ub_gather_u64 (p0, z1)) + +/* +** ld1ub_gather_x0_u64_offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svld1ub_gather_offset_u64 (p0, z0, x0)) + +/* +** ld1ub_gather_m1_u64_offset: +** mov (x[0-9]+), #?-1 +** ld1b z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_m1_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, -1), + z0_res = svld1ub_gather_offset_u64 (p0, z0, -1)) + +/* +** ld1ub_gather_0_u64_offset: +** ld1b z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svld1ub_gather_offset_u64 (p0, z0, 0)) + +/* +** ld1ub_gather_5_u64_offset: +** ld1b z0\.d, p0/z, \[z0\.d, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svld1ub_gather_offset_u64 (p0, z0, 5)) + +/* +** ld1ub_gather_31_u64_offset: +** ld1b z0\.d, p0/z, \[z0\.d, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_31_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, 31), + z0_res = svld1ub_gather_offset_u64 (p0, z0, 31)) + +/* +** ld1ub_gather_32_u64_offset: +** mov (x[0-9]+), #?32 +** ld1b z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1ub_gather_32_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1ub_gather_u64base_offset_u64 (p0, z0, 32), + z0_res = svld1ub_gather_offset_u64 (p0, z0, 32)) + +/* +** ld1ub_gather_x0_u64_s64offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_u64_s64offset, svuint64_t, uint8_t, svint64_t, + z0_res = svld1ub_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1ub_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1ub_gather_tied1_u64_s64offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_u64_s64offset, svuint64_t, uint8_t, svint64_t, + z0_res = svld1ub_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1ub_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1ub_gather_untied_u64_s64offset: +** ld1b z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_u64_s64offset, svuint64_t, uint8_t, svint64_t, + z0_res = svld1ub_gather_s64offset_u64 (p0, x0, z1), + z0_res = svld1ub_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1ub_gather_ext_u64_s64offset: +** ld1b z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_ext_u64_s64offset, svuint64_t, uint8_t, svint64_t, + z0_res = svld1ub_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1ub_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1ub_gather_x0_u64_u64offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_x0_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + z0_res = 
svld1ub_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1ub_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1ub_gather_tied1_u64_u64offset: +** ld1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_tied1_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + z0_res = svld1ub_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1ub_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1ub_gather_untied_u64_u64offset: +** ld1b z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_untied_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + z0_res = svld1ub_gather_u64offset_u64 (p0, x0, z1), + z0_res = svld1ub_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1ub_gather_ext_u64_u64offset: +** ld1b z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1ub_gather_ext_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + z0_res = svld1ub_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1ub_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s16.c new file mode 100644 index 000000000..c492086b5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s16.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_s16_base: +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s16_base, svint16_t, uint8_t, + z0 = svld1ub_s16 (p0, x0), + z0 = svld1ub_s16 (p0, x0)) + +/* +** ld1ub_s16_index: +** ld1b z0\.h, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1ub_s16_index, svint16_t, uint8_t, + z0 = svld1ub_s16 (p0, x0 + x1), + z0 = svld1ub_s16 (p0, x0 + x1)) + +/* +** ld1ub_s16_1: +** ld1b z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s16_1, svint16_t, uint8_t, + z0 = svld1ub_s16 (p0, x0 + svcnth ()), + z0 = svld1ub_s16 (p0, x0 + svcnth ())) + +/* +** ld1ub_s16_7: +** ld1b z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s16_7, svint16_t, uint8_t, + z0 = svld1ub_s16 (p0, x0 + svcnth () * 7), + z0 = svld1ub_s16 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_s16_8: +** incb x0, all, mul #4 +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s16_8, svint16_t, uint8_t, + z0 = svld1ub_s16 (p0, x0 + svcnth () * 8), + z0 = svld1ub_s16 (p0, x0 + svcnth () * 8)) + +/* +** ld1ub_s16_m1: +** ld1b z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s16_m1, svint16_t, uint8_t, + z0 = svld1ub_s16 (p0, x0 - svcnth ()), + z0 = svld1ub_s16 (p0, x0 - svcnth ())) + +/* +** ld1ub_s16_m8: +** ld1b z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s16_m8, svint16_t, uint8_t, + z0 = svld1ub_s16 (p0, x0 - svcnth () * 8), + z0 = svld1ub_s16 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1ub_s16_m9: +** dech x0, all, mul #9 +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s16_m9, svint16_t, uint8_t, + z0 = svld1ub_s16 (p0, x0 - svcnth () * 9), + z0 = svld1ub_s16 (p0, x0 - svcnth () * 9)) + +/* +** ld1ub_vnum_s16_0: +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s16_0, svint16_t, uint8_t, + z0 = svld1ub_vnum_s16 (p0, x0, 0), + z0 = svld1ub_vnum_s16 (p0, x0, 0)) + +/* +** ld1ub_vnum_s16_1: +** ld1b z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s16_1, svint16_t, uint8_t, + z0 = svld1ub_vnum_s16 (p0, x0, 1), + z0 = svld1ub_vnum_s16 (p0, x0, 1)) + +/* +** ld1ub_vnum_s16_7: +** ld1b z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s16_7, svint16_t, uint8_t, + z0 = svld1ub_vnum_s16 (p0, x0, 7), + z0 = svld1ub_vnum_s16 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_s16_8: +** incb x0, all, mul #4 +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s16_8, svint16_t, uint8_t, + z0 = svld1ub_vnum_s16 (p0, x0, 8), + z0 = svld1ub_vnum_s16 (p0, x0, 8)) + +/* +** ld1ub_vnum_s16_m1: +** ld1b z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s16_m1, svint16_t, uint8_t, + z0 = svld1ub_vnum_s16 (p0, x0, -1), + z0 = svld1ub_vnum_s16 (p0, x0, -1)) + +/* +** ld1ub_vnum_s16_m8: +** ld1b z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s16_m8, svint16_t, uint8_t, + z0 = svld1ub_vnum_s16 (p0, x0, -8), + z0 = svld1ub_vnum_s16 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_s16_m9: +** dech x0, all, mul #9 +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s16_m9, svint16_t, uint8_t, + z0 = svld1ub_vnum_s16 (p0, x0, -9), + z0 = svld1ub_vnum_s16 (p0, x0, -9)) + +/* +** ld1ub_vnum_s16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1b z0\.h, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1b z0\.h, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1ub_vnum_s16_x1, svint16_t, uint8_t, + z0 = svld1ub_vnum_s16 (p0, x0, x1), + z0 = svld1ub_vnum_s16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s32.c new file mode 100644 index 000000000..b2f8c4b04 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s32.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_s32_base: +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s32_base, svint32_t, uint8_t, + z0 = svld1ub_s32 (p0, x0), + z0 = svld1ub_s32 (p0, x0)) + +/* +** ld1ub_s32_index: +** ld1b z0\.s, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1ub_s32_index, svint32_t, uint8_t, + z0 = svld1ub_s32 (p0, x0 + x1), + z0 = svld1ub_s32 (p0, x0 + x1)) + +/* +** ld1ub_s32_1: +** ld1b z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s32_1, svint32_t, uint8_t, + z0 = svld1ub_s32 (p0, x0 + svcntw ()), + z0 = svld1ub_s32 (p0, x0 + svcntw ())) + +/* +** ld1ub_s32_7: +** ld1b z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s32_7, svint32_t, uint8_t, + z0 = svld1ub_s32 (p0, x0 + svcntw () * 7), + z0 = svld1ub_s32 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1ub_s32_8: +** incb x0, all, mul #2 +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s32_8, svint32_t, uint8_t, + z0 = svld1ub_s32 (p0, x0 + svcntw () * 8), + z0 = svld1ub_s32 (p0, x0 + svcntw () * 8)) + +/* +** ld1ub_s32_m1: +** ld1b z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s32_m1, svint32_t, uint8_t, + z0 = svld1ub_s32 (p0, x0 - svcntw ()), + z0 = svld1ub_s32 (p0, x0 - svcntw ())) + +/* +** ld1ub_s32_m8: +** ld1b z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s32_m8, svint32_t, uint8_t, + z0 = svld1ub_s32 (p0, x0 - svcntw () * 8), + z0 = svld1ub_s32 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_s32_m9: +** decw x0, all, mul #9 +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s32_m9, svint32_t, uint8_t, + z0 = svld1ub_s32 (p0, x0 - svcntw () * 9), + z0 = svld1ub_s32 (p0, x0 - svcntw () * 9)) + +/* +** ld1ub_vnum_s32_0: +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s32_0, svint32_t, uint8_t, + z0 = svld1ub_vnum_s32 (p0, x0, 0), + z0 = svld1ub_vnum_s32 (p0, x0, 0)) + +/* +** ld1ub_vnum_s32_1: +** ld1b z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s32_1, svint32_t, uint8_t, + z0 = svld1ub_vnum_s32 (p0, x0, 1), + z0 = svld1ub_vnum_s32 (p0, x0, 1)) + +/* +** ld1ub_vnum_s32_7: +** ld1b z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s32_7, svint32_t, uint8_t, + z0 = svld1ub_vnum_s32 (p0, x0, 7), + z0 = svld1ub_vnum_s32 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_s32_8: +** incb x0, all, mul #2 +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s32_8, svint32_t, uint8_t, + z0 = svld1ub_vnum_s32 (p0, x0, 8), + z0 = svld1ub_vnum_s32 (p0, x0, 8)) + +/* +** ld1ub_vnum_s32_m1: +** ld1b z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s32_m1, svint32_t, uint8_t, + z0 = svld1ub_vnum_s32 (p0, x0, -1), + z0 = svld1ub_vnum_s32 (p0, x0, -1)) + +/* +** ld1ub_vnum_s32_m8: +** ld1b z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s32_m8, svint32_t, uint8_t, + z0 = svld1ub_vnum_s32 (p0, x0, -8), + z0 = svld1ub_vnum_s32 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_s32_m9: +** decw x0, all, mul #9 +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s32_m9, svint32_t, uint8_t, + z0 = svld1ub_vnum_s32 (p0, x0, -9), + z0 = svld1ub_vnum_s32 (p0, x0, -9)) + +/* +** ld1ub_vnum_s32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1b z0\.s, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1b z0\.s, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1ub_vnum_s32_x1, svint32_t, uint8_t, + z0 = svld1ub_vnum_s32 (p0, x0, x1), + z0 = svld1ub_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s64.c new file mode 100644 index 000000000..d8694bf28 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_s64.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_s64_base: +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s64_base, svint64_t, uint8_t, + z0 = svld1ub_s64 (p0, x0), + z0 = svld1ub_s64 (p0, x0)) + +/* +** ld1ub_s64_index: +** ld1b z0\.d, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1ub_s64_index, svint64_t, uint8_t, + z0 = svld1ub_s64 (p0, x0 + x1), + z0 = svld1ub_s64 (p0, x0 + x1)) + +/* +** ld1ub_s64_1: +** ld1b z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s64_1, svint64_t, uint8_t, + z0 = svld1ub_s64 (p0, x0 + svcntd ()), + z0 = svld1ub_s64 (p0, x0 + svcntd ())) + +/* +** ld1ub_s64_7: +** ld1b z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s64_7, svint64_t, uint8_t, + z0 = svld1ub_s64 (p0, x0 + svcntd () * 7), + z0 = svld1ub_s64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_s64_8: +** incb x0 +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s64_8, svint64_t, uint8_t, + z0 = svld1ub_s64 (p0, x0 + svcntd () * 8), + z0 = svld1ub_s64 (p0, x0 + svcntd () * 8)) + +/* +** ld1ub_s64_m1: +** ld1b z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s64_m1, svint64_t, uint8_t, + z0 = svld1ub_s64 (p0, x0 - svcntd ()), + z0 = svld1ub_s64 (p0, x0 - svcntd ())) + +/* +** ld1ub_s64_m8: +** ld1b z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_s64_m8, svint64_t, uint8_t, + z0 = svld1ub_s64 (p0, x0 - svcntd () * 8), + z0 = svld1ub_s64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_s64_m9: +** decd x0, all, mul #9 +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_s64_m9, svint64_t, uint8_t, + z0 = svld1ub_s64 (p0, x0 - svcntd () * 9), + z0 = svld1ub_s64 (p0, x0 - svcntd () * 9)) + +/* +** ld1ub_vnum_s64_0: +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s64_0, svint64_t, uint8_t, + z0 = svld1ub_vnum_s64 (p0, x0, 0), + z0 = svld1ub_vnum_s64 (p0, x0, 0)) + +/* +** ld1ub_vnum_s64_1: +** ld1b z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s64_1, svint64_t, uint8_t, + z0 = svld1ub_vnum_s64 (p0, x0, 1), + z0 = svld1ub_vnum_s64 (p0, x0, 1)) + +/* +** ld1ub_vnum_s64_7: +** ld1b z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s64_7, svint64_t, uint8_t, + z0 = svld1ub_vnum_s64 (p0, x0, 7), + z0 = svld1ub_vnum_s64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_s64_8: +** incb x0 +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s64_8, svint64_t, uint8_t, + z0 = svld1ub_vnum_s64 (p0, x0, 8), + z0 = svld1ub_vnum_s64 (p0, x0, 8)) + +/* +** ld1ub_vnum_s64_m1: +** ld1b z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s64_m1, svint64_t, uint8_t, + z0 = svld1ub_vnum_s64 (p0, x0, -1), + z0 = svld1ub_vnum_s64 (p0, x0, -1)) + +/* +** ld1ub_vnum_s64_m8: +** ld1b z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s64_m8, svint64_t, uint8_t, + z0 = svld1ub_vnum_s64 (p0, x0, -8), + z0 = svld1ub_vnum_s64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1ub_vnum_s64_m9: +** decd x0, all, mul #9 +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_s64_m9, svint64_t, uint8_t, + z0 = svld1ub_vnum_s64 (p0, x0, -9), + z0 = svld1ub_vnum_s64 (p0, x0, -9)) + +/* +** ld1ub_vnum_s64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1b z0\.d, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1b z0\.d, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1ub_vnum_s64_x1, svint64_t, uint8_t, + z0 = svld1ub_vnum_s64 (p0, x0, x1), + z0 = svld1ub_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u16.c new file mode 100644 index 000000000..049234ee4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u16.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_u16_base: +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u16_base, svuint16_t, uint8_t, + z0 = svld1ub_u16 (p0, x0), + z0 = svld1ub_u16 (p0, x0)) + +/* +** ld1ub_u16_index: +** ld1b z0\.h, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1ub_u16_index, svuint16_t, uint8_t, + z0 = svld1ub_u16 (p0, x0 + x1), + z0 = svld1ub_u16 (p0, x0 + x1)) + +/* +** ld1ub_u16_1: +** ld1b z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u16_1, svuint16_t, uint8_t, + z0 = svld1ub_u16 (p0, x0 + svcnth ()), + z0 = svld1ub_u16 (p0, x0 + svcnth ())) + +/* +** ld1ub_u16_7: +** ld1b z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u16_7, svuint16_t, uint8_t, + z0 = svld1ub_u16 (p0, x0 + svcnth () * 7), + z0 = svld1ub_u16 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_u16_8: +** incb x0, all, mul #4 +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u16_8, svuint16_t, uint8_t, + z0 = svld1ub_u16 (p0, x0 + svcnth () * 8), + z0 = svld1ub_u16 (p0, x0 + svcnth () * 8)) + +/* +** ld1ub_u16_m1: +** ld1b z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u16_m1, svuint16_t, uint8_t, + z0 = svld1ub_u16 (p0, x0 - svcnth ()), + z0 = svld1ub_u16 (p0, x0 - svcnth ())) + +/* +** ld1ub_u16_m8: +** ld1b z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u16_m8, svuint16_t, uint8_t, + z0 = svld1ub_u16 (p0, x0 - svcnth () * 8), + z0 = svld1ub_u16 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_u16_m9: +** dech x0, all, mul #9 +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u16_m9, svuint16_t, uint8_t, + z0 = svld1ub_u16 (p0, x0 - svcnth () * 9), + z0 = svld1ub_u16 (p0, x0 - svcnth () * 9)) + +/* +** ld1ub_vnum_u16_0: +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u16_0, svuint16_t, uint8_t, + z0 = svld1ub_vnum_u16 (p0, x0, 0), + z0 = svld1ub_vnum_u16 (p0, x0, 0)) + +/* +** ld1ub_vnum_u16_1: +** ld1b z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u16_1, svuint16_t, uint8_t, + z0 = svld1ub_vnum_u16 (p0, x0, 1), + z0 = svld1ub_vnum_u16 (p0, x0, 1)) + +/* +** ld1ub_vnum_u16_7: +** ld1b z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u16_7, svuint16_t, uint8_t, + z0 = svld1ub_vnum_u16 (p0, x0, 7), + z0 = svld1ub_vnum_u16 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1ub_vnum_u16_8: +** incb x0, all, mul #4 +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u16_8, svuint16_t, uint8_t, + z0 = svld1ub_vnum_u16 (p0, x0, 8), + z0 = svld1ub_vnum_u16 (p0, x0, 8)) + +/* +** ld1ub_vnum_u16_m1: +** ld1b z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u16_m1, svuint16_t, uint8_t, + z0 = svld1ub_vnum_u16 (p0, x0, -1), + z0 = svld1ub_vnum_u16 (p0, x0, -1)) + +/* +** ld1ub_vnum_u16_m8: +** ld1b z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u16_m8, svuint16_t, uint8_t, + z0 = svld1ub_vnum_u16 (p0, x0, -8), + z0 = svld1ub_vnum_u16 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_u16_m9: +** dech x0, all, mul #9 +** ld1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u16_m9, svuint16_t, uint8_t, + z0 = svld1ub_vnum_u16 (p0, x0, -9), + z0 = svld1ub_vnum_u16 (p0, x0, -9)) + +/* +** ld1ub_vnum_u16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1b z0\.h, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1b z0\.h, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1ub_vnum_u16_x1, svuint16_t, uint8_t, + z0 = svld1ub_vnum_u16 (p0, x0, x1), + z0 = svld1ub_vnum_u16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u32.c new file mode 100644 index 000000000..58d2ef527 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u32.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_u32_base: +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u32_base, svuint32_t, uint8_t, + z0 = svld1ub_u32 (p0, x0), + z0 = svld1ub_u32 (p0, x0)) + +/* +** ld1ub_u32_index: +** ld1b z0\.s, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1ub_u32_index, svuint32_t, uint8_t, + z0 = svld1ub_u32 (p0, x0 + x1), + z0 = svld1ub_u32 (p0, x0 + x1)) + +/* +** ld1ub_u32_1: +** ld1b z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u32_1, svuint32_t, uint8_t, + z0 = svld1ub_u32 (p0, x0 + svcntw ()), + z0 = svld1ub_u32 (p0, x0 + svcntw ())) + +/* +** ld1ub_u32_7: +** ld1b z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u32_7, svuint32_t, uint8_t, + z0 = svld1ub_u32 (p0, x0 + svcntw () * 7), + z0 = svld1ub_u32 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_u32_8: +** incb x0, all, mul #2 +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u32_8, svuint32_t, uint8_t, + z0 = svld1ub_u32 (p0, x0 + svcntw () * 8), + z0 = svld1ub_u32 (p0, x0 + svcntw () * 8)) + +/* +** ld1ub_u32_m1: +** ld1b z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u32_m1, svuint32_t, uint8_t, + z0 = svld1ub_u32 (p0, x0 - svcntw ()), + z0 = svld1ub_u32 (p0, x0 - svcntw ())) + +/* +** ld1ub_u32_m8: +** ld1b z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u32_m8, svuint32_t, uint8_t, + z0 = svld1ub_u32 (p0, x0 - svcntw () * 8), + z0 = svld1ub_u32 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1ub_u32_m9: +** decw x0, all, mul #9 +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u32_m9, svuint32_t, uint8_t, + z0 = svld1ub_u32 (p0, x0 - svcntw () * 9), + z0 = svld1ub_u32 (p0, x0 - svcntw () * 9)) + +/* +** ld1ub_vnum_u32_0: +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u32_0, svuint32_t, uint8_t, + z0 = svld1ub_vnum_u32 (p0, x0, 0), + z0 = svld1ub_vnum_u32 (p0, x0, 0)) + +/* +** ld1ub_vnum_u32_1: +** ld1b z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u32_1, svuint32_t, uint8_t, + z0 = svld1ub_vnum_u32 (p0, x0, 1), + z0 = svld1ub_vnum_u32 (p0, x0, 1)) + +/* +** ld1ub_vnum_u32_7: +** ld1b z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u32_7, svuint32_t, uint8_t, + z0 = svld1ub_vnum_u32 (p0, x0, 7), + z0 = svld1ub_vnum_u32 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_u32_8: +** incb x0, all, mul #2 +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u32_8, svuint32_t, uint8_t, + z0 = svld1ub_vnum_u32 (p0, x0, 8), + z0 = svld1ub_vnum_u32 (p0, x0, 8)) + +/* +** ld1ub_vnum_u32_m1: +** ld1b z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u32_m1, svuint32_t, uint8_t, + z0 = svld1ub_vnum_u32 (p0, x0, -1), + z0 = svld1ub_vnum_u32 (p0, x0, -1)) + +/* +** ld1ub_vnum_u32_m8: +** ld1b z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u32_m8, svuint32_t, uint8_t, + z0 = svld1ub_vnum_u32 (p0, x0, -8), + z0 = svld1ub_vnum_u32 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_u32_m9: +** decw x0, all, mul #9 +** ld1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u32_m9, svuint32_t, uint8_t, + z0 = svld1ub_vnum_u32 (p0, x0, -9), + z0 = svld1ub_vnum_u32 (p0, x0, -9)) + +/* +** ld1ub_vnum_u32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1b z0\.s, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1b z0\.s, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1ub_vnum_u32_x1, svuint32_t, uint8_t, + z0 = svld1ub_vnum_u32 (p0, x0, x1), + z0 = svld1ub_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u64.c new file mode 100644 index 000000000..46d7250f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1ub_u64.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1ub_u64_base: +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u64_base, svuint64_t, uint8_t, + z0 = svld1ub_u64 (p0, x0), + z0 = svld1ub_u64 (p0, x0)) + +/* +** ld1ub_u64_index: +** ld1b z0\.d, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld1ub_u64_index, svuint64_t, uint8_t, + z0 = svld1ub_u64 (p0, x0 + x1), + z0 = svld1ub_u64 (p0, x0 + x1)) + +/* +** ld1ub_u64_1: +** ld1b z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u64_1, svuint64_t, uint8_t, + z0 = svld1ub_u64 (p0, x0 + svcntd ()), + z0 = svld1ub_u64 (p0, x0 + svcntd ())) + +/* +** ld1ub_u64_7: +** ld1b z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u64_7, svuint64_t, uint8_t, + z0 = svld1ub_u64 (p0, x0 + svcntd () * 7), + z0 = svld1ub_u64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1ub_u64_8: +** incb x0 +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u64_8, svuint64_t, uint8_t, + z0 = svld1ub_u64 (p0, x0 + svcntd () * 8), + z0 = svld1ub_u64 (p0, x0 + svcntd () * 8)) + +/* +** ld1ub_u64_m1: +** ld1b z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u64_m1, svuint64_t, uint8_t, + z0 = svld1ub_u64 (p0, x0 - svcntd ()), + z0 = svld1ub_u64 (p0, x0 - svcntd ())) + +/* +** ld1ub_u64_m8: +** ld1b z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_u64_m8, svuint64_t, uint8_t, + z0 = svld1ub_u64 (p0, x0 - svcntd () * 8), + z0 = svld1ub_u64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_u64_m9: +** decd x0, all, mul #9 +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_u64_m9, svuint64_t, uint8_t, + z0 = svld1ub_u64 (p0, x0 - svcntd () * 9), + z0 = svld1ub_u64 (p0, x0 - svcntd () * 9)) + +/* +** ld1ub_vnum_u64_0: +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u64_0, svuint64_t, uint8_t, + z0 = svld1ub_vnum_u64 (p0, x0, 0), + z0 = svld1ub_vnum_u64 (p0, x0, 0)) + +/* +** ld1ub_vnum_u64_1: +** ld1b z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u64_1, svuint64_t, uint8_t, + z0 = svld1ub_vnum_u64 (p0, x0, 1), + z0 = svld1ub_vnum_u64 (p0, x0, 1)) + +/* +** ld1ub_vnum_u64_7: +** ld1b z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u64_7, svuint64_t, uint8_t, + z0 = svld1ub_vnum_u64 (p0, x0, 7), + z0 = svld1ub_vnum_u64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_u64_8: +** incb x0 +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u64_8, svuint64_t, uint8_t, + z0 = svld1ub_vnum_u64 (p0, x0, 8), + z0 = svld1ub_vnum_u64 (p0, x0, 8)) + +/* +** ld1ub_vnum_u64_m1: +** ld1b z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u64_m1, svuint64_t, uint8_t, + z0 = svld1ub_vnum_u64 (p0, x0, -1), + z0 = svld1ub_vnum_u64 (p0, x0, -1)) + +/* +** ld1ub_vnum_u64_m8: +** ld1b z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u64_m8, svuint64_t, uint8_t, + z0 = svld1ub_vnum_u64 (p0, x0, -8), + z0 = svld1ub_vnum_u64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1ub_vnum_u64_m9: +** decd x0, all, mul #9 +** ld1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1ub_vnum_u64_m9, svuint64_t, uint8_t, + z0 = svld1ub_vnum_u64 (p0, x0, -9), + z0 = svld1ub_vnum_u64 (p0, x0, -9)) + +/* +** ld1ub_vnum_u64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld1b z0\.d, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld1b z0\.d, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld1ub_vnum_u64_x1, svuint64_t, uint8_t, + z0 = svld1ub_vnum_u64 (p0, x0, x1), + z0 = svld1ub_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s32.c new file mode 100644 index 000000000..84fb5c335 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s32.c @@ -0,0 +1,252 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uh_gather_s32_tied1: +** ld1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_s32 (p0, z0), + z0_res = svld1uh_gather_s32 (p0, z0)) + +/* +** ld1uh_gather_s32_untied: +** ld1h z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_s32 (p0, z1), + z0_res = svld1uh_gather_s32 (p0, z1)) + +/* +** ld1uh_gather_x0_s32_offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svld1uh_gather_offset_s32 (p0, z0, x0)) + +/* +** ld1uh_gather_m2_s32_offset: +** mov (x[0-9]+), #?-2 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_m2_s32_offset, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, -2), + z0_res = svld1uh_gather_offset_s32 (p0, z0, -2)) + +/* +** ld1uh_gather_0_s32_offset: +** ld1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svld1uh_gather_offset_s32 (p0, z0, 0)) + +/* +** ld1uh_gather_5_s32_offset: +** mov (x[0-9]+), #?5 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svld1uh_gather_offset_s32 (p0, z0, 5)) + +/* +** ld1uh_gather_6_s32_offset: +** ld1h z0\.s, p0/z, \[z0\.s, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_6_s32_offset, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 6), + z0_res = svld1uh_gather_offset_s32 (p0, z0, 6)) + +/* +** ld1uh_gather_62_s32_offset: +** ld1h z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_62_s32_offset, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 62), + z0_res = svld1uh_gather_offset_s32 (p0, z0, 62)) + +/* +** ld1uh_gather_64_s32_offset: +** mov (x[0-9]+), #?64 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_64_s32_offset, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_s32 (p0, z0, 64), + z0_res = svld1uh_gather_offset_s32 (p0, z0, 64)) + +/* +** ld1uh_gather_x0_s32_index: +** lsl (x[0-9]+), x0, #?1 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_s32_index, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, x0), + z0_res = svld1uh_gather_index_s32 (p0, z0, x0)) + +/* +** ld1uh_gather_m1_s32_index: +** mov (x[0-9]+), #?-2 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_m1_s32_index, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, -1), + z0_res = svld1uh_gather_index_s32 (p0, z0, -1)) + +/* +** ld1uh_gather_0_s32_index: +** ld1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_0_s32_index, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, 0), + z0_res = svld1uh_gather_index_s32 (p0, z0, 0)) + +/* +** ld1uh_gather_5_s32_index: +** ld1h z0\.s, p0/z, \[z0\.s, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_5_s32_index, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, 5), + z0_res 
= svld1uh_gather_index_s32 (p0, z0, 5)) + +/* +** ld1uh_gather_31_s32_index: +** ld1h z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_31_s32_index, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, 31), + z0_res = svld1uh_gather_index_s32 (p0, z0, 31)) + +/* +** ld1uh_gather_32_s32_index: +** mov (x[0-9]+), #?64 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_32_s32_index, svint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_s32 (p0, z0, 32), + z0_res = svld1uh_gather_index_s32 (p0, z0, 32)) + +/* +** ld1uh_gather_x0_s32_s32offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s32_s32offset, svint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1uh_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_s32_s32offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s32_s32offset, svint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32offset_s32 (p0, x0, z0), + z0_res = svld1uh_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_s32_s32offset: +** ld1h z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s32_s32offset, svint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32offset_s32 (p0, x0, z1), + z0_res = svld1uh_gather_offset_s32 (p0, x0, z1)) + +/* +** ld1uh_gather_x0_s32_u32offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s32_u32offset, svint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1uh_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_s32_u32offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s32_u32offset, svint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32offset_s32 (p0, x0, z0), + z0_res = svld1uh_gather_offset_s32 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_s32_u32offset: +** ld1h z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s32_u32offset, svint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32offset_s32 (p0, x0, z1), + z0_res = svld1uh_gather_offset_s32 (p0, x0, z1)) + +/* +** ld1uh_gather_x0_s32_s32index: +** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s32_s32index, svint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32index_s32 (p0, x0, z0), + z0_res = svld1uh_gather_index_s32 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_s32_s32index: +** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s32_s32index, svint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32index_s32 (p0, x0, z0), + z0_res = svld1uh_gather_index_s32 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_s32_s32index: +** ld1h z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s32_s32index, svint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32index_s32 (p0, x0, z1), + z0_res = svld1uh_gather_index_s32 (p0, x0, z1)) + +/* +** ld1uh_gather_x0_s32_u32index: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s32_u32index, svint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32index_s32 (p0, x0, z0), + z0_res = svld1uh_gather_index_s32 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_s32_u32index: +** ld1h z0\.s, p0/z, 
\[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s32_u32index, svint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32index_s32 (p0, x0, z0), + z0_res = svld1uh_gather_index_s32 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_s32_u32index: +** ld1h z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s32_u32index, svint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32index_s32 (p0, x0, z1), + z0_res = svld1uh_gather_index_s32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s64.c new file mode 100644 index 000000000..447001793 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_s64.c @@ -0,0 +1,288 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uh_gather_s64_tied1: +** ld1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_s64 (p0, z0), + z0_res = svld1uh_gather_s64 (p0, z0)) + +/* +** ld1uh_gather_s64_untied: +** ld1h z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_s64 (p0, z1), + z0_res = svld1uh_gather_s64 (p0, z1)) + +/* +** ld1uh_gather_x0_s64_offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svld1uh_gather_offset_s64 (p0, z0, x0)) + +/* +** ld1uh_gather_m2_s64_offset: +** mov (x[0-9]+), #?-2 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_m2_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, -2), + z0_res = svld1uh_gather_offset_s64 (p0, z0, -2)) + +/* +** ld1uh_gather_0_s64_offset: +** ld1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svld1uh_gather_offset_s64 (p0, z0, 0)) + +/* +** ld1uh_gather_5_s64_offset: +** mov (x[0-9]+), #?5 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svld1uh_gather_offset_s64 (p0, z0, 5)) + +/* +** ld1uh_gather_6_s64_offset: +** ld1h z0\.d, p0/z, \[z0\.d, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_6_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 6), + z0_res = svld1uh_gather_offset_s64 (p0, z0, 6)) + +/* +** ld1uh_gather_62_s64_offset: +** ld1h z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_62_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 62), + z0_res = svld1uh_gather_offset_s64 (p0, z0, 62)) + +/* +** ld1uh_gather_64_s64_offset: +** mov (x[0-9]+), #?64 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_64_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_s64 (p0, z0, 64), + z0_res = svld1uh_gather_offset_s64 (p0, z0, 64)) + +/* +** ld1uh_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?1 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = 
svld1uh_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svld1uh_gather_index_s64 (p0, z0, x0)) + +/* +** ld1uh_gather_m1_s64_index: +** mov (x[0-9]+), #?-2 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svld1uh_gather_index_s64 (p0, z0, -1)) + +/* +** ld1uh_gather_0_s64_index: +** ld1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svld1uh_gather_index_s64 (p0, z0, 0)) + +/* +** ld1uh_gather_5_s64_index: +** ld1h z0\.d, p0/z, \[z0\.d, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svld1uh_gather_index_s64 (p0, z0, 5)) + +/* +** ld1uh_gather_31_s64_index: +** ld1h z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svld1uh_gather_index_s64 (p0, z0, 31)) + +/* +** ld1uh_gather_32_s64_index: +** mov (x[0-9]+), #?64 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svld1uh_gather_index_s64 (p0, z0, 32)) + +/* +** ld1uh_gather_x0_s64_s64offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s64_s64offset, svint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1uh_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_s64_s64offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s64_s64offset, svint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1uh_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_s64_s64offset: +** ld1h z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s64_s64offset, svint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64offset_s64 (p0, x0, z1), + z0_res = svld1uh_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1uh_gather_ext_s64_s64offset: +** ld1h z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_s64_s64offset, svint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1uh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uh_gather_x0_s64_u64offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s64_u64offset, svint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1uh_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_s64_u64offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s64_u64offset, svint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1uh_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_s64_u64offset: +** ld1h z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s64_u64offset, svint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64offset_s64 (p0, x0, z1), + z0_res = svld1uh_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1uh_gather_ext_s64_u64offset: +** ld1h z0\.d, p0/z, 
\[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_s64_u64offset, svint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1uh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uh_gather_x0_s64_s64index: +** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s64_s64index, svint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1uh_gather_index_s64 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_s64_s64index: +** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s64_s64index, svint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1uh_gather_index_s64 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_s64_s64index: +** ld1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s64_s64index, svint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64index_s64 (p0, x0, z1), + z0_res = svld1uh_gather_index_s64 (p0, x0, z1)) + +/* +** ld1uh_gather_ext_s64_s64index: +** ld1h z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_s64_s64index, svint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1uh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uh_gather_x0_s64_u64index: +** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_s64_u64index, svint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1uh_gather_index_s64 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_s64_u64index: +** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_s64_u64index, svint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1uh_gather_index_s64 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_s64_u64index: +** ld1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_s64_u64index, svint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64index_s64 (p0, x0, z1), + z0_res = svld1uh_gather_index_s64 (p0, x0, z1)) + +/* +** ld1uh_gather_ext_s64_u64index: +** ld1h z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_s64_u64index, svint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1uh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u32.c new file mode 100644 index 000000000..09d3cc8c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u32.c @@ -0,0 +1,252 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uh_gather_u32_tied1: +** ld1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_u32 (p0, z0), + z0_res = svld1uh_gather_u32 (p0, z0)) + +/* +** ld1uh_gather_u32_untied: +** ld1h z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_u32 (p0, z1), + z0_res = svld1uh_gather_u32 (p0, z1)) + +/* +** ld1uh_gather_x0_u32_offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svld1uh_gather_offset_u32 (p0, z0, x0)) + +/* +** ld1uh_gather_m2_u32_offset: +** mov (x[0-9]+), #?-2 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_m2_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, -2), + z0_res = svld1uh_gather_offset_u32 (p0, z0, -2)) + +/* +** ld1uh_gather_0_u32_offset: +** ld1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svld1uh_gather_offset_u32 (p0, z0, 0)) + +/* +** ld1uh_gather_5_u32_offset: +** mov (x[0-9]+), #?5 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svld1uh_gather_offset_u32 (p0, z0, 5)) + +/* +** ld1uh_gather_6_u32_offset: +** ld1h z0\.s, p0/z, \[z0\.s, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_6_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 6), + z0_res = svld1uh_gather_offset_u32 (p0, z0, 6)) + +/* +** ld1uh_gather_62_u32_offset: +** ld1h z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_62_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 62), + z0_res = svld1uh_gather_offset_u32 (p0, z0, 62)) + +/* +** ld1uh_gather_64_u32_offset: +** mov (x[0-9]+), #?64 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_64_u32_offset, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_offset_u32 (p0, z0, 64), + z0_res = svld1uh_gather_offset_u32 (p0, z0, 64)) + +/* +** ld1uh_gather_x0_u32_index: +** lsl (x[0-9]+), x0, #?1 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_u32_index, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, x0), + z0_res = svld1uh_gather_index_u32 (p0, z0, x0)) + +/* +** ld1uh_gather_m1_u32_index: +** mov (x[0-9]+), #?-2 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_m1_u32_index, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, -1), + z0_res = svld1uh_gather_index_u32 (p0, z0, -1)) + +/* +** ld1uh_gather_0_u32_index: +** ld1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_0_u32_index, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, 0), + z0_res = svld1uh_gather_index_u32 (p0, z0, 0)) + +/* +** ld1uh_gather_5_u32_index: +** ld1h z0\.s, p0/z, \[z0\.s, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_5_u32_index, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, 
5), + z0_res = svld1uh_gather_index_u32 (p0, z0, 5)) + +/* +** ld1uh_gather_31_u32_index: +** ld1h z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_31_u32_index, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, 31), + z0_res = svld1uh_gather_index_u32 (p0, z0, 31)) + +/* +** ld1uh_gather_32_u32_index: +** mov (x[0-9]+), #?64 +** ld1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_32_u32_index, svuint32_t, svuint32_t, + z0_res = svld1uh_gather_u32base_index_u32 (p0, z0, 32), + z0_res = svld1uh_gather_index_u32 (p0, z0, 32)) + +/* +** ld1uh_gather_x0_u32_s32offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u32_s32offset, svuint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1uh_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_u32_s32offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u32_s32offset, svuint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32offset_u32 (p0, x0, z0), + z0_res = svld1uh_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_u32_s32offset: +** ld1h z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u32_s32offset, svuint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32offset_u32 (p0, x0, z1), + z0_res = svld1uh_gather_offset_u32 (p0, x0, z1)) + +/* +** ld1uh_gather_x0_u32_u32offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u32_u32offset, svuint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1uh_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_u32_u32offset: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u32_u32offset, svuint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32offset_u32 (p0, x0, z0), + z0_res = svld1uh_gather_offset_u32 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_u32_u32offset: +** ld1h z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u32_u32offset, svuint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32offset_u32 (p0, x0, z1), + z0_res = svld1uh_gather_offset_u32 (p0, x0, z1)) + +/* +** ld1uh_gather_x0_u32_s32index: +** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u32_s32index, svuint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32index_u32 (p0, x0, z0), + z0_res = svld1uh_gather_index_u32 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_u32_s32index: +** ld1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u32_s32index, svuint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32index_u32 (p0, x0, z0), + z0_res = svld1uh_gather_index_u32 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_u32_s32index: +** ld1h z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u32_s32index, svuint32_t, uint16_t, svint32_t, + z0_res = svld1uh_gather_s32index_u32 (p0, x0, z1), + z0_res = svld1uh_gather_index_u32 (p0, x0, z1)) + +/* +** ld1uh_gather_x0_u32_u32index: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u32_u32index, svuint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32index_u32 (p0, x0, z0), + z0_res = svld1uh_gather_index_u32 (p0, x0, z0)) + +/* +** 
ld1uh_gather_tied1_u32_u32index: +** ld1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u32_u32index, svuint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32index_u32 (p0, x0, z0), + z0_res = svld1uh_gather_index_u32 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_u32_u32index: +** ld1h z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u32_u32index, svuint32_t, uint16_t, svuint32_t, + z0_res = svld1uh_gather_u32index_u32 (p0, x0, z1), + z0_res = svld1uh_gather_index_u32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u64.c new file mode 100644 index 000000000..f3dcf03cd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_gather_u64.c @@ -0,0 +1,288 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uh_gather_u64_tied1: +** ld1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_u64 (p0, z0), + z0_res = svld1uh_gather_u64 (p0, z0)) + +/* +** ld1uh_gather_u64_untied: +** ld1h z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_u64 (p0, z1), + z0_res = svld1uh_gather_u64 (p0, z1)) + +/* +** ld1uh_gather_x0_u64_offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svld1uh_gather_offset_u64 (p0, z0, x0)) + +/* +** ld1uh_gather_m2_u64_offset: +** mov (x[0-9]+), #?-2 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_m2_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, -2), + z0_res = svld1uh_gather_offset_u64 (p0, z0, -2)) + +/* +** ld1uh_gather_0_u64_offset: +** ld1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svld1uh_gather_offset_u64 (p0, z0, 0)) + +/* +** ld1uh_gather_5_u64_offset: +** mov (x[0-9]+), #?5 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svld1uh_gather_offset_u64 (p0, z0, 5)) + +/* +** ld1uh_gather_6_u64_offset: +** ld1h z0\.d, p0/z, \[z0\.d, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_6_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 6), + z0_res = svld1uh_gather_offset_u64 (p0, z0, 6)) + +/* +** ld1uh_gather_62_u64_offset: +** ld1h z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_62_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 62), + z0_res = svld1uh_gather_offset_u64 (p0, z0, 62)) + +/* +** ld1uh_gather_64_u64_offset: +** mov (x[0-9]+), #?64 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_64_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_offset_u64 (p0, z0, 64), + z0_res = svld1uh_gather_offset_u64 (p0, z0, 64)) + +/* +** ld1uh_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?1 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS 
(ld1uh_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svld1uh_gather_index_u64 (p0, z0, x0)) + +/* +** ld1uh_gather_m1_u64_index: +** mov (x[0-9]+), #?-2 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svld1uh_gather_index_u64 (p0, z0, -1)) + +/* +** ld1uh_gather_0_u64_index: +** ld1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svld1uh_gather_index_u64 (p0, z0, 0)) + +/* +** ld1uh_gather_5_u64_index: +** ld1h z0\.d, p0/z, \[z0\.d, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svld1uh_gather_index_u64 (p0, z0, 5)) + +/* +** ld1uh_gather_31_u64_index: +** ld1h z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svld1uh_gather_index_u64 (p0, z0, 31)) + +/* +** ld1uh_gather_32_u64_index: +** mov (x[0-9]+), #?64 +** ld1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uh_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uh_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svld1uh_gather_index_u64 (p0, z0, 32)) + +/* +** ld1uh_gather_x0_u64_s64offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u64_s64offset, svuint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1uh_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_u64_s64offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u64_s64offset, svuint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1uh_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_u64_s64offset: +** ld1h z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u64_s64offset, svuint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64offset_u64 (p0, x0, z1), + z0_res = svld1uh_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1uh_gather_ext_u64_s64offset: +** ld1h z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_u64_s64offset, svuint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1uh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uh_gather_x0_u64_u64offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1uh_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_u64_u64offset: +** ld1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1uh_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_u64_u64offset: +** ld1h z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64offset_u64 (p0, x0, z1), + z0_res = svld1uh_gather_offset_u64 (p0, 
x0, z1)) + +/* +** ld1uh_gather_ext_u64_u64offset: +** ld1h z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1uh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uh_gather_x0_u64_s64index: +** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u64_s64index, svuint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1uh_gather_index_u64 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_u64_s64index: +** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u64_s64index, svuint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1uh_gather_index_u64 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_u64_s64index: +** ld1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u64_s64index, svuint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64index_u64 (p0, x0, z1), + z0_res = svld1uh_gather_index_u64 (p0, x0, z1)) + +/* +** ld1uh_gather_ext_u64_s64index: +** ld1h z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_u64_s64index, svuint64_t, uint16_t, svint64_t, + z0_res = svld1uh_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1uh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uh_gather_x0_u64_u64index: +** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_x0_u64_u64index, svuint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1uh_gather_index_u64 (p0, x0, z0)) + +/* +** ld1uh_gather_tied1_u64_u64index: +** ld1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_tied1_u64_u64index, svuint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1uh_gather_index_u64 (p0, x0, z0)) + +/* +** ld1uh_gather_untied_u64_u64index: +** ld1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_untied_u64_u64index, svuint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64index_u64 (p0, x0, z1), + z0_res = svld1uh_gather_index_u64 (p0, x0, z1)) + +/* +** ld1uh_gather_ext_u64_u64index: +** ld1h z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uh_gather_ext_u64_u64index, svuint64_t, uint16_t, svuint64_t, + z0_res = svld1uh_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1uh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s32.c new file mode 100644 index 000000000..df1ce974b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uh_s32_base: +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_s32_base, svint32_t, uint16_t, + z0 = svld1uh_s32 (p0, x0), + z0 = svld1uh_s32 (p0, x0)) + +/* +** ld1uh_s32_index: +** ld1h z0\.s, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1uh_s32_index, svint32_t, uint16_t, + z0 = svld1uh_s32 (p0, x0 + x1), + z0 = svld1uh_s32 (p0, x0 + x1)) + +/* +** ld1uh_s32_1: +** ld1h z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_s32_1, svint32_t, uint16_t, + z0 = svld1uh_s32 (p0, x0 + svcntw ()), + z0 = svld1uh_s32 (p0, x0 + svcntw ())) + +/* +** ld1uh_s32_7: +** ld1h z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_s32_7, svint32_t, uint16_t, + z0 = svld1uh_s32 (p0, x0 + svcntw () * 7), + z0 = svld1uh_s32 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_s32_8: +** incb x0, all, mul #4 +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_s32_8, svint32_t, uint16_t, + z0 = svld1uh_s32 (p0, x0 + svcntw () * 8), + z0 = svld1uh_s32 (p0, x0 + svcntw () * 8)) + +/* +** ld1uh_s32_m1: +** ld1h z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_s32_m1, svint32_t, uint16_t, + z0 = svld1uh_s32 (p0, x0 - svcntw ()), + z0 = svld1uh_s32 (p0, x0 - svcntw ())) + +/* +** ld1uh_s32_m8: +** ld1h z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_s32_m8, svint32_t, uint16_t, + z0 = svld1uh_s32 (p0, x0 - svcntw () * 8), + z0 = svld1uh_s32 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_s32_m9: +** dech x0, all, mul #9 +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_s32_m9, svint32_t, uint16_t, + z0 = svld1uh_s32 (p0, x0 - svcntw () * 9), + z0 = svld1uh_s32 (p0, x0 - svcntw () * 9)) + +/* +** ld1uh_vnum_s32_0: +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s32_0, svint32_t, uint16_t, + z0 = svld1uh_vnum_s32 (p0, x0, 0), + z0 = svld1uh_vnum_s32 (p0, x0, 0)) + +/* +** ld1uh_vnum_s32_1: +** ld1h z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s32_1, svint32_t, uint16_t, + z0 = svld1uh_vnum_s32 (p0, x0, 1), + z0 = svld1uh_vnum_s32 (p0, x0, 1)) + +/* +** ld1uh_vnum_s32_7: +** ld1h z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s32_7, svint32_t, uint16_t, + z0 = svld1uh_vnum_s32 (p0, x0, 7), + z0 = svld1uh_vnum_s32 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_vnum_s32_8: +** incb x0, all, mul #4 +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s32_8, svint32_t, uint16_t, + z0 = svld1uh_vnum_s32 (p0, x0, 8), + z0 = svld1uh_vnum_s32 (p0, x0, 8)) + +/* +** ld1uh_vnum_s32_m1: +** ld1h z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s32_m1, svint32_t, uint16_t, + z0 = svld1uh_vnum_s32 (p0, x0, -1), + z0 = svld1uh_vnum_s32 (p0, x0, -1)) + +/* +** ld1uh_vnum_s32_m8: +** ld1h z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s32_m8, svint32_t, uint16_t, + z0 = svld1uh_vnum_s32 (p0, x0, -8), + z0 = svld1uh_vnum_s32 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_vnum_s32_m9: +** dech x0, all, mul #9 +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s32_m9, svint32_t, uint16_t, + z0 = svld1uh_vnum_s32 (p0, x0, -9), + z0 = svld1uh_vnum_s32 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld1uh_vnum_s32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1h z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s32_x1, svint32_t, uint16_t, + z0 = svld1uh_vnum_s32 (p0, x0, x1), + z0 = svld1uh_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s64.c new file mode 100644 index 000000000..7c3ab0aee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uh_s64_base: +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_s64_base, svint64_t, uint16_t, + z0 = svld1uh_s64 (p0, x0), + z0 = svld1uh_s64 (p0, x0)) + +/* +** ld1uh_s64_index: +** ld1h z0\.d, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1uh_s64_index, svint64_t, uint16_t, + z0 = svld1uh_s64 (p0, x0 + x1), + z0 = svld1uh_s64 (p0, x0 + x1)) + +/* +** ld1uh_s64_1: +** ld1h z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_s64_1, svint64_t, uint16_t, + z0 = svld1uh_s64 (p0, x0 + svcntd ()), + z0 = svld1uh_s64 (p0, x0 + svcntd ())) + +/* +** ld1uh_s64_7: +** ld1h z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_s64_7, svint64_t, uint16_t, + z0 = svld1uh_s64 (p0, x0 + svcntd () * 7), + z0 = svld1uh_s64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_s64_8: +** incb x0, all, mul #2 +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_s64_8, svint64_t, uint16_t, + z0 = svld1uh_s64 (p0, x0 + svcntd () * 8), + z0 = svld1uh_s64 (p0, x0 + svcntd () * 8)) + +/* +** ld1uh_s64_m1: +** ld1h z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_s64_m1, svint64_t, uint16_t, + z0 = svld1uh_s64 (p0, x0 - svcntd ()), + z0 = svld1uh_s64 (p0, x0 - svcntd ())) + +/* +** ld1uh_s64_m8: +** ld1h z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_s64_m8, svint64_t, uint16_t, + z0 = svld1uh_s64 (p0, x0 - svcntd () * 8), + z0 = svld1uh_s64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_s64_m9: +** decw x0, all, mul #9 +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_s64_m9, svint64_t, uint16_t, + z0 = svld1uh_s64 (p0, x0 - svcntd () * 9), + z0 = svld1uh_s64 (p0, x0 - svcntd () * 9)) + +/* +** ld1uh_vnum_s64_0: +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s64_0, svint64_t, uint16_t, + z0 = svld1uh_vnum_s64 (p0, x0, 0), + z0 = svld1uh_vnum_s64 (p0, x0, 0)) + +/* +** ld1uh_vnum_s64_1: +** ld1h z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s64_1, svint64_t, uint16_t, + z0 = svld1uh_vnum_s64 (p0, x0, 1), + z0 = svld1uh_vnum_s64 (p0, x0, 1)) + +/* +** ld1uh_vnum_s64_7: +** ld1h z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s64_7, svint64_t, uint16_t, + z0 = svld1uh_vnum_s64 (p0, x0, 7), + z0 = svld1uh_vnum_s64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1uh_vnum_s64_8: +** incb x0, all, mul #2 +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s64_8, svint64_t, uint16_t, + z0 = svld1uh_vnum_s64 (p0, x0, 8), + z0 = svld1uh_vnum_s64 (p0, x0, 8)) + +/* +** ld1uh_vnum_s64_m1: +** ld1h z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s64_m1, svint64_t, uint16_t, + z0 = svld1uh_vnum_s64 (p0, x0, -1), + z0 = svld1uh_vnum_s64 (p0, x0, -1)) + +/* +** ld1uh_vnum_s64_m8: +** ld1h z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s64_m8, svint64_t, uint16_t, + z0 = svld1uh_vnum_s64 (p0, x0, -8), + z0 = svld1uh_vnum_s64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_vnum_s64_m9: +** decw x0, all, mul #9 +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s64_m9, svint64_t, uint16_t, + z0 = svld1uh_vnum_s64 (p0, x0, -9), + z0 = svld1uh_vnum_s64 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1uh_vnum_s64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1h z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_s64_x1, svint64_t, uint16_t, + z0 = svld1uh_vnum_s64 (p0, x0, x1), + z0 = svld1uh_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u32.c new file mode 100644 index 000000000..a07b19259 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uh_u32_base: +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_u32_base, svuint32_t, uint16_t, + z0 = svld1uh_u32 (p0, x0), + z0 = svld1uh_u32 (p0, x0)) + +/* +** ld1uh_u32_index: +** ld1h z0\.s, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1uh_u32_index, svuint32_t, uint16_t, + z0 = svld1uh_u32 (p0, x0 + x1), + z0 = svld1uh_u32 (p0, x0 + x1)) + +/* +** ld1uh_u32_1: +** ld1h z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_u32_1, svuint32_t, uint16_t, + z0 = svld1uh_u32 (p0, x0 + svcntw ()), + z0 = svld1uh_u32 (p0, x0 + svcntw ())) + +/* +** ld1uh_u32_7: +** ld1h z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_u32_7, svuint32_t, uint16_t, + z0 = svld1uh_u32 (p0, x0 + svcntw () * 7), + z0 = svld1uh_u32 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_u32_8: +** incb x0, all, mul #4 +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_u32_8, svuint32_t, uint16_t, + z0 = svld1uh_u32 (p0, x0 + svcntw () * 8), + z0 = svld1uh_u32 (p0, x0 + svcntw () * 8)) + +/* +** ld1uh_u32_m1: +** ld1h z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_u32_m1, svuint32_t, uint16_t, + z0 = svld1uh_u32 (p0, x0 - svcntw ()), + z0 = svld1uh_u32 (p0, x0 - svcntw ())) + +/* +** ld1uh_u32_m8: +** ld1h z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_u32_m8, svuint32_t, uint16_t, + z0 = svld1uh_u32 (p0, x0 - svcntw () * 8), + z0 = svld1uh_u32 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1uh_u32_m9: +** dech x0, all, mul #9 +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_u32_m9, svuint32_t, uint16_t, + z0 = svld1uh_u32 (p0, x0 - svcntw () * 9), + z0 = svld1uh_u32 (p0, x0 - svcntw () * 9)) + +/* +** ld1uh_vnum_u32_0: +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u32_0, svuint32_t, uint16_t, + z0 = svld1uh_vnum_u32 (p0, x0, 0), + z0 = svld1uh_vnum_u32 (p0, x0, 0)) + +/* +** ld1uh_vnum_u32_1: +** ld1h z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u32_1, svuint32_t, uint16_t, + z0 = svld1uh_vnum_u32 (p0, x0, 1), + z0 = svld1uh_vnum_u32 (p0, x0, 1)) + +/* +** ld1uh_vnum_u32_7: +** ld1h z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u32_7, svuint32_t, uint16_t, + z0 = svld1uh_vnum_u32 (p0, x0, 7), + z0 = svld1uh_vnum_u32 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_vnum_u32_8: +** incb x0, all, mul #4 +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u32_8, svuint32_t, uint16_t, + z0 = svld1uh_vnum_u32 (p0, x0, 8), + z0 = svld1uh_vnum_u32 (p0, x0, 8)) + +/* +** ld1uh_vnum_u32_m1: +** ld1h z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u32_m1, svuint32_t, uint16_t, + z0 = svld1uh_vnum_u32 (p0, x0, -1), + z0 = svld1uh_vnum_u32 (p0, x0, -1)) + +/* +** ld1uh_vnum_u32_m8: +** ld1h z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u32_m8, svuint32_t, uint16_t, + z0 = svld1uh_vnum_u32 (p0, x0, -8), + z0 = svld1uh_vnum_u32 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_vnum_u32_m9: +** dech x0, all, mul #9 +** ld1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u32_m9, svuint32_t, uint16_t, + z0 = svld1uh_vnum_u32 (p0, x0, -9), + z0 = svld1uh_vnum_u32 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1uh_vnum_u32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1h z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u32_x1, svuint32_t, uint16_t, + z0 = svld1uh_vnum_u32 (p0, x0, x1), + z0 = svld1uh_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u64.c new file mode 100644 index 000000000..79be01fbd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uh_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uh_u64_base: +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_u64_base, svuint64_t, uint16_t, + z0 = svld1uh_u64 (p0, x0), + z0 = svld1uh_u64 (p0, x0)) + +/* +** ld1uh_u64_index: +** ld1h z0\.d, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld1uh_u64_index, svuint64_t, uint16_t, + z0 = svld1uh_u64 (p0, x0 + x1), + z0 = svld1uh_u64 (p0, x0 + x1)) + +/* +** ld1uh_u64_1: +** ld1h z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_u64_1, svuint64_t, uint16_t, + z0 = svld1uh_u64 (p0, x0 + svcntd ()), + z0 = svld1uh_u64 (p0, x0 + svcntd ())) + +/* +** ld1uh_u64_7: +** ld1h z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_u64_7, svuint64_t, uint16_t, + z0 = svld1uh_u64 (p0, x0 + svcntd () * 7), + z0 = svld1uh_u64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1uh_u64_8: +** incb x0, all, mul #2 +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_u64_8, svuint64_t, uint16_t, + z0 = svld1uh_u64 (p0, x0 + svcntd () * 8), + z0 = svld1uh_u64 (p0, x0 + svcntd () * 8)) + +/* +** ld1uh_u64_m1: +** ld1h z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_u64_m1, svuint64_t, uint16_t, + z0 = svld1uh_u64 (p0, x0 - svcntd ()), + z0 = svld1uh_u64 (p0, x0 - svcntd ())) + +/* +** ld1uh_u64_m8: +** ld1h z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_u64_m8, svuint64_t, uint16_t, + z0 = svld1uh_u64 (p0, x0 - svcntd () * 8), + z0 = svld1uh_u64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_u64_m9: +** decw x0, all, mul #9 +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_u64_m9, svuint64_t, uint16_t, + z0 = svld1uh_u64 (p0, x0 - svcntd () * 9), + z0 = svld1uh_u64 (p0, x0 - svcntd () * 9)) + +/* +** ld1uh_vnum_u64_0: +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u64_0, svuint64_t, uint16_t, + z0 = svld1uh_vnum_u64 (p0, x0, 0), + z0 = svld1uh_vnum_u64 (p0, x0, 0)) + +/* +** ld1uh_vnum_u64_1: +** ld1h z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u64_1, svuint64_t, uint16_t, + z0 = svld1uh_vnum_u64 (p0, x0, 1), + z0 = svld1uh_vnum_u64 (p0, x0, 1)) + +/* +** ld1uh_vnum_u64_7: +** ld1h z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u64_7, svuint64_t, uint16_t, + z0 = svld1uh_vnum_u64 (p0, x0, 7), + z0 = svld1uh_vnum_u64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_vnum_u64_8: +** incb x0, all, mul #2 +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u64_8, svuint64_t, uint16_t, + z0 = svld1uh_vnum_u64 (p0, x0, 8), + z0 = svld1uh_vnum_u64 (p0, x0, 8)) + +/* +** ld1uh_vnum_u64_m1: +** ld1h z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u64_m1, svuint64_t, uint16_t, + z0 = svld1uh_vnum_u64 (p0, x0, -1), + z0 = svld1uh_vnum_u64 (p0, x0, -1)) + +/* +** ld1uh_vnum_u64_m8: +** ld1h z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u64_m8, svuint64_t, uint16_t, + z0 = svld1uh_vnum_u64 (p0, x0, -8), + z0 = svld1uh_vnum_u64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uh_vnum_u64_m9: +** decw x0, all, mul #9 +** ld1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u64_m9, svuint64_t, uint16_t, + z0 = svld1uh_vnum_u64 (p0, x0, -9), + z0 = svld1uh_vnum_u64 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1uh_vnum_u64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1h z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1uh_vnum_u64_x1, svuint64_t, uint16_t, + z0 = svld1uh_vnum_u64 (p0, x0, x1), + z0 = svld1uh_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_s64.c new file mode 100644 index 000000000..f4e9d5db9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_s64.c @@ -0,0 +1,308 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uw_gather_s64_tied1: +** ld1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_s64 (p0, z0), + z0_res = svld1uw_gather_s64 (p0, z0)) + +/* +** ld1uw_gather_s64_untied: +** ld1w z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_s64 (p0, z1), + z0_res = svld1uw_gather_s64 (p0, z1)) + +/* +** ld1uw_gather_x0_s64_offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svld1uw_gather_offset_s64 (p0, z0, x0)) + +/* +** ld1uw_gather_m4_s64_offset: +** mov (x[0-9]+), #?-4 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_m4_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, -4), + z0_res = svld1uw_gather_offset_s64 (p0, z0, -4)) + +/* +** ld1uw_gather_0_s64_offset: +** ld1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svld1uw_gather_offset_s64 (p0, z0, 0)) + +/* +** ld1uw_gather_5_s64_offset: +** mov (x[0-9]+), #?5 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svld1uw_gather_offset_s64 (p0, z0, 5)) + +/* +** ld1uw_gather_6_s64_offset: +** mov (x[0-9]+), #?6 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_6_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 6), + z0_res = svld1uw_gather_offset_s64 (p0, z0, 6)) + +/* +** ld1uw_gather_7_s64_offset: +** mov (x[0-9]+), #?7 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_7_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 7), + z0_res = svld1uw_gather_offset_s64 (p0, z0, 7)) + +/* +** ld1uw_gather_8_s64_offset: +** ld1w z0\.d, p0/z, \[z0\.d, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_8_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 8), + z0_res = svld1uw_gather_offset_s64 (p0, z0, 8)) + +/* +** ld1uw_gather_124_s64_offset: +** ld1w z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_124_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 124), + z0_res = svld1uw_gather_offset_s64 (p0, z0, 124)) + +/* +** ld1uw_gather_128_s64_offset: +** mov (x[0-9]+), #?128 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_128_s64_offset, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_s64 (p0, z0, 128), + z0_res = svld1uw_gather_offset_s64 (p0, z0, 128)) + +/* +** ld1uw_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?2 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svld1uw_gather_index_s64 (p0, z0, x0)) + +/* +** ld1uw_gather_m1_s64_index: +** mov (x[0-9]+), #?-4 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = 
svld1uw_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svld1uw_gather_index_s64 (p0, z0, -1)) + +/* +** ld1uw_gather_0_s64_index: +** ld1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svld1uw_gather_index_s64 (p0, z0, 0)) + +/* +** ld1uw_gather_5_s64_index: +** ld1w z0\.d, p0/z, \[z0\.d, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svld1uw_gather_index_s64 (p0, z0, 5)) + +/* +** ld1uw_gather_31_s64_index: +** ld1w z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svld1uw_gather_index_s64 (p0, z0, 31)) + +/* +** ld1uw_gather_32_s64_index: +** mov (x[0-9]+), #?128 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svld1uw_gather_index_s64 (p0, z0, 32)) + +/* +** ld1uw_gather_x0_s64_s64offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_s64_s64offset, svint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1uw_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1uw_gather_tied1_s64_s64offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_s64_s64offset, svint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64offset_s64 (p0, x0, z0), + z0_res = svld1uw_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1uw_gather_untied_s64_s64offset: +** ld1w z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_s64_s64offset, svint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64offset_s64 (p0, x0, z1), + z0_res = svld1uw_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1uw_gather_ext_s64_s64offset: +** ld1w z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_s64_s64offset, svint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1uw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uw_gather_x0_s64_u64offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_s64_u64offset, svint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1uw_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1uw_gather_tied1_s64_u64offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_s64_u64offset, svint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64offset_s64 (p0, x0, z0), + z0_res = svld1uw_gather_offset_s64 (p0, x0, z0)) + +/* +** ld1uw_gather_untied_s64_u64offset: +** ld1w z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_s64_u64offset, svint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64offset_s64 (p0, x0, z1), + z0_res = svld1uw_gather_offset_s64 (p0, x0, z1)) + +/* +** ld1uw_gather_ext_s64_u64offset: +** ld1w z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_s64_u64offset, svint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1uw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** 
ld1uw_gather_x0_s64_s64index: +** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_s64_s64index, svint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1uw_gather_index_s64 (p0, x0, z0)) + +/* +** ld1uw_gather_tied1_s64_s64index: +** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_s64_s64index, svint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64index_s64 (p0, x0, z0), + z0_res = svld1uw_gather_index_s64 (p0, x0, z0)) + +/* +** ld1uw_gather_untied_s64_s64index: +** ld1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_s64_s64index, svint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64index_s64 (p0, x0, z1), + z0_res = svld1uw_gather_index_s64 (p0, x0, z1)) + +/* +** ld1uw_gather_ext_s64_s64index: +** ld1w z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_s64_s64index, svint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1uw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uw_gather_x0_s64_u64index: +** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_s64_u64index, svint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1uw_gather_index_s64 (p0, x0, z0)) + +/* +** ld1uw_gather_tied1_s64_u64index: +** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_s64_u64index, svint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64index_s64 (p0, x0, z0), + z0_res = svld1uw_gather_index_s64 (p0, x0, z0)) + +/* +** ld1uw_gather_untied_s64_u64index: +** ld1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_s64_u64index, svint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64index_s64 (p0, x0, z1), + z0_res = svld1uw_gather_index_s64 (p0, x0, z1)) + +/* +** ld1uw_gather_ext_s64_u64index: +** ld1w z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_s64_u64index, svint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1uw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_u64.c new file mode 100644 index 000000000..854d19233 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_gather_u64.c @@ -0,0 +1,308 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uw_gather_u64_tied1: +** ld1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_u64 (p0, z0), + z0_res = svld1uw_gather_u64 (p0, z0)) + +/* +** ld1uw_gather_u64_untied: +** ld1w z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_u64 (p0, z1), + z0_res = svld1uw_gather_u64 (p0, z1)) + +/* +** ld1uw_gather_x0_u64_offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svld1uw_gather_offset_u64 (p0, z0, x0)) + +/* +** ld1uw_gather_m4_u64_offset: +** mov (x[0-9]+), #?-4 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_m4_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, -4), + z0_res = svld1uw_gather_offset_u64 (p0, z0, -4)) + +/* +** ld1uw_gather_0_u64_offset: +** ld1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svld1uw_gather_offset_u64 (p0, z0, 0)) + +/* +** ld1uw_gather_5_u64_offset: +** mov (x[0-9]+), #?5 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svld1uw_gather_offset_u64 (p0, z0, 5)) + +/* +** ld1uw_gather_6_u64_offset: +** mov (x[0-9]+), #?6 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_6_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 6), + z0_res = svld1uw_gather_offset_u64 (p0, z0, 6)) + +/* +** ld1uw_gather_7_u64_offset: +** mov (x[0-9]+), #?7 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_7_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 7), + z0_res = svld1uw_gather_offset_u64 (p0, z0, 7)) + +/* +** ld1uw_gather_8_u64_offset: +** ld1w z0\.d, p0/z, \[z0\.d, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_8_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 8), + z0_res = svld1uw_gather_offset_u64 (p0, z0, 8)) + +/* +** ld1uw_gather_124_u64_offset: +** ld1w z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_124_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 124), + z0_res = svld1uw_gather_offset_u64 (p0, z0, 124)) + +/* +** ld1uw_gather_128_u64_offset: +** mov (x[0-9]+), #?128 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_128_u64_offset, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_offset_u64 (p0, z0, 128), + z0_res = svld1uw_gather_offset_u64 (p0, z0, 128)) + +/* +** ld1uw_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?2 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svld1uw_gather_index_u64 (p0, z0, x0)) + +/* +** ld1uw_gather_m1_u64_index: +** mov (x[0-9]+), #?-4 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = 
svld1uw_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svld1uw_gather_index_u64 (p0, z0, -1)) + +/* +** ld1uw_gather_0_u64_index: +** ld1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svld1uw_gather_index_u64 (p0, z0, 0)) + +/* +** ld1uw_gather_5_u64_index: +** ld1w z0\.d, p0/z, \[z0\.d, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svld1uw_gather_index_u64 (p0, z0, 5)) + +/* +** ld1uw_gather_31_u64_index: +** ld1w z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svld1uw_gather_index_u64 (p0, z0, 31)) + +/* +** ld1uw_gather_32_u64_index: +** mov (x[0-9]+), #?128 +** ld1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ld1uw_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svld1uw_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svld1uw_gather_index_u64 (p0, z0, 32)) + +/* +** ld1uw_gather_x0_u64_s64offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_u64_s64offset, svuint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1uw_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1uw_gather_tied1_u64_s64offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_u64_s64offset, svuint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64offset_u64 (p0, x0, z0), + z0_res = svld1uw_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1uw_gather_untied_u64_s64offset: +** ld1w z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_u64_s64offset, svuint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64offset_u64 (p0, x0, z1), + z0_res = svld1uw_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1uw_gather_ext_u64_s64offset: +** ld1w z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_u64_s64offset, svuint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1uw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uw_gather_x0_u64_u64offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1uw_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1uw_gather_tied1_u64_u64offset: +** ld1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64offset_u64 (p0, x0, z0), + z0_res = svld1uw_gather_offset_u64 (p0, x0, z0)) + +/* +** ld1uw_gather_untied_u64_u64offset: +** ld1w z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64offset_u64 (p0, x0, z1), + z0_res = svld1uw_gather_offset_u64 (p0, x0, z1)) + +/* +** ld1uw_gather_ext_u64_u64offset: +** ld1w z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1uw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** 
ld1uw_gather_x0_u64_s64index: +** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_u64_s64index, svuint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1uw_gather_index_u64 (p0, x0, z0)) + +/* +** ld1uw_gather_tied1_u64_s64index: +** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_u64_s64index, svuint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64index_u64 (p0, x0, z0), + z0_res = svld1uw_gather_index_u64 (p0, x0, z0)) + +/* +** ld1uw_gather_untied_u64_s64index: +** ld1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_u64_s64index, svuint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64index_u64 (p0, x0, z1), + z0_res = svld1uw_gather_index_u64 (p0, x0, z1)) + +/* +** ld1uw_gather_ext_u64_s64index: +** ld1w z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_u64_s64index, svuint64_t, uint32_t, svint64_t, + z0_res = svld1uw_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svld1uw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ld1uw_gather_x0_u64_u64index: +** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_x0_u64_u64index, svuint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1uw_gather_index_u64 (p0, x0, z0)) + +/* +** ld1uw_gather_tied1_u64_u64index: +** ld1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_tied1_u64_u64index, svuint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64index_u64 (p0, x0, z0), + z0_res = svld1uw_gather_index_u64 (p0, x0, z0)) + +/* +** ld1uw_gather_untied_u64_u64index: +** ld1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_untied_u64_u64index, svuint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64index_u64 (p0, x0, z1), + z0_res = svld1uw_gather_index_u64 (p0, x0, z1)) + +/* +** ld1uw_gather_ext_u64_u64index: +** ld1w z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ld1uw_gather_ext_u64_u64index, svuint64_t, uint32_t, svuint64_t, + z0_res = svld1uw_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svld1uw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_s64.c new file mode 100644 index 000000000..55f5cbad3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uw_s64_base: +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_s64_base, svint64_t, uint32_t, + z0 = svld1uw_s64 (p0, x0), + z0 = svld1uw_s64 (p0, x0)) + +/* +** ld1uw_s64_index: +** ld1w z0\.d, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1uw_s64_index, svint64_t, uint32_t, + z0 = svld1uw_s64 (p0, x0 + x1), + z0 = svld1uw_s64 (p0, x0 + x1)) + +/* +** ld1uw_s64_1: +** ld1w z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_s64_1, svint64_t, uint32_t, + z0 = svld1uw_s64 (p0, x0 + svcntd ()), + z0 = svld1uw_s64 (p0, x0 + svcntd ())) + +/* +** ld1uw_s64_7: +** ld1w z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_s64_7, svint64_t, uint32_t, + z0 = svld1uw_s64 (p0, x0 + svcntd () * 7), + z0 = svld1uw_s64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uw_s64_8: +** incb x0, all, mul #4 +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_s64_8, svint64_t, uint32_t, + z0 = svld1uw_s64 (p0, x0 + svcntd () * 8), + z0 = svld1uw_s64 (p0, x0 + svcntd () * 8)) + +/* +** ld1uw_s64_m1: +** ld1w z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_s64_m1, svint64_t, uint32_t, + z0 = svld1uw_s64 (p0, x0 - svcntd ()), + z0 = svld1uw_s64 (p0, x0 - svcntd ())) + +/* +** ld1uw_s64_m8: +** ld1w z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_s64_m8, svint64_t, uint32_t, + z0 = svld1uw_s64 (p0, x0 - svcntd () * 8), + z0 = svld1uw_s64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uw_s64_m9: +** dech x0, all, mul #9 +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_s64_m9, svint64_t, uint32_t, + z0 = svld1uw_s64 (p0, x0 - svcntd () * 9), + z0 = svld1uw_s64 (p0, x0 - svcntd () * 9)) + +/* +** ld1uw_vnum_s64_0: +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_s64_0, svint64_t, uint32_t, + z0 = svld1uw_vnum_s64 (p0, x0, 0), + z0 = svld1uw_vnum_s64 (p0, x0, 0)) + +/* +** ld1uw_vnum_s64_1: +** ld1w z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_s64_1, svint64_t, uint32_t, + z0 = svld1uw_vnum_s64 (p0, x0, 1), + z0 = svld1uw_vnum_s64 (p0, x0, 1)) + +/* +** ld1uw_vnum_s64_7: +** ld1w z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_s64_7, svint64_t, uint32_t, + z0 = svld1uw_vnum_s64 (p0, x0, 7), + z0 = svld1uw_vnum_s64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uw_vnum_s64_8: +** incb x0, all, mul #4 +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_s64_8, svint64_t, uint32_t, + z0 = svld1uw_vnum_s64 (p0, x0, 8), + z0 = svld1uw_vnum_s64 (p0, x0, 8)) + +/* +** ld1uw_vnum_s64_m1: +** ld1w z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_s64_m1, svint64_t, uint32_t, + z0 = svld1uw_vnum_s64 (p0, x0, -1), + z0 = svld1uw_vnum_s64 (p0, x0, -1)) + +/* +** ld1uw_vnum_s64_m8: +** ld1w z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_s64_m8, svint64_t, uint32_t, + z0 = svld1uw_vnum_s64 (p0, x0, -8), + z0 = svld1uw_vnum_s64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uw_vnum_s64_m9: +** dech x0, all, mul #9 +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_s64_m9, svint64_t, uint32_t, + z0 = svld1uw_vnum_s64 (p0, x0, -9), + z0 = svld1uw_vnum_s64 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld1uw_vnum_s64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1w z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_s64_x1, svint64_t, uint32_t, + z0 = svld1uw_vnum_s64 (p0, x0, x1), + z0 = svld1uw_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_u64.c new file mode 100644 index 000000000..175b593f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld1uw_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld1uw_u64_base: +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_u64_base, svuint64_t, uint32_t, + z0 = svld1uw_u64 (p0, x0), + z0 = svld1uw_u64 (p0, x0)) + +/* +** ld1uw_u64_index: +** ld1w z0\.d, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld1uw_u64_index, svuint64_t, uint32_t, + z0 = svld1uw_u64 (p0, x0 + x1), + z0 = svld1uw_u64 (p0, x0 + x1)) + +/* +** ld1uw_u64_1: +** ld1w z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_u64_1, svuint64_t, uint32_t, + z0 = svld1uw_u64 (p0, x0 + svcntd ()), + z0 = svld1uw_u64 (p0, x0 + svcntd ())) + +/* +** ld1uw_u64_7: +** ld1w z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_u64_7, svuint64_t, uint32_t, + z0 = svld1uw_u64 (p0, x0 + svcntd () * 7), + z0 = svld1uw_u64 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uw_u64_8: +** incb x0, all, mul #4 +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_u64_8, svuint64_t, uint32_t, + z0 = svld1uw_u64 (p0, x0 + svcntd () * 8), + z0 = svld1uw_u64 (p0, x0 + svcntd () * 8)) + +/* +** ld1uw_u64_m1: +** ld1w z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_u64_m1, svuint64_t, uint32_t, + z0 = svld1uw_u64 (p0, x0 - svcntd ()), + z0 = svld1uw_u64 (p0, x0 - svcntd ())) + +/* +** ld1uw_u64_m8: +** ld1w z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_u64_m8, svuint64_t, uint32_t, + z0 = svld1uw_u64 (p0, x0 - svcntd () * 8), + z0 = svld1uw_u64 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uw_u64_m9: +** dech x0, all, mul #9 +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_u64_m9, svuint64_t, uint32_t, + z0 = svld1uw_u64 (p0, x0 - svcntd () * 9), + z0 = svld1uw_u64 (p0, x0 - svcntd () * 9)) + +/* +** ld1uw_vnum_u64_0: +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_u64_0, svuint64_t, uint32_t, + z0 = svld1uw_vnum_u64 (p0, x0, 0), + z0 = svld1uw_vnum_u64 (p0, x0, 0)) + +/* +** ld1uw_vnum_u64_1: +** ld1w z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_u64_1, svuint64_t, uint32_t, + z0 = svld1uw_vnum_u64 (p0, x0, 1), + z0 = svld1uw_vnum_u64 (p0, x0, 1)) + +/* +** ld1uw_vnum_u64_7: +** ld1w z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_u64_7, svuint64_t, uint32_t, + z0 = svld1uw_vnum_u64 (p0, x0, 7), + z0 = svld1uw_vnum_u64 (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld1uw_vnum_u64_8: +** incb x0, all, mul #4 +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_u64_8, svuint64_t, uint32_t, + z0 = svld1uw_vnum_u64 (p0, x0, 8), + z0 = svld1uw_vnum_u64 (p0, x0, 8)) + +/* +** ld1uw_vnum_u64_m1: +** ld1w z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_u64_m1, svuint64_t, uint32_t, + z0 = svld1uw_vnum_u64 (p0, x0, -1), + z0 = svld1uw_vnum_u64 (p0, x0, -1)) + +/* +** ld1uw_vnum_u64_m8: +** ld1w z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_u64_m8, svuint64_t, uint32_t, + z0 = svld1uw_vnum_u64 (p0, x0, -8), + z0 = svld1uw_vnum_u64 (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld1uw_vnum_u64_m9: +** dech x0, all, mul #9 +** ld1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_u64_m9, svuint64_t, uint32_t, + z0 = svld1uw_vnum_u64 (p0, x0, -9), + z0 = svld1uw_vnum_u64 (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld1uw_vnum_u64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld1w z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld1uw_vnum_u64_x1, svuint64_t, uint32_t, + z0 = svld1uw_vnum_u64 (p0, x0, x1), + z0 = svld1uw_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_bf16.c new file mode 100644 index 000000000..5d08c1e6e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_bf16.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_bf16_base: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_bf16_base, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_bf16_index: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld2_bf16_index, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_bf16_1: +** incb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_bf16_1, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 + svcnth ()), + z0 = svld2 (p0, x0 + svcnth ())) + +/* +** ld2_bf16_2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_bf16_2, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 + svcnth () * 2), + z0 = svld2 (p0, x0 + svcnth () * 2)) + +/* +** ld2_bf16_14: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_bf16_14, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 + svcnth () * 14), + z0 = svld2 (p0, x0 + svcnth () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_bf16_16: +** incb x0, all, mul #16 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_bf16_16, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 + svcnth () * 16), + z0 = svld2 (p0, x0 + svcnth () * 16)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_bf16_m1: +** decb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_bf16_m1, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 - svcnth ()), + z0 = svld2 (p0, x0 - svcnth ())) + +/* +** ld2_bf16_m2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_bf16_m2, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 - svcnth () * 2), + z0 = svld2 (p0, x0 - svcnth () * 2)) + +/* +** ld2_bf16_m16: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_bf16_m16, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 - svcnth () * 16), + z0 = svld2 (p0, x0 - svcnth () * 16)) + +/* +** ld2_bf16_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_bf16_m18, svbfloat16x2_t, bfloat16_t, + z0 = svld2_bf16 (p0, x0 - svcnth () * 18), + z0 = svld2 (p0, x0 - svcnth () * 18)) + +/* +** ld2_vnum_bf16_0: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_0, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_bf16_1: +** incb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_1, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_bf16_2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_2, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_bf16_14: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_14, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_bf16_16: +** incb x0, all, mul #16 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_16, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_bf16_m1: +** decb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_m1, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_bf16_m2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_m2, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_bf16_m16: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_m16, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_bf16_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_m18, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld2_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_bf16_x1, svbfloat16x2_t, bfloat16_t, + z0 = svld2_vnum_bf16 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f16.c new file mode 100644 index 000000000..43392b2b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f16.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_f16_base: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f16_base, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_f16_index: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld2_f16_index, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_f16_1: +** incb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f16_1, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 + svcnth ()), + z0 = svld2 (p0, x0 + svcnth ())) + +/* +** ld2_f16_2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f16_2, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 + svcnth () * 2), + z0 = svld2 (p0, x0 + svcnth () * 2)) + +/* +** ld2_f16_14: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f16_14, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 + svcnth () * 14), + z0 = svld2 (p0, x0 + svcnth () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_f16_16: +** incb x0, all, mul #16 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f16_16, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 + svcnth () * 16), + z0 = svld2 (p0, x0 + svcnth () * 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_f16_m1: +** decb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f16_m1, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 - svcnth ()), + z0 = svld2 (p0, x0 - svcnth ())) + +/* +** ld2_f16_m2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f16_m2, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 - svcnth () * 2), + z0 = svld2 (p0, x0 - svcnth () * 2)) + +/* +** ld2_f16_m16: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f16_m16, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 - svcnth () * 16), + z0 = svld2 (p0, x0 - svcnth () * 16)) + +/* +** ld2_f16_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_f16_m18, svfloat16x2_t, float16_t, + z0 = svld2_f16 (p0, x0 - svcnth () * 18), + z0 = svld2 (p0, x0 - svcnth () * 18)) + +/* +** ld2_vnum_f16_0: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_0, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_vnum_f16_1: +** incb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_1, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_f16_2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_2, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_f16_14: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_14, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_f16_16: +** incb x0, all, mul #16 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_16, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_f16_m1: +** decb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_m1, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_f16_m2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_m2, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_f16_m16: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_m16, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_f16_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_m18, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld2_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_f16_x1, svfloat16x2_t, float16_t, + z0 = svld2_vnum_f16 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f32.c new file mode 100644 index 000000000..379145e0c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f32.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_f32_base: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f32_base, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_f32_index: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld2_f32_index, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_f32_1: +** incb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f32_1, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 + svcntw ()), + z0 = svld2 (p0, x0 + svcntw ())) + +/* +** ld2_f32_2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f32_2, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 + svcntw () * 2), + z0 = svld2 (p0, x0 + svcntw () * 2)) + +/* +** ld2_f32_14: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f32_14, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 + svcntw () * 14), + z0 = svld2 (p0, x0 + svcntw () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_f32_16: +** incb x0, all, mul #16 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f32_16, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 + svcntw () * 16), + z0 = svld2 (p0, x0 + svcntw () * 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_f32_m1: +** decb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f32_m1, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 - svcntw ()), + z0 = svld2 (p0, x0 - svcntw ())) + +/* +** ld2_f32_m2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f32_m2, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 - svcntw () * 2), + z0 = svld2 (p0, x0 - svcntw () * 2)) + +/* +** ld2_f32_m16: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f32_m16, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 - svcntw () * 16), + z0 = svld2 (p0, x0 - svcntw () * 16)) + +/* +** ld2_f32_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_f32_m18, svfloat32x2_t, float32_t, + z0 = svld2_f32 (p0, x0 - svcntw () * 18), + z0 = svld2 (p0, x0 - svcntw () * 18)) + +/* +** ld2_vnum_f32_0: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_0, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_f32_1: +** incb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_1, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_f32_2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_2, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_f32_14: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_14, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_f32_16: +** incb x0, all, mul #16 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_16, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_vnum_f32_m1: +** decb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_m1, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_f32_m2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_m2, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_f32_m16: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_m16, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_f32_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_m18, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld2_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_f32_x1, svfloat32x2_t, float32_t, + z0 = svld2_vnum_f32 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f64.c new file mode 100644 index 000000000..1911612c6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_f64.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_f64_base: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f64_base, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_f64_index: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld2_f64_index, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_f64_1: +** incb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f64_1, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 + svcntd ()), + z0 = svld2 (p0, x0 + svcntd ())) + +/* +** ld2_f64_2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f64_2, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 + svcntd () * 2), + z0 = svld2 (p0, x0 + svcntd () * 2)) + +/* +** ld2_f64_14: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f64_14, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 + svcntd () * 14), + z0 = svld2 (p0, x0 + svcntd () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_f64_16: +** incb x0, all, mul #16 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f64_16, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 + svcntd () * 16), + z0 = svld2 (p0, x0 + svcntd () * 16)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_f64_m1: +** decb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_f64_m1, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 - svcntd ()), + z0 = svld2 (p0, x0 - svcntd ())) + +/* +** ld2_f64_m2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f64_m2, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 - svcntd () * 2), + z0 = svld2 (p0, x0 - svcntd () * 2)) + +/* +** ld2_f64_m16: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_f64_m16, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 - svcntd () * 16), + z0 = svld2 (p0, x0 - svcntd () * 16)) + +/* +** ld2_f64_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_f64_m18, svfloat64x2_t, float64_t, + z0 = svld2_f64 (p0, x0 - svcntd () * 18), + z0 = svld2 (p0, x0 - svcntd () * 18)) + +/* +** ld2_vnum_f64_0: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_0, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_f64_1: +** incb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_1, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_f64_2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_2, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_f64_14: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_14, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_f64_16: +** incb x0, all, mul #16 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_16, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_f64_m1: +** decb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_m1, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_f64_m2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_m2, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_f64_m16: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_m16, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_f64_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_m18, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld2_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_f64_x1, svfloat64x2_t, float64_t, + z0 = svld2_vnum_f64 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s16.c new file mode 100644 index 000000000..90677d837 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s16.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_s16_base: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s16_base, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_s16_index: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld2_s16_index, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s16_1: +** incb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s16_1, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 + svcnth ()), + z0 = svld2 (p0, x0 + svcnth ())) + +/* +** ld2_s16_2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s16_2, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 + svcnth () * 2), + z0 = svld2 (p0, x0 + svcnth () * 2)) + +/* +** ld2_s16_14: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s16_14, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 + svcnth () * 14), + z0 = svld2 (p0, x0 + svcnth () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s16_16: +** incb x0, all, mul #16 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s16_16, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 + svcnth () * 16), + z0 = svld2 (p0, x0 + svcnth () * 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s16_m1: +** decb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s16_m1, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 - svcnth ()), + z0 = svld2 (p0, x0 - svcnth ())) + +/* +** ld2_s16_m2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s16_m2, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 - svcnth () * 2), + z0 = svld2 (p0, x0 - svcnth () * 2)) + +/* +** ld2_s16_m16: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s16_m16, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 - svcnth () * 16), + z0 = svld2 (p0, x0 - svcnth () * 16)) + +/* +** ld2_s16_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_s16_m18, svint16x2_t, int16_t, + z0 = svld2_s16 (p0, x0 - svcnth () * 18), + z0 = svld2 (p0, x0 - svcnth () * 18)) + +/* +** ld2_vnum_s16_0: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_0, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_vnum_s16_1: +** incb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_1, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_s16_2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_2, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_s16_14: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_14, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s16_16: +** incb x0, all, mul #16 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_16, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s16_m1: +** decb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_m1, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_s16_m2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_m2, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_s16_m16: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_m16, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_s16_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_m18, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld2_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_s16_x1, svint16x2_t, int16_t, + z0 = svld2_vnum_s16 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s32.c new file mode 100644 index 000000000..10913c2d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s32.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_s32_base: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s32_base, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_s32_index: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld2_s32_index, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_s32_1: +** incb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s32_1, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 + svcntw ()), + z0 = svld2 (p0, x0 + svcntw ())) + +/* +** ld2_s32_2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s32_2, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 + svcntw () * 2), + z0 = svld2 (p0, x0 + svcntw () * 2)) + +/* +** ld2_s32_14: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s32_14, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 + svcntw () * 14), + z0 = svld2 (p0, x0 + svcntw () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s32_16: +** incb x0, all, mul #16 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s32_16, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 + svcntw () * 16), + z0 = svld2 (p0, x0 + svcntw () * 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s32_m1: +** decb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s32_m1, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 - svcntw ()), + z0 = svld2 (p0, x0 - svcntw ())) + +/* +** ld2_s32_m2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s32_m2, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 - svcntw () * 2), + z0 = svld2 (p0, x0 - svcntw () * 2)) + +/* +** ld2_s32_m16: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s32_m16, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 - svcntw () * 16), + z0 = svld2 (p0, x0 - svcntw () * 16)) + +/* +** ld2_s32_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_s32_m18, svint32x2_t, int32_t, + z0 = svld2_s32 (p0, x0 - svcntw () * 18), + z0 = svld2 (p0, x0 - svcntw () * 18)) + +/* +** ld2_vnum_s32_0: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_0, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s32_1: +** incb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_1, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_s32_2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_2, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_s32_14: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_14, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s32_16: +** incb x0, all, mul #16 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_16, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_vnum_s32_m1: +** decb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_m1, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_s32_m2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_m2, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_s32_m16: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_m16, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_s32_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_m18, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld2_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_s32_x1, svint32x2_t, int32_t, + z0 = svld2_vnum_s32 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s64.c new file mode 100644 index 000000000..9a43e86d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s64.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_s64_base: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s64_base, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_s64_index: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld2_s64_index, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s64_1: +** incb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s64_1, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 + svcntd ()), + z0 = svld2 (p0, x0 + svcntd ())) + +/* +** ld2_s64_2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s64_2, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 + svcntd () * 2), + z0 = svld2 (p0, x0 + svcntd () * 2)) + +/* +** ld2_s64_14: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s64_14, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 + svcntd () * 14), + z0 = svld2 (p0, x0 + svcntd () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s64_16: +** incb x0, all, mul #16 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s64_16, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 + svcntd () * 16), + z0 = svld2 (p0, x0 + svcntd () * 16)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_s64_m1: +** decb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s64_m1, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 - svcntd ()), + z0 = svld2 (p0, x0 - svcntd ())) + +/* +** ld2_s64_m2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s64_m2, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 - svcntd () * 2), + z0 = svld2 (p0, x0 - svcntd () * 2)) + +/* +** ld2_s64_m16: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s64_m16, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 - svcntd () * 16), + z0 = svld2 (p0, x0 - svcntd () * 16)) + +/* +** ld2_s64_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_s64_m18, svint64x2_t, int64_t, + z0 = svld2_s64 (p0, x0 - svcntd () * 18), + z0 = svld2 (p0, x0 - svcntd () * 18)) + +/* +** ld2_vnum_s64_0: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_0, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s64_1: +** incb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_1, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_s64_2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_2, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_s64_14: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_14, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s64_16: +** incb x0, all, mul #16 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_16, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s64_m1: +** decb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_m1, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_s64_m2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_m2, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_s64_m16: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_m16, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_s64_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_m18, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld2_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_s64_x1, svint64x2_t, int64_t, + z0 = svld2_vnum_s64 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s8.c new file mode 100644 index 000000000..af5c04c66 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_s8.c @@ -0,0 +1,204 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_s8_base: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s8_base, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_s8_index: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld2_s8_index, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s8_1: +** incb x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s8_1, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 + svcntb ()), + z0 = svld2 (p0, x0 + svcntb ())) + +/* +** ld2_s8_2: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s8_2, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 + svcntb () * 2), + z0 = svld2 (p0, x0 + svcntb () * 2)) + +/* +** ld2_s8_14: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s8_14, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 + svcntb () * 14), + z0 = svld2 (p0, x0 + svcntb () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s8_16: +** incb x0, all, mul #16 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s8_16, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 + svcntb () * 16), + z0 = svld2 (p0, x0 + svcntb () * 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_s8_m1: +** decb x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_s8_m1, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 - svcntb ()), + z0 = svld2 (p0, x0 - svcntb ())) + +/* +** ld2_s8_m2: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s8_m2, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 - svcntb () * 2), + z0 = svld2 (p0, x0 - svcntb () * 2)) + +/* +** ld2_s8_m16: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_s8_m16, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 - svcntb () * 16), + z0 = svld2 (p0, x0 - svcntb () * 16)) + +/* +** ld2_s8_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_s8_m18, svint8x2_t, int8_t, + z0 = svld2_s8 (p0, x0 - svcntb () * 18), + z0 = svld2 (p0, x0 - svcntb () * 18)) + +/* +** ld2_vnum_s8_0: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_0, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_vnum_s8_1: +** incb x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_1, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_s8_2: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_2, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_s8_14: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_14, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s8_16: +** incb x0, all, mul #16 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_16, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_s8_m1: +** decb x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_m1, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_s8_m2: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_m2, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_s8_m16: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_m16, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_s8_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_s8_m18, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* +** ld2_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld2_vnum_s8_x1, svint8x2_t, int8_t, + z0 = svld2_vnum_s8 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u16.c new file mode 100644 index 000000000..6c33322c1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u16.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_u16_base: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u16_base, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_u16_index: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld2_u16_index, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_u16_1: +** incb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u16_1, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 + svcnth ()), + z0 = svld2 (p0, x0 + svcnth ())) + +/* +** ld2_u16_2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u16_2, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 + svcnth () * 2), + z0 = svld2 (p0, x0 + svcnth () * 2)) + +/* +** ld2_u16_14: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u16_14, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 + svcnth () * 14), + z0 = svld2 (p0, x0 + svcnth () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u16_16: +** incb x0, all, mul #16 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u16_16, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 + svcnth () * 16), + z0 = svld2 (p0, x0 + svcnth () * 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u16_m1: +** decb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u16_m1, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 - svcnth ()), + z0 = svld2 (p0, x0 - svcnth ())) + +/* +** ld2_u16_m2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u16_m2, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 - svcnth () * 2), + z0 = svld2 (p0, x0 - svcnth () * 2)) + +/* +** ld2_u16_m16: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u16_m16, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 - svcnth () * 16), + z0 = svld2 (p0, x0 - svcnth () * 16)) + +/* +** ld2_u16_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_u16_m18, svuint16x2_t, uint16_t, + z0 = svld2_u16 (p0, x0 - svcnth () * 18), + z0 = svld2 (p0, x0 - svcnth () * 18)) + +/* +** ld2_vnum_u16_0: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_0, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u16_1: +** incb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_1, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_u16_2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_2, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_u16_14: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_14, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u16_16: +** incb x0, all, mul #16 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_16, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_vnum_u16_m1: +** decb x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_m1, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_u16_m2: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_m2, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_u16_m16: +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_m16, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_u16_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_m18, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld2_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_u16_x1, svuint16x2_t, uint16_t, + z0 = svld2_vnum_u16 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u32.c new file mode 100644 index 000000000..84a23cf47 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u32.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_u32_base: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u32_base, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_u32_index: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld2_u32_index, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u32_1: +** incb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u32_1, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 + svcntw ()), + z0 = svld2 (p0, x0 + svcntw ())) + +/* +** ld2_u32_2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u32_2, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 + svcntw () * 2), + z0 = svld2 (p0, x0 + svcntw () * 2)) + +/* +** ld2_u32_14: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u32_14, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 + svcntw () * 14), + z0 = svld2 (p0, x0 + svcntw () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u32_16: +** incb x0, all, mul #16 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u32_16, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 + svcntw () * 16), + z0 = svld2 (p0, x0 + svcntw () * 16)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_u32_m1: +** decb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u32_m1, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 - svcntw ()), + z0 = svld2 (p0, x0 - svcntw ())) + +/* +** ld2_u32_m2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u32_m2, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 - svcntw () * 2), + z0 = svld2 (p0, x0 - svcntw () * 2)) + +/* +** ld2_u32_m16: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u32_m16, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 - svcntw () * 16), + z0 = svld2 (p0, x0 - svcntw () * 16)) + +/* +** ld2_u32_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_u32_m18, svuint32x2_t, uint32_t, + z0 = svld2_u32 (p0, x0 - svcntw () * 18), + z0 = svld2 (p0, x0 - svcntw () * 18)) + +/* +** ld2_vnum_u32_0: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_0, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u32_1: +** incb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_1, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_u32_2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_2, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_u32_14: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_14, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u32_16: +** incb x0, all, mul #16 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_16, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u32_m1: +** decb x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_m1, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_u32_m2: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_m2, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_u32_m16: +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_m16, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_u32_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_m18, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld2_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2w {z0\.s(?: - |, )z1\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_u32_x1, svuint32x2_t, uint32_t, + z0 = svld2_vnum_u32 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u64.c new file mode 100644 index 000000000..350b05792 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u64.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_u64_base: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u64_base, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_u64_index: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld2_u64_index, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u64_1: +** incb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u64_1, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 + svcntd ()), + z0 = svld2 (p0, x0 + svcntd ())) + +/* +** ld2_u64_2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u64_2, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 + svcntd () * 2), + z0 = svld2 (p0, x0 + svcntd () * 2)) + +/* +** ld2_u64_14: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u64_14, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 + svcntd () * 14), + z0 = svld2 (p0, x0 + svcntd () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u64_16: +** incb x0, all, mul #16 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u64_16, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 + svcntd () * 16), + z0 = svld2 (p0, x0 + svcntd () * 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u64_m1: +** decb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u64_m1, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 - svcntd ()), + z0 = svld2 (p0, x0 - svcntd ())) + +/* +** ld2_u64_m2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u64_m2, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 - svcntd () * 2), + z0 = svld2 (p0, x0 - svcntd () * 2)) + +/* +** ld2_u64_m16: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u64_m16, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 - svcntd () * 16), + z0 = svld2 (p0, x0 - svcntd () * 16)) + +/* +** ld2_u64_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_u64_m18, svuint64x2_t, uint64_t, + z0 = svld2_u64 (p0, x0 - svcntd () * 18), + z0 = svld2 (p0, x0 - svcntd () * 18)) + +/* +** ld2_vnum_u64_0: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_0, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_vnum_u64_1: +** incb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_1, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_u64_2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_2, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_u64_14: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_14, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u64_16: +** incb x0, all, mul #16 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_16, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u64_m1: +** decb x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_m1, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_u64_m2: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_m2, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_u64_m16: +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_m16, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_u64_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_m18, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld2_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld2d {z0\.d(?: - |, )z1\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld2_vnum_u64_x1, svuint64x2_t, uint64_t, + z0 = svld2_vnum_u64 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u8.c new file mode 100644 index 000000000..e67634c4c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld2_u8.c @@ -0,0 +1,204 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld2_u8_base: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u8_base, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0), + z0 = svld2 (p0, x0)) + +/* +** ld2_u8_index: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld2_u8_index, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 + x1), + z0 = svld2 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_u8_1: +** incb x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u8_1, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 + svcntb ()), + z0 = svld2 (p0, x0 + svcntb ())) + +/* +** ld2_u8_2: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u8_2, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 + svcntb () * 2), + z0 = svld2 (p0, x0 + svcntb () * 2)) + +/* +** ld2_u8_14: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u8_14, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 + svcntb () * 14), + z0 = svld2 (p0, x0 + svcntb () * 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u8_16: +** incb x0, all, mul #16 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u8_16, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 + svcntb () * 16), + z0 = svld2 (p0, x0 + svcntb () * 16)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_u8_m1: +** decb x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_u8_m1, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 - svcntb ()), + z0 = svld2 (p0, x0 - svcntb ())) + +/* +** ld2_u8_m2: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u8_m2, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 - svcntb () * 2), + z0 = svld2 (p0, x0 - svcntb () * 2)) + +/* +** ld2_u8_m16: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_u8_m16, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 - svcntb () * 16), + z0 = svld2 (p0, x0 - svcntb () * 16)) + +/* +** ld2_u8_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_u8_m18, svuint8x2_t, uint8_t, + z0 = svld2_u8 (p0, x0 - svcntb () * 18), + z0 = svld2 (p0, x0 - svcntb () * 18)) + +/* +** ld2_vnum_u8_0: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_0, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, 0), + z0 = svld2_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u8_1: +** incb x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_1, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, 1), + z0 = svld2_vnum (p0, x0, 1)) + +/* +** ld2_vnum_u8_2: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_2, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, 2), + z0 = svld2_vnum (p0, x0, 2)) + +/* +** ld2_vnum_u8_14: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #14, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_14, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, 14), + z0 = svld2_vnum (p0, x0, 14)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld2_vnum_u8_16: +** incb x0, all, mul #16 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_16, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, 16), + z0 = svld2_vnum (p0, x0, 16)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld2_vnum_u8_m1: +** decb x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_m1, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, -1), + z0 = svld2_vnum (p0, x0, -1)) + +/* +** ld2_vnum_u8_m2: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-2, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_m2, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, -2), + z0 = svld2_vnum (p0, x0, -2)) + +/* +** ld2_vnum_u8_m16: +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, #-16, mul vl\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_m16, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, -16), + z0 = svld2_vnum (p0, x0, -16)) + +/* +** ld2_vnum_u8_m18: +** addvl (x[0-9]+), x0, #-18 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld2_vnum_u8_m18, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, -18), + z0 = svld2_vnum (p0, x0, -18)) + +/* +** ld2_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld2b {z0\.b(?: - |, )z1\.b}, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld2_vnum_u8_x1, svuint8x2_t, uint8_t, + z0 = svld2_vnum_u8 (p0, x0, x1), + z0 = svld2_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_bf16.c new file mode 100644 index 000000000..e0b4fb1af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_bf16.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_bf16_base: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_bf16_base, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_bf16_index: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld3_bf16_index, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_bf16_1: +** incb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_bf16_1, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 + svcnth ()), + z0 = svld3 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_bf16_2: +** incb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_bf16_2, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 + svcnth () * 2), + z0 = svld3 (p0, x0 + svcnth () * 2)) + +/* +** ld3_bf16_3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_bf16_3, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 + svcnth () * 3), + z0 = svld3 (p0, x0 + svcnth () * 3)) + +/* +** ld3_bf16_21: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_bf16_21, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 + svcnth () * 21), + z0 = svld3 (p0, x0 + svcnth () * 21)) + +/* +** ld3_bf16_24: +** addvl (x[0-9]+), x0, #24 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_bf16_24, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 + svcnth () * 24), + z0 = svld3 (p0, x0 + svcnth () * 24)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_bf16_m1: +** decb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_bf16_m1, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 - svcnth ()), + z0 = svld3 (p0, x0 - svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_bf16_m2: +** decb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_bf16_m2, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 - svcnth () * 2), + z0 = svld3 (p0, x0 - svcnth () * 2)) + +/* +** ld3_bf16_m3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_bf16_m3, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 - svcnth () * 3), + z0 = svld3 (p0, x0 - svcnth () * 3)) + +/* +** ld3_bf16_m24: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_bf16_m24, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 - svcnth () * 24), + z0 = svld3 (p0, x0 - svcnth () * 24)) + +/* +** ld3_bf16_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_bf16_m27, svbfloat16x3_t, bfloat16_t, + z0 = svld3_bf16 (p0, x0 - svcnth () * 27), + z0 = svld3 (p0, x0 - svcnth () * 27)) + +/* +** ld3_vnum_bf16_0: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_0, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_bf16_1: +** incb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_1, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_bf16_2: +** incb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_2, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_bf16_3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_3, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_bf16_21: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_21, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_bf16_24: +** addvl (x[0-9]+), x0, #24 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_24, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_bf16_m1: +** decb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_m1, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_bf16_m2: +** decb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_m2, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_bf16_m3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_m3, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_bf16_m24: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_m24, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_bf16_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_m27, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld3_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_bf16_x1, svbfloat16x3_t, bfloat16_t, + z0 = svld3_vnum_bf16 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f16.c new file mode 100644 index 000000000..3d7777e52 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f16.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_f16_base: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f16_base, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_f16_index: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld3_f16_index, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f16_1: +** incb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f16_1, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 + svcnth ()), + z0 = svld3 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f16_2: +** incb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f16_2, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 + svcnth () * 2), + z0 = svld3 (p0, x0 + svcnth () * 2)) + +/* +** ld3_f16_3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f16_3, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 + svcnth () * 3), + z0 = svld3 (p0, x0 + svcnth () * 3)) + +/* +** ld3_f16_21: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f16_21, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 + svcnth () * 21), + z0 = svld3 (p0, x0 + svcnth () * 21)) + +/* +** ld3_f16_24: +** addvl (x[0-9]+), x0, #24 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_f16_24, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 + svcnth () * 24), + z0 = svld3 (p0, x0 + svcnth () * 24)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_f16_m1: +** decb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f16_m1, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 - svcnth ()), + z0 = svld3 (p0, x0 - svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f16_m2: +** decb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f16_m2, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 - svcnth () * 2), + z0 = svld3 (p0, x0 - svcnth () * 2)) + +/* +** ld3_f16_m3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f16_m3, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 - svcnth () * 3), + z0 = svld3 (p0, x0 - svcnth () * 3)) + +/* +** ld3_f16_m24: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f16_m24, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 - svcnth () * 24), + z0 = svld3 (p0, x0 - svcnth () * 24)) + +/* +** ld3_f16_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_f16_m27, svfloat16x3_t, float16_t, + z0 = svld3_f16 (p0, x0 - svcnth () * 27), + z0 = svld3 (p0, x0 - svcnth () * 27)) + +/* +** ld3_vnum_f16_0: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_0, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f16_1: +** incb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_1, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f16_2: +** incb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_2, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_f16_3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_3, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_f16_21: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_21, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_f16_24: +** addvl (x[0-9]+), x0, #24 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_24, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f16_m1: +** decb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_m1, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_f16_m2: +** decb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_m2, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_f16_m3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_m3, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_f16_m24: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_m24, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_f16_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_m27, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld3_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_f16_x1, svfloat16x3_t, float16_t, + z0 = svld3_vnum_f16 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f32.c new file mode 100644 index 000000000..4e4ad7521 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f32.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_f32_base: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f32_base, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_f32_index: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld3_f32_index, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f32_1: +** incb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f32_1, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 + svcntw ()), + z0 = svld3 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f32_2: +** incb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f32_2, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 + svcntw () * 2), + z0 = svld3 (p0, x0 + svcntw () * 2)) + +/* +** ld3_f32_3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f32_3, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 + svcntw () * 3), + z0 = svld3 (p0, x0 + svcntw () * 3)) + +/* +** ld3_f32_21: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f32_21, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 + svcntw () * 21), + z0 = svld3 (p0, x0 + svcntw () * 21)) + +/* +** ld3_f32_24: +** addvl (x[0-9]+), x0, #24 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_f32_24, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 + svcntw () * 24), + z0 = svld3 (p0, x0 + svcntw () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f32_m1: +** decb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f32_m1, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 - svcntw ()), + z0 = svld3 (p0, x0 - svcntw ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_f32_m2: +** decb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f32_m2, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 - svcntw () * 2), + z0 = svld3 (p0, x0 - svcntw () * 2)) + +/* +** ld3_f32_m3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f32_m3, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 - svcntw () * 3), + z0 = svld3 (p0, x0 - svcntw () * 3)) + +/* +** ld3_f32_m24: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f32_m24, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 - svcntw () * 24), + z0 = svld3 (p0, x0 - svcntw () * 24)) + +/* +** ld3_f32_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_f32_m27, svfloat32x3_t, float32_t, + z0 = svld3_f32 (p0, x0 - svcntw () * 27), + z0 = svld3 (p0, x0 - svcntw () * 27)) + +/* +** ld3_vnum_f32_0: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_0, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f32_1: +** incb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_1, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f32_2: +** incb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_2, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_f32_3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_3, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_f32_21: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_21, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_f32_24: +** addvl (x[0-9]+), x0, #24 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_24, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f32_m1: +** decb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_m1, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_f32_m2: +** decb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_m2, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_f32_m3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_m3, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_f32_m24: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_m24, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_f32_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_m27, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld3_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_f32_x1, svfloat32x3_t, float32_t, + z0 = svld3_vnum_f32 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f64.c new file mode 100644 index 000000000..7e6e1e749 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_f64.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_f64_base: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f64_base, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_f64_index: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld3_f64_index, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f64_1: +** incb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f64_1, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 + svcntd ()), + z0 = svld3 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f64_2: +** incb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f64_2, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 + svcntd () * 2), + z0 = svld3 (p0, x0 + svcntd () * 2)) + +/* +** ld3_f64_3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f64_3, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 + svcntd () * 3), + z0 = svld3 (p0, x0 + svcntd () * 3)) + +/* +** ld3_f64_21: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f64_21, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 + svcntd () * 21), + z0 = svld3 (p0, x0 + svcntd () * 21)) + +/* +** ld3_f64_24: +** addvl (x[0-9]+), x0, #24 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_f64_24, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 + svcntd () * 24), + z0 = svld3 (p0, x0 + svcntd () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_f64_m1: +** decb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f64_m1, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 - svcntd ()), + z0 = svld3 (p0, x0 - svcntd ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_f64_m2: +** decb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_f64_m2, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 - svcntd () * 2), + z0 = svld3 (p0, x0 - svcntd () * 2)) + +/* +** ld3_f64_m3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f64_m3, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 - svcntd () * 3), + z0 = svld3 (p0, x0 - svcntd () * 3)) + +/* +** ld3_f64_m24: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_f64_m24, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 - svcntd () * 24), + z0 = svld3 (p0, x0 - svcntd () * 24)) + +/* +** ld3_f64_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_f64_m27, svfloat64x3_t, float64_t, + z0 = svld3_f64 (p0, x0 - svcntd () * 27), + z0 = svld3 (p0, x0 - svcntd () * 27)) + +/* +** ld3_vnum_f64_0: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_0, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f64_1: +** incb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_1, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f64_2: +** incb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_2, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_f64_3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_3, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_f64_21: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_21, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_f64_24: +** addvl (x[0-9]+), x0, #24 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_24, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_f64_m1: +** decb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_m1, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_f64_m2: +** decb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_m2, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_f64_m3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_m3, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_f64_m24: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_m24, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_f64_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_m27, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld3_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_f64_x1, svfloat64x3_t, float64_t, + z0 = svld3_vnum_f64 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s16.c new file mode 100644 index 000000000..d4a046c64 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s16.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_s16_base: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s16_base, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_s16_index: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld3_s16_index, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s16_1: +** incb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s16_1, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 + svcnth ()), + z0 = svld3 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s16_2: +** incb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s16_2, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 + svcnth () * 2), + z0 = svld3 (p0, x0 + svcnth () * 2)) + +/* +** ld3_s16_3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s16_3, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 + svcnth () * 3), + z0 = svld3 (p0, x0 + svcnth () * 3)) + +/* +** ld3_s16_21: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s16_21, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 + svcnth () * 21), + z0 = svld3 (p0, x0 + svcnth () * 21)) + +/* +** ld3_s16_24: +** addvl (x[0-9]+), x0, #24 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_s16_24, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 + svcnth () * 24), + z0 = svld3 (p0, x0 + svcnth () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s16_m1: +** decb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s16_m1, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 - svcnth ()), + z0 = svld3 (p0, x0 - svcnth ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_s16_m2: +** decb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s16_m2, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 - svcnth () * 2), + z0 = svld3 (p0, x0 - svcnth () * 2)) + +/* +** ld3_s16_m3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s16_m3, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 - svcnth () * 3), + z0 = svld3 (p0, x0 - svcnth () * 3)) + +/* +** ld3_s16_m24: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s16_m24, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 - svcnth () * 24), + z0 = svld3 (p0, x0 - svcnth () * 24)) + +/* +** ld3_s16_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_s16_m27, svint16x3_t, int16_t, + z0 = svld3_s16 (p0, x0 - svcnth () * 27), + z0 = svld3 (p0, x0 - svcnth () * 27)) + +/* +** ld3_vnum_s16_0: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_0, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s16_1: +** incb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_1, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s16_2: +** incb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_2, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_s16_3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_3, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_s16_21: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_21, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_s16_24: +** addvl (x[0-9]+), x0, #24 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_24, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s16_m1: +** decb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_m1, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s16_m2: +** decb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_m2, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_s16_m3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_m3, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_s16_m24: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_m24, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_s16_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_m27, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld3_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_s16_x1, svint16x3_t, int16_t, + z0 = svld3_vnum_s16 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s32.c new file mode 100644 index 000000000..3b0ba6e2a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s32.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_s32_base: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s32_base, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_s32_index: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld3_s32_index, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s32_1: +** incb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s32_1, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 + svcntw ()), + z0 = svld3 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s32_2: +** incb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s32_2, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 + svcntw () * 2), + z0 = svld3 (p0, x0 + svcntw () * 2)) + +/* +** ld3_s32_3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s32_3, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 + svcntw () * 3), + z0 = svld3 (p0, x0 + svcntw () * 3)) + +/* +** ld3_s32_21: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s32_21, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 + svcntw () * 21), + z0 = svld3 (p0, x0 + svcntw () * 21)) + +/* +** ld3_s32_24: +** addvl (x[0-9]+), x0, #24 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_s32_24, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 + svcntw () * 24), + z0 = svld3 (p0, x0 + svcntw () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s32_m1: +** decb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s32_m1, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 - svcntw ()), + z0 = svld3 (p0, x0 - svcntw ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_s32_m2: +** decb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s32_m2, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 - svcntw () * 2), + z0 = svld3 (p0, x0 - svcntw () * 2)) + +/* +** ld3_s32_m3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s32_m3, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 - svcntw () * 3), + z0 = svld3 (p0, x0 - svcntw () * 3)) + +/* +** ld3_s32_m24: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s32_m24, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 - svcntw () * 24), + z0 = svld3 (p0, x0 - svcntw () * 24)) + +/* +** ld3_s32_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_s32_m27, svint32x3_t, int32_t, + z0 = svld3_s32 (p0, x0 - svcntw () * 27), + z0 = svld3 (p0, x0 - svcntw () * 27)) + +/* +** ld3_vnum_s32_0: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_0, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s32_1: +** incb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_1, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s32_2: +** incb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_2, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_s32_3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_3, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_s32_21: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_21, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_s32_24: +** addvl (x[0-9]+), x0, #24 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_24, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s32_m1: +** decb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_m1, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s32_m2: +** decb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_m2, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_s32_m3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_m3, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_s32_m24: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_m24, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_s32_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_m27, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld3_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_s32_x1, svint32x3_t, int32_t, + z0 = svld3_vnum_s32 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s64.c new file mode 100644 index 000000000..080a10b8f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s64.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_s64_base: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s64_base, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_s64_index: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld3_s64_index, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s64_1: +** incb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s64_1, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 + svcntd ()), + z0 = svld3 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s64_2: +** incb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s64_2, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 + svcntd () * 2), + z0 = svld3 (p0, x0 + svcntd () * 2)) + +/* +** ld3_s64_3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s64_3, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 + svcntd () * 3), + z0 = svld3 (p0, x0 + svcntd () * 3)) + +/* +** ld3_s64_21: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s64_21, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 + svcntd () * 21), + z0 = svld3 (p0, x0 + svcntd () * 21)) + +/* +** ld3_s64_24: +** addvl (x[0-9]+), x0, #24 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_s64_24, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 + svcntd () * 24), + z0 = svld3 (p0, x0 + svcntd () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s64_m1: +** decb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s64_m1, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 - svcntd ()), + z0 = svld3 (p0, x0 - svcntd ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_s64_m2: +** decb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s64_m2, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 - svcntd () * 2), + z0 = svld3 (p0, x0 - svcntd () * 2)) + +/* +** ld3_s64_m3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s64_m3, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 - svcntd () * 3), + z0 = svld3 (p0, x0 - svcntd () * 3)) + +/* +** ld3_s64_m24: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s64_m24, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 - svcntd () * 24), + z0 = svld3 (p0, x0 - svcntd () * 24)) + +/* +** ld3_s64_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_s64_m27, svint64x3_t, int64_t, + z0 = svld3_s64 (p0, x0 - svcntd () * 27), + z0 = svld3 (p0, x0 - svcntd () * 27)) + +/* +** ld3_vnum_s64_0: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_0, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s64_1: +** incb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_1, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s64_2: +** incb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_2, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_s64_3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_3, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_s64_21: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_21, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_s64_24: +** addvl (x[0-9]+), x0, #24 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_24, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s64_m1: +** decb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_m1, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s64_m2: +** decb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_m2, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_s64_m3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_m3, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_s64_m24: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_m24, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_s64_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_m27, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ld3_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_s64_x1, svint64x3_t, int64_t, + z0 = svld3_vnum_s64 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s8.c new file mode 100644 index 000000000..e0c551472 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_s8.c @@ -0,0 +1,246 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_s8_base: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s8_base, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_s8_index: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld3_s8_index, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s8_1: +** incb x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s8_1, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 + svcntb ()), + z0 = svld3 (p0, x0 + svcntb ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s8_2: +** incb x0, all, mul #2 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s8_2, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 + svcntb () * 2), + z0 = svld3 (p0, x0 + svcntb () * 2)) + +/* +** ld3_s8_3: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s8_3, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 + svcntb () * 3), + z0 = svld3 (p0, x0 + svcntb () * 3)) + +/* +** ld3_s8_21: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s8_21, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 + svcntb () * 21), + z0 = svld3 (p0, x0 + svcntb () * 21)) + +/* +** ld3_s8_24: +** addvl (x[0-9]+), x0, #24 +** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_s8_24, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 + svcntb () * 24), + z0 = svld3 (p0, x0 + svcntb () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_s8_m1: +** decb x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s8_m1, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 - svcntb ()), + z0 = svld3 (p0, x0 - svcntb ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_s8_m2: +** decb x0, all, mul #2 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_s8_m2, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 - svcntb () * 2), + z0 = svld3 (p0, x0 - svcntb () * 2)) + +/* +** ld3_s8_m3: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s8_m3, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 - svcntb () * 3), + z0 = svld3 (p0, x0 - svcntb () * 3)) + +/* +** ld3_s8_m24: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_s8_m24, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 - svcntb () * 24), + z0 = svld3 (p0, x0 - svcntb () * 24)) + +/* +** ld3_s8_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_s8_m27, svint8x3_t, int8_t, + z0 = svld3_s8 (p0, x0 - svcntb () * 27), + z0 = svld3 (p0, x0 - svcntb () * 27)) + +/* +** ld3_vnum_s8_0: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_0, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s8_1: +** incb x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_1, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s8_2: +** incb x0, all, mul #2 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_2, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_s8_3: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_3, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_s8_21: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_21, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_s8_24: +** addvl (x[0-9]+), x0, #24 +** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_24, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_s8_m1: +** decb x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_m1, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_s8_m2: +** decb x0, all, mul #2 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_m2, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_s8_m3: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_m3, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_s8_m24: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_m24, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_s8_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_s8_m27, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* +** ld3_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld3_vnum_s8_x1, svint8x3_t, int8_t, + z0 = svld3_vnum_s8 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u16.c new file mode 100644 index 000000000..12f6dd092 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u16.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_u16_base: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u16_base, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_u16_index: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld3_u16_index, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u16_1: +** incb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u16_1, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 + svcnth ()), + z0 = svld3 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u16_2: +** incb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u16_2, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 + svcnth () * 2), + z0 = svld3 (p0, x0 + svcnth () * 2)) + +/* +** ld3_u16_3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u16_3, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 + svcnth () * 3), + z0 = svld3 (p0, x0 + svcnth () * 3)) + +/* +** ld3_u16_21: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u16_21, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 + svcnth () * 21), + z0 = svld3 (p0, x0 + svcnth () * 21)) + +/* +** ld3_u16_24: +** addvl (x[0-9]+), x0, #24 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_u16_24, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 + svcnth () * 24), + z0 = svld3 (p0, x0 + svcnth () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u16_m1: +** decb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u16_m1, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 - svcnth ()), + z0 = svld3 (p0, x0 - svcnth ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_u16_m2: +** decb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u16_m2, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 - svcnth () * 2), + z0 = svld3 (p0, x0 - svcnth () * 2)) + +/* +** ld3_u16_m3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u16_m3, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 - svcnth () * 3), + z0 = svld3 (p0, x0 - svcnth () * 3)) + +/* +** ld3_u16_m24: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u16_m24, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 - svcnth () * 24), + z0 = svld3 (p0, x0 - svcnth () * 24)) + +/* +** ld3_u16_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_u16_m27, svuint16x3_t, uint16_t, + z0 = svld3_u16 (p0, x0 - svcnth () * 27), + z0 = svld3 (p0, x0 - svcnth () * 27)) + +/* +** ld3_vnum_u16_0: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_0, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u16_1: +** incb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_1, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u16_2: +** incb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_2, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_u16_3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_3, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_u16_21: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_21, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_u16_24: +** addvl (x[0-9]+), x0, #24 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_24, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u16_m1: +** decb x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_m1, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_u16_m2: +** decb x0, all, mul #2 +** ld3h {z0\.h - z2\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_m2, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_u16_m3: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_m3, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_u16_m24: +** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_m24, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_u16_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3h {z0\.h - z2\.h}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_m27, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld3_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3h {z0\.h - z2\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_u16_x1, svuint16x3_t, uint16_t, + z0 = svld3_vnum_u16 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u32.c new file mode 100644 index 000000000..ffc6edfdc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u32.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_u32_base: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u32_base, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_u32_index: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld3_u32_index, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u32_1: +** incb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u32_1, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 + svcntw ()), + z0 = svld3 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u32_2: +** incb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u32_2, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 + svcntw () * 2), + z0 = svld3 (p0, x0 + svcntw () * 2)) + +/* +** ld3_u32_3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u32_3, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 + svcntw () * 3), + z0 = svld3 (p0, x0 + svcntw () * 3)) + +/* +** ld3_u32_21: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u32_21, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 + svcntw () * 21), + z0 = svld3 (p0, x0 + svcntw () * 21)) + +/* +** ld3_u32_24: +** addvl (x[0-9]+), x0, #24 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_u32_24, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 + svcntw () * 24), + z0 = svld3 (p0, x0 + svcntw () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u32_m1: +** decb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u32_m1, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 - svcntw ()), + z0 = svld3 (p0, x0 - svcntw ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_u32_m2: +** decb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u32_m2, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 - svcntw () * 2), + z0 = svld3 (p0, x0 - svcntw () * 2)) + +/* +** ld3_u32_m3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u32_m3, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 - svcntw () * 3), + z0 = svld3 (p0, x0 - svcntw () * 3)) + +/* +** ld3_u32_m24: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u32_m24, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 - svcntw () * 24), + z0 = svld3 (p0, x0 - svcntw () * 24)) + +/* +** ld3_u32_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_u32_m27, svuint32x3_t, uint32_t, + z0 = svld3_u32 (p0, x0 - svcntw () * 27), + z0 = svld3 (p0, x0 - svcntw () * 27)) + +/* +** ld3_vnum_u32_0: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_0, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u32_1: +** incb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_1, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u32_2: +** incb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_2, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_u32_3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_3, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_u32_21: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_21, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_u32_24: +** addvl (x[0-9]+), x0, #24 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_24, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u32_m1: +** decb x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_m1, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_u32_m2: +** decb x0, all, mul #2 +** ld3w {z0\.s - z2\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_m2, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_u32_m3: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_m3, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_u32_m24: +** ld3w {z0\.s - z2\.s}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_m24, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_u32_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3w {z0\.s - z2\.s}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_m27, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld3_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3w {z0\.s - z2\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_u32_x1, svuint32x3_t, uint32_t, + z0 = svld3_vnum_u32 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u64.c new file mode 100644 index 000000000..2c0dc2f1a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u64.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_u64_base: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u64_base, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_u64_index: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld3_u64_index, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u64_1: +** incb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u64_1, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 + svcntd ()), + z0 = svld3 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u64_2: +** incb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u64_2, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 + svcntd () * 2), + z0 = svld3 (p0, x0 + svcntd () * 2)) + +/* +** ld3_u64_3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u64_3, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 + svcntd () * 3), + z0 = svld3 (p0, x0 + svcntd () * 3)) + +/* +** ld3_u64_21: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u64_21, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 + svcntd () * 21), + z0 = svld3 (p0, x0 + svcntd () * 21)) + +/* +** ld3_u64_24: +** addvl (x[0-9]+), x0, #24 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_u64_24, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 + svcntd () * 24), + z0 = svld3 (p0, x0 + svcntd () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u64_m1: +** decb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u64_m1, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 - svcntd ()), + z0 = svld3 (p0, x0 - svcntd ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_u64_m2: +** decb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u64_m2, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 - svcntd () * 2), + z0 = svld3 (p0, x0 - svcntd () * 2)) + +/* +** ld3_u64_m3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u64_m3, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 - svcntd () * 3), + z0 = svld3 (p0, x0 - svcntd () * 3)) + +/* +** ld3_u64_m24: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u64_m24, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 - svcntd () * 24), + z0 = svld3 (p0, x0 - svcntd () * 24)) + +/* +** ld3_u64_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_u64_m27, svuint64x3_t, uint64_t, + z0 = svld3_u64 (p0, x0 - svcntd () * 27), + z0 = svld3 (p0, x0 - svcntd () * 27)) + +/* +** ld3_vnum_u64_0: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_0, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u64_1: +** incb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_1, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u64_2: +** incb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_2, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_u64_3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_3, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_u64_21: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_21, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_u64_24: +** addvl (x[0-9]+), x0, #24 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_24, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u64_m1: +** decb x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_m1, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_u64_m2: +** decb x0, all, mul #2 +** ld3d {z0\.d - z2\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_m2, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_u64_m3: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_m3, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_u64_m24: +** ld3d {z0\.d - z2\.d}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_m24, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_u64_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3d {z0\.d - z2\.d}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_m27, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld3_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld3d {z0\.d - z2\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld3_vnum_u64_x1, svuint64x3_t, uint64_t, + z0 = svld3_vnum_u64 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u8.c new file mode 100644 index 000000000..e9d1ab495 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld3_u8.c @@ -0,0 +1,246 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld3_u8_base: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u8_base, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0), + z0 = svld3 (p0, x0)) + +/* +** ld3_u8_index: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld3_u8_index, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 + x1), + z0 = svld3 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u8_1: +** incb x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u8_1, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 + svcntb ()), + z0 = svld3 (p0, x0 + svcntb ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u8_2: +** incb x0, all, mul #2 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u8_2, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 + svcntb () * 2), + z0 = svld3 (p0, x0 + svcntb () * 2)) + +/* +** ld3_u8_3: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u8_3, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 + svcntb () * 3), + z0 = svld3 (p0, x0 + svcntb () * 3)) + +/* +** ld3_u8_21: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u8_21, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 + svcntb () * 21), + z0 = svld3 (p0, x0 + svcntb () * 21)) + +/* +** ld3_u8_24: +** addvl (x[0-9]+), x0, #24 +** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_u8_24, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 + svcntb () * 24), + z0 = svld3 (p0, x0 + svcntb () * 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_u8_m1: +** decb x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u8_m1, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 - svcntb ()), + z0 = svld3 (p0, x0 - svcntb ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_u8_m2: +** decb x0, all, mul #2 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_u8_m2, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 - svcntb () * 2), + z0 = svld3 (p0, x0 - svcntb () * 2)) + +/* +** ld3_u8_m3: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u8_m3, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 - svcntb () * 3), + z0 = svld3 (p0, x0 - svcntb () * 3)) + +/* +** ld3_u8_m24: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_u8_m24, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 - svcntb () * 24), + z0 = svld3 (p0, x0 - svcntb () * 24)) + +/* +** ld3_u8_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_u8_m27, svuint8x3_t, uint8_t, + z0 = svld3_u8 (p0, x0 - svcntb () * 27), + z0 = svld3 (p0, x0 - svcntb () * 27)) + +/* +** ld3_vnum_u8_0: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_0, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, 0), + z0 = svld3_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u8_1: +** incb x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_1, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, 1), + z0 = svld3_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u8_2: +** incb x0, all, mul #2 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_2, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, 2), + z0 = svld3_vnum (p0, x0, 2)) + +/* +** ld3_vnum_u8_3: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_3, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, 3), + z0 = svld3_vnum (p0, x0, 3)) + +/* +** ld3_vnum_u8_21: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #21, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_21, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, 21), + z0 = svld3_vnum (p0, x0, 21)) + +/* +** ld3_vnum_u8_24: +** addvl (x[0-9]+), x0, #24 +** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_24, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, 24), + z0 = svld3_vnum (p0, x0, 24)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld3_vnum_u8_m1: +** decb x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_m1, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, -1), + z0 = svld3_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld3_vnum_u8_m2: +** decb x0, all, mul #2 +** ld3b {z0\.b - z2\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_m2, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, -2), + z0 = svld3_vnum (p0, x0, -2)) + +/* +** ld3_vnum_u8_m3: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-3, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_m3, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, -3), + z0 = svld3_vnum (p0, x0, -3)) + +/* +** ld3_vnum_u8_m24: +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, #-24, mul vl\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_m24, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, -24), + z0 = svld3_vnum (p0, x0, -24)) + +/* +** ld3_vnum_u8_m27: +** addvl (x[0-9]+), x0, #-27 +** ld3b {z0\.b - z2\.b}, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ld3_vnum_u8_m27, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, -27), + z0 = svld3_vnum (p0, x0, -27)) + +/* +** ld3_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld3b {z0\.b - z2\.b}, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld3b {z0\.b - z2\.b}, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld3_vnum_u8_x1, svuint8x3_t, uint8_t, + z0 = svld3_vnum_u8 (p0, x0, x1), + z0 = svld3_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_bf16.c new file mode 100644 index 000000000..123ff6355 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_bf16.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_bf16_base: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_bf16_base, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_bf16_index: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld4_bf16_index, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_bf16_1: +** incb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_bf16_1, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 + svcnth ()), + z0 = svld4 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_bf16_2: +** incb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_bf16_2, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 + svcnth () * 2), + z0 = svld4 (p0, x0 + svcnth () * 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_bf16_3: +** incb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_bf16_3, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 + svcnth () * 3), + z0 = svld4 (p0, x0 + svcnth () * 3)) + +/* +** ld4_bf16_4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_bf16_4, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 + svcnth () * 4), + z0 = svld4 (p0, x0 + svcnth () * 4)) + +/* +** ld4_bf16_28: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_bf16_28, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 + svcnth () * 28), + z0 = svld4 (p0, x0 + svcnth () * 28)) + +/* +** ld4_bf16_32: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_bf16_32, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 + svcnth () * 32), + z0 = svld4 (p0, x0 + svcnth () * 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_bf16_m1: +** decb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_bf16_m1, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 - svcnth ()), + z0 = svld4 (p0, x0 - svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_bf16_m2: +** decb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_bf16_m2, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 - svcnth () * 2), + z0 = svld4 (p0, x0 - svcnth () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_bf16_m3: +** decb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_bf16_m3, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 - svcnth () * 3), + z0 = svld4 (p0, x0 - svcnth () * 3)) + +/* +** ld4_bf16_m4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_bf16_m4, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 - svcnth () * 4), + z0 = svld4 (p0, x0 - svcnth () * 4)) + +/* +** ld4_bf16_m32: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_bf16_m32, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 - svcnth () * 32), + z0 = svld4 (p0, x0 - svcnth () * 32)) + +/* +** ld4_bf16_m36: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_bf16_m36, svbfloat16x4_t, bfloat16_t, + z0 = svld4_bf16 (p0, x0 - svcnth () * 36), + z0 = svld4 (p0, x0 - svcnth () * 36)) + +/* +** ld4_vnum_bf16_0: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_0, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_bf16_1: +** incb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_1, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_bf16_2: +** incb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_2, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_bf16_3: +** incb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_3, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_bf16_4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_4, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_bf16_28: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_28, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_bf16_32: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_32, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_bf16_m1: +** decb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_m1, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_bf16_m2: +** decb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_m2, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_bf16_m3: +** decb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_m3, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_bf16_m4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_m4, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_bf16_m32: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_m32, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_bf16_m36: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_m36, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_bf16_x1, svbfloat16x4_t, bfloat16_t, + z0 = svld4_vnum_bf16 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f16.c new file mode 100644 index 000000000..0d0ecf0af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f16.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_f16_base: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f16_base, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_f16_index: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld4_f16_index, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f16_1: +** incb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f16_1, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 + svcnth ()), + z0 = svld4 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f16_2: +** incb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f16_2, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 + svcnth () * 2), + z0 = svld4 (p0, x0 + svcnth () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f16_3: +** incb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f16_3, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 + svcnth () * 3), + z0 = svld4 (p0, x0 + svcnth () * 3)) + +/* +** ld4_f16_4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f16_4, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 + svcnth () * 4), + z0 = svld4 (p0, x0 + svcnth () * 4)) + +/* +** ld4_f16_28: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f16_28, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 + svcnth () * 28), + z0 = svld4 (p0, x0 + svcnth () * 28)) + +/* +** ld4_f16_32: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_f16_32, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 + svcnth () * 32), + z0 = svld4 (p0, x0 + svcnth () * 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f16_m1: +** decb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f16_m1, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 - svcnth ()), + z0 = svld4 (p0, x0 - svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f16_m2: +** decb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f16_m2, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 - svcnth () * 2), + z0 = svld4 (p0, x0 - svcnth () * 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_f16_m3: +** decb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f16_m3, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 - svcnth () * 3), + z0 = svld4 (p0, x0 - svcnth () * 3)) + +/* +** ld4_f16_m4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f16_m4, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 - svcnth () * 4), + z0 = svld4 (p0, x0 - svcnth () * 4)) + +/* +** ld4_f16_m32: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f16_m32, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 - svcnth () * 32), + z0 = svld4 (p0, x0 - svcnth () * 32)) + +/* +** ld4_f16_m36: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_f16_m36, svfloat16x4_t, float16_t, + z0 = svld4_f16 (p0, x0 - svcnth () * 36), + z0 = svld4 (p0, x0 - svcnth () * 36)) + +/* +** ld4_vnum_f16_0: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_0, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f16_1: +** incb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_1, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f16_2: +** incb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_2, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f16_3: +** incb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_3, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_f16_4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_4, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_f16_28: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_28, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_f16_32: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_32, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f16_m1: +** decb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_m1, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f16_m2: +** decb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_m2, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_f16_m3: +** decb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_m3, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_f16_m4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_m4, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_f16_m32: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_m32, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_f16_m36: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_m36, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_f16_x1, svfloat16x4_t, float16_t, + z0 = svld4_vnum_f16 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f32.c new file mode 100644 index 000000000..a433d1ffe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f32.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_f32_base: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f32_base, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_f32_index: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld4_f32_index, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f32_1: +** incb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f32_1, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 + svcntw ()), + z0 = svld4 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f32_2: +** incb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f32_2, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 + svcntw () * 2), + z0 = svld4 (p0, x0 + svcntw () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f32_3: +** incb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f32_3, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 + svcntw () * 3), + z0 = svld4 (p0, x0 + svcntw () * 3)) + +/* +** ld4_f32_4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f32_4, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 + svcntw () * 4), + z0 = svld4 (p0, x0 + svcntw () * 4)) + +/* +** ld4_f32_28: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f32_28, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 + svcntw () * 28), + z0 = svld4 (p0, x0 + svcntw () * 28)) + +/* +** ld4_f32_32: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_f32_32, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 + svcntw () * 32), + z0 = svld4 (p0, x0 + svcntw () * 32)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_f32_m1: +** decb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f32_m1, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 - svcntw ()), + z0 = svld4 (p0, x0 - svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f32_m2: +** decb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f32_m2, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 - svcntw () * 2), + z0 = svld4 (p0, x0 - svcntw () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f32_m3: +** decb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f32_m3, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 - svcntw () * 3), + z0 = svld4 (p0, x0 - svcntw () * 3)) + +/* +** ld4_f32_m4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f32_m4, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 - svcntw () * 4), + z0 = svld4 (p0, x0 - svcntw () * 4)) + +/* +** ld4_f32_m32: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f32_m32, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 - svcntw () * 32), + z0 = svld4 (p0, x0 - svcntw () * 32)) + +/* +** ld4_f32_m36: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_f32_m36, svfloat32x4_t, float32_t, + z0 = svld4_f32 (p0, x0 - svcntw () * 36), + z0 = svld4 (p0, x0 - svcntw () * 36)) + +/* +** ld4_vnum_f32_0: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_0, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f32_1: +** incb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_1, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f32_2: +** incb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_2, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f32_3: +** incb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_3, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_f32_4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_4, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_f32_28: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_28, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_f32_32: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_32, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f32_m1: +** decb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_m1, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_f32_m2: +** decb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_m2, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f32_m3: +** decb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_m3, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_f32_m4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_m4, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_f32_m32: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_m32, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_f32_m36: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_m36, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_f32_x1, svfloat32x4_t, float32_t, + z0 = svld4_vnum_f32 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f64.c new file mode 100644 index 000000000..bb18decec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_f64.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_f64_base: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f64_base, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_f64_index: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld4_f64_index, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f64_1: +** incb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f64_1, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 + svcntd ()), + z0 = svld4 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f64_2: +** incb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f64_2, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 + svcntd () * 2), + z0 = svld4 (p0, x0 + svcntd () * 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_f64_3: +** incb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f64_3, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 + svcntd () * 3), + z0 = svld4 (p0, x0 + svcntd () * 3)) + +/* +** ld4_f64_4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f64_4, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 + svcntd () * 4), + z0 = svld4 (p0, x0 + svcntd () * 4)) + +/* +** ld4_f64_28: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f64_28, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 + svcntd () * 28), + z0 = svld4 (p0, x0 + svcntd () * 28)) + +/* +** ld4_f64_32: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_f64_32, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 + svcntd () * 32), + z0 = svld4 (p0, x0 + svcntd () * 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f64_m1: +** decb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f64_m1, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 - svcntd ()), + z0 = svld4 (p0, x0 - svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f64_m2: +** decb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f64_m2, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 - svcntd () * 2), + z0 = svld4 (p0, x0 - svcntd () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_f64_m3: +** decb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_f64_m3, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 - svcntd () * 3), + z0 = svld4 (p0, x0 - svcntd () * 3)) + +/* +** ld4_f64_m4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f64_m4, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 - svcntd () * 4), + z0 = svld4 (p0, x0 - svcntd () * 4)) + +/* +** ld4_f64_m32: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_f64_m32, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 - svcntd () * 32), + z0 = svld4 (p0, x0 - svcntd () * 32)) + +/* +** ld4_f64_m36: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_f64_m36, svfloat64x4_t, float64_t, + z0 = svld4_f64 (p0, x0 - svcntd () * 36), + z0 = svld4 (p0, x0 - svcntd () * 36)) + +/* +** ld4_vnum_f64_0: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_0, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f64_1: +** incb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_1, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f64_2: +** incb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_2, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_f64_3: +** incb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_3, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_f64_4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_4, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_f64_28: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_28, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_f64_32: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_32, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f64_m1: +** decb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_m1, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f64_m2: +** decb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_m2, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_f64_m3: +** decb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_m3, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_f64_m4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_m4, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_f64_m32: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_m32, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_f64_m36: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_m36, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_f64_x1, svfloat64x4_t, float64_t, + z0 = svld4_vnum_f64 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s16.c new file mode 100644 index 000000000..15fb1b595 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s16.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_s16_base: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s16_base, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_s16_index: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld4_s16_index, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_s16_1: +** incb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s16_1, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 + svcnth ()), + z0 = svld4 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s16_2: +** incb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s16_2, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 + svcnth () * 2), + z0 = svld4 (p0, x0 + svcnth () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s16_3: +** incb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s16_3, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 + svcnth () * 3), + z0 = svld4 (p0, x0 + svcnth () * 3)) + +/* +** ld4_s16_4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s16_4, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 + svcnth () * 4), + z0 = svld4 (p0, x0 + svcnth () * 4)) + +/* +** ld4_s16_28: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s16_28, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 + svcnth () * 28), + z0 = svld4 (p0, x0 + svcnth () * 28)) + +/* +** ld4_s16_32: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_s16_32, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 + svcnth () * 32), + z0 = svld4 (p0, x0 + svcnth () * 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s16_m1: +** decb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s16_m1, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 - svcnth ()), + z0 = svld4 (p0, x0 - svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s16_m2: +** decb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s16_m2, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 - svcnth () * 2), + z0 = svld4 (p0, x0 - svcnth () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s16_m3: +** decb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s16_m3, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 - svcnth () * 3), + z0 = svld4 (p0, x0 - svcnth () * 3)) + +/* +** ld4_s16_m4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s16_m4, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 - svcnth () * 4), + z0 = svld4 (p0, x0 - svcnth () * 4)) + +/* +** ld4_s16_m32: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s16_m32, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 - svcnth () * 32), + z0 = svld4 (p0, x0 - svcnth () * 32)) + +/* +** ld4_s16_m36: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_s16_m36, svint16x4_t, int16_t, + z0 = svld4_s16 (p0, x0 - svcnth () * 36), + z0 = svld4 (p0, x0 - svcnth () * 36)) + +/* +** ld4_vnum_s16_0: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_0, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s16_1: +** incb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_1, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_s16_2: +** incb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_2, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s16_3: +** incb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_3, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_s16_4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_4, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_s16_28: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_28, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_s16_32: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_32, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s16_m1: +** decb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_m1, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s16_m2: +** decb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_m2, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s16_m3: +** decb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_m3, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_s16_m4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_m4, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_s16_m32: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_m32, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_s16_m36: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_m36, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_s16_x1, svint16x4_t, int16_t, + z0 = svld4_vnum_s16 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s32.c new file mode 100644 index 000000000..81c67710f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s32.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_s32_base: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s32_base, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_s32_index: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ld4_s32_index, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s32_1: +** incb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s32_1, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 + svcntw ()), + z0 = svld4 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s32_2: +** incb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s32_2, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 + svcntw () * 2), + z0 = svld4 (p0, x0 + svcntw () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s32_3: +** incb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s32_3, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 + svcntw () * 3), + z0 = svld4 (p0, x0 + svcntw () * 3)) + +/* +** ld4_s32_4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s32_4, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 + svcntw () * 4), + z0 = svld4 (p0, x0 + svcntw () * 4)) + +/* +** ld4_s32_28: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s32_28, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 + svcntw () * 28), + z0 = svld4 (p0, x0 + svcntw () * 28)) + +/* +** ld4_s32_32: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_s32_32, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 + svcntw () * 32), + z0 = svld4 (p0, x0 + svcntw () * 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s32_m1: +** decb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s32_m1, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 - svcntw ()), + z0 = svld4 (p0, x0 - svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s32_m2: +** decb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s32_m2, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 - svcntw () * 2), + z0 = svld4 (p0, x0 - svcntw () * 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_s32_m3: +** decb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s32_m3, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 - svcntw () * 3), + z0 = svld4 (p0, x0 - svcntw () * 3)) + +/* +** ld4_s32_m4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s32_m4, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 - svcntw () * 4), + z0 = svld4 (p0, x0 - svcntw () * 4)) + +/* +** ld4_s32_m32: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s32_m32, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 - svcntw () * 32), + z0 = svld4 (p0, x0 - svcntw () * 32)) + +/* +** ld4_s32_m36: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_s32_m36, svint32x4_t, int32_t, + z0 = svld4_s32 (p0, x0 - svcntw () * 36), + z0 = svld4 (p0, x0 - svcntw () * 36)) + +/* +** ld4_vnum_s32_0: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_0, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s32_1: +** incb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_1, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s32_2: +** incb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_2, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s32_3: +** incb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_3, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_s32_4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_4, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_s32_28: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_28, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_s32_32: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_32, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s32_m1: +** decb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_m1, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s32_m2: +** decb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_m2, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_s32_m3: +** decb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_m3, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_s32_m4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_m4, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_s32_m32: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_m32, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_s32_m36: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_m36, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_s32_x1, svint32x4_t, int32_t, + z0 = svld4_vnum_s32 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s64.c new file mode 100644 index 000000000..d24c30dcf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s64.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_s64_base: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s64_base, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_s64_index: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld4_s64_index, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s64_1: +** incb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s64_1, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 + svcntd ()), + z0 = svld4 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s64_2: +** incb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s64_2, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 + svcntd () * 2), + z0 = svld4 (p0, x0 + svcntd () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s64_3: +** incb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s64_3, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 + svcntd () * 3), + z0 = svld4 (p0, x0 + svcntd () * 3)) + +/* +** ld4_s64_4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s64_4, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 + svcntd () * 4), + z0 = svld4 (p0, x0 + svcntd () * 4)) + +/* +** ld4_s64_28: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s64_28, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 + svcntd () * 28), + z0 = svld4 (p0, x0 + svcntd () * 28)) + +/* +** ld4_s64_32: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_s64_32, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 + svcntd () * 32), + z0 = svld4 (p0, x0 + svcntd () * 32)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_s64_m1: +** decb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s64_m1, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 - svcntd ()), + z0 = svld4 (p0, x0 - svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s64_m2: +** decb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s64_m2, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 - svcntd () * 2), + z0 = svld4 (p0, x0 - svcntd () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s64_m3: +** decb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s64_m3, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 - svcntd () * 3), + z0 = svld4 (p0, x0 - svcntd () * 3)) + +/* +** ld4_s64_m4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s64_m4, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 - svcntd () * 4), + z0 = svld4 (p0, x0 - svcntd () * 4)) + +/* +** ld4_s64_m32: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s64_m32, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 - svcntd () * 32), + z0 = svld4 (p0, x0 - svcntd () * 32)) + +/* +** ld4_s64_m36: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_s64_m36, svint64x4_t, int64_t, + z0 = svld4_s64 (p0, x0 - svcntd () * 36), + z0 = svld4 (p0, x0 - svcntd () * 36)) + +/* +** ld4_vnum_s64_0: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_0, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s64_1: +** incb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_1, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s64_2: +** incb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_2, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s64_3: +** incb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_3, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_s64_4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_4, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_s64_28: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_28, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_s64_32: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_32, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s64_m1: +** decb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_m1, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_s64_m2: +** decb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_m2, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s64_m3: +** decb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_m3, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_s64_m4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_m4, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_s64_m32: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_m32, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_s64_m36: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_m36, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_s64_x1, svint64x4_t, int64_t, + z0 = svld4_vnum_s64 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s8.c new file mode 100644 index 000000000..d7a17e266 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_s8.c @@ -0,0 +1,290 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_s8_base: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s8_base, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_s8_index: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld4_s8_index, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s8_1: +** incb x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s8_1, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 + svcntb ()), + z0 = svld4 (p0, x0 + svcntb ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s8_2: +** incb x0, all, mul #2 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s8_2, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 + svcntb () * 2), + z0 = svld4 (p0, x0 + svcntb () * 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_s8_3: +** incb x0, all, mul #3 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s8_3, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 + svcntb () * 3), + z0 = svld4 (p0, x0 + svcntb () * 3)) + +/* +** ld4_s8_4: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s8_4, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 + svcntb () * 4), + z0 = svld4 (p0, x0 + svcntb () * 4)) + +/* +** ld4_s8_28: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s8_28, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 + svcntb () * 28), + z0 = svld4 (p0, x0 + svcntb () * 28)) + +/* +** ld4_s8_32: +** [^{]* +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_s8_32, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 + svcntb () * 32), + z0 = svld4 (p0, x0 + svcntb () * 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s8_m1: +** decb x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s8_m1, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 - svcntb ()), + z0 = svld4 (p0, x0 - svcntb ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s8_m2: +** decb x0, all, mul #2 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s8_m2, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 - svcntb () * 2), + z0 = svld4 (p0, x0 - svcntb () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_s8_m3: +** decb x0, all, mul #3 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_s8_m3, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 - svcntb () * 3), + z0 = svld4 (p0, x0 - svcntb () * 3)) + +/* +** ld4_s8_m4: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s8_m4, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 - svcntb () * 4), + z0 = svld4 (p0, x0 - svcntb () * 4)) + +/* +** ld4_s8_m32: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_s8_m32, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 - svcntb () * 32), + z0 = svld4 (p0, x0 - svcntb () * 32)) + +/* +** ld4_s8_m36: +** [^{]* +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_s8_m36, svint8x4_t, int8_t, + z0 = svld4_s8 (p0, x0 - svcntb () * 36), + z0 = svld4 (p0, x0 - svcntb () * 36)) + +/* +** ld4_vnum_s8_0: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_0, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s8_1: +** incb x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_1, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s8_2: +** incb x0, all, mul #2 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_2, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_s8_3: +** incb x0, all, mul #3 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_3, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_s8_4: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_4, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_s8_28: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_28, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_s8_32: +** [^{]* +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_32, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s8_m1: +** decb x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_m1, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s8_m2: +** decb x0, all, mul #2 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_m2, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_s8_m3: +** decb x0, all, mul #3 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_m3, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_s8_m4: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_m4, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_s8_m32: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_m32, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_s8_m36: +** [^{]* +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_s8_m36, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* +** ld4_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld4_vnum_s8_x1, svint8x4_t, int8_t, + z0 = svld4_vnum_s8 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u16.c new file mode 100644 index 000000000..234593d10 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u16.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_u16_base: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u16_base, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_u16_index: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ld4_u16_index, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_u16_1: +** incb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u16_1, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 + svcnth ()), + z0 = svld4 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u16_2: +** incb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u16_2, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 + svcnth () * 2), + z0 = svld4 (p0, x0 + svcnth () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u16_3: +** incb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u16_3, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 + svcnth () * 3), + z0 = svld4 (p0, x0 + svcnth () * 3)) + +/* +** ld4_u16_4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u16_4, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 + svcnth () * 4), + z0 = svld4 (p0, x0 + svcnth () * 4)) + +/* +** ld4_u16_28: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u16_28, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 + svcnth () * 28), + z0 = svld4 (p0, x0 + svcnth () * 28)) + +/* +** ld4_u16_32: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_u16_32, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 + svcnth () * 32), + z0 = svld4 (p0, x0 + svcnth () * 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u16_m1: +** decb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u16_m1, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 - svcnth ()), + z0 = svld4 (p0, x0 - svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u16_m2: +** decb x0, all, mul #2 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u16_m2, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 - svcnth () * 2), + z0 = svld4 (p0, x0 - svcnth () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u16_m3: +** decb x0, all, mul #3 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u16_m3, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 - svcnth () * 3), + z0 = svld4 (p0, x0 - svcnth () * 3)) + +/* +** ld4_u16_m4: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u16_m4, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 - svcnth () * 4), + z0 = svld4 (p0, x0 - svcnth () * 4)) + +/* +** ld4_u16_m32: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u16_m32, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 - svcnth () * 32), + z0 = svld4 (p0, x0 - svcnth () * 32)) + +/* +** ld4_u16_m36: +** [^{]* +** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_u16_m36, svuint16x4_t, uint16_t, + z0 = svld4_u16 (p0, x0 - svcnth () * 36), + z0 = svld4 (p0, x0 - svcnth () * 36)) + +/* +** ld4_vnum_u16_0: +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u16_0, svuint16x4_t, uint16_t, + z0 = svld4_vnum_u16 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u16_1: +** incb x0 +** ld4h {z0\.h - z3\.h}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u16_1, svuint16x4_t, uint16_t, + z0 = svld4_vnum_u16 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/
+/*
+** ld4_vnum_u16_2:
+** incb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_2, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, 2),
+ z0 = svld4_vnum (p0, x0, 2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_u16_3:
+** incb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_3, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, 3),
+ z0 = svld4_vnum (p0, x0, 3))
+
+/*
+** ld4_vnum_u16_4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_4, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, 4),
+ z0 = svld4_vnum (p0, x0, 4))
+
+/*
+** ld4_vnum_u16_28:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_28, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, 28),
+ z0 = svld4_vnum (p0, x0, 28))
+
+/*
+** ld4_vnum_u16_32:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_32, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, 32),
+ z0 = svld4_vnum (p0, x0, 32))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_u16_m1:
+** decb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_m1, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, -1),
+ z0 = svld4_vnum (p0, x0, -1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_u16_m2:
+** decb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_m2, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, -2),
+ z0 = svld4_vnum (p0, x0, -2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_u16_m3:
+** decb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_m3, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, -3),
+ z0 = svld4_vnum (p0, x0, -3))
+
+/*
+** ld4_vnum_u16_m4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_m4, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, -4),
+ z0 = svld4_vnum (p0, x0, -4))
+
+/*
+** ld4_vnum_u16_m32:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_m32, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, -32),
+ z0 = svld4_vnum (p0, x0, -32))
+
+/*
+** ld4_vnum_u16_m36:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_m36, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, -36),
+ z0 = svld4_vnum (p0, x0, -36))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld4_vnum_u16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_u16_x1, svuint16x4_t, uint16_t,
+ z0 = svld4_vnum_u16 (p0, x0, x1),
+ z0 = svld4_vnum (p0, x0, x1))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u32.c
new file mode 100644
index 000000000..ad2627800
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u32.c
@@ -0,0 +1,286 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld4_u32_base:
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_u32_base, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0),
+ z0 = svld4 (p0, x0))
+
+/*
+** ld4_u32_index:
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0, x1, lsl 2\]
+** ret
+*/
+TEST_LOAD (ld4_u32_index, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 + x1),
+ z0 = svld4 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_u32_1:
+** incb x0
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_u32_1, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 + svcntw ()),
+ z0 = svld4 (p0, x0 + svcntw ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_u32_2:
+** incb x0, all, mul #2
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_u32_2, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 + svcntw () * 2),
+ z0 = svld4 (p0, x0 + svcntw () * 2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_u32_3:
+** incb x0, all, mul #3
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_u32_3, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 + svcntw () * 3),
+ z0 = svld4 (p0, x0 + svcntw () * 3))
+
+/*
+** ld4_u32_4:
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_u32_4, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 + svcntw () * 4),
+ z0 = svld4 (p0, x0 + svcntw () * 4))
+
+/*
+** ld4_u32_28:
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_u32_28, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 + svcntw () * 28),
+ z0 = svld4 (p0, x0 + svcntw () * 28))
+
+/*
+** ld4_u32_32:
+** [^{]*
+** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_u32_32, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 + svcntw () * 32),
+ z0 = svld4 (p0, x0 + svcntw () * 32))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_u32_m1:
+** decb x0
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_u32_m1, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 - svcntw ()),
+ z0 = svld4 (p0, x0 - svcntw ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_u32_m2:
+** decb x0, all, mul #2
+** ld4w {z0\.s - z3\.s}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_u32_m2, svuint32x4_t, uint32_t,
+ z0 = svld4_u32 (p0, x0 - svcntw () * 2),
+ z0 = svld4 (p0, x0 - svcntw () * 2))
+
+/* Moving the constant into a register would also be OK.
*/ +/* +** ld4_u32_m3: +** decb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u32_m3, svuint32x4_t, uint32_t, + z0 = svld4_u32 (p0, x0 - svcntw () * 3), + z0 = svld4 (p0, x0 - svcntw () * 3)) + +/* +** ld4_u32_m4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u32_m4, svuint32x4_t, uint32_t, + z0 = svld4_u32 (p0, x0 - svcntw () * 4), + z0 = svld4 (p0, x0 - svcntw () * 4)) + +/* +** ld4_u32_m32: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u32_m32, svuint32x4_t, uint32_t, + z0 = svld4_u32 (p0, x0 - svcntw () * 32), + z0 = svld4 (p0, x0 - svcntw () * 32)) + +/* +** ld4_u32_m36: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_u32_m36, svuint32x4_t, uint32_t, + z0 = svld4_u32 (p0, x0 - svcntw () * 36), + z0 = svld4 (p0, x0 - svcntw () * 36)) + +/* +** ld4_vnum_u32_0: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_0, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u32_1: +** incb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_1, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u32_2: +** incb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_2, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u32_3: +** incb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_3, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_u32_4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_4, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_u32_28: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_28, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_u32_32: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_32, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u32_m1: +** decb x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_m1, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u32_m2: +** decb x0, all, mul #2 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_m2, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_u32_m3: +** decb x0, all, mul #3 +** ld4w {z0\.s - z3\.s}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_m3, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_u32_m4: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_m4, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_u32_m32: +** ld4w {z0\.s - z3\.s}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_m32, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_u32_m36: +** [^{]* +** ld4w {z0\.s - z3\.s}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_m36, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4w {z0\.s - z3\.s}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_u32_x1, svuint32x4_t, uint32_t, + z0 = svld4_vnum_u32 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u64.c new file mode 100644 index 000000000..8772ba42d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u64.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_u64_base: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u64_base, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_u64_index: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ld4_u64_index, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u64_1: +** incb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u64_1, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 + svcntd ()), + z0 = svld4 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u64_2: +** incb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u64_2, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 + svcntd () * 2), + z0 = svld4 (p0, x0 + svcntd () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u64_3: +** incb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u64_3, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 + svcntd () * 3), + z0 = svld4 (p0, x0 + svcntd () * 3)) + +/* +** ld4_u64_4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u64_4, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 + svcntd () * 4), + z0 = svld4 (p0, x0 + svcntd () * 4)) + +/* +** ld4_u64_28: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u64_28, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 + svcntd () * 28), + z0 = svld4 (p0, x0 + svcntd () * 28)) + +/* +** ld4_u64_32: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_u64_32, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 + svcntd () * 32), + z0 = svld4 (p0, x0 + svcntd () * 32)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_u64_m1: +** decb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u64_m1, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 - svcntd ()), + z0 = svld4 (p0, x0 - svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u64_m2: +** decb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u64_m2, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 - svcntd () * 2), + z0 = svld4 (p0, x0 - svcntd () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u64_m3: +** decb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u64_m3, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 - svcntd () * 3), + z0 = svld4 (p0, x0 - svcntd () * 3)) + +/* +** ld4_u64_m4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u64_m4, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 - svcntd () * 4), + z0 = svld4 (p0, x0 - svcntd () * 4)) + +/* +** ld4_u64_m32: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u64_m32, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 - svcntd () * 32), + z0 = svld4 (p0, x0 - svcntd () * 32)) + +/* +** ld4_u64_m36: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_u64_m36, svuint64x4_t, uint64_t, + z0 = svld4_u64 (p0, x0 - svcntd () * 36), + z0 = svld4 (p0, x0 - svcntd () * 36)) + +/* +** ld4_vnum_u64_0: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_0, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u64_1: +** incb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_1, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u64_2: +** incb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_2, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u64_3: +** incb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_3, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_u64_4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_4, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_u64_28: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_28, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_u64_32: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_32, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u64_m1: +** decb x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_m1, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_u64_m2: +** decb x0, all, mul #2 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_m2, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u64_m3: +** decb x0, all, mul #3 +** ld4d {z0\.d - z3\.d}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_m3, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_u64_m4: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_m4, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_u64_m32: +** ld4d {z0\.d - z3\.d}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_m32, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_u64_m36: +** [^{]* +** ld4d {z0\.d - z3\.d}, p0/z, \[x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_m36, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ld4_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ld4d {z0\.d - z3\.d}, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ld4_vnum_u64_x1, svuint64x4_t, uint64_t, + z0 = svld4_vnum_u64 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u8.c new file mode 100644 index 000000000..85b2987ce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ld4_u8.c @@ -0,0 +1,290 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ld4_u8_base: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u8_base, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0), + z0 = svld4 (p0, x0)) + +/* +** ld4_u8_index: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ld4_u8_index, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 + x1), + z0 = svld4 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u8_1: +** incb x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u8_1, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 + svcntb ()), + z0 = svld4 (p0, x0 + svcntb ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u8_2: +** incb x0, all, mul #2 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u8_2, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 + svcntb () * 2), + z0 = svld4 (p0, x0 + svcntb () * 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_u8_3: +** incb x0, all, mul #3 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u8_3, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 + svcntb () * 3), + z0 = svld4 (p0, x0 + svcntb () * 3)) + +/* +** ld4_u8_4: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u8_4, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 + svcntb () * 4), + z0 = svld4 (p0, x0 + svcntb () * 4)) + +/* +** ld4_u8_28: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u8_28, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 + svcntb () * 28), + z0 = svld4 (p0, x0 + svcntb () * 28)) + +/* +** ld4_u8_32: +** [^{]* +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_u8_32, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 + svcntb () * 32), + z0 = svld4 (p0, x0 + svcntb () * 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u8_m1: +** decb x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u8_m1, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 - svcntb ()), + z0 = svld4 (p0, x0 - svcntb ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u8_m2: +** decb x0, all, mul #2 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u8_m2, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 - svcntb () * 2), + z0 = svld4 (p0, x0 - svcntb () * 2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_u8_m3: +** decb x0, all, mul #3 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_u8_m3, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 - svcntb () * 3), + z0 = svld4 (p0, x0 - svcntb () * 3)) + +/* +** ld4_u8_m4: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u8_m4, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 - svcntb () * 4), + z0 = svld4 (p0, x0 - svcntb () * 4)) + +/* +** ld4_u8_m32: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_u8_m32, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 - svcntb () * 32), + z0 = svld4 (p0, x0 - svcntb () * 32)) + +/* +** ld4_u8_m36: +** [^{]* +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_u8_m36, svuint8x4_t, uint8_t, + z0 = svld4_u8 (p0, x0 - svcntb () * 36), + z0 = svld4 (p0, x0 - svcntb () * 36)) + +/* +** ld4_vnum_u8_0: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_0, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, 0), + z0 = svld4_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u8_1: +** incb x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_1, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, 1), + z0 = svld4_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u8_2: +** incb x0, all, mul #2 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_2, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, 2), + z0 = svld4_vnum (p0, x0, 2)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ld4_vnum_u8_3: +** incb x0, all, mul #3 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_3, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, 3), + z0 = svld4_vnum (p0, x0, 3)) + +/* +** ld4_vnum_u8_4: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_4, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, 4), + z0 = svld4_vnum (p0, x0, 4)) + +/* +** ld4_vnum_u8_28: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #28, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_28, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, 28), + z0 = svld4_vnum (p0, x0, 28)) + +/* +** ld4_vnum_u8_32: +** [^{]* +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_32, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, 32), + z0 = svld4_vnum (p0, x0, 32)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u8_m1: +** decb x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_m1, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, -1), + z0 = svld4_vnum (p0, x0, -1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u8_m2: +** decb x0, all, mul #2 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_m2, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, -2), + z0 = svld4_vnum (p0, x0, -2)) + +/* Moving the constant into a register would also be OK. */ +/* +** ld4_vnum_u8_m3: +** decb x0, all, mul #3 +** ld4b {z0\.b - z3\.b}, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_m3, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, -3), + z0 = svld4_vnum (p0, x0, -3)) + +/* +** ld4_vnum_u8_m4: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-4, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_m4, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, -4), + z0 = svld4_vnum (p0, x0, -4)) + +/* +** ld4_vnum_u8_m32: +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, #-32, mul vl\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_m32, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, -32), + z0 = svld4_vnum (p0, x0, -32)) + +/* +** ld4_vnum_u8_m36: +** [^{]* +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, x[0-9]+\] +** ret +*/ +TEST_LOAD (ld4_vnum_u8_m36, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, -36), + z0 = svld4_vnum (p0, x0, -36)) + +/* +** ld4_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ld4b {z0\.b - z3\.b}, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ld4b {z0\.b - z3\.b}, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ld4_vnum_u8_x1, svuint8x4_t, uint8_t, + z0 = svld4_vnum_u8 (p0, x0, x1), + z0 = svld4_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_bf16.c new file mode 100644 index 000000000..80f646870 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_bf16.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_bf16_base: +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_bf16_base, svbfloat16_t, bfloat16_t, + z0 = svldff1_bf16 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_bf16_index: +** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1_bf16_index, svbfloat16_t, bfloat16_t, + z0 = svldff1_bf16 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/
+/*
+** ldff1_bf16_1:
+** incb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0 + svcnth ()),
+ z0 = svldff1 (p0, x0 + svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_bf16_m1:
+** decb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0 - svcnth ()),
+ z0 = svldff1 (p0, x0 - svcnth ()))
+
+/*
+** ldff1_vnum_bf16_0:
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, 0),
+ z0 = svldff1_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_vnum_bf16_1:
+** incb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, 1),
+ z0 = svldff1_vnum (p0, x0, 1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_vnum_bf16_m1:
+** decb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, -1),
+ z0 = svldff1_vnum (p0, x0, -1))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ldff1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ldff1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, x1),
+ z0 = svldff1_vnum (p0, x0, x1))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f16.c
new file mode 100644
index 000000000..13ce863c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f16.c
@@ -0,0 +1,86 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ldff1_f16_base:
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_f16_base, svfloat16_t, float16_t,
+ z0 = svldff1_f16 (p0, x0),
+ z0 = svldff1 (p0, x0))
+
+/*
+** ldff1_f16_index:
+** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ldff1_f16_index, svfloat16_t, float16_t,
+ z0 = svldff1_f16 (p0, x0 + x1),
+ z0 = svldff1 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_f16_1:
+** incb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_f16_1, svfloat16_t, float16_t,
+ z0 = svldff1_f16 (p0, x0 + svcnth ()),
+ z0 = svldff1 (p0, x0 + svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_f16_m1:
+** decb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_f16_m1, svfloat16_t, float16_t,
+ z0 = svldff1_f16 (p0, x0 - svcnth ()),
+ z0 = svldff1 (p0, x0 - svcnth ()))
+
+/*
+** ldff1_vnum_f16_0:
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_f16_0, svfloat16_t, float16_t,
+ z0 = svldff1_vnum_f16 (p0, x0, 0),
+ z0 = svldff1_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_vnum_f16_1:
+** incb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_f16_1, svfloat16_t, float16_t,
+ z0 = svldff1_vnum_f16 (p0, x0, 1),
+ z0 = svldff1_vnum (p0, x0, 1))
+
+/* Moving the constant into a register would also be OK.
*/ +/* +** ldff1_vnum_f16_m1: +** decb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f16_m1, svfloat16_t, float16_t, + z0 = svldff1_vnum_f16 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f16_x1, svfloat16_t, float16_t, + z0 = svldff1_vnum_f16 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f32.c new file mode 100644 index 000000000..2fcc63390 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f32.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_f32_base: +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_f32_base, svfloat32_t, float32_t, + z0 = svldff1_f32 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_f32_index: +** ldff1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldff1_f32_index, svfloat32_t, float32_t, + z0 = svldff1_f32 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_f32_1: +** incb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_f32_1, svfloat32_t, float32_t, + z0 = svldff1_f32 (p0, x0 + svcntw ()), + z0 = svldff1 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_f32_m1: +** decb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_f32_m1, svfloat32_t, float32_t, + z0 = svldff1_f32 (p0, x0 - svcntw ()), + z0 = svldff1 (p0, x0 - svcntw ())) + +/* +** ldff1_vnum_f32_0: +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f32_0, svfloat32_t, float32_t, + z0 = svldff1_vnum_f32 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_f32_1: +** incb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f32_1, svfloat32_t, float32_t, + z0 = svldff1_vnum_f32 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_f32_m1: +** decb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f32_m1, svfloat32_t, float32_t, + z0 = svldff1_vnum_f32 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f32_x1, svfloat32_t, float32_t, + z0 = svldff1_vnum_f32 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f64.c new file mode 100644 index 000000000..cc15b927a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_f64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_f64_base: +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_f64_base, svfloat64_t, float64_t, + z0 = svldff1_f64 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_f64_index: +** ldff1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ldff1_f64_index, svfloat64_t, float64_t, + z0 = svldff1_f64 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_f64_1: +** incb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_f64_1, svfloat64_t, float64_t, + z0 = svldff1_f64 (p0, x0 + svcntd ()), + z0 = svldff1 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_f64_m1: +** decb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_f64_m1, svfloat64_t, float64_t, + z0 = svldff1_f64 (p0, x0 - svcntd ()), + z0 = svldff1 (p0, x0 - svcntd ())) + +/* +** ldff1_vnum_f64_0: +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f64_0, svfloat64_t, float64_t, + z0 = svldff1_vnum_f64 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_f64_1: +** incb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f64_1, svfloat64_t, float64_t, + z0 = svldff1_vnum_f64 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_f64_m1: +** decb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f64_m1, svfloat64_t, float64_t, + z0 = svldff1_vnum_f64 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_f64_x1, svfloat64_t, float64_t, + z0 = svldff1_vnum_f64 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f32.c new file mode 100644 index 000000000..7e330c042 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f32.c @@ -0,0 +1,272 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_gather_f32_tied1: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_f32_tied1, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_f32 (p0, z0), + z0_res = svldff1_gather_f32 (p0, z0)) + +/* +** ldff1_gather_f32_untied: +** ldff1w z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_f32_untied, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_f32 (p0, z1), + z0_res = svldff1_gather_f32 (p0, z1)) + +/* +** ldff1_gather_x0_f32_offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, x0), + z0_res = svldff1_gather_offset_f32 (p0, z0, x0)) + +/* +** ldff1_gather_m4_f32_offset: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m4_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, -4), + z0_res = svldff1_gather_offset_f32 (p0, z0, -4)) + +/* +** ldff1_gather_0_f32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 0), + z0_res = svldff1_gather_offset_f32 (p0, z0, 0)) + +/* +** ldff1_gather_5_f32_offset: +** mov (x[0-9]+), #?5 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 5), + z0_res = svldff1_gather_offset_f32 (p0, z0, 5)) + +/* +** ldff1_gather_6_f32_offset: +** mov (x[0-9]+), #?6 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_6_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 6), + z0_res = svldff1_gather_offset_f32 (p0, z0, 6)) + +/* +** ldff1_gather_7_f32_offset: +** mov (x[0-9]+), #?7 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_7_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 7), + z0_res = svldff1_gather_offset_f32 (p0, z0, 7)) + +/* +** ldff1_gather_8_f32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_8_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 8), + z0_res = svldff1_gather_offset_f32 (p0, z0, 8)) + +/* +** ldff1_gather_124_f32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_124_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 124), + z0_res = svldff1_gather_offset_f32 (p0, z0, 124)) + +/* +** ldff1_gather_128_f32_offset: +** mov (x[0-9]+), #?128 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_128_f32_offset, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_f32 (p0, z0, 128), + z0_res = svldff1_gather_offset_f32 (p0, z0, 128)) + +/* +** ldff1_gather_x0_f32_index: +** lsl (x[0-9]+), x0, #?2 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_f32_index, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_f32 (p0, z0, x0), + z0_res = svldff1_gather_index_f32 (p0, z0, x0)) + +/* +** ldff1_gather_m1_f32_index: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ 
+TEST_LOAD_GATHER_ZS (ldff1_gather_m1_f32_index, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_f32 (p0, z0, -1), + z0_res = svldff1_gather_index_f32 (p0, z0, -1)) + +/* +** ldff1_gather_0_f32_index: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_f32_index, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_f32 (p0, z0, 0), + z0_res = svldff1_gather_index_f32 (p0, z0, 0)) + +/* +** ldff1_gather_5_f32_index: +** ldff1w z0\.s, p0/z, \[z0\.s, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_f32_index, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_f32 (p0, z0, 5), + z0_res = svldff1_gather_index_f32 (p0, z0, 5)) + +/* +** ldff1_gather_31_f32_index: +** ldff1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_31_f32_index, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_f32 (p0, z0, 31), + z0_res = svldff1_gather_index_f32 (p0, z0, 31)) + +/* +** ldff1_gather_32_f32_index: +** mov (x[0-9]+), #?128 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_32_f32_index, svfloat32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_f32 (p0, z0, 32), + z0_res = svldff1_gather_index_f32 (p0, z0, 32)) + +/* +** ldff1_gather_x0_f32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f32_s32offset, svfloat32_t, float32_t, svint32_t, + z0_res = svldff1_gather_s32offset_f32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_f32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f32_s32offset, svfloat32_t, float32_t, svint32_t, + z0_res = svldff1_gather_s32offset_f32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_f32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f32_s32offset, svfloat32_t, float32_t, svint32_t, + z0_res = svldff1_gather_s32offset_f32 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_x0_f32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f32_u32offset, svfloat32_t, float32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_f32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_f32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f32_u32offset, svfloat32_t, float32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_f32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_f32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f32_u32offset, svfloat32_t, float32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_f32 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_x0_f32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f32_s32index, svfloat32_t, float32_t, svint32_t, + z0_res = svldff1_gather_s32index_f32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_f32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f32_s32index, svfloat32_t, float32_t, svint32_t, + z0_res = svldff1_gather_s32index_f32 (p0, x0, z0), 
+ z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_f32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f32_s32index, svfloat32_t, float32_t, svint32_t, + z0_res = svldff1_gather_s32index_f32 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_x0_f32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f32_u32index, svfloat32_t, float32_t, svuint32_t, + z0_res = svldff1_gather_u32index_f32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_f32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f32_u32index, svfloat32_t, float32_t, svuint32_t, + z0_res = svldff1_gather_u32index_f32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_f32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f32_u32index, svfloat32_t, float32_t, svuint32_t, + z0_res = svldff1_gather_u32index_f32 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f64.c new file mode 100644 index 000000000..d0e47f0bf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_f64.c @@ -0,0 +1,348 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_gather_f64_tied1: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_f64_tied1, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_f64 (p0, z0), + z0_res = svldff1_gather_f64 (p0, z0)) + +/* +** ldff1_gather_f64_untied: +** ldff1d z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_f64_untied, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_f64 (p0, z1), + z0_res = svldff1_gather_f64 (p0, z1)) + +/* +** ldff1_gather_x0_f64_offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, x0), + z0_res = svldff1_gather_offset_f64 (p0, z0, x0)) + +/* +** ldff1_gather_m8_f64_offset: +** mov (x[0-9]+), #?-8 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m8_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, -8), + z0_res = svldff1_gather_offset_f64 (p0, z0, -8)) + +/* +** ldff1_gather_0_f64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 0), + z0_res = svldff1_gather_offset_f64 (p0, z0, 0)) + +/* +** ldff1_gather_9_f64_offset: +** mov (x[0-9]+), #?9 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_9_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 9), + z0_res = svldff1_gather_offset_f64 (p0, z0, 9)) + +/* +** ldff1_gather_10_f64_offset: +** mov (x[0-9]+), #?10 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_10_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 10), + z0_res = svldff1_gather_offset_f64 (p0, z0, 10)) + +/* +** 
ldff1_gather_11_f64_offset: +** mov (x[0-9]+), #?11 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_11_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 11), + z0_res = svldff1_gather_offset_f64 (p0, z0, 11)) + +/* +** ldff1_gather_12_f64_offset: +** mov (x[0-9]+), #?12 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_12_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 12), + z0_res = svldff1_gather_offset_f64 (p0, z0, 12)) + +/* +** ldff1_gather_13_f64_offset: +** mov (x[0-9]+), #?13 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_13_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 13), + z0_res = svldff1_gather_offset_f64 (p0, z0, 13)) + +/* +** ldff1_gather_14_f64_offset: +** mov (x[0-9]+), #?14 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_14_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 14), + z0_res = svldff1_gather_offset_f64 (p0, z0, 14)) + +/* +** ldff1_gather_15_f64_offset: +** mov (x[0-9]+), #?15 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_15_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 15), + z0_res = svldff1_gather_offset_f64 (p0, z0, 15)) + +/* +** ldff1_gather_16_f64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d, #16\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_16_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 16), + z0_res = svldff1_gather_offset_f64 (p0, z0, 16)) + +/* +** ldff1_gather_248_f64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_248_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 248), + z0_res = svldff1_gather_offset_f64 (p0, z0, 248)) + +/* +** ldff1_gather_256_f64_offset: +** mov (x[0-9]+), #?256 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_256_f64_offset, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_f64 (p0, z0, 256), + z0_res = svldff1_gather_offset_f64 (p0, z0, 256)) + +/* +** ldff1_gather_x0_f64_index: +** lsl (x[0-9]+), x0, #?3 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_f64_index, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_f64 (p0, z0, x0), + z0_res = svldff1_gather_index_f64 (p0, z0, x0)) + +/* +** ldff1_gather_m1_f64_index: +** mov (x[0-9]+), #?-8 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m1_f64_index, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_f64 (p0, z0, -1), + z0_res = svldff1_gather_index_f64 (p0, z0, -1)) + +/* +** ldff1_gather_0_f64_index: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_f64_index, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_f64 (p0, z0, 0), + z0_res = svldff1_gather_index_f64 (p0, z0, 0)) + +/* +** ldff1_gather_5_f64_index: +** ldff1d z0\.d, p0/z, \[z0\.d, #40\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_f64_index, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_f64 (p0, z0, 5), + z0_res = svldff1_gather_index_f64 (p0, z0, 5)) + +/* +** ldff1_gather_31_f64_index: +** ldff1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ 
+TEST_LOAD_GATHER_ZS (ldff1_gather_31_f64_index, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_f64 (p0, z0, 31), + z0_res = svldff1_gather_index_f64 (p0, z0, 31)) + +/* +** ldff1_gather_32_f64_index: +** mov (x[0-9]+), #?256 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_32_f64_index, svfloat64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_f64 (p0, z0, 32), + z0_res = svldff1_gather_index_f64 (p0, z0, 32)) + +/* +** ldff1_gather_x0_f64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f64_s64offset, svfloat64_t, float64_t, svint64_t, + z0_res = svldff1_gather_s64offset_f64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_f64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f64_s64offset, svfloat64_t, float64_t, svint64_t, + z0_res = svldff1_gather_s64offset_f64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_f64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f64_s64offset, svfloat64_t, float64_t, svint64_t, + z0_res = svldff1_gather_s64offset_f64 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_ext_f64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_f64_s64offset, svfloat64_t, float64_t, svint64_t, + z0_res = svldff1_gather_s64offset_f64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_f64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_f64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_f64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_f64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_f64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_f64 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_ext_f64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_f64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_f64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f64_s64index, svfloat64_t, float64_t, svint64_t, + z0_res = svldff1_gather_s64index_f64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_f64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f64_s64index, svfloat64_t, float64_t, svint64_t, + z0_res = svldff1_gather_s64index_f64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_f64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ 
(ldff1_gather_untied_f64_s64index, svfloat64_t, float64_t, svint64_t, + z0_res = svldff1_gather_s64index_f64 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_ext_f64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_f64_s64index, svfloat64_t, float64_t, svint64_t, + z0_res = svldff1_gather_s64index_f64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_f64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_f64_u64index, svfloat64_t, float64_t, svuint64_t, + z0_res = svldff1_gather_u64index_f64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_f64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_f64_u64index, svfloat64_t, float64_t, svuint64_t, + z0_res = svldff1_gather_u64index_f64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_f64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_f64_u64index, svfloat64_t, float64_t, svuint64_t, + z0_res = svldff1_gather_u64index_f64 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_ext_f64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_f64_u64index, svfloat64_t, float64_t, svuint64_t, + z0_res = svldff1_gather_u64index_f64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s32.c new file mode 100644 index 000000000..66bf0f746 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s32.c @@ -0,0 +1,272 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_gather_s32_tied1: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_s32 (p0, z0), + z0_res = svldff1_gather_s32 (p0, z0)) + +/* +** ldff1_gather_s32_untied: +** ldff1w z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_s32 (p0, z1), + z0_res = svldff1_gather_s32 (p0, z1)) + +/* +** ldff1_gather_x0_s32_offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svldff1_gather_offset_s32 (p0, z0, x0)) + +/* +** ldff1_gather_m4_s32_offset: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m4_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, -4), + z0_res = svldff1_gather_offset_s32 (p0, z0, -4)) + +/* +** ldff1_gather_0_s32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svldff1_gather_offset_s32 (p0, z0, 0)) + +/* +** ldff1_gather_5_s32_offset: +** mov (x[0-9]+), #?5 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svldff1_gather_offset_s32 (p0, z0, 5)) + +/* +** ldff1_gather_6_s32_offset: +** mov (x[0-9]+), #?6 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_6_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 6), + z0_res = svldff1_gather_offset_s32 (p0, z0, 6)) + +/* +** ldff1_gather_7_s32_offset: +** mov (x[0-9]+), #?7 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_7_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 7), + z0_res = svldff1_gather_offset_s32 (p0, z0, 7)) + +/* +** ldff1_gather_8_s32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_8_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 8), + z0_res = svldff1_gather_offset_s32 (p0, z0, 8)) + +/* +** ldff1_gather_124_s32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_124_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 124), + z0_res = svldff1_gather_offset_s32 (p0, z0, 124)) + +/* +** ldff1_gather_128_s32_offset: +** mov (x[0-9]+), #?128 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_128_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_s32 (p0, z0, 128), + z0_res = svldff1_gather_offset_s32 (p0, z0, 128)) + +/* +** ldff1_gather_x0_s32_index: +** lsl (x[0-9]+), x0, #?2 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_s32_index, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_s32 (p0, z0, x0), + z0_res = svldff1_gather_index_s32 (p0, z0, x0)) + +/* +** ldff1_gather_m1_s32_index: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS 
(ldff1_gather_m1_s32_index, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_s32 (p0, z0, -1), + z0_res = svldff1_gather_index_s32 (p0, z0, -1)) + +/* +** ldff1_gather_0_s32_index: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_s32_index, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_s32 (p0, z0, 0), + z0_res = svldff1_gather_index_s32 (p0, z0, 0)) + +/* +** ldff1_gather_5_s32_index: +** ldff1w z0\.s, p0/z, \[z0\.s, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_s32_index, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_s32 (p0, z0, 5), + z0_res = svldff1_gather_index_s32 (p0, z0, 5)) + +/* +** ldff1_gather_31_s32_index: +** ldff1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_31_s32_index, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_s32 (p0, z0, 31), + z0_res = svldff1_gather_index_s32 (p0, z0, 31)) + +/* +** ldff1_gather_32_s32_index: +** mov (x[0-9]+), #?128 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_32_s32_index, svint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_s32 (p0, z0, 32), + z0_res = svldff1_gather_index_s32 (p0, z0, 32)) + +/* +** ldff1_gather_x0_s32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s32_s32offset, svint32_t, int32_t, svint32_t, + z0_res = svldff1_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_s32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s32_s32offset, svint32_t, int32_t, svint32_t, + z0_res = svldff1_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_s32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s32_s32offset, svint32_t, int32_t, svint32_t, + z0_res = svldff1_gather_s32offset_s32 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_x0_s32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s32_u32offset, svint32_t, int32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_s32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s32_u32offset, svint32_t, int32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_s32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s32_u32offset, svint32_t, int32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_s32 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_x0_s32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s32_s32index, svint32_t, int32_t, svint32_t, + z0_res = svldff1_gather_s32index_s32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_s32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s32_s32index, svint32_t, int32_t, svint32_t, + z0_res = svldff1_gather_s32index_s32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** 
ldff1_gather_untied_s32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s32_s32index, svint32_t, int32_t, svint32_t, + z0_res = svldff1_gather_s32index_s32 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_x0_s32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s32_u32index, svint32_t, int32_t, svuint32_t, + z0_res = svldff1_gather_u32index_s32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_s32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s32_u32index, svint32_t, int32_t, svuint32_t, + z0_res = svldff1_gather_u32index_s32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_s32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s32_u32index, svint32_t, int32_t, svuint32_t, + z0_res = svldff1_gather_u32index_s32 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s64.c new file mode 100644 index 000000000..faf71bf9d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_s64.c @@ -0,0 +1,348 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_gather_s64_tied1: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_s64 (p0, z0), + z0_res = svldff1_gather_s64 (p0, z0)) + +/* +** ldff1_gather_s64_untied: +** ldff1d z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_s64 (p0, z1), + z0_res = svldff1_gather_s64 (p0, z1)) + +/* +** ldff1_gather_x0_s64_offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svldff1_gather_offset_s64 (p0, z0, x0)) + +/* +** ldff1_gather_m8_s64_offset: +** mov (x[0-9]+), #?-8 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m8_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, -8), + z0_res = svldff1_gather_offset_s64 (p0, z0, -8)) + +/* +** ldff1_gather_0_s64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svldff1_gather_offset_s64 (p0, z0, 0)) + +/* +** ldff1_gather_9_s64_offset: +** mov (x[0-9]+), #?9 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_9_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 9), + z0_res = svldff1_gather_offset_s64 (p0, z0, 9)) + +/* +** ldff1_gather_10_s64_offset: +** mov (x[0-9]+), #?10 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_10_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 10), + z0_res = svldff1_gather_offset_s64 (p0, z0, 10)) + +/* +** ldff1_gather_11_s64_offset: +** mov (x[0-9]+), #?11 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ 
+TEST_LOAD_GATHER_ZS (ldff1_gather_11_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 11), + z0_res = svldff1_gather_offset_s64 (p0, z0, 11)) + +/* +** ldff1_gather_12_s64_offset: +** mov (x[0-9]+), #?12 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_12_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 12), + z0_res = svldff1_gather_offset_s64 (p0, z0, 12)) + +/* +** ldff1_gather_13_s64_offset: +** mov (x[0-9]+), #?13 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_13_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 13), + z0_res = svldff1_gather_offset_s64 (p0, z0, 13)) + +/* +** ldff1_gather_14_s64_offset: +** mov (x[0-9]+), #?14 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_14_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 14), + z0_res = svldff1_gather_offset_s64 (p0, z0, 14)) + +/* +** ldff1_gather_15_s64_offset: +** mov (x[0-9]+), #?15 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_15_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 15), + z0_res = svldff1_gather_offset_s64 (p0, z0, 15)) + +/* +** ldff1_gather_16_s64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d, #16\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_16_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 16), + z0_res = svldff1_gather_offset_s64 (p0, z0, 16)) + +/* +** ldff1_gather_248_s64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_248_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 248), + z0_res = svldff1_gather_offset_s64 (p0, z0, 248)) + +/* +** ldff1_gather_256_s64_offset: +** mov (x[0-9]+), #?256 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_256_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_s64 (p0, z0, 256), + z0_res = svldff1_gather_offset_s64 (p0, z0, 256)) + +/* +** ldff1_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?3 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svldff1_gather_index_s64 (p0, z0, x0)) + +/* +** ldff1_gather_m1_s64_index: +** mov (x[0-9]+), #?-8 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svldff1_gather_index_s64 (p0, z0, -1)) + +/* +** ldff1_gather_0_s64_index: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svldff1_gather_index_s64 (p0, z0, 0)) + +/* +** ldff1_gather_5_s64_index: +** ldff1d z0\.d, p0/z, \[z0\.d, #40\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svldff1_gather_index_s64 (p0, z0, 5)) + +/* +** ldff1_gather_31_s64_index: +** ldff1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_s64 (p0, z0, 31), + z0_res = 
svldff1_gather_index_s64 (p0, z0, 31)) + +/* +** ldff1_gather_32_s64_index: +** mov (x[0-9]+), #?256 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svldff1_gather_index_s64 (p0, z0, 32)) + +/* +** ldff1_gather_x0_s64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s64_s64offset, svint64_t, int64_t, svint64_t, + z0_res = svldff1_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_s64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s64_s64offset, svint64_t, int64_t, svint64_t, + z0_res = svldff1_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_s64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s64_s64offset, svint64_t, int64_t, svint64_t, + z0_res = svldff1_gather_s64offset_s64 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_ext_s64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_s64_s64offset, svint64_t, int64_t, svint64_t, + z0_res = svldff1_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_s64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s64_u64offset, svint64_t, int64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_s64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s64_u64offset, svint64_t, int64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_s64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s64_u64offset, svint64_t, int64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_s64 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_ext_s64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_s64_u64offset, svint64_t, int64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_s64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s64_s64index, svint64_t, int64_t, svint64_t, + z0_res = svldff1_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_s64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s64_s64index, svint64_t, int64_t, svint64_t, + z0_res = svldff1_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_s64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s64_s64index, svint64_t, int64_t, svint64_t, + z0_res = svldff1_gather_s64index_s64 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_ext_s64_s64index: +** 
ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_s64_s64index, svint64_t, int64_t, svint64_t, + z0_res = svldff1_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_s64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_s64_u64index, svint64_t, int64_t, svuint64_t, + z0_res = svldff1_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_s64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_s64_u64index, svint64_t, int64_t, svuint64_t, + z0_res = svldff1_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_s64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_s64_u64index, svint64_t, int64_t, svuint64_t, + z0_res = svldff1_gather_u64index_s64 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_ext_s64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_s64_u64index, svint64_t, int64_t, svuint64_t, + z0_res = svldff1_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u32.c new file mode 100644 index 000000000..41c7dc9cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u32.c @@ -0,0 +1,272 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_gather_u32_tied1: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_u32 (p0, z0), + z0_res = svldff1_gather_u32 (p0, z0)) + +/* +** ldff1_gather_u32_untied: +** ldff1w z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_u32 (p0, z1), + z0_res = svldff1_gather_u32 (p0, z1)) + +/* +** ldff1_gather_x0_u32_offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svldff1_gather_offset_u32 (p0, z0, x0)) + +/* +** ldff1_gather_m4_u32_offset: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m4_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, -4), + z0_res = svldff1_gather_offset_u32 (p0, z0, -4)) + +/* +** ldff1_gather_0_u32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svldff1_gather_offset_u32 (p0, z0, 0)) + +/* +** ldff1_gather_5_u32_offset: +** mov (x[0-9]+), #?5 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svldff1_gather_offset_u32 (p0, z0, 5)) + +/* +** ldff1_gather_6_u32_offset: +** mov (x[0-9]+), #?6 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_6_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 6), + z0_res = svldff1_gather_offset_u32 (p0, z0, 6)) + +/* +** ldff1_gather_7_u32_offset: +** mov (x[0-9]+), #?7 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_7_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 7), + z0_res = svldff1_gather_offset_u32 (p0, z0, 7)) + +/* +** ldff1_gather_8_u32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_8_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 8), + z0_res = svldff1_gather_offset_u32 (p0, z0, 8)) + +/* +** ldff1_gather_124_u32_offset: +** ldff1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_124_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 124), + z0_res = svldff1_gather_offset_u32 (p0, z0, 124)) + +/* +** ldff1_gather_128_u32_offset: +** mov (x[0-9]+), #?128 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_128_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_offset_u32 (p0, z0, 128), + z0_res = svldff1_gather_offset_u32 (p0, z0, 128)) + +/* +** ldff1_gather_x0_u32_index: +** lsl (x[0-9]+), x0, #?2 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_u32 (p0, z0, x0), + z0_res = svldff1_gather_index_u32 (p0, z0, x0)) + +/* +** ldff1_gather_m1_u32_index: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS 
(ldff1_gather_m1_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_u32 (p0, z0, -1), + z0_res = svldff1_gather_index_u32 (p0, z0, -1)) + +/* +** ldff1_gather_0_u32_index: +** ldff1w z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_u32 (p0, z0, 0), + z0_res = svldff1_gather_index_u32 (p0, z0, 0)) + +/* +** ldff1_gather_5_u32_index: +** ldff1w z0\.s, p0/z, \[z0\.s, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_u32 (p0, z0, 5), + z0_res = svldff1_gather_index_u32 (p0, z0, 5)) + +/* +** ldff1_gather_31_u32_index: +** ldff1w z0\.s, p0/z, \[z0\.s, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_31_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_u32 (p0, z0, 31), + z0_res = svldff1_gather_index_u32 (p0, z0, 31)) + +/* +** ldff1_gather_32_u32_index: +** mov (x[0-9]+), #?128 +** ldff1w z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_32_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1_gather_u32base_index_u32 (p0, z0, 32), + z0_res = svldff1_gather_index_u32 (p0, z0, 32)) + +/* +** ldff1_gather_x0_u32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u32_s32offset, svuint32_t, uint32_t, svint32_t, + z0_res = svldff1_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_u32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u32_s32offset, svuint32_t, uint32_t, svint32_t, + z0_res = svldff1_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_u32_s32offset: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u32_s32offset, svuint32_t, uint32_t, svint32_t, + z0_res = svldff1_gather_s32offset_u32 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_x0_u32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u32_u32offset, svuint32_t, uint32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_u32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u32_u32offset, svuint32_t, uint32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_u32_u32offset: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u32_u32offset, svuint32_t, uint32_t, svuint32_t, + z0_res = svldff1_gather_u32offset_u32 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_x0_u32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u32_s32index, svuint32_t, uint32_t, svint32_t, + z0_res = svldff1_gather_s32index_u32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_u32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u32_s32index, svuint32_t, uint32_t, svint32_t, + z0_res = svldff1_gather_s32index_u32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, 
z0)) + +/* +** ldff1_gather_untied_u32_s32index: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u32_s32index, svuint32_t, uint32_t, svint32_t, + z0_res = svldff1_gather_s32index_u32 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_x0_u32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u32_u32index, svuint32_t, uint32_t, svuint32_t, + z0_res = svldff1_gather_u32index_u32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_u32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u32_u32index, svuint32_t, uint32_t, svuint32_t, + z0_res = svldff1_gather_u32index_u32 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_u32_u32index: +** ldff1w z0\.s, p0/z, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u32_u32index, svuint32_t, uint32_t, svuint32_t, + z0_res = svldff1_gather_u32index_u32 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u64.c new file mode 100644 index 000000000..8b53ce94f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_gather_u64.c @@ -0,0 +1,348 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_gather_u64_tied1: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_u64 (p0, z0), + z0_res = svldff1_gather_u64 (p0, z0)) + +/* +** ldff1_gather_u64_untied: +** ldff1d z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_u64 (p0, z1), + z0_res = svldff1_gather_u64 (p0, z1)) + +/* +** ldff1_gather_x0_u64_offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svldff1_gather_offset_u64 (p0, z0, x0)) + +/* +** ldff1_gather_m8_u64_offset: +** mov (x[0-9]+), #?-8 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m8_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, -8), + z0_res = svldff1_gather_offset_u64 (p0, z0, -8)) + +/* +** ldff1_gather_0_u64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svldff1_gather_offset_u64 (p0, z0, 0)) + +/* +** ldff1_gather_9_u64_offset: +** mov (x[0-9]+), #?9 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_9_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 9), + z0_res = svldff1_gather_offset_u64 (p0, z0, 9)) + +/* +** ldff1_gather_10_u64_offset: +** mov (x[0-9]+), #?10 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_10_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 10), + z0_res = svldff1_gather_offset_u64 (p0, z0, 10)) + +/* +** ldff1_gather_11_u64_offset: +** mov (x[0-9]+), #?11 +** ldff1d z0\.d, 
p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_11_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 11), + z0_res = svldff1_gather_offset_u64 (p0, z0, 11)) + +/* +** ldff1_gather_12_u64_offset: +** mov (x[0-9]+), #?12 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_12_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 12), + z0_res = svldff1_gather_offset_u64 (p0, z0, 12)) + +/* +** ldff1_gather_13_u64_offset: +** mov (x[0-9]+), #?13 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_13_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 13), + z0_res = svldff1_gather_offset_u64 (p0, z0, 13)) + +/* +** ldff1_gather_14_u64_offset: +** mov (x[0-9]+), #?14 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_14_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 14), + z0_res = svldff1_gather_offset_u64 (p0, z0, 14)) + +/* +** ldff1_gather_15_u64_offset: +** mov (x[0-9]+), #?15 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_15_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 15), + z0_res = svldff1_gather_offset_u64 (p0, z0, 15)) + +/* +** ldff1_gather_16_u64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d, #16\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_16_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 16), + z0_res = svldff1_gather_offset_u64 (p0, z0, 16)) + +/* +** ldff1_gather_248_u64_offset: +** ldff1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_248_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 248), + z0_res = svldff1_gather_offset_u64 (p0, z0, 248)) + +/* +** ldff1_gather_256_u64_offset: +** mov (x[0-9]+), #?256 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_256_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_offset_u64 (p0, z0, 256), + z0_res = svldff1_gather_offset_u64 (p0, z0, 256)) + +/* +** ldff1_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?3 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svldff1_gather_index_u64 (p0, z0, x0)) + +/* +** ldff1_gather_m1_u64_index: +** mov (x[0-9]+), #?-8 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svldff1_gather_index_u64 (p0, z0, -1)) + +/* +** ldff1_gather_0_u64_index: +** ldff1d z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svldff1_gather_index_u64 (p0, z0, 0)) + +/* +** ldff1_gather_5_u64_index: +** ldff1d z0\.d, p0/z, \[z0\.d, #40\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svldff1_gather_index_u64 (p0, z0, 5)) + +/* +** ldff1_gather_31_u64_index: +** ldff1d z0\.d, p0/z, \[z0\.d, #248\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = 
svldff1_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svldff1_gather_index_u64 (p0, z0, 31)) + +/* +** ldff1_gather_32_u64_index: +** mov (x[0-9]+), #?256 +** ldff1d z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svldff1_gather_index_u64 (p0, z0, 32)) + +/* +** ldff1_gather_x0_u64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u64_s64offset, svuint64_t, uint64_t, svint64_t, + z0_res = svldff1_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_u64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u64_s64offset, svuint64_t, uint64_t, svint64_t, + z0_res = svldff1_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_u64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u64_s64offset, svuint64_t, uint64_t, svint64_t, + z0_res = svldff1_gather_s64offset_u64 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_ext_u64_s64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_u64_s64offset, svuint64_t, uint64_t, svint64_t, + z0_res = svldff1_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_u64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_tied1_u64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1_gather_offset (p0, x0, z0)) + +/* +** ldff1_gather_untied_u64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_u64 (p0, x0, z1), + z0_res = svldff1_gather_offset (p0, x0, z1)) + +/* +** ldff1_gather_ext_u64_u64offset: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + z0_res = svldff1_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1_gather_offset (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_u64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u64_s64index, svuint64_t, uint64_t, svint64_t, + z0_res = svldff1_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_u64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u64_s64index, svuint64_t, uint64_t, svint64_t, + z0_res = svldff1_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_u64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u64_s64index, svuint64_t, uint64_t, svint64_t, + z0_res = svldff1_gather_s64index_u64 (p0, x0, z1), + z0_res = 
svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_ext_u64_s64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_u64_s64index, svuint64_t, uint64_t, svint64_t, + z0_res = svldff1_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1_gather_x0_u64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_x0_u64_u64index, svuint64_t, uint64_t, svuint64_t, + z0_res = svldff1_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_tied1_u64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_tied1_u64_u64index, svuint64_t, uint64_t, svuint64_t, + z0_res = svldff1_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1_gather_index (p0, x0, z0)) + +/* +** ldff1_gather_untied_u64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_untied_u64_u64index, svuint64_t, uint64_t, svuint64_t, + z0_res = svldff1_gather_u64index_u64 (p0, x0, z1), + z0_res = svldff1_gather_index (p0, x0, z1)) + +/* +** ldff1_gather_ext_u64_u64index: +** ldff1d z0\.d, p0/z, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1_gather_ext_u64_u64index, svuint64_t, uint64_t, svuint64_t, + z0_res = svldff1_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1_gather_index (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s16.c new file mode 100644 index 000000000..1d5fde0e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s16.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_s16_base: +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s16_base, svint16_t, int16_t, + z0 = svldff1_s16 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_s16_index: +** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1_s16_index, svint16_t, int16_t, + z0 = svldff1_s16 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_s16_1: +** incb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s16_1, svint16_t, int16_t, + z0 = svldff1_s16 (p0, x0 + svcnth ()), + z0 = svldff1 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_s16_m1: +** decb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s16_m1, svint16_t, int16_t, + z0 = svldff1_s16 (p0, x0 - svcnth ()), + z0 = svldff1 (p0, x0 - svcnth ())) + +/* +** ldff1_vnum_s16_0: +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s16_0, svint16_t, int16_t, + z0 = svldff1_vnum_s16 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_s16_1: +** incb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s16_1, svint16_t, int16_t, + z0 = svldff1_vnum_s16 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1_vnum_s16_m1: +** decb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s16_m1, svint16_t, int16_t, + z0 = svldff1_vnum_s16 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s16_x1, svint16_t, int16_t, + z0 = svldff1_vnum_s16 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s32.c new file mode 100644 index 000000000..97a36e884 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s32.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_s32_base: +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s32_base, svint32_t, int32_t, + z0 = svldff1_s32 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_s32_index: +** ldff1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldff1_s32_index, svint32_t, int32_t, + z0 = svldff1_s32 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_s32_1: +** incb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s32_1, svint32_t, int32_t, + z0 = svldff1_s32 (p0, x0 + svcntw ()), + z0 = svldff1 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_s32_m1: +** decb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s32_m1, svint32_t, int32_t, + z0 = svldff1_s32 (p0, x0 - svcntw ()), + z0 = svldff1 (p0, x0 - svcntw ())) + +/* +** ldff1_vnum_s32_0: +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s32_0, svint32_t, int32_t, + z0 = svldff1_vnum_s32 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_s32_1: +** incb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s32_1, svint32_t, int32_t, + z0 = svldff1_vnum_s32 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_s32_m1: +** decb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s32_m1, svint32_t, int32_t, + z0 = svldff1_vnum_s32 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s32_x1, svint32_t, int32_t, + z0 = svldff1_vnum_s32 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s64.c new file mode 100644 index 000000000..c018a4c1c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_s64_base: +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s64_base, svint64_t, int64_t, + z0 = svldff1_s64 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_s64_index: +** ldff1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ldff1_s64_index, svint64_t, int64_t, + z0 = svldff1_s64 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_s64_1: +** incb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s64_1, svint64_t, int64_t, + z0 = svldff1_s64 (p0, x0 + svcntd ()), + z0 = svldff1 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_s64_m1: +** decb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s64_m1, svint64_t, int64_t, + z0 = svldff1_s64 (p0, x0 - svcntd ()), + z0 = svldff1 (p0, x0 - svcntd ())) + +/* +** ldff1_vnum_s64_0: +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s64_0, svint64_t, int64_t, + z0 = svldff1_vnum_s64 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_s64_1: +** incb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s64_1, svint64_t, int64_t, + z0 = svldff1_vnum_s64 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_s64_m1: +** decb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s64_m1, svint64_t, int64_t, + z0 = svldff1_vnum_s64 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s64_x1, svint64_t, int64_t, + z0 = svldff1_vnum_s64 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s8.c new file mode 100644 index 000000000..cf620d1f4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_s8.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_s8_base: +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s8_base, svint8_t, int8_t, + z0 = svldff1_s8 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_s8_index: +** ldff1b z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1_s8_index, svint8_t, int8_t, + z0 = svldff1_s8 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_s8_1: +** incb x0 +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s8_1, svint8_t, int8_t, + z0 = svldff1_s8 (p0, x0 + svcntb ()), + z0 = svldff1 (p0, x0 + svcntb ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_s8_m1: +** decb x0 +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_s8_m1, svint8_t, int8_t, + z0 = svldff1_s8 (p0, x0 - svcntb ()), + z0 = svldff1 (p0, x0 - svcntb ())) + +/* +** ldff1_vnum_s8_0: +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s8_0, svint8_t, int8_t, + z0 = svldff1_vnum_s8 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1_vnum_s8_1: +** incb x0 +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s8_1, svint8_t, int8_t, + z0 = svldff1_vnum_s8 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_s8_m1: +** decb x0 +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_s8_m1, svint8_t, int8_t, + z0 = svldff1_vnum_s8 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* +** ldff1_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1b z0\.b, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1b z0\.b, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1_vnum_s8_x1, svint8_t, int8_t, + z0 = svldff1_vnum_s8 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u16.c new file mode 100644 index 000000000..1fa819296 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u16.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_u16_base: +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u16_base, svuint16_t, uint16_t, + z0 = svldff1_u16 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_u16_index: +** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1_u16_index, svuint16_t, uint16_t, + z0 = svldff1_u16 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_u16_1: +** incb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u16_1, svuint16_t, uint16_t, + z0 = svldff1_u16 (p0, x0 + svcnth ()), + z0 = svldff1 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_u16_m1: +** decb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u16_m1, svuint16_t, uint16_t, + z0 = svldff1_u16 (p0, x0 - svcnth ()), + z0 = svldff1 (p0, x0 - svcnth ())) + +/* +** ldff1_vnum_u16_0: +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u16_0, svuint16_t, uint16_t, + z0 = svldff1_vnum_u16 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_u16_1: +** incb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u16_1, svuint16_t, uint16_t, + z0 = svldff1_vnum_u16 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_u16_m1: +** decb x0 +** ldff1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u16_m1, svuint16_t, uint16_t, + z0 = svldff1_vnum_u16 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u16_x1, svuint16_t, uint16_t, + z0 = svldff1_vnum_u16 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u32.c new file mode 100644 index 000000000..5224ec40a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u32.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_u32_base: +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u32_base, svuint32_t, uint32_t, + z0 = svldff1_u32 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_u32_index: +** ldff1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldff1_u32_index, svuint32_t, uint32_t, + z0 = svldff1_u32 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_u32_1: +** incb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u32_1, svuint32_t, uint32_t, + z0 = svldff1_u32 (p0, x0 + svcntw ()), + z0 = svldff1 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_u32_m1: +** decb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u32_m1, svuint32_t, uint32_t, + z0 = svldff1_u32 (p0, x0 - svcntw ()), + z0 = svldff1 (p0, x0 - svcntw ())) + +/* +** ldff1_vnum_u32_0: +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u32_0, svuint32_t, uint32_t, + z0 = svldff1_vnum_u32 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_u32_1: +** incb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u32_1, svuint32_t, uint32_t, + z0 = svldff1_vnum_u32 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_u32_m1: +** decb x0 +** ldff1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u32_m1, svuint32_t, uint32_t, + z0 = svldff1_vnum_u32 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u32_x1, svuint32_t, uint32_t, + z0 = svldff1_vnum_u32 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u64.c new file mode 100644 index 000000000..18e87f2b8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_u64_base: +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u64_base, svuint64_t, uint64_t, + z0 = svldff1_u64 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_u64_index: +** ldff1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ldff1_u64_index, svuint64_t, uint64_t, + z0 = svldff1_u64 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_u64_1: +** incb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u64_1, svuint64_t, uint64_t, + z0 = svldff1_u64 (p0, x0 + svcntd ()), + z0 = svldff1 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_u64_m1: +** decb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u64_m1, svuint64_t, uint64_t, + z0 = svldff1_u64 (p0, x0 - svcntd ()), + z0 = svldff1 (p0, x0 - svcntd ())) + +/* +** ldff1_vnum_u64_0: +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u64_0, svuint64_t, uint64_t, + z0 = svldff1_vnum_u64 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1_vnum_u64_1: +** incb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u64_1, svuint64_t, uint64_t, + z0 = svldff1_vnum_u64 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_u64_m1: +** decb x0 +** ldff1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u64_m1, svuint64_t, uint64_t, + z0 = svldff1_vnum_u64 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u64_x1, svuint64_t, uint64_t, + z0 = svldff1_vnum_u64 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u8.c new file mode 100644 index 000000000..83883fca4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1_u8.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1_u8_base: +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u8_base, svuint8_t, uint8_t, + z0 = svldff1_u8 (p0, x0), + z0 = svldff1 (p0, x0)) + +/* +** ldff1_u8_index: +** ldff1b z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1_u8_index, svuint8_t, uint8_t, + z0 = svldff1_u8 (p0, x0 + x1), + z0 = svldff1 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_u8_1: +** incb x0 +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u8_1, svuint8_t, uint8_t, + z0 = svldff1_u8 (p0, x0 + svcntb ()), + z0 = svldff1 (p0, x0 + svcntb ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_u8_m1: +** decb x0 +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_u8_m1, svuint8_t, uint8_t, + z0 = svldff1_u8 (p0, x0 - svcntb ()), + z0 = svldff1 (p0, x0 - svcntb ())) + +/* +** ldff1_vnum_u8_0: +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u8_0, svuint8_t, uint8_t, + z0 = svldff1_vnum_u8 (p0, x0, 0), + z0 = svldff1_vnum (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_u8_1: +** incb x0 +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u8_1, svuint8_t, uint8_t, + z0 = svldff1_vnum_u8 (p0, x0, 1), + z0 = svldff1_vnum (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1_vnum_u8_m1: +** decb x0 +** ldff1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1_vnum_u8_m1, svuint8_t, uint8_t, + z0 = svldff1_vnum_u8 (p0, x0, -1), + z0 = svldff1_vnum (p0, x0, -1)) + +/* +** ldff1_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1b z0\.b, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1b z0\.b, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1_vnum_u8_x1, svuint8_t, uint8_t, + z0 = svldff1_vnum_u8 (p0, x0, x1), + z0 = svldff1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s32.c new file mode 100644 index 000000000..c2a676807 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s32.c @@ -0,0 +1,131 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_gather_s32_tied1: +** ldff1sb z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_s32 (p0, z0), + z0_res = svldff1sb_gather_s32 (p0, z0)) + +/* +** ldff1sb_gather_s32_untied: +** ldff1sb z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_s32 (p0, z1), + z0_res = svldff1sb_gather_s32 (p0, z1)) + +/* +** ldff1sb_gather_x0_s32_offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svldff1sb_gather_offset_s32 (p0, z0, x0)) + +/* +** ldff1sb_gather_m1_s32_offset: +** mov (x[0-9]+), #?-1 +** ldff1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_m1_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, -1), + z0_res = svldff1sb_gather_offset_s32 (p0, z0, -1)) + +/* +** ldff1sb_gather_0_s32_offset: +** ldff1sb z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svldff1sb_gather_offset_s32 (p0, z0, 0)) + +/* +** ldff1sb_gather_5_s32_offset: +** ldff1sb z0\.s, p0/z, \[z0\.s, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svldff1sb_gather_offset_s32 (p0, z0, 5)) + +/* +** ldff1sb_gather_31_s32_offset: +** ldff1sb z0\.s, p0/z, \[z0\.s, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_31_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, 31), + z0_res = svldff1sb_gather_offset_s32 (p0, z0, 31)) + +/* +** ldff1sb_gather_32_s32_offset: +** mov (x[0-9]+), #?32 +** ldff1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_32_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_s32 (p0, z0, 32), + z0_res = svldff1sb_gather_offset_s32 (p0, z0, 32)) + +/* +** ldff1sb_gather_x0_s32_s32offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_s32_s32offset, svint32_t, int8_t, svint32_t, + z0_res = svldff1sb_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1sb_gather_tied1_s32_s32offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_s32_s32offset, svint32_t, int8_t, svint32_t, + z0_res = svldff1sb_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1sb_gather_untied_s32_s32offset: +** ldff1sb z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_s32_s32offset, svint32_t, int8_t, svint32_t, + z0_res = svldff1sb_gather_s32offset_s32 (p0, x0, z1), + z0_res = svldff1sb_gather_offset_s32 (p0, x0, z1)) + +/* +** ldff1sb_gather_x0_s32_u32offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_s32_u32offset, svint32_t, int8_t, svuint32_t, + z0_res = svldff1sb_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1sb_gather_tied1_s32_u32offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, 
uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_s32_u32offset, svint32_t, int8_t, svuint32_t, + z0_res = svldff1sb_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1sb_gather_untied_s32_u32offset: +** ldff1sb z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_s32_u32offset, svint32_t, int8_t, svuint32_t, + z0_res = svldff1sb_gather_u32offset_s32 (p0, x0, z1), + z0_res = svldff1sb_gather_offset_s32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s64.c new file mode 100644 index 000000000..2f2a04d24 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_s64.c @@ -0,0 +1,149 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_gather_s64_tied1: +** ldff1sb z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_s64 (p0, z0), + z0_res = svldff1sb_gather_s64 (p0, z0)) + +/* +** ldff1sb_gather_s64_untied: +** ldff1sb z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_s64 (p0, z1), + z0_res = svldff1sb_gather_s64 (p0, z1)) + +/* +** ldff1sb_gather_x0_s64_offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svldff1sb_gather_offset_s64 (p0, z0, x0)) + +/* +** ldff1sb_gather_m1_s64_offset: +** mov (x[0-9]+), #?-1 +** ldff1sb z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_m1_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, -1), + z0_res = svldff1sb_gather_offset_s64 (p0, z0, -1)) + +/* +** ldff1sb_gather_0_s64_offset: +** ldff1sb z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svldff1sb_gather_offset_s64 (p0, z0, 0)) + +/* +** ldff1sb_gather_5_s64_offset: +** ldff1sb z0\.d, p0/z, \[z0\.d, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svldff1sb_gather_offset_s64 (p0, z0, 5)) + +/* +** ldff1sb_gather_31_s64_offset: +** ldff1sb z0\.d, p0/z, \[z0\.d, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_31_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, 31), + z0_res = svldff1sb_gather_offset_s64 (p0, z0, 31)) + +/* +** ldff1sb_gather_32_s64_offset: +** mov (x[0-9]+), #?32 +** ldff1sb z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_32_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_s64 (p0, z0, 32), + z0_res = svldff1sb_gather_offset_s64 (p0, z0, 32)) + +/* +** ldff1sb_gather_x0_s64_s64offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_s64_s64offset, svint64_t, int8_t, svint64_t, + z0_res = svldff1sb_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sb_gather_tied1_s64_s64offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ 
+TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_s64_s64offset, svint64_t, int8_t, svint64_t, + z0_res = svldff1sb_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sb_gather_untied_s64_s64offset: +** ldff1sb z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_s64_s64offset, svint64_t, int8_t, svint64_t, + z0_res = svldff1sb_gather_s64offset_s64 (p0, x0, z1), + z0_res = svldff1sb_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1sb_gather_ext_s64_s64offset: +** ldff1sb z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_ext_s64_s64offset, svint64_t, int8_t, svint64_t, + z0_res = svldff1sb_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sb_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sb_gather_x0_s64_u64offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_s64_u64offset, svint64_t, int8_t, svuint64_t, + z0_res = svldff1sb_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sb_gather_tied1_s64_u64offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_s64_u64offset, svint64_t, int8_t, svuint64_t, + z0_res = svldff1sb_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sb_gather_untied_s64_u64offset: +** ldff1sb z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_s64_u64offset, svint64_t, int8_t, svuint64_t, + z0_res = svldff1sb_gather_u64offset_s64 (p0, x0, z1), + z0_res = svldff1sb_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1sb_gather_ext_s64_u64offset: +** ldff1sb z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_ext_s64_u64offset, svint64_t, int8_t, svuint64_t, + z0_res = svldff1sb_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sb_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u32.c new file mode 100644 index 000000000..e3e83a205 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u32.c @@ -0,0 +1,131 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_gather_u32_tied1: +** ldff1sb z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_u32 (p0, z0), + z0_res = svldff1sb_gather_u32 (p0, z0)) + +/* +** ldff1sb_gather_u32_untied: +** ldff1sb z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_u32 (p0, z1), + z0_res = svldff1sb_gather_u32 (p0, z1)) + +/* +** ldff1sb_gather_x0_u32_offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svldff1sb_gather_offset_u32 (p0, z0, x0)) + +/* +** ldff1sb_gather_m1_u32_offset: +** mov (x[0-9]+), #?-1 +** ldff1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_m1_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, -1), + z0_res = svldff1sb_gather_offset_u32 (p0, z0, -1)) + +/* +** ldff1sb_gather_0_u32_offset: +** ldff1sb z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svldff1sb_gather_offset_u32 (p0, z0, 0)) + +/* +** ldff1sb_gather_5_u32_offset: +** ldff1sb z0\.s, p0/z, \[z0\.s, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svldff1sb_gather_offset_u32 (p0, z0, 5)) + +/* +** ldff1sb_gather_31_u32_offset: +** ldff1sb z0\.s, p0/z, \[z0\.s, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_31_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, 31), + z0_res = svldff1sb_gather_offset_u32 (p0, z0, 31)) + +/* +** ldff1sb_gather_32_u32_offset: +** mov (x[0-9]+), #?32 +** ldff1sb z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_32_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sb_gather_u32base_offset_u32 (p0, z0, 32), + z0_res = svldff1sb_gather_offset_u32 (p0, z0, 32)) + +/* +** ldff1sb_gather_x0_u32_s32offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_u32_s32offset, svuint32_t, int8_t, svint32_t, + z0_res = svldff1sb_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1sb_gather_tied1_u32_s32offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_u32_s32offset, svuint32_t, int8_t, svint32_t, + z0_res = svldff1sb_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1sb_gather_untied_u32_s32offset: +** ldff1sb z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_u32_s32offset, svuint32_t, int8_t, svint32_t, + z0_res = svldff1sb_gather_s32offset_u32 (p0, x0, z1), + z0_res = svldff1sb_gather_offset_u32 (p0, x0, z1)) + +/* +** ldff1sb_gather_x0_u32_u32offset: +** ldff1sb z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_u32_u32offset, svuint32_t, int8_t, svuint32_t, + z0_res = svldff1sb_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1sb_gather_tied1_u32_u32offset: +** ldff1sb z0\.s, p0/z, 
\[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_u32_u32offset, svuint32_t, int8_t, svuint32_t, + z0_res = svldff1sb_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1sb_gather_untied_u32_u32offset: +** ldff1sb z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_u32_u32offset, svuint32_t, int8_t, svuint32_t, + z0_res = svldff1sb_gather_u32offset_u32 (p0, x0, z1), + z0_res = svldff1sb_gather_offset_u32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u64.c new file mode 100644 index 000000000..769f2c266 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_gather_u64.c @@ -0,0 +1,149 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_gather_u64_tied1: +** ldff1sb z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_u64 (p0, z0), + z0_res = svldff1sb_gather_u64 (p0, z0)) + +/* +** ldff1sb_gather_u64_untied: +** ldff1sb z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_u64 (p0, z1), + z0_res = svldff1sb_gather_u64 (p0, z1)) + +/* +** ldff1sb_gather_x0_u64_offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svldff1sb_gather_offset_u64 (p0, z0, x0)) + +/* +** ldff1sb_gather_m1_u64_offset: +** mov (x[0-9]+), #?-1 +** ldff1sb z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_m1_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, -1), + z0_res = svldff1sb_gather_offset_u64 (p0, z0, -1)) + +/* +** ldff1sb_gather_0_u64_offset: +** ldff1sb z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svldff1sb_gather_offset_u64 (p0, z0, 0)) + +/* +** ldff1sb_gather_5_u64_offset: +** ldff1sb z0\.d, p0/z, \[z0\.d, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svldff1sb_gather_offset_u64 (p0, z0, 5)) + +/* +** ldff1sb_gather_31_u64_offset: +** ldff1sb z0\.d, p0/z, \[z0\.d, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_31_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, 31), + z0_res = svldff1sb_gather_offset_u64 (p0, z0, 31)) + +/* +** ldff1sb_gather_32_u64_offset: +** mov (x[0-9]+), #?32 +** ldff1sb z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sb_gather_32_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sb_gather_u64base_offset_u64 (p0, z0, 32), + z0_res = svldff1sb_gather_offset_u64 (p0, z0, 32)) + +/* +** ldff1sb_gather_x0_u64_s64offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_u64_s64offset, svuint64_t, int8_t, svint64_t, + z0_res = svldff1sb_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sb_gather_tied1_u64_s64offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** 
ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_u64_s64offset, svuint64_t, int8_t, svint64_t, + z0_res = svldff1sb_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sb_gather_untied_u64_s64offset: +** ldff1sb z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_u64_s64offset, svuint64_t, int8_t, svint64_t, + z0_res = svldff1sb_gather_s64offset_u64 (p0, x0, z1), + z0_res = svldff1sb_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1sb_gather_ext_u64_s64offset: +** ldff1sb z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_ext_u64_s64offset, svuint64_t, int8_t, svint64_t, + z0_res = svldff1sb_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sb_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sb_gather_x0_u64_u64offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_x0_u64_u64offset, svuint64_t, int8_t, svuint64_t, + z0_res = svldff1sb_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sb_gather_tied1_u64_u64offset: +** ldff1sb z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_tied1_u64_u64offset, svuint64_t, int8_t, svuint64_t, + z0_res = svldff1sb_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1sb_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sb_gather_untied_u64_u64offset: +** ldff1sb z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_untied_u64_u64offset, svuint64_t, int8_t, svuint64_t, + z0_res = svldff1sb_gather_u64offset_u64 (p0, x0, z1), + z0_res = svldff1sb_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1sb_gather_ext_u64_u64offset: +** ldff1sb z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sb_gather_ext_u64_u64offset, svuint64_t, int8_t, svuint64_t, + z0_res = svldff1sb_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sb_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s16.c new file mode 100644 index 000000000..e0a748c6a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s16.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_s16_base: +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s16_base, svint16_t, int8_t, + z0 = svldff1sb_s16 (p0, x0), + z0 = svldff1sb_s16 (p0, x0)) + +/* +** ldff1sb_s16_index: +** ldff1sb z0\.h, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1sb_s16_index, svint16_t, int8_t, + z0 = svldff1sb_s16 (p0, x0 + x1), + z0 = svldff1sb_s16 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_s16_1: +** inch x0 +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s16_1, svint16_t, int8_t, + z0 = svldff1sb_s16 (p0, x0 + svcnth ()), + z0 = svldff1sb_s16 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1sb_s16_m1: +** dech x0 +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s16_m1, svint16_t, int8_t, + z0 = svldff1sb_s16 (p0, x0 - svcnth ()), + z0 = svldff1sb_s16 (p0, x0 - svcnth ())) + +/* +** ldff1sb_vnum_s16_0: +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s16_0, svint16_t, int8_t, + z0 = svldff1sb_vnum_s16 (p0, x0, 0), + z0 = svldff1sb_vnum_s16 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_s16_1: +** inch x0 +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s16_1, svint16_t, int8_t, + z0 = svldff1sb_vnum_s16 (p0, x0, 1), + z0 = svldff1sb_vnum_s16 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_s16_m1: +** dech x0 +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s16_m1, svint16_t, int8_t, + z0 = svldff1sb_vnum_s16 (p0, x0, -1), + z0 = svldff1sb_vnum_s16 (p0, x0, -1)) + +/* +** ldff1sb_vnum_s16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1sb z0\.h, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1sb z0\.h, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s16_x1, svint16_t, int8_t, + z0 = svldff1sb_vnum_s16 (p0, x0, x1), + z0 = svldff1sb_vnum_s16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s32.c new file mode 100644 index 000000000..86716da9b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s32.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_s32_base: +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s32_base, svint32_t, int8_t, + z0 = svldff1sb_s32 (p0, x0), + z0 = svldff1sb_s32 (p0, x0)) + +/* +** ldff1sb_s32_index: +** ldff1sb z0\.s, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1sb_s32_index, svint32_t, int8_t, + z0 = svldff1sb_s32 (p0, x0 + x1), + z0 = svldff1sb_s32 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_s32_1: +** incw x0 +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s32_1, svint32_t, int8_t, + z0 = svldff1sb_s32 (p0, x0 + svcntw ()), + z0 = svldff1sb_s32 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_s32_m1: +** decw x0 +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s32_m1, svint32_t, int8_t, + z0 = svldff1sb_s32 (p0, x0 - svcntw ()), + z0 = svldff1sb_s32 (p0, x0 - svcntw ())) + +/* +** ldff1sb_vnum_s32_0: +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s32_0, svint32_t, int8_t, + z0 = svldff1sb_vnum_s32 (p0, x0, 0), + z0 = svldff1sb_vnum_s32 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_s32_1: +** incw x0 +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s32_1, svint32_t, int8_t, + z0 = svldff1sb_vnum_s32 (p0, x0, 1), + z0 = svldff1sb_vnum_s32 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1sb_vnum_s32_m1: +** decw x0 +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s32_m1, svint32_t, int8_t, + z0 = svldff1sb_vnum_s32 (p0, x0, -1), + z0 = svldff1sb_vnum_s32 (p0, x0, -1)) + +/* +** ldff1sb_vnum_s32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1sb z0\.s, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1sb z0\.s, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s32_x1, svint32_t, int8_t, + z0 = svldff1sb_vnum_s32 (p0, x0, x1), + z0 = svldff1sb_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s64.c new file mode 100644 index 000000000..e7a4aa6e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_s64.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_s64_base: +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s64_base, svint64_t, int8_t, + z0 = svldff1sb_s64 (p0, x0), + z0 = svldff1sb_s64 (p0, x0)) + +/* +** ldff1sb_s64_index: +** ldff1sb z0\.d, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1sb_s64_index, svint64_t, int8_t, + z0 = svldff1sb_s64 (p0, x0 + x1), + z0 = svldff1sb_s64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_s64_1: +** incd x0 +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s64_1, svint64_t, int8_t, + z0 = svldff1sb_s64 (p0, x0 + svcntd ()), + z0 = svldff1sb_s64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_s64_m1: +** decd x0 +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_s64_m1, svint64_t, int8_t, + z0 = svldff1sb_s64 (p0, x0 - svcntd ()), + z0 = svldff1sb_s64 (p0, x0 - svcntd ())) + +/* +** ldff1sb_vnum_s64_0: +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s64_0, svint64_t, int8_t, + z0 = svldff1sb_vnum_s64 (p0, x0, 0), + z0 = svldff1sb_vnum_s64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_s64_1: +** incd x0 +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s64_1, svint64_t, int8_t, + z0 = svldff1sb_vnum_s64 (p0, x0, 1), + z0 = svldff1sb_vnum_s64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_s64_m1: +** decd x0 +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s64_m1, svint64_t, int8_t, + z0 = svldff1sb_vnum_s64 (p0, x0, -1), + z0 = svldff1sb_vnum_s64 (p0, x0, -1)) + +/* +** ldff1sb_vnum_s64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1sb z0\.d, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1sb z0\.d, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1sb_vnum_s64_x1, svint64_t, int8_t, + z0 = svldff1sb_vnum_s64 (p0, x0, x1), + z0 = svldff1sb_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u16.c new file mode 100644 index 000000000..69ba96d52 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u16.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_u16_base: +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u16_base, svuint16_t, int8_t, + z0 = svldff1sb_u16 (p0, x0), + z0 = svldff1sb_u16 (p0, x0)) + +/* +** ldff1sb_u16_index: +** ldff1sb z0\.h, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1sb_u16_index, svuint16_t, int8_t, + z0 = svldff1sb_u16 (p0, x0 + x1), + z0 = svldff1sb_u16 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_u16_1: +** inch x0 +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u16_1, svuint16_t, int8_t, + z0 = svldff1sb_u16 (p0, x0 + svcnth ()), + z0 = svldff1sb_u16 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_u16_m1: +** dech x0 +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u16_m1, svuint16_t, int8_t, + z0 = svldff1sb_u16 (p0, x0 - svcnth ()), + z0 = svldff1sb_u16 (p0, x0 - svcnth ())) + +/* +** ldff1sb_vnum_u16_0: +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u16_0, svuint16_t, int8_t, + z0 = svldff1sb_vnum_u16 (p0, x0, 0), + z0 = svldff1sb_vnum_u16 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_u16_1: +** inch x0 +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u16_1, svuint16_t, int8_t, + z0 = svldff1sb_vnum_u16 (p0, x0, 1), + z0 = svldff1sb_vnum_u16 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_u16_m1: +** dech x0 +** ldff1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u16_m1, svuint16_t, int8_t, + z0 = svldff1sb_vnum_u16 (p0, x0, -1), + z0 = svldff1sb_vnum_u16 (p0, x0, -1)) + +/* +** ldff1sb_vnum_u16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1sb z0\.h, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1sb z0\.h, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u16_x1, svuint16_t, int8_t, + z0 = svldff1sb_vnum_u16 (p0, x0, x1), + z0 = svldff1sb_vnum_u16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u32.c new file mode 100644 index 000000000..e1a1873f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u32.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_u32_base: +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u32_base, svuint32_t, int8_t, + z0 = svldff1sb_u32 (p0, x0), + z0 = svldff1sb_u32 (p0, x0)) + +/* +** ldff1sb_u32_index: +** ldff1sb z0\.s, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1sb_u32_index, svuint32_t, int8_t, + z0 = svldff1sb_u32 (p0, x0 + x1), + z0 = svldff1sb_u32 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_u32_1: +** incw x0 +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u32_1, svuint32_t, int8_t, + z0 = svldff1sb_u32 (p0, x0 + svcntw ()), + z0 = svldff1sb_u32 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1sb_u32_m1: +** decw x0 +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u32_m1, svuint32_t, int8_t, + z0 = svldff1sb_u32 (p0, x0 - svcntw ()), + z0 = svldff1sb_u32 (p0, x0 - svcntw ())) + +/* +** ldff1sb_vnum_u32_0: +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u32_0, svuint32_t, int8_t, + z0 = svldff1sb_vnum_u32 (p0, x0, 0), + z0 = svldff1sb_vnum_u32 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_u32_1: +** incw x0 +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u32_1, svuint32_t, int8_t, + z0 = svldff1sb_vnum_u32 (p0, x0, 1), + z0 = svldff1sb_vnum_u32 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_u32_m1: +** decw x0 +** ldff1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u32_m1, svuint32_t, int8_t, + z0 = svldff1sb_vnum_u32 (p0, x0, -1), + z0 = svldff1sb_vnum_u32 (p0, x0, -1)) + +/* +** ldff1sb_vnum_u32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1sb z0\.s, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1sb z0\.s, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u32_x1, svuint32_t, int8_t, + z0 = svldff1sb_vnum_u32 (p0, x0, x1), + z0 = svldff1sb_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u64.c new file mode 100644 index 000000000..0a49cbcc0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sb_u64.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sb_u64_base: +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u64_base, svuint64_t, int8_t, + z0 = svldff1sb_u64 (p0, x0), + z0 = svldff1sb_u64 (p0, x0)) + +/* +** ldff1sb_u64_index: +** ldff1sb z0\.d, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1sb_u64_index, svuint64_t, int8_t, + z0 = svldff1sb_u64 (p0, x0 + x1), + z0 = svldff1sb_u64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_u64_1: +** incd x0 +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u64_1, svuint64_t, int8_t, + z0 = svldff1sb_u64 (p0, x0 + svcntd ()), + z0 = svldff1sb_u64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_u64_m1: +** decd x0 +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_u64_m1, svuint64_t, int8_t, + z0 = svldff1sb_u64 (p0, x0 - svcntd ()), + z0 = svldff1sb_u64 (p0, x0 - svcntd ())) + +/* +** ldff1sb_vnum_u64_0: +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u64_0, svuint64_t, int8_t, + z0 = svldff1sb_vnum_u64 (p0, x0, 0), + z0 = svldff1sb_vnum_u64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sb_vnum_u64_1: +** incd x0 +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u64_1, svuint64_t, int8_t, + z0 = svldff1sb_vnum_u64 (p0, x0, 1), + z0 = svldff1sb_vnum_u64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1sb_vnum_u64_m1: +** decd x0 +** ldff1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u64_m1, svuint64_t, int8_t, + z0 = svldff1sb_vnum_u64 (p0, x0, -1), + z0 = svldff1sb_vnum_u64 (p0, x0, -1)) + +/* +** ldff1sb_vnum_u64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1sb z0\.d, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1sb z0\.d, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1sb_vnum_u64_x1, svuint64_t, int8_t, + z0 = svldff1sb_vnum_u64 (p0, x0, x1), + z0 = svldff1sb_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s32.c new file mode 100644 index 000000000..b633335dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s32.c @@ -0,0 +1,252 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sh_gather_s32_tied1: +** ldff1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_s32 (p0, z0), + z0_res = svldff1sh_gather_s32 (p0, z0)) + +/* +** ldff1sh_gather_s32_untied: +** ldff1sh z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_s32 (p0, z1), + z0_res = svldff1sh_gather_s32 (p0, z1)) + +/* +** ldff1sh_gather_x0_s32_offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svldff1sh_gather_offset_s32 (p0, z0, x0)) + +/* +** ldff1sh_gather_m2_s32_offset: +** mov (x[0-9]+), #?-2 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_m2_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, -2), + z0_res = svldff1sh_gather_offset_s32 (p0, z0, -2)) + +/* +** ldff1sh_gather_0_s32_offset: +** ldff1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svldff1sh_gather_offset_s32 (p0, z0, 0)) + +/* +** ldff1sh_gather_5_s32_offset: +** mov (x[0-9]+), #?5 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svldff1sh_gather_offset_s32 (p0, z0, 5)) + +/* +** ldff1sh_gather_6_s32_offset: +** ldff1sh z0\.s, p0/z, \[z0\.s, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_6_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 6), + z0_res = svldff1sh_gather_offset_s32 (p0, z0, 6)) + +/* +** ldff1sh_gather_62_s32_offset: +** ldff1sh z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_62_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 62), + z0_res = svldff1sh_gather_offset_s32 (p0, z0, 62)) + +/* +** ldff1sh_gather_64_s32_offset: +** mov (x[0-9]+), #?64 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_64_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_s32 (p0, z0, 64), + z0_res = svldff1sh_gather_offset_s32 (p0, z0, 64)) + +/* +** 
ldff1sh_gather_x0_s32_index: +** lsl (x[0-9]+), x0, #?1 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_s32_index, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, x0), + z0_res = svldff1sh_gather_index_s32 (p0, z0, x0)) + +/* +** ldff1sh_gather_m1_s32_index: +** mov (x[0-9]+), #?-2 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_m1_s32_index, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, -1), + z0_res = svldff1sh_gather_index_s32 (p0, z0, -1)) + +/* +** ldff1sh_gather_0_s32_index: +** ldff1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_s32_index, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, 0), + z0_res = svldff1sh_gather_index_s32 (p0, z0, 0)) + +/* +** ldff1sh_gather_5_s32_index: +** ldff1sh z0\.s, p0/z, \[z0\.s, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_s32_index, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, 5), + z0_res = svldff1sh_gather_index_s32 (p0, z0, 5)) + +/* +** ldff1sh_gather_31_s32_index: +** ldff1sh z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_31_s32_index, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, 31), + z0_res = svldff1sh_gather_index_s32 (p0, z0, 31)) + +/* +** ldff1sh_gather_32_s32_index: +** mov (x[0-9]+), #?64 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_32_s32_index, svint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_s32 (p0, z0, 32), + z0_res = svldff1sh_gather_index_s32 (p0, z0, 32)) + +/* +** ldff1sh_gather_x0_s32_s32offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s32_s32offset, svint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_s32_s32offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s32_s32offset, svint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_s32_s32offset: +** ldff1sh z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s32_s32offset, svint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32offset_s32 (p0, x0, z1), + z0_res = svldff1sh_gather_offset_s32 (p0, x0, z1)) + +/* +** ldff1sh_gather_x0_s32_u32offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s32_u32offset, svint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_s32_u32offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s32_u32offset, svint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_s32_u32offset: +** ldff1sh z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s32_u32offset, svint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32offset_s32 (p0, x0, z1), + z0_res = svldff1sh_gather_offset_s32 (p0, x0, z1)) + +/* +** 
ldff1sh_gather_x0_s32_s32index: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s32_s32index, svint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32index_s32 (p0, x0, z0), + z0_res = svldff1sh_gather_index_s32 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_s32_s32index: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s32_s32index, svint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32index_s32 (p0, x0, z0), + z0_res = svldff1sh_gather_index_s32 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_s32_s32index: +** ldff1sh z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s32_s32index, svint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32index_s32 (p0, x0, z1), + z0_res = svldff1sh_gather_index_s32 (p0, x0, z1)) + +/* +** ldff1sh_gather_x0_s32_u32index: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s32_u32index, svint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32index_s32 (p0, x0, z0), + z0_res = svldff1sh_gather_index_s32 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_s32_u32index: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s32_u32index, svint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32index_s32 (p0, x0, z0), + z0_res = svldff1sh_gather_index_s32 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_s32_u32index: +** ldff1sh z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s32_u32index, svint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32index_s32 (p0, x0, z1), + z0_res = svldff1sh_gather_index_s32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s64.c new file mode 100644 index 000000000..32a4309b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_s64.c @@ -0,0 +1,288 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sh_gather_s64_tied1: +** ldff1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_s64 (p0, z0), + z0_res = svldff1sh_gather_s64 (p0, z0)) + +/* +** ldff1sh_gather_s64_untied: +** ldff1sh z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_s64 (p0, z1), + z0_res = svldff1sh_gather_s64 (p0, z1)) + +/* +** ldff1sh_gather_x0_s64_offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svldff1sh_gather_offset_s64 (p0, z0, x0)) + +/* +** ldff1sh_gather_m2_s64_offset: +** mov (x[0-9]+), #?-2 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_m2_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, -2), + z0_res = svldff1sh_gather_offset_s64 (p0, z0, -2)) + +/* +** ldff1sh_gather_0_s64_offset: +** ldff1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svldff1sh_gather_offset_s64 (p0, z0, 0)) + +/* +** ldff1sh_gather_5_s64_offset: +** mov (x[0-9]+), #?5 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svldff1sh_gather_offset_s64 (p0, z0, 5)) + +/* +** ldff1sh_gather_6_s64_offset: +** ldff1sh z0\.d, p0/z, \[z0\.d, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_6_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 6), + z0_res = svldff1sh_gather_offset_s64 (p0, z0, 6)) + +/* +** ldff1sh_gather_62_s64_offset: +** ldff1sh z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_62_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 62), + z0_res = svldff1sh_gather_offset_s64 (p0, z0, 62)) + +/* +** ldff1sh_gather_64_s64_offset: +** mov (x[0-9]+), #?64 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_64_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_s64 (p0, z0, 64), + z0_res = svldff1sh_gather_offset_s64 (p0, z0, 64)) + +/* +** ldff1sh_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?1 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svldff1sh_gather_index_s64 (p0, z0, x0)) + +/* +** ldff1sh_gather_m1_s64_index: +** mov (x[0-9]+), #?-2 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svldff1sh_gather_index_s64 (p0, z0, -1)) + +/* +** ldff1sh_gather_0_s64_index: +** ldff1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svldff1sh_gather_index_s64 (p0, z0, 0)) + +/* +** ldff1sh_gather_5_s64_index: +** ldff1sh z0\.d, p0/z, \[z0\.d, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS 
(ldff1sh_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svldff1sh_gather_index_s64 (p0, z0, 5)) + +/* +** ldff1sh_gather_31_s64_index: +** ldff1sh z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svldff1sh_gather_index_s64 (p0, z0, 31)) + +/* +** ldff1sh_gather_32_s64_index: +** mov (x[0-9]+), #?64 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svldff1sh_gather_index_s64 (p0, z0, 32)) + +/* +** ldff1sh_gather_x0_s64_s64offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s64_s64offset, svint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_s64_s64offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s64_s64offset, svint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_s64_s64offset: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s64_s64offset, svint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64offset_s64 (p0, x0, z1), + z0_res = svldff1sh_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1sh_gather_ext_s64_s64offset: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_s64_s64offset, svint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sh_gather_x0_s64_u64offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s64_u64offset, svint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_s64_u64offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s64_u64offset, svint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_s64_u64offset: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s64_u64offset, svint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64offset_s64 (p0, x0, z1), + z0_res = svldff1sh_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1sh_gather_ext_s64_u64offset: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_s64_u64offset, svint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sh_gather_x0_s64_s64index: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s64_s64index, svint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1sh_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_s64_s64index: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** 
ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s64_s64index, svint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1sh_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_s64_s64index: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s64_s64index, svint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64index_s64 (p0, x0, z1), + z0_res = svldff1sh_gather_index_s64 (p0, x0, z1)) + +/* +** ldff1sh_gather_ext_s64_s64index: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_s64_s64index, svint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sh_gather_x0_s64_u64index: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_s64_u64index, svint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1sh_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_s64_u64index: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_s64_u64index, svint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1sh_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_s64_u64index: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_s64_u64index, svint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64index_s64 (p0, x0, z1), + z0_res = svldff1sh_gather_index_s64 (p0, x0, z1)) + +/* +** ldff1sh_gather_ext_s64_u64index: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_s64_u64index, svint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u32.c new file mode 100644 index 000000000..73a9be892 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u32.c @@ -0,0 +1,252 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sh_gather_u32_tied1: +** ldff1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_u32 (p0, z0), + z0_res = svldff1sh_gather_u32 (p0, z0)) + +/* +** ldff1sh_gather_u32_untied: +** ldff1sh z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_u32 (p0, z1), + z0_res = svldff1sh_gather_u32 (p0, z1)) + +/* +** ldff1sh_gather_x0_u32_offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svldff1sh_gather_offset_u32 (p0, z0, x0)) + +/* +** ldff1sh_gather_m2_u32_offset: +** mov (x[0-9]+), #?-2 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_m2_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, -2), + z0_res = svldff1sh_gather_offset_u32 (p0, z0, -2)) + +/* +** ldff1sh_gather_0_u32_offset: +** ldff1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svldff1sh_gather_offset_u32 (p0, z0, 0)) + +/* +** ldff1sh_gather_5_u32_offset: +** mov (x[0-9]+), #?5 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svldff1sh_gather_offset_u32 (p0, z0, 5)) + +/* +** ldff1sh_gather_6_u32_offset: +** ldff1sh z0\.s, p0/z, \[z0\.s, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_6_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 6), + z0_res = svldff1sh_gather_offset_u32 (p0, z0, 6)) + +/* +** ldff1sh_gather_62_u32_offset: +** ldff1sh z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_62_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 62), + z0_res = svldff1sh_gather_offset_u32 (p0, z0, 62)) + +/* +** ldff1sh_gather_64_u32_offset: +** mov (x[0-9]+), #?64 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_64_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_offset_u32 (p0, z0, 64), + z0_res = svldff1sh_gather_offset_u32 (p0, z0, 64)) + +/* +** ldff1sh_gather_x0_u32_index: +** lsl (x[0-9]+), x0, #?1 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, x0), + z0_res = svldff1sh_gather_index_u32 (p0, z0, x0)) + +/* +** ldff1sh_gather_m1_u32_index: +** mov (x[0-9]+), #?-2 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_m1_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, -1), + z0_res = svldff1sh_gather_index_u32 (p0, z0, -1)) + +/* +** ldff1sh_gather_0_u32_index: +** ldff1sh z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, 0), + z0_res = svldff1sh_gather_index_u32 (p0, z0, 0)) + +/* +** ldff1sh_gather_5_u32_index: +** ldff1sh z0\.s, p0/z, \[z0\.s, #10\] 
+** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, 5), + z0_res = svldff1sh_gather_index_u32 (p0, z0, 5)) + +/* +** ldff1sh_gather_31_u32_index: +** ldff1sh z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_31_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, 31), + z0_res = svldff1sh_gather_index_u32 (p0, z0, 31)) + +/* +** ldff1sh_gather_32_u32_index: +** mov (x[0-9]+), #?64 +** ldff1sh z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_32_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1sh_gather_u32base_index_u32 (p0, z0, 32), + z0_res = svldff1sh_gather_index_u32 (p0, z0, 32)) + +/* +** ldff1sh_gather_x0_u32_s32offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u32_s32offset, svuint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_u32_s32offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u32_s32offset, svuint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_u32_s32offset: +** ldff1sh z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u32_s32offset, svuint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32offset_u32 (p0, x0, z1), + z0_res = svldff1sh_gather_offset_u32 (p0, x0, z1)) + +/* +** ldff1sh_gather_x0_u32_u32offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u32_u32offset, svuint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_u32_u32offset: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u32_u32offset, svuint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_u32_u32offset: +** ldff1sh z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u32_u32offset, svuint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32offset_u32 (p0, x0, z1), + z0_res = svldff1sh_gather_offset_u32 (p0, x0, z1)) + +/* +** ldff1sh_gather_x0_u32_s32index: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u32_s32index, svuint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32index_u32 (p0, x0, z0), + z0_res = svldff1sh_gather_index_u32 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_u32_s32index: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u32_s32index, svuint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32index_u32 (p0, x0, z0), + z0_res = svldff1sh_gather_index_u32 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_u32_s32index: +** ldff1sh z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u32_s32index, svuint32_t, int16_t, svint32_t, + z0_res = svldff1sh_gather_s32index_u32 (p0, x0, z1), + z0_res = svldff1sh_gather_index_u32 (p0, x0, z1)) + +/* +** ldff1sh_gather_x0_u32_u32index: +** ldff1sh z0\.s, p0/z, 
\[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u32_u32index, svuint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32index_u32 (p0, x0, z0), + z0_res = svldff1sh_gather_index_u32 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_u32_u32index: +** ldff1sh z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u32_u32index, svuint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32index_u32 (p0, x0, z0), + z0_res = svldff1sh_gather_index_u32 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_u32_u32index: +** ldff1sh z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u32_u32index, svuint32_t, int16_t, svuint32_t, + z0_res = svldff1sh_gather_u32index_u32 (p0, x0, z1), + z0_res = svldff1sh_gather_index_u32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u64.c new file mode 100644 index 000000000..94ea73b63 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_gather_u64.c @@ -0,0 +1,288 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sh_gather_u64_tied1: +** ldff1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_u64 (p0, z0), + z0_res = svldff1sh_gather_u64 (p0, z0)) + +/* +** ldff1sh_gather_u64_untied: +** ldff1sh z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_u64 (p0, z1), + z0_res = svldff1sh_gather_u64 (p0, z1)) + +/* +** ldff1sh_gather_x0_u64_offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svldff1sh_gather_offset_u64 (p0, z0, x0)) + +/* +** ldff1sh_gather_m2_u64_offset: +** mov (x[0-9]+), #?-2 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_m2_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, -2), + z0_res = svldff1sh_gather_offset_u64 (p0, z0, -2)) + +/* +** ldff1sh_gather_0_u64_offset: +** ldff1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svldff1sh_gather_offset_u64 (p0, z0, 0)) + +/* +** ldff1sh_gather_5_u64_offset: +** mov (x[0-9]+), #?5 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svldff1sh_gather_offset_u64 (p0, z0, 5)) + +/* +** ldff1sh_gather_6_u64_offset: +** ldff1sh z0\.d, p0/z, \[z0\.d, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_6_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 6), + z0_res = svldff1sh_gather_offset_u64 (p0, z0, 6)) + +/* +** ldff1sh_gather_62_u64_offset: +** ldff1sh z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_62_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 62), + z0_res = svldff1sh_gather_offset_u64 (p0, z0, 62)) + +/* +** ldff1sh_gather_64_u64_offset: +** mov (x[0-9]+), #?64 +** ldff1sh z0\.d, 
p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_64_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_offset_u64 (p0, z0, 64), + z0_res = svldff1sh_gather_offset_u64 (p0, z0, 64)) + +/* +** ldff1sh_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?1 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svldff1sh_gather_index_u64 (p0, z0, x0)) + +/* +** ldff1sh_gather_m1_u64_index: +** mov (x[0-9]+), #?-2 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svldff1sh_gather_index_u64 (p0, z0, -1)) + +/* +** ldff1sh_gather_0_u64_index: +** ldff1sh z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svldff1sh_gather_index_u64 (p0, z0, 0)) + +/* +** ldff1sh_gather_5_u64_index: +** ldff1sh z0\.d, p0/z, \[z0\.d, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svldff1sh_gather_index_u64 (p0, z0, 5)) + +/* +** ldff1sh_gather_31_u64_index: +** ldff1sh z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svldff1sh_gather_index_u64 (p0, z0, 31)) + +/* +** ldff1sh_gather_32_u64_index: +** mov (x[0-9]+), #?64 +** ldff1sh z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sh_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sh_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svldff1sh_gather_index_u64 (p0, z0, 32)) + +/* +** ldff1sh_gather_x0_u64_s64offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u64_s64offset, svuint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_u64_s64offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u64_s64offset, svuint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_u64_s64offset: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u64_s64offset, svuint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64offset_u64 (p0, x0, z1), + z0_res = svldff1sh_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1sh_gather_ext_u64_s64offset: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_u64_s64offset, svuint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sh_gather_x0_u64_u64offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u64_u64offset, svuint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_u64_u64offset: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ 
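/* The _offset forms here take byte offsets, while the _index forms take
   element indices scaled by the element size, which is why the expected
   assembly is [x0, z0.d] for offsets but [x0, z0.d, lsl 1] for indices.
   A short sketch of the distinction, assuming <arm_sve.h>; the function
   names are illustrative only.  */
#include <arm_sve.h>

/* Byte offsets: lane i is loaded from (const char *) base + offsets[i].  */
svuint64_t
gather_sh_by_offset (svbool_t pg, const int16_t *base, svint64_t offsets)
{
  return svldff1sh_gather_s64offset_u64 (pg, base, offsets);
}

/* Element indices: lane i is loaded from base + indices[i], i.e. the
   index is scaled by sizeof (int16_t).  */
svuint64_t
gather_sh_by_index (svbool_t pg, const int16_t *base, svint64_t indices)
{
  return svldff1sh_gather_s64index_u64 (pg, base, indices);
}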
+TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u64_u64offset, svuint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1sh_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_u64_u64offset: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u64_u64offset, svuint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64offset_u64 (p0, x0, z1), + z0_res = svldff1sh_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1sh_gather_ext_u64_u64offset: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_u64_u64offset, svuint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sh_gather_x0_u64_s64index: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u64_s64index, svuint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1sh_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_u64_s64index: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u64_s64index, svuint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1sh_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_u64_s64index: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u64_s64index, svuint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64index_u64 (p0, x0, z1), + z0_res = svldff1sh_gather_index_u64 (p0, x0, z1)) + +/* +** ldff1sh_gather_ext_u64_s64index: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_u64_s64index, svuint64_t, int16_t, svint64_t, + z0_res = svldff1sh_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sh_gather_x0_u64_u64index: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_x0_u64_u64index, svuint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1sh_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1sh_gather_tied1_u64_u64index: +** ldff1sh z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_tied1_u64_u64index, svuint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1sh_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1sh_gather_untied_u64_u64index: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_untied_u64_u64index, svuint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64index_u64 (p0, x0, z1), + z0_res = svldff1sh_gather_index_u64 (p0, x0, z1)) + +/* +** ldff1sh_gather_ext_u64_u64index: +** ldff1sh z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sh_gather_ext_u64_u64index, svuint64_t, int16_t, svuint64_t, + z0_res = svldff1sh_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s32.c new file mode 100644 index 000000000..81b64e836 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s32.c @@ -0,0 
+1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sh_s32_base: +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_s32_base, svint32_t, int16_t, + z0 = svldff1sh_s32 (p0, x0), + z0 = svldff1sh_s32 (p0, x0)) + +/* +** ldff1sh_s32_index: +** ldff1sh z0\.s, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1sh_s32_index, svint32_t, int16_t, + z0 = svldff1sh_s32 (p0, x0 + x1), + z0 = svldff1sh_s32 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_s32_1: +** inch x0 +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_s32_1, svint32_t, int16_t, + z0 = svldff1sh_s32 (p0, x0 + svcntw ()), + z0 = svldff1sh_s32 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_s32_m1: +** dech x0 +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_s32_m1, svint32_t, int16_t, + z0 = svldff1sh_s32 (p0, x0 - svcntw ()), + z0 = svldff1sh_s32 (p0, x0 - svcntw ())) + +/* +** ldff1sh_vnum_s32_0: +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_s32_0, svint32_t, int16_t, + z0 = svldff1sh_vnum_s32 (p0, x0, 0), + z0 = svldff1sh_vnum_s32 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_vnum_s32_1: +** inch x0 +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_s32_1, svint32_t, int16_t, + z0 = svldff1sh_vnum_s32 (p0, x0, 1), + z0 = svldff1sh_vnum_s32 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_vnum_s32_m1: +** dech x0 +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_s32_m1, svint32_t, int16_t, + z0 = svldff1sh_vnum_s32 (p0, x0, -1), + z0 = svldff1sh_vnum_s32 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1sh_vnum_s32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1sh z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_s32_x1, svint32_t, int16_t, + z0 = svldff1sh_vnum_s32 (p0, x0, x1), + z0 = svldff1sh_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s64.c new file mode 100644 index 000000000..453b3ff24 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_s64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sh_s64_base: +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_s64_base, svint64_t, int16_t, + z0 = svldff1sh_s64 (p0, x0), + z0 = svldff1sh_s64 (p0, x0)) + +/* +** ldff1sh_s64_index: +** ldff1sh z0\.d, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1sh_s64_index, svint64_t, int16_t, + z0 = svldff1sh_s64 (p0, x0 + x1), + z0 = svldff1sh_s64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_s64_1: +** incw x0 +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_s64_1, svint64_t, int16_t, + z0 = svldff1sh_s64 (p0, x0 + svcntd ()), + z0 = svldff1sh_s64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1sh_s64_m1: +** decw x0 +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_s64_m1, svint64_t, int16_t, + z0 = svldff1sh_s64 (p0, x0 - svcntd ()), + z0 = svldff1sh_s64 (p0, x0 - svcntd ())) + +/* +** ldff1sh_vnum_s64_0: +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_s64_0, svint64_t, int16_t, + z0 = svldff1sh_vnum_s64 (p0, x0, 0), + z0 = svldff1sh_vnum_s64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_vnum_s64_1: +** incw x0 +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_s64_1, svint64_t, int16_t, + z0 = svldff1sh_vnum_s64 (p0, x0, 1), + z0 = svldff1sh_vnum_s64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_vnum_s64_m1: +** decw x0 +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_s64_m1, svint64_t, int16_t, + z0 = svldff1sh_vnum_s64 (p0, x0, -1), + z0 = svldff1sh_vnum_s64 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1sh_vnum_s64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1sh z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_s64_x1, svint64_t, int16_t, + z0 = svldff1sh_vnum_s64 (p0, x0, x1), + z0 = svldff1sh_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u32.c new file mode 100644 index 000000000..bbbed79dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u32.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sh_u32_base: +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_u32_base, svuint32_t, int16_t, + z0 = svldff1sh_u32 (p0, x0), + z0 = svldff1sh_u32 (p0, x0)) + +/* +** ldff1sh_u32_index: +** ldff1sh z0\.s, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1sh_u32_index, svuint32_t, int16_t, + z0 = svldff1sh_u32 (p0, x0 + x1), + z0 = svldff1sh_u32 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_u32_1: +** inch x0 +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_u32_1, svuint32_t, int16_t, + z0 = svldff1sh_u32 (p0, x0 + svcntw ()), + z0 = svldff1sh_u32 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_u32_m1: +** dech x0 +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_u32_m1, svuint32_t, int16_t, + z0 = svldff1sh_u32 (p0, x0 - svcntw ()), + z0 = svldff1sh_u32 (p0, x0 - svcntw ())) + +/* +** ldff1sh_vnum_u32_0: +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_u32_0, svuint32_t, int16_t, + z0 = svldff1sh_vnum_u32 (p0, x0, 0), + z0 = svldff1sh_vnum_u32 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_vnum_u32_1: +** inch x0 +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_u32_1, svuint32_t, int16_t, + z0 = svldff1sh_vnum_u32 (p0, x0, 1), + z0 = svldff1sh_vnum_u32 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_vnum_u32_m1: +** dech x0 +** ldff1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_u32_m1, svuint32_t, int16_t, + z0 = svldff1sh_vnum_u32 (p0, x0, -1), + z0 = svldff1sh_vnum_u32 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ldff1sh_vnum_u32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1sh z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_u32_x1, svuint32_t, int16_t, + z0 = svldff1sh_vnum_u32 (p0, x0, x1), + z0 = svldff1sh_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u64.c new file mode 100644 index 000000000..5430e256b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sh_u64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sh_u64_base: +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_u64_base, svuint64_t, int16_t, + z0 = svldff1sh_u64 (p0, x0), + z0 = svldff1sh_u64 (p0, x0)) + +/* +** ldff1sh_u64_index: +** ldff1sh z0\.d, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1sh_u64_index, svuint64_t, int16_t, + z0 = svldff1sh_u64 (p0, x0 + x1), + z0 = svldff1sh_u64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_u64_1: +** incw x0 +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_u64_1, svuint64_t, int16_t, + z0 = svldff1sh_u64 (p0, x0 + svcntd ()), + z0 = svldff1sh_u64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_u64_m1: +** decw x0 +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_u64_m1, svuint64_t, int16_t, + z0 = svldff1sh_u64 (p0, x0 - svcntd ()), + z0 = svldff1sh_u64 (p0, x0 - svcntd ())) + +/* +** ldff1sh_vnum_u64_0: +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_u64_0, svuint64_t, int16_t, + z0 = svldff1sh_vnum_u64 (p0, x0, 0), + z0 = svldff1sh_vnum_u64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_vnum_u64_1: +** incw x0 +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_u64_1, svuint64_t, int16_t, + z0 = svldff1sh_vnum_u64 (p0, x0, 1), + z0 = svldff1sh_vnum_u64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sh_vnum_u64_m1: +** decw x0 +** ldff1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_u64_m1, svuint64_t, int16_t, + z0 = svldff1sh_vnum_u64 (p0, x0, -1), + z0 = svldff1sh_vnum_u64 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1sh_vnum_u64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1sh z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1sh_vnum_u64_x1, svuint64_t, int16_t, + z0 = svldff1sh_vnum_u64 (p0, x0, x1), + z0 = svldff1sh_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_s64.c new file mode 100644 index 000000000..e5da8a83d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_s64.c @@ -0,0 +1,308 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sw_gather_s64_tied1: +** ldff1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_s64 (p0, z0), + z0_res = svldff1sw_gather_s64 (p0, z0)) + +/* +** ldff1sw_gather_s64_untied: +** ldff1sw z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_s64 (p0, z1), + z0_res = svldff1sw_gather_s64 (p0, z1)) + +/* +** ldff1sw_gather_x0_s64_offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, x0)) + +/* +** ldff1sw_gather_m4_s64_offset: +** mov (x[0-9]+), #?-4 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_m4_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, -4), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, -4)) + +/* +** ldff1sw_gather_0_s64_offset: +** ldff1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, 0)) + +/* +** ldff1sw_gather_5_s64_offset: +** mov (x[0-9]+), #?5 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, 5)) + +/* +** ldff1sw_gather_6_s64_offset: +** mov (x[0-9]+), #?6 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_6_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 6), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, 6)) + +/* +** ldff1sw_gather_7_s64_offset: +** mov (x[0-9]+), #?7 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_7_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 7), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, 7)) + +/* +** ldff1sw_gather_8_s64_offset: +** ldff1sw z0\.d, p0/z, \[z0\.d, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_8_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 8), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, 8)) + +/* +** ldff1sw_gather_124_s64_offset: +** ldff1sw z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_124_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 124), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, 124)) + +/* +** ldff1sw_gather_128_s64_offset: +** mov (x[0-9]+), #?128 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_128_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_s64 (p0, z0, 128), + z0_res = svldff1sw_gather_offset_s64 (p0, z0, 128)) + +/* +** ldff1sw_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?2 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svldff1sw_gather_index_s64 (p0, z0, x0)) + +/* +** ldff1sw_gather_m1_s64_index: +** mov (x[0-9]+), #?-4 +** ldff1sw z0\.d, 
p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svldff1sw_gather_index_s64 (p0, z0, -1)) + +/* +** ldff1sw_gather_0_s64_index: +** ldff1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svldff1sw_gather_index_s64 (p0, z0, 0)) + +/* +** ldff1sw_gather_5_s64_index: +** ldff1sw z0\.d, p0/z, \[z0\.d, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svldff1sw_gather_index_s64 (p0, z0, 5)) + +/* +** ldff1sw_gather_31_s64_index: +** ldff1sw z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svldff1sw_gather_index_s64 (p0, z0, 31)) + +/* +** ldff1sw_gather_32_s64_index: +** mov (x[0-9]+), #?128 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svldff1sw_gather_index_s64 (p0, z0, 32)) + +/* +** ldff1sw_gather_x0_s64_s64offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_s64_s64offset, svint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1sw_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sw_gather_tied1_s64_s64offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_s64_s64offset, svint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1sw_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sw_gather_untied_s64_s64offset: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_s64_s64offset, svint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64offset_s64 (p0, x0, z1), + z0_res = svldff1sw_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1sw_gather_ext_s64_s64offset: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_s64_s64offset, svint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sw_gather_x0_s64_u64offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_s64_u64offset, svint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1sw_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sw_gather_tied1_s64_u64offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_s64_u64offset, svint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1sw_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1sw_gather_untied_s64_u64offset: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_s64_u64offset, svint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64offset_s64 (p0, x0, z1), + z0_res = svldff1sw_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1sw_gather_ext_s64_u64offset: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ 
+TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_s64_u64offset, svint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sw_gather_x0_s64_s64index: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_s64_s64index, svint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1sw_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1sw_gather_tied1_s64_s64index: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_s64_s64index, svint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1sw_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1sw_gather_untied_s64_s64index: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_s64_s64index, svint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64index_s64 (p0, x0, z1), + z0_res = svldff1sw_gather_index_s64 (p0, x0, z1)) + +/* +** ldff1sw_gather_ext_s64_s64index: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_s64_s64index, svint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sw_gather_x0_s64_u64index: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_s64_u64index, svint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1sw_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1sw_gather_tied1_s64_u64index: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_s64_u64index, svint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1sw_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1sw_gather_untied_s64_u64index: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_s64_u64index, svint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64index_s64 (p0, x0, z1), + z0_res = svldff1sw_gather_index_s64 (p0, x0, z1)) + +/* +** ldff1sw_gather_ext_s64_u64index: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_s64_u64index, svint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_u64.c new file mode 100644 index 000000000..411428756 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_gather_u64.c @@ -0,0 +1,308 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sw_gather_u64_tied1: +** ldff1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_u64 (p0, z0), + z0_res = svldff1sw_gather_u64 (p0, z0)) + +/* +** ldff1sw_gather_u64_untied: +** ldff1sw z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_u64 (p0, z1), + z0_res = svldff1sw_gather_u64 (p0, z1)) + +/* +** ldff1sw_gather_x0_u64_offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, x0)) + +/* +** ldff1sw_gather_m4_u64_offset: +** mov (x[0-9]+), #?-4 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_m4_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, -4), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, -4)) + +/* +** ldff1sw_gather_0_u64_offset: +** ldff1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, 0)) + +/* +** ldff1sw_gather_5_u64_offset: +** mov (x[0-9]+), #?5 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, 5)) + +/* +** ldff1sw_gather_6_u64_offset: +** mov (x[0-9]+), #?6 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_6_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 6), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, 6)) + +/* +** ldff1sw_gather_7_u64_offset: +** mov (x[0-9]+), #?7 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_7_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 7), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, 7)) + +/* +** ldff1sw_gather_8_u64_offset: +** ldff1sw z0\.d, p0/z, \[z0\.d, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_8_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 8), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, 8)) + +/* +** ldff1sw_gather_124_u64_offset: +** ldff1sw z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_124_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 124), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, 124)) + +/* +** ldff1sw_gather_128_u64_offset: +** mov (x[0-9]+), #?128 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_128_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_offset_u64 (p0, z0, 128), + z0_res = svldff1sw_gather_offset_u64 (p0, z0, 128)) + +/* +** ldff1sw_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?2 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svldff1sw_gather_index_u64 (p0, z0, x0)) + +/* +** ldff1sw_gather_m1_u64_index: +** mov (x[0-9]+), #?-4 +** ldff1sw 
z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svldff1sw_gather_index_u64 (p0, z0, -1)) + +/* +** ldff1sw_gather_0_u64_index: +** ldff1sw z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svldff1sw_gather_index_u64 (p0, z0, 0)) + +/* +** ldff1sw_gather_5_u64_index: +** ldff1sw z0\.d, p0/z, \[z0\.d, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svldff1sw_gather_index_u64 (p0, z0, 5)) + +/* +** ldff1sw_gather_31_u64_index: +** ldff1sw z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svldff1sw_gather_index_u64 (p0, z0, 31)) + +/* +** ldff1sw_gather_32_u64_index: +** mov (x[0-9]+), #?128 +** ldff1sw z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1sw_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1sw_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svldff1sw_gather_index_u64 (p0, z0, 32)) + +/* +** ldff1sw_gather_x0_u64_s64offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_u64_s64offset, svuint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1sw_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sw_gather_tied1_u64_s64offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_u64_s64offset, svuint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1sw_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sw_gather_untied_u64_s64offset: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_u64_s64offset, svuint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64offset_u64 (p0, x0, z1), + z0_res = svldff1sw_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1sw_gather_ext_u64_s64offset: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_u64_s64offset, svuint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sw_gather_x0_u64_u64offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_u64_u64offset, svuint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1sw_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sw_gather_tied1_u64_u64offset: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_u64_u64offset, svuint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1sw_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1sw_gather_untied_u64_u64offset: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_u64_u64offset, svuint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64offset_u64 (p0, x0, z1), + z0_res = svldff1sw_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1sw_gather_ext_u64_u64offset: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ 
+TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_u64_u64offset, svuint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sw_gather_x0_u64_s64index: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_u64_s64index, svuint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1sw_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1sw_gather_tied1_u64_s64index: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_u64_s64index, svuint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1sw_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1sw_gather_untied_u64_s64index: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_u64_s64index, svuint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64index_u64 (p0, x0, z1), + z0_res = svldff1sw_gather_index_u64 (p0, x0, z1)) + +/* +** ldff1sw_gather_ext_u64_s64index: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_u64_s64index, svuint64_t, int32_t, svint64_t, + z0_res = svldff1sw_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1sw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1sw_gather_x0_u64_u64index: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_x0_u64_u64index, svuint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1sw_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1sw_gather_tied1_u64_u64index: +** ldff1sw z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_tied1_u64_u64index, svuint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1sw_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1sw_gather_untied_u64_u64index: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_untied_u64_u64index, svuint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64index_u64 (p0, x0, z1), + z0_res = svldff1sw_gather_index_u64 (p0, x0, z1)) + +/* +** ldff1sw_gather_ext_u64_u64index: +** ldff1sw z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1sw_gather_ext_u64_u64index, svuint64_t, int32_t, svuint64_t, + z0_res = svldff1sw_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1sw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_s64.c new file mode 100644 index 000000000..d795ace63 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_s64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sw_s64_base: +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_s64_base, svint64_t, int32_t, + z0 = svldff1sw_s64 (p0, x0), + z0 = svldff1sw_s64 (p0, x0)) + +/* +** ldff1sw_s64_index: +** ldff1sw z0\.d, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldff1sw_s64_index, svint64_t, int32_t, + z0 = svldff1sw_s64 (p0, x0 + x1), + z0 = svldff1sw_s64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1sw_s64_1: +** inch x0 +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_s64_1, svint64_t, int32_t, + z0 = svldff1sw_s64 (p0, x0 + svcntd ()), + z0 = svldff1sw_s64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sw_s64_m1: +** dech x0 +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_s64_m1, svint64_t, int32_t, + z0 = svldff1sw_s64 (p0, x0 - svcntd ()), + z0 = svldff1sw_s64 (p0, x0 - svcntd ())) + +/* +** ldff1sw_vnum_s64_0: +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_vnum_s64_0, svint64_t, int32_t, + z0 = svldff1sw_vnum_s64 (p0, x0, 0), + z0 = svldff1sw_vnum_s64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sw_vnum_s64_1: +** inch x0 +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_vnum_s64_1, svint64_t, int32_t, + z0 = svldff1sw_vnum_s64 (p0, x0, 1), + z0 = svldff1sw_vnum_s64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sw_vnum_s64_m1: +** dech x0 +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_vnum_s64_m1, svint64_t, int32_t, + z0 = svldff1sw_vnum_s64 (p0, x0, -1), + z0 = svldff1sw_vnum_s64 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1sw_vnum_s64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1sw z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1sw_vnum_s64_x1, svint64_t, int32_t, + z0 = svldff1sw_vnum_s64 (p0, x0, x1), + z0 = svldff1sw_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_u64.c new file mode 100644 index 000000000..6caf2f504 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1sw_u64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1sw_u64_base: +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_u64_base, svuint64_t, int32_t, + z0 = svldff1sw_u64 (p0, x0), + z0 = svldff1sw_u64 (p0, x0)) + +/* +** ldff1sw_u64_index: +** ldff1sw z0\.d, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldff1sw_u64_index, svuint64_t, int32_t, + z0 = svldff1sw_u64 (p0, x0 + x1), + z0 = svldff1sw_u64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sw_u64_1: +** inch x0 +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_u64_1, svuint64_t, int32_t, + z0 = svldff1sw_u64 (p0, x0 + svcntd ()), + z0 = svldff1sw_u64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sw_u64_m1: +** dech x0 +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_u64_m1, svuint64_t, int32_t, + z0 = svldff1sw_u64 (p0, x0 - svcntd ()), + z0 = svldff1sw_u64 (p0, x0 - svcntd ())) + +/* +** ldff1sw_vnum_u64_0: +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_vnum_u64_0, svuint64_t, int32_t, + z0 = svldff1sw_vnum_u64 (p0, x0, 0), + z0 = svldff1sw_vnum_u64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1sw_vnum_u64_1: +** inch x0 +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_vnum_u64_1, svuint64_t, int32_t, + z0 = svldff1sw_vnum_u64 (p0, x0, 1), + z0 = svldff1sw_vnum_u64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1sw_vnum_u64_m1: +** dech x0 +** ldff1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1sw_vnum_u64_m1, svuint64_t, int32_t, + z0 = svldff1sw_vnum_u64 (p0, x0, -1), + z0 = svldff1sw_vnum_u64 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1sw_vnum_u64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1sw z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1sw_vnum_u64_x1, svuint64_t, int32_t, + z0 = svldff1sw_vnum_u64 (p0, x0, x1), + z0 = svldff1sw_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s32.c new file mode 100644 index 000000000..af0be08d2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s32.c @@ -0,0 +1,131 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_gather_s32_tied1: +** ldff1b z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_s32 (p0, z0), + z0_res = svldff1ub_gather_s32 (p0, z0)) + +/* +** ldff1ub_gather_s32_untied: +** ldff1b z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_s32 (p0, z1), + z0_res = svldff1ub_gather_s32 (p0, z1)) + +/* +** ldff1ub_gather_x0_s32_offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svldff1ub_gather_offset_s32 (p0, z0, x0)) + +/* +** ldff1ub_gather_m1_s32_offset: +** mov (x[0-9]+), #?-1 +** ldff1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_m1_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, -1), + z0_res = svldff1ub_gather_offset_s32 (p0, z0, -1)) + +/* +** ldff1ub_gather_0_s32_offset: +** ldff1b z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svldff1ub_gather_offset_s32 (p0, z0, 0)) + +/* +** ldff1ub_gather_5_s32_offset: +** ldff1b z0\.s, p0/z, \[z0\.s, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svldff1ub_gather_offset_s32 (p0, z0, 5)) + +/* +** ldff1ub_gather_31_s32_offset: +** ldff1b z0\.s, p0/z, \[z0\.s, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_31_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, 31), + z0_res = svldff1ub_gather_offset_s32 (p0, z0, 31)) + +/* +** ldff1ub_gather_32_s32_offset: +** mov (x[0-9]+), #?32 +** ldff1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_32_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_s32 (p0, z0, 32), + z0_res = svldff1ub_gather_offset_s32 (p0, z0, 32)) + +/* +** ldff1ub_gather_x0_s32_s32offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_s32_s32offset, svint32_t, uint8_t, svint32_t, + z0_res = svldff1ub_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1ub_gather_tied1_s32_s32offset: +** ldff1b z0\.s, p0/z, 
\[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_s32_s32offset, svint32_t, uint8_t, svint32_t, + z0_res = svldff1ub_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1ub_gather_untied_s32_s32offset: +** ldff1b z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_s32_s32offset, svint32_t, uint8_t, svint32_t, + z0_res = svldff1ub_gather_s32offset_s32 (p0, x0, z1), + z0_res = svldff1ub_gather_offset_s32 (p0, x0, z1)) + +/* +** ldff1ub_gather_x0_s32_u32offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_s32_u32offset, svint32_t, uint8_t, svuint32_t, + z0_res = svldff1ub_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1ub_gather_tied1_s32_u32offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_s32_u32offset, svint32_t, uint8_t, svuint32_t, + z0_res = svldff1ub_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1ub_gather_untied_s32_u32offset: +** ldff1b z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_s32_u32offset, svint32_t, uint8_t, svuint32_t, + z0_res = svldff1ub_gather_u32offset_s32 (p0, x0, z1), + z0_res = svldff1ub_gather_offset_s32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s64.c new file mode 100644 index 000000000..43124dd89 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_s64.c @@ -0,0 +1,149 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_gather_s64_tied1: +** ldff1b z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_s64 (p0, z0), + z0_res = svldff1ub_gather_s64 (p0, z0)) + +/* +** ldff1ub_gather_s64_untied: +** ldff1b z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_s64 (p0, z1), + z0_res = svldff1ub_gather_s64 (p0, z1)) + +/* +** ldff1ub_gather_x0_s64_offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svldff1ub_gather_offset_s64 (p0, z0, x0)) + +/* +** ldff1ub_gather_m1_s64_offset: +** mov (x[0-9]+), #?-1 +** ldff1b z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_m1_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, -1), + z0_res = svldff1ub_gather_offset_s64 (p0, z0, -1)) + +/* +** ldff1ub_gather_0_s64_offset: +** ldff1b z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svldff1ub_gather_offset_s64 (p0, z0, 0)) + +/* +** ldff1ub_gather_5_s64_offset: +** ldff1b z0\.d, p0/z, \[z0\.d, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svldff1ub_gather_offset_s64 (p0, z0, 5)) + +/* +** ldff1ub_gather_31_s64_offset: +** ldff1b z0\.d, p0/z, \[z0\.d, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_31_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, 31), + z0_res = svldff1ub_gather_offset_s64 (p0, z0, 31)) + +/* +** ldff1ub_gather_32_s64_offset: +** mov (x[0-9]+), #?32 +** ldff1b z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_32_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_s64 (p0, z0, 32), + z0_res = svldff1ub_gather_offset_s64 (p0, z0, 32)) + +/* +** ldff1ub_gather_x0_s64_s64offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_s64_s64offset, svint64_t, uint8_t, svint64_t, + z0_res = svldff1ub_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1ub_gather_tied1_s64_s64offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_s64_s64offset, svint64_t, uint8_t, svint64_t, + z0_res = svldff1ub_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1ub_gather_untied_s64_s64offset: +** ldff1b z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_s64_s64offset, svint64_t, uint8_t, svint64_t, + z0_res = svldff1ub_gather_s64offset_s64 (p0, x0, z1), + z0_res = svldff1ub_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1ub_gather_ext_s64_s64offset: +** ldff1b z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_ext_s64_s64offset, svint64_t, uint8_t, svint64_t, + z0_res = svldff1ub_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1ub_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1ub_gather_x0_s64_u64offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ 
+TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_s64_u64offset, svint64_t, uint8_t, svuint64_t, + z0_res = svldff1ub_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1ub_gather_tied1_s64_u64offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_s64_u64offset, svint64_t, uint8_t, svuint64_t, + z0_res = svldff1ub_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1ub_gather_untied_s64_u64offset: +** ldff1b z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_s64_u64offset, svint64_t, uint8_t, svuint64_t, + z0_res = svldff1ub_gather_u64offset_s64 (p0, x0, z1), + z0_res = svldff1ub_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1ub_gather_ext_s64_u64offset: +** ldff1b z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_ext_s64_u64offset, svint64_t, uint8_t, svuint64_t, + z0_res = svldff1ub_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1ub_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u32.c new file mode 100644 index 000000000..90c4e58a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u32.c @@ -0,0 +1,131 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_gather_u32_tied1: +** ldff1b z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_u32 (p0, z0), + z0_res = svldff1ub_gather_u32 (p0, z0)) + +/* +** ldff1ub_gather_u32_untied: +** ldff1b z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_u32 (p0, z1), + z0_res = svldff1ub_gather_u32 (p0, z1)) + +/* +** ldff1ub_gather_x0_u32_offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svldff1ub_gather_offset_u32 (p0, z0, x0)) + +/* +** ldff1ub_gather_m1_u32_offset: +** mov (x[0-9]+), #?-1 +** ldff1b z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_m1_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, -1), + z0_res = svldff1ub_gather_offset_u32 (p0, z0, -1)) + +/* +** ldff1ub_gather_0_u32_offset: +** ldff1b z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svldff1ub_gather_offset_u32 (p0, z0, 0)) + +/* +** ldff1ub_gather_5_u32_offset: +** ldff1b z0\.s, p0/z, \[z0\.s, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svldff1ub_gather_offset_u32 (p0, z0, 5)) + +/* +** ldff1ub_gather_31_u32_offset: +** ldff1b z0\.s, p0/z, \[z0\.s, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_31_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, 31), + z0_res = svldff1ub_gather_offset_u32 (p0, z0, 31)) + +/* +** ldff1ub_gather_32_u32_offset: +** mov (x[0-9]+), #?32 +** ldff1b z0\.s, p0/z, \[\1, 
z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_32_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1ub_gather_u32base_offset_u32 (p0, z0, 32), + z0_res = svldff1ub_gather_offset_u32 (p0, z0, 32)) + +/* +** ldff1ub_gather_x0_u32_s32offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_u32_s32offset, svuint32_t, uint8_t, svint32_t, + z0_res = svldff1ub_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1ub_gather_tied1_u32_s32offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_u32_s32offset, svuint32_t, uint8_t, svint32_t, + z0_res = svldff1ub_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1ub_gather_untied_u32_s32offset: +** ldff1b z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_u32_s32offset, svuint32_t, uint8_t, svint32_t, + z0_res = svldff1ub_gather_s32offset_u32 (p0, x0, z1), + z0_res = svldff1ub_gather_offset_u32 (p0, x0, z1)) + +/* +** ldff1ub_gather_x0_u32_u32offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_u32_u32offset, svuint32_t, uint8_t, svuint32_t, + z0_res = svldff1ub_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1ub_gather_tied1_u32_u32offset: +** ldff1b z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_u32_u32offset, svuint32_t, uint8_t, svuint32_t, + z0_res = svldff1ub_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1ub_gather_untied_u32_u32offset: +** ldff1b z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_u32_u32offset, svuint32_t, uint8_t, svuint32_t, + z0_res = svldff1ub_gather_u32offset_u32 (p0, x0, z1), + z0_res = svldff1ub_gather_offset_u32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u64.c new file mode 100644 index 000000000..302623a40 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_gather_u64.c @@ -0,0 +1,149 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_gather_u64_tied1: +** ldff1b z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_u64 (p0, z0), + z0_res = svldff1ub_gather_u64 (p0, z0)) + +/* +** ldff1ub_gather_u64_untied: +** ldff1b z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_u64 (p0, z1), + z0_res = svldff1ub_gather_u64 (p0, z1)) + +/* +** ldff1ub_gather_x0_u64_offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svldff1ub_gather_offset_u64 (p0, z0, x0)) + +/* +** ldff1ub_gather_m1_u64_offset: +** mov (x[0-9]+), #?-1 +** ldff1b z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_m1_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, -1), + z0_res = svldff1ub_gather_offset_u64 (p0, z0, -1)) + +/* +** ldff1ub_gather_0_u64_offset: +** ldff1b z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svldff1ub_gather_offset_u64 (p0, z0, 0)) + +/* +** ldff1ub_gather_5_u64_offset: +** ldff1b z0\.d, p0/z, \[z0\.d, #5\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svldff1ub_gather_offset_u64 (p0, z0, 5)) + +/* +** ldff1ub_gather_31_u64_offset: +** ldff1b z0\.d, p0/z, \[z0\.d, #31\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_31_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, 31), + z0_res = svldff1ub_gather_offset_u64 (p0, z0, 31)) + +/* +** ldff1ub_gather_32_u64_offset: +** mov (x[0-9]+), #?32 +** ldff1b z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1ub_gather_32_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1ub_gather_u64base_offset_u64 (p0, z0, 32), + z0_res = svldff1ub_gather_offset_u64 (p0, z0, 32)) + +/* +** ldff1ub_gather_x0_u64_s64offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_u64_s64offset, svuint64_t, uint8_t, svint64_t, + z0_res = svldff1ub_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1ub_gather_tied1_u64_s64offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_u64_s64offset, svuint64_t, uint8_t, svint64_t, + z0_res = svldff1ub_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1ub_gather_untied_u64_s64offset: +** ldff1b z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_u64_s64offset, svuint64_t, uint8_t, svint64_t, + z0_res = svldff1ub_gather_s64offset_u64 (p0, x0, z1), + z0_res = svldff1ub_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1ub_gather_ext_u64_s64offset: +** ldff1b z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_ext_u64_s64offset, svuint64_t, uint8_t, svint64_t, + z0_res = svldff1ub_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1ub_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1ub_gather_x0_u64_u64offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** 
ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_x0_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + z0_res = svldff1ub_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1ub_gather_tied1_u64_u64offset: +** ldff1b z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_tied1_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + z0_res = svldff1ub_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1ub_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1ub_gather_untied_u64_u64offset: +** ldff1b z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_untied_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + z0_res = svldff1ub_gather_u64offset_u64 (p0, x0, z1), + z0_res = svldff1ub_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1ub_gather_ext_u64_u64offset: +** ldff1b z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1ub_gather_ext_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + z0_res = svldff1ub_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1ub_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s16.c new file mode 100644 index 000000000..88ad2d1dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s16.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_s16_base: +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s16_base, svint16_t, uint8_t, + z0 = svldff1ub_s16 (p0, x0), + z0 = svldff1ub_s16 (p0, x0)) + +/* +** ldff1ub_s16_index: +** ldff1b z0\.h, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1ub_s16_index, svint16_t, uint8_t, + z0 = svldff1ub_s16 (p0, x0 + x1), + z0 = svldff1ub_s16 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_s16_1: +** inch x0 +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s16_1, svint16_t, uint8_t, + z0 = svldff1ub_s16 (p0, x0 + svcnth ()), + z0 = svldff1ub_s16 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_s16_m1: +** dech x0 +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s16_m1, svint16_t, uint8_t, + z0 = svldff1ub_s16 (p0, x0 - svcnth ()), + z0 = svldff1ub_s16 (p0, x0 - svcnth ())) + +/* +** ldff1ub_vnum_s16_0: +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s16_0, svint16_t, uint8_t, + z0 = svldff1ub_vnum_s16 (p0, x0, 0), + z0 = svldff1ub_vnum_s16 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_s16_1: +** inch x0 +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s16_1, svint16_t, uint8_t, + z0 = svldff1ub_vnum_s16 (p0, x0, 1), + z0 = svldff1ub_vnum_s16 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1ub_vnum_s16_m1: +** dech x0 +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s16_m1, svint16_t, uint8_t, + z0 = svldff1ub_vnum_s16 (p0, x0, -1), + z0 = svldff1ub_vnum_s16 (p0, x0, -1)) + +/* +** ldff1ub_vnum_s16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1b z0\.h, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1b z0\.h, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s16_x1, svint16_t, uint8_t, + z0 = svldff1ub_vnum_s16 (p0, x0, x1), + z0 = svldff1ub_vnum_s16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s32.c new file mode 100644 index 000000000..e8e06411f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s32.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_s32_base: +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s32_base, svint32_t, uint8_t, + z0 = svldff1ub_s32 (p0, x0), + z0 = svldff1ub_s32 (p0, x0)) + +/* +** ldff1ub_s32_index: +** ldff1b z0\.s, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1ub_s32_index, svint32_t, uint8_t, + z0 = svldff1ub_s32 (p0, x0 + x1), + z0 = svldff1ub_s32 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_s32_1: +** incw x0 +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s32_1, svint32_t, uint8_t, + z0 = svldff1ub_s32 (p0, x0 + svcntw ()), + z0 = svldff1ub_s32 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_s32_m1: +** decw x0 +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s32_m1, svint32_t, uint8_t, + z0 = svldff1ub_s32 (p0, x0 - svcntw ()), + z0 = svldff1ub_s32 (p0, x0 - svcntw ())) + +/* +** ldff1ub_vnum_s32_0: +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s32_0, svint32_t, uint8_t, + z0 = svldff1ub_vnum_s32 (p0, x0, 0), + z0 = svldff1ub_vnum_s32 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_s32_1: +** incw x0 +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s32_1, svint32_t, uint8_t, + z0 = svldff1ub_vnum_s32 (p0, x0, 1), + z0 = svldff1ub_vnum_s32 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_s32_m1: +** decw x0 +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s32_m1, svint32_t, uint8_t, + z0 = svldff1ub_vnum_s32 (p0, x0, -1), + z0 = svldff1ub_vnum_s32 (p0, x0, -1)) + +/* +** ldff1ub_vnum_s32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1b z0\.s, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1b z0\.s, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s32_x1, svint32_t, uint8_t, + z0 = svldff1ub_vnum_s32 (p0, x0, x1), + z0 = svldff1ub_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s64.c new file mode 100644 index 000000000..21d02ddb7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_s64.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_s64_base: +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s64_base, svint64_t, uint8_t, + z0 = svldff1ub_s64 (p0, x0), + z0 = svldff1ub_s64 (p0, x0)) + +/* +** ldff1ub_s64_index: +** ldff1b z0\.d, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1ub_s64_index, svint64_t, uint8_t, + z0 = svldff1ub_s64 (p0, x0 + x1), + z0 = svldff1ub_s64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_s64_1: +** incd x0 +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s64_1, svint64_t, uint8_t, + z0 = svldff1ub_s64 (p0, x0 + svcntd ()), + z0 = svldff1ub_s64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_s64_m1: +** decd x0 +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_s64_m1, svint64_t, uint8_t, + z0 = svldff1ub_s64 (p0, x0 - svcntd ()), + z0 = svldff1ub_s64 (p0, x0 - svcntd ())) + +/* +** ldff1ub_vnum_s64_0: +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s64_0, svint64_t, uint8_t, + z0 = svldff1ub_vnum_s64 (p0, x0, 0), + z0 = svldff1ub_vnum_s64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_s64_1: +** incd x0 +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s64_1, svint64_t, uint8_t, + z0 = svldff1ub_vnum_s64 (p0, x0, 1), + z0 = svldff1ub_vnum_s64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_s64_m1: +** decd x0 +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s64_m1, svint64_t, uint8_t, + z0 = svldff1ub_vnum_s64 (p0, x0, -1), + z0 = svldff1ub_vnum_s64 (p0, x0, -1)) + +/* +** ldff1ub_vnum_s64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1b z0\.d, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1b z0\.d, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1ub_vnum_s64_x1, svint64_t, uint8_t, + z0 = svldff1ub_vnum_s64 (p0, x0, x1), + z0 = svldff1ub_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u16.c new file mode 100644 index 000000000..904cb027e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u16.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_u16_base: +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u16_base, svuint16_t, uint8_t, + z0 = svldff1ub_u16 (p0, x0), + z0 = svldff1ub_u16 (p0, x0)) + +/* +** ldff1ub_u16_index: +** ldff1b z0\.h, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1ub_u16_index, svuint16_t, uint8_t, + z0 = svldff1ub_u16 (p0, x0 + x1), + z0 = svldff1ub_u16 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_u16_1: +** inch x0 +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u16_1, svuint16_t, uint8_t, + z0 = svldff1ub_u16 (p0, x0 + svcnth ()), + z0 = svldff1ub_u16 (p0, x0 + svcnth ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1ub_u16_m1: +** dech x0 +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u16_m1, svuint16_t, uint8_t, + z0 = svldff1ub_u16 (p0, x0 - svcnth ()), + z0 = svldff1ub_u16 (p0, x0 - svcnth ())) + +/* +** ldff1ub_vnum_u16_0: +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u16_0, svuint16_t, uint8_t, + z0 = svldff1ub_vnum_u16 (p0, x0, 0), + z0 = svldff1ub_vnum_u16 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_u16_1: +** inch x0 +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u16_1, svuint16_t, uint8_t, + z0 = svldff1ub_vnum_u16 (p0, x0, 1), + z0 = svldff1ub_vnum_u16 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_u16_m1: +** dech x0 +** ldff1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u16_m1, svuint16_t, uint8_t, + z0 = svldff1ub_vnum_u16 (p0, x0, -1), + z0 = svldff1ub_vnum_u16 (p0, x0, -1)) + +/* +** ldff1ub_vnum_u16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1b z0\.h, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1b z0\.h, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u16_x1, svuint16_t, uint8_t, + z0 = svldff1ub_vnum_u16 (p0, x0, x1), + z0 = svldff1ub_vnum_u16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u32.c new file mode 100644 index 000000000..a40012318 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u32.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_u32_base: +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u32_base, svuint32_t, uint8_t, + z0 = svldff1ub_u32 (p0, x0), + z0 = svldff1ub_u32 (p0, x0)) + +/* +** ldff1ub_u32_index: +** ldff1b z0\.s, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1ub_u32_index, svuint32_t, uint8_t, + z0 = svldff1ub_u32 (p0, x0 + x1), + z0 = svldff1ub_u32 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_u32_1: +** incw x0 +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u32_1, svuint32_t, uint8_t, + z0 = svldff1ub_u32 (p0, x0 + svcntw ()), + z0 = svldff1ub_u32 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_u32_m1: +** decw x0 +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u32_m1, svuint32_t, uint8_t, + z0 = svldff1ub_u32 (p0, x0 - svcntw ()), + z0 = svldff1ub_u32 (p0, x0 - svcntw ())) + +/* +** ldff1ub_vnum_u32_0: +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u32_0, svuint32_t, uint8_t, + z0 = svldff1ub_vnum_u32 (p0, x0, 0), + z0 = svldff1ub_vnum_u32 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_u32_1: +** incw x0 +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u32_1, svuint32_t, uint8_t, + z0 = svldff1ub_vnum_u32 (p0, x0, 1), + z0 = svldff1ub_vnum_u32 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1ub_vnum_u32_m1: +** decw x0 +** ldff1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u32_m1, svuint32_t, uint8_t, + z0 = svldff1ub_vnum_u32 (p0, x0, -1), + z0 = svldff1ub_vnum_u32 (p0, x0, -1)) + +/* +** ldff1ub_vnum_u32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1b z0\.s, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1b z0\.s, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u32_x1, svuint32_t, uint8_t, + z0 = svldff1ub_vnum_u32 (p0, x0, x1), + z0 = svldff1ub_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u64.c new file mode 100644 index 000000000..a9a98a683 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1ub_u64.c @@ -0,0 +1,90 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1ub_u64_base: +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u64_base, svuint64_t, uint8_t, + z0 = svldff1ub_u64 (p0, x0), + z0 = svldff1ub_u64 (p0, x0)) + +/* +** ldff1ub_u64_index: +** ldff1b z0\.d, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldff1ub_u64_index, svuint64_t, uint8_t, + z0 = svldff1ub_u64 (p0, x0 + x1), + z0 = svldff1ub_u64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_u64_1: +** incd x0 +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u64_1, svuint64_t, uint8_t, + z0 = svldff1ub_u64 (p0, x0 + svcntd ()), + z0 = svldff1ub_u64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_u64_m1: +** decd x0 +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_u64_m1, svuint64_t, uint8_t, + z0 = svldff1ub_u64 (p0, x0 - svcntd ()), + z0 = svldff1ub_u64 (p0, x0 - svcntd ())) + +/* +** ldff1ub_vnum_u64_0: +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u64_0, svuint64_t, uint8_t, + z0 = svldff1ub_vnum_u64 (p0, x0, 0), + z0 = svldff1ub_vnum_u64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_u64_1: +** incd x0 +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u64_1, svuint64_t, uint8_t, + z0 = svldff1ub_vnum_u64 (p0, x0, 1), + z0 = svldff1ub_vnum_u64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1ub_vnum_u64_m1: +** decd x0 +** ldff1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u64_m1, svuint64_t, uint8_t, + z0 = svldff1ub_vnum_u64 (p0, x0, -1), + z0 = svldff1ub_vnum_u64 (p0, x0, -1)) + +/* +** ldff1ub_vnum_u64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldff1b z0\.d, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldff1b z0\.d, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldff1ub_vnum_u64_x1, svuint64_t, uint8_t, + z0 = svldff1ub_vnum_u64 (p0, x0, x1), + z0 = svldff1ub_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s32.c new file mode 100644 index 000000000..d02e44342 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s32.c @@ -0,0 +1,252 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uh_gather_s32_tied1: +** ldff1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_s32_tied1, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_s32 (p0, z0), + z0_res = svldff1uh_gather_s32 (p0, z0)) + +/* +** ldff1uh_gather_s32_untied: +** ldff1h z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_s32_untied, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_s32 (p0, z1), + z0_res = svldff1uh_gather_s32 (p0, z1)) + +/* +** ldff1uh_gather_x0_s32_offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, x0), + z0_res = svldff1uh_gather_offset_s32 (p0, z0, x0)) + +/* +** ldff1uh_gather_m2_s32_offset: +** mov (x[0-9]+), #?-2 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_m2_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, -2), + z0_res = svldff1uh_gather_offset_s32 (p0, z0, -2)) + +/* +** ldff1uh_gather_0_s32_offset: +** ldff1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 0), + z0_res = svldff1uh_gather_offset_s32 (p0, z0, 0)) + +/* +** ldff1uh_gather_5_s32_offset: +** mov (x[0-9]+), #?5 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 5), + z0_res = svldff1uh_gather_offset_s32 (p0, z0, 5)) + +/* +** ldff1uh_gather_6_s32_offset: +** ldff1h z0\.s, p0/z, \[z0\.s, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_6_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 6), + z0_res = svldff1uh_gather_offset_s32 (p0, z0, 6)) + +/* +** ldff1uh_gather_62_s32_offset: +** ldff1h z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_62_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 62), + z0_res = svldff1uh_gather_offset_s32 (p0, z0, 62)) + +/* +** ldff1uh_gather_64_s32_offset: +** mov (x[0-9]+), #?64 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_64_s32_offset, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_s32 (p0, z0, 64), + z0_res = svldff1uh_gather_offset_s32 (p0, z0, 64)) + +/* +** ldff1uh_gather_x0_s32_index: +** lsl (x[0-9]+), x0, #?1 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_s32_index, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, x0), + z0_res = svldff1uh_gather_index_s32 (p0, z0, x0)) + +/* +** ldff1uh_gather_m1_s32_index: +** mov (x[0-9]+), #?-2 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_m1_s32_index, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, -1), + z0_res = svldff1uh_gather_index_s32 (p0, z0, -1)) + +/* +** ldff1uh_gather_0_s32_index: +** ldff1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_s32_index, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, 0), + z0_res = svldff1uh_gather_index_s32 (p0, z0, 0)) + +/* +** ldff1uh_gather_5_s32_index: +** ldff1h z0\.s, p0/z, \[z0\.s, #10\] +** ret +*/ 
+TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_s32_index, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, 5), + z0_res = svldff1uh_gather_index_s32 (p0, z0, 5)) + +/* +** ldff1uh_gather_31_s32_index: +** ldff1h z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_31_s32_index, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, 31), + z0_res = svldff1uh_gather_index_s32 (p0, z0, 31)) + +/* +** ldff1uh_gather_32_s32_index: +** mov (x[0-9]+), #?64 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_32_s32_index, svint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_s32 (p0, z0, 32), + z0_res = svldff1uh_gather_index_s32 (p0, z0, 32)) + +/* +** ldff1uh_gather_x0_s32_s32offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s32_s32offset, svint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_s32_s32offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s32_s32offset, svint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32offset_s32 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_s32_s32offset: +** ldff1h z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s32_s32offset, svint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32offset_s32 (p0, x0, z1), + z0_res = svldff1uh_gather_offset_s32 (p0, x0, z1)) + +/* +** ldff1uh_gather_x0_s32_u32offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s32_u32offset, svint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_s32_u32offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s32_u32offset, svint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32offset_s32 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_s32 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_s32_u32offset: +** ldff1h z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s32_u32offset, svint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32offset_s32 (p0, x0, z1), + z0_res = svldff1uh_gather_offset_s32 (p0, x0, z1)) + +/* +** ldff1uh_gather_x0_s32_s32index: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s32_s32index, svint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32index_s32 (p0, x0, z0), + z0_res = svldff1uh_gather_index_s32 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_s32_s32index: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s32_s32index, svint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32index_s32 (p0, x0, z0), + z0_res = svldff1uh_gather_index_s32 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_s32_s32index: +** ldff1h z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s32_s32index, svint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32index_s32 (p0, x0, z1), + z0_res = svldff1uh_gather_index_s32 (p0, x0, z1)) + +/* +** ldff1uh_gather_x0_s32_u32index: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret 
+*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s32_u32index, svint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32index_s32 (p0, x0, z0), + z0_res = svldff1uh_gather_index_s32 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_s32_u32index: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s32_u32index, svint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32index_s32 (p0, x0, z0), + z0_res = svldff1uh_gather_index_s32 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_s32_u32index: +** ldff1h z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s32_u32index, svint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32index_s32 (p0, x0, z1), + z0_res = svldff1uh_gather_index_s32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s64.c new file mode 100644 index 000000000..663a73d27 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_s64.c @@ -0,0 +1,288 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uh_gather_s64_tied1: +** ldff1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_s64 (p0, z0), + z0_res = svldff1uh_gather_s64 (p0, z0)) + +/* +** ldff1uh_gather_s64_untied: +** ldff1h z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_s64 (p0, z1), + z0_res = svldff1uh_gather_s64 (p0, z1)) + +/* +** ldff1uh_gather_x0_s64_offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svldff1uh_gather_offset_s64 (p0, z0, x0)) + +/* +** ldff1uh_gather_m2_s64_offset: +** mov (x[0-9]+), #?-2 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_m2_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, -2), + z0_res = svldff1uh_gather_offset_s64 (p0, z0, -2)) + +/* +** ldff1uh_gather_0_s64_offset: +** ldff1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svldff1uh_gather_offset_s64 (p0, z0, 0)) + +/* +** ldff1uh_gather_5_s64_offset: +** mov (x[0-9]+), #?5 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svldff1uh_gather_offset_s64 (p0, z0, 5)) + +/* +** ldff1uh_gather_6_s64_offset: +** ldff1h z0\.d, p0/z, \[z0\.d, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_6_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 6), + z0_res = svldff1uh_gather_offset_s64 (p0, z0, 6)) + +/* +** ldff1uh_gather_62_s64_offset: +** ldff1h z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_62_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 62), + z0_res = svldff1uh_gather_offset_s64 (p0, z0, 62)) + +/* +** ldff1uh_gather_64_s64_offset: +** mov (x[0-9]+), #?64 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ 
+TEST_LOAD_GATHER_ZS (ldff1uh_gather_64_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_s64 (p0, z0, 64), + z0_res = svldff1uh_gather_offset_s64 (p0, z0, 64)) + +/* +** ldff1uh_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?1 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svldff1uh_gather_index_s64 (p0, z0, x0)) + +/* +** ldff1uh_gather_m1_s64_index: +** mov (x[0-9]+), #?-2 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svldff1uh_gather_index_s64 (p0, z0, -1)) + +/* +** ldff1uh_gather_0_s64_index: +** ldff1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svldff1uh_gather_index_s64 (p0, z0, 0)) + +/* +** ldff1uh_gather_5_s64_index: +** ldff1h z0\.d, p0/z, \[z0\.d, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svldff1uh_gather_index_s64 (p0, z0, 5)) + +/* +** ldff1uh_gather_31_s64_index: +** ldff1h z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svldff1uh_gather_index_s64 (p0, z0, 31)) + +/* +** ldff1uh_gather_32_s64_index: +** mov (x[0-9]+), #?64 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svldff1uh_gather_index_s64 (p0, z0, 32)) + +/* +** ldff1uh_gather_x0_s64_s64offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s64_s64offset, svint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_s64_s64offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s64_s64offset, svint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_s64_s64offset: +** ldff1h z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s64_s64offset, svint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64offset_s64 (p0, x0, z1), + z0_res = svldff1uh_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1uh_gather_ext_s64_s64offset: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_s64_s64offset, svint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1uh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uh_gather_x0_s64_u64offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s64_u64offset, svint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_s64_u64offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ 
(ldff1uh_gather_tied1_s64_u64offset, svint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_s64_u64offset: +** ldff1h z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s64_u64offset, svint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64offset_s64 (p0, x0, z1), + z0_res = svldff1uh_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1uh_gather_ext_s64_u64offset: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_s64_u64offset, svint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1uh_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uh_gather_x0_s64_s64index: +** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s64_s64index, svint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1uh_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_s64_s64index: +** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s64_s64index, svint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1uh_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_s64_s64index: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s64_s64index, svint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64index_s64 (p0, x0, z1), + z0_res = svldff1uh_gather_index_s64 (p0, x0, z1)) + +/* +** ldff1uh_gather_ext_s64_s64index: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_s64_s64index, svint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1uh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uh_gather_x0_s64_u64index: +** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_s64_u64index, svint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1uh_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_s64_u64index: +** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_s64_u64index, svint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1uh_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_s64_u64index: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_s64_u64index, svint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64index_s64 (p0, x0, z1), + z0_res = svldff1uh_gather_index_s64 (p0, x0, z1)) + +/* +** ldff1uh_gather_ext_s64_u64index: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_s64_u64index, svint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1uh_gather_index_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u32.c new file mode 100644 index 000000000..5e0ef067f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u32.c @@ -0,0 +1,252 @@ 
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uh_gather_u32_tied1: +** ldff1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_u32_tied1, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_u32 (p0, z0), + z0_res = svldff1uh_gather_u32 (p0, z0)) + +/* +** ldff1uh_gather_u32_untied: +** ldff1h z0\.s, p0/z, \[z1\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_u32_untied, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_u32 (p0, z1), + z0_res = svldff1uh_gather_u32 (p0, z1)) + +/* +** ldff1uh_gather_x0_u32_offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, x0), + z0_res = svldff1uh_gather_offset_u32 (p0, z0, x0)) + +/* +** ldff1uh_gather_m2_u32_offset: +** mov (x[0-9]+), #?-2 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_m2_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, -2), + z0_res = svldff1uh_gather_offset_u32 (p0, z0, -2)) + +/* +** ldff1uh_gather_0_u32_offset: +** ldff1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 0), + z0_res = svldff1uh_gather_offset_u32 (p0, z0, 0)) + +/* +** ldff1uh_gather_5_u32_offset: +** mov (x[0-9]+), #?5 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 5), + z0_res = svldff1uh_gather_offset_u32 (p0, z0, 5)) + +/* +** ldff1uh_gather_6_u32_offset: +** ldff1h z0\.s, p0/z, \[z0\.s, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_6_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 6), + z0_res = svldff1uh_gather_offset_u32 (p0, z0, 6)) + +/* +** ldff1uh_gather_62_u32_offset: +** ldff1h z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_62_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 62), + z0_res = svldff1uh_gather_offset_u32 (p0, z0, 62)) + +/* +** ldff1uh_gather_64_u32_offset: +** mov (x[0-9]+), #?64 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_64_u32_offset, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_offset_u32 (p0, z0, 64), + z0_res = svldff1uh_gather_offset_u32 (p0, z0, 64)) + +/* +** ldff1uh_gather_x0_u32_index: +** lsl (x[0-9]+), x0, #?1 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, x0), + z0_res = svldff1uh_gather_index_u32 (p0, z0, x0)) + +/* +** ldff1uh_gather_m1_u32_index: +** mov (x[0-9]+), #?-2 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_m1_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, -1), + z0_res = svldff1uh_gather_index_u32 (p0, z0, -1)) + +/* +** ldff1uh_gather_0_u32_index: +** ldff1h z0\.s, p0/z, \[z0\.s\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, 0), + z0_res = svldff1uh_gather_index_u32 (p0, z0, 0)) + +/* +** 
ldff1uh_gather_5_u32_index: +** ldff1h z0\.s, p0/z, \[z0\.s, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, 5), + z0_res = svldff1uh_gather_index_u32 (p0, z0, 5)) + +/* +** ldff1uh_gather_31_u32_index: +** ldff1h z0\.s, p0/z, \[z0\.s, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_31_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, 31), + z0_res = svldff1uh_gather_index_u32 (p0, z0, 31)) + +/* +** ldff1uh_gather_32_u32_index: +** mov (x[0-9]+), #?64 +** ldff1h z0\.s, p0/z, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_32_u32_index, svuint32_t, svuint32_t, + z0_res = svldff1uh_gather_u32base_index_u32 (p0, z0, 32), + z0_res = svldff1uh_gather_index_u32 (p0, z0, 32)) + +/* +** ldff1uh_gather_x0_u32_s32offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u32_s32offset, svuint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_u32_s32offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u32_s32offset, svuint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32offset_u32 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_u32_s32offset: +** ldff1h z0\.s, p0/z, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u32_s32offset, svuint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32offset_u32 (p0, x0, z1), + z0_res = svldff1uh_gather_offset_u32 (p0, x0, z1)) + +/* +** ldff1uh_gather_x0_u32_u32offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u32_u32offset, svuint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_u32_u32offset: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u32_u32offset, svuint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32offset_u32 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_u32 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_u32_u32offset: +** ldff1h z0\.s, p0/z, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u32_u32offset, svuint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32offset_u32 (p0, x0, z1), + z0_res = svldff1uh_gather_offset_u32 (p0, x0, z1)) + +/* +** ldff1uh_gather_x0_u32_s32index: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u32_s32index, svuint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32index_u32 (p0, x0, z0), + z0_res = svldff1uh_gather_index_u32 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_u32_s32index: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u32_s32index, svuint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32index_u32 (p0, x0, z0), + z0_res = svldff1uh_gather_index_u32 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_u32_s32index: +** ldff1h z0\.s, p0/z, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u32_s32index, svuint32_t, uint16_t, svint32_t, + z0_res = svldff1uh_gather_s32index_u32 (p0, x0, z1), + z0_res = svldff1uh_gather_index_u32 (p0, x0, z1)) + +/* 
+** ldff1uh_gather_x0_u32_u32index: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u32_u32index, svuint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32index_u32 (p0, x0, z0), + z0_res = svldff1uh_gather_index_u32 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_u32_u32index: +** ldff1h z0\.s, p0/z, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u32_u32index, svuint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32index_u32 (p0, x0, z0), + z0_res = svldff1uh_gather_index_u32 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_u32_u32index: +** ldff1h z0\.s, p0/z, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u32_u32index, svuint32_t, uint16_t, svuint32_t, + z0_res = svldff1uh_gather_u32index_u32 (p0, x0, z1), + z0_res = svldff1uh_gather_index_u32 (p0, x0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u64.c new file mode 100644 index 000000000..1cfae1b95 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_gather_u64.c @@ -0,0 +1,288 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uh_gather_u64_tied1: +** ldff1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_u64 (p0, z0), + z0_res = svldff1uh_gather_u64 (p0, z0)) + +/* +** ldff1uh_gather_u64_untied: +** ldff1h z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_u64 (p0, z1), + z0_res = svldff1uh_gather_u64 (p0, z1)) + +/* +** ldff1uh_gather_x0_u64_offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svldff1uh_gather_offset_u64 (p0, z0, x0)) + +/* +** ldff1uh_gather_m2_u64_offset: +** mov (x[0-9]+), #?-2 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_m2_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, -2), + z0_res = svldff1uh_gather_offset_u64 (p0, z0, -2)) + +/* +** ldff1uh_gather_0_u64_offset: +** ldff1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svldff1uh_gather_offset_u64 (p0, z0, 0)) + +/* +** ldff1uh_gather_5_u64_offset: +** mov (x[0-9]+), #?5 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svldff1uh_gather_offset_u64 (p0, z0, 5)) + +/* +** ldff1uh_gather_6_u64_offset: +** ldff1h z0\.d, p0/z, \[z0\.d, #6\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_6_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 6), + z0_res = svldff1uh_gather_offset_u64 (p0, z0, 6)) + +/* +** ldff1uh_gather_62_u64_offset: +** ldff1h z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_62_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 62), + z0_res = svldff1uh_gather_offset_u64 (p0, z0, 62)) + +/* +** 
ldff1uh_gather_64_u64_offset: +** mov (x[0-9]+), #?64 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_64_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_offset_u64 (p0, z0, 64), + z0_res = svldff1uh_gather_offset_u64 (p0, z0, 64)) + +/* +** ldff1uh_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?1 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svldff1uh_gather_index_u64 (p0, z0, x0)) + +/* +** ldff1uh_gather_m1_u64_index: +** mov (x[0-9]+), #?-2 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svldff1uh_gather_index_u64 (p0, z0, -1)) + +/* +** ldff1uh_gather_0_u64_index: +** ldff1h z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svldff1uh_gather_index_u64 (p0, z0, 0)) + +/* +** ldff1uh_gather_5_u64_index: +** ldff1h z0\.d, p0/z, \[z0\.d, #10\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svldff1uh_gather_index_u64 (p0, z0, 5)) + +/* +** ldff1uh_gather_31_u64_index: +** ldff1h z0\.d, p0/z, \[z0\.d, #62\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svldff1uh_gather_index_u64 (p0, z0, 31)) + +/* +** ldff1uh_gather_32_u64_index: +** mov (x[0-9]+), #?64 +** ldff1h z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uh_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uh_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svldff1uh_gather_index_u64 (p0, z0, 32)) + +/* +** ldff1uh_gather_x0_u64_s64offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u64_s64offset, svuint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_u64_s64offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u64_s64offset, svuint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_u64_s64offset: +** ldff1h z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u64_s64offset, svuint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64offset_u64 (p0, x0, z1), + z0_res = svldff1uh_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1uh_gather_ext_u64_s64offset: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_u64_s64offset, svuint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1uh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uh_gather_x0_u64_u64offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_u64 (p0, x0, z0)) + +/* +** 
ldff1uh_gather_tied1_u64_u64offset: +** ldff1h z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1uh_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_u64_u64offset: +** ldff1h z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64offset_u64 (p0, x0, z1), + z0_res = svldff1uh_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1uh_gather_ext_u64_u64offset: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1uh_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uh_gather_x0_u64_s64index: +** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u64_s64index, svuint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1uh_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_u64_s64index: +** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u64_s64index, svuint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1uh_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_u64_s64index: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u64_s64index, svuint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64index_u64 (p0, x0, z1), + z0_res = svldff1uh_gather_index_u64 (p0, x0, z1)) + +/* +** ldff1uh_gather_ext_u64_s64index: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_u64_s64index, svuint64_t, uint16_t, svint64_t, + z0_res = svldff1uh_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1uh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uh_gather_x0_u64_u64index: +** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_x0_u64_u64index, svuint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1uh_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1uh_gather_tied1_u64_u64index: +** ldff1h z0\.d, p0/z, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_tied1_u64_u64index, svuint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1uh_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1uh_gather_untied_u64_u64index: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_untied_u64_u64index, svuint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64index_u64 (p0, x0, z1), + z0_res = svldff1uh_gather_index_u64 (p0, x0, z1)) + +/* +** ldff1uh_gather_ext_u64_u64index: +** ldff1h z0\.d, p0/z, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uh_gather_ext_u64_u64index, svuint64_t, uint16_t, svuint64_t, + z0_res = svldff1uh_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1uh_gather_index_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s32.c new file mode 100644 index 000000000..abb3d769a 
--- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s32.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uh_s32_base: +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_s32_base, svint32_t, uint16_t, + z0 = svldff1uh_s32 (p0, x0), + z0 = svldff1uh_s32 (p0, x0)) + +/* +** ldff1uh_s32_index: +** ldff1h z0\.s, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1uh_s32_index, svint32_t, uint16_t, + z0 = svldff1uh_s32 (p0, x0 + x1), + z0 = svldff1uh_s32 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_s32_1: +** inch x0 +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_s32_1, svint32_t, uint16_t, + z0 = svldff1uh_s32 (p0, x0 + svcntw ()), + z0 = svldff1uh_s32 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_s32_m1: +** dech x0 +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_s32_m1, svint32_t, uint16_t, + z0 = svldff1uh_s32 (p0, x0 - svcntw ()), + z0 = svldff1uh_s32 (p0, x0 - svcntw ())) + +/* +** ldff1uh_vnum_s32_0: +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_s32_0, svint32_t, uint16_t, + z0 = svldff1uh_vnum_s32 (p0, x0, 0), + z0 = svldff1uh_vnum_s32 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_vnum_s32_1: +** inch x0 +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_s32_1, svint32_t, uint16_t, + z0 = svldff1uh_vnum_s32 (p0, x0, 1), + z0 = svldff1uh_vnum_s32 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_vnum_s32_m1: +** dech x0 +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_s32_m1, svint32_t, uint16_t, + z0 = svldff1uh_vnum_s32 (p0, x0, -1), + z0 = svldff1uh_vnum_s32 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1uh_vnum_s32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1h z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_s32_x1, svint32_t, uint16_t, + z0 = svldff1uh_vnum_s32 (p0, x0, x1), + z0 = svldff1uh_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s64.c new file mode 100644 index 000000000..6e330e8e8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_s64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uh_s64_base: +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_s64_base, svint64_t, uint16_t, + z0 = svldff1uh_s64 (p0, x0), + z0 = svldff1uh_s64 (p0, x0)) + +/* +** ldff1uh_s64_index: +** ldff1h z0\.d, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1uh_s64_index, svint64_t, uint16_t, + z0 = svldff1uh_s64 (p0, x0 + x1), + z0 = svldff1uh_s64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_s64_1: +** incw x0 +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_s64_1, svint64_t, uint16_t, + z0 = svldff1uh_s64 (p0, x0 + svcntd ()), + z0 = svldff1uh_s64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1uh_s64_m1: +** decw x0 +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_s64_m1, svint64_t, uint16_t, + z0 = svldff1uh_s64 (p0, x0 - svcntd ()), + z0 = svldff1uh_s64 (p0, x0 - svcntd ())) + +/* +** ldff1uh_vnum_s64_0: +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_s64_0, svint64_t, uint16_t, + z0 = svldff1uh_vnum_s64 (p0, x0, 0), + z0 = svldff1uh_vnum_s64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_vnum_s64_1: +** incw x0 +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_s64_1, svint64_t, uint16_t, + z0 = svldff1uh_vnum_s64 (p0, x0, 1), + z0 = svldff1uh_vnum_s64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_vnum_s64_m1: +** decw x0 +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_s64_m1, svint64_t, uint16_t, + z0 = svldff1uh_vnum_s64 (p0, x0, -1), + z0 = svldff1uh_vnum_s64 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1uh_vnum_s64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1h z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_s64_x1, svint64_t, uint16_t, + z0 = svldff1uh_vnum_s64 (p0, x0, x1), + z0 = svldff1uh_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u32.c new file mode 100644 index 000000000..4eb5323e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u32.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uh_u32_base: +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_u32_base, svuint32_t, uint16_t, + z0 = svldff1uh_u32 (p0, x0), + z0 = svldff1uh_u32 (p0, x0)) + +/* +** ldff1uh_u32_index: +** ldff1h z0\.s, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1uh_u32_index, svuint32_t, uint16_t, + z0 = svldff1uh_u32 (p0, x0 + x1), + z0 = svldff1uh_u32 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_u32_1: +** inch x0 +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_u32_1, svuint32_t, uint16_t, + z0 = svldff1uh_u32 (p0, x0 + svcntw ()), + z0 = svldff1uh_u32 (p0, x0 + svcntw ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_u32_m1: +** dech x0 +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_u32_m1, svuint32_t, uint16_t, + z0 = svldff1uh_u32 (p0, x0 - svcntw ()), + z0 = svldff1uh_u32 (p0, x0 - svcntw ())) + +/* +** ldff1uh_vnum_u32_0: +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_u32_0, svuint32_t, uint16_t, + z0 = svldff1uh_vnum_u32 (p0, x0, 0), + z0 = svldff1uh_vnum_u32 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_vnum_u32_1: +** inch x0 +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_u32_1, svuint32_t, uint16_t, + z0 = svldff1uh_vnum_u32 (p0, x0, 1), + z0 = svldff1uh_vnum_u32 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_vnum_u32_m1: +** dech x0 +** ldff1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_u32_m1, svuint32_t, uint16_t, + z0 = svldff1uh_vnum_u32 (p0, x0, -1), + z0 = svldff1uh_vnum_u32 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ldff1uh_vnum_u32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1h z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_u32_x1, svuint32_t, uint16_t, + z0 = svldff1uh_vnum_u32 (p0, x0, x1), + z0 = svldff1uh_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u64.c new file mode 100644 index 000000000..ebac26e7d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uh_u64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uh_u64_base: +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_u64_base, svuint64_t, uint16_t, + z0 = svldff1uh_u64 (p0, x0), + z0 = svldff1uh_u64 (p0, x0)) + +/* +** ldff1uh_u64_index: +** ldff1h z0\.d, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldff1uh_u64_index, svuint64_t, uint16_t, + z0 = svldff1uh_u64 (p0, x0 + x1), + z0 = svldff1uh_u64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_u64_1: +** incw x0 +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_u64_1, svuint64_t, uint16_t, + z0 = svldff1uh_u64 (p0, x0 + svcntd ()), + z0 = svldff1uh_u64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_u64_m1: +** decw x0 +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_u64_m1, svuint64_t, uint16_t, + z0 = svldff1uh_u64 (p0, x0 - svcntd ()), + z0 = svldff1uh_u64 (p0, x0 - svcntd ())) + +/* +** ldff1uh_vnum_u64_0: +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_u64_0, svuint64_t, uint16_t, + z0 = svldff1uh_vnum_u64 (p0, x0, 0), + z0 = svldff1uh_vnum_u64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_vnum_u64_1: +** incw x0 +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_u64_1, svuint64_t, uint16_t, + z0 = svldff1uh_vnum_u64 (p0, x0, 1), + z0 = svldff1uh_vnum_u64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uh_vnum_u64_m1: +** decw x0 +** ldff1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_u64_m1, svuint64_t, uint16_t, + z0 = svldff1uh_vnum_u64 (p0, x0, -1), + z0 = svldff1uh_vnum_u64 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1uh_vnum_u64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1h z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1uh_vnum_u64_x1, svuint64_t, uint16_t, + z0 = svldff1uh_vnum_u64 (p0, x0, x1), + z0 = svldff1uh_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_s64.c new file mode 100644 index 000000000..6c0daea52 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_s64.c @@ -0,0 +1,308 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uw_gather_s64_tied1: +** ldff1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_s64_tied1, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_s64 (p0, z0), + z0_res = svldff1uw_gather_s64 (p0, z0)) + +/* +** ldff1uw_gather_s64_untied: +** ldff1w z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_s64_untied, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_s64 (p0, z1), + z0_res = svldff1uw_gather_s64 (p0, z1)) + +/* +** ldff1uw_gather_x0_s64_offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_x0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, x0), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, x0)) + +/* +** ldff1uw_gather_m4_s64_offset: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_m4_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, -4), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, -4)) + +/* +** ldff1uw_gather_0_s64_offset: +** ldff1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_0_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 0), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, 0)) + +/* +** ldff1uw_gather_5_s64_offset: +** mov (x[0-9]+), #?5 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_5_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 5), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, 5)) + +/* +** ldff1uw_gather_6_s64_offset: +** mov (x[0-9]+), #?6 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_6_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 6), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, 6)) + +/* +** ldff1uw_gather_7_s64_offset: +** mov (x[0-9]+), #?7 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_7_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 7), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, 7)) + +/* +** ldff1uw_gather_8_s64_offset: +** ldff1w z0\.d, p0/z, \[z0\.d, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_8_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 8), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, 8)) + +/* +** ldff1uw_gather_124_s64_offset: +** ldff1w z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_124_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 124), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, 124)) + +/* +** ldff1uw_gather_128_s64_offset: +** mov (x[0-9]+), #?128 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_128_s64_offset, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_s64 (p0, z0, 128), + z0_res = svldff1uw_gather_offset_s64 (p0, z0, 128)) + +/* +** ldff1uw_gather_x0_s64_index: +** lsl (x[0-9]+), x0, #?2 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_x0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, x0), + z0_res = svldff1uw_gather_index_s64 (p0, z0, x0)) + +/* +** ldff1uw_gather_m1_s64_index: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.d, p0/z, \[\1, 
z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_m1_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, -1), + z0_res = svldff1uw_gather_index_s64 (p0, z0, -1)) + +/* +** ldff1uw_gather_0_s64_index: +** ldff1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_0_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, 0), + z0_res = svldff1uw_gather_index_s64 (p0, z0, 0)) + +/* +** ldff1uw_gather_5_s64_index: +** ldff1w z0\.d, p0/z, \[z0\.d, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_5_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, 5), + z0_res = svldff1uw_gather_index_s64 (p0, z0, 5)) + +/* +** ldff1uw_gather_31_s64_index: +** ldff1w z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_31_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, 31), + z0_res = svldff1uw_gather_index_s64 (p0, z0, 31)) + +/* +** ldff1uw_gather_32_s64_index: +** mov (x[0-9]+), #?128 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_32_s64_index, svint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_s64 (p0, z0, 32), + z0_res = svldff1uw_gather_index_s64 (p0, z0, 32)) + +/* +** ldff1uw_gather_x0_s64_s64offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_s64_s64offset, svint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1uw_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1uw_gather_tied1_s64_s64offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_s64_s64offset, svint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64offset_s64 (p0, x0, z0), + z0_res = svldff1uw_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1uw_gather_untied_s64_s64offset: +** ldff1w z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_s64_s64offset, svint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64offset_s64 (p0, x0, z1), + z0_res = svldff1uw_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1uw_gather_ext_s64_s64offset: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_s64_s64offset, svint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1uw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uw_gather_x0_s64_u64offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_s64_u64offset, svint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1uw_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1uw_gather_tied1_s64_u64offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_s64_u64offset, svint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64offset_s64 (p0, x0, z0), + z0_res = svldff1uw_gather_offset_s64 (p0, x0, z0)) + +/* +** ldff1uw_gather_untied_s64_u64offset: +** ldff1w z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_s64_u64offset, svint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64offset_s64 (p0, x0, z1), + z0_res = svldff1uw_gather_offset_s64 (p0, x0, z1)) + +/* +** ldff1uw_gather_ext_s64_u64offset: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ 
(ldff1uw_gather_ext_s64_u64offset, svint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1uw_gather_offset_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uw_gather_x0_s64_s64index: +** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_s64_s64index, svint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1uw_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1uw_gather_tied1_s64_s64index: +** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_s64_s64index, svint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64index_s64 (p0, x0, z0), + z0_res = svldff1uw_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1uw_gather_untied_s64_s64index: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_s64_s64index, svint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64index_s64 (p0, x0, z1), + z0_res = svldff1uw_gather_index_s64 (p0, x0, z1)) + +/* +** ldff1uw_gather_ext_s64_s64index: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_s64_s64index, svint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64index_s64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1uw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uw_gather_x0_s64_u64index: +** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_s64_u64index, svint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1uw_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1uw_gather_tied1_s64_u64index: +** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_s64_u64index, svint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64index_s64 (p0, x0, z0), + z0_res = svldff1uw_gather_index_s64 (p0, x0, z0)) + +/* +** ldff1uw_gather_untied_s64_u64index: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_s64_u64index, svint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64index_s64 (p0, x0, z1), + z0_res = svldff1uw_gather_index_s64 (p0, x0, z1)) + +/* +** ldff1uw_gather_ext_s64_u64index: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_s64_u64index, svint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64index_s64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1uw_gather_index_s64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_u64.c new file mode 100644 index 000000000..0e400c679 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_gather_u64.c @@ -0,0 +1,308 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uw_gather_u64_tied1: +** ldff1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_u64_tied1, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_u64 (p0, z0), + z0_res = svldff1uw_gather_u64 (p0, z0)) + +/* +** ldff1uw_gather_u64_untied: +** ldff1w z0\.d, p0/z, \[z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_u64_untied, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_u64 (p0, z1), + z0_res = svldff1uw_gather_u64 (p0, z1)) + +/* +** ldff1uw_gather_x0_u64_offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_x0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, x0), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, x0)) + +/* +** ldff1uw_gather_m4_u64_offset: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_m4_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, -4), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, -4)) + +/* +** ldff1uw_gather_0_u64_offset: +** ldff1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_0_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 0), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, 0)) + +/* +** ldff1uw_gather_5_u64_offset: +** mov (x[0-9]+), #?5 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_5_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 5), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, 5)) + +/* +** ldff1uw_gather_6_u64_offset: +** mov (x[0-9]+), #?6 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_6_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 6), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, 6)) + +/* +** ldff1uw_gather_7_u64_offset: +** mov (x[0-9]+), #?7 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_7_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 7), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, 7)) + +/* +** ldff1uw_gather_8_u64_offset: +** ldff1w z0\.d, p0/z, \[z0\.d, #8\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_8_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 8), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, 8)) + +/* +** ldff1uw_gather_124_u64_offset: +** ldff1w z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_124_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 124), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, 124)) + +/* +** ldff1uw_gather_128_u64_offset: +** mov (x[0-9]+), #?128 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_128_u64_offset, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_offset_u64 (p0, z0, 128), + z0_res = svldff1uw_gather_offset_u64 (p0, z0, 128)) + +/* +** ldff1uw_gather_x0_u64_index: +** lsl (x[0-9]+), x0, #?2 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_x0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, x0), + z0_res = svldff1uw_gather_index_u64 (p0, z0, x0)) + +/* +** ldff1uw_gather_m1_u64_index: +** mov (x[0-9]+), #?-4 +** ldff1w z0\.d, p0/z, 
\[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_m1_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, -1), + z0_res = svldff1uw_gather_index_u64 (p0, z0, -1)) + +/* +** ldff1uw_gather_0_u64_index: +** ldff1w z0\.d, p0/z, \[z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_0_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, 0), + z0_res = svldff1uw_gather_index_u64 (p0, z0, 0)) + +/* +** ldff1uw_gather_5_u64_index: +** ldff1w z0\.d, p0/z, \[z0\.d, #20\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_5_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, 5), + z0_res = svldff1uw_gather_index_u64 (p0, z0, 5)) + +/* +** ldff1uw_gather_31_u64_index: +** ldff1w z0\.d, p0/z, \[z0\.d, #124\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_31_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, 31), + z0_res = svldff1uw_gather_index_u64 (p0, z0, 31)) + +/* +** ldff1uw_gather_32_u64_index: +** mov (x[0-9]+), #?128 +** ldff1w z0\.d, p0/z, \[\1, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_ZS (ldff1uw_gather_32_u64_index, svuint64_t, svuint64_t, + z0_res = svldff1uw_gather_u64base_index_u64 (p0, z0, 32), + z0_res = svldff1uw_gather_index_u64 (p0, z0, 32)) + +/* +** ldff1uw_gather_x0_u64_s64offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_u64_s64offset, svuint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1uw_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1uw_gather_tied1_u64_s64offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_u64_s64offset, svuint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64offset_u64 (p0, x0, z0), + z0_res = svldff1uw_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1uw_gather_untied_u64_s64offset: +** ldff1w z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_u64_s64offset, svuint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64offset_u64 (p0, x0, z1), + z0_res = svldff1uw_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1uw_gather_ext_u64_s64offset: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_u64_s64offset, svuint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1uw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uw_gather_x0_u64_u64offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1uw_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1uw_gather_tied1_u64_u64offset: +** ldff1w z0\.d, p0/z, \[x0, z0\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64offset_u64 (p0, x0, z0), + z0_res = svldff1uw_gather_offset_u64 (p0, x0, z0)) + +/* +** ldff1uw_gather_untied_u64_u64offset: +** ldff1w z0\.d, p0/z, \[x0, z1\.d\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64offset_u64 (p0, x0, z1), + z0_res = svldff1uw_gather_offset_u64 (p0, x0, z1)) + +/* +** ldff1uw_gather_ext_u64_u64offset: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, uxtw\] +** ret +*/ 
+TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1uw_gather_offset_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uw_gather_x0_u64_s64index: +** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_u64_s64index, svuint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1uw_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1uw_gather_tied1_u64_s64index: +** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_u64_s64index, svuint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64index_u64 (p0, x0, z0), + z0_res = svldff1uw_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1uw_gather_untied_u64_s64index: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_u64_s64index, svuint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64index_u64 (p0, x0, z1), + z0_res = svldff1uw_gather_index_u64 (p0, x0, z1)) + +/* +** ldff1uw_gather_ext_u64_s64index: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_u64_s64index, svuint64_t, uint32_t, svint64_t, + z0_res = svldff1uw_gather_s64index_u64 (p0, x0, svextw_s64_x (p0, z1)), + z0_res = svldff1uw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) + +/* +** ldff1uw_gather_x0_u64_u64index: +** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_x0_u64_u64index, svuint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1uw_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1uw_gather_tied1_u64_u64index: +** ldff1w z0\.d, p0/z, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_tied1_u64_u64index, svuint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64index_u64 (p0, x0, z0), + z0_res = svldff1uw_gather_index_u64 (p0, x0, z0)) + +/* +** ldff1uw_gather_untied_u64_u64index: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_untied_u64_u64index, svuint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64index_u64 (p0, x0, z1), + z0_res = svldff1uw_gather_index_u64 (p0, x0, z1)) + +/* +** ldff1uw_gather_ext_u64_u64index: +** ldff1w z0\.d, p0/z, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_LOAD_GATHER_SZ (ldff1uw_gather_ext_u64_u64index, svuint64_t, uint32_t, svuint64_t, + z0_res = svldff1uw_gather_u64index_u64 (p0, x0, svextw_u64_x (p0, z1)), + z0_res = svldff1uw_gather_index_u64 (p0, x0, svextw_x (p0, z1))) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_s64.c new file mode 100644 index 000000000..ac9779899 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_s64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uw_s64_base: +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_s64_base, svint64_t, uint32_t, + z0 = svldff1uw_s64 (p0, x0), + z0 = svldff1uw_s64 (p0, x0)) + +/* +** ldff1uw_s64_index: +** ldff1w z0\.d, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldff1uw_s64_index, svint64_t, uint32_t, + z0 = svldff1uw_s64 (p0, x0 + x1), + z0 = svldff1uw_s64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1uw_s64_1: +** inch x0 +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_s64_1, svint64_t, uint32_t, + z0 = svldff1uw_s64 (p0, x0 + svcntd ()), + z0 = svldff1uw_s64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uw_s64_m1: +** dech x0 +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_s64_m1, svint64_t, uint32_t, + z0 = svldff1uw_s64 (p0, x0 - svcntd ()), + z0 = svldff1uw_s64 (p0, x0 - svcntd ())) + +/* +** ldff1uw_vnum_s64_0: +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_vnum_s64_0, svint64_t, uint32_t, + z0 = svldff1uw_vnum_s64 (p0, x0, 0), + z0 = svldff1uw_vnum_s64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uw_vnum_s64_1: +** inch x0 +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_vnum_s64_1, svint64_t, uint32_t, + z0 = svldff1uw_vnum_s64 (p0, x0, 1), + z0 = svldff1uw_vnum_s64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uw_vnum_s64_m1: +** dech x0 +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_vnum_s64_m1, svint64_t, uint32_t, + z0 = svldff1uw_vnum_s64 (p0, x0, -1), + z0 = svldff1uw_vnum_s64 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1uw_vnum_s64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1w z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1uw_vnum_s64_x1, svint64_t, uint32_t, + z0 = svldff1uw_vnum_s64 (p0, x0, x1), + z0 = svldff1uw_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_u64.c new file mode 100644 index 000000000..c7ab06171 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldff1uw_u64.c @@ -0,0 +1,86 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldff1uw_u64_base: +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_u64_base, svuint64_t, uint32_t, + z0 = svldff1uw_u64 (p0, x0), + z0 = svldff1uw_u64 (p0, x0)) + +/* +** ldff1uw_u64_index: +** ldff1w z0\.d, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldff1uw_u64_index, svuint64_t, uint32_t, + z0 = svldff1uw_u64 (p0, x0 + x1), + z0 = svldff1uw_u64 (p0, x0 + x1)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uw_u64_1: +** inch x0 +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_u64_1, svuint64_t, uint32_t, + z0 = svldff1uw_u64 (p0, x0 + svcntd ()), + z0 = svldff1uw_u64 (p0, x0 + svcntd ())) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uw_u64_m1: +** dech x0 +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_u64_m1, svuint64_t, uint32_t, + z0 = svldff1uw_u64 (p0, x0 - svcntd ()), + z0 = svldff1uw_u64 (p0, x0 - svcntd ())) + +/* +** ldff1uw_vnum_u64_0: +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_vnum_u64_0, svuint64_t, uint32_t, + z0 = svldff1uw_vnum_u64 (p0, x0, 0), + z0 = svldff1uw_vnum_u64 (p0, x0, 0)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldff1uw_vnum_u64_1: +** inch x0 +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_vnum_u64_1, svuint64_t, uint32_t, + z0 = svldff1uw_vnum_u64 (p0, x0, 1), + z0 = svldff1uw_vnum_u64 (p0, x0, 1)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldff1uw_vnum_u64_m1: +** dech x0 +** ldff1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldff1uw_vnum_u64_m1, svuint64_t, uint32_t, + z0 = svldff1uw_vnum_u64 (p0, x0, -1), + z0 = svldff1uw_vnum_u64 (p0, x0, -1)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldff1uw_vnum_u64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldff1w z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldff1uw_vnum_u64_x1, svuint64_t, uint32_t, + z0 = svldff1uw_vnum_u64 (p0, x0, x1), + z0 = svldff1uw_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_bf16.c new file mode 100644 index 000000000..947a896e7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_bf16.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_bf16_base: +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_bf16_base, svbfloat16_t, bfloat16_t, + z0 = svldnf1_bf16 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_bf16_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1h z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_bf16_index, svbfloat16_t, bfloat16_t, + z0 = svldnf1_bf16 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_bf16_1: +** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_bf16_1, svbfloat16_t, bfloat16_t, + z0 = svldnf1_bf16 (p0, x0 + svcnth ()), + z0 = svldnf1 (p0, x0 + svcnth ())) + +/* +** ldnf1_bf16_7: +** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_bf16_7, svbfloat16_t, bfloat16_t, + z0 = svldnf1_bf16 (p0, x0 + svcnth () * 7), + z0 = svldnf1 (p0, x0 + svcnth () * 7)) + +/* +** ldnf1_bf16_8: +** incb x0, all, mul #8 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_bf16_8, svbfloat16_t, bfloat16_t, + z0 = svldnf1_bf16 (p0, x0 + svcnth () * 8), + z0 = svldnf1 (p0, x0 + svcnth () * 8)) + +/* +** ldnf1_bf16_m1: +** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_bf16_m1, svbfloat16_t, bfloat16_t, + z0 = svldnf1_bf16 (p0, x0 - svcnth ()), + z0 = svldnf1 (p0, x0 - svcnth ())) + +/* +** ldnf1_bf16_m8: +** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_bf16_m8, svbfloat16_t, bfloat16_t, + z0 = svldnf1_bf16 (p0, x0 - svcnth () * 8), + z0 = svldnf1 (p0, x0 - svcnth () * 8)) + +/* +** ldnf1_bf16_m9: +** decb x0, all, mul #9 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_bf16_m9, svbfloat16_t, bfloat16_t, + z0 = svldnf1_bf16 (p0, x0 - svcnth () * 9), + z0 = svldnf1 (p0, x0 - svcnth () * 9)) + +/* +** ldnf1_vnum_bf16_0: +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_bf16_0, svbfloat16_t, bfloat16_t, + z0 = svldnf1_vnum_bf16 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_bf16_1: +** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_bf16_1, svbfloat16_t, bfloat16_t, + z0 = svldnf1_vnum_bf16 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_bf16_7: +** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_bf16_7, svbfloat16_t, bfloat16_t, + z0 = svldnf1_vnum_bf16 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_bf16_8: +** incb x0, all, mul #8 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_bf16_8, svbfloat16_t, bfloat16_t, + z0 = svldnf1_vnum_bf16 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** 
ldnf1_vnum_bf16_m1: +** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, + z0 = svldnf1_vnum_bf16 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_bf16_m8: +** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, + z0 = svldnf1_vnum_bf16 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_bf16_m9: +** decb x0, all, mul #9 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, + z0 = svldnf1_vnum_bf16 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, + z0 = svldnf1_vnum_bf16 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f16.c new file mode 100644 index 000000000..cf0178688 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f16.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_f16_base: +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f16_base, svfloat16_t, float16_t, + z0 = svldnf1_f16 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_f16_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1h z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_f16_index, svfloat16_t, float16_t, + z0 = svldnf1_f16 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_f16_1: +** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f16_1, svfloat16_t, float16_t, + z0 = svldnf1_f16 (p0, x0 + svcnth ()), + z0 = svldnf1 (p0, x0 + svcnth ())) + +/* +** ldnf1_f16_7: +** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f16_7, svfloat16_t, float16_t, + z0 = svldnf1_f16 (p0, x0 + svcnth () * 7), + z0 = svldnf1 (p0, x0 + svcnth () * 7)) + +/* +** ldnf1_f16_8: +** incb x0, all, mul #8 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f16_8, svfloat16_t, float16_t, + z0 = svldnf1_f16 (p0, x0 + svcnth () * 8), + z0 = svldnf1 (p0, x0 + svcnth () * 8)) + +/* +** ldnf1_f16_m1: +** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f16_m1, svfloat16_t, float16_t, + z0 = svldnf1_f16 (p0, x0 - svcnth ()), + z0 = svldnf1 (p0, x0 - svcnth ())) + +/* +** ldnf1_f16_m8: +** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f16_m8, svfloat16_t, float16_t, + z0 = svldnf1_f16 (p0, x0 - svcnth () * 8), + z0 = svldnf1 (p0, x0 - svcnth () * 8)) + +/* +** ldnf1_f16_m9: +** decb x0, all, mul #9 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f16_m9, svfloat16_t, float16_t, + z0 = svldnf1_f16 (p0, x0 - svcnth () * 9), + z0 = svldnf1 (p0, x0 - svcnth () * 9)) + +/* +** ldnf1_vnum_f16_0: +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f16_0, svfloat16_t, float16_t, + z0 = svldnf1_vnum_f16 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_f16_1: +** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f16_1, svfloat16_t, float16_t, + z0 = svldnf1_vnum_f16 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_f16_7: +** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f16_7, svfloat16_t, float16_t, + z0 = 
svldnf1_vnum_f16 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_f16_8: +** incb x0, all, mul #8 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f16_8, svfloat16_t, float16_t, + z0 = svldnf1_vnum_f16 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_f16_m1: +** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f16_m1, svfloat16_t, float16_t, + z0 = svldnf1_vnum_f16 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_f16_m8: +** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f16_m8, svfloat16_t, float16_t, + z0 = svldnf1_vnum_f16 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_f16_m9: +** decb x0, all, mul #9 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f16_m9, svfloat16_t, float16_t, + z0 = svldnf1_vnum_f16 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f16_x1, svfloat16_t, float16_t, + z0 = svldnf1_vnum_f16 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f32.c new file mode 100644 index 000000000..83b73ec8e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_f32_base: +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f32_base, svfloat32_t, float32_t, + z0 = svldnf1_f32 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_f32_index: +** add (x[0-9]+), x0, x1, lsl 2 +** ldnf1w z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_f32_index, svfloat32_t, float32_t, + z0 = svldnf1_f32 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_f32_1: +** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f32_1, svfloat32_t, float32_t, + z0 = svldnf1_f32 (p0, x0 + svcntw ()), + z0 = svldnf1 (p0, x0 + svcntw ())) + +/* +** ldnf1_f32_7: +** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f32_7, svfloat32_t, float32_t, + z0 = svldnf1_f32 (p0, x0 + svcntw () * 7), + z0 = svldnf1 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1_f32_8: +** incb x0, all, mul #8 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f32_8, svfloat32_t, float32_t, + z0 = svldnf1_f32 (p0, x0 + svcntw () * 8), + z0 = svldnf1 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1_f32_m1: +** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f32_m1, svfloat32_t, float32_t, + z0 = svldnf1_f32 (p0, x0 - svcntw ()), + z0 = svldnf1 (p0, x0 - svcntw ())) + +/* +** ldnf1_f32_m8: +** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f32_m8, svfloat32_t, float32_t, + z0 = svldnf1_f32 (p0, x0 - svcntw () * 8), + z0 = svldnf1 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1_f32_m9: +** decb x0, all, mul #9 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f32_m9, svfloat32_t, float32_t, + z0 = svldnf1_f32 (p0, x0 - svcntw () * 9), + z0 = svldnf1 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1_vnum_f32_0: +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f32_0, svfloat32_t, float32_t, + z0 = svldnf1_vnum_f32 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_f32_1: +** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret 
+*/ +TEST_LOAD (ldnf1_vnum_f32_1, svfloat32_t, float32_t, + z0 = svldnf1_vnum_f32 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_f32_7: +** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f32_7, svfloat32_t, float32_t, + z0 = svldnf1_vnum_f32 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_f32_8: +** incb x0, all, mul #8 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f32_8, svfloat32_t, float32_t, + z0 = svldnf1_vnum_f32 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_f32_m1: +** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f32_m1, svfloat32_t, float32_t, + z0 = svldnf1_vnum_f32 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_f32_m8: +** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f32_m8, svfloat32_t, float32_t, + z0 = svldnf1_vnum_f32 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_f32_m9: +** decb x0, all, mul #9 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f32_m9, svfloat32_t, float32_t, + z0 = svldnf1_vnum_f32 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f32_x1, svfloat32_t, float32_t, + z0 = svldnf1_vnum_f32 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f64.c new file mode 100644 index 000000000..778096e82 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_f64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_f64_base: +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f64_base, svfloat64_t, float64_t, + z0 = svldnf1_f64 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_f64_index: +** add (x[0-9]+), x0, x1, lsl 3 +** ldnf1d z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_f64_index, svfloat64_t, float64_t, + z0 = svldnf1_f64 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_f64_1: +** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f64_1, svfloat64_t, float64_t, + z0 = svldnf1_f64 (p0, x0 + svcntd ()), + z0 = svldnf1 (p0, x0 + svcntd ())) + +/* +** ldnf1_f64_7: +** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f64_7, svfloat64_t, float64_t, + z0 = svldnf1_f64 (p0, x0 + svcntd () * 7), + z0 = svldnf1 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1_f64_8: +** incb x0, all, mul #8 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f64_8, svfloat64_t, float64_t, + z0 = svldnf1_f64 (p0, x0 + svcntd () * 8), + z0 = svldnf1 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1_f64_m1: +** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f64_m1, svfloat64_t, float64_t, + z0 = svldnf1_f64 (p0, x0 - svcntd ()), + z0 = svldnf1 (p0, x0 - svcntd ())) + +/* +** ldnf1_f64_m8: +** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_f64_m8, svfloat64_t, float64_t, + z0 = svldnf1_f64 (p0, x0 - svcntd () * 8), + z0 = svldnf1 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1_f64_m9: +** decb x0, all, mul #9 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_f64_m9, svfloat64_t, float64_t, + z0 = svldnf1_f64 (p0, x0 - svcntd () * 9), + z0 = svldnf1 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1_vnum_f64_0: +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f64_0, svfloat64_t, float64_t, + z0 = svldnf1_vnum_f64 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_f64_1: +** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f64_1, svfloat64_t, float64_t, + z0 = svldnf1_vnum_f64 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_f64_7: +** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f64_7, svfloat64_t, float64_t, + z0 = svldnf1_vnum_f64 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_f64_8: +** incb x0, all, mul #8 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f64_8, svfloat64_t, float64_t, + z0 = svldnf1_vnum_f64 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_f64_m1: +** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f64_m1, svfloat64_t, float64_t, + z0 = svldnf1_vnum_f64 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_f64_m8: +** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f64_m8, svfloat64_t, float64_t, + z0 = svldnf1_vnum_f64 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_f64_m9: +** decb x0, all, mul #9 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f64_m9, svfloat64_t, float64_t, + z0 = svldnf1_vnum_f64 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_f64_x1, svfloat64_t, float64_t, + z0 = svldnf1_vnum_f64 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s16.c new file mode 100644 index 000000000..592c8237d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s16.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_s16_base: +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s16_base, svint16_t, int16_t, + z0 = svldnf1_s16 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_s16_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1h z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_s16_index, svint16_t, int16_t, + z0 = svldnf1_s16 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_s16_1: +** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s16_1, svint16_t, int16_t, + z0 = svldnf1_s16 (p0, x0 + svcnth ()), + z0 = svldnf1 (p0, x0 + svcnth ())) + +/* +** ldnf1_s16_7: +** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s16_7, svint16_t, int16_t, + z0 = svldnf1_s16 (p0, x0 + svcnth () * 7), + z0 = svldnf1 (p0, x0 + svcnth () * 7)) + +/* +** ldnf1_s16_8: +** incb x0, all, mul #8 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s16_8, svint16_t, int16_t, + z0 = svldnf1_s16 (p0, x0 + svcnth () * 8), + z0 = svldnf1 (p0, x0 + svcnth () * 8)) + +/* +** ldnf1_s16_m1: +** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s16_m1, svint16_t, int16_t, + z0 = svldnf1_s16 (p0, x0 - svcnth ()), + z0 = svldnf1 (p0, x0 - svcnth ())) + +/* +** ldnf1_s16_m8: +** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s16_m8, svint16_t, int16_t, + z0 = svldnf1_s16 (p0, x0 - svcnth () * 8), + z0 = svldnf1 (p0, x0 - svcnth () * 8)) + +/* +** ldnf1_s16_m9: +** decb x0, all, mul #9 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s16_m9, svint16_t, int16_t, + z0 = svldnf1_s16 (p0, x0 - svcnth () * 9), + z0 = svldnf1 (p0, x0 - svcnth () * 9)) + +/* +** ldnf1_vnum_s16_0: +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s16_0, svint16_t, int16_t, + z0 = svldnf1_vnum_s16 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_s16_1: +** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s16_1, svint16_t, int16_t, + z0 = svldnf1_vnum_s16 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_s16_7: +** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s16_7, svint16_t, int16_t, + z0 = svldnf1_vnum_s16 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_s16_8: +** incb x0, all, mul #8 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s16_8, svint16_t, int16_t, + z0 = svldnf1_vnum_s16 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_s16_m1: +** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s16_m1, svint16_t, int16_t, + z0 = svldnf1_vnum_s16 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_s16_m8: +** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s16_m8, svint16_t, int16_t, + z0 = svldnf1_vnum_s16 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_s16_m9: +** decb x0, all, mul #9 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s16_m9, svint16_t, int16_t, + z0 = svldnf1_vnum_s16 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1h z0\.h, p0/z, \[\2\] 
+** ret +*/ +TEST_LOAD (ldnf1_vnum_s16_x1, svint16_t, int16_t, + z0 = svldnf1_vnum_s16 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s32.c new file mode 100644 index 000000000..634092af8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_s32_base: +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s32_base, svint32_t, int32_t, + z0 = svldnf1_s32 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_s32_index: +** add (x[0-9]+), x0, x1, lsl 2 +** ldnf1w z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_s32_index, svint32_t, int32_t, + z0 = svldnf1_s32 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_s32_1: +** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s32_1, svint32_t, int32_t, + z0 = svldnf1_s32 (p0, x0 + svcntw ()), + z0 = svldnf1 (p0, x0 + svcntw ())) + +/* +** ldnf1_s32_7: +** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s32_7, svint32_t, int32_t, + z0 = svldnf1_s32 (p0, x0 + svcntw () * 7), + z0 = svldnf1 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1_s32_8: +** incb x0, all, mul #8 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s32_8, svint32_t, int32_t, + z0 = svldnf1_s32 (p0, x0 + svcntw () * 8), + z0 = svldnf1 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1_s32_m1: +** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s32_m1, svint32_t, int32_t, + z0 = svldnf1_s32 (p0, x0 - svcntw ()), + z0 = svldnf1 (p0, x0 - svcntw ())) + +/* +** ldnf1_s32_m8: +** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s32_m8, svint32_t, int32_t, + z0 = svldnf1_s32 (p0, x0 - svcntw () * 8), + z0 = svldnf1 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1_s32_m9: +** decb x0, all, mul #9 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s32_m9, svint32_t, int32_t, + z0 = svldnf1_s32 (p0, x0 - svcntw () * 9), + z0 = svldnf1 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1_vnum_s32_0: +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s32_0, svint32_t, int32_t, + z0 = svldnf1_vnum_s32 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_s32_1: +** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s32_1, svint32_t, int32_t, + z0 = svldnf1_vnum_s32 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_s32_7: +** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s32_7, svint32_t, int32_t, + z0 = svldnf1_vnum_s32 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_s32_8: +** incb x0, all, mul #8 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s32_8, svint32_t, int32_t, + z0 = svldnf1_vnum_s32 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_s32_m1: +** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s32_m1, svint32_t, int32_t, + z0 = svldnf1_vnum_s32 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_s32_m8: +** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s32_m8, svint32_t, int32_t, + z0 = svldnf1_vnum_s32 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_s32_m9: +** decb x0, all, mul #9 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s32_m9, 
svint32_t, int32_t, + z0 = svldnf1_vnum_s32 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s32_x1, svint32_t, int32_t, + z0 = svldnf1_vnum_s32 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s64.c new file mode 100644 index 000000000..4a03f6676 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_s64_base: +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s64_base, svint64_t, int64_t, + z0 = svldnf1_s64 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_s64_index: +** add (x[0-9]+), x0, x1, lsl 3 +** ldnf1d z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_s64_index, svint64_t, int64_t, + z0 = svldnf1_s64 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_s64_1: +** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s64_1, svint64_t, int64_t, + z0 = svldnf1_s64 (p0, x0 + svcntd ()), + z0 = svldnf1 (p0, x0 + svcntd ())) + +/* +** ldnf1_s64_7: +** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s64_7, svint64_t, int64_t, + z0 = svldnf1_s64 (p0, x0 + svcntd () * 7), + z0 = svldnf1 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1_s64_8: +** incb x0, all, mul #8 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s64_8, svint64_t, int64_t, + z0 = svldnf1_s64 (p0, x0 + svcntd () * 8), + z0 = svldnf1 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1_s64_m1: +** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s64_m1, svint64_t, int64_t, + z0 = svldnf1_s64 (p0, x0 - svcntd ()), + z0 = svldnf1 (p0, x0 - svcntd ())) + +/* +** ldnf1_s64_m8: +** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s64_m8, svint64_t, int64_t, + z0 = svldnf1_s64 (p0, x0 - svcntd () * 8), + z0 = svldnf1 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1_s64_m9: +** decb x0, all, mul #9 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s64_m9, svint64_t, int64_t, + z0 = svldnf1_s64 (p0, x0 - svcntd () * 9), + z0 = svldnf1 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1_vnum_s64_0: +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s64_0, svint64_t, int64_t, + z0 = svldnf1_vnum_s64 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_s64_1: +** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s64_1, svint64_t, int64_t, + z0 = svldnf1_vnum_s64 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_s64_7: +** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s64_7, svint64_t, int64_t, + z0 = svldnf1_vnum_s64 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_s64_8: +** incb x0, all, mul #8 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s64_8, svint64_t, int64_t, + z0 = svldnf1_vnum_s64 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_s64_m1: +** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s64_m1, svint64_t, int64_t, + z0 = svldnf1_vnum_s64 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_s64_m8: +** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s64_m8, svint64_t, 
int64_t, + z0 = svldnf1_vnum_s64 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_s64_m9: +** decb x0, all, mul #9 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s64_m9, svint64_t, int64_t, + z0 = svldnf1_vnum_s64 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s64_x1, svint64_t, int64_t, + z0 = svldnf1_vnum_s64 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s8.c new file mode 100644 index 000000000..162ee176a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_s8.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_s8_base: +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s8_base, svint8_t, int8_t, + z0 = svldnf1_s8 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_s8_index: +** add (x[0-9]+), x0, x1 +** ldnf1b z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_s8_index, svint8_t, int8_t, + z0 = svldnf1_s8 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_s8_1: +** ldnf1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s8_1, svint8_t, int8_t, + z0 = svldnf1_s8 (p0, x0 + svcntb ()), + z0 = svldnf1 (p0, x0 + svcntb ())) + +/* +** ldnf1_s8_7: +** ldnf1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s8_7, svint8_t, int8_t, + z0 = svldnf1_s8 (p0, x0 + svcntb () * 7), + z0 = svldnf1 (p0, x0 + svcntb () * 7)) + +/* +** ldnf1_s8_8: +** incb x0, all, mul #8 +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s8_8, svint8_t, int8_t, + z0 = svldnf1_s8 (p0, x0 + svcntb () * 8), + z0 = svldnf1 (p0, x0 + svcntb () * 8)) + +/* +** ldnf1_s8_m1: +** ldnf1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s8_m1, svint8_t, int8_t, + z0 = svldnf1_s8 (p0, x0 - svcntb ()), + z0 = svldnf1 (p0, x0 - svcntb ())) + +/* +** ldnf1_s8_m8: +** ldnf1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_s8_m8, svint8_t, int8_t, + z0 = svldnf1_s8 (p0, x0 - svcntb () * 8), + z0 = svldnf1 (p0, x0 - svcntb () * 8)) + +/* +** ldnf1_s8_m9: +** decb x0, all, mul #9 +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_s8_m9, svint8_t, int8_t, + z0 = svldnf1_s8 (p0, x0 - svcntb () * 9), + z0 = svldnf1 (p0, x0 - svcntb () * 9)) + +/* +** ldnf1_vnum_s8_0: +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s8_0, svint8_t, int8_t, + z0 = svldnf1_vnum_s8 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_s8_1: +** ldnf1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s8_1, svint8_t, int8_t, + z0 = svldnf1_vnum_s8 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_s8_7: +** ldnf1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s8_7, svint8_t, int8_t, + z0 = svldnf1_vnum_s8 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_s8_8: +** incb x0, all, mul #8 +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s8_8, svint8_t, int8_t, + z0 = svldnf1_vnum_s8 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_s8_m1: +** ldnf1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s8_m1, svint8_t, int8_t, + z0 = svldnf1_vnum_s8 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, 
-1)) + +/* +** ldnf1_vnum_s8_m8: +** ldnf1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s8_m8, svint8_t, int8_t, + z0 = svldnf1_vnum_s8 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_s8_m9: +** decb x0, all, mul #9 +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s8_m9, svint8_t, int8_t, + z0 = svldnf1_vnum_s8 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_s8_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1b z0\.b, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_s8_x1, svint8_t, int8_t, + z0 = svldnf1_vnum_s8 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u16.c new file mode 100644 index 000000000..e920ac43b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u16.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_u16_base: +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u16_base, svuint16_t, uint16_t, + z0 = svldnf1_u16 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_u16_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1h z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_u16_index, svuint16_t, uint16_t, + z0 = svldnf1_u16 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_u16_1: +** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u16_1, svuint16_t, uint16_t, + z0 = svldnf1_u16 (p0, x0 + svcnth ()), + z0 = svldnf1 (p0, x0 + svcnth ())) + +/* +** ldnf1_u16_7: +** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u16_7, svuint16_t, uint16_t, + z0 = svldnf1_u16 (p0, x0 + svcnth () * 7), + z0 = svldnf1 (p0, x0 + svcnth () * 7)) + +/* +** ldnf1_u16_8: +** incb x0, all, mul #8 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u16_8, svuint16_t, uint16_t, + z0 = svldnf1_u16 (p0, x0 + svcnth () * 8), + z0 = svldnf1 (p0, x0 + svcnth () * 8)) + +/* +** ldnf1_u16_m1: +** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u16_m1, svuint16_t, uint16_t, + z0 = svldnf1_u16 (p0, x0 - svcnth ()), + z0 = svldnf1 (p0, x0 - svcnth ())) + +/* +** ldnf1_u16_m8: +** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u16_m8, svuint16_t, uint16_t, + z0 = svldnf1_u16 (p0, x0 - svcnth () * 8), + z0 = svldnf1 (p0, x0 - svcnth () * 8)) + +/* +** ldnf1_u16_m9: +** decb x0, all, mul #9 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u16_m9, svuint16_t, uint16_t, + z0 = svldnf1_u16 (p0, x0 - svcnth () * 9), + z0 = svldnf1 (p0, x0 - svcnth () * 9)) + +/* +** ldnf1_vnum_u16_0: +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u16_0, svuint16_t, uint16_t, + z0 = svldnf1_vnum_u16 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_u16_1: +** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u16_1, svuint16_t, uint16_t, + z0 = svldnf1_vnum_u16 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_u16_7: +** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u16_7, svuint16_t, uint16_t, + z0 = svldnf1_vnum_u16 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_u16_8: +** incb x0, all, mul #8 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u16_8, svuint16_t, uint16_t, + z0 = svldnf1_vnum_u16 (p0, x0, 8), + z0 = svldnf1_vnum 
(p0, x0, 8)) + +/* +** ldnf1_vnum_u16_m1: +** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u16_m1, svuint16_t, uint16_t, + z0 = svldnf1_vnum_u16 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_u16_m8: +** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u16_m8, svuint16_t, uint16_t, + z0 = svldnf1_vnum_u16 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_u16_m9: +** decb x0, all, mul #9 +** ldnf1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u16_m9, svuint16_t, uint16_t, + z0 = svldnf1_vnum_u16 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u16_x1, svuint16_t, uint16_t, + z0 = svldnf1_vnum_u16 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u32.c new file mode 100644 index 000000000..65e28c5c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_u32_base: +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u32_base, svuint32_t, uint32_t, + z0 = svldnf1_u32 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_u32_index: +** add (x[0-9]+), x0, x1, lsl 2 +** ldnf1w z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_u32_index, svuint32_t, uint32_t, + z0 = svldnf1_u32 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_u32_1: +** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u32_1, svuint32_t, uint32_t, + z0 = svldnf1_u32 (p0, x0 + svcntw ()), + z0 = svldnf1 (p0, x0 + svcntw ())) + +/* +** ldnf1_u32_7: +** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u32_7, svuint32_t, uint32_t, + z0 = svldnf1_u32 (p0, x0 + svcntw () * 7), + z0 = svldnf1 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1_u32_8: +** incb x0, all, mul #8 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u32_8, svuint32_t, uint32_t, + z0 = svldnf1_u32 (p0, x0 + svcntw () * 8), + z0 = svldnf1 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1_u32_m1: +** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u32_m1, svuint32_t, uint32_t, + z0 = svldnf1_u32 (p0, x0 - svcntw ()), + z0 = svldnf1 (p0, x0 - svcntw ())) + +/* +** ldnf1_u32_m8: +** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u32_m8, svuint32_t, uint32_t, + z0 = svldnf1_u32 (p0, x0 - svcntw () * 8), + z0 = svldnf1 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1_u32_m9: +** decb x0, all, mul #9 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u32_m9, svuint32_t, uint32_t, + z0 = svldnf1_u32 (p0, x0 - svcntw () * 9), + z0 = svldnf1 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1_vnum_u32_0: +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u32_0, svuint32_t, uint32_t, + z0 = svldnf1_vnum_u32 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_u32_1: +** ldnf1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u32_1, svuint32_t, uint32_t, + z0 = svldnf1_vnum_u32 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_u32_7: +** ldnf1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u32_7, svuint32_t, uint32_t, + z0 = svldnf1_vnum_u32 (p0, x0, 7), 
+ z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_u32_8: +** incb x0, all, mul #8 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u32_8, svuint32_t, uint32_t, + z0 = svldnf1_vnum_u32 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_u32_m1: +** ldnf1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u32_m1, svuint32_t, uint32_t, + z0 = svldnf1_vnum_u32 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_u32_m8: +** ldnf1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u32_m8, svuint32_t, uint32_t, + z0 = svldnf1_vnum_u32 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_u32_m9: +** decb x0, all, mul #9 +** ldnf1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u32_m9, svuint32_t, uint32_t, + z0 = svldnf1_vnum_u32 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u32_x1, svuint32_t, uint32_t, + z0 = svldnf1_vnum_u32 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u64.c new file mode 100644 index 000000000..70d3f27d8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_u64_base: +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u64_base, svuint64_t, uint64_t, + z0 = svldnf1_u64 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_u64_index: +** add (x[0-9]+), x0, x1, lsl 3 +** ldnf1d z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_u64_index, svuint64_t, uint64_t, + z0 = svldnf1_u64 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_u64_1: +** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u64_1, svuint64_t, uint64_t, + z0 = svldnf1_u64 (p0, x0 + svcntd ()), + z0 = svldnf1 (p0, x0 + svcntd ())) + +/* +** ldnf1_u64_7: +** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u64_7, svuint64_t, uint64_t, + z0 = svldnf1_u64 (p0, x0 + svcntd () * 7), + z0 = svldnf1 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1_u64_8: +** incb x0, all, mul #8 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u64_8, svuint64_t, uint64_t, + z0 = svldnf1_u64 (p0, x0 + svcntd () * 8), + z0 = svldnf1 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1_u64_m1: +** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u64_m1, svuint64_t, uint64_t, + z0 = svldnf1_u64 (p0, x0 - svcntd ()), + z0 = svldnf1 (p0, x0 - svcntd ())) + +/* +** ldnf1_u64_m8: +** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u64_m8, svuint64_t, uint64_t, + z0 = svldnf1_u64 (p0, x0 - svcntd () * 8), + z0 = svldnf1 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1_u64_m9: +** decb x0, all, mul #9 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u64_m9, svuint64_t, uint64_t, + z0 = svldnf1_u64 (p0, x0 - svcntd () * 9), + z0 = svldnf1 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1_vnum_u64_0: +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u64_0, svuint64_t, uint64_t, + z0 = svldnf1_vnum_u64 (p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_u64_1: +** ldnf1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u64_1, svuint64_t, uint64_t, + z0 
= svldnf1_vnum_u64 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_u64_7: +** ldnf1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u64_7, svuint64_t, uint64_t, + z0 = svldnf1_vnum_u64 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_u64_8: +** incb x0, all, mul #8 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u64_8, svuint64_t, uint64_t, + z0 = svldnf1_vnum_u64 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_u64_m1: +** ldnf1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u64_m1, svuint64_t, uint64_t, + z0 = svldnf1_vnum_u64 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_u64_m8: +** ldnf1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u64_m8, svuint64_t, uint64_t, + z0 = svldnf1_vnum_u64 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_u64_m9: +** decb x0, all, mul #9 +** ldnf1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u64_m9, svuint64_t, uint64_t, + z0 = svldnf1_vnum_u64 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u64_x1, svuint64_t, uint64_t, + z0 = svldnf1_vnum_u64 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u8.c new file mode 100644 index 000000000..5c29f1d19 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1_u8.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1_u8_base: +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u8_base, svuint8_t, uint8_t, + z0 = svldnf1_u8 (p0, x0), + z0 = svldnf1 (p0, x0)) + +/* +** ldnf1_u8_index: +** add (x[0-9]+), x0, x1 +** ldnf1b z0\.b, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1_u8_index, svuint8_t, uint8_t, + z0 = svldnf1_u8 (p0, x0 + x1), + z0 = svldnf1 (p0, x0 + x1)) + +/* +** ldnf1_u8_1: +** ldnf1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u8_1, svuint8_t, uint8_t, + z0 = svldnf1_u8 (p0, x0 + svcntb ()), + z0 = svldnf1 (p0, x0 + svcntb ())) + +/* +** ldnf1_u8_7: +** ldnf1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u8_7, svuint8_t, uint8_t, + z0 = svldnf1_u8 (p0, x0 + svcntb () * 7), + z0 = svldnf1 (p0, x0 + svcntb () * 7)) + +/* +** ldnf1_u8_8: +** incb x0, all, mul #8 +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u8_8, svuint8_t, uint8_t, + z0 = svldnf1_u8 (p0, x0 + svcntb () * 8), + z0 = svldnf1 (p0, x0 + svcntb () * 8)) + +/* +** ldnf1_u8_m1: +** ldnf1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u8_m1, svuint8_t, uint8_t, + z0 = svldnf1_u8 (p0, x0 - svcntb ()), + z0 = svldnf1 (p0, x0 - svcntb ())) + +/* +** ldnf1_u8_m8: +** ldnf1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_u8_m8, svuint8_t, uint8_t, + z0 = svldnf1_u8 (p0, x0 - svcntb () * 8), + z0 = svldnf1 (p0, x0 - svcntb () * 8)) + +/* +** ldnf1_u8_m9: +** decb x0, all, mul #9 +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_u8_m9, svuint8_t, uint8_t, + z0 = svldnf1_u8 (p0, x0 - svcntb () * 9), + z0 = svldnf1 (p0, x0 - svcntb () * 9)) + +/* +** ldnf1_vnum_u8_0: +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u8_0, svuint8_t, uint8_t, + z0 = svldnf1_vnum_u8 
(p0, x0, 0), + z0 = svldnf1_vnum (p0, x0, 0)) + +/* +** ldnf1_vnum_u8_1: +** ldnf1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u8_1, svuint8_t, uint8_t, + z0 = svldnf1_vnum_u8 (p0, x0, 1), + z0 = svldnf1_vnum (p0, x0, 1)) + +/* +** ldnf1_vnum_u8_7: +** ldnf1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u8_7, svuint8_t, uint8_t, + z0 = svldnf1_vnum_u8 (p0, x0, 7), + z0 = svldnf1_vnum (p0, x0, 7)) + +/* +** ldnf1_vnum_u8_8: +** incb x0, all, mul #8 +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u8_8, svuint8_t, uint8_t, + z0 = svldnf1_vnum_u8 (p0, x0, 8), + z0 = svldnf1_vnum (p0, x0, 8)) + +/* +** ldnf1_vnum_u8_m1: +** ldnf1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u8_m1, svuint8_t, uint8_t, + z0 = svldnf1_vnum_u8 (p0, x0, -1), + z0 = svldnf1_vnum (p0, x0, -1)) + +/* +** ldnf1_vnum_u8_m8: +** ldnf1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u8_m8, svuint8_t, uint8_t, + z0 = svldnf1_vnum_u8 (p0, x0, -8), + z0 = svldnf1_vnum (p0, x0, -8)) + +/* +** ldnf1_vnum_u8_m9: +** decb x0, all, mul #9 +** ldnf1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u8_m9, svuint8_t, uint8_t, + z0 = svldnf1_vnum_u8 (p0, x0, -9), + z0 = svldnf1_vnum (p0, x0, -9)) + +/* +** ldnf1_vnum_u8_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1b z0\.b, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1_vnum_u8_x1, svuint8_t, uint8_t, + z0 = svldnf1_vnum_u8 (p0, x0, x1), + z0 = svldnf1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s16.c new file mode 100644 index 000000000..e04b9a788 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s16.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sb_s16_base: +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s16_base, svint16_t, int8_t, + z0 = svldnf1sb_s16 (p0, x0), + z0 = svldnf1sb_s16 (p0, x0)) + +/* +** ldnf1sb_s16_index: +** add (x[0-9]+), x0, x1 +** ldnf1sb z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sb_s16_index, svint16_t, int8_t, + z0 = svldnf1sb_s16 (p0, x0 + x1), + z0 = svldnf1sb_s16 (p0, x0 + x1)) + +/* +** ldnf1sb_s16_1: +** ldnf1sb z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s16_1, svint16_t, int8_t, + z0 = svldnf1sb_s16 (p0, x0 + svcnth ()), + z0 = svldnf1sb_s16 (p0, x0 + svcnth ())) + +/* +** ldnf1sb_s16_7: +** ldnf1sb z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s16_7, svint16_t, int8_t, + z0 = svldnf1sb_s16 (p0, x0 + svcnth () * 7), + z0 = svldnf1sb_s16 (p0, x0 + svcnth () * 7)) + +/* +** ldnf1sb_s16_8: +** incb x0, all, mul #4 +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s16_8, svint16_t, int8_t, + z0 = svldnf1sb_s16 (p0, x0 + svcnth () * 8), + z0 = svldnf1sb_s16 (p0, x0 + svcnth () * 8)) + +/* +** ldnf1sb_s16_m1: +** ldnf1sb z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s16_m1, svint16_t, int8_t, + z0 = svldnf1sb_s16 (p0, x0 - svcnth ()), + z0 = svldnf1sb_s16 (p0, x0 - svcnth ())) + +/* +** ldnf1sb_s16_m8: +** ldnf1sb z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s16_m8, svint16_t, int8_t, + z0 = svldnf1sb_s16 (p0, x0 - svcnth () * 8), + z0 = svldnf1sb_s16 (p0, x0 - svcnth () * 8)) + +/* +** ldnf1sb_s16_m9: +** dech x0, all, mul #9 +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s16_m9, svint16_t, int8_t, + z0 = svldnf1sb_s16 (p0, x0 - svcnth () * 9), + z0 = svldnf1sb_s16 (p0, x0 - svcnth () * 9)) + +/* +** ldnf1sb_vnum_s16_0: +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s16_0, svint16_t, int8_t, + z0 = svldnf1sb_vnum_s16 (p0, x0, 0), + z0 = svldnf1sb_vnum_s16 (p0, x0, 0)) + +/* +** ldnf1sb_vnum_s16_1: +** ldnf1sb z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s16_1, svint16_t, int8_t, + z0 = svldnf1sb_vnum_s16 (p0, x0, 1), + z0 = svldnf1sb_vnum_s16 (p0, x0, 1)) + +/* +** ldnf1sb_vnum_s16_7: +** ldnf1sb z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s16_7, svint16_t, int8_t, + z0 = svldnf1sb_vnum_s16 (p0, x0, 7), + z0 = svldnf1sb_vnum_s16 (p0, x0, 7)) + +/* +** ldnf1sb_vnum_s16_8: +** incb x0, all, mul #4 +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s16_8, svint16_t, int8_t, + z0 = svldnf1sb_vnum_s16 (p0, x0, 8), + z0 = svldnf1sb_vnum_s16 (p0, x0, 8)) + +/* +** ldnf1sb_vnum_s16_m1: +** ldnf1sb z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s16_m1, svint16_t, int8_t, + z0 = svldnf1sb_vnum_s16 (p0, x0, -1), + z0 = svldnf1sb_vnum_s16 (p0, x0, -1)) + +/* +** ldnf1sb_vnum_s16_m8: +** ldnf1sb z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s16_m8, svint16_t, int8_t, + z0 = svldnf1sb_vnum_s16 (p0, x0, -8), + z0 = svldnf1sb_vnum_s16 (p0, x0, -8)) + +/* +** ldnf1sb_vnum_s16_m9: +** dech x0, all, mul #9 +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s16_m9, svint16_t, int8_t, + z0 = svldnf1sb_vnum_s16 (p0, x0, -9), + z0 = svldnf1sb_vnum_s16 (p0, x0, -9)) + +/* +** ldnf1sb_vnum_s16_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sb z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s16_x1, svint16_t, int8_t, + z0 = svldnf1sb_vnum_s16 (p0, x0, 
x1), + z0 = svldnf1sb_vnum_s16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s32.c new file mode 100644 index 000000000..0553fc98d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sb_s32_base: +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s32_base, svint32_t, int8_t, + z0 = svldnf1sb_s32 (p0, x0), + z0 = svldnf1sb_s32 (p0, x0)) + +/* +** ldnf1sb_s32_index: +** add (x[0-9]+), x0, x1 +** ldnf1sb z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sb_s32_index, svint32_t, int8_t, + z0 = svldnf1sb_s32 (p0, x0 + x1), + z0 = svldnf1sb_s32 (p0, x0 + x1)) + +/* +** ldnf1sb_s32_1: +** ldnf1sb z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s32_1, svint32_t, int8_t, + z0 = svldnf1sb_s32 (p0, x0 + svcntw ()), + z0 = svldnf1sb_s32 (p0, x0 + svcntw ())) + +/* +** ldnf1sb_s32_7: +** ldnf1sb z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s32_7, svint32_t, int8_t, + z0 = svldnf1sb_s32 (p0, x0 + svcntw () * 7), + z0 = svldnf1sb_s32 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1sb_s32_8: +** incb x0, all, mul #2 +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s32_8, svint32_t, int8_t, + z0 = svldnf1sb_s32 (p0, x0 + svcntw () * 8), + z0 = svldnf1sb_s32 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1sb_s32_m1: +** ldnf1sb z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s32_m1, svint32_t, int8_t, + z0 = svldnf1sb_s32 (p0, x0 - svcntw ()), + z0 = svldnf1sb_s32 (p0, x0 - svcntw ())) + +/* +** ldnf1sb_s32_m8: +** ldnf1sb z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s32_m8, svint32_t, int8_t, + z0 = svldnf1sb_s32 (p0, x0 - svcntw () * 8), + z0 = svldnf1sb_s32 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1sb_s32_m9: +** decw x0, all, mul #9 +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s32_m9, svint32_t, int8_t, + z0 = svldnf1sb_s32 (p0, x0 - svcntw () * 9), + z0 = svldnf1sb_s32 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1sb_vnum_s32_0: +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s32_0, svint32_t, int8_t, + z0 = svldnf1sb_vnum_s32 (p0, x0, 0), + z0 = svldnf1sb_vnum_s32 (p0, x0, 0)) + +/* +** ldnf1sb_vnum_s32_1: +** ldnf1sb z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s32_1, svint32_t, int8_t, + z0 = svldnf1sb_vnum_s32 (p0, x0, 1), + z0 = svldnf1sb_vnum_s32 (p0, x0, 1)) + +/* +** ldnf1sb_vnum_s32_7: +** ldnf1sb z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s32_7, svint32_t, int8_t, + z0 = svldnf1sb_vnum_s32 (p0, x0, 7), + z0 = svldnf1sb_vnum_s32 (p0, x0, 7)) + +/* +** ldnf1sb_vnum_s32_8: +** incb x0, all, mul #2 +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s32_8, svint32_t, int8_t, + z0 = svldnf1sb_vnum_s32 (p0, x0, 8), + z0 = svldnf1sb_vnum_s32 (p0, x0, 8)) + +/* +** ldnf1sb_vnum_s32_m1: +** ldnf1sb z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s32_m1, svint32_t, int8_t, + z0 = svldnf1sb_vnum_s32 (p0, x0, -1), + z0 = svldnf1sb_vnum_s32 (p0, x0, -1)) + +/* +** ldnf1sb_vnum_s32_m8: +** ldnf1sb z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s32_m8, svint32_t, int8_t, + z0 = svldnf1sb_vnum_s32 (p0, x0, -8), + z0 = svldnf1sb_vnum_s32 (p0, x0, -8)) + +/* +** ldnf1sb_vnum_s32_m9: +** decw x0, all, mul #9 
+** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s32_m9, svint32_t, int8_t, + z0 = svldnf1sb_vnum_s32 (p0, x0, -9), + z0 = svldnf1sb_vnum_s32 (p0, x0, -9)) + +/* +** ldnf1sb_vnum_s32_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sb z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s32_x1, svint32_t, int8_t, + z0 = svldnf1sb_vnum_s32 (p0, x0, x1), + z0 = svldnf1sb_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s64.c new file mode 100644 index 000000000..61a474fdf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_s64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sb_s64_base: +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s64_base, svint64_t, int8_t, + z0 = svldnf1sb_s64 (p0, x0), + z0 = svldnf1sb_s64 (p0, x0)) + +/* +** ldnf1sb_s64_index: +** add (x[0-9]+), x0, x1 +** ldnf1sb z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sb_s64_index, svint64_t, int8_t, + z0 = svldnf1sb_s64 (p0, x0 + x1), + z0 = svldnf1sb_s64 (p0, x0 + x1)) + +/* +** ldnf1sb_s64_1: +** ldnf1sb z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s64_1, svint64_t, int8_t, + z0 = svldnf1sb_s64 (p0, x0 + svcntd ()), + z0 = svldnf1sb_s64 (p0, x0 + svcntd ())) + +/* +** ldnf1sb_s64_7: +** ldnf1sb z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s64_7, svint64_t, int8_t, + z0 = svldnf1sb_s64 (p0, x0 + svcntd () * 7), + z0 = svldnf1sb_s64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1sb_s64_8: +** incb x0 +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s64_8, svint64_t, int8_t, + z0 = svldnf1sb_s64 (p0, x0 + svcntd () * 8), + z0 = svldnf1sb_s64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1sb_s64_m1: +** ldnf1sb z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s64_m1, svint64_t, int8_t, + z0 = svldnf1sb_s64 (p0, x0 - svcntd ()), + z0 = svldnf1sb_s64 (p0, x0 - svcntd ())) + +/* +** ldnf1sb_s64_m8: +** ldnf1sb z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_s64_m8, svint64_t, int8_t, + z0 = svldnf1sb_s64 (p0, x0 - svcntd () * 8), + z0 = svldnf1sb_s64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1sb_s64_m9: +** decd x0, all, mul #9 +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_s64_m9, svint64_t, int8_t, + z0 = svldnf1sb_s64 (p0, x0 - svcntd () * 9), + z0 = svldnf1sb_s64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1sb_vnum_s64_0: +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s64_0, svint64_t, int8_t, + z0 = svldnf1sb_vnum_s64 (p0, x0, 0), + z0 = svldnf1sb_vnum_s64 (p0, x0, 0)) + +/* +** ldnf1sb_vnum_s64_1: +** ldnf1sb z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s64_1, svint64_t, int8_t, + z0 = svldnf1sb_vnum_s64 (p0, x0, 1), + z0 = svldnf1sb_vnum_s64 (p0, x0, 1)) + +/* +** ldnf1sb_vnum_s64_7: +** ldnf1sb z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s64_7, svint64_t, int8_t, + z0 = svldnf1sb_vnum_s64 (p0, x0, 7), + z0 = svldnf1sb_vnum_s64 (p0, x0, 7)) + +/* +** ldnf1sb_vnum_s64_8: +** incb x0 +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s64_8, svint64_t, int8_t, + z0 = svldnf1sb_vnum_s64 (p0, x0, 8), + z0 = svldnf1sb_vnum_s64 (p0, x0, 8)) + +/* +** ldnf1sb_vnum_s64_m1: +** ldnf1sb z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s64_m1, 
svint64_t, int8_t, + z0 = svldnf1sb_vnum_s64 (p0, x0, -1), + z0 = svldnf1sb_vnum_s64 (p0, x0, -1)) + +/* +** ldnf1sb_vnum_s64_m8: +** ldnf1sb z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s64_m8, svint64_t, int8_t, + z0 = svldnf1sb_vnum_s64 (p0, x0, -8), + z0 = svldnf1sb_vnum_s64 (p0, x0, -8)) + +/* +** ldnf1sb_vnum_s64_m9: +** decd x0, all, mul #9 +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s64_m9, svint64_t, int8_t, + z0 = svldnf1sb_vnum_s64 (p0, x0, -9), + z0 = svldnf1sb_vnum_s64 (p0, x0, -9)) + +/* +** ldnf1sb_vnum_s64_x1: +** cntd (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sb z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_s64_x1, svint64_t, int8_t, + z0 = svldnf1sb_vnum_s64 (p0, x0, x1), + z0 = svldnf1sb_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u16.c new file mode 100644 index 000000000..be63d8bf9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u16.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sb_u16_base: +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u16_base, svuint16_t, int8_t, + z0 = svldnf1sb_u16 (p0, x0), + z0 = svldnf1sb_u16 (p0, x0)) + +/* +** ldnf1sb_u16_index: +** add (x[0-9]+), x0, x1 +** ldnf1sb z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sb_u16_index, svuint16_t, int8_t, + z0 = svldnf1sb_u16 (p0, x0 + x1), + z0 = svldnf1sb_u16 (p0, x0 + x1)) + +/* +** ldnf1sb_u16_1: +** ldnf1sb z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u16_1, svuint16_t, int8_t, + z0 = svldnf1sb_u16 (p0, x0 + svcnth ()), + z0 = svldnf1sb_u16 (p0, x0 + svcnth ())) + +/* +** ldnf1sb_u16_7: +** ldnf1sb z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u16_7, svuint16_t, int8_t, + z0 = svldnf1sb_u16 (p0, x0 + svcnth () * 7), + z0 = svldnf1sb_u16 (p0, x0 + svcnth () * 7)) + +/* +** ldnf1sb_u16_8: +** incb x0, all, mul #4 +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u16_8, svuint16_t, int8_t, + z0 = svldnf1sb_u16 (p0, x0 + svcnth () * 8), + z0 = svldnf1sb_u16 (p0, x0 + svcnth () * 8)) + +/* +** ldnf1sb_u16_m1: +** ldnf1sb z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u16_m1, svuint16_t, int8_t, + z0 = svldnf1sb_u16 (p0, x0 - svcnth ()), + z0 = svldnf1sb_u16 (p0, x0 - svcnth ())) + +/* +** ldnf1sb_u16_m8: +** ldnf1sb z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u16_m8, svuint16_t, int8_t, + z0 = svldnf1sb_u16 (p0, x0 - svcnth () * 8), + z0 = svldnf1sb_u16 (p0, x0 - svcnth () * 8)) + +/* +** ldnf1sb_u16_m9: +** dech x0, all, mul #9 +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u16_m9, svuint16_t, int8_t, + z0 = svldnf1sb_u16 (p0, x0 - svcnth () * 9), + z0 = svldnf1sb_u16 (p0, x0 - svcnth () * 9)) + +/* +** ldnf1sb_vnum_u16_0: +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u16_0, svuint16_t, int8_t, + z0 = svldnf1sb_vnum_u16 (p0, x0, 0), + z0 = svldnf1sb_vnum_u16 (p0, x0, 0)) + +/* +** ldnf1sb_vnum_u16_1: +** ldnf1sb z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u16_1, svuint16_t, int8_t, + z0 = svldnf1sb_vnum_u16 (p0, x0, 1), + z0 = svldnf1sb_vnum_u16 (p0, x0, 1)) + +/* +** ldnf1sb_vnum_u16_7: +** ldnf1sb z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u16_7, svuint16_t, int8_t, + z0 = 
svldnf1sb_vnum_u16 (p0, x0, 7), + z0 = svldnf1sb_vnum_u16 (p0, x0, 7)) + +/* +** ldnf1sb_vnum_u16_8: +** incb x0, all, mul #4 +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u16_8, svuint16_t, int8_t, + z0 = svldnf1sb_vnum_u16 (p0, x0, 8), + z0 = svldnf1sb_vnum_u16 (p0, x0, 8)) + +/* +** ldnf1sb_vnum_u16_m1: +** ldnf1sb z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u16_m1, svuint16_t, int8_t, + z0 = svldnf1sb_vnum_u16 (p0, x0, -1), + z0 = svldnf1sb_vnum_u16 (p0, x0, -1)) + +/* +** ldnf1sb_vnum_u16_m8: +** ldnf1sb z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u16_m8, svuint16_t, int8_t, + z0 = svldnf1sb_vnum_u16 (p0, x0, -8), + z0 = svldnf1sb_vnum_u16 (p0, x0, -8)) + +/* +** ldnf1sb_vnum_u16_m9: +** dech x0, all, mul #9 +** ldnf1sb z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u16_m9, svuint16_t, int8_t, + z0 = svldnf1sb_vnum_u16 (p0, x0, -9), + z0 = svldnf1sb_vnum_u16 (p0, x0, -9)) + +/* +** ldnf1sb_vnum_u16_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sb z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u16_x1, svuint16_t, int8_t, + z0 = svldnf1sb_vnum_u16 (p0, x0, x1), + z0 = svldnf1sb_vnum_u16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u32.c new file mode 100644 index 000000000..4f52490b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sb_u32_base: +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u32_base, svuint32_t, int8_t, + z0 = svldnf1sb_u32 (p0, x0), + z0 = svldnf1sb_u32 (p0, x0)) + +/* +** ldnf1sb_u32_index: +** add (x[0-9]+), x0, x1 +** ldnf1sb z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sb_u32_index, svuint32_t, int8_t, + z0 = svldnf1sb_u32 (p0, x0 + x1), + z0 = svldnf1sb_u32 (p0, x0 + x1)) + +/* +** ldnf1sb_u32_1: +** ldnf1sb z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u32_1, svuint32_t, int8_t, + z0 = svldnf1sb_u32 (p0, x0 + svcntw ()), + z0 = svldnf1sb_u32 (p0, x0 + svcntw ())) + +/* +** ldnf1sb_u32_7: +** ldnf1sb z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u32_7, svuint32_t, int8_t, + z0 = svldnf1sb_u32 (p0, x0 + svcntw () * 7), + z0 = svldnf1sb_u32 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1sb_u32_8: +** incb x0, all, mul #2 +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u32_8, svuint32_t, int8_t, + z0 = svldnf1sb_u32 (p0, x0 + svcntw () * 8), + z0 = svldnf1sb_u32 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1sb_u32_m1: +** ldnf1sb z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u32_m1, svuint32_t, int8_t, + z0 = svldnf1sb_u32 (p0, x0 - svcntw ()), + z0 = svldnf1sb_u32 (p0, x0 - svcntw ())) + +/* +** ldnf1sb_u32_m8: +** ldnf1sb z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u32_m8, svuint32_t, int8_t, + z0 = svldnf1sb_u32 (p0, x0 - svcntw () * 8), + z0 = svldnf1sb_u32 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1sb_u32_m9: +** decw x0, all, mul #9 +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u32_m9, svuint32_t, int8_t, + z0 = svldnf1sb_u32 (p0, x0 - svcntw () * 9), + z0 = svldnf1sb_u32 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1sb_vnum_u32_0: +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u32_0, svuint32_t, int8_t, + z0 = svldnf1sb_vnum_u32 
(p0, x0, 0), + z0 = svldnf1sb_vnum_u32 (p0, x0, 0)) + +/* +** ldnf1sb_vnum_u32_1: +** ldnf1sb z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u32_1, svuint32_t, int8_t, + z0 = svldnf1sb_vnum_u32 (p0, x0, 1), + z0 = svldnf1sb_vnum_u32 (p0, x0, 1)) + +/* +** ldnf1sb_vnum_u32_7: +** ldnf1sb z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u32_7, svuint32_t, int8_t, + z0 = svldnf1sb_vnum_u32 (p0, x0, 7), + z0 = svldnf1sb_vnum_u32 (p0, x0, 7)) + +/* +** ldnf1sb_vnum_u32_8: +** incb x0, all, mul #2 +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u32_8, svuint32_t, int8_t, + z0 = svldnf1sb_vnum_u32 (p0, x0, 8), + z0 = svldnf1sb_vnum_u32 (p0, x0, 8)) + +/* +** ldnf1sb_vnum_u32_m1: +** ldnf1sb z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u32_m1, svuint32_t, int8_t, + z0 = svldnf1sb_vnum_u32 (p0, x0, -1), + z0 = svldnf1sb_vnum_u32 (p0, x0, -1)) + +/* +** ldnf1sb_vnum_u32_m8: +** ldnf1sb z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u32_m8, svuint32_t, int8_t, + z0 = svldnf1sb_vnum_u32 (p0, x0, -8), + z0 = svldnf1sb_vnum_u32 (p0, x0, -8)) + +/* +** ldnf1sb_vnum_u32_m9: +** decw x0, all, mul #9 +** ldnf1sb z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u32_m9, svuint32_t, int8_t, + z0 = svldnf1sb_vnum_u32 (p0, x0, -9), + z0 = svldnf1sb_vnum_u32 (p0, x0, -9)) + +/* +** ldnf1sb_vnum_u32_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sb z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u32_x1, svuint32_t, int8_t, + z0 = svldnf1sb_vnum_u32 (p0, x0, x1), + z0 = svldnf1sb_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u64.c new file mode 100644 index 000000000..73f50d182 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sb_u64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sb_u64_base: +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u64_base, svuint64_t, int8_t, + z0 = svldnf1sb_u64 (p0, x0), + z0 = svldnf1sb_u64 (p0, x0)) + +/* +** ldnf1sb_u64_index: +** add (x[0-9]+), x0, x1 +** ldnf1sb z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sb_u64_index, svuint64_t, int8_t, + z0 = svldnf1sb_u64 (p0, x0 + x1), + z0 = svldnf1sb_u64 (p0, x0 + x1)) + +/* +** ldnf1sb_u64_1: +** ldnf1sb z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u64_1, svuint64_t, int8_t, + z0 = svldnf1sb_u64 (p0, x0 + svcntd ()), + z0 = svldnf1sb_u64 (p0, x0 + svcntd ())) + +/* +** ldnf1sb_u64_7: +** ldnf1sb z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u64_7, svuint64_t, int8_t, + z0 = svldnf1sb_u64 (p0, x0 + svcntd () * 7), + z0 = svldnf1sb_u64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1sb_u64_8: +** incb x0 +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u64_8, svuint64_t, int8_t, + z0 = svldnf1sb_u64 (p0, x0 + svcntd () * 8), + z0 = svldnf1sb_u64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1sb_u64_m1: +** ldnf1sb z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u64_m1, svuint64_t, int8_t, + z0 = svldnf1sb_u64 (p0, x0 - svcntd ()), + z0 = svldnf1sb_u64 (p0, x0 - svcntd ())) + +/* +** ldnf1sb_u64_m8: +** ldnf1sb z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_u64_m8, svuint64_t, int8_t, + z0 = svldnf1sb_u64 (p0, x0 - svcntd () * 8), + z0 = svldnf1sb_u64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1sb_u64_m9: +** decd x0, all, mul #9 +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_u64_m9, svuint64_t, int8_t, + z0 = svldnf1sb_u64 (p0, x0 - svcntd () * 9), + z0 = svldnf1sb_u64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1sb_vnum_u64_0: +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u64_0, svuint64_t, int8_t, + z0 = svldnf1sb_vnum_u64 (p0, x0, 0), + z0 = svldnf1sb_vnum_u64 (p0, x0, 0)) + +/* +** ldnf1sb_vnum_u64_1: +** ldnf1sb z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u64_1, svuint64_t, int8_t, + z0 = svldnf1sb_vnum_u64 (p0, x0, 1), + z0 = svldnf1sb_vnum_u64 (p0, x0, 1)) + +/* +** ldnf1sb_vnum_u64_7: +** ldnf1sb z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u64_7, svuint64_t, int8_t, + z0 = svldnf1sb_vnum_u64 (p0, x0, 7), + z0 = svldnf1sb_vnum_u64 (p0, x0, 7)) + +/* +** ldnf1sb_vnum_u64_8: +** incb x0 +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u64_8, svuint64_t, int8_t, + z0 = svldnf1sb_vnum_u64 (p0, x0, 8), + z0 = svldnf1sb_vnum_u64 (p0, x0, 8)) + +/* +** ldnf1sb_vnum_u64_m1: +** ldnf1sb z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u64_m1, svuint64_t, int8_t, + z0 = svldnf1sb_vnum_u64 (p0, x0, -1), + z0 = svldnf1sb_vnum_u64 (p0, x0, -1)) + +/* +** ldnf1sb_vnum_u64_m8: +** ldnf1sb z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u64_m8, svuint64_t, int8_t, + z0 = svldnf1sb_vnum_u64 (p0, x0, -8), + z0 = svldnf1sb_vnum_u64 (p0, x0, -8)) + +/* +** ldnf1sb_vnum_u64_m9: +** decd x0, all, mul #9 +** ldnf1sb z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u64_m9, svuint64_t, int8_t, + z0 = svldnf1sb_vnum_u64 (p0, x0, -9), + z0 = svldnf1sb_vnum_u64 (p0, x0, -9)) + +/* +** ldnf1sb_vnum_u64_x1: +** cntd (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sb z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sb_vnum_u64_x1, svuint64_t, int8_t, + z0 = svldnf1sb_vnum_u64 (p0, x0, x1), + z0 = 
svldnf1sb_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s32.c new file mode 100644 index 000000000..08c7dc6dd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sh_s32_base: +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_s32_base, svint32_t, int16_t, + z0 = svldnf1sh_s32 (p0, x0), + z0 = svldnf1sh_s32 (p0, x0)) + +/* +** ldnf1sh_s32_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1sh z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sh_s32_index, svint32_t, int16_t, + z0 = svldnf1sh_s32 (p0, x0 + x1), + z0 = svldnf1sh_s32 (p0, x0 + x1)) + +/* +** ldnf1sh_s32_1: +** ldnf1sh z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_s32_1, svint32_t, int16_t, + z0 = svldnf1sh_s32 (p0, x0 + svcntw ()), + z0 = svldnf1sh_s32 (p0, x0 + svcntw ())) + +/* +** ldnf1sh_s32_7: +** ldnf1sh z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_s32_7, svint32_t, int16_t, + z0 = svldnf1sh_s32 (p0, x0 + svcntw () * 7), + z0 = svldnf1sh_s32 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1sh_s32_8: +** incb x0, all, mul #4 +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_s32_8, svint32_t, int16_t, + z0 = svldnf1sh_s32 (p0, x0 + svcntw () * 8), + z0 = svldnf1sh_s32 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1sh_s32_m1: +** ldnf1sh z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_s32_m1, svint32_t, int16_t, + z0 = svldnf1sh_s32 (p0, x0 - svcntw ()), + z0 = svldnf1sh_s32 (p0, x0 - svcntw ())) + +/* +** ldnf1sh_s32_m8: +** ldnf1sh z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_s32_m8, svint32_t, int16_t, + z0 = svldnf1sh_s32 (p0, x0 - svcntw () * 8), + z0 = svldnf1sh_s32 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1sh_s32_m9: +** dech x0, all, mul #9 +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_s32_m9, svint32_t, int16_t, + z0 = svldnf1sh_s32 (p0, x0 - svcntw () * 9), + z0 = svldnf1sh_s32 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1sh_vnum_s32_0: +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s32_0, svint32_t, int16_t, + z0 = svldnf1sh_vnum_s32 (p0, x0, 0), + z0 = svldnf1sh_vnum_s32 (p0, x0, 0)) + +/* +** ldnf1sh_vnum_s32_1: +** ldnf1sh z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s32_1, svint32_t, int16_t, + z0 = svldnf1sh_vnum_s32 (p0, x0, 1), + z0 = svldnf1sh_vnum_s32 (p0, x0, 1)) + +/* +** ldnf1sh_vnum_s32_7: +** ldnf1sh z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s32_7, svint32_t, int16_t, + z0 = svldnf1sh_vnum_s32 (p0, x0, 7), + z0 = svldnf1sh_vnum_s32 (p0, x0, 7)) + +/* +** ldnf1sh_vnum_s32_8: +** incb x0, all, mul #4 +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s32_8, svint32_t, int16_t, + z0 = svldnf1sh_vnum_s32 (p0, x0, 8), + z0 = svldnf1sh_vnum_s32 (p0, x0, 8)) + +/* +** ldnf1sh_vnum_s32_m1: +** ldnf1sh z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s32_m1, svint32_t, int16_t, + z0 = svldnf1sh_vnum_s32 (p0, x0, -1), + z0 = svldnf1sh_vnum_s32 (p0, x0, -1)) + +/* +** ldnf1sh_vnum_s32_m8: +** ldnf1sh z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s32_m8, svint32_t, int16_t, + z0 = svldnf1sh_vnum_s32 (p0, x0, -8), + z0 = svldnf1sh_vnum_s32 (p0, x0, -8)) + +/* +** ldnf1sh_vnum_s32_m9: +** dech x0, all, 
mul #9 +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s32_m9, svint32_t, int16_t, + z0 = svldnf1sh_vnum_s32 (p0, x0, -9), + z0 = svldnf1sh_vnum_s32 (p0, x0, -9)) + +/* +** ldnf1sh_vnum_s32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sh z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s32_x1, svint32_t, int16_t, + z0 = svldnf1sh_vnum_s32 (p0, x0, x1), + z0 = svldnf1sh_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s64.c new file mode 100644 index 000000000..6a41bc26b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_s64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sh_s64_base: +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_s64_base, svint64_t, int16_t, + z0 = svldnf1sh_s64 (p0, x0), + z0 = svldnf1sh_s64 (p0, x0)) + +/* +** ldnf1sh_s64_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1sh z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sh_s64_index, svint64_t, int16_t, + z0 = svldnf1sh_s64 (p0, x0 + x1), + z0 = svldnf1sh_s64 (p0, x0 + x1)) + +/* +** ldnf1sh_s64_1: +** ldnf1sh z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_s64_1, svint64_t, int16_t, + z0 = svldnf1sh_s64 (p0, x0 + svcntd ()), + z0 = svldnf1sh_s64 (p0, x0 + svcntd ())) + +/* +** ldnf1sh_s64_7: +** ldnf1sh z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_s64_7, svint64_t, int16_t, + z0 = svldnf1sh_s64 (p0, x0 + svcntd () * 7), + z0 = svldnf1sh_s64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1sh_s64_8: +** incb x0, all, mul #2 +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_s64_8, svint64_t, int16_t, + z0 = svldnf1sh_s64 (p0, x0 + svcntd () * 8), + z0 = svldnf1sh_s64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1sh_s64_m1: +** ldnf1sh z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_s64_m1, svint64_t, int16_t, + z0 = svldnf1sh_s64 (p0, x0 - svcntd ()), + z0 = svldnf1sh_s64 (p0, x0 - svcntd ())) + +/* +** ldnf1sh_s64_m8: +** ldnf1sh z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_s64_m8, svint64_t, int16_t, + z0 = svldnf1sh_s64 (p0, x0 - svcntd () * 8), + z0 = svldnf1sh_s64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1sh_s64_m9: +** decw x0, all, mul #9 +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_s64_m9, svint64_t, int16_t, + z0 = svldnf1sh_s64 (p0, x0 - svcntd () * 9), + z0 = svldnf1sh_s64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1sh_vnum_s64_0: +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s64_0, svint64_t, int16_t, + z0 = svldnf1sh_vnum_s64 (p0, x0, 0), + z0 = svldnf1sh_vnum_s64 (p0, x0, 0)) + +/* +** ldnf1sh_vnum_s64_1: +** ldnf1sh z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s64_1, svint64_t, int16_t, + z0 = svldnf1sh_vnum_s64 (p0, x0, 1), + z0 = svldnf1sh_vnum_s64 (p0, x0, 1)) + +/* +** ldnf1sh_vnum_s64_7: +** ldnf1sh z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s64_7, svint64_t, int16_t, + z0 = svldnf1sh_vnum_s64 (p0, x0, 7), + z0 = svldnf1sh_vnum_s64 (p0, x0, 7)) + +/* +** ldnf1sh_vnum_s64_8: +** incb x0, all, mul #2 +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s64_8, svint64_t, int16_t, + z0 = svldnf1sh_vnum_s64 (p0, x0, 8), + z0 = svldnf1sh_vnum_s64 (p0, x0, 8)) + +/* +** ldnf1sh_vnum_s64_m1: +** ldnf1sh z0\.d, p0/z, \[x0, #-1, mul 
vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s64_m1, svint64_t, int16_t, + z0 = svldnf1sh_vnum_s64 (p0, x0, -1), + z0 = svldnf1sh_vnum_s64 (p0, x0, -1)) + +/* +** ldnf1sh_vnum_s64_m8: +** ldnf1sh z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s64_m8, svint64_t, int16_t, + z0 = svldnf1sh_vnum_s64 (p0, x0, -8), + z0 = svldnf1sh_vnum_s64 (p0, x0, -8)) + +/* +** ldnf1sh_vnum_s64_m9: +** decw x0, all, mul #9 +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s64_m9, svint64_t, int16_t, + z0 = svldnf1sh_vnum_s64 (p0, x0, -9), + z0 = svldnf1sh_vnum_s64 (p0, x0, -9)) + +/* +** ldnf1sh_vnum_s64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sh z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_s64_x1, svint64_t, int16_t, + z0 = svldnf1sh_vnum_s64 (p0, x0, x1), + z0 = svldnf1sh_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u32.c new file mode 100644 index 000000000..2f7718730 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sh_u32_base: +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_u32_base, svuint32_t, int16_t, + z0 = svldnf1sh_u32 (p0, x0), + z0 = svldnf1sh_u32 (p0, x0)) + +/* +** ldnf1sh_u32_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1sh z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sh_u32_index, svuint32_t, int16_t, + z0 = svldnf1sh_u32 (p0, x0 + x1), + z0 = svldnf1sh_u32 (p0, x0 + x1)) + +/* +** ldnf1sh_u32_1: +** ldnf1sh z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_u32_1, svuint32_t, int16_t, + z0 = svldnf1sh_u32 (p0, x0 + svcntw ()), + z0 = svldnf1sh_u32 (p0, x0 + svcntw ())) + +/* +** ldnf1sh_u32_7: +** ldnf1sh z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_u32_7, svuint32_t, int16_t, + z0 = svldnf1sh_u32 (p0, x0 + svcntw () * 7), + z0 = svldnf1sh_u32 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1sh_u32_8: +** incb x0, all, mul #4 +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_u32_8, svuint32_t, int16_t, + z0 = svldnf1sh_u32 (p0, x0 + svcntw () * 8), + z0 = svldnf1sh_u32 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1sh_u32_m1: +** ldnf1sh z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_u32_m1, svuint32_t, int16_t, + z0 = svldnf1sh_u32 (p0, x0 - svcntw ()), + z0 = svldnf1sh_u32 (p0, x0 - svcntw ())) + +/* +** ldnf1sh_u32_m8: +** ldnf1sh z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_u32_m8, svuint32_t, int16_t, + z0 = svldnf1sh_u32 (p0, x0 - svcntw () * 8), + z0 = svldnf1sh_u32 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1sh_u32_m9: +** dech x0, all, mul #9 +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_u32_m9, svuint32_t, int16_t, + z0 = svldnf1sh_u32 (p0, x0 - svcntw () * 9), + z0 = svldnf1sh_u32 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1sh_vnum_u32_0: +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u32_0, svuint32_t, int16_t, + z0 = svldnf1sh_vnum_u32 (p0, x0, 0), + z0 = svldnf1sh_vnum_u32 (p0, x0, 0)) + +/* +** ldnf1sh_vnum_u32_1: +** ldnf1sh z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u32_1, svuint32_t, int16_t, + z0 = svldnf1sh_vnum_u32 (p0, x0, 1), + z0 = svldnf1sh_vnum_u32 (p0, x0, 1)) + +/* +** ldnf1sh_vnum_u32_7: +** ldnf1sh z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ 
+TEST_LOAD (ldnf1sh_vnum_u32_7, svuint32_t, int16_t, + z0 = svldnf1sh_vnum_u32 (p0, x0, 7), + z0 = svldnf1sh_vnum_u32 (p0, x0, 7)) + +/* +** ldnf1sh_vnum_u32_8: +** incb x0, all, mul #4 +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u32_8, svuint32_t, int16_t, + z0 = svldnf1sh_vnum_u32 (p0, x0, 8), + z0 = svldnf1sh_vnum_u32 (p0, x0, 8)) + +/* +** ldnf1sh_vnum_u32_m1: +** ldnf1sh z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u32_m1, svuint32_t, int16_t, + z0 = svldnf1sh_vnum_u32 (p0, x0, -1), + z0 = svldnf1sh_vnum_u32 (p0, x0, -1)) + +/* +** ldnf1sh_vnum_u32_m8: +** ldnf1sh z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u32_m8, svuint32_t, int16_t, + z0 = svldnf1sh_vnum_u32 (p0, x0, -8), + z0 = svldnf1sh_vnum_u32 (p0, x0, -8)) + +/* +** ldnf1sh_vnum_u32_m9: +** dech x0, all, mul #9 +** ldnf1sh z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u32_m9, svuint32_t, int16_t, + z0 = svldnf1sh_vnum_u32 (p0, x0, -9), + z0 = svldnf1sh_vnum_u32 (p0, x0, -9)) + +/* +** ldnf1sh_vnum_u32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sh z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u32_x1, svuint32_t, int16_t, + z0 = svldnf1sh_vnum_u32 (p0, x0, x1), + z0 = svldnf1sh_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u64.c new file mode 100644 index 000000000..d7f1a68a4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sh_u64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sh_u64_base: +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_u64_base, svuint64_t, int16_t, + z0 = svldnf1sh_u64 (p0, x0), + z0 = svldnf1sh_u64 (p0, x0)) + +/* +** ldnf1sh_u64_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1sh z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sh_u64_index, svuint64_t, int16_t, + z0 = svldnf1sh_u64 (p0, x0 + x1), + z0 = svldnf1sh_u64 (p0, x0 + x1)) + +/* +** ldnf1sh_u64_1: +** ldnf1sh z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_u64_1, svuint64_t, int16_t, + z0 = svldnf1sh_u64 (p0, x0 + svcntd ()), + z0 = svldnf1sh_u64 (p0, x0 + svcntd ())) + +/* +** ldnf1sh_u64_7: +** ldnf1sh z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_u64_7, svuint64_t, int16_t, + z0 = svldnf1sh_u64 (p0, x0 + svcntd () * 7), + z0 = svldnf1sh_u64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1sh_u64_8: +** incb x0, all, mul #2 +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_u64_8, svuint64_t, int16_t, + z0 = svldnf1sh_u64 (p0, x0 + svcntd () * 8), + z0 = svldnf1sh_u64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1sh_u64_m1: +** ldnf1sh z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_u64_m1, svuint64_t, int16_t, + z0 = svldnf1sh_u64 (p0, x0 - svcntd ()), + z0 = svldnf1sh_u64 (p0, x0 - svcntd ())) + +/* +** ldnf1sh_u64_m8: +** ldnf1sh z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_u64_m8, svuint64_t, int16_t, + z0 = svldnf1sh_u64 (p0, x0 - svcntd () * 8), + z0 = svldnf1sh_u64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1sh_u64_m9: +** decw x0, all, mul #9 +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_u64_m9, svuint64_t, int16_t, + z0 = svldnf1sh_u64 (p0, x0 - svcntd () * 9), + z0 = svldnf1sh_u64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1sh_vnum_u64_0: +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret 
+*/ +TEST_LOAD (ldnf1sh_vnum_u64_0, svuint64_t, int16_t, + z0 = svldnf1sh_vnum_u64 (p0, x0, 0), + z0 = svldnf1sh_vnum_u64 (p0, x0, 0)) + +/* +** ldnf1sh_vnum_u64_1: +** ldnf1sh z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u64_1, svuint64_t, int16_t, + z0 = svldnf1sh_vnum_u64 (p0, x0, 1), + z0 = svldnf1sh_vnum_u64 (p0, x0, 1)) + +/* +** ldnf1sh_vnum_u64_7: +** ldnf1sh z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u64_7, svuint64_t, int16_t, + z0 = svldnf1sh_vnum_u64 (p0, x0, 7), + z0 = svldnf1sh_vnum_u64 (p0, x0, 7)) + +/* +** ldnf1sh_vnum_u64_8: +** incb x0, all, mul #2 +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u64_8, svuint64_t, int16_t, + z0 = svldnf1sh_vnum_u64 (p0, x0, 8), + z0 = svldnf1sh_vnum_u64 (p0, x0, 8)) + +/* +** ldnf1sh_vnum_u64_m1: +** ldnf1sh z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u64_m1, svuint64_t, int16_t, + z0 = svldnf1sh_vnum_u64 (p0, x0, -1), + z0 = svldnf1sh_vnum_u64 (p0, x0, -1)) + +/* +** ldnf1sh_vnum_u64_m8: +** ldnf1sh z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u64_m8, svuint64_t, int16_t, + z0 = svldnf1sh_vnum_u64 (p0, x0, -8), + z0 = svldnf1sh_vnum_u64 (p0, x0, -8)) + +/* +** ldnf1sh_vnum_u64_m9: +** decw x0, all, mul #9 +** ldnf1sh z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u64_m9, svuint64_t, int16_t, + z0 = svldnf1sh_vnum_u64 (p0, x0, -9), + z0 = svldnf1sh_vnum_u64 (p0, x0, -9)) + +/* +** ldnf1sh_vnum_u64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sh z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sh_vnum_u64_x1, svuint64_t, int16_t, + z0 = svldnf1sh_vnum_u64 (p0, x0, x1), + z0 = svldnf1sh_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_s64.c new file mode 100644 index 000000000..5b483e4aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_s64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sw_s64_base: +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_s64_base, svint64_t, int32_t, + z0 = svldnf1sw_s64 (p0, x0), + z0 = svldnf1sw_s64 (p0, x0)) + +/* +** ldnf1sw_s64_index: +** add (x[0-9]+), x0, x1, lsl 2 +** ldnf1sw z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sw_s64_index, svint64_t, int32_t, + z0 = svldnf1sw_s64 (p0, x0 + x1), + z0 = svldnf1sw_s64 (p0, x0 + x1)) + +/* +** ldnf1sw_s64_1: +** ldnf1sw z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_s64_1, svint64_t, int32_t, + z0 = svldnf1sw_s64 (p0, x0 + svcntd ()), + z0 = svldnf1sw_s64 (p0, x0 + svcntd ())) + +/* +** ldnf1sw_s64_7: +** ldnf1sw z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_s64_7, svint64_t, int32_t, + z0 = svldnf1sw_s64 (p0, x0 + svcntd () * 7), + z0 = svldnf1sw_s64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1sw_s64_8: +** incb x0, all, mul #4 +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_s64_8, svint64_t, int32_t, + z0 = svldnf1sw_s64 (p0, x0 + svcntd () * 8), + z0 = svldnf1sw_s64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1sw_s64_m1: +** ldnf1sw z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_s64_m1, svint64_t, int32_t, + z0 = svldnf1sw_s64 (p0, x0 - svcntd ()), + z0 = svldnf1sw_s64 (p0, x0 - svcntd ())) + +/* +** ldnf1sw_s64_m8: +** ldnf1sw z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_s64_m8, svint64_t, int32_t, + z0 = svldnf1sw_s64 (p0, x0 - svcntd () * 8), + z0 = svldnf1sw_s64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1sw_s64_m9: +** dech x0, all, mul #9 +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_s64_m9, svint64_t, int32_t, + z0 = svldnf1sw_s64 (p0, x0 - svcntd () * 9), + z0 = svldnf1sw_s64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1sw_vnum_s64_0: +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_s64_0, svint64_t, int32_t, + z0 = svldnf1sw_vnum_s64 (p0, x0, 0), + z0 = svldnf1sw_vnum_s64 (p0, x0, 0)) + +/* +** ldnf1sw_vnum_s64_1: +** ldnf1sw z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_s64_1, svint64_t, int32_t, + z0 = svldnf1sw_vnum_s64 (p0, x0, 1), + z0 = svldnf1sw_vnum_s64 (p0, x0, 1)) + +/* +** ldnf1sw_vnum_s64_7: +** ldnf1sw z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_s64_7, svint64_t, int32_t, + z0 = svldnf1sw_vnum_s64 (p0, x0, 7), + z0 = svldnf1sw_vnum_s64 (p0, x0, 7)) + +/* +** ldnf1sw_vnum_s64_8: +** incb x0, all, mul #4 +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_s64_8, svint64_t, int32_t, + z0 = svldnf1sw_vnum_s64 (p0, x0, 8), + z0 = svldnf1sw_vnum_s64 (p0, x0, 8)) + +/* +** ldnf1sw_vnum_s64_m1: +** ldnf1sw z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_s64_m1, svint64_t, int32_t, + z0 = svldnf1sw_vnum_s64 (p0, x0, -1), + z0 = svldnf1sw_vnum_s64 (p0, x0, -1)) + +/* +** ldnf1sw_vnum_s64_m8: +** ldnf1sw z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_s64_m8, svint64_t, int32_t, + z0 = svldnf1sw_vnum_s64 (p0, x0, -8), + z0 = svldnf1sw_vnum_s64 (p0, x0, -8)) + +/* +** ldnf1sw_vnum_s64_m9: +** dech x0, all, mul #9 +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_s64_m9, svint64_t, int32_t, + z0 = svldnf1sw_vnum_s64 (p0, x0, -9), + z0 = svldnf1sw_vnum_s64 (p0, x0, -9)) + +/* +** ldnf1sw_vnum_s64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sw z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_s64_x1, svint64_t, int32_t, + z0 = 
svldnf1sw_vnum_s64 (p0, x0, x1), + z0 = svldnf1sw_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_u64.c new file mode 100644 index 000000000..62121ce0a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1sw_u64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1sw_u64_base: +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_u64_base, svuint64_t, int32_t, + z0 = svldnf1sw_u64 (p0, x0), + z0 = svldnf1sw_u64 (p0, x0)) + +/* +** ldnf1sw_u64_index: +** add (x[0-9]+), x0, x1, lsl 2 +** ldnf1sw z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1sw_u64_index, svuint64_t, int32_t, + z0 = svldnf1sw_u64 (p0, x0 + x1), + z0 = svldnf1sw_u64 (p0, x0 + x1)) + +/* +** ldnf1sw_u64_1: +** ldnf1sw z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_u64_1, svuint64_t, int32_t, + z0 = svldnf1sw_u64 (p0, x0 + svcntd ()), + z0 = svldnf1sw_u64 (p0, x0 + svcntd ())) + +/* +** ldnf1sw_u64_7: +** ldnf1sw z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_u64_7, svuint64_t, int32_t, + z0 = svldnf1sw_u64 (p0, x0 + svcntd () * 7), + z0 = svldnf1sw_u64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1sw_u64_8: +** incb x0, all, mul #4 +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_u64_8, svuint64_t, int32_t, + z0 = svldnf1sw_u64 (p0, x0 + svcntd () * 8), + z0 = svldnf1sw_u64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1sw_u64_m1: +** ldnf1sw z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_u64_m1, svuint64_t, int32_t, + z0 = svldnf1sw_u64 (p0, x0 - svcntd ()), + z0 = svldnf1sw_u64 (p0, x0 - svcntd ())) + +/* +** ldnf1sw_u64_m8: +** ldnf1sw z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_u64_m8, svuint64_t, int32_t, + z0 = svldnf1sw_u64 (p0, x0 - svcntd () * 8), + z0 = svldnf1sw_u64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1sw_u64_m9: +** dech x0, all, mul #9 +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_u64_m9, svuint64_t, int32_t, + z0 = svldnf1sw_u64 (p0, x0 - svcntd () * 9), + z0 = svldnf1sw_u64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1sw_vnum_u64_0: +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_u64_0, svuint64_t, int32_t, + z0 = svldnf1sw_vnum_u64 (p0, x0, 0), + z0 = svldnf1sw_vnum_u64 (p0, x0, 0)) + +/* +** ldnf1sw_vnum_u64_1: +** ldnf1sw z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_u64_1, svuint64_t, int32_t, + z0 = svldnf1sw_vnum_u64 (p0, x0, 1), + z0 = svldnf1sw_vnum_u64 (p0, x0, 1)) + +/* +** ldnf1sw_vnum_u64_7: +** ldnf1sw z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_u64_7, svuint64_t, int32_t, + z0 = svldnf1sw_vnum_u64 (p0, x0, 7), + z0 = svldnf1sw_vnum_u64 (p0, x0, 7)) + +/* +** ldnf1sw_vnum_u64_8: +** incb x0, all, mul #4 +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_u64_8, svuint64_t, int32_t, + z0 = svldnf1sw_vnum_u64 (p0, x0, 8), + z0 = svldnf1sw_vnum_u64 (p0, x0, 8)) + +/* +** ldnf1sw_vnum_u64_m1: +** ldnf1sw z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_u64_m1, svuint64_t, int32_t, + z0 = svldnf1sw_vnum_u64 (p0, x0, -1), + z0 = svldnf1sw_vnum_u64 (p0, x0, -1)) + +/* +** ldnf1sw_vnum_u64_m8: +** ldnf1sw z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_u64_m8, svuint64_t, int32_t, + z0 = svldnf1sw_vnum_u64 (p0, x0, -8), + z0 = svldnf1sw_vnum_u64 (p0, x0, 
-8)) + +/* +** ldnf1sw_vnum_u64_m9: +** dech x0, all, mul #9 +** ldnf1sw z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_u64_m9, svuint64_t, int32_t, + z0 = svldnf1sw_vnum_u64 (p0, x0, -9), + z0 = svldnf1sw_vnum_u64 (p0, x0, -9)) + +/* +** ldnf1sw_vnum_u64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1sw z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1sw_vnum_u64_x1, svuint64_t, int32_t, + z0 = svldnf1sw_vnum_u64 (p0, x0, x1), + z0 = svldnf1sw_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s16.c new file mode 100644 index 000000000..8fe13411f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s16.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1ub_s16_base: +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s16_base, svint16_t, uint8_t, + z0 = svldnf1ub_s16 (p0, x0), + z0 = svldnf1ub_s16 (p0, x0)) + +/* +** ldnf1ub_s16_index: +** add (x[0-9]+), x0, x1 +** ldnf1b z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1ub_s16_index, svint16_t, uint8_t, + z0 = svldnf1ub_s16 (p0, x0 + x1), + z0 = svldnf1ub_s16 (p0, x0 + x1)) + +/* +** ldnf1ub_s16_1: +** ldnf1b z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s16_1, svint16_t, uint8_t, + z0 = svldnf1ub_s16 (p0, x0 + svcnth ()), + z0 = svldnf1ub_s16 (p0, x0 + svcnth ())) + +/* +** ldnf1ub_s16_7: +** ldnf1b z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s16_7, svint16_t, uint8_t, + z0 = svldnf1ub_s16 (p0, x0 + svcnth () * 7), + z0 = svldnf1ub_s16 (p0, x0 + svcnth () * 7)) + +/* +** ldnf1ub_s16_8: +** incb x0, all, mul #4 +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s16_8, svint16_t, uint8_t, + z0 = svldnf1ub_s16 (p0, x0 + svcnth () * 8), + z0 = svldnf1ub_s16 (p0, x0 + svcnth () * 8)) + +/* +** ldnf1ub_s16_m1: +** ldnf1b z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s16_m1, svint16_t, uint8_t, + z0 = svldnf1ub_s16 (p0, x0 - svcnth ()), + z0 = svldnf1ub_s16 (p0, x0 - svcnth ())) + +/* +** ldnf1ub_s16_m8: +** ldnf1b z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s16_m8, svint16_t, uint8_t, + z0 = svldnf1ub_s16 (p0, x0 - svcnth () * 8), + z0 = svldnf1ub_s16 (p0, x0 - svcnth () * 8)) + +/* +** ldnf1ub_s16_m9: +** dech x0, all, mul #9 +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s16_m9, svint16_t, uint8_t, + z0 = svldnf1ub_s16 (p0, x0 - svcnth () * 9), + z0 = svldnf1ub_s16 (p0, x0 - svcnth () * 9)) + +/* +** ldnf1ub_vnum_s16_0: +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s16_0, svint16_t, uint8_t, + z0 = svldnf1ub_vnum_s16 (p0, x0, 0), + z0 = svldnf1ub_vnum_s16 (p0, x0, 0)) + +/* +** ldnf1ub_vnum_s16_1: +** ldnf1b z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s16_1, svint16_t, uint8_t, + z0 = svldnf1ub_vnum_s16 (p0, x0, 1), + z0 = svldnf1ub_vnum_s16 (p0, x0, 1)) + +/* +** ldnf1ub_vnum_s16_7: +** ldnf1b z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s16_7, svint16_t, uint8_t, + z0 = svldnf1ub_vnum_s16 (p0, x0, 7), + z0 = svldnf1ub_vnum_s16 (p0, x0, 7)) + +/* +** ldnf1ub_vnum_s16_8: +** incb x0, all, mul #4 +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s16_8, svint16_t, uint8_t, + z0 = svldnf1ub_vnum_s16 (p0, x0, 8), + z0 = svldnf1ub_vnum_s16 (p0, x0, 8)) + +/* +** ldnf1ub_vnum_s16_m1: +** 
ldnf1b z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s16_m1, svint16_t, uint8_t, + z0 = svldnf1ub_vnum_s16 (p0, x0, -1), + z0 = svldnf1ub_vnum_s16 (p0, x0, -1)) + +/* +** ldnf1ub_vnum_s16_m8: +** ldnf1b z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s16_m8, svint16_t, uint8_t, + z0 = svldnf1ub_vnum_s16 (p0, x0, -8), + z0 = svldnf1ub_vnum_s16 (p0, x0, -8)) + +/* +** ldnf1ub_vnum_s16_m9: +** dech x0, all, mul #9 +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s16_m9, svint16_t, uint8_t, + z0 = svldnf1ub_vnum_s16 (p0, x0, -9), + z0 = svldnf1ub_vnum_s16 (p0, x0, -9)) + +/* +** ldnf1ub_vnum_s16_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1b z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s16_x1, svint16_t, uint8_t, + z0 = svldnf1ub_vnum_s16 (p0, x0, x1), + z0 = svldnf1ub_vnum_s16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s32.c new file mode 100644 index 000000000..50122e3b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1ub_s32_base: +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s32_base, svint32_t, uint8_t, + z0 = svldnf1ub_s32 (p0, x0), + z0 = svldnf1ub_s32 (p0, x0)) + +/* +** ldnf1ub_s32_index: +** add (x[0-9]+), x0, x1 +** ldnf1b z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1ub_s32_index, svint32_t, uint8_t, + z0 = svldnf1ub_s32 (p0, x0 + x1), + z0 = svldnf1ub_s32 (p0, x0 + x1)) + +/* +** ldnf1ub_s32_1: +** ldnf1b z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s32_1, svint32_t, uint8_t, + z0 = svldnf1ub_s32 (p0, x0 + svcntw ()), + z0 = svldnf1ub_s32 (p0, x0 + svcntw ())) + +/* +** ldnf1ub_s32_7: +** ldnf1b z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s32_7, svint32_t, uint8_t, + z0 = svldnf1ub_s32 (p0, x0 + svcntw () * 7), + z0 = svldnf1ub_s32 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1ub_s32_8: +** incb x0, all, mul #2 +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s32_8, svint32_t, uint8_t, + z0 = svldnf1ub_s32 (p0, x0 + svcntw () * 8), + z0 = svldnf1ub_s32 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1ub_s32_m1: +** ldnf1b z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s32_m1, svint32_t, uint8_t, + z0 = svldnf1ub_s32 (p0, x0 - svcntw ()), + z0 = svldnf1ub_s32 (p0, x0 - svcntw ())) + +/* +** ldnf1ub_s32_m8: +** ldnf1b z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s32_m8, svint32_t, uint8_t, + z0 = svldnf1ub_s32 (p0, x0 - svcntw () * 8), + z0 = svldnf1ub_s32 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1ub_s32_m9: +** decw x0, all, mul #9 +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s32_m9, svint32_t, uint8_t, + z0 = svldnf1ub_s32 (p0, x0 - svcntw () * 9), + z0 = svldnf1ub_s32 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1ub_vnum_s32_0: +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s32_0, svint32_t, uint8_t, + z0 = svldnf1ub_vnum_s32 (p0, x0, 0), + z0 = svldnf1ub_vnum_s32 (p0, x0, 0)) + +/* +** ldnf1ub_vnum_s32_1: +** ldnf1b z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s32_1, svint32_t, uint8_t, + z0 = svldnf1ub_vnum_s32 (p0, x0, 1), + z0 = svldnf1ub_vnum_s32 (p0, x0, 1)) + +/* +** ldnf1ub_vnum_s32_7: +** ldnf1b z0\.s, p0/z, \[x0, #7, mul vl\] +** ret 
+*/ +TEST_LOAD (ldnf1ub_vnum_s32_7, svint32_t, uint8_t, + z0 = svldnf1ub_vnum_s32 (p0, x0, 7), + z0 = svldnf1ub_vnum_s32 (p0, x0, 7)) + +/* +** ldnf1ub_vnum_s32_8: +** incb x0, all, mul #2 +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s32_8, svint32_t, uint8_t, + z0 = svldnf1ub_vnum_s32 (p0, x0, 8), + z0 = svldnf1ub_vnum_s32 (p0, x0, 8)) + +/* +** ldnf1ub_vnum_s32_m1: +** ldnf1b z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s32_m1, svint32_t, uint8_t, + z0 = svldnf1ub_vnum_s32 (p0, x0, -1), + z0 = svldnf1ub_vnum_s32 (p0, x0, -1)) + +/* +** ldnf1ub_vnum_s32_m8: +** ldnf1b z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s32_m8, svint32_t, uint8_t, + z0 = svldnf1ub_vnum_s32 (p0, x0, -8), + z0 = svldnf1ub_vnum_s32 (p0, x0, -8)) + +/* +** ldnf1ub_vnum_s32_m9: +** decw x0, all, mul #9 +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s32_m9, svint32_t, uint8_t, + z0 = svldnf1ub_vnum_s32 (p0, x0, -9), + z0 = svldnf1ub_vnum_s32 (p0, x0, -9)) + +/* +** ldnf1ub_vnum_s32_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1b z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s32_x1, svint32_t, uint8_t, + z0 = svldnf1ub_vnum_s32 (p0, x0, x1), + z0 = svldnf1ub_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s64.c new file mode 100644 index 000000000..d7cce11b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_s64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1ub_s64_base: +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s64_base, svint64_t, uint8_t, + z0 = svldnf1ub_s64 (p0, x0), + z0 = svldnf1ub_s64 (p0, x0)) + +/* +** ldnf1ub_s64_index: +** add (x[0-9]+), x0, x1 +** ldnf1b z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1ub_s64_index, svint64_t, uint8_t, + z0 = svldnf1ub_s64 (p0, x0 + x1), + z0 = svldnf1ub_s64 (p0, x0 + x1)) + +/* +** ldnf1ub_s64_1: +** ldnf1b z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s64_1, svint64_t, uint8_t, + z0 = svldnf1ub_s64 (p0, x0 + svcntd ()), + z0 = svldnf1ub_s64 (p0, x0 + svcntd ())) + +/* +** ldnf1ub_s64_7: +** ldnf1b z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s64_7, svint64_t, uint8_t, + z0 = svldnf1ub_s64 (p0, x0 + svcntd () * 7), + z0 = svldnf1ub_s64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1ub_s64_8: +** incb x0 +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s64_8, svint64_t, uint8_t, + z0 = svldnf1ub_s64 (p0, x0 + svcntd () * 8), + z0 = svldnf1ub_s64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1ub_s64_m1: +** ldnf1b z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s64_m1, svint64_t, uint8_t, + z0 = svldnf1ub_s64 (p0, x0 - svcntd ()), + z0 = svldnf1ub_s64 (p0, x0 - svcntd ())) + +/* +** ldnf1ub_s64_m8: +** ldnf1b z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_s64_m8, svint64_t, uint8_t, + z0 = svldnf1ub_s64 (p0, x0 - svcntd () * 8), + z0 = svldnf1ub_s64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1ub_s64_m9: +** decd x0, all, mul #9 +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_s64_m9, svint64_t, uint8_t, + z0 = svldnf1ub_s64 (p0, x0 - svcntd () * 9), + z0 = svldnf1ub_s64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1ub_vnum_s64_0: +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s64_0, svint64_t, 
uint8_t, + z0 = svldnf1ub_vnum_s64 (p0, x0, 0), + z0 = svldnf1ub_vnum_s64 (p0, x0, 0)) + +/* +** ldnf1ub_vnum_s64_1: +** ldnf1b z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s64_1, svint64_t, uint8_t, + z0 = svldnf1ub_vnum_s64 (p0, x0, 1), + z0 = svldnf1ub_vnum_s64 (p0, x0, 1)) + +/* +** ldnf1ub_vnum_s64_7: +** ldnf1b z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s64_7, svint64_t, uint8_t, + z0 = svldnf1ub_vnum_s64 (p0, x0, 7), + z0 = svldnf1ub_vnum_s64 (p0, x0, 7)) + +/* +** ldnf1ub_vnum_s64_8: +** incb x0 +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s64_8, svint64_t, uint8_t, + z0 = svldnf1ub_vnum_s64 (p0, x0, 8), + z0 = svldnf1ub_vnum_s64 (p0, x0, 8)) + +/* +** ldnf1ub_vnum_s64_m1: +** ldnf1b z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s64_m1, svint64_t, uint8_t, + z0 = svldnf1ub_vnum_s64 (p0, x0, -1), + z0 = svldnf1ub_vnum_s64 (p0, x0, -1)) + +/* +** ldnf1ub_vnum_s64_m8: +** ldnf1b z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s64_m8, svint64_t, uint8_t, + z0 = svldnf1ub_vnum_s64 (p0, x0, -8), + z0 = svldnf1ub_vnum_s64 (p0, x0, -8)) + +/* +** ldnf1ub_vnum_s64_m9: +** decd x0, all, mul #9 +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s64_m9, svint64_t, uint8_t, + z0 = svldnf1ub_vnum_s64 (p0, x0, -9), + z0 = svldnf1ub_vnum_s64 (p0, x0, -9)) + +/* +** ldnf1ub_vnum_s64_x1: +** cntd (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1b z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_s64_x1, svint64_t, uint8_t, + z0 = svldnf1ub_vnum_s64 (p0, x0, x1), + z0 = svldnf1ub_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u16.c new file mode 100644 index 000000000..7bf82c3b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u16.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1ub_u16_base: +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u16_base, svuint16_t, uint8_t, + z0 = svldnf1ub_u16 (p0, x0), + z0 = svldnf1ub_u16 (p0, x0)) + +/* +** ldnf1ub_u16_index: +** add (x[0-9]+), x0, x1 +** ldnf1b z0\.h, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1ub_u16_index, svuint16_t, uint8_t, + z0 = svldnf1ub_u16 (p0, x0 + x1), + z0 = svldnf1ub_u16 (p0, x0 + x1)) + +/* +** ldnf1ub_u16_1: +** ldnf1b z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u16_1, svuint16_t, uint8_t, + z0 = svldnf1ub_u16 (p0, x0 + svcnth ()), + z0 = svldnf1ub_u16 (p0, x0 + svcnth ())) + +/* +** ldnf1ub_u16_7: +** ldnf1b z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u16_7, svuint16_t, uint8_t, + z0 = svldnf1ub_u16 (p0, x0 + svcnth () * 7), + z0 = svldnf1ub_u16 (p0, x0 + svcnth () * 7)) + +/* +** ldnf1ub_u16_8: +** incb x0, all, mul #4 +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u16_8, svuint16_t, uint8_t, + z0 = svldnf1ub_u16 (p0, x0 + svcnth () * 8), + z0 = svldnf1ub_u16 (p0, x0 + svcnth () * 8)) + +/* +** ldnf1ub_u16_m1: +** ldnf1b z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u16_m1, svuint16_t, uint8_t, + z0 = svldnf1ub_u16 (p0, x0 - svcnth ()), + z0 = svldnf1ub_u16 (p0, x0 - svcnth ())) + +/* +** ldnf1ub_u16_m8: +** ldnf1b z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u16_m8, svuint16_t, uint8_t, + z0 = svldnf1ub_u16 (p0, x0 - svcnth () * 8), + z0 = svldnf1ub_u16 (p0, x0 - svcnth () * 8)) + +/* +** ldnf1ub_u16_m9: +** dech x0, all, mul #9 +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u16_m9, svuint16_t, uint8_t, + z0 = svldnf1ub_u16 (p0, x0 - svcnth () * 9), + z0 = svldnf1ub_u16 (p0, x0 - svcnth () * 9)) + +/* +** ldnf1ub_vnum_u16_0: +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u16_0, svuint16_t, uint8_t, + z0 = svldnf1ub_vnum_u16 (p0, x0, 0), + z0 = svldnf1ub_vnum_u16 (p0, x0, 0)) + +/* +** ldnf1ub_vnum_u16_1: +** ldnf1b z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u16_1, svuint16_t, uint8_t, + z0 = svldnf1ub_vnum_u16 (p0, x0, 1), + z0 = svldnf1ub_vnum_u16 (p0, x0, 1)) + +/* +** ldnf1ub_vnum_u16_7: +** ldnf1b z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u16_7, svuint16_t, uint8_t, + z0 = svldnf1ub_vnum_u16 (p0, x0, 7), + z0 = svldnf1ub_vnum_u16 (p0, x0, 7)) + +/* +** ldnf1ub_vnum_u16_8: +** incb x0, all, mul #4 +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u16_8, svuint16_t, uint8_t, + z0 = svldnf1ub_vnum_u16 (p0, x0, 8), + z0 = svldnf1ub_vnum_u16 (p0, x0, 8)) + +/* +** ldnf1ub_vnum_u16_m1: +** ldnf1b z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u16_m1, svuint16_t, uint8_t, + z0 = svldnf1ub_vnum_u16 (p0, x0, -1), + z0 = svldnf1ub_vnum_u16 (p0, x0, -1)) + +/* +** ldnf1ub_vnum_u16_m8: +** ldnf1b z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u16_m8, svuint16_t, uint8_t, + z0 = svldnf1ub_vnum_u16 (p0, x0, -8), + z0 = svldnf1ub_vnum_u16 (p0, x0, -8)) + +/* +** ldnf1ub_vnum_u16_m9: +** dech x0, all, mul #9 +** ldnf1b z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u16_m9, svuint16_t, uint8_t, + z0 = svldnf1ub_vnum_u16 (p0, x0, -9), + z0 = svldnf1ub_vnum_u16 (p0, x0, -9)) + +/* +** ldnf1ub_vnum_u16_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1b z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u16_x1, svuint16_t, uint8_t, + z0 = 
svldnf1ub_vnum_u16 (p0, x0, x1), + z0 = svldnf1ub_vnum_u16 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u32.c new file mode 100644 index 000000000..e2fef064b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1ub_u32_base: +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u32_base, svuint32_t, uint8_t, + z0 = svldnf1ub_u32 (p0, x0), + z0 = svldnf1ub_u32 (p0, x0)) + +/* +** ldnf1ub_u32_index: +** add (x[0-9]+), x0, x1 +** ldnf1b z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1ub_u32_index, svuint32_t, uint8_t, + z0 = svldnf1ub_u32 (p0, x0 + x1), + z0 = svldnf1ub_u32 (p0, x0 + x1)) + +/* +** ldnf1ub_u32_1: +** ldnf1b z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u32_1, svuint32_t, uint8_t, + z0 = svldnf1ub_u32 (p0, x0 + svcntw ()), + z0 = svldnf1ub_u32 (p0, x0 + svcntw ())) + +/* +** ldnf1ub_u32_7: +** ldnf1b z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u32_7, svuint32_t, uint8_t, + z0 = svldnf1ub_u32 (p0, x0 + svcntw () * 7), + z0 = svldnf1ub_u32 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1ub_u32_8: +** incb x0, all, mul #2 +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u32_8, svuint32_t, uint8_t, + z0 = svldnf1ub_u32 (p0, x0 + svcntw () * 8), + z0 = svldnf1ub_u32 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1ub_u32_m1: +** ldnf1b z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u32_m1, svuint32_t, uint8_t, + z0 = svldnf1ub_u32 (p0, x0 - svcntw ()), + z0 = svldnf1ub_u32 (p0, x0 - svcntw ())) + +/* +** ldnf1ub_u32_m8: +** ldnf1b z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u32_m8, svuint32_t, uint8_t, + z0 = svldnf1ub_u32 (p0, x0 - svcntw () * 8), + z0 = svldnf1ub_u32 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1ub_u32_m9: +** decw x0, all, mul #9 +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u32_m9, svuint32_t, uint8_t, + z0 = svldnf1ub_u32 (p0, x0 - svcntw () * 9), + z0 = svldnf1ub_u32 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1ub_vnum_u32_0: +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u32_0, svuint32_t, uint8_t, + z0 = svldnf1ub_vnum_u32 (p0, x0, 0), + z0 = svldnf1ub_vnum_u32 (p0, x0, 0)) + +/* +** ldnf1ub_vnum_u32_1: +** ldnf1b z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u32_1, svuint32_t, uint8_t, + z0 = svldnf1ub_vnum_u32 (p0, x0, 1), + z0 = svldnf1ub_vnum_u32 (p0, x0, 1)) + +/* +** ldnf1ub_vnum_u32_7: +** ldnf1b z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u32_7, svuint32_t, uint8_t, + z0 = svldnf1ub_vnum_u32 (p0, x0, 7), + z0 = svldnf1ub_vnum_u32 (p0, x0, 7)) + +/* +** ldnf1ub_vnum_u32_8: +** incb x0, all, mul #2 +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u32_8, svuint32_t, uint8_t, + z0 = svldnf1ub_vnum_u32 (p0, x0, 8), + z0 = svldnf1ub_vnum_u32 (p0, x0, 8)) + +/* +** ldnf1ub_vnum_u32_m1: +** ldnf1b z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u32_m1, svuint32_t, uint8_t, + z0 = svldnf1ub_vnum_u32 (p0, x0, -1), + z0 = svldnf1ub_vnum_u32 (p0, x0, -1)) + +/* +** ldnf1ub_vnum_u32_m8: +** ldnf1b z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u32_m8, svuint32_t, uint8_t, + z0 = svldnf1ub_vnum_u32 (p0, x0, -8), + z0 = svldnf1ub_vnum_u32 (p0, x0, -8)) + +/* +** 
ldnf1ub_vnum_u32_m9: +** decw x0, all, mul #9 +** ldnf1b z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u32_m9, svuint32_t, uint8_t, + z0 = svldnf1ub_vnum_u32 (p0, x0, -9), + z0 = svldnf1ub_vnum_u32 (p0, x0, -9)) + +/* +** ldnf1ub_vnum_u32_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1b z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u32_x1, svuint32_t, uint8_t, + z0 = svldnf1ub_vnum_u32 (p0, x0, x1), + z0 = svldnf1ub_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u64.c new file mode 100644 index 000000000..57c61e122 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1ub_u64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1ub_u64_base: +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u64_base, svuint64_t, uint8_t, + z0 = svldnf1ub_u64 (p0, x0), + z0 = svldnf1ub_u64 (p0, x0)) + +/* +** ldnf1ub_u64_index: +** add (x[0-9]+), x0, x1 +** ldnf1b z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1ub_u64_index, svuint64_t, uint8_t, + z0 = svldnf1ub_u64 (p0, x0 + x1), + z0 = svldnf1ub_u64 (p0, x0 + x1)) + +/* +** ldnf1ub_u64_1: +** ldnf1b z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u64_1, svuint64_t, uint8_t, + z0 = svldnf1ub_u64 (p0, x0 + svcntd ()), + z0 = svldnf1ub_u64 (p0, x0 + svcntd ())) + +/* +** ldnf1ub_u64_7: +** ldnf1b z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u64_7, svuint64_t, uint8_t, + z0 = svldnf1ub_u64 (p0, x0 + svcntd () * 7), + z0 = svldnf1ub_u64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1ub_u64_8: +** incb x0 +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u64_8, svuint64_t, uint8_t, + z0 = svldnf1ub_u64 (p0, x0 + svcntd () * 8), + z0 = svldnf1ub_u64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1ub_u64_m1: +** ldnf1b z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u64_m1, svuint64_t, uint8_t, + z0 = svldnf1ub_u64 (p0, x0 - svcntd ()), + z0 = svldnf1ub_u64 (p0, x0 - svcntd ())) + +/* +** ldnf1ub_u64_m8: +** ldnf1b z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_u64_m8, svuint64_t, uint8_t, + z0 = svldnf1ub_u64 (p0, x0 - svcntd () * 8), + z0 = svldnf1ub_u64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1ub_u64_m9: +** decd x0, all, mul #9 +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_u64_m9, svuint64_t, uint8_t, + z0 = svldnf1ub_u64 (p0, x0 - svcntd () * 9), + z0 = svldnf1ub_u64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1ub_vnum_u64_0: +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u64_0, svuint64_t, uint8_t, + z0 = svldnf1ub_vnum_u64 (p0, x0, 0), + z0 = svldnf1ub_vnum_u64 (p0, x0, 0)) + +/* +** ldnf1ub_vnum_u64_1: +** ldnf1b z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u64_1, svuint64_t, uint8_t, + z0 = svldnf1ub_vnum_u64 (p0, x0, 1), + z0 = svldnf1ub_vnum_u64 (p0, x0, 1)) + +/* +** ldnf1ub_vnum_u64_7: +** ldnf1b z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u64_7, svuint64_t, uint8_t, + z0 = svldnf1ub_vnum_u64 (p0, x0, 7), + z0 = svldnf1ub_vnum_u64 (p0, x0, 7)) + +/* +** ldnf1ub_vnum_u64_8: +** incb x0 +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u64_8, svuint64_t, uint8_t, + z0 = svldnf1ub_vnum_u64 (p0, x0, 8), + z0 = svldnf1ub_vnum_u64 (p0, x0, 8)) + +/* +** ldnf1ub_vnum_u64_m1: +** ldnf1b z0\.d, p0/z, \[x0, #-1, 
mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u64_m1, svuint64_t, uint8_t, + z0 = svldnf1ub_vnum_u64 (p0, x0, -1), + z0 = svldnf1ub_vnum_u64 (p0, x0, -1)) + +/* +** ldnf1ub_vnum_u64_m8: +** ldnf1b z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u64_m8, svuint64_t, uint8_t, + z0 = svldnf1ub_vnum_u64 (p0, x0, -8), + z0 = svldnf1ub_vnum_u64 (p0, x0, -8)) + +/* +** ldnf1ub_vnum_u64_m9: +** decd x0, all, mul #9 +** ldnf1b z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u64_m9, svuint64_t, uint8_t, + z0 = svldnf1ub_vnum_u64 (p0, x0, -9), + z0 = svldnf1ub_vnum_u64 (p0, x0, -9)) + +/* +** ldnf1ub_vnum_u64_x1: +** cntd (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1b z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1ub_vnum_u64_x1, svuint64_t, uint8_t, + z0 = svldnf1ub_vnum_u64 (p0, x0, x1), + z0 = svldnf1ub_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s32.c new file mode 100644 index 000000000..ed9686c4e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1uh_s32_base: +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_s32_base, svint32_t, uint16_t, + z0 = svldnf1uh_s32 (p0, x0), + z0 = svldnf1uh_s32 (p0, x0)) + +/* +** ldnf1uh_s32_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1h z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1uh_s32_index, svint32_t, uint16_t, + z0 = svldnf1uh_s32 (p0, x0 + x1), + z0 = svldnf1uh_s32 (p0, x0 + x1)) + +/* +** ldnf1uh_s32_1: +** ldnf1h z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_s32_1, svint32_t, uint16_t, + z0 = svldnf1uh_s32 (p0, x0 + svcntw ()), + z0 = svldnf1uh_s32 (p0, x0 + svcntw ())) + +/* +** ldnf1uh_s32_7: +** ldnf1h z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_s32_7, svint32_t, uint16_t, + z0 = svldnf1uh_s32 (p0, x0 + svcntw () * 7), + z0 = svldnf1uh_s32 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1uh_s32_8: +** incb x0, all, mul #4 +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_s32_8, svint32_t, uint16_t, + z0 = svldnf1uh_s32 (p0, x0 + svcntw () * 8), + z0 = svldnf1uh_s32 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1uh_s32_m1: +** ldnf1h z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_s32_m1, svint32_t, uint16_t, + z0 = svldnf1uh_s32 (p0, x0 - svcntw ()), + z0 = svldnf1uh_s32 (p0, x0 - svcntw ())) + +/* +** ldnf1uh_s32_m8: +** ldnf1h z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_s32_m8, svint32_t, uint16_t, + z0 = svldnf1uh_s32 (p0, x0 - svcntw () * 8), + z0 = svldnf1uh_s32 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1uh_s32_m9: +** dech x0, all, mul #9 +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_s32_m9, svint32_t, uint16_t, + z0 = svldnf1uh_s32 (p0, x0 - svcntw () * 9), + z0 = svldnf1uh_s32 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1uh_vnum_s32_0: +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s32_0, svint32_t, uint16_t, + z0 = svldnf1uh_vnum_s32 (p0, x0, 0), + z0 = svldnf1uh_vnum_s32 (p0, x0, 0)) + +/* +** ldnf1uh_vnum_s32_1: +** ldnf1h z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s32_1, svint32_t, uint16_t, + z0 = svldnf1uh_vnum_s32 (p0, x0, 1), + z0 = svldnf1uh_vnum_s32 (p0, x0, 1)) + +/* +** ldnf1uh_vnum_s32_7: +** ldnf1h z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ 
+TEST_LOAD (ldnf1uh_vnum_s32_7, svint32_t, uint16_t, + z0 = svldnf1uh_vnum_s32 (p0, x0, 7), + z0 = svldnf1uh_vnum_s32 (p0, x0, 7)) + +/* +** ldnf1uh_vnum_s32_8: +** incb x0, all, mul #4 +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s32_8, svint32_t, uint16_t, + z0 = svldnf1uh_vnum_s32 (p0, x0, 8), + z0 = svldnf1uh_vnum_s32 (p0, x0, 8)) + +/* +** ldnf1uh_vnum_s32_m1: +** ldnf1h z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s32_m1, svint32_t, uint16_t, + z0 = svldnf1uh_vnum_s32 (p0, x0, -1), + z0 = svldnf1uh_vnum_s32 (p0, x0, -1)) + +/* +** ldnf1uh_vnum_s32_m8: +** ldnf1h z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s32_m8, svint32_t, uint16_t, + z0 = svldnf1uh_vnum_s32 (p0, x0, -8), + z0 = svldnf1uh_vnum_s32 (p0, x0, -8)) + +/* +** ldnf1uh_vnum_s32_m9: +** dech x0, all, mul #9 +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s32_m9, svint32_t, uint16_t, + z0 = svldnf1uh_vnum_s32 (p0, x0, -9), + z0 = svldnf1uh_vnum_s32 (p0, x0, -9)) + +/* +** ldnf1uh_vnum_s32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1h z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s32_x1, svint32_t, uint16_t, + z0 = svldnf1uh_vnum_s32 (p0, x0, x1), + z0 = svldnf1uh_vnum_s32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s64.c new file mode 100644 index 000000000..a3107f562 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_s64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1uh_s64_base: +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_s64_base, svint64_t, uint16_t, + z0 = svldnf1uh_s64 (p0, x0), + z0 = svldnf1uh_s64 (p0, x0)) + +/* +** ldnf1uh_s64_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1h z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1uh_s64_index, svint64_t, uint16_t, + z0 = svldnf1uh_s64 (p0, x0 + x1), + z0 = svldnf1uh_s64 (p0, x0 + x1)) + +/* +** ldnf1uh_s64_1: +** ldnf1h z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_s64_1, svint64_t, uint16_t, + z0 = svldnf1uh_s64 (p0, x0 + svcntd ()), + z0 = svldnf1uh_s64 (p0, x0 + svcntd ())) + +/* +** ldnf1uh_s64_7: +** ldnf1h z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_s64_7, svint64_t, uint16_t, + z0 = svldnf1uh_s64 (p0, x0 + svcntd () * 7), + z0 = svldnf1uh_s64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1uh_s64_8: +** incb x0, all, mul #2 +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_s64_8, svint64_t, uint16_t, + z0 = svldnf1uh_s64 (p0, x0 + svcntd () * 8), + z0 = svldnf1uh_s64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1uh_s64_m1: +** ldnf1h z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_s64_m1, svint64_t, uint16_t, + z0 = svldnf1uh_s64 (p0, x0 - svcntd ()), + z0 = svldnf1uh_s64 (p0, x0 - svcntd ())) + +/* +** ldnf1uh_s64_m8: +** ldnf1h z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_s64_m8, svint64_t, uint16_t, + z0 = svldnf1uh_s64 (p0, x0 - svcntd () * 8), + z0 = svldnf1uh_s64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1uh_s64_m9: +** decw x0, all, mul #9 +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_s64_m9, svint64_t, uint16_t, + z0 = svldnf1uh_s64 (p0, x0 - svcntd () * 9), + z0 = svldnf1uh_s64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1uh_vnum_s64_0: +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD 
(ldnf1uh_vnum_s64_0, svint64_t, uint16_t, + z0 = svldnf1uh_vnum_s64 (p0, x0, 0), + z0 = svldnf1uh_vnum_s64 (p0, x0, 0)) + +/* +** ldnf1uh_vnum_s64_1: +** ldnf1h z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s64_1, svint64_t, uint16_t, + z0 = svldnf1uh_vnum_s64 (p0, x0, 1), + z0 = svldnf1uh_vnum_s64 (p0, x0, 1)) + +/* +** ldnf1uh_vnum_s64_7: +** ldnf1h z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s64_7, svint64_t, uint16_t, + z0 = svldnf1uh_vnum_s64 (p0, x0, 7), + z0 = svldnf1uh_vnum_s64 (p0, x0, 7)) + +/* +** ldnf1uh_vnum_s64_8: +** incb x0, all, mul #2 +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s64_8, svint64_t, uint16_t, + z0 = svldnf1uh_vnum_s64 (p0, x0, 8), + z0 = svldnf1uh_vnum_s64 (p0, x0, 8)) + +/* +** ldnf1uh_vnum_s64_m1: +** ldnf1h z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s64_m1, svint64_t, uint16_t, + z0 = svldnf1uh_vnum_s64 (p0, x0, -1), + z0 = svldnf1uh_vnum_s64 (p0, x0, -1)) + +/* +** ldnf1uh_vnum_s64_m8: +** ldnf1h z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s64_m8, svint64_t, uint16_t, + z0 = svldnf1uh_vnum_s64 (p0, x0, -8), + z0 = svldnf1uh_vnum_s64 (p0, x0, -8)) + +/* +** ldnf1uh_vnum_s64_m9: +** decw x0, all, mul #9 +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s64_m9, svint64_t, uint16_t, + z0 = svldnf1uh_vnum_s64 (p0, x0, -9), + z0 = svldnf1uh_vnum_s64 (p0, x0, -9)) + +/* +** ldnf1uh_vnum_s64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1h z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_s64_x1, svint64_t, uint16_t, + z0 = svldnf1uh_vnum_s64 (p0, x0, x1), + z0 = svldnf1uh_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u32.c new file mode 100644 index 000000000..93d5abaf7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u32.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1uh_u32_base: +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_u32_base, svuint32_t, uint16_t, + z0 = svldnf1uh_u32 (p0, x0), + z0 = svldnf1uh_u32 (p0, x0)) + +/* +** ldnf1uh_u32_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1h z0\.s, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1uh_u32_index, svuint32_t, uint16_t, + z0 = svldnf1uh_u32 (p0, x0 + x1), + z0 = svldnf1uh_u32 (p0, x0 + x1)) + +/* +** ldnf1uh_u32_1: +** ldnf1h z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_u32_1, svuint32_t, uint16_t, + z0 = svldnf1uh_u32 (p0, x0 + svcntw ()), + z0 = svldnf1uh_u32 (p0, x0 + svcntw ())) + +/* +** ldnf1uh_u32_7: +** ldnf1h z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_u32_7, svuint32_t, uint16_t, + z0 = svldnf1uh_u32 (p0, x0 + svcntw () * 7), + z0 = svldnf1uh_u32 (p0, x0 + svcntw () * 7)) + +/* +** ldnf1uh_u32_8: +** incb x0, all, mul #4 +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_u32_8, svuint32_t, uint16_t, + z0 = svldnf1uh_u32 (p0, x0 + svcntw () * 8), + z0 = svldnf1uh_u32 (p0, x0 + svcntw () * 8)) + +/* +** ldnf1uh_u32_m1: +** ldnf1h z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_u32_m1, svuint32_t, uint16_t, + z0 = svldnf1uh_u32 (p0, x0 - svcntw ()), + z0 = svldnf1uh_u32 (p0, x0 - svcntw ())) + +/* +** ldnf1uh_u32_m8: +** ldnf1h z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_u32_m8, svuint32_t, uint16_t, + z0 = svldnf1uh_u32 (p0, x0 - svcntw () * 8), + z0 = svldnf1uh_u32 (p0, x0 - svcntw () * 8)) + +/* +** ldnf1uh_u32_m9: +** dech x0, all, mul #9 +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_u32_m9, svuint32_t, uint16_t, + z0 = svldnf1uh_u32 (p0, x0 - svcntw () * 9), + z0 = svldnf1uh_u32 (p0, x0 - svcntw () * 9)) + +/* +** ldnf1uh_vnum_u32_0: +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u32_0, svuint32_t, uint16_t, + z0 = svldnf1uh_vnum_u32 (p0, x0, 0), + z0 = svldnf1uh_vnum_u32 (p0, x0, 0)) + +/* +** ldnf1uh_vnum_u32_1: +** ldnf1h z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u32_1, svuint32_t, uint16_t, + z0 = svldnf1uh_vnum_u32 (p0, x0, 1), + z0 = svldnf1uh_vnum_u32 (p0, x0, 1)) + +/* +** ldnf1uh_vnum_u32_7: +** ldnf1h z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u32_7, svuint32_t, uint16_t, + z0 = svldnf1uh_vnum_u32 (p0, x0, 7), + z0 = svldnf1uh_vnum_u32 (p0, x0, 7)) + +/* +** ldnf1uh_vnum_u32_8: +** incb x0, all, mul #4 +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u32_8, svuint32_t, uint16_t, + z0 = svldnf1uh_vnum_u32 (p0, x0, 8), + z0 = svldnf1uh_vnum_u32 (p0, x0, 8)) + +/* +** ldnf1uh_vnum_u32_m1: +** ldnf1h z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u32_m1, svuint32_t, uint16_t, + z0 = svldnf1uh_vnum_u32 (p0, x0, -1), + z0 = svldnf1uh_vnum_u32 (p0, x0, -1)) + +/* +** ldnf1uh_vnum_u32_m8: +** ldnf1h z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u32_m8, svuint32_t, uint16_t, + z0 = svldnf1uh_vnum_u32 (p0, x0, -8), + z0 = svldnf1uh_vnum_u32 (p0, x0, -8)) + +/* +** ldnf1uh_vnum_u32_m9: +** dech x0, all, mul #9 +** ldnf1h z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u32_m9, svuint32_t, uint16_t, + z0 = svldnf1uh_vnum_u32 (p0, x0, -9), + z0 = svldnf1uh_vnum_u32 (p0, x0, -9)) + +/* +** ldnf1uh_vnum_u32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1h z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u32_x1, svuint32_t, uint16_t, 
+ z0 = svldnf1uh_vnum_u32 (p0, x0, x1), + z0 = svldnf1uh_vnum_u32 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u64.c new file mode 100644 index 000000000..32d36a84c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uh_u64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1uh_u64_base: +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_u64_base, svuint64_t, uint16_t, + z0 = svldnf1uh_u64 (p0, x0), + z0 = svldnf1uh_u64 (p0, x0)) + +/* +** ldnf1uh_u64_index: +** add (x[0-9]+), x0, x1, lsl 1 +** ldnf1h z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1uh_u64_index, svuint64_t, uint16_t, + z0 = svldnf1uh_u64 (p0, x0 + x1), + z0 = svldnf1uh_u64 (p0, x0 + x1)) + +/* +** ldnf1uh_u64_1: +** ldnf1h z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_u64_1, svuint64_t, uint16_t, + z0 = svldnf1uh_u64 (p0, x0 + svcntd ()), + z0 = svldnf1uh_u64 (p0, x0 + svcntd ())) + +/* +** ldnf1uh_u64_7: +** ldnf1h z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_u64_7, svuint64_t, uint16_t, + z0 = svldnf1uh_u64 (p0, x0 + svcntd () * 7), + z0 = svldnf1uh_u64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1uh_u64_8: +** incb x0, all, mul #2 +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_u64_8, svuint64_t, uint16_t, + z0 = svldnf1uh_u64 (p0, x0 + svcntd () * 8), + z0 = svldnf1uh_u64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1uh_u64_m1: +** ldnf1h z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_u64_m1, svuint64_t, uint16_t, + z0 = svldnf1uh_u64 (p0, x0 - svcntd ()), + z0 = svldnf1uh_u64 (p0, x0 - svcntd ())) + +/* +** ldnf1uh_u64_m8: +** ldnf1h z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_u64_m8, svuint64_t, uint16_t, + z0 = svldnf1uh_u64 (p0, x0 - svcntd () * 8), + z0 = svldnf1uh_u64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1uh_u64_m9: +** decw x0, all, mul #9 +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_u64_m9, svuint64_t, uint16_t, + z0 = svldnf1uh_u64 (p0, x0 - svcntd () * 9), + z0 = svldnf1uh_u64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1uh_vnum_u64_0: +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u64_0, svuint64_t, uint16_t, + z0 = svldnf1uh_vnum_u64 (p0, x0, 0), + z0 = svldnf1uh_vnum_u64 (p0, x0, 0)) + +/* +** ldnf1uh_vnum_u64_1: +** ldnf1h z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u64_1, svuint64_t, uint16_t, + z0 = svldnf1uh_vnum_u64 (p0, x0, 1), + z0 = svldnf1uh_vnum_u64 (p0, x0, 1)) + +/* +** ldnf1uh_vnum_u64_7: +** ldnf1h z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u64_7, svuint64_t, uint16_t, + z0 = svldnf1uh_vnum_u64 (p0, x0, 7), + z0 = svldnf1uh_vnum_u64 (p0, x0, 7)) + +/* +** ldnf1uh_vnum_u64_8: +** incb x0, all, mul #2 +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u64_8, svuint64_t, uint16_t, + z0 = svldnf1uh_vnum_u64 (p0, x0, 8), + z0 = svldnf1uh_vnum_u64 (p0, x0, 8)) + +/* +** ldnf1uh_vnum_u64_m1: +** ldnf1h z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u64_m1, svuint64_t, uint16_t, + z0 = svldnf1uh_vnum_u64 (p0, x0, -1), + z0 = svldnf1uh_vnum_u64 (p0, x0, -1)) + +/* +** ldnf1uh_vnum_u64_m8: +** ldnf1h z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u64_m8, svuint64_t, uint16_t, + z0 = svldnf1uh_vnum_u64 (p0, x0, -8), + z0 = svldnf1uh_vnum_u64 
(p0, x0, -8)) + +/* +** ldnf1uh_vnum_u64_m9: +** decw x0, all, mul #9 +** ldnf1h z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u64_m9, svuint64_t, uint16_t, + z0 = svldnf1uh_vnum_u64 (p0, x0, -9), + z0 = svldnf1uh_vnum_u64 (p0, x0, -9)) + +/* +** ldnf1uh_vnum_u64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1h z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1uh_vnum_u64_x1, svuint64_t, uint16_t, + z0 = svldnf1uh_vnum_u64 (p0, x0, x1), + z0 = svldnf1uh_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_s64.c new file mode 100644 index 000000000..373922791 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_s64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1uw_s64_base: +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_s64_base, svint64_t, uint32_t, + z0 = svldnf1uw_s64 (p0, x0), + z0 = svldnf1uw_s64 (p0, x0)) + +/* +** ldnf1uw_s64_index: +** add (x[0-9]+), x0, x1, lsl 2 +** ldnf1w z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1uw_s64_index, svint64_t, uint32_t, + z0 = svldnf1uw_s64 (p0, x0 + x1), + z0 = svldnf1uw_s64 (p0, x0 + x1)) + +/* +** ldnf1uw_s64_1: +** ldnf1w z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_s64_1, svint64_t, uint32_t, + z0 = svldnf1uw_s64 (p0, x0 + svcntd ()), + z0 = svldnf1uw_s64 (p0, x0 + svcntd ())) + +/* +** ldnf1uw_s64_7: +** ldnf1w z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_s64_7, svint64_t, uint32_t, + z0 = svldnf1uw_s64 (p0, x0 + svcntd () * 7), + z0 = svldnf1uw_s64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1uw_s64_8: +** incb x0, all, mul #4 +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_s64_8, svint64_t, uint32_t, + z0 = svldnf1uw_s64 (p0, x0 + svcntd () * 8), + z0 = svldnf1uw_s64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1uw_s64_m1: +** ldnf1w z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_s64_m1, svint64_t, uint32_t, + z0 = svldnf1uw_s64 (p0, x0 - svcntd ()), + z0 = svldnf1uw_s64 (p0, x0 - svcntd ())) + +/* +** ldnf1uw_s64_m8: +** ldnf1w z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_s64_m8, svint64_t, uint32_t, + z0 = svldnf1uw_s64 (p0, x0 - svcntd () * 8), + z0 = svldnf1uw_s64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1uw_s64_m9: +** dech x0, all, mul #9 +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_s64_m9, svint64_t, uint32_t, + z0 = svldnf1uw_s64 (p0, x0 - svcntd () * 9), + z0 = svldnf1uw_s64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1uw_vnum_s64_0: +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_s64_0, svint64_t, uint32_t, + z0 = svldnf1uw_vnum_s64 (p0, x0, 0), + z0 = svldnf1uw_vnum_s64 (p0, x0, 0)) + +/* +** ldnf1uw_vnum_s64_1: +** ldnf1w z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_s64_1, svint64_t, uint32_t, + z0 = svldnf1uw_vnum_s64 (p0, x0, 1), + z0 = svldnf1uw_vnum_s64 (p0, x0, 1)) + +/* +** ldnf1uw_vnum_s64_7: +** ldnf1w z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_s64_7, svint64_t, uint32_t, + z0 = svldnf1uw_vnum_s64 (p0, x0, 7), + z0 = svldnf1uw_vnum_s64 (p0, x0, 7)) + +/* +** ldnf1uw_vnum_s64_8: +** incb x0, all, mul #4 +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_s64_8, svint64_t, uint32_t, + z0 = svldnf1uw_vnum_s64 (p0, x0, 8), + z0 = svldnf1uw_vnum_s64 (p0, x0, 8)) + +/* 
+** ldnf1uw_vnum_s64_m1: +** ldnf1w z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_s64_m1, svint64_t, uint32_t, + z0 = svldnf1uw_vnum_s64 (p0, x0, -1), + z0 = svldnf1uw_vnum_s64 (p0, x0, -1)) + +/* +** ldnf1uw_vnum_s64_m8: +** ldnf1w z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_s64_m8, svint64_t, uint32_t, + z0 = svldnf1uw_vnum_s64 (p0, x0, -8), + z0 = svldnf1uw_vnum_s64 (p0, x0, -8)) + +/* +** ldnf1uw_vnum_s64_m9: +** dech x0, all, mul #9 +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_s64_m9, svint64_t, uint32_t, + z0 = svldnf1uw_vnum_s64 (p0, x0, -9), + z0 = svldnf1uw_vnum_s64 (p0, x0, -9)) + +/* +** ldnf1uw_vnum_s64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1w z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_s64_x1, svint64_t, uint32_t, + z0 = svldnf1uw_vnum_s64 (p0, x0, x1), + z0 = svldnf1uw_vnum_s64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_u64.c new file mode 100644 index 000000000..b3c3be1d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnf1uw_u64.c @@ -0,0 +1,154 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnf1uw_u64_base: +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_u64_base, svuint64_t, uint32_t, + z0 = svldnf1uw_u64 (p0, x0), + z0 = svldnf1uw_u64 (p0, x0)) + +/* +** ldnf1uw_u64_index: +** add (x[0-9]+), x0, x1, lsl 2 +** ldnf1w z0\.d, p0/z, \[\1\] +** ret +*/ +TEST_LOAD (ldnf1uw_u64_index, svuint64_t, uint32_t, + z0 = svldnf1uw_u64 (p0, x0 + x1), + z0 = svldnf1uw_u64 (p0, x0 + x1)) + +/* +** ldnf1uw_u64_1: +** ldnf1w z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_u64_1, svuint64_t, uint32_t, + z0 = svldnf1uw_u64 (p0, x0 + svcntd ()), + z0 = svldnf1uw_u64 (p0, x0 + svcntd ())) + +/* +** ldnf1uw_u64_7: +** ldnf1w z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_u64_7, svuint64_t, uint32_t, + z0 = svldnf1uw_u64 (p0, x0 + svcntd () * 7), + z0 = svldnf1uw_u64 (p0, x0 + svcntd () * 7)) + +/* +** ldnf1uw_u64_8: +** incb x0, all, mul #4 +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_u64_8, svuint64_t, uint32_t, + z0 = svldnf1uw_u64 (p0, x0 + svcntd () * 8), + z0 = svldnf1uw_u64 (p0, x0 + svcntd () * 8)) + +/* +** ldnf1uw_u64_m1: +** ldnf1w z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_u64_m1, svuint64_t, uint32_t, + z0 = svldnf1uw_u64 (p0, x0 - svcntd ()), + z0 = svldnf1uw_u64 (p0, x0 - svcntd ())) + +/* +** ldnf1uw_u64_m8: +** ldnf1w z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_u64_m8, svuint64_t, uint32_t, + z0 = svldnf1uw_u64 (p0, x0 - svcntd () * 8), + z0 = svldnf1uw_u64 (p0, x0 - svcntd () * 8)) + +/* +** ldnf1uw_u64_m9: +** dech x0, all, mul #9 +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_u64_m9, svuint64_t, uint32_t, + z0 = svldnf1uw_u64 (p0, x0 - svcntd () * 9), + z0 = svldnf1uw_u64 (p0, x0 - svcntd () * 9)) + +/* +** ldnf1uw_vnum_u64_0: +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_u64_0, svuint64_t, uint32_t, + z0 = svldnf1uw_vnum_u64 (p0, x0, 0), + z0 = svldnf1uw_vnum_u64 (p0, x0, 0)) + +/* +** ldnf1uw_vnum_u64_1: +** ldnf1w z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_u64_1, svuint64_t, uint32_t, + z0 = svldnf1uw_vnum_u64 (p0, x0, 1), + z0 = svldnf1uw_vnum_u64 (p0, x0, 1)) + +/* +** 
ldnf1uw_vnum_u64_7: +** ldnf1w z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_u64_7, svuint64_t, uint32_t, + z0 = svldnf1uw_vnum_u64 (p0, x0, 7), + z0 = svldnf1uw_vnum_u64 (p0, x0, 7)) + +/* +** ldnf1uw_vnum_u64_8: +** incb x0, all, mul #4 +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_u64_8, svuint64_t, uint32_t, + z0 = svldnf1uw_vnum_u64 (p0, x0, 8), + z0 = svldnf1uw_vnum_u64 (p0, x0, 8)) + +/* +** ldnf1uw_vnum_u64_m1: +** ldnf1w z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_u64_m1, svuint64_t, uint32_t, + z0 = svldnf1uw_vnum_u64 (p0, x0, -1), + z0 = svldnf1uw_vnum_u64 (p0, x0, -1)) + +/* +** ldnf1uw_vnum_u64_m8: +** ldnf1w z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_u64_m8, svuint64_t, uint32_t, + z0 = svldnf1uw_vnum_u64 (p0, x0, -8), + z0 = svldnf1uw_vnum_u64 (p0, x0, -8)) + +/* +** ldnf1uw_vnum_u64_m9: +** dech x0, all, mul #9 +** ldnf1w z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_u64_m9, svuint64_t, uint32_t, + z0 = svldnf1uw_vnum_u64 (p0, x0, -9), + z0 = svldnf1uw_vnum_u64 (p0, x0, -9)) + +/* +** ldnf1uw_vnum_u64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnf1w z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnf1uw_vnum_u64_x1, svuint64_t, uint32_t, + z0 = svldnf1uw_vnum_u64 (p0, x0, x1), + z0 = svldnf1uw_vnum_u64 (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_bf16.c new file mode 100644 index 000000000..b083901fa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_bf16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_bf16_base: +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_bf16_base, svbfloat16_t, bfloat16_t, + z0 = svldnt1_bf16 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_bf16_index: +** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldnt1_bf16_index, svbfloat16_t, bfloat16_t, + z0 = svldnt1_bf16 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_bf16_1: +** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_bf16_1, svbfloat16_t, bfloat16_t, + z0 = svldnt1_bf16 (p0, x0 + svcnth ()), + z0 = svldnt1 (p0, x0 + svcnth ())) + +/* +** ldnt1_bf16_7: +** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_bf16_7, svbfloat16_t, bfloat16_t, + z0 = svldnt1_bf16 (p0, x0 + svcnth () * 7), + z0 = svldnt1 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_bf16_8: +** incb x0, all, mul #8 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_bf16_8, svbfloat16_t, bfloat16_t, + z0 = svldnt1_bf16 (p0, x0 + svcnth () * 8), + z0 = svldnt1 (p0, x0 + svcnth () * 8)) + +/* +** ldnt1_bf16_m1: +** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_bf16_m1, svbfloat16_t, bfloat16_t, + z0 = svldnt1_bf16 (p0, x0 - svcnth ()), + z0 = svldnt1 (p0, x0 - svcnth ())) + +/* +** ldnt1_bf16_m8: +** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_bf16_m8, svbfloat16_t, bfloat16_t, + z0 = svldnt1_bf16 (p0, x0 - svcnth () * 8), + z0 = svldnt1 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_bf16_m9: +** decb x0, all, mul #9 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_bf16_m9, svbfloat16_t, bfloat16_t, + z0 = svldnt1_bf16 (p0, x0 - svcnth () * 9), + z0 = svldnt1 (p0, x0 - svcnth () * 9)) + +/* +** ldnt1_vnum_bf16_0: +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_bf16_0, svbfloat16_t, bfloat16_t, + z0 = svldnt1_vnum_bf16 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_bf16_1: +** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_bf16_1, svbfloat16_t, bfloat16_t, + z0 = svldnt1_vnum_bf16 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_bf16_7: +** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_bf16_7, svbfloat16_t, bfloat16_t, + z0 = svldnt1_vnum_bf16 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_bf16_8: +** incb x0, all, mul #8 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_bf16_8, svbfloat16_t, bfloat16_t, + z0 = svldnt1_vnum_bf16 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_bf16_m1: +** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, + z0 = svldnt1_vnum_bf16 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_bf16_m8: +** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, + z0 = svldnt1_vnum_bf16 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_bf16_m9: +** decb x0, all, mul #9 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, + z0 = svldnt1_vnum_bf16 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldnt1_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, + z0 = svldnt1_vnum_bf16 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f16.c new file mode 100644 index 000000000..c98ab2da4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_f16_base: +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f16_base, svfloat16_t, float16_t, + z0 = svldnt1_f16 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_f16_index: +** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldnt1_f16_index, svfloat16_t, float16_t, + z0 = svldnt1_f16 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_f16_1: +** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f16_1, svfloat16_t, float16_t, + z0 = svldnt1_f16 (p0, x0 + svcnth ()), + z0 = svldnt1 (p0, x0 + svcnth ())) + +/* +** ldnt1_f16_7: +** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f16_7, svfloat16_t, float16_t, + z0 = svldnt1_f16 (p0, x0 + svcnth () * 7), + z0 = svldnt1 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_f16_8: +** incb x0, all, mul #8 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f16_8, svfloat16_t, float16_t, + z0 = svldnt1_f16 (p0, x0 + svcnth () * 8), + z0 = svldnt1 (p0, x0 + svcnth () * 8)) + +/* +** ldnt1_f16_m1: +** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f16_m1, svfloat16_t, float16_t, + z0 = svldnt1_f16 (p0, x0 - svcnth ()), + z0 = svldnt1 (p0, x0 - svcnth ())) + +/* +** ldnt1_f16_m8: +** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f16_m8, svfloat16_t, float16_t, + z0 = svldnt1_f16 (p0, x0 - svcnth () * 8), + z0 = svldnt1 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_f16_m9: +** decb x0, all, mul #9 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f16_m9, svfloat16_t, float16_t, + z0 = svldnt1_f16 (p0, x0 - svcnth () * 9), + z0 = svldnt1 (p0, x0 - svcnth () * 9)) + +/* +** ldnt1_vnum_f16_0: +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f16_0, svfloat16_t, float16_t, + z0 = svldnt1_vnum_f16 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_f16_1: +** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f16_1, svfloat16_t, float16_t, + z0 = svldnt1_vnum_f16 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_f16_7: +** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f16_7, svfloat16_t, float16_t, + z0 = svldnt1_vnum_f16 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_f16_8: +** incb x0, all, mul #8 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f16_8, svfloat16_t, float16_t, + z0 = svldnt1_vnum_f16 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_f16_m1: +** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f16_m1, svfloat16_t, float16_t, + z0 = svldnt1_vnum_f16 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_f16_m8: +** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f16_m8, svfloat16_t, float16_t, + z0 = svldnt1_vnum_f16 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_f16_m9: +** decb x0, all, mul #9 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f16_m9, svfloat16_t, float16_t, + z0 = svldnt1_vnum_f16 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldnt1_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f16_x1, svfloat16_t, float16_t, + z0 = svldnt1_vnum_f16 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f32.c new file mode 100644 index 000000000..fb09a8a6d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_f32_base: +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f32_base, svfloat32_t, float32_t, + z0 = svldnt1_f32 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_f32_index: +** ldnt1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldnt1_f32_index, svfloat32_t, float32_t, + z0 = svldnt1_f32 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_f32_1: +** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f32_1, svfloat32_t, float32_t, + z0 = svldnt1_f32 (p0, x0 + svcntw ()), + z0 = svldnt1 (p0, x0 + svcntw ())) + +/* +** ldnt1_f32_7: +** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f32_7, svfloat32_t, float32_t, + z0 = svldnt1_f32 (p0, x0 + svcntw () * 7), + z0 = svldnt1 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_f32_8: +** incb x0, all, mul #8 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f32_8, svfloat32_t, float32_t, + z0 = svldnt1_f32 (p0, x0 + svcntw () * 8), + z0 = svldnt1 (p0, x0 + svcntw () * 8)) + +/* +** ldnt1_f32_m1: +** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f32_m1, svfloat32_t, float32_t, + z0 = svldnt1_f32 (p0, x0 - svcntw ()), + z0 = svldnt1 (p0, x0 - svcntw ())) + +/* +** ldnt1_f32_m8: +** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f32_m8, svfloat32_t, float32_t, + z0 = svldnt1_f32 (p0, x0 - svcntw () * 8), + z0 = svldnt1 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_f32_m9: +** decb x0, all, mul #9 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f32_m9, svfloat32_t, float32_t, + z0 = svldnt1_f32 (p0, x0 - svcntw () * 9), + z0 = svldnt1 (p0, x0 - svcntw () * 9)) + +/* +** ldnt1_vnum_f32_0: +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f32_0, svfloat32_t, float32_t, + z0 = svldnt1_vnum_f32 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_f32_1: +** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f32_1, svfloat32_t, float32_t, + z0 = svldnt1_vnum_f32 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_f32_7: +** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f32_7, svfloat32_t, float32_t, + z0 = svldnt1_vnum_f32 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_f32_8: +** incb x0, all, mul #8 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f32_8, svfloat32_t, float32_t, + z0 = svldnt1_vnum_f32 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_f32_m1: +** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f32_m1, svfloat32_t, float32_t, + z0 = svldnt1_vnum_f32 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_f32_m8: +** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f32_m8, svfloat32_t, float32_t, + z0 = svldnt1_vnum_f32 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_f32_m9: +** decb x0, all, mul #9 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f32_m9, svfloat32_t, float32_t, + z0 = svldnt1_vnum_f32 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ldnt1_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f32_x1, svfloat32_t, float32_t, + z0 = svldnt1_vnum_f32 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f64.c new file mode 100644 index 000000000..2a7863282 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_f64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_f64_base: +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f64_base, svfloat64_t, float64_t, + z0 = svldnt1_f64 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_f64_index: +** ldnt1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ldnt1_f64_index, svfloat64_t, float64_t, + z0 = svldnt1_f64 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_f64_1: +** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f64_1, svfloat64_t, float64_t, + z0 = svldnt1_f64 (p0, x0 + svcntd ()), + z0 = svldnt1 (p0, x0 + svcntd ())) + +/* +** ldnt1_f64_7: +** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f64_7, svfloat64_t, float64_t, + z0 = svldnt1_f64 (p0, x0 + svcntd () * 7), + z0 = svldnt1 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_f64_8: +** incb x0, all, mul #8 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f64_8, svfloat64_t, float64_t, + z0 = svldnt1_f64 (p0, x0 + svcntd () * 8), + z0 = svldnt1 (p0, x0 + svcntd () * 8)) + +/* +** ldnt1_f64_m1: +** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f64_m1, svfloat64_t, float64_t, + z0 = svldnt1_f64 (p0, x0 - svcntd ()), + z0 = svldnt1 (p0, x0 - svcntd ())) + +/* +** ldnt1_f64_m8: +** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_f64_m8, svfloat64_t, float64_t, + z0 = svldnt1_f64 (p0, x0 - svcntd () * 8), + z0 = svldnt1 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_f64_m9: +** decb x0, all, mul #9 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_f64_m9, svfloat64_t, float64_t, + z0 = svldnt1_f64 (p0, x0 - svcntd () * 9), + z0 = svldnt1 (p0, x0 - svcntd () * 9)) + +/* +** ldnt1_vnum_f64_0: +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f64_0, svfloat64_t, float64_t, + z0 = svldnt1_vnum_f64 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_f64_1: +** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f64_1, svfloat64_t, float64_t, + z0 = svldnt1_vnum_f64 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_f64_7: +** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f64_7, svfloat64_t, float64_t, + z0 = svldnt1_vnum_f64 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_vnum_f64_8: +** incb x0, all, mul #8 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f64_8, svfloat64_t, float64_t, + z0 = svldnt1_vnum_f64 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_f64_m1: +** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f64_m1, svfloat64_t, float64_t, + z0 = svldnt1_vnum_f64 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_f64_m8: +** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f64_m8, svfloat64_t, float64_t, + z0 = svldnt1_vnum_f64 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_f64_m9: +** decb x0, all, mul #9 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f64_m9, svfloat64_t, float64_t, + z0 = svldnt1_vnum_f64 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldnt1_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_f64_x1, svfloat64_t, float64_t, + z0 = svldnt1_vnum_f64 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s16.c new file mode 100644 index 000000000..c307ed51f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_s16_base: +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s16_base, svint16_t, int16_t, + z0 = svldnt1_s16 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_s16_index: +** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldnt1_s16_index, svint16_t, int16_t, + z0 = svldnt1_s16 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_s16_1: +** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s16_1, svint16_t, int16_t, + z0 = svldnt1_s16 (p0, x0 + svcnth ()), + z0 = svldnt1 (p0, x0 + svcnth ())) + +/* +** ldnt1_s16_7: +** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s16_7, svint16_t, int16_t, + z0 = svldnt1_s16 (p0, x0 + svcnth () * 7), + z0 = svldnt1 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_s16_8: +** incb x0, all, mul #8 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s16_8, svint16_t, int16_t, + z0 = svldnt1_s16 (p0, x0 + svcnth () * 8), + z0 = svldnt1 (p0, x0 + svcnth () * 8)) + +/* +** ldnt1_s16_m1: +** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s16_m1, svint16_t, int16_t, + z0 = svldnt1_s16 (p0, x0 - svcnth ()), + z0 = svldnt1 (p0, x0 - svcnth ())) + +/* +** ldnt1_s16_m8: +** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s16_m8, svint16_t, int16_t, + z0 = svldnt1_s16 (p0, x0 - svcnth () * 8), + z0 = svldnt1 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_s16_m9: +** decb x0, all, mul #9 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s16_m9, svint16_t, int16_t, + z0 = svldnt1_s16 (p0, x0 - svcnth () * 9), + z0 = svldnt1 (p0, x0 - svcnth () * 9)) + +/* +** ldnt1_vnum_s16_0: +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s16_0, svint16_t, int16_t, + z0 = svldnt1_vnum_s16 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_s16_1: +** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s16_1, svint16_t, int16_t, + z0 = svldnt1_vnum_s16 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_s16_7: +** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s16_7, svint16_t, int16_t, + z0 = svldnt1_vnum_s16 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_s16_8: +** incb x0, all, mul #8 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s16_8, svint16_t, int16_t, + z0 = svldnt1_vnum_s16 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_s16_m1: +** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s16_m1, svint16_t, int16_t, + z0 = svldnt1_vnum_s16 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_s16_m8: +** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s16_m8, svint16_t, int16_t, + z0 = svldnt1_vnum_s16 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_s16_m9: +** decb x0, all, mul #9 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s16_m9, svint16_t, int16_t, + z0 = svldnt1_vnum_s16 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldnt1_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s16_x1, svint16_t, int16_t, + z0 = svldnt1_vnum_s16 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s32.c new file mode 100644 index 000000000..2b9df1781 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_s32_base: +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s32_base, svint32_t, int32_t, + z0 = svldnt1_s32 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_s32_index: +** ldnt1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldnt1_s32_index, svint32_t, int32_t, + z0 = svldnt1_s32 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_s32_1: +** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s32_1, svint32_t, int32_t, + z0 = svldnt1_s32 (p0, x0 + svcntw ()), + z0 = svldnt1 (p0, x0 + svcntw ())) + +/* +** ldnt1_s32_7: +** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s32_7, svint32_t, int32_t, + z0 = svldnt1_s32 (p0, x0 + svcntw () * 7), + z0 = svldnt1 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_s32_8: +** incb x0, all, mul #8 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s32_8, svint32_t, int32_t, + z0 = svldnt1_s32 (p0, x0 + svcntw () * 8), + z0 = svldnt1 (p0, x0 + svcntw () * 8)) + +/* +** ldnt1_s32_m1: +** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s32_m1, svint32_t, int32_t, + z0 = svldnt1_s32 (p0, x0 - svcntw ()), + z0 = svldnt1 (p0, x0 - svcntw ())) + +/* +** ldnt1_s32_m8: +** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s32_m8, svint32_t, int32_t, + z0 = svldnt1_s32 (p0, x0 - svcntw () * 8), + z0 = svldnt1 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_s32_m9: +** decb x0, all, mul #9 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s32_m9, svint32_t, int32_t, + z0 = svldnt1_s32 (p0, x0 - svcntw () * 9), + z0 = svldnt1 (p0, x0 - svcntw () * 9)) + +/* +** ldnt1_vnum_s32_0: +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s32_0, svint32_t, int32_t, + z0 = svldnt1_vnum_s32 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_s32_1: +** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s32_1, svint32_t, int32_t, + z0 = svldnt1_vnum_s32 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_s32_7: +** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s32_7, svint32_t, int32_t, + z0 = svldnt1_vnum_s32 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_s32_8: +** incb x0, all, mul #8 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s32_8, svint32_t, int32_t, + z0 = svldnt1_vnum_s32 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_s32_m1: +** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s32_m1, svint32_t, int32_t, + z0 = svldnt1_vnum_s32 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_s32_m8: +** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s32_m8, svint32_t, int32_t, + z0 = svldnt1_vnum_s32 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_s32_m9: +** decb x0, all, mul #9 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s32_m9, svint32_t, int32_t, + z0 = svldnt1_vnum_s32 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldnt1_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s32_x1, svint32_t, int32_t, + z0 = svldnt1_vnum_s32 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s64.c new file mode 100644 index 000000000..5bc7ac6ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_s64_base: +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s64_base, svint64_t, int64_t, + z0 = svldnt1_s64 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_s64_index: +** ldnt1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ldnt1_s64_index, svint64_t, int64_t, + z0 = svldnt1_s64 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_s64_1: +** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s64_1, svint64_t, int64_t, + z0 = svldnt1_s64 (p0, x0 + svcntd ()), + z0 = svldnt1 (p0, x0 + svcntd ())) + +/* +** ldnt1_s64_7: +** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s64_7, svint64_t, int64_t, + z0 = svldnt1_s64 (p0, x0 + svcntd () * 7), + z0 = svldnt1 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_s64_8: +** incb x0, all, mul #8 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s64_8, svint64_t, int64_t, + z0 = svldnt1_s64 (p0, x0 + svcntd () * 8), + z0 = svldnt1 (p0, x0 + svcntd () * 8)) + +/* +** ldnt1_s64_m1: +** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s64_m1, svint64_t, int64_t, + z0 = svldnt1_s64 (p0, x0 - svcntd ()), + z0 = svldnt1 (p0, x0 - svcntd ())) + +/* +** ldnt1_s64_m8: +** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s64_m8, svint64_t, int64_t, + z0 = svldnt1_s64 (p0, x0 - svcntd () * 8), + z0 = svldnt1 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_s64_m9: +** decb x0, all, mul #9 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s64_m9, svint64_t, int64_t, + z0 = svldnt1_s64 (p0, x0 - svcntd () * 9), + z0 = svldnt1 (p0, x0 - svcntd () * 9)) + +/* +** ldnt1_vnum_s64_0: +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s64_0, svint64_t, int64_t, + z0 = svldnt1_vnum_s64 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_s64_1: +** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s64_1, svint64_t, int64_t, + z0 = svldnt1_vnum_s64 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_s64_7: +** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s64_7, svint64_t, int64_t, + z0 = svldnt1_vnum_s64 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_s64_8: +** incb x0, all, mul #8 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s64_8, svint64_t, int64_t, + z0 = svldnt1_vnum_s64 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_s64_m1: +** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s64_m1, svint64_t, int64_t, + z0 = svldnt1_vnum_s64 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_s64_m8: +** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s64_m8, svint64_t, int64_t, + z0 = svldnt1_vnum_s64 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_s64_m9: +** decb x0, all, mul #9 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s64_m9, svint64_t, int64_t, + z0 = svldnt1_vnum_s64 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ldnt1_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s64_x1, svint64_t, int64_t, + z0 = svldnt1_vnum_s64 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s8.c new file mode 100644 index 000000000..eb8e2e548 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_s8.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_s8_base: +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s8_base, svint8_t, int8_t, + z0 = svldnt1_s8 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_s8_index: +** ldnt1b z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldnt1_s8_index, svint8_t, int8_t, + z0 = svldnt1_s8 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_s8_1: +** ldnt1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s8_1, svint8_t, int8_t, + z0 = svldnt1_s8 (p0, x0 + svcntb ()), + z0 = svldnt1 (p0, x0 + svcntb ())) + +/* +** ldnt1_s8_7: +** ldnt1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s8_7, svint8_t, int8_t, + z0 = svldnt1_s8 (p0, x0 + svcntb () * 7), + z0 = svldnt1 (p0, x0 + svcntb () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_s8_8: +** incb x0, all, mul #8 +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s8_8, svint8_t, int8_t, + z0 = svldnt1_s8 (p0, x0 + svcntb () * 8), + z0 = svldnt1 (p0, x0 + svcntb () * 8)) + +/* +** ldnt1_s8_m1: +** ldnt1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s8_m1, svint8_t, int8_t, + z0 = svldnt1_s8 (p0, x0 - svcntb ()), + z0 = svldnt1 (p0, x0 - svcntb ())) + +/* +** ldnt1_s8_m8: +** ldnt1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_s8_m8, svint8_t, int8_t, + z0 = svldnt1_s8 (p0, x0 - svcntb () * 8), + z0 = svldnt1 (p0, x0 - svcntb () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_s8_m9: +** decb x0, all, mul #9 +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_s8_m9, svint8_t, int8_t, + z0 = svldnt1_s8 (p0, x0 - svcntb () * 9), + z0 = svldnt1 (p0, x0 - svcntb () * 9)) + +/* +** ldnt1_vnum_s8_0: +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s8_0, svint8_t, int8_t, + z0 = svldnt1_vnum_s8 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_s8_1: +** ldnt1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s8_1, svint8_t, int8_t, + z0 = svldnt1_vnum_s8 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_s8_7: +** ldnt1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s8_7, svint8_t, int8_t, + z0 = svldnt1_vnum_s8 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_vnum_s8_8: +** incb x0, all, mul #8 +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s8_8, svint8_t, int8_t, + z0 = svldnt1_vnum_s8 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_s8_m1: +** ldnt1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s8_m1, svint8_t, int8_t, + z0 = svldnt1_vnum_s8 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_s8_m8: +** ldnt1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s8_m8, svint8_t, int8_t, + z0 = svldnt1_vnum_s8 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_s8_m9: +** decb x0, all, mul #9 +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_s8_m9, svint8_t, int8_t, + z0 = svldnt1_vnum_s8 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* +** ldnt1_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnt1b z0\.b, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldnt1b z0\.b, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldnt1_vnum_s8_x1, svint8_t, int8_t, + z0 = svldnt1_vnum_s8 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u16.c new file mode 100644 index 000000000..c032c3d93 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_u16_base: +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u16_base, svuint16_t, uint16_t, + z0 = svldnt1_u16 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_u16_index: +** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\] +** ret +*/ +TEST_LOAD (ldnt1_u16_index, svuint16_t, uint16_t, + z0 = svldnt1_u16 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_u16_1: +** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u16_1, svuint16_t, uint16_t, + z0 = svldnt1_u16 (p0, x0 + svcnth ()), + z0 = svldnt1 (p0, x0 + svcnth ())) + +/* +** ldnt1_u16_7: +** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u16_7, svuint16_t, uint16_t, + z0 = svldnt1_u16 (p0, x0 + svcnth () * 7), + z0 = svldnt1 (p0, x0 + svcnth () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_u16_8: +** incb x0, all, mul #8 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u16_8, svuint16_t, uint16_t, + z0 = svldnt1_u16 (p0, x0 + svcnth () * 8), + z0 = svldnt1 (p0, x0 + svcnth () * 8)) + +/* +** ldnt1_u16_m1: +** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u16_m1, svuint16_t, uint16_t, + z0 = svldnt1_u16 (p0, x0 - svcnth ()), + z0 = svldnt1 (p0, x0 - svcnth ())) + +/* +** ldnt1_u16_m8: +** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u16_m8, svuint16_t, uint16_t, + z0 = svldnt1_u16 (p0, x0 - svcnth () * 8), + z0 = svldnt1 (p0, x0 - svcnth () * 8)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_u16_m9: +** decb x0, all, mul #9 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u16_m9, svuint16_t, uint16_t, + z0 = svldnt1_u16 (p0, x0 - svcnth () * 9), + z0 = svldnt1 (p0, x0 - svcnth () * 9)) + +/* +** ldnt1_vnum_u16_0: +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u16_0, svuint16_t, uint16_t, + z0 = svldnt1_vnum_u16 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_u16_1: +** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u16_1, svuint16_t, uint16_t, + z0 = svldnt1_vnum_u16 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_u16_7: +** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u16_7, svuint16_t, uint16_t, + z0 = svldnt1_vnum_u16 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_u16_8: +** incb x0, all, mul #8 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u16_8, svuint16_t, uint16_t, + z0 = svldnt1_vnum_u16 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_u16_m1: +** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u16_m1, svuint16_t, uint16_t, + z0 = svldnt1_vnum_u16 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_u16_m8: +** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u16_m8, svuint16_t, uint16_t, + z0 = svldnt1_vnum_u16 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_u16_m9: +** decb x0, all, mul #9 +** ldnt1h z0\.h, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u16_m9, svuint16_t, uint16_t, + z0 = svldnt1_vnum_u16 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldnt1_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1h z0\.h, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u16_x1, svuint16_t, uint16_t, + z0 = svldnt1_vnum_u16 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u32.c new file mode 100644 index 000000000..278794459 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_u32_base: +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u32_base, svuint32_t, uint32_t, + z0 = svldnt1_u32 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_u32_index: +** ldnt1w z0\.s, p0/z, \[x0, x1, lsl 2\] +** ret +*/ +TEST_LOAD (ldnt1_u32_index, svuint32_t, uint32_t, + z0 = svldnt1_u32 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_u32_1: +** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u32_1, svuint32_t, uint32_t, + z0 = svldnt1_u32 (p0, x0 + svcntw ()), + z0 = svldnt1 (p0, x0 + svcntw ())) + +/* +** ldnt1_u32_7: +** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u32_7, svuint32_t, uint32_t, + z0 = svldnt1_u32 (p0, x0 + svcntw () * 7), + z0 = svldnt1 (p0, x0 + svcntw () * 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_u32_8: +** incb x0, all, mul #8 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u32_8, svuint32_t, uint32_t, + z0 = svldnt1_u32 (p0, x0 + svcntw () * 8), + z0 = svldnt1 (p0, x0 + svcntw () * 8)) + +/* +** ldnt1_u32_m1: +** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u32_m1, svuint32_t, uint32_t, + z0 = svldnt1_u32 (p0, x0 - svcntw ()), + z0 = svldnt1 (p0, x0 - svcntw ())) + +/* +** ldnt1_u32_m8: +** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u32_m8, svuint32_t, uint32_t, + z0 = svldnt1_u32 (p0, x0 - svcntw () * 8), + z0 = svldnt1 (p0, x0 - svcntw () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_u32_m9: +** decb x0, all, mul #9 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u32_m9, svuint32_t, uint32_t, + z0 = svldnt1_u32 (p0, x0 - svcntw () * 9), + z0 = svldnt1 (p0, x0 - svcntw () * 9)) + +/* +** ldnt1_vnum_u32_0: +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u32_0, svuint32_t, uint32_t, + z0 = svldnt1_vnum_u32 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_u32_1: +** ldnt1w z0\.s, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u32_1, svuint32_t, uint32_t, + z0 = svldnt1_vnum_u32 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_u32_7: +** ldnt1w z0\.s, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u32_7, svuint32_t, uint32_t, + z0 = svldnt1_vnum_u32 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_u32_8: +** incb x0, all, mul #8 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u32_8, svuint32_t, uint32_t, + z0 = svldnt1_vnum_u32 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_u32_m1: +** ldnt1w z0\.s, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u32_m1, svuint32_t, uint32_t, + z0 = svldnt1_vnum_u32 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_u32_m8: +** ldnt1w z0\.s, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u32_m8, svuint32_t, uint32_t, + z0 = svldnt1_vnum_u32 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_u32_m9: +** decb x0, all, mul #9 +** ldnt1w z0\.s, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u32_m9, svuint32_t, uint32_t, + z0 = svldnt1_vnum_u32 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** ldnt1_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1w z0\.s, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u32_x1, svuint32_t, uint32_t, + z0 = svldnt1_vnum_u32 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u64.c new file mode 100644 index 000000000..abafee6f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_u64_base: +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u64_base, svuint64_t, uint64_t, + z0 = svldnt1_u64 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_u64_index: +** ldnt1d z0\.d, p0/z, \[x0, x1, lsl 3\] +** ret +*/ +TEST_LOAD (ldnt1_u64_index, svuint64_t, uint64_t, + z0 = svldnt1_u64 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_u64_1: +** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u64_1, svuint64_t, uint64_t, + z0 = svldnt1_u64 (p0, x0 + svcntd ()), + z0 = svldnt1 (p0, x0 + svcntd ())) + +/* +** ldnt1_u64_7: +** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u64_7, svuint64_t, uint64_t, + z0 = svldnt1_u64 (p0, x0 + svcntd () * 7), + z0 = svldnt1 (p0, x0 + svcntd () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_u64_8: +** incb x0, all, mul #8 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u64_8, svuint64_t, uint64_t, + z0 = svldnt1_u64 (p0, x0 + svcntd () * 8), + z0 = svldnt1 (p0, x0 + svcntd () * 8)) + +/* +** ldnt1_u64_m1: +** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u64_m1, svuint64_t, uint64_t, + z0 = svldnt1_u64 (p0, x0 - svcntd ()), + z0 = svldnt1 (p0, x0 - svcntd ())) + +/* +** ldnt1_u64_m8: +** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u64_m8, svuint64_t, uint64_t, + z0 = svldnt1_u64 (p0, x0 - svcntd () * 8), + z0 = svldnt1 (p0, x0 - svcntd () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_u64_m9: +** decb x0, all, mul #9 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u64_m9, svuint64_t, uint64_t, + z0 = svldnt1_u64 (p0, x0 - svcntd () * 9), + z0 = svldnt1 (p0, x0 - svcntd () * 9)) + +/* +** ldnt1_vnum_u64_0: +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u64_0, svuint64_t, uint64_t, + z0 = svldnt1_vnum_u64 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_u64_1: +** ldnt1d z0\.d, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u64_1, svuint64_t, uint64_t, + z0 = svldnt1_vnum_u64 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_u64_7: +** ldnt1d z0\.d, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u64_7, svuint64_t, uint64_t, + z0 = svldnt1_vnum_u64 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_u64_8: +** incb x0, all, mul #8 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u64_8, svuint64_t, uint64_t, + z0 = svldnt1_vnum_u64 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_u64_m1: +** ldnt1d z0\.d, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u64_m1, svuint64_t, uint64_t, + z0 = svldnt1_vnum_u64 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_u64_m8: +** ldnt1d z0\.d, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u64_m8, svuint64_t, uint64_t, + z0 = svldnt1_vnum_u64 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_u64_m9: +** decb x0, all, mul #9 +** ldnt1d z0\.d, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u64_m9, svuint64_t, uint64_t, + z0 = svldnt1_vnum_u64 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** ldnt1_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** ldnt1d z0\.d, p0/z, \[\2\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u64_x1, svuint64_t, uint64_t, + z0 = svldnt1_vnum_u64 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u8.c new file mode 100644 index 000000000..7bf9acc26 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ldnt1_u8.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** ldnt1_u8_base: +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u8_base, svuint8_t, uint8_t, + z0 = svldnt1_u8 (p0, x0), + z0 = svldnt1 (p0, x0)) + +/* +** ldnt1_u8_index: +** ldnt1b z0\.b, p0/z, \[x0, x1\] +** ret +*/ +TEST_LOAD (ldnt1_u8_index, svuint8_t, uint8_t, + z0 = svldnt1_u8 (p0, x0 + x1), + z0 = svldnt1 (p0, x0 + x1)) + +/* +** ldnt1_u8_1: +** ldnt1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u8_1, svuint8_t, uint8_t, + z0 = svldnt1_u8 (p0, x0 + svcntb ()), + z0 = svldnt1 (p0, x0 + svcntb ())) + +/* +** ldnt1_u8_7: +** ldnt1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u8_7, svuint8_t, uint8_t, + z0 = svldnt1_u8 (p0, x0 + svcntb () * 7), + z0 = svldnt1 (p0, x0 + svcntb () * 7)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_u8_8: +** incb x0, all, mul #8 +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u8_8, svuint8_t, uint8_t, + z0 = svldnt1_u8 (p0, x0 + svcntb () * 8), + z0 = svldnt1 (p0, x0 + svcntb () * 8)) + +/* +** ldnt1_u8_m1: +** ldnt1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u8_m1, svuint8_t, uint8_t, + z0 = svldnt1_u8 (p0, x0 - svcntb ()), + z0 = svldnt1 (p0, x0 - svcntb ())) + +/* +** ldnt1_u8_m8: +** ldnt1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_u8_m8, svuint8_t, uint8_t, + z0 = svldnt1_u8 (p0, x0 - svcntb () * 8), + z0 = svldnt1 (p0, x0 - svcntb () * 8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_u8_m9: +** decb x0, all, mul #9 +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_u8_m9, svuint8_t, uint8_t, + z0 = svldnt1_u8 (p0, x0 - svcntb () * 9), + z0 = svldnt1 (p0, x0 - svcntb () * 9)) + +/* +** ldnt1_vnum_u8_0: +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u8_0, svuint8_t, uint8_t, + z0 = svldnt1_vnum_u8 (p0, x0, 0), + z0 = svldnt1_vnum (p0, x0, 0)) + +/* +** ldnt1_vnum_u8_1: +** ldnt1b z0\.b, p0/z, \[x0, #1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u8_1, svuint8_t, uint8_t, + z0 = svldnt1_vnum_u8 (p0, x0, 1), + z0 = svldnt1_vnum (p0, x0, 1)) + +/* +** ldnt1_vnum_u8_7: +** ldnt1b z0\.b, p0/z, \[x0, #7, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u8_7, svuint8_t, uint8_t, + z0 = svldnt1_vnum_u8 (p0, x0, 7), + z0 = svldnt1_vnum (p0, x0, 7)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** ldnt1_vnum_u8_8: +** incb x0, all, mul #8 +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u8_8, svuint8_t, uint8_t, + z0 = svldnt1_vnum_u8 (p0, x0, 8), + z0 = svldnt1_vnum (p0, x0, 8)) + +/* +** ldnt1_vnum_u8_m1: +** ldnt1b z0\.b, p0/z, \[x0, #-1, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u8_m1, svuint8_t, uint8_t, + z0 = svldnt1_vnum_u8 (p0, x0, -1), + z0 = svldnt1_vnum (p0, x0, -1)) + +/* +** ldnt1_vnum_u8_m8: +** ldnt1b z0\.b, p0/z, \[x0, #-8, mul vl\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u8_m8, svuint8_t, uint8_t, + z0 = svldnt1_vnum_u8 (p0, x0, -8), + z0 = svldnt1_vnum (p0, x0, -8)) + +/* Moving the constant into a register would also be OK. */ +/* +** ldnt1_vnum_u8_m9: +** decb x0, all, mul #9 +** ldnt1b z0\.b, p0/z, \[x0\] +** ret +*/ +TEST_LOAD (ldnt1_vnum_u8_m9, svuint8_t, uint8_t, + z0 = svldnt1_vnum_u8 (p0, x0, -9), + z0 = svldnt1_vnum (p0, x0, -9)) + +/* +** ldnt1_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** ldnt1b z0\.b, p0/z, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** ldnt1b z0\.b, p0/z, \[x0, \3\] +** ) +** ret +*/ +TEST_LOAD (ldnt1_vnum_u8_x1, svuint8_t, uint8_t, + z0 = svldnt1_vnum_u8 (p0, x0, x1), + z0 = svldnt1_vnum (p0, x0, x1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_bf16.c new file mode 100644 index 000000000..cd91ff48d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_bf16.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_bf16: +** cnth x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_bf16, uint64_t, svbfloat16_t, + x0 = svlen_bf16 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f16.c new file mode 100644 index 000000000..aa6d94bbc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f16.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_f16: +** cnth x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_f16, uint64_t, svfloat16_t, + x0 = svlen_f16 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f32.c new file mode 100644 index 000000000..1dd50cee0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f32.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_f32: +** cntw x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_f32, uint64_t, svfloat32_t, + x0 = svlen_f32 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f64.c new file mode 100644 index 000000000..1f210653e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_f64.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_f64: +** cntd x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_f64, uint64_t, svfloat64_t, + x0 = svlen_f64 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s16.c new file mode 100644 index 000000000..f56796182 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s16.c @@ -0,0 +1,12 @@ 
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_s16: +** cnth x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_s16, uint64_t, svint16_t, + x0 = svlen_s16 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s32.c new file mode 100644 index 000000000..662fac177 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s32.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_s32: +** cntw x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_s32, uint64_t, svint32_t, + x0 = svlen_s32 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s64.c new file mode 100644 index 000000000..f95770302 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s64.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_s64: +** cntd x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_s64, uint64_t, svint64_t, + x0 = svlen_s64 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s8.c new file mode 100644 index 000000000..6ed8a7177 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_s8.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_s8: +** cntb x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_s8, uint64_t, svint8_t, + x0 = svlen_s8 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u16.c new file mode 100644 index 000000000..13692c927 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u16.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_u16: +** cnth x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_u16, uint64_t, svuint16_t, + x0 = svlen_u16 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u32.c new file mode 100644 index 000000000..b03146089 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u32.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_u32: +** cntw x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_u32, uint64_t, svuint32_t, + x0 = svlen_u32 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u64.c new file mode 100644 index 000000000..11f2e4b81 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u64.c @@ -0,0 +1,12 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_u64: +** cntd x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_u64, uint64_t, svuint64_t, + x0 = svlen_u64 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u8.c new file mode 100644 index 000000000..fbd39a432 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/len_u8.c @@ -0,0 +1,12 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** len_x0_u8: +** cntb x0 +** ret +*/ +TEST_REDUCTION_X (len_x0_u8, uint64_t, svuint8_t, + x0 = svlen_u8 (z0), + x0 = svlen (z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s16.c new file mode 100644 index 000000000..edaaca5f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s16.c @@ -0,0 +1,351 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_s16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (lsl_s16_m_tied1, svint16_t, svuint16_t, + z0 = svlsl_s16_m (p0, z0, z4), + z0 = svlsl_m (p0, z0, z4)) + +/* +** lsl_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** lsl z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (lsl_s16_m_tied2, svint16_t, svuint16_t, + z0_res = svlsl_s16_m (p0, z4, z0), + z0_res = svlsl_m (p0, z4, z0)) + +/* +** lsl_s16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (lsl_s16_m_untied, svint16_t, svuint16_t, + z0 = svlsl_s16_m (p0, z1, z4), + z0 = svlsl_m (p0, z1, z4)) + +/* +** lsl_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s16_m_tied1, svint16_t, uint16_t, + z0 = svlsl_n_s16_m (p0, z0, x0), + z0 = svlsl_m (p0, z0, x0)) + +/* +** lsl_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s16_m_untied, svint16_t, uint16_t, + z0 = svlsl_n_s16_m (p0, z1, x0), + z0 = svlsl_m (p0, z1, x0)) + +/* +** lsl_1_s16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s16_m_tied1, svint16_t, + z0 = svlsl_n_s16_m (p0, z0, 1), + z0 = svlsl_m (p0, z0, 1)) + +/* +** lsl_1_s16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s16_m_untied, svint16_t, + z0 = svlsl_n_s16_m (p0, z1, 1), + z0 = svlsl_m (p0, z1, 1)) + +/* +** lsl_15_s16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_s16_m_tied1, svint16_t, + z0 = svlsl_n_s16_m (p0, z0, 15), + z0 = svlsl_m (p0, z0, 15)) + +/* +** lsl_15_s16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_s16_m_untied, svint16_t, + z0 = svlsl_n_s16_m (p0, z1, 15), + z0 = svlsl_m (p0, z1, 15)) + +/* +** lsl_16_s16_m_tied1: +** mov (z[0-9]+\.h), #16 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_16_s16_m_tied1, svint16_t, + z0 = svlsl_n_s16_m (p0, z0, 16), + z0 = svlsl_m (p0, z0, 16)) + +/* +** lsl_16_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #16 +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_16_s16_m_untied, svint16_t, + z0 = svlsl_n_s16_m (p0, z1, 16), + z0 = svlsl_m (p0, z1, 16)) + +/* +** lsl_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (lsl_s16_z_tied1, svint16_t, svuint16_t, + z0 = svlsl_s16_z (p0, z0, z4), + z0 = svlsl_z (p0, z0, z4)) + +/* +** lsl_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** lslr z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z_REV (lsl_s16_z_tied2, svint16_t, svuint16_t, + z0_res = svlsl_s16_z (p0, z4, z0), + z0_res = svlsl_z (p0, z4, z0)) + +/* +** lsl_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, z4\.h +** | +** movprfx z0\.h, p0/z, z4\.h +** lslr z0\.h, 
p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_DUAL_Z (lsl_s16_z_untied, svint16_t, svuint16_t, + z0 = svlsl_s16_z (p0, z1, z4), + z0 = svlsl_z (p0, z1, z4)) + +/* +** lsl_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s16_z_tied1, svint16_t, uint16_t, + z0 = svlsl_n_s16_z (p0, z0, x0), + z0 = svlsl_z (p0, z0, x0)) + +/* +** lsl_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s16_z_untied, svint16_t, uint16_t, + z0 = svlsl_n_s16_z (p0, z1, x0), + z0 = svlsl_z (p0, z1, x0)) + +/* +** lsl_1_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s16_z_tied1, svint16_t, + z0 = svlsl_n_s16_z (p0, z0, 1), + z0 = svlsl_z (p0, z0, 1)) + +/* +** lsl_1_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s16_z_untied, svint16_t, + z0 = svlsl_n_s16_z (p0, z1, 1), + z0 = svlsl_z (p0, z1, 1)) + +/* +** lsl_15_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_s16_z_tied1, svint16_t, + z0 = svlsl_n_s16_z (p0, z0, 15), + z0 = svlsl_z (p0, z0, 15)) + +/* +** lsl_15_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_s16_z_untied, svint16_t, + z0 = svlsl_n_s16_z (p0, z1, 15), + z0 = svlsl_z (p0, z1, 15)) + +/* +** lsl_16_s16_z_tied1: +** mov (z[0-9]+\.h), #16 +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_16_s16_z_tied1, svint16_t, + z0 = svlsl_n_s16_z (p0, z0, 16), + z0 = svlsl_z (p0, z0, 16)) + +/* +** lsl_16_s16_z_untied: +** mov (z[0-9]+\.h), #16 +** ( +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_16_s16_z_untied, svint16_t, + z0 = svlsl_n_s16_z (p0, z1, 16), + z0 = svlsl_z (p0, z1, 16)) + +/* +** lsl_s16_x_tied1: +** lsl z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (lsl_s16_x_tied1, svint16_t, svuint16_t, + z0 = svlsl_s16_x (p0, z0, z4), + z0 = svlsl_x (p0, z0, z4)) + +/* +** lsl_s16_x_tied2: +** lslr z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z_REV (lsl_s16_x_tied2, svint16_t, svuint16_t, + z0_res = svlsl_s16_x (p0, z4, z0), + z0_res = svlsl_x (p0, z4, z0)) + +/* +** lsl_s16_x_untied: +** ( +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, z4\.h +** | +** movprfx z0, z4 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_DUAL_Z (lsl_s16_x_untied, svint16_t, svuint16_t, + z0 = svlsl_s16_x (p0, z1, z4), + z0 = svlsl_x (p0, z1, z4)) + +/* +** lsl_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s16_x_tied1, svint16_t, uint16_t, + z0 = svlsl_n_s16_x (p0, z0, x0), + z0 = svlsl_x (p0, z0, x0)) + +/* +** lsl_w0_s16_x_untied: +** mov z0\.h, w0 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s16_x_untied, svint16_t, uint16_t, + z0 = svlsl_n_s16_x (p0, z1, x0), + z0 = svlsl_x (p0, z1, x0)) + +/* +** lsl_1_s16_x_tied1: +** lsl z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s16_x_tied1, svint16_t, + z0 = svlsl_n_s16_x (p0, z0, 1), + z0 = svlsl_x (p0, z0, 1)) + +/* +** lsl_1_s16_x_untied: +** lsl z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z 
(lsl_1_s16_x_untied, svint16_t, + z0 = svlsl_n_s16_x (p0, z1, 1), + z0 = svlsl_x (p0, z1, 1)) + +/* +** lsl_15_s16_x_tied1: +** lsl z0\.h, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_s16_x_tied1, svint16_t, + z0 = svlsl_n_s16_x (p0, z0, 15), + z0 = svlsl_x (p0, z0, 15)) + +/* +** lsl_15_s16_x_untied: +** lsl z0\.h, z1\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_s16_x_untied, svint16_t, + z0 = svlsl_n_s16_x (p0, z1, 15), + z0 = svlsl_x (p0, z1, 15)) + +/* +** lsl_16_s16_x_tied1: +** mov (z[0-9]+\.h), #16 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_16_s16_x_tied1, svint16_t, + z0 = svlsl_n_s16_x (p0, z0, 16), + z0 = svlsl_x (p0, z0, 16)) + +/* +** lsl_16_s16_x_untied: +** mov z0\.h, #16 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_16_s16_x_untied, svint16_t, + z0 = svlsl_n_s16_x (p0, z1, 16), + z0 = svlsl_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s32.c new file mode 100644 index 000000000..f98f1f94b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s32.c @@ -0,0 +1,351 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_s32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (lsl_s32_m_tied1, svint32_t, svuint32_t, + z0 = svlsl_s32_m (p0, z0, z4), + z0 = svlsl_m (p0, z0, z4)) + +/* +** lsl_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** lsl z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (lsl_s32_m_tied2, svint32_t, svuint32_t, + z0_res = svlsl_s32_m (p0, z4, z0), + z0_res = svlsl_m (p0, z4, z0)) + +/* +** lsl_s32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (lsl_s32_m_untied, svint32_t, svuint32_t, + z0 = svlsl_s32_m (p0, z1, z4), + z0 = svlsl_m (p0, z1, z4)) + +/* +** lsl_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s32_m_tied1, svint32_t, uint32_t, + z0 = svlsl_n_s32_m (p0, z0, x0), + z0 = svlsl_m (p0, z0, x0)) + +/* +** lsl_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s32_m_untied, svint32_t, uint32_t, + z0 = svlsl_n_s32_m (p0, z1, x0), + z0 = svlsl_m (p0, z1, x0)) + +/* +** lsl_1_s32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s32_m_tied1, svint32_t, + z0 = svlsl_n_s32_m (p0, z0, 1), + z0 = svlsl_m (p0, z0, 1)) + +/* +** lsl_1_s32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s32_m_untied, svint32_t, + z0 = svlsl_n_s32_m (p0, z1, 1), + z0 = svlsl_m (p0, z1, 1)) + +/* +** lsl_31_s32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_s32_m_tied1, svint32_t, + z0 = svlsl_n_s32_m (p0, z0, 31), + z0 = svlsl_m (p0, z0, 31)) + +/* +** lsl_31_s32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_s32_m_untied, svint32_t, + z0 = svlsl_n_s32_m (p0, z1, 31), + z0 = svlsl_m (p0, z1, 31)) + +/* +** lsl_32_s32_m_tied1: +** mov (z[0-9]+\.s), #32 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_32_s32_m_tied1, svint32_t, + z0 = svlsl_n_s32_m (p0, z0, 32), + z0 = svlsl_m (p0, z0, 32)) + +/* +** lsl_32_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #32 +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_32_s32_m_untied, svint32_t, + z0 = svlsl_n_s32_m (p0, z1, 
32), + z0 = svlsl_m (p0, z1, 32)) + +/* +** lsl_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (lsl_s32_z_tied1, svint32_t, svuint32_t, + z0 = svlsl_s32_z (p0, z0, z4), + z0 = svlsl_z (p0, z0, z4)) + +/* +** lsl_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** lslr z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z_REV (lsl_s32_z_tied2, svint32_t, svuint32_t, + z0_res = svlsl_s32_z (p0, z4, z0), + z0_res = svlsl_z (p0, z4, z0)) + +/* +** lsl_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, z4\.s +** | +** movprfx z0\.s, p0/z, z4\.s +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_DUAL_Z (lsl_s32_z_untied, svint32_t, svuint32_t, + z0 = svlsl_s32_z (p0, z1, z4), + z0 = svlsl_z (p0, z1, z4)) + +/* +** lsl_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s32_z_tied1, svint32_t, uint32_t, + z0 = svlsl_n_s32_z (p0, z0, x0), + z0 = svlsl_z (p0, z0, x0)) + +/* +** lsl_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s32_z_untied, svint32_t, uint32_t, + z0 = svlsl_n_s32_z (p0, z1, x0), + z0 = svlsl_z (p0, z1, x0)) + +/* +** lsl_1_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s32_z_tied1, svint32_t, + z0 = svlsl_n_s32_z (p0, z0, 1), + z0 = svlsl_z (p0, z0, 1)) + +/* +** lsl_1_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s32_z_untied, svint32_t, + z0 = svlsl_n_s32_z (p0, z1, 1), + z0 = svlsl_z (p0, z1, 1)) + +/* +** lsl_31_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_s32_z_tied1, svint32_t, + z0 = svlsl_n_s32_z (p0, z0, 31), + z0 = svlsl_z (p0, z0, 31)) + +/* +** lsl_31_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_s32_z_untied, svint32_t, + z0 = svlsl_n_s32_z (p0, z1, 31), + z0 = svlsl_z (p0, z1, 31)) + +/* +** lsl_32_s32_z_tied1: +** mov (z[0-9]+\.s), #32 +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_32_s32_z_tied1, svint32_t, + z0 = svlsl_n_s32_z (p0, z0, 32), + z0 = svlsl_z (p0, z0, 32)) + +/* +** lsl_32_s32_z_untied: +** mov (z[0-9]+\.s), #32 +** ( +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_32_s32_z_untied, svint32_t, + z0 = svlsl_n_s32_z (p0, z1, 32), + z0 = svlsl_z (p0, z1, 32)) + +/* +** lsl_s32_x_tied1: +** lsl z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (lsl_s32_x_tied1, svint32_t, svuint32_t, + z0 = svlsl_s32_x (p0, z0, z4), + z0 = svlsl_x (p0, z0, z4)) + +/* +** lsl_s32_x_tied2: +** lslr z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z_REV (lsl_s32_x_tied2, svint32_t, svuint32_t, + z0_res = svlsl_s32_x (p0, z4, z0), + z0_res = svlsl_x (p0, z4, z0)) + +/* +** lsl_s32_x_untied: +** ( +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, z4\.s +** | +** movprfx z0, z4 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_DUAL_Z (lsl_s32_x_untied, svint32_t, svuint32_t, + z0 = svlsl_s32_x (p0, z1, z4), + z0 = svlsl_x (p0, z1, z4)) + +/* +** lsl_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** lsl z0\.s, 
p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s32_x_tied1, svint32_t, uint32_t, + z0 = svlsl_n_s32_x (p0, z0, x0), + z0 = svlsl_x (p0, z0, x0)) + +/* +** lsl_w0_s32_x_untied: +** mov z0\.s, w0 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s32_x_untied, svint32_t, uint32_t, + z0 = svlsl_n_s32_x (p0, z1, x0), + z0 = svlsl_x (p0, z1, x0)) + +/* +** lsl_1_s32_x_tied1: +** lsl z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s32_x_tied1, svint32_t, + z0 = svlsl_n_s32_x (p0, z0, 1), + z0 = svlsl_x (p0, z0, 1)) + +/* +** lsl_1_s32_x_untied: +** lsl z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s32_x_untied, svint32_t, + z0 = svlsl_n_s32_x (p0, z1, 1), + z0 = svlsl_x (p0, z1, 1)) + +/* +** lsl_31_s32_x_tied1: +** lsl z0\.s, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_s32_x_tied1, svint32_t, + z0 = svlsl_n_s32_x (p0, z0, 31), + z0 = svlsl_x (p0, z0, 31)) + +/* +** lsl_31_s32_x_untied: +** lsl z0\.s, z1\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_s32_x_untied, svint32_t, + z0 = svlsl_n_s32_x (p0, z1, 31), + z0 = svlsl_x (p0, z1, 31)) + +/* +** lsl_32_s32_x_tied1: +** mov (z[0-9]+\.s), #32 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_32_s32_x_tied1, svint32_t, + z0 = svlsl_n_s32_x (p0, z0, 32), + z0 = svlsl_x (p0, z0, 32)) + +/* +** lsl_32_s32_x_untied: +** mov z0\.s, #32 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_32_s32_x_untied, svint32_t, + z0 = svlsl_n_s32_x (p0, z1, 32), + z0 = svlsl_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s64.c new file mode 100644 index 000000000..39753986b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s64.c @@ -0,0 +1,351 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_s64_m_tied1: +** lsl z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_s64_m_tied1, svint64_t, svuint64_t, + z0 = svlsl_s64_m (p0, z0, z4), + z0 = svlsl_m (p0, z0, z4)) + +/* +** lsl_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_s64_m_tied2, svint64_t, svuint64_t, + z0_res = svlsl_s64_m (p0, z4, z0), + z0_res = svlsl_m (p0, z4, z0)) + +/* +** lsl_s64_m_untied: +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_s64_m_untied, svint64_t, svuint64_t, + z0 = svlsl_s64_m (p0, z1, z4), + z0 = svlsl_m (p0, z1, z4)) + +/* +** lsl_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_s64_m_tied1, svint64_t, uint64_t, + z0 = svlsl_n_s64_m (p0, z0, x0), + z0 = svlsl_m (p0, z0, x0)) + +/* +** lsl_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_s64_m_untied, svint64_t, uint64_t, + z0 = svlsl_n_s64_m (p0, z1, x0), + z0 = svlsl_m (p0, z1, x0)) + +/* +** lsl_1_s64_m_tied1: +** lsl z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s64_m_tied1, svint64_t, + z0 = svlsl_n_s64_m (p0, z0, 1), + z0 = svlsl_m (p0, z0, 1)) + +/* +** lsl_1_s64_m_untied: +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s64_m_untied, svint64_t, + z0 = svlsl_n_s64_m (p0, z1, 1), + z0 = svlsl_m (p0, z1, 1)) + +/* +** lsl_63_s64_m_tied1: +** lsl z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_s64_m_tied1, svint64_t, + z0 = svlsl_n_s64_m (p0, z0, 63), + z0 = svlsl_m (p0, z0, 63)) + +/* 
+** lsl_63_s64_m_untied: +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_s64_m_untied, svint64_t, + z0 = svlsl_n_s64_m (p0, z1, 63), + z0 = svlsl_m (p0, z1, 63)) + +/* +** lsl_64_s64_m_tied1: +** mov (z[0-9]+\.d), #64 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_64_s64_m_tied1, svint64_t, + z0 = svlsl_n_s64_m (p0, z0, 64), + z0 = svlsl_m (p0, z0, 64)) + +/* +** lsl_64_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #64 +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_64_s64_m_untied, svint64_t, + z0 = svlsl_n_s64_m (p0, z1, 64), + z0 = svlsl_m (p0, z1, 64)) + +/* +** lsl_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_s64_z_tied1, svint64_t, svuint64_t, + z0 = svlsl_s64_z (p0, z0, z4), + z0 = svlsl_z (p0, z0, z4)) + +/* +** lsl_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** lslr z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z_REV (lsl_s64_z_tied2, svint64_t, svuint64_t, + z0_res = svlsl_s64_z (p0, z4, z0), + z0_res = svlsl_z (p0, z4, z0)) + +/* +** lsl_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, z4\.d +** | +** movprfx z0\.d, p0/z, z4\.d +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_DUAL_Z (lsl_s64_z_untied, svint64_t, svuint64_t, + z0 = svlsl_s64_z (p0, z1, z4), + z0 = svlsl_z (p0, z1, z4)) + +/* +** lsl_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_s64_z_tied1, svint64_t, uint64_t, + z0 = svlsl_n_s64_z (p0, z0, x0), + z0 = svlsl_z (p0, z0, x0)) + +/* +** lsl_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_s64_z_untied, svint64_t, uint64_t, + z0 = svlsl_n_s64_z (p0, z1, x0), + z0 = svlsl_z (p0, z1, x0)) + +/* +** lsl_1_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s64_z_tied1, svint64_t, + z0 = svlsl_n_s64_z (p0, z0, 1), + z0 = svlsl_z (p0, z0, 1)) + +/* +** lsl_1_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s64_z_untied, svint64_t, + z0 = svlsl_n_s64_z (p0, z1, 1), + z0 = svlsl_z (p0, z1, 1)) + +/* +** lsl_63_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_s64_z_tied1, svint64_t, + z0 = svlsl_n_s64_z (p0, z0, 63), + z0 = svlsl_z (p0, z0, 63)) + +/* +** lsl_63_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_s64_z_untied, svint64_t, + z0 = svlsl_n_s64_z (p0, z1, 63), + z0 = svlsl_z (p0, z1, 63)) + +/* +** lsl_64_s64_z_tied1: +** mov (z[0-9]+\.d), #64 +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_64_s64_z_tied1, svint64_t, + z0 = svlsl_n_s64_z (p0, z0, 64), + z0 = svlsl_z (p0, z0, 64)) + +/* +** lsl_64_s64_z_untied: +** mov (z[0-9]+\.d), #64 +** ( +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_64_s64_z_untied, svint64_t, + z0 = svlsl_n_s64_z (p0, z1, 64), + z0 = svlsl_z (p0, z1, 64)) + +/* +** lsl_s64_x_tied1: +** lsl z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_s64_x_tied1, svint64_t, 
svuint64_t, + z0 = svlsl_s64_x (p0, z0, z4), + z0 = svlsl_x (p0, z0, z4)) + +/* +** lsl_s64_x_tied2: +** lslr z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z_REV (lsl_s64_x_tied2, svint64_t, svuint64_t, + z0_res = svlsl_s64_x (p0, z4, z0), + z0_res = svlsl_x (p0, z4, z0)) + +/* +** lsl_s64_x_untied: +** ( +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, z4\.d +** | +** movprfx z0, z4 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_DUAL_Z (lsl_s64_x_untied, svint64_t, svuint64_t, + z0 = svlsl_s64_x (p0, z1, z4), + z0 = svlsl_x (p0, z1, z4)) + +/* +** lsl_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_s64_x_tied1, svint64_t, uint64_t, + z0 = svlsl_n_s64_x (p0, z0, x0), + z0 = svlsl_x (p0, z0, x0)) + +/* +** lsl_x0_s64_x_untied: +** mov z0\.d, x0 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_s64_x_untied, svint64_t, uint64_t, + z0 = svlsl_n_s64_x (p0, z1, x0), + z0 = svlsl_x (p0, z1, x0)) + +/* +** lsl_1_s64_x_tied1: +** lsl z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s64_x_tied1, svint64_t, + z0 = svlsl_n_s64_x (p0, z0, 1), + z0 = svlsl_x (p0, z0, 1)) + +/* +** lsl_1_s64_x_untied: +** lsl z0\.d, z1\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s64_x_untied, svint64_t, + z0 = svlsl_n_s64_x (p0, z1, 1), + z0 = svlsl_x (p0, z1, 1)) + +/* +** lsl_63_s64_x_tied1: +** lsl z0\.d, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_s64_x_tied1, svint64_t, + z0 = svlsl_n_s64_x (p0, z0, 63), + z0 = svlsl_x (p0, z0, 63)) + +/* +** lsl_63_s64_x_untied: +** lsl z0\.d, z1\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_s64_x_untied, svint64_t, + z0 = svlsl_n_s64_x (p0, z1, 63), + z0 = svlsl_x (p0, z1, 63)) + +/* +** lsl_64_s64_x_tied1: +** mov (z[0-9]+\.d), #64 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_64_s64_x_tied1, svint64_t, + z0 = svlsl_n_s64_x (p0, z0, 64), + z0 = svlsl_x (p0, z0, 64)) + +/* +** lsl_64_s64_x_untied: +** mov z0\.d, #64 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsl_64_s64_x_untied, svint64_t, + z0 = svlsl_n_s64_x (p0, z1, 64), + z0 = svlsl_x (p0, z1, 64)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s8.c new file mode 100644 index 000000000..9a9cc959c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_s8.c @@ -0,0 +1,351 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_s8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (lsl_s8_m_tied1, svint8_t, svuint8_t, + z0 = svlsl_s8_m (p0, z0, z4), + z0 = svlsl_m (p0, z0, z4)) + +/* +** lsl_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** lsl z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_DUAL_Z_REV (lsl_s8_m_tied2, svint8_t, svuint8_t, + z0_res = svlsl_s8_m (p0, z4, z0), + z0_res = svlsl_m (p0, z4, z0)) + +/* +** lsl_s8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (lsl_s8_m_untied, svint8_t, svuint8_t, + z0 = svlsl_s8_m (p0, z1, z4), + z0 = svlsl_m (p0, z1, z4)) + +/* +** lsl_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s8_m_tied1, svint8_t, uint8_t, + z0 = svlsl_n_s8_m (p0, z0, x0), + z0 = svlsl_m (p0, z0, x0)) + +/* +** lsl_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s8_m_untied, svint8_t, uint8_t, + z0 = 
svlsl_n_s8_m (p0, z1, x0), + z0 = svlsl_m (p0, z1, x0)) + +/* +** lsl_1_s8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s8_m_tied1, svint8_t, + z0 = svlsl_n_s8_m (p0, z0, 1), + z0 = svlsl_m (p0, z0, 1)) + +/* +** lsl_1_s8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s8_m_untied, svint8_t, + z0 = svlsl_n_s8_m (p0, z1, 1), + z0 = svlsl_m (p0, z1, 1)) + +/* +** lsl_7_s8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_s8_m_tied1, svint8_t, + z0 = svlsl_n_s8_m (p0, z0, 7), + z0 = svlsl_m (p0, z0, 7)) + +/* +** lsl_7_s8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_s8_m_untied, svint8_t, + z0 = svlsl_n_s8_m (p0, z1, 7), + z0 = svlsl_m (p0, z1, 7)) + +/* +** lsl_8_s8_m_tied1: +** mov (z[0-9]+\.b), #8 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_8_s8_m_tied1, svint8_t, + z0 = svlsl_n_s8_m (p0, z0, 8), + z0 = svlsl_m (p0, z0, 8)) + +/* +** lsl_8_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #8 +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_8_s8_m_untied, svint8_t, + z0 = svlsl_n_s8_m (p0, z1, 8), + z0 = svlsl_m (p0, z1, 8)) + +/* +** lsl_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (lsl_s8_z_tied1, svint8_t, svuint8_t, + z0 = svlsl_s8_z (p0, z0, z4), + z0 = svlsl_z (p0, z0, z4)) + +/* +** lsl_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** lslr z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z_REV (lsl_s8_z_tied2, svint8_t, svuint8_t, + z0_res = svlsl_s8_z (p0, z4, z0), + z0_res = svlsl_z (p0, z4, z0)) + +/* +** lsl_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, z4\.b +** | +** movprfx z0\.b, p0/z, z4\.b +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_DUAL_Z (lsl_s8_z_untied, svint8_t, svuint8_t, + z0 = svlsl_s8_z (p0, z1, z4), + z0 = svlsl_z (p0, z1, z4)) + +/* +** lsl_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s8_z_tied1, svint8_t, uint8_t, + z0 = svlsl_n_s8_z (p0, z0, x0), + z0 = svlsl_z (p0, z0, x0)) + +/* +** lsl_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s8_z_untied, svint8_t, uint8_t, + z0 = svlsl_n_s8_z (p0, z1, x0), + z0 = svlsl_z (p0, z1, x0)) + +/* +** lsl_1_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s8_z_tied1, svint8_t, + z0 = svlsl_n_s8_z (p0, z0, 1), + z0 = svlsl_z (p0, z0, 1)) + +/* +** lsl_1_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s8_z_untied, svint8_t, + z0 = svlsl_n_s8_z (p0, z1, 1), + z0 = svlsl_z (p0, z1, 1)) + +/* +** lsl_7_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_s8_z_tied1, svint8_t, + z0 = svlsl_n_s8_z (p0, z0, 7), + z0 = svlsl_z (p0, z0, 7)) + +/* +** lsl_7_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_s8_z_untied, svint8_t, + z0 = svlsl_n_s8_z (p0, z1, 7), + z0 = svlsl_z (p0, z1, 7)) + +/* +** lsl_8_s8_z_tied1: +** mov (z[0-9]+\.b), #8 +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z 
(lsl_8_s8_z_tied1, svint8_t, + z0 = svlsl_n_s8_z (p0, z0, 8), + z0 = svlsl_z (p0, z0, 8)) + +/* +** lsl_8_s8_z_untied: +** mov (z[0-9]+\.b), #8 +** ( +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_8_s8_z_untied, svint8_t, + z0 = svlsl_n_s8_z (p0, z1, 8), + z0 = svlsl_z (p0, z1, 8)) + +/* +** lsl_s8_x_tied1: +** lsl z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (lsl_s8_x_tied1, svint8_t, svuint8_t, + z0 = svlsl_s8_x (p0, z0, z4), + z0 = svlsl_x (p0, z0, z4)) + +/* +** lsl_s8_x_tied2: +** lslr z0\.b, p0/m, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z_REV (lsl_s8_x_tied2, svint8_t, svuint8_t, + z0_res = svlsl_s8_x (p0, z4, z0), + z0_res = svlsl_x (p0, z4, z0)) + +/* +** lsl_s8_x_untied: +** ( +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, z4\.b +** | +** movprfx z0, z4 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_DUAL_Z (lsl_s8_x_untied, svint8_t, svuint8_t, + z0 = svlsl_s8_x (p0, z1, z4), + z0 = svlsl_x (p0, z1, z4)) + +/* +** lsl_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s8_x_tied1, svint8_t, uint8_t, + z0 = svlsl_n_s8_x (p0, z0, x0), + z0 = svlsl_x (p0, z0, x0)) + +/* +** lsl_w0_s8_x_untied: +** mov z0\.b, w0 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_s8_x_untied, svint8_t, uint8_t, + z0 = svlsl_n_s8_x (p0, z1, x0), + z0 = svlsl_x (p0, z1, x0)) + +/* +** lsl_1_s8_x_tied1: +** lsl z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s8_x_tied1, svint8_t, + z0 = svlsl_n_s8_x (p0, z0, 1), + z0 = svlsl_x (p0, z0, 1)) + +/* +** lsl_1_s8_x_untied: +** lsl z0\.b, z1\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_s8_x_untied, svint8_t, + z0 = svlsl_n_s8_x (p0, z1, 1), + z0 = svlsl_x (p0, z1, 1)) + +/* +** lsl_7_s8_x_tied1: +** lsl z0\.b, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_s8_x_tied1, svint8_t, + z0 = svlsl_n_s8_x (p0, z0, 7), + z0 = svlsl_x (p0, z0, 7)) + +/* +** lsl_7_s8_x_untied: +** lsl z0\.b, z1\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_s8_x_untied, svint8_t, + z0 = svlsl_n_s8_x (p0, z1, 7), + z0 = svlsl_x (p0, z1, 7)) + +/* +** lsl_8_s8_x_tied1: +** mov (z[0-9]+\.b), #8 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_8_s8_x_tied1, svint8_t, + z0 = svlsl_n_s8_x (p0, z0, 8), + z0 = svlsl_x (p0, z0, 8)) + +/* +** lsl_8_s8_x_untied: +** mov z0\.b, #8 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_8_s8_x_untied, svint8_t, + z0 = svlsl_n_s8_x (p0, z1, 8), + z0 = svlsl_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u16.c new file mode 100644 index 000000000..57db0fda6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u16.c @@ -0,0 +1,351 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_u16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_m_tied1, svuint16_t, + z0 = svlsl_u16_m (p0, z0, z1), + z0 = svlsl_m (p0, z0, z1)) + +/* +** lsl_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_m_tied2, svuint16_t, + z0 = svlsl_u16_m (p0, z1, z0), + z0 = svlsl_m (p0, z1, z0)) + +/* +** lsl_u16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_m_untied, svuint16_t, + z0 = svlsl_u16_m (p0, z1, z2), + z0 = svlsl_m (p0, 
z1, z2)) + +/* +** lsl_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svlsl_n_u16_m (p0, z0, x0), + z0 = svlsl_m (p0, z0, x0)) + +/* +** lsl_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svlsl_n_u16_m (p0, z1, x0), + z0 = svlsl_m (p0, z1, x0)) + +/* +** lsl_1_u16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u16_m_tied1, svuint16_t, + z0 = svlsl_n_u16_m (p0, z0, 1), + z0 = svlsl_m (p0, z0, 1)) + +/* +** lsl_1_u16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u16_m_untied, svuint16_t, + z0 = svlsl_n_u16_m (p0, z1, 1), + z0 = svlsl_m (p0, z1, 1)) + +/* +** lsl_15_u16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_u16_m_tied1, svuint16_t, + z0 = svlsl_n_u16_m (p0, z0, 15), + z0 = svlsl_m (p0, z0, 15)) + +/* +** lsl_15_u16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_u16_m_untied, svuint16_t, + z0 = svlsl_n_u16_m (p0, z1, 15), + z0 = svlsl_m (p0, z1, 15)) + +/* +** lsl_16_u16_m_tied1: +** mov (z[0-9]+\.h), #16 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_16_u16_m_tied1, svuint16_t, + z0 = svlsl_n_u16_m (p0, z0, 16), + z0 = svlsl_m (p0, z0, 16)) + +/* +** lsl_16_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #16 +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_16_u16_m_untied, svuint16_t, + z0 = svlsl_n_u16_m (p0, z1, 16), + z0 = svlsl_m (p0, z1, 16)) + +/* +** lsl_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_z_tied1, svuint16_t, + z0 = svlsl_u16_z (p0, z0, z1), + z0 = svlsl_z (p0, z0, z1)) + +/* +** lsl_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_z_tied2, svuint16_t, + z0 = svlsl_u16_z (p0, z1, z0), + z0 = svlsl_z (p0, z1, z0)) + +/* +** lsl_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_z_untied, svuint16_t, + z0 = svlsl_u16_z (p0, z1, z2), + z0 = svlsl_z (p0, z1, z2)) + +/* +** lsl_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svlsl_n_u16_z (p0, z0, x0), + z0 = svlsl_z (p0, z0, x0)) + +/* +** lsl_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svlsl_n_u16_z (p0, z1, x0), + z0 = svlsl_z (p0, z1, x0)) + +/* +** lsl_1_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u16_z_tied1, svuint16_t, + z0 = svlsl_n_u16_z (p0, z0, 1), + z0 = svlsl_z (p0, z0, 1)) + +/* +** lsl_1_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u16_z_untied, svuint16_t, + z0 = svlsl_n_u16_z (p0, z1, 1), + z0 = svlsl_z (p0, z1, 1)) + +/* +** lsl_15_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, 
z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_u16_z_tied1, svuint16_t, + z0 = svlsl_n_u16_z (p0, z0, 15), + z0 = svlsl_z (p0, z0, 15)) + +/* +** lsl_15_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_u16_z_untied, svuint16_t, + z0 = svlsl_n_u16_z (p0, z1, 15), + z0 = svlsl_z (p0, z1, 15)) + +/* +** lsl_16_u16_z_tied1: +** mov (z[0-9]+\.h), #16 +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_16_u16_z_tied1, svuint16_t, + z0 = svlsl_n_u16_z (p0, z0, 16), + z0 = svlsl_z (p0, z0, 16)) + +/* +** lsl_16_u16_z_untied: +** mov (z[0-9]+\.h), #16 +** ( +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_16_u16_z_untied, svuint16_t, + z0 = svlsl_n_u16_z (p0, z1, 16), + z0 = svlsl_z (p0, z1, 16)) + +/* +** lsl_u16_x_tied1: +** lsl z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_x_tied1, svuint16_t, + z0 = svlsl_u16_x (p0, z0, z1), + z0 = svlsl_x (p0, z0, z1)) + +/* +** lsl_u16_x_tied2: +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_x_tied2, svuint16_t, + z0 = svlsl_u16_x (p0, z1, z0), + z0 = svlsl_x (p0, z1, z0)) + +/* +** lsl_u16_x_untied: +** ( +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_u16_x_untied, svuint16_t, + z0 = svlsl_u16_x (p0, z1, z2), + z0 = svlsl_x (p0, z1, z2)) + +/* +** lsl_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svlsl_n_u16_x (p0, z0, x0), + z0 = svlsl_x (p0, z0, x0)) + +/* +** lsl_w0_u16_x_untied: +** mov z0\.h, w0 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svlsl_n_u16_x (p0, z1, x0), + z0 = svlsl_x (p0, z1, x0)) + +/* +** lsl_1_u16_x_tied1: +** lsl z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u16_x_tied1, svuint16_t, + z0 = svlsl_n_u16_x (p0, z0, 1), + z0 = svlsl_x (p0, z0, 1)) + +/* +** lsl_1_u16_x_untied: +** lsl z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u16_x_untied, svuint16_t, + z0 = svlsl_n_u16_x (p0, z1, 1), + z0 = svlsl_x (p0, z1, 1)) + +/* +** lsl_15_u16_x_tied1: +** lsl z0\.h, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_u16_x_tied1, svuint16_t, + z0 = svlsl_n_u16_x (p0, z0, 15), + z0 = svlsl_x (p0, z0, 15)) + +/* +** lsl_15_u16_x_untied: +** lsl z0\.h, z1\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_15_u16_x_untied, svuint16_t, + z0 = svlsl_n_u16_x (p0, z1, 15), + z0 = svlsl_x (p0, z1, 15)) + +/* +** lsl_16_u16_x_tied1: +** mov (z[0-9]+\.h), #16 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_16_u16_x_tied1, svuint16_t, + z0 = svlsl_n_u16_x (p0, z0, 16), + z0 = svlsl_x (p0, z0, 16)) + +/* +** lsl_16_u16_x_untied: +** mov z0\.h, #16 +** lslr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsl_16_u16_x_untied, svuint16_t, + z0 = svlsl_n_u16_x (p0, z1, 16), + z0 = svlsl_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u32.c new file mode 100644 index 000000000..8773f15db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u32.c @@ -0,0 +1,351 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_u32_m_tied1: +** lsl z0\.s, 
p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_m_tied1, svuint32_t, + z0 = svlsl_u32_m (p0, z0, z1), + z0 = svlsl_m (p0, z0, z1)) + +/* +** lsl_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_m_tied2, svuint32_t, + z0 = svlsl_u32_m (p0, z1, z0), + z0 = svlsl_m (p0, z1, z0)) + +/* +** lsl_u32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_m_untied, svuint32_t, + z0 = svlsl_u32_m (p0, z1, z2), + z0 = svlsl_m (p0, z1, z2)) + +/* +** lsl_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svlsl_n_u32_m (p0, z0, x0), + z0 = svlsl_m (p0, z0, x0)) + +/* +** lsl_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svlsl_n_u32_m (p0, z1, x0), + z0 = svlsl_m (p0, z1, x0)) + +/* +** lsl_1_u32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u32_m_tied1, svuint32_t, + z0 = svlsl_n_u32_m (p0, z0, 1), + z0 = svlsl_m (p0, z0, 1)) + +/* +** lsl_1_u32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u32_m_untied, svuint32_t, + z0 = svlsl_n_u32_m (p0, z1, 1), + z0 = svlsl_m (p0, z1, 1)) + +/* +** lsl_31_u32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_u32_m_tied1, svuint32_t, + z0 = svlsl_n_u32_m (p0, z0, 31), + z0 = svlsl_m (p0, z0, 31)) + +/* +** lsl_31_u32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_u32_m_untied, svuint32_t, + z0 = svlsl_n_u32_m (p0, z1, 31), + z0 = svlsl_m (p0, z1, 31)) + +/* +** lsl_32_u32_m_tied1: +** mov (z[0-9]+\.s), #32 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_32_u32_m_tied1, svuint32_t, + z0 = svlsl_n_u32_m (p0, z0, 32), + z0 = svlsl_m (p0, z0, 32)) + +/* +** lsl_32_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #32 +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_32_u32_m_untied, svuint32_t, + z0 = svlsl_n_u32_m (p0, z1, 32), + z0 = svlsl_m (p0, z1, 32)) + +/* +** lsl_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_z_tied1, svuint32_t, + z0 = svlsl_u32_z (p0, z0, z1), + z0 = svlsl_z (p0, z0, z1)) + +/* +** lsl_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_z_tied2, svuint32_t, + z0 = svlsl_u32_z (p0, z1, z0), + z0 = svlsl_z (p0, z1, z0)) + +/* +** lsl_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_z_untied, svuint32_t, + z0 = svlsl_u32_z (p0, z1, z2), + z0 = svlsl_z (p0, z1, z2)) + +/* +** lsl_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svlsl_n_u32_z (p0, z0, x0), + z0 = svlsl_z (p0, z0, x0)) + +/* +** lsl_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svlsl_n_u32_z (p0, z1, 
x0), + z0 = svlsl_z (p0, z1, x0)) + +/* +** lsl_1_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u32_z_tied1, svuint32_t, + z0 = svlsl_n_u32_z (p0, z0, 1), + z0 = svlsl_z (p0, z0, 1)) + +/* +** lsl_1_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u32_z_untied, svuint32_t, + z0 = svlsl_n_u32_z (p0, z1, 1), + z0 = svlsl_z (p0, z1, 1)) + +/* +** lsl_31_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_u32_z_tied1, svuint32_t, + z0 = svlsl_n_u32_z (p0, z0, 31), + z0 = svlsl_z (p0, z0, 31)) + +/* +** lsl_31_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_u32_z_untied, svuint32_t, + z0 = svlsl_n_u32_z (p0, z1, 31), + z0 = svlsl_z (p0, z1, 31)) + +/* +** lsl_32_u32_z_tied1: +** mov (z[0-9]+\.s), #32 +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_32_u32_z_tied1, svuint32_t, + z0 = svlsl_n_u32_z (p0, z0, 32), + z0 = svlsl_z (p0, z0, 32)) + +/* +** lsl_32_u32_z_untied: +** mov (z[0-9]+\.s), #32 +** ( +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_32_u32_z_untied, svuint32_t, + z0 = svlsl_n_u32_z (p0, z1, 32), + z0 = svlsl_z (p0, z1, 32)) + +/* +** lsl_u32_x_tied1: +** lsl z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_x_tied1, svuint32_t, + z0 = svlsl_u32_x (p0, z0, z1), + z0 = svlsl_x (p0, z0, z1)) + +/* +** lsl_u32_x_tied2: +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_x_tied2, svuint32_t, + z0 = svlsl_u32_x (p0, z1, z0), + z0 = svlsl_x (p0, z1, z0)) + +/* +** lsl_u32_x_untied: +** ( +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_u32_x_untied, svuint32_t, + z0 = svlsl_u32_x (p0, z1, z2), + z0 = svlsl_x (p0, z1, z2)) + +/* +** lsl_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svlsl_n_u32_x (p0, z0, x0), + z0 = svlsl_x (p0, z0, x0)) + +/* +** lsl_w0_u32_x_untied: +** mov z0\.s, w0 +** lslr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svlsl_n_u32_x (p0, z1, x0), + z0 = svlsl_x (p0, z1, x0)) + +/* +** lsl_1_u32_x_tied1: +** lsl z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u32_x_tied1, svuint32_t, + z0 = svlsl_n_u32_x (p0, z0, 1), + z0 = svlsl_x (p0, z0, 1)) + +/* +** lsl_1_u32_x_untied: +** lsl z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u32_x_untied, svuint32_t, + z0 = svlsl_n_u32_x (p0, z1, 1), + z0 = svlsl_x (p0, z1, 1)) + +/* +** lsl_31_u32_x_tied1: +** lsl z0\.s, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_u32_x_tied1, svuint32_t, + z0 = svlsl_n_u32_x (p0, z0, 31), + z0 = svlsl_x (p0, z0, 31)) + +/* +** lsl_31_u32_x_untied: +** lsl z0\.s, z1\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_31_u32_x_untied, svuint32_t, + z0 = svlsl_n_u32_x (p0, z1, 31), + z0 = svlsl_x (p0, z1, 31)) + +/* +** lsl_32_u32_x_tied1: +** mov (z[0-9]+\.s), #32 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_32_u32_x_tied1, svuint32_t, + z0 = svlsl_n_u32_x (p0, z0, 32), + z0 = svlsl_x (p0, z0, 32)) + +/* +** lsl_32_u32_x_untied: +** mov z0\.s, #32 +** lslr z0\.s, p0/m, 
z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsl_32_u32_x_untied, svuint32_t, + z0 = svlsl_n_u32_x (p0, z1, 32), + z0 = svlsl_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u64.c new file mode 100644 index 000000000..7b12bd43e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u64.c @@ -0,0 +1,351 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_u64_m_tied1: +** lsl z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_m_tied1, svuint64_t, + z0 = svlsl_u64_m (p0, z0, z1), + z0 = svlsl_m (p0, z0, z1)) + +/* +** lsl_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_m_tied2, svuint64_t, + z0 = svlsl_u64_m (p0, z1, z0), + z0 = svlsl_m (p0, z1, z0)) + +/* +** lsl_u64_m_untied: +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_m_untied, svuint64_t, + z0 = svlsl_u64_m (p0, z1, z2), + z0 = svlsl_m (p0, z1, z2)) + +/* +** lsl_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svlsl_n_u64_m (p0, z0, x0), + z0 = svlsl_m (p0, z0, x0)) + +/* +** lsl_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svlsl_n_u64_m (p0, z1, x0), + z0 = svlsl_m (p0, z1, x0)) + +/* +** lsl_1_u64_m_tied1: +** lsl z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u64_m_tied1, svuint64_t, + z0 = svlsl_n_u64_m (p0, z0, 1), + z0 = svlsl_m (p0, z0, 1)) + +/* +** lsl_1_u64_m_untied: +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u64_m_untied, svuint64_t, + z0 = svlsl_n_u64_m (p0, z1, 1), + z0 = svlsl_m (p0, z1, 1)) + +/* +** lsl_63_u64_m_tied1: +** lsl z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_u64_m_tied1, svuint64_t, + z0 = svlsl_n_u64_m (p0, z0, 63), + z0 = svlsl_m (p0, z0, 63)) + +/* +** lsl_63_u64_m_untied: +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_u64_m_untied, svuint64_t, + z0 = svlsl_n_u64_m (p0, z1, 63), + z0 = svlsl_m (p0, z1, 63)) + +/* +** lsl_64_u64_m_tied1: +** mov (z[0-9]+\.d), #64 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_64_u64_m_tied1, svuint64_t, + z0 = svlsl_n_u64_m (p0, z0, 64), + z0 = svlsl_m (p0, z0, 64)) + +/* +** lsl_64_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #64 +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_64_u64_m_untied, svuint64_t, + z0 = svlsl_n_u64_m (p0, z1, 64), + z0 = svlsl_m (p0, z1, 64)) + +/* +** lsl_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_z_tied1, svuint64_t, + z0 = svlsl_u64_z (p0, z0, z1), + z0 = svlsl_z (p0, z0, z1)) + +/* +** lsl_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_z_tied2, svuint64_t, + z0 = svlsl_u64_z (p0, z1, z0), + z0 = svlsl_z (p0, z1, z0)) + +/* +** lsl_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_z_untied, svuint64_t, + z0 = svlsl_u64_z (p0, z1, z2), + z0 = svlsl_z (p0, z1, z2)) + +/* +** 
lsl_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svlsl_n_u64_z (p0, z0, x0), + z0 = svlsl_z (p0, z0, x0)) + +/* +** lsl_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svlsl_n_u64_z (p0, z1, x0), + z0 = svlsl_z (p0, z1, x0)) + +/* +** lsl_1_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u64_z_tied1, svuint64_t, + z0 = svlsl_n_u64_z (p0, z0, 1), + z0 = svlsl_z (p0, z0, 1)) + +/* +** lsl_1_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u64_z_untied, svuint64_t, + z0 = svlsl_n_u64_z (p0, z1, 1), + z0 = svlsl_z (p0, z1, 1)) + +/* +** lsl_63_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_u64_z_tied1, svuint64_t, + z0 = svlsl_n_u64_z (p0, z0, 63), + z0 = svlsl_z (p0, z0, 63)) + +/* +** lsl_63_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_u64_z_untied, svuint64_t, + z0 = svlsl_n_u64_z (p0, z1, 63), + z0 = svlsl_z (p0, z1, 63)) + +/* +** lsl_64_u64_z_tied1: +** mov (z[0-9]+\.d), #64 +** movprfx z0\.d, p0/z, z0\.d +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_64_u64_z_tied1, svuint64_t, + z0 = svlsl_n_u64_z (p0, z0, 64), + z0 = svlsl_z (p0, z0, 64)) + +/* +** lsl_64_u64_z_untied: +** mov (z[0-9]+\.d), #64 +** ( +** movprfx z0\.d, p0/z, z1\.d +** lsl z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_64_u64_z_untied, svuint64_t, + z0 = svlsl_n_u64_z (p0, z1, 64), + z0 = svlsl_z (p0, z1, 64)) + +/* +** lsl_u64_x_tied1: +** lsl z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_x_tied1, svuint64_t, + z0 = svlsl_u64_x (p0, z0, z1), + z0 = svlsl_x (p0, z0, z1)) + +/* +** lsl_u64_x_tied2: +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_x_tied2, svuint64_t, + z0 = svlsl_u64_x (p0, z1, z0), + z0 = svlsl_x (p0, z1, z0)) + +/* +** lsl_u64_x_untied: +** ( +** movprfx z0, z1 +** lsl z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_u64_x_untied, svuint64_t, + z0 = svlsl_u64_x (p0, z1, z2), + z0 = svlsl_x (p0, z1, z2)) + +/* +** lsl_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svlsl_n_u64_x (p0, z0, x0), + z0 = svlsl_x (p0, z0, x0)) + +/* +** lsl_x0_u64_x_untied: +** mov z0\.d, x0 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (lsl_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svlsl_n_u64_x (p0, z1, x0), + z0 = svlsl_x (p0, z1, x0)) + +/* +** lsl_1_u64_x_tied1: +** lsl z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u64_x_tied1, svuint64_t, + z0 = svlsl_n_u64_x (p0, z0, 1), + z0 = svlsl_x (p0, z0, 1)) + +/* +** lsl_1_u64_x_untied: +** lsl z0\.d, z1\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u64_x_untied, svuint64_t, + z0 = svlsl_n_u64_x (p0, z1, 1), + z0 = svlsl_x (p0, z1, 1)) + +/* +** lsl_63_u64_x_tied1: +** lsl z0\.d, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_u64_x_tied1, 
svuint64_t, + z0 = svlsl_n_u64_x (p0, z0, 63), + z0 = svlsl_x (p0, z0, 63)) + +/* +** lsl_63_u64_x_untied: +** lsl z0\.d, z1\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsl_63_u64_x_untied, svuint64_t, + z0 = svlsl_n_u64_x (p0, z1, 63), + z0 = svlsl_x (p0, z1, 63)) + +/* +** lsl_64_u64_x_tied1: +** mov (z[0-9]+\.d), #64 +** lsl z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_64_u64_x_tied1, svuint64_t, + z0 = svlsl_n_u64_x (p0, z0, 64), + z0 = svlsl_x (p0, z0, 64)) + +/* +** lsl_64_u64_x_untied: +** mov z0\.d, #64 +** lslr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsl_64_u64_x_untied, svuint64_t, + z0 = svlsl_n_u64_x (p0, z1, 64), + z0 = svlsl_x (p0, z1, 64)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u8.c new file mode 100644 index 000000000..894b55138 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_u8.c @@ -0,0 +1,351 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_u8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_m_tied1, svuint8_t, + z0 = svlsl_u8_m (p0, z0, z1), + z0 = svlsl_m (p0, z0, z1)) + +/* +** lsl_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_m_tied2, svuint8_t, + z0 = svlsl_u8_m (p0, z1, z0), + z0 = svlsl_m (p0, z1, z0)) + +/* +** lsl_u8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_m_untied, svuint8_t, + z0 = svlsl_u8_m (p0, z1, z2), + z0 = svlsl_m (p0, z1, z2)) + +/* +** lsl_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svlsl_n_u8_m (p0, z0, x0), + z0 = svlsl_m (p0, z0, x0)) + +/* +** lsl_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svlsl_n_u8_m (p0, z1, x0), + z0 = svlsl_m (p0, z1, x0)) + +/* +** lsl_1_u8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u8_m_tied1, svuint8_t, + z0 = svlsl_n_u8_m (p0, z0, 1), + z0 = svlsl_m (p0, z0, 1)) + +/* +** lsl_1_u8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u8_m_untied, svuint8_t, + z0 = svlsl_n_u8_m (p0, z1, 1), + z0 = svlsl_m (p0, z1, 1)) + +/* +** lsl_7_u8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_u8_m_tied1, svuint8_t, + z0 = svlsl_n_u8_m (p0, z0, 7), + z0 = svlsl_m (p0, z0, 7)) + +/* +** lsl_7_u8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_u8_m_untied, svuint8_t, + z0 = svlsl_n_u8_m (p0, z1, 7), + z0 = svlsl_m (p0, z1, 7)) + +/* +** lsl_8_u8_m_tied1: +** mov (z[0-9]+\.b), #8 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_8_u8_m_tied1, svuint8_t, + z0 = svlsl_n_u8_m (p0, z0, 8), + z0 = svlsl_m (p0, z0, 8)) + +/* +** lsl_8_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #8 +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_8_u8_m_untied, svuint8_t, + z0 = svlsl_n_u8_m (p0, z1, 8), + z0 = svlsl_m (p0, z1, 8)) + +/* +** lsl_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_z_tied1, svuint8_t, + z0 = svlsl_u8_z (p0, z0, z1), + z0 = svlsl_z (p0, z0, z1)) + +/* +** lsl_u8_z_tied2: +** 
movprfx z0\.b, p0/z, z0\.b +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_z_tied2, svuint8_t, + z0 = svlsl_u8_z (p0, z1, z0), + z0 = svlsl_z (p0, z1, z0)) + +/* +** lsl_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_z_untied, svuint8_t, + z0 = svlsl_u8_z (p0, z1, z2), + z0 = svlsl_z (p0, z1, z2)) + +/* +** lsl_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svlsl_n_u8_z (p0, z0, x0), + z0 = svlsl_z (p0, z0, x0)) + +/* +** lsl_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svlsl_n_u8_z (p0, z1, x0), + z0 = svlsl_z (p0, z1, x0)) + +/* +** lsl_1_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u8_z_tied1, svuint8_t, + z0 = svlsl_n_u8_z (p0, z0, 1), + z0 = svlsl_z (p0, z0, 1)) + +/* +** lsl_1_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u8_z_untied, svuint8_t, + z0 = svlsl_n_u8_z (p0, z1, 1), + z0 = svlsl_z (p0, z1, 1)) + +/* +** lsl_7_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_u8_z_tied1, svuint8_t, + z0 = svlsl_n_u8_z (p0, z0, 7), + z0 = svlsl_z (p0, z0, 7)) + +/* +** lsl_7_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_u8_z_untied, svuint8_t, + z0 = svlsl_n_u8_z (p0, z1, 7), + z0 = svlsl_z (p0, z1, 7)) + +/* +** lsl_8_u8_z_tied1: +** mov (z[0-9]+\.b), #8 +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_8_u8_z_tied1, svuint8_t, + z0 = svlsl_n_u8_z (p0, z0, 8), + z0 = svlsl_z (p0, z0, 8)) + +/* +** lsl_8_u8_z_untied: +** mov (z[0-9]+\.b), #8 +** ( +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_8_u8_z_untied, svuint8_t, + z0 = svlsl_n_u8_z (p0, z1, 8), + z0 = svlsl_z (p0, z1, 8)) + +/* +** lsl_u8_x_tied1: +** lsl z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_x_tied1, svuint8_t, + z0 = svlsl_u8_x (p0, z0, z1), + z0 = svlsl_x (p0, z0, z1)) + +/* +** lsl_u8_x_tied2: +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_x_tied2, svuint8_t, + z0 = svlsl_u8_x (p0, z1, z0), + z0 = svlsl_x (p0, z1, z0)) + +/* +** lsl_u8_x_untied: +** ( +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (lsl_u8_x_untied, svuint8_t, + z0 = svlsl_u8_x (p0, z1, z2), + z0 = svlsl_x (p0, z1, z2)) + +/* +** lsl_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svlsl_n_u8_x (p0, z0, x0), + z0 = svlsl_x (p0, z0, x0)) + +/* +** lsl_w0_u8_x_untied: +** mov z0\.b, w0 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (lsl_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svlsl_n_u8_x (p0, z1, x0), + z0 = svlsl_x (p0, z1, x0)) + +/* +** lsl_1_u8_x_tied1: +** lsl z0\.b, z0\.b, #1 
+** ret +*/ +TEST_UNIFORM_Z (lsl_1_u8_x_tied1, svuint8_t, + z0 = svlsl_n_u8_x (p0, z0, 1), + z0 = svlsl_x (p0, z0, 1)) + +/* +** lsl_1_u8_x_untied: +** lsl z0\.b, z1\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_1_u8_x_untied, svuint8_t, + z0 = svlsl_n_u8_x (p0, z1, 1), + z0 = svlsl_x (p0, z1, 1)) + +/* +** lsl_7_u8_x_tied1: +** lsl z0\.b, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_u8_x_tied1, svuint8_t, + z0 = svlsl_n_u8_x (p0, z0, 7), + z0 = svlsl_x (p0, z0, 7)) + +/* +** lsl_7_u8_x_untied: +** lsl z0\.b, z1\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_7_u8_x_untied, svuint8_t, + z0 = svlsl_n_u8_x (p0, z1, 7), + z0 = svlsl_x (p0, z1, 7)) + +/* +** lsl_8_u8_x_tied1: +** mov (z[0-9]+\.b), #8 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_8_u8_x_tied1, svuint8_t, + z0 = svlsl_n_u8_x (p0, z0, 8), + z0 = svlsl_x (p0, z0, 8)) + +/* +** lsl_8_u8_x_untied: +** mov z0\.b, #8 +** lslr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsl_8_u8_x_untied, svuint8_t, + z0 = svlsl_n_u8_x (p0, z1, 8), + z0 = svlsl_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s16.c new file mode 100644 index 000000000..8d63d3909 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s16.c @@ -0,0 +1,331 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_wide_s16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s16_m_tied1, svint16_t, svuint64_t, + z0 = svlsl_wide_s16_m (p0, z0, z4), + z0 = svlsl_wide_m (p0, z0, z4)) + +/* +** lsl_wide_s16_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s16_m_tied2, svint16_t, svuint64_t, + z0_res = svlsl_wide_s16_m (p0, z4, z0), + z0_res = svlsl_wide_m (p0, z4, z0)) + +/* +** lsl_wide_s16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s16_m_untied, svint16_t, svuint64_t, + z0 = svlsl_wide_s16_m (p0, z1, z4), + z0 = svlsl_wide_m (p0, z1, z4)) + +/* +** lsl_wide_x0_s16_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s16_m_tied1, svint16_t, uint64_t, + z0 = svlsl_wide_n_s16_m (p0, z0, x0), + z0 = svlsl_wide_m (p0, z0, x0)) + +/* +** lsl_wide_x0_s16_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s16_m_untied, svint16_t, uint64_t, + z0 = svlsl_wide_n_s16_m (p0, z1, x0), + z0 = svlsl_wide_m (p0, z1, x0)) + +/* +** lsl_wide_1_s16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s16_m_tied1, svint16_t, + z0 = svlsl_wide_n_s16_m (p0, z0, 1), + z0 = svlsl_wide_m (p0, z0, 1)) + +/* +** lsl_wide_1_s16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s16_m_untied, svint16_t, + z0 = svlsl_wide_n_s16_m (p0, z1, 1), + z0 = svlsl_wide_m (p0, z1, 1)) + +/* +** lsl_wide_15_s16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_s16_m_tied1, svint16_t, + z0 = svlsl_wide_n_s16_m (p0, z0, 15), + z0 = svlsl_wide_m (p0, z0, 15)) + +/* +** lsl_wide_15_s16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_s16_m_untied, svint16_t, + z0 = svlsl_wide_n_s16_m (p0, z1, 15), + z0 = svlsl_wide_m (p0, z1, 15)) + +/* +** lsl_wide_16_s16_m_tied1: +** mov (z[0-9]+\.d), #16 +** 
lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_s16_m_tied1, svint16_t, + z0 = svlsl_wide_n_s16_m (p0, z0, 16), + z0 = svlsl_wide_m (p0, z0, 16)) + +/* +** lsl_wide_16_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #16 +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_s16_m_untied, svint16_t, + z0 = svlsl_wide_n_s16_m (p0, z1, 16), + z0 = svlsl_wide_m (p0, z1, 16)) + +/* +** lsl_wide_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s16_z_tied1, svint16_t, svuint64_t, + z0 = svlsl_wide_s16_z (p0, z0, z4), + z0 = svlsl_wide_z (p0, z0, z4)) + +/* +** lsl_wide_s16_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.h, p0/z, z4\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s16_z_tied2, svint16_t, svuint64_t, + z0_res = svlsl_wide_s16_z (p0, z4, z0), + z0_res = svlsl_wide_z (p0, z4, z0)) + +/* +** lsl_wide_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s16_z_untied, svint16_t, svuint64_t, + z0 = svlsl_wide_s16_z (p0, z1, z4), + z0 = svlsl_wide_z (p0, z1, z4)) + +/* +** lsl_wide_x0_s16_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s16_z_tied1, svint16_t, uint64_t, + z0 = svlsl_wide_n_s16_z (p0, z0, x0), + z0 = svlsl_wide_z (p0, z0, x0)) + +/* +** lsl_wide_x0_s16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s16_z_untied, svint16_t, uint64_t, + z0 = svlsl_wide_n_s16_z (p0, z1, x0), + z0 = svlsl_wide_z (p0, z1, x0)) + +/* +** lsl_wide_1_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s16_z_tied1, svint16_t, + z0 = svlsl_wide_n_s16_z (p0, z0, 1), + z0 = svlsl_wide_z (p0, z0, 1)) + +/* +** lsl_wide_1_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s16_z_untied, svint16_t, + z0 = svlsl_wide_n_s16_z (p0, z1, 1), + z0 = svlsl_wide_z (p0, z1, 1)) + +/* +** lsl_wide_15_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_s16_z_tied1, svint16_t, + z0 = svlsl_wide_n_s16_z (p0, z0, 15), + z0 = svlsl_wide_z (p0, z0, 15)) + +/* +** lsl_wide_15_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_s16_z_untied, svint16_t, + z0 = svlsl_wide_n_s16_z (p0, z1, 15), + z0 = svlsl_wide_z (p0, z1, 15)) + +/* +** lsl_wide_16_s16_z_tied1: +** mov (z[0-9]+\.d), #16 +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_s16_z_tied1, svint16_t, + z0 = svlsl_wide_n_s16_z (p0, z0, 16), + z0 = svlsl_wide_z (p0, z0, 16)) + +/* +** lsl_wide_16_s16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #16 +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_s16_z_untied, svint16_t, + z0 = svlsl_wide_n_s16_z (p0, z1, 16), + z0 = svlsl_wide_z (p0, z1, 16)) + +/* +** lsl_wide_s16_x_tied1: +** lsl z0\.h, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s16_x_tied1, svint16_t, svuint64_t, + z0 = svlsl_wide_s16_x (p0, z0, z4), + z0 = svlsl_wide_x (p0, z0, z4)) + +/* +** lsl_wide_s16_x_tied2: +** lsl z0\.h, z4\.h, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s16_x_tied2, 
svint16_t, svuint64_t, + z0_res = svlsl_wide_s16_x (p0, z4, z0), + z0_res = svlsl_wide_x (p0, z4, z0)) + +/* +** lsl_wide_s16_x_untied: +** lsl z0\.h, z1\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s16_x_untied, svint16_t, svuint64_t, + z0 = svlsl_wide_s16_x (p0, z1, z4), + z0 = svlsl_wide_x (p0, z1, z4)) + +/* +** lsl_wide_x0_s16_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s16_x_tied1, svint16_t, uint64_t, + z0 = svlsl_wide_n_s16_x (p0, z0, x0), + z0 = svlsl_wide_x (p0, z0, x0)) + +/* +** lsl_wide_x0_s16_x_untied: +** mov (z[0-9]+\.d), x0 +** lsl z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s16_x_untied, svint16_t, uint64_t, + z0 = svlsl_wide_n_s16_x (p0, z1, x0), + z0 = svlsl_wide_x (p0, z1, x0)) + +/* +** lsl_wide_1_s16_x_tied1: +** lsl z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s16_x_tied1, svint16_t, + z0 = svlsl_wide_n_s16_x (p0, z0, 1), + z0 = svlsl_wide_x (p0, z0, 1)) + +/* +** lsl_wide_1_s16_x_untied: +** lsl z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s16_x_untied, svint16_t, + z0 = svlsl_wide_n_s16_x (p0, z1, 1), + z0 = svlsl_wide_x (p0, z1, 1)) + +/* +** lsl_wide_15_s16_x_tied1: +** lsl z0\.h, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_s16_x_tied1, svint16_t, + z0 = svlsl_wide_n_s16_x (p0, z0, 15), + z0 = svlsl_wide_x (p0, z0, 15)) + +/* +** lsl_wide_15_s16_x_untied: +** lsl z0\.h, z1\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_s16_x_untied, svint16_t, + z0 = svlsl_wide_n_s16_x (p0, z1, 15), + z0 = svlsl_wide_x (p0, z1, 15)) + +/* +** lsl_wide_16_s16_x_tied1: +** mov (z[0-9]+\.d), #16 +** lsl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_s16_x_tied1, svint16_t, + z0 = svlsl_wide_n_s16_x (p0, z0, 16), + z0 = svlsl_wide_x (p0, z0, 16)) + +/* +** lsl_wide_16_s16_x_untied: +** mov (z[0-9]+\.d), #16 +** lsl z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_s16_x_untied, svint16_t, + z0 = svlsl_wide_n_s16_x (p0, z1, 16), + z0 = svlsl_wide_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s32.c new file mode 100644 index 000000000..acd813df3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s32.c @@ -0,0 +1,331 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_wide_s32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s32_m_tied1, svint32_t, svuint64_t, + z0 = svlsl_wide_s32_m (p0, z0, z4), + z0 = svlsl_wide_m (p0, z0, z4)) + +/* +** lsl_wide_s32_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s32_m_tied2, svint32_t, svuint64_t, + z0_res = svlsl_wide_s32_m (p0, z4, z0), + z0_res = svlsl_wide_m (p0, z4, z0)) + +/* +** lsl_wide_s32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s32_m_untied, svint32_t, svuint64_t, + z0 = svlsl_wide_s32_m (p0, z1, z4), + z0 = svlsl_wide_m (p0, z1, z4)) + +/* +** lsl_wide_x0_s32_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s32_m_tied1, svint32_t, uint64_t, + z0 = svlsl_wide_n_s32_m (p0, z0, x0), + z0 = svlsl_wide_m (p0, z0, x0)) + +/* +** lsl_wide_x0_s32_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s32_m_untied, svint32_t, uint64_t, + z0 = 
svlsl_wide_n_s32_m (p0, z1, x0), + z0 = svlsl_wide_m (p0, z1, x0)) + +/* +** lsl_wide_1_s32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s32_m_tied1, svint32_t, + z0 = svlsl_wide_n_s32_m (p0, z0, 1), + z0 = svlsl_wide_m (p0, z0, 1)) + +/* +** lsl_wide_1_s32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s32_m_untied, svint32_t, + z0 = svlsl_wide_n_s32_m (p0, z1, 1), + z0 = svlsl_wide_m (p0, z1, 1)) + +/* +** lsl_wide_31_s32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_s32_m_tied1, svint32_t, + z0 = svlsl_wide_n_s32_m (p0, z0, 31), + z0 = svlsl_wide_m (p0, z0, 31)) + +/* +** lsl_wide_31_s32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_s32_m_untied, svint32_t, + z0 = svlsl_wide_n_s32_m (p0, z1, 31), + z0 = svlsl_wide_m (p0, z1, 31)) + +/* +** lsl_wide_32_s32_m_tied1: +** mov (z[0-9]+\.d), #32 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_s32_m_tied1, svint32_t, + z0 = svlsl_wide_n_s32_m (p0, z0, 32), + z0 = svlsl_wide_m (p0, z0, 32)) + +/* +** lsl_wide_32_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #32 +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_s32_m_untied, svint32_t, + z0 = svlsl_wide_n_s32_m (p0, z1, 32), + z0 = svlsl_wide_m (p0, z1, 32)) + +/* +** lsl_wide_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s32_z_tied1, svint32_t, svuint64_t, + z0 = svlsl_wide_s32_z (p0, z0, z4), + z0 = svlsl_wide_z (p0, z0, z4)) + +/* +** lsl_wide_s32_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.s, p0/z, z4\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s32_z_tied2, svint32_t, svuint64_t, + z0_res = svlsl_wide_s32_z (p0, z4, z0), + z0_res = svlsl_wide_z (p0, z4, z0)) + +/* +** lsl_wide_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s32_z_untied, svint32_t, svuint64_t, + z0 = svlsl_wide_s32_z (p0, z1, z4), + z0 = svlsl_wide_z (p0, z1, z4)) + +/* +** lsl_wide_x0_s32_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s32_z_tied1, svint32_t, uint64_t, + z0 = svlsl_wide_n_s32_z (p0, z0, x0), + z0 = svlsl_wide_z (p0, z0, x0)) + +/* +** lsl_wide_x0_s32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s32_z_untied, svint32_t, uint64_t, + z0 = svlsl_wide_n_s32_z (p0, z1, x0), + z0 = svlsl_wide_z (p0, z1, x0)) + +/* +** lsl_wide_1_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s32_z_tied1, svint32_t, + z0 = svlsl_wide_n_s32_z (p0, z0, 1), + z0 = svlsl_wide_z (p0, z0, 1)) + +/* +** lsl_wide_1_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s32_z_untied, svint32_t, + z0 = svlsl_wide_n_s32_z (p0, z1, 1), + z0 = svlsl_wide_z (p0, z1, 1)) + +/* +** lsl_wide_31_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_s32_z_tied1, svint32_t, + z0 = svlsl_wide_n_s32_z (p0, z0, 31), + z0 = svlsl_wide_z (p0, z0, 31)) + +/* +** lsl_wide_31_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, #31 +** ret 
+*/ +TEST_UNIFORM_Z (lsl_wide_31_s32_z_untied, svint32_t, + z0 = svlsl_wide_n_s32_z (p0, z1, 31), + z0 = svlsl_wide_z (p0, z1, 31)) + +/* +** lsl_wide_32_s32_z_tied1: +** mov (z[0-9]+\.d), #32 +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_s32_z_tied1, svint32_t, + z0 = svlsl_wide_n_s32_z (p0, z0, 32), + z0 = svlsl_wide_z (p0, z0, 32)) + +/* +** lsl_wide_32_s32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #32 +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_s32_z_untied, svint32_t, + z0 = svlsl_wide_n_s32_z (p0, z1, 32), + z0 = svlsl_wide_z (p0, z1, 32)) + +/* +** lsl_wide_s32_x_tied1: +** lsl z0\.s, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s32_x_tied1, svint32_t, svuint64_t, + z0 = svlsl_wide_s32_x (p0, z0, z4), + z0 = svlsl_wide_x (p0, z0, z4)) + +/* +** lsl_wide_s32_x_tied2: +** lsl z0\.s, z4\.s, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s32_x_tied2, svint32_t, svuint64_t, + z0_res = svlsl_wide_s32_x (p0, z4, z0), + z0_res = svlsl_wide_x (p0, z4, z0)) + +/* +** lsl_wide_s32_x_untied: +** lsl z0\.s, z1\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s32_x_untied, svint32_t, svuint64_t, + z0 = svlsl_wide_s32_x (p0, z1, z4), + z0 = svlsl_wide_x (p0, z1, z4)) + +/* +** lsl_wide_x0_s32_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s32_x_tied1, svint32_t, uint64_t, + z0 = svlsl_wide_n_s32_x (p0, z0, x0), + z0 = svlsl_wide_x (p0, z0, x0)) + +/* +** lsl_wide_x0_s32_x_untied: +** mov (z[0-9]+\.d), x0 +** lsl z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s32_x_untied, svint32_t, uint64_t, + z0 = svlsl_wide_n_s32_x (p0, z1, x0), + z0 = svlsl_wide_x (p0, z1, x0)) + +/* +** lsl_wide_1_s32_x_tied1: +** lsl z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s32_x_tied1, svint32_t, + z0 = svlsl_wide_n_s32_x (p0, z0, 1), + z0 = svlsl_wide_x (p0, z0, 1)) + +/* +** lsl_wide_1_s32_x_untied: +** lsl z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s32_x_untied, svint32_t, + z0 = svlsl_wide_n_s32_x (p0, z1, 1), + z0 = svlsl_wide_x (p0, z1, 1)) + +/* +** lsl_wide_31_s32_x_tied1: +** lsl z0\.s, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_s32_x_tied1, svint32_t, + z0 = svlsl_wide_n_s32_x (p0, z0, 31), + z0 = svlsl_wide_x (p0, z0, 31)) + +/* +** lsl_wide_31_s32_x_untied: +** lsl z0\.s, z1\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_s32_x_untied, svint32_t, + z0 = svlsl_wide_n_s32_x (p0, z1, 31), + z0 = svlsl_wide_x (p0, z1, 31)) + +/* +** lsl_wide_32_s32_x_tied1: +** mov (z[0-9]+\.d), #32 +** lsl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_s32_x_tied1, svint32_t, + z0 = svlsl_wide_n_s32_x (p0, z0, 32), + z0 = svlsl_wide_x (p0, z0, 32)) + +/* +** lsl_wide_32_s32_x_untied: +** mov (z[0-9]+\.d), #32 +** lsl z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_s32_x_untied, svint32_t, + z0 = svlsl_wide_n_s32_x (p0, z1, 32), + z0 = svlsl_wide_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s8.c new file mode 100644 index 000000000..17e8e8685 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_s8.c @@ -0,0 +1,331 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_wide_s8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s8_m_tied1, svint8_t, svuint64_t, + z0 = svlsl_wide_s8_m 
(p0, z0, z4), + z0 = svlsl_wide_m (p0, z0, z4)) + +/* +** lsl_wide_s8_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s8_m_tied2, svint8_t, svuint64_t, + z0_res = svlsl_wide_s8_m (p0, z4, z0), + z0_res = svlsl_wide_m (p0, z4, z0)) + +/* +** lsl_wide_s8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s8_m_untied, svint8_t, svuint64_t, + z0 = svlsl_wide_s8_m (p0, z1, z4), + z0 = svlsl_wide_m (p0, z1, z4)) + +/* +** lsl_wide_x0_s8_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s8_m_tied1, svint8_t, uint64_t, + z0 = svlsl_wide_n_s8_m (p0, z0, x0), + z0 = svlsl_wide_m (p0, z0, x0)) + +/* +** lsl_wide_x0_s8_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s8_m_untied, svint8_t, uint64_t, + z0 = svlsl_wide_n_s8_m (p0, z1, x0), + z0 = svlsl_wide_m (p0, z1, x0)) + +/* +** lsl_wide_1_s8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s8_m_tied1, svint8_t, + z0 = svlsl_wide_n_s8_m (p0, z0, 1), + z0 = svlsl_wide_m (p0, z0, 1)) + +/* +** lsl_wide_1_s8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s8_m_untied, svint8_t, + z0 = svlsl_wide_n_s8_m (p0, z1, 1), + z0 = svlsl_wide_m (p0, z1, 1)) + +/* +** lsl_wide_7_s8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_s8_m_tied1, svint8_t, + z0 = svlsl_wide_n_s8_m (p0, z0, 7), + z0 = svlsl_wide_m (p0, z0, 7)) + +/* +** lsl_wide_7_s8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_s8_m_untied, svint8_t, + z0 = svlsl_wide_n_s8_m (p0, z1, 7), + z0 = svlsl_wide_m (p0, z1, 7)) + +/* +** lsl_wide_8_s8_m_tied1: +** mov (z[0-9]+\.d), #8 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_s8_m_tied1, svint8_t, + z0 = svlsl_wide_n_s8_m (p0, z0, 8), + z0 = svlsl_wide_m (p0, z0, 8)) + +/* +** lsl_wide_8_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #8 +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_s8_m_untied, svint8_t, + z0 = svlsl_wide_n_s8_m (p0, z1, 8), + z0 = svlsl_wide_m (p0, z1, 8)) + +/* +** lsl_wide_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s8_z_tied1, svint8_t, svuint64_t, + z0 = svlsl_wide_s8_z (p0, z0, z4), + z0 = svlsl_wide_z (p0, z0, z4)) + +/* +** lsl_wide_s8_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.b, p0/z, z4\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s8_z_tied2, svint8_t, svuint64_t, + z0_res = svlsl_wide_s8_z (p0, z4, z0), + z0_res = svlsl_wide_z (p0, z4, z0)) + +/* +** lsl_wide_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s8_z_untied, svint8_t, svuint64_t, + z0 = svlsl_wide_s8_z (p0, z1, z4), + z0 = svlsl_wide_z (p0, z1, z4)) + +/* +** lsl_wide_x0_s8_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s8_z_tied1, svint8_t, uint64_t, + z0 = svlsl_wide_n_s8_z (p0, z0, x0), + z0 = svlsl_wide_z (p0, z0, x0)) + +/* +** lsl_wide_x0_s8_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s8_z_untied, 
svint8_t, uint64_t, + z0 = svlsl_wide_n_s8_z (p0, z1, x0), + z0 = svlsl_wide_z (p0, z1, x0)) + +/* +** lsl_wide_1_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s8_z_tied1, svint8_t, + z0 = svlsl_wide_n_s8_z (p0, z0, 1), + z0 = svlsl_wide_z (p0, z0, 1)) + +/* +** lsl_wide_1_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s8_z_untied, svint8_t, + z0 = svlsl_wide_n_s8_z (p0, z1, 1), + z0 = svlsl_wide_z (p0, z1, 1)) + +/* +** lsl_wide_7_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_s8_z_tied1, svint8_t, + z0 = svlsl_wide_n_s8_z (p0, z0, 7), + z0 = svlsl_wide_z (p0, z0, 7)) + +/* +** lsl_wide_7_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_s8_z_untied, svint8_t, + z0 = svlsl_wide_n_s8_z (p0, z1, 7), + z0 = svlsl_wide_z (p0, z1, 7)) + +/* +** lsl_wide_8_s8_z_tied1: +** mov (z[0-9]+\.d), #8 +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_s8_z_tied1, svint8_t, + z0 = svlsl_wide_n_s8_z (p0, z0, 8), + z0 = svlsl_wide_z (p0, z0, 8)) + +/* +** lsl_wide_8_s8_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #8 +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_s8_z_untied, svint8_t, + z0 = svlsl_wide_n_s8_z (p0, z1, 8), + z0 = svlsl_wide_z (p0, z1, 8)) + +/* +** lsl_wide_s8_x_tied1: +** lsl z0\.b, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s8_x_tied1, svint8_t, svuint64_t, + z0 = svlsl_wide_s8_x (p0, z0, z4), + z0 = svlsl_wide_x (p0, z0, z4)) + +/* +** lsl_wide_s8_x_tied2: +** lsl z0\.b, z4\.b, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_s8_x_tied2, svint8_t, svuint64_t, + z0_res = svlsl_wide_s8_x (p0, z4, z0), + z0_res = svlsl_wide_x (p0, z4, z0)) + +/* +** lsl_wide_s8_x_untied: +** lsl z0\.b, z1\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_s8_x_untied, svint8_t, svuint64_t, + z0 = svlsl_wide_s8_x (p0, z1, z4), + z0 = svlsl_wide_x (p0, z1, z4)) + +/* +** lsl_wide_x0_s8_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s8_x_tied1, svint8_t, uint64_t, + z0 = svlsl_wide_n_s8_x (p0, z0, x0), + z0 = svlsl_wide_x (p0, z0, x0)) + +/* +** lsl_wide_x0_s8_x_untied: +** mov (z[0-9]+\.d), x0 +** lsl z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_s8_x_untied, svint8_t, uint64_t, + z0 = svlsl_wide_n_s8_x (p0, z1, x0), + z0 = svlsl_wide_x (p0, z1, x0)) + +/* +** lsl_wide_1_s8_x_tied1: +** lsl z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s8_x_tied1, svint8_t, + z0 = svlsl_wide_n_s8_x (p0, z0, 1), + z0 = svlsl_wide_x (p0, z0, 1)) + +/* +** lsl_wide_1_s8_x_untied: +** lsl z0\.b, z1\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_s8_x_untied, svint8_t, + z0 = svlsl_wide_n_s8_x (p0, z1, 1), + z0 = svlsl_wide_x (p0, z1, 1)) + +/* +** lsl_wide_7_s8_x_tied1: +** lsl z0\.b, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_s8_x_tied1, svint8_t, + z0 = svlsl_wide_n_s8_x (p0, z0, 7), + z0 = svlsl_wide_x (p0, z0, 7)) + +/* +** lsl_wide_7_s8_x_untied: +** lsl z0\.b, z1\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_s8_x_untied, svint8_t, + z0 = svlsl_wide_n_s8_x (p0, z1, 7), + z0 = svlsl_wide_x (p0, z1, 7)) + +/* +** lsl_wide_8_s8_x_tied1: +** mov (z[0-9]+\.d), #8 +** lsl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_s8_x_tied1, svint8_t, + z0 = svlsl_wide_n_s8_x (p0, z0, 
8), + z0 = svlsl_wide_x (p0, z0, 8)) + +/* +** lsl_wide_8_s8_x_untied: +** mov (z[0-9]+\.d), #8 +** lsl z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_s8_x_untied, svint8_t, + z0 = svlsl_wide_n_s8_x (p0, z1, 8), + z0 = svlsl_wide_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u16.c new file mode 100644 index 000000000..cff24a850 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u16.c @@ -0,0 +1,331 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_wide_u16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u16_m_tied1, svuint16_t, svuint64_t, + z0 = svlsl_wide_u16_m (p0, z0, z4), + z0 = svlsl_wide_m (p0, z0, z4)) + +/* +** lsl_wide_u16_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u16_m_tied2, svuint16_t, svuint64_t, + z0_res = svlsl_wide_u16_m (p0, z4, z0), + z0_res = svlsl_wide_m (p0, z4, z0)) + +/* +** lsl_wide_u16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u16_m_untied, svuint16_t, svuint64_t, + z0 = svlsl_wide_u16_m (p0, z1, z4), + z0 = svlsl_wide_m (p0, z1, z4)) + +/* +** lsl_wide_x0_u16_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u16_m_tied1, svuint16_t, uint64_t, + z0 = svlsl_wide_n_u16_m (p0, z0, x0), + z0 = svlsl_wide_m (p0, z0, x0)) + +/* +** lsl_wide_x0_u16_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u16_m_untied, svuint16_t, uint64_t, + z0 = svlsl_wide_n_u16_m (p0, z1, x0), + z0 = svlsl_wide_m (p0, z1, x0)) + +/* +** lsl_wide_1_u16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u16_m_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_m (p0, z0, 1), + z0 = svlsl_wide_m (p0, z0, 1)) + +/* +** lsl_wide_1_u16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u16_m_untied, svuint16_t, + z0 = svlsl_wide_n_u16_m (p0, z1, 1), + z0 = svlsl_wide_m (p0, z1, 1)) + +/* +** lsl_wide_15_u16_m_tied1: +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_u16_m_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_m (p0, z0, 15), + z0 = svlsl_wide_m (p0, z0, 15)) + +/* +** lsl_wide_15_u16_m_untied: +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_u16_m_untied, svuint16_t, + z0 = svlsl_wide_n_u16_m (p0, z1, 15), + z0 = svlsl_wide_m (p0, z1, 15)) + +/* +** lsl_wide_16_u16_m_tied1: +** mov (z[0-9]+\.d), #16 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_u16_m_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_m (p0, z0, 16), + z0 = svlsl_wide_m (p0, z0, 16)) + +/* +** lsl_wide_16_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #16 +** movprfx z0, z1 +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_u16_m_untied, svuint16_t, + z0 = svlsl_wide_n_u16_m (p0, z1, 16), + z0 = svlsl_wide_m (p0, z1, 16)) + +/* +** lsl_wide_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u16_z_tied1, svuint16_t, svuint64_t, + z0 = svlsl_wide_u16_z (p0, z0, z4), + z0 = svlsl_wide_z (p0, z0, z4)) + +/* +** lsl_wide_u16_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.h, p0/z, z4\.h +** lsl 
z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u16_z_tied2, svuint16_t, svuint64_t, + z0_res = svlsl_wide_u16_z (p0, z4, z0), + z0_res = svlsl_wide_z (p0, z4, z0)) + +/* +** lsl_wide_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u16_z_untied, svuint16_t, svuint64_t, + z0 = svlsl_wide_u16_z (p0, z1, z4), + z0 = svlsl_wide_z (p0, z1, z4)) + +/* +** lsl_wide_x0_u16_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u16_z_tied1, svuint16_t, uint64_t, + z0 = svlsl_wide_n_u16_z (p0, z0, x0), + z0 = svlsl_wide_z (p0, z0, x0)) + +/* +** lsl_wide_x0_u16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u16_z_untied, svuint16_t, uint64_t, + z0 = svlsl_wide_n_u16_z (p0, z1, x0), + z0 = svlsl_wide_z (p0, z1, x0)) + +/* +** lsl_wide_1_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u16_z_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_z (p0, z0, 1), + z0 = svlsl_wide_z (p0, z0, 1)) + +/* +** lsl_wide_1_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u16_z_untied, svuint16_t, + z0 = svlsl_wide_n_u16_z (p0, z1, 1), + z0 = svlsl_wide_z (p0, z1, 1)) + +/* +** lsl_wide_15_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_u16_z_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_z (p0, z0, 15), + z0 = svlsl_wide_z (p0, z0, 15)) + +/* +** lsl_wide_15_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_u16_z_untied, svuint16_t, + z0 = svlsl_wide_n_u16_z (p0, z1, 15), + z0 = svlsl_wide_z (p0, z1, 15)) + +/* +** lsl_wide_16_u16_z_tied1: +** mov (z[0-9]+\.d), #16 +** movprfx z0\.h, p0/z, z0\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_u16_z_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_z (p0, z0, 16), + z0 = svlsl_wide_z (p0, z0, 16)) + +/* +** lsl_wide_16_u16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #16 +** movprfx z0\.h, p0/z, z1\.h +** lsl z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_u16_z_untied, svuint16_t, + z0 = svlsl_wide_n_u16_z (p0, z1, 16), + z0 = svlsl_wide_z (p0, z1, 16)) + +/* +** lsl_wide_u16_x_tied1: +** lsl z0\.h, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u16_x_tied1, svuint16_t, svuint64_t, + z0 = svlsl_wide_u16_x (p0, z0, z4), + z0 = svlsl_wide_x (p0, z0, z4)) + +/* +** lsl_wide_u16_x_tied2: +** lsl z0\.h, z4\.h, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u16_x_tied2, svuint16_t, svuint64_t, + z0_res = svlsl_wide_u16_x (p0, z4, z0), + z0_res = svlsl_wide_x (p0, z4, z0)) + +/* +** lsl_wide_u16_x_untied: +** lsl z0\.h, z1\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u16_x_untied, svuint16_t, svuint64_t, + z0 = svlsl_wide_u16_x (p0, z1, z4), + z0 = svlsl_wide_x (p0, z1, z4)) + +/* +** lsl_wide_x0_u16_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u16_x_tied1, svuint16_t, uint64_t, + z0 = svlsl_wide_n_u16_x (p0, z0, x0), + z0 = svlsl_wide_x (p0, z0, x0)) + +/* +** lsl_wide_x0_u16_x_untied: +** mov (z[0-9]+\.d), x0 +** lsl z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u16_x_untied, svuint16_t, uint64_t, + z0 = svlsl_wide_n_u16_x (p0, z1, x0), + z0 = 
svlsl_wide_x (p0, z1, x0)) + +/* +** lsl_wide_1_u16_x_tied1: +** lsl z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u16_x_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_x (p0, z0, 1), + z0 = svlsl_wide_x (p0, z0, 1)) + +/* +** lsl_wide_1_u16_x_untied: +** lsl z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u16_x_untied, svuint16_t, + z0 = svlsl_wide_n_u16_x (p0, z1, 1), + z0 = svlsl_wide_x (p0, z1, 1)) + +/* +** lsl_wide_15_u16_x_tied1: +** lsl z0\.h, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_u16_x_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_x (p0, z0, 15), + z0 = svlsl_wide_x (p0, z0, 15)) + +/* +** lsl_wide_15_u16_x_untied: +** lsl z0\.h, z1\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_15_u16_x_untied, svuint16_t, + z0 = svlsl_wide_n_u16_x (p0, z1, 15), + z0 = svlsl_wide_x (p0, z1, 15)) + +/* +** lsl_wide_16_u16_x_tied1: +** mov (z[0-9]+\.d), #16 +** lsl z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_u16_x_tied1, svuint16_t, + z0 = svlsl_wide_n_u16_x (p0, z0, 16), + z0 = svlsl_wide_x (p0, z0, 16)) + +/* +** lsl_wide_16_u16_x_untied: +** mov (z[0-9]+\.d), #16 +** lsl z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_16_u16_x_untied, svuint16_t, + z0 = svlsl_wide_n_u16_x (p0, z1, 16), + z0 = svlsl_wide_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u32.c new file mode 100644 index 000000000..7b1afab49 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u32.c @@ -0,0 +1,331 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_wide_u32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u32_m_tied1, svuint32_t, svuint64_t, + z0 = svlsl_wide_u32_m (p0, z0, z4), + z0 = svlsl_wide_m (p0, z0, z4)) + +/* +** lsl_wide_u32_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u32_m_tied2, svuint32_t, svuint64_t, + z0_res = svlsl_wide_u32_m (p0, z4, z0), + z0_res = svlsl_wide_m (p0, z4, z0)) + +/* +** lsl_wide_u32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u32_m_untied, svuint32_t, svuint64_t, + z0 = svlsl_wide_u32_m (p0, z1, z4), + z0 = svlsl_wide_m (p0, z1, z4)) + +/* +** lsl_wide_x0_u32_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u32_m_tied1, svuint32_t, uint64_t, + z0 = svlsl_wide_n_u32_m (p0, z0, x0), + z0 = svlsl_wide_m (p0, z0, x0)) + +/* +** lsl_wide_x0_u32_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u32_m_untied, svuint32_t, uint64_t, + z0 = svlsl_wide_n_u32_m (p0, z1, x0), + z0 = svlsl_wide_m (p0, z1, x0)) + +/* +** lsl_wide_1_u32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u32_m_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_m (p0, z0, 1), + z0 = svlsl_wide_m (p0, z0, 1)) + +/* +** lsl_wide_1_u32_m_untied: +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u32_m_untied, svuint32_t, + z0 = svlsl_wide_n_u32_m (p0, z1, 1), + z0 = svlsl_wide_m (p0, z1, 1)) + +/* +** lsl_wide_31_u32_m_tied1: +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_u32_m_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_m (p0, z0, 31), + z0 = svlsl_wide_m (p0, z0, 31)) + +/* +** lsl_wide_31_u32_m_untied: +** movprfx 
z0, z1 +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_u32_m_untied, svuint32_t, + z0 = svlsl_wide_n_u32_m (p0, z1, 31), + z0 = svlsl_wide_m (p0, z1, 31)) + +/* +** lsl_wide_32_u32_m_tied1: +** mov (z[0-9]+\.d), #32 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_u32_m_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_m (p0, z0, 32), + z0 = svlsl_wide_m (p0, z0, 32)) + +/* +** lsl_wide_32_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #32 +** movprfx z0, z1 +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_u32_m_untied, svuint32_t, + z0 = svlsl_wide_n_u32_m (p0, z1, 32), + z0 = svlsl_wide_m (p0, z1, 32)) + +/* +** lsl_wide_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u32_z_tied1, svuint32_t, svuint64_t, + z0 = svlsl_wide_u32_z (p0, z0, z4), + z0 = svlsl_wide_z (p0, z0, z4)) + +/* +** lsl_wide_u32_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.s, p0/z, z4\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u32_z_tied2, svuint32_t, svuint64_t, + z0_res = svlsl_wide_u32_z (p0, z4, z0), + z0_res = svlsl_wide_z (p0, z4, z0)) + +/* +** lsl_wide_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u32_z_untied, svuint32_t, svuint64_t, + z0 = svlsl_wide_u32_z (p0, z1, z4), + z0 = svlsl_wide_z (p0, z1, z4)) + +/* +** lsl_wide_x0_u32_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u32_z_tied1, svuint32_t, uint64_t, + z0 = svlsl_wide_n_u32_z (p0, z0, x0), + z0 = svlsl_wide_z (p0, z0, x0)) + +/* +** lsl_wide_x0_u32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u32_z_untied, svuint32_t, uint64_t, + z0 = svlsl_wide_n_u32_z (p0, z1, x0), + z0 = svlsl_wide_z (p0, z1, x0)) + +/* +** lsl_wide_1_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u32_z_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_z (p0, z0, 1), + z0 = svlsl_wide_z (p0, z0, 1)) + +/* +** lsl_wide_1_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u32_z_untied, svuint32_t, + z0 = svlsl_wide_n_u32_z (p0, z1, 1), + z0 = svlsl_wide_z (p0, z1, 1)) + +/* +** lsl_wide_31_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_u32_z_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_z (p0, z0, 31), + z0 = svlsl_wide_z (p0, z0, 31)) + +/* +** lsl_wide_31_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_u32_z_untied, svuint32_t, + z0 = svlsl_wide_n_u32_z (p0, z1, 31), + z0 = svlsl_wide_z (p0, z1, 31)) + +/* +** lsl_wide_32_u32_z_tied1: +** mov (z[0-9]+\.d), #32 +** movprfx z0\.s, p0/z, z0\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_u32_z_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_z (p0, z0, 32), + z0 = svlsl_wide_z (p0, z0, 32)) + +/* +** lsl_wide_32_u32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #32 +** movprfx z0\.s, p0/z, z1\.s +** lsl z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_u32_z_untied, svuint32_t, + z0 = svlsl_wide_n_u32_z (p0, z1, 32), + z0 = svlsl_wide_z (p0, z1, 32)) + +/* +** lsl_wide_u32_x_tied1: +** lsl z0\.s, z0\.s, 
z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u32_x_tied1, svuint32_t, svuint64_t, + z0 = svlsl_wide_u32_x (p0, z0, z4), + z0 = svlsl_wide_x (p0, z0, z4)) + +/* +** lsl_wide_u32_x_tied2: +** lsl z0\.s, z4\.s, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u32_x_tied2, svuint32_t, svuint64_t, + z0_res = svlsl_wide_u32_x (p0, z4, z0), + z0_res = svlsl_wide_x (p0, z4, z0)) + +/* +** lsl_wide_u32_x_untied: +** lsl z0\.s, z1\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u32_x_untied, svuint32_t, svuint64_t, + z0 = svlsl_wide_u32_x (p0, z1, z4), + z0 = svlsl_wide_x (p0, z1, z4)) + +/* +** lsl_wide_x0_u32_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u32_x_tied1, svuint32_t, uint64_t, + z0 = svlsl_wide_n_u32_x (p0, z0, x0), + z0 = svlsl_wide_x (p0, z0, x0)) + +/* +** lsl_wide_x0_u32_x_untied: +** mov (z[0-9]+\.d), x0 +** lsl z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u32_x_untied, svuint32_t, uint64_t, + z0 = svlsl_wide_n_u32_x (p0, z1, x0), + z0 = svlsl_wide_x (p0, z1, x0)) + +/* +** lsl_wide_1_u32_x_tied1: +** lsl z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u32_x_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_x (p0, z0, 1), + z0 = svlsl_wide_x (p0, z0, 1)) + +/* +** lsl_wide_1_u32_x_untied: +** lsl z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u32_x_untied, svuint32_t, + z0 = svlsl_wide_n_u32_x (p0, z1, 1), + z0 = svlsl_wide_x (p0, z1, 1)) + +/* +** lsl_wide_31_u32_x_tied1: +** lsl z0\.s, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_u32_x_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_x (p0, z0, 31), + z0 = svlsl_wide_x (p0, z0, 31)) + +/* +** lsl_wide_31_u32_x_untied: +** lsl z0\.s, z1\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_31_u32_x_untied, svuint32_t, + z0 = svlsl_wide_n_u32_x (p0, z1, 31), + z0 = svlsl_wide_x (p0, z1, 31)) + +/* +** lsl_wide_32_u32_x_tied1: +** mov (z[0-9]+\.d), #32 +** lsl z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_u32_x_tied1, svuint32_t, + z0 = svlsl_wide_n_u32_x (p0, z0, 32), + z0 = svlsl_wide_x (p0, z0, 32)) + +/* +** lsl_wide_32_u32_x_untied: +** mov (z[0-9]+\.d), #32 +** lsl z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_32_u32_x_untied, svuint32_t, + z0 = svlsl_wide_n_u32_x (p0, z1, 32), + z0 = svlsl_wide_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u8.c new file mode 100644 index 000000000..df8b1ec86 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsl_wide_u8.c @@ -0,0 +1,331 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsl_wide_u8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u8_m_tied1, svuint8_t, svuint64_t, + z0 = svlsl_wide_u8_m (p0, z0, z4), + z0 = svlsl_wide_m (p0, z0, z4)) + +/* +** lsl_wide_u8_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u8_m_tied2, svuint8_t, svuint64_t, + z0_res = svlsl_wide_u8_m (p0, z4, z0), + z0_res = svlsl_wide_m (p0, z4, z0)) + +/* +** lsl_wide_u8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u8_m_untied, svuint8_t, svuint64_t, + z0 = svlsl_wide_u8_m (p0, z1, z4), + z0 = svlsl_wide_m (p0, z1, z4)) + +/* +** lsl_wide_x0_u8_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u8_m_tied1, svuint8_t, uint64_t, + z0 = 
svlsl_wide_n_u8_m (p0, z0, x0), + z0 = svlsl_wide_m (p0, z0, x0)) + +/* +** lsl_wide_x0_u8_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u8_m_untied, svuint8_t, uint64_t, + z0 = svlsl_wide_n_u8_m (p0, z1, x0), + z0 = svlsl_wide_m (p0, z1, x0)) + +/* +** lsl_wide_1_u8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u8_m_tied1, svuint8_t, + z0 = svlsl_wide_n_u8_m (p0, z0, 1), + z0 = svlsl_wide_m (p0, z0, 1)) + +/* +** lsl_wide_1_u8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u8_m_untied, svuint8_t, + z0 = svlsl_wide_n_u8_m (p0, z1, 1), + z0 = svlsl_wide_m (p0, z1, 1)) + +/* +** lsl_wide_7_u8_m_tied1: +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_u8_m_tied1, svuint8_t, + z0 = svlsl_wide_n_u8_m (p0, z0, 7), + z0 = svlsl_wide_m (p0, z0, 7)) + +/* +** lsl_wide_7_u8_m_untied: +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_u8_m_untied, svuint8_t, + z0 = svlsl_wide_n_u8_m (p0, z1, 7), + z0 = svlsl_wide_m (p0, z1, 7)) + +/* +** lsl_wide_8_u8_m_tied1: +** mov (z[0-9]+\.d), #8 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_u8_m_tied1, svuint8_t, + z0 = svlsl_wide_n_u8_m (p0, z0, 8), + z0 = svlsl_wide_m (p0, z0, 8)) + +/* +** lsl_wide_8_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #8 +** movprfx z0, z1 +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_u8_m_untied, svuint8_t, + z0 = svlsl_wide_n_u8_m (p0, z1, 8), + z0 = svlsl_wide_m (p0, z1, 8)) + +/* +** lsl_wide_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u8_z_tied1, svuint8_t, svuint64_t, + z0 = svlsl_wide_u8_z (p0, z0, z4), + z0 = svlsl_wide_z (p0, z0, z4)) + +/* +** lsl_wide_u8_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.b, p0/z, z4\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u8_z_tied2, svuint8_t, svuint64_t, + z0_res = svlsl_wide_u8_z (p0, z4, z0), + z0_res = svlsl_wide_z (p0, z4, z0)) + +/* +** lsl_wide_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u8_z_untied, svuint8_t, svuint64_t, + z0 = svlsl_wide_u8_z (p0, z1, z4), + z0 = svlsl_wide_z (p0, z1, z4)) + +/* +** lsl_wide_x0_u8_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u8_z_tied1, svuint8_t, uint64_t, + z0 = svlsl_wide_n_u8_z (p0, z0, x0), + z0 = svlsl_wide_z (p0, z0, x0)) + +/* +** lsl_wide_x0_u8_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u8_z_untied, svuint8_t, uint64_t, + z0 = svlsl_wide_n_u8_z (p0, z1, x0), + z0 = svlsl_wide_z (p0, z1, x0)) + +/* +** lsl_wide_1_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u8_z_tied1, svuint8_t, + z0 = svlsl_wide_n_u8_z (p0, z0, 1), + z0 = svlsl_wide_z (p0, z0, 1)) + +/* +** lsl_wide_1_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u8_z_untied, svuint8_t, + z0 = svlsl_wide_n_u8_z (p0, z1, 1), + z0 = svlsl_wide_z (p0, z1, 1)) + +/* +** lsl_wide_7_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_u8_z_tied1, 
svuint8_t, + z0 = svlsl_wide_n_u8_z (p0, z0, 7), + z0 = svlsl_wide_z (p0, z0, 7)) + +/* +** lsl_wide_7_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_u8_z_untied, svuint8_t, + z0 = svlsl_wide_n_u8_z (p0, z1, 7), + z0 = svlsl_wide_z (p0, z1, 7)) + +/* +** lsl_wide_8_u8_z_tied1: +** mov (z[0-9]+\.d), #8 +** movprfx z0\.b, p0/z, z0\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_u8_z_tied1, svuint8_t, + z0 = svlsl_wide_n_u8_z (p0, z0, 8), + z0 = svlsl_wide_z (p0, z0, 8)) + +/* +** lsl_wide_8_u8_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #8 +** movprfx z0\.b, p0/z, z1\.b +** lsl z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_u8_z_untied, svuint8_t, + z0 = svlsl_wide_n_u8_z (p0, z1, 8), + z0 = svlsl_wide_z (p0, z1, 8)) + +/* +** lsl_wide_u8_x_tied1: +** lsl z0\.b, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u8_x_tied1, svuint8_t, svuint64_t, + z0 = svlsl_wide_u8_x (p0, z0, z4), + z0 = svlsl_wide_x (p0, z0, z4)) + +/* +** lsl_wide_u8_x_tied2: +** lsl z0\.b, z4\.b, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsl_wide_u8_x_tied2, svuint8_t, svuint64_t, + z0_res = svlsl_wide_u8_x (p0, z4, z0), + z0_res = svlsl_wide_x (p0, z4, z0)) + +/* +** lsl_wide_u8_x_untied: +** lsl z0\.b, z1\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsl_wide_u8_x_untied, svuint8_t, svuint64_t, + z0 = svlsl_wide_u8_x (p0, z1, z4), + z0 = svlsl_wide_x (p0, z1, z4)) + +/* +** lsl_wide_x0_u8_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u8_x_tied1, svuint8_t, uint64_t, + z0 = svlsl_wide_n_u8_x (p0, z0, x0), + z0 = svlsl_wide_x (p0, z0, x0)) + +/* +** lsl_wide_x0_u8_x_untied: +** mov (z[0-9]+\.d), x0 +** lsl z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsl_wide_x0_u8_x_untied, svuint8_t, uint64_t, + z0 = svlsl_wide_n_u8_x (p0, z1, x0), + z0 = svlsl_wide_x (p0, z1, x0)) + +/* +** lsl_wide_1_u8_x_tied1: +** lsl z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u8_x_tied1, svuint8_t, + z0 = svlsl_wide_n_u8_x (p0, z0, 1), + z0 = svlsl_wide_x (p0, z0, 1)) + +/* +** lsl_wide_1_u8_x_untied: +** lsl z0\.b, z1\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_1_u8_x_untied, svuint8_t, + z0 = svlsl_wide_n_u8_x (p0, z1, 1), + z0 = svlsl_wide_x (p0, z1, 1)) + +/* +** lsl_wide_7_u8_x_tied1: +** lsl z0\.b, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_u8_x_tied1, svuint8_t, + z0 = svlsl_wide_n_u8_x (p0, z0, 7), + z0 = svlsl_wide_x (p0, z0, 7)) + +/* +** lsl_wide_7_u8_x_untied: +** lsl z0\.b, z1\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_7_u8_x_untied, svuint8_t, + z0 = svlsl_wide_n_u8_x (p0, z1, 7), + z0 = svlsl_wide_x (p0, z1, 7)) + +/* +** lsl_wide_8_u8_x_tied1: +** mov (z[0-9]+\.d), #8 +** lsl z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_u8_x_tied1, svuint8_t, + z0 = svlsl_wide_n_u8_x (p0, z0, 8), + z0 = svlsl_wide_x (p0, z0, 8)) + +/* +** lsl_wide_8_u8_x_untied: +** mov (z[0-9]+\.d), #8 +** lsl z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (lsl_wide_8_u8_x_untied, svuint8_t, + z0 = svlsl_wide_n_u8_x (p0, z1, 8), + z0 = svlsl_wide_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u16.c new file mode 100644 index 000000000..61575645f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u16.c @@ -0,0 +1,340 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsr_u16_m_tied1: +** lsr z0\.h, p0/m, z0\.h, z1\.h 
+** ret +*/ +TEST_UNIFORM_Z (lsr_u16_m_tied1, svuint16_t, + z0 = svlsr_u16_m (p0, z0, z1), + z0 = svlsr_m (p0, z0, z1)) + +/* +** lsr_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (lsr_u16_m_tied2, svuint16_t, + z0 = svlsr_u16_m (p0, z1, z0), + z0 = svlsr_m (p0, z1, z0)) + +/* +** lsr_u16_m_untied: +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (lsr_u16_m_untied, svuint16_t, + z0 = svlsr_u16_m (p0, z1, z2), + z0 = svlsr_m (p0, z1, z2)) + +/* +** lsr_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svlsr_n_u16_m (p0, z0, x0), + z0 = svlsr_m (p0, z0, x0)) + +/* +** lsr_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svlsr_n_u16_m (p0, z1, x0), + z0 = svlsr_m (p0, z1, x0)) + +/* +** lsr_1_u16_m_tied1: +** lsr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u16_m_tied1, svuint16_t, + z0 = svlsr_n_u16_m (p0, z0, 1), + z0 = svlsr_m (p0, z0, 1)) + +/* +** lsr_1_u16_m_untied: +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u16_m_untied, svuint16_t, + z0 = svlsr_n_u16_m (p0, z1, 1), + z0 = svlsr_m (p0, z1, 1)) + +/* +** lsr_15_u16_m_tied1: +** lsr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_15_u16_m_tied1, svuint16_t, + z0 = svlsr_n_u16_m (p0, z0, 15), + z0 = svlsr_m (p0, z0, 15)) + +/* +** lsr_15_u16_m_untied: +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_15_u16_m_untied, svuint16_t, + z0 = svlsr_n_u16_m (p0, z1, 15), + z0 = svlsr_m (p0, z1, 15)) + +/* +** lsr_16_u16_m_tied1: +** lsr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_16_u16_m_tied1, svuint16_t, + z0 = svlsr_n_u16_m (p0, z0, 16), + z0 = svlsr_m (p0, z0, 16)) + +/* +** lsr_16_u16_m_untied: +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_16_u16_m_untied, svuint16_t, + z0 = svlsr_n_u16_m (p0, z1, 16), + z0 = svlsr_m (p0, z1, 16)) + +/* +** lsr_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsr_u16_z_tied1, svuint16_t, + z0 = svlsr_u16_z (p0, z0, z1), + z0 = svlsr_z (p0, z0, z1)) + +/* +** lsr_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** lsrr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsr_u16_z_tied2, svuint16_t, + z0 = svlsr_u16_z (p0, z1, z0), + z0 = svlsr_z (p0, z1, z0)) + +/* +** lsr_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** lsrr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (lsr_u16_z_untied, svuint16_t, + z0 = svlsr_u16_z (p0, z1, z2), + z0 = svlsr_z (p0, z1, z2)) + +/* +** lsr_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svlsr_n_u16_z (p0, z0, x0), + z0 = svlsr_z (p0, z0, x0)) + +/* +** lsr_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** lsrr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svlsr_n_u16_z (p0, z1, x0), + z0 = svlsr_z (p0, z1, x0)) + +/* +** lsr_1_u16_z_tied1: +** 
movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u16_z_tied1, svuint16_t, + z0 = svlsr_n_u16_z (p0, z0, 1), + z0 = svlsr_z (p0, z0, 1)) + +/* +** lsr_1_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u16_z_untied, svuint16_t, + z0 = svlsr_n_u16_z (p0, z1, 1), + z0 = svlsr_z (p0, z1, 1)) + +/* +** lsr_15_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_15_u16_z_tied1, svuint16_t, + z0 = svlsr_n_u16_z (p0, z0, 15), + z0 = svlsr_z (p0, z0, 15)) + +/* +** lsr_15_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_15_u16_z_untied, svuint16_t, + z0 = svlsr_n_u16_z (p0, z1, 15), + z0 = svlsr_z (p0, z1, 15)) + +/* +** lsr_16_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_16_u16_z_tied1, svuint16_t, + z0 = svlsr_n_u16_z (p0, z0, 16), + z0 = svlsr_z (p0, z0, 16)) + +/* +** lsr_16_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_16_u16_z_untied, svuint16_t, + z0 = svlsr_n_u16_z (p0, z1, 16), + z0 = svlsr_z (p0, z1, 16)) + +/* +** lsr_u16_x_tied1: +** lsr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsr_u16_x_tied1, svuint16_t, + z0 = svlsr_u16_x (p0, z0, z1), + z0 = svlsr_x (p0, z0, z1)) + +/* +** lsr_u16_x_tied2: +** lsrr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (lsr_u16_x_tied2, svuint16_t, + z0 = svlsr_u16_x (p0, z1, z0), + z0 = svlsr_x (p0, z1, z0)) + +/* +** lsr_u16_x_untied: +** ( +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** lsrr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (lsr_u16_x_untied, svuint16_t, + z0 = svlsr_u16_x (p0, z1, z2), + z0 = svlsr_x (p0, z1, z2)) + +/* +** lsr_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svlsr_n_u16_x (p0, z0, x0), + z0 = svlsr_x (p0, z0, x0)) + +/* +** lsr_w0_u16_x_untied: +** mov z0\.h, w0 +** lsrr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svlsr_n_u16_x (p0, z1, x0), + z0 = svlsr_x (p0, z1, x0)) + +/* +** lsr_1_u16_x_tied1: +** lsr z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u16_x_tied1, svuint16_t, + z0 = svlsr_n_u16_x (p0, z0, 1), + z0 = svlsr_x (p0, z0, 1)) + +/* +** lsr_1_u16_x_untied: +** lsr z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u16_x_untied, svuint16_t, + z0 = svlsr_n_u16_x (p0, z1, 1), + z0 = svlsr_x (p0, z1, 1)) + +/* +** lsr_15_u16_x_tied1: +** lsr z0\.h, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_15_u16_x_tied1, svuint16_t, + z0 = svlsr_n_u16_x (p0, z0, 15), + z0 = svlsr_x (p0, z0, 15)) + +/* +** lsr_15_u16_x_untied: +** lsr z0\.h, z1\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_15_u16_x_untied, svuint16_t, + z0 = svlsr_n_u16_x (p0, z1, 15), + z0 = svlsr_x (p0, z1, 15)) + +/* +** lsr_16_u16_x_tied1: +** lsr z0\.h, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_16_u16_x_tied1, svuint16_t, + z0 = svlsr_n_u16_x (p0, z0, 16), + z0 = svlsr_x (p0, z0, 16)) + +/* +** lsr_16_u16_x_untied: +** lsr z0\.h, z1\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_16_u16_x_untied, svuint16_t, + z0 = svlsr_n_u16_x (p0, z1, 16), + z0 = svlsr_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u32.c new file mode 100644 index 000000000..796867ef8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u32.c @@ -0,0 +1,340 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsr_u32_m_tied1: +** lsr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_m_tied1, svuint32_t, + z0 = svlsr_u32_m (p0, z0, z1), + z0 = svlsr_m (p0, z0, z1)) + +/* +** lsr_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_m_tied2, svuint32_t, + z0 = svlsr_u32_m (p0, z1, z0), + z0 = svlsr_m (p0, z1, z0)) + +/* +** lsr_u32_m_untied: +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_m_untied, svuint32_t, + z0 = svlsr_u32_m (p0, z1, z2), + z0 = svlsr_m (p0, z1, z2)) + +/* +** lsr_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svlsr_n_u32_m (p0, z0, x0), + z0 = svlsr_m (p0, z0, x0)) + +/* +** lsr_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svlsr_n_u32_m (p0, z1, x0), + z0 = svlsr_m (p0, z1, x0)) + +/* +** lsr_1_u32_m_tied1: +** lsr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u32_m_tied1, svuint32_t, + z0 = svlsr_n_u32_m (p0, z0, 1), + z0 = svlsr_m (p0, z0, 1)) + +/* +** lsr_1_u32_m_untied: +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u32_m_untied, svuint32_t, + z0 = svlsr_n_u32_m (p0, z1, 1), + z0 = svlsr_m (p0, z1, 1)) + +/* +** lsr_31_u32_m_tied1: +** lsr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_31_u32_m_tied1, svuint32_t, + z0 = svlsr_n_u32_m (p0, z0, 31), + z0 = svlsr_m (p0, z0, 31)) + +/* +** lsr_31_u32_m_untied: +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_31_u32_m_untied, svuint32_t, + z0 = svlsr_n_u32_m (p0, z1, 31), + z0 = svlsr_m (p0, z1, 31)) + +/* +** lsr_32_u32_m_tied1: +** lsr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_32_u32_m_tied1, svuint32_t, + z0 = svlsr_n_u32_m (p0, z0, 32), + z0 = svlsr_m (p0, z0, 32)) + +/* +** lsr_32_u32_m_untied: +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_32_u32_m_untied, svuint32_t, + z0 = svlsr_n_u32_m (p0, z1, 32), + z0 = svlsr_m (p0, z1, 32)) + +/* +** lsr_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_z_tied1, svuint32_t, + z0 = svlsr_u32_z (p0, z0, z1), + z0 = svlsr_z (p0, z0, z1)) + +/* +** lsr_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** lsrr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_z_tied2, svuint32_t, + z0 = svlsr_u32_z (p0, z1, z0), + z0 = svlsr_z (p0, z1, z0)) + +/* +** lsr_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** lsrr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_z_untied, svuint32_t, + z0 = svlsr_u32_z (p0, z1, z2), + z0 = svlsr_z (p0, z1, z2)) + +/* +** lsr_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svlsr_n_u32_z (p0, z0, x0), + z0 = svlsr_z (p0, z0, x0)) + +/* +** 
lsr_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** lsrr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svlsr_n_u32_z (p0, z1, x0), + z0 = svlsr_z (p0, z1, x0)) + +/* +** lsr_1_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u32_z_tied1, svuint32_t, + z0 = svlsr_n_u32_z (p0, z0, 1), + z0 = svlsr_z (p0, z0, 1)) + +/* +** lsr_1_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u32_z_untied, svuint32_t, + z0 = svlsr_n_u32_z (p0, z1, 1), + z0 = svlsr_z (p0, z1, 1)) + +/* +** lsr_31_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_31_u32_z_tied1, svuint32_t, + z0 = svlsr_n_u32_z (p0, z0, 31), + z0 = svlsr_z (p0, z0, 31)) + +/* +** lsr_31_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_31_u32_z_untied, svuint32_t, + z0 = svlsr_n_u32_z (p0, z1, 31), + z0 = svlsr_z (p0, z1, 31)) + +/* +** lsr_32_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_32_u32_z_tied1, svuint32_t, + z0 = svlsr_n_u32_z (p0, z0, 32), + z0 = svlsr_z (p0, z0, 32)) + +/* +** lsr_32_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_32_u32_z_untied, svuint32_t, + z0 = svlsr_n_u32_z (p0, z1, 32), + z0 = svlsr_z (p0, z1, 32)) + +/* +** lsr_u32_x_tied1: +** lsr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_x_tied1, svuint32_t, + z0 = svlsr_u32_x (p0, z0, z1), + z0 = svlsr_x (p0, z0, z1)) + +/* +** lsr_u32_x_tied2: +** lsrr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_x_tied2, svuint32_t, + z0 = svlsr_u32_x (p0, z1, z0), + z0 = svlsr_x (p0, z1, z0)) + +/* +** lsr_u32_x_untied: +** ( +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** lsrr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (lsr_u32_x_untied, svuint32_t, + z0 = svlsr_u32_x (p0, z1, z2), + z0 = svlsr_x (p0, z1, z2)) + +/* +** lsr_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svlsr_n_u32_x (p0, z0, x0), + z0 = svlsr_x (p0, z0, x0)) + +/* +** lsr_w0_u32_x_untied: +** mov z0\.s, w0 +** lsrr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svlsr_n_u32_x (p0, z1, x0), + z0 = svlsr_x (p0, z1, x0)) + +/* +** lsr_1_u32_x_tied1: +** lsr z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u32_x_tied1, svuint32_t, + z0 = svlsr_n_u32_x (p0, z0, 1), + z0 = svlsr_x (p0, z0, 1)) + +/* +** lsr_1_u32_x_untied: +** lsr z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u32_x_untied, svuint32_t, + z0 = svlsr_n_u32_x (p0, z1, 1), + z0 = svlsr_x (p0, z1, 1)) + +/* +** lsr_31_u32_x_tied1: +** lsr z0\.s, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_31_u32_x_tied1, svuint32_t, + z0 = svlsr_n_u32_x (p0, z0, 31), + z0 = svlsr_x (p0, z0, 31)) + +/* +** lsr_31_u32_x_untied: +** lsr z0\.s, z1\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_31_u32_x_untied, svuint32_t, + z0 = svlsr_n_u32_x (p0, z1, 31), + z0 = svlsr_x (p0, z1, 31)) + +/* +** lsr_32_u32_x_tied1: +** lsr z0\.s, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_32_u32_x_tied1, svuint32_t, + z0 = 
svlsr_n_u32_x (p0, z0, 32), + z0 = svlsr_x (p0, z0, 32)) + +/* +** lsr_32_u32_x_untied: +** lsr z0\.s, z1\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_32_u32_x_untied, svuint32_t, + z0 = svlsr_n_u32_x (p0, z1, 32), + z0 = svlsr_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u64.c new file mode 100644 index 000000000..b50777f50 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u64.c @@ -0,0 +1,340 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsr_u64_m_tied1: +** lsr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_m_tied1, svuint64_t, + z0 = svlsr_u64_m (p0, z0, z1), + z0 = svlsr_m (p0, z0, z1)) + +/* +** lsr_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** lsr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_m_tied2, svuint64_t, + z0 = svlsr_u64_m (p0, z1, z0), + z0 = svlsr_m (p0, z1, z0)) + +/* +** lsr_u64_m_untied: +** movprfx z0, z1 +** lsr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_m_untied, svuint64_t, + z0 = svlsr_u64_m (p0, z1, z2), + z0 = svlsr_m (p0, z1, z2)) + +/* +** lsr_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svlsr_n_u64_m (p0, z0, x0), + z0 = svlsr_m (p0, z0, x0)) + +/* +** lsr_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svlsr_n_u64_m (p0, z1, x0), + z0 = svlsr_m (p0, z1, x0)) + +/* +** lsr_1_u64_m_tied1: +** lsr z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u64_m_tied1, svuint64_t, + z0 = svlsr_n_u64_m (p0, z0, 1), + z0 = svlsr_m (p0, z0, 1)) + +/* +** lsr_1_u64_m_untied: +** movprfx z0, z1 +** lsr z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u64_m_untied, svuint64_t, + z0 = svlsr_n_u64_m (p0, z1, 1), + z0 = svlsr_m (p0, z1, 1)) + +/* +** lsr_63_u64_m_tied1: +** lsr z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsr_63_u64_m_tied1, svuint64_t, + z0 = svlsr_n_u64_m (p0, z0, 63), + z0 = svlsr_m (p0, z0, 63)) + +/* +** lsr_63_u64_m_untied: +** movprfx z0, z1 +** lsr z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsr_63_u64_m_untied, svuint64_t, + z0 = svlsr_n_u64_m (p0, z1, 63), + z0 = svlsr_m (p0, z1, 63)) + +/* +** lsr_64_u64_m_tied1: +** lsr z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (lsr_64_u64_m_tied1, svuint64_t, + z0 = svlsr_n_u64_m (p0, z0, 64), + z0 = svlsr_m (p0, z0, 64)) + +/* +** lsr_64_u64_m_untied: +** movprfx z0, z1 +** lsr z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (lsr_64_u64_m_untied, svuint64_t, + z0 = svlsr_n_u64_m (p0, z1, 64), + z0 = svlsr_m (p0, z1, 64)) + +/* +** lsr_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_z_tied1, svuint64_t, + z0 = svlsr_u64_z (p0, z0, z1), + z0 = svlsr_z (p0, z0, z1)) + +/* +** lsr_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** lsrr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_z_tied2, svuint64_t, + z0 = svlsr_u64_z (p0, z1, z0), + z0 = svlsr_z (p0, z1, z0)) + +/* +** lsr_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** lsr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** lsrr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_z_untied, svuint64_t, + z0 = svlsr_u64_z (p0, z1, z2), + z0 
= svlsr_z (p0, z1, z2)) + +/* +** lsr_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** lsr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svlsr_n_u64_z (p0, z0, x0), + z0 = svlsr_z (p0, z0, x0)) + +/* +** lsr_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** lsr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** lsrr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsr_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svlsr_n_u64_z (p0, z1, x0), + z0 = svlsr_z (p0, z1, x0)) + +/* +** lsr_1_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsr z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u64_z_tied1, svuint64_t, + z0 = svlsr_n_u64_z (p0, z0, 1), + z0 = svlsr_z (p0, z0, 1)) + +/* +** lsr_1_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** lsr z0\.d, p0/m, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u64_z_untied, svuint64_t, + z0 = svlsr_n_u64_z (p0, z1, 1), + z0 = svlsr_z (p0, z1, 1)) + +/* +** lsr_63_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsr z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsr_63_u64_z_tied1, svuint64_t, + z0 = svlsr_n_u64_z (p0, z0, 63), + z0 = svlsr_z (p0, z0, 63)) + +/* +** lsr_63_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** lsr z0\.d, p0/m, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsr_63_u64_z_untied, svuint64_t, + z0 = svlsr_n_u64_z (p0, z1, 63), + z0 = svlsr_z (p0, z1, 63)) + +/* +** lsr_64_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** lsr z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (lsr_64_u64_z_tied1, svuint64_t, + z0 = svlsr_n_u64_z (p0, z0, 64), + z0 = svlsr_z (p0, z0, 64)) + +/* +** lsr_64_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** lsr z0\.d, p0/m, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (lsr_64_u64_z_untied, svuint64_t, + z0 = svlsr_n_u64_z (p0, z1, 64), + z0 = svlsr_z (p0, z1, 64)) + +/* +** lsr_u64_x_tied1: +** lsr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_x_tied1, svuint64_t, + z0 = svlsr_u64_x (p0, z0, z1), + z0 = svlsr_x (p0, z0, z1)) + +/* +** lsr_u64_x_tied2: +** lsrr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_x_tied2, svuint64_t, + z0 = svlsr_u64_x (p0, z1, z0), + z0 = svlsr_x (p0, z1, z0)) + +/* +** lsr_u64_x_untied: +** ( +** movprfx z0, z1 +** lsr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** lsrr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (lsr_u64_x_untied, svuint64_t, + z0 = svlsr_u64_x (p0, z1, z2), + z0 = svlsr_x (p0, z1, z2)) + +/* +** lsr_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svlsr_n_u64_x (p0, z0, x0), + z0 = svlsr_x (p0, z0, x0)) + +/* +** lsr_x0_u64_x_untied: +** mov z0\.d, x0 +** lsrr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (lsr_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svlsr_n_u64_x (p0, z1, x0), + z0 = svlsr_x (p0, z1, x0)) + +/* +** lsr_1_u64_x_tied1: +** lsr z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u64_x_tied1, svuint64_t, + z0 = svlsr_n_u64_x (p0, z0, 1), + z0 = svlsr_x (p0, z0, 1)) + +/* +** lsr_1_u64_x_untied: +** lsr z0\.d, z1\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u64_x_untied, svuint64_t, + z0 = svlsr_n_u64_x (p0, z1, 1), + z0 = svlsr_x (p0, z1, 1)) + +/* +** lsr_63_u64_x_tied1: +** lsr z0\.d, z0\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsr_63_u64_x_tied1, svuint64_t, + z0 = svlsr_n_u64_x (p0, z0, 63), + z0 = svlsr_x (p0, z0, 63)) + +/* +** 
lsr_63_u64_x_untied: +** lsr z0\.d, z1\.d, #63 +** ret +*/ +TEST_UNIFORM_Z (lsr_63_u64_x_untied, svuint64_t, + z0 = svlsr_n_u64_x (p0, z1, 63), + z0 = svlsr_x (p0, z1, 63)) + +/* +** lsr_64_u64_x_tied1: +** lsr z0\.d, z0\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (lsr_64_u64_x_tied1, svuint64_t, + z0 = svlsr_n_u64_x (p0, z0, 64), + z0 = svlsr_x (p0, z0, 64)) + +/* +** lsr_64_u64_x_untied: +** lsr z0\.d, z1\.d, #64 +** ret +*/ +TEST_UNIFORM_Z (lsr_64_u64_x_untied, svuint64_t, + z0 = svlsr_n_u64_x (p0, z1, 64), + z0 = svlsr_x (p0, z1, 64)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u8.c new file mode 100644 index 000000000..a049ca905 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_u8.c @@ -0,0 +1,340 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsr_u8_m_tied1: +** lsr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_m_tied1, svuint8_t, + z0 = svlsr_u8_m (p0, z0, z1), + z0 = svlsr_m (p0, z0, z1)) + +/* +** lsr_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_m_tied2, svuint8_t, + z0 = svlsr_u8_m (p0, z1, z0), + z0 = svlsr_m (p0, z1, z0)) + +/* +** lsr_u8_m_untied: +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_m_untied, svuint8_t, + z0 = svlsr_u8_m (p0, z1, z2), + z0 = svlsr_m (p0, z1, z2)) + +/* +** lsr_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svlsr_n_u8_m (p0, z0, x0), + z0 = svlsr_m (p0, z0, x0)) + +/* +** lsr_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svlsr_n_u8_m (p0, z1, x0), + z0 = svlsr_m (p0, z1, x0)) + +/* +** lsr_1_u8_m_tied1: +** lsr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u8_m_tied1, svuint8_t, + z0 = svlsr_n_u8_m (p0, z0, 1), + z0 = svlsr_m (p0, z0, 1)) + +/* +** lsr_1_u8_m_untied: +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u8_m_untied, svuint8_t, + z0 = svlsr_n_u8_m (p0, z1, 1), + z0 = svlsr_m (p0, z1, 1)) + +/* +** lsr_7_u8_m_tied1: +** lsr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_7_u8_m_tied1, svuint8_t, + z0 = svlsr_n_u8_m (p0, z0, 7), + z0 = svlsr_m (p0, z0, 7)) + +/* +** lsr_7_u8_m_untied: +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_7_u8_m_untied, svuint8_t, + z0 = svlsr_n_u8_m (p0, z1, 7), + z0 = svlsr_m (p0, z1, 7)) + +/* +** lsr_8_u8_m_tied1: +** lsr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_8_u8_m_tied1, svuint8_t, + z0 = svlsr_n_u8_m (p0, z0, 8), + z0 = svlsr_m (p0, z0, 8)) + +/* +** lsr_8_u8_m_untied: +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_8_u8_m_untied, svuint8_t, + z0 = svlsr_n_u8_m (p0, z1, 8), + z0 = svlsr_m (p0, z1, 8)) + +/* +** lsr_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_z_tied1, svuint8_t, + z0 = svlsr_u8_z (p0, z0, z1), + z0 = svlsr_z (p0, z0, z1)) + +/* +** lsr_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** lsrr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_z_tied2, svuint8_t, + z0 = svlsr_u8_z (p0, z1, z0), + z0 = svlsr_z (p0, z1, z0)) + +/* +** lsr_u8_z_untied: +** ( 
+** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** lsrr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_z_untied, svuint8_t, + z0 = svlsr_u8_z (p0, z1, z2), + z0 = svlsr_z (p0, z1, z2)) + +/* +** lsr_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svlsr_n_u8_z (p0, z0, x0), + z0 = svlsr_z (p0, z0, x0)) + +/* +** lsr_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** lsrr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svlsr_n_u8_z (p0, z1, x0), + z0 = svlsr_z (p0, z1, x0)) + +/* +** lsr_1_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u8_z_tied1, svuint8_t, + z0 = svlsr_n_u8_z (p0, z0, 1), + z0 = svlsr_z (p0, z0, 1)) + +/* +** lsr_1_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u8_z_untied, svuint8_t, + z0 = svlsr_n_u8_z (p0, z1, 1), + z0 = svlsr_z (p0, z1, 1)) + +/* +** lsr_7_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_7_u8_z_tied1, svuint8_t, + z0 = svlsr_n_u8_z (p0, z0, 7), + z0 = svlsr_z (p0, z0, 7)) + +/* +** lsr_7_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_7_u8_z_untied, svuint8_t, + z0 = svlsr_n_u8_z (p0, z1, 7), + z0 = svlsr_z (p0, z1, 7)) + +/* +** lsr_8_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_8_u8_z_tied1, svuint8_t, + z0 = svlsr_n_u8_z (p0, z0, 8), + z0 = svlsr_z (p0, z0, 8)) + +/* +** lsr_8_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_8_u8_z_untied, svuint8_t, + z0 = svlsr_n_u8_z (p0, z1, 8), + z0 = svlsr_z (p0, z1, 8)) + +/* +** lsr_u8_x_tied1: +** lsr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_x_tied1, svuint8_t, + z0 = svlsr_u8_x (p0, z0, z1), + z0 = svlsr_x (p0, z0, z1)) + +/* +** lsr_u8_x_tied2: +** lsrr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_x_tied2, svuint8_t, + z0 = svlsr_u8_x (p0, z1, z0), + z0 = svlsr_x (p0, z1, z0)) + +/* +** lsr_u8_x_untied: +** ( +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** lsrr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (lsr_u8_x_untied, svuint8_t, + z0 = svlsr_u8_x (p0, z1, z2), + z0 = svlsr_x (p0, z1, z2)) + +/* +** lsr_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svlsr_n_u8_x (p0, z0, x0), + z0 = svlsr_x (p0, z0, x0)) + +/* +** lsr_w0_u8_x_untied: +** mov z0\.b, w0 +** lsrr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (lsr_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svlsr_n_u8_x (p0, z1, x0), + z0 = svlsr_x (p0, z1, x0)) + +/* +** lsr_1_u8_x_tied1: +** lsr z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u8_x_tied1, svuint8_t, + z0 = svlsr_n_u8_x (p0, z0, 1), + z0 = svlsr_x (p0, z0, 1)) + +/* +** lsr_1_u8_x_untied: +** lsr z0\.b, z1\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_1_u8_x_untied, svuint8_t, + z0 = svlsr_n_u8_x (p0, z1, 1), + z0 = svlsr_x (p0, z1, 1)) + +/* +** lsr_7_u8_x_tied1: +** lsr z0\.b, z0\.b, 
#7 +** ret +*/ +TEST_UNIFORM_Z (lsr_7_u8_x_tied1, svuint8_t, + z0 = svlsr_n_u8_x (p0, z0, 7), + z0 = svlsr_x (p0, z0, 7)) + +/* +** lsr_7_u8_x_untied: +** lsr z0\.b, z1\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_7_u8_x_untied, svuint8_t, + z0 = svlsr_n_u8_x (p0, z1, 7), + z0 = svlsr_x (p0, z1, 7)) + +/* +** lsr_8_u8_x_tied1: +** lsr z0\.b, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_8_u8_x_tied1, svuint8_t, + z0 = svlsr_n_u8_x (p0, z0, 8), + z0 = svlsr_x (p0, z0, 8)) + +/* +** lsr_8_u8_x_untied: +** lsr z0\.b, z1\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_8_u8_x_untied, svuint8_t, + z0 = svlsr_n_u8_x (p0, z1, 8), + z0 = svlsr_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u16.c new file mode 100644 index 000000000..863b51a2f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u16.c @@ -0,0 +1,325 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsr_wide_u16_m_tied1: +** lsr z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u16_m_tied1, svuint16_t, svuint64_t, + z0 = svlsr_wide_u16_m (p0, z0, z4), + z0 = svlsr_wide_m (p0, z0, z4)) + +/* +** lsr_wide_u16_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u16_m_tied2, svuint16_t, svuint64_t, + z0_res = svlsr_wide_u16_m (p0, z4, z0), + z0_res = svlsr_wide_m (p0, z4, z0)) + +/* +** lsr_wide_u16_m_untied: +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u16_m_untied, svuint16_t, svuint64_t, + z0 = svlsr_wide_u16_m (p0, z1, z4), + z0 = svlsr_wide_m (p0, z1, z4)) + +/* +** lsr_wide_x0_u16_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u16_m_tied1, svuint16_t, uint64_t, + z0 = svlsr_wide_n_u16_m (p0, z0, x0), + z0 = svlsr_wide_m (p0, z0, x0)) + +/* +** lsr_wide_x0_u16_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u16_m_untied, svuint16_t, uint64_t, + z0 = svlsr_wide_n_u16_m (p0, z1, x0), + z0 = svlsr_wide_m (p0, z1, x0)) + +/* +** lsr_wide_1_u16_m_tied1: +** lsr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u16_m_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_m (p0, z0, 1), + z0 = svlsr_wide_m (p0, z0, 1)) + +/* +** lsr_wide_1_u16_m_untied: +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u16_m_untied, svuint16_t, + z0 = svlsr_wide_n_u16_m (p0, z1, 1), + z0 = svlsr_wide_m (p0, z1, 1)) + +/* +** lsr_wide_15_u16_m_tied1: +** lsr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_15_u16_m_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_m (p0, z0, 15), + z0 = svlsr_wide_m (p0, z0, 15)) + +/* +** lsr_wide_15_u16_m_untied: +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_15_u16_m_untied, svuint16_t, + z0 = svlsr_wide_n_u16_m (p0, z1, 15), + z0 = svlsr_wide_m (p0, z1, 15)) + +/* +** lsr_wide_16_u16_m_tied1: +** lsr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_16_u16_m_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_m (p0, z0, 16), + z0 = svlsr_wide_m (p0, z0, 16)) + +/* +** lsr_wide_16_u16_m_untied: +** movprfx z0, z1 +** lsr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_16_u16_m_untied, svuint16_t, + z0 = svlsr_wide_n_u16_m (p0, z1, 16), + z0 = svlsr_wide_m (p0, z1, 16)) + +/* +** 
lsr_wide_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u16_z_tied1, svuint16_t, svuint64_t, + z0 = svlsr_wide_u16_z (p0, z0, z4), + z0 = svlsr_wide_z (p0, z0, z4)) + +/* +** lsr_wide_u16_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.h, p0/z, z4\.h +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u16_z_tied2, svuint16_t, svuint64_t, + z0_res = svlsr_wide_u16_z (p0, z4, z0), + z0_res = svlsr_wide_z (p0, z4, z0)) + +/* +** lsr_wide_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u16_z_untied, svuint16_t, svuint64_t, + z0 = svlsr_wide_u16_z (p0, z1, z4), + z0 = svlsr_wide_z (p0, z1, z4)) + +/* +** lsr_wide_x0_u16_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u16_z_tied1, svuint16_t, uint64_t, + z0 = svlsr_wide_n_u16_z (p0, z0, x0), + z0 = svlsr_wide_z (p0, z0, x0)) + +/* +** lsr_wide_x0_u16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u16_z_untied, svuint16_t, uint64_t, + z0 = svlsr_wide_n_u16_z (p0, z1, x0), + z0 = svlsr_wide_z (p0, z1, x0)) + +/* +** lsr_wide_1_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u16_z_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_z (p0, z0, 1), + z0 = svlsr_wide_z (p0, z0, 1)) + +/* +** lsr_wide_1_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u16_z_untied, svuint16_t, + z0 = svlsr_wide_n_u16_z (p0, z1, 1), + z0 = svlsr_wide_z (p0, z1, 1)) + +/* +** lsr_wide_15_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_15_u16_z_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_z (p0, z0, 15), + z0 = svlsr_wide_z (p0, z0, 15)) + +/* +** lsr_wide_15_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_15_u16_z_untied, svuint16_t, + z0 = svlsr_wide_n_u16_z (p0, z1, 15), + z0 = svlsr_wide_z (p0, z1, 15)) + +/* +** lsr_wide_16_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** lsr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_16_u16_z_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_z (p0, z0, 16), + z0 = svlsr_wide_z (p0, z0, 16)) + +/* +** lsr_wide_16_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** lsr z0\.h, p0/m, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_16_u16_z_untied, svuint16_t, + z0 = svlsr_wide_n_u16_z (p0, z1, 16), + z0 = svlsr_wide_z (p0, z1, 16)) + +/* +** lsr_wide_u16_x_tied1: +** lsr z0\.h, z0\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u16_x_tied1, svuint16_t, svuint64_t, + z0 = svlsr_wide_u16_x (p0, z0, z4), + z0 = svlsr_wide_x (p0, z0, z4)) + +/* +** lsr_wide_u16_x_tied2: +** lsr z0\.h, z4\.h, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u16_x_tied2, svuint16_t, svuint64_t, + z0_res = svlsr_wide_u16_x (p0, z4, z0), + z0_res = svlsr_wide_x (p0, z4, z0)) + +/* +** lsr_wide_u16_x_untied: +** lsr z0\.h, z1\.h, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u16_x_untied, svuint16_t, svuint64_t, + z0 = svlsr_wide_u16_x (p0, z1, z4), + z0 = svlsr_wide_x (p0, z1, z4)) + +/* +** lsr_wide_x0_u16_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsr z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u16_x_tied1, svuint16_t, uint64_t, + z0 = svlsr_wide_n_u16_x 
(p0, z0, x0), + z0 = svlsr_wide_x (p0, z0, x0)) + +/* +** lsr_wide_x0_u16_x_untied: +** mov (z[0-9]+\.d), x0 +** lsr z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u16_x_untied, svuint16_t, uint64_t, + z0 = svlsr_wide_n_u16_x (p0, z1, x0), + z0 = svlsr_wide_x (p0, z1, x0)) + +/* +** lsr_wide_1_u16_x_tied1: +** lsr z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u16_x_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_x (p0, z0, 1), + z0 = svlsr_wide_x (p0, z0, 1)) + +/* +** lsr_wide_1_u16_x_untied: +** lsr z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u16_x_untied, svuint16_t, + z0 = svlsr_wide_n_u16_x (p0, z1, 1), + z0 = svlsr_wide_x (p0, z1, 1)) + +/* +** lsr_wide_15_u16_x_tied1: +** lsr z0\.h, z0\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_15_u16_x_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_x (p0, z0, 15), + z0 = svlsr_wide_x (p0, z0, 15)) + +/* +** lsr_wide_15_u16_x_untied: +** lsr z0\.h, z1\.h, #15 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_15_u16_x_untied, svuint16_t, + z0 = svlsr_wide_n_u16_x (p0, z1, 15), + z0 = svlsr_wide_x (p0, z1, 15)) + +/* +** lsr_wide_16_u16_x_tied1: +** lsr z0\.h, z0\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_16_u16_x_tied1, svuint16_t, + z0 = svlsr_wide_n_u16_x (p0, z0, 16), + z0 = svlsr_wide_x (p0, z0, 16)) + +/* +** lsr_wide_16_u16_x_untied: +** lsr z0\.h, z1\.h, #16 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_16_u16_x_untied, svuint16_t, + z0 = svlsr_wide_n_u16_x (p0, z1, 16), + z0 = svlsr_wide_x (p0, z1, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u32.c new file mode 100644 index 000000000..73c2cf86e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u32.c @@ -0,0 +1,325 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsr_wide_u32_m_tied1: +** lsr z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u32_m_tied1, svuint32_t, svuint64_t, + z0 = svlsr_wide_u32_m (p0, z0, z4), + z0 = svlsr_wide_m (p0, z0, z4)) + +/* +** lsr_wide_u32_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u32_m_tied2, svuint32_t, svuint64_t, + z0_res = svlsr_wide_u32_m (p0, z4, z0), + z0_res = svlsr_wide_m (p0, z4, z0)) + +/* +** lsr_wide_u32_m_untied: +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u32_m_untied, svuint32_t, svuint64_t, + z0 = svlsr_wide_u32_m (p0, z1, z4), + z0 = svlsr_wide_m (p0, z1, z4)) + +/* +** lsr_wide_x0_u32_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u32_m_tied1, svuint32_t, uint64_t, + z0 = svlsr_wide_n_u32_m (p0, z0, x0), + z0 = svlsr_wide_m (p0, z0, x0)) + +/* +** lsr_wide_x0_u32_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u32_m_untied, svuint32_t, uint64_t, + z0 = svlsr_wide_n_u32_m (p0, z1, x0), + z0 = svlsr_wide_m (p0, z1, x0)) + +/* +** lsr_wide_1_u32_m_tied1: +** lsr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u32_m_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_m (p0, z0, 1), + z0 = svlsr_wide_m (p0, z0, 1)) + +/* +** lsr_wide_1_u32_m_untied: +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u32_m_untied, svuint32_t, + z0 = svlsr_wide_n_u32_m (p0, z1, 1), + z0 = svlsr_wide_m (p0, z1, 1)) + +/* +** lsr_wide_31_u32_m_tied1: +** lsr 
z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_31_u32_m_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_m (p0, z0, 31), + z0 = svlsr_wide_m (p0, z0, 31)) + +/* +** lsr_wide_31_u32_m_untied: +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_31_u32_m_untied, svuint32_t, + z0 = svlsr_wide_n_u32_m (p0, z1, 31), + z0 = svlsr_wide_m (p0, z1, 31)) + +/* +** lsr_wide_32_u32_m_tied1: +** lsr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_32_u32_m_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_m (p0, z0, 32), + z0 = svlsr_wide_m (p0, z0, 32)) + +/* +** lsr_wide_32_u32_m_untied: +** movprfx z0, z1 +** lsr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_32_u32_m_untied, svuint32_t, + z0 = svlsr_wide_n_u32_m (p0, z1, 32), + z0 = svlsr_wide_m (p0, z1, 32)) + +/* +** lsr_wide_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u32_z_tied1, svuint32_t, svuint64_t, + z0 = svlsr_wide_u32_z (p0, z0, z4), + z0 = svlsr_wide_z (p0, z0, z4)) + +/* +** lsr_wide_u32_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.s, p0/z, z4\.s +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u32_z_tied2, svuint32_t, svuint64_t, + z0_res = svlsr_wide_u32_z (p0, z4, z0), + z0_res = svlsr_wide_z (p0, z4, z0)) + +/* +** lsr_wide_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u32_z_untied, svuint32_t, svuint64_t, + z0 = svlsr_wide_u32_z (p0, z1, z4), + z0 = svlsr_wide_z (p0, z1, z4)) + +/* +** lsr_wide_x0_u32_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u32_z_tied1, svuint32_t, uint64_t, + z0 = svlsr_wide_n_u32_z (p0, z0, x0), + z0 = svlsr_wide_z (p0, z0, x0)) + +/* +** lsr_wide_x0_u32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u32_z_untied, svuint32_t, uint64_t, + z0 = svlsr_wide_n_u32_z (p0, z1, x0), + z0 = svlsr_wide_z (p0, z1, x0)) + +/* +** lsr_wide_1_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u32_z_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_z (p0, z0, 1), + z0 = svlsr_wide_z (p0, z0, 1)) + +/* +** lsr_wide_1_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u32_z_untied, svuint32_t, + z0 = svlsr_wide_n_u32_z (p0, z1, 1), + z0 = svlsr_wide_z (p0, z1, 1)) + +/* +** lsr_wide_31_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_31_u32_z_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_z (p0, z0, 31), + z0 = svlsr_wide_z (p0, z0, 31)) + +/* +** lsr_wide_31_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_31_u32_z_untied, svuint32_t, + z0 = svlsr_wide_n_u32_z (p0, z1, 31), + z0 = svlsr_wide_z (p0, z1, 31)) + +/* +** lsr_wide_32_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** lsr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_32_u32_z_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_z (p0, z0, 32), + z0 = svlsr_wide_z (p0, z0, 32)) + +/* +** lsr_wide_32_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** lsr z0\.s, p0/m, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_32_u32_z_untied, svuint32_t, + z0 = svlsr_wide_n_u32_z (p0, z1, 32), + z0 = 
svlsr_wide_z (p0, z1, 32)) + +/* +** lsr_wide_u32_x_tied1: +** lsr z0\.s, z0\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u32_x_tied1, svuint32_t, svuint64_t, + z0 = svlsr_wide_u32_x (p0, z0, z4), + z0 = svlsr_wide_x (p0, z0, z4)) + +/* +** lsr_wide_u32_x_tied2: +** lsr z0\.s, z4\.s, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u32_x_tied2, svuint32_t, svuint64_t, + z0_res = svlsr_wide_u32_x (p0, z4, z0), + z0_res = svlsr_wide_x (p0, z4, z0)) + +/* +** lsr_wide_u32_x_untied: +** lsr z0\.s, z1\.s, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u32_x_untied, svuint32_t, svuint64_t, + z0 = svlsr_wide_u32_x (p0, z1, z4), + z0 = svlsr_wide_x (p0, z1, z4)) + +/* +** lsr_wide_x0_u32_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsr z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u32_x_tied1, svuint32_t, uint64_t, + z0 = svlsr_wide_n_u32_x (p0, z0, x0), + z0 = svlsr_wide_x (p0, z0, x0)) + +/* +** lsr_wide_x0_u32_x_untied: +** mov (z[0-9]+\.d), x0 +** lsr z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u32_x_untied, svuint32_t, uint64_t, + z0 = svlsr_wide_n_u32_x (p0, z1, x0), + z0 = svlsr_wide_x (p0, z1, x0)) + +/* +** lsr_wide_1_u32_x_tied1: +** lsr z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u32_x_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_x (p0, z0, 1), + z0 = svlsr_wide_x (p0, z0, 1)) + +/* +** lsr_wide_1_u32_x_untied: +** lsr z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u32_x_untied, svuint32_t, + z0 = svlsr_wide_n_u32_x (p0, z1, 1), + z0 = svlsr_wide_x (p0, z1, 1)) + +/* +** lsr_wide_31_u32_x_tied1: +** lsr z0\.s, z0\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_31_u32_x_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_x (p0, z0, 31), + z0 = svlsr_wide_x (p0, z0, 31)) + +/* +** lsr_wide_31_u32_x_untied: +** lsr z0\.s, z1\.s, #31 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_31_u32_x_untied, svuint32_t, + z0 = svlsr_wide_n_u32_x (p0, z1, 31), + z0 = svlsr_wide_x (p0, z1, 31)) + +/* +** lsr_wide_32_u32_x_tied1: +** lsr z0\.s, z0\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_32_u32_x_tied1, svuint32_t, + z0 = svlsr_wide_n_u32_x (p0, z0, 32), + z0 = svlsr_wide_x (p0, z0, 32)) + +/* +** lsr_wide_32_u32_x_untied: +** lsr z0\.s, z1\.s, #32 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_32_u32_x_untied, svuint32_t, + z0 = svlsr_wide_n_u32_x (p0, z1, 32), + z0 = svlsr_wide_x (p0, z1, 32)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u8.c new file mode 100644 index 000000000..fe44eabda --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/lsr_wide_u8.c @@ -0,0 +1,325 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** lsr_wide_u8_m_tied1: +** lsr z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u8_m_tied1, svuint8_t, svuint64_t, + z0 = svlsr_wide_u8_m (p0, z0, z4), + z0 = svlsr_wide_m (p0, z0, z4)) + +/* +** lsr_wide_u8_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u8_m_tied2, svuint8_t, svuint64_t, + z0_res = svlsr_wide_u8_m (p0, z4, z0), + z0_res = svlsr_wide_m (p0, z4, z0)) + +/* +** lsr_wide_u8_m_untied: +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u8_m_untied, svuint8_t, svuint64_t, + z0 = svlsr_wide_u8_m (p0, z1, z4), + z0 = svlsr_wide_m (p0, z1, z4)) + +/* +** lsr_wide_x0_u8_m_tied1: +** mov (z[0-9]+\.d), x0 +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u8_m_tied1, 
svuint8_t, uint64_t, + z0 = svlsr_wide_n_u8_m (p0, z0, x0), + z0 = svlsr_wide_m (p0, z0, x0)) + +/* +** lsr_wide_x0_u8_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u8_m_untied, svuint8_t, uint64_t, + z0 = svlsr_wide_n_u8_m (p0, z1, x0), + z0 = svlsr_wide_m (p0, z1, x0)) + +/* +** lsr_wide_1_u8_m_tied1: +** lsr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u8_m_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_m (p0, z0, 1), + z0 = svlsr_wide_m (p0, z0, 1)) + +/* +** lsr_wide_1_u8_m_untied: +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u8_m_untied, svuint8_t, + z0 = svlsr_wide_n_u8_m (p0, z1, 1), + z0 = svlsr_wide_m (p0, z1, 1)) + +/* +** lsr_wide_7_u8_m_tied1: +** lsr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_7_u8_m_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_m (p0, z0, 7), + z0 = svlsr_wide_m (p0, z0, 7)) + +/* +** lsr_wide_7_u8_m_untied: +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_7_u8_m_untied, svuint8_t, + z0 = svlsr_wide_n_u8_m (p0, z1, 7), + z0 = svlsr_wide_m (p0, z1, 7)) + +/* +** lsr_wide_8_u8_m_tied1: +** lsr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_8_u8_m_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_m (p0, z0, 8), + z0 = svlsr_wide_m (p0, z0, 8)) + +/* +** lsr_wide_8_u8_m_untied: +** movprfx z0, z1 +** lsr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_8_u8_m_untied, svuint8_t, + z0 = svlsr_wide_n_u8_m (p0, z1, 8), + z0 = svlsr_wide_m (p0, z1, 8)) + +/* +** lsr_wide_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u8_z_tied1, svuint8_t, svuint64_t, + z0 = svlsr_wide_u8_z (p0, z0, z4), + z0 = svlsr_wide_z (p0, z0, z4)) + +/* +** lsr_wide_u8_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.b, p0/z, z4\.b +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u8_z_tied2, svuint8_t, svuint64_t, + z0_res = svlsr_wide_u8_z (p0, z4, z0), + z0_res = svlsr_wide_z (p0, z4, z0)) + +/* +** lsr_wide_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u8_z_untied, svuint8_t, svuint64_t, + z0 = svlsr_wide_u8_z (p0, z1, z4), + z0 = svlsr_wide_z (p0, z1, z4)) + +/* +** lsr_wide_x0_u8_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u8_z_tied1, svuint8_t, uint64_t, + z0 = svlsr_wide_n_u8_z (p0, z0, x0), + z0 = svlsr_wide_z (p0, z0, x0)) + +/* +** lsr_wide_x0_u8_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u8_z_untied, svuint8_t, uint64_t, + z0 = svlsr_wide_n_u8_z (p0, z1, x0), + z0 = svlsr_wide_z (p0, z1, x0)) + +/* +** lsr_wide_1_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u8_z_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_z (p0, z0, 1), + z0 = svlsr_wide_z (p0, z0, 1)) + +/* +** lsr_wide_1_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u8_z_untied, svuint8_t, + z0 = svlsr_wide_n_u8_z (p0, z1, 1), + z0 = svlsr_wide_z (p0, z1, 1)) + +/* +** lsr_wide_7_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_7_u8_z_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_z 
(p0, z0, 7), + z0 = svlsr_wide_z (p0, z0, 7)) + +/* +** lsr_wide_7_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_7_u8_z_untied, svuint8_t, + z0 = svlsr_wide_n_u8_z (p0, z1, 7), + z0 = svlsr_wide_z (p0, z1, 7)) + +/* +** lsr_wide_8_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** lsr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_8_u8_z_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_z (p0, z0, 8), + z0 = svlsr_wide_z (p0, z0, 8)) + +/* +** lsr_wide_8_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** lsr z0\.b, p0/m, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_8_u8_z_untied, svuint8_t, + z0 = svlsr_wide_n_u8_z (p0, z1, 8), + z0 = svlsr_wide_z (p0, z1, 8)) + +/* +** lsr_wide_u8_x_tied1: +** lsr z0\.b, z0\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u8_x_tied1, svuint8_t, svuint64_t, + z0 = svlsr_wide_u8_x (p0, z0, z4), + z0 = svlsr_wide_x (p0, z0, z4)) + +/* +** lsr_wide_u8_x_tied2: +** lsr z0\.b, z4\.b, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (lsr_wide_u8_x_tied2, svuint8_t, svuint64_t, + z0_res = svlsr_wide_u8_x (p0, z4, z0), + z0_res = svlsr_wide_x (p0, z4, z0)) + +/* +** lsr_wide_u8_x_untied: +** lsr z0\.b, z1\.b, z4\.d +** ret +*/ +TEST_DUAL_Z (lsr_wide_u8_x_untied, svuint8_t, svuint64_t, + z0 = svlsr_wide_u8_x (p0, z1, z4), + z0 = svlsr_wide_x (p0, z1, z4)) + +/* +** lsr_wide_x0_u8_x_tied1: +** mov (z[0-9]+\.d), x0 +** lsr z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u8_x_tied1, svuint8_t, uint64_t, + z0 = svlsr_wide_n_u8_x (p0, z0, x0), + z0 = svlsr_wide_x (p0, z0, x0)) + +/* +** lsr_wide_x0_u8_x_untied: +** mov (z[0-9]+\.d), x0 +** lsr z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (lsr_wide_x0_u8_x_untied, svuint8_t, uint64_t, + z0 = svlsr_wide_n_u8_x (p0, z1, x0), + z0 = svlsr_wide_x (p0, z1, x0)) + +/* +** lsr_wide_1_u8_x_tied1: +** lsr z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u8_x_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_x (p0, z0, 1), + z0 = svlsr_wide_x (p0, z0, 1)) + +/* +** lsr_wide_1_u8_x_untied: +** lsr z0\.b, z1\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_1_u8_x_untied, svuint8_t, + z0 = svlsr_wide_n_u8_x (p0, z1, 1), + z0 = svlsr_wide_x (p0, z1, 1)) + +/* +** lsr_wide_7_u8_x_tied1: +** lsr z0\.b, z0\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_7_u8_x_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_x (p0, z0, 7), + z0 = svlsr_wide_x (p0, z0, 7)) + +/* +** lsr_wide_7_u8_x_untied: +** lsr z0\.b, z1\.b, #7 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_7_u8_x_untied, svuint8_t, + z0 = svlsr_wide_n_u8_x (p0, z1, 7), + z0 = svlsr_wide_x (p0, z1, 7)) + +/* +** lsr_wide_8_u8_x_tied1: +** lsr z0\.b, z0\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_8_u8_x_tied1, svuint8_t, + z0 = svlsr_wide_n_u8_x (p0, z0, 8), + z0 = svlsr_wide_x (p0, z0, 8)) + +/* +** lsr_wide_8_u8_x_untied: +** lsr z0\.b, z1\.b, #8 +** ret +*/ +TEST_UNIFORM_Z (lsr_wide_8_u8_x_untied, svuint8_t, + z0 = svlsr_wide_n_u8_x (p0, z1, 8), + z0 = svlsr_wide_x (p0, z1, 8)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f16.c new file mode 100644 index 000000000..7656f9e54 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f16.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_f16_m_tied1: +** fmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_m_tied1, svfloat16_t, + z0 = svmad_f16_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** 
mad_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmad z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_m_tied2, svfloat16_t, + z0 = svmad_f16_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmad z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_m_tied3, svfloat16_t, + z0 = svmad_f16_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_f16_m_untied: +** movprfx z0, z1 +** fmad z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_m_untied, svfloat16_t, + z0 = svmad_f16_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmad_n_f16_m (p0, z0, z1, d4), + z0 = svmad_m (p0, z0, z1, d4)) + +/* +** mad_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fmad z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmad_n_f16_m (p0, z1, z2, d4), + z0 = svmad_m (p0, z1, z2, d4)) + +/* +** mad_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f16_m_tied1, svfloat16_t, + z0 = svmad_n_f16_m (p0, z0, z1, 2), + z0 = svmad_m (p0, z0, z1, 2)) + +/* +** mad_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0, z1 +** fmad z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f16_m_untied, svfloat16_t, + z0 = svmad_n_f16_m (p0, z1, z2, 2), + z0 = svmad_m (p0, z1, z2, 2)) + +/* +** mad_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_z_tied1, svfloat16_t, + z0 = svmad_f16_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_z_tied2, svfloat16_t, + z0 = svmad_f16_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_f16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_z_tied3, svfloat16_t, + z0 = svmad_f16_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmad z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmad z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_f16_z_untied, svfloat16_t, + z0 = svmad_f16_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmad_n_f16_z (p0, z0, z1, d4), + z0 = svmad_z (p0, z0, z1, d4)) + +/* +** mad_h4_f16_z_tied2: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_h4_f16_z_tied2, svfloat16_t, __fp16, + z0 = svmad_n_f16_z (p0, z1, z0, d4), + z0 = svmad_z (p0, z1, z0, d4)) + +/* +** mad_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmad z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fmad z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (mad_h4_f16_z_untied, 
svfloat16_t, __fp16, + z0 = svmad_n_f16_z (p0, z1, z2, d4), + z0 = svmad_z (p0, z1, z2, d4)) + +/* +** mad_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f16_z_tied1, svfloat16_t, + z0 = svmad_n_f16_z (p0, z0, z1, 2), + z0 = svmad_z (p0, z0, z1, 2)) + +/* +** mad_2_f16_z_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f16_z_tied2, svfloat16_t, + z0 = svmad_n_f16_z (p0, z1, z0, 2), + z0 = svmad_z (p0, z1, z0, 2)) + +/* +** mad_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmad z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fmad z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_2_f16_z_untied, svfloat16_t, + z0 = svmad_n_f16_z (p0, z1, z2, 2), + z0 = svmad_z (p0, z1, z2, 2)) + +/* +** mad_f16_x_tied1: +** fmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_x_tied1, svfloat16_t, + z0 = svmad_f16_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_f16_x_tied2: +** fmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_x_tied2, svfloat16_t, + z0 = svmad_f16_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_f16_x_tied3: +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_f16_x_tied3, svfloat16_t, + z0 = svmad_f16_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_f16_x_untied: +** ( +** movprfx z0, z1 +** fmad z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** fmad z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0, z3 +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_f16_x_untied, svfloat16_t, + z0 = svmad_f16_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmad_n_f16_x (p0, z0, z1, d4), + z0 = svmad_x (p0, z0, z1, d4)) + +/* +** mad_h4_f16_x_tied2: +** mov (z[0-9]+\.h), h4 +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_h4_f16_x_tied2, svfloat16_t, __fp16, + z0 = svmad_n_f16_x (p0, z1, z0, d4), + z0 = svmad_x (p0, z1, z0, d4)) + +/* +** mad_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_ZD (mad_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmad_n_f16_x (p0, z1, z2, d4), + z0 = svmad_x (p0, z1, z2, d4)) + +/* +** mad_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f16_x_tied1, svfloat16_t, + z0 = svmad_n_f16_x (p0, z0, z1, 2), + z0 = svmad_x (p0, z0, z1, 2)) + +/* +** mad_2_f16_x_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f16_x_tied2, svfloat16_t, + z0 = svmad_n_f16_x (p0, z1, z0, 2), + z0 = svmad_x (p0, z1, z0, 2)) + +/* +** mad_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_2_f16_x_untied, svfloat16_t, + z0 = svmad_n_f16_x (p0, z1, z2, 2), + z0 = svmad_x (p0, z1, z2, 2)) + +/* +** ptrue_mad_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f16_x_tied1, svfloat16_t, + z0 = svmad_f16_x (svptrue_b16 (), z0, z1, z2), + z0 = svmad_x (svptrue_b16 (), z0, z1, z2)) + +/* +** ptrue_mad_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f16_x_tied2, svfloat16_t, + z0 = svmad_f16_x (svptrue_b16 (), z1, z0, z2), + z0 = svmad_x (svptrue_b16 (), z1, z0, z2)) + +/* +** ptrue_mad_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f16_x_tied3, svfloat16_t, + z0 = svmad_f16_x (svptrue_b16 (), z1, z2, z0), + z0 = svmad_x (svptrue_b16 (), z1, z2, z0)) + +/* +** ptrue_mad_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f16_x_untied, svfloat16_t, + z0 = svmad_f16_x (svptrue_b16 (), z1, z2, z3), + z0 = svmad_x (svptrue_b16 (), z1, z2, z3)) + +/* +** ptrue_mad_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f16_x_tied1, svfloat16_t, + z0 = svmad_n_f16_x (svptrue_b16 (), z0, z1, 2), + z0 = svmad_x (svptrue_b16 (), z0, z1, 2)) + +/* +** ptrue_mad_2_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f16_x_tied2, svfloat16_t, + z0 = svmad_n_f16_x (svptrue_b16 (), z1, z0, 2), + z0 = svmad_x (svptrue_b16 (), z1, z0, 2)) + +/* +** ptrue_mad_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f16_x_untied, svfloat16_t, + z0 = svmad_n_f16_x (svptrue_b16 (), z1, z2, 2), + z0 = svmad_x (svptrue_b16 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f32.c new file mode 100644 index 000000000..dbdd2b9d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f32.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_f32_m_tied1: +** fmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_m_tied1, svfloat32_t, + z0 = svmad_f32_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmad z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_m_tied2, svfloat32_t, + z0 = svmad_f32_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmad z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_m_tied3, svfloat32_t, + z0 = svmad_f32_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_f32_m_untied: +** movprfx z0, z1 +** fmad z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_m_untied, svfloat32_t, + z0 = svmad_f32_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmad_n_f32_m (p0, z0, z1, d4), + z0 = svmad_m (p0, z0, z1, d4)) + +/* +** mad_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmad z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_s4_f32_m_untied, svfloat32_t, float, + z0 = svmad_n_f32_m (p0, z1, z2, d4), + z0 = svmad_m (p0, z1, z2, d4)) + +/* +** mad_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f32_m_tied1, svfloat32_t, + z0 = svmad_n_f32_m (p0, z0, z1, 2), + z0 = svmad_m (p0, z0, z1, 2)) + +/* +** mad_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0, z1 +** fmad z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f32_m_untied, svfloat32_t, + z0 = svmad_n_f32_m (p0, z1, z2, 2), + z0 = svmad_m (p0, z1, z2, 2)) + +/* +** mad_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_z_tied1, svfloat32_t, + z0 = svmad_f32_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_z_tied2, svfloat32_t, + z0 = svmad_f32_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_f32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_z_tied3, svfloat32_t, + z0 = svmad_f32_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmad z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmad z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_f32_z_untied, svfloat32_t, + z0 = svmad_f32_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmad_n_f32_z (p0, z0, z1, d4), + z0 = svmad_z (p0, z0, z1, d4)) + +/* +** mad_s4_f32_z_tied2: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_s4_f32_z_tied2, svfloat32_t, float, + z0 = svmad_n_f32_z (p0, z1, z0, d4), + z0 = svmad_z (p0, z1, z0, d4)) + +/* +** mad_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmad z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fmad z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (mad_s4_f32_z_untied, svfloat32_t, float, + z0 = svmad_n_f32_z (p0, z1, z2, d4), + z0 = svmad_z (p0, z1, z2, d4)) + +/* +** mad_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f32_z_tied1, svfloat32_t, + z0 = svmad_n_f32_z (p0, z0, z1, 2), + z0 = svmad_z (p0, z0, z1, 2)) + +/* +** mad_2_f32_z_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f32_z_tied2, svfloat32_t, + z0 = svmad_n_f32_z (p0, z1, z0, 2), + z0 = svmad_z (p0, z1, z0, 2)) + +/* +** mad_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fmad z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fmad z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_2_f32_z_untied, svfloat32_t, + z0 = svmad_n_f32_z (p0, z1, z2, 2), + z0 = svmad_z (p0, z1, z2, 2)) + +/* +** mad_f32_x_tied1: +** fmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_x_tied1, svfloat32_t, + z0 = svmad_f32_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_f32_x_tied2: +** fmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_x_tied2, svfloat32_t, + z0 = svmad_f32_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_f32_x_tied3: +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_f32_x_tied3, svfloat32_t, + z0 = svmad_f32_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_f32_x_untied: +** ( +** movprfx z0, z1 +** fmad z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** fmad z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0, z3 +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_f32_x_untied, svfloat32_t, + z0 = svmad_f32_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmad_n_f32_x (p0, z0, z1, d4), + z0 = svmad_x (p0, z0, z1, d4)) + +/* +** mad_s4_f32_x_tied2: +** mov (z[0-9]+\.s), s4 +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_s4_f32_x_tied2, svfloat32_t, float, + z0 = svmad_n_f32_x (p0, z1, z0, d4), + z0 = svmad_x (p0, z1, z0, d4)) + +/* +** mad_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_ZD (mad_s4_f32_x_untied, svfloat32_t, float, + z0 = svmad_n_f32_x (p0, z1, z2, d4), + z0 = svmad_x (p0, z1, z2, d4)) + +/* +** mad_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f32_x_tied1, svfloat32_t, + z0 = svmad_n_f32_x (p0, z0, z1, 2), + z0 = svmad_x (p0, z0, z1, 2)) + +/* +** mad_2_f32_x_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f32_x_tied2, svfloat32_t, + z0 = svmad_n_f32_x (p0, z1, z0, 2), + z0 = svmad_x (p0, z1, z0, 2)) + +/* +** mad_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_2_f32_x_untied, svfloat32_t, + z0 = svmad_n_f32_x (p0, z1, z2, 2), + z0 = svmad_x (p0, z1, z2, 2)) + +/* +** ptrue_mad_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f32_x_tied1, svfloat32_t, + z0 = svmad_f32_x (svptrue_b32 (), z0, z1, z2), + z0 = svmad_x (svptrue_b32 (), z0, z1, z2)) + +/* +** ptrue_mad_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f32_x_tied2, svfloat32_t, + z0 = svmad_f32_x (svptrue_b32 (), z1, z0, z2), + z0 = svmad_x (svptrue_b32 (), z1, z0, z2)) + +/* +** ptrue_mad_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f32_x_tied3, svfloat32_t, + z0 = svmad_f32_x (svptrue_b32 (), z1, z2, z0), + z0 = svmad_x (svptrue_b32 (), z1, z2, z0)) + +/* +** ptrue_mad_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f32_x_untied, svfloat32_t, + z0 = svmad_f32_x (svptrue_b32 (), z1, z2, z3), + z0 = svmad_x (svptrue_b32 (), z1, z2, z3)) + +/* +** ptrue_mad_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f32_x_tied1, svfloat32_t, + z0 = svmad_n_f32_x (svptrue_b32 (), z0, z1, 2), + z0 = svmad_x (svptrue_b32 (), z0, z1, 2)) + +/* +** ptrue_mad_2_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f32_x_tied2, svfloat32_t, + z0 = svmad_n_f32_x (svptrue_b32 (), z1, z0, 2), + z0 = svmad_x (svptrue_b32 (), z1, z0, 2)) + +/* +** ptrue_mad_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f32_x_untied, svfloat32_t, + z0 = svmad_n_f32_x (svptrue_b32 (), z1, z2, 2), + z0 = svmad_x (svptrue_b32 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f64.c new file mode 100644 index 000000000..978281295 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_f64.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_f64_m_tied1: +** fmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_m_tied1, svfloat64_t, + z0 = svmad_f64_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmad z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_m_tied2, svfloat64_t, + z0 = svmad_f64_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_f64_m_tied3, svfloat64_t, + z0 = svmad_f64_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_f64_m_untied: +** movprfx z0, z1 +** fmad z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_m_untied, svfloat64_t, + z0 = svmad_f64_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmad_n_f64_m (p0, z0, z1, d4), + z0 = svmad_m (p0, z0, z1, d4)) + +/* +** mad_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_d4_f64_m_untied, svfloat64_t, double, + z0 = svmad_n_f64_m (p0, z1, z2, d4), + z0 = svmad_m (p0, z1, z2, d4)) + +/* +** mad_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f64_m_tied1, svfloat64_t, + z0 = svmad_n_f64_m (p0, z0, z1, 2), + z0 = svmad_m (p0, z0, z1, 2)) + +/* +** mad_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f64_m_untied, svfloat64_t, + z0 = svmad_n_f64_m (p0, z1, z2, 2), + z0 = svmad_m (p0, z1, z2, 2)) + +/* +** mad_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_z_tied1, svfloat64_t, + z0 = svmad_f64_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_z_tied2, svfloat64_t, + z0 = svmad_f64_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_f64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_z_tied3, svfloat64_t, + z0 = svmad_f64_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmad z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmad z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_f64_z_untied, svfloat64_t, + z0 = svmad_f64_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmad_n_f64_z (p0, z0, z1, d4), + z0 = svmad_z (p0, z0, z1, d4)) + +/* +** mad_d4_f64_z_tied2: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_d4_f64_z_tied2, svfloat64_t, double, + z0 = svmad_n_f64_z (p0, z1, z0, d4), + z0 = svmad_z (p0, z1, z0, d4)) + +/* +** mad_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmad z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fmad z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (mad_d4_f64_z_untied, svfloat64_t, double, + z0 = svmad_n_f64_z (p0, z1, z2, d4), + z0 = svmad_z (p0, z1, z2, d4)) + +/* +** mad_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f64_z_tied1, svfloat64_t, + z0 = svmad_n_f64_z (p0, z0, z1, 2), + z0 = svmad_z (p0, z0, z1, 2)) + +/* +** mad_2_f64_z_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f64_z_tied2, svfloat64_t, + z0 = svmad_n_f64_z (p0, z1, z0, 2), + z0 = svmad_z (p0, z1, z0, 2)) + +/* +** mad_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fmad z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fmad z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_2_f64_z_untied, svfloat64_t, + z0 = svmad_n_f64_z (p0, z1, z2, 2), + z0 = svmad_z (p0, z1, z2, 2)) + +/* +** mad_f64_x_tied1: +** fmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_x_tied1, svfloat64_t, + z0 = svmad_f64_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_f64_x_tied2: +** fmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_x_tied2, svfloat64_t, + z0 = svmad_f64_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_f64_x_tied3: +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_f64_x_tied3, svfloat64_t, + z0 = svmad_f64_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_f64_x_untied: +** ( +** movprfx z0, z1 +** fmad z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** fmad z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0, z3 +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_f64_x_untied, svfloat64_t, + z0 = svmad_f64_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmad_n_f64_x (p0, z0, z1, d4), + z0 = svmad_x (p0, z0, z1, d4)) + +/* +** mad_d4_f64_x_tied2: +** mov (z[0-9]+\.d), d4 +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mad_d4_f64_x_tied2, svfloat64_t, double, + z0 = svmad_n_f64_x (p0, z1, z0, d4), + z0 = svmad_x (p0, z1, z0, d4)) + +/* +** mad_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_ZD (mad_d4_f64_x_untied, svfloat64_t, double, + z0 = svmad_n_f64_x (p0, z1, z2, d4), + z0 = svmad_x (p0, z1, z2, d4)) + +/* +** mad_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f64_x_tied1, svfloat64_t, + z0 = svmad_n_f64_x (p0, z0, z1, 2), + z0 = svmad_x (p0, z0, z1, 2)) + +/* +** mad_2_f64_x_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_2_f64_x_tied2, svfloat64_t, + z0 = svmad_n_f64_x (p0, z1, z0, 2), + z0 = svmad_x (p0, z1, z0, 2)) + +/* +** mad_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_2_f64_x_untied, svfloat64_t, + z0 = svmad_n_f64_x (p0, z1, z2, 2), + z0 = svmad_x (p0, z1, z2, 2)) + +/* +** ptrue_mad_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f64_x_tied1, svfloat64_t, + z0 = svmad_f64_x (svptrue_b64 (), z0, z1, z2), + z0 = svmad_x (svptrue_b64 (), z0, z1, z2)) + +/* +** ptrue_mad_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f64_x_tied2, svfloat64_t, + z0 = svmad_f64_x (svptrue_b64 (), z1, z0, z2), + z0 = svmad_x (svptrue_b64 (), z1, z0, z2)) + +/* +** ptrue_mad_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f64_x_tied3, svfloat64_t, + z0 = svmad_f64_x (svptrue_b64 (), z1, z2, z0), + z0 = svmad_x (svptrue_b64 (), z1, z2, z0)) + +/* +** ptrue_mad_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_f64_x_untied, svfloat64_t, + z0 = svmad_f64_x (svptrue_b64 (), z1, z2, z3), + z0 = svmad_x (svptrue_b64 (), z1, z2, z3)) + +/* +** ptrue_mad_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f64_x_tied1, svfloat64_t, + z0 = svmad_n_f64_x (svptrue_b64 (), z0, z1, 2), + z0 = svmad_x (svptrue_b64 (), z0, z1, 2)) + +/* +** ptrue_mad_2_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f64_x_tied2, svfloat64_t, + z0 = svmad_n_f64_x (svptrue_b64 (), z1, z0, 2), + z0 = svmad_x (svptrue_b64 (), z1, z0, 2)) + +/* +** ptrue_mad_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mad_2_f64_x_untied, svfloat64_t, + z0 = svmad_n_f64_x (svptrue_b64 (), z1, z2, 2), + z0 = svmad_x (svptrue_b64 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s16.c new file mode 100644 index 000000000..02a6d4588 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s16.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_s16_m_tied1: +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_m_tied1, svint16_t, + z0 = svmad_s16_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_m_tied2, svint16_t, + z0 = svmad_s16_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_s16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_m_tied3, svint16_t, + z0 = svmad_s16_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_s16_m_untied: +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_m_untied, svint16_t, + z0 = svmad_s16_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svmad_n_s16_m (p0, z0, z1, x0), + z0 = svmad_m (p0, z0, z1, x0)) + +/* +** mad_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s16_m_untied, svint16_t, int16_t, + z0 = svmad_n_s16_m (p0, z1, z2, x0), + z0 = svmad_m (p0, z1, z2, x0)) + +/* +** mad_11_s16_m_tied1: +** mov (z[0-9]+\.h), #11 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s16_m_tied1, svint16_t, + z0 = svmad_n_s16_m (p0, z0, z1, 11), + z0 = svmad_m (p0, z0, z1, 11)) + +/* +** mad_11_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s16_m_untied, svint16_t, + z0 = svmad_n_s16_m (p0, z1, z2, 11), + z0 = svmad_m (p0, z1, z2, 11)) + +/* +** mad_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_z_tied1, svint16_t, + z0 = svmad_s16_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_z_tied2, svint16_t, + z0 = svmad_s16_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_s16_z_tied3: +** 
movprfx z0\.h, p0/z, z0\.h +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_z_tied3, svint16_t, + z0 = svmad_s16_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** mad z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** mla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_s16_z_untied, svint16_t, + z0 = svmad_s16_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svmad_n_s16_z (p0, z0, z1, x0), + z0 = svmad_z (p0, z0, z1, x0)) + +/* +** mad_w0_s16_z_tied2: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s16_z_tied2, svint16_t, int16_t, + z0 = svmad_n_s16_z (p0, z1, z0, x0), + z0 = svmad_z (p0, z1, z0, x0)) + +/* +** mad_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mad z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s16_z_untied, svint16_t, int16_t, + z0 = svmad_n_s16_z (p0, z1, z2, x0), + z0 = svmad_z (p0, z1, z2, x0)) + +/* +** mad_11_s16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s16_z_tied1, svint16_t, + z0 = svmad_n_s16_z (p0, z0, z1, 11), + z0 = svmad_z (p0, z0, z1, 11)) + +/* +** mad_11_s16_z_tied2: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s16_z_tied2, svint16_t, + z0 = svmad_n_s16_z (p0, z1, z0, 11), + z0 = svmad_z (p0, z1, z0, 11)) + +/* +** mad_11_s16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mad z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_11_s16_z_untied, svint16_t, + z0 = svmad_n_s16_z (p0, z1, z2, 11), + z0 = svmad_z (p0, z1, z2, 11)) + +/* +** mad_s16_x_tied1: +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_x_tied1, svint16_t, + z0 = svmad_s16_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_s16_x_tied2: +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_x_tied2, svint16_t, + z0 = svmad_s16_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_s16_x_tied3: +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_s16_x_tied3, svint16_t, + z0 = svmad_s16_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_s16_x_untied: +** ( +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** mad z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0, z3 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_s16_x_untied, svint16_t, + z0 = svmad_s16_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svmad_n_s16_x (p0, z0, z1, x0), + z0 = svmad_x (p0, z0, z1, x0)) + +/* +** mad_w0_s16_x_tied2: +** 
mov (z[0-9]+\.h), w0 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s16_x_tied2, svint16_t, int16_t, + z0 = svmad_n_s16_x (p0, z1, z0, x0), + z0 = svmad_x (p0, z1, z0, x0)) + +/* +** mad_w0_s16_x_untied: +** mov z0\.h, w0 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s16_x_untied, svint16_t, int16_t, + z0 = svmad_n_s16_x (p0, z1, z2, x0), + z0 = svmad_x (p0, z1, z2, x0)) + +/* +** mad_11_s16_x_tied1: +** mov (z[0-9]+\.h), #11 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s16_x_tied1, svint16_t, + z0 = svmad_n_s16_x (p0, z0, z1, 11), + z0 = svmad_x (p0, z0, z1, 11)) + +/* +** mad_11_s16_x_tied2: +** mov (z[0-9]+\.h), #11 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s16_x_tied2, svint16_t, + z0 = svmad_n_s16_x (p0, z1, z0, 11), + z0 = svmad_x (p0, z1, z0, 11)) + +/* +** mad_11_s16_x_untied: +** mov z0\.h, #11 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_11_s16_x_untied, svint16_t, + z0 = svmad_n_s16_x (p0, z1, z2, 11), + z0 = svmad_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s32.c new file mode 100644 index 000000000..d676a0c11 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s32.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_s32_m_tied1: +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_m_tied1, svint32_t, + z0 = svmad_s32_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_m_tied2, svint32_t, + z0 = svmad_s32_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_s32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_m_tied3, svint32_t, + z0 = svmad_s32_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_s32_m_untied: +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_m_untied, svint32_t, + z0 = svmad_s32_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svmad_n_s32_m (p0, z0, z1, x0), + z0 = svmad_m (p0, z0, z1, x0)) + +/* +** mad_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s32_m_untied, svint32_t, int32_t, + z0 = svmad_n_s32_m (p0, z1, z2, x0), + z0 = svmad_m (p0, z1, z2, x0)) + +/* +** mad_11_s32_m_tied1: +** mov (z[0-9]+\.s), #11 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s32_m_tied1, svint32_t, + z0 = svmad_n_s32_m (p0, z0, z1, 11), + z0 = svmad_m (p0, z0, z1, 11)) + +/* +** mad_11_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s32_m_untied, svint32_t, + z0 = svmad_n_s32_m (p0, z1, z2, 11), + z0 = svmad_m (p0, z1, z2, 11)) + +/* +** mad_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_z_tied1, svint32_t, + z0 = svmad_s32_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, 
p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_z_tied2, svint32_t, + z0 = svmad_s32_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_s32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_z_tied3, svint32_t, + z0 = svmad_s32_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** mad z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** mla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_s32_z_untied, svint32_t, + z0 = svmad_s32_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svmad_n_s32_z (p0, z0, z1, x0), + z0 = svmad_z (p0, z0, z1, x0)) + +/* +** mad_w0_s32_z_tied2: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s32_z_tied2, svint32_t, int32_t, + z0 = svmad_n_s32_z (p0, z1, z0, x0), + z0 = svmad_z (p0, z1, z0, x0)) + +/* +** mad_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mad z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s32_z_untied, svint32_t, int32_t, + z0 = svmad_n_s32_z (p0, z1, z2, x0), + z0 = svmad_z (p0, z1, z2, x0)) + +/* +** mad_11_s32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s32_z_tied1, svint32_t, + z0 = svmad_n_s32_z (p0, z0, z1, 11), + z0 = svmad_z (p0, z0, z1, 11)) + +/* +** mad_11_s32_z_tied2: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s32_z_tied2, svint32_t, + z0 = svmad_n_s32_z (p0, z1, z0, 11), + z0 = svmad_z (p0, z1, z0, 11)) + +/* +** mad_11_s32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mad z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_11_s32_z_untied, svint32_t, + z0 = svmad_n_s32_z (p0, z1, z2, 11), + z0 = svmad_z (p0, z1, z2, 11)) + +/* +** mad_s32_x_tied1: +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_x_tied1, svint32_t, + z0 = svmad_s32_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_s32_x_tied2: +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_x_tied2, svint32_t, + z0 = svmad_s32_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_s32_x_tied3: +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_s32_x_tied3, svint32_t, + z0 = svmad_s32_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_s32_x_untied: +** ( +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** mad z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0, z3 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_s32_x_untied, svint32_t, + z0 = svmad_s32_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** mad z0\.s, p0/m, z1\.s, 
\1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svmad_n_s32_x (p0, z0, z1, x0), + z0 = svmad_x (p0, z0, z1, x0)) + +/* +** mad_w0_s32_x_tied2: +** mov (z[0-9]+\.s), w0 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s32_x_tied2, svint32_t, int32_t, + z0 = svmad_n_s32_x (p0, z1, z0, x0), + z0 = svmad_x (p0, z1, z0, x0)) + +/* +** mad_w0_s32_x_untied: +** mov z0\.s, w0 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s32_x_untied, svint32_t, int32_t, + z0 = svmad_n_s32_x (p0, z1, z2, x0), + z0 = svmad_x (p0, z1, z2, x0)) + +/* +** mad_11_s32_x_tied1: +** mov (z[0-9]+\.s), #11 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s32_x_tied1, svint32_t, + z0 = svmad_n_s32_x (p0, z0, z1, 11), + z0 = svmad_x (p0, z0, z1, 11)) + +/* +** mad_11_s32_x_tied2: +** mov (z[0-9]+\.s), #11 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s32_x_tied2, svint32_t, + z0 = svmad_n_s32_x (p0, z1, z0, 11), + z0 = svmad_x (p0, z1, z0, 11)) + +/* +** mad_11_s32_x_untied: +** mov z0\.s, #11 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_11_s32_x_untied, svint32_t, + z0 = svmad_n_s32_x (p0, z1, z2, 11), + z0 = svmad_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s64.c new file mode 100644 index 000000000..7aa017536 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s64.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_s64_m_tied1: +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_s64_m_tied1, svint64_t, + z0 = svmad_s64_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mad z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_s64_m_tied2, svint64_t, + z0 = svmad_s64_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_s64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_s64_m_tied3, svint64_t, + z0 = svmad_s64_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_s64_m_untied: +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mad_s64_m_untied, svint64_t, + z0 = svmad_s64_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svmad_n_s64_m (p0, z0, z1, x0), + z0 = svmad_m (p0, z0, z1, x0)) + +/* +** mad_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_s64_m_untied, svint64_t, int64_t, + z0 = svmad_n_s64_m (p0, z1, z2, x0), + z0 = svmad_m (p0, z1, z2, x0)) + +/* +** mad_11_s64_m_tied1: +** mov (z[0-9]+\.d), #11 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s64_m_tied1, svint64_t, + z0 = svmad_n_s64_m (p0, z0, z1, 11), + z0 = svmad_m (p0, z0, z1, 11)) + +/* +** mad_11_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #11 +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s64_m_untied, svint64_t, + z0 = svmad_n_s64_m (p0, z1, z2, 11), + z0 = svmad_m (p0, z1, z2, 11)) + +/* +** mad_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z 
(mad_s64_z_tied1, svint64_t, + z0 = svmad_s64_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_s64_z_tied2, svint64_t, + z0 = svmad_s64_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_s64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_s64_z_tied3, svint64_t, + z0 = svmad_s64_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** mad z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** mla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_s64_z_untied, svint64_t, + z0 = svmad_s64_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svmad_n_s64_z (p0, z0, z1, x0), + z0 = svmad_z (p0, z0, z1, x0)) + +/* +** mad_x0_s64_z_tied2: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_s64_z_tied2, svint64_t, int64_t, + z0 = svmad_n_s64_z (p0, z1, z0, x0), + z0 = svmad_z (p0, z1, z0, x0)) + +/* +** mad_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mad z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_s64_z_untied, svint64_t, int64_t, + z0 = svmad_n_s64_z (p0, z1, z2, x0), + z0 = svmad_z (p0, z1, z2, x0)) + +/* +** mad_11_s64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s64_z_tied1, svint64_t, + z0 = svmad_n_s64_z (p0, z0, z1, 11), + z0 = svmad_z (p0, z0, z1, 11)) + +/* +** mad_11_s64_z_tied2: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s64_z_tied2, svint64_t, + z0 = svmad_n_s64_z (p0, z1, z0, 11), + z0 = svmad_z (p0, z1, z0, 11)) + +/* +** mad_11_s64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mad z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_11_s64_z_untied, svint64_t, + z0 = svmad_n_s64_z (p0, z1, z2, 11), + z0 = svmad_z (p0, z1, z2, 11)) + +/* +** mad_s64_x_tied1: +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_s64_x_tied1, svint64_t, + z0 = svmad_s64_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_s64_x_tied2: +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_s64_x_tied2, svint64_t, + z0 = svmad_s64_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_s64_x_tied3: +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_s64_x_tied3, svint64_t, + z0 = svmad_s64_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_s64_x_untied: +** ( +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** mad z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0, z3 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z 
(mad_s64_x_untied, svint64_t, + z0 = svmad_s64_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svmad_n_s64_x (p0, z0, z1, x0), + z0 = svmad_x (p0, z0, z1, x0)) + +/* +** mad_x0_s64_x_tied2: +** mov (z[0-9]+\.d), x0 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_s64_x_tied2, svint64_t, int64_t, + z0 = svmad_n_s64_x (p0, z1, z0, x0), + z0 = svmad_x (p0, z1, z0, x0)) + +/* +** mad_x0_s64_x_untied: +** mov z0\.d, x0 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_s64_x_untied, svint64_t, int64_t, + z0 = svmad_n_s64_x (p0, z1, z2, x0), + z0 = svmad_x (p0, z1, z2, x0)) + +/* +** mad_11_s64_x_tied1: +** mov (z[0-9]+\.d), #11 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s64_x_tied1, svint64_t, + z0 = svmad_n_s64_x (p0, z0, z1, 11), + z0 = svmad_x (p0, z0, z1, 11)) + +/* +** mad_11_s64_x_tied2: +** mov (z[0-9]+\.d), #11 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s64_x_tied2, svint64_t, + z0 = svmad_n_s64_x (p0, z1, z0, 11), + z0 = svmad_x (p0, z1, z0, 11)) + +/* +** mad_11_s64_x_untied: +** mov z0\.d, #11 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_11_s64_x_untied, svint64_t, + z0 = svmad_n_s64_x (p0, z1, z2, 11), + z0 = svmad_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s8.c new file mode 100644 index 000000000..90d712686 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_s8.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_s8_m_tied1: +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_m_tied1, svint8_t, + z0 = svmad_s8_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.b, p0/m, \1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_m_tied2, svint8_t, + z0 = svmad_s8_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_s8_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_m_tied3, svint8_t, + z0 = svmad_s8_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_s8_m_untied: +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, z3\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_m_untied, svint8_t, + z0 = svmad_s8_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svmad_n_s8_m (p0, z0, z1, x0), + z0 = svmad_m (p0, z0, z1, x0)) + +/* +** mad_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s8_m_untied, svint8_t, int8_t, + z0 = svmad_n_s8_m (p0, z1, z2, x0), + z0 = svmad_m (p0, z1, z2, x0)) + +/* +** mad_11_s8_m_tied1: +** mov (z[0-9]+\.b), #11 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s8_m_tied1, svint8_t, + z0 = svmad_n_s8_m (p0, z0, z1, 11), + z0 = svmad_m (p0, z0, z1, 11)) + +/* +** mad_11_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s8_m_untied, svint8_t, + z0 = svmad_n_s8_m 
(p0, z1, z2, 11), + z0 = svmad_m (p0, z1, z2, 11)) + +/* +** mad_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_z_tied1, svint8_t, + z0 = svmad_s8_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_z_tied2, svint8_t, + z0 = svmad_s8_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_s8_z_tied3: +** movprfx z0\.b, p0/z, z0\.b +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_z_tied3, svint8_t, + z0 = svmad_s8_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** mad z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, z1\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z3\.b +** mla z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_s8_z_untied, svint8_t, + z0 = svmad_s8_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svmad_n_s8_z (p0, z0, z1, x0), + z0 = svmad_z (p0, z0, z1, x0)) + +/* +** mad_w0_s8_z_tied2: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s8_z_tied2, svint8_t, int8_t, + z0 = svmad_n_s8_z (p0, z1, z0, x0), + z0 = svmad_z (p0, z1, z0, x0)) + +/* +** mad_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mad z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, z1\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s8_z_untied, svint8_t, int8_t, + z0 = svmad_n_s8_z (p0, z1, z2, x0), + z0 = svmad_z (p0, z1, z2, x0)) + +/* +** mad_11_s8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s8_z_tied1, svint8_t, + z0 = svmad_n_s8_z (p0, z0, z1, 11), + z0 = svmad_z (p0, z0, z1, 11)) + +/* +** mad_11_s8_z_tied2: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s8_z_tied2, svint8_t, + z0 = svmad_n_s8_z (p0, z1, z0, 11), + z0 = svmad_z (p0, z1, z0, 11)) + +/* +** mad_11_s8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mad z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, z1\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_11_s8_z_untied, svint8_t, + z0 = svmad_n_s8_z (p0, z1, z2, 11), + z0 = svmad_z (p0, z1, z2, 11)) + +/* +** mad_s8_x_tied1: +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_x_tied1, svint8_t, + z0 = svmad_s8_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_s8_x_tied2: +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_x_tied2, svint8_t, + z0 = svmad_s8_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_s8_x_tied3: +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_s8_x_tied3, svint8_t, + z0 = svmad_s8_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_s8_x_untied: +** ( +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0, z2 +** mad z0\.b, p0/m, 
z1\.b, z3\.b +** | +** movprfx z0, z3 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_s8_x_untied, svint8_t, + z0 = svmad_s8_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svmad_n_s8_x (p0, z0, z1, x0), + z0 = svmad_x (p0, z0, z1, x0)) + +/* +** mad_w0_s8_x_tied2: +** mov (z[0-9]+\.b), w0 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s8_x_tied2, svint8_t, int8_t, + z0 = svmad_n_s8_x (p0, z1, z0, x0), + z0 = svmad_x (p0, z1, z0, x0)) + +/* +** mad_w0_s8_x_untied: +** mov z0\.b, w0 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_s8_x_untied, svint8_t, int8_t, + z0 = svmad_n_s8_x (p0, z1, z2, x0), + z0 = svmad_x (p0, z1, z2, x0)) + +/* +** mad_11_s8_x_tied1: +** mov (z[0-9]+\.b), #11 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s8_x_tied1, svint8_t, + z0 = svmad_n_s8_x (p0, z0, z1, 11), + z0 = svmad_x (p0, z0, z1, 11)) + +/* +** mad_11_s8_x_tied2: +** mov (z[0-9]+\.b), #11 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_s8_x_tied2, svint8_t, + z0 = svmad_n_s8_x (p0, z1, z0, 11), + z0 = svmad_x (p0, z1, z0, 11)) + +/* +** mad_11_s8_x_untied: +** mov z0\.b, #11 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_11_s8_x_untied, svint8_t, + z0 = svmad_n_s8_x (p0, z1, z2, 11), + z0 = svmad_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u16.c new file mode 100644 index 000000000..1d2ad9c5f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u16.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_u16_m_tied1: +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_m_tied1, svuint16_t, + z0 = svmad_u16_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_m_tied2, svuint16_t, + z0 = svmad_u16_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_u16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_m_tied3, svuint16_t, + z0 = svmad_u16_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_u16_m_untied: +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_m_untied, svuint16_t, + z0 = svmad_u16_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svmad_n_u16_m (p0, z0, z1, x0), + z0 = svmad_m (p0, z0, z1, x0)) + +/* +** mad_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svmad_n_u16_m (p0, z1, z2, x0), + z0 = svmad_m (p0, z1, z2, x0)) + +/* +** mad_11_u16_m_tied1: +** mov (z[0-9]+\.h), #11 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u16_m_tied1, svuint16_t, + z0 = svmad_n_u16_m (p0, z0, z1, 11), + z0 = svmad_m (p0, z0, z1, 11)) + +/* +** mad_11_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx 
z0, z1 +** mad z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u16_m_untied, svuint16_t, + z0 = svmad_n_u16_m (p0, z1, z2, 11), + z0 = svmad_m (p0, z1, z2, 11)) + +/* +** mad_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_z_tied1, svuint16_t, + z0 = svmad_u16_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_z_tied2, svuint16_t, + z0 = svmad_u16_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_u16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_z_tied3, svuint16_t, + z0 = svmad_u16_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** mad z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** mla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_u16_z_untied, svuint16_t, + z0 = svmad_u16_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svmad_n_u16_z (p0, z0, z1, x0), + z0 = svmad_z (p0, z0, z1, x0)) + +/* +** mad_w0_u16_z_tied2: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u16_z_tied2, svuint16_t, uint16_t, + z0 = svmad_n_u16_z (p0, z1, z0, x0), + z0 = svmad_z (p0, z1, z0, x0)) + +/* +** mad_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mad z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svmad_n_u16_z (p0, z1, z2, x0), + z0 = svmad_z (p0, z1, z2, x0)) + +/* +** mad_11_u16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u16_z_tied1, svuint16_t, + z0 = svmad_n_u16_z (p0, z0, z1, 11), + z0 = svmad_z (p0, z0, z1, 11)) + +/* +** mad_11_u16_z_tied2: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u16_z_tied2, svuint16_t, + z0 = svmad_n_u16_z (p0, z1, z0, 11), + z0 = svmad_z (p0, z1, z0, 11)) + +/* +** mad_11_u16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mad z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_11_u16_z_untied, svuint16_t, + z0 = svmad_n_u16_z (p0, z1, z2, 11), + z0 = svmad_z (p0, z1, z2, 11)) + +/* +** mad_u16_x_tied1: +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_x_tied1, svuint16_t, + z0 = svmad_u16_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_u16_x_tied2: +** mad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_x_tied2, svuint16_t, + z0 = svmad_u16_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_u16_x_tied3: +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_u16_x_tied3, svuint16_t, + z0 = 
svmad_u16_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_u16_x_untied: +** ( +** movprfx z0, z1 +** mad z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** mad z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0, z3 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_u16_x_untied, svuint16_t, + z0 = svmad_u16_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svmad_n_u16_x (p0, z0, z1, x0), + z0 = svmad_x (p0, z0, z1, x0)) + +/* +** mad_w0_u16_x_tied2: +** mov (z[0-9]+\.h), w0 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u16_x_tied2, svuint16_t, uint16_t, + z0 = svmad_n_u16_x (p0, z1, z0, x0), + z0 = svmad_x (p0, z1, z0, x0)) + +/* +** mad_w0_u16_x_untied: +** mov z0\.h, w0 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svmad_n_u16_x (p0, z1, z2, x0), + z0 = svmad_x (p0, z1, z2, x0)) + +/* +** mad_11_u16_x_tied1: +** mov (z[0-9]+\.h), #11 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u16_x_tied1, svuint16_t, + z0 = svmad_n_u16_x (p0, z0, z1, 11), + z0 = svmad_x (p0, z0, z1, 11)) + +/* +** mad_11_u16_x_tied2: +** mov (z[0-9]+\.h), #11 +** mad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u16_x_tied2, svuint16_t, + z0 = svmad_n_u16_x (p0, z1, z0, 11), + z0 = svmad_x (p0, z1, z0, 11)) + +/* +** mad_11_u16_x_untied: +** mov z0\.h, #11 +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mad_11_u16_x_untied, svuint16_t, + z0 = svmad_n_u16_x (p0, z1, z2, 11), + z0 = svmad_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u32.c new file mode 100644 index 000000000..4b51958b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u32.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_u32_m_tied1: +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_m_tied1, svuint32_t, + z0 = svmad_u32_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_m_tied2, svuint32_t, + z0 = svmad_u32_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_u32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_m_tied3, svuint32_t, + z0 = svmad_u32_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_u32_m_untied: +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_m_untied, svuint32_t, + z0 = svmad_u32_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svmad_n_u32_m (p0, z0, z1, x0), + z0 = svmad_m (p0, z0, z1, x0)) + +/* +** mad_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svmad_n_u32_m (p0, z1, z2, x0), + z0 = svmad_m (p0, z1, z2, x0)) + +/* +** mad_11_u32_m_tied1: +** mov (z[0-9]+\.s), #11 +** mad z0\.s, p0/m, z1\.s, \1 
+** ret +*/ +TEST_UNIFORM_Z (mad_11_u32_m_tied1, svuint32_t, + z0 = svmad_n_u32_m (p0, z0, z1, 11), + z0 = svmad_m (p0, z0, z1, 11)) + +/* +** mad_11_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u32_m_untied, svuint32_t, + z0 = svmad_n_u32_m (p0, z1, z2, 11), + z0 = svmad_m (p0, z1, z2, 11)) + +/* +** mad_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_z_tied1, svuint32_t, + z0 = svmad_u32_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_z_tied2, svuint32_t, + z0 = svmad_u32_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_u32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_z_tied3, svuint32_t, + z0 = svmad_u32_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** mad z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** mla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_u32_z_untied, svuint32_t, + z0 = svmad_u32_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svmad_n_u32_z (p0, z0, z1, x0), + z0 = svmad_z (p0, z0, z1, x0)) + +/* +** mad_w0_u32_z_tied2: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u32_z_tied2, svuint32_t, uint32_t, + z0 = svmad_n_u32_z (p0, z1, z0, x0), + z0 = svmad_z (p0, z1, z0, x0)) + +/* +** mad_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mad z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svmad_n_u32_z (p0, z1, z2, x0), + z0 = svmad_z (p0, z1, z2, x0)) + +/* +** mad_11_u32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u32_z_tied1, svuint32_t, + z0 = svmad_n_u32_z (p0, z0, z1, 11), + z0 = svmad_z (p0, z0, z1, 11)) + +/* +** mad_11_u32_z_tied2: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u32_z_tied2, svuint32_t, + z0 = svmad_n_u32_z (p0, z1, z0, 11), + z0 = svmad_z (p0, z1, z0, 11)) + +/* +** mad_11_u32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mad z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_11_u32_z_untied, svuint32_t, + z0 = svmad_n_u32_z (p0, z1, z2, 11), + z0 = svmad_z (p0, z1, z2, 11)) + +/* +** mad_u32_x_tied1: +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_x_tied1, svuint32_t, + z0 = svmad_u32_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_u32_x_tied2: +** mad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z 
(mad_u32_x_tied2, svuint32_t, + z0 = svmad_u32_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_u32_x_tied3: +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_u32_x_tied3, svuint32_t, + z0 = svmad_u32_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_u32_x_untied: +** ( +** movprfx z0, z1 +** mad z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** mad z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0, z3 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_u32_x_untied, svuint32_t, + z0 = svmad_u32_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svmad_n_u32_x (p0, z0, z1, x0), + z0 = svmad_x (p0, z0, z1, x0)) + +/* +** mad_w0_u32_x_tied2: +** mov (z[0-9]+\.s), w0 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u32_x_tied2, svuint32_t, uint32_t, + z0 = svmad_n_u32_x (p0, z1, z0, x0), + z0 = svmad_x (p0, z1, z0, x0)) + +/* +** mad_w0_u32_x_untied: +** mov z0\.s, w0 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svmad_n_u32_x (p0, z1, z2, x0), + z0 = svmad_x (p0, z1, z2, x0)) + +/* +** mad_11_u32_x_tied1: +** mov (z[0-9]+\.s), #11 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u32_x_tied1, svuint32_t, + z0 = svmad_n_u32_x (p0, z0, z1, 11), + z0 = svmad_x (p0, z0, z1, 11)) + +/* +** mad_11_u32_x_tied2: +** mov (z[0-9]+\.s), #11 +** mad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u32_x_tied2, svuint32_t, + z0 = svmad_n_u32_x (p0, z1, z0, 11), + z0 = svmad_x (p0, z1, z0, 11)) + +/* +** mad_11_u32_x_untied: +** mov z0\.s, #11 +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mad_11_u32_x_untied, svuint32_t, + z0 = svmad_n_u32_x (p0, z1, z2, 11), + z0 = svmad_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u64.c new file mode 100644 index 000000000..c4939093e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u64.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_u64_m_tied1: +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_u64_m_tied1, svuint64_t, + z0 = svmad_u64_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mad z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_u64_m_tied2, svuint64_t, + z0 = svmad_u64_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_u64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_u64_m_tied3, svuint64_t, + z0 = svmad_u64_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_u64_m_untied: +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mad_u64_m_untied, svuint64_t, + z0 = svmad_u64_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svmad_n_u64_m (p0, z0, z1, x0), + z0 = svmad_m (p0, z0, z1, x0)) + +/* +** mad_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, \1 +** ret +*/ 
+TEST_UNIFORM_ZX (mad_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svmad_n_u64_m (p0, z1, z2, x0), + z0 = svmad_m (p0, z1, z2, x0)) + +/* +** mad_11_u64_m_tied1: +** mov (z[0-9]+\.d), #11 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u64_m_tied1, svuint64_t, + z0 = svmad_n_u64_m (p0, z0, z1, 11), + z0 = svmad_m (p0, z0, z1, 11)) + +/* +** mad_11_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #11 +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u64_m_untied, svuint64_t, + z0 = svmad_n_u64_m (p0, z1, z2, 11), + z0 = svmad_m (p0, z1, z2, 11)) + +/* +** mad_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_u64_z_tied1, svuint64_t, + z0 = svmad_u64_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_u64_z_tied2, svuint64_t, + z0 = svmad_u64_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_u64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_u64_z_tied3, svuint64_t, + z0 = svmad_u64_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** mad z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** mla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_u64_z_untied, svuint64_t, + z0 = svmad_u64_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svmad_n_u64_z (p0, z0, z1, x0), + z0 = svmad_z (p0, z0, z1, x0)) + +/* +** mad_x0_u64_z_tied2: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_u64_z_tied2, svuint64_t, uint64_t, + z0 = svmad_n_u64_z (p0, z1, z0, x0), + z0 = svmad_z (p0, z1, z0, x0)) + +/* +** mad_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mad z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svmad_n_u64_z (p0, z1, z2, x0), + z0 = svmad_z (p0, z1, z2, x0)) + +/* +** mad_11_u64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u64_z_tied1, svuint64_t, + z0 = svmad_n_u64_z (p0, z0, z1, 11), + z0 = svmad_z (p0, z0, z1, 11)) + +/* +** mad_11_u64_z_tied2: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u64_z_tied2, svuint64_t, + z0 = svmad_n_u64_z (p0, z1, z0, 11), + z0 = svmad_z (p0, z1, z0, 11)) + +/* +** mad_11_u64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mad z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_11_u64_z_untied, svuint64_t, + z0 = svmad_n_u64_z (p0, z1, z2, 11), + z0 = svmad_z (p0, z1, z2, 11)) + +/* +** mad_u64_x_tied1: +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ 
+TEST_UNIFORM_Z (mad_u64_x_tied1, svuint64_t, + z0 = svmad_u64_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_u64_x_tied2: +** mad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_u64_x_tied2, svuint64_t, + z0 = svmad_u64_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_u64_x_tied3: +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_u64_x_tied3, svuint64_t, + z0 = svmad_u64_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_u64_x_untied: +** ( +** movprfx z0, z1 +** mad z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** mad z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0, z3 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_u64_x_untied, svuint64_t, + z0 = svmad_u64_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svmad_n_u64_x (p0, z0, z1, x0), + z0 = svmad_x (p0, z0, z1, x0)) + +/* +** mad_x0_u64_x_tied2: +** mov (z[0-9]+\.d), x0 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_u64_x_tied2, svuint64_t, uint64_t, + z0 = svmad_n_u64_x (p0, z1, z0, x0), + z0 = svmad_x (p0, z1, z0, x0)) + +/* +** mad_x0_u64_x_untied: +** mov z0\.d, x0 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_ZX (mad_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svmad_n_u64_x (p0, z1, z2, x0), + z0 = svmad_x (p0, z1, z2, x0)) + +/* +** mad_11_u64_x_tied1: +** mov (z[0-9]+\.d), #11 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u64_x_tied1, svuint64_t, + z0 = svmad_n_u64_x (p0, z0, z1, 11), + z0 = svmad_x (p0, z0, z1, 11)) + +/* +** mad_11_u64_x_tied2: +** mov (z[0-9]+\.d), #11 +** mad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u64_x_tied2, svuint64_t, + z0 = svmad_n_u64_x (p0, z1, z0, 11), + z0 = svmad_x (p0, z1, z0, 11)) + +/* +** mad_11_u64_x_untied: +** mov z0\.d, #11 +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mad_11_u64_x_untied, svuint64_t, + z0 = svmad_n_u64_x (p0, z1, z2, 11), + z0 = svmad_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u8.c new file mode 100644 index 000000000..0b4b1b8cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mad_u8.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mad_u8_m_tied1: +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_m_tied1, svuint8_t, + z0 = svmad_u8_m (p0, z0, z1, z2), + z0 = svmad_m (p0, z0, z1, z2)) + +/* +** mad_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.b, p0/m, \1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_m_tied2, svuint8_t, + z0 = svmad_u8_m (p0, z1, z0, z2), + z0 = svmad_m (p0, z1, z0, z2)) + +/* +** mad_u8_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_m_tied3, svuint8_t, + z0 = svmad_u8_m (p0, z1, z2, z0), + z0 = svmad_m (p0, z1, z2, z0)) + +/* +** mad_u8_m_untied: +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, z3\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_m_untied, svuint8_t, + z0 = svmad_u8_m (p0, z1, z2, z3), + z0 = svmad_m (p0, z1, z2, z3)) + +/* +** mad_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u8_m_tied1, svuint8_t, uint8_t, + 
z0 = svmad_n_u8_m (p0, z0, z1, x0), + z0 = svmad_m (p0, z0, z1, x0)) + +/* +** mad_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svmad_n_u8_m (p0, z1, z2, x0), + z0 = svmad_m (p0, z1, z2, x0)) + +/* +** mad_11_u8_m_tied1: +** mov (z[0-9]+\.b), #11 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u8_m_tied1, svuint8_t, + z0 = svmad_n_u8_m (p0, z0, z1, 11), + z0 = svmad_m (p0, z0, z1, 11)) + +/* +** mad_11_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u8_m_untied, svuint8_t, + z0 = svmad_n_u8_m (p0, z1, z2, 11), + z0 = svmad_m (p0, z1, z2, 11)) + +/* +** mad_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_z_tied1, svuint8_t, + z0 = svmad_u8_z (p0, z0, z1, z2), + z0 = svmad_z (p0, z0, z1, z2)) + +/* +** mad_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_z_tied2, svuint8_t, + z0 = svmad_u8_z (p0, z1, z0, z2), + z0 = svmad_z (p0, z1, z0, z2)) + +/* +** mad_u8_z_tied3: +** movprfx z0\.b, p0/z, z0\.b +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_z_tied3, svuint8_t, + z0 = svmad_u8_z (p0, z1, z2, z0), + z0 = svmad_z (p0, z1, z2, z0)) + +/* +** mad_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** mad z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, z1\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z3\.b +** mla z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_u8_z_untied, svuint8_t, + z0 = svmad_u8_z (p0, z1, z2, z3), + z0 = svmad_z (p0, z1, z2, z3)) + +/* +** mad_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svmad_n_u8_z (p0, z0, z1, x0), + z0 = svmad_z (p0, z0, z1, x0)) + +/* +** mad_w0_u8_z_tied2: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u8_z_tied2, svuint8_t, uint8_t, + z0 = svmad_n_u8_z (p0, z1, z0, x0), + z0 = svmad_z (p0, z1, z0, x0)) + +/* +** mad_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mad z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, z1\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svmad_n_u8_z (p0, z1, z2, x0), + z0 = svmad_z (p0, z1, z2, x0)) + +/* +** mad_11_u8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u8_z_tied1, svuint8_t, + z0 = svmad_n_u8_z (p0, z0, z1, 11), + z0 = svmad_z (p0, z0, z1, 11)) + +/* +** mad_11_u8_z_tied2: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u8_z_tied2, svuint8_t, + z0 = svmad_n_u8_z (p0, z1, z0, 11), + z0 = svmad_z (p0, z1, z0, 11)) + +/* +** mad_11_u8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mad z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, z1\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_11_u8_z_untied, 
svuint8_t, + z0 = svmad_n_u8_z (p0, z1, z2, 11), + z0 = svmad_z (p0, z1, z2, 11)) + +/* +** mad_u8_x_tied1: +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_x_tied1, svuint8_t, + z0 = svmad_u8_x (p0, z0, z1, z2), + z0 = svmad_x (p0, z0, z1, z2)) + +/* +** mad_u8_x_tied2: +** mad z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_x_tied2, svuint8_t, + z0 = svmad_u8_x (p0, z1, z0, z2), + z0 = svmad_x (p0, z1, z0, z2)) + +/* +** mad_u8_x_tied3: +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_u8_x_tied3, svuint8_t, + z0 = svmad_u8_x (p0, z1, z2, z0), + z0 = svmad_x (p0, z1, z2, z0)) + +/* +** mad_u8_x_untied: +** ( +** movprfx z0, z1 +** mad z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0, z2 +** mad z0\.b, p0/m, z1\.b, z3\.b +** | +** movprfx z0, z3 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mad_u8_x_untied, svuint8_t, + z0 = svmad_u8_x (p0, z1, z2, z3), + z0 = svmad_x (p0, z1, z2, z3)) + +/* +** mad_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svmad_n_u8_x (p0, z0, z1, x0), + z0 = svmad_x (p0, z0, z1, x0)) + +/* +** mad_w0_u8_x_tied2: +** mov (z[0-9]+\.b), w0 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u8_x_tied2, svuint8_t, uint8_t, + z0 = svmad_n_u8_x (p0, z1, z0, x0), + z0 = svmad_x (p0, z1, z0, x0)) + +/* +** mad_w0_u8_x_untied: +** mov z0\.b, w0 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_ZX (mad_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svmad_n_u8_x (p0, z1, z2, x0), + z0 = svmad_x (p0, z1, z2, x0)) + +/* +** mad_11_u8_x_tied1: +** mov (z[0-9]+\.b), #11 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u8_x_tied1, svuint8_t, + z0 = svmad_n_u8_x (p0, z0, z1, 11), + z0 = svmad_x (p0, z0, z1, 11)) + +/* +** mad_11_u8_x_tied2: +** mov (z[0-9]+\.b), #11 +** mad z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mad_11_u8_x_tied2, svuint8_t, + z0 = svmad_n_u8_x (p0, z1, z0, 11), + z0 = svmad_x (p0, z1, z0, 11)) + +/* +** mad_11_u8_x_untied: +** mov z0\.b, #11 +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mad_11_u8_x_untied, svuint8_t, + z0 = svmad_n_u8_x (p0, z1, z2, 11), + z0 = svmad_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f16.c new file mode 100644 index 000000000..f21099a24 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f16.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_f16_m_tied1: +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_f16_m_tied1, svfloat16_t, + z0 = svmax_f16_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmax z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (max_f16_m_tied2, svfloat16_t, + z0 = svmax_f16_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_f16_m_untied: +** movprfx z0, z1 +** fmax z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (max_f16_m_untied, svfloat16_t, + z0 = svmax_f16_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmax_n_f16_m (p0, z0, d4), + z0 = svmax_m (p0, z0, d4)) + +/* +** max_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 
+** movprfx z0, z1 +** fmax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmax_n_f16_m (p0, z1, d4), + z0 = svmax_m (p0, z1, d4)) + +/* +** max_0_f16_m_tied1: +** fmax z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f16_m_tied1, svfloat16_t, + z0 = svmax_n_f16_m (p0, z0, 0), + z0 = svmax_m (p0, z0, 0)) + +/* +** max_0_f16_m_untied: +** movprfx z0, z1 +** fmax z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f16_m_untied, svfloat16_t, + z0 = svmax_n_f16_m (p0, z1, 0), + z0 = svmax_m (p0, z1, 0)) + +/* +** max_1_f16_m_tied1: +** fmax z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f16_m_tied1, svfloat16_t, + z0 = svmax_n_f16_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_f16_m_untied: +** movprfx z0, z1 +** fmax z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f16_m_untied, svfloat16_t, + z0 = svmax_n_f16_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_2_f16_m: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f16_m, svfloat16_t, + z0 = svmax_n_f16_m (p0, z0, 2), + z0 = svmax_m (p0, z0, 2)) + +/* +** max_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_f16_z_tied1, svfloat16_t, + z0 = svmax_f16_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_f16_z_tied2, svfloat16_t, + z0 = svmax_f16_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmax z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (max_f16_z_untied, svfloat16_t, + z0 = svmax_f16_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmax_n_f16_z (p0, z0, d4), + z0 = svmax_z (p0, z0, d4)) + +/* +** max_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmax z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (max_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmax_n_f16_z (p0, z1, d4), + z0 = svmax_z (p0, z1, d4)) + +/* +** max_0_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmax z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f16_z_tied1, svfloat16_t, + z0 = svmax_n_f16_z (p0, z0, 0), + z0 = svmax_z (p0, z0, 0)) + +/* +** max_0_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmax z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f16_z_untied, svfloat16_t, + z0 = svmax_n_f16_z (p0, z1, 0), + z0 = svmax_z (p0, z1, 0)) + +/* +** max_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmax z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f16_z_tied1, svfloat16_t, + z0 = svmax_n_f16_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmax z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f16_z_untied, svfloat16_t, + z0 = svmax_n_f16_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_2_f16_z: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fmax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f16_z, svfloat16_t, + z0 = svmax_n_f16_z (p0, z0, 2), + z0 = svmax_z (p0, z0, 2)) + +/* +** max_f16_x_tied1: +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_f16_x_tied1, svfloat16_t, + z0 = svmax_f16_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_f16_x_tied2: +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_f16_x_tied2, svfloat16_t, + z0 = svmax_f16_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_f16_x_untied: +** ( +** movprfx z0, z1 +** fmax z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (max_f16_x_untied, svfloat16_t, + z0 = svmax_f16_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmax_n_f16_x (p0, z0, d4), + z0 = svmax_x (p0, z0, d4)) + +/* +** max_h4_f16_x_untied: +** mov z0\.h, h4 +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (max_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmax_n_f16_x (p0, z1, d4), + z0 = svmax_x (p0, z1, d4)) + +/* +** max_0_f16_x_tied1: +** fmax z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f16_x_tied1, svfloat16_t, + z0 = svmax_n_f16_x (p0, z0, 0), + z0 = svmax_x (p0, z0, 0)) + +/* +** max_0_f16_x_untied: +** movprfx z0, z1 +** fmax z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f16_x_untied, svfloat16_t, + z0 = svmax_n_f16_x (p0, z1, 0), + z0 = svmax_x (p0, z1, 0)) + +/* +** max_1_f16_x_tied1: +** fmax z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f16_x_tied1, svfloat16_t, + z0 = svmax_n_f16_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_f16_x_untied: +** movprfx z0, z1 +** fmax z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f16_x_untied, svfloat16_t, + z0 = svmax_n_f16_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f16_x_tied1, svfloat16_t, + z0 = svmax_n_f16_x (p0, z0, 2), + z0 = svmax_x (p0, z0, 2)) + +/* +** max_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fmax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_2_f16_x_untied, svfloat16_t, + z0 = svmax_n_f16_x (p0, z1, 2), + z0 = svmax_x (p0, z1, 2)) + +/* +** ptrue_max_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f16_x_tied1, svfloat16_t, + z0 = svmax_f16_x (svptrue_b16 (), z0, z1), + z0 = svmax_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_max_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f16_x_tied2, svfloat16_t, + z0 = svmax_f16_x (svptrue_b16 (), z1, z0), + z0 = svmax_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_max_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f16_x_untied, svfloat16_t, + z0 = svmax_f16_x (svptrue_b16 (), z1, z2), + z0 = svmax_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_max_0_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_0_f16_x_tied1, svfloat16_t, + z0 = svmax_n_f16_x (svptrue_b16 (), z0, 0), + z0 = svmax_x (svptrue_b16 (), z0, 0)) + +/* +** ptrue_max_0_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_max_0_f16_x_untied, svfloat16_t, + z0 = svmax_n_f16_x (svptrue_b16 (), z1, 0), + z0 = svmax_x (svptrue_b16 (), z1, 0)) + +/* +** ptrue_max_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_1_f16_x_tied1, svfloat16_t, + z0 = svmax_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svmax_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_max_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_1_f16_x_untied, svfloat16_t, + z0 = svmax_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svmax_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_max_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_2_f16_x_tied1, svfloat16_t, + z0 = svmax_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svmax_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_max_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_2_f16_x_untied, svfloat16_t, + z0 = svmax_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svmax_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f32.c new file mode 100644 index 000000000..6f5c92c9f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f32.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_f32_m_tied1: +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_f32_m_tied1, svfloat32_t, + z0 = svmax_f32_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmax z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (max_f32_m_tied2, svfloat32_t, + z0 = svmax_f32_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_f32_m_untied: +** movprfx z0, z1 +** fmax z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (max_f32_m_untied, svfloat32_t, + z0 = svmax_f32_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmax_n_f32_m (p0, z0, d4), + z0 = svmax_m (p0, z0, d4)) + +/* +** max_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_s4_f32_m_untied, svfloat32_t, float, + z0 = svmax_n_f32_m (p0, z1, d4), + z0 = svmax_m (p0, z1, d4)) + +/* +** max_0_f32_m_tied1: +** fmax z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f32_m_tied1, svfloat32_t, + z0 = svmax_n_f32_m (p0, z0, 0), + z0 = svmax_m (p0, z0, 0)) + +/* +** max_0_f32_m_untied: +** movprfx z0, z1 +** fmax z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f32_m_untied, svfloat32_t, + z0 = svmax_n_f32_m (p0, z1, 0), + z0 = svmax_m (p0, z1, 0)) + +/* +** max_1_f32_m_tied1: +** fmax z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f32_m_tied1, svfloat32_t, + z0 = svmax_n_f32_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_f32_m_untied: +** movprfx z0, z1 +** fmax z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f32_m_untied, svfloat32_t, + z0 = svmax_n_f32_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_2_f32_m: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** fmax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f32_m, svfloat32_t, + z0 = svmax_n_f32_m (p0, z0, 2), + z0 = svmax_m (p0, z0, 2)) + +/* +** max_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_f32_z_tied1, svfloat32_t, + z0 = svmax_f32_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_f32_z_tied2, svfloat32_t, + z0 = svmax_f32_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmax z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (max_f32_z_untied, svfloat32_t, + z0 = svmax_f32_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmax_n_f32_z (p0, z0, d4), + z0 = svmax_z (p0, z0, d4)) + +/* +** max_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmax z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (max_s4_f32_z_untied, svfloat32_t, float, + z0 = svmax_n_f32_z (p0, z1, d4), + z0 = svmax_z (p0, z1, d4)) + +/* +** max_0_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmax z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f32_z_tied1, svfloat32_t, + z0 = svmax_n_f32_z (p0, z0, 0), + z0 = svmax_z (p0, z0, 0)) + +/* +** max_0_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmax z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f32_z_untied, svfloat32_t, + z0 = svmax_n_f32_z (p0, z1, 0), + z0 = svmax_z (p0, z1, 0)) + +/* +** max_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmax z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f32_z_tied1, svfloat32_t, + z0 = svmax_n_f32_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmax z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f32_z_untied, svfloat32_t, + z0 = svmax_n_f32_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_2_f32_z: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fmax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f32_z, svfloat32_t, + z0 = svmax_n_f32_z (p0, z0, 2), + z0 = svmax_z (p0, z0, 2)) + +/* +** max_f32_x_tied1: +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_f32_x_tied1, svfloat32_t, + z0 = svmax_f32_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_f32_x_tied2: +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_f32_x_tied2, svfloat32_t, + z0 = svmax_f32_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_f32_x_untied: +** ( +** movprfx z0, z1 +** fmax z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (max_f32_x_untied, svfloat32_t, + z0 = svmax_f32_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmax_n_f32_x (p0, z0, d4), + z0 = svmax_x (p0, z0, d4)) + +/* +** max_s4_f32_x_untied: +** mov z0\.s, s4 +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (max_s4_f32_x_untied, svfloat32_t, float, + z0 = svmax_n_f32_x (p0, z1, d4), + z0 = svmax_x (p0, z1, d4)) + +/* +** max_0_f32_x_tied1: +** fmax z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f32_x_tied1, svfloat32_t, + z0 = svmax_n_f32_x (p0, z0, 0), + z0 = svmax_x (p0, z0, 0)) + +/* +** max_0_f32_x_untied: +** movprfx z0, z1 +** fmax z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f32_x_untied, svfloat32_t, + z0 = svmax_n_f32_x (p0, z1, 0), + z0 = svmax_x (p0, z1, 0)) + +/* +** max_1_f32_x_tied1: +** fmax z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f32_x_tied1, svfloat32_t, + z0 = svmax_n_f32_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_f32_x_untied: +** movprfx z0, z1 +** fmax z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f32_x_untied, svfloat32_t, + z0 = svmax_n_f32_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f32_x_tied1, svfloat32_t, + z0 = svmax_n_f32_x (p0, z0, 2), + z0 = svmax_x (p0, z0, 2)) + +/* +** max_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fmax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_2_f32_x_untied, svfloat32_t, + z0 = svmax_n_f32_x (p0, z1, 2), + z0 = svmax_x (p0, z1, 2)) + +/* +** ptrue_max_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f32_x_tied1, svfloat32_t, + z0 = svmax_f32_x (svptrue_b32 (), z0, z1), + z0 = svmax_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_max_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f32_x_tied2, svfloat32_t, + z0 = svmax_f32_x (svptrue_b32 (), z1, z0), + z0 = svmax_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_max_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f32_x_untied, svfloat32_t, + z0 = svmax_f32_x (svptrue_b32 (), z1, z2), + z0 = svmax_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_max_0_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_0_f32_x_tied1, svfloat32_t, + z0 = svmax_n_f32_x (svptrue_b32 (), z0, 0), + z0 = svmax_x (svptrue_b32 (), z0, 0)) + +/* +** ptrue_max_0_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_max_0_f32_x_untied, svfloat32_t, + z0 = svmax_n_f32_x (svptrue_b32 (), z1, 0), + z0 = svmax_x (svptrue_b32 (), z1, 0)) + +/* +** ptrue_max_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_1_f32_x_tied1, svfloat32_t, + z0 = svmax_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svmax_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_max_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_1_f32_x_untied, svfloat32_t, + z0 = svmax_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svmax_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_max_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_2_f32_x_tied1, svfloat32_t, + z0 = svmax_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svmax_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_max_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_2_f32_x_untied, svfloat32_t, + z0 = svmax_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svmax_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f64.c new file mode 100644 index 000000000..8ac6cca75 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_f64.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_f64_m_tied1: +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_f64_m_tied1, svfloat64_t, + z0 = svmax_f64_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_f64_m_tied2, svfloat64_t, + z0 = svmax_f64_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_f64_m_untied: +** movprfx z0, z1 +** fmax z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (max_f64_m_untied, svfloat64_t, + z0 = svmax_f64_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmax_n_f64_m (p0, z0, d4), + z0 = svmax_m (p0, z0, d4)) + +/* +** max_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_d4_f64_m_untied, svfloat64_t, double, + z0 = svmax_n_f64_m (p0, z1, d4), + z0 = svmax_m (p0, z1, d4)) + +/* +** max_0_f64_m_tied1: +** fmax z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f64_m_tied1, svfloat64_t, + z0 = svmax_n_f64_m (p0, z0, 0), + z0 = svmax_m (p0, z0, 0)) + +/* +** max_0_f64_m_untied: +** movprfx z0, z1 +** fmax z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f64_m_untied, svfloat64_t, + z0 = svmax_n_f64_m (p0, z1, 0), + z0 = svmax_m (p0, z1, 0)) + +/* +** max_1_f64_m_tied1: +** fmax z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f64_m_tied1, svfloat64_t, + z0 = svmax_n_f64_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_f64_m_untied: +** movprfx z0, z1 +** fmax z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f64_m_untied, svfloat64_t, + z0 = svmax_n_f64_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_2_f64_m: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** fmax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f64_m, svfloat64_t, + z0 = svmax_n_f64_m (p0, z0, 2), + z0 = svmax_m (p0, z0, 2)) + +/* +** max_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_f64_z_tied1, svfloat64_t, + z0 = svmax_f64_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_f64_z_tied2, svfloat64_t, + z0 = svmax_f64_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmax z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (max_f64_z_untied, svfloat64_t, + z0 = svmax_f64_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmax_n_f64_z (p0, z0, d4), + z0 = svmax_z (p0, z0, d4)) + +/* +** max_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmax z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (max_d4_f64_z_untied, svfloat64_t, double, + z0 = svmax_n_f64_z (p0, z1, d4), + z0 = svmax_z (p0, z1, d4)) + +/* +** max_0_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmax z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f64_z_tied1, svfloat64_t, + z0 = svmax_n_f64_z (p0, z0, 0), + z0 = svmax_z (p0, z0, 0)) + +/* +** max_0_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmax z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f64_z_untied, svfloat64_t, + z0 = svmax_n_f64_z (p0, z1, 0), + z0 = svmax_z (p0, z1, 0)) + +/* +** max_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmax z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f64_z_tied1, svfloat64_t, + z0 = svmax_n_f64_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmax z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f64_z_untied, svfloat64_t, + z0 = svmax_n_f64_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_2_f64_z: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fmax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f64_z, svfloat64_t, + z0 = svmax_n_f64_z (p0, z0, 2), + z0 = svmax_z (p0, z0, 2)) + +/* +** max_f64_x_tied1: +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_f64_x_tied1, svfloat64_t, + z0 = svmax_f64_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_f64_x_tied2: +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_f64_x_tied2, svfloat64_t, + z0 = svmax_f64_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_f64_x_untied: +** ( +** movprfx z0, z1 +** fmax z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (max_f64_x_untied, svfloat64_t, + z0 = svmax_f64_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (max_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmax_n_f64_x (p0, z0, d4), + z0 = svmax_x (p0, z0, d4)) + +/* +** max_d4_f64_x_untied: +** mov z0\.d, d4 +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (max_d4_f64_x_untied, svfloat64_t, double, + z0 = svmax_n_f64_x (p0, z1, d4), + z0 = svmax_x (p0, z1, d4)) + +/* +** max_0_f64_x_tied1: +** fmax z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f64_x_tied1, svfloat64_t, + z0 = svmax_n_f64_x (p0, z0, 0), + z0 = svmax_x (p0, z0, 0)) + +/* +** max_0_f64_x_untied: +** movprfx z0, z1 +** fmax z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (max_0_f64_x_untied, svfloat64_t, + z0 = svmax_n_f64_x (p0, z1, 0), + z0 = svmax_x (p0, z1, 0)) + +/* +** max_1_f64_x_tied1: +** fmax z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f64_x_tied1, svfloat64_t, + z0 = svmax_n_f64_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_f64_x_untied: +** movprfx z0, z1 +** fmax z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (max_1_f64_x_untied, svfloat64_t, + z0 = svmax_n_f64_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_2_f64_x_tied1, svfloat64_t, + z0 = svmax_n_f64_x (p0, z0, 2), + z0 = svmax_x (p0, z0, 2)) + +/* +** max_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fmax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_2_f64_x_untied, svfloat64_t, + z0 = svmax_n_f64_x (p0, z1, 2), + z0 = svmax_x (p0, z1, 2)) + +/* +** ptrue_max_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f64_x_tied1, svfloat64_t, + z0 = svmax_f64_x (svptrue_b64 (), z0, z1), + z0 = svmax_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_max_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f64_x_tied2, svfloat64_t, + z0 = svmax_f64_x (svptrue_b64 (), z1, z0), + z0 = svmax_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_max_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_f64_x_untied, svfloat64_t, + z0 = svmax_f64_x (svptrue_b64 (), z1, z2), + z0 = svmax_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_max_0_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_0_f64_x_tied1, svfloat64_t, + z0 = svmax_n_f64_x (svptrue_b64 (), z0, 0), + z0 = svmax_x (svptrue_b64 (), z0, 0)) + +/* +** ptrue_max_0_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_max_0_f64_x_untied, svfloat64_t, + z0 = svmax_n_f64_x (svptrue_b64 (), z1, 0), + z0 = svmax_x (svptrue_b64 (), z1, 0)) + +/* +** ptrue_max_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_1_f64_x_tied1, svfloat64_t, + z0 = svmax_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svmax_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_max_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_1_f64_x_untied, svfloat64_t, + z0 = svmax_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svmax_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_max_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_2_f64_x_tied1, svfloat64_t, + z0 = svmax_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svmax_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_max_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_max_2_f64_x_untied, svfloat64_t, + z0 = svmax_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svmax_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s16.c new file mode 100644 index 000000000..6a2167522 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s16.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_s16_m_tied1: +** smax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_s16_m_tied1, svint16_t, + z0 = svmax_s16_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smax z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (max_s16_m_tied2, svint16_t, + z0 = svmax_s16_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_s16_m_untied: +** movprfx z0, z1 +** smax z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (max_s16_m_untied, svint16_t, + z0 = svmax_s16_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svmax_n_s16_m (p0, z0, x0), + z0 = svmax_m (p0, z0, x0)) + +/* +** max_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s16_m_untied, svint16_t, int16_t, + z0 = svmax_n_s16_m (p0, z1, x0), + z0 = svmax_m (p0, z1, x0)) + +/* +** max_1_s16_m_tied1: +** mov (z[0-9]+\.h), #1 +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s16_m_tied1, svint16_t, + z0 = svmax_n_s16_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s16_m_untied, svint16_t, + z0 = svmax_n_s16_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_m1_s16_m: +** mov (z[0-9]+)\.b, #-1 +** smax z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (max_m1_s16_m, svint16_t, + z0 = svmax_n_s16_m (p0, z0, -1), + z0 = svmax_m (p0, z0, -1)) + +/* +** max_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** smax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_s16_z_tied1, svint16_t, + z0 = svmax_s16_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** smax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_s16_z_tied2, svint16_t, 
+ z0 = svmax_s16_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** smax z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** smax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (max_s16_z_untied, svint16_t, + z0 = svmax_s16_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svmax_n_s16_z (p0, z0, x0), + z0 = svmax_z (p0, z0, x0)) + +/* +** max_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** smax z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** smax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s16_z_untied, svint16_t, int16_t, + z0 = svmax_n_s16_z (p0, z1, x0), + z0 = svmax_z (p0, z1, x0)) + +/* +** max_1_s16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s16_z_tied1, svint16_t, + z0 = svmax_n_s16_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_s16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** smax z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** smax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (max_1_s16_z_untied, svint16_t, + z0 = svmax_n_s16_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_s16_x_tied1: +** smax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_s16_x_tied1, svint16_t, + z0 = svmax_s16_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_s16_x_tied2: +** smax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_s16_x_tied2, svint16_t, + z0 = svmax_s16_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_s16_x_untied: +** ( +** movprfx z0, z1 +** smax z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** smax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (max_s16_x_untied, svint16_t, + z0 = svmax_s16_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svmax_n_s16_x (p0, z0, x0), + z0 = svmax_x (p0, z0, x0)) + +/* +** max_w0_s16_x_untied: +** mov z0\.h, w0 +** smax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s16_x_untied, svint16_t, int16_t, + z0 = svmax_n_s16_x (p0, z1, x0), + z0 = svmax_x (p0, z1, x0)) + +/* +** max_1_s16_x_tied1: +** smax z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s16_x_tied1, svint16_t, + z0 = svmax_n_s16_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_s16_x_untied: +** movprfx z0, z1 +** smax z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s16_x_untied, svint16_t, + z0 = svmax_n_s16_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_127_s16_x: +** smax z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (max_127_s16_x, svint16_t, + z0 = svmax_n_s16_x (p0, z0, 127), + z0 = svmax_x (p0, z0, 127)) + +/* +** max_128_s16_x: +** mov (z[0-9]+\.h), #128 +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_128_s16_x, svint16_t, + z0 = svmax_n_s16_x (p0, z0, 128), + z0 = svmax_x (p0, z0, 128)) + +/* +** max_m1_s16_x: +** smax z0\.h, z0\.h, #-1 +** ret +*/ +TEST_UNIFORM_Z (max_m1_s16_x, svint16_t, + z0 = svmax_n_s16_x (p0, z0, -1), + z0 = svmax_x (p0, z0, -1)) + +/* +** max_m128_s16_x: +** smax z0\.h, z0\.h, 
#-128 +** ret +*/ +TEST_UNIFORM_Z (max_m128_s16_x, svint16_t, + z0 = svmax_n_s16_x (p0, z0, -128), + z0 = svmax_x (p0, z0, -128)) + +/* +** max_m129_s16_x: +** mov (z[0-9]+\.h), #-129 +** smax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_m129_s16_x, svint16_t, + z0 = svmax_n_s16_x (p0, z0, -129), + z0 = svmax_x (p0, z0, -129)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s32.c new file mode 100644 index 000000000..07402c7a9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s32.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_s32_m_tied1: +** smax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_s32_m_tied1, svint32_t, + z0 = svmax_s32_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smax z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (max_s32_m_tied2, svint32_t, + z0 = svmax_s32_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_s32_m_untied: +** movprfx z0, z1 +** smax z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (max_s32_m_untied, svint32_t, + z0 = svmax_s32_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svmax_n_s32_m (p0, z0, x0), + z0 = svmax_m (p0, z0, x0)) + +/* +** max_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s32_m_untied, svint32_t, int32_t, + z0 = svmax_n_s32_m (p0, z1, x0), + z0 = svmax_m (p0, z1, x0)) + +/* +** max_1_s32_m_tied1: +** mov (z[0-9]+\.s), #1 +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s32_m_tied1, svint32_t, + z0 = svmax_n_s32_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s32_m_untied, svint32_t, + z0 = svmax_n_s32_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_m1_s32_m: +** mov (z[0-9]+)\.b, #-1 +** smax z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (max_m1_s32_m, svint32_t, + z0 = svmax_n_s32_m (p0, z0, -1), + z0 = svmax_m (p0, z0, -1)) + +/* +** max_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** smax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_s32_z_tied1, svint32_t, + z0 = svmax_s32_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** smax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_s32_z_tied2, svint32_t, + z0 = svmax_s32_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** smax z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** smax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (max_s32_z_untied, svint32_t, + z0 = svmax_s32_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svmax_n_s32_z (p0, z0, x0), + z0 = svmax_z (p0, z0, x0)) + +/* +** max_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** smax z0\.s, p0/m, z0\.s, \1 +** | +** movprfx 
z0\.s, p0/z, \1 +** smax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s32_z_untied, svint32_t, int32_t, + z0 = svmax_n_s32_z (p0, z1, x0), + z0 = svmax_z (p0, z1, x0)) + +/* +** max_1_s32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s32_z_tied1, svint32_t, + z0 = svmax_n_s32_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_s32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** smax z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** smax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (max_1_s32_z_untied, svint32_t, + z0 = svmax_n_s32_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_s32_x_tied1: +** smax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_s32_x_tied1, svint32_t, + z0 = svmax_s32_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_s32_x_tied2: +** smax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_s32_x_tied2, svint32_t, + z0 = svmax_s32_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_s32_x_untied: +** ( +** movprfx z0, z1 +** smax z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** smax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (max_s32_x_untied, svint32_t, + z0 = svmax_s32_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svmax_n_s32_x (p0, z0, x0), + z0 = svmax_x (p0, z0, x0)) + +/* +** max_w0_s32_x_untied: +** mov z0\.s, w0 +** smax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s32_x_untied, svint32_t, int32_t, + z0 = svmax_n_s32_x (p0, z1, x0), + z0 = svmax_x (p0, z1, x0)) + +/* +** max_1_s32_x_tied1: +** smax z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s32_x_tied1, svint32_t, + z0 = svmax_n_s32_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_s32_x_untied: +** movprfx z0, z1 +** smax z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s32_x_untied, svint32_t, + z0 = svmax_n_s32_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_127_s32_x: +** smax z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (max_127_s32_x, svint32_t, + z0 = svmax_n_s32_x (p0, z0, 127), + z0 = svmax_x (p0, z0, 127)) + +/* +** max_128_s32_x: +** mov (z[0-9]+\.s), #128 +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_128_s32_x, svint32_t, + z0 = svmax_n_s32_x (p0, z0, 128), + z0 = svmax_x (p0, z0, 128)) + +/* +** max_m1_s32_x: +** smax z0\.s, z0\.s, #-1 +** ret +*/ +TEST_UNIFORM_Z (max_m1_s32_x, svint32_t, + z0 = svmax_n_s32_x (p0, z0, -1), + z0 = svmax_x (p0, z0, -1)) + +/* +** max_m128_s32_x: +** smax z0\.s, z0\.s, #-128 +** ret +*/ +TEST_UNIFORM_Z (max_m128_s32_x, svint32_t, + z0 = svmax_n_s32_x (p0, z0, -128), + z0 = svmax_x (p0, z0, -128)) + +/* +** max_m129_s32_x: +** mov (z[0-9]+\.s), #-129 +** smax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_m129_s32_x, svint32_t, + z0 = svmax_n_s32_x (p0, z0, -129), + z0 = svmax_x (p0, z0, -129)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s64.c new file mode 100644 index 000000000..66f00fdf1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s64.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_s64_m_tied1: +** smax z0\.d, p0/m, 
z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_s64_m_tied1, svint64_t, + z0 = svmax_s64_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_s64_m_tied2, svint64_t, + z0 = svmax_s64_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_s64_m_untied: +** movprfx z0, z1 +** smax z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (max_s64_m_untied, svint64_t, + z0 = svmax_s64_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svmax_n_s64_m (p0, z0, x0), + z0 = svmax_m (p0, z0, x0)) + +/* +** max_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_x0_s64_m_untied, svint64_t, int64_t, + z0 = svmax_n_s64_m (p0, z1, x0), + z0 = svmax_m (p0, z1, x0)) + +/* +** max_1_s64_m_tied1: +** mov (z[0-9]+\.d), #1 +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s64_m_tied1, svint64_t, + z0 = svmax_n_s64_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s64_m_untied, svint64_t, + z0 = svmax_n_s64_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_m1_s64_m: +** mov (z[0-9]+)\.b, #-1 +** smax z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (max_m1_s64_m, svint64_t, + z0 = svmax_n_s64_m (p0, z0, -1), + z0 = svmax_m (p0, z0, -1)) + +/* +** max_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** smax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_s64_z_tied1, svint64_t, + z0 = svmax_s64_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** smax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_s64_z_tied2, svint64_t, + z0 = svmax_s64_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** smax z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** smax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (max_s64_z_untied, svint64_t, + z0 = svmax_s64_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svmax_n_s64_z (p0, z0, x0), + z0 = svmax_z (p0, z0, x0)) + +/* +** max_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** smax z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** smax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (max_x0_s64_z_untied, svint64_t, int64_t, + z0 = svmax_n_s64_z (p0, z1, x0), + z0 = svmax_z (p0, z1, x0)) + +/* +** max_1_s64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s64_z_tied1, svint64_t, + z0 = svmax_n_s64_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_s64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** smax z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** smax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (max_1_s64_z_untied, svint64_t, + z0 = svmax_n_s64_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** 
max_s64_x_tied1: +** smax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_s64_x_tied1, svint64_t, + z0 = svmax_s64_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_s64_x_tied2: +** smax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_s64_x_tied2, svint64_t, + z0 = svmax_s64_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_s64_x_untied: +** ( +** movprfx z0, z1 +** smax z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** smax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (max_s64_x_untied, svint64_t, + z0 = svmax_s64_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svmax_n_s64_x (p0, z0, x0), + z0 = svmax_x (p0, z0, x0)) + +/* +** max_x0_s64_x_untied: +** mov z0\.d, x0 +** smax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (max_x0_s64_x_untied, svint64_t, int64_t, + z0 = svmax_n_s64_x (p0, z1, x0), + z0 = svmax_x (p0, z1, x0)) + +/* +** max_1_s64_x_tied1: +** smax z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s64_x_tied1, svint64_t, + z0 = svmax_n_s64_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_s64_x_untied: +** movprfx z0, z1 +** smax z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s64_x_untied, svint64_t, + z0 = svmax_n_s64_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_127_s64_x: +** smax z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (max_127_s64_x, svint64_t, + z0 = svmax_n_s64_x (p0, z0, 127), + z0 = svmax_x (p0, z0, 127)) + +/* +** max_128_s64_x: +** mov (z[0-9]+\.d), #128 +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_128_s64_x, svint64_t, + z0 = svmax_n_s64_x (p0, z0, 128), + z0 = svmax_x (p0, z0, 128)) + +/* +** max_m1_s64_x: +** smax z0\.d, z0\.d, #-1 +** ret +*/ +TEST_UNIFORM_Z (max_m1_s64_x, svint64_t, + z0 = svmax_n_s64_x (p0, z0, -1), + z0 = svmax_x (p0, z0, -1)) + +/* +** max_m128_s64_x: +** smax z0\.d, z0\.d, #-128 +** ret +*/ +TEST_UNIFORM_Z (max_m128_s64_x, svint64_t, + z0 = svmax_n_s64_x (p0, z0, -128), + z0 = svmax_x (p0, z0, -128)) + +/* +** max_m129_s64_x: +** mov (z[0-9]+\.d), #-129 +** smax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_m129_s64_x, svint64_t, + z0 = svmax_n_s64_x (p0, z0, -129), + z0 = svmax_x (p0, z0, -129)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s8.c new file mode 100644 index 000000000..c651a26f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_s8.c @@ -0,0 +1,273 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_s8_m_tied1: +** smax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_s8_m_tied1, svint8_t, + z0 = svmax_s8_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smax z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (max_s8_m_tied2, svint8_t, + z0 = svmax_s8_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_s8_m_untied: +** movprfx z0, z1 +** smax z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (max_s8_m_untied, svint8_t, + z0 = svmax_s8_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** smax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svmax_n_s8_m (p0, z0, x0), + z0 = svmax_m (p0, z0, x0)) 
+ +/* +** max_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** smax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s8_m_untied, svint8_t, int8_t, + z0 = svmax_n_s8_m (p0, z1, x0), + z0 = svmax_m (p0, z1, x0)) + +/* +** max_1_s8_m_tied1: +** mov (z[0-9]+\.b), #1 +** smax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s8_m_tied1, svint8_t, + z0 = svmax_n_s8_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** smax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s8_m_untied, svint8_t, + z0 = svmax_n_s8_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_m1_s8_m: +** mov (z[0-9]+\.b), #-1 +** smax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (max_m1_s8_m, svint8_t, + z0 = svmax_n_s8_m (p0, z0, -1), + z0 = svmax_m (p0, z0, -1)) + +/* +** max_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** smax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_s8_z_tied1, svint8_t, + z0 = svmax_s8_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** smax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_s8_z_tied2, svint8_t, + z0 = svmax_s8_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** smax z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** smax z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (max_s8_z_untied, svint8_t, + z0 = svmax_s8_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** smax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svmax_n_s8_z (p0, z0, x0), + z0 = svmax_z (p0, z0, x0)) + +/* +** max_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** smax z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** smax z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s8_z_untied, svint8_t, int8_t, + z0 = svmax_n_s8_z (p0, z1, x0), + z0 = svmax_z (p0, z1, x0)) + +/* +** max_1_s8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** smax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s8_z_tied1, svint8_t, + z0 = svmax_n_s8_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_s8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** smax z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** smax z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (max_1_s8_z_untied, svint8_t, + z0 = svmax_n_s8_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_s8_x_tied1: +** smax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_s8_x_tied1, svint8_t, + z0 = svmax_s8_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_s8_x_tied2: +** smax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_s8_x_tied2, svint8_t, + z0 = svmax_s8_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_s8_x_untied: +** ( +** movprfx z0, z1 +** smax z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** smax z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (max_s8_x_untied, svint8_t, + z0 = svmax_s8_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** smax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svmax_n_s8_x (p0, z0, x0), + z0 = 
svmax_x (p0, z0, x0)) + +/* +** max_w0_s8_x_untied: +** mov z0\.b, w0 +** smax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (max_w0_s8_x_untied, svint8_t, int8_t, + z0 = svmax_n_s8_x (p0, z1, x0), + z0 = svmax_x (p0, z1, x0)) + +/* +** max_1_s8_x_tied1: +** smax z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s8_x_tied1, svint8_t, + z0 = svmax_n_s8_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_s8_x_untied: +** movprfx z0, z1 +** smax z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_s8_x_untied, svint8_t, + z0 = svmax_n_s8_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_127_s8_x: +** smax z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (max_127_s8_x, svint8_t, + z0 = svmax_n_s8_x (p0, z0, 127), + z0 = svmax_x (p0, z0, 127)) + +/* +** max_m1_s8_x: +** smax z0\.b, z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (max_m1_s8_x, svint8_t, + z0 = svmax_n_s8_x (p0, z0, -1), + z0 = svmax_x (p0, z0, -1)) + +/* +** max_m127_s8_x: +** smax z0\.b, z0\.b, #-127 +** ret +*/ +TEST_UNIFORM_Z (max_m127_s8_x, svint8_t, + z0 = svmax_n_s8_x (p0, z0, -127), + z0 = svmax_x (p0, z0, -127)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u16.c new file mode 100644 index 000000000..9a0b95431 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u16.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_u16_m_tied1: +** umax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_u16_m_tied1, svuint16_t, + z0 = svmax_u16_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umax z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (max_u16_m_tied2, svuint16_t, + z0 = svmax_u16_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_u16_m_untied: +** movprfx z0, z1 +** umax z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (max_u16_m_untied, svuint16_t, + z0 = svmax_u16_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svmax_n_u16_m (p0, z0, x0), + z0 = svmax_m (p0, z0, x0)) + +/* +** max_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svmax_n_u16_m (p0, z1, x0), + z0 = svmax_m (p0, z1, x0)) + +/* +** max_1_u16_m_tied1: +** mov (z[0-9]+\.h), #1 +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u16_m_tied1, svuint16_t, + z0 = svmax_n_u16_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u16_m_untied, svuint16_t, + z0 = svmax_n_u16_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_m1_u16_m: +** mov (z[0-9]+)\.b, #-1 +** umax z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (max_m1_u16_m, svuint16_t, + z0 = svmax_n_u16_m (p0, z0, -1), + z0 = svmax_m (p0, z0, -1)) + +/* +** max_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** umax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_u16_z_tied1, svuint16_t, + z0 = svmax_u16_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** umax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ 
+TEST_UNIFORM_Z (max_u16_z_tied2, svuint16_t, + z0 = svmax_u16_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** umax z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** umax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (max_u16_z_untied, svuint16_t, + z0 = svmax_u16_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svmax_n_u16_z (p0, z0, x0), + z0 = svmax_z (p0, z0, x0)) + +/* +** max_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** umax z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** umax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svmax_n_u16_z (p0, z1, x0), + z0 = svmax_z (p0, z1, x0)) + +/* +** max_1_u16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u16_z_tied1, svuint16_t, + z0 = svmax_n_u16_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_u16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** umax z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** umax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (max_1_u16_z_untied, svuint16_t, + z0 = svmax_n_u16_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_u16_x_tied1: +** umax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_u16_x_tied1, svuint16_t, + z0 = svmax_u16_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_u16_x_tied2: +** umax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (max_u16_x_tied2, svuint16_t, + z0 = svmax_u16_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_u16_x_untied: +** ( +** movprfx z0, z1 +** umax z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** umax z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (max_u16_x_untied, svuint16_t, + z0 = svmax_u16_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svmax_n_u16_x (p0, z0, x0), + z0 = svmax_x (p0, z0, x0)) + +/* +** max_w0_u16_x_untied: +** mov z0\.h, w0 +** umax z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svmax_n_u16_x (p0, z1, x0), + z0 = svmax_x (p0, z1, x0)) + +/* +** max_1_u16_x_tied1: +** umax z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u16_x_tied1, svuint16_t, + z0 = svmax_n_u16_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_u16_x_untied: +** movprfx z0, z1 +** umax z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u16_x_untied, svuint16_t, + z0 = svmax_n_u16_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_127_u16_x: +** umax z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (max_127_u16_x, svuint16_t, + z0 = svmax_n_u16_x (p0, z0, 127), + z0 = svmax_x (p0, z0, 127)) + +/* +** max_128_u16_x: +** umax z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (max_128_u16_x, svuint16_t, + z0 = svmax_n_u16_x (p0, z0, 128), + z0 = svmax_x (p0, z0, 128)) + +/* +** max_255_u16_x: +** umax z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (max_255_u16_x, svuint16_t, + z0 = svmax_n_u16_x (p0, z0, 255), + z0 = svmax_x (p0, z0, 255)) + +/* +** 
max_256_u16_x: +** mov (z[0-9]+\.h), #256 +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_256_u16_x, svuint16_t, + z0 = svmax_n_u16_x (p0, z0, 256), + z0 = svmax_x (p0, z0, 256)) + +/* +** max_m2_u16_x: +** mov (z[0-9]+\.h), #-2 +** umax z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (max_m2_u16_x, svuint16_t, + z0 = svmax_n_u16_x (p0, z0, -2), + z0 = svmax_x (p0, z0, -2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u32.c new file mode 100644 index 000000000..91eba25c1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u32.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_u32_m_tied1: +** umax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_u32_m_tied1, svuint32_t, + z0 = svmax_u32_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umax z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (max_u32_m_tied2, svuint32_t, + z0 = svmax_u32_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_u32_m_untied: +** movprfx z0, z1 +** umax z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (max_u32_m_untied, svuint32_t, + z0 = svmax_u32_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svmax_n_u32_m (p0, z0, x0), + z0 = svmax_m (p0, z0, x0)) + +/* +** max_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svmax_n_u32_m (p0, z1, x0), + z0 = svmax_m (p0, z1, x0)) + +/* +** max_1_u32_m_tied1: +** mov (z[0-9]+\.s), #1 +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u32_m_tied1, svuint32_t, + z0 = svmax_n_u32_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u32_m_untied, svuint32_t, + z0 = svmax_n_u32_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_m1_u32_m: +** mov (z[0-9]+)\.b, #-1 +** umax z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (max_m1_u32_m, svuint32_t, + z0 = svmax_n_u32_m (p0, z0, -1), + z0 = svmax_m (p0, z0, -1)) + +/* +** max_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** umax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_u32_z_tied1, svuint32_t, + z0 = svmax_u32_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** umax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_u32_z_tied2, svuint32_t, + z0 = svmax_u32_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** umax z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** umax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (max_u32_z_untied, svuint32_t, + z0 = svmax_u32_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svmax_n_u32_z (p0, z0, x0), + z0 = svmax_z (p0, z0, x0)) + +/* +** max_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx 
z0\.s, p0/z, z1\.s +** umax z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** umax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svmax_n_u32_z (p0, z1, x0), + z0 = svmax_z (p0, z1, x0)) + +/* +** max_1_u32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u32_z_tied1, svuint32_t, + z0 = svmax_n_u32_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_u32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** umax z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** umax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (max_1_u32_z_untied, svuint32_t, + z0 = svmax_n_u32_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_u32_x_tied1: +** umax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_u32_x_tied1, svuint32_t, + z0 = svmax_u32_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_u32_x_tied2: +** umax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (max_u32_x_tied2, svuint32_t, + z0 = svmax_u32_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_u32_x_untied: +** ( +** movprfx z0, z1 +** umax z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** umax z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (max_u32_x_untied, svuint32_t, + z0 = svmax_u32_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svmax_n_u32_x (p0, z0, x0), + z0 = svmax_x (p0, z0, x0)) + +/* +** max_w0_u32_x_untied: +** mov z0\.s, w0 +** umax z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svmax_n_u32_x (p0, z1, x0), + z0 = svmax_x (p0, z1, x0)) + +/* +** max_1_u32_x_tied1: +** umax z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u32_x_tied1, svuint32_t, + z0 = svmax_n_u32_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_u32_x_untied: +** movprfx z0, z1 +** umax z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u32_x_untied, svuint32_t, + z0 = svmax_n_u32_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_127_u32_x: +** umax z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (max_127_u32_x, svuint32_t, + z0 = svmax_n_u32_x (p0, z0, 127), + z0 = svmax_x (p0, z0, 127)) + +/* +** max_128_u32_x: +** umax z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (max_128_u32_x, svuint32_t, + z0 = svmax_n_u32_x (p0, z0, 128), + z0 = svmax_x (p0, z0, 128)) + +/* +** max_255_u32_x: +** umax z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (max_255_u32_x, svuint32_t, + z0 = svmax_n_u32_x (p0, z0, 255), + z0 = svmax_x (p0, z0, 255)) + +/* +** max_256_u32_x: +** mov (z[0-9]+\.s), #256 +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_256_u32_x, svuint32_t, + z0 = svmax_n_u32_x (p0, z0, 256), + z0 = svmax_x (p0, z0, 256)) + +/* +** max_m2_u32_x: +** mov (z[0-9]+\.s), #-2 +** umax z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (max_m2_u32_x, svuint32_t, + z0 = svmax_n_u32_x (p0, z0, -2), + z0 = svmax_x (p0, z0, -2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u64.c new file mode 100644 index 000000000..5be4c9fb7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u64.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + 
+#include "test_sve_acle.h" + +/* +** max_u64_m_tied1: +** umax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_u64_m_tied1, svuint64_t, + z0 = svmax_u64_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_u64_m_tied2, svuint64_t, + z0 = svmax_u64_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_u64_m_untied: +** movprfx z0, z1 +** umax z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (max_u64_m_untied, svuint64_t, + z0 = svmax_u64_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svmax_n_u64_m (p0, z0, x0), + z0 = svmax_m (p0, z0, x0)) + +/* +** max_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svmax_n_u64_m (p0, z1, x0), + z0 = svmax_m (p0, z1, x0)) + +/* +** max_1_u64_m_tied1: +** mov (z[0-9]+\.d), #1 +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u64_m_tied1, svuint64_t, + z0 = svmax_n_u64_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u64_m_untied, svuint64_t, + z0 = svmax_n_u64_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_m1_u64_m: +** mov (z[0-9]+)\.b, #-1 +** umax z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (max_m1_u64_m, svuint64_t, + z0 = svmax_n_u64_m (p0, z0, -1), + z0 = svmax_m (p0, z0, -1)) + +/* +** max_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** umax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_u64_z_tied1, svuint64_t, + z0 = svmax_u64_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** umax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_u64_z_tied2, svuint64_t, + z0 = svmax_u64_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** umax z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** umax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (max_u64_z_untied, svuint64_t, + z0 = svmax_u64_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svmax_n_u64_z (p0, z0, x0), + z0 = svmax_z (p0, z0, x0)) + +/* +** max_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** umax z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** umax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (max_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svmax_n_u64_z (p0, z1, x0), + z0 = svmax_z (p0, z1, x0)) + +/* +** max_1_u64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u64_z_tied1, svuint64_t, + z0 = svmax_n_u64_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_u64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** umax z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** umax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z 
(max_1_u64_z_untied, svuint64_t, + z0 = svmax_n_u64_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_u64_x_tied1: +** umax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_u64_x_tied1, svuint64_t, + z0 = svmax_u64_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_u64_x_tied2: +** umax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (max_u64_x_tied2, svuint64_t, + z0 = svmax_u64_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_u64_x_untied: +** ( +** movprfx z0, z1 +** umax z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** umax z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (max_u64_x_untied, svuint64_t, + z0 = svmax_u64_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svmax_n_u64_x (p0, z0, x0), + z0 = svmax_x (p0, z0, x0)) + +/* +** max_x0_u64_x_untied: +** mov z0\.d, x0 +** umax z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (max_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svmax_n_u64_x (p0, z1, x0), + z0 = svmax_x (p0, z1, x0)) + +/* +** max_1_u64_x_tied1: +** umax z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u64_x_tied1, svuint64_t, + z0 = svmax_n_u64_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_u64_x_untied: +** movprfx z0, z1 +** umax z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u64_x_untied, svuint64_t, + z0 = svmax_n_u64_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_127_u64_x: +** umax z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (max_127_u64_x, svuint64_t, + z0 = svmax_n_u64_x (p0, z0, 127), + z0 = svmax_x (p0, z0, 127)) + +/* +** max_128_u64_x: +** umax z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (max_128_u64_x, svuint64_t, + z0 = svmax_n_u64_x (p0, z0, 128), + z0 = svmax_x (p0, z0, 128)) + +/* +** max_255_u64_x: +** umax z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (max_255_u64_x, svuint64_t, + z0 = svmax_n_u64_x (p0, z0, 255), + z0 = svmax_x (p0, z0, 255)) + +/* +** max_256_u64_x: +** mov (z[0-9]+\.d), #256 +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_256_u64_x, svuint64_t, + z0 = svmax_n_u64_x (p0, z0, 256), + z0 = svmax_x (p0, z0, 256)) + +/* +** max_m2_u64_x: +** mov (z[0-9]+\.d), #-2 +** umax z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (max_m2_u64_x, svuint64_t, + z0 = svmax_n_u64_x (p0, z0, -2), + z0 = svmax_x (p0, z0, -2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u8.c new file mode 100644 index 000000000..04c9ddb36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/max_u8.c @@ -0,0 +1,273 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** max_u8_m_tied1: +** umax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_u8_m_tied1, svuint8_t, + z0 = svmax_u8_m (p0, z0, z1), + z0 = svmax_m (p0, z0, z1)) + +/* +** max_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umax z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (max_u8_m_tied2, svuint8_t, + z0 = svmax_u8_m (p0, z1, z0), + z0 = svmax_m (p0, z1, z0)) + +/* +** max_u8_m_untied: +** movprfx z0, z1 +** umax z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (max_u8_m_untied, svuint8_t, + z0 = svmax_u8_m (p0, z1, z2), + z0 = svmax_m (p0, z1, z2)) + +/* +** max_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** umax z0\.b, p0/m, z0\.b, \1 +** ret +*/ 
+TEST_UNIFORM_ZX (max_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svmax_n_u8_m (p0, z0, x0), + z0 = svmax_m (p0, z0, x0)) + +/* +** max_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** umax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svmax_n_u8_m (p0, z1, x0), + z0 = svmax_m (p0, z1, x0)) + +/* +** max_1_u8_m_tied1: +** mov (z[0-9]+\.b), #1 +** umax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u8_m_tied1, svuint8_t, + z0 = svmax_n_u8_m (p0, z0, 1), + z0 = svmax_m (p0, z0, 1)) + +/* +** max_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** umax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u8_m_untied, svuint8_t, + z0 = svmax_n_u8_m (p0, z1, 1), + z0 = svmax_m (p0, z1, 1)) + +/* +** max_m1_u8_m: +** mov (z[0-9]+\.b), #-1 +** umax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (max_m1_u8_m, svuint8_t, + z0 = svmax_n_u8_m (p0, z0, -1), + z0 = svmax_m (p0, z0, -1)) + +/* +** max_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** umax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_u8_z_tied1, svuint8_t, + z0 = svmax_u8_z (p0, z0, z1), + z0 = svmax_z (p0, z0, z1)) + +/* +** max_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** umax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_u8_z_tied2, svuint8_t, + z0 = svmax_u8_z (p0, z1, z0), + z0 = svmax_z (p0, z1, z0)) + +/* +** max_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** umax z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** umax z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (max_u8_z_untied, svuint8_t, + z0 = svmax_u8_z (p0, z1, z2), + z0 = svmax_z (p0, z1, z2)) + +/* +** max_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** umax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svmax_n_u8_z (p0, z0, x0), + z0 = svmax_z (p0, z0, x0)) + +/* +** max_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** umax z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** umax z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svmax_n_u8_z (p0, z1, x0), + z0 = svmax_z (p0, z1, x0)) + +/* +** max_1_u8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** umax z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u8_z_tied1, svuint8_t, + z0 = svmax_n_u8_z (p0, z0, 1), + z0 = svmax_z (p0, z0, 1)) + +/* +** max_1_u8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** umax z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** umax z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (max_1_u8_z_untied, svuint8_t, + z0 = svmax_n_u8_z (p0, z1, 1), + z0 = svmax_z (p0, z1, 1)) + +/* +** max_u8_x_tied1: +** umax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_u8_x_tied1, svuint8_t, + z0 = svmax_u8_x (p0, z0, z1), + z0 = svmax_x (p0, z0, z1)) + +/* +** max_u8_x_tied2: +** umax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (max_u8_x_tied2, svuint8_t, + z0 = svmax_u8_x (p0, z1, z0), + z0 = svmax_x (p0, z1, z0)) + +/* +** max_u8_x_untied: +** ( +** movprfx z0, z1 +** umax z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** umax z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (max_u8_x_untied, svuint8_t, + z0 = svmax_u8_x (p0, z1, z2), + z0 = svmax_x (p0, z1, z2)) + +/* +** max_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** umax 
z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svmax_n_u8_x (p0, z0, x0), + z0 = svmax_x (p0, z0, x0)) + +/* +** max_w0_u8_x_untied: +** mov z0\.b, w0 +** umax z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (max_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svmax_n_u8_x (p0, z1, x0), + z0 = svmax_x (p0, z1, x0)) + +/* +** max_1_u8_x_tied1: +** umax z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u8_x_tied1, svuint8_t, + z0 = svmax_n_u8_x (p0, z0, 1), + z0 = svmax_x (p0, z0, 1)) + +/* +** max_1_u8_x_untied: +** movprfx z0, z1 +** umax z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (max_1_u8_x_untied, svuint8_t, + z0 = svmax_n_u8_x (p0, z1, 1), + z0 = svmax_x (p0, z1, 1)) + +/* +** max_127_u8_x: +** umax z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (max_127_u8_x, svuint8_t, + z0 = svmax_n_u8_x (p0, z0, 127), + z0 = svmax_x (p0, z0, 127)) + +/* +** max_128_u8_x: +** umax z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (max_128_u8_x, svuint8_t, + z0 = svmax_n_u8_x (p0, z0, 128), + z0 = svmax_x (p0, z0, 128)) + +/* +** max_254_u8_x: +** umax z0\.b, z0\.b, #254 +** ret +*/ +TEST_UNIFORM_Z (max_254_u8_x, svuint8_t, + z0 = svmax_n_u8_x (p0, z0, 254), + z0 = svmax_x (p0, z0, 254)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f16.c new file mode 100644 index 000000000..a9da710d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f16.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxnm_f16_m_tied1: +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_m_tied1, svfloat16_t, + z0 = svmaxnm_f16_m (p0, z0, z1), + z0 = svmaxnm_m (p0, z0, z1)) + +/* +** maxnm_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmaxnm z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_m_tied2, svfloat16_t, + z0 = svmaxnm_f16_m (p0, z1, z0), + z0 = svmaxnm_m (p0, z1, z0)) + +/* +** maxnm_f16_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_m_untied, svfloat16_t, + z0 = svmaxnm_f16_m (p0, z1, z2), + z0 = svmaxnm_m (p0, z1, z2)) + +/* +** maxnm_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmaxnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmaxnm_n_f16_m (p0, z0, d4), + z0 = svmaxnm_m (p0, z0, d4)) + +/* +** maxnm_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fmaxnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmaxnm_n_f16_m (p0, z1, d4), + z0 = svmaxnm_m (p0, z1, d4)) + +/* +** maxnm_0_f16_m_tied1: +** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f16_m_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_m (p0, z0, 0), + z0 = svmaxnm_m (p0, z0, 0)) + +/* +** maxnm_0_f16_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f16_m_untied, svfloat16_t, + z0 = svmaxnm_n_f16_m (p0, z1, 0), + z0 = svmaxnm_m (p0, z1, 0)) + +/* +** maxnm_1_f16_m_tied1: +** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f16_m_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_m (p0, z0, 1), + z0 = svmaxnm_m (p0, z0, 1)) + +/* +** maxnm_1_f16_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f16_m_untied, svfloat16_t, + z0 = svmaxnm_n_f16_m (p0, z1, 
1), + z0 = svmaxnm_m (p0, z1, 1)) + +/* +** maxnm_2_f16_m: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmaxnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f16_m, svfloat16_t, + z0 = svmaxnm_n_f16_m (p0, z0, 2), + z0 = svmaxnm_m (p0, z0, 2)) + +/* +** maxnm_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_z_tied1, svfloat16_t, + z0 = svmaxnm_f16_z (p0, z0, z1), + z0 = svmaxnm_z (p0, z0, z1)) + +/* +** maxnm_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_z_tied2, svfloat16_t, + z0 = svmaxnm_f16_z (p0, z1, z0), + z0 = svmaxnm_z (p0, z1, z0)) + +/* +** maxnm_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmaxnm z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_z_untied, svfloat16_t, + z0 = svmaxnm_f16_z (p0, z1, z2), + z0 = svmaxnm_z (p0, z1, z2)) + +/* +** maxnm_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmaxnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmaxnm_n_f16_z (p0, z0, d4), + z0 = svmaxnm_z (p0, z0, d4)) + +/* +** maxnm_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmaxnm z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (maxnm_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmaxnm_n_f16_z (p0, z1, d4), + z0 = svmaxnm_z (p0, z1, d4)) + +/* +** maxnm_0_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f16_z_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_z (p0, z0, 0), + z0 = svmaxnm_z (p0, z0, 0)) + +/* +** maxnm_0_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f16_z_untied, svfloat16_t, + z0 = svmaxnm_n_f16_z (p0, z1, 0), + z0 = svmaxnm_z (p0, z1, 0)) + +/* +** maxnm_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f16_z_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_z (p0, z0, 1), + z0 = svmaxnm_z (p0, z0, 1)) + +/* +** maxnm_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f16_z_untied, svfloat16_t, + z0 = svmaxnm_n_f16_z (p0, z1, 1), + z0 = svmaxnm_z (p0, z1, 1)) + +/* +** maxnm_2_f16_z: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fmaxnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f16_z, svfloat16_t, + z0 = svmaxnm_n_f16_z (p0, z0, 2), + z0 = svmaxnm_z (p0, z0, 2)) + +/* +** maxnm_f16_x_tied1: +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_x_tied1, svfloat16_t, + z0 = svmaxnm_f16_x (p0, z0, z1), + z0 = svmaxnm_x (p0, z0, z1)) + +/* +** maxnm_f16_x_tied2: +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_x_tied2, svfloat16_t, + z0 = svmaxnm_f16_x (p0, z1, z0), + z0 = svmaxnm_x (p0, z1, z0)) + +/* +** maxnm_f16_x_untied: +** ( +** movprfx z0, z1 +** fmaxnm z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (maxnm_f16_x_untied, svfloat16_t, + z0 = svmaxnm_f16_x (p0, z1, z2), + z0 = svmaxnm_x (p0, z1, z2)) + +/* +** maxnm_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmaxnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmaxnm_n_f16_x (p0, z0, d4), + z0 = svmaxnm_x (p0, z0, d4)) + +/* +** maxnm_h4_f16_x_untied: +** mov z0\.h, h4 +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (maxnm_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmaxnm_n_f16_x (p0, z1, d4), + z0 = svmaxnm_x (p0, z1, d4)) + +/* +** maxnm_0_f16_x_tied1: +** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f16_x_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_x (p0, z0, 0), + z0 = svmaxnm_x (p0, z0, 0)) + +/* +** maxnm_0_f16_x_untied: +** movprfx z0, z1 +** fmaxnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f16_x_untied, svfloat16_t, + z0 = svmaxnm_n_f16_x (p0, z1, 0), + z0 = svmaxnm_x (p0, z1, 0)) + +/* +** maxnm_1_f16_x_tied1: +** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f16_x_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_x (p0, z0, 1), + z0 = svmaxnm_x (p0, z0, 1)) + +/* +** maxnm_1_f16_x_untied: +** movprfx z0, z1 +** fmaxnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f16_x_untied, svfloat16_t, + z0 = svmaxnm_n_f16_x (p0, z1, 1), + z0 = svmaxnm_x (p0, z1, 1)) + +/* +** maxnm_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmaxnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f16_x_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_x (p0, z0, 2), + z0 = svmaxnm_x (p0, z0, 2)) + +/* +** maxnm_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fmaxnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f16_x_untied, svfloat16_t, + z0 = svmaxnm_n_f16_x (p0, z1, 2), + z0 = svmaxnm_x (p0, z1, 2)) + +/* +** ptrue_maxnm_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f16_x_tied1, svfloat16_t, + z0 = svmaxnm_f16_x (svptrue_b16 (), z0, z1), + z0 = svmaxnm_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_maxnm_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f16_x_tied2, svfloat16_t, + z0 = svmaxnm_f16_x (svptrue_b16 (), z1, z0), + z0 = svmaxnm_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_maxnm_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f16_x_untied, svfloat16_t, + z0 = svmaxnm_f16_x (svptrue_b16 (), z1, z2), + z0 = svmaxnm_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_maxnm_0_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_0_f16_x_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_x (svptrue_b16 (), z0, 0), + z0 = svmaxnm_x (svptrue_b16 (), z0, 0)) + +/* +** ptrue_maxnm_0_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_0_f16_x_untied, svfloat16_t, + z0 = svmaxnm_n_f16_x (svptrue_b16 (), z1, 0), + z0 = svmaxnm_x (svptrue_b16 (), z1, 0)) + +/* +** ptrue_maxnm_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_1_f16_x_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svmaxnm_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_maxnm_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_1_f16_x_untied, svfloat16_t, + z0 = svmaxnm_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svmaxnm_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_maxnm_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_2_f16_x_tied1, svfloat16_t, + z0 = svmaxnm_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svmaxnm_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_maxnm_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_2_f16_x_untied, svfloat16_t, + z0 = svmaxnm_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svmaxnm_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f32.c new file mode 100644 index 000000000..4657d57c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f32.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxnm_f32_m_tied1: +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_m_tied1, svfloat32_t, + z0 = svmaxnm_f32_m (p0, z0, z1), + z0 = svmaxnm_m (p0, z0, z1)) + +/* +** maxnm_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmaxnm z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_m_tied2, svfloat32_t, + z0 = svmaxnm_f32_m (p0, z1, z0), + z0 = svmaxnm_m (p0, z1, z0)) + +/* +** maxnm_f32_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_m_untied, svfloat32_t, + z0 = svmaxnm_f32_m (p0, z1, z2), + z0 = svmaxnm_m (p0, z1, z2)) + +/* +** maxnm_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmaxnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmaxnm_n_f32_m (p0, z0, d4), + z0 = svmaxnm_m (p0, z0, d4)) + +/* +** maxnm_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmaxnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_s4_f32_m_untied, svfloat32_t, float, + z0 = svmaxnm_n_f32_m (p0, z1, d4), + z0 = svmaxnm_m (p0, z1, d4)) + +/* +** maxnm_0_f32_m_tied1: +** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f32_m_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_m (p0, z0, 0), + z0 = svmaxnm_m (p0, z0, 0)) + +/* +** maxnm_0_f32_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f32_m_untied, svfloat32_t, + z0 = svmaxnm_n_f32_m (p0, z1, 0), + z0 = svmaxnm_m (p0, z1, 0)) + +/* +** maxnm_1_f32_m_tied1: +** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f32_m_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_m (p0, z0, 1), + z0 = svmaxnm_m (p0, z0, 1)) + +/* +** maxnm_1_f32_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.s, p0/m, 
z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f32_m_untied, svfloat32_t, + z0 = svmaxnm_n_f32_m (p0, z1, 1), + z0 = svmaxnm_m (p0, z1, 1)) + +/* +** maxnm_2_f32_m: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmaxnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f32_m, svfloat32_t, + z0 = svmaxnm_n_f32_m (p0, z0, 2), + z0 = svmaxnm_m (p0, z0, 2)) + +/* +** maxnm_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_z_tied1, svfloat32_t, + z0 = svmaxnm_f32_z (p0, z0, z1), + z0 = svmaxnm_z (p0, z0, z1)) + +/* +** maxnm_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_z_tied2, svfloat32_t, + z0 = svmaxnm_f32_z (p0, z1, z0), + z0 = svmaxnm_z (p0, z1, z0)) + +/* +** maxnm_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmaxnm z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_z_untied, svfloat32_t, + z0 = svmaxnm_f32_z (p0, z1, z2), + z0 = svmaxnm_z (p0, z1, z2)) + +/* +** maxnm_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmaxnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmaxnm_n_f32_z (p0, z0, d4), + z0 = svmaxnm_z (p0, z0, d4)) + +/* +** maxnm_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmaxnm z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (maxnm_s4_f32_z_untied, svfloat32_t, float, + z0 = svmaxnm_n_f32_z (p0, z1, d4), + z0 = svmaxnm_z (p0, z1, d4)) + +/* +** maxnm_0_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f32_z_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_z (p0, z0, 0), + z0 = svmaxnm_z (p0, z0, 0)) + +/* +** maxnm_0_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f32_z_untied, svfloat32_t, + z0 = svmaxnm_n_f32_z (p0, z1, 0), + z0 = svmaxnm_z (p0, z1, 0)) + +/* +** maxnm_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f32_z_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_z (p0, z0, 1), + z0 = svmaxnm_z (p0, z0, 1)) + +/* +** maxnm_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f32_z_untied, svfloat32_t, + z0 = svmaxnm_n_f32_z (p0, z1, 1), + z0 = svmaxnm_z (p0, z1, 1)) + +/* +** maxnm_2_f32_z: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fmaxnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f32_z, svfloat32_t, + z0 = svmaxnm_n_f32_z (p0, z0, 2), + z0 = svmaxnm_z (p0, z0, 2)) + +/* +** maxnm_f32_x_tied1: +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_x_tied1, svfloat32_t, + z0 = svmaxnm_f32_x (p0, z0, z1), + z0 = svmaxnm_x (p0, z0, z1)) + +/* +** maxnm_f32_x_tied2: +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_x_tied2, svfloat32_t, + z0 = svmaxnm_f32_x (p0, z1, z0), + z0 = svmaxnm_x (p0, z1, z0)) + +/* +** maxnm_f32_x_untied: +** ( +** movprfx z0, z1 +** fmaxnm z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (maxnm_f32_x_untied, svfloat32_t, + z0 = svmaxnm_f32_x (p0, z1, z2), + z0 = svmaxnm_x (p0, z1, z2)) + +/* +** maxnm_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmaxnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmaxnm_n_f32_x (p0, z0, d4), + z0 = svmaxnm_x (p0, z0, d4)) + +/* +** maxnm_s4_f32_x_untied: +** mov z0\.s, s4 +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (maxnm_s4_f32_x_untied, svfloat32_t, float, + z0 = svmaxnm_n_f32_x (p0, z1, d4), + z0 = svmaxnm_x (p0, z1, d4)) + +/* +** maxnm_0_f32_x_tied1: +** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f32_x_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_x (p0, z0, 0), + z0 = svmaxnm_x (p0, z0, 0)) + +/* +** maxnm_0_f32_x_untied: +** movprfx z0, z1 +** fmaxnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f32_x_untied, svfloat32_t, + z0 = svmaxnm_n_f32_x (p0, z1, 0), + z0 = svmaxnm_x (p0, z1, 0)) + +/* +** maxnm_1_f32_x_tied1: +** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f32_x_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_x (p0, z0, 1), + z0 = svmaxnm_x (p0, z0, 1)) + +/* +** maxnm_1_f32_x_untied: +** movprfx z0, z1 +** fmaxnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f32_x_untied, svfloat32_t, + z0 = svmaxnm_n_f32_x (p0, z1, 1), + z0 = svmaxnm_x (p0, z1, 1)) + +/* +** maxnm_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmaxnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f32_x_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_x (p0, z0, 2), + z0 = svmaxnm_x (p0, z0, 2)) + +/* +** maxnm_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fmaxnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f32_x_untied, svfloat32_t, + z0 = svmaxnm_n_f32_x (p0, z1, 2), + z0 = svmaxnm_x (p0, z1, 2)) + +/* +** ptrue_maxnm_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f32_x_tied1, svfloat32_t, + z0 = svmaxnm_f32_x (svptrue_b32 (), z0, z1), + z0 = svmaxnm_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_maxnm_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f32_x_tied2, svfloat32_t, + z0 = svmaxnm_f32_x (svptrue_b32 (), z1, z0), + z0 = svmaxnm_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_maxnm_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f32_x_untied, svfloat32_t, + z0 = svmaxnm_f32_x (svptrue_b32 (), z1, z2), + z0 = svmaxnm_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_maxnm_0_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_0_f32_x_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_x (svptrue_b32 (), z0, 0), + z0 = svmaxnm_x (svptrue_b32 (), z0, 0)) + +/* +** ptrue_maxnm_0_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_0_f32_x_untied, svfloat32_t, + z0 = svmaxnm_n_f32_x (svptrue_b32 (), z1, 0), + z0 = svmaxnm_x (svptrue_b32 (), z1, 0)) + +/* +** ptrue_maxnm_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_1_f32_x_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svmaxnm_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_maxnm_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_1_f32_x_untied, svfloat32_t, + z0 = svmaxnm_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svmaxnm_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_maxnm_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_2_f32_x_tied1, svfloat32_t, + z0 = svmaxnm_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svmaxnm_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_maxnm_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_2_f32_x_untied, svfloat32_t, + z0 = svmaxnm_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svmaxnm_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f64.c new file mode 100644 index 000000000..07d88e6c1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnm_f64.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxnm_f64_m_tied1: +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_m_tied1, svfloat64_t, + z0 = svmaxnm_f64_m (p0, z0, z1), + z0 = svmaxnm_m (p0, z0, z1)) + +/* +** maxnm_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_m_tied2, svfloat64_t, + z0 = svmaxnm_f64_m (p0, z1, z0), + z0 = svmaxnm_m (p0, z1, z0)) + +/* +** maxnm_f64_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_m_untied, svfloat64_t, + z0 = svmaxnm_f64_m (p0, z1, z2), + z0 = svmaxnm_m (p0, z1, z2)) + +/* +** maxnm_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmaxnm_n_f64_m (p0, z0, d4), + z0 = svmaxnm_m (p0, z0, d4)) + +/* +** maxnm_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_d4_f64_m_untied, svfloat64_t, double, + z0 = svmaxnm_n_f64_m (p0, z1, d4), + z0 = svmaxnm_m (p0, z1, d4)) + +/* +** maxnm_0_f64_m_tied1: +** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f64_m_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_m (p0, z0, 0), + z0 = svmaxnm_m (p0, z0, 0)) + +/* +** maxnm_0_f64_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f64_m_untied, svfloat64_t, + z0 = svmaxnm_n_f64_m (p0, z1, 0), + z0 = svmaxnm_m (p0, z1, 0)) + +/* +** maxnm_1_f64_m_tied1: +** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f64_m_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_m (p0, z0, 1), + z0 = svmaxnm_m (p0, z0, 1)) + +/* +** maxnm_1_f64_m_untied: +** movprfx z0, z1 +** fmaxnm z0\.d, p0/m, 
z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f64_m_untied, svfloat64_t, + z0 = svmaxnm_n_f64_m (p0, z1, 1), + z0 = svmaxnm_m (p0, z1, 1)) + +/* +** maxnm_2_f64_m: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f64_m, svfloat64_t, + z0 = svmaxnm_n_f64_m (p0, z0, 2), + z0 = svmaxnm_m (p0, z0, 2)) + +/* +** maxnm_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_z_tied1, svfloat64_t, + z0 = svmaxnm_f64_z (p0, z0, z1), + z0 = svmaxnm_z (p0, z0, z1)) + +/* +** maxnm_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_z_tied2, svfloat64_t, + z0 = svmaxnm_f64_z (p0, z1, z0), + z0 = svmaxnm_z (p0, z1, z0)) + +/* +** maxnm_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmaxnm z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_z_untied, svfloat64_t, + z0 = svmaxnm_f64_z (p0, z1, z2), + z0 = svmaxnm_z (p0, z1, z2)) + +/* +** maxnm_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmaxnm_n_f64_z (p0, z0, d4), + z0 = svmaxnm_z (p0, z0, d4)) + +/* +** maxnm_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (maxnm_d4_f64_z_untied, svfloat64_t, double, + z0 = svmaxnm_n_f64_z (p0, z1, d4), + z0 = svmaxnm_z (p0, z1, d4)) + +/* +** maxnm_0_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f64_z_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_z (p0, z0, 0), + z0 = svmaxnm_z (p0, z0, 0)) + +/* +** maxnm_0_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f64_z_untied, svfloat64_t, + z0 = svmaxnm_n_f64_z (p0, z1, 0), + z0 = svmaxnm_z (p0, z1, 0)) + +/* +** maxnm_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f64_z_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_z (p0, z0, 1), + z0 = svmaxnm_z (p0, z0, 1)) + +/* +** maxnm_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f64_z_untied, svfloat64_t, + z0 = svmaxnm_n_f64_z (p0, z1, 1), + z0 = svmaxnm_z (p0, z1, 1)) + +/* +** maxnm_2_f64_z: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f64_z, svfloat64_t, + z0 = svmaxnm_n_f64_z (p0, z0, 2), + z0 = svmaxnm_z (p0, z0, 2)) + +/* +** maxnm_f64_x_tied1: +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_x_tied1, svfloat64_t, + z0 = svmaxnm_f64_x (p0, z0, z1), + z0 = svmaxnm_x (p0, z0, z1)) + +/* +** maxnm_f64_x_tied2: +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_x_tied2, svfloat64_t, + z0 = svmaxnm_f64_x (p0, z1, z0), + z0 = svmaxnm_x (p0, z1, z0)) + +/* +** maxnm_f64_x_untied: +** ( +** movprfx z0, z1 +** fmaxnm z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (maxnm_f64_x_untied, svfloat64_t, + z0 = svmaxnm_f64_x (p0, z1, z2), + z0 = svmaxnm_x (p0, z1, z2)) + +/* +** maxnm_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (maxnm_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmaxnm_n_f64_x (p0, z0, d4), + z0 = svmaxnm_x (p0, z0, d4)) + +/* +** maxnm_d4_f64_x_untied: +** mov z0\.d, d4 +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (maxnm_d4_f64_x_untied, svfloat64_t, double, + z0 = svmaxnm_n_f64_x (p0, z1, d4), + z0 = svmaxnm_x (p0, z1, d4)) + +/* +** maxnm_0_f64_x_tied1: +** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f64_x_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_x (p0, z0, 0), + z0 = svmaxnm_x (p0, z0, 0)) + +/* +** maxnm_0_f64_x_untied: +** movprfx z0, z1 +** fmaxnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_0_f64_x_untied, svfloat64_t, + z0 = svmaxnm_n_f64_x (p0, z1, 0), + z0 = svmaxnm_x (p0, z1, 0)) + +/* +** maxnm_1_f64_x_tied1: +** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f64_x_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_x (p0, z0, 1), + z0 = svmaxnm_x (p0, z0, 1)) + +/* +** maxnm_1_f64_x_untied: +** movprfx z0, z1 +** fmaxnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (maxnm_1_f64_x_untied, svfloat64_t, + z0 = svmaxnm_n_f64_x (p0, z1, 1), + z0 = svmaxnm_x (p0, z1, 1)) + +/* +** maxnm_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmaxnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f64_x_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_x (p0, z0, 2), + z0 = svmaxnm_x (p0, z0, 2)) + +/* +** maxnm_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fmaxnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (maxnm_2_f64_x_untied, svfloat64_t, + z0 = svmaxnm_n_f64_x (p0, z1, 2), + z0 = svmaxnm_x (p0, z1, 2)) + +/* +** ptrue_maxnm_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f64_x_tied1, svfloat64_t, + z0 = svmaxnm_f64_x (svptrue_b64 (), z0, z1), + z0 = svmaxnm_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_maxnm_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f64_x_tied2, svfloat64_t, + z0 = svmaxnm_f64_x (svptrue_b64 (), z1, z0), + z0 = svmaxnm_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_maxnm_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_f64_x_untied, svfloat64_t, + z0 = svmaxnm_f64_x (svptrue_b64 (), z1, z2), + z0 = svmaxnm_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_maxnm_0_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_0_f64_x_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_x (svptrue_b64 (), z0, 0), + z0 = svmaxnm_x (svptrue_b64 (), z0, 0)) + +/* +** ptrue_maxnm_0_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_0_f64_x_untied, svfloat64_t, + z0 = svmaxnm_n_f64_x (svptrue_b64 (), z1, 0), + z0 = svmaxnm_x (svptrue_b64 (), z1, 0)) + +/* +** ptrue_maxnm_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_1_f64_x_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svmaxnm_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_maxnm_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_1_f64_x_untied, svfloat64_t, + z0 = svmaxnm_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svmaxnm_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_maxnm_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_2_f64_x_tied1, svfloat64_t, + z0 = svmaxnm_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svmaxnm_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_maxnm_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_maxnm_2_f64_x_untied, svfloat64_t, + z0 = svmaxnm_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svmaxnm_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f16.c new file mode 100644 index 000000000..086bcf974 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxnmv_d0_f16_tied: +** fmaxnmv h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (maxnmv_d0_f16_tied, float16_t, svfloat16_t, + d0 = svmaxnmv_f16 (p0, z0), + d0 = svmaxnmv (p0, z0)) + +/* +** maxnmv_d0_f16_untied: +** fmaxnmv h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (maxnmv_d0_f16_untied, float16_t, svfloat16_t, + d0 = svmaxnmv_f16 (p0, z1), + d0 = svmaxnmv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f32.c new file mode 100644 index 000000000..7fca8bc9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxnmv_d0_f32_tied: +** fmaxnmv s0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_D (maxnmv_d0_f32_tied, float32_t, svfloat32_t, + d0 = svmaxnmv_f32 (p0, z0), + d0 = svmaxnmv (p0, z0)) + +/* +** maxnmv_d0_f32_untied: +** fmaxnmv s0, p0, z1\.s +** ret +*/ +TEST_REDUCTION_D (maxnmv_d0_f32_untied, float32_t, svfloat32_t, + d0 = svmaxnmv_f32 (p0, z1), + d0 = svmaxnmv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f64.c new file mode 100644 index 000000000..8b0884479 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxnmv_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxnmv_d0_f64_tied: +** fmaxnmv d0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_D (maxnmv_d0_f64_tied, float64_t, svfloat64_t, + d0 = svmaxnmv_f64 (p0, z0), + d0 = svmaxnmv (p0, z0)) + +/* +** maxnmv_d0_f64_untied: +** fmaxnmv d0, p0, z1\.d +** ret +*/ +TEST_REDUCTION_D (maxnmv_d0_f64_untied, float64_t, 
svfloat64_t, + d0 = svmaxnmv_f64 (p0, z1), + d0 = svmaxnmv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f16.c new file mode 100644 index 000000000..a16823987 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_d0_f16_tied: +** fmaxv h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (maxv_d0_f16_tied, float16_t, svfloat16_t, + d0 = svmaxv_f16 (p0, z0), + d0 = svmaxv (p0, z0)) + +/* +** maxv_d0_f16_untied: +** fmaxv h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (maxv_d0_f16_untied, float16_t, svfloat16_t, + d0 = svmaxv_f16 (p0, z1), + d0 = svmaxv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f32.c new file mode 100644 index 000000000..64e5edfef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_d0_f32_tied: +** fmaxv s0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_D (maxv_d0_f32_tied, float32_t, svfloat32_t, + d0 = svmaxv_f32 (p0, z0), + d0 = svmaxv (p0, z0)) + +/* +** maxv_d0_f32_untied: +** fmaxv s0, p0, z1\.s +** ret +*/ +TEST_REDUCTION_D (maxv_d0_f32_untied, float32_t, svfloat32_t, + d0 = svmaxv_f32 (p0, z1), + d0 = svmaxv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f64.c new file mode 100644 index 000000000..837d6dfdc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_d0_f64_tied: +** fmaxv d0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_D (maxv_d0_f64_tied, float64_t, svfloat64_t, + d0 = svmaxv_f64 (p0, z0), + d0 = svmaxv (p0, z0)) + +/* +** maxv_d0_f64_untied: +** fmaxv d0, p0, z1\.d +** ret +*/ +TEST_REDUCTION_D (maxv_d0_f64_untied, float64_t, svfloat64_t, + d0 = svmaxv_f64 (p0, z1), + d0 = svmaxv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s16.c new file mode 100644 index 000000000..bbf36a110 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_x0_s16: +** smaxv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (maxv_x0_s16, int16_t, svint16_t, + x0 = svmaxv_s16 (p0, z0), + x0 = svmaxv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s32.c new file mode 100644 index 000000000..645169ee8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_x0_s32: +** smaxv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (maxv_x0_s32, int32_t, svint32_t, + x0 = svmaxv_s32 (p0, z0), + x0 = svmaxv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s64.c new file mode 100644 index 000000000..009c1e9e2 --- 
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_x0_s64: +** smaxv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (maxv_x0_s64, int64_t, svint64_t, + x0 = svmaxv_s64 (p0, z0), + x0 = svmaxv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s8.c new file mode 100644 index 000000000..2c1f1b9b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_s8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_x0_s8: +** smaxv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (maxv_x0_s8, int8_t, svint8_t, + x0 = svmaxv_s8 (p0, z0), + x0 = svmaxv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u16.c new file mode 100644 index 000000000..978b8251a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_x0_u16: +** umaxv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (maxv_x0_u16, uint16_t, svuint16_t, + x0 = svmaxv_u16 (p0, z0), + x0 = svmaxv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u32.c new file mode 100644 index 000000000..85853b4b0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_x0_u32: +** umaxv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (maxv_x0_u32, uint32_t, svuint32_t, + x0 = svmaxv_u32 (p0, z0), + x0 = svmaxv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u64.c new file mode 100644 index 000000000..95980ed34 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_x0_u64: +** umaxv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (maxv_x0_u64, uint64_t, svuint64_t, + x0 = svmaxv_u64 (p0, z0), + x0 = svmaxv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u8.c new file mode 100644 index 000000000..a0b23d242 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/maxv_u8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** maxv_x0_u8: +** umaxv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (maxv_x0_u8, uint8_t, svuint8_t, + x0 = svmaxv_u8 (p0, z0), + x0 = svmaxv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f16.c new file mode 100644 index 000000000..721ee7389 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f16.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** 
min_f16_m_tied1: +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_f16_m_tied1, svfloat16_t, + z0 = svmin_f16_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmin z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (min_f16_m_tied2, svfloat16_t, + z0 = svmin_f16_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_f16_m_untied: +** movprfx z0, z1 +** fmin z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (min_f16_m_untied, svfloat16_t, + z0 = svmin_f16_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmin_n_f16_m (p0, z0, d4), + z0 = svmin_m (p0, z0, d4)) + +/* +** min_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fmin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmin_n_f16_m (p0, z1, d4), + z0 = svmin_m (p0, z1, d4)) + +/* +** min_0_f16_m_tied1: +** fmin z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f16_m_tied1, svfloat16_t, + z0 = svmin_n_f16_m (p0, z0, 0), + z0 = svmin_m (p0, z0, 0)) + +/* +** min_0_f16_m_untied: +** movprfx z0, z1 +** fmin z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f16_m_untied, svfloat16_t, + z0 = svmin_n_f16_m (p0, z1, 0), + z0 = svmin_m (p0, z1, 0)) + +/* +** min_1_f16_m_tied1: +** fmin z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f16_m_tied1, svfloat16_t, + z0 = svmin_n_f16_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_f16_m_untied: +** movprfx z0, z1 +** fmin z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f16_m_untied, svfloat16_t, + z0 = svmin_n_f16_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_2_f16_m: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** fmin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f16_m, svfloat16_t, + z0 = svmin_n_f16_m (p0, z0, 2), + z0 = svmin_m (p0, z0, 2)) + +/* +** min_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_f16_z_tied1, svfloat16_t, + z0 = svmin_f16_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_f16_z_tied2, svfloat16_t, + z0 = svmin_f16_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmin z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (min_f16_z_untied, svfloat16_t, + z0 = svmin_f16_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmin_n_f16_z (p0, z0, d4), + z0 = svmin_z (p0, z0, d4)) + +/* +** min_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmin z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (min_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmin_n_f16_z (p0, z1, d4), + z0 = svmin_z (p0, z1, d4)) + +/* +** min_0_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmin z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f16_z_tied1, svfloat16_t, + z0 = svmin_n_f16_z (p0, z0, 0), + z0 = svmin_z (p0, z0, 0)) + +/* +** min_0_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmin z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f16_z_untied, svfloat16_t, + z0 = svmin_n_f16_z (p0, z1, 0), + z0 = svmin_z (p0, z1, 0)) + +/* +** min_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmin z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f16_z_tied1, svfloat16_t, + z0 = svmin_n_f16_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmin z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f16_z_untied, svfloat16_t, + z0 = svmin_n_f16_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_2_f16_z: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fmin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f16_z, svfloat16_t, + z0 = svmin_n_f16_z (p0, z0, 2), + z0 = svmin_z (p0, z0, 2)) + +/* +** min_f16_x_tied1: +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_f16_x_tied1, svfloat16_t, + z0 = svmin_f16_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_f16_x_tied2: +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_f16_x_tied2, svfloat16_t, + z0 = svmin_f16_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_f16_x_untied: +** ( +** movprfx z0, z1 +** fmin z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (min_f16_x_untied, svfloat16_t, + z0 = svmin_f16_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmin_n_f16_x (p0, z0, d4), + z0 = svmin_x (p0, z0, d4)) + +/* +** min_h4_f16_x_untied: +** mov z0\.h, h4 +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (min_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmin_n_f16_x (p0, z1, d4), + z0 = svmin_x (p0, z1, d4)) + +/* +** min_0_f16_x_tied1: +** fmin z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f16_x_tied1, svfloat16_t, + z0 = svmin_n_f16_x (p0, z0, 0), + z0 = svmin_x (p0, z0, 0)) + +/* +** min_0_f16_x_untied: +** movprfx z0, z1 +** fmin z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f16_x_untied, svfloat16_t, + z0 = svmin_n_f16_x (p0, z1, 0), + z0 = svmin_x (p0, z1, 0)) + +/* +** min_1_f16_x_tied1: +** fmin z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f16_x_tied1, svfloat16_t, + z0 = svmin_n_f16_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_f16_x_untied: +** movprfx z0, z1 +** fmin z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f16_x_untied, svfloat16_t, + z0 = svmin_n_f16_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f16_x_tied1, svfloat16_t, + z0 = svmin_n_f16_x (p0, z0, 2), + z0 = svmin_x (p0, z0, 2)) + +/* +** min_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fmin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_2_f16_x_untied, svfloat16_t, + z0 = svmin_n_f16_x (p0, z1, 2), + z0 = svmin_x (p0, z1, 2)) + +/* +** ptrue_min_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f16_x_tied1, svfloat16_t, + z0 = svmin_f16_x (svptrue_b16 (), z0, z1), + z0 = svmin_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_min_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f16_x_tied2, svfloat16_t, + z0 = svmin_f16_x (svptrue_b16 (), z1, z0), + z0 = svmin_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_min_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f16_x_untied, svfloat16_t, + z0 = svmin_f16_x (svptrue_b16 (), z1, z2), + z0 = svmin_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_min_0_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_0_f16_x_tied1, svfloat16_t, + z0 = svmin_n_f16_x (svptrue_b16 (), z0, 0), + z0 = svmin_x (svptrue_b16 (), z0, 0)) + +/* +** ptrue_min_0_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_min_0_f16_x_untied, svfloat16_t, + z0 = svmin_n_f16_x (svptrue_b16 (), z1, 0), + z0 = svmin_x (svptrue_b16 (), z1, 0)) + +/* +** ptrue_min_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_1_f16_x_tied1, svfloat16_t, + z0 = svmin_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svmin_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_min_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_1_f16_x_untied, svfloat16_t, + z0 = svmin_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svmin_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_min_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_2_f16_x_tied1, svfloat16_t, + z0 = svmin_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svmin_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_min_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_2_f16_x_untied, svfloat16_t, + z0 = svmin_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svmin_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f32.c new file mode 100644 index 000000000..a3b1cf5c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f32.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_f32_m_tied1: +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_f32_m_tied1, svfloat32_t, + z0 = svmin_f32_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmin z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (min_f32_m_tied2, svfloat32_t, + z0 = svmin_f32_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_f32_m_untied: +** movprfx z0, z1 +** fmin z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (min_f32_m_untied, svfloat32_t, + z0 = svmin_f32_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmin_n_f32_m (p0, z0, d4), + z0 = svmin_m (p0, z0, d4)) + +/* +** min_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_s4_f32_m_untied, svfloat32_t, float, + z0 = svmin_n_f32_m (p0, z1, d4), + z0 = svmin_m (p0, z1, d4)) + +/* +** min_0_f32_m_tied1: +** fmin z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f32_m_tied1, svfloat32_t, + z0 = svmin_n_f32_m (p0, z0, 0), + z0 = svmin_m (p0, z0, 0)) + +/* +** min_0_f32_m_untied: +** movprfx z0, z1 +** fmin z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f32_m_untied, svfloat32_t, + z0 = svmin_n_f32_m (p0, z1, 0), + z0 = svmin_m (p0, z1, 0)) + +/* +** min_1_f32_m_tied1: +** fmin z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f32_m_tied1, svfloat32_t, + z0 = svmin_n_f32_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_f32_m_untied: +** movprfx z0, z1 +** fmin z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f32_m_untied, svfloat32_t, + z0 = svmin_n_f32_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_2_f32_m: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** fmin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f32_m, svfloat32_t, + z0 = svmin_n_f32_m (p0, z0, 2), + z0 = svmin_m (p0, z0, 2)) + +/* +** min_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_f32_z_tied1, svfloat32_t, + z0 = svmin_f32_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_f32_z_tied2, svfloat32_t, + z0 = svmin_f32_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmin z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (min_f32_z_untied, svfloat32_t, + z0 = svmin_f32_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmin_n_f32_z (p0, z0, d4), + z0 = svmin_z (p0, z0, d4)) + +/* +** min_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmin z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (min_s4_f32_z_untied, svfloat32_t, float, + z0 = svmin_n_f32_z (p0, z1, d4), + z0 = svmin_z (p0, z1, d4)) + +/* +** min_0_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmin z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f32_z_tied1, svfloat32_t, + z0 = svmin_n_f32_z (p0, z0, 0), + z0 = svmin_z (p0, z0, 0)) + +/* +** min_0_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmin z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f32_z_untied, svfloat32_t, + z0 = svmin_n_f32_z (p0, z1, 0), + z0 = svmin_z (p0, z1, 0)) + +/* +** min_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmin z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f32_z_tied1, svfloat32_t, + z0 = svmin_n_f32_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmin z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f32_z_untied, svfloat32_t, + z0 = svmin_n_f32_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_2_f32_z: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fmin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f32_z, svfloat32_t, + z0 = svmin_n_f32_z (p0, z0, 2), + z0 = svmin_z (p0, z0, 2)) + +/* +** min_f32_x_tied1: +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_f32_x_tied1, svfloat32_t, + z0 = svmin_f32_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_f32_x_tied2: +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_f32_x_tied2, svfloat32_t, + z0 = svmin_f32_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_f32_x_untied: +** ( +** movprfx z0, z1 +** fmin z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (min_f32_x_untied, svfloat32_t, + z0 = svmin_f32_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmin_n_f32_x (p0, z0, d4), + z0 = svmin_x (p0, z0, d4)) + +/* +** min_s4_f32_x_untied: +** mov z0\.s, s4 +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (min_s4_f32_x_untied, svfloat32_t, float, + z0 = svmin_n_f32_x (p0, z1, d4), + z0 = svmin_x (p0, z1, d4)) + +/* +** min_0_f32_x_tied1: +** fmin z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f32_x_tied1, svfloat32_t, + z0 = svmin_n_f32_x (p0, z0, 0), + z0 = svmin_x (p0, z0, 0)) + +/* +** min_0_f32_x_untied: +** movprfx z0, z1 +** fmin z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f32_x_untied, svfloat32_t, + z0 = svmin_n_f32_x (p0, z1, 0), + z0 = svmin_x (p0, z1, 0)) + +/* +** min_1_f32_x_tied1: +** fmin z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f32_x_tied1, svfloat32_t, + z0 = svmin_n_f32_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_f32_x_untied: +** movprfx z0, z1 +** fmin z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f32_x_untied, svfloat32_t, + z0 = svmin_n_f32_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f32_x_tied1, svfloat32_t, + z0 = svmin_n_f32_x (p0, z0, 2), + z0 = svmin_x (p0, z0, 2)) + +/* +** min_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fmin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_2_f32_x_untied, svfloat32_t, + z0 = svmin_n_f32_x (p0, z1, 2), + z0 = svmin_x (p0, z1, 2)) + +/* +** ptrue_min_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f32_x_tied1, svfloat32_t, + z0 = svmin_f32_x (svptrue_b32 (), z0, z1), + z0 = svmin_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_min_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f32_x_tied2, svfloat32_t, + z0 = svmin_f32_x (svptrue_b32 (), z1, z0), + z0 = svmin_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_min_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f32_x_untied, svfloat32_t, + z0 = svmin_f32_x (svptrue_b32 (), z1, z2), + z0 = svmin_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_min_0_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_0_f32_x_tied1, svfloat32_t, + z0 = svmin_n_f32_x (svptrue_b32 (), z0, 0), + z0 = svmin_x (svptrue_b32 (), z0, 0)) + +/* +** ptrue_min_0_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_min_0_f32_x_untied, svfloat32_t, + z0 = svmin_n_f32_x (svptrue_b32 (), z1, 0), + z0 = svmin_x (svptrue_b32 (), z1, 0)) + +/* +** ptrue_min_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_1_f32_x_tied1, svfloat32_t, + z0 = svmin_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svmin_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_min_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_1_f32_x_untied, svfloat32_t, + z0 = svmin_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svmin_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_min_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_2_f32_x_tied1, svfloat32_t, + z0 = svmin_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svmin_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_min_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_2_f32_x_untied, svfloat32_t, + z0 = svmin_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svmin_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f64.c new file mode 100644 index 000000000..bb31102e2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_f64.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_f64_m_tied1: +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_f64_m_tied1, svfloat64_t, + z0 = svmin_f64_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_f64_m_tied2, svfloat64_t, + z0 = svmin_f64_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_f64_m_untied: +** movprfx z0, z1 +** fmin z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (min_f64_m_untied, svfloat64_t, + z0 = svmin_f64_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmin_n_f64_m (p0, z0, d4), + z0 = svmin_m (p0, z0, d4)) + +/* +** min_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_d4_f64_m_untied, svfloat64_t, double, + z0 = svmin_n_f64_m (p0, z1, d4), + z0 = svmin_m (p0, z1, d4)) + +/* +** min_0_f64_m_tied1: +** fmin z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f64_m_tied1, svfloat64_t, + z0 = svmin_n_f64_m (p0, z0, 0), + z0 = svmin_m (p0, z0, 0)) + +/* +** min_0_f64_m_untied: +** movprfx z0, z1 +** fmin z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f64_m_untied, svfloat64_t, + z0 = svmin_n_f64_m (p0, z1, 0), + z0 = svmin_m (p0, z1, 0)) + +/* +** min_1_f64_m_tied1: +** fmin z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f64_m_tied1, svfloat64_t, + z0 = svmin_n_f64_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_f64_m_untied: +** movprfx z0, z1 +** fmin z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f64_m_untied, svfloat64_t, + z0 = svmin_n_f64_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_2_f64_m: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** fmin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f64_m, svfloat64_t, + z0 = svmin_n_f64_m (p0, z0, 2), + z0 = svmin_m (p0, z0, 2)) + +/* +** min_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_f64_z_tied1, svfloat64_t, + z0 = svmin_f64_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_f64_z_tied2, svfloat64_t, + z0 = svmin_f64_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmin z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (min_f64_z_untied, svfloat64_t, + z0 = svmin_f64_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmin_n_f64_z (p0, z0, d4), + z0 = svmin_z (p0, z0, d4)) + +/* +** min_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmin z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (min_d4_f64_z_untied, svfloat64_t, double, + z0 = svmin_n_f64_z (p0, z1, d4), + z0 = svmin_z (p0, z1, d4)) + +/* +** min_0_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmin z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f64_z_tied1, svfloat64_t, + z0 = svmin_n_f64_z (p0, z0, 0), + z0 = svmin_z (p0, z0, 0)) + +/* +** min_0_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmin z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f64_z_untied, svfloat64_t, + z0 = svmin_n_f64_z (p0, z1, 0), + z0 = svmin_z (p0, z1, 0)) + +/* +** min_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmin z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f64_z_tied1, svfloat64_t, + z0 = svmin_n_f64_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmin z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f64_z_untied, svfloat64_t, + z0 = svmin_n_f64_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_2_f64_z: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fmin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f64_z, svfloat64_t, + z0 = svmin_n_f64_z (p0, z0, 2), + z0 = svmin_z (p0, z0, 2)) + +/* +** min_f64_x_tied1: +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_f64_x_tied1, svfloat64_t, + z0 = svmin_f64_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_f64_x_tied2: +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_f64_x_tied2, svfloat64_t, + z0 = svmin_f64_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_f64_x_untied: +** ( +** movprfx z0, z1 +** fmin z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (min_f64_x_untied, svfloat64_t, + z0 = svmin_f64_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (min_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmin_n_f64_x (p0, z0, d4), + z0 = svmin_x (p0, z0, d4)) + +/* +** min_d4_f64_x_untied: +** mov z0\.d, d4 +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (min_d4_f64_x_untied, svfloat64_t, double, + z0 = svmin_n_f64_x (p0, z1, d4), + z0 = svmin_x (p0, z1, d4)) + +/* +** min_0_f64_x_tied1: +** fmin z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f64_x_tied1, svfloat64_t, + z0 = svmin_n_f64_x (p0, z0, 0), + z0 = svmin_x (p0, z0, 0)) + +/* +** min_0_f64_x_untied: +** movprfx z0, z1 +** fmin z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (min_0_f64_x_untied, svfloat64_t, + z0 = svmin_n_f64_x (p0, z1, 0), + z0 = svmin_x (p0, z1, 0)) + +/* +** min_1_f64_x_tied1: +** fmin z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f64_x_tied1, svfloat64_t, + z0 = svmin_n_f64_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_f64_x_untied: +** movprfx z0, z1 +** fmin z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (min_1_f64_x_untied, svfloat64_t, + z0 = svmin_n_f64_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_2_f64_x_tied1, svfloat64_t, + z0 = svmin_n_f64_x (p0, z0, 2), + z0 = svmin_x (p0, z0, 2)) + +/* +** min_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fmin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_2_f64_x_untied, svfloat64_t, + z0 = svmin_n_f64_x (p0, z1, 2), + z0 = svmin_x (p0, z1, 2)) + +/* +** ptrue_min_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f64_x_tied1, svfloat64_t, + z0 = svmin_f64_x (svptrue_b64 (), z0, z1), + z0 = svmin_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_min_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f64_x_tied2, svfloat64_t, + z0 = svmin_f64_x (svptrue_b64 (), z1, z0), + z0 = svmin_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_min_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_f64_x_untied, svfloat64_t, + z0 = svmin_f64_x (svptrue_b64 (), z1, z2), + z0 = svmin_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_min_0_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_0_f64_x_tied1, svfloat64_t, + z0 = svmin_n_f64_x (svptrue_b64 (), z0, 0), + z0 = svmin_x (svptrue_b64 (), z0, 0)) + +/* +** ptrue_min_0_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_min_0_f64_x_untied, svfloat64_t, + z0 = svmin_n_f64_x (svptrue_b64 (), z1, 0), + z0 = svmin_x (svptrue_b64 (), z1, 0)) + +/* +** ptrue_min_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_1_f64_x_tied1, svfloat64_t, + z0 = svmin_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svmin_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_min_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_1_f64_x_untied, svfloat64_t, + z0 = svmin_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svmin_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_min_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_2_f64_x_tied1, svfloat64_t, + z0 = svmin_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svmin_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_min_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_min_2_f64_x_untied, svfloat64_t, + z0 = svmin_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svmin_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s16.c new file mode 100644 index 000000000..14dfcc4c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s16.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_s16_m_tied1: +** smin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_s16_m_tied1, svint16_t, + z0 = svmin_s16_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smin z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (min_s16_m_tied2, svint16_t, + z0 = svmin_s16_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_s16_m_untied: +** movprfx z0, z1 +** smin z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (min_s16_m_untied, svint16_t, + z0 = svmin_s16_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svmin_n_s16_m (p0, z0, x0), + z0 = svmin_m (p0, z0, x0)) + +/* +** min_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s16_m_untied, svint16_t, int16_t, + z0 = svmin_n_s16_m (p0, z1, x0), + z0 = svmin_m (p0, z1, x0)) + +/* +** min_1_s16_m_tied1: +** mov (z[0-9]+\.h), #1 +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s16_m_tied1, svint16_t, + z0 = svmin_n_s16_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s16_m_untied, svint16_t, + z0 = svmin_n_s16_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_m1_s16_m: +** mov (z[0-9]+)\.b, #-1 +** smin z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (min_m1_s16_m, svint16_t, + z0 = svmin_n_s16_m (p0, z0, -1), + z0 = svmin_m (p0, z0, -1)) + +/* +** min_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** smin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_s16_z_tied1, svint16_t, + z0 = svmin_s16_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** smin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_s16_z_tied2, svint16_t, 
+ z0 = svmin_s16_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** smin z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** smin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (min_s16_z_untied, svint16_t, + z0 = svmin_s16_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svmin_n_s16_z (p0, z0, x0), + z0 = svmin_z (p0, z0, x0)) + +/* +** min_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** smin z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** smin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s16_z_untied, svint16_t, int16_t, + z0 = svmin_n_s16_z (p0, z1, x0), + z0 = svmin_z (p0, z1, x0)) + +/* +** min_1_s16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s16_z_tied1, svint16_t, + z0 = svmin_n_s16_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_s16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** smin z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** smin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (min_1_s16_z_untied, svint16_t, + z0 = svmin_n_s16_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_s16_x_tied1: +** smin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_s16_x_tied1, svint16_t, + z0 = svmin_s16_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_s16_x_tied2: +** smin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_s16_x_tied2, svint16_t, + z0 = svmin_s16_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_s16_x_untied: +** ( +** movprfx z0, z1 +** smin z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** smin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (min_s16_x_untied, svint16_t, + z0 = svmin_s16_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svmin_n_s16_x (p0, z0, x0), + z0 = svmin_x (p0, z0, x0)) + +/* +** min_w0_s16_x_untied: +** mov z0\.h, w0 +** smin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s16_x_untied, svint16_t, int16_t, + z0 = svmin_n_s16_x (p0, z1, x0), + z0 = svmin_x (p0, z1, x0)) + +/* +** min_1_s16_x_tied1: +** smin z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s16_x_tied1, svint16_t, + z0 = svmin_n_s16_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_s16_x_untied: +** movprfx z0, z1 +** smin z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s16_x_untied, svint16_t, + z0 = svmin_n_s16_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_127_s16_x: +** smin z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (min_127_s16_x, svint16_t, + z0 = svmin_n_s16_x (p0, z0, 127), + z0 = svmin_x (p0, z0, 127)) + +/* +** min_128_s16_x: +** mov (z[0-9]+\.h), #128 +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_128_s16_x, svint16_t, + z0 = svmin_n_s16_x (p0, z0, 128), + z0 = svmin_x (p0, z0, 128)) + +/* +** min_m1_s16_x: +** smin z0\.h, z0\.h, #-1 +** ret +*/ +TEST_UNIFORM_Z (min_m1_s16_x, svint16_t, + z0 = svmin_n_s16_x (p0, z0, -1), + z0 = svmin_x (p0, z0, -1)) + +/* +** min_m128_s16_x: +** smin z0\.h, z0\.h, 
#-128 +** ret +*/ +TEST_UNIFORM_Z (min_m128_s16_x, svint16_t, + z0 = svmin_n_s16_x (p0, z0, -128), + z0 = svmin_x (p0, z0, -128)) + +/* +** min_m129_s16_x: +** mov (z[0-9]+\.h), #-129 +** smin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_m129_s16_x, svint16_t, + z0 = svmin_n_s16_x (p0, z0, -129), + z0 = svmin_x (p0, z0, -129)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s32.c new file mode 100644 index 000000000..cee2b649d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s32.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_s32_m_tied1: +** smin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_s32_m_tied1, svint32_t, + z0 = svmin_s32_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smin z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (min_s32_m_tied2, svint32_t, + z0 = svmin_s32_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_s32_m_untied: +** movprfx z0, z1 +** smin z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (min_s32_m_untied, svint32_t, + z0 = svmin_s32_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svmin_n_s32_m (p0, z0, x0), + z0 = svmin_m (p0, z0, x0)) + +/* +** min_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s32_m_untied, svint32_t, int32_t, + z0 = svmin_n_s32_m (p0, z1, x0), + z0 = svmin_m (p0, z1, x0)) + +/* +** min_1_s32_m_tied1: +** mov (z[0-9]+\.s), #1 +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s32_m_tied1, svint32_t, + z0 = svmin_n_s32_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s32_m_untied, svint32_t, + z0 = svmin_n_s32_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_m1_s32_m: +** mov (z[0-9]+)\.b, #-1 +** smin z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (min_m1_s32_m, svint32_t, + z0 = svmin_n_s32_m (p0, z0, -1), + z0 = svmin_m (p0, z0, -1)) + +/* +** min_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** smin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_s32_z_tied1, svint32_t, + z0 = svmin_s32_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** smin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_s32_z_tied2, svint32_t, + z0 = svmin_s32_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** smin z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** smin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (min_s32_z_untied, svint32_t, + z0 = svmin_s32_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svmin_n_s32_z (p0, z0, x0), + z0 = svmin_z (p0, z0, x0)) + +/* +** min_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** smin z0\.s, p0/m, z0\.s, \1 +** | +** movprfx 
z0\.s, p0/z, \1 +** smin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s32_z_untied, svint32_t, int32_t, + z0 = svmin_n_s32_z (p0, z1, x0), + z0 = svmin_z (p0, z1, x0)) + +/* +** min_1_s32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s32_z_tied1, svint32_t, + z0 = svmin_n_s32_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_s32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** smin z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** smin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (min_1_s32_z_untied, svint32_t, + z0 = svmin_n_s32_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_s32_x_tied1: +** smin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_s32_x_tied1, svint32_t, + z0 = svmin_s32_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_s32_x_tied2: +** smin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_s32_x_tied2, svint32_t, + z0 = svmin_s32_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_s32_x_untied: +** ( +** movprfx z0, z1 +** smin z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** smin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (min_s32_x_untied, svint32_t, + z0 = svmin_s32_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svmin_n_s32_x (p0, z0, x0), + z0 = svmin_x (p0, z0, x0)) + +/* +** min_w0_s32_x_untied: +** mov z0\.s, w0 +** smin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s32_x_untied, svint32_t, int32_t, + z0 = svmin_n_s32_x (p0, z1, x0), + z0 = svmin_x (p0, z1, x0)) + +/* +** min_1_s32_x_tied1: +** smin z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s32_x_tied1, svint32_t, + z0 = svmin_n_s32_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_s32_x_untied: +** movprfx z0, z1 +** smin z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s32_x_untied, svint32_t, + z0 = svmin_n_s32_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_127_s32_x: +** smin z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (min_127_s32_x, svint32_t, + z0 = svmin_n_s32_x (p0, z0, 127), + z0 = svmin_x (p0, z0, 127)) + +/* +** min_128_s32_x: +** mov (z[0-9]+\.s), #128 +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_128_s32_x, svint32_t, + z0 = svmin_n_s32_x (p0, z0, 128), + z0 = svmin_x (p0, z0, 128)) + +/* +** min_m1_s32_x: +** smin z0\.s, z0\.s, #-1 +** ret +*/ +TEST_UNIFORM_Z (min_m1_s32_x, svint32_t, + z0 = svmin_n_s32_x (p0, z0, -1), + z0 = svmin_x (p0, z0, -1)) + +/* +** min_m128_s32_x: +** smin z0\.s, z0\.s, #-128 +** ret +*/ +TEST_UNIFORM_Z (min_m128_s32_x, svint32_t, + z0 = svmin_n_s32_x (p0, z0, -128), + z0 = svmin_x (p0, z0, -128)) + +/* +** min_m129_s32_x: +** mov (z[0-9]+\.s), #-129 +** smin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_m129_s32_x, svint32_t, + z0 = svmin_n_s32_x (p0, z0, -129), + z0 = svmin_x (p0, z0, -129)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s64.c new file mode 100644 index 000000000..0d20bd0b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s64.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_s64_m_tied1: +** smin z0\.d, p0/m, 
z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_s64_m_tied1, svint64_t, + z0 = svmin_s64_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_s64_m_tied2, svint64_t, + z0 = svmin_s64_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_s64_m_untied: +** movprfx z0, z1 +** smin z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (min_s64_m_untied, svint64_t, + z0 = svmin_s64_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svmin_n_s64_m (p0, z0, x0), + z0 = svmin_m (p0, z0, x0)) + +/* +** min_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_x0_s64_m_untied, svint64_t, int64_t, + z0 = svmin_n_s64_m (p0, z1, x0), + z0 = svmin_m (p0, z1, x0)) + +/* +** min_1_s64_m_tied1: +** mov (z[0-9]+\.d), #1 +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s64_m_tied1, svint64_t, + z0 = svmin_n_s64_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s64_m_untied, svint64_t, + z0 = svmin_n_s64_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_m1_s64_m: +** mov (z[0-9]+)\.b, #-1 +** smin z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (min_m1_s64_m, svint64_t, + z0 = svmin_n_s64_m (p0, z0, -1), + z0 = svmin_m (p0, z0, -1)) + +/* +** min_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** smin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_s64_z_tied1, svint64_t, + z0 = svmin_s64_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** smin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_s64_z_tied2, svint64_t, + z0 = svmin_s64_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** smin z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** smin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (min_s64_z_untied, svint64_t, + z0 = svmin_s64_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svmin_n_s64_z (p0, z0, x0), + z0 = svmin_z (p0, z0, x0)) + +/* +** min_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** smin z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** smin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (min_x0_s64_z_untied, svint64_t, int64_t, + z0 = svmin_n_s64_z (p0, z1, x0), + z0 = svmin_z (p0, z1, x0)) + +/* +** min_1_s64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s64_z_tied1, svint64_t, + z0 = svmin_n_s64_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_s64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** smin z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** smin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (min_1_s64_z_untied, svint64_t, + z0 = svmin_n_s64_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** 
min_s64_x_tied1: +** smin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_s64_x_tied1, svint64_t, + z0 = svmin_s64_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_s64_x_tied2: +** smin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_s64_x_tied2, svint64_t, + z0 = svmin_s64_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_s64_x_untied: +** ( +** movprfx z0, z1 +** smin z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** smin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (min_s64_x_untied, svint64_t, + z0 = svmin_s64_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svmin_n_s64_x (p0, z0, x0), + z0 = svmin_x (p0, z0, x0)) + +/* +** min_x0_s64_x_untied: +** mov z0\.d, x0 +** smin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (min_x0_s64_x_untied, svint64_t, int64_t, + z0 = svmin_n_s64_x (p0, z1, x0), + z0 = svmin_x (p0, z1, x0)) + +/* +** min_1_s64_x_tied1: +** smin z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s64_x_tied1, svint64_t, + z0 = svmin_n_s64_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_s64_x_untied: +** movprfx z0, z1 +** smin z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s64_x_untied, svint64_t, + z0 = svmin_n_s64_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_127_s64_x: +** smin z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (min_127_s64_x, svint64_t, + z0 = svmin_n_s64_x (p0, z0, 127), + z0 = svmin_x (p0, z0, 127)) + +/* +** min_128_s64_x: +** mov (z[0-9]+\.d), #128 +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_128_s64_x, svint64_t, + z0 = svmin_n_s64_x (p0, z0, 128), + z0 = svmin_x (p0, z0, 128)) + +/* +** min_m1_s64_x: +** smin z0\.d, z0\.d, #-1 +** ret +*/ +TEST_UNIFORM_Z (min_m1_s64_x, svint64_t, + z0 = svmin_n_s64_x (p0, z0, -1), + z0 = svmin_x (p0, z0, -1)) + +/* +** min_m128_s64_x: +** smin z0\.d, z0\.d, #-128 +** ret +*/ +TEST_UNIFORM_Z (min_m128_s64_x, svint64_t, + z0 = svmin_n_s64_x (p0, z0, -128), + z0 = svmin_x (p0, z0, -128)) + +/* +** min_m129_s64_x: +** mov (z[0-9]+\.d), #-129 +** smin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_m129_s64_x, svint64_t, + z0 = svmin_n_s64_x (p0, z0, -129), + z0 = svmin_x (p0, z0, -129)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s8.c new file mode 100644 index 000000000..714b1576d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_s8.c @@ -0,0 +1,273 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_s8_m_tied1: +** smin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_s8_m_tied1, svint8_t, + z0 = svmin_s8_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smin z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (min_s8_m_tied2, svint8_t, + z0 = svmin_s8_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_s8_m_untied: +** movprfx z0, z1 +** smin z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (min_s8_m_untied, svint8_t, + z0 = svmin_s8_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** smin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svmin_n_s8_m (p0, z0, x0), + z0 = svmin_m (p0, z0, x0)) 
+ +/* +** min_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** smin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s8_m_untied, svint8_t, int8_t, + z0 = svmin_n_s8_m (p0, z1, x0), + z0 = svmin_m (p0, z1, x0)) + +/* +** min_1_s8_m_tied1: +** mov (z[0-9]+\.b), #1 +** smin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s8_m_tied1, svint8_t, + z0 = svmin_n_s8_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** smin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s8_m_untied, svint8_t, + z0 = svmin_n_s8_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_m1_s8_m: +** mov (z[0-9]+\.b), #-1 +** smin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (min_m1_s8_m, svint8_t, + z0 = svmin_n_s8_m (p0, z0, -1), + z0 = svmin_m (p0, z0, -1)) + +/* +** min_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** smin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_s8_z_tied1, svint8_t, + z0 = svmin_s8_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** smin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_s8_z_tied2, svint8_t, + z0 = svmin_s8_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** smin z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** smin z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (min_s8_z_untied, svint8_t, + z0 = svmin_s8_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** smin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svmin_n_s8_z (p0, z0, x0), + z0 = svmin_z (p0, z0, x0)) + +/* +** min_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** smin z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** smin z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s8_z_untied, svint8_t, int8_t, + z0 = svmin_n_s8_z (p0, z1, x0), + z0 = svmin_z (p0, z1, x0)) + +/* +** min_1_s8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** smin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s8_z_tied1, svint8_t, + z0 = svmin_n_s8_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_s8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** smin z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** smin z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (min_1_s8_z_untied, svint8_t, + z0 = svmin_n_s8_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_s8_x_tied1: +** smin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_s8_x_tied1, svint8_t, + z0 = svmin_s8_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_s8_x_tied2: +** smin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_s8_x_tied2, svint8_t, + z0 = svmin_s8_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_s8_x_untied: +** ( +** movprfx z0, z1 +** smin z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** smin z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (min_s8_x_untied, svint8_t, + z0 = svmin_s8_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** smin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svmin_n_s8_x (p0, z0, x0), + z0 = 
svmin_x (p0, z0, x0)) + +/* +** min_w0_s8_x_untied: +** mov z0\.b, w0 +** smin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (min_w0_s8_x_untied, svint8_t, int8_t, + z0 = svmin_n_s8_x (p0, z1, x0), + z0 = svmin_x (p0, z1, x0)) + +/* +** min_1_s8_x_tied1: +** smin z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s8_x_tied1, svint8_t, + z0 = svmin_n_s8_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_s8_x_untied: +** movprfx z0, z1 +** smin z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_s8_x_untied, svint8_t, + z0 = svmin_n_s8_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_127_s8_x: +** smin z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (min_127_s8_x, svint8_t, + z0 = svmin_n_s8_x (p0, z0, 127), + z0 = svmin_x (p0, z0, 127)) + +/* +** min_m1_s8_x: +** smin z0\.b, z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (min_m1_s8_x, svint8_t, + z0 = svmin_n_s8_x (p0, z0, -1), + z0 = svmin_x (p0, z0, -1)) + +/* +** min_m127_s8_x: +** smin z0\.b, z0\.b, #-127 +** ret +*/ +TEST_UNIFORM_Z (min_m127_s8_x, svint8_t, + z0 = svmin_n_s8_x (p0, z0, -127), + z0 = svmin_x (p0, z0, -127)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u16.c new file mode 100644 index 000000000..df35cf113 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u16.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_u16_m_tied1: +** umin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_u16_m_tied1, svuint16_t, + z0 = svmin_u16_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umin z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (min_u16_m_tied2, svuint16_t, + z0 = svmin_u16_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_u16_m_untied: +** movprfx z0, z1 +** umin z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (min_u16_m_untied, svuint16_t, + z0 = svmin_u16_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svmin_n_u16_m (p0, z0, x0), + z0 = svmin_m (p0, z0, x0)) + +/* +** min_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svmin_n_u16_m (p0, z1, x0), + z0 = svmin_m (p0, z1, x0)) + +/* +** min_1_u16_m_tied1: +** mov (z[0-9]+\.h), #1 +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u16_m_tied1, svuint16_t, + z0 = svmin_n_u16_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u16_m_untied, svuint16_t, + z0 = svmin_n_u16_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_m1_u16_m: +** mov (z[0-9]+)\.b, #-1 +** umin z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (min_m1_u16_m, svuint16_t, + z0 = svmin_n_u16_m (p0, z0, -1), + z0 = svmin_m (p0, z0, -1)) + +/* +** min_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** umin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_u16_z_tied1, svuint16_t, + z0 = svmin_u16_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** umin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ 
+TEST_UNIFORM_Z (min_u16_z_tied2, svuint16_t, + z0 = svmin_u16_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** umin z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** umin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (min_u16_z_untied, svuint16_t, + z0 = svmin_u16_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svmin_n_u16_z (p0, z0, x0), + z0 = svmin_z (p0, z0, x0)) + +/* +** min_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** umin z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** umin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svmin_n_u16_z (p0, z1, x0), + z0 = svmin_z (p0, z1, x0)) + +/* +** min_1_u16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u16_z_tied1, svuint16_t, + z0 = svmin_n_u16_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_u16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** umin z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** umin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (min_1_u16_z_untied, svuint16_t, + z0 = svmin_n_u16_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_u16_x_tied1: +** umin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_u16_x_tied1, svuint16_t, + z0 = svmin_u16_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_u16_x_tied2: +** umin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (min_u16_x_tied2, svuint16_t, + z0 = svmin_u16_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_u16_x_untied: +** ( +** movprfx z0, z1 +** umin z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** umin z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (min_u16_x_untied, svuint16_t, + z0 = svmin_u16_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svmin_n_u16_x (p0, z0, x0), + z0 = svmin_x (p0, z0, x0)) + +/* +** min_w0_u16_x_untied: +** mov z0\.h, w0 +** umin z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svmin_n_u16_x (p0, z1, x0), + z0 = svmin_x (p0, z1, x0)) + +/* +** min_1_u16_x_tied1: +** umin z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u16_x_tied1, svuint16_t, + z0 = svmin_n_u16_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_u16_x_untied: +** movprfx z0, z1 +** umin z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u16_x_untied, svuint16_t, + z0 = svmin_n_u16_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_127_u16_x: +** umin z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (min_127_u16_x, svuint16_t, + z0 = svmin_n_u16_x (p0, z0, 127), + z0 = svmin_x (p0, z0, 127)) + +/* +** min_128_u16_x: +** umin z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (min_128_u16_x, svuint16_t, + z0 = svmin_n_u16_x (p0, z0, 128), + z0 = svmin_x (p0, z0, 128)) + +/* +** min_255_u16_x: +** umin z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (min_255_u16_x, svuint16_t, + z0 = svmin_n_u16_x (p0, z0, 255), + z0 = svmin_x (p0, z0, 255)) + +/* +** 
min_256_u16_x: +** mov (z[0-9]+\.h), #256 +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_256_u16_x, svuint16_t, + z0 = svmin_n_u16_x (p0, z0, 256), + z0 = svmin_x (p0, z0, 256)) + +/* +** min_m2_u16_x: +** mov (z[0-9]+\.h), #-2 +** umin z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (min_m2_u16_x, svuint16_t, + z0 = svmin_n_u16_x (p0, z0, -2), + z0 = svmin_x (p0, z0, -2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u32.c new file mode 100644 index 000000000..7f84d099d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u32.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_u32_m_tied1: +** umin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_u32_m_tied1, svuint32_t, + z0 = svmin_u32_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umin z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (min_u32_m_tied2, svuint32_t, + z0 = svmin_u32_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_u32_m_untied: +** movprfx z0, z1 +** umin z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (min_u32_m_untied, svuint32_t, + z0 = svmin_u32_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svmin_n_u32_m (p0, z0, x0), + z0 = svmin_m (p0, z0, x0)) + +/* +** min_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svmin_n_u32_m (p0, z1, x0), + z0 = svmin_m (p0, z1, x0)) + +/* +** min_1_u32_m_tied1: +** mov (z[0-9]+\.s), #1 +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u32_m_tied1, svuint32_t, + z0 = svmin_n_u32_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u32_m_untied, svuint32_t, + z0 = svmin_n_u32_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_m1_u32_m: +** mov (z[0-9]+)\.b, #-1 +** umin z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (min_m1_u32_m, svuint32_t, + z0 = svmin_n_u32_m (p0, z0, -1), + z0 = svmin_m (p0, z0, -1)) + +/* +** min_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** umin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_u32_z_tied1, svuint32_t, + z0 = svmin_u32_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** umin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_u32_z_tied2, svuint32_t, + z0 = svmin_u32_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** umin z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** umin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (min_u32_z_untied, svuint32_t, + z0 = svmin_u32_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svmin_n_u32_z (p0, z0, x0), + z0 = svmin_z (p0, z0, x0)) + +/* +** min_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx 
z0\.s, p0/z, z1\.s +** umin z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** umin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svmin_n_u32_z (p0, z1, x0), + z0 = svmin_z (p0, z1, x0)) + +/* +** min_1_u32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u32_z_tied1, svuint32_t, + z0 = svmin_n_u32_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_u32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** umin z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** umin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (min_1_u32_z_untied, svuint32_t, + z0 = svmin_n_u32_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_u32_x_tied1: +** umin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_u32_x_tied1, svuint32_t, + z0 = svmin_u32_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_u32_x_tied2: +** umin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (min_u32_x_tied2, svuint32_t, + z0 = svmin_u32_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_u32_x_untied: +** ( +** movprfx z0, z1 +** umin z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** umin z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (min_u32_x_untied, svuint32_t, + z0 = svmin_u32_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svmin_n_u32_x (p0, z0, x0), + z0 = svmin_x (p0, z0, x0)) + +/* +** min_w0_u32_x_untied: +** mov z0\.s, w0 +** umin z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svmin_n_u32_x (p0, z1, x0), + z0 = svmin_x (p0, z1, x0)) + +/* +** min_1_u32_x_tied1: +** umin z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u32_x_tied1, svuint32_t, + z0 = svmin_n_u32_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_u32_x_untied: +** movprfx z0, z1 +** umin z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u32_x_untied, svuint32_t, + z0 = svmin_n_u32_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_127_u32_x: +** umin z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (min_127_u32_x, svuint32_t, + z0 = svmin_n_u32_x (p0, z0, 127), + z0 = svmin_x (p0, z0, 127)) + +/* +** min_128_u32_x: +** umin z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (min_128_u32_x, svuint32_t, + z0 = svmin_n_u32_x (p0, z0, 128), + z0 = svmin_x (p0, z0, 128)) + +/* +** min_255_u32_x: +** umin z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (min_255_u32_x, svuint32_t, + z0 = svmin_n_u32_x (p0, z0, 255), + z0 = svmin_x (p0, z0, 255)) + +/* +** min_256_u32_x: +** mov (z[0-9]+\.s), #256 +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_256_u32_x, svuint32_t, + z0 = svmin_n_u32_x (p0, z0, 256), + z0 = svmin_x (p0, z0, 256)) + +/* +** min_m2_u32_x: +** mov (z[0-9]+\.s), #-2 +** umin z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (min_m2_u32_x, svuint32_t, + z0 = svmin_n_u32_x (p0, z0, -2), + z0 = svmin_x (p0, z0, -2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u64.c new file mode 100644 index 000000000..06e6e5099 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u64.c @@ -0,0 +1,293 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + 
+#include "test_sve_acle.h" + +/* +** min_u64_m_tied1: +** umin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_u64_m_tied1, svuint64_t, + z0 = svmin_u64_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_u64_m_tied2, svuint64_t, + z0 = svmin_u64_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_u64_m_untied: +** movprfx z0, z1 +** umin z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (min_u64_m_untied, svuint64_t, + z0 = svmin_u64_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svmin_n_u64_m (p0, z0, x0), + z0 = svmin_m (p0, z0, x0)) + +/* +** min_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svmin_n_u64_m (p0, z1, x0), + z0 = svmin_m (p0, z1, x0)) + +/* +** min_1_u64_m_tied1: +** mov (z[0-9]+\.d), #1 +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u64_m_tied1, svuint64_t, + z0 = svmin_n_u64_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u64_m_untied, svuint64_t, + z0 = svmin_n_u64_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_m1_u64_m: +** mov (z[0-9]+)\.b, #-1 +** umin z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (min_m1_u64_m, svuint64_t, + z0 = svmin_n_u64_m (p0, z0, -1), + z0 = svmin_m (p0, z0, -1)) + +/* +** min_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** umin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_u64_z_tied1, svuint64_t, + z0 = svmin_u64_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** umin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_u64_z_tied2, svuint64_t, + z0 = svmin_u64_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** umin z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** umin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (min_u64_z_untied, svuint64_t, + z0 = svmin_u64_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svmin_n_u64_z (p0, z0, x0), + z0 = svmin_z (p0, z0, x0)) + +/* +** min_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** umin z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** umin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (min_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svmin_n_u64_z (p0, z1, x0), + z0 = svmin_z (p0, z1, x0)) + +/* +** min_1_u64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u64_z_tied1, svuint64_t, + z0 = svmin_n_u64_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_u64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** umin z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** umin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z 
(min_1_u64_z_untied, svuint64_t, + z0 = svmin_n_u64_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_u64_x_tied1: +** umin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_u64_x_tied1, svuint64_t, + z0 = svmin_u64_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_u64_x_tied2: +** umin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (min_u64_x_tied2, svuint64_t, + z0 = svmin_u64_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_u64_x_untied: +** ( +** movprfx z0, z1 +** umin z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** umin z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (min_u64_x_untied, svuint64_t, + z0 = svmin_u64_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svmin_n_u64_x (p0, z0, x0), + z0 = svmin_x (p0, z0, x0)) + +/* +** min_x0_u64_x_untied: +** mov z0\.d, x0 +** umin z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (min_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svmin_n_u64_x (p0, z1, x0), + z0 = svmin_x (p0, z1, x0)) + +/* +** min_1_u64_x_tied1: +** umin z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u64_x_tied1, svuint64_t, + z0 = svmin_n_u64_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_u64_x_untied: +** movprfx z0, z1 +** umin z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u64_x_untied, svuint64_t, + z0 = svmin_n_u64_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_127_u64_x: +** umin z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (min_127_u64_x, svuint64_t, + z0 = svmin_n_u64_x (p0, z0, 127), + z0 = svmin_x (p0, z0, 127)) + +/* +** min_128_u64_x: +** umin z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (min_128_u64_x, svuint64_t, + z0 = svmin_n_u64_x (p0, z0, 128), + z0 = svmin_x (p0, z0, 128)) + +/* +** min_255_u64_x: +** umin z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (min_255_u64_x, svuint64_t, + z0 = svmin_n_u64_x (p0, z0, 255), + z0 = svmin_x (p0, z0, 255)) + +/* +** min_256_u64_x: +** mov (z[0-9]+\.d), #256 +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_256_u64_x, svuint64_t, + z0 = svmin_n_u64_x (p0, z0, 256), + z0 = svmin_x (p0, z0, 256)) + +/* +** min_m2_u64_x: +** mov (z[0-9]+\.d), #-2 +** umin z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (min_m2_u64_x, svuint64_t, + z0 = svmin_n_u64_x (p0, z0, -2), + z0 = svmin_x (p0, z0, -2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u8.c new file mode 100644 index 000000000..2ca274278 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/min_u8.c @@ -0,0 +1,273 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** min_u8_m_tied1: +** umin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_u8_m_tied1, svuint8_t, + z0 = svmin_u8_m (p0, z0, z1), + z0 = svmin_m (p0, z0, z1)) + +/* +** min_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umin z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (min_u8_m_tied2, svuint8_t, + z0 = svmin_u8_m (p0, z1, z0), + z0 = svmin_m (p0, z1, z0)) + +/* +** min_u8_m_untied: +** movprfx z0, z1 +** umin z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (min_u8_m_untied, svuint8_t, + z0 = svmin_u8_m (p0, z1, z2), + z0 = svmin_m (p0, z1, z2)) + +/* +** min_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** umin z0\.b, p0/m, z0\.b, \1 +** ret +*/ 
+TEST_UNIFORM_ZX (min_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svmin_n_u8_m (p0, z0, x0), + z0 = svmin_m (p0, z0, x0)) + +/* +** min_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** umin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svmin_n_u8_m (p0, z1, x0), + z0 = svmin_m (p0, z1, x0)) + +/* +** min_1_u8_m_tied1: +** mov (z[0-9]+\.b), #1 +** umin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u8_m_tied1, svuint8_t, + z0 = svmin_n_u8_m (p0, z0, 1), + z0 = svmin_m (p0, z0, 1)) + +/* +** min_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** umin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u8_m_untied, svuint8_t, + z0 = svmin_n_u8_m (p0, z1, 1), + z0 = svmin_m (p0, z1, 1)) + +/* +** min_m1_u8_m: +** mov (z[0-9]+\.b), #-1 +** umin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (min_m1_u8_m, svuint8_t, + z0 = svmin_n_u8_m (p0, z0, -1), + z0 = svmin_m (p0, z0, -1)) + +/* +** min_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** umin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_u8_z_tied1, svuint8_t, + z0 = svmin_u8_z (p0, z0, z1), + z0 = svmin_z (p0, z0, z1)) + +/* +** min_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** umin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_u8_z_tied2, svuint8_t, + z0 = svmin_u8_z (p0, z1, z0), + z0 = svmin_z (p0, z1, z0)) + +/* +** min_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** umin z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** umin z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (min_u8_z_untied, svuint8_t, + z0 = svmin_u8_z (p0, z1, z2), + z0 = svmin_z (p0, z1, z2)) + +/* +** min_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** umin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svmin_n_u8_z (p0, z0, x0), + z0 = svmin_z (p0, z0, x0)) + +/* +** min_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** umin z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** umin z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svmin_n_u8_z (p0, z1, x0), + z0 = svmin_z (p0, z1, x0)) + +/* +** min_1_u8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** umin z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u8_z_tied1, svuint8_t, + z0 = svmin_n_u8_z (p0, z0, 1), + z0 = svmin_z (p0, z0, 1)) + +/* +** min_1_u8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** umin z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** umin z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (min_1_u8_z_untied, svuint8_t, + z0 = svmin_n_u8_z (p0, z1, 1), + z0 = svmin_z (p0, z1, 1)) + +/* +** min_u8_x_tied1: +** umin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_u8_x_tied1, svuint8_t, + z0 = svmin_u8_x (p0, z0, z1), + z0 = svmin_x (p0, z0, z1)) + +/* +** min_u8_x_tied2: +** umin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (min_u8_x_tied2, svuint8_t, + z0 = svmin_u8_x (p0, z1, z0), + z0 = svmin_x (p0, z1, z0)) + +/* +** min_u8_x_untied: +** ( +** movprfx z0, z1 +** umin z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** umin z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (min_u8_x_untied, svuint8_t, + z0 = svmin_u8_x (p0, z1, z2), + z0 = svmin_x (p0, z1, z2)) + +/* +** min_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** umin 
z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svmin_n_u8_x (p0, z0, x0), + z0 = svmin_x (p0, z0, x0)) + +/* +** min_w0_u8_x_untied: +** mov z0\.b, w0 +** umin z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (min_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svmin_n_u8_x (p0, z1, x0), + z0 = svmin_x (p0, z1, x0)) + +/* +** min_1_u8_x_tied1: +** umin z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u8_x_tied1, svuint8_t, + z0 = svmin_n_u8_x (p0, z0, 1), + z0 = svmin_x (p0, z0, 1)) + +/* +** min_1_u8_x_untied: +** movprfx z0, z1 +** umin z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (min_1_u8_x_untied, svuint8_t, + z0 = svmin_n_u8_x (p0, z1, 1), + z0 = svmin_x (p0, z1, 1)) + +/* +** min_127_u8_x: +** umin z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (min_127_u8_x, svuint8_t, + z0 = svmin_n_u8_x (p0, z0, 127), + z0 = svmin_x (p0, z0, 127)) + +/* +** min_128_u8_x: +** umin z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (min_128_u8_x, svuint8_t, + z0 = svmin_n_u8_x (p0, z0, 128), + z0 = svmin_x (p0, z0, 128)) + +/* +** min_254_u8_x: +** umin z0\.b, z0\.b, #254 +** ret +*/ +TEST_UNIFORM_Z (min_254_u8_x, svuint8_t, + z0 = svmin_n_u8_x (p0, z0, 254), + z0 = svmin_x (p0, z0, 254)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f16.c new file mode 100644 index 000000000..43caaa14e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f16.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minnm_f16_m_tied1: +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_m_tied1, svfloat16_t, + z0 = svminnm_f16_m (p0, z0, z1), + z0 = svminnm_m (p0, z0, z1)) + +/* +** minnm_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fminnm z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_m_tied2, svfloat16_t, + z0 = svminnm_f16_m (p0, z1, z0), + z0 = svminnm_m (p0, z1, z0)) + +/* +** minnm_f16_m_untied: +** movprfx z0, z1 +** fminnm z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_m_untied, svfloat16_t, + z0 = svminnm_f16_m (p0, z1, z2), + z0 = svminnm_m (p0, z1, z2)) + +/* +** minnm_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fminnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svminnm_n_f16_m (p0, z0, d4), + z0 = svminnm_m (p0, z0, d4)) + +/* +** minnm_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fminnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svminnm_n_f16_m (p0, z1, d4), + z0 = svminnm_m (p0, z1, d4)) + +/* +** minnm_0_f16_m_tied1: +** fminnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f16_m_tied1, svfloat16_t, + z0 = svminnm_n_f16_m (p0, z0, 0), + z0 = svminnm_m (p0, z0, 0)) + +/* +** minnm_0_f16_m_untied: +** movprfx z0, z1 +** fminnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f16_m_untied, svfloat16_t, + z0 = svminnm_n_f16_m (p0, z1, 0), + z0 = svminnm_m (p0, z1, 0)) + +/* +** minnm_1_f16_m_tied1: +** fminnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f16_m_tied1, svfloat16_t, + z0 = svminnm_n_f16_m (p0, z0, 1), + z0 = svminnm_m (p0, z0, 1)) + +/* +** minnm_1_f16_m_untied: +** movprfx z0, z1 +** fminnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f16_m_untied, svfloat16_t, + z0 = svminnm_n_f16_m (p0, z1, 
1), + z0 = svminnm_m (p0, z1, 1)) + +/* +** minnm_2_f16_m: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fminnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f16_m, svfloat16_t, + z0 = svminnm_n_f16_m (p0, z0, 2), + z0 = svminnm_m (p0, z0, 2)) + +/* +** minnm_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_z_tied1, svfloat16_t, + z0 = svminnm_f16_z (p0, z0, z1), + z0 = svminnm_z (p0, z0, z1)) + +/* +** minnm_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_z_tied2, svfloat16_t, + z0 = svminnm_f16_z (p0, z1, z0), + z0 = svminnm_z (p0, z1, z0)) + +/* +** minnm_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fminnm z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_z_untied, svfloat16_t, + z0 = svminnm_f16_z (p0, z1, z2), + z0 = svminnm_z (p0, z1, z2)) + +/* +** minnm_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fminnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svminnm_n_f16_z (p0, z0, d4), + z0 = svminnm_z (p0, z0, d4)) + +/* +** minnm_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fminnm z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (minnm_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svminnm_n_f16_z (p0, z1, d4), + z0 = svminnm_z (p0, z1, d4)) + +/* +** minnm_0_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fminnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f16_z_tied1, svfloat16_t, + z0 = svminnm_n_f16_z (p0, z0, 0), + z0 = svminnm_z (p0, z0, 0)) + +/* +** minnm_0_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fminnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f16_z_untied, svfloat16_t, + z0 = svminnm_n_f16_z (p0, z1, 0), + z0 = svminnm_z (p0, z1, 0)) + +/* +** minnm_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fminnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f16_z_tied1, svfloat16_t, + z0 = svminnm_n_f16_z (p0, z0, 1), + z0 = svminnm_z (p0, z0, 1)) + +/* +** minnm_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fminnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f16_z_untied, svfloat16_t, + z0 = svminnm_n_f16_z (p0, z1, 1), + z0 = svminnm_z (p0, z1, 1)) + +/* +** minnm_2_f16_z: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fminnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f16_z, svfloat16_t, + z0 = svminnm_n_f16_z (p0, z0, 2), + z0 = svminnm_z (p0, z0, 2)) + +/* +** minnm_f16_x_tied1: +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_x_tied1, svfloat16_t, + z0 = svminnm_f16_x (p0, z0, z1), + z0 = svminnm_x (p0, z0, z1)) + +/* +** minnm_f16_x_tied2: +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_x_tied2, svfloat16_t, + z0 = svminnm_f16_x (p0, z1, z0), + z0 = svminnm_x (p0, z1, z0)) + +/* +** minnm_f16_x_untied: +** ( +** movprfx z0, z1 +** fminnm z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (minnm_f16_x_untied, svfloat16_t, + z0 = svminnm_f16_x (p0, z1, z2), + z0 = svminnm_x (p0, z1, z2)) + +/* +** minnm_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fminnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svminnm_n_f16_x (p0, z0, d4), + z0 = svminnm_x (p0, z0, d4)) + +/* +** minnm_h4_f16_x_untied: +** mov z0\.h, h4 +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (minnm_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svminnm_n_f16_x (p0, z1, d4), + z0 = svminnm_x (p0, z1, d4)) + +/* +** minnm_0_f16_x_tied1: +** fminnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f16_x_tied1, svfloat16_t, + z0 = svminnm_n_f16_x (p0, z0, 0), + z0 = svminnm_x (p0, z0, 0)) + +/* +** minnm_0_f16_x_untied: +** movprfx z0, z1 +** fminnm z0\.h, p0/m, z0\.h, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f16_x_untied, svfloat16_t, + z0 = svminnm_n_f16_x (p0, z1, 0), + z0 = svminnm_x (p0, z1, 0)) + +/* +** minnm_1_f16_x_tied1: +** fminnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f16_x_tied1, svfloat16_t, + z0 = svminnm_n_f16_x (p0, z0, 1), + z0 = svminnm_x (p0, z0, 1)) + +/* +** minnm_1_f16_x_untied: +** movprfx z0, z1 +** fminnm z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f16_x_untied, svfloat16_t, + z0 = svminnm_n_f16_x (p0, z1, 1), + z0 = svminnm_x (p0, z1, 1)) + +/* +** minnm_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fminnm z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f16_x_tied1, svfloat16_t, + z0 = svminnm_n_f16_x (p0, z0, 2), + z0 = svminnm_x (p0, z0, 2)) + +/* +** minnm_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fminnm z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f16_x_untied, svfloat16_t, + z0 = svminnm_n_f16_x (p0, z1, 2), + z0 = svminnm_x (p0, z1, 2)) + +/* +** ptrue_minnm_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f16_x_tied1, svfloat16_t, + z0 = svminnm_f16_x (svptrue_b16 (), z0, z1), + z0 = svminnm_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_minnm_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f16_x_tied2, svfloat16_t, + z0 = svminnm_f16_x (svptrue_b16 (), z1, z0), + z0 = svminnm_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_minnm_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f16_x_untied, svfloat16_t, + z0 = svminnm_f16_x (svptrue_b16 (), z1, z2), + z0 = svminnm_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_minnm_0_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_0_f16_x_tied1, svfloat16_t, + z0 = svminnm_n_f16_x (svptrue_b16 (), z0, 0), + z0 = svminnm_x (svptrue_b16 (), z0, 0)) + +/* +** ptrue_minnm_0_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_0_f16_x_untied, svfloat16_t, + z0 = svminnm_n_f16_x (svptrue_b16 (), z1, 0), + z0 = svminnm_x (svptrue_b16 (), z1, 0)) + +/* +** ptrue_minnm_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_1_f16_x_tied1, svfloat16_t, + z0 = svminnm_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svminnm_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_minnm_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_1_f16_x_untied, svfloat16_t, + z0 = svminnm_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svminnm_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_minnm_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_2_f16_x_tied1, svfloat16_t, + z0 = svminnm_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svminnm_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_minnm_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_2_f16_x_untied, svfloat16_t, + z0 = svminnm_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svminnm_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f32.c new file mode 100644 index 000000000..4fac8e8ad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f32.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minnm_f32_m_tied1: +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_m_tied1, svfloat32_t, + z0 = svminnm_f32_m (p0, z0, z1), + z0 = svminnm_m (p0, z0, z1)) + +/* +** minnm_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fminnm z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_m_tied2, svfloat32_t, + z0 = svminnm_f32_m (p0, z1, z0), + z0 = svminnm_m (p0, z1, z0)) + +/* +** minnm_f32_m_untied: +** movprfx z0, z1 +** fminnm z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_m_untied, svfloat32_t, + z0 = svminnm_f32_m (p0, z1, z2), + z0 = svminnm_m (p0, z1, z2)) + +/* +** minnm_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fminnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_s4_f32_m_tied1, svfloat32_t, float, + z0 = svminnm_n_f32_m (p0, z0, d4), + z0 = svminnm_m (p0, z0, d4)) + +/* +** minnm_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fminnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_s4_f32_m_untied, svfloat32_t, float, + z0 = svminnm_n_f32_m (p0, z1, d4), + z0 = svminnm_m (p0, z1, d4)) + +/* +** minnm_0_f32_m_tied1: +** fminnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f32_m_tied1, svfloat32_t, + z0 = svminnm_n_f32_m (p0, z0, 0), + z0 = svminnm_m (p0, z0, 0)) + +/* +** minnm_0_f32_m_untied: +** movprfx z0, z1 +** fminnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f32_m_untied, svfloat32_t, + z0 = svminnm_n_f32_m (p0, z1, 0), + z0 = svminnm_m (p0, z1, 0)) + +/* +** minnm_1_f32_m_tied1: +** fminnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f32_m_tied1, svfloat32_t, + z0 = svminnm_n_f32_m (p0, z0, 1), + z0 = svminnm_m (p0, z0, 1)) + +/* +** minnm_1_f32_m_untied: +** movprfx z0, z1 +** fminnm z0\.s, p0/m, 
z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f32_m_untied, svfloat32_t, + z0 = svminnm_n_f32_m (p0, z1, 1), + z0 = svminnm_m (p0, z1, 1)) + +/* +** minnm_2_f32_m: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fminnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f32_m, svfloat32_t, + z0 = svminnm_n_f32_m (p0, z0, 2), + z0 = svminnm_m (p0, z0, 2)) + +/* +** minnm_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_z_tied1, svfloat32_t, + z0 = svminnm_f32_z (p0, z0, z1), + z0 = svminnm_z (p0, z0, z1)) + +/* +** minnm_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_z_tied2, svfloat32_t, + z0 = svminnm_f32_z (p0, z1, z0), + z0 = svminnm_z (p0, z1, z0)) + +/* +** minnm_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fminnm z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_z_untied, svfloat32_t, + z0 = svminnm_f32_z (p0, z1, z2), + z0 = svminnm_z (p0, z1, z2)) + +/* +** minnm_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fminnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_s4_f32_z_tied1, svfloat32_t, float, + z0 = svminnm_n_f32_z (p0, z0, d4), + z0 = svminnm_z (p0, z0, d4)) + +/* +** minnm_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fminnm z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (minnm_s4_f32_z_untied, svfloat32_t, float, + z0 = svminnm_n_f32_z (p0, z1, d4), + z0 = svminnm_z (p0, z1, d4)) + +/* +** minnm_0_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fminnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f32_z_tied1, svfloat32_t, + z0 = svminnm_n_f32_z (p0, z0, 0), + z0 = svminnm_z (p0, z0, 0)) + +/* +** minnm_0_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fminnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f32_z_untied, svfloat32_t, + z0 = svminnm_n_f32_z (p0, z1, 0), + z0 = svminnm_z (p0, z1, 0)) + +/* +** minnm_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fminnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f32_z_tied1, svfloat32_t, + z0 = svminnm_n_f32_z (p0, z0, 1), + z0 = svminnm_z (p0, z0, 1)) + +/* +** minnm_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fminnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f32_z_untied, svfloat32_t, + z0 = svminnm_n_f32_z (p0, z1, 1), + z0 = svminnm_z (p0, z1, 1)) + +/* +** minnm_2_f32_z: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fminnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f32_z, svfloat32_t, + z0 = svminnm_n_f32_z (p0, z0, 2), + z0 = svminnm_z (p0, z0, 2)) + +/* +** minnm_f32_x_tied1: +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_x_tied1, svfloat32_t, + z0 = svminnm_f32_x (p0, z0, z1), + z0 = svminnm_x (p0, z0, z1)) + +/* +** minnm_f32_x_tied2: +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_x_tied2, svfloat32_t, + z0 = svminnm_f32_x (p0, z1, z0), + z0 = svminnm_x (p0, z1, z0)) + +/* +** minnm_f32_x_untied: +** ( +** movprfx z0, z1 +** fminnm z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (minnm_f32_x_untied, svfloat32_t, + z0 = svminnm_f32_x (p0, z1, z2), + z0 = svminnm_x (p0, z1, z2)) + +/* +** minnm_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fminnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_s4_f32_x_tied1, svfloat32_t, float, + z0 = svminnm_n_f32_x (p0, z0, d4), + z0 = svminnm_x (p0, z0, d4)) + +/* +** minnm_s4_f32_x_untied: +** mov z0\.s, s4 +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (minnm_s4_f32_x_untied, svfloat32_t, float, + z0 = svminnm_n_f32_x (p0, z1, d4), + z0 = svminnm_x (p0, z1, d4)) + +/* +** minnm_0_f32_x_tied1: +** fminnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f32_x_tied1, svfloat32_t, + z0 = svminnm_n_f32_x (p0, z0, 0), + z0 = svminnm_x (p0, z0, 0)) + +/* +** minnm_0_f32_x_untied: +** movprfx z0, z1 +** fminnm z0\.s, p0/m, z0\.s, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f32_x_untied, svfloat32_t, + z0 = svminnm_n_f32_x (p0, z1, 0), + z0 = svminnm_x (p0, z1, 0)) + +/* +** minnm_1_f32_x_tied1: +** fminnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f32_x_tied1, svfloat32_t, + z0 = svminnm_n_f32_x (p0, z0, 1), + z0 = svminnm_x (p0, z0, 1)) + +/* +** minnm_1_f32_x_untied: +** movprfx z0, z1 +** fminnm z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f32_x_untied, svfloat32_t, + z0 = svminnm_n_f32_x (p0, z1, 1), + z0 = svminnm_x (p0, z1, 1)) + +/* +** minnm_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fminnm z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f32_x_tied1, svfloat32_t, + z0 = svminnm_n_f32_x (p0, z0, 2), + z0 = svminnm_x (p0, z0, 2)) + +/* +** minnm_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fminnm z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f32_x_untied, svfloat32_t, + z0 = svminnm_n_f32_x (p0, z1, 2), + z0 = svminnm_x (p0, z1, 2)) + +/* +** ptrue_minnm_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f32_x_tied1, svfloat32_t, + z0 = svminnm_f32_x (svptrue_b32 (), z0, z1), + z0 = svminnm_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_minnm_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f32_x_tied2, svfloat32_t, + z0 = svminnm_f32_x (svptrue_b32 (), z1, z0), + z0 = svminnm_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_minnm_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f32_x_untied, svfloat32_t, + z0 = svminnm_f32_x (svptrue_b32 (), z1, z2), + z0 = svminnm_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_minnm_0_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_0_f32_x_tied1, svfloat32_t, + z0 = svminnm_n_f32_x (svptrue_b32 (), z0, 0), + z0 = svminnm_x (svptrue_b32 (), z0, 0)) + +/* +** ptrue_minnm_0_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_0_f32_x_untied, svfloat32_t, + z0 = svminnm_n_f32_x (svptrue_b32 (), z1, 0), + z0 = svminnm_x (svptrue_b32 (), z1, 0)) + +/* +** ptrue_minnm_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_1_f32_x_tied1, svfloat32_t, + z0 = svminnm_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svminnm_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_minnm_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_1_f32_x_untied, svfloat32_t, + z0 = svminnm_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svminnm_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_minnm_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_2_f32_x_tied1, svfloat32_t, + z0 = svminnm_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svminnm_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_minnm_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_2_f32_x_untied, svfloat32_t, + z0 = svminnm_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svminnm_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f64.c new file mode 100644 index 000000000..67993928f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnm_f64.c @@ -0,0 +1,425 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minnm_f64_m_tied1: +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_m_tied1, svfloat64_t, + z0 = svminnm_f64_m (p0, z0, z1), + z0 = svminnm_m (p0, z0, z1)) + +/* +** minnm_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fminnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_m_tied2, svfloat64_t, + z0 = svminnm_f64_m (p0, z1, z0), + z0 = svminnm_m (p0, z1, z0)) + +/* +** minnm_f64_m_untied: +** movprfx z0, z1 +** fminnm z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_m_untied, svfloat64_t, + z0 = svminnm_f64_m (p0, z1, z2), + z0 = svminnm_m (p0, z1, z2)) + +/* +** minnm_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fminnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_d4_f64_m_tied1, svfloat64_t, double, + z0 = svminnm_n_f64_m (p0, z0, d4), + z0 = svminnm_m (p0, z0, d4)) + +/* +** minnm_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fminnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_d4_f64_m_untied, svfloat64_t, double, + z0 = svminnm_n_f64_m (p0, z1, d4), + z0 = svminnm_m (p0, z1, d4)) + +/* +** minnm_0_f64_m_tied1: +** fminnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f64_m_tied1, svfloat64_t, + z0 = svminnm_n_f64_m (p0, z0, 0), + z0 = svminnm_m (p0, z0, 0)) + +/* +** minnm_0_f64_m_untied: +** movprfx z0, z1 +** fminnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f64_m_untied, svfloat64_t, + z0 = svminnm_n_f64_m (p0, z1, 0), + z0 = svminnm_m (p0, z1, 0)) + +/* +** minnm_1_f64_m_tied1: +** fminnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f64_m_tied1, svfloat64_t, + z0 = svminnm_n_f64_m (p0, z0, 1), + z0 = svminnm_m (p0, z0, 1)) + +/* +** minnm_1_f64_m_untied: +** movprfx z0, z1 +** fminnm z0\.d, p0/m, 
z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f64_m_untied, svfloat64_t, + z0 = svminnm_n_f64_m (p0, z1, 1), + z0 = svminnm_m (p0, z1, 1)) + +/* +** minnm_2_f64_m: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fminnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f64_m, svfloat64_t, + z0 = svminnm_n_f64_m (p0, z0, 2), + z0 = svminnm_m (p0, z0, 2)) + +/* +** minnm_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_z_tied1, svfloat64_t, + z0 = svminnm_f64_z (p0, z0, z1), + z0 = svminnm_z (p0, z0, z1)) + +/* +** minnm_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_z_tied2, svfloat64_t, + z0 = svminnm_f64_z (p0, z1, z0), + z0 = svminnm_z (p0, z1, z0)) + +/* +** minnm_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fminnm z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_z_untied, svfloat64_t, + z0 = svminnm_f64_z (p0, z1, z2), + z0 = svminnm_z (p0, z1, z2)) + +/* +** minnm_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fminnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_d4_f64_z_tied1, svfloat64_t, double, + z0 = svminnm_n_f64_z (p0, z0, d4), + z0 = svminnm_z (p0, z0, d4)) + +/* +** minnm_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fminnm z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (minnm_d4_f64_z_untied, svfloat64_t, double, + z0 = svminnm_n_f64_z (p0, z1, d4), + z0 = svminnm_z (p0, z1, d4)) + +/* +** minnm_0_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fminnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f64_z_tied1, svfloat64_t, + z0 = svminnm_n_f64_z (p0, z0, 0), + z0 = svminnm_z (p0, z0, 0)) + +/* +** minnm_0_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fminnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f64_z_untied, svfloat64_t, + z0 = svminnm_n_f64_z (p0, z1, 0), + z0 = svminnm_z (p0, z1, 0)) + +/* +** minnm_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fminnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f64_z_tied1, svfloat64_t, + z0 = svminnm_n_f64_z (p0, z0, 1), + z0 = svminnm_z (p0, z0, 1)) + +/* +** minnm_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fminnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f64_z_untied, svfloat64_t, + z0 = svminnm_n_f64_z (p0, z1, 1), + z0 = svminnm_z (p0, z1, 1)) + +/* +** minnm_2_f64_z: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fminnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f64_z, svfloat64_t, + z0 = svminnm_n_f64_z (p0, z0, 2), + z0 = svminnm_z (p0, z0, 2)) + +/* +** minnm_f64_x_tied1: +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_x_tied1, svfloat64_t, + z0 = svminnm_f64_x (p0, z0, z1), + z0 = svminnm_x (p0, z0, z1)) + +/* +** minnm_f64_x_tied2: +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_x_tied2, svfloat64_t, + z0 = svminnm_f64_x (p0, z1, z0), + z0 = svminnm_x (p0, z1, z0)) + +/* +** minnm_f64_x_untied: +** ( +** movprfx z0, z1 +** fminnm z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (minnm_f64_x_untied, svfloat64_t, + z0 = svminnm_f64_x (p0, z1, z2), + z0 = svminnm_x (p0, z1, z2)) + +/* +** minnm_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fminnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (minnm_d4_f64_x_tied1, svfloat64_t, double, + z0 = svminnm_n_f64_x (p0, z0, d4), + z0 = svminnm_x (p0, z0, d4)) + +/* +** minnm_d4_f64_x_untied: +** mov z0\.d, d4 +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (minnm_d4_f64_x_untied, svfloat64_t, double, + z0 = svminnm_n_f64_x (p0, z1, d4), + z0 = svminnm_x (p0, z1, d4)) + +/* +** minnm_0_f64_x_tied1: +** fminnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f64_x_tied1, svfloat64_t, + z0 = svminnm_n_f64_x (p0, z0, 0), + z0 = svminnm_x (p0, z0, 0)) + +/* +** minnm_0_f64_x_untied: +** movprfx z0, z1 +** fminnm z0\.d, p0/m, z0\.d, #0\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_0_f64_x_untied, svfloat64_t, + z0 = svminnm_n_f64_x (p0, z1, 0), + z0 = svminnm_x (p0, z1, 0)) + +/* +** minnm_1_f64_x_tied1: +** fminnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f64_x_tied1, svfloat64_t, + z0 = svminnm_n_f64_x (p0, z0, 1), + z0 = svminnm_x (p0, z0, 1)) + +/* +** minnm_1_f64_x_untied: +** movprfx z0, z1 +** fminnm z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (minnm_1_f64_x_untied, svfloat64_t, + z0 = svminnm_n_f64_x (p0, z1, 1), + z0 = svminnm_x (p0, z1, 1)) + +/* +** minnm_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fminnm z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f64_x_tied1, svfloat64_t, + z0 = svminnm_n_f64_x (p0, z0, 2), + z0 = svminnm_x (p0, z0, 2)) + +/* +** minnm_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fminnm z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (minnm_2_f64_x_untied, svfloat64_t, + z0 = svminnm_n_f64_x (p0, z1, 2), + z0 = svminnm_x (p0, z1, 2)) + +/* +** ptrue_minnm_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f64_x_tied1, svfloat64_t, + z0 = svminnm_f64_x (svptrue_b64 (), z0, z1), + z0 = svminnm_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_minnm_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f64_x_tied2, svfloat64_t, + z0 = svminnm_f64_x (svptrue_b64 (), z1, z0), + z0 = svminnm_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_minnm_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_f64_x_untied, svfloat64_t, + z0 = svminnm_f64_x (svptrue_b64 (), z1, z2), + z0 = svminnm_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_minnm_0_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_0_f64_x_tied1, svfloat64_t, + z0 = svminnm_n_f64_x (svptrue_b64 (), z0, 0), + z0 = svminnm_x (svptrue_b64 (), z0, 0)) + +/* +** ptrue_minnm_0_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_0_f64_x_untied, svfloat64_t, + z0 = svminnm_n_f64_x (svptrue_b64 (), z1, 0), + z0 = svminnm_x (svptrue_b64 (), z1, 0)) + +/* +** ptrue_minnm_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_1_f64_x_tied1, svfloat64_t, + z0 = svminnm_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svminnm_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_minnm_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_1_f64_x_untied, svfloat64_t, + z0 = svminnm_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svminnm_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_minnm_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_2_f64_x_tied1, svfloat64_t, + z0 = svminnm_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svminnm_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_minnm_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_minnm_2_f64_x_untied, svfloat64_t, + z0 = svminnm_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svminnm_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f16.c new file mode 100644 index 000000000..827f41bfe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minnmv_d0_f16_tied: +** fminnmv h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (minnmv_d0_f16_tied, float16_t, svfloat16_t, + d0 = svminnmv_f16 (p0, z0), + d0 = svminnmv (p0, z0)) + +/* +** minnmv_d0_f16_untied: +** fminnmv h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (minnmv_d0_f16_untied, float16_t, svfloat16_t, + d0 = svminnmv_f16 (p0, z1), + d0 = svminnmv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f32.c new file mode 100644 index 000000000..2352ec2a3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minnmv_d0_f32_tied: +** fminnmv s0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_D (minnmv_d0_f32_tied, float32_t, svfloat32_t, + d0 = svminnmv_f32 (p0, z0), + d0 = svminnmv (p0, z0)) + +/* +** minnmv_d0_f32_untied: +** fminnmv s0, p0, z1\.s +** ret +*/ +TEST_REDUCTION_D (minnmv_d0_f32_untied, float32_t, svfloat32_t, + d0 = svminnmv_f32 (p0, z1), + d0 = svminnmv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f64.c new file mode 100644 index 000000000..3d769a3d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minnmv_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minnmv_d0_f64_tied: +** fminnmv d0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_D (minnmv_d0_f64_tied, float64_t, svfloat64_t, + d0 = svminnmv_f64 (p0, z0), + d0 = svminnmv (p0, z0)) + +/* +** minnmv_d0_f64_untied: +** fminnmv d0, p0, z1\.d +** ret +*/ +TEST_REDUCTION_D (minnmv_d0_f64_untied, float64_t, 
svfloat64_t, + d0 = svminnmv_f64 (p0, z1), + d0 = svminnmv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f16.c new file mode 100644 index 000000000..190aa16e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_d0_f16_tied: +** fminv h0, p0, z0\.h +** ret +*/ +TEST_REDUCTION_D (minv_d0_f16_tied, float16_t, svfloat16_t, + d0 = svminv_f16 (p0, z0), + d0 = svminv (p0, z0)) + +/* +** minv_d0_f16_untied: +** fminv h0, p0, z1\.h +** ret +*/ +TEST_REDUCTION_D (minv_d0_f16_untied, float16_t, svfloat16_t, + d0 = svminv_f16 (p0, z1), + d0 = svminv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f32.c new file mode 100644 index 000000000..07871b893 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_d0_f32_tied: +** fminv s0, p0, z0\.s +** ret +*/ +TEST_REDUCTION_D (minv_d0_f32_tied, float32_t, svfloat32_t, + d0 = svminv_f32 (p0, z0), + d0 = svminv (p0, z0)) + +/* +** minv_d0_f32_untied: +** fminv s0, p0, z1\.s +** ret +*/ +TEST_REDUCTION_D (minv_d0_f32_untied, float32_t, svfloat32_t, + d0 = svminv_f32 (p0, z1), + d0 = svminv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f64.c new file mode 100644 index 000000000..7435f306f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_d0_f64_tied: +** fminv d0, p0, z0\.d +** ret +*/ +TEST_REDUCTION_D (minv_d0_f64_tied, float64_t, svfloat64_t, + d0 = svminv_f64 (p0, z0), + d0 = svminv (p0, z0)) + +/* +** minv_d0_f64_untied: +** fminv d0, p0, z1\.d +** ret +*/ +TEST_REDUCTION_D (minv_d0_f64_untied, float64_t, svfloat64_t, + d0 = svminv_f64 (p0, z1), + d0 = svminv (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s16.c new file mode 100644 index 000000000..dfb66a9f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_x0_s16: +** sminv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (minv_x0_s16, int16_t, svint16_t, + x0 = svminv_s16 (p0, z0), + x0 = svminv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s32.c new file mode 100644 index 000000000..c02df5dd3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_x0_s32: +** sminv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (minv_x0_s32, int32_t, svint32_t, + x0 = svminv_s32 (p0, z0), + x0 = svminv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s64.c new file mode 100644 index 000000000..784973231 --- 
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_x0_s64: +** sminv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (minv_x0_s64, int64_t, svint64_t, + x0 = svminv_s64 (p0, z0), + x0 = svminv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s8.c new file mode 100644 index 000000000..0b1bce5de --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_s8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_x0_s8: +** sminv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (minv_x0_s8, int8_t, svint8_t, + x0 = svminv_s8 (p0, z0), + x0 = svminv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u16.c new file mode 100644 index 000000000..b499de33e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_x0_u16: +** uminv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (minv_x0_u16, uint16_t, svuint16_t, + x0 = svminv_u16 (p0, z0), + x0 = svminv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u32.c new file mode 100644 index 000000000..18c9d8c6d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_x0_u32: +** uminv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (minv_x0_u32, uint32_t, svuint32_t, + x0 = svminv_u32 (p0, z0), + x0 = svminv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u64.c new file mode 100644 index 000000000..374d5e426 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_x0_u64: +** uminv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (minv_x0_u64, uint64_t, svuint64_t, + x0 = svminv_u64 (p0, z0), + x0 = svminv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u8.c new file mode 100644 index 000000000..d9f6f5835 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/minv_u8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** minv_x0_u8: +** uminv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (minv_x0_u8, uint8_t, svuint8_t, + x0 = svminv_u8 (p0, z0), + x0 = svminv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f16.c new file mode 100644 index 000000000..f22a582ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f16.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** 
mla_f16_m_tied1: +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_m_tied1, svfloat16_t, + z0 = svmla_f16_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmla z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_m_tied2, svfloat16_t, + z0 = svmla_f16_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmla z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_m_tied3, svfloat16_t, + z0 = svmla_f16_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_f16_m_untied: +** movprfx z0, z1 +** fmla z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_m_untied, svfloat16_t, + z0 = svmla_f16_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmla_n_f16_m (p0, z0, z1, d4), + z0 = svmla_m (p0, z0, z1, d4)) + +/* +** mla_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fmla z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmla_n_f16_m (p0, z1, z2, d4), + z0 = svmla_m (p0, z1, z2, d4)) + +/* +** mla_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f16_m_tied1, svfloat16_t, + z0 = svmla_n_f16_m (p0, z0, z1, 2), + z0 = svmla_m (p0, z0, z1, 2)) + +/* +** mla_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0, z1 +** fmla z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f16_m_untied, svfloat16_t, + z0 = svmla_n_f16_m (p0, z1, z2, 2), + z0 = svmla_m (p0, z1, z2, 2)) + +/* +** mla_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_z_tied1, svfloat16_t, + z0 = svmla_f16_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_z_tied2, svfloat16_t, + z0 = svmla_f16_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_f16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_z_tied3, svfloat16_t, + z0 = svmla_f16_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmla z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmad z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_f16_z_untied, svfloat16_t, + z0 = svmla_f16_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmla_n_f16_z (p0, z0, z1, d4), + z0 = svmla_z (p0, z0, z1, d4)) + +/* +** mla_h4_f16_z_tied2: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (mla_h4_f16_z_tied2, svfloat16_t, __fp16, + z0 = svmla_n_f16_z (p0, z1, z0, d4), + z0 = svmla_z (p0, z1, z0, d4)) + +/* +** mla_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmla z0\.h, p0/m, z2\.h, \1 +** | +** 
movprfx z0\.h, p0/z, z2\.h +** fmad z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (mla_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmla_n_f16_z (p0, z1, z2, d4), + z0 = svmla_z (p0, z1, z2, d4)) + +/* +** mla_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f16_z_tied1, svfloat16_t, + z0 = svmla_n_f16_z (p0, z0, z1, 2), + z0 = svmla_z (p0, z0, z1, 2)) + +/* +** mla_2_f16_z_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_2_f16_z_tied2, svfloat16_t, + z0 = svmla_n_f16_z (p0, z1, z0, 2), + z0 = svmla_z (p0, z1, z0, 2)) + +/* +** mla_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmla z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fmad z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_2_f16_z_untied, svfloat16_t, + z0 = svmla_n_f16_z (p0, z1, z2, 2), + z0 = svmla_z (p0, z1, z2, 2)) + +/* +** mla_f16_x_tied1: +** fmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_x_tied1, svfloat16_t, + z0 = svmla_f16_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_f16_x_tied2: +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_x_tied2, svfloat16_t, + z0 = svmla_f16_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_f16_x_tied3: +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_f16_x_tied3, svfloat16_t, + z0 = svmla_f16_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_f16_x_untied: +** ( +** movprfx z0, z1 +** fmla z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** fmad z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0, z3 +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_f16_x_untied, svfloat16_t, + z0 = svmla_f16_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmla_n_f16_x (p0, z0, z1, d4), + z0 = svmla_x (p0, z0, z1, d4)) + +/* +** mla_h4_f16_x_tied2: +** mov (z[0-9]+\.h), h4 +** fmad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (mla_h4_f16_x_tied2, svfloat16_t, __fp16, + z0 = svmla_n_f16_x (p0, z1, z0, d4), + z0 = svmla_x (p0, z1, z0, d4)) + +/* +** mla_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (mla_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmla_n_f16_x (p0, z1, z2, d4), + z0 = svmla_x (p0, z1, z2, d4)) + +/* +** mla_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f16_x_tied1, svfloat16_t, + z0 = svmla_n_f16_x (p0, z0, z1, 2), + z0 = svmla_x (p0, z0, z1, 2)) + +/* +** mla_2_f16_x_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_2_f16_x_tied2, svfloat16_t, + z0 = svmla_n_f16_x (p0, z1, z0, 2), + z0 = svmla_x (p0, z1, z0, 2)) + +/* +** mla_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? 
+** fmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_2_f16_x_untied, svfloat16_t, + z0 = svmla_n_f16_x (p0, z1, z2, 2), + z0 = svmla_x (p0, z1, z2, 2)) + +/* +** ptrue_mla_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f16_x_tied1, svfloat16_t, + z0 = svmla_f16_x (svptrue_b16 (), z0, z1, z2), + z0 = svmla_x (svptrue_b16 (), z0, z1, z2)) + +/* +** ptrue_mla_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f16_x_tied2, svfloat16_t, + z0 = svmla_f16_x (svptrue_b16 (), z1, z0, z2), + z0 = svmla_x (svptrue_b16 (), z1, z0, z2)) + +/* +** ptrue_mla_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f16_x_tied3, svfloat16_t, + z0 = svmla_f16_x (svptrue_b16 (), z1, z2, z0), + z0 = svmla_x (svptrue_b16 (), z1, z2, z0)) + +/* +** ptrue_mla_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f16_x_untied, svfloat16_t, + z0 = svmla_f16_x (svptrue_b16 (), z1, z2, z3), + z0 = svmla_x (svptrue_b16 (), z1, z2, z3)) + +/* +** ptrue_mla_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f16_x_tied1, svfloat16_t, + z0 = svmla_n_f16_x (svptrue_b16 (), z0, z1, 2), + z0 = svmla_x (svptrue_b16 (), z0, z1, 2)) + +/* +** ptrue_mla_2_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f16_x_tied2, svfloat16_t, + z0 = svmla_n_f16_x (svptrue_b16 (), z1, z0, 2), + z0 = svmla_x (svptrue_b16 (), z1, z0, 2)) + +/* +** ptrue_mla_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f16_x_untied, svfloat16_t, + z0 = svmla_n_f16_x (svptrue_b16 (), z1, z2, 2), + z0 = svmla_x (svptrue_b16 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f32.c new file mode 100644 index 000000000..1d95eb0a7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f32.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_f32_m_tied1: +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_m_tied1, svfloat32_t, + z0 = svmla_f32_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmla z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_m_tied2, svfloat32_t, + z0 = svmla_f32_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmla z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_m_tied3, svfloat32_t, + z0 = svmla_f32_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_f32_m_untied: +** movprfx z0, z1 +** fmla z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_m_untied, svfloat32_t, + z0 = svmla_f32_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmla_n_f32_m (p0, z0, z1, d4), + z0 = svmla_m (p0, z0, z1, d4)) + +/* +** mla_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmla z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_s4_f32_m_untied, svfloat32_t, float, + z0 = svmla_n_f32_m (p0, z1, z2, d4), + z0 = svmla_m (p0, z1, z2, d4)) + 
+/* +** mla_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f32_m_tied1, svfloat32_t, + z0 = svmla_n_f32_m (p0, z0, z1, 2), + z0 = svmla_m (p0, z0, z1, 2)) + +/* +** mla_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0, z1 +** fmla z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f32_m_untied, svfloat32_t, + z0 = svmla_n_f32_m (p0, z1, z2, 2), + z0 = svmla_m (p0, z1, z2, 2)) + +/* +** mla_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_z_tied1, svfloat32_t, + z0 = svmla_f32_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_z_tied2, svfloat32_t, + z0 = svmla_f32_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_f32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_z_tied3, svfloat32_t, + z0 = svmla_f32_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmla z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmad z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_f32_z_untied, svfloat32_t, + z0 = svmla_f32_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmla_n_f32_z (p0, z0, z1, d4), + z0 = svmla_z (p0, z0, z1, d4)) + +/* +** mla_s4_f32_z_tied2: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (mla_s4_f32_z_tied2, svfloat32_t, float, + z0 = svmla_n_f32_z (p0, z1, z0, d4), + z0 = svmla_z (p0, z1, z0, d4)) + +/* +** mla_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmla z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fmad z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (mla_s4_f32_z_untied, svfloat32_t, float, + z0 = svmla_n_f32_z (p0, z1, z2, d4), + z0 = svmla_z (p0, z1, z2, d4)) + +/* +** mla_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f32_z_tied1, svfloat32_t, + z0 = svmla_n_f32_z (p0, z0, z1, 2), + z0 = svmla_z (p0, z0, z1, 2)) + +/* +** mla_2_f32_z_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_2_f32_z_tied2, svfloat32_t, + z0 = svmla_n_f32_z (p0, z1, z0, 2), + z0 = svmla_z (p0, z1, z0, 2)) + +/* +** mla_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fmla z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fmad z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_2_f32_z_untied, svfloat32_t, + z0 = svmla_n_f32_z (p0, z1, z2, 2), + z0 = svmla_z (p0, z1, z2, 2)) + +/* +** mla_f32_x_tied1: +** fmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_x_tied1, svfloat32_t, + z0 = svmla_f32_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_f32_x_tied2: +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_x_tied2, svfloat32_t, + z0 = svmla_f32_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_f32_x_tied3: +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_f32_x_tied3, svfloat32_t, + z0 = svmla_f32_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_f32_x_untied: +** ( +** movprfx z0, z1 +** fmla z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** fmad z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0, z3 +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_f32_x_untied, svfloat32_t, + z0 = svmla_f32_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmla_n_f32_x (p0, z0, z1, d4), + z0 = svmla_x (p0, z0, z1, d4)) + +/* +** mla_s4_f32_x_tied2: +** mov (z[0-9]+\.s), s4 +** fmad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (mla_s4_f32_x_tied2, svfloat32_t, float, + z0 = svmla_n_f32_x (p0, z1, z0, d4), + z0 = svmla_x (p0, z1, z0, d4)) + +/* +** mla_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (mla_s4_f32_x_untied, svfloat32_t, float, + z0 = svmla_n_f32_x (p0, z1, z2, d4), + z0 = svmla_x (p0, z1, z2, d4)) + +/* +** mla_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f32_x_tied1, svfloat32_t, + z0 = svmla_n_f32_x (p0, z0, z1, 2), + z0 = svmla_x (p0, z0, z1, 2)) + +/* +** mla_2_f32_x_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_2_f32_x_tied2, svfloat32_t, + z0 = svmla_n_f32_x (p0, z1, z0, 2), + z0 = svmla_x (p0, z1, z0, 2)) + +/* +** mla_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_2_f32_x_untied, svfloat32_t, + z0 = svmla_n_f32_x (p0, z1, z2, 2), + z0 = svmla_x (p0, z1, z2, 2)) + +/* +** ptrue_mla_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f32_x_tied1, svfloat32_t, + z0 = svmla_f32_x (svptrue_b32 (), z0, z1, z2), + z0 = svmla_x (svptrue_b32 (), z0, z1, z2)) + +/* +** ptrue_mla_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f32_x_tied2, svfloat32_t, + z0 = svmla_f32_x (svptrue_b32 (), z1, z0, z2), + z0 = svmla_x (svptrue_b32 (), z1, z0, z2)) + +/* +** ptrue_mla_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f32_x_tied3, svfloat32_t, + z0 = svmla_f32_x (svptrue_b32 (), z1, z2, z0), + z0 = svmla_x (svptrue_b32 (), z1, z2, z0)) + +/* +** ptrue_mla_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f32_x_untied, svfloat32_t, + z0 = svmla_f32_x (svptrue_b32 (), z1, z2, z3), + z0 = svmla_x (svptrue_b32 (), z1, z2, z3)) + +/* +** ptrue_mla_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f32_x_tied1, svfloat32_t, + z0 = svmla_n_f32_x (svptrue_b32 (), z0, z1, 2), + z0 = svmla_x (svptrue_b32 (), z0, z1, 2)) + +/* +** ptrue_mla_2_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f32_x_tied2, svfloat32_t, + z0 = svmla_n_f32_x (svptrue_b32 (), z1, z0, 2), + z0 = svmla_x (svptrue_b32 (), z1, z0, 2)) + +/* +** ptrue_mla_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f32_x_untied, svfloat32_t, + z0 = svmla_n_f32_x (svptrue_b32 (), z1, z2, 2), + z0 = svmla_x (svptrue_b32 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f64.c new file mode 100644 index 000000000..74fd29267 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_f64.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_f64_m_tied1: +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_m_tied1, svfloat64_t, + z0 = svmla_f64_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmla z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_m_tied2, svfloat64_t, + z0 = svmla_f64_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_f64_m_tied3, svfloat64_t, + z0 = svmla_f64_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_f64_m_untied: +** movprfx z0, z1 +** fmla z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_m_untied, svfloat64_t, + z0 = svmla_f64_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmla_n_f64_m (p0, z0, z1, d4), + z0 = svmla_m (p0, z0, z1, d4)) + +/* +** mla_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_d4_f64_m_untied, svfloat64_t, double, + z0 = svmla_n_f64_m (p0, z1, z2, d4), + z0 = svmla_m (p0, z1, z2, d4)) + +/* +** mla_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f64_m_tied1, svfloat64_t, + z0 = svmla_n_f64_m (p0, z0, z1, 2), + z0 = svmla_m (p0, z0, z1, 2)) + +/* +** mla_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f64_m_untied, svfloat64_t, + z0 = svmla_n_f64_m (p0, z1, z2, 2), + z0 = svmla_m (p0, z1, z2, 2)) + +/* +** mla_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_z_tied1, svfloat64_t, + z0 = svmla_f64_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_z_tied2, svfloat64_t, + z0 = svmla_f64_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_f64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_z_tied3, svfloat64_t, + z0 = svmla_f64_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmla z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmad z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_f64_z_untied, svfloat64_t, + z0 = svmla_f64_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmla_n_f64_z (p0, z0, z1, d4), + z0 = svmla_z (p0, z0, z1, d4)) + +/* +** mla_d4_f64_z_tied2: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (mla_d4_f64_z_tied2, svfloat64_t, double, + z0 = svmla_n_f64_z (p0, z1, z0, d4), + z0 = svmla_z (p0, z1, z0, d4)) + +/* +** mla_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmla z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fmad z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (mla_d4_f64_z_untied, svfloat64_t, double, + z0 = svmla_n_f64_z (p0, z1, z2, d4), + z0 = svmla_z (p0, z1, z2, d4)) + +/* +** mla_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f64_z_tied1, svfloat64_t, + z0 = svmla_n_f64_z (p0, z0, z1, 2), + z0 = svmla_z (p0, z0, z1, 2)) + +/* +** mla_2_f64_z_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_2_f64_z_tied2, svfloat64_t, + z0 = svmla_n_f64_z (p0, z1, z0, 2), + z0 = svmla_z (p0, z1, z0, 2)) + +/* +** mla_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fmla z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fmad z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_2_f64_z_untied, svfloat64_t, + z0 = svmla_n_f64_z (p0, z1, z2, 2), + z0 = svmla_z (p0, z1, z2, 2)) + +/* +** mla_f64_x_tied1: +** fmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_x_tied1, svfloat64_t, + z0 = svmla_f64_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_f64_x_tied2: +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_x_tied2, svfloat64_t, + z0 = svmla_f64_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_f64_x_tied3: +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_f64_x_tied3, svfloat64_t, + z0 = svmla_f64_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_f64_x_untied: +** ( +** movprfx z0, z1 +** fmla z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** fmad z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0, z3 +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_f64_x_untied, svfloat64_t, + z0 = svmla_f64_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mla_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmla_n_f64_x (p0, z0, z1, d4), + z0 = svmla_x (p0, z0, z1, d4)) + +/* +** mla_d4_f64_x_tied2: +** mov (z[0-9]+\.d), d4 +** fmad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (mla_d4_f64_x_tied2, svfloat64_t, double, + z0 = svmla_n_f64_x (p0, z1, z0, d4), + z0 = svmla_x (p0, z1, z0, d4)) + +/* +** mla_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (mla_d4_f64_x_untied, svfloat64_t, double, + z0 = svmla_n_f64_x (p0, z1, z2, d4), + z0 = svmla_x (p0, z1, z2, d4)) + +/* +** mla_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_2_f64_x_tied1, svfloat64_t, + z0 = svmla_n_f64_x (p0, z0, z1, 2), + z0 = svmla_x (p0, z0, z1, 2)) + +/* +** mla_2_f64_x_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_2_f64_x_tied2, svfloat64_t, + z0 = svmla_n_f64_x (p0, z1, z0, 2), + z0 = svmla_x (p0, z1, z0, 2)) + +/* +** mla_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_2_f64_x_untied, svfloat64_t, + z0 = svmla_n_f64_x (p0, z1, z2, 2), + z0 = svmla_x (p0, z1, z2, 2)) + +/* +** ptrue_mla_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f64_x_tied1, svfloat64_t, + z0 = svmla_f64_x (svptrue_b64 (), z0, z1, z2), + z0 = svmla_x (svptrue_b64 (), z0, z1, z2)) + +/* +** ptrue_mla_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f64_x_tied2, svfloat64_t, + z0 = svmla_f64_x (svptrue_b64 (), z1, z0, z2), + z0 = svmla_x (svptrue_b64 (), z1, z0, z2)) + +/* +** ptrue_mla_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f64_x_tied3, svfloat64_t, + z0 = svmla_f64_x (svptrue_b64 (), z1, z2, z0), + z0 = svmla_x (svptrue_b64 (), z1, z2, z0)) + +/* +** ptrue_mla_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_f64_x_untied, svfloat64_t, + z0 = svmla_f64_x (svptrue_b64 (), z1, z2, z3), + z0 = svmla_x (svptrue_b64 (), z1, z2, z3)) + +/* +** ptrue_mla_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f64_x_tied1, svfloat64_t, + z0 = svmla_n_f64_x (svptrue_b64 (), z0, z1, 2), + z0 = svmla_x (svptrue_b64 (), z0, z1, 2)) + +/* +** ptrue_mla_2_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f64_x_tied2, svfloat64_t, + z0 = svmla_n_f64_x (svptrue_b64 (), z1, z0, 2), + z0 = svmla_x (svptrue_b64 (), z1, z0, 2)) + +/* +** ptrue_mla_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mla_2_f64_x_untied, svfloat64_t, + z0 = svmla_n_f64_x (svptrue_b64 (), z1, z2, 2), + z0 = svmla_x (svptrue_b64 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f16.c new file mode 100644 index 000000000..949e3bb47 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f16.c @@ -0,0 +1,128 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_lane_0_f16_tied1: +** fmla z0\.h, z1\.h, z2\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f16_tied1, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z2, 0), + z0 = svmla_lane (z0, z1, z2, 0)) + +/* +** mla_lane_0_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmla z0\.h, \1\.h, z2\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f16_tied2, svfloat16_t, + z0 = svmla_lane_f16 (z1, z0, z2, 0), + z0 = svmla_lane (z1, z0, z2, 0)) + +/* +** mla_lane_0_f16_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmla z0\.h, z2\.h, \1\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f16_tied3, svfloat16_t, + z0 = svmla_lane_f16 (z1, z2, z0, 0), + z0 = svmla_lane (z1, z2, z0, 0)) + +/* +** mla_lane_0_f16_untied: +** movprfx z0, z1 +** fmla z0\.h, z2\.h, z3\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f16_untied, svfloat16_t, + z0 = svmla_lane_f16 (z1, z2, z3, 0), + z0 = svmla_lane (z1, z2, z3, 0)) + +/* +** mla_lane_1_f16: +** fmla z0\.h, z1\.h, z2\.h\[1\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_1_f16, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z2, 1), + z0 = svmla_lane (z0, z1, z2, 1)) + +/* +** mla_lane_2_f16: +** fmla z0\.h, z1\.h, z2\.h\[2\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_2_f16, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z2, 2), + z0 = svmla_lane (z0, z1, z2, 2)) + +/* +** mla_lane_3_f16: +** fmla z0\.h, z1\.h, z2\.h\[3\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_3_f16, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z2, 3), + z0 = svmla_lane (z0, z1, z2, 3)) + +/* +** mla_lane_4_f16: +** fmla z0\.h, z1\.h, z2\.h\[4\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_4_f16, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z2, 4), + z0 = svmla_lane (z0, z1, z2, 4)) + +/* +** mla_lane_5_f16: +** fmla z0\.h, z1\.h, z2\.h\[5\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_5_f16, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z2, 5), + z0 = svmla_lane (z0, z1, z2, 5)) + +/* +** mla_lane_6_f16: +** fmla z0\.h, z1\.h, z2\.h\[6\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_6_f16, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z2, 6), + z0 = svmla_lane (z0, z1, z2, 6)) + +/* +** mla_lane_7_f16: +** fmla z0\.h, z1\.h, z2\.h\[7\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_7_f16, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z2, 7), + z0 = svmla_lane (z0, z1, z2, 7)) + +/* +** 
mla_lane_z7_f16: +** fmla z0\.h, z1\.h, z7\.h\[7\] +** ret +*/ +TEST_DUAL_Z (mla_lane_z7_f16, svfloat16_t, svfloat16_t, + z0 = svmla_lane_f16 (z0, z1, z7, 7), + z0 = svmla_lane (z0, z1, z7, 7)) + +/* +** mla_lane_z8_f16: +** str d8, \[sp, -16\]! +** mov (z[0-7])\.d, z8\.d +** fmla z0\.h, z1\.h, \1\.h\[7\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mla_lane_z8_f16, svfloat16_t, svfloat16_t, z8, + z0 = svmla_lane_f16 (z0, z1, z8, 7), + z0 = svmla_lane (z0, z1, z8, 7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f32.c new file mode 100644 index 000000000..d376532d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f32.c @@ -0,0 +1,92 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_lane_0_f32_tied1: +** fmla z0\.s, z1\.s, z2\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f32_tied1, svfloat32_t, + z0 = svmla_lane_f32 (z0, z1, z2, 0), + z0 = svmla_lane (z0, z1, z2, 0)) + +/* +** mla_lane_0_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmla z0\.s, \1\.s, z2\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f32_tied2, svfloat32_t, + z0 = svmla_lane_f32 (z1, z0, z2, 0), + z0 = svmla_lane (z1, z0, z2, 0)) + +/* +** mla_lane_0_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmla z0\.s, z2\.s, \1\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f32_tied3, svfloat32_t, + z0 = svmla_lane_f32 (z1, z2, z0, 0), + z0 = svmla_lane (z1, z2, z0, 0)) + +/* +** mla_lane_0_f32_untied: +** movprfx z0, z1 +** fmla z0\.s, z2\.s, z3\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f32_untied, svfloat32_t, + z0 = svmla_lane_f32 (z1, z2, z3, 0), + z0 = svmla_lane (z1, z2, z3, 0)) + +/* +** mla_lane_1_f32: +** fmla z0\.s, z1\.s, z2\.s\[1\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_1_f32, svfloat32_t, + z0 = svmla_lane_f32 (z0, z1, z2, 1), + z0 = svmla_lane (z0, z1, z2, 1)) + +/* +** mla_lane_2_f32: +** fmla z0\.s, z1\.s, z2\.s\[2\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_2_f32, svfloat32_t, + z0 = svmla_lane_f32 (z0, z1, z2, 2), + z0 = svmla_lane (z0, z1, z2, 2)) + +/* +** mla_lane_3_f32: +** fmla z0\.s, z1\.s, z2\.s\[3\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_3_f32, svfloat32_t, + z0 = svmla_lane_f32 (z0, z1, z2, 3), + z0 = svmla_lane (z0, z1, z2, 3)) + +/* +** mla_lane_z7_f32: +** fmla z0\.s, z1\.s, z7\.s\[3\] +** ret +*/ +TEST_DUAL_Z (mla_lane_z7_f32, svfloat32_t, svfloat32_t, + z0 = svmla_lane_f32 (z0, z1, z7, 3), + z0 = svmla_lane (z0, z1, z7, 3)) + +/* +** mla_lane_z8_f32: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** fmla z0\.s, z1\.s, \1\.s\[3\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mla_lane_z8_f32, svfloat32_t, svfloat32_t, z8, + z0 = svmla_lane_f32 (z0, z1, z8, 3), + z0 = svmla_lane (z0, z1, z8, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f64.c new file mode 100644 index 000000000..7c58a8a57 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_lane_f64.c @@ -0,0 +1,83 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_lane_0_f64_tied1: +** fmla z0\.d, z1\.d, z2\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f64_tied1, svfloat64_t, + z0 = svmla_lane_f64 (z0, z1, z2, 0), + z0 = svmla_lane (z0, z1, z2, 0)) + +/* +** mla_lane_0_f64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmla z0\.d, \1, z2\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f64_tied2, svfloat64_t, + z0 = svmla_lane_f64 (z1, z0, z2, 0), + z0 = svmla_lane (z1, z0, z2, 0)) + +/* +** mla_lane_0_f64_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmla z0\.d, z2\.d, \1\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f64_tied3, svfloat64_t, + z0 = svmla_lane_f64 (z1, z2, z0, 0), + z0 = svmla_lane (z1, z2, z0, 0)) + +/* +** mla_lane_0_f64_untied: +** movprfx z0, z1 +** fmla z0\.d, z2\.d, z3\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_0_f64_untied, svfloat64_t, + z0 = svmla_lane_f64 (z1, z2, z3, 0), + z0 = svmla_lane (z1, z2, z3, 0)) + +/* +** mla_lane_1_f64: +** fmla z0\.d, z1\.d, z2\.d\[1\] +** ret +*/ +TEST_UNIFORM_Z (mla_lane_1_f64, svfloat64_t, + z0 = svmla_lane_f64 (z0, z1, z2, 1), + z0 = svmla_lane (z0, z1, z2, 1)) + +/* +** mla_lane_z7_f64: +** fmla z0\.d, z1\.d, z7\.d\[1\] +** ret +*/ +TEST_DUAL_Z (mla_lane_z7_f64, svfloat64_t, svfloat64_t, + z0 = svmla_lane_f64 (z0, z1, z7, 1), + z0 = svmla_lane (z0, z1, z7, 1)) + +/* +** mla_lane_z15_f64: +** str d15, \[sp, -16\]! 
+** fmla z0\.d, z1\.d, z15\.d\[1\] +** ldr d15, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mla_lane_z15_f64, svfloat64_t, svfloat64_t, z15, + z0 = svmla_lane_f64 (z0, z1, z15, 1), + z0 = svmla_lane (z0, z1, z15, 1)) + +/* +** mla_lane_z16_f64: +** mov (z[0-9]|z1[0-5])\.d, z16\.d +** fmla z0\.d, z1\.d, \1\.d\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (mla_lane_z16_f64, svfloat64_t, svfloat64_t, z16, + z0 = svmla_lane_f64 (z0, z1, z16, 1), + z0 = svmla_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s16.c new file mode 100644 index 000000000..f3ed191db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s16.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_s16_m_tied1: +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_m_tied1, svint16_t, + z0 = svmla_s16_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_m_tied2, svint16_t, + z0 = svmla_s16_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_s16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_m_tied3, svint16_t, + z0 = svmla_s16_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_s16_m_untied: +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_m_untied, svint16_t, + z0 = svmla_s16_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svmla_n_s16_m (p0, z0, z1, x0), + z0 = svmla_m (p0, z0, z1, x0)) + +/* +** mla_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s16_m_untied, svint16_t, int16_t, + z0 = svmla_n_s16_m (p0, z1, z2, x0), + z0 = svmla_m (p0, z1, z2, x0)) + +/* +** mla_11_s16_m_tied1: +** mov (z[0-9]+\.h), #11 +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s16_m_tied1, svint16_t, + z0 = svmla_n_s16_m (p0, z0, z1, 11), + z0 = svmla_m (p0, z0, z1, 11)) + +/* +** mla_11_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s16_m_untied, svint16_t, + z0 = svmla_n_s16_m (p0, z1, z2, 11), + z0 = svmla_m (p0, z1, z2, 11)) + +/* +** mla_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_z_tied1, svint16_t, + z0 = svmla_s16_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_z_tied2, svint16_t, + z0 = svmla_s16_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_s16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_z_tied3, svint16_t, + z0 = svmla_s16_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** mla z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** mad z0\.h, 
p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_s16_z_untied, svint16_t, + z0 = svmla_s16_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svmla_n_s16_z (p0, z0, z1, x0), + z0 = svmla_z (p0, z0, z1, x0)) + +/* +** mla_w0_s16_z_tied2: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s16_z_tied2, svint16_t, int16_t, + z0 = svmla_n_s16_z (p0, z1, z0, x0), + z0 = svmla_z (p0, z1, z0, x0)) + +/* +** mla_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mla z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s16_z_untied, svint16_t, int16_t, + z0 = svmla_n_s16_z (p0, z1, z2, x0), + z0 = svmla_z (p0, z1, z2, x0)) + +/* +** mla_11_s16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s16_z_tied1, svint16_t, + z0 = svmla_n_s16_z (p0, z0, z1, 11), + z0 = svmla_z (p0, z0, z1, 11)) + +/* +** mla_11_s16_z_tied2: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_11_s16_z_tied2, svint16_t, + z0 = svmla_n_s16_z (p0, z1, z0, 11), + z0 = svmla_z (p0, z1, z0, 11)) + +/* +** mla_11_s16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mla z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_11_s16_z_untied, svint16_t, + z0 = svmla_n_s16_z (p0, z1, z2, 11), + z0 = svmla_z (p0, z1, z2, 11)) + +/* +** mla_s16_x_tied1: +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_x_tied1, svint16_t, + z0 = svmla_s16_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_s16_x_tied2: +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_x_tied2, svint16_t, + z0 = svmla_s16_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_s16_x_tied3: +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_s16_x_tied3, svint16_t, + z0 = svmla_s16_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_s16_x_untied: +** ( +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** mad z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0, z3 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_s16_x_untied, svint16_t, + z0 = svmla_s16_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svmla_n_s16_x (p0, z0, z1, x0), + z0 = svmla_x (p0, z0, z1, x0)) + +/* +** mla_w0_s16_x_tied2: +** mov (z[0-9]+\.h), w0 +** mad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s16_x_tied2, svint16_t, int16_t, + z0 = svmla_n_s16_x (p0, z1, z0, x0), + z0 = svmla_x (p0, z1, z0, x0)) + +/* +** mla_w0_s16_x_untied: +** mov z0\.h, w0 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s16_x_untied, svint16_t, int16_t, + z0 = svmla_n_s16_x (p0, z1, z2, x0), + z0 = svmla_x (p0, 
z1, z2, x0)) + +/* +** mla_11_s16_x_tied1: +** mov (z[0-9]+\.h), #11 +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s16_x_tied1, svint16_t, + z0 = svmla_n_s16_x (p0, z0, z1, 11), + z0 = svmla_x (p0, z0, z1, 11)) + +/* +** mla_11_s16_x_tied2: +** mov (z[0-9]+\.h), #11 +** mad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_11_s16_x_tied2, svint16_t, + z0 = svmla_n_s16_x (p0, z1, z0, 11), + z0 = svmla_x (p0, z1, z0, 11)) + +/* +** mla_11_s16_x_untied: +** mov z0\.h, #11 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_11_s16_x_untied, svint16_t, + z0 = svmla_n_s16_x (p0, z1, z2, 11), + z0 = svmla_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s32.c new file mode 100644 index 000000000..5e8001a71 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s32.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_s32_m_tied1: +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_m_tied1, svint32_t, + z0 = svmla_s32_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_m_tied2, svint32_t, + z0 = svmla_s32_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_s32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_m_tied3, svint32_t, + z0 = svmla_s32_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_s32_m_untied: +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_m_untied, svint32_t, + z0 = svmla_s32_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svmla_n_s32_m (p0, z0, z1, x0), + z0 = svmla_m (p0, z0, z1, x0)) + +/* +** mla_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s32_m_untied, svint32_t, int32_t, + z0 = svmla_n_s32_m (p0, z1, z2, x0), + z0 = svmla_m (p0, z1, z2, x0)) + +/* +** mla_11_s32_m_tied1: +** mov (z[0-9]+\.s), #11 +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s32_m_tied1, svint32_t, + z0 = svmla_n_s32_m (p0, z0, z1, 11), + z0 = svmla_m (p0, z0, z1, 11)) + +/* +** mla_11_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s32_m_untied, svint32_t, + z0 = svmla_n_s32_m (p0, z1, z2, 11), + z0 = svmla_m (p0, z1, z2, 11)) + +/* +** mla_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_z_tied1, svint32_t, + z0 = svmla_s32_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_z_tied2, svint32_t, + z0 = svmla_s32_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_s32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_z_tied3, svint32_t, + z0 = svmla_s32_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_s32_z_untied: +** ( +** 
movprfx z0\.s, p0/z, z1\.s +** mla z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** mad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_s32_z_untied, svint32_t, + z0 = svmla_s32_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svmla_n_s32_z (p0, z0, z1, x0), + z0 = svmla_z (p0, z0, z1, x0)) + +/* +** mla_w0_s32_z_tied2: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s32_z_tied2, svint32_t, int32_t, + z0 = svmla_n_s32_z (p0, z1, z0, x0), + z0 = svmla_z (p0, z1, z0, x0)) + +/* +** mla_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mla z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s32_z_untied, svint32_t, int32_t, + z0 = svmla_n_s32_z (p0, z1, z2, x0), + z0 = svmla_z (p0, z1, z2, x0)) + +/* +** mla_11_s32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s32_z_tied1, svint32_t, + z0 = svmla_n_s32_z (p0, z0, z1, 11), + z0 = svmla_z (p0, z0, z1, 11)) + +/* +** mla_11_s32_z_tied2: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_11_s32_z_tied2, svint32_t, + z0 = svmla_n_s32_z (p0, z1, z0, 11), + z0 = svmla_z (p0, z1, z0, 11)) + +/* +** mla_11_s32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mla z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_11_s32_z_untied, svint32_t, + z0 = svmla_n_s32_z (p0, z1, z2, 11), + z0 = svmla_z (p0, z1, z2, 11)) + +/* +** mla_s32_x_tied1: +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_x_tied1, svint32_t, + z0 = svmla_s32_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_s32_x_tied2: +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_x_tied2, svint32_t, + z0 = svmla_s32_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_s32_x_tied3: +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_s32_x_tied3, svint32_t, + z0 = svmla_s32_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_s32_x_untied: +** ( +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** mad z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0, z3 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_s32_x_untied, svint32_t, + z0 = svmla_s32_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svmla_n_s32_x (p0, z0, z1, x0), + z0 = svmla_x (p0, z0, z1, x0)) + +/* +** mla_w0_s32_x_tied2: +** mov (z[0-9]+\.s), w0 +** mad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s32_x_tied2, svint32_t, int32_t, + z0 = svmla_n_s32_x (p0, z1, z0, x0), + z0 = svmla_x (p0, z1, z0, x0)) + +/* +** mla_w0_s32_x_untied: 
+** mov z0\.s, w0 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s32_x_untied, svint32_t, int32_t, + z0 = svmla_n_s32_x (p0, z1, z2, x0), + z0 = svmla_x (p0, z1, z2, x0)) + +/* +** mla_11_s32_x_tied1: +** mov (z[0-9]+\.s), #11 +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s32_x_tied1, svint32_t, + z0 = svmla_n_s32_x (p0, z0, z1, 11), + z0 = svmla_x (p0, z0, z1, 11)) + +/* +** mla_11_s32_x_tied2: +** mov (z[0-9]+\.s), #11 +** mad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_11_s32_x_tied2, svint32_t, + z0 = svmla_n_s32_x (p0, z1, z0, 11), + z0 = svmla_x (p0, z1, z0, 11)) + +/* +** mla_11_s32_x_untied: +** mov z0\.s, #11 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_11_s32_x_untied, svint32_t, + z0 = svmla_n_s32_x (p0, z1, z2, 11), + z0 = svmla_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s64.c new file mode 100644 index 000000000..7b619e521 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s64.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_s64_m_tied1: +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_m_tied1, svint64_t, + z0 = svmla_s64_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mla z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_m_tied2, svint64_t, + z0 = svmla_s64_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_s64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_s64_m_tied3, svint64_t, + z0 = svmla_s64_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_s64_m_untied: +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_m_untied, svint64_t, + z0 = svmla_s64_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svmla_n_s64_m (p0, z0, z1, x0), + z0 = svmla_m (p0, z0, z1, x0)) + +/* +** mla_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_s64_m_untied, svint64_t, int64_t, + z0 = svmla_n_s64_m (p0, z1, z2, x0), + z0 = svmla_m (p0, z1, z2, x0)) + +/* +** mla_11_s64_m_tied1: +** mov (z[0-9]+\.d), #11 +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s64_m_tied1, svint64_t, + z0 = svmla_n_s64_m (p0, z0, z1, 11), + z0 = svmla_m (p0, z0, z1, 11)) + +/* +** mla_11_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #11 +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s64_m_untied, svint64_t, + z0 = svmla_n_s64_m (p0, z1, z2, 11), + z0 = svmla_m (p0, z1, z2, 11)) + +/* +** mla_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_z_tied1, svint64_t, + z0 = svmla_s64_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_z_tied2, svint64_t, + z0 = svmla_s64_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_s64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z2\.d, 
z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_z_tied3, svint64_t, + z0 = svmla_s64_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** mla z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** mad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_s64_z_untied, svint64_t, + z0 = svmla_s64_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svmla_n_s64_z (p0, z0, z1, x0), + z0 = svmla_z (p0, z0, z1, x0)) + +/* +** mla_x0_s64_z_tied2: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_s64_z_tied2, svint64_t, int64_t, + z0 = svmla_n_s64_z (p0, z1, z0, x0), + z0 = svmla_z (p0, z1, z0, x0)) + +/* +** mla_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mla z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_s64_z_untied, svint64_t, int64_t, + z0 = svmla_n_s64_z (p0, z1, z2, x0), + z0 = svmla_z (p0, z1, z2, x0)) + +/* +** mla_11_s64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s64_z_tied1, svint64_t, + z0 = svmla_n_s64_z (p0, z0, z1, 11), + z0 = svmla_z (p0, z0, z1, 11)) + +/* +** mla_11_s64_z_tied2: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_11_s64_z_tied2, svint64_t, + z0 = svmla_n_s64_z (p0, z1, z0, 11), + z0 = svmla_z (p0, z1, z0, 11)) + +/* +** mla_11_s64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mla z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_11_s64_z_untied, svint64_t, + z0 = svmla_n_s64_z (p0, z1, z2, 11), + z0 = svmla_z (p0, z1, z2, 11)) + +/* +** mla_s64_x_tied1: +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_x_tied1, svint64_t, + z0 = svmla_s64_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_s64_x_tied2: +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_x_tied2, svint64_t, + z0 = svmla_s64_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_s64_x_tied3: +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_s64_x_tied3, svint64_t, + z0 = svmla_s64_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_s64_x_untied: +** ( +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** mad z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0, z3 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_s64_x_untied, svint64_t, + z0 = svmla_s64_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svmla_n_s64_x (p0, z0, z1, x0), + z0 = svmla_x (p0, z0, z1, x0)) + +/* +** mla_x0_s64_x_tied2: +** mov (z[0-9]+\.d), x0 +** mad z0\.d, p0/m, \1, z1\.d 
+** ret +*/ +TEST_UNIFORM_ZX (mla_x0_s64_x_tied2, svint64_t, int64_t, + z0 = svmla_n_s64_x (p0, z1, z0, x0), + z0 = svmla_x (p0, z1, z0, x0)) + +/* +** mla_x0_s64_x_untied: +** mov z0\.d, x0 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_s64_x_untied, svint64_t, int64_t, + z0 = svmla_n_s64_x (p0, z1, z2, x0), + z0 = svmla_x (p0, z1, z2, x0)) + +/* +** mla_11_s64_x_tied1: +** mov (z[0-9]+\.d), #11 +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s64_x_tied1, svint64_t, + z0 = svmla_n_s64_x (p0, z0, z1, 11), + z0 = svmla_x (p0, z0, z1, 11)) + +/* +** mla_11_s64_x_tied2: +** mov (z[0-9]+\.d), #11 +** mad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_11_s64_x_tied2, svint64_t, + z0 = svmla_n_s64_x (p0, z1, z0, 11), + z0 = svmla_x (p0, z1, z0, 11)) + +/* +** mla_11_s64_x_untied: +** mov z0\.d, #11 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_11_s64_x_untied, svint64_t, + z0 = svmla_n_s64_x (p0, z1, z2, 11), + z0 = svmla_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s8.c new file mode 100644 index 000000000..47468947d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_s8.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_s8_m_tied1: +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_m_tied1, svint8_t, + z0 = svmla_s8_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.b, p0/m, \1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_m_tied2, svint8_t, + z0 = svmla_s8_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_s8_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_m_tied3, svint8_t, + z0 = svmla_s8_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_s8_m_untied: +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, z3\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_m_untied, svint8_t, + z0 = svmla_s8_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svmla_n_s8_m (p0, z0, z1, x0), + z0 = svmla_m (p0, z0, z1, x0)) + +/* +** mla_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s8_m_untied, svint8_t, int8_t, + z0 = svmla_n_s8_m (p0, z1, z2, x0), + z0 = svmla_m (p0, z1, z2, x0)) + +/* +** mla_11_s8_m_tied1: +** mov (z[0-9]+\.b), #11 +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s8_m_tied1, svint8_t, + z0 = svmla_n_s8_m (p0, z0, z1, 11), + z0 = svmla_m (p0, z0, z1, 11)) + +/* +** mla_11_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s8_m_untied, svint8_t, + z0 = svmla_n_s8_m (p0, z1, z2, 11), + z0 = svmla_m (p0, z1, z2, 11)) + +/* +** mla_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_z_tied1, svint8_t, + z0 = svmla_s8_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_z_tied2, svint8_t, + z0 
= svmla_s8_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_s8_z_tied3: +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_z_tied3, svint8_t, + z0 = svmla_s8_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** mla z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, z3\.b, z1\.b +** | +** movprfx z0\.b, p0/z, z3\.b +** mad z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_s8_z_untied, svint8_t, + z0 = svmla_s8_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svmla_n_s8_z (p0, z0, z1, x0), + z0 = svmla_z (p0, z0, z1, x0)) + +/* +** mla_w0_s8_z_tied2: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s8_z_tied2, svint8_t, int8_t, + z0 = svmla_n_s8_z (p0, z1, z0, x0), + z0 = svmla_z (p0, z1, z0, x0)) + +/* +** mla_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mla z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, \1, z1\.b +** | +** movprfx z0\.b, p0/z, \1 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s8_z_untied, svint8_t, int8_t, + z0 = svmla_n_s8_z (p0, z1, z2, x0), + z0 = svmla_z (p0, z1, z2, x0)) + +/* +** mla_11_s8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s8_z_tied1, svint8_t, + z0 = svmla_n_s8_z (p0, z0, z1, 11), + z0 = svmla_z (p0, z0, z1, 11)) + +/* +** mla_11_s8_z_tied2: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_11_s8_z_tied2, svint8_t, + z0 = svmla_n_s8_z (p0, z1, z0, 11), + z0 = svmla_z (p0, z1, z0, 11)) + +/* +** mla_11_s8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mla z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, \1, z1\.b +** | +** movprfx z0\.b, p0/z, \1 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_11_s8_z_untied, svint8_t, + z0 = svmla_n_s8_z (p0, z1, z2, 11), + z0 = svmla_z (p0, z1, z2, 11)) + +/* +** mla_s8_x_tied1: +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_x_tied1, svint8_t, + z0 = svmla_s8_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_s8_x_tied2: +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_x_tied2, svint8_t, + z0 = svmla_s8_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_s8_x_tied3: +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_s8_x_tied3, svint8_t, + z0 = svmla_s8_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_s8_x_untied: +** ( +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0, z2 +** mad z0\.b, p0/m, z3\.b, z1\.b +** | +** movprfx z0, z3 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_s8_x_untied, svint8_t, + z0 = svmla_s8_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svmla_n_s8_x (p0, z0, z1, x0), + z0 = svmla_x (p0, z0, z1, 
x0)) + +/* +** mla_w0_s8_x_tied2: +** mov (z[0-9]+\.b), w0 +** mad z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s8_x_tied2, svint8_t, int8_t, + z0 = svmla_n_s8_x (p0, z1, z0, x0), + z0 = svmla_x (p0, z1, z0, x0)) + +/* +** mla_w0_s8_x_untied: +** mov z0\.b, w0 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_s8_x_untied, svint8_t, int8_t, + z0 = svmla_n_s8_x (p0, z1, z2, x0), + z0 = svmla_x (p0, z1, z2, x0)) + +/* +** mla_11_s8_x_tied1: +** mov (z[0-9]+\.b), #11 +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_s8_x_tied1, svint8_t, + z0 = svmla_n_s8_x (p0, z0, z1, 11), + z0 = svmla_x (p0, z0, z1, 11)) + +/* +** mla_11_s8_x_tied2: +** mov (z[0-9]+\.b), #11 +** mad z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_11_s8_x_tied2, svint8_t, + z0 = svmla_n_s8_x (p0, z1, z0, 11), + z0 = svmla_x (p0, z1, z0, 11)) + +/* +** mla_11_s8_x_untied: +** mov z0\.b, #11 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_11_s8_x_untied, svint8_t, + z0 = svmla_n_s8_x (p0, z1, z2, 11), + z0 = svmla_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u16.c new file mode 100644 index 000000000..7238e428f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u16.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_u16_m_tied1: +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_m_tied1, svuint16_t, + z0 = svmla_u16_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_m_tied2, svuint16_t, + z0 = svmla_u16_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_u16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_m_tied3, svuint16_t, + z0 = svmla_u16_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_u16_m_untied: +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_m_untied, svuint16_t, + z0 = svmla_u16_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svmla_n_u16_m (p0, z0, z1, x0), + z0 = svmla_m (p0, z0, z1, x0)) + +/* +** mla_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svmla_n_u16_m (p0, z1, z2, x0), + z0 = svmla_m (p0, z1, z2, x0)) + +/* +** mla_11_u16_m_tied1: +** mov (z[0-9]+\.h), #11 +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u16_m_tied1, svuint16_t, + z0 = svmla_n_u16_m (p0, z0, z1, 11), + z0 = svmla_m (p0, z0, z1, 11)) + +/* +** mla_11_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u16_m_untied, svuint16_t, + z0 = svmla_n_u16_m (p0, z1, z2, 11), + z0 = svmla_m (p0, z1, z2, 11)) + +/* +** mla_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_z_tied1, svuint16_t, + z0 = svmla_u16_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_u16_z_tied2: +** 
movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_z_tied2, svuint16_t, + z0 = svmla_u16_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_u16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_z_tied3, svuint16_t, + z0 = svmla_u16_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** mla z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** mad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_u16_z_untied, svuint16_t, + z0 = svmla_u16_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svmla_n_u16_z (p0, z0, z1, x0), + z0 = svmla_z (p0, z0, z1, x0)) + +/* +** mla_w0_u16_z_tied2: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u16_z_tied2, svuint16_t, uint16_t, + z0 = svmla_n_u16_z (p0, z1, z0, x0), + z0 = svmla_z (p0, z1, z0, x0)) + +/* +** mla_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mla z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svmla_n_u16_z (p0, z1, z2, x0), + z0 = svmla_z (p0, z1, z2, x0)) + +/* +** mla_11_u16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u16_z_tied1, svuint16_t, + z0 = svmla_n_u16_z (p0, z0, z1, 11), + z0 = svmla_z (p0, z0, z1, 11)) + +/* +** mla_11_u16_z_tied2: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_11_u16_z_tied2, svuint16_t, + z0 = svmla_n_u16_z (p0, z1, z0, 11), + z0 = svmla_z (p0, z1, z0, 11)) + +/* +** mla_11_u16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mla z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** mad z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_11_u16_z_untied, svuint16_t, + z0 = svmla_n_u16_z (p0, z1, z2, 11), + z0 = svmla_z (p0, z1, z2, 11)) + +/* +** mla_u16_x_tied1: +** mla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_x_tied1, svuint16_t, + z0 = svmla_u16_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_u16_x_tied2: +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_x_tied2, svuint16_t, + z0 = svmla_u16_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_u16_x_tied3: +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_u16_x_tied3, svuint16_t, + z0 = svmla_u16_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_u16_x_untied: +** ( +** movprfx z0, z1 +** mla z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** mad z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0, z3 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_u16_x_untied, svuint16_t, + z0 = svmla_u16_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** 
mla_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svmla_n_u16_x (p0, z0, z1, x0), + z0 = svmla_x (p0, z0, z1, x0)) + +/* +** mla_w0_u16_x_tied2: +** mov (z[0-9]+\.h), w0 +** mad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u16_x_tied2, svuint16_t, uint16_t, + z0 = svmla_n_u16_x (p0, z1, z0, x0), + z0 = svmla_x (p0, z1, z0, x0)) + +/* +** mla_w0_u16_x_untied: +** mov z0\.h, w0 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svmla_n_u16_x (p0, z1, z2, x0), + z0 = svmla_x (p0, z1, z2, x0)) + +/* +** mla_11_u16_x_tied1: +** mov (z[0-9]+\.h), #11 +** mla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u16_x_tied1, svuint16_t, + z0 = svmla_n_u16_x (p0, z0, z1, 11), + z0 = svmla_x (p0, z0, z1, 11)) + +/* +** mla_11_u16_x_tied2: +** mov (z[0-9]+\.h), #11 +** mad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_11_u16_x_tied2, svuint16_t, + z0 = svmla_n_u16_x (p0, z1, z0, 11), + z0 = svmla_x (p0, z1, z0, 11)) + +/* +** mla_11_u16_x_untied: +** mov z0\.h, #11 +** mad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mla_11_u16_x_untied, svuint16_t, + z0 = svmla_n_u16_x (p0, z1, z2, 11), + z0 = svmla_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u32.c new file mode 100644 index 000000000..7a68bce3d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u32.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_u32_m_tied1: +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_m_tied1, svuint32_t, + z0 = svmla_u32_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_m_tied2, svuint32_t, + z0 = svmla_u32_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_u32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_m_tied3, svuint32_t, + z0 = svmla_u32_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_u32_m_untied: +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_m_untied, svuint32_t, + z0 = svmla_u32_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svmla_n_u32_m (p0, z0, z1, x0), + z0 = svmla_m (p0, z0, z1, x0)) + +/* +** mla_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svmla_n_u32_m (p0, z1, z2, x0), + z0 = svmla_m (p0, z1, z2, x0)) + +/* +** mla_11_u32_m_tied1: +** mov (z[0-9]+\.s), #11 +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u32_m_tied1, svuint32_t, + z0 = svmla_n_u32_m (p0, z0, z1, 11), + z0 = svmla_m (p0, z0, z1, 11)) + +/* +** mla_11_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u32_m_untied, svuint32_t, + z0 = svmla_n_u32_m (p0, z1, z2, 11), + z0 = svmla_m (p0, z1, z2, 11)) + +/* +** 
mla_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_z_tied1, svuint32_t, + z0 = svmla_u32_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_z_tied2, svuint32_t, + z0 = svmla_u32_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_u32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_z_tied3, svuint32_t, + z0 = svmla_u32_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** mla z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** mad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_u32_z_untied, svuint32_t, + z0 = svmla_u32_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svmla_n_u32_z (p0, z0, z1, x0), + z0 = svmla_z (p0, z0, z1, x0)) + +/* +** mla_w0_u32_z_tied2: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u32_z_tied2, svuint32_t, uint32_t, + z0 = svmla_n_u32_z (p0, z1, z0, x0), + z0 = svmla_z (p0, z1, z0, x0)) + +/* +** mla_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mla z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svmla_n_u32_z (p0, z1, z2, x0), + z0 = svmla_z (p0, z1, z2, x0)) + +/* +** mla_11_u32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u32_z_tied1, svuint32_t, + z0 = svmla_n_u32_z (p0, z0, z1, 11), + z0 = svmla_z (p0, z0, z1, 11)) + +/* +** mla_11_u32_z_tied2: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_11_u32_z_tied2, svuint32_t, + z0 = svmla_n_u32_z (p0, z1, z0, 11), + z0 = svmla_z (p0, z1, z0, 11)) + +/* +** mla_11_u32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mla z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** mad z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_11_u32_z_untied, svuint32_t, + z0 = svmla_n_u32_z (p0, z1, z2, 11), + z0 = svmla_z (p0, z1, z2, 11)) + +/* +** mla_u32_x_tied1: +** mla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_x_tied1, svuint32_t, + z0 = svmla_u32_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_u32_x_tied2: +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_x_tied2, svuint32_t, + z0 = svmla_u32_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_u32_x_tied3: +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_u32_x_tied3, svuint32_t, + z0 = svmla_u32_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_u32_x_untied: +** ( +** movprfx z0, z1 +** mla z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** mad 
z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0, z3 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_u32_x_untied, svuint32_t, + z0 = svmla_u32_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svmla_n_u32_x (p0, z0, z1, x0), + z0 = svmla_x (p0, z0, z1, x0)) + +/* +** mla_w0_u32_x_tied2: +** mov (z[0-9]+\.s), w0 +** mad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u32_x_tied2, svuint32_t, uint32_t, + z0 = svmla_n_u32_x (p0, z1, z0, x0), + z0 = svmla_x (p0, z1, z0, x0)) + +/* +** mla_w0_u32_x_untied: +** mov z0\.s, w0 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svmla_n_u32_x (p0, z1, z2, x0), + z0 = svmla_x (p0, z1, z2, x0)) + +/* +** mla_11_u32_x_tied1: +** mov (z[0-9]+\.s), #11 +** mla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u32_x_tied1, svuint32_t, + z0 = svmla_n_u32_x (p0, z0, z1, 11), + z0 = svmla_x (p0, z0, z1, 11)) + +/* +** mla_11_u32_x_tied2: +** mov (z[0-9]+\.s), #11 +** mad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_11_u32_x_tied2, svuint32_t, + z0 = svmla_n_u32_x (p0, z1, z0, 11), + z0 = svmla_x (p0, z1, z0, 11)) + +/* +** mla_11_u32_x_untied: +** mov z0\.s, #11 +** mad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mla_11_u32_x_untied, svuint32_t, + z0 = svmla_n_u32_x (p0, z1, z2, 11), + z0 = svmla_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u64.c new file mode 100644 index 000000000..6233265c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u64.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_u64_m_tied1: +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_u64_m_tied1, svuint64_t, + z0 = svmla_u64_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mla z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_u64_m_tied2, svuint64_t, + z0 = svmla_u64_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_u64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_u64_m_tied3, svuint64_t, + z0 = svmla_u64_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_u64_m_untied: +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mla_u64_m_untied, svuint64_t, + z0 = svmla_u64_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svmla_n_u64_m (p0, z0, z1, x0), + z0 = svmla_m (p0, z0, z1, x0)) + +/* +** mla_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svmla_n_u64_m (p0, z1, z2, x0), + z0 = svmla_m (p0, z1, z2, x0)) + +/* +** mla_11_u64_m_tied1: +** mov (z[0-9]+\.d), #11 +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u64_m_tied1, svuint64_t, + z0 = svmla_n_u64_m (p0, z0, z1, 11), + z0 = svmla_m (p0, z0, z1, 11)) + +/* +** mla_11_u64_m_untied: { xfail *-*-* } +** mov 
(z[0-9]+\.d), #11 +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u64_m_untied, svuint64_t, + z0 = svmla_n_u64_m (p0, z1, z2, 11), + z0 = svmla_m (p0, z1, z2, 11)) + +/* +** mla_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_u64_z_tied1, svuint64_t, + z0 = svmla_u64_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_u64_z_tied2, svuint64_t, + z0 = svmla_u64_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_u64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_u64_z_tied3, svuint64_t, + z0 = svmla_u64_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** mla z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** mad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_u64_z_untied, svuint64_t, + z0 = svmla_u64_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svmla_n_u64_z (p0, z0, z1, x0), + z0 = svmla_z (p0, z0, z1, x0)) + +/* +** mla_x0_u64_z_tied2: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_u64_z_tied2, svuint64_t, uint64_t, + z0 = svmla_n_u64_z (p0, z1, z0, x0), + z0 = svmla_z (p0, z1, z0, x0)) + +/* +** mla_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mla z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svmla_n_u64_z (p0, z1, z2, x0), + z0 = svmla_z (p0, z1, z2, x0)) + +/* +** mla_11_u64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u64_z_tied1, svuint64_t, + z0 = svmla_n_u64_z (p0, z0, z1, 11), + z0 = svmla_z (p0, z0, z1, 11)) + +/* +** mla_11_u64_z_tied2: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_11_u64_z_tied2, svuint64_t, + z0 = svmla_n_u64_z (p0, z1, z0, 11), + z0 = svmla_z (p0, z1, z0, 11)) + +/* +** mla_11_u64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mla z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** mad z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_11_u64_z_untied, svuint64_t, + z0 = svmla_n_u64_z (p0, z1, z2, 11), + z0 = svmla_z (p0, z1, z2, 11)) + +/* +** mla_u64_x_tied1: +** mla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mla_u64_x_tied1, svuint64_t, + z0 = svmla_u64_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_u64_x_tied2: +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_u64_x_tied2, svuint64_t, + z0 = svmla_u64_x (p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_u64_x_tied3: +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z 
(mla_u64_x_tied3, svuint64_t, + z0 = svmla_u64_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_u64_x_untied: +** ( +** movprfx z0, z1 +** mla z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** mad z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0, z3 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_u64_x_untied, svuint64_t, + z0 = svmla_u64_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svmla_n_u64_x (p0, z0, z1, x0), + z0 = svmla_x (p0, z0, z1, x0)) + +/* +** mla_x0_u64_x_tied2: +** mov (z[0-9]+\.d), x0 +** mad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_u64_x_tied2, svuint64_t, uint64_t, + z0 = svmla_n_u64_x (p0, z1, z0, x0), + z0 = svmla_x (p0, z1, z0, x0)) + +/* +** mla_x0_u64_x_untied: +** mov z0\.d, x0 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mla_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svmla_n_u64_x (p0, z1, z2, x0), + z0 = svmla_x (p0, z1, z2, x0)) + +/* +** mla_11_u64_x_tied1: +** mov (z[0-9]+\.d), #11 +** mla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u64_x_tied1, svuint64_t, + z0 = svmla_n_u64_x (p0, z0, z1, 11), + z0 = svmla_x (p0, z0, z1, 11)) + +/* +** mla_11_u64_x_tied2: +** mov (z[0-9]+\.d), #11 +** mad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_11_u64_x_tied2, svuint64_t, + z0 = svmla_n_u64_x (p0, z1, z0, 11), + z0 = svmla_x (p0, z1, z0, 11)) + +/* +** mla_11_u64_x_untied: +** mov z0\.d, #11 +** mad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mla_11_u64_x_untied, svuint64_t, + z0 = svmla_n_u64_x (p0, z1, z2, 11), + z0 = svmla_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u8.c new file mode 100644 index 000000000..832ed4141 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mla_u8.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mla_u8_m_tied1: +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_m_tied1, svuint8_t, + z0 = svmla_u8_m (p0, z0, z1, z2), + z0 = svmla_m (p0, z0, z1, z2)) + +/* +** mla_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.b, p0/m, \1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_m_tied2, svuint8_t, + z0 = svmla_u8_m (p0, z1, z0, z2), + z0 = svmla_m (p0, z1, z0, z2)) + +/* +** mla_u8_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_m_tied3, svuint8_t, + z0 = svmla_u8_m (p0, z1, z2, z0), + z0 = svmla_m (p0, z1, z2, z0)) + +/* +** mla_u8_m_untied: +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, z3\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_m_untied, svuint8_t, + z0 = svmla_u8_m (p0, z1, z2, z3), + z0 = svmla_m (p0, z1, z2, z3)) + +/* +** mla_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svmla_n_u8_m (p0, z0, z1, x0), + z0 = svmla_m (p0, z0, z1, x0)) + +/* +** mla_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svmla_n_u8_m (p0, z1, z2, x0), + z0 = svmla_m (p0, z1, z2, x0)) + +/* +** mla_11_u8_m_tied1: +** mov (z[0-9]+\.b), #11 +** mla 
z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u8_m_tied1, svuint8_t, + z0 = svmla_n_u8_m (p0, z0, z1, 11), + z0 = svmla_m (p0, z0, z1, 11)) + +/* +** mla_11_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u8_m_untied, svuint8_t, + z0 = svmla_n_u8_m (p0, z1, z2, 11), + z0 = svmla_m (p0, z1, z2, 11)) + +/* +** mla_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_z_tied1, svuint8_t, + z0 = svmla_u8_z (p0, z0, z1, z2), + z0 = svmla_z (p0, z0, z1, z2)) + +/* +** mla_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_z_tied2, svuint8_t, + z0 = svmla_u8_z (p0, z1, z0, z2), + z0 = svmla_z (p0, z1, z0, z2)) + +/* +** mla_u8_z_tied3: +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_z_tied3, svuint8_t, + z0 = svmla_u8_z (p0, z1, z2, z0), + z0 = svmla_z (p0, z1, z2, z0)) + +/* +** mla_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** mla z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, z3\.b, z1\.b +** | +** movprfx z0\.b, p0/z, z3\.b +** mad z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_u8_z_untied, svuint8_t, + z0 = svmla_u8_z (p0, z1, z2, z3), + z0 = svmla_z (p0, z1, z2, z3)) + +/* +** mla_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svmla_n_u8_z (p0, z0, z1, x0), + z0 = svmla_z (p0, z0, z1, x0)) + +/* +** mla_w0_u8_z_tied2: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u8_z_tied2, svuint8_t, uint8_t, + z0 = svmla_n_u8_z (p0, z1, z0, x0), + z0 = svmla_z (p0, z1, z0, x0)) + +/* +** mla_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mla z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, \1, z1\.b +** | +** movprfx z0\.b, p0/z, \1 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svmla_n_u8_z (p0, z1, z2, x0), + z0 = svmla_z (p0, z1, z2, x0)) + +/* +** mla_11_u8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u8_z_tied1, svuint8_t, + z0 = svmla_n_u8_z (p0, z0, z1, 11), + z0 = svmla_z (p0, z0, z1, 11)) + +/* +** mla_11_u8_z_tied2: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mad z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_11_u8_z_tied2, svuint8_t, + z0 = svmla_n_u8_z (p0, z1, z0, 11), + z0 = svmla_z (p0, z1, z0, 11)) + +/* +** mla_11_u8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mla z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** mad z0\.b, p0/m, \1, z1\.b +** | +** movprfx z0\.b, p0/z, \1 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_11_u8_z_untied, svuint8_t, + z0 = svmla_n_u8_z (p0, z1, z2, 11), + z0 = svmla_z (p0, z1, z2, 11)) + +/* +** mla_u8_x_tied1: +** mla z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_x_tied1, svuint8_t, + z0 = svmla_u8_x (p0, z0, z1, z2), + z0 = svmla_x (p0, z0, z1, z2)) + +/* +** mla_u8_x_tied2: +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_x_tied2, svuint8_t, + z0 = svmla_u8_x 
(p0, z1, z0, z2), + z0 = svmla_x (p0, z1, z0, z2)) + +/* +** mla_u8_x_tied3: +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_u8_x_tied3, svuint8_t, + z0 = svmla_u8_x (p0, z1, z2, z0), + z0 = svmla_x (p0, z1, z2, z0)) + +/* +** mla_u8_x_untied: +** ( +** movprfx z0, z1 +** mla z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0, z2 +** mad z0\.b, p0/m, z3\.b, z1\.b +** | +** movprfx z0, z3 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mla_u8_x_untied, svuint8_t, + z0 = svmla_u8_x (p0, z1, z2, z3), + z0 = svmla_x (p0, z1, z2, z3)) + +/* +** mla_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svmla_n_u8_x (p0, z0, z1, x0), + z0 = svmla_x (p0, z0, z1, x0)) + +/* +** mla_w0_u8_x_tied2: +** mov (z[0-9]+\.b), w0 +** mad z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u8_x_tied2, svuint8_t, uint8_t, + z0 = svmla_n_u8_x (p0, z1, z0, x0), + z0 = svmla_x (p0, z1, z0, x0)) + +/* +** mla_w0_u8_x_untied: +** mov z0\.b, w0 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mla_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svmla_n_u8_x (p0, z1, z2, x0), + z0 = svmla_x (p0, z1, z2, x0)) + +/* +** mla_11_u8_x_tied1: +** mov (z[0-9]+\.b), #11 +** mla z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mla_11_u8_x_tied1, svuint8_t, + z0 = svmla_n_u8_x (p0, z0, z1, 11), + z0 = svmla_x (p0, z0, z1, 11)) + +/* +** mla_11_u8_x_tied2: +** mov (z[0-9]+\.b), #11 +** mad z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_11_u8_x_tied2, svuint8_t, + z0 = svmla_n_u8_x (p0, z1, z0, 11), + z0 = svmla_x (p0, z1, z0, 11)) + +/* +** mla_11_u8_x_untied: +** mov z0\.b, #11 +** mad z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mla_11_u8_x_untied, svuint8_t, + z0 = svmla_n_u8_x (p0, z1, z2, 11), + z0 = svmla_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f16.c new file mode 100644 index 000000000..87fba3da7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f16.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_f16_m_tied1: +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_m_tied1, svfloat16_t, + z0 = svmls_f16_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmls z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_m_tied2, svfloat16_t, + z0 = svmls_f16_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmls z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_m_tied3, svfloat16_t, + z0 = svmls_f16_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_f16_m_untied: +** movprfx z0, z1 +** fmls z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_m_untied, svfloat16_t, + z0 = svmls_f16_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmls_n_f16_m (p0, z0, z1, d4), + z0 = svmls_m (p0, z0, z1, d4)) + +/* +** mls_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fmls z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = 
svmls_n_f16_m (p0, z1, z2, d4), + z0 = svmls_m (p0, z1, z2, d4)) + +/* +** mls_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f16_m_tied1, svfloat16_t, + z0 = svmls_n_f16_m (p0, z0, z1, 2), + z0 = svmls_m (p0, z0, z1, 2)) + +/* +** mls_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0, z1 +** fmls z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f16_m_untied, svfloat16_t, + z0 = svmls_n_f16_m (p0, z1, z2, 2), + z0 = svmls_m (p0, z1, z2, 2)) + +/* +** mls_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_z_tied1, svfloat16_t, + z0 = svmls_f16_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_z_tied2, svfloat16_t, + z0 = svmls_f16_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_f16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_z_tied3, svfloat16_t, + z0 = svmls_f16_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmls z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmsb z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_f16_z_untied, svfloat16_t, + z0 = svmls_f16_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmls_n_f16_z (p0, z0, z1, d4), + z0 = svmls_z (p0, z0, z1, d4)) + +/* +** mls_h4_f16_z_tied2: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (mls_h4_f16_z_tied2, svfloat16_t, __fp16, + z0 = svmls_n_f16_z (p0, z1, z0, d4), + z0 = svmls_z (p0, z1, z0, d4)) + +/* +** mls_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmls z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fmsb z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (mls_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmls_n_f16_z (p0, z1, z2, d4), + z0 = svmls_z (p0, z1, z2, d4)) + +/* +** mls_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f16_z_tied1, svfloat16_t, + z0 = svmls_n_f16_z (p0, z0, z1, 2), + z0 = svmls_z (p0, z0, z1, 2)) + +/* +** mls_2_f16_z_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_2_f16_z_tied2, svfloat16_t, + z0 = svmls_n_f16_z (p0, z1, z0, 2), + z0 = svmls_z (p0, z1, z0, 2)) + +/* +** mls_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fmls z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fmsb z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_2_f16_z_untied, svfloat16_t, + z0 = svmls_n_f16_z (p0, z1, z2, 2), + z0 = svmls_z (p0, z1, z2, 2)) + +/* +** mls_f16_x_tied1: +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_x_tied1, svfloat16_t, + z0 = svmls_f16_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_f16_x_tied2: +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_x_tied2, svfloat16_t, + z0 = svmls_f16_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_f16_x_tied3: +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_f16_x_tied3, svfloat16_t, + z0 = svmls_f16_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_f16_x_untied: +** ( +** movprfx z0, z1 +** fmls z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** fmsb z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0, z3 +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_f16_x_untied, svfloat16_t, + z0 = svmls_f16_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmls_n_f16_x (p0, z0, z1, d4), + z0 = svmls_x (p0, z0, z1, d4)) + +/* +** mls_h4_f16_x_tied2: +** mov (z[0-9]+\.h), h4 +** fmsb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (mls_h4_f16_x_tied2, svfloat16_t, __fp16, + z0 = svmls_n_f16_x (p0, z1, z0, d4), + z0 = svmls_x (p0, z1, z0, d4)) + +/* +** mls_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (mls_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmls_n_f16_x (p0, z1, z2, d4), + z0 = svmls_x (p0, z1, z2, d4)) + +/* +** mls_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f16_x_tied1, svfloat16_t, + z0 = svmls_n_f16_x (p0, z0, z1, 2), + z0 = svmls_x (p0, z0, z1, 2)) + +/* +** mls_2_f16_x_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmsb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_2_f16_x_tied2, svfloat16_t, + z0 = svmls_n_f16_x (p0, z1, z0, 2), + z0 = svmls_x (p0, z1, z0, 2)) + +/* +** mls_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_2_f16_x_untied, svfloat16_t, + z0 = svmls_n_f16_x (p0, z1, z2, 2), + z0 = svmls_x (p0, z1, z2, 2)) + +/* +** ptrue_mls_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f16_x_tied1, svfloat16_t, + z0 = svmls_f16_x (svptrue_b16 (), z0, z1, z2), + z0 = svmls_x (svptrue_b16 (), z0, z1, z2)) + +/* +** ptrue_mls_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f16_x_tied2, svfloat16_t, + z0 = svmls_f16_x (svptrue_b16 (), z1, z0, z2), + z0 = svmls_x (svptrue_b16 (), z1, z0, z2)) + +/* +** ptrue_mls_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f16_x_tied3, svfloat16_t, + z0 = svmls_f16_x (svptrue_b16 (), z1, z2, z0), + z0 = svmls_x (svptrue_b16 (), z1, z2, z0)) + +/* +** ptrue_mls_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f16_x_untied, svfloat16_t, + z0 = svmls_f16_x (svptrue_b16 (), z1, z2, z3), + z0 = svmls_x (svptrue_b16 (), z1, z2, z3)) + +/* +** ptrue_mls_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f16_x_tied1, svfloat16_t, + z0 = svmls_n_f16_x (svptrue_b16 (), z0, z1, 2), + z0 = svmls_x (svptrue_b16 (), z0, z1, 2)) + +/* +** ptrue_mls_2_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f16_x_tied2, svfloat16_t, + z0 = svmls_n_f16_x (svptrue_b16 (), z1, z0, 2), + z0 = svmls_x (svptrue_b16 (), z1, z0, 2)) + +/* +** ptrue_mls_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f16_x_untied, svfloat16_t, + z0 = svmls_n_f16_x (svptrue_b16 (), z1, z2, 2), + z0 = svmls_x (svptrue_b16 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f32.c new file mode 100644 index 000000000..04ce1ec46 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f32.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_f32_m_tied1: +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_m_tied1, svfloat32_t, + z0 = svmls_f32_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmls z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_m_tied2, svfloat32_t, + z0 = svmls_f32_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmls z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_m_tied3, svfloat32_t, + z0 = svmls_f32_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_f32_m_untied: +** movprfx z0, z1 +** fmls z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_m_untied, svfloat32_t, + z0 = svmls_f32_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmls_n_f32_m (p0, z0, z1, d4), + z0 = svmls_m (p0, z0, z1, d4)) + +/* +** mls_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmls z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_s4_f32_m_untied, svfloat32_t, float, + z0 = svmls_n_f32_m (p0, z1, z2, d4), + z0 = svmls_m (p0, z1, z2, d4)) + +/* +** mls_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f32_m_tied1, svfloat32_t, + z0 = svmls_n_f32_m (p0, z0, z1, 2), + z0 = svmls_m (p0, z0, z1, 2)) + +/* +** mls_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmls z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f32_m_untied, svfloat32_t, + z0 = svmls_n_f32_m (p0, z1, z2, 2), + z0 = svmls_m (p0, z1, z2, 2)) + +/* +** mls_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_z_tied1, svfloat32_t, + z0 = svmls_f32_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_z_tied2, svfloat32_t, + z0 = svmls_f32_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_f32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_z_tied3, svfloat32_t, + z0 = svmls_f32_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmls z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmsb z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_f32_z_untied, svfloat32_t, + z0 = svmls_f32_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmls_n_f32_z (p0, z0, z1, d4), + z0 = svmls_z (p0, z0, z1, d4)) + +/* +** mls_s4_f32_z_tied2: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (mls_s4_f32_z_tied2, svfloat32_t, float, + z0 = svmls_n_f32_z (p0, z1, z0, d4), + z0 = svmls_z (p0, z1, z0, d4)) + +/* +** mls_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmls z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fmsb z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (mls_s4_f32_z_untied, svfloat32_t, float, + z0 = svmls_n_f32_z (p0, z1, z2, d4), + z0 = svmls_z (p0, z1, z2, d4)) + +/* +** mls_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f32_z_tied1, svfloat32_t, + z0 = svmls_n_f32_z (p0, z0, z1, 2), + z0 = svmls_z (p0, z0, z1, 2)) + +/* +** mls_2_f32_z_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_2_f32_z_tied2, svfloat32_t, + z0 = svmls_n_f32_z (p0, z1, z0, 2), + z0 = svmls_z (p0, z1, z0, 2)) + +/* +** mls_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fmls z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fmsb z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_2_f32_z_untied, svfloat32_t, + z0 = svmls_n_f32_z (p0, z1, z2, 2), + z0 = svmls_z (p0, z1, z2, 2)) + +/* +** mls_f32_x_tied1: +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_x_tied1, svfloat32_t, + z0 = svmls_f32_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_f32_x_tied2: +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_x_tied2, svfloat32_t, + z0 = svmls_f32_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_f32_x_tied3: +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_f32_x_tied3, svfloat32_t, + z0 = svmls_f32_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_f32_x_untied: +** ( +** movprfx z0, z1 +** fmls z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** fmsb z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0, z3 +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_f32_x_untied, svfloat32_t, + z0 = svmls_f32_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmls_n_f32_x (p0, z0, z1, d4), + z0 = svmls_x (p0, z0, z1, d4)) + +/* +** mls_s4_f32_x_tied2: +** mov (z[0-9]+\.s), s4 +** fmsb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (mls_s4_f32_x_tied2, svfloat32_t, float, + z0 = svmls_n_f32_x (p0, z1, z0, d4), + z0 = svmls_x (p0, z1, z0, d4)) + +/* +** mls_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (mls_s4_f32_x_untied, svfloat32_t, float, + z0 = svmls_n_f32_x (p0, z1, z2, d4), + z0 = svmls_x (p0, z1, z2, d4)) + +/* +** mls_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f32_x_tied1, svfloat32_t, + z0 = svmls_n_f32_x (p0, z0, z1, 2), + z0 = svmls_x (p0, z0, z1, 2)) + +/* +** mls_2_f32_x_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmsb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_2_f32_x_tied2, svfloat32_t, + z0 = svmls_n_f32_x (p0, z1, z0, 2), + z0 = svmls_x (p0, z1, z0, 2)) + +/* +** mls_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_2_f32_x_untied, svfloat32_t, + z0 = svmls_n_f32_x (p0, z1, z2, 2), + z0 = svmls_x (p0, z1, z2, 2)) + +/* +** ptrue_mls_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f32_x_tied1, svfloat32_t, + z0 = svmls_f32_x (svptrue_b32 (), z0, z1, z2), + z0 = svmls_x (svptrue_b32 (), z0, z1, z2)) + +/* +** ptrue_mls_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f32_x_tied2, svfloat32_t, + z0 = svmls_f32_x (svptrue_b32 (), z1, z0, z2), + z0 = svmls_x (svptrue_b32 (), z1, z0, z2)) + +/* +** ptrue_mls_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f32_x_tied3, svfloat32_t, + z0 = svmls_f32_x (svptrue_b32 (), z1, z2, z0), + z0 = svmls_x (svptrue_b32 (), z1, z2, z0)) + +/* +** ptrue_mls_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f32_x_untied, svfloat32_t, + z0 = svmls_f32_x (svptrue_b32 (), z1, z2, z3), + z0 = svmls_x (svptrue_b32 (), z1, z2, z3)) + +/* +** ptrue_mls_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f32_x_tied1, svfloat32_t, + z0 = svmls_n_f32_x (svptrue_b32 (), z0, z1, 2), + z0 = svmls_x (svptrue_b32 (), z0, z1, 2)) + +/* +** ptrue_mls_2_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f32_x_tied2, svfloat32_t, + z0 = svmls_n_f32_x (svptrue_b32 (), z1, z0, 2), + z0 = svmls_x (svptrue_b32 (), z1, z0, 2)) + +/* +** ptrue_mls_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f32_x_untied, svfloat32_t, + z0 = svmls_n_f32_x (svptrue_b32 (), z1, z2, 2), + z0 = svmls_x (svptrue_b32 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f64.c new file mode 100644 index 000000000..1e2108af6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_f64.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_f64_m_tied1: +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_m_tied1, svfloat64_t, + z0 = svmls_f64_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmls z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_m_tied2, svfloat64_t, + z0 = svmls_f64_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_f64_m_tied3, svfloat64_t, + z0 = svmls_f64_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_f64_m_untied: +** movprfx z0, z1 +** fmls z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_m_untied, svfloat64_t, + z0 = svmls_f64_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmls_n_f64_m (p0, z0, z1, d4), + z0 = svmls_m (p0, z0, z1, d4)) + +/* +** mls_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_d4_f64_m_untied, svfloat64_t, double, + z0 = svmls_n_f64_m (p0, z1, z2, d4), + z0 = svmls_m (p0, z1, z2, d4)) + +/* +** mls_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f64_m_tied1, svfloat64_t, + z0 = svmls_n_f64_m (p0, z0, z1, 2), + z0 = svmls_m (p0, z0, z1, 2)) + +/* +** mls_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f64_m_untied, svfloat64_t, + z0 = svmls_n_f64_m (p0, z1, z2, 2), + z0 = svmls_m (p0, z1, z2, 2)) + +/* +** mls_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_z_tied1, svfloat64_t, + z0 = svmls_f64_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_z_tied2, svfloat64_t, + z0 = svmls_f64_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_f64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_z_tied3, svfloat64_t, + z0 = svmls_f64_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmls z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmsb z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_f64_z_untied, svfloat64_t, + z0 = svmls_f64_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmls_n_f64_z (p0, z0, z1, d4), + z0 = svmls_z (p0, z0, z1, d4)) + +/* +** mls_d4_f64_z_tied2: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (mls_d4_f64_z_tied2, svfloat64_t, double, + z0 = svmls_n_f64_z (p0, z1, z0, d4), + z0 = svmls_z (p0, z1, z0, d4)) + +/* +** mls_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmls z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fmsb z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (mls_d4_f64_z_untied, svfloat64_t, double, + z0 = svmls_n_f64_z (p0, z1, z2, d4), + z0 = svmls_z (p0, z1, z2, d4)) + +/* +** mls_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f64_z_tied1, svfloat64_t, + z0 = svmls_n_f64_z (p0, z0, z1, 2), + z0 = svmls_z (p0, z0, z1, 2)) + +/* +** mls_2_f64_z_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_2_f64_z_tied2, svfloat64_t, + z0 = svmls_n_f64_z (p0, z1, z0, 2), + z0 = svmls_z (p0, z1, z0, 2)) + +/* +** mls_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fmls z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fmsb z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_2_f64_z_untied, svfloat64_t, + z0 = svmls_n_f64_z (p0, z1, z2, 2), + z0 = svmls_z (p0, z1, z2, 2)) + +/* +** mls_f64_x_tied1: +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_x_tied1, svfloat64_t, + z0 = svmls_f64_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_f64_x_tied2: +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_x_tied2, svfloat64_t, + z0 = svmls_f64_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_f64_x_tied3: +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_f64_x_tied3, svfloat64_t, + z0 = svmls_f64_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_f64_x_untied: +** ( +** movprfx z0, z1 +** fmls z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** fmsb z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0, z3 +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_f64_x_untied, svfloat64_t, + z0 = svmls_f64_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mls_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmls_n_f64_x (p0, z0, z1, d4), + z0 = svmls_x (p0, z0, z1, d4)) + +/* +** mls_d4_f64_x_tied2: +** mov (z[0-9]+\.d), d4 +** fmsb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (mls_d4_f64_x_tied2, svfloat64_t, double, + z0 = svmls_n_f64_x (p0, z1, z0, d4), + z0 = svmls_x (p0, z1, z0, d4)) + +/* +** mls_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (mls_d4_f64_x_untied, svfloat64_t, double, + z0 = svmls_n_f64_x (p0, z1, z2, d4), + z0 = svmls_x (p0, z1, z2, d4)) + +/* +** mls_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_2_f64_x_tied1, svfloat64_t, + z0 = svmls_n_f64_x (p0, z0, z1, 2), + z0 = svmls_x (p0, z0, z1, 2)) + +/* +** mls_2_f64_x_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmsb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_2_f64_x_tied2, svfloat64_t, + z0 = svmls_n_f64_x (p0, z1, z0, 2), + z0 = svmls_x (p0, z1, z0, 2)) + +/* +** mls_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_2_f64_x_untied, svfloat64_t, + z0 = svmls_n_f64_x (p0, z1, z2, 2), + z0 = svmls_x (p0, z1, z2, 2)) + +/* +** ptrue_mls_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f64_x_tied1, svfloat64_t, + z0 = svmls_f64_x (svptrue_b64 (), z0, z1, z2), + z0 = svmls_x (svptrue_b64 (), z0, z1, z2)) + +/* +** ptrue_mls_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f64_x_tied2, svfloat64_t, + z0 = svmls_f64_x (svptrue_b64 (), z1, z0, z2), + z0 = svmls_x (svptrue_b64 (), z1, z0, z2)) + +/* +** ptrue_mls_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f64_x_tied3, svfloat64_t, + z0 = svmls_f64_x (svptrue_b64 (), z1, z2, z0), + z0 = svmls_x (svptrue_b64 (), z1, z2, z0)) + +/* +** ptrue_mls_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_f64_x_untied, svfloat64_t, + z0 = svmls_f64_x (svptrue_b64 (), z1, z2, z3), + z0 = svmls_x (svptrue_b64 (), z1, z2, z3)) + +/* +** ptrue_mls_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f64_x_tied1, svfloat64_t, + z0 = svmls_n_f64_x (svptrue_b64 (), z0, z1, 2), + z0 = svmls_x (svptrue_b64 (), z0, z1, 2)) + +/* +** ptrue_mls_2_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f64_x_tied2, svfloat64_t, + z0 = svmls_n_f64_x (svptrue_b64 (), z1, z0, 2), + z0 = svmls_x (svptrue_b64 (), z1, z0, 2)) + +/* +** ptrue_mls_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mls_2_f64_x_untied, svfloat64_t, + z0 = svmls_n_f64_x (svptrue_b64 (), z1, z2, 2), + z0 = svmls_x (svptrue_b64 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f16.c new file mode 100644 index 000000000..832376d0b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f16.c @@ -0,0 +1,128 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_lane_0_f16_tied1: +** fmls z0\.h, z1\.h, z2\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f16_tied1, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z2, 0), + z0 = svmls_lane (z0, z1, z2, 0)) + +/* +** mls_lane_0_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmls z0\.h, \1\.h, z2\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f16_tied2, svfloat16_t, + z0 = svmls_lane_f16 (z1, z0, z2, 0), + z0 = svmls_lane (z1, z0, z2, 0)) + +/* +** mls_lane_0_f16_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmls z0\.h, z2\.h, \1\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f16_tied3, svfloat16_t, + z0 = svmls_lane_f16 (z1, z2, z0, 0), + z0 = svmls_lane (z1, z2, z0, 0)) + +/* +** mls_lane_0_f16_untied: +** movprfx z0, z1 +** fmls z0\.h, z2\.h, z3\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f16_untied, svfloat16_t, + z0 = svmls_lane_f16 (z1, z2, z3, 0), + z0 = svmls_lane (z1, z2, z3, 0)) + +/* +** mls_lane_1_f16: +** fmls z0\.h, z1\.h, z2\.h\[1\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_1_f16, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z2, 1), + z0 = svmls_lane (z0, z1, z2, 1)) + +/* +** mls_lane_2_f16: +** fmls z0\.h, z1\.h, z2\.h\[2\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_2_f16, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z2, 2), + z0 = svmls_lane (z0, z1, z2, 2)) + +/* +** mls_lane_3_f16: +** fmls z0\.h, z1\.h, z2\.h\[3\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_3_f16, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z2, 3), + z0 = svmls_lane (z0, z1, z2, 3)) + +/* +** mls_lane_4_f16: +** fmls z0\.h, z1\.h, z2\.h\[4\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_4_f16, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z2, 4), + z0 = svmls_lane (z0, z1, z2, 4)) + +/* +** mls_lane_5_f16: +** fmls z0\.h, z1\.h, z2\.h\[5\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_5_f16, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z2, 5), + z0 = svmls_lane (z0, z1, z2, 5)) + +/* +** mls_lane_6_f16: +** fmls z0\.h, z1\.h, z2\.h\[6\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_6_f16, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z2, 6), + z0 = svmls_lane (z0, z1, z2, 6)) + +/* +** mls_lane_7_f16: +** fmls z0\.h, z1\.h, z2\.h\[7\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_7_f16, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z2, 7), + z0 = svmls_lane (z0, z1, z2, 7)) + +/* +** 
mls_lane_z7_f16: +** fmls z0\.h, z1\.h, z7\.h\[7\] +** ret +*/ +TEST_DUAL_Z (mls_lane_z7_f16, svfloat16_t, svfloat16_t, + z0 = svmls_lane_f16 (z0, z1, z7, 7), + z0 = svmls_lane (z0, z1, z7, 7)) + +/* +** mls_lane_z8_f16: +** str d8, \[sp, -16\]! +** mov (z[0-7])\.d, z8\.d +** fmls z0\.h, z1\.h, \1\.h\[7\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mls_lane_z8_f16, svfloat16_t, svfloat16_t, z8, + z0 = svmls_lane_f16 (z0, z1, z8, 7), + z0 = svmls_lane (z0, z1, z8, 7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f32.c new file mode 100644 index 000000000..3244b972f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f32.c @@ -0,0 +1,92 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_lane_0_f32_tied1: +** fmls z0\.s, z1\.s, z2\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f32_tied1, svfloat32_t, + z0 = svmls_lane_f32 (z0, z1, z2, 0), + z0 = svmls_lane (z0, z1, z2, 0)) + +/* +** mls_lane_0_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmls z0\.s, \1\.s, z2\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f32_tied2, svfloat32_t, + z0 = svmls_lane_f32 (z1, z0, z2, 0), + z0 = svmls_lane (z1, z0, z2, 0)) + +/* +** mls_lane_0_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmls z0\.s, z2\.s, \1\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f32_tied3, svfloat32_t, + z0 = svmls_lane_f32 (z1, z2, z0, 0), + z0 = svmls_lane (z1, z2, z0, 0)) + +/* +** mls_lane_0_f32_untied: +** movprfx z0, z1 +** fmls z0\.s, z2\.s, z3\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f32_untied, svfloat32_t, + z0 = svmls_lane_f32 (z1, z2, z3, 0), + z0 = svmls_lane (z1, z2, z3, 0)) + +/* +** mls_lane_1_f32: +** fmls z0\.s, z1\.s, z2\.s\[1\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_1_f32, svfloat32_t, + z0 = svmls_lane_f32 (z0, z1, z2, 1), + z0 = svmls_lane (z0, z1, z2, 1)) + +/* +** mls_lane_2_f32: +** fmls z0\.s, z1\.s, z2\.s\[2\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_2_f32, svfloat32_t, + z0 = svmls_lane_f32 (z0, z1, z2, 2), + z0 = svmls_lane (z0, z1, z2, 2)) + +/* +** mls_lane_3_f32: +** fmls z0\.s, z1\.s, z2\.s\[3\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_3_f32, svfloat32_t, + z0 = svmls_lane_f32 (z0, z1, z2, 3), + z0 = svmls_lane (z0, z1, z2, 3)) + +/* +** mls_lane_z7_f32: +** fmls z0\.s, z1\.s, z7\.s\[3\] +** ret +*/ +TEST_DUAL_Z (mls_lane_z7_f32, svfloat32_t, svfloat32_t, + z0 = svmls_lane_f32 (z0, z1, z7, 3), + z0 = svmls_lane (z0, z1, z7, 3)) + +/* +** mls_lane_z8_f32: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** fmls z0\.s, z1\.s, \1\.s\[3\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mls_lane_z8_f32, svfloat32_t, svfloat32_t, z8, + z0 = svmls_lane_f32 (z0, z1, z8, 3), + z0 = svmls_lane (z0, z1, z8, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f64.c new file mode 100644 index 000000000..16f20ca53 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_lane_f64.c @@ -0,0 +1,83 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_lane_0_f64_tied1: +** fmls z0\.d, z1\.d, z2\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f64_tied1, svfloat64_t, + z0 = svmls_lane_f64 (z0, z1, z2, 0), + z0 = svmls_lane (z0, z1, z2, 0)) + +/* +** mls_lane_0_f64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmls z0\.d, \1, z2\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f64_tied2, svfloat64_t, + z0 = svmls_lane_f64 (z1, z0, z2, 0), + z0 = svmls_lane (z1, z0, z2, 0)) + +/* +** mls_lane_0_f64_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmls z0\.d, z2\.d, \1\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f64_tied3, svfloat64_t, + z0 = svmls_lane_f64 (z1, z2, z0, 0), + z0 = svmls_lane (z1, z2, z0, 0)) + +/* +** mls_lane_0_f64_untied: +** movprfx z0, z1 +** fmls z0\.d, z2\.d, z3\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_0_f64_untied, svfloat64_t, + z0 = svmls_lane_f64 (z1, z2, z3, 0), + z0 = svmls_lane (z1, z2, z3, 0)) + +/* +** mls_lane_1_f64: +** fmls z0\.d, z1\.d, z2\.d\[1\] +** ret +*/ +TEST_UNIFORM_Z (mls_lane_1_f64, svfloat64_t, + z0 = svmls_lane_f64 (z0, z1, z2, 1), + z0 = svmls_lane (z0, z1, z2, 1)) + +/* +** mls_lane_z7_f64: +** fmls z0\.d, z1\.d, z7\.d\[1\] +** ret +*/ +TEST_DUAL_Z (mls_lane_z7_f64, svfloat64_t, svfloat64_t, + z0 = svmls_lane_f64 (z0, z1, z7, 1), + z0 = svmls_lane (z0, z1, z7, 1)) + +/* +** mls_lane_z15_f64: +** str d15, \[sp, -16\]! 
+** fmls z0\.d, z1\.d, z15\.d\[1\] +** ldr d15, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mls_lane_z15_f64, svfloat64_t, svfloat64_t, z15, + z0 = svmls_lane_f64 (z0, z1, z15, 1), + z0 = svmls_lane (z0, z1, z15, 1)) + +/* +** mls_lane_z16_f64: +** mov (z[0-9]|z1[0-5])\.d, z16\.d +** fmls z0\.d, z1\.d, \1\.d\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (mls_lane_z16_f64, svfloat64_t, svfloat64_t, z16, + z0 = svmls_lane_f64 (z0, z1, z16, 1), + z0 = svmls_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s16.c new file mode 100644 index 000000000..e199829c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s16.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_s16_m_tied1: +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_m_tied1, svint16_t, + z0 = svmls_s16_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_m_tied2, svint16_t, + z0 = svmls_s16_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_s16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_m_tied3, svint16_t, + z0 = svmls_s16_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_s16_m_untied: +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_m_untied, svint16_t, + z0 = svmls_s16_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svmls_n_s16_m (p0, z0, z1, x0), + z0 = svmls_m (p0, z0, z1, x0)) + +/* +** mls_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s16_m_untied, svint16_t, int16_t, + z0 = svmls_n_s16_m (p0, z1, z2, x0), + z0 = svmls_m (p0, z1, z2, x0)) + +/* +** mls_11_s16_m_tied1: +** mov (z[0-9]+\.h), #11 +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s16_m_tied1, svint16_t, + z0 = svmls_n_s16_m (p0, z0, z1, 11), + z0 = svmls_m (p0, z0, z1, 11)) + +/* +** mls_11_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s16_m_untied, svint16_t, + z0 = svmls_n_s16_m (p0, z1, z2, 11), + z0 = svmls_m (p0, z1, z2, 11)) + +/* +** mls_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_z_tied1, svint16_t, + z0 = svmls_s16_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_z_tied2, svint16_t, + z0 = svmls_s16_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_s16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_z_tied3, svint16_t, + z0 = svmls_s16_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** mls z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** msb z0\.h, 
p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_s16_z_untied, svint16_t, + z0 = svmls_s16_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svmls_n_s16_z (p0, z0, z1, x0), + z0 = svmls_z (p0, z0, z1, x0)) + +/* +** mls_w0_s16_z_tied2: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s16_z_tied2, svint16_t, int16_t, + z0 = svmls_n_s16_z (p0, z1, z0, x0), + z0 = svmls_z (p0, z1, z0, x0)) + +/* +** mls_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mls z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s16_z_untied, svint16_t, int16_t, + z0 = svmls_n_s16_z (p0, z1, z2, x0), + z0 = svmls_z (p0, z1, z2, x0)) + +/* +** mls_11_s16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s16_z_tied1, svint16_t, + z0 = svmls_n_s16_z (p0, z0, z1, 11), + z0 = svmls_z (p0, z0, z1, 11)) + +/* +** mls_11_s16_z_tied2: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_11_s16_z_tied2, svint16_t, + z0 = svmls_n_s16_z (p0, z1, z0, 11), + z0 = svmls_z (p0, z1, z0, 11)) + +/* +** mls_11_s16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mls z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_11_s16_z_untied, svint16_t, + z0 = svmls_n_s16_z (p0, z1, z2, 11), + z0 = svmls_z (p0, z1, z2, 11)) + +/* +** mls_s16_x_tied1: +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_x_tied1, svint16_t, + z0 = svmls_s16_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_s16_x_tied2: +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_x_tied2, svint16_t, + z0 = svmls_s16_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_s16_x_tied3: +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_s16_x_tied3, svint16_t, + z0 = svmls_s16_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_s16_x_untied: +** ( +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** msb z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0, z3 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_s16_x_untied, svint16_t, + z0 = svmls_s16_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svmls_n_s16_x (p0, z0, z1, x0), + z0 = svmls_x (p0, z0, z1, x0)) + +/* +** mls_w0_s16_x_tied2: +** mov (z[0-9]+\.h), w0 +** msb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s16_x_tied2, svint16_t, int16_t, + z0 = svmls_n_s16_x (p0, z1, z0, x0), + z0 = svmls_x (p0, z1, z0, x0)) + +/* +** mls_w0_s16_x_untied: +** mov z0\.h, w0 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s16_x_untied, svint16_t, int16_t, + z0 = svmls_n_s16_x (p0, z1, z2, x0), + z0 = svmls_x (p0, 
z1, z2, x0)) + +/* +** mls_11_s16_x_tied1: +** mov (z[0-9]+\.h), #11 +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s16_x_tied1, svint16_t, + z0 = svmls_n_s16_x (p0, z0, z1, 11), + z0 = svmls_x (p0, z0, z1, 11)) + +/* +** mls_11_s16_x_tied2: +** mov (z[0-9]+\.h), #11 +** msb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_11_s16_x_tied2, svint16_t, + z0 = svmls_n_s16_x (p0, z1, z0, 11), + z0 = svmls_x (p0, z1, z0, 11)) + +/* +** mls_11_s16_x_untied: +** mov z0\.h, #11 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_11_s16_x_untied, svint16_t, + z0 = svmls_n_s16_x (p0, z1, z2, 11), + z0 = svmls_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s32.c new file mode 100644 index 000000000..fe386d01c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s32.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_s32_m_tied1: +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_m_tied1, svint32_t, + z0 = svmls_s32_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_m_tied2, svint32_t, + z0 = svmls_s32_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_s32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_m_tied3, svint32_t, + z0 = svmls_s32_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_s32_m_untied: +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_m_untied, svint32_t, + z0 = svmls_s32_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svmls_n_s32_m (p0, z0, z1, x0), + z0 = svmls_m (p0, z0, z1, x0)) + +/* +** mls_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s32_m_untied, svint32_t, int32_t, + z0 = svmls_n_s32_m (p0, z1, z2, x0), + z0 = svmls_m (p0, z1, z2, x0)) + +/* +** mls_11_s32_m_tied1: +** mov (z[0-9]+\.s), #11 +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s32_m_tied1, svint32_t, + z0 = svmls_n_s32_m (p0, z0, z1, 11), + z0 = svmls_m (p0, z0, z1, 11)) + +/* +** mls_11_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s32_m_untied, svint32_t, + z0 = svmls_n_s32_m (p0, z1, z2, 11), + z0 = svmls_m (p0, z1, z2, 11)) + +/* +** mls_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_z_tied1, svint32_t, + z0 = svmls_s32_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_z_tied2, svint32_t, + z0 = svmls_s32_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_s32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_z_tied3, svint32_t, + z0 = svmls_s32_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_s32_z_untied: +** ( +** 
movprfx z0\.s, p0/z, z1\.s +** mls z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** msb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_s32_z_untied, svint32_t, + z0 = svmls_s32_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svmls_n_s32_z (p0, z0, z1, x0), + z0 = svmls_z (p0, z0, z1, x0)) + +/* +** mls_w0_s32_z_tied2: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s32_z_tied2, svint32_t, int32_t, + z0 = svmls_n_s32_z (p0, z1, z0, x0), + z0 = svmls_z (p0, z1, z0, x0)) + +/* +** mls_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mls z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s32_z_untied, svint32_t, int32_t, + z0 = svmls_n_s32_z (p0, z1, z2, x0), + z0 = svmls_z (p0, z1, z2, x0)) + +/* +** mls_11_s32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s32_z_tied1, svint32_t, + z0 = svmls_n_s32_z (p0, z0, z1, 11), + z0 = svmls_z (p0, z0, z1, 11)) + +/* +** mls_11_s32_z_tied2: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_11_s32_z_tied2, svint32_t, + z0 = svmls_n_s32_z (p0, z1, z0, 11), + z0 = svmls_z (p0, z1, z0, 11)) + +/* +** mls_11_s32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mls z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_11_s32_z_untied, svint32_t, + z0 = svmls_n_s32_z (p0, z1, z2, 11), + z0 = svmls_z (p0, z1, z2, 11)) + +/* +** mls_s32_x_tied1: +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_x_tied1, svint32_t, + z0 = svmls_s32_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_s32_x_tied2: +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_x_tied2, svint32_t, + z0 = svmls_s32_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_s32_x_tied3: +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_s32_x_tied3, svint32_t, + z0 = svmls_s32_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_s32_x_untied: +** ( +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** msb z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0, z3 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_s32_x_untied, svint32_t, + z0 = svmls_s32_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svmls_n_s32_x (p0, z0, z1, x0), + z0 = svmls_x (p0, z0, z1, x0)) + +/* +** mls_w0_s32_x_tied2: +** mov (z[0-9]+\.s), w0 +** msb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s32_x_tied2, svint32_t, int32_t, + z0 = svmls_n_s32_x (p0, z1, z0, x0), + z0 = svmls_x (p0, z1, z0, x0)) + +/* +** mls_w0_s32_x_untied: 
+** mov z0\.s, w0 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s32_x_untied, svint32_t, int32_t, + z0 = svmls_n_s32_x (p0, z1, z2, x0), + z0 = svmls_x (p0, z1, z2, x0)) + +/* +** mls_11_s32_x_tied1: +** mov (z[0-9]+\.s), #11 +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s32_x_tied1, svint32_t, + z0 = svmls_n_s32_x (p0, z0, z1, 11), + z0 = svmls_x (p0, z0, z1, 11)) + +/* +** mls_11_s32_x_tied2: +** mov (z[0-9]+\.s), #11 +** msb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_11_s32_x_tied2, svint32_t, + z0 = svmls_n_s32_x (p0, z1, z0, 11), + z0 = svmls_x (p0, z1, z0, 11)) + +/* +** mls_11_s32_x_untied: +** mov z0\.s, #11 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_11_s32_x_untied, svint32_t, + z0 = svmls_n_s32_x (p0, z1, z2, 11), + z0 = svmls_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s64.c new file mode 100644 index 000000000..2998d733f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s64.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_s64_m_tied1: +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_m_tied1, svint64_t, + z0 = svmls_s64_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mls z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_m_tied2, svint64_t, + z0 = svmls_s64_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_s64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_s64_m_tied3, svint64_t, + z0 = svmls_s64_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_s64_m_untied: +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_m_untied, svint64_t, + z0 = svmls_s64_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svmls_n_s64_m (p0, z0, z1, x0), + z0 = svmls_m (p0, z0, z1, x0)) + +/* +** mls_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_s64_m_untied, svint64_t, int64_t, + z0 = svmls_n_s64_m (p0, z1, z2, x0), + z0 = svmls_m (p0, z1, z2, x0)) + +/* +** mls_11_s64_m_tied1: +** mov (z[0-9]+\.d), #11 +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s64_m_tied1, svint64_t, + z0 = svmls_n_s64_m (p0, z0, z1, 11), + z0 = svmls_m (p0, z0, z1, 11)) + +/* +** mls_11_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #11 +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s64_m_untied, svint64_t, + z0 = svmls_n_s64_m (p0, z1, z2, 11), + z0 = svmls_m (p0, z1, z2, 11)) + +/* +** mls_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_z_tied1, svint64_t, + z0 = svmls_s64_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_z_tied2, svint64_t, + z0 = svmls_s64_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_s64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z2\.d, 
z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_z_tied3, svint64_t, + z0 = svmls_s64_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** mls z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** msb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_s64_z_untied, svint64_t, + z0 = svmls_s64_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svmls_n_s64_z (p0, z0, z1, x0), + z0 = svmls_z (p0, z0, z1, x0)) + +/* +** mls_x0_s64_z_tied2: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_s64_z_tied2, svint64_t, int64_t, + z0 = svmls_n_s64_z (p0, z1, z0, x0), + z0 = svmls_z (p0, z1, z0, x0)) + +/* +** mls_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mls z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_s64_z_untied, svint64_t, int64_t, + z0 = svmls_n_s64_z (p0, z1, z2, x0), + z0 = svmls_z (p0, z1, z2, x0)) + +/* +** mls_11_s64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s64_z_tied1, svint64_t, + z0 = svmls_n_s64_z (p0, z0, z1, 11), + z0 = svmls_z (p0, z0, z1, 11)) + +/* +** mls_11_s64_z_tied2: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_11_s64_z_tied2, svint64_t, + z0 = svmls_n_s64_z (p0, z1, z0, 11), + z0 = svmls_z (p0, z1, z0, 11)) + +/* +** mls_11_s64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mls z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_11_s64_z_untied, svint64_t, + z0 = svmls_n_s64_z (p0, z1, z2, 11), + z0 = svmls_z (p0, z1, z2, 11)) + +/* +** mls_s64_x_tied1: +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_x_tied1, svint64_t, + z0 = svmls_s64_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_s64_x_tied2: +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_x_tied2, svint64_t, + z0 = svmls_s64_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_s64_x_tied3: +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_s64_x_tied3, svint64_t, + z0 = svmls_s64_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_s64_x_untied: +** ( +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** msb z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0, z3 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_s64_x_untied, svint64_t, + z0 = svmls_s64_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svmls_n_s64_x (p0, z0, z1, x0), + z0 = svmls_x (p0, z0, z1, x0)) + +/* +** mls_x0_s64_x_tied2: +** mov (z[0-9]+\.d), x0 +** msb z0\.d, p0/m, \1, z1\.d 
+** ret +*/ +TEST_UNIFORM_ZX (mls_x0_s64_x_tied2, svint64_t, int64_t, + z0 = svmls_n_s64_x (p0, z1, z0, x0), + z0 = svmls_x (p0, z1, z0, x0)) + +/* +** mls_x0_s64_x_untied: +** mov z0\.d, x0 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_s64_x_untied, svint64_t, int64_t, + z0 = svmls_n_s64_x (p0, z1, z2, x0), + z0 = svmls_x (p0, z1, z2, x0)) + +/* +** mls_11_s64_x_tied1: +** mov (z[0-9]+\.d), #11 +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s64_x_tied1, svint64_t, + z0 = svmls_n_s64_x (p0, z0, z1, 11), + z0 = svmls_x (p0, z0, z1, 11)) + +/* +** mls_11_s64_x_tied2: +** mov (z[0-9]+\.d), #11 +** msb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_11_s64_x_tied2, svint64_t, + z0 = svmls_n_s64_x (p0, z1, z0, 11), + z0 = svmls_x (p0, z1, z0, 11)) + +/* +** mls_11_s64_x_untied: +** mov z0\.d, #11 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_11_s64_x_untied, svint64_t, + z0 = svmls_n_s64_x (p0, z1, z2, 11), + z0 = svmls_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s8.c new file mode 100644 index 000000000..c60c43145 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_s8.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_s8_m_tied1: +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_m_tied1, svint8_t, + z0 = svmls_s8_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.b, p0/m, \1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_m_tied2, svint8_t, + z0 = svmls_s8_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_s8_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_m_tied3, svint8_t, + z0 = svmls_s8_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_s8_m_untied: +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, z3\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_m_untied, svint8_t, + z0 = svmls_s8_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svmls_n_s8_m (p0, z0, z1, x0), + z0 = svmls_m (p0, z0, z1, x0)) + +/* +** mls_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s8_m_untied, svint8_t, int8_t, + z0 = svmls_n_s8_m (p0, z1, z2, x0), + z0 = svmls_m (p0, z1, z2, x0)) + +/* +** mls_11_s8_m_tied1: +** mov (z[0-9]+\.b), #11 +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s8_m_tied1, svint8_t, + z0 = svmls_n_s8_m (p0, z0, z1, 11), + z0 = svmls_m (p0, z0, z1, 11)) + +/* +** mls_11_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s8_m_untied, svint8_t, + z0 = svmls_n_s8_m (p0, z1, z2, 11), + z0 = svmls_m (p0, z1, z2, 11)) + +/* +** mls_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_z_tied1, svint8_t, + z0 = svmls_s8_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_z_tied2, svint8_t, + z0 
= svmls_s8_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_s8_z_tied3: +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_z_tied3, svint8_t, + z0 = svmls_s8_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** mls z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, z3\.b, z1\.b +** | +** movprfx z0\.b, p0/z, z3\.b +** msb z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_s8_z_untied, svint8_t, + z0 = svmls_s8_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svmls_n_s8_z (p0, z0, z1, x0), + z0 = svmls_z (p0, z0, z1, x0)) + +/* +** mls_w0_s8_z_tied2: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s8_z_tied2, svint8_t, int8_t, + z0 = svmls_n_s8_z (p0, z1, z0, x0), + z0 = svmls_z (p0, z1, z0, x0)) + +/* +** mls_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mls z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, \1, z1\.b +** | +** movprfx z0\.b, p0/z, \1 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s8_z_untied, svint8_t, int8_t, + z0 = svmls_n_s8_z (p0, z1, z2, x0), + z0 = svmls_z (p0, z1, z2, x0)) + +/* +** mls_11_s8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s8_z_tied1, svint8_t, + z0 = svmls_n_s8_z (p0, z0, z1, 11), + z0 = svmls_z (p0, z0, z1, 11)) + +/* +** mls_11_s8_z_tied2: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_11_s8_z_tied2, svint8_t, + z0 = svmls_n_s8_z (p0, z1, z0, 11), + z0 = svmls_z (p0, z1, z0, 11)) + +/* +** mls_11_s8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mls z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, \1, z1\.b +** | +** movprfx z0\.b, p0/z, \1 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_11_s8_z_untied, svint8_t, + z0 = svmls_n_s8_z (p0, z1, z2, 11), + z0 = svmls_z (p0, z1, z2, 11)) + +/* +** mls_s8_x_tied1: +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_x_tied1, svint8_t, + z0 = svmls_s8_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_s8_x_tied2: +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_x_tied2, svint8_t, + z0 = svmls_s8_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_s8_x_tied3: +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_s8_x_tied3, svint8_t, + z0 = svmls_s8_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_s8_x_untied: +** ( +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0, z2 +** msb z0\.b, p0/m, z3\.b, z1\.b +** | +** movprfx z0, z3 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_s8_x_untied, svint8_t, + z0 = svmls_s8_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svmls_n_s8_x (p0, z0, z1, x0), + z0 = svmls_x (p0, z0, z1, 
x0)) + +/* +** mls_w0_s8_x_tied2: +** mov (z[0-9]+\.b), w0 +** msb z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s8_x_tied2, svint8_t, int8_t, + z0 = svmls_n_s8_x (p0, z1, z0, x0), + z0 = svmls_x (p0, z1, z0, x0)) + +/* +** mls_w0_s8_x_untied: +** mov z0\.b, w0 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_s8_x_untied, svint8_t, int8_t, + z0 = svmls_n_s8_x (p0, z1, z2, x0), + z0 = svmls_x (p0, z1, z2, x0)) + +/* +** mls_11_s8_x_tied1: +** mov (z[0-9]+\.b), #11 +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_s8_x_tied1, svint8_t, + z0 = svmls_n_s8_x (p0, z0, z1, 11), + z0 = svmls_x (p0, z0, z1, 11)) + +/* +** mls_11_s8_x_tied2: +** mov (z[0-9]+\.b), #11 +** msb z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_11_s8_x_tied2, svint8_t, + z0 = svmls_n_s8_x (p0, z1, z0, 11), + z0 = svmls_x (p0, z1, z0, 11)) + +/* +** mls_11_s8_x_untied: +** mov z0\.b, #11 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_11_s8_x_untied, svint8_t, + z0 = svmls_n_s8_x (p0, z1, z2, 11), + z0 = svmls_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u16.c new file mode 100644 index 000000000..e8a9f5cd9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u16.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_u16_m_tied1: +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_m_tied1, svuint16_t, + z0 = svmls_u16_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_m_tied2, svuint16_t, + z0 = svmls_u16_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_u16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_m_tied3, svuint16_t, + z0 = svmls_u16_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_u16_m_untied: +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_m_untied, svuint16_t, + z0 = svmls_u16_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svmls_n_u16_m (p0, z0, z1, x0), + z0 = svmls_m (p0, z0, z1, x0)) + +/* +** mls_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svmls_n_u16_m (p0, z1, z2, x0), + z0 = svmls_m (p0, z1, z2, x0)) + +/* +** mls_11_u16_m_tied1: +** mov (z[0-9]+\.h), #11 +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u16_m_tied1, svuint16_t, + z0 = svmls_n_u16_m (p0, z0, z1, 11), + z0 = svmls_m (p0, z0, z1, 11)) + +/* +** mls_11_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u16_m_untied, svuint16_t, + z0 = svmls_n_u16_m (p0, z1, z2, 11), + z0 = svmls_m (p0, z1, z2, 11)) + +/* +** mls_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_z_tied1, svuint16_t, + z0 = svmls_u16_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_u16_z_tied2: +** 
movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_z_tied2, svuint16_t, + z0 = svmls_u16_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_u16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_z_tied3, svuint16_t, + z0 = svmls_u16_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** mls z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** msb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_u16_z_untied, svuint16_t, + z0 = svmls_u16_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svmls_n_u16_z (p0, z0, z1, x0), + z0 = svmls_z (p0, z0, z1, x0)) + +/* +** mls_w0_u16_z_tied2: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u16_z_tied2, svuint16_t, uint16_t, + z0 = svmls_n_u16_z (p0, z1, z0, x0), + z0 = svmls_z (p0, z1, z0, x0)) + +/* +** mls_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mls z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svmls_n_u16_z (p0, z1, z2, x0), + z0 = svmls_z (p0, z1, z2, x0)) + +/* +** mls_11_u16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u16_z_tied1, svuint16_t, + z0 = svmls_n_u16_z (p0, z0, z1, 11), + z0 = svmls_z (p0, z0, z1, 11)) + +/* +** mls_11_u16_z_tied2: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_11_u16_z_tied2, svuint16_t, + z0 = svmls_n_u16_z (p0, z1, z0, 11), + z0 = svmls_z (p0, z1, z0, 11)) + +/* +** mls_11_u16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mls z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_11_u16_z_untied, svuint16_t, + z0 = svmls_n_u16_z (p0, z1, z2, 11), + z0 = svmls_z (p0, z1, z2, 11)) + +/* +** mls_u16_x_tied1: +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_x_tied1, svuint16_t, + z0 = svmls_u16_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_u16_x_tied2: +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_x_tied2, svuint16_t, + z0 = svmls_u16_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_u16_x_tied3: +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_u16_x_tied3, svuint16_t, + z0 = svmls_u16_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_u16_x_untied: +** ( +** movprfx z0, z1 +** mls z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** msb z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0, z3 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_u16_x_untied, svuint16_t, + z0 = svmls_u16_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** 
mls_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svmls_n_u16_x (p0, z0, z1, x0), + z0 = svmls_x (p0, z0, z1, x0)) + +/* +** mls_w0_u16_x_tied2: +** mov (z[0-9]+\.h), w0 +** msb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u16_x_tied2, svuint16_t, uint16_t, + z0 = svmls_n_u16_x (p0, z1, z0, x0), + z0 = svmls_x (p0, z1, z0, x0)) + +/* +** mls_w0_u16_x_untied: +** mov z0\.h, w0 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svmls_n_u16_x (p0, z1, z2, x0), + z0 = svmls_x (p0, z1, z2, x0)) + +/* +** mls_11_u16_x_tied1: +** mov (z[0-9]+\.h), #11 +** mls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u16_x_tied1, svuint16_t, + z0 = svmls_n_u16_x (p0, z0, z1, 11), + z0 = svmls_x (p0, z0, z1, 11)) + +/* +** mls_11_u16_x_tied2: +** mov (z[0-9]+\.h), #11 +** msb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_11_u16_x_tied2, svuint16_t, + z0 = svmls_n_u16_x (p0, z1, z0, 11), + z0 = svmls_x (p0, z1, z0, 11)) + +/* +** mls_11_u16_x_untied: +** mov z0\.h, #11 +** msb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mls_11_u16_x_untied, svuint16_t, + z0 = svmls_n_u16_x (p0, z1, z2, 11), + z0 = svmls_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u32.c new file mode 100644 index 000000000..47e885012 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u32.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_u32_m_tied1: +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_m_tied1, svuint32_t, + z0 = svmls_u32_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_m_tied2, svuint32_t, + z0 = svmls_u32_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_u32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_m_tied3, svuint32_t, + z0 = svmls_u32_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_u32_m_untied: +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_m_untied, svuint32_t, + z0 = svmls_u32_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svmls_n_u32_m (p0, z0, z1, x0), + z0 = svmls_m (p0, z0, z1, x0)) + +/* +** mls_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svmls_n_u32_m (p0, z1, z2, x0), + z0 = svmls_m (p0, z1, z2, x0)) + +/* +** mls_11_u32_m_tied1: +** mov (z[0-9]+\.s), #11 +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u32_m_tied1, svuint32_t, + z0 = svmls_n_u32_m (p0, z0, z1, 11), + z0 = svmls_m (p0, z0, z1, 11)) + +/* +** mls_11_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u32_m_untied, svuint32_t, + z0 = svmls_n_u32_m (p0, z1, z2, 11), + z0 = svmls_m (p0, z1, z2, 11)) + +/* +** 
mls_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_z_tied1, svuint32_t, + z0 = svmls_u32_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_z_tied2, svuint32_t, + z0 = svmls_u32_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_u32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_z_tied3, svuint32_t, + z0 = svmls_u32_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** mls z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** msb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_u32_z_untied, svuint32_t, + z0 = svmls_u32_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svmls_n_u32_z (p0, z0, z1, x0), + z0 = svmls_z (p0, z0, z1, x0)) + +/* +** mls_w0_u32_z_tied2: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u32_z_tied2, svuint32_t, uint32_t, + z0 = svmls_n_u32_z (p0, z1, z0, x0), + z0 = svmls_z (p0, z1, z0, x0)) + +/* +** mls_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mls z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svmls_n_u32_z (p0, z1, z2, x0), + z0 = svmls_z (p0, z1, z2, x0)) + +/* +** mls_11_u32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u32_z_tied1, svuint32_t, + z0 = svmls_n_u32_z (p0, z0, z1, 11), + z0 = svmls_z (p0, z0, z1, 11)) + +/* +** mls_11_u32_z_tied2: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_11_u32_z_tied2, svuint32_t, + z0 = svmls_n_u32_z (p0, z1, z0, 11), + z0 = svmls_z (p0, z1, z0, 11)) + +/* +** mls_11_u32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mls z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_11_u32_z_untied, svuint32_t, + z0 = svmls_n_u32_z (p0, z1, z2, 11), + z0 = svmls_z (p0, z1, z2, 11)) + +/* +** mls_u32_x_tied1: +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_x_tied1, svuint32_t, + z0 = svmls_u32_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_u32_x_tied2: +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_x_tied2, svuint32_t, + z0 = svmls_u32_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_u32_x_tied3: +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_u32_x_tied3, svuint32_t, + z0 = svmls_u32_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_u32_x_untied: +** ( +** movprfx z0, z1 +** mls z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** msb 
z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0, z3 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_u32_x_untied, svuint32_t, + z0 = svmls_u32_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svmls_n_u32_x (p0, z0, z1, x0), + z0 = svmls_x (p0, z0, z1, x0)) + +/* +** mls_w0_u32_x_tied2: +** mov (z[0-9]+\.s), w0 +** msb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u32_x_tied2, svuint32_t, uint32_t, + z0 = svmls_n_u32_x (p0, z1, z0, x0), + z0 = svmls_x (p0, z1, z0, x0)) + +/* +** mls_w0_u32_x_untied: +** mov z0\.s, w0 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svmls_n_u32_x (p0, z1, z2, x0), + z0 = svmls_x (p0, z1, z2, x0)) + +/* +** mls_11_u32_x_tied1: +** mov (z[0-9]+\.s), #11 +** mls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u32_x_tied1, svuint32_t, + z0 = svmls_n_u32_x (p0, z0, z1, 11), + z0 = svmls_x (p0, z0, z1, 11)) + +/* +** mls_11_u32_x_tied2: +** mov (z[0-9]+\.s), #11 +** msb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_11_u32_x_tied2, svuint32_t, + z0 = svmls_n_u32_x (p0, z1, z0, 11), + z0 = svmls_x (p0, z1, z0, 11)) + +/* +** mls_11_u32_x_untied: +** mov z0\.s, #11 +** msb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mls_11_u32_x_untied, svuint32_t, + z0 = svmls_n_u32_x (p0, z1, z2, 11), + z0 = svmls_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u64.c new file mode 100644 index 000000000..4d441b759 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u64.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_u64_m_tied1: +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_u64_m_tied1, svuint64_t, + z0 = svmls_u64_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mls z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_u64_m_tied2, svuint64_t, + z0 = svmls_u64_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_u64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_u64_m_tied3, svuint64_t, + z0 = svmls_u64_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_u64_m_untied: +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (mls_u64_m_untied, svuint64_t, + z0 = svmls_u64_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svmls_n_u64_m (p0, z0, z1, x0), + z0 = svmls_m (p0, z0, z1, x0)) + +/* +** mls_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svmls_n_u64_m (p0, z1, z2, x0), + z0 = svmls_m (p0, z1, z2, x0)) + +/* +** mls_11_u64_m_tied1: +** mov (z[0-9]+\.d), #11 +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u64_m_tied1, svuint64_t, + z0 = svmls_n_u64_m (p0, z0, z1, 11), + z0 = svmls_m (p0, z0, z1, 11)) + +/* +** mls_11_u64_m_untied: { xfail *-*-* } +** mov 
(z[0-9]+\.d), #11 +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u64_m_untied, svuint64_t, + z0 = svmls_n_u64_m (p0, z1, z2, 11), + z0 = svmls_m (p0, z1, z2, 11)) + +/* +** mls_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_u64_z_tied1, svuint64_t, + z0 = svmls_u64_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_u64_z_tied2, svuint64_t, + z0 = svmls_u64_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_u64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_u64_z_tied3, svuint64_t, + z0 = svmls_u64_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** mls z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** msb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_u64_z_untied, svuint64_t, + z0 = svmls_u64_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svmls_n_u64_z (p0, z0, z1, x0), + z0 = svmls_z (p0, z0, z1, x0)) + +/* +** mls_x0_u64_z_tied2: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_u64_z_tied2, svuint64_t, uint64_t, + z0 = svmls_n_u64_z (p0, z1, z0, x0), + z0 = svmls_z (p0, z1, z0, x0)) + +/* +** mls_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mls z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svmls_n_u64_z (p0, z1, z2, x0), + z0 = svmls_z (p0, z1, z2, x0)) + +/* +** mls_11_u64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u64_z_tied1, svuint64_t, + z0 = svmls_n_u64_z (p0, z0, z1, 11), + z0 = svmls_z (p0, z0, z1, 11)) + +/* +** mls_11_u64_z_tied2: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_11_u64_z_tied2, svuint64_t, + z0 = svmls_n_u64_z (p0, z1, z0, 11), + z0 = svmls_z (p0, z1, z0, 11)) + +/* +** mls_11_u64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mls z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_11_u64_z_untied, svuint64_t, + z0 = svmls_n_u64_z (p0, z1, z2, 11), + z0 = svmls_z (p0, z1, z2, 11)) + +/* +** mls_u64_x_tied1: +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mls_u64_x_tied1, svuint64_t, + z0 = svmls_u64_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_u64_x_tied2: +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_u64_x_tied2, svuint64_t, + z0 = svmls_u64_x (p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_u64_x_tied3: +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z 
(mls_u64_x_tied3, svuint64_t, + z0 = svmls_u64_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_u64_x_untied: +** ( +** movprfx z0, z1 +** mls z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** msb z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0, z3 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_u64_x_untied, svuint64_t, + z0 = svmls_u64_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svmls_n_u64_x (p0, z0, z1, x0), + z0 = svmls_x (p0, z0, z1, x0)) + +/* +** mls_x0_u64_x_tied2: +** mov (z[0-9]+\.d), x0 +** msb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_u64_x_tied2, svuint64_t, uint64_t, + z0 = svmls_n_u64_x (p0, z1, z0, x0), + z0 = svmls_x (p0, z1, z0, x0)) + +/* +** mls_x0_u64_x_untied: +** mov z0\.d, x0 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mls_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svmls_n_u64_x (p0, z1, z2, x0), + z0 = svmls_x (p0, z1, z2, x0)) + +/* +** mls_11_u64_x_tied1: +** mov (z[0-9]+\.d), #11 +** mls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u64_x_tied1, svuint64_t, + z0 = svmls_n_u64_x (p0, z0, z1, 11), + z0 = svmls_x (p0, z0, z1, 11)) + +/* +** mls_11_u64_x_tied2: +** mov (z[0-9]+\.d), #11 +** msb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_11_u64_x_tied2, svuint64_t, + z0 = svmls_n_u64_x (p0, z1, z0, 11), + z0 = svmls_x (p0, z1, z0, 11)) + +/* +** mls_11_u64_x_untied: +** mov z0\.d, #11 +** msb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mls_11_u64_x_untied, svuint64_t, + z0 = svmls_n_u64_x (p0, z1, z2, 11), + z0 = svmls_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u8.c new file mode 100644 index 000000000..0489aaa7c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mls_u8.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mls_u8_m_tied1: +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_m_tied1, svuint8_t, + z0 = svmls_u8_m (p0, z0, z1, z2), + z0 = svmls_m (p0, z0, z1, z2)) + +/* +** mls_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.b, p0/m, \1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_m_tied2, svuint8_t, + z0 = svmls_u8_m (p0, z1, z0, z2), + z0 = svmls_m (p0, z1, z0, z2)) + +/* +** mls_u8_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_m_tied3, svuint8_t, + z0 = svmls_u8_m (p0, z1, z2, z0), + z0 = svmls_m (p0, z1, z2, z0)) + +/* +** mls_u8_m_untied: +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, z3\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_m_untied, svuint8_t, + z0 = svmls_u8_m (p0, z1, z2, z3), + z0 = svmls_m (p0, z1, z2, z3)) + +/* +** mls_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svmls_n_u8_m (p0, z0, z1, x0), + z0 = svmls_m (p0, z0, z1, x0)) + +/* +** mls_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svmls_n_u8_m (p0, z1, z2, x0), + z0 = svmls_m (p0, z1, z2, x0)) + +/* +** mls_11_u8_m_tied1: +** mov (z[0-9]+\.b), #11 +** mls 
z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u8_m_tied1, svuint8_t, + z0 = svmls_n_u8_m (p0, z0, z1, 11), + z0 = svmls_m (p0, z0, z1, 11)) + +/* +** mls_11_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u8_m_untied, svuint8_t, + z0 = svmls_n_u8_m (p0, z1, z2, 11), + z0 = svmls_m (p0, z1, z2, 11)) + +/* +** mls_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_z_tied1, svuint8_t, + z0 = svmls_u8_z (p0, z0, z1, z2), + z0 = svmls_z (p0, z0, z1, z2)) + +/* +** mls_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_z_tied2, svuint8_t, + z0 = svmls_u8_z (p0, z1, z0, z2), + z0 = svmls_z (p0, z1, z0, z2)) + +/* +** mls_u8_z_tied3: +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_z_tied3, svuint8_t, + z0 = svmls_u8_z (p0, z1, z2, z0), + z0 = svmls_z (p0, z1, z2, z0)) + +/* +** mls_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** mls z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, z3\.b, z1\.b +** | +** movprfx z0\.b, p0/z, z3\.b +** msb z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_u8_z_untied, svuint8_t, + z0 = svmls_u8_z (p0, z1, z2, z3), + z0 = svmls_z (p0, z1, z2, z3)) + +/* +** mls_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svmls_n_u8_z (p0, z0, z1, x0), + z0 = svmls_z (p0, z0, z1, x0)) + +/* +** mls_w0_u8_z_tied2: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u8_z_tied2, svuint8_t, uint8_t, + z0 = svmls_n_u8_z (p0, z1, z0, x0), + z0 = svmls_z (p0, z1, z0, x0)) + +/* +** mls_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mls z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, \1, z1\.b +** | +** movprfx z0\.b, p0/z, \1 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svmls_n_u8_z (p0, z1, z2, x0), + z0 = svmls_z (p0, z1, z2, x0)) + +/* +** mls_11_u8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u8_z_tied1, svuint8_t, + z0 = svmls_n_u8_z (p0, z0, z1, 11), + z0 = svmls_z (p0, z0, z1, 11)) + +/* +** mls_11_u8_z_tied2: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_11_u8_z_tied2, svuint8_t, + z0 = svmls_n_u8_z (p0, z1, z0, 11), + z0 = svmls_z (p0, z1, z0, 11)) + +/* +** mls_11_u8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mls z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, \1, z1\.b +** | +** movprfx z0\.b, p0/z, \1 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_11_u8_z_untied, svuint8_t, + z0 = svmls_n_u8_z (p0, z1, z2, 11), + z0 = svmls_z (p0, z1, z2, 11)) + +/* +** mls_u8_x_tied1: +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_x_tied1, svuint8_t, + z0 = svmls_u8_x (p0, z0, z1, z2), + z0 = svmls_x (p0, z0, z1, z2)) + +/* +** mls_u8_x_tied2: +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_x_tied2, svuint8_t, + z0 = svmls_u8_x 
(p0, z1, z0, z2), + z0 = svmls_x (p0, z1, z0, z2)) + +/* +** mls_u8_x_tied3: +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_u8_x_tied3, svuint8_t, + z0 = svmls_u8_x (p0, z1, z2, z0), + z0 = svmls_x (p0, z1, z2, z0)) + +/* +** mls_u8_x_untied: +** ( +** movprfx z0, z1 +** mls z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0, z2 +** msb z0\.b, p0/m, z3\.b, z1\.b +** | +** movprfx z0, z3 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mls_u8_x_untied, svuint8_t, + z0 = svmls_u8_x (p0, z1, z2, z3), + z0 = svmls_x (p0, z1, z2, z3)) + +/* +** mls_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svmls_n_u8_x (p0, z0, z1, x0), + z0 = svmls_x (p0, z0, z1, x0)) + +/* +** mls_w0_u8_x_tied2: +** mov (z[0-9]+\.b), w0 +** msb z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u8_x_tied2, svuint8_t, uint8_t, + z0 = svmls_n_u8_x (p0, z1, z0, x0), + z0 = svmls_x (p0, z1, z0, x0)) + +/* +** mls_w0_u8_x_untied: +** mov z0\.b, w0 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mls_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svmls_n_u8_x (p0, z1, z2, x0), + z0 = svmls_x (p0, z1, z2, x0)) + +/* +** mls_11_u8_x_tied1: +** mov (z[0-9]+\.b), #11 +** mls z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mls_11_u8_x_tied1, svuint8_t, + z0 = svmls_n_u8_x (p0, z0, z1, 11), + z0 = svmls_x (p0, z0, z1, 11)) + +/* +** mls_11_u8_x_tied2: +** mov (z[0-9]+\.b), #11 +** msb z0\.b, p0/m, \1, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_11_u8_x_tied2, svuint8_t, + z0 = svmls_n_u8_x (p0, z1, z0, 11), + z0 = svmls_x (p0, z1, z0, 11)) + +/* +** mls_11_u8_x_untied: +** mov z0\.b, #11 +** msb z0\.b, p0/m, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mls_11_u8_x_untied, svuint8_t, + z0 = svmls_n_u8_x (p0, z1, z2, 11), + z0 = svmls_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f32.c new file mode 100644 index 000000000..f66dbf397 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f32.c @@ -0,0 +1,46 @@ +/* { dg-require-effective-target aarch64_asm_f32mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f32mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mmla_f32_tied1: +** fmmla z0\.s, z4\.s, z5\.s +** ret +*/ +TEST_DUAL_Z (mmla_f32_tied1, svfloat32_t, svfloat32_t, + z0 = svmmla_f32 (z0, z4, z5), + z0 = svmmla (z0, z4, z5)) + +/* +** mmla_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fmmla z0\.s, \1\.s, z1\.s +** ret +*/ +TEST_DUAL_Z_REV (mmla_f32_tied2, svfloat32_t, svfloat32_t, + z0_res = svmmla_f32 (z4, z0, z1), + z0_res = svmmla (z4, z0, z1)) + +/* +** mmla_f32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fmmla z0\.s, z1\.s, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (mmla_f32_tied3, svfloat32_t, svfloat32_t, + z0_res = svmmla_f32 (z4, z1, z0), + z0_res = svmmla (z4, z1, z0)) + +/* +** mmla_f32_untied: +** movprfx z0, z1 +** fmmla z0\.s, z4\.s, z5\.s +** ret +*/ +TEST_DUAL_Z (mmla_f32_untied, svfloat32_t, svfloat32_t, + z0 = svmmla_f32 (z1, z4, z5), + z0 = svmmla (z1, z4, z5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f64.c new file mode 100644 index 000000000..49dc0607c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_f64.c @@ -0,0 +1,46 @@ +/* { 
dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mmla_f64_tied1: +** fmmla z0\.d, z4\.d, z5\.d +** ret +*/ +TEST_DUAL_Z (mmla_f64_tied1, svfloat64_t, svfloat64_t, + z0 = svmmla_f64 (z0, z4, z5), + z0 = svmmla (z0, z4, z5)) + +/* +** mmla_f64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fmmla z0\.d, \1, z1\.d +** ret +*/ +TEST_DUAL_Z_REV (mmla_f64_tied2, svfloat64_t, svfloat64_t, + z0_res = svmmla_f64 (z4, z0, z1), + z0_res = svmmla (z4, z0, z1)) + +/* +** mmla_f64_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fmmla z0\.d, z1\.d, \1 +** ret +*/ +TEST_DUAL_Z_REV (mmla_f64_tied3, svfloat64_t, svfloat64_t, + z0_res = svmmla_f64 (z4, z1, z0), + z0_res = svmmla (z4, z1, z0)) + +/* +** mmla_f64_untied: +** movprfx z0, z1 +** fmmla z0\.d, z4\.d, z5\.d +** ret +*/ +TEST_DUAL_Z (mmla_f64_untied, svfloat64_t, svfloat64_t, + z0 = svmmla_f64 (z1, z4, z5), + z0 = svmmla (z1, z4, z5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_s32.c new file mode 100644 index 000000000..e7ce009ac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_s32.c @@ -0,0 +1,46 @@ +/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mmla_s32_tied1: +** smmla z0\.s, z4\.b, z5\.b +** ret +*/ +TEST_DUAL_Z (mmla_s32_tied1, svint32_t, svint8_t, + z0 = svmmla_s32 (z0, z4, z5), + z0 = svmmla (z0, z4, z5)) + +/* +** mmla_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** smmla z0\.s, \1\.b, z1\.b +** ret +*/ +TEST_DUAL_Z_REV (mmla_s32_tied2, svint32_t, svint8_t, + z0_res = svmmla_s32 (z4, z0, z1), + z0_res = svmmla (z4, z0, z1)) + +/* +** mmla_s32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** smmla z0\.s, z1\.b, \1\.b +** ret +*/ +TEST_DUAL_Z_REV (mmla_s32_tied3, svint32_t, svint8_t, + z0_res = svmmla_s32 (z4, z1, z0), + z0_res = svmmla (z4, z1, z0)) + +/* +** mmla_s32_untied: +** movprfx z0, z1 +** smmla z0\.s, z4\.b, z5\.b +** ret +*/ +TEST_DUAL_Z (mmla_s32_untied, svint32_t, svint8_t, + z0 = svmmla_s32 (z1, z4, z5), + z0 = svmmla (z1, z4, z5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_u32.c new file mode 100644 index 000000000..81f5166fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mmla_u32.c @@ -0,0 +1,46 @@ +/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mmla_u32_tied1: +** ummla z0\.s, z4\.b, z5\.b +** ret +*/ +TEST_DUAL_Z (mmla_u32_tied1, svuint32_t, svuint8_t, + z0 = svmmla_u32 (z0, z4, z5), + z0 = svmmla (z0, z4, z5)) + +/* +** mmla_u32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** ummla z0\.s, \1\.b, z1\.b +** ret +*/ +TEST_DUAL_Z_REV (mmla_u32_tied2, svuint32_t, svuint8_t, + z0_res = svmmla_u32 (z4, z0, z1), + z0_res = svmmla (z4, z0, z1)) + +/* +** mmla_u32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** ummla z0\.s, z1\.b, \1\.b +** ret +*/ +TEST_DUAL_Z_REV (mmla_u32_tied3, svuint32_t, svuint8_t, + z0_res = svmmla_u32 (z4, z1, z0), + z0_res = 
svmmla (z4, z1, z0)) + +/* +** mmla_u32_untied: +** movprfx z0, z1 +** ummla z0\.s, z4\.b, z5\.b +** ret +*/ +TEST_DUAL_Z (mmla_u32_untied, svuint32_t, svuint8_t, + z0 = svmmla_u32 (z1, z4, z5), + z0 = svmmla (z1, z4, z5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mov_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mov_b.c new file mode 100644 index 000000000..6b78f348f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mov_b.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mov_b_z_tied1: +** and p0\.b, (?:p3/z, p0\.b, p0\.b|p0/z, p3\.b, p3\.b) +** ret +*/ +TEST_UNIFORM_P (mov_b_z_tied1, + p0 = svmov_b_z (p3, p0), + p0 = svmov_z (p3, p0)) + +/* +** mov_b_z_untied: +** and p0\.b, (?:p3/z, p1\.b, p1\.b|p1/z, p3\.b, p3\.b) +** ret +*/ +TEST_UNIFORM_P (mov_b_z_untied, + p0 = svmov_b_z (p3, p1), + p0 = svmov_z (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f16.c new file mode 100644 index 000000000..fe11457c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f16.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_f16_m_tied1: +** fmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_m_tied1, svfloat16_t, + z0 = svmsb_f16_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmsb z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_m_tied2, svfloat16_t, + z0 = svmsb_f16_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmsb z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_m_tied3, svfloat16_t, + z0 = svmsb_f16_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_f16_m_untied: +** movprfx z0, z1 +** fmsb z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_m_untied, svfloat16_t, + z0 = svmsb_f16_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmsb_n_f16_m (p0, z0, z1, d4), + z0 = svmsb_m (p0, z0, z1, d4)) + +/* +** msb_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fmsb z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmsb_n_f16_m (p0, z1, z2, d4), + z0 = svmsb_m (p0, z1, z2, d4)) + +/* +** msb_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f16_m_tied1, svfloat16_t, + z0 = svmsb_n_f16_m (p0, z0, z1, 2), + z0 = svmsb_m (p0, z0, z1, 2)) + +/* +** msb_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmsb z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f16_m_untied, svfloat16_t, + z0 = svmsb_n_f16_m (p0, z1, z2, 2), + z0 = svmsb_m (p0, z1, z2, 2)) + +/* +** msb_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_z_tied1, svfloat16_t, + z0 = svmsb_f16_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_z_tied2, svfloat16_t, + z0 = svmsb_f16_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_f16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_z_tied3, svfloat16_t, + z0 = svmsb_f16_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmsb z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmsb z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_f16_z_untied, svfloat16_t, + z0 = svmsb_f16_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmsb_n_f16_z (p0, z0, z1, d4), + z0 = svmsb_z (p0, z0, z1, d4)) + +/* +** msb_h4_f16_z_tied2: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_h4_f16_z_tied2, svfloat16_t, __fp16, + z0 = svmsb_n_f16_z (p0, z1, z0, d4), + z0 = svmsb_z (p0, z1, z0, d4)) + +/* +** msb_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmsb z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fmsb z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (msb_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmsb_n_f16_z (p0, z1, z2, d4), + z0 = svmsb_z (p0, z1, z2, d4)) + +/* +** msb_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f16_z_tied1, svfloat16_t, + z0 = svmsb_n_f16_z (p0, z0, z1, 2), + z0 = svmsb_z (p0, z0, z1, 2)) + +/* +** msb_2_f16_z_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f16_z_tied2, svfloat16_t, + z0 = svmsb_n_f16_z (p0, z1, z0, 2), + z0 = svmsb_z (p0, z1, z0, 2)) + +/* +** msb_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fmsb z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fmsb z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_2_f16_z_untied, svfloat16_t, + z0 = svmsb_n_f16_z (p0, z1, z2, 2), + z0 = svmsb_z (p0, z1, z2, 2)) + +/* +** msb_f16_x_tied1: +** fmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_x_tied1, svfloat16_t, + z0 = svmsb_f16_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_f16_x_tied2: +** fmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_x_tied2, svfloat16_t, + z0 = svmsb_f16_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_f16_x_tied3: +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_f16_x_tied3, svfloat16_t, + z0 = svmsb_f16_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_f16_x_untied: +** ( +** movprfx z0, z1 +** fmsb z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** fmsb z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0, z3 +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_f16_x_untied, svfloat16_t, + z0 = svmsb_f16_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmsb_n_f16_x (p0, z0, z1, d4), + z0 = svmsb_x (p0, z0, z1, d4)) + +/* +** msb_h4_f16_x_tied2: +** mov (z[0-9]+\.h), h4 +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_h4_f16_x_tied2, svfloat16_t, __fp16, + z0 = svmsb_n_f16_x (p0, z1, z0, d4), + z0 = svmsb_x (p0, z1, z0, d4)) + +/* +** msb_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_ZD (msb_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmsb_n_f16_x (p0, z1, z2, d4), + z0 = svmsb_x (p0, z1, z2, d4)) + +/* +** msb_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f16_x_tied1, svfloat16_t, + z0 = svmsb_n_f16_x (p0, z0, z1, 2), + z0 = svmsb_x (p0, z0, z1, 2)) + +/* +** msb_2_f16_x_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f16_x_tied2, svfloat16_t, + z0 = svmsb_n_f16_x (p0, z1, z0, 2), + z0 = svmsb_x (p0, z1, z0, 2)) + +/* +** msb_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_2_f16_x_untied, svfloat16_t, + z0 = svmsb_n_f16_x (p0, z1, z2, 2), + z0 = svmsb_x (p0, z1, z2, 2)) + +/* +** ptrue_msb_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f16_x_tied1, svfloat16_t, + z0 = svmsb_f16_x (svptrue_b16 (), z0, z1, z2), + z0 = svmsb_x (svptrue_b16 (), z0, z1, z2)) + +/* +** ptrue_msb_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f16_x_tied2, svfloat16_t, + z0 = svmsb_f16_x (svptrue_b16 (), z1, z0, z2), + z0 = svmsb_x (svptrue_b16 (), z1, z0, z2)) + +/* +** ptrue_msb_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f16_x_tied3, svfloat16_t, + z0 = svmsb_f16_x (svptrue_b16 (), z1, z2, z0), + z0 = svmsb_x (svptrue_b16 (), z1, z2, z0)) + +/* +** ptrue_msb_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f16_x_untied, svfloat16_t, + z0 = svmsb_f16_x (svptrue_b16 (), z1, z2, z3), + z0 = svmsb_x (svptrue_b16 (), z1, z2, z3)) + +/* +** ptrue_msb_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f16_x_tied1, svfloat16_t, + z0 = svmsb_n_f16_x (svptrue_b16 (), z0, z1, 2), + z0 = svmsb_x (svptrue_b16 (), z0, z1, 2)) + +/* +** ptrue_msb_2_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f16_x_tied2, svfloat16_t, + z0 = svmsb_n_f16_x (svptrue_b16 (), z1, z0, 2), + z0 = svmsb_x (svptrue_b16 (), z1, z0, 2)) + +/* +** ptrue_msb_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f16_x_untied, svfloat16_t, + z0 = svmsb_n_f16_x (svptrue_b16 (), z1, z2, 2), + z0 = svmsb_x (svptrue_b16 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f32.c new file mode 100644 index 000000000..f7a9f2767 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f32.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_f32_m_tied1: +** fmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_m_tied1, svfloat32_t, + z0 = svmsb_f32_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmsb z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_m_tied2, svfloat32_t, + z0 = svmsb_f32_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmsb z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_m_tied3, svfloat32_t, + z0 = svmsb_f32_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_f32_m_untied: +** movprfx z0, z1 +** fmsb z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_m_untied, svfloat32_t, + z0 = svmsb_f32_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmsb_n_f32_m (p0, z0, z1, d4), + z0 = svmsb_m (p0, z0, z1, d4)) + +/* +** msb_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmsb z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_s4_f32_m_untied, svfloat32_t, float, + z0 = svmsb_n_f32_m (p0, z1, z2, d4), + z0 = svmsb_m (p0, z1, z2, d4)) + +/* +** msb_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f32_m_tied1, svfloat32_t, + z0 = svmsb_n_f32_m (p0, z0, z1, 2), + z0 = svmsb_m (p0, z0, z1, 2)) + +/* +** msb_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmsb z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f32_m_untied, svfloat32_t, + z0 = svmsb_n_f32_m (p0, z1, z2, 2), + z0 = svmsb_m (p0, z1, z2, 2)) + +/* +** msb_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_z_tied1, svfloat32_t, + z0 = svmsb_f32_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_z_tied2, svfloat32_t, + z0 = svmsb_f32_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_f32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_z_tied3, svfloat32_t, + z0 = svmsb_f32_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmsb z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmsb z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_f32_z_untied, svfloat32_t, + z0 = svmsb_f32_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmsb_n_f32_z (p0, z0, z1, d4), + z0 = svmsb_z (p0, z0, z1, d4)) + +/* +** msb_s4_f32_z_tied2: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_s4_f32_z_tied2, svfloat32_t, float, + z0 = svmsb_n_f32_z (p0, z1, z0, d4), + z0 = svmsb_z (p0, z1, z0, d4)) + +/* +** msb_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmsb z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fmsb z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (msb_s4_f32_z_untied, svfloat32_t, float, + z0 = svmsb_n_f32_z (p0, z1, z2, d4), + z0 = svmsb_z (p0, z1, z2, d4)) + +/* +** msb_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f32_z_tied1, svfloat32_t, + z0 = svmsb_n_f32_z (p0, z0, z1, 2), + z0 = svmsb_z (p0, z0, z1, 2)) + +/* +** msb_2_f32_z_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f32_z_tied2, svfloat32_t, + z0 = svmsb_n_f32_z (p0, z1, z0, 2), + z0 = svmsb_z (p0, z1, z0, 2)) + +/* +** msb_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fmsb z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fmsb z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_2_f32_z_untied, svfloat32_t, + z0 = svmsb_n_f32_z (p0, z1, z2, 2), + z0 = svmsb_z (p0, z1, z2, 2)) + +/* +** msb_f32_x_tied1: +** fmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_x_tied1, svfloat32_t, + z0 = svmsb_f32_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_f32_x_tied2: +** fmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_x_tied2, svfloat32_t, + z0 = svmsb_f32_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_f32_x_tied3: +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_f32_x_tied3, svfloat32_t, + z0 = svmsb_f32_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_f32_x_untied: +** ( +** movprfx z0, z1 +** fmsb z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** fmsb z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0, z3 +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_f32_x_untied, svfloat32_t, + z0 = svmsb_f32_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmsb_n_f32_x (p0, z0, z1, d4), + z0 = svmsb_x (p0, z0, z1, d4)) + +/* +** msb_s4_f32_x_tied2: +** mov (z[0-9]+\.s), s4 +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_s4_f32_x_tied2, svfloat32_t, float, + z0 = svmsb_n_f32_x (p0, z1, z0, d4), + z0 = svmsb_x (p0, z1, z0, d4)) + +/* +** msb_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_ZD (msb_s4_f32_x_untied, svfloat32_t, float, + z0 = svmsb_n_f32_x (p0, z1, z2, d4), + z0 = svmsb_x (p0, z1, z2, d4)) + +/* +** msb_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f32_x_tied1, svfloat32_t, + z0 = svmsb_n_f32_x (p0, z0, z1, 2), + z0 = svmsb_x (p0, z0, z1, 2)) + +/* +** msb_2_f32_x_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f32_x_tied2, svfloat32_t, + z0 = svmsb_n_f32_x (p0, z1, z0, 2), + z0 = svmsb_x (p0, z1, z0, 2)) + +/* +** msb_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_2_f32_x_untied, svfloat32_t, + z0 = svmsb_n_f32_x (p0, z1, z2, 2), + z0 = svmsb_x (p0, z1, z2, 2)) + +/* +** ptrue_msb_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f32_x_tied1, svfloat32_t, + z0 = svmsb_f32_x (svptrue_b32 (), z0, z1, z2), + z0 = svmsb_x (svptrue_b32 (), z0, z1, z2)) + +/* +** ptrue_msb_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f32_x_tied2, svfloat32_t, + z0 = svmsb_f32_x (svptrue_b32 (), z1, z0, z2), + z0 = svmsb_x (svptrue_b32 (), z1, z0, z2)) + +/* +** ptrue_msb_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f32_x_tied3, svfloat32_t, + z0 = svmsb_f32_x (svptrue_b32 (), z1, z2, z0), + z0 = svmsb_x (svptrue_b32 (), z1, z2, z0)) + +/* +** ptrue_msb_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f32_x_untied, svfloat32_t, + z0 = svmsb_f32_x (svptrue_b32 (), z1, z2, z3), + z0 = svmsb_x (svptrue_b32 (), z1, z2, z3)) + +/* +** ptrue_msb_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f32_x_tied1, svfloat32_t, + z0 = svmsb_n_f32_x (svptrue_b32 (), z0, z1, 2), + z0 = svmsb_x (svptrue_b32 (), z0, z1, 2)) + +/* +** ptrue_msb_2_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f32_x_tied2, svfloat32_t, + z0 = svmsb_n_f32_x (svptrue_b32 (), z1, z0, 2), + z0 = svmsb_x (svptrue_b32 (), z1, z0, 2)) + +/* +** ptrue_msb_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f32_x_untied, svfloat32_t, + z0 = svmsb_n_f32_x (svptrue_b32 (), z1, z2, 2), + z0 = svmsb_x (svptrue_b32 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f64.c new file mode 100644 index 000000000..e3ff414d8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_f64.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_f64_m_tied1: +** fmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_m_tied1, svfloat64_t, + z0 = svmsb_f64_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmsb z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_m_tied2, svfloat64_t, + z0 = svmsb_f64_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmsb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_f64_m_tied3, svfloat64_t, + z0 = svmsb_f64_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_f64_m_untied: +** movprfx z0, z1 +** fmsb z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_m_untied, svfloat64_t, + z0 = svmsb_f64_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmsb_n_f64_m (p0, z0, z1, d4), + z0 = svmsb_m (p0, z0, z1, d4)) + +/* +** msb_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmsb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_d4_f64_m_untied, svfloat64_t, double, + z0 = svmsb_n_f64_m (p0, z1, z2, d4), + z0 = svmsb_m (p0, z1, z2, d4)) + +/* +** msb_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f64_m_tied1, svfloat64_t, + z0 = svmsb_n_f64_m (p0, z0, z1, 2), + z0 = svmsb_m (p0, z0, z1, 2)) + +/* +** msb_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmsb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f64_m_untied, svfloat64_t, + z0 = svmsb_n_f64_m (p0, z1, z2, 2), + z0 = svmsb_m (p0, z1, z2, 2)) + +/* +** msb_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_z_tied1, svfloat64_t, + z0 = svmsb_f64_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_z_tied2, svfloat64_t, + z0 = svmsb_f64_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_f64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_z_tied3, svfloat64_t, + z0 = svmsb_f64_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmsb z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmsb z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_f64_z_untied, svfloat64_t, + z0 = svmsb_f64_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmsb_n_f64_z (p0, z0, z1, d4), + z0 = svmsb_z (p0, z0, z1, d4)) + +/* +** msb_d4_f64_z_tied2: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_d4_f64_z_tied2, svfloat64_t, double, + z0 = svmsb_n_f64_z (p0, z1, z0, d4), + z0 = svmsb_z (p0, z1, z0, d4)) + +/* +** msb_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmsb z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fmsb z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (msb_d4_f64_z_untied, svfloat64_t, double, + z0 = svmsb_n_f64_z (p0, z1, z2, d4), + z0 = svmsb_z (p0, z1, z2, d4)) + +/* +** msb_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f64_z_tied1, svfloat64_t, + z0 = svmsb_n_f64_z (p0, z0, z1, 2), + z0 = svmsb_z (p0, z0, z1, 2)) + +/* +** msb_2_f64_z_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f64_z_tied2, svfloat64_t, + z0 = svmsb_n_f64_z (p0, z1, z0, 2), + z0 = svmsb_z (p0, z1, z0, 2)) + +/* +** msb_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fmsb z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fmsb z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_2_f64_z_untied, svfloat64_t, + z0 = svmsb_n_f64_z (p0, z1, z2, 2), + z0 = svmsb_z (p0, z1, z2, 2)) + +/* +** msb_f64_x_tied1: +** fmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_x_tied1, svfloat64_t, + z0 = svmsb_f64_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_f64_x_tied2: +** fmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_x_tied2, svfloat64_t, + z0 = svmsb_f64_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_f64_x_tied3: +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_f64_x_tied3, svfloat64_t, + z0 = svmsb_f64_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_f64_x_untied: +** ( +** movprfx z0, z1 +** fmsb z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** fmsb z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0, z3 +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_f64_x_untied, svfloat64_t, + z0 = svmsb_f64_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmsb_n_f64_x (p0, z0, z1, d4), + z0 = svmsb_x (p0, z0, z1, d4)) + +/* +** msb_d4_f64_x_tied2: +** mov (z[0-9]+\.d), d4 +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (msb_d4_f64_x_tied2, svfloat64_t, double, + z0 = svmsb_n_f64_x (p0, z1, z0, d4), + z0 = svmsb_x (p0, z1, z0, d4)) + +/* +** msb_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_ZD (msb_d4_f64_x_untied, svfloat64_t, double, + z0 = svmsb_n_f64_x (p0, z1, z2, d4), + z0 = svmsb_x (p0, z1, z2, d4)) + +/* +** msb_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f64_x_tied1, svfloat64_t, + z0 = svmsb_n_f64_x (p0, z0, z1, 2), + z0 = svmsb_x (p0, z0, z1, 2)) + +/* +** msb_2_f64_x_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_2_f64_x_tied2, svfloat64_t, + z0 = svmsb_n_f64_x (p0, z1, z0, 2), + z0 = svmsb_x (p0, z1, z0, 2)) + +/* +** msb_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_2_f64_x_untied, svfloat64_t, + z0 = svmsb_n_f64_x (p0, z1, z2, 2), + z0 = svmsb_x (p0, z1, z2, 2)) + +/* +** ptrue_msb_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f64_x_tied1, svfloat64_t, + z0 = svmsb_f64_x (svptrue_b64 (), z0, z1, z2), + z0 = svmsb_x (svptrue_b64 (), z0, z1, z2)) + +/* +** ptrue_msb_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f64_x_tied2, svfloat64_t, + z0 = svmsb_f64_x (svptrue_b64 (), z1, z0, z2), + z0 = svmsb_x (svptrue_b64 (), z1, z0, z2)) + +/* +** ptrue_msb_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f64_x_tied3, svfloat64_t, + z0 = svmsb_f64_x (svptrue_b64 (), z1, z2, z0), + z0 = svmsb_x (svptrue_b64 (), z1, z2, z0)) + +/* +** ptrue_msb_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_f64_x_untied, svfloat64_t, + z0 = svmsb_f64_x (svptrue_b64 (), z1, z2, z3), + z0 = svmsb_x (svptrue_b64 (), z1, z2, z3)) + +/* +** ptrue_msb_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f64_x_tied1, svfloat64_t, + z0 = svmsb_n_f64_x (svptrue_b64 (), z0, z1, 2), + z0 = svmsb_x (svptrue_b64 (), z0, z1, 2)) + +/* +** ptrue_msb_2_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f64_x_tied2, svfloat64_t, + z0 = svmsb_n_f64_x (svptrue_b64 (), z1, z0, 2), + z0 = svmsb_x (svptrue_b64 (), z1, z0, 2)) + +/* +** ptrue_msb_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_msb_2_f64_x_untied, svfloat64_t, + z0 = svmsb_n_f64_x (svptrue_b64 (), z1, z2, 2), + z0 = svmsb_x (svptrue_b64 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s16.c new file mode 100644 index 000000000..56347cfb9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s16.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_s16_m_tied1: +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_m_tied1, svint16_t, + z0 = svmsb_s16_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_m_tied2, svint16_t, + z0 = svmsb_s16_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_s16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_m_tied3, svint16_t, + z0 = svmsb_s16_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_s16_m_untied: +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_m_untied, svint16_t, + z0 = svmsb_s16_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svmsb_n_s16_m (p0, z0, z1, x0), + z0 = svmsb_m (p0, z0, z1, x0)) + +/* +** msb_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s16_m_untied, svint16_t, int16_t, + z0 = svmsb_n_s16_m (p0, z1, z2, x0), + z0 = svmsb_m (p0, z1, z2, x0)) + +/* +** msb_11_s16_m_tied1: +** mov (z[0-9]+\.h), #11 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s16_m_tied1, svint16_t, + z0 = svmsb_n_s16_m (p0, z0, z1, 11), + z0 = svmsb_m (p0, z0, z1, 11)) + +/* +** msb_11_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s16_m_untied, svint16_t, + z0 = svmsb_n_s16_m (p0, z1, z2, 11), + z0 = svmsb_m (p0, z1, z2, 11)) + +/* +** msb_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_z_tied1, svint16_t, + z0 = svmsb_s16_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_z_tied2, svint16_t, + z0 = svmsb_s16_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_s16_z_tied3: +** 
movprfx z0\.h, p0/z, z0\.h +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_z_tied3, svint16_t, + z0 = svmsb_s16_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** msb z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** mls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_s16_z_untied, svint16_t, + z0 = svmsb_s16_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svmsb_n_s16_z (p0, z0, z1, x0), + z0 = svmsb_z (p0, z0, z1, x0)) + +/* +** msb_w0_s16_z_tied2: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s16_z_tied2, svint16_t, int16_t, + z0 = svmsb_n_s16_z (p0, z1, z0, x0), + z0 = svmsb_z (p0, z1, z0, x0)) + +/* +** msb_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** msb z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s16_z_untied, svint16_t, int16_t, + z0 = svmsb_n_s16_z (p0, z1, z2, x0), + z0 = svmsb_z (p0, z1, z2, x0)) + +/* +** msb_11_s16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s16_z_tied1, svint16_t, + z0 = svmsb_n_s16_z (p0, z0, z1, 11), + z0 = svmsb_z (p0, z0, z1, 11)) + +/* +** msb_11_s16_z_tied2: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s16_z_tied2, svint16_t, + z0 = svmsb_n_s16_z (p0, z1, z0, 11), + z0 = svmsb_z (p0, z1, z0, 11)) + +/* +** msb_11_s16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** msb z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_11_s16_z_untied, svint16_t, + z0 = svmsb_n_s16_z (p0, z1, z2, 11), + z0 = svmsb_z (p0, z1, z2, 11)) + +/* +** msb_s16_x_tied1: +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_x_tied1, svint16_t, + z0 = svmsb_s16_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_s16_x_tied2: +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_x_tied2, svint16_t, + z0 = svmsb_s16_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_s16_x_tied3: +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_s16_x_tied3, svint16_t, + z0 = svmsb_s16_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_s16_x_untied: +** ( +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** msb z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0, z3 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_s16_x_untied, svint16_t, + z0 = svmsb_s16_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svmsb_n_s16_x (p0, z0, z1, x0), + z0 = svmsb_x (p0, z0, z1, x0)) + +/* +** msb_w0_s16_x_tied2: +** 
mov (z[0-9]+\.h), w0 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s16_x_tied2, svint16_t, int16_t, + z0 = svmsb_n_s16_x (p0, z1, z0, x0), + z0 = svmsb_x (p0, z1, z0, x0)) + +/* +** msb_w0_s16_x_untied: +** mov z0\.h, w0 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s16_x_untied, svint16_t, int16_t, + z0 = svmsb_n_s16_x (p0, z1, z2, x0), + z0 = svmsb_x (p0, z1, z2, x0)) + +/* +** msb_11_s16_x_tied1: +** mov (z[0-9]+\.h), #11 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s16_x_tied1, svint16_t, + z0 = svmsb_n_s16_x (p0, z0, z1, 11), + z0 = svmsb_x (p0, z0, z1, 11)) + +/* +** msb_11_s16_x_tied2: +** mov (z[0-9]+\.h), #11 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s16_x_tied2, svint16_t, + z0 = svmsb_n_s16_x (p0, z1, z0, 11), + z0 = svmsb_x (p0, z1, z0, 11)) + +/* +** msb_11_s16_x_untied: +** mov z0\.h, #11 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_11_s16_x_untied, svint16_t, + z0 = svmsb_n_s16_x (p0, z1, z2, 11), + z0 = svmsb_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s32.c new file mode 100644 index 000000000..fb7a7815b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s32.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_s32_m_tied1: +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_m_tied1, svint32_t, + z0 = svmsb_s32_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_m_tied2, svint32_t, + z0 = svmsb_s32_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_s32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_m_tied3, svint32_t, + z0 = svmsb_s32_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_s32_m_untied: +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_m_untied, svint32_t, + z0 = svmsb_s32_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svmsb_n_s32_m (p0, z0, z1, x0), + z0 = svmsb_m (p0, z0, z1, x0)) + +/* +** msb_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s32_m_untied, svint32_t, int32_t, + z0 = svmsb_n_s32_m (p0, z1, z2, x0), + z0 = svmsb_m (p0, z1, z2, x0)) + +/* +** msb_11_s32_m_tied1: +** mov (z[0-9]+\.s), #11 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s32_m_tied1, svint32_t, + z0 = svmsb_n_s32_m (p0, z0, z1, 11), + z0 = svmsb_m (p0, z0, z1, 11)) + +/* +** msb_11_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s32_m_untied, svint32_t, + z0 = svmsb_n_s32_m (p0, z1, z2, 11), + z0 = svmsb_m (p0, z1, z2, 11)) + +/* +** msb_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_z_tied1, svint32_t, + z0 = svmsb_s32_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, 
p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_z_tied2, svint32_t, + z0 = svmsb_s32_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_s32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_z_tied3, svint32_t, + z0 = svmsb_s32_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** msb z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** mls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_s32_z_untied, svint32_t, + z0 = svmsb_s32_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svmsb_n_s32_z (p0, z0, z1, x0), + z0 = svmsb_z (p0, z0, z1, x0)) + +/* +** msb_w0_s32_z_tied2: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s32_z_tied2, svint32_t, int32_t, + z0 = svmsb_n_s32_z (p0, z1, z0, x0), + z0 = svmsb_z (p0, z1, z0, x0)) + +/* +** msb_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** msb z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s32_z_untied, svint32_t, int32_t, + z0 = svmsb_n_s32_z (p0, z1, z2, x0), + z0 = svmsb_z (p0, z1, z2, x0)) + +/* +** msb_11_s32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s32_z_tied1, svint32_t, + z0 = svmsb_n_s32_z (p0, z0, z1, 11), + z0 = svmsb_z (p0, z0, z1, 11)) + +/* +** msb_11_s32_z_tied2: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s32_z_tied2, svint32_t, + z0 = svmsb_n_s32_z (p0, z1, z0, 11), + z0 = svmsb_z (p0, z1, z0, 11)) + +/* +** msb_11_s32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** msb z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_11_s32_z_untied, svint32_t, + z0 = svmsb_n_s32_z (p0, z1, z2, 11), + z0 = svmsb_z (p0, z1, z2, 11)) + +/* +** msb_s32_x_tied1: +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_x_tied1, svint32_t, + z0 = svmsb_s32_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_s32_x_tied2: +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_x_tied2, svint32_t, + z0 = svmsb_s32_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_s32_x_tied3: +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_s32_x_tied3, svint32_t, + z0 = svmsb_s32_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_s32_x_untied: +** ( +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** msb z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0, z3 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_s32_x_untied, svint32_t, + z0 = svmsb_s32_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** msb z0\.s, p0/m, z1\.s, 
\1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svmsb_n_s32_x (p0, z0, z1, x0), + z0 = svmsb_x (p0, z0, z1, x0)) + +/* +** msb_w0_s32_x_tied2: +** mov (z[0-9]+\.s), w0 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s32_x_tied2, svint32_t, int32_t, + z0 = svmsb_n_s32_x (p0, z1, z0, x0), + z0 = svmsb_x (p0, z1, z0, x0)) + +/* +** msb_w0_s32_x_untied: +** mov z0\.s, w0 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s32_x_untied, svint32_t, int32_t, + z0 = svmsb_n_s32_x (p0, z1, z2, x0), + z0 = svmsb_x (p0, z1, z2, x0)) + +/* +** msb_11_s32_x_tied1: +** mov (z[0-9]+\.s), #11 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s32_x_tied1, svint32_t, + z0 = svmsb_n_s32_x (p0, z0, z1, 11), + z0 = svmsb_x (p0, z0, z1, 11)) + +/* +** msb_11_s32_x_tied2: +** mov (z[0-9]+\.s), #11 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s32_x_tied2, svint32_t, + z0 = svmsb_n_s32_x (p0, z1, z0, 11), + z0 = svmsb_x (p0, z1, z0, 11)) + +/* +** msb_11_s32_x_untied: +** mov z0\.s, #11 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_11_s32_x_untied, svint32_t, + z0 = svmsb_n_s32_x (p0, z1, z2, 11), + z0 = svmsb_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s64.c new file mode 100644 index 000000000..6829fab36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s64.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_s64_m_tied1: +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_s64_m_tied1, svint64_t, + z0 = svmsb_s64_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** msb z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_s64_m_tied2, svint64_t, + z0 = svmsb_s64_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_s64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_s64_m_tied3, svint64_t, + z0 = svmsb_s64_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_s64_m_untied: +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (msb_s64_m_untied, svint64_t, + z0 = svmsb_s64_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svmsb_n_s64_m (p0, z0, z1, x0), + z0 = svmsb_m (p0, z0, z1, x0)) + +/* +** msb_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_s64_m_untied, svint64_t, int64_t, + z0 = svmsb_n_s64_m (p0, z1, z2, x0), + z0 = svmsb_m (p0, z1, z2, x0)) + +/* +** msb_11_s64_m_tied1: +** mov (z[0-9]+\.d), #11 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s64_m_tied1, svint64_t, + z0 = svmsb_n_s64_m (p0, z0, z1, 11), + z0 = svmsb_m (p0, z0, z1, 11)) + +/* +** msb_11_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #11 +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s64_m_untied, svint64_t, + z0 = svmsb_n_s64_m (p0, z1, z2, 11), + z0 = svmsb_m (p0, z1, z2, 11)) + +/* +** msb_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z 
(msb_s64_z_tied1, svint64_t, + z0 = svmsb_s64_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_s64_z_tied2, svint64_t, + z0 = svmsb_s64_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_s64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_s64_z_tied3, svint64_t, + z0 = svmsb_s64_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** msb z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** mls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_s64_z_untied, svint64_t, + z0 = svmsb_s64_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svmsb_n_s64_z (p0, z0, z1, x0), + z0 = svmsb_z (p0, z0, z1, x0)) + +/* +** msb_x0_s64_z_tied2: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_s64_z_tied2, svint64_t, int64_t, + z0 = svmsb_n_s64_z (p0, z1, z0, x0), + z0 = svmsb_z (p0, z1, z0, x0)) + +/* +** msb_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** msb z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_s64_z_untied, svint64_t, int64_t, + z0 = svmsb_n_s64_z (p0, z1, z2, x0), + z0 = svmsb_z (p0, z1, z2, x0)) + +/* +** msb_11_s64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s64_z_tied1, svint64_t, + z0 = svmsb_n_s64_z (p0, z0, z1, 11), + z0 = svmsb_z (p0, z0, z1, 11)) + +/* +** msb_11_s64_z_tied2: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s64_z_tied2, svint64_t, + z0 = svmsb_n_s64_z (p0, z1, z0, 11), + z0 = svmsb_z (p0, z1, z0, 11)) + +/* +** msb_11_s64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** msb z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_11_s64_z_untied, svint64_t, + z0 = svmsb_n_s64_z (p0, z1, z2, 11), + z0 = svmsb_z (p0, z1, z2, 11)) + +/* +** msb_s64_x_tied1: +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_s64_x_tied1, svint64_t, + z0 = svmsb_s64_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_s64_x_tied2: +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_s64_x_tied2, svint64_t, + z0 = svmsb_s64_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_s64_x_tied3: +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_s64_x_tied3, svint64_t, + z0 = svmsb_s64_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_s64_x_untied: +** ( +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** msb z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0, z3 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z 
(msb_s64_x_untied, svint64_t, + z0 = svmsb_s64_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svmsb_n_s64_x (p0, z0, z1, x0), + z0 = svmsb_x (p0, z0, z1, x0)) + +/* +** msb_x0_s64_x_tied2: +** mov (z[0-9]+\.d), x0 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_s64_x_tied2, svint64_t, int64_t, + z0 = svmsb_n_s64_x (p0, z1, z0, x0), + z0 = svmsb_x (p0, z1, z0, x0)) + +/* +** msb_x0_s64_x_untied: +** mov z0\.d, x0 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_s64_x_untied, svint64_t, int64_t, + z0 = svmsb_n_s64_x (p0, z1, z2, x0), + z0 = svmsb_x (p0, z1, z2, x0)) + +/* +** msb_11_s64_x_tied1: +** mov (z[0-9]+\.d), #11 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s64_x_tied1, svint64_t, + z0 = svmsb_n_s64_x (p0, z0, z1, 11), + z0 = svmsb_x (p0, z0, z1, 11)) + +/* +** msb_11_s64_x_tied2: +** mov (z[0-9]+\.d), #11 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s64_x_tied2, svint64_t, + z0 = svmsb_n_s64_x (p0, z1, z0, 11), + z0 = svmsb_x (p0, z1, z0, 11)) + +/* +** msb_11_s64_x_untied: +** mov z0\.d, #11 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_11_s64_x_untied, svint64_t, + z0 = svmsb_n_s64_x (p0, z1, z2, 11), + z0 = svmsb_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s8.c new file mode 100644 index 000000000..d7fcafdd0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_s8.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_s8_m_tied1: +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_m_tied1, svint8_t, + z0 = svmsb_s8_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.b, p0/m, \1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_m_tied2, svint8_t, + z0 = svmsb_s8_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_s8_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_m_tied3, svint8_t, + z0 = svmsb_s8_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_s8_m_untied: +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, z3\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_m_untied, svint8_t, + z0 = svmsb_s8_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svmsb_n_s8_m (p0, z0, z1, x0), + z0 = svmsb_m (p0, z0, z1, x0)) + +/* +** msb_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s8_m_untied, svint8_t, int8_t, + z0 = svmsb_n_s8_m (p0, z1, z2, x0), + z0 = svmsb_m (p0, z1, z2, x0)) + +/* +** msb_11_s8_m_tied1: +** mov (z[0-9]+\.b), #11 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s8_m_tied1, svint8_t, + z0 = svmsb_n_s8_m (p0, z0, z1, 11), + z0 = svmsb_m (p0, z0, z1, 11)) + +/* +** msb_11_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s8_m_untied, svint8_t, + z0 = svmsb_n_s8_m 
(p0, z1, z2, 11), + z0 = svmsb_m (p0, z1, z2, 11)) + +/* +** msb_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_z_tied1, svint8_t, + z0 = svmsb_s8_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_z_tied2, svint8_t, + z0 = svmsb_s8_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_s8_z_tied3: +** movprfx z0\.b, p0/z, z0\.b +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_z_tied3, svint8_t, + z0 = svmsb_s8_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** msb z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, z1\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z3\.b +** mls z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_s8_z_untied, svint8_t, + z0 = svmsb_s8_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svmsb_n_s8_z (p0, z0, z1, x0), + z0 = svmsb_z (p0, z0, z1, x0)) + +/* +** msb_w0_s8_z_tied2: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s8_z_tied2, svint8_t, int8_t, + z0 = svmsb_n_s8_z (p0, z1, z0, x0), + z0 = svmsb_z (p0, z1, z0, x0)) + +/* +** msb_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** msb z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, z1\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s8_z_untied, svint8_t, int8_t, + z0 = svmsb_n_s8_z (p0, z1, z2, x0), + z0 = svmsb_z (p0, z1, z2, x0)) + +/* +** msb_11_s8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s8_z_tied1, svint8_t, + z0 = svmsb_n_s8_z (p0, z0, z1, 11), + z0 = svmsb_z (p0, z0, z1, 11)) + +/* +** msb_11_s8_z_tied2: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s8_z_tied2, svint8_t, + z0 = svmsb_n_s8_z (p0, z1, z0, 11), + z0 = svmsb_z (p0, z1, z0, 11)) + +/* +** msb_11_s8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** msb z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, z1\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_11_s8_z_untied, svint8_t, + z0 = svmsb_n_s8_z (p0, z1, z2, 11), + z0 = svmsb_z (p0, z1, z2, 11)) + +/* +** msb_s8_x_tied1: +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_x_tied1, svint8_t, + z0 = svmsb_s8_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_s8_x_tied2: +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_x_tied2, svint8_t, + z0 = svmsb_s8_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_s8_x_tied3: +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_s8_x_tied3, svint8_t, + z0 = svmsb_s8_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_s8_x_untied: +** ( +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0, z2 +** msb z0\.b, p0/m, 
z1\.b, z3\.b +** | +** movprfx z0, z3 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_s8_x_untied, svint8_t, + z0 = svmsb_s8_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svmsb_n_s8_x (p0, z0, z1, x0), + z0 = svmsb_x (p0, z0, z1, x0)) + +/* +** msb_w0_s8_x_tied2: +** mov (z[0-9]+\.b), w0 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s8_x_tied2, svint8_t, int8_t, + z0 = svmsb_n_s8_x (p0, z1, z0, x0), + z0 = svmsb_x (p0, z1, z0, x0)) + +/* +** msb_w0_s8_x_untied: +** mov z0\.b, w0 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_s8_x_untied, svint8_t, int8_t, + z0 = svmsb_n_s8_x (p0, z1, z2, x0), + z0 = svmsb_x (p0, z1, z2, x0)) + +/* +** msb_11_s8_x_tied1: +** mov (z[0-9]+\.b), #11 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s8_x_tied1, svint8_t, + z0 = svmsb_n_s8_x (p0, z0, z1, 11), + z0 = svmsb_x (p0, z0, z1, 11)) + +/* +** msb_11_s8_x_tied2: +** mov (z[0-9]+\.b), #11 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_s8_x_tied2, svint8_t, + z0 = svmsb_n_s8_x (p0, z1, z0, 11), + z0 = svmsb_x (p0, z1, z0, 11)) + +/* +** msb_11_s8_x_untied: +** mov z0\.b, #11 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_11_s8_x_untied, svint8_t, + z0 = svmsb_n_s8_x (p0, z1, z2, 11), + z0 = svmsb_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u16.c new file mode 100644 index 000000000..437a96040 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u16.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_u16_m_tied1: +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_m_tied1, svuint16_t, + z0 = svmsb_u16_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_m_tied2, svuint16_t, + z0 = svmsb_u16_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_u16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_m_tied3, svuint16_t, + z0 = svmsb_u16_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_u16_m_untied: +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_m_untied, svuint16_t, + z0 = svmsb_u16_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svmsb_n_u16_m (p0, z0, z1, x0), + z0 = svmsb_m (p0, z0, z1, x0)) + +/* +** msb_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svmsb_n_u16_m (p0, z1, z2, x0), + z0 = svmsb_m (p0, z1, z2, x0)) + +/* +** msb_11_u16_m_tied1: +** mov (z[0-9]+\.h), #11 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u16_m_tied1, svuint16_t, + z0 = svmsb_n_u16_m (p0, z0, z1, 11), + z0 = svmsb_m (p0, z0, z1, 11)) + +/* +** msb_11_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx 
z0, z1 +** msb z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u16_m_untied, svuint16_t, + z0 = svmsb_n_u16_m (p0, z1, z2, 11), + z0 = svmsb_m (p0, z1, z2, 11)) + +/* +** msb_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_z_tied1, svuint16_t, + z0 = svmsb_u16_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_z_tied2, svuint16_t, + z0 = svmsb_u16_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_u16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_z_tied3, svuint16_t, + z0 = svmsb_u16_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** msb z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** mls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_u16_z_untied, svuint16_t, + z0 = svmsb_u16_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svmsb_n_u16_z (p0, z0, z1, x0), + z0 = svmsb_z (p0, z0, z1, x0)) + +/* +** msb_w0_u16_z_tied2: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u16_z_tied2, svuint16_t, uint16_t, + z0 = svmsb_n_u16_z (p0, z1, z0, x0), + z0 = svmsb_z (p0, z1, z0, x0)) + +/* +** msb_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** msb z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svmsb_n_u16_z (p0, z1, z2, x0), + z0 = svmsb_z (p0, z1, z2, x0)) + +/* +** msb_11_u16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u16_z_tied1, svuint16_t, + z0 = svmsb_n_u16_z (p0, z0, z1, 11), + z0 = svmsb_z (p0, z0, z1, 11)) + +/* +** msb_11_u16_z_tied2: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u16_z_tied2, svuint16_t, + z0 = svmsb_n_u16_z (p0, z1, z0, 11), + z0 = svmsb_z (p0, z1, z0, 11)) + +/* +** msb_11_u16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** msb z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** msb z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_11_u16_z_untied, svuint16_t, + z0 = svmsb_n_u16_z (p0, z1, z2, 11), + z0 = svmsb_z (p0, z1, z2, 11)) + +/* +** msb_u16_x_tied1: +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_x_tied1, svuint16_t, + z0 = svmsb_u16_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_u16_x_tied2: +** msb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_x_tied2, svuint16_t, + z0 = svmsb_u16_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_u16_x_tied3: +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_u16_x_tied3, svuint16_t, + z0 = 
svmsb_u16_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_u16_x_untied: +** ( +** movprfx z0, z1 +** msb z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** msb z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0, z3 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_u16_x_untied, svuint16_t, + z0 = svmsb_u16_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svmsb_n_u16_x (p0, z0, z1, x0), + z0 = svmsb_x (p0, z0, z1, x0)) + +/* +** msb_w0_u16_x_tied2: +** mov (z[0-9]+\.h), w0 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u16_x_tied2, svuint16_t, uint16_t, + z0 = svmsb_n_u16_x (p0, z1, z0, x0), + z0 = svmsb_x (p0, z1, z0, x0)) + +/* +** msb_w0_u16_x_untied: +** mov z0\.h, w0 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svmsb_n_u16_x (p0, z1, z2, x0), + z0 = svmsb_x (p0, z1, z2, x0)) + +/* +** msb_11_u16_x_tied1: +** mov (z[0-9]+\.h), #11 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u16_x_tied1, svuint16_t, + z0 = svmsb_n_u16_x (p0, z0, z1, 11), + z0 = svmsb_x (p0, z0, z1, 11)) + +/* +** msb_11_u16_x_tied2: +** mov (z[0-9]+\.h), #11 +** msb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u16_x_tied2, svuint16_t, + z0 = svmsb_n_u16_x (p0, z1, z0, 11), + z0 = svmsb_x (p0, z1, z0, 11)) + +/* +** msb_11_u16_x_untied: +** mov z0\.h, #11 +** mls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (msb_11_u16_x_untied, svuint16_t, + z0 = svmsb_n_u16_x (p0, z1, z2, 11), + z0 = svmsb_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u32.c new file mode 100644 index 000000000..aaaf0344a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u32.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_u32_m_tied1: +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_m_tied1, svuint32_t, + z0 = svmsb_u32_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_m_tied2, svuint32_t, + z0 = svmsb_u32_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_u32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_m_tied3, svuint32_t, + z0 = svmsb_u32_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_u32_m_untied: +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_m_untied, svuint32_t, + z0 = svmsb_u32_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svmsb_n_u32_m (p0, z0, z1, x0), + z0 = svmsb_m (p0, z0, z1, x0)) + +/* +** msb_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svmsb_n_u32_m (p0, z1, z2, x0), + z0 = svmsb_m (p0, z1, z2, x0)) + +/* +** msb_11_u32_m_tied1: +** mov (z[0-9]+\.s), #11 +** msb z0\.s, p0/m, z1\.s, \1 
+** ret +*/ +TEST_UNIFORM_Z (msb_11_u32_m_tied1, svuint32_t, + z0 = svmsb_n_u32_m (p0, z0, z1, 11), + z0 = svmsb_m (p0, z0, z1, 11)) + +/* +** msb_11_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u32_m_untied, svuint32_t, + z0 = svmsb_n_u32_m (p0, z1, z2, 11), + z0 = svmsb_m (p0, z1, z2, 11)) + +/* +** msb_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_z_tied1, svuint32_t, + z0 = svmsb_u32_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_z_tied2, svuint32_t, + z0 = svmsb_u32_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_u32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_z_tied3, svuint32_t, + z0 = svmsb_u32_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** msb z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** mls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_u32_z_untied, svuint32_t, + z0 = svmsb_u32_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svmsb_n_u32_z (p0, z0, z1, x0), + z0 = svmsb_z (p0, z0, z1, x0)) + +/* +** msb_w0_u32_z_tied2: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u32_z_tied2, svuint32_t, uint32_t, + z0 = svmsb_n_u32_z (p0, z1, z0, x0), + z0 = svmsb_z (p0, z1, z0, x0)) + +/* +** msb_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** msb z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svmsb_n_u32_z (p0, z1, z2, x0), + z0 = svmsb_z (p0, z1, z2, x0)) + +/* +** msb_11_u32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u32_z_tied1, svuint32_t, + z0 = svmsb_n_u32_z (p0, z0, z1, 11), + z0 = svmsb_z (p0, z0, z1, 11)) + +/* +** msb_11_u32_z_tied2: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u32_z_tied2, svuint32_t, + z0 = svmsb_n_u32_z (p0, z1, z0, 11), + z0 = svmsb_z (p0, z1, z0, 11)) + +/* +** msb_11_u32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** msb z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** msb z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_11_u32_z_untied, svuint32_t, + z0 = svmsb_n_u32_z (p0, z1, z2, 11), + z0 = svmsb_z (p0, z1, z2, 11)) + +/* +** msb_u32_x_tied1: +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_x_tied1, svuint32_t, + z0 = svmsb_u32_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_u32_x_tied2: +** msb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z 
(msb_u32_x_tied2, svuint32_t, + z0 = svmsb_u32_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_u32_x_tied3: +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_u32_x_tied3, svuint32_t, + z0 = svmsb_u32_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_u32_x_untied: +** ( +** movprfx z0, z1 +** msb z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** msb z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0, z3 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_u32_x_untied, svuint32_t, + z0 = svmsb_u32_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svmsb_n_u32_x (p0, z0, z1, x0), + z0 = svmsb_x (p0, z0, z1, x0)) + +/* +** msb_w0_u32_x_tied2: +** mov (z[0-9]+\.s), w0 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u32_x_tied2, svuint32_t, uint32_t, + z0 = svmsb_n_u32_x (p0, z1, z0, x0), + z0 = svmsb_x (p0, z1, z0, x0)) + +/* +** msb_w0_u32_x_untied: +** mov z0\.s, w0 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svmsb_n_u32_x (p0, z1, z2, x0), + z0 = svmsb_x (p0, z1, z2, x0)) + +/* +** msb_11_u32_x_tied1: +** mov (z[0-9]+\.s), #11 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u32_x_tied1, svuint32_t, + z0 = svmsb_n_u32_x (p0, z0, z1, 11), + z0 = svmsb_x (p0, z0, z1, 11)) + +/* +** msb_11_u32_x_tied2: +** mov (z[0-9]+\.s), #11 +** msb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u32_x_tied2, svuint32_t, + z0 = svmsb_n_u32_x (p0, z1, z0, 11), + z0 = svmsb_x (p0, z1, z0, 11)) + +/* +** msb_11_u32_x_untied: +** mov z0\.s, #11 +** mls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (msb_11_u32_x_untied, svuint32_t, + z0 = svmsb_n_u32_x (p0, z1, z2, 11), + z0 = svmsb_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u64.c new file mode 100644 index 000000000..5c5d33073 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u64.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_u64_m_tied1: +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_u64_m_tied1, svuint64_t, + z0 = svmsb_u64_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** msb z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_u64_m_tied2, svuint64_t, + z0 = svmsb_u64_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_u64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_u64_m_tied3, svuint64_t, + z0 = svmsb_u64_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_u64_m_untied: +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (msb_u64_m_untied, svuint64_t, + z0 = svmsb_u64_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svmsb_n_u64_m (p0, z0, z1, x0), + z0 = svmsb_m (p0, z0, z1, x0)) + +/* +** msb_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, \1 +** ret +*/ 
+TEST_UNIFORM_ZX (msb_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svmsb_n_u64_m (p0, z1, z2, x0), + z0 = svmsb_m (p0, z1, z2, x0)) + +/* +** msb_11_u64_m_tied1: +** mov (z[0-9]+\.d), #11 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u64_m_tied1, svuint64_t, + z0 = svmsb_n_u64_m (p0, z0, z1, 11), + z0 = svmsb_m (p0, z0, z1, 11)) + +/* +** msb_11_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #11 +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u64_m_untied, svuint64_t, + z0 = svmsb_n_u64_m (p0, z1, z2, 11), + z0 = svmsb_m (p0, z1, z2, 11)) + +/* +** msb_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_u64_z_tied1, svuint64_t, + z0 = svmsb_u64_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_u64_z_tied2, svuint64_t, + z0 = svmsb_u64_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_u64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_u64_z_tied3, svuint64_t, + z0 = svmsb_u64_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** msb z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** mls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_u64_z_untied, svuint64_t, + z0 = svmsb_u64_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svmsb_n_u64_z (p0, z0, z1, x0), + z0 = svmsb_z (p0, z0, z1, x0)) + +/* +** msb_x0_u64_z_tied2: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_u64_z_tied2, svuint64_t, uint64_t, + z0 = svmsb_n_u64_z (p0, z1, z0, x0), + z0 = svmsb_z (p0, z1, z0, x0)) + +/* +** msb_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** msb z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svmsb_n_u64_z (p0, z1, z2, x0), + z0 = svmsb_z (p0, z1, z2, x0)) + +/* +** msb_11_u64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u64_z_tied1, svuint64_t, + z0 = svmsb_n_u64_z (p0, z0, z1, 11), + z0 = svmsb_z (p0, z0, z1, 11)) + +/* +** msb_11_u64_z_tied2: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u64_z_tied2, svuint64_t, + z0 = svmsb_n_u64_z (p0, z1, z0, 11), + z0 = svmsb_z (p0, z1, z0, 11)) + +/* +** msb_11_u64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** msb z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** msb z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_11_u64_z_untied, svuint64_t, + z0 = svmsb_n_u64_z (p0, z1, z2, 11), + z0 = svmsb_z (p0, z1, z2, 11)) + +/* +** msb_u64_x_tied1: +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ 
+TEST_UNIFORM_Z (msb_u64_x_tied1, svuint64_t, + z0 = svmsb_u64_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_u64_x_tied2: +** msb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_u64_x_tied2, svuint64_t, + z0 = svmsb_u64_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_u64_x_tied3: +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_u64_x_tied3, svuint64_t, + z0 = svmsb_u64_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_u64_x_untied: +** ( +** movprfx z0, z1 +** msb z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** msb z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0, z3 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_u64_x_untied, svuint64_t, + z0 = svmsb_u64_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svmsb_n_u64_x (p0, z0, z1, x0), + z0 = svmsb_x (p0, z0, z1, x0)) + +/* +** msb_x0_u64_x_tied2: +** mov (z[0-9]+\.d), x0 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_u64_x_tied2, svuint64_t, uint64_t, + z0 = svmsb_n_u64_x (p0, z1, z0, x0), + z0 = svmsb_x (p0, z1, z0, x0)) + +/* +** msb_x0_u64_x_untied: +** mov z0\.d, x0 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_ZX (msb_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svmsb_n_u64_x (p0, z1, z2, x0), + z0 = svmsb_x (p0, z1, z2, x0)) + +/* +** msb_11_u64_x_tied1: +** mov (z[0-9]+\.d), #11 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u64_x_tied1, svuint64_t, + z0 = svmsb_n_u64_x (p0, z0, z1, 11), + z0 = svmsb_x (p0, z0, z1, 11)) + +/* +** msb_11_u64_x_tied2: +** mov (z[0-9]+\.d), #11 +** msb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u64_x_tied2, svuint64_t, + z0 = svmsb_n_u64_x (p0, z1, z0, 11), + z0 = svmsb_x (p0, z1, z0, 11)) + +/* +** msb_11_u64_x_untied: +** mov z0\.d, #11 +** mls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (msb_11_u64_x_untied, svuint64_t, + z0 = svmsb_n_u64_x (p0, z1, z2, 11), + z0 = svmsb_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u8.c new file mode 100644 index 000000000..5665ec9e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/msb_u8.c @@ -0,0 +1,321 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** msb_u8_m_tied1: +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_m_tied1, svuint8_t, + z0 = svmsb_u8_m (p0, z0, z1, z2), + z0 = svmsb_m (p0, z0, z1, z2)) + +/* +** msb_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.b, p0/m, \1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_m_tied2, svuint8_t, + z0 = svmsb_u8_m (p0, z1, z0, z2), + z0 = svmsb_m (p0, z1, z0, z2)) + +/* +** msb_u8_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_m_tied3, svuint8_t, + z0 = svmsb_u8_m (p0, z1, z2, z0), + z0 = svmsb_m (p0, z1, z2, z0)) + +/* +** msb_u8_m_untied: +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, z3\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_m_untied, svuint8_t, + z0 = svmsb_u8_m (p0, z1, z2, z3), + z0 = svmsb_m (p0, z1, z2, z3)) + +/* +** msb_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u8_m_tied1, svuint8_t, uint8_t, + 
z0 = svmsb_n_u8_m (p0, z0, z1, x0), + z0 = svmsb_m (p0, z0, z1, x0)) + +/* +** msb_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svmsb_n_u8_m (p0, z1, z2, x0), + z0 = svmsb_m (p0, z1, z2, x0)) + +/* +** msb_11_u8_m_tied1: +** mov (z[0-9]+\.b), #11 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u8_m_tied1, svuint8_t, + z0 = svmsb_n_u8_m (p0, z0, z1, 11), + z0 = svmsb_m (p0, z0, z1, 11)) + +/* +** msb_11_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u8_m_untied, svuint8_t, + z0 = svmsb_n_u8_m (p0, z1, z2, 11), + z0 = svmsb_m (p0, z1, z2, 11)) + +/* +** msb_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_z_tied1, svuint8_t, + z0 = svmsb_u8_z (p0, z0, z1, z2), + z0 = svmsb_z (p0, z0, z1, z2)) + +/* +** msb_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_z_tied2, svuint8_t, + z0 = svmsb_u8_z (p0, z1, z0, z2), + z0 = svmsb_z (p0, z1, z0, z2)) + +/* +** msb_u8_z_tied3: +** movprfx z0\.b, p0/z, z0\.b +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_z_tied3, svuint8_t, + z0 = svmsb_u8_z (p0, z1, z2, z0), + z0 = svmsb_z (p0, z1, z2, z0)) + +/* +** msb_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** msb z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, z1\.b, z3\.b +** | +** movprfx z0\.b, p0/z, z3\.b +** mls z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_u8_z_untied, svuint8_t, + z0 = svmsb_u8_z (p0, z1, z2, z3), + z0 = svmsb_z (p0, z1, z2, z3)) + +/* +** msb_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svmsb_n_u8_z (p0, z0, z1, x0), + z0 = svmsb_z (p0, z0, z1, x0)) + +/* +** msb_w0_u8_z_tied2: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u8_z_tied2, svuint8_t, uint8_t, + z0 = svmsb_n_u8_z (p0, z1, z0, x0), + z0 = svmsb_z (p0, z1, z0, x0)) + +/* +** msb_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** msb z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, z1\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svmsb_n_u8_z (p0, z1, z2, x0), + z0 = svmsb_z (p0, z1, z2, x0)) + +/* +** msb_11_u8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u8_z_tied1, svuint8_t, + z0 = svmsb_n_u8_z (p0, z0, z1, 11), + z0 = svmsb_z (p0, z0, z1, 11)) + +/* +** msb_11_u8_z_tied2: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u8_z_tied2, svuint8_t, + z0 = svmsb_n_u8_z (p0, z1, z0, 11), + z0 = svmsb_z (p0, z1, z0, 11)) + +/* +** msb_11_u8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** msb z0\.b, p0/m, z2\.b, \1 +** | +** movprfx z0\.b, p0/z, z2\.b +** msb z0\.b, p0/m, z1\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_11_u8_z_untied, 
svuint8_t, + z0 = svmsb_n_u8_z (p0, z1, z2, 11), + z0 = svmsb_z (p0, z1, z2, 11)) + +/* +** msb_u8_x_tied1: +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_x_tied1, svuint8_t, + z0 = svmsb_u8_x (p0, z0, z1, z2), + z0 = svmsb_x (p0, z0, z1, z2)) + +/* +** msb_u8_x_tied2: +** msb z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_x_tied2, svuint8_t, + z0 = svmsb_u8_x (p0, z1, z0, z2), + z0 = svmsb_x (p0, z1, z0, z2)) + +/* +** msb_u8_x_tied3: +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_u8_x_tied3, svuint8_t, + z0 = svmsb_u8_x (p0, z1, z2, z0), + z0 = svmsb_x (p0, z1, z2, z0)) + +/* +** msb_u8_x_untied: +** ( +** movprfx z0, z1 +** msb z0\.b, p0/m, z2\.b, z3\.b +** | +** movprfx z0, z2 +** msb z0\.b, p0/m, z1\.b, z3\.b +** | +** movprfx z0, z3 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (msb_u8_x_untied, svuint8_t, + z0 = svmsb_u8_x (p0, z1, z2, z3), + z0 = svmsb_x (p0, z1, z2, z3)) + +/* +** msb_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svmsb_n_u8_x (p0, z0, z1, x0), + z0 = svmsb_x (p0, z0, z1, x0)) + +/* +** msb_w0_u8_x_tied2: +** mov (z[0-9]+\.b), w0 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u8_x_tied2, svuint8_t, uint8_t, + z0 = svmsb_n_u8_x (p0, z1, z0, x0), + z0 = svmsb_x (p0, z1, z0, x0)) + +/* +** msb_w0_u8_x_untied: +** mov z0\.b, w0 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_ZX (msb_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svmsb_n_u8_x (p0, z1, z2, x0), + z0 = svmsb_x (p0, z1, z2, x0)) + +/* +** msb_11_u8_x_tied1: +** mov (z[0-9]+\.b), #11 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u8_x_tied1, svuint8_t, + z0 = svmsb_n_u8_x (p0, z0, z1, 11), + z0 = svmsb_x (p0, z0, z1, 11)) + +/* +** msb_11_u8_x_tied2: +** mov (z[0-9]+\.b), #11 +** msb z0\.b, p0/m, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (msb_11_u8_x_tied2, svuint8_t, + z0 = svmsb_n_u8_x (p0, z1, z0, 11), + z0 = svmsb_x (p0, z1, z0, 11)) + +/* +** msb_11_u8_x_untied: +** mov z0\.b, #11 +** mls z0\.b, p0/m, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (msb_11_u8_x_untied, svuint8_t, + z0 = svmsb_n_u8_x (p0, z1, z2, 11), + z0 = svmsb_x (p0, z1, z2, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16.c new file mode 100644 index 000000000..ef3de0c59 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16.c @@ -0,0 +1,444 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_f16_m_tied1: +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_m_tied1, svfloat16_t, + z0 = svmul_f16_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_m_tied2, svfloat16_t, + z0 = svmul_f16_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_f16_m_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_m_untied, svfloat16_t, + z0 = svmul_f16_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmul_n_f16_m (p0, z0, d4), + z0 = svmul_m (p0, z0, d4)) + +/* +** mul_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 
+** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmul_n_f16_m (p0, z1, d4), + z0 = svmul_m (p0, z1, d4)) + +/* +** mul_1_f16_m_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_m_tied1, svfloat16_t, + z0 = svmul_n_f16_m (p0, z0, 1), + z0 = svmul_m (p0, z0, 1)) + +/* +** mul_1_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_m_untied, svfloat16_t, + z0 = svmul_n_f16_m (p0, z1, 1), + z0 = svmul_m (p0, z1, 1)) + +/* +** mul_0p5_f16_m_tied1: +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_m_tied1, svfloat16_t, + z0 = svmul_n_f16_m (p0, z0, 0.5), + z0 = svmul_m (p0, z0, 0.5)) + +/* +** mul_0p5_f16_m_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_m_untied, svfloat16_t, + z0 = svmul_n_f16_m (p0, z1, 0.5), + z0 = svmul_m (p0, z1, 0.5)) + +/* +** mul_2_f16_m_tied1: +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_m_tied1, svfloat16_t, + z0 = svmul_n_f16_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_f16_m_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_m_untied, svfloat16_t, + z0 = svmul_n_f16_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_z_tied1, svfloat16_t, + z0 = svmul_f16_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_z_tied2, svfloat16_t, + z0 = svmul_f16_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f16_z_untied, svfloat16_t, + z0 = svmul_f16_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmul_n_f16_z (p0, z0, d4), + z0 = svmul_z (p0, z0, d4)) + +/* +** mul_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmul_n_f16_z (p0, z1, d4), + z0 = svmul_z (p0, z1, d4)) + +/* +** mul_1_f16_z_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_z_tied1, svfloat16_t, + z0 = svmul_n_f16_z (p0, z0, 1), + z0 = svmul_z (p0, z0, 1)) + +/* +** mul_1_f16_z_untied: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_z_untied, svfloat16_t, + z0 = svmul_n_f16_z (p0, z1, 1), + z0 = svmul_z (p0, z1, 1)) + +/* +** mul_0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_z_tied1, svfloat16_t, + z0 = svmul_n_f16_z (p0, z0, 0.5), + z0 = svmul_z (p0, z0, 0.5)) + +/* +** mul_0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_z_untied, svfloat16_t, + z0 = svmul_n_f16_z (p0, z1, 0.5), + z0 = svmul_z (p0, z1, 0.5)) + +/* +** mul_2_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_z_tied1, svfloat16_t, + z0 = svmul_n_f16_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_z_untied, svfloat16_t, + z0 = svmul_n_f16_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_f16_x_tied1: +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_x_tied1, svfloat16_t, + z0 = svmul_f16_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_f16_x_tied2: +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_x_tied2, svfloat16_t, + z0 = svmul_f16_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_f16_x_untied: +** ( +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f16_x_untied, svfloat16_t, + z0 = svmul_f16_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmul_n_f16_x (p0, z0, d4), + z0 = svmul_x (p0, z0, d4)) + +/* +** mul_h4_f16_x_untied: +** mov z0\.h, h4 +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmul_n_f16_x (p0, z1, d4), + z0 = svmul_x (p0, z1, d4)) + +/* +** mul_1_f16_x_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (p0, z0, 1), + z0 = svmul_x (p0, z0, 1)) + +/* +** mul_1_f16_x_untied: +** fmov z0\.h, #1\.0(?:e\+0)? 
+** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (p0, z1, 1), + z0 = svmul_x (p0, z1, 1)) + +/* +** mul_0p5_f16_x_tied1: +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (p0, z0, 0.5), + z0 = svmul_x (p0, z0, 0.5)) + +/* +** mul_0p5_f16_x_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (p0, z1, 0.5), + z0 = svmul_x (p0, z1, 0.5)) + +/* +** mul_2_f16_x_tied1: +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_f16_x_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** ptrue_mul_f16_x_tied1: +** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f16_x_tied1, svfloat16_t, + z0 = svmul_f16_x (svptrue_b16 (), z0, z1), + z0 = svmul_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_mul_f16_x_tied2: +** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f16_x_tied2, svfloat16_t, + z0 = svmul_f16_x (svptrue_b16 (), z1, z0), + z0 = svmul_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_mul_f16_x_untied: +** fmul z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f16_x_untied, svfloat16_t, + z0 = svmul_f16_x (svptrue_b16 (), z1, z2), + z0 = svmul_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_mul_1_f16_x_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmul z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svmul_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_mul_1_f16_x_untied: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmul z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svmul_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_mul_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svmul_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_mul_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svmul_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_mul_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svmul_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_mul_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svmul_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16_notrap.c new file mode 100644 index 000000000..481fe999c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f16_notrap.c @@ -0,0 +1,439 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_f16_m_tied1: +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_m_tied1, svfloat16_t, + z0 = svmul_f16_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_m_tied2, svfloat16_t, + z0 = svmul_f16_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_f16_m_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_m_untied, svfloat16_t, + z0 = svmul_f16_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmul_n_f16_m (p0, z0, d4), + z0 = svmul_m (p0, z0, d4)) + +/* +** mul_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmul_n_f16_m (p0, z1, d4), + z0 = svmul_m (p0, z1, d4)) + +/* +** mul_1_f16_m_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_m_tied1, svfloat16_t, + z0 = svmul_n_f16_m (p0, z0, 1), + z0 = svmul_m (p0, z0, 1)) + +/* +** mul_1_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_m_untied, svfloat16_t, + z0 = svmul_n_f16_m (p0, z1, 1), + z0 = svmul_m (p0, z1, 1)) + +/* +** mul_0p5_f16_m_tied1: +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_m_tied1, svfloat16_t, + z0 = svmul_n_f16_m (p0, z0, 0.5), + z0 = svmul_m (p0, z0, 0.5)) + +/* +** mul_0p5_f16_m_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_m_untied, svfloat16_t, + z0 = svmul_n_f16_m (p0, z1, 0.5), + z0 = svmul_m (p0, z1, 0.5)) + +/* +** mul_2_f16_m_tied1: +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_m_tied1, svfloat16_t, + z0 = svmul_n_f16_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_f16_m_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_m_untied, svfloat16_t, + z0 = svmul_n_f16_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_z_tied1, svfloat16_t, + z0 = svmul_f16_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_f16_z_tied2, svfloat16_t, + z0 = svmul_f16_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f16_z_untied, svfloat16_t, + z0 = svmul_f16_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmul_n_f16_z (p0, z0, d4), + z0 = svmul_z (p0, z0, d4)) + +/* +** mul_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmul_n_f16_z (p0, z1, d4), + z0 = svmul_z (p0, z1, d4)) + +/* +** mul_1_f16_z_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_z_tied1, svfloat16_t, + z0 = svmul_n_f16_z (p0, z0, 1), + z0 = svmul_z (p0, z0, 1)) + +/* +** mul_1_f16_z_untied: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_z_untied, svfloat16_t, + z0 = svmul_n_f16_z (p0, z1, 1), + z0 = svmul_z (p0, z1, 1)) + +/* +** mul_0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_z_tied1, svfloat16_t, + z0 = svmul_n_f16_z (p0, z0, 0.5), + z0 = svmul_z (p0, z0, 0.5)) + +/* +** mul_0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_z_untied, svfloat16_t, + z0 = svmul_n_f16_z (p0, z1, 0.5), + z0 = svmul_z (p0, z1, 0.5)) + +/* +** mul_2_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_z_tied1, svfloat16_t, + z0 = svmul_n_f16_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_z_untied, svfloat16_t, + z0 = svmul_n_f16_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_f16_x_tied1: +** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (mul_f16_x_tied1, svfloat16_t, + z0 = svmul_f16_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_f16_x_tied2: +** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (mul_f16_x_tied2, svfloat16_t, + z0 = svmul_f16_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_f16_x_untied: +** fmul z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (mul_f16_x_untied, svfloat16_t, + z0 = svmul_f16_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmul z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmul_n_f16_x (p0, z0, d4), + z0 = svmul_x (p0, z0, d4)) + +/* +** mul_h4_f16_x_untied: +** mov (z[0-9]+\.h), h4 +** fmul z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_ZD (mul_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmul_n_f16_x (p0, z1, d4), + z0 = svmul_x (p0, z1, d4)) + +/* +** mul_1_f16_x_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmul z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (p0, z0, 1), + z0 = svmul_x (p0, z0, 1)) + +/* +** mul_1_f16_x_untied: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** fmul z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (p0, z1, 1), + z0 = svmul_x (p0, z1, 1)) + +/* +** mul_0p5_f16_x_tied1: +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (p0, z0, 0.5), + z0 = svmul_x (p0, z0, 0.5)) + +/* +** mul_0p5_f16_x_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (p0, z1, 0.5), + z0 = svmul_x (p0, z1, 0.5)) + +/* +** mul_2_f16_x_tied1: +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_f16_x_untied: +** movprfx z0, z1 +** fmul z0\.h, p0/m, z0\.h, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** ptrue_mul_f16_x_tied1: +** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f16_x_tied1, svfloat16_t, + z0 = svmul_f16_x (svptrue_b16 (), z0, z1), + z0 = svmul_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_mul_f16_x_tied2: +** fmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f16_x_tied2, svfloat16_t, + z0 = svmul_f16_x (svptrue_b16 (), z1, z0), + z0 = svmul_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_mul_f16_x_untied: +** fmul z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f16_x_untied, svfloat16_t, + z0 = svmul_f16_x (svptrue_b16 (), z1, z2), + z0 = svmul_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_mul_1_f16_x_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmul z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svmul_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_mul_1_f16_x_untied: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmul z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svmul_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_mul_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svmul_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_mul_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svmul_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_mul_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f16_x_tied1, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svmul_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_mul_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f16_x_untied, svfloat16_t, + z0 = svmul_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svmul_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32.c new file mode 100644 index 000000000..5b3df6fde --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32.c @@ -0,0 +1,444 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_f32_m_tied1: +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_m_tied1, svfloat32_t, + z0 = svmul_f32_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_m_tied2, svfloat32_t, + z0 = svmul_f32_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_f32_m_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_m_untied, svfloat32_t, + z0 = svmul_f32_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmul_n_f32_m (p0, z0, d4), + z0 = svmul_m (p0, z0, d4)) + +/* +** mul_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_m_untied, svfloat32_t, float, + z0 = svmul_n_f32_m (p0, z1, d4), + z0 = svmul_m (p0, z1, d4)) + +/* +** mul_1_f32_m_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_m_tied1, svfloat32_t, + z0 = svmul_n_f32_m (p0, z0, 1), + z0 = svmul_m (p0, z0, 1)) + +/* +** mul_1_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_m_untied, svfloat32_t, + z0 = svmul_n_f32_m (p0, z1, 1), + z0 = svmul_m (p0, z1, 1)) + +/* +** mul_0p5_f32_m_tied1: +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_m_tied1, svfloat32_t, + z0 = svmul_n_f32_m (p0, z0, 0.5), + z0 = svmul_m (p0, z0, 0.5)) + +/* +** mul_0p5_f32_m_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_m_untied, svfloat32_t, + z0 = svmul_n_f32_m (p0, z1, 0.5), + z0 = svmul_m (p0, z1, 0.5)) + +/* +** mul_2_f32_m_tied1: +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_m_tied1, svfloat32_t, + z0 = svmul_n_f32_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_f32_m_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_m_untied, svfloat32_t, + z0 = svmul_n_f32_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_z_tied1, svfloat32_t, + z0 = svmul_f32_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_z_tied2, svfloat32_t, + z0 = svmul_f32_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f32_z_untied, svfloat32_t, + z0 = svmul_f32_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmul_n_f32_z (p0, z0, d4), + z0 = svmul_z (p0, z0, d4)) + +/* +** mul_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_z_untied, svfloat32_t, float, + z0 = svmul_n_f32_z (p0, z1, d4), + z0 = svmul_z (p0, z1, d4)) + +/* +** mul_1_f32_z_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_z_tied1, svfloat32_t, + z0 = svmul_n_f32_z (p0, z0, 1), + z0 = svmul_z (p0, z0, 1)) + +/* +** mul_1_f32_z_untied: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_z_untied, svfloat32_t, + z0 = svmul_n_f32_z (p0, z1, 1), + z0 = svmul_z (p0, z1, 1)) + +/* +** mul_0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_z_tied1, svfloat32_t, + z0 = svmul_n_f32_z (p0, z0, 0.5), + z0 = svmul_z (p0, z0, 0.5)) + +/* +** mul_0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_z_untied, svfloat32_t, + z0 = svmul_n_f32_z (p0, z1, 0.5), + z0 = svmul_z (p0, z1, 0.5)) + +/* +** mul_2_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_z_tied1, svfloat32_t, + z0 = svmul_n_f32_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_z_untied, svfloat32_t, + z0 = svmul_n_f32_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_f32_x_tied1: +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_x_tied1, svfloat32_t, + z0 = svmul_f32_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_f32_x_tied2: +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_x_tied2, svfloat32_t, + z0 = svmul_f32_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_f32_x_untied: +** ( +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f32_x_untied, svfloat32_t, + z0 = svmul_f32_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmul_n_f32_x (p0, z0, d4), + z0 = svmul_x (p0, z0, d4)) + +/* +** mul_s4_f32_x_untied: +** mov z0\.s, s4 +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_x_untied, svfloat32_t, float, + z0 = svmul_n_f32_x (p0, z1, d4), + z0 = svmul_x (p0, z1, d4)) + +/* +** mul_1_f32_x_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (p0, z0, 1), + z0 = svmul_x (p0, z0, 1)) + +/* +** mul_1_f32_x_untied: +** fmov z0\.s, #1\.0(?:e\+0)? 
+** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (p0, z1, 1), + z0 = svmul_x (p0, z1, 1)) + +/* +** mul_0p5_f32_x_tied1: +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (p0, z0, 0.5), + z0 = svmul_x (p0, z0, 0.5)) + +/* +** mul_0p5_f32_x_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (p0, z1, 0.5), + z0 = svmul_x (p0, z1, 0.5)) + +/* +** mul_2_f32_x_tied1: +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_f32_x_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** ptrue_mul_f32_x_tied1: +** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f32_x_tied1, svfloat32_t, + z0 = svmul_f32_x (svptrue_b32 (), z0, z1), + z0 = svmul_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_mul_f32_x_tied2: +** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f32_x_tied2, svfloat32_t, + z0 = svmul_f32_x (svptrue_b32 (), z1, z0), + z0 = svmul_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_mul_f32_x_untied: +** fmul z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f32_x_untied, svfloat32_t, + z0 = svmul_f32_x (svptrue_b32 (), z1, z2), + z0 = svmul_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_mul_1_f32_x_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmul z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svmul_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_mul_1_f32_x_untied: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmul z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svmul_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_mul_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svmul_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_mul_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svmul_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_mul_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svmul_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_mul_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svmul_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32_notrap.c new file mode 100644 index 000000000..eb2d240ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f32_notrap.c @@ -0,0 +1,439 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_f32_m_tied1: +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_m_tied1, svfloat32_t, + z0 = svmul_f32_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_m_tied2, svfloat32_t, + z0 = svmul_f32_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_f32_m_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_m_untied, svfloat32_t, + z0 = svmul_f32_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmul_n_f32_m (p0, z0, d4), + z0 = svmul_m (p0, z0, d4)) + +/* +** mul_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_m_untied, svfloat32_t, float, + z0 = svmul_n_f32_m (p0, z1, d4), + z0 = svmul_m (p0, z1, d4)) + +/* +** mul_1_f32_m_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_m_tied1, svfloat32_t, + z0 = svmul_n_f32_m (p0, z0, 1), + z0 = svmul_m (p0, z0, 1)) + +/* +** mul_1_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_m_untied, svfloat32_t, + z0 = svmul_n_f32_m (p0, z1, 1), + z0 = svmul_m (p0, z1, 1)) + +/* +** mul_0p5_f32_m_tied1: +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_m_tied1, svfloat32_t, + z0 = svmul_n_f32_m (p0, z0, 0.5), + z0 = svmul_m (p0, z0, 0.5)) + +/* +** mul_0p5_f32_m_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_m_untied, svfloat32_t, + z0 = svmul_n_f32_m (p0, z1, 0.5), + z0 = svmul_m (p0, z1, 0.5)) + +/* +** mul_2_f32_m_tied1: +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_m_tied1, svfloat32_t, + z0 = svmul_n_f32_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_f32_m_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_m_untied, svfloat32_t, + z0 = svmul_n_f32_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_z_tied1, svfloat32_t, + z0 = svmul_f32_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_f32_z_tied2, svfloat32_t, + z0 = svmul_f32_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f32_z_untied, svfloat32_t, + z0 = svmul_f32_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmul_n_f32_z (p0, z0, d4), + z0 = svmul_z (p0, z0, d4)) + +/* +** mul_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_z_untied, svfloat32_t, float, + z0 = svmul_n_f32_z (p0, z1, d4), + z0 = svmul_z (p0, z1, d4)) + +/* +** mul_1_f32_z_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_z_tied1, svfloat32_t, + z0 = svmul_n_f32_z (p0, z0, 1), + z0 = svmul_z (p0, z0, 1)) + +/* +** mul_1_f32_z_untied: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_z_untied, svfloat32_t, + z0 = svmul_n_f32_z (p0, z1, 1), + z0 = svmul_z (p0, z1, 1)) + +/* +** mul_0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_z_tied1, svfloat32_t, + z0 = svmul_n_f32_z (p0, z0, 0.5), + z0 = svmul_z (p0, z0, 0.5)) + +/* +** mul_0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_z_untied, svfloat32_t, + z0 = svmul_n_f32_z (p0, z1, 0.5), + z0 = svmul_z (p0, z1, 0.5)) + +/* +** mul_2_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_z_tied1, svfloat32_t, + z0 = svmul_n_f32_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_z_untied, svfloat32_t, + z0 = svmul_n_f32_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_f32_x_tied1: +** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (mul_f32_x_tied1, svfloat32_t, + z0 = svmul_f32_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_f32_x_tied2: +** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (mul_f32_x_tied2, svfloat32_t, + z0 = svmul_f32_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_f32_x_untied: +** fmul z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (mul_f32_x_untied, svfloat32_t, + z0 = svmul_f32_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmul z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmul_n_f32_x (p0, z0, d4), + z0 = svmul_x (p0, z0, d4)) + +/* +** mul_s4_f32_x_untied: +** mov (z[0-9]+\.s), s4 +** fmul z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_ZD (mul_s4_f32_x_untied, svfloat32_t, float, + z0 = svmul_n_f32_x (p0, z1, d4), + z0 = svmul_x (p0, z1, d4)) + +/* +** mul_1_f32_x_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmul z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (p0, z0, 1), + z0 = svmul_x (p0, z0, 1)) + +/* +** mul_1_f32_x_untied: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? 
+** fmul z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (p0, z1, 1), + z0 = svmul_x (p0, z1, 1)) + +/* +** mul_0p5_f32_x_tied1: +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (p0, z0, 0.5), + z0 = svmul_x (p0, z0, 0.5)) + +/* +** mul_0p5_f32_x_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (p0, z1, 0.5), + z0 = svmul_x (p0, z1, 0.5)) + +/* +** mul_2_f32_x_tied1: +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_f32_x_untied: +** movprfx z0, z1 +** fmul z0\.s, p0/m, z0\.s, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** ptrue_mul_f32_x_tied1: +** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f32_x_tied1, svfloat32_t, + z0 = svmul_f32_x (svptrue_b32 (), z0, z1), + z0 = svmul_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_mul_f32_x_tied2: +** fmul z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f32_x_tied2, svfloat32_t, + z0 = svmul_f32_x (svptrue_b32 (), z1, z0), + z0 = svmul_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_mul_f32_x_untied: +** fmul z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f32_x_untied, svfloat32_t, + z0 = svmul_f32_x (svptrue_b32 (), z1, z2), + z0 = svmul_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_mul_1_f32_x_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmul z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svmul_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_mul_1_f32_x_untied: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmul z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svmul_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_mul_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svmul_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_mul_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svmul_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_mul_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f32_x_tied1, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svmul_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_mul_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f32_x_untied, svfloat32_t, + z0 = svmul_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svmul_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64.c new file mode 100644 index 000000000..f5654a9f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64.c @@ -0,0 +1,444 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_f64_m_tied1: +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_m_tied1, svfloat64_t, + z0 = svmul_f64_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_f64_m_tied2, svfloat64_t, + z0 = svmul_f64_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_f64_m_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_m_untied, svfloat64_t, + z0 = svmul_f64_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmul_n_f64_m (p0, z0, d4), + z0 = svmul_m (p0, z0, d4)) + +/* +** mul_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_m_untied, svfloat64_t, double, + z0 = svmul_n_f64_m (p0, z1, d4), + z0 = svmul_m (p0, z1, d4)) + +/* +** mul_1_f64_m_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_m_tied1, svfloat64_t, + z0 = svmul_n_f64_m (p0, z0, 1), + z0 = svmul_m (p0, z0, 1)) + +/* +** mul_1_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_m_untied, svfloat64_t, + z0 = svmul_n_f64_m (p0, z1, 1), + z0 = svmul_m (p0, z1, 1)) + +/* +** mul_0p5_f64_m_tied1: +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_m_tied1, svfloat64_t, + z0 = svmul_n_f64_m (p0, z0, 0.5), + z0 = svmul_m (p0, z0, 0.5)) + +/* +** mul_0p5_f64_m_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_m_untied, svfloat64_t, + z0 = svmul_n_f64_m (p0, z1, 0.5), + z0 = svmul_m (p0, z1, 0.5)) + +/* +** mul_2_f64_m_tied1: +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_m_tied1, svfloat64_t, + z0 = svmul_n_f64_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_f64_m_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_m_untied, svfloat64_t, + z0 = svmul_n_f64_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_z_tied1, svfloat64_t, + z0 = svmul_f64_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_z_tied2, svfloat64_t, + z0 = svmul_f64_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f64_z_untied, svfloat64_t, + z0 = svmul_f64_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmul_n_f64_z (p0, z0, d4), + z0 = svmul_z (p0, z0, d4)) + +/* +** mul_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_z_untied, svfloat64_t, double, + z0 = svmul_n_f64_z (p0, z1, d4), + z0 = svmul_z (p0, z1, d4)) + +/* +** mul_1_f64_z_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_z_tied1, svfloat64_t, + z0 = svmul_n_f64_z (p0, z0, 1), + z0 = svmul_z (p0, z0, 1)) + +/* +** mul_1_f64_z_untied: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_z_untied, svfloat64_t, + z0 = svmul_n_f64_z (p0, z1, 1), + z0 = svmul_z (p0, z1, 1)) + +/* +** mul_0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_z_tied1, svfloat64_t, + z0 = svmul_n_f64_z (p0, z0, 0.5), + z0 = svmul_z (p0, z0, 0.5)) + +/* +** mul_0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_z_untied, svfloat64_t, + z0 = svmul_n_f64_z (p0, z1, 0.5), + z0 = svmul_z (p0, z1, 0.5)) + +/* +** mul_2_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_z_tied1, svfloat64_t, + z0 = svmul_n_f64_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_z_untied, svfloat64_t, + z0 = svmul_n_f64_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_f64_x_tied1: +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_x_tied1, svfloat64_t, + z0 = svmul_f64_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_f64_x_tied2: +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_x_tied2, svfloat64_t, + z0 = svmul_f64_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_f64_x_untied: +** ( +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f64_x_untied, svfloat64_t, + z0 = svmul_f64_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmul_n_f64_x (p0, z0, d4), + z0 = svmul_x (p0, z0, d4)) + +/* +** mul_d4_f64_x_untied: +** mov z0\.d, d4 +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_x_untied, svfloat64_t, double, + z0 = svmul_n_f64_x (p0, z1, d4), + z0 = svmul_x (p0, z1, d4)) + +/* +** mul_1_f64_x_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (p0, z0, 1), + z0 = svmul_x (p0, z0, 1)) + +/* +** mul_1_f64_x_untied: +** fmov z0\.d, #1\.0(?:e\+0)? 
+** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (p0, z1, 1), + z0 = svmul_x (p0, z1, 1)) + +/* +** mul_0p5_f64_x_tied1: +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (p0, z0, 0.5), + z0 = svmul_x (p0, z0, 0.5)) + +/* +** mul_0p5_f64_x_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (p0, z1, 0.5), + z0 = svmul_x (p0, z1, 0.5)) + +/* +** mul_2_f64_x_tied1: +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_f64_x_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** ptrue_mul_f64_x_tied1: +** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f64_x_tied1, svfloat64_t, + z0 = svmul_f64_x (svptrue_b64 (), z0, z1), + z0 = svmul_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_mul_f64_x_tied2: +** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f64_x_tied2, svfloat64_t, + z0 = svmul_f64_x (svptrue_b64 (), z1, z0), + z0 = svmul_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_mul_f64_x_untied: +** fmul z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f64_x_untied, svfloat64_t, + z0 = svmul_f64_x (svptrue_b64 (), z1, z2), + z0 = svmul_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_mul_1_f64_x_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmul z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svmul_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_mul_1_f64_x_untied: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmul z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svmul_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_mul_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svmul_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_mul_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svmul_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_mul_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svmul_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_mul_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svmul_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64_notrap.c new file mode 100644 index 000000000..d865618d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_f64_notrap.c @@ -0,0 +1,439 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_f64_m_tied1: +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_m_tied1, svfloat64_t, + z0 = svmul_f64_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_f64_m_tied2, svfloat64_t, + z0 = svmul_f64_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_f64_m_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_m_untied, svfloat64_t, + z0 = svmul_f64_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmul_n_f64_m (p0, z0, d4), + z0 = svmul_m (p0, z0, d4)) + +/* +** mul_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_m_untied, svfloat64_t, double, + z0 = svmul_n_f64_m (p0, z1, d4), + z0 = svmul_m (p0, z1, d4)) + +/* +** mul_1_f64_m_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_m_tied1, svfloat64_t, + z0 = svmul_n_f64_m (p0, z0, 1), + z0 = svmul_m (p0, z0, 1)) + +/* +** mul_1_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_m_untied, svfloat64_t, + z0 = svmul_n_f64_m (p0, z1, 1), + z0 = svmul_m (p0, z1, 1)) + +/* +** mul_0p5_f64_m_tied1: +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_m_tied1, svfloat64_t, + z0 = svmul_n_f64_m (p0, z0, 0.5), + z0 = svmul_m (p0, z0, 0.5)) + +/* +** mul_0p5_f64_m_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_m_untied, svfloat64_t, + z0 = svmul_n_f64_m (p0, z1, 0.5), + z0 = svmul_m (p0, z1, 0.5)) + +/* +** mul_2_f64_m_tied1: +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_m_tied1, svfloat64_t, + z0 = svmul_n_f64_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_f64_m_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_m_untied, svfloat64_t, + z0 = svmul_n_f64_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_z_tied1, svfloat64_t, + z0 = svmul_f64_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_f64_z_tied2, svfloat64_t, + z0 = svmul_f64_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_f64_z_untied, svfloat64_t, + z0 = svmul_f64_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmul_n_f64_z (p0, z0, d4), + z0 = svmul_z (p0, z0, d4)) + +/* +** mul_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_z_untied, svfloat64_t, double, + z0 = svmul_n_f64_z (p0, z1, d4), + z0 = svmul_z (p0, z1, d4)) + +/* +** mul_1_f64_z_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_z_tied1, svfloat64_t, + z0 = svmul_n_f64_z (p0, z0, 1), + z0 = svmul_z (p0, z0, 1)) + +/* +** mul_1_f64_z_untied: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_z_untied, svfloat64_t, + z0 = svmul_n_f64_z (p0, z1, 1), + z0 = svmul_z (p0, z1, 1)) + +/* +** mul_0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_z_tied1, svfloat64_t, + z0 = svmul_n_f64_z (p0, z0, 0.5), + z0 = svmul_z (p0, z0, 0.5)) + +/* +** mul_0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_z_untied, svfloat64_t, + z0 = svmul_n_f64_z (p0, z1, 0.5), + z0 = svmul_z (p0, z1, 0.5)) + +/* +** mul_2_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_z_tied1, svfloat64_t, + z0 = svmul_n_f64_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_z_untied, svfloat64_t, + z0 = svmul_n_f64_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_f64_x_tied1: +** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (mul_f64_x_tied1, svfloat64_t, + z0 = svmul_f64_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_f64_x_tied2: +** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (mul_f64_x_tied2, svfloat64_t, + z0 = svmul_f64_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_f64_x_untied: +** fmul z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (mul_f64_x_untied, svfloat64_t, + z0 = svmul_f64_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmul z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmul_n_f64_x (p0, z0, d4), + z0 = svmul_x (p0, z0, d4)) + +/* +** mul_d4_f64_x_untied: +** mov (z[0-9]+\.d), d4 +** fmul z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZD (mul_d4_f64_x_untied, svfloat64_t, double, + z0 = svmul_n_f64_x (p0, z1, d4), + z0 = svmul_x (p0, z1, d4)) + +/* +** mul_1_f64_x_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmul z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (p0, z0, 1), + z0 = svmul_x (p0, z0, 1)) + +/* +** mul_1_f64_x_untied: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** fmul z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (mul_1_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (p0, z1, 1), + z0 = svmul_x (p0, z1, 1)) + +/* +** mul_0p5_f64_x_tied1: +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (p0, z0, 0.5), + z0 = svmul_x (p0, z0, 0.5)) + +/* +** mul_0p5_f64_x_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (mul_0p5_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (p0, z1, 0.5), + z0 = svmul_x (p0, z1, 0.5)) + +/* +** mul_2_f64_x_tied1: +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_f64_x_untied: +** movprfx z0, z1 +** fmul z0\.d, p0/m, z0\.d, #2\.0 +** ret +*/ +TEST_UNIFORM_Z (mul_2_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** ptrue_mul_f64_x_tied1: +** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f64_x_tied1, svfloat64_t, + z0 = svmul_f64_x (svptrue_b64 (), z0, z1), + z0 = svmul_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_mul_f64_x_tied2: +** fmul z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f64_x_tied2, svfloat64_t, + z0 = svmul_f64_x (svptrue_b64 (), z1, z0), + z0 = svmul_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_mul_f64_x_untied: +** fmul z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_f64_x_untied, svfloat64_t, + z0 = svmul_f64_x (svptrue_b64 (), z1, z2), + z0 = svmul_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_mul_1_f64_x_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmul z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svmul_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_mul_1_f64_x_untied: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmul z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_1_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svmul_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_mul_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svmul_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_mul_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_0p5_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svmul_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_mul_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f64_x_tied1, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svmul_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_mul_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mul_2_f64_x_untied, svfloat64_t, + z0 = svmul_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svmul_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f16.c new file mode 100644 index 000000000..1c7503bfd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f16.c @@ -0,0 +1,114 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_lane_0_f16_tied1: +** fmul z0\.h, z0\.h, z1\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f16_tied1, svfloat16_t, + z0 = svmul_lane_f16 (z0, z1, 0), + z0 = svmul_lane (z0, z1, 0)) + +/* +** mul_lane_0_f16_tied2: +** fmul z0\.h, z1\.h, z0\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f16_tied2, svfloat16_t, + z0 = svmul_lane_f16 (z1, z0, 0), + z0 = svmul_lane (z1, z0, 0)) + +/* +** mul_lane_0_f16_untied: +** fmul z0\.h, z1\.h, z2\.h\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f16_untied, svfloat16_t, + z0 = svmul_lane_f16 (z1, z2, 0), + z0 = svmul_lane (z1, z2, 0)) + +/* +** mul_lane_1_f16: +** fmul z0\.h, z1\.h, z2\.h\[1\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_1_f16, svfloat16_t, + z0 = svmul_lane_f16 (z1, z2, 1), + z0 = svmul_lane (z1, z2, 1)) + +/* +** mul_lane_2_f16: +** fmul z0\.h, z1\.h, z2\.h\[2\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_2_f16, svfloat16_t, + z0 = svmul_lane_f16 (z1, z2, 2), + z0 = svmul_lane (z1, z2, 2)) + +/* +** mul_lane_3_f16: +** fmul z0\.h, z1\.h, z2\.h\[3\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_3_f16, svfloat16_t, + z0 = svmul_lane_f16 (z1, z2, 3), + z0 = svmul_lane (z1, z2, 3)) + +/* +** mul_lane_4_f16: +** fmul z0\.h, z1\.h, z2\.h\[4\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_4_f16, svfloat16_t, + z0 = svmul_lane_f16 (z1, z2, 4), + z0 = svmul_lane (z1, z2, 4)) + +/* +** mul_lane_5_f16: +** fmul z0\.h, z1\.h, z2\.h\[5\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_5_f16, svfloat16_t, + z0 = svmul_lane_f16 (z1, z2, 5), + z0 = svmul_lane (z1, z2, 5)) + +/* +** mul_lane_6_f16: +** fmul z0\.h, z1\.h, z2\.h\[6\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_6_f16, svfloat16_t, + z0 = svmul_lane_f16 (z1, z2, 6), + z0 = svmul_lane (z1, z2, 6)) + +/* +** mul_lane_7_f16: +** fmul z0\.h, z1\.h, z2\.h\[7\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_7_f16, svfloat16_t, + z0 = svmul_lane_f16 (z1, z2, 7), + z0 = svmul_lane (z1, z2, 7)) + +/* +** mul_lane_z7_f16: +** fmul z0\.h, z1\.h, z7\.h\[7\] +** ret +*/ +TEST_DUAL_Z (mul_lane_z7_f16, svfloat16_t, svfloat16_t, + z0 = svmul_lane_f16 (z1, z7, 7), + z0 = svmul_lane (z1, z7, 7)) + +/* +** mul_lane_z8_f16: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** fmul z0\.h, z1\.h, \1\.h\[7\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mul_lane_z8_f16, svfloat16_t, svfloat16_t, z8, + z0 = svmul_lane_f16 (z1, z8, 7), + z0 = svmul_lane (z1, z8, 7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f32.c new file mode 100644 index 000000000..5355e7e0b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f32.c @@ -0,0 +1,78 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_lane_0_f32_tied1: +** fmul z0\.s, z0\.s, z1\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f32_tied1, svfloat32_t, + z0 = svmul_lane_f32 (z0, z1, 0), + z0 = svmul_lane (z0, z1, 0)) + +/* +** mul_lane_0_f32_tied2: +** fmul z0\.s, z1\.s, z0\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f32_tied2, svfloat32_t, + z0 = svmul_lane_f32 (z1, z0, 0), + z0 = svmul_lane (z1, z0, 0)) + +/* +** mul_lane_0_f32_untied: +** fmul z0\.s, z1\.s, z2\.s\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f32_untied, svfloat32_t, + z0 = svmul_lane_f32 (z1, z2, 0), + z0 = svmul_lane (z1, z2, 0)) + +/* +** mul_lane_1_f32: +** fmul z0\.s, z1\.s, z2\.s\[1\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_1_f32, svfloat32_t, + z0 = svmul_lane_f32 (z1, z2, 1), + z0 = svmul_lane (z1, z2, 1)) + +/* +** mul_lane_2_f32: +** fmul z0\.s, z1\.s, z2\.s\[2\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_2_f32, svfloat32_t, + z0 = svmul_lane_f32 (z1, z2, 2), + z0 = svmul_lane (z1, z2, 2)) + +/* +** mul_lane_3_f32: +** fmul z0\.s, z1\.s, z2\.s\[3\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_3_f32, svfloat32_t, + z0 = svmul_lane_f32 (z1, z2, 3), + z0 = svmul_lane (z1, z2, 3)) + +/* +** mul_lane_z7_f32: +** fmul z0\.s, z1\.s, z7\.s\[3\] +** ret +*/ +TEST_DUAL_Z (mul_lane_z7_f32, svfloat32_t, svfloat32_t, + z0 = svmul_lane_f32 (z1, z7, 3), + z0 = svmul_lane (z1, z7, 3)) + +/* +** mul_lane_z8_f32: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** fmul z0\.s, z1\.s, \1\.s\[3\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mul_lane_z8_f32, svfloat32_t, svfloat32_t, z8, + z0 = svmul_lane_f32 (z1, z8, 3), + z0 = svmul_lane (z1, z8, 3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f64.c new file mode 100644 index 000000000..a53a013c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_lane_f64.c @@ -0,0 +1,69 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_lane_0_f64_tied1: +** fmul z0\.d, z0\.d, z1\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f64_tied1, svfloat64_t, + z0 = svmul_lane_f64 (z0, z1, 0), + z0 = svmul_lane (z0, z1, 0)) + +/* +** mul_lane_0_f64_tied2: +** fmul z0\.d, z1\.d, z0\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f64_tied2, svfloat64_t, + z0 = svmul_lane_f64 (z1, z0, 0), + z0 = svmul_lane (z1, z0, 0)) + +/* +** mul_lane_0_f64_untied: +** fmul z0\.d, z1\.d, z2\.d\[0\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_0_f64_untied, svfloat64_t, + z0 = svmul_lane_f64 (z1, z2, 0), + z0 = svmul_lane (z1, z2, 0)) + +/* +** mul_lane_1_f64: +** fmul z0\.d, z1\.d, z2\.d\[1\] +** ret +*/ +TEST_UNIFORM_Z (mul_lane_1_f64, svfloat64_t, + z0 = svmul_lane_f64 (z1, z2, 1), + z0 = svmul_lane (z1, z2, 1)) + +/* +** mul_lane_z7_f64: +** fmul z0\.d, z1\.d, z7\.d\[1\] +** ret +*/ +TEST_DUAL_Z (mul_lane_z7_f64, svfloat64_t, svfloat64_t, + z0 = svmul_lane_f64 (z1, z7, 1), + z0 = svmul_lane (z1, z7, 1)) + +/* +** mul_lane_z15_f64: +** str d15, \[sp, -16\]! +** fmul z0\.d, z1\.d, z15\.d\[1\] +** ldr d15, \[sp\], 16 +** ret +*/ +TEST_DUAL_LANE_REG (mul_lane_z15_f64, svfloat64_t, svfloat64_t, z15, + z0 = svmul_lane_f64 (z1, z15, 1), + z0 = svmul_lane (z1, z15, 1)) + +/* +** mul_lane_z16_f64: +** mov (z[0-9]|z1[0-5])\.d, z16\.d +** fmul z0\.d, z1\.d, \1\.d\[1\] +** ret +*/ +TEST_DUAL_LANE_REG (mul_lane_z16_f64, svfloat64_t, svfloat64_t, z16, + z0 = svmul_lane_f64 (z1, z16, 1), + z0 = svmul_lane (z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c new file mode 100644 index 000000000..aa08bc274 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c @@ -0,0 +1,302 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_s16_m_tied1: +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_s16_m_tied1, svint16_t, + z0 = svmul_s16_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_s16_m_tied2, svint16_t, + z0 = svmul_s16_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_s16_m_untied: +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mul_s16_m_untied, svint16_t, + z0 = svmul_s16_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svmul_n_s16_m (p0, z0, x0), + z0 = svmul_m (p0, z0, x0)) + +/* +** mul_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s16_m_untied, svint16_t, int16_t, + z0 = svmul_n_s16_m (p0, z1, x0), + z0 = svmul_m (p0, z1, x0)) + +/* +** 
mul_2_s16_m_tied1: +** mov (z[0-9]+\.h), #2 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s16_m_tied1, svint16_t, + z0 = svmul_n_s16_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #2 +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s16_m_untied, svint16_t, + z0 = svmul_n_s16_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_m1_s16_m: +** mov (z[0-9]+)\.b, #-1 +** mul z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_m1_s16_m, svint16_t, + z0 = svmul_n_s16_m (p0, z0, -1), + z0 = svmul_m (p0, z0, -1)) + +/* +** mul_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_s16_z_tied1, svint16_t, + z0 = svmul_s16_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_s16_z_tied2, svint16_t, + z0 = svmul_s16_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** mul z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** mul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_s16_z_untied, svint16_t, + z0 = svmul_s16_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svmul_n_s16_z (p0, z0, x0), + z0 = svmul_z (p0, z0, x0)) + +/* +** mul_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mul z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s16_z_untied, svint16_t, int16_t, + z0 = svmul_n_s16_z (p0, z1, x0), + z0 = svmul_z (p0, z1, x0)) + +/* +** mul_2_s16_z_tied1: +** mov (z[0-9]+\.h), #2 +** movprfx z0\.h, p0/z, z0\.h +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s16_z_tied1, svint16_t, + z0 = svmul_n_s16_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_s16_z_untied: +** mov (z[0-9]+\.h), #2 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mul z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_2_s16_z_untied, svint16_t, + z0 = svmul_n_s16_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_s16_x_tied1: +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_s16_x_tied1, svint16_t, + z0 = svmul_s16_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_s16_x_tied2: +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_s16_x_tied2, svint16_t, + z0 = svmul_s16_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_s16_x_untied: +** ( +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** mul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_s16_x_untied, svint16_t, + z0 = svmul_s16_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svmul_n_s16_x (p0, z0, x0), + z0 = svmul_x (p0, z0, x0)) + +/* +** mul_w0_s16_x_untied: +** mov z0\.h, w0 +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s16_x_untied, svint16_t, int16_t, + z0 = svmul_n_s16_x (p0, z1, x0), + z0 = svmul_x 
(p0, z1, x0)) + +/* +** mul_2_s16_x_tied1: +** mul z0\.h, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s16_x_tied1, svint16_t, + z0 = svmul_n_s16_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_s16_x_untied: +** movprfx z0, z1 +** mul z0\.h, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s16_x_untied, svint16_t, + z0 = svmul_n_s16_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** mul_127_s16_x: +** mul z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (mul_127_s16_x, svint16_t, + z0 = svmul_n_s16_x (p0, z0, 127), + z0 = svmul_x (p0, z0, 127)) + +/* +** mul_128_s16_x: +** mov (z[0-9]+\.h), #128 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_128_s16_x, svint16_t, + z0 = svmul_n_s16_x (p0, z0, 128), + z0 = svmul_x (p0, z0, 128)) + +/* +** mul_255_s16_x: +** mov (z[0-9]+\.h), #255 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_255_s16_x, svint16_t, + z0 = svmul_n_s16_x (p0, z0, 255), + z0 = svmul_x (p0, z0, 255)) + +/* +** mul_m1_s16_x: +** mul z0\.h, z0\.h, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_s16_x, svint16_t, + z0 = svmul_n_s16_x (p0, z0, -1), + z0 = svmul_x (p0, z0, -1)) + +/* +** mul_m127_s16_x: +** mul z0\.h, z0\.h, #-127 +** ret +*/ +TEST_UNIFORM_Z (mul_m127_s16_x, svint16_t, + z0 = svmul_n_s16_x (p0, z0, -127), + z0 = svmul_x (p0, z0, -127)) + +/* +** mul_m128_s16_x: +** mul z0\.h, z0\.h, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_m128_s16_x, svint16_t, + z0 = svmul_n_s16_x (p0, z0, -128), + z0 = svmul_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c new file mode 100644 index 000000000..7acf77fdb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c @@ -0,0 +1,302 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_s32_m_tied1: +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_s32_m_tied1, svint32_t, + z0 = svmul_s32_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_s32_m_tied2, svint32_t, + z0 = svmul_s32_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_s32_m_untied: +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mul_s32_m_untied, svint32_t, + z0 = svmul_s32_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svmul_n_s32_m (p0, z0, x0), + z0 = svmul_m (p0, z0, x0)) + +/* +** mul_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s32_m_untied, svint32_t, int32_t, + z0 = svmul_n_s32_m (p0, z1, x0), + z0 = svmul_m (p0, z1, x0)) + +/* +** mul_2_s32_m_tied1: +** mov (z[0-9]+\.s), #2 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s32_m_tied1, svint32_t, + z0 = svmul_n_s32_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #2 +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s32_m_untied, svint32_t, + z0 = svmul_n_s32_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_m1_s32_m: +** mov (z[0-9]+)\.b, #-1 +** mul z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_m1_s32_m, svint32_t, + z0 = svmul_n_s32_m (p0, z0, -1), + z0 = 
svmul_m (p0, z0, -1)) + +/* +** mul_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_s32_z_tied1, svint32_t, + z0 = svmul_s32_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_s32_z_tied2, svint32_t, + z0 = svmul_s32_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** mul z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** mul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_s32_z_untied, svint32_t, + z0 = svmul_s32_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svmul_n_s32_z (p0, z0, x0), + z0 = svmul_z (p0, z0, x0)) + +/* +** mul_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mul z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s32_z_untied, svint32_t, int32_t, + z0 = svmul_n_s32_z (p0, z1, x0), + z0 = svmul_z (p0, z1, x0)) + +/* +** mul_2_s32_z_tied1: +** mov (z[0-9]+\.s), #2 +** movprfx z0\.s, p0/z, z0\.s +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s32_z_tied1, svint32_t, + z0 = svmul_n_s32_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_s32_z_untied: +** mov (z[0-9]+\.s), #2 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mul z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_2_s32_z_untied, svint32_t, + z0 = svmul_n_s32_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_s32_x_tied1: +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_s32_x_tied1, svint32_t, + z0 = svmul_s32_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_s32_x_tied2: +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_s32_x_tied2, svint32_t, + z0 = svmul_s32_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_s32_x_untied: +** ( +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** mul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_s32_x_untied, svint32_t, + z0 = svmul_s32_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svmul_n_s32_x (p0, z0, x0), + z0 = svmul_x (p0, z0, x0)) + +/* +** mul_w0_s32_x_untied: +** mov z0\.s, w0 +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s32_x_untied, svint32_t, int32_t, + z0 = svmul_n_s32_x (p0, z1, x0), + z0 = svmul_x (p0, z1, x0)) + +/* +** mul_2_s32_x_tied1: +** mul z0\.s, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s32_x_tied1, svint32_t, + z0 = svmul_n_s32_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_s32_x_untied: +** movprfx z0, z1 +** mul z0\.s, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s32_x_untied, svint32_t, + z0 = svmul_n_s32_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** mul_127_s32_x: +** mul z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (mul_127_s32_x, svint32_t, + z0 = svmul_n_s32_x (p0, z0, 127), + z0 = svmul_x (p0, z0, 127)) + +/* +** mul_128_s32_x: +** mov (z[0-9]+\.s), #128 +** mul 
z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_128_s32_x, svint32_t, + z0 = svmul_n_s32_x (p0, z0, 128), + z0 = svmul_x (p0, z0, 128)) + +/* +** mul_255_s32_x: +** mov (z[0-9]+\.s), #255 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_255_s32_x, svint32_t, + z0 = svmul_n_s32_x (p0, z0, 255), + z0 = svmul_x (p0, z0, 255)) + +/* +** mul_m1_s32_x: +** mul z0\.s, z0\.s, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_s32_x, svint32_t, + z0 = svmul_n_s32_x (p0, z0, -1), + z0 = svmul_x (p0, z0, -1)) + +/* +** mul_m127_s32_x: +** mul z0\.s, z0\.s, #-127 +** ret +*/ +TEST_UNIFORM_Z (mul_m127_s32_x, svint32_t, + z0 = svmul_n_s32_x (p0, z0, -127), + z0 = svmul_x (p0, z0, -127)) + +/* +** mul_m128_s32_x: +** mul z0\.s, z0\.s, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_m128_s32_x, svint32_t, + z0 = svmul_n_s32_x (p0, z0, -128), + z0 = svmul_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c new file mode 100644 index 000000000..549105f1e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c @@ -0,0 +1,302 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_s64_m_tied1: +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_s64_m_tied1, svint64_t, + z0 = svmul_s64_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_s64_m_tied2, svint64_t, + z0 = svmul_s64_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_s64_m_untied: +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mul_s64_m_untied, svint64_t, + z0 = svmul_s64_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svmul_n_s64_m (p0, z0, x0), + z0 = svmul_m (p0, z0, x0)) + +/* +** mul_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_s64_m_untied, svint64_t, int64_t, + z0 = svmul_n_s64_m (p0, z1, x0), + z0 = svmul_m (p0, z1, x0)) + +/* +** mul_2_s64_m_tied1: +** mov (z[0-9]+\.d), #2 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s64_m_tied1, svint64_t, + z0 = svmul_n_s64_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #2 +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s64_m_untied, svint64_t, + z0 = svmul_n_s64_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_m1_s64_m: +** mov (z[0-9]+)\.b, #-1 +** mul z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_m1_s64_m, svint64_t, + z0 = svmul_n_s64_m (p0, z0, -1), + z0 = svmul_m (p0, z0, -1)) + +/* +** mul_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_s64_z_tied1, svint64_t, + z0 = svmul_s64_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_s64_z_tied2, svint64_t, + z0 = svmul_s64_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** mul z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** mul z0\.d, p0/m, z0\.d, z1\.d +** ) 
+** ret +*/ +TEST_UNIFORM_Z (mul_s64_z_untied, svint64_t, + z0 = svmul_s64_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svmul_n_s64_z (p0, z0, x0), + z0 = svmul_z (p0, z0, x0)) + +/* +** mul_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mul z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_s64_z_untied, svint64_t, int64_t, + z0 = svmul_n_s64_z (p0, z1, x0), + z0 = svmul_z (p0, z1, x0)) + +/* +** mul_2_s64_z_tied1: +** mov (z[0-9]+\.d), #2 +** movprfx z0\.d, p0/z, z0\.d +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s64_z_tied1, svint64_t, + z0 = svmul_n_s64_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_s64_z_untied: +** mov (z[0-9]+\.d), #2 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mul z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_2_s64_z_untied, svint64_t, + z0 = svmul_n_s64_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_s64_x_tied1: +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_s64_x_tied1, svint64_t, + z0 = svmul_s64_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_s64_x_tied2: +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_s64_x_tied2, svint64_t, + z0 = svmul_s64_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_s64_x_untied: +** ( +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** mul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_s64_x_untied, svint64_t, + z0 = svmul_s64_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svmul_n_s64_x (p0, z0, x0), + z0 = svmul_x (p0, z0, x0)) + +/* +** mul_x0_s64_x_untied: +** mov z0\.d, x0 +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_s64_x_untied, svint64_t, int64_t, + z0 = svmul_n_s64_x (p0, z1, x0), + z0 = svmul_x (p0, z1, x0)) + +/* +** mul_2_s64_x_tied1: +** mul z0\.d, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s64_x_tied1, svint64_t, + z0 = svmul_n_s64_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_s64_x_untied: +** movprfx z0, z1 +** mul z0\.d, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s64_x_untied, svint64_t, + z0 = svmul_n_s64_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** mul_127_s64_x: +** mul z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (mul_127_s64_x, svint64_t, + z0 = svmul_n_s64_x (p0, z0, 127), + z0 = svmul_x (p0, z0, 127)) + +/* +** mul_128_s64_x: +** mov (z[0-9]+\.d), #128 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_128_s64_x, svint64_t, + z0 = svmul_n_s64_x (p0, z0, 128), + z0 = svmul_x (p0, z0, 128)) + +/* +** mul_255_s64_x: +** mov (z[0-9]+\.d), #255 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_255_s64_x, svint64_t, + z0 = svmul_n_s64_x (p0, z0, 255), + z0 = svmul_x (p0, z0, 255)) + +/* +** mul_m1_s64_x: +** mul z0\.d, z0\.d, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_s64_x, svint64_t, + z0 = svmul_n_s64_x (p0, z0, -1), + z0 = svmul_x (p0, z0, -1)) + +/* +** mul_m127_s64_x: +** mul z0\.d, z0\.d, #-127 +** ret +*/ +TEST_UNIFORM_Z (mul_m127_s64_x, 
svint64_t, + z0 = svmul_n_s64_x (p0, z0, -127), + z0 = svmul_x (p0, z0, -127)) + +/* +** mul_m128_s64_x: +** mul z0\.d, z0\.d, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_m128_s64_x, svint64_t, + z0 = svmul_n_s64_x (p0, z0, -128), + z0 = svmul_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c new file mode 100644 index 000000000..012e6f250 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c @@ -0,0 +1,300 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_s8_m_tied1: +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_s8_m_tied1, svint8_t, + z0 = svmul_s8_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_s8_m_tied2, svint8_t, + z0 = svmul_s8_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_s8_m_untied: +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mul_s8_m_untied, svint8_t, + z0 = svmul_s8_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svmul_n_s8_m (p0, z0, x0), + z0 = svmul_m (p0, z0, x0)) + +/* +** mul_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s8_m_untied, svint8_t, int8_t, + z0 = svmul_n_s8_m (p0, z1, x0), + z0 = svmul_m (p0, z1, x0)) + +/* +** mul_2_s8_m_tied1: +** mov (z[0-9]+\.b), #2 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s8_m_tied1, svint8_t, + z0 = svmul_n_s8_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #2 +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s8_m_untied, svint8_t, + z0 = svmul_n_s8_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_m1_s8_m: +** mov (z[0-9]+\.b), #-1 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_s8_m, svint8_t, + z0 = svmul_n_s8_m (p0, z0, -1), + z0 = svmul_m (p0, z0, -1)) + +/* +** mul_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_s8_z_tied1, svint8_t, + z0 = svmul_s8_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_s8_z_tied2, svint8_t, + z0 = svmul_s8_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** mul z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** mul z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_s8_z_untied, svint8_t, + z0 = svmul_s8_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svmul_n_s8_z (p0, z0, x0), + z0 = svmul_z (p0, z0, x0)) + +/* +** mul_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mul z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mul z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s8_z_untied, svint8_t, int8_t, + z0 = 
svmul_n_s8_z (p0, z1, x0), + z0 = svmul_z (p0, z1, x0)) + +/* +** mul_2_s8_z_tied1: +** mov (z[0-9]+\.b), #2 +** movprfx z0\.b, p0/z, z0\.b +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s8_z_tied1, svint8_t, + z0 = svmul_n_s8_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_s8_z_untied: +** mov (z[0-9]+\.b), #2 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mul z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mul z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_2_s8_z_untied, svint8_t, + z0 = svmul_n_s8_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_s8_x_tied1: +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_s8_x_tied1, svint8_t, + z0 = svmul_s8_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_s8_x_tied2: +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_s8_x_tied2, svint8_t, + z0 = svmul_s8_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_s8_x_untied: +** ( +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** mul z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_s8_x_untied, svint8_t, + z0 = svmul_s8_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svmul_n_s8_x (p0, z0, x0), + z0 = svmul_x (p0, z0, x0)) + +/* +** mul_w0_s8_x_untied: +** mov z0\.b, w0 +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_s8_x_untied, svint8_t, int8_t, + z0 = svmul_n_s8_x (p0, z1, x0), + z0 = svmul_x (p0, z1, x0)) + +/* +** mul_2_s8_x_tied1: +** mul z0\.b, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s8_x_tied1, svint8_t, + z0 = svmul_n_s8_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_s8_x_untied: +** movprfx z0, z1 +** mul z0\.b, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_s8_x_untied, svint8_t, + z0 = svmul_n_s8_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** mul_127_s8_x: +** mul z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (mul_127_s8_x, svint8_t, + z0 = svmul_n_s8_x (p0, z0, 127), + z0 = svmul_x (p0, z0, 127)) + +/* +** mul_128_s8_x: +** mul z0\.b, z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_128_s8_x, svint8_t, + z0 = svmul_n_s8_x (p0, z0, 128), + z0 = svmul_x (p0, z0, 128)) + +/* +** mul_255_s8_x: +** mul z0\.b, z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_255_s8_x, svint8_t, + z0 = svmul_n_s8_x (p0, z0, 255), + z0 = svmul_x (p0, z0, 255)) + +/* +** mul_m1_s8_x: +** mul z0\.b, z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_s8_x, svint8_t, + z0 = svmul_n_s8_x (p0, z0, -1), + z0 = svmul_x (p0, z0, -1)) + +/* +** mul_m127_s8_x: +** mul z0\.b, z0\.b, #-127 +** ret +*/ +TEST_UNIFORM_Z (mul_m127_s8_x, svint8_t, + z0 = svmul_n_s8_x (p0, z0, -127), + z0 = svmul_x (p0, z0, -127)) + +/* +** mul_m128_s8_x: +** mul z0\.b, z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_m128_s8_x, svint8_t, + z0 = svmul_n_s8_x (p0, z0, -128), + z0 = svmul_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c new file mode 100644 index 000000000..300987eb6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u16.c @@ -0,0 +1,302 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_u16_m_tied1: +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_u16_m_tied1, svuint16_t, + z0 = svmul_u16_m (p0, z0, z1), + 
z0 = svmul_m (p0, z0, z1)) + +/* +** mul_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_u16_m_tied2, svuint16_t, + z0 = svmul_u16_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_u16_m_untied: +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mul_u16_m_untied, svuint16_t, + z0 = svmul_u16_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svmul_n_u16_m (p0, z0, x0), + z0 = svmul_m (p0, z0, x0)) + +/* +** mul_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svmul_n_u16_m (p0, z1, x0), + z0 = svmul_m (p0, z1, x0)) + +/* +** mul_2_u16_m_tied1: +** mov (z[0-9]+\.h), #2 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u16_m_tied1, svuint16_t, + z0 = svmul_n_u16_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #2 +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u16_m_untied, svuint16_t, + z0 = svmul_n_u16_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_m1_u16_m: +** mov (z[0-9]+)\.b, #-1 +** mul z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_m1_u16_m, svuint16_t, + z0 = svmul_n_u16_m (p0, z0, -1), + z0 = svmul_m (p0, z0, -1)) + +/* +** mul_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_u16_z_tied1, svuint16_t, + z0 = svmul_u16_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_u16_z_tied2, svuint16_t, + z0 = svmul_u16_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** mul z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** mul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_u16_z_untied, svuint16_t, + z0 = svmul_u16_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svmul_n_u16_z (p0, z0, x0), + z0 = svmul_z (p0, z0, x0)) + +/* +** mul_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mul z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svmul_n_u16_z (p0, z1, x0), + z0 = svmul_z (p0, z1, x0)) + +/* +** mul_2_u16_z_tied1: +** mov (z[0-9]+\.h), #2 +** movprfx z0\.h, p0/z, z0\.h +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u16_z_tied1, svuint16_t, + z0 = svmul_n_u16_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_u16_z_untied: +** mov (z[0-9]+\.h), #2 +** ( +** movprfx z0\.h, p0/z, z1\.h +** mul z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** mul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_2_u16_z_untied, svuint16_t, + z0 = svmul_n_u16_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_u16_x_tied1: +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z 
(mul_u16_x_tied1, svuint16_t, + z0 = svmul_u16_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_u16_x_tied2: +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mul_u16_x_tied2, svuint16_t, + z0 = svmul_u16_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_u16_x_untied: +** ( +** movprfx z0, z1 +** mul z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** mul z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_u16_x_untied, svuint16_t, + z0 = svmul_u16_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svmul_n_u16_x (p0, z0, x0), + z0 = svmul_x (p0, z0, x0)) + +/* +** mul_w0_u16_x_untied: +** mov z0\.h, w0 +** mul z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svmul_n_u16_x (p0, z1, x0), + z0 = svmul_x (p0, z1, x0)) + +/* +** mul_2_u16_x_tied1: +** mul z0\.h, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u16_x_tied1, svuint16_t, + z0 = svmul_n_u16_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_u16_x_untied: +** movprfx z0, z1 +** mul z0\.h, z0\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u16_x_untied, svuint16_t, + z0 = svmul_n_u16_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** mul_127_u16_x: +** mul z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (mul_127_u16_x, svuint16_t, + z0 = svmul_n_u16_x (p0, z0, 127), + z0 = svmul_x (p0, z0, 127)) + +/* +** mul_128_u16_x: +** mov (z[0-9]+\.h), #128 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_128_u16_x, svuint16_t, + z0 = svmul_n_u16_x (p0, z0, 128), + z0 = svmul_x (p0, z0, 128)) + +/* +** mul_255_u16_x: +** mov (z[0-9]+\.h), #255 +** mul z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_255_u16_x, svuint16_t, + z0 = svmul_n_u16_x (p0, z0, 255), + z0 = svmul_x (p0, z0, 255)) + +/* +** mul_m1_u16_x: +** mul z0\.h, z0\.h, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_u16_x, svuint16_t, + z0 = svmul_n_u16_x (p0, z0, -1), + z0 = svmul_x (p0, z0, -1)) + +/* +** mul_m127_u16_x: +** mul z0\.h, z0\.h, #-127 +** ret +*/ +TEST_UNIFORM_Z (mul_m127_u16_x, svuint16_t, + z0 = svmul_n_u16_x (p0, z0, -127), + z0 = svmul_x (p0, z0, -127)) + +/* +** mul_m128_u16_x: +** mul z0\.h, z0\.h, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_m128_u16_x, svuint16_t, + z0 = svmul_n_u16_x (p0, z0, -128), + z0 = svmul_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c new file mode 100644 index 000000000..288d17b16 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u32.c @@ -0,0 +1,302 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_u32_m_tied1: +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_u32_m_tied1, svuint32_t, + z0 = svmul_u32_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_u32_m_tied2, svuint32_t, + z0 = svmul_u32_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_u32_m_untied: +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mul_u32_m_untied, svuint32_t, + z0 = svmul_u32_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ 
+TEST_UNIFORM_ZX (mul_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svmul_n_u32_m (p0, z0, x0), + z0 = svmul_m (p0, z0, x0)) + +/* +** mul_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svmul_n_u32_m (p0, z1, x0), + z0 = svmul_m (p0, z1, x0)) + +/* +** mul_2_u32_m_tied1: +** mov (z[0-9]+\.s), #2 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u32_m_tied1, svuint32_t, + z0 = svmul_n_u32_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #2 +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u32_m_untied, svuint32_t, + z0 = svmul_n_u32_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_m1_u32_m: +** mov (z[0-9]+)\.b, #-1 +** mul z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_m1_u32_m, svuint32_t, + z0 = svmul_n_u32_m (p0, z0, -1), + z0 = svmul_m (p0, z0, -1)) + +/* +** mul_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_u32_z_tied1, svuint32_t, + z0 = svmul_u32_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_u32_z_tied2, svuint32_t, + z0 = svmul_u32_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** mul z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** mul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_u32_z_untied, svuint32_t, + z0 = svmul_u32_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svmul_n_u32_z (p0, z0, x0), + z0 = svmul_z (p0, z0, x0)) + +/* +** mul_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mul z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svmul_n_u32_z (p0, z1, x0), + z0 = svmul_z (p0, z1, x0)) + +/* +** mul_2_u32_z_tied1: +** mov (z[0-9]+\.s), #2 +** movprfx z0\.s, p0/z, z0\.s +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u32_z_tied1, svuint32_t, + z0 = svmul_n_u32_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_u32_z_untied: +** mov (z[0-9]+\.s), #2 +** ( +** movprfx z0\.s, p0/z, z1\.s +** mul z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** mul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_2_u32_z_untied, svuint32_t, + z0 = svmul_n_u32_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_u32_x_tied1: +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_u32_x_tied1, svuint32_t, + z0 = svmul_u32_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_u32_x_tied2: +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mul_u32_x_tied2, svuint32_t, + z0 = svmul_u32_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_u32_x_untied: +** ( +** movprfx z0, z1 +** mul z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** mul z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_u32_x_untied, svuint32_t, + z0 = svmul_u32_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_w0_u32_x_tied1: 
+** mov (z[0-9]+\.s), w0 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svmul_n_u32_x (p0, z0, x0), + z0 = svmul_x (p0, z0, x0)) + +/* +** mul_w0_u32_x_untied: +** mov z0\.s, w0 +** mul z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svmul_n_u32_x (p0, z1, x0), + z0 = svmul_x (p0, z1, x0)) + +/* +** mul_2_u32_x_tied1: +** mul z0\.s, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u32_x_tied1, svuint32_t, + z0 = svmul_n_u32_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_u32_x_untied: +** movprfx z0, z1 +** mul z0\.s, z0\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u32_x_untied, svuint32_t, + z0 = svmul_n_u32_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** mul_127_u32_x: +** mul z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (mul_127_u32_x, svuint32_t, + z0 = svmul_n_u32_x (p0, z0, 127), + z0 = svmul_x (p0, z0, 127)) + +/* +** mul_128_u32_x: +** mov (z[0-9]+\.s), #128 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_128_u32_x, svuint32_t, + z0 = svmul_n_u32_x (p0, z0, 128), + z0 = svmul_x (p0, z0, 128)) + +/* +** mul_255_u32_x: +** mov (z[0-9]+\.s), #255 +** mul z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_255_u32_x, svuint32_t, + z0 = svmul_n_u32_x (p0, z0, 255), + z0 = svmul_x (p0, z0, 255)) + +/* +** mul_m1_u32_x: +** mul z0\.s, z0\.s, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_u32_x, svuint32_t, + z0 = svmul_n_u32_x (p0, z0, -1), + z0 = svmul_x (p0, z0, -1)) + +/* +** mul_m127_u32_x: +** mul z0\.s, z0\.s, #-127 +** ret +*/ +TEST_UNIFORM_Z (mul_m127_u32_x, svuint32_t, + z0 = svmul_n_u32_x (p0, z0, -127), + z0 = svmul_x (p0, z0, -127)) + +/* +** mul_m128_u32_x: +** mul z0\.s, z0\.s, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_m128_u32_x, svuint32_t, + z0 = svmul_n_u32_x (p0, z0, -128), + z0 = svmul_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c new file mode 100644 index 000000000..f6959dbc7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u64.c @@ -0,0 +1,302 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_u64_m_tied1: +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_u64_m_tied1, svuint64_t, + z0 = svmul_u64_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_u64_m_tied2, svuint64_t, + z0 = svmul_u64_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_u64_m_untied: +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mul_u64_m_untied, svuint64_t, + z0 = svmul_u64_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svmul_n_u64_m (p0, z0, x0), + z0 = svmul_m (p0, z0, x0)) + +/* +** mul_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svmul_n_u64_m (p0, z1, x0), + z0 = svmul_m (p0, z1, x0)) + +/* +** mul_2_u64_m_tied1: +** mov (z[0-9]+\.d), #2 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u64_m_tied1, svuint64_t, + z0 = svmul_n_u64_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** 
mul_2_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #2 +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u64_m_untied, svuint64_t, + z0 = svmul_n_u64_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_m1_u64_m: +** mov (z[0-9]+)\.b, #-1 +** mul z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_m1_u64_m, svuint64_t, + z0 = svmul_n_u64_m (p0, z0, -1), + z0 = svmul_m (p0, z0, -1)) + +/* +** mul_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_u64_z_tied1, svuint64_t, + z0 = svmul_u64_z (p0, z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_u64_z_tied2, svuint64_t, + z0 = svmul_u64_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** mul z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** mul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_u64_z_untied, svuint64_t, + z0 = svmul_u64_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svmul_n_u64_z (p0, z0, x0), + z0 = svmul_z (p0, z0, x0)) + +/* +** mul_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mul z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svmul_n_u64_z (p0, z1, x0), + z0 = svmul_z (p0, z1, x0)) + +/* +** mul_2_u64_z_tied1: +** mov (z[0-9]+\.d), #2 +** movprfx z0\.d, p0/z, z0\.d +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u64_z_tied1, svuint64_t, + z0 = svmul_n_u64_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_u64_z_untied: +** mov (z[0-9]+\.d), #2 +** ( +** movprfx z0\.d, p0/z, z1\.d +** mul z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** mul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_2_u64_z_untied, svuint64_t, + z0 = svmul_n_u64_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_u64_x_tied1: +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_u64_x_tied1, svuint64_t, + z0 = svmul_u64_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_u64_x_tied2: +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mul_u64_x_tied2, svuint64_t, + z0 = svmul_u64_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_u64_x_untied: +** ( +** movprfx z0, z1 +** mul z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** mul z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_u64_x_untied, svuint64_t, + z0 = svmul_u64_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svmul_n_u64_x (p0, z0, x0), + z0 = svmul_x (p0, z0, x0)) + +/* +** mul_x0_u64_x_untied: +** mov z0\.d, x0 +** mul z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mul_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svmul_n_u64_x (p0, z1, x0), + z0 = svmul_x (p0, z1, x0)) + +/* +** mul_2_u64_x_tied1: +** mul z0\.d, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u64_x_tied1, svuint64_t, + z0 = svmul_n_u64_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 
2)) + +/* +** mul_2_u64_x_untied: +** movprfx z0, z1 +** mul z0\.d, z0\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u64_x_untied, svuint64_t, + z0 = svmul_n_u64_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** mul_127_u64_x: +** mul z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (mul_127_u64_x, svuint64_t, + z0 = svmul_n_u64_x (p0, z0, 127), + z0 = svmul_x (p0, z0, 127)) + +/* +** mul_128_u64_x: +** mov (z[0-9]+\.d), #128 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_128_u64_x, svuint64_t, + z0 = svmul_n_u64_x (p0, z0, 128), + z0 = svmul_x (p0, z0, 128)) + +/* +** mul_255_u64_x: +** mov (z[0-9]+\.d), #255 +** mul z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_255_u64_x, svuint64_t, + z0 = svmul_n_u64_x (p0, z0, 255), + z0 = svmul_x (p0, z0, 255)) + +/* +** mul_m1_u64_x: +** mul z0\.d, z0\.d, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_u64_x, svuint64_t, + z0 = svmul_n_u64_x (p0, z0, -1), + z0 = svmul_x (p0, z0, -1)) + +/* +** mul_m127_u64_x: +** mul z0\.d, z0\.d, #-127 +** ret +*/ +TEST_UNIFORM_Z (mul_m127_u64_x, svuint64_t, + z0 = svmul_n_u64_x (p0, z0, -127), + z0 = svmul_x (p0, z0, -127)) + +/* +** mul_m128_u64_x: +** mul z0\.d, z0\.d, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_m128_u64_x, svuint64_t, + z0 = svmul_n_u64_x (p0, z0, -128), + z0 = svmul_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c new file mode 100644 index 000000000..b2745a48f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_u8.c @@ -0,0 +1,300 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mul_u8_m_tied1: +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_u8_m_tied1, svuint8_t, + z0 = svmul_u8_m (p0, z0, z1), + z0 = svmul_m (p0, z0, z1)) + +/* +** mul_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_u8_m_tied2, svuint8_t, + z0 = svmul_u8_m (p0, z1, z0), + z0 = svmul_m (p0, z1, z0)) + +/* +** mul_u8_m_untied: +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mul_u8_m_untied, svuint8_t, + z0 = svmul_u8_m (p0, z1, z2), + z0 = svmul_m (p0, z1, z2)) + +/* +** mul_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svmul_n_u8_m (p0, z0, x0), + z0 = svmul_m (p0, z0, x0)) + +/* +** mul_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svmul_n_u8_m (p0, z1, x0), + z0 = svmul_m (p0, z1, x0)) + +/* +** mul_2_u8_m_tied1: +** mov (z[0-9]+\.b), #2 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u8_m_tied1, svuint8_t, + z0 = svmul_n_u8_m (p0, z0, 2), + z0 = svmul_m (p0, z0, 2)) + +/* +** mul_2_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #2 +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u8_m_untied, svuint8_t, + z0 = svmul_n_u8_m (p0, z1, 2), + z0 = svmul_m (p0, z1, 2)) + +/* +** mul_m1_u8_m: +** mov (z[0-9]+\.b), #-1 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_u8_m, svuint8_t, + z0 = svmul_n_u8_m (p0, z0, -1), + z0 = svmul_m (p0, z0, -1)) + +/* +** mul_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_u8_z_tied1, svuint8_t, + z0 = svmul_u8_z (p0, 
z0, z1), + z0 = svmul_z (p0, z0, z1)) + +/* +** mul_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_u8_z_tied2, svuint8_t, + z0 = svmul_u8_z (p0, z1, z0), + z0 = svmul_z (p0, z1, z0)) + +/* +** mul_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** mul z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** mul z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_u8_z_untied, svuint8_t, + z0 = svmul_u8_z (p0, z1, z2), + z0 = svmul_z (p0, z1, z2)) + +/* +** mul_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svmul_n_u8_z (p0, z0, x0), + z0 = svmul_z (p0, z0, x0)) + +/* +** mul_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mul z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mul z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svmul_n_u8_z (p0, z1, x0), + z0 = svmul_z (p0, z1, x0)) + +/* +** mul_2_u8_z_tied1: +** mov (z[0-9]+\.b), #2 +** movprfx z0\.b, p0/z, z0\.b +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u8_z_tied1, svuint8_t, + z0 = svmul_n_u8_z (p0, z0, 2), + z0 = svmul_z (p0, z0, 2)) + +/* +** mul_2_u8_z_untied: +** mov (z[0-9]+\.b), #2 +** ( +** movprfx z0\.b, p0/z, z1\.b +** mul z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** mul z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_2_u8_z_untied, svuint8_t, + z0 = svmul_n_u8_z (p0, z1, 2), + z0 = svmul_z (p0, z1, 2)) + +/* +** mul_u8_x_tied1: +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_u8_x_tied1, svuint8_t, + z0 = svmul_u8_x (p0, z0, z1), + z0 = svmul_x (p0, z0, z1)) + +/* +** mul_u8_x_tied2: +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mul_u8_x_tied2, svuint8_t, + z0 = svmul_u8_x (p0, z1, z0), + z0 = svmul_x (p0, z1, z0)) + +/* +** mul_u8_x_untied: +** ( +** movprfx z0, z1 +** mul z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** mul z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mul_u8_x_untied, svuint8_t, + z0 = svmul_u8_x (p0, z1, z2), + z0 = svmul_x (p0, z1, z2)) + +/* +** mul_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** mul z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svmul_n_u8_x (p0, z0, x0), + z0 = svmul_x (p0, z0, x0)) + +/* +** mul_w0_u8_x_untied: +** mov z0\.b, w0 +** mul z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mul_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svmul_n_u8_x (p0, z1, x0), + z0 = svmul_x (p0, z1, x0)) + +/* +** mul_2_u8_x_tied1: +** mul z0\.b, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u8_x_tied1, svuint8_t, + z0 = svmul_n_u8_x (p0, z0, 2), + z0 = svmul_x (p0, z0, 2)) + +/* +** mul_2_u8_x_untied: +** movprfx z0, z1 +** mul z0\.b, z0\.b, #2 +** ret +*/ +TEST_UNIFORM_Z (mul_2_u8_x_untied, svuint8_t, + z0 = svmul_n_u8_x (p0, z1, 2), + z0 = svmul_x (p0, z1, 2)) + +/* +** mul_127_u8_x: +** mul z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (mul_127_u8_x, svuint8_t, + z0 = svmul_n_u8_x (p0, z0, 127), + z0 = svmul_x (p0, z0, 127)) + +/* +** mul_128_u8_x: +** mul z0\.b, z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_128_u8_x, svuint8_t, + z0 = svmul_n_u8_x (p0, z0, 128), + z0 = svmul_x (p0, z0, 128)) + +/* +** mul_255_u8_x: +** mul z0\.b, z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_255_u8_x, svuint8_t, + z0 = svmul_n_u8_x 
(p0, z0, 255), + z0 = svmul_x (p0, z0, 255)) + +/* +** mul_m1_u8_x: +** mul z0\.b, z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (mul_m1_u8_x, svuint8_t, + z0 = svmul_n_u8_x (p0, z0, -1), + z0 = svmul_x (p0, z0, -1)) + +/* +** mul_m127_u8_x: +** mul z0\.b, z0\.b, #-127 +** ret +*/ +TEST_UNIFORM_Z (mul_m127_u8_x, svuint8_t, + z0 = svmul_n_u8_x (p0, z0, -127), + z0 = svmul_x (p0, z0, -127)) + +/* +** mul_m128_u8_x: +** mul z0\.b, z0\.b, #-128 +** ret +*/ +TEST_UNIFORM_Z (mul_m128_u8_x, svuint8_t, + z0 = svmul_n_u8_x (p0, z0, -128), + z0 = svmul_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s16.c new file mode 100644 index 000000000..a81532f5d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s16.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulh_s16_m_tied1: +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_m_tied1, svint16_t, + z0 = svmulh_s16_m (p0, z0, z1), + z0 = svmulh_m (p0, z0, z1)) + +/* +** mulh_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smulh z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_m_tied2, svint16_t, + z0 = svmulh_s16_m (p0, z1, z0), + z0 = svmulh_m (p0, z1, z0)) + +/* +** mulh_s16_m_untied: +** movprfx z0, z1 +** smulh z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_m_untied, svint16_t, + z0 = svmulh_s16_m (p0, z1, z2), + z0 = svmulh_m (p0, z1, z2)) + +/* +** mulh_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** smulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svmulh_n_s16_m (p0, z0, x0), + z0 = svmulh_m (p0, z0, x0)) + +/* +** mulh_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** smulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s16_m_untied, svint16_t, int16_t, + z0 = svmulh_n_s16_m (p0, z1, x0), + z0 = svmulh_m (p0, z1, x0)) + +/* +** mulh_11_s16_m_tied1: +** mov (z[0-9]+\.h), #11 +** smulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s16_m_tied1, svint16_t, + z0 = svmulh_n_s16_m (p0, z0, 11), + z0 = svmulh_m (p0, z0, 11)) + +/* +** mulh_11_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx z0, z1 +** smulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s16_m_untied, svint16_t, + z0 = svmulh_n_s16_m (p0, z1, 11), + z0 = svmulh_m (p0, z1, 11)) + +/* +** mulh_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_z_tied1, svint16_t, + z0 = svmulh_s16_z (p0, z0, z1), + z0 = svmulh_z (p0, z0, z1)) + +/* +** mulh_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_z_tied2, svint16_t, + z0 = svmulh_s16_z (p0, z1, z0), + z0 = svmulh_z (p0, z1, z0)) + +/* +** mulh_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** smulh z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_z_untied, svint16_t, + z0 = svmulh_s16_z (p0, z1, z2), + z0 = svmulh_z (p0, z1, z2)) + +/* +** mulh_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** smulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svmulh_n_s16_z (p0, z0, x0), + z0 = svmulh_z (p0, z0, x0)) + +/* +** mulh_w0_s16_z_untied: +** mov 
(z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** smulh z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s16_z_untied, svint16_t, int16_t, + z0 = svmulh_n_s16_z (p0, z1, x0), + z0 = svmulh_z (p0, z1, x0)) + +/* +** mulh_11_s16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** smulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s16_z_tied1, svint16_t, + z0 = svmulh_n_s16_z (p0, z0, 11), + z0 = svmulh_z (p0, z0, 11)) + +/* +** mulh_11_s16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** smulh z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s16_z_untied, svint16_t, + z0 = svmulh_n_s16_z (p0, z1, 11), + z0 = svmulh_z (p0, z1, 11)) + +/* +** mulh_s16_x_tied1: +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_x_tied1, svint16_t, + z0 = svmulh_s16_x (p0, z0, z1), + z0 = svmulh_x (p0, z0, z1)) + +/* +** mulh_s16_x_tied2: +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_x_tied2, svint16_t, + z0 = svmulh_s16_x (p0, z1, z0), + z0 = svmulh_x (p0, z1, z0)) + +/* +** mulh_s16_x_untied: +** ( +** movprfx z0, z1 +** smulh z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_s16_x_untied, svint16_t, + z0 = svmulh_s16_x (p0, z1, z2), + z0 = svmulh_x (p0, z1, z2)) + +/* +** mulh_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** smulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svmulh_n_s16_x (p0, z0, x0), + z0 = svmulh_x (p0, z0, x0)) + +/* +** mulh_w0_s16_x_untied: +** mov z0\.h, w0 +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s16_x_untied, svint16_t, int16_t, + z0 = svmulh_n_s16_x (p0, z1, x0), + z0 = svmulh_x (p0, z1, x0)) + +/* +** mulh_11_s16_x_tied1: +** mov (z[0-9]+\.h), #11 +** smulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s16_x_tied1, svint16_t, + z0 = svmulh_n_s16_x (p0, z0, 11), + z0 = svmulh_x (p0, z0, 11)) + +/* +** mulh_11_s16_x_untied: +** mov z0\.h, #11 +** smulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s16_x_untied, svint16_t, + z0 = svmulh_n_s16_x (p0, z1, 11), + z0 = svmulh_x (p0, z1, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s32.c new file mode 100644 index 000000000..078feeb6a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s32.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulh_s32_m_tied1: +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_m_tied1, svint32_t, + z0 = svmulh_s32_m (p0, z0, z1), + z0 = svmulh_m (p0, z0, z1)) + +/* +** mulh_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smulh z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_m_tied2, svint32_t, + z0 = svmulh_s32_m (p0, z1, z0), + z0 = svmulh_m (p0, z1, z0)) + +/* +** mulh_s32_m_untied: +** movprfx z0, z1 +** smulh z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_m_untied, svint32_t, + z0 = svmulh_s32_m (p0, z1, z2), + z0 = svmulh_m (p0, z1, z2)) + +/* +** mulh_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** smulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s32_m_tied1, 
svint32_t, int32_t, + z0 = svmulh_n_s32_m (p0, z0, x0), + z0 = svmulh_m (p0, z0, x0)) + +/* +** mulh_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** smulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s32_m_untied, svint32_t, int32_t, + z0 = svmulh_n_s32_m (p0, z1, x0), + z0 = svmulh_m (p0, z1, x0)) + +/* +** mulh_11_s32_m_tied1: +** mov (z[0-9]+\.s), #11 +** smulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s32_m_tied1, svint32_t, + z0 = svmulh_n_s32_m (p0, z0, 11), + z0 = svmulh_m (p0, z0, 11)) + +/* +** mulh_11_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** smulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s32_m_untied, svint32_t, + z0 = svmulh_n_s32_m (p0, z1, 11), + z0 = svmulh_m (p0, z1, 11)) + +/* +** mulh_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_z_tied1, svint32_t, + z0 = svmulh_s32_z (p0, z0, z1), + z0 = svmulh_z (p0, z0, z1)) + +/* +** mulh_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_z_tied2, svint32_t, + z0 = svmulh_s32_z (p0, z1, z0), + z0 = svmulh_z (p0, z1, z0)) + +/* +** mulh_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** smulh z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_z_untied, svint32_t, + z0 = svmulh_s32_z (p0, z1, z2), + z0 = svmulh_z (p0, z1, z2)) + +/* +** mulh_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** smulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svmulh_n_s32_z (p0, z0, x0), + z0 = svmulh_z (p0, z0, x0)) + +/* +** mulh_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** smulh z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s32_z_untied, svint32_t, int32_t, + z0 = svmulh_n_s32_z (p0, z1, x0), + z0 = svmulh_z (p0, z1, x0)) + +/* +** mulh_11_s32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** smulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s32_z_tied1, svint32_t, + z0 = svmulh_n_s32_z (p0, z0, 11), + z0 = svmulh_z (p0, z0, 11)) + +/* +** mulh_11_s32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** smulh z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s32_z_untied, svint32_t, + z0 = svmulh_n_s32_z (p0, z1, 11), + z0 = svmulh_z (p0, z1, 11)) + +/* +** mulh_s32_x_tied1: +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_x_tied1, svint32_t, + z0 = svmulh_s32_x (p0, z0, z1), + z0 = svmulh_x (p0, z0, z1)) + +/* +** mulh_s32_x_tied2: +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_x_tied2, svint32_t, + z0 = svmulh_s32_x (p0, z1, z0), + z0 = svmulh_x (p0, z1, z0)) + +/* +** mulh_s32_x_untied: +** ( +** movprfx z0, z1 +** smulh z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_s32_x_untied, svint32_t, + z0 = svmulh_s32_x (p0, z1, z2), + z0 = svmulh_x (p0, z1, z2)) + +/* +** mulh_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** smulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svmulh_n_s32_x 
(p0, z0, x0), + z0 = svmulh_x (p0, z0, x0)) + +/* +** mulh_w0_s32_x_untied: +** mov z0\.s, w0 +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s32_x_untied, svint32_t, int32_t, + z0 = svmulh_n_s32_x (p0, z1, x0), + z0 = svmulh_x (p0, z1, x0)) + +/* +** mulh_11_s32_x_tied1: +** mov (z[0-9]+\.s), #11 +** smulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s32_x_tied1, svint32_t, + z0 = svmulh_n_s32_x (p0, z0, 11), + z0 = svmulh_x (p0, z0, 11)) + +/* +** mulh_11_s32_x_untied: +** mov z0\.s, #11 +** smulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s32_x_untied, svint32_t, + z0 = svmulh_n_s32_x (p0, z1, 11), + z0 = svmulh_x (p0, z1, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s64.c new file mode 100644 index 000000000..a87d4d5ce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s64.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulh_s64_m_tied1: +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_m_tied1, svint64_t, + z0 = svmulh_s64_m (p0, z0, z1), + z0 = svmulh_m (p0, z0, z1)) + +/* +** mulh_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_m_tied2, svint64_t, + z0 = svmulh_s64_m (p0, z1, z0), + z0 = svmulh_m (p0, z1, z0)) + +/* +** mulh_s64_m_untied: +** movprfx z0, z1 +** smulh z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_m_untied, svint64_t, + z0 = svmulh_s64_m (p0, z1, z2), + z0 = svmulh_m (p0, z1, z2)) + +/* +** mulh_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svmulh_n_s64_m (p0, z0, x0), + z0 = svmulh_m (p0, z0, x0)) + +/* +** mulh_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_s64_m_untied, svint64_t, int64_t, + z0 = svmulh_n_s64_m (p0, z1, x0), + z0 = svmulh_m (p0, z1, x0)) + +/* +** mulh_11_s64_m_tied1: +** mov (z[0-9]+\.d), #11 +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s64_m_tied1, svint64_t, + z0 = svmulh_n_s64_m (p0, z0, 11), + z0 = svmulh_m (p0, z0, 11)) + +/* +** mulh_11_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #11 +** movprfx z0, z1 +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s64_m_untied, svint64_t, + z0 = svmulh_n_s64_m (p0, z1, 11), + z0 = svmulh_m (p0, z1, 11)) + +/* +** mulh_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_z_tied1, svint64_t, + z0 = svmulh_s64_z (p0, z0, z1), + z0 = svmulh_z (p0, z0, z1)) + +/* +** mulh_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_z_tied2, svint64_t, + z0 = svmulh_s64_z (p0, z1, z0), + z0 = svmulh_z (p0, z1, z0)) + +/* +** mulh_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** smulh z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_z_untied, svint64_t, + z0 = svmulh_s64_z (p0, z1, z2), + z0 = svmulh_z (p0, z1, z2)) + +/* +** mulh_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_s64_z_tied1, 
svint64_t, int64_t, + z0 = svmulh_n_s64_z (p0, z0, x0), + z0 = svmulh_z (p0, z0, x0)) + +/* +** mulh_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** smulh z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_s64_z_untied, svint64_t, int64_t, + z0 = svmulh_n_s64_z (p0, z1, x0), + z0 = svmulh_z (p0, z1, x0)) + +/* +** mulh_11_s64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s64_z_tied1, svint64_t, + z0 = svmulh_n_s64_z (p0, z0, 11), + z0 = svmulh_z (p0, z0, 11)) + +/* +** mulh_11_s64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** smulh z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s64_z_untied, svint64_t, + z0 = svmulh_n_s64_z (p0, z1, 11), + z0 = svmulh_z (p0, z1, 11)) + +/* +** mulh_s64_x_tied1: +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_x_tied1, svint64_t, + z0 = svmulh_s64_x (p0, z0, z1), + z0 = svmulh_x (p0, z0, z1)) + +/* +** mulh_s64_x_tied2: +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_x_tied2, svint64_t, + z0 = svmulh_s64_x (p0, z1, z0), + z0 = svmulh_x (p0, z1, z0)) + +/* +** mulh_s64_x_untied: +** ( +** movprfx z0, z1 +** smulh z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_s64_x_untied, svint64_t, + z0 = svmulh_s64_x (p0, z1, z2), + z0 = svmulh_x (p0, z1, z2)) + +/* +** mulh_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svmulh_n_s64_x (p0, z0, x0), + z0 = svmulh_x (p0, z0, x0)) + +/* +** mulh_x0_s64_x_untied: +** mov z0\.d, x0 +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_s64_x_untied, svint64_t, int64_t, + z0 = svmulh_n_s64_x (p0, z1, x0), + z0 = svmulh_x (p0, z1, x0)) + +/* +** mulh_11_s64_x_tied1: +** mov (z[0-9]+\.d), #11 +** smulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s64_x_tied1, svint64_t, + z0 = svmulh_n_s64_x (p0, z0, 11), + z0 = svmulh_x (p0, z0, 11)) + +/* +** mulh_11_s64_x_untied: +** mov z0\.d, #11 +** smulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s64_x_untied, svint64_t, + z0 = svmulh_n_s64_x (p0, z1, 11), + z0 = svmulh_x (p0, z1, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s8.c new file mode 100644 index 000000000..f9cd01afd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_s8.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulh_s8_m_tied1: +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_m_tied1, svint8_t, + z0 = svmulh_s8_m (p0, z0, z1), + z0 = svmulh_m (p0, z0, z1)) + +/* +** mulh_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** smulh z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_m_tied2, svint8_t, + z0 = svmulh_s8_m (p0, z1, z0), + z0 = svmulh_m (p0, z1, z0)) + +/* +** mulh_s8_m_untied: +** movprfx z0, z1 +** smulh z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_m_untied, svint8_t, + z0 = svmulh_s8_m (p0, z1, z2), + z0 = svmulh_m (p0, z1, z2)) + +/* +** mulh_w0_s8_m_tied1: 
+** mov (z[0-9]+\.b), w0 +** smulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svmulh_n_s8_m (p0, z0, x0), + z0 = svmulh_m (p0, z0, x0)) + +/* +** mulh_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** smulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s8_m_untied, svint8_t, int8_t, + z0 = svmulh_n_s8_m (p0, z1, x0), + z0 = svmulh_m (p0, z1, x0)) + +/* +** mulh_11_s8_m_tied1: +** mov (z[0-9]+\.b), #11 +** smulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s8_m_tied1, svint8_t, + z0 = svmulh_n_s8_m (p0, z0, 11), + z0 = svmulh_m (p0, z0, 11)) + +/* +** mulh_11_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** smulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s8_m_untied, svint8_t, + z0 = svmulh_n_s8_m (p0, z1, 11), + z0 = svmulh_m (p0, z1, 11)) + +/* +** mulh_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_z_tied1, svint8_t, + z0 = svmulh_s8_z (p0, z0, z1), + z0 = svmulh_z (p0, z0, z1)) + +/* +** mulh_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_z_tied2, svint8_t, + z0 = svmulh_s8_z (p0, z1, z0), + z0 = svmulh_z (p0, z1, z0)) + +/* +** mulh_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** smulh z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_z_untied, svint8_t, + z0 = svmulh_s8_z (p0, z1, z2), + z0 = svmulh_z (p0, z1, z2)) + +/* +** mulh_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** smulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svmulh_n_s8_z (p0, z0, x0), + z0 = svmulh_z (p0, z0, x0)) + +/* +** mulh_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** smulh z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s8_z_untied, svint8_t, int8_t, + z0 = svmulh_n_s8_z (p0, z1, x0), + z0 = svmulh_z (p0, z1, x0)) + +/* +** mulh_11_s8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** smulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s8_z_tied1, svint8_t, + z0 = svmulh_n_s8_z (p0, z0, 11), + z0 = svmulh_z (p0, z0, 11)) + +/* +** mulh_11_s8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** smulh z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s8_z_untied, svint8_t, + z0 = svmulh_n_s8_z (p0, z1, 11), + z0 = svmulh_z (p0, z1, 11)) + +/* +** mulh_s8_x_tied1: +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_x_tied1, svint8_t, + z0 = svmulh_s8_x (p0, z0, z1), + z0 = svmulh_x (p0, z0, z1)) + +/* +** mulh_s8_x_tied2: +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_x_tied2, svint8_t, + z0 = svmulh_s8_x (p0, z1, z0), + z0 = svmulh_x (p0, z1, z0)) + +/* +** mulh_s8_x_untied: +** ( +** movprfx z0, z1 +** smulh z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0, z2 +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_s8_x_untied, svint8_t, + z0 = svmulh_s8_x (p0, z1, z2), + z0 = svmulh_x (p0, z1, z2)) + +/* +** mulh_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** smulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX 
(mulh_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svmulh_n_s8_x (p0, z0, x0), + z0 = svmulh_x (p0, z0, x0)) + +/* +** mulh_w0_s8_x_untied: +** mov z0\.b, w0 +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_s8_x_untied, svint8_t, int8_t, + z0 = svmulh_n_s8_x (p0, z1, x0), + z0 = svmulh_x (p0, z1, x0)) + +/* +** mulh_11_s8_x_tied1: +** mov (z[0-9]+\.b), #11 +** smulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s8_x_tied1, svint8_t, + z0 = svmulh_n_s8_x (p0, z0, 11), + z0 = svmulh_x (p0, z0, 11)) + +/* +** mulh_11_s8_x_untied: +** mov z0\.b, #11 +** smulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_11_s8_x_untied, svint8_t, + z0 = svmulh_n_s8_x (p0, z1, 11), + z0 = svmulh_x (p0, z1, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u16.c new file mode 100644 index 000000000..e9173eb24 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u16.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulh_u16_m_tied1: +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_m_tied1, svuint16_t, + z0 = svmulh_u16_m (p0, z0, z1), + z0 = svmulh_m (p0, z0, z1)) + +/* +** mulh_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umulh z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_m_tied2, svuint16_t, + z0 = svmulh_u16_m (p0, z1, z0), + z0 = svmulh_m (p0, z1, z0)) + +/* +** mulh_u16_m_untied: +** movprfx z0, z1 +** umulh z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_m_untied, svuint16_t, + z0 = svmulh_u16_m (p0, z1, z2), + z0 = svmulh_m (p0, z1, z2)) + +/* +** mulh_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** umulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svmulh_n_u16_m (p0, z0, x0), + z0 = svmulh_m (p0, z0, x0)) + +/* +** mulh_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** umulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svmulh_n_u16_m (p0, z1, x0), + z0 = svmulh_m (p0, z1, x0)) + +/* +** mulh_11_u16_m_tied1: +** mov (z[0-9]+\.h), #11 +** umulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u16_m_tied1, svuint16_t, + z0 = svmulh_n_u16_m (p0, z0, 11), + z0 = svmulh_m (p0, z0, 11)) + +/* +** mulh_11_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #11 +** movprfx z0, z1 +** umulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u16_m_untied, svuint16_t, + z0 = svmulh_n_u16_m (p0, z1, 11), + z0 = svmulh_m (p0, z1, 11)) + +/* +** mulh_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_z_tied1, svuint16_t, + z0 = svmulh_u16_z (p0, z0, z1), + z0 = svmulh_z (p0, z0, z1)) + +/* +** mulh_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_z_tied2, svuint16_t, + z0 = svmulh_u16_z (p0, z1, z0), + z0 = svmulh_z (p0, z1, z0)) + +/* +** mulh_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** umulh z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_z_untied, svuint16_t, + z0 = svmulh_u16_z (p0, z1, z2), + z0 = svmulh_z (p0, z1, z2)) + +/* +** mulh_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** umulh 
z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svmulh_n_u16_z (p0, z0, x0), + z0 = svmulh_z (p0, z0, x0)) + +/* +** mulh_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** umulh z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svmulh_n_u16_z (p0, z1, x0), + z0 = svmulh_z (p0, z1, x0)) + +/* +** mulh_11_u16_z_tied1: +** mov (z[0-9]+\.h), #11 +** movprfx z0\.h, p0/z, z0\.h +** umulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u16_z_tied1, svuint16_t, + z0 = svmulh_n_u16_z (p0, z0, 11), + z0 = svmulh_z (p0, z0, 11)) + +/* +** mulh_11_u16_z_untied: +** mov (z[0-9]+\.h), #11 +** ( +** movprfx z0\.h, p0/z, z1\.h +** umulh z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u16_z_untied, svuint16_t, + z0 = svmulh_n_u16_z (p0, z1, 11), + z0 = svmulh_z (p0, z1, 11)) + +/* +** mulh_u16_x_tied1: +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_x_tied1, svuint16_t, + z0 = svmulh_u16_x (p0, z0, z1), + z0 = svmulh_x (p0, z0, z1)) + +/* +** mulh_u16_x_tied2: +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_x_tied2, svuint16_t, + z0 = svmulh_u16_x (p0, z1, z0), + z0 = svmulh_x (p0, z1, z0)) + +/* +** mulh_u16_x_untied: +** ( +** movprfx z0, z1 +** umulh z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_u16_x_untied, svuint16_t, + z0 = svmulh_u16_x (p0, z1, z2), + z0 = svmulh_x (p0, z1, z2)) + +/* +** mulh_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** umulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svmulh_n_u16_x (p0, z0, x0), + z0 = svmulh_x (p0, z0, x0)) + +/* +** mulh_w0_u16_x_untied: +** mov z0\.h, w0 +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svmulh_n_u16_x (p0, z1, x0), + z0 = svmulh_x (p0, z1, x0)) + +/* +** mulh_11_u16_x_tied1: +** mov (z[0-9]+\.h), #11 +** umulh z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u16_x_tied1, svuint16_t, + z0 = svmulh_n_u16_x (p0, z0, 11), + z0 = svmulh_x (p0, z0, 11)) + +/* +** mulh_11_u16_x_untied: +** mov z0\.h, #11 +** umulh z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u16_x_untied, svuint16_t, + z0 = svmulh_n_u16_x (p0, z1, 11), + z0 = svmulh_x (p0, z1, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u32.c new file mode 100644 index 000000000..de1f24f09 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u32.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulh_u32_m_tied1: +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_m_tied1, svuint32_t, + z0 = svmulh_u32_m (p0, z0, z1), + z0 = svmulh_m (p0, z0, z1)) + +/* +** mulh_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umulh z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_m_tied2, svuint32_t, + z0 = svmulh_u32_m (p0, z1, z0), + z0 = svmulh_m (p0, z1, z0)) + +/* +** mulh_u32_m_untied: +** movprfx z0, z1 +** umulh z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_m_untied, 
svuint32_t, + z0 = svmulh_u32_m (p0, z1, z2), + z0 = svmulh_m (p0, z1, z2)) + +/* +** mulh_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** umulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svmulh_n_u32_m (p0, z0, x0), + z0 = svmulh_m (p0, z0, x0)) + +/* +** mulh_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** umulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svmulh_n_u32_m (p0, z1, x0), + z0 = svmulh_m (p0, z1, x0)) + +/* +** mulh_11_u32_m_tied1: +** mov (z[0-9]+\.s), #11 +** umulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u32_m_tied1, svuint32_t, + z0 = svmulh_n_u32_m (p0, z0, 11), + z0 = svmulh_m (p0, z0, 11)) + +/* +** mulh_11_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #11 +** movprfx z0, z1 +** umulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u32_m_untied, svuint32_t, + z0 = svmulh_n_u32_m (p0, z1, 11), + z0 = svmulh_m (p0, z1, 11)) + +/* +** mulh_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_z_tied1, svuint32_t, + z0 = svmulh_u32_z (p0, z0, z1), + z0 = svmulh_z (p0, z0, z1)) + +/* +** mulh_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_z_tied2, svuint32_t, + z0 = svmulh_u32_z (p0, z1, z0), + z0 = svmulh_z (p0, z1, z0)) + +/* +** mulh_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** umulh z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_z_untied, svuint32_t, + z0 = svmulh_u32_z (p0, z1, z2), + z0 = svmulh_z (p0, z1, z2)) + +/* +** mulh_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** umulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svmulh_n_u32_z (p0, z0, x0), + z0 = svmulh_z (p0, z0, x0)) + +/* +** mulh_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** umulh z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svmulh_n_u32_z (p0, z1, x0), + z0 = svmulh_z (p0, z1, x0)) + +/* +** mulh_11_u32_z_tied1: +** mov (z[0-9]+\.s), #11 +** movprfx z0\.s, p0/z, z0\.s +** umulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u32_z_tied1, svuint32_t, + z0 = svmulh_n_u32_z (p0, z0, 11), + z0 = svmulh_z (p0, z0, 11)) + +/* +** mulh_11_u32_z_untied: +** mov (z[0-9]+\.s), #11 +** ( +** movprfx z0\.s, p0/z, z1\.s +** umulh z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u32_z_untied, svuint32_t, + z0 = svmulh_n_u32_z (p0, z1, 11), + z0 = svmulh_z (p0, z1, 11)) + +/* +** mulh_u32_x_tied1: +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_x_tied1, svuint32_t, + z0 = svmulh_u32_x (p0, z0, z1), + z0 = svmulh_x (p0, z0, z1)) + +/* +** mulh_u32_x_tied2: +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_x_tied2, svuint32_t, + z0 = svmulh_u32_x (p0, z1, z0), + z0 = svmulh_x (p0, z1, z0)) + +/* +** mulh_u32_x_untied: +** ( +** movprfx z0, z1 +** umulh z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_u32_x_untied, svuint32_t, + z0 = 
svmulh_u32_x (p0, z1, z2), + z0 = svmulh_x (p0, z1, z2)) + +/* +** mulh_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** umulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svmulh_n_u32_x (p0, z0, x0), + z0 = svmulh_x (p0, z0, x0)) + +/* +** mulh_w0_u32_x_untied: +** mov z0\.s, w0 +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svmulh_n_u32_x (p0, z1, x0), + z0 = svmulh_x (p0, z1, x0)) + +/* +** mulh_11_u32_x_tied1: +** mov (z[0-9]+\.s), #11 +** umulh z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u32_x_tied1, svuint32_t, + z0 = svmulh_n_u32_x (p0, z0, 11), + z0 = svmulh_x (p0, z0, 11)) + +/* +** mulh_11_u32_x_untied: +** mov z0\.s, #11 +** umulh z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u32_x_untied, svuint32_t, + z0 = svmulh_n_u32_x (p0, z1, 11), + z0 = svmulh_x (p0, z1, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u64.c new file mode 100644 index 000000000..0d7e12a7c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u64.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulh_u64_m_tied1: +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_u64_m_tied1, svuint64_t, + z0 = svmulh_u64_m (p0, z0, z1), + z0 = svmulh_m (p0, z0, z1)) + +/* +** mulh_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_u64_m_tied2, svuint64_t, + z0 = svmulh_u64_m (p0, z1, z0), + z0 = svmulh_m (p0, z1, z0)) + +/* +** mulh_u64_m_untied: +** movprfx z0, z1 +** umulh z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_u64_m_untied, svuint64_t, + z0 = svmulh_u64_m (p0, z1, z2), + z0 = svmulh_m (p0, z1, z2)) + +/* +** mulh_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svmulh_n_u64_m (p0, z0, x0), + z0 = svmulh_m (p0, z0, x0)) + +/* +** mulh_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svmulh_n_u64_m (p0, z1, x0), + z0 = svmulh_m (p0, z1, x0)) + +/* +** mulh_11_u64_m_tied1: +** mov (z[0-9]+\.d), #11 +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u64_m_tied1, svuint64_t, + z0 = svmulh_n_u64_m (p0, z0, 11), + z0 = svmulh_m (p0, z0, 11)) + +/* +** mulh_11_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #11 +** movprfx z0, z1 +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u64_m_untied, svuint64_t, + z0 = svmulh_n_u64_m (p0, z1, 11), + z0 = svmulh_m (p0, z1, 11)) + +/* +** mulh_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_u64_z_tied1, svuint64_t, + z0 = svmulh_u64_z (p0, z0, z1), + z0 = svmulh_z (p0, z0, z1)) + +/* +** mulh_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_u64_z_tied2, svuint64_t, + z0 = svmulh_u64_z (p0, z1, z0), + z0 = svmulh_z (p0, z1, z0)) + +/* +** mulh_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** umulh z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z 
(mulh_u64_z_untied, svuint64_t, + z0 = svmulh_u64_z (p0, z1, z2), + z0 = svmulh_z (p0, z1, z2)) + +/* +** mulh_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svmulh_n_u64_z (p0, z0, x0), + z0 = svmulh_z (p0, z0, x0)) + +/* +** mulh_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** umulh z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svmulh_n_u64_z (p0, z1, x0), + z0 = svmulh_z (p0, z1, x0)) + +/* +** mulh_11_u64_z_tied1: +** mov (z[0-9]+\.d), #11 +** movprfx z0\.d, p0/z, z0\.d +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u64_z_tied1, svuint64_t, + z0 = svmulh_n_u64_z (p0, z0, 11), + z0 = svmulh_z (p0, z0, 11)) + +/* +** mulh_11_u64_z_untied: +** mov (z[0-9]+\.d), #11 +** ( +** movprfx z0\.d, p0/z, z1\.d +** umulh z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u64_z_untied, svuint64_t, + z0 = svmulh_n_u64_z (p0, z1, 11), + z0 = svmulh_z (p0, z1, 11)) + +/* +** mulh_u64_x_tied1: +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_u64_x_tied1, svuint64_t, + z0 = svmulh_u64_x (p0, z0, z1), + z0 = svmulh_x (p0, z0, z1)) + +/* +** mulh_u64_x_tied2: +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_u64_x_tied2, svuint64_t, + z0 = svmulh_u64_x (p0, z1, z0), + z0 = svmulh_x (p0, z1, z0)) + +/* +** mulh_u64_x_untied: +** ( +** movprfx z0, z1 +** umulh z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_u64_x_untied, svuint64_t, + z0 = svmulh_u64_x (p0, z1, z2), + z0 = svmulh_x (p0, z1, z2)) + +/* +** mulh_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svmulh_n_u64_x (p0, z0, x0), + z0 = svmulh_x (p0, z0, x0)) + +/* +** mulh_x0_u64_x_untied: +** mov z0\.d, x0 +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (mulh_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svmulh_n_u64_x (p0, z1, x0), + z0 = svmulh_x (p0, z1, x0)) + +/* +** mulh_11_u64_x_tied1: +** mov (z[0-9]+\.d), #11 +** umulh z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u64_x_tied1, svuint64_t, + z0 = svmulh_n_u64_x (p0, z0, 11), + z0 = svmulh_x (p0, z0, 11)) + +/* +** mulh_11_u64_x_untied: +** mov z0\.d, #11 +** umulh z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u64_x_untied, svuint64_t, + z0 = svmulh_n_u64_x (p0, z1, 11), + z0 = svmulh_x (p0, z1, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u8.c new file mode 100644 index 000000000..db7b1be1b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulh_u8.c @@ -0,0 +1,237 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulh_u8_m_tied1: +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_m_tied1, svuint8_t, + z0 = svmulh_u8_m (p0, z0, z1), + z0 = svmulh_m (p0, z0, z1)) + +/* +** mulh_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** umulh z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_m_tied2, svuint8_t, + z0 = 
svmulh_u8_m (p0, z1, z0), + z0 = svmulh_m (p0, z1, z0)) + +/* +** mulh_u8_m_untied: +** movprfx z0, z1 +** umulh z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_m_untied, svuint8_t, + z0 = svmulh_u8_m (p0, z1, z2), + z0 = svmulh_m (p0, z1, z2)) + +/* +** mulh_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** umulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svmulh_n_u8_m (p0, z0, x0), + z0 = svmulh_m (p0, z0, x0)) + +/* +** mulh_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** umulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svmulh_n_u8_m (p0, z1, x0), + z0 = svmulh_m (p0, z1, x0)) + +/* +** mulh_11_u8_m_tied1: +** mov (z[0-9]+\.b), #11 +** umulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u8_m_tied1, svuint8_t, + z0 = svmulh_n_u8_m (p0, z0, 11), + z0 = svmulh_m (p0, z0, 11)) + +/* +** mulh_11_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #11 +** movprfx z0, z1 +** umulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u8_m_untied, svuint8_t, + z0 = svmulh_n_u8_m (p0, z1, 11), + z0 = svmulh_m (p0, z1, 11)) + +/* +** mulh_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_z_tied1, svuint8_t, + z0 = svmulh_u8_z (p0, z0, z1), + z0 = svmulh_z (p0, z0, z1)) + +/* +** mulh_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_z_tied2, svuint8_t, + z0 = svmulh_u8_z (p0, z1, z0), + z0 = svmulh_z (p0, z1, z0)) + +/* +** mulh_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** umulh z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_z_untied, svuint8_t, + z0 = svmulh_u8_z (p0, z1, z2), + z0 = svmulh_z (p0, z1, z2)) + +/* +** mulh_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** umulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svmulh_n_u8_z (p0, z0, x0), + z0 = svmulh_z (p0, z0, x0)) + +/* +** mulh_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** umulh z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svmulh_n_u8_z (p0, z1, x0), + z0 = svmulh_z (p0, z1, x0)) + +/* +** mulh_11_u8_z_tied1: +** mov (z[0-9]+\.b), #11 +** movprfx z0\.b, p0/z, z0\.b +** umulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u8_z_tied1, svuint8_t, + z0 = svmulh_n_u8_z (p0, z0, 11), + z0 = svmulh_z (p0, z0, 11)) + +/* +** mulh_11_u8_z_untied: +** mov (z[0-9]+\.b), #11 +** ( +** movprfx z0\.b, p0/z, z1\.b +** umulh z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u8_z_untied, svuint8_t, + z0 = svmulh_n_u8_z (p0, z1, 11), + z0 = svmulh_z (p0, z1, 11)) + +/* +** mulh_u8_x_tied1: +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_x_tied1, svuint8_t, + z0 = svmulh_u8_x (p0, z0, z1), + z0 = svmulh_x (p0, z0, z1)) + +/* +** mulh_u8_x_tied2: +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_x_tied2, svuint8_t, + z0 = svmulh_u8_x (p0, z1, z0), + z0 = svmulh_x (p0, z1, z0)) + +/* +** mulh_u8_x_untied: +** ( +** movprfx z0, z1 +** umulh z0\.b, p0/m, z0\.b, 
z2\.b +** | +** movprfx z0, z2 +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (mulh_u8_x_untied, svuint8_t, + z0 = svmulh_u8_x (p0, z1, z2), + z0 = svmulh_x (p0, z1, z2)) + +/* +** mulh_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** umulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svmulh_n_u8_x (p0, z0, x0), + z0 = svmulh_x (p0, z0, x0)) + +/* +** mulh_w0_u8_x_untied: +** mov z0\.b, w0 +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (mulh_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svmulh_n_u8_x (p0, z1, x0), + z0 = svmulh_x (p0, z1, x0)) + +/* +** mulh_11_u8_x_tied1: +** mov (z[0-9]+\.b), #11 +** umulh z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u8_x_tied1, svuint8_t, + z0 = svmulh_n_u8_x (p0, z0, 11), + z0 = svmulh_x (p0, z0, 11)) + +/* +** mulh_11_u8_x_untied: +** mov z0\.b, #11 +** umulh z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (mulh_11_u8_x_untied, svuint8_t, + z0 = svmulh_n_u8_x (p0, z1, 11), + z0 = svmulh_x (p0, z1, 11)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f16.c new file mode 100644 index 000000000..ce02c3caa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f16.c @@ -0,0 +1,472 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulx_f16_m_tied1: +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_m_tied1, svfloat16_t, + z0 = svmulx_f16_m (p0, z0, z1), + z0 = svmulx_m (p0, z0, z1)) + +/* +** mulx_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmulx z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_m_tied2, svfloat16_t, + z0 = svmulx_f16_m (p0, z1, z0), + z0 = svmulx_m (p0, z1, z0)) + +/* +** mulx_f16_m_untied: +** movprfx z0, z1 +** fmulx z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_m_untied, svfloat16_t, + z0 = svmulx_f16_m (p0, z1, z2), + z0 = svmulx_m (p0, z1, z2)) + +/* +** mulx_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svmulx_n_f16_m (p0, z0, d4), + z0 = svmulx_m (p0, z0, d4)) + +/* +** mulx_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svmulx_n_f16_m (p0, z1, d4), + z0 = svmulx_m (p0, z1, d4)) + +/* +** mulx_1_f16_m_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f16_m_tied1, svfloat16_t, + z0 = svmulx_n_f16_m (p0, z0, 1), + z0 = svmulx_m (p0, z0, 1)) + +/* +** mulx_1_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f16_m_untied, svfloat16_t, + z0 = svmulx_n_f16_m (p0, z1, 1), + z0 = svmulx_m (p0, z1, 1)) + +/* +** mulx_0p5_f16_m_tied1: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f16_m_tied1, svfloat16_t, + z0 = svmulx_n_f16_m (p0, z0, 0.5), + z0 = svmulx_m (p0, z0, 0.5)) + +/* +** mulx_0p5_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** movprfx z0, z1 +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f16_m_untied, svfloat16_t, + z0 = svmulx_n_f16_m (p0, z1, 0.5), + z0 = svmulx_m (p0, z1, 0.5)) + +/* +** mulx_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f16_m_tied1, svfloat16_t, + z0 = svmulx_n_f16_m (p0, z0, 2), + z0 = svmulx_m (p0, z0, 2)) + +/* +** mulx_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0, z1 +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f16_m_untied, svfloat16_t, + z0 = svmulx_n_f16_m (p0, z1, 2), + z0 = svmulx_m (p0, z1, 2)) + +/* +** mulx_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_z_tied1, svfloat16_t, + z0 = svmulx_f16_z (p0, z0, z1), + z0 = svmulx_z (p0, z0, z1)) + +/* +** mulx_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_z_tied2, svfloat16_t, + z0 = svmulx_f16_z (p0, z1, z0), + z0 = svmulx_z (p0, z1, z0)) + +/* +** mulx_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmulx z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_z_untied, svfloat16_t, + z0 = svmulx_f16_z (p0, z1, z2), + z0 = svmulx_z (p0, z1, z2)) + +/* +** mulx_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svmulx_n_f16_z (p0, z0, d4), + z0 = svmulx_z (p0, z0, d4)) + +/* +** mulx_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmulx z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (mulx_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svmulx_n_f16_z (p0, z1, d4), + z0 = svmulx_z (p0, z1, d4)) + +/* +** mulx_1_f16_z_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f16_z_tied1, svfloat16_t, + z0 = svmulx_n_f16_z (p0, z0, 1), + z0 = svmulx_z (p0, z0, 1)) + +/* +** mulx_1_f16_z_untied: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fmulx z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f16_z_untied, svfloat16_t, + z0 = svmulx_n_f16_z (p0, z1, 1), + z0 = svmulx_z (p0, z1, 1)) + +/* +** mulx_0p5_f16_z_tied1: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** movprfx z0\.h, p0/z, z0\.h +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f16_z_tied1, svfloat16_t, + z0 = svmulx_n_f16_z (p0, z0, 0.5), + z0 = svmulx_z (p0, z0, 0.5)) + +/* +** mulx_0p5_f16_z_untied: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmulx z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f16_z_untied, svfloat16_t, + z0 = svmulx_n_f16_z (p0, z1, 0.5), + z0 = svmulx_z (p0, z1, 0.5)) + +/* +** mulx_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f16_z_tied1, svfloat16_t, + z0 = svmulx_n_f16_z (p0, z0, 2), + z0 = svmulx_z (p0, z0, 2)) + +/* +** mulx_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** ( +** movprfx z0\.h, p0/z, z1\.h +** fmulx z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f16_z_untied, svfloat16_t, + z0 = svmulx_n_f16_z (p0, z1, 2), + z0 = svmulx_z (p0, z1, 2)) + +/* +** mulx_f16_x_tied1: +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_x_tied1, svfloat16_t, + z0 = svmulx_f16_x (p0, z0, z1), + z0 = svmulx_x (p0, z0, z1)) + +/* +** mulx_f16_x_tied2: +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_x_tied2, svfloat16_t, + z0 = svmulx_f16_x (p0, z1, z0), + z0 = svmulx_x (p0, z1, z0)) + +/* +** mulx_f16_x_untied: +** ( +** movprfx z0, z1 +** fmulx z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_f16_x_untied, svfloat16_t, + z0 = svmulx_f16_x (p0, z1, z2), + z0 = svmulx_x (p0, z1, z2)) + +/* +** mulx_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svmulx_n_f16_x (p0, z0, d4), + z0 = svmulx_x (p0, z0, d4)) + +/* +** mulx_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (mulx_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svmulx_n_f16_x (p0, z1, d4), + z0 = svmulx_x (p0, z1, d4)) + +/* +** mulx_1_f16_x_tied1: +** fmov (z[0-9]+\.h), #1\.0(?:e\+0)? +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f16_x_tied1, svfloat16_t, + z0 = svmulx_n_f16_x (p0, z0, 1), + z0 = svmulx_x (p0, z0, 1)) + +/* +** mulx_1_f16_x_untied: +** fmov z0\.h, #1\.0(?:e\+0)? 
+** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f16_x_untied, svfloat16_t, + z0 = svmulx_n_f16_x (p0, z1, 1), + z0 = svmulx_x (p0, z1, 1)) + +/* +** mulx_0p5_f16_x_tied1: +** fmov (z[0-9]+\.h), #(?:0\.5|5\.0e-1) +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f16_x_tied1, svfloat16_t, + z0 = svmulx_n_f16_x (p0, z0, 0.5), + z0 = svmulx_x (p0, z0, 0.5)) + +/* +** mulx_0p5_f16_x_untied: +** fmov z0\.h, #(?:0\.5|5\.0e-1) +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f16_x_untied, svfloat16_t, + z0 = svmulx_n_f16_x (p0, z1, 0.5), + z0 = svmulx_x (p0, z1, 0.5)) + +/* +** mulx_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fmulx z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f16_x_tied1, svfloat16_t, + z0 = svmulx_n_f16_x (p0, z0, 2), + z0 = svmulx_x (p0, z0, 2)) + +/* +** mulx_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fmulx z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f16_x_untied, svfloat16_t, + z0 = svmulx_n_f16_x (p0, z1, 2), + z0 = svmulx_x (p0, z1, 2)) + +/* +** ptrue_mulx_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f16_x_tied1, svfloat16_t, + z0 = svmulx_f16_x (svptrue_b16 (), z0, z1), + z0 = svmulx_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_mulx_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f16_x_tied2, svfloat16_t, + z0 = svmulx_f16_x (svptrue_b16 (), z1, z0), + z0 = svmulx_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_mulx_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f16_x_untied, svfloat16_t, + z0 = svmulx_f16_x (svptrue_b16 (), z1, z2), + z0 = svmulx_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_mulx_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_1_f16_x_tied1, svfloat16_t, + z0 = svmulx_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svmulx_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_mulx_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_1_f16_x_untied, svfloat16_t, + z0 = svmulx_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svmulx_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_mulx_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_0p5_f16_x_tied1, svfloat16_t, + z0 = svmulx_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svmulx_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_mulx_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_0p5_f16_x_untied, svfloat16_t, + z0 = svmulx_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svmulx_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_mulx_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_2_f16_x_tied1, svfloat16_t, + z0 = svmulx_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svmulx_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_mulx_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_2_f16_x_untied, svfloat16_t, + z0 = svmulx_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svmulx_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f32.c new file mode 100644 index 000000000..e0d369593 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f32.c @@ -0,0 +1,472 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulx_f32_m_tied1: +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_m_tied1, svfloat32_t, + z0 = svmulx_f32_m (p0, z0, z1), + z0 = svmulx_m (p0, z0, z1)) + +/* +** mulx_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fmulx z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_m_tied2, svfloat32_t, + z0 = svmulx_f32_m (p0, z1, z0), + z0 = svmulx_m (p0, z1, z0)) + +/* +** mulx_f32_m_untied: +** movprfx z0, z1 +** fmulx z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_m_untied, svfloat32_t, + z0 = svmulx_f32_m (p0, z1, z2), + z0 = svmulx_m (p0, z1, z2)) + +/* +** mulx_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_s4_f32_m_tied1, svfloat32_t, float, + z0 = svmulx_n_f32_m (p0, z0, d4), + z0 = svmulx_m (p0, z0, d4)) + +/* +** mulx_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_s4_f32_m_untied, svfloat32_t, float, + z0 = svmulx_n_f32_m (p0, z1, d4), + z0 = svmulx_m (p0, z1, d4)) + +/* +** mulx_1_f32_m_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f32_m_tied1, svfloat32_t, + z0 = svmulx_n_f32_m (p0, z0, 1), + z0 = svmulx_m (p0, z0, 1)) + +/* +** mulx_1_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** movprfx z0, z1 +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f32_m_untied, svfloat32_t, + z0 = svmulx_n_f32_m (p0, z1, 1), + z0 = svmulx_m (p0, z1, 1)) + +/* +** mulx_0p5_f32_m_tied1: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f32_m_tied1, svfloat32_t, + z0 = svmulx_n_f32_m (p0, z0, 0.5), + z0 = svmulx_m (p0, z0, 0.5)) + +/* +** mulx_0p5_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** movprfx z0, z1 +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f32_m_untied, svfloat32_t, + z0 = svmulx_n_f32_m (p0, z1, 0.5), + z0 = svmulx_m (p0, z1, 0.5)) + +/* +** mulx_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f32_m_tied1, svfloat32_t, + z0 = svmulx_n_f32_m (p0, z0, 2), + z0 = svmulx_m (p0, z0, 2)) + +/* +** mulx_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f32_m_untied, svfloat32_t, + z0 = svmulx_n_f32_m (p0, z1, 2), + z0 = svmulx_m (p0, z1, 2)) + +/* +** mulx_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_z_tied1, svfloat32_t, + z0 = svmulx_f32_z (p0, z0, z1), + z0 = svmulx_z (p0, z0, z1)) + +/* +** mulx_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_z_tied2, svfloat32_t, + z0 = svmulx_f32_z (p0, z1, z0), + z0 = svmulx_z (p0, z1, z0)) + +/* +** mulx_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmulx z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_z_untied, svfloat32_t, + z0 = svmulx_f32_z (p0, z1, z2), + z0 = svmulx_z (p0, z1, z2)) + +/* +** mulx_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_s4_f32_z_tied1, svfloat32_t, float, + z0 = svmulx_n_f32_z (p0, z0, d4), + z0 = svmulx_z (p0, z0, d4)) + +/* +** mulx_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmulx z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (mulx_s4_f32_z_untied, svfloat32_t, float, + z0 = svmulx_n_f32_z (p0, z1, d4), + z0 = svmulx_z (p0, z1, d4)) + +/* +** mulx_1_f32_z_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f32_z_tied1, svfloat32_t, + z0 = svmulx_n_f32_z (p0, z0, 1), + z0 = svmulx_z (p0, z0, 1)) + +/* +** mulx_1_f32_z_untied: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmulx z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f32_z_untied, svfloat32_t, + z0 = svmulx_n_f32_z (p0, z1, 1), + z0 = svmulx_z (p0, z1, 1)) + +/* +** mulx_0p5_f32_z_tied1: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** movprfx z0\.s, p0/z, z0\.s +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f32_z_tied1, svfloat32_t, + z0 = svmulx_n_f32_z (p0, z0, 0.5), + z0 = svmulx_z (p0, z0, 0.5)) + +/* +** mulx_0p5_f32_z_untied: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.s, p0/z, z1\.s +** fmulx z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f32_z_untied, svfloat32_t, + z0 = svmulx_n_f32_z (p0, z1, 0.5), + z0 = svmulx_z (p0, z1, 0.5)) + +/* +** mulx_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f32_z_tied1, svfloat32_t, + z0 = svmulx_n_f32_z (p0, z0, 2), + z0 = svmulx_z (p0, z0, 2)) + +/* +** mulx_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fmulx z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f32_z_untied, svfloat32_t, + z0 = svmulx_n_f32_z (p0, z1, 2), + z0 = svmulx_z (p0, z1, 2)) + +/* +** mulx_f32_x_tied1: +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_x_tied1, svfloat32_t, + z0 = svmulx_f32_x (p0, z0, z1), + z0 = svmulx_x (p0, z0, z1)) + +/* +** mulx_f32_x_tied2: +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_x_tied2, svfloat32_t, + z0 = svmulx_f32_x (p0, z1, z0), + z0 = svmulx_x (p0, z1, z0)) + +/* +** mulx_f32_x_untied: +** ( +** movprfx z0, z1 +** fmulx z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_f32_x_untied, svfloat32_t, + z0 = svmulx_f32_x (p0, z1, z2), + z0 = svmulx_x (p0, z1, z2)) + +/* +** mulx_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_s4_f32_x_tied1, svfloat32_t, float, + z0 = svmulx_n_f32_x (p0, z0, d4), + z0 = svmulx_x (p0, z0, d4)) + +/* +** mulx_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (mulx_s4_f32_x_untied, svfloat32_t, float, + z0 = svmulx_n_f32_x (p0, z1, d4), + z0 = svmulx_x (p0, z1, d4)) + +/* +** mulx_1_f32_x_tied1: +** fmov (z[0-9]+\.s), #1\.0(?:e\+0)? +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f32_x_tied1, svfloat32_t, + z0 = svmulx_n_f32_x (p0, z0, 1), + z0 = svmulx_x (p0, z0, 1)) + +/* +** mulx_1_f32_x_untied: +** fmov z0\.s, #1\.0(?:e\+0)? +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f32_x_untied, svfloat32_t, + z0 = svmulx_n_f32_x (p0, z1, 1), + z0 = svmulx_x (p0, z1, 1)) + +/* +** mulx_0p5_f32_x_tied1: +** fmov (z[0-9]+\.s), #(?:0\.5|5\.0e-1) +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f32_x_tied1, svfloat32_t, + z0 = svmulx_n_f32_x (p0, z0, 0.5), + z0 = svmulx_x (p0, z0, 0.5)) + +/* +** mulx_0p5_f32_x_untied: +** fmov z0\.s, #(?:0\.5|5\.0e-1) +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f32_x_untied, svfloat32_t, + z0 = svmulx_n_f32_x (p0, z1, 0.5), + z0 = svmulx_x (p0, z1, 0.5)) + +/* +** mulx_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fmulx z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f32_x_tied1, svfloat32_t, + z0 = svmulx_n_f32_x (p0, z0, 2), + z0 = svmulx_x (p0, z0, 2)) + +/* +** mulx_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fmulx z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f32_x_untied, svfloat32_t, + z0 = svmulx_n_f32_x (p0, z1, 2), + z0 = svmulx_x (p0, z1, 2)) + +/* +** ptrue_mulx_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f32_x_tied1, svfloat32_t, + z0 = svmulx_f32_x (svptrue_b32 (), z0, z1), + z0 = svmulx_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_mulx_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f32_x_tied2, svfloat32_t, + z0 = svmulx_f32_x (svptrue_b32 (), z1, z0), + z0 = svmulx_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_mulx_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f32_x_untied, svfloat32_t, + z0 = svmulx_f32_x (svptrue_b32 (), z1, z2), + z0 = svmulx_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_mulx_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_1_f32_x_tied1, svfloat32_t, + z0 = svmulx_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svmulx_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_mulx_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_1_f32_x_untied, svfloat32_t, + z0 = svmulx_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svmulx_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_mulx_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_0p5_f32_x_tied1, svfloat32_t, + z0 = svmulx_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svmulx_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_mulx_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_0p5_f32_x_untied, svfloat32_t, + z0 = svmulx_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svmulx_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_mulx_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_2_f32_x_tied1, svfloat32_t, + z0 = svmulx_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svmulx_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_mulx_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_2_f32_x_untied, svfloat32_t, + z0 = svmulx_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svmulx_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f64.c new file mode 100644 index 000000000..6af5703ff --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mulx_f64.c @@ -0,0 +1,472 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** mulx_f64_m_tied1: +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_m_tied1, svfloat64_t, + z0 = svmulx_f64_m (p0, z0, z1), + z0 = svmulx_m (p0, z0, z1)) + +/* +** mulx_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_m_tied2, svfloat64_t, + z0 = svmulx_f64_m (p0, z1, z0), + z0 = svmulx_m (p0, z1, z0)) + +/* +** mulx_f64_m_untied: +** movprfx z0, z1 +** fmulx z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_m_untied, svfloat64_t, + z0 = svmulx_f64_m (p0, z1, z2), + z0 = svmulx_m (p0, z1, z2)) + +/* +** mulx_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_d4_f64_m_tied1, svfloat64_t, double, + z0 = svmulx_n_f64_m (p0, z0, d4), + z0 = svmulx_m (p0, z0, d4)) + +/* +** mulx_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_d4_f64_m_untied, svfloat64_t, double, + z0 = svmulx_n_f64_m (p0, z1, d4), + z0 = svmulx_m (p0, z1, d4)) + +/* +** mulx_1_f64_m_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f64_m_tied1, svfloat64_t, + z0 = svmulx_n_f64_m (p0, z0, 1), + z0 = svmulx_m (p0, z0, 1)) + +/* +** mulx_1_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** movprfx z0, z1 +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f64_m_untied, svfloat64_t, + z0 = svmulx_n_f64_m (p0, z1, 1), + z0 = svmulx_m (p0, z1, 1)) + +/* +** mulx_0p5_f64_m_tied1: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f64_m_tied1, svfloat64_t, + z0 = svmulx_n_f64_m (p0, z0, 0.5), + z0 = svmulx_m (p0, z0, 0.5)) + +/* +** mulx_0p5_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** movprfx z0, z1 +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f64_m_untied, svfloat64_t, + z0 = svmulx_n_f64_m (p0, z1, 0.5), + z0 = svmulx_m (p0, z1, 0.5)) + +/* +** mulx_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f64_m_tied1, svfloat64_t, + z0 = svmulx_n_f64_m (p0, z0, 2), + z0 = svmulx_m (p0, z0, 2)) + +/* +** mulx_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0, z1 +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f64_m_untied, svfloat64_t, + z0 = svmulx_n_f64_m (p0, z1, 2), + z0 = svmulx_m (p0, z1, 2)) + +/* +** mulx_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_z_tied1, svfloat64_t, + z0 = svmulx_f64_z (p0, z0, z1), + z0 = svmulx_z (p0, z0, z1)) + +/* +** mulx_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_z_tied2, svfloat64_t, + z0 = svmulx_f64_z (p0, z1, z0), + z0 = svmulx_z (p0, z1, z0)) + +/* +** mulx_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmulx z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_z_untied, svfloat64_t, + z0 = svmulx_f64_z (p0, z1, z2), + z0 = svmulx_z (p0, z1, z2)) + +/* +** mulx_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_d4_f64_z_tied1, svfloat64_t, double, + z0 = svmulx_n_f64_z (p0, z0, d4), + z0 = svmulx_z (p0, z0, d4)) + +/* +** mulx_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmulx z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (mulx_d4_f64_z_untied, svfloat64_t, double, + z0 = svmulx_n_f64_z (p0, z1, d4), + z0 = svmulx_z (p0, z1, d4)) + +/* +** mulx_1_f64_z_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f64_z_tied1, svfloat64_t, + z0 = svmulx_n_f64_z (p0, z0, 1), + z0 = svmulx_z (p0, z0, 1)) + +/* +** mulx_1_f64_z_untied: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fmulx z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f64_z_untied, svfloat64_t, + z0 = svmulx_n_f64_z (p0, z1, 1), + z0 = svmulx_z (p0, z1, 1)) + +/* +** mulx_0p5_f64_z_tied1: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** movprfx z0\.d, p0/z, z0\.d +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f64_z_tied1, svfloat64_t, + z0 = svmulx_n_f64_z (p0, z0, 0.5), + z0 = svmulx_z (p0, z0, 0.5)) + +/* +** mulx_0p5_f64_z_untied: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmulx z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f64_z_untied, svfloat64_t, + z0 = svmulx_n_f64_z (p0, z1, 0.5), + z0 = svmulx_z (p0, z1, 0.5)) + +/* +** mulx_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f64_z_tied1, svfloat64_t, + z0 = svmulx_n_f64_z (p0, z0, 2), + z0 = svmulx_z (p0, z0, 2)) + +/* +** mulx_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** ( +** movprfx z0\.d, p0/z, z1\.d +** fmulx z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f64_z_untied, svfloat64_t, + z0 = svmulx_n_f64_z (p0, z1, 2), + z0 = svmulx_z (p0, z1, 2)) + +/* +** mulx_f64_x_tied1: +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_x_tied1, svfloat64_t, + z0 = svmulx_f64_x (p0, z0, z1), + z0 = svmulx_x (p0, z0, z1)) + +/* +** mulx_f64_x_tied2: +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_x_tied2, svfloat64_t, + z0 = svmulx_f64_x (p0, z1, z0), + z0 = svmulx_x (p0, z1, z0)) + +/* +** mulx_f64_x_untied: +** ( +** movprfx z0, z1 +** fmulx z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (mulx_f64_x_untied, svfloat64_t, + z0 = svmulx_f64_x (p0, z1, z2), + z0 = svmulx_x (p0, z1, z2)) + +/* +** mulx_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (mulx_d4_f64_x_tied1, svfloat64_t, double, + z0 = svmulx_n_f64_x (p0, z0, d4), + z0 = svmulx_x (p0, z0, d4)) + +/* +** mulx_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (mulx_d4_f64_x_untied, svfloat64_t, double, + z0 = svmulx_n_f64_x (p0, z1, d4), + z0 = svmulx_x (p0, z1, d4)) + +/* +** mulx_1_f64_x_tied1: +** fmov (z[0-9]+\.d), #1\.0(?:e\+0)? +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f64_x_tied1, svfloat64_t, + z0 = svmulx_n_f64_x (p0, z0, 1), + z0 = svmulx_x (p0, z0, 1)) + +/* +** mulx_1_f64_x_untied: +** fmov z0\.d, #1\.0(?:e\+0)? 
+** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_1_f64_x_untied, svfloat64_t, + z0 = svmulx_n_f64_x (p0, z1, 1), + z0 = svmulx_x (p0, z1, 1)) + +/* +** mulx_0p5_f64_x_tied1: +** fmov (z[0-9]+\.d), #(?:0\.5|5\.0e-1) +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f64_x_tied1, svfloat64_t, + z0 = svmulx_n_f64_x (p0, z0, 0.5), + z0 = svmulx_x (p0, z0, 0.5)) + +/* +** mulx_0p5_f64_x_untied: +** fmov z0\.d, #(?:0\.5|5\.0e-1) +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_0p5_f64_x_untied, svfloat64_t, + z0 = svmulx_n_f64_x (p0, z1, 0.5), + z0 = svmulx_x (p0, z1, 0.5)) + +/* +** mulx_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fmulx z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f64_x_tied1, svfloat64_t, + z0 = svmulx_n_f64_x (p0, z0, 2), + z0 = svmulx_x (p0, z0, 2)) + +/* +** mulx_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fmulx z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (mulx_2_f64_x_untied, svfloat64_t, + z0 = svmulx_n_f64_x (p0, z1, 2), + z0 = svmulx_x (p0, z1, 2)) + +/* +** ptrue_mulx_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f64_x_tied1, svfloat64_t, + z0 = svmulx_f64_x (svptrue_b64 (), z0, z1), + z0 = svmulx_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_mulx_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f64_x_tied2, svfloat64_t, + z0 = svmulx_f64_x (svptrue_b64 (), z1, z0), + z0 = svmulx_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_mulx_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_f64_x_untied, svfloat64_t, + z0 = svmulx_f64_x (svptrue_b64 (), z1, z2), + z0 = svmulx_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_mulx_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_1_f64_x_tied1, svfloat64_t, + z0 = svmulx_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svmulx_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_mulx_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_1_f64_x_untied, svfloat64_t, + z0 = svmulx_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svmulx_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_mulx_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_0p5_f64_x_tied1, svfloat64_t, + z0 = svmulx_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svmulx_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_mulx_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_0p5_f64_x_untied, svfloat64_t, + z0 = svmulx_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svmulx_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_mulx_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_2_f64_x_tied1, svfloat64_t, + z0 = svmulx_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svmulx_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_mulx_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_mulx_2_f64_x_untied, svfloat64_t, + z0 = svmulx_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svmulx_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nand_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nand_b.c new file mode 100644 index 000000000..c306b80c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nand_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nand_b_z_tied1: +** nand p0\.b, p3/z, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (nand_b_z_tied1, + p0 = svnand_b_z (p3, p0, p1), + p0 = svnand_z (p3, p0, p1)) + +/* +** nand_b_z_tied2: +** nand p0\.b, p3/z, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (nand_b_z_tied2, + p0 = svnand_b_z (p3, p1, p0), + p0 = svnand_z (p3, p1, p0)) + +/* +** nand_b_z_untied: +** nand p0\.b, p3/z, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (nand_b_z_untied, + p0 = svnand_b_z (p3, p1, p2), + p0 = svnand_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f16.c new file mode 100644 index 000000000..c31eba922 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** neg_f16_m_tied12: +** fneg z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (neg_f16_m_tied12, svfloat16_t, + z0 = svneg_f16_m (z0, p0, z0), + z0 = svneg_m (z0, p0, z0)) + +/* +** neg_f16_m_tied1: +** fneg z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_f16_m_tied1, svfloat16_t, + z0 = svneg_f16_m (z0, p0, z1), + z0 = svneg_m (z0, p0, z1)) + +/* +** neg_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fneg z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_f16_m_tied2, svfloat16_t, + z0 = svneg_f16_m (z1, p0, z0), + z0 = svneg_m (z1, p0, z0)) + +/* +** neg_f16_m_untied: +** movprfx z0, z2 +** fneg z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_f16_m_untied, svfloat16_t, + z0 = svneg_f16_m (z2, p0, z1), + z0 = svneg_m (z2, p0, z1)) + +/* +** neg_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** fneg z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_f16_z_tied1, svfloat16_t, + z0 = svneg_f16_z (p0, z0), + z0 = svneg_z (p0, z0)) + +/* +** neg_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fneg z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_f16_z_untied, svfloat16_t, + z0 = svneg_f16_z (p0, z1), + z0 = svneg_z (p0, z1)) + +/* +** neg_f16_x_tied1: +** fneg z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (neg_f16_x_tied1, svfloat16_t, + z0 = svneg_f16_x (p0, z0), + z0 = svneg_x (p0, z0)) + +/* +** neg_f16_x_untied: +** fneg z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_f16_x_untied, svfloat16_t, + z0 = svneg_f16_x (p0, z1), + z0 = svneg_x (p0, z1)) + +/* +** ptrue_neg_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_neg_f16_x_tied1, svfloat16_t, + z0 = svneg_f16_x (svptrue_b16 (), z0), + z0 = svneg_x (svptrue_b16 (), z0)) + +/* +** ptrue_neg_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_neg_f16_x_untied, svfloat16_t, + z0 = svneg_f16_x (svptrue_b16 (), z1), + z0 = svneg_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f32.c new file mode 100644 index 000000000..a57d264ad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** neg_f32_m_tied12: +** fneg z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (neg_f32_m_tied12, svfloat32_t, + z0 = svneg_f32_m (z0, p0, z0), + z0 = svneg_m (z0, p0, z0)) + +/* +** neg_f32_m_tied1: +** fneg z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_f32_m_tied1, svfloat32_t, + z0 = svneg_f32_m (z0, p0, z1), + z0 = svneg_m (z0, p0, z1)) + +/* +** neg_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fneg z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_f32_m_tied2, svfloat32_t, + z0 = svneg_f32_m (z1, p0, z0), + z0 = svneg_m (z1, p0, z0)) + +/* +** neg_f32_m_untied: +** movprfx z0, z2 +** fneg z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_f32_m_untied, svfloat32_t, + z0 = svneg_f32_m (z2, p0, z1), + z0 = svneg_m (z2, p0, z1)) + +/* +** neg_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fneg z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_f32_z_tied1, svfloat32_t, + z0 = svneg_f32_z (p0, z0), + z0 = svneg_z (p0, z0)) + +/* +** neg_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fneg z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_f32_z_untied, svfloat32_t, + z0 = svneg_f32_z (p0, z1), + z0 = svneg_z (p0, z1)) + +/* +** neg_f32_x_tied1: +** fneg z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (neg_f32_x_tied1, svfloat32_t, + z0 = svneg_f32_x (p0, z0), + z0 = svneg_x (p0, z0)) + +/* +** neg_f32_x_untied: +** fneg z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_f32_x_untied, svfloat32_t, + z0 = svneg_f32_x (p0, z1), + z0 = svneg_x (p0, z1)) + +/* +** ptrue_neg_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_neg_f32_x_tied1, svfloat32_t, + z0 = svneg_f32_x (svptrue_b32 (), z0), + z0 = svneg_x (svptrue_b32 (), z0)) + +/* +** ptrue_neg_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_neg_f32_x_untied, svfloat32_t, + z0 = svneg_f32_x (svptrue_b32 (), z1), + z0 = svneg_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f64.c new file mode 100644 index 000000000..90cadd4f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** neg_f64_m_tied12: +** fneg z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (neg_f64_m_tied12, svfloat64_t, + z0 = svneg_f64_m (z0, p0, z0), + z0 = svneg_m (z0, p0, z0)) + +/* +** neg_f64_m_tied1: +** fneg z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (neg_f64_m_tied1, svfloat64_t, + z0 = svneg_f64_m (z0, p0, z1), + z0 = svneg_m (z0, p0, z1)) + +/* +** neg_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fneg z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (neg_f64_m_tied2, svfloat64_t, + z0 = svneg_f64_m (z1, p0, z0), + z0 = svneg_m (z1, p0, z0)) + +/* +** neg_f64_m_untied: +** movprfx z0, z2 +** fneg z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (neg_f64_m_untied, svfloat64_t, + z0 = svneg_f64_m (z2, p0, z1), + z0 = svneg_m (z2, p0, z1)) + +/* +** neg_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** fneg z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (neg_f64_z_tied1, svfloat64_t, + z0 = svneg_f64_z (p0, z0), + z0 = svneg_z (p0, z0)) + +/* +** neg_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fneg z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (neg_f64_z_untied, svfloat64_t, + z0 = svneg_f64_z (p0, z1), + z0 = svneg_z (p0, z1)) + +/* +** neg_f64_x_tied1: +** fneg z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (neg_f64_x_tied1, svfloat64_t, + z0 = svneg_f64_x (p0, z0), + z0 = svneg_x (p0, z0)) + +/* +** neg_f64_x_untied: +** fneg z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (neg_f64_x_untied, svfloat64_t, + z0 = svneg_f64_x (p0, z1), + z0 = svneg_x (p0, z1)) + +/* +** ptrue_neg_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_neg_f64_x_tied1, svfloat64_t, + z0 = svneg_f64_x (svptrue_b64 (), z0), + z0 = svneg_x (svptrue_b64 (), z0)) + +/* +** ptrue_neg_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_neg_f64_x_untied, svfloat64_t, + z0 = svneg_f64_x (svptrue_b64 (), z1), + z0 = svneg_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s16.c new file mode 100644 index 000000000..80b2ee0f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** neg_s16_m_tied12: +** neg z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (neg_s16_m_tied12, svint16_t, + z0 = svneg_s16_m (z0, p0, z0), + z0 = svneg_m (z0, p0, z0)) + +/* +** neg_s16_m_tied1: +** neg z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_s16_m_tied1, svint16_t, + z0 = svneg_s16_m (z0, p0, z1), + z0 = svneg_m (z0, p0, z1)) + +/* +** neg_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** neg z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_s16_m_tied2, svint16_t, + z0 = svneg_s16_m (z1, p0, z0), + z0 = svneg_m (z1, p0, z0)) + +/* +** neg_s16_m_untied: +** movprfx z0, z2 +** neg z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_s16_m_untied, svint16_t, + z0 = svneg_s16_m (z2, p0, z1), + z0 = svneg_m (z2, p0, z1)) + +/* +** neg_s16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** neg z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_s16_z_tied1, svint16_t, + z0 = svneg_s16_z (p0, z0), + z0 = svneg_z (p0, z0)) + +/* +** neg_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** neg z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_s16_z_untied, svint16_t, + z0 = svneg_s16_z (p0, z1), + z0 = svneg_z (p0, z1)) + +/* +** neg_s16_x_tied1: +** neg z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (neg_s16_x_tied1, svint16_t, + z0 = svneg_s16_x (p0, z0), + z0 = svneg_x (p0, z0)) + +/* +** neg_s16_x_untied: +** neg z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (neg_s16_x_untied, svint16_t, + z0 = svneg_s16_x (p0, z1), + z0 = svneg_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s32.c new file mode 100644 index 000000000..b8805034e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** neg_s32_m_tied12: +** neg z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (neg_s32_m_tied12, svint32_t, + z0 = svneg_s32_m (z0, p0, z0), + z0 = svneg_m (z0, p0, z0)) + +/* +** neg_s32_m_tied1: +** neg z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_s32_m_tied1, svint32_t, + z0 = svneg_s32_m (z0, p0, z1), + z0 = svneg_m (z0, p0, z1)) + +/* +** neg_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** neg z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_s32_m_tied2, svint32_t, + z0 = svneg_s32_m (z1, p0, z0), + z0 = svneg_m (z1, p0, z0)) + +/* +** neg_s32_m_untied: +** movprfx z0, z2 +** neg z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_s32_m_untied, svint32_t, + z0 = svneg_s32_m (z2, p0, z1), + z0 = svneg_m (z2, p0, z1)) + +/* +** neg_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** neg z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_s32_z_tied1, svint32_t, + z0 = svneg_s32_z (p0, z0), + z0 = svneg_z (p0, z0)) + +/* +** neg_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** neg z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_s32_z_untied, svint32_t, + z0 = svneg_s32_z (p0, z1), + 
z0 = svneg_z (p0, z1)) + +/* +** neg_s32_x_tied1: +** neg z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (neg_s32_x_tied1, svint32_t, + z0 = svneg_s32_x (p0, z0), + z0 = svneg_x (p0, z0)) + +/* +** neg_s32_x_untied: +** neg z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (neg_s32_x_untied, svint32_t, + z0 = svneg_s32_x (p0, z1), + z0 = svneg_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s64.c new file mode 100644 index 000000000..82abe6723 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** neg_s64_m_tied12: +** neg z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (neg_s64_m_tied12, svint64_t, + z0 = svneg_s64_m (z0, p0, z0), + z0 = svneg_m (z0, p0, z0)) + +/* +** neg_s64_m_tied1: +** neg z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (neg_s64_m_tied1, svint64_t, + z0 = svneg_s64_m (z0, p0, z1), + z0 = svneg_m (z0, p0, z1)) + +/* +** neg_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** neg z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (neg_s64_m_tied2, svint64_t, + z0 = svneg_s64_m (z1, p0, z0), + z0 = svneg_m (z1, p0, z0)) + +/* +** neg_s64_m_untied: +** movprfx z0, z2 +** neg z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (neg_s64_m_untied, svint64_t, + z0 = svneg_s64_m (z2, p0, z1), + z0 = svneg_m (z2, p0, z1)) + +/* +** neg_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** neg z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (neg_s64_z_tied1, svint64_t, + z0 = svneg_s64_z (p0, z0), + z0 = svneg_z (p0, z0)) + +/* +** neg_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** neg z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (neg_s64_z_untied, svint64_t, + z0 = svneg_s64_z (p0, z1), + z0 = svneg_z (p0, z1)) + +/* +** neg_s64_x_tied1: +** neg z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (neg_s64_x_tied1, svint64_t, + z0 = svneg_s64_x (p0, z0), + z0 = svneg_x (p0, z0)) + +/* +** neg_s64_x_untied: +** neg z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (neg_s64_x_untied, svint64_t, + z0 = svneg_s64_x (p0, z1), + z0 = svneg_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s8.c new file mode 100644 index 000000000..b7c9949ad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/neg_s8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** neg_s8_m_tied12: +** neg z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (neg_s8_m_tied12, svint8_t, + z0 = svneg_s8_m (z0, p0, z0), + z0 = svneg_m (z0, p0, z0)) + +/* +** neg_s8_m_tied1: +** neg z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (neg_s8_m_tied1, svint8_t, + z0 = svneg_s8_m (z0, p0, z1), + z0 = svneg_m (z0, p0, z1)) + +/* +** neg_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** neg z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (neg_s8_m_tied2, svint8_t, + z0 = svneg_s8_m (z1, p0, z0), + z0 = svneg_m (z1, p0, z0)) + +/* +** neg_s8_m_untied: +** movprfx z0, z2 +** neg z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (neg_s8_m_untied, svint8_t, + z0 = svneg_s8_m (z2, p0, z1), + z0 = svneg_m (z2, p0, z1)) + +/* +** neg_s8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** neg z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (neg_s8_z_tied1, svint8_t, + z0 = svneg_s8_z (p0, z0), + z0 = svneg_z (p0, 
z0)) + +/* +** neg_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** neg z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (neg_s8_z_untied, svint8_t, + z0 = svneg_s8_z (p0, z1), + z0 = svneg_z (p0, z1)) + +/* +** neg_s8_x_tied1: +** neg z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (neg_s8_x_tied1, svint8_t, + z0 = svneg_s8_x (p0, z0), + z0 = svneg_x (p0, z0)) + +/* +** neg_s8_x_untied: +** neg z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (neg_s8_x_untied, svint8_t, + z0 = svneg_s8_x (p0, z1), + z0 = svneg_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f16.c new file mode 100644 index 000000000..abfe0a0c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f16.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmad_f16_m_tied1: +** fnmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_m_tied1, svfloat16_t, + z0 = svnmad_f16_m (p0, z0, z1, z2), + z0 = svnmad_m (p0, z0, z1, z2)) + +/* +** nmad_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmad z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_m_tied2, svfloat16_t, + z0 = svnmad_f16_m (p0, z1, z0, z2), + z0 = svnmad_m (p0, z1, z0, z2)) + +/* +** nmad_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmad z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_m_tied3, svfloat16_t, + z0 = svnmad_f16_m (p0, z1, z2, z0), + z0 = svnmad_m (p0, z1, z2, z0)) + +/* +** nmad_f16_m_untied: +** movprfx z0, z1 +** fnmad z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_m_untied, svfloat16_t, + z0 = svnmad_f16_m (p0, z1, z2, z3), + z0 = svnmad_m (p0, z1, z2, z3)) + +/* +** nmad_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svnmad_n_f16_m (p0, z0, z1, d4), + z0 = svnmad_m (p0, z0, z1, d4)) + +/* +** nmad_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fnmad z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svnmad_n_f16_m (p0, z1, z2, d4), + z0 = svnmad_m (p0, z1, z2, d4)) + +/* +** nmad_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f16_m_tied1, svfloat16_t, + z0 = svnmad_n_f16_m (p0, z0, z1, 2), + z0 = svnmad_m (p0, z0, z1, 2)) + +/* +** nmad_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmad z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f16_m_untied, svfloat16_t, + z0 = svnmad_n_f16_m (p0, z1, z2, 2), + z0 = svnmad_m (p0, z1, z2, 2)) + +/* +** nmad_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_z_tied1, svfloat16_t, + z0 = svnmad_f16_z (p0, z0, z1, z2), + z0 = svnmad_z (p0, z0, z1, z2)) + +/* +** nmad_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_z_tied2, svfloat16_t, + z0 = svnmad_f16_z (p0, z1, z0, z2), + z0 = svnmad_z (p0, z1, z0, z2)) + +/* +** nmad_f16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_z_tied3, svfloat16_t, + z0 = svnmad_f16_z (p0, z1, z2, z0), + z0 = svnmad_z (p0, z1, z2, z0)) + +/* +** nmad_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmad z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmad z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_z_untied, svfloat16_t, + z0 = svnmad_f16_z (p0, z1, z2, z3), + z0 = svnmad_z (p0, z1, z2, z3)) + +/* +** nmad_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svnmad_n_f16_z (p0, z0, z1, d4), + z0 = svnmad_z (p0, z0, z1, d4)) + +/* +** nmad_h4_f16_z_tied2: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_h4_f16_z_tied2, svfloat16_t, __fp16, + z0 = svnmad_n_f16_z (p0, z1, z0, d4), + z0 = svnmad_z (p0, z1, z0, d4)) + +/* +** nmad_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmad z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmad z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmad_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svnmad_n_f16_z (p0, z1, z2, d4), + z0 = svnmad_z (p0, z1, z2, d4)) + +/* +** nmad_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f16_z_tied1, svfloat16_t, + z0 = svnmad_n_f16_z (p0, z0, z1, 2), + z0 = svnmad_z (p0, z0, z1, 2)) + +/* +** nmad_2_f16_z_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f16_z_tied2, svfloat16_t, + z0 = svnmad_n_f16_z (p0, z1, z0, 2), + z0 = svnmad_z (p0, z1, z0, 2)) + +/* +** nmad_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmad z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmad z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f16_z_untied, svfloat16_t, + z0 = svnmad_n_f16_z (p0, z1, z2, 2), + z0 = svnmad_z (p0, z1, z2, 2)) + +/* +** nmad_f16_x_tied1: +** fnmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_x_tied1, svfloat16_t, + z0 = svnmad_f16_x (p0, z0, z1, z2), + z0 = svnmad_x (p0, z0, z1, z2)) + +/* +** nmad_f16_x_tied2: +** fnmad z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_x_tied2, svfloat16_t, + z0 = svnmad_f16_x (p0, z1, z0, z2), + z0 = svnmad_x (p0, z1, z0, z2)) + +/* +** nmad_f16_x_tied3: +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_x_tied3, svfloat16_t, + z0 = svnmad_f16_x (p0, z1, z2, z0), + z0 = svnmad_x (p0, z1, z2, z0)) + +/* +** nmad_f16_x_untied: +** ( +** movprfx z0, z1 +** fnmad z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** fnmad z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0, z3 +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_f16_x_untied, svfloat16_t, + z0 = svnmad_f16_x (p0, z1, z2, z3), + z0 = svnmad_x (p0, z1, z2, z3)) + +/* +** nmad_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svnmad_n_f16_x (p0, z0, z1, d4), + z0 = svnmad_x (p0, z0, z1, d4)) + +/* +** nmad_h4_f16_x_tied2: +** mov (z[0-9]+\.h), h4 +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_h4_f16_x_tied2, svfloat16_t, __fp16, + z0 = svnmad_n_f16_x (p0, z1, z0, d4), + z0 = svnmad_x (p0, z1, z0, d4)) + +/* +** nmad_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_ZD (nmad_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svnmad_n_f16_x (p0, z1, z2, d4), + z0 = svnmad_x (p0, z1, z2, d4)) + +/* +** nmad_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f16_x_tied1, svfloat16_t, + z0 = svnmad_n_f16_x (p0, z0, z1, 2), + z0 = svnmad_x (p0, z0, z1, 2)) + +/* +** nmad_2_f16_x_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmad z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f16_x_tied2, svfloat16_t, + z0 = svnmad_n_f16_x (p0, z1, z0, 2), + z0 = svnmad_x (p0, z1, z0, 2)) + +/* +** nmad_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f16_x_untied, svfloat16_t, + z0 = svnmad_n_f16_x (p0, z1, z2, 2), + z0 = svnmad_x (p0, z1, z2, 2)) + +/* +** ptrue_nmad_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f16_x_tied1, svfloat16_t, + z0 = svnmad_f16_x (svptrue_b16 (), z0, z1, z2), + z0 = svnmad_x (svptrue_b16 (), z0, z1, z2)) + +/* +** ptrue_nmad_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f16_x_tied2, svfloat16_t, + z0 = svnmad_f16_x (svptrue_b16 (), z1, z0, z2), + z0 = svnmad_x (svptrue_b16 (), z1, z0, z2)) + +/* +** ptrue_nmad_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f16_x_tied3, svfloat16_t, + z0 = svnmad_f16_x (svptrue_b16 (), z1, z2, z0), + z0 = svnmad_x (svptrue_b16 (), z1, z2, z0)) + +/* +** ptrue_nmad_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f16_x_untied, svfloat16_t, + z0 = svnmad_f16_x (svptrue_b16 (), z1, z2, z3), + z0 = svnmad_x (svptrue_b16 (), z1, z2, z3)) + +/* +** ptrue_nmad_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f16_x_tied1, svfloat16_t, + z0 = svnmad_n_f16_x (svptrue_b16 (), z0, z1, 2), + z0 = svnmad_x (svptrue_b16 (), z0, z1, 2)) + +/* +** ptrue_nmad_2_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f16_x_tied2, svfloat16_t, + z0 = svnmad_n_f16_x (svptrue_b16 (), z1, z0, 2), + z0 = svnmad_x (svptrue_b16 (), z1, z0, 2)) + +/* +** ptrue_nmad_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f16_x_untied, svfloat16_t, + z0 = svnmad_n_f16_x (svptrue_b16 (), z1, z2, 2), + z0 = svnmad_x (svptrue_b16 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f32.c new file mode 100644 index 000000000..ab86385c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f32.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmad_f32_m_tied1: +** fnmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_m_tied1, svfloat32_t, + z0 = svnmad_f32_m (p0, z0, z1, z2), + z0 = svnmad_m (p0, z0, z1, z2)) + +/* +** nmad_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmad z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_m_tied2, svfloat32_t, + z0 = svnmad_f32_m (p0, z1, z0, z2), + z0 = svnmad_m (p0, z1, z0, z2)) + +/* +** nmad_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmad z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_m_tied3, svfloat32_t, + z0 = svnmad_f32_m (p0, z1, z2, z0), + z0 = svnmad_m (p0, z1, z2, z0)) + +/* +** nmad_f32_m_untied: +** movprfx z0, z1 +** fnmad z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_m_untied, svfloat32_t, + z0 = svnmad_f32_m (p0, z1, z2, z3), + z0 = svnmad_m (p0, z1, z2, z3)) + +/* +** nmad_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_s4_f32_m_tied1, svfloat32_t, float, + z0 = svnmad_n_f32_m (p0, z0, z1, d4), + z0 = svnmad_m (p0, z0, z1, d4)) + +/* +** nmad_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fnmad z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_s4_f32_m_untied, svfloat32_t, float, + z0 = svnmad_n_f32_m (p0, z1, z2, d4), + z0 = svnmad_m (p0, z1, z2, d4)) + +/* +** nmad_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f32_m_tied1, svfloat32_t, + z0 = svnmad_n_f32_m (p0, z0, z1, 2), + z0 = svnmad_m (p0, z0, z1, 2)) + +/* +** nmad_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmad z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f32_m_untied, svfloat32_t, + z0 = svnmad_n_f32_m (p0, z1, z2, 2), + z0 = svnmad_m (p0, z1, z2, 2)) + +/* +** nmad_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_z_tied1, svfloat32_t, + z0 = svnmad_f32_z (p0, z0, z1, z2), + z0 = svnmad_z (p0, z0, z1, z2)) + +/* +** nmad_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_z_tied2, svfloat32_t, + z0 = svnmad_f32_z (p0, z1, z0, z2), + z0 = svnmad_z (p0, z1, z0, z2)) + +/* +** nmad_f32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_z_tied3, svfloat32_t, + z0 = svnmad_f32_z (p0, z1, z2, z0), + z0 = svnmad_z (p0, z1, z2, z0)) + +/* +** nmad_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmad z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmad z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_z_untied, svfloat32_t, + z0 = svnmad_f32_z (p0, z1, z2, z3), + z0 = svnmad_z (p0, z1, z2, z3)) + +/* +** nmad_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_s4_f32_z_tied1, svfloat32_t, float, + z0 = svnmad_n_f32_z (p0, z0, z1, d4), + z0 = svnmad_z (p0, z0, z1, d4)) + +/* +** nmad_s4_f32_z_tied2: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_s4_f32_z_tied2, svfloat32_t, float, + z0 = svnmad_n_f32_z (p0, z1, z0, d4), + z0 = svnmad_z (p0, z1, z0, d4)) + +/* +** nmad_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmad z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmad z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmad_s4_f32_z_untied, svfloat32_t, float, + z0 = svnmad_n_f32_z (p0, z1, z2, d4), + z0 = svnmad_z (p0, z1, z2, d4)) + +/* +** nmad_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f32_z_tied1, svfloat32_t, + z0 = svnmad_n_f32_z (p0, z0, z1, 2), + z0 = svnmad_z (p0, z0, z1, 2)) + +/* +** nmad_2_f32_z_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f32_z_tied2, svfloat32_t, + z0 = svnmad_n_f32_z (p0, z1, z0, 2), + z0 = svnmad_z (p0, z1, z0, 2)) + +/* +** nmad_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmad z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmad z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f32_z_untied, svfloat32_t, + z0 = svnmad_n_f32_z (p0, z1, z2, 2), + z0 = svnmad_z (p0, z1, z2, 2)) + +/* +** nmad_f32_x_tied1: +** fnmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_x_tied1, svfloat32_t, + z0 = svnmad_f32_x (p0, z0, z1, z2), + z0 = svnmad_x (p0, z0, z1, z2)) + +/* +** nmad_f32_x_tied2: +** fnmad z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_x_tied2, svfloat32_t, + z0 = svnmad_f32_x (p0, z1, z0, z2), + z0 = svnmad_x (p0, z1, z0, z2)) + +/* +** nmad_f32_x_tied3: +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_x_tied3, svfloat32_t, + z0 = svnmad_f32_x (p0, z1, z2, z0), + z0 = svnmad_x (p0, z1, z2, z0)) + +/* +** nmad_f32_x_untied: +** ( +** movprfx z0, z1 +** fnmad z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** fnmad z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0, z3 +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_f32_x_untied, svfloat32_t, + z0 = svnmad_f32_x (p0, z1, z2, z3), + z0 = svnmad_x (p0, z1, z2, z3)) + +/* +** nmad_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_s4_f32_x_tied1, svfloat32_t, float, + z0 = svnmad_n_f32_x (p0, z0, z1, d4), + z0 = svnmad_x (p0, z0, z1, d4)) + +/* +** nmad_s4_f32_x_tied2: +** mov (z[0-9]+\.s), s4 +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_s4_f32_x_tied2, svfloat32_t, float, + z0 = svnmad_n_f32_x (p0, z1, z0, d4), + z0 = svnmad_x (p0, z1, z0, d4)) + +/* +** nmad_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_ZD (nmad_s4_f32_x_untied, svfloat32_t, float, + z0 = svnmad_n_f32_x (p0, z1, z2, d4), + z0 = svnmad_x (p0, z1, z2, d4)) + +/* +** nmad_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f32_x_tied1, svfloat32_t, + z0 = svnmad_n_f32_x (p0, z0, z1, 2), + z0 = svnmad_x (p0, z0, z1, 2)) + +/* +** nmad_2_f32_x_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmad z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f32_x_tied2, svfloat32_t, + z0 = svnmad_n_f32_x (p0, z1, z0, 2), + z0 = svnmad_x (p0, z1, z0, 2)) + +/* +** nmad_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f32_x_untied, svfloat32_t, + z0 = svnmad_n_f32_x (p0, z1, z2, 2), + z0 = svnmad_x (p0, z1, z2, 2)) + +/* +** ptrue_nmad_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f32_x_tied1, svfloat32_t, + z0 = svnmad_f32_x (svptrue_b32 (), z0, z1, z2), + z0 = svnmad_x (svptrue_b32 (), z0, z1, z2)) + +/* +** ptrue_nmad_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f32_x_tied2, svfloat32_t, + z0 = svnmad_f32_x (svptrue_b32 (), z1, z0, z2), + z0 = svnmad_x (svptrue_b32 (), z1, z0, z2)) + +/* +** ptrue_nmad_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f32_x_tied3, svfloat32_t, + z0 = svnmad_f32_x (svptrue_b32 (), z1, z2, z0), + z0 = svnmad_x (svptrue_b32 (), z1, z2, z0)) + +/* +** ptrue_nmad_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f32_x_untied, svfloat32_t, + z0 = svnmad_f32_x (svptrue_b32 (), z1, z2, z3), + z0 = svnmad_x (svptrue_b32 (), z1, z2, z3)) + +/* +** ptrue_nmad_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f32_x_tied1, svfloat32_t, + z0 = svnmad_n_f32_x (svptrue_b32 (), z0, z1, 2), + z0 = svnmad_x (svptrue_b32 (), z0, z1, 2)) + +/* +** ptrue_nmad_2_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f32_x_tied2, svfloat32_t, + z0 = svnmad_n_f32_x (svptrue_b32 (), z1, z0, 2), + z0 = svnmad_x (svptrue_b32 (), z1, z0, 2)) + +/* +** ptrue_nmad_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f32_x_untied, svfloat32_t, + z0 = svnmad_n_f32_x (svptrue_b32 (), z1, z2, 2), + z0 = svnmad_x (svptrue_b32 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f64.c new file mode 100644 index 000000000..c236ff5a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmad_f64.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmad_f64_m_tied1: +** fnmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_m_tied1, svfloat64_t, + z0 = svnmad_f64_m (p0, z0, z1, z2), + z0 = svnmad_m (p0, z0, z1, z2)) + +/* +** nmad_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fnmad z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_m_tied2, svfloat64_t, + z0 = svnmad_f64_m (p0, z1, z0, z2), + z0 = svnmad_m (p0, z1, z0, z2)) + +/* +** nmad_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fnmad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_m_tied3, svfloat64_t, + z0 = svnmad_f64_m (p0, z1, z2, z0), + z0 = svnmad_m (p0, z1, z2, z0)) + +/* +** nmad_f64_m_untied: +** movprfx z0, z1 +** fnmad z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_m_untied, svfloat64_t, + z0 = svnmad_f64_m (p0, z1, z2, z3), + z0 = svnmad_m (p0, z1, z2, z3)) + +/* +** nmad_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_d4_f64_m_tied1, svfloat64_t, double, + z0 = svnmad_n_f64_m (p0, z0, z1, d4), + z0 = svnmad_m (p0, z0, z1, d4)) + +/* +** nmad_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fnmad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_d4_f64_m_untied, svfloat64_t, double, + z0 = svnmad_n_f64_m (p0, z1, z2, d4), + z0 = svnmad_m (p0, z1, z2, d4)) + +/* +** nmad_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f64_m_tied1, svfloat64_t, + z0 = svnmad_n_f64_m (p0, z0, z1, 2), + z0 = svnmad_m (p0, z0, z1, 2)) + +/* +** nmad_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmad z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f64_m_untied, svfloat64_t, + z0 = svnmad_n_f64_m (p0, z1, z2, 2), + z0 = svnmad_m (p0, z1, z2, 2)) + +/* +** nmad_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_z_tied1, svfloat64_t, + z0 = svnmad_f64_z (p0, z0, z1, z2), + z0 = svnmad_z (p0, z0, z1, z2)) + +/* +** nmad_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_z_tied2, svfloat64_t, + z0 = svnmad_f64_z (p0, z1, z0, z2), + z0 = svnmad_z (p0, z1, z0, z2)) + +/* +** nmad_f64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_z_tied3, svfloat64_t, + z0 = svnmad_f64_z (p0, z1, z2, z0), + z0 = svnmad_z (p0, z1, z2, z0)) + +/* +** nmad_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmad z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmad z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_z_untied, svfloat64_t, + z0 = svnmad_f64_z (p0, z1, z2, z3), + z0 = svnmad_z (p0, z1, z2, z3)) + +/* +** nmad_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_d4_f64_z_tied1, svfloat64_t, double, + z0 = svnmad_n_f64_z (p0, z0, z1, d4), + z0 = svnmad_z (p0, z0, z1, d4)) + +/* +** nmad_d4_f64_z_tied2: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_d4_f64_z_tied2, svfloat64_t, double, + z0 = svnmad_n_f64_z (p0, z1, z0, d4), + z0 = svnmad_z (p0, z1, z0, d4)) + +/* +** nmad_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmad z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmad z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmad_d4_f64_z_untied, svfloat64_t, double, + z0 = svnmad_n_f64_z (p0, z1, z2, d4), + z0 = svnmad_z (p0, z1, z2, d4)) + +/* +** nmad_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f64_z_tied1, svfloat64_t, + z0 = svnmad_n_f64_z (p0, z0, z1, 2), + z0 = svnmad_z (p0, z0, z1, 2)) + +/* +** nmad_2_f64_z_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f64_z_tied2, svfloat64_t, + z0 = svnmad_n_f64_z (p0, z1, z0, 2), + z0 = svnmad_z (p0, z1, z0, 2)) + +/* +** nmad_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmad z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmad z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f64_z_untied, svfloat64_t, + z0 = svnmad_n_f64_z (p0, z1, z2, 2), + z0 = svnmad_z (p0, z1, z2, 2)) + +/* +** nmad_f64_x_tied1: +** fnmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_x_tied1, svfloat64_t, + z0 = svnmad_f64_x (p0, z0, z1, z2), + z0 = svnmad_x (p0, z0, z1, z2)) + +/* +** nmad_f64_x_tied2: +** fnmad z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_x_tied2, svfloat64_t, + z0 = svnmad_f64_x (p0, z1, z0, z2), + z0 = svnmad_x (p0, z1, z0, z2)) + +/* +** nmad_f64_x_tied3: +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_x_tied3, svfloat64_t, + z0 = svnmad_f64_x (p0, z1, z2, z0), + z0 = svnmad_x (p0, z1, z2, z0)) + +/* +** nmad_f64_x_untied: +** ( +** movprfx z0, z1 +** fnmad z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** fnmad z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0, z3 +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmad_f64_x_untied, svfloat64_t, + z0 = svnmad_f64_x (p0, z1, z2, z3), + z0 = svnmad_x (p0, z1, z2, z3)) + +/* +** nmad_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_d4_f64_x_tied1, svfloat64_t, double, + z0 = svnmad_n_f64_x (p0, z0, z1, d4), + z0 = svnmad_x (p0, z0, z1, d4)) + +/* +** nmad_d4_f64_x_tied2: +** mov (z[0-9]+\.d), d4 +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmad_d4_f64_x_tied2, svfloat64_t, double, + z0 = svnmad_n_f64_x (p0, z1, z0, d4), + z0 = svnmad_x (p0, z1, z0, d4)) + +/* +** nmad_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_ZD (nmad_d4_f64_x_untied, svfloat64_t, double, + z0 = svnmad_n_f64_x (p0, z1, z2, d4), + z0 = svnmad_x (p0, z1, z2, d4)) + +/* +** nmad_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f64_x_tied1, svfloat64_t, + z0 = svnmad_n_f64_x (p0, z0, z1, 2), + z0 = svnmad_x (p0, z0, z1, 2)) + +/* +** nmad_2_f64_x_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmad z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f64_x_tied2, svfloat64_t, + z0 = svnmad_n_f64_x (p0, z1, z0, 2), + z0 = svnmad_x (p0, z1, z0, 2)) + +/* +** nmad_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmad_2_f64_x_untied, svfloat64_t, + z0 = svnmad_n_f64_x (p0, z1, z2, 2), + z0 = svnmad_x (p0, z1, z2, 2)) + +/* +** ptrue_nmad_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f64_x_tied1, svfloat64_t, + z0 = svnmad_f64_x (svptrue_b64 (), z0, z1, z2), + z0 = svnmad_x (svptrue_b64 (), z0, z1, z2)) + +/* +** ptrue_nmad_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f64_x_tied2, svfloat64_t, + z0 = svnmad_f64_x (svptrue_b64 (), z1, z0, z2), + z0 = svnmad_x (svptrue_b64 (), z1, z0, z2)) + +/* +** ptrue_nmad_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f64_x_tied3, svfloat64_t, + z0 = svnmad_f64_x (svptrue_b64 (), z1, z2, z0), + z0 = svnmad_x (svptrue_b64 (), z1, z2, z0)) + +/* +** ptrue_nmad_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_f64_x_untied, svfloat64_t, + z0 = svnmad_f64_x (svptrue_b64 (), z1, z2, z3), + z0 = svnmad_x (svptrue_b64 (), z1, z2, z3)) + +/* +** ptrue_nmad_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f64_x_tied1, svfloat64_t, + z0 = svnmad_n_f64_x (svptrue_b64 (), z0, z1, 2), + z0 = svnmad_x (svptrue_b64 (), z0, z1, 2)) + +/* +** ptrue_nmad_2_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f64_x_tied2, svfloat64_t, + z0 = svnmad_n_f64_x (svptrue_b64 (), z1, z0, 2), + z0 = svnmad_x (svptrue_b64 (), z1, z0, 2)) + +/* +** ptrue_nmad_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmad_2_f64_x_untied, svfloat64_t, + z0 = svnmad_n_f64_x (svptrue_b64 (), z1, z2, 2), + z0 = svnmad_x (svptrue_b64 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f16.c new file mode 100644 index 000000000..f7ac377fd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f16.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmla_f16_m_tied1: +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_m_tied1, svfloat16_t, + z0 = svnmla_f16_m (p0, z0, z1, z2), + z0 = svnmla_m (p0, z0, z1, z2)) + +/* +** nmla_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmla z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_m_tied2, svfloat16_t, + z0 = svnmla_f16_m (p0, z1, z0, z2), + z0 = svnmla_m (p0, z1, z0, z2)) + +/* +** nmla_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmla z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_m_tied3, svfloat16_t, + z0 = svnmla_f16_m (p0, z1, z2, z0), + z0 = svnmla_m (p0, z1, z2, z0)) + +/* +** nmla_f16_m_untied: +** movprfx z0, z1 +** fnmla z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_m_untied, svfloat16_t, + z0 = svnmla_f16_m (p0, z1, z2, z3), + z0 = svnmla_m (p0, z1, z2, z3)) + +/* +** nmla_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fnmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svnmla_n_f16_m (p0, z0, z1, d4), + z0 = svnmla_m (p0, z0, z1, d4)) + +/* +** nmla_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fnmla z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svnmla_n_f16_m (p0, z1, z2, d4), + z0 = svnmla_m (p0, z1, z2, d4)) + +/* +** nmla_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f16_m_tied1, svfloat16_t, + z0 = svnmla_n_f16_m (p0, z0, z1, 2), + z0 = svnmla_m (p0, z0, z1, 2)) + +/* +** nmla_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmla z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f16_m_untied, svfloat16_t, + z0 = svnmla_n_f16_m (p0, z1, z2, 2), + z0 = svnmla_m (p0, z1, z2, 2)) + +/* +** nmla_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_z_tied1, svfloat16_t, + z0 = svnmla_f16_z (p0, z0, z1, z2), + z0 = svnmla_z (p0, z0, z1, z2)) + +/* +** nmla_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_z_tied2, svfloat16_t, + z0 = svnmla_f16_z (p0, z1, z0, z2), + z0 = svnmla_z (p0, z1, z0, z2)) + +/* +** nmla_f16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_z_tied3, svfloat16_t, + z0 = svnmla_f16_z (p0, z1, z2, z0), + z0 = svnmla_z (p0, z1, z2, z0)) + +/* +** nmla_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmla z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmad z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_z_untied, svfloat16_t, + z0 = svnmla_f16_z (p0, z1, z2, z3), + z0 = svnmla_z (p0, z1, z2, z3)) + +/* +** nmla_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fnmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svnmla_n_f16_z (p0, z0, z1, d4), + z0 = svnmla_z (p0, z0, z1, d4)) + +/* +** nmla_h4_f16_z_tied2: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (nmla_h4_f16_z_tied2, svfloat16_t, __fp16, + z0 = svnmla_n_f16_z (p0, z1, z0, d4), + z0 = svnmla_z (p0, z1, z0, d4)) + +/* +** nmla_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmla z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmad z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmla_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svnmla_n_f16_z (p0, z1, z2, d4), + z0 = svnmla_z (p0, z1, z2, d4)) + +/* +** nmla_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fnmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f16_z_tied1, svfloat16_t, + z0 = svnmla_n_f16_z (p0, z0, z1, 2), + z0 = svnmla_z (p0, z0, z1, 2)) + +/* +** nmla_2_f16_z_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fnmad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f16_z_tied2, svfloat16_t, + z0 = svnmla_n_f16_z (p0, z1, z0, 2), + z0 = svnmla_z (p0, z1, z0, 2)) + +/* +** nmla_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmla z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmad z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f16_z_untied, svfloat16_t, + z0 = svnmla_n_f16_z (p0, z1, z2, 2), + z0 = svnmla_z (p0, z1, z2, 2)) + +/* +** nmla_f16_x_tied1: +** fnmla z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_x_tied1, svfloat16_t, + z0 = svnmla_f16_x (p0, z0, z1, z2), + z0 = svnmla_x (p0, z0, z1, z2)) + +/* +** nmla_f16_x_tied2: +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_x_tied2, svfloat16_t, + z0 = svnmla_f16_x (p0, z1, z0, z2), + z0 = svnmla_x (p0, z1, z0, z2)) + +/* +** nmla_f16_x_tied3: +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_x_tied3, svfloat16_t, + z0 = svnmla_f16_x (p0, z1, z2, z0), + z0 = svnmla_x (p0, z1, z2, z0)) + +/* +** nmla_f16_x_untied: +** ( +** movprfx z0, z1 +** fnmla z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** fnmad z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0, z3 +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_f16_x_untied, svfloat16_t, + z0 = svnmla_f16_x (p0, z1, z2, z3), + z0 = svnmla_x (p0, z1, z2, z3)) + +/* +** nmla_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fnmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svnmla_n_f16_x (p0, z0, z1, d4), + z0 = svnmla_x (p0, z0, z1, d4)) + +/* +** nmla_h4_f16_x_tied2: +** mov (z[0-9]+\.h), h4 +** fnmad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (nmla_h4_f16_x_tied2, svfloat16_t, __fp16, + z0 = svnmla_n_f16_x (p0, z1, z0, d4), + z0 = svnmla_x (p0, z1, z0, d4)) + +/* +** nmla_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (nmla_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svnmla_n_f16_x (p0, z1, z2, d4), + z0 = svnmla_x (p0, z1, z2, d4)) + +/* +** nmla_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmla z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f16_x_tied1, svfloat16_t, + z0 = svnmla_n_f16_x (p0, z0, z1, 2), + z0 = svnmla_x (p0, z0, z1, 2)) + +/* +** nmla_2_f16_x_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmad z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f16_x_tied2, svfloat16_t, + z0 = svnmla_n_f16_x (p0, z1, z0, 2), + z0 = svnmla_x (p0, z1, z0, 2)) + +/* +** nmla_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fnmad z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f16_x_untied, svfloat16_t, + z0 = svnmla_n_f16_x (p0, z1, z2, 2), + z0 = svnmla_x (p0, z1, z2, 2)) + +/* +** ptrue_nmla_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f16_x_tied1, svfloat16_t, + z0 = svnmla_f16_x (svptrue_b16 (), z0, z1, z2), + z0 = svnmla_x (svptrue_b16 (), z0, z1, z2)) + +/* +** ptrue_nmla_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f16_x_tied2, svfloat16_t, + z0 = svnmla_f16_x (svptrue_b16 (), z1, z0, z2), + z0 = svnmla_x (svptrue_b16 (), z1, z0, z2)) + +/* +** ptrue_nmla_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f16_x_tied3, svfloat16_t, + z0 = svnmla_f16_x (svptrue_b16 (), z1, z2, z0), + z0 = svnmla_x (svptrue_b16 (), z1, z2, z0)) + +/* +** ptrue_nmla_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f16_x_untied, svfloat16_t, + z0 = svnmla_f16_x (svptrue_b16 (), z1, z2, z3), + z0 = svnmla_x (svptrue_b16 (), z1, z2, z3)) + +/* +** ptrue_nmla_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f16_x_tied1, svfloat16_t, + z0 = svnmla_n_f16_x (svptrue_b16 (), z0, z1, 2), + z0 = svnmla_x (svptrue_b16 (), z0, z1, 2)) + +/* +** ptrue_nmla_2_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f16_x_tied2, svfloat16_t, + z0 = svnmla_n_f16_x (svptrue_b16 (), z1, z0, 2), + z0 = svnmla_x (svptrue_b16 (), z1, z0, 2)) + +/* +** ptrue_nmla_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f16_x_untied, svfloat16_t, + z0 = svnmla_n_f16_x (svptrue_b16 (), z1, z2, 2), + z0 = svnmla_x (svptrue_b16 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f32.c new file mode 100644 index 000000000..ef9542d74 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f32.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmla_f32_m_tied1: +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_m_tied1, svfloat32_t, + z0 = svnmla_f32_m (p0, z0, z1, z2), + z0 = svnmla_m (p0, z0, z1, z2)) + +/* +** nmla_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmla z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_m_tied2, svfloat32_t, + z0 = svnmla_f32_m (p0, z1, z0, z2), + z0 = svnmla_m (p0, z1, z0, z2)) + +/* +** nmla_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmla z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_m_tied3, svfloat32_t, + z0 = svnmla_f32_m (p0, z1, z2, z0), + z0 = svnmla_m (p0, z1, z2, z0)) + +/* +** nmla_f32_m_untied: +** movprfx z0, z1 +** fnmla z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_m_untied, svfloat32_t, + z0 = svnmla_f32_m (p0, z1, z2, z3), + z0 = svnmla_m (p0, z1, z2, z3)) + +/* +** nmla_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fnmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_s4_f32_m_tied1, svfloat32_t, float, + z0 = svnmla_n_f32_m (p0, z0, z1, d4), + z0 = svnmla_m (p0, z0, z1, d4)) + +/* +** nmla_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fnmla z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_s4_f32_m_untied, svfloat32_t, float, + z0 = svnmla_n_f32_m (p0, z1, z2, d4), + z0 = svnmla_m (p0, z1, z2, d4)) + +/* +** nmla_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f32_m_tied1, svfloat32_t, + z0 = svnmla_n_f32_m (p0, z0, z1, 2), + z0 = svnmla_m (p0, z0, z1, 2)) + +/* +** nmla_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmla z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f32_m_untied, svfloat32_t, + z0 = svnmla_n_f32_m (p0, z1, z2, 2), + z0 = svnmla_m (p0, z1, z2, 2)) + +/* +** nmla_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_z_tied1, svfloat32_t, + z0 = svnmla_f32_z (p0, z0, z1, z2), + z0 = svnmla_z (p0, z0, z1, z2)) + +/* +** nmla_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_z_tied2, svfloat32_t, + z0 = svnmla_f32_z (p0, z1, z0, z2), + z0 = svnmla_z (p0, z1, z0, z2)) + +/* +** nmla_f32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_z_tied3, svfloat32_t, + z0 = svnmla_f32_z (p0, z1, z2, z0), + z0 = svnmla_z (p0, z1, z2, z0)) + +/* +** nmla_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmla z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmad z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_z_untied, svfloat32_t, + z0 = svnmla_f32_z (p0, z1, z2, z3), + z0 = svnmla_z (p0, z1, z2, z3)) + +/* +** nmla_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fnmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_s4_f32_z_tied1, svfloat32_t, float, + z0 = svnmla_n_f32_z (p0, z0, z1, d4), + z0 = svnmla_z (p0, z0, z1, d4)) + +/* +** nmla_s4_f32_z_tied2: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (nmla_s4_f32_z_tied2, svfloat32_t, float, + z0 = svnmla_n_f32_z (p0, z1, z0, d4), + z0 = svnmla_z (p0, z1, z0, d4)) + +/* +** nmla_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmla z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmad z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmla_s4_f32_z_untied, svfloat32_t, float, + z0 = svnmla_n_f32_z (p0, z1, z2, d4), + z0 = svnmla_z (p0, z1, z2, d4)) + +/* +** nmla_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fnmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f32_z_tied1, svfloat32_t, + z0 = svnmla_n_f32_z (p0, z0, z1, 2), + z0 = svnmla_z (p0, z0, z1, 2)) + +/* +** nmla_2_f32_z_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fnmad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f32_z_tied2, svfloat32_t, + z0 = svnmla_n_f32_z (p0, z1, z0, 2), + z0 = svnmla_z (p0, z1, z0, 2)) + +/* +** nmla_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmla z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmad z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f32_z_untied, svfloat32_t, + z0 = svnmla_n_f32_z (p0, z1, z2, 2), + z0 = svnmla_z (p0, z1, z2, 2)) + +/* +** nmla_f32_x_tied1: +** fnmla z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_x_tied1, svfloat32_t, + z0 = svnmla_f32_x (p0, z0, z1, z2), + z0 = svnmla_x (p0, z0, z1, z2)) + +/* +** nmla_f32_x_tied2: +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_x_tied2, svfloat32_t, + z0 = svnmla_f32_x (p0, z1, z0, z2), + z0 = svnmla_x (p0, z1, z0, z2)) + +/* +** nmla_f32_x_tied3: +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_x_tied3, svfloat32_t, + z0 = svnmla_f32_x (p0, z1, z2, z0), + z0 = svnmla_x (p0, z1, z2, z0)) + +/* +** nmla_f32_x_untied: +** ( +** movprfx z0, z1 +** fnmla z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** fnmad z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0, z3 +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_f32_x_untied, svfloat32_t, + z0 = svnmla_f32_x (p0, z1, z2, z3), + z0 = svnmla_x (p0, z1, z2, z3)) + +/* +** nmla_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fnmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_s4_f32_x_tied1, svfloat32_t, float, + z0 = svnmla_n_f32_x (p0, z0, z1, d4), + z0 = svnmla_x (p0, z0, z1, d4)) + +/* +** nmla_s4_f32_x_tied2: +** mov (z[0-9]+\.s), s4 +** fnmad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (nmla_s4_f32_x_tied2, svfloat32_t, float, + z0 = svnmla_n_f32_x (p0, z1, z0, d4), + z0 = svnmla_x (p0, z1, z0, d4)) + +/* +** nmla_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (nmla_s4_f32_x_untied, svfloat32_t, float, + z0 = svnmla_n_f32_x (p0, z1, z2, d4), + z0 = svnmla_x (p0, z1, z2, d4)) + +/* +** nmla_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmla z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f32_x_tied1, svfloat32_t, + z0 = svnmla_n_f32_x (p0, z0, z1, 2), + z0 = svnmla_x (p0, z0, z1, 2)) + +/* +** nmla_2_f32_x_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmad z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f32_x_tied2, svfloat32_t, + z0 = svnmla_n_f32_x (p0, z1, z0, 2), + z0 = svnmla_x (p0, z1, z0, 2)) + +/* +** nmla_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fnmad z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f32_x_untied, svfloat32_t, + z0 = svnmla_n_f32_x (p0, z1, z2, 2), + z0 = svnmla_x (p0, z1, z2, 2)) + +/* +** ptrue_nmla_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f32_x_tied1, svfloat32_t, + z0 = svnmla_f32_x (svptrue_b32 (), z0, z1, z2), + z0 = svnmla_x (svptrue_b32 (), z0, z1, z2)) + +/* +** ptrue_nmla_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f32_x_tied2, svfloat32_t, + z0 = svnmla_f32_x (svptrue_b32 (), z1, z0, z2), + z0 = svnmla_x (svptrue_b32 (), z1, z0, z2)) + +/* +** ptrue_nmla_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f32_x_tied3, svfloat32_t, + z0 = svnmla_f32_x (svptrue_b32 (), z1, z2, z0), + z0 = svnmla_x (svptrue_b32 (), z1, z2, z0)) + +/* +** ptrue_nmla_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f32_x_untied, svfloat32_t, + z0 = svnmla_f32_x (svptrue_b32 (), z1, z2, z3), + z0 = svnmla_x (svptrue_b32 (), z1, z2, z3)) + +/* +** ptrue_nmla_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f32_x_tied1, svfloat32_t, + z0 = svnmla_n_f32_x (svptrue_b32 (), z0, z1, 2), + z0 = svnmla_x (svptrue_b32 (), z0, z1, 2)) + +/* +** ptrue_nmla_2_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f32_x_tied2, svfloat32_t, + z0 = svnmla_n_f32_x (svptrue_b32 (), z1, z0, 2), + z0 = svnmla_x (svptrue_b32 (), z1, z0, 2)) + +/* +** ptrue_nmla_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f32_x_untied, svfloat32_t, + z0 = svnmla_n_f32_x (svptrue_b32 (), z1, z2, 2), + z0 = svnmla_x (svptrue_b32 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f64.c new file mode 100644 index 000000000..441821f60 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmla_f64.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmla_f64_m_tied1: +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_m_tied1, svfloat64_t, + z0 = svnmla_f64_m (p0, z0, z1, z2), + z0 = svnmla_m (p0, z0, z1, z2)) + +/* +** nmla_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fnmla z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_m_tied2, svfloat64_t, + z0 = svnmla_f64_m (p0, z1, z0, z2), + z0 = svnmla_m (p0, z1, z0, z2)) + +/* +** nmla_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fnmla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_m_tied3, svfloat64_t, + z0 = svnmla_f64_m (p0, z1, z2, z0), + z0 = svnmla_m (p0, z1, z2, z0)) + +/* +** nmla_f64_m_untied: +** movprfx z0, z1 +** fnmla z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_m_untied, svfloat64_t, + z0 = svnmla_f64_m (p0, z1, z2, z3), + z0 = svnmla_m (p0, z1, z2, z3)) + +/* +** nmla_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fnmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_d4_f64_m_tied1, svfloat64_t, double, + z0 = svnmla_n_f64_m (p0, z0, z1, d4), + z0 = svnmla_m (p0, z0, z1, d4)) + +/* +** nmla_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fnmla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_d4_f64_m_untied, svfloat64_t, double, + z0 = svnmla_n_f64_m (p0, z1, z2, d4), + z0 = svnmla_m (p0, z1, z2, d4)) + +/* +** nmla_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f64_m_tied1, svfloat64_t, + z0 = svnmla_n_f64_m (p0, z0, z1, 2), + z0 = svnmla_m (p0, z0, z1, 2)) + +/* +** nmla_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmla z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f64_m_untied, svfloat64_t, + z0 = svnmla_n_f64_m (p0, z1, z2, 2), + z0 = svnmla_m (p0, z1, z2, 2)) + +/* +** nmla_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_z_tied1, svfloat64_t, + z0 = svnmla_f64_z (p0, z0, z1, z2), + z0 = svnmla_z (p0, z0, z1, z2)) + +/* +** nmla_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_z_tied2, svfloat64_t, + z0 = svnmla_f64_z (p0, z1, z0, z2), + z0 = svnmla_z (p0, z1, z0, z2)) + +/* +** nmla_f64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_z_tied3, svfloat64_t, + z0 = svnmla_f64_z (p0, z1, z2, z0), + z0 = svnmla_z (p0, z1, z2, z0)) + +/* +** nmla_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmla z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmad z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_z_untied, svfloat64_t, + z0 = svnmla_f64_z (p0, z1, z2, z3), + z0 = svnmla_z (p0, z1, z2, z3)) + +/* +** nmla_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fnmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_d4_f64_z_tied1, svfloat64_t, double, + z0 = svnmla_n_f64_z (p0, z0, z1, d4), + z0 = svnmla_z (p0, z0, z1, d4)) + +/* +** nmla_d4_f64_z_tied2: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (nmla_d4_f64_z_tied2, svfloat64_t, double, + z0 = svnmla_n_f64_z (p0, z1, z0, d4), + z0 = svnmla_z (p0, z1, z0, d4)) + +/* +** nmla_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmla z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmad z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmla_d4_f64_z_untied, svfloat64_t, double, + z0 = svnmla_n_f64_z (p0, z1, z2, d4), + z0 = svnmla_z (p0, z1, z2, d4)) + +/* +** nmla_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fnmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f64_z_tied1, svfloat64_t, + z0 = svnmla_n_f64_z (p0, z0, z1, 2), + z0 = svnmla_z (p0, z0, z1, 2)) + +/* +** nmla_2_f64_z_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fnmad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f64_z_tied2, svfloat64_t, + z0 = svnmla_n_f64_z (p0, z1, z0, 2), + z0 = svnmla_z (p0, z1, z0, 2)) + +/* +** nmla_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmla z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmad z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f64_z_untied, svfloat64_t, + z0 = svnmla_n_f64_z (p0, z1, z2, 2), + z0 = svnmla_z (p0, z1, z2, 2)) + +/* +** nmla_f64_x_tied1: +** fnmla z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_x_tied1, svfloat64_t, + z0 = svnmla_f64_x (p0, z0, z1, z2), + z0 = svnmla_x (p0, z0, z1, z2)) + +/* +** nmla_f64_x_tied2: +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_x_tied2, svfloat64_t, + z0 = svnmla_f64_x (p0, z1, z0, z2), + z0 = svnmla_x (p0, z1, z0, z2)) + +/* +** nmla_f64_x_tied3: +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_x_tied3, svfloat64_t, + z0 = svnmla_f64_x (p0, z1, z2, z0), + z0 = svnmla_x (p0, z1, z2, z0)) + +/* +** nmla_f64_x_untied: +** ( +** movprfx z0, z1 +** fnmla z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** fnmad z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0, z3 +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmla_f64_x_untied, svfloat64_t, + z0 = svnmla_f64_x (p0, z1, z2, z3), + z0 = svnmla_x (p0, z1, z2, z3)) + +/* +** nmla_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fnmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmla_d4_f64_x_tied1, svfloat64_t, double, + z0 = svnmla_n_f64_x (p0, z0, z1, d4), + z0 = svnmla_x (p0, z0, z1, d4)) + +/* +** nmla_d4_f64_x_tied2: +** mov (z[0-9]+\.d), d4 +** fnmad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (nmla_d4_f64_x_tied2, svfloat64_t, double, + z0 = svnmla_n_f64_x (p0, z1, z0, d4), + z0 = svnmla_x (p0, z1, z0, d4)) + +/* +** nmla_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (nmla_d4_f64_x_untied, svfloat64_t, double, + z0 = svnmla_n_f64_x (p0, z1, z2, d4), + z0 = svnmla_x (p0, z1, z2, d4)) + +/* +** nmla_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmla z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f64_x_tied1, svfloat64_t, + z0 = svnmla_n_f64_x (p0, z0, z1, 2), + z0 = svnmla_x (p0, z0, z1, 2)) + +/* +** nmla_2_f64_x_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmad z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f64_x_tied2, svfloat64_t, + z0 = svnmla_n_f64_x (p0, z1, z0, 2), + z0 = svnmla_x (p0, z1, z0, 2)) + +/* +** nmla_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fnmad z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmla_2_f64_x_untied, svfloat64_t, + z0 = svnmla_n_f64_x (p0, z1, z2, 2), + z0 = svnmla_x (p0, z1, z2, 2)) + +/* +** ptrue_nmla_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f64_x_tied1, svfloat64_t, + z0 = svnmla_f64_x (svptrue_b64 (), z0, z1, z2), + z0 = svnmla_x (svptrue_b64 (), z0, z1, z2)) + +/* +** ptrue_nmla_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f64_x_tied2, svfloat64_t, + z0 = svnmla_f64_x (svptrue_b64 (), z1, z0, z2), + z0 = svnmla_x (svptrue_b64 (), z1, z0, z2)) + +/* +** ptrue_nmla_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f64_x_tied3, svfloat64_t, + z0 = svnmla_f64_x (svptrue_b64 (), z1, z2, z0), + z0 = svnmla_x (svptrue_b64 (), z1, z2, z0)) + +/* +** ptrue_nmla_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_f64_x_untied, svfloat64_t, + z0 = svnmla_f64_x (svptrue_b64 (), z1, z2, z3), + z0 = svnmla_x (svptrue_b64 (), z1, z2, z3)) + +/* +** ptrue_nmla_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f64_x_tied1, svfloat64_t, + z0 = svnmla_n_f64_x (svptrue_b64 (), z0, z1, 2), + z0 = svnmla_x (svptrue_b64 (), z0, z1, 2)) + +/* +** ptrue_nmla_2_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f64_x_tied2, svfloat64_t, + z0 = svnmla_n_f64_x (svptrue_b64 (), z1, z0, 2), + z0 = svnmla_x (svptrue_b64 (), z1, z0, 2)) + +/* +** ptrue_nmla_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmla_2_f64_x_untied, svfloat64_t, + z0 = svnmla_n_f64_x (svptrue_b64 (), z1, z2, 2), + z0 = svnmla_x (svptrue_b64 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f16.c new file mode 100644 index 000000000..8aa6c7509 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f16.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmls_f16_m_tied1: +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_m_tied1, svfloat16_t, + z0 = svnmls_f16_m (p0, z0, z1, z2), + z0 = svnmls_m (p0, z0, z1, z2)) + +/* +** nmls_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmls z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_m_tied2, svfloat16_t, + z0 = svnmls_f16_m (p0, z1, z0, z2), + z0 = svnmls_m (p0, z1, z0, z2)) + +/* +** nmls_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmls z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_m_tied3, svfloat16_t, + z0 = svnmls_f16_m (p0, z1, z2, z0), + z0 = svnmls_m (p0, z1, z2, z0)) + +/* +** nmls_f16_m_untied: +** movprfx z0, z1 +** fnmls z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_m_untied, svfloat16_t, + z0 = svnmls_f16_m (p0, z1, z2, z3), + z0 = svnmls_m (p0, z1, z2, z3)) + +/* +** nmls_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fnmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svnmls_n_f16_m (p0, z0, z1, d4), + z0 = svnmls_m (p0, z0, z1, d4)) + +/* +** nmls_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fnmls z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svnmls_n_f16_m (p0, z1, z2, d4), + z0 = svnmls_m (p0, z1, z2, d4)) + +/* +** nmls_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f16_m_tied1, svfloat16_t, + z0 = svnmls_n_f16_m (p0, z0, z1, 2), + z0 = svnmls_m (p0, z0, z1, 2)) + +/* +** nmls_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmls z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f16_m_untied, svfloat16_t, + z0 = svnmls_n_f16_m (p0, z1, z2, 2), + z0 = svnmls_m (p0, z1, z2, 2)) + +/* +** nmls_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_z_tied1, svfloat16_t, + z0 = svnmls_f16_z (p0, z0, z1, z2), + z0 = svnmls_z (p0, z0, z1, z2)) + +/* +** nmls_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_z_tied2, svfloat16_t, + z0 = svnmls_f16_z (p0, z1, z0, z2), + z0 = svnmls_z (p0, z1, z0, z2)) + +/* +** nmls_f16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_z_tied3, svfloat16_t, + z0 = svnmls_f16_z (p0, z1, z2, z0), + z0 = svnmls_z (p0, z1, z2, z0)) + +/* +** nmls_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmls z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmsb z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_z_untied, svfloat16_t, + z0 = svnmls_f16_z (p0, z1, z2, z3), + z0 = svnmls_z (p0, z1, z2, z3)) + +/* +** nmls_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fnmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svnmls_n_f16_z (p0, z0, z1, d4), + z0 = svnmls_z (p0, z0, z1, d4)) + +/* +** nmls_h4_f16_z_tied2: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (nmls_h4_f16_z_tied2, svfloat16_t, __fp16, + z0 = svnmls_n_f16_z (p0, z1, z0, d4), + z0 = svnmls_z (p0, z1, z0, d4)) + +/* +** nmls_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmls z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmsb z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmls_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svnmls_n_f16_z (p0, z1, z2, d4), + z0 = svnmls_z (p0, z1, z2, d4)) + +/* +** nmls_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fnmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f16_z_tied1, svfloat16_t, + z0 = svnmls_n_f16_z (p0, z0, z1, 2), + z0 = svnmls_z (p0, z0, z1, 2)) + +/* +** nmls_2_f16_z_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f16_z_tied2, svfloat16_t, + z0 = svnmls_n_f16_z (p0, z1, z0, 2), + z0 = svnmls_z (p0, z1, z0, 2)) + +/* +** nmls_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmls z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmsb z0\.h, p0/m, \1, z1\.h +** | +** movprfx z0\.h, p0/z, \1 +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f16_z_untied, svfloat16_t, + z0 = svnmls_n_f16_z (p0, z1, z2, 2), + z0 = svnmls_z (p0, z1, z2, 2)) + +/* +** nmls_f16_x_tied1: +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_x_tied1, svfloat16_t, + z0 = svnmls_f16_x (p0, z0, z1, z2), + z0 = svnmls_x (p0, z0, z1, z2)) + +/* +** nmls_f16_x_tied2: +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_x_tied2, svfloat16_t, + z0 = svnmls_f16_x (p0, z1, z0, z2), + z0 = svnmls_x (p0, z1, z0, z2)) + +/* +** nmls_f16_x_tied3: +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_x_tied3, svfloat16_t, + z0 = svnmls_f16_x (p0, z1, z2, z0), + z0 = svnmls_x (p0, z1, z2, z0)) + +/* +** nmls_f16_x_untied: +** ( +** movprfx z0, z1 +** fnmls z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** fnmsb z0\.h, p0/m, z3\.h, z1\.h +** | +** movprfx z0, z3 +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_f16_x_untied, svfloat16_t, + z0 = svnmls_f16_x (p0, z1, z2, z3), + z0 = svnmls_x (p0, z1, z2, z3)) + +/* +** nmls_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fnmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svnmls_n_f16_x (p0, z0, z1, d4), + z0 = svnmls_x (p0, z0, z1, d4)) + +/* +** nmls_h4_f16_x_tied2: +** mov (z[0-9]+\.h), h4 +** fnmsb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (nmls_h4_f16_x_tied2, svfloat16_t, __fp16, + z0 = svnmls_n_f16_x (p0, z1, z0, d4), + z0 = svnmls_x (p0, z1, z0, d4)) + +/* +** nmls_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (nmls_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svnmls_n_f16_x (p0, z1, z2, d4), + z0 = svnmls_x (p0, z1, z2, d4)) + +/* +** nmls_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmls z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f16_x_tied1, svfloat16_t, + z0 = svnmls_n_f16_x (p0, z0, z1, 2), + z0 = svnmls_x (p0, z0, z1, 2)) + +/* +** nmls_2_f16_x_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmsb z0\.h, p0/m, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f16_x_tied2, svfloat16_t, + z0 = svnmls_n_f16_x (p0, z1, z0, 2), + z0 = svnmls_x (p0, z1, z0, 2)) + +/* +** nmls_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fnmsb z0\.h, p0/m, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f16_x_untied, svfloat16_t, + z0 = svnmls_n_f16_x (p0, z1, z2, 2), + z0 = svnmls_x (p0, z1, z2, 2)) + +/* +** ptrue_nmls_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f16_x_tied1, svfloat16_t, + z0 = svnmls_f16_x (svptrue_b16 (), z0, z1, z2), + z0 = svnmls_x (svptrue_b16 (), z0, z1, z2)) + +/* +** ptrue_nmls_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f16_x_tied2, svfloat16_t, + z0 = svnmls_f16_x (svptrue_b16 (), z1, z0, z2), + z0 = svnmls_x (svptrue_b16 (), z1, z0, z2)) + +/* +** ptrue_nmls_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f16_x_tied3, svfloat16_t, + z0 = svnmls_f16_x (svptrue_b16 (), z1, z2, z0), + z0 = svnmls_x (svptrue_b16 (), z1, z2, z0)) + +/* +** ptrue_nmls_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f16_x_untied, svfloat16_t, + z0 = svnmls_f16_x (svptrue_b16 (), z1, z2, z3), + z0 = svnmls_x (svptrue_b16 (), z1, z2, z3)) + +/* +** ptrue_nmls_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f16_x_tied1, svfloat16_t, + z0 = svnmls_n_f16_x (svptrue_b16 (), z0, z1, 2), + z0 = svnmls_x (svptrue_b16 (), z0, z1, 2)) + +/* +** ptrue_nmls_2_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f16_x_tied2, svfloat16_t, + z0 = svnmls_n_f16_x (svptrue_b16 (), z1, z0, 2), + z0 = svnmls_x (svptrue_b16 (), z1, z0, 2)) + +/* +** ptrue_nmls_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f16_x_untied, svfloat16_t, + z0 = svnmls_n_f16_x (svptrue_b16 (), z1, z2, 2), + z0 = svnmls_x (svptrue_b16 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f32.c new file mode 100644 index 000000000..42ea13fac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f32.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmls_f32_m_tied1: +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_m_tied1, svfloat32_t, + z0 = svnmls_f32_m (p0, z0, z1, z2), + z0 = svnmls_m (p0, z0, z1, z2)) + +/* +** nmls_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmls z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_m_tied2, svfloat32_t, + z0 = svnmls_f32_m (p0, z1, z0, z2), + z0 = svnmls_m (p0, z1, z0, z2)) + +/* +** nmls_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmls z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_m_tied3, svfloat32_t, + z0 = svnmls_f32_m (p0, z1, z2, z0), + z0 = svnmls_m (p0, z1, z2, z0)) + +/* +** nmls_f32_m_untied: +** movprfx z0, z1 +** fnmls z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_m_untied, svfloat32_t, + z0 = svnmls_f32_m (p0, z1, z2, z3), + z0 = svnmls_m (p0, z1, z2, z3)) + +/* +** nmls_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fnmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_s4_f32_m_tied1, svfloat32_t, float, + z0 = svnmls_n_f32_m (p0, z0, z1, d4), + z0 = svnmls_m (p0, z0, z1, d4)) + +/* +** nmls_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fnmls z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_s4_f32_m_untied, svfloat32_t, float, + z0 = svnmls_n_f32_m (p0, z1, z2, d4), + z0 = svnmls_m (p0, z1, z2, d4)) + +/* +** nmls_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f32_m_tied1, svfloat32_t, + z0 = svnmls_n_f32_m (p0, z0, z1, 2), + z0 = svnmls_m (p0, z0, z1, 2)) + +/* +** nmls_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmls z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f32_m_untied, svfloat32_t, + z0 = svnmls_n_f32_m (p0, z1, z2, 2), + z0 = svnmls_m (p0, z1, z2, 2)) + +/* +** nmls_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_z_tied1, svfloat32_t, + z0 = svnmls_f32_z (p0, z0, z1, z2), + z0 = svnmls_z (p0, z0, z1, z2)) + +/* +** nmls_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_z_tied2, svfloat32_t, + z0 = svnmls_f32_z (p0, z1, z0, z2), + z0 = svnmls_z (p0, z1, z0, z2)) + +/* +** nmls_f32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_z_tied3, svfloat32_t, + z0 = svnmls_f32_z (p0, z1, z2, z0), + z0 = svnmls_z (p0, z1, z2, z0)) + +/* +** nmls_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmls z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmsb z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_z_untied, svfloat32_t, + z0 = svnmls_f32_z (p0, z1, z2, z3), + z0 = svnmls_z (p0, z1, z2, z3)) + +/* +** nmls_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fnmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_s4_f32_z_tied1, svfloat32_t, float, + z0 = svnmls_n_f32_z (p0, z0, z1, d4), + z0 = svnmls_z (p0, z0, z1, d4)) + +/* +** nmls_s4_f32_z_tied2: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (nmls_s4_f32_z_tied2, svfloat32_t, float, + z0 = svnmls_n_f32_z (p0, z1, z0, d4), + z0 = svnmls_z (p0, z1, z0, d4)) + +/* +** nmls_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmls z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmsb z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmls_s4_f32_z_untied, svfloat32_t, float, + z0 = svnmls_n_f32_z (p0, z1, z2, d4), + z0 = svnmls_z (p0, z1, z2, d4)) + +/* +** nmls_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fnmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f32_z_tied1, svfloat32_t, + z0 = svnmls_n_f32_z (p0, z0, z1, 2), + z0 = svnmls_z (p0, z0, z1, 2)) + +/* +** nmls_2_f32_z_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f32_z_tied2, svfloat32_t, + z0 = svnmls_n_f32_z (p0, z1, z0, 2), + z0 = svnmls_z (p0, z1, z0, 2)) + +/* +** nmls_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmls z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmsb z0\.s, p0/m, \1, z1\.s +** | +** movprfx z0\.s, p0/z, \1 +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f32_z_untied, svfloat32_t, + z0 = svnmls_n_f32_z (p0, z1, z2, 2), + z0 = svnmls_z (p0, z1, z2, 2)) + +/* +** nmls_f32_x_tied1: +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_x_tied1, svfloat32_t, + z0 = svnmls_f32_x (p0, z0, z1, z2), + z0 = svnmls_x (p0, z0, z1, z2)) + +/* +** nmls_f32_x_tied2: +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_x_tied2, svfloat32_t, + z0 = svnmls_f32_x (p0, z1, z0, z2), + z0 = svnmls_x (p0, z1, z0, z2)) + +/* +** nmls_f32_x_tied3: +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_x_tied3, svfloat32_t, + z0 = svnmls_f32_x (p0, z1, z2, z0), + z0 = svnmls_x (p0, z1, z2, z0)) + +/* +** nmls_f32_x_untied: +** ( +** movprfx z0, z1 +** fnmls z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** fnmsb z0\.s, p0/m, z3\.s, z1\.s +** | +** movprfx z0, z3 +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_f32_x_untied, svfloat32_t, + z0 = svnmls_f32_x (p0, z1, z2, z3), + z0 = svnmls_x (p0, z1, z2, z3)) + +/* +** nmls_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fnmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_s4_f32_x_tied1, svfloat32_t, float, + z0 = svnmls_n_f32_x (p0, z0, z1, d4), + z0 = svnmls_x (p0, z0, z1, d4)) + +/* +** nmls_s4_f32_x_tied2: +** mov (z[0-9]+\.s), s4 +** fnmsb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (nmls_s4_f32_x_tied2, svfloat32_t, float, + z0 = svnmls_n_f32_x (p0, z1, z0, d4), + z0 = svnmls_x (p0, z1, z0, d4)) + +/* +** nmls_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (nmls_s4_f32_x_untied, svfloat32_t, float, + z0 = svnmls_n_f32_x (p0, z1, z2, d4), + z0 = svnmls_x (p0, z1, z2, d4)) + +/* +** nmls_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmls z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f32_x_tied1, svfloat32_t, + z0 = svnmls_n_f32_x (p0, z0, z1, 2), + z0 = svnmls_x (p0, z0, z1, 2)) + +/* +** nmls_2_f32_x_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmsb z0\.s, p0/m, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f32_x_tied2, svfloat32_t, + z0 = svnmls_n_f32_x (p0, z1, z0, 2), + z0 = svnmls_x (p0, z1, z0, 2)) + +/* +** nmls_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fnmsb z0\.s, p0/m, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f32_x_untied, svfloat32_t, + z0 = svnmls_n_f32_x (p0, z1, z2, 2), + z0 = svnmls_x (p0, z1, z2, 2)) + +/* +** ptrue_nmls_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f32_x_tied1, svfloat32_t, + z0 = svnmls_f32_x (svptrue_b32 (), z0, z1, z2), + z0 = svnmls_x (svptrue_b32 (), z0, z1, z2)) + +/* +** ptrue_nmls_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f32_x_tied2, svfloat32_t, + z0 = svnmls_f32_x (svptrue_b32 (), z1, z0, z2), + z0 = svnmls_x (svptrue_b32 (), z1, z0, z2)) + +/* +** ptrue_nmls_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f32_x_tied3, svfloat32_t, + z0 = svnmls_f32_x (svptrue_b32 (), z1, z2, z0), + z0 = svnmls_x (svptrue_b32 (), z1, z2, z0)) + +/* +** ptrue_nmls_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f32_x_untied, svfloat32_t, + z0 = svnmls_f32_x (svptrue_b32 (), z1, z2, z3), + z0 = svnmls_x (svptrue_b32 (), z1, z2, z3)) + +/* +** ptrue_nmls_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f32_x_tied1, svfloat32_t, + z0 = svnmls_n_f32_x (svptrue_b32 (), z0, z1, 2), + z0 = svnmls_x (svptrue_b32 (), z0, z1, 2)) + +/* +** ptrue_nmls_2_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f32_x_tied2, svfloat32_t, + z0 = svnmls_n_f32_x (svptrue_b32 (), z1, z0, 2), + z0 = svnmls_x (svptrue_b32 (), z1, z0, 2)) + +/* +** ptrue_nmls_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f32_x_untied, svfloat32_t, + z0 = svnmls_n_f32_x (svptrue_b32 (), z1, z2, 2), + z0 = svnmls_x (svptrue_b32 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f64.c new file mode 100644 index 000000000..994c2a74e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmls_f64.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmls_f64_m_tied1: +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_m_tied1, svfloat64_t, + z0 = svnmls_f64_m (p0, z0, z1, z2), + z0 = svnmls_m (p0, z0, z1, z2)) + +/* +** nmls_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fnmls z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_m_tied2, svfloat64_t, + z0 = svnmls_f64_m (p0, z1, z0, z2), + z0 = svnmls_m (p0, z1, z0, z2)) + +/* +** nmls_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fnmls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_m_tied3, svfloat64_t, + z0 = svnmls_f64_m (p0, z1, z2, z0), + z0 = svnmls_m (p0, z1, z2, z0)) + +/* +** nmls_f64_m_untied: +** movprfx z0, z1 +** fnmls z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_m_untied, svfloat64_t, + z0 = svnmls_f64_m (p0, z1, z2, z3), + z0 = svnmls_m (p0, z1, z2, z3)) + +/* +** nmls_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fnmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_d4_f64_m_tied1, svfloat64_t, double, + z0 = svnmls_n_f64_m (p0, z0, z1, d4), + z0 = svnmls_m (p0, z0, z1, d4)) + +/* +** nmls_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fnmls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_d4_f64_m_untied, svfloat64_t, double, + z0 = svnmls_n_f64_m (p0, z1, z2, d4), + z0 = svnmls_m (p0, z1, z2, d4)) + +/* +** nmls_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f64_m_tied1, svfloat64_t, + z0 = svnmls_n_f64_m (p0, z0, z1, 2), + z0 = svnmls_m (p0, z0, z1, 2)) + +/* +** nmls_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmls z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f64_m_untied, svfloat64_t, + z0 = svnmls_n_f64_m (p0, z1, z2, 2), + z0 = svnmls_m (p0, z1, z2, 2)) + +/* +** nmls_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_z_tied1, svfloat64_t, + z0 = svnmls_f64_z (p0, z0, z1, z2), + z0 = svnmls_z (p0, z0, z1, z2)) + +/* +** nmls_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_z_tied2, svfloat64_t, + z0 = svnmls_f64_z (p0, z1, z0, z2), + z0 = svnmls_z (p0, z1, z0, z2)) + +/* +** nmls_f64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_z_tied3, svfloat64_t, + z0 = svnmls_f64_z (p0, z1, z2, z0), + z0 = svnmls_z (p0, z1, z2, z0)) + +/* +** nmls_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmls z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmsb z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_z_untied, svfloat64_t, + z0 = svnmls_f64_z (p0, z1, z2, z3), + z0 = svnmls_z (p0, z1, z2, z3)) + +/* +** nmls_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fnmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_d4_f64_z_tied1, svfloat64_t, double, + z0 = svnmls_n_f64_z (p0, z0, z1, d4), + z0 = svnmls_z (p0, z0, z1, d4)) + +/* +** nmls_d4_f64_z_tied2: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (nmls_d4_f64_z_tied2, svfloat64_t, double, + z0 = svnmls_n_f64_z (p0, z1, z0, d4), + z0 = svnmls_z (p0, z1, z0, d4)) + +/* +** nmls_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmls z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmsb z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmls_d4_f64_z_untied, svfloat64_t, double, + z0 = svnmls_n_f64_z (p0, z1, z2, d4), + z0 = svnmls_z (p0, z1, z2, d4)) + +/* +** nmls_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fnmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f64_z_tied1, svfloat64_t, + z0 = svnmls_n_f64_z (p0, z0, z1, 2), + z0 = svnmls_z (p0, z0, z1, 2)) + +/* +** nmls_2_f64_z_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f64_z_tied2, svfloat64_t, + z0 = svnmls_n_f64_z (p0, z1, z0, 2), + z0 = svnmls_z (p0, z1, z0, 2)) + +/* +** nmls_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmls z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmsb z0\.d, p0/m, \1, z1\.d +** | +** movprfx z0\.d, p0/z, \1 +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f64_z_untied, svfloat64_t, + z0 = svnmls_n_f64_z (p0, z1, z2, 2), + z0 = svnmls_z (p0, z1, z2, 2)) + +/* +** nmls_f64_x_tied1: +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_x_tied1, svfloat64_t, + z0 = svnmls_f64_x (p0, z0, z1, z2), + z0 = svnmls_x (p0, z0, z1, z2)) + +/* +** nmls_f64_x_tied2: +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_x_tied2, svfloat64_t, + z0 = svnmls_f64_x (p0, z1, z0, z2), + z0 = svnmls_x (p0, z1, z0, z2)) + +/* +** nmls_f64_x_tied3: +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_x_tied3, svfloat64_t, + z0 = svnmls_f64_x (p0, z1, z2, z0), + z0 = svnmls_x (p0, z1, z2, z0)) + +/* +** nmls_f64_x_untied: +** ( +** movprfx z0, z1 +** fnmls z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** fnmsb z0\.d, p0/m, z3\.d, z1\.d +** | +** movprfx z0, z3 +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmls_f64_x_untied, svfloat64_t, + z0 = svnmls_f64_x (p0, z1, z2, z3), + z0 = svnmls_x (p0, z1, z2, z3)) + +/* +** nmls_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fnmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmls_d4_f64_x_tied1, svfloat64_t, double, + z0 = svnmls_n_f64_x (p0, z0, z1, d4), + z0 = svnmls_x (p0, z0, z1, d4)) + +/* +** nmls_d4_f64_x_tied2: +** mov (z[0-9]+\.d), d4 +** fnmsb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (nmls_d4_f64_x_tied2, svfloat64_t, double, + z0 = svnmls_n_f64_x (p0, z1, z0, d4), + z0 = svnmls_x (p0, z1, z0, d4)) + +/* +** nmls_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (nmls_d4_f64_x_untied, svfloat64_t, double, + z0 = svnmls_n_f64_x (p0, z1, z2, d4), + z0 = svnmls_x (p0, z1, z2, d4)) + +/* +** nmls_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmls z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f64_x_tied1, svfloat64_t, + z0 = svnmls_n_f64_x (p0, z0, z1, 2), + z0 = svnmls_x (p0, z0, z1, 2)) + +/* +** nmls_2_f64_x_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmsb z0\.d, p0/m, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f64_x_tied2, svfloat64_t, + z0 = svnmls_n_f64_x (p0, z1, z0, 2), + z0 = svnmls_x (p0, z1, z0, 2)) + +/* +** nmls_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fnmsb z0\.d, p0/m, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (nmls_2_f64_x_untied, svfloat64_t, + z0 = svnmls_n_f64_x (p0, z1, z2, 2), + z0 = svnmls_x (p0, z1, z2, 2)) + +/* +** ptrue_nmls_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f64_x_tied1, svfloat64_t, + z0 = svnmls_f64_x (svptrue_b64 (), z0, z1, z2), + z0 = svnmls_x (svptrue_b64 (), z0, z1, z2)) + +/* +** ptrue_nmls_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f64_x_tied2, svfloat64_t, + z0 = svnmls_f64_x (svptrue_b64 (), z1, z0, z2), + z0 = svnmls_x (svptrue_b64 (), z1, z0, z2)) + +/* +** ptrue_nmls_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f64_x_tied3, svfloat64_t, + z0 = svnmls_f64_x (svptrue_b64 (), z1, z2, z0), + z0 = svnmls_x (svptrue_b64 (), z1, z2, z0)) + +/* +** ptrue_nmls_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_f64_x_untied, svfloat64_t, + z0 = svnmls_f64_x (svptrue_b64 (), z1, z2, z3), + z0 = svnmls_x (svptrue_b64 (), z1, z2, z3)) + +/* +** ptrue_nmls_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f64_x_tied1, svfloat64_t, + z0 = svnmls_n_f64_x (svptrue_b64 (), z0, z1, 2), + z0 = svnmls_x (svptrue_b64 (), z0, z1, 2)) + +/* +** ptrue_nmls_2_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f64_x_tied2, svfloat64_t, + z0 = svnmls_n_f64_x (svptrue_b64 (), z1, z0, 2), + z0 = svnmls_x (svptrue_b64 (), z1, z0, 2)) + +/* +** ptrue_nmls_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmls_2_f64_x_untied, svfloat64_t, + z0 = svnmls_n_f64_x (svptrue_b64 (), z1, z2, 2), + z0 = svnmls_x (svptrue_b64 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f16.c new file mode 100644 index 000000000..c11401485 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f16.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmsb_f16_m_tied1: +** fnmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_m_tied1, svfloat16_t, + z0 = svnmsb_f16_m (p0, z0, z1, z2), + z0 = svnmsb_m (p0, z0, z1, z2)) + +/* +** nmsb_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmsb z0\.h, p0/m, \1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_m_tied2, svfloat16_t, + z0 = svnmsb_f16_m (p0, z1, z0, z2), + z0 = svnmsb_m (p0, z1, z0, z2)) + +/* +** nmsb_f16_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmsb z0\.h, p0/m, z2\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_m_tied3, svfloat16_t, + z0 = svnmsb_f16_m (p0, z1, z2, z0), + z0 = svnmsb_m (p0, z1, z2, z0)) + +/* +** nmsb_f16_m_untied: +** movprfx z0, z1 +** fnmsb z0\.h, p0/m, z2\.h, z3\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_m_untied, svfloat16_t, + z0 = svnmsb_f16_m (p0, z1, z2, z3), + z0 = svnmsb_m (p0, z1, z2, z3)) + +/* +** nmsb_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svnmsb_n_f16_m (p0, z0, z1, d4), + z0 = svnmsb_m (p0, z0, z1, d4)) + +/* +** nmsb_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fnmsb z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svnmsb_n_f16_m (p0, z1, z2, d4), + z0 = svnmsb_m (p0, z1, z2, d4)) + +/* +** nmsb_2_f16_m_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f16_m_tied1, svfloat16_t, + z0 = svnmsb_n_f16_m (p0, z0, z1, 2), + z0 = svnmsb_m (p0, z0, z1, 2)) + +/* +** nmsb_2_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmsb z0\.h, p0/m, z2\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f16_m_untied, svfloat16_t, + z0 = svnmsb_n_f16_m (p0, z1, z2, 2), + z0 = svnmsb_m (p0, z1, z2, 2)) + +/* +** nmsb_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_z_tied1, svfloat16_t, + z0 = svnmsb_f16_z (p0, z0, z1, z2), + z0 = svnmsb_z (p0, z0, z1, z2)) + +/* +** nmsb_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_z_tied2, svfloat16_t, + z0 = svnmsb_f16_z (p0, z1, z0, z2), + z0 = svnmsb_z (p0, z1, z0, z2)) + +/* +** nmsb_f16_z_tied3: +** movprfx z0\.h, p0/z, z0\.h +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_z_tied3, svfloat16_t, + z0 = svnmsb_f16_z (p0, z1, z2, z0), + z0 = svnmsb_z (p0, z1, z2, z0)) + +/* +** nmsb_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmsb z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmsb z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0\.h, p0/z, z3\.h +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_z_untied, svfloat16_t, + z0 = svnmsb_f16_z (p0, z1, z2, z3), + z0 = svnmsb_z (p0, z1, z2, z3)) + +/* +** nmsb_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svnmsb_n_f16_z (p0, z0, z1, d4), + z0 = svnmsb_z (p0, z0, z1, d4)) + +/* +** nmsb_h4_f16_z_tied2: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_h4_f16_z_tied2, svfloat16_t, __fp16, + z0 = svnmsb_n_f16_z (p0, z1, z0, d4), + z0 = svnmsb_z (p0, z1, z0, d4)) + +/* +** nmsb_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmsb z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmsb z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmsb_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svnmsb_n_f16_z (p0, z1, z2, d4), + z0 = svnmsb_z (p0, z1, z2, d4)) + +/* +** nmsb_2_f16_z_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f16_z_tied1, svfloat16_t, + z0 = svnmsb_n_f16_z (p0, z0, z1, 2), + z0 = svnmsb_z (p0, z0, z1, 2)) + +/* +** nmsb_2_f16_z_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f16_z_tied2, svfloat16_t, + z0 = svnmsb_n_f16_z (p0, z1, z0, 2), + z0 = svnmsb_z (p0, z1, z0, 2)) + +/* +** nmsb_2_f16_z_untied: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fnmsb z0\.h, p0/m, z2\.h, \1 +** | +** movprfx z0\.h, p0/z, z2\.h +** fnmsb z0\.h, p0/m, z1\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f16_z_untied, svfloat16_t, + z0 = svnmsb_n_f16_z (p0, z1, z2, 2), + z0 = svnmsb_z (p0, z1, z2, 2)) + +/* +** nmsb_f16_x_tied1: +** fnmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_x_tied1, svfloat16_t, + z0 = svnmsb_f16_x (p0, z0, z1, z2), + z0 = svnmsb_x (p0, z0, z1, z2)) + +/* +** nmsb_f16_x_tied2: +** fnmsb z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_x_tied2, svfloat16_t, + z0 = svnmsb_f16_x (p0, z1, z0, z2), + z0 = svnmsb_x (p0, z1, z0, z2)) + +/* +** nmsb_f16_x_tied3: +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_x_tied3, svfloat16_t, + z0 = svnmsb_f16_x (p0, z1, z2, z0), + z0 = svnmsb_x (p0, z1, z2, z0)) + +/* +** nmsb_f16_x_untied: +** ( +** movprfx z0, z1 +** fnmsb z0\.h, p0/m, z2\.h, z3\.h +** | +** movprfx z0, z2 +** fnmsb z0\.h, p0/m, z1\.h, z3\.h +** | +** movprfx z0, z3 +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_f16_x_untied, svfloat16_t, + z0 = svnmsb_f16_x (p0, z1, z2, z3), + z0 = svnmsb_x (p0, z1, z2, z3)) + +/* +** nmsb_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svnmsb_n_f16_x (p0, z0, z1, d4), + z0 = svnmsb_x (p0, z0, z1, d4)) + +/* +** nmsb_h4_f16_x_tied2: +** mov (z[0-9]+\.h), h4 +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_h4_f16_x_tied2, svfloat16_t, __fp16, + z0 = svnmsb_n_f16_x (p0, z1, z0, d4), + z0 = svnmsb_x (p0, z1, z0, d4)) + +/* +** nmsb_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_ZD (nmsb_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svnmsb_n_f16_x (p0, z1, z2, d4), + z0 = svnmsb_x (p0, z1, z2, d4)) + +/* +** nmsb_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f16_x_tied1, svfloat16_t, + z0 = svnmsb_n_f16_x (p0, z0, z1, 2), + z0 = svnmsb_x (p0, z0, z1, 2)) + +/* +** nmsb_2_f16_x_tied2: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fnmsb z0\.h, p0/m, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f16_x_tied2, svfloat16_t, + z0 = svnmsb_n_f16_x (p0, z1, z0, 2), + z0 = svnmsb_x (p0, z1, z0, 2)) + +/* +** nmsb_2_f16_x_untied: +** fmov z0\.h, #2\.0(?:e\+0)? +** fnmls z0\.h, p0/m, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f16_x_untied, svfloat16_t, + z0 = svnmsb_n_f16_x (p0, z1, z2, 2), + z0 = svnmsb_x (p0, z1, z2, 2)) + +/* +** ptrue_nmsb_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f16_x_tied1, svfloat16_t, + z0 = svnmsb_f16_x (svptrue_b16 (), z0, z1, z2), + z0 = svnmsb_x (svptrue_b16 (), z0, z1, z2)) + +/* +** ptrue_nmsb_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f16_x_tied2, svfloat16_t, + z0 = svnmsb_f16_x (svptrue_b16 (), z1, z0, z2), + z0 = svnmsb_x (svptrue_b16 (), z1, z0, z2)) + +/* +** ptrue_nmsb_f16_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f16_x_tied3, svfloat16_t, + z0 = svnmsb_f16_x (svptrue_b16 (), z1, z2, z0), + z0 = svnmsb_x (svptrue_b16 (), z1, z2, z0)) + +/* +** ptrue_nmsb_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f16_x_untied, svfloat16_t, + z0 = svnmsb_f16_x (svptrue_b16 (), z1, z2, z3), + z0 = svnmsb_x (svptrue_b16 (), z1, z2, z3)) + +/* +** ptrue_nmsb_2_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f16_x_tied1, svfloat16_t, + z0 = svnmsb_n_f16_x (svptrue_b16 (), z0, z1, 2), + z0 = svnmsb_x (svptrue_b16 (), z0, z1, 2)) + +/* +** ptrue_nmsb_2_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f16_x_tied2, svfloat16_t, + z0 = svnmsb_n_f16_x (svptrue_b16 (), z1, z0, 2), + z0 = svnmsb_x (svptrue_b16 (), z1, z0, 2)) + +/* +** ptrue_nmsb_2_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f16_x_untied, svfloat16_t, + z0 = svnmsb_n_f16_x (svptrue_b16 (), z1, z2, 2), + z0 = svnmsb_x (svptrue_b16 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f32.c new file mode 100644 index 000000000..c2204e040 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f32.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmsb_f32_m_tied1: +** fnmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_m_tied1, svfloat32_t, + z0 = svnmsb_f32_m (p0, z0, z1, z2), + z0 = svnmsb_m (p0, z0, z1, z2)) + +/* +** nmsb_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmsb z0\.s, p0/m, \1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_m_tied2, svfloat32_t, + z0 = svnmsb_f32_m (p0, z1, z0, z2), + z0 = svnmsb_m (p0, z1, z0, z2)) + +/* +** nmsb_f32_m_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fnmsb z0\.s, p0/m, z2\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_m_tied3, svfloat32_t, + z0 = svnmsb_f32_m (p0, z1, z2, z0), + z0 = svnmsb_m (p0, z1, z2, z0)) + +/* +** nmsb_f32_m_untied: +** movprfx z0, z1 +** fnmsb z0\.s, p0/m, z2\.s, z3\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_m_untied, svfloat32_t, + z0 = svnmsb_f32_m (p0, z1, z2, z3), + z0 = svnmsb_m (p0, z1, z2, z3)) + +/* +** nmsb_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_s4_f32_m_tied1, svfloat32_t, float, + z0 = svnmsb_n_f32_m (p0, z0, z1, d4), + z0 = svnmsb_m (p0, z0, z1, d4)) + +/* +** nmsb_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fnmsb z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_s4_f32_m_untied, svfloat32_t, float, + z0 = svnmsb_n_f32_m (p0, z1, z2, d4), + z0 = svnmsb_m (p0, z1, z2, d4)) + +/* +** nmsb_2_f32_m_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f32_m_tied1, svfloat32_t, + z0 = svnmsb_n_f32_m (p0, z0, z1, 2), + z0 = svnmsb_m (p0, z0, z1, 2)) + +/* +** nmsb_2_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmsb z0\.s, p0/m, z2\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f32_m_untied, svfloat32_t, + z0 = svnmsb_n_f32_m (p0, z1, z2, 2), + z0 = svnmsb_m (p0, z1, z2, 2)) + +/* +** nmsb_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_z_tied1, svfloat32_t, + z0 = svnmsb_f32_z (p0, z0, z1, z2), + z0 = svnmsb_z (p0, z0, z1, z2)) + +/* +** nmsb_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_z_tied2, svfloat32_t, + z0 = svnmsb_f32_z (p0, z1, z0, z2), + z0 = svnmsb_z (p0, z1, z0, z2)) + +/* +** nmsb_f32_z_tied3: +** movprfx z0\.s, p0/z, z0\.s +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_z_tied3, svfloat32_t, + z0 = svnmsb_f32_z (p0, z1, z2, z0), + z0 = svnmsb_z (p0, z1, z2, z0)) + +/* +** nmsb_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmsb z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmsb z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0\.s, p0/z, z3\.s +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_z_untied, svfloat32_t, + z0 = svnmsb_f32_z (p0, z1, z2, z3), + z0 = svnmsb_z (p0, z1, z2, z3)) + +/* +** nmsb_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_s4_f32_z_tied1, svfloat32_t, float, + z0 = svnmsb_n_f32_z (p0, z0, z1, d4), + z0 = svnmsb_z (p0, z0, z1, d4)) + +/* +** nmsb_s4_f32_z_tied2: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_s4_f32_z_tied2, svfloat32_t, float, + z0 = svnmsb_n_f32_z (p0, z1, z0, d4), + z0 = svnmsb_z (p0, z1, z0, d4)) + +/* +** nmsb_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmsb z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmsb z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmsb_s4_f32_z_untied, svfloat32_t, float, + z0 = svnmsb_n_f32_z (p0, z1, z2, d4), + z0 = svnmsb_z (p0, z1, z2, d4)) + +/* +** nmsb_2_f32_z_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f32_z_tied1, svfloat32_t, + z0 = svnmsb_n_f32_z (p0, z0, z1, 2), + z0 = svnmsb_z (p0, z0, z1, 2)) + +/* +** nmsb_2_f32_z_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f32_z_tied2, svfloat32_t, + z0 = svnmsb_n_f32_z (p0, z1, z0, 2), + z0 = svnmsb_z (p0, z1, z0, 2)) + +/* +** nmsb_2_f32_z_untied: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fnmsb z0\.s, p0/m, z2\.s, \1 +** | +** movprfx z0\.s, p0/z, z2\.s +** fnmsb z0\.s, p0/m, z1\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f32_z_untied, svfloat32_t, + z0 = svnmsb_n_f32_z (p0, z1, z2, 2), + z0 = svnmsb_z (p0, z1, z2, 2)) + +/* +** nmsb_f32_x_tied1: +** fnmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_x_tied1, svfloat32_t, + z0 = svnmsb_f32_x (p0, z0, z1, z2), + z0 = svnmsb_x (p0, z0, z1, z2)) + +/* +** nmsb_f32_x_tied2: +** fnmsb z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_x_tied2, svfloat32_t, + z0 = svnmsb_f32_x (p0, z1, z0, z2), + z0 = svnmsb_x (p0, z1, z0, z2)) + +/* +** nmsb_f32_x_tied3: +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_x_tied3, svfloat32_t, + z0 = svnmsb_f32_x (p0, z1, z2, z0), + z0 = svnmsb_x (p0, z1, z2, z0)) + +/* +** nmsb_f32_x_untied: +** ( +** movprfx z0, z1 +** fnmsb z0\.s, p0/m, z2\.s, z3\.s +** | +** movprfx z0, z2 +** fnmsb z0\.s, p0/m, z1\.s, z3\.s +** | +** movprfx z0, z3 +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_f32_x_untied, svfloat32_t, + z0 = svnmsb_f32_x (p0, z1, z2, z3), + z0 = svnmsb_x (p0, z1, z2, z3)) + +/* +** nmsb_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_s4_f32_x_tied1, svfloat32_t, float, + z0 = svnmsb_n_f32_x (p0, z0, z1, d4), + z0 = svnmsb_x (p0, z0, z1, d4)) + +/* +** nmsb_s4_f32_x_tied2: +** mov (z[0-9]+\.s), s4 +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_s4_f32_x_tied2, svfloat32_t, float, + z0 = svnmsb_n_f32_x (p0, z1, z0, d4), + z0 = svnmsb_x (p0, z1, z0, d4)) + +/* +** nmsb_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_ZD (nmsb_s4_f32_x_untied, svfloat32_t, float, + z0 = svnmsb_n_f32_x (p0, z1, z2, d4), + z0 = svnmsb_x (p0, z1, z2, d4)) + +/* +** nmsb_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f32_x_tied1, svfloat32_t, + z0 = svnmsb_n_f32_x (p0, z0, z1, 2), + z0 = svnmsb_x (p0, z0, z1, 2)) + +/* +** nmsb_2_f32_x_tied2: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? +** fnmsb z0\.s, p0/m, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f32_x_tied2, svfloat32_t, + z0 = svnmsb_n_f32_x (p0, z1, z0, 2), + z0 = svnmsb_x (p0, z1, z0, 2)) + +/* +** nmsb_2_f32_x_untied: +** fmov z0\.s, #2\.0(?:e\+0)? +** fnmls z0\.s, p0/m, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f32_x_untied, svfloat32_t, + z0 = svnmsb_n_f32_x (p0, z1, z2, 2), + z0 = svnmsb_x (p0, z1, z2, 2)) + +/* +** ptrue_nmsb_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f32_x_tied1, svfloat32_t, + z0 = svnmsb_f32_x (svptrue_b32 (), z0, z1, z2), + z0 = svnmsb_x (svptrue_b32 (), z0, z1, z2)) + +/* +** ptrue_nmsb_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f32_x_tied2, svfloat32_t, + z0 = svnmsb_f32_x (svptrue_b32 (), z1, z0, z2), + z0 = svnmsb_x (svptrue_b32 (), z1, z0, z2)) + +/* +** ptrue_nmsb_f32_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f32_x_tied3, svfloat32_t, + z0 = svnmsb_f32_x (svptrue_b32 (), z1, z2, z0), + z0 = svnmsb_x (svptrue_b32 (), z1, z2, z0)) + +/* +** ptrue_nmsb_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f32_x_untied, svfloat32_t, + z0 = svnmsb_f32_x (svptrue_b32 (), z1, z2, z3), + z0 = svnmsb_x (svptrue_b32 (), z1, z2, z3)) + +/* +** ptrue_nmsb_2_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f32_x_tied1, svfloat32_t, + z0 = svnmsb_n_f32_x (svptrue_b32 (), z0, z1, 2), + z0 = svnmsb_x (svptrue_b32 (), z0, z1, 2)) + +/* +** ptrue_nmsb_2_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f32_x_tied2, svfloat32_t, + z0 = svnmsb_n_f32_x (svptrue_b32 (), z1, z0, 2), + z0 = svnmsb_x (svptrue_b32 (), z1, z0, 2)) + +/* +** ptrue_nmsb_2_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f32_x_untied, svfloat32_t, + z0 = svnmsb_n_f32_x (svptrue_b32 (), z1, z2, 2), + z0 = svnmsb_x (svptrue_b32 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f64.c new file mode 100644 index 000000000..56592d3ae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nmsb_f64.c @@ -0,0 +1,398 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nmsb_f64_m_tied1: +** fnmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_m_tied1, svfloat64_t, + z0 = svnmsb_f64_m (p0, z0, z1, z2), + z0 = svnmsb_m (p0, z0, z1, z2)) + +/* +** nmsb_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fnmsb z0\.d, p0/m, \1, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_m_tied2, svfloat64_t, + z0 = svnmsb_f64_m (p0, z1, z0, z2), + z0 = svnmsb_m (p0, z1, z0, z2)) + +/* +** nmsb_f64_m_tied3: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fnmsb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_m_tied3, svfloat64_t, + z0 = svnmsb_f64_m (p0, z1, z2, z0), + z0 = svnmsb_m (p0, z1, z2, z0)) + +/* +** nmsb_f64_m_untied: +** movprfx z0, z1 +** fnmsb z0\.d, p0/m, z2\.d, z3\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_m_untied, svfloat64_t, + z0 = svnmsb_f64_m (p0, z1, z2, z3), + z0 = svnmsb_m (p0, z1, z2, z3)) + +/* +** nmsb_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_d4_f64_m_tied1, svfloat64_t, double, + z0 = svnmsb_n_f64_m (p0, z0, z1, d4), + z0 = svnmsb_m (p0, z0, z1, d4)) + +/* +** nmsb_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fnmsb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_d4_f64_m_untied, svfloat64_t, double, + z0 = svnmsb_n_f64_m (p0, z1, z2, d4), + z0 = svnmsb_m (p0, z1, z2, d4)) + +/* +** nmsb_2_f64_m_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f64_m_tied1, svfloat64_t, + z0 = svnmsb_n_f64_m (p0, z0, z1, 2), + z0 = svnmsb_m (p0, z0, z1, 2)) + +/* +** nmsb_2_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0, z1 +** fnmsb z0\.d, p0/m, z2\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f64_m_untied, svfloat64_t, + z0 = svnmsb_n_f64_m (p0, z1, z2, 2), + z0 = svnmsb_m (p0, z1, z2, 2)) + +/* +** nmsb_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_z_tied1, svfloat64_t, + z0 = svnmsb_f64_z (p0, z0, z1, z2), + z0 = svnmsb_z (p0, z0, z1, z2)) + +/* +** nmsb_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_z_tied2, svfloat64_t, + z0 = svnmsb_f64_z (p0, z1, z0, z2), + z0 = svnmsb_z (p0, z1, z0, z2)) + +/* +** nmsb_f64_z_tied3: +** movprfx z0\.d, p0/z, z0\.d +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_z_tied3, svfloat64_t, + z0 = svnmsb_f64_z (p0, z1, z2, z0), + z0 = svnmsb_z (p0, z1, z2, z0)) + +/* +** nmsb_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmsb z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmsb z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0\.d, p0/z, z3\.d +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_z_untied, svfloat64_t, + z0 = svnmsb_f64_z (p0, z1, z2, z3), + z0 = svnmsb_z (p0, z1, z2, z3)) + +/* +** nmsb_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_d4_f64_z_tied1, svfloat64_t, double, + z0 = svnmsb_n_f64_z (p0, z0, z1, d4), + z0 = svnmsb_z (p0, z0, z1, d4)) + +/* +** nmsb_d4_f64_z_tied2: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_d4_f64_z_tied2, svfloat64_t, double, + z0 = svnmsb_n_f64_z (p0, z1, z0, d4), + z0 = svnmsb_z (p0, z1, z0, d4)) + +/* +** nmsb_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmsb z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmsb z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (nmsb_d4_f64_z_untied, svfloat64_t, double, + z0 = svnmsb_n_f64_z (p0, z1, z2, d4), + z0 = svnmsb_z (p0, z1, z2, d4)) + +/* +** nmsb_2_f64_z_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f64_z_tied1, svfloat64_t, + z0 = svnmsb_n_f64_z (p0, z0, z1, 2), + z0 = svnmsb_z (p0, z0, z1, 2)) + +/* +** nmsb_2_f64_z_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f64_z_tied2, svfloat64_t, + z0 = svnmsb_n_f64_z (p0, z1, z0, 2), + z0 = svnmsb_z (p0, z1, z0, 2)) + +/* +** nmsb_2_f64_z_untied: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fnmsb z0\.d, p0/m, z2\.d, \1 +** | +** movprfx z0\.d, p0/z, z2\.d +** fnmsb z0\.d, p0/m, z1\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f64_z_untied, svfloat64_t, + z0 = svnmsb_n_f64_z (p0, z1, z2, 2), + z0 = svnmsb_z (p0, z1, z2, 2)) + +/* +** nmsb_f64_x_tied1: +** fnmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_x_tied1, svfloat64_t, + z0 = svnmsb_f64_x (p0, z0, z1, z2), + z0 = svnmsb_x (p0, z0, z1, z2)) + +/* +** nmsb_f64_x_tied2: +** fnmsb z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_x_tied2, svfloat64_t, + z0 = svnmsb_f64_x (p0, z1, z0, z2), + z0 = svnmsb_x (p0, z1, z0, z2)) + +/* +** nmsb_f64_x_tied3: +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_x_tied3, svfloat64_t, + z0 = svnmsb_f64_x (p0, z1, z2, z0), + z0 = svnmsb_x (p0, z1, z2, z0)) + +/* +** nmsb_f64_x_untied: +** ( +** movprfx z0, z1 +** fnmsb z0\.d, p0/m, z2\.d, z3\.d +** | +** movprfx z0, z2 +** fnmsb z0\.d, p0/m, z1\.d, z3\.d +** | +** movprfx z0, z3 +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (nmsb_f64_x_untied, svfloat64_t, + z0 = svnmsb_f64_x (p0, z1, z2, z3), + z0 = svnmsb_x (p0, z1, z2, z3)) + +/* +** nmsb_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_d4_f64_x_tied1, svfloat64_t, double, + z0 = svnmsb_n_f64_x (p0, z0, z1, d4), + z0 = svnmsb_x (p0, z0, z1, d4)) + +/* +** nmsb_d4_f64_x_tied2: +** mov (z[0-9]+\.d), d4 +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (nmsb_d4_f64_x_tied2, svfloat64_t, double, + z0 = svnmsb_n_f64_x (p0, z1, z0, d4), + z0 = svnmsb_x (p0, z1, z0, d4)) + +/* +** nmsb_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_ZD (nmsb_d4_f64_x_untied, svfloat64_t, double, + z0 = svnmsb_n_f64_x (p0, z1, z2, d4), + z0 = svnmsb_x (p0, z1, z2, d4)) + +/* +** nmsb_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f64_x_tied1, svfloat64_t, + z0 = svnmsb_n_f64_x (p0, z0, z1, 2), + z0 = svnmsb_x (p0, z0, z1, 2)) + +/* +** nmsb_2_f64_x_tied2: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? +** fnmsb z0\.d, p0/m, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f64_x_tied2, svfloat64_t, + z0 = svnmsb_n_f64_x (p0, z1, z0, 2), + z0 = svnmsb_x (p0, z1, z0, 2)) + +/* +** nmsb_2_f64_x_untied: +** fmov z0\.d, #2\.0(?:e\+0)? +** fnmls z0\.d, p0/m, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (nmsb_2_f64_x_untied, svfloat64_t, + z0 = svnmsb_n_f64_x (p0, z1, z2, 2), + z0 = svnmsb_x (p0, z1, z2, 2)) + +/* +** ptrue_nmsb_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f64_x_tied1, svfloat64_t, + z0 = svnmsb_f64_x (svptrue_b64 (), z0, z1, z2), + z0 = svnmsb_x (svptrue_b64 (), z0, z1, z2)) + +/* +** ptrue_nmsb_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f64_x_tied2, svfloat64_t, + z0 = svnmsb_f64_x (svptrue_b64 (), z1, z0, z2), + z0 = svnmsb_x (svptrue_b64 (), z1, z0, z2)) + +/* +** ptrue_nmsb_f64_x_tied3: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f64_x_tied3, svfloat64_t, + z0 = svnmsb_f64_x (svptrue_b64 (), z1, z2, z0), + z0 = svnmsb_x (svptrue_b64 (), z1, z2, z0)) + +/* +** ptrue_nmsb_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_f64_x_untied, svfloat64_t, + z0 = svnmsb_f64_x (svptrue_b64 (), z1, z2, z3), + z0 = svnmsb_x (svptrue_b64 (), z1, z2, z3)) + +/* +** ptrue_nmsb_2_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f64_x_tied1, svfloat64_t, + z0 = svnmsb_n_f64_x (svptrue_b64 (), z0, z1, 2), + z0 = svnmsb_x (svptrue_b64 (), z0, z1, 2)) + +/* +** ptrue_nmsb_2_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f64_x_tied2, svfloat64_t, + z0 = svnmsb_n_f64_x (svptrue_b64 (), z1, z0, 2), + z0 = svnmsb_x (svptrue_b64 (), z1, z0, 2)) + +/* +** ptrue_nmsb_2_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_nmsb_2_f64_x_untied, svfloat64_t, + z0 = svnmsb_n_f64_x (svptrue_b64 (), z1, z2, 2), + z0 = svnmsb_x (svptrue_b64 (), z1, z2, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nor_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nor_b.c new file mode 100644 index 000000000..997e34537 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/nor_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** nor_b_z_tied1: +** nor p0\.b, p3/z, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (nor_b_z_tied1, + p0 = svnor_b_z (p3, p0, p1), + p0 = svnor_z (p3, p0, p1)) + +/* +** nor_b_z_tied2: +** nor p0\.b, p3/z, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (nor_b_z_tied2, + p0 = svnor_b_z (p3, p1, p0), + p0 = svnor_z (p3, p1, p0)) + +/* +** nor_b_z_untied: +** nor p0\.b, p3/z, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (nor_b_z_untied, + p0 = svnor_b_z (p3, p1, p2), + p0 = svnor_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_b.c new file mode 100644 index 000000000..23a3a6aae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_b.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_b_z_tied1: +** not p0\.b, p3/z, p0\.b +** ret +*/ +TEST_UNIFORM_P (not_b_z_tied1, + p0 = svnot_b_z (p3, p0), + p0 = svnot_z (p3, p0)) + +/* +** not_b_z_untied: +** not p0\.b, p3/z, p1\.b +** ret +*/ +TEST_UNIFORM_P (not_b_z_untied, + p0 = svnot_b_z (p3, p1), + p0 = svnot_z (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s16.c new file mode 100644 index 000000000..bacd6b12c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_s16_m_tied12: +** not z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (not_s16_m_tied12, svint16_t, + z0 = svnot_s16_m (z0, p0, z0), + z0 = svnot_m (z0, p0, z0)) + +/* +** not_s16_m_tied1: +** not z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (not_s16_m_tied1, svint16_t, + z0 = svnot_s16_m (z0, p0, z1), + z0 = svnot_m (z0, p0, z1)) + +/* +** not_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** not z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (not_s16_m_tied2, svint16_t, + z0 = svnot_s16_m (z1, p0, z0), + z0 = svnot_m (z1, p0, z0)) + +/* +** not_s16_m_untied: +** movprfx z0, z2 +** not z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (not_s16_m_untied, svint16_t, + z0 = svnot_s16_m (z2, p0, z1), + z0 = svnot_m (z2, p0, z1)) + +/* +** 
not_s16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** not z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (not_s16_z_tied1, svint16_t, + z0 = svnot_s16_z (p0, z0), + z0 = svnot_z (p0, z0)) + +/* +** not_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** not z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (not_s16_z_untied, svint16_t, + z0 = svnot_s16_z (p0, z1), + z0 = svnot_z (p0, z1)) + +/* +** not_s16_x_tied1: +** not z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (not_s16_x_tied1, svint16_t, + z0 = svnot_s16_x (p0, z0), + z0 = svnot_x (p0, z0)) + +/* +** not_s16_x_untied: +** not z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (not_s16_x_untied, svint16_t, + z0 = svnot_s16_x (p0, z1), + z0 = svnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s32.c new file mode 100644 index 000000000..8b15d6e91 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_s32_m_tied12: +** not z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (not_s32_m_tied12, svint32_t, + z0 = svnot_s32_m (z0, p0, z0), + z0 = svnot_m (z0, p0, z0)) + +/* +** not_s32_m_tied1: +** not z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (not_s32_m_tied1, svint32_t, + z0 = svnot_s32_m (z0, p0, z1), + z0 = svnot_m (z0, p0, z1)) + +/* +** not_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** not z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (not_s32_m_tied2, svint32_t, + z0 = svnot_s32_m (z1, p0, z0), + z0 = svnot_m (z1, p0, z0)) + +/* +** not_s32_m_untied: +** movprfx z0, z2 +** not z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (not_s32_m_untied, svint32_t, + z0 = svnot_s32_m (z2, p0, z1), + z0 = svnot_m (z2, p0, z1)) + +/* +** not_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** not z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (not_s32_z_tied1, svint32_t, + z0 = svnot_s32_z (p0, z0), + z0 = svnot_z (p0, z0)) + +/* +** not_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** not z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (not_s32_z_untied, svint32_t, + z0 = svnot_s32_z (p0, z1), + z0 = svnot_z (p0, z1)) + +/* +** not_s32_x_tied1: +** not z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (not_s32_x_tied1, svint32_t, + z0 = svnot_s32_x (p0, z0), + z0 = svnot_x (p0, z0)) + +/* +** not_s32_x_untied: +** not z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (not_s32_x_untied, svint32_t, + z0 = svnot_s32_x (p0, z1), + z0 = svnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s64.c new file mode 100644 index 000000000..8e7f7b9e8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_s64_m_tied12: +** not z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (not_s64_m_tied12, svint64_t, + z0 = svnot_s64_m (z0, p0, z0), + z0 = svnot_m (z0, p0, z0)) + +/* +** not_s64_m_tied1: +** not z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (not_s64_m_tied1, svint64_t, + z0 = svnot_s64_m (z0, p0, z1), + z0 = svnot_m (z0, p0, z1)) + +/* +** not_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** not z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (not_s64_m_tied2, svint64_t, + z0 = svnot_s64_m (z1, p0, z0), + z0 = svnot_m (z1, p0, 
z0)) + +/* +** not_s64_m_untied: +** movprfx z0, z2 +** not z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (not_s64_m_untied, svint64_t, + z0 = svnot_s64_m (z2, p0, z1), + z0 = svnot_m (z2, p0, z1)) + +/* +** not_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** not z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (not_s64_z_tied1, svint64_t, + z0 = svnot_s64_z (p0, z0), + z0 = svnot_z (p0, z0)) + +/* +** not_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** not z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (not_s64_z_untied, svint64_t, + z0 = svnot_s64_z (p0, z1), + z0 = svnot_z (p0, z1)) + +/* +** not_s64_x_tied1: +** not z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (not_s64_x_tied1, svint64_t, + z0 = svnot_s64_x (p0, z0), + z0 = svnot_x (p0, z0)) + +/* +** not_s64_x_untied: +** not z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (not_s64_x_untied, svint64_t, + z0 = svnot_s64_x (p0, z1), + z0 = svnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s8.c new file mode 100644 index 000000000..e807f08f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_s8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_s8_m_tied12: +** not z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (not_s8_m_tied12, svint8_t, + z0 = svnot_s8_m (z0, p0, z0), + z0 = svnot_m (z0, p0, z0)) + +/* +** not_s8_m_tied1: +** not z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (not_s8_m_tied1, svint8_t, + z0 = svnot_s8_m (z0, p0, z1), + z0 = svnot_m (z0, p0, z1)) + +/* +** not_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** not z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (not_s8_m_tied2, svint8_t, + z0 = svnot_s8_m (z1, p0, z0), + z0 = svnot_m (z1, p0, z0)) + +/* +** not_s8_m_untied: +** movprfx z0, z2 +** not z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (not_s8_m_untied, svint8_t, + z0 = svnot_s8_m (z2, p0, z1), + z0 = svnot_m (z2, p0, z1)) + +/* +** not_s8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** not z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (not_s8_z_tied1, svint8_t, + z0 = svnot_s8_z (p0, z0), + z0 = svnot_z (p0, z0)) + +/* +** not_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** not z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (not_s8_z_untied, svint8_t, + z0 = svnot_s8_z (p0, z1), + z0 = svnot_z (p0, z1)) + +/* +** not_s8_x_tied1: +** not z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (not_s8_x_tied1, svint8_t, + z0 = svnot_s8_x (p0, z0), + z0 = svnot_x (p0, z0)) + +/* +** not_s8_x_untied: +** not z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (not_s8_x_untied, svint8_t, + z0 = svnot_s8_x (p0, z1), + z0 = svnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u16.c new file mode 100644 index 000000000..c812005f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_u16_m_tied12: +** not z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (not_u16_m_tied12, svuint16_t, + z0 = svnot_u16_m (z0, p0, z0), + z0 = svnot_m (z0, p0, z0)) + +/* +** not_u16_m_tied1: +** not z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (not_u16_m_tied1, svuint16_t, + z0 = svnot_u16_m (z0, p0, z1), + z0 = svnot_m (z0, p0, z1)) + +/* +** not_u16_m_tied2: +** mov 
(z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** not z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (not_u16_m_tied2, svuint16_t, + z0 = svnot_u16_m (z1, p0, z0), + z0 = svnot_m (z1, p0, z0)) + +/* +** not_u16_m_untied: +** movprfx z0, z2 +** not z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (not_u16_m_untied, svuint16_t, + z0 = svnot_u16_m (z2, p0, z1), + z0 = svnot_m (z2, p0, z1)) + +/* +** not_u16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** not z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (not_u16_z_tied1, svuint16_t, + z0 = svnot_u16_z (p0, z0), + z0 = svnot_z (p0, z0)) + +/* +** not_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** not z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (not_u16_z_untied, svuint16_t, + z0 = svnot_u16_z (p0, z1), + z0 = svnot_z (p0, z1)) + +/* +** not_u16_x_tied1: +** not z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (not_u16_x_tied1, svuint16_t, + z0 = svnot_u16_x (p0, z0), + z0 = svnot_x (p0, z0)) + +/* +** not_u16_x_untied: +** not z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (not_u16_x_untied, svuint16_t, + z0 = svnot_u16_x (p0, z1), + z0 = svnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u32.c new file mode 100644 index 000000000..7b7e9ca21 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_u32_m_tied12: +** not z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (not_u32_m_tied12, svuint32_t, + z0 = svnot_u32_m (z0, p0, z0), + z0 = svnot_m (z0, p0, z0)) + +/* +** not_u32_m_tied1: +** not z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (not_u32_m_tied1, svuint32_t, + z0 = svnot_u32_m (z0, p0, z1), + z0 = svnot_m (z0, p0, z1)) + +/* +** not_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** not z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (not_u32_m_tied2, svuint32_t, + z0 = svnot_u32_m (z1, p0, z0), + z0 = svnot_m (z1, p0, z0)) + +/* +** not_u32_m_untied: +** movprfx z0, z2 +** not z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (not_u32_m_untied, svuint32_t, + z0 = svnot_u32_m (z2, p0, z1), + z0 = svnot_m (z2, p0, z1)) + +/* +** not_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** not z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (not_u32_z_tied1, svuint32_t, + z0 = svnot_u32_z (p0, z0), + z0 = svnot_z (p0, z0)) + +/* +** not_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** not z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (not_u32_z_untied, svuint32_t, + z0 = svnot_u32_z (p0, z1), + z0 = svnot_z (p0, z1)) + +/* +** not_u32_x_tied1: +** not z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (not_u32_x_tied1, svuint32_t, + z0 = svnot_u32_x (p0, z0), + z0 = svnot_x (p0, z0)) + +/* +** not_u32_x_untied: +** not z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (not_u32_x_untied, svuint32_t, + z0 = svnot_u32_x (p0, z1), + z0 = svnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u64.c new file mode 100644 index 000000000..27b92ad84 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_u64_m_tied12: +** not z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (not_u64_m_tied12, svuint64_t, + z0 = svnot_u64_m (z0, p0, z0), + z0 = 
svnot_m (z0, p0, z0)) + +/* +** not_u64_m_tied1: +** not z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (not_u64_m_tied1, svuint64_t, + z0 = svnot_u64_m (z0, p0, z1), + z0 = svnot_m (z0, p0, z1)) + +/* +** not_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** not z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (not_u64_m_tied2, svuint64_t, + z0 = svnot_u64_m (z1, p0, z0), + z0 = svnot_m (z1, p0, z0)) + +/* +** not_u64_m_untied: +** movprfx z0, z2 +** not z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (not_u64_m_untied, svuint64_t, + z0 = svnot_u64_m (z2, p0, z1), + z0 = svnot_m (z2, p0, z1)) + +/* +** not_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** not z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (not_u64_z_tied1, svuint64_t, + z0 = svnot_u64_z (p0, z0), + z0 = svnot_z (p0, z0)) + +/* +** not_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** not z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (not_u64_z_untied, svuint64_t, + z0 = svnot_u64_z (p0, z1), + z0 = svnot_z (p0, z1)) + +/* +** not_u64_x_tied1: +** not z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (not_u64_x_tied1, svuint64_t, + z0 = svnot_u64_x (p0, z0), + z0 = svnot_x (p0, z0)) + +/* +** not_u64_x_untied: +** not z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (not_u64_x_untied, svuint64_t, + z0 = svnot_u64_x (p0, z1), + z0 = svnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u8.c new file mode 100644 index 000000000..bd2f36cad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/not_u8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** not_u8_m_tied12: +** not z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (not_u8_m_tied12, svuint8_t, + z0 = svnot_u8_m (z0, p0, z0), + z0 = svnot_m (z0, p0, z0)) + +/* +** not_u8_m_tied1: +** not z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (not_u8_m_tied1, svuint8_t, + z0 = svnot_u8_m (z0, p0, z1), + z0 = svnot_m (z0, p0, z1)) + +/* +** not_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** not z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (not_u8_m_tied2, svuint8_t, + z0 = svnot_u8_m (z1, p0, z0), + z0 = svnot_m (z1, p0, z0)) + +/* +** not_u8_m_untied: +** movprfx z0, z2 +** not z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (not_u8_m_untied, svuint8_t, + z0 = svnot_u8_m (z2, p0, z1), + z0 = svnot_m (z2, p0, z1)) + +/* +** not_u8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** not z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (not_u8_z_tied1, svuint8_t, + z0 = svnot_u8_z (p0, z0), + z0 = svnot_z (p0, z0)) + +/* +** not_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** not z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (not_u8_z_untied, svuint8_t, + z0 = svnot_u8_z (p0, z1), + z0 = svnot_z (p0, z1)) + +/* +** not_u8_x_tied1: +** not z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (not_u8_x_tied1, svuint8_t, + z0 = svnot_u8_x (p0, z0), + z0 = svnot_x (p0, z0)) + +/* +** not_u8_x_untied: +** not z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (not_u8_x_untied, svuint8_t, + z0 = svnot_u8_x (p0, z1), + z0 = svnot_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orn_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orn_b.c new file mode 100644 index 000000000..423a18bc7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orn_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + 
+#include "test_sve_acle.h" + +/* +** orn_b_z_tied1: +** orn p0\.b, p3/z, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (orn_b_z_tied1, + p0 = svorn_b_z (p3, p0, p1), + p0 = svorn_z (p3, p0, p1)) + +/* +** orn_b_z_tied2: +** orn p0\.b, p3/z, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (orn_b_z_tied2, + p0 = svorn_b_z (p3, p1, p0), + p0 = svorn_z (p3, p1, p0)) + +/* +** orn_b_z_untied: +** orn p0\.b, p3/z, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (orn_b_z_untied, + p0 = svorn_b_z (p3, p1, p2), + p0 = svorn_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_b.c new file mode 100644 index 000000000..fba9ba7df --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_b_z_tied1: +** orr p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) +** ret +*/ +TEST_UNIFORM_P (orr_b_z_tied1, + p0 = svorr_b_z (p3, p0, p1), + p0 = svorr_z (p3, p0, p1)) + +/* +** orr_b_z_tied2: +** orr p0\.b, p3/z, (p0\.b, p1\.b|p1\.b, p0\.b) +** ret +*/ +TEST_UNIFORM_P (orr_b_z_tied2, + p0 = svorr_b_z (p3, p1, p0), + p0 = svorr_z (p3, p1, p0)) + +/* +** orr_b_z_untied: +** orr p0\.b, p3/z, (p1\.b, p2\.b|p2\.b, p1\.b) +** ret +*/ +TEST_UNIFORM_P (orr_b_z_untied, + p0 = svorr_b_z (p3, p1, p2), + p0 = svorr_z (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s16.c new file mode 100644 index 000000000..62b707a9c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s16.c @@ -0,0 +1,376 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_s16_m_tied1: +** orr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (orr_s16_m_tied1, svint16_t, + z0 = svorr_s16_m (p0, z0, z1), + z0 = svorr_m (p0, z0, z1)) + +/* +** orr_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** orr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (orr_s16_m_tied2, svint16_t, + z0 = svorr_s16_m (p0, z1, z0), + z0 = svorr_m (p0, z1, z0)) + +/* +** orr_s16_m_untied: +** movprfx z0, z1 +** orr z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (orr_s16_m_untied, svint16_t, + z0 = svorr_s16_m (p0, z1, z2), + z0 = svorr_m (p0, z1, z2)) + +/* +** orr_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svorr_n_s16_m (p0, z0, x0), + z0 = svorr_m (p0, z0, x0)) + +/* +** orr_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s16_m_untied, svint16_t, int16_t, + z0 = svorr_n_s16_m (p0, z1, x0), + z0 = svorr_m (p0, z1, x0)) + +/* +** orr_1_s16_m_tied1: +** mov (z[0-9]+\.h), #1 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s16_m_tied1, svint16_t, + z0 = svorr_n_s16_m (p0, z0, 1), + z0 = svorr_m (p0, z0, 1)) + +/* +** orr_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s16_m_untied, svint16_t, + z0 = svorr_n_s16_m (p0, z1, 1), + z0 = svorr_m (p0, z1, 1)) + +/* +** orr_m2_s16_m: +** mov (z[0-9]+\.h), #-2 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_m2_s16_m, svint16_t, + z0 = svorr_n_s16_m (p0, z0, -2), + z0 = svorr_m (p0, z0, -2)) + +/* +** orr_s16_z_tied1: +** movprfx 
z0\.h, p0/z, z0\.h +** orr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (orr_s16_z_tied1, svint16_t, + z0 = svorr_s16_z (p0, z0, z1), + z0 = svorr_z (p0, z0, z1)) + +/* +** orr_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** orr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (orr_s16_z_tied2, svint16_t, + z0 = svorr_s16_z (p0, z1, z0), + z0 = svorr_z (p0, z1, z0)) + +/* +** orr_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** orr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** orr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_s16_z_untied, svint16_t, + z0 = svorr_s16_z (p0, z1, z2), + z0 = svorr_z (p0, z1, z2)) + +/* +** orr_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svorr_n_s16_z (p0, z0, x0), + z0 = svorr_z (p0, z0, x0)) + +/* +** orr_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** orr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** orr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s16_z_untied, svint16_t, int16_t, + z0 = svorr_n_s16_z (p0, z1, x0), + z0 = svorr_z (p0, z1, x0)) + +/* +** orr_1_s16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s16_z_tied1, svint16_t, + z0 = svorr_n_s16_z (p0, z0, 1), + z0 = svorr_z (p0, z0, 1)) + +/* +** orr_1_s16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** orr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** orr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_1_s16_z_untied, svint16_t, + z0 = svorr_n_s16_z (p0, z1, 1), + z0 = svorr_z (p0, z1, 1)) + +/* +** orr_s16_x_tied1: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s16_x_tied1, svint16_t, + z0 = svorr_s16_x (p0, z0, z1), + z0 = svorr_x (p0, z0, z1)) + +/* +** orr_s16_x_tied2: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s16_x_tied2, svint16_t, + z0 = svorr_s16_x (p0, z1, z0), + z0 = svorr_x (p0, z1, z0)) + +/* +** orr_s16_x_untied: +** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s16_x_untied, svint16_t, + z0 = svorr_s16_x (p0, z1, z2), + z0 = svorr_x (p0, z1, z2)) + +/* +** orr_w0_s16_x_tied1: +** mov (z[0-9]+)\.h, w0 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svorr_n_s16_x (p0, z0, x0), + z0 = svorr_x (p0, z0, x0)) + +/* +** orr_w0_s16_x_untied: +** mov (z[0-9]+)\.h, w0 +** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s16_x_untied, svint16_t, int16_t, + z0 = svorr_n_s16_x (p0, z1, x0), + z0 = svorr_x (p0, z1, x0)) + +/* +** orr_1_s16_x_tied1: +** orr z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s16_x_tied1, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 1), + z0 = svorr_x (p0, z0, 1)) + +/* +** orr_1_s16_x_untied: +** movprfx z0, z1 +** orr z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s16_x_untied, svint16_t, + z0 = svorr_n_s16_x (p0, z1, 1), + z0 = svorr_x (p0, z1, 1)) + +/* +** orr_127_s16_x: +** orr z0\.h, z0\.h, #0x7f +** ret +*/ +TEST_UNIFORM_Z (orr_127_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 127), + z0 = svorr_x (p0, z0, 127)) + +/* +** orr_128_s16_x: +** orr z0\.h, z0\.h, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_128_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 128), + z0 = 
svorr_x (p0, z0, 128)) + +/* +** orr_255_s16_x: +** orr z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (orr_255_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 255), + z0 = svorr_x (p0, z0, 255)) + +/* +** orr_256_s16_x: +** orr z0\.h, z0\.h, #0x100 +** ret +*/ +TEST_UNIFORM_Z (orr_256_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 256), + z0 = svorr_x (p0, z0, 256)) + +/* +** orr_257_s16_x: +** orr z0\.h, z0\.h, #0x101 +** ret +*/ +TEST_UNIFORM_Z (orr_257_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 257), + z0 = svorr_x (p0, z0, 257)) + +/* +** orr_512_s16_x: +** orr z0\.h, z0\.h, #0x200 +** ret +*/ +TEST_UNIFORM_Z (orr_512_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 512), + z0 = svorr_x (p0, z0, 512)) + +/* +** orr_65280_s16_x: +** orr z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (orr_65280_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 0xff00), + z0 = svorr_x (p0, z0, 0xff00)) + +/* +** orr_m127_s16_x: +** orr z0\.h, z0\.h, #0xff81 +** ret +*/ +TEST_UNIFORM_Z (orr_m127_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, -127), + z0 = svorr_x (p0, z0, -127)) + +/* +** orr_m128_s16_x: +** orr z0\.h, z0\.h, #0xff80 +** ret +*/ +TEST_UNIFORM_Z (orr_m128_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, -128), + z0 = svorr_x (p0, z0, -128)) + +/* +** orr_m255_s16_x: +** orr z0\.h, z0\.h, #0xff01 +** ret +*/ +TEST_UNIFORM_Z (orr_m255_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, -255), + z0 = svorr_x (p0, z0, -255)) + +/* +** orr_m256_s16_x: +** orr z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (orr_m256_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, -256), + z0 = svorr_x (p0, z0, -256)) + +/* +** orr_m257_s16_x: +** orr z0\.h, z0\.h, #0xfeff +** ret +*/ +TEST_UNIFORM_Z (orr_m257_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, -257), + z0 = svorr_x (p0, z0, -257)) + +/* +** orr_m512_s16_x: +** orr z0\.h, z0\.h, #0xfe00 +** ret +*/ +TEST_UNIFORM_Z (orr_m512_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, -512), + z0 = svorr_x (p0, z0, -512)) + +/* +** orr_m32768_s16_x: +** orr z0\.h, z0\.h, #0x8000 +** ret +*/ +TEST_UNIFORM_Z (orr_m32768_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, -0x8000), + z0 = svorr_x (p0, z0, -0x8000)) + +/* +** orr_5_s16_x: +** mov (z[0-9]+)\.h, #5 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_5_s16_x, svint16_t, + z0 = svorr_n_s16_x (p0, z0, 5), + z0 = svorr_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s32.c new file mode 100644 index 000000000..2e0e1e888 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s32.c @@ -0,0 +1,372 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_s32_m_tied1: +** orr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (orr_s32_m_tied1, svint32_t, + z0 = svorr_s32_m (p0, z0, z1), + z0 = svorr_m (p0, z0, z1)) + +/* +** orr_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** orr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (orr_s32_m_tied2, svint32_t, + z0 = svorr_s32_m (p0, z1, z0), + z0 = svorr_m (p0, z1, z0)) + +/* +** orr_s32_m_untied: +** movprfx z0, z1 +** orr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (orr_s32_m_untied, svint32_t, + z0 = svorr_s32_m (p0, z1, z2), + z0 = svorr_m (p0, z1, z2)) + +/* +** orr_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svorr_n_s32_m (p0, z0, x0), + z0 
= svorr_m (p0, z0, x0)) + +/* +** orr_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s32_m_untied, svint32_t, int32_t, + z0 = svorr_n_s32_m (p0, z1, x0), + z0 = svorr_m (p0, z1, x0)) + +/* +** orr_1_s32_m_tied1: +** mov (z[0-9]+\.s), #1 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s32_m_tied1, svint32_t, + z0 = svorr_n_s32_m (p0, z0, 1), + z0 = svorr_m (p0, z0, 1)) + +/* +** orr_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s32_m_untied, svint32_t, + z0 = svorr_n_s32_m (p0, z1, 1), + z0 = svorr_m (p0, z1, 1)) + +/* +** orr_m2_s32_m: +** mov (z[0-9]+\.s), #-2 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_m2_s32_m, svint32_t, + z0 = svorr_n_s32_m (p0, z0, -2), + z0 = svorr_m (p0, z0, -2)) + +/* +** orr_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** orr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (orr_s32_z_tied1, svint32_t, + z0 = svorr_s32_z (p0, z0, z1), + z0 = svorr_z (p0, z0, z1)) + +/* +** orr_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** orr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (orr_s32_z_tied2, svint32_t, + z0 = svorr_s32_z (p0, z1, z0), + z0 = svorr_z (p0, z1, z0)) + +/* +** orr_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** orr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** orr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_s32_z_untied, svint32_t, + z0 = svorr_s32_z (p0, z1, z2), + z0 = svorr_z (p0, z1, z2)) + +/* +** orr_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svorr_n_s32_z (p0, z0, x0), + z0 = svorr_z (p0, z0, x0)) + +/* +** orr_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** orr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** orr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s32_z_untied, svint32_t, int32_t, + z0 = svorr_n_s32_z (p0, z1, x0), + z0 = svorr_z (p0, z1, x0)) + +/* +** orr_1_s32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s32_z_tied1, svint32_t, + z0 = svorr_n_s32_z (p0, z0, 1), + z0 = svorr_z (p0, z0, 1)) + +/* +** orr_1_s32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** orr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** orr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_1_s32_z_untied, svint32_t, + z0 = svorr_n_s32_z (p0, z1, 1), + z0 = svorr_z (p0, z1, 1)) + +/* +** orr_s32_x_tied1: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s32_x_tied1, svint32_t, + z0 = svorr_s32_x (p0, z0, z1), + z0 = svorr_x (p0, z0, z1)) + +/* +** orr_s32_x_tied2: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s32_x_tied2, svint32_t, + z0 = svorr_s32_x (p0, z1, z0), + z0 = svorr_x (p0, z1, z0)) + +/* +** orr_s32_x_untied: +** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s32_x_untied, svint32_t, + z0 = svorr_s32_x (p0, z1, z2), + z0 = svorr_x (p0, z1, z2)) + +/* +** orr_w0_s32_x_tied1: +** mov (z[0-9]+)\.s, w0 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svorr_n_s32_x (p0, z0, x0), + z0 = 
svorr_x (p0, z0, x0)) + +/* +** orr_w0_s32_x_untied: +** mov (z[0-9]+)\.s, w0 +** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s32_x_untied, svint32_t, int32_t, + z0 = svorr_n_s32_x (p0, z1, x0), + z0 = svorr_x (p0, z1, x0)) + +/* +** orr_1_s32_x_tied1: +** orr z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s32_x_tied1, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 1), + z0 = svorr_x (p0, z0, 1)) + +/* +** orr_1_s32_x_untied: +** movprfx z0, z1 +** orr z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s32_x_untied, svint32_t, + z0 = svorr_n_s32_x (p0, z1, 1), + z0 = svorr_x (p0, z1, 1)) + +/* +** orr_127_s32_x: +** orr z0\.s, z0\.s, #0x7f +** ret +*/ +TEST_UNIFORM_Z (orr_127_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 127), + z0 = svorr_x (p0, z0, 127)) + +/* +** orr_128_s32_x: +** orr z0\.s, z0\.s, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_128_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 128), + z0 = svorr_x (p0, z0, 128)) + +/* +** orr_255_s32_x: +** orr z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (orr_255_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 255), + z0 = svorr_x (p0, z0, 255)) + +/* +** orr_256_s32_x: +** orr z0\.s, z0\.s, #0x100 +** ret +*/ +TEST_UNIFORM_Z (orr_256_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 256), + z0 = svorr_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (orr_257_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 257), + z0 = svorr_x (p0, z0, 257)) + +/* +** orr_512_s32_x: +** orr z0\.s, z0\.s, #0x200 +** ret +*/ +TEST_UNIFORM_Z (orr_512_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 512), + z0 = svorr_x (p0, z0, 512)) + +/* +** orr_65280_s32_x: +** orr z0\.s, z0\.s, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (orr_65280_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 0xff00), + z0 = svorr_x (p0, z0, 0xff00)) + +/* +** orr_m127_s32_x: +** orr z0\.s, z0\.s, #0xffffff81 +** ret +*/ +TEST_UNIFORM_Z (orr_m127_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, -127), + z0 = svorr_x (p0, z0, -127)) + +/* +** orr_m128_s32_x: +** orr z0\.s, z0\.s, #0xffffff80 +** ret +*/ +TEST_UNIFORM_Z (orr_m128_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, -128), + z0 = svorr_x (p0, z0, -128)) + +/* +** orr_m255_s32_x: +** orr z0\.s, z0\.s, #0xffffff01 +** ret +*/ +TEST_UNIFORM_Z (orr_m255_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, -255), + z0 = svorr_x (p0, z0, -255)) + +/* +** orr_m256_s32_x: +** orr z0\.s, z0\.s, #0xffffff00 +** ret +*/ +TEST_UNIFORM_Z (orr_m256_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, -256), + z0 = svorr_x (p0, z0, -256)) + +/* +** orr_m257_s32_x: +** orr z0\.s, z0\.s, #0xfffffeff +** ret +*/ +TEST_UNIFORM_Z (orr_m257_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, -257), + z0 = svorr_x (p0, z0, -257)) + +/* +** orr_m512_s32_x: +** orr z0\.s, z0\.s, #0xfffffe00 +** ret +*/ +TEST_UNIFORM_Z (orr_m512_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, -512), + z0 = svorr_x (p0, z0, -512)) + +/* +** orr_m32768_s32_x: +** orr z0\.s, z0\.s, #0xffff8000 +** ret +*/ +TEST_UNIFORM_Z (orr_m32768_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, -0x8000), + z0 = svorr_x (p0, z0, -0x8000)) + +/* +** orr_5_s32_x: +** mov (z[0-9]+)\.s, #5 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_5_s32_x, svint32_t, + z0 = svorr_n_s32_x (p0, z0, 5), + z0 = svorr_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s64.c new file mode 100644 index 000000000..1538fdd14 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s64.c @@ -0,0 +1,372 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_s64_m_tied1: +** orr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (orr_s64_m_tied1, svint64_t, + z0 = svorr_s64_m (p0, z0, z1), + z0 = svorr_m (p0, z0, z1)) + +/* +** orr_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_s64_m_tied2, svint64_t, + z0 = svorr_s64_m (p0, z1, z0), + z0 = svorr_m (p0, z1, z0)) + +/* +** orr_s64_m_untied: +** movprfx z0, z1 +** orr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (orr_s64_m_untied, svint64_t, + z0 = svorr_s64_m (p0, z1, z2), + z0 = svorr_m (p0, z1, z2)) + +/* +** orr_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svorr_n_s64_m (p0, z0, x0), + z0 = svorr_m (p0, z0, x0)) + +/* +** orr_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_s64_m_untied, svint64_t, int64_t, + z0 = svorr_n_s64_m (p0, z1, x0), + z0 = svorr_m (p0, z1, x0)) + +/* +** orr_1_s64_m_tied1: +** mov (z[0-9]+\.d), #1 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s64_m_tied1, svint64_t, + z0 = svorr_n_s64_m (p0, z0, 1), + z0 = svorr_m (p0, z0, 1)) + +/* +** orr_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s64_m_untied, svint64_t, + z0 = svorr_n_s64_m (p0, z1, 1), + z0 = svorr_m (p0, z1, 1)) + +/* +** orr_m2_s64_m: +** mov (z[0-9]+\.d), #-2 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_m2_s64_m, svint64_t, + z0 = svorr_n_s64_m (p0, z0, -2), + z0 = svorr_m (p0, z0, -2)) + +/* +** orr_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** orr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (orr_s64_z_tied1, svint64_t, + z0 = svorr_s64_z (p0, z0, z1), + z0 = svorr_z (p0, z0, z1)) + +/* +** orr_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** orr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (orr_s64_z_tied2, svint64_t, + z0 = svorr_s64_z (p0, z1, z0), + z0 = svorr_z (p0, z1, z0)) + +/* +** orr_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** orr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** orr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_s64_z_untied, svint64_t, + z0 = svorr_s64_z (p0, z1, z2), + z0 = svorr_z (p0, z1, z2)) + +/* +** orr_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svorr_n_s64_z (p0, z0, x0), + z0 = svorr_z (p0, z0, x0)) + +/* +** orr_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** orr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** orr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_s64_z_untied, svint64_t, int64_t, + z0 = svorr_n_s64_z (p0, z1, x0), + z0 = svorr_z (p0, z1, x0)) + +/* +** orr_1_s64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s64_z_tied1, svint64_t, + z0 = svorr_n_s64_z (p0, z0, 1), + z0 = svorr_z (p0, z0, 1)) + +/* +** orr_1_s64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** orr z0\.d, p0/m, z0\.d, \1 +** 
| +** movprfx z0\.d, p0/z, \1 +** orr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_1_s64_z_untied, svint64_t, + z0 = svorr_n_s64_z (p0, z1, 1), + z0 = svorr_z (p0, z1, 1)) + +/* +** orr_s64_x_tied1: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s64_x_tied1, svint64_t, + z0 = svorr_s64_x (p0, z0, z1), + z0 = svorr_x (p0, z0, z1)) + +/* +** orr_s64_x_tied2: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s64_x_tied2, svint64_t, + z0 = svorr_s64_x (p0, z1, z0), + z0 = svorr_x (p0, z1, z0)) + +/* +** orr_s64_x_untied: +** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s64_x_untied, svint64_t, + z0 = svorr_s64_x (p0, z1, z2), + z0 = svorr_x (p0, z1, z2)) + +/* +** orr_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** orr z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svorr_n_s64_x (p0, z0, x0), + z0 = svorr_x (p0, z0, x0)) + +/* +** orr_x0_s64_x_untied: +** mov (z[0-9]+\.d), x0 +** orr z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_s64_x_untied, svint64_t, int64_t, + z0 = svorr_n_s64_x (p0, z1, x0), + z0 = svorr_x (p0, z1, x0)) + +/* +** orr_1_s64_x_tied1: +** orr z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s64_x_tied1, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 1), + z0 = svorr_x (p0, z0, 1)) + +/* +** orr_1_s64_x_untied: +** movprfx z0, z1 +** orr z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s64_x_untied, svint64_t, + z0 = svorr_n_s64_x (p0, z1, 1), + z0 = svorr_x (p0, z1, 1)) + +/* +** orr_127_s64_x: +** orr z0\.d, z0\.d, #0x7f +** ret +*/ +TEST_UNIFORM_Z (orr_127_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 127), + z0 = svorr_x (p0, z0, 127)) + +/* +** orr_128_s64_x: +** orr z0\.d, z0\.d, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_128_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 128), + z0 = svorr_x (p0, z0, 128)) + +/* +** orr_255_s64_x: +** orr z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (orr_255_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 255), + z0 = svorr_x (p0, z0, 255)) + +/* +** orr_256_s64_x: +** orr z0\.d, z0\.d, #0x100 +** ret +*/ +TEST_UNIFORM_Z (orr_256_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 256), + z0 = svorr_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (orr_257_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 257), + z0 = svorr_x (p0, z0, 257)) + +/* +** orr_512_s64_x: +** orr z0\.d, z0\.d, #0x200 +** ret +*/ +TEST_UNIFORM_Z (orr_512_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 512), + z0 = svorr_x (p0, z0, 512)) + +/* +** orr_65280_s64_x: +** orr z0\.d, z0\.d, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (orr_65280_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 0xff00), + z0 = svorr_x (p0, z0, 0xff00)) + +/* +** orr_m127_s64_x: +** orr z0\.d, z0\.d, #0xffffffffffffff81 +** ret +*/ +TEST_UNIFORM_Z (orr_m127_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, -127), + z0 = svorr_x (p0, z0, -127)) + +/* +** orr_m128_s64_x: +** orr z0\.d, z0\.d, #0xffffffffffffff80 +** ret +*/ +TEST_UNIFORM_Z (orr_m128_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, -128), + z0 = svorr_x (p0, z0, -128)) + +/* +** orr_m255_s64_x: +** orr z0\.d, z0\.d, #0xffffffffffffff01 +** ret +*/ +TEST_UNIFORM_Z (orr_m255_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, -255), + z0 = svorr_x (p0, z0, -255)) + +/* +** orr_m256_s64_x: +** orr z0\.d, z0\.d, #0xffffffffffffff00 +** ret +*/ +TEST_UNIFORM_Z (orr_m256_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, -256), + z0 = svorr_x (p0, z0, -256)) + +/* +** orr_m257_s64_x: +** orr z0\.d, z0\.d, #0xfffffffffffffeff +** ret +*/ +TEST_UNIFORM_Z (orr_m257_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, -257), + z0 = svorr_x (p0, z0, -257)) + +/* +** orr_m512_s64_x: +** orr z0\.d, z0\.d, #0xfffffffffffffe00 +** ret +*/ +TEST_UNIFORM_Z (orr_m512_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, -512), + z0 = svorr_x (p0, z0, -512)) + +/* +** orr_m32768_s64_x: +** orr z0\.d, z0\.d, #0xffffffffffff8000 +** ret +*/ +TEST_UNIFORM_Z (orr_m32768_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, -0x8000), + z0 = svorr_x (p0, z0, -0x8000)) + +/* +** orr_5_s64_x: +** mov (z[0-9]+\.d), #5 +** orr z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_5_s64_x, svint64_t, + z0 = svorr_n_s64_x (p0, z0, 5), + z0 = svorr_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s8.c new file mode 100644 index 000000000..b6483b6e7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_s8.c @@ -0,0 +1,295 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_s8_m_tied1: +** orr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (orr_s8_m_tied1, svint8_t, + z0 = svorr_s8_m (p0, z0, z1), + z0 = svorr_m (p0, z0, z1)) + +/* +** orr_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** orr z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (orr_s8_m_tied2, svint8_t, + z0 = svorr_s8_m (p0, z1, z0), + z0 = svorr_m (p0, z1, z0)) + +/* +** orr_s8_m_untied: +** movprfx z0, z1 +** orr z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (orr_s8_m_untied, svint8_t, + z0 = svorr_s8_m (p0, z1, z2), + z0 = svorr_m (p0, z1, z2)) + +/* +** orr_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svorr_n_s8_m (p0, z0, x0), + z0 = svorr_m (p0, z0, x0)) + +/* +** orr_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s8_m_untied, svint8_t, int8_t, + z0 = svorr_n_s8_m (p0, z1, x0), + z0 = svorr_m (p0, z1, x0)) + +/* +** orr_1_s8_m_tied1: +** mov (z[0-9]+\.b), #1 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ 
+TEST_UNIFORM_Z (orr_1_s8_m_tied1, svint8_t, + z0 = svorr_n_s8_m (p0, z0, 1), + z0 = svorr_m (p0, z0, 1)) + +/* +** orr_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s8_m_untied, svint8_t, + z0 = svorr_n_s8_m (p0, z1, 1), + z0 = svorr_m (p0, z1, 1)) + +/* +** orr_m2_s8_m: +** mov (z[0-9]+\.b), #-2 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_m2_s8_m, svint8_t, + z0 = svorr_n_s8_m (p0, z0, -2), + z0 = svorr_m (p0, z0, -2)) + +/* +** orr_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** orr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (orr_s8_z_tied1, svint8_t, + z0 = svorr_s8_z (p0, z0, z1), + z0 = svorr_z (p0, z0, z1)) + +/* +** orr_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** orr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (orr_s8_z_tied2, svint8_t, + z0 = svorr_s8_z (p0, z1, z0), + z0 = svorr_z (p0, z1, z0)) + +/* +** orr_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** orr z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** orr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_s8_z_untied, svint8_t, + z0 = svorr_s8_z (p0, z1, z2), + z0 = svorr_z (p0, z1, z2)) + +/* +** orr_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svorr_n_s8_z (p0, z0, x0), + z0 = svorr_z (p0, z0, x0)) + +/* +** orr_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** orr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** orr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s8_z_untied, svint8_t, int8_t, + z0 = svorr_n_s8_z (p0, z1, x0), + z0 = svorr_z (p0, z1, x0)) + +/* +** orr_1_s8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s8_z_tied1, svint8_t, + z0 = svorr_n_s8_z (p0, z0, 1), + z0 = svorr_z (p0, z0, 1)) + +/* +** orr_1_s8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** orr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** orr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_1_s8_z_untied, svint8_t, + z0 = svorr_n_s8_z (p0, z1, 1), + z0 = svorr_z (p0, z1, 1)) + +/* +** orr_s8_x_tied1: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s8_x_tied1, svint8_t, + z0 = svorr_s8_x (p0, z0, z1), + z0 = svorr_x (p0, z0, z1)) + +/* +** orr_s8_x_tied2: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s8_x_tied2, svint8_t, + z0 = svorr_s8_x (p0, z1, z0), + z0 = svorr_x (p0, z1, z0)) + +/* +** orr_s8_x_untied: +** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_s8_x_untied, svint8_t, + z0 = svorr_s8_x (p0, z1, z2), + z0 = svorr_x (p0, z1, z2)) + +/* +** orr_w0_s8_x_tied1: +** mov (z[0-9]+)\.b, w0 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svorr_n_s8_x (p0, z0, x0), + z0 = svorr_x (p0, z0, x0)) + +/* +** orr_w0_s8_x_untied: +** mov (z[0-9]+)\.b, w0 +** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_s8_x_untied, svint8_t, int8_t, + z0 = svorr_n_s8_x (p0, z1, x0), + z0 = svorr_x (p0, z1, x0)) + +/* +** orr_1_s8_x_tied1: +** orr z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s8_x_tied1, svint8_t, + z0 = svorr_n_s8_x (p0, z0, 1), + z0 = svorr_x (p0, z0, 1)) 
+ +/* +** orr_1_s8_x_untied: +** movprfx z0, z1 +** orr z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_s8_x_untied, svint8_t, + z0 = svorr_n_s8_x (p0, z1, 1), + z0 = svorr_x (p0, z1, 1)) + +/* +** orr_127_s8_x: +** orr z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (orr_127_s8_x, svint8_t, + z0 = svorr_n_s8_x (p0, z0, 127), + z0 = svorr_x (p0, z0, 127)) + +/* +** orr_128_s8_x: +** orr z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_128_s8_x, svint8_t, + z0 = svorr_n_s8_x (p0, z0, 128), + z0 = svorr_x (p0, z0, 128)) + +/* +** orr_255_s8_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (orr_255_s8_x, svint8_t, + z0 = svorr_n_s8_x (p0, z0, 255), + z0 = svorr_x (p0, z0, 255)) + +/* +** orr_m127_s8_x: +** orr z0\.b, z0\.b, #0x81 +** ret +*/ +TEST_UNIFORM_Z (orr_m127_s8_x, svint8_t, + z0 = svorr_n_s8_x (p0, z0, -127), + z0 = svorr_x (p0, z0, -127)) + +/* +** orr_m128_s8_x: +** orr z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_m128_s8_x, svint8_t, + z0 = svorr_n_s8_x (p0, z0, -128), + z0 = svorr_x (p0, z0, -128)) + +/* +** orr_5_s8_x: +** mov (z[0-9]+)\.b, #5 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_5_s8_x, svint8_t, + z0 = svorr_n_s8_x (p0, z0, 5), + z0 = svorr_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u16.c new file mode 100644 index 000000000..000a0444c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u16.c @@ -0,0 +1,376 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_u16_m_tied1: +** orr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (orr_u16_m_tied1, svuint16_t, + z0 = svorr_u16_m (p0, z0, z1), + z0 = svorr_m (p0, z0, z1)) + +/* +** orr_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** orr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (orr_u16_m_tied2, svuint16_t, + z0 = svorr_u16_m (p0, z1, z0), + z0 = svorr_m (p0, z1, z0)) + +/* +** orr_u16_m_untied: +** movprfx z0, z1 +** orr z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (orr_u16_m_untied, svuint16_t, + z0 = svorr_u16_m (p0, z1, z2), + z0 = svorr_m (p0, z1, z2)) + +/* +** orr_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svorr_n_u16_m (p0, z0, x0), + z0 = svorr_m (p0, z0, x0)) + +/* +** orr_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svorr_n_u16_m (p0, z1, x0), + z0 = svorr_m (p0, z1, x0)) + +/* +** orr_1_u16_m_tied1: +** mov (z[0-9]+\.h), #1 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u16_m_tied1, svuint16_t, + z0 = svorr_n_u16_m (p0, z0, 1), + z0 = svorr_m (p0, z0, 1)) + +/* +** orr_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u16_m_untied, svuint16_t, + z0 = svorr_n_u16_m (p0, z1, 1), + z0 = svorr_m (p0, z1, 1)) + +/* +** orr_m2_u16_m: +** mov (z[0-9]+\.h), #-2 +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_m2_u16_m, svuint16_t, + z0 = svorr_n_u16_m (p0, z0, -2), + z0 = svorr_m (p0, z0, -2)) + +/* +** orr_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** orr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (orr_u16_z_tied1, svuint16_t, + z0 = svorr_u16_z (p0, z0, z1), + z0 = svorr_z 
(p0, z0, z1)) + +/* +** orr_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** orr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (orr_u16_z_tied2, svuint16_t, + z0 = svorr_u16_z (p0, z1, z0), + z0 = svorr_z (p0, z1, z0)) + +/* +** orr_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** orr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** orr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_u16_z_untied, svuint16_t, + z0 = svorr_u16_z (p0, z1, z2), + z0 = svorr_z (p0, z1, z2)) + +/* +** orr_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svorr_n_u16_z (p0, z0, x0), + z0 = svorr_z (p0, z0, x0)) + +/* +** orr_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** orr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** orr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svorr_n_u16_z (p0, z1, x0), + z0 = svorr_z (p0, z1, x0)) + +/* +** orr_1_u16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** orr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u16_z_tied1, svuint16_t, + z0 = svorr_n_u16_z (p0, z0, 1), + z0 = svorr_z (p0, z0, 1)) + +/* +** orr_1_u16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** orr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** orr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_1_u16_z_untied, svuint16_t, + z0 = svorr_n_u16_z (p0, z1, 1), + z0 = svorr_z (p0, z1, 1)) + +/* +** orr_u16_x_tied1: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u16_x_tied1, svuint16_t, + z0 = svorr_u16_x (p0, z0, z1), + z0 = svorr_x (p0, z0, z1)) + +/* +** orr_u16_x_tied2: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u16_x_tied2, svuint16_t, + z0 = svorr_u16_x (p0, z1, z0), + z0 = svorr_x (p0, z1, z0)) + +/* +** orr_u16_x_untied: +** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u16_x_untied, svuint16_t, + z0 = svorr_u16_x (p0, z1, z2), + z0 = svorr_x (p0, z1, z2)) + +/* +** orr_w0_u16_x_tied1: +** mov (z[0-9]+)\.h, w0 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svorr_n_u16_x (p0, z0, x0), + z0 = svorr_x (p0, z0, x0)) + +/* +** orr_w0_u16_x_untied: +** mov (z[0-9]+)\.h, w0 +** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svorr_n_u16_x (p0, z1, x0), + z0 = svorr_x (p0, z1, x0)) + +/* +** orr_1_u16_x_tied1: +** orr z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u16_x_tied1, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, 1), + z0 = svorr_x (p0, z0, 1)) + +/* +** orr_1_u16_x_untied: +** movprfx z0, z1 +** orr z0\.h, z0\.h, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u16_x_untied, svuint16_t, + z0 = svorr_n_u16_x (p0, z1, 1), + z0 = svorr_x (p0, z1, 1)) + +/* +** orr_127_u16_x: +** orr z0\.h, z0\.h, #0x7f +** ret +*/ +TEST_UNIFORM_Z (orr_127_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, 127), + z0 = svorr_x (p0, z0, 127)) + +/* +** orr_128_u16_x: +** orr z0\.h, z0\.h, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_128_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, 128), + z0 = svorr_x (p0, z0, 128)) + +/* +** orr_255_u16_x: +** orr z0\.h, z0\.h, #0xff +** ret +*/ +TEST_UNIFORM_Z (orr_255_u16_x, svuint16_t, + z0 = 
svorr_n_u16_x (p0, z0, 255), + z0 = svorr_x (p0, z0, 255)) + +/* +** orr_256_u16_x: +** orr z0\.h, z0\.h, #0x100 +** ret +*/ +TEST_UNIFORM_Z (orr_256_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, 256), + z0 = svorr_x (p0, z0, 256)) + +/* +** orr_257_u16_x: +** orr z0\.h, z0\.h, #0x101 +** ret +*/ +TEST_UNIFORM_Z (orr_257_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, 257), + z0 = svorr_x (p0, z0, 257)) + +/* +** orr_512_u16_x: +** orr z0\.h, z0\.h, #0x200 +** ret +*/ +TEST_UNIFORM_Z (orr_512_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, 512), + z0 = svorr_x (p0, z0, 512)) + +/* +** orr_65280_u16_x: +** orr z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (orr_65280_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, 0xff00), + z0 = svorr_x (p0, z0, 0xff00)) + +/* +** orr_m127_u16_x: +** orr z0\.h, z0\.h, #0xff81 +** ret +*/ +TEST_UNIFORM_Z (orr_m127_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, -127), + z0 = svorr_x (p0, z0, -127)) + +/* +** orr_m128_u16_x: +** orr z0\.h, z0\.h, #0xff80 +** ret +*/ +TEST_UNIFORM_Z (orr_m128_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, -128), + z0 = svorr_x (p0, z0, -128)) + +/* +** orr_m255_u16_x: +** orr z0\.h, z0\.h, #0xff01 +** ret +*/ +TEST_UNIFORM_Z (orr_m255_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, -255), + z0 = svorr_x (p0, z0, -255)) + +/* +** orr_m256_u16_x: +** orr z0\.h, z0\.h, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (orr_m256_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, -256), + z0 = svorr_x (p0, z0, -256)) + +/* +** orr_m257_u16_x: +** orr z0\.h, z0\.h, #0xfeff +** ret +*/ +TEST_UNIFORM_Z (orr_m257_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, -257), + z0 = svorr_x (p0, z0, -257)) + +/* +** orr_m512_u16_x: +** orr z0\.h, z0\.h, #0xfe00 +** ret +*/ +TEST_UNIFORM_Z (orr_m512_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, -512), + z0 = svorr_x (p0, z0, -512)) + +/* +** orr_m32768_u16_x: +** orr z0\.h, z0\.h, #0x8000 +** ret +*/ +TEST_UNIFORM_Z (orr_m32768_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, -0x8000), + z0 = svorr_x (p0, z0, -0x8000)) + +/* +** orr_5_u16_x: +** mov (z[0-9]+)\.h, #5 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_5_u16_x, svuint16_t, + z0 = svorr_n_u16_x (p0, z0, 5), + z0 = svorr_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u32.c new file mode 100644 index 000000000..8e2351d16 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u32.c @@ -0,0 +1,372 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_u32_m_tied1: +** orr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (orr_u32_m_tied1, svuint32_t, + z0 = svorr_u32_m (p0, z0, z1), + z0 = svorr_m (p0, z0, z1)) + +/* +** orr_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** orr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (orr_u32_m_tied2, svuint32_t, + z0 = svorr_u32_m (p0, z1, z0), + z0 = svorr_m (p0, z1, z0)) + +/* +** orr_u32_m_untied: +** movprfx z0, z1 +** orr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (orr_u32_m_untied, svuint32_t, + z0 = svorr_u32_m (p0, z1, z2), + z0 = svorr_m (p0, z1, z2)) + +/* +** orr_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svorr_n_u32_m (p0, z0, x0), + z0 = svorr_m (p0, z0, x0)) + +/* +** orr_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** orr z0\.s, p0/m, 
z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svorr_n_u32_m (p0, z1, x0), + z0 = svorr_m (p0, z1, x0)) + +/* +** orr_1_u32_m_tied1: +** mov (z[0-9]+\.s), #1 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u32_m_tied1, svuint32_t, + z0 = svorr_n_u32_m (p0, z0, 1), + z0 = svorr_m (p0, z0, 1)) + +/* +** orr_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u32_m_untied, svuint32_t, + z0 = svorr_n_u32_m (p0, z1, 1), + z0 = svorr_m (p0, z1, 1)) + +/* +** orr_m2_u32_m: +** mov (z[0-9]+\.s), #-2 +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_m2_u32_m, svuint32_t, + z0 = svorr_n_u32_m (p0, z0, -2), + z0 = svorr_m (p0, z0, -2)) + +/* +** orr_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** orr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (orr_u32_z_tied1, svuint32_t, + z0 = svorr_u32_z (p0, z0, z1), + z0 = svorr_z (p0, z0, z1)) + +/* +** orr_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** orr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (orr_u32_z_tied2, svuint32_t, + z0 = svorr_u32_z (p0, z1, z0), + z0 = svorr_z (p0, z1, z0)) + +/* +** orr_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** orr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** orr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_u32_z_untied, svuint32_t, + z0 = svorr_u32_z (p0, z1, z2), + z0 = svorr_z (p0, z1, z2)) + +/* +** orr_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svorr_n_u32_z (p0, z0, x0), + z0 = svorr_z (p0, z0, x0)) + +/* +** orr_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** orr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** orr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svorr_n_u32_z (p0, z1, x0), + z0 = svorr_z (p0, z1, x0)) + +/* +** orr_1_u32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** orr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u32_z_tied1, svuint32_t, + z0 = svorr_n_u32_z (p0, z0, 1), + z0 = svorr_z (p0, z0, 1)) + +/* +** orr_1_u32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** orr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** orr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_1_u32_z_untied, svuint32_t, + z0 = svorr_n_u32_z (p0, z1, 1), + z0 = svorr_z (p0, z1, 1)) + +/* +** orr_u32_x_tied1: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u32_x_tied1, svuint32_t, + z0 = svorr_u32_x (p0, z0, z1), + z0 = svorr_x (p0, z0, z1)) + +/* +** orr_u32_x_tied2: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u32_x_tied2, svuint32_t, + z0 = svorr_u32_x (p0, z1, z0), + z0 = svorr_x (p0, z1, z0)) + +/* +** orr_u32_x_untied: +** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u32_x_untied, svuint32_t, + z0 = svorr_u32_x (p0, z1, z2), + z0 = svorr_x (p0, z1, z2)) + +/* +** orr_w0_u32_x_tied1: +** mov (z[0-9]+)\.s, w0 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svorr_n_u32_x (p0, z0, x0), + z0 = svorr_x (p0, z0, x0)) + +/* +** orr_w0_u32_x_untied: +** mov (z[0-9]+)\.s, w0 +** orr z0\.d, (z1\.d, 
\1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svorr_n_u32_x (p0, z1, x0), + z0 = svorr_x (p0, z1, x0)) + +/* +** orr_1_u32_x_tied1: +** orr z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u32_x_tied1, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 1), + z0 = svorr_x (p0, z0, 1)) + +/* +** orr_1_u32_x_untied: +** movprfx z0, z1 +** orr z0\.s, z0\.s, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u32_x_untied, svuint32_t, + z0 = svorr_n_u32_x (p0, z1, 1), + z0 = svorr_x (p0, z1, 1)) + +/* +** orr_127_u32_x: +** orr z0\.s, z0\.s, #0x7f +** ret +*/ +TEST_UNIFORM_Z (orr_127_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 127), + z0 = svorr_x (p0, z0, 127)) + +/* +** orr_128_u32_x: +** orr z0\.s, z0\.s, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_128_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 128), + z0 = svorr_x (p0, z0, 128)) + +/* +** orr_255_u32_x: +** orr z0\.s, z0\.s, #0xff +** ret +*/ +TEST_UNIFORM_Z (orr_255_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 255), + z0 = svorr_x (p0, z0, 255)) + +/* +** orr_256_u32_x: +** orr z0\.s, z0\.s, #0x100 +** ret +*/ +TEST_UNIFORM_Z (orr_256_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 256), + z0 = svorr_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. */ +TEST_UNIFORM_Z (orr_257_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 257), + z0 = svorr_x (p0, z0, 257)) + +/* +** orr_512_u32_x: +** orr z0\.s, z0\.s, #0x200 +** ret +*/ +TEST_UNIFORM_Z (orr_512_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 512), + z0 = svorr_x (p0, z0, 512)) + +/* +** orr_65280_u32_x: +** orr z0\.s, z0\.s, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (orr_65280_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 0xff00), + z0 = svorr_x (p0, z0, 0xff00)) + +/* +** orr_m127_u32_x: +** orr z0\.s, z0\.s, #0xffffff81 +** ret +*/ +TEST_UNIFORM_Z (orr_m127_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, -127), + z0 = svorr_x (p0, z0, -127)) + +/* +** orr_m128_u32_x: +** orr z0\.s, z0\.s, #0xffffff80 +** ret +*/ +TEST_UNIFORM_Z (orr_m128_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, -128), + z0 = svorr_x (p0, z0, -128)) + +/* +** orr_m255_u32_x: +** orr z0\.s, z0\.s, #0xffffff01 +** ret +*/ +TEST_UNIFORM_Z (orr_m255_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, -255), + z0 = svorr_x (p0, z0, -255)) + +/* +** orr_m256_u32_x: +** orr z0\.s, z0\.s, #0xffffff00 +** ret +*/ +TEST_UNIFORM_Z (orr_m256_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, -256), + z0 = svorr_x (p0, z0, -256)) + +/* +** orr_m257_u32_x: +** orr z0\.s, z0\.s, #0xfffffeff +** ret +*/ +TEST_UNIFORM_Z (orr_m257_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, -257), + z0 = svorr_x (p0, z0, -257)) + +/* +** orr_m512_u32_x: +** orr z0\.s, z0\.s, #0xfffffe00 +** ret +*/ +TEST_UNIFORM_Z (orr_m512_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, -512), + z0 = svorr_x (p0, z0, -512)) + +/* +** orr_m32768_u32_x: +** orr z0\.s, z0\.s, #0xffff8000 +** ret +*/ +TEST_UNIFORM_Z (orr_m32768_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, -0x8000), + z0 = svorr_x (p0, z0, -0x8000)) + +/* +** orr_5_u32_x: +** mov (z[0-9]+)\.s, #5 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_5_u32_x, svuint32_t, + z0 = svorr_n_u32_x (p0, z0, 5), + z0 = svorr_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u64.c new file mode 100644 index 000000000..323e2101e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u64.c @@ -0,0 +1,372 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_u64_m_tied1: +** orr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (orr_u64_m_tied1, svuint64_t, + z0 = svorr_u64_m (p0, z0, z1), + z0 = svorr_m (p0, z0, z1)) + +/* +** orr_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_u64_m_tied2, svuint64_t, + z0 = svorr_u64_m (p0, z1, z0), + z0 = svorr_m (p0, z1, z0)) + +/* +** orr_u64_m_untied: +** movprfx z0, z1 +** orr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (orr_u64_m_untied, svuint64_t, + z0 = svorr_u64_m (p0, z1, z2), + z0 = svorr_m (p0, z1, z2)) + +/* +** orr_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svorr_n_u64_m (p0, z0, x0), + z0 = svorr_m (p0, z0, x0)) + +/* +** orr_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svorr_n_u64_m (p0, z1, x0), + z0 = svorr_m (p0, z1, x0)) + +/* +** orr_1_u64_m_tied1: +** mov (z[0-9]+\.d), #1 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u64_m_tied1, svuint64_t, + z0 = svorr_n_u64_m (p0, z0, 1), + z0 = svorr_m (p0, z0, 1)) + +/* +** orr_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u64_m_untied, svuint64_t, + z0 = svorr_n_u64_m (p0, z1, 1), + z0 = svorr_m (p0, z1, 1)) + +/* +** orr_m2_u64_m: +** mov (z[0-9]+\.d), #-2 +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_m2_u64_m, svuint64_t, + z0 = svorr_n_u64_m (p0, z0, -2), + z0 = svorr_m (p0, z0, -2)) + +/* +** orr_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** orr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (orr_u64_z_tied1, svuint64_t, + z0 = svorr_u64_z (p0, z0, z1), + z0 = svorr_z (p0, z0, z1)) + +/* +** orr_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** orr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (orr_u64_z_tied2, svuint64_t, + z0 = svorr_u64_z (p0, z1, z0), + z0 = svorr_z (p0, z1, z0)) + +/* +** orr_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** orr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** orr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_u64_z_untied, svuint64_t, + z0 = svorr_u64_z (p0, z1, z2), + z0 = svorr_z (p0, z1, z2)) + +/* +** orr_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svorr_n_u64_z (p0, z0, x0), + z0 = svorr_z (p0, z0, x0)) + +/* +** orr_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** orr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** orr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svorr_n_u64_z (p0, z1, x0), + z0 = svorr_z (p0, z1, x0)) + +/* +** orr_1_u64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** orr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u64_z_tied1, svuint64_t, + z0 = svorr_n_u64_z (p0, z0, 1), + z0 = svorr_z (p0, z0, 1)) + +/* +** orr_1_u64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** orr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** orr z0\.d, p0/m, z0\.d, z1\.d +** ) +** 
ret +*/ +TEST_UNIFORM_Z (orr_1_u64_z_untied, svuint64_t, + z0 = svorr_n_u64_z (p0, z1, 1), + z0 = svorr_z (p0, z1, 1)) + +/* +** orr_u64_x_tied1: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u64_x_tied1, svuint64_t, + z0 = svorr_u64_x (p0, z0, z1), + z0 = svorr_x (p0, z0, z1)) + +/* +** orr_u64_x_tied2: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u64_x_tied2, svuint64_t, + z0 = svorr_u64_x (p0, z1, z0), + z0 = svorr_x (p0, z1, z0)) + +/* +** orr_u64_x_untied: +** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u64_x_untied, svuint64_t, + z0 = svorr_u64_x (p0, z1, z2), + z0 = svorr_x (p0, z1, z2)) + +/* +** orr_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** orr z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svorr_n_u64_x (p0, z0, x0), + z0 = svorr_x (p0, z0, x0)) + +/* +** orr_x0_u64_x_untied: +** mov (z[0-9]+\.d), x0 +** orr z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svorr_n_u64_x (p0, z1, x0), + z0 = svorr_x (p0, z1, x0)) + +/* +** orr_1_u64_x_tied1: +** orr z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u64_x_tied1, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 1), + z0 = svorr_x (p0, z0, 1)) + +/* +** orr_1_u64_x_untied: +** movprfx z0, z1 +** orr z0\.d, z0\.d, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u64_x_untied, svuint64_t, + z0 = svorr_n_u64_x (p0, z1, 1), + z0 = svorr_x (p0, z1, 1)) + +/* +** orr_127_u64_x: +** orr z0\.d, z0\.d, #0x7f +** ret +*/ +TEST_UNIFORM_Z (orr_127_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 127), + z0 = svorr_x (p0, z0, 127)) + +/* +** orr_128_u64_x: +** orr z0\.d, z0\.d, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_128_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 128), + z0 = svorr_x (p0, z0, 128)) + +/* +** orr_255_u64_x: +** orr z0\.d, z0\.d, #0xff +** ret +*/ +TEST_UNIFORM_Z (orr_255_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 255), + z0 = svorr_x (p0, z0, 255)) + +/* +** orr_256_u64_x: +** orr z0\.d, z0\.d, #0x100 +** ret +*/ +TEST_UNIFORM_Z (orr_256_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 256), + z0 = svorr_x (p0, z0, 256)) + +/* TODO: Bad code and needs fixing. 
*/ +TEST_UNIFORM_Z (orr_257_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 257), + z0 = svorr_x (p0, z0, 257)) + +/* +** orr_512_u64_x: +** orr z0\.d, z0\.d, #0x200 +** ret +*/ +TEST_UNIFORM_Z (orr_512_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 512), + z0 = svorr_x (p0, z0, 512)) + +/* +** orr_65280_u64_x: +** orr z0\.d, z0\.d, #0xff00 +** ret +*/ +TEST_UNIFORM_Z (orr_65280_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 0xff00), + z0 = svorr_x (p0, z0, 0xff00)) + +/* +** orr_m127_u64_x: +** orr z0\.d, z0\.d, #0xffffffffffffff81 +** ret +*/ +TEST_UNIFORM_Z (orr_m127_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, -127), + z0 = svorr_x (p0, z0, -127)) + +/* +** orr_m128_u64_x: +** orr z0\.d, z0\.d, #0xffffffffffffff80 +** ret +*/ +TEST_UNIFORM_Z (orr_m128_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, -128), + z0 = svorr_x (p0, z0, -128)) + +/* +** orr_m255_u64_x: +** orr z0\.d, z0\.d, #0xffffffffffffff01 +** ret +*/ +TEST_UNIFORM_Z (orr_m255_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, -255), + z0 = svorr_x (p0, z0, -255)) + +/* +** orr_m256_u64_x: +** orr z0\.d, z0\.d, #0xffffffffffffff00 +** ret +*/ +TEST_UNIFORM_Z (orr_m256_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, -256), + z0 = svorr_x (p0, z0, -256)) + +/* +** orr_m257_u64_x: +** orr z0\.d, z0\.d, #0xfffffffffffffeff +** ret +*/ +TEST_UNIFORM_Z (orr_m257_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, -257), + z0 = svorr_x (p0, z0, -257)) + +/* +** orr_m512_u64_x: +** orr z0\.d, z0\.d, #0xfffffffffffffe00 +** ret +*/ +TEST_UNIFORM_Z (orr_m512_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, -512), + z0 = svorr_x (p0, z0, -512)) + +/* +** orr_m32768_u64_x: +** orr z0\.d, z0\.d, #0xffffffffffff8000 +** ret +*/ +TEST_UNIFORM_Z (orr_m32768_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, -0x8000), + z0 = svorr_x (p0, z0, -0x8000)) + +/* +** orr_5_u64_x: +** mov (z[0-9]+\.d), #5 +** orr z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_5_u64_x, svuint64_t, + z0 = svorr_n_u64_x (p0, z0, 5), + z0 = svorr_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u8.c new file mode 100644 index 000000000..efe5591b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orr_u8.c @@ -0,0 +1,295 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orr_u8_m_tied1: +** orr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (orr_u8_m_tied1, svuint8_t, + z0 = svorr_u8_m (p0, z0, z1), + z0 = svorr_m (p0, z0, z1)) + +/* +** orr_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** orr z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (orr_u8_m_tied2, svuint8_t, + z0 = svorr_u8_m (p0, z1, z0), + z0 = svorr_m (p0, z1, z0)) + +/* +** orr_u8_m_untied: +** movprfx z0, z1 +** orr z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (orr_u8_m_untied, svuint8_t, + z0 = svorr_u8_m (p0, z1, z2), + z0 = svorr_m (p0, z1, z2)) + +/* +** orr_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svorr_n_u8_m (p0, z0, x0), + z0 = svorr_m (p0, z0, x0)) + +/* +** orr_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svorr_n_u8_m (p0, z1, x0), + z0 = svorr_m (p0, z1, x0)) + +/* +** orr_1_u8_m_tied1: +** mov (z[0-9]+\.b), #1 +** orr z0\.b, p0/m, z0\.b, 
\1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u8_m_tied1, svuint8_t, + z0 = svorr_n_u8_m (p0, z0, 1), + z0 = svorr_m (p0, z0, 1)) + +/* +** orr_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u8_m_untied, svuint8_t, + z0 = svorr_n_u8_m (p0, z1, 1), + z0 = svorr_m (p0, z1, 1)) + +/* +** orr_m2_u8_m: +** mov (z[0-9]+\.b), #-2 +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_m2_u8_m, svuint8_t, + z0 = svorr_n_u8_m (p0, z0, -2), + z0 = svorr_m (p0, z0, -2)) + +/* +** orr_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** orr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (orr_u8_z_tied1, svuint8_t, + z0 = svorr_u8_z (p0, z0, z1), + z0 = svorr_z (p0, z0, z1)) + +/* +** orr_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** orr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (orr_u8_z_tied2, svuint8_t, + z0 = svorr_u8_z (p0, z1, z0), + z0 = svorr_z (p0, z1, z0)) + +/* +** orr_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** orr z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** orr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_u8_z_untied, svuint8_t, + z0 = svorr_u8_z (p0, z1, z2), + z0 = svorr_z (p0, z1, z2)) + +/* +** orr_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svorr_n_u8_z (p0, z0, x0), + z0 = svorr_z (p0, z0, x0)) + +/* +** orr_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** orr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** orr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svorr_n_u8_z (p0, z1, x0), + z0 = svorr_z (p0, z1, x0)) + +/* +** orr_1_u8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** orr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u8_z_tied1, svuint8_t, + z0 = svorr_n_u8_z (p0, z0, 1), + z0 = svorr_z (p0, z0, 1)) + +/* +** orr_1_u8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** orr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** orr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (orr_1_u8_z_untied, svuint8_t, + z0 = svorr_n_u8_z (p0, z1, 1), + z0 = svorr_z (p0, z1, 1)) + +/* +** orr_u8_x_tied1: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u8_x_tied1, svuint8_t, + z0 = svorr_u8_x (p0, z0, z1), + z0 = svorr_x (p0, z0, z1)) + +/* +** orr_u8_x_tied2: +** orr z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u8_x_tied2, svuint8_t, + z0 = svorr_u8_x (p0, z1, z0), + z0 = svorr_x (p0, z1, z0)) + +/* +** orr_u8_x_untied: +** orr z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_u8_x_untied, svuint8_t, + z0 = svorr_u8_x (p0, z1, z2), + z0 = svorr_x (p0, z1, z2)) + +/* +** orr_w0_u8_x_tied1: +** mov (z[0-9]+)\.b, w0 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svorr_n_u8_x (p0, z0, x0), + z0 = svorr_x (p0, z0, x0)) + +/* +** orr_w0_u8_x_untied: +** mov (z[0-9]+)\.b, w0 +** orr z0\.d, (z1\.d, \1\.d|\1\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (orr_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svorr_n_u8_x (p0, z1, x0), + z0 = svorr_x (p0, z1, x0)) + +/* +** orr_1_u8_x_tied1: +** orr z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u8_x_tied1, svuint8_t, + z0 = svorr_n_u8_x (p0, 
z0, 1), + z0 = svorr_x (p0, z0, 1)) + +/* +** orr_1_u8_x_untied: +** movprfx z0, z1 +** orr z0\.b, z0\.b, #0x1 +** ret +*/ +TEST_UNIFORM_Z (orr_1_u8_x_untied, svuint8_t, + z0 = svorr_n_u8_x (p0, z1, 1), + z0 = svorr_x (p0, z1, 1)) + +/* +** orr_127_u8_x: +** orr z0\.b, z0\.b, #0x7f +** ret +*/ +TEST_UNIFORM_Z (orr_127_u8_x, svuint8_t, + z0 = svorr_n_u8_x (p0, z0, 127), + z0 = svorr_x (p0, z0, 127)) + +/* +** orr_128_u8_x: +** orr z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_128_u8_x, svuint8_t, + z0 = svorr_n_u8_x (p0, z0, 128), + z0 = svorr_x (p0, z0, 128)) + +/* +** orr_255_u8_x: +** mov z0\.b, #-1 +** ret +*/ +TEST_UNIFORM_Z (orr_255_u8_x, svuint8_t, + z0 = svorr_n_u8_x (p0, z0, 255), + z0 = svorr_x (p0, z0, 255)) + +/* +** orr_m127_u8_x: +** orr z0\.b, z0\.b, #0x81 +** ret +*/ +TEST_UNIFORM_Z (orr_m127_u8_x, svuint8_t, + z0 = svorr_n_u8_x (p0, z0, -127), + z0 = svorr_x (p0, z0, -127)) + +/* +** orr_m128_u8_x: +** orr z0\.b, z0\.b, #0x80 +** ret +*/ +TEST_UNIFORM_Z (orr_m128_u8_x, svuint8_t, + z0 = svorr_n_u8_x (p0, z0, -128), + z0 = svorr_x (p0, z0, -128)) + +/* +** orr_5_u8_x: +** mov (z[0-9]+)\.b, #5 +** orr z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (orr_5_u8_x, svuint8_t, + z0 = svorr_n_u8_x (p0, z0, 5), + z0 = svorr_x (p0, z0, 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s16.c new file mode 100644 index 000000000..c9b268d3d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orv_x0_s16: +** orv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (orv_x0_s16, int16_t, svint16_t, + x0 = svorv_s16 (p0, z0), + x0 = svorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s32.c new file mode 100644 index 000000000..df4025f54 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orv_x0_s32: +** orv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (orv_x0_s32, int32_t, svint32_t, + x0 = svorv_s32 (p0, z0), + x0 = svorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s64.c new file mode 100644 index 000000000..76a835ce3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orv_x0_s64: +** orv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (orv_x0_s64, int64_t, svint64_t, + x0 = svorv_s64 (p0, z0), + x0 = svorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s8.c new file mode 100644 index 000000000..3f2031d9c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_s8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orv_x0_s8: +** orv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (orv_x0_s8, int8_t, svint8_t, + x0 = svorv_s8 (p0, z0), + x0 = svorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u16.c new file mode 100644 index 000000000..28bfbecb0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u16.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orv_x0_u16: +** orv h([0-9]+), p0, z0\.h +** umov w0, v\1\.h\[0\] +** ret +*/ +TEST_REDUCTION_X (orv_x0_u16, uint16_t, svuint16_t, + x0 = svorv_u16 (p0, z0), + x0 = svorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u32.c new file mode 100644 index 000000000..1988d5623 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u32.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orv_x0_u32: +** orv (s[0-9]+), p0, z0\.s +** fmov w0, \1 +** ret +*/ +TEST_REDUCTION_X (orv_x0_u32, uint32_t, svuint32_t, + x0 = svorv_u32 (p0, z0), + x0 = svorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u64.c new file mode 100644 index 000000000..c8a8429a7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u64.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orv_x0_u64: +** orv (d[0-9]+), p0, z0\.d +** fmov x0, \1 +** ret +*/ +TEST_REDUCTION_X (orv_x0_u64, uint64_t, svuint64_t, + x0 = svorv_u64 (p0, z0), + x0 = svorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u8.c new file mode 100644 index 000000000..bcab32d8b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/orv_u8.c @@ -0,0 +1,13 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** orv_x0_u8: +** orv b([0-9]+), p0, z0\.b +** umov w0, v\1\.b\[0\] +** ret +*/ +TEST_REDUCTION_X (orv_x0_u8, uint8_t, svuint8_t, + x0 = svorv_u8 (p0, z0), + x0 = svorv (p0, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfalse.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfalse.c new file mode 100644 index 000000000..a74a59283 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfalse.c @@ -0,0 +1,13 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** pfalse_b: +** pfalse p0\.b +** ret +*/ +TEST_P (pfalse_b, + p0 = svpfalse_b (), + p0 = svpfalse ()); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfirst_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfirst_b.c new file mode 100644 index 000000000..a32099656 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pfirst_b.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** pfirst_b_tied1: +** pfirst p0\.b, p3, p0\.b +** ret +*/ +TEST_UNIFORM_P (pfirst_b_tied1, + p0 = svpfirst_b (p3, p0), + p0 = svpfirst (p3, p0)) + +/* +** pfirst_b_untied: +** mov p0\.b, p1\.b +** pfirst p0\.b, p3, p0\.b +** ret +*/ +TEST_UNIFORM_P (pfirst_b_untied, + p0 = svpfirst_b (p3, p1), + p0 = svpfirst (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b16.c new file mode 100644 index 000000000..ad0efe5e7 --- 
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b16.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** pnext_b16_tied1: +** pnext p0\.h, p3, p0\.h +** ret +*/ +TEST_UNIFORM_P (pnext_b16_tied1, + p0 = svpnext_b16 (p3, p0), + p0 = svpnext_b16 (p3, p0)) + +/* +** pnext_b16_untied: +** mov p0\.b, p1\.b +** pnext p0\.h, p3, p0\.h +** ret +*/ +TEST_UNIFORM_P (pnext_b16_untied, + p0 = svpnext_b16 (p3, p1), + p0 = svpnext_b16 (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b32.c new file mode 100644 index 000000000..a0030fae1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b32.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** pnext_b32_tied1: +** pnext p0\.s, p3, p0\.s +** ret +*/ +TEST_UNIFORM_P (pnext_b32_tied1, + p0 = svpnext_b32 (p3, p0), + p0 = svpnext_b32 (p3, p0)) + +/* +** pnext_b32_untied: +** mov p0\.b, p1\.b +** pnext p0\.s, p3, p0\.s +** ret +*/ +TEST_UNIFORM_P (pnext_b32_untied, + p0 = svpnext_b32 (p3, p1), + p0 = svpnext_b32 (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b64.c new file mode 100644 index 000000000..59db2f04f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b64.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** pnext_b64_tied1: +** pnext p0\.d, p3, p0\.d +** ret +*/ +TEST_UNIFORM_P (pnext_b64_tied1, + p0 = svpnext_b64 (p3, p0), + p0 = svpnext_b64 (p3, p0)) + +/* +** pnext_b64_untied: +** mov p0\.b, p1\.b +** pnext p0\.d, p3, p0\.d +** ret +*/ +TEST_UNIFORM_P (pnext_b64_untied, + p0 = svpnext_b64 (p3, p1), + p0 = svpnext_b64 (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b8.c new file mode 100644 index 000000000..cfc2e907c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/pnext_b8.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** pnext_b8_tied1: +** pnext p0\.b, p3, p0\.b +** ret +*/ +TEST_UNIFORM_P (pnext_b8_tied1, + p0 = svpnext_b8 (p3, p0), + p0 = svpnext_b8 (p3, p0)) + +/* +** pnext_b8_untied: +** mov p0\.b, p1\.b +** pnext p0\.b, p3, p0\.b +** ret +*/ +TEST_UNIFORM_P (pnext_b8_untied, + p0 = svpnext_b8 (p3, p1), + p0 = svpnext_b8 (p3, p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c new file mode 100644 index 000000000..d2b2777e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c @@ -0,0 +1,245 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** prfb_base: +** prfb pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_base, uint8_t, + svprfb (p0, x0, SV_PLDL1KEEP), + svprfb (p0, x0, SV_PLDL1KEEP)) + +/* +** prfb_u8_index: +** prfb pldl1keep, p0, \[x0, x1\] +** ret +*/ +TEST_PREFETCH (prfb_u8_index, uint8_t, + svprfb (p0, x0 + x1, SV_PLDL1KEEP), + svprfb (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfb_u8_1: +** add (x[0-9+]), x0, #?1 +** prfb pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfb_u8_1, uint8_t, + svprfb (p0, x0 + 1, SV_PLDL1KEEP), + svprfb (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfb_u16_index: +** add (x[0-9+]), x0, x1, lsl #?1 +** prfb pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfb_u16_index, uint16_t, + svprfb (p0, x0 + x1, SV_PLDL1KEEP), + svprfb (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfb_u16_1: +** add (x[0-9+]), x0, #?2 +** prfb pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfb_u16_1, uint16_t, + svprfb (p0, x0 + 1, SV_PLDL1KEEP), + svprfb (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfb_u32_index: +** add (x[0-9+]), x0, x1, lsl #?2 +** prfb pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfb_u32_index, uint32_t, + svprfb (p0, x0 + x1, SV_PLDL1KEEP), + svprfb (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfb_u32_1: +** add (x[0-9+]), x0, #?4 +** prfb pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfb_u32_1, uint32_t, + svprfb (p0, x0 + 1, SV_PLDL1KEEP), + svprfb (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfb_u64_index: +** add (x[0-9+]), x0, x1, lsl #?3 +** prfb pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfb_u64_index, uint64_t, + svprfb (p0, x0 + x1, SV_PLDL1KEEP), + svprfb (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfb_u64_1: +** add (x[0-9+]), x0, #?8 +** prfb pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfb_u64_1, uint64_t, + svprfb (p0, x0 + 1, SV_PLDL1KEEP), + svprfb (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfb_pldl1strm: +** prfb pldl1strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pldl1strm, uint8_t, + svprfb (p0, x0, SV_PLDL1STRM), + svprfb (p0, x0, SV_PLDL1STRM)) + +/* +** prfb_pldl2keep: +** prfb pldl2keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pldl2keep, uint8_t, + svprfb (p0, x0, SV_PLDL2KEEP), + svprfb (p0, x0, SV_PLDL2KEEP)) + +/* +** prfb_pldl2strm: +** prfb pldl2strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pldl2strm, uint8_t, + svprfb (p0, x0, SV_PLDL2STRM), + svprfb (p0, x0, SV_PLDL2STRM)) + +/* +** prfb_pldl3keep: +** prfb pldl3keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pldl3keep, uint8_t, + svprfb (p0, x0, SV_PLDL3KEEP), + svprfb (p0, x0, SV_PLDL3KEEP)) + +/* +** prfb_pldl3strm: +** prfb pldl3strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pldl3strm, uint8_t, + svprfb (p0, x0, SV_PLDL3STRM), + svprfb (p0, x0, SV_PLDL3STRM)) + +/* +** prfb_pstl1keep: +** prfb pstl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pstl1keep, uint8_t, + svprfb (p0, x0, SV_PSTL1KEEP), + svprfb (p0, x0, SV_PSTL1KEEP)) + +/* +** prfb_pstl1strm: +** prfb pstl1strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pstl1strm, uint8_t, + svprfb (p0, x0, SV_PSTL1STRM), + svprfb (p0, x0, SV_PSTL1STRM)) + +/* +** prfb_pstl2keep: +** prfb pstl2keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pstl2keep, uint8_t, + svprfb (p0, x0, SV_PSTL2KEEP), + svprfb (p0, x0, SV_PSTL2KEEP)) + +/* +** prfb_pstl2strm: +** prfb pstl2strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pstl2strm, uint8_t, + svprfb (p0, x0, SV_PSTL2STRM), + svprfb (p0, x0, SV_PSTL2STRM)) + +/* +** prfb_pstl3keep: +** prfb pstl3keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH 
(prfb_pstl3keep, uint8_t, + svprfb (p0, x0, SV_PSTL3KEEP), + svprfb (p0, x0, SV_PSTL3KEEP)) + +/* +** prfb_pstl3strm: +** prfb pstl3strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_pstl3strm, uint8_t, + svprfb (p0, x0, SV_PSTL3STRM), + svprfb (p0, x0, SV_PSTL3STRM)) + +/* +** prfb_vnum_0: +** prfb pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_vnum_0, uint8_t, + svprfb_vnum (p0, x0, 0, SV_PLDL1KEEP), + svprfb_vnum (p0, x0, 0, SV_PLDL1KEEP)) + +/* +** prfb_vnum_1: +** incb x0 +** prfb pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_vnum_1, uint16_t, + svprfb_vnum (p0, x0, 1, SV_PLDL1KEEP), + svprfb_vnum (p0, x0, 1, SV_PLDL1KEEP)) + +/* +** prfb_vnum_2: +** incb x0, all, mul #2 +** prfb pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_vnum_2, uint32_t, + svprfb_vnum (p0, x0, 2, SV_PLDL1KEEP), + svprfb_vnum (p0, x0, 2, SV_PLDL1KEEP)) + +/* +** prfb_vnum_3: +** incb x0, all, mul #3 +** prfb pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfb_vnum_3, uint64_t, + svprfb_vnum (p0, x0, 3, SV_PLDL1KEEP), + svprfb_vnum (p0, x0, 3, SV_PLDL1KEEP)) + +/* +** prfb_vnum_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** prfb pldl1keep, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** prfb zldl1keep, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_PREFETCH (prfb_vnum_x1, uint64_t, + svprfb_vnum (p0, x0, x1, SV_PLDL1KEEP), + svprfb_vnum (p0, x0, x1, SV_PLDL1KEEP)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb_gather.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb_gather.c new file mode 100644 index 000000000..c4bfbbbf7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb_gather.c @@ -0,0 +1,223 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** prfb_gather_u32base: +** prfb pldl1keep, p0, \[z0\.s\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_u32base, svuint32_t, + svprfb_gather_u32base (p0, z0, SV_PLDL1KEEP), + svprfb_gather (p0, z0, SV_PLDL1KEEP)) + +/* +** prfb_gather_u64base: +** prfb pldl1strm, p0, \[z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_u64base, svuint64_t, + svprfb_gather_u64base (p0, z0, SV_PLDL1STRM), + svprfb_gather (p0, z0, SV_PLDL1STRM)) + +/* +** prfb_gather_x0_u32base_offset: +** prfb pldl2keep, p0, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_x0_u32base_offset, svuint32_t, + svprfb_gather_u32base_offset (p0, z0, x0, SV_PLDL2KEEP), + svprfb_gather_offset (p0, z0, x0, SV_PLDL2KEEP)) + +/* +** prfb_gather_m1_u32base_offset: +** mov (x[0-9]+), #?-1 +** prfb pldl2strm, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_m1_u32base_offset, svuint32_t, + svprfb_gather_u32base_offset (p0, z0, -1, SV_PLDL2STRM), + svprfb_gather_offset (p0, z0, -1, SV_PLDL2STRM)) + +/* +** prfb_gather_0_u32base_offset: +** prfb pldl3keep, p0, \[z0\.s\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_0_u32base_offset, svuint32_t, + svprfb_gather_u32base_offset (p0, z0, 0, SV_PLDL3KEEP), + svprfb_gather_offset (p0, z0, 0, SV_PLDL3KEEP)) + +/* +** prfb_gather_5_u32base_offset: +** prfb pldl3strm, p0, \[z0\.s, #5\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_5_u32base_offset, svuint32_t, + svprfb_gather_u32base_offset (p0, z0, 5, SV_PLDL3STRM), + svprfb_gather_offset (p0, z0, 5, SV_PLDL3STRM)) + +/* +** prfb_gather_31_u32base_offset: +** prfb pstl1keep, p0, \[z0\.s, #31\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_31_u32base_offset, svuint32_t, + 
svprfb_gather_u32base_offset (p0, z0, 31, SV_PSTL1KEEP), + svprfb_gather_offset (p0, z0, 31, SV_PSTL1KEEP)) + +/* +** prfb_gather_32_u32base_offset: +** mov (x[0-9]+), #?32 +** prfb pstl1strm, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_32_u32base_offset, svuint32_t, + svprfb_gather_u32base_offset (p0, z0, 32, SV_PSTL1STRM), + svprfb_gather_offset (p0, z0, 32, SV_PSTL1STRM)) + +/* +** prfb_gather_x0_u64base_offset: +** prfb pstl2keep, p0, \[x0, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_x0_u64base_offset, svuint64_t, + svprfb_gather_u64base_offset (p0, z0, x0, SV_PSTL2KEEP), + svprfb_gather_offset (p0, z0, x0, SV_PSTL2KEEP)) + +/* +** prfb_gather_m1_u64base_offset: +** mov (x[0-9]+), #?-1 +** prfb pstl2strm, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_m1_u64base_offset, svuint64_t, + svprfb_gather_u64base_offset (p0, z0, -1, SV_PSTL2STRM), + svprfb_gather_offset (p0, z0, -1, SV_PSTL2STRM)) + +/* +** prfb_gather_0_u64base_offset: +** prfb pstl3keep, p0, \[z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_0_u64base_offset, svuint64_t, + svprfb_gather_u64base_offset (p0, z0, 0, SV_PSTL3KEEP), + svprfb_gather_offset (p0, z0, 0, SV_PSTL3KEEP)) + +/* +** prfb_gather_5_u64base_offset: +** prfb pstl3strm, p0, \[z0\.d, #5\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_5_u64base_offset, svuint64_t, + svprfb_gather_u64base_offset (p0, z0, 5, SV_PSTL3STRM), + svprfb_gather_offset (p0, z0, 5, SV_PSTL3STRM)) + +/* +** prfb_gather_31_u64base_offset: +** prfb pldl1keep, p0, \[z0\.d, #31\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_31_u64base_offset, svuint64_t, + svprfb_gather_u64base_offset (p0, z0, 31, SV_PLDL1KEEP), + svprfb_gather_offset (p0, z0, 31, SV_PLDL1KEEP)) + +/* +** prfb_gather_32_u64base_offset: +** mov (x[0-9]+), #?32 +** prfb pldl1strm, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfb_gather_32_u64base_offset, svuint64_t, + svprfb_gather_u64base_offset (p0, z0, 32, SV_PLDL1STRM), + svprfb_gather_offset (p0, z0, 32, SV_PLDL1STRM)) + +/* +** prfb_gather_x0_s32offset: +** prfb pldl2keep, p0, \[x0, z0\.s, sxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_x0_s32offset, svint32_t, + svprfb_gather_s32offset (p0, x0, z0, SV_PLDL2KEEP), + svprfb_gather_offset (p0, x0, z0, SV_PLDL2KEEP)) + +/* +** prfb_gather_s32offset: +** prfb pldl2strm, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_s32offset, svint32_t, + svprfb_gather_s32offset (p0, x0, z1, SV_PLDL2STRM), + svprfb_gather_offset (p0, x0, z1, SV_PLDL2STRM)) + +/* +** prfb_gather_x0_u32offset: +** prfb pldl3keep, p0, \[x0, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_x0_u32offset, svuint32_t, + svprfb_gather_u32offset (p0, x0, z0, SV_PLDL3KEEP), + svprfb_gather_offset (p0, x0, z0, SV_PLDL3KEEP)) + +/* +** prfb_gather_u32offset: +** prfb pldl3strm, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_u32offset, svuint32_t, + svprfb_gather_u32offset (p0, x0, z1, SV_PLDL3STRM), + svprfb_gather_offset (p0, x0, z1, SV_PLDL3STRM)) + +/* +** prfb_gather_x0_s64offset: +** prfb pstl1keep, p0, \[x0, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_x0_s64offset, svint64_t, + svprfb_gather_s64offset (p0, x0, z0, SV_PSTL1KEEP), + svprfb_gather_offset (p0, x0, z0, SV_PSTL1KEEP)) + +/* +** prfb_gather_s64offset: +** prfb pstl1strm, p0, \[x0, z1\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_s64offset, svint64_t, + svprfb_gather_s64offset (p0, x0, z1, SV_PSTL1STRM), + 
svprfb_gather_offset (p0, x0, z1, SV_PSTL1STRM)) + +/* +** prfb_gather_ext_s64offset: +** prfb pstl1strm, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_ext_s64offset, svint64_t, + svprfb_gather_s64offset (p0, x0, svextw_s64_x (p0, z1), SV_PSTL1STRM), + svprfb_gather_offset (p0, x0, svextw_x (p0, z1), SV_PSTL1STRM)) + +/* +** prfb_gather_x0_u64offset: +** prfb pstl2keep, p0, \[x0, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_x0_u64offset, svuint64_t, + svprfb_gather_u64offset (p0, x0, z0, SV_PSTL2KEEP), + svprfb_gather_offset (p0, x0, z0, SV_PSTL2KEEP)) + +/* +** prfb_gather_u64offset: +** prfb pstl2strm, p0, \[x0, z1\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_u64offset, svuint64_t, + svprfb_gather_u64offset (p0, x0, z1, SV_PSTL2STRM), + svprfb_gather_offset (p0, x0, z1, SV_PSTL2STRM)) + +/* +** prfb_gather_ext_u64offset: +** prfb pstl2strm, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfb_gather_ext_u64offset, svuint64_t, + svprfb_gather_u64offset (p0, x0, svextw_u64_x (p0, z1), SV_PSTL2STRM), + svprfb_gather_offset (p0, x0, svextw_x (p0, z1), SV_PSTL2STRM)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c new file mode 100644 index 000000000..72b2e6415 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c @@ -0,0 +1,245 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** prfd_base: +** prfd pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_base, uint8_t, + svprfd (p0, x0, SV_PLDL1KEEP), + svprfd (p0, x0, SV_PLDL1KEEP)) + +/* +** prfd_u8_index: +** add (x[0-9+]), (x0, x1|x1, x0) +** prfd pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_u8_index, uint8_t, + svprfd (p0, x0 + x1, SV_PLDL1KEEP), + svprfd (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfd_u8_1: +** add (x[0-9+]), x0, #?1 +** prfd pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfd_u8_1, uint8_t, + svprfd (p0, x0 + 1, SV_PLDL1KEEP), + svprfd (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfd_u16_index: +** add (x[0-9+]), x0, x1, lsl #?1 +** prfd pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfd_u16_index, uint16_t, + svprfd (p0, x0 + x1, SV_PLDL1KEEP), + svprfd (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfd_u16_1: +** add (x[0-9+]), x0, #?2 +** prfd pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfd_u16_1, uint16_t, + svprfd (p0, x0 + 1, SV_PLDL1KEEP), + svprfd (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfd_u32_index: +** add (x[0-9+]), x0, x1, lsl #?2 +** prfd pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfd_u32_index, uint32_t, + svprfd (p0, x0 + x1, SV_PLDL1KEEP), + svprfd (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfd_u32_1: +** add (x[0-9+]), x0, #?4 +** prfd pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfd_u32_1, uint32_t, + svprfd (p0, x0 + 1, SV_PLDL1KEEP), + svprfd (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfd_u64_index: +** prfd pldl1keep, p0, \[x0, x1, lsl #?3\] +** ret +*/ +TEST_PREFETCH (prfd_u64_index, uint64_t, + svprfd (p0, x0 + x1, SV_PLDL1KEEP), + svprfd (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfd_u64_1: +** add (x[0-9+]), x0, #?8 +** prfd pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfd_u64_1, uint64_t, + svprfd (p0, x0 + 1, SV_PLDL1KEEP), + svprfd (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfd_pldl1strm: +** prfd pldl1strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pldl1strm, uint8_t, + svprfd (p0, x0, SV_PLDL1STRM), + svprfd (p0, x0, SV_PLDL1STRM)) + +/* +** 
prfd_pldl2keep: +** prfd pldl2keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pldl2keep, uint8_t, + svprfd (p0, x0, SV_PLDL2KEEP), + svprfd (p0, x0, SV_PLDL2KEEP)) + +/* +** prfd_pldl2strm: +** prfd pldl2strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pldl2strm, uint8_t, + svprfd (p0, x0, SV_PLDL2STRM), + svprfd (p0, x0, SV_PLDL2STRM)) + +/* +** prfd_pldl3keep: +** prfd pldl3keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pldl3keep, uint8_t, + svprfd (p0, x0, SV_PLDL3KEEP), + svprfd (p0, x0, SV_PLDL3KEEP)) + +/* +** prfd_pldl3strm: +** prfd pldl3strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pldl3strm, uint8_t, + svprfd (p0, x0, SV_PLDL3STRM), + svprfd (p0, x0, SV_PLDL3STRM)) + +/* +** prfd_pstl1keep: +** prfd pstl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pstl1keep, uint8_t, + svprfd (p0, x0, SV_PSTL1KEEP), + svprfd (p0, x0, SV_PSTL1KEEP)) + +/* +** prfd_pstl1strm: +** prfd pstl1strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pstl1strm, uint8_t, + svprfd (p0, x0, SV_PSTL1STRM), + svprfd (p0, x0, SV_PSTL1STRM)) + +/* +** prfd_pstl2keep: +** prfd pstl2keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pstl2keep, uint8_t, + svprfd (p0, x0, SV_PSTL2KEEP), + svprfd (p0, x0, SV_PSTL2KEEP)) + +/* +** prfd_pstl2strm: +** prfd pstl2strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pstl2strm, uint8_t, + svprfd (p0, x0, SV_PSTL2STRM), + svprfd (p0, x0, SV_PSTL2STRM)) + +/* +** prfd_pstl3keep: +** prfd pstl3keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pstl3keep, uint8_t, + svprfd (p0, x0, SV_PSTL3KEEP), + svprfd (p0, x0, SV_PSTL3KEEP)) + +/* +** prfd_pstl3strm: +** prfd pstl3strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_pstl3strm, uint8_t, + svprfd (p0, x0, SV_PSTL3STRM), + svprfd (p0, x0, SV_PSTL3STRM)) + +/* +** prfd_vnum_0: +** prfd pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_vnum_0, uint8_t, + svprfd_vnum (p0, x0, 0, SV_PLDL1KEEP), + svprfd_vnum (p0, x0, 0, SV_PLDL1KEEP)) + +/* +** prfd_vnum_1: +** incb x0 +** prfd pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_vnum_1, uint16_t, + svprfd_vnum (p0, x0, 1, SV_PLDL1KEEP), + svprfd_vnum (p0, x0, 1, SV_PLDL1KEEP)) + +/* +** prfd_vnum_2: +** incb x0, all, mul #2 +** prfd pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_vnum_2, uint32_t, + svprfd_vnum (p0, x0, 2, SV_PLDL1KEEP), + svprfd_vnum (p0, x0, 2, SV_PLDL1KEEP)) + +/* +** prfd_vnum_3: +** incb x0, all, mul #3 +** prfd pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfd_vnum_3, uint64_t, + svprfd_vnum (p0, x0, 3, SV_PLDL1KEEP), + svprfd_vnum (p0, x0, 3, SV_PLDL1KEEP)) + +/* +** prfd_vnum_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** prfd pldl1keep, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** prfd zldl1keep, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_PREFETCH (prfd_vnum_x1, uint64_t, + svprfd_vnum (p0, x0, x1, SV_PLDL1KEEP), + svprfd_vnum (p0, x0, x1, SV_PLDL1KEEP)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd_gather.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd_gather.c new file mode 100644 index 000000000..a84acb1a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd_gather.c @@ -0,0 +1,225 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** prfd_gather_u32base: +** prfd pldl1keep, p0, \[z0\.s\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_u32base, svuint32_t, + svprfd_gather_u32base (p0, z0, SV_PLDL1KEEP), + svprfd_gather (p0, z0, SV_PLDL1KEEP)) + +/* +** prfd_gather_u64base: +** prfd pldl1strm, p0, \[z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_u64base, svuint64_t, + svprfd_gather_u64base (p0, z0, SV_PLDL1STRM), + svprfd_gather (p0, z0, SV_PLDL1STRM)) + +/* +** prfd_gather_x0_u32base_index: +** lsl (x[0-9]+), x0, #?3 +** prfb pldl2keep, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_x0_u32base_index, svuint32_t, + svprfd_gather_u32base_index (p0, z0, x0, SV_PLDL2KEEP), + svprfd_gather_index (p0, z0, x0, SV_PLDL2KEEP)) + +/* +** prfd_gather_m1_u32base_index: +** mov (x[0-9]+), #?-8 +** prfb pldl2strm, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_m1_u32base_index, svuint32_t, + svprfd_gather_u32base_index (p0, z0, -1, SV_PLDL2STRM), + svprfd_gather_index (p0, z0, -1, SV_PLDL2STRM)) + +/* +** prfd_gather_0_u32base_index: +** prfd pldl3keep, p0, \[z0\.s\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_0_u32base_index, svuint32_t, + svprfd_gather_u32base_index (p0, z0, 0, SV_PLDL3KEEP), + svprfd_gather_index (p0, z0, 0, SV_PLDL3KEEP)) + +/* +** prfd_gather_5_u32base_index: +** prfd pldl3strm, p0, \[z0\.s, #40\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_5_u32base_index, svuint32_t, + svprfd_gather_u32base_index (p0, z0, 5, SV_PLDL3STRM), + svprfd_gather_index (p0, z0, 5, SV_PLDL3STRM)) + +/* +** prfd_gather_31_u32base_index: +** prfd pstl1keep, p0, \[z0\.s, #248\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_31_u32base_index, svuint32_t, + svprfd_gather_u32base_index (p0, z0, 31, SV_PSTL1KEEP), + svprfd_gather_index (p0, z0, 31, SV_PSTL1KEEP)) + +/* +** prfd_gather_32_u32base_index: +** mov (x[0-9]+), #?256 +** prfb pstl1strm, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_32_u32base_index, svuint32_t, + svprfd_gather_u32base_index (p0, z0, 32, SV_PSTL1STRM), + svprfd_gather_index (p0, z0, 32, SV_PSTL1STRM)) + +/* +** prfd_gather_x0_u64base_index: +** lsl (x[0-9]+), x0, #?3 +** prfb pstl2keep, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_x0_u64base_index, svuint64_t, + svprfd_gather_u64base_index (p0, z0, x0, SV_PSTL2KEEP), + svprfd_gather_index (p0, z0, x0, SV_PSTL2KEEP)) + +/* +** prfd_gather_m1_u64base_index: +** mov (x[0-9]+), #?-8 +** prfb pstl2strm, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_m1_u64base_index, svuint64_t, + svprfd_gather_u64base_index (p0, z0, -1, SV_PSTL2STRM), + svprfd_gather_index (p0, z0, -1, SV_PSTL2STRM)) + +/* +** prfd_gather_0_u64base_index: +** prfd pstl3keep, p0, \[z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_0_u64base_index, svuint64_t, + svprfd_gather_u64base_index (p0, z0, 0, SV_PSTL3KEEP), + svprfd_gather_index (p0, z0, 0, SV_PSTL3KEEP)) + +/* +** prfd_gather_5_u64base_index: +** prfd pstl3strm, p0, \[z0\.d, #40\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_5_u64base_index, svuint64_t, + svprfd_gather_u64base_index (p0, z0, 5, SV_PSTL3STRM), + svprfd_gather_index (p0, z0, 5, SV_PSTL3STRM)) + +/* +** prfd_gather_31_u64base_index: +** prfd pldl1keep, p0, \[z0\.d, #248\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_31_u64base_index, svuint64_t, + svprfd_gather_u64base_index (p0, z0, 31, SV_PLDL1KEEP), + svprfd_gather_index (p0, z0, 31, SV_PLDL1KEEP)) + +/* 
+** prfd_gather_32_u64base_index: +** mov (x[0-9]+), #?256 +** prfb pldl1strm, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfd_gather_32_u64base_index, svuint64_t, + svprfd_gather_u64base_index (p0, z0, 32, SV_PLDL1STRM), + svprfd_gather_index (p0, z0, 32, SV_PLDL1STRM)) + +/* +** prfd_gather_x0_s32index: +** prfd pldl2keep, p0, \[x0, z0\.s, sxtw 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_x0_s32index, svint32_t, + svprfd_gather_s32index (p0, x0, z0, SV_PLDL2KEEP), + svprfd_gather_index (p0, x0, z0, SV_PLDL2KEEP)) + +/* +** prfd_gather_s32index: +** prfd pldl2strm, p0, \[x0, z1\.s, sxtw 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_s32index, svint32_t, + svprfd_gather_s32index (p0, x0, z1, SV_PLDL2STRM), + svprfd_gather_index (p0, x0, z1, SV_PLDL2STRM)) + +/* +** prfd_gather_x0_u32index: +** prfd pldl3keep, p0, \[x0, z0\.s, uxtw 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_x0_u32index, svuint32_t, + svprfd_gather_u32index (p0, x0, z0, SV_PLDL3KEEP), + svprfd_gather_index (p0, x0, z0, SV_PLDL3KEEP)) + +/* +** prfd_gather_u32index: +** prfd pldl3strm, p0, \[x0, z1\.s, uxtw 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_u32index, svuint32_t, + svprfd_gather_u32index (p0, x0, z1, SV_PLDL3STRM), + svprfd_gather_index (p0, x0, z1, SV_PLDL3STRM)) + +/* +** prfd_gather_x0_s64index: +** prfd pstl1keep, p0, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_x0_s64index, svint64_t, + svprfd_gather_s64index (p0, x0, z0, SV_PSTL1KEEP), + svprfd_gather_index (p0, x0, z0, SV_PSTL1KEEP)) + +/* +** prfd_gather_s64index: +** prfd pstl1strm, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_s64index, svint64_t, + svprfd_gather_s64index (p0, x0, z1, SV_PSTL1STRM), + svprfd_gather_index (p0, x0, z1, SV_PSTL1STRM)) + +/* +** prfd_gather_ext_s64index: +** prfd pstl1strm, p0, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_ext_s64index, svint64_t, + svprfd_gather_s64index (p0, x0, svextw_s64_x (p0, z1), SV_PSTL1STRM), + svprfd_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL1STRM)) + +/* +** prfd_gather_x0_u64index: +** prfd pstl2keep, p0, \[x0, z0\.d, lsl 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_x0_u64index, svuint64_t, + svprfd_gather_u64index (p0, x0, z0, SV_PSTL2KEEP), + svprfd_gather_index (p0, x0, z0, SV_PSTL2KEEP)) + +/* +** prfd_gather_u64index: +** prfd pstl2strm, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_u64index, svuint64_t, + svprfd_gather_u64index (p0, x0, z1, SV_PSTL2STRM), + svprfd_gather_index (p0, x0, z1, SV_PSTL2STRM)) + +/* +** prfd_gather_ext_u64index: +** prfd pstl2strm, p0, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfd_gather_ext_u64index, svuint64_t, + svprfd_gather_u64index (p0, x0, svextw_u64_x (p0, z1), SV_PSTL2STRM), + svprfd_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL2STRM)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c new file mode 100644 index 000000000..89069f9b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c @@ -0,0 +1,245 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** prfh_base: +** prfh pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_base, uint8_t, + svprfh (p0, x0, SV_PLDL1KEEP), + svprfh (p0, x0, SV_PLDL1KEEP)) + +/* +** prfh_u8_index: +** add (x[0-9+]), (x0, x1|x1, x0) +** prfh pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_u8_index, uint8_t, + svprfh (p0, x0 + x1, SV_PLDL1KEEP), + svprfh (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfh_u8_1: +** add (x[0-9+]), x0, #?1 +** prfh pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfh_u8_1, uint8_t, + svprfh (p0, x0 + 1, SV_PLDL1KEEP), + svprfh (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfh_u16_index: +** prfh pldl1keep, p0, \[x0, x1, lsl #?1\] +** ret +*/ +TEST_PREFETCH (prfh_u16_index, uint16_t, + svprfh (p0, x0 + x1, SV_PLDL1KEEP), + svprfh (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfh_u16_1: +** add (x[0-9+]), x0, #?2 +** prfh pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfh_u16_1, uint16_t, + svprfh (p0, x0 + 1, SV_PLDL1KEEP), + svprfh (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfh_u32_index: +** add (x[0-9+]), x0, x1, lsl #?2 +** prfh pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfh_u32_index, uint32_t, + svprfh (p0, x0 + x1, SV_PLDL1KEEP), + svprfh (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfh_u32_1: +** add (x[0-9+]), x0, #?4 +** prfh pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfh_u32_1, uint32_t, + svprfh (p0, x0 + 1, SV_PLDL1KEEP), + svprfh (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfh_u64_index: +** add (x[0-9+]), x0, x1, lsl #?3 +** prfh pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfh_u64_index, uint64_t, + svprfh (p0, x0 + x1, SV_PLDL1KEEP), + svprfh (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfh_u64_1: +** add (x[0-9+]), x0, #?8 +** prfh pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfh_u64_1, uint64_t, + svprfh (p0, x0 + 1, SV_PLDL1KEEP), + svprfh (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfh_pldl1strm: +** prfh pldl1strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pldl1strm, uint8_t, + svprfh (p0, x0, SV_PLDL1STRM), + svprfh (p0, x0, SV_PLDL1STRM)) + +/* +** prfh_pldl2keep: +** prfh pldl2keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pldl2keep, uint8_t, + svprfh (p0, x0, SV_PLDL2KEEP), + svprfh (p0, x0, SV_PLDL2KEEP)) + +/* +** prfh_pldl2strm: +** prfh pldl2strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pldl2strm, uint8_t, + svprfh (p0, x0, SV_PLDL2STRM), + svprfh (p0, x0, SV_PLDL2STRM)) + +/* +** prfh_pldl3keep: +** prfh pldl3keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pldl3keep, uint8_t, + svprfh (p0, x0, SV_PLDL3KEEP), + svprfh (p0, x0, SV_PLDL3KEEP)) + +/* +** prfh_pldl3strm: +** prfh pldl3strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pldl3strm, uint8_t, + svprfh (p0, x0, SV_PLDL3STRM), + svprfh (p0, x0, SV_PLDL3STRM)) + +/* +** prfh_pstl1keep: +** prfh pstl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pstl1keep, uint8_t, + svprfh (p0, x0, SV_PSTL1KEEP), + svprfh (p0, x0, SV_PSTL1KEEP)) + +/* +** prfh_pstl1strm: +** prfh pstl1strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pstl1strm, uint8_t, + svprfh (p0, x0, SV_PSTL1STRM), + svprfh (p0, x0, SV_PSTL1STRM)) + +/* +** prfh_pstl2keep: +** prfh pstl2keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pstl2keep, uint8_t, + svprfh (p0, x0, SV_PSTL2KEEP), + svprfh (p0, x0, SV_PSTL2KEEP)) + +/* +** prfh_pstl2strm: +** prfh pstl2strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pstl2strm, uint8_t, + svprfh (p0, x0, SV_PSTL2STRM), + svprfh (p0, x0, SV_PSTL2STRM)) + +/* +** prfh_pstl3keep: +** prfh pstl3keep, p0, \[x0\] +** ret +*/ 
+TEST_PREFETCH (prfh_pstl3keep, uint8_t, + svprfh (p0, x0, SV_PSTL3KEEP), + svprfh (p0, x0, SV_PSTL3KEEP)) + +/* +** prfh_pstl3strm: +** prfh pstl3strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_pstl3strm, uint8_t, + svprfh (p0, x0, SV_PSTL3STRM), + svprfh (p0, x0, SV_PSTL3STRM)) + +/* +** prfh_vnum_0: +** prfh pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_vnum_0, uint8_t, + svprfh_vnum (p0, x0, 0, SV_PLDL1KEEP), + svprfh_vnum (p0, x0, 0, SV_PLDL1KEEP)) + +/* +** prfh_vnum_1: +** incb x0 +** prfh pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_vnum_1, uint16_t, + svprfh_vnum (p0, x0, 1, SV_PLDL1KEEP), + svprfh_vnum (p0, x0, 1, SV_PLDL1KEEP)) + +/* +** prfh_vnum_2: +** incb x0, all, mul #2 +** prfh pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_vnum_2, uint32_t, + svprfh_vnum (p0, x0, 2, SV_PLDL1KEEP), + svprfh_vnum (p0, x0, 2, SV_PLDL1KEEP)) + +/* +** prfh_vnum_3: +** incb x0, all, mul #3 +** prfh pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfh_vnum_3, uint64_t, + svprfh_vnum (p0, x0, 3, SV_PLDL1KEEP), + svprfh_vnum (p0, x0, 3, SV_PLDL1KEEP)) + +/* +** prfh_vnum_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** prfh pldl1keep, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** prfh pldl1keep, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_PREFETCH (prfh_vnum_x1, uint64_t, + svprfh_vnum (p0, x0, x1, SV_PLDL1KEEP), + svprfh_vnum (p0, x0, x1, SV_PLDL1KEEP)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh_gather.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh_gather.c new file mode 100644 index 000000000..04b7a1575 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh_gather.c @@ -0,0 +1,225 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** prfh_gather_u32base: +** prfh pldl1keep, p0, \[z0\.s\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_u32base, svuint32_t, + svprfh_gather_u32base (p0, z0, SV_PLDL1KEEP), + svprfh_gather (p0, z0, SV_PLDL1KEEP)) + +/* +** prfh_gather_u64base: +** prfh pldl1strm, p0, \[z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_u64base, svuint64_t, + svprfh_gather_u64base (p0, z0, SV_PLDL1STRM), + svprfh_gather (p0, z0, SV_PLDL1STRM)) + +/* +** prfh_gather_x0_u32base_index: +** lsl (x[0-9]+), x0, #?1 +** prfb pldl2keep, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_x0_u32base_index, svuint32_t, + svprfh_gather_u32base_index (p0, z0, x0, SV_PLDL2KEEP), + svprfh_gather_index (p0, z0, x0, SV_PLDL2KEEP)) + +/* +** prfh_gather_m1_u32base_index: +** mov (x[0-9]+), #?-2 +** prfb pldl2strm, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_m1_u32base_index, svuint32_t, + svprfh_gather_u32base_index (p0, z0, -1, SV_PLDL2STRM), + svprfh_gather_index (p0, z0, -1, SV_PLDL2STRM)) + +/* +** prfh_gather_0_u32base_index: +** prfh pldl3keep, p0, \[z0\.s\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_0_u32base_index, svuint32_t, + svprfh_gather_u32base_index (p0, z0, 0, SV_PLDL3KEEP), + svprfh_gather_index (p0, z0, 0, SV_PLDL3KEEP)) + +/* +** prfh_gather_5_u32base_index: +** prfh pldl3strm, p0, \[z0\.s, #10\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_5_u32base_index, svuint32_t, + svprfh_gather_u32base_index (p0, z0, 5, SV_PLDL3STRM), + svprfh_gather_index (p0, z0, 5, SV_PLDL3STRM)) + +/* +** prfh_gather_31_u32base_index: +** prfh pstl1keep, p0, \[z0\.s, #62\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_31_u32base_index, svuint32_t, + svprfh_gather_u32base_index (p0, z0, 31, SV_PSTL1KEEP), + svprfh_gather_index (p0, z0, 31, SV_PSTL1KEEP)) + +/* +** prfh_gather_32_u32base_index: +** mov (x[0-9]+), #?64 +** prfb pstl1strm, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_32_u32base_index, svuint32_t, + svprfh_gather_u32base_index (p0, z0, 32, SV_PSTL1STRM), + svprfh_gather_index (p0, z0, 32, SV_PSTL1STRM)) + +/* +** prfh_gather_x0_u64base_index: +** lsl (x[0-9]+), x0, #?1 +** prfb pstl2keep, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_x0_u64base_index, svuint64_t, + svprfh_gather_u64base_index (p0, z0, x0, SV_PSTL2KEEP), + svprfh_gather_index (p0, z0, x0, SV_PSTL2KEEP)) + +/* +** prfh_gather_m1_u64base_index: +** mov (x[0-9]+), #?-2 +** prfb pstl2strm, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_m1_u64base_index, svuint64_t, + svprfh_gather_u64base_index (p0, z0, -1, SV_PSTL2STRM), + svprfh_gather_index (p0, z0, -1, SV_PSTL2STRM)) + +/* +** prfh_gather_0_u64base_index: +** prfh pstl3keep, p0, \[z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_0_u64base_index, svuint64_t, + svprfh_gather_u64base_index (p0, z0, 0, SV_PSTL3KEEP), + svprfh_gather_index (p0, z0, 0, SV_PSTL3KEEP)) + +/* +** prfh_gather_5_u64base_index: +** prfh pstl3strm, p0, \[z0\.d, #10\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_5_u64base_index, svuint64_t, + svprfh_gather_u64base_index (p0, z0, 5, SV_PSTL3STRM), + svprfh_gather_index (p0, z0, 5, SV_PSTL3STRM)) + +/* +** prfh_gather_31_u64base_index: +** prfh pldl1keep, p0, \[z0\.d, #62\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_31_u64base_index, svuint64_t, + svprfh_gather_u64base_index (p0, z0, 31, SV_PLDL1KEEP), + svprfh_gather_index (p0, z0, 31, SV_PLDL1KEEP)) + +/* +** 
prfh_gather_32_u64base_index: +** mov (x[0-9]+), #?64 +** prfb pldl1strm, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfh_gather_32_u64base_index, svuint64_t, + svprfh_gather_u64base_index (p0, z0, 32, SV_PLDL1STRM), + svprfh_gather_index (p0, z0, 32, SV_PLDL1STRM)) + +/* +** prfh_gather_x0_s32index: +** prfh pldl2keep, p0, \[x0, z0\.s, sxtw 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_x0_s32index, svint32_t, + svprfh_gather_s32index (p0, x0, z0, SV_PLDL2KEEP), + svprfh_gather_index (p0, x0, z0, SV_PLDL2KEEP)) + +/* +** prfh_gather_s32index: +** prfh pldl2strm, p0, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_s32index, svint32_t, + svprfh_gather_s32index (p0, x0, z1, SV_PLDL2STRM), + svprfh_gather_index (p0, x0, z1, SV_PLDL2STRM)) + +/* +** prfh_gather_x0_u32index: +** prfh pldl3keep, p0, \[x0, z0\.s, uxtw 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_x0_u32index, svuint32_t, + svprfh_gather_u32index (p0, x0, z0, SV_PLDL3KEEP), + svprfh_gather_index (p0, x0, z0, SV_PLDL3KEEP)) + +/* +** prfh_gather_u32index: +** prfh pldl3strm, p0, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_u32index, svuint32_t, + svprfh_gather_u32index (p0, x0, z1, SV_PLDL3STRM), + svprfh_gather_index (p0, x0, z1, SV_PLDL3STRM)) + +/* +** prfh_gather_x0_s64index: +** prfh pstl1keep, p0, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_x0_s64index, svint64_t, + svprfh_gather_s64index (p0, x0, z0, SV_PSTL1KEEP), + svprfh_gather_index (p0, x0, z0, SV_PSTL1KEEP)) + +/* +** prfh_gather_s64index: +** prfh pstl1strm, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_s64index, svint64_t, + svprfh_gather_s64index (p0, x0, z1, SV_PSTL1STRM), + svprfh_gather_index (p0, x0, z1, SV_PSTL1STRM)) + +/* +** prfh_gather_ext_s64index: +** prfh pstl1strm, p0, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_ext_s64index, svint64_t, + svprfh_gather_s64index (p0, x0, svextw_s64_x (p0, z1), SV_PSTL1STRM), + svprfh_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL1STRM)) + +/* +** prfh_gather_x0_u64index: +** prfh pstl2keep, p0, \[x0, z0\.d, lsl 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_x0_u64index, svuint64_t, + svprfh_gather_u64index (p0, x0, z0, SV_PSTL2KEEP), + svprfh_gather_index (p0, x0, z0, SV_PSTL2KEEP)) + +/* +** prfh_gather_u64index: +** prfh pstl2strm, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_u64index, svuint64_t, + svprfh_gather_u64index (p0, x0, z1, SV_PSTL2STRM), + svprfh_gather_index (p0, x0, z1, SV_PSTL2STRM)) + +/* +** prfh_gather_ext_u64index: +** prfh pstl2strm, p0, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfh_gather_ext_u64index, svuint64_t, + svprfh_gather_u64index (p0, x0, svextw_u64_x (p0, z1), SV_PSTL2STRM), + svprfh_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL2STRM)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c new file mode 100644 index 000000000..bbf6a45c9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c @@ -0,0 +1,245 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** prfw_base: +** prfw pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_base, uint8_t, + svprfw (p0, x0, SV_PLDL1KEEP), + svprfw (p0, x0, SV_PLDL1KEEP)) + +/* +** prfw_u8_index: +** add (x[0-9+]), (x0, x1|x1, x0) +** prfw pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_u8_index, uint8_t, + svprfw (p0, x0 + x1, SV_PLDL1KEEP), + svprfw (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfw_u8_1: +** add (x[0-9+]), x0, #?1 +** prfw pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfw_u8_1, uint8_t, + svprfw (p0, x0 + 1, SV_PLDL1KEEP), + svprfw (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfw_u16_index: +** add (x[0-9+]), x0, x1, lsl #?1 +** prfw pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfw_u16_index, uint16_t, + svprfw (p0, x0 + x1, SV_PLDL1KEEP), + svprfw (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfw_u16_1: +** add (x[0-9+]), x0, #?2 +** prfw pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfw_u16_1, uint16_t, + svprfw (p0, x0 + 1, SV_PLDL1KEEP), + svprfw (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfw_u32_index: +** prfw pldl1keep, p0, \[x0, x1, lsl #?2\] +** ret +*/ +TEST_PREFETCH (prfw_u32_index, uint32_t, + svprfw (p0, x0 + x1, SV_PLDL1KEEP), + svprfw (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfw_u32_1: +** add (x[0-9+]), x0, #?4 +** prfw pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfw_u32_1, uint32_t, + svprfw (p0, x0 + 1, SV_PLDL1KEEP), + svprfw (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfw_u64_index: +** add (x[0-9+]), x0, x1, lsl #?3 +** prfw pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfw_u64_index, uint64_t, + svprfw (p0, x0 + x1, SV_PLDL1KEEP), + svprfw (p0, x0 + x1, SV_PLDL1KEEP)) + +/* +** prfw_u64_1: +** add (x[0-9+]), x0, #?8 +** prfw pldl1keep, p0, \[\1\] +** ret +*/ +TEST_PREFETCH (prfw_u64_1, uint64_t, + svprfw (p0, x0 + 1, SV_PLDL1KEEP), + svprfw (p0, x0 + 1, SV_PLDL1KEEP)) + +/* +** prfw_pldl1strm: +** prfw pldl1strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pldl1strm, uint8_t, + svprfw (p0, x0, SV_PLDL1STRM), + svprfw (p0, x0, SV_PLDL1STRM)) + +/* +** prfw_pldl2keep: +** prfw pldl2keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pldl2keep, uint8_t, + svprfw (p0, x0, SV_PLDL2KEEP), + svprfw (p0, x0, SV_PLDL2KEEP)) + +/* +** prfw_pldl2strm: +** prfw pldl2strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pldl2strm, uint8_t, + svprfw (p0, x0, SV_PLDL2STRM), + svprfw (p0, x0, SV_PLDL2STRM)) + +/* +** prfw_pldl3keep: +** prfw pldl3keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pldl3keep, uint8_t, + svprfw (p0, x0, SV_PLDL3KEEP), + svprfw (p0, x0, SV_PLDL3KEEP)) + +/* +** prfw_pldl3strm: +** prfw pldl3strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pldl3strm, uint8_t, + svprfw (p0, x0, SV_PLDL3STRM), + svprfw (p0, x0, SV_PLDL3STRM)) + +/* +** prfw_pstl1keep: +** prfw pstl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pstl1keep, uint8_t, + svprfw (p0, x0, SV_PSTL1KEEP), + svprfw (p0, x0, SV_PSTL1KEEP)) + +/* +** prfw_pstl1strm: +** prfw pstl1strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pstl1strm, uint8_t, + svprfw (p0, x0, SV_PSTL1STRM), + svprfw (p0, x0, SV_PSTL1STRM)) + +/* +** prfw_pstl2keep: +** prfw pstl2keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pstl2keep, uint8_t, + svprfw (p0, x0, SV_PSTL2KEEP), + svprfw (p0, x0, SV_PSTL2KEEP)) + +/* +** prfw_pstl2strm: +** prfw pstl2strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pstl2strm, uint8_t, + svprfw (p0, x0, SV_PSTL2STRM), + svprfw (p0, x0, SV_PSTL2STRM)) + +/* +** prfw_pstl3keep: +** prfw pstl3keep, p0, \[x0\] +** ret +*/ 
+TEST_PREFETCH (prfw_pstl3keep, uint8_t, + svprfw (p0, x0, SV_PSTL3KEEP), + svprfw (p0, x0, SV_PSTL3KEEP)) + +/* +** prfw_pstl3strm: +** prfw pstl3strm, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_pstl3strm, uint8_t, + svprfw (p0, x0, SV_PSTL3STRM), + svprfw (p0, x0, SV_PSTL3STRM)) + +/* +** prfw_vnum_0: +** prfw pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_vnum_0, uint8_t, + svprfw_vnum (p0, x0, 0, SV_PLDL1KEEP), + svprfw_vnum (p0, x0, 0, SV_PLDL1KEEP)) + +/* +** prfw_vnum_1: +** incb x0 +** prfw pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_vnum_1, uint16_t, + svprfw_vnum (p0, x0, 1, SV_PLDL1KEEP), + svprfw_vnum (p0, x0, 1, SV_PLDL1KEEP)) + +/* +** prfw_vnum_2: +** incb x0, all, mul #2 +** prfw pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_vnum_2, uint32_t, + svprfw_vnum (p0, x0, 2, SV_PLDL1KEEP), + svprfw_vnum (p0, x0, 2, SV_PLDL1KEEP)) + +/* +** prfw_vnum_3: +** incb x0, all, mul #3 +** prfw pldl1keep, p0, \[x0\] +** ret +*/ +TEST_PREFETCH (prfw_vnum_3, uint64_t, + svprfw_vnum (p0, x0, 3, SV_PLDL1KEEP), + svprfw_vnum (p0, x0, 3, SV_PLDL1KEEP)) + +/* +** prfw_vnum_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** prfw pldl1keep, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** prfw pldl1keep, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_PREFETCH (prfw_vnum_x1, uint64_t, + svprfw_vnum (p0, x0, x1, SV_PLDL1KEEP), + svprfw_vnum (p0, x0, x1, SV_PLDL1KEEP)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw_gather.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw_gather.c new file mode 100644 index 000000000..2bbae1b9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw_gather.c @@ -0,0 +1,225 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** prfw_gather_u32base: +** prfw pldl1keep, p0, \[z0\.s\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_u32base, svuint32_t, + svprfw_gather_u32base (p0, z0, SV_PLDL1KEEP), + svprfw_gather (p0, z0, SV_PLDL1KEEP)) + +/* +** prfw_gather_u64base: +** prfw pldl1strm, p0, \[z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_u64base, svuint64_t, + svprfw_gather_u64base (p0, z0, SV_PLDL1STRM), + svprfw_gather (p0, z0, SV_PLDL1STRM)) + +/* +** prfw_gather_x0_u32base_index: +** lsl (x[0-9]+), x0, #?2 +** prfb pldl2keep, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_x0_u32base_index, svuint32_t, + svprfw_gather_u32base_index (p0, z0, x0, SV_PLDL2KEEP), + svprfw_gather_index (p0, z0, x0, SV_PLDL2KEEP)) + +/* +** prfw_gather_m1_u32base_index: +** mov (x[0-9]+), #?-4 +** prfb pldl2strm, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_m1_u32base_index, svuint32_t, + svprfw_gather_u32base_index (p0, z0, -1, SV_PLDL2STRM), + svprfw_gather_index (p0, z0, -1, SV_PLDL2STRM)) + +/* +** prfw_gather_0_u32base_index: +** prfw pldl3keep, p0, \[z0\.s\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_0_u32base_index, svuint32_t, + svprfw_gather_u32base_index (p0, z0, 0, SV_PLDL3KEEP), + svprfw_gather_index (p0, z0, 0, SV_PLDL3KEEP)) + +/* +** prfw_gather_5_u32base_index: +** prfw pldl3strm, p0, \[z0\.s, #20\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_5_u32base_index, svuint32_t, + svprfw_gather_u32base_index (p0, z0, 5, SV_PLDL3STRM), + svprfw_gather_index (p0, z0, 5, SV_PLDL3STRM)) + +/* +** prfw_gather_31_u32base_index: +** prfw pstl1keep, p0, \[z0\.s, #124\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_31_u32base_index, svuint32_t, + svprfw_gather_u32base_index (p0, z0, 31, SV_PSTL1KEEP), + svprfw_gather_index (p0, z0, 31, SV_PSTL1KEEP)) + +/* +** prfw_gather_32_u32base_index: +** mov (x[0-9]+), #?128 +** prfb pstl1strm, p0, \[\1, z0\.s, uxtw\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_32_u32base_index, svuint32_t, + svprfw_gather_u32base_index (p0, z0, 32, SV_PSTL1STRM), + svprfw_gather_index (p0, z0, 32, SV_PSTL1STRM)) + +/* +** prfw_gather_x0_u64base_index: +** lsl (x[0-9]+), x0, #?2 +** prfb pstl2keep, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_x0_u64base_index, svuint64_t, + svprfw_gather_u64base_index (p0, z0, x0, SV_PSTL2KEEP), + svprfw_gather_index (p0, z0, x0, SV_PSTL2KEEP)) + +/* +** prfw_gather_m1_u64base_index: +** mov (x[0-9]+), #?-4 +** prfb pstl2strm, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_m1_u64base_index, svuint64_t, + svprfw_gather_u64base_index (p0, z0, -1, SV_PSTL2STRM), + svprfw_gather_index (p0, z0, -1, SV_PSTL2STRM)) + +/* +** prfw_gather_0_u64base_index: +** prfw pstl3keep, p0, \[z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_0_u64base_index, svuint64_t, + svprfw_gather_u64base_index (p0, z0, 0, SV_PSTL3KEEP), + svprfw_gather_index (p0, z0, 0, SV_PSTL3KEEP)) + +/* +** prfw_gather_5_u64base_index: +** prfw pstl3strm, p0, \[z0\.d, #20\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_5_u64base_index, svuint64_t, + svprfw_gather_u64base_index (p0, z0, 5, SV_PSTL3STRM), + svprfw_gather_index (p0, z0, 5, SV_PSTL3STRM)) + +/* +** prfw_gather_31_u64base_index: +** prfw pldl1keep, p0, \[z0\.d, #124\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_31_u64base_index, svuint64_t, + svprfw_gather_u64base_index (p0, z0, 31, SV_PLDL1KEEP), + svprfw_gather_index (p0, z0, 31, SV_PLDL1KEEP)) + +/* 
+** prfw_gather_32_u64base_index: +** mov (x[0-9]+), #?128 +** prfb pldl1strm, p0, \[\1, z0\.d\] +** ret +*/ +TEST_PREFETCH_GATHER_ZS (prfw_gather_32_u64base_index, svuint64_t, + svprfw_gather_u64base_index (p0, z0, 32, SV_PLDL1STRM), + svprfw_gather_index (p0, z0, 32, SV_PLDL1STRM)) + +/* +** prfw_gather_x0_s32index: +** prfw pldl2keep, p0, \[x0, z0\.s, sxtw 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_x0_s32index, svint32_t, + svprfw_gather_s32index (p0, x0, z0, SV_PLDL2KEEP), + svprfw_gather_index (p0, x0, z0, SV_PLDL2KEEP)) + +/* +** prfw_gather_s32index: +** prfw pldl2strm, p0, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_s32index, svint32_t, + svprfw_gather_s32index (p0, x0, z1, SV_PLDL2STRM), + svprfw_gather_index (p0, x0, z1, SV_PLDL2STRM)) + +/* +** prfw_gather_x0_u32index: +** prfw pldl3keep, p0, \[x0, z0\.s, uxtw 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_x0_u32index, svuint32_t, + svprfw_gather_u32index (p0, x0, z0, SV_PLDL3KEEP), + svprfw_gather_index (p0, x0, z0, SV_PLDL3KEEP)) + +/* +** prfw_gather_u32index: +** prfw pldl3strm, p0, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_u32index, svuint32_t, + svprfw_gather_u32index (p0, x0, z1, SV_PLDL3STRM), + svprfw_gather_index (p0, x0, z1, SV_PLDL3STRM)) + +/* +** prfw_gather_x0_s64index: +** prfw pstl1keep, p0, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_x0_s64index, svint64_t, + svprfw_gather_s64index (p0, x0, z0, SV_PSTL1KEEP), + svprfw_gather_index (p0, x0, z0, SV_PSTL1KEEP)) + +/* +** prfw_gather_s64index: +** prfw pstl1strm, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_s64index, svint64_t, + svprfw_gather_s64index (p0, x0, z1, SV_PSTL1STRM), + svprfw_gather_index (p0, x0, z1, SV_PSTL1STRM)) + +/* +** prfw_gather_ext_s64index: +** prfw pstl1strm, p0, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_ext_s64index, svint64_t, + svprfw_gather_s64index (p0, x0, svextw_s64_x (p0, z1), SV_PSTL1STRM), + svprfw_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL1STRM)) + +/* +** prfw_gather_x0_u64index: +** prfw pstl2keep, p0, \[x0, z0\.d, lsl 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_x0_u64index, svuint64_t, + svprfw_gather_u64index (p0, x0, z0, SV_PSTL2KEEP), + svprfw_gather_index (p0, x0, z0, SV_PSTL2KEEP)) + +/* +** prfw_gather_u64index: +** prfw pstl2strm, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_u64index, svuint64_t, + svprfw_gather_u64index (p0, x0, z1, SV_PSTL2STRM), + svprfw_gather_index (p0, x0, z1, SV_PSTL2STRM)) + +/* +** prfw_gather_ext_u64index: +** prfw pstl2strm, p0, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_PREFETCH_GATHER_SZ (prfw_gather_ext_u64index, svuint64_t, + svprfw_gather_u64index (p0, x0, svextw_u64_x (p0, z1), SV_PSTL2STRM), + svprfw_gather_index (p0, x0, svextw_x (p0, z1), SV_PSTL2STRM)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_any.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_any.c new file mode 100644 index 000000000..33280d388 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_any.c @@ -0,0 +1,77 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" +#include <stdbool.h> + +/* +** test_bool_any: +** ptest p0, p1\.b +** cset [wx]0, any +** ret +*/ +TEST_PTEST (test_bool_any, bool, + x0 = svptest_any (p0, p1)); + +/* +** test_bool_none: +** ptest p0, p1\.b +** cset 
[wx]0, none +** ret +*/ +TEST_PTEST (test_bool_none, bool, + x0 = !svptest_any (p0, p1)); + +/* +** test_int_any: +** ptest p0, p1\.b +** cset [wx]0, any +** ret +*/ +TEST_PTEST (test_int_any, int, + x0 = svptest_any (p0, p1)); + +/* +** test_int_none: +** ptest p0, p1\.b +** cset [wx]0, none +** ret +*/ +TEST_PTEST (test_int_none, int, + x0 = !svptest_any (p0, p1)); + +/* +** test_int64_t_any: +** ptest p0, p1\.b +** cset [wx]0, any +** ret +*/ +TEST_PTEST (test_int64_t_any, int64_t, + x0 = svptest_any (p0, p1)); + +/* +** test_int64_t_none: +** ptest p0, p1\.b +** cset [wx]0, none +** ret +*/ +TEST_PTEST (test_int64_t_none, int64_t, + x0 = !svptest_any (p0, p1)); + +/* +** sel_any: +** ptest p0, p1\.b +** csel x0, (x0, x1, any|x1, x0, none) +** ret +*/ +TEST_PTEST (sel_any, int64_t, + x0 = svptest_any (p0, p1) ? x0 : x1); + +/* +** sel_none: +** ptest p0, p1\.b +** csel x0, (x0, x1, none|x1, x0, any) +** ret +*/ +TEST_PTEST (sel_none, int64_t, + x0 = !svptest_any (p0, p1) ? x0 : x1); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_first.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_first.c new file mode 100644 index 000000000..991dabd3d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_first.c @@ -0,0 +1,77 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" +#include <stdbool.h> + +/* +** test_bool_first: +** ptest p0, p1\.b +** cset [wx]0, first +** ret +*/ +TEST_PTEST (test_bool_first, bool, + x0 = svptest_first (p0, p1)); + +/* +** test_bool_nfrst: +** ptest p0, p1\.b +** cset [wx]0, nfrst +** ret +*/ +TEST_PTEST (test_bool_nfrst, bool, + x0 = !svptest_first (p0, p1)); + +/* +** test_int_first: +** ptest p0, p1\.b +** cset [wx]0, first +** ret +*/ +TEST_PTEST (test_int_first, int, + x0 = svptest_first (p0, p1)); + +/* +** test_int_nfrst: +** ptest p0, p1\.b +** cset [wx]0, nfrst +** ret +*/ +TEST_PTEST (test_int_nfrst, int, + x0 = !svptest_first (p0, p1)); + +/* +** test_int64_t_first: +** ptest p0, p1\.b +** cset [wx]0, first +** ret +*/ +TEST_PTEST (test_int64_t_first, int64_t, + x0 = svptest_first (p0, p1)); + +/* +** test_int64_t_nfrst: +** ptest p0, p1\.b +** cset [wx]0, nfrst +** ret +*/ +TEST_PTEST (test_int64_t_nfrst, int64_t, + x0 = !svptest_first (p0, p1)); + +/* +** sel_first: +** ptest p0, p1\.b +** csel x0, (x0, x1, first|x1, x0, nfrst) +** ret +*/ +TEST_PTEST (sel_first, int64_t, + x0 = svptest_first (p0, p1) ? x0 : x1); + +/* +** sel_nfrst: +** ptest p0, p1\.b +** csel x0, (x0, x1, nfrst|x1, x0, first) +** ret +*/ +TEST_PTEST (sel_nfrst, int64_t, + x0 = !svptest_first (p0, p1) ? 
x0 : x1); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_last.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_last.c new file mode 100644 index 000000000..b952a4149 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptest_last.c @@ -0,0 +1,77 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" +#include <stdbool.h> + +/* +** test_bool_last: +** ptest p0, p1\.b +** cset [wx]0, last +** ret +*/ +TEST_PTEST (test_bool_last, bool, + x0 = svptest_last (p0, p1)); + +/* +** test_bool_nlast: +** ptest p0, p1\.b +** cset [wx]0, nlast +** ret +*/ +TEST_PTEST (test_bool_nlast, bool, + x0 = !svptest_last (p0, p1)); + +/* +** test_int_last: +** ptest p0, p1\.b +** cset [wx]0, last +** ret +*/ +TEST_PTEST (test_int_last, int, + x0 = svptest_last (p0, p1)); + +/* +** test_int_nlast: +** ptest p0, p1\.b +** cset [wx]0, nlast +** ret +*/ +TEST_PTEST (test_int_nlast, int, + x0 = !svptest_last (p0, p1)); + +/* +** test_int64_t_last: +** ptest p0, p1\.b +** cset [wx]0, last +** ret +*/ +TEST_PTEST (test_int64_t_last, int64_t, + x0 = svptest_last (p0, p1)); + +/* +** test_int64_t_nlast: +** ptest p0, p1\.b +** cset [wx]0, nlast +** ret +*/ +TEST_PTEST (test_int64_t_nlast, int64_t, + x0 = !svptest_last (p0, p1)); + +/* +** sel_last: +** ptest p0, p1\.b +** csel x0, (x0, x1, last|x1, x0, nlast) +** ret +*/ +TEST_PTEST (sel_last, int64_t, + x0 = svptest_last (p0, p1) ? x0 : x1); + +/* +** sel_nlast: +** ptest p0, p1\.b +** csel x0, (x0, x1, nlast|x1, x0, last) +** ret +*/ +TEST_PTEST (sel_nlast, int64_t, + x0 = !svptest_last (p0, p1) ? x0 : x1); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue.c new file mode 100644 index 000000000..9c86170cb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue.c @@ -0,0 +1,40 @@ +/* { dg-additional-options "-msve-vector-bits=scalable" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ptrue_b8: +** ptrue p0\.b, all +** ret +*/ +TEST_P (ptrue_b8, + p0 = svptrue_b8 (), + p0 = svptrue_b8 ()); + +/* +** ptrue_b16: +** ptrue p0\.h, all +** ret +*/ +TEST_P (ptrue_b16, + p0 = svptrue_b16 (), + p0 = svptrue_b16 ()); + +/* +** ptrue_b32: +** ptrue p0\.s, all +** ret +*/ +TEST_P (ptrue_b32, + p0 = svptrue_b32 (), + p0 = svptrue_b32 ()); + +/* +** ptrue_b64: +** ptrue p0\.d, all +** ret +*/ +TEST_P (ptrue_b64, + p0 = svptrue_b64 (), + p0 = svptrue_b64 ()); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b16.c new file mode 100644 index 000000000..d7f83f5c6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b16.c @@ -0,0 +1,156 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ptrue_pat_pow2_b16: +** ptrue p0\.h, pow2 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_pow2_b16, + p0 = svptrue_pat_b16 (SV_POW2), + p0 = svptrue_pat_b16 (SV_POW2)) + +/* +** ptrue_pat_vl1_b16: +** ptrue p0\.[bhsd], vl1 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl1_b16, + p0 = svptrue_pat_b16 (SV_VL1), + p0 = svptrue_pat_b16 (SV_VL1)) + +/* +** ptrue_pat_vl2_b16: +** ptrue p0\.h, vl2 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl2_b16, + p0 = svptrue_pat_b16 (SV_VL2), + p0 = svptrue_pat_b16 (SV_VL2)) + +/* +** ptrue_pat_vl3_b16: +** ptrue p0\.h, vl3 +** ret +*/ +TEST_UNIFORM_P 
(ptrue_pat_vl3_b16, + p0 = svptrue_pat_b16 (SV_VL3), + p0 = svptrue_pat_b16 (SV_VL3)) + +/* +** ptrue_pat_vl4_b16: +** ptrue p0\.h, vl4 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl4_b16, + p0 = svptrue_pat_b16 (SV_VL4), + p0 = svptrue_pat_b16 (SV_VL4)) + +/* +** ptrue_pat_vl5_b16: +** ptrue p0\.h, vl5 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl5_b16, + p0 = svptrue_pat_b16 (SV_VL5), + p0 = svptrue_pat_b16 (SV_VL5)) + +/* +** ptrue_pat_vl6_b16: +** ptrue p0\.h, vl6 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl6_b16, + p0 = svptrue_pat_b16 (SV_VL6), + p0 = svptrue_pat_b16 (SV_VL6)) + +/* +** ptrue_pat_vl7_b16: +** ptrue p0\.h, vl7 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl7_b16, + p0 = svptrue_pat_b16 (SV_VL7), + p0 = svptrue_pat_b16 (SV_VL7)) + +/* +** ptrue_pat_vl8_b16: +** ptrue p0\.h, vl8 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl8_b16, + p0 = svptrue_pat_b16 (SV_VL8), + p0 = svptrue_pat_b16 (SV_VL8)) + +/* +** ptrue_pat_vl16_b16: +** ptrue p0\.[bhsd], vl16 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl16_b16, + p0 = svptrue_pat_b16 (SV_VL16), + p0 = svptrue_pat_b16 (SV_VL16)) + +/* +** ptrue_pat_vl32_b16: +** ptrue p0\.h, vl32 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl32_b16, + p0 = svptrue_pat_b16 (SV_VL32), + p0 = svptrue_pat_b16 (SV_VL32)) + +/* +** ptrue_pat_vl64_b16: +** ptrue p0\.h, vl64 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl64_b16, + p0 = svptrue_pat_b16 (SV_VL64), + p0 = svptrue_pat_b16 (SV_VL64)) + +/* +** ptrue_pat_vl128_b16: +** ptrue p0\.[bhsd], vl128 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl128_b16, + p0 = svptrue_pat_b16 (SV_VL128), + p0 = svptrue_pat_b16 (SV_VL128)) + +/* +** ptrue_pat_vl256_b16: +** ptrue p0\.h, vl256 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl256_b16, + p0 = svptrue_pat_b16 (SV_VL256), + p0 = svptrue_pat_b16 (SV_VL256)) + +/* +** ptrue_pat_mul4_b16: +** ptrue p0\.h, mul4 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_mul4_b16, + p0 = svptrue_pat_b16 (SV_MUL4), + p0 = svptrue_pat_b16 (SV_MUL4)) + +/* +** ptrue_pat_mul3_b16: +** ptrue p0\.h, mul3 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_mul3_b16, + p0 = svptrue_pat_b16 (SV_MUL3), + p0 = svptrue_pat_b16 (SV_MUL3)) + +/* +** ptrue_pat_all_b16: +** ptrue p0\.h[^\n]* +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_all_b16, + p0 = svptrue_pat_b16 (SV_ALL), + p0 = svptrue_pat_b16 (SV_ALL)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b32.c new file mode 100644 index 000000000..11cf5aebb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b32.c @@ -0,0 +1,156 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ptrue_pat_pow2_b32: +** ptrue p0\.s, pow2 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_pow2_b32, + p0 = svptrue_pat_b32 (SV_POW2), + p0 = svptrue_pat_b32 (SV_POW2)) + +/* +** ptrue_pat_vl1_b32: +** ptrue p0\.[bhsd], vl1 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl1_b32, + p0 = svptrue_pat_b32 (SV_VL1), + p0 = svptrue_pat_b32 (SV_VL1)) + +/* +** ptrue_pat_vl2_b32: +** ptrue p0\.s, vl2 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl2_b32, + p0 = svptrue_pat_b32 (SV_VL2), + p0 = svptrue_pat_b32 (SV_VL2)) + +/* +** ptrue_pat_vl3_b32: +** ptrue p0\.s, vl3 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl3_b32, + p0 = svptrue_pat_b32 (SV_VL3), + p0 = svptrue_pat_b32 (SV_VL3)) + +/* +** ptrue_pat_vl4_b32: +** ptrue p0\.s, vl4 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl4_b32, + p0 = svptrue_pat_b32 (SV_VL4), + p0 = svptrue_pat_b32 (SV_VL4)) + +/* +** ptrue_pat_vl5_b32: +** ptrue p0\.s, vl5 +** ret +*/ 
+TEST_UNIFORM_P (ptrue_pat_vl5_b32, + p0 = svptrue_pat_b32 (SV_VL5), + p0 = svptrue_pat_b32 (SV_VL5)) + +/* +** ptrue_pat_vl6_b32: +** ptrue p0\.s, vl6 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl6_b32, + p0 = svptrue_pat_b32 (SV_VL6), + p0 = svptrue_pat_b32 (SV_VL6)) + +/* +** ptrue_pat_vl7_b32: +** ptrue p0\.s, vl7 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl7_b32, + p0 = svptrue_pat_b32 (SV_VL7), + p0 = svptrue_pat_b32 (SV_VL7)) + +/* +** ptrue_pat_vl8_b32: +** ptrue p0\.s, vl8 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl8_b32, + p0 = svptrue_pat_b32 (SV_VL8), + p0 = svptrue_pat_b32 (SV_VL8)) + +/* +** ptrue_pat_vl16_b32: +** ptrue p0\.[bhsd], vl16 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl16_b32, + p0 = svptrue_pat_b32 (SV_VL16), + p0 = svptrue_pat_b32 (SV_VL16)) + +/* +** ptrue_pat_vl32_b32: +** ptrue p0\.s, vl32 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl32_b32, + p0 = svptrue_pat_b32 (SV_VL32), + p0 = svptrue_pat_b32 (SV_VL32)) + +/* +** ptrue_pat_vl64_b32: +** ptrue p0\.s, vl64 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl64_b32, + p0 = svptrue_pat_b32 (SV_VL64), + p0 = svptrue_pat_b32 (SV_VL64)) + +/* +** ptrue_pat_vl128_b32: +** ptrue p0\.[bhsd], vl128 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl128_b32, + p0 = svptrue_pat_b32 (SV_VL128), + p0 = svptrue_pat_b32 (SV_VL128)) + +/* +** ptrue_pat_vl256_b32: +** ptrue p0\.s, vl256 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl256_b32, + p0 = svptrue_pat_b32 (SV_VL256), + p0 = svptrue_pat_b32 (SV_VL256)) + +/* +** ptrue_pat_mul4_b32: +** ptrue p0\.s, mul4 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_mul4_b32, + p0 = svptrue_pat_b32 (SV_MUL4), + p0 = svptrue_pat_b32 (SV_MUL4)) + +/* +** ptrue_pat_mul3_b32: +** ptrue p0\.s, mul3 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_mul3_b32, + p0 = svptrue_pat_b32 (SV_MUL3), + p0 = svptrue_pat_b32 (SV_MUL3)) + +/* +** ptrue_pat_all_b32: +** ptrue p0\.s[^\n]* +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_all_b32, + p0 = svptrue_pat_b32 (SV_ALL), + p0 = svptrue_pat_b32 (SV_ALL)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b64.c new file mode 100644 index 000000000..4c4202bb3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b64.c @@ -0,0 +1,156 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ptrue_pat_pow2_b64: +** ptrue p0\.d, pow2 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_pow2_b64, + p0 = svptrue_pat_b64 (SV_POW2), + p0 = svptrue_pat_b64 (SV_POW2)) + +/* +** ptrue_pat_vl1_b64: +** ptrue p0\.[bhsd], vl1 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl1_b64, + p0 = svptrue_pat_b64 (SV_VL1), + p0 = svptrue_pat_b64 (SV_VL1)) + +/* +** ptrue_pat_vl2_b64: +** ptrue p0\.d, vl2 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl2_b64, + p0 = svptrue_pat_b64 (SV_VL2), + p0 = svptrue_pat_b64 (SV_VL2)) + +/* +** ptrue_pat_vl3_b64: +** ptrue p0\.d, vl3 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl3_b64, + p0 = svptrue_pat_b64 (SV_VL3), + p0 = svptrue_pat_b64 (SV_VL3)) + +/* +** ptrue_pat_vl4_b64: +** ptrue p0\.d, vl4 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl4_b64, + p0 = svptrue_pat_b64 (SV_VL4), + p0 = svptrue_pat_b64 (SV_VL4)) + +/* +** ptrue_pat_vl5_b64: +** ptrue p0\.d, vl5 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl5_b64, + p0 = svptrue_pat_b64 (SV_VL5), + p0 = svptrue_pat_b64 (SV_VL5)) + +/* +** ptrue_pat_vl6_b64: +** ptrue p0\.d, vl6 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl6_b64, + p0 = svptrue_pat_b64 (SV_VL6), + p0 = svptrue_pat_b64 (SV_VL6)) + +/* +** ptrue_pat_vl7_b64: +** ptrue p0\.d, vl7 
+** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl7_b64, + p0 = svptrue_pat_b64 (SV_VL7), + p0 = svptrue_pat_b64 (SV_VL7)) + +/* +** ptrue_pat_vl8_b64: +** ptrue p0\.d, vl8 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl8_b64, + p0 = svptrue_pat_b64 (SV_VL8), + p0 = svptrue_pat_b64 (SV_VL8)) + +/* +** ptrue_pat_vl16_b64: +** ptrue p0\.[bhsd], vl16 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl16_b64, + p0 = svptrue_pat_b64 (SV_VL16), + p0 = svptrue_pat_b64 (SV_VL16)) + +/* +** ptrue_pat_vl32_b64: +** ptrue p0\.d, vl32 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl32_b64, + p0 = svptrue_pat_b64 (SV_VL32), + p0 = svptrue_pat_b64 (SV_VL32)) + +/* +** ptrue_pat_vl64_b64: +** ptrue p0\.d, vl64 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl64_b64, + p0 = svptrue_pat_b64 (SV_VL64), + p0 = svptrue_pat_b64 (SV_VL64)) + +/* +** ptrue_pat_vl128_b64: +** ptrue p0\.[bhsd], vl128 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl128_b64, + p0 = svptrue_pat_b64 (SV_VL128), + p0 = svptrue_pat_b64 (SV_VL128)) + +/* +** ptrue_pat_vl256_b64: +** ptrue p0\.d, vl256 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl256_b64, + p0 = svptrue_pat_b64 (SV_VL256), + p0 = svptrue_pat_b64 (SV_VL256)) + +/* +** ptrue_pat_mul4_b64: +** ptrue p0\.d, mul4 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_mul4_b64, + p0 = svptrue_pat_b64 (SV_MUL4), + p0 = svptrue_pat_b64 (SV_MUL4)) + +/* +** ptrue_pat_mul3_b64: +** ptrue p0\.d, mul3 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_mul3_b64, + p0 = svptrue_pat_b64 (SV_MUL3), + p0 = svptrue_pat_b64 (SV_MUL3)) + +/* +** ptrue_pat_all_b64: +** ptrue p0\.d[^\n]* +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_all_b64, + p0 = svptrue_pat_b64 (SV_ALL), + p0 = svptrue_pat_b64 (SV_ALL)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b8.c new file mode 100644 index 000000000..49fb8c555 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/ptrue_pat_b8.c @@ -0,0 +1,156 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** ptrue_pat_pow2_b8: +** ptrue p0\.b, pow2 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_pow2_b8, + p0 = svptrue_pat_b8 (SV_POW2), + p0 = svptrue_pat_b8 (SV_POW2)) + +/* +** ptrue_pat_vl1_b8: +** ptrue p0\.[bhsd], vl1 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl1_b8, + p0 = svptrue_pat_b8 (SV_VL1), + p0 = svptrue_pat_b8 (SV_VL1)) + +/* +** ptrue_pat_vl2_b8: +** ptrue p0\.b, vl2 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl2_b8, + p0 = svptrue_pat_b8 (SV_VL2), + p0 = svptrue_pat_b8 (SV_VL2)) + +/* +** ptrue_pat_vl3_b8: +** ptrue p0\.b, vl3 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl3_b8, + p0 = svptrue_pat_b8 (SV_VL3), + p0 = svptrue_pat_b8 (SV_VL3)) + +/* +** ptrue_pat_vl4_b8: +** ptrue p0\.b, vl4 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl4_b8, + p0 = svptrue_pat_b8 (SV_VL4), + p0 = svptrue_pat_b8 (SV_VL4)) + +/* +** ptrue_pat_vl5_b8: +** ptrue p0\.b, vl5 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl5_b8, + p0 = svptrue_pat_b8 (SV_VL5), + p0 = svptrue_pat_b8 (SV_VL5)) + +/* +** ptrue_pat_vl6_b8: +** ptrue p0\.b, vl6 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl6_b8, + p0 = svptrue_pat_b8 (SV_VL6), + p0 = svptrue_pat_b8 (SV_VL6)) + +/* +** ptrue_pat_vl7_b8: +** ptrue p0\.b, vl7 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl7_b8, + p0 = svptrue_pat_b8 (SV_VL7), + p0 = svptrue_pat_b8 (SV_VL7)) + +/* +** ptrue_pat_vl8_b8: +** ptrue p0\.b, vl8 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl8_b8, + p0 = svptrue_pat_b8 (SV_VL8), + p0 = svptrue_pat_b8 (SV_VL8)) + +/* +** ptrue_pat_vl16_b8: +** ptrue p0\.[bhsd], vl16 +** ret +*/ 
+TEST_UNIFORM_P (ptrue_pat_vl16_b8, + p0 = svptrue_pat_b8 (SV_VL16), + p0 = svptrue_pat_b8 (SV_VL16)) + +/* +** ptrue_pat_vl32_b8: +** ptrue p0\.b, vl32 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl32_b8, + p0 = svptrue_pat_b8 (SV_VL32), + p0 = svptrue_pat_b8 (SV_VL32)) + +/* +** ptrue_pat_vl64_b8: +** ptrue p0\.b, vl64 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl64_b8, + p0 = svptrue_pat_b8 (SV_VL64), + p0 = svptrue_pat_b8 (SV_VL64)) + +/* +** ptrue_pat_vl128_b8: +** ptrue p0\.[bhsd], vl128 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl128_b8, + p0 = svptrue_pat_b8 (SV_VL128), + p0 = svptrue_pat_b8 (SV_VL128)) + +/* +** ptrue_pat_vl256_b8: +** ptrue p0\.b, vl256 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_vl256_b8, + p0 = svptrue_pat_b8 (SV_VL256), + p0 = svptrue_pat_b8 (SV_VL256)) + +/* +** ptrue_pat_mul4_b8: +** ptrue p0\.b, mul4 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_mul4_b8, + p0 = svptrue_pat_b8 (SV_MUL4), + p0 = svptrue_pat_b8 (SV_MUL4)) + +/* +** ptrue_pat_mul3_b8: +** ptrue p0\.b, mul3 +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_mul3_b8, + p0 = svptrue_pat_b8 (SV_MUL3), + p0 = svptrue_pat_b8 (SV_MUL3)) + +/* +** ptrue_pat_all_b8: +** ptrue p0\.b[^\n]* +** ret +*/ +TEST_UNIFORM_P (ptrue_pat_all_b8, + p0 = svptrue_pat_b8 (SV_ALL), + p0 = svptrue_pat_b8 (SV_ALL)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s16.c new file mode 100644 index 000000000..03255c41c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s16.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qadd_s16_tied1: +** sqadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (qadd_s16_tied1, svint16_t, + z0 = svqadd_s16 (z0, z1), + z0 = svqadd (z0, z1)) + +/* +** qadd_s16_tied2: +** sqadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (qadd_s16_tied2, svint16_t, + z0 = svqadd_s16 (z1, z0), + z0 = svqadd (z1, z0)) + +/* +** qadd_s16_untied: +** sqadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (qadd_s16_untied, svint16_t, + z0 = svqadd_s16 (z1, z2), + z0 = svqadd (z1, z2)) + +/* +** qadd_w0_s16_tied1: +** mov (z[0-9]+\.h), w0 +** sqadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_s16_tied1, svint16_t, int16_t, + z0 = svqadd_n_s16 (z0, x0), + z0 = svqadd (z0, x0)) + +/* +** qadd_w0_s16_untied: +** mov (z[0-9]+\.h), w0 +** sqadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_s16_untied, svint16_t, int16_t, + z0 = svqadd_n_s16 (z1, x0), + z0 = svqadd (z1, x0)) + +/* +** qadd_1_s16_tied1: +** sqadd z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_s16_tied1, svint16_t, + z0 = svqadd_n_s16 (z0, 1), + z0 = svqadd (z0, 1)) + +/* +** qadd_1_s16_untied: +** movprfx z0, z1 +** sqadd z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_s16_untied, svint16_t, + z0 = svqadd_n_s16 (z1, 1), + z0 = svqadd (z1, 1)) + +/* +** qadd_127_s16: +** sqadd z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_127_s16, svint16_t, + z0 = svqadd_n_s16 (z0, 127), + z0 = svqadd (z0, 127)) + +/* +** qadd_128_s16: +** sqadd z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_128_s16, svint16_t, + z0 = svqadd_n_s16 (z0, 128), + z0 = svqadd (z0, 128)) + +/* +** qadd_255_s16: +** sqadd z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (qadd_255_s16, svint16_t, + z0 = svqadd_n_s16 (z0, 255), + z0 = svqadd (z0, 255)) + +/* +** qadd_m1_s16: +** sqsub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_m1_s16, svint16_t, 
+ z0 = svqadd_n_s16 (z0, -1), + z0 = svqadd (z0, -1)) + +/* +** qadd_m127_s16: +** sqsub z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_m127_s16, svint16_t, + z0 = svqadd_n_s16 (z0, -127), + z0 = svqadd (z0, -127)) + +/* +** qadd_m128_s16: +** sqsub z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_m128_s16, svint16_t, + z0 = svqadd_n_s16 (z0, -128), + z0 = svqadd (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s32.c new file mode 100644 index 000000000..197cc3840 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s32.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qadd_s32_tied1: +** sqadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_s32_tied1, svint32_t, + z0 = svqadd_s32 (z0, z1), + z0 = svqadd (z0, z1)) + +/* +** qadd_s32_tied2: +** sqadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_s32_tied2, svint32_t, + z0 = svqadd_s32 (z1, z0), + z0 = svqadd (z1, z0)) + +/* +** qadd_s32_untied: +** sqadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_s32_untied, svint32_t, + z0 = svqadd_s32 (z1, z2), + z0 = svqadd (z1, z2)) + +/* +** qadd_w0_s32_tied1: +** mov (z[0-9]+\.s), w0 +** sqadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_s32_tied1, svint32_t, int32_t, + z0 = svqadd_n_s32 (z0, x0), + z0 = svqadd (z0, x0)) + +/* +** qadd_w0_s32_untied: +** mov (z[0-9]+\.s), w0 +** sqadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_s32_untied, svint32_t, int32_t, + z0 = svqadd_n_s32 (z1, x0), + z0 = svqadd (z1, x0)) + +/* +** qadd_1_s32_tied1: +** sqadd z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_s32_tied1, svint32_t, + z0 = svqadd_n_s32 (z0, 1), + z0 = svqadd (z0, 1)) + +/* +** qadd_1_s32_untied: +** movprfx z0, z1 +** sqadd z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_s32_untied, svint32_t, + z0 = svqadd_n_s32 (z1, 1), + z0 = svqadd (z1, 1)) + +/* +** qadd_127_s32: +** sqadd z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_127_s32, svint32_t, + z0 = svqadd_n_s32 (z0, 127), + z0 = svqadd (z0, 127)) + +/* +** qadd_128_s32: +** sqadd z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_128_s32, svint32_t, + z0 = svqadd_n_s32 (z0, 128), + z0 = svqadd (z0, 128)) + +/* +** qadd_255_s32: +** sqadd z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (qadd_255_s32, svint32_t, + z0 = svqadd_n_s32 (z0, 255), + z0 = svqadd (z0, 255)) + +/* +** qadd_m1_s32: +** sqsub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_m1_s32, svint32_t, + z0 = svqadd_n_s32 (z0, -1), + z0 = svqadd (z0, -1)) + +/* +** qadd_m127_s32: +** sqsub z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_m127_s32, svint32_t, + z0 = svqadd_n_s32 (z0, -127), + z0 = svqadd (z0, -127)) + +/* +** qadd_m128_s32: +** sqsub z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_m128_s32, svint32_t, + z0 = svqadd_n_s32 (z0, -128), + z0 = svqadd (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s64.c new file mode 100644 index 000000000..0218866ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s64.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qadd_s64_tied1: +** sqadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_s64_tied1, 
svint64_t, + z0 = svqadd_s64 (z0, z1), + z0 = svqadd (z0, z1)) + +/* +** qadd_s64_tied2: +** sqadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_s64_tied2, svint64_t, + z0 = svqadd_s64 (z1, z0), + z0 = svqadd (z1, z0)) + +/* +** qadd_s64_untied: +** sqadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_s64_untied, svint64_t, + z0 = svqadd_s64 (z1, z2), + z0 = svqadd (z1, z2)) + +/* +** qadd_x0_s64_tied1: +** mov (z[0-9]+\.d), x0 +** sqadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (qadd_x0_s64_tied1, svint64_t, int64_t, + z0 = svqadd_n_s64 (z0, x0), + z0 = svqadd (z0, x0)) + +/* +** qadd_x0_s64_untied: +** mov (z[0-9]+\.d), x0 +** sqadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (qadd_x0_s64_untied, svint64_t, int64_t, + z0 = svqadd_n_s64 (z1, x0), + z0 = svqadd (z1, x0)) + +/* +** qadd_1_s64_tied1: +** sqadd z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_s64_tied1, svint64_t, + z0 = svqadd_n_s64 (z0, 1), + z0 = svqadd (z0, 1)) + +/* +** qadd_1_s64_untied: +** movprfx z0, z1 +** sqadd z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_s64_untied, svint64_t, + z0 = svqadd_n_s64 (z1, 1), + z0 = svqadd (z1, 1)) + +/* +** qadd_127_s64: +** sqadd z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_127_s64, svint64_t, + z0 = svqadd_n_s64 (z0, 127), + z0 = svqadd (z0, 127)) + +/* +** qadd_128_s64: +** sqadd z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_128_s64, svint64_t, + z0 = svqadd_n_s64 (z0, 128), + z0 = svqadd (z0, 128)) + +/* +** qadd_255_s64: +** sqadd z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (qadd_255_s64, svint64_t, + z0 = svqadd_n_s64 (z0, 255), + z0 = svqadd (z0, 255)) + +/* +** qadd_m1_s64: +** sqsub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_m1_s64, svint64_t, + z0 = svqadd_n_s64 (z0, -1), + z0 = svqadd (z0, -1)) + +/* +** qadd_m127_s64: +** sqsub z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_m127_s64, svint64_t, + z0 = svqadd_n_s64 (z0, -127), + z0 = svqadd (z0, -127)) + +/* +** qadd_m128_s64: +** sqsub z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_m128_s64, svint64_t, + z0 = svqadd_n_s64 (z0, -128), + z0 = svqadd (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s8.c new file mode 100644 index 000000000..c8b88fa82 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_s8.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qadd_s8_tied1: +** sqadd z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (qadd_s8_tied1, svint8_t, + z0 = svqadd_s8 (z0, z1), + z0 = svqadd (z0, z1)) + +/* +** qadd_s8_tied2: +** sqadd z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (qadd_s8_tied2, svint8_t, + z0 = svqadd_s8 (z1, z0), + z0 = svqadd (z1, z0)) + +/* +** qadd_s8_untied: +** sqadd z0\.b, (z1\.b, z2\.b|z2\.b, z1\.b) +** ret +*/ +TEST_UNIFORM_Z (qadd_s8_untied, svint8_t, + z0 = svqadd_s8 (z1, z2), + z0 = svqadd (z1, z2)) + +/* +** qadd_w0_s8_tied1: +** mov (z[0-9]+\.b), w0 +** sqadd z0\.b, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_s8_tied1, svint8_t, int8_t, + z0 = svqadd_n_s8 (z0, x0), + z0 = svqadd (z0, x0)) + +/* +** qadd_w0_s8_untied: +** mov (z[0-9]+\.b), w0 +** sqadd z0\.b, (z1\.b, \1|\1, z1\.b) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_s8_untied, svint8_t, int8_t, + z0 = svqadd_n_s8 (z1, x0), + z0 = svqadd (z1, x0)) + +/* +** qadd_1_s8_tied1: +** sqadd z0\.b, 
z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_s8_tied1, svint8_t, + z0 = svqadd_n_s8 (z0, 1), + z0 = svqadd (z0, 1)) + +/* +** qadd_1_s8_untied: +** movprfx z0, z1 +** sqadd z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_s8_untied, svint8_t, + z0 = svqadd_n_s8 (z1, 1), + z0 = svqadd (z1, 1)) + +/* +** qadd_127_s8: +** sqadd z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_127_s8, svint8_t, + z0 = svqadd_n_s8 (z0, 127), + z0 = svqadd (z0, 127)) + +/* +** qadd_128_s8: +** sqsub z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_128_s8, svint8_t, + z0 = svqadd_n_s8 (z0, 128), + z0 = svqadd (z0, 128)) + +/* +** qadd_255_s8: +** sqsub z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_255_s8, svint8_t, + z0 = svqadd_n_s8 (z0, 255), + z0 = svqadd (z0, 255)) + +/* +** qadd_m1_s8: +** sqsub z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_m1_s8, svint8_t, + z0 = svqadd_n_s8 (z0, -1), + z0 = svqadd (z0, -1)) + +/* +** qadd_m127_s8: +** sqsub z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_m127_s8, svint8_t, + z0 = svqadd_n_s8 (z0, -127), + z0 = svqadd (z0, -127)) + +/* +** qadd_m128_s8: +** sqsub z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_m128_s8, svint8_t, + z0 = svqadd_n_s8 (z0, -128), + z0 = svqadd (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u16.c new file mode 100644 index 000000000..dd7bc5b6a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u16.c @@ -0,0 +1,126 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qadd_u16_tied1: +** uqadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (qadd_u16_tied1, svuint16_t, + z0 = svqadd_u16 (z0, z1), + z0 = svqadd (z0, z1)) + +/* +** qadd_u16_tied2: +** uqadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (qadd_u16_tied2, svuint16_t, + z0 = svqadd_u16 (z1, z0), + z0 = svqadd (z1, z0)) + +/* +** qadd_u16_untied: +** uqadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (qadd_u16_untied, svuint16_t, + z0 = svqadd_u16 (z1, z2), + z0 = svqadd (z1, z2)) + +/* +** qadd_w0_u16_tied1: +** mov (z[0-9]+\.h), w0 +** uqadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_u16_tied1, svuint16_t, uint16_t, + z0 = svqadd_n_u16 (z0, x0), + z0 = svqadd (z0, x0)) + +/* +** qadd_w0_u16_untied: +** mov (z[0-9]+\.h), w0 +** uqadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_u16_untied, svuint16_t, uint16_t, + z0 = svqadd_n_u16 (z1, x0), + z0 = svqadd (z1, x0)) + +/* +** qadd_1_u16_tied1: +** uqadd z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_u16_tied1, svuint16_t, + z0 = svqadd_n_u16 (z0, 1), + z0 = svqadd (z0, 1)) + +/* +** qadd_1_u16_untied: +** movprfx z0, z1 +** uqadd z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_u16_untied, svuint16_t, + z0 = svqadd_n_u16 (z1, 1), + z0 = svqadd (z1, 1)) + +/* +** qadd_127_u16: +** uqadd z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_127_u16, svuint16_t, + z0 = svqadd_n_u16 (z0, 127), + z0 = svqadd (z0, 127)) + +/* +** qadd_128_u16: +** uqadd z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_128_u16, svuint16_t, + z0 = svqadd_n_u16 (z0, 128), + z0 = svqadd (z0, 128)) + +/* +** qadd_255_u16: +** uqadd z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (qadd_255_u16, svuint16_t, + z0 = svqadd_n_u16 (z0, 255), + z0 = svqadd (z0, 255)) + +/* +** qadd_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** uqadd z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) +** ret +*/ 
+TEST_UNIFORM_Z (qadd_m1_u16, svuint16_t, + z0 = svqadd_n_u16 (z0, -1), + z0 = svqadd (z0, -1)) + +/* +** qadd_m127_u16: +** mov (z[0-9]+\.h), #-127 +** uqadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (qadd_m127_u16, svuint16_t, + z0 = svqadd_n_u16 (z0, -127), + z0 = svqadd (z0, -127)) + +/* +** qadd_m128_u16: +** mov (z[0-9]+\.h), #-128 +** uqadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (qadd_m128_u16, svuint16_t, + z0 = svqadd_n_u16 (z0, -128), + z0 = svqadd (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u32.c new file mode 100644 index 000000000..0f846e44e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u32.c @@ -0,0 +1,126 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qadd_u32_tied1: +** uqadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_u32_tied1, svuint32_t, + z0 = svqadd_u32 (z0, z1), + z0 = svqadd (z0, z1)) + +/* +** qadd_u32_tied2: +** uqadd z0\.s, (z0\.s, z1\.s|z1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_u32_tied2, svuint32_t, + z0 = svqadd_u32 (z1, z0), + z0 = svqadd (z1, z0)) + +/* +** qadd_u32_untied: +** uqadd z0\.s, (z1\.s, z2\.s|z2\.s, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_u32_untied, svuint32_t, + z0 = svqadd_u32 (z1, z2), + z0 = svqadd (z1, z2)) + +/* +** qadd_w0_u32_tied1: +** mov (z[0-9]+\.s), w0 +** uqadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_u32_tied1, svuint32_t, uint32_t, + z0 = svqadd_n_u32 (z0, x0), + z0 = svqadd (z0, x0)) + +/* +** qadd_w0_u32_untied: +** mov (z[0-9]+\.s), w0 +** uqadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_u32_untied, svuint32_t, uint32_t, + z0 = svqadd_n_u32 (z1, x0), + z0 = svqadd (z1, x0)) + +/* +** qadd_1_u32_tied1: +** uqadd z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_u32_tied1, svuint32_t, + z0 = svqadd_n_u32 (z0, 1), + z0 = svqadd (z0, 1)) + +/* +** qadd_1_u32_untied: +** movprfx z0, z1 +** uqadd z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_u32_untied, svuint32_t, + z0 = svqadd_n_u32 (z1, 1), + z0 = svqadd (z1, 1)) + +/* +** qadd_127_u32: +** uqadd z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_127_u32, svuint32_t, + z0 = svqadd_n_u32 (z0, 127), + z0 = svqadd (z0, 127)) + +/* +** qadd_128_u32: +** uqadd z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_128_u32, svuint32_t, + z0 = svqadd_n_u32 (z0, 128), + z0 = svqadd (z0, 128)) + +/* +** qadd_255_u32: +** uqadd z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (qadd_255_u32, svuint32_t, + z0 = svqadd_n_u32 (z0, 255), + z0 = svqadd (z0, 255)) + +/* +** qadd_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** uqadd z0\.s, (z0\.s, \1\.s|\1\.s, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_m1_u32, svuint32_t, + z0 = svqadd_n_u32 (z0, -1), + z0 = svqadd (z0, -1)) + +/* +** qadd_m127_u32: +** mov (z[0-9]+\.s), #-127 +** uqadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_m127_u32, svuint32_t, + z0 = svqadd_n_u32 (z0, -127), + z0 = svqadd (z0, -127)) + +/* +** qadd_m128_u32: +** mov (z[0-9]+\.s), #-128 +** uqadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (qadd_m128_u32, svuint32_t, + z0 = svqadd_n_u32 (z0, -128), + z0 = svqadd (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u64.c new file mode 100644 index 000000000..454fb1d63 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u64.c @@ -0,0 +1,126 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qadd_u64_tied1: +** uqadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_u64_tied1, svuint64_t, + z0 = svqadd_u64 (z0, z1), + z0 = svqadd (z0, z1)) + +/* +** qadd_u64_tied2: +** uqadd z0\.d, (z0\.d, z1\.d|z1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_u64_tied2, svuint64_t, + z0 = svqadd_u64 (z1, z0), + z0 = svqadd (z1, z0)) + +/* +** qadd_u64_untied: +** uqadd z0\.d, (z1\.d, z2\.d|z2\.d, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_u64_untied, svuint64_t, + z0 = svqadd_u64 (z1, z2), + z0 = svqadd (z1, z2)) + +/* +** qadd_x0_u64_tied1: +** mov (z[0-9]+\.d), x0 +** uqadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_ZX (qadd_x0_u64_tied1, svuint64_t, uint64_t, + z0 = svqadd_n_u64 (z0, x0), + z0 = svqadd (z0, x0)) + +/* +** qadd_x0_u64_untied: +** mov (z[0-9]+\.d), x0 +** uqadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_ZX (qadd_x0_u64_untied, svuint64_t, uint64_t, + z0 = svqadd_n_u64 (z1, x0), + z0 = svqadd (z1, x0)) + +/* +** qadd_1_u64_tied1: +** uqadd z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_u64_tied1, svuint64_t, + z0 = svqadd_n_u64 (z0, 1), + z0 = svqadd (z0, 1)) + +/* +** qadd_1_u64_untied: +** movprfx z0, z1 +** uqadd z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_u64_untied, svuint64_t, + z0 = svqadd_n_u64 (z1, 1), + z0 = svqadd (z1, 1)) + +/* +** qadd_127_u64: +** uqadd z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_127_u64, svuint64_t, + z0 = svqadd_n_u64 (z0, 127), + z0 = svqadd (z0, 127)) + +/* +** qadd_128_u64: +** uqadd z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_128_u64, svuint64_t, + z0 = svqadd_n_u64 (z0, 128), + z0 = svqadd (z0, 128)) + +/* +** qadd_255_u64: +** uqadd z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (qadd_255_u64, svuint64_t, + z0 = svqadd_n_u64 (z0, 255), + z0 = svqadd (z0, 255)) + +/* +** qadd_m1_u64: +** mov (z[0-9]+)\.b, #-1 +** uqadd z0\.d, (z0\.d, \1\.d|\1\.d, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_m1_u64, svuint64_t, + z0 = svqadd_n_u64 (z0, -1), + z0 = svqadd (z0, -1)) + +/* +** qadd_m127_u64: +** mov (z[0-9]+\.d), #-127 +** uqadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_m127_u64, svuint64_t, + z0 = svqadd_n_u64 (z0, -127), + z0 = svqadd (z0, -127)) + +/* +** qadd_m128_u64: +** mov (z[0-9]+\.d), #-128 +** uqadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (qadd_m128_u64, svuint64_t, + z0 = svqadd_n_u64 (z0, -128), + z0 = svqadd (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u8.c new file mode 100644 index 000000000..e86b8988c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qadd_u8.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qadd_u8_tied1: +** uqadd z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (qadd_u8_tied1, svuint8_t, + z0 = svqadd_u8 (z0, z1), + z0 = svqadd (z0, z1)) + +/* +** qadd_u8_tied2: +** uqadd z0\.b, (z0\.b, z1\.b|z1\.b, z0\.b) +** ret +*/ +TEST_UNIFORM_Z (qadd_u8_tied2, svuint8_t, + z0 = svqadd_u8 (z1, z0), + z0 = svqadd (z1, z0)) + +/* +** qadd_u8_untied: +** uqadd z0\.b, (z1\.b, z2\.b|z2\.b, z1\.b) +** ret +*/ +TEST_UNIFORM_Z (qadd_u8_untied, svuint8_t, + z0 = svqadd_u8 (z1, z2), + z0 = svqadd (z1, z2)) + +/* +** qadd_w0_u8_tied1: +** mov 
(z[0-9]+\.b), w0 +** uqadd z0\.b, (z0\.b, \1|\1, z0\.b) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_u8_tied1, svuint8_t, uint8_t, + z0 = svqadd_n_u8 (z0, x0), + z0 = svqadd (z0, x0)) + +/* +** qadd_w0_u8_untied: +** mov (z[0-9]+\.b), w0 +** uqadd z0\.b, (z1\.b, \1|\1, z1\.b) +** ret +*/ +TEST_UNIFORM_ZX (qadd_w0_u8_untied, svuint8_t, uint8_t, + z0 = svqadd_n_u8 (z1, x0), + z0 = svqadd (z1, x0)) + +/* +** qadd_1_u8_tied1: +** uqadd z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_u8_tied1, svuint8_t, + z0 = svqadd_n_u8 (z0, 1), + z0 = svqadd (z0, 1)) + +/* +** qadd_1_u8_untied: +** movprfx z0, z1 +** uqadd z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qadd_1_u8_untied, svuint8_t, + z0 = svqadd_n_u8 (z1, 1), + z0 = svqadd (z1, 1)) + +/* +** qadd_127_u8: +** uqadd z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (qadd_127_u8, svuint8_t, + z0 = svqadd_n_u8 (z0, 127), + z0 = svqadd (z0, 127)) + +/* +** qadd_128_u8: +** uqadd z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_128_u8, svuint8_t, + z0 = svqadd_n_u8 (z0, 128), + z0 = svqadd (z0, 128)) + +/* +** qadd_255_u8: +** uqadd z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (qadd_255_u8, svuint8_t, + z0 = svqadd_n_u8 (z0, 255), + z0 = svqadd (z0, 255)) + +/* +** qadd_m1_u8: +** uqadd z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (qadd_m1_u8, svuint8_t, + z0 = svqadd_n_u8 (z0, -1), + z0 = svqadd (z0, -1)) + +/* +** qadd_m127_u8: +** uqadd z0\.b, z0\.b, #129 +** ret +*/ +TEST_UNIFORM_Z (qadd_m127_u8, svuint8_t, + z0 = svqadd_n_u8 (z0, -127), + z0 = svqadd (z0, -127)) + +/* +** qadd_m128_u8: +** uqadd z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (qadd_m128_u8, svuint8_t, + z0 = svqadd_n_u8 (z0, -128), + z0 = svqadd (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s32.c new file mode 100644 index 000000000..22b3afef7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecb_pat_n_1_s32_tied: +** sqdecb x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_1_s32_tied, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 1), + x0 = svqdecb_pat (x0, SV_POW2, 1)) + +/* +** qdecb_pat_n_1_s32_untied: +** mov w0, w1 +** sqdecb x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_1_s32_untied, int32_t, + x0 = svqdecb_pat_n_s32 (x1, SV_POW2, 1), + x0 = svqdecb_pat (x1, SV_POW2, 1)) + +/* +** qdecb_pat_n_2_s32: +** sqdecb x0, w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_2_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 2), + x0 = svqdecb_pat (x0, SV_POW2, 2)) + +/* +** qdecb_pat_n_7_s32: +** sqdecb x0, w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_7_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 7), + x0 = svqdecb_pat (x0, SV_POW2, 7)) + +/* +** qdecb_pat_n_15_s32: +** sqdecb x0, w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_15_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 15), + x0 = svqdecb_pat (x0, SV_POW2, 15)) + +/* +** qdecb_pat_n_16_s32: +** sqdecb x0, w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_16_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_POW2, 16), + x0 = svqdecb_pat (x0, SV_POW2, 16)) + +/* +** qdecb_pat_n_vl1_s32: +** sqdecb x0, w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl1_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL1, 16), + x0 = svqdecb_pat (x0, SV_VL1, 16)) + +/* +** qdecb_pat_n_vl2_s32: +** sqdecb x0, 
w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl2_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL2, 16), + x0 = svqdecb_pat (x0, SV_VL2, 16)) + +/* +** qdecb_pat_n_vl3_s32: +** sqdecb x0, w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl3_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL3, 16), + x0 = svqdecb_pat (x0, SV_VL3, 16)) + +/* +** qdecb_pat_n_vl4_s32: +** sqdecb x0, w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl4_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL4, 16), + x0 = svqdecb_pat (x0, SV_VL4, 16)) + +/* +** qdecb_pat_n_vl5_s32: +** sqdecb x0, w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl5_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL5, 16), + x0 = svqdecb_pat (x0, SV_VL5, 16)) + +/* +** qdecb_pat_n_vl6_s32: +** sqdecb x0, w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl6_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL6, 16), + x0 = svqdecb_pat (x0, SV_VL6, 16)) + +/* +** qdecb_pat_n_vl7_s32: +** sqdecb x0, w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl7_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL7, 16), + x0 = svqdecb_pat (x0, SV_VL7, 16)) + +/* +** qdecb_pat_n_vl8_s32: +** sqdecb x0, w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl8_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL8, 16), + x0 = svqdecb_pat (x0, SV_VL8, 16)) + +/* +** qdecb_pat_n_vl16_s32: +** sqdecb x0, w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl16_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL16, 16), + x0 = svqdecb_pat (x0, SV_VL16, 16)) + +/* +** qdecb_pat_n_vl32_s32: +** sqdecb x0, w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl32_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL32, 16), + x0 = svqdecb_pat (x0, SV_VL32, 16)) + +/* +** qdecb_pat_n_vl64_s32: +** sqdecb x0, w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl64_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL64, 16), + x0 = svqdecb_pat (x0, SV_VL64, 16)) + +/* +** qdecb_pat_n_vl128_s32: +** sqdecb x0, w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl128_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL128, 16), + x0 = svqdecb_pat (x0, SV_VL128, 16)) + +/* +** qdecb_pat_n_vl256_s32: +** sqdecb x0, w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl256_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_VL256, 16), + x0 = svqdecb_pat (x0, SV_VL256, 16)) + +/* +** qdecb_pat_n_mul4_s32: +** sqdecb x0, w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_mul4_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_MUL4, 16), + x0 = svqdecb_pat (x0, SV_MUL4, 16)) + +/* +** qdecb_pat_n_mul3_s32: +** sqdecb x0, w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_mul3_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_MUL3, 16), + x0 = svqdecb_pat (x0, SV_MUL3, 16)) + +/* +** qdecb_pat_n_all_s32: +** sqdecb x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_all_s32, int32_t, + x0 = svqdecb_pat_n_s32 (x0, SV_ALL, 16), + x0 = svqdecb_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s64.c new file mode 100644 index 000000000..1380e6c8e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_s64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecb_pat_n_1_s64_tied: +** sqdecb x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_1_s64_tied, int64_t, + x0 = svqdecb_pat_n_s64 
(x0, SV_POW2, 1), + x0 = svqdecb_pat (x0, SV_POW2, 1)) + +/* +** qdecb_pat_n_1_s64_untied: +** mov x0, x1 +** sqdecb x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_1_s64_untied, int64_t, + x0 = svqdecb_pat_n_s64 (x1, SV_POW2, 1), + x0 = svqdecb_pat (x1, SV_POW2, 1)) + +/* +** qdecb_pat_n_2_s64: +** sqdecb x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_2_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 2), + x0 = svqdecb_pat (x0, SV_POW2, 2)) + +/* +** qdecb_pat_n_7_s64: +** sqdecb x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_7_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 7), + x0 = svqdecb_pat (x0, SV_POW2, 7)) + +/* +** qdecb_pat_n_15_s64: +** sqdecb x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_15_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 15), + x0 = svqdecb_pat (x0, SV_POW2, 15)) + +/* +** qdecb_pat_n_16_s64: +** sqdecb x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_16_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_POW2, 16), + x0 = svqdecb_pat (x0, SV_POW2, 16)) + +/* +** qdecb_pat_n_vl1_s64: +** sqdecb x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl1_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL1, 16), + x0 = svqdecb_pat (x0, SV_VL1, 16)) + +/* +** qdecb_pat_n_vl2_s64: +** sqdecb x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl2_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL2, 16), + x0 = svqdecb_pat (x0, SV_VL2, 16)) + +/* +** qdecb_pat_n_vl3_s64: +** sqdecb x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl3_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL3, 16), + x0 = svqdecb_pat (x0, SV_VL3, 16)) + +/* +** qdecb_pat_n_vl4_s64: +** sqdecb x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl4_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL4, 16), + x0 = svqdecb_pat (x0, SV_VL4, 16)) + +/* +** qdecb_pat_n_vl5_s64: +** sqdecb x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl5_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL5, 16), + x0 = svqdecb_pat (x0, SV_VL5, 16)) + +/* +** qdecb_pat_n_vl6_s64: +** sqdecb x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl6_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL6, 16), + x0 = svqdecb_pat (x0, SV_VL6, 16)) + +/* +** qdecb_pat_n_vl7_s64: +** sqdecb x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl7_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL7, 16), + x0 = svqdecb_pat (x0, SV_VL7, 16)) + +/* +** qdecb_pat_n_vl8_s64: +** sqdecb x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl8_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL8, 16), + x0 = svqdecb_pat (x0, SV_VL8, 16)) + +/* +** qdecb_pat_n_vl16_s64: +** sqdecb x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl16_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL16, 16), + x0 = svqdecb_pat (x0, SV_VL16, 16)) + +/* +** qdecb_pat_n_vl32_s64: +** sqdecb x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl32_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL32, 16), + x0 = svqdecb_pat (x0, SV_VL32, 16)) + +/* +** qdecb_pat_n_vl64_s64: +** sqdecb x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl64_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL64, 16), + x0 = svqdecb_pat (x0, SV_VL64, 16)) + +/* +** qdecb_pat_n_vl128_s64: +** sqdecb x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl128_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL128, 16), + x0 = svqdecb_pat (x0, SV_VL128, 16)) + +/* +** qdecb_pat_n_vl256_s64: +** sqdecb x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S 
(qdecb_pat_n_vl256_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_VL256, 16), + x0 = svqdecb_pat (x0, SV_VL256, 16)) + +/* +** qdecb_pat_n_mul4_s64: +** sqdecb x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_mul4_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_MUL4, 16), + x0 = svqdecb_pat (x0, SV_MUL4, 16)) + +/* +** qdecb_pat_n_mul3_s64: +** sqdecb x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_mul3_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_MUL3, 16), + x0 = svqdecb_pat (x0, SV_MUL3, 16)) + +/* +** qdecb_pat_n_all_s64: +** sqdecb x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_all_s64, int64_t, + x0 = svqdecb_pat_n_s64 (x0, SV_ALL, 16), + x0 = svqdecb_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u32.c new file mode 100644 index 000000000..3db3da866 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecb_pat_n_1_u32_tied: +** uqdecb w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_1_u32_tied, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 1), + x0 = svqdecb_pat (x0, SV_POW2, 1)) + +/* +** qdecb_pat_n_1_u32_untied: +** mov w0, w1 +** uqdecb w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_1_u32_untied, uint32_t, + x0 = svqdecb_pat_n_u32 (x1, SV_POW2, 1), + x0 = svqdecb_pat (x1, SV_POW2, 1)) + +/* +** qdecb_pat_n_2_u32: +** uqdecb w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_2_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 2), + x0 = svqdecb_pat (x0, SV_POW2, 2)) + +/* +** qdecb_pat_n_7_u32: +** uqdecb w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_7_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 7), + x0 = svqdecb_pat (x0, SV_POW2, 7)) + +/* +** qdecb_pat_n_15_u32: +** uqdecb w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_15_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 15), + x0 = svqdecb_pat (x0, SV_POW2, 15)) + +/* +** qdecb_pat_n_16_u32: +** uqdecb w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_16_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_POW2, 16), + x0 = svqdecb_pat (x0, SV_POW2, 16)) + +/* +** qdecb_pat_n_vl1_u32: +** uqdecb w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl1_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL1, 16), + x0 = svqdecb_pat (x0, SV_VL1, 16)) + +/* +** qdecb_pat_n_vl2_u32: +** uqdecb w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl2_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL2, 16), + x0 = svqdecb_pat (x0, SV_VL2, 16)) + +/* +** qdecb_pat_n_vl3_u32: +** uqdecb w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl3_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL3, 16), + x0 = svqdecb_pat (x0, SV_VL3, 16)) + +/* +** qdecb_pat_n_vl4_u32: +** uqdecb w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl4_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL4, 16), + x0 = svqdecb_pat (x0, SV_VL4, 16)) + +/* +** qdecb_pat_n_vl5_u32: +** uqdecb w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl5_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL5, 16), + x0 = svqdecb_pat (x0, SV_VL5, 16)) + +/* +** qdecb_pat_n_vl6_u32: +** uqdecb w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl6_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL6, 16), + x0 = svqdecb_pat (x0, SV_VL6, 16)) + +/* +** qdecb_pat_n_vl7_u32: +** uqdecb w0, vl7, 
mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl7_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL7, 16), + x0 = svqdecb_pat (x0, SV_VL7, 16)) + +/* +** qdecb_pat_n_vl8_u32: +** uqdecb w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl8_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL8, 16), + x0 = svqdecb_pat (x0, SV_VL8, 16)) + +/* +** qdecb_pat_n_vl16_u32: +** uqdecb w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl16_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL16, 16), + x0 = svqdecb_pat (x0, SV_VL16, 16)) + +/* +** qdecb_pat_n_vl32_u32: +** uqdecb w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl32_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL32, 16), + x0 = svqdecb_pat (x0, SV_VL32, 16)) + +/* +** qdecb_pat_n_vl64_u32: +** uqdecb w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl64_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL64, 16), + x0 = svqdecb_pat (x0, SV_VL64, 16)) + +/* +** qdecb_pat_n_vl128_u32: +** uqdecb w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl128_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL128, 16), + x0 = svqdecb_pat (x0, SV_VL128, 16)) + +/* +** qdecb_pat_n_vl256_u32: +** uqdecb w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl256_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_VL256, 16), + x0 = svqdecb_pat (x0, SV_VL256, 16)) + +/* +** qdecb_pat_n_mul4_u32: +** uqdecb w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_mul4_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_MUL4, 16), + x0 = svqdecb_pat (x0, SV_MUL4, 16)) + +/* +** qdecb_pat_n_mul3_u32: +** uqdecb w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_mul3_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_MUL3, 16), + x0 = svqdecb_pat (x0, SV_MUL3, 16)) + +/* +** qdecb_pat_n_all_u32: +** uqdecb w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_all_u32, uint32_t, + x0 = svqdecb_pat_n_u32 (x0, SV_ALL, 16), + x0 = svqdecb_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u64.c new file mode 100644 index 000000000..2f4c3c7aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_pat_u64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecb_pat_n_1_u64_tied: +** uqdecb x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_1_u64_tied, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 1), + x0 = svqdecb_pat (x0, SV_POW2, 1)) + +/* +** qdecb_pat_n_1_u64_untied: +** mov x0, x1 +** uqdecb x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_1_u64_untied, uint64_t, + x0 = svqdecb_pat_n_u64 (x1, SV_POW2, 1), + x0 = svqdecb_pat (x1, SV_POW2, 1)) + +/* +** qdecb_pat_n_2_u64: +** uqdecb x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_2_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 2), + x0 = svqdecb_pat (x0, SV_POW2, 2)) + +/* +** qdecb_pat_n_7_u64: +** uqdecb x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_7_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 7), + x0 = svqdecb_pat (x0, SV_POW2, 7)) + +/* +** qdecb_pat_n_15_u64: +** uqdecb x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_15_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 15), + x0 = svqdecb_pat (x0, SV_POW2, 15)) + +/* +** qdecb_pat_n_16_u64: +** uqdecb x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_16_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_POW2, 16), + x0 = svqdecb_pat 
(x0, SV_POW2, 16)) + +/* +** qdecb_pat_n_vl1_u64: +** uqdecb x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl1_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL1, 16), + x0 = svqdecb_pat (x0, SV_VL1, 16)) + +/* +** qdecb_pat_n_vl2_u64: +** uqdecb x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl2_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL2, 16), + x0 = svqdecb_pat (x0, SV_VL2, 16)) + +/* +** qdecb_pat_n_vl3_u64: +** uqdecb x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl3_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL3, 16), + x0 = svqdecb_pat (x0, SV_VL3, 16)) + +/* +** qdecb_pat_n_vl4_u64: +** uqdecb x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl4_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL4, 16), + x0 = svqdecb_pat (x0, SV_VL4, 16)) + +/* +** qdecb_pat_n_vl5_u64: +** uqdecb x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl5_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL5, 16), + x0 = svqdecb_pat (x0, SV_VL5, 16)) + +/* +** qdecb_pat_n_vl6_u64: +** uqdecb x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl6_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL6, 16), + x0 = svqdecb_pat (x0, SV_VL6, 16)) + +/* +** qdecb_pat_n_vl7_u64: +** uqdecb x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl7_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL7, 16), + x0 = svqdecb_pat (x0, SV_VL7, 16)) + +/* +** qdecb_pat_n_vl8_u64: +** uqdecb x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl8_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL8, 16), + x0 = svqdecb_pat (x0, SV_VL8, 16)) + +/* +** qdecb_pat_n_vl16_u64: +** uqdecb x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl16_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL16, 16), + x0 = svqdecb_pat (x0, SV_VL16, 16)) + +/* +** qdecb_pat_n_vl32_u64: +** uqdecb x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl32_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL32, 16), + x0 = svqdecb_pat (x0, SV_VL32, 16)) + +/* +** qdecb_pat_n_vl64_u64: +** uqdecb x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl64_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL64, 16), + x0 = svqdecb_pat (x0, SV_VL64, 16)) + +/* +** qdecb_pat_n_vl128_u64: +** uqdecb x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl128_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL128, 16), + x0 = svqdecb_pat (x0, SV_VL128, 16)) + +/* +** qdecb_pat_n_vl256_u64: +** uqdecb x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_vl256_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_VL256, 16), + x0 = svqdecb_pat (x0, SV_VL256, 16)) + +/* +** qdecb_pat_n_mul4_u64: +** uqdecb x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_mul4_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_MUL4, 16), + x0 = svqdecb_pat (x0, SV_MUL4, 16)) + +/* +** qdecb_pat_n_mul3_u64: +** uqdecb x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_mul3_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_MUL3, 16), + x0 = svqdecb_pat (x0, SV_MUL3, 16)) + +/* +** qdecb_pat_n_all_u64: +** uqdecb x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_pat_n_all_u64, uint64_t, + x0 = svqdecb_pat_n_u64 (x0, SV_ALL, 16), + x0 = svqdecb_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s32.c new file mode 100644 index 000000000..11180654e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s32.c @@ -0,0 +1,58 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecb_n_1_s32_tied: +** sqdecb x0, w0 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_1_s32_tied, int32_t, + x0 = svqdecb_n_s32 (x0, 1), + x0 = svqdecb (x0, 1)) + +/* +** qdecb_n_1_s32_untied: +** mov w0, w1 +** sqdecb x0, w0 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_1_s32_untied, int32_t, + x0 = svqdecb_n_s32 (x1, 1), + x0 = svqdecb (x1, 1)) + +/* +** qdecb_n_2_s32: +** sqdecb x0, w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_2_s32, int32_t, + x0 = svqdecb_n_s32 (x0, 2), + x0 = svqdecb (x0, 2)) + +/* +** qdecb_n_7_s32: +** sqdecb x0, w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_7_s32, int32_t, + x0 = svqdecb_n_s32 (x0, 7), + x0 = svqdecb (x0, 7)) + +/* +** qdecb_n_15_s32: +** sqdecb x0, w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_15_s32, int32_t, + x0 = svqdecb_n_s32 (x0, 15), + x0 = svqdecb (x0, 15)) + +/* +** qdecb_n_16_s32: +** sqdecb x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_16_s32, int32_t, + x0 = svqdecb_n_s32 (x0, 16), + x0 = svqdecb (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s64.c new file mode 100644 index 000000000..17b765655 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_s64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecb_n_1_s64_tied: +** sqdecb x0 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_1_s64_tied, int64_t, + x0 = svqdecb_n_s64 (x0, 1), + x0 = svqdecb (x0, 1)) + +/* +** qdecb_n_1_s64_untied: +** mov x0, x1 +** sqdecb x0 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_1_s64_untied, int64_t, + x0 = svqdecb_n_s64 (x1, 1), + x0 = svqdecb (x1, 1)) + +/* +** qdecb_n_2_s64: +** sqdecb x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_2_s64, int64_t, + x0 = svqdecb_n_s64 (x0, 2), + x0 = svqdecb (x0, 2)) + +/* +** qdecb_n_7_s64: +** sqdecb x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_7_s64, int64_t, + x0 = svqdecb_n_s64 (x0, 7), + x0 = svqdecb (x0, 7)) + +/* +** qdecb_n_15_s64: +** sqdecb x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_15_s64, int64_t, + x0 = svqdecb_n_s64 (x0, 15), + x0 = svqdecb (x0, 15)) + +/* +** qdecb_n_16_s64: +** sqdecb x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_16_s64, int64_t, + x0 = svqdecb_n_s64 (x0, 16), + x0 = svqdecb (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u32.c new file mode 100644 index 000000000..b31e04de5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecb_n_1_u32_tied: +** uqdecb w0 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_1_u32_tied, uint32_t, + x0 = svqdecb_n_u32 (x0, 1), + x0 = svqdecb (x0, 1)) + +/* +** qdecb_n_1_u32_untied: +** mov w0, w1 +** uqdecb w0 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_1_u32_untied, uint32_t, + x0 = svqdecb_n_u32 (x1, 1), + x0 = svqdecb (x1, 1)) + +/* +** qdecb_n_2_u32: +** uqdecb w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_2_u32, uint32_t, + x0 = svqdecb_n_u32 (x0, 2), + x0 = svqdecb (x0, 2)) + +/* +** qdecb_n_7_u32: +** uqdecb w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_7_u32, uint32_t, + x0 = svqdecb_n_u32 (x0, 7), + x0 = svqdecb (x0, 7)) + +/* +** qdecb_n_15_u32: +** uqdecb w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_15_u32, 
uint32_t, + x0 = svqdecb_n_u32 (x0, 15), + x0 = svqdecb (x0, 15)) + +/* +** qdecb_n_16_u32: +** uqdecb w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_16_u32, uint32_t, + x0 = svqdecb_n_u32 (x0, 16), + x0 = svqdecb (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u64.c new file mode 100644 index 000000000..aab6faba9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecb_u64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecb_n_1_u64_tied: +** uqdecb x0 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_1_u64_tied, uint64_t, + x0 = svqdecb_n_u64 (x0, 1), + x0 = svqdecb (x0, 1)) + +/* +** qdecb_n_1_u64_untied: +** mov x0, x1 +** uqdecb x0 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_1_u64_untied, uint64_t, + x0 = svqdecb_n_u64 (x1, 1), + x0 = svqdecb (x1, 1)) + +/* +** qdecb_n_2_u64: +** uqdecb x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_2_u64, uint64_t, + x0 = svqdecb_n_u64 (x0, 2), + x0 = svqdecb (x0, 2)) + +/* +** qdecb_n_7_u64: +** uqdecb x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_7_u64, uint64_t, + x0 = svqdecb_n_u64 (x0, 7), + x0 = svqdecb (x0, 7)) + +/* +** qdecb_n_15_u64: +** uqdecb x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_15_u64, uint64_t, + x0 = svqdecb_n_u64 (x0, 15), + x0 = svqdecb (x0, 15)) + +/* +** qdecb_n_16_u64: +** uqdecb x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecb_n_16_u64, uint64_t, + x0 = svqdecb_n_u64 (x0, 16), + x0 = svqdecb (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s32.c new file mode 100644 index 000000000..bc491d397 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecd_pat_n_1_s32_tied: +** sqdecd x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_1_s32_tied, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 1), + x0 = svqdecd_pat (x0, SV_POW2, 1)) + +/* +** qdecd_pat_n_1_s32_untied: +** mov w0, w1 +** sqdecd x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_1_s32_untied, int32_t, + x0 = svqdecd_pat_n_s32 (x1, SV_POW2, 1), + x0 = svqdecd_pat (x1, SV_POW2, 1)) + +/* +** qdecd_pat_n_2_s32: +** sqdecd x0, w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_2_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 2), + x0 = svqdecd_pat (x0, SV_POW2, 2)) + +/* +** qdecd_pat_n_7_s32: +** sqdecd x0, w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_7_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 7), + x0 = svqdecd_pat (x0, SV_POW2, 7)) + +/* +** qdecd_pat_n_15_s32: +** sqdecd x0, w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_15_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 15), + x0 = svqdecd_pat (x0, SV_POW2, 15)) + +/* +** qdecd_pat_n_16_s32: +** sqdecd x0, w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_16_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_POW2, 16), + x0 = svqdecd_pat (x0, SV_POW2, 16)) + +/* +** qdecd_pat_n_vl1_s32: +** sqdecd x0, w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl1_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL1, 16), + x0 = svqdecd_pat (x0, SV_VL1, 16)) + +/* +** qdecd_pat_n_vl2_s32: +** sqdecd x0, w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl2_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL2, 
16), + x0 = svqdecd_pat (x0, SV_VL2, 16)) + +/* +** qdecd_pat_n_vl3_s32: +** sqdecd x0, w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl3_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL3, 16), + x0 = svqdecd_pat (x0, SV_VL3, 16)) + +/* +** qdecd_pat_n_vl4_s32: +** sqdecd x0, w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl4_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL4, 16), + x0 = svqdecd_pat (x0, SV_VL4, 16)) + +/* +** qdecd_pat_n_vl5_s32: +** sqdecd x0, w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl5_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL5, 16), + x0 = svqdecd_pat (x0, SV_VL5, 16)) + +/* +** qdecd_pat_n_vl6_s32: +** sqdecd x0, w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl6_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL6, 16), + x0 = svqdecd_pat (x0, SV_VL6, 16)) + +/* +** qdecd_pat_n_vl7_s32: +** sqdecd x0, w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl7_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL7, 16), + x0 = svqdecd_pat (x0, SV_VL7, 16)) + +/* +** qdecd_pat_n_vl8_s32: +** sqdecd x0, w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl8_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL8, 16), + x0 = svqdecd_pat (x0, SV_VL8, 16)) + +/* +** qdecd_pat_n_vl16_s32: +** sqdecd x0, w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl16_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL16, 16), + x0 = svqdecd_pat (x0, SV_VL16, 16)) + +/* +** qdecd_pat_n_vl32_s32: +** sqdecd x0, w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl32_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL32, 16), + x0 = svqdecd_pat (x0, SV_VL32, 16)) + +/* +** qdecd_pat_n_vl64_s32: +** sqdecd x0, w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl64_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL64, 16), + x0 = svqdecd_pat (x0, SV_VL64, 16)) + +/* +** qdecd_pat_n_vl128_s32: +** sqdecd x0, w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl128_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL128, 16), + x0 = svqdecd_pat (x0, SV_VL128, 16)) + +/* +** qdecd_pat_n_vl256_s32: +** sqdecd x0, w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl256_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_VL256, 16), + x0 = svqdecd_pat (x0, SV_VL256, 16)) + +/* +** qdecd_pat_n_mul4_s32: +** sqdecd x0, w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_mul4_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_MUL4, 16), + x0 = svqdecd_pat (x0, SV_MUL4, 16)) + +/* +** qdecd_pat_n_mul3_s32: +** sqdecd x0, w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_mul3_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_MUL3, 16), + x0 = svqdecd_pat (x0, SV_MUL3, 16)) + +/* +** qdecd_pat_n_all_s32: +** sqdecd x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_all_s32, int32_t, + x0 = svqdecd_pat_n_s32 (x0, SV_ALL, 16), + x0 = svqdecd_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s64.c new file mode 100644 index 000000000..3970ff058 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_s64.c @@ -0,0 +1,401 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecd_pat_1_s64_tied: +** sqdecd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_1_s64_tied, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_POW2, 1), + z0 = svqdecd_pat (z0, SV_POW2, 1)) + +/* +** qdecd_pat_1_s64_untied: +** movprfx z0, z1 +** 
sqdecd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_1_s64_untied, svint64_t, + z0 = svqdecd_pat_s64 (z1, SV_POW2, 1), + z0 = svqdecd_pat (z1, SV_POW2, 1)) + +/* +** qdecd_pat_2_s64: +** sqdecd z0\.d, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_2_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_POW2, 2), + z0 = svqdecd_pat (z0, SV_POW2, 2)) + +/* +** qdecd_pat_7_s64: +** sqdecd z0\.d, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_7_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_POW2, 7), + z0 = svqdecd_pat (z0, SV_POW2, 7)) + +/* +** qdecd_pat_15_s64: +** sqdecd z0\.d, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_15_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_POW2, 15), + z0 = svqdecd_pat (z0, SV_POW2, 15)) + +/* +** qdecd_pat_16_s64: +** sqdecd z0\.d, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_16_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_POW2, 16), + z0 = svqdecd_pat (z0, SV_POW2, 16)) + +/* +** qdecd_pat_vl1_s64: +** sqdecd z0\.d, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl1_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL1, 16), + z0 = svqdecd_pat (z0, SV_VL1, 16)) + +/* +** qdecd_pat_vl2_s64: +** sqdecd z0\.d, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl2_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL2, 16), + z0 = svqdecd_pat (z0, SV_VL2, 16)) + +/* +** qdecd_pat_vl3_s64: +** sqdecd z0\.d, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl3_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL3, 16), + z0 = svqdecd_pat (z0, SV_VL3, 16)) + +/* +** qdecd_pat_vl4_s64: +** sqdecd z0\.d, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl4_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL4, 16), + z0 = svqdecd_pat (z0, SV_VL4, 16)) + +/* +** qdecd_pat_vl5_s64: +** sqdecd z0\.d, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl5_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL5, 16), + z0 = svqdecd_pat (z0, SV_VL5, 16)) + +/* +** qdecd_pat_vl6_s64: +** sqdecd z0\.d, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl6_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL6, 16), + z0 = svqdecd_pat (z0, SV_VL6, 16)) + +/* +** qdecd_pat_vl7_s64: +** sqdecd z0\.d, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl7_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL7, 16), + z0 = svqdecd_pat (z0, SV_VL7, 16)) + +/* +** qdecd_pat_vl8_s64: +** sqdecd z0\.d, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl8_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL8, 16), + z0 = svqdecd_pat (z0, SV_VL8, 16)) + +/* +** qdecd_pat_vl16_s64: +** sqdecd z0\.d, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl16_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL16, 16), + z0 = svqdecd_pat (z0, SV_VL16, 16)) + +/* +** qdecd_pat_vl32_s64: +** sqdecd z0\.d, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl32_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL32, 16), + z0 = svqdecd_pat (z0, SV_VL32, 16)) + +/* +** qdecd_pat_vl64_s64: +** sqdecd z0\.d, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl64_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL64, 16), + z0 = svqdecd_pat (z0, SV_VL64, 16)) + +/* +** qdecd_pat_vl128_s64: +** sqdecd z0\.d, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl128_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL128, 16), + z0 = svqdecd_pat (z0, SV_VL128, 16)) + +/* +** qdecd_pat_vl256_s64: +** sqdecd z0\.d, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl256_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_VL256, 16), + z0 = svqdecd_pat (z0, SV_VL256, 16)) + +/* +** 
qdecd_pat_mul4_s64: +** sqdecd z0\.d, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_mul4_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_MUL4, 16), + z0 = svqdecd_pat (z0, SV_MUL4, 16)) + +/* +** qdecd_pat_mul3_s64: +** sqdecd z0\.d, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_mul3_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_MUL3, 16), + z0 = svqdecd_pat (z0, SV_MUL3, 16)) + +/* +** qdecd_pat_all_s64: +** sqdecd z0\.d, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_all_s64, svint64_t, + z0 = svqdecd_pat_s64 (z0, SV_ALL, 16), + z0 = svqdecd_pat (z0, SV_ALL, 16)) + +/* +** qdecd_pat_n_1_s64_tied: +** sqdecd x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_1_s64_tied, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 1), + x0 = svqdecd_pat (x0, SV_POW2, 1)) + +/* +** qdecd_pat_n_1_s64_untied: +** mov x0, x1 +** sqdecd x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_1_s64_untied, int64_t, + x0 = svqdecd_pat_n_s64 (x1, SV_POW2, 1), + x0 = svqdecd_pat (x1, SV_POW2, 1)) + +/* +** qdecd_pat_n_2_s64: +** sqdecd x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_2_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 2), + x0 = svqdecd_pat (x0, SV_POW2, 2)) + +/* +** qdecd_pat_n_7_s64: +** sqdecd x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_7_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 7), + x0 = svqdecd_pat (x0, SV_POW2, 7)) + +/* +** qdecd_pat_n_15_s64: +** sqdecd x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_15_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 15), + x0 = svqdecd_pat (x0, SV_POW2, 15)) + +/* +** qdecd_pat_n_16_s64: +** sqdecd x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_16_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_POW2, 16), + x0 = svqdecd_pat (x0, SV_POW2, 16)) + +/* +** qdecd_pat_n_vl1_s64: +** sqdecd x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl1_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL1, 16), + x0 = svqdecd_pat (x0, SV_VL1, 16)) + +/* +** qdecd_pat_n_vl2_s64: +** sqdecd x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl2_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL2, 16), + x0 = svqdecd_pat (x0, SV_VL2, 16)) + +/* +** qdecd_pat_n_vl3_s64: +** sqdecd x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl3_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL3, 16), + x0 = svqdecd_pat (x0, SV_VL3, 16)) + +/* +** qdecd_pat_n_vl4_s64: +** sqdecd x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl4_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL4, 16), + x0 = svqdecd_pat (x0, SV_VL4, 16)) + +/* +** qdecd_pat_n_vl5_s64: +** sqdecd x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl5_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL5, 16), + x0 = svqdecd_pat (x0, SV_VL5, 16)) + +/* +** qdecd_pat_n_vl6_s64: +** sqdecd x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl6_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL6, 16), + x0 = svqdecd_pat (x0, SV_VL6, 16)) + +/* +** qdecd_pat_n_vl7_s64: +** sqdecd x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl7_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL7, 16), + x0 = svqdecd_pat (x0, SV_VL7, 16)) + +/* +** qdecd_pat_n_vl8_s64: +** sqdecd x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl8_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL8, 16), + x0 = svqdecd_pat (x0, SV_VL8, 16)) + +/* +** qdecd_pat_n_vl16_s64: +** sqdecd x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl16_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL16, 16), + x0 = 
svqdecd_pat (x0, SV_VL16, 16)) + +/* +** qdecd_pat_n_vl32_s64: +** sqdecd x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl32_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL32, 16), + x0 = svqdecd_pat (x0, SV_VL32, 16)) + +/* +** qdecd_pat_n_vl64_s64: +** sqdecd x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl64_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL64, 16), + x0 = svqdecd_pat (x0, SV_VL64, 16)) + +/* +** qdecd_pat_n_vl128_s64: +** sqdecd x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl128_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL128, 16), + x0 = svqdecd_pat (x0, SV_VL128, 16)) + +/* +** qdecd_pat_n_vl256_s64: +** sqdecd x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl256_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_VL256, 16), + x0 = svqdecd_pat (x0, SV_VL256, 16)) + +/* +** qdecd_pat_n_mul4_s64: +** sqdecd x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_mul4_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_MUL4, 16), + x0 = svqdecd_pat (x0, SV_MUL4, 16)) + +/* +** qdecd_pat_n_mul3_s64: +** sqdecd x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_mul3_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_MUL3, 16), + x0 = svqdecd_pat (x0, SV_MUL3, 16)) + +/* +** qdecd_pat_n_all_s64: +** sqdecd x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_all_s64, int64_t, + x0 = svqdecd_pat_n_s64 (x0, SV_ALL, 16), + x0 = svqdecd_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u32.c new file mode 100644 index 000000000..b33e402f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecd_pat_n_1_u32_tied: +** uqdecd w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_1_u32_tied, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 1), + x0 = svqdecd_pat (x0, SV_POW2, 1)) + +/* +** qdecd_pat_n_1_u32_untied: +** mov w0, w1 +** uqdecd w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_1_u32_untied, uint32_t, + x0 = svqdecd_pat_n_u32 (x1, SV_POW2, 1), + x0 = svqdecd_pat (x1, SV_POW2, 1)) + +/* +** qdecd_pat_n_2_u32: +** uqdecd w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_2_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 2), + x0 = svqdecd_pat (x0, SV_POW2, 2)) + +/* +** qdecd_pat_n_7_u32: +** uqdecd w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_7_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 7), + x0 = svqdecd_pat (x0, SV_POW2, 7)) + +/* +** qdecd_pat_n_15_u32: +** uqdecd w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_15_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 15), + x0 = svqdecd_pat (x0, SV_POW2, 15)) + +/* +** qdecd_pat_n_16_u32: +** uqdecd w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_16_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_POW2, 16), + x0 = svqdecd_pat (x0, SV_POW2, 16)) + +/* +** qdecd_pat_n_vl1_u32: +** uqdecd w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl1_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL1, 16), + x0 = svqdecd_pat (x0, SV_VL1, 16)) + +/* +** qdecd_pat_n_vl2_u32: +** uqdecd w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl2_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL2, 16), + x0 = svqdecd_pat (x0, SV_VL2, 16)) + +/* +** qdecd_pat_n_vl3_u32: +** uqdecd w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl3_u32, 
uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL3, 16), + x0 = svqdecd_pat (x0, SV_VL3, 16)) + +/* +** qdecd_pat_n_vl4_u32: +** uqdecd w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl4_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL4, 16), + x0 = svqdecd_pat (x0, SV_VL4, 16)) + +/* +** qdecd_pat_n_vl5_u32: +** uqdecd w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl5_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL5, 16), + x0 = svqdecd_pat (x0, SV_VL5, 16)) + +/* +** qdecd_pat_n_vl6_u32: +** uqdecd w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl6_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL6, 16), + x0 = svqdecd_pat (x0, SV_VL6, 16)) + +/* +** qdecd_pat_n_vl7_u32: +** uqdecd w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl7_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL7, 16), + x0 = svqdecd_pat (x0, SV_VL7, 16)) + +/* +** qdecd_pat_n_vl8_u32: +** uqdecd w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl8_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL8, 16), + x0 = svqdecd_pat (x0, SV_VL8, 16)) + +/* +** qdecd_pat_n_vl16_u32: +** uqdecd w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl16_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL16, 16), + x0 = svqdecd_pat (x0, SV_VL16, 16)) + +/* +** qdecd_pat_n_vl32_u32: +** uqdecd w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl32_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL32, 16), + x0 = svqdecd_pat (x0, SV_VL32, 16)) + +/* +** qdecd_pat_n_vl64_u32: +** uqdecd w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl64_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL64, 16), + x0 = svqdecd_pat (x0, SV_VL64, 16)) + +/* +** qdecd_pat_n_vl128_u32: +** uqdecd w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl128_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL128, 16), + x0 = svqdecd_pat (x0, SV_VL128, 16)) + +/* +** qdecd_pat_n_vl256_u32: +** uqdecd w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl256_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_VL256, 16), + x0 = svqdecd_pat (x0, SV_VL256, 16)) + +/* +** qdecd_pat_n_mul4_u32: +** uqdecd w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_mul4_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_MUL4, 16), + x0 = svqdecd_pat (x0, SV_MUL4, 16)) + +/* +** qdecd_pat_n_mul3_u32: +** uqdecd w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_mul3_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_MUL3, 16), + x0 = svqdecd_pat (x0, SV_MUL3, 16)) + +/* +** qdecd_pat_n_all_u32: +** uqdecd w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_all_u32, uint32_t, + x0 = svqdecd_pat_n_u32 (x0, SV_ALL, 16), + x0 = svqdecd_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u64.c new file mode 100644 index 000000000..f0d1bd357 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_pat_u64.c @@ -0,0 +1,401 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecd_pat_1_u64_tied: +** uqdecd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_1_u64_tied, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_POW2, 1), + z0 = svqdecd_pat (z0, SV_POW2, 1)) + +/* +** qdecd_pat_1_u64_untied: +** movprfx z0, z1 +** uqdecd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_1_u64_untied, svuint64_t, + z0 = svqdecd_pat_u64 (z1, SV_POW2, 1), + z0 = svqdecd_pat (z1, SV_POW2, 1)) + +/* +** qdecd_pat_2_u64: +** 
uqdecd z0\.d, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_2_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_POW2, 2), + z0 = svqdecd_pat (z0, SV_POW2, 2)) + +/* +** qdecd_pat_7_u64: +** uqdecd z0\.d, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_7_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_POW2, 7), + z0 = svqdecd_pat (z0, SV_POW2, 7)) + +/* +** qdecd_pat_15_u64: +** uqdecd z0\.d, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_15_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_POW2, 15), + z0 = svqdecd_pat (z0, SV_POW2, 15)) + +/* +** qdecd_pat_16_u64: +** uqdecd z0\.d, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_16_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_POW2, 16), + z0 = svqdecd_pat (z0, SV_POW2, 16)) + +/* +** qdecd_pat_vl1_u64: +** uqdecd z0\.d, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl1_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL1, 16), + z0 = svqdecd_pat (z0, SV_VL1, 16)) + +/* +** qdecd_pat_vl2_u64: +** uqdecd z0\.d, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl2_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL2, 16), + z0 = svqdecd_pat (z0, SV_VL2, 16)) + +/* +** qdecd_pat_vl3_u64: +** uqdecd z0\.d, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl3_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL3, 16), + z0 = svqdecd_pat (z0, SV_VL3, 16)) + +/* +** qdecd_pat_vl4_u64: +** uqdecd z0\.d, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl4_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL4, 16), + z0 = svqdecd_pat (z0, SV_VL4, 16)) + +/* +** qdecd_pat_vl5_u64: +** uqdecd z0\.d, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl5_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL5, 16), + z0 = svqdecd_pat (z0, SV_VL5, 16)) + +/* +** qdecd_pat_vl6_u64: +** uqdecd z0\.d, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl6_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL6, 16), + z0 = svqdecd_pat (z0, SV_VL6, 16)) + +/* +** qdecd_pat_vl7_u64: +** uqdecd z0\.d, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl7_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL7, 16), + z0 = svqdecd_pat (z0, SV_VL7, 16)) + +/* +** qdecd_pat_vl8_u64: +** uqdecd z0\.d, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl8_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL8, 16), + z0 = svqdecd_pat (z0, SV_VL8, 16)) + +/* +** qdecd_pat_vl16_u64: +** uqdecd z0\.d, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl16_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL16, 16), + z0 = svqdecd_pat (z0, SV_VL16, 16)) + +/* +** qdecd_pat_vl32_u64: +** uqdecd z0\.d, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl32_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL32, 16), + z0 = svqdecd_pat (z0, SV_VL32, 16)) + +/* +** qdecd_pat_vl64_u64: +** uqdecd z0\.d, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl64_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL64, 16), + z0 = svqdecd_pat (z0, SV_VL64, 16)) + +/* +** qdecd_pat_vl128_u64: +** uqdecd z0\.d, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl128_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL128, 16), + z0 = svqdecd_pat (z0, SV_VL128, 16)) + +/* +** qdecd_pat_vl256_u64: +** uqdecd z0\.d, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_vl256_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_VL256, 16), + z0 = svqdecd_pat (z0, SV_VL256, 16)) + +/* +** qdecd_pat_mul4_u64: +** uqdecd z0\.d, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_mul4_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_MUL4, 16), + z0 = svqdecd_pat (z0, 
SV_MUL4, 16)) + +/* +** qdecd_pat_mul3_u64: +** uqdecd z0\.d, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_mul3_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_MUL3, 16), + z0 = svqdecd_pat (z0, SV_MUL3, 16)) + +/* +** qdecd_pat_all_u64: +** uqdecd z0\.d, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_pat_all_u64, svuint64_t, + z0 = svqdecd_pat_u64 (z0, SV_ALL, 16), + z0 = svqdecd_pat (z0, SV_ALL, 16)) + +/* +** qdecd_pat_n_1_u64_tied: +** uqdecd x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_1_u64_tied, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 1), + x0 = svqdecd_pat (x0, SV_POW2, 1)) + +/* +** qdecd_pat_n_1_u64_untied: +** mov x0, x1 +** uqdecd x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_1_u64_untied, uint64_t, + x0 = svqdecd_pat_n_u64 (x1, SV_POW2, 1), + x0 = svqdecd_pat (x1, SV_POW2, 1)) + +/* +** qdecd_pat_n_2_u64: +** uqdecd x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_2_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 2), + x0 = svqdecd_pat (x0, SV_POW2, 2)) + +/* +** qdecd_pat_n_7_u64: +** uqdecd x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_7_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 7), + x0 = svqdecd_pat (x0, SV_POW2, 7)) + +/* +** qdecd_pat_n_15_u64: +** uqdecd x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_15_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 15), + x0 = svqdecd_pat (x0, SV_POW2, 15)) + +/* +** qdecd_pat_n_16_u64: +** uqdecd x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_16_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_POW2, 16), + x0 = svqdecd_pat (x0, SV_POW2, 16)) + +/* +** qdecd_pat_n_vl1_u64: +** uqdecd x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl1_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL1, 16), + x0 = svqdecd_pat (x0, SV_VL1, 16)) + +/* +** qdecd_pat_n_vl2_u64: +** uqdecd x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl2_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL2, 16), + x0 = svqdecd_pat (x0, SV_VL2, 16)) + +/* +** qdecd_pat_n_vl3_u64: +** uqdecd x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl3_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL3, 16), + x0 = svqdecd_pat (x0, SV_VL3, 16)) + +/* +** qdecd_pat_n_vl4_u64: +** uqdecd x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl4_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL4, 16), + x0 = svqdecd_pat (x0, SV_VL4, 16)) + +/* +** qdecd_pat_n_vl5_u64: +** uqdecd x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl5_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL5, 16), + x0 = svqdecd_pat (x0, SV_VL5, 16)) + +/* +** qdecd_pat_n_vl6_u64: +** uqdecd x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl6_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL6, 16), + x0 = svqdecd_pat (x0, SV_VL6, 16)) + +/* +** qdecd_pat_n_vl7_u64: +** uqdecd x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl7_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL7, 16), + x0 = svqdecd_pat (x0, SV_VL7, 16)) + +/* +** qdecd_pat_n_vl8_u64: +** uqdecd x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl8_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL8, 16), + x0 = svqdecd_pat (x0, SV_VL8, 16)) + +/* +** qdecd_pat_n_vl16_u64: +** uqdecd x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl16_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL16, 16), + x0 = svqdecd_pat (x0, SV_VL16, 16)) + +/* +** qdecd_pat_n_vl32_u64: +** uqdecd x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl32_u64, uint64_t, + x0 = 
svqdecd_pat_n_u64 (x0, SV_VL32, 16), + x0 = svqdecd_pat (x0, SV_VL32, 16)) + +/* +** qdecd_pat_n_vl64_u64: +** uqdecd x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl64_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL64, 16), + x0 = svqdecd_pat (x0, SV_VL64, 16)) + +/* +** qdecd_pat_n_vl128_u64: +** uqdecd x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl128_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL128, 16), + x0 = svqdecd_pat (x0, SV_VL128, 16)) + +/* +** qdecd_pat_n_vl256_u64: +** uqdecd x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_vl256_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_VL256, 16), + x0 = svqdecd_pat (x0, SV_VL256, 16)) + +/* +** qdecd_pat_n_mul4_u64: +** uqdecd x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_mul4_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_MUL4, 16), + x0 = svqdecd_pat (x0, SV_MUL4, 16)) + +/* +** qdecd_pat_n_mul3_u64: +** uqdecd x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_mul3_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_MUL3, 16), + x0 = svqdecd_pat (x0, SV_MUL3, 16)) + +/* +** qdecd_pat_n_all_u64: +** uqdecd x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_pat_n_all_u64, uint64_t, + x0 = svqdecd_pat_n_u64 (x0, SV_ALL, 16), + x0 = svqdecd_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s32.c new file mode 100644 index 000000000..1912ed53f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecd_n_1_s32_tied: +** sqdecd x0, w0 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_1_s32_tied, int32_t, + x0 = svqdecd_n_s32 (x0, 1), + x0 = svqdecd (x0, 1)) + +/* +** qdecd_n_1_s32_untied: +** mov w0, w1 +** sqdecd x0, w0 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_1_s32_untied, int32_t, + x0 = svqdecd_n_s32 (x1, 1), + x0 = svqdecd (x1, 1)) + +/* +** qdecd_n_2_s32: +** sqdecd x0, w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_2_s32, int32_t, + x0 = svqdecd_n_s32 (x0, 2), + x0 = svqdecd (x0, 2)) + +/* +** qdecd_n_7_s32: +** sqdecd x0, w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_7_s32, int32_t, + x0 = svqdecd_n_s32 (x0, 7), + x0 = svqdecd (x0, 7)) + +/* +** qdecd_n_15_s32: +** sqdecd x0, w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_15_s32, int32_t, + x0 = svqdecd_n_s32 (x0, 15), + x0 = svqdecd (x0, 15)) + +/* +** qdecd_n_16_s32: +** sqdecd x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_16_s32, int32_t, + x0 = svqdecd_n_s32 (x0, 16), + x0 = svqdecd (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s64.c new file mode 100644 index 000000000..bd113fc66 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_s64.c @@ -0,0 +1,113 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecd_1_s64_tied: +** sqdecd z0\.d +** ret +*/ +TEST_UNIFORM_Z (qdecd_1_s64_tied, svint64_t, + z0 = svqdecd_s64 (z0, 1), + z0 = svqdecd (z0, 1)) + +/* +** qdecd_1_s64_untied: +** movprfx z0, z1 +** sqdecd z0\.d +** ret +*/ +TEST_UNIFORM_Z (qdecd_1_s64_untied, svint64_t, + z0 = svqdecd_s64 (z1, 1), + z0 = svqdecd (z1, 1)) + +/* +** qdecd_2_s64: +** sqdecd z0\.d, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdecd_2_s64, svint64_t, + z0 = svqdecd_s64 (z0, 2), + z0 = svqdecd (z0, 2)) + +/* +** 
qdecd_7_s64: +** sqdecd z0\.d, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdecd_7_s64, svint64_t, + z0 = svqdecd_s64 (z0, 7), + z0 = svqdecd (z0, 7)) + +/* +** qdecd_15_s64: +** sqdecd z0\.d, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdecd_15_s64, svint64_t, + z0 = svqdecd_s64 (z0, 15), + z0 = svqdecd (z0, 15)) + +/* +** qdecd_16_s64: +** sqdecd z0\.d, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_16_s64, svint64_t, + z0 = svqdecd_s64 (z0, 16), + z0 = svqdecd (z0, 16)) + +/* +** qdecd_n_1_s64_tied: +** sqdecd x0 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_1_s64_tied, int64_t, + x0 = svqdecd_n_s64 (x0, 1), + x0 = svqdecd (x0, 1)) + +/* +** qdecd_n_1_s64_untied: +** mov x0, x1 +** sqdecd x0 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_1_s64_untied, int64_t, + x0 = svqdecd_n_s64 (x1, 1), + x0 = svqdecd (x1, 1)) + +/* +** qdecd_n_2_s64: +** sqdecd x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_2_s64, int64_t, + x0 = svqdecd_n_s64 (x0, 2), + x0 = svqdecd (x0, 2)) + +/* +** qdecd_n_7_s64: +** sqdecd x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_7_s64, int64_t, + x0 = svqdecd_n_s64 (x0, 7), + x0 = svqdecd (x0, 7)) + +/* +** qdecd_n_15_s64: +** sqdecd x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_15_s64, int64_t, + x0 = svqdecd_n_s64 (x0, 15), + x0 = svqdecd (x0, 15)) + +/* +** qdecd_n_16_s64: +** sqdecd x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_16_s64, int64_t, + x0 = svqdecd_n_s64 (x0, 16), + x0 = svqdecd (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u32.c new file mode 100644 index 000000000..a672dc215 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecd_n_1_u32_tied: +** uqdecd w0 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_1_u32_tied, uint32_t, + x0 = svqdecd_n_u32 (x0, 1), + x0 = svqdecd (x0, 1)) + +/* +** qdecd_n_1_u32_untied: +** mov w0, w1 +** uqdecd w0 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_1_u32_untied, uint32_t, + x0 = svqdecd_n_u32 (x1, 1), + x0 = svqdecd (x1, 1)) + +/* +** qdecd_n_2_u32: +** uqdecd w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_2_u32, uint32_t, + x0 = svqdecd_n_u32 (x0, 2), + x0 = svqdecd (x0, 2)) + +/* +** qdecd_n_7_u32: +** uqdecd w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_7_u32, uint32_t, + x0 = svqdecd_n_u32 (x0, 7), + x0 = svqdecd (x0, 7)) + +/* +** qdecd_n_15_u32: +** uqdecd w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_15_u32, uint32_t, + x0 = svqdecd_n_u32 (x0, 15), + x0 = svqdecd (x0, 15)) + +/* +** qdecd_n_16_u32: +** uqdecd w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_16_u32, uint32_t, + x0 = svqdecd_n_u32 (x0, 16), + x0 = svqdecd (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u64.c new file mode 100644 index 000000000..fca8868f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecd_u64.c @@ -0,0 +1,113 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecd_1_u64_tied: +** uqdecd z0\.d +** ret +*/ +TEST_UNIFORM_Z (qdecd_1_u64_tied, svuint64_t, + z0 = svqdecd_u64 (z0, 1), + z0 = svqdecd (z0, 1)) + +/* +** qdecd_1_u64_untied: +** movprfx z0, z1 +** uqdecd z0\.d +** ret +*/ +TEST_UNIFORM_Z (qdecd_1_u64_untied, svuint64_t, + z0 = svqdecd_u64 (z1, 1), + z0 = svqdecd (z1, 1)) + +/* +** qdecd_2_u64: +** uqdecd z0\.d, 
all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdecd_2_u64, svuint64_t, + z0 = svqdecd_u64 (z0, 2), + z0 = svqdecd (z0, 2)) + +/* +** qdecd_7_u64: +** uqdecd z0\.d, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdecd_7_u64, svuint64_t, + z0 = svqdecd_u64 (z0, 7), + z0 = svqdecd (z0, 7)) + +/* +** qdecd_15_u64: +** uqdecd z0\.d, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdecd_15_u64, svuint64_t, + z0 = svqdecd_u64 (z0, 15), + z0 = svqdecd (z0, 15)) + +/* +** qdecd_16_u64: +** uqdecd z0\.d, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecd_16_u64, svuint64_t, + z0 = svqdecd_u64 (z0, 16), + z0 = svqdecd (z0, 16)) + +/* +** qdecd_n_1_u64_tied: +** uqdecd x0 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_1_u64_tied, uint64_t, + x0 = svqdecd_n_u64 (x0, 1), + x0 = svqdecd (x0, 1)) + +/* +** qdecd_n_1_u64_untied: +** mov x0, x1 +** uqdecd x0 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_1_u64_untied, uint64_t, + x0 = svqdecd_n_u64 (x1, 1), + x0 = svqdecd (x1, 1)) + +/* +** qdecd_n_2_u64: +** uqdecd x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_2_u64, uint64_t, + x0 = svqdecd_n_u64 (x0, 2), + x0 = svqdecd (x0, 2)) + +/* +** qdecd_n_7_u64: +** uqdecd x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_7_u64, uint64_t, + x0 = svqdecd_n_u64 (x0, 7), + x0 = svqdecd (x0, 7)) + +/* +** qdecd_n_15_u64: +** uqdecd x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_15_u64, uint64_t, + x0 = svqdecd_n_u64 (x0, 15), + x0 = svqdecd (x0, 15)) + +/* +** qdecd_n_16_u64: +** uqdecd x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecd_n_16_u64, uint64_t, + x0 = svqdecd_n_u64 (x0, 16), + x0 = svqdecd (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s16.c new file mode 100644 index 000000000..c084043f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s16.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_pat_1_s16_tied: +** sqdech z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_1_s16_tied, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_POW2, 1), + z0 = svqdech_pat (z0, SV_POW2, 1)) + +/* +** qdech_pat_1_s16_untied: +** movprfx z0, z1 +** sqdech z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_1_s16_untied, svint16_t, + z0 = svqdech_pat_s16 (z1, SV_POW2, 1), + z0 = svqdech_pat (z1, SV_POW2, 1)) + +/* +** qdech_pat_2_s16: +** sqdech z0\.h, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_2_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_POW2, 2), + z0 = svqdech_pat (z0, SV_POW2, 2)) + +/* +** qdech_pat_7_s16: +** sqdech z0\.h, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_7_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_POW2, 7), + z0 = svqdech_pat (z0, SV_POW2, 7)) + +/* +** qdech_pat_15_s16: +** sqdech z0\.h, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_15_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_POW2, 15), + z0 = svqdech_pat (z0, SV_POW2, 15)) + +/* +** qdech_pat_16_s16: +** sqdech z0\.h, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_16_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_POW2, 16), + z0 = svqdech_pat (z0, SV_POW2, 16)) + +/* +** qdech_pat_vl1_s16: +** sqdech z0\.h, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl1_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL1, 16), + z0 = svqdech_pat (z0, SV_VL1, 16)) + +/* +** qdech_pat_vl2_s16: +** sqdech z0\.h, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl2_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL2, 16), + z0 = svqdech_pat (z0, 
SV_VL2, 16)) + +/* +** qdech_pat_vl3_s16: +** sqdech z0\.h, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl3_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL3, 16), + z0 = svqdech_pat (z0, SV_VL3, 16)) + +/* +** qdech_pat_vl4_s16: +** sqdech z0\.h, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl4_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL4, 16), + z0 = svqdech_pat (z0, SV_VL4, 16)) + +/* +** qdech_pat_vl5_s16: +** sqdech z0\.h, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl5_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL5, 16), + z0 = svqdech_pat (z0, SV_VL5, 16)) + +/* +** qdech_pat_vl6_s16: +** sqdech z0\.h, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl6_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL6, 16), + z0 = svqdech_pat (z0, SV_VL6, 16)) + +/* +** qdech_pat_vl7_s16: +** sqdech z0\.h, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl7_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL7, 16), + z0 = svqdech_pat (z0, SV_VL7, 16)) + +/* +** qdech_pat_vl8_s16: +** sqdech z0\.h, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl8_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL8, 16), + z0 = svqdech_pat (z0, SV_VL8, 16)) + +/* +** qdech_pat_vl16_s16: +** sqdech z0\.h, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl16_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL16, 16), + z0 = svqdech_pat (z0, SV_VL16, 16)) + +/* +** qdech_pat_vl32_s16: +** sqdech z0\.h, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl32_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL32, 16), + z0 = svqdech_pat (z0, SV_VL32, 16)) + +/* +** qdech_pat_vl64_s16: +** sqdech z0\.h, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl64_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL64, 16), + z0 = svqdech_pat (z0, SV_VL64, 16)) + +/* +** qdech_pat_vl128_s16: +** sqdech z0\.h, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl128_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL128, 16), + z0 = svqdech_pat (z0, SV_VL128, 16)) + +/* +** qdech_pat_vl256_s16: +** sqdech z0\.h, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl256_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_VL256, 16), + z0 = svqdech_pat (z0, SV_VL256, 16)) + +/* +** qdech_pat_mul4_s16: +** sqdech z0\.h, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_mul4_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_MUL4, 16), + z0 = svqdech_pat (z0, SV_MUL4, 16)) + +/* +** qdech_pat_mul3_s16: +** sqdech z0\.h, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_mul3_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_MUL3, 16), + z0 = svqdech_pat (z0, SV_MUL3, 16)) + +/* +** qdech_pat_all_s16: +** sqdech z0\.h, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_all_s16, svint16_t, + z0 = svqdech_pat_s16 (z0, SV_ALL, 16), + z0 = svqdech_pat (z0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s32.c new file mode 100644 index 000000000..b56306db7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_pat_n_1_s32_tied: +** sqdech x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_1_s32_tied, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_POW2, 1), + x0 = svqdech_pat (x0, SV_POW2, 1)) + +/* +** qdech_pat_n_1_s32_untied: +** mov w0, w1 +** sqdech x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_1_s32_untied, int32_t, + x0 = 
svqdech_pat_n_s32 (x1, SV_POW2, 1), + x0 = svqdech_pat (x1, SV_POW2, 1)) + +/* +** qdech_pat_n_2_s32: +** sqdech x0, w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_2_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_POW2, 2), + x0 = svqdech_pat (x0, SV_POW2, 2)) + +/* +** qdech_pat_n_7_s32: +** sqdech x0, w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_7_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_POW2, 7), + x0 = svqdech_pat (x0, SV_POW2, 7)) + +/* +** qdech_pat_n_15_s32: +** sqdech x0, w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_15_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_POW2, 15), + x0 = svqdech_pat (x0, SV_POW2, 15)) + +/* +** qdech_pat_n_16_s32: +** sqdech x0, w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_16_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_POW2, 16), + x0 = svqdech_pat (x0, SV_POW2, 16)) + +/* +** qdech_pat_n_vl1_s32: +** sqdech x0, w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl1_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL1, 16), + x0 = svqdech_pat (x0, SV_VL1, 16)) + +/* +** qdech_pat_n_vl2_s32: +** sqdech x0, w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl2_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL2, 16), + x0 = svqdech_pat (x0, SV_VL2, 16)) + +/* +** qdech_pat_n_vl3_s32: +** sqdech x0, w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl3_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL3, 16), + x0 = svqdech_pat (x0, SV_VL3, 16)) + +/* +** qdech_pat_n_vl4_s32: +** sqdech x0, w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl4_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL4, 16), + x0 = svqdech_pat (x0, SV_VL4, 16)) + +/* +** qdech_pat_n_vl5_s32: +** sqdech x0, w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl5_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL5, 16), + x0 = svqdech_pat (x0, SV_VL5, 16)) + +/* +** qdech_pat_n_vl6_s32: +** sqdech x0, w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl6_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL6, 16), + x0 = svqdech_pat (x0, SV_VL6, 16)) + +/* +** qdech_pat_n_vl7_s32: +** sqdech x0, w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl7_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL7, 16), + x0 = svqdech_pat (x0, SV_VL7, 16)) + +/* +** qdech_pat_n_vl8_s32: +** sqdech x0, w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl8_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL8, 16), + x0 = svqdech_pat (x0, SV_VL8, 16)) + +/* +** qdech_pat_n_vl16_s32: +** sqdech x0, w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl16_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL16, 16), + x0 = svqdech_pat (x0, SV_VL16, 16)) + +/* +** qdech_pat_n_vl32_s32: +** sqdech x0, w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl32_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL32, 16), + x0 = svqdech_pat (x0, SV_VL32, 16)) + +/* +** qdech_pat_n_vl64_s32: +** sqdech x0, w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl64_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL64, 16), + x0 = svqdech_pat (x0, SV_VL64, 16)) + +/* +** qdech_pat_n_vl128_s32: +** sqdech x0, w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl128_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL128, 16), + x0 = svqdech_pat (x0, SV_VL128, 16)) + +/* +** qdech_pat_n_vl256_s32: +** sqdech x0, w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl256_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_VL256, 16), + x0 = svqdech_pat (x0, SV_VL256, 16)) + +/* +** 
qdech_pat_n_mul4_s32: +** sqdech x0, w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_mul4_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_MUL4, 16), + x0 = svqdech_pat (x0, SV_MUL4, 16)) + +/* +** qdech_pat_n_mul3_s32: +** sqdech x0, w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_mul3_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_MUL3, 16), + x0 = svqdech_pat (x0, SV_MUL3, 16)) + +/* +** qdech_pat_n_all_s32: +** sqdech x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_all_s32, int32_t, + x0 = svqdech_pat_n_s32 (x0, SV_ALL, 16), + x0 = svqdech_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s64.c new file mode 100644 index 000000000..591658f54 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_s64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_pat_n_1_s64_tied: +** sqdech x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_1_s64_tied, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_POW2, 1), + x0 = svqdech_pat (x0, SV_POW2, 1)) + +/* +** qdech_pat_n_1_s64_untied: +** mov x0, x1 +** sqdech x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_1_s64_untied, int64_t, + x0 = svqdech_pat_n_s64 (x1, SV_POW2, 1), + x0 = svqdech_pat (x1, SV_POW2, 1)) + +/* +** qdech_pat_n_2_s64: +** sqdech x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_2_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_POW2, 2), + x0 = svqdech_pat (x0, SV_POW2, 2)) + +/* +** qdech_pat_n_7_s64: +** sqdech x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_7_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_POW2, 7), + x0 = svqdech_pat (x0, SV_POW2, 7)) + +/* +** qdech_pat_n_15_s64: +** sqdech x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_15_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_POW2, 15), + x0 = svqdech_pat (x0, SV_POW2, 15)) + +/* +** qdech_pat_n_16_s64: +** sqdech x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_16_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_POW2, 16), + x0 = svqdech_pat (x0, SV_POW2, 16)) + +/* +** qdech_pat_n_vl1_s64: +** sqdech x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl1_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL1, 16), + x0 = svqdech_pat (x0, SV_VL1, 16)) + +/* +** qdech_pat_n_vl2_s64: +** sqdech x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl2_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL2, 16), + x0 = svqdech_pat (x0, SV_VL2, 16)) + +/* +** qdech_pat_n_vl3_s64: +** sqdech x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl3_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL3, 16), + x0 = svqdech_pat (x0, SV_VL3, 16)) + +/* +** qdech_pat_n_vl4_s64: +** sqdech x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl4_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL4, 16), + x0 = svqdech_pat (x0, SV_VL4, 16)) + +/* +** qdech_pat_n_vl5_s64: +** sqdech x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl5_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL5, 16), + x0 = svqdech_pat (x0, SV_VL5, 16)) + +/* +** qdech_pat_n_vl6_s64: +** sqdech x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl6_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL6, 16), + x0 = svqdech_pat (x0, SV_VL6, 16)) + +/* +** qdech_pat_n_vl7_s64: +** sqdech x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl7_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL7, 16), + x0 = 
svqdech_pat (x0, SV_VL7, 16)) + +/* +** qdech_pat_n_vl8_s64: +** sqdech x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl8_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL8, 16), + x0 = svqdech_pat (x0, SV_VL8, 16)) + +/* +** qdech_pat_n_vl16_s64: +** sqdech x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl16_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL16, 16), + x0 = svqdech_pat (x0, SV_VL16, 16)) + +/* +** qdech_pat_n_vl32_s64: +** sqdech x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl32_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL32, 16), + x0 = svqdech_pat (x0, SV_VL32, 16)) + +/* +** qdech_pat_n_vl64_s64: +** sqdech x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl64_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL64, 16), + x0 = svqdech_pat (x0, SV_VL64, 16)) + +/* +** qdech_pat_n_vl128_s64: +** sqdech x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl128_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL128, 16), + x0 = svqdech_pat (x0, SV_VL128, 16)) + +/* +** qdech_pat_n_vl256_s64: +** sqdech x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl256_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_VL256, 16), + x0 = svqdech_pat (x0, SV_VL256, 16)) + +/* +** qdech_pat_n_mul4_s64: +** sqdech x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_mul4_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_MUL4, 16), + x0 = svqdech_pat (x0, SV_MUL4, 16)) + +/* +** qdech_pat_n_mul3_s64: +** sqdech x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_mul3_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_MUL3, 16), + x0 = svqdech_pat (x0, SV_MUL3, 16)) + +/* +** qdech_pat_n_all_s64: +** sqdech x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_all_s64, int64_t, + x0 = svqdech_pat_n_s64 (x0, SV_ALL, 16), + x0 = svqdech_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u16.c new file mode 100644 index 000000000..ce0b5f3e8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u16.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_pat_1_u16_tied: +** uqdech z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_1_u16_tied, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_POW2, 1), + z0 = svqdech_pat (z0, SV_POW2, 1)) + +/* +** qdech_pat_1_u16_untied: +** movprfx z0, z1 +** uqdech z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_1_u16_untied, svuint16_t, + z0 = svqdech_pat_u16 (z1, SV_POW2, 1), + z0 = svqdech_pat (z1, SV_POW2, 1)) + +/* +** qdech_pat_2_u16: +** uqdech z0\.h, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_2_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_POW2, 2), + z0 = svqdech_pat (z0, SV_POW2, 2)) + +/* +** qdech_pat_7_u16: +** uqdech z0\.h, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_7_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_POW2, 7), + z0 = svqdech_pat (z0, SV_POW2, 7)) + +/* +** qdech_pat_15_u16: +** uqdech z0\.h, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_15_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_POW2, 15), + z0 = svqdech_pat (z0, SV_POW2, 15)) + +/* +** qdech_pat_16_u16: +** uqdech z0\.h, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_16_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_POW2, 16), + z0 = svqdech_pat (z0, SV_POW2, 16)) + +/* +** qdech_pat_vl1_u16: +** uqdech z0\.h, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl1_u16, 
svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL1, 16), + z0 = svqdech_pat (z0, SV_VL1, 16)) + +/* +** qdech_pat_vl2_u16: +** uqdech z0\.h, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl2_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL2, 16), + z0 = svqdech_pat (z0, SV_VL2, 16)) + +/* +** qdech_pat_vl3_u16: +** uqdech z0\.h, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl3_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL3, 16), + z0 = svqdech_pat (z0, SV_VL3, 16)) + +/* +** qdech_pat_vl4_u16: +** uqdech z0\.h, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl4_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL4, 16), + z0 = svqdech_pat (z0, SV_VL4, 16)) + +/* +** qdech_pat_vl5_u16: +** uqdech z0\.h, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl5_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL5, 16), + z0 = svqdech_pat (z0, SV_VL5, 16)) + +/* +** qdech_pat_vl6_u16: +** uqdech z0\.h, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl6_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL6, 16), + z0 = svqdech_pat (z0, SV_VL6, 16)) + +/* +** qdech_pat_vl7_u16: +** uqdech z0\.h, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl7_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL7, 16), + z0 = svqdech_pat (z0, SV_VL7, 16)) + +/* +** qdech_pat_vl8_u16: +** uqdech z0\.h, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl8_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL8, 16), + z0 = svqdech_pat (z0, SV_VL8, 16)) + +/* +** qdech_pat_vl16_u16: +** uqdech z0\.h, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl16_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL16, 16), + z0 = svqdech_pat (z0, SV_VL16, 16)) + +/* +** qdech_pat_vl32_u16: +** uqdech z0\.h, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl32_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL32, 16), + z0 = svqdech_pat (z0, SV_VL32, 16)) + +/* +** qdech_pat_vl64_u16: +** uqdech z0\.h, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl64_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL64, 16), + z0 = svqdech_pat (z0, SV_VL64, 16)) + +/* +** qdech_pat_vl128_u16: +** uqdech z0\.h, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl128_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL128, 16), + z0 = svqdech_pat (z0, SV_VL128, 16)) + +/* +** qdech_pat_vl256_u16: +** uqdech z0\.h, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_vl256_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_VL256, 16), + z0 = svqdech_pat (z0, SV_VL256, 16)) + +/* +** qdech_pat_mul4_u16: +** uqdech z0\.h, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_mul4_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_MUL4, 16), + z0 = svqdech_pat (z0, SV_MUL4, 16)) + +/* +** qdech_pat_mul3_u16: +** uqdech z0\.h, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_mul3_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_MUL3, 16), + z0 = svqdech_pat (z0, SV_MUL3, 16)) + +/* +** qdech_pat_all_u16: +** uqdech z0\.h, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_pat_all_u16, svuint16_t, + z0 = svqdech_pat_u16 (z0, SV_ALL, 16), + z0 = svqdech_pat (z0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u32.c new file mode 100644 index 000000000..177f32ec7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_pat_n_1_u32_tied: +** uqdech w0, pow2 +** ret 
+*/ +TEST_UNIFORM_S (qdech_pat_n_1_u32_tied, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_POW2, 1), + x0 = svqdech_pat (x0, SV_POW2, 1)) + +/* +** qdech_pat_n_1_u32_untied: +** mov w0, w1 +** uqdech w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_1_u32_untied, uint32_t, + x0 = svqdech_pat_n_u32 (x1, SV_POW2, 1), + x0 = svqdech_pat (x1, SV_POW2, 1)) + +/* +** qdech_pat_n_2_u32: +** uqdech w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_2_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_POW2, 2), + x0 = svqdech_pat (x0, SV_POW2, 2)) + +/* +** qdech_pat_n_7_u32: +** uqdech w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_7_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_POW2, 7), + x0 = svqdech_pat (x0, SV_POW2, 7)) + +/* +** qdech_pat_n_15_u32: +** uqdech w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_15_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_POW2, 15), + x0 = svqdech_pat (x0, SV_POW2, 15)) + +/* +** qdech_pat_n_16_u32: +** uqdech w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_16_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_POW2, 16), + x0 = svqdech_pat (x0, SV_POW2, 16)) + +/* +** qdech_pat_n_vl1_u32: +** uqdech w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl1_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL1, 16), + x0 = svqdech_pat (x0, SV_VL1, 16)) + +/* +** qdech_pat_n_vl2_u32: +** uqdech w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl2_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL2, 16), + x0 = svqdech_pat (x0, SV_VL2, 16)) + +/* +** qdech_pat_n_vl3_u32: +** uqdech w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl3_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL3, 16), + x0 = svqdech_pat (x0, SV_VL3, 16)) + +/* +** qdech_pat_n_vl4_u32: +** uqdech w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl4_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL4, 16), + x0 = svqdech_pat (x0, SV_VL4, 16)) + +/* +** qdech_pat_n_vl5_u32: +** uqdech w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl5_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL5, 16), + x0 = svqdech_pat (x0, SV_VL5, 16)) + +/* +** qdech_pat_n_vl6_u32: +** uqdech w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl6_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL6, 16), + x0 = svqdech_pat (x0, SV_VL6, 16)) + +/* +** qdech_pat_n_vl7_u32: +** uqdech w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl7_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL7, 16), + x0 = svqdech_pat (x0, SV_VL7, 16)) + +/* +** qdech_pat_n_vl8_u32: +** uqdech w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl8_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL8, 16), + x0 = svqdech_pat (x0, SV_VL8, 16)) + +/* +** qdech_pat_n_vl16_u32: +** uqdech w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl16_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL16, 16), + x0 = svqdech_pat (x0, SV_VL16, 16)) + +/* +** qdech_pat_n_vl32_u32: +** uqdech w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl32_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL32, 16), + x0 = svqdech_pat (x0, SV_VL32, 16)) + +/* +** qdech_pat_n_vl64_u32: +** uqdech w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl64_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL64, 16), + x0 = svqdech_pat (x0, SV_VL64, 16)) + +/* +** qdech_pat_n_vl128_u32: +** uqdech w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl128_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL128, 16), + x0 = svqdech_pat (x0, SV_VL128, 16)) 
+ +/* +** qdech_pat_n_vl256_u32: +** uqdech w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl256_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_VL256, 16), + x0 = svqdech_pat (x0, SV_VL256, 16)) + +/* +** qdech_pat_n_mul4_u32: +** uqdech w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_mul4_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_MUL4, 16), + x0 = svqdech_pat (x0, SV_MUL4, 16)) + +/* +** qdech_pat_n_mul3_u32: +** uqdech w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_mul3_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_MUL3, 16), + x0 = svqdech_pat (x0, SV_MUL3, 16)) + +/* +** qdech_pat_n_all_u32: +** uqdech w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_all_u32, uint32_t, + x0 = svqdech_pat_n_u32 (x0, SV_ALL, 16), + x0 = svqdech_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u64.c new file mode 100644 index 000000000..7092127f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_pat_u64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_pat_n_1_u64_tied: +** uqdech x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_1_u64_tied, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_POW2, 1), + x0 = svqdech_pat (x0, SV_POW2, 1)) + +/* +** qdech_pat_n_1_u64_untied: +** mov x0, x1 +** uqdech x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_1_u64_untied, uint64_t, + x0 = svqdech_pat_n_u64 (x1, SV_POW2, 1), + x0 = svqdech_pat (x1, SV_POW2, 1)) + +/* +** qdech_pat_n_2_u64: +** uqdech x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_2_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_POW2, 2), + x0 = svqdech_pat (x0, SV_POW2, 2)) + +/* +** qdech_pat_n_7_u64: +** uqdech x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_7_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_POW2, 7), + x0 = svqdech_pat (x0, SV_POW2, 7)) + +/* +** qdech_pat_n_15_u64: +** uqdech x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_15_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_POW2, 15), + x0 = svqdech_pat (x0, SV_POW2, 15)) + +/* +** qdech_pat_n_16_u64: +** uqdech x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_16_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_POW2, 16), + x0 = svqdech_pat (x0, SV_POW2, 16)) + +/* +** qdech_pat_n_vl1_u64: +** uqdech x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl1_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL1, 16), + x0 = svqdech_pat (x0, SV_VL1, 16)) + +/* +** qdech_pat_n_vl2_u64: +** uqdech x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl2_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL2, 16), + x0 = svqdech_pat (x0, SV_VL2, 16)) + +/* +** qdech_pat_n_vl3_u64: +** uqdech x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl3_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL3, 16), + x0 = svqdech_pat (x0, SV_VL3, 16)) + +/* +** qdech_pat_n_vl4_u64: +** uqdech x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl4_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL4, 16), + x0 = svqdech_pat (x0, SV_VL4, 16)) + +/* +** qdech_pat_n_vl5_u64: +** uqdech x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl5_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL5, 16), + x0 = svqdech_pat (x0, SV_VL5, 16)) + +/* +** qdech_pat_n_vl6_u64: +** uqdech x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl6_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, 
SV_VL6, 16), + x0 = svqdech_pat (x0, SV_VL6, 16)) + +/* +** qdech_pat_n_vl7_u64: +** uqdech x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl7_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL7, 16), + x0 = svqdech_pat (x0, SV_VL7, 16)) + +/* +** qdech_pat_n_vl8_u64: +** uqdech x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl8_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL8, 16), + x0 = svqdech_pat (x0, SV_VL8, 16)) + +/* +** qdech_pat_n_vl16_u64: +** uqdech x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl16_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL16, 16), + x0 = svqdech_pat (x0, SV_VL16, 16)) + +/* +** qdech_pat_n_vl32_u64: +** uqdech x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl32_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL32, 16), + x0 = svqdech_pat (x0, SV_VL32, 16)) + +/* +** qdech_pat_n_vl64_u64: +** uqdech x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl64_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL64, 16), + x0 = svqdech_pat (x0, SV_VL64, 16)) + +/* +** qdech_pat_n_vl128_u64: +** uqdech x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl128_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL128, 16), + x0 = svqdech_pat (x0, SV_VL128, 16)) + +/* +** qdech_pat_n_vl256_u64: +** uqdech x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_vl256_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_VL256, 16), + x0 = svqdech_pat (x0, SV_VL256, 16)) + +/* +** qdech_pat_n_mul4_u64: +** uqdech x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_mul4_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_MUL4, 16), + x0 = svqdech_pat (x0, SV_MUL4, 16)) + +/* +** qdech_pat_n_mul3_u64: +** uqdech x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_mul3_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_MUL3, 16), + x0 = svqdech_pat (x0, SV_MUL3, 16)) + +/* +** qdech_pat_n_all_u64: +** uqdech x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_pat_n_all_u64, uint64_t, + x0 = svqdech_pat_n_u64 (x0, SV_ALL, 16), + x0 = svqdech_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s16.c new file mode 100644 index 000000000..2a7a8f7a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s16.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_1_s16_tied: +** sqdech z0\.h +** ret +*/ +TEST_UNIFORM_Z (qdech_1_s16_tied, svint16_t, + z0 = svqdech_s16 (z0, 1), + z0 = svqdech (z0, 1)) + +/* +** qdech_1_s16_untied: +** movprfx z0, z1 +** sqdech z0\.h +** ret +*/ +TEST_UNIFORM_Z (qdech_1_s16_untied, svint16_t, + z0 = svqdech_s16 (z1, 1), + z0 = svqdech (z1, 1)) + +/* +** qdech_2_s16: +** sqdech z0\.h, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdech_2_s16, svint16_t, + z0 = svqdech_s16 (z0, 2), + z0 = svqdech (z0, 2)) + +/* +** qdech_7_s16: +** sqdech z0\.h, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdech_7_s16, svint16_t, + z0 = svqdech_s16 (z0, 7), + z0 = svqdech (z0, 7)) + +/* +** qdech_15_s16: +** sqdech z0\.h, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdech_15_s16, svint16_t, + z0 = svqdech_s16 (z0, 15), + z0 = svqdech (z0, 15)) + +/* +** qdech_16_s16: +** sqdech z0\.h, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_16_s16, svint16_t, + z0 = svqdech_s16 (z0, 16), + z0 = svqdech (z0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s32.c new file mode 100644 index 000000000..7fd57d85a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_n_1_s32_tied: +** sqdech x0, w0 +** ret +*/ +TEST_UNIFORM_S (qdech_n_1_s32_tied, int32_t, + x0 = svqdech_n_s32 (x0, 1), + x0 = svqdech (x0, 1)) + +/* +** qdech_n_1_s32_untied: +** mov w0, w1 +** sqdech x0, w0 +** ret +*/ +TEST_UNIFORM_S (qdech_n_1_s32_untied, int32_t, + x0 = svqdech_n_s32 (x1, 1), + x0 = svqdech (x1, 1)) + +/* +** qdech_n_2_s32: +** sqdech x0, w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdech_n_2_s32, int32_t, + x0 = svqdech_n_s32 (x0, 2), + x0 = svqdech (x0, 2)) + +/* +** qdech_n_7_s32: +** sqdech x0, w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdech_n_7_s32, int32_t, + x0 = svqdech_n_s32 (x0, 7), + x0 = svqdech (x0, 7)) + +/* +** qdech_n_15_s32: +** sqdech x0, w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdech_n_15_s32, int32_t, + x0 = svqdech_n_s32 (x0, 15), + x0 = svqdech (x0, 15)) + +/* +** qdech_n_16_s32: +** sqdech x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_n_16_s32, int32_t, + x0 = svqdech_n_s32 (x0, 16), + x0 = svqdech (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s64.c new file mode 100644 index 000000000..61989f8d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_s64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_n_1_s64_tied: +** sqdech x0 +** ret +*/ +TEST_UNIFORM_S (qdech_n_1_s64_tied, int64_t, + x0 = svqdech_n_s64 (x0, 1), + x0 = svqdech (x0, 1)) + +/* +** qdech_n_1_s64_untied: +** mov x0, x1 +** sqdech x0 +** ret +*/ +TEST_UNIFORM_S (qdech_n_1_s64_untied, int64_t, + x0 = svqdech_n_s64 (x1, 1), + x0 = svqdech (x1, 1)) + +/* +** qdech_n_2_s64: +** sqdech x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdech_n_2_s64, int64_t, + x0 = svqdech_n_s64 (x0, 2), + x0 = svqdech (x0, 2)) + +/* +** qdech_n_7_s64: +** sqdech x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdech_n_7_s64, int64_t, + x0 = svqdech_n_s64 (x0, 7), + x0 = svqdech (x0, 7)) + +/* +** qdech_n_15_s64: +** sqdech x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdech_n_15_s64, int64_t, + x0 = svqdech_n_s64 (x0, 15), + x0 = svqdech (x0, 15)) + +/* +** qdech_n_16_s64: +** sqdech x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_n_16_s64, int64_t, + x0 = svqdech_n_s64 (x0, 16), + x0 = svqdech (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u16.c new file mode 100644 index 000000000..0d6587851 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u16.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_1_u16_tied: +** uqdech z0\.h +** ret +*/ +TEST_UNIFORM_Z (qdech_1_u16_tied, svuint16_t, + z0 = svqdech_u16 (z0, 1), + z0 = svqdech (z0, 1)) + +/* +** qdech_1_u16_untied: +** movprfx z0, z1 +** uqdech z0\.h +** ret +*/ +TEST_UNIFORM_Z (qdech_1_u16_untied, svuint16_t, + z0 = svqdech_u16 (z1, 1), + z0 = svqdech (z1, 1)) + +/* +** qdech_2_u16: +** uqdech z0\.h, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdech_2_u16, svuint16_t, + z0 = svqdech_u16 (z0, 2), + z0 = svqdech (z0, 2)) + +/* +** qdech_7_u16: +** uqdech 
z0\.h, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdech_7_u16, svuint16_t, + z0 = svqdech_u16 (z0, 7), + z0 = svqdech (z0, 7)) + +/* +** qdech_15_u16: +** uqdech z0\.h, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdech_15_u16, svuint16_t, + z0 = svqdech_u16 (z0, 15), + z0 = svqdech (z0, 15)) + +/* +** qdech_16_u16: +** uqdech z0\.h, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdech_16_u16, svuint16_t, + z0 = svqdech_u16 (z0, 16), + z0 = svqdech (z0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u32.c new file mode 100644 index 000000000..179d67953 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_n_1_u32_tied: +** uqdech w0 +** ret +*/ +TEST_UNIFORM_S (qdech_n_1_u32_tied, uint32_t, + x0 = svqdech_n_u32 (x0, 1), + x0 = svqdech (x0, 1)) + +/* +** qdech_n_1_u32_untied: +** mov w0, w1 +** uqdech w0 +** ret +*/ +TEST_UNIFORM_S (qdech_n_1_u32_untied, uint32_t, + x0 = svqdech_n_u32 (x1, 1), + x0 = svqdech (x1, 1)) + +/* +** qdech_n_2_u32: +** uqdech w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdech_n_2_u32, uint32_t, + x0 = svqdech_n_u32 (x0, 2), + x0 = svqdech (x0, 2)) + +/* +** qdech_n_7_u32: +** uqdech w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdech_n_7_u32, uint32_t, + x0 = svqdech_n_u32 (x0, 7), + x0 = svqdech (x0, 7)) + +/* +** qdech_n_15_u32: +** uqdech w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdech_n_15_u32, uint32_t, + x0 = svqdech_n_u32 (x0, 15), + x0 = svqdech (x0, 15)) + +/* +** qdech_n_16_u32: +** uqdech w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_n_16_u32, uint32_t, + x0 = svqdech_n_u32 (x0, 16), + x0 = svqdech (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u64.c new file mode 100644 index 000000000..da2f051af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdech_u64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdech_n_1_u64_tied: +** uqdech x0 +** ret +*/ +TEST_UNIFORM_S (qdech_n_1_u64_tied, uint64_t, + x0 = svqdech_n_u64 (x0, 1), + x0 = svqdech (x0, 1)) + +/* +** qdech_n_1_u64_untied: +** mov x0, x1 +** uqdech x0 +** ret +*/ +TEST_UNIFORM_S (qdech_n_1_u64_untied, uint64_t, + x0 = svqdech_n_u64 (x1, 1), + x0 = svqdech (x1, 1)) + +/* +** qdech_n_2_u64: +** uqdech x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdech_n_2_u64, uint64_t, + x0 = svqdech_n_u64 (x0, 2), + x0 = svqdech (x0, 2)) + +/* +** qdech_n_7_u64: +** uqdech x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdech_n_7_u64, uint64_t, + x0 = svqdech_n_u64 (x0, 7), + x0 = svqdech (x0, 7)) + +/* +** qdech_n_15_u64: +** uqdech x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdech_n_15_u64, uint64_t, + x0 = svqdech_n_u64 (x0, 15), + x0 = svqdech (x0, 15)) + +/* +** qdech_n_16_u64: +** uqdech x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdech_n_16_u64, uint64_t, + x0 = svqdech_n_u64 (x0, 16), + x0 = svqdech (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s16.c new file mode 100644 index 000000000..71b40c152 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s16.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** 
qdecp_s16_tied: +** sqdecp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_s16_tied, svint16_t, + z0 = svqdecp_s16 (z0, p0), + z0 = svqdecp (z0, p0)) + +/* +** qdecp_s16_untied: +** movprfx z0, z1 +** sqdecp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_s16_untied, svint16_t, + z0 = svqdecp_s16 (z1, p0), + z0 = svqdecp (z1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s32.c new file mode 100644 index 000000000..55e4067d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s32.c @@ -0,0 +1,98 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecp_s32_tied: +** sqdecp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_s32_tied, svint32_t, + z0 = svqdecp_s32 (z0, p0), + z0 = svqdecp (z0, p0)) + +/* +** qdecp_s32_untied: +** movprfx z0, z1 +** sqdecp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_s32_untied, svint32_t, + z0 = svqdecp_s32 (z1, p0), + z0 = svqdecp (z1, p0)) + +/* +** qdecp_n_s32_b8_tied: +** sqdecp x0, p0\.b, w0 +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s32_b8_tied, int32_t, + x0 = svqdecp_n_s32_b8 (x0, p0), + x0 = svqdecp_b8 (x0, p0)) + +/* +** qdecp_n_s32_b8_untied: +** mov w0, w1 +** sqdecp x0, p0\.b, w0 +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s32_b8_untied, int32_t, + x0 = svqdecp_n_s32_b8 (x1, p0), + x0 = svqdecp_b8 (x1, p0)) + +/* +** qdecp_n_s32_b16_tied: +** sqdecp x0, p0\.h, w0 +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s32_b16_tied, int32_t, + x0 = svqdecp_n_s32_b16 (x0, p0), + x0 = svqdecp_b16 (x0, p0)) + +/* +** qdecp_n_s32_b16_untied: +** mov w0, w1 +** sqdecp x0, p0\.h, w0 +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s32_b16_untied, int32_t, + x0 = svqdecp_n_s32_b16 (x1, p0), + x0 = svqdecp_b16 (x1, p0)) + +/* +** qdecp_n_s32_b32_tied: +** sqdecp x0, p0\.s, w0 +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s32_b32_tied, int32_t, + x0 = svqdecp_n_s32_b32 (x0, p0), + x0 = svqdecp_b32 (x0, p0)) + +/* +** qdecp_n_s32_b32_untied: +** mov w0, w1 +** sqdecp x0, p0\.s, w0 +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s32_b32_untied, int32_t, + x0 = svqdecp_n_s32_b32 (x1, p0), + x0 = svqdecp_b32 (x1, p0)) + +/* +** qdecp_n_s32_b64_tied: +** sqdecp x0, p0\.d, w0 +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s32_b64_tied, int32_t, + x0 = svqdecp_n_s32_b64 (x0, p0), + x0 = svqdecp_b64 (x0, p0)) + +/* +** qdecp_n_s32_b64_untied: +** mov w0, w1 +** sqdecp x0, p0\.d, w0 +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s32_b64_untied, int32_t, + x0 = svqdecp_n_s32_b64 (x1, p0), + x0 = svqdecp_b64 (x1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s64.c new file mode 100644 index 000000000..9527999c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_s64.c @@ -0,0 +1,98 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecp_s64_tied: +** sqdecp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_s64_tied, svint64_t, + z0 = svqdecp_s64 (z0, p0), + z0 = svqdecp (z0, p0)) + +/* +** qdecp_s64_untied: +** movprfx z0, z1 +** sqdecp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_s64_untied, svint64_t, + z0 = svqdecp_s64 (z1, p0), + z0 = svqdecp (z1, p0)) + +/* +** qdecp_n_s64_b8_tied: +** sqdecp x0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s64_b8_tied, int64_t, + x0 = svqdecp_n_s64_b8 (x0, p0), + x0 = svqdecp_b8 (x0, p0)) + +/* +** qdecp_n_s64_b8_untied: +** mov x0, x1 +** sqdecp x0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s64_b8_untied, 
int64_t, + x0 = svqdecp_n_s64_b8 (x1, p0), + x0 = svqdecp_b8 (x1, p0)) + +/* +** qdecp_n_s64_b16_tied: +** sqdecp x0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s64_b16_tied, int64_t, + x0 = svqdecp_n_s64_b16 (x0, p0), + x0 = svqdecp_b16 (x0, p0)) + +/* +** qdecp_n_s64_b16_untied: +** mov x0, x1 +** sqdecp x0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s64_b16_untied, int64_t, + x0 = svqdecp_n_s64_b16 (x1, p0), + x0 = svqdecp_b16 (x1, p0)) + +/* +** qdecp_n_s64_b32_tied: +** sqdecp x0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s64_b32_tied, int64_t, + x0 = svqdecp_n_s64_b32 (x0, p0), + x0 = svqdecp_b32 (x0, p0)) + +/* +** qdecp_n_s64_b32_untied: +** mov x0, x1 +** sqdecp x0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s64_b32_untied, int64_t, + x0 = svqdecp_n_s64_b32 (x1, p0), + x0 = svqdecp_b32 (x1, p0)) + +/* +** qdecp_n_s64_b64_tied: +** sqdecp x0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s64_b64_tied, int64_t, + x0 = svqdecp_n_s64_b64 (x0, p0), + x0 = svqdecp_b64 (x0, p0)) + +/* +** qdecp_n_s64_b64_untied: +** mov x0, x1 +** sqdecp x0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qdecp_n_s64_b64_untied, int64_t, + x0 = svqdecp_n_s64_b64 (x1, p0), + x0 = svqdecp_b64 (x1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u16.c new file mode 100644 index 000000000..33357ada4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u16.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecp_u16_tied: +** uqdecp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_u16_tied, svuint16_t, + z0 = svqdecp_u16 (z0, p0), + z0 = svqdecp (z0, p0)) + +/* +** qdecp_u16_untied: +** movprfx z0, z1 +** uqdecp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_u16_untied, svuint16_t, + z0 = svqdecp_u16 (z1, p0), + z0 = svqdecp (z1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u32.c new file mode 100644 index 000000000..58e9a642e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u32.c @@ -0,0 +1,98 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecp_u32_tied: +** uqdecp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_u32_tied, svuint32_t, + z0 = svqdecp_u32 (z0, p0), + z0 = svqdecp (z0, p0)) + +/* +** qdecp_u32_untied: +** movprfx z0, z1 +** uqdecp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_u32_untied, svuint32_t, + z0 = svqdecp_u32 (z1, p0), + z0 = svqdecp (z1, p0)) + +/* +** qdecp_n_u32_b8_tied: +** uqdecp w0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u32_b8_tied, uint32_t, + x0 = svqdecp_n_u32_b8 (x0, p0), + x0 = svqdecp_b8 (x0, p0)) + +/* +** qdecp_n_u32_b8_untied: +** mov w0, w1 +** uqdecp w0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u32_b8_untied, uint32_t, + x0 = svqdecp_n_u32_b8 (x1, p0), + x0 = svqdecp_b8 (x1, p0)) + +/* +** qdecp_n_u32_b16_tied: +** uqdecp w0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u32_b16_tied, uint32_t, + x0 = svqdecp_n_u32_b16 (x0, p0), + x0 = svqdecp_b16 (x0, p0)) + +/* +** qdecp_n_u32_b16_untied: +** mov w0, w1 +** uqdecp w0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u32_b16_untied, uint32_t, + x0 = svqdecp_n_u32_b16 (x1, p0), + x0 = svqdecp_b16 (x1, p0)) + +/* +** qdecp_n_u32_b32_tied: +** uqdecp w0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u32_b32_tied, uint32_t, + x0 = svqdecp_n_u32_b32 (x0, p0), + x0 = svqdecp_b32 (x0, p0)) + +/* +** 
qdecp_n_u32_b32_untied: +** mov w0, w1 +** uqdecp w0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u32_b32_untied, uint32_t, + x0 = svqdecp_n_u32_b32 (x1, p0), + x0 = svqdecp_b32 (x1, p0)) + +/* +** qdecp_n_u32_b64_tied: +** uqdecp w0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u32_b64_tied, uint32_t, + x0 = svqdecp_n_u32_b64 (x0, p0), + x0 = svqdecp_b64 (x0, p0)) + +/* +** qdecp_n_u32_b64_untied: +** mov w0, w1 +** uqdecp w0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u32_b64_untied, uint32_t, + x0 = svqdecp_n_u32_b64 (x1, p0), + x0 = svqdecp_b64 (x1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u64.c new file mode 100644 index 000000000..e2091d8ae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecp_u64.c @@ -0,0 +1,98 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecp_u64_tied: +** uqdecp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_u64_tied, svuint64_t, + z0 = svqdecp_u64 (z0, p0), + z0 = svqdecp (z0, p0)) + +/* +** qdecp_u64_untied: +** movprfx z0, z1 +** uqdecp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (qdecp_u64_untied, svuint64_t, + z0 = svqdecp_u64 (z1, p0), + z0 = svqdecp (z1, p0)) + +/* +** qdecp_n_u64_b8_tied: +** uqdecp x0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u64_b8_tied, uint64_t, + x0 = svqdecp_n_u64_b8 (x0, p0), + x0 = svqdecp_b8 (x0, p0)) + +/* +** qdecp_n_u64_b8_untied: +** mov x0, x1 +** uqdecp x0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u64_b8_untied, uint64_t, + x0 = svqdecp_n_u64_b8 (x1, p0), + x0 = svqdecp_b8 (x1, p0)) + +/* +** qdecp_n_u64_b16_tied: +** uqdecp x0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u64_b16_tied, uint64_t, + x0 = svqdecp_n_u64_b16 (x0, p0), + x0 = svqdecp_b16 (x0, p0)) + +/* +** qdecp_n_u64_b16_untied: +** mov x0, x1 +** uqdecp x0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u64_b16_untied, uint64_t, + x0 = svqdecp_n_u64_b16 (x1, p0), + x0 = svqdecp_b16 (x1, p0)) + +/* +** qdecp_n_u64_b32_tied: +** uqdecp x0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u64_b32_tied, uint64_t, + x0 = svqdecp_n_u64_b32 (x0, p0), + x0 = svqdecp_b32 (x0, p0)) + +/* +** qdecp_n_u64_b32_untied: +** mov x0, x1 +** uqdecp x0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u64_b32_untied, uint64_t, + x0 = svqdecp_n_u64_b32 (x1, p0), + x0 = svqdecp_b32 (x1, p0)) + +/* +** qdecp_n_u64_b64_tied: +** uqdecp x0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u64_b64_tied, uint64_t, + x0 = svqdecp_n_u64_b64 (x0, p0), + x0 = svqdecp_b64 (x0, p0)) + +/* +** qdecp_n_u64_b64_untied: +** mov x0, x1 +** uqdecp x0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qdecp_n_u64_b64_untied, uint64_t, + x0 = svqdecp_n_u64_b64 (x1, p0), + x0 = svqdecp_b64 (x1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s32.c new file mode 100644 index 000000000..d80f7be4d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s32.c @@ -0,0 +1,401 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecw_pat_1_s32_tied: +** sqdecw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_1_s32_tied, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_POW2, 1), + z0 = svqdecw_pat (z0, SV_POW2, 1)) + +/* +** qdecw_pat_1_s32_untied: +** movprfx z0, z1 +** sqdecw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_1_s32_untied, svint32_t, + z0 = svqdecw_pat_s32 (z1, SV_POW2, 1), + z0 = svqdecw_pat (z1, 
SV_POW2, 1)) + +/* +** qdecw_pat_2_s32: +** sqdecw z0\.s, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_2_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_POW2, 2), + z0 = svqdecw_pat (z0, SV_POW2, 2)) + +/* +** qdecw_pat_7_s32: +** sqdecw z0\.s, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_7_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_POW2, 7), + z0 = svqdecw_pat (z0, SV_POW2, 7)) + +/* +** qdecw_pat_15_s32: +** sqdecw z0\.s, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_15_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_POW2, 15), + z0 = svqdecw_pat (z0, SV_POW2, 15)) + +/* +** qdecw_pat_16_s32: +** sqdecw z0\.s, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_16_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_POW2, 16), + z0 = svqdecw_pat (z0, SV_POW2, 16)) + +/* +** qdecw_pat_vl1_s32: +** sqdecw z0\.s, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl1_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL1, 16), + z0 = svqdecw_pat (z0, SV_VL1, 16)) + +/* +** qdecw_pat_vl2_s32: +** sqdecw z0\.s, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl2_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL2, 16), + z0 = svqdecw_pat (z0, SV_VL2, 16)) + +/* +** qdecw_pat_vl3_s32: +** sqdecw z0\.s, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl3_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL3, 16), + z0 = svqdecw_pat (z0, SV_VL3, 16)) + +/* +** qdecw_pat_vl4_s32: +** sqdecw z0\.s, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl4_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL4, 16), + z0 = svqdecw_pat (z0, SV_VL4, 16)) + +/* +** qdecw_pat_vl5_s32: +** sqdecw z0\.s, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl5_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL5, 16), + z0 = svqdecw_pat (z0, SV_VL5, 16)) + +/* +** qdecw_pat_vl6_s32: +** sqdecw z0\.s, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl6_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL6, 16), + z0 = svqdecw_pat (z0, SV_VL6, 16)) + +/* +** qdecw_pat_vl7_s32: +** sqdecw z0\.s, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl7_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL7, 16), + z0 = svqdecw_pat (z0, SV_VL7, 16)) + +/* +** qdecw_pat_vl8_s32: +** sqdecw z0\.s, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl8_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL8, 16), + z0 = svqdecw_pat (z0, SV_VL8, 16)) + +/* +** qdecw_pat_vl16_s32: +** sqdecw z0\.s, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl16_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL16, 16), + z0 = svqdecw_pat (z0, SV_VL16, 16)) + +/* +** qdecw_pat_vl32_s32: +** sqdecw z0\.s, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl32_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL32, 16), + z0 = svqdecw_pat (z0, SV_VL32, 16)) + +/* +** qdecw_pat_vl64_s32: +** sqdecw z0\.s, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl64_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL64, 16), + z0 = svqdecw_pat (z0, SV_VL64, 16)) + +/* +** qdecw_pat_vl128_s32: +** sqdecw z0\.s, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl128_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL128, 16), + z0 = svqdecw_pat (z0, SV_VL128, 16)) + +/* +** qdecw_pat_vl256_s32: +** sqdecw z0\.s, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl256_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_VL256, 16), + z0 = svqdecw_pat (z0, SV_VL256, 16)) + +/* +** qdecw_pat_mul4_s32: +** sqdecw z0\.s, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_mul4_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_MUL4, 16), 
+ z0 = svqdecw_pat (z0, SV_MUL4, 16)) + +/* +** qdecw_pat_mul3_s32: +** sqdecw z0\.s, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_mul3_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_MUL3, 16), + z0 = svqdecw_pat (z0, SV_MUL3, 16)) + +/* +** qdecw_pat_all_s32: +** sqdecw z0\.s, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_all_s32, svint32_t, + z0 = svqdecw_pat_s32 (z0, SV_ALL, 16), + z0 = svqdecw_pat (z0, SV_ALL, 16)) + +/* +** qdecw_pat_n_1_s32_tied: +** sqdecw x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_1_s32_tied, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 1), + x0 = svqdecw_pat (x0, SV_POW2, 1)) + +/* +** qdecw_pat_n_1_s32_untied: +** mov w0, w1 +** sqdecw x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_1_s32_untied, int32_t, + x0 = svqdecw_pat_n_s32 (x1, SV_POW2, 1), + x0 = svqdecw_pat (x1, SV_POW2, 1)) + +/* +** qdecw_pat_n_2_s32: +** sqdecw x0, w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_2_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 2), + x0 = svqdecw_pat (x0, SV_POW2, 2)) + +/* +** qdecw_pat_n_7_s32: +** sqdecw x0, w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_7_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 7), + x0 = svqdecw_pat (x0, SV_POW2, 7)) + +/* +** qdecw_pat_n_15_s32: +** sqdecw x0, w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_15_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 15), + x0 = svqdecw_pat (x0, SV_POW2, 15)) + +/* +** qdecw_pat_n_16_s32: +** sqdecw x0, w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_16_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_POW2, 16), + x0 = svqdecw_pat (x0, SV_POW2, 16)) + +/* +** qdecw_pat_n_vl1_s32: +** sqdecw x0, w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl1_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL1, 16), + x0 = svqdecw_pat (x0, SV_VL1, 16)) + +/* +** qdecw_pat_n_vl2_s32: +** sqdecw x0, w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl2_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL2, 16), + x0 = svqdecw_pat (x0, SV_VL2, 16)) + +/* +** qdecw_pat_n_vl3_s32: +** sqdecw x0, w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl3_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL3, 16), + x0 = svqdecw_pat (x0, SV_VL3, 16)) + +/* +** qdecw_pat_n_vl4_s32: +** sqdecw x0, w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl4_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL4, 16), + x0 = svqdecw_pat (x0, SV_VL4, 16)) + +/* +** qdecw_pat_n_vl5_s32: +** sqdecw x0, w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl5_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL5, 16), + x0 = svqdecw_pat (x0, SV_VL5, 16)) + +/* +** qdecw_pat_n_vl6_s32: +** sqdecw x0, w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl6_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL6, 16), + x0 = svqdecw_pat (x0, SV_VL6, 16)) + +/* +** qdecw_pat_n_vl7_s32: +** sqdecw x0, w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl7_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL7, 16), + x0 = svqdecw_pat (x0, SV_VL7, 16)) + +/* +** qdecw_pat_n_vl8_s32: +** sqdecw x0, w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl8_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL8, 16), + x0 = svqdecw_pat (x0, SV_VL8, 16)) + +/* +** qdecw_pat_n_vl16_s32: +** sqdecw x0, w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl16_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL16, 16), + x0 = svqdecw_pat (x0, SV_VL16, 16)) + +/* +** qdecw_pat_n_vl32_s32: +** sqdecw x0, w0, vl32, mul #16 +** ret 
+*/ +TEST_UNIFORM_S (qdecw_pat_n_vl32_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL32, 16), + x0 = svqdecw_pat (x0, SV_VL32, 16)) + +/* +** qdecw_pat_n_vl64_s32: +** sqdecw x0, w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl64_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL64, 16), + x0 = svqdecw_pat (x0, SV_VL64, 16)) + +/* +** qdecw_pat_n_vl128_s32: +** sqdecw x0, w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl128_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL128, 16), + x0 = svqdecw_pat (x0, SV_VL128, 16)) + +/* +** qdecw_pat_n_vl256_s32: +** sqdecw x0, w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl256_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_VL256, 16), + x0 = svqdecw_pat (x0, SV_VL256, 16)) + +/* +** qdecw_pat_n_mul4_s32: +** sqdecw x0, w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_mul4_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_MUL4, 16), + x0 = svqdecw_pat (x0, SV_MUL4, 16)) + +/* +** qdecw_pat_n_mul3_s32: +** sqdecw x0, w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_mul3_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_MUL3, 16), + x0 = svqdecw_pat (x0, SV_MUL3, 16)) + +/* +** qdecw_pat_n_all_s32: +** sqdecw x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_all_s32, int32_t, + x0 = svqdecw_pat_n_s32 (x0, SV_ALL, 16), + x0 = svqdecw_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s64.c new file mode 100644 index 000000000..9c684a7c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_s64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecw_pat_n_1_s64_tied: +** sqdecw x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_1_s64_tied, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 1), + x0 = svqdecw_pat (x0, SV_POW2, 1)) + +/* +** qdecw_pat_n_1_s64_untied: +** mov x0, x1 +** sqdecw x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_1_s64_untied, int64_t, + x0 = svqdecw_pat_n_s64 (x1, SV_POW2, 1), + x0 = svqdecw_pat (x1, SV_POW2, 1)) + +/* +** qdecw_pat_n_2_s64: +** sqdecw x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_2_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 2), + x0 = svqdecw_pat (x0, SV_POW2, 2)) + +/* +** qdecw_pat_n_7_s64: +** sqdecw x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_7_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 7), + x0 = svqdecw_pat (x0, SV_POW2, 7)) + +/* +** qdecw_pat_n_15_s64: +** sqdecw x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_15_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 15), + x0 = svqdecw_pat (x0, SV_POW2, 15)) + +/* +** qdecw_pat_n_16_s64: +** sqdecw x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_16_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_POW2, 16), + x0 = svqdecw_pat (x0, SV_POW2, 16)) + +/* +** qdecw_pat_n_vl1_s64: +** sqdecw x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl1_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL1, 16), + x0 = svqdecw_pat (x0, SV_VL1, 16)) + +/* +** qdecw_pat_n_vl2_s64: +** sqdecw x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl2_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL2, 16), + x0 = svqdecw_pat (x0, SV_VL2, 16)) + +/* +** qdecw_pat_n_vl3_s64: +** sqdecw x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl3_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL3, 16), + x0 = svqdecw_pat (x0, SV_VL3, 
16)) + +/* +** qdecw_pat_n_vl4_s64: +** sqdecw x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl4_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL4, 16), + x0 = svqdecw_pat (x0, SV_VL4, 16)) + +/* +** qdecw_pat_n_vl5_s64: +** sqdecw x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl5_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL5, 16), + x0 = svqdecw_pat (x0, SV_VL5, 16)) + +/* +** qdecw_pat_n_vl6_s64: +** sqdecw x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl6_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL6, 16), + x0 = svqdecw_pat (x0, SV_VL6, 16)) + +/* +** qdecw_pat_n_vl7_s64: +** sqdecw x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl7_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL7, 16), + x0 = svqdecw_pat (x0, SV_VL7, 16)) + +/* +** qdecw_pat_n_vl8_s64: +** sqdecw x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl8_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL8, 16), + x0 = svqdecw_pat (x0, SV_VL8, 16)) + +/* +** qdecw_pat_n_vl16_s64: +** sqdecw x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl16_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL16, 16), + x0 = svqdecw_pat (x0, SV_VL16, 16)) + +/* +** qdecw_pat_n_vl32_s64: +** sqdecw x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl32_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL32, 16), + x0 = svqdecw_pat (x0, SV_VL32, 16)) + +/* +** qdecw_pat_n_vl64_s64: +** sqdecw x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl64_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL64, 16), + x0 = svqdecw_pat (x0, SV_VL64, 16)) + +/* +** qdecw_pat_n_vl128_s64: +** sqdecw x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl128_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL128, 16), + x0 = svqdecw_pat (x0, SV_VL128, 16)) + +/* +** qdecw_pat_n_vl256_s64: +** sqdecw x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl256_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_VL256, 16), + x0 = svqdecw_pat (x0, SV_VL256, 16)) + +/* +** qdecw_pat_n_mul4_s64: +** sqdecw x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_mul4_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_MUL4, 16), + x0 = svqdecw_pat (x0, SV_MUL4, 16)) + +/* +** qdecw_pat_n_mul3_s64: +** sqdecw x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_mul3_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_MUL3, 16), + x0 = svqdecw_pat (x0, SV_MUL3, 16)) + +/* +** qdecw_pat_n_all_s64: +** sqdecw x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_all_s64, int64_t, + x0 = svqdecw_pat_n_s64 (x0, SV_ALL, 16), + x0 = svqdecw_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u32.c new file mode 100644 index 000000000..8d3fcb473 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u32.c @@ -0,0 +1,401 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecw_pat_1_u32_tied: +** uqdecw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_1_u32_tied, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_POW2, 1), + z0 = svqdecw_pat (z0, SV_POW2, 1)) + +/* +** qdecw_pat_1_u32_untied: +** movprfx z0, z1 +** uqdecw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_1_u32_untied, svuint32_t, + z0 = svqdecw_pat_u32 (z1, SV_POW2, 1), + z0 = svqdecw_pat (z1, SV_POW2, 1)) + +/* +** qdecw_pat_2_u32: +** uqdecw z0\.s, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_2_u32, svuint32_t, + z0 = 
svqdecw_pat_u32 (z0, SV_POW2, 2), + z0 = svqdecw_pat (z0, SV_POW2, 2)) + +/* +** qdecw_pat_7_u32: +** uqdecw z0\.s, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_7_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_POW2, 7), + z0 = svqdecw_pat (z0, SV_POW2, 7)) + +/* +** qdecw_pat_15_u32: +** uqdecw z0\.s, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_15_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_POW2, 15), + z0 = svqdecw_pat (z0, SV_POW2, 15)) + +/* +** qdecw_pat_16_u32: +** uqdecw z0\.s, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_16_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_POW2, 16), + z0 = svqdecw_pat (z0, SV_POW2, 16)) + +/* +** qdecw_pat_vl1_u32: +** uqdecw z0\.s, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl1_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL1, 16), + z0 = svqdecw_pat (z0, SV_VL1, 16)) + +/* +** qdecw_pat_vl2_u32: +** uqdecw z0\.s, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl2_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL2, 16), + z0 = svqdecw_pat (z0, SV_VL2, 16)) + +/* +** qdecw_pat_vl3_u32: +** uqdecw z0\.s, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl3_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL3, 16), + z0 = svqdecw_pat (z0, SV_VL3, 16)) + +/* +** qdecw_pat_vl4_u32: +** uqdecw z0\.s, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl4_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL4, 16), + z0 = svqdecw_pat (z0, SV_VL4, 16)) + +/* +** qdecw_pat_vl5_u32: +** uqdecw z0\.s, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl5_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL5, 16), + z0 = svqdecw_pat (z0, SV_VL5, 16)) + +/* +** qdecw_pat_vl6_u32: +** uqdecw z0\.s, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl6_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL6, 16), + z0 = svqdecw_pat (z0, SV_VL6, 16)) + +/* +** qdecw_pat_vl7_u32: +** uqdecw z0\.s, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl7_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL7, 16), + z0 = svqdecw_pat (z0, SV_VL7, 16)) + +/* +** qdecw_pat_vl8_u32: +** uqdecw z0\.s, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl8_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL8, 16), + z0 = svqdecw_pat (z0, SV_VL8, 16)) + +/* +** qdecw_pat_vl16_u32: +** uqdecw z0\.s, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl16_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL16, 16), + z0 = svqdecw_pat (z0, SV_VL16, 16)) + +/* +** qdecw_pat_vl32_u32: +** uqdecw z0\.s, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl32_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL32, 16), + z0 = svqdecw_pat (z0, SV_VL32, 16)) + +/* +** qdecw_pat_vl64_u32: +** uqdecw z0\.s, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl64_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL64, 16), + z0 = svqdecw_pat (z0, SV_VL64, 16)) + +/* +** qdecw_pat_vl128_u32: +** uqdecw z0\.s, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl128_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL128, 16), + z0 = svqdecw_pat (z0, SV_VL128, 16)) + +/* +** qdecw_pat_vl256_u32: +** uqdecw z0\.s, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_vl256_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_VL256, 16), + z0 = svqdecw_pat (z0, SV_VL256, 16)) + +/* +** qdecw_pat_mul4_u32: +** uqdecw z0\.s, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_mul4_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_MUL4, 16), + z0 = svqdecw_pat (z0, SV_MUL4, 16)) + +/* +** qdecw_pat_mul3_u32: +** uqdecw z0\.s, mul3, mul #16 +** ret +*/ 
+TEST_UNIFORM_Z (qdecw_pat_mul3_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_MUL3, 16), + z0 = svqdecw_pat (z0, SV_MUL3, 16)) + +/* +** qdecw_pat_all_u32: +** uqdecw z0\.s, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_pat_all_u32, svuint32_t, + z0 = svqdecw_pat_u32 (z0, SV_ALL, 16), + z0 = svqdecw_pat (z0, SV_ALL, 16)) + +/* +** qdecw_pat_n_1_u32_tied: +** uqdecw w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_1_u32_tied, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 1), + x0 = svqdecw_pat (x0, SV_POW2, 1)) + +/* +** qdecw_pat_n_1_u32_untied: +** mov w0, w1 +** uqdecw w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_1_u32_untied, uint32_t, + x0 = svqdecw_pat_n_u32 (x1, SV_POW2, 1), + x0 = svqdecw_pat (x1, SV_POW2, 1)) + +/* +** qdecw_pat_n_2_u32: +** uqdecw w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_2_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 2), + x0 = svqdecw_pat (x0, SV_POW2, 2)) + +/* +** qdecw_pat_n_7_u32: +** uqdecw w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_7_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 7), + x0 = svqdecw_pat (x0, SV_POW2, 7)) + +/* +** qdecw_pat_n_15_u32: +** uqdecw w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_15_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 15), + x0 = svqdecw_pat (x0, SV_POW2, 15)) + +/* +** qdecw_pat_n_16_u32: +** uqdecw w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_16_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_POW2, 16), + x0 = svqdecw_pat (x0, SV_POW2, 16)) + +/* +** qdecw_pat_n_vl1_u32: +** uqdecw w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl1_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL1, 16), + x0 = svqdecw_pat (x0, SV_VL1, 16)) + +/* +** qdecw_pat_n_vl2_u32: +** uqdecw w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl2_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL2, 16), + x0 = svqdecw_pat (x0, SV_VL2, 16)) + +/* +** qdecw_pat_n_vl3_u32: +** uqdecw w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl3_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL3, 16), + x0 = svqdecw_pat (x0, SV_VL3, 16)) + +/* +** qdecw_pat_n_vl4_u32: +** uqdecw w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl4_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL4, 16), + x0 = svqdecw_pat (x0, SV_VL4, 16)) + +/* +** qdecw_pat_n_vl5_u32: +** uqdecw w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl5_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL5, 16), + x0 = svqdecw_pat (x0, SV_VL5, 16)) + +/* +** qdecw_pat_n_vl6_u32: +** uqdecw w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl6_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL6, 16), + x0 = svqdecw_pat (x0, SV_VL6, 16)) + +/* +** qdecw_pat_n_vl7_u32: +** uqdecw w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl7_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL7, 16), + x0 = svqdecw_pat (x0, SV_VL7, 16)) + +/* +** qdecw_pat_n_vl8_u32: +** uqdecw w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl8_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL8, 16), + x0 = svqdecw_pat (x0, SV_VL8, 16)) + +/* +** qdecw_pat_n_vl16_u32: +** uqdecw w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl16_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL16, 16), + x0 = svqdecw_pat (x0, SV_VL16, 16)) + +/* +** qdecw_pat_n_vl32_u32: +** uqdecw w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl32_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL32, 16), + x0 = svqdecw_pat (x0, SV_VL32, 16)) + +/* +** 
qdecw_pat_n_vl64_u32: +** uqdecw w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl64_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL64, 16), + x0 = svqdecw_pat (x0, SV_VL64, 16)) + +/* +** qdecw_pat_n_vl128_u32: +** uqdecw w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl128_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL128, 16), + x0 = svqdecw_pat (x0, SV_VL128, 16)) + +/* +** qdecw_pat_n_vl256_u32: +** uqdecw w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl256_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_VL256, 16), + x0 = svqdecw_pat (x0, SV_VL256, 16)) + +/* +** qdecw_pat_n_mul4_u32: +** uqdecw w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_mul4_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_MUL4, 16), + x0 = svqdecw_pat (x0, SV_MUL4, 16)) + +/* +** qdecw_pat_n_mul3_u32: +** uqdecw w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_mul3_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_MUL3, 16), + x0 = svqdecw_pat (x0, SV_MUL3, 16)) + +/* +** qdecw_pat_n_all_u32: +** uqdecw w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_all_u32, uint32_t, + x0 = svqdecw_pat_n_u32 (x0, SV_ALL, 16), + x0 = svqdecw_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u64.c new file mode 100644 index 000000000..015775b17 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_pat_u64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecw_pat_n_1_u64_tied: +** uqdecw x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_1_u64_tied, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 1), + x0 = svqdecw_pat (x0, SV_POW2, 1)) + +/* +** qdecw_pat_n_1_u64_untied: +** mov x0, x1 +** uqdecw x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_1_u64_untied, uint64_t, + x0 = svqdecw_pat_n_u64 (x1, SV_POW2, 1), + x0 = svqdecw_pat (x1, SV_POW2, 1)) + +/* +** qdecw_pat_n_2_u64: +** uqdecw x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_2_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 2), + x0 = svqdecw_pat (x0, SV_POW2, 2)) + +/* +** qdecw_pat_n_7_u64: +** uqdecw x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_7_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 7), + x0 = svqdecw_pat (x0, SV_POW2, 7)) + +/* +** qdecw_pat_n_15_u64: +** uqdecw x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_15_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 15), + x0 = svqdecw_pat (x0, SV_POW2, 15)) + +/* +** qdecw_pat_n_16_u64: +** uqdecw x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_16_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_POW2, 16), + x0 = svqdecw_pat (x0, SV_POW2, 16)) + +/* +** qdecw_pat_n_vl1_u64: +** uqdecw x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl1_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL1, 16), + x0 = svqdecw_pat (x0, SV_VL1, 16)) + +/* +** qdecw_pat_n_vl2_u64: +** uqdecw x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl2_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL2, 16), + x0 = svqdecw_pat (x0, SV_VL2, 16)) + +/* +** qdecw_pat_n_vl3_u64: +** uqdecw x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl3_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL3, 16), + x0 = svqdecw_pat (x0, SV_VL3, 16)) + +/* +** qdecw_pat_n_vl4_u64: +** uqdecw x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl4_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, 
SV_VL4, 16), + x0 = svqdecw_pat (x0, SV_VL4, 16)) + +/* +** qdecw_pat_n_vl5_u64: +** uqdecw x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl5_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL5, 16), + x0 = svqdecw_pat (x0, SV_VL5, 16)) + +/* +** qdecw_pat_n_vl6_u64: +** uqdecw x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl6_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL6, 16), + x0 = svqdecw_pat (x0, SV_VL6, 16)) + +/* +** qdecw_pat_n_vl7_u64: +** uqdecw x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl7_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL7, 16), + x0 = svqdecw_pat (x0, SV_VL7, 16)) + +/* +** qdecw_pat_n_vl8_u64: +** uqdecw x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl8_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL8, 16), + x0 = svqdecw_pat (x0, SV_VL8, 16)) + +/* +** qdecw_pat_n_vl16_u64: +** uqdecw x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl16_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL16, 16), + x0 = svqdecw_pat (x0, SV_VL16, 16)) + +/* +** qdecw_pat_n_vl32_u64: +** uqdecw x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl32_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL32, 16), + x0 = svqdecw_pat (x0, SV_VL32, 16)) + +/* +** qdecw_pat_n_vl64_u64: +** uqdecw x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl64_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL64, 16), + x0 = svqdecw_pat (x0, SV_VL64, 16)) + +/* +** qdecw_pat_n_vl128_u64: +** uqdecw x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl128_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL128, 16), + x0 = svqdecw_pat (x0, SV_VL128, 16)) + +/* +** qdecw_pat_n_vl256_u64: +** uqdecw x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_vl256_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_VL256, 16), + x0 = svqdecw_pat (x0, SV_VL256, 16)) + +/* +** qdecw_pat_n_mul4_u64: +** uqdecw x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_mul4_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_MUL4, 16), + x0 = svqdecw_pat (x0, SV_MUL4, 16)) + +/* +** qdecw_pat_n_mul3_u64: +** uqdecw x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_mul3_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_MUL3, 16), + x0 = svqdecw_pat (x0, SV_MUL3, 16)) + +/* +** qdecw_pat_n_all_u64: +** uqdecw x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_pat_n_all_u64, uint64_t, + x0 = svqdecw_pat_n_u64 (x0, SV_ALL, 16), + x0 = svqdecw_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s32.c new file mode 100644 index 000000000..8dfe8a177 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s32.c @@ -0,0 +1,113 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecw_1_s32_tied: +** sqdecw z0\.s +** ret +*/ +TEST_UNIFORM_Z (qdecw_1_s32_tied, svint32_t, + z0 = svqdecw_s32 (z0, 1), + z0 = svqdecw (z0, 1)) + +/* +** qdecw_1_s32_untied: +** movprfx z0, z1 +** sqdecw z0\.s +** ret +*/ +TEST_UNIFORM_Z (qdecw_1_s32_untied, svint32_t, + z0 = svqdecw_s32 (z1, 1), + z0 = svqdecw (z1, 1)) + +/* +** qdecw_2_s32: +** sqdecw z0\.s, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdecw_2_s32, svint32_t, + z0 = svqdecw_s32 (z0, 2), + z0 = svqdecw (z0, 2)) + +/* +** qdecw_7_s32: +** sqdecw z0\.s, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qdecw_7_s32, svint32_t, + z0 = svqdecw_s32 (z0, 7), + z0 = svqdecw (z0, 7)) + +/* +** qdecw_15_s32: +** sqdecw z0\.s, all, 
mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdecw_15_s32, svint32_t, + z0 = svqdecw_s32 (z0, 15), + z0 = svqdecw (z0, 15)) + +/* +** qdecw_16_s32: +** sqdecw z0\.s, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_16_s32, svint32_t, + z0 = svqdecw_s32 (z0, 16), + z0 = svqdecw (z0, 16)) + +/* +** qdecw_n_1_s32_tied: +** sqdecw x0, w0 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_1_s32_tied, int32_t, + x0 = svqdecw_n_s32 (x0, 1), + x0 = svqdecw (x0, 1)) + +/* +** qdecw_n_1_s32_untied: +** mov w0, w1 +** sqdecw x0, w0 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_1_s32_untied, int32_t, + x0 = svqdecw_n_s32 (x1, 1), + x0 = svqdecw (x1, 1)) + +/* +** qdecw_n_2_s32: +** sqdecw x0, w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_2_s32, int32_t, + x0 = svqdecw_n_s32 (x0, 2), + x0 = svqdecw (x0, 2)) + +/* +** qdecw_n_7_s32: +** sqdecw x0, w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_7_s32, int32_t, + x0 = svqdecw_n_s32 (x0, 7), + x0 = svqdecw (x0, 7)) + +/* +** qdecw_n_15_s32: +** sqdecw x0, w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_15_s32, int32_t, + x0 = svqdecw_n_s32 (x0, 15), + x0 = svqdecw (x0, 15)) + +/* +** qdecw_n_16_s32: +** sqdecw x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_16_s32, int32_t, + x0 = svqdecw_n_s32 (x0, 16), + x0 = svqdecw (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s64.c new file mode 100644 index 000000000..b0841a8b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_s64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecw_n_1_s64_tied: +** sqdecw x0 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_1_s64_tied, int64_t, + x0 = svqdecw_n_s64 (x0, 1), + x0 = svqdecw (x0, 1)) + +/* +** qdecw_n_1_s64_untied: +** mov x0, x1 +** sqdecw x0 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_1_s64_untied, int64_t, + x0 = svqdecw_n_s64 (x1, 1), + x0 = svqdecw (x1, 1)) + +/* +** qdecw_n_2_s64: +** sqdecw x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_2_s64, int64_t, + x0 = svqdecw_n_s64 (x0, 2), + x0 = svqdecw (x0, 2)) + +/* +** qdecw_n_7_s64: +** sqdecw x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_7_s64, int64_t, + x0 = svqdecw_n_s64 (x0, 7), + x0 = svqdecw (x0, 7)) + +/* +** qdecw_n_15_s64: +** sqdecw x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_15_s64, int64_t, + x0 = svqdecw_n_s64 (x0, 15), + x0 = svqdecw (x0, 15)) + +/* +** qdecw_n_16_s64: +** sqdecw x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_16_s64, int64_t, + x0 = svqdecw_n_s64 (x0, 16), + x0 = svqdecw (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u32.c new file mode 100644 index 000000000..22e8a8d69 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u32.c @@ -0,0 +1,113 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecw_1_u32_tied: +** uqdecw z0\.s +** ret +*/ +TEST_UNIFORM_Z (qdecw_1_u32_tied, svuint32_t, + z0 = svqdecw_u32 (z0, 1), + z0 = svqdecw (z0, 1)) + +/* +** qdecw_1_u32_untied: +** movprfx z0, z1 +** uqdecw z0\.s +** ret +*/ +TEST_UNIFORM_Z (qdecw_1_u32_untied, svuint32_t, + z0 = svqdecw_u32 (z1, 1), + z0 = svqdecw (z1, 1)) + +/* +** qdecw_2_u32: +** uqdecw z0\.s, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qdecw_2_u32, svuint32_t, + z0 = svqdecw_u32 (z0, 2), + z0 = svqdecw (z0, 2)) + +/* +** qdecw_7_u32: +** uqdecw z0\.s, all, mul #7 +** 
ret +*/ +TEST_UNIFORM_Z (qdecw_7_u32, svuint32_t, + z0 = svqdecw_u32 (z0, 7), + z0 = svqdecw (z0, 7)) + +/* +** qdecw_15_u32: +** uqdecw z0\.s, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qdecw_15_u32, svuint32_t, + z0 = svqdecw_u32 (z0, 15), + z0 = svqdecw (z0, 15)) + +/* +** qdecw_16_u32: +** uqdecw z0\.s, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qdecw_16_u32, svuint32_t, + z0 = svqdecw_u32 (z0, 16), + z0 = svqdecw (z0, 16)) + +/* +** qdecw_n_1_u32_tied: +** uqdecw w0 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_1_u32_tied, uint32_t, + x0 = svqdecw_n_u32 (x0, 1), + x0 = svqdecw (x0, 1)) + +/* +** qdecw_n_1_u32_untied: +** mov w0, w1 +** uqdecw w0 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_1_u32_untied, uint32_t, + x0 = svqdecw_n_u32 (x1, 1), + x0 = svqdecw (x1, 1)) + +/* +** qdecw_n_2_u32: +** uqdecw w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_2_u32, uint32_t, + x0 = svqdecw_n_u32 (x0, 2), + x0 = svqdecw (x0, 2)) + +/* +** qdecw_n_7_u32: +** uqdecw w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_7_u32, uint32_t, + x0 = svqdecw_n_u32 (x0, 7), + x0 = svqdecw (x0, 7)) + +/* +** qdecw_n_15_u32: +** uqdecw w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_15_u32, uint32_t, + x0 = svqdecw_n_u32 (x0, 15), + x0 = svqdecw (x0, 15)) + +/* +** qdecw_n_16_u32: +** uqdecw w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_16_u32, uint32_t, + x0 = svqdecw_n_u32 (x0, 16), + x0 = svqdecw (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u64.c new file mode 100644 index 000000000..88c484e8b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qdecw_u64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qdecw_n_1_u64_tied: +** uqdecw x0 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_1_u64_tied, uint64_t, + x0 = svqdecw_n_u64 (x0, 1), + x0 = svqdecw (x0, 1)) + +/* +** qdecw_n_1_u64_untied: +** mov x0, x1 +** uqdecw x0 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_1_u64_untied, uint64_t, + x0 = svqdecw_n_u64 (x1, 1), + x0 = svqdecw (x1, 1)) + +/* +** qdecw_n_2_u64: +** uqdecw x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_2_u64, uint64_t, + x0 = svqdecw_n_u64 (x0, 2), + x0 = svqdecw (x0, 2)) + +/* +** qdecw_n_7_u64: +** uqdecw x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_7_u64, uint64_t, + x0 = svqdecw_n_u64 (x0, 7), + x0 = svqdecw (x0, 7)) + +/* +** qdecw_n_15_u64: +** uqdecw x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_15_u64, uint64_t, + x0 = svqdecw_n_u64 (x0, 15), + x0 = svqdecw (x0, 15)) + +/* +** qdecw_n_16_u64: +** uqdecw x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qdecw_n_16_u64, uint64_t, + x0 = svqdecw_n_u64 (x0, 16), + x0 = svqdecw (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s32.c new file mode 100644 index 000000000..16a8d8e9a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincb_pat_n_1_s32_tied: +** sqincb x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_1_s32_tied, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_POW2, 1), + x0 = svqincb_pat (x0, SV_POW2, 1)) + +/* +** qincb_pat_n_1_s32_untied: +** mov w0, w1 +** sqincb x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_1_s32_untied, int32_t, + x0 = svqincb_pat_n_s32 (x1, SV_POW2, 1), + x0 = svqincb_pat 
(x1, SV_POW2, 1)) + +/* +** qincb_pat_n_2_s32: +** sqincb x0, w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_2_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_POW2, 2), + x0 = svqincb_pat (x0, SV_POW2, 2)) + +/* +** qincb_pat_n_7_s32: +** sqincb x0, w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_7_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_POW2, 7), + x0 = svqincb_pat (x0, SV_POW2, 7)) + +/* +** qincb_pat_n_15_s32: +** sqincb x0, w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_15_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_POW2, 15), + x0 = svqincb_pat (x0, SV_POW2, 15)) + +/* +** qincb_pat_n_16_s32: +** sqincb x0, w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_16_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_POW2, 16), + x0 = svqincb_pat (x0, SV_POW2, 16)) + +/* +** qincb_pat_n_vl1_s32: +** sqincb x0, w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl1_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL1, 16), + x0 = svqincb_pat (x0, SV_VL1, 16)) + +/* +** qincb_pat_n_vl2_s32: +** sqincb x0, w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl2_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL2, 16), + x0 = svqincb_pat (x0, SV_VL2, 16)) + +/* +** qincb_pat_n_vl3_s32: +** sqincb x0, w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl3_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL3, 16), + x0 = svqincb_pat (x0, SV_VL3, 16)) + +/* +** qincb_pat_n_vl4_s32: +** sqincb x0, w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl4_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL4, 16), + x0 = svqincb_pat (x0, SV_VL4, 16)) + +/* +** qincb_pat_n_vl5_s32: +** sqincb x0, w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl5_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL5, 16), + x0 = svqincb_pat (x0, SV_VL5, 16)) + +/* +** qincb_pat_n_vl6_s32: +** sqincb x0, w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl6_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL6, 16), + x0 = svqincb_pat (x0, SV_VL6, 16)) + +/* +** qincb_pat_n_vl7_s32: +** sqincb x0, w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl7_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL7, 16), + x0 = svqincb_pat (x0, SV_VL7, 16)) + +/* +** qincb_pat_n_vl8_s32: +** sqincb x0, w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl8_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL8, 16), + x0 = svqincb_pat (x0, SV_VL8, 16)) + +/* +** qincb_pat_n_vl16_s32: +** sqincb x0, w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl16_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL16, 16), + x0 = svqincb_pat (x0, SV_VL16, 16)) + +/* +** qincb_pat_n_vl32_s32: +** sqincb x0, w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl32_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL32, 16), + x0 = svqincb_pat (x0, SV_VL32, 16)) + +/* +** qincb_pat_n_vl64_s32: +** sqincb x0, w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl64_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL64, 16), + x0 = svqincb_pat (x0, SV_VL64, 16)) + +/* +** qincb_pat_n_vl128_s32: +** sqincb x0, w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl128_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL128, 16), + x0 = svqincb_pat (x0, SV_VL128, 16)) + +/* +** qincb_pat_n_vl256_s32: +** sqincb x0, w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl256_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_VL256, 16), + x0 = svqincb_pat (x0, SV_VL256, 16)) + +/* +** qincb_pat_n_mul4_s32: +** sqincb x0, w0, mul4, mul #16 +** ret 
+*/ +TEST_UNIFORM_S (qincb_pat_n_mul4_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_MUL4, 16), + x0 = svqincb_pat (x0, SV_MUL4, 16)) + +/* +** qincb_pat_n_mul3_s32: +** sqincb x0, w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_mul3_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_MUL3, 16), + x0 = svqincb_pat (x0, SV_MUL3, 16)) + +/* +** qincb_pat_n_all_s32: +** sqincb x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_all_s32, int32_t, + x0 = svqincb_pat_n_s32 (x0, SV_ALL, 16), + x0 = svqincb_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s64.c new file mode 100644 index 000000000..79ed73ba7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_s64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincb_pat_n_1_s64_tied: +** sqincb x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_1_s64_tied, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_POW2, 1), + x0 = svqincb_pat (x0, SV_POW2, 1)) + +/* +** qincb_pat_n_1_s64_untied: +** mov x0, x1 +** sqincb x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_1_s64_untied, int64_t, + x0 = svqincb_pat_n_s64 (x1, SV_POW2, 1), + x0 = svqincb_pat (x1, SV_POW2, 1)) + +/* +** qincb_pat_n_2_s64: +** sqincb x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_2_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_POW2, 2), + x0 = svqincb_pat (x0, SV_POW2, 2)) + +/* +** qincb_pat_n_7_s64: +** sqincb x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_7_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_POW2, 7), + x0 = svqincb_pat (x0, SV_POW2, 7)) + +/* +** qincb_pat_n_15_s64: +** sqincb x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_15_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_POW2, 15), + x0 = svqincb_pat (x0, SV_POW2, 15)) + +/* +** qincb_pat_n_16_s64: +** sqincb x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_16_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_POW2, 16), + x0 = svqincb_pat (x0, SV_POW2, 16)) + +/* +** qincb_pat_n_vl1_s64: +** sqincb x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl1_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL1, 16), + x0 = svqincb_pat (x0, SV_VL1, 16)) + +/* +** qincb_pat_n_vl2_s64: +** sqincb x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl2_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL2, 16), + x0 = svqincb_pat (x0, SV_VL2, 16)) + +/* +** qincb_pat_n_vl3_s64: +** sqincb x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl3_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL3, 16), + x0 = svqincb_pat (x0, SV_VL3, 16)) + +/* +** qincb_pat_n_vl4_s64: +** sqincb x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl4_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL4, 16), + x0 = svqincb_pat (x0, SV_VL4, 16)) + +/* +** qincb_pat_n_vl5_s64: +** sqincb x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl5_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL5, 16), + x0 = svqincb_pat (x0, SV_VL5, 16)) + +/* +** qincb_pat_n_vl6_s64: +** sqincb x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl6_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL6, 16), + x0 = svqincb_pat (x0, SV_VL6, 16)) + +/* +** qincb_pat_n_vl7_s64: +** sqincb x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl7_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL7, 16), + x0 = svqincb_pat (x0, SV_VL7, 16)) + +/* +** qincb_pat_n_vl8_s64: +** sqincb 
x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl8_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL8, 16), + x0 = svqincb_pat (x0, SV_VL8, 16)) + +/* +** qincb_pat_n_vl16_s64: +** sqincb x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl16_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL16, 16), + x0 = svqincb_pat (x0, SV_VL16, 16)) + +/* +** qincb_pat_n_vl32_s64: +** sqincb x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl32_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL32, 16), + x0 = svqincb_pat (x0, SV_VL32, 16)) + +/* +** qincb_pat_n_vl64_s64: +** sqincb x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl64_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL64, 16), + x0 = svqincb_pat (x0, SV_VL64, 16)) + +/* +** qincb_pat_n_vl128_s64: +** sqincb x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl128_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL128, 16), + x0 = svqincb_pat (x0, SV_VL128, 16)) + +/* +** qincb_pat_n_vl256_s64: +** sqincb x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl256_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_VL256, 16), + x0 = svqincb_pat (x0, SV_VL256, 16)) + +/* +** qincb_pat_n_mul4_s64: +** sqincb x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_mul4_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_MUL4, 16), + x0 = svqincb_pat (x0, SV_MUL4, 16)) + +/* +** qincb_pat_n_mul3_s64: +** sqincb x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_mul3_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_MUL3, 16), + x0 = svqincb_pat (x0, SV_MUL3, 16)) + +/* +** qincb_pat_n_all_s64: +** sqincb x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_all_s64, int64_t, + x0 = svqincb_pat_n_s64 (x0, SV_ALL, 16), + x0 = svqincb_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u32.c new file mode 100644 index 000000000..30e5f28ee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincb_pat_n_1_u32_tied: +** uqincb w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_1_u32_tied, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_POW2, 1), + x0 = svqincb_pat (x0, SV_POW2, 1)) + +/* +** qincb_pat_n_1_u32_untied: +** mov w0, w1 +** uqincb w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_1_u32_untied, uint32_t, + x0 = svqincb_pat_n_u32 (x1, SV_POW2, 1), + x0 = svqincb_pat (x1, SV_POW2, 1)) + +/* +** qincb_pat_n_2_u32: +** uqincb w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_2_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_POW2, 2), + x0 = svqincb_pat (x0, SV_POW2, 2)) + +/* +** qincb_pat_n_7_u32: +** uqincb w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_7_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_POW2, 7), + x0 = svqincb_pat (x0, SV_POW2, 7)) + +/* +** qincb_pat_n_15_u32: +** uqincb w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_15_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_POW2, 15), + x0 = svqincb_pat (x0, SV_POW2, 15)) + +/* +** qincb_pat_n_16_u32: +** uqincb w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_16_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_POW2, 16), + x0 = svqincb_pat (x0, SV_POW2, 16)) + +/* +** qincb_pat_n_vl1_u32: +** uqincb w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl1_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL1, 16), + x0 = svqincb_pat 
(x0, SV_VL1, 16)) + +/* +** qincb_pat_n_vl2_u32: +** uqincb w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl2_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL2, 16), + x0 = svqincb_pat (x0, SV_VL2, 16)) + +/* +** qincb_pat_n_vl3_u32: +** uqincb w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl3_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL3, 16), + x0 = svqincb_pat (x0, SV_VL3, 16)) + +/* +** qincb_pat_n_vl4_u32: +** uqincb w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl4_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL4, 16), + x0 = svqincb_pat (x0, SV_VL4, 16)) + +/* +** qincb_pat_n_vl5_u32: +** uqincb w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl5_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL5, 16), + x0 = svqincb_pat (x0, SV_VL5, 16)) + +/* +** qincb_pat_n_vl6_u32: +** uqincb w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl6_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL6, 16), + x0 = svqincb_pat (x0, SV_VL6, 16)) + +/* +** qincb_pat_n_vl7_u32: +** uqincb w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl7_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL7, 16), + x0 = svqincb_pat (x0, SV_VL7, 16)) + +/* +** qincb_pat_n_vl8_u32: +** uqincb w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl8_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL8, 16), + x0 = svqincb_pat (x0, SV_VL8, 16)) + +/* +** qincb_pat_n_vl16_u32: +** uqincb w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl16_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL16, 16), + x0 = svqincb_pat (x0, SV_VL16, 16)) + +/* +** qincb_pat_n_vl32_u32: +** uqincb w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl32_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL32, 16), + x0 = svqincb_pat (x0, SV_VL32, 16)) + +/* +** qincb_pat_n_vl64_u32: +** uqincb w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl64_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL64, 16), + x0 = svqincb_pat (x0, SV_VL64, 16)) + +/* +** qincb_pat_n_vl128_u32: +** uqincb w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl128_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL128, 16), + x0 = svqincb_pat (x0, SV_VL128, 16)) + +/* +** qincb_pat_n_vl256_u32: +** uqincb w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl256_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_VL256, 16), + x0 = svqincb_pat (x0, SV_VL256, 16)) + +/* +** qincb_pat_n_mul4_u32: +** uqincb w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_mul4_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_MUL4, 16), + x0 = svqincb_pat (x0, SV_MUL4, 16)) + +/* +** qincb_pat_n_mul3_u32: +** uqincb w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_mul3_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_MUL3, 16), + x0 = svqincb_pat (x0, SV_MUL3, 16)) + +/* +** qincb_pat_n_all_u32: +** uqincb w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_all_u32, uint32_t, + x0 = svqincb_pat_n_u32 (x0, SV_ALL, 16), + x0 = svqincb_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u64.c new file mode 100644 index 000000000..038b1edb6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_pat_u64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincb_pat_n_1_u64_tied: +** uqincb x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_1_u64_tied, uint64_t, + x0 
= svqincb_pat_n_u64 (x0, SV_POW2, 1), + x0 = svqincb_pat (x0, SV_POW2, 1)) + +/* +** qincb_pat_n_1_u64_untied: +** mov x0, x1 +** uqincb x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_1_u64_untied, uint64_t, + x0 = svqincb_pat_n_u64 (x1, SV_POW2, 1), + x0 = svqincb_pat (x1, SV_POW2, 1)) + +/* +** qincb_pat_n_2_u64: +** uqincb x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_2_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_POW2, 2), + x0 = svqincb_pat (x0, SV_POW2, 2)) + +/* +** qincb_pat_n_7_u64: +** uqincb x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_7_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_POW2, 7), + x0 = svqincb_pat (x0, SV_POW2, 7)) + +/* +** qincb_pat_n_15_u64: +** uqincb x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_15_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_POW2, 15), + x0 = svqincb_pat (x0, SV_POW2, 15)) + +/* +** qincb_pat_n_16_u64: +** uqincb x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_16_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_POW2, 16), + x0 = svqincb_pat (x0, SV_POW2, 16)) + +/* +** qincb_pat_n_vl1_u64: +** uqincb x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl1_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL1, 16), + x0 = svqincb_pat (x0, SV_VL1, 16)) + +/* +** qincb_pat_n_vl2_u64: +** uqincb x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl2_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL2, 16), + x0 = svqincb_pat (x0, SV_VL2, 16)) + +/* +** qincb_pat_n_vl3_u64: +** uqincb x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl3_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL3, 16), + x0 = svqincb_pat (x0, SV_VL3, 16)) + +/* +** qincb_pat_n_vl4_u64: +** uqincb x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl4_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL4, 16), + x0 = svqincb_pat (x0, SV_VL4, 16)) + +/* +** qincb_pat_n_vl5_u64: +** uqincb x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl5_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL5, 16), + x0 = svqincb_pat (x0, SV_VL5, 16)) + +/* +** qincb_pat_n_vl6_u64: +** uqincb x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl6_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL6, 16), + x0 = svqincb_pat (x0, SV_VL6, 16)) + +/* +** qincb_pat_n_vl7_u64: +** uqincb x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl7_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL7, 16), + x0 = svqincb_pat (x0, SV_VL7, 16)) + +/* +** qincb_pat_n_vl8_u64: +** uqincb x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl8_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL8, 16), + x0 = svqincb_pat (x0, SV_VL8, 16)) + +/* +** qincb_pat_n_vl16_u64: +** uqincb x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl16_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL16, 16), + x0 = svqincb_pat (x0, SV_VL16, 16)) + +/* +** qincb_pat_n_vl32_u64: +** uqincb x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl32_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL32, 16), + x0 = svqincb_pat (x0, SV_VL32, 16)) + +/* +** qincb_pat_n_vl64_u64: +** uqincb x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl64_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL64, 16), + x0 = svqincb_pat (x0, SV_VL64, 16)) + +/* +** qincb_pat_n_vl128_u64: +** uqincb x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl128_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL128, 16), + x0 = svqincb_pat (x0, SV_VL128, 16)) + +/* +** qincb_pat_n_vl256_u64: +** uqincb x0, vl256, mul 
#16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_vl256_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_VL256, 16), + x0 = svqincb_pat (x0, SV_VL256, 16)) + +/* +** qincb_pat_n_mul4_u64: +** uqincb x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_mul4_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_MUL4, 16), + x0 = svqincb_pat (x0, SV_MUL4, 16)) + +/* +** qincb_pat_n_mul3_u64: +** uqincb x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_mul3_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_MUL3, 16), + x0 = svqincb_pat (x0, SV_MUL3, 16)) + +/* +** qincb_pat_n_all_u64: +** uqincb x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_pat_n_all_u64, uint64_t, + x0 = svqincb_pat_n_u64 (x0, SV_ALL, 16), + x0 = svqincb_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s32.c new file mode 100644 index 000000000..8e74073de --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincb_n_1_s32_tied: +** sqincb x0, w0 +** ret +*/ +TEST_UNIFORM_S (qincb_n_1_s32_tied, int32_t, + x0 = svqincb_n_s32 (x0, 1), + x0 = svqincb (x0, 1)) + +/* +** qincb_n_1_s32_untied: +** mov w0, w1 +** sqincb x0, w0 +** ret +*/ +TEST_UNIFORM_S (qincb_n_1_s32_untied, int32_t, + x0 = svqincb_n_s32 (x1, 1), + x0 = svqincb (x1, 1)) + +/* +** qincb_n_2_s32: +** sqincb x0, w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincb_n_2_s32, int32_t, + x0 = svqincb_n_s32 (x0, 2), + x0 = svqincb (x0, 2)) + +/* +** qincb_n_7_s32: +** sqincb x0, w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincb_n_7_s32, int32_t, + x0 = svqincb_n_s32 (x0, 7), + x0 = svqincb (x0, 7)) + +/* +** qincb_n_15_s32: +** sqincb x0, w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincb_n_15_s32, int32_t, + x0 = svqincb_n_s32 (x0, 15), + x0 = svqincb (x0, 15)) + +/* +** qincb_n_16_s32: +** sqincb x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_n_16_s32, int32_t, + x0 = svqincb_n_s32 (x0, 16), + x0 = svqincb (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s64.c new file mode 100644 index 000000000..b064c1264 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_s64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincb_n_1_s64_tied: +** sqincb x0 +** ret +*/ +TEST_UNIFORM_S (qincb_n_1_s64_tied, int64_t, + x0 = svqincb_n_s64 (x0, 1), + x0 = svqincb (x0, 1)) + +/* +** qincb_n_1_s64_untied: +** mov x0, x1 +** sqincb x0 +** ret +*/ +TEST_UNIFORM_S (qincb_n_1_s64_untied, int64_t, + x0 = svqincb_n_s64 (x1, 1), + x0 = svqincb (x1, 1)) + +/* +** qincb_n_2_s64: +** sqincb x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincb_n_2_s64, int64_t, + x0 = svqincb_n_s64 (x0, 2), + x0 = svqincb (x0, 2)) + +/* +** qincb_n_7_s64: +** sqincb x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincb_n_7_s64, int64_t, + x0 = svqincb_n_s64 (x0, 7), + x0 = svqincb (x0, 7)) + +/* +** qincb_n_15_s64: +** sqincb x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincb_n_15_s64, int64_t, + x0 = svqincb_n_s64 (x0, 15), + x0 = svqincb (x0, 15)) + +/* +** qincb_n_16_s64: +** sqincb x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_n_16_s64, int64_t, + x0 = svqincb_n_s64 (x0, 16), + x0 = svqincb (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u32.c new file mode 100644 index 000000000..df3add73e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincb_n_1_u32_tied: +** uqincb w0 +** ret +*/ +TEST_UNIFORM_S (qincb_n_1_u32_tied, uint32_t, + x0 = svqincb_n_u32 (x0, 1), + x0 = svqincb (x0, 1)) + +/* +** qincb_n_1_u32_untied: +** mov w0, w1 +** uqincb w0 +** ret +*/ +TEST_UNIFORM_S (qincb_n_1_u32_untied, uint32_t, + x0 = svqincb_n_u32 (x1, 1), + x0 = svqincb (x1, 1)) + +/* +** qincb_n_2_u32: +** uqincb w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincb_n_2_u32, uint32_t, + x0 = svqincb_n_u32 (x0, 2), + x0 = svqincb (x0, 2)) + +/* +** qincb_n_7_u32: +** uqincb w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincb_n_7_u32, uint32_t, + x0 = svqincb_n_u32 (x0, 7), + x0 = svqincb (x0, 7)) + +/* +** qincb_n_15_u32: +** uqincb w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincb_n_15_u32, uint32_t, + x0 = svqincb_n_u32 (x0, 15), + x0 = svqincb (x0, 15)) + +/* +** qincb_n_16_u32: +** uqincb w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_n_16_u32, uint32_t, + x0 = svqincb_n_u32 (x0, 16), + x0 = svqincb (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u64.c new file mode 100644 index 000000000..d9a08c865 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincb_u64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincb_n_1_u64_tied: +** uqincb x0 +** ret +*/ +TEST_UNIFORM_S (qincb_n_1_u64_tied, uint64_t, + x0 = svqincb_n_u64 (x0, 1), + x0 = svqincb (x0, 1)) + +/* +** qincb_n_1_u64_untied: +** mov x0, x1 +** uqincb x0 +** ret +*/ +TEST_UNIFORM_S (qincb_n_1_u64_untied, uint64_t, + x0 = svqincb_n_u64 (x1, 1), + x0 = svqincb (x1, 1)) + +/* +** qincb_n_2_u64: +** uqincb x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincb_n_2_u64, uint64_t, + x0 = svqincb_n_u64 (x0, 2), + x0 = svqincb (x0, 2)) + +/* +** qincb_n_7_u64: +** uqincb x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincb_n_7_u64, uint64_t, + x0 = svqincb_n_u64 (x0, 7), + x0 = svqincb (x0, 7)) + +/* +** qincb_n_15_u64: +** uqincb x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincb_n_15_u64, uint64_t, + x0 = svqincb_n_u64 (x0, 15), + x0 = svqincb (x0, 15)) + +/* +** qincb_n_16_u64: +** uqincb x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincb_n_16_u64, uint64_t, + x0 = svqincb_n_u64 (x0, 16), + x0 = svqincb (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s32.c new file mode 100644 index 000000000..061f88314 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincd_pat_n_1_s32_tied: +** sqincd x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_1_s32_tied, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_POW2, 1), + x0 = svqincd_pat (x0, SV_POW2, 1)) + +/* +** qincd_pat_n_1_s32_untied: +** mov w0, w1 +** sqincd x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_1_s32_untied, int32_t, + x0 = svqincd_pat_n_s32 (x1, SV_POW2, 1), + x0 = svqincd_pat (x1, SV_POW2, 1)) + +/* +** qincd_pat_n_2_s32: +** sqincd x0, w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_2_s32, 
int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_POW2, 2), + x0 = svqincd_pat (x0, SV_POW2, 2)) + +/* +** qincd_pat_n_7_s32: +** sqincd x0, w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_7_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_POW2, 7), + x0 = svqincd_pat (x0, SV_POW2, 7)) + +/* +** qincd_pat_n_15_s32: +** sqincd x0, w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_15_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_POW2, 15), + x0 = svqincd_pat (x0, SV_POW2, 15)) + +/* +** qincd_pat_n_16_s32: +** sqincd x0, w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_16_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_POW2, 16), + x0 = svqincd_pat (x0, SV_POW2, 16)) + +/* +** qincd_pat_n_vl1_s32: +** sqincd x0, w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl1_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL1, 16), + x0 = svqincd_pat (x0, SV_VL1, 16)) + +/* +** qincd_pat_n_vl2_s32: +** sqincd x0, w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl2_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL2, 16), + x0 = svqincd_pat (x0, SV_VL2, 16)) + +/* +** qincd_pat_n_vl3_s32: +** sqincd x0, w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl3_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL3, 16), + x0 = svqincd_pat (x0, SV_VL3, 16)) + +/* +** qincd_pat_n_vl4_s32: +** sqincd x0, w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl4_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL4, 16), + x0 = svqincd_pat (x0, SV_VL4, 16)) + +/* +** qincd_pat_n_vl5_s32: +** sqincd x0, w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl5_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL5, 16), + x0 = svqincd_pat (x0, SV_VL5, 16)) + +/* +** qincd_pat_n_vl6_s32: +** sqincd x0, w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl6_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL6, 16), + x0 = svqincd_pat (x0, SV_VL6, 16)) + +/* +** qincd_pat_n_vl7_s32: +** sqincd x0, w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl7_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL7, 16), + x0 = svqincd_pat (x0, SV_VL7, 16)) + +/* +** qincd_pat_n_vl8_s32: +** sqincd x0, w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl8_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL8, 16), + x0 = svqincd_pat (x0, SV_VL8, 16)) + +/* +** qincd_pat_n_vl16_s32: +** sqincd x0, w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl16_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL16, 16), + x0 = svqincd_pat (x0, SV_VL16, 16)) + +/* +** qincd_pat_n_vl32_s32: +** sqincd x0, w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl32_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL32, 16), + x0 = svqincd_pat (x0, SV_VL32, 16)) + +/* +** qincd_pat_n_vl64_s32: +** sqincd x0, w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl64_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL64, 16), + x0 = svqincd_pat (x0, SV_VL64, 16)) + +/* +** qincd_pat_n_vl128_s32: +** sqincd x0, w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl128_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL128, 16), + x0 = svqincd_pat (x0, SV_VL128, 16)) + +/* +** qincd_pat_n_vl256_s32: +** sqincd x0, w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl256_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_VL256, 16), + x0 = svqincd_pat (x0, SV_VL256, 16)) + +/* +** qincd_pat_n_mul4_s32: +** sqincd x0, w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_mul4_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_MUL4, 16), + x0 = svqincd_pat (x0, 
SV_MUL4, 16)) + +/* +** qincd_pat_n_mul3_s32: +** sqincd x0, w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_mul3_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_MUL3, 16), + x0 = svqincd_pat (x0, SV_MUL3, 16)) + +/* +** qincd_pat_n_all_s32: +** sqincd x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_all_s32, int32_t, + x0 = svqincd_pat_n_s32 (x0, SV_ALL, 16), + x0 = svqincd_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s64.c new file mode 100644 index 000000000..02b53e1bc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_s64.c @@ -0,0 +1,401 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincd_pat_1_s64_tied: +** sqincd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_1_s64_tied, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_POW2, 1), + z0 = svqincd_pat (z0, SV_POW2, 1)) + +/* +** qincd_pat_1_s64_untied: +** movprfx z0, z1 +** sqincd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_1_s64_untied, svint64_t, + z0 = svqincd_pat_s64 (z1, SV_POW2, 1), + z0 = svqincd_pat (z1, SV_POW2, 1)) + +/* +** qincd_pat_2_s64: +** sqincd z0\.d, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_2_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_POW2, 2), + z0 = svqincd_pat (z0, SV_POW2, 2)) + +/* +** qincd_pat_7_s64: +** sqincd z0\.d, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_7_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_POW2, 7), + z0 = svqincd_pat (z0, SV_POW2, 7)) + +/* +** qincd_pat_15_s64: +** sqincd z0\.d, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_15_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_POW2, 15), + z0 = svqincd_pat (z0, SV_POW2, 15)) + +/* +** qincd_pat_16_s64: +** sqincd z0\.d, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_16_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_POW2, 16), + z0 = svqincd_pat (z0, SV_POW2, 16)) + +/* +** qincd_pat_vl1_s64: +** sqincd z0\.d, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl1_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL1, 16), + z0 = svqincd_pat (z0, SV_VL1, 16)) + +/* +** qincd_pat_vl2_s64: +** sqincd z0\.d, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl2_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL2, 16), + z0 = svqincd_pat (z0, SV_VL2, 16)) + +/* +** qincd_pat_vl3_s64: +** sqincd z0\.d, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl3_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL3, 16), + z0 = svqincd_pat (z0, SV_VL3, 16)) + +/* +** qincd_pat_vl4_s64: +** sqincd z0\.d, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl4_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL4, 16), + z0 = svqincd_pat (z0, SV_VL4, 16)) + +/* +** qincd_pat_vl5_s64: +** sqincd z0\.d, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl5_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL5, 16), + z0 = svqincd_pat (z0, SV_VL5, 16)) + +/* +** qincd_pat_vl6_s64: +** sqincd z0\.d, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl6_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL6, 16), + z0 = svqincd_pat (z0, SV_VL6, 16)) + +/* +** qincd_pat_vl7_s64: +** sqincd z0\.d, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl7_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL7, 16), + z0 = svqincd_pat (z0, SV_VL7, 16)) + +/* +** qincd_pat_vl8_s64: +** sqincd z0\.d, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl8_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL8, 16), + z0 = 
svqincd_pat (z0, SV_VL8, 16)) + +/* +** qincd_pat_vl16_s64: +** sqincd z0\.d, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl16_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL16, 16), + z0 = svqincd_pat (z0, SV_VL16, 16)) + +/* +** qincd_pat_vl32_s64: +** sqincd z0\.d, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl32_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL32, 16), + z0 = svqincd_pat (z0, SV_VL32, 16)) + +/* +** qincd_pat_vl64_s64: +** sqincd z0\.d, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl64_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL64, 16), + z0 = svqincd_pat (z0, SV_VL64, 16)) + +/* +** qincd_pat_vl128_s64: +** sqincd z0\.d, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl128_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL128, 16), + z0 = svqincd_pat (z0, SV_VL128, 16)) + +/* +** qincd_pat_vl256_s64: +** sqincd z0\.d, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl256_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_VL256, 16), + z0 = svqincd_pat (z0, SV_VL256, 16)) + +/* +** qincd_pat_mul4_s64: +** sqincd z0\.d, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_mul4_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_MUL4, 16), + z0 = svqincd_pat (z0, SV_MUL4, 16)) + +/* +** qincd_pat_mul3_s64: +** sqincd z0\.d, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_mul3_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_MUL3, 16), + z0 = svqincd_pat (z0, SV_MUL3, 16)) + +/* +** qincd_pat_all_s64: +** sqincd z0\.d, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_all_s64, svint64_t, + z0 = svqincd_pat_s64 (z0, SV_ALL, 16), + z0 = svqincd_pat (z0, SV_ALL, 16)) + +/* +** qincd_pat_n_1_s64_tied: +** sqincd x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_1_s64_tied, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_POW2, 1), + x0 = svqincd_pat (x0, SV_POW2, 1)) + +/* +** qincd_pat_n_1_s64_untied: +** mov x0, x1 +** sqincd x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_1_s64_untied, int64_t, + x0 = svqincd_pat_n_s64 (x1, SV_POW2, 1), + x0 = svqincd_pat (x1, SV_POW2, 1)) + +/* +** qincd_pat_n_2_s64: +** sqincd x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_2_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_POW2, 2), + x0 = svqincd_pat (x0, SV_POW2, 2)) + +/* +** qincd_pat_n_7_s64: +** sqincd x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_7_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_POW2, 7), + x0 = svqincd_pat (x0, SV_POW2, 7)) + +/* +** qincd_pat_n_15_s64: +** sqincd x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_15_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_POW2, 15), + x0 = svqincd_pat (x0, SV_POW2, 15)) + +/* +** qincd_pat_n_16_s64: +** sqincd x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_16_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_POW2, 16), + x0 = svqincd_pat (x0, SV_POW2, 16)) + +/* +** qincd_pat_n_vl1_s64: +** sqincd x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl1_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL1, 16), + x0 = svqincd_pat (x0, SV_VL1, 16)) + +/* +** qincd_pat_n_vl2_s64: +** sqincd x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl2_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL2, 16), + x0 = svqincd_pat (x0, SV_VL2, 16)) + +/* +** qincd_pat_n_vl3_s64: +** sqincd x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl3_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL3, 16), + x0 = svqincd_pat (x0, SV_VL3, 16)) + +/* +** qincd_pat_n_vl4_s64: +** sqincd x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl4_s64, 
int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL4, 16), + x0 = svqincd_pat (x0, SV_VL4, 16)) + +/* +** qincd_pat_n_vl5_s64: +** sqincd x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl5_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL5, 16), + x0 = svqincd_pat (x0, SV_VL5, 16)) + +/* +** qincd_pat_n_vl6_s64: +** sqincd x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl6_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL6, 16), + x0 = svqincd_pat (x0, SV_VL6, 16)) + +/* +** qincd_pat_n_vl7_s64: +** sqincd x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl7_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL7, 16), + x0 = svqincd_pat (x0, SV_VL7, 16)) + +/* +** qincd_pat_n_vl8_s64: +** sqincd x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl8_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL8, 16), + x0 = svqincd_pat (x0, SV_VL8, 16)) + +/* +** qincd_pat_n_vl16_s64: +** sqincd x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl16_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL16, 16), + x0 = svqincd_pat (x0, SV_VL16, 16)) + +/* +** qincd_pat_n_vl32_s64: +** sqincd x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl32_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL32, 16), + x0 = svqincd_pat (x0, SV_VL32, 16)) + +/* +** qincd_pat_n_vl64_s64: +** sqincd x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl64_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL64, 16), + x0 = svqincd_pat (x0, SV_VL64, 16)) + +/* +** qincd_pat_n_vl128_s64: +** sqincd x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl128_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL128, 16), + x0 = svqincd_pat (x0, SV_VL128, 16)) + +/* +** qincd_pat_n_vl256_s64: +** sqincd x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl256_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_VL256, 16), + x0 = svqincd_pat (x0, SV_VL256, 16)) + +/* +** qincd_pat_n_mul4_s64: +** sqincd x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_mul4_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_MUL4, 16), + x0 = svqincd_pat (x0, SV_MUL4, 16)) + +/* +** qincd_pat_n_mul3_s64: +** sqincd x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_mul3_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_MUL3, 16), + x0 = svqincd_pat (x0, SV_MUL3, 16)) + +/* +** qincd_pat_n_all_s64: +** sqincd x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_all_s64, int64_t, + x0 = svqincd_pat_n_s64 (x0, SV_ALL, 16), + x0 = svqincd_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u32.c new file mode 100644 index 000000000..0e3cbdb54 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincd_pat_n_1_u32_tied: +** uqincd w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_1_u32_tied, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_POW2, 1), + x0 = svqincd_pat (x0, SV_POW2, 1)) + +/* +** qincd_pat_n_1_u32_untied: +** mov w0, w1 +** uqincd w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_1_u32_untied, uint32_t, + x0 = svqincd_pat_n_u32 (x1, SV_POW2, 1), + x0 = svqincd_pat (x1, SV_POW2, 1)) + +/* +** qincd_pat_n_2_u32: +** uqincd w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_2_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_POW2, 2), + x0 = svqincd_pat (x0, SV_POW2, 2)) + +/* +** qincd_pat_n_7_u32: +** uqincd w0, pow2, 
mul #7 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_7_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_POW2, 7), + x0 = svqincd_pat (x0, SV_POW2, 7)) + +/* +** qincd_pat_n_15_u32: +** uqincd w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_15_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_POW2, 15), + x0 = svqincd_pat (x0, SV_POW2, 15)) + +/* +** qincd_pat_n_16_u32: +** uqincd w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_16_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_POW2, 16), + x0 = svqincd_pat (x0, SV_POW2, 16)) + +/* +** qincd_pat_n_vl1_u32: +** uqincd w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl1_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL1, 16), + x0 = svqincd_pat (x0, SV_VL1, 16)) + +/* +** qincd_pat_n_vl2_u32: +** uqincd w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl2_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL2, 16), + x0 = svqincd_pat (x0, SV_VL2, 16)) + +/* +** qincd_pat_n_vl3_u32: +** uqincd w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl3_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL3, 16), + x0 = svqincd_pat (x0, SV_VL3, 16)) + +/* +** qincd_pat_n_vl4_u32: +** uqincd w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl4_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL4, 16), + x0 = svqincd_pat (x0, SV_VL4, 16)) + +/* +** qincd_pat_n_vl5_u32: +** uqincd w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl5_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL5, 16), + x0 = svqincd_pat (x0, SV_VL5, 16)) + +/* +** qincd_pat_n_vl6_u32: +** uqincd w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl6_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL6, 16), + x0 = svqincd_pat (x0, SV_VL6, 16)) + +/* +** qincd_pat_n_vl7_u32: +** uqincd w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl7_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL7, 16), + x0 = svqincd_pat (x0, SV_VL7, 16)) + +/* +** qincd_pat_n_vl8_u32: +** uqincd w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl8_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL8, 16), + x0 = svqincd_pat (x0, SV_VL8, 16)) + +/* +** qincd_pat_n_vl16_u32: +** uqincd w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl16_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL16, 16), + x0 = svqincd_pat (x0, SV_VL16, 16)) + +/* +** qincd_pat_n_vl32_u32: +** uqincd w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl32_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL32, 16), + x0 = svqincd_pat (x0, SV_VL32, 16)) + +/* +** qincd_pat_n_vl64_u32: +** uqincd w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl64_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL64, 16), + x0 = svqincd_pat (x0, SV_VL64, 16)) + +/* +** qincd_pat_n_vl128_u32: +** uqincd w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl128_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL128, 16), + x0 = svqincd_pat (x0, SV_VL128, 16)) + +/* +** qincd_pat_n_vl256_u32: +** uqincd w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl256_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_VL256, 16), + x0 = svqincd_pat (x0, SV_VL256, 16)) + +/* +** qincd_pat_n_mul4_u32: +** uqincd w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_mul4_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_MUL4, 16), + x0 = svqincd_pat (x0, SV_MUL4, 16)) + +/* +** qincd_pat_n_mul3_u32: +** uqincd w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_mul3_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_MUL3, 16), + x0 = 
svqincd_pat (x0, SV_MUL3, 16)) + +/* +** qincd_pat_n_all_u32: +** uqincd w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_all_u32, uint32_t, + x0 = svqincd_pat_n_u32 (x0, SV_ALL, 16), + x0 = svqincd_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u64.c new file mode 100644 index 000000000..49dc350df --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_pat_u64.c @@ -0,0 +1,401 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincd_pat_1_u64_tied: +** uqincd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_1_u64_tied, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_POW2, 1), + z0 = svqincd_pat (z0, SV_POW2, 1)) + +/* +** qincd_pat_1_u64_untied: +** movprfx z0, z1 +** uqincd z0\.d, pow2 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_1_u64_untied, svuint64_t, + z0 = svqincd_pat_u64 (z1, SV_POW2, 1), + z0 = svqincd_pat (z1, SV_POW2, 1)) + +/* +** qincd_pat_2_u64: +** uqincd z0\.d, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_2_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_POW2, 2), + z0 = svqincd_pat (z0, SV_POW2, 2)) + +/* +** qincd_pat_7_u64: +** uqincd z0\.d, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_7_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_POW2, 7), + z0 = svqincd_pat (z0, SV_POW2, 7)) + +/* +** qincd_pat_15_u64: +** uqincd z0\.d, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_15_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_POW2, 15), + z0 = svqincd_pat (z0, SV_POW2, 15)) + +/* +** qincd_pat_16_u64: +** uqincd z0\.d, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_16_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_POW2, 16), + z0 = svqincd_pat (z0, SV_POW2, 16)) + +/* +** qincd_pat_vl1_u64: +** uqincd z0\.d, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl1_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL1, 16), + z0 = svqincd_pat (z0, SV_VL1, 16)) + +/* +** qincd_pat_vl2_u64: +** uqincd z0\.d, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl2_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL2, 16), + z0 = svqincd_pat (z0, SV_VL2, 16)) + +/* +** qincd_pat_vl3_u64: +** uqincd z0\.d, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl3_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL3, 16), + z0 = svqincd_pat (z0, SV_VL3, 16)) + +/* +** qincd_pat_vl4_u64: +** uqincd z0\.d, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl4_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL4, 16), + z0 = svqincd_pat (z0, SV_VL4, 16)) + +/* +** qincd_pat_vl5_u64: +** uqincd z0\.d, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl5_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL5, 16), + z0 = svqincd_pat (z0, SV_VL5, 16)) + +/* +** qincd_pat_vl6_u64: +** uqincd z0\.d, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl6_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL6, 16), + z0 = svqincd_pat (z0, SV_VL6, 16)) + +/* +** qincd_pat_vl7_u64: +** uqincd z0\.d, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl7_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL7, 16), + z0 = svqincd_pat (z0, SV_VL7, 16)) + +/* +** qincd_pat_vl8_u64: +** uqincd z0\.d, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl8_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL8, 16), + z0 = svqincd_pat (z0, SV_VL8, 16)) + +/* +** qincd_pat_vl16_u64: +** uqincd z0\.d, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl16_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, 
SV_VL16, 16), + z0 = svqincd_pat (z0, SV_VL16, 16)) + +/* +** qincd_pat_vl32_u64: +** uqincd z0\.d, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl32_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL32, 16), + z0 = svqincd_pat (z0, SV_VL32, 16)) + +/* +** qincd_pat_vl64_u64: +** uqincd z0\.d, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl64_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL64, 16), + z0 = svqincd_pat (z0, SV_VL64, 16)) + +/* +** qincd_pat_vl128_u64: +** uqincd z0\.d, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl128_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL128, 16), + z0 = svqincd_pat (z0, SV_VL128, 16)) + +/* +** qincd_pat_vl256_u64: +** uqincd z0\.d, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_vl256_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_VL256, 16), + z0 = svqincd_pat (z0, SV_VL256, 16)) + +/* +** qincd_pat_mul4_u64: +** uqincd z0\.d, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_mul4_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_MUL4, 16), + z0 = svqincd_pat (z0, SV_MUL4, 16)) + +/* +** qincd_pat_mul3_u64: +** uqincd z0\.d, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_mul3_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_MUL3, 16), + z0 = svqincd_pat (z0, SV_MUL3, 16)) + +/* +** qincd_pat_all_u64: +** uqincd z0\.d, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_pat_all_u64, svuint64_t, + z0 = svqincd_pat_u64 (z0, SV_ALL, 16), + z0 = svqincd_pat (z0, SV_ALL, 16)) + +/* +** qincd_pat_n_1_u64_tied: +** uqincd x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_1_u64_tied, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_POW2, 1), + x0 = svqincd_pat (x0, SV_POW2, 1)) + +/* +** qincd_pat_n_1_u64_untied: +** mov x0, x1 +** uqincd x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_1_u64_untied, uint64_t, + x0 = svqincd_pat_n_u64 (x1, SV_POW2, 1), + x0 = svqincd_pat (x1, SV_POW2, 1)) + +/* +** qincd_pat_n_2_u64: +** uqincd x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_2_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_POW2, 2), + x0 = svqincd_pat (x0, SV_POW2, 2)) + +/* +** qincd_pat_n_7_u64: +** uqincd x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_7_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_POW2, 7), + x0 = svqincd_pat (x0, SV_POW2, 7)) + +/* +** qincd_pat_n_15_u64: +** uqincd x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_15_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_POW2, 15), + x0 = svqincd_pat (x0, SV_POW2, 15)) + +/* +** qincd_pat_n_16_u64: +** uqincd x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_16_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_POW2, 16), + x0 = svqincd_pat (x0, SV_POW2, 16)) + +/* +** qincd_pat_n_vl1_u64: +** uqincd x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl1_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL1, 16), + x0 = svqincd_pat (x0, SV_VL1, 16)) + +/* +** qincd_pat_n_vl2_u64: +** uqincd x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl2_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL2, 16), + x0 = svqincd_pat (x0, SV_VL2, 16)) + +/* +** qincd_pat_n_vl3_u64: +** uqincd x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl3_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL3, 16), + x0 = svqincd_pat (x0, SV_VL3, 16)) + +/* +** qincd_pat_n_vl4_u64: +** uqincd x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl4_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL4, 16), + x0 = svqincd_pat (x0, SV_VL4, 16)) + +/* +** qincd_pat_n_vl5_u64: +** uqincd x0, vl5, mul #16 +** ret +*/ 
+TEST_UNIFORM_S (qincd_pat_n_vl5_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL5, 16), + x0 = svqincd_pat (x0, SV_VL5, 16)) + +/* +** qincd_pat_n_vl6_u64: +** uqincd x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl6_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL6, 16), + x0 = svqincd_pat (x0, SV_VL6, 16)) + +/* +** qincd_pat_n_vl7_u64: +** uqincd x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl7_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL7, 16), + x0 = svqincd_pat (x0, SV_VL7, 16)) + +/* +** qincd_pat_n_vl8_u64: +** uqincd x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl8_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL8, 16), + x0 = svqincd_pat (x0, SV_VL8, 16)) + +/* +** qincd_pat_n_vl16_u64: +** uqincd x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl16_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL16, 16), + x0 = svqincd_pat (x0, SV_VL16, 16)) + +/* +** qincd_pat_n_vl32_u64: +** uqincd x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl32_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL32, 16), + x0 = svqincd_pat (x0, SV_VL32, 16)) + +/* +** qincd_pat_n_vl64_u64: +** uqincd x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl64_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL64, 16), + x0 = svqincd_pat (x0, SV_VL64, 16)) + +/* +** qincd_pat_n_vl128_u64: +** uqincd x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl128_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL128, 16), + x0 = svqincd_pat (x0, SV_VL128, 16)) + +/* +** qincd_pat_n_vl256_u64: +** uqincd x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_vl256_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_VL256, 16), + x0 = svqincd_pat (x0, SV_VL256, 16)) + +/* +** qincd_pat_n_mul4_u64: +** uqincd x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_mul4_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_MUL4, 16), + x0 = svqincd_pat (x0, SV_MUL4, 16)) + +/* +** qincd_pat_n_mul3_u64: +** uqincd x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_mul3_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_MUL3, 16), + x0 = svqincd_pat (x0, SV_MUL3, 16)) + +/* +** qincd_pat_n_all_u64: +** uqincd x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_pat_n_all_u64, uint64_t, + x0 = svqincd_pat_n_u64 (x0, SV_ALL, 16), + x0 = svqincd_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s32.c new file mode 100644 index 000000000..2fa0438a3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincd_n_1_s32_tied: +** sqincd x0, w0 +** ret +*/ +TEST_UNIFORM_S (qincd_n_1_s32_tied, int32_t, + x0 = svqincd_n_s32 (x0, 1), + x0 = svqincd (x0, 1)) + +/* +** qincd_n_1_s32_untied: +** mov w0, w1 +** sqincd x0, w0 +** ret +*/ +TEST_UNIFORM_S (qincd_n_1_s32_untied, int32_t, + x0 = svqincd_n_s32 (x1, 1), + x0 = svqincd (x1, 1)) + +/* +** qincd_n_2_s32: +** sqincd x0, w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincd_n_2_s32, int32_t, + x0 = svqincd_n_s32 (x0, 2), + x0 = svqincd (x0, 2)) + +/* +** qincd_n_7_s32: +** sqincd x0, w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincd_n_7_s32, int32_t, + x0 = svqincd_n_s32 (x0, 7), + x0 = svqincd (x0, 7)) + +/* +** qincd_n_15_s32: +** sqincd x0, w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincd_n_15_s32, int32_t, + x0 = svqincd_n_s32 (x0, 15), + x0 = 
svqincd (x0, 15)) + +/* +** qincd_n_16_s32: +** sqincd x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_n_16_s32, int32_t, + x0 = svqincd_n_s32 (x0, 16), + x0 = svqincd (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s64.c new file mode 100644 index 000000000..0920ac2ec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_s64.c @@ -0,0 +1,113 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincd_1_s64_tied: +** sqincd z0\.d +** ret +*/ +TEST_UNIFORM_Z (qincd_1_s64_tied, svint64_t, + z0 = svqincd_s64 (z0, 1), + z0 = svqincd (z0, 1)) + +/* +** qincd_1_s64_untied: +** movprfx z0, z1 +** sqincd z0\.d +** ret +*/ +TEST_UNIFORM_Z (qincd_1_s64_untied, svint64_t, + z0 = svqincd_s64 (z1, 1), + z0 = svqincd (z1, 1)) + +/* +** qincd_2_s64: +** sqincd z0\.d, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qincd_2_s64, svint64_t, + z0 = svqincd_s64 (z0, 2), + z0 = svqincd (z0, 2)) + +/* +** qincd_7_s64: +** sqincd z0\.d, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qincd_7_s64, svint64_t, + z0 = svqincd_s64 (z0, 7), + z0 = svqincd (z0, 7)) + +/* +** qincd_15_s64: +** sqincd z0\.d, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qincd_15_s64, svint64_t, + z0 = svqincd_s64 (z0, 15), + z0 = svqincd (z0, 15)) + +/* +** qincd_16_s64: +** sqincd z0\.d, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_16_s64, svint64_t, + z0 = svqincd_s64 (z0, 16), + z0 = svqincd (z0, 16)) + +/* +** qincd_n_1_s64_tied: +** sqincd x0 +** ret +*/ +TEST_UNIFORM_S (qincd_n_1_s64_tied, int64_t, + x0 = svqincd_n_s64 (x0, 1), + x0 = svqincd (x0, 1)) + +/* +** qincd_n_1_s64_untied: +** mov x0, x1 +** sqincd x0 +** ret +*/ +TEST_UNIFORM_S (qincd_n_1_s64_untied, int64_t, + x0 = svqincd_n_s64 (x1, 1), + x0 = svqincd (x1, 1)) + +/* +** qincd_n_2_s64: +** sqincd x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincd_n_2_s64, int64_t, + x0 = svqincd_n_s64 (x0, 2), + x0 = svqincd (x0, 2)) + +/* +** qincd_n_7_s64: +** sqincd x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincd_n_7_s64, int64_t, + x0 = svqincd_n_s64 (x0, 7), + x0 = svqincd (x0, 7)) + +/* +** qincd_n_15_s64: +** sqincd x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincd_n_15_s64, int64_t, + x0 = svqincd_n_s64 (x0, 15), + x0 = svqincd (x0, 15)) + +/* +** qincd_n_16_s64: +** sqincd x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_n_16_s64, int64_t, + x0 = svqincd_n_s64 (x0, 16), + x0 = svqincd (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u32.c new file mode 100644 index 000000000..33dc12cb1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincd_n_1_u32_tied: +** uqincd w0 +** ret +*/ +TEST_UNIFORM_S (qincd_n_1_u32_tied, uint32_t, + x0 = svqincd_n_u32 (x0, 1), + x0 = svqincd (x0, 1)) + +/* +** qincd_n_1_u32_untied: +** mov w0, w1 +** uqincd w0 +** ret +*/ +TEST_UNIFORM_S (qincd_n_1_u32_untied, uint32_t, + x0 = svqincd_n_u32 (x1, 1), + x0 = svqincd (x1, 1)) + +/* +** qincd_n_2_u32: +** uqincd w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincd_n_2_u32, uint32_t, + x0 = svqincd_n_u32 (x0, 2), + x0 = svqincd (x0, 2)) + +/* +** qincd_n_7_u32: +** uqincd w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincd_n_7_u32, uint32_t, + x0 = svqincd_n_u32 (x0, 7), + x0 = svqincd (x0, 7)) + +/* +** 
qincd_n_15_u32: +** uqincd w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincd_n_15_u32, uint32_t, + x0 = svqincd_n_u32 (x0, 15), + x0 = svqincd (x0, 15)) + +/* +** qincd_n_16_u32: +** uqincd w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_n_16_u32, uint32_t, + x0 = svqincd_n_u32 (x0, 16), + x0 = svqincd (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u64.c new file mode 100644 index 000000000..28c611a8f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincd_u64.c @@ -0,0 +1,113 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincd_1_u64_tied: +** uqincd z0\.d +** ret +*/ +TEST_UNIFORM_Z (qincd_1_u64_tied, svuint64_t, + z0 = svqincd_u64 (z0, 1), + z0 = svqincd (z0, 1)) + +/* +** qincd_1_u64_untied: +** movprfx z0, z1 +** uqincd z0\.d +** ret +*/ +TEST_UNIFORM_Z (qincd_1_u64_untied, svuint64_t, + z0 = svqincd_u64 (z1, 1), + z0 = svqincd (z1, 1)) + +/* +** qincd_2_u64: +** uqincd z0\.d, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qincd_2_u64, svuint64_t, + z0 = svqincd_u64 (z0, 2), + z0 = svqincd (z0, 2)) + +/* +** qincd_7_u64: +** uqincd z0\.d, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qincd_7_u64, svuint64_t, + z0 = svqincd_u64 (z0, 7), + z0 = svqincd (z0, 7)) + +/* +** qincd_15_u64: +** uqincd z0\.d, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qincd_15_u64, svuint64_t, + z0 = svqincd_u64 (z0, 15), + z0 = svqincd (z0, 15)) + +/* +** qincd_16_u64: +** uqincd z0\.d, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincd_16_u64, svuint64_t, + z0 = svqincd_u64 (z0, 16), + z0 = svqincd (z0, 16)) + +/* +** qincd_n_1_u64_tied: +** uqincd x0 +** ret +*/ +TEST_UNIFORM_S (qincd_n_1_u64_tied, uint64_t, + x0 = svqincd_n_u64 (x0, 1), + x0 = svqincd (x0, 1)) + +/* +** qincd_n_1_u64_untied: +** mov x0, x1 +** uqincd x0 +** ret +*/ +TEST_UNIFORM_S (qincd_n_1_u64_untied, uint64_t, + x0 = svqincd_n_u64 (x1, 1), + x0 = svqincd (x1, 1)) + +/* +** qincd_n_2_u64: +** uqincd x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincd_n_2_u64, uint64_t, + x0 = svqincd_n_u64 (x0, 2), + x0 = svqincd (x0, 2)) + +/* +** qincd_n_7_u64: +** uqincd x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincd_n_7_u64, uint64_t, + x0 = svqincd_n_u64 (x0, 7), + x0 = svqincd (x0, 7)) + +/* +** qincd_n_15_u64: +** uqincd x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincd_n_15_u64, uint64_t, + x0 = svqincd_n_u64 (x0, 15), + x0 = svqincd (x0, 15)) + +/* +** qincd_n_16_u64: +** uqincd x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincd_n_16_u64, uint64_t, + x0 = svqincd_n_u64 (x0, 16), + x0 = svqincd (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s16.c new file mode 100644 index 000000000..708d635c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s16.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_pat_1_s16_tied: +** sqinch z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_1_s16_tied, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_POW2, 1), + z0 = svqinch_pat (z0, SV_POW2, 1)) + +/* +** qinch_pat_1_s16_untied: +** movprfx z0, z1 +** sqinch z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_1_s16_untied, svint16_t, + z0 = svqinch_pat_s16 (z1, SV_POW2, 1), + z0 = svqinch_pat (z1, SV_POW2, 1)) + +/* +** qinch_pat_2_s16: +** sqinch z0\.h, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z 
(qinch_pat_2_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_POW2, 2), + z0 = svqinch_pat (z0, SV_POW2, 2)) + +/* +** qinch_pat_7_s16: +** sqinch z0\.h, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_7_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_POW2, 7), + z0 = svqinch_pat (z0, SV_POW2, 7)) + +/* +** qinch_pat_15_s16: +** sqinch z0\.h, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_15_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_POW2, 15), + z0 = svqinch_pat (z0, SV_POW2, 15)) + +/* +** qinch_pat_16_s16: +** sqinch z0\.h, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_16_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_POW2, 16), + z0 = svqinch_pat (z0, SV_POW2, 16)) + +/* +** qinch_pat_vl1_s16: +** sqinch z0\.h, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl1_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL1, 16), + z0 = svqinch_pat (z0, SV_VL1, 16)) + +/* +** qinch_pat_vl2_s16: +** sqinch z0\.h, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl2_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL2, 16), + z0 = svqinch_pat (z0, SV_VL2, 16)) + +/* +** qinch_pat_vl3_s16: +** sqinch z0\.h, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl3_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL3, 16), + z0 = svqinch_pat (z0, SV_VL3, 16)) + +/* +** qinch_pat_vl4_s16: +** sqinch z0\.h, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl4_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL4, 16), + z0 = svqinch_pat (z0, SV_VL4, 16)) + +/* +** qinch_pat_vl5_s16: +** sqinch z0\.h, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl5_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL5, 16), + z0 = svqinch_pat (z0, SV_VL5, 16)) + +/* +** qinch_pat_vl6_s16: +** sqinch z0\.h, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl6_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL6, 16), + z0 = svqinch_pat (z0, SV_VL6, 16)) + +/* +** qinch_pat_vl7_s16: +** sqinch z0\.h, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl7_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL7, 16), + z0 = svqinch_pat (z0, SV_VL7, 16)) + +/* +** qinch_pat_vl8_s16: +** sqinch z0\.h, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl8_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL8, 16), + z0 = svqinch_pat (z0, SV_VL8, 16)) + +/* +** qinch_pat_vl16_s16: +** sqinch z0\.h, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl16_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL16, 16), + z0 = svqinch_pat (z0, SV_VL16, 16)) + +/* +** qinch_pat_vl32_s16: +** sqinch z0\.h, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl32_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL32, 16), + z0 = svqinch_pat (z0, SV_VL32, 16)) + +/* +** qinch_pat_vl64_s16: +** sqinch z0\.h, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl64_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL64, 16), + z0 = svqinch_pat (z0, SV_VL64, 16)) + +/* +** qinch_pat_vl128_s16: +** sqinch z0\.h, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl128_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL128, 16), + z0 = svqinch_pat (z0, SV_VL128, 16)) + +/* +** qinch_pat_vl256_s16: +** sqinch z0\.h, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl256_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_VL256, 16), + z0 = svqinch_pat (z0, SV_VL256, 16)) + +/* +** qinch_pat_mul4_s16: +** sqinch z0\.h, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_mul4_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_MUL4, 16), + z0 = svqinch_pat (z0, SV_MUL4, 16)) + +/* +** qinch_pat_mul3_s16: +** sqinch z0\.h, mul3, mul 
#16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_mul3_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_MUL3, 16), + z0 = svqinch_pat (z0, SV_MUL3, 16)) + +/* +** qinch_pat_all_s16: +** sqinch z0\.h, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_all_s16, svint16_t, + z0 = svqinch_pat_s16 (z0, SV_ALL, 16), + z0 = svqinch_pat (z0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s32.c new file mode 100644 index 000000000..7c91c6202 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_pat_n_1_s32_tied: +** sqinch x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_1_s32_tied, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_POW2, 1), + x0 = svqinch_pat (x0, SV_POW2, 1)) + +/* +** qinch_pat_n_1_s32_untied: +** mov w0, w1 +** sqinch x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_1_s32_untied, int32_t, + x0 = svqinch_pat_n_s32 (x1, SV_POW2, 1), + x0 = svqinch_pat (x1, SV_POW2, 1)) + +/* +** qinch_pat_n_2_s32: +** sqinch x0, w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_2_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_POW2, 2), + x0 = svqinch_pat (x0, SV_POW2, 2)) + +/* +** qinch_pat_n_7_s32: +** sqinch x0, w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_7_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_POW2, 7), + x0 = svqinch_pat (x0, SV_POW2, 7)) + +/* +** qinch_pat_n_15_s32: +** sqinch x0, w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_15_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_POW2, 15), + x0 = svqinch_pat (x0, SV_POW2, 15)) + +/* +** qinch_pat_n_16_s32: +** sqinch x0, w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_16_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_POW2, 16), + x0 = svqinch_pat (x0, SV_POW2, 16)) + +/* +** qinch_pat_n_vl1_s32: +** sqinch x0, w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl1_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL1, 16), + x0 = svqinch_pat (x0, SV_VL1, 16)) + +/* +** qinch_pat_n_vl2_s32: +** sqinch x0, w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl2_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL2, 16), + x0 = svqinch_pat (x0, SV_VL2, 16)) + +/* +** qinch_pat_n_vl3_s32: +** sqinch x0, w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl3_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL3, 16), + x0 = svqinch_pat (x0, SV_VL3, 16)) + +/* +** qinch_pat_n_vl4_s32: +** sqinch x0, w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl4_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL4, 16), + x0 = svqinch_pat (x0, SV_VL4, 16)) + +/* +** qinch_pat_n_vl5_s32: +** sqinch x0, w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl5_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL5, 16), + x0 = svqinch_pat (x0, SV_VL5, 16)) + +/* +** qinch_pat_n_vl6_s32: +** sqinch x0, w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl6_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL6, 16), + x0 = svqinch_pat (x0, SV_VL6, 16)) + +/* +** qinch_pat_n_vl7_s32: +** sqinch x0, w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl7_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL7, 16), + x0 = svqinch_pat (x0, SV_VL7, 16)) + +/* +** qinch_pat_n_vl8_s32: +** sqinch x0, w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl8_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL8, 16), + x0 = svqinch_pat (x0, 
SV_VL8, 16)) + +/* +** qinch_pat_n_vl16_s32: +** sqinch x0, w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl16_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL16, 16), + x0 = svqinch_pat (x0, SV_VL16, 16)) + +/* +** qinch_pat_n_vl32_s32: +** sqinch x0, w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl32_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL32, 16), + x0 = svqinch_pat (x0, SV_VL32, 16)) + +/* +** qinch_pat_n_vl64_s32: +** sqinch x0, w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl64_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL64, 16), + x0 = svqinch_pat (x0, SV_VL64, 16)) + +/* +** qinch_pat_n_vl128_s32: +** sqinch x0, w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl128_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL128, 16), + x0 = svqinch_pat (x0, SV_VL128, 16)) + +/* +** qinch_pat_n_vl256_s32: +** sqinch x0, w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl256_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_VL256, 16), + x0 = svqinch_pat (x0, SV_VL256, 16)) + +/* +** qinch_pat_n_mul4_s32: +** sqinch x0, w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_mul4_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_MUL4, 16), + x0 = svqinch_pat (x0, SV_MUL4, 16)) + +/* +** qinch_pat_n_mul3_s32: +** sqinch x0, w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_mul3_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_MUL3, 16), + x0 = svqinch_pat (x0, SV_MUL3, 16)) + +/* +** qinch_pat_n_all_s32: +** sqinch x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_all_s32, int32_t, + x0 = svqinch_pat_n_s32 (x0, SV_ALL, 16), + x0 = svqinch_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s64.c new file mode 100644 index 000000000..2cde6482f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_s64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_pat_n_1_s64_tied: +** sqinch x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_1_s64_tied, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_POW2, 1), + x0 = svqinch_pat (x0, SV_POW2, 1)) + +/* +** qinch_pat_n_1_s64_untied: +** mov x0, x1 +** sqinch x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_1_s64_untied, int64_t, + x0 = svqinch_pat_n_s64 (x1, SV_POW2, 1), + x0 = svqinch_pat (x1, SV_POW2, 1)) + +/* +** qinch_pat_n_2_s64: +** sqinch x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_2_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_POW2, 2), + x0 = svqinch_pat (x0, SV_POW2, 2)) + +/* +** qinch_pat_n_7_s64: +** sqinch x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_7_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_POW2, 7), + x0 = svqinch_pat (x0, SV_POW2, 7)) + +/* +** qinch_pat_n_15_s64: +** sqinch x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_15_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_POW2, 15), + x0 = svqinch_pat (x0, SV_POW2, 15)) + +/* +** qinch_pat_n_16_s64: +** sqinch x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_16_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_POW2, 16), + x0 = svqinch_pat (x0, SV_POW2, 16)) + +/* +** qinch_pat_n_vl1_s64: +** sqinch x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl1_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL1, 16), + x0 = svqinch_pat (x0, SV_VL1, 16)) + +/* +** qinch_pat_n_vl2_s64: +** sqinch x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S 
(qinch_pat_n_vl2_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL2, 16), + x0 = svqinch_pat (x0, SV_VL2, 16)) + +/* +** qinch_pat_n_vl3_s64: +** sqinch x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl3_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL3, 16), + x0 = svqinch_pat (x0, SV_VL3, 16)) + +/* +** qinch_pat_n_vl4_s64: +** sqinch x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl4_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL4, 16), + x0 = svqinch_pat (x0, SV_VL4, 16)) + +/* +** qinch_pat_n_vl5_s64: +** sqinch x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl5_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL5, 16), + x0 = svqinch_pat (x0, SV_VL5, 16)) + +/* +** qinch_pat_n_vl6_s64: +** sqinch x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl6_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL6, 16), + x0 = svqinch_pat (x0, SV_VL6, 16)) + +/* +** qinch_pat_n_vl7_s64: +** sqinch x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl7_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL7, 16), + x0 = svqinch_pat (x0, SV_VL7, 16)) + +/* +** qinch_pat_n_vl8_s64: +** sqinch x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl8_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL8, 16), + x0 = svqinch_pat (x0, SV_VL8, 16)) + +/* +** qinch_pat_n_vl16_s64: +** sqinch x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl16_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL16, 16), + x0 = svqinch_pat (x0, SV_VL16, 16)) + +/* +** qinch_pat_n_vl32_s64: +** sqinch x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl32_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL32, 16), + x0 = svqinch_pat (x0, SV_VL32, 16)) + +/* +** qinch_pat_n_vl64_s64: +** sqinch x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl64_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL64, 16), + x0 = svqinch_pat (x0, SV_VL64, 16)) + +/* +** qinch_pat_n_vl128_s64: +** sqinch x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl128_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL128, 16), + x0 = svqinch_pat (x0, SV_VL128, 16)) + +/* +** qinch_pat_n_vl256_s64: +** sqinch x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl256_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_VL256, 16), + x0 = svqinch_pat (x0, SV_VL256, 16)) + +/* +** qinch_pat_n_mul4_s64: +** sqinch x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_mul4_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_MUL4, 16), + x0 = svqinch_pat (x0, SV_MUL4, 16)) + +/* +** qinch_pat_n_mul3_s64: +** sqinch x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_mul3_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_MUL3, 16), + x0 = svqinch_pat (x0, SV_MUL3, 16)) + +/* +** qinch_pat_n_all_s64: +** sqinch x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_all_s64, int64_t, + x0 = svqinch_pat_n_s64 (x0, SV_ALL, 16), + x0 = svqinch_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u16.c new file mode 100644 index 000000000..5a1a846a0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u16.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_pat_1_u16_tied: +** uqinch z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_1_u16_tied, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_POW2, 1), + z0 = svqinch_pat (z0, SV_POW2, 1)) + +/* +** qinch_pat_1_u16_untied: +** movprfx 
z0, z1 +** uqinch z0\.h, pow2 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_1_u16_untied, svuint16_t, + z0 = svqinch_pat_u16 (z1, SV_POW2, 1), + z0 = svqinch_pat (z1, SV_POW2, 1)) + +/* +** qinch_pat_2_u16: +** uqinch z0\.h, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_2_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_POW2, 2), + z0 = svqinch_pat (z0, SV_POW2, 2)) + +/* +** qinch_pat_7_u16: +** uqinch z0\.h, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_7_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_POW2, 7), + z0 = svqinch_pat (z0, SV_POW2, 7)) + +/* +** qinch_pat_15_u16: +** uqinch z0\.h, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_15_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_POW2, 15), + z0 = svqinch_pat (z0, SV_POW2, 15)) + +/* +** qinch_pat_16_u16: +** uqinch z0\.h, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_16_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_POW2, 16), + z0 = svqinch_pat (z0, SV_POW2, 16)) + +/* +** qinch_pat_vl1_u16: +** uqinch z0\.h, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl1_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL1, 16), + z0 = svqinch_pat (z0, SV_VL1, 16)) + +/* +** qinch_pat_vl2_u16: +** uqinch z0\.h, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl2_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL2, 16), + z0 = svqinch_pat (z0, SV_VL2, 16)) + +/* +** qinch_pat_vl3_u16: +** uqinch z0\.h, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl3_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL3, 16), + z0 = svqinch_pat (z0, SV_VL3, 16)) + +/* +** qinch_pat_vl4_u16: +** uqinch z0\.h, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl4_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL4, 16), + z0 = svqinch_pat (z0, SV_VL4, 16)) + +/* +** qinch_pat_vl5_u16: +** uqinch z0\.h, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl5_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL5, 16), + z0 = svqinch_pat (z0, SV_VL5, 16)) + +/* +** qinch_pat_vl6_u16: +** uqinch z0\.h, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl6_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL6, 16), + z0 = svqinch_pat (z0, SV_VL6, 16)) + +/* +** qinch_pat_vl7_u16: +** uqinch z0\.h, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl7_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL7, 16), + z0 = svqinch_pat (z0, SV_VL7, 16)) + +/* +** qinch_pat_vl8_u16: +** uqinch z0\.h, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl8_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL8, 16), + z0 = svqinch_pat (z0, SV_VL8, 16)) + +/* +** qinch_pat_vl16_u16: +** uqinch z0\.h, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl16_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL16, 16), + z0 = svqinch_pat (z0, SV_VL16, 16)) + +/* +** qinch_pat_vl32_u16: +** uqinch z0\.h, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl32_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL32, 16), + z0 = svqinch_pat (z0, SV_VL32, 16)) + +/* +** qinch_pat_vl64_u16: +** uqinch z0\.h, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl64_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL64, 16), + z0 = svqinch_pat (z0, SV_VL64, 16)) + +/* +** qinch_pat_vl128_u16: +** uqinch z0\.h, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl128_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL128, 16), + z0 = svqinch_pat (z0, SV_VL128, 16)) + +/* +** qinch_pat_vl256_u16: +** uqinch z0\.h, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_vl256_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_VL256, 16), + z0 = svqinch_pat (z0, 
SV_VL256, 16)) + +/* +** qinch_pat_mul4_u16: +** uqinch z0\.h, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_mul4_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_MUL4, 16), + z0 = svqinch_pat (z0, SV_MUL4, 16)) + +/* +** qinch_pat_mul3_u16: +** uqinch z0\.h, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_mul3_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_MUL3, 16), + z0 = svqinch_pat (z0, SV_MUL3, 16)) + +/* +** qinch_pat_all_u16: +** uqinch z0\.h, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_pat_all_u16, svuint16_t, + z0 = svqinch_pat_u16 (z0, SV_ALL, 16), + z0 = svqinch_pat (z0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u32.c new file mode 100644 index 000000000..8398c5689 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u32.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_pat_n_1_u32_tied: +** uqinch w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_1_u32_tied, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_POW2, 1), + x0 = svqinch_pat (x0, SV_POW2, 1)) + +/* +** qinch_pat_n_1_u32_untied: +** mov w0, w1 +** uqinch w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_1_u32_untied, uint32_t, + x0 = svqinch_pat_n_u32 (x1, SV_POW2, 1), + x0 = svqinch_pat (x1, SV_POW2, 1)) + +/* +** qinch_pat_n_2_u32: +** uqinch w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_2_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_POW2, 2), + x0 = svqinch_pat (x0, SV_POW2, 2)) + +/* +** qinch_pat_n_7_u32: +** uqinch w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_7_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_POW2, 7), + x0 = svqinch_pat (x0, SV_POW2, 7)) + +/* +** qinch_pat_n_15_u32: +** uqinch w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_15_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_POW2, 15), + x0 = svqinch_pat (x0, SV_POW2, 15)) + +/* +** qinch_pat_n_16_u32: +** uqinch w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_16_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_POW2, 16), + x0 = svqinch_pat (x0, SV_POW2, 16)) + +/* +** qinch_pat_n_vl1_u32: +** uqinch w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl1_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL1, 16), + x0 = svqinch_pat (x0, SV_VL1, 16)) + +/* +** qinch_pat_n_vl2_u32: +** uqinch w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl2_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL2, 16), + x0 = svqinch_pat (x0, SV_VL2, 16)) + +/* +** qinch_pat_n_vl3_u32: +** uqinch w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl3_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL3, 16), + x0 = svqinch_pat (x0, SV_VL3, 16)) + +/* +** qinch_pat_n_vl4_u32: +** uqinch w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl4_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL4, 16), + x0 = svqinch_pat (x0, SV_VL4, 16)) + +/* +** qinch_pat_n_vl5_u32: +** uqinch w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl5_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL5, 16), + x0 = svqinch_pat (x0, SV_VL5, 16)) + +/* +** qinch_pat_n_vl6_u32: +** uqinch w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl6_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL6, 16), + x0 = svqinch_pat (x0, SV_VL6, 16)) + +/* +** qinch_pat_n_vl7_u32: +** uqinch w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl7_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, 
SV_VL7, 16), + x0 = svqinch_pat (x0, SV_VL7, 16)) + +/* +** qinch_pat_n_vl8_u32: +** uqinch w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl8_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL8, 16), + x0 = svqinch_pat (x0, SV_VL8, 16)) + +/* +** qinch_pat_n_vl16_u32: +** uqinch w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl16_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL16, 16), + x0 = svqinch_pat (x0, SV_VL16, 16)) + +/* +** qinch_pat_n_vl32_u32: +** uqinch w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl32_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL32, 16), + x0 = svqinch_pat (x0, SV_VL32, 16)) + +/* +** qinch_pat_n_vl64_u32: +** uqinch w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl64_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL64, 16), + x0 = svqinch_pat (x0, SV_VL64, 16)) + +/* +** qinch_pat_n_vl128_u32: +** uqinch w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl128_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL128, 16), + x0 = svqinch_pat (x0, SV_VL128, 16)) + +/* +** qinch_pat_n_vl256_u32: +** uqinch w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl256_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_VL256, 16), + x0 = svqinch_pat (x0, SV_VL256, 16)) + +/* +** qinch_pat_n_mul4_u32: +** uqinch w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_mul4_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_MUL4, 16), + x0 = svqinch_pat (x0, SV_MUL4, 16)) + +/* +** qinch_pat_n_mul3_u32: +** uqinch w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_mul3_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_MUL3, 16), + x0 = svqinch_pat (x0, SV_MUL3, 16)) + +/* +** qinch_pat_n_all_u32: +** uqinch w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_all_u32, uint32_t, + x0 = svqinch_pat_n_u32 (x0, SV_ALL, 16), + x0 = svqinch_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u64.c new file mode 100644 index 000000000..51722646d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_pat_u64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_pat_n_1_u64_tied: +** uqinch x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_1_u64_tied, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_POW2, 1), + x0 = svqinch_pat (x0, SV_POW2, 1)) + +/* +** qinch_pat_n_1_u64_untied: +** mov x0, x1 +** uqinch x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_1_u64_untied, uint64_t, + x0 = svqinch_pat_n_u64 (x1, SV_POW2, 1), + x0 = svqinch_pat (x1, SV_POW2, 1)) + +/* +** qinch_pat_n_2_u64: +** uqinch x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_2_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_POW2, 2), + x0 = svqinch_pat (x0, SV_POW2, 2)) + +/* +** qinch_pat_n_7_u64: +** uqinch x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_7_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_POW2, 7), + x0 = svqinch_pat (x0, SV_POW2, 7)) + +/* +** qinch_pat_n_15_u64: +** uqinch x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_15_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_POW2, 15), + x0 = svqinch_pat (x0, SV_POW2, 15)) + +/* +** qinch_pat_n_16_u64: +** uqinch x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_16_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_POW2, 16), + x0 = svqinch_pat (x0, SV_POW2, 16)) + +/* +** qinch_pat_n_vl1_u64: +** uqinch x0, vl1, mul #16 +** ret +*/ 
+TEST_UNIFORM_S (qinch_pat_n_vl1_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL1, 16), + x0 = svqinch_pat (x0, SV_VL1, 16)) + +/* +** qinch_pat_n_vl2_u64: +** uqinch x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl2_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL2, 16), + x0 = svqinch_pat (x0, SV_VL2, 16)) + +/* +** qinch_pat_n_vl3_u64: +** uqinch x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl3_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL3, 16), + x0 = svqinch_pat (x0, SV_VL3, 16)) + +/* +** qinch_pat_n_vl4_u64: +** uqinch x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl4_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL4, 16), + x0 = svqinch_pat (x0, SV_VL4, 16)) + +/* +** qinch_pat_n_vl5_u64: +** uqinch x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl5_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL5, 16), + x0 = svqinch_pat (x0, SV_VL5, 16)) + +/* +** qinch_pat_n_vl6_u64: +** uqinch x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl6_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL6, 16), + x0 = svqinch_pat (x0, SV_VL6, 16)) + +/* +** qinch_pat_n_vl7_u64: +** uqinch x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl7_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL7, 16), + x0 = svqinch_pat (x0, SV_VL7, 16)) + +/* +** qinch_pat_n_vl8_u64: +** uqinch x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl8_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL8, 16), + x0 = svqinch_pat (x0, SV_VL8, 16)) + +/* +** qinch_pat_n_vl16_u64: +** uqinch x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl16_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL16, 16), + x0 = svqinch_pat (x0, SV_VL16, 16)) + +/* +** qinch_pat_n_vl32_u64: +** uqinch x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl32_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL32, 16), + x0 = svqinch_pat (x0, SV_VL32, 16)) + +/* +** qinch_pat_n_vl64_u64: +** uqinch x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl64_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL64, 16), + x0 = svqinch_pat (x0, SV_VL64, 16)) + +/* +** qinch_pat_n_vl128_u64: +** uqinch x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl128_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL128, 16), + x0 = svqinch_pat (x0, SV_VL128, 16)) + +/* +** qinch_pat_n_vl256_u64: +** uqinch x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_vl256_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_VL256, 16), + x0 = svqinch_pat (x0, SV_VL256, 16)) + +/* +** qinch_pat_n_mul4_u64: +** uqinch x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_mul4_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_MUL4, 16), + x0 = svqinch_pat (x0, SV_MUL4, 16)) + +/* +** qinch_pat_n_mul3_u64: +** uqinch x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_mul3_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_MUL3, 16), + x0 = svqinch_pat (x0, SV_MUL3, 16)) + +/* +** qinch_pat_n_all_u64: +** uqinch x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_pat_n_all_u64, uint64_t, + x0 = svqinch_pat_n_u64 (x0, SV_ALL, 16), + x0 = svqinch_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s16.c new file mode 100644 index 000000000..1f460db8e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s16.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** 
qinch_1_s16_tied: +** sqinch z0\.h +** ret +*/ +TEST_UNIFORM_Z (qinch_1_s16_tied, svint16_t, + z0 = svqinch_s16 (z0, 1), + z0 = svqinch (z0, 1)) + +/* +** qinch_1_s16_untied: +** movprfx z0, z1 +** sqinch z0\.h +** ret +*/ +TEST_UNIFORM_Z (qinch_1_s16_untied, svint16_t, + z0 = svqinch_s16 (z1, 1), + z0 = svqinch (z1, 1)) + +/* +** qinch_2_s16: +** sqinch z0\.h, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qinch_2_s16, svint16_t, + z0 = svqinch_s16 (z0, 2), + z0 = svqinch (z0, 2)) + +/* +** qinch_7_s16: +** sqinch z0\.h, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qinch_7_s16, svint16_t, + z0 = svqinch_s16 (z0, 7), + z0 = svqinch (z0, 7)) + +/* +** qinch_15_s16: +** sqinch z0\.h, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qinch_15_s16, svint16_t, + z0 = svqinch_s16 (z0, 15), + z0 = svqinch (z0, 15)) + +/* +** qinch_16_s16: +** sqinch z0\.h, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_16_s16, svint16_t, + z0 = svqinch_s16 (z0, 16), + z0 = svqinch (z0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s32.c new file mode 100644 index 000000000..a7b1aac80 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_n_1_s32_tied: +** sqinch x0, w0 +** ret +*/ +TEST_UNIFORM_S (qinch_n_1_s32_tied, int32_t, + x0 = svqinch_n_s32 (x0, 1), + x0 = svqinch (x0, 1)) + +/* +** qinch_n_1_s32_untied: +** mov w0, w1 +** sqinch x0, w0 +** ret +*/ +TEST_UNIFORM_S (qinch_n_1_s32_untied, int32_t, + x0 = svqinch_n_s32 (x1, 1), + x0 = svqinch (x1, 1)) + +/* +** qinch_n_2_s32: +** sqinch x0, w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qinch_n_2_s32, int32_t, + x0 = svqinch_n_s32 (x0, 2), + x0 = svqinch (x0, 2)) + +/* +** qinch_n_7_s32: +** sqinch x0, w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qinch_n_7_s32, int32_t, + x0 = svqinch_n_s32 (x0, 7), + x0 = svqinch (x0, 7)) + +/* +** qinch_n_15_s32: +** sqinch x0, w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qinch_n_15_s32, int32_t, + x0 = svqinch_n_s32 (x0, 15), + x0 = svqinch (x0, 15)) + +/* +** qinch_n_16_s32: +** sqinch x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_n_16_s32, int32_t, + x0 = svqinch_n_s32 (x0, 16), + x0 = svqinch (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s64.c new file mode 100644 index 000000000..74ac6a3df --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_s64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_n_1_s64_tied: +** sqinch x0 +** ret +*/ +TEST_UNIFORM_S (qinch_n_1_s64_tied, int64_t, + x0 = svqinch_n_s64 (x0, 1), + x0 = svqinch (x0, 1)) + +/* +** qinch_n_1_s64_untied: +** mov x0, x1 +** sqinch x0 +** ret +*/ +TEST_UNIFORM_S (qinch_n_1_s64_untied, int64_t, + x0 = svqinch_n_s64 (x1, 1), + x0 = svqinch (x1, 1)) + +/* +** qinch_n_2_s64: +** sqinch x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qinch_n_2_s64, int64_t, + x0 = svqinch_n_s64 (x0, 2), + x0 = svqinch (x0, 2)) + +/* +** qinch_n_7_s64: +** sqinch x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qinch_n_7_s64, int64_t, + x0 = svqinch_n_s64 (x0, 7), + x0 = svqinch (x0, 7)) + +/* +** qinch_n_15_s64: +** sqinch x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qinch_n_15_s64, int64_t, + x0 = svqinch_n_s64 (x0, 15), + x0 = svqinch (x0, 15)) + +/* +** qinch_n_16_s64: +** 
sqinch x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_n_16_s64, int64_t, + x0 = svqinch_n_s64 (x0, 16), + x0 = svqinch (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u16.c new file mode 100644 index 000000000..aa9905897 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u16.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_1_u16_tied: +** uqinch z0\.h +** ret +*/ +TEST_UNIFORM_Z (qinch_1_u16_tied, svuint16_t, + z0 = svqinch_u16 (z0, 1), + z0 = svqinch (z0, 1)) + +/* +** qinch_1_u16_untied: +** movprfx z0, z1 +** uqinch z0\.h +** ret +*/ +TEST_UNIFORM_Z (qinch_1_u16_untied, svuint16_t, + z0 = svqinch_u16 (z1, 1), + z0 = svqinch (z1, 1)) + +/* +** qinch_2_u16: +** uqinch z0\.h, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qinch_2_u16, svuint16_t, + z0 = svqinch_u16 (z0, 2), + z0 = svqinch (z0, 2)) + +/* +** qinch_7_u16: +** uqinch z0\.h, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qinch_7_u16, svuint16_t, + z0 = svqinch_u16 (z0, 7), + z0 = svqinch (z0, 7)) + +/* +** qinch_15_u16: +** uqinch z0\.h, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qinch_15_u16, svuint16_t, + z0 = svqinch_u16 (z0, 15), + z0 = svqinch (z0, 15)) + +/* +** qinch_16_u16: +** uqinch z0\.h, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qinch_16_u16, svuint16_t, + z0 = svqinch_u16 (z0, 16), + z0 = svqinch (z0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u32.c new file mode 100644 index 000000000..396f95b2a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u32.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_n_1_u32_tied: +** uqinch w0 +** ret +*/ +TEST_UNIFORM_S (qinch_n_1_u32_tied, uint32_t, + x0 = svqinch_n_u32 (x0, 1), + x0 = svqinch (x0, 1)) + +/* +** qinch_n_1_u32_untied: +** mov w0, w1 +** uqinch w0 +** ret +*/ +TEST_UNIFORM_S (qinch_n_1_u32_untied, uint32_t, + x0 = svqinch_n_u32 (x1, 1), + x0 = svqinch (x1, 1)) + +/* +** qinch_n_2_u32: +** uqinch w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qinch_n_2_u32, uint32_t, + x0 = svqinch_n_u32 (x0, 2), + x0 = svqinch (x0, 2)) + +/* +** qinch_n_7_u32: +** uqinch w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qinch_n_7_u32, uint32_t, + x0 = svqinch_n_u32 (x0, 7), + x0 = svqinch (x0, 7)) + +/* +** qinch_n_15_u32: +** uqinch w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qinch_n_15_u32, uint32_t, + x0 = svqinch_n_u32 (x0, 15), + x0 = svqinch (x0, 15)) + +/* +** qinch_n_16_u32: +** uqinch w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_n_16_u32, uint32_t, + x0 = svqinch_n_u32 (x0, 16), + x0 = svqinch (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u64.c new file mode 100644 index 000000000..5a9231722 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qinch_u64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qinch_n_1_u64_tied: +** uqinch x0 +** ret +*/ +TEST_UNIFORM_S (qinch_n_1_u64_tied, uint64_t, + x0 = svqinch_n_u64 (x0, 1), + x0 = svqinch (x0, 1)) + +/* +** qinch_n_1_u64_untied: +** mov x0, x1 +** uqinch x0 +** ret +*/ +TEST_UNIFORM_S (qinch_n_1_u64_untied, uint64_t, + x0 = svqinch_n_u64 (x1, 1), + x0 = svqinch (x1, 1)) + +/* 
+** qinch_n_2_u64: +** uqinch x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qinch_n_2_u64, uint64_t, + x0 = svqinch_n_u64 (x0, 2), + x0 = svqinch (x0, 2)) + +/* +** qinch_n_7_u64: +** uqinch x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qinch_n_7_u64, uint64_t, + x0 = svqinch_n_u64 (x0, 7), + x0 = svqinch (x0, 7)) + +/* +** qinch_n_15_u64: +** uqinch x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qinch_n_15_u64, uint64_t, + x0 = svqinch_n_u64 (x0, 15), + x0 = svqinch (x0, 15)) + +/* +** qinch_n_16_u64: +** uqinch x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qinch_n_16_u64, uint64_t, + x0 = svqinch_n_u64 (x0, 16), + x0 = svqinch (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s16.c new file mode 100644 index 000000000..979b57476 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s16.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincp_s16_tied: +** sqincp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_s16_tied, svint16_t, + z0 = svqincp_s16 (z0, p0), + z0 = svqincp (z0, p0)) + +/* +** qincp_s16_untied: +** movprfx z0, z1 +** sqincp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_s16_untied, svint16_t, + z0 = svqincp_s16 (z1, p0), + z0 = svqincp (z1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s32.c new file mode 100644 index 000000000..46ad51b01 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s32.c @@ -0,0 +1,98 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincp_s32_tied: +** sqincp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_s32_tied, svint32_t, + z0 = svqincp_s32 (z0, p0), + z0 = svqincp (z0, p0)) + +/* +** qincp_s32_untied: +** movprfx z0, z1 +** sqincp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_s32_untied, svint32_t, + z0 = svqincp_s32 (z1, p0), + z0 = svqincp (z1, p0)) + +/* +** qincp_n_s32_b8_tied: +** sqincp x0, p0\.b, w0 +** ret +*/ +TEST_UNIFORM_S (qincp_n_s32_b8_tied, int32_t, + x0 = svqincp_n_s32_b8 (x0, p0), + x0 = svqincp_b8 (x0, p0)) + +/* +** qincp_n_s32_b8_untied: +** mov w0, w1 +** sqincp x0, p0\.b, w0 +** ret +*/ +TEST_UNIFORM_S (qincp_n_s32_b8_untied, int32_t, + x0 = svqincp_n_s32_b8 (x1, p0), + x0 = svqincp_b8 (x1, p0)) + +/* +** qincp_n_s32_b16_tied: +** sqincp x0, p0\.h, w0 +** ret +*/ +TEST_UNIFORM_S (qincp_n_s32_b16_tied, int32_t, + x0 = svqincp_n_s32_b16 (x0, p0), + x0 = svqincp_b16 (x0, p0)) + +/* +** qincp_n_s32_b16_untied: +** mov w0, w1 +** sqincp x0, p0\.h, w0 +** ret +*/ +TEST_UNIFORM_S (qincp_n_s32_b16_untied, int32_t, + x0 = svqincp_n_s32_b16 (x1, p0), + x0 = svqincp_b16 (x1, p0)) + +/* +** qincp_n_s32_b32_tied: +** sqincp x0, p0\.s, w0 +** ret +*/ +TEST_UNIFORM_S (qincp_n_s32_b32_tied, int32_t, + x0 = svqincp_n_s32_b32 (x0, p0), + x0 = svqincp_b32 (x0, p0)) + +/* +** qincp_n_s32_b32_untied: +** mov w0, w1 +** sqincp x0, p0\.s, w0 +** ret +*/ +TEST_UNIFORM_S (qincp_n_s32_b32_untied, int32_t, + x0 = svqincp_n_s32_b32 (x1, p0), + x0 = svqincp_b32 (x1, p0)) + +/* +** qincp_n_s32_b64_tied: +** sqincp x0, p0\.d, w0 +** ret +*/ +TEST_UNIFORM_S (qincp_n_s32_b64_tied, int32_t, + x0 = svqincp_n_s32_b64 (x0, p0), + x0 = svqincp_b64 (x0, p0)) + +/* +** qincp_n_s32_b64_untied: +** mov w0, w1 +** sqincp x0, p0\.d, w0 +** ret +*/ +TEST_UNIFORM_S (qincp_n_s32_b64_untied, int32_t, + x0 = svqincp_n_s32_b64 (x1, p0), + x0 
= svqincp_b64 (x1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s64.c new file mode 100644 index 000000000..226502328 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_s64.c @@ -0,0 +1,98 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincp_s64_tied: +** sqincp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_s64_tied, svint64_t, + z0 = svqincp_s64 (z0, p0), + z0 = svqincp (z0, p0)) + +/* +** qincp_s64_untied: +** movprfx z0, z1 +** sqincp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_s64_untied, svint64_t, + z0 = svqincp_s64 (z1, p0), + z0 = svqincp (z1, p0)) + +/* +** qincp_n_s64_b8_tied: +** sqincp x0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qincp_n_s64_b8_tied, int64_t, + x0 = svqincp_n_s64_b8 (x0, p0), + x0 = svqincp_b8 (x0, p0)) + +/* +** qincp_n_s64_b8_untied: +** mov x0, x1 +** sqincp x0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qincp_n_s64_b8_untied, int64_t, + x0 = svqincp_n_s64_b8 (x1, p0), + x0 = svqincp_b8 (x1, p0)) + +/* +** qincp_n_s64_b16_tied: +** sqincp x0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qincp_n_s64_b16_tied, int64_t, + x0 = svqincp_n_s64_b16 (x0, p0), + x0 = svqincp_b16 (x0, p0)) + +/* +** qincp_n_s64_b16_untied: +** mov x0, x1 +** sqincp x0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qincp_n_s64_b16_untied, int64_t, + x0 = svqincp_n_s64_b16 (x1, p0), + x0 = svqincp_b16 (x1, p0)) + +/* +** qincp_n_s64_b32_tied: +** sqincp x0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qincp_n_s64_b32_tied, int64_t, + x0 = svqincp_n_s64_b32 (x0, p0), + x0 = svqincp_b32 (x0, p0)) + +/* +** qincp_n_s64_b32_untied: +** mov x0, x1 +** sqincp x0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qincp_n_s64_b32_untied, int64_t, + x0 = svqincp_n_s64_b32 (x1, p0), + x0 = svqincp_b32 (x1, p0)) + +/* +** qincp_n_s64_b64_tied: +** sqincp x0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qincp_n_s64_b64_tied, int64_t, + x0 = svqincp_n_s64_b64 (x0, p0), + x0 = svqincp_b64 (x0, p0)) + +/* +** qincp_n_s64_b64_untied: +** mov x0, x1 +** sqincp x0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qincp_n_s64_b64_untied, int64_t, + x0 = svqincp_n_s64_b64 (x1, p0), + x0 = svqincp_b64 (x1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u16.c new file mode 100644 index 000000000..ecd84470c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u16.c @@ -0,0 +1,22 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincp_u16_tied: +** uqincp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_u16_tied, svuint16_t, + z0 = svqincp_u16 (z0, p0), + z0 = svqincp (z0, p0)) + +/* +** qincp_u16_untied: +** movprfx z0, z1 +** uqincp z0\.h, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_u16_untied, svuint16_t, + z0 = svqincp_u16 (z1, p0), + z0 = svqincp (z1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u32.c new file mode 100644 index 000000000..011a26253 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u32.c @@ -0,0 +1,98 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincp_u32_tied: +** uqincp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_u32_tied, svuint32_t, + z0 = svqincp_u32 (z0, p0), + z0 = svqincp (z0, p0)) + +/* +** qincp_u32_untied: +** movprfx z0, z1 +** uqincp z0\.s, p0 +** ret +*/ +TEST_UNIFORM_Z 
(qincp_u32_untied, svuint32_t, + z0 = svqincp_u32 (z1, p0), + z0 = svqincp (z1, p0)) + +/* +** qincp_n_u32_b8_tied: +** uqincp w0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qincp_n_u32_b8_tied, uint32_t, + x0 = svqincp_n_u32_b8 (x0, p0), + x0 = svqincp_b8 (x0, p0)) + +/* +** qincp_n_u32_b8_untied: +** mov w0, w1 +** uqincp w0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qincp_n_u32_b8_untied, uint32_t, + x0 = svqincp_n_u32_b8 (x1, p0), + x0 = svqincp_b8 (x1, p0)) + +/* +** qincp_n_u32_b16_tied: +** uqincp w0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qincp_n_u32_b16_tied, uint32_t, + x0 = svqincp_n_u32_b16 (x0, p0), + x0 = svqincp_b16 (x0, p0)) + +/* +** qincp_n_u32_b16_untied: +** mov w0, w1 +** uqincp w0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qincp_n_u32_b16_untied, uint32_t, + x0 = svqincp_n_u32_b16 (x1, p0), + x0 = svqincp_b16 (x1, p0)) + +/* +** qincp_n_u32_b32_tied: +** uqincp w0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qincp_n_u32_b32_tied, uint32_t, + x0 = svqincp_n_u32_b32 (x0, p0), + x0 = svqincp_b32 (x0, p0)) + +/* +** qincp_n_u32_b32_untied: +** mov w0, w1 +** uqincp w0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qincp_n_u32_b32_untied, uint32_t, + x0 = svqincp_n_u32_b32 (x1, p0), + x0 = svqincp_b32 (x1, p0)) + +/* +** qincp_n_u32_b64_tied: +** uqincp w0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qincp_n_u32_b64_tied, uint32_t, + x0 = svqincp_n_u32_b64 (x0, p0), + x0 = svqincp_b64 (x0, p0)) + +/* +** qincp_n_u32_b64_untied: +** mov w0, w1 +** uqincp w0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qincp_n_u32_b64_untied, uint32_t, + x0 = svqincp_n_u32_b64 (x1, p0), + x0 = svqincp_b64 (x1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u64.c new file mode 100644 index 000000000..761ac553a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincp_u64.c @@ -0,0 +1,98 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincp_u64_tied: +** uqincp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_u64_tied, svuint64_t, + z0 = svqincp_u64 (z0, p0), + z0 = svqincp (z0, p0)) + +/* +** qincp_u64_untied: +** movprfx z0, z1 +** uqincp z0\.d, p0 +** ret +*/ +TEST_UNIFORM_Z (qincp_u64_untied, svuint64_t, + z0 = svqincp_u64 (z1, p0), + z0 = svqincp (z1, p0)) + +/* +** qincp_n_u64_b8_tied: +** uqincp x0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qincp_n_u64_b8_tied, uint64_t, + x0 = svqincp_n_u64_b8 (x0, p0), + x0 = svqincp_b8 (x0, p0)) + +/* +** qincp_n_u64_b8_untied: +** mov x0, x1 +** uqincp x0, p0\.b +** ret +*/ +TEST_UNIFORM_S (qincp_n_u64_b8_untied, uint64_t, + x0 = svqincp_n_u64_b8 (x1, p0), + x0 = svqincp_b8 (x1, p0)) + +/* +** qincp_n_u64_b16_tied: +** uqincp x0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qincp_n_u64_b16_tied, uint64_t, + x0 = svqincp_n_u64_b16 (x0, p0), + x0 = svqincp_b16 (x0, p0)) + +/* +** qincp_n_u64_b16_untied: +** mov x0, x1 +** uqincp x0, p0\.h +** ret +*/ +TEST_UNIFORM_S (qincp_n_u64_b16_untied, uint64_t, + x0 = svqincp_n_u64_b16 (x1, p0), + x0 = svqincp_b16 (x1, p0)) + +/* +** qincp_n_u64_b32_tied: +** uqincp x0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qincp_n_u64_b32_tied, uint64_t, + x0 = svqincp_n_u64_b32 (x0, p0), + x0 = svqincp_b32 (x0, p0)) + +/* +** qincp_n_u64_b32_untied: +** mov x0, x1 +** uqincp x0, p0\.s +** ret +*/ +TEST_UNIFORM_S (qincp_n_u64_b32_untied, uint64_t, + x0 = svqincp_n_u64_b32 (x1, p0), + x0 = svqincp_b32 (x1, p0)) + +/* +** qincp_n_u64_b64_tied: +** uqincp x0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qincp_n_u64_b64_tied, uint64_t, + x0 = svqincp_n_u64_b64 (x0, p0), 
+ x0 = svqincp_b64 (x0, p0)) + +/* +** qincp_n_u64_b64_untied: +** mov x0, x1 +** uqincp x0, p0\.d +** ret +*/ +TEST_UNIFORM_S (qincp_n_u64_b64_untied, uint64_t, + x0 = svqincp_n_u64_b64 (x1, p0), + x0 = svqincp_b64 (x1, p0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s32.c new file mode 100644 index 000000000..6ceb003ab --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s32.c @@ -0,0 +1,401 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincw_pat_1_s32_tied: +** sqincw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_1_s32_tied, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_POW2, 1), + z0 = svqincw_pat (z0, SV_POW2, 1)) + +/* +** qincw_pat_1_s32_untied: +** movprfx z0, z1 +** sqincw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_1_s32_untied, svint32_t, + z0 = svqincw_pat_s32 (z1, SV_POW2, 1), + z0 = svqincw_pat (z1, SV_POW2, 1)) + +/* +** qincw_pat_2_s32: +** sqincw z0\.s, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_2_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_POW2, 2), + z0 = svqincw_pat (z0, SV_POW2, 2)) + +/* +** qincw_pat_7_s32: +** sqincw z0\.s, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_7_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_POW2, 7), + z0 = svqincw_pat (z0, SV_POW2, 7)) + +/* +** qincw_pat_15_s32: +** sqincw z0\.s, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_15_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_POW2, 15), + z0 = svqincw_pat (z0, SV_POW2, 15)) + +/* +** qincw_pat_16_s32: +** sqincw z0\.s, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_16_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_POW2, 16), + z0 = svqincw_pat (z0, SV_POW2, 16)) + +/* +** qincw_pat_vl1_s32: +** sqincw z0\.s, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl1_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL1, 16), + z0 = svqincw_pat (z0, SV_VL1, 16)) + +/* +** qincw_pat_vl2_s32: +** sqincw z0\.s, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl2_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL2, 16), + z0 = svqincw_pat (z0, SV_VL2, 16)) + +/* +** qincw_pat_vl3_s32: +** sqincw z0\.s, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl3_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL3, 16), + z0 = svqincw_pat (z0, SV_VL3, 16)) + +/* +** qincw_pat_vl4_s32: +** sqincw z0\.s, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl4_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL4, 16), + z0 = svqincw_pat (z0, SV_VL4, 16)) + +/* +** qincw_pat_vl5_s32: +** sqincw z0\.s, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl5_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL5, 16), + z0 = svqincw_pat (z0, SV_VL5, 16)) + +/* +** qincw_pat_vl6_s32: +** sqincw z0\.s, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl6_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL6, 16), + z0 = svqincw_pat (z0, SV_VL6, 16)) + +/* +** qincw_pat_vl7_s32: +** sqincw z0\.s, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl7_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL7, 16), + z0 = svqincw_pat (z0, SV_VL7, 16)) + +/* +** qincw_pat_vl8_s32: +** sqincw z0\.s, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl8_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL8, 16), + z0 = svqincw_pat (z0, SV_VL8, 16)) + +/* +** qincw_pat_vl16_s32: +** sqincw z0\.s, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl16_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL16, 16), + z0 = 
svqincw_pat (z0, SV_VL16, 16)) + +/* +** qincw_pat_vl32_s32: +** sqincw z0\.s, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl32_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL32, 16), + z0 = svqincw_pat (z0, SV_VL32, 16)) + +/* +** qincw_pat_vl64_s32: +** sqincw z0\.s, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl64_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL64, 16), + z0 = svqincw_pat (z0, SV_VL64, 16)) + +/* +** qincw_pat_vl128_s32: +** sqincw z0\.s, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl128_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL128, 16), + z0 = svqincw_pat (z0, SV_VL128, 16)) + +/* +** qincw_pat_vl256_s32: +** sqincw z0\.s, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl256_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_VL256, 16), + z0 = svqincw_pat (z0, SV_VL256, 16)) + +/* +** qincw_pat_mul4_s32: +** sqincw z0\.s, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_mul4_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_MUL4, 16), + z0 = svqincw_pat (z0, SV_MUL4, 16)) + +/* +** qincw_pat_mul3_s32: +** sqincw z0\.s, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_mul3_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_MUL3, 16), + z0 = svqincw_pat (z0, SV_MUL3, 16)) + +/* +** qincw_pat_all_s32: +** sqincw z0\.s, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_all_s32, svint32_t, + z0 = svqincw_pat_s32 (z0, SV_ALL, 16), + z0 = svqincw_pat (z0, SV_ALL, 16)) + +/* +** qincw_pat_n_1_s32_tied: +** sqincw x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_1_s32_tied, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_POW2, 1), + x0 = svqincw_pat (x0, SV_POW2, 1)) + +/* +** qincw_pat_n_1_s32_untied: +** mov w0, w1 +** sqincw x0, w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_1_s32_untied, int32_t, + x0 = svqincw_pat_n_s32 (x1, SV_POW2, 1), + x0 = svqincw_pat (x1, SV_POW2, 1)) + +/* +** qincw_pat_n_2_s32: +** sqincw x0, w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_2_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_POW2, 2), + x0 = svqincw_pat (x0, SV_POW2, 2)) + +/* +** qincw_pat_n_7_s32: +** sqincw x0, w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_7_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_POW2, 7), + x0 = svqincw_pat (x0, SV_POW2, 7)) + +/* +** qincw_pat_n_15_s32: +** sqincw x0, w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_15_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_POW2, 15), + x0 = svqincw_pat (x0, SV_POW2, 15)) + +/* +** qincw_pat_n_16_s32: +** sqincw x0, w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_16_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_POW2, 16), + x0 = svqincw_pat (x0, SV_POW2, 16)) + +/* +** qincw_pat_n_vl1_s32: +** sqincw x0, w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl1_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL1, 16), + x0 = svqincw_pat (x0, SV_VL1, 16)) + +/* +** qincw_pat_n_vl2_s32: +** sqincw x0, w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl2_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL2, 16), + x0 = svqincw_pat (x0, SV_VL2, 16)) + +/* +** qincw_pat_n_vl3_s32: +** sqincw x0, w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl3_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL3, 16), + x0 = svqincw_pat (x0, SV_VL3, 16)) + +/* +** qincw_pat_n_vl4_s32: +** sqincw x0, w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl4_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL4, 16), + x0 = svqincw_pat (x0, SV_VL4, 16)) + +/* +** qincw_pat_n_vl5_s32: +** sqincw x0, w0, vl5, mul #16 +** ret +*/ 
+TEST_UNIFORM_S (qincw_pat_n_vl5_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL5, 16), + x0 = svqincw_pat (x0, SV_VL5, 16)) + +/* +** qincw_pat_n_vl6_s32: +** sqincw x0, w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl6_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL6, 16), + x0 = svqincw_pat (x0, SV_VL6, 16)) + +/* +** qincw_pat_n_vl7_s32: +** sqincw x0, w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl7_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL7, 16), + x0 = svqincw_pat (x0, SV_VL7, 16)) + +/* +** qincw_pat_n_vl8_s32: +** sqincw x0, w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl8_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL8, 16), + x0 = svqincw_pat (x0, SV_VL8, 16)) + +/* +** qincw_pat_n_vl16_s32: +** sqincw x0, w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl16_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL16, 16), + x0 = svqincw_pat (x0, SV_VL16, 16)) + +/* +** qincw_pat_n_vl32_s32: +** sqincw x0, w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl32_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL32, 16), + x0 = svqincw_pat (x0, SV_VL32, 16)) + +/* +** qincw_pat_n_vl64_s32: +** sqincw x0, w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl64_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL64, 16), + x0 = svqincw_pat (x0, SV_VL64, 16)) + +/* +** qincw_pat_n_vl128_s32: +** sqincw x0, w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl128_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL128, 16), + x0 = svqincw_pat (x0, SV_VL128, 16)) + +/* +** qincw_pat_n_vl256_s32: +** sqincw x0, w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl256_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_VL256, 16), + x0 = svqincw_pat (x0, SV_VL256, 16)) + +/* +** qincw_pat_n_mul4_s32: +** sqincw x0, w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_mul4_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_MUL4, 16), + x0 = svqincw_pat (x0, SV_MUL4, 16)) + +/* +** qincw_pat_n_mul3_s32: +** sqincw x0, w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_mul3_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_MUL3, 16), + x0 = svqincw_pat (x0, SV_MUL3, 16)) + +/* +** qincw_pat_n_all_s32: +** sqincw x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_all_s32, int32_t, + x0 = svqincw_pat_n_s32 (x0, SV_ALL, 16), + x0 = svqincw_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s64.c new file mode 100644 index 000000000..feebc25cc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_s64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincw_pat_n_1_s64_tied: +** sqincw x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_1_s64_tied, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_POW2, 1), + x0 = svqincw_pat (x0, SV_POW2, 1)) + +/* +** qincw_pat_n_1_s64_untied: +** mov x0, x1 +** sqincw x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_1_s64_untied, int64_t, + x0 = svqincw_pat_n_s64 (x1, SV_POW2, 1), + x0 = svqincw_pat (x1, SV_POW2, 1)) + +/* +** qincw_pat_n_2_s64: +** sqincw x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_2_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_POW2, 2), + x0 = svqincw_pat (x0, SV_POW2, 2)) + +/* +** qincw_pat_n_7_s64: +** sqincw x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_7_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_POW2, 7), + x0 = 
svqincw_pat (x0, SV_POW2, 7)) + +/* +** qincw_pat_n_15_s64: +** sqincw x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_15_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_POW2, 15), + x0 = svqincw_pat (x0, SV_POW2, 15)) + +/* +** qincw_pat_n_16_s64: +** sqincw x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_16_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_POW2, 16), + x0 = svqincw_pat (x0, SV_POW2, 16)) + +/* +** qincw_pat_n_vl1_s64: +** sqincw x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl1_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL1, 16), + x0 = svqincw_pat (x0, SV_VL1, 16)) + +/* +** qincw_pat_n_vl2_s64: +** sqincw x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl2_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL2, 16), + x0 = svqincw_pat (x0, SV_VL2, 16)) + +/* +** qincw_pat_n_vl3_s64: +** sqincw x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl3_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL3, 16), + x0 = svqincw_pat (x0, SV_VL3, 16)) + +/* +** qincw_pat_n_vl4_s64: +** sqincw x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl4_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL4, 16), + x0 = svqincw_pat (x0, SV_VL4, 16)) + +/* +** qincw_pat_n_vl5_s64: +** sqincw x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl5_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL5, 16), + x0 = svqincw_pat (x0, SV_VL5, 16)) + +/* +** qincw_pat_n_vl6_s64: +** sqincw x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl6_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL6, 16), + x0 = svqincw_pat (x0, SV_VL6, 16)) + +/* +** qincw_pat_n_vl7_s64: +** sqincw x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl7_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL7, 16), + x0 = svqincw_pat (x0, SV_VL7, 16)) + +/* +** qincw_pat_n_vl8_s64: +** sqincw x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl8_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL8, 16), + x0 = svqincw_pat (x0, SV_VL8, 16)) + +/* +** qincw_pat_n_vl16_s64: +** sqincw x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl16_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL16, 16), + x0 = svqincw_pat (x0, SV_VL16, 16)) + +/* +** qincw_pat_n_vl32_s64: +** sqincw x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl32_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL32, 16), + x0 = svqincw_pat (x0, SV_VL32, 16)) + +/* +** qincw_pat_n_vl64_s64: +** sqincw x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl64_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL64, 16), + x0 = svqincw_pat (x0, SV_VL64, 16)) + +/* +** qincw_pat_n_vl128_s64: +** sqincw x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl128_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL128, 16), + x0 = svqincw_pat (x0, SV_VL128, 16)) + +/* +** qincw_pat_n_vl256_s64: +** sqincw x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl256_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_VL256, 16), + x0 = svqincw_pat (x0, SV_VL256, 16)) + +/* +** qincw_pat_n_mul4_s64: +** sqincw x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_mul4_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_MUL4, 16), + x0 = svqincw_pat (x0, SV_MUL4, 16)) + +/* +** qincw_pat_n_mul3_s64: +** sqincw x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_mul3_s64, int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_MUL3, 16), + x0 = svqincw_pat (x0, SV_MUL3, 16)) + +/* +** qincw_pat_n_all_s64: +** sqincw x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_all_s64, 
int64_t, + x0 = svqincw_pat_n_s64 (x0, SV_ALL, 16), + x0 = svqincw_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u32.c new file mode 100644 index 000000000..e08e91d09 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u32.c @@ -0,0 +1,401 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincw_pat_1_u32_tied: +** uqincw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_1_u32_tied, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_POW2, 1), + z0 = svqincw_pat (z0, SV_POW2, 1)) + +/* +** qincw_pat_1_u32_untied: +** movprfx z0, z1 +** uqincw z0\.s, pow2 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_1_u32_untied, svuint32_t, + z0 = svqincw_pat_u32 (z1, SV_POW2, 1), + z0 = svqincw_pat (z1, SV_POW2, 1)) + +/* +** qincw_pat_2_u32: +** uqincw z0\.s, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_2_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_POW2, 2), + z0 = svqincw_pat (z0, SV_POW2, 2)) + +/* +** qincw_pat_7_u32: +** uqincw z0\.s, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_7_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_POW2, 7), + z0 = svqincw_pat (z0, SV_POW2, 7)) + +/* +** qincw_pat_15_u32: +** uqincw z0\.s, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_15_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_POW2, 15), + z0 = svqincw_pat (z0, SV_POW2, 15)) + +/* +** qincw_pat_16_u32: +** uqincw z0\.s, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_16_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_POW2, 16), + z0 = svqincw_pat (z0, SV_POW2, 16)) + +/* +** qincw_pat_vl1_u32: +** uqincw z0\.s, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl1_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL1, 16), + z0 = svqincw_pat (z0, SV_VL1, 16)) + +/* +** qincw_pat_vl2_u32: +** uqincw z0\.s, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl2_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL2, 16), + z0 = svqincw_pat (z0, SV_VL2, 16)) + +/* +** qincw_pat_vl3_u32: +** uqincw z0\.s, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl3_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL3, 16), + z0 = svqincw_pat (z0, SV_VL3, 16)) + +/* +** qincw_pat_vl4_u32: +** uqincw z0\.s, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl4_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL4, 16), + z0 = svqincw_pat (z0, SV_VL4, 16)) + +/* +** qincw_pat_vl5_u32: +** uqincw z0\.s, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl5_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL5, 16), + z0 = svqincw_pat (z0, SV_VL5, 16)) + +/* +** qincw_pat_vl6_u32: +** uqincw z0\.s, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl6_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL6, 16), + z0 = svqincw_pat (z0, SV_VL6, 16)) + +/* +** qincw_pat_vl7_u32: +** uqincw z0\.s, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl7_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL7, 16), + z0 = svqincw_pat (z0, SV_VL7, 16)) + +/* +** qincw_pat_vl8_u32: +** uqincw z0\.s, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl8_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL8, 16), + z0 = svqincw_pat (z0, SV_VL8, 16)) + +/* +** qincw_pat_vl16_u32: +** uqincw z0\.s, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl16_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL16, 16), + z0 = svqincw_pat (z0, SV_VL16, 16)) + +/* +** qincw_pat_vl32_u32: +** uqincw z0\.s, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_Z 
(qincw_pat_vl32_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL32, 16), + z0 = svqincw_pat (z0, SV_VL32, 16)) + +/* +** qincw_pat_vl64_u32: +** uqincw z0\.s, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl64_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL64, 16), + z0 = svqincw_pat (z0, SV_VL64, 16)) + +/* +** qincw_pat_vl128_u32: +** uqincw z0\.s, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl128_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL128, 16), + z0 = svqincw_pat (z0, SV_VL128, 16)) + +/* +** qincw_pat_vl256_u32: +** uqincw z0\.s, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_vl256_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_VL256, 16), + z0 = svqincw_pat (z0, SV_VL256, 16)) + +/* +** qincw_pat_mul4_u32: +** uqincw z0\.s, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_mul4_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_MUL4, 16), + z0 = svqincw_pat (z0, SV_MUL4, 16)) + +/* +** qincw_pat_mul3_u32: +** uqincw z0\.s, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_mul3_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_MUL3, 16), + z0 = svqincw_pat (z0, SV_MUL3, 16)) + +/* +** qincw_pat_all_u32: +** uqincw z0\.s, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_pat_all_u32, svuint32_t, + z0 = svqincw_pat_u32 (z0, SV_ALL, 16), + z0 = svqincw_pat (z0, SV_ALL, 16)) + +/* +** qincw_pat_n_1_u32_tied: +** uqincw w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_1_u32_tied, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_POW2, 1), + x0 = svqincw_pat (x0, SV_POW2, 1)) + +/* +** qincw_pat_n_1_u32_untied: +** mov w0, w1 +** uqincw w0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_1_u32_untied, uint32_t, + x0 = svqincw_pat_n_u32 (x1, SV_POW2, 1), + x0 = svqincw_pat (x1, SV_POW2, 1)) + +/* +** qincw_pat_n_2_u32: +** uqincw w0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_2_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_POW2, 2), + x0 = svqincw_pat (x0, SV_POW2, 2)) + +/* +** qincw_pat_n_7_u32: +** uqincw w0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_7_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_POW2, 7), + x0 = svqincw_pat (x0, SV_POW2, 7)) + +/* +** qincw_pat_n_15_u32: +** uqincw w0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_15_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_POW2, 15), + x0 = svqincw_pat (x0, SV_POW2, 15)) + +/* +** qincw_pat_n_16_u32: +** uqincw w0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_16_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_POW2, 16), + x0 = svqincw_pat (x0, SV_POW2, 16)) + +/* +** qincw_pat_n_vl1_u32: +** uqincw w0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl1_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL1, 16), + x0 = svqincw_pat (x0, SV_VL1, 16)) + +/* +** qincw_pat_n_vl2_u32: +** uqincw w0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl2_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL2, 16), + x0 = svqincw_pat (x0, SV_VL2, 16)) + +/* +** qincw_pat_n_vl3_u32: +** uqincw w0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl3_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL3, 16), + x0 = svqincw_pat (x0, SV_VL3, 16)) + +/* +** qincw_pat_n_vl4_u32: +** uqincw w0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl4_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL4, 16), + x0 = svqincw_pat (x0, SV_VL4, 16)) + +/* +** qincw_pat_n_vl5_u32: +** uqincw w0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl5_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL5, 16), + x0 = svqincw_pat (x0, SV_VL5, 16)) + +/* +** 
qincw_pat_n_vl6_u32: +** uqincw w0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl6_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL6, 16), + x0 = svqincw_pat (x0, SV_VL6, 16)) + +/* +** qincw_pat_n_vl7_u32: +** uqincw w0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl7_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL7, 16), + x0 = svqincw_pat (x0, SV_VL7, 16)) + +/* +** qincw_pat_n_vl8_u32: +** uqincw w0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl8_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL8, 16), + x0 = svqincw_pat (x0, SV_VL8, 16)) + +/* +** qincw_pat_n_vl16_u32: +** uqincw w0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl16_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL16, 16), + x0 = svqincw_pat (x0, SV_VL16, 16)) + +/* +** qincw_pat_n_vl32_u32: +** uqincw w0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl32_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL32, 16), + x0 = svqincw_pat (x0, SV_VL32, 16)) + +/* +** qincw_pat_n_vl64_u32: +** uqincw w0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl64_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL64, 16), + x0 = svqincw_pat (x0, SV_VL64, 16)) + +/* +** qincw_pat_n_vl128_u32: +** uqincw w0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl128_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL128, 16), + x0 = svqincw_pat (x0, SV_VL128, 16)) + +/* +** qincw_pat_n_vl256_u32: +** uqincw w0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl256_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_VL256, 16), + x0 = svqincw_pat (x0, SV_VL256, 16)) + +/* +** qincw_pat_n_mul4_u32: +** uqincw w0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_mul4_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_MUL4, 16), + x0 = svqincw_pat (x0, SV_MUL4, 16)) + +/* +** qincw_pat_n_mul3_u32: +** uqincw w0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_mul3_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_MUL3, 16), + x0 = svqincw_pat (x0, SV_MUL3, 16)) + +/* +** qincw_pat_n_all_u32: +** uqincw w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_all_u32, uint32_t, + x0 = svqincw_pat_n_u32 (x0, SV_ALL, 16), + x0 = svqincw_pat (x0, SV_ALL, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u64.c new file mode 100644 index 000000000..a2ac9ee72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_pat_u64.c @@ -0,0 +1,202 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincw_pat_n_1_u64_tied: +** uqincw x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_1_u64_tied, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_POW2, 1), + x0 = svqincw_pat (x0, SV_POW2, 1)) + +/* +** qincw_pat_n_1_u64_untied: +** mov x0, x1 +** uqincw x0, pow2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_1_u64_untied, uint64_t, + x0 = svqincw_pat_n_u64 (x1, SV_POW2, 1), + x0 = svqincw_pat (x1, SV_POW2, 1)) + +/* +** qincw_pat_n_2_u64: +** uqincw x0, pow2, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_2_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_POW2, 2), + x0 = svqincw_pat (x0, SV_POW2, 2)) + +/* +** qincw_pat_n_7_u64: +** uqincw x0, pow2, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_7_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_POW2, 7), + x0 = svqincw_pat (x0, SV_POW2, 7)) + +/* +** qincw_pat_n_15_u64: +** uqincw x0, pow2, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_15_u64, uint64_t, + x0 = svqincw_pat_n_u64 
(x0, SV_POW2, 15), + x0 = svqincw_pat (x0, SV_POW2, 15)) + +/* +** qincw_pat_n_16_u64: +** uqincw x0, pow2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_16_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_POW2, 16), + x0 = svqincw_pat (x0, SV_POW2, 16)) + +/* +** qincw_pat_n_vl1_u64: +** uqincw x0, vl1, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl1_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL1, 16), + x0 = svqincw_pat (x0, SV_VL1, 16)) + +/* +** qincw_pat_n_vl2_u64: +** uqincw x0, vl2, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl2_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL2, 16), + x0 = svqincw_pat (x0, SV_VL2, 16)) + +/* +** qincw_pat_n_vl3_u64: +** uqincw x0, vl3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl3_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL3, 16), + x0 = svqincw_pat (x0, SV_VL3, 16)) + +/* +** qincw_pat_n_vl4_u64: +** uqincw x0, vl4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl4_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL4, 16), + x0 = svqincw_pat (x0, SV_VL4, 16)) + +/* +** qincw_pat_n_vl5_u64: +** uqincw x0, vl5, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl5_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL5, 16), + x0 = svqincw_pat (x0, SV_VL5, 16)) + +/* +** qincw_pat_n_vl6_u64: +** uqincw x0, vl6, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl6_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL6, 16), + x0 = svqincw_pat (x0, SV_VL6, 16)) + +/* +** qincw_pat_n_vl7_u64: +** uqincw x0, vl7, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl7_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL7, 16), + x0 = svqincw_pat (x0, SV_VL7, 16)) + +/* +** qincw_pat_n_vl8_u64: +** uqincw x0, vl8, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl8_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL8, 16), + x0 = svqincw_pat (x0, SV_VL8, 16)) + +/* +** qincw_pat_n_vl16_u64: +** uqincw x0, vl16, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl16_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL16, 16), + x0 = svqincw_pat (x0, SV_VL16, 16)) + +/* +** qincw_pat_n_vl32_u64: +** uqincw x0, vl32, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl32_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL32, 16), + x0 = svqincw_pat (x0, SV_VL32, 16)) + +/* +** qincw_pat_n_vl64_u64: +** uqincw x0, vl64, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl64_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL64, 16), + x0 = svqincw_pat (x0, SV_VL64, 16)) + +/* +** qincw_pat_n_vl128_u64: +** uqincw x0, vl128, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl128_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL128, 16), + x0 = svqincw_pat (x0, SV_VL128, 16)) + +/* +** qincw_pat_n_vl256_u64: +** uqincw x0, vl256, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_vl256_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_VL256, 16), + x0 = svqincw_pat (x0, SV_VL256, 16)) + +/* +** qincw_pat_n_mul4_u64: +** uqincw x0, mul4, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_mul4_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_MUL4, 16), + x0 = svqincw_pat (x0, SV_MUL4, 16)) + +/* +** qincw_pat_n_mul3_u64: +** uqincw x0, mul3, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_mul3_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_MUL3, 16), + x0 = svqincw_pat (x0, SV_MUL3, 16)) + +/* +** qincw_pat_n_all_u64: +** uqincw x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_pat_n_all_u64, uint64_t, + x0 = svqincw_pat_n_u64 (x0, SV_ALL, 16), + x0 = svqincw_pat (x0, SV_ALL, 16)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s32.c new file mode 100644 index 000000000..031824acf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s32.c @@ -0,0 +1,113 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincw_1_s32_tied: +** sqincw z0\.s +** ret +*/ +TEST_UNIFORM_Z (qincw_1_s32_tied, svint32_t, + z0 = svqincw_s32 (z0, 1), + z0 = svqincw (z0, 1)) + +/* +** qincw_1_s32_untied: +** movprfx z0, z1 +** sqincw z0\.s +** ret +*/ +TEST_UNIFORM_Z (qincw_1_s32_untied, svint32_t, + z0 = svqincw_s32 (z1, 1), + z0 = svqincw (z1, 1)) + +/* +** qincw_2_s32: +** sqincw z0\.s, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qincw_2_s32, svint32_t, + z0 = svqincw_s32 (z0, 2), + z0 = svqincw (z0, 2)) + +/* +** qincw_7_s32: +** sqincw z0\.s, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qincw_7_s32, svint32_t, + z0 = svqincw_s32 (z0, 7), + z0 = svqincw (z0, 7)) + +/* +** qincw_15_s32: +** sqincw z0\.s, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qincw_15_s32, svint32_t, + z0 = svqincw_s32 (z0, 15), + z0 = svqincw (z0, 15)) + +/* +** qincw_16_s32: +** sqincw z0\.s, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_16_s32, svint32_t, + z0 = svqincw_s32 (z0, 16), + z0 = svqincw (z0, 16)) + +/* +** qincw_n_1_s32_tied: +** sqincw x0, w0 +** ret +*/ +TEST_UNIFORM_S (qincw_n_1_s32_tied, int32_t, + x0 = svqincw_n_s32 (x0, 1), + x0 = svqincw (x0, 1)) + +/* +** qincw_n_1_s32_untied: +** mov w0, w1 +** sqincw x0, w0 +** ret +*/ +TEST_UNIFORM_S (qincw_n_1_s32_untied, int32_t, + x0 = svqincw_n_s32 (x1, 1), + x0 = svqincw (x1, 1)) + +/* +** qincw_n_2_s32: +** sqincw x0, w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincw_n_2_s32, int32_t, + x0 = svqincw_n_s32 (x0, 2), + x0 = svqincw (x0, 2)) + +/* +** qincw_n_7_s32: +** sqincw x0, w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincw_n_7_s32, int32_t, + x0 = svqincw_n_s32 (x0, 7), + x0 = svqincw (x0, 7)) + +/* +** qincw_n_15_s32: +** sqincw x0, w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincw_n_15_s32, int32_t, + x0 = svqincw_n_s32 (x0, 15), + x0 = svqincw (x0, 15)) + +/* +** qincw_n_16_s32: +** sqincw x0, w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_n_16_s32, int32_t, + x0 = svqincw_n_s32 (x0, 16), + x0 = svqincw (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s64.c new file mode 100644 index 000000000..df61f909f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_s64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincw_n_1_s64_tied: +** sqincw x0 +** ret +*/ +TEST_UNIFORM_S (qincw_n_1_s64_tied, int64_t, + x0 = svqincw_n_s64 (x0, 1), + x0 = svqincw (x0, 1)) + +/* +** qincw_n_1_s64_untied: +** mov x0, x1 +** sqincw x0 +** ret +*/ +TEST_UNIFORM_S (qincw_n_1_s64_untied, int64_t, + x0 = svqincw_n_s64 (x1, 1), + x0 = svqincw (x1, 1)) + +/* +** qincw_n_2_s64: +** sqincw x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincw_n_2_s64, int64_t, + x0 = svqincw_n_s64 (x0, 2), + x0 = svqincw (x0, 2)) + +/* +** qincw_n_7_s64: +** sqincw x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincw_n_7_s64, int64_t, + x0 = svqincw_n_s64 (x0, 7), + x0 = svqincw (x0, 7)) + +/* +** qincw_n_15_s64: +** sqincw x0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincw_n_15_s64, int64_t, + x0 = svqincw_n_s64 (x0, 15), + x0 = svqincw (x0, 15)) + +/* +** qincw_n_16_s64: +** 
sqincw x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_n_16_s64, int64_t, + x0 = svqincw_n_s64 (x0, 16), + x0 = svqincw (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u32.c new file mode 100644 index 000000000..65a446ab6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u32.c @@ -0,0 +1,113 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincw_1_u32_tied: +** uqincw z0\.s +** ret +*/ +TEST_UNIFORM_Z (qincw_1_u32_tied, svuint32_t, + z0 = svqincw_u32 (z0, 1), + z0 = svqincw (z0, 1)) + +/* +** qincw_1_u32_untied: +** movprfx z0, z1 +** uqincw z0\.s +** ret +*/ +TEST_UNIFORM_Z (qincw_1_u32_untied, svuint32_t, + z0 = svqincw_u32 (z1, 1), + z0 = svqincw (z1, 1)) + +/* +** qincw_2_u32: +** uqincw z0\.s, all, mul #2 +** ret +*/ +TEST_UNIFORM_Z (qincw_2_u32, svuint32_t, + z0 = svqincw_u32 (z0, 2), + z0 = svqincw (z0, 2)) + +/* +** qincw_7_u32: +** uqincw z0\.s, all, mul #7 +** ret +*/ +TEST_UNIFORM_Z (qincw_7_u32, svuint32_t, + z0 = svqincw_u32 (z0, 7), + z0 = svqincw (z0, 7)) + +/* +** qincw_15_u32: +** uqincw z0\.s, all, mul #15 +** ret +*/ +TEST_UNIFORM_Z (qincw_15_u32, svuint32_t, + z0 = svqincw_u32 (z0, 15), + z0 = svqincw (z0, 15)) + +/* +** qincw_16_u32: +** uqincw z0\.s, all, mul #16 +** ret +*/ +TEST_UNIFORM_Z (qincw_16_u32, svuint32_t, + z0 = svqincw_u32 (z0, 16), + z0 = svqincw (z0, 16)) + +/* +** qincw_n_1_u32_tied: +** uqincw w0 +** ret +*/ +TEST_UNIFORM_S (qincw_n_1_u32_tied, uint32_t, + x0 = svqincw_n_u32 (x0, 1), + x0 = svqincw (x0, 1)) + +/* +** qincw_n_1_u32_untied: +** mov w0, w1 +** uqincw w0 +** ret +*/ +TEST_UNIFORM_S (qincw_n_1_u32_untied, uint32_t, + x0 = svqincw_n_u32 (x1, 1), + x0 = svqincw (x1, 1)) + +/* +** qincw_n_2_u32: +** uqincw w0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincw_n_2_u32, uint32_t, + x0 = svqincw_n_u32 (x0, 2), + x0 = svqincw (x0, 2)) + +/* +** qincw_n_7_u32: +** uqincw w0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincw_n_7_u32, uint32_t, + x0 = svqincw_n_u32 (x0, 7), + x0 = svqincw (x0, 7)) + +/* +** qincw_n_15_u32: +** uqincw w0, all, mul #15 +** ret +*/ +TEST_UNIFORM_S (qincw_n_15_u32, uint32_t, + x0 = svqincw_n_u32 (x0, 15), + x0 = svqincw (x0, 15)) + +/* +** qincw_n_16_u32: +** uqincw w0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_n_16_u32, uint32_t, + x0 = svqincw_n_u32 (x0, 16), + x0 = svqincw (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u64.c new file mode 100644 index 000000000..806a79945 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qincw_u64.c @@ -0,0 +1,58 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qincw_n_1_u64_tied: +** uqincw x0 +** ret +*/ +TEST_UNIFORM_S (qincw_n_1_u64_tied, uint64_t, + x0 = svqincw_n_u64 (x0, 1), + x0 = svqincw (x0, 1)) + +/* +** qincw_n_1_u64_untied: +** mov x0, x1 +** uqincw x0 +** ret +*/ +TEST_UNIFORM_S (qincw_n_1_u64_untied, uint64_t, + x0 = svqincw_n_u64 (x1, 1), + x0 = svqincw (x1, 1)) + +/* +** qincw_n_2_u64: +** uqincw x0, all, mul #2 +** ret +*/ +TEST_UNIFORM_S (qincw_n_2_u64, uint64_t, + x0 = svqincw_n_u64 (x0, 2), + x0 = svqincw (x0, 2)) + +/* +** qincw_n_7_u64: +** uqincw x0, all, mul #7 +** ret +*/ +TEST_UNIFORM_S (qincw_n_7_u64, uint64_t, + x0 = svqincw_n_u64 (x0, 7), + x0 = svqincw (x0, 7)) + +/* +** qincw_n_15_u64: +** uqincw x0, all, mul #15 +** 
ret +*/ +TEST_UNIFORM_S (qincw_n_15_u64, uint64_t, + x0 = svqincw_n_u64 (x0, 15), + x0 = svqincw (x0, 15)) + +/* +** qincw_n_16_u64: +** uqincw x0, all, mul #16 +** ret +*/ +TEST_UNIFORM_S (qincw_n_16_u64, uint64_t, + x0 = svqincw_n_u64 (x0, 16), + x0 = svqincw (x0, 16)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s16.c new file mode 100644 index 000000000..8dd8381dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s16.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qsub_s16_tied1: +** sqsub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (qsub_s16_tied1, svint16_t, + z0 = svqsub_s16 (z0, z1), + z0 = svqsub (z0, z1)) + +/* +** qsub_s16_tied2: +** sqsub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (qsub_s16_tied2, svint16_t, + z0 = svqsub_s16 (z1, z0), + z0 = svqsub (z1, z0)) + +/* +** qsub_s16_untied: +** sqsub z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (qsub_s16_untied, svint16_t, + z0 = svqsub_s16 (z1, z2), + z0 = svqsub (z1, z2)) + +/* +** qsub_w0_s16_tied1: +** mov (z[0-9]+\.h), w0 +** sqsub z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_s16_tied1, svint16_t, int16_t, + z0 = svqsub_n_s16 (z0, x0), + z0 = svqsub (z0, x0)) + +/* +** qsub_w0_s16_untied: +** mov (z[0-9]+\.h), w0 +** sqsub z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_s16_untied, svint16_t, int16_t, + z0 = svqsub_n_s16 (z1, x0), + z0 = svqsub (z1, x0)) + +/* +** qsub_1_s16_tied1: +** sqsub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_s16_tied1, svint16_t, + z0 = svqsub_n_s16 (z0, 1), + z0 = svqsub (z0, 1)) + +/* +** qsub_1_s16_untied: +** movprfx z0, z1 +** sqsub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_s16_untied, svint16_t, + z0 = svqsub_n_s16 (z1, 1), + z0 = svqsub (z1, 1)) + +/* +** qsub_127_s16: +** sqsub z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_127_s16, svint16_t, + z0 = svqsub_n_s16 (z0, 127), + z0 = svqsub (z0, 127)) + +/* +** qsub_128_s16: +** sqsub z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_128_s16, svint16_t, + z0 = svqsub_n_s16 (z0, 128), + z0 = svqsub (z0, 128)) + +/* +** qsub_255_s16: +** sqsub z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (qsub_255_s16, svint16_t, + z0 = svqsub_n_s16 (z0, 255), + z0 = svqsub (z0, 255)) + +/* +** qsub_m1_s16: +** sqadd z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m1_s16, svint16_t, + z0 = svqsub_n_s16 (z0, -1), + z0 = svqsub (z0, -1)) + +/* +** qsub_m127_s16: +** sqadd z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_m127_s16, svint16_t, + z0 = svqsub_n_s16 (z0, -127), + z0 = svqsub (z0, -127)) + +/* +** qsub_m128_s16: +** sqadd z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_m128_s16, svint16_t, + z0 = svqsub_n_s16 (z0, -128), + z0 = svqsub (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s32.c new file mode 100644 index 000000000..920736aec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s32.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qsub_s32_tied1: +** sqsub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (qsub_s32_tied1, svint32_t, + z0 = svqsub_s32 (z0, z1), + z0 = svqsub (z0, z1)) + +/* +** qsub_s32_tied2: +** sqsub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (qsub_s32_tied2, svint32_t, + z0 = svqsub_s32 (z1, z0), + 
z0 = svqsub (z1, z0)) + +/* +** qsub_s32_untied: +** sqsub z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (qsub_s32_untied, svint32_t, + z0 = svqsub_s32 (z1, z2), + z0 = svqsub (z1, z2)) + +/* +** qsub_w0_s32_tied1: +** mov (z[0-9]+\.s), w0 +** sqsub z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_s32_tied1, svint32_t, int32_t, + z0 = svqsub_n_s32 (z0, x0), + z0 = svqsub (z0, x0)) + +/* +** qsub_w0_s32_untied: +** mov (z[0-9]+\.s), w0 +** sqsub z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_s32_untied, svint32_t, int32_t, + z0 = svqsub_n_s32 (z1, x0), + z0 = svqsub (z1, x0)) + +/* +** qsub_1_s32_tied1: +** sqsub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_s32_tied1, svint32_t, + z0 = svqsub_n_s32 (z0, 1), + z0 = svqsub (z0, 1)) + +/* +** qsub_1_s32_untied: +** movprfx z0, z1 +** sqsub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_s32_untied, svint32_t, + z0 = svqsub_n_s32 (z1, 1), + z0 = svqsub (z1, 1)) + +/* +** qsub_127_s32: +** sqsub z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_127_s32, svint32_t, + z0 = svqsub_n_s32 (z0, 127), + z0 = svqsub (z0, 127)) + +/* +** qsub_128_s32: +** sqsub z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_128_s32, svint32_t, + z0 = svqsub_n_s32 (z0, 128), + z0 = svqsub (z0, 128)) + +/* +** qsub_255_s32: +** sqsub z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (qsub_255_s32, svint32_t, + z0 = svqsub_n_s32 (z0, 255), + z0 = svqsub (z0, 255)) + +/* +** qsub_m1_s32: +** sqadd z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m1_s32, svint32_t, + z0 = svqsub_n_s32 (z0, -1), + z0 = svqsub (z0, -1)) + +/* +** qsub_m127_s32: +** sqadd z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_m127_s32, svint32_t, + z0 = svqsub_n_s32 (z0, -127), + z0 = svqsub (z0, -127)) + +/* +** qsub_m128_s32: +** sqadd z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_m128_s32, svint32_t, + z0 = svqsub_n_s32 (z0, -128), + z0 = svqsub (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s64.c new file mode 100644 index 000000000..3d0fc2bcc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s64.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qsub_s64_tied1: +** sqsub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (qsub_s64_tied1, svint64_t, + z0 = svqsub_s64 (z0, z1), + z0 = svqsub (z0, z1)) + +/* +** qsub_s64_tied2: +** sqsub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (qsub_s64_tied2, svint64_t, + z0 = svqsub_s64 (z1, z0), + z0 = svqsub (z1, z0)) + +/* +** qsub_s64_untied: +** sqsub z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (qsub_s64_untied, svint64_t, + z0 = svqsub_s64 (z1, z2), + z0 = svqsub (z1, z2)) + +/* +** qsub_x0_s64_tied1: +** mov (z[0-9]+\.d), x0 +** sqsub z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_x0_s64_tied1, svint64_t, int64_t, + z0 = svqsub_n_s64 (z0, x0), + z0 = svqsub (z0, x0)) + +/* +** qsub_x0_s64_untied: +** mov (z[0-9]+\.d), x0 +** sqsub z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_x0_s64_untied, svint64_t, int64_t, + z0 = svqsub_n_s64 (z1, x0), + z0 = svqsub (z1, x0)) + +/* +** qsub_1_s64_tied1: +** sqsub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_s64_tied1, svint64_t, + z0 = svqsub_n_s64 (z0, 1), + z0 = svqsub (z0, 1)) + +/* +** qsub_1_s64_untied: +** movprfx z0, z1 +** sqsub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_s64_untied, svint64_t, + z0 = svqsub_n_s64 (z1, 1), + z0 = svqsub (z1, 1)) 
+ +/* +** qsub_127_s64: +** sqsub z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_127_s64, svint64_t, + z0 = svqsub_n_s64 (z0, 127), + z0 = svqsub (z0, 127)) + +/* +** qsub_128_s64: +** sqsub z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_128_s64, svint64_t, + z0 = svqsub_n_s64 (z0, 128), + z0 = svqsub (z0, 128)) + +/* +** qsub_255_s64: +** sqsub z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (qsub_255_s64, svint64_t, + z0 = svqsub_n_s64 (z0, 255), + z0 = svqsub (z0, 255)) + +/* +** qsub_m1_s64: +** sqadd z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m1_s64, svint64_t, + z0 = svqsub_n_s64 (z0, -1), + z0 = svqsub (z0, -1)) + +/* +** qsub_m127_s64: +** sqadd z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_m127_s64, svint64_t, + z0 = svqsub_n_s64 (z0, -127), + z0 = svqsub (z0, -127)) + +/* +** qsub_m128_s64: +** sqadd z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_m128_s64, svint64_t, + z0 = svqsub_n_s64 (z0, -128), + z0 = svqsub (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s8.c new file mode 100644 index 000000000..3e7e84c77 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_s8.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qsub_s8_tied1: +** sqsub z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (qsub_s8_tied1, svint8_t, + z0 = svqsub_s8 (z0, z1), + z0 = svqsub (z0, z1)) + +/* +** qsub_s8_tied2: +** sqsub z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (qsub_s8_tied2, svint8_t, + z0 = svqsub_s8 (z1, z0), + z0 = svqsub (z1, z0)) + +/* +** qsub_s8_untied: +** sqsub z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (qsub_s8_untied, svint8_t, + z0 = svqsub_s8 (z1, z2), + z0 = svqsub (z1, z2)) + +/* +** qsub_w0_s8_tied1: +** mov (z[0-9]+\.b), w0 +** sqsub z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_s8_tied1, svint8_t, int8_t, + z0 = svqsub_n_s8 (z0, x0), + z0 = svqsub (z0, x0)) + +/* +** qsub_w0_s8_untied: +** mov (z[0-9]+\.b), w0 +** sqsub z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_s8_untied, svint8_t, int8_t, + z0 = svqsub_n_s8 (z1, x0), + z0 = svqsub (z1, x0)) + +/* +** qsub_1_s8_tied1: +** sqsub z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_s8_tied1, svint8_t, + z0 = svqsub_n_s8 (z0, 1), + z0 = svqsub (z0, 1)) + +/* +** qsub_1_s8_untied: +** movprfx z0, z1 +** sqsub z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_s8_untied, svint8_t, + z0 = svqsub_n_s8 (z1, 1), + z0 = svqsub (z1, 1)) + +/* +** qsub_127_s8: +** sqsub z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_127_s8, svint8_t, + z0 = svqsub_n_s8 (z0, 127), + z0 = svqsub (z0, 127)) + +/* +** qsub_128_s8: +** sqadd z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_128_s8, svint8_t, + z0 = svqsub_n_s8 (z0, 128), + z0 = svqsub (z0, 128)) + +/* +** qsub_255_s8: +** sqadd z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_255_s8, svint8_t, + z0 = svqsub_n_s8 (z0, 255), + z0 = svqsub (z0, 255)) + +/* +** qsub_m1_s8: +** sqadd z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m1_s8, svint8_t, + z0 = svqsub_n_s8 (z0, -1), + z0 = svqsub (z0, -1)) + +/* +** qsub_m127_s8: +** sqadd z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_m127_s8, svint8_t, + z0 = svqsub_n_s8 (z0, -127), + z0 = svqsub (z0, -127)) + +/* +** qsub_m128_s8: +** sqadd z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_m128_s8, svint8_t, + z0 = svqsub_n_s8 (z0, -128), + z0 = svqsub (z0, -128)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u16.c new file mode 100644 index 000000000..6d4d68e20 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u16.c @@ -0,0 +1,126 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qsub_u16_tied1: +** uqsub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (qsub_u16_tied1, svuint16_t, + z0 = svqsub_u16 (z0, z1), + z0 = svqsub (z0, z1)) + +/* +** qsub_u16_tied2: +** uqsub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (qsub_u16_tied2, svuint16_t, + z0 = svqsub_u16 (z1, z0), + z0 = svqsub (z1, z0)) + +/* +** qsub_u16_untied: +** uqsub z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (qsub_u16_untied, svuint16_t, + z0 = svqsub_u16 (z1, z2), + z0 = svqsub (z1, z2)) + +/* +** qsub_w0_u16_tied1: +** mov (z[0-9]+\.h), w0 +** uqsub z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_u16_tied1, svuint16_t, uint16_t, + z0 = svqsub_n_u16 (z0, x0), + z0 = svqsub (z0, x0)) + +/* +** qsub_w0_u16_untied: +** mov (z[0-9]+\.h), w0 +** uqsub z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_u16_untied, svuint16_t, uint16_t, + z0 = svqsub_n_u16 (z1, x0), + z0 = svqsub (z1, x0)) + +/* +** qsub_1_u16_tied1: +** uqsub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_u16_tied1, svuint16_t, + z0 = svqsub_n_u16 (z0, 1), + z0 = svqsub (z0, 1)) + +/* +** qsub_1_u16_untied: +** movprfx z0, z1 +** uqsub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_u16_untied, svuint16_t, + z0 = svqsub_n_u16 (z1, 1), + z0 = svqsub (z1, 1)) + +/* +** qsub_127_u16: +** uqsub z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_127_u16, svuint16_t, + z0 = svqsub_n_u16 (z0, 127), + z0 = svqsub (z0, 127)) + +/* +** qsub_128_u16: +** uqsub z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_128_u16, svuint16_t, + z0 = svqsub_n_u16 (z0, 128), + z0 = svqsub (z0, 128)) + +/* +** qsub_255_u16: +** uqsub z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (qsub_255_u16, svuint16_t, + z0 = svqsub_n_u16 (z0, 255), + z0 = svqsub (z0, 255)) + +/* +** qsub_m1_u16: +** mov (z[0-9]+)\.b, #-1 +** uqsub z0\.h, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (qsub_m1_u16, svuint16_t, + z0 = svqsub_n_u16 (z0, -1), + z0 = svqsub (z0, -1)) + +/* +** qsub_m127_u16: +** mov (z[0-9]+\.h), #-127 +** uqsub z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m127_u16, svuint16_t, + z0 = svqsub_n_u16 (z0, -127), + z0 = svqsub (z0, -127)) + +/* +** qsub_m128_u16: +** mov (z[0-9]+\.h), #-128 +** uqsub z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m128_u16, svuint16_t, + z0 = svqsub_n_u16 (z0, -128), + z0 = svqsub (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u32.c new file mode 100644 index 000000000..9c93cfc45 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u32.c @@ -0,0 +1,126 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qsub_u32_tied1: +** uqsub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (qsub_u32_tied1, svuint32_t, + z0 = svqsub_u32 (z0, z1), + z0 = svqsub (z0, z1)) + +/* +** qsub_u32_tied2: +** uqsub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (qsub_u32_tied2, svuint32_t, + z0 = svqsub_u32 (z1, z0), + z0 = svqsub (z1, z0)) + +/* +** qsub_u32_untied: +** uqsub z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (qsub_u32_untied, svuint32_t, + z0 = svqsub_u32 (z1, z2), + z0 = svqsub (z1, 
z2)) + +/* +** qsub_w0_u32_tied1: +** mov (z[0-9]+\.s), w0 +** uqsub z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_u32_tied1, svuint32_t, uint32_t, + z0 = svqsub_n_u32 (z0, x0), + z0 = svqsub (z0, x0)) + +/* +** qsub_w0_u32_untied: +** mov (z[0-9]+\.s), w0 +** uqsub z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_u32_untied, svuint32_t, uint32_t, + z0 = svqsub_n_u32 (z1, x0), + z0 = svqsub (z1, x0)) + +/* +** qsub_1_u32_tied1: +** uqsub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_u32_tied1, svuint32_t, + z0 = svqsub_n_u32 (z0, 1), + z0 = svqsub (z0, 1)) + +/* +** qsub_1_u32_untied: +** movprfx z0, z1 +** uqsub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_u32_untied, svuint32_t, + z0 = svqsub_n_u32 (z1, 1), + z0 = svqsub (z1, 1)) + +/* +** qsub_127_u32: +** uqsub z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_127_u32, svuint32_t, + z0 = svqsub_n_u32 (z0, 127), + z0 = svqsub (z0, 127)) + +/* +** qsub_128_u32: +** uqsub z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_128_u32, svuint32_t, + z0 = svqsub_n_u32 (z0, 128), + z0 = svqsub (z0, 128)) + +/* +** qsub_255_u32: +** uqsub z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (qsub_255_u32, svuint32_t, + z0 = svqsub_n_u32 (z0, 255), + z0 = svqsub (z0, 255)) + +/* +** qsub_m1_u32: +** mov (z[0-9]+)\.b, #-1 +** uqsub z0\.s, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (qsub_m1_u32, svuint32_t, + z0 = svqsub_n_u32 (z0, -1), + z0 = svqsub (z0, -1)) + +/* +** qsub_m127_u32: +** mov (z[0-9]+\.s), #-127 +** uqsub z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m127_u32, svuint32_t, + z0 = svqsub_n_u32 (z0, -127), + z0 = svqsub (z0, -127)) + +/* +** qsub_m128_u32: +** mov (z[0-9]+\.s), #-128 +** uqsub z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m128_u32, svuint32_t, + z0 = svqsub_n_u32 (z0, -128), + z0 = svqsub (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u64.c new file mode 100644 index 000000000..6109b5f29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u64.c @@ -0,0 +1,126 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qsub_u64_tied1: +** uqsub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (qsub_u64_tied1, svuint64_t, + z0 = svqsub_u64 (z0, z1), + z0 = svqsub (z0, z1)) + +/* +** qsub_u64_tied2: +** uqsub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (qsub_u64_tied2, svuint64_t, + z0 = svqsub_u64 (z1, z0), + z0 = svqsub (z1, z0)) + +/* +** qsub_u64_untied: +** uqsub z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (qsub_u64_untied, svuint64_t, + z0 = svqsub_u64 (z1, z2), + z0 = svqsub (z1, z2)) + +/* +** qsub_x0_u64_tied1: +** mov (z[0-9]+\.d), x0 +** uqsub z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_x0_u64_tied1, svuint64_t, uint64_t, + z0 = svqsub_n_u64 (z0, x0), + z0 = svqsub (z0, x0)) + +/* +** qsub_x0_u64_untied: +** mov (z[0-9]+\.d), x0 +** uqsub z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_x0_u64_untied, svuint64_t, uint64_t, + z0 = svqsub_n_u64 (z1, x0), + z0 = svqsub (z1, x0)) + +/* +** qsub_1_u64_tied1: +** uqsub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_u64_tied1, svuint64_t, + z0 = svqsub_n_u64 (z0, 1), + z0 = svqsub (z0, 1)) + +/* +** qsub_1_u64_untied: +** movprfx z0, z1 +** uqsub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_u64_untied, svuint64_t, + z0 = svqsub_n_u64 (z1, 1), + z0 = svqsub (z1, 1)) + +/* +** qsub_127_u64: +** uqsub z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z 
(qsub_127_u64, svuint64_t, + z0 = svqsub_n_u64 (z0, 127), + z0 = svqsub (z0, 127)) + +/* +** qsub_128_u64: +** uqsub z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_128_u64, svuint64_t, + z0 = svqsub_n_u64 (z0, 128), + z0 = svqsub (z0, 128)) + +/* +** qsub_255_u64: +** uqsub z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (qsub_255_u64, svuint64_t, + z0 = svqsub_n_u64 (z0, 255), + z0 = svqsub (z0, 255)) + +/* +** qsub_m1_u64: +** mov (z[0-9]+)\.b, #-1 +** uqsub z0\.d, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (qsub_m1_u64, svuint64_t, + z0 = svqsub_n_u64 (z0, -1), + z0 = svqsub (z0, -1)) + +/* +** qsub_m127_u64: +** mov (z[0-9]+\.d), #-127 +** uqsub z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m127_u64, svuint64_t, + z0 = svqsub_n_u64 (z0, -127), + z0 = svqsub (z0, -127)) + +/* +** qsub_m128_u64: +** mov (z[0-9]+\.d), #-128 +** uqsub z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (qsub_m128_u64, svuint64_t, + z0 = svqsub_n_u64 (z0, -128), + z0 = svqsub (z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u8.c new file mode 100644 index 000000000..40aa74e8d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/qsub_u8.c @@ -0,0 +1,123 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** qsub_u8_tied1: +** uqsub z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (qsub_u8_tied1, svuint8_t, + z0 = svqsub_u8 (z0, z1), + z0 = svqsub (z0, z1)) + +/* +** qsub_u8_tied2: +** uqsub z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (qsub_u8_tied2, svuint8_t, + z0 = svqsub_u8 (z1, z0), + z0 = svqsub (z1, z0)) + +/* +** qsub_u8_untied: +** uqsub z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (qsub_u8_untied, svuint8_t, + z0 = svqsub_u8 (z1, z2), + z0 = svqsub (z1, z2)) + +/* +** qsub_w0_u8_tied1: +** mov (z[0-9]+\.b), w0 +** uqsub z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_u8_tied1, svuint8_t, uint8_t, + z0 = svqsub_n_u8 (z0, x0), + z0 = svqsub (z0, x0)) + +/* +** qsub_w0_u8_untied: +** mov (z[0-9]+\.b), w0 +** uqsub z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (qsub_w0_u8_untied, svuint8_t, uint8_t, + z0 = svqsub_n_u8 (z1, x0), + z0 = svqsub (z1, x0)) + +/* +** qsub_1_u8_tied1: +** uqsub z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_u8_tied1, svuint8_t, + z0 = svqsub_n_u8 (z0, 1), + z0 = svqsub (z0, 1)) + +/* +** qsub_1_u8_untied: +** movprfx z0, z1 +** uqsub z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (qsub_1_u8_untied, svuint8_t, + z0 = svqsub_n_u8 (z1, 1), + z0 = svqsub (z1, 1)) + +/* +** qsub_127_u8: +** uqsub z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (qsub_127_u8, svuint8_t, + z0 = svqsub_n_u8 (z0, 127), + z0 = svqsub (z0, 127)) + +/* +** qsub_128_u8: +** uqsub z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_128_u8, svuint8_t, + z0 = svqsub_n_u8 (z0, 128), + z0 = svqsub (z0, 128)) + +/* +** qsub_255_u8: +** uqsub z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (qsub_255_u8, svuint8_t, + z0 = svqsub_n_u8 (z0, 255), + z0 = svqsub (z0, 255)) + +/* +** qsub_m1_u8: +** uqsub z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (qsub_m1_u8, svuint8_t, + z0 = svqsub_n_u8 (z0, -1), + z0 = svqsub (z0, -1)) + +/* +** qsub_m127_u8: +** uqsub z0\.b, z0\.b, #129 +** ret +*/ +TEST_UNIFORM_Z (qsub_m127_u8, svuint8_t, + z0 = svqsub_n_u8 (z0, -127), + z0 = svqsub (z0, -127)) + +/* +** qsub_m128_u8: +** uqsub z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (qsub_m128_u8, svuint8_t, + z0 = svqsub_n_u8 (z0, -128), + z0 = svqsub (z0, -128)) 
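(Editorial aside, not part of the backported patch content: the qsub_*.c tests above verify that GCC folds saturating-subtract intrinsics with negative or wrapped immediates into the complementary SQADD/UQSUB encodings, as the expected assembly in each comment block shows. The sketch below is a minimal, hedged illustration of the same ACLE calls outside the TEST_UNIFORM_Z harness; it assumes a toolchain with <arm_sve.h> and SVE enabled, for example -march=armv8.2-a+sve -O2, and the helper function names are invented for the example.)

/* Illustrative sketch only; helper names are invented and the expected
   instructions are quoted from the test expectations above.  */
#include <arm_sve.h>

/* Vector-vector form: the tests expect a single sqsub
   (see qsub_s8_tied1 above).  */
svint8_t
saturating_sub_s8 (svint8_t x, svint8_t y)
{
  return svqsub (x, y);   /* overloaded form of svqsub_s8 */
}

/* Unsigned immediate -1 wraps to 255, so the tests expect
   "uqsub z0.b, z0.b, #255" (qsub_m1_u8 above).  */
svuint8_t
saturating_sub_m1_u8 (svuint8_t x)
{
  return svqsub_n_u8 (x, -1);
}

/* Signed negative immediate: subtracting -1 is expected to be emitted
   as "sqadd z0.d, z0.d, #1" (qsub_m1_s64 above).  */
svint64_t
saturating_sub_m1_s64 (svint64_t x)
{
  return svqsub (x, -1);
}

At -O2 the single-instruction function bodies above are what the check-function-bodies directive in each test file matches against.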
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s16.c new file mode 100644 index 000000000..4f794f600 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rbit_s16_m_tied12: +** rbit z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_s16_m_tied12, svint16_t, + z0 = svrbit_s16_m (z0, p0, z0), + z0 = svrbit_m (z0, p0, z0)) + +/* +** rbit_s16_m_tied1: +** rbit z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_s16_m_tied1, svint16_t, + z0 = svrbit_s16_m (z0, p0, z1), + z0 = svrbit_m (z0, p0, z1)) + +/* +** rbit_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** rbit z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_s16_m_tied2, svint16_t, + z0 = svrbit_s16_m (z1, p0, z0), + z0 = svrbit_m (z1, p0, z0)) + +/* +** rbit_s16_m_untied: +** movprfx z0, z2 +** rbit z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_s16_m_untied, svint16_t, + z0 = svrbit_s16_m (z2, p0, z1), + z0 = svrbit_m (z2, p0, z1)) + +/* +** rbit_s16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** rbit z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_s16_z_tied1, svint16_t, + z0 = svrbit_s16_z (p0, z0), + z0 = svrbit_z (p0, z0)) + +/* +** rbit_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** rbit z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_s16_z_untied, svint16_t, + z0 = svrbit_s16_z (p0, z1), + z0 = svrbit_z (p0, z1)) + +/* +** rbit_s16_x_tied1: +** rbit z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_s16_x_tied1, svint16_t, + z0 = svrbit_s16_x (p0, z0), + z0 = svrbit_x (p0, z0)) + +/* +** rbit_s16_x_untied: +** rbit z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_s16_x_untied, svint16_t, + z0 = svrbit_s16_x (p0, z1), + z0 = svrbit_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s32.c new file mode 100644 index 000000000..8b5e1a463 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rbit_s32_m_tied12: +** rbit z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_s32_m_tied12, svint32_t, + z0 = svrbit_s32_m (z0, p0, z0), + z0 = svrbit_m (z0, p0, z0)) + +/* +** rbit_s32_m_tied1: +** rbit z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_s32_m_tied1, svint32_t, + z0 = svrbit_s32_m (z0, p0, z1), + z0 = svrbit_m (z0, p0, z1)) + +/* +** rbit_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** rbit z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_s32_m_tied2, svint32_t, + z0 = svrbit_s32_m (z1, p0, z0), + z0 = svrbit_m (z1, p0, z0)) + +/* +** rbit_s32_m_untied: +** movprfx z0, z2 +** rbit z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_s32_m_untied, svint32_t, + z0 = svrbit_s32_m (z2, p0, z1), + z0 = svrbit_m (z2, p0, z1)) + +/* +** rbit_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** rbit z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_s32_z_tied1, svint32_t, + z0 = svrbit_s32_z (p0, z0), + z0 = svrbit_z (p0, z0)) + +/* +** rbit_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** rbit z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_s32_z_untied, svint32_t, + z0 = svrbit_s32_z (p0, z1), + z0 = svrbit_z (p0, z1)) + +/* +** rbit_s32_x_tied1: +** rbit z0\.s, 
p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_s32_x_tied1, svint32_t, + z0 = svrbit_s32_x (p0, z0), + z0 = svrbit_x (p0, z0)) + +/* +** rbit_s32_x_untied: +** rbit z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_s32_x_untied, svint32_t, + z0 = svrbit_s32_x (p0, z1), + z0 = svrbit_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s64.c new file mode 100644 index 000000000..cec27a421 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rbit_s64_m_tied12: +** rbit z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_s64_m_tied12, svint64_t, + z0 = svrbit_s64_m (z0, p0, z0), + z0 = svrbit_m (z0, p0, z0)) + +/* +** rbit_s64_m_tied1: +** rbit z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_s64_m_tied1, svint64_t, + z0 = svrbit_s64_m (z0, p0, z1), + z0 = svrbit_m (z0, p0, z1)) + +/* +** rbit_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** rbit z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rbit_s64_m_tied2, svint64_t, + z0 = svrbit_s64_m (z1, p0, z0), + z0 = svrbit_m (z1, p0, z0)) + +/* +** rbit_s64_m_untied: +** movprfx z0, z2 +** rbit z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_s64_m_untied, svint64_t, + z0 = svrbit_s64_m (z2, p0, z1), + z0 = svrbit_m (z2, p0, z1)) + +/* +** rbit_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** rbit z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rbit_s64_z_tied1, svint64_t, + z0 = svrbit_s64_z (p0, z0), + z0 = svrbit_z (p0, z0)) + +/* +** rbit_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** rbit z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_s64_z_untied, svint64_t, + z0 = svrbit_s64_z (p0, z1), + z0 = svrbit_z (p0, z1)) + +/* +** rbit_s64_x_tied1: +** rbit z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_s64_x_tied1, svint64_t, + z0 = svrbit_s64_x (p0, z0), + z0 = svrbit_x (p0, z0)) + +/* +** rbit_s64_x_untied: +** rbit z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_s64_x_untied, svint64_t, + z0 = svrbit_s64_x (p0, z1), + z0 = svrbit_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s8.c new file mode 100644 index 000000000..9c152116a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_s8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rbit_s8_m_tied12: +** rbit z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_s8_m_tied12, svint8_t, + z0 = svrbit_s8_m (z0, p0, z0), + z0 = svrbit_m (z0, p0, z0)) + +/* +** rbit_s8_m_tied1: +** rbit z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_s8_m_tied1, svint8_t, + z0 = svrbit_s8_m (z0, p0, z1), + z0 = svrbit_m (z0, p0, z1)) + +/* +** rbit_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** rbit z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_s8_m_tied2, svint8_t, + z0 = svrbit_s8_m (z1, p0, z0), + z0 = svrbit_m (z1, p0, z0)) + +/* +** rbit_s8_m_untied: +** movprfx z0, z2 +** rbit z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_s8_m_untied, svint8_t, + z0 = svrbit_s8_m (z2, p0, z1), + z0 = svrbit_m (z2, p0, z1)) + +/* +** rbit_s8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** rbit z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_s8_z_tied1, svint8_t, + z0 = svrbit_s8_z (p0, z0), + z0 = 
svrbit_z (p0, z0)) + +/* +** rbit_s8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** rbit z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_s8_z_untied, svint8_t, + z0 = svrbit_s8_z (p0, z1), + z0 = svrbit_z (p0, z1)) + +/* +** rbit_s8_x_tied1: +** rbit z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_s8_x_tied1, svint8_t, + z0 = svrbit_s8_x (p0, z0), + z0 = svrbit_x (p0, z0)) + +/* +** rbit_s8_x_untied: +** rbit z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_s8_x_untied, svint8_t, + z0 = svrbit_s8_x (p0, z1), + z0 = svrbit_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u16.c new file mode 100644 index 000000000..001ef2bf0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rbit_u16_m_tied12: +** rbit z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_u16_m_tied12, svuint16_t, + z0 = svrbit_u16_m (z0, p0, z0), + z0 = svrbit_m (z0, p0, z0)) + +/* +** rbit_u16_m_tied1: +** rbit z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_u16_m_tied1, svuint16_t, + z0 = svrbit_u16_m (z0, p0, z1), + z0 = svrbit_m (z0, p0, z1)) + +/* +** rbit_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** rbit z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_u16_m_tied2, svuint16_t, + z0 = svrbit_u16_m (z1, p0, z0), + z0 = svrbit_m (z1, p0, z0)) + +/* +** rbit_u16_m_untied: +** movprfx z0, z2 +** rbit z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_u16_m_untied, svuint16_t, + z0 = svrbit_u16_m (z2, p0, z1), + z0 = svrbit_m (z2, p0, z1)) + +/* +** rbit_u16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** rbit z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_u16_z_tied1, svuint16_t, + z0 = svrbit_u16_z (p0, z0), + z0 = svrbit_z (p0, z0)) + +/* +** rbit_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** rbit z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_u16_z_untied, svuint16_t, + z0 = svrbit_u16_z (p0, z1), + z0 = svrbit_z (p0, z1)) + +/* +** rbit_u16_x_tied1: +** rbit z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_u16_x_tied1, svuint16_t, + z0 = svrbit_u16_x (p0, z0), + z0 = svrbit_x (p0, z0)) + +/* +** rbit_u16_x_untied: +** rbit z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rbit_u16_x_untied, svuint16_t, + z0 = svrbit_u16_x (p0, z1), + z0 = svrbit_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u32.c new file mode 100644 index 000000000..4d91e954d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rbit_u32_m_tied12: +** rbit z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_u32_m_tied12, svuint32_t, + z0 = svrbit_u32_m (z0, p0, z0), + z0 = svrbit_m (z0, p0, z0)) + +/* +** rbit_u32_m_tied1: +** rbit z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_u32_m_tied1, svuint32_t, + z0 = svrbit_u32_m (z0, p0, z1), + z0 = svrbit_m (z0, p0, z1)) + +/* +** rbit_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** rbit z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_u32_m_tied2, svuint32_t, + z0 = svrbit_u32_m (z1, p0, z0), + z0 = svrbit_m (z1, p0, z0)) + +/* +** rbit_u32_m_untied: +** movprfx z0, z2 +** rbit z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z 
(rbit_u32_m_untied, svuint32_t, + z0 = svrbit_u32_m (z2, p0, z1), + z0 = svrbit_m (z2, p0, z1)) + +/* +** rbit_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** rbit z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_u32_z_tied1, svuint32_t, + z0 = svrbit_u32_z (p0, z0), + z0 = svrbit_z (p0, z0)) + +/* +** rbit_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** rbit z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_u32_z_untied, svuint32_t, + z0 = svrbit_u32_z (p0, z1), + z0 = svrbit_z (p0, z1)) + +/* +** rbit_u32_x_tied1: +** rbit z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_u32_x_tied1, svuint32_t, + z0 = svrbit_u32_x (p0, z0), + z0 = svrbit_x (p0, z0)) + +/* +** rbit_u32_x_untied: +** rbit z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rbit_u32_x_untied, svuint32_t, + z0 = svrbit_u32_x (p0, z1), + z0 = svrbit_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u64.c new file mode 100644 index 000000000..77f88d116 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rbit_u64_m_tied12: +** rbit z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_u64_m_tied12, svuint64_t, + z0 = svrbit_u64_m (z0, p0, z0), + z0 = svrbit_m (z0, p0, z0)) + +/* +** rbit_u64_m_tied1: +** rbit z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_u64_m_tied1, svuint64_t, + z0 = svrbit_u64_m (z0, p0, z1), + z0 = svrbit_m (z0, p0, z1)) + +/* +** rbit_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** rbit z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rbit_u64_m_tied2, svuint64_t, + z0 = svrbit_u64_m (z1, p0, z0), + z0 = svrbit_m (z1, p0, z0)) + +/* +** rbit_u64_m_untied: +** movprfx z0, z2 +** rbit z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_u64_m_untied, svuint64_t, + z0 = svrbit_u64_m (z2, p0, z1), + z0 = svrbit_m (z2, p0, z1)) + +/* +** rbit_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** rbit z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rbit_u64_z_tied1, svuint64_t, + z0 = svrbit_u64_z (p0, z0), + z0 = svrbit_z (p0, z0)) + +/* +** rbit_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** rbit z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_u64_z_untied, svuint64_t, + z0 = svrbit_u64_z (p0, z1), + z0 = svrbit_z (p0, z1)) + +/* +** rbit_u64_x_tied1: +** rbit z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_u64_x_tied1, svuint64_t, + z0 = svrbit_u64_x (p0, z0), + z0 = svrbit_x (p0, z0)) + +/* +** rbit_u64_x_untied: +** rbit z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rbit_u64_x_untied, svuint64_t, + z0 = svrbit_u64_x (p0, z1), + z0 = svrbit_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u8.c new file mode 100644 index 000000000..fa347e4c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rbit_u8.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rbit_u8_m_tied12: +** rbit z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_u8_m_tied12, svuint8_t, + z0 = svrbit_u8_m (z0, p0, z0), + z0 = svrbit_m (z0, p0, z0)) + +/* +** rbit_u8_m_tied1: +** rbit z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_u8_m_tied1, svuint8_t, + z0 = svrbit_u8_m (z0, p0, z1), + z0 = svrbit_m (z0, p0, z1)) + +/* +** rbit_u8_m_tied2: +** mov 
(z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** rbit z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_u8_m_tied2, svuint8_t, + z0 = svrbit_u8_m (z1, p0, z0), + z0 = svrbit_m (z1, p0, z0)) + +/* +** rbit_u8_m_untied: +** movprfx z0, z2 +** rbit z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_u8_m_untied, svuint8_t, + z0 = svrbit_u8_m (z2, p0, z1), + z0 = svrbit_m (z2, p0, z1)) + +/* +** rbit_u8_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.b, p0/z, \1\.b +** rbit z0\.b, p0/m, \1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_u8_z_tied1, svuint8_t, + z0 = svrbit_u8_z (p0, z0), + z0 = svrbit_z (p0, z0)) + +/* +** rbit_u8_z_untied: +** movprfx z0\.b, p0/z, z1\.b +** rbit z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_u8_z_untied, svuint8_t, + z0 = svrbit_u8_z (p0, z1), + z0 = svrbit_z (p0, z1)) + +/* +** rbit_u8_x_tied1: +** rbit z0\.b, p0/m, z0\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_u8_x_tied1, svuint8_t, + z0 = svrbit_u8_x (p0, z0), + z0 = svrbit_x (p0, z0)) + +/* +** rbit_u8_x_untied: +** rbit z0\.b, p0/m, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rbit_u8_x_untied, svuint8_t, + z0 = svrbit_u8_x (p0, z1), + z0 = svrbit_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rdffr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rdffr_1.c new file mode 100644 index 000000000..5564e967f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rdffr_1.c @@ -0,0 +1,59 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** setffr_rdffr_1: +** ptrue p0\.b, all +** ret +*/ +TEST_UNIFORM_P_SINGLE (setffr_rdffr_1, + svsetffr (); + p0 = svrdffr ()); + +/* +** setffr_rdffr_2: +** ret +*/ +TEST_UNIFORM_P_SINGLE (setffr_rdffr_2, + svsetffr (); + svrdffr ()); + +/* +** setffr_rdffr_3: +** ptrue p0\.b, all +** ret +*/ +TEST_UNIFORM_P_SINGLE (setffr_rdffr_3, + svsetffr (); + svsetffr (); + svrdffr (); + p0 = svrdffr ()); + +/* +** wrffr_rdffr_1: +** mov p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P_SINGLE (wrffr_rdffr_1, + svwrffr (p1); + p0 = svrdffr ()); + +/* +** wrffr_rdffr_2: +** ret +*/ +TEST_UNIFORM_P_SINGLE (wrffr_rdffr_2, + svwrffr (p1); + svrdffr ()); + +/* +** wrffr_rdffr_3: +** mov p0\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P_SINGLE (wrffr_rdffr_3, + svwrffr (p1); + svwrffr (p2); + svrdffr (); + p0 = svrdffr ()); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f16.c new file mode 100644 index 000000000..d0cd8281a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recpe_f16_tied1: +** frecpe z0\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (recpe_f16_tied1, svfloat16_t, + z0 = svrecpe_f16 (z0), + z0 = svrecpe (z0)) + +/* +** recpe_f16_untied: +** frecpe z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (recpe_f16_untied, svfloat16_t, + z0 = svrecpe_f16 (z1), + z0 = svrecpe (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f32.c new file mode 100644 index 000000000..013ed8c43 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recpe_f32_tied1: +** frecpe z0\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (recpe_f32_tied1, svfloat32_t, + z0 = svrecpe_f32 (z0), + z0 = svrecpe (z0)) + +/* +** 
recpe_f32_untied: +** frecpe z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (recpe_f32_untied, svfloat32_t, + z0 = svrecpe_f32 (z1), + z0 = svrecpe (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f64.c new file mode 100644 index 000000000..40b3df292 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpe_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recpe_f64_tied1: +** frecpe z0\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (recpe_f64_tied1, svfloat64_t, + z0 = svrecpe_f64 (z0), + z0 = svrecpe (z0)) + +/* +** recpe_f64_untied: +** frecpe z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (recpe_f64_untied, svfloat64_t, + z0 = svrecpe_f64 (z1), + z0 = svrecpe (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f16.c new file mode 100644 index 000000000..e35c5c545 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recps_f16_tied1: +** frecps z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (recps_f16_tied1, svfloat16_t, + z0 = svrecps_f16 (z0, z1), + z0 = svrecps (z0, z1)) + +/* +** recps_f16_tied2: +** frecps z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (recps_f16_tied2, svfloat16_t, + z0 = svrecps_f16 (z1, z0), + z0 = svrecps (z1, z0)) + +/* +** recps_f16_untied: +** frecps z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (recps_f16_untied, svfloat16_t, + z0 = svrecps_f16 (z1, z2), + z0 = svrecps (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f32.c new file mode 100644 index 000000000..3f3aa203e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recps_f32_tied1: +** frecps z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (recps_f32_tied1, svfloat32_t, + z0 = svrecps_f32 (z0, z1), + z0 = svrecps (z0, z1)) + +/* +** recps_f32_tied2: +** frecps z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (recps_f32_tied2, svfloat32_t, + z0 = svrecps_f32 (z1, z0), + z0 = svrecps (z1, z0)) + +/* +** recps_f32_untied: +** frecps z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (recps_f32_untied, svfloat32_t, + z0 = svrecps_f32 (z1, z2), + z0 = svrecps (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f64.c new file mode 100644 index 000000000..eca421d5e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recps_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recps_f64_tied1: +** frecps z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (recps_f64_tied1, svfloat64_t, + z0 = svrecps_f64 (z0, z1), + z0 = svrecps (z0, z1)) + +/* +** recps_f64_tied2: +** frecps z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (recps_f64_tied2, svfloat64_t, + z0 = svrecps_f64 (z1, z0), + z0 = svrecps (z1, z0)) + +/* +** recps_f64_untied: +** frecps z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (recps_f64_untied, svfloat64_t, + z0 = svrecps_f64 (z1, z2), + z0 = svrecps (z1, z2)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f16.c new file mode 100644 index 000000000..2dd7ada2c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recpx_f16_m_tied12: +** frecpx z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (recpx_f16_m_tied12, svfloat16_t, + z0 = svrecpx_f16_m (z0, p0, z0), + z0 = svrecpx_m (z0, p0, z0)) + +/* +** recpx_f16_m_tied1: +** frecpx z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (recpx_f16_m_tied1, svfloat16_t, + z0 = svrecpx_f16_m (z0, p0, z1), + z0 = svrecpx_m (z0, p0, z1)) + +/* +** recpx_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frecpx z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (recpx_f16_m_tied2, svfloat16_t, + z0 = svrecpx_f16_m (z1, p0, z0), + z0 = svrecpx_m (z1, p0, z0)) + +/* +** recpx_f16_m_untied: +** movprfx z0, z2 +** frecpx z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (recpx_f16_m_untied, svfloat16_t, + z0 = svrecpx_f16_m (z2, p0, z1), + z0 = svrecpx_m (z2, p0, z1)) + +/* +** recpx_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** frecpx z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (recpx_f16_z_tied1, svfloat16_t, + z0 = svrecpx_f16_z (p0, z0), + z0 = svrecpx_z (p0, z0)) + +/* +** recpx_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** frecpx z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (recpx_f16_z_untied, svfloat16_t, + z0 = svrecpx_f16_z (p0, z1), + z0 = svrecpx_z (p0, z1)) + +/* +** recpx_f16_x_tied1: +** frecpx z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (recpx_f16_x_tied1, svfloat16_t, + z0 = svrecpx_f16_x (p0, z0), + z0 = svrecpx_x (p0, z0)) + +/* +** recpx_f16_x_untied: +** frecpx z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (recpx_f16_x_untied, svfloat16_t, + z0 = svrecpx_f16_x (p0, z1), + z0 = svrecpx_x (p0, z1)) + +/* +** ptrue_recpx_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_recpx_f16_x_tied1, svfloat16_t, + z0 = svrecpx_f16_x (svptrue_b16 (), z0), + z0 = svrecpx_x (svptrue_b16 (), z0)) + +/* +** ptrue_recpx_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_recpx_f16_x_untied, svfloat16_t, + z0 = svrecpx_f16_x (svptrue_b16 (), z1), + z0 = svrecpx_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f32.c new file mode 100644 index 000000000..6364fb83b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recpx_f32_m_tied12: +** frecpx z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (recpx_f32_m_tied12, svfloat32_t, + z0 = svrecpx_f32_m (z0, p0, z0), + z0 = svrecpx_m (z0, p0, z0)) + +/* +** recpx_f32_m_tied1: +** frecpx z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (recpx_f32_m_tied1, svfloat32_t, + z0 = svrecpx_f32_m (z0, p0, z1), + z0 = svrecpx_m (z0, p0, z1)) + +/* +** recpx_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frecpx z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (recpx_f32_m_tied2, svfloat32_t, + z0 = svrecpx_f32_m (z1, p0, z0), + z0 = svrecpx_m (z1, p0, z0)) + +/* +** recpx_f32_m_untied: +** movprfx z0, z2 +** frecpx z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (recpx_f32_m_untied, svfloat32_t, + z0 = svrecpx_f32_m (z2, p0, z1), + z0 = svrecpx_m (z2, p0, z1)) + +/* +** recpx_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** frecpx z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (recpx_f32_z_tied1, svfloat32_t, + z0 = svrecpx_f32_z (p0, z0), + z0 = svrecpx_z (p0, z0)) + +/* +** recpx_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** frecpx z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (recpx_f32_z_untied, svfloat32_t, + z0 = svrecpx_f32_z (p0, z1), + z0 = svrecpx_z (p0, z1)) + +/* +** recpx_f32_x_tied1: +** frecpx z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (recpx_f32_x_tied1, svfloat32_t, + z0 = svrecpx_f32_x (p0, z0), + z0 = svrecpx_x (p0, z0)) + +/* +** recpx_f32_x_untied: +** frecpx z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (recpx_f32_x_untied, svfloat32_t, + z0 = svrecpx_f32_x (p0, z1), + z0 = svrecpx_x (p0, z1)) + +/* +** ptrue_recpx_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_recpx_f32_x_tied1, svfloat32_t, + z0 = svrecpx_f32_x (svptrue_b32 (), z0), + z0 = svrecpx_x (svptrue_b32 (), z0)) + +/* +** ptrue_recpx_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_recpx_f32_x_untied, svfloat32_t, + z0 = svrecpx_f32_x (svptrue_b32 (), z1), + z0 = svrecpx_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f64.c new file mode 100644 index 000000000..ca5232331 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/recpx_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** recpx_f64_m_tied12: +** frecpx z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (recpx_f64_m_tied12, svfloat64_t, + z0 = svrecpx_f64_m (z0, p0, z0), + z0 = svrecpx_m (z0, p0, z0)) + +/* +** recpx_f64_m_tied1: +** frecpx z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (recpx_f64_m_tied1, svfloat64_t, + z0 = svrecpx_f64_m (z0, p0, z1), + z0 = svrecpx_m (z0, p0, z1)) + +/* +** recpx_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** frecpx z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (recpx_f64_m_tied2, svfloat64_t, + z0 = svrecpx_f64_m (z1, p0, z0), + z0 = svrecpx_m (z1, p0, z0)) + +/* +** recpx_f64_m_untied: +** movprfx z0, z2 +** frecpx z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (recpx_f64_m_untied, svfloat64_t, + z0 = svrecpx_f64_m (z2, p0, z1), + z0 = svrecpx_m (z2, p0, z1)) + +/* +** recpx_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** frecpx z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (recpx_f64_z_tied1, svfloat64_t, + z0 = svrecpx_f64_z (p0, z0), + z0 = svrecpx_z (p0, z0)) + +/* +** recpx_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** frecpx z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (recpx_f64_z_untied, svfloat64_t, + z0 = svrecpx_f64_z (p0, z1), + z0 = svrecpx_z (p0, z1)) + +/* +** recpx_f64_x_tied1: +** frecpx z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (recpx_f64_x_tied1, svfloat64_t, + z0 = svrecpx_f64_x (p0, z0), + z0 = svrecpx_x (p0, z0)) + +/* +** recpx_f64_x_untied: +** frecpx z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (recpx_f64_x_untied, svfloat64_t, + z0 = svrecpx_f64_x (p0, z1), + z0 = svrecpx_x (p0, z1)) + +/* +** ptrue_recpx_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_recpx_f64_x_tied1, svfloat64_t, + z0 = svrecpx_f64_x (svptrue_b64 (), z0), + z0 = svrecpx_x (svptrue_b64 (), z0)) + +/* +** ptrue_recpx_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_recpx_f64_x_untied, svfloat64_t, + z0 = svrecpx_f64_x (svptrue_b64 (), z1), + z0 = svrecpx_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_bf16.c new file mode 100644 index 000000000..2d2c2a714 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_bf16.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_bf16_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_bf16_tied1, svbfloat16_t, svbfloat16_t, + z0_res = svreinterpret_bf16_bf16 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_bf16_untied, svbfloat16_t, svbfloat16_t, + z0 = svreinterpret_bf16_bf16 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_f16_tied1, svbfloat16_t, svfloat16_t, + z0_res = svreinterpret_bf16_f16 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_f16_untied, svbfloat16_t, svfloat16_t, + z0 = svreinterpret_bf16_f16 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_f32_tied1, svbfloat16_t, svfloat32_t, + z0_res = svreinterpret_bf16_f32 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_f32_untied, svbfloat16_t, svfloat32_t, + z0 = svreinterpret_bf16_f32 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_f64_tied1, svbfloat16_t, svfloat64_t, + z0_res = svreinterpret_bf16_f64 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_f64_untied, svbfloat16_t, svfloat64_t, + z0 = svreinterpret_bf16_f64 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_s8_tied1, svbfloat16_t, svint8_t, + z0_res = svreinterpret_bf16_s8 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_s8_untied, svbfloat16_t, svint8_t, + z0 = svreinterpret_bf16_s8 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_s16_tied1, svbfloat16_t, svint16_t, + z0_res = svreinterpret_bf16_s16 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_s16_untied, svbfloat16_t, svint16_t, + z0 = svreinterpret_bf16_s16 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_s32_tied1, svbfloat16_t, svint32_t, + z0_res = svreinterpret_bf16_s32 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_s32_untied, svbfloat16_t, svint32_t, + z0 = svreinterpret_bf16_s32 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_s64_tied1, svbfloat16_t, svint64_t, + z0_res = svreinterpret_bf16_s64 
(z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_s64_untied, svbfloat16_t, svint64_t, + z0 = svreinterpret_bf16_s64 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_u8_tied1, svbfloat16_t, svuint8_t, + z0_res = svreinterpret_bf16_u8 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_u8_untied, svbfloat16_t, svuint8_t, + z0 = svreinterpret_bf16_u8 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_u16_tied1, svbfloat16_t, svuint16_t, + z0_res = svreinterpret_bf16_u16 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_u16_untied, svbfloat16_t, svuint16_t, + z0 = svreinterpret_bf16_u16 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_u32_tied1, svbfloat16_t, svuint32_t, + z0_res = svreinterpret_bf16_u32 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_u32_untied, svbfloat16_t, svuint32_t, + z0 = svreinterpret_bf16_u32 (z4), + z0 = svreinterpret_bf16 (z4)) + +/* +** reinterpret_bf16_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_bf16_u64_tied1, svbfloat16_t, svuint64_t, + z0_res = svreinterpret_bf16_u64 (z0), + z0_res = svreinterpret_bf16 (z0)) + +/* +** reinterpret_bf16_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_bf16_u64_untied, svbfloat16_t, svuint64_t, + z0 = svreinterpret_bf16_u64 (z4), + z0 = svreinterpret_bf16 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f16.c new file mode 100644 index 000000000..60705e628 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f16.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_f16_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_bf16_tied1, svfloat16_t, svbfloat16_t, + z0_res = svreinterpret_f16_bf16 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_bf16_untied, svfloat16_t, svbfloat16_t, + z0 = svreinterpret_f16_bf16 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_f16_tied1, svfloat16_t, svfloat16_t, + z0_res = svreinterpret_f16_f16 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_f16_untied, svfloat16_t, svfloat16_t, + z0 = svreinterpret_f16_f16 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_f32_tied1, svfloat16_t, svfloat32_t, + z0_res = svreinterpret_f16_f32 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_f32_untied, svfloat16_t, svfloat32_t, + z0 = svreinterpret_f16_f32 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV 
(reinterpret_f16_f64_tied1, svfloat16_t, svfloat64_t, + z0_res = svreinterpret_f16_f64 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_f64_untied, svfloat16_t, svfloat64_t, + z0 = svreinterpret_f16_f64 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_s8_tied1, svfloat16_t, svint8_t, + z0_res = svreinterpret_f16_s8 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_s8_untied, svfloat16_t, svint8_t, + z0 = svreinterpret_f16_s8 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_s16_tied1, svfloat16_t, svint16_t, + z0_res = svreinterpret_f16_s16 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_s16_untied, svfloat16_t, svint16_t, + z0 = svreinterpret_f16_s16 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_s32_tied1, svfloat16_t, svint32_t, + z0_res = svreinterpret_f16_s32 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_s32_untied, svfloat16_t, svint32_t, + z0 = svreinterpret_f16_s32 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_s64_tied1, svfloat16_t, svint64_t, + z0_res = svreinterpret_f16_s64 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_s64_untied, svfloat16_t, svint64_t, + z0 = svreinterpret_f16_s64 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_u8_tied1, svfloat16_t, svuint8_t, + z0_res = svreinterpret_f16_u8 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_u8_untied, svfloat16_t, svuint8_t, + z0 = svreinterpret_f16_u8 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_u16_tied1, svfloat16_t, svuint16_t, + z0_res = svreinterpret_f16_u16 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_u16_untied, svfloat16_t, svuint16_t, + z0 = svreinterpret_f16_u16 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_u32_tied1, svfloat16_t, svuint32_t, + z0_res = svreinterpret_f16_u32 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_u32_untied, svfloat16_t, svuint32_t, + z0 = svreinterpret_f16_u32 (z4), + z0 = svreinterpret_f16 (z4)) + +/* +** reinterpret_f16_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f16_u64_tied1, svfloat16_t, svuint64_t, + z0_res = svreinterpret_f16_u64 (z0), + z0_res = svreinterpret_f16 (z0)) + +/* +** reinterpret_f16_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f16_u64_untied, svfloat16_t, svuint64_t, + z0 = svreinterpret_f16_u64 (z4), + z0 = svreinterpret_f16 (z4)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f32.c new file mode 100644 index 000000000..06fc46f25 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f32.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_f32_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_bf16_tied1, svfloat32_t, svbfloat16_t, + z0_res = svreinterpret_f32_bf16 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_bf16_untied, svfloat32_t, svbfloat16_t, + z0 = svreinterpret_f32_bf16 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_f16_tied1, svfloat32_t, svfloat16_t, + z0_res = svreinterpret_f32_f16 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_f16_untied, svfloat32_t, svfloat16_t, + z0 = svreinterpret_f32_f16 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_f32_tied1, svfloat32_t, svfloat32_t, + z0_res = svreinterpret_f32_f32 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_f32_untied, svfloat32_t, svfloat32_t, + z0 = svreinterpret_f32_f32 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_f64_tied1, svfloat32_t, svfloat64_t, + z0_res = svreinterpret_f32_f64 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_f64_untied, svfloat32_t, svfloat64_t, + z0 = svreinterpret_f32_f64 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_s8_tied1, svfloat32_t, svint8_t, + z0_res = svreinterpret_f32_s8 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_s8_untied, svfloat32_t, svint8_t, + z0 = svreinterpret_f32_s8 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_s16_tied1, svfloat32_t, svint16_t, + z0_res = svreinterpret_f32_s16 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_s16_untied, svfloat32_t, svint16_t, + z0 = svreinterpret_f32_s16 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_s32_tied1, svfloat32_t, svint32_t, + z0_res = svreinterpret_f32_s32 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_s32_untied, svfloat32_t, svint32_t, + z0 = svreinterpret_f32_s32 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_s64_tied1, svfloat32_t, svint64_t, + z0_res = svreinterpret_f32_s64 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_s64_untied, svfloat32_t, svint64_t, + z0 = svreinterpret_f32_s64 (z4), + z0 = svreinterpret_f32 
(z4)) + +/* +** reinterpret_f32_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_u8_tied1, svfloat32_t, svuint8_t, + z0_res = svreinterpret_f32_u8 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_u8_untied, svfloat32_t, svuint8_t, + z0 = svreinterpret_f32_u8 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_u16_tied1, svfloat32_t, svuint16_t, + z0_res = svreinterpret_f32_u16 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_u16_untied, svfloat32_t, svuint16_t, + z0 = svreinterpret_f32_u16 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_u32_tied1, svfloat32_t, svuint32_t, + z0_res = svreinterpret_f32_u32 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_u32_untied, svfloat32_t, svuint32_t, + z0 = svreinterpret_f32_u32 (z4), + z0 = svreinterpret_f32 (z4)) + +/* +** reinterpret_f32_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f32_u64_tied1, svfloat32_t, svuint64_t, + z0_res = svreinterpret_f32_u64 (z0), + z0_res = svreinterpret_f32 (z0)) + +/* +** reinterpret_f32_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f32_u64_untied, svfloat32_t, svuint64_t, + z0 = svreinterpret_f32_u64 (z4), + z0 = svreinterpret_f32 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f64.c new file mode 100644 index 000000000..003ee3fe2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_f64.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_f64_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_bf16_tied1, svfloat64_t, svbfloat16_t, + z0_res = svreinterpret_f64_bf16 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_bf16_untied, svfloat64_t, svbfloat16_t, + z0 = svreinterpret_f64_bf16 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_f16_tied1, svfloat64_t, svfloat16_t, + z0_res = svreinterpret_f64_f16 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_f16_untied, svfloat64_t, svfloat16_t, + z0 = svreinterpret_f64_f16 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_f32_tied1, svfloat64_t, svfloat32_t, + z0_res = svreinterpret_f64_f32 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_f32_untied, svfloat64_t, svfloat32_t, + z0 = svreinterpret_f64_f32 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_f64_tied1, svfloat64_t, svfloat64_t, + z0_res = svreinterpret_f64_f64 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_f64_untied, svfloat64_t, svfloat64_t, + z0 = svreinterpret_f64_f64 (z4), + z0 
= svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_s8_tied1, svfloat64_t, svint8_t, + z0_res = svreinterpret_f64_s8 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_s8_untied, svfloat64_t, svint8_t, + z0 = svreinterpret_f64_s8 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_s16_tied1, svfloat64_t, svint16_t, + z0_res = svreinterpret_f64_s16 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_s16_untied, svfloat64_t, svint16_t, + z0 = svreinterpret_f64_s16 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_s32_tied1, svfloat64_t, svint32_t, + z0_res = svreinterpret_f64_s32 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_s32_untied, svfloat64_t, svint32_t, + z0 = svreinterpret_f64_s32 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_s64_tied1, svfloat64_t, svint64_t, + z0_res = svreinterpret_f64_s64 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_s64_untied, svfloat64_t, svint64_t, + z0 = svreinterpret_f64_s64 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_u8_tied1, svfloat64_t, svuint8_t, + z0_res = svreinterpret_f64_u8 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_u8_untied, svfloat64_t, svuint8_t, + z0 = svreinterpret_f64_u8 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_u16_tied1, svfloat64_t, svuint16_t, + z0_res = svreinterpret_f64_u16 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_u16_untied, svfloat64_t, svuint16_t, + z0 = svreinterpret_f64_u16 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_u32_tied1, svfloat64_t, svuint32_t, + z0_res = svreinterpret_f64_u32 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_u32_untied, svfloat64_t, svuint32_t, + z0 = svreinterpret_f64_u32 (z4), + z0 = svreinterpret_f64 (z4)) + +/* +** reinterpret_f64_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_f64_u64_tied1, svfloat64_t, svuint64_t, + z0_res = svreinterpret_f64_u64 (z0), + z0_res = svreinterpret_f64 (z0)) + +/* +** reinterpret_f64_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_f64_u64_untied, svfloat64_t, svuint64_t, + z0 = svreinterpret_f64_u64 (z4), + z0 = svreinterpret_f64 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s16.c new file mode 100644 index 000000000..d62817c2c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s16.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include 
"test_sve_acle.h" + +/* +** reinterpret_s16_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_bf16_tied1, svint16_t, svbfloat16_t, + z0_res = svreinterpret_s16_bf16 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_bf16_untied, svint16_t, svbfloat16_t, + z0 = svreinterpret_s16_bf16 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_f16_tied1, svint16_t, svfloat16_t, + z0_res = svreinterpret_s16_f16 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_f16_untied, svint16_t, svfloat16_t, + z0 = svreinterpret_s16_f16 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_f32_tied1, svint16_t, svfloat32_t, + z0_res = svreinterpret_s16_f32 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_f32_untied, svint16_t, svfloat32_t, + z0 = svreinterpret_s16_f32 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_f64_tied1, svint16_t, svfloat64_t, + z0_res = svreinterpret_s16_f64 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_f64_untied, svint16_t, svfloat64_t, + z0 = svreinterpret_s16_f64 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_s8_tied1, svint16_t, svint8_t, + z0_res = svreinterpret_s16_s8 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_s8_untied, svint16_t, svint8_t, + z0 = svreinterpret_s16_s8 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_s16_tied1, svint16_t, svint16_t, + z0_res = svreinterpret_s16_s16 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_s16_untied, svint16_t, svint16_t, + z0 = svreinterpret_s16_s16 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_s32_tied1, svint16_t, svint32_t, + z0_res = svreinterpret_s16_s32 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_s32_untied, svint16_t, svint32_t, + z0 = svreinterpret_s16_s32 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_s64_tied1, svint16_t, svint64_t, + z0_res = svreinterpret_s16_s64 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_s64_untied, svint16_t, svint64_t, + z0 = svreinterpret_s16_s64 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_u8_tied1, svint16_t, svuint8_t, + z0_res = svreinterpret_s16_u8 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_u8_untied, svint16_t, svuint8_t, + z0 = svreinterpret_s16_u8 (z4), + z0 = svreinterpret_s16 (z4)) + +/* 
+** reinterpret_s16_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_u16_tied1, svint16_t, svuint16_t, + z0_res = svreinterpret_s16_u16 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_u16_untied, svint16_t, svuint16_t, + z0 = svreinterpret_s16_u16 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_u32_tied1, svint16_t, svuint32_t, + z0_res = svreinterpret_s16_u32 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_u32_untied, svint16_t, svuint32_t, + z0 = svreinterpret_s16_u32 (z4), + z0 = svreinterpret_s16 (z4)) + +/* +** reinterpret_s16_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s16_u64_tied1, svint16_t, svuint64_t, + z0_res = svreinterpret_s16_u64 (z0), + z0_res = svreinterpret_s16 (z0)) + +/* +** reinterpret_s16_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s16_u64_untied, svint16_t, svuint64_t, + z0 = svreinterpret_s16_u64 (z4), + z0 = svreinterpret_s16 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s32.c new file mode 100644 index 000000000..e1068f244 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s32.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_s32_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_bf16_tied1, svint32_t, svbfloat16_t, + z0_res = svreinterpret_s32_bf16 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_bf16_untied, svint32_t, svbfloat16_t, + z0 = svreinterpret_s32_bf16 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_f16_tied1, svint32_t, svfloat16_t, + z0_res = svreinterpret_s32_f16 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_f16_untied, svint32_t, svfloat16_t, + z0 = svreinterpret_s32_f16 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_f32_tied1, svint32_t, svfloat32_t, + z0_res = svreinterpret_s32_f32 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_f32_untied, svint32_t, svfloat32_t, + z0 = svreinterpret_s32_f32 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_f64_tied1, svint32_t, svfloat64_t, + z0_res = svreinterpret_s32_f64 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_f64_untied, svint32_t, svfloat64_t, + z0 = svreinterpret_s32_f64 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_s8_tied1, svint32_t, svint8_t, + z0_res = svreinterpret_s32_s8 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_s8_untied, svint32_t, svint8_t, + z0 = svreinterpret_s32_s8 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** 
reinterpret_s32_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_s16_tied1, svint32_t, svint16_t, + z0_res = svreinterpret_s32_s16 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_s16_untied, svint32_t, svint16_t, + z0 = svreinterpret_s32_s16 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_s32_tied1, svint32_t, svint32_t, + z0_res = svreinterpret_s32_s32 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_s32_untied, svint32_t, svint32_t, + z0 = svreinterpret_s32_s32 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_s64_tied1, svint32_t, svint64_t, + z0_res = svreinterpret_s32_s64 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_s64_untied, svint32_t, svint64_t, + z0 = svreinterpret_s32_s64 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_u8_tied1, svint32_t, svuint8_t, + z0_res = svreinterpret_s32_u8 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_u8_untied, svint32_t, svuint8_t, + z0 = svreinterpret_s32_u8 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_u16_tied1, svint32_t, svuint16_t, + z0_res = svreinterpret_s32_u16 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_u16_untied, svint32_t, svuint16_t, + z0 = svreinterpret_s32_u16 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_u32_tied1, svint32_t, svuint32_t, + z0_res = svreinterpret_s32_u32 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_u32_untied, svint32_t, svuint32_t, + z0 = svreinterpret_s32_u32 (z4), + z0 = svreinterpret_s32 (z4)) + +/* +** reinterpret_s32_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s32_u64_tied1, svint32_t, svuint64_t, + z0_res = svreinterpret_s32_u64 (z0), + z0_res = svreinterpret_s32 (z0)) + +/* +** reinterpret_s32_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s32_u64_untied, svint32_t, svuint64_t, + z0 = svreinterpret_s32_u64 (z4), + z0 = svreinterpret_s32 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s64.c new file mode 100644 index 000000000..cada7533c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s64.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_s64_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_bf16_tied1, svint64_t, svbfloat16_t, + z0_res = svreinterpret_s64_bf16 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_bf16_untied, svint64_t, svbfloat16_t, + z0 = svreinterpret_s64_bf16 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** 
reinterpret_s64_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_f16_tied1, svint64_t, svfloat16_t, + z0_res = svreinterpret_s64_f16 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_f16_untied, svint64_t, svfloat16_t, + z0 = svreinterpret_s64_f16 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_f32_tied1, svint64_t, svfloat32_t, + z0_res = svreinterpret_s64_f32 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_f32_untied, svint64_t, svfloat32_t, + z0 = svreinterpret_s64_f32 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_f64_tied1, svint64_t, svfloat64_t, + z0_res = svreinterpret_s64_f64 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_f64_untied, svint64_t, svfloat64_t, + z0 = svreinterpret_s64_f64 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_s8_tied1, svint64_t, svint8_t, + z0_res = svreinterpret_s64_s8 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_s8_untied, svint64_t, svint8_t, + z0 = svreinterpret_s64_s8 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_s16_tied1, svint64_t, svint16_t, + z0_res = svreinterpret_s64_s16 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_s16_untied, svint64_t, svint16_t, + z0 = svreinterpret_s64_s16 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_s32_tied1, svint64_t, svint32_t, + z0_res = svreinterpret_s64_s32 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_s32_untied, svint64_t, svint32_t, + z0 = svreinterpret_s64_s32 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_s64_tied1, svint64_t, svint64_t, + z0_res = svreinterpret_s64_s64 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_s64_untied, svint64_t, svint64_t, + z0 = svreinterpret_s64_s64 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_u8_tied1, svint64_t, svuint8_t, + z0_res = svreinterpret_s64_u8 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_u8_untied, svint64_t, svuint8_t, + z0 = svreinterpret_s64_u8 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_u16_tied1, svint64_t, svuint16_t, + z0_res = svreinterpret_s64_u16 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_u16_untied, svint64_t, svuint16_t, + z0 = svreinterpret_s64_u16 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_u32_tied1: +** ret 
+*/ +TEST_DUAL_Z_REV (reinterpret_s64_u32_tied1, svint64_t, svuint32_t, + z0_res = svreinterpret_s64_u32 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_u32_untied, svint64_t, svuint32_t, + z0 = svreinterpret_s64_u32 (z4), + z0 = svreinterpret_s64 (z4)) + +/* +** reinterpret_s64_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s64_u64_tied1, svint64_t, svuint64_t, + z0_res = svreinterpret_s64_u64 (z0), + z0_res = svreinterpret_s64 (z0)) + +/* +** reinterpret_s64_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s64_u64_untied, svint64_t, svuint64_t, + z0 = svreinterpret_s64_u64 (z4), + z0 = svreinterpret_s64 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s8.c new file mode 100644 index 000000000..23a40d0ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_s8.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_s8_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_bf16_tied1, svint8_t, svbfloat16_t, + z0_res = svreinterpret_s8_bf16 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_bf16_untied, svint8_t, svbfloat16_t, + z0 = svreinterpret_s8_bf16 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_f16_tied1, svint8_t, svfloat16_t, + z0_res = svreinterpret_s8_f16 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_f16_untied, svint8_t, svfloat16_t, + z0 = svreinterpret_s8_f16 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_f32_tied1, svint8_t, svfloat32_t, + z0_res = svreinterpret_s8_f32 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_f32_untied, svint8_t, svfloat32_t, + z0 = svreinterpret_s8_f32 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_f64_tied1, svint8_t, svfloat64_t, + z0_res = svreinterpret_s8_f64 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_f64_untied, svint8_t, svfloat64_t, + z0 = svreinterpret_s8_f64 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_s8_tied1, svint8_t, svint8_t, + z0_res = svreinterpret_s8_s8 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_s8_untied, svint8_t, svint8_t, + z0 = svreinterpret_s8_s8 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_s16_tied1, svint8_t, svint16_t, + z0_res = svreinterpret_s8_s16 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_s16_untied, svint8_t, svint16_t, + z0 = svreinterpret_s8_s16 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_s32_tied1, svint8_t, svint32_t, + z0_res = 
svreinterpret_s8_s32 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_s32_untied, svint8_t, svint32_t, + z0 = svreinterpret_s8_s32 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_s64_tied1, svint8_t, svint64_t, + z0_res = svreinterpret_s8_s64 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_s64_untied, svint8_t, svint64_t, + z0 = svreinterpret_s8_s64 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_u8_tied1, svint8_t, svuint8_t, + z0_res = svreinterpret_s8_u8 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_u8_untied, svint8_t, svuint8_t, + z0 = svreinterpret_s8_u8 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_u16_tied1, svint8_t, svuint16_t, + z0_res = svreinterpret_s8_u16 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_u16_untied, svint8_t, svuint16_t, + z0 = svreinterpret_s8_u16 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_u32_tied1, svint8_t, svuint32_t, + z0_res = svreinterpret_s8_u32 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_u32_untied, svint8_t, svuint32_t, + z0 = svreinterpret_s8_u32 (z4), + z0 = svreinterpret_s8 (z4)) + +/* +** reinterpret_s8_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_s8_u64_tied1, svint8_t, svuint64_t, + z0_res = svreinterpret_s8_u64 (z0), + z0_res = svreinterpret_s8 (z0)) + +/* +** reinterpret_s8_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_s8_u64_untied, svint8_t, svuint64_t, + z0 = svreinterpret_s8_u64 (z4), + z0 = svreinterpret_s8 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u16.c new file mode 100644 index 000000000..48e8ecaff --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u16.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_u16_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_bf16_tied1, svuint16_t, svbfloat16_t, + z0_res = svreinterpret_u16_bf16 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_bf16_untied, svuint16_t, svbfloat16_t, + z0 = svreinterpret_u16_bf16 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_f16_tied1, svuint16_t, svfloat16_t, + z0_res = svreinterpret_u16_f16 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_f16_untied, svuint16_t, svfloat16_t, + z0 = svreinterpret_u16_f16 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_f32_tied1, svuint16_t, svfloat32_t, + z0_res = svreinterpret_u16_f32 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* 
+** reinterpret_u16_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_f32_untied, svuint16_t, svfloat32_t, + z0 = svreinterpret_u16_f32 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_f64_tied1, svuint16_t, svfloat64_t, + z0_res = svreinterpret_u16_f64 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_f64_untied, svuint16_t, svfloat64_t, + z0 = svreinterpret_u16_f64 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_s8_tied1, svuint16_t, svint8_t, + z0_res = svreinterpret_u16_s8 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_s8_untied, svuint16_t, svint8_t, + z0 = svreinterpret_u16_s8 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_s16_tied1, svuint16_t, svint16_t, + z0_res = svreinterpret_u16_s16 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_s16_untied, svuint16_t, svint16_t, + z0 = svreinterpret_u16_s16 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_s32_tied1, svuint16_t, svint32_t, + z0_res = svreinterpret_u16_s32 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_s32_untied, svuint16_t, svint32_t, + z0 = svreinterpret_u16_s32 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_s64_tied1, svuint16_t, svint64_t, + z0_res = svreinterpret_u16_s64 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_s64_untied, svuint16_t, svint64_t, + z0 = svreinterpret_u16_s64 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_u8_tied1, svuint16_t, svuint8_t, + z0_res = svreinterpret_u16_u8 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_u8_untied, svuint16_t, svuint8_t, + z0 = svreinterpret_u16_u8 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_u16_tied1, svuint16_t, svuint16_t, + z0_res = svreinterpret_u16_u16 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_u16_untied, svuint16_t, svuint16_t, + z0 = svreinterpret_u16_u16 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_u32_tied1, svuint16_t, svuint32_t, + z0_res = svreinterpret_u16_u32 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** reinterpret_u16_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_u32_untied, svuint16_t, svuint32_t, + z0 = svreinterpret_u16_u32 (z4), + z0 = svreinterpret_u16 (z4)) + +/* +** reinterpret_u16_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u16_u64_tied1, svuint16_t, svuint64_t, + z0_res = svreinterpret_u16_u64 (z0), + z0_res = svreinterpret_u16 (z0)) + +/* +** 
reinterpret_u16_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u16_u64_untied, svuint16_t, svuint64_t, + z0 = svreinterpret_u16_u64 (z4), + z0 = svreinterpret_u16 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u32.c new file mode 100644 index 000000000..1d4e85712 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u32.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_u32_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_bf16_tied1, svuint32_t, svbfloat16_t, + z0_res = svreinterpret_u32_bf16 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_bf16_untied, svuint32_t, svbfloat16_t, + z0 = svreinterpret_u32_bf16 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_f16_tied1, svuint32_t, svfloat16_t, + z0_res = svreinterpret_u32_f16 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_f16_untied, svuint32_t, svfloat16_t, + z0 = svreinterpret_u32_f16 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_f32_tied1, svuint32_t, svfloat32_t, + z0_res = svreinterpret_u32_f32 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_f32_untied, svuint32_t, svfloat32_t, + z0 = svreinterpret_u32_f32 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_f64_tied1, svuint32_t, svfloat64_t, + z0_res = svreinterpret_u32_f64 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_f64_untied, svuint32_t, svfloat64_t, + z0 = svreinterpret_u32_f64 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_s8_tied1, svuint32_t, svint8_t, + z0_res = svreinterpret_u32_s8 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_s8_untied, svuint32_t, svint8_t, + z0 = svreinterpret_u32_s8 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_s16_tied1, svuint32_t, svint16_t, + z0_res = svreinterpret_u32_s16 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_s16_untied, svuint32_t, svint16_t, + z0 = svreinterpret_u32_s16 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_s32_tied1, svuint32_t, svint32_t, + z0_res = svreinterpret_u32_s32 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_s32_untied, svuint32_t, svint32_t, + z0 = svreinterpret_u32_s32 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_s64_tied1, svuint32_t, svint64_t, + z0_res = svreinterpret_u32_s64 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** 
reinterpret_u32_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_s64_untied, svuint32_t, svint64_t, + z0 = svreinterpret_u32_s64 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_u8_tied1, svuint32_t, svuint8_t, + z0_res = svreinterpret_u32_u8 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_u8_untied, svuint32_t, svuint8_t, + z0 = svreinterpret_u32_u8 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_u16_tied1, svuint32_t, svuint16_t, + z0_res = svreinterpret_u32_u16 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_u16_untied, svuint32_t, svuint16_t, + z0 = svreinterpret_u32_u16 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_u32_tied1, svuint32_t, svuint32_t, + z0_res = svreinterpret_u32_u32 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_u32_untied, svuint32_t, svuint32_t, + z0 = svreinterpret_u32_u32 (z4), + z0 = svreinterpret_u32 (z4)) + +/* +** reinterpret_u32_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u32_u64_tied1, svuint32_t, svuint64_t, + z0_res = svreinterpret_u32_u64 (z0), + z0_res = svreinterpret_u32 (z0)) + +/* +** reinterpret_u32_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u32_u64_untied, svuint32_t, svuint64_t, + z0 = svreinterpret_u32_u64 (z4), + z0 = svreinterpret_u32 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u64.c new file mode 100644 index 000000000..07af69dce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u64.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_u64_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_bf16_tied1, svuint64_t, svbfloat16_t, + z0_res = svreinterpret_u64_bf16 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_bf16_untied, svuint64_t, svbfloat16_t, + z0 = svreinterpret_u64_bf16 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_f16_tied1, svuint64_t, svfloat16_t, + z0_res = svreinterpret_u64_f16 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_f16_untied, svuint64_t, svfloat16_t, + z0 = svreinterpret_u64_f16 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_f32_tied1, svuint64_t, svfloat32_t, + z0_res = svreinterpret_u64_f32 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_f32_untied, svuint64_t, svfloat32_t, + z0 = svreinterpret_u64_f32 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_f64_tied1, svuint64_t, svfloat64_t, + z0_res = svreinterpret_u64_f64 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* 
+** reinterpret_u64_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_f64_untied, svuint64_t, svfloat64_t, + z0 = svreinterpret_u64_f64 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_s8_tied1, svuint64_t, svint8_t, + z0_res = svreinterpret_u64_s8 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_s8_untied, svuint64_t, svint8_t, + z0 = svreinterpret_u64_s8 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_s16_tied1, svuint64_t, svint16_t, + z0_res = svreinterpret_u64_s16 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_s16_untied, svuint64_t, svint16_t, + z0 = svreinterpret_u64_s16 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_s32_tied1, svuint64_t, svint32_t, + z0_res = svreinterpret_u64_s32 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_s32_untied, svuint64_t, svint32_t, + z0 = svreinterpret_u64_s32 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_s64_tied1, svuint64_t, svint64_t, + z0_res = svreinterpret_u64_s64 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_s64_untied, svuint64_t, svint64_t, + z0 = svreinterpret_u64_s64 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_u8_tied1, svuint64_t, svuint8_t, + z0_res = svreinterpret_u64_u8 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_u8_untied, svuint64_t, svuint8_t, + z0 = svreinterpret_u64_u8 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_u16_tied1, svuint64_t, svuint16_t, + z0_res = svreinterpret_u64_u16 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_u16_untied, svuint64_t, svuint16_t, + z0 = svreinterpret_u64_u16 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_u32_tied1, svuint64_t, svuint32_t, + z0_res = svreinterpret_u64_u32 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_u32_untied, svuint64_t, svuint32_t, + z0 = svreinterpret_u64_u32 (z4), + z0 = svreinterpret_u64 (z4)) + +/* +** reinterpret_u64_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u64_u64_tied1, svuint64_t, svuint64_t, + z0_res = svreinterpret_u64_u64 (z0), + z0_res = svreinterpret_u64 (z0)) + +/* +** reinterpret_u64_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u64_u64_untied, svuint64_t, svuint64_t, + z0 = svreinterpret_u64_u64 (z4), + z0 = svreinterpret_u64 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u8.c new file mode 100644 index 000000000..a4c7f4c8d --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/reinterpret_u8.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** reinterpret_u8_bf16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_bf16_tied1, svuint8_t, svbfloat16_t, + z0_res = svreinterpret_u8_bf16 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_bf16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_bf16_untied, svuint8_t, svbfloat16_t, + z0 = svreinterpret_u8_bf16 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_f16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_f16_tied1, svuint8_t, svfloat16_t, + z0_res = svreinterpret_u8_f16 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_f16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_f16_untied, svuint8_t, svfloat16_t, + z0 = svreinterpret_u8_f16 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_f32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_f32_tied1, svuint8_t, svfloat32_t, + z0_res = svreinterpret_u8_f32 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_f32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_f32_untied, svuint8_t, svfloat32_t, + z0 = svreinterpret_u8_f32 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_f64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_f64_tied1, svuint8_t, svfloat64_t, + z0_res = svreinterpret_u8_f64 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_f64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_f64_untied, svuint8_t, svfloat64_t, + z0 = svreinterpret_u8_f64 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_s8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_s8_tied1, svuint8_t, svint8_t, + z0_res = svreinterpret_u8_s8 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_s8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_s8_untied, svuint8_t, svint8_t, + z0 = svreinterpret_u8_s8 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_s16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_s16_tied1, svuint8_t, svint16_t, + z0_res = svreinterpret_u8_s16 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_s16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_s16_untied, svuint8_t, svint16_t, + z0 = svreinterpret_u8_s16 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_s32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_s32_tied1, svuint8_t, svint32_t, + z0_res = svreinterpret_u8_s32 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_s32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_s32_untied, svuint8_t, svint32_t, + z0 = svreinterpret_u8_s32 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_s64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_s64_tied1, svuint8_t, svint64_t, + z0_res = svreinterpret_u8_s64 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_s64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_s64_untied, svuint8_t, svint64_t, + z0 = svreinterpret_u8_s64 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_u8_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_u8_tied1, svuint8_t, svuint8_t, + z0_res = svreinterpret_u8_u8 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_u8_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_u8_untied, 
svuint8_t, svuint8_t, + z0 = svreinterpret_u8_u8 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_u16_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_u16_tied1, svuint8_t, svuint16_t, + z0_res = svreinterpret_u8_u16 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_u16_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_u16_untied, svuint8_t, svuint16_t, + z0 = svreinterpret_u8_u16 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_u32_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_u32_tied1, svuint8_t, svuint32_t, + z0_res = svreinterpret_u8_u32 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_u32_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_u32_untied, svuint8_t, svuint32_t, + z0 = svreinterpret_u8_u32 (z4), + z0 = svreinterpret_u8 (z4)) + +/* +** reinterpret_u8_u64_tied1: +** ret +*/ +TEST_DUAL_Z_REV (reinterpret_u8_u64_tied1, svuint8_t, svuint64_t, + z0_res = svreinterpret_u8_u64 (z0), + z0_res = svreinterpret_u8 (z0)) + +/* +** reinterpret_u8_u64_untied: +** mov z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (reinterpret_u8_u64_untied, svuint8_t, svuint64_t, + z0 = svreinterpret_u8_u64 (z4), + z0 = svreinterpret_u8 (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b16.c new file mode 100644 index 000000000..7d5c67d5c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_b16_tied1: +** rev p0\.h, p0\.h +** ret +*/ +TEST_UNIFORM_P (rev_b16_tied1, + p0 = svrev_b16 (p0), + p0 = svrev_b16 (p0)) + +/* +** rev_b16_untied: +** rev p0\.h, p1\.h +** ret +*/ +TEST_UNIFORM_P (rev_b16_untied, + p0 = svrev_b16 (p1), + p0 = svrev_b16 (p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b32.c new file mode 100644 index 000000000..3f8c810c0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_b32_tied1: +** rev p0\.s, p0\.s +** ret +*/ +TEST_UNIFORM_P (rev_b32_tied1, + p0 = svrev_b32 (p0), + p0 = svrev_b32 (p0)) + +/* +** rev_b32_untied: +** rev p0\.s, p1\.s +** ret +*/ +TEST_UNIFORM_P (rev_b32_untied, + p0 = svrev_b32 (p1), + p0 = svrev_b32 (p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b64.c new file mode 100644 index 000000000..fe937ecc6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_b64_tied1: +** rev p0\.d, p0\.d +** ret +*/ +TEST_UNIFORM_P (rev_b64_tied1, + p0 = svrev_b64 (p0), + p0 = svrev_b64 (p0)) + +/* +** rev_b64_untied: +** rev p0\.d, p1\.d +** ret +*/ +TEST_UNIFORM_P (rev_b64_untied, + p0 = svrev_b64 (p1), + p0 = svrev_b64 (p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b8.c new file mode 100644 index 000000000..d23e50407 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_b8.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_b8_tied1: +** rev 
p0\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (rev_b8_tied1, + p0 = svrev_b8 (p0), + p0 = svrev_b8 (p0)) + +/* +** rev_b8_untied: +** rev p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (rev_b8_untied, + p0 = svrev_b8 (p1), + p0 = svrev_b8 (p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_bf16.c new file mode 100644 index 000000000..fe587d42c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_bf16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_bf16_tied1: +** rev z0\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rev_bf16_tied1, svbfloat16_t, + z0 = svrev_bf16 (z0), + z0 = svrev (z0)) + +/* +** rev_bf16_untied: +** rev z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rev_bf16_untied, svbfloat16_t, + z0 = svrev_bf16 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f16.c new file mode 100644 index 000000000..321e2f900 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_f16_tied1: +** rev z0\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rev_f16_tied1, svfloat16_t, + z0 = svrev_f16 (z0), + z0 = svrev (z0)) + +/* +** rev_f16_untied: +** rev z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rev_f16_untied, svfloat16_t, + z0 = svrev_f16 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f32.c new file mode 100644 index 000000000..6f31928b5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_f32_tied1: +** rev z0\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rev_f32_tied1, svfloat32_t, + z0 = svrev_f32 (z0), + z0 = svrev (z0)) + +/* +** rev_f32_untied: +** rev z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rev_f32_untied, svfloat32_t, + z0 = svrev_f32 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f64.c new file mode 100644 index 000000000..6f14078a7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_f64_tied1: +** rev z0\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rev_f64_tied1, svfloat64_t, + z0 = svrev_f64 (z0), + z0 = svrev (z0)) + +/* +** rev_f64_untied: +** rev z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rev_f64_untied, svfloat64_t, + z0 = svrev_f64 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s16.c new file mode 100644 index 000000000..63f6ea73c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_s16_tied1: +** rev z0\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rev_s16_tied1, svint16_t, + z0 = svrev_s16 (z0), + z0 = svrev (z0)) + +/* +** rev_s16_untied: +** rev z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rev_s16_untied, svint16_t, + z0 = svrev_s16 (z1), + z0 = svrev (z1)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s32.c new file mode 100644 index 000000000..38240b7ec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_s32_tied1: +** rev z0\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rev_s32_tied1, svint32_t, + z0 = svrev_s32 (z0), + z0 = svrev (z0)) + +/* +** rev_s32_untied: +** rev z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rev_s32_untied, svint32_t, + z0 = svrev_s32 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s64.c new file mode 100644 index 000000000..0004e4586 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_s64_tied1: +** rev z0\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rev_s64_tied1, svint64_t, + z0 = svrev_s64 (z0), + z0 = svrev (z0)) + +/* +** rev_s64_untied: +** rev z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rev_s64_untied, svint64_t, + z0 = svrev_s64 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s8.c new file mode 100644 index 000000000..44b874c92 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_s8.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_s8_tied1: +** rev z0\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (rev_s8_tied1, svint8_t, + z0 = svrev_s8 (z0), + z0 = svrev (z0)) + +/* +** rev_s8_untied: +** rev z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rev_s8_untied, svint8_t, + z0 = svrev_s8 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u16.c new file mode 100644 index 000000000..2b4c88854 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_u16_tied1: +** rev z0\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rev_u16_tied1, svuint16_t, + z0 = svrev_u16 (z0), + z0 = svrev (z0)) + +/* +** rev_u16_untied: +** rev z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rev_u16_untied, svuint16_t, + z0 = svrev_u16 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u32.c new file mode 100644 index 000000000..e14351f30 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_u32_tied1: +** rev z0\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rev_u32_tied1, svuint32_t, + z0 = svrev_u32 (z0), + z0 = svrev (z0)) + +/* +** rev_u32_untied: +** rev z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rev_u32_untied, svuint32_t, + z0 = svrev_u32 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u64.c new file mode 100644 index 000000000..5fc987475 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u64.c @@ -0,0 +1,21 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_u64_tied1: +** rev z0\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rev_u64_tied1, svuint64_t, + z0 = svrev_u64 (z0), + z0 = svrev (z0)) + +/* +** rev_u64_untied: +** rev z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rev_u64_untied, svuint64_t, + z0 = svrev_u64 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u8.c new file mode 100644 index 000000000..9dd4f440b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rev_u8.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rev_u8_tied1: +** rev z0\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (rev_u8_tied1, svuint8_t, + z0 = svrev_u8 (z0), + z0 = svrev (z0)) + +/* +** rev_u8_untied: +** rev z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (rev_u8_untied, svuint8_t, + z0 = svrev_u8 (z1), + z0 = svrev (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s16.c new file mode 100644 index 000000000..ecfabe668 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revb_s16_m_tied12: +** revb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (revb_s16_m_tied12, svint16_t, + z0 = svrevb_s16_m (z0, p0, z0), + z0 = svrevb_m (z0, p0, z0)) + +/* +** revb_s16_m_tied1: +** revb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_s16_m_tied1, svint16_t, + z0 = svrevb_s16_m (z0, p0, z1), + z0 = svrevb_m (z0, p0, z1)) + +/* +** revb_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** revb z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_s16_m_tied2, svint16_t, + z0 = svrevb_s16_m (z1, p0, z0), + z0 = svrevb_m (z1, p0, z0)) + +/* +** revb_s16_m_untied: +** movprfx z0, z2 +** revb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_s16_m_untied, svint16_t, + z0 = svrevb_s16_m (z2, p0, z1), + z0 = svrevb_m (z2, p0, z1)) + +/* +** revb_s16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** revb z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_s16_z_tied1, svint16_t, + z0 = svrevb_s16_z (p0, z0), + z0 = svrevb_z (p0, z0)) + +/* +** revb_s16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** revb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_s16_z_untied, svint16_t, + z0 = svrevb_s16_z (p0, z1), + z0 = svrevb_z (p0, z1)) + +/* +** revb_s16_x_tied1: +** revb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (revb_s16_x_tied1, svint16_t, + z0 = svrevb_s16_x (p0, z0), + z0 = svrevb_x (p0, z0)) + +/* +** revb_s16_x_untied: +** revb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_s16_x_untied, svint16_t, + z0 = svrevb_s16_x (p0, z1), + z0 = svrevb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s32.c new file mode 100644 index 000000000..a46a81973 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revb_s32_m_tied12: +** revb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (revb_s32_m_tied12, svint32_t, + z0 = svrevb_s32_m (z0, p0, z0), + z0 = svrevb_m (z0, p0, z0)) + +/* +** revb_s32_m_tied1: +** revb z0\.s, p0/m, z1\.s +** ret 
+*/ +TEST_UNIFORM_Z (revb_s32_m_tied1, svint32_t, + z0 = svrevb_s32_m (z0, p0, z1), + z0 = svrevb_m (z0, p0, z1)) + +/* +** revb_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** revb z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_s32_m_tied2, svint32_t, + z0 = svrevb_s32_m (z1, p0, z0), + z0 = svrevb_m (z1, p0, z0)) + +/* +** revb_s32_m_untied: +** movprfx z0, z2 +** revb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_s32_m_untied, svint32_t, + z0 = svrevb_s32_m (z2, p0, z1), + z0 = svrevb_m (z2, p0, z1)) + +/* +** revb_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** revb z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_s32_z_tied1, svint32_t, + z0 = svrevb_s32_z (p0, z0), + z0 = svrevb_z (p0, z0)) + +/* +** revb_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** revb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_s32_z_untied, svint32_t, + z0 = svrevb_s32_z (p0, z1), + z0 = svrevb_z (p0, z1)) + +/* +** revb_s32_x_tied1: +** revb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (revb_s32_x_tied1, svint32_t, + z0 = svrevb_s32_x (p0, z0), + z0 = svrevb_x (p0, z0)) + +/* +** revb_s32_x_untied: +** revb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_s32_x_untied, svint32_t, + z0 = svrevb_s32_x (p0, z1), + z0 = svrevb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s64.c new file mode 100644 index 000000000..21547238c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revb_s64_m_tied12: +** revb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revb_s64_m_tied12, svint64_t, + z0 = svrevb_s64_m (z0, p0, z0), + z0 = svrevb_m (z0, p0, z0)) + +/* +** revb_s64_m_tied1: +** revb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revb_s64_m_tied1, svint64_t, + z0 = svrevb_s64_m (z0, p0, z1), + z0 = svrevb_m (z0, p0, z1)) + +/* +** revb_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** revb z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revb_s64_m_tied2, svint64_t, + z0 = svrevb_s64_m (z1, p0, z0), + z0 = svrevb_m (z1, p0, z0)) + +/* +** revb_s64_m_untied: +** movprfx z0, z2 +** revb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revb_s64_m_untied, svint64_t, + z0 = svrevb_s64_m (z2, p0, z1), + z0 = svrevb_m (z2, p0, z1)) + +/* +** revb_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** revb z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revb_s64_z_tied1, svint64_t, + z0 = svrevb_s64_z (p0, z0), + z0 = svrevb_z (p0, z0)) + +/* +** revb_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** revb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revb_s64_z_untied, svint64_t, + z0 = svrevb_s64_z (p0, z1), + z0 = svrevb_z (p0, z1)) + +/* +** revb_s64_x_tied1: +** revb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revb_s64_x_tied1, svint64_t, + z0 = svrevb_s64_x (p0, z0), + z0 = svrevb_x (p0, z0)) + +/* +** revb_s64_x_untied: +** revb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revb_s64_x_untied, svint64_t, + z0 = svrevb_s64_x (p0, z1), + z0 = svrevb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u16.c new file mode 100644 index 000000000..d58bd3d74 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u16.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" 
"-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revb_u16_m_tied12: +** revb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (revb_u16_m_tied12, svuint16_t, + z0 = svrevb_u16_m (z0, p0, z0), + z0 = svrevb_m (z0, p0, z0)) + +/* +** revb_u16_m_tied1: +** revb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_u16_m_tied1, svuint16_t, + z0 = svrevb_u16_m (z0, p0, z1), + z0 = svrevb_m (z0, p0, z1)) + +/* +** revb_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** revb z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_u16_m_tied2, svuint16_t, + z0 = svrevb_u16_m (z1, p0, z0), + z0 = svrevb_m (z1, p0, z0)) + +/* +** revb_u16_m_untied: +** movprfx z0, z2 +** revb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_u16_m_untied, svuint16_t, + z0 = svrevb_u16_m (z2, p0, z1), + z0 = svrevb_m (z2, p0, z1)) + +/* +** revb_u16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** revb z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_u16_z_tied1, svuint16_t, + z0 = svrevb_u16_z (p0, z0), + z0 = svrevb_z (p0, z0)) + +/* +** revb_u16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** revb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_u16_z_untied, svuint16_t, + z0 = svrevb_u16_z (p0, z1), + z0 = svrevb_z (p0, z1)) + +/* +** revb_u16_x_tied1: +** revb z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (revb_u16_x_tied1, svuint16_t, + z0 = svrevb_u16_x (p0, z0), + z0 = svrevb_x (p0, z0)) + +/* +** revb_u16_x_untied: +** revb z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (revb_u16_x_untied, svuint16_t, + z0 = svrevb_u16_x (p0, z1), + z0 = svrevb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u32.c new file mode 100644 index 000000000..33df990d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revb_u32_m_tied12: +** revb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (revb_u32_m_tied12, svuint32_t, + z0 = svrevb_u32_m (z0, p0, z0), + z0 = svrevb_m (z0, p0, z0)) + +/* +** revb_u32_m_tied1: +** revb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_u32_m_tied1, svuint32_t, + z0 = svrevb_u32_m (z0, p0, z1), + z0 = svrevb_m (z0, p0, z1)) + +/* +** revb_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** revb z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_u32_m_tied2, svuint32_t, + z0 = svrevb_u32_m (z1, p0, z0), + z0 = svrevb_m (z1, p0, z0)) + +/* +** revb_u32_m_untied: +** movprfx z0, z2 +** revb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_u32_m_untied, svuint32_t, + z0 = svrevb_u32_m (z2, p0, z1), + z0 = svrevb_m (z2, p0, z1)) + +/* +** revb_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** revb z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_u32_z_tied1, svuint32_t, + z0 = svrevb_u32_z (p0, z0), + z0 = svrevb_z (p0, z0)) + +/* +** revb_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** revb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_u32_z_untied, svuint32_t, + z0 = svrevb_u32_z (p0, z1), + z0 = svrevb_z (p0, z1)) + +/* +** revb_u32_x_tied1: +** revb z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (revb_u32_x_tied1, svuint32_t, + z0 = svrevb_u32_x (p0, z0), + z0 = svrevb_x (p0, z0)) + +/* +** revb_u32_x_untied: +** revb z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revb_u32_x_untied, svuint32_t, + z0 = svrevb_u32_x (p0, z1), + z0 = svrevb_x (p0, z1)) diff 
--git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u64.c new file mode 100644 index 000000000..50ad618cc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revb_u64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revb_u64_m_tied12: +** revb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revb_u64_m_tied12, svuint64_t, + z0 = svrevb_u64_m (z0, p0, z0), + z0 = svrevb_m (z0, p0, z0)) + +/* +** revb_u64_m_tied1: +** revb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revb_u64_m_tied1, svuint64_t, + z0 = svrevb_u64_m (z0, p0, z1), + z0 = svrevb_m (z0, p0, z1)) + +/* +** revb_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** revb z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revb_u64_m_tied2, svuint64_t, + z0 = svrevb_u64_m (z1, p0, z0), + z0 = svrevb_m (z1, p0, z0)) + +/* +** revb_u64_m_untied: +** movprfx z0, z2 +** revb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revb_u64_m_untied, svuint64_t, + z0 = svrevb_u64_m (z2, p0, z1), + z0 = svrevb_m (z2, p0, z1)) + +/* +** revb_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** revb z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revb_u64_z_tied1, svuint64_t, + z0 = svrevb_u64_z (p0, z0), + z0 = svrevb_z (p0, z0)) + +/* +** revb_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** revb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revb_u64_z_untied, svuint64_t, + z0 = svrevb_u64_z (p0, z1), + z0 = svrevb_z (p0, z1)) + +/* +** revb_u64_x_tied1: +** revb z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revb_u64_x_tied1, svuint64_t, + z0 = svrevb_u64_x (p0, z0), + z0 = svrevb_x (p0, z0)) + +/* +** revb_u64_x_untied: +** revb z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revb_u64_x_untied, svuint64_t, + z0 = svrevb_u64_x (p0, z1), + z0 = svrevb_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s32.c new file mode 100644 index 000000000..07d512ddb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revh_s32_m_tied12: +** revh z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (revh_s32_m_tied12, svint32_t, + z0 = svrevh_s32_m (z0, p0, z0), + z0 = svrevh_m (z0, p0, z0)) + +/* +** revh_s32_m_tied1: +** revh z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_s32_m_tied1, svint32_t, + z0 = svrevh_s32_m (z0, p0, z1), + z0 = svrevh_m (z0, p0, z1)) + +/* +** revh_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** revh z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_s32_m_tied2, svint32_t, + z0 = svrevh_s32_m (z1, p0, z0), + z0 = svrevh_m (z1, p0, z0)) + +/* +** revh_s32_m_untied: +** movprfx z0, z2 +** revh z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_s32_m_untied, svint32_t, + z0 = svrevh_s32_m (z2, p0, z1), + z0 = svrevh_m (z2, p0, z1)) + +/* +** revh_s32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** revh z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_s32_z_tied1, svint32_t, + z0 = svrevh_s32_z (p0, z0), + z0 = svrevh_z (p0, z0)) + +/* +** revh_s32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** revh z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_s32_z_untied, svint32_t, + z0 = svrevh_s32_z (p0, z1), + z0 = svrevh_z (p0, z1)) + +/* +** revh_s32_x_tied1: +** revh z0\.s, p0/m, 
z0\.s +** ret +*/ +TEST_UNIFORM_Z (revh_s32_x_tied1, svint32_t, + z0 = svrevh_s32_x (p0, z0), + z0 = svrevh_x (p0, z0)) + +/* +** revh_s32_x_untied: +** revh z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_s32_x_untied, svint32_t, + z0 = svrevh_s32_x (p0, z1), + z0 = svrevh_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s64.c new file mode 100644 index 000000000..b1446347c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revh_s64_m_tied12: +** revh z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revh_s64_m_tied12, svint64_t, + z0 = svrevh_s64_m (z0, p0, z0), + z0 = svrevh_m (z0, p0, z0)) + +/* +** revh_s64_m_tied1: +** revh z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revh_s64_m_tied1, svint64_t, + z0 = svrevh_s64_m (z0, p0, z1), + z0 = svrevh_m (z0, p0, z1)) + +/* +** revh_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** revh z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revh_s64_m_tied2, svint64_t, + z0 = svrevh_s64_m (z1, p0, z0), + z0 = svrevh_m (z1, p0, z0)) + +/* +** revh_s64_m_untied: +** movprfx z0, z2 +** revh z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revh_s64_m_untied, svint64_t, + z0 = svrevh_s64_m (z2, p0, z1), + z0 = svrevh_m (z2, p0, z1)) + +/* +** revh_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** revh z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revh_s64_z_tied1, svint64_t, + z0 = svrevh_s64_z (p0, z0), + z0 = svrevh_z (p0, z0)) + +/* +** revh_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** revh z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revh_s64_z_untied, svint64_t, + z0 = svrevh_s64_z (p0, z1), + z0 = svrevh_z (p0, z1)) + +/* +** revh_s64_x_tied1: +** revh z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revh_s64_x_tied1, svint64_t, + z0 = svrevh_s64_x (p0, z0), + z0 = svrevh_x (p0, z0)) + +/* +** revh_s64_x_untied: +** revh z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revh_s64_x_untied, svint64_t, + z0 = svrevh_s64_x (p0, z1), + z0 = svrevh_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u32.c new file mode 100644 index 000000000..9ea51884d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u32.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revh_u32_m_tied12: +** revh z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (revh_u32_m_tied12, svuint32_t, + z0 = svrevh_u32_m (z0, p0, z0), + z0 = svrevh_m (z0, p0, z0)) + +/* +** revh_u32_m_tied1: +** revh z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_u32_m_tied1, svuint32_t, + z0 = svrevh_u32_m (z0, p0, z1), + z0 = svrevh_m (z0, p0, z1)) + +/* +** revh_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** revh z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_u32_m_tied2, svuint32_t, + z0 = svrevh_u32_m (z1, p0, z0), + z0 = svrevh_m (z1, p0, z0)) + +/* +** revh_u32_m_untied: +** movprfx z0, z2 +** revh z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_u32_m_untied, svuint32_t, + z0 = svrevh_u32_m (z2, p0, z1), + z0 = svrevh_m (z2, p0, z1)) + +/* +** revh_u32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** revh z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_u32_z_tied1, svuint32_t, + z0 = 
svrevh_u32_z (p0, z0), + z0 = svrevh_z (p0, z0)) + +/* +** revh_u32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** revh z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_u32_z_untied, svuint32_t, + z0 = svrevh_u32_z (p0, z1), + z0 = svrevh_z (p0, z1)) + +/* +** revh_u32_x_tied1: +** revh z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (revh_u32_x_tied1, svuint32_t, + z0 = svrevh_u32_x (p0, z0), + z0 = svrevh_x (p0, z0)) + +/* +** revh_u32_x_untied: +** revh z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (revh_u32_x_untied, svuint32_t, + z0 = svrevh_u32_x (p0, z1), + z0 = svrevh_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u64.c new file mode 100644 index 000000000..7b2da2701 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revh_u64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revh_u64_m_tied12: +** revh z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revh_u64_m_tied12, svuint64_t, + z0 = svrevh_u64_m (z0, p0, z0), + z0 = svrevh_m (z0, p0, z0)) + +/* +** revh_u64_m_tied1: +** revh z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revh_u64_m_tied1, svuint64_t, + z0 = svrevh_u64_m (z0, p0, z1), + z0 = svrevh_m (z0, p0, z1)) + +/* +** revh_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** revh z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revh_u64_m_tied2, svuint64_t, + z0 = svrevh_u64_m (z1, p0, z0), + z0 = svrevh_m (z1, p0, z0)) + +/* +** revh_u64_m_untied: +** movprfx z0, z2 +** revh z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revh_u64_m_untied, svuint64_t, + z0 = svrevh_u64_m (z2, p0, z1), + z0 = svrevh_m (z2, p0, z1)) + +/* +** revh_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** revh z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revh_u64_z_tied1, svuint64_t, + z0 = svrevh_u64_z (p0, z0), + z0 = svrevh_z (p0, z0)) + +/* +** revh_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** revh z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revh_u64_z_untied, svuint64_t, + z0 = svrevh_u64_z (p0, z1), + z0 = svrevh_z (p0, z1)) + +/* +** revh_u64_x_tied1: +** revh z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revh_u64_x_tied1, svuint64_t, + z0 = svrevh_u64_x (p0, z0), + z0 = svrevh_x (p0, z0)) + +/* +** revh_u64_x_untied: +** revh z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revh_u64_x_untied, svuint64_t, + z0 = svrevh_u64_x (p0, z1), + z0 = svrevh_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_s64.c new file mode 100644 index 000000000..26ca0f0bd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_s64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revw_s64_m_tied12: +** revw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revw_s64_m_tied12, svint64_t, + z0 = svrevw_s64_m (z0, p0, z0), + z0 = svrevw_m (z0, p0, z0)) + +/* +** revw_s64_m_tied1: +** revw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revw_s64_m_tied1, svint64_t, + z0 = svrevw_s64_m (z0, p0, z1), + z0 = svrevw_m (z0, p0, z1)) + +/* +** revw_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** revw z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revw_s64_m_tied2, svint64_t, + z0 = svrevw_s64_m (z1, p0, z0), + z0 = svrevw_m (z1, p0, z0)) + +/* +** revw_s64_m_untied: +** movprfx z0, z2 +** revw z0\.d, p0/m, z1\.d +** ret 
+*/ +TEST_UNIFORM_Z (revw_s64_m_untied, svint64_t, + z0 = svrevw_s64_m (z2, p0, z1), + z0 = svrevw_m (z2, p0, z1)) + +/* +** revw_s64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** revw z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revw_s64_z_tied1, svint64_t, + z0 = svrevw_s64_z (p0, z0), + z0 = svrevw_z (p0, z0)) + +/* +** revw_s64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** revw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revw_s64_z_untied, svint64_t, + z0 = svrevw_s64_z (p0, z1), + z0 = svrevw_z (p0, z1)) + +/* +** revw_s64_x_tied1: +** revw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revw_s64_x_tied1, svint64_t, + z0 = svrevw_s64_x (p0, z0), + z0 = svrevw_x (p0, z0)) + +/* +** revw_s64_x_untied: +** revw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revw_s64_x_untied, svint64_t, + z0 = svrevw_s64_x (p0, z1), + z0 = svrevw_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_u64.c new file mode 100644 index 000000000..c70cdb428 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/revw_u64.c @@ -0,0 +1,81 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** revw_u64_m_tied12: +** revw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revw_u64_m_tied12, svuint64_t, + z0 = svrevw_u64_m (z0, p0, z0), + z0 = svrevw_m (z0, p0, z0)) + +/* +** revw_u64_m_tied1: +** revw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revw_u64_m_tied1, svuint64_t, + z0 = svrevw_u64_m (z0, p0, z1), + z0 = svrevw_m (z0, p0, z1)) + +/* +** revw_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** revw z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revw_u64_m_tied2, svuint64_t, + z0 = svrevw_u64_m (z1, p0, z0), + z0 = svrevw_m (z1, p0, z0)) + +/* +** revw_u64_m_untied: +** movprfx z0, z2 +** revw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revw_u64_m_untied, svuint64_t, + z0 = svrevw_u64_m (z2, p0, z1), + z0 = svrevw_m (z2, p0, z1)) + +/* +** revw_u64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** revw z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (revw_u64_z_tied1, svuint64_t, + z0 = svrevw_u64_z (p0, z0), + z0 = svrevw_z (p0, z0)) + +/* +** revw_u64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** revw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revw_u64_z_untied, svuint64_t, + z0 = svrevw_u64_z (p0, z1), + z0 = svrevw_z (p0, z1)) + +/* +** revw_u64_x_tied1: +** revw z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (revw_u64_x_tied1, svuint64_t, + z0 = svrevw_u64_x (p0, z0), + z0 = svrevw_x (p0, z0)) + +/* +** revw_u64_x_untied: +** revw z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (revw_u64_x_untied, svuint64_t, + z0 = svrevw_u64_x (p0, z1), + z0 = svrevw_x (p0, z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f16.c new file mode 100644 index 000000000..99a604209 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rinta_f16_m_tied12: +** frinta z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rinta_f16_m_tied12, svfloat16_t, + z0 = svrinta_f16_m (z0, p0, z0), + z0 = svrinta_m (z0, p0, z0)) + +/* +** rinta_f16_m_tied1: +** frinta z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rinta_f16_m_tied1, svfloat16_t, + z0 = svrinta_f16_m (z0, p0, z1), + z0 = svrinta_m (z0, p0, z1)) 
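These tests all follow the same SVE ACLE conventions: the _m suffix merges inactive elements from the intrinsic's first argument, _z zeroes them, and _x leaves them unspecified, while the tied12/tied1/tied2/untied names describe which operands share z0 with the result and therefore whether the expected assembly needs a movprfx or an extra mov. As a minimal usage sketch of one of these intrinsics (not part of the patch itself; the helper name bswap32_array and the build flag -march=armv8.2-a+sve are illustrative assumptions), the _x form of svrevb can byte-swap an array of 32-bit words:

#include <arm_sve.h>
#include <stdint.h>

/* Illustrative helper, not from the patch: byte-swap each 32-bit element
   of SRC into DST (an endianness conversion) using a predicated loop.  */
void
bswap32_array (uint32_t *dst, const uint32_t *src, int64_t n)
{
  for (int64_t i = 0; i < n; i += svcntw ())
    {
      /* Predicate covering the elements still to be processed.  */
      svbool_t pg = svwhilelt_b32_s64 (i, n);
      svuint32_t v = svld1_u32 (pg, src + i);
      /* REVB reverses the bytes within each active 32-bit element; the _x
         form leaves inactive elements unspecified, which is harmless here
         because the store is governed by the same predicate.  */
      svst1_u32 (pg, dst + i, svrevb_u32_x (pg, v));
    }
}

The same predication and tied/untied structure carries over to the svrevh, svrevw, svrint* and svrsqrt* tests in the remainder of this patch.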
+ +/* +** rinta_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frinta z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rinta_f16_m_tied2, svfloat16_t, + z0 = svrinta_f16_m (z1, p0, z0), + z0 = svrinta_m (z1, p0, z0)) + +/* +** rinta_f16_m_untied: +** movprfx z0, z2 +** frinta z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rinta_f16_m_untied, svfloat16_t, + z0 = svrinta_f16_m (z2, p0, z1), + z0 = svrinta_m (z2, p0, z1)) + +/* +** rinta_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** frinta z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rinta_f16_z_tied1, svfloat16_t, + z0 = svrinta_f16_z (p0, z0), + z0 = svrinta_z (p0, z0)) + +/* +** rinta_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** frinta z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rinta_f16_z_untied, svfloat16_t, + z0 = svrinta_f16_z (p0, z1), + z0 = svrinta_z (p0, z1)) + +/* +** rinta_f16_x_tied1: +** frinta z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rinta_f16_x_tied1, svfloat16_t, + z0 = svrinta_f16_x (p0, z0), + z0 = svrinta_x (p0, z0)) + +/* +** rinta_f16_x_untied: +** frinta z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rinta_f16_x_untied, svfloat16_t, + z0 = svrinta_f16_x (p0, z1), + z0 = svrinta_x (p0, z1)) + +/* +** ptrue_rinta_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rinta_f16_x_tied1, svfloat16_t, + z0 = svrinta_f16_x (svptrue_b16 (), z0), + z0 = svrinta_x (svptrue_b16 (), z0)) + +/* +** ptrue_rinta_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rinta_f16_x_untied, svfloat16_t, + z0 = svrinta_f16_x (svptrue_b16 (), z1), + z0 = svrinta_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f32.c new file mode 100644 index 000000000..b4e3714bc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rinta_f32_m_tied12: +** frinta z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rinta_f32_m_tied12, svfloat32_t, + z0 = svrinta_f32_m (z0, p0, z0), + z0 = svrinta_m (z0, p0, z0)) + +/* +** rinta_f32_m_tied1: +** frinta z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rinta_f32_m_tied1, svfloat32_t, + z0 = svrinta_f32_m (z0, p0, z1), + z0 = svrinta_m (z0, p0, z1)) + +/* +** rinta_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frinta z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rinta_f32_m_tied2, svfloat32_t, + z0 = svrinta_f32_m (z1, p0, z0), + z0 = svrinta_m (z1, p0, z0)) + +/* +** rinta_f32_m_untied: +** movprfx z0, z2 +** frinta z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rinta_f32_m_untied, svfloat32_t, + z0 = svrinta_f32_m (z2, p0, z1), + z0 = svrinta_m (z2, p0, z1)) + +/* +** rinta_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** frinta z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rinta_f32_z_tied1, svfloat32_t, + z0 = svrinta_f32_z (p0, z0), + z0 = svrinta_z (p0, z0)) + +/* +** rinta_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** frinta z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rinta_f32_z_untied, svfloat32_t, + z0 = svrinta_f32_z (p0, z1), + z0 = svrinta_z (p0, z1)) + +/* +** rinta_f32_x_tied1: +** frinta z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rinta_f32_x_tied1, svfloat32_t, + z0 = svrinta_f32_x (p0, z0), + z0 = svrinta_x (p0, z0)) + +/* +** rinta_f32_x_untied: +** 
frinta z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rinta_f32_x_untied, svfloat32_t, + z0 = svrinta_f32_x (p0, z1), + z0 = svrinta_x (p0, z1)) + +/* +** ptrue_rinta_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rinta_f32_x_tied1, svfloat32_t, + z0 = svrinta_f32_x (svptrue_b32 (), z0), + z0 = svrinta_x (svptrue_b32 (), z0)) + +/* +** ptrue_rinta_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rinta_f32_x_untied, svfloat32_t, + z0 = svrinta_f32_x (svptrue_b32 (), z1), + z0 = svrinta_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f64.c new file mode 100644 index 000000000..24d6b7dc8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinta_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rinta_f64_m_tied12: +** frinta z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rinta_f64_m_tied12, svfloat64_t, + z0 = svrinta_f64_m (z0, p0, z0), + z0 = svrinta_m (z0, p0, z0)) + +/* +** rinta_f64_m_tied1: +** frinta z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rinta_f64_m_tied1, svfloat64_t, + z0 = svrinta_f64_m (z0, p0, z1), + z0 = svrinta_m (z0, p0, z1)) + +/* +** rinta_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** frinta z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rinta_f64_m_tied2, svfloat64_t, + z0 = svrinta_f64_m (z1, p0, z0), + z0 = svrinta_m (z1, p0, z0)) + +/* +** rinta_f64_m_untied: +** movprfx z0, z2 +** frinta z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rinta_f64_m_untied, svfloat64_t, + z0 = svrinta_f64_m (z2, p0, z1), + z0 = svrinta_m (z2, p0, z1)) + +/* +** rinta_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** frinta z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rinta_f64_z_tied1, svfloat64_t, + z0 = svrinta_f64_z (p0, z0), + z0 = svrinta_z (p0, z0)) + +/* +** rinta_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** frinta z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rinta_f64_z_untied, svfloat64_t, + z0 = svrinta_f64_z (p0, z1), + z0 = svrinta_z (p0, z1)) + +/* +** rinta_f64_x_tied1: +** frinta z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rinta_f64_x_tied1, svfloat64_t, + z0 = svrinta_f64_x (p0, z0), + z0 = svrinta_x (p0, z0)) + +/* +** rinta_f64_x_untied: +** frinta z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rinta_f64_x_untied, svfloat64_t, + z0 = svrinta_f64_x (p0, z1), + z0 = svrinta_x (p0, z1)) + +/* +** ptrue_rinta_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rinta_f64_x_tied1, svfloat64_t, + z0 = svrinta_f64_x (svptrue_b64 (), z0), + z0 = svrinta_x (svptrue_b64 (), z0)) + +/* +** ptrue_rinta_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rinta_f64_x_untied, svfloat64_t, + z0 = svrinta_f64_x (svptrue_b64 (), z1), + z0 = svrinta_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f16.c new file mode 100644 index 000000000..1f0ac85e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rinti_f16_m_tied12: +** frinti z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rinti_f16_m_tied12, svfloat16_t, + z0 = svrinti_f16_m (z0, p0, z0), + z0 = svrinti_m (z0, p0, z0)) + +/* +** rinti_f16_m_tied1: +** frinti z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rinti_f16_m_tied1, svfloat16_t, + z0 = svrinti_f16_m (z0, p0, z1), + z0 = svrinti_m (z0, p0, z1)) + +/* +** rinti_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frinti z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rinti_f16_m_tied2, svfloat16_t, + z0 = svrinti_f16_m (z1, p0, z0), + z0 = svrinti_m (z1, p0, z0)) + +/* +** rinti_f16_m_untied: +** movprfx z0, z2 +** frinti z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rinti_f16_m_untied, svfloat16_t, + z0 = svrinti_f16_m (z2, p0, z1), + z0 = svrinti_m (z2, p0, z1)) + +/* +** rinti_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** frinti z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rinti_f16_z_tied1, svfloat16_t, + z0 = svrinti_f16_z (p0, z0), + z0 = svrinti_z (p0, z0)) + +/* +** rinti_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** frinti z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rinti_f16_z_untied, svfloat16_t, + z0 = svrinti_f16_z (p0, z1), + z0 = svrinti_z (p0, z1)) + +/* +** rinti_f16_x_tied1: +** frinti z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rinti_f16_x_tied1, svfloat16_t, + z0 = svrinti_f16_x (p0, z0), + z0 = svrinti_x (p0, z0)) + +/* +** rinti_f16_x_untied: +** frinti z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rinti_f16_x_untied, svfloat16_t, + z0 = svrinti_f16_x (p0, z1), + z0 = svrinti_x (p0, z1)) + +/* +** ptrue_rinti_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rinti_f16_x_tied1, svfloat16_t, + z0 = svrinti_f16_x (svptrue_b16 (), z0), + z0 = svrinti_x (svptrue_b16 (), z0)) + +/* +** ptrue_rinti_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rinti_f16_x_untied, svfloat16_t, + z0 = svrinti_f16_x (svptrue_b16 (), z1), + z0 = svrinti_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f32.c new file mode 100644 index 000000000..cf54fde5c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rinti_f32_m_tied12: +** frinti z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rinti_f32_m_tied12, svfloat32_t, + z0 = svrinti_f32_m (z0, p0, z0), + z0 = svrinti_m (z0, p0, z0)) + +/* +** rinti_f32_m_tied1: +** frinti z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rinti_f32_m_tied1, svfloat32_t, + z0 = svrinti_f32_m (z0, p0, z1), + z0 = svrinti_m (z0, p0, z1)) + +/* +** rinti_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frinti z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rinti_f32_m_tied2, svfloat32_t, + z0 = svrinti_f32_m (z1, p0, z0), + z0 = svrinti_m (z1, p0, z0)) + +/* +** rinti_f32_m_untied: +** movprfx z0, z2 +** frinti z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rinti_f32_m_untied, svfloat32_t, + z0 = svrinti_f32_m (z2, p0, z1), + z0 = svrinti_m (z2, p0, z1)) + +/* +** rinti_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** frinti z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rinti_f32_z_tied1, svfloat32_t, + z0 = svrinti_f32_z (p0, z0), + z0 = svrinti_z (p0, z0)) + +/* +** rinti_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** frinti z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rinti_f32_z_untied, svfloat32_t, + z0 = svrinti_f32_z (p0, z1), + z0 = svrinti_z (p0, z1)) + +/* +** rinti_f32_x_tied1: +** frinti z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rinti_f32_x_tied1, svfloat32_t, + z0 = svrinti_f32_x (p0, z0), + z0 = svrinti_x (p0, z0)) + +/* +** rinti_f32_x_untied: +** frinti z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rinti_f32_x_untied, svfloat32_t, + z0 = svrinti_f32_x (p0, z1), + z0 = svrinti_x (p0, z1)) + +/* +** ptrue_rinti_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rinti_f32_x_tied1, svfloat32_t, + z0 = svrinti_f32_x (svptrue_b32 (), z0), + z0 = svrinti_x (svptrue_b32 (), z0)) + +/* +** ptrue_rinti_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rinti_f32_x_untied, svfloat32_t, + z0 = svrinti_f32_x (svptrue_b32 (), z1), + z0 = svrinti_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f64.c new file mode 100644 index 000000000..08b861caa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rinti_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rinti_f64_m_tied12: +** frinti z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rinti_f64_m_tied12, svfloat64_t, + z0 = svrinti_f64_m (z0, p0, z0), + z0 = svrinti_m (z0, p0, z0)) + +/* +** rinti_f64_m_tied1: +** frinti z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rinti_f64_m_tied1, svfloat64_t, + z0 = svrinti_f64_m (z0, p0, z1), + z0 = svrinti_m (z0, p0, z1)) + +/* +** rinti_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** frinti z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rinti_f64_m_tied2, svfloat64_t, + z0 = svrinti_f64_m (z1, p0, z0), + z0 = svrinti_m (z1, p0, z0)) + +/* +** rinti_f64_m_untied: +** movprfx z0, z2 +** frinti z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rinti_f64_m_untied, svfloat64_t, + z0 = svrinti_f64_m (z2, p0, z1), + z0 = svrinti_m (z2, p0, z1)) + +/* +** rinti_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** frinti z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rinti_f64_z_tied1, svfloat64_t, + z0 = svrinti_f64_z (p0, z0), + z0 = svrinti_z (p0, z0)) + +/* +** rinti_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** frinti z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rinti_f64_z_untied, svfloat64_t, + z0 = svrinti_f64_z (p0, z1), + z0 = svrinti_z (p0, z1)) + +/* +** rinti_f64_x_tied1: +** frinti z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rinti_f64_x_tied1, svfloat64_t, + z0 = svrinti_f64_x (p0, z0), + z0 = svrinti_x (p0, z0)) + +/* +** rinti_f64_x_untied: +** frinti z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rinti_f64_x_untied, svfloat64_t, + z0 = svrinti_f64_x (p0, z1), + z0 = svrinti_x (p0, z1)) + +/* +** ptrue_rinti_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rinti_f64_x_tied1, svfloat64_t, + z0 = svrinti_f64_x (svptrue_b64 (), z0), + z0 = svrinti_x (svptrue_b64 (), z0)) + +/* +** ptrue_rinti_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rinti_f64_x_untied, svfloat64_t, + z0 = svrinti_f64_x (svptrue_b64 (), z1), + z0 = svrinti_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f16.c new file mode 100644 index 000000000..194d01cbd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintm_f16_m_tied12: +** frintm z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintm_f16_m_tied12, svfloat16_t, + z0 = svrintm_f16_m (z0, p0, z0), + z0 = svrintm_m (z0, p0, z0)) + +/* +** rintm_f16_m_tied1: +** frintm z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintm_f16_m_tied1, svfloat16_t, + z0 = svrintm_f16_m (z0, p0, z1), + z0 = svrintm_m (z0, p0, z1)) + +/* +** rintm_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintm z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintm_f16_m_tied2, svfloat16_t, + z0 = svrintm_f16_m (z1, p0, z0), + z0 = svrintm_m (z1, p0, z0)) + +/* +** rintm_f16_m_untied: +** movprfx z0, z2 +** frintm z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintm_f16_m_untied, svfloat16_t, + z0 = svrintm_f16_m (z2, p0, z1), + z0 = svrintm_m (z2, p0, z1)) + +/* +** rintm_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** frintm z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintm_f16_z_tied1, svfloat16_t, + z0 = svrintm_f16_z (p0, z0), + z0 = svrintm_z (p0, z0)) + +/* +** rintm_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** frintm z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintm_f16_z_untied, svfloat16_t, + z0 = svrintm_f16_z (p0, z1), + z0 = svrintm_z (p0, z1)) + +/* +** rintm_f16_x_tied1: +** frintm z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintm_f16_x_tied1, svfloat16_t, + z0 = svrintm_f16_x (p0, z0), + z0 = svrintm_x (p0, z0)) + +/* +** rintm_f16_x_untied: +** frintm z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintm_f16_x_untied, svfloat16_t, + z0 = svrintm_f16_x (p0, z1), + z0 = svrintm_x (p0, z1)) + +/* +** ptrue_rintm_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintm_f16_x_tied1, svfloat16_t, + z0 = svrintm_f16_x (svptrue_b16 (), z0), + z0 = svrintm_x (svptrue_b16 (), z0)) + +/* +** ptrue_rintm_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintm_f16_x_untied, svfloat16_t, + z0 = svrintm_f16_x (svptrue_b16 (), z1), + z0 = svrintm_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f32.c new file mode 100644 index 000000000..6c3297aa1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintm_f32_m_tied12: +** frintm z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintm_f32_m_tied12, svfloat32_t, + z0 = svrintm_f32_m (z0, p0, z0), + z0 = svrintm_m (z0, p0, z0)) + +/* +** rintm_f32_m_tied1: +** frintm z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintm_f32_m_tied1, svfloat32_t, + z0 = svrintm_f32_m (z0, p0, z1), + z0 = svrintm_m (z0, p0, z1)) + +/* +** rintm_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintm z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintm_f32_m_tied2, svfloat32_t, + z0 = svrintm_f32_m (z1, p0, z0), + z0 = svrintm_m (z1, p0, z0)) + +/* +** rintm_f32_m_untied: +** movprfx z0, z2 +** frintm z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintm_f32_m_untied, svfloat32_t, + z0 = svrintm_f32_m (z2, p0, z1), + z0 = svrintm_m (z2, p0, z1)) + +/* +** rintm_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** frintm z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintm_f32_z_tied1, svfloat32_t, + z0 = svrintm_f32_z (p0, z0), + z0 = svrintm_z (p0, z0)) + +/* +** rintm_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** frintm z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintm_f32_z_untied, svfloat32_t, + z0 = svrintm_f32_z (p0, z1), + z0 = svrintm_z (p0, z1)) + +/* +** rintm_f32_x_tied1: +** frintm z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintm_f32_x_tied1, svfloat32_t, + z0 = svrintm_f32_x (p0, z0), + z0 = svrintm_x (p0, z0)) + +/* +** rintm_f32_x_untied: +** frintm z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintm_f32_x_untied, svfloat32_t, + z0 = svrintm_f32_x (p0, z1), + z0 = svrintm_x (p0, z1)) + +/* +** ptrue_rintm_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintm_f32_x_tied1, svfloat32_t, + z0 = svrintm_f32_x (svptrue_b32 (), z0), + z0 = svrintm_x (svptrue_b32 (), z0)) + +/* +** ptrue_rintm_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintm_f32_x_untied, svfloat32_t, + z0 = svrintm_f32_x (svptrue_b32 (), z1), + z0 = svrintm_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f64.c new file mode 100644 index 000000000..ecbb24447 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintm_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintm_f64_m_tied12: +** frintm z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintm_f64_m_tied12, svfloat64_t, + z0 = svrintm_f64_m (z0, p0, z0), + z0 = svrintm_m (z0, p0, z0)) + +/* +** rintm_f64_m_tied1: +** frintm z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintm_f64_m_tied1, svfloat64_t, + z0 = svrintm_f64_m (z0, p0, z1), + z0 = svrintm_m (z0, p0, z1)) + +/* +** rintm_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** frintm z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintm_f64_m_tied2, svfloat64_t, + z0 = svrintm_f64_m (z1, p0, z0), + z0 = svrintm_m (z1, p0, z0)) + +/* +** rintm_f64_m_untied: +** movprfx z0, z2 +** frintm z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintm_f64_m_untied, svfloat64_t, + z0 = svrintm_f64_m (z2, p0, z1), + z0 = svrintm_m (z2, p0, z1)) + +/* +** rintm_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** frintm z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintm_f64_z_tied1, svfloat64_t, + z0 = svrintm_f64_z (p0, z0), + z0 = svrintm_z (p0, z0)) + +/* +** rintm_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** frintm z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintm_f64_z_untied, svfloat64_t, + z0 = svrintm_f64_z (p0, z1), + z0 = svrintm_z (p0, z1)) + +/* +** rintm_f64_x_tied1: +** frintm z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintm_f64_x_tied1, svfloat64_t, + z0 = svrintm_f64_x (p0, z0), + z0 = svrintm_x (p0, z0)) + +/* +** rintm_f64_x_untied: +** frintm z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintm_f64_x_untied, svfloat64_t, + z0 = svrintm_f64_x (p0, z1), + z0 = svrintm_x (p0, z1)) + +/* +** ptrue_rintm_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintm_f64_x_tied1, svfloat64_t, + z0 = svrintm_f64_x (svptrue_b64 (), z0), + z0 = svrintm_x (svptrue_b64 (), z0)) + +/* +** ptrue_rintm_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintm_f64_x_untied, svfloat64_t, + z0 = svrintm_f64_x (svptrue_b64 (), z1), + z0 = svrintm_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f16.c new file mode 100644 index 000000000..273307ef1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintn_f16_m_tied12: +** frintn z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintn_f16_m_tied12, svfloat16_t, + z0 = svrintn_f16_m (z0, p0, z0), + z0 = svrintn_m (z0, p0, z0)) + +/* +** rintn_f16_m_tied1: +** frintn z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintn_f16_m_tied1, svfloat16_t, + z0 = svrintn_f16_m (z0, p0, z1), + z0 = svrintn_m (z0, p0, z1)) + +/* +** rintn_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintn z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintn_f16_m_tied2, svfloat16_t, + z0 = svrintn_f16_m (z1, p0, z0), + z0 = svrintn_m (z1, p0, z0)) + +/* +** rintn_f16_m_untied: +** movprfx z0, z2 +** frintn z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintn_f16_m_untied, svfloat16_t, + z0 = svrintn_f16_m (z2, p0, z1), + z0 = svrintn_m (z2, p0, z1)) + +/* +** rintn_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** frintn z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintn_f16_z_tied1, svfloat16_t, + z0 = svrintn_f16_z (p0, z0), + z0 = svrintn_z (p0, z0)) + +/* +** rintn_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** frintn z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintn_f16_z_untied, svfloat16_t, + z0 = svrintn_f16_z (p0, z1), + z0 = svrintn_z (p0, z1)) + +/* +** rintn_f16_x_tied1: +** frintn z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintn_f16_x_tied1, svfloat16_t, + z0 = svrintn_f16_x (p0, z0), + z0 = svrintn_x (p0, z0)) + +/* +** rintn_f16_x_untied: +** frintn z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintn_f16_x_untied, svfloat16_t, + z0 = svrintn_f16_x (p0, z1), + z0 = svrintn_x (p0, z1)) + +/* +** ptrue_rintn_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintn_f16_x_tied1, svfloat16_t, + z0 = svrintn_f16_x (svptrue_b16 (), z0), + z0 = svrintn_x (svptrue_b16 (), z0)) + +/* +** ptrue_rintn_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintn_f16_x_untied, svfloat16_t, + z0 = svrintn_f16_x (svptrue_b16 (), z1), + z0 = svrintn_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f32.c new file mode 100644 index 000000000..bafd43106 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintn_f32_m_tied12: +** frintn z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintn_f32_m_tied12, svfloat32_t, + z0 = svrintn_f32_m (z0, p0, z0), + z0 = svrintn_m (z0, p0, z0)) + +/* +** rintn_f32_m_tied1: +** frintn z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintn_f32_m_tied1, svfloat32_t, + z0 = svrintn_f32_m (z0, p0, z1), + z0 = svrintn_m (z0, p0, z1)) + +/* +** rintn_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintn z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintn_f32_m_tied2, svfloat32_t, + z0 = svrintn_f32_m (z1, p0, z0), + z0 = svrintn_m (z1, p0, z0)) + +/* +** rintn_f32_m_untied: +** movprfx z0, z2 +** frintn z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintn_f32_m_untied, svfloat32_t, + z0 = svrintn_f32_m (z2, p0, z1), + z0 = svrintn_m (z2, p0, z1)) + +/* +** rintn_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** frintn z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintn_f32_z_tied1, svfloat32_t, + z0 = svrintn_f32_z (p0, z0), + z0 = svrintn_z (p0, z0)) + +/* +** rintn_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** frintn z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintn_f32_z_untied, svfloat32_t, + z0 = svrintn_f32_z (p0, z1), + z0 = svrintn_z (p0, z1)) + +/* +** rintn_f32_x_tied1: +** frintn z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintn_f32_x_tied1, svfloat32_t, + z0 = svrintn_f32_x (p0, z0), + z0 = svrintn_x (p0, z0)) + +/* +** rintn_f32_x_untied: +** frintn z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintn_f32_x_untied, svfloat32_t, + z0 = svrintn_f32_x (p0, z1), + z0 = svrintn_x (p0, z1)) + +/* +** ptrue_rintn_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintn_f32_x_tied1, svfloat32_t, + z0 = svrintn_f32_x (svptrue_b32 (), z0), + z0 = svrintn_x (svptrue_b32 (), z0)) + +/* +** ptrue_rintn_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintn_f32_x_untied, svfloat32_t, + z0 = svrintn_f32_x (svptrue_b32 (), z1), + z0 = svrintn_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f64.c new file mode 100644 index 000000000..0142315e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintn_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintn_f64_m_tied12: +** frintn z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintn_f64_m_tied12, svfloat64_t, + z0 = svrintn_f64_m (z0, p0, z0), + z0 = svrintn_m (z0, p0, z0)) + +/* +** rintn_f64_m_tied1: +** frintn z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintn_f64_m_tied1, svfloat64_t, + z0 = svrintn_f64_m (z0, p0, z1), + z0 = svrintn_m (z0, p0, z1)) + +/* +** rintn_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** frintn z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintn_f64_m_tied2, svfloat64_t, + z0 = svrintn_f64_m (z1, p0, z0), + z0 = svrintn_m (z1, p0, z0)) + +/* +** rintn_f64_m_untied: +** movprfx z0, z2 +** frintn z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintn_f64_m_untied, svfloat64_t, + z0 = svrintn_f64_m (z2, p0, z1), + z0 = svrintn_m (z2, p0, z1)) + +/* +** rintn_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** frintn z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintn_f64_z_tied1, svfloat64_t, + z0 = svrintn_f64_z (p0, z0), + z0 = svrintn_z (p0, z0)) + +/* +** rintn_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** frintn z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintn_f64_z_untied, svfloat64_t, + z0 = svrintn_f64_z (p0, z1), + z0 = svrintn_z (p0, z1)) + +/* +** rintn_f64_x_tied1: +** frintn z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintn_f64_x_tied1, svfloat64_t, + z0 = svrintn_f64_x (p0, z0), + z0 = svrintn_x (p0, z0)) + +/* +** rintn_f64_x_untied: +** frintn z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintn_f64_x_untied, svfloat64_t, + z0 = svrintn_f64_x (p0, z1), + z0 = svrintn_x (p0, z1)) + +/* +** ptrue_rintn_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintn_f64_x_tied1, svfloat64_t, + z0 = svrintn_f64_x (svptrue_b64 (), z0), + z0 = svrintn_x (svptrue_b64 (), z0)) + +/* +** ptrue_rintn_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintn_f64_x_untied, svfloat64_t, + z0 = svrintn_f64_x (svptrue_b64 (), z1), + z0 = svrintn_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f16.c new file mode 100644 index 000000000..0e85c3448 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintp_f16_m_tied12: +** frintp z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintp_f16_m_tied12, svfloat16_t, + z0 = svrintp_f16_m (z0, p0, z0), + z0 = svrintp_m (z0, p0, z0)) + +/* +** rintp_f16_m_tied1: +** frintp z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintp_f16_m_tied1, svfloat16_t, + z0 = svrintp_f16_m (z0, p0, z1), + z0 = svrintp_m (z0, p0, z1)) + +/* +** rintp_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintp z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintp_f16_m_tied2, svfloat16_t, + z0 = svrintp_f16_m (z1, p0, z0), + z0 = svrintp_m (z1, p0, z0)) + +/* +** rintp_f16_m_untied: +** movprfx z0, z2 +** frintp z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintp_f16_m_untied, svfloat16_t, + z0 = svrintp_f16_m (z2, p0, z1), + z0 = svrintp_m (z2, p0, z1)) + +/* +** rintp_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** frintp z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintp_f16_z_tied1, svfloat16_t, + z0 = svrintp_f16_z (p0, z0), + z0 = svrintp_z (p0, z0)) + +/* +** rintp_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** frintp z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintp_f16_z_untied, svfloat16_t, + z0 = svrintp_f16_z (p0, z1), + z0 = svrintp_z (p0, z1)) + +/* +** rintp_f16_x_tied1: +** frintp z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintp_f16_x_tied1, svfloat16_t, + z0 = svrintp_f16_x (p0, z0), + z0 = svrintp_x (p0, z0)) + +/* +** rintp_f16_x_untied: +** frintp z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintp_f16_x_untied, svfloat16_t, + z0 = svrintp_f16_x (p0, z1), + z0 = svrintp_x (p0, z1)) + +/* +** ptrue_rintp_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintp_f16_x_tied1, svfloat16_t, + z0 = svrintp_f16_x (svptrue_b16 (), z0), + z0 = svrintp_x (svptrue_b16 (), z0)) + +/* +** ptrue_rintp_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintp_f16_x_untied, svfloat16_t, + z0 = svrintp_f16_x (svptrue_b16 (), z1), + z0 = svrintp_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f32.c new file mode 100644 index 000000000..cec360d7c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintp_f32_m_tied12: +** frintp z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintp_f32_m_tied12, svfloat32_t, + z0 = svrintp_f32_m (z0, p0, z0), + z0 = svrintp_m (z0, p0, z0)) + +/* +** rintp_f32_m_tied1: +** frintp z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintp_f32_m_tied1, svfloat32_t, + z0 = svrintp_f32_m (z0, p0, z1), + z0 = svrintp_m (z0, p0, z1)) + +/* +** rintp_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintp z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintp_f32_m_tied2, svfloat32_t, + z0 = svrintp_f32_m (z1, p0, z0), + z0 = svrintp_m (z1, p0, z0)) + +/* +** rintp_f32_m_untied: +** movprfx z0, z2 +** frintp z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintp_f32_m_untied, svfloat32_t, + z0 = svrintp_f32_m (z2, p0, z1), + z0 = svrintp_m (z2, p0, z1)) + +/* +** rintp_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** frintp z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintp_f32_z_tied1, svfloat32_t, + z0 = svrintp_f32_z (p0, z0), + z0 = svrintp_z (p0, z0)) + +/* +** rintp_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** frintp z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintp_f32_z_untied, svfloat32_t, + z0 = svrintp_f32_z (p0, z1), + z0 = svrintp_z (p0, z1)) + +/* +** rintp_f32_x_tied1: +** frintp z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintp_f32_x_tied1, svfloat32_t, + z0 = svrintp_f32_x (p0, z0), + z0 = svrintp_x (p0, z0)) + +/* +** rintp_f32_x_untied: +** frintp z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintp_f32_x_untied, svfloat32_t, + z0 = svrintp_f32_x (p0, z1), + z0 = svrintp_x (p0, z1)) + +/* +** ptrue_rintp_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintp_f32_x_tied1, svfloat32_t, + z0 = svrintp_f32_x (svptrue_b32 (), z0), + z0 = svrintp_x (svptrue_b32 (), z0)) + +/* +** ptrue_rintp_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintp_f32_x_untied, svfloat32_t, + z0 = svrintp_f32_x (svptrue_b32 (), z1), + z0 = svrintp_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f64.c new file mode 100644 index 000000000..1305fb682 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintp_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintp_f64_m_tied12: +** frintp z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintp_f64_m_tied12, svfloat64_t, + z0 = svrintp_f64_m (z0, p0, z0), + z0 = svrintp_m (z0, p0, z0)) + +/* +** rintp_f64_m_tied1: +** frintp z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintp_f64_m_tied1, svfloat64_t, + z0 = svrintp_f64_m (z0, p0, z1), + z0 = svrintp_m (z0, p0, z1)) + +/* +** rintp_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** frintp z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintp_f64_m_tied2, svfloat64_t, + z0 = svrintp_f64_m (z1, p0, z0), + z0 = svrintp_m (z1, p0, z0)) + +/* +** rintp_f64_m_untied: +** movprfx z0, z2 +** frintp z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintp_f64_m_untied, svfloat64_t, + z0 = svrintp_f64_m (z2, p0, z1), + z0 = svrintp_m (z2, p0, z1)) + +/* +** rintp_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** frintp z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintp_f64_z_tied1, svfloat64_t, + z0 = svrintp_f64_z (p0, z0), + z0 = svrintp_z (p0, z0)) + +/* +** rintp_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** frintp z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintp_f64_z_untied, svfloat64_t, + z0 = svrintp_f64_z (p0, z1), + z0 = svrintp_z (p0, z1)) + +/* +** rintp_f64_x_tied1: +** frintp z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintp_f64_x_tied1, svfloat64_t, + z0 = svrintp_f64_x (p0, z0), + z0 = svrintp_x (p0, z0)) + +/* +** rintp_f64_x_untied: +** frintp z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintp_f64_x_untied, svfloat64_t, + z0 = svrintp_f64_x (p0, z1), + z0 = svrintp_x (p0, z1)) + +/* +** ptrue_rintp_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintp_f64_x_tied1, svfloat64_t, + z0 = svrintp_f64_x (svptrue_b64 (), z0), + z0 = svrintp_x (svptrue_b64 (), z0)) + +/* +** ptrue_rintp_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintp_f64_x_untied, svfloat64_t, + z0 = svrintp_f64_x (svptrue_b64 (), z1), + z0 = svrintp_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f16.c new file mode 100644 index 000000000..96f7f2c72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintx_f16_m_tied12: +** frintx z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintx_f16_m_tied12, svfloat16_t, + z0 = svrintx_f16_m (z0, p0, z0), + z0 = svrintx_m (z0, p0, z0)) + +/* +** rintx_f16_m_tied1: +** frintx z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintx_f16_m_tied1, svfloat16_t, + z0 = svrintx_f16_m (z0, p0, z1), + z0 = svrintx_m (z0, p0, z1)) + +/* +** rintx_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintx z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintx_f16_m_tied2, svfloat16_t, + z0 = svrintx_f16_m (z1, p0, z0), + z0 = svrintx_m (z1, p0, z0)) + +/* +** rintx_f16_m_untied: +** movprfx z0, z2 +** frintx z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintx_f16_m_untied, svfloat16_t, + z0 = svrintx_f16_m (z2, p0, z1), + z0 = svrintx_m (z2, p0, z1)) + +/* +** rintx_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** frintx z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintx_f16_z_tied1, svfloat16_t, + z0 = svrintx_f16_z (p0, z0), + z0 = svrintx_z (p0, z0)) + +/* +** rintx_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** frintx z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintx_f16_z_untied, svfloat16_t, + z0 = svrintx_f16_z (p0, z1), + z0 = svrintx_z (p0, z1)) + +/* +** rintx_f16_x_tied1: +** frintx z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintx_f16_x_tied1, svfloat16_t, + z0 = svrintx_f16_x (p0, z0), + z0 = svrintx_x (p0, z0)) + +/* +** rintx_f16_x_untied: +** frintx z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintx_f16_x_untied, svfloat16_t, + z0 = svrintx_f16_x (p0, z1), + z0 = svrintx_x (p0, z1)) + +/* +** ptrue_rintx_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintx_f16_x_tied1, svfloat16_t, + z0 = svrintx_f16_x (svptrue_b16 (), z0), + z0 = svrintx_x (svptrue_b16 (), z0)) + +/* +** ptrue_rintx_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintx_f16_x_untied, svfloat16_t, + z0 = svrintx_f16_x (svptrue_b16 (), z1), + z0 = svrintx_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f32.c new file mode 100644 index 000000000..1c42d2a94 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintx_f32_m_tied12: +** frintx z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintx_f32_m_tied12, svfloat32_t, + z0 = svrintx_f32_m (z0, p0, z0), + z0 = svrintx_m (z0, p0, z0)) + +/* +** rintx_f32_m_tied1: +** frintx z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintx_f32_m_tied1, svfloat32_t, + z0 = svrintx_f32_m (z0, p0, z1), + z0 = svrintx_m (z0, p0, z1)) + +/* +** rintx_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintx z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintx_f32_m_tied2, svfloat32_t, + z0 = svrintx_f32_m (z1, p0, z0), + z0 = svrintx_m (z1, p0, z0)) + +/* +** rintx_f32_m_untied: +** movprfx z0, z2 +** frintx z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintx_f32_m_untied, svfloat32_t, + z0 = svrintx_f32_m (z2, p0, z1), + z0 = svrintx_m (z2, p0, z1)) + +/* +** rintx_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** frintx z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintx_f32_z_tied1, svfloat32_t, + z0 = svrintx_f32_z (p0, z0), + z0 = svrintx_z (p0, z0)) + +/* +** rintx_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** frintx z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintx_f32_z_untied, svfloat32_t, + z0 = svrintx_f32_z (p0, z1), + z0 = svrintx_z (p0, z1)) + +/* +** rintx_f32_x_tied1: +** frintx z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintx_f32_x_tied1, svfloat32_t, + z0 = svrintx_f32_x (p0, z0), + z0 = svrintx_x (p0, z0)) + +/* +** rintx_f32_x_untied: +** frintx z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintx_f32_x_untied, svfloat32_t, + z0 = svrintx_f32_x (p0, z1), + z0 = svrintx_x (p0, z1)) + +/* +** ptrue_rintx_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintx_f32_x_tied1, svfloat32_t, + z0 = svrintx_f32_x (svptrue_b32 (), z0), + z0 = svrintx_x (svptrue_b32 (), z0)) + +/* +** ptrue_rintx_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintx_f32_x_untied, svfloat32_t, + z0 = svrintx_f32_x (svptrue_b32 (), z1), + z0 = svrintx_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f64.c new file mode 100644 index 000000000..bee806b3b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintx_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintx_f64_m_tied12: +** frintx z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintx_f64_m_tied12, svfloat64_t, + z0 = svrintx_f64_m (z0, p0, z0), + z0 = svrintx_m (z0, p0, z0)) + +/* +** rintx_f64_m_tied1: +** frintx z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintx_f64_m_tied1, svfloat64_t, + z0 = svrintx_f64_m (z0, p0, z1), + z0 = svrintx_m (z0, p0, z1)) + +/* +** rintx_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** frintx z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintx_f64_m_tied2, svfloat64_t, + z0 = svrintx_f64_m (z1, p0, z0), + z0 = svrintx_m (z1, p0, z0)) + +/* +** rintx_f64_m_untied: +** movprfx z0, z2 +** frintx z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintx_f64_m_untied, svfloat64_t, + z0 = svrintx_f64_m (z2, p0, z1), + z0 = svrintx_m (z2, p0, z1)) + +/* +** rintx_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** frintx z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintx_f64_z_tied1, svfloat64_t, + z0 = svrintx_f64_z (p0, z0), + z0 = svrintx_z (p0, z0)) + +/* +** rintx_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** frintx z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintx_f64_z_untied, svfloat64_t, + z0 = svrintx_f64_z (p0, z1), + z0 = svrintx_z (p0, z1)) + +/* +** rintx_f64_x_tied1: +** frintx z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintx_f64_x_tied1, svfloat64_t, + z0 = svrintx_f64_x (p0, z0), + z0 = svrintx_x (p0, z0)) + +/* +** rintx_f64_x_untied: +** frintx z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintx_f64_x_untied, svfloat64_t, + z0 = svrintx_f64_x (p0, z1), + z0 = svrintx_x (p0, z1)) + +/* +** ptrue_rintx_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintx_f64_x_tied1, svfloat64_t, + z0 = svrintx_f64_x (svptrue_b64 (), z0), + z0 = svrintx_x (svptrue_b64 (), z0)) + +/* +** ptrue_rintx_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintx_f64_x_untied, svfloat64_t, + z0 = svrintx_f64_x (svptrue_b64 (), z1), + z0 = svrintx_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f16.c new file mode 100644 index 000000000..be13d82b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintz_f16_m_tied12: +** frintz z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintz_f16_m_tied12, svfloat16_t, + z0 = svrintz_f16_m (z0, p0, z0), + z0 = svrintz_m (z0, p0, z0)) + +/* +** rintz_f16_m_tied1: +** frintz z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintz_f16_m_tied1, svfloat16_t, + z0 = svrintz_f16_m (z0, p0, z1), + z0 = svrintz_m (z0, p0, z1)) + +/* +** rintz_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintz z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintz_f16_m_tied2, svfloat16_t, + z0 = svrintz_f16_m (z1, p0, z0), + z0 = svrintz_m (z1, p0, z0)) + +/* +** rintz_f16_m_untied: +** movprfx z0, z2 +** frintz z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintz_f16_m_untied, svfloat16_t, + z0 = svrintz_f16_m (z2, p0, z1), + z0 = svrintz_m (z2, p0, z1)) + +/* +** rintz_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** frintz z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (rintz_f16_z_tied1, svfloat16_t, + z0 = svrintz_f16_z (p0, z0), + z0 = svrintz_z (p0, z0)) + +/* +** rintz_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** frintz z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintz_f16_z_untied, svfloat16_t, + z0 = svrintz_f16_z (p0, z1), + z0 = svrintz_z (p0, z1)) + +/* +** rintz_f16_x_tied1: +** frintz z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rintz_f16_x_tied1, svfloat16_t, + z0 = svrintz_f16_x (p0, z0), + z0 = svrintz_x (p0, z0)) + +/* +** rintz_f16_x_untied: +** frintz z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rintz_f16_x_untied, svfloat16_t, + z0 = svrintz_f16_x (p0, z1), + z0 = svrintz_x (p0, z1)) + +/* +** ptrue_rintz_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintz_f16_x_tied1, svfloat16_t, + z0 = svrintz_f16_x (svptrue_b16 (), z0), + z0 = svrintz_x (svptrue_b16 (), z0)) + +/* +** ptrue_rintz_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintz_f16_x_untied, svfloat16_t, + z0 = svrintz_f16_x (svptrue_b16 (), z1), + z0 = svrintz_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f32.c new file mode 100644 index 000000000..873c0d468 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintz_f32_m_tied12: +** frintz z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintz_f32_m_tied12, svfloat32_t, + z0 = svrintz_f32_m (z0, p0, z0), + z0 = svrintz_m (z0, p0, z0)) + +/* +** rintz_f32_m_tied1: +** frintz z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintz_f32_m_tied1, svfloat32_t, + z0 = svrintz_f32_m (z0, p0, z1), + z0 = svrintz_m (z0, p0, z1)) + +/* +** rintz_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** frintz z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintz_f32_m_tied2, svfloat32_t, + z0 = svrintz_f32_m (z1, p0, z0), + z0 = svrintz_m (z1, p0, z0)) + +/* +** rintz_f32_m_untied: +** movprfx z0, z2 +** frintz z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintz_f32_m_untied, svfloat32_t, + z0 = svrintz_f32_m (z2, p0, z1), + z0 = svrintz_m (z2, p0, z1)) + +/* +** rintz_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** frintz z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (rintz_f32_z_tied1, svfloat32_t, + z0 = svrintz_f32_z (p0, z0), + z0 = svrintz_z (p0, z0)) + +/* +** rintz_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** frintz z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintz_f32_z_untied, svfloat32_t, + z0 = svrintz_f32_z (p0, z1), + z0 = svrintz_z (p0, z1)) + +/* +** rintz_f32_x_tied1: +** frintz z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rintz_f32_x_tied1, svfloat32_t, + z0 = svrintz_f32_x (p0, z0), + z0 = svrintz_x (p0, z0)) + +/* +** rintz_f32_x_untied: +** frintz z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rintz_f32_x_untied, svfloat32_t, + z0 = svrintz_f32_x (p0, z1), + z0 = svrintz_x (p0, z1)) + +/* +** ptrue_rintz_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintz_f32_x_tied1, svfloat32_t, + z0 = svrintz_f32_x (svptrue_b32 (), z0), + z0 = svrintz_x (svptrue_b32 (), z0)) + +/* +** ptrue_rintz_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintz_f32_x_untied, svfloat32_t, + z0 = svrintz_f32_x (svptrue_b32 (), z1), + z0 = svrintz_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f64.c new file mode 100644 index 000000000..e6c9d1fc8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rintz_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rintz_f64_m_tied12: +** frintz z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintz_f64_m_tied12, svfloat64_t, + z0 = svrintz_f64_m (z0, p0, z0), + z0 = svrintz_m (z0, p0, z0)) + +/* +** rintz_f64_m_tied1: +** frintz z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintz_f64_m_tied1, svfloat64_t, + z0 = svrintz_f64_m (z0, p0, z1), + z0 = svrintz_m (z0, p0, z1)) + +/* +** rintz_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** frintz z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintz_f64_m_tied2, svfloat64_t, + z0 = svrintz_f64_m (z1, p0, z0), + z0 = svrintz_m (z1, p0, z0)) + +/* +** rintz_f64_m_untied: +** movprfx z0, z2 +** frintz z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintz_f64_m_untied, svfloat64_t, + z0 = svrintz_f64_m (z2, p0, z1), + z0 = svrintz_m (z2, p0, z1)) + +/* +** rintz_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** frintz z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (rintz_f64_z_tied1, svfloat64_t, + z0 = svrintz_f64_z (p0, z0), + z0 = svrintz_z (p0, z0)) + +/* +** rintz_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** frintz z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintz_f64_z_untied, svfloat64_t, + z0 = svrintz_f64_z (p0, z1), + z0 = svrintz_z (p0, z1)) + +/* +** rintz_f64_x_tied1: +** frintz z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rintz_f64_x_tied1, svfloat64_t, + z0 = svrintz_f64_x (p0, z0), + z0 = svrintz_x (p0, z0)) + +/* +** rintz_f64_x_untied: +** frintz z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rintz_f64_x_untied, svfloat64_t, + z0 = svrintz_f64_x (p0, z1), + z0 = svrintz_x (p0, z1)) + +/* +** ptrue_rintz_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_rintz_f64_x_tied1, svfloat64_t, + z0 = svrintz_f64_x (svptrue_b64 (), z0), + z0 = svrintz_x (svptrue_b64 (), z0)) + +/* +** ptrue_rintz_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_rintz_f64_x_untied, svfloat64_t, + z0 = svrintz_f64_x (svptrue_b64 (), z1), + z0 = svrintz_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f16.c new file mode 100644 index 000000000..adfdc2b9c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rsqrte_f16_tied1: +** frsqrte z0\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rsqrte_f16_tied1, svfloat16_t, + z0 = svrsqrte_f16 (z0), + z0 = svrsqrte (z0)) + +/* +** rsqrte_f16_untied: +** frsqrte z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rsqrte_f16_untied, svfloat16_t, + z0 = svrsqrte_f16 (z1), + z0 = svrsqrte (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f32.c new file mode 100644 index 000000000..fd938ebdf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rsqrte_f32_tied1: +** frsqrte z0\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rsqrte_f32_tied1, svfloat32_t, + z0 = svrsqrte_f32 (z0), + z0 = svrsqrte (z0)) + +/* +** rsqrte_f32_untied: +** frsqrte z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rsqrte_f32_untied, svfloat32_t, + z0 = svrsqrte_f32 (z1), + z0 = svrsqrte (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f64.c new file mode 100644 index 000000000..3ac0f4053 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrte_f64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rsqrte_f64_tied1: +** frsqrte z0\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rsqrte_f64_tied1, svfloat64_t, + z0 = svrsqrte_f64 (z0), + z0 = svrsqrte (z0)) + +/* +** rsqrte_f64_untied: +** frsqrte z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rsqrte_f64_untied, svfloat64_t, + z0 = svrsqrte_f64 (z1), + z0 = svrsqrte (z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f16.c new file mode 100644 index 000000000..2d88be3d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rsqrts_f16_tied1: +** frsqrts z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f16_tied1, svfloat16_t, + z0 = svrsqrts_f16 (z0, z1), + z0 = svrsqrts (z0, z1)) + +/* +** rsqrts_f16_tied2: +** frsqrts z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f16_tied2, svfloat16_t, + z0 = svrsqrts_f16 (z1, z0), + z0 = svrsqrts (z1, z0)) + +/* +** rsqrts_f16_untied: +** frsqrts z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f16_untied, svfloat16_t, + z0 = svrsqrts_f16 (z1, z2), + z0 = svrsqrts (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f32.c new file mode 100644 index 000000000..cd76aef4d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rsqrts_f32_tied1: 
+** frsqrts z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f32_tied1, svfloat32_t, + z0 = svrsqrts_f32 (z0, z1), + z0 = svrsqrts (z0, z1)) + +/* +** rsqrts_f32_tied2: +** frsqrts z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f32_tied2, svfloat32_t, + z0 = svrsqrts_f32 (z1, z0), + z0 = svrsqrts (z1, z0)) + +/* +** rsqrts_f32_untied: +** frsqrts z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f32_untied, svfloat32_t, + z0 = svrsqrts_f32 (z1, z2), + z0 = svrsqrts (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f64.c new file mode 100644 index 000000000..e72a82fcb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/rsqrts_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** rsqrts_f64_tied1: +** frsqrts z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f64_tied1, svfloat64_t, + z0 = svrsqrts_f64 (z0, z1), + z0 = svrsqrts (z0, z1)) + +/* +** rsqrts_f64_tied2: +** frsqrts z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f64_tied2, svfloat64_t, + z0 = svrsqrts_f64 (z1, z0), + z0 = svrsqrts (z1, z0)) + +/* +** rsqrts_f64_untied: +** frsqrts z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (rsqrts_f64_untied, svfloat64_t, + z0 = svrsqrts_f64 (z1, z2), + z0 = svrsqrts (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f16.c new file mode 100644 index 000000000..9c554255b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f16.c @@ -0,0 +1,330 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** scale_f16_m_tied1: +** fscale z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (scale_f16_m_tied1, svfloat16_t, svint16_t, + z0 = svscale_f16_m (p0, z0, z4), + z0 = svscale_m (p0, z0, z4)) + +/* +** scale_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fscale z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (scale_f16_m_tied2, svfloat16_t, svint16_t, + z0_res = svscale_f16_m (p0, z4, z0), + z0_res = svscale_m (p0, z4, z0)) + +/* +** scale_f16_m_untied: +** movprfx z0, z1 +** fscale z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (scale_f16_m_untied, svfloat16_t, svint16_t, + z0 = svscale_f16_m (p0, z1, z4), + z0 = svscale_m (p0, z1, z4)) + +/* +** scale_w0_f16_m_tied1: +** mov (z[0-9]+\.h), w0 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f16_m_tied1, svfloat16_t, int16_t, + z0 = svscale_n_f16_m (p0, z0, x0), + z0 = svscale_m (p0, z0, x0)) + +/* +** scale_w0_f16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f16_m_untied, svfloat16_t, int16_t, + z0 = svscale_n_f16_m (p0, z1, x0), + z0 = svscale_m (p0, z1, x0)) + +/* +** scale_3_f16_m_tied1: +** mov (z[0-9]+\.h), #3 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f16_m_tied1, svfloat16_t, + z0 = svscale_n_f16_m (p0, z0, 3), + z0 = svscale_m (p0, z0, 3)) + +/* +** scale_3_f16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #3 +** movprfx z0, z1 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f16_m_untied, svfloat16_t, + z0 = svscale_n_f16_m (p0, z1, 3), + z0 = svscale_m (p0, z1, 3)) + +/* +** scale_m3_f16_m: +** mov (z[0-9]+\.h), #-3 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z 
(scale_m3_f16_m, svfloat16_t, + z0 = svscale_n_f16_m (p0, z0, -3), + z0 = svscale_m (p0, z0, -3)) + +/* +** scale_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fscale z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (scale_f16_z_tied1, svfloat16_t, svint16_t, + z0 = svscale_f16_z (p0, z0, z4), + z0 = svscale_z (p0, z0, z4)) + +/* +** scale_f16_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, z4\.h +** fscale z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (scale_f16_z_tied2, svfloat16_t, svint16_t, + z0_res = svscale_f16_z (p0, z4, z0), + z0_res = svscale_z (p0, z4, z0)) + +/* +** scale_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fscale z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (scale_f16_z_untied, svfloat16_t, svint16_t, + z0 = svscale_f16_z (p0, z1, z4), + z0 = svscale_z (p0, z1, z4)) + +/* +** scale_w0_f16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f16_z_tied1, svfloat16_t, int16_t, + z0 = svscale_n_f16_z (p0, z0, x0), + z0 = svscale_z (p0, z0, x0)) + +/* +** scale_w0_f16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z1\.h +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f16_z_untied, svfloat16_t, int16_t, + z0 = svscale_n_f16_z (p0, z1, x0), + z0 = svscale_z (p0, z1, x0)) + +/* +** scale_3_f16_z_tied1: +** mov (z[0-9]+\.h), #3 +** movprfx z0\.h, p0/z, z0\.h +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f16_z_tied1, svfloat16_t, + z0 = svscale_n_f16_z (p0, z0, 3), + z0 = svscale_z (p0, z0, 3)) + +/* +** scale_3_f16_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #3 +** movprfx z0\.h, p0/z, z1\.h +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f16_z_untied, svfloat16_t, + z0 = svscale_n_f16_z (p0, z1, 3), + z0 = svscale_z (p0, z1, 3)) + +/* +** scale_m3_f16_z: +** mov (z[0-9]+\.h), #-3 +** movprfx z0\.h, p0/z, z0\.h +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_m3_f16_z, svfloat16_t, + z0 = svscale_n_f16_z (p0, z0, -3), + z0 = svscale_z (p0, z0, -3)) + +/* +** scale_f16_x_tied1: +** fscale z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (scale_f16_x_tied1, svfloat16_t, svint16_t, + z0 = svscale_f16_x (p0, z0, z4), + z0 = svscale_x (p0, z0, z4)) + +/* +** scale_f16_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fscale z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_DUAL_Z_REV (scale_f16_x_tied2, svfloat16_t, svint16_t, + z0_res = svscale_f16_x (p0, z4, z0), + z0_res = svscale_x (p0, z4, z0)) + +/* +** scale_f16_x_untied: +** movprfx z0, z1 +** fscale z0\.h, p0/m, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (scale_f16_x_untied, svfloat16_t, svint16_t, + z0 = svscale_f16_x (p0, z1, z4), + z0 = svscale_x (p0, z1, z4)) + +/* +** scale_w0_f16_x_tied1: +** mov (z[0-9]+\.h), w0 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f16_x_tied1, svfloat16_t, int16_t, + z0 = svscale_n_f16_x (p0, z0, x0), + z0 = svscale_x (p0, z0, x0)) + +/* +** scale_w0_f16_x_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f16_x_untied, svfloat16_t, int16_t, + z0 = svscale_n_f16_x (p0, z1, x0), + z0 = svscale_x (p0, z1, x0)) + +/* +** scale_3_f16_x_tied1: +** mov (z[0-9]+\.h), #3 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f16_x_tied1, svfloat16_t, + z0 = svscale_n_f16_x (p0, z0, 3), + z0 = svscale_x (p0, z0, 3)) + +/* 
+** scale_3_f16_x_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #3 +** movprfx z0, z1 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f16_x_untied, svfloat16_t, + z0 = svscale_n_f16_x (p0, z1, 3), + z0 = svscale_x (p0, z1, 3)) + +/* +** scale_m3_f16_x: +** mov (z[0-9]+\.h), #-3 +** fscale z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_m3_f16_x, svfloat16_t, + z0 = svscale_n_f16_x (p0, z0, -3), + z0 = svscale_x (p0, z0, -3)) + +/* +** ptrue_scale_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_scale_f16_x_tied1, svfloat16_t, svint16_t, + z0 = svscale_f16_x (svptrue_b16 (), z0, z4), + z0 = svscale_x (svptrue_b16 (), z0, z4)) + +/* +** ptrue_scale_f16_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_scale_f16_x_tied2, svfloat16_t, svint16_t, + z0_res = svscale_f16_x (svptrue_b16 (), z4, z0), + z0_res = svscale_x (svptrue_b16 (), z4, z0)) + +/* +** ptrue_scale_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_scale_f16_x_untied, svfloat16_t, svint16_t, + z0 = svscale_f16_x (svptrue_b16 (), z1, z4), + z0 = svscale_x (svptrue_b16 (), z1, z4)) + +/* +** ptrue_scale_3_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_3_f16_x_tied1, svfloat16_t, + z0 = svscale_n_f16_x (svptrue_b16 (), z0, 3), + z0 = svscale_x (svptrue_b16 (), z0, 3)) + +/* +** ptrue_scale_3_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_3_f16_x_untied, svfloat16_t, + z0 = svscale_n_f16_x (svptrue_b16 (), z1, 3), + z0 = svscale_x (svptrue_b16 (), z1, 3)) + +/* +** ptrue_scale_m3_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_m3_f16_x_tied1, svfloat16_t, + z0 = svscale_n_f16_x (svptrue_b16 (), z0, -3), + z0 = svscale_x (svptrue_b16 (), z0, -3)) + +/* +** ptrue_scale_m3_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_m3_f16_x_untied, svfloat16_t, + z0 = svscale_n_f16_x (svptrue_b16 (), z1, -3), + z0 = svscale_x (svptrue_b16 (), z1, -3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f32.c new file mode 100644 index 000000000..747f8a639 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f32.c @@ -0,0 +1,330 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** scale_f32_m_tied1: +** fscale z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (scale_f32_m_tied1, svfloat32_t, svint32_t, + z0 = svscale_f32_m (p0, z0, z4), + z0 = svscale_m (p0, z0, z4)) + +/* +** scale_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fscale z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (scale_f32_m_tied2, svfloat32_t, svint32_t, + z0_res = svscale_f32_m (p0, z4, z0), + z0_res = svscale_m (p0, z4, z0)) + +/* +** scale_f32_m_untied: +** movprfx z0, z1 +** fscale z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (scale_f32_m_untied, svfloat32_t, svint32_t, + z0 = svscale_f32_m (p0, z1, z4), + z0 = svscale_m (p0, z1, z4)) + +/* +** scale_w0_f32_m_tied1: +** mov (z[0-9]+\.s), w0 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f32_m_tied1, svfloat32_t, int32_t, + z0 = svscale_n_f32_m (p0, z0, x0), + z0 = svscale_m (p0, z0, x0)) + +/* +** scale_w0_f32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f32_m_untied, svfloat32_t, int32_t, + z0 = svscale_n_f32_m (p0, z1, x0), + z0 = svscale_m (p0, z1, x0)) + +/* +** scale_3_f32_m_tied1: +** mov (z[0-9]+\.s), #3 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f32_m_tied1, svfloat32_t, + z0 = svscale_n_f32_m (p0, z0, 3), + z0 = svscale_m (p0, z0, 3)) + +/* +** scale_3_f32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #3 +** movprfx z0, z1 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f32_m_untied, svfloat32_t, + z0 = svscale_n_f32_m (p0, z1, 3), + z0 = svscale_m (p0, z1, 3)) + +/* +** scale_m3_f32_m: +** mov (z[0-9]+\.s), #-3 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_m3_f32_m, svfloat32_t, + z0 = svscale_n_f32_m (p0, z0, -3), + z0 = svscale_m (p0, z0, -3)) + +/* +** scale_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fscale z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (scale_f32_z_tied1, svfloat32_t, svint32_t, + z0 = svscale_f32_z (p0, z0, z4), + z0 = svscale_z (p0, z0, z4)) + +/* +** scale_f32_z_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, z4\.s +** fscale z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (scale_f32_z_tied2, svfloat32_t, svint32_t, + z0_res = svscale_f32_z (p0, z4, z0), + z0_res = svscale_z (p0, z4, z0)) + +/* +** scale_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fscale z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (scale_f32_z_untied, svfloat32_t, svint32_t, + z0 = svscale_f32_z (p0, z1, z4), + z0 = svscale_z (p0, z1, z4)) + +/* +** scale_w0_f32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f32_z_tied1, svfloat32_t, int32_t, + z0 = svscale_n_f32_z (p0, z0, x0), + z0 = svscale_z (p0, z0, x0)) + +/* +** scale_w0_f32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z1\.s +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ 
+TEST_UNIFORM_ZX (scale_w0_f32_z_untied, svfloat32_t, int32_t, + z0 = svscale_n_f32_z (p0, z1, x0), + z0 = svscale_z (p0, z1, x0)) + +/* +** scale_3_f32_z_tied1: +** mov (z[0-9]+\.s), #3 +** movprfx z0\.s, p0/z, z0\.s +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f32_z_tied1, svfloat32_t, + z0 = svscale_n_f32_z (p0, z0, 3), + z0 = svscale_z (p0, z0, 3)) + +/* +** scale_3_f32_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #3 +** movprfx z0\.s, p0/z, z1\.s +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f32_z_untied, svfloat32_t, + z0 = svscale_n_f32_z (p0, z1, 3), + z0 = svscale_z (p0, z1, 3)) + +/* +** scale_m3_f32_z: +** mov (z[0-9]+\.s), #-3 +** movprfx z0\.s, p0/z, z0\.s +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_m3_f32_z, svfloat32_t, + z0 = svscale_n_f32_z (p0, z0, -3), + z0 = svscale_z (p0, z0, -3)) + +/* +** scale_f32_x_tied1: +** fscale z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (scale_f32_x_tied1, svfloat32_t, svint32_t, + z0 = svscale_f32_x (p0, z0, z4), + z0 = svscale_x (p0, z0, z4)) + +/* +** scale_f32_x_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** fscale z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_DUAL_Z_REV (scale_f32_x_tied2, svfloat32_t, svint32_t, + z0_res = svscale_f32_x (p0, z4, z0), + z0_res = svscale_x (p0, z4, z0)) + +/* +** scale_f32_x_untied: +** movprfx z0, z1 +** fscale z0\.s, p0/m, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (scale_f32_x_untied, svfloat32_t, svint32_t, + z0 = svscale_f32_x (p0, z1, z4), + z0 = svscale_x (p0, z1, z4)) + +/* +** scale_w0_f32_x_tied1: +** mov (z[0-9]+\.s), w0 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f32_x_tied1, svfloat32_t, int32_t, + z0 = svscale_n_f32_x (p0, z0, x0), + z0 = svscale_x (p0, z0, x0)) + +/* +** scale_w0_f32_x_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_w0_f32_x_untied, svfloat32_t, int32_t, + z0 = svscale_n_f32_x (p0, z1, x0), + z0 = svscale_x (p0, z1, x0)) + +/* +** scale_3_f32_x_tied1: +** mov (z[0-9]+\.s), #3 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f32_x_tied1, svfloat32_t, + z0 = svscale_n_f32_x (p0, z0, 3), + z0 = svscale_x (p0, z0, 3)) + +/* +** scale_3_f32_x_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #3 +** movprfx z0, z1 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f32_x_untied, svfloat32_t, + z0 = svscale_n_f32_x (p0, z1, 3), + z0 = svscale_x (p0, z1, 3)) + +/* +** scale_m3_f32_x: +** mov (z[0-9]+\.s), #-3 +** fscale z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_m3_f32_x, svfloat32_t, + z0 = svscale_n_f32_x (p0, z0, -3), + z0 = svscale_x (p0, z0, -3)) + +/* +** ptrue_scale_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_scale_f32_x_tied1, svfloat32_t, svint32_t, + z0 = svscale_f32_x (svptrue_b32 (), z0, z4), + z0 = svscale_x (svptrue_b32 (), z0, z4)) + +/* +** ptrue_scale_f32_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_scale_f32_x_tied2, svfloat32_t, svint32_t, + z0_res = svscale_f32_x (svptrue_b32 (), z4, z0), + z0_res = svscale_x (svptrue_b32 (), z4, z0)) + +/* +** ptrue_scale_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_scale_f32_x_untied, svfloat32_t, svint32_t, + z0 = svscale_f32_x (svptrue_b32 (), z1, z4), + z0 = svscale_x (svptrue_b32 (), z1, z4)) + +/* +** ptrue_scale_3_f32_x_tied1: +** ... 
+** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_3_f32_x_tied1, svfloat32_t, + z0 = svscale_n_f32_x (svptrue_b32 (), z0, 3), + z0 = svscale_x (svptrue_b32 (), z0, 3)) + +/* +** ptrue_scale_3_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_3_f32_x_untied, svfloat32_t, + z0 = svscale_n_f32_x (svptrue_b32 (), z1, 3), + z0 = svscale_x (svptrue_b32 (), z1, 3)) + +/* +** ptrue_scale_m3_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_m3_f32_x_tied1, svfloat32_t, + z0 = svscale_n_f32_x (svptrue_b32 (), z0, -3), + z0 = svscale_x (svptrue_b32 (), z0, -3)) + +/* +** ptrue_scale_m3_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_m3_f32_x_untied, svfloat32_t, + z0 = svscale_n_f32_x (svptrue_b32 (), z1, -3), + z0 = svscale_x (svptrue_b32 (), z1, -3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f64.c new file mode 100644 index 000000000..004cbfa3e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_f64.c @@ -0,0 +1,330 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** scale_f64_m_tied1: +** fscale z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (scale_f64_m_tied1, svfloat64_t, svint64_t, + z0 = svscale_f64_m (p0, z0, z4), + z0 = svscale_m (p0, z0, z4)) + +/* +** scale_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_DUAL_Z_REV (scale_f64_m_tied2, svfloat64_t, svint64_t, + z0_res = svscale_f64_m (p0, z4, z0), + z0_res = svscale_m (p0, z4, z0)) + +/* +** scale_f64_m_untied: +** movprfx z0, z1 +** fscale z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (scale_f64_m_untied, svfloat64_t, svint64_t, + z0 = svscale_f64_m (p0, z1, z4), + z0 = svscale_m (p0, z1, z4)) + +/* +** scale_x0_f64_m_tied1: +** mov (z[0-9]+\.d), x0 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_x0_f64_m_tied1, svfloat64_t, int64_t, + z0 = svscale_n_f64_m (p0, z0, x0), + z0 = svscale_m (p0, z0, x0)) + +/* +** scale_x0_f64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_x0_f64_m_untied, svfloat64_t, int64_t, + z0 = svscale_n_f64_m (p0, z1, x0), + z0 = svscale_m (p0, z1, x0)) + +/* +** scale_3_f64_m_tied1: +** mov (z[0-9]+\.d), #3 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f64_m_tied1, svfloat64_t, + z0 = svscale_n_f64_m (p0, z0, 3), + z0 = svscale_m (p0, z0, 3)) + +/* +** scale_3_f64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #3 +** movprfx z0, z1 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f64_m_untied, svfloat64_t, + z0 = svscale_n_f64_m (p0, z1, 3), + z0 = svscale_m (p0, z1, 3)) + +/* +** scale_m3_f64_m: +** mov (z[0-9]+\.d), #-3 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_m3_f64_m, svfloat64_t, + z0 = svscale_n_f64_m (p0, z0, -3), + z0 = svscale_m (p0, z0, -3)) + +/* +** scale_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fscale z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (scale_f64_z_tied1, svfloat64_t, svint64_t, + z0 = svscale_f64_z (p0, z0, z4), + z0 = svscale_z (p0, z0, z4)) + +/* +** scale_f64_z_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, z4\.d +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_DUAL_Z_REV (scale_f64_z_tied2, 
svfloat64_t, svint64_t, + z0_res = svscale_f64_z (p0, z4, z0), + z0_res = svscale_z (p0, z4, z0)) + +/* +** scale_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fscale z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (scale_f64_z_untied, svfloat64_t, svint64_t, + z0 = svscale_f64_z (p0, z1, z4), + z0 = svscale_z (p0, z1, z4)) + +/* +** scale_x0_f64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_x0_f64_z_tied1, svfloat64_t, int64_t, + z0 = svscale_n_f64_z (p0, z0, x0), + z0 = svscale_z (p0, z0, x0)) + +/* +** scale_x0_f64_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z1\.d +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_x0_f64_z_untied, svfloat64_t, int64_t, + z0 = svscale_n_f64_z (p0, z1, x0), + z0 = svscale_z (p0, z1, x0)) + +/* +** scale_3_f64_z_tied1: +** mov (z[0-9]+\.d), #3 +** movprfx z0\.d, p0/z, z0\.d +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f64_z_tied1, svfloat64_t, + z0 = svscale_n_f64_z (p0, z0, 3), + z0 = svscale_z (p0, z0, 3)) + +/* +** scale_3_f64_z_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #3 +** movprfx z0\.d, p0/z, z1\.d +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f64_z_untied, svfloat64_t, + z0 = svscale_n_f64_z (p0, z1, 3), + z0 = svscale_z (p0, z1, 3)) + +/* +** scale_m3_f64_z: +** mov (z[0-9]+\.d), #-3 +** movprfx z0\.d, p0/z, z0\.d +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_m3_f64_z, svfloat64_t, + z0 = svscale_n_f64_z (p0, z0, -3), + z0 = svscale_z (p0, z0, -3)) + +/* +** scale_f64_x_tied1: +** fscale z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (scale_f64_x_tied1, svfloat64_t, svint64_t, + z0 = svscale_f64_x (p0, z0, z4), + z0 = svscale_x (p0, z0, z4)) + +/* +** scale_f64_x_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z4 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_DUAL_Z_REV (scale_f64_x_tied2, svfloat64_t, svint64_t, + z0_res = svscale_f64_x (p0, z4, z0), + z0_res = svscale_x (p0, z4, z0)) + +/* +** scale_f64_x_untied: +** movprfx z0, z1 +** fscale z0\.d, p0/m, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (scale_f64_x_untied, svfloat64_t, svint64_t, + z0 = svscale_f64_x (p0, z1, z4), + z0 = svscale_x (p0, z1, z4)) + +/* +** scale_x0_f64_x_tied1: +** mov (z[0-9]+\.d), x0 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_x0_f64_x_tied1, svfloat64_t, int64_t, + z0 = svscale_n_f64_x (p0, z0, x0), + z0 = svscale_x (p0, z0, x0)) + +/* +** scale_x0_f64_x_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (scale_x0_f64_x_untied, svfloat64_t, int64_t, + z0 = svscale_n_f64_x (p0, z1, x0), + z0 = svscale_x (p0, z1, x0)) + +/* +** scale_3_f64_x_tied1: +** mov (z[0-9]+\.d), #3 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f64_x_tied1, svfloat64_t, + z0 = svscale_n_f64_x (p0, z0, 3), + z0 = svscale_x (p0, z0, 3)) + +/* +** scale_3_f64_x_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #3 +** movprfx z0, z1 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_3_f64_x_untied, svfloat64_t, + z0 = svscale_n_f64_x (p0, z1, 3), + z0 = svscale_x (p0, z1, 3)) + +/* +** scale_m3_f64_x: +** mov (z[0-9]+\.d), #-3 +** fscale z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (scale_m3_f64_x, svfloat64_t, + z0 = svscale_n_f64_x (p0, z0, -3), + z0 = svscale_x (p0, z0, -3)) + +/* +** ptrue_scale_f64_x_tied1: +** ... 
+** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_scale_f64_x_tied1, svfloat64_t, svint64_t, + z0 = svscale_f64_x (svptrue_b64 (), z0, z4), + z0 = svscale_x (svptrue_b64 (), z0, z4)) + +/* +** ptrue_scale_f64_x_tied2: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z_REV (ptrue_scale_f64_x_tied2, svfloat64_t, svint64_t, + z0_res = svscale_f64_x (svptrue_b64 (), z4, z0), + z0_res = svscale_x (svptrue_b64 (), z4, z0)) + +/* +** ptrue_scale_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_DUAL_Z (ptrue_scale_f64_x_untied, svfloat64_t, svint64_t, + z0 = svscale_f64_x (svptrue_b64 (), z1, z4), + z0 = svscale_x (svptrue_b64 (), z1, z4)) + +/* +** ptrue_scale_3_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_3_f64_x_tied1, svfloat64_t, + z0 = svscale_n_f64_x (svptrue_b64 (), z0, 3), + z0 = svscale_x (svptrue_b64 (), z0, 3)) + +/* +** ptrue_scale_3_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_3_f64_x_untied, svfloat64_t, + z0 = svscale_n_f64_x (svptrue_b64 (), z1, 3), + z0 = svscale_x (svptrue_b64 (), z1, 3)) + +/* +** ptrue_scale_m3_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_m3_f64_x_tied1, svfloat64_t, + z0 = svscale_n_f64_x (svptrue_b64 (), z0, -3), + z0 = svscale_x (svptrue_b64 (), z0, -3)) + +/* +** ptrue_scale_m3_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_scale_m3_f64_x_untied, svfloat64_t, + z0 = svscale_n_f64_x (svptrue_b64 (), z1, -3), + z0 = svscale_x (svptrue_b64 (), z1, -3)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_b.c new file mode 100644 index 000000000..a135e9c99 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_b.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_b_tied1: +** sel p0\.b, p3, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (sel_b_tied1, + p0 = svsel_b (p3, p0, p1), + p0 = svsel (p3, p0, p1)) + +/* +** sel_b_tied2: +** sel p0\.b, p3, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (sel_b_tied2, + p0 = svsel_b (p3, p1, p0), + p0 = svsel (p3, p1, p0)) + +/* +** sel_b_untied: +** sel p0\.b, p3, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (sel_b_untied, + p0 = svsel_b (p3, p1, p2), + p0 = svsel (p3, p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_bf16.c new file mode 100644 index 000000000..44636d8f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_bf16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_bf16_tied1: +** sel z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sel_bf16_tied1, svbfloat16_t, + z0 = svsel_bf16 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_bf16_tied2: +** sel z0\.h, p0, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sel_bf16_tied2, svbfloat16_t, + z0 = svsel_bf16 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_bf16_untied: +** sel z0\.h, p0, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sel_bf16_untied, svbfloat16_t, + z0 = svsel_bf16 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f16.c new file mode 100644 index 
000000000..35750ea81 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_f16_tied1: +** sel z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sel_f16_tied1, svfloat16_t, + z0 = svsel_f16 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_f16_tied2: +** sel z0\.h, p0, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sel_f16_tied2, svfloat16_t, + z0 = svsel_f16 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_f16_untied: +** sel z0\.h, p0, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sel_f16_untied, svfloat16_t, + z0 = svsel_f16 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f32.c new file mode 100644 index 000000000..639a84724 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_f32_tied1: +** sel z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sel_f32_tied1, svfloat32_t, + z0 = svsel_f32 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_f32_tied2: +** sel z0\.s, p0, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (sel_f32_tied2, svfloat32_t, + z0 = svsel_f32 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_f32_untied: +** sel z0\.s, p0, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sel_f32_untied, svfloat32_t, + z0 = svsel_f32 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f64.c new file mode 100644 index 000000000..048d6e52a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_f64_tied1: +** sel z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sel_f64_tied1, svfloat64_t, + z0 = svsel_f64 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_f64_tied2: +** sel z0\.d, p0, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (sel_f64_tied2, svfloat64_t, + z0 = svsel_f64 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_f64_untied: +** sel z0\.d, p0, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sel_f64_untied, svfloat64_t, + z0 = svsel_f64 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s16.c new file mode 100644 index 000000000..e162da499 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_s16_tied1: +** sel z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sel_s16_tied1, svint16_t, + z0 = svsel_s16 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_s16_tied2: +** sel z0\.h, p0, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sel_s16_tied2, svint16_t, + z0 = svsel_s16 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_s16_untied: +** sel z0\.h, p0, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sel_s16_untied, svint16_t, + z0 = svsel_s16 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s32.c new file mode 100644 index 000000000..80839d803 --- 
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_s32_tied1: +** sel z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sel_s32_tied1, svint32_t, + z0 = svsel_s32 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_s32_tied2: +** sel z0\.s, p0, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (sel_s32_tied2, svint32_t, + z0 = svsel_s32 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_s32_untied: +** sel z0\.s, p0, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sel_s32_untied, svint32_t, + z0 = svsel_s32 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s64.c new file mode 100644 index 000000000..85a77eafb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_s64_tied1: +** sel z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sel_s64_tied1, svint64_t, + z0 = svsel_s64 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_s64_tied2: +** sel z0\.d, p0, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (sel_s64_tied2, svint64_t, + z0 = svsel_s64 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_s64_untied: +** sel z0\.d, p0, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sel_s64_untied, svint64_t, + z0 = svsel_s64 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s8.c new file mode 100644 index 000000000..28c43f627 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_s8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_s8_tied1: +** sel z0\.b, p0, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sel_s8_tied1, svint8_t, + z0 = svsel_s8 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_s8_tied2: +** sel z0\.b, p0, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (sel_s8_tied2, svint8_t, + z0 = svsel_s8 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_s8_untied: +** sel z0\.b, p0, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (sel_s8_untied, svint8_t, + z0 = svsel_s8 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u16.c new file mode 100644 index 000000000..b85ede803 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_u16_tied1: +** sel z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sel_u16_tied1, svuint16_t, + z0 = svsel_u16 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_u16_tied2: +** sel z0\.h, p0, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sel_u16_tied2, svuint16_t, + z0 = svsel_u16 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_u16_untied: +** sel z0\.h, p0, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sel_u16_untied, svuint16_t, + z0 = svsel_u16 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u32.c new file mode 100644 index 000000000..636cf8790 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_u32_tied1: +** sel z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sel_u32_tied1, svuint32_t, + z0 = svsel_u32 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_u32_tied2: +** sel z0\.s, p0, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (sel_u32_tied2, svuint32_t, + z0 = svsel_u32 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_u32_untied: +** sel z0\.s, p0, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sel_u32_untied, svuint32_t, + z0 = svsel_u32 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u64.c new file mode 100644 index 000000000..6325ca56f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_u64_tied1: +** sel z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sel_u64_tied1, svuint64_t, + z0 = svsel_u64 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_u64_tied2: +** sel z0\.d, p0, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (sel_u64_tied2, svuint64_t, + z0 = svsel_u64 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_u64_untied: +** sel z0\.d, p0, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sel_u64_untied, svuint64_t, + z0 = svsel_u64 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u8.c new file mode 100644 index 000000000..5af53dccd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sel_u8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sel_u8_tied1: +** sel z0\.b, p0, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sel_u8_tied1, svuint8_t, + z0 = svsel_u8 (p0, z0, z1), + z0 = svsel (p0, z0, z1)) + +/* +** sel_u8_tied2: +** sel z0\.b, p0, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (sel_u8_tied2, svuint8_t, + z0 = svsel_u8 (p0, z1, z0), + z0 = svsel (p0, z1, z0)) + +/* +** sel_u8_untied: +** sel z0\.b, p0, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (sel_u8_untied, svuint8_t, + z0 = svsel_u8 (p0, z1, z2), + z0 = svsel (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_bf16.c new file mode 100644 index 000000000..b160a2517 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_bf16.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_bf16_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_bf16_z24_0, svbfloat16x2_t, svbfloat16_t, + z24 = svset2_bf16 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_bf16_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_bf16_z24_1, svbfloat16x2_t, svbfloat16_t, + z24 = svset2_bf16 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_bf16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_bf16_z4_0, svbfloat16x2_t, svbfloat16_t, + z4 = svset2_bf16 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_bf16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_bf16_z4_1, svbfloat16x2_t, svbfloat16_t, + z4 = svset2_bf16 (z4, 1, z0), + z4 = svset2 (z4, 
1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f16.c new file mode 100644 index 000000000..859600698 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f16.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_f16_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_f16_z24_0, svfloat16x2_t, svfloat16_t, + z24 = svset2_f16 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_f16_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_f16_z24_1, svfloat16x2_t, svfloat16_t, + z24 = svset2_f16 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_f16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_f16_z4_0, svfloat16x2_t, svfloat16_t, + z4 = svset2_f16 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_f16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_f16_z4_1, svfloat16x2_t, svfloat16_t, + z4 = svset2_f16 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f32.c new file mode 100644 index 000000000..a95ff2fc5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f32.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_f32_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_f32_z24_0, svfloat32x2_t, svfloat32_t, + z24 = svset2_f32 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_f32_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_f32_z24_1, svfloat32x2_t, svfloat32_t, + z24 = svset2_f32 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_f32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_f32_z4_0, svfloat32x2_t, svfloat32_t, + z4 = svset2_f32 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_f32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_f32_z4_1, svfloat32x2_t, svfloat32_t, + z4 = svset2_f32 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f64.c new file mode 100644 index 000000000..77837b7d8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_f64.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_f64_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_f64_z24_0, svfloat64x2_t, svfloat64_t, + z24 = svset2_f64 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_f64_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_f64_z24_1, svfloat64x2_t, svfloat64_t, + z24 = svset2_f64 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_f64_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_f64_z4_0, svfloat64x2_t, svfloat64_t, + z4 = svset2_f64 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_f64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_f64_z4_1, svfloat64x2_t, svfloat64_t, + z4 = svset2_f64 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s16.c new file mode 100644 index 000000000..aa2e70fd1 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s16.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_s16_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_s16_z24_0, svint16x2_t, svint16_t, + z24 = svset2_s16 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_s16_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_s16_z24_1, svint16x2_t, svint16_t, + z24 = svset2_s16 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_s16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_s16_z4_0, svint16x2_t, svint16_t, + z4 = svset2_s16 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_s16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_s16_z4_1, svint16x2_t, svint16_t, + z4 = svset2_s16 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s32.c new file mode 100644 index 000000000..3a7c289aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s32.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_s32_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_s32_z24_0, svint32x2_t, svint32_t, + z24 = svset2_s32 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_s32_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_s32_z24_1, svint32x2_t, svint32_t, + z24 = svset2_s32 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_s32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_s32_z4_0, svint32x2_t, svint32_t, + z4 = svset2_s32 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_s32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_s32_z4_1, svint32x2_t, svint32_t, + z4 = svset2_s32 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s64.c new file mode 100644 index 000000000..ca6df54d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s64.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_s64_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_s64_z24_0, svint64x2_t, svint64_t, + z24 = svset2_s64 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_s64_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_s64_z24_1, svint64x2_t, svint64_t, + z24 = svset2_s64 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_s64_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_s64_z4_0, svint64x2_t, svint64_t, + z4 = svset2_s64 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_s64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_s64_z4_1, svint64x2_t, svint64_t, + z4 = svset2_s64 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s8.c new file mode 100644 index 000000000..e143128a4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_s8.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_s8_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_s8_z24_0, svint8x2_t, 
svint8_t, + z24 = svset2_s8 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_s8_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_s8_z24_1, svint8x2_t, svint8_t, + z24 = svset2_s8 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_s8_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_s8_z4_0, svint8x2_t, svint8_t, + z4 = svset2_s8 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_s8_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_s8_z4_1, svint8x2_t, svint8_t, + z4 = svset2_s8 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u16.c new file mode 100644 index 000000000..53da08398 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u16.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_u16_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_u16_z24_0, svuint16x2_t, svuint16_t, + z24 = svset2_u16 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_u16_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_u16_z24_1, svuint16x2_t, svuint16_t, + z24 = svset2_u16 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_u16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_u16_z4_0, svuint16x2_t, svuint16_t, + z4 = svset2_u16 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_u16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_u16_z4_1, svuint16x2_t, svuint16_t, + z4 = svset2_u16 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u32.c new file mode 100644 index 000000000..5266a62d8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u32.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_u32_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_u32_z24_0, svuint32x2_t, svuint32_t, + z24 = svset2_u32 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_u32_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_u32_z24_1, svuint32x2_t, svuint32_t, + z24 = svset2_u32 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_u32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_u32_z4_0, svuint32x2_t, svuint32_t, + z4 = svset2_u32 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_u32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_u32_z4_1, svuint32x2_t, svuint32_t, + z4 = svset2_u32 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u64.c new file mode 100644 index 000000000..f7d2a1807 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u64.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_u64_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_u64_z24_0, svuint64x2_t, svuint64_t, + z24 = svset2_u64 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_u64_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_u64_z24_1, svuint64x2_t, svuint64_t, + z24 = svset2_u64 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_u64_z4_0: 
+** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_u64_z4_0, svuint64x2_t, svuint64_t, + z4 = svset2_u64 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_u64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_u64_z4_1, svuint64x2_t, svuint64_t, + z4 = svset2_u64 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u8.c new file mode 100644 index 000000000..9494a0e54 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set2_u8.c @@ -0,0 +1,41 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set2_u8_z24_0: +** mov z25\.d, z5\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set2_u8_z24_0, svuint8x2_t, svuint8_t, + z24 = svset2_u8 (z4, 0, z0), + z24 = svset2 (z4, 0, z0)) + +/* +** set2_u8_z24_1: +** mov z24\.d, z4\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set2_u8_z24_1, svuint8x2_t, svuint8_t, + z24 = svset2_u8 (z4, 1, z0), + z24 = svset2 (z4, 1, z0)) + +/* +** set2_u8_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set2_u8_z4_0, svuint8x2_t, svuint8_t, + z4 = svset2_u8 (z4, 0, z0), + z4 = svset2 (z4, 0, z0)) + +/* +** set2_u8_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set2_u8_z4_1, svuint8x2_t, svuint8_t, + z4 = svset2_u8 (z4, 1, z0), + z4 = svset2 (z4, 1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_bf16.c new file mode 100644 index 000000000..4e0707d09 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_bf16.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_bf16_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_bf16_z24_0, svbfloat16x3_t, svbfloat16_t, + z24 = svset3_bf16 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_bf16_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_bf16_z24_1, svbfloat16x3_t, svbfloat16_t, + z24 = svset3_bf16 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_bf16_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_bf16_z24_2, svbfloat16x3_t, svbfloat16_t, + z24 = svset3_bf16 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_bf16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_bf16_z4_0, svbfloat16x3_t, svbfloat16_t, + z4 = svset3_bf16 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_bf16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_bf16_z4_1, svbfloat16x3_t, svbfloat16_t, + z4 = svset3_bf16 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_bf16_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_bf16_z4_2, svbfloat16x3_t, svbfloat16_t, + z4 = svset3_bf16 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f16.c new file mode 100644 index 000000000..b6bb3a2bf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f16.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_f16_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_f16_z24_0, svfloat16x3_t, svfloat16_t, + z24 = svset3_f16 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** 
set3_f16_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_f16_z24_1, svfloat16x3_t, svfloat16_t, + z24 = svset3_f16 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_f16_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_f16_z24_2, svfloat16x3_t, svfloat16_t, + z24 = svset3_f16 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_f16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_f16_z4_0, svfloat16x3_t, svfloat16_t, + z4 = svset3_f16 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_f16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_f16_z4_1, svfloat16x3_t, svfloat16_t, + z4 = svset3_f16 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_f16_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_f16_z4_2, svfloat16x3_t, svfloat16_t, + z4 = svset3_f16 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f32.c new file mode 100644 index 000000000..659bc713f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f32.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_f32_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_f32_z24_0, svfloat32x3_t, svfloat32_t, + z24 = svset3_f32 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_f32_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_f32_z24_1, svfloat32x3_t, svfloat32_t, + z24 = svset3_f32 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_f32_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_f32_z24_2, svfloat32x3_t, svfloat32_t, + z24 = svset3_f32 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_f32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_f32_z4_0, svfloat32x3_t, svfloat32_t, + z4 = svset3_f32 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_f32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_f32_z4_1, svfloat32x3_t, svfloat32_t, + z4 = svset3_f32 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_f32_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_f32_z4_2, svfloat32x3_t, svfloat32_t, + z4 = svset3_f32 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f64.c new file mode 100644 index 000000000..2cf3b6015 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_f64.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_f64_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_f64_z24_0, svfloat64x3_t, svfloat64_t, + z24 = svset3_f64 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_f64_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_f64_z24_1, svfloat64x3_t, svfloat64_t, + z24 = svset3_f64 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_f64_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_f64_z24_2, svfloat64x3_t, svfloat64_t, + z24 = svset3_f64 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_f64_z4_0: +** mov z4\.d, z0\.d +** ret 
+*/ +TEST_SET (set3_f64_z4_0, svfloat64x3_t, svfloat64_t, + z4 = svset3_f64 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_f64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_f64_z4_1, svfloat64x3_t, svfloat64_t, + z4 = svset3_f64 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_f64_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_f64_z4_2, svfloat64x3_t, svfloat64_t, + z4 = svset3_f64 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s16.c new file mode 100644 index 000000000..907ae9894 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s16.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_s16_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_s16_z24_0, svint16x3_t, svint16_t, + z24 = svset3_s16 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_s16_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_s16_z24_1, svint16x3_t, svint16_t, + z24 = svset3_s16 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_s16_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_s16_z24_2, svint16x3_t, svint16_t, + z24 = svset3_s16 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_s16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_s16_z4_0, svint16x3_t, svint16_t, + z4 = svset3_s16 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_s16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_s16_z4_1, svint16x3_t, svint16_t, + z4 = svset3_s16 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_s16_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_s16_z4_2, svint16x3_t, svint16_t, + z4 = svset3_s16 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s32.c new file mode 100644 index 000000000..0baa33c3a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s32.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_s32_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_s32_z24_0, svint32x3_t, svint32_t, + z24 = svset3_s32 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_s32_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_s32_z24_1, svint32x3_t, svint32_t, + z24 = svset3_s32 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_s32_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_s32_z24_2, svint32x3_t, svint32_t, + z24 = svset3_s32 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_s32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_s32_z4_0, svint32x3_t, svint32_t, + z4 = svset3_s32 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_s32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_s32_z4_1, svint32x3_t, svint32_t, + z4 = svset3_s32 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_s32_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_s32_z4_2, svint32x3_t, svint32_t, + z4 = svset3_s32 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s64.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s64.c new file mode 100644 index 000000000..d1d142c71 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s64.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_s64_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_s64_z24_0, svint64x3_t, svint64_t, + z24 = svset3_s64 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_s64_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_s64_z24_1, svint64x3_t, svint64_t, + z24 = svset3_s64 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_s64_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_s64_z24_2, svint64x3_t, svint64_t, + z24 = svset3_s64 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_s64_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_s64_z4_0, svint64x3_t, svint64_t, + z4 = svset3_s64 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_s64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_s64_z4_1, svint64x3_t, svint64_t, + z4 = svset3_s64 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_s64_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_s64_z4_2, svint64x3_t, svint64_t, + z4 = svset3_s64 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s8.c new file mode 100644 index 000000000..8badf4b1d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_s8.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_s8_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_s8_z24_0, svint8x3_t, svint8_t, + z24 = svset3_s8 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_s8_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_s8_z24_1, svint8x3_t, svint8_t, + z24 = svset3_s8 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_s8_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_s8_z24_2, svint8x3_t, svint8_t, + z24 = svset3_s8 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_s8_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_s8_z4_0, svint8x3_t, svint8_t, + z4 = svset3_s8 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_s8_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_s8_z4_1, svint8x3_t, svint8_t, + z4 = svset3_s8 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_s8_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_s8_z4_2, svint8x3_t, svint8_t, + z4 = svset3_s8 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u16.c new file mode 100644 index 000000000..df7ce88d8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u16.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_u16_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_u16_z24_0, svuint16x3_t, svuint16_t, + z24 = svset3_u16 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_u16_z24_1: +** mov z24\.d, z4\.d 
+** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_u16_z24_1, svuint16x3_t, svuint16_t, + z24 = svset3_u16 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_u16_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_u16_z24_2, svuint16x3_t, svuint16_t, + z24 = svset3_u16 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_u16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_u16_z4_0, svuint16x3_t, svuint16_t, + z4 = svset3_u16 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_u16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_u16_z4_1, svuint16x3_t, svuint16_t, + z4 = svset3_u16 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_u16_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_u16_z4_2, svuint16x3_t, svuint16_t, + z4 = svset3_u16 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u32.c new file mode 100644 index 000000000..703a68f5c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u32.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_u32_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_u32_z24_0, svuint32x3_t, svuint32_t, + z24 = svset3_u32 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_u32_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_u32_z24_1, svuint32x3_t, svuint32_t, + z24 = svset3_u32 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_u32_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_u32_z24_2, svuint32x3_t, svuint32_t, + z24 = svset3_u32 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_u32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_u32_z4_0, svuint32x3_t, svuint32_t, + z4 = svset3_u32 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_u32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_u32_z4_1, svuint32x3_t, svuint32_t, + z4 = svset3_u32 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_u32_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_u32_z4_2, svuint32x3_t, svuint32_t, + z4 = svset3_u32 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u64.c new file mode 100644 index 000000000..bff5b3539 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u64.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_u64_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_u64_z24_0, svuint64x3_t, svuint64_t, + z24 = svset3_u64 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_u64_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_u64_z24_1, svuint64x3_t, svuint64_t, + z24 = svset3_u64 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_u64_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_u64_z24_2, svuint64x3_t, svuint64_t, + z24 = svset3_u64 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_u64_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_u64_z4_0, svuint64x3_t, svuint64_t, + z4 = 
svset3_u64 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_u64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_u64_z4_1, svuint64x3_t, svuint64_t, + z4 = svset3_u64 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_u64_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_u64_z4_2, svuint64x3_t, svuint64_t, + z4 = svset3_u64 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u8.c new file mode 100644 index 000000000..9f40001c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set3_u8.c @@ -0,0 +1,63 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set3_u8_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set3_u8_z24_0, svuint8x3_t, svuint8_t, + z24 = svset3_u8 (z4, 0, z0), + z24 = svset3 (z4, 0, z0)) + +/* +** set3_u8_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set3_u8_z24_1, svuint8x3_t, svuint8_t, + z24 = svset3_u8 (z4, 1, z0), + z24 = svset3 (z4, 1, z0)) + +/* +** set3_u8_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set3_u8_z24_2, svuint8x3_t, svuint8_t, + z24 = svset3_u8 (z4, 2, z0), + z24 = svset3 (z4, 2, z0)) + +/* +** set3_u8_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set3_u8_z4_0, svuint8x3_t, svuint8_t, + z4 = svset3_u8 (z4, 0, z0), + z4 = svset3 (z4, 0, z0)) + +/* +** set3_u8_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set3_u8_z4_1, svuint8x3_t, svuint8_t, + z4 = svset3_u8 (z4, 1, z0), + z4 = svset3 (z4, 1, z0)) + +/* +** set3_u8_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set3_u8_z4_2, svuint8x3_t, svuint8_t, + z4 = svset3_u8 (z4, 2, z0), + z4 = svset3 (z4, 2, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_bf16.c new file mode 100644 index 000000000..4e26c1117 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_bf16.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_bf16_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_bf16_z24_0, svbfloat16x4_t, svbfloat16_t, + z24 = svset4_bf16 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_bf16_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_bf16_z24_1, svbfloat16x4_t, svbfloat16_t, + z24 = svset4_bf16 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_bf16_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_bf16_z24_2, svbfloat16x4_t, svbfloat16_t, + z24 = svset4_bf16 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_bf16_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_bf16_z24_3, svbfloat16x4_t, svbfloat16_t, + z24 = svset4_bf16 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_bf16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_bf16_z4_0, svbfloat16x4_t, svbfloat16_t, + z4 = svset4_bf16 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_bf16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_bf16_z4_1, svbfloat16x4_t, svbfloat16_t, + z4 = svset4_bf16 (z4, 1, 
z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_bf16_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_bf16_z4_2, svbfloat16x4_t, svbfloat16_t, + z4 = svset4_bf16 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_bf16_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_bf16_z4_3, svbfloat16x4_t, svbfloat16_t, + z4 = svset4_bf16 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f16.c new file mode 100644 index 000000000..a28ff9ca6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f16.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_f16_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_f16_z24_0, svfloat16x4_t, svfloat16_t, + z24 = svset4_f16 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_f16_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_f16_z24_1, svfloat16x4_t, svfloat16_t, + z24 = svset4_f16 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_f16_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_f16_z24_2, svfloat16x4_t, svfloat16_t, + z24 = svset4_f16 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_f16_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_f16_z24_3, svfloat16x4_t, svfloat16_t, + z24 = svset4_f16 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_f16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_f16_z4_0, svfloat16x4_t, svfloat16_t, + z4 = svset4_f16 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_f16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_f16_z4_1, svfloat16x4_t, svfloat16_t, + z4 = svset4_f16 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_f16_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_f16_z4_2, svfloat16x4_t, svfloat16_t, + z4 = svset4_f16 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_f16_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_f16_z4_3, svfloat16x4_t, svfloat16_t, + z4 = svset4_f16 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f32.c new file mode 100644 index 000000000..e6e3f5ebd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f32.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_f32_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_f32_z24_0, svfloat32x4_t, svfloat32_t, + z24 = svset4_f32 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_f32_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_f32_z24_1, svfloat32x4_t, svfloat32_t, + z24 = svset4_f32 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_f32_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_f32_z24_2, svfloat32x4_t, svfloat32_t, + z24 = svset4_f32 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_f32_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, 
z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_f32_z24_3, svfloat32x4_t, svfloat32_t, + z24 = svset4_f32 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_f32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_f32_z4_0, svfloat32x4_t, svfloat32_t, + z4 = svset4_f32 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_f32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_f32_z4_1, svfloat32x4_t, svfloat32_t, + z4 = svset4_f32 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_f32_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_f32_z4_2, svfloat32x4_t, svfloat32_t, + z4 = svset4_f32 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_f32_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_f32_z4_3, svfloat32x4_t, svfloat32_t, + z4 = svset4_f32 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f64.c new file mode 100644 index 000000000..3ceaa459a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_f64.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_f64_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_f64_z24_0, svfloat64x4_t, svfloat64_t, + z24 = svset4_f64 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_f64_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_f64_z24_1, svfloat64x4_t, svfloat64_t, + z24 = svset4_f64 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_f64_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_f64_z24_2, svfloat64x4_t, svfloat64_t, + z24 = svset4_f64 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_f64_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_f64_z24_3, svfloat64x4_t, svfloat64_t, + z24 = svset4_f64 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_f64_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_f64_z4_0, svfloat64x4_t, svfloat64_t, + z4 = svset4_f64 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_f64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_f64_z4_1, svfloat64x4_t, svfloat64_t, + z4 = svset4_f64 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_f64_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_f64_z4_2, svfloat64x4_t, svfloat64_t, + z4 = svset4_f64 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_f64_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_f64_z4_3, svfloat64x4_t, svfloat64_t, + z4 = svset4_f64 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s16.c new file mode 100644 index 000000000..3cef6ebe8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s16.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_s16_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_s16_z24_0, svint16x4_t, svint16_t, + z24 = svset4_s16 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_s16_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov 
z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_s16_z24_1, svint16x4_t, svint16_t, + z24 = svset4_s16 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_s16_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_s16_z24_2, svint16x4_t, svint16_t, + z24 = svset4_s16 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_s16_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_s16_z24_3, svint16x4_t, svint16_t, + z24 = svset4_s16 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_s16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_s16_z4_0, svint16x4_t, svint16_t, + z4 = svset4_s16 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_s16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_s16_z4_1, svint16x4_t, svint16_t, + z4 = svset4_s16 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_s16_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_s16_z4_2, svint16x4_t, svint16_t, + z4 = svset4_s16 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_s16_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_s16_z4_3, svint16x4_t, svint16_t, + z4 = svset4_s16 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s32.c new file mode 100644 index 000000000..49f646e8d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s32.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_s32_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_s32_z24_0, svint32x4_t, svint32_t, + z24 = svset4_s32 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_s32_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_s32_z24_1, svint32x4_t, svint32_t, + z24 = svset4_s32 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_s32_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_s32_z24_2, svint32x4_t, svint32_t, + z24 = svset4_s32 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_s32_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_s32_z24_3, svint32x4_t, svint32_t, + z24 = svset4_s32 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_s32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_s32_z4_0, svint32x4_t, svint32_t, + z4 = svset4_s32 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_s32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_s32_z4_1, svint32x4_t, svint32_t, + z4 = svset4_s32 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_s32_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_s32_z4_2, svint32x4_t, svint32_t, + z4 = svset4_s32 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_s32_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_s32_z4_3, svint32x4_t, svint32_t, + z4 = svset4_s32 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s64.c new file mode 100644 index 000000000..7544e25a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s64.c @@ -0,0 +1,87 @@ +/* { 
dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_s64_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_s64_z24_0, svint64x4_t, svint64_t, + z24 = svset4_s64 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_s64_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_s64_z24_1, svint64x4_t, svint64_t, + z24 = svset4_s64 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_s64_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_s64_z24_2, svint64x4_t, svint64_t, + z24 = svset4_s64 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_s64_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_s64_z24_3, svint64x4_t, svint64_t, + z24 = svset4_s64 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_s64_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_s64_z4_0, svint64x4_t, svint64_t, + z4 = svset4_s64 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_s64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_s64_z4_1, svint64x4_t, svint64_t, + z4 = svset4_s64 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_s64_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_s64_z4_2, svint64x4_t, svint64_t, + z4 = svset4_s64 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_s64_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_s64_z4_3, svint64x4_t, svint64_t, + z4 = svset4_s64 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s8.c new file mode 100644 index 000000000..2ec9ff059 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_s8.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_s8_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_s8_z24_0, svint8x4_t, svint8_t, + z24 = svset4_s8 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_s8_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_s8_z24_1, svint8x4_t, svint8_t, + z24 = svset4_s8 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_s8_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_s8_z24_2, svint8x4_t, svint8_t, + z24 = svset4_s8 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_s8_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_s8_z24_3, svint8x4_t, svint8_t, + z24 = svset4_s8 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_s8_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_s8_z4_0, svint8x4_t, svint8_t, + z4 = svset4_s8 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_s8_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_s8_z4_1, svint8x4_t, svint8_t, + z4 = svset4_s8 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_s8_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_s8_z4_2, svint8x4_t, svint8_t, + z4 = svset4_s8 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_s8_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET 
(set4_s8_z4_3, svint8x4_t, svint8_t, + z4 = svset4_s8 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u16.c new file mode 100644 index 000000000..c9499b044 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u16.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_u16_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_u16_z24_0, svuint16x4_t, svuint16_t, + z24 = svset4_u16 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_u16_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_u16_z24_1, svuint16x4_t, svuint16_t, + z24 = svset4_u16 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_u16_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_u16_z24_2, svuint16x4_t, svuint16_t, + z24 = svset4_u16 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_u16_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_u16_z24_3, svuint16x4_t, svuint16_t, + z24 = svset4_u16 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_u16_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_u16_z4_0, svuint16x4_t, svuint16_t, + z4 = svset4_u16 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_u16_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_u16_z4_1, svuint16x4_t, svuint16_t, + z4 = svset4_u16 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_u16_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_u16_z4_2, svuint16x4_t, svuint16_t, + z4 = svset4_u16 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_u16_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_u16_z4_3, svuint16x4_t, svuint16_t, + z4 = svset4_u16 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u32.c new file mode 100644 index 000000000..00b3dc513 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u32.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_u32_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_u32_z24_0, svuint32x4_t, svuint32_t, + z24 = svset4_u32 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_u32_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_u32_z24_1, svuint32x4_t, svuint32_t, + z24 = svset4_u32 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_u32_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_u32_z24_2, svuint32x4_t, svuint32_t, + z24 = svset4_u32 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_u32_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_u32_z24_3, svuint32x4_t, svuint32_t, + z24 = svset4_u32 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_u32_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_u32_z4_0, svuint32x4_t, svuint32_t, + z4 = svset4_u32 (z4, 0, 
z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_u32_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_u32_z4_1, svuint32x4_t, svuint32_t, + z4 = svset4_u32 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_u32_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_u32_z4_2, svuint32x4_t, svuint32_t, + z4 = svset4_u32 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_u32_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_u32_z4_3, svuint32x4_t, svuint32_t, + z4 = svset4_u32 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u64.c new file mode 100644 index 000000000..d2f048b82 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u64.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_u64_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_u64_z24_0, svuint64x4_t, svuint64_t, + z24 = svset4_u64 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_u64_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_u64_z24_1, svuint64x4_t, svuint64_t, + z24 = svset4_u64 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_u64_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_u64_z24_2, svuint64x4_t, svuint64_t, + z24 = svset4_u64 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_u64_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_u64_z24_3, svuint64x4_t, svuint64_t, + z24 = svset4_u64 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_u64_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_u64_z4_0, svuint64x4_t, svuint64_t, + z4 = svset4_u64 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_u64_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_u64_z4_1, svuint64x4_t, svuint64_t, + z4 = svset4_u64 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_u64_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_u64_z4_2, svuint64x4_t, svuint64_t, + z4 = svset4_u64 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_u64_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_u64_z4_3, svuint64x4_t, svuint64_t, + z4 = svset4_u64 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u8.c new file mode 100644 index 000000000..b4f27c6f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/set4_u8.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** set4_u8_z24_0: +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z24\.d, z0\.d +** ret +*/ +TEST_SET (set4_u8_z24_0, svuint8x4_t, svuint8_t, + z24 = svset4_u8 (z4, 0, z0), + z24 = svset4 (z4, 0, z0)) + +/* +** set4_u8_z24_1: +** mov z24\.d, z4\.d +** mov z26\.d, z6\.d +** mov z27\.d, z7\.d +** mov z25\.d, z0\.d +** ret +*/ +TEST_SET (set4_u8_z24_1, svuint8x4_t, svuint8_t, + z24 = svset4_u8 (z4, 1, z0), + z24 = svset4 (z4, 1, z0)) + +/* +** set4_u8_z24_2: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z27\.d, z7\.d +** mov z26\.d, z0\.d +** ret +*/ +TEST_SET (set4_u8_z24_2, svuint8x4_t, svuint8_t, + z24 = 
svset4_u8 (z4, 2, z0), + z24 = svset4 (z4, 2, z0)) + +/* +** set4_u8_z24_3: +** mov z24\.d, z4\.d +** mov z25\.d, z5\.d +** mov z26\.d, z6\.d +** mov z27\.d, z0\.d +** ret +*/ +TEST_SET (set4_u8_z24_3, svuint8x4_t, svuint8_t, + z24 = svset4_u8 (z4, 3, z0), + z24 = svset4 (z4, 3, z0)) + +/* +** set4_u8_z4_0: +** mov z4\.d, z0\.d +** ret +*/ +TEST_SET (set4_u8_z4_0, svuint8x4_t, svuint8_t, + z4 = svset4_u8 (z4, 0, z0), + z4 = svset4 (z4, 0, z0)) + +/* +** set4_u8_z4_1: +** mov z5\.d, z0\.d +** ret +*/ +TEST_SET (set4_u8_z4_1, svuint8x4_t, svuint8_t, + z4 = svset4_u8 (z4, 1, z0), + z4 = svset4 (z4, 1, z0)) + +/* +** set4_u8_z4_2: +** mov z6\.d, z0\.d +** ret +*/ +TEST_SET (set4_u8_z4_2, svuint8x4_t, svuint8_t, + z4 = svset4_u8 (z4, 2, z0), + z4 = svset4 (z4, 2, z0)) + +/* +** set4_u8_z4_3: +** mov z7\.d, z0\.d +** ret +*/ +TEST_SET (set4_u8_z4_3, svuint8x4_t, svuint8_t, + z4 = svset4_u8 (z4, 3, z0), + z4 = svset4 (z4, 3, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_bf16.c new file mode 100644 index 000000000..3d2dbf20d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_bf16.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_bf16_tied1: +** splice z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (splice_bf16_tied1, svbfloat16_t, + z0 = svsplice_bf16 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_bf16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (splice_bf16_tied2, svbfloat16_t, + z0 = svsplice_bf16 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_bf16_untied: +** movprfx z0, z1 +** splice z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (splice_bf16_untied, svbfloat16_t, + z0 = svsplice_bf16 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f16.c new file mode 100644 index 000000000..b796eaf3d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f16.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_f16_tied1: +** splice z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (splice_f16_tied1, svfloat16_t, + z0 = svsplice_f16 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (splice_f16_tied2, svfloat16_t, + z0 = svsplice_f16 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_f16_untied: +** movprfx z0, z1 +** splice z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (splice_f16_untied, svfloat16_t, + z0 = svsplice_f16 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f32.c new file mode 100644 index 000000000..1fc552bc3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f32.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_f32_tied1: +** splice z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (splice_f32_tied1, svfloat32_t, + z0 = svsplice_f32 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_f32_tied2: +** 
mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (splice_f32_tied2, svfloat32_t, + z0 = svsplice_f32 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_f32_untied: +** movprfx z0, z1 +** splice z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (splice_f32_untied, svfloat32_t, + z0 = svsplice_f32 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f64.c new file mode 100644 index 000000000..26b523520 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_f64.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_f64_tied1: +** splice z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (splice_f64_tied1, svfloat64_t, + z0 = svsplice_f64 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_f64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** splice z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (splice_f64_tied2, svfloat64_t, + z0 = svsplice_f64 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_f64_untied: +** movprfx z0, z1 +** splice z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (splice_f64_untied, svfloat64_t, + z0 = svsplice_f64 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s16.c new file mode 100644 index 000000000..8796c6ecd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s16.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_s16_tied1: +** splice z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (splice_s16_tied1, svint16_t, + z0 = svsplice_s16 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_s16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (splice_s16_tied2, svint16_t, + z0 = svsplice_s16 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_s16_untied: +** movprfx z0, z1 +** splice z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (splice_s16_untied, svint16_t, + z0 = svsplice_s16 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s32.c new file mode 100644 index 000000000..5f2798e06 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s32.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_s32_tied1: +** splice z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (splice_s32_tied1, svint32_t, + z0 = svsplice_s32 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (splice_s32_tied2, svint32_t, + z0 = svsplice_s32 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_s32_untied: +** movprfx z0, z1 +** splice z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (splice_s32_untied, svint32_t, + z0 = svsplice_s32 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s64.c new file mode 
100644 index 000000000..024bfa479 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s64.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_s64_tied1: +** splice z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (splice_s64_tied1, svint64_t, + z0 = svsplice_s64 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_s64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** splice z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (splice_s64_tied2, svint64_t, + z0 = svsplice_s64 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_s64_untied: +** movprfx z0, z1 +** splice z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (splice_s64_untied, svint64_t, + z0 = svsplice_s64 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s8.c new file mode 100644 index 000000000..cd91ee245 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_s8.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_s8_tied1: +** splice z0\.b, p0, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (splice_s8_tied1, svint8_t, + z0 = svsplice_s8 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_s8_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.b, p0, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (splice_s8_tied2, svint8_t, + z0 = svsplice_s8 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_s8_untied: +** movprfx z0, z1 +** splice z0\.b, p0, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (splice_s8_untied, svint8_t, + z0 = svsplice_s8 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u16.c new file mode 100644 index 000000000..821ebaee6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u16.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_u16_tied1: +** splice z0\.h, p0, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (splice_u16_tied1, svuint16_t, + z0 = svsplice_u16 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_u16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.h, p0, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (splice_u16_tied2, svuint16_t, + z0 = svsplice_u16 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_u16_untied: +** movprfx z0, z1 +** splice z0\.h, p0, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (splice_u16_untied, svuint16_t, + z0 = svsplice_u16 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u32.c new file mode 100644 index 000000000..200364f20 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u32.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_u32_tied1: +** splice z0\.s, p0, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (splice_u32_tied1, svuint32_t, + z0 = svsplice_u32 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_u32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.s, p0, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (splice_u32_tied2, 
svuint32_t, + z0 = svsplice_u32 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_u32_untied: +** movprfx z0, z1 +** splice z0\.s, p0, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (splice_u32_untied, svuint32_t, + z0 = svsplice_u32 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u64.c new file mode 100644 index 000000000..352bcdeed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u64.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_u64_tied1: +** splice z0\.d, p0, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (splice_u64_tied1, svuint64_t, + z0 = svsplice_u64 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_u64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** splice z0\.d, p0, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (splice_u64_tied2, svuint64_t, + z0 = svsplice_u64 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_u64_untied: +** movprfx z0, z1 +** splice z0\.d, p0, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (splice_u64_untied, svuint64_t, + z0 = svsplice_u64 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u8.c new file mode 100644 index 000000000..6c24fe64d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/splice_u8.c @@ -0,0 +1,33 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** splice_u8_tied1: +** splice z0\.b, p0, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (splice_u8_tied1, svuint8_t, + z0 = svsplice_u8 (p0, z0, z1), + z0 = svsplice (p0, z0, z1)) + +/* +** splice_u8_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** splice z0\.b, p0, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (splice_u8_tied2, svuint8_t, + z0 = svsplice_u8 (p0, z1, z0), + z0 = svsplice (p0, z1, z0)) + +/* +** splice_u8_untied: +** movprfx z0, z1 +** splice z0\.b, p0, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (splice_u8_untied, svuint8_t, + z0 = svsplice_u8 (p0, z1, z2), + z0 = svsplice (p0, z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f16.c new file mode 100644 index 000000000..6dc5940fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f16.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sqrt_f16_m_tied12: +** fsqrt z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sqrt_f16_m_tied12, svfloat16_t, + z0 = svsqrt_f16_m (z0, p0, z0), + z0 = svsqrt_m (z0, p0, z0)) + +/* +** sqrt_f16_m_tied1: +** fsqrt z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sqrt_f16_m_tied1, svfloat16_t, + z0 = svsqrt_f16_m (z0, p0, z1), + z0 = svsqrt_m (z0, p0, z1)) + +/* +** sqrt_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsqrt z0\.h, p0/m, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sqrt_f16_m_tied2, svfloat16_t, + z0 = svsqrt_f16_m (z1, p0, z0), + z0 = svsqrt_m (z1, p0, z0)) + +/* +** sqrt_f16_m_untied: +** movprfx z0, z2 +** fsqrt z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sqrt_f16_m_untied, svfloat16_t, + z0 = svsqrt_f16_m (z2, p0, z1), + z0 = svsqrt_m (z2, p0, z1)) + +/* +** sqrt_f16_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.h, p0/z, \1\.h +** fsqrt z0\.h, p0/m, \1\.h +** ret 
+*/ +TEST_UNIFORM_Z (sqrt_f16_z_tied1, svfloat16_t, + z0 = svsqrt_f16_z (p0, z0), + z0 = svsqrt_z (p0, z0)) + +/* +** sqrt_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsqrt z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sqrt_f16_z_untied, svfloat16_t, + z0 = svsqrt_f16_z (p0, z1), + z0 = svsqrt_z (p0, z1)) + +/* +** sqrt_f16_x_tied1: +** fsqrt z0\.h, p0/m, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sqrt_f16_x_tied1, svfloat16_t, + z0 = svsqrt_f16_x (p0, z0), + z0 = svsqrt_x (p0, z0)) + +/* +** sqrt_f16_x_untied: +** fsqrt z0\.h, p0/m, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sqrt_f16_x_untied, svfloat16_t, + z0 = svsqrt_f16_x (p0, z1), + z0 = svsqrt_x (p0, z1)) + +/* +** ptrue_sqrt_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sqrt_f16_x_tied1, svfloat16_t, + z0 = svsqrt_f16_x (svptrue_b16 (), z0), + z0 = svsqrt_x (svptrue_b16 (), z0)) + +/* +** ptrue_sqrt_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sqrt_f16_x_untied, svfloat16_t, + z0 = svsqrt_f16_x (svptrue_b16 (), z1), + z0 = svsqrt_x (svptrue_b16 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f32.c new file mode 100644 index 000000000..71d1f8f74 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f32.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sqrt_f32_m_tied12: +** fsqrt z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (sqrt_f32_m_tied12, svfloat32_t, + z0 = svsqrt_f32_m (z0, p0, z0), + z0 = svsqrt_m (z0, p0, z0)) + +/* +** sqrt_f32_m_tied1: +** fsqrt z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sqrt_f32_m_tied1, svfloat32_t, + z0 = svsqrt_f32_m (z0, p0, z1), + z0 = svsqrt_m (z0, p0, z1)) + +/* +** sqrt_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsqrt z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sqrt_f32_m_tied2, svfloat32_t, + z0 = svsqrt_f32_m (z1, p0, z0), + z0 = svsqrt_m (z1, p0, z0)) + +/* +** sqrt_f32_m_untied: +** movprfx z0, z2 +** fsqrt z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sqrt_f32_m_untied, svfloat32_t, + z0 = svsqrt_f32_m (z2, p0, z1), + z0 = svsqrt_m (z2, p0, z1)) + +/* +** sqrt_f32_z_tied1: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0\.s, p0/z, \1\.s +** fsqrt z0\.s, p0/m, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sqrt_f32_z_tied1, svfloat32_t, + z0 = svsqrt_f32_z (p0, z0), + z0 = svsqrt_z (p0, z0)) + +/* +** sqrt_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsqrt z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sqrt_f32_z_untied, svfloat32_t, + z0 = svsqrt_f32_z (p0, z1), + z0 = svsqrt_z (p0, z1)) + +/* +** sqrt_f32_x_tied1: +** fsqrt z0\.s, p0/m, z0\.s +** ret +*/ +TEST_UNIFORM_Z (sqrt_f32_x_tied1, svfloat32_t, + z0 = svsqrt_f32_x (p0, z0), + z0 = svsqrt_x (p0, z0)) + +/* +** sqrt_f32_x_untied: +** fsqrt z0\.s, p0/m, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sqrt_f32_x_untied, svfloat32_t, + z0 = svsqrt_f32_x (p0, z1), + z0 = svsqrt_x (p0, z1)) + +/* +** ptrue_sqrt_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sqrt_f32_x_tied1, svfloat32_t, + z0 = svsqrt_f32_x (svptrue_b32 (), z0), + z0 = svsqrt_x (svptrue_b32 (), z0)) + +/* +** ptrue_sqrt_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_sqrt_f32_x_untied, svfloat32_t, + z0 = svsqrt_f32_x (svptrue_b32 (), z1), + z0 = svsqrt_x (svptrue_b32 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f64.c new file mode 100644 index 000000000..7771df545 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sqrt_f64.c @@ -0,0 +1,103 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sqrt_f64_m_tied12: +** fsqrt z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (sqrt_f64_m_tied12, svfloat64_t, + z0 = svsqrt_f64_m (z0, p0, z0), + z0 = svsqrt_m (z0, p0, z0)) + +/* +** sqrt_f64_m_tied1: +** fsqrt z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sqrt_f64_m_tied1, svfloat64_t, + z0 = svsqrt_f64_m (z0, p0, z1), + z0 = svsqrt_m (z0, p0, z1)) + +/* +** sqrt_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fsqrt z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (sqrt_f64_m_tied2, svfloat64_t, + z0 = svsqrt_f64_m (z1, p0, z0), + z0 = svsqrt_m (z1, p0, z0)) + +/* +** sqrt_f64_m_untied: +** movprfx z0, z2 +** fsqrt z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sqrt_f64_m_untied, svfloat64_t, + z0 = svsqrt_f64_m (z2, p0, z1), + z0 = svsqrt_m (z2, p0, z1)) + +/* +** sqrt_f64_z_tied1: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0\.d, p0/z, \1 +** fsqrt z0\.d, p0/m, \1 +** ret +*/ +TEST_UNIFORM_Z (sqrt_f64_z_tied1, svfloat64_t, + z0 = svsqrt_f64_z (p0, z0), + z0 = svsqrt_z (p0, z0)) + +/* +** sqrt_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsqrt z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sqrt_f64_z_untied, svfloat64_t, + z0 = svsqrt_f64_z (p0, z1), + z0 = svsqrt_z (p0, z1)) + +/* +** sqrt_f64_x_tied1: +** fsqrt z0\.d, p0/m, z0\.d +** ret +*/ +TEST_UNIFORM_Z (sqrt_f64_x_tied1, svfloat64_t, + z0 = svsqrt_f64_x (p0, z0), + z0 = svsqrt_x (p0, z0)) + +/* +** sqrt_f64_x_untied: +** fsqrt z0\.d, p0/m, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sqrt_f64_x_untied, svfloat64_t, + z0 = svsqrt_f64_x (p0, z1), + z0 = svsqrt_x (p0, z1)) + +/* +** ptrue_sqrt_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sqrt_f64_x_tied1, svfloat64_t, + z0 = svsqrt_f64_x (svptrue_b64 (), z0), + z0 = svsqrt_x (svptrue_b64 (), z0)) + +/* +** ptrue_sqrt_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sqrt_f64_x_untied, svfloat64_t, + z0 = svsqrt_f64_x (svptrue_b64 (), z1), + z0 = svsqrt_x (svptrue_b64 (), z1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_bf16.c new file mode 100644 index 000000000..ec3dbe318 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_bf16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_bf16_base: +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_bf16_base, svbfloat16_t, bfloat16_t, + svst1_bf16 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_bf16_index: +** st1h z0\.h, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st1_bf16_index, svbfloat16_t, bfloat16_t, + svst1_bf16 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_bf16_1: +** st1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_bf16_1, svbfloat16_t, bfloat16_t, + svst1_bf16 (p0, x0 + svcnth (), z0), + svst1 (p0, x0 + svcnth (), z0)) + +/* +** st1_bf16_7: +** st1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_bf16_7, svbfloat16_t, bfloat16_t, + svst1_bf16 (p0, x0 + svcnth () * 7, z0), + svst1 (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_bf16_8: +** incb x0, all, mul #8 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_bf16_8, svbfloat16_t, bfloat16_t, + svst1_bf16 (p0, x0 + svcnth () * 8, z0), + svst1 (p0, x0 + svcnth () * 8, z0)) + +/* +** st1_bf16_m1: +** st1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_bf16_m1, svbfloat16_t, bfloat16_t, + svst1_bf16 (p0, x0 - svcnth (), z0), + svst1 (p0, x0 - svcnth (), z0)) + +/* +** st1_bf16_m8: +** st1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_bf16_m8, svbfloat16_t, bfloat16_t, + svst1_bf16 (p0, x0 - svcnth () * 8, z0), + svst1 (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_bf16_m9: +** decb x0, all, mul #9 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_bf16_m9, svbfloat16_t, bfloat16_t, + svst1_bf16 (p0, x0 - svcnth () * 9, z0), + svst1 (p0, x0 - svcnth () * 9, z0)) + +/* +** st1_vnum_bf16_0: +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_bf16_0, svbfloat16_t, bfloat16_t, + svst1_vnum_bf16 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_bf16_1: +** st1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_bf16_1, svbfloat16_t, bfloat16_t, + svst1_vnum_bf16 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_bf16_7: +** st1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_bf16_7, svbfloat16_t, bfloat16_t, + svst1_vnum_bf16 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_bf16_8: +** incb x0, all, mul #8 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_bf16_8, svbfloat16_t, bfloat16_t, + svst1_vnum_bf16 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_bf16_m1: +** st1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, + svst1_vnum_bf16 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_bf16_m8: +** st1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, + svst1_vnum_bf16 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_bf16_m9: +** decb x0, all, mul #9 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, + svst1_vnum_bf16 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st1_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1h z0\.h, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, + svst1_vnum_bf16 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f16.c new file mode 100644 index 000000000..2406cfd97 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_f16_base: +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f16_base, svfloat16_t, float16_t, + svst1_f16 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_f16_index: +** st1h z0\.h, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st1_f16_index, svfloat16_t, float16_t, + svst1_f16 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_f16_1: +** st1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_f16_1, svfloat16_t, float16_t, + svst1_f16 (p0, x0 + svcnth (), z0), + svst1 (p0, x0 + svcnth (), z0)) + +/* +** st1_f16_7: +** st1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_f16_7, svfloat16_t, float16_t, + svst1_f16 (p0, x0 + svcnth () * 7, z0), + svst1 (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_f16_8: +** incb x0, all, mul #8 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f16_8, svfloat16_t, float16_t, + svst1_f16 (p0, x0 + svcnth () * 8, z0), + svst1 (p0, x0 + svcnth () * 8, z0)) + +/* +** st1_f16_m1: +** st1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_f16_m1, svfloat16_t, float16_t, + svst1_f16 (p0, x0 - svcnth (), z0), + svst1 (p0, x0 - svcnth (), z0)) + +/* +** st1_f16_m8: +** st1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_f16_m8, svfloat16_t, float16_t, + svst1_f16 (p0, x0 - svcnth () * 8, z0), + svst1 (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_f16_m9: +** decb x0, all, mul #9 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f16_m9, svfloat16_t, float16_t, + svst1_f16 (p0, x0 - svcnth () * 9, z0), + svst1 (p0, x0 - svcnth () * 9, z0)) + +/* +** st1_vnum_f16_0: +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f16_0, svfloat16_t, float16_t, + svst1_vnum_f16 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_f16_1: +** st1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f16_1, svfloat16_t, float16_t, + svst1_vnum_f16 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_f16_7: +** st1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f16_7, svfloat16_t, float16_t, + svst1_vnum_f16 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_vnum_f16_8: +** incb x0, all, mul #8 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f16_8, svfloat16_t, float16_t, + svst1_vnum_f16 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_f16_m1: +** st1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f16_m1, svfloat16_t, float16_t, + svst1_vnum_f16 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_f16_m8: +** st1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f16_m8, svfloat16_t, float16_t, + svst1_vnum_f16 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_f16_m9: +** decb x0, all, mul #9 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f16_m9, svfloat16_t, float16_t, + svst1_vnum_f16 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1h z0\.h, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_f16_x1, svfloat16_t, float16_t, + svst1_vnum_f16 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f32.c new file mode 100644 index 000000000..5fad7f06f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_f32_base: +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f32_base, svfloat32_t, float32_t, + svst1_f32 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_f32_index: +** st1w z0\.s, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st1_f32_index, svfloat32_t, float32_t, + svst1_f32 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_f32_1: +** st1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_f32_1, svfloat32_t, float32_t, + svst1_f32 (p0, x0 + svcntw (), z0), + svst1 (p0, x0 + svcntw (), z0)) + +/* +** st1_f32_7: +** st1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_f32_7, svfloat32_t, float32_t, + svst1_f32 (p0, x0 + svcntw () * 7, z0), + svst1 (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_f32_8: +** incb x0, all, mul #8 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f32_8, svfloat32_t, float32_t, + svst1_f32 (p0, x0 + svcntw () * 8, z0), + svst1 (p0, x0 + svcntw () * 8, z0)) + +/* +** st1_f32_m1: +** st1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_f32_m1, svfloat32_t, float32_t, + svst1_f32 (p0, x0 - svcntw (), z0), + svst1 (p0, x0 - svcntw (), z0)) + +/* +** st1_f32_m8: +** st1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_f32_m8, svfloat32_t, float32_t, + svst1_f32 (p0, x0 - svcntw () * 8, z0), + svst1 (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_f32_m9: +** decb x0, all, mul #9 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f32_m9, svfloat32_t, float32_t, + svst1_f32 (p0, x0 - svcntw () * 9, z0), + svst1 (p0, x0 - svcntw () * 9, z0)) + +/* +** st1_vnum_f32_0: +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f32_0, svfloat32_t, float32_t, + svst1_vnum_f32 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_f32_1: +** st1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f32_1, svfloat32_t, float32_t, + svst1_vnum_f32 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_f32_7: +** st1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f32_7, svfloat32_t, float32_t, + svst1_vnum_f32 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_f32_8: +** incb x0, all, mul #8 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f32_8, svfloat32_t, float32_t, + svst1_vnum_f32 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_f32_m1: +** st1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f32_m1, svfloat32_t, float32_t, + svst1_vnum_f32 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_f32_m8: +** st1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f32_m8, svfloat32_t, float32_t, + svst1_vnum_f32 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_f32_m9: +** decb x0, all, mul #9 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f32_m9, svfloat32_t, float32_t, + svst1_vnum_f32 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1w z0\.s, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_f32_x1, svfloat32_t, float32_t, + svst1_vnum_f32 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f64.c new file mode 100644 index 000000000..486f92beb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_f64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_f64_base: +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f64_base, svfloat64_t, float64_t, + svst1_f64 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_f64_index: +** st1d z0\.d, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st1_f64_index, svfloat64_t, float64_t, + svst1_f64 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_f64_1: +** st1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_f64_1, svfloat64_t, float64_t, + svst1_f64 (p0, x0 + svcntd (), z0), + svst1 (p0, x0 + svcntd (), z0)) + +/* +** st1_f64_7: +** st1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_f64_7, svfloat64_t, float64_t, + svst1_f64 (p0, x0 + svcntd () * 7, z0), + svst1 (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_f64_8: +** incb x0, all, mul #8 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f64_8, svfloat64_t, float64_t, + svst1_f64 (p0, x0 + svcntd () * 8, z0), + svst1 (p0, x0 + svcntd () * 8, z0)) + +/* +** st1_f64_m1: +** st1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_f64_m1, svfloat64_t, float64_t, + svst1_f64 (p0, x0 - svcntd (), z0), + svst1 (p0, x0 - svcntd (), z0)) + +/* +** st1_f64_m8: +** st1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_f64_m8, svfloat64_t, float64_t, + svst1_f64 (p0, x0 - svcntd () * 8, z0), + svst1 (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_f64_m9: +** decb x0, all, mul #9 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_f64_m9, svfloat64_t, float64_t, + svst1_f64 (p0, x0 - svcntd () * 9, z0), + svst1 (p0, x0 - svcntd () * 9, z0)) + +/* +** st1_vnum_f64_0: +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f64_0, svfloat64_t, float64_t, + svst1_vnum_f64 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_f64_1: +** st1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f64_1, svfloat64_t, float64_t, + svst1_vnum_f64 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_f64_7: +** st1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f64_7, svfloat64_t, float64_t, + svst1_vnum_f64 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_f64_8: +** incb x0, all, mul #8 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f64_8, svfloat64_t, float64_t, + svst1_vnum_f64 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_f64_m1: +** st1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f64_m1, svfloat64_t, float64_t, + svst1_vnum_f64 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_f64_m8: +** st1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_f64_m8, svfloat64_t, float64_t, + svst1_vnum_f64 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_f64_m9: +** decb x0, all, mul #9 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_f64_m9, svfloat64_t, float64_t, + svst1_vnum_f64 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1d z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_f64_x1, svfloat64_t, float64_t, + svst1_vnum_f64 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s16.c new file mode 100644 index 000000000..7d4ac25d2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_s16_base: +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s16_base, svint16_t, int16_t, + svst1_s16 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_s16_index: +** st1h z0\.h, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st1_s16_index, svint16_t, int16_t, + svst1_s16 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_s16_1: +** st1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_s16_1, svint16_t, int16_t, + svst1_s16 (p0, x0 + svcnth (), z0), + svst1 (p0, x0 + svcnth (), z0)) + +/* +** st1_s16_7: +** st1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_s16_7, svint16_t, int16_t, + svst1_s16 (p0, x0 + svcnth () * 7, z0), + svst1 (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_s16_8: +** incb x0, all, mul #8 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s16_8, svint16_t, int16_t, + svst1_s16 (p0, x0 + svcnth () * 8, z0), + svst1 (p0, x0 + svcnth () * 8, z0)) + +/* +** st1_s16_m1: +** st1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_s16_m1, svint16_t, int16_t, + svst1_s16 (p0, x0 - svcnth (), z0), + svst1 (p0, x0 - svcnth (), z0)) + +/* +** st1_s16_m8: +** st1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_s16_m8, svint16_t, int16_t, + svst1_s16 (p0, x0 - svcnth () * 8, z0), + svst1 (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_s16_m9: +** decb x0, all, mul #9 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s16_m9, svint16_t, int16_t, + svst1_s16 (p0, x0 - svcnth () * 9, z0), + svst1 (p0, x0 - svcnth () * 9, z0)) + +/* +** st1_vnum_s16_0: +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s16_0, svint16_t, int16_t, + svst1_vnum_s16 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_s16_1: +** st1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s16_1, svint16_t, int16_t, + svst1_vnum_s16 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_s16_7: +** st1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s16_7, svint16_t, int16_t, + svst1_vnum_s16 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_s16_8: +** incb x0, all, mul #8 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s16_8, svint16_t, int16_t, + svst1_vnum_s16 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_s16_m1: +** st1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s16_m1, svint16_t, int16_t, + svst1_vnum_s16 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_s16_m8: +** st1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s16_m8, svint16_t, int16_t, + svst1_vnum_s16 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_s16_m9: +** decb x0, all, mul #9 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s16_m9, svint16_t, int16_t, + svst1_vnum_s16 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st1_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1h z0\.h, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_s16_x1, svint16_t, int16_t, + svst1_vnum_s16 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s32.c new file mode 100644 index 000000000..e2bcc3403 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_s32_base: +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s32_base, svint32_t, int32_t, + svst1_s32 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_s32_index: +** st1w z0\.s, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st1_s32_index, svint32_t, int32_t, + svst1_s32 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_s32_1: +** st1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_s32_1, svint32_t, int32_t, + svst1_s32 (p0, x0 + svcntw (), z0), + svst1 (p0, x0 + svcntw (), z0)) + +/* +** st1_s32_7: +** st1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_s32_7, svint32_t, int32_t, + svst1_s32 (p0, x0 + svcntw () * 7, z0), + svst1 (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_s32_8: +** incb x0, all, mul #8 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s32_8, svint32_t, int32_t, + svst1_s32 (p0, x0 + svcntw () * 8, z0), + svst1 (p0, x0 + svcntw () * 8, z0)) + +/* +** st1_s32_m1: +** st1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_s32_m1, svint32_t, int32_t, + svst1_s32 (p0, x0 - svcntw (), z0), + svst1 (p0, x0 - svcntw (), z0)) + +/* +** st1_s32_m8: +** st1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_s32_m8, svint32_t, int32_t, + svst1_s32 (p0, x0 - svcntw () * 8, z0), + svst1 (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_s32_m9: +** decb x0, all, mul #9 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s32_m9, svint32_t, int32_t, + svst1_s32 (p0, x0 - svcntw () * 9, z0), + svst1 (p0, x0 - svcntw () * 9, z0)) + +/* +** st1_vnum_s32_0: +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s32_0, svint32_t, int32_t, + svst1_vnum_s32 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_s32_1: +** st1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s32_1, svint32_t, int32_t, + svst1_vnum_s32 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_s32_7: +** st1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s32_7, svint32_t, int32_t, + svst1_vnum_s32 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_vnum_s32_8: +** incb x0, all, mul #8 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s32_8, svint32_t, int32_t, + svst1_vnum_s32 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_s32_m1: +** st1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s32_m1, svint32_t, int32_t, + svst1_vnum_s32 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_s32_m8: +** st1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s32_m8, svint32_t, int32_t, + svst1_vnum_s32 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_s32_m9: +** decb x0, all, mul #9 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s32_m9, svint32_t, int32_t, + svst1_vnum_s32 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1w z0\.s, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_s32_x1, svint32_t, int32_t, + svst1_vnum_s32 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s64.c new file mode 100644 index 000000000..8e0b69f73 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_s64_base: +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s64_base, svint64_t, int64_t, + svst1_s64 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_s64_index: +** st1d z0\.d, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st1_s64_index, svint64_t, int64_t, + svst1_s64 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_s64_1: +** st1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_s64_1, svint64_t, int64_t, + svst1_s64 (p0, x0 + svcntd (), z0), + svst1 (p0, x0 + svcntd (), z0)) + +/* +** st1_s64_7: +** st1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_s64_7, svint64_t, int64_t, + svst1_s64 (p0, x0 + svcntd () * 7, z0), + svst1 (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_s64_8: +** incb x0, all, mul #8 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s64_8, svint64_t, int64_t, + svst1_s64 (p0, x0 + svcntd () * 8, z0), + svst1 (p0, x0 + svcntd () * 8, z0)) + +/* +** st1_s64_m1: +** st1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_s64_m1, svint64_t, int64_t, + svst1_s64 (p0, x0 - svcntd (), z0), + svst1 (p0, x0 - svcntd (), z0)) + +/* +** st1_s64_m8: +** st1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_s64_m8, svint64_t, int64_t, + svst1_s64 (p0, x0 - svcntd () * 8, z0), + svst1 (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_s64_m9: +** decb x0, all, mul #9 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s64_m9, svint64_t, int64_t, + svst1_s64 (p0, x0 - svcntd () * 9, z0), + svst1 (p0, x0 - svcntd () * 9, z0)) + +/* +** st1_vnum_s64_0: +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s64_0, svint64_t, int64_t, + svst1_vnum_s64 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_s64_1: +** st1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s64_1, svint64_t, int64_t, + svst1_vnum_s64 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_s64_7: +** st1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s64_7, svint64_t, int64_t, + svst1_vnum_s64 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_s64_8: +** incb x0, all, mul #8 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s64_8, svint64_t, int64_t, + svst1_vnum_s64 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_s64_m1: +** st1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s64_m1, svint64_t, int64_t, + svst1_vnum_s64 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_s64_m8: +** st1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s64_m8, svint64_t, int64_t, + svst1_vnum_s64 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_s64_m9: +** decb x0, all, mul #9 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s64_m9, svint64_t, int64_t, + svst1_vnum_s64 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1d z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_s64_x1, svint64_t, int64_t, + svst1_vnum_s64 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s8.c new file mode 100644 index 000000000..4155683ab --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_s8.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_s8_base: +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s8_base, svint8_t, int8_t, + svst1_s8 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_s8_index: +** st1b z0\.b, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st1_s8_index, svint8_t, int8_t, + svst1_s8 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_s8_1: +** st1b z0\.b, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_s8_1, svint8_t, int8_t, + svst1_s8 (p0, x0 + svcntb (), z0), + svst1 (p0, x0 + svcntb (), z0)) + +/* +** st1_s8_7: +** st1b z0\.b, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_s8_7, svint8_t, int8_t, + svst1_s8 (p0, x0 + svcntb () * 7, z0), + svst1 (p0, x0 + svcntb () * 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_s8_8: +** incb x0, all, mul #8 +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s8_8, svint8_t, int8_t, + svst1_s8 (p0, x0 + svcntb () * 8, z0), + svst1 (p0, x0 + svcntb () * 8, z0)) + +/* +** st1_s8_m1: +** st1b z0\.b, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_s8_m1, svint8_t, int8_t, + svst1_s8 (p0, x0 - svcntb (), z0), + svst1 (p0, x0 - svcntb (), z0)) + +/* +** st1_s8_m8: +** st1b z0\.b, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_s8_m8, svint8_t, int8_t, + svst1_s8 (p0, x0 - svcntb () * 8, z0), + svst1 (p0, x0 - svcntb () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_s8_m9: +** decb x0, all, mul #9 +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_s8_m9, svint8_t, int8_t, + svst1_s8 (p0, x0 - svcntb () * 9, z0), + svst1 (p0, x0 - svcntb () * 9, z0)) + +/* +** st1_vnum_s8_0: +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s8_0, svint8_t, int8_t, + svst1_vnum_s8 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_s8_1: +** st1b z0\.b, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s8_1, svint8_t, int8_t, + svst1_vnum_s8 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_s8_7: +** st1b z0\.b, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s8_7, svint8_t, int8_t, + svst1_vnum_s8 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_s8_8: +** incb x0, all, mul #8 +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s8_8, svint8_t, int8_t, + svst1_vnum_s8 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_s8_m1: +** st1b z0\.b, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s8_m1, svint8_t, int8_t, + svst1_vnum_s8 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_s8_m8: +** st1b z0\.b, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_s8_m8, svint8_t, int8_t, + svst1_vnum_s8 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_s8_m9: +** decb x0, all, mul #9 +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_s8_m9, svint8_t, int8_t, + svst1_vnum_s8 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* +** st1_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st1b z0\.b, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st1b z0\.b, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st1_vnum_s8_x1, svint8_t, int8_t, + svst1_vnum_s8 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f32.c new file mode 100644 index 000000000..cb6774ad0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f32.c @@ -0,0 +1,227 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_scatter_f32: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_f32, svfloat32_t, svuint32_t, + svst1_scatter_u32base_f32 (p0, z1, z0), + svst1_scatter (p0, z1, z0)) + +/* +** st1_scatter_x0_f32_offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, x0, z0), + svst1_scatter_offset (p0, z1, x0, z0)) + +/* +** st1_scatter_m4_f32_offset: +** mov (x[0-9]+), #?-4 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m4_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, -4, z0), + svst1_scatter_offset (p0, z1, -4, z0)) + +/* +** st1_scatter_0_f32_offset: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, 0, z0), + svst1_scatter_offset (p0, z1, 0, z0)) + +/* +** st1_scatter_5_f32_offset: +** mov (x[0-9]+), #?5 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_5_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, 5, z0), + svst1_scatter_offset (p0, z1, 5, z0)) + +/* +** st1_scatter_6_f32_offset: +** mov (x[0-9]+), #?6 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_6_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, 6, z0), + svst1_scatter_offset (p0, z1, 6, z0)) + +/* +** st1_scatter_7_f32_offset: +** mov (x[0-9]+), #?7 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_7_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, 7, z0), + svst1_scatter_offset (p0, z1, 7, z0)) + +/* +** st1_scatter_8_f32_offset: +** st1w z0\.s, p0, \[z1\.s, #8\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_8_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, 8, z0), + svst1_scatter_offset (p0, z1, 8, z0)) + +/* +** st1_scatter_124_f32_offset: +** st1w z0\.s, p0, \[z1\.s, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_124_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, 124, z0), + svst1_scatter_offset (p0, z1, 124, z0)) + +/* +** st1_scatter_128_f32_offset: +** mov (x[0-9]+), #?128 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_128_f32_offset, svfloat32_t, svuint32_t, + svst1_scatter_u32base_offset_f32 (p0, z1, 128, z0), + svst1_scatter_offset (p0, z1, 128, z0)) + +/* +** st1_scatter_x0_f32_index: +** lsl (x[0-9]+), x0, #?2 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_f32_index, svfloat32_t, svuint32_t, + svst1_scatter_u32base_index_f32 (p0, z1, x0, z0), + svst1_scatter_index (p0, z1, x0, z0)) + +/* +** st1_scatter_m1_f32_index: +** mov (x[0-9]+), #?-4 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m1_f32_index, svfloat32_t, svuint32_t, + svst1_scatter_u32base_index_f32 (p0, z1, -1, z0), + svst1_scatter_index (p0, z1, -1, z0)) + +/* +** st1_scatter_0_f32_index: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_f32_index, svfloat32_t, svuint32_t, + svst1_scatter_u32base_index_f32 (p0, z1, 0, z0), + svst1_scatter_index (p0, z1, 0, z0)) + +/* +** st1_scatter_5_f32_index: +** st1w z0\.s, p0, \[z1\.s, #20\] +** ret +*/ 
+TEST_STORE_SCATTER_ZS (st1_scatter_5_f32_index, svfloat32_t, svuint32_t, + svst1_scatter_u32base_index_f32 (p0, z1, 5, z0), + svst1_scatter_index (p0, z1, 5, z0)) + +/* +** st1_scatter_31_f32_index: +** st1w z0\.s, p0, \[z1\.s, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_31_f32_index, svfloat32_t, svuint32_t, + svst1_scatter_u32base_index_f32 (p0, z1, 31, z0), + svst1_scatter_index (p0, z1, 31, z0)) + +/* +** st1_scatter_32_f32_index: +** mov (x[0-9]+), #?128 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_32_f32_index, svfloat32_t, svuint32_t, + svst1_scatter_u32base_index_f32 (p0, z1, 32, z0), + svst1_scatter_index (p0, z1, 32, z0)) + +/* +** st1_scatter_x0_f32_s32offset: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_f32_s32offset, svfloat32_t, float32_t, svint32_t, + svst1_scatter_s32offset_f32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_f32_s32offset: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_f32_s32offset, svfloat32_t, float32_t, svint32_t, + svst1_scatter_s32offset_f32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_f32_u32offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_f32_u32offset, svfloat32_t, float32_t, svuint32_t, + svst1_scatter_u32offset_f32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_f32_u32offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_f32_u32offset, svfloat32_t, float32_t, svuint32_t, + svst1_scatter_u32offset_f32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_f32_s32index: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_f32_s32index, svfloat32_t, float32_t, svint32_t, + svst1_scatter_s32index_f32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_f32_s32index: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_f32_s32index, svfloat32_t, float32_t, svint32_t, + svst1_scatter_s32index_f32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_f32_u32index: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_f32_u32index, svfloat32_t, float32_t, svuint32_t, + svst1_scatter_u32index_f32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_f32_u32index: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_f32_u32index, svfloat32_t, float32_t, svuint32_t, + svst1_scatter_u32index_f32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f64.c new file mode 100644 index 000000000..fe978bbe5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_f64.c @@ -0,0 +1,303 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_scatter_f64: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_f64, svfloat64_t, svuint64_t, + svst1_scatter_u64base_f64 (p0, z1, z0), + svst1_scatter (p0, z1, z0)) + +/* +** st1_scatter_x0_f64_offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, x0, z0), + svst1_scatter_offset (p0, z1, x0, z0)) + +/* +** st1_scatter_m8_f64_offset: +** mov (x[0-9]+), #?-8 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m8_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, -8, z0), + svst1_scatter_offset (p0, z1, -8, z0)) + +/* +** st1_scatter_0_f64_offset: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 0, z0), + svst1_scatter_offset (p0, z1, 0, z0)) + +/* +** st1_scatter_9_f64_offset: +** mov (x[0-9]+), #?9 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_9_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 9, z0), + svst1_scatter_offset (p0, z1, 9, z0)) + +/* +** st1_scatter_10_f64_offset: +** mov (x[0-9]+), #?10 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_10_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 10, z0), + svst1_scatter_offset (p0, z1, 10, z0)) + +/* +** st1_scatter_11_f64_offset: +** mov (x[0-9]+), #?11 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_11_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 11, z0), + svst1_scatter_offset (p0, z1, 11, z0)) + +/* +** st1_scatter_12_f64_offset: +** mov (x[0-9]+), #?12 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_12_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 12, z0), + svst1_scatter_offset (p0, z1, 12, z0)) + +/* +** st1_scatter_13_f64_offset: +** mov (x[0-9]+), #?13 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_13_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 13, z0), + svst1_scatter_offset (p0, z1, 13, z0)) + +/* +** st1_scatter_14_f64_offset: +** mov (x[0-9]+), #?14 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_14_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 14, z0), + svst1_scatter_offset (p0, z1, 14, z0)) + +/* +** st1_scatter_15_f64_offset: +** mov (x[0-9]+), #?15 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_15_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 15, z0), + svst1_scatter_offset (p0, z1, 15, z0)) + +/* +** st1_scatter_16_f64_offset: +** st1d z0\.d, p0, \[z1\.d, #16\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_16_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 16, z0), + svst1_scatter_offset (p0, z1, 16, z0)) + +/* +** st1_scatter_248_f64_offset: +** st1d z0\.d, p0, \[z1\.d, #248\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_248_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 248, z0), + svst1_scatter_offset (p0, z1, 248, z0)) + +/* +** st1_scatter_256_f64_offset: +** mov (x[0-9]+), #?256 +** st1d z0\.d, 
p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_256_f64_offset, svfloat64_t, svuint64_t, + svst1_scatter_u64base_offset_f64 (p0, z1, 256, z0), + svst1_scatter_offset (p0, z1, 256, z0)) + +/* +** st1_scatter_x0_f64_index: +** lsl (x[0-9]+), x0, #?3 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_f64_index, svfloat64_t, svuint64_t, + svst1_scatter_u64base_index_f64 (p0, z1, x0, z0), + svst1_scatter_index (p0, z1, x0, z0)) + +/* +** st1_scatter_m1_f64_index: +** mov (x[0-9]+), #?-8 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m1_f64_index, svfloat64_t, svuint64_t, + svst1_scatter_u64base_index_f64 (p0, z1, -1, z0), + svst1_scatter_index (p0, z1, -1, z0)) + +/* +** st1_scatter_0_f64_index: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_f64_index, svfloat64_t, svuint64_t, + svst1_scatter_u64base_index_f64 (p0, z1, 0, z0), + svst1_scatter_index (p0, z1, 0, z0)) + +/* +** st1_scatter_5_f64_index: +** st1d z0\.d, p0, \[z1\.d, #40\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_5_f64_index, svfloat64_t, svuint64_t, + svst1_scatter_u64base_index_f64 (p0, z1, 5, z0), + svst1_scatter_index (p0, z1, 5, z0)) + +/* +** st1_scatter_31_f64_index: +** st1d z0\.d, p0, \[z1\.d, #248\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_31_f64_index, svfloat64_t, svuint64_t, + svst1_scatter_u64base_index_f64 (p0, z1, 31, z0), + svst1_scatter_index (p0, z1, 31, z0)) + +/* +** st1_scatter_32_f64_index: +** mov (x[0-9]+), #?256 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_32_f64_index, svfloat64_t, svuint64_t, + svst1_scatter_u64base_index_f64 (p0, z1, 32, z0), + svst1_scatter_index (p0, z1, 32, z0)) + +/* +** st1_scatter_x0_f64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_f64_s64offset, svfloat64_t, float64_t, svint64_t, + svst1_scatter_s64offset_f64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_f64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_f64_s64offset, svfloat64_t, float64_t, svint64_t, + svst1_scatter_s64offset_f64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_f64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_f64_s64offset, svfloat64_t, float64_t, svint64_t, + svst1_scatter_s64offset_f64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_f64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + svst1_scatter_u64offset_f64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_f64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + svst1_scatter_u64offset_f64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_f64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_f64_u64offset, svfloat64_t, float64_t, svuint64_t, + svst1_scatter_u64offset_f64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_f64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ 
(st1_scatter_x0_f64_s64index, svfloat64_t, float64_t, svint64_t, + svst1_scatter_s64index_f64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_f64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_f64_s64index, svfloat64_t, float64_t, svint64_t, + svst1_scatter_s64index_f64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_f64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_f64_s64index, svfloat64_t, float64_t, svint64_t, + svst1_scatter_s64index_f64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_f64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_f64_u64index, svfloat64_t, float64_t, svuint64_t, + svst1_scatter_u64index_f64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_f64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_f64_u64index, svfloat64_t, float64_t, svuint64_t, + svst1_scatter_u64index_f64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_f64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_f64_u64index, svfloat64_t, float64_t, svuint64_t, + svst1_scatter_u64index_f64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s32.c new file mode 100644 index 000000000..d244e701a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s32.c @@ -0,0 +1,227 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_scatter_s32: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_s32, svint32_t, svuint32_t, + svst1_scatter_u32base_s32 (p0, z1, z0), + svst1_scatter (p0, z1, z0)) + +/* +** st1_scatter_x0_s32_offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, x0, z0), + svst1_scatter_offset (p0, z1, x0, z0)) + +/* +** st1_scatter_m4_s32_offset: +** mov (x[0-9]+), #?-4 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m4_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, -4, z0), + svst1_scatter_offset (p0, z1, -4, z0)) + +/* +** st1_scatter_0_s32_offset: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, 0, z0), + svst1_scatter_offset (p0, z1, 0, z0)) + +/* +** st1_scatter_5_s32_offset: +** mov (x[0-9]+), #?5 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_5_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, 5, z0), + svst1_scatter_offset (p0, z1, 5, z0)) + +/* +** st1_scatter_6_s32_offset: +** mov (x[0-9]+), #?6 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_6_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, 6, z0), + svst1_scatter_offset (p0, z1, 6, z0)) + +/* +** st1_scatter_7_s32_offset: +** mov (x[0-9]+), #?7 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_7_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, 7, z0), + svst1_scatter_offset (p0, z1, 7, z0)) + +/* +** st1_scatter_8_s32_offset: +** st1w z0\.s, p0, \[z1\.s, #8\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_8_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, 8, z0), + svst1_scatter_offset (p0, z1, 8, z0)) + +/* +** st1_scatter_124_s32_offset: +** st1w z0\.s, p0, \[z1\.s, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_124_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, 124, z0), + svst1_scatter_offset (p0, z1, 124, z0)) + +/* +** st1_scatter_128_s32_offset: +** mov (x[0-9]+), #?128 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_128_s32_offset, svint32_t, svuint32_t, + svst1_scatter_u32base_offset_s32 (p0, z1, 128, z0), + svst1_scatter_offset (p0, z1, 128, z0)) + +/* +** st1_scatter_x0_s32_index: +** lsl (x[0-9]+), x0, #?2 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_s32_index, svint32_t, svuint32_t, + svst1_scatter_u32base_index_s32 (p0, z1, x0, z0), + svst1_scatter_index (p0, z1, x0, z0)) + +/* +** st1_scatter_m1_s32_index: +** mov (x[0-9]+), #?-4 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m1_s32_index, svint32_t, svuint32_t, + svst1_scatter_u32base_index_s32 (p0, z1, -1, z0), + svst1_scatter_index (p0, z1, -1, z0)) + +/* +** st1_scatter_0_s32_index: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_s32_index, svint32_t, svuint32_t, + svst1_scatter_u32base_index_s32 (p0, z1, 0, z0), + svst1_scatter_index (p0, z1, 0, z0)) + +/* +** st1_scatter_5_s32_index: +** st1w z0\.s, p0, \[z1\.s, #20\] +** ret +*/ +TEST_STORE_SCATTER_ZS 
(st1_scatter_5_s32_index, svint32_t, svuint32_t, + svst1_scatter_u32base_index_s32 (p0, z1, 5, z0), + svst1_scatter_index (p0, z1, 5, z0)) + +/* +** st1_scatter_31_s32_index: +** st1w z0\.s, p0, \[z1\.s, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_31_s32_index, svint32_t, svuint32_t, + svst1_scatter_u32base_index_s32 (p0, z1, 31, z0), + svst1_scatter_index (p0, z1, 31, z0)) + +/* +** st1_scatter_32_s32_index: +** mov (x[0-9]+), #?128 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_32_s32_index, svint32_t, svuint32_t, + svst1_scatter_u32base_index_s32 (p0, z1, 32, z0), + svst1_scatter_index (p0, z1, 32, z0)) + +/* +** st1_scatter_x0_s32_s32offset: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_s32_s32offset, svint32_t, int32_t, svint32_t, + svst1_scatter_s32offset_s32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_s32_s32offset: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_s32_s32offset, svint32_t, int32_t, svint32_t, + svst1_scatter_s32offset_s32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_s32_u32offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_s32_u32offset, svint32_t, int32_t, svuint32_t, + svst1_scatter_u32offset_s32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_s32_u32offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_s32_u32offset, svint32_t, int32_t, svuint32_t, + svst1_scatter_u32offset_s32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_s32_s32index: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_s32_s32index, svint32_t, int32_t, svint32_t, + svst1_scatter_s32index_s32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_s32_s32index: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_s32_s32index, svint32_t, int32_t, svint32_t, + svst1_scatter_s32index_s32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_s32_u32index: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_s32_u32index, svint32_t, int32_t, svuint32_t, + svst1_scatter_u32index_s32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_s32_u32index: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_s32_u32index, svint32_t, int32_t, svuint32_t, + svst1_scatter_u32index_s32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s64.c new file mode 100644 index 000000000..5c4ebf440 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_s64.c @@ -0,0 +1,303 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_scatter_s64: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_s64, svint64_t, svuint64_t, + svst1_scatter_u64base_s64 (p0, z1, z0), + svst1_scatter (p0, z1, z0)) + +/* +** st1_scatter_x0_s64_offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, x0, z0), + svst1_scatter_offset (p0, z1, x0, z0)) + +/* +** st1_scatter_m8_s64_offset: +** mov (x[0-9]+), #?-8 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m8_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, -8, z0), + svst1_scatter_offset (p0, z1, -8, z0)) + +/* +** st1_scatter_0_s64_offset: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 0, z0), + svst1_scatter_offset (p0, z1, 0, z0)) + +/* +** st1_scatter_9_s64_offset: +** mov (x[0-9]+), #?9 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_9_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 9, z0), + svst1_scatter_offset (p0, z1, 9, z0)) + +/* +** st1_scatter_10_s64_offset: +** mov (x[0-9]+), #?10 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_10_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 10, z0), + svst1_scatter_offset (p0, z1, 10, z0)) + +/* +** st1_scatter_11_s64_offset: +** mov (x[0-9]+), #?11 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_11_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 11, z0), + svst1_scatter_offset (p0, z1, 11, z0)) + +/* +** st1_scatter_12_s64_offset: +** mov (x[0-9]+), #?12 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_12_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 12, z0), + svst1_scatter_offset (p0, z1, 12, z0)) + +/* +** st1_scatter_13_s64_offset: +** mov (x[0-9]+), #?13 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_13_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 13, z0), + svst1_scatter_offset (p0, z1, 13, z0)) + +/* +** st1_scatter_14_s64_offset: +** mov (x[0-9]+), #?14 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_14_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 14, z0), + svst1_scatter_offset (p0, z1, 14, z0)) + +/* +** st1_scatter_15_s64_offset: +** mov (x[0-9]+), #?15 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_15_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 15, z0), + svst1_scatter_offset (p0, z1, 15, z0)) + +/* +** st1_scatter_16_s64_offset: +** st1d z0\.d, p0, \[z1\.d, #16\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_16_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 16, z0), + svst1_scatter_offset (p0, z1, 16, z0)) + +/* +** st1_scatter_248_s64_offset: +** st1d z0\.d, p0, \[z1\.d, #248\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_248_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 248, z0), + svst1_scatter_offset (p0, z1, 248, z0)) + +/* +** st1_scatter_256_s64_offset: +** mov (x[0-9]+), #?256 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret 
+*/ +TEST_STORE_SCATTER_ZS (st1_scatter_256_s64_offset, svint64_t, svuint64_t, + svst1_scatter_u64base_offset_s64 (p0, z1, 256, z0), + svst1_scatter_offset (p0, z1, 256, z0)) + +/* +** st1_scatter_x0_s64_index: +** lsl (x[0-9]+), x0, #?3 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_s64_index, svint64_t, svuint64_t, + svst1_scatter_u64base_index_s64 (p0, z1, x0, z0), + svst1_scatter_index (p0, z1, x0, z0)) + +/* +** st1_scatter_m1_s64_index: +** mov (x[0-9]+), #?-8 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m1_s64_index, svint64_t, svuint64_t, + svst1_scatter_u64base_index_s64 (p0, z1, -1, z0), + svst1_scatter_index (p0, z1, -1, z0)) + +/* +** st1_scatter_0_s64_index: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_s64_index, svint64_t, svuint64_t, + svst1_scatter_u64base_index_s64 (p0, z1, 0, z0), + svst1_scatter_index (p0, z1, 0, z0)) + +/* +** st1_scatter_5_s64_index: +** st1d z0\.d, p0, \[z1\.d, #40\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_5_s64_index, svint64_t, svuint64_t, + svst1_scatter_u64base_index_s64 (p0, z1, 5, z0), + svst1_scatter_index (p0, z1, 5, z0)) + +/* +** st1_scatter_31_s64_index: +** st1d z0\.d, p0, \[z1\.d, #248\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_31_s64_index, svint64_t, svuint64_t, + svst1_scatter_u64base_index_s64 (p0, z1, 31, z0), + svst1_scatter_index (p0, z1, 31, z0)) + +/* +** st1_scatter_32_s64_index: +** mov (x[0-9]+), #?256 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_32_s64_index, svint64_t, svuint64_t, + svst1_scatter_u64base_index_s64 (p0, z1, 32, z0), + svst1_scatter_index (p0, z1, 32, z0)) + +/* +** st1_scatter_x0_s64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_s64_s64offset, svint64_t, int64_t, svint64_t, + svst1_scatter_s64offset_s64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_s64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_s64_s64offset, svint64_t, int64_t, svint64_t, + svst1_scatter_s64offset_s64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_s64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_s64_s64offset, svint64_t, int64_t, svint64_t, + svst1_scatter_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_s64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_s64_u64offset, svint64_t, int64_t, svuint64_t, + svst1_scatter_u64offset_s64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_s64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_s64_u64offset, svint64_t, int64_t, svuint64_t, + svst1_scatter_u64offset_s64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_s64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_s64_u64offset, svint64_t, int64_t, svuint64_t, + svst1_scatter_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_s64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_s64_s64index, svint64_t, int64_t, svint64_t, + 
svst1_scatter_s64index_s64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_s64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_s64_s64index, svint64_t, int64_t, svint64_t, + svst1_scatter_s64index_s64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_s64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_s64_s64index, svint64_t, int64_t, svint64_t, + svst1_scatter_s64index_s64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_s64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_s64_u64index, svint64_t, int64_t, svuint64_t, + svst1_scatter_u64index_s64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_s64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_s64_u64index, svint64_t, int64_t, svuint64_t, + svst1_scatter_u64index_s64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_s64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_s64_u64index, svint64_t, int64_t, svuint64_t, + svst1_scatter_u64index_s64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u32.c new file mode 100644 index 000000000..fe3f7259f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u32.c @@ -0,0 +1,227 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_scatter_u32: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_u32, svuint32_t, svuint32_t, + svst1_scatter_u32base_u32 (p0, z1, z0), + svst1_scatter (p0, z1, z0)) + +/* +** st1_scatter_x0_u32_offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, x0, z0), + svst1_scatter_offset (p0, z1, x0, z0)) + +/* +** st1_scatter_m4_u32_offset: +** mov (x[0-9]+), #?-4 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m4_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, -4, z0), + svst1_scatter_offset (p0, z1, -4, z0)) + +/* +** st1_scatter_0_u32_offset: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, 0, z0), + svst1_scatter_offset (p0, z1, 0, z0)) + +/* +** st1_scatter_5_u32_offset: +** mov (x[0-9]+), #?5 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_5_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, 5, z0), + svst1_scatter_offset (p0, z1, 5, z0)) + +/* +** st1_scatter_6_u32_offset: +** mov (x[0-9]+), #?6 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_6_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, 6, z0), + svst1_scatter_offset (p0, z1, 6, z0)) + +/* +** st1_scatter_7_u32_offset: +** mov (x[0-9]+), #?7 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_7_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, 7, z0), + svst1_scatter_offset (p0, z1, 7, z0)) + +/* +** st1_scatter_8_u32_offset: +** st1w z0\.s, p0, \[z1\.s, #8\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_8_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, 8, z0), + svst1_scatter_offset (p0, z1, 8, z0)) + +/* +** st1_scatter_124_u32_offset: +** st1w z0\.s, p0, \[z1\.s, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_124_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, 124, z0), + svst1_scatter_offset (p0, z1, 124, z0)) + +/* +** st1_scatter_128_u32_offset: +** mov (x[0-9]+), #?128 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_128_u32_offset, svuint32_t, svuint32_t, + svst1_scatter_u32base_offset_u32 (p0, z1, 128, z0), + svst1_scatter_offset (p0, z1, 128, z0)) + +/* +** st1_scatter_x0_u32_index: +** lsl (x[0-9]+), x0, #?2 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_u32_index, svuint32_t, svuint32_t, + svst1_scatter_u32base_index_u32 (p0, z1, x0, z0), + svst1_scatter_index (p0, z1, x0, z0)) + +/* +** st1_scatter_m1_u32_index: +** mov (x[0-9]+), #?-4 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m1_u32_index, svuint32_t, svuint32_t, + svst1_scatter_u32base_index_u32 (p0, z1, -1, z0), + svst1_scatter_index (p0, z1, -1, z0)) + +/* +** st1_scatter_0_u32_index: +** st1w z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_u32_index, svuint32_t, svuint32_t, + svst1_scatter_u32base_index_u32 (p0, z1, 0, z0), + svst1_scatter_index (p0, z1, 0, z0)) + +/* +** st1_scatter_5_u32_index: +** st1w z0\.s, p0, \[z1\.s, #20\] +** ret +*/ 
+TEST_STORE_SCATTER_ZS (st1_scatter_5_u32_index, svuint32_t, svuint32_t, + svst1_scatter_u32base_index_u32 (p0, z1, 5, z0), + svst1_scatter_index (p0, z1, 5, z0)) + +/* +** st1_scatter_31_u32_index: +** st1w z0\.s, p0, \[z1\.s, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_31_u32_index, svuint32_t, svuint32_t, + svst1_scatter_u32base_index_u32 (p0, z1, 31, z0), + svst1_scatter_index (p0, z1, 31, z0)) + +/* +** st1_scatter_32_u32_index: +** mov (x[0-9]+), #?128 +** st1w z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_32_u32_index, svuint32_t, svuint32_t, + svst1_scatter_u32base_index_u32 (p0, z1, 32, z0), + svst1_scatter_index (p0, z1, 32, z0)) + +/* +** st1_scatter_x0_u32_s32offset: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_u32_s32offset, svuint32_t, uint32_t, svint32_t, + svst1_scatter_s32offset_u32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_u32_s32offset: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_u32_s32offset, svuint32_t, uint32_t, svint32_t, + svst1_scatter_s32offset_u32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_u32_u32offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_u32_u32offset, svuint32_t, uint32_t, svuint32_t, + svst1_scatter_u32offset_u32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_u32_u32offset: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_u32_u32offset, svuint32_t, uint32_t, svuint32_t, + svst1_scatter_u32offset_u32 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_u32_s32index: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_u32_s32index, svuint32_t, uint32_t, svint32_t, + svst1_scatter_s32index_u32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_u32_s32index: +** st1w z0\.s, p0, \[x0, z1\.s, sxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_u32_s32index, svuint32_t, uint32_t, svint32_t, + svst1_scatter_s32index_u32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_x0_u32_u32index: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_u32_u32index, svuint32_t, uint32_t, svuint32_t, + svst1_scatter_u32index_u32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_u32_u32index: +** st1w z0\.s, p0, \[x0, z1\.s, uxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_u32_u32index, svuint32_t, uint32_t, svuint32_t, + svst1_scatter_u32index_u32 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u64.c new file mode 100644 index 000000000..232123566 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_scatter_u64.c @@ -0,0 +1,303 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_scatter_u64: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_u64, svuint64_t, svuint64_t, + svst1_scatter_u64base_u64 (p0, z1, z0), + svst1_scatter (p0, z1, z0)) + +/* +** st1_scatter_x0_u64_offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, x0, z0), + svst1_scatter_offset (p0, z1, x0, z0)) + +/* +** st1_scatter_m8_u64_offset: +** mov (x[0-9]+), #?-8 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m8_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, -8, z0), + svst1_scatter_offset (p0, z1, -8, z0)) + +/* +** st1_scatter_0_u64_offset: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 0, z0), + svst1_scatter_offset (p0, z1, 0, z0)) + +/* +** st1_scatter_9_u64_offset: +** mov (x[0-9]+), #?9 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_9_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 9, z0), + svst1_scatter_offset (p0, z1, 9, z0)) + +/* +** st1_scatter_10_u64_offset: +** mov (x[0-9]+), #?10 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_10_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 10, z0), + svst1_scatter_offset (p0, z1, 10, z0)) + +/* +** st1_scatter_11_u64_offset: +** mov (x[0-9]+), #?11 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_11_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 11, z0), + svst1_scatter_offset (p0, z1, 11, z0)) + +/* +** st1_scatter_12_u64_offset: +** mov (x[0-9]+), #?12 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_12_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 12, z0), + svst1_scatter_offset (p0, z1, 12, z0)) + +/* +** st1_scatter_13_u64_offset: +** mov (x[0-9]+), #?13 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_13_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 13, z0), + svst1_scatter_offset (p0, z1, 13, z0)) + +/* +** st1_scatter_14_u64_offset: +** mov (x[0-9]+), #?14 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_14_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 14, z0), + svst1_scatter_offset (p0, z1, 14, z0)) + +/* +** st1_scatter_15_u64_offset: +** mov (x[0-9]+), #?15 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_15_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 15, z0), + svst1_scatter_offset (p0, z1, 15, z0)) + +/* +** st1_scatter_16_u64_offset: +** st1d z0\.d, p0, \[z1\.d, #16\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_16_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 16, z0), + svst1_scatter_offset (p0, z1, 16, z0)) + +/* +** st1_scatter_248_u64_offset: +** st1d z0\.d, p0, \[z1\.d, #248\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_248_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 248, z0), + svst1_scatter_offset (p0, z1, 248, z0)) + +/* +** st1_scatter_256_u64_offset: +** mov (x[0-9]+), #?256 +** st1d z0\.d, p0, \[\1, 
z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_256_u64_offset, svuint64_t, svuint64_t, + svst1_scatter_u64base_offset_u64 (p0, z1, 256, z0), + svst1_scatter_offset (p0, z1, 256, z0)) + +/* +** st1_scatter_x0_u64_index: +** lsl (x[0-9]+), x0, #?3 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_x0_u64_index, svuint64_t, svuint64_t, + svst1_scatter_u64base_index_u64 (p0, z1, x0, z0), + svst1_scatter_index (p0, z1, x0, z0)) + +/* +** st1_scatter_m1_u64_index: +** mov (x[0-9]+), #?-8 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_m1_u64_index, svuint64_t, svuint64_t, + svst1_scatter_u64base_index_u64 (p0, z1, -1, z0), + svst1_scatter_index (p0, z1, -1, z0)) + +/* +** st1_scatter_0_u64_index: +** st1d z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_0_u64_index, svuint64_t, svuint64_t, + svst1_scatter_u64base_index_u64 (p0, z1, 0, z0), + svst1_scatter_index (p0, z1, 0, z0)) + +/* +** st1_scatter_5_u64_index: +** st1d z0\.d, p0, \[z1\.d, #40\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_5_u64_index, svuint64_t, svuint64_t, + svst1_scatter_u64base_index_u64 (p0, z1, 5, z0), + svst1_scatter_index (p0, z1, 5, z0)) + +/* +** st1_scatter_31_u64_index: +** st1d z0\.d, p0, \[z1\.d, #248\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_31_u64_index, svuint64_t, svuint64_t, + svst1_scatter_u64base_index_u64 (p0, z1, 31, z0), + svst1_scatter_index (p0, z1, 31, z0)) + +/* +** st1_scatter_32_u64_index: +** mov (x[0-9]+), #?256 +** st1d z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1_scatter_32_u64_index, svuint64_t, svuint64_t, + svst1_scatter_u64base_index_u64 (p0, z1, 32, z0), + svst1_scatter_index (p0, z1, 32, z0)) + +/* +** st1_scatter_x0_u64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_u64_s64offset, svuint64_t, uint64_t, svint64_t, + svst1_scatter_s64offset_u64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_u64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_u64_s64offset, svuint64_t, uint64_t, svint64_t, + svst1_scatter_s64offset_u64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_u64_s64offset: +** st1d z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_u64_s64offset, svuint64_t, uint64_t, svint64_t, + svst1_scatter_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_u64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + svst1_scatter_u64offset_u64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_u64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + svst1_scatter_u64offset_u64 (p0, x0, z1, z0), + svst1_scatter_offset (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_u64_u64offset: +** st1d z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_u64_u64offset, svuint64_t, uint64_t, svuint64_t, + svst1_scatter_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_u64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_u64_s64index, svuint64_t, 
uint64_t, svint64_t, + svst1_scatter_s64index_u64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_u64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_u64_s64index, svuint64_t, uint64_t, svint64_t, + svst1_scatter_s64index_u64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_u64_s64index: +** st1d z0\.d, p0, \[x0, z1\.d, sxtw 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_u64_s64index, svuint64_t, uint64_t, svint64_t, + svst1_scatter_s64index_u64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1_scatter_x0_u64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_x0_u64_u64index, svuint64_t, uint64_t, svuint64_t, + svst1_scatter_u64index_u64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_u64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, lsl 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_u64_u64index, svuint64_t, uint64_t, svuint64_t, + svst1_scatter_u64index_u64 (p0, x0, z1, z0), + svst1_scatter_index (p0, x0, z1, z0)) + +/* +** st1_scatter_ext_u64_u64index: +** st1d z0\.d, p0, \[x0, z1\.d, uxtw 3\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1_scatter_ext_u64_u64index, svuint64_t, uint64_t, svuint64_t, + svst1_scatter_u64index_u64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1_scatter_index (p0, x0, svextw_x (p0, z1), z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u16.c new file mode 100644 index 000000000..e9dc05219 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_u16_base: +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u16_base, svuint16_t, uint16_t, + svst1_u16 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_u16_index: +** st1h z0\.h, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st1_u16_index, svuint16_t, uint16_t, + svst1_u16 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_u16_1: +** st1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_u16_1, svuint16_t, uint16_t, + svst1_u16 (p0, x0 + svcnth (), z0), + svst1 (p0, x0 + svcnth (), z0)) + +/* +** st1_u16_7: +** st1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_u16_7, svuint16_t, uint16_t, + svst1_u16 (p0, x0 + svcnth () * 7, z0), + svst1 (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_u16_8: +** incb x0, all, mul #8 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u16_8, svuint16_t, uint16_t, + svst1_u16 (p0, x0 + svcnth () * 8, z0), + svst1 (p0, x0 + svcnth () * 8, z0)) + +/* +** st1_u16_m1: +** st1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_u16_m1, svuint16_t, uint16_t, + svst1_u16 (p0, x0 - svcnth (), z0), + svst1 (p0, x0 - svcnth (), z0)) + +/* +** st1_u16_m8: +** st1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_u16_m8, svuint16_t, uint16_t, + svst1_u16 (p0, x0 - svcnth () * 8, z0), + svst1 (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_u16_m9: +** decb x0, all, mul #9 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u16_m9, svuint16_t, uint16_t, + svst1_u16 (p0, x0 - svcnth () * 9, z0), + svst1 (p0, x0 - svcnth () * 9, z0)) + +/* +** st1_vnum_u16_0: +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u16_0, svuint16_t, uint16_t, + svst1_vnum_u16 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_u16_1: +** st1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u16_1, svuint16_t, uint16_t, + svst1_vnum_u16 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_u16_7: +** st1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u16_7, svuint16_t, uint16_t, + svst1_vnum_u16 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_u16_8: +** incb x0, all, mul #8 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u16_8, svuint16_t, uint16_t, + svst1_vnum_u16 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_u16_m1: +** st1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u16_m1, svuint16_t, uint16_t, + svst1_vnum_u16 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_u16_m8: +** st1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u16_m8, svuint16_t, uint16_t, + svst1_vnum_u16 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_u16_m9: +** decb x0, all, mul #9 +** st1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u16_m9, svuint16_t, uint16_t, + svst1_vnum_u16 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1h z0\.h, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_u16_x1, svuint16_t, uint16_t, + svst1_vnum_u16 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u32.c new file mode 100644 index 000000000..8610ae4c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_u32_base: +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u32_base, svuint32_t, uint32_t, + svst1_u32 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_u32_index: +** st1w z0\.s, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st1_u32_index, svuint32_t, uint32_t, + svst1_u32 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_u32_1: +** st1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_u32_1, svuint32_t, uint32_t, + svst1_u32 (p0, x0 + svcntw (), z0), + svst1 (p0, x0 + svcntw (), z0)) + +/* +** st1_u32_7: +** st1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_u32_7, svuint32_t, uint32_t, + svst1_u32 (p0, x0 + svcntw () * 7, z0), + svst1 (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_u32_8: +** incb x0, all, mul #8 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u32_8, svuint32_t, uint32_t, + svst1_u32 (p0, x0 + svcntw () * 8, z0), + svst1 (p0, x0 + svcntw () * 8, z0)) + +/* +** st1_u32_m1: +** st1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_u32_m1, svuint32_t, uint32_t, + svst1_u32 (p0, x0 - svcntw (), z0), + svst1 (p0, x0 - svcntw (), z0)) + +/* +** st1_u32_m8: +** st1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_u32_m8, svuint32_t, uint32_t, + svst1_u32 (p0, x0 - svcntw () * 8, z0), + svst1 (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_u32_m9: +** decb x0, all, mul #9 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u32_m9, svuint32_t, uint32_t, + svst1_u32 (p0, x0 - svcntw () * 9, z0), + svst1 (p0, x0 - svcntw () * 9, z0)) + +/* +** st1_vnum_u32_0: +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u32_0, svuint32_t, uint32_t, + svst1_vnum_u32 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_u32_1: +** st1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u32_1, svuint32_t, uint32_t, + svst1_vnum_u32 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_u32_7: +** st1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u32_7, svuint32_t, uint32_t, + svst1_vnum_u32 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_u32_8: +** incb x0, all, mul #8 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u32_8, svuint32_t, uint32_t, + svst1_vnum_u32 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_u32_m1: +** st1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u32_m1, svuint32_t, uint32_t, + svst1_vnum_u32 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_u32_m8: +** st1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u32_m8, svuint32_t, uint32_t, + svst1_vnum_u32 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_u32_m9: +** decb x0, all, mul #9 +** st1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u32_m9, svuint32_t, uint32_t, + svst1_vnum_u32 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1w z0\.s, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_u32_x1, svuint32_t, uint32_t, + svst1_vnum_u32 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u64.c new file mode 100644 index 000000000..5d4fae932 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_u64_base: +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u64_base, svuint64_t, uint64_t, + svst1_u64 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_u64_index: +** st1d z0\.d, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st1_u64_index, svuint64_t, uint64_t, + svst1_u64 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_u64_1: +** st1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_u64_1, svuint64_t, uint64_t, + svst1_u64 (p0, x0 + svcntd (), z0), + svst1 (p0, x0 + svcntd (), z0)) + +/* +** st1_u64_7: +** st1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_u64_7, svuint64_t, uint64_t, + svst1_u64 (p0, x0 + svcntd () * 7, z0), + svst1 (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_u64_8: +** incb x0, all, mul #8 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u64_8, svuint64_t, uint64_t, + svst1_u64 (p0, x0 + svcntd () * 8, z0), + svst1 (p0, x0 + svcntd () * 8, z0)) + +/* +** st1_u64_m1: +** st1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_u64_m1, svuint64_t, uint64_t, + svst1_u64 (p0, x0 - svcntd (), z0), + svst1 (p0, x0 - svcntd (), z0)) + +/* +** st1_u64_m8: +** st1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_u64_m8, svuint64_t, uint64_t, + svst1_u64 (p0, x0 - svcntd () * 8, z0), + svst1 (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_u64_m9: +** decb x0, all, mul #9 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u64_m9, svuint64_t, uint64_t, + svst1_u64 (p0, x0 - svcntd () * 9, z0), + svst1 (p0, x0 - svcntd () * 9, z0)) + +/* +** st1_vnum_u64_0: +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u64_0, svuint64_t, uint64_t, + svst1_vnum_u64 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_u64_1: +** st1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u64_1, svuint64_t, uint64_t, + svst1_vnum_u64 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_u64_7: +** st1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u64_7, svuint64_t, uint64_t, + svst1_vnum_u64 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_u64_8: +** incb x0, all, mul #8 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u64_8, svuint64_t, uint64_t, + svst1_vnum_u64 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_u64_m1: +** st1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u64_m1, svuint64_t, uint64_t, + svst1_vnum_u64 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_u64_m8: +** st1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u64_m8, svuint64_t, uint64_t, + svst1_vnum_u64 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_u64_m9: +** decb x0, all, mul #9 +** st1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u64_m9, svuint64_t, uint64_t, + svst1_vnum_u64 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st1_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1d z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (st1_vnum_u64_x1, svuint64_t, uint64_t, + svst1_vnum_u64 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u8.c new file mode 100644 index 000000000..52c79d0e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1_u8.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1_u8_base: +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u8_base, svuint8_t, uint8_t, + svst1_u8 (p0, x0, z0), + svst1 (p0, x0, z0)) + +/* +** st1_u8_index: +** st1b z0\.b, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st1_u8_index, svuint8_t, uint8_t, + svst1_u8 (p0, x0 + x1, z0), + svst1 (p0, x0 + x1, z0)) + +/* +** st1_u8_1: +** st1b z0\.b, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_u8_1, svuint8_t, uint8_t, + svst1_u8 (p0, x0 + svcntb (), z0), + svst1 (p0, x0 + svcntb (), z0)) + +/* +** st1_u8_7: +** st1b z0\.b, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_u8_7, svuint8_t, uint8_t, + svst1_u8 (p0, x0 + svcntb () * 7, z0), + svst1 (p0, x0 + svcntb () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_u8_8: +** incb x0, all, mul #8 +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u8_8, svuint8_t, uint8_t, + svst1_u8 (p0, x0 + svcntb () * 8, z0), + svst1 (p0, x0 + svcntb () * 8, z0)) + +/* +** st1_u8_m1: +** st1b z0\.b, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_u8_m1, svuint8_t, uint8_t, + svst1_u8 (p0, x0 - svcntb (), z0), + svst1 (p0, x0 - svcntb (), z0)) + +/* +** st1_u8_m8: +** st1b z0\.b, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_u8_m8, svuint8_t, uint8_t, + svst1_u8 (p0, x0 - svcntb () * 8, z0), + svst1 (p0, x0 - svcntb () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_u8_m9: +** decb x0, all, mul #9 +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_u8_m9, svuint8_t, uint8_t, + svst1_u8 (p0, x0 - svcntb () * 9, z0), + svst1 (p0, x0 - svcntb () * 9, z0)) + +/* +** st1_vnum_u8_0: +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u8_0, svuint8_t, uint8_t, + svst1_vnum_u8 (p0, x0, 0, z0), + svst1_vnum (p0, x0, 0, z0)) + +/* +** st1_vnum_u8_1: +** st1b z0\.b, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u8_1, svuint8_t, uint8_t, + svst1_vnum_u8 (p0, x0, 1, z0), + svst1_vnum (p0, x0, 1, z0)) + +/* +** st1_vnum_u8_7: +** st1b z0\.b, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u8_7, svuint8_t, uint8_t, + svst1_vnum_u8 (p0, x0, 7, z0), + svst1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1_vnum_u8_8: +** incb x0, all, mul #8 +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u8_8, svuint8_t, uint8_t, + svst1_vnum_u8 (p0, x0, 8, z0), + svst1_vnum (p0, x0, 8, z0)) + +/* +** st1_vnum_u8_m1: +** st1b z0\.b, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u8_m1, svuint8_t, uint8_t, + svst1_vnum_u8 (p0, x0, -1, z0), + svst1_vnum (p0, x0, -1, z0)) + +/* +** st1_vnum_u8_m8: +** st1b z0\.b, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1_vnum_u8_m8, svuint8_t, uint8_t, + svst1_vnum_u8 (p0, x0, -8, z0), + svst1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1_vnum_u8_m9: +** decb x0, all, mul #9 +** st1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (st1_vnum_u8_m9, svuint8_t, uint8_t, + svst1_vnum_u8 (p0, x0, -9, z0), + svst1_vnum (p0, x0, -9, z0)) + +/* +** st1_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st1b z0\.b, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st1b z0\.b, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st1_vnum_u8_x1, svuint8_t, uint8_t, + svst1_vnum_u8 (p0, x0, x1, z0), + svst1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s16.c new file mode 100644 index 000000000..770fb61e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s16.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_s16_base: +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s16_base, svint16_t, int8_t, + svst1b_s16 (p0, x0, z0), + svst1b (p0, x0, z0)) + +/* +** st1b_s16_index: +** st1b z0\.h, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st1b_s16_index, svint16_t, int8_t, + svst1b_s16 (p0, x0 + x1, z0), + svst1b (p0, x0 + x1, z0)) + +/* +** st1b_s16_1: +** st1b z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_s16_1, svint16_t, int8_t, + svst1b_s16 (p0, x0 + svcnth (), z0), + svst1b (p0, x0 + svcnth (), z0)) + +/* +** st1b_s16_7: +** st1b z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_s16_7, svint16_t, int8_t, + svst1b_s16 (p0, x0 + svcnth () * 7, z0), + svst1b (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_s16_8: +** incb x0, all, mul #4 +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s16_8, svint16_t, int8_t, + svst1b_s16 (p0, x0 + svcnth () * 8, z0), + svst1b (p0, x0 + svcnth () * 8, z0)) + +/* +** st1b_s16_m1: +** st1b z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_s16_m1, svint16_t, int8_t, + svst1b_s16 (p0, x0 - svcnth (), z0), + svst1b (p0, x0 - svcnth (), z0)) + +/* +** st1b_s16_m8: +** st1b z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_s16_m8, svint16_t, int8_t, + svst1b_s16 (p0, x0 - svcnth () * 8, z0), + svst1b (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_s16_m9: +** dech x0, all, mul #9 +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s16_m9, svint16_t, int8_t, + svst1b_s16 (p0, x0 - svcnth () * 9, z0), + svst1b (p0, x0 - svcnth () * 9, z0)) + +/* +** st1b_vnum_s16_0: +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s16_0, svint16_t, int8_t, + svst1b_vnum_s16 (p0, x0, 0, z0), + svst1b_vnum (p0, x0, 0, z0)) + +/* +** st1b_vnum_s16_1: +** st1b z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s16_1, svint16_t, int8_t, + svst1b_vnum_s16 (p0, x0, 1, z0), + svst1b_vnum (p0, x0, 1, z0)) + +/* +** st1b_vnum_s16_7: +** st1b z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s16_7, svint16_t, int8_t, + svst1b_vnum_s16 (p0, x0, 7, z0), + svst1b_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1b_vnum_s16_8: +** incb x0, all, mul #4 +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s16_8, svint16_t, int8_t, + svst1b_vnum_s16 (p0, x0, 8, z0), + svst1b_vnum (p0, x0, 8, z0)) + +/* +** st1b_vnum_s16_m1: +** st1b z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s16_m1, svint16_t, int8_t, + svst1b_vnum_s16 (p0, x0, -1, z0), + svst1b_vnum (p0, x0, -1, z0)) + +/* +** st1b_vnum_s16_m8: +** st1b z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s16_m8, svint16_t, int8_t, + svst1b_vnum_s16 (p0, x0, -8, z0), + svst1b_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_s16_m9: +** dech x0, all, mul #9 +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s16_m9, svint16_t, int8_t, + svst1b_vnum_s16 (p0, x0, -9, z0), + svst1b_vnum (p0, x0, -9, z0)) + +/* +** st1b_vnum_s16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st1b z0\.h, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st1b z0\.h, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st1b_vnum_s16_x1, svint16_t, int8_t, + svst1b_vnum_s16 (p0, x0, x1, z0), + svst1b_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s32.c new file mode 100644 index 000000000..85333aea9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s32.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_s32_base: +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s32_base, svint32_t, int8_t, + svst1b_s32 (p0, x0, z0), + svst1b (p0, x0, z0)) + +/* +** st1b_s32_index: +** st1b z0\.s, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st1b_s32_index, svint32_t, int8_t, + svst1b_s32 (p0, x0 + x1, z0), + svst1b (p0, x0 + x1, z0)) + +/* +** st1b_s32_1: +** st1b z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_s32_1, svint32_t, int8_t, + svst1b_s32 (p0, x0 + svcntw (), z0), + svst1b (p0, x0 + svcntw (), z0)) + +/* +** st1b_s32_7: +** st1b z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_s32_7, svint32_t, int8_t, + svst1b_s32 (p0, x0 + svcntw () * 7, z0), + svst1b (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_s32_8: +** incb x0, all, mul #2 +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s32_8, svint32_t, int8_t, + svst1b_s32 (p0, x0 + svcntw () * 8, z0), + svst1b (p0, x0 + svcntw () * 8, z0)) + +/* +** st1b_s32_m1: +** st1b z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_s32_m1, svint32_t, int8_t, + svst1b_s32 (p0, x0 - svcntw (), z0), + svst1b (p0, x0 - svcntw (), z0)) + +/* +** st1b_s32_m8: +** st1b z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_s32_m8, svint32_t, int8_t, + svst1b_s32 (p0, x0 - svcntw () * 8, z0), + svst1b (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1b_s32_m9: +** decw x0, all, mul #9 +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s32_m9, svint32_t, int8_t, + svst1b_s32 (p0, x0 - svcntw () * 9, z0), + svst1b (p0, x0 - svcntw () * 9, z0)) + +/* +** st1b_vnum_s32_0: +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s32_0, svint32_t, int8_t, + svst1b_vnum_s32 (p0, x0, 0, z0), + svst1b_vnum (p0, x0, 0, z0)) + +/* +** st1b_vnum_s32_1: +** st1b z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s32_1, svint32_t, int8_t, + svst1b_vnum_s32 (p0, x0, 1, z0), + svst1b_vnum (p0, x0, 1, z0)) + +/* +** st1b_vnum_s32_7: +** st1b z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s32_7, svint32_t, int8_t, + svst1b_vnum_s32 (p0, x0, 7, z0), + svst1b_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_s32_8: +** incb x0, all, mul #2 +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s32_8, svint32_t, int8_t, + svst1b_vnum_s32 (p0, x0, 8, z0), + svst1b_vnum (p0, x0, 8, z0)) + +/* +** st1b_vnum_s32_m1: +** st1b z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s32_m1, svint32_t, int8_t, + svst1b_vnum_s32 (p0, x0, -1, z0), + svst1b_vnum (p0, x0, -1, z0)) + +/* +** st1b_vnum_s32_m8: +** st1b z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s32_m8, svint32_t, int8_t, + svst1b_vnum_s32 (p0, x0, -8, z0), + svst1b_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_s32_m9: +** decw x0, all, mul #9 +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s32_m9, svint32_t, int8_t, + svst1b_vnum_s32 (p0, x0, -9, z0), + svst1b_vnum (p0, x0, -9, z0)) + +/* +** st1b_vnum_s32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st1b z0\.s, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st1b z0\.s, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st1b_vnum_s32_x1, svint32_t, int8_t, + svst1b_vnum_s32 (p0, x0, x1, z0), + svst1b_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s64.c new file mode 100644 index 000000000..321f168d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_s64.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_s64_base: +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s64_base, svint64_t, int8_t, + svst1b_s64 (p0, x0, z0), + svst1b (p0, x0, z0)) + +/* +** st1b_s64_index: +** st1b z0\.d, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st1b_s64_index, svint64_t, int8_t, + svst1b_s64 (p0, x0 + x1, z0), + svst1b (p0, x0 + x1, z0)) + +/* +** st1b_s64_1: +** st1b z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_s64_1, svint64_t, int8_t, + svst1b_s64 (p0, x0 + svcntd (), z0), + svst1b (p0, x0 + svcntd (), z0)) + +/* +** st1b_s64_7: +** st1b z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_s64_7, svint64_t, int8_t, + svst1b_s64 (p0, x0 + svcntd () * 7, z0), + svst1b (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1b_s64_8: +** incb x0 +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s64_8, svint64_t, int8_t, + svst1b_s64 (p0, x0 + svcntd () * 8, z0), + svst1b (p0, x0 + svcntd () * 8, z0)) + +/* +** st1b_s64_m1: +** st1b z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_s64_m1, svint64_t, int8_t, + svst1b_s64 (p0, x0 - svcntd (), z0), + svst1b (p0, x0 - svcntd (), z0)) + +/* +** st1b_s64_m8: +** st1b z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_s64_m8, svint64_t, int8_t, + svst1b_s64 (p0, x0 - svcntd () * 8, z0), + svst1b (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_s64_m9: +** decd x0, all, mul #9 +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_s64_m9, svint64_t, int8_t, + svst1b_s64 (p0, x0 - svcntd () * 9, z0), + svst1b (p0, x0 - svcntd () * 9, z0)) + +/* +** st1b_vnum_s64_0: +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s64_0, svint64_t, int8_t, + svst1b_vnum_s64 (p0, x0, 0, z0), + svst1b_vnum (p0, x0, 0, z0)) + +/* +** st1b_vnum_s64_1: +** st1b z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s64_1, svint64_t, int8_t, + svst1b_vnum_s64 (p0, x0, 1, z0), + svst1b_vnum (p0, x0, 1, z0)) + +/* +** st1b_vnum_s64_7: +** st1b z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s64_7, svint64_t, int8_t, + svst1b_vnum_s64 (p0, x0, 7, z0), + svst1b_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_s64_8: +** incb x0 +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s64_8, svint64_t, int8_t, + svst1b_vnum_s64 (p0, x0, 8, z0), + svst1b_vnum (p0, x0, 8, z0)) + +/* +** st1b_vnum_s64_m1: +** st1b z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s64_m1, svint64_t, int8_t, + svst1b_vnum_s64 (p0, x0, -1, z0), + svst1b_vnum (p0, x0, -1, z0)) + +/* +** st1b_vnum_s64_m8: +** st1b z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_s64_m8, svint64_t, int8_t, + svst1b_vnum_s64 (p0, x0, -8, z0), + svst1b_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_s64_m9: +** decd x0, all, mul #9 +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_s64_m9, svint64_t, int8_t, + svst1b_vnum_s64 (p0, x0, -9, z0), + svst1b_vnum (p0, x0, -9, z0)) + +/* +** st1b_vnum_s64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st1b z0\.d, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st1b z0\.d, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st1b_vnum_s64_x1, svint64_t, int8_t, + svst1b_vnum_s64 (p0, x0, x1, z0), + svst1b_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s32.c new file mode 100644 index 000000000..d59033356 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s32.c @@ -0,0 +1,104 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_scatter_s32: +** st1b z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_s32, svint32_t, svuint32_t, + svst1b_scatter_u32base_s32 (p0, z1, z0), + svst1b_scatter (p0, z1, z0)) + +/* +** st1b_scatter_x0_s32_offset: +** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_x0_s32_offset, svint32_t, svuint32_t, + svst1b_scatter_u32base_offset_s32 (p0, z1, x0, z0), + svst1b_scatter_offset (p0, z1, x0, z0)) + +/* +** st1b_scatter_m1_s32_offset: +** mov (x[0-9]+), #?-1 +** st1b z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_m1_s32_offset, svint32_t, svuint32_t, + svst1b_scatter_u32base_offset_s32 (p0, z1, -1, z0), + svst1b_scatter_offset (p0, z1, -1, z0)) + +/* +** st1b_scatter_0_s32_offset: +** st1b z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_0_s32_offset, svint32_t, svuint32_t, + svst1b_scatter_u32base_offset_s32 (p0, z1, 0, z0), + svst1b_scatter_offset (p0, z1, 0, z0)) + +/* +** st1b_scatter_5_s32_offset: +** st1b z0\.s, p0, \[z1\.s, #5\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_5_s32_offset, svint32_t, svuint32_t, + svst1b_scatter_u32base_offset_s32 (p0, z1, 5, z0), + svst1b_scatter_offset (p0, z1, 5, z0)) + +/* +** st1b_scatter_31_s32_offset: +** st1b z0\.s, p0, \[z1\.s, #31\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_31_s32_offset, svint32_t, svuint32_t, + svst1b_scatter_u32base_offset_s32 (p0, z1, 31, z0), + svst1b_scatter_offset (p0, z1, 31, z0)) + +/* +** st1b_scatter_32_s32_offset: +** mov (x[0-9]+), #?32 +** st1b z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_32_s32_offset, svint32_t, svuint32_t, + svst1b_scatter_u32base_offset_s32 (p0, z1, 32, z0), + svst1b_scatter_offset (p0, z1, 32, z0)) + +/* +** st1b_scatter_x0_s32_s32offset: +** st1b z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_x0_s32_s32offset, svint32_t, int8_t, svint32_t, + svst1b_scatter_s32offset_s32 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_s32_s32offset: +** st1b z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_s32_s32offset, svint32_t, int8_t, svint32_t, + svst1b_scatter_s32offset_s32 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_x0_s32_u32offset: +** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_x0_s32_u32offset, svint32_t, int8_t, svuint32_t, + svst1b_scatter_u32offset_s32 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_s32_u32offset: +** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_s32_u32offset, svint32_t, int8_t, svuint32_t, + svst1b_scatter_u32offset_s32 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s64.c new file mode 100644 index 000000000..c7a35f1b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_s64.c @@ -0,0 +1,122 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_scatter_s64: +** st1b z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_s64, svint64_t, svuint64_t, + svst1b_scatter_u64base_s64 (p0, z1, z0), + svst1b_scatter (p0, z1, z0)) + +/* +** st1b_scatter_x0_s64_offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_x0_s64_offset, svint64_t, svuint64_t, + svst1b_scatter_u64base_offset_s64 (p0, z1, x0, z0), + svst1b_scatter_offset (p0, z1, x0, z0)) + +/* +** st1b_scatter_m1_s64_offset: +** mov (x[0-9]+), #?-1 +** st1b z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_m1_s64_offset, svint64_t, svuint64_t, + svst1b_scatter_u64base_offset_s64 (p0, z1, -1, z0), + svst1b_scatter_offset (p0, z1, -1, z0)) + +/* +** st1b_scatter_0_s64_offset: +** st1b z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_0_s64_offset, svint64_t, svuint64_t, + svst1b_scatter_u64base_offset_s64 (p0, z1, 0, z0), + svst1b_scatter_offset (p0, z1, 0, z0)) + +/* +** st1b_scatter_5_s64_offset: +** st1b z0\.d, p0, \[z1\.d, #5\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_5_s64_offset, svint64_t, svuint64_t, + svst1b_scatter_u64base_offset_s64 (p0, z1, 5, z0), + svst1b_scatter_offset (p0, z1, 5, z0)) + +/* +** st1b_scatter_31_s64_offset: +** st1b z0\.d, p0, \[z1\.d, #31\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_31_s64_offset, svint64_t, svuint64_t, + svst1b_scatter_u64base_offset_s64 (p0, z1, 31, z0), + svst1b_scatter_offset (p0, z1, 31, z0)) + +/* +** st1b_scatter_32_s64_offset: +** mov (x[0-9]+), #?32 +** st1b z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_32_s64_offset, svint64_t, svuint64_t, + svst1b_scatter_u64base_offset_s64 (p0, z1, 32, z0), + svst1b_scatter_offset (p0, z1, 32, z0)) + +/* +** st1b_scatter_x0_s64_s64offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_x0_s64_s64offset, svint64_t, int8_t, svint64_t, + svst1b_scatter_s64offset_s64 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_s64_s64offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_s64_s64offset, svint64_t, int8_t, svint64_t, + svst1b_scatter_s64offset_s64 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_ext_s64_s64offset: +** st1b z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_ext_s64_s64offset, svint64_t, int8_t, svint64_t, + svst1b_scatter_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1b_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1b_scatter_x0_s64_u64offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_x0_s64_u64offset, svint64_t, int8_t, svuint64_t, + svst1b_scatter_u64offset_s64 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_s64_u64offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_s64_u64offset, svint64_t, int8_t, svuint64_t, + svst1b_scatter_u64offset_s64 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_ext_s64_u64offset: +** st1b z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_ext_s64_u64offset, svint64_t, int8_t, svuint64_t, + svst1b_scatter_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1b_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u32.c new file mode 100644 index 000000000..e098cb9b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u32.c @@ -0,0 +1,104 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_scatter_u32: +** st1b z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_u32, svuint32_t, svuint32_t, + svst1b_scatter_u32base_u32 (p0, z1, z0), + svst1b_scatter (p0, z1, z0)) + +/* +** st1b_scatter_x0_u32_offset: +** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_x0_u32_offset, svuint32_t, svuint32_t, + svst1b_scatter_u32base_offset_u32 (p0, z1, x0, z0), + svst1b_scatter_offset (p0, z1, x0, z0)) + +/* +** st1b_scatter_m1_u32_offset: +** mov (x[0-9]+), #?-1 +** st1b z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_m1_u32_offset, svuint32_t, svuint32_t, + svst1b_scatter_u32base_offset_u32 (p0, z1, -1, z0), + svst1b_scatter_offset (p0, z1, -1, z0)) + +/* +** st1b_scatter_0_u32_offset: +** st1b z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_0_u32_offset, svuint32_t, svuint32_t, + svst1b_scatter_u32base_offset_u32 (p0, z1, 0, z0), + svst1b_scatter_offset (p0, z1, 0, z0)) + +/* +** st1b_scatter_5_u32_offset: +** st1b z0\.s, p0, \[z1\.s, #5\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_5_u32_offset, svuint32_t, svuint32_t, + svst1b_scatter_u32base_offset_u32 (p0, z1, 5, z0), + svst1b_scatter_offset (p0, z1, 5, z0)) + +/* +** st1b_scatter_31_u32_offset: +** st1b z0\.s, p0, \[z1\.s, #31\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_31_u32_offset, svuint32_t, svuint32_t, + svst1b_scatter_u32base_offset_u32 (p0, z1, 31, z0), + svst1b_scatter_offset (p0, z1, 31, z0)) + +/* +** st1b_scatter_32_u32_offset: +** mov (x[0-9]+), #?32 +** st1b z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_32_u32_offset, svuint32_t, svuint32_t, + svst1b_scatter_u32base_offset_u32 (p0, z1, 32, z0), + svst1b_scatter_offset (p0, z1, 32, z0)) + +/* +** st1b_scatter_x0_u32_s32offset: +** st1b z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_x0_u32_s32offset, svuint32_t, uint8_t, svint32_t, + svst1b_scatter_s32offset_u32 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_u32_s32offset: +** st1b z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_u32_s32offset, svuint32_t, uint8_t, svint32_t, + svst1b_scatter_s32offset_u32 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_x0_u32_u32offset: +** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_x0_u32_u32offset, svuint32_t, uint8_t, svuint32_t, + svst1b_scatter_u32offset_u32 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_u32_u32offset: +** st1b z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_u32_u32offset, svuint32_t, uint8_t, svuint32_t, + svst1b_scatter_u32offset_u32 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u64.c new file mode 100644 index 000000000..058d1313f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_scatter_u64.c @@ -0,0 +1,122 @@ +/* { dg-final { check-function-bodies "**" 
"" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_scatter_u64: +** st1b z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_u64, svuint64_t, svuint64_t, + svst1b_scatter_u64base_u64 (p0, z1, z0), + svst1b_scatter (p0, z1, z0)) + +/* +** st1b_scatter_x0_u64_offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_x0_u64_offset, svuint64_t, svuint64_t, + svst1b_scatter_u64base_offset_u64 (p0, z1, x0, z0), + svst1b_scatter_offset (p0, z1, x0, z0)) + +/* +** st1b_scatter_m1_u64_offset: +** mov (x[0-9]+), #?-1 +** st1b z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_m1_u64_offset, svuint64_t, svuint64_t, + svst1b_scatter_u64base_offset_u64 (p0, z1, -1, z0), + svst1b_scatter_offset (p0, z1, -1, z0)) + +/* +** st1b_scatter_0_u64_offset: +** st1b z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_0_u64_offset, svuint64_t, svuint64_t, + svst1b_scatter_u64base_offset_u64 (p0, z1, 0, z0), + svst1b_scatter_offset (p0, z1, 0, z0)) + +/* +** st1b_scatter_5_u64_offset: +** st1b z0\.d, p0, \[z1\.d, #5\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_5_u64_offset, svuint64_t, svuint64_t, + svst1b_scatter_u64base_offset_u64 (p0, z1, 5, z0), + svst1b_scatter_offset (p0, z1, 5, z0)) + +/* +** st1b_scatter_31_u64_offset: +** st1b z0\.d, p0, \[z1\.d, #31\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_31_u64_offset, svuint64_t, svuint64_t, + svst1b_scatter_u64base_offset_u64 (p0, z1, 31, z0), + svst1b_scatter_offset (p0, z1, 31, z0)) + +/* +** st1b_scatter_32_u64_offset: +** mov (x[0-9]+), #?32 +** st1b z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1b_scatter_32_u64_offset, svuint64_t, svuint64_t, + svst1b_scatter_u64base_offset_u64 (p0, z1, 32, z0), + svst1b_scatter_offset (p0, z1, 32, z0)) + +/* +** st1b_scatter_x0_u64_s64offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_x0_u64_s64offset, svuint64_t, uint8_t, svint64_t, + svst1b_scatter_s64offset_u64 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_u64_s64offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_u64_s64offset, svuint64_t, uint8_t, svint64_t, + svst1b_scatter_s64offset_u64 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_ext_u64_s64offset: +** st1b z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_ext_u64_s64offset, svuint64_t, uint8_t, svint64_t, + svst1b_scatter_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1b_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1b_scatter_x0_u64_u64offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_x0_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + svst1b_scatter_u64offset_u64 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_u64_u64offset: +** st1b z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + svst1b_scatter_u64offset_u64 (p0, x0, z1, z0), + svst1b_scatter_offset (p0, x0, z1, z0)) + +/* +** st1b_scatter_ext_u64_u64offset: +** st1b z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1b_scatter_ext_u64_u64offset, svuint64_t, uint8_t, svuint64_t, + svst1b_scatter_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1b_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u16.c new file mode 100644 index 000000000..025a2212a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u16.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_u16_base: +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u16_base, svuint16_t, uint8_t, + svst1b_u16 (p0, x0, z0), + svst1b (p0, x0, z0)) + +/* +** st1b_u16_index: +** st1b z0\.h, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st1b_u16_index, svuint16_t, uint8_t, + svst1b_u16 (p0, x0 + x1, z0), + svst1b (p0, x0 + x1, z0)) + +/* +** st1b_u16_1: +** st1b z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_u16_1, svuint16_t, uint8_t, + svst1b_u16 (p0, x0 + svcnth (), z0), + svst1b (p0, x0 + svcnth (), z0)) + +/* +** st1b_u16_7: +** st1b z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_u16_7, svuint16_t, uint8_t, + svst1b_u16 (p0, x0 + svcnth () * 7, z0), + svst1b (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_u16_8: +** incb x0, all, mul #4 +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u16_8, svuint16_t, uint8_t, + svst1b_u16 (p0, x0 + svcnth () * 8, z0), + svst1b (p0, x0 + svcnth () * 8, z0)) + +/* +** st1b_u16_m1: +** st1b z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_u16_m1, svuint16_t, uint8_t, + svst1b_u16 (p0, x0 - svcnth (), z0), + svst1b (p0, x0 - svcnth (), z0)) + +/* +** st1b_u16_m8: +** st1b z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_u16_m8, svuint16_t, uint8_t, + svst1b_u16 (p0, x0 - svcnth () * 8, z0), + svst1b (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_u16_m9: +** dech x0, all, mul #9 +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u16_m9, svuint16_t, uint8_t, + svst1b_u16 (p0, x0 - svcnth () * 9, z0), + svst1b (p0, x0 - svcnth () * 9, z0)) + +/* +** st1b_vnum_u16_0: +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u16_0, svuint16_t, uint8_t, + svst1b_vnum_u16 (p0, x0, 0, z0), + svst1b_vnum (p0, x0, 0, z0)) + +/* +** st1b_vnum_u16_1: +** st1b z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u16_1, svuint16_t, uint8_t, + svst1b_vnum_u16 (p0, x0, 1, z0), + svst1b_vnum (p0, x0, 1, z0)) + +/* +** st1b_vnum_u16_7: +** st1b z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u16_7, svuint16_t, uint8_t, + svst1b_vnum_u16 (p0, x0, 7, z0), + svst1b_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_u16_8: +** incb x0, all, mul #4 +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u16_8, svuint16_t, uint8_t, + svst1b_vnum_u16 (p0, x0, 8, z0), + svst1b_vnum (p0, x0, 8, z0)) + +/* +** st1b_vnum_u16_m1: +** st1b z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u16_m1, svuint16_t, uint8_t, + svst1b_vnum_u16 (p0, x0, -1, z0), + svst1b_vnum (p0, x0, -1, z0)) + +/* +** st1b_vnum_u16_m8: +** st1b z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u16_m8, svuint16_t, uint8_t, + svst1b_vnum_u16 (p0, x0, -8, z0), + svst1b_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1b_vnum_u16_m9: +** dech x0, all, mul #9 +** st1b z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u16_m9, svuint16_t, uint8_t, + svst1b_vnum_u16 (p0, x0, -9, z0), + svst1b_vnum (p0, x0, -9, z0)) + +/* +** st1b_vnum_u16_x1: +** cnth (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st1b z0\.h, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st1b z0\.h, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st1b_vnum_u16_x1, svuint16_t, uint8_t, + svst1b_vnum_u16 (p0, x0, x1, z0), + svst1b_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u32.c new file mode 100644 index 000000000..5833cb44b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u32.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_u32_base: +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u32_base, svuint32_t, uint8_t, + svst1b_u32 (p0, x0, z0), + svst1b (p0, x0, z0)) + +/* +** st1b_u32_index: +** st1b z0\.s, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st1b_u32_index, svuint32_t, uint8_t, + svst1b_u32 (p0, x0 + x1, z0), + svst1b (p0, x0 + x1, z0)) + +/* +** st1b_u32_1: +** st1b z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_u32_1, svuint32_t, uint8_t, + svst1b_u32 (p0, x0 + svcntw (), z0), + svst1b (p0, x0 + svcntw (), z0)) + +/* +** st1b_u32_7: +** st1b z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_u32_7, svuint32_t, uint8_t, + svst1b_u32 (p0, x0 + svcntw () * 7, z0), + svst1b (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_u32_8: +** incb x0, all, mul #2 +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u32_8, svuint32_t, uint8_t, + svst1b_u32 (p0, x0 + svcntw () * 8, z0), + svst1b (p0, x0 + svcntw () * 8, z0)) + +/* +** st1b_u32_m1: +** st1b z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_u32_m1, svuint32_t, uint8_t, + svst1b_u32 (p0, x0 - svcntw (), z0), + svst1b (p0, x0 - svcntw (), z0)) + +/* +** st1b_u32_m8: +** st1b z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_u32_m8, svuint32_t, uint8_t, + svst1b_u32 (p0, x0 - svcntw () * 8, z0), + svst1b (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_u32_m9: +** decw x0, all, mul #9 +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u32_m9, svuint32_t, uint8_t, + svst1b_u32 (p0, x0 - svcntw () * 9, z0), + svst1b (p0, x0 - svcntw () * 9, z0)) + +/* +** st1b_vnum_u32_0: +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u32_0, svuint32_t, uint8_t, + svst1b_vnum_u32 (p0, x0, 0, z0), + svst1b_vnum (p0, x0, 0, z0)) + +/* +** st1b_vnum_u32_1: +** st1b z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u32_1, svuint32_t, uint8_t, + svst1b_vnum_u32 (p0, x0, 1, z0), + svst1b_vnum (p0, x0, 1, z0)) + +/* +** st1b_vnum_u32_7: +** st1b z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u32_7, svuint32_t, uint8_t, + svst1b_vnum_u32 (p0, x0, 7, z0), + svst1b_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1b_vnum_u32_8: +** incb x0, all, mul #2 +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u32_8, svuint32_t, uint8_t, + svst1b_vnum_u32 (p0, x0, 8, z0), + svst1b_vnum (p0, x0, 8, z0)) + +/* +** st1b_vnum_u32_m1: +** st1b z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u32_m1, svuint32_t, uint8_t, + svst1b_vnum_u32 (p0, x0, -1, z0), + svst1b_vnum (p0, x0, -1, z0)) + +/* +** st1b_vnum_u32_m8: +** st1b z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u32_m8, svuint32_t, uint8_t, + svst1b_vnum_u32 (p0, x0, -8, z0), + svst1b_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_u32_m9: +** decw x0, all, mul #9 +** st1b z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u32_m9, svuint32_t, uint8_t, + svst1b_vnum_u32 (p0, x0, -9, z0), + svst1b_vnum (p0, x0, -9, z0)) + +/* +** st1b_vnum_u32_x1: +** cntw (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st1b z0\.s, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st1b z0\.s, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st1b_vnum_u32_x1, svuint32_t, uint8_t, + svst1b_vnum_u32 (p0, x0, x1, z0), + svst1b_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u64.c new file mode 100644 index 000000000..e96f4c486 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1b_u64.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1b_u64_base: +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u64_base, svuint64_t, uint8_t, + svst1b_u64 (p0, x0, z0), + svst1b (p0, x0, z0)) + +/* +** st1b_u64_index: +** st1b z0\.d, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st1b_u64_index, svuint64_t, uint8_t, + svst1b_u64 (p0, x0 + x1, z0), + svst1b (p0, x0 + x1, z0)) + +/* +** st1b_u64_1: +** st1b z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_u64_1, svuint64_t, uint8_t, + svst1b_u64 (p0, x0 + svcntd (), z0), + svst1b (p0, x0 + svcntd (), z0)) + +/* +** st1b_u64_7: +** st1b z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_u64_7, svuint64_t, uint8_t, + svst1b_u64 (p0, x0 + svcntd () * 7, z0), + svst1b (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_u64_8: +** incb x0 +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u64_8, svuint64_t, uint8_t, + svst1b_u64 (p0, x0 + svcntd () * 8, z0), + svst1b (p0, x0 + svcntd () * 8, z0)) + +/* +** st1b_u64_m1: +** st1b z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_u64_m1, svuint64_t, uint8_t, + svst1b_u64 (p0, x0 - svcntd (), z0), + svst1b (p0, x0 - svcntd (), z0)) + +/* +** st1b_u64_m8: +** st1b z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_u64_m8, svuint64_t, uint8_t, + svst1b_u64 (p0, x0 - svcntd () * 8, z0), + svst1b (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1b_u64_m9: +** decd x0, all, mul #9 +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_u64_m9, svuint64_t, uint8_t, + svst1b_u64 (p0, x0 - svcntd () * 9, z0), + svst1b (p0, x0 - svcntd () * 9, z0)) + +/* +** st1b_vnum_u64_0: +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u64_0, svuint64_t, uint8_t, + svst1b_vnum_u64 (p0, x0, 0, z0), + svst1b_vnum (p0, x0, 0, z0)) + +/* +** st1b_vnum_u64_1: +** st1b z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u64_1, svuint64_t, uint8_t, + svst1b_vnum_u64 (p0, x0, 1, z0), + svst1b_vnum (p0, x0, 1, z0)) + +/* +** st1b_vnum_u64_7: +** st1b z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u64_7, svuint64_t, uint8_t, + svst1b_vnum_u64 (p0, x0, 7, z0), + svst1b_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_u64_8: +** incb x0 +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u64_8, svuint64_t, uint8_t, + svst1b_vnum_u64 (p0, x0, 8, z0), + svst1b_vnum (p0, x0, 8, z0)) + +/* +** st1b_vnum_u64_m1: +** st1b z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u64_m1, svuint64_t, uint8_t, + svst1b_vnum_u64 (p0, x0, -1, z0), + svst1b_vnum (p0, x0, -1, z0)) + +/* +** st1b_vnum_u64_m8: +** st1b z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1b_vnum_u64_m8, svuint64_t, uint8_t, + svst1b_vnum_u64 (p0, x0, -8, z0), + svst1b_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1b_vnum_u64_m9: +** decd x0, all, mul #9 +** st1b z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1b_vnum_u64_m9, svuint64_t, uint8_t, + svst1b_vnum_u64 (p0, x0, -9, z0), + svst1b_vnum (p0, x0, -9, z0)) + +/* +** st1b_vnum_u64_x1: +** cntd (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st1b z0\.d, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st1b z0\.d, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st1b_vnum_u64_x1, svuint64_t, uint8_t, + svst1b_vnum_u64 (p0, x0, x1, z0), + svst1b_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s32.c new file mode 100644 index 000000000..3466e3293 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1h_s32_base: +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_s32_base, svint32_t, int16_t, + svst1h_s32 (p0, x0, z0), + svst1h (p0, x0, z0)) + +/* +** st1h_s32_index: +** st1h z0\.s, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st1h_s32_index, svint32_t, int16_t, + svst1h_s32 (p0, x0 + x1, z0), + svst1h (p0, x0 + x1, z0)) + +/* +** st1h_s32_1: +** st1h z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1h_s32_1, svint32_t, int16_t, + svst1h_s32 (p0, x0 + svcntw (), z0), + svst1h (p0, x0 + svcntw (), z0)) + +/* +** st1h_s32_7: +** st1h z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1h_s32_7, svint32_t, int16_t, + svst1h_s32 (p0, x0 + svcntw () * 7, z0), + svst1h (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1h_s32_8: +** incb x0, all, mul #4 +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_s32_8, svint32_t, int16_t, + svst1h_s32 (p0, x0 + svcntw () * 8, z0), + svst1h (p0, x0 + svcntw () * 8, z0)) + +/* +** st1h_s32_m1: +** st1h z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1h_s32_m1, svint32_t, int16_t, + svst1h_s32 (p0, x0 - svcntw (), z0), + svst1h (p0, x0 - svcntw (), z0)) + +/* +** st1h_s32_m8: +** st1h z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1h_s32_m8, svint32_t, int16_t, + svst1h_s32 (p0, x0 - svcntw () * 8, z0), + svst1h (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_s32_m9: +** dech x0, all, mul #9 +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_s32_m9, svint32_t, int16_t, + svst1h_s32 (p0, x0 - svcntw () * 9, z0), + svst1h (p0, x0 - svcntw () * 9, z0)) + +/* +** st1h_vnum_s32_0: +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_s32_0, svint32_t, int16_t, + svst1h_vnum_s32 (p0, x0, 0, z0), + svst1h_vnum (p0, x0, 0, z0)) + +/* +** st1h_vnum_s32_1: +** st1h z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_s32_1, svint32_t, int16_t, + svst1h_vnum_s32 (p0, x0, 1, z0), + svst1h_vnum (p0, x0, 1, z0)) + +/* +** st1h_vnum_s32_7: +** st1h z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_s32_7, svint32_t, int16_t, + svst1h_vnum_s32 (p0, x0, 7, z0), + svst1h_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_vnum_s32_8: +** incb x0, all, mul #4 +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_s32_8, svint32_t, int16_t, + svst1h_vnum_s32 (p0, x0, 8, z0), + svst1h_vnum (p0, x0, 8, z0)) + +/* +** st1h_vnum_s32_m1: +** st1h z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_s32_m1, svint32_t, int16_t, + svst1h_vnum_s32 (p0, x0, -1, z0), + svst1h_vnum (p0, x0, -1, z0)) + +/* +** st1h_vnum_s32_m8: +** st1h z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_s32_m8, svint32_t, int16_t, + svst1h_vnum_s32 (p0, x0, -8, z0), + svst1h_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_vnum_s32_m9: +** dech x0, all, mul #9 +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_s32_m9, svint32_t, int16_t, + svst1h_vnum_s32 (p0, x0, -9, z0), + svst1h_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1h_vnum_s32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1h z0\.s, p0, \[\2\] +** ret +*/ +TEST_STORE (st1h_vnum_s32_x1, svint32_t, int16_t, + svst1h_vnum_s32 (p0, x0, x1, z0), + svst1h_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s64.c new file mode 100644 index 000000000..c5df3b0c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1h_s64_base: +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_s64_base, svint64_t, int16_t, + svst1h_s64 (p0, x0, z0), + svst1h (p0, x0, z0)) + +/* +** st1h_s64_index: +** st1h z0\.d, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st1h_s64_index, svint64_t, int16_t, + svst1h_s64 (p0, x0 + x1, z0), + svst1h (p0, x0 + x1, z0)) + +/* +** st1h_s64_1: +** st1h z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1h_s64_1, svint64_t, int16_t, + svst1h_s64 (p0, x0 + svcntd (), z0), + svst1h (p0, x0 + svcntd (), z0)) + +/* +** st1h_s64_7: +** st1h z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1h_s64_7, svint64_t, int16_t, + svst1h_s64 (p0, x0 + svcntd () * 7, z0), + svst1h (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_s64_8: +** incb x0, all, mul #2 +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_s64_8, svint64_t, int16_t, + svst1h_s64 (p0, x0 + svcntd () * 8, z0), + svst1h (p0, x0 + svcntd () * 8, z0)) + +/* +** st1h_s64_m1: +** st1h z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1h_s64_m1, svint64_t, int16_t, + svst1h_s64 (p0, x0 - svcntd (), z0), + svst1h (p0, x0 - svcntd (), z0)) + +/* +** st1h_s64_m8: +** st1h z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1h_s64_m8, svint64_t, int16_t, + svst1h_s64 (p0, x0 - svcntd () * 8, z0), + svst1h (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_s64_m9: +** decw x0, all, mul #9 +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_s64_m9, svint64_t, int16_t, + svst1h_s64 (p0, x0 - svcntd () * 9, z0), + svst1h (p0, x0 - svcntd () * 9, z0)) + +/* +** st1h_vnum_s64_0: +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_s64_0, svint64_t, int16_t, + svst1h_vnum_s64 (p0, x0, 0, z0), + svst1h_vnum (p0, x0, 0, z0)) + +/* +** st1h_vnum_s64_1: +** st1h z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_s64_1, svint64_t, int16_t, + svst1h_vnum_s64 (p0, x0, 1, z0), + svst1h_vnum (p0, x0, 1, z0)) + +/* +** st1h_vnum_s64_7: +** st1h z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_s64_7, svint64_t, int16_t, + svst1h_vnum_s64 (p0, x0, 7, z0), + svst1h_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_vnum_s64_8: +** incb x0, all, mul #2 +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_s64_8, svint64_t, int16_t, + svst1h_vnum_s64 (p0, x0, 8, z0), + svst1h_vnum (p0, x0, 8, z0)) + +/* +** st1h_vnum_s64_m1: +** st1h z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_s64_m1, svint64_t, int16_t, + svst1h_vnum_s64 (p0, x0, -1, z0), + svst1h_vnum (p0, x0, -1, z0)) + +/* +** st1h_vnum_s64_m8: +** st1h z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_s64_m8, svint64_t, int16_t, + svst1h_vnum_s64 (p0, x0, -8, z0), + svst1h_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_vnum_s64_m9: +** decw x0, all, mul #9 +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_s64_m9, svint64_t, int16_t, + svst1h_vnum_s64 (p0, x0, -9, z0), + svst1h_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st1h_vnum_s64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1h z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (st1h_vnum_s64_x1, svint64_t, int16_t, + svst1h_vnum_s64 (p0, x0, x1, z0), + svst1h_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s32.c new file mode 100644 index 000000000..2a23d41f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s32.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1h_scatter_s32: +** st1h z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_s32, svint32_t, svuint32_t, + svst1h_scatter_u32base_s32 (p0, z1, z0), + svst1h_scatter (p0, z1, z0)) + +/* +** st1h_scatter_x0_s32_offset: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_x0_s32_offset, svint32_t, svuint32_t, + svst1h_scatter_u32base_offset_s32 (p0, z1, x0, z0), + svst1h_scatter_offset (p0, z1, x0, z0)) + +/* +** st1h_scatter_m2_s32_offset: +** mov (x[0-9]+), #?-2 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_m2_s32_offset, svint32_t, svuint32_t, + svst1h_scatter_u32base_offset_s32 (p0, z1, -2, z0), + svst1h_scatter_offset (p0, z1, -2, z0)) + +/* +** st1h_scatter_0_s32_offset: +** st1h z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_0_s32_offset, svint32_t, svuint32_t, + svst1h_scatter_u32base_offset_s32 (p0, z1, 0, z0), + svst1h_scatter_offset (p0, z1, 0, z0)) + +/* +** st1h_scatter_5_s32_offset: +** mov (x[0-9]+), #?5 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_5_s32_offset, svint32_t, svuint32_t, + svst1h_scatter_u32base_offset_s32 (p0, z1, 5, z0), + svst1h_scatter_offset (p0, z1, 5, z0)) + +/* +** st1h_scatter_6_s32_offset: +** st1h z0\.s, p0, \[z1\.s, #6\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_6_s32_offset, svint32_t, svuint32_t, + svst1h_scatter_u32base_offset_s32 (p0, z1, 6, z0), + svst1h_scatter_offset (p0, z1, 6, z0)) + +/* +** st1h_scatter_62_s32_offset: +** st1h z0\.s, p0, \[z1\.s, #62\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_62_s32_offset, svint32_t, svuint32_t, + svst1h_scatter_u32base_offset_s32 (p0, z1, 62, z0), + svst1h_scatter_offset (p0, z1, 62, z0)) + +/* +** st1h_scatter_64_s32_offset: +** mov (x[0-9]+), #?64 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_64_s32_offset, svint32_t, svuint32_t, + svst1h_scatter_u32base_offset_s32 (p0, z1, 64, z0), + svst1h_scatter_offset (p0, z1, 64, z0)) + +/* +** st1h_scatter_x0_s32_index: +** lsl (x[0-9]+), x0, #?1 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_x0_s32_index, svint32_t, svuint32_t, + svst1h_scatter_u32base_index_s32 (p0, z1, x0, z0), + svst1h_scatter_index (p0, z1, x0, z0)) + +/* +** st1h_scatter_m1_s32_index: +** mov (x[0-9]+), #?-2 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_m1_s32_index, svint32_t, svuint32_t, + svst1h_scatter_u32base_index_s32 (p0, z1, -1, z0), + svst1h_scatter_index (p0, z1, -1, z0)) + +/* +** st1h_scatter_0_s32_index: +** st1h z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_0_s32_index, svint32_t, svuint32_t, + svst1h_scatter_u32base_index_s32 (p0, z1, 0, z0), + svst1h_scatter_index (p0, z1, 0, z0)) + +/* +** 
st1h_scatter_5_s32_index: +** st1h z0\.s, p0, \[z1\.s, #10\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_5_s32_index, svint32_t, svuint32_t, + svst1h_scatter_u32base_index_s32 (p0, z1, 5, z0), + svst1h_scatter_index (p0, z1, 5, z0)) + +/* +** st1h_scatter_31_s32_index: +** st1h z0\.s, p0, \[z1\.s, #62\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_31_s32_index, svint32_t, svuint32_t, + svst1h_scatter_u32base_index_s32 (p0, z1, 31, z0), + svst1h_scatter_index (p0, z1, 31, z0)) + +/* +** st1h_scatter_32_s32_index: +** mov (x[0-9]+), #?64 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_32_s32_index, svint32_t, svuint32_t, + svst1h_scatter_u32base_index_s32 (p0, z1, 32, z0), + svst1h_scatter_index (p0, z1, 32, z0)) + +/* +** st1h_scatter_x0_s32_s32offset: +** st1h z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s32_s32offset, svint32_t, int16_t, svint32_t, + svst1h_scatter_s32offset_s32 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_s32_s32offset: +** st1h z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_s32_s32offset, svint32_t, int16_t, svint32_t, + svst1h_scatter_s32offset_s32 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_x0_s32_u32offset: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s32_u32offset, svint32_t, int16_t, svuint32_t, + svst1h_scatter_u32offset_s32 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_s32_u32offset: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_s32_u32offset, svint32_t, int16_t, svuint32_t, + svst1h_scatter_u32offset_s32 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_x0_s32_s32index: +** st1h z0\.s, p0, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s32_s32index, svint32_t, int16_t, svint32_t, + svst1h_scatter_s32index_s32 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_s32_s32index: +** st1h z0\.s, p0, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_s32_s32index, svint32_t, int16_t, svint32_t, + svst1h_scatter_s32index_s32 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_x0_s32_u32index: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s32_u32index, svint32_t, int16_t, svuint32_t, + svst1h_scatter_u32index_s32 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_s32_u32index: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_s32_u32index, svint32_t, int16_t, svuint32_t, + svst1h_scatter_u32index_s32 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s64.c new file mode 100644 index 000000000..6a1adb056 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_s64.c @@ -0,0 +1,243 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1h_scatter_s64: +** st1h z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_s64, svint64_t, svuint64_t, + svst1h_scatter_u64base_s64 (p0, z1, z0), + svst1h_scatter (p0, z1, z0)) + +/* +** st1h_scatter_x0_s64_offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_x0_s64_offset, svint64_t, svuint64_t, + svst1h_scatter_u64base_offset_s64 (p0, z1, x0, z0), + svst1h_scatter_offset (p0, z1, x0, z0)) + +/* +** st1h_scatter_m2_s64_offset: +** mov (x[0-9]+), #?-2 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_m2_s64_offset, svint64_t, svuint64_t, + svst1h_scatter_u64base_offset_s64 (p0, z1, -2, z0), + svst1h_scatter_offset (p0, z1, -2, z0)) + +/* +** st1h_scatter_0_s64_offset: +** st1h z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_0_s64_offset, svint64_t, svuint64_t, + svst1h_scatter_u64base_offset_s64 (p0, z1, 0, z0), + svst1h_scatter_offset (p0, z1, 0, z0)) + +/* +** st1h_scatter_5_s64_offset: +** mov (x[0-9]+), #?5 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_5_s64_offset, svint64_t, svuint64_t, + svst1h_scatter_u64base_offset_s64 (p0, z1, 5, z0), + svst1h_scatter_offset (p0, z1, 5, z0)) + +/* +** st1h_scatter_6_s64_offset: +** st1h z0\.d, p0, \[z1\.d, #6\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_6_s64_offset, svint64_t, svuint64_t, + svst1h_scatter_u64base_offset_s64 (p0, z1, 6, z0), + svst1h_scatter_offset (p0, z1, 6, z0)) + +/* +** st1h_scatter_62_s64_offset: +** st1h z0\.d, p0, \[z1\.d, #62\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_62_s64_offset, svint64_t, svuint64_t, + svst1h_scatter_u64base_offset_s64 (p0, z1, 62, z0), + svst1h_scatter_offset (p0, z1, 62, z0)) + +/* +** st1h_scatter_64_s64_offset: +** mov (x[0-9]+), #?64 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_64_s64_offset, svint64_t, svuint64_t, + svst1h_scatter_u64base_offset_s64 (p0, z1, 64, z0), + svst1h_scatter_offset (p0, z1, 64, z0)) + +/* +** st1h_scatter_x0_s64_index: +** lsl (x[0-9]+), x0, #?1 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_x0_s64_index, svint64_t, svuint64_t, + svst1h_scatter_u64base_index_s64 (p0, z1, x0, z0), + svst1h_scatter_index (p0, z1, x0, z0)) + +/* +** st1h_scatter_m1_s64_index: +** mov (x[0-9]+), #?-2 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_m1_s64_index, svint64_t, svuint64_t, + svst1h_scatter_u64base_index_s64 (p0, z1, -1, z0), + svst1h_scatter_index (p0, z1, -1, z0)) + +/* +** st1h_scatter_0_s64_index: +** st1h z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_0_s64_index, svint64_t, svuint64_t, + svst1h_scatter_u64base_index_s64 (p0, z1, 0, z0), + svst1h_scatter_index (p0, z1, 0, z0)) + +/* +** st1h_scatter_5_s64_index: +** st1h z0\.d, p0, \[z1\.d, #10\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_5_s64_index, svint64_t, svuint64_t, + svst1h_scatter_u64base_index_s64 (p0, z1, 5, z0), + svst1h_scatter_index (p0, z1, 5, z0)) + +/* +** st1h_scatter_31_s64_index: +** st1h z0\.d, p0, \[z1\.d, #62\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_31_s64_index, svint64_t, svuint64_t, + svst1h_scatter_u64base_index_s64 (p0, z1, 31, z0), + svst1h_scatter_index (p0, z1, 31, z0)) + +/* +** st1h_scatter_32_s64_index: +** mov (x[0-9]+), #?64 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_32_s64_index, 
svint64_t, svuint64_t, + svst1h_scatter_u64base_index_s64 (p0, z1, 32, z0), + svst1h_scatter_index (p0, z1, 32, z0)) + +/* +** st1h_scatter_x0_s64_s64offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s64_s64offset, svint64_t, int16_t, svint64_t, + svst1h_scatter_s64offset_s64 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_s64_s64offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_s64_s64offset, svint64_t, int16_t, svint64_t, + svst1h_scatter_s64offset_s64 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_ext_s64_s64offset: +** st1h z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_ext_s64_s64offset, svint64_t, int16_t, svint64_t, + svst1h_scatter_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1h_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1h_scatter_x0_s64_u64offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s64_u64offset, svint64_t, int16_t, svuint64_t, + svst1h_scatter_u64offset_s64 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_s64_u64offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_s64_u64offset, svint64_t, int16_t, svuint64_t, + svst1h_scatter_u64offset_s64 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_ext_s64_u64offset: +** st1h z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_ext_s64_u64offset, svint64_t, int16_t, svuint64_t, + svst1h_scatter_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1h_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1h_scatter_x0_s64_s64index: +** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s64_s64index, svint64_t, int16_t, svint64_t, + svst1h_scatter_s64index_s64 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_s64_s64index: +** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_s64_s64index, svint64_t, int16_t, svint64_t, + svst1h_scatter_s64index_s64 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_ext_s64_s64index: +** st1h z0\.d, p0, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_ext_s64_s64index, svint64_t, int16_t, svint64_t, + svst1h_scatter_s64index_s64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1h_scatter_index (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1h_scatter_x0_s64_u64index: +** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_s64_u64index, svint64_t, int16_t, svuint64_t, + svst1h_scatter_u64index_s64 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_s64_u64index: +** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_s64_u64index, svint64_t, int16_t, svuint64_t, + svst1h_scatter_u64index_s64 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_ext_s64_u64index: +** st1h z0\.d, p0, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_ext_s64_u64index, svint64_t, int16_t, svuint64_t, + svst1h_scatter_u64index_s64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1h_scatter_index (p0, x0, svextw_x (p0, z1), z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u32.c new file mode 100644 index 000000000..12197315d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u32.c @@ -0,0 +1,207 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1h_scatter_u32: +** st1h z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_u32, svuint32_t, svuint32_t, + svst1h_scatter_u32base_u32 (p0, z1, z0), + svst1h_scatter (p0, z1, z0)) + +/* +** st1h_scatter_x0_u32_offset: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_x0_u32_offset, svuint32_t, svuint32_t, + svst1h_scatter_u32base_offset_u32 (p0, z1, x0, z0), + svst1h_scatter_offset (p0, z1, x0, z0)) + +/* +** st1h_scatter_m2_u32_offset: +** mov (x[0-9]+), #?-2 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_m2_u32_offset, svuint32_t, svuint32_t, + svst1h_scatter_u32base_offset_u32 (p0, z1, -2, z0), + svst1h_scatter_offset (p0, z1, -2, z0)) + +/* +** st1h_scatter_0_u32_offset: +** st1h z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_0_u32_offset, svuint32_t, svuint32_t, + svst1h_scatter_u32base_offset_u32 (p0, z1, 0, z0), + svst1h_scatter_offset (p0, z1, 0, z0)) + +/* +** st1h_scatter_5_u32_offset: +** mov (x[0-9]+), #?5 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_5_u32_offset, svuint32_t, svuint32_t, + svst1h_scatter_u32base_offset_u32 (p0, z1, 5, z0), + svst1h_scatter_offset (p0, z1, 5, z0)) + +/* +** st1h_scatter_6_u32_offset: +** st1h z0\.s, p0, \[z1\.s, #6\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_6_u32_offset, svuint32_t, svuint32_t, + svst1h_scatter_u32base_offset_u32 (p0, z1, 6, z0), + svst1h_scatter_offset (p0, z1, 6, z0)) + +/* +** st1h_scatter_62_u32_offset: +** st1h z0\.s, p0, \[z1\.s, #62\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_62_u32_offset, svuint32_t, svuint32_t, + svst1h_scatter_u32base_offset_u32 (p0, z1, 62, z0), + svst1h_scatter_offset (p0, z1, 62, z0)) + +/* +** st1h_scatter_64_u32_offset: +** mov (x[0-9]+), #?64 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_64_u32_offset, svuint32_t, svuint32_t, + svst1h_scatter_u32base_offset_u32 (p0, z1, 64, z0), + svst1h_scatter_offset (p0, z1, 64, z0)) + +/* +** st1h_scatter_x0_u32_index: +** lsl (x[0-9]+), x0, #?1 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_x0_u32_index, svuint32_t, svuint32_t, + svst1h_scatter_u32base_index_u32 (p0, z1, x0, z0), + svst1h_scatter_index (p0, z1, x0, z0)) + +/* +** st1h_scatter_m1_u32_index: +** mov (x[0-9]+), #?-2 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_m1_u32_index, svuint32_t, svuint32_t, + svst1h_scatter_u32base_index_u32 (p0, z1, -1, z0), + svst1h_scatter_index (p0, z1, -1, z0)) + +/* +** st1h_scatter_0_u32_index: +** st1h z0\.s, p0, \[z1\.s\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_0_u32_index, svuint32_t, svuint32_t, + svst1h_scatter_u32base_index_u32 (p0, z1, 0, z0), + svst1h_scatter_index (p0, z1, 0, z0)) + +/* +** st1h_scatter_5_u32_index: +** st1h z0\.s, p0, \[z1\.s, #10\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_5_u32_index, svuint32_t, svuint32_t, + svst1h_scatter_u32base_index_u32 (p0, z1, 5, z0), + svst1h_scatter_index (p0, z1, 5, z0)) + +/* +** st1h_scatter_31_u32_index: +** st1h z0\.s, p0, \[z1\.s, #62\] 
+** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_31_u32_index, svuint32_t, svuint32_t, + svst1h_scatter_u32base_index_u32 (p0, z1, 31, z0), + svst1h_scatter_index (p0, z1, 31, z0)) + +/* +** st1h_scatter_32_u32_index: +** mov (x[0-9]+), #?64 +** st1h z0\.s, p0, \[\1, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_32_u32_index, svuint32_t, svuint32_t, + svst1h_scatter_u32base_index_u32 (p0, z1, 32, z0), + svst1h_scatter_index (p0, z1, 32, z0)) + +/* +** st1h_scatter_x0_u32_s32offset: +** st1h z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u32_s32offset, svuint32_t, uint16_t, svint32_t, + svst1h_scatter_s32offset_u32 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_u32_s32offset: +** st1h z0\.s, p0, \[x0, z1\.s, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_u32_s32offset, svuint32_t, uint16_t, svint32_t, + svst1h_scatter_s32offset_u32 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_x0_u32_u32offset: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u32_u32offset, svuint32_t, uint16_t, svuint32_t, + svst1h_scatter_u32offset_u32 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_u32_u32offset: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_u32_u32offset, svuint32_t, uint16_t, svuint32_t, + svst1h_scatter_u32offset_u32 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_x0_u32_s32index: +** st1h z0\.s, p0, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u32_s32index, svuint32_t, uint16_t, svint32_t, + svst1h_scatter_s32index_u32 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_u32_s32index: +** st1h z0\.s, p0, \[x0, z1\.s, sxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_u32_s32index, svuint32_t, uint16_t, svint32_t, + svst1h_scatter_s32index_u32 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_x0_u32_u32index: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u32_u32index, svuint32_t, uint16_t, svuint32_t, + svst1h_scatter_u32index_u32 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_u32_u32index: +** st1h z0\.s, p0, \[x0, z1\.s, uxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_u32_u32index, svuint32_t, uint16_t, svuint32_t, + svst1h_scatter_u32index_u32 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u64.c new file mode 100644 index 000000000..7021ea68f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_scatter_u64.c @@ -0,0 +1,243 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1h_scatter_u64: +** st1h z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_u64, svuint64_t, svuint64_t, + svst1h_scatter_u64base_u64 (p0, z1, z0), + svst1h_scatter (p0, z1, z0)) + +/* +** st1h_scatter_x0_u64_offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_x0_u64_offset, svuint64_t, svuint64_t, + svst1h_scatter_u64base_offset_u64 (p0, z1, x0, z0), + svst1h_scatter_offset (p0, z1, x0, z0)) + +/* +** st1h_scatter_m2_u64_offset: +** mov (x[0-9]+), #?-2 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_m2_u64_offset, svuint64_t, svuint64_t, + svst1h_scatter_u64base_offset_u64 (p0, z1, -2, z0), + svst1h_scatter_offset (p0, z1, -2, z0)) + +/* +** st1h_scatter_0_u64_offset: +** st1h z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_0_u64_offset, svuint64_t, svuint64_t, + svst1h_scatter_u64base_offset_u64 (p0, z1, 0, z0), + svst1h_scatter_offset (p0, z1, 0, z0)) + +/* +** st1h_scatter_5_u64_offset: +** mov (x[0-9]+), #?5 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_5_u64_offset, svuint64_t, svuint64_t, + svst1h_scatter_u64base_offset_u64 (p0, z1, 5, z0), + svst1h_scatter_offset (p0, z1, 5, z0)) + +/* +** st1h_scatter_6_u64_offset: +** st1h z0\.d, p0, \[z1\.d, #6\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_6_u64_offset, svuint64_t, svuint64_t, + svst1h_scatter_u64base_offset_u64 (p0, z1, 6, z0), + svst1h_scatter_offset (p0, z1, 6, z0)) + +/* +** st1h_scatter_62_u64_offset: +** st1h z0\.d, p0, \[z1\.d, #62\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_62_u64_offset, svuint64_t, svuint64_t, + svst1h_scatter_u64base_offset_u64 (p0, z1, 62, z0), + svst1h_scatter_offset (p0, z1, 62, z0)) + +/* +** st1h_scatter_64_u64_offset: +** mov (x[0-9]+), #?64 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_64_u64_offset, svuint64_t, svuint64_t, + svst1h_scatter_u64base_offset_u64 (p0, z1, 64, z0), + svst1h_scatter_offset (p0, z1, 64, z0)) + +/* +** st1h_scatter_x0_u64_index: +** lsl (x[0-9]+), x0, #?1 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_x0_u64_index, svuint64_t, svuint64_t, + svst1h_scatter_u64base_index_u64 (p0, z1, x0, z0), + svst1h_scatter_index (p0, z1, x0, z0)) + +/* +** st1h_scatter_m1_u64_index: +** mov (x[0-9]+), #?-2 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_m1_u64_index, svuint64_t, svuint64_t, + svst1h_scatter_u64base_index_u64 (p0, z1, -1, z0), + svst1h_scatter_index (p0, z1, -1, z0)) + +/* +** st1h_scatter_0_u64_index: +** st1h z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_0_u64_index, svuint64_t, svuint64_t, + svst1h_scatter_u64base_index_u64 (p0, z1, 0, z0), + svst1h_scatter_index (p0, z1, 0, z0)) + +/* +** st1h_scatter_5_u64_index: +** st1h z0\.d, p0, \[z1\.d, #10\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_5_u64_index, svuint64_t, svuint64_t, + svst1h_scatter_u64base_index_u64 (p0, z1, 5, z0), + svst1h_scatter_index (p0, z1, 5, z0)) + +/* +** st1h_scatter_31_u64_index: +** st1h z0\.d, p0, \[z1\.d, #62\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1h_scatter_31_u64_index, svuint64_t, svuint64_t, + svst1h_scatter_u64base_index_u64 (p0, z1, 31, z0), + svst1h_scatter_index (p0, z1, 31, z0)) + +/* +** st1h_scatter_32_u64_index: +** mov (x[0-9]+), #?64 +** st1h z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS 
(st1h_scatter_32_u64_index, svuint64_t, svuint64_t, + svst1h_scatter_u64base_index_u64 (p0, z1, 32, z0), + svst1h_scatter_index (p0, z1, 32, z0)) + +/* +** st1h_scatter_x0_u64_s64offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u64_s64offset, svuint64_t, uint16_t, svint64_t, + svst1h_scatter_s64offset_u64 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_u64_s64offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_u64_s64offset, svuint64_t, uint16_t, svint64_t, + svst1h_scatter_s64offset_u64 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_ext_u64_s64offset: +** st1h z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_ext_u64_s64offset, svuint64_t, uint16_t, svint64_t, + svst1h_scatter_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1h_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1h_scatter_x0_u64_u64offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + svst1h_scatter_u64offset_u64 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_u64_u64offset: +** st1h z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + svst1h_scatter_u64offset_u64 (p0, x0, z1, z0), + svst1h_scatter_offset (p0, x0, z1, z0)) + +/* +** st1h_scatter_ext_u64_u64offset: +** st1h z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_ext_u64_u64offset, svuint64_t, uint16_t, svuint64_t, + svst1h_scatter_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1h_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1h_scatter_x0_u64_s64index: +** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u64_s64index, svuint64_t, uint16_t, svint64_t, + svst1h_scatter_s64index_u64 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_u64_s64index: +** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_u64_s64index, svuint64_t, uint16_t, svint64_t, + svst1h_scatter_s64index_u64 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_ext_u64_s64index: +** st1h z0\.d, p0, \[x0, z1\.d, sxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_ext_u64_s64index, svuint64_t, uint16_t, svint64_t, + svst1h_scatter_s64index_u64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1h_scatter_index (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1h_scatter_x0_u64_u64index: +** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_x0_u64_u64index, svuint64_t, uint16_t, svuint64_t, + svst1h_scatter_u64index_u64 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_u64_u64index: +** st1h z0\.d, p0, \[x0, z1\.d, lsl 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_u64_u64index, svuint64_t, uint16_t, svuint64_t, + svst1h_scatter_u64index_u64 (p0, x0, z1, z0), + svst1h_scatter_index (p0, x0, z1, z0)) + +/* +** st1h_scatter_ext_u64_u64index: +** st1h z0\.d, p0, \[x0, z1\.d, uxtw 1\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1h_scatter_ext_u64_u64index, svuint64_t, uint16_t, svuint64_t, + svst1h_scatter_u64index_u64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1h_scatter_index (p0, x0, svextw_x (p0, z1), z0)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u32.c new file mode 100644 index 000000000..49111043b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1h_u32_base: +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_u32_base, svuint32_t, uint16_t, + svst1h_u32 (p0, x0, z0), + svst1h (p0, x0, z0)) + +/* +** st1h_u32_index: +** st1h z0\.s, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st1h_u32_index, svuint32_t, uint16_t, + svst1h_u32 (p0, x0 + x1, z0), + svst1h (p0, x0 + x1, z0)) + +/* +** st1h_u32_1: +** st1h z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1h_u32_1, svuint32_t, uint16_t, + svst1h_u32 (p0, x0 + svcntw (), z0), + svst1h (p0, x0 + svcntw (), z0)) + +/* +** st1h_u32_7: +** st1h z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1h_u32_7, svuint32_t, uint16_t, + svst1h_u32 (p0, x0 + svcntw () * 7, z0), + svst1h (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_u32_8: +** incb x0, all, mul #4 +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_u32_8, svuint32_t, uint16_t, + svst1h_u32 (p0, x0 + svcntw () * 8, z0), + svst1h (p0, x0 + svcntw () * 8, z0)) + +/* +** st1h_u32_m1: +** st1h z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1h_u32_m1, svuint32_t, uint16_t, + svst1h_u32 (p0, x0 - svcntw (), z0), + svst1h (p0, x0 - svcntw (), z0)) + +/* +** st1h_u32_m8: +** st1h z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1h_u32_m8, svuint32_t, uint16_t, + svst1h_u32 (p0, x0 - svcntw () * 8, z0), + svst1h (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_u32_m9: +** dech x0, all, mul #9 +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_u32_m9, svuint32_t, uint16_t, + svst1h_u32 (p0, x0 - svcntw () * 9, z0), + svst1h (p0, x0 - svcntw () * 9, z0)) + +/* +** st1h_vnum_u32_0: +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_u32_0, svuint32_t, uint16_t, + svst1h_vnum_u32 (p0, x0, 0, z0), + svst1h_vnum (p0, x0, 0, z0)) + +/* +** st1h_vnum_u32_1: +** st1h z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_u32_1, svuint32_t, uint16_t, + svst1h_vnum_u32 (p0, x0, 1, z0), + svst1h_vnum (p0, x0, 1, z0)) + +/* +** st1h_vnum_u32_7: +** st1h z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_u32_7, svuint32_t, uint16_t, + svst1h_vnum_u32 (p0, x0, 7, z0), + svst1h_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_vnum_u32_8: +** incb x0, all, mul #4 +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_u32_8, svuint32_t, uint16_t, + svst1h_vnum_u32 (p0, x0, 8, z0), + svst1h_vnum (p0, x0, 8, z0)) + +/* +** st1h_vnum_u32_m1: +** st1h z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_u32_m1, svuint32_t, uint16_t, + svst1h_vnum_u32 (p0, x0, -1, z0), + svst1h_vnum (p0, x0, -1, z0)) + +/* +** st1h_vnum_u32_m8: +** st1h z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_u32_m8, svuint32_t, uint16_t, + svst1h_vnum_u32 (p0, x0, -8, z0), + svst1h_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1h_vnum_u32_m9: +** dech x0, all, mul #9 +** st1h z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_u32_m9, svuint32_t, uint16_t, + svst1h_vnum_u32 (p0, x0, -9, z0), + svst1h_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1h_vnum_u32_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1h z0\.s, p0, \[\2\] +** ret +*/ +TEST_STORE (st1h_vnum_u32_x1, svuint32_t, uint16_t, + svst1h_vnum_u32 (p0, x0, x1, z0), + svst1h_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u64.c new file mode 100644 index 000000000..448cadb49 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1h_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1h_u64_base: +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_u64_base, svuint64_t, uint16_t, + svst1h_u64 (p0, x0, z0), + svst1h (p0, x0, z0)) + +/* +** st1h_u64_index: +** st1h z0\.d, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st1h_u64_index, svuint64_t, uint16_t, + svst1h_u64 (p0, x0 + x1, z0), + svst1h (p0, x0 + x1, z0)) + +/* +** st1h_u64_1: +** st1h z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1h_u64_1, svuint64_t, uint16_t, + svst1h_u64 (p0, x0 + svcntd (), z0), + svst1h (p0, x0 + svcntd (), z0)) + +/* +** st1h_u64_7: +** st1h z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1h_u64_7, svuint64_t, uint16_t, + svst1h_u64 (p0, x0 + svcntd () * 7, z0), + svst1h (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_u64_8: +** incb x0, all, mul #2 +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_u64_8, svuint64_t, uint16_t, + svst1h_u64 (p0, x0 + svcntd () * 8, z0), + svst1h (p0, x0 + svcntd () * 8, z0)) + +/* +** st1h_u64_m1: +** st1h z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1h_u64_m1, svuint64_t, uint16_t, + svst1h_u64 (p0, x0 - svcntd (), z0), + svst1h (p0, x0 - svcntd (), z0)) + +/* +** st1h_u64_m8: +** st1h z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1h_u64_m8, svuint64_t, uint16_t, + svst1h_u64 (p0, x0 - svcntd () * 8, z0), + svst1h (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_u64_m9: +** decw x0, all, mul #9 +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_u64_m9, svuint64_t, uint16_t, + svst1h_u64 (p0, x0 - svcntd () * 9, z0), + svst1h (p0, x0 - svcntd () * 9, z0)) + +/* +** st1h_vnum_u64_0: +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_u64_0, svuint64_t, uint16_t, + svst1h_vnum_u64 (p0, x0, 0, z0), + svst1h_vnum (p0, x0, 0, z0)) + +/* +** st1h_vnum_u64_1: +** st1h z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_u64_1, svuint64_t, uint16_t, + svst1h_vnum_u64 (p0, x0, 1, z0), + svst1h_vnum (p0, x0, 1, z0)) + +/* +** st1h_vnum_u64_7: +** st1h z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_u64_7, svuint64_t, uint16_t, + svst1h_vnum_u64 (p0, x0, 7, z0), + svst1h_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1h_vnum_u64_8: +** incb x0, all, mul #2 +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_u64_8, svuint64_t, uint16_t, + svst1h_vnum_u64 (p0, x0, 8, z0), + svst1h_vnum (p0, x0, 8, z0)) + +/* +** st1h_vnum_u64_m1: +** st1h z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_u64_m1, svuint64_t, uint16_t, + svst1h_vnum_u64 (p0, x0, -1, z0), + svst1h_vnum (p0, x0, -1, z0)) + +/* +** st1h_vnum_u64_m8: +** st1h z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1h_vnum_u64_m8, svuint64_t, uint16_t, + svst1h_vnum_u64 (p0, x0, -8, z0), + svst1h_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1h_vnum_u64_m9: +** decw x0, all, mul #9 +** st1h z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1h_vnum_u64_m9, svuint64_t, uint16_t, + svst1h_vnum_u64 (p0, x0, -9, z0), + svst1h_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1h_vnum_u64_x1: +** cntw (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1h z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (st1h_vnum_u64_x1, svuint64_t, uint16_t, + svst1h_vnum_u64 (p0, x0, x1, z0), + svst1h_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_s64.c new file mode 100644 index 000000000..0893ce926 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1w_s64_base: +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_s64_base, svint64_t, int32_t, + svst1w_s64 (p0, x0, z0), + svst1w (p0, x0, z0)) + +/* +** st1w_s64_index: +** st1w z0\.d, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st1w_s64_index, svint64_t, int32_t, + svst1w_s64 (p0, x0 + x1, z0), + svst1w (p0, x0 + x1, z0)) + +/* +** st1w_s64_1: +** st1w z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1w_s64_1, svint64_t, int32_t, + svst1w_s64 (p0, x0 + svcntd (), z0), + svst1w (p0, x0 + svcntd (), z0)) + +/* +** st1w_s64_7: +** st1w z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1w_s64_7, svint64_t, int32_t, + svst1w_s64 (p0, x0 + svcntd () * 7, z0), + svst1w (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1w_s64_8: +** incb x0, all, mul #4 +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_s64_8, svint64_t, int32_t, + svst1w_s64 (p0, x0 + svcntd () * 8, z0), + svst1w (p0, x0 + svcntd () * 8, z0)) + +/* +** st1w_s64_m1: +** st1w z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1w_s64_m1, svint64_t, int32_t, + svst1w_s64 (p0, x0 - svcntd (), z0), + svst1w (p0, x0 - svcntd (), z0)) + +/* +** st1w_s64_m8: +** st1w z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1w_s64_m8, svint64_t, int32_t, + svst1w_s64 (p0, x0 - svcntd () * 8, z0), + svst1w (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1w_s64_m9: +** dech x0, all, mul #9 +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_s64_m9, svint64_t, int32_t, + svst1w_s64 (p0, x0 - svcntd () * 9, z0), + svst1w (p0, x0 - svcntd () * 9, z0)) + +/* +** st1w_vnum_s64_0: +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_vnum_s64_0, svint64_t, int32_t, + svst1w_vnum_s64 (p0, x0, 0, z0), + svst1w_vnum (p0, x0, 0, z0)) + +/* +** st1w_vnum_s64_1: +** st1w z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1w_vnum_s64_1, svint64_t, int32_t, + svst1w_vnum_s64 (p0, x0, 1, z0), + svst1w_vnum (p0, x0, 1, z0)) + +/* +** st1w_vnum_s64_7: +** st1w z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1w_vnum_s64_7, svint64_t, int32_t, + svst1w_vnum_s64 (p0, x0, 7, z0), + svst1w_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1w_vnum_s64_8: +** incb x0, all, mul #4 +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_vnum_s64_8, svint64_t, int32_t, + svst1w_vnum_s64 (p0, x0, 8, z0), + svst1w_vnum (p0, x0, 8, z0)) + +/* +** st1w_vnum_s64_m1: +** st1w z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1w_vnum_s64_m1, svint64_t, int32_t, + svst1w_vnum_s64 (p0, x0, -1, z0), + svst1w_vnum (p0, x0, -1, z0)) + +/* +** st1w_vnum_s64_m8: +** st1w z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1w_vnum_s64_m8, svint64_t, int32_t, + svst1w_vnum_s64 (p0, x0, -8, z0), + svst1w_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1w_vnum_s64_m9: +** dech x0, all, mul #9 +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_vnum_s64_m9, svint64_t, int32_t, + svst1w_vnum_s64 (p0, x0, -9, z0), + svst1w_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1w_vnum_s64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1w z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (st1w_vnum_s64_x1, svint64_t, int32_t, + svst1w_vnum_s64 (p0, x0, x1, z0), + svst1w_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_s64.c new file mode 100644 index 000000000..2363f592b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_s64.c @@ -0,0 +1,263 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1w_scatter_s64: +** st1w z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_s64, svint64_t, svuint64_t, + svst1w_scatter_u64base_s64 (p0, z1, z0), + svst1w_scatter (p0, z1, z0)) + +/* +** st1w_scatter_x0_s64_offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_x0_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, x0, z0), + svst1w_scatter_offset (p0, z1, x0, z0)) + +/* +** st1w_scatter_m4_s64_offset: +** mov (x[0-9]+), #?-4 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_m4_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, -4, z0), + svst1w_scatter_offset (p0, z1, -4, z0)) + +/* +** st1w_scatter_0_s64_offset: +** st1w z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_0_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, 0, z0), + svst1w_scatter_offset (p0, z1, 0, z0)) + +/* +** st1w_scatter_5_s64_offset: +** mov (x[0-9]+), #?5 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_5_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, 5, z0), + svst1w_scatter_offset (p0, z1, 5, z0)) + +/* +** st1w_scatter_6_s64_offset: +** mov (x[0-9]+), #?6 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_6_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, 6, z0), + svst1w_scatter_offset (p0, z1, 6, z0)) + +/* +** st1w_scatter_7_s64_offset: +** mov (x[0-9]+), #?7 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_7_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, 7, z0), + svst1w_scatter_offset (p0, z1, 7, z0)) + +/* +** st1w_scatter_8_s64_offset: +** st1w z0\.d, p0, \[z1\.d, #8\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_8_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, 8, z0), + svst1w_scatter_offset (p0, z1, 8, z0)) + +/* +** st1w_scatter_124_s64_offset: +** st1w z0\.d, p0, \[z1\.d, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_124_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, 124, z0), + svst1w_scatter_offset (p0, z1, 124, z0)) + +/* +** st1w_scatter_128_s64_offset: +** mov (x[0-9]+), #?128 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_128_s64_offset, svint64_t, svuint64_t, + svst1w_scatter_u64base_offset_s64 (p0, z1, 128, z0), + svst1w_scatter_offset (p0, z1, 128, z0)) + +/* +** st1w_scatter_x0_s64_index: +** lsl (x[0-9]+), x0, #?2 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_x0_s64_index, svint64_t, svuint64_t, + svst1w_scatter_u64base_index_s64 (p0, z1, x0, z0), + svst1w_scatter_index (p0, z1, x0, z0)) + +/* +** st1w_scatter_m1_s64_index: +** mov (x[0-9]+), #?-4 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_m1_s64_index, svint64_t, svuint64_t, + svst1w_scatter_u64base_index_s64 (p0, z1, -1, z0), + svst1w_scatter_index (p0, z1, -1, z0)) + +/* +** st1w_scatter_0_s64_index: +** st1w z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_0_s64_index, svint64_t, svuint64_t, + svst1w_scatter_u64base_index_s64 (p0, z1, 0, z0), + svst1w_scatter_index (p0, z1, 0, z0)) + +/* +** st1w_scatter_5_s64_index: +** st1w z0\.d, p0, \[z1\.d, #20\] +** ret +*/ 
+TEST_STORE_SCATTER_ZS (st1w_scatter_5_s64_index, svint64_t, svuint64_t, + svst1w_scatter_u64base_index_s64 (p0, z1, 5, z0), + svst1w_scatter_index (p0, z1, 5, z0)) + +/* +** st1w_scatter_31_s64_index: +** st1w z0\.d, p0, \[z1\.d, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_31_s64_index, svint64_t, svuint64_t, + svst1w_scatter_u64base_index_s64 (p0, z1, 31, z0), + svst1w_scatter_index (p0, z1, 31, z0)) + +/* +** st1w_scatter_32_s64_index: +** mov (x[0-9]+), #?128 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_32_s64_index, svint64_t, svuint64_t, + svst1w_scatter_u64base_index_s64 (p0, z1, 32, z0), + svst1w_scatter_index (p0, z1, 32, z0)) + +/* +** st1w_scatter_x0_s64_s64offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_x0_s64_s64offset, svint64_t, int32_t, svint64_t, + svst1w_scatter_s64offset_s64 (p0, x0, z1, z0), + svst1w_scatter_offset (p0, x0, z1, z0)) + +/* +** st1w_scatter_s64_s64offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_s64_s64offset, svint64_t, int32_t, svint64_t, + svst1w_scatter_s64offset_s64 (p0, x0, z1, z0), + svst1w_scatter_offset (p0, x0, z1, z0)) + +/* +** st1w_scatter_ext_s64_s64offset: +** st1w z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_ext_s64_s64offset, svint64_t, int32_t, svint64_t, + svst1w_scatter_s64offset_s64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1w_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1w_scatter_x0_s64_u64offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_x0_s64_u64offset, svint64_t, int32_t, svuint64_t, + svst1w_scatter_u64offset_s64 (p0, x0, z1, z0), + svst1w_scatter_offset (p0, x0, z1, z0)) + +/* +** st1w_scatter_s64_u64offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_s64_u64offset, svint64_t, int32_t, svuint64_t, + svst1w_scatter_u64offset_s64 (p0, x0, z1, z0), + svst1w_scatter_offset (p0, x0, z1, z0)) + +/* +** st1w_scatter_ext_s64_u64offset: +** st1w z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_ext_s64_u64offset, svint64_t, int32_t, svuint64_t, + svst1w_scatter_u64offset_s64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1w_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1w_scatter_x0_s64_s64index: +** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_x0_s64_s64index, svint64_t, int32_t, svint64_t, + svst1w_scatter_s64index_s64 (p0, x0, z1, z0), + svst1w_scatter_index (p0, x0, z1, z0)) + +/* +** st1w_scatter_s64_s64index: +** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_s64_s64index, svint64_t, int32_t, svint64_t, + svst1w_scatter_s64index_s64 (p0, x0, z1, z0), + svst1w_scatter_index (p0, x0, z1, z0)) + +/* +** st1w_scatter_ext_s64_s64index: +** st1w z0\.d, p0, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_ext_s64_s64index, svint64_t, int32_t, svint64_t, + svst1w_scatter_s64index_s64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1w_scatter_index (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1w_scatter_x0_s64_u64index: +** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_x0_s64_u64index, svint64_t, int32_t, svuint64_t, + svst1w_scatter_u64index_s64 (p0, x0, z1, z0), + svst1w_scatter_index (p0, x0, z1, z0)) + +/* +** st1w_scatter_s64_u64index: +** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ 
+TEST_STORE_SCATTER_SZ (st1w_scatter_s64_u64index, svint64_t, int32_t, svuint64_t, + svst1w_scatter_u64index_s64 (p0, x0, z1, z0), + svst1w_scatter_index (p0, x0, z1, z0)) + +/* +** st1w_scatter_ext_s64_u64index: +** st1w z0\.d, p0, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_ext_s64_u64index, svint64_t, int32_t, svuint64_t, + svst1w_scatter_u64index_s64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1w_scatter_index (p0, x0, svextw_x (p0, z1), z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_u64.c new file mode 100644 index 000000000..767c009b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_scatter_u64.c @@ -0,0 +1,263 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1w_scatter_u64: +** st1w z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_u64, svuint64_t, svuint64_t, + svst1w_scatter_u64base_u64 (p0, z1, z0), + svst1w_scatter (p0, z1, z0)) + +/* +** st1w_scatter_x0_u64_offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_x0_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, x0, z0), + svst1w_scatter_offset (p0, z1, x0, z0)) + +/* +** st1w_scatter_m4_u64_offset: +** mov (x[0-9]+), #?-4 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_m4_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, -4, z0), + svst1w_scatter_offset (p0, z1, -4, z0)) + +/* +** st1w_scatter_0_u64_offset: +** st1w z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_0_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, 0, z0), + svst1w_scatter_offset (p0, z1, 0, z0)) + +/* +** st1w_scatter_5_u64_offset: +** mov (x[0-9]+), #?5 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_5_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, 5, z0), + svst1w_scatter_offset (p0, z1, 5, z0)) + +/* +** st1w_scatter_6_u64_offset: +** mov (x[0-9]+), #?6 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_6_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, 6, z0), + svst1w_scatter_offset (p0, z1, 6, z0)) + +/* +** st1w_scatter_7_u64_offset: +** mov (x[0-9]+), #?7 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_7_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, 7, z0), + svst1w_scatter_offset (p0, z1, 7, z0)) + +/* +** st1w_scatter_8_u64_offset: +** st1w z0\.d, p0, \[z1\.d, #8\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_8_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, 8, z0), + svst1w_scatter_offset (p0, z1, 8, z0)) + +/* +** st1w_scatter_124_u64_offset: +** st1w z0\.d, p0, \[z1\.d, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_124_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, 124, z0), + svst1w_scatter_offset (p0, z1, 124, z0)) + +/* +** st1w_scatter_128_u64_offset: +** mov (x[0-9]+), #?128 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_128_u64_offset, svuint64_t, svuint64_t, + svst1w_scatter_u64base_offset_u64 (p0, z1, 128, z0), + svst1w_scatter_offset (p0, z1, 128, z0)) + +/* +** st1w_scatter_x0_u64_index: +** 
lsl (x[0-9]+), x0, #?2 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_x0_u64_index, svuint64_t, svuint64_t, + svst1w_scatter_u64base_index_u64 (p0, z1, x0, z0), + svst1w_scatter_index (p0, z1, x0, z0)) + +/* +** st1w_scatter_m1_u64_index: +** mov (x[0-9]+), #?-4 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_m1_u64_index, svuint64_t, svuint64_t, + svst1w_scatter_u64base_index_u64 (p0, z1, -1, z0), + svst1w_scatter_index (p0, z1, -1, z0)) + +/* +** st1w_scatter_0_u64_index: +** st1w z0\.d, p0, \[z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_0_u64_index, svuint64_t, svuint64_t, + svst1w_scatter_u64base_index_u64 (p0, z1, 0, z0), + svst1w_scatter_index (p0, z1, 0, z0)) + +/* +** st1w_scatter_5_u64_index: +** st1w z0\.d, p0, \[z1\.d, #20\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_5_u64_index, svuint64_t, svuint64_t, + svst1w_scatter_u64base_index_u64 (p0, z1, 5, z0), + svst1w_scatter_index (p0, z1, 5, z0)) + +/* +** st1w_scatter_31_u64_index: +** st1w z0\.d, p0, \[z1\.d, #124\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_31_u64_index, svuint64_t, svuint64_t, + svst1w_scatter_u64base_index_u64 (p0, z1, 31, z0), + svst1w_scatter_index (p0, z1, 31, z0)) + +/* +** st1w_scatter_32_u64_index: +** mov (x[0-9]+), #?128 +** st1w z0\.d, p0, \[\1, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_ZS (st1w_scatter_32_u64_index, svuint64_t, svuint64_t, + svst1w_scatter_u64base_index_u64 (p0, z1, 32, z0), + svst1w_scatter_index (p0, z1, 32, z0)) + +/* +** st1w_scatter_x0_u64_s64offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_x0_u64_s64offset, svuint64_t, uint32_t, svint64_t, + svst1w_scatter_s64offset_u64 (p0, x0, z1, z0), + svst1w_scatter_offset (p0, x0, z1, z0)) + +/* +** st1w_scatter_u64_s64offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_u64_s64offset, svuint64_t, uint32_t, svint64_t, + svst1w_scatter_s64offset_u64 (p0, x0, z1, z0), + svst1w_scatter_offset (p0, x0, z1, z0)) + +/* +** st1w_scatter_ext_u64_s64offset: +** st1w z0\.d, p0, \[x0, z1\.d, sxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_ext_u64_s64offset, svuint64_t, uint32_t, svint64_t, + svst1w_scatter_s64offset_u64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1w_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1w_scatter_x0_u64_u64offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_x0_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + svst1w_scatter_u64offset_u64 (p0, x0, z1, z0), + svst1w_scatter_offset (p0, x0, z1, z0)) + +/* +** st1w_scatter_u64_u64offset: +** st1w z0\.d, p0, \[x0, z1\.d\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + svst1w_scatter_u64offset_u64 (p0, x0, z1, z0), + svst1w_scatter_offset (p0, x0, z1, z0)) + +/* +** st1w_scatter_ext_u64_u64offset: +** st1w z0\.d, p0, \[x0, z1\.d, uxtw\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_ext_u64_u64offset, svuint64_t, uint32_t, svuint64_t, + svst1w_scatter_u64offset_u64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1w_scatter_offset (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1w_scatter_x0_u64_s64index: +** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_x0_u64_s64index, svuint64_t, uint32_t, svint64_t, + svst1w_scatter_s64index_u64 (p0, x0, z1, z0), + svst1w_scatter_index (p0, x0, z1, z0)) + +/* +** st1w_scatter_u64_s64index: +** st1w z0\.d, p0, \[x0, z1\.d, lsl 
2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_u64_s64index, svuint64_t, uint32_t, svint64_t, + svst1w_scatter_s64index_u64 (p0, x0, z1, z0), + svst1w_scatter_index (p0, x0, z1, z0)) + +/* +** st1w_scatter_ext_u64_s64index: +** st1w z0\.d, p0, \[x0, z1\.d, sxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_ext_u64_s64index, svuint64_t, uint32_t, svint64_t, + svst1w_scatter_s64index_u64 (p0, x0, svextw_s64_x (p0, z1), z0), + svst1w_scatter_index (p0, x0, svextw_x (p0, z1), z0)) + +/* +** st1w_scatter_x0_u64_u64index: +** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_x0_u64_u64index, svuint64_t, uint32_t, svuint64_t, + svst1w_scatter_u64index_u64 (p0, x0, z1, z0), + svst1w_scatter_index (p0, x0, z1, z0)) + +/* +** st1w_scatter_u64_u64index: +** st1w z0\.d, p0, \[x0, z1\.d, lsl 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_u64_u64index, svuint64_t, uint32_t, svuint64_t, + svst1w_scatter_u64index_u64 (p0, x0, z1, z0), + svst1w_scatter_index (p0, x0, z1, z0)) + +/* +** st1w_scatter_ext_u64_u64index: +** st1w z0\.d, p0, \[x0, z1\.d, uxtw 2\] +** ret +*/ +TEST_STORE_SCATTER_SZ (st1w_scatter_ext_u64_u64index, svuint64_t, uint32_t, svuint64_t, + svst1w_scatter_u64index_u64 (p0, x0, svextw_u64_x (p0, z1), z0), + svst1w_scatter_index (p0, x0, svextw_x (p0, z1), z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_u64.c new file mode 100644 index 000000000..882abebbb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st1w_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st1w_u64_base: +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_u64_base, svuint64_t, uint32_t, + svst1w_u64 (p0, x0, z0), + svst1w (p0, x0, z0)) + +/* +** st1w_u64_index: +** st1w z0\.d, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st1w_u64_index, svuint64_t, uint32_t, + svst1w_u64 (p0, x0 + x1, z0), + svst1w (p0, x0 + x1, z0)) + +/* +** st1w_u64_1: +** st1w z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1w_u64_1, svuint64_t, uint32_t, + svst1w_u64 (p0, x0 + svcntd (), z0), + svst1w (p0, x0 + svcntd (), z0)) + +/* +** st1w_u64_7: +** st1w z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1w_u64_7, svuint64_t, uint32_t, + svst1w_u64 (p0, x0 + svcntd () * 7, z0), + svst1w (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1w_u64_8: +** incb x0, all, mul #4 +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_u64_8, svuint64_t, uint32_t, + svst1w_u64 (p0, x0 + svcntd () * 8, z0), + svst1w (p0, x0 + svcntd () * 8, z0)) + +/* +** st1w_u64_m1: +** st1w z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1w_u64_m1, svuint64_t, uint32_t, + svst1w_u64 (p0, x0 - svcntd (), z0), + svst1w (p0, x0 - svcntd (), z0)) + +/* +** st1w_u64_m8: +** st1w z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1w_u64_m8, svuint64_t, uint32_t, + svst1w_u64 (p0, x0 - svcntd () * 8, z0), + svst1w (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st1w_u64_m9: +** dech x0, all, mul #9 +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_u64_m9, svuint64_t, uint32_t, + svst1w_u64 (p0, x0 - svcntd () * 9, z0), + svst1w (p0, x0 - svcntd () * 9, z0)) + +/* +** st1w_vnum_u64_0: +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_vnum_u64_0, svuint64_t, uint32_t, + svst1w_vnum_u64 (p0, x0, 0, z0), + svst1w_vnum (p0, x0, 0, z0)) + +/* +** st1w_vnum_u64_1: +** st1w z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (st1w_vnum_u64_1, svuint64_t, uint32_t, + svst1w_vnum_u64 (p0, x0, 1, z0), + svst1w_vnum (p0, x0, 1, z0)) + +/* +** st1w_vnum_u64_7: +** st1w z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (st1w_vnum_u64_7, svuint64_t, uint32_t, + svst1w_vnum_u64 (p0, x0, 7, z0), + svst1w_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1w_vnum_u64_8: +** incb x0, all, mul #4 +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_vnum_u64_8, svuint64_t, uint32_t, + svst1w_vnum_u64 (p0, x0, 8, z0), + svst1w_vnum (p0, x0, 8, z0)) + +/* +** st1w_vnum_u64_m1: +** st1w z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (st1w_vnum_u64_m1, svuint64_t, uint32_t, + svst1w_vnum_u64 (p0, x0, -1, z0), + svst1w_vnum (p0, x0, -1, z0)) + +/* +** st1w_vnum_u64_m8: +** st1w z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (st1w_vnum_u64_m8, svuint64_t, uint32_t, + svst1w_vnum_u64 (p0, x0, -8, z0), + svst1w_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st1w_vnum_u64_m9: +** dech x0, all, mul #9 +** st1w z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (st1w_vnum_u64_m9, svuint64_t, uint32_t, + svst1w_vnum_u64 (p0, x0, -9, z0), + svst1w_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st1w_vnum_u64_x1: +** cnth (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st1w z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (st1w_vnum_u64_x1, svuint64_t, uint32_t, + svst1w_vnum_u64 (p0, x0, x1, z0), + svst1w_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_bf16.c new file mode 100644 index 000000000..a4a57af08 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_bf16.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_bf16_base: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_bf16_base, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_bf16_index: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st2_bf16_index, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_bf16_1: +** incb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_bf16_1, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 + svcnth (), z0), + svst2 (p0, x0 + svcnth (), z0)) + +/* +** st2_bf16_2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_bf16_2, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 + svcnth () * 2, z0), + svst2 (p0, x0 + svcnth () * 2, z0)) + +/* +** st2_bf16_14: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_bf16_14, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 + svcnth () * 14, z0), + svst2 (p0, x0 + svcnth () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_bf16_16: +** incb x0, all, mul #16 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_bf16_16, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 + svcnth () * 16, z0), + svst2 (p0, x0 + svcnth () * 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_bf16_m1: +** decb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_bf16_m1, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 - svcnth (), z0), + svst2 (p0, x0 - svcnth (), z0)) + +/* +** st2_bf16_m2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_bf16_m2, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 - svcnth () * 2, z0), + svst2 (p0, x0 - svcnth () * 2, z0)) + +/* +** st2_bf16_m16: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_bf16_m16, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 - svcnth () * 16, z0), + svst2 (p0, x0 - svcnth () * 16, z0)) + +/* +** st2_bf16_m18: +** addvl (x[0-9]+), x0, #-18 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_bf16_m18, svbfloat16x2_t, bfloat16_t, + svst2_bf16 (p0, x0 - svcnth () * 18, z0), + svst2 (p0, x0 - svcnth () * 18, z0)) + +/* +** st2_vnum_bf16_0: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_0, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_bf16_1: +** incb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_1, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_bf16_2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_2, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_bf16_14: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_14, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_bf16_16: +** incb x0, all, mul #16 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_16, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_bf16_m1: +** decb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_m1, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_bf16_m2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_m2, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_bf16_m16: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_m16, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_bf16_m18: +** addvl (x[0-9]+), x0, #-18 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_m18, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st2_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_bf16_x1, svbfloat16x2_t, bfloat16_t, + svst2_vnum_bf16 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f16.c new file mode 100644 index 000000000..014203be6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f16.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_f16_base: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f16_base, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_f16_index: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st2_f16_index, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_f16_1: +** incb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f16_1, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 + svcnth (), z0), + svst2 (p0, x0 + svcnth (), z0)) + +/* +** st2_f16_2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_f16_2, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 + svcnth () * 2, z0), + svst2 (p0, x0 + svcnth () * 2, z0)) + +/* +** st2_f16_14: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_f16_14, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 + svcnth () * 14, z0), + svst2 (p0, x0 + svcnth () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_f16_16: +** incb x0, all, mul #16 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f16_16, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 + svcnth () * 16, z0), + svst2 (p0, x0 + svcnth () * 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_f16_m1: +** decb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f16_m1, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 - svcnth (), z0), + svst2 (p0, x0 - svcnth (), z0)) + +/* +** st2_f16_m2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_f16_m2, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 - svcnth () * 2, z0), + svst2 (p0, x0 - svcnth () * 2, z0)) + +/* +** st2_f16_m16: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_f16_m16, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 - svcnth () * 16, z0), + svst2 (p0, x0 - svcnth () * 16, z0)) + +/* +** st2_f16_m18: +** addvl (x[0-9]+), x0, #-18 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_f16_m18, svfloat16x2_t, float16_t, + svst2_f16 (p0, x0 - svcnth () * 18, z0), + svst2 (p0, x0 - svcnth () * 18, z0)) + +/* +** st2_vnum_f16_0: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f16_0, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_f16_1: +** incb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f16_1, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_f16_2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f16_2, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_f16_14: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f16_14, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_f16_16: +** incb x0, all, mul #16 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f16_16, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_f16_m1: +** decb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f16_m1, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_f16_m2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f16_m2, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_f16_m16: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f16_m16, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_f16_m18: +** addvl (x[0-9]+), x0, #-18 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_f16_m18, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st2_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_f16_x1, svfloat16x2_t, float16_t, + svst2_vnum_f16 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f32.c new file mode 100644 index 000000000..ba271882e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f32.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_f32_base: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f32_base, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_f32_index: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st2_f32_index, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_f32_1: +** incb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f32_1, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 + svcntw (), z0), + svst2 (p0, x0 + svcntw (), z0)) + +/* +** st2_f32_2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_f32_2, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 + svcntw () * 2, z0), + svst2 (p0, x0 + svcntw () * 2, z0)) + +/* +** st2_f32_14: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_f32_14, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 + svcntw () * 14, z0), + svst2 (p0, x0 + svcntw () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_f32_16: +** incb x0, all, mul #16 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f32_16, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 + svcntw () * 16, z0), + svst2 (p0, x0 + svcntw () * 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_f32_m1: +** decb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f32_m1, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 - svcntw (), z0), + svst2 (p0, x0 - svcntw (), z0)) + +/* +** st2_f32_m2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_f32_m2, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 - svcntw () * 2, z0), + svst2 (p0, x0 - svcntw () * 2, z0)) + +/* +** st2_f32_m16: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_f32_m16, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 - svcntw () * 16, z0), + svst2 (p0, x0 - svcntw () * 16, z0)) + +/* +** st2_f32_m18: +** addvl (x[0-9]+), x0, #-18 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_f32_m18, svfloat32x2_t, float32_t, + svst2_f32 (p0, x0 - svcntw () * 18, z0), + svst2 (p0, x0 - svcntw () * 18, z0)) + +/* +** st2_vnum_f32_0: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f32_0, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_f32_1: +** incb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f32_1, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_f32_2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f32_2, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_f32_14: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f32_14, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_f32_16: +** incb x0, all, mul #16 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f32_16, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_f32_m1: +** decb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f32_m1, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_f32_m2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f32_m2, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_f32_m16: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f32_m16, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_f32_m18: +** addvl (x[0-9]+), x0, #-18 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_f32_m18, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st2_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_f32_x1, svfloat32x2_t, float32_t, + svst2_vnum_f32 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f64.c new file mode 100644 index 000000000..c499ba0fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_f64.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_f64_base: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f64_base, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_f64_index: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st2_f64_index, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_f64_1: +** incb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f64_1, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 + svcntd (), z0), + svst2 (p0, x0 + svcntd (), z0)) + +/* +** st2_f64_2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_f64_2, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 + svcntd () * 2, z0), + svst2 (p0, x0 + svcntd () * 2, z0)) + +/* +** st2_f64_14: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_f64_14, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 + svcntd () * 14, z0), + svst2 (p0, x0 + svcntd () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_f64_16: +** incb x0, all, mul #16 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f64_16, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 + svcntd () * 16, z0), + svst2 (p0, x0 + svcntd () * 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_f64_m1: +** decb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_f64_m1, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 - svcntd (), z0), + svst2 (p0, x0 - svcntd (), z0)) + +/* +** st2_f64_m2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_f64_m2, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 - svcntd () * 2, z0), + svst2 (p0, x0 - svcntd () * 2, z0)) + +/* +** st2_f64_m16: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_f64_m16, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 - svcntd () * 16, z0), + svst2 (p0, x0 - svcntd () * 16, z0)) + +/* +** st2_f64_m18: +** addvl (x[0-9]+), x0, #-18 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_f64_m18, svfloat64x2_t, float64_t, + svst2_f64 (p0, x0 - svcntd () * 18, z0), + svst2 (p0, x0 - svcntd () * 18, z0)) + +/* +** st2_vnum_f64_0: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f64_0, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_f64_1: +** incb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f64_1, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_f64_2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f64_2, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_f64_14: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f64_14, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_f64_16: +** incb x0, all, mul #16 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f64_16, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_f64_m1: +** decb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_f64_m1, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_f64_m2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f64_m2, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_f64_m16: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_f64_m16, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_f64_m18: +** addvl (x[0-9]+), x0, #-18 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_f64_m18, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st2_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_f64_x1, svfloat64x2_t, float64_t, + svst2_vnum_f64 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s16.c new file mode 100644 index 000000000..860b45eac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s16.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_s16_base: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s16_base, svint16x2_t, int16_t, + svst2_s16 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_s16_index: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st2_s16_index, svint16x2_t, int16_t, + svst2_s16 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s16_1: +** incb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s16_1, svint16x2_t, int16_t, + svst2_s16 (p0, x0 + svcnth (), z0), + svst2 (p0, x0 + svcnth (), z0)) + +/* +** st2_s16_2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_s16_2, svint16x2_t, int16_t, + svst2_s16 (p0, x0 + svcnth () * 2, z0), + svst2 (p0, x0 + svcnth () * 2, z0)) + +/* +** st2_s16_14: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_s16_14, svint16x2_t, int16_t, + svst2_s16 (p0, x0 + svcnth () * 14, z0), + svst2 (p0, x0 + svcnth () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s16_16: +** incb x0, all, mul #16 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s16_16, svint16x2_t, int16_t, + svst2_s16 (p0, x0 + svcnth () * 16, z0), + svst2 (p0, x0 + svcnth () * 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_s16_m1: +** decb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s16_m1, svint16x2_t, int16_t, + svst2_s16 (p0, x0 - svcnth (), z0), + svst2 (p0, x0 - svcnth (), z0)) + +/* +** st2_s16_m2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_s16_m2, svint16x2_t, int16_t, + svst2_s16 (p0, x0 - svcnth () * 2, z0), + svst2 (p0, x0 - svcnth () * 2, z0)) + +/* +** st2_s16_m16: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_s16_m16, svint16x2_t, int16_t, + svst2_s16 (p0, x0 - svcnth () * 16, z0), + svst2 (p0, x0 - svcnth () * 16, z0)) + +/* +** st2_s16_m18: +** addvl (x[0-9]+), x0, #-18 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_s16_m18, svint16x2_t, int16_t, + svst2_s16 (p0, x0 - svcnth () * 18, z0), + svst2 (p0, x0 - svcnth () * 18, z0)) + +/* +** st2_vnum_s16_0: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s16_0, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s16_1: +** incb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s16_1, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_s16_2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s16_2, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_s16_14: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s16_14, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s16_16: +** incb x0, all, mul #16 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s16_16, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s16_m1: +** decb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s16_m1, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_s16_m2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s16_m2, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_s16_m16: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s16_m16, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_s16_m18: +** addvl (x[0-9]+), x0, #-18 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_s16_m18, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st2_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_s16_x1, svint16x2_t, int16_t, + svst2_vnum_s16 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s32.c new file mode 100644 index 000000000..16b674992 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s32.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_s32_base: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s32_base, svint32x2_t, int32_t, + svst2_s32 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_s32_index: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st2_s32_index, svint32x2_t, int32_t, + svst2_s32 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s32_1: +** incb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s32_1, svint32x2_t, int32_t, + svst2_s32 (p0, x0 + svcntw (), z0), + svst2 (p0, x0 + svcntw (), z0)) + +/* +** st2_s32_2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_s32_2, svint32x2_t, int32_t, + svst2_s32 (p0, x0 + svcntw () * 2, z0), + svst2 (p0, x0 + svcntw () * 2, z0)) + +/* +** st2_s32_14: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_s32_14, svint32x2_t, int32_t, + svst2_s32 (p0, x0 + svcntw () * 14, z0), + svst2 (p0, x0 + svcntw () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s32_16: +** incb x0, all, mul #16 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s32_16, svint32x2_t, int32_t, + svst2_s32 (p0, x0 + svcntw () * 16, z0), + svst2 (p0, x0 + svcntw () * 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s32_m1: +** decb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s32_m1, svint32x2_t, int32_t, + svst2_s32 (p0, x0 - svcntw (), z0), + svst2 (p0, x0 - svcntw (), z0)) + +/* +** st2_s32_m2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_s32_m2, svint32x2_t, int32_t, + svst2_s32 (p0, x0 - svcntw () * 2, z0), + svst2 (p0, x0 - svcntw () * 2, z0)) + +/* +** st2_s32_m16: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_s32_m16, svint32x2_t, int32_t, + svst2_s32 (p0, x0 - svcntw () * 16, z0), + svst2 (p0, x0 - svcntw () * 16, z0)) + +/* +** st2_s32_m18: +** addvl (x[0-9]+), x0, #-18 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_s32_m18, svint32x2_t, int32_t, + svst2_s32 (p0, x0 - svcntw () * 18, z0), + svst2 (p0, x0 - svcntw () * 18, z0)) + +/* +** st2_vnum_s32_0: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s32_0, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_s32_1: +** incb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s32_1, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_s32_2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s32_2, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_s32_14: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s32_14, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s32_16: +** incb x0, all, mul #16 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s32_16, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s32_m1: +** decb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s32_m1, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_s32_m2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s32_m2, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_s32_m16: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s32_m16, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_s32_m18: +** addvl (x[0-9]+), x0, #-18 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_s32_m18, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st2_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_s32_x1, svint32x2_t, int32_t, + svst2_vnum_s32 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s64.c new file mode 100644 index 000000000..1421333cb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s64.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_s64_base: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s64_base, svint64x2_t, int64_t, + svst2_s64 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_s64_index: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st2_s64_index, svint64x2_t, int64_t, + svst2_s64 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_s64_1: +** incb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s64_1, svint64x2_t, int64_t, + svst2_s64 (p0, x0 + svcntd (), z0), + svst2 (p0, x0 + svcntd (), z0)) + +/* +** st2_s64_2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_s64_2, svint64x2_t, int64_t, + svst2_s64 (p0, x0 + svcntd () * 2, z0), + svst2 (p0, x0 + svcntd () * 2, z0)) + +/* +** st2_s64_14: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_s64_14, svint64x2_t, int64_t, + svst2_s64 (p0, x0 + svcntd () * 14, z0), + svst2 (p0, x0 + svcntd () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s64_16: +** incb x0, all, mul #16 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s64_16, svint64x2_t, int64_t, + svst2_s64 (p0, x0 + svcntd () * 16, z0), + svst2 (p0, x0 + svcntd () * 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s64_m1: +** decb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s64_m1, svint64x2_t, int64_t, + svst2_s64 (p0, x0 - svcntd (), z0), + svst2 (p0, x0 - svcntd (), z0)) + +/* +** st2_s64_m2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_s64_m2, svint64x2_t, int64_t, + svst2_s64 (p0, x0 - svcntd () * 2, z0), + svst2 (p0, x0 - svcntd () * 2, z0)) + +/* +** st2_s64_m16: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_s64_m16, svint64x2_t, int64_t, + svst2_s64 (p0, x0 - svcntd () * 16, z0), + svst2 (p0, x0 - svcntd () * 16, z0)) + +/* +** st2_s64_m18: +** addvl (x[0-9]+), x0, #-18 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_s64_m18, svint64x2_t, int64_t, + svst2_s64 (p0, x0 - svcntd () * 18, z0), + svst2 (p0, x0 - svcntd () * 18, z0)) + +/* +** st2_vnum_s64_0: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s64_0, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s64_1: +** incb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s64_1, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_s64_2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s64_2, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_s64_14: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s64_14, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s64_16: +** incb x0, all, mul #16 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s64_16, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_s64_m1: +** decb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s64_m1, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_s64_m2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s64_m2, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_s64_m16: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s64_m16, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_s64_m18: +** addvl (x[0-9]+), x0, #-18 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_s64_m18, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st2_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_s64_x1, svint64x2_t, int64_t, + svst2_vnum_s64 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s8.c new file mode 100644 index 000000000..f0b7df3c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_s8.c @@ -0,0 +1,204 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_s8_base: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s8_base, svint8x2_t, int8_t, + svst2_s8 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_s8_index: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st2_s8_index, svint8x2_t, int8_t, + svst2_s8 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s8_1: +** incb x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s8_1, svint8x2_t, int8_t, + svst2_s8 (p0, x0 + svcntb (), z0), + svst2 (p0, x0 + svcntb (), z0)) + +/* +** st2_s8_2: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_s8_2, svint8x2_t, int8_t, + svst2_s8 (p0, x0 + svcntb () * 2, z0), + svst2 (p0, x0 + svcntb () * 2, z0)) + +/* +** st2_s8_14: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_s8_14, svint8x2_t, int8_t, + svst2_s8 (p0, x0 + svcntb () * 14, z0), + svst2 (p0, x0 + svcntb () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_s8_16: +** incb x0, all, mul #16 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s8_16, svint8x2_t, int8_t, + svst2_s8 (p0, x0 + svcntb () * 16, z0), + svst2 (p0, x0 + svcntb () * 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_s8_m1: +** decb x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_s8_m1, svint8x2_t, int8_t, + svst2_s8 (p0, x0 - svcntb (), z0), + svst2 (p0, x0 - svcntb (), z0)) + +/* +** st2_s8_m2: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_s8_m2, svint8x2_t, int8_t, + svst2_s8 (p0, x0 - svcntb () * 2, z0), + svst2 (p0, x0 - svcntb () * 2, z0)) + +/* +** st2_s8_m16: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_s8_m16, svint8x2_t, int8_t, + svst2_s8 (p0, x0 - svcntb () * 16, z0), + svst2 (p0, x0 - svcntb () * 16, z0)) + +/* +** st2_s8_m18: +** addvl (x[0-9]+), x0, #-18 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_s8_m18, svint8x2_t, int8_t, + svst2_s8 (p0, x0 - svcntb () * 18, z0), + svst2 (p0, x0 - svcntb () * 18, z0)) + +/* +** st2_vnum_s8_0: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s8_0, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s8_1: +** incb x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s8_1, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_s8_2: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s8_2, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_s8_14: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s8_14, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_s8_16: +** incb x0, all, mul #16 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s8_16, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_s8_m1: +** decb x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_s8_m1, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_s8_m2: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s8_m2, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_s8_m16: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_s8_m16, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_s8_m18: +** addvl (x[0-9]+), x0, #-18 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_s8_m18, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* +** st2_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st2_vnum_s8_x1, svint8x2_t, int8_t, + svst2_vnum_s8 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u16.c new file mode 100644 index 000000000..edd32d81e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u16.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_u16_base: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u16_base, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_u16_index: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st2_u16_index, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u16_1: +** incb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u16_1, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 + svcnth (), z0), + svst2 (p0, x0 + svcnth (), z0)) + +/* +** st2_u16_2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_u16_2, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 + svcnth () * 2, z0), + svst2 (p0, x0 + svcnth () * 2, z0)) + +/* +** st2_u16_14: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_u16_14, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 + svcnth () * 14, z0), + svst2 (p0, x0 + svcnth () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u16_16: +** incb x0, all, mul #16 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u16_16, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 + svcnth () * 16, z0), + svst2 (p0, x0 + svcnth () * 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_u16_m1: +** decb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u16_m1, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 - svcnth (), z0), + svst2 (p0, x0 - svcnth (), z0)) + +/* +** st2_u16_m2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_u16_m2, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 - svcnth () * 2, z0), + svst2 (p0, x0 - svcnth () * 2, z0)) + +/* +** st2_u16_m16: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_u16_m16, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 - svcnth () * 16, z0), + svst2 (p0, x0 - svcnth () * 16, z0)) + +/* +** st2_u16_m18: +** addvl (x[0-9]+), x0, #-18 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_u16_m18, svuint16x2_t, uint16_t, + svst2_u16 (p0, x0 - svcnth () * 18, z0), + svst2 (p0, x0 - svcnth () * 18, z0)) + +/* +** st2_vnum_u16_0: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u16_0, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u16_1: +** incb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u16_1, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_u16_2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u16_2, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_u16_14: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u16_14, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u16_16: +** incb x0, all, mul #16 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u16_16, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u16_m1: +** decb x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u16_m1, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_u16_m2: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u16_m2, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_u16_m16: +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u16_m16, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_u16_m18: +** addvl (x[0-9]+), x0, #-18 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_u16_m18, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st2_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_u16_x1, svuint16x2_t, uint16_t, + svst2_vnum_u16 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u32.c new file mode 100644 index 000000000..46f1b5ca7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u32.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_u32_base: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u32_base, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_u32_index: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st2_u32_index, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u32_1: +** incb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u32_1, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 + svcntw (), z0), + svst2 (p0, x0 + svcntw (), z0)) + +/* +** st2_u32_2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_u32_2, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 + svcntw () * 2, z0), + svst2 (p0, x0 + svcntw () * 2, z0)) + +/* +** st2_u32_14: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_u32_14, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 + svcntw () * 14, z0), + svst2 (p0, x0 + svcntw () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u32_16: +** incb x0, all, mul #16 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u32_16, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 + svcntw () * 16, z0), + svst2 (p0, x0 + svcntw () * 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u32_m1: +** decb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u32_m1, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 - svcntw (), z0), + svst2 (p0, x0 - svcntw (), z0)) + +/* +** st2_u32_m2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_u32_m2, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 - svcntw () * 2, z0), + svst2 (p0, x0 - svcntw () * 2, z0)) + +/* +** st2_u32_m16: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_u32_m16, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 - svcntw () * 16, z0), + svst2 (p0, x0 - svcntw () * 16, z0)) + +/* +** st2_u32_m18: +** addvl (x[0-9]+), x0, #-18 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_u32_m18, svuint32x2_t, uint32_t, + svst2_u32 (p0, x0 - svcntw () * 18, z0), + svst2 (p0, x0 - svcntw () * 18, z0)) + +/* +** st2_vnum_u32_0: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u32_0, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_u32_1: +** incb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u32_1, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_u32_2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u32_2, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_u32_14: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u32_14, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u32_16: +** incb x0, all, mul #16 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u32_16, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u32_m1: +** decb x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u32_m1, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_u32_m2: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u32_m2, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_u32_m16: +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u32_m16, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_u32_m18: +** addvl (x[0-9]+), x0, #-18 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_u32_m18, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st2_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2w {z0\.s(?: - |, )z1\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_u32_x1, svuint32x2_t, uint32_t, + svst2_vnum_u32 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u64.c new file mode 100644 index 000000000..0d9202b72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u64.c @@ -0,0 +1,200 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_u64_base: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u64_base, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_u64_index: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st2_u64_index, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_u64_1: +** incb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u64_1, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 + svcntd (), z0), + svst2 (p0, x0 + svcntd (), z0)) + +/* +** st2_u64_2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_u64_2, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 + svcntd () * 2, z0), + svst2 (p0, x0 + svcntd () * 2, z0)) + +/* +** st2_u64_14: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_u64_14, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 + svcntd () * 14, z0), + svst2 (p0, x0 + svcntd () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u64_16: +** incb x0, all, mul #16 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u64_16, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 + svcntd () * 16, z0), + svst2 (p0, x0 + svcntd () * 16, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u64_m1: +** decb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u64_m1, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 - svcntd (), z0), + svst2 (p0, x0 - svcntd (), z0)) + +/* +** st2_u64_m2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_u64_m2, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 - svcntd () * 2, z0), + svst2 (p0, x0 - svcntd () * 2, z0)) + +/* +** st2_u64_m16: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_u64_m16, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 - svcntd () * 16, z0), + svst2 (p0, x0 - svcntd () * 16, z0)) + +/* +** st2_u64_m18: +** addvl (x[0-9]+), x0, #-18 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_u64_m18, svuint64x2_t, uint64_t, + svst2_u64 (p0, x0 - svcntd () * 18, z0), + svst2 (p0, x0 - svcntd () * 18, z0)) + +/* +** st2_vnum_u64_0: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u64_0, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u64_1: +** incb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u64_1, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_u64_2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u64_2, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_u64_14: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u64_14, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u64_16: +** incb x0, all, mul #16 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u64_16, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_u64_m1: +** decb x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u64_m1, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_u64_m2: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u64_m2, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_u64_m16: +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u64_m16, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_u64_m18: +** addvl (x[0-9]+), x0, #-18 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_u64_m18, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st2_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st2d {z0\.d(?: - |, )z1\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st2_vnum_u64_x1, svuint64x2_t, uint64_t, + svst2_vnum_u64 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u8.c new file mode 100644 index 000000000..e7ea977a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st2_u8.c @@ -0,0 +1,204 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st2_u8_base: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u8_base, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0, z0), + svst2 (p0, x0, z0)) + +/* +** st2_u8_index: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st2_u8_index, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 + x1, z0), + svst2 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u8_1: +** incb x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u8_1, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 + svcntb (), z0), + svst2 (p0, x0 + svcntb (), z0)) + +/* +** st2_u8_2: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_u8_2, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 + svcntb () * 2, z0), + svst2 (p0, x0 + svcntb () * 2, z0)) + +/* +** st2_u8_14: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_u8_14, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 + svcntb () * 14, z0), + svst2 (p0, x0 + svcntb () * 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_u8_16: +** incb x0, all, mul #16 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u8_16, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 + svcntb () * 16, z0), + svst2 (p0, x0 + svcntb () * 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_u8_m1: +** decb x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_u8_m1, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 - svcntb (), z0), + svst2 (p0, x0 - svcntb (), z0)) + +/* +** st2_u8_m2: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_u8_m2, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 - svcntb () * 2, z0), + svst2 (p0, x0 - svcntb () * 2, z0)) + +/* +** st2_u8_m16: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_u8_m16, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 - svcntb () * 16, z0), + svst2 (p0, x0 - svcntb () * 16, z0)) + +/* +** st2_u8_m18: +** addvl (x[0-9]+), x0, #-18 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_u8_m18, svuint8x2_t, uint8_t, + svst2_u8 (p0, x0 - svcntb () * 18, z0), + svst2 (p0, x0 - svcntb () * 18, z0)) + +/* +** st2_vnum_u8_0: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u8_0, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, 0, z0), + svst2_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u8_1: +** incb x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u8_1, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, 1, z0), + svst2_vnum (p0, x0, 1, z0)) + +/* +** st2_vnum_u8_2: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u8_2, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, 2, z0), + svst2_vnum (p0, x0, 2, z0)) + +/* +** st2_vnum_u8_14: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #14, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u8_14, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, 14, z0), + svst2_vnum (p0, x0, 14, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st2_vnum_u8_16: +** incb x0, all, mul #16 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u8_16, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, 16, z0), + svst2_vnum (p0, x0, 16, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st2_vnum_u8_m1: +** decb x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st2_vnum_u8_m1, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, -1, z0), + svst2_vnum (p0, x0, -1, z0)) + +/* +** st2_vnum_u8_m2: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-2, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u8_m2, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, -2, z0), + svst2_vnum (p0, x0, -2, z0)) + +/* +** st2_vnum_u8_m16: +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, #-16, mul vl\] +** ret +*/ +TEST_STORE (st2_vnum_u8_m16, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, -16, z0), + svst2_vnum (p0, x0, -16, z0)) + +/* +** st2_vnum_u8_m18: +** addvl (x[0-9]+), x0, #-18 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st2_vnum_u8_m18, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, -18, z0), + svst2_vnum (p0, x0, -18, z0)) + +/* +** st2_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st2b {z0\.b(?: - |, )z1\.b}, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st2_vnum_u8_x1, svuint8x2_t, uint8_t, + svst2_vnum_u8 (p0, x0, x1, z0), + svst2_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_bf16.c new file mode 100644 index 000000000..2f921687c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_bf16.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_bf16_base: +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_bf16_base, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_bf16_index: +** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st3_bf16_index, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_bf16_1: +** incb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_bf16_1, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 + svcnth (), z0), + svst3 (p0, x0 + svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_bf16_2: +** incb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_bf16_2, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 + svcnth () * 2, z0), + svst3 (p0, x0 + svcnth () * 2, z0)) + +/* +** st3_bf16_3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_bf16_3, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 + svcnth () * 3, z0), + svst3 (p0, x0 + svcnth () * 3, z0)) + +/* +** st3_bf16_21: +** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_bf16_21, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 + svcnth () * 21, z0), + svst3 (p0, x0 + svcnth () * 21, z0)) + +/* +** st3_bf16_24: +** addvl (x[0-9]+), x0, #24 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_bf16_24, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 + svcnth () * 24, z0), + svst3 (p0, x0 + svcnth () * 24, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_bf16_m1: +** decb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_bf16_m1, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 - svcnth (), z0), + svst3 (p0, x0 - svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_bf16_m2: +** decb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_bf16_m2, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 - svcnth () * 2, z0), + svst3 (p0, x0 - svcnth () * 2, z0)) + +/* +** st3_bf16_m3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_bf16_m3, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 - svcnth () * 3, z0), + svst3 (p0, x0 - svcnth () * 3, z0)) + +/* +** st3_bf16_m24: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_bf16_m24, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 - svcnth () * 24, z0), + svst3 (p0, x0 - svcnth () * 24, z0)) + +/* +** st3_bf16_m27: +** addvl (x[0-9]+), x0, #-27 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_bf16_m27, svbfloat16x3_t, bfloat16_t, + svst3_bf16 (p0, x0 - svcnth () * 27, z0), + svst3 (p0, x0 - svcnth () * 27, z0)) + +/* +** st3_vnum_bf16_0: +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_0, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_bf16_1: +** incb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_1, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_bf16_2: +** incb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_2, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_bf16_3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_3, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_bf16_21: +** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_21, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_bf16_24: +** addvl (x[0-9]+), x0, #24 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_24, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_bf16_m1: +** decb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_m1, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_vnum_bf16_m2: +** decb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_m2, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_bf16_m3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_m3, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_bf16_m24: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_m24, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_bf16_m27: +** addvl (x[0-9]+), x0, #-27 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_m27, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st3_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3h {z0\.h - z2\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_bf16_x1, svbfloat16x3_t, bfloat16_t, + svst3_vnum_bf16 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f16.c new file mode 100644 index 000000000..388eb3708 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f16.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_f16_base: +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f16_base, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_f16_index: +** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st3_f16_index, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f16_1: +** incb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f16_1, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 + svcnth (), z0), + svst3 (p0, x0 + svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f16_2: +** incb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f16_2, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 + svcnth () * 2, z0), + svst3 (p0, x0 + svcnth () * 2, z0)) + +/* +** st3_f16_3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_f16_3, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 + svcnth () * 3, z0), + svst3 (p0, x0 + svcnth () * 3, z0)) + +/* +** st3_f16_21: +** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_f16_21, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 + svcnth () * 21, z0), + svst3 (p0, x0 + svcnth () * 21, z0)) + +/* +** st3_f16_24: +** addvl (x[0-9]+), x0, #24 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_f16_24, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 + svcnth () * 24, z0), + svst3 (p0, x0 + svcnth () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f16_m1: +** decb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f16_m1, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 - svcnth (), z0), + svst3 (p0, x0 - svcnth (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_f16_m2: +** decb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f16_m2, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 - svcnth () * 2, z0), + svst3 (p0, x0 - svcnth () * 2, z0)) + +/* +** st3_f16_m3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_f16_m3, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 - svcnth () * 3, z0), + svst3 (p0, x0 - svcnth () * 3, z0)) + +/* +** st3_f16_m24: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_f16_m24, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 - svcnth () * 24, z0), + svst3 (p0, x0 - svcnth () * 24, z0)) + +/* +** st3_f16_m27: +** addvl (x[0-9]+), x0, #-27 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_f16_m27, svfloat16x3_t, float16_t, + svst3_f16 (p0, x0 - svcnth () * 27, z0), + svst3 (p0, x0 - svcnth () * 27, z0)) + +/* +** st3_vnum_f16_0: +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f16_0, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f16_1: +** incb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f16_1, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f16_2: +** incb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f16_2, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_f16_3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f16_3, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_f16_21: +** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f16_21, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_f16_24: +** addvl (x[0-9]+), x0, #24 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_f16_24, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f16_m1: +** decb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f16_m1, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f16_m2: +** decb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f16_m2, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_f16_m3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f16_m3, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_f16_m24: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f16_m24, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_f16_m27: +** addvl (x[0-9]+), x0, #-27 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_f16_m27, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3h {z0\.h - z2\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_f16_x1, svfloat16x3_t, float16_t, + svst3_vnum_f16 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f32.c new file mode 100644 index 000000000..a5e3bdb45 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f32.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_f32_base: +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f32_base, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_f32_index: +** st3w {z0\.s - z2\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st3_f32_index, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f32_1: +** incb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f32_1, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 + svcntw (), z0), + svst3 (p0, x0 + svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f32_2: +** incb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f32_2, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 + svcntw () * 2, z0), + svst3 (p0, x0 + svcntw () * 2, z0)) + +/* +** st3_f32_3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_f32_3, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 + svcntw () * 3, z0), + svst3 (p0, x0 + svcntw () * 3, z0)) + +/* +** st3_f32_21: +** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_f32_21, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 + svcntw () * 21, z0), + svst3 (p0, x0 + svcntw () * 21, z0)) + +/* +** st3_f32_24: +** addvl (x[0-9]+), x0, #24 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_f32_24, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 + svcntw () * 24, z0), + svst3 (p0, x0 + svcntw () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f32_m1: +** decb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f32_m1, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 - svcntw (), z0), + svst3 (p0, x0 - svcntw (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_f32_m2: +** decb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f32_m2, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 - svcntw () * 2, z0), + svst3 (p0, x0 - svcntw () * 2, z0)) + +/* +** st3_f32_m3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_f32_m3, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 - svcntw () * 3, z0), + svst3 (p0, x0 - svcntw () * 3, z0)) + +/* +** st3_f32_m24: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_f32_m24, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 - svcntw () * 24, z0), + svst3 (p0, x0 - svcntw () * 24, z0)) + +/* +** st3_f32_m27: +** addvl (x[0-9]+), x0, #-27 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_f32_m27, svfloat32x3_t, float32_t, + svst3_f32 (p0, x0 - svcntw () * 27, z0), + svst3 (p0, x0 - svcntw () * 27, z0)) + +/* +** st3_vnum_f32_0: +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f32_0, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f32_1: +** incb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f32_1, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f32_2: +** incb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f32_2, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_f32_3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f32_3, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_f32_21: +** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f32_21, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_f32_24: +** addvl (x[0-9]+), x0, #24 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_f32_24, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f32_m1: +** decb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f32_m1, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f32_m2: +** decb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f32_m2, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_f32_m3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f32_m3, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_f32_m24: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f32_m24, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_f32_m27: +** addvl (x[0-9]+), x0, #-27 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_f32_m27, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3w {z0\.s - z2\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_f32_x1, svfloat32x3_t, float32_t, + svst3_vnum_f32 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f64.c new file mode 100644 index 000000000..30407da8a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_f64.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_f64_base: +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f64_base, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_f64_index: +** st3d {z0\.d - z2\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st3_f64_index, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f64_1: +** incb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f64_1, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 + svcntd (), z0), + svst3 (p0, x0 + svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f64_2: +** incb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f64_2, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 + svcntd () * 2, z0), + svst3 (p0, x0 + svcntd () * 2, z0)) + +/* +** st3_f64_3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_f64_3, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 + svcntd () * 3, z0), + svst3 (p0, x0 + svcntd () * 3, z0)) + +/* +** st3_f64_21: +** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_f64_21, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 + svcntd () * 21, z0), + svst3 (p0, x0 + svcntd () * 21, z0)) + +/* +** st3_f64_24: +** addvl (x[0-9]+), x0, #24 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_f64_24, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 + svcntd () * 24, z0), + svst3 (p0, x0 + svcntd () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_f64_m1: +** decb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f64_m1, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 - svcntd (), z0), + svst3 (p0, x0 - svcntd (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_f64_m2: +** decb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_f64_m2, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 - svcntd () * 2, z0), + svst3 (p0, x0 - svcntd () * 2, z0)) + +/* +** st3_f64_m3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_f64_m3, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 - svcntd () * 3, z0), + svst3 (p0, x0 - svcntd () * 3, z0)) + +/* +** st3_f64_m24: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_f64_m24, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 - svcntd () * 24, z0), + svst3 (p0, x0 - svcntd () * 24, z0)) + +/* +** st3_f64_m27: +** addvl (x[0-9]+), x0, #-27 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_f64_m27, svfloat64x3_t, float64_t, + svst3_f64 (p0, x0 - svcntd () * 27, z0), + svst3 (p0, x0 - svcntd () * 27, z0)) + +/* +** st3_vnum_f64_0: +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f64_0, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f64_1: +** incb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f64_1, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f64_2: +** incb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f64_2, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_f64_3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f64_3, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_f64_21: +** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f64_21, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_f64_24: +** addvl (x[0-9]+), x0, #24 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_f64_24, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f64_m1: +** decb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f64_m1, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_f64_m2: +** decb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_f64_m2, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_f64_m3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f64_m3, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_f64_m24: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_f64_m24, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_f64_m27: +** addvl (x[0-9]+), x0, #-27 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_f64_m27, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3d {z0\.d - z2\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_f64_x1, svfloat64x3_t, float64_t, + svst3_vnum_f64 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s16.c new file mode 100644 index 000000000..a4a1109c5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s16.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_s16_base: +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s16_base, svint16x3_t, int16_t, + svst3_s16 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_s16_index: +** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st3_s16_index, svint16x3_t, int16_t, + svst3_s16 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s16_1: +** incb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s16_1, svint16x3_t, int16_t, + svst3_s16 (p0, x0 + svcnth (), z0), + svst3 (p0, x0 + svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s16_2: +** incb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s16_2, svint16x3_t, int16_t, + svst3_s16 (p0, x0 + svcnth () * 2, z0), + svst3 (p0, x0 + svcnth () * 2, z0)) + +/* +** st3_s16_3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_s16_3, svint16x3_t, int16_t, + svst3_s16 (p0, x0 + svcnth () * 3, z0), + svst3 (p0, x0 + svcnth () * 3, z0)) + +/* +** st3_s16_21: +** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_s16_21, svint16x3_t, int16_t, + svst3_s16 (p0, x0 + svcnth () * 21, z0), + svst3 (p0, x0 + svcnth () * 21, z0)) + +/* +** st3_s16_24: +** addvl (x[0-9]+), x0, #24 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_s16_24, svint16x3_t, int16_t, + svst3_s16 (p0, x0 + svcnth () * 24, z0), + svst3 (p0, x0 + svcnth () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s16_m1: +** decb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s16_m1, svint16x3_t, int16_t, + svst3_s16 (p0, x0 - svcnth (), z0), + svst3 (p0, x0 - svcnth (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_s16_m2: +** decb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s16_m2, svint16x3_t, int16_t, + svst3_s16 (p0, x0 - svcnth () * 2, z0), + svst3 (p0, x0 - svcnth () * 2, z0)) + +/* +** st3_s16_m3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_s16_m3, svint16x3_t, int16_t, + svst3_s16 (p0, x0 - svcnth () * 3, z0), + svst3 (p0, x0 - svcnth () * 3, z0)) + +/* +** st3_s16_m24: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_s16_m24, svint16x3_t, int16_t, + svst3_s16 (p0, x0 - svcnth () * 24, z0), + svst3 (p0, x0 - svcnth () * 24, z0)) + +/* +** st3_s16_m27: +** addvl (x[0-9]+), x0, #-27 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_s16_m27, svint16x3_t, int16_t, + svst3_s16 (p0, x0 - svcnth () * 27, z0), + svst3 (p0, x0 - svcnth () * 27, z0)) + +/* +** st3_vnum_s16_0: +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s16_0, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s16_1: +** incb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s16_1, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s16_2: +** incb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s16_2, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_s16_3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s16_3, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_s16_21: +** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s16_21, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_s16_24: +** addvl (x[0-9]+), x0, #24 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_s16_24, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s16_m1: +** decb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s16_m1, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s16_m2: +** decb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s16_m2, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_s16_m3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s16_m3, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_s16_m24: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s16_m24, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_s16_m27: +** addvl (x[0-9]+), x0, #-27 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_s16_m27, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3h {z0\.h - z2\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_s16_x1, svint16x3_t, int16_t, + svst3_vnum_s16 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s32.c new file mode 100644 index 000000000..2442d9b28 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s32.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_s32_base: +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s32_base, svint32x3_t, int32_t, + svst3_s32 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_s32_index: +** st3w {z0\.s - z2\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st3_s32_index, svint32x3_t, int32_t, + svst3_s32 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s32_1: +** incb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s32_1, svint32x3_t, int32_t, + svst3_s32 (p0, x0 + svcntw (), z0), + svst3 (p0, x0 + svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s32_2: +** incb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s32_2, svint32x3_t, int32_t, + svst3_s32 (p0, x0 + svcntw () * 2, z0), + svst3 (p0, x0 + svcntw () * 2, z0)) + +/* +** st3_s32_3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_s32_3, svint32x3_t, int32_t, + svst3_s32 (p0, x0 + svcntw () * 3, z0), + svst3 (p0, x0 + svcntw () * 3, z0)) + +/* +** st3_s32_21: +** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_s32_21, svint32x3_t, int32_t, + svst3_s32 (p0, x0 + svcntw () * 21, z0), + svst3 (p0, x0 + svcntw () * 21, z0)) + +/* +** st3_s32_24: +** addvl (x[0-9]+), x0, #24 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_s32_24, svint32x3_t, int32_t, + svst3_s32 (p0, x0 + svcntw () * 24, z0), + svst3 (p0, x0 + svcntw () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s32_m1: +** decb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s32_m1, svint32x3_t, int32_t, + svst3_s32 (p0, x0 - svcntw (), z0), + svst3 (p0, x0 - svcntw (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_s32_m2: +** decb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s32_m2, svint32x3_t, int32_t, + svst3_s32 (p0, x0 - svcntw () * 2, z0), + svst3 (p0, x0 - svcntw () * 2, z0)) + +/* +** st3_s32_m3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_s32_m3, svint32x3_t, int32_t, + svst3_s32 (p0, x0 - svcntw () * 3, z0), + svst3 (p0, x0 - svcntw () * 3, z0)) + +/* +** st3_s32_m24: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_s32_m24, svint32x3_t, int32_t, + svst3_s32 (p0, x0 - svcntw () * 24, z0), + svst3 (p0, x0 - svcntw () * 24, z0)) + +/* +** st3_s32_m27: +** addvl (x[0-9]+), x0, #-27 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_s32_m27, svint32x3_t, int32_t, + svst3_s32 (p0, x0 - svcntw () * 27, z0), + svst3 (p0, x0 - svcntw () * 27, z0)) + +/* +** st3_vnum_s32_0: +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s32_0, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s32_1: +** incb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s32_1, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s32_2: +** incb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s32_2, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_s32_3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s32_3, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_s32_21: +** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s32_21, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_s32_24: +** addvl (x[0-9]+), x0, #24 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_s32_24, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s32_m1: +** decb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s32_m1, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s32_m2: +** decb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s32_m2, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_s32_m3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s32_m3, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_s32_m24: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s32_m24, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_s32_m27: +** addvl (x[0-9]+), x0, #-27 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_s32_m27, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3w {z0\.s - z2\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_s32_x1, svint32x3_t, int32_t, + svst3_vnum_s32 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s64.c new file mode 100644 index 000000000..eca6a7cea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s64.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_s64_base: +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s64_base, svint64x3_t, int64_t, + svst3_s64 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_s64_index: +** st3d {z0\.d - z2\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st3_s64_index, svint64x3_t, int64_t, + svst3_s64 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s64_1: +** incb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s64_1, svint64x3_t, int64_t, + svst3_s64 (p0, x0 + svcntd (), z0), + svst3 (p0, x0 + svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s64_2: +** incb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s64_2, svint64x3_t, int64_t, + svst3_s64 (p0, x0 + svcntd () * 2, z0), + svst3 (p0, x0 + svcntd () * 2, z0)) + +/* +** st3_s64_3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_s64_3, svint64x3_t, int64_t, + svst3_s64 (p0, x0 + svcntd () * 3, z0), + svst3 (p0, x0 + svcntd () * 3, z0)) + +/* +** st3_s64_21: +** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_s64_21, svint64x3_t, int64_t, + svst3_s64 (p0, x0 + svcntd () * 21, z0), + svst3 (p0, x0 + svcntd () * 21, z0)) + +/* +** st3_s64_24: +** addvl (x[0-9]+), x0, #24 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_s64_24, svint64x3_t, int64_t, + svst3_s64 (p0, x0 + svcntd () * 24, z0), + svst3 (p0, x0 + svcntd () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s64_m1: +** decb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s64_m1, svint64x3_t, int64_t, + svst3_s64 (p0, x0 - svcntd (), z0), + svst3 (p0, x0 - svcntd (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_s64_m2: +** decb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s64_m2, svint64x3_t, int64_t, + svst3_s64 (p0, x0 - svcntd () * 2, z0), + svst3 (p0, x0 - svcntd () * 2, z0)) + +/* +** st3_s64_m3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_s64_m3, svint64x3_t, int64_t, + svst3_s64 (p0, x0 - svcntd () * 3, z0), + svst3 (p0, x0 - svcntd () * 3, z0)) + +/* +** st3_s64_m24: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_s64_m24, svint64x3_t, int64_t, + svst3_s64 (p0, x0 - svcntd () * 24, z0), + svst3 (p0, x0 - svcntd () * 24, z0)) + +/* +** st3_s64_m27: +** addvl (x[0-9]+), x0, #-27 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_s64_m27, svint64x3_t, int64_t, + svst3_s64 (p0, x0 - svcntd () * 27, z0), + svst3 (p0, x0 - svcntd () * 27, z0)) + +/* +** st3_vnum_s64_0: +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s64_0, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s64_1: +** incb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s64_1, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s64_2: +** incb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s64_2, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_s64_3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s64_3, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_s64_21: +** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s64_21, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_s64_24: +** addvl (x[0-9]+), x0, #24 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_s64_24, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s64_m1: +** decb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s64_m1, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s64_m2: +** decb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s64_m2, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_s64_m3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s64_m3, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_s64_m24: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s64_m24, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_s64_m27: +** addvl (x[0-9]+), x0, #-27 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_s64_m27, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3d {z0\.d - z2\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_s64_x1, svint64x3_t, int64_t, + svst3_vnum_s64 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s8.c new file mode 100644 index 000000000..a54ff4b74 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_s8.c @@ -0,0 +1,246 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_s8_base: +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s8_base, svint8x3_t, int8_t, + svst3_s8 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_s8_index: +** st3b {z0\.b - z2\.b}, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st3_s8_index, svint8x3_t, int8_t, + svst3_s8 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s8_1: +** incb x0 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s8_1, svint8x3_t, int8_t, + svst3_s8 (p0, x0 + svcntb (), z0), + svst3 (p0, x0 + svcntb (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s8_2: +** incb x0, all, mul #2 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s8_2, svint8x3_t, int8_t, + svst3_s8 (p0, x0 + svcntb () * 2, z0), + svst3 (p0, x0 + svcntb () * 2, z0)) + +/* +** st3_s8_3: +** st3b {z0\.b - z2\.b}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_s8_3, svint8x3_t, int8_t, + svst3_s8 (p0, x0 + svcntb () * 3, z0), + svst3 (p0, x0 + svcntb () * 3, z0)) + +/* +** st3_s8_21: +** st3b {z0\.b - z2\.b}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_s8_21, svint8x3_t, int8_t, + svst3_s8 (p0, x0 + svcntb () * 21, z0), + svst3 (p0, x0 + svcntb () * 21, z0)) + +/* +** st3_s8_24: +** addvl (x[0-9]+), x0, #24 +** st3b {z0\.b - z2\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_s8_24, svint8x3_t, int8_t, + svst3_s8 (p0, x0 + svcntb () * 24, z0), + svst3 (p0, x0 + svcntb () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_s8_m1: +** decb x0 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s8_m1, svint8x3_t, int8_t, + svst3_s8 (p0, x0 - svcntb (), z0), + svst3 (p0, x0 - svcntb (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_s8_m2: +** decb x0, all, mul #2 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_s8_m2, svint8x3_t, int8_t, + svst3_s8 (p0, x0 - svcntb () * 2, z0), + svst3 (p0, x0 - svcntb () * 2, z0)) + +/* +** st3_s8_m3: +** st3b {z0\.b - z2\.b}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_s8_m3, svint8x3_t, int8_t, + svst3_s8 (p0, x0 - svcntb () * 3, z0), + svst3 (p0, x0 - svcntb () * 3, z0)) + +/* +** st3_s8_m24: +** st3b {z0\.b - z2\.b}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_s8_m24, svint8x3_t, int8_t, + svst3_s8 (p0, x0 - svcntb () * 24, z0), + svst3 (p0, x0 - svcntb () * 24, z0)) + +/* +** st3_s8_m27: +** addvl (x[0-9]+), x0, #-27 +** st3b {z0\.b - z2\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_s8_m27, svint8x3_t, int8_t, + svst3_s8 (p0, x0 - svcntb () * 27, z0), + svst3 (p0, x0 - svcntb () * 27, z0)) + +/* +** st3_vnum_s8_0: +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s8_0, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s8_1: +** incb x0 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s8_1, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s8_2: +** incb x0, all, mul #2 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s8_2, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_s8_3: +** st3b {z0\.b - z2\.b}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s8_3, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_s8_21: +** st3b {z0\.b - z2\.b}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s8_21, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_s8_24: +** addvl (x[0-9]+), x0, #24 +** st3b {z0\.b - z2\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_s8_24, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_s8_m1: +** decb x0 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s8_m1, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_vnum_s8_m2: +** decb x0, all, mul #2 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_s8_m2, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_s8_m3: +** st3b {z0\.b - z2\.b}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s8_m3, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_s8_m24: +** st3b {z0\.b - z2\.b}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_s8_m24, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_s8_m27: +** addvl (x[0-9]+), x0, #-27 +** st3b {z0\.b - z2\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_s8_m27, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* +** st3_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st3b {z0\.b - z2\.b}, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st3b {z0\.b - z2\.b}, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st3_vnum_s8_x1, svint8x3_t, int8_t, + svst3_vnum_s8 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u16.c new file mode 100644 index 000000000..d4e8efca3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u16.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_u16_base: +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u16_base, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_u16_index: +** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st3_u16_index, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u16_1: +** incb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u16_1, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 + svcnth (), z0), + svst3 (p0, x0 + svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u16_2: +** incb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u16_2, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 + svcnth () * 2, z0), + svst3 (p0, x0 + svcnth () * 2, z0)) + +/* +** st3_u16_3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_u16_3, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 + svcnth () * 3, z0), + svst3 (p0, x0 + svcnth () * 3, z0)) + +/* +** st3_u16_21: +** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_u16_21, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 + svcnth () * 21, z0), + svst3 (p0, x0 + svcnth () * 21, z0)) + +/* +** st3_u16_24: +** addvl (x[0-9]+), x0, #24 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_u16_24, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 + svcnth () * 24, z0), + svst3 (p0, x0 + svcnth () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u16_m1: +** decb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u16_m1, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 - svcnth (), z0), + svst3 (p0, x0 - svcnth (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_u16_m2: +** decb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u16_m2, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 - svcnth () * 2, z0), + svst3 (p0, x0 - svcnth () * 2, z0)) + +/* +** st3_u16_m3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_u16_m3, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 - svcnth () * 3, z0), + svst3 (p0, x0 - svcnth () * 3, z0)) + +/* +** st3_u16_m24: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_u16_m24, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 - svcnth () * 24, z0), + svst3 (p0, x0 - svcnth () * 24, z0)) + +/* +** st3_u16_m27: +** addvl (x[0-9]+), x0, #-27 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_u16_m27, svuint16x3_t, uint16_t, + svst3_u16 (p0, x0 - svcnth () * 27, z0), + svst3 (p0, x0 - svcnth () * 27, z0)) + +/* +** st3_vnum_u16_0: +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u16_0, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u16_1: +** incb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u16_1, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u16_2: +** incb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u16_2, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_u16_3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u16_3, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_u16_21: +** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u16_21, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_u16_24: +** addvl (x[0-9]+), x0, #24 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_u16_24, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u16_m1: +** decb x0 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u16_m1, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u16_m2: +** decb x0, all, mul #2 +** st3h {z0\.h - z2\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u16_m2, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_u16_m3: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u16_m3, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_u16_m24: +** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u16_m24, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_u16_m27: +** addvl (x[0-9]+), x0, #-27 +** st3h {z0\.h - z2\.h}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_u16_m27, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3h {z0\.h - z2\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_u16_x1, svuint16x3_t, uint16_t, + svst3_vnum_u16 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u32.c new file mode 100644 index 000000000..8be3aa957 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u32.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_u32_base: +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u32_base, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_u32_index: +** st3w {z0\.s - z2\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st3_u32_index, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u32_1: +** incb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u32_1, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 + svcntw (), z0), + svst3 (p0, x0 + svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u32_2: +** incb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u32_2, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 + svcntw () * 2, z0), + svst3 (p0, x0 + svcntw () * 2, z0)) + +/* +** st3_u32_3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_u32_3, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 + svcntw () * 3, z0), + svst3 (p0, x0 + svcntw () * 3, z0)) + +/* +** st3_u32_21: +** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_u32_21, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 + svcntw () * 21, z0), + svst3 (p0, x0 + svcntw () * 21, z0)) + +/* +** st3_u32_24: +** addvl (x[0-9]+), x0, #24 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_u32_24, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 + svcntw () * 24, z0), + svst3 (p0, x0 + svcntw () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u32_m1: +** decb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u32_m1, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 - svcntw (), z0), + svst3 (p0, x0 - svcntw (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_u32_m2: +** decb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u32_m2, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 - svcntw () * 2, z0), + svst3 (p0, x0 - svcntw () * 2, z0)) + +/* +** st3_u32_m3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_u32_m3, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 - svcntw () * 3, z0), + svst3 (p0, x0 - svcntw () * 3, z0)) + +/* +** st3_u32_m24: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_u32_m24, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 - svcntw () * 24, z0), + svst3 (p0, x0 - svcntw () * 24, z0)) + +/* +** st3_u32_m27: +** addvl (x[0-9]+), x0, #-27 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_u32_m27, svuint32x3_t, uint32_t, + svst3_u32 (p0, x0 - svcntw () * 27, z0), + svst3 (p0, x0 - svcntw () * 27, z0)) + +/* +** st3_vnum_u32_0: +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u32_0, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u32_1: +** incb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u32_1, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u32_2: +** incb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u32_2, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_u32_3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u32_3, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_u32_21: +** st3w {z0\.s - z2\.s}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u32_21, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_u32_24: +** addvl (x[0-9]+), x0, #24 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_u32_24, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u32_m1: +** decb x0 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u32_m1, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u32_m2: +** decb x0, all, mul #2 +** st3w {z0\.s - z2\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u32_m2, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_u32_m3: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u32_m3, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_u32_m24: +** st3w {z0\.s - z2\.s}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u32_m24, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_u32_m27: +** addvl (x[0-9]+), x0, #-27 +** st3w {z0\.s - z2\.s}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_u32_m27, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3w {z0\.s - z2\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_u32_x1, svuint32x3_t, uint32_t, + svst3_vnum_u32 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u64.c new file mode 100644 index 000000000..31cb304ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u64.c @@ -0,0 +1,242 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_u64_base: +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u64_base, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_u64_index: +** st3d {z0\.d - z2\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st3_u64_index, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u64_1: +** incb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u64_1, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 + svcntd (), z0), + svst3 (p0, x0 + svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u64_2: +** incb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u64_2, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 + svcntd () * 2, z0), + svst3 (p0, x0 + svcntd () * 2, z0)) + +/* +** st3_u64_3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_u64_3, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 + svcntd () * 3, z0), + svst3 (p0, x0 + svcntd () * 3, z0)) + +/* +** st3_u64_21: +** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_u64_21, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 + svcntd () * 21, z0), + svst3 (p0, x0 + svcntd () * 21, z0)) + +/* +** st3_u64_24: +** addvl (x[0-9]+), x0, #24 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_u64_24, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 + svcntd () * 24, z0), + svst3 (p0, x0 + svcntd () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u64_m1: +** decb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u64_m1, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 - svcntd (), z0), + svst3 (p0, x0 - svcntd (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_u64_m2: +** decb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u64_m2, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 - svcntd () * 2, z0), + svst3 (p0, x0 - svcntd () * 2, z0)) + +/* +** st3_u64_m3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_u64_m3, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 - svcntd () * 3, z0), + svst3 (p0, x0 - svcntd () * 3, z0)) + +/* +** st3_u64_m24: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_u64_m24, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 - svcntd () * 24, z0), + svst3 (p0, x0 - svcntd () * 24, z0)) + +/* +** st3_u64_m27: +** addvl (x[0-9]+), x0, #-27 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_u64_m27, svuint64x3_t, uint64_t, + svst3_u64 (p0, x0 - svcntd () * 27, z0), + svst3 (p0, x0 - svcntd () * 27, z0)) + +/* +** st3_vnum_u64_0: +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u64_0, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u64_1: +** incb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u64_1, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u64_2: +** incb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u64_2, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_u64_3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u64_3, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_u64_21: +** st3d {z0\.d - z2\.d}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u64_21, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_u64_24: +** addvl (x[0-9]+), x0, #24 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_u64_24, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u64_m1: +** decb x0 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u64_m1, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u64_m2: +** decb x0, all, mul #2 +** st3d {z0\.d - z2\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u64_m2, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_u64_m3: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u64_m3, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_u64_m24: +** st3d {z0\.d - z2\.d}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u64_m24, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_u64_m27: +** addvl (x[0-9]+), x0, #-27 +** st3d {z0\.d - z2\.d}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_u64_m27, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** st3_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st3d {z0\.d - z2\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st3_vnum_u64_x1, svuint64x3_t, uint64_t, + svst3_vnum_u64 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u8.c new file mode 100644 index 000000000..e2d5a19ad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st3_u8.c @@ -0,0 +1,246 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st3_u8_base: +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u8_base, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0, z0), + svst3 (p0, x0, z0)) + +/* +** st3_u8_index: +** st3b {z0\.b - z2\.b}, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st3_u8_index, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 + x1, z0), + svst3 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u8_1: +** incb x0 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u8_1, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 + svcntb (), z0), + svst3 (p0, x0 + svcntb (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u8_2: +** incb x0, all, mul #2 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u8_2, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 + svcntb () * 2, z0), + svst3 (p0, x0 + svcntb () * 2, z0)) + +/* +** st3_u8_3: +** st3b {z0\.b - z2\.b}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_u8_3, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 + svcntb () * 3, z0), + svst3 (p0, x0 + svcntb () * 3, z0)) + +/* +** st3_u8_21: +** st3b {z0\.b - z2\.b}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_u8_21, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 + svcntb () * 21, z0), + svst3 (p0, x0 + svcntb () * 21, z0)) + +/* +** st3_u8_24: +** addvl (x[0-9]+), x0, #24 +** st3b {z0\.b - z2\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_u8_24, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 + svcntb () * 24, z0), + svst3 (p0, x0 + svcntb () * 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_u8_m1: +** decb x0 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u8_m1, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 - svcntb (), z0), + svst3 (p0, x0 - svcntb (), z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_u8_m2: +** decb x0, all, mul #2 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_u8_m2, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 - svcntb () * 2, z0), + svst3 (p0, x0 - svcntb () * 2, z0)) + +/* +** st3_u8_m3: +** st3b {z0\.b - z2\.b}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_u8_m3, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 - svcntb () * 3, z0), + svst3 (p0, x0 - svcntb () * 3, z0)) + +/* +** st3_u8_m24: +** st3b {z0\.b - z2\.b}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_u8_m24, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 - svcntb () * 24, z0), + svst3 (p0, x0 - svcntb () * 24, z0)) + +/* +** st3_u8_m27: +** addvl (x[0-9]+), x0, #-27 +** st3b {z0\.b - z2\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_u8_m27, svuint8x3_t, uint8_t, + svst3_u8 (p0, x0 - svcntb () * 27, z0), + svst3 (p0, x0 - svcntb () * 27, z0)) + +/* +** st3_vnum_u8_0: +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u8_0, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, 0, z0), + svst3_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u8_1: +** incb x0 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u8_1, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, 1, z0), + svst3_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u8_2: +** incb x0, all, mul #2 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u8_2, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, 2, z0), + svst3_vnum (p0, x0, 2, z0)) + +/* +** st3_vnum_u8_3: +** st3b {z0\.b - z2\.b}, p0, \[x0, #3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u8_3, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, 3, z0), + svst3_vnum (p0, x0, 3, z0)) + +/* +** st3_vnum_u8_21: +** st3b {z0\.b - z2\.b}, p0, \[x0, #21, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u8_21, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, 21, z0), + svst3_vnum (p0, x0, 21, z0)) + +/* +** st3_vnum_u8_24: +** addvl (x[0-9]+), x0, #24 +** st3b {z0\.b - z2\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_u8_24, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, 24, z0), + svst3_vnum (p0, x0, 24, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st3_vnum_u8_m1: +** decb x0 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u8_m1, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, -1, z0), + svst3_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st3_vnum_u8_m2: +** decb x0, all, mul #2 +** st3b {z0\.b - z2\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st3_vnum_u8_m2, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, -2, z0), + svst3_vnum (p0, x0, -2, z0)) + +/* +** st3_vnum_u8_m3: +** st3b {z0\.b - z2\.b}, p0, \[x0, #-3, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u8_m3, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, -3, z0), + svst3_vnum (p0, x0, -3, z0)) + +/* +** st3_vnum_u8_m24: +** st3b {z0\.b - z2\.b}, p0, \[x0, #-24, mul vl\] +** ret +*/ +TEST_STORE (st3_vnum_u8_m24, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, -24, z0), + svst3_vnum (p0, x0, -24, z0)) + +/* +** st3_vnum_u8_m27: +** addvl (x[0-9]+), x0, #-27 +** st3b {z0\.b - z2\.b}, p0, \[\1\] +** ret +*/ +TEST_STORE (st3_vnum_u8_m27, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, -27, z0), + svst3_vnum (p0, x0, -27, z0)) + +/* +** st3_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st3b {z0\.b - z2\.b}, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st3b {z0\.b - z2\.b}, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st3_vnum_u8_x1, svuint8x3_t, uint8_t, + svst3_vnum_u8 (p0, x0, x1, z0), + svst3_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_bf16.c new file mode 100644 index 000000000..b8d9f4afa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_bf16.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_bf16_base: +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_bf16_base, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_bf16_index: +** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st4_bf16_index, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_bf16_1: +** incb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_bf16_1, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 + svcnth (), z0), + svst4 (p0, x0 + svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_bf16_2: +** incb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_bf16_2, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 + svcnth () * 2, z0), + svst4 (p0, x0 + svcnth () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_bf16_3: +** incb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_bf16_3, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 + svcnth () * 3, z0), + svst4 (p0, x0 + svcnth () * 3, z0)) + +/* +** st4_bf16_4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_bf16_4, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 + svcnth () * 4, z0), + svst4 (p0, x0 + svcnth () * 4, z0)) + +/* +** st4_bf16_28: +** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_bf16_28, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 + svcnth () * 28, z0), + svst4 (p0, x0 + svcnth () * 28, z0)) + +/* +** st4_bf16_32: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_bf16_32, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 + svcnth () * 32, z0), + svst4 (p0, x0 + svcnth () * 32, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_bf16_m1: +** decb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_bf16_m1, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 - svcnth (), z0), + svst4 (p0, x0 - svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_bf16_m2: +** decb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_bf16_m2, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 - svcnth () * 2, z0), + svst4 (p0, x0 - svcnth () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_bf16_m3: +** decb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_bf16_m3, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 - svcnth () * 3, z0), + svst4 (p0, x0 - svcnth () * 3, z0)) + +/* +** st4_bf16_m4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_bf16_m4, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 - svcnth () * 4, z0), + svst4 (p0, x0 - svcnth () * 4, z0)) + +/* +** st4_bf16_m32: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_bf16_m32, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 - svcnth () * 32, z0), + svst4 (p0, x0 - svcnth () * 32, z0)) + +/* +** st4_bf16_m36: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_bf16_m36, svbfloat16x4_t, bfloat16_t, + svst4_bf16 (p0, x0 - svcnth () * 36, z0), + svst4 (p0, x0 - svcnth () * 36, z0)) + +/* +** st4_vnum_bf16_0: +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_0, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_bf16_1: +** incb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_1, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_bf16_2: +** incb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_2, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_bf16_3: +** incb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_3, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_bf16_4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_4, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_bf16_28: +** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_28, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_bf16_32: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_32, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_bf16_m1: +** decb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_m1, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_bf16_m2: +** decb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_m2, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_bf16_m3: +** decb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_m3, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_bf16_m4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_m4, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_bf16_m32: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_m32, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_bf16_m36: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_m36, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4h {z0\.h - z3\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_bf16_x1, svbfloat16x4_t, bfloat16_t, + svst4_vnum_bf16 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f16.c new file mode 100644 index 000000000..296bdb4a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f16.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_f16_base: +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f16_base, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_f16_index: +** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st4_f16_index, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f16_1: +** incb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f16_1, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 + svcnth (), z0), + svst4 (p0, x0 + svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f16_2: +** incb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f16_2, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 + svcnth () * 2, z0), + svst4 (p0, x0 + svcnth () * 2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_f16_3: +** incb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f16_3, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 + svcnth () * 3, z0), + svst4 (p0, x0 + svcnth () * 3, z0)) + +/* +** st4_f16_4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_f16_4, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 + svcnth () * 4, z0), + svst4 (p0, x0 + svcnth () * 4, z0)) + +/* +** st4_f16_28: +** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_f16_28, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 + svcnth () * 28, z0), + svst4 (p0, x0 + svcnth () * 28, z0)) + +/* +** st4_f16_32: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_f16_32, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 + svcnth () * 32, z0), + svst4 (p0, x0 + svcnth () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f16_m1: +** decb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f16_m1, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 - svcnth (), z0), + svst4 (p0, x0 - svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f16_m2: +** decb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f16_m2, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 - svcnth () * 2, z0), + svst4 (p0, x0 - svcnth () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f16_m3: +** decb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f16_m3, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 - svcnth () * 3, z0), + svst4 (p0, x0 - svcnth () * 3, z0)) + +/* +** st4_f16_m4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_f16_m4, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 - svcnth () * 4, z0), + svst4 (p0, x0 - svcnth () * 4, z0)) + +/* +** st4_f16_m32: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_f16_m32, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 - svcnth () * 32, z0), + svst4 (p0, x0 - svcnth () * 32, z0)) + +/* +** st4_f16_m36: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_f16_m36, svfloat16x4_t, float16_t, + svst4_f16 (p0, x0 - svcnth () * 36, z0), + svst4 (p0, x0 - svcnth () * 36, z0)) + +/* +** st4_vnum_f16_0: +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f16_0, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f16_1: +** incb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f16_1, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f16_2: +** incb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f16_2, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_f16_3: +** incb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f16_3, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_f16_4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f16_4, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_f16_28: +** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f16_28, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_f16_32: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_f16_32, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f16_m1: +** decb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f16_m1, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f16_m2: +** decb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f16_m2, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f16_m3: +** decb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f16_m3, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_f16_m4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f16_m4, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_f16_m32: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f16_m32, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_f16_m36: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_f16_m36, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4h {z0\.h - z3\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_f16_x1, svfloat16x4_t, float16_t, + svst4_vnum_f16 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f32.c new file mode 100644 index 000000000..313ed7bc0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f32.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_f32_base: +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f32_base, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_f32_index: +** st4w {z0\.s - z3\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st4_f32_index, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_f32_1: +** incb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f32_1, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 + svcntw (), z0), + svst4 (p0, x0 + svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f32_2: +** incb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f32_2, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 + svcntw () * 2, z0), + svst4 (p0, x0 + svcntw () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f32_3: +** incb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f32_3, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 + svcntw () * 3, z0), + svst4 (p0, x0 + svcntw () * 3, z0)) + +/* +** st4_f32_4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_f32_4, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 + svcntw () * 4, z0), + svst4 (p0, x0 + svcntw () * 4, z0)) + +/* +** st4_f32_28: +** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_f32_28, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 + svcntw () * 28, z0), + svst4 (p0, x0 + svcntw () * 28, z0)) + +/* +** st4_f32_32: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_f32_32, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 + svcntw () * 32, z0), + svst4 (p0, x0 + svcntw () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f32_m1: +** decb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f32_m1, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 - svcntw (), z0), + svst4 (p0, x0 - svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f32_m2: +** decb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f32_m2, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 - svcntw () * 2, z0), + svst4 (p0, x0 - svcntw () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f32_m3: +** decb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f32_m3, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 - svcntw () * 3, z0), + svst4 (p0, x0 - svcntw () * 3, z0)) + +/* +** st4_f32_m4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_f32_m4, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 - svcntw () * 4, z0), + svst4 (p0, x0 - svcntw () * 4, z0)) + +/* +** st4_f32_m32: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_f32_m32, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 - svcntw () * 32, z0), + svst4 (p0, x0 - svcntw () * 32, z0)) + +/* +** st4_f32_m36: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_f32_m36, svfloat32x4_t, float32_t, + svst4_f32 (p0, x0 - svcntw () * 36, z0), + svst4 (p0, x0 - svcntw () * 36, z0)) + +/* +** st4_vnum_f32_0: +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f32_0, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f32_1: +** incb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f32_1, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_f32_2: +** incb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f32_2, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f32_3: +** incb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f32_3, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_f32_4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f32_4, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_f32_28: +** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f32_28, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_f32_32: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_f32_32, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f32_m1: +** decb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f32_m1, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f32_m2: +** decb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f32_m2, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f32_m3: +** decb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f32_m3, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_f32_m4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f32_m4, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_f32_m32: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f32_m32, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_f32_m36: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_f32_m36, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4w {z0\.s - z3\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_f32_x1, svfloat32x4_t, float32_t, + svst4_vnum_f32 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f64.c new file mode 100644 index 000000000..6c65ef016 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_f64.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_f64_base: +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f64_base, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_f64_index: +** st4d {z0\.d - z3\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st4_f64_index, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f64_1: +** incb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f64_1, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 + svcntd (), z0), + svst4 (p0, x0 + svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f64_2: +** incb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f64_2, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 + svcntd () * 2, z0), + svst4 (p0, x0 + svcntd () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f64_3: +** incb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f64_3, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 + svcntd () * 3, z0), + svst4 (p0, x0 + svcntd () * 3, z0)) + +/* +** st4_f64_4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_f64_4, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 + svcntd () * 4, z0), + svst4 (p0, x0 + svcntd () * 4, z0)) + +/* +** st4_f64_28: +** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_f64_28, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 + svcntd () * 28, z0), + svst4 (p0, x0 + svcntd () * 28, z0)) + +/* +** st4_f64_32: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_f64_32, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 + svcntd () * 32, z0), + svst4 (p0, x0 + svcntd () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f64_m1: +** decb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f64_m1, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 - svcntd (), z0), + svst4 (p0, x0 - svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_f64_m2: +** decb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f64_m2, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 - svcntd () * 2, z0), + svst4 (p0, x0 - svcntd () * 2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_f64_m3: +** decb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_f64_m3, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 - svcntd () * 3, z0), + svst4 (p0, x0 - svcntd () * 3, z0)) + +/* +** st4_f64_m4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_f64_m4, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 - svcntd () * 4, z0), + svst4 (p0, x0 - svcntd () * 4, z0)) + +/* +** st4_f64_m32: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_f64_m32, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 - svcntd () * 32, z0), + svst4 (p0, x0 - svcntd () * 32, z0)) + +/* +** st4_f64_m36: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_f64_m36, svfloat64x4_t, float64_t, + svst4_f64 (p0, x0 - svcntd () * 36, z0), + svst4 (p0, x0 - svcntd () * 36, z0)) + +/* +** st4_vnum_f64_0: +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f64_0, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f64_1: +** incb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f64_1, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f64_2: +** incb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f64_2, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f64_3: +** incb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f64_3, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_f64_4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f64_4, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_f64_28: +** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f64_28, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_f64_32: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_f64_32, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f64_m1: +** decb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f64_m1, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_f64_m2: +** decb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f64_m2, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_f64_m3: +** decb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_f64_m3, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_f64_m4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f64_m4, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_f64_m32: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_f64_m32, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_f64_m36: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_f64_m36, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4d {z0\.d - z3\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_f64_x1, svfloat64x4_t, float64_t, + svst4_vnum_f64 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s16.c new file mode 100644 index 000000000..35ac5f803 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s16.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_s16_base: +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s16_base, svint16x4_t, int16_t, + svst4_s16 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_s16_index: +** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st4_s16_index, svint16x4_t, int16_t, + svst4_s16 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s16_1: +** incb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s16_1, svint16x4_t, int16_t, + svst4_s16 (p0, x0 + svcnth (), z0), + svst4 (p0, x0 + svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s16_2: +** incb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s16_2, svint16x4_t, int16_t, + svst4_s16 (p0, x0 + svcnth () * 2, z0), + svst4 (p0, x0 + svcnth () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s16_3: +** incb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s16_3, svint16x4_t, int16_t, + svst4_s16 (p0, x0 + svcnth () * 3, z0), + svst4 (p0, x0 + svcnth () * 3, z0)) + +/* +** st4_s16_4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_s16_4, svint16x4_t, int16_t, + svst4_s16 (p0, x0 + svcnth () * 4, z0), + svst4 (p0, x0 + svcnth () * 4, z0)) + +/* +** st4_s16_28: +** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_s16_28, svint16x4_t, int16_t, + svst4_s16 (p0, x0 + svcnth () * 28, z0), + svst4 (p0, x0 + svcnth () * 28, z0)) + +/* +** st4_s16_32: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_s16_32, svint16x4_t, int16_t, + svst4_s16 (p0, x0 + svcnth () * 32, z0), + svst4 (p0, x0 + svcnth () * 32, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_s16_m1: +** decb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s16_m1, svint16x4_t, int16_t, + svst4_s16 (p0, x0 - svcnth (), z0), + svst4 (p0, x0 - svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s16_m2: +** decb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s16_m2, svint16x4_t, int16_t, + svst4_s16 (p0, x0 - svcnth () * 2, z0), + svst4 (p0, x0 - svcnth () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s16_m3: +** decb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s16_m3, svint16x4_t, int16_t, + svst4_s16 (p0, x0 - svcnth () * 3, z0), + svst4 (p0, x0 - svcnth () * 3, z0)) + +/* +** st4_s16_m4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_s16_m4, svint16x4_t, int16_t, + svst4_s16 (p0, x0 - svcnth () * 4, z0), + svst4 (p0, x0 - svcnth () * 4, z0)) + +/* +** st4_s16_m32: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_s16_m32, svint16x4_t, int16_t, + svst4_s16 (p0, x0 - svcnth () * 32, z0), + svst4 (p0, x0 - svcnth () * 32, z0)) + +/* +** st4_s16_m36: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_s16_m36, svint16x4_t, int16_t, + svst4_s16 (p0, x0 - svcnth () * 36, z0), + svst4 (p0, x0 - svcnth () * 36, z0)) + +/* +** st4_vnum_s16_0: +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s16_0, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s16_1: +** incb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s16_1, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s16_2: +** incb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s16_2, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s16_3: +** incb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s16_3, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_s16_4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s16_4, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_s16_28: +** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s16_28, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_s16_32: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_s16_32, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s16_m1: +** decb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s16_m1, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_s16_m2: +** decb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s16_m2, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s16_m3: +** decb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s16_m3, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_s16_m4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s16_m4, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_s16_m32: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s16_m32, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_s16_m36: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_s16_m36, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4h {z0\.h - z3\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_s16_x1, svint16x4_t, int16_t, + svst4_vnum_s16 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s32.c new file mode 100644 index 000000000..b8302f10d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s32.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_s32_base: +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s32_base, svint32x4_t, int32_t, + svst4_s32 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_s32_index: +** st4w {z0\.s - z3\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st4_s32_index, svint32x4_t, int32_t, + svst4_s32 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s32_1: +** incb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s32_1, svint32x4_t, int32_t, + svst4_s32 (p0, x0 + svcntw (), z0), + svst4 (p0, x0 + svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s32_2: +** incb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s32_2, svint32x4_t, int32_t, + svst4_s32 (p0, x0 + svcntw () * 2, z0), + svst4 (p0, x0 + svcntw () * 2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_s32_3: +** incb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s32_3, svint32x4_t, int32_t, + svst4_s32 (p0, x0 + svcntw () * 3, z0), + svst4 (p0, x0 + svcntw () * 3, z0)) + +/* +** st4_s32_4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_s32_4, svint32x4_t, int32_t, + svst4_s32 (p0, x0 + svcntw () * 4, z0), + svst4 (p0, x0 + svcntw () * 4, z0)) + +/* +** st4_s32_28: +** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_s32_28, svint32x4_t, int32_t, + svst4_s32 (p0, x0 + svcntw () * 28, z0), + svst4 (p0, x0 + svcntw () * 28, z0)) + +/* +** st4_s32_32: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_s32_32, svint32x4_t, int32_t, + svst4_s32 (p0, x0 + svcntw () * 32, z0), + svst4 (p0, x0 + svcntw () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s32_m1: +** decb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s32_m1, svint32x4_t, int32_t, + svst4_s32 (p0, x0 - svcntw (), z0), + svst4 (p0, x0 - svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s32_m2: +** decb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s32_m2, svint32x4_t, int32_t, + svst4_s32 (p0, x0 - svcntw () * 2, z0), + svst4 (p0, x0 - svcntw () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s32_m3: +** decb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s32_m3, svint32x4_t, int32_t, + svst4_s32 (p0, x0 - svcntw () * 3, z0), + svst4 (p0, x0 - svcntw () * 3, z0)) + +/* +** st4_s32_m4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_s32_m4, svint32x4_t, int32_t, + svst4_s32 (p0, x0 - svcntw () * 4, z0), + svst4 (p0, x0 - svcntw () * 4, z0)) + +/* +** st4_s32_m32: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_s32_m32, svint32x4_t, int32_t, + svst4_s32 (p0, x0 - svcntw () * 32, z0), + svst4 (p0, x0 - svcntw () * 32, z0)) + +/* +** st4_s32_m36: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_s32_m36, svint32x4_t, int32_t, + svst4_s32 (p0, x0 - svcntw () * 36, z0), + svst4 (p0, x0 - svcntw () * 36, z0)) + +/* +** st4_vnum_s32_0: +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s32_0, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s32_1: +** incb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s32_1, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s32_2: +** incb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s32_2, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_s32_3: +** incb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s32_3, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_s32_4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s32_4, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_s32_28: +** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s32_28, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_s32_32: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_s32_32, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s32_m1: +** decb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s32_m1, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s32_m2: +** decb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s32_m2, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s32_m3: +** decb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s32_m3, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_s32_m4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s32_m4, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_s32_m32: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s32_m32, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_s32_m36: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_s32_m36, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4w {z0\.s - z3\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_s32_x1, svint32x4_t, int32_t, + svst4_vnum_s32 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s64.c new file mode 100644 index 000000000..bf9cdf5e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s64.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_s64_base: +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s64_base, svint64x4_t, int64_t, + svst4_s64 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_s64_index: +** st4d {z0\.d - z3\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st4_s64_index, svint64x4_t, int64_t, + svst4_s64 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_s64_1: +** incb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s64_1, svint64x4_t, int64_t, + svst4_s64 (p0, x0 + svcntd (), z0), + svst4 (p0, x0 + svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s64_2: +** incb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s64_2, svint64x4_t, int64_t, + svst4_s64 (p0, x0 + svcntd () * 2, z0), + svst4 (p0, x0 + svcntd () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s64_3: +** incb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s64_3, svint64x4_t, int64_t, + svst4_s64 (p0, x0 + svcntd () * 3, z0), + svst4 (p0, x0 + svcntd () * 3, z0)) + +/* +** st4_s64_4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_s64_4, svint64x4_t, int64_t, + svst4_s64 (p0, x0 + svcntd () * 4, z0), + svst4 (p0, x0 + svcntd () * 4, z0)) + +/* +** st4_s64_28: +** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_s64_28, svint64x4_t, int64_t, + svst4_s64 (p0, x0 + svcntd () * 28, z0), + svst4 (p0, x0 + svcntd () * 28, z0)) + +/* +** st4_s64_32: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_s64_32, svint64x4_t, int64_t, + svst4_s64 (p0, x0 + svcntd () * 32, z0), + svst4 (p0, x0 + svcntd () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s64_m1: +** decb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s64_m1, svint64x4_t, int64_t, + svst4_s64 (p0, x0 - svcntd (), z0), + svst4 (p0, x0 - svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s64_m2: +** decb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s64_m2, svint64x4_t, int64_t, + svst4_s64 (p0, x0 - svcntd () * 2, z0), + svst4 (p0, x0 - svcntd () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s64_m3: +** decb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s64_m3, svint64x4_t, int64_t, + svst4_s64 (p0, x0 - svcntd () * 3, z0), + svst4 (p0, x0 - svcntd () * 3, z0)) + +/* +** st4_s64_m4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_s64_m4, svint64x4_t, int64_t, + svst4_s64 (p0, x0 - svcntd () * 4, z0), + svst4 (p0, x0 - svcntd () * 4, z0)) + +/* +** st4_s64_m32: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_s64_m32, svint64x4_t, int64_t, + svst4_s64 (p0, x0 - svcntd () * 32, z0), + svst4 (p0, x0 - svcntd () * 32, z0)) + +/* +** st4_s64_m36: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_s64_m36, svint64x4_t, int64_t, + svst4_s64 (p0, x0 - svcntd () * 36, z0), + svst4 (p0, x0 - svcntd () * 36, z0)) + +/* +** st4_vnum_s64_0: +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s64_0, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s64_1: +** incb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s64_1, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_s64_2: +** incb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s64_2, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s64_3: +** incb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s64_3, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_s64_4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s64_4, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_s64_28: +** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s64_28, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_s64_32: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_s64_32, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s64_m1: +** decb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s64_m1, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s64_m2: +** decb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s64_m2, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s64_m3: +** decb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s64_m3, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_s64_m4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s64_m4, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_s64_m32: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s64_m32, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_s64_m36: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_s64_m36, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4d {z0\.d - z3\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_s64_x1, svint64x4_t, int64_t, + svst4_vnum_s64 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s8.c new file mode 100644 index 000000000..1eb0bf131 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_s8.c @@ -0,0 +1,290 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_s8_base: +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s8_base, svint8x4_t, int8_t, + svst4_s8 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_s8_index: +** st4b {z0\.b - z3\.b}, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st4_s8_index, svint8x4_t, int8_t, + svst4_s8 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s8_1: +** incb x0 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s8_1, svint8x4_t, int8_t, + svst4_s8 (p0, x0 + svcntb (), z0), + svst4 (p0, x0 + svcntb (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s8_2: +** incb x0, all, mul #2 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s8_2, svint8x4_t, int8_t, + svst4_s8 (p0, x0 + svcntb () * 2, z0), + svst4 (p0, x0 + svcntb () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s8_3: +** incb x0, all, mul #3 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s8_3, svint8x4_t, int8_t, + svst4_s8 (p0, x0 + svcntb () * 3, z0), + svst4 (p0, x0 + svcntb () * 3, z0)) + +/* +** st4_s8_4: +** st4b {z0\.b - z3\.b}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_s8_4, svint8x4_t, int8_t, + svst4_s8 (p0, x0 + svcntb () * 4, z0), + svst4 (p0, x0 + svcntb () * 4, z0)) + +/* +** st4_s8_28: +** st4b {z0\.b - z3\.b}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_s8_28, svint8x4_t, int8_t, + svst4_s8 (p0, x0 + svcntb () * 28, z0), + svst4 (p0, x0 + svcntb () * 28, z0)) + +/* +** st4_s8_32: +** [^{]* +** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_s8_32, svint8x4_t, int8_t, + svst4_s8 (p0, x0 + svcntb () * 32, z0), + svst4 (p0, x0 + svcntb () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s8_m1: +** decb x0 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s8_m1, svint8x4_t, int8_t, + svst4_s8 (p0, x0 - svcntb (), z0), + svst4 (p0, x0 - svcntb (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s8_m2: +** decb x0, all, mul #2 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s8_m2, svint8x4_t, int8_t, + svst4_s8 (p0, x0 - svcntb () * 2, z0), + svst4 (p0, x0 - svcntb () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_s8_m3: +** decb x0, all, mul #3 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_s8_m3, svint8x4_t, int8_t, + svst4_s8 (p0, x0 - svcntb () * 3, z0), + svst4 (p0, x0 - svcntb () * 3, z0)) + +/* +** st4_s8_m4: +** st4b {z0\.b - z3\.b}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_s8_m4, svint8x4_t, int8_t, + svst4_s8 (p0, x0 - svcntb () * 4, z0), + svst4 (p0, x0 - svcntb () * 4, z0)) + +/* +** st4_s8_m32: +** st4b {z0\.b - z3\.b}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_s8_m32, svint8x4_t, int8_t, + svst4_s8 (p0, x0 - svcntb () * 32, z0), + svst4 (p0, x0 - svcntb () * 32, z0)) + +/* +** st4_s8_m36: +** [^{]* +** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_s8_m36, svint8x4_t, int8_t, + svst4_s8 (p0, x0 - svcntb () * 36, z0), + svst4 (p0, x0 - svcntb () * 36, z0)) + +/* +** st4_vnum_s8_0: +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s8_0, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_s8_1: +** incb x0 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s8_1, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s8_2: +** incb x0, all, mul #2 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s8_2, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s8_3: +** incb x0, all, mul #3 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s8_3, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_s8_4: +** st4b {z0\.b - z3\.b}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s8_4, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_s8_28: +** st4b {z0\.b - z3\.b}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s8_28, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_s8_32: +** [^{]* +** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_s8_32, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s8_m1: +** decb x0 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s8_m1, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s8_m2: +** decb x0, all, mul #2 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s8_m2, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_s8_m3: +** decb x0, all, mul #3 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_s8_m3, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_s8_m4: +** st4b {z0\.b - z3\.b}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s8_m4, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_s8_m32: +** st4b {z0\.b - z3\.b}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_s8_m32, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_s8_m36: +** [^{]* +** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_s8_m36, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* +** st4_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st4b {z0\.b - z3\.b}, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st4b {z0\.b - z3\.b}, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st4_vnum_s8_x1, svint8x4_t, int8_t, + svst4_vnum_s8 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u16.c new file mode 100644 index 000000000..5272c7f61 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u16.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_u16_base: +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u16_base, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_u16_index: +** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (st4_u16_index, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u16_1: +** incb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u16_1, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 + svcnth (), z0), + svst4 (p0, x0 + svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u16_2: +** incb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u16_2, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 + svcnth () * 2, z0), + svst4 (p0, x0 + svcnth () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u16_3: +** incb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u16_3, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 + svcnth () * 3, z0), + svst4 (p0, x0 + svcnth () * 3, z0)) + +/* +** st4_u16_4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_u16_4, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 + svcnth () * 4, z0), + svst4 (p0, x0 + svcnth () * 4, z0)) + +/* +** st4_u16_28: +** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_u16_28, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 + svcnth () * 28, z0), + svst4 (p0, x0 + svcnth () * 28, z0)) + +/* +** st4_u16_32: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_u16_32, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 + svcnth () * 32, z0), + svst4 (p0, x0 + svcnth () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u16_m1: +** decb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u16_m1, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 - svcnth (), z0), + svst4 (p0, x0 - svcnth (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u16_m2: +** decb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u16_m2, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 - svcnth () * 2, z0), + svst4 (p0, x0 - svcnth () * 2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_u16_m3: +** decb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u16_m3, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 - svcnth () * 3, z0), + svst4 (p0, x0 - svcnth () * 3, z0)) + +/* +** st4_u16_m4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_u16_m4, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 - svcnth () * 4, z0), + svst4 (p0, x0 - svcnth () * 4, z0)) + +/* +** st4_u16_m32: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_u16_m32, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 - svcnth () * 32, z0), + svst4 (p0, x0 - svcnth () * 32, z0)) + +/* +** st4_u16_m36: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_u16_m36, svuint16x4_t, uint16_t, + svst4_u16 (p0, x0 - svcnth () * 36, z0), + svst4 (p0, x0 - svcnth () * 36, z0)) + +/* +** st4_vnum_u16_0: +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u16_0, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u16_1: +** incb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u16_1, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u16_2: +** incb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u16_2, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u16_3: +** incb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u16_3, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_u16_4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u16_4, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_u16_28: +** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u16_28, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_u16_32: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_u16_32, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u16_m1: +** decb x0 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u16_m1, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u16_m2: +** decb x0, all, mul #2 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u16_m2, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_u16_m3: +** decb x0, all, mul #3 +** st4h {z0\.h - z3\.h}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u16_m3, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_u16_m4: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u16_m4, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_u16_m32: +** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u16_m32, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_u16_m36: +** [^{]* +** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_u16_m36, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4h {z0\.h - z3\.h}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_u16_x1, svuint16x4_t, uint16_t, + svst4_vnum_u16 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u32.c new file mode 100644 index 000000000..8b9b322e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u32.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_u32_base: +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u32_base, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_u32_index: +** st4w {z0\.s - z3\.s}, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (st4_u32_index, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u32_1: +** incb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u32_1, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 + svcntw (), z0), + svst4 (p0, x0 + svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u32_2: +** incb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u32_2, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 + svcntw () * 2, z0), + svst4 (p0, x0 + svcntw () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u32_3: +** incb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u32_3, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 + svcntw () * 3, z0), + svst4 (p0, x0 + svcntw () * 3, z0)) + +/* +** st4_u32_4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_u32_4, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 + svcntw () * 4, z0), + svst4 (p0, x0 + svcntw () * 4, z0)) + +/* +** st4_u32_28: +** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_u32_28, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 + svcntw () * 28, z0), + svst4 (p0, x0 + svcntw () * 28, z0)) + +/* +** st4_u32_32: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_u32_32, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 + svcntw () * 32, z0), + svst4 (p0, x0 + svcntw () * 32, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_u32_m1: +** decb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u32_m1, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 - svcntw (), z0), + svst4 (p0, x0 - svcntw (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u32_m2: +** decb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u32_m2, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 - svcntw () * 2, z0), + svst4 (p0, x0 - svcntw () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u32_m3: +** decb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u32_m3, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 - svcntw () * 3, z0), + svst4 (p0, x0 - svcntw () * 3, z0)) + +/* +** st4_u32_m4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_u32_m4, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 - svcntw () * 4, z0), + svst4 (p0, x0 - svcntw () * 4, z0)) + +/* +** st4_u32_m32: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_u32_m32, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 - svcntw () * 32, z0), + svst4 (p0, x0 - svcntw () * 32, z0)) + +/* +** st4_u32_m36: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_u32_m36, svuint32x4_t, uint32_t, + svst4_u32 (p0, x0 - svcntw () * 36, z0), + svst4 (p0, x0 - svcntw () * 36, z0)) + +/* +** st4_vnum_u32_0: +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u32_0, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u32_1: +** incb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u32_1, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u32_2: +** incb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u32_2, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u32_3: +** incb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u32_3, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_u32_4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u32_4, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_u32_28: +** st4w {z0\.s - z3\.s}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u32_28, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_u32_32: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_u32_32, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u32_m1: +** decb x0 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u32_m1, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_u32_m2: +** decb x0, all, mul #2 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u32_m2, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u32_m3: +** decb x0, all, mul #3 +** st4w {z0\.s - z3\.s}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u32_m3, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_u32_m4: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u32_m4, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_u32_m32: +** st4w {z0\.s - z3\.s}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u32_m32, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_u32_m36: +** [^{]* +** st4w {z0\.s - z3\.s}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_u32_m36, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4w {z0\.s - z3\.s}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_u32_x1, svuint32x4_t, uint32_t, + svst4_vnum_u32 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u64.c new file mode 100644 index 000000000..53b78f5ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u64.c @@ -0,0 +1,286 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_u64_base: +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u64_base, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_u64_index: +** st4d {z0\.d - z3\.d}, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (st4_u64_index, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u64_1: +** incb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u64_1, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 + svcntd (), z0), + svst4 (p0, x0 + svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u64_2: +** incb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u64_2, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 + svcntd () * 2, z0), + svst4 (p0, x0 + svcntd () * 2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_u64_3: +** incb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u64_3, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 + svcntd () * 3, z0), + svst4 (p0, x0 + svcntd () * 3, z0)) + +/* +** st4_u64_4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_u64_4, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 + svcntd () * 4, z0), + svst4 (p0, x0 + svcntd () * 4, z0)) + +/* +** st4_u64_28: +** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_u64_28, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 + svcntd () * 28, z0), + svst4 (p0, x0 + svcntd () * 28, z0)) + +/* +** st4_u64_32: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_u64_32, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 + svcntd () * 32, z0), + svst4 (p0, x0 + svcntd () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u64_m1: +** decb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u64_m1, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 - svcntd (), z0), + svst4 (p0, x0 - svcntd (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u64_m2: +** decb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u64_m2, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 - svcntd () * 2, z0), + svst4 (p0, x0 - svcntd () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u64_m3: +** decb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u64_m3, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 - svcntd () * 3, z0), + svst4 (p0, x0 - svcntd () * 3, z0)) + +/* +** st4_u64_m4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_u64_m4, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 - svcntd () * 4, z0), + svst4 (p0, x0 - svcntd () * 4, z0)) + +/* +** st4_u64_m32: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_u64_m32, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 - svcntd () * 32, z0), + svst4 (p0, x0 - svcntd () * 32, z0)) + +/* +** st4_u64_m36: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_u64_m36, svuint64x4_t, uint64_t, + svst4_u64 (p0, x0 - svcntd () * 36, z0), + svst4 (p0, x0 - svcntd () * 36, z0)) + +/* +** st4_vnum_u64_0: +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u64_0, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u64_1: +** incb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u64_1, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u64_2: +** incb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u64_2, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_u64_3: +** incb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u64_3, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_u64_4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u64_4, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_u64_28: +** st4d {z0\.d - z3\.d}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u64_28, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_u64_32: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_u64_32, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u64_m1: +** decb x0 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u64_m1, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u64_m2: +** decb x0, all, mul #2 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u64_m2, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u64_m3: +** decb x0, all, mul #3 +** st4d {z0\.d - z3\.d}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u64_m3, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_u64_m4: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u64_m4, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_u64_m32: +** st4d {z0\.d - z3\.d}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u64_m32, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_u64_m36: +** [^{]* +** st4d {z0\.d - z3\.d}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_u64_m36, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** st4_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** st4d {z0\.d - z3\.d}, p0, \[\2\] +** ret +*/ +TEST_STORE (st4_vnum_u64_x1, svuint64x4_t, uint64_t, + svst4_vnum_u64 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u8.c new file mode 100644 index 000000000..e7c2e7d76 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/st4_u8.c @@ -0,0 +1,290 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** st4_u8_base: +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u8_base, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0, z0), + svst4 (p0, x0, z0)) + +/* +** st4_u8_index: +** st4b {z0\.b - z3\.b}, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (st4_u8_index, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 + x1, z0), + svst4 (p0, x0 + x1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_u8_1: +** incb x0 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u8_1, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 + svcntb (), z0), + svst4 (p0, x0 + svcntb (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u8_2: +** incb x0, all, mul #2 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u8_2, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 + svcntb () * 2, z0), + svst4 (p0, x0 + svcntb () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u8_3: +** incb x0, all, mul #3 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u8_3, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 + svcntb () * 3, z0), + svst4 (p0, x0 + svcntb () * 3, z0)) + +/* +** st4_u8_4: +** st4b {z0\.b - z3\.b}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_u8_4, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 + svcntb () * 4, z0), + svst4 (p0, x0 + svcntb () * 4, z0)) + +/* +** st4_u8_28: +** st4b {z0\.b - z3\.b}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_u8_28, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 + svcntb () * 28, z0), + svst4 (p0, x0 + svcntb () * 28, z0)) + +/* +** st4_u8_32: +** [^{]* +** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_u8_32, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 + svcntb () * 32, z0), + svst4 (p0, x0 + svcntb () * 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u8_m1: +** decb x0 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u8_m1, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 - svcntb (), z0), + svst4 (p0, x0 - svcntb (), z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u8_m2: +** decb x0, all, mul #2 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u8_m2, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 - svcntb () * 2, z0), + svst4 (p0, x0 - svcntb () * 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_u8_m3: +** decb x0, all, mul #3 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_u8_m3, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 - svcntb () * 3, z0), + svst4 (p0, x0 - svcntb () * 3, z0)) + +/* +** st4_u8_m4: +** st4b {z0\.b - z3\.b}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_u8_m4, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 - svcntb () * 4, z0), + svst4 (p0, x0 - svcntb () * 4, z0)) + +/* +** st4_u8_m32: +** st4b {z0\.b - z3\.b}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_u8_m32, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 - svcntb () * 32, z0), + svst4 (p0, x0 - svcntb () * 32, z0)) + +/* +** st4_u8_m36: +** [^{]* +** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_u8_m36, svuint8x4_t, uint8_t, + svst4_u8 (p0, x0 - svcntb () * 36, z0), + svst4 (p0, x0 - svcntb () * 36, z0)) + +/* +** st4_vnum_u8_0: +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u8_0, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, 0, z0), + svst4_vnum (p0, x0, 0, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u8_1: +** incb x0 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u8_1, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, 1, z0), + svst4_vnum (p0, x0, 1, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** st4_vnum_u8_2: +** incb x0, all, mul #2 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u8_2, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, 2, z0), + svst4_vnum (p0, x0, 2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u8_3: +** incb x0, all, mul #3 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u8_3, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, 3, z0), + svst4_vnum (p0, x0, 3, z0)) + +/* +** st4_vnum_u8_4: +** st4b {z0\.b - z3\.b}, p0, \[x0, #4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u8_4, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, 4, z0), + svst4_vnum (p0, x0, 4, z0)) + +/* +** st4_vnum_u8_28: +** st4b {z0\.b - z3\.b}, p0, \[x0, #28, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u8_28, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, 28, z0), + svst4_vnum (p0, x0, 28, z0)) + +/* +** st4_vnum_u8_32: +** [^{]* +** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_u8_32, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, 32, z0), + svst4_vnum (p0, x0, 32, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u8_m1: +** decb x0 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u8_m1, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, -1, z0), + svst4_vnum (p0, x0, -1, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u8_m2: +** decb x0, all, mul #2 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u8_m2, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, -2, z0), + svst4_vnum (p0, x0, -2, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** st4_vnum_u8_m3: +** decb x0, all, mul #3 +** st4b {z0\.b - z3\.b}, p0, \[x0\] +** ret +*/ +TEST_STORE (st4_vnum_u8_m3, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, -3, z0), + svst4_vnum (p0, x0, -3, z0)) + +/* +** st4_vnum_u8_m4: +** st4b {z0\.b - z3\.b}, p0, \[x0, #-4, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u8_m4, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, -4, z0), + svst4_vnum (p0, x0, -4, z0)) + +/* +** st4_vnum_u8_m32: +** st4b {z0\.b - z3\.b}, p0, \[x0, #-32, mul vl\] +** ret +*/ +TEST_STORE (st4_vnum_u8_m32, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, -32, z0), + svst4_vnum (p0, x0, -32, z0)) + +/* +** st4_vnum_u8_m36: +** [^{]* +** st4b {z0\.b - z3\.b}, p0, \[x[0-9]+\] +** ret +*/ +TEST_STORE (st4_vnum_u8_m36, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, -36, z0), + svst4_vnum (p0, x0, -36, z0)) + +/* +** st4_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** st4b {z0\.b - z3\.b}, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** st4b {z0\.b - z3\.b}, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (st4_vnum_u8_x1, svuint8x4_t, uint8_t, + svst4_vnum_u8 (p0, x0, x1, z0), + svst4_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_bf16.c new file mode 100644 index 000000000..3c4d21f27 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_bf16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_bf16_base: +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_bf16_base, svbfloat16_t, bfloat16_t, + svstnt1_bf16 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_bf16_index: +** stnt1h z0\.h, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (stnt1_bf16_index, svbfloat16_t, bfloat16_t, + svstnt1_bf16 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_bf16_1: +** stnt1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_bf16_1, svbfloat16_t, bfloat16_t, + svstnt1_bf16 (p0, x0 + svcnth (), z0), + svstnt1 (p0, x0 + svcnth (), z0)) + +/* +** stnt1_bf16_7: +** stnt1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_bf16_7, svbfloat16_t, bfloat16_t, + svstnt1_bf16 (p0, x0 + svcnth () * 7, z0), + svstnt1 (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_bf16_8: +** incb x0, all, mul #8 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_bf16_8, svbfloat16_t, bfloat16_t, + svstnt1_bf16 (p0, x0 + svcnth () * 8, z0), + svstnt1 (p0, x0 + svcnth () * 8, z0)) + +/* +** stnt1_bf16_m1: +** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_bf16_m1, svbfloat16_t, bfloat16_t, + svstnt1_bf16 (p0, x0 - svcnth (), z0), + svstnt1 (p0, x0 - svcnth (), z0)) + +/* +** stnt1_bf16_m8: +** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_bf16_m8, svbfloat16_t, bfloat16_t, + svstnt1_bf16 (p0, x0 - svcnth () * 8, z0), + svstnt1 (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_bf16_m9: +** decb x0, all, mul #9 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_bf16_m9, svbfloat16_t, bfloat16_t, + svstnt1_bf16 (p0, x0 - svcnth () * 9, z0), + svstnt1 (p0, x0 - svcnth () * 9, z0)) + +/* +** stnt1_vnum_bf16_0: +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_bf16_0, svbfloat16_t, bfloat16_t, + svstnt1_vnum_bf16 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_bf16_1: +** stnt1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_bf16_1, svbfloat16_t, bfloat16_t, + svstnt1_vnum_bf16 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_bf16_7: +** stnt1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_bf16_7, svbfloat16_t, bfloat16_t, + svstnt1_vnum_bf16 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_bf16_8: +** incb x0, all, mul #8 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_bf16_8, svbfloat16_t, bfloat16_t, + svstnt1_vnum_bf16 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_bf16_m1: +** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_bf16_m1, svbfloat16_t, bfloat16_t, + svstnt1_vnum_bf16 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_bf16_m8: +** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_bf16_m8, svbfloat16_t, bfloat16_t, + svstnt1_vnum_bf16 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_bf16_m9: +** decb x0, all, mul #9 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_bf16_m9, svbfloat16_t, bfloat16_t, + svstnt1_vnum_bf16 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** stnt1_vnum_bf16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1h z0\.h, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_bf16_x1, svbfloat16_t, bfloat16_t, + svstnt1_vnum_bf16 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f16.c new file mode 100644 index 000000000..a3d89caf1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_f16_base: +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f16_base, svfloat16_t, float16_t, + svstnt1_f16 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_f16_index: +** stnt1h z0\.h, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (stnt1_f16_index, svfloat16_t, float16_t, + svstnt1_f16 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_f16_1: +** stnt1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f16_1, svfloat16_t, float16_t, + svstnt1_f16 (p0, x0 + svcnth (), z0), + svstnt1 (p0, x0 + svcnth (), z0)) + +/* +** stnt1_f16_7: +** stnt1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f16_7, svfloat16_t, float16_t, + svstnt1_f16 (p0, x0 + svcnth () * 7, z0), + svstnt1 (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_f16_8: +** incb x0, all, mul #8 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f16_8, svfloat16_t, float16_t, + svstnt1_f16 (p0, x0 + svcnth () * 8, z0), + svstnt1 (p0, x0 + svcnth () * 8, z0)) + +/* +** stnt1_f16_m1: +** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f16_m1, svfloat16_t, float16_t, + svstnt1_f16 (p0, x0 - svcnth (), z0), + svstnt1 (p0, x0 - svcnth (), z0)) + +/* +** stnt1_f16_m8: +** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f16_m8, svfloat16_t, float16_t, + svstnt1_f16 (p0, x0 - svcnth () * 8, z0), + svstnt1 (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_f16_m9: +** decb x0, all, mul #9 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f16_m9, svfloat16_t, float16_t, + svstnt1_f16 (p0, x0 - svcnth () * 9, z0), + svstnt1 (p0, x0 - svcnth () * 9, z0)) + +/* +** stnt1_vnum_f16_0: +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f16_0, svfloat16_t, float16_t, + svstnt1_vnum_f16 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_f16_1: +** stnt1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f16_1, svfloat16_t, float16_t, + svstnt1_vnum_f16 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_f16_7: +** stnt1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f16_7, svfloat16_t, float16_t, + svstnt1_vnum_f16 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_vnum_f16_8: +** incb x0, all, mul #8 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f16_8, svfloat16_t, float16_t, + svstnt1_vnum_f16 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_f16_m1: +** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f16_m1, svfloat16_t, float16_t, + svstnt1_vnum_f16 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_f16_m8: +** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f16_m8, svfloat16_t, float16_t, + svstnt1_vnum_f16 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_f16_m9: +** decb x0, all, mul #9 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f16_m9, svfloat16_t, float16_t, + svstnt1_vnum_f16 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** stnt1_vnum_f16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1h z0\.h, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_f16_x1, svfloat16_t, float16_t, + svstnt1_vnum_f16 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f32.c new file mode 100644 index 000000000..24e890512 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_f32_base: +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f32_base, svfloat32_t, float32_t, + svstnt1_f32 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_f32_index: +** stnt1w z0\.s, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (stnt1_f32_index, svfloat32_t, float32_t, + svstnt1_f32 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_f32_1: +** stnt1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f32_1, svfloat32_t, float32_t, + svstnt1_f32 (p0, x0 + svcntw (), z0), + svstnt1 (p0, x0 + svcntw (), z0)) + +/* +** stnt1_f32_7: +** stnt1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f32_7, svfloat32_t, float32_t, + svstnt1_f32 (p0, x0 + svcntw () * 7, z0), + svstnt1 (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_f32_8: +** incb x0, all, mul #8 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f32_8, svfloat32_t, float32_t, + svstnt1_f32 (p0, x0 + svcntw () * 8, z0), + svstnt1 (p0, x0 + svcntw () * 8, z0)) + +/* +** stnt1_f32_m1: +** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f32_m1, svfloat32_t, float32_t, + svstnt1_f32 (p0, x0 - svcntw (), z0), + svstnt1 (p0, x0 - svcntw (), z0)) + +/* +** stnt1_f32_m8: +** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f32_m8, svfloat32_t, float32_t, + svstnt1_f32 (p0, x0 - svcntw () * 8, z0), + svstnt1 (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_f32_m9: +** decb x0, all, mul #9 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f32_m9, svfloat32_t, float32_t, + svstnt1_f32 (p0, x0 - svcntw () * 9, z0), + svstnt1 (p0, x0 - svcntw () * 9, z0)) + +/* +** stnt1_vnum_f32_0: +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f32_0, svfloat32_t, float32_t, + svstnt1_vnum_f32 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_f32_1: +** stnt1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f32_1, svfloat32_t, float32_t, + svstnt1_vnum_f32 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_f32_7: +** stnt1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f32_7, svfloat32_t, float32_t, + svstnt1_vnum_f32 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_f32_8: +** incb x0, all, mul #8 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f32_8, svfloat32_t, float32_t, + svstnt1_vnum_f32 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_f32_m1: +** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f32_m1, svfloat32_t, float32_t, + svstnt1_vnum_f32 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_f32_m8: +** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f32_m8, svfloat32_t, float32_t, + svstnt1_vnum_f32 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_f32_m9: +** decb x0, all, mul #9 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f32_m9, svfloat32_t, float32_t, + svstnt1_vnum_f32 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** stnt1_vnum_f32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1w z0\.s, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_f32_x1, svfloat32_t, float32_t, + svstnt1_vnum_f32 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f64.c new file mode 100644 index 000000000..9555a1faf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_f64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_f64_base: +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f64_base, svfloat64_t, float64_t, + svstnt1_f64 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_f64_index: +** stnt1d z0\.d, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (stnt1_f64_index, svfloat64_t, float64_t, + svstnt1_f64 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_f64_1: +** stnt1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f64_1, svfloat64_t, float64_t, + svstnt1_f64 (p0, x0 + svcntd (), z0), + svstnt1 (p0, x0 + svcntd (), z0)) + +/* +** stnt1_f64_7: +** stnt1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f64_7, svfloat64_t, float64_t, + svstnt1_f64 (p0, x0 + svcntd () * 7, z0), + svstnt1 (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_f64_8: +** incb x0, all, mul #8 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f64_8, svfloat64_t, float64_t, + svstnt1_f64 (p0, x0 + svcntd () * 8, z0), + svstnt1 (p0, x0 + svcntd () * 8, z0)) + +/* +** stnt1_f64_m1: +** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f64_m1, svfloat64_t, float64_t, + svstnt1_f64 (p0, x0 - svcntd (), z0), + svstnt1 (p0, x0 - svcntd (), z0)) + +/* +** stnt1_f64_m8: +** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_f64_m8, svfloat64_t, float64_t, + svstnt1_f64 (p0, x0 - svcntd () * 8, z0), + svstnt1 (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_f64_m9: +** decb x0, all, mul #9 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_f64_m9, svfloat64_t, float64_t, + svstnt1_f64 (p0, x0 - svcntd () * 9, z0), + svstnt1 (p0, x0 - svcntd () * 9, z0)) + +/* +** stnt1_vnum_f64_0: +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f64_0, svfloat64_t, float64_t, + svstnt1_vnum_f64 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_f64_1: +** stnt1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f64_1, svfloat64_t, float64_t, + svstnt1_vnum_f64 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_f64_7: +** stnt1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f64_7, svfloat64_t, float64_t, + svstnt1_vnum_f64 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_f64_8: +** incb x0, all, mul #8 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f64_8, svfloat64_t, float64_t, + svstnt1_vnum_f64 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_f64_m1: +** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f64_m1, svfloat64_t, float64_t, + svstnt1_vnum_f64 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_f64_m8: +** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_f64_m8, svfloat64_t, float64_t, + svstnt1_vnum_f64 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_f64_m9: +** decb x0, all, mul #9 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_f64_m9, svfloat64_t, float64_t, + svstnt1_vnum_f64 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** stnt1_vnum_f64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1d z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_f64_x1, svfloat64_t, float64_t, + svstnt1_vnum_f64 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s16.c new file mode 100644 index 000000000..62e31450d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_s16_base: +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s16_base, svint16_t, int16_t, + svstnt1_s16 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_s16_index: +** stnt1h z0\.h, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (stnt1_s16_index, svint16_t, int16_t, + svstnt1_s16 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_s16_1: +** stnt1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s16_1, svint16_t, int16_t, + svstnt1_s16 (p0, x0 + svcnth (), z0), + svstnt1 (p0, x0 + svcnth (), z0)) + +/* +** stnt1_s16_7: +** stnt1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s16_7, svint16_t, int16_t, + svstnt1_s16 (p0, x0 + svcnth () * 7, z0), + svstnt1 (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_s16_8: +** incb x0, all, mul #8 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s16_8, svint16_t, int16_t, + svstnt1_s16 (p0, x0 + svcnth () * 8, z0), + svstnt1 (p0, x0 + svcnth () * 8, z0)) + +/* +** stnt1_s16_m1: +** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s16_m1, svint16_t, int16_t, + svstnt1_s16 (p0, x0 - svcnth (), z0), + svstnt1 (p0, x0 - svcnth (), z0)) + +/* +** stnt1_s16_m8: +** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s16_m8, svint16_t, int16_t, + svstnt1_s16 (p0, x0 - svcnth () * 8, z0), + svstnt1 (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_s16_m9: +** decb x0, all, mul #9 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s16_m9, svint16_t, int16_t, + svstnt1_s16 (p0, x0 - svcnth () * 9, z0), + svstnt1 (p0, x0 - svcnth () * 9, z0)) + +/* +** stnt1_vnum_s16_0: +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s16_0, svint16_t, int16_t, + svstnt1_vnum_s16 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_s16_1: +** stnt1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s16_1, svint16_t, int16_t, + svstnt1_vnum_s16 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_s16_7: +** stnt1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s16_7, svint16_t, int16_t, + svstnt1_vnum_s16 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_s16_8: +** incb x0, all, mul #8 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s16_8, svint16_t, int16_t, + svstnt1_vnum_s16 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_s16_m1: +** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s16_m1, svint16_t, int16_t, + svstnt1_vnum_s16 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_s16_m8: +** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s16_m8, svint16_t, int16_t, + svstnt1_vnum_s16 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_s16_m9: +** decb x0, all, mul #9 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s16_m9, svint16_t, int16_t, + svstnt1_vnum_s16 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** stnt1_vnum_s16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1h z0\.h, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_s16_x1, svint16_t, int16_t, + svstnt1_vnum_s16 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s32.c new file mode 100644 index 000000000..ff1f27c05 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_s32_base: +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s32_base, svint32_t, int32_t, + svstnt1_s32 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_s32_index: +** stnt1w z0\.s, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (stnt1_s32_index, svint32_t, int32_t, + svstnt1_s32 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_s32_1: +** stnt1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s32_1, svint32_t, int32_t, + svstnt1_s32 (p0, x0 + svcntw (), z0), + svstnt1 (p0, x0 + svcntw (), z0)) + +/* +** stnt1_s32_7: +** stnt1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s32_7, svint32_t, int32_t, + svstnt1_s32 (p0, x0 + svcntw () * 7, z0), + svstnt1 (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_s32_8: +** incb x0, all, mul #8 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s32_8, svint32_t, int32_t, + svstnt1_s32 (p0, x0 + svcntw () * 8, z0), + svstnt1 (p0, x0 + svcntw () * 8, z0)) + +/* +** stnt1_s32_m1: +** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s32_m1, svint32_t, int32_t, + svstnt1_s32 (p0, x0 - svcntw (), z0), + svstnt1 (p0, x0 - svcntw (), z0)) + +/* +** stnt1_s32_m8: +** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s32_m8, svint32_t, int32_t, + svstnt1_s32 (p0, x0 - svcntw () * 8, z0), + svstnt1 (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_s32_m9: +** decb x0, all, mul #9 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s32_m9, svint32_t, int32_t, + svstnt1_s32 (p0, x0 - svcntw () * 9, z0), + svstnt1 (p0, x0 - svcntw () * 9, z0)) + +/* +** stnt1_vnum_s32_0: +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s32_0, svint32_t, int32_t, + svstnt1_vnum_s32 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_s32_1: +** stnt1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s32_1, svint32_t, int32_t, + svstnt1_vnum_s32 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_s32_7: +** stnt1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s32_7, svint32_t, int32_t, + svstnt1_vnum_s32 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_vnum_s32_8: +** incb x0, all, mul #8 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s32_8, svint32_t, int32_t, + svstnt1_vnum_s32 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_s32_m1: +** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s32_m1, svint32_t, int32_t, + svstnt1_vnum_s32 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_s32_m8: +** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s32_m8, svint32_t, int32_t, + svstnt1_vnum_s32 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_s32_m9: +** decb x0, all, mul #9 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s32_m9, svint32_t, int32_t, + svstnt1_vnum_s32 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** stnt1_vnum_s32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1w z0\.s, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_s32_x1, svint32_t, int32_t, + svstnt1_vnum_s32 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s64.c new file mode 100644 index 000000000..7d548f8f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_s64_base: +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s64_base, svint64_t, int64_t, + svstnt1_s64 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_s64_index: +** stnt1d z0\.d, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (stnt1_s64_index, svint64_t, int64_t, + svstnt1_s64 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_s64_1: +** stnt1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s64_1, svint64_t, int64_t, + svstnt1_s64 (p0, x0 + svcntd (), z0), + svstnt1 (p0, x0 + svcntd (), z0)) + +/* +** stnt1_s64_7: +** stnt1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s64_7, svint64_t, int64_t, + svstnt1_s64 (p0, x0 + svcntd () * 7, z0), + svstnt1 (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_s64_8: +** incb x0, all, mul #8 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s64_8, svint64_t, int64_t, + svstnt1_s64 (p0, x0 + svcntd () * 8, z0), + svstnt1 (p0, x0 + svcntd () * 8, z0)) + +/* +** stnt1_s64_m1: +** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s64_m1, svint64_t, int64_t, + svstnt1_s64 (p0, x0 - svcntd (), z0), + svstnt1 (p0, x0 - svcntd (), z0)) + +/* +** stnt1_s64_m8: +** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s64_m8, svint64_t, int64_t, + svstnt1_s64 (p0, x0 - svcntd () * 8, z0), + svstnt1 (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_s64_m9: +** decb x0, all, mul #9 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s64_m9, svint64_t, int64_t, + svstnt1_s64 (p0, x0 - svcntd () * 9, z0), + svstnt1 (p0, x0 - svcntd () * 9, z0)) + +/* +** stnt1_vnum_s64_0: +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s64_0, svint64_t, int64_t, + svstnt1_vnum_s64 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_s64_1: +** stnt1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s64_1, svint64_t, int64_t, + svstnt1_vnum_s64 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_s64_7: +** stnt1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s64_7, svint64_t, int64_t, + svstnt1_vnum_s64 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_s64_8: +** incb x0, all, mul #8 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s64_8, svint64_t, int64_t, + svstnt1_vnum_s64 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_s64_m1: +** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s64_m1, svint64_t, int64_t, + svstnt1_vnum_s64 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_s64_m8: +** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s64_m8, svint64_t, int64_t, + svstnt1_vnum_s64 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_s64_m9: +** decb x0, all, mul #9 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s64_m9, svint64_t, int64_t, + svstnt1_vnum_s64 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** stnt1_vnum_s64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1d z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_s64_x1, svint64_t, int64_t, + svstnt1_vnum_s64 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s8.c new file mode 100644 index 000000000..87c88035d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_s8.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_s8_base: +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s8_base, svint8_t, int8_t, + svstnt1_s8 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_s8_index: +** stnt1b z0\.b, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (stnt1_s8_index, svint8_t, int8_t, + svstnt1_s8 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_s8_1: +** stnt1b z0\.b, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s8_1, svint8_t, int8_t, + svstnt1_s8 (p0, x0 + svcntb (), z0), + svstnt1 (p0, x0 + svcntb (), z0)) + +/* +** stnt1_s8_7: +** stnt1b z0\.b, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s8_7, svint8_t, int8_t, + svstnt1_s8 (p0, x0 + svcntb () * 7, z0), + svstnt1 (p0, x0 + svcntb () * 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_s8_8: +** incb x0, all, mul #8 +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s8_8, svint8_t, int8_t, + svstnt1_s8 (p0, x0 + svcntb () * 8, z0), + svstnt1 (p0, x0 + svcntb () * 8, z0)) + +/* +** stnt1_s8_m1: +** stnt1b z0\.b, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s8_m1, svint8_t, int8_t, + svstnt1_s8 (p0, x0 - svcntb (), z0), + svstnt1 (p0, x0 - svcntb (), z0)) + +/* +** stnt1_s8_m8: +** stnt1b z0\.b, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_s8_m8, svint8_t, int8_t, + svstnt1_s8 (p0, x0 - svcntb () * 8, z0), + svstnt1 (p0, x0 - svcntb () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_s8_m9: +** decb x0, all, mul #9 +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_s8_m9, svint8_t, int8_t, + svstnt1_s8 (p0, x0 - svcntb () * 9, z0), + svstnt1 (p0, x0 - svcntb () * 9, z0)) + +/* +** stnt1_vnum_s8_0: +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s8_0, svint8_t, int8_t, + svstnt1_vnum_s8 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_s8_1: +** stnt1b z0\.b, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s8_1, svint8_t, int8_t, + svstnt1_vnum_s8 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_s8_7: +** stnt1b z0\.b, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s8_7, svint8_t, int8_t, + svstnt1_vnum_s8 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_s8_8: +** incb x0, all, mul #8 +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s8_8, svint8_t, int8_t, + svstnt1_vnum_s8 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_s8_m1: +** stnt1b z0\.b, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s8_m1, svint8_t, int8_t, + svstnt1_vnum_s8 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_s8_m8: +** stnt1b z0\.b, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_s8_m8, svint8_t, int8_t, + svstnt1_vnum_s8 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_s8_m9: +** decb x0, all, mul #9 +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_s8_m9, svint8_t, int8_t, + svstnt1_vnum_s8 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* +** stnt1_vnum_s8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** stnt1b z0\.b, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** stnt1b z0\.b, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (stnt1_vnum_s8_x1, svint8_t, int8_t, + svstnt1_vnum_s8 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u16.c new file mode 100644 index 000000000..7d32df362 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u16.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! 
ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_u16_base: +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u16_base, svuint16_t, uint16_t, + svstnt1_u16 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_u16_index: +** stnt1h z0\.h, p0, \[x0, x1, lsl 1\] +** ret +*/ +TEST_STORE (stnt1_u16_index, svuint16_t, uint16_t, + svstnt1_u16 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_u16_1: +** stnt1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u16_1, svuint16_t, uint16_t, + svstnt1_u16 (p0, x0 + svcnth (), z0), + svstnt1 (p0, x0 + svcnth (), z0)) + +/* +** stnt1_u16_7: +** stnt1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u16_7, svuint16_t, uint16_t, + svstnt1_u16 (p0, x0 + svcnth () * 7, z0), + svstnt1 (p0, x0 + svcnth () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_u16_8: +** incb x0, all, mul #8 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u16_8, svuint16_t, uint16_t, + svstnt1_u16 (p0, x0 + svcnth () * 8, z0), + svstnt1 (p0, x0 + svcnth () * 8, z0)) + +/* +** stnt1_u16_m1: +** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u16_m1, svuint16_t, uint16_t, + svstnt1_u16 (p0, x0 - svcnth (), z0), + svstnt1 (p0, x0 - svcnth (), z0)) + +/* +** stnt1_u16_m8: +** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u16_m8, svuint16_t, uint16_t, + svstnt1_u16 (p0, x0 - svcnth () * 8, z0), + svstnt1 (p0, x0 - svcnth () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_u16_m9: +** decb x0, all, mul #9 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u16_m9, svuint16_t, uint16_t, + svstnt1_u16 (p0, x0 - svcnth () * 9, z0), + svstnt1 (p0, x0 - svcnth () * 9, z0)) + +/* +** stnt1_vnum_u16_0: +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u16_0, svuint16_t, uint16_t, + svstnt1_vnum_u16 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_u16_1: +** stnt1h z0\.h, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u16_1, svuint16_t, uint16_t, + svstnt1_vnum_u16 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_u16_7: +** stnt1h z0\.h, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u16_7, svuint16_t, uint16_t, + svstnt1_vnum_u16 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_u16_8: +** incb x0, all, mul #8 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u16_8, svuint16_t, uint16_t, + svstnt1_vnum_u16 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_u16_m1: +** stnt1h z0\.h, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u16_m1, svuint16_t, uint16_t, + svstnt1_vnum_u16 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_u16_m8: +** stnt1h z0\.h, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u16_m8, svuint16_t, uint16_t, + svstnt1_vnum_u16 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_u16_m9: +** decb x0, all, mul #9 +** stnt1h z0\.h, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u16_m9, svuint16_t, uint16_t, + svstnt1_vnum_u16 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. 
*/ +/* +** stnt1_vnum_u16_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1h z0\.h, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_u16_x1, svuint16_t, uint16_t, + svstnt1_vnum_u16 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u32.c new file mode 100644 index 000000000..cd4ccaba9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u32.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_u32_base: +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u32_base, svuint32_t, uint32_t, + svstnt1_u32 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_u32_index: +** stnt1w z0\.s, p0, \[x0, x1, lsl 2\] +** ret +*/ +TEST_STORE (stnt1_u32_index, svuint32_t, uint32_t, + svstnt1_u32 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_u32_1: +** stnt1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u32_1, svuint32_t, uint32_t, + svstnt1_u32 (p0, x0 + svcntw (), z0), + svstnt1 (p0, x0 + svcntw (), z0)) + +/* +** stnt1_u32_7: +** stnt1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u32_7, svuint32_t, uint32_t, + svstnt1_u32 (p0, x0 + svcntw () * 7, z0), + svstnt1 (p0, x0 + svcntw () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_u32_8: +** incb x0, all, mul #8 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u32_8, svuint32_t, uint32_t, + svstnt1_u32 (p0, x0 + svcntw () * 8, z0), + svstnt1 (p0, x0 + svcntw () * 8, z0)) + +/* +** stnt1_u32_m1: +** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u32_m1, svuint32_t, uint32_t, + svstnt1_u32 (p0, x0 - svcntw (), z0), + svstnt1 (p0, x0 - svcntw (), z0)) + +/* +** stnt1_u32_m8: +** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u32_m8, svuint32_t, uint32_t, + svstnt1_u32 (p0, x0 - svcntw () * 8, z0), + svstnt1 (p0, x0 - svcntw () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_u32_m9: +** decb x0, all, mul #9 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u32_m9, svuint32_t, uint32_t, + svstnt1_u32 (p0, x0 - svcntw () * 9, z0), + svstnt1 (p0, x0 - svcntw () * 9, z0)) + +/* +** stnt1_vnum_u32_0: +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u32_0, svuint32_t, uint32_t, + svstnt1_vnum_u32 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_u32_1: +** stnt1w z0\.s, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u32_1, svuint32_t, uint32_t, + svstnt1_vnum_u32 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_u32_7: +** stnt1w z0\.s, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u32_7, svuint32_t, uint32_t, + svstnt1_vnum_u32 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_vnum_u32_8: +** incb x0, all, mul #8 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u32_8, svuint32_t, uint32_t, + svstnt1_vnum_u32 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_u32_m1: +** stnt1w z0\.s, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u32_m1, svuint32_t, uint32_t, + svstnt1_vnum_u32 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_u32_m8: +** stnt1w z0\.s, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u32_m8, svuint32_t, uint32_t, + svstnt1_vnum_u32 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_u32_m9: +** decb x0, all, mul #9 +** stnt1w z0\.s, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u32_m9, svuint32_t, uint32_t, + svstnt1_vnum_u32 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** stnt1_vnum_u32_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1w z0\.s, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_u32_x1, svuint32_t, uint32_t, + svstnt1_vnum_u32 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u64.c new file mode 100644 index 000000000..c8145f65c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u64.c @@ -0,0 +1,158 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_u64_base: +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u64_base, svuint64_t, uint64_t, + svstnt1_u64 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_u64_index: +** stnt1d z0\.d, p0, \[x0, x1, lsl 3\] +** ret +*/ +TEST_STORE (stnt1_u64_index, svuint64_t, uint64_t, + svstnt1_u64 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_u64_1: +** stnt1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u64_1, svuint64_t, uint64_t, + svstnt1_u64 (p0, x0 + svcntd (), z0), + svstnt1 (p0, x0 + svcntd (), z0)) + +/* +** stnt1_u64_7: +** stnt1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u64_7, svuint64_t, uint64_t, + svstnt1_u64 (p0, x0 + svcntd () * 7, z0), + svstnt1 (p0, x0 + svcntd () * 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_u64_8: +** incb x0, all, mul #8 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u64_8, svuint64_t, uint64_t, + svstnt1_u64 (p0, x0 + svcntd () * 8, z0), + svstnt1 (p0, x0 + svcntd () * 8, z0)) + +/* +** stnt1_u64_m1: +** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u64_m1, svuint64_t, uint64_t, + svstnt1_u64 (p0, x0 - svcntd (), z0), + svstnt1 (p0, x0 - svcntd (), z0)) + +/* +** stnt1_u64_m8: +** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u64_m8, svuint64_t, uint64_t, + svstnt1_u64 (p0, x0 - svcntd () * 8, z0), + svstnt1 (p0, x0 - svcntd () * 8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_u64_m9: +** decb x0, all, mul #9 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u64_m9, svuint64_t, uint64_t, + svstnt1_u64 (p0, x0 - svcntd () * 9, z0), + svstnt1 (p0, x0 - svcntd () * 9, z0)) + +/* +** stnt1_vnum_u64_0: +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u64_0, svuint64_t, uint64_t, + svstnt1_vnum_u64 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_u64_1: +** stnt1d z0\.d, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u64_1, svuint64_t, uint64_t, + svstnt1_vnum_u64 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_u64_7: +** stnt1d z0\.d, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u64_7, svuint64_t, uint64_t, + svstnt1_vnum_u64 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_u64_8: +** incb x0, all, mul #8 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u64_8, svuint64_t, uint64_t, + svstnt1_vnum_u64 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_u64_m1: +** stnt1d z0\.d, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u64_m1, svuint64_t, uint64_t, + svstnt1_vnum_u64 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_u64_m8: +** stnt1d z0\.d, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u64_m8, svuint64_t, uint64_t, + svstnt1_vnum_u64 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_u64_m9: +** decb x0, all, mul #9 +** stnt1d z0\.d, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u64_m9, svuint64_t, uint64_t, + svstnt1_vnum_u64 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* Using MUL to calculate an index would also be OK. */ +/* +** stnt1_vnum_u64_x1: +** cntb (x[0-9]+) +** madd (x[0-9]+), (x1, \1|\1, x1), x0 +** stnt1d z0\.d, p0, \[\2\] +** ret +*/ +TEST_STORE (stnt1_vnum_u64_x1, svuint64_t, uint64_t, + svstnt1_vnum_u64 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u8.c new file mode 100644 index 000000000..11c68f555 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/stnt1_u8.c @@ -0,0 +1,162 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */ + +#include "test_sve_acle.h" + +/* +** stnt1_u8_base: +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u8_base, svuint8_t, uint8_t, + svstnt1_u8 (p0, x0, z0), + svstnt1 (p0, x0, z0)) + +/* +** stnt1_u8_index: +** stnt1b z0\.b, p0, \[x0, x1\] +** ret +*/ +TEST_STORE (stnt1_u8_index, svuint8_t, uint8_t, + svstnt1_u8 (p0, x0 + x1, z0), + svstnt1 (p0, x0 + x1, z0)) + +/* +** stnt1_u8_1: +** stnt1b z0\.b, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u8_1, svuint8_t, uint8_t, + svstnt1_u8 (p0, x0 + svcntb (), z0), + svstnt1 (p0, x0 + svcntb (), z0)) + +/* +** stnt1_u8_7: +** stnt1b z0\.b, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u8_7, svuint8_t, uint8_t, + svstnt1_u8 (p0, x0 + svcntb () * 7, z0), + svstnt1 (p0, x0 + svcntb () * 7, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_u8_8: +** incb x0, all, mul #8 +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u8_8, svuint8_t, uint8_t, + svstnt1_u8 (p0, x0 + svcntb () * 8, z0), + svstnt1 (p0, x0 + svcntb () * 8, z0)) + +/* +** stnt1_u8_m1: +** stnt1b z0\.b, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u8_m1, svuint8_t, uint8_t, + svstnt1_u8 (p0, x0 - svcntb (), z0), + svstnt1 (p0, x0 - svcntb (), z0)) + +/* +** stnt1_u8_m8: +** stnt1b z0\.b, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_u8_m8, svuint8_t, uint8_t, + svstnt1_u8 (p0, x0 - svcntb () * 8, z0), + svstnt1 (p0, x0 - svcntb () * 8, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_u8_m9: +** decb x0, all, mul #9 +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_u8_m9, svuint8_t, uint8_t, + svstnt1_u8 (p0, x0 - svcntb () * 9, z0), + svstnt1 (p0, x0 - svcntb () * 9, z0)) + +/* +** stnt1_vnum_u8_0: +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u8_0, svuint8_t, uint8_t, + svstnt1_vnum_u8 (p0, x0, 0, z0), + svstnt1_vnum (p0, x0, 0, z0)) + +/* +** stnt1_vnum_u8_1: +** stnt1b z0\.b, p0, \[x0, #1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u8_1, svuint8_t, uint8_t, + svstnt1_vnum_u8 (p0, x0, 1, z0), + svstnt1_vnum (p0, x0, 1, z0)) + +/* +** stnt1_vnum_u8_7: +** stnt1b z0\.b, p0, \[x0, #7, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u8_7, svuint8_t, uint8_t, + svstnt1_vnum_u8 (p0, x0, 7, z0), + svstnt1_vnum (p0, x0, 7, z0)) + +/* Moving the constant into a register would also be OK. */ +/* +** stnt1_vnum_u8_8: +** incb x0, all, mul #8 +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u8_8, svuint8_t, uint8_t, + svstnt1_vnum_u8 (p0, x0, 8, z0), + svstnt1_vnum (p0, x0, 8, z0)) + +/* +** stnt1_vnum_u8_m1: +** stnt1b z0\.b, p0, \[x0, #-1, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u8_m1, svuint8_t, uint8_t, + svstnt1_vnum_u8 (p0, x0, -1, z0), + svstnt1_vnum (p0, x0, -1, z0)) + +/* +** stnt1_vnum_u8_m8: +** stnt1b z0\.b, p0, \[x0, #-8, mul vl\] +** ret +*/ +TEST_STORE (stnt1_vnum_u8_m8, svuint8_t, uint8_t, + svstnt1_vnum_u8 (p0, x0, -8, z0), + svstnt1_vnum (p0, x0, -8, z0)) + +/* Moving the constant into a register would also be OK. 
*/ +/* +** stnt1_vnum_u8_m9: +** decb x0, all, mul #9 +** stnt1b z0\.b, p0, \[x0\] +** ret +*/ +TEST_STORE (stnt1_vnum_u8_m9, svuint8_t, uint8_t, + svstnt1_vnum_u8 (p0, x0, -9, z0), + svstnt1_vnum (p0, x0, -9, z0)) + +/* +** stnt1_vnum_u8_x1: +** cntb (x[0-9]+) +** ( +** madd (x[0-9]+), (?:x1, \1|\1, x1), x0 +** stnt1b z0\.b, p0, \[\2\] +** | +** mul (x[0-9]+), (?:x1, \1|\1, x1) +** stnt1b z0\.b, p0, \[x0, \3\] +** ) +** ret +*/ +TEST_STORE (stnt1_vnum_u8_x1, svuint8_t, uint8_t, + svstnt1_vnum_u8 (p0, x0, x1, z0), + svstnt1_vnum (p0, x0, x1, z0)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16.c new file mode 100644 index 000000000..bf4a0ab1e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16.c @@ -0,0 +1,577 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_m_tied1, svfloat16_t, + z0 = svsub_f16_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_m_tied2, svfloat16_t, + z0 = svsub_f16_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_m_untied, svfloat16_t, + z0 = svsub_f16_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fsub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svsub_n_f16_m (p0, z0, d4), + z0 = svsub_m (p0, z0, d4)) + +/* +** sub_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svsub_n_f16_m (p0, z1, d4), + z0 = svsub_m (p0, z1, d4)) + +/* +** sub_1_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_m_tied1, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_m_untied, svfloat16_t, + z0 = svsub_n_f16_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_0p5_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_m_tied1, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, 0.5), + z0 = svsub_m (p0, z0, 0.5)) + +/* +** sub_0p5_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_m_untied, svfloat16_t, + z0 = svsub_n_f16_m (p0, z1, 0.5), + z0 = svsub_m (p0, z1, 0.5)) + +/* +** sub_m1_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_m_tied1, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, -1), + z0 = svsub_m (p0, z0, -1)) + +/* +** sub_m1_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_m_untied, svfloat16_t, + z0 = svsub_n_f16_m (p0, z1, -1), + z0 = svsub_m (p0, z1, -1)) + +/* +** sub_m0p5_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_m_tied1, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, -0.5), + z0 = svsub_m (p0, z0, -0.5)) + +/* +** sub_m0p5_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_m_untied, 
svfloat16_t, + z0 = svsub_n_f16_m (p0, z1, -0.5), + z0 = svsub_m (p0, z1, -0.5)) + +/* +** sub_m2_f16_m: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f16_m, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_z_tied1, svfloat16_t, + z0 = svsub_f16_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_z_tied2, svfloat16_t, + z0 = svsub_f16_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f16_z_untied, svfloat16_t, + z0 = svsub_f16_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svsub_n_f16_z (p0, z0, d4), + z0 = svsub_z (p0, z0, d4)) + +/* +** sub_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svsub_n_f16_z (p0, z1, d4), + z0 = svsub_z (p0, z1, d4)) + +/* +** sub_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_z_tied1, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_z_untied, svfloat16_t, + z0 = svsub_n_f16_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_z_tied1, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, 0.5), + z0 = svsub_z (p0, z0, 0.5)) + +/* +** sub_0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_z_untied, svfloat16_t, + z0 = svsub_n_f16_z (p0, z1, 0.5), + z0 = svsub_z (p0, z1, 0.5)) + +/* +** sub_m1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_z_tied1, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, -1), + z0 = svsub_z (p0, z0, -1)) + +/* +** sub_m1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_z_untied, svfloat16_t, + z0 = svsub_n_f16_z (p0, z1, -1), + z0 = svsub_z (p0, z1, -1)) + +/* +** sub_m0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_z_tied1, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, -0.5), + z0 = svsub_z (p0, z0, -0.5)) + +/* +** sub_m0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_z_untied, svfloat16_t, + z0 = svsub_n_f16_z (p0, z1, -0.5), + z0 = svsub_z (p0, z1, -0.5)) + +/* +** sub_m2_f16_z: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f16_z, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, -2), + z0 = svsub_z (p0, z0, -2)) + +/* +** sub_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_x_tied1, svfloat16_t, + z0 = svsub_f16_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_f16_x_tied2: +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_x_tied2, svfloat16_t, + z0 = svsub_f16_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_f16_x_untied: +** ( +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f16_x_untied, svfloat16_t, + z0 = svsub_f16_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fsub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svsub_n_f16_x (p0, z0, d4), + z0 = svsub_x (p0, z0, d4)) + +/* +** sub_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svsub_n_f16_x (p0, z1, d4), + z0 = svsub_x (p0, z1, d4)) + +/* +** sub_1_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_f16_x_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_0p5_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, 0.5), + z0 = svsub_x (p0, z0, 0.5)) + +/* +** sub_0p5_f16_x_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, 0.5), + z0 = svsub_x (p0, z1, 0.5)) + +/* +** sub_m1_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m1_f16_x_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, -1), + z0 = svsub_x (p0, z1, -1)) + +/* +** sub_m0p5_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, -0.5), + z0 = svsub_x (p0, z0, -0.5)) + +/* +** sub_m0p5_f16_x_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, -0.5), + z0 = svsub_x (p0, z1, -0.5)) + +/* +** sub_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_2_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, 2), + z0 = svsub_x (p0, z0, 2)) + +/* +** sub_2_f16_x_untied: +** fmov z0\.h, #-2\.0(?:e\+0)? 
+** fadd z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_2_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, 2), + z0 = svsub_x (p0, z1, 2)) + +/* +** ptrue_sub_f16_x_tied1: +** fsub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f16_x_tied1, svfloat16_t, + z0 = svsub_f16_x (svptrue_b16 (), z0, z1), + z0 = svsub_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_sub_f16_x_tied2: +** fsub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f16_x_tied2, svfloat16_t, + z0 = svsub_f16_x (svptrue_b16 (), z1, z0), + z0 = svsub_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_sub_f16_x_untied: +** fsub z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f16_x_untied, svfloat16_t, + z0 = svsub_f16_x (svptrue_b16 (), z1, z2), + z0 = svsub_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_sub_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svsub_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_sub_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svsub_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_sub_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svsub_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_sub_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svsub_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_sub_m1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, -1), + z0 = svsub_x (svptrue_b16 (), z0, -1)) + +/* +** ptrue_sub_m1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, -1), + z0 = svsub_x (svptrue_b16 (), z1, -1)) + +/* +** ptrue_sub_m0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, -0.5), + z0 = svsub_x (svptrue_b16 (), z0, -0.5)) + +/* +** ptrue_sub_m0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, -0.5), + z0 = svsub_x (svptrue_b16 (), z1, -0.5)) + +/* +** ptrue_sub_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? +** fadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svsub_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_sub_2_f16_x_untied: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
+** fadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svsub_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16_notrap.c new file mode 100644 index 000000000..e45098944 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f16_notrap.c @@ -0,0 +1,572 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_m_tied1, svfloat16_t, + z0 = svsub_f16_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_m_tied2, svfloat16_t, + z0 = svsub_f16_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_m_untied, svfloat16_t, + z0 = svsub_f16_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fsub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svsub_n_f16_m (p0, z0, d4), + z0 = svsub_m (p0, z0, d4)) + +/* +** sub_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svsub_n_f16_m (p0, z1, d4), + z0 = svsub_m (p0, z1, d4)) + +/* +** sub_1_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_m_tied1, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_m_untied, svfloat16_t, + z0 = svsub_n_f16_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_0p5_f16_m_tied1: +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_m_tied1, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, 0.5), + z0 = svsub_m (p0, z0, 0.5)) + +/* +** sub_0p5_f16_m_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_m_untied, svfloat16_t, + z0 = svsub_n_f16_m (p0, z1, 0.5), + z0 = svsub_m (p0, z1, 0.5)) + +/* +** sub_m1_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_m_tied1, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, -1), + z0 = svsub_m (p0, z0, -1)) + +/* +** sub_m1_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_m_untied, svfloat16_t, + z0 = svsub_n_f16_m (p0, z1, -1), + z0 = svsub_m (p0, z1, -1)) + +/* +** sub_m0p5_f16_m_tied1: +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_m_tied1, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, -0.5), + z0 = svsub_m (p0, z0, -0.5)) + +/* +** sub_m0p5_f16_m_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_m_untied, svfloat16_t, + z0 = svsub_n_f16_m (p0, z1, -0.5), + z0 = svsub_m (p0, z1, -0.5)) + +/* +** sub_m2_f16_m: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f16_m, svfloat16_t, + z0 = svsub_n_f16_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_z_tied1, svfloat16_t, + z0 = svsub_f16_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_z_tied2, svfloat16_t, + z0 = svsub_f16_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f16_z_untied, svfloat16_t, + z0 = svsub_f16_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svsub_n_f16_z (p0, z0, d4), + z0 = svsub_z (p0, z0, d4)) + +/* +** sub_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svsub_n_f16_z (p0, z1, d4), + z0 = svsub_z (p0, z1, d4)) + +/* +** sub_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_z_tied1, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_z_untied, svfloat16_t, + z0 = svsub_n_f16_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_z_tied1, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, 0.5), + z0 = svsub_z (p0, z0, 0.5)) + +/* +** sub_0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_z_untied, svfloat16_t, + z0 = svsub_n_f16_z (p0, z1, 0.5), + z0 = svsub_z (p0, z1, 0.5)) + +/* +** sub_m1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_z_tied1, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, -1), + z0 = svsub_z (p0, z0, -1)) + +/* +** sub_m1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_z_untied, svfloat16_t, + z0 = svsub_n_f16_z (p0, z1, -1), + z0 = svsub_z (p0, z1, -1)) + +/* +** sub_m0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_z_tied1, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, -0.5), + z0 = svsub_z (p0, z0, -0.5)) + +/* +** sub_m0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_z_untied, svfloat16_t, + z0 = svsub_n_f16_z (p0, z1, -0.5), + z0 = svsub_z (p0, z1, -0.5)) + +/* +** sub_m2_f16_z: +** fmov (z[0-9]+\.h), #2\.0(?:e\+0)? 
+** movprfx z0\.h, p0/z, z0\.h +** fadd z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f16_z, svfloat16_t, + z0 = svsub_n_f16_z (p0, z0, -2), + z0 = svsub_z (p0, z0, -2)) + +/* +** sub_f16_x_tied1: +** fsub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_x_tied1, svfloat16_t, + z0 = svsub_f16_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_f16_x_tied2: +** fsub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_x_tied2, svfloat16_t, + z0 = svsub_f16_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_f16_x_untied: +** fsub z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sub_f16_x_untied, svfloat16_t, + z0 = svsub_f16_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fsub z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svsub_n_f16_x (p0, z0, d4), + z0 = svsub_x (p0, z0, d4)) + +/* +** sub_h4_f16_x_untied: +** mov (z[0-9]+\.h), h4 +** fsub z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svsub_n_f16_x (p0, z1, d4), + z0 = svsub_x (p0, z1, d4)) + +/* +** sub_1_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_f16_x_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_0p5_f16_x_tied1: +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, 0.5), + z0 = svsub_x (p0, z0, 0.5)) + +/* +** sub_0p5_f16_x_untied: +** movprfx z0, z1 +** fsub z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, 0.5), + z0 = svsub_x (p0, z1, 0.5)) + +/* +** sub_m1_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m1_f16_x_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, -1), + z0 = svsub_x (p0, z1, -1)) + +/* +** sub_m0p5_f16_x_tied1: +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, -0.5), + z0 = svsub_x (p0, z0, -0.5)) + +/* +** sub_m0p5_f16_x_untied: +** movprfx z0, z1 +** fadd z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, -0.5), + z0 = svsub_x (p0, z1, -0.5)) + +/* +** sub_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? +** fadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (sub_2_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (p0, z0, 2), + z0 = svsub_x (p0, z0, 2)) + +/* +** sub_2_f16_x_untied: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
+** fadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (sub_2_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (p0, z1, 2), + z0 = svsub_x (p0, z1, 2)) + +/* +** ptrue_sub_f16_x_tied1: +** fsub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f16_x_tied1, svfloat16_t, + z0 = svsub_f16_x (svptrue_b16 (), z0, z1), + z0 = svsub_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_sub_f16_x_tied2: +** fsub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f16_x_tied2, svfloat16_t, + z0 = svsub_f16_x (svptrue_b16 (), z1, z0), + z0 = svsub_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_sub_f16_x_untied: +** fsub z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f16_x_untied, svfloat16_t, + z0 = svsub_f16_x (svptrue_b16 (), z1, z2), + z0 = svsub_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_sub_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svsub_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_sub_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svsub_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_sub_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svsub_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_sub_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svsub_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_sub_m1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, -1), + z0 = svsub_x (svptrue_b16 (), z0, -1)) + +/* +** ptrue_sub_m1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, -1), + z0 = svsub_x (svptrue_b16 (), z1, -1)) + +/* +** ptrue_sub_m0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, -0.5), + z0 = svsub_x (svptrue_b16 (), z0, -0.5)) + +/* +** ptrue_sub_m0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, -0.5), + z0 = svsub_x (svptrue_b16 (), z1, -0.5)) + +/* +** ptrue_sub_2_f16_x_tied1: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? +** fadd z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f16_x_tied1, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z0, 2), + z0 = svsub_x (svptrue_b16 (), z0, 2)) + +/* +** ptrue_sub_2_f16_x_untied: +** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)? 
+** fadd z0\.h, (z1\.h, \1|\1, z1\.h) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f16_x_untied, svfloat16_t, + z0 = svsub_n_f16_x (svptrue_b16 (), z1, 2), + z0 = svsub_x (svptrue_b16 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32.c new file mode 100644 index 000000000..05be52bad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32.c @@ -0,0 +1,577 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_m_tied1, svfloat32_t, + z0 = svsub_f32_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_m_tied2, svfloat32_t, + z0 = svsub_f32_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_m_untied, svfloat32_t, + z0 = svsub_f32_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fsub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_m_tied1, svfloat32_t, float, + z0 = svsub_n_f32_m (p0, z0, d4), + z0 = svsub_m (p0, z0, d4)) + +/* +** sub_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_m_untied, svfloat32_t, float, + z0 = svsub_n_f32_m (p0, z1, d4), + z0 = svsub_m (p0, z1, d4)) + +/* +** sub_1_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_m_tied1, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_m_untied, svfloat32_t, + z0 = svsub_n_f32_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_0p5_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_m_tied1, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, 0.5), + z0 = svsub_m (p0, z0, 0.5)) + +/* +** sub_0p5_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_m_untied, svfloat32_t, + z0 = svsub_n_f32_m (p0, z1, 0.5), + z0 = svsub_m (p0, z1, 0.5)) + +/* +** sub_m1_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_m_tied1, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, -1), + z0 = svsub_m (p0, z0, -1)) + +/* +** sub_m1_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_m_untied, svfloat32_t, + z0 = svsub_n_f32_m (p0, z1, -1), + z0 = svsub_m (p0, z1, -1)) + +/* +** sub_m0p5_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_m_tied1, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, -0.5), + z0 = svsub_m (p0, z0, -0.5)) + +/* +** sub_m0p5_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_m_untied, svfloat32_t, + z0 = svsub_n_f32_m (p0, z1, -0.5), + z0 = svsub_m (p0, z1, -0.5)) + +/* +** sub_m2_f32_m: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f32_m, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_z_tied1, svfloat32_t, + z0 = svsub_f32_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_z_tied2, svfloat32_t, + z0 = svsub_f32_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f32_z_untied, svfloat32_t, + z0 = svsub_f32_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_z_tied1, svfloat32_t, float, + z0 = svsub_n_f32_z (p0, z0, d4), + z0 = svsub_z (p0, z0, d4)) + +/* +** sub_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_z_untied, svfloat32_t, float, + z0 = svsub_n_f32_z (p0, z1, d4), + z0 = svsub_z (p0, z1, d4)) + +/* +** sub_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_z_tied1, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_z_untied, svfloat32_t, + z0 = svsub_n_f32_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_z_tied1, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, 0.5), + z0 = svsub_z (p0, z0, 0.5)) + +/* +** sub_0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_z_untied, svfloat32_t, + z0 = svsub_n_f32_z (p0, z1, 0.5), + z0 = svsub_z (p0, z1, 0.5)) + +/* +** sub_m1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_z_tied1, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, -1), + z0 = svsub_z (p0, z0, -1)) + +/* +** sub_m1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_z_untied, svfloat32_t, + z0 = svsub_n_f32_z (p0, z1, -1), + z0 = svsub_z (p0, z1, -1)) + +/* +** sub_m0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_z_tied1, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, -0.5), + z0 = svsub_z (p0, z0, -0.5)) + +/* +** sub_m0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_z_untied, svfloat32_t, + z0 = svsub_n_f32_z (p0, z1, -0.5), + z0 = svsub_z (p0, z1, -0.5)) + +/* +** sub_m2_f32_z: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f32_z, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, -2), + z0 = svsub_z (p0, z0, -2)) + +/* +** sub_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_x_tied1, svfloat32_t, + z0 = svsub_f32_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_f32_x_tied2: +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_x_tied2, svfloat32_t, + z0 = svsub_f32_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_f32_x_untied: +** ( +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f32_x_untied, svfloat32_t, + z0 = svsub_f32_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fsub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_x_tied1, svfloat32_t, float, + z0 = svsub_n_f32_x (p0, z0, d4), + z0 = svsub_x (p0, z0, d4)) + +/* +** sub_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_x_untied, svfloat32_t, float, + z0 = svsub_n_f32_x (p0, z1, d4), + z0 = svsub_x (p0, z1, d4)) + +/* +** sub_1_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_f32_x_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_0p5_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, 0.5), + z0 = svsub_x (p0, z0, 0.5)) + +/* +** sub_0p5_f32_x_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, 0.5), + z0 = svsub_x (p0, z1, 0.5)) + +/* +** sub_m1_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m1_f32_x_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, -1), + z0 = svsub_x (p0, z1, -1)) + +/* +** sub_m0p5_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, -0.5), + z0 = svsub_x (p0, z0, -0.5)) + +/* +** sub_m0p5_f32_x_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, -0.5), + z0 = svsub_x (p0, z1, -0.5)) + +/* +** sub_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_2_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, 2), + z0 = svsub_x (p0, z0, 2)) + +/* +** sub_2_f32_x_untied: +** fmov z0\.s, #-2\.0(?:e\+0)? 
+** fadd z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_2_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, 2), + z0 = svsub_x (p0, z1, 2)) + +/* +** ptrue_sub_f32_x_tied1: +** fsub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f32_x_tied1, svfloat32_t, + z0 = svsub_f32_x (svptrue_b32 (), z0, z1), + z0 = svsub_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_sub_f32_x_tied2: +** fsub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f32_x_tied2, svfloat32_t, + z0 = svsub_f32_x (svptrue_b32 (), z1, z0), + z0 = svsub_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_sub_f32_x_untied: +** fsub z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f32_x_untied, svfloat32_t, + z0 = svsub_f32_x (svptrue_b32 (), z1, z2), + z0 = svsub_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_sub_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svsub_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_sub_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svsub_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_sub_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svsub_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_sub_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svsub_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_sub_m1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, -1), + z0 = svsub_x (svptrue_b32 (), z0, -1)) + +/* +** ptrue_sub_m1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, -1), + z0 = svsub_x (svptrue_b32 (), z1, -1)) + +/* +** ptrue_sub_m0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, -0.5), + z0 = svsub_x (svptrue_b32 (), z0, -0.5)) + +/* +** ptrue_sub_m0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, -0.5), + z0 = svsub_x (svptrue_b32 (), z1, -0.5)) + +/* +** ptrue_sub_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? +** fadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svsub_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_sub_2_f32_x_untied: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
+** fadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svsub_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32_notrap.c new file mode 100644 index 000000000..eb79a253a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f32_notrap.c @@ -0,0 +1,572 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_m_tied1, svfloat32_t, + z0 = svsub_f32_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_m_tied2, svfloat32_t, + z0 = svsub_f32_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_m_untied, svfloat32_t, + z0 = svsub_f32_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fsub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_m_tied1, svfloat32_t, float, + z0 = svsub_n_f32_m (p0, z0, d4), + z0 = svsub_m (p0, z0, d4)) + +/* +** sub_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_m_untied, svfloat32_t, float, + z0 = svsub_n_f32_m (p0, z1, d4), + z0 = svsub_m (p0, z1, d4)) + +/* +** sub_1_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_m_tied1, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_m_untied, svfloat32_t, + z0 = svsub_n_f32_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_0p5_f32_m_tied1: +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_m_tied1, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, 0.5), + z0 = svsub_m (p0, z0, 0.5)) + +/* +** sub_0p5_f32_m_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_m_untied, svfloat32_t, + z0 = svsub_n_f32_m (p0, z1, 0.5), + z0 = svsub_m (p0, z1, 0.5)) + +/* +** sub_m1_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_m_tied1, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, -1), + z0 = svsub_m (p0, z0, -1)) + +/* +** sub_m1_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_m_untied, svfloat32_t, + z0 = svsub_n_f32_m (p0, z1, -1), + z0 = svsub_m (p0, z1, -1)) + +/* +** sub_m0p5_f32_m_tied1: +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_m_tied1, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, -0.5), + z0 = svsub_m (p0, z0, -0.5)) + +/* +** sub_m0p5_f32_m_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_m_untied, svfloat32_t, + z0 = svsub_n_f32_m (p0, z1, -0.5), + z0 = svsub_m (p0, z1, -0.5)) + +/* +** sub_m2_f32_m: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f32_m, svfloat32_t, + z0 = svsub_n_f32_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_z_tied1, svfloat32_t, + z0 = svsub_f32_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_z_tied2, svfloat32_t, + z0 = svsub_f32_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f32_z_untied, svfloat32_t, + z0 = svsub_f32_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_z_tied1, svfloat32_t, float, + z0 = svsub_n_f32_z (p0, z0, d4), + z0 = svsub_z (p0, z0, d4)) + +/* +** sub_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_z_untied, svfloat32_t, float, + z0 = svsub_n_f32_z (p0, z1, d4), + z0 = svsub_z (p0, z1, d4)) + +/* +** sub_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_z_tied1, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_z_untied, svfloat32_t, + z0 = svsub_n_f32_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_z_tied1, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, 0.5), + z0 = svsub_z (p0, z0, 0.5)) + +/* +** sub_0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_z_untied, svfloat32_t, + z0 = svsub_n_f32_z (p0, z1, 0.5), + z0 = svsub_z (p0, z1, 0.5)) + +/* +** sub_m1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_z_tied1, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, -1), + z0 = svsub_z (p0, z0, -1)) + +/* +** sub_m1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_z_untied, svfloat32_t, + z0 = svsub_n_f32_z (p0, z1, -1), + z0 = svsub_z (p0, z1, -1)) + +/* +** sub_m0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_z_tied1, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, -0.5), + z0 = svsub_z (p0, z0, -0.5)) + +/* +** sub_m0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_z_untied, svfloat32_t, + z0 = svsub_n_f32_z (p0, z1, -0.5), + z0 = svsub_z (p0, z1, -0.5)) + +/* +** sub_m2_f32_z: +** fmov (z[0-9]+\.s), #2\.0(?:e\+0)? 
+** movprfx z0\.s, p0/z, z0\.s +** fadd z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f32_z, svfloat32_t, + z0 = svsub_n_f32_z (p0, z0, -2), + z0 = svsub_z (p0, z0, -2)) + +/* +** sub_f32_x_tied1: +** fsub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_x_tied1, svfloat32_t, + z0 = svsub_f32_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_f32_x_tied2: +** fsub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_x_tied2, svfloat32_t, + z0 = svsub_f32_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_f32_x_untied: +** fsub z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sub_f32_x_untied, svfloat32_t, + z0 = svsub_f32_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fsub z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_x_tied1, svfloat32_t, float, + z0 = svsub_n_f32_x (p0, z0, d4), + z0 = svsub_x (p0, z0, d4)) + +/* +** sub_s4_f32_x_untied: +** mov (z[0-9]+\.s), s4 +** fsub z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_s4_f32_x_untied, svfloat32_t, float, + z0 = svsub_n_f32_x (p0, z1, d4), + z0 = svsub_x (p0, z1, d4)) + +/* +** sub_1_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_f32_x_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_0p5_f32_x_tied1: +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, 0.5), + z0 = svsub_x (p0, z0, 0.5)) + +/* +** sub_0p5_f32_x_untied: +** movprfx z0, z1 +** fsub z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, 0.5), + z0 = svsub_x (p0, z1, 0.5)) + +/* +** sub_m1_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m1_f32_x_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, -1), + z0 = svsub_x (p0, z1, -1)) + +/* +** sub_m0p5_f32_x_tied1: +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, -0.5), + z0 = svsub_x (p0, z0, -0.5)) + +/* +** sub_m0p5_f32_x_untied: +** movprfx z0, z1 +** fadd z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, -0.5), + z0 = svsub_x (p0, z1, -0.5)) + +/* +** sub_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? +** fadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_2_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (p0, z0, 2), + z0 = svsub_x (p0, z0, 2)) + +/* +** sub_2_f32_x_untied: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
+** fadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_2_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (p0, z1, 2), + z0 = svsub_x (p0, z1, 2)) + +/* +** ptrue_sub_f32_x_tied1: +** fsub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f32_x_tied1, svfloat32_t, + z0 = svsub_f32_x (svptrue_b32 (), z0, z1), + z0 = svsub_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_sub_f32_x_tied2: +** fsub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f32_x_tied2, svfloat32_t, + z0 = svsub_f32_x (svptrue_b32 (), z1, z0), + z0 = svsub_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_sub_f32_x_untied: +** fsub z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f32_x_untied, svfloat32_t, + z0 = svsub_f32_x (svptrue_b32 (), z1, z2), + z0 = svsub_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_sub_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svsub_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_sub_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svsub_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_sub_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svsub_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_sub_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svsub_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_sub_m1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, -1), + z0 = svsub_x (svptrue_b32 (), z0, -1)) + +/* +** ptrue_sub_m1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, -1), + z0 = svsub_x (svptrue_b32 (), z1, -1)) + +/* +** ptrue_sub_m0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, -0.5), + z0 = svsub_x (svptrue_b32 (), z0, -0.5)) + +/* +** ptrue_sub_m0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, -0.5), + z0 = svsub_x (svptrue_b32 (), z1, -0.5)) + +/* +** ptrue_sub_2_f32_x_tied1: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? +** fadd z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f32_x_tied1, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z0, 2), + z0 = svsub_x (svptrue_b32 (), z0, 2)) + +/* +** ptrue_sub_2_f32_x_untied: +** fmov (z[0-9]+\.s), #-2\.0(?:e\+0)? 
+** fadd z0\.s, (z1\.s, \1|\1, z1\.s) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f32_x_untied, svfloat32_t, + z0 = svsub_n_f32_x (svptrue_b32 (), z1, 2), + z0 = svsub_x (svptrue_b32 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64.c new file mode 100644 index 000000000..2179382c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64.c @@ -0,0 +1,577 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_m_tied1, svfloat64_t, + z0 = svsub_f64_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_f64_m_tied2, svfloat64_t, + z0 = svsub_f64_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_m_untied, svfloat64_t, + z0 = svsub_f64_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_m_tied1, svfloat64_t, double, + z0 = svsub_n_f64_m (p0, z0, d4), + z0 = svsub_m (p0, z0, d4)) + +/* +** sub_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_m_untied, svfloat64_t, double, + z0 = svsub_n_f64_m (p0, z1, d4), + z0 = svsub_m (p0, z1, d4)) + +/* +** sub_1_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_m_tied1, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_m_untied, svfloat64_t, + z0 = svsub_n_f64_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_0p5_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_m_tied1, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, 0.5), + z0 = svsub_m (p0, z0, 0.5)) + +/* +** sub_0p5_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_m_untied, svfloat64_t, + z0 = svsub_n_f64_m (p0, z1, 0.5), + z0 = svsub_m (p0, z1, 0.5)) + +/* +** sub_m1_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_m_tied1, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, -1), + z0 = svsub_m (p0, z0, -1)) + +/* +** sub_m1_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_m_untied, svfloat64_t, + z0 = svsub_n_f64_m (p0, z1, -1), + z0 = svsub_m (p0, z1, -1)) + +/* +** sub_m0p5_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_m_tied1, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, -0.5), + z0 = svsub_m (p0, z0, -0.5)) + +/* +** sub_m0p5_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_m_untied, svfloat64_t, + z0 = svsub_n_f64_m (p0, z1, -0.5), + z0 = svsub_m (p0, z1, -0.5)) + +/* +** sub_m2_f64_m: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f64_m, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_z_tied1, svfloat64_t, + z0 = svsub_f64_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_z_tied2, svfloat64_t, + z0 = svsub_f64_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f64_z_untied, svfloat64_t, + z0 = svsub_f64_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_z_tied1, svfloat64_t, double, + z0 = svsub_n_f64_z (p0, z0, d4), + z0 = svsub_z (p0, z0, d4)) + +/* +** sub_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_z_untied, svfloat64_t, double, + z0 = svsub_n_f64_z (p0, z1, d4), + z0 = svsub_z (p0, z1, d4)) + +/* +** sub_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_z_tied1, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_z_untied, svfloat64_t, + z0 = svsub_n_f64_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_z_tied1, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, 0.5), + z0 = svsub_z (p0, z0, 0.5)) + +/* +** sub_0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_z_untied, svfloat64_t, + z0 = svsub_n_f64_z (p0, z1, 0.5), + z0 = svsub_z (p0, z1, 0.5)) + +/* +** sub_m1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_z_tied1, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, -1), + z0 = svsub_z (p0, z0, -1)) + +/* +** sub_m1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_z_untied, svfloat64_t, + z0 = svsub_n_f64_z (p0, z1, -1), + z0 = svsub_z (p0, z1, -1)) + +/* +** sub_m0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_z_tied1, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, -0.5), + z0 = svsub_z (p0, z0, -0.5)) + +/* +** sub_m0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_z_untied, svfloat64_t, + z0 = svsub_n_f64_z (p0, z1, -0.5), + z0 = svsub_z (p0, z1, -0.5)) + +/* +** sub_m2_f64_z: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f64_z, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, -2), + z0 = svsub_z (p0, z0, -2)) + +/* +** sub_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_x_tied1, svfloat64_t, + z0 = svsub_f64_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_f64_x_tied2: +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_x_tied2, svfloat64_t, + z0 = svsub_f64_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_f64_x_untied: +** ( +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f64_x_untied, svfloat64_t, + z0 = svsub_f64_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_x_tied1, svfloat64_t, double, + z0 = svsub_n_f64_x (p0, z0, d4), + z0 = svsub_x (p0, z0, d4)) + +/* +** sub_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_x_untied, svfloat64_t, double, + z0 = svsub_n_f64_x (p0, z1, d4), + z0 = svsub_x (p0, z1, d4)) + +/* +** sub_1_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_f64_x_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_0p5_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, 0.5), + z0 = svsub_x (p0, z0, 0.5)) + +/* +** sub_0p5_f64_x_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, 0.5), + z0 = svsub_x (p0, z1, 0.5)) + +/* +** sub_m1_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m1_f64_x_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, -1), + z0 = svsub_x (p0, z1, -1)) + +/* +** sub_m0p5_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, -0.5), + z0 = svsub_x (p0, z0, -0.5)) + +/* +** sub_m0p5_f64_x_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, -0.5), + z0 = svsub_x (p0, z1, -0.5)) + +/* +** sub_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_2_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, 2), + z0 = svsub_x (p0, z0, 2)) + +/* +** sub_2_f64_x_untied: +** fmov z0\.d, #-2\.0(?:e\+0)? 
+** fadd z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_2_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, 2), + z0 = svsub_x (p0, z1, 2)) + +/* +** ptrue_sub_f64_x_tied1: +** fsub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f64_x_tied1, svfloat64_t, + z0 = svsub_f64_x (svptrue_b64 (), z0, z1), + z0 = svsub_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_sub_f64_x_tied2: +** fsub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f64_x_tied2, svfloat64_t, + z0 = svsub_f64_x (svptrue_b64 (), z1, z0), + z0 = svsub_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_sub_f64_x_untied: +** fsub z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f64_x_untied, svfloat64_t, + z0 = svsub_f64_x (svptrue_b64 (), z1, z2), + z0 = svsub_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_sub_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svsub_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_sub_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svsub_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_sub_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svsub_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_sub_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svsub_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_sub_m1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, -1), + z0 = svsub_x (svptrue_b64 (), z0, -1)) + +/* +** ptrue_sub_m1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, -1), + z0 = svsub_x (svptrue_b64 (), z1, -1)) + +/* +** ptrue_sub_m0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, -0.5), + z0 = svsub_x (svptrue_b64 (), z0, -0.5)) + +/* +** ptrue_sub_m0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, -0.5), + z0 = svsub_x (svptrue_b64 (), z1, -0.5)) + +/* +** ptrue_sub_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? +** fadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svsub_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_sub_2_f64_x_untied: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
+** fadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svsub_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64_notrap.c new file mode 100644 index 000000000..bd89f44b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_f64_notrap.c @@ -0,0 +1,572 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_m_tied1, svfloat64_t, + z0 = svsub_f64_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_f64_m_tied2, svfloat64_t, + z0 = svsub_f64_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_m_untied, svfloat64_t, + z0 = svsub_f64_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_m_tied1, svfloat64_t, double, + z0 = svsub_n_f64_m (p0, z0, d4), + z0 = svsub_m (p0, z0, d4)) + +/* +** sub_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_m_untied, svfloat64_t, double, + z0 = svsub_n_f64_m (p0, z1, d4), + z0 = svsub_m (p0, z1, d4)) + +/* +** sub_1_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_m_tied1, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_m_untied, svfloat64_t, + z0 = svsub_n_f64_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_0p5_f64_m_tied1: +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_m_tied1, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, 0.5), + z0 = svsub_m (p0, z0, 0.5)) + +/* +** sub_0p5_f64_m_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_m_untied, svfloat64_t, + z0 = svsub_n_f64_m (p0, z1, 0.5), + z0 = svsub_m (p0, z1, 0.5)) + +/* +** sub_m1_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_m_tied1, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, -1), + z0 = svsub_m (p0, z0, -1)) + +/* +** sub_m1_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_m_untied, svfloat64_t, + z0 = svsub_n_f64_m (p0, z1, -1), + z0 = svsub_m (p0, z1, -1)) + +/* +** sub_m0p5_f64_m_tied1: +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_m_tied1, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, -0.5), + z0 = svsub_m (p0, z0, -0.5)) + +/* +** sub_m0p5_f64_m_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_m_untied, svfloat64_t, + z0 = svsub_n_f64_m (p0, z1, -0.5), + z0 = svsub_m (p0, z1, -0.5)) + +/* +** sub_m2_f64_m: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f64_m, svfloat64_t, + z0 = svsub_n_f64_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_z_tied1, svfloat64_t, + z0 = svsub_f64_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_z_tied2, svfloat64_t, + z0 = svsub_f64_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_f64_z_untied, svfloat64_t, + z0 = svsub_f64_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_z_tied1, svfloat64_t, double, + z0 = svsub_n_f64_z (p0, z0, d4), + z0 = svsub_z (p0, z0, d4)) + +/* +** sub_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_z_untied, svfloat64_t, double, + z0 = svsub_n_f64_z (p0, z1, d4), + z0 = svsub_z (p0, z1, d4)) + +/* +** sub_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_z_tied1, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_z_untied, svfloat64_t, + z0 = svsub_n_f64_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_z_tied1, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, 0.5), + z0 = svsub_z (p0, z0, 0.5)) + +/* +** sub_0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_z_untied, svfloat64_t, + z0 = svsub_n_f64_z (p0, z1, 0.5), + z0 = svsub_z (p0, z1, 0.5)) + +/* +** sub_m1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_z_tied1, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, -1), + z0 = svsub_z (p0, z0, -1)) + +/* +** sub_m1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_z_untied, svfloat64_t, + z0 = svsub_n_f64_z (p0, z1, -1), + z0 = svsub_z (p0, z1, -1)) + +/* +** sub_m0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_z_tied1, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, -0.5), + z0 = svsub_z (p0, z0, -0.5)) + +/* +** sub_m0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_z_untied, svfloat64_t, + z0 = svsub_n_f64_z (p0, z1, -0.5), + z0 = svsub_z (p0, z1, -0.5)) + +/* +** sub_m2_f64_z: +** fmov (z[0-9]+\.d), #2\.0(?:e\+0)? 
+** movprfx z0\.d, p0/z, z0\.d +** fadd z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_f64_z, svfloat64_t, + z0 = svsub_n_f64_z (p0, z0, -2), + z0 = svsub_z (p0, z0, -2)) + +/* +** sub_f64_x_tied1: +** fsub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_x_tied1, svfloat64_t, + z0 = svsub_f64_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_f64_x_tied2: +** fsub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_x_tied2, svfloat64_t, + z0 = svsub_f64_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_f64_x_untied: +** fsub z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sub_f64_x_untied, svfloat64_t, + z0 = svsub_f64_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fsub z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_x_tied1, svfloat64_t, double, + z0 = svsub_n_f64_x (p0, z0, d4), + z0 = svsub_x (p0, z0, d4)) + +/* +** sub_d4_f64_x_untied: +** mov (z[0-9]+\.d), d4 +** fsub z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (sub_d4_f64_x_untied, svfloat64_t, double, + z0 = svsub_n_f64_x (p0, z1, d4), + z0 = svsub_x (p0, z1, d4)) + +/* +** sub_1_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_f64_x_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_1_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_0p5_f64_x_tied1: +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, 0.5), + z0 = svsub_x (p0, z0, 0.5)) + +/* +** sub_0p5_f64_x_untied: +** movprfx z0, z1 +** fsub z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_0p5_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, 0.5), + z0 = svsub_x (p0, z1, 0.5)) + +/* +** sub_m1_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m1_f64_x_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, -1), + z0 = svsub_x (p0, z1, -1)) + +/* +** sub_m0p5_f64_x_tied1: +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, -0.5), + z0 = svsub_x (p0, z0, -0.5)) + +/* +** sub_m0p5_f64_x_untied: +** movprfx z0, z1 +** fadd z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (sub_m0p5_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, -0.5), + z0 = svsub_x (p0, z1, -0.5)) + +/* +** sub_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? +** fadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_2_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (p0, z0, 2), + z0 = svsub_x (p0, z0, 2)) + +/* +** sub_2_f64_x_untied: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
+** fadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_2_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (p0, z1, 2), + z0 = svsub_x (p0, z1, 2)) + +/* +** ptrue_sub_f64_x_tied1: +** fsub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f64_x_tied1, svfloat64_t, + z0 = svsub_f64_x (svptrue_b64 (), z0, z1), + z0 = svsub_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_sub_f64_x_tied2: +** fsub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f64_x_tied2, svfloat64_t, + z0 = svsub_f64_x (svptrue_b64 (), z1, z0), + z0 = svsub_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_sub_f64_x_untied: +** fsub z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_f64_x_untied, svfloat64_t, + z0 = svsub_f64_x (svptrue_b64 (), z1, z2), + z0 = svsub_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_sub_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svsub_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_sub_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_1_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svsub_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_sub_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svsub_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_sub_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_0p5_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svsub_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_sub_m1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, -1), + z0 = svsub_x (svptrue_b64 (), z0, -1)) + +/* +** ptrue_sub_m1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m1_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, -1), + z0 = svsub_x (svptrue_b64 (), z1, -1)) + +/* +** ptrue_sub_m0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, -0.5), + z0 = svsub_x (svptrue_b64 (), z0, -0.5)) + +/* +** ptrue_sub_m0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_m0p5_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, -0.5), + z0 = svsub_x (svptrue_b64 (), z1, -0.5)) + +/* +** ptrue_sub_2_f64_x_tied1: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? +** fadd z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f64_x_tied1, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z0, 2), + z0 = svsub_x (svptrue_b64 (), z0, 2)) + +/* +** ptrue_sub_2_f64_x_untied: +** fmov (z[0-9]+\.d), #-2\.0(?:e\+0)? 
+** fadd z0\.d, (z1\.d, \1|\1, z1\.d) +** ret +*/ +TEST_UNIFORM_Z (ptrue_sub_2_f64_x_untied, svfloat64_t, + z0 = svsub_n_f64_x (svptrue_b64 (), z1, 2), + z0 = svsub_x (svptrue_b64 (), z1, 2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s16.c new file mode 100644 index 000000000..aea8ea2b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s16.c @@ -0,0 +1,377 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_s16_m_tied1: +** sub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_s16_m_tied1, svint16_t, + z0 = svsub_s16_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sub z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_s16_m_tied2, svint16_t, + z0 = svsub_s16_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_s16_m_untied: +** movprfx z0, z1 +** sub z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sub_s16_m_untied, svint16_t, + z0 = svsub_s16_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svsub_n_s16_m (p0, z0, x0), + z0 = svsub_m (p0, z0, x0)) + +/* +** sub_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** sub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s16_m_untied, svint16_t, int16_t, + z0 = svsub_n_s16_m (p0, z1, x0), + z0 = svsub_m (p0, z1, x0)) + +/* +** sub_1_s16_m_tied1: +** mov (z[0-9]+)\.b, #-1 +** add z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_1_s16_m_tied1, svint16_t, + z0 = svsub_n_s16_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+)\.b, #-1 +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_1_s16_m_untied, svint16_t, + z0 = svsub_n_s16_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_m2_s16_m: +** mov (z[0-9]+\.h), #2 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_s16_m, svint16_t, + z0 = svsub_n_s16_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** sub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_s16_z_tied1, svint16_t, + z0 = svsub_s16_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** subr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_s16_z_tied2, svint16_t, + z0 = svsub_s16_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** sub z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** subr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_s16_z_untied, svint16_t, + z0 = svsub_s16_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** sub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svsub_n_s16_z (p0, z0, x0), + z0 = svsub_z (p0, z0, x0)) + +/* +** sub_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** sub z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** subr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s16_z_untied, svint16_t, int16_t, + z0 = 
svsub_n_s16_z (p0, z1, x0), + z0 = svsub_z (p0, z1, x0)) + +/* +** sub_1_s16_z_tied1: +** mov (z[0-9]+)\.b, #-1 +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_1_s16_z_tied1, svint16_t, + z0 = svsub_n_s16_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_s16_z_untied: +** mov (z[0-9]+)\.b, #-1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** add z0\.h, p0/m, z0\.h, \1\.h +** | +** movprfx z0\.h, p0/z, \1\.h +** add z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_1_s16_z_untied, svint16_t, + z0 = svsub_n_s16_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_s16_x_tied1: +** sub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_s16_x_tied1, svint16_t, + z0 = svsub_s16_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_s16_x_tied2: +** sub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sub_s16_x_tied2, svint16_t, + z0 = svsub_s16_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_s16_x_untied: +** sub z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sub_s16_x_untied, svint16_t, + z0 = svsub_s16_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svsub_n_s16_x (p0, z0, x0), + z0 = svsub_x (p0, z0, x0)) + +/* +** sub_w0_s16_x_untied: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s16_x_untied, svint16_t, int16_t, + z0 = svsub_n_s16_x (p0, z1, x0), + z0 = svsub_x (p0, z1, x0)) + +/* +** sub_1_s16_x_tied1: +** sub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s16_x_tied1, svint16_t, + z0 = svsub_n_s16_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_s16_x_untied: +** movprfx z0, z1 +** sub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s16_x_untied, svint16_t, + z0 = svsub_n_s16_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_127_s16_x: +** sub z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_127_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, 127), + z0 = svsub_x (p0, z0, 127)) + +/* +** sub_128_s16_x: +** sub z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_128_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, 128), + z0 = svsub_x (p0, z0, 128)) + +/* +** sub_255_s16_x: +** sub z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_255_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, 255), + z0 = svsub_x (p0, z0, 255)) + +/* +** sub_256_s16_x: +** add z0\.h, z0\.h, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_256_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, 256), + z0 = svsub_x (p0, z0, 256)) + +/* +** sub_257_s16_x: +** mov (z[0-9]+\.h), #-257 +** add z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (sub_257_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, 257), + z0 = svsub_x (p0, z0, 257)) + +/* +** sub_512_s16_x: +** add z0\.h, z0\.h, #65024 +** ret +*/ +TEST_UNIFORM_Z (sub_512_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, 512), + z0 = svsub_x (p0, z0, 512)) + +/* +** sub_65280_s16_x: +** add z0\.h, z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_65280_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, 0xff00), + z0 = svsub_x (p0, z0, 0xff00)) + +/* +** sub_m1_s16_x: +** add z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m127_s16_x: +** add z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_m127_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, -127), + z0 = svsub_x (p0, z0, 
-127)) + +/* +** sub_m128_s16_x: +** add z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_m128_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, -128), + z0 = svsub_x (p0, z0, -128)) + +/* +** sub_m255_s16_x: +** add z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_m255_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, -255), + z0 = svsub_x (p0, z0, -255)) + +/* +** sub_m256_s16_x: +** add z0\.h, z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_m256_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, -256), + z0 = svsub_x (p0, z0, -256)) + +/* +** sub_m257_s16_x: +** mov (z[0-9]+)\.b, #1 +** add z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (sub_m257_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, -257), + z0 = svsub_x (p0, z0, -257)) + +/* +** sub_m512_s16_x: +** add z0\.h, z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_m512_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, -512), + z0 = svsub_x (p0, z0, -512)) + +/* +** sub_m32768_s16_x: +** add z0\.h, z0\.h, #32768 +** ret +*/ +TEST_UNIFORM_Z (sub_m32768_s16_x, svint16_t, + z0 = svsub_n_s16_x (p0, z0, -0x8000), + z0 = svsub_x (p0, z0, -0x8000)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s32.c new file mode 100644 index 000000000..db6f3df90 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s32.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_s32_m_tied1: +** sub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_s32_m_tied1, svint32_t, + z0 = svsub_s32_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sub z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_s32_m_tied2, svint32_t, + z0 = svsub_s32_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_s32_m_untied: +** movprfx z0, z1 +** sub z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sub_s32_m_untied, svint32_t, + z0 = svsub_s32_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svsub_n_s32_m (p0, z0, x0), + z0 = svsub_m (p0, z0, x0)) + +/* +** sub_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** sub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s32_m_untied, svint32_t, int32_t, + z0 = svsub_n_s32_m (p0, z1, x0), + z0 = svsub_m (p0, z1, x0)) + +/* +** sub_1_s32_m_tied1: +** mov (z[0-9]+)\.b, #-1 +** add z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_1_s32_m_tied1, svint32_t, + z0 = svsub_n_s32_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+)\.b, #-1 +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_1_s32_m_untied, svint32_t, + z0 = svsub_n_s32_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_m2_s32_m: +** mov (z[0-9]+\.s), #2 +** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_s32_m, svint32_t, + z0 = svsub_n_s32_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** sub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_s32_z_tied1, svint32_t, + z0 = svsub_s32_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_s32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** subr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z 
(sub_s32_z_tied2, svint32_t, + z0 = svsub_s32_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** sub z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** subr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_s32_z_untied, svint32_t, + z0 = svsub_s32_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** sub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svsub_n_s32_z (p0, z0, x0), + z0 = svsub_z (p0, z0, x0)) + +/* +** sub_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** sub z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** subr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s32_z_untied, svint32_t, int32_t, + z0 = svsub_n_s32_z (p0, z1, x0), + z0 = svsub_z (p0, z1, x0)) + +/* +** sub_1_s32_z_tied1: +** mov (z[0-9]+)\.b, #-1 +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_1_s32_z_tied1, svint32_t, + z0 = svsub_n_s32_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_s32_z_untied: +** mov (z[0-9]+)\.b, #-1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** add z0\.s, p0/m, z0\.s, \1\.s +** | +** movprfx z0\.s, p0/z, \1\.s +** add z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_1_s32_z_untied, svint32_t, + z0 = svsub_n_s32_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_s32_x_tied1: +** sub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_s32_x_tied1, svint32_t, + z0 = svsub_s32_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_s32_x_tied2: +** sub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (sub_s32_x_tied2, svint32_t, + z0 = svsub_s32_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_s32_x_untied: +** sub z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sub_s32_x_untied, svint32_t, + z0 = svsub_s32_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svsub_n_s32_x (p0, z0, x0), + z0 = svsub_x (p0, z0, x0)) + +/* +** sub_w0_s32_x_untied: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s32_x_untied, svint32_t, int32_t, + z0 = svsub_n_s32_x (p0, z1, x0), + z0 = svsub_x (p0, z1, x0)) + +/* +** sub_1_s32_x_tied1: +** sub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s32_x_tied1, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_s32_x_untied: +** movprfx z0, z1 +** sub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s32_x_untied, svint32_t, + z0 = svsub_n_s32_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_127_s32_x: +** sub z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_127_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 127), + z0 = svsub_x (p0, z0, 127)) + +/* +** sub_128_s32_x: +** sub z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_128_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 128), + z0 = svsub_x (p0, z0, 128)) + +/* +** sub_255_s32_x: +** sub z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_255_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 255), + z0 = svsub_x (p0, z0, 255)) + +/* +** sub_256_s32_x: +** sub z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_256_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 256), + z0 = svsub_x (p0, z0, 
256)) + +/* +** sub_511_s32_x: +** mov (z[0-9]+\.s), #-511 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_511_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 511), + z0 = svsub_x (p0, z0, 511)) + +/* +** sub_512_s32_x: +** sub z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_512_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 512), + z0 = svsub_x (p0, z0, 512)) + +/* +** sub_65280_s32_x: +** sub z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_65280_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 0xff00), + z0 = svsub_x (p0, z0, 0xff00)) + +/* +** sub_65535_s32_x: +** mov (z[0-9]+\.s), #-65535 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_65535_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 65535), + z0 = svsub_x (p0, z0, 65535)) + +/* +** sub_65536_s32_x: +** mov (z[0-9]+\.s), #-65536 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_65536_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, 65536), + z0 = svsub_x (p0, z0, 65536)) + +/* +** sub_m1_s32_x: +** add z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m127_s32_x: +** add z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_m127_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -127), + z0 = svsub_x (p0, z0, -127)) + +/* +** sub_m128_s32_x: +** add z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_m128_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -128), + z0 = svsub_x (p0, z0, -128)) + +/* +** sub_m255_s32_x: +** add z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_m255_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -255), + z0 = svsub_x (p0, z0, -255)) + +/* +** sub_m256_s32_x: +** add z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_m256_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -256), + z0 = svsub_x (p0, z0, -256)) + +/* +** sub_m511_s32_x: +** mov (z[0-9]+\.s), #511 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_m511_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -511), + z0 = svsub_x (p0, z0, -511)) + +/* +** sub_m512_s32_x: +** add z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_m512_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -512), + z0 = svsub_x (p0, z0, -512)) + +/* +** sub_m32768_s32_x: +** add z0\.s, z0\.s, #32768 +** ret +*/ +TEST_UNIFORM_Z (sub_m32768_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -0x8000), + z0 = svsub_x (p0, z0, -0x8000)) + +/* +** sub_m65280_s32_x: +** add z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_m65280_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -0xff00), + z0 = svsub_x (p0, z0, -0xff00)) + +/* +** sub_m65535_s32_x: +** mov (z[0-9]+\.s), #65535 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_m65535_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -65535), + z0 = svsub_x (p0, z0, -65535)) + +/* +** sub_m65536_s32_x: +** mov (z[0-9]+\.s), #65536 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_m65536_s32_x, svint32_t, + z0 = svsub_n_s32_x (p0, z0, -65536), + z0 = svsub_x (p0, z0, -65536)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s64.c new file mode 100644 index 000000000..b9184c3a8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s64.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_s64_m_tied1: +** sub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ 
+TEST_UNIFORM_Z (sub_s64_m_tied1, svint64_t, + z0 = svsub_s64_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** sub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_s64_m_tied2, svint64_t, + z0 = svsub_s64_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_s64_m_untied: +** movprfx z0, z1 +** sub z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sub_s64_m_untied, svint64_t, + z0 = svsub_s64_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svsub_n_s64_m (p0, z0, x0), + z0 = svsub_m (p0, z0, x0)) + +/* +** sub_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** sub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_s64_m_untied, svint64_t, int64_t, + z0 = svsub_n_s64_m (p0, z1, x0), + z0 = svsub_m (p0, z1, x0)) + +/* +** sub_1_s64_m_tied1: +** mov (z[0-9]+)\.b, #-1 +** add z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_1_s64_m_tied1, svint64_t, + z0 = svsub_n_s64_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+)\.b, #-1 +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_1_s64_m_untied, svint64_t, + z0 = svsub_n_s64_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_m2_s64_m: +** mov (z[0-9]+\.d), #2 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_s64_m, svint64_t, + z0 = svsub_n_s64_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** sub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_s64_z_tied1, svint64_t, + z0 = svsub_s64_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** subr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_s64_z_tied2, svint64_t, + z0 = svsub_s64_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** sub z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** subr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_s64_z_untied, svint64_t, + z0 = svsub_s64_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** sub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svsub_n_s64_z (p0, z0, x0), + z0 = svsub_z (p0, z0, x0)) + +/* +** sub_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** sub z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** subr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_s64_z_untied, svint64_t, int64_t, + z0 = svsub_n_s64_z (p0, z1, x0), + z0 = svsub_z (p0, z1, x0)) + +/* +** sub_1_s64_z_tied1: +** mov (z[0-9]+)\.b, #-1 +** movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_1_s64_z_tied1, svint64_t, + z0 = svsub_n_s64_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_s64_z_untied: +** mov (z[0-9]+)\.b, #-1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** add z0\.d, p0/m, z0\.d, \1\.d +** | +** movprfx z0\.d, p0/z, \1\.d +** add z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_1_s64_z_untied, svint64_t, + z0 = svsub_n_s64_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_s64_x_tied1: +** sub 
z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_s64_x_tied1, svint64_t, + z0 = svsub_s64_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_s64_x_tied2: +** sub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (sub_s64_x_tied2, svint64_t, + z0 = svsub_s64_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_s64_x_untied: +** sub z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sub_s64_x_untied, svint64_t, + z0 = svsub_s64_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svsub_n_s64_x (p0, z0, x0), + z0 = svsub_x (p0, z0, x0)) + +/* +** sub_x0_s64_x_untied: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_s64_x_untied, svint64_t, int64_t, + z0 = svsub_n_s64_x (p0, z1, x0), + z0 = svsub_x (p0, z1, x0)) + +/* +** sub_1_s64_x_tied1: +** sub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s64_x_tied1, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_s64_x_untied: +** movprfx z0, z1 +** sub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s64_x_untied, svint64_t, + z0 = svsub_n_s64_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_127_s64_x: +** sub z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_127_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 127), + z0 = svsub_x (p0, z0, 127)) + +/* +** sub_128_s64_x: +** sub z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_128_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 128), + z0 = svsub_x (p0, z0, 128)) + +/* +** sub_255_s64_x: +** sub z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_255_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 255), + z0 = svsub_x (p0, z0, 255)) + +/* +** sub_256_s64_x: +** sub z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_256_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 256), + z0 = svsub_x (p0, z0, 256)) + +/* +** sub_511_s64_x: +** mov (z[0-9]+\.d), #-511 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_511_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 511), + z0 = svsub_x (p0, z0, 511)) + +/* +** sub_512_s64_x: +** sub z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_512_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 512), + z0 = svsub_x (p0, z0, 512)) + +/* +** sub_65280_s64_x: +** sub z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_65280_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 0xff00), + z0 = svsub_x (p0, z0, 0xff00)) + +/* +** sub_65535_s64_x: +** mov (z[0-9]+\.d), #-65535 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_65535_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 65535), + z0 = svsub_x (p0, z0, 65535)) + +/* +** sub_65536_s64_x: +** mov (z[0-9]+\.d), #-65536 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_65536_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, 65536), + z0 = svsub_x (p0, z0, 65536)) + +/* +** sub_m1_s64_x: +** add z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m127_s64_x: +** add z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_m127_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -127), + z0 = svsub_x (p0, z0, -127)) + +/* +** sub_m128_s64_x: +** add z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_m128_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -128), + z0 = svsub_x (p0, z0, -128)) + +/* +** sub_m255_s64_x: +** add z0\.d, 
z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_m255_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -255), + z0 = svsub_x (p0, z0, -255)) + +/* +** sub_m256_s64_x: +** add z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_m256_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -256), + z0 = svsub_x (p0, z0, -256)) + +/* +** sub_m511_s64_x: +** mov (z[0-9]+\.d), #511 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_m511_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -511), + z0 = svsub_x (p0, z0, -511)) + +/* +** sub_m512_s64_x: +** add z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_m512_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -512), + z0 = svsub_x (p0, z0, -512)) + +/* +** sub_m32768_s64_x: +** add z0\.d, z0\.d, #32768 +** ret +*/ +TEST_UNIFORM_Z (sub_m32768_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -0x8000), + z0 = svsub_x (p0, z0, -0x8000)) + +/* +** sub_m65280_s64_x: +** add z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_m65280_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -0xff00), + z0 = svsub_x (p0, z0, -0xff00)) + +/* +** sub_m65535_s64_x: +** mov (z[0-9]+\.d), #65535 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_m65535_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -65535), + z0 = svsub_x (p0, z0, -65535)) + +/* +** sub_m65536_s64_x: +** mov (z[0-9]+\.d), #65536 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_m65536_s64_x, svint64_t, + z0 = svsub_n_s64_x (p0, z0, -65536), + z0 = svsub_x (p0, z0, -65536)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s8.c new file mode 100644 index 000000000..0d7ba99aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_s8.c @@ -0,0 +1,294 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_s8_m_tied1: +** sub z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_s8_m_tied1, svint8_t, + z0 = svsub_s8_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sub z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_s8_m_tied2, svint8_t, + z0 = svsub_s8_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_s8_m_untied: +** movprfx z0, z1 +** sub z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (sub_s8_m_untied, svint8_t, + z0 = svsub_s8_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svsub_n_s8_m (p0, z0, x0), + z0 = svsub_m (p0, z0, x0)) + +/* +** sub_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** sub z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s8_m_untied, svint8_t, int8_t, + z0 = svsub_n_s8_m (p0, z1, x0), + z0 = svsub_m (p0, z1, x0)) + +/* +** sub_1_s8_m_tied1: +** mov (z[0-9]+\.b), #-1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s8_m_tied1, svint8_t, + z0 = svsub_n_s8_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #-1 +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s8_m_untied, svint8_t, + z0 = svsub_n_s8_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_m1_s8_m: +** mov (z[0-9]+\.b), #1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_s8_m, svint8_t, + z0 = svsub_n_s8_m 
(p0, z0, -1), + z0 = svsub_m (p0, z0, -1)) + +/* +** sub_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** sub z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_s8_z_tied1, svint8_t, + z0 = svsub_s8_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** subr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_s8_z_tied2, svint8_t, + z0 = svsub_s8_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** sub z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** subr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_s8_z_untied, svint8_t, + z0 = svsub_s8_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** sub z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svsub_n_s8_z (p0, z0, x0), + z0 = svsub_z (p0, z0, x0)) + +/* +** sub_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** sub z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** subr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s8_z_untied, svint8_t, int8_t, + z0 = svsub_n_s8_z (p0, z1, x0), + z0 = svsub_z (p0, z1, x0)) + +/* +** sub_1_s8_z_tied1: +** mov (z[0-9]+\.b), #-1 +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s8_z_tied1, svint8_t, + z0 = svsub_n_s8_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_s8_z_untied: +** mov (z[0-9]+\.b), #-1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** add z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** add z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_1_s8_z_untied, svint8_t, + z0 = svsub_n_s8_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_s8_x_tied1: +** sub z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_s8_x_tied1, svint8_t, + z0 = svsub_s8_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_s8_x_tied2: +** sub z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (sub_s8_x_tied2, svint8_t, + z0 = svsub_s8_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_s8_x_untied: +** sub z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (sub_s8_x_untied, svint8_t, + z0 = svsub_s8_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svsub_n_s8_x (p0, z0, x0), + z0 = svsub_x (p0, z0, x0)) + +/* +** sub_w0_s8_x_untied: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_s8_x_untied, svint8_t, int8_t, + z0 = svsub_n_s8_x (p0, z1, x0), + z0 = svsub_x (p0, z1, x0)) + +/* +** sub_1_s8_x_tied1: +** add z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s8_x_tied1, svint8_t, + z0 = svsub_n_s8_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_s8_x_untied: +** movprfx z0, z1 +** add z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_1_s8_x_untied, svint8_t, + z0 = svsub_n_s8_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_127_s8_x: +** add z0\.b, z0\.b, #129 +** ret +*/ +TEST_UNIFORM_Z (sub_127_s8_x, svint8_t, + z0 = svsub_n_s8_x (p0, z0, 127), + z0 = svsub_x (p0, z0, 127)) + +/* +** sub_128_s8_x: +** add z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_128_s8_x, svint8_t, + z0 = svsub_n_s8_x (p0, z0, 128), + z0 = svsub_x (p0, z0, 128)) + +/* +** sub_255_s8_x: +** add z0\.b, 
z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_255_s8_x, svint8_t, + z0 = svsub_n_s8_x (p0, z0, 255), + z0 = svsub_x (p0, z0, 255)) + +/* +** sub_m1_s8_x: +** add z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_s8_x, svint8_t, + z0 = svsub_n_s8_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m127_s8_x: +** add z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_m127_s8_x, svint8_t, + z0 = svsub_n_s8_x (p0, z0, -127), + z0 = svsub_x (p0, z0, -127)) + +/* +** sub_m128_s8_x: +** add z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_m128_s8_x, svint8_t, + z0 = svsub_n_s8_x (p0, z0, -128), + z0 = svsub_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u16.c new file mode 100644 index 000000000..89620e159 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u16.c @@ -0,0 +1,377 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_u16_m_tied1: +** sub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_u16_m_tied1, svuint16_t, + z0 = svsub_u16_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sub z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_u16_m_tied2, svuint16_t, + z0 = svsub_u16_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_u16_m_untied: +** movprfx z0, z1 +** sub z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sub_u16_m_untied, svuint16_t, + z0 = svsub_u16_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svsub_n_u16_m (p0, z0, x0), + z0 = svsub_m (p0, z0, x0)) + +/* +** sub_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** sub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svsub_n_u16_m (p0, z1, x0), + z0 = svsub_m (p0, z1, x0)) + +/* +** sub_1_u16_m_tied1: +** mov (z[0-9]+)\.b, #-1 +** add z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_1_u16_m_tied1, svuint16_t, + z0 = svsub_n_u16_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+)\.b, #-1 +** movprfx z0, z1 +** add z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_1_u16_m_untied, svuint16_t, + z0 = svsub_n_u16_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_m2_u16_m: +** mov (z[0-9]+\.h), #2 +** add z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_u16_m, svuint16_t, + z0 = svsub_n_u16_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** sub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_u16_z_tied1, svuint16_t, + z0 = svsub_u16_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** subr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_u16_z_tied2, svuint16_t, + z0 = svsub_u16_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** sub z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** subr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_u16_z_untied, svuint16_t, + z0 = svsub_u16_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, 
p0/z, z0\.h +** sub z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svsub_n_u16_z (p0, z0, x0), + z0 = svsub_z (p0, z0, x0)) + +/* +** sub_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** sub z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** subr z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svsub_n_u16_z (p0, z1, x0), + z0 = svsub_z (p0, z1, x0)) + +/* +** sub_1_u16_z_tied1: +** mov (z[0-9]+)\.b, #-1 +** movprfx z0\.h, p0/z, z0\.h +** add z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_1_u16_z_tied1, svuint16_t, + z0 = svsub_n_u16_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_u16_z_untied: +** mov (z[0-9]+)\.b, #-1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** add z0\.h, p0/m, z0\.h, \1\.h +** | +** movprfx z0\.h, p0/z, \1\.h +** add z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_1_u16_z_untied, svuint16_t, + z0 = svsub_n_u16_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_u16_x_tied1: +** sub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (sub_u16_x_tied1, svuint16_t, + z0 = svsub_u16_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_u16_x_tied2: +** sub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (sub_u16_x_tied2, svuint16_t, + z0 = svsub_u16_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_u16_x_untied: +** sub z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (sub_u16_x_untied, svuint16_t, + z0 = svsub_u16_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svsub_n_u16_x (p0, z0, x0), + z0 = svsub_x (p0, z0, x0)) + +/* +** sub_w0_u16_x_untied: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, z1\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svsub_n_u16_x (p0, z1, x0), + z0 = svsub_x (p0, z1, x0)) + +/* +** sub_1_u16_x_tied1: +** sub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u16_x_tied1, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_u16_x_untied: +** movprfx z0, z1 +** sub z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u16_x_untied, svuint16_t, + z0 = svsub_n_u16_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_127_u16_x: +** sub z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_127_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, 127), + z0 = svsub_x (p0, z0, 127)) + +/* +** sub_128_u16_x: +** sub z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_128_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, 128), + z0 = svsub_x (p0, z0, 128)) + +/* +** sub_255_u16_x: +** sub z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_255_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, 255), + z0 = svsub_x (p0, z0, 255)) + +/* +** sub_256_u16_x: +** add z0\.h, z0\.h, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_256_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, 256), + z0 = svsub_x (p0, z0, 256)) + +/* +** sub_257_u16_x: +** mov (z[0-9]+\.h), #-257 +** add z0\.h, (z0\.h, \1|\1, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (sub_257_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, 257), + z0 = svsub_x (p0, z0, 257)) + +/* +** sub_512_u16_x: +** add z0\.h, z0\.h, #65024 +** ret +*/ +TEST_UNIFORM_Z (sub_512_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, 512), + z0 = svsub_x (p0, z0, 512)) + +/* +** sub_65280_u16_x: +** add z0\.h, z0\.h, 
#256 +** ret +*/ +TEST_UNIFORM_Z (sub_65280_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, 0xff00), + z0 = svsub_x (p0, z0, 0xff00)) + +/* +** sub_m1_u16_x: +** add z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m127_u16_x: +** add z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_m127_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, -127), + z0 = svsub_x (p0, z0, -127)) + +/* +** sub_m128_u16_x: +** add z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_m128_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, -128), + z0 = svsub_x (p0, z0, -128)) + +/* +** sub_m255_u16_x: +** add z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_m255_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, -255), + z0 = svsub_x (p0, z0, -255)) + +/* +** sub_m256_u16_x: +** add z0\.h, z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_m256_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, -256), + z0 = svsub_x (p0, z0, -256)) + +/* +** sub_m257_u16_x: +** mov (z[0-9]+)\.b, #1 +** add z0\.h, (z0\.h, \1\.h|\1\.h, z0\.h) +** ret +*/ +TEST_UNIFORM_Z (sub_m257_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, -257), + z0 = svsub_x (p0, z0, -257)) + +/* +** sub_m512_u16_x: +** add z0\.h, z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_m512_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, -512), + z0 = svsub_x (p0, z0, -512)) + +/* +** sub_m32768_u16_x: +** add z0\.h, z0\.h, #32768 +** ret +*/ +TEST_UNIFORM_Z (sub_m32768_u16_x, svuint16_t, + z0 = svsub_n_u16_x (p0, z0, -0x8000), + z0 = svsub_x (p0, z0, -0x8000)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u32.c new file mode 100644 index 000000000..c4b405d4d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u32.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_u32_m_tied1: +** sub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_u32_m_tied1, svuint32_t, + z0 = svsub_u32_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sub z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_u32_m_tied2, svuint32_t, + z0 = svsub_u32_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_u32_m_untied: +** movprfx z0, z1 +** sub z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sub_u32_m_untied, svuint32_t, + z0 = svsub_u32_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svsub_n_u32_m (p0, z0, x0), + z0 = svsub_m (p0, z0, x0)) + +/* +** sub_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** sub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svsub_n_u32_m (p0, z1, x0), + z0 = svsub_m (p0, z1, x0)) + +/* +** sub_1_u32_m_tied1: +** mov (z[0-9]+)\.b, #-1 +** add z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_1_u32_m_tied1, svuint32_t, + z0 = svsub_n_u32_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+)\.b, #-1 +** movprfx z0, z1 +** add z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_1_u32_m_untied, svuint32_t, + z0 = svsub_n_u32_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_m2_u32_m: +** mov (z[0-9]+\.s), #2 
+** add z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_u32_m, svuint32_t, + z0 = svsub_n_u32_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** sub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_u32_z_tied1, svuint32_t, + z0 = svsub_u32_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** subr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_u32_z_tied2, svuint32_t, + z0 = svsub_u32_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** sub z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** subr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_u32_z_untied, svuint32_t, + z0 = svsub_u32_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** sub z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svsub_n_u32_z (p0, z0, x0), + z0 = svsub_z (p0, z0, x0)) + +/* +** sub_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** sub z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** subr z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svsub_n_u32_z (p0, z1, x0), + z0 = svsub_z (p0, z1, x0)) + +/* +** sub_1_u32_z_tied1: +** mov (z[0-9]+)\.b, #-1 +** movprfx z0\.s, p0/z, z0\.s +** add z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_1_u32_z_tied1, svuint32_t, + z0 = svsub_n_u32_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_u32_z_untied: +** mov (z[0-9]+)\.b, #-1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** add z0\.s, p0/m, z0\.s, \1\.s +** | +** movprfx z0\.s, p0/z, \1\.s +** add z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_1_u32_z_untied, svuint32_t, + z0 = svsub_n_u32_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_u32_x_tied1: +** sub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (sub_u32_x_tied1, svuint32_t, + z0 = svsub_u32_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_u32_x_tied2: +** sub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (sub_u32_x_tied2, svuint32_t, + z0 = svsub_u32_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_u32_x_untied: +** sub z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (sub_u32_x_untied, svuint32_t, + z0 = svsub_u32_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svsub_n_u32_x (p0, z0, x0), + z0 = svsub_x (p0, z0, x0)) + +/* +** sub_w0_u32_x_untied: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, z1\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svsub_n_u32_x (p0, z1, x0), + z0 = svsub_x (p0, z1, x0)) + +/* +** sub_1_u32_x_tied1: +** sub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u32_x_tied1, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_u32_x_untied: +** movprfx z0, z1 +** sub z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u32_x_untied, svuint32_t, + z0 = svsub_n_u32_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_127_u32_x: +** sub z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_127_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 127), + z0 = svsub_x (p0, z0, 127)) + +/* +** 
sub_128_u32_x: +** sub z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_128_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 128), + z0 = svsub_x (p0, z0, 128)) + +/* +** sub_255_u32_x: +** sub z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_255_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 255), + z0 = svsub_x (p0, z0, 255)) + +/* +** sub_256_u32_x: +** sub z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_256_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 256), + z0 = svsub_x (p0, z0, 256)) + +/* +** sub_511_u32_x: +** mov (z[0-9]+\.s), #-511 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_511_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 511), + z0 = svsub_x (p0, z0, 511)) + +/* +** sub_512_u32_x: +** sub z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_512_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 512), + z0 = svsub_x (p0, z0, 512)) + +/* +** sub_65280_u32_x: +** sub z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_65280_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 0xff00), + z0 = svsub_x (p0, z0, 0xff00)) + +/* +** sub_65535_u32_x: +** mov (z[0-9]+\.s), #-65535 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_65535_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 65535), + z0 = svsub_x (p0, z0, 65535)) + +/* +** sub_65536_u32_x: +** mov (z[0-9]+\.s), #-65536 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_65536_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, 65536), + z0 = svsub_x (p0, z0, 65536)) + +/* +** sub_m1_u32_x: +** add z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m127_u32_x: +** add z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_m127_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -127), + z0 = svsub_x (p0, z0, -127)) + +/* +** sub_m128_u32_x: +** add z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_m128_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -128), + z0 = svsub_x (p0, z0, -128)) + +/* +** sub_m255_u32_x: +** add z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_m255_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -255), + z0 = svsub_x (p0, z0, -255)) + +/* +** sub_m256_u32_x: +** add z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_m256_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -256), + z0 = svsub_x (p0, z0, -256)) + +/* +** sub_m511_u32_x: +** mov (z[0-9]+\.s), #511 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_m511_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -511), + z0 = svsub_x (p0, z0, -511)) + +/* +** sub_m512_u32_x: +** add z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_m512_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -512), + z0 = svsub_x (p0, z0, -512)) + +/* +** sub_m32768_u32_x: +** add z0\.s, z0\.s, #32768 +** ret +*/ +TEST_UNIFORM_Z (sub_m32768_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -0x8000), + z0 = svsub_x (p0, z0, -0x8000)) + +/* +** sub_m65280_u32_x: +** add z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_m65280_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -0xff00), + z0 = svsub_x (p0, z0, -0xff00)) + +/* +** sub_m65535_u32_x: +** mov (z[0-9]+\.s), #65535 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_m65535_u32_x, svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -65535), + z0 = svsub_x (p0, z0, -65535)) + +/* +** sub_m65536_u32_x: +** mov (z[0-9]+\.s), #65536 +** add z0\.s, (z0\.s, \1|\1, z0\.s) +** ret +*/ +TEST_UNIFORM_Z (sub_m65536_u32_x, 
svuint32_t, + z0 = svsub_n_u32_x (p0, z0, -65536), + z0 = svsub_x (p0, z0, -65536)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u64.c new file mode 100644 index 000000000..fb7f7173a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u64.c @@ -0,0 +1,426 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_u64_m_tied1: +** sub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_u64_m_tied1, svuint64_t, + z0 = svsub_u64_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** sub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_u64_m_tied2, svuint64_t, + z0 = svsub_u64_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_u64_m_untied: +** movprfx z0, z1 +** sub z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sub_u64_m_untied, svuint64_t, + z0 = svsub_u64_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svsub_n_u64_m (p0, z0, x0), + z0 = svsub_m (p0, z0, x0)) + +/* +** sub_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** sub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svsub_n_u64_m (p0, z1, x0), + z0 = svsub_m (p0, z1, x0)) + +/* +** sub_1_u64_m_tied1: +** mov (z[0-9]+)\.b, #-1 +** add z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_1_u64_m_tied1, svuint64_t, + z0 = svsub_n_u64_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+)\.b, #-1 +** movprfx z0, z1 +** add z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_1_u64_m_untied, svuint64_t, + z0 = svsub_n_u64_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_m2_u64_m: +** mov (z[0-9]+\.d), #2 +** add z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m2_u64_m, svuint64_t, + z0 = svsub_n_u64_m (p0, z0, -2), + z0 = svsub_m (p0, z0, -2)) + +/* +** sub_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** sub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_u64_z_tied1, svuint64_t, + z0 = svsub_u64_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** subr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_u64_z_tied2, svuint64_t, + z0 = svsub_u64_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** sub z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** subr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_u64_z_untied, svuint64_t, + z0 = svsub_u64_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** sub z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svsub_n_u64_z (p0, z0, x0), + z0 = svsub_z (p0, z0, x0)) + +/* +** sub_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** sub z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** subr z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svsub_n_u64_z (p0, z1, x0), + z0 = svsub_z (p0, z1, x0)) + +/* +** sub_1_u64_z_tied1: +** mov (z[0-9]+)\.b, #-1 +** 
movprfx z0\.d, p0/z, z0\.d +** add z0\.d, p0/m, z0\.d, \1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_1_u64_z_tied1, svuint64_t, + z0 = svsub_n_u64_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_u64_z_untied: +** mov (z[0-9]+)\.b, #-1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** add z0\.d, p0/m, z0\.d, \1\.d +** | +** movprfx z0\.d, p0/z, \1\.d +** add z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_1_u64_z_untied, svuint64_t, + z0 = svsub_n_u64_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_u64_x_tied1: +** sub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (sub_u64_x_tied1, svuint64_t, + z0 = svsub_u64_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_u64_x_tied2: +** sub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (sub_u64_x_tied2, svuint64_t, + z0 = svsub_u64_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_u64_x_untied: +** sub z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (sub_u64_x_untied, svuint64_t, + z0 = svsub_u64_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svsub_n_u64_x (p0, z0, x0), + z0 = svsub_x (p0, z0, x0)) + +/* +** sub_x0_u64_x_untied: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, z1\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svsub_n_u64_x (p0, z1, x0), + z0 = svsub_x (p0, z1, x0)) + +/* +** sub_1_u64_x_tied1: +** sub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u64_x_tied1, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_u64_x_untied: +** movprfx z0, z1 +** sub z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u64_x_untied, svuint64_t, + z0 = svsub_n_u64_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_127_u64_x: +** sub z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_127_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 127), + z0 = svsub_x (p0, z0, 127)) + +/* +** sub_128_u64_x: +** sub z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_128_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 128), + z0 = svsub_x (p0, z0, 128)) + +/* +** sub_255_u64_x: +** sub z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_255_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 255), + z0 = svsub_x (p0, z0, 255)) + +/* +** sub_256_u64_x: +** sub z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_256_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 256), + z0 = svsub_x (p0, z0, 256)) + +/* +** sub_511_u64_x: +** mov (z[0-9]+\.d), #-511 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_511_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 511), + z0 = svsub_x (p0, z0, 511)) + +/* +** sub_512_u64_x: +** sub z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_512_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 512), + z0 = svsub_x (p0, z0, 512)) + +/* +** sub_65280_u64_x: +** sub z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_65280_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 0xff00), + z0 = svsub_x (p0, z0, 0xff00)) + +/* +** sub_65535_u64_x: +** mov (z[0-9]+\.d), #-65535 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_65535_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 65535), + z0 = svsub_x (p0, z0, 65535)) + +/* +** sub_65536_u64_x: +** mov (z[0-9]+\.d), #-65536 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_65536_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, 65536), + z0 = svsub_x (p0, z0, 
65536)) + +/* +** sub_m1_u64_x: +** add z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m127_u64_x: +** add z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_m127_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -127), + z0 = svsub_x (p0, z0, -127)) + +/* +** sub_m128_u64_x: +** add z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_m128_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -128), + z0 = svsub_x (p0, z0, -128)) + +/* +** sub_m255_u64_x: +** add z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_m255_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -255), + z0 = svsub_x (p0, z0, -255)) + +/* +** sub_m256_u64_x: +** add z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (sub_m256_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -256), + z0 = svsub_x (p0, z0, -256)) + +/* +** sub_m511_u64_x: +** mov (z[0-9]+\.d), #511 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_m511_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -511), + z0 = svsub_x (p0, z0, -511)) + +/* +** sub_m512_u64_x: +** add z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (sub_m512_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -512), + z0 = svsub_x (p0, z0, -512)) + +/* +** sub_m32768_u64_x: +** add z0\.d, z0\.d, #32768 +** ret +*/ +TEST_UNIFORM_Z (sub_m32768_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -0x8000), + z0 = svsub_x (p0, z0, -0x8000)) + +/* +** sub_m65280_u64_x: +** add z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (sub_m65280_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -0xff00), + z0 = svsub_x (p0, z0, -0xff00)) + +/* +** sub_m65535_u64_x: +** mov (z[0-9]+\.d), #65535 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_m65535_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -65535), + z0 = svsub_x (p0, z0, -65535)) + +/* +** sub_m65536_u64_x: +** mov (z[0-9]+\.d), #65536 +** add z0\.d, (z0\.d, \1|\1, z0\.d) +** ret +*/ +TEST_UNIFORM_Z (sub_m65536_u64_x, svuint64_t, + z0 = svsub_n_u64_x (p0, z0, -65536), + z0 = svsub_x (p0, z0, -65536)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u8.c new file mode 100644 index 000000000..455204191 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sub_u8.c @@ -0,0 +1,294 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sub_u8_m_tied1: +** sub z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_u8_m_tied1, svuint8_t, + z0 = svsub_u8_m (p0, z0, z1), + z0 = svsub_m (p0, z0, z1)) + +/* +** sub_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** sub z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_u8_m_tied2, svuint8_t, + z0 = svsub_u8_m (p0, z1, z0), + z0 = svsub_m (p0, z1, z0)) + +/* +** sub_u8_m_untied: +** movprfx z0, z1 +** sub z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (sub_u8_m_untied, svuint8_t, + z0 = svsub_u8_m (p0, z1, z2), + z0 = svsub_m (p0, z1, z2)) + +/* +** sub_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svsub_n_u8_m (p0, z0, x0), + z0 = svsub_m (p0, z0, x0)) + +/* +** sub_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** sub z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u8_m_untied, svuint8_t, uint8_t, + z0 = svsub_n_u8_m (p0, z1, x0), + z0 = svsub_m (p0, z1, x0)) + +/* +** 
sub_1_u8_m_tied1: +** mov (z[0-9]+\.b), #-1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u8_m_tied1, svuint8_t, + z0 = svsub_n_u8_m (p0, z0, 1), + z0 = svsub_m (p0, z0, 1)) + +/* +** sub_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #-1 +** movprfx z0, z1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u8_m_untied, svuint8_t, + z0 = svsub_n_u8_m (p0, z1, 1), + z0 = svsub_m (p0, z1, 1)) + +/* +** sub_m1_u8_m: +** mov (z[0-9]+\.b), #1 +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_u8_m, svuint8_t, + z0 = svsub_n_u8_m (p0, z0, -1), + z0 = svsub_m (p0, z0, -1)) + +/* +** sub_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** sub z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_u8_z_tied1, svuint8_t, + z0 = svsub_u8_z (p0, z0, z1), + z0 = svsub_z (p0, z0, z1)) + +/* +** sub_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** subr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_u8_z_tied2, svuint8_t, + z0 = svsub_u8_z (p0, z1, z0), + z0 = svsub_z (p0, z1, z0)) + +/* +** sub_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** sub z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** subr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_u8_z_untied, svuint8_t, + z0 = svsub_u8_z (p0, z1, z2), + z0 = svsub_z (p0, z1, z2)) + +/* +** sub_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** sub z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svsub_n_u8_z (p0, z0, x0), + z0 = svsub_z (p0, z0, x0)) + +/* +** sub_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** sub z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** subr z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svsub_n_u8_z (p0, z1, x0), + z0 = svsub_z (p0, z1, x0)) + +/* +** sub_1_u8_z_tied1: +** mov (z[0-9]+\.b), #-1 +** movprfx z0\.b, p0/z, z0\.b +** add z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u8_z_tied1, svuint8_t, + z0 = svsub_n_u8_z (p0, z0, 1), + z0 = svsub_z (p0, z0, 1)) + +/* +** sub_1_u8_z_untied: +** mov (z[0-9]+\.b), #-1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** add z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** add z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (sub_1_u8_z_untied, svuint8_t, + z0 = svsub_n_u8_z (p0, z1, 1), + z0 = svsub_z (p0, z1, 1)) + +/* +** sub_u8_x_tied1: +** sub z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (sub_u8_x_tied1, svuint8_t, + z0 = svsub_u8_x (p0, z0, z1), + z0 = svsub_x (p0, z0, z1)) + +/* +** sub_u8_x_tied2: +** sub z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (sub_u8_x_tied2, svuint8_t, + z0 = svsub_u8_x (p0, z1, z0), + z0 = svsub_x (p0, z1, z0)) + +/* +** sub_u8_x_untied: +** sub z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (sub_u8_x_untied, svuint8_t, + z0 = svsub_u8_x (p0, z1, z2), + z0 = svsub_x (p0, z1, z2)) + +/* +** sub_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svsub_n_u8_x (p0, z0, x0), + z0 = svsub_x (p0, z0, x0)) + +/* +** sub_w0_u8_x_untied: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, z1\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (sub_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svsub_n_u8_x (p0, z1, x0), + z0 = svsub_x (p0, z1, x0)) + +/* +** sub_1_u8_x_tied1: +** add z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u8_x_tied1, svuint8_t, + z0 = svsub_n_u8_x (p0, z0, 
1), + z0 = svsub_x (p0, z0, 1)) + +/* +** sub_1_u8_x_untied: +** movprfx z0, z1 +** add z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (sub_1_u8_x_untied, svuint8_t, + z0 = svsub_n_u8_x (p0, z1, 1), + z0 = svsub_x (p0, z1, 1)) + +/* +** sub_127_u8_x: +** add z0\.b, z0\.b, #129 +** ret +*/ +TEST_UNIFORM_Z (sub_127_u8_x, svuint8_t, + z0 = svsub_n_u8_x (p0, z0, 127), + z0 = svsub_x (p0, z0, 127)) + +/* +** sub_128_u8_x: +** add z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_128_u8_x, svuint8_t, + z0 = svsub_n_u8_x (p0, z0, 128), + z0 = svsub_x (p0, z0, 128)) + +/* +** sub_255_u8_x: +** add z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_255_u8_x, svuint8_t, + z0 = svsub_n_u8_x (p0, z0, 255), + z0 = svsub_x (p0, z0, 255)) + +/* +** sub_m1_u8_x: +** add z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (sub_m1_u8_x, svuint8_t, + z0 = svsub_n_u8_x (p0, z0, -1), + z0 = svsub_x (p0, z0, -1)) + +/* +** sub_m127_u8_x: +** add z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (sub_m127_u8_x, svuint8_t, + z0 = svsub_n_u8_x (p0, z0, -127), + z0 = svsub_x (p0, z0, -127)) + +/* +** sub_m128_u8_x: +** add z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (sub_m128_u8_x, svuint8_t, + z0 = svsub_n_u8_x (p0, z0, -128), + z0 = svsub_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16.c new file mode 100644 index 000000000..e14357db2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16.c @@ -0,0 +1,444 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_f16_m_tied1: +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_m_tied1, svfloat16_t, + z0 = svsubr_f16_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_m_tied2, svfloat16_t, + z0 = svsubr_f16_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_f16_m_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_m_untied, svfloat16_t, + z0 = svsubr_f16_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svsubr_n_f16_m (p0, z0, d4), + z0 = svsubr_m (p0, z0, d4)) + +/* +** subr_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svsubr_n_f16_m (p0, z1, d4), + z0 = svsubr_m (p0, z1, d4)) + +/* +** subr_1_f16_m_tied1: +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_m_tied1, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_f16_m_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_m_untied, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_0p5_f16_m_tied1: +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_m_tied1, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z0, 0.5), + z0 = svsubr_m (p0, z0, 0.5)) + +/* +** subr_0p5_f16_m_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_m_untied, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z1, 0.5), + z0 = svsubr_m (p0, z1, 0.5)) + 
+/* +** subr_m1_f16_m_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_m_tied1, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z0, -1), + z0 = svsubr_m (p0, z0, -1)) + +/* +** subr_m1_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_m_untied, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z1, -1), + z0 = svsubr_m (p0, z1, -1)) + +/* +** subr_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_z_tied1, svfloat16_t, + z0 = svsubr_f16_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_z_tied2, svfloat16_t, + z0 = svsubr_f16_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f16_z_untied, svfloat16_t, + z0 = svsubr_f16_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svsubr_n_f16_z (p0, z0, d4), + z0 = svsubr_z (p0, z0, d4)) + +/* +** subr_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svsubr_n_f16_z (p0, z1, d4), + z0 = svsubr_z (p0, z1, d4)) + +/* +** subr_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_z_tied1, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_z_untied, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_z_tied1, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z0, 0.5), + z0 = svsubr_z (p0, z0, 0.5)) + +/* +** subr_0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_z_untied, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z1, 0.5), + z0 = svsubr_z (p0, z1, 0.5)) + +/* +** subr_m1_f16_z_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_z_tied1, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z0, -1), + z0 = svsubr_z (p0, z0, -1)) + +/* +** subr_m1_f16_z_untied: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_z_untied, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z1, -1), + z0 = svsubr_z (p0, z1, -1)) + +/* +** subr_f16_x_tied1: +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_x_tied1, svfloat16_t, + z0 = svsubr_f16_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_f16_x_tied2: +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_x_tied2, svfloat16_t, + z0 = svsubr_f16_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_f16_x_untied: +** ( +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0, z2 +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f16_x_untied, svfloat16_t, + z0 = svsubr_f16_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svsubr_n_f16_x (p0, z0, d4), + z0 = svsubr_x (p0, z0, d4)) + +/* +** subr_h4_f16_x_untied: { xfail *-*-* } +** mov z0\.h, h4 +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svsubr_n_f16_x (p0, z1, d4), + z0 = svsubr_x (p0, z1, d4)) + +/* +** subr_1_f16_x_tied1: +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_f16_x_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_0p5_f16_x_tied1: +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z0, 0.5), + z0 = svsubr_x (p0, z0, 0.5)) + +/* +** subr_0p5_f16_x_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z1, 0.5), + z0 = svsubr_x (p0, z1, 0.5)) + +/* +** subr_m1_f16_x_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_f16_x_untied: +** fmov z0\.h, #-1\.0(?:e\+0)? +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) + +/* +** ptrue_subr_f16_x_tied1: +** fsub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f16_x_tied1, svfloat16_t, + z0 = svsubr_f16_x (svptrue_b16 (), z0, z1), + z0 = svsubr_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_subr_f16_x_tied2: +** fsub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f16_x_tied2, svfloat16_t, + z0 = svsubr_f16_x (svptrue_b16 (), z1, z0), + z0 = svsubr_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_subr_f16_x_untied: +** fsub z0\.h, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f16_x_untied, svfloat16_t, + z0 = svsubr_f16_x (svptrue_b16 (), z1, z2), + z0 = svsubr_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_subr_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svsubr_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_subr_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svsubr_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_subr_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svsubr_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_subr_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svsubr_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_subr_m1_f16_x_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsub z0\.h, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z0, -1), + z0 = svsubr_x (svptrue_b16 (), z0, -1)) + +/* +** ptrue_subr_m1_f16_x_untied: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsub z0\.h, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z1, -1), + z0 = svsubr_x (svptrue_b16 (), z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16_notrap.c new file mode 100644 index 000000000..a31ebd2ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f16_notrap.c @@ -0,0 +1,439 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_f16_m_tied1: +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_m_tied1, svfloat16_t, + z0 = svsubr_f16_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_f16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_m_tied2, svfloat16_t, + z0 = svsubr_f16_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_f16_m_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_m_untied, svfloat16_t, + z0 = svsubr_f16_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_h4_f16_m_tied1: +** mov (z[0-9]+\.h), h4 +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_m_tied1, svfloat16_t, __fp16, + z0 = svsubr_n_f16_m (p0, z0, d4), + z0 = svsubr_m (p0, z0, d4)) + +/* +** subr_h4_f16_m_untied: +** mov (z[0-9]+\.h), h4 +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_m_untied, svfloat16_t, __fp16, + z0 = svsubr_n_f16_m (p0, z1, d4), + z0 = svsubr_m (p0, z1, d4)) + +/* +** subr_1_f16_m_tied1: +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_m_tied1, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_f16_m_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_m_untied, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_0p5_f16_m_tied1: +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_m_tied1, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z0, 0.5), + z0 = svsubr_m (p0, 
z0, 0.5)) + +/* +** subr_0p5_f16_m_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_m_untied, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z1, 0.5), + z0 = svsubr_m (p0, z1, 0.5)) + +/* +** subr_m1_f16_m_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_m_tied1, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z0, -1), + z0 = svsubr_m (p0, z0, -1)) + +/* +** subr_m1_f16_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_m_untied, svfloat16_t, + z0 = svsubr_n_f16_m (p0, z1, -1), + z0 = svsubr_m (p0, z1, -1)) + +/* +** subr_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_z_tied1, svfloat16_t, + z0 = svsubr_f16_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_f16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_z_tied2, svfloat16_t, + z0 = svsubr_f16_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_f16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f16_z_untied, svfloat16_t, + z0 = svsubr_f16_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_h4_f16_z_tied1: +** mov (z[0-9]+\.h), h4 +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_z_tied1, svfloat16_t, __fp16, + z0 = svsubr_n_f16_z (p0, z0, d4), + z0 = svsubr_z (p0, z0, d4)) + +/* +** subr_h4_f16_z_untied: +** mov (z[0-9]+\.h), h4 +** ( +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_z_untied, svfloat16_t, __fp16, + z0 = svsubr_n_f16_z (p0, z1, d4), + z0 = svsubr_z (p0, z1, d4)) + +/* +** subr_1_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_z_tied1, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_z_untied, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_0p5_f16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_z_tied1, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z0, 0.5), + z0 = svsubr_z (p0, z0, 0.5)) + +/* +** subr_0p5_f16_z_untied: +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_z_untied, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z1, 0.5), + z0 = svsubr_z (p0, z1, 0.5)) + +/* +** subr_m1_f16_z_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** movprfx z0\.h, p0/z, z0\.h +** fsubr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_z_tied1, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z0, -1), + z0 = svsubr_z (p0, z0, -1)) + +/* +** subr_m1_f16_z_untied: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.h, p0/z, z1\.h +** fsubr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** fsub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_z_untied, svfloat16_t, + z0 = svsubr_n_f16_z (p0, z1, -1), + z0 = svsubr_z (p0, z1, -1)) + +/* +** subr_f16_x_tied1: +** fsub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_x_tied1, svfloat16_t, + z0 = svsubr_f16_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_f16_x_tied2: +** fsub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_x_tied2, svfloat16_t, + z0 = svsubr_f16_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_f16_x_untied: +** fsub z0\.h, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_f16_x_untied, svfloat16_t, + z0 = svsubr_f16_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_h4_f16_x_tied1: +** mov (z[0-9]+\.h), h4 +** fsub z0\.h, \1, z0\.h +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_x_tied1, svfloat16_t, __fp16, + z0 = svsubr_n_f16_x (p0, z0, d4), + z0 = svsubr_x (p0, z0, d4)) + +/* +** subr_h4_f16_x_untied: +** mov (z[0-9]+\.h), h4 +** fsub z0\.h, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZD (subr_h4_f16_x_untied, svfloat16_t, __fp16, + z0 = svsubr_n_f16_x (p0, z1, d4), + z0 = svsubr_x (p0, z1, d4)) + +/* +** subr_1_f16_x_tied1: +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_f16_x_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_0p5_f16_x_tied1: +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z0, 0.5), + z0 = svsubr_x (p0, z0, 0.5)) + +/* +** subr_0p5_f16_x_untied: +** movprfx z0, z1 +** fsubr z0\.h, p0/m, z0\.h, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z1, 0.5), + z0 = svsubr_x (p0, z1, 0.5)) + +/* +** subr_m1_f16_x_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsub z0\.h, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_f16_x_untied: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsub z0\.h, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) + +/* +** ptrue_subr_f16_x_tied1: +** fsub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f16_x_tied1, svfloat16_t, + z0 = svsubr_f16_x (svptrue_b16 (), z0, z1), + z0 = svsubr_x (svptrue_b16 (), z0, z1)) + +/* +** ptrue_subr_f16_x_tied2: +** fsub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f16_x_tied2, svfloat16_t, + z0 = svsubr_f16_x (svptrue_b16 (), z1, z0), + z0 = svsubr_x (svptrue_b16 (), z1, z0)) + +/* +** ptrue_subr_f16_x_untied: +** fsub z0\.h, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f16_x_untied, svfloat16_t, + z0 = svsubr_f16_x (svptrue_b16 (), z1, z2), + z0 = svsubr_x (svptrue_b16 (), z1, z2)) + +/* +** ptrue_subr_1_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z0, 1), + z0 = svsubr_x (svptrue_b16 (), z0, 1)) + +/* +** ptrue_subr_1_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z1, 1), + z0 = svsubr_x (svptrue_b16 (), z1, 1)) + +/* +** ptrue_subr_0p5_f16_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z0, 0.5), + z0 = svsubr_x (svptrue_b16 (), z0, 0.5)) + +/* +** ptrue_subr_0p5_f16_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z1, 0.5), + z0 = svsubr_x (svptrue_b16 (), z1, 0.5)) + +/* +** ptrue_subr_m1_f16_x_tied1: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsub z0\.h, \1, z0\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f16_x_tied1, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z0, -1), + z0 = svsubr_x (svptrue_b16 (), z0, -1)) + +/* +** ptrue_subr_m1_f16_x_untied: +** fmov (z[0-9]+\.h), #-1\.0(?:e\+0)? +** fsub z0\.h, \1, z1\.h +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f16_x_untied, svfloat16_t, + z0 = svsubr_n_f16_x (svptrue_b16 (), z1, -1), + z0 = svsubr_x (svptrue_b16 (), z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32.c new file mode 100644 index 000000000..98dc7ad2b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32.c @@ -0,0 +1,444 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_f32_m_tied1: +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_m_tied1, svfloat32_t, + z0 = svsubr_f32_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_m_tied2, svfloat32_t, + z0 = svsubr_f32_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_f32_m_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_m_untied, svfloat32_t, + z0 = svsubr_f32_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_m_tied1, svfloat32_t, float, + z0 = svsubr_n_f32_m (p0, z0, d4), + z0 = svsubr_m (p0, z0, d4)) + +/* +** subr_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_m_untied, svfloat32_t, float, + z0 = svsubr_n_f32_m (p0, z1, d4), + z0 = svsubr_m (p0, z1, d4)) + +/* +** subr_1_f32_m_tied1: +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_m_tied1, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_f32_m_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_m_untied, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_0p5_f32_m_tied1: +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_m_tied1, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z0, 0.5), + z0 = svsubr_m (p0, z0, 0.5)) + +/* +** subr_0p5_f32_m_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_m_untied, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z1, 0.5), + z0 = svsubr_m (p0, z1, 0.5)) + +/* +** subr_m1_f32_m_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
+** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_m_tied1, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z0, -1), + z0 = svsubr_m (p0, z0, -1)) + +/* +** subr_m1_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_m_untied, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z1, -1), + z0 = svsubr_m (p0, z1, -1)) + +/* +** subr_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_z_tied1, svfloat32_t, + z0 = svsubr_f32_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_z_tied2, svfloat32_t, + z0 = svsubr_f32_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f32_z_untied, svfloat32_t, + z0 = svsubr_f32_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_z_tied1, svfloat32_t, float, + z0 = svsubr_n_f32_z (p0, z0, d4), + z0 = svsubr_z (p0, z0, d4)) + +/* +** subr_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_z_untied, svfloat32_t, float, + z0 = svsubr_n_f32_z (p0, z1, d4), + z0 = svsubr_z (p0, z1, d4)) + +/* +** subr_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_z_tied1, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_z_untied, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_z_tied1, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z0, 0.5), + z0 = svsubr_z (p0, z0, 0.5)) + +/* +** subr_0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_z_untied, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z1, 0.5), + z0 = svsubr_z (p0, z1, 0.5)) + +/* +** subr_m1_f32_z_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_z_tied1, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z0, -1), + z0 = svsubr_z (p0, z0, -1)) + +/* +** subr_m1_f32_z_untied: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_z_untied, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z1, -1), + z0 = svsubr_z (p0, z1, -1)) + +/* +** subr_f32_x_tied1: +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_x_tied1, svfloat32_t, + z0 = svsubr_f32_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_f32_x_tied2: +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_x_tied2, svfloat32_t, + z0 = svsubr_f32_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_f32_x_untied: +** ( +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0, z2 +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f32_x_untied, svfloat32_t, + z0 = svsubr_f32_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_x_tied1, svfloat32_t, float, + z0 = svsubr_n_f32_x (p0, z0, d4), + z0 = svsubr_x (p0, z0, d4)) + +/* +** subr_s4_f32_x_untied: { xfail *-*-* } +** mov z0\.s, s4 +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_x_untied, svfloat32_t, float, + z0 = svsubr_n_f32_x (p0, z1, d4), + z0 = svsubr_x (p0, z1, d4)) + +/* +** subr_1_f32_x_tied1: +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_f32_x_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_0p5_f32_x_tied1: +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z0, 0.5), + z0 = svsubr_x (p0, z0, 0.5)) + +/* +** subr_0p5_f32_x_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z1, 0.5), + z0 = svsubr_x (p0, z1, 0.5)) + +/* +** subr_m1_f32_x_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_f32_x_untied: +** fmov z0\.s, #-1\.0(?:e\+0)? +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) + +/* +** ptrue_subr_f32_x_tied1: +** fsub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f32_x_tied1, svfloat32_t, + z0 = svsubr_f32_x (svptrue_b32 (), z0, z1), + z0 = svsubr_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_subr_f32_x_tied2: +** fsub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f32_x_tied2, svfloat32_t, + z0 = svsubr_f32_x (svptrue_b32 (), z1, z0), + z0 = svsubr_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_subr_f32_x_untied: +** fsub z0\.s, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f32_x_untied, svfloat32_t, + z0 = svsubr_f32_x (svptrue_b32 (), z1, z2), + z0 = svsubr_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_subr_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svsubr_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_subr_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svsubr_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_subr_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svsubr_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_subr_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svsubr_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_subr_m1_f32_x_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fsub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z0, -1), + z0 = svsubr_x (svptrue_b32 (), z0, -1)) + +/* +** ptrue_subr_m1_f32_x_untied: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fsub z0\.s, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z1, -1), + z0 = svsubr_x (svptrue_b32 (), z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32_notrap.c new file mode 100644 index 000000000..75ae0dc61 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f32_notrap.c @@ -0,0 +1,439 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_f32_m_tied1: +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_m_tied1, svfloat32_t, + z0 = svsubr_f32_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_f32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_m_tied2, svfloat32_t, + z0 = svsubr_f32_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_f32_m_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_m_untied, svfloat32_t, + z0 = svsubr_f32_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_s4_f32_m_tied1: +** mov (z[0-9]+\.s), s4 +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_m_tied1, svfloat32_t, float, + z0 = svsubr_n_f32_m (p0, z0, d4), + z0 = svsubr_m (p0, z0, d4)) + +/* +** subr_s4_f32_m_untied: +** mov (z[0-9]+\.s), s4 +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_m_untied, svfloat32_t, float, + z0 = svsubr_n_f32_m (p0, z1, d4), + z0 = svsubr_m (p0, z1, d4)) + +/* +** subr_1_f32_m_tied1: +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_m_tied1, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_f32_m_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_m_untied, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_0p5_f32_m_tied1: +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_m_tied1, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z0, 0.5), + z0 = svsubr_m (p0, z0, 
0.5)) + +/* +** subr_0p5_f32_m_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_m_untied, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z1, 0.5), + z0 = svsubr_m (p0, z1, 0.5)) + +/* +** subr_m1_f32_m_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_m_tied1, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z0, -1), + z0 = svsubr_m (p0, z0, -1)) + +/* +** subr_m1_f32_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_m_untied, svfloat32_t, + z0 = svsubr_n_f32_m (p0, z1, -1), + z0 = svsubr_m (p0, z1, -1)) + +/* +** subr_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_z_tied1, svfloat32_t, + z0 = svsubr_f32_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_f32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_z_tied2, svfloat32_t, + z0 = svsubr_f32_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_f32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f32_z_untied, svfloat32_t, + z0 = svsubr_f32_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_s4_f32_z_tied1: +** mov (z[0-9]+\.s), s4 +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_z_tied1, svfloat32_t, float, + z0 = svsubr_n_f32_z (p0, z0, d4), + z0 = svsubr_z (p0, z0, d4)) + +/* +** subr_s4_f32_z_untied: +** mov (z[0-9]+\.s), s4 +** ( +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_z_untied, svfloat32_t, float, + z0 = svsubr_n_f32_z (p0, z1, d4), + z0 = svsubr_z (p0, z1, d4)) + +/* +** subr_1_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_z_tied1, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_z_untied, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_0p5_f32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_z_tied1, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z0, 0.5), + z0 = svsubr_z (p0, z0, 0.5)) + +/* +** subr_0p5_f32_z_untied: +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_z_untied, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z1, 0.5), + z0 = svsubr_z (p0, z1, 0.5)) + +/* +** subr_m1_f32_z_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** movprfx z0\.s, p0/z, z0\.s +** fsubr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_z_tied1, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z0, -1), + z0 = svsubr_z (p0, z0, -1)) + +/* +** subr_m1_f32_z_untied: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.s, p0/z, z1\.s +** fsubr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** fsub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_z_untied, svfloat32_t, + z0 = svsubr_n_f32_z (p0, z1, -1), + z0 = svsubr_z (p0, z1, -1)) + +/* +** subr_f32_x_tied1: +** fsub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_x_tied1, svfloat32_t, + z0 = svsubr_f32_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_f32_x_tied2: +** fsub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_x_tied2, svfloat32_t, + z0 = svsubr_f32_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_f32_x_untied: +** fsub z0\.s, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_f32_x_untied, svfloat32_t, + z0 = svsubr_f32_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_s4_f32_x_tied1: +** mov (z[0-9]+\.s), s4 +** fsub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_x_tied1, svfloat32_t, float, + z0 = svsubr_n_f32_x (p0, z0, d4), + z0 = svsubr_x (p0, z0, d4)) + +/* +** subr_s4_f32_x_untied: +** mov (z[0-9]+\.s), s4 +** fsub z0\.s, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZD (subr_s4_f32_x_untied, svfloat32_t, float, + z0 = svsubr_n_f32_x (p0, z1, d4), + z0 = svsubr_x (p0, z1, d4)) + +/* +** subr_1_f32_x_tied1: +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_f32_x_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_0p5_f32_x_tied1: +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z0, 0.5), + z0 = svsubr_x (p0, z0, 0.5)) + +/* +** subr_0p5_f32_x_untied: +** movprfx z0, z1 +** fsubr z0\.s, p0/m, z0\.s, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z1, 0.5), + z0 = svsubr_x (p0, z1, 0.5)) + +/* +** subr_m1_f32_x_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fsub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_f32_x_untied: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fsub z0\.s, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) + +/* +** ptrue_subr_f32_x_tied1: +** fsub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f32_x_tied1, svfloat32_t, + z0 = svsubr_f32_x (svptrue_b32 (), z0, z1), + z0 = svsubr_x (svptrue_b32 (), z0, z1)) + +/* +** ptrue_subr_f32_x_tied2: +** fsub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f32_x_tied2, svfloat32_t, + z0 = svsubr_f32_x (svptrue_b32 (), z1, z0), + z0 = svsubr_x (svptrue_b32 (), z1, z0)) + +/* +** ptrue_subr_f32_x_untied: +** fsub z0\.s, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f32_x_untied, svfloat32_t, + z0 = svsubr_f32_x (svptrue_b32 (), z1, z2), + z0 = svsubr_x (svptrue_b32 (), z1, z2)) + +/* +** ptrue_subr_1_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z0, 1), + z0 = svsubr_x (svptrue_b32 (), z0, 1)) + +/* +** ptrue_subr_1_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z1, 1), + z0 = svsubr_x (svptrue_b32 (), z1, 1)) + +/* +** ptrue_subr_0p5_f32_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z0, 0.5), + z0 = svsubr_x (svptrue_b32 (), z0, 0.5)) + +/* +** ptrue_subr_0p5_f32_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z1, 0.5), + z0 = svsubr_x (svptrue_b32 (), z1, 0.5)) + +/* +** ptrue_subr_m1_f32_x_tied1: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fsub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f32_x_tied1, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z0, -1), + z0 = svsubr_x (svptrue_b32 (), z0, -1)) + +/* +** ptrue_subr_m1_f32_x_untied: +** fmov (z[0-9]+\.s), #-1\.0(?:e\+0)? +** fsub z0\.s, \1, z1\.s +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f32_x_untied, svfloat32_t, + z0 = svsubr_n_f32_x (svptrue_b32 (), z1, -1), + z0 = svsubr_x (svptrue_b32 (), z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64.c new file mode 100644 index 000000000..81f1112d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64.c @@ -0,0 +1,444 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_f64_m_tied1: +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_m_tied1, svfloat64_t, + z0 = svsubr_f64_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_f64_m_tied2, svfloat64_t, + z0 = svsubr_f64_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_f64_m_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_m_untied, svfloat64_t, + z0 = svsubr_f64_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_m_tied1, svfloat64_t, double, + z0 = svsubr_n_f64_m (p0, z0, d4), + z0 = svsubr_m (p0, z0, d4)) + +/* +** subr_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_m_untied, svfloat64_t, double, + z0 = svsubr_n_f64_m (p0, z1, d4), + z0 = svsubr_m (p0, z1, d4)) + +/* +** subr_1_f64_m_tied1: +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_m_tied1, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_f64_m_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_m_untied, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_0p5_f64_m_tied1: +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_m_tied1, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z0, 0.5), + z0 = svsubr_m (p0, z0, 0.5)) + +/* +** subr_0p5_f64_m_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_m_untied, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z1, 0.5), + z0 = svsubr_m (p0, z1, 0.5)) + +/* +** subr_m1_f64_m_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
+** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_m_tied1, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z0, -1), + z0 = svsubr_m (p0, z0, -1)) + +/* +** subr_m1_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_m_untied, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z1, -1), + z0 = svsubr_m (p0, z1, -1)) + +/* +** subr_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_z_tied1, svfloat64_t, + z0 = svsubr_f64_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_z_tied2, svfloat64_t, + z0 = svsubr_f64_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f64_z_untied, svfloat64_t, + z0 = svsubr_f64_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_z_tied1, svfloat64_t, double, + z0 = svsubr_n_f64_z (p0, z0, d4), + z0 = svsubr_z (p0, z0, d4)) + +/* +** subr_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_z_untied, svfloat64_t, double, + z0 = svsubr_n_f64_z (p0, z1, d4), + z0 = svsubr_z (p0, z1, d4)) + +/* +** subr_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_z_tied1, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_z_untied, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_z_tied1, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z0, 0.5), + z0 = svsubr_z (p0, z0, 0.5)) + +/* +** subr_0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_z_untied, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z1, 0.5), + z0 = svsubr_z (p0, z1, 0.5)) + +/* +** subr_m1_f64_z_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_z_tied1, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z0, -1), + z0 = svsubr_z (p0, z0, -1)) + +/* +** subr_m1_f64_z_untied: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_z_untied, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z1, -1), + z0 = svsubr_z (p0, z1, -1)) + +/* +** subr_f64_x_tied1: +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_x_tied1, svfloat64_t, + z0 = svsubr_f64_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_f64_x_tied2: +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_x_tied2, svfloat64_t, + z0 = svsubr_f64_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_f64_x_untied: +** ( +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0, z2 +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f64_x_untied, svfloat64_t, + z0 = svsubr_f64_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_x_tied1, svfloat64_t, double, + z0 = svsubr_n_f64_x (p0, z0, d4), + z0 = svsubr_x (p0, z0, d4)) + +/* +** subr_d4_f64_x_untied: { xfail *-*-* } +** mov z0\.d, d4 +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_x_untied, svfloat64_t, double, + z0 = svsubr_n_f64_x (p0, z1, d4), + z0 = svsubr_x (p0, z1, d4)) + +/* +** subr_1_f64_x_tied1: +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_f64_x_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_0p5_f64_x_tied1: +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z0, 0.5), + z0 = svsubr_x (p0, z0, 0.5)) + +/* +** subr_0p5_f64_x_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z1, 0.5), + z0 = svsubr_x (p0, z1, 0.5)) + +/* +** subr_m1_f64_x_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_f64_x_untied: +** fmov z0\.d, #-1\.0(?:e\+0)? +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) + +/* +** ptrue_subr_f64_x_tied1: +** fsub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f64_x_tied1, svfloat64_t, + z0 = svsubr_f64_x (svptrue_b64 (), z0, z1), + z0 = svsubr_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_subr_f64_x_tied2: +** fsub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f64_x_tied2, svfloat64_t, + z0 = svsubr_f64_x (svptrue_b64 (), z1, z0), + z0 = svsubr_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_subr_f64_x_untied: +** fsub z0\.d, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f64_x_untied, svfloat64_t, + z0 = svsubr_f64_x (svptrue_b64 (), z1, z2), + z0 = svsubr_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_subr_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svsubr_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_subr_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svsubr_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_subr_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svsubr_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_subr_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svsubr_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_subr_m1_f64_x_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fsub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z0, -1), + z0 = svsubr_x (svptrue_b64 (), z0, -1)) + +/* +** ptrue_subr_m1_f64_x_untied: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fsub z0\.d, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z1, -1), + z0 = svsubr_x (svptrue_b64 (), z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64_notrap.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64_notrap.c new file mode 100644 index 000000000..98598dd77 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_f64_notrap.c @@ -0,0 +1,439 @@ +/* { dg-additional-options "-fno-trapping-math" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_f64_m_tied1: +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_m_tied1, svfloat64_t, + z0 = svsubr_f64_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_f64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_f64_m_tied2, svfloat64_t, + z0 = svsubr_f64_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_f64_m_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_m_untied, svfloat64_t, + z0 = svsubr_f64_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_d4_f64_m_tied1: +** mov (z[0-9]+\.d), d4 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_m_tied1, svfloat64_t, double, + z0 = svsubr_n_f64_m (p0, z0, d4), + z0 = svsubr_m (p0, z0, d4)) + +/* +** subr_d4_f64_m_untied: +** mov (z[0-9]+\.d), d4 +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_m_untied, svfloat64_t, double, + z0 = svsubr_n_f64_m (p0, z1, d4), + z0 = svsubr_m (p0, z1, d4)) + +/* +** subr_1_f64_m_tied1: +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_m_tied1, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_f64_m_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_m_untied, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_0p5_f64_m_tied1: +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_m_tied1, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z0, 0.5), + z0 = svsubr_m (p0, z0, 
0.5)) + +/* +** subr_0p5_f64_m_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_m_untied, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z1, 0.5), + z0 = svsubr_m (p0, z1, 0.5)) + +/* +** subr_m1_f64_m_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_m_tied1, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z0, -1), + z0 = svsubr_m (p0, z0, -1)) + +/* +** subr_m1_f64_m_untied: { xfail *-*-* } +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_m_untied, svfloat64_t, + z0 = svsubr_n_f64_m (p0, z1, -1), + z0 = svsubr_m (p0, z1, -1)) + +/* +** subr_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_z_tied1, svfloat64_t, + z0 = svsubr_f64_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_f64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_z_tied2, svfloat64_t, + z0 = svsubr_f64_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_f64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_f64_z_untied, svfloat64_t, + z0 = svsubr_f64_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_d4_f64_z_tied1: +** mov (z[0-9]+\.d), d4 +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_z_tied1, svfloat64_t, double, + z0 = svsubr_n_f64_z (p0, z0, d4), + z0 = svsubr_z (p0, z0, d4)) + +/* +** subr_d4_f64_z_untied: +** mov (z[0-9]+\.d), d4 +** ( +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_z_untied, svfloat64_t, double, + z0 = svsubr_n_f64_z (p0, z1, d4), + z0 = svsubr_z (p0, z1, d4)) + +/* +** subr_1_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_z_tied1, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_z_untied, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_0p5_f64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_z_tied1, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z0, 0.5), + z0 = svsubr_z (p0, z0, 0.5)) + +/* +** subr_0p5_f64_z_untied: +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_z_untied, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z1, 0.5), + z0 = svsubr_z (p0, z1, 0.5)) + +/* +** subr_m1_f64_z_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** movprfx z0\.d, p0/z, z0\.d +** fsubr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_z_tied1, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z0, -1), + z0 = svsubr_z (p0, z0, -1)) + +/* +** subr_m1_f64_z_untied: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? 
+** ( +** movprfx z0\.d, p0/z, z1\.d +** fsubr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** fsub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_z_untied, svfloat64_t, + z0 = svsubr_n_f64_z (p0, z1, -1), + z0 = svsubr_z (p0, z1, -1)) + +/* +** subr_f64_x_tied1: +** fsub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_x_tied1, svfloat64_t, + z0 = svsubr_f64_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_f64_x_tied2: +** fsub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_x_tied2, svfloat64_t, + z0 = svsubr_f64_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_f64_x_untied: +** fsub z0\.d, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_f64_x_untied, svfloat64_t, + z0 = svsubr_f64_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_d4_f64_x_tied1: +** mov (z[0-9]+\.d), d4 +** fsub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_x_tied1, svfloat64_t, double, + z0 = svsubr_n_f64_x (p0, z0, d4), + z0 = svsubr_x (p0, z0, d4)) + +/* +** subr_d4_f64_x_untied: +** mov (z[0-9]+\.d), d4 +** fsub z0\.d, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZD (subr_d4_f64_x_untied, svfloat64_t, double, + z0 = svsubr_n_f64_x (p0, z1, d4), + z0 = svsubr_x (p0, z1, d4)) + +/* +** subr_1_f64_x_tied1: +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_f64_x_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, #1\.0 +** ret +*/ +TEST_UNIFORM_Z (subr_1_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_0p5_f64_x_tied1: +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z0, 0.5), + z0 = svsubr_x (p0, z0, 0.5)) + +/* +** subr_0p5_f64_x_untied: +** movprfx z0, z1 +** fsubr z0\.d, p0/m, z0\.d, #0\.5 +** ret +*/ +TEST_UNIFORM_Z (subr_0p5_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z1, 0.5), + z0 = svsubr_x (p0, z1, 0.5)) + +/* +** subr_m1_f64_x_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fsub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_f64_x_untied: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fsub z0\.d, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_m1_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) + +/* +** ptrue_subr_f64_x_tied1: +** fsub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f64_x_tied1, svfloat64_t, + z0 = svsubr_f64_x (svptrue_b64 (), z0, z1), + z0 = svsubr_x (svptrue_b64 (), z0, z1)) + +/* +** ptrue_subr_f64_x_tied2: +** fsub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f64_x_tied2, svfloat64_t, + z0 = svsubr_f64_x (svptrue_b64 (), z1, z0), + z0 = svsubr_x (svptrue_b64 (), z1, z0)) + +/* +** ptrue_subr_f64_x_untied: +** fsub z0\.d, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_f64_x_untied, svfloat64_t, + z0 = svsubr_f64_x (svptrue_b64 (), z1, z2), + z0 = svsubr_x (svptrue_b64 (), z1, z2)) + +/* +** ptrue_subr_1_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z0, 1), + z0 = svsubr_x (svptrue_b64 (), z0, 1)) + +/* +** ptrue_subr_1_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... 
+** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_1_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z1, 1), + z0 = svsubr_x (svptrue_b64 (), z1, 1)) + +/* +** ptrue_subr_0p5_f64_x_tied1: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z0, 0.5), + z0 = svsubr_x (svptrue_b64 (), z0, 0.5)) + +/* +** ptrue_subr_0p5_f64_x_untied: +** ... +** ptrue p[0-9]+\.b[^\n]* +** ... +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_0p5_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z1, 0.5), + z0 = svsubr_x (svptrue_b64 (), z1, 0.5)) + +/* +** ptrue_subr_m1_f64_x_tied1: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fsub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f64_x_tied1, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z0, -1), + z0 = svsubr_x (svptrue_b64 (), z0, -1)) + +/* +** ptrue_subr_m1_f64_x_untied: +** fmov (z[0-9]+\.d), #-1\.0(?:e\+0)? +** fsub z0\.d, \1, z1\.d +** ret +*/ +TEST_UNIFORM_Z (ptrue_subr_m1_f64_x_untied, svfloat64_t, + z0 = svsubr_n_f64_x (svptrue_b64 (), z1, -1), + z0 = svsubr_x (svptrue_b64 (), z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s16.c new file mode 100644 index 000000000..d3dad62da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s16.c @@ -0,0 +1,324 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_s16_m_tied1: +** subr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_s16_m_tied1, svint16_t, + z0 = svsubr_s16_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_s16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** subr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_s16_m_tied2, svint16_t, + z0 = svsubr_s16_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_s16_m_untied: +** movprfx z0, z1 +** subr z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (subr_s16_m_untied, svint16_t, + z0 = svsubr_s16_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_w0_s16_m_tied1: +** mov (z[0-9]+\.h), w0 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s16_m_tied1, svint16_t, int16_t, + z0 = svsubr_n_s16_m (p0, z0, x0), + z0 = svsubr_m (p0, z0, x0)) + +/* +** subr_w0_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s16_m_untied, svint16_t, int16_t, + z0 = svsubr_n_s16_m (p0, z1, x0), + z0 = svsubr_m (p0, z1, x0)) + +/* +** subr_1_s16_m_tied1: +** mov (z[0-9]+\.h), #1 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s16_m_tied1, svint16_t, + z0 = svsubr_n_s16_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_s16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s16_m_untied, svint16_t, + z0 = svsubr_n_s16_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_m2_s16_m: +** mov (z[0-9]+\.h), #-2 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m2_s16_m, svint16_t, + z0 = svsubr_n_s16_m (p0, z0, -2), + z0 = svsubr_m (p0, z0, -2)) + +/* +** subr_s16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** subr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_s16_z_tied1, svint16_t, + z0 = svsubr_s16_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** 
subr_s16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** sub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_s16_z_tied2, svint16_t, + z0 = svsubr_s16_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_s16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** subr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** sub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_s16_z_untied, svint16_t, + z0 = svsubr_s16_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_w0_s16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s16_z_tied1, svint16_t, int16_t, + z0 = svsubr_n_s16_z (p0, z0, x0), + z0 = svsubr_z (p0, z0, x0)) + +/* +** subr_w0_s16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** subr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** sub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s16_z_untied, svint16_t, int16_t, + z0 = svsubr_n_s16_z (p0, z1, x0), + z0 = svsubr_z (p0, z1, x0)) + +/* +** subr_1_s16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s16_z_tied1, svint16_t, + z0 = svsubr_n_s16_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_s16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** subr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** sub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_1_s16_z_untied, svint16_t, + z0 = svsubr_n_s16_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_s16_x_tied1: +** sub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (subr_s16_x_tied1, svint16_t, + z0 = svsubr_s16_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_s16_x_tied2: +** sub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_s16_x_tied2, svint16_t, + z0 = svsubr_s16_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_s16_x_untied: +** sub z0\.h, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_s16_x_untied, svint16_t, + z0 = svsubr_s16_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_w0_s16_x_tied1: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, \1, z0\.h +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s16_x_tied1, svint16_t, int16_t, + z0 = svsubr_n_s16_x (p0, z0, x0), + z0 = svsubr_x (p0, z0, x0)) + +/* +** subr_w0_s16_x_untied: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s16_x_untied, svint16_t, int16_t, + z0 = svsubr_n_s16_x (p0, z1, x0), + z0 = svsubr_x (p0, z1, x0)) + +/* +** subr_1_s16_x_tied1: +** subr z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s16_x_tied1, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_s16_x_untied: +** movprfx z0, z1 +** subr z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s16_x_untied, svint16_t, + z0 = svsubr_n_s16_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_127_s16_x: +** subr z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (subr_127_s16_x, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, 127), + z0 = svsubr_x (p0, z0, 127)) + +/* +** subr_128_s16_x: +** subr z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_128_s16_x, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, 128), + z0 = svsubr_x (p0, z0, 128)) + +/* +** subr_255_s16_x: +** subr z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z (subr_255_s16_x, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, 255), + z0 = svsubr_x (p0, z0, 255)) 
+ +/* +** subr_256_s16_x: +** subr z0\.h, z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (subr_256_s16_x, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, 256), + z0 = svsubr_x (p0, z0, 256)) + +/* +** subr_257_s16_x: +** mov (z[0-9]+)\.b, #1 +** sub z0\.h, \1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (subr_257_s16_x, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, 257), + z0 = svsubr_x (p0, z0, 257)) + +/* +** subr_512_s16_x: +** subr z0\.h, z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (subr_512_s16_x, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, 512), + z0 = svsubr_x (p0, z0, 512)) + +/* +** subr_65280_s16_x: +** subr z0\.h, z0\.h, #65280 +** ret +*/ +TEST_UNIFORM_Z (subr_65280_s16_x, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, 0xff00), + z0 = svsubr_x (p0, z0, 0xff00)) + +/* +** subr_m1_s16_x_tied1: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.h, \1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (subr_m1_s16_x_tied1, svint16_t, + z0 = svsubr_n_s16_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_s16_x_untied: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.h, \1\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_m1_s16_x_untied, svint16_t, + z0 = svsubr_n_s16_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s32.c new file mode 100644 index 000000000..ce62e2f21 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s32.c @@ -0,0 +1,344 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_s32_m_tied1: +** subr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_s32_m_tied1, svint32_t, + z0 = svsubr_s32_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_s32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** subr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_s32_m_tied2, svint32_t, + z0 = svsubr_s32_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_s32_m_untied: +** movprfx z0, z1 +** subr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (subr_s32_m_untied, svint32_t, + z0 = svsubr_s32_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_w0_s32_m_tied1: +** mov (z[0-9]+\.s), w0 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s32_m_tied1, svint32_t, int32_t, + z0 = svsubr_n_s32_m (p0, z0, x0), + z0 = svsubr_m (p0, z0, x0)) + +/* +** subr_w0_s32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s32_m_untied, svint32_t, int32_t, + z0 = svsubr_n_s32_m (p0, z1, x0), + z0 = svsubr_m (p0, z1, x0)) + +/* +** subr_1_s32_m_tied1: +** mov (z[0-9]+\.s), #1 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s32_m_tied1, svint32_t, + z0 = svsubr_n_s32_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_s32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s32_m_untied, svint32_t, + z0 = svsubr_n_s32_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_m2_s32_m: +** mov (z[0-9]+\.s), #-2 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m2_s32_m, svint32_t, + z0 = svsubr_n_s32_m (p0, z0, -2), + z0 = svsubr_m (p0, z0, -2)) + +/* +** subr_s32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** subr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_s32_z_tied1, svint32_t, + z0 = svsubr_s32_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_s32_z_tied2: +** 
movprfx z0\.s, p0/z, z0\.s +** sub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_s32_z_tied2, svint32_t, + z0 = svsubr_s32_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_s32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** subr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** sub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_s32_z_untied, svint32_t, + z0 = svsubr_s32_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_w0_s32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s32_z_tied1, svint32_t, int32_t, + z0 = svsubr_n_s32_z (p0, z0, x0), + z0 = svsubr_z (p0, z0, x0)) + +/* +** subr_w0_s32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** subr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s32_z_untied, svint32_t, int32_t, + z0 = svsubr_n_s32_z (p0, z1, x0), + z0 = svsubr_z (p0, z1, x0)) + +/* +** subr_1_s32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s32_z_tied1, svint32_t, + z0 = svsubr_n_s32_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_s32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** subr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_1_s32_z_untied, svint32_t, + z0 = svsubr_n_s32_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_s32_x_tied1: +** sub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_s32_x_tied1, svint32_t, + z0 = svsubr_s32_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_s32_x_tied2: +** sub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_s32_x_tied2, svint32_t, + z0 = svsubr_s32_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_s32_x_untied: +** sub z0\.s, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_s32_x_untied, svint32_t, + z0 = svsubr_s32_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_w0_s32_x_tied1: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s32_x_tied1, svint32_t, int32_t, + z0 = svsubr_n_s32_x (p0, z0, x0), + z0 = svsubr_x (p0, z0, x0)) + +/* +** subr_w0_s32_x_untied: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s32_x_untied, svint32_t, int32_t, + z0 = svsubr_n_s32_x (p0, z1, x0), + z0 = svsubr_x (p0, z1, x0)) + +/* +** subr_1_s32_x_tied1: +** subr z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s32_x_tied1, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_s32_x_untied: +** movprfx z0, z1 +** subr z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s32_x_untied, svint32_t, + z0 = svsubr_n_s32_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_127_s32_x: +** subr z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (subr_127_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 127), + z0 = svsubr_x (p0, z0, 127)) + +/* +** subr_128_s32_x: +** subr z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_128_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 128), + z0 = svsubr_x (p0, z0, 128)) + +/* +** subr_255_s32_x: +** subr z0\.s, z0\.s, #255 +** ret +*/ +TEST_UNIFORM_Z (subr_255_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 255), + z0 = svsubr_x (p0, z0, 255)) + +/* +** 
subr_256_s32_x: +** subr z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (subr_256_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 256), + z0 = svsubr_x (p0, z0, 256)) + +/* +** subr_511_s32_x: +** mov (z[0-9]+\.s), #511 +** sub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_511_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 511), + z0 = svsubr_x (p0, z0, 511)) + +/* +** subr_512_s32_x: +** subr z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (subr_512_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 512), + z0 = svsubr_x (p0, z0, 512)) + +/* +** subr_65280_s32_x: +** subr z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (subr_65280_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 0xff00), + z0 = svsubr_x (p0, z0, 0xff00)) + +/* +** subr_65535_s32_x: +** mov (z[0-9]+\.s), #65535 +** sub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_65535_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 65535), + z0 = svsubr_x (p0, z0, 65535)) + +/* +** subr_65536_s32_x: +** mov (z[0-9]+\.s), #65536 +** sub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_65536_s32_x, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, 65536), + z0 = svsubr_x (p0, z0, 65536)) + +/* +** subr_m1_s32_x_tied1: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.s, \1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_m1_s32_x_tied1, svint32_t, + z0 = svsubr_n_s32_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_s32_x_untied: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.s, \1\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_m1_s32_x_untied, svint32_t, + z0 = svsubr_n_s32_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s64.c new file mode 100644 index 000000000..ada9e977c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s64.c @@ -0,0 +1,344 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_s64_m_tied1: +** subr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_s64_m_tied1, svint64_t, + z0 = svsubr_s64_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_s64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_s64_m_tied2, svint64_t, + z0 = svsubr_s64_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_s64_m_untied: +** movprfx z0, z1 +** subr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (subr_s64_m_untied, svint64_t, + z0 = svsubr_s64_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_x0_s64_m_tied1: +** mov (z[0-9]+\.d), x0 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_s64_m_tied1, svint64_t, int64_t, + z0 = svsubr_n_s64_m (p0, z0, x0), + z0 = svsubr_m (p0, z0, x0)) + +/* +** subr_x0_s64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_s64_m_untied, svint64_t, int64_t, + z0 = svsubr_n_s64_m (p0, z1, x0), + z0 = svsubr_m (p0, z1, x0)) + +/* +** subr_1_s64_m_tied1: +** mov (z[0-9]+\.d), #1 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s64_m_tied1, svint64_t, + z0 = svsubr_n_s64_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_s64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s64_m_untied, svint64_t, + z0 = svsubr_n_s64_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_m2_s64_m: +** mov (z[0-9]+\.d), 
#-2 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m2_s64_m, svint64_t, + z0 = svsubr_n_s64_m (p0, z0, -2), + z0 = svsubr_m (p0, z0, -2)) + +/* +** subr_s64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** subr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_s64_z_tied1, svint64_t, + z0 = svsubr_s64_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_s64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** sub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_s64_z_tied2, svint64_t, + z0 = svsubr_s64_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_s64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** subr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** sub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_s64_z_untied, svint64_t, + z0 = svsubr_s64_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_x0_s64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_s64_z_tied1, svint64_t, int64_t, + z0 = svsubr_n_s64_z (p0, z0, x0), + z0 = svsubr_z (p0, z0, x0)) + +/* +** subr_x0_s64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** subr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_s64_z_untied, svint64_t, int64_t, + z0 = svsubr_n_s64_z (p0, z1, x0), + z0 = svsubr_z (p0, z1, x0)) + +/* +** subr_1_s64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s64_z_tied1, svint64_t, + z0 = svsubr_n_s64_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_s64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** subr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_1_s64_z_untied, svint64_t, + z0 = svsubr_n_s64_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_s64_x_tied1: +** sub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_s64_x_tied1, svint64_t, + z0 = svsubr_s64_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_s64_x_tied2: +** sub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_s64_x_tied2, svint64_t, + z0 = svsubr_s64_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_s64_x_untied: +** sub z0\.d, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_s64_x_untied, svint64_t, + z0 = svsubr_s64_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_x0_s64_x_tied1: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_s64_x_tied1, svint64_t, int64_t, + z0 = svsubr_n_s64_x (p0, z0, x0), + z0 = svsubr_x (p0, z0, x0)) + +/* +** subr_x0_s64_x_untied: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_s64_x_untied, svint64_t, int64_t, + z0 = svsubr_n_s64_x (p0, z1, x0), + z0 = svsubr_x (p0, z1, x0)) + +/* +** subr_1_s64_x_tied1: +** subr z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s64_x_tied1, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_s64_x_untied: +** movprfx z0, z1 +** subr z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s64_x_untied, svint64_t, + z0 = svsubr_n_s64_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_127_s64_x: +** subr z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (subr_127_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 127), + 
z0 = svsubr_x (p0, z0, 127)) + +/* +** subr_128_s64_x: +** subr z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_128_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 128), + z0 = svsubr_x (p0, z0, 128)) + +/* +** subr_255_s64_x: +** subr z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (subr_255_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 255), + z0 = svsubr_x (p0, z0, 255)) + +/* +** subr_256_s64_x: +** subr z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (subr_256_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 256), + z0 = svsubr_x (p0, z0, 256)) + +/* +** subr_511_s64_x: +** mov (z[0-9]+\.d), #511 +** sub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_511_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 511), + z0 = svsubr_x (p0, z0, 511)) + +/* +** subr_512_s64_x: +** subr z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (subr_512_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 512), + z0 = svsubr_x (p0, z0, 512)) + +/* +** subr_65280_s64_x: +** subr z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (subr_65280_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 0xff00), + z0 = svsubr_x (p0, z0, 0xff00)) + +/* +** subr_65535_s64_x: +** mov (z[0-9]+\.d), #65535 +** sub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_65535_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 65535), + z0 = svsubr_x (p0, z0, 65535)) + +/* +** subr_65536_s64_x: +** mov (z[0-9]+\.d), #65536 +** sub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_65536_s64_x, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, 65536), + z0 = svsubr_x (p0, z0, 65536)) + +/* +** subr_m1_s64_x_tied1: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.d, \1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_m1_s64_x_tied1, svint64_t, + z0 = svsubr_n_s64_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_s64_x_untied: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.d, \1\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_m1_s64_x_untied, svint64_t, + z0 = svsubr_n_s64_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c new file mode 100644 index 000000000..90d2a6de9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_s8.c @@ -0,0 +1,294 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_s8_m_tied1: +** subr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_s8_m_tied1, svint8_t, + z0 = svsubr_s8_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_s8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** subr z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_s8_m_tied2, svint8_t, + z0 = svsubr_s8_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_s8_m_untied: +** movprfx z0, z1 +** subr z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (subr_s8_m_untied, svint8_t, + z0 = svsubr_s8_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_w0_s8_m_tied1: +** mov (z[0-9]+\.b), w0 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s8_m_tied1, svint8_t, int8_t, + z0 = svsubr_n_s8_m (p0, z0, x0), + z0 = svsubr_m (p0, z0, x0)) + +/* +** subr_w0_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s8_m_untied, svint8_t, int8_t, + z0 = svsubr_n_s8_m (p0, z1, x0), + z0 = svsubr_m (p0, z1, x0)) + +/* +** subr_1_s8_m_tied1: +** mov (z[0-9]+\.b), #1 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z 
(subr_1_s8_m_tied1, svint8_t, + z0 = svsubr_n_s8_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_s8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s8_m_untied, svint8_t, + z0 = svsubr_n_s8_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_m1_s8_m: +** mov (z[0-9]+\.b), #-1 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_s8_m, svint8_t, + z0 = svsubr_n_s8_m (p0, z0, -1), + z0 = svsubr_m (p0, z0, -1)) + +/* +** subr_s8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** subr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_s8_z_tied1, svint8_t, + z0 = svsubr_s8_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_s8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** sub z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_s8_z_tied2, svint8_t, + z0 = svsubr_s8_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_s8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** subr z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** sub z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_s8_z_untied, svint8_t, + z0 = svsubr_s8_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_w0_s8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s8_z_tied1, svint8_t, int8_t, + z0 = svsubr_n_s8_z (p0, z0, x0), + z0 = svsubr_z (p0, z0, x0)) + +/* +** subr_w0_s8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** subr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** sub z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s8_z_untied, svint8_t, int8_t, + z0 = svsubr_n_s8_z (p0, z1, x0), + z0 = svsubr_z (p0, z1, x0)) + +/* +** subr_1_s8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s8_z_tied1, svint8_t, + z0 = svsubr_n_s8_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_s8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** subr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** sub z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_1_s8_z_untied, svint8_t, + z0 = svsubr_n_s8_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_s8_x_tied1: +** sub z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (subr_s8_x_tied1, svint8_t, + z0 = svsubr_s8_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_s8_x_tied2: +** sub z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_s8_x_tied2, svint8_t, + z0 = svsubr_s8_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_s8_x_untied: +** sub z0\.b, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_s8_x_untied, svint8_t, + z0 = svsubr_s8_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_w0_s8_x_tied1: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, \1, z0\.b +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s8_x_tied1, svint8_t, int8_t, + z0 = svsubr_n_s8_x (p0, z0, x0), + z0 = svsubr_x (p0, z0, x0)) + +/* +** subr_w0_s8_x_untied: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_s8_x_untied, svint8_t, int8_t, + z0 = svsubr_n_s8_x (p0, z1, x0), + z0 = svsubr_x (p0, z1, x0)) + +/* +** subr_1_s8_x_tied1: +** subr z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s8_x_tied1, svint8_t, + z0 = svsubr_n_s8_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** 
subr_1_s8_x_untied: +** movprfx z0, z1 +** subr z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_s8_x_untied, svint8_t, + z0 = svsubr_n_s8_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_127_s8_x: +** subr z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (subr_127_s8_x, svint8_t, + z0 = svsubr_n_s8_x (p0, z0, 127), + z0 = svsubr_x (p0, z0, 127)) + +/* +** subr_128_s8_x: +** subr z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_128_s8_x, svint8_t, + z0 = svsubr_n_s8_x (p0, z0, 128), + z0 = svsubr_x (p0, z0, 128)) + +/* +** subr_255_s8_x: +** subr z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (subr_255_s8_x, svint8_t, + z0 = svsubr_n_s8_x (p0, z0, 255), + z0 = svsubr_x (p0, z0, 255)) + +/* +** subr_m1_s8_x: +** subr z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_s8_x, svint8_t, + z0 = svsubr_n_s8_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m127_s8_x: +** subr z0\.b, z0\.b, #129 +** ret +*/ +TEST_UNIFORM_Z (subr_m127_s8_x, svint8_t, + z0 = svsubr_n_s8_x (p0, z0, -127), + z0 = svsubr_x (p0, z0, -127)) + +/* +** subr_m128_s8_x: +** subr z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_m128_s8_x, svint8_t, + z0 = svsubr_n_s8_x (p0, z0, -128), + z0 = svsubr_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u16.c new file mode 100644 index 000000000..379a80fb1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u16.c @@ -0,0 +1,324 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_u16_m_tied1: +** subr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_u16_m_tied1, svuint16_t, + z0 = svsubr_u16_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_u16_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** subr z0\.h, p0/m, z0\.h, \1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_u16_m_tied2, svuint16_t, + z0 = svsubr_u16_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_u16_m_untied: +** movprfx z0, z1 +** subr z0\.h, p0/m, z0\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (subr_u16_m_untied, svuint16_t, + z0 = svsubr_u16_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_w0_u16_m_tied1: +** mov (z[0-9]+\.h), w0 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u16_m_tied1, svuint16_t, uint16_t, + z0 = svsubr_n_u16_m (p0, z0, x0), + z0 = svsubr_m (p0, z0, x0)) + +/* +** subr_w0_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), w0 +** movprfx z0, z1 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u16_m_untied, svuint16_t, uint16_t, + z0 = svsubr_n_u16_m (p0, z1, x0), + z0 = svsubr_m (p0, z1, x0)) + +/* +** subr_1_u16_m_tied1: +** mov (z[0-9]+\.h), #1 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u16_m_tied1, svuint16_t, + z0 = svsubr_n_u16_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_u16_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.h), #1 +** movprfx z0, z1 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u16_m_untied, svuint16_t, + z0 = svsubr_n_u16_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_m2_u16_m: +** mov (z[0-9]+\.h), #-2 +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m2_u16_m, svuint16_t, + z0 = svsubr_n_u16_m (p0, z0, -2), + z0 = svsubr_m (p0, z0, -2)) + +/* +** subr_u16_z_tied1: +** movprfx z0\.h, p0/z, z0\.h +** subr z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_u16_z_tied1, svuint16_t, + z0 = 
svsubr_u16_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_u16_z_tied2: +** movprfx z0\.h, p0/z, z0\.h +** sub z0\.h, p0/m, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_u16_z_tied2, svuint16_t, + z0 = svsubr_u16_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_u16_z_untied: +** ( +** movprfx z0\.h, p0/z, z1\.h +** subr z0\.h, p0/m, z0\.h, z2\.h +** | +** movprfx z0\.h, p0/z, z2\.h +** sub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_u16_z_untied, svuint16_t, + z0 = svsubr_u16_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_w0_u16_z_tied1: +** mov (z[0-9]+\.h), w0 +** movprfx z0\.h, p0/z, z0\.h +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u16_z_tied1, svuint16_t, uint16_t, + z0 = svsubr_n_u16_z (p0, z0, x0), + z0 = svsubr_z (p0, z0, x0)) + +/* +** subr_w0_u16_z_untied: +** mov (z[0-9]+\.h), w0 +** ( +** movprfx z0\.h, p0/z, z1\.h +** subr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** sub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u16_z_untied, svuint16_t, uint16_t, + z0 = svsubr_n_u16_z (p0, z1, x0), + z0 = svsubr_z (p0, z1, x0)) + +/* +** subr_1_u16_z_tied1: +** mov (z[0-9]+\.h), #1 +** movprfx z0\.h, p0/z, z0\.h +** subr z0\.h, p0/m, z0\.h, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u16_z_tied1, svuint16_t, + z0 = svsubr_n_u16_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_u16_z_untied: +** mov (z[0-9]+\.h), #1 +** ( +** movprfx z0\.h, p0/z, z1\.h +** subr z0\.h, p0/m, z0\.h, \1 +** | +** movprfx z0\.h, p0/z, \1 +** sub z0\.h, p0/m, z0\.h, z1\.h +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_1_u16_z_untied, svuint16_t, + z0 = svsubr_n_u16_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_u16_x_tied1: +** sub z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (subr_u16_x_tied1, svuint16_t, + z0 = svsubr_u16_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_u16_x_tied2: +** sub z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_u16_x_tied2, svuint16_t, + z0 = svsubr_u16_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_u16_x_untied: +** sub z0\.h, z2\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_u16_x_untied, svuint16_t, + z0 = svsubr_u16_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_w0_u16_x_tied1: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, \1, z0\.h +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u16_x_tied1, svuint16_t, uint16_t, + z0 = svsubr_n_u16_x (p0, z0, x0), + z0 = svsubr_x (p0, z0, x0)) + +/* +** subr_w0_u16_x_untied: +** mov (z[0-9]+\.h), w0 +** sub z0\.h, \1, z1\.h +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u16_x_untied, svuint16_t, uint16_t, + z0 = svsubr_n_u16_x (p0, z1, x0), + z0 = svsubr_x (p0, z1, x0)) + +/* +** subr_1_u16_x_tied1: +** subr z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u16_x_tied1, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_u16_x_untied: +** movprfx z0, z1 +** subr z0\.h, z0\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u16_x_untied, svuint16_t, + z0 = svsubr_n_u16_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_127_u16_x: +** subr z0\.h, z0\.h, #127 +** ret +*/ +TEST_UNIFORM_Z (subr_127_u16_x, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, 127), + z0 = svsubr_x (p0, z0, 127)) + +/* +** subr_128_u16_x: +** subr z0\.h, z0\.h, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_128_u16_x, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, 128), + z0 = svsubr_x (p0, z0, 128)) + +/* +** subr_255_u16_x: +** subr z0\.h, z0\.h, #255 +** ret +*/ +TEST_UNIFORM_Z 
(subr_255_u16_x, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, 255), + z0 = svsubr_x (p0, z0, 255)) + +/* +** subr_256_u16_x: +** subr z0\.h, z0\.h, #256 +** ret +*/ +TEST_UNIFORM_Z (subr_256_u16_x, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, 256), + z0 = svsubr_x (p0, z0, 256)) + +/* +** subr_257_u16_x: +** mov (z[0-9]+)\.b, #1 +** sub z0\.h, \1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (subr_257_u16_x, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, 257), + z0 = svsubr_x (p0, z0, 257)) + +/* +** subr_512_u16_x: +** subr z0\.h, z0\.h, #512 +** ret +*/ +TEST_UNIFORM_Z (subr_512_u16_x, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, 512), + z0 = svsubr_x (p0, z0, 512)) + +/* +** subr_65280_u16_x: +** subr z0\.h, z0\.h, #65280 +** ret +*/ +TEST_UNIFORM_Z (subr_65280_u16_x, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, 0xff00), + z0 = svsubr_x (p0, z0, 0xff00)) + +/* +** subr_m1_u16_x_tied1: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.h, \1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (subr_m1_u16_x_tied1, svuint16_t, + z0 = svsubr_n_u16_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_u16_x_untied: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.h, \1\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (subr_m1_u16_x_untied, svuint16_t, + z0 = svsubr_n_u16_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u32.c new file mode 100644 index 000000000..215f8b449 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u32.c @@ -0,0 +1,344 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_u32_m_tied1: +** subr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_u32_m_tied1, svuint32_t, + z0 = svsubr_u32_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_u32_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** subr z0\.s, p0/m, z0\.s, \1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_u32_m_tied2, svuint32_t, + z0 = svsubr_u32_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_u32_m_untied: +** movprfx z0, z1 +** subr z0\.s, p0/m, z0\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (subr_u32_m_untied, svuint32_t, + z0 = svsubr_u32_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_w0_u32_m_tied1: +** mov (z[0-9]+\.s), w0 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u32_m_tied1, svuint32_t, uint32_t, + z0 = svsubr_n_u32_m (p0, z0, x0), + z0 = svsubr_m (p0, z0, x0)) + +/* +** subr_w0_u32_m_untied: +** mov (z[0-9]+\.s), w0 +** movprfx z0, z1 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u32_m_untied, svuint32_t, uint32_t, + z0 = svsubr_n_u32_m (p0, z1, x0), + z0 = svsubr_m (p0, z1, x0)) + +/* +** subr_1_u32_m_tied1: +** mov (z[0-9]+\.s), #1 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u32_m_tied1, svuint32_t, + z0 = svsubr_n_u32_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_u32_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.s), #1 +** movprfx z0, z1 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u32_m_untied, svuint32_t, + z0 = svsubr_n_u32_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_m2_u32_m: +** mov (z[0-9]+\.s), #-2 +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m2_u32_m, svuint32_t, + z0 = svsubr_n_u32_m (p0, z0, -2), + z0 = svsubr_m (p0, z0, -2)) + +/* +** subr_u32_z_tied1: +** movprfx z0\.s, p0/z, z0\.s +** subr z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z 
(subr_u32_z_tied1, svuint32_t, + z0 = svsubr_u32_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_u32_z_tied2: +** movprfx z0\.s, p0/z, z0\.s +** sub z0\.s, p0/m, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_u32_z_tied2, svuint32_t, + z0 = svsubr_u32_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_u32_z_untied: +** ( +** movprfx z0\.s, p0/z, z1\.s +** subr z0\.s, p0/m, z0\.s, z2\.s +** | +** movprfx z0\.s, p0/z, z2\.s +** sub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_u32_z_untied, svuint32_t, + z0 = svsubr_u32_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_w0_u32_z_tied1: +** mov (z[0-9]+\.s), w0 +** movprfx z0\.s, p0/z, z0\.s +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u32_z_tied1, svuint32_t, uint32_t, + z0 = svsubr_n_u32_z (p0, z0, x0), + z0 = svsubr_z (p0, z0, x0)) + +/* +** subr_w0_u32_z_untied: +** mov (z[0-9]+\.s), w0 +** ( +** movprfx z0\.s, p0/z, z1\.s +** subr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u32_z_untied, svuint32_t, uint32_t, + z0 = svsubr_n_u32_z (p0, z1, x0), + z0 = svsubr_z (p0, z1, x0)) + +/* +** subr_1_u32_z_tied1: +** mov (z[0-9]+\.s), #1 +** movprfx z0\.s, p0/z, z0\.s +** subr z0\.s, p0/m, z0\.s, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u32_z_tied1, svuint32_t, + z0 = svsubr_n_u32_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_u32_z_untied: +** mov (z[0-9]+\.s), #1 +** ( +** movprfx z0\.s, p0/z, z1\.s +** subr z0\.s, p0/m, z0\.s, \1 +** | +** movprfx z0\.s, p0/z, \1 +** sub z0\.s, p0/m, z0\.s, z1\.s +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_1_u32_z_untied, svuint32_t, + z0 = svsubr_n_u32_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_u32_x_tied1: +** sub z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_u32_x_tied1, svuint32_t, + z0 = svsubr_u32_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_u32_x_tied2: +** sub z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_u32_x_tied2, svuint32_t, + z0 = svsubr_u32_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_u32_x_untied: +** sub z0\.s, z2\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_u32_x_untied, svuint32_t, + z0 = svsubr_u32_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_w0_u32_x_tied1: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u32_x_tied1, svuint32_t, uint32_t, + z0 = svsubr_n_u32_x (p0, z0, x0), + z0 = svsubr_x (p0, z0, x0)) + +/* +** subr_w0_u32_x_untied: +** mov (z[0-9]+\.s), w0 +** sub z0\.s, \1, z1\.s +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u32_x_untied, svuint32_t, uint32_t, + z0 = svsubr_n_u32_x (p0, z1, x0), + z0 = svsubr_x (p0, z1, x0)) + +/* +** subr_1_u32_x_tied1: +** subr z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u32_x_tied1, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_u32_x_untied: +** movprfx z0, z1 +** subr z0\.s, z0\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u32_x_untied, svuint32_t, + z0 = svsubr_n_u32_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_127_u32_x: +** subr z0\.s, z0\.s, #127 +** ret +*/ +TEST_UNIFORM_Z (subr_127_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 127), + z0 = svsubr_x (p0, z0, 127)) + +/* +** subr_128_u32_x: +** subr z0\.s, z0\.s, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_128_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 128), + z0 = svsubr_x (p0, z0, 128)) + +/* +** subr_255_u32_x: +** subr z0\.s, z0\.s, #255 
+** ret +*/ +TEST_UNIFORM_Z (subr_255_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 255), + z0 = svsubr_x (p0, z0, 255)) + +/* +** subr_256_u32_x: +** subr z0\.s, z0\.s, #256 +** ret +*/ +TEST_UNIFORM_Z (subr_256_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 256), + z0 = svsubr_x (p0, z0, 256)) + +/* +** subr_511_u32_x: +** mov (z[0-9]+\.s), #511 +** sub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_511_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 511), + z0 = svsubr_x (p0, z0, 511)) + +/* +** subr_512_u32_x: +** subr z0\.s, z0\.s, #512 +** ret +*/ +TEST_UNIFORM_Z (subr_512_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 512), + z0 = svsubr_x (p0, z0, 512)) + +/* +** subr_65280_u32_x: +** subr z0\.s, z0\.s, #65280 +** ret +*/ +TEST_UNIFORM_Z (subr_65280_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 0xff00), + z0 = svsubr_x (p0, z0, 0xff00)) + +/* +** subr_65535_u32_x: +** mov (z[0-9]+\.s), #65535 +** sub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_65535_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 65535), + z0 = svsubr_x (p0, z0, 65535)) + +/* +** subr_65536_u32_x: +** mov (z[0-9]+\.s), #65536 +** sub z0\.s, \1, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_65536_u32_x, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, 65536), + z0 = svsubr_x (p0, z0, 65536)) + +/* +** subr_m1_u32_x_tied1: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.s, \1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (subr_m1_u32_x_tied1, svuint32_t, + z0 = svsubr_n_u32_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_u32_x_untied: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.s, \1\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (subr_m1_u32_x_untied, svuint32_t, + z0 = svsubr_n_u32_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u64.c new file mode 100644 index 000000000..78d94515b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u64.c @@ -0,0 +1,344 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_u64_m_tied1: +** subr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_u64_m_tied1, svuint64_t, + z0 = svsubr_u64_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_u64_m_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_u64_m_tied2, svuint64_t, + z0 = svsubr_u64_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_u64_m_untied: +** movprfx z0, z1 +** subr z0\.d, p0/m, z0\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (subr_u64_m_untied, svuint64_t, + z0 = svsubr_u64_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_x0_u64_m_tied1: +** mov (z[0-9]+\.d), x0 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_u64_m_tied1, svuint64_t, uint64_t, + z0 = svsubr_n_u64_m (p0, z0, x0), + z0 = svsubr_m (p0, z0, x0)) + +/* +** subr_x0_u64_m_untied: +** mov (z[0-9]+\.d), x0 +** movprfx z0, z1 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_u64_m_untied, svuint64_t, uint64_t, + z0 = svsubr_n_u64_m (p0, z1, x0), + z0 = svsubr_m (p0, z1, x0)) + +/* +** subr_1_u64_m_tied1: +** mov (z[0-9]+\.d), #1 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u64_m_tied1, svuint64_t, + z0 = svsubr_n_u64_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_u64_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.d), #1 +** movprfx z0, z1 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ 
+TEST_UNIFORM_Z (subr_1_u64_m_untied, svuint64_t, + z0 = svsubr_n_u64_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_m2_u64_m: +** mov (z[0-9]+\.d), #-2 +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m2_u64_m, svuint64_t, + z0 = svsubr_n_u64_m (p0, z0, -2), + z0 = svsubr_m (p0, z0, -2)) + +/* +** subr_u64_z_tied1: +** movprfx z0\.d, p0/z, z0\.d +** subr z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_u64_z_tied1, svuint64_t, + z0 = svsubr_u64_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_u64_z_tied2: +** movprfx z0\.d, p0/z, z0\.d +** sub z0\.d, p0/m, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_u64_z_tied2, svuint64_t, + z0 = svsubr_u64_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_u64_z_untied: +** ( +** movprfx z0\.d, p0/z, z1\.d +** subr z0\.d, p0/m, z0\.d, z2\.d +** | +** movprfx z0\.d, p0/z, z2\.d +** sub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_u64_z_untied, svuint64_t, + z0 = svsubr_u64_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_x0_u64_z_tied1: +** mov (z[0-9]+\.d), x0 +** movprfx z0\.d, p0/z, z0\.d +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_u64_z_tied1, svuint64_t, uint64_t, + z0 = svsubr_n_u64_z (p0, z0, x0), + z0 = svsubr_z (p0, z0, x0)) + +/* +** subr_x0_u64_z_untied: +** mov (z[0-9]+\.d), x0 +** ( +** movprfx z0\.d, p0/z, z1\.d +** subr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_u64_z_untied, svuint64_t, uint64_t, + z0 = svsubr_n_u64_z (p0, z1, x0), + z0 = svsubr_z (p0, z1, x0)) + +/* +** subr_1_u64_z_tied1: +** mov (z[0-9]+\.d), #1 +** movprfx z0\.d, p0/z, z0\.d +** subr z0\.d, p0/m, z0\.d, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u64_z_tied1, svuint64_t, + z0 = svsubr_n_u64_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_u64_z_untied: +** mov (z[0-9]+\.d), #1 +** ( +** movprfx z0\.d, p0/z, z1\.d +** subr z0\.d, p0/m, z0\.d, \1 +** | +** movprfx z0\.d, p0/z, \1 +** sub z0\.d, p0/m, z0\.d, z1\.d +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_1_u64_z_untied, svuint64_t, + z0 = svsubr_n_u64_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_u64_x_tied1: +** sub z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_u64_x_tied1, svuint64_t, + z0 = svsubr_u64_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_u64_x_tied2: +** sub z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_u64_x_tied2, svuint64_t, + z0 = svsubr_u64_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_u64_x_untied: +** sub z0\.d, z2\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_u64_x_untied, svuint64_t, + z0 = svsubr_u64_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_x0_u64_x_tied1: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_u64_x_tied1, svuint64_t, uint64_t, + z0 = svsubr_n_u64_x (p0, z0, x0), + z0 = svsubr_x (p0, z0, x0)) + +/* +** subr_x0_u64_x_untied: +** mov (z[0-9]+\.d), x0 +** sub z0\.d, \1, z1\.d +** ret +*/ +TEST_UNIFORM_ZX (subr_x0_u64_x_untied, svuint64_t, uint64_t, + z0 = svsubr_n_u64_x (p0, z1, x0), + z0 = svsubr_x (p0, z1, x0)) + +/* +** subr_1_u64_x_tied1: +** subr z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u64_x_tied1, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_u64_x_untied: +** movprfx z0, z1 +** subr z0\.d, z0\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u64_x_untied, svuint64_t, + z0 = svsubr_n_u64_x (p0, z1, 1), 
+ z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_127_u64_x: +** subr z0\.d, z0\.d, #127 +** ret +*/ +TEST_UNIFORM_Z (subr_127_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 127), + z0 = svsubr_x (p0, z0, 127)) + +/* +** subr_128_u64_x: +** subr z0\.d, z0\.d, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_128_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 128), + z0 = svsubr_x (p0, z0, 128)) + +/* +** subr_255_u64_x: +** subr z0\.d, z0\.d, #255 +** ret +*/ +TEST_UNIFORM_Z (subr_255_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 255), + z0 = svsubr_x (p0, z0, 255)) + +/* +** subr_256_u64_x: +** subr z0\.d, z0\.d, #256 +** ret +*/ +TEST_UNIFORM_Z (subr_256_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 256), + z0 = svsubr_x (p0, z0, 256)) + +/* +** subr_511_u64_x: +** mov (z[0-9]+\.d), #511 +** sub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_511_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 511), + z0 = svsubr_x (p0, z0, 511)) + +/* +** subr_512_u64_x: +** subr z0\.d, z0\.d, #512 +** ret +*/ +TEST_UNIFORM_Z (subr_512_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 512), + z0 = svsubr_x (p0, z0, 512)) + +/* +** subr_65280_u64_x: +** subr z0\.d, z0\.d, #65280 +** ret +*/ +TEST_UNIFORM_Z (subr_65280_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 0xff00), + z0 = svsubr_x (p0, z0, 0xff00)) + +/* +** subr_65535_u64_x: +** mov (z[0-9]+\.d), #65535 +** sub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_65535_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 65535), + z0 = svsubr_x (p0, z0, 65535)) + +/* +** subr_65536_u64_x: +** mov (z[0-9]+\.d), #65536 +** sub z0\.d, \1, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_65536_u64_x, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, 65536), + z0 = svsubr_x (p0, z0, 65536)) + +/* +** subr_m1_u64_x_tied1: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.d, \1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (subr_m1_u64_x_tied1, svuint64_t, + z0 = svsubr_n_u64_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m1_u64_x_untied: +** mov (z[0-9]+)\.b, #-1 +** sub z0\.d, \1\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (subr_m1_u64_x_untied, svuint64_t, + z0 = svsubr_n_u64_x (p0, z1, -1), + z0 = svsubr_x (p0, z1, -1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c new file mode 100644 index 000000000..fe5f96da8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/subr_u8.c @@ -0,0 +1,294 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** subr_u8_m_tied1: +** subr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_u8_m_tied1, svuint8_t, + z0 = svsubr_u8_m (p0, z0, z1), + z0 = svsubr_m (p0, z0, z1)) + +/* +** subr_u8_m_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** subr z0\.b, p0/m, z0\.b, \1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_u8_m_tied2, svuint8_t, + z0 = svsubr_u8_m (p0, z1, z0), + z0 = svsubr_m (p0, z1, z0)) + +/* +** subr_u8_m_untied: +** movprfx z0, z1 +** subr z0\.b, p0/m, z0\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (subr_u8_m_untied, svuint8_t, + z0 = svsubr_u8_m (p0, z1, z2), + z0 = svsubr_m (p0, z1, z2)) + +/* +** subr_w0_u8_m_tied1: +** mov (z[0-9]+\.b), w0 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u8_m_tied1, svuint8_t, uint8_t, + z0 = svsubr_n_u8_m (p0, z0, x0), + z0 = svsubr_m (p0, z0, x0)) + +/* +** subr_w0_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), w0 +** movprfx z0, z1 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u8_m_untied, 
svuint8_t, uint8_t, + z0 = svsubr_n_u8_m (p0, z1, x0), + z0 = svsubr_m (p0, z1, x0)) + +/* +** subr_1_u8_m_tied1: +** mov (z[0-9]+\.b), #1 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u8_m_tied1, svuint8_t, + z0 = svsubr_n_u8_m (p0, z0, 1), + z0 = svsubr_m (p0, z0, 1)) + +/* +** subr_1_u8_m_untied: { xfail *-*-* } +** mov (z[0-9]+\.b), #1 +** movprfx z0, z1 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u8_m_untied, svuint8_t, + z0 = svsubr_n_u8_m (p0, z1, 1), + z0 = svsubr_m (p0, z1, 1)) + +/* +** subr_m1_u8_m: +** mov (z[0-9]+\.b), #-1 +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_u8_m, svuint8_t, + z0 = svsubr_n_u8_m (p0, z0, -1), + z0 = svsubr_m (p0, z0, -1)) + +/* +** subr_u8_z_tied1: +** movprfx z0\.b, p0/z, z0\.b +** subr z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_u8_z_tied1, svuint8_t, + z0 = svsubr_u8_z (p0, z0, z1), + z0 = svsubr_z (p0, z0, z1)) + +/* +** subr_u8_z_tied2: +** movprfx z0\.b, p0/z, z0\.b +** sub z0\.b, p0/m, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_u8_z_tied2, svuint8_t, + z0 = svsubr_u8_z (p0, z1, z0), + z0 = svsubr_z (p0, z1, z0)) + +/* +** subr_u8_z_untied: +** ( +** movprfx z0\.b, p0/z, z1\.b +** subr z0\.b, p0/m, z0\.b, z2\.b +** | +** movprfx z0\.b, p0/z, z2\.b +** sub z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_u8_z_untied, svuint8_t, + z0 = svsubr_u8_z (p0, z1, z2), + z0 = svsubr_z (p0, z1, z2)) + +/* +** subr_w0_u8_z_tied1: +** mov (z[0-9]+\.b), w0 +** movprfx z0\.b, p0/z, z0\.b +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u8_z_tied1, svuint8_t, uint8_t, + z0 = svsubr_n_u8_z (p0, z0, x0), + z0 = svsubr_z (p0, z0, x0)) + +/* +** subr_w0_u8_z_untied: +** mov (z[0-9]+\.b), w0 +** ( +** movprfx z0\.b, p0/z, z1\.b +** subr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** sub z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u8_z_untied, svuint8_t, uint8_t, + z0 = svsubr_n_u8_z (p0, z1, x0), + z0 = svsubr_z (p0, z1, x0)) + +/* +** subr_1_u8_z_tied1: +** mov (z[0-9]+\.b), #1 +** movprfx z0\.b, p0/z, z0\.b +** subr z0\.b, p0/m, z0\.b, \1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u8_z_tied1, svuint8_t, + z0 = svsubr_n_u8_z (p0, z0, 1), + z0 = svsubr_z (p0, z0, 1)) + +/* +** subr_1_u8_z_untied: +** mov (z[0-9]+\.b), #1 +** ( +** movprfx z0\.b, p0/z, z1\.b +** subr z0\.b, p0/m, z0\.b, \1 +** | +** movprfx z0\.b, p0/z, \1 +** sub z0\.b, p0/m, z0\.b, z1\.b +** ) +** ret +*/ +TEST_UNIFORM_Z (subr_1_u8_z_untied, svuint8_t, + z0 = svsubr_n_u8_z (p0, z1, 1), + z0 = svsubr_z (p0, z1, 1)) + +/* +** subr_u8_x_tied1: +** sub z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (subr_u8_x_tied1, svuint8_t, + z0 = svsubr_u8_x (p0, z0, z1), + z0 = svsubr_x (p0, z0, z1)) + +/* +** subr_u8_x_tied2: +** sub z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_u8_x_tied2, svuint8_t, + z0 = svsubr_u8_x (p0, z1, z0), + z0 = svsubr_x (p0, z1, z0)) + +/* +** subr_u8_x_untied: +** sub z0\.b, z2\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (subr_u8_x_untied, svuint8_t, + z0 = svsubr_u8_x (p0, z1, z2), + z0 = svsubr_x (p0, z1, z2)) + +/* +** subr_w0_u8_x_tied1: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, \1, z0\.b +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u8_x_tied1, svuint8_t, uint8_t, + z0 = svsubr_n_u8_x (p0, z0, x0), + z0 = svsubr_x (p0, z0, x0)) + +/* +** subr_w0_u8_x_untied: +** mov (z[0-9]+\.b), w0 +** sub z0\.b, \1, z1\.b +** ret +*/ +TEST_UNIFORM_ZX (subr_w0_u8_x_untied, svuint8_t, uint8_t, + z0 = svsubr_n_u8_x (p0, z1, x0), + z0 = svsubr_x 
(p0, z1, x0)) + +/* +** subr_1_u8_x_tied1: +** subr z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u8_x_tied1, svuint8_t, + z0 = svsubr_n_u8_x (p0, z0, 1), + z0 = svsubr_x (p0, z0, 1)) + +/* +** subr_1_u8_x_untied: +** movprfx z0, z1 +** subr z0\.b, z0\.b, #1 +** ret +*/ +TEST_UNIFORM_Z (subr_1_u8_x_untied, svuint8_t, + z0 = svsubr_n_u8_x (p0, z1, 1), + z0 = svsubr_x (p0, z1, 1)) + +/* +** subr_127_u8_x: +** subr z0\.b, z0\.b, #127 +** ret +*/ +TEST_UNIFORM_Z (subr_127_u8_x, svuint8_t, + z0 = svsubr_n_u8_x (p0, z0, 127), + z0 = svsubr_x (p0, z0, 127)) + +/* +** subr_128_u8_x: +** subr z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_128_u8_x, svuint8_t, + z0 = svsubr_n_u8_x (p0, z0, 128), + z0 = svsubr_x (p0, z0, 128)) + +/* +** subr_255_u8_x: +** subr z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (subr_255_u8_x, svuint8_t, + z0 = svsubr_n_u8_x (p0, z0, 255), + z0 = svsubr_x (p0, z0, 255)) + +/* +** subr_m1_u8_x: +** subr z0\.b, z0\.b, #255 +** ret +*/ +TEST_UNIFORM_Z (subr_m1_u8_x, svuint8_t, + z0 = svsubr_n_u8_x (p0, z0, -1), + z0 = svsubr_x (p0, z0, -1)) + +/* +** subr_m127_u8_x: +** subr z0\.b, z0\.b, #129 +** ret +*/ +TEST_UNIFORM_Z (subr_m127_u8_x, svuint8_t, + z0 = svsubr_n_u8_x (p0, z0, -127), + z0 = svsubr_x (p0, z0, -127)) + +/* +** subr_m128_u8_x: +** subr z0\.b, z0\.b, #128 +** ret +*/ +TEST_UNIFORM_Z (subr_m128_u8_x, svuint8_t, + z0 = svsubr_n_u8_x (p0, z0, -128), + z0 = svsubr_x (p0, z0, -128)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_lane_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_lane_s32.c new file mode 100644 index 000000000..c6d74a4af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_lane_s32.c @@ -0,0 +1,97 @@ +/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sudot_lane_0_s32_tied1: +** sudot z0\.s, z2\.b, z4\.b\[0\] +** ret +*/ +TEST_TRIPLE_Z (sudot_lane_0_s32_tied1, svint32_t, svint8_t, svuint8_t, + z0 = svsudot_lane_s32 (z0, z2, z4, 0), + z0 = svsudot_lane (z0, z2, z4, 0)) + +/* +** sudot_lane_0_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z2 +** sudot z0\.s, \1\.b, z4\.b\[0\] +** ret +*/ +TEST_TRIPLE_Z_REV2 (sudot_lane_0_s32_tied2, svint32_t, svint8_t, svuint8_t, + z0_res = svsudot_lane_s32 (z2, z0, z4, 0), + z0_res = svsudot_lane (z2, z0, z4, 0)) + +/* +** sudot_lane_0_s32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** sudot z0\.s, z2\.b, \1\.b\[0\] +** ret +*/ +TEST_TRIPLE_Z_REV (sudot_lane_0_s32_tied3, svint32_t, svint8_t, svuint8_t, + z0_res = svsudot_lane_s32 (z4, z2, z0, 0), + z0_res = svsudot_lane (z4, z2, z0, 0)) + +/* +** sudot_lane_0_s32_untied: +** movprfx z0, z1 +** sudot z0\.s, z2\.b, z4\.b\[0\] +** ret +*/ +TEST_TRIPLE_Z (sudot_lane_0_s32_untied, svint32_t, svint8_t, svuint8_t, + z0 = svsudot_lane_s32 (z1, z2, z4, 0), + z0 = svsudot_lane (z1, z2, z4, 0)) + +/* +** sudot_lane_1_s32: +** sudot z0\.s, z2\.b, z5\.b\[1\] +** ret +*/ +TEST_TRIPLE_Z (sudot_lane_1_s32, svint32_t, svint8_t, svuint8_t, + z0 = svsudot_lane_s32 (z0, z2, z5, 1), + z0 = svsudot_lane (z0, z2, z5, 1)) + +/* +** sudot_lane_2_s32: +** sudot z0\.s, z2\.b, z5\.b\[2\] +** ret +*/ +TEST_TRIPLE_Z (sudot_lane_2_s32, svint32_t, svint8_t, svuint8_t, + z0 = svsudot_lane_s32 (z0, z2, z5, 2), + z0 = svsudot_lane (z0, z2, z5, 2)) + +/* +** sudot_lane_3_s32: +** sudot z0\.s, z2\.b, z5\.b\[3\] +** ret +*/ +TEST_TRIPLE_Z 
(sudot_lane_3_s32, svint32_t, svint8_t, svuint8_t, + z0 = svsudot_lane_s32 (z0, z2, z5, 3), + z0 = svsudot_lane (z0, z2, z5, 3)) + +/* +** sudot_lane_z8_s32: +** str d8, \[sp, -16\]! +** mov (z[0-7])\.d, z8\.d +** sudot z0\.s, z1\.b, \1\.b\[1\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_TRIPLE_LANE_REG (sudot_lane_z8_s32, svint32_t, svint8_t, svuint8_t, + z8, + z0 = svsudot_lane_s32 (z0, z1, z8, 1), + z0 = svsudot_lane (z0, z1, z8, 1)) + +/* +** sudot_lane_z16_s32: +** mov (z[0-7])\.d, z16\.d +** sudot z0\.s, z1\.b, \1\.b\[1\] +** ret +*/ +TEST_TRIPLE_LANE_REG (sudot_lane_z16_s32, svint32_t, svint8_t, svuint8_t, + z16, + z0 = svsudot_lane_s32 (z0, z1, z16, 1), + z0 = svsudot_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c new file mode 100644 index 000000000..4b452619e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/sudot_s32.c @@ -0,0 +1,45 @@ +/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** sudot_s32_tied1: +** usdot z0\.s, z2\.b, z4\.b +** ret +*/ +TEST_TRIPLE_Z (sudot_s32_tied1, svint32_t, svint8_t, svuint8_t, + z0 = svsudot_s32 (z0, z2, z4), + z0 = svsudot (z0, z2, z4)) + +/* +** sudot_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** usdot z0\.s, z2\.b, \1\.b +** ret +*/ +TEST_TRIPLE_Z_REV (sudot_s32_tied2, svint32_t, svint8_t, svuint8_t, + z0_res = svsudot_s32 (z4, z2, z0), + z0_res = svsudot (z4, z2, z0)) + +/* +** sudot_w0_s32_tied: +** mov (z[0-9]+\.b), w0 +** usdot z0\.s, z2\.b, \1 +** ret +*/ +TEST_TRIPLE_ZX (sudot_w0_s32_tied, svint32_t, svint8_t, uint8_t, + z0 = svsudot_n_s32 (z0, z2, x0), + z0 = svsudot (z0, z2, x0)) + +/* +** sudot_9_s32_tied: +** mov (z[0-9]+\.b), #9 +** usdot z0\.s, z2\.b, \1 +** ret +*/ +TEST_TRIPLE_Z (sudot_9_s32_tied, svint32_t, svint8_t, uint8_t, + z0 = svsudot_n_s32 (z0, z2, 9), + z0 = svsudot (z0, z2, 9)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_bf16.c new file mode 100644 index 000000000..8c077d118 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_bf16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_bf16_tied1: +** tbl z0\.h, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tbl_bf16_tied1, svbfloat16_t, svuint16_t, + z0 = svtbl_bf16 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_bf16_tied2: +** tbl z0\.h, z4\.h, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (tbl_bf16_tied2, svbfloat16_t, svuint16_t, + z0_res = svtbl_bf16 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_bf16_untied: +** tbl z0\.h, z1\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tbl_bf16_untied, svbfloat16_t, svuint16_t, + z0 = svtbl_bf16 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f16.c new file mode 100644 index 000000000..94b610412 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_f16_tied1: +** tbl z0\.h, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tbl_f16_tied1, svfloat16_t, svuint16_t, + z0 = svtbl_f16 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_f16_tied2: +** 
tbl z0\.h, z4\.h, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (tbl_f16_tied2, svfloat16_t, svuint16_t, + z0_res = svtbl_f16 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_f16_untied: +** tbl z0\.h, z1\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tbl_f16_untied, svfloat16_t, svuint16_t, + z0 = svtbl_f16 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f32.c new file mode 100644 index 000000000..741d3bdcf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_f32_tied1: +** tbl z0\.s, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tbl_f32_tied1, svfloat32_t, svuint32_t, + z0 = svtbl_f32 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_f32_tied2: +** tbl z0\.s, z4\.s, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (tbl_f32_tied2, svfloat32_t, svuint32_t, + z0_res = svtbl_f32 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_f32_untied: +** tbl z0\.s, z1\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tbl_f32_untied, svfloat32_t, svuint32_t, + z0 = svtbl_f32 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f64.c new file mode 100644 index 000000000..3c24e9a59 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_f64_tied1: +** tbl z0\.d, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tbl_f64_tied1, svfloat64_t, svuint64_t, + z0 = svtbl_f64 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_f64_tied2: +** tbl z0\.d, z4\.d, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (tbl_f64_tied2, svfloat64_t, svuint64_t, + z0_res = svtbl_f64 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_f64_untied: +** tbl z0\.d, z1\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tbl_f64_untied, svfloat64_t, svuint64_t, + z0 = svtbl_f64 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s16.c new file mode 100644 index 000000000..2ec9c389a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_s16_tied1: +** tbl z0\.h, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tbl_s16_tied1, svint16_t, svuint16_t, + z0 = svtbl_s16 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_s16_tied2: +** tbl z0\.h, z4\.h, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (tbl_s16_tied2, svint16_t, svuint16_t, + z0_res = svtbl_s16 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_s16_untied: +** tbl z0\.h, z1\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tbl_s16_untied, svint16_t, svuint16_t, + z0 = svtbl_s16 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s32.c new file mode 100644 index 000000000..98b2d8d8b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_s32_tied1: +** tbl z0\.s, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tbl_s32_tied1, svint32_t, svuint32_t, + z0 = svtbl_s32 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_s32_tied2: +** tbl z0\.s, 
z4\.s, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (tbl_s32_tied2, svint32_t, svuint32_t, + z0_res = svtbl_s32 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_s32_untied: +** tbl z0\.s, z1\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tbl_s32_untied, svint32_t, svuint32_t, + z0 = svtbl_s32 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s64.c new file mode 100644 index 000000000..0138a80d2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_s64_tied1: +** tbl z0\.d, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tbl_s64_tied1, svint64_t, svuint64_t, + z0 = svtbl_s64 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_s64_tied2: +** tbl z0\.d, z4\.d, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (tbl_s64_tied2, svint64_t, svuint64_t, + z0_res = svtbl_s64 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_s64_untied: +** tbl z0\.d, z1\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tbl_s64_untied, svint64_t, svuint64_t, + z0 = svtbl_s64 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s8.c new file mode 100644 index 000000000..7818d1b6d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_s8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_s8_tied1: +** tbl z0\.b, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (tbl_s8_tied1, svint8_t, svuint8_t, + z0 = svtbl_s8 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_s8_tied2: +** tbl z0\.b, z4\.b, z0\.b +** ret +*/ +TEST_DUAL_Z_REV (tbl_s8_tied2, svint8_t, svuint8_t, + z0_res = svtbl_s8 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_s8_untied: +** tbl z0\.b, z1\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (tbl_s8_untied, svint8_t, svuint8_t, + z0 = svtbl_s8 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u16.c new file mode 100644 index 000000000..f15da9211 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_u16_tied1: +** tbl z0\.h, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tbl_u16_tied1, svuint16_t, svuint16_t, + z0 = svtbl_u16 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_u16_tied2: +** tbl z0\.h, z4\.h, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (tbl_u16_tied2, svuint16_t, svuint16_t, + z0_res = svtbl_u16 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_u16_untied: +** tbl z0\.h, z1\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tbl_u16_untied, svuint16_t, svuint16_t, + z0 = svtbl_u16 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u32.c new file mode 100644 index 000000000..494300436 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_u32_tied1: +** tbl z0\.s, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tbl_u32_tied1, svuint32_t, svuint32_t, + z0 = svtbl_u32 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_u32_tied2: +** tbl z0\.s, z4\.s, z0\.s +** ret +*/ +TEST_DUAL_Z_REV 
(tbl_u32_tied2, svuint32_t, svuint32_t, + z0_res = svtbl_u32 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_u32_untied: +** tbl z0\.s, z1\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tbl_u32_untied, svuint32_t, svuint32_t, + z0 = svtbl_u32 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u64.c new file mode 100644 index 000000000..158990e12 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_u64_tied1: +** tbl z0\.d, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tbl_u64_tied1, svuint64_t, svuint64_t, + z0 = svtbl_u64 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_u64_tied2: +** tbl z0\.d, z4\.d, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (tbl_u64_tied2, svuint64_t, svuint64_t, + z0_res = svtbl_u64 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_u64_untied: +** tbl z0\.d, z1\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tbl_u64_untied, svuint64_t, svuint64_t, + z0 = svtbl_u64 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u8.c new file mode 100644 index 000000000..a46309a95 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tbl_u8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tbl_u8_tied1: +** tbl z0\.b, z0\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (tbl_u8_tied1, svuint8_t, svuint8_t, + z0 = svtbl_u8 (z0, z4), + z0 = svtbl (z0, z4)) + +/* +** tbl_u8_tied2: +** tbl z0\.b, z4\.b, z0\.b +** ret +*/ +TEST_DUAL_Z_REV (tbl_u8_tied2, svuint8_t, svuint8_t, + z0_res = svtbl_u8 (z4, z0), + z0_res = svtbl (z4, z0)) + +/* +** tbl_u8_untied: +** tbl z0\.b, z1\.b, z4\.b +** ret +*/ +TEST_DUAL_Z (tbl_u8_untied, svuint8_t, svuint8_t, + z0 = svtbl_u8 (z1, z4), + z0 = svtbl (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h new file mode 100644 index 000000000..d1f8fdb13 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/test_sve_acle.h @@ -0,0 +1,424 @@ +#ifndef TEST_SVE_ACLE_H +#define TEST_SVE_ACLE_H 1 + +#include + +#if defined (TEST_OVERLOADS) +#define INVOKE(CODE1, CODE2) CODE2 +#elif defined (TEST_FULL) +#define INVOKE(CODE1, CODE2) CODE1 +#else +#error "Please define -DTEST_OVERLOADS or -DTEST_FULL" +#endif + +#ifdef __cplusplus +#define PROTO(NAME, RET, ARGS) extern "C" RET NAME ARGS; RET NAME ARGS +#else +#define PROTO(NAME, RET, ARGS) RET NAME ARGS +#endif + +#define TEST_UNIFORM_Z(NAME, TYPE, CODE1, CODE2) \ + PROTO (NAME, TYPE, (TYPE z0, TYPE z1, TYPE z2, TYPE z3, \ + svbool_t p0, svbool_t p1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_UNIFORM_P(NAME, CODE1, CODE2) \ + PROTO (NAME, svbool_t, (svbool_t p0, svbool_t p1, \ + svbool_t p2, svbool_t p3)) \ + { \ + INVOKE (CODE1, CODE2); \ + return p0; \ + } + +#define TEST_UNIFORM_P_SINGLE(NAME, CODE) \ + PROTO (NAME, svbool_t, (svbool_t p0, svbool_t p1, \ + svbool_t p2, svbool_t p3)) \ + { \ + CODE; \ + return p0; \ + } + +#define TEST_UNIFORM_S(NAME, TYPE, CODE1, CODE2) \ + PROTO (NAME, TYPE, (TYPE x0, TYPE x1, TYPE x2, TYPE x3, \ + svbool_t p0, svbool_t p1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return x0; \ + } + +#define TEST_DUAL_Z(NAME, TYPE1, TYPE2, CODE1, CODE2) \ + PROTO (NAME, TYPE1, 
(TYPE1 z0, TYPE1 z1, TYPE1 z2, TYPE1 z3, \ + TYPE2 z4, TYPE2 z5, TYPE2 z6, TYPE2 z7, \ + svbool_t p0, svbool_t p1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_DUAL_Z_REV(NAME, TYPE1, TYPE2, CODE1, CODE2) \ + PROTO (NAME, TYPE1, (TYPE2 z0, TYPE2 z1, TYPE2 z2, TYPE2 z3, \ + TYPE1 z4, TYPE1 z5, TYPE1 z6, TYPE1 z7, \ + svbool_t p0, svbool_t p1)) \ + { \ + TYPE1 z0_res; \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#define TEST_TRIPLE_Z(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2) \ + PROTO (NAME, TYPE1, (TYPE1 z0, TYPE1 z1, TYPE2 z2, TYPE2 z3, \ + TYPE3 z4, TYPE3 z5, \ + svbool_t p0, svbool_t p1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_TRIPLE_Z_REV2(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2)\ + PROTO (NAME, TYPE1, (TYPE2 z0, TYPE2 z1, TYPE1 z2, TYPE1 z3, \ + TYPE3 z4, TYPE3 z5, \ + svbool_t p0, svbool_t p1)) \ + { \ + TYPE1 z0_res; \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#define TEST_TRIPLE_Z_REV(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2)\ + PROTO (NAME, TYPE1, (TYPE3 z0, TYPE3 z1, TYPE2 z2, TYPE2 z3, \ + TYPE1 z4, TYPE1 z5, \ + svbool_t p0, svbool_t p1)) \ + { \ + TYPE1 z0_res; \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#define TEST_DUAL_LANE_REG(NAME, ZTYPE1, ZTYPE2, REG, CODE1, CODE2) \ + PROTO (NAME, void, (void)) \ + { \ + register ZTYPE1 z0 __asm ("z0"); \ + register ZTYPE2 z1 __asm ("z1"); \ + register ZTYPE2 REG __asm (#REG); \ + __asm volatile ("" : "=w" (z0), "=w" (z1), "=w" (REG)); \ + INVOKE (CODE1, CODE2); \ + __asm volatile ("" :: "w" (z0)); \ + } + +#define TEST_TYPE_CHANGE_Z(NAME, TYPE1, TYPE2, CODE1, CODE2) \ + PROTO (NAME, TYPE1, (TYPE2 z0, TYPE2 z1, TYPE2 z2, TYPE2 z3, \ + svbool_t p0, svbool_t p1)) \ + { \ + TYPE1 z0_res; \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#define TEST_TRIPLE_LANE_REG(NAME, ZTYPE1, ZTYPE2, ZTYPE3, REG, CODE1, CODE2) \ + PROTO (NAME, void, (void)) \ + { \ + register ZTYPE1 z0 __asm ("z0"); \ + register ZTYPE2 z1 __asm ("z1"); \ + register ZTYPE3 REG __asm (#REG); \ + __asm volatile ("" : "=w" (z0), "=w" (z1), "=w" (REG)); \ + INVOKE (CODE1, CODE2); \ + __asm volatile ("" :: "w" (z0)); \ + } + +#define TEST_TRIPLE_ZX(NAME, TYPE1, TYPE2, TYPE3, CODE1, CODE2) \ + PROTO (NAME, TYPE1, (TYPE1 z0, TYPE1 z1, TYPE2 z2, TYPE2 z3, \ + TYPE3 x0, TYPE3 x1, \ + svbool_t p0, svbool_t p1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_UNIFORM_ZX(NAME, ZTYPE, STYPE, CODE1, CODE2) \ + PROTO (NAME, ZTYPE, (ZTYPE z0, ZTYPE z1, ZTYPE z2, ZTYPE z3, \ + svbool_t p0, STYPE x0)) \ + { \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_UNIFORM_ZD(NAME, ZTYPE, STYPE, CODE1, CODE2) \ + PROTO (NAME, ZTYPE, (ZTYPE z0, ZTYPE z1, ZTYPE z2, ZTYPE z3, \ + svbool_t p0, STYPE d4)) \ + { \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_UNIFORM_PS(NAME, CODE1, CODE2) \ + PROTO (NAME, svbool_t, (svbool_t p0, svbool_t p1, \ + svbool_t p2, svbool_t p3, bool x0)) \ + { \ + INVOKE (CODE1, CODE2); \ + return p0; \ + } + +#define TEST_DUAL_ZD(NAME, ZTYPE1, ZTYPE2, STYPE, CODE1, CODE2) \ + PROTO (NAME, ZTYPE1, (ZTYPE1 z0, ZTYPE1 z1, ZTYPE1 z2, \ + ZTYPE1 z3, ZTYPE2 z4, ZTYPE2 z5, \ + ZTYPE2 z6, STYPE d7, svbool_t p0, \ + svbool_t p1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_DUAL_ZX(NAME, ZTYPE1, ZTYPE2, STYPE, CODE1, CODE2) \ + PROTO (NAME, ZTYPE1, (ZTYPE1 z0, ZTYPE1 z1, ZTYPE1 z2, \ + ZTYPE1 z3, ZTYPE2 z4, ZTYPE2 z5, \ + ZTYPE2 z6, ZTYPE2 z7, svbool_t p0, \ + svbool_t p1, STYPE x0)) \ + { \ + INVOKE (CODE1, CODE2); \ 
+ return z0; \ + } + +#define TEST_TYPE_CHANGE_ZX(NAME, ZTYPE1, ZTYPE2, STYPE, CODE1, CODE2) \ + PROTO (NAME, ZTYPE1, (ZTYPE2 z0, ZTYPE2 z1, ZTYPE2 z2, \ + ZTYPE2 z3, svbool_t p0, svbool_t p1, \ + STYPE x0)) \ + { \ + ZTYPE1 z0_res; \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#define TEST_LOAD(NAME, ZTYPE, STYPE, CODE1, CODE2) \ + PROTO (NAME, ZTYPE, (svbool_t p0, const STYPE *x0, \ + intptr_t x1)) \ + { \ + ZTYPE z0; \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_LOAD_GATHER_SZ(NAME, RES_TYPE, STYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, RES_TYPE, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ + const STYPE *x0)) \ + { \ + RES_TYPE z0_res; \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#define TEST_LOAD_GATHER_ZS(NAME, RES_TYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, RES_TYPE, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ + int64_t x0)) \ + { \ + RES_TYPE z0_res; \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#define TEST_PREFETCH(NAME, STYPE, CODE1, CODE2) \ + PROTO (NAME, void, (svbool_t p0, const STYPE *x0, \ + intptr_t x1)) \ + { \ + INVOKE (CODE1, CODE2); \ + } + +#define TEST_PREFETCH_GATHER_SZ(NAME, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, void, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ + const void *x0)) \ + { \ + INVOKE (CODE1, CODE2); \ + } + +#define TEST_PREFETCH_GATHER_ZS(NAME, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, void, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ + int64_t x0)) \ + { \ + INVOKE (CODE1, CODE2); \ + } + +#define TEST_STORE(NAME, ZTYPE, STYPE, CODE1, CODE2) \ + PROTO (NAME, void, (ZTYPE z0, svbool_t p0, STYPE *x0, \ + intptr_t x1)) \ + { \ + INVOKE (CODE1, CODE2); \ + } + +#define TEST_STORE_SCATTER_SZ(NAME, DATA_TYPE, STYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, void, (DATA_TYPE z0, ZTYPE z1, svbool_t p0, \ + STYPE *x0)) \ + { \ + INVOKE (CODE1, CODE2); \ + } + +#define TEST_STORE_SCATTER_ZS(NAME, DATA_TYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, void, (DATA_TYPE z0, ZTYPE z1, svbool_t p0, \ + int64_t x0)) \ + { \ + INVOKE (CODE1, CODE2); \ + } + +#define TEST_P(NAME, CODE1, CODE2) \ + PROTO (NAME, svbool_t, (void)) \ + { \ + svbool_t p0; \ + INVOKE (CODE1, CODE2); \ + return p0; \ + } + +#define TEST_PTEST(NAME, TYPE, CODE) \ + PROTO (NAME, TYPE, (svbool_t p0, svbool_t p1, svbool_t p2, \ + svbool_t p3, TYPE x0, TYPE x1)) \ + { \ + INVOKE (CODE, CODE); \ + return x0; \ + } + +#define TEST_COMPARE_S(NAME, TYPE, CODE1, CODE2) \ + PROTO (NAME, svbool_t, (TYPE x0, TYPE x1)) \ + { \ + svbool_t p0; \ + INVOKE (CODE1, CODE2); \ + return p0; \ + } + +#define TEST_COMPARE_Z(NAME, TYPE, CODE1, CODE2) \ + PROTO (NAME, svbool_t, (TYPE z0, TYPE z1, \ + svbool_t p0, svbool_t p1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return p0; \ + } + +#define TEST_COMPARE_ZX(NAME, ZTYPE, STYPE, CODE1, CODE2) \ + PROTO (NAME, svbool_t, (ZTYPE z0, ZTYPE z1, svbool_t p0, \ + svbool_t p1, STYPE x0)) \ + { \ + INVOKE (CODE1, CODE2); \ + return p0; \ + } + +#define TEST_COMPARE_ZD(NAME, ZTYPE, STYPE, CODE1, CODE2) \ + PROTO (NAME, svbool_t, (ZTYPE z0, ZTYPE z1, ZTYPE z2, \ + ZTYPE z3, svbool_t p0, svbool_t p1, \ + STYPE d4)) \ + { \ + INVOKE (CODE1, CODE2); \ + return p0; \ + } + +#define TEST_COMPARE_DUAL_Z(NAME, TYPE1, TYPE2, CODE1, CODE2) \ + PROTO (NAME, svbool_t, (TYPE1 z0, TYPE2 z1, \ + svbool_t p0, svbool_t p1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return p0; \ + } + +#define TEST_REDUCTION_X(NAME, STYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, STYPE, (ZTYPE z0, ZTYPE z1, svbool_t p0)) \ + { \ + STYPE x0; \ + INVOKE (CODE1, CODE2); \ + return x0; \ + } + +#define TEST_REDUCTION_D(NAME, 
STYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, STYPE, (ZTYPE z0, ZTYPE z1, svbool_t p0)) \ + { \ + STYPE d0; \ + INVOKE (CODE1, CODE2); \ + return d0; \ + } + +#define TEST_FOLD_LEFT_D(NAME, STYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, STYPE, (STYPE d0, STYPE d1, ZTYPE z2, \ + svbool_t p0)) \ + { \ + INVOKE (CODE1, CODE2); \ + return d0; \ + } + +#define TEST_FOLD_LEFT_X(NAME, STYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, STYPE, (STYPE x0, STYPE x1, ZTYPE z0, \ + svbool_t p0)) \ + { \ + INVOKE (CODE1, CODE2); \ + return x0; \ + } + +#define TEST_S(NAME, ZTYPE, STYPE, CODE) \ + PROTO (NAME, ZTYPE, (STYPE x0, STYPE x1)) \ + { \ + ZTYPE z0; \ + CODE; \ + return z0; \ + } + +#define TEST_ADR(NAME, TYPE1, TYPE2, CODE1, CODE2) \ + PROTO (NAME, TYPE1, (TYPE1 z0, TYPE2 z1)) \ + { \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_UNDEF(NAME, TYPE, CODE) \ + PROTO (NAME, TYPE, (void)) \ + { \ + TYPE z0; \ + CODE; \ + return z0; \ + } + +#define TEST_CREATE(NAME, TTYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, TTYPE, (ZTYPE unused0, ZTYPE unused1, \ + ZTYPE unused2, ZTYPE unused3, \ + ZTYPE z4, ZTYPE z5, ZTYPE z6, ZTYPE z7)) \ + { \ + TTYPE z0; \ + INVOKE (CODE1, CODE2); \ + return z0; \ + } + +#define TEST_GET(NAME, TTYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, void, (ZTYPE unused0, ZTYPE unused1, \ + ZTYPE unused2, ZTYPE unused3, TTYPE z4)) \ + { \ + register ZTYPE z0 __asm ("z0"); \ + register ZTYPE z4_res __asm ("z4"); \ + register ZTYPE z5_res __asm ("z5"); \ + register ZTYPE z6_res __asm ("z6"); \ + register ZTYPE z7_res __asm ("z7"); \ + INVOKE (CODE1, CODE2); \ + __asm volatile ("" :: "w" (z0), "w" (z4_res), "w" (z5_res), \ + "w" (z6_res), "w" (z7_res)); \ + } + +#define TEST_SET(NAME, TTYPE, ZTYPE, CODE1, CODE2) \ + PROTO (NAME, void, (ZTYPE z0, ZTYPE z1, ZTYPE z2, ZTYPE z3, \ + TTYPE z4)) \ + { \ + register TTYPE z24 __asm ("z24"); \ + INVOKE (CODE1, CODE2); \ + __asm volatile ("" :: "w" (z4), "w" (z24)); \ + } + +#define TEST_TBL2(NAME, TTYPE, ZTYPE, UTYPE, CODE1, CODE2) \ + PROTO (NAME, ZTYPE, (TTYPE z0, TTYPE z2, UTYPE z4)) \ + { \ + register ZTYPE z0_res __asm ("z0"); \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#define TEST_TBL2_REV(NAME, TTYPE, ZTYPE, UTYPE, CODE1, CODE2) \ + PROTO (NAME, ZTYPE, (UTYPE z0, TTYPE z1, TTYPE z3)) \ + { \ + register ZTYPE z0_res __asm ("z0"); \ + INVOKE (CODE1, CODE2); \ + return z0_res; \ + } + +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f16.c new file mode 100644 index 000000000..3a00716e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f16.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tmad_0_f16_tied1: +** ftmad z0\.h, z0\.h, z1\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f16_tied1, svfloat16_t, + z0 = svtmad_f16 (z0, z1, 0), + z0 = svtmad (z0, z1, 0)) + +/* +** tmad_0_f16_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ftmad z0\.h, z0\.h, \1\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f16_tied2, svfloat16_t, + z0 = svtmad_f16 (z1, z0, 0), + z0 = svtmad (z1, z0, 0)) + +/* +** tmad_0_f16_untied: +** movprfx z0, z1 +** ftmad z0\.h, z0\.h, z2\.h, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f16_untied, svfloat16_t, + z0 = svtmad_f16 (z1, z2, 0), + z0 = svtmad (z1, z2, 0)) + +/* +** tmad_1_f16: +** ftmad z0\.h, z0\.h, z1\.h, #1 +** ret +*/ +TEST_UNIFORM_Z (tmad_1_f16, svfloat16_t, + z0 = svtmad_f16 (z0, z1, 1), + z0 = svtmad (z0, z1, 1)) + +/* +** 
tmad_2_f16: +** ftmad z0\.h, z0\.h, z1\.h, #2 +** ret +*/ +TEST_UNIFORM_Z (tmad_2_f16, svfloat16_t, + z0 = svtmad_f16 (z0, z1, 2), + z0 = svtmad (z0, z1, 2)) + +/* +** tmad_3_f16: +** ftmad z0\.h, z0\.h, z1\.h, #3 +** ret +*/ +TEST_UNIFORM_Z (tmad_3_f16, svfloat16_t, + z0 = svtmad_f16 (z0, z1, 3), + z0 = svtmad (z0, z1, 3)) + +/* +** tmad_4_f16: +** ftmad z0\.h, z0\.h, z1\.h, #4 +** ret +*/ +TEST_UNIFORM_Z (tmad_4_f16, svfloat16_t, + z0 = svtmad_f16 (z0, z1, 4), + z0 = svtmad (z0, z1, 4)) + +/* +** tmad_5_f16: +** ftmad z0\.h, z0\.h, z1\.h, #5 +** ret +*/ +TEST_UNIFORM_Z (tmad_5_f16, svfloat16_t, + z0 = svtmad_f16 (z0, z1, 5), + z0 = svtmad (z0, z1, 5)) + +/* +** tmad_6_f16: +** ftmad z0\.h, z0\.h, z1\.h, #6 +** ret +*/ +TEST_UNIFORM_Z (tmad_6_f16, svfloat16_t, + z0 = svtmad_f16 (z0, z1, 6), + z0 = svtmad (z0, z1, 6)) + +/* +** tmad_7_f16: +** ftmad z0\.h, z0\.h, z1\.h, #7 +** ret +*/ +TEST_UNIFORM_Z (tmad_7_f16, svfloat16_t, + z0 = svtmad_f16 (z0, z1, 7), + z0 = svtmad (z0, z1, 7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f32.c new file mode 100644 index 000000000..b73d420fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f32.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tmad_0_f32_tied1: +** ftmad z0\.s, z0\.s, z1\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f32_tied1, svfloat32_t, + z0 = svtmad_f32 (z0, z1, 0), + z0 = svtmad (z0, z1, 0)) + +/* +** tmad_0_f32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z1 +** ftmad z0\.s, z0\.s, \1\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f32_tied2, svfloat32_t, + z0 = svtmad_f32 (z1, z0, 0), + z0 = svtmad (z1, z0, 0)) + +/* +** tmad_0_f32_untied: +** movprfx z0, z1 +** ftmad z0\.s, z0\.s, z2\.s, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f32_untied, svfloat32_t, + z0 = svtmad_f32 (z1, z2, 0), + z0 = svtmad (z1, z2, 0)) + +/* +** tmad_1_f32: +** ftmad z0\.s, z0\.s, z1\.s, #1 +** ret +*/ +TEST_UNIFORM_Z (tmad_1_f32, svfloat32_t, + z0 = svtmad_f32 (z0, z1, 1), + z0 = svtmad (z0, z1, 1)) + +/* +** tmad_2_f32: +** ftmad z0\.s, z0\.s, z1\.s, #2 +** ret +*/ +TEST_UNIFORM_Z (tmad_2_f32, svfloat32_t, + z0 = svtmad_f32 (z0, z1, 2), + z0 = svtmad (z0, z1, 2)) + +/* +** tmad_3_f32: +** ftmad z0\.s, z0\.s, z1\.s, #3 +** ret +*/ +TEST_UNIFORM_Z (tmad_3_f32, svfloat32_t, + z0 = svtmad_f32 (z0, z1, 3), + z0 = svtmad (z0, z1, 3)) + +/* +** tmad_4_f32: +** ftmad z0\.s, z0\.s, z1\.s, #4 +** ret +*/ +TEST_UNIFORM_Z (tmad_4_f32, svfloat32_t, + z0 = svtmad_f32 (z0, z1, 4), + z0 = svtmad (z0, z1, 4)) + +/* +** tmad_5_f32: +** ftmad z0\.s, z0\.s, z1\.s, #5 +** ret +*/ +TEST_UNIFORM_Z (tmad_5_f32, svfloat32_t, + z0 = svtmad_f32 (z0, z1, 5), + z0 = svtmad (z0, z1, 5)) + +/* +** tmad_6_f32: +** ftmad z0\.s, z0\.s, z1\.s, #6 +** ret +*/ +TEST_UNIFORM_Z (tmad_6_f32, svfloat32_t, + z0 = svtmad_f32 (z0, z1, 6), + z0 = svtmad (z0, z1, 6)) + +/* +** tmad_7_f32: +** ftmad z0\.s, z0\.s, z1\.s, #7 +** ret +*/ +TEST_UNIFORM_Z (tmad_7_f32, svfloat32_t, + z0 = svtmad_f32 (z0, z1, 7), + z0 = svtmad (z0, z1, 7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f64.c new file mode 100644 index 000000000..fc31928a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tmad_f64.c @@ -0,0 +1,96 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tmad_0_f64_tied1: +** 
ftmad z0\.d, z0\.d, z1\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f64_tied1, svfloat64_t, + z0 = svtmad_f64 (z0, z1, 0), + z0 = svtmad (z0, z1, 0)) + +/* +** tmad_0_f64_tied2: +** mov (z[0-9]+\.d), z0\.d +** movprfx z0, z1 +** ftmad z0\.d, z0\.d, \1, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f64_tied2, svfloat64_t, + z0 = svtmad_f64 (z1, z0, 0), + z0 = svtmad (z1, z0, 0)) + +/* +** tmad_0_f64_untied: +** movprfx z0, z1 +** ftmad z0\.d, z0\.d, z2\.d, #0 +** ret +*/ +TEST_UNIFORM_Z (tmad_0_f64_untied, svfloat64_t, + z0 = svtmad_f64 (z1, z2, 0), + z0 = svtmad (z1, z2, 0)) + +/* +** tmad_1_f64: +** ftmad z0\.d, z0\.d, z1\.d, #1 +** ret +*/ +TEST_UNIFORM_Z (tmad_1_f64, svfloat64_t, + z0 = svtmad_f64 (z0, z1, 1), + z0 = svtmad (z0, z1, 1)) + +/* +** tmad_2_f64: +** ftmad z0\.d, z0\.d, z1\.d, #2 +** ret +*/ +TEST_UNIFORM_Z (tmad_2_f64, svfloat64_t, + z0 = svtmad_f64 (z0, z1, 2), + z0 = svtmad (z0, z1, 2)) + +/* +** tmad_3_f64: +** ftmad z0\.d, z0\.d, z1\.d, #3 +** ret +*/ +TEST_UNIFORM_Z (tmad_3_f64, svfloat64_t, + z0 = svtmad_f64 (z0, z1, 3), + z0 = svtmad (z0, z1, 3)) + +/* +** tmad_4_f64: +** ftmad z0\.d, z0\.d, z1\.d, #4 +** ret +*/ +TEST_UNIFORM_Z (tmad_4_f64, svfloat64_t, + z0 = svtmad_f64 (z0, z1, 4), + z0 = svtmad (z0, z1, 4)) + +/* +** tmad_5_f64: +** ftmad z0\.d, z0\.d, z1\.d, #5 +** ret +*/ +TEST_UNIFORM_Z (tmad_5_f64, svfloat64_t, + z0 = svtmad_f64 (z0, z1, 5), + z0 = svtmad (z0, z1, 5)) + +/* +** tmad_6_f64: +** ftmad z0\.d, z0\.d, z1\.d, #6 +** ret +*/ +TEST_UNIFORM_Z (tmad_6_f64, svfloat64_t, + z0 = svtmad_f64 (z0, z1, 6), + z0 = svtmad (z0, z1, 6)) + +/* +** tmad_7_f64: +** ftmad z0\.d, z0\.d, z1\.d, #7 +** ret +*/ +TEST_UNIFORM_Z (tmad_7_f64, svfloat64_t, + z0 = svtmad_f64 (z0, z1, 7), + z0 = svtmad (z0, z1, 7)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b16.c new file mode 100644 index 000000000..902f8c397 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_b16_tied1: +** trn1 p0\.h, p0\.h, p1\.h +** ret +*/ +TEST_UNIFORM_P (trn1_b16_tied1, + p0 = svtrn1_b16 (p0, p1), + p0 = svtrn1_b16 (p0, p1)) + +/* +** trn1_b16_tied2: +** trn1 p0\.h, p1\.h, p0\.h +** ret +*/ +TEST_UNIFORM_P (trn1_b16_tied2, + p0 = svtrn1_b16 (p1, p0), + p0 = svtrn1_b16 (p1, p0)) + +/* +** trn1_b16_untied: +** trn1 p0\.h, p1\.h, p2\.h +** ret +*/ +TEST_UNIFORM_P (trn1_b16_untied, + p0 = svtrn1_b16 (p1, p2), + p0 = svtrn1_b16 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b32.c new file mode 100644 index 000000000..8c9ed5152 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_b32_tied1: +** trn1 p0\.s, p0\.s, p1\.s +** ret +*/ +TEST_UNIFORM_P (trn1_b32_tied1, + p0 = svtrn1_b32 (p0, p1), + p0 = svtrn1_b32 (p0, p1)) + +/* +** trn1_b32_tied2: +** trn1 p0\.s, p1\.s, p0\.s +** ret +*/ +TEST_UNIFORM_P (trn1_b32_tied2, + p0 = svtrn1_b32 (p1, p0), + p0 = svtrn1_b32 (p1, p0)) + +/* +** trn1_b32_untied: +** trn1 p0\.s, p1\.s, p2\.s +** ret +*/ +TEST_UNIFORM_P (trn1_b32_untied, + p0 = svtrn1_b32 (p1, p2), + p0 = svtrn1_b32 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b64.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b64.c new file mode 100644 index 000000000..55b00571d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_b64_tied1: +** trn1 p0\.d, p0\.d, p1\.d +** ret +*/ +TEST_UNIFORM_P (trn1_b64_tied1, + p0 = svtrn1_b64 (p0, p1), + p0 = svtrn1_b64 (p0, p1)) + +/* +** trn1_b64_tied2: +** trn1 p0\.d, p1\.d, p0\.d +** ret +*/ +TEST_UNIFORM_P (trn1_b64_tied2, + p0 = svtrn1_b64 (p1, p0), + p0 = svtrn1_b64 (p1, p0)) + +/* +** trn1_b64_untied: +** trn1 p0\.d, p1\.d, p2\.d +** ret +*/ +TEST_UNIFORM_P (trn1_b64_untied, + p0 = svtrn1_b64 (p1, p2), + p0 = svtrn1_b64 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b8.c new file mode 100644 index 000000000..4b5e80fbe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_b8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_b8_tied1: +** trn1 p0\.b, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (trn1_b8_tied1, + p0 = svtrn1_b8 (p0, p1), + p0 = svtrn1_b8 (p0, p1)) + +/* +** trn1_b8_tied2: +** trn1 p0\.b, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (trn1_b8_tied2, + p0 = svtrn1_b8 (p1, p0), + p0 = svtrn1_b8 (p1, p0)) + +/* +** trn1_b8_untied: +** trn1 p0\.b, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (trn1_b8_untied, + p0 = svtrn1_b8 (p1, p2), + p0 = svtrn1_b8 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_bf16.c new file mode 100644 index 000000000..b04c7da4f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_bf16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_bf16_tied1: +** trn1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_bf16_tied1, svbfloat16_t, + z0 = svtrn1_bf16 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_bf16_tied2: +** trn1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_bf16_tied2, svbfloat16_t, + z0 = svtrn1_bf16 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_bf16_untied: +** trn1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_bf16_untied, svbfloat16_t, + z0 = svtrn1_bf16 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f16.c new file mode 100644 index 000000000..373eb9dd9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_f16_tied1: +** trn1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_f16_tied1, svfloat16_t, + z0 = svtrn1_f16 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_f16_tied2: +** trn1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_f16_tied2, svfloat16_t, + z0 = svtrn1_f16 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_f16_untied: +** trn1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_f16_untied, svfloat16_t, + z0 = svtrn1_f16 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f32.c new file mode 100644 index 000000000..ccd84d94e --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_f32_tied1: +** trn1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_f32_tied1, svfloat32_t, + z0 = svtrn1_f32 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_f32_tied2: +** trn1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_f32_tied2, svfloat32_t, + z0 = svtrn1_f32 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_f32_untied: +** trn1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_f32_untied, svfloat32_t, + z0 = svtrn1_f32 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f64.c new file mode 100644 index 000000000..d3cc51948 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_f64_tied1: +** trn1 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_f64_tied1, svfloat64_t, + z0 = svtrn1_f64 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_f64_tied2: +** trn1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_f64_tied2, svfloat64_t, + z0 = svtrn1_f64 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_f64_untied: +** trn1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_f64_untied, svfloat64_t, + z0 = svtrn1_f64 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s16.c new file mode 100644 index 000000000..466bb8c02 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_s16_tied1: +** trn1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_s16_tied1, svint16_t, + z0 = svtrn1_s16 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_s16_tied2: +** trn1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_s16_tied2, svint16_t, + z0 = svtrn1_s16 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_s16_untied: +** trn1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_s16_untied, svint16_t, + z0 = svtrn1_s16 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s32.c new file mode 100644 index 000000000..24655e622 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_s32_tied1: +** trn1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_s32_tied1, svint32_t, + z0 = svtrn1_s32 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_s32_tied2: +** trn1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_s32_tied2, svint32_t, + z0 = svtrn1_s32 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_s32_untied: +** trn1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_s32_untied, svint32_t, + z0 = svtrn1_s32 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s64.c new file mode 100644 index 000000000..553fb610b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s64.c @@ -0,0 +1,30 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_s64_tied1: +** trn1 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_s64_tied1, svint64_t, + z0 = svtrn1_s64 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_s64_tied2: +** trn1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_s64_tied2, svint64_t, + z0 = svtrn1_s64 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_s64_untied: +** trn1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_s64_untied, svint64_t, + z0 = svtrn1_s64 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s8.c new file mode 100644 index 000000000..1fa150792 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_s8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_s8_tied1: +** trn1 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (trn1_s8_tied1, svint8_t, + z0 = svtrn1_s8 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_s8_tied2: +** trn1 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (trn1_s8_tied2, svint8_t, + z0 = svtrn1_s8 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_s8_untied: +** trn1 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (trn1_s8_untied, svint8_t, + z0 = svtrn1_s8 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u16.c new file mode 100644 index 000000000..a3ce936f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_u16_tied1: +** trn1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_u16_tied1, svuint16_t, + z0 = svtrn1_u16 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_u16_tied2: +** trn1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_u16_tied2, svuint16_t, + z0 = svtrn1_u16 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_u16_untied: +** trn1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (trn1_u16_untied, svuint16_t, + z0 = svtrn1_u16 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u32.c new file mode 100644 index 000000000..b14d7a67a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_u32_tied1: +** trn1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_u32_tied1, svuint32_t, + z0 = svtrn1_u32 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_u32_tied2: +** trn1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_u32_tied2, svuint32_t, + z0 = svtrn1_u32 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_u32_untied: +** trn1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (trn1_u32_untied, svuint32_t, + z0 = svtrn1_u32 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u64.c new file mode 100644 index 000000000..2ccda1d72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_u64_tied1: +** trn1 z0\.d, 
z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_u64_tied1, svuint64_t, + z0 = svtrn1_u64 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_u64_tied2: +** trn1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_u64_tied2, svuint64_t, + z0 = svtrn1_u64 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_u64_untied: +** trn1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (trn1_u64_untied, svuint64_t, + z0 = svtrn1_u64 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u8.c new file mode 100644 index 000000000..84f8d31e8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1_u8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1_u8_tied1: +** trn1 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (trn1_u8_tied1, svuint8_t, + z0 = svtrn1_u8 (z0, z1), + z0 = svtrn1 (z0, z1)) + +/* +** trn1_u8_tied2: +** trn1 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (trn1_u8_tied2, svuint8_t, + z0 = svtrn1_u8 (z1, z0), + z0 = svtrn1 (z1, z0)) + +/* +** trn1_u8_untied: +** trn1 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (trn1_u8_untied, svuint8_t, + z0 = svtrn1_u8 (z1, z2), + z0 = svtrn1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_bf16.c new file mode 100644 index 000000000..f1810da9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_bf16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_bf16_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_bf16_tied1, svbfloat16_t, + z0 = svtrn1q_bf16 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_bf16_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_bf16_tied2, svbfloat16_t, + z0 = svtrn1q_bf16 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_bf16_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_bf16_untied, svbfloat16_t, + z0 = svtrn1q_bf16 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f16.c new file mode 100644 index 000000000..6420d0f0a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_f16_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f16_tied1, svfloat16_t, + z0 = svtrn1q_f16 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_f16_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f16_tied2, svfloat16_t, + z0 = svtrn1q_f16 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_f16_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f16_untied, svfloat16_t, + z0 = svtrn1q_f16 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f32.c new file mode 100644 index 000000000..6fb2eecf5 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_f32_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f32_tied1, svfloat32_t, + z0 = svtrn1q_f32 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_f32_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f32_tied2, svfloat32_t, + z0 = svtrn1q_f32 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_f32_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f32_untied, svfloat32_t, + z0 = svtrn1q_f32 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f64.c new file mode 100644 index 000000000..e786a8d04 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_f64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_f64_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f64_tied1, svfloat64_t, + z0 = svtrn1q_f64 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_f64_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f64_tied2, svfloat64_t, + z0 = svtrn1q_f64 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_f64_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_f64_untied, svfloat64_t, + z0 = svtrn1q_f64 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s16.c new file mode 100644 index 000000000..548360719 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_s16_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s16_tied1, svint16_t, + z0 = svtrn1q_s16 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_s16_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s16_tied2, svint16_t, + z0 = svtrn1q_s16 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_s16_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s16_untied, svint16_t, + z0 = svtrn1q_s16 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s32.c new file mode 100644 index 000000000..ccb8319f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_s32_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s32_tied1, svint32_t, + z0 = svtrn1q_s32 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_s32_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s32_tied2, 
svint32_t, + z0 = svtrn1q_s32 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_s32_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s32_untied, svint32_t, + z0 = svtrn1q_s32 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s64.c new file mode 100644 index 000000000..fe8125a8a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_s64_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s64_tied1, svint64_t, + z0 = svtrn1q_s64 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_s64_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s64_tied2, svint64_t, + z0 = svtrn1q_s64 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_s64_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s64_untied, svint64_t, + z0 = svtrn1q_s64 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s8.c new file mode 100644 index 000000000..48040c1ad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_s8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_s8_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s8_tied1, svint8_t, + z0 = svtrn1q_s8 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_s8_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s8_tied2, svint8_t, + z0 = svtrn1q_s8 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_s8_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_s8_untied, svint8_t, + z0 = svtrn1q_s8 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u16.c new file mode 100644 index 000000000..3657f919e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_u16_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u16_tied1, svuint16_t, + z0 = svtrn1q_u16 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_u16_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u16_tied2, svuint16_t, + z0 = svtrn1q_u16 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_u16_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u16_untied, svuint16_t, + z0 = svtrn1q_u16 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u32.c new file mode 100644 index 000000000..cc5ea2878 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { 
dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_u32_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u32_tied1, svuint32_t, + z0 = svtrn1q_u32 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_u32_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u32_tied2, svuint32_t, + z0 = svtrn1q_u32 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_u32_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u32_untied, svuint32_t, + z0 = svtrn1q_u32 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u64.c new file mode 100644 index 000000000..4435b53d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_u64_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u64_tied1, svuint64_t, + z0 = svtrn1q_u64 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_u64_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u64_tied2, svuint64_t, + z0 = svtrn1q_u64 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_u64_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u64_untied, svuint64_t, + z0 = svtrn1q_u64 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u8.c new file mode 100644 index 000000000..4ebfedbea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn1q_u8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn1q_u8_tied1: +** trn1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u8_tied1, svuint8_t, + z0 = svtrn1q_u8 (z0, z1), + z0 = svtrn1q (z0, z1)) + +/* +** trn1q_u8_tied2: +** trn1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u8_tied2, svuint8_t, + z0 = svtrn1q_u8 (z1, z0), + z0 = svtrn1q (z1, z0)) + +/* +** trn1q_u8_untied: +** trn1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn1q_u8_untied, svuint8_t, + z0 = svtrn1q_u8 (z1, z2), + z0 = svtrn1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b16.c new file mode 100644 index 000000000..54b593afe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_b16_tied1: +** trn2 p0\.h, p0\.h, p1\.h +** ret +*/ +TEST_UNIFORM_P (trn2_b16_tied1, + p0 = svtrn2_b16 (p0, p1), + p0 = svtrn2_b16 (p0, p1)) + +/* +** trn2_b16_tied2: +** trn2 p0\.h, p1\.h, p0\.h +** ret +*/ +TEST_UNIFORM_P (trn2_b16_tied2, + p0 = svtrn2_b16 (p1, p0), + p0 = svtrn2_b16 (p1, p0)) + +/* +** trn2_b16_untied: +** trn2 p0\.h, p1\.h, p2\.h +** ret +*/ +TEST_UNIFORM_P (trn2_b16_untied, + p0 = svtrn2_b16 (p1, p2), + p0 = svtrn2_b16 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b32.c new file mode 100644 index 000000000..ead3d85cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_b32_tied1: +** trn2 p0\.s, p0\.s, p1\.s +** ret +*/ +TEST_UNIFORM_P (trn2_b32_tied1, + p0 = svtrn2_b32 (p0, p1), + p0 = svtrn2_b32 (p0, p1)) + +/* +** trn2_b32_tied2: +** trn2 p0\.s, p1\.s, p0\.s +** ret +*/ +TEST_UNIFORM_P (trn2_b32_tied2, + p0 = svtrn2_b32 (p1, p0), + p0 = svtrn2_b32 (p1, p0)) + +/* +** trn2_b32_untied: +** trn2 p0\.s, p1\.s, p2\.s +** ret +*/ +TEST_UNIFORM_P (trn2_b32_untied, + p0 = svtrn2_b32 (p1, p2), + p0 = svtrn2_b32 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b64.c new file mode 100644 index 000000000..ccca03557 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_b64_tied1: +** trn2 p0\.d, p0\.d, p1\.d +** ret +*/ +TEST_UNIFORM_P (trn2_b64_tied1, + p0 = svtrn2_b64 (p0, p1), + p0 = svtrn2_b64 (p0, p1)) + +/* +** trn2_b64_tied2: +** trn2 p0\.d, p1\.d, p0\.d +** ret +*/ +TEST_UNIFORM_P (trn2_b64_tied2, + p0 = svtrn2_b64 (p1, p0), + p0 = svtrn2_b64 (p1, p0)) + +/* +** trn2_b64_untied: +** trn2 p0\.d, p1\.d, p2\.d +** ret +*/ +TEST_UNIFORM_P (trn2_b64_untied, + p0 = svtrn2_b64 (p1, p2), + p0 = svtrn2_b64 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b8.c new file mode 100644 index 000000000..7b0803e79 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_b8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_b8_tied1: +** trn2 p0\.b, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (trn2_b8_tied1, + p0 = svtrn2_b8 (p0, p1), + p0 = svtrn2_b8 (p0, p1)) + +/* +** trn2_b8_tied2: +** trn2 p0\.b, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (trn2_b8_tied2, + p0 = svtrn2_b8 (p1, p0), + p0 = svtrn2_b8 (p1, p0)) + +/* +** trn2_b8_untied: +** trn2 p0\.b, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (trn2_b8_untied, + p0 = svtrn2_b8 (p1, p2), + p0 = svtrn2_b8 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_bf16.c new file mode 100644 index 000000000..12028b0f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_bf16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_bf16_tied1: +** trn2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_bf16_tied1, svbfloat16_t, + z0 = svtrn2_bf16 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_bf16_tied2: +** trn2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_bf16_tied2, svbfloat16_t, + z0 = svtrn2_bf16 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_bf16_untied: +** trn2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_bf16_untied, svbfloat16_t, + z0 = svtrn2_bf16 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f16.c new file mode 100644 index 000000000..112567725 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_f16_tied1: +** trn2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_f16_tied1, svfloat16_t, + z0 = svtrn2_f16 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_f16_tied2: +** trn2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_f16_tied2, svfloat16_t, + z0 = svtrn2_f16 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_f16_untied: +** trn2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_f16_untied, svfloat16_t, + z0 = svtrn2_f16 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f32.c new file mode 100644 index 000000000..daee566cc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_f32_tied1: +** trn2 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_f32_tied1, svfloat32_t, + z0 = svtrn2_f32 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_f32_tied2: +** trn2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_f32_tied2, svfloat32_t, + z0 = svtrn2_f32 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_f32_untied: +** trn2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_f32_untied, svfloat32_t, + z0 = svtrn2_f32 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f64.c new file mode 100644 index 000000000..338fee49f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_f64_tied1: +** trn2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_f64_tied1, svfloat64_t, + z0 = svtrn2_f64 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_f64_tied2: +** trn2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_f64_tied2, svfloat64_t, + z0 = svtrn2_f64 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_f64_untied: +** trn2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_f64_untied, svfloat64_t, + z0 = svtrn2_f64 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s16.c new file mode 100644 index 000000000..93f63de5e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_s16_tied1: +** trn2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_s16_tied1, svint16_t, + z0 = svtrn2_s16 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_s16_tied2: +** trn2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_s16_tied2, svint16_t, + z0 = svtrn2_s16 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_s16_untied: +** trn2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_s16_untied, svint16_t, + z0 = svtrn2_s16 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s32.c new file mode 100644 index 000000000..82edd72f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s32.c @@ -0,0 +1,30 @@ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_s32_tied1: +** trn2 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_s32_tied1, svint32_t, + z0 = svtrn2_s32 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_s32_tied2: +** trn2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_s32_tied2, svint32_t, + z0 = svtrn2_s32 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_s32_untied: +** trn2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_s32_untied, svint32_t, + z0 = svtrn2_s32 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s64.c new file mode 100644 index 000000000..5f43441d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_s64_tied1: +** trn2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_s64_tied1, svint64_t, + z0 = svtrn2_s64 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_s64_tied2: +** trn2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_s64_tied2, svint64_t, + z0 = svtrn2_s64 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_s64_untied: +** trn2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_s64_untied, svint64_t, + z0 = svtrn2_s64 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s8.c new file mode 100644 index 000000000..716538119 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_s8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_s8_tied1: +** trn2 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (trn2_s8_tied1, svint8_t, + z0 = svtrn2_s8 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_s8_tied2: +** trn2 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (trn2_s8_tied2, svint8_t, + z0 = svtrn2_s8 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_s8_untied: +** trn2 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (trn2_s8_untied, svint8_t, + z0 = svtrn2_s8 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u16.c new file mode 100644 index 000000000..e68d233b8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_u16_tied1: +** trn2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_u16_tied1, svuint16_t, + z0 = svtrn2_u16 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_u16_tied2: +** trn2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_u16_tied2, svuint16_t, + z0 = svtrn2_u16 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_u16_untied: +** trn2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (trn2_u16_untied, svuint16_t, + z0 = svtrn2_u16 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u32.c new file mode 100644 index 000000000..e48aad179 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_u32_tied1: +** trn2 z0\.s, 
z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_u32_tied1, svuint32_t, + z0 = svtrn2_u32 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_u32_tied2: +** trn2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_u32_tied2, svuint32_t, + z0 = svtrn2_u32 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_u32_untied: +** trn2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (trn2_u32_untied, svuint32_t, + z0 = svtrn2_u32 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u64.c new file mode 100644 index 000000000..aa452275b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_u64_tied1: +** trn2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_u64_tied1, svuint64_t, + z0 = svtrn2_u64 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_u64_tied2: +** trn2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_u64_tied2, svuint64_t, + z0 = svtrn2_u64 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_u64_untied: +** trn2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (trn2_u64_untied, svuint64_t, + z0 = svtrn2_u64 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u8.c new file mode 100644 index 000000000..cb26b2338 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2_u8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2_u8_tied1: +** trn2 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (trn2_u8_tied1, svuint8_t, + z0 = svtrn2_u8 (z0, z1), + z0 = svtrn2 (z0, z1)) + +/* +** trn2_u8_tied2: +** trn2 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (trn2_u8_tied2, svuint8_t, + z0 = svtrn2_u8 (z1, z0), + z0 = svtrn2 (z1, z0)) + +/* +** trn2_u8_untied: +** trn2 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (trn2_u8_untied, svuint8_t, + z0 = svtrn2_u8 (z1, z2), + z0 = svtrn2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_bf16.c new file mode 100644 index 000000000..5623b54f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_bf16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_bf16_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_bf16_tied1, svbfloat16_t, + z0 = svtrn2q_bf16 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_bf16_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_bf16_tied2, svbfloat16_t, + z0 = svtrn2q_bf16 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_bf16_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_bf16_untied, svbfloat16_t, + z0 = svtrn2q_bf16 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f16.c new file mode 100644 index 000000000..db2190929 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options 
"-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_f16_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f16_tied1, svfloat16_t, + z0 = svtrn2q_f16 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_f16_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f16_tied2, svfloat16_t, + z0 = svtrn2q_f16 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_f16_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f16_untied, svfloat16_t, + z0 = svtrn2q_f16 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f32.c new file mode 100644 index 000000000..1367a1e06 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_f32_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f32_tied1, svfloat32_t, + z0 = svtrn2q_f32 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_f32_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f32_tied2, svfloat32_t, + z0 = svtrn2q_f32 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_f32_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f32_untied, svfloat32_t, + z0 = svtrn2q_f32 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f64.c new file mode 100644 index 000000000..54325e705 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_f64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_f64_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f64_tied1, svfloat64_t, + z0 = svtrn2q_f64 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_f64_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f64_tied2, svfloat64_t, + z0 = svtrn2q_f64 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_f64_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_f64_untied, svfloat64_t, + z0 = svtrn2q_f64 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s16.c new file mode 100644 index 000000000..a0b641278 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_s16_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s16_tied1, svint16_t, + z0 = svtrn2q_s16 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_s16_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s16_tied2, svint16_t, + z0 = svtrn2q_s16 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_s16_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z 
(trn2q_s16_untied, svint16_t, + z0 = svtrn2q_s16 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s32.c new file mode 100644 index 000000000..7c128c6ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_s32_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s32_tied1, svint32_t, + z0 = svtrn2q_s32 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_s32_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s32_tied2, svint32_t, + z0 = svtrn2q_s32 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_s32_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s32_untied, svint32_t, + z0 = svtrn2q_s32 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s64.c new file mode 100644 index 000000000..f22222525 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_s64_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s64_tied1, svint64_t, + z0 = svtrn2q_s64 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_s64_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s64_tied2, svint64_t, + z0 = svtrn2q_s64 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_s64_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s64_untied, svint64_t, + z0 = svtrn2q_s64 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s8.c new file mode 100644 index 000000000..bd5243f35 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_s8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_s8_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s8_tied1, svint8_t, + z0 = svtrn2q_s8 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_s8_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s8_tied2, svint8_t, + z0 = svtrn2q_s8 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_s8_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_s8_untied, svint8_t, + z0 = svtrn2q_s8 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u16.c new file mode 100644 index 000000000..8da8563b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** 
trn2q_u16_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u16_tied1, svuint16_t, + z0 = svtrn2q_u16 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_u16_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u16_tied2, svuint16_t, + z0 = svtrn2q_u16 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_u16_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u16_untied, svuint16_t, + z0 = svtrn2q_u16 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u32.c new file mode 100644 index 000000000..6c0af02da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_u32_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u32_tied1, svuint32_t, + z0 = svtrn2q_u32 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_u32_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u32_tied2, svuint32_t, + z0 = svtrn2q_u32 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_u32_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u32_untied, svuint32_t, + z0 = svtrn2q_u32 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u64.c new file mode 100644 index 000000000..857595cbb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_u64_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u64_tied1, svuint64_t, + z0 = svtrn2q_u64 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_u64_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u64_tied2, svuint64_t, + z0 = svtrn2q_u64 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_u64_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u64_untied, svuint64_t, + z0 = svtrn2q_u64 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u8.c new file mode 100644 index 000000000..1fb85b249 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/trn2q_u8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** trn2q_u8_tied1: +** trn2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u8_tied1, svuint8_t, + z0 = svtrn2q_u8 (z0, z1), + z0 = svtrn2q (z0, z1)) + +/* +** trn2q_u8_tied2: +** trn2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u8_tied2, svuint8_t, + z0 = svtrn2q_u8 (z1, z0), + z0 = svtrn2q (z1, z0)) + +/* +** trn2q_u8_untied: +** trn2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (trn2q_u8_untied, svuint8_t, + z0 = svtrn2q_u8 (z1, z2), + z0 = svtrn2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f16.c new file mode 100644 index 000000000..94bc696eb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tsmul_f16_tied1: +** ftsmul z0\.h, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tsmul_f16_tied1, svfloat16_t, svuint16_t, + z0 = svtsmul_f16 (z0, z4), + z0 = svtsmul (z0, z4)) + +/* +** tsmul_f16_tied2: +** ftsmul z0\.h, z4\.h, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (tsmul_f16_tied2, svfloat16_t, svuint16_t, + z0_res = svtsmul_f16 (z4, z0), + z0_res = svtsmul (z4, z0)) + +/* +** tsmul_f16_untied: +** ftsmul z0\.h, z1\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tsmul_f16_untied, svfloat16_t, svuint16_t, + z0 = svtsmul_f16 (z1, z4), + z0 = svtsmul (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f32.c new file mode 100644 index 000000000..d0ec91882 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tsmul_f32_tied1: +** ftsmul z0\.s, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tsmul_f32_tied1, svfloat32_t, svuint32_t, + z0 = svtsmul_f32 (z0, z4), + z0 = svtsmul (z0, z4)) + +/* +** tsmul_f32_tied2: +** ftsmul z0\.s, z4\.s, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (tsmul_f32_tied2, svfloat32_t, svuint32_t, + z0_res = svtsmul_f32 (z4, z0), + z0_res = svtsmul (z4, z0)) + +/* +** tsmul_f32_untied: +** ftsmul z0\.s, z1\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tsmul_f32_untied, svfloat32_t, svuint32_t, + z0 = svtsmul_f32 (z1, z4), + z0 = svtsmul (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f64.c new file mode 100644 index 000000000..23e0da3f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tsmul_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tsmul_f64_tied1: +** ftsmul z0\.d, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tsmul_f64_tied1, svfloat64_t, svuint64_t, + z0 = svtsmul_f64 (z0, z4), + z0 = svtsmul (z0, z4)) + +/* +** tsmul_f64_tied2: +** ftsmul z0\.d, z4\.d, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (tsmul_f64_tied2, svfloat64_t, svuint64_t, + z0_res = svtsmul_f64 (z4, z0), + z0_res = svtsmul (z4, z0)) + +/* +** tsmul_f64_untied: +** ftsmul z0\.d, z1\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tsmul_f64_untied, svfloat64_t, svuint64_t, + z0 = svtsmul_f64 (z1, z4), + z0 = svtsmul (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f16.c new file mode 100644 index 000000000..e7c3ea03b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tssel_f16_tied1: +** ftssel z0\.h, z0\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tssel_f16_tied1, svfloat16_t, svuint16_t, + z0 = svtssel_f16 (z0, z4), + z0 = svtssel (z0, z4)) + +/* +** tssel_f16_tied2: +** ftssel z0\.h, z4\.h, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (tssel_f16_tied2, svfloat16_t, svuint16_t, + z0_res = svtssel_f16 (z4, z0), + z0_res = svtssel (z4, z0)) + +/* +** tssel_f16_untied: +** ftssel z0\.h, z1\.h, z4\.h +** ret +*/ +TEST_DUAL_Z (tssel_f16_untied, 
svfloat16_t, svuint16_t, + z0 = svtssel_f16 (z1, z4), + z0 = svtssel (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f32.c new file mode 100644 index 000000000..022573a19 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tssel_f32_tied1: +** ftssel z0\.s, z0\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tssel_f32_tied1, svfloat32_t, svuint32_t, + z0 = svtssel_f32 (z0, z4), + z0 = svtssel (z0, z4)) + +/* +** tssel_f32_tied2: +** ftssel z0\.s, z4\.s, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (tssel_f32_tied2, svfloat32_t, svuint32_t, + z0_res = svtssel_f32 (z4, z0), + z0_res = svtssel (z4, z0)) + +/* +** tssel_f32_untied: +** ftssel z0\.s, z1\.s, z4\.s +** ret +*/ +TEST_DUAL_Z (tssel_f32_untied, svfloat32_t, svuint32_t, + z0 = svtssel_f32 (z1, z4), + z0 = svtssel (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f64.c new file mode 100644 index 000000000..ffcdf4224 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/tssel_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** tssel_f64_tied1: +** ftssel z0\.d, z0\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tssel_f64_tied1, svfloat64_t, svuint64_t, + z0 = svtssel_f64 (z0, z4), + z0 = svtssel (z0, z4)) + +/* +** tssel_f64_tied2: +** ftssel z0\.d, z4\.d, z0\.d +** ret +*/ +TEST_DUAL_Z_REV (tssel_f64_tied2, svfloat64_t, svuint64_t, + z0_res = svtssel_f64 (z4, z0), + z0_res = svtssel (z4, z0)) + +/* +** tssel_f64_untied: +** ftssel z0\.d, z1\.d, z4\.d +** ret +*/ +TEST_DUAL_Z (tssel_f64_untied, svfloat64_t, svuint64_t, + z0 = svtssel_f64 (z1, z4), + z0 = svtssel (z1, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef2_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef2_1.c new file mode 100644 index 000000000..fe6c4c7c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef2_1.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** int8: +** ret +*/ +TEST_UNDEF (int8, svint8x2_t, + z0 = svundef2_s8 ()) + +/* +** uint8: +** ret +*/ +TEST_UNDEF (uint8, svuint8x2_t, + z0 = svundef2_u8 ()) + +/* +** int16: +** ret +*/ +TEST_UNDEF (int16, svint16x2_t, + z0 = svundef2_s16 ()) + +/* +** uint16: +** ret +*/ +TEST_UNDEF (uint16, svuint16x2_t, + z0 = svundef2_u16 ()) + +/* +** float16: +** ret +*/ +TEST_UNDEF (float16, svfloat16x2_t, + z0 = svundef2_f16 ()) + +/* +** bfloat16: +** ret +*/ +TEST_UNDEF (bfloat16, svbfloat16x2_t, + z0 = svundef2_bf16 ()) + +/* +** int32: +** ret +*/ +TEST_UNDEF (int32, svint32x2_t, + z0 = svundef2_s32 ()) + +/* +** uint32: +** ret +*/ +TEST_UNDEF (uint32, svuint32x2_t, + z0 = svundef2_u32 ()) + +/* +** float32: +** ret +*/ +TEST_UNDEF (float32, svfloat32x2_t, + z0 = svundef2_f32 ()) + +/* +** int64: +** ret +*/ +TEST_UNDEF (int64, svint64x2_t, + z0 = svundef2_s64 ()) + +/* +** uint64: +** ret +*/ +TEST_UNDEF (uint64, svuint64x2_t, + z0 = svundef2_u64 ()) + +/* +** float64: +** ret +*/ +TEST_UNDEF (float64, svfloat64x2_t, + z0 = svundef2_f64 ()) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef3_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef3_1.c new file mode 100644 index 000000000..5c18c6317 --- /dev/null 
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef3_1.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** int8: +** ret +*/ +TEST_UNDEF (int8, svint8x3_t, + z0 = svundef3_s8 ()) + +/* +** uint8: +** ret +*/ +TEST_UNDEF (uint8, svuint8x3_t, + z0 = svundef3_u8 ()) + +/* +** int16: +** ret +*/ +TEST_UNDEF (int16, svint16x3_t, + z0 = svundef3_s16 ()) + +/* +** uint16: +** ret +*/ +TEST_UNDEF (uint16, svuint16x3_t, + z0 = svundef3_u16 ()) + +/* +** float16: +** ret +*/ +TEST_UNDEF (float16, svfloat16x3_t, + z0 = svundef3_f16 ()) + +/* +** bfloat16: +** ret +*/ +TEST_UNDEF (bfloat16, svbfloat16x3_t, + z0 = svundef3_bf16 ()) + +/* +** int32: +** ret +*/ +TEST_UNDEF (int32, svint32x3_t, + z0 = svundef3_s32 ()) + +/* +** uint32: +** ret +*/ +TEST_UNDEF (uint32, svuint32x3_t, + z0 = svundef3_u32 ()) + +/* +** float32: +** ret +*/ +TEST_UNDEF (float32, svfloat32x3_t, + z0 = svundef3_f32 ()) + +/* +** int64: +** ret +*/ +TEST_UNDEF (int64, svint64x3_t, + z0 = svundef3_s64 ()) + +/* +** uint64: +** ret +*/ +TEST_UNDEF (uint64, svuint64x3_t, + z0 = svundef3_u64 ()) + +/* +** float64: +** ret +*/ +TEST_UNDEF (float64, svfloat64x3_t, + z0 = svundef3_f64 ()) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef4_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef4_1.c new file mode 100644 index 000000000..4d6b86b04 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef4_1.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** int8: +** ret +*/ +TEST_UNDEF (int8, svint8x4_t, + z0 = svundef4_s8 ()) + +/* +** uint8: +** ret +*/ +TEST_UNDEF (uint8, svuint8x4_t, + z0 = svundef4_u8 ()) + +/* +** int16: +** ret +*/ +TEST_UNDEF (int16, svint16x4_t, + z0 = svundef4_s16 ()) + +/* +** uint16: +** ret +*/ +TEST_UNDEF (uint16, svuint16x4_t, + z0 = svundef4_u16 ()) + +/* +** float16: +** ret +*/ +TEST_UNDEF (float16, svfloat16x4_t, + z0 = svundef4_f16 ()) + +/* +** bfloat16: +** ret +*/ +TEST_UNDEF (bfloat16, svbfloat16x4_t, + z0 = svundef4_bf16 ()) + +/* +** int32: +** ret +*/ +TEST_UNDEF (int32, svint32x4_t, + z0 = svundef4_s32 ()) + +/* +** uint32: +** ret +*/ +TEST_UNDEF (uint32, svuint32x4_t, + z0 = svundef4_u32 ()) + +/* +** float32: +** ret +*/ +TEST_UNDEF (float32, svfloat32x4_t, + z0 = svundef4_f32 ()) + +/* +** int64: +** ret +*/ +TEST_UNDEF (int64, svint64x4_t, + z0 = svundef4_s64 ()) + +/* +** uint64: +** ret +*/ +TEST_UNDEF (uint64, svuint64x4_t, + z0 = svundef4_u64 ()) + +/* +** float64: +** ret +*/ +TEST_UNDEF (float64, svfloat64x4_t, + z0 = svundef4_f64 ()) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef_1.c new file mode 100644 index 000000000..62873b6e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/undef_1.c @@ -0,0 +1,87 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** int8: +** ret +*/ +TEST_UNDEF (int8, svint8_t, + z0 = svundef_s8 ()) + +/* +** uint8: +** ret +*/ +TEST_UNDEF (uint8, svuint8_t, + z0 = svundef_u8 ()) + +/* +** int16: +** ret +*/ +TEST_UNDEF (int16, svint16_t, + z0 = svundef_s16 ()) + +/* +** uint16: +** ret +*/ +TEST_UNDEF (uint16, svuint16_t, + z0 = svundef_u16 ()) + +/* +** float16: +** ret +*/ +TEST_UNDEF (float16, svfloat16_t, + z0 = svundef_f16 ()) + +/* +** bfloat16: +** ret +*/ +TEST_UNDEF (bfloat16, svbfloat16_t, + z0 = svundef_bf16 ()) + 
+/* +** int32: +** ret +*/ +TEST_UNDEF (int32, svint32_t, + z0 = svundef_s32 ()) + +/* +** uint32: +** ret +*/ +TEST_UNDEF (uint32, svuint32_t, + z0 = svundef_u32 ()) + +/* +** float32: +** ret +*/ +TEST_UNDEF (float32, svfloat32_t, + z0 = svundef_f32 ()) + +/* +** int64: +** ret +*/ +TEST_UNDEF (int64, svint64_t, + z0 = svundef_s64 ()) + +/* +** uint64: +** ret +*/ +TEST_UNDEF (uint64, svuint64_t, + z0 = svundef_u64 ()) + +/* +** float64: +** ret +*/ +TEST_UNDEF (float64, svfloat64_t, + z0 = svundef_f64 ()) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_b.c new file mode 100644 index 000000000..ff1a84aac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_b.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpkhi_b_tied1: +** punpkhi p0\.h, p0\.b +** ret +*/ +TEST_UNIFORM_P (unpkhi_b_tied1, + p0 = svunpkhi_b (p0), + p0 = svunpkhi (p0)) + +/* +** unpkhi_b_untied: +** punpkhi p0\.h, p1\.b +** ret +*/ +TEST_UNIFORM_P (unpkhi_b_untied, + p0 = svunpkhi_b (p1), + p0 = svunpkhi (p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s16.c new file mode 100644 index 000000000..3f79ac65f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpkhi_s16_tied1: +** sunpkhi z0\.h, z0\.b +** ret +*/ +TEST_DUAL_Z_REV (unpkhi_s16_tied1, svint16_t, svint8_t, + z0_res = svunpkhi_s16 (z0), + z0_res = svunpkhi (z0)) + +/* +** unpkhi_s16_untied: +** sunpkhi z0\.h, z4\.b +** ret +*/ +TEST_DUAL_Z (unpkhi_s16_untied, svint16_t, svint8_t, + z0 = svunpkhi_s16 (z4), + z0 = svunpkhi (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s32.c new file mode 100644 index 000000000..619fb0882 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpkhi_s32_tied1: +** sunpkhi z0\.s, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (unpkhi_s32_tied1, svint32_t, svint16_t, + z0_res = svunpkhi_s32 (z0), + z0_res = svunpkhi (z0)) + +/* +** unpkhi_s32_untied: +** sunpkhi z0\.s, z4\.h +** ret +*/ +TEST_DUAL_Z (unpkhi_s32_untied, svint32_t, svint16_t, + z0 = svunpkhi_s32 (z4), + z0 = svunpkhi (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s64.c new file mode 100644 index 000000000..5d6da1768 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_s64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpkhi_s64_tied1: +** sunpkhi z0\.d, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (unpkhi_s64_tied1, svint64_t, svint32_t, + z0_res = svunpkhi_s64 (z0), + z0_res = svunpkhi (z0)) + +/* +** unpkhi_s64_untied: +** sunpkhi z0\.d, z4\.s +** ret +*/ +TEST_DUAL_Z (unpkhi_s64_untied, svint64_t, svint32_t, + z0 = svunpkhi_s64 (z4), + z0 = svunpkhi (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u16.c new file mode 100644 index 000000000..68f47a282 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpkhi_u16_tied1: +** uunpkhi z0\.h, z0\.b +** ret +*/ +TEST_DUAL_Z_REV (unpkhi_u16_tied1, svuint16_t, svuint8_t, + z0_res = svunpkhi_u16 (z0), + z0_res = svunpkhi (z0)) + +/* +** unpkhi_u16_untied: +** uunpkhi z0\.h, z4\.b +** ret +*/ +TEST_DUAL_Z (unpkhi_u16_untied, svuint16_t, svuint8_t, + z0 = svunpkhi_u16 (z4), + z0 = svunpkhi (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u32.c new file mode 100644 index 000000000..3c4b161e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpkhi_u32_tied1: +** uunpkhi z0\.s, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (unpkhi_u32_tied1, svuint32_t, svuint16_t, + z0_res = svunpkhi_u32 (z0), + z0_res = svunpkhi (z0)) + +/* +** unpkhi_u32_untied: +** uunpkhi z0\.s, z4\.h +** ret +*/ +TEST_DUAL_Z (unpkhi_u32_untied, svuint32_t, svuint16_t, + z0 = svunpkhi_u32 (z4), + z0 = svunpkhi (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u64.c new file mode 100644 index 000000000..94cfbd493 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpkhi_u64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpkhi_u64_tied1: +** uunpkhi z0\.d, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (unpkhi_u64_tied1, svuint64_t, svuint32_t, + z0_res = svunpkhi_u64 (z0), + z0_res = svunpkhi (z0)) + +/* +** unpkhi_u64_untied: +** uunpkhi z0\.d, z4\.s +** ret +*/ +TEST_DUAL_Z (unpkhi_u64_untied, svuint64_t, svuint32_t, + z0 = svunpkhi_u64 (z4), + z0 = svunpkhi (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_b.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_b.c new file mode 100644 index 000000000..476ec8bc3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_b.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpklo_b_tied1: +** punpklo p0\.h, p0\.b +** ret +*/ +TEST_UNIFORM_P (unpklo_b_tied1, + p0 = svunpklo_b (p0), + p0 = svunpklo (p0)) + +/* +** unpklo_b_untied: +** punpklo p0\.h, p1\.b +** ret +*/ +TEST_UNIFORM_P (unpklo_b_untied, + p0 = svunpklo_b (p1), + p0 = svunpklo (p1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s16.c new file mode 100644 index 000000000..a0e83ff1b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpklo_s16_tied1: +** sunpklo z0\.h, z0\.b +** ret +*/ +TEST_DUAL_Z_REV (unpklo_s16_tied1, svint16_t, svint8_t, + z0_res = svunpklo_s16 (z0), + z0_res = svunpklo (z0)) + +/* +** unpklo_s16_untied: +** sunpklo z0\.h, z4\.b +** ret +*/ +TEST_DUAL_Z (unpklo_s16_untied, svint16_t, svint8_t, + z0 = svunpklo_s16 (z4), + z0 = svunpklo (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s32.c new file mode 100644 index 000000000..49a14fb7b --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpklo_s32_tied1: +** sunpklo z0\.s, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (unpklo_s32_tied1, svint32_t, svint16_t, + z0_res = svunpklo_s32 (z0), + z0_res = svunpklo (z0)) + +/* +** unpklo_s32_untied: +** sunpklo z0\.s, z4\.h +** ret +*/ +TEST_DUAL_Z (unpklo_s32_untied, svint32_t, svint16_t, + z0 = svunpklo_s32 (z4), + z0 = svunpklo (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s64.c new file mode 100644 index 000000000..c430047e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_s64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpklo_s64_tied1: +** sunpklo z0\.d, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (unpklo_s64_tied1, svint64_t, svint32_t, + z0_res = svunpklo_s64 (z0), + z0_res = svunpklo (z0)) + +/* +** unpklo_s64_untied: +** sunpklo z0\.d, z4\.s +** ret +*/ +TEST_DUAL_Z (unpklo_s64_untied, svint64_t, svint32_t, + z0 = svunpklo_s64 (z4), + z0 = svunpklo (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u16.c new file mode 100644 index 000000000..6feee4427 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u16.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpklo_u16_tied1: +** uunpklo z0\.h, z0\.b +** ret +*/ +TEST_DUAL_Z_REV (unpklo_u16_tied1, svuint16_t, svuint8_t, + z0_res = svunpklo_u16 (z0), + z0_res = svunpklo (z0)) + +/* +** unpklo_u16_untied: +** uunpklo z0\.h, z4\.b +** ret +*/ +TEST_DUAL_Z (unpklo_u16_untied, svuint16_t, svuint8_t, + z0 = svunpklo_u16 (z4), + z0 = svunpklo (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u32.c new file mode 100644 index 000000000..c4d4efc86 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u32.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpklo_u32_tied1: +** uunpklo z0\.s, z0\.h +** ret +*/ +TEST_DUAL_Z_REV (unpklo_u32_tied1, svuint32_t, svuint16_t, + z0_res = svunpklo_u32 (z0), + z0_res = svunpklo (z0)) + +/* +** unpklo_u32_untied: +** uunpklo z0\.s, z4\.h +** ret +*/ +TEST_DUAL_Z (unpklo_u32_untied, svuint32_t, svuint16_t, + z0 = svunpklo_u32 (z4), + z0 = svunpklo (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u64.c new file mode 100644 index 000000000..2845e37a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/unpklo_u64.c @@ -0,0 +1,21 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** unpklo_u64_tied1: +** uunpklo z0\.d, z0\.s +** ret +*/ +TEST_DUAL_Z_REV (unpklo_u64_tied1, svuint64_t, svuint32_t, + z0_res = svunpklo_u64 (z0), + z0_res = svunpklo (z0)) + +/* +** unpklo_u64_untied: +** uunpklo z0\.d, z4\.s +** ret +*/ +TEST_DUAL_Z (unpklo_u64_untied, svuint64_t, svuint32_t, + z0 = svunpklo_u64 (z4), + z0 = svunpklo (z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_lane_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_lane_s32.c new file mode 100644 index 000000000..8fd255687 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_lane_s32.c @@ -0,0 +1,97 @@ +/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** usdot_lane_0_s32_tied1: +** usdot z0\.s, z2\.b, z4\.b\[0\] +** ret +*/ +TEST_TRIPLE_Z (usdot_lane_0_s32_tied1, svint32_t, svuint8_t, svint8_t, + z0 = svusdot_lane_s32 (z0, z2, z4, 0), + z0 = svusdot_lane (z0, z2, z4, 0)) + +/* +** usdot_lane_0_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z2 +** usdot z0\.s, \1\.b, z4\.b\[0\] +** ret +*/ +TEST_TRIPLE_Z_REV2 (usdot_lane_0_s32_tied2, svint32_t, svuint8_t, svint8_t, + z0_res = svusdot_lane_s32 (z2, z0, z4, 0), + z0_res = svusdot_lane (z2, z0, z4, 0)) + +/* +** usdot_lane_0_s32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** usdot z0\.s, z2\.b, \1\.b\[0\] +** ret +*/ +TEST_TRIPLE_Z_REV (usdot_lane_0_s32_tied3, svint32_t, svuint8_t, svint8_t, + z0_res = svusdot_lane_s32 (z4, z2, z0, 0), + z0_res = svusdot_lane (z4, z2, z0, 0)) + +/* +** usdot_lane_0_s32_untied: +** movprfx z0, z1 +** usdot z0\.s, z2\.b, z4\.b\[0\] +** ret +*/ +TEST_TRIPLE_Z (usdot_lane_0_s32_untied, svint32_t, svuint8_t, svint8_t, + z0 = svusdot_lane_s32 (z1, z2, z4, 0), + z0 = svusdot_lane (z1, z2, z4, 0)) + +/* +** usdot_lane_1_s32: +** usdot z0\.s, z2\.b, z5\.b\[1\] +** ret +*/ +TEST_TRIPLE_Z (usdot_lane_1_s32, svint32_t, svuint8_t, svint8_t, + z0 = svusdot_lane_s32 (z0, z2, z5, 1), + z0 = svusdot_lane (z0, z2, z5, 1)) + +/* +** usdot_lane_2_s32: +** usdot z0\.s, z2\.b, z5\.b\[2\] +** ret +*/ +TEST_TRIPLE_Z (usdot_lane_2_s32, svint32_t, svuint8_t, svint8_t, + z0 = svusdot_lane_s32 (z0, z2, z5, 2), + z0 = svusdot_lane (z0, z2, z5, 2)) + +/* +** usdot_lane_3_s32: +** usdot z0\.s, z2\.b, z5\.b\[3\] +** ret +*/ +TEST_TRIPLE_Z (usdot_lane_3_s32, svint32_t, svuint8_t, svint8_t, + z0 = svusdot_lane_s32 (z0, z2, z5, 3), + z0 = svusdot_lane (z0, z2, z5, 3)) + +/* +** usdot_lane_z8_s32: +** str d8, \[sp, -16\]! 
+** mov (z[0-7])\.d, z8\.d +** usdot z0\.s, z1\.b, \1\.b\[1\] +** ldr d8, \[sp\], 16 +** ret +*/ +TEST_TRIPLE_LANE_REG (usdot_lane_z8_s32, svint32_t, svuint8_t, svint8_t, + z8, + z0 = svusdot_lane_s32 (z0, z1, z8, 1), + z0 = svusdot_lane (z0, z1, z8, 1)) + +/* +** usdot_lane_z16_s32: +** mov (z[0-7])\.d, z16\.d +** usdot z0\.s, z1\.b, \1\.b\[1\] +** ret +*/ +TEST_TRIPLE_LANE_REG (usdot_lane_z16_s32, svint32_t, svuint8_t, svint8_t, + z16, + z0 = svusdot_lane_s32 (z0, z1, z16, 1), + z0 = svusdot_lane (z0, z1, z16, 1)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_s32.c new file mode 100644 index 000000000..ccac5cae5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usdot_s32.c @@ -0,0 +1,46 @@ +/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** usdot_s32_tied1: +** usdot z0\.s, z2\.b, z4\.b +** ret +*/ +TEST_TRIPLE_Z (usdot_s32_tied1, svint32_t, svuint8_t, svint8_t, + z0 = svusdot_s32 (z0, z2, z4), + z0 = svusdot (z0, z2, z4)) + +/* +** usdot_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** usdot z0\.s, z2\.b, \1\.b +** ret +*/ +TEST_TRIPLE_Z_REV (usdot_s32_tied2, svint32_t, svuint8_t, svint8_t, + z0_res = svusdot_s32 (z4, z2, z0), + z0_res = svusdot (z4, z2, z0)) + +/* +** usdot_w0_s32_tied: +** mov (z[0-9]+\.b), w0 +** usdot z0\.s, z2\.b, \1 +** ret +*/ +TEST_TRIPLE_ZX (usdot_w0_s32_tied, svint32_t, svuint8_t, int8_t, + z0 = svusdot_n_s32 (z0, z2, x0), + z0 = svusdot (z0, z2, x0)) + +/* +** usdot_9_s32_tied: +** mov (z[0-9]+\.b), #9 +** usdot z0\.s, z2\.b, \1 +** ret +*/ +TEST_TRIPLE_Z (usdot_9_s32_tied, svint32_t, svuint8_t, int8_t, + z0 = svusdot_n_s32 (z0, z2, 9), + z0 = svusdot (z0, z2, 9)) + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usmmla_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usmmla_s32.c new file mode 100644 index 000000000..9440f3fd9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/usmmla_s32.c @@ -0,0 +1,46 @@ +/* { dg-require-effective-target aarch64_asm_i8mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+sve+i8mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** usmmla_s32_tied1: +** usmmla z0\.s, z2\.b, z4\.b +** ret +*/ +TEST_TRIPLE_Z (usmmla_s32_tied1, svint32_t, svuint8_t, svint8_t, + z0 = svusmmla_s32 (z0, z2, z4), + z0 = svusmmla (z0, z2, z4)) + +/* +** usmmla_s32_tied2: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z2 +** usmmla z0\.s, \1\.b, z4\.b +** ret +*/ +TEST_TRIPLE_Z_REV2 (usmmla_s32_tied2, svint32_t, svuint8_t, svint8_t, + z0_res = svusmmla_s32 (z2, z0, z4), + z0_res = svusmmla (z2, z0, z4)) + +/* +** usmmla_s32_tied3: +** mov (z[0-9]+)\.d, z0\.d +** movprfx z0, z4 +** usmmla z0\.s, z2\.b, \1\.b +** ret +*/ +TEST_TRIPLE_Z_REV (usmmla_s32_tied3, svint32_t, svuint8_t, svint8_t, + z0_res = svusmmla_s32 (z4, z2, z0), + z0_res = svusmmla (z4, z2, z0)) + +/* +** usmmla_s32_untied: +** movprfx z0, z1 +** usmmla z0\.s, z2\.b, z4\.b +** ret +*/ +TEST_TRIPLE_Z (usmmla_s32_untied, svint32_t, svuint8_t, svint8_t, + z0 = svusmmla_s32 (z1, z2, z4), + z0 = svusmmla (z1, z2, z4)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b16.c new file mode 100644 index 000000000..245e401aa --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_b16_tied1: +** uzp1 p0\.h, p0\.h, p1\.h +** ret +*/ +TEST_UNIFORM_P (uzp1_b16_tied1, + p0 = svuzp1_b16 (p0, p1), + p0 = svuzp1_b16 (p0, p1)) + +/* +** uzp1_b16_tied2: +** uzp1 p0\.h, p1\.h, p0\.h +** ret +*/ +TEST_UNIFORM_P (uzp1_b16_tied2, + p0 = svuzp1_b16 (p1, p0), + p0 = svuzp1_b16 (p1, p0)) + +/* +** uzp1_b16_untied: +** uzp1 p0\.h, p1\.h, p2\.h +** ret +*/ +TEST_UNIFORM_P (uzp1_b16_untied, + p0 = svuzp1_b16 (p1, p2), + p0 = svuzp1_b16 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b32.c new file mode 100644 index 000000000..c88034492 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_b32_tied1: +** uzp1 p0\.s, p0\.s, p1\.s +** ret +*/ +TEST_UNIFORM_P (uzp1_b32_tied1, + p0 = svuzp1_b32 (p0, p1), + p0 = svuzp1_b32 (p0, p1)) + +/* +** uzp1_b32_tied2: +** uzp1 p0\.s, p1\.s, p0\.s +** ret +*/ +TEST_UNIFORM_P (uzp1_b32_tied2, + p0 = svuzp1_b32 (p1, p0), + p0 = svuzp1_b32 (p1, p0)) + +/* +** uzp1_b32_untied: +** uzp1 p0\.s, p1\.s, p2\.s +** ret +*/ +TEST_UNIFORM_P (uzp1_b32_untied, + p0 = svuzp1_b32 (p1, p2), + p0 = svuzp1_b32 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b64.c new file mode 100644 index 000000000..71ac5c150 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_b64_tied1: +** uzp1 p0\.d, p0\.d, p1\.d +** ret +*/ +TEST_UNIFORM_P (uzp1_b64_tied1, + p0 = svuzp1_b64 (p0, p1), + p0 = svuzp1_b64 (p0, p1)) + +/* +** uzp1_b64_tied2: +** uzp1 p0\.d, p1\.d, p0\.d +** ret +*/ +TEST_UNIFORM_P (uzp1_b64_tied2, + p0 = svuzp1_b64 (p1, p0), + p0 = svuzp1_b64 (p1, p0)) + +/* +** uzp1_b64_untied: +** uzp1 p0\.d, p1\.d, p2\.d +** ret +*/ +TEST_UNIFORM_P (uzp1_b64_untied, + p0 = svuzp1_b64 (p1, p2), + p0 = svuzp1_b64 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b8.c new file mode 100644 index 000000000..250054bb6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_b8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_b8_tied1: +** uzp1 p0\.b, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (uzp1_b8_tied1, + p0 = svuzp1_b8 (p0, p1), + p0 = svuzp1_b8 (p0, p1)) + +/* +** uzp1_b8_tied2: +** uzp1 p0\.b, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (uzp1_b8_tied2, + p0 = svuzp1_b8 (p1, p0), + p0 = svuzp1_b8 (p1, p0)) + +/* +** uzp1_b8_untied: +** uzp1 p0\.b, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (uzp1_b8_untied, + p0 = svuzp1_b8 (p1, p2), + p0 = svuzp1_b8 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_bf16.c new file mode 100644 index 000000000..19d43ed11 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_bf16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_bf16_tied1: +** uzp1 
z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_bf16_tied1, svbfloat16_t, + z0 = svuzp1_bf16 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_bf16_tied2: +** uzp1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_bf16_tied2, svbfloat16_t, + z0 = svuzp1_bf16 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_bf16_untied: +** uzp1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_bf16_untied, svbfloat16_t, + z0 = svuzp1_bf16 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f16.c new file mode 100644 index 000000000..313673e9d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_f16_tied1: +** uzp1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_f16_tied1, svfloat16_t, + z0 = svuzp1_f16 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_f16_tied2: +** uzp1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_f16_tied2, svfloat16_t, + z0 = svuzp1_f16 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_f16_untied: +** uzp1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_f16_untied, svfloat16_t, + z0 = svuzp1_f16 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f32.c new file mode 100644 index 000000000..5bbac2c60 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_f32_tied1: +** uzp1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_f32_tied1, svfloat32_t, + z0 = svuzp1_f32 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_f32_tied2: +** uzp1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_f32_tied2, svfloat32_t, + z0 = svuzp1_f32 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_f32_untied: +** uzp1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_f32_untied, svfloat32_t, + z0 = svuzp1_f32 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f64.c new file mode 100644 index 000000000..ef97b1765 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_f64_tied1: +** uzp1 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_f64_tied1, svfloat64_t, + z0 = svuzp1_f64 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_f64_tied2: +** uzp1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_f64_tied2, svfloat64_t, + z0 = svuzp1_f64 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_f64_untied: +** uzp1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_f64_untied, svfloat64_t, + z0 = svuzp1_f64 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s16.c new file mode 100644 index 000000000..b77832b07 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_s16_tied1: +** uzp1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_s16_tied1, svint16_t, + 
z0 = svuzp1_s16 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_s16_tied2: +** uzp1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_s16_tied2, svint16_t, + z0 = svuzp1_s16 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_s16_untied: +** uzp1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_s16_untied, svint16_t, + z0 = svuzp1_s16 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s32.c new file mode 100644 index 000000000..64291afbe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_s32_tied1: +** uzp1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_s32_tied1, svint32_t, + z0 = svuzp1_s32 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_s32_tied2: +** uzp1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_s32_tied2, svint32_t, + z0 = svuzp1_s32 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_s32_untied: +** uzp1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_s32_untied, svint32_t, + z0 = svuzp1_s32 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s64.c new file mode 100644 index 000000000..e8f7799f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_s64_tied1: +** uzp1 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_s64_tied1, svint64_t, + z0 = svuzp1_s64 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_s64_tied2: +** uzp1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_s64_tied2, svint64_t, + z0 = svuzp1_s64 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_s64_untied: +** uzp1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_s64_untied, svint64_t, + z0 = svuzp1_s64 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s8.c new file mode 100644 index 000000000..98464b790 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_s8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_s8_tied1: +** uzp1 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (uzp1_s8_tied1, svint8_t, + z0 = svuzp1_s8 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_s8_tied2: +** uzp1 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (uzp1_s8_tied2, svint8_t, + z0 = svuzp1_s8 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_s8_untied: +** uzp1 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (uzp1_s8_untied, svint8_t, + z0 = svuzp1_s8 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u16.c new file mode 100644 index 000000000..da95171fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_u16_tied1: +** uzp1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_u16_tied1, svuint16_t, + z0 = svuzp1_u16 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_u16_tied2: +** uzp1 z0\.h, z1\.h, z0\.h +** ret +*/ 
+TEST_UNIFORM_Z (uzp1_u16_tied2, svuint16_t, + z0 = svuzp1_u16 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_u16_untied: +** uzp1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (uzp1_u16_untied, svuint16_t, + z0 = svuzp1_u16 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u32.c new file mode 100644 index 000000000..a57cdcc06 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_u32_tied1: +** uzp1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_u32_tied1, svuint32_t, + z0 = svuzp1_u32 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_u32_tied2: +** uzp1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_u32_tied2, svuint32_t, + z0 = svuzp1_u32 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_u32_untied: +** uzp1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (uzp1_u32_untied, svuint32_t, + z0 = svuzp1_u32 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u64.c new file mode 100644 index 000000000..24d820359 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_u64_tied1: +** uzp1 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_u64_tied1, svuint64_t, + z0 = svuzp1_u64 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_u64_tied2: +** uzp1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_u64_tied2, svuint64_t, + z0 = svuzp1_u64 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_u64_untied: +** uzp1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (uzp1_u64_untied, svuint64_t, + z0 = svuzp1_u64 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u8.c new file mode 100644 index 000000000..359d4c5f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1_u8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1_u8_tied1: +** uzp1 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (uzp1_u8_tied1, svuint8_t, + z0 = svuzp1_u8 (z0, z1), + z0 = svuzp1 (z0, z1)) + +/* +** uzp1_u8_tied2: +** uzp1 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (uzp1_u8_tied2, svuint8_t, + z0 = svuzp1_u8 (z1, z0), + z0 = svuzp1 (z1, z0)) + +/* +** uzp1_u8_untied: +** uzp1 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (uzp1_u8_untied, svuint8_t, + z0 = svuzp1_u8 (z1, z2), + z0 = svuzp1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_bf16.c new file mode 100644 index 000000000..30a199241 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_bf16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_bf16_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_bf16_tied1, svbfloat16_t, + z0 = svuzp1q_bf16 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_bf16_tied2: +** uzp1 z0\.q, z1\.q, 
z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_bf16_tied2, svbfloat16_t, + z0 = svuzp1q_bf16 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_bf16_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_bf16_untied, svbfloat16_t, + z0 = svuzp1q_bf16 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f16.c new file mode 100644 index 000000000..c11e5bdc4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_f16_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f16_tied1, svfloat16_t, + z0 = svuzp1q_f16 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_f16_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f16_tied2, svfloat16_t, + z0 = svuzp1q_f16 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_f16_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f16_untied, svfloat16_t, + z0 = svuzp1q_f16 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f32.c new file mode 100644 index 000000000..d0ac94543 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_f32_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f32_tied1, svfloat32_t, + z0 = svuzp1q_f32 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_f32_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f32_tied2, svfloat32_t, + z0 = svuzp1q_f32 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_f32_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f32_untied, svfloat32_t, + z0 = svuzp1q_f32 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f64.c new file mode 100644 index 000000000..ac2e5c5cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_f64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_f64_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f64_tied1, svfloat64_t, + z0 = svuzp1q_f64 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_f64_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f64_tied2, svfloat64_t, + z0 = svuzp1q_f64 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_f64_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_f64_untied, svfloat64_t, + z0 = svuzp1q_f64 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s16.c new file mode 100644 index 000000000..aa200b24e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s16.c @@ 
-0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_s16_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s16_tied1, svint16_t, + z0 = svuzp1q_s16 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_s16_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s16_tied2, svint16_t, + z0 = svuzp1q_s16 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_s16_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s16_untied, svint16_t, + z0 = svuzp1q_s16 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s32.c new file mode 100644 index 000000000..eb849df74 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_s32_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s32_tied1, svint32_t, + z0 = svuzp1q_s32 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_s32_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s32_tied2, svint32_t, + z0 = svuzp1q_s32 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_s32_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s32_untied, svint32_t, + z0 = svuzp1q_s32 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s64.c new file mode 100644 index 000000000..e1049761c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_s64_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s64_tied1, svint64_t, + z0 = svuzp1q_s64 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_s64_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s64_tied2, svint64_t, + z0 = svuzp1q_s64 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_s64_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s64_untied, svint64_t, + z0 = svuzp1q_s64 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s8.c new file mode 100644 index 000000000..8aa592199 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_s8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_s8_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s8_tied1, svint8_t, + z0 = svuzp1q_s8 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_s8_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s8_tied2, svint8_t, + z0 = svuzp1q_s8 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** 
uzp1q_s8_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_s8_untied, svint8_t, + z0 = svuzp1q_s8 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u16.c new file mode 100644 index 000000000..00ffaab06 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_u16_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u16_tied1, svuint16_t, + z0 = svuzp1q_u16 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_u16_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u16_tied2, svuint16_t, + z0 = svuzp1q_u16 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_u16_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u16_untied, svuint16_t, + z0 = svuzp1q_u16 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u32.c new file mode 100644 index 000000000..cd2e4db26 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_u32_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u32_tied1, svuint32_t, + z0 = svuzp1q_u32 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_u32_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u32_tied2, svuint32_t, + z0 = svuzp1q_u32 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_u32_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u32_untied, svuint32_t, + z0 = svuzp1q_u32 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u64.c new file mode 100644 index 000000000..7d8823329 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_u64_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u64_tied1, svuint64_t, + z0 = svuzp1q_u64 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_u64_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u64_tied2, svuint64_t, + z0 = svuzp1q_u64 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_u64_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u64_untied, svuint64_t, + z0 = svuzp1q_u64 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u8.c new file mode 100644 index 000000000..701a1d575 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp1q_u8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { 
check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp1q_u8_tied1: +** uzp1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u8_tied1, svuint8_t, + z0 = svuzp1q_u8 (z0, z1), + z0 = svuzp1q (z0, z1)) + +/* +** uzp1q_u8_tied2: +** uzp1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u8_tied2, svuint8_t, + z0 = svuzp1q_u8 (z1, z0), + z0 = svuzp1q (z1, z0)) + +/* +** uzp1q_u8_untied: +** uzp1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp1q_u8_untied, svuint8_t, + z0 = svuzp1q_u8 (z1, z2), + z0 = svuzp1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b16.c new file mode 100644 index 000000000..c3a91e7fc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_b16_tied1: +** uzp2 p0\.h, p0\.h, p1\.h +** ret +*/ +TEST_UNIFORM_P (uzp2_b16_tied1, + p0 = svuzp2_b16 (p0, p1), + p0 = svuzp2_b16 (p0, p1)) + +/* +** uzp2_b16_tied2: +** uzp2 p0\.h, p1\.h, p0\.h +** ret +*/ +TEST_UNIFORM_P (uzp2_b16_tied2, + p0 = svuzp2_b16 (p1, p0), + p0 = svuzp2_b16 (p1, p0)) + +/* +** uzp2_b16_untied: +** uzp2 p0\.h, p1\.h, p2\.h +** ret +*/ +TEST_UNIFORM_P (uzp2_b16_untied, + p0 = svuzp2_b16 (p1, p2), + p0 = svuzp2_b16 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b32.c new file mode 100644 index 000000000..e3294a6f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_b32_tied1: +** uzp2 p0\.s, p0\.s, p1\.s +** ret +*/ +TEST_UNIFORM_P (uzp2_b32_tied1, + p0 = svuzp2_b32 (p0, p1), + p0 = svuzp2_b32 (p0, p1)) + +/* +** uzp2_b32_tied2: +** uzp2 p0\.s, p1\.s, p0\.s +** ret +*/ +TEST_UNIFORM_P (uzp2_b32_tied2, + p0 = svuzp2_b32 (p1, p0), + p0 = svuzp2_b32 (p1, p0)) + +/* +** uzp2_b32_untied: +** uzp2 p0\.s, p1\.s, p2\.s +** ret +*/ +TEST_UNIFORM_P (uzp2_b32_untied, + p0 = svuzp2_b32 (p1, p2), + p0 = svuzp2_b32 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b64.c new file mode 100644 index 000000000..3ae72e10c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_b64_tied1: +** uzp2 p0\.d, p0\.d, p1\.d +** ret +*/ +TEST_UNIFORM_P (uzp2_b64_tied1, + p0 = svuzp2_b64 (p0, p1), + p0 = svuzp2_b64 (p0, p1)) + +/* +** uzp2_b64_tied2: +** uzp2 p0\.d, p1\.d, p0\.d +** ret +*/ +TEST_UNIFORM_P (uzp2_b64_tied2, + p0 = svuzp2_b64 (p1, p0), + p0 = svuzp2_b64 (p1, p0)) + +/* +** uzp2_b64_untied: +** uzp2 p0\.d, p1\.d, p2\.d +** ret +*/ +TEST_UNIFORM_P (uzp2_b64_untied, + p0 = svuzp2_b64 (p1, p2), + p0 = svuzp2_b64 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b8.c new file mode 100644 index 000000000..726a9a079 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_b8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_b8_tied1: +** uzp2 p0\.b, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P 
(uzp2_b8_tied1, + p0 = svuzp2_b8 (p0, p1), + p0 = svuzp2_b8 (p0, p1)) + +/* +** uzp2_b8_tied2: +** uzp2 p0\.b, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (uzp2_b8_tied2, + p0 = svuzp2_b8 (p1, p0), + p0 = svuzp2_b8 (p1, p0)) + +/* +** uzp2_b8_untied: +** uzp2 p0\.b, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (uzp2_b8_untied, + p0 = svuzp2_b8 (p1, p2), + p0 = svuzp2_b8 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_bf16.c new file mode 100644 index 000000000..b5566bfdf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_bf16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_bf16_tied1: +** uzp2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_bf16_tied1, svbfloat16_t, + z0 = svuzp2_bf16 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_bf16_tied2: +** uzp2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_bf16_tied2, svbfloat16_t, + z0 = svuzp2_bf16 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_bf16_untied: +** uzp2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_bf16_untied, svbfloat16_t, + z0 = svuzp2_bf16 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f16.c new file mode 100644 index 000000000..d4847ef37 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_f16_tied1: +** uzp2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_f16_tied1, svfloat16_t, + z0 = svuzp2_f16 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_f16_tied2: +** uzp2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_f16_tied2, svfloat16_t, + z0 = svuzp2_f16 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_f16_untied: +** uzp2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_f16_untied, svfloat16_t, + z0 = svuzp2_f16 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f32.c new file mode 100644 index 000000000..c1699fc9c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_f32_tied1: +** uzp2 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_f32_tied1, svfloat32_t, + z0 = svuzp2_f32 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_f32_tied2: +** uzp2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_f32_tied2, svfloat32_t, + z0 = svuzp2_f32 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_f32_untied: +** uzp2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_f32_untied, svfloat32_t, + z0 = svuzp2_f32 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f64.c new file mode 100644 index 000000000..afbf5c11a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_f64_tied1: +** uzp2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_f64_tied1, svfloat64_t, + z0 = svuzp2_f64 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_f64_tied2: +** 
uzp2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_f64_tied2, svfloat64_t, + z0 = svuzp2_f64 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_f64_untied: +** uzp2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_f64_untied, svfloat64_t, + z0 = svuzp2_f64 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s16.c new file mode 100644 index 000000000..e88df8734 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_s16_tied1: +** uzp2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_s16_tied1, svint16_t, + z0 = svuzp2_s16 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_s16_tied2: +** uzp2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_s16_tied2, svint16_t, + z0 = svuzp2_s16 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_s16_untied: +** uzp2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_s16_untied, svint16_t, + z0 = svuzp2_s16 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s32.c new file mode 100644 index 000000000..2e9a73d1f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_s32_tied1: +** uzp2 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_s32_tied1, svint32_t, + z0 = svuzp2_s32 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_s32_tied2: +** uzp2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_s32_tied2, svint32_t, + z0 = svuzp2_s32 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_s32_untied: +** uzp2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_s32_untied, svint32_t, + z0 = svuzp2_s32 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s64.c new file mode 100644 index 000000000..ffec78ccc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_s64_tied1: +** uzp2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_s64_tied1, svint64_t, + z0 = svuzp2_s64 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_s64_tied2: +** uzp2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_s64_tied2, svint64_t, + z0 = svuzp2_s64 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_s64_untied: +** uzp2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_s64_untied, svint64_t, + z0 = svuzp2_s64 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s8.c new file mode 100644 index 000000000..72037a088 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_s8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_s8_tied1: +** uzp2 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (uzp2_s8_tied1, svint8_t, + z0 = svuzp2_s8 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_s8_tied2: +** uzp2 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (uzp2_s8_tied2, svint8_t, + z0 = svuzp2_s8 (z1, z0), + z0 = 
svuzp2 (z1, z0)) + +/* +** uzp2_s8_untied: +** uzp2 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (uzp2_s8_untied, svint8_t, + z0 = svuzp2_s8 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u16.c new file mode 100644 index 000000000..d84f8c9ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_u16_tied1: +** uzp2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_u16_tied1, svuint16_t, + z0 = svuzp2_u16 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_u16_tied2: +** uzp2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_u16_tied2, svuint16_t, + z0 = svuzp2_u16 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_u16_untied: +** uzp2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (uzp2_u16_untied, svuint16_t, + z0 = svuzp2_u16 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u32.c new file mode 100644 index 000000000..0285ff91f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_u32_tied1: +** uzp2 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_u32_tied1, svuint32_t, + z0 = svuzp2_u32 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_u32_tied2: +** uzp2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_u32_tied2, svuint32_t, + z0 = svuzp2_u32 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_u32_untied: +** uzp2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (uzp2_u32_untied, svuint32_t, + z0 = svuzp2_u32 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u64.c new file mode 100644 index 000000000..1b51baf90 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_u64_tied1: +** uzp2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_u64_tied1, svuint64_t, + z0 = svuzp2_u64 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_u64_tied2: +** uzp2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_u64_tied2, svuint64_t, + z0 = svuzp2_u64 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_u64_untied: +** uzp2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (uzp2_u64_untied, svuint64_t, + z0 = svuzp2_u64 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u8.c new file mode 100644 index 000000000..662e0b818 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2_u8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2_u8_tied1: +** uzp2 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (uzp2_u8_tied1, svuint8_t, + z0 = svuzp2_u8 (z0, z1), + z0 = svuzp2 (z0, z1)) + +/* +** uzp2_u8_tied2: +** uzp2 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (uzp2_u8_tied2, svuint8_t, + z0 = svuzp2_u8 (z1, z0), + z0 = svuzp2 (z1, z0)) + +/* +** uzp2_u8_untied: +** uzp2 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (uzp2_u8_untied, 
svuint8_t, + z0 = svuzp2_u8 (z1, z2), + z0 = svuzp2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_bf16.c new file mode 100644 index 000000000..bbac53a7a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_bf16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_bf16_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_bf16_tied1, svbfloat16_t, + z0 = svuzp2q_bf16 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_bf16_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_bf16_tied2, svbfloat16_t, + z0 = svuzp2q_bf16 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_bf16_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_bf16_untied, svbfloat16_t, + z0 = svuzp2q_bf16 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f16.c new file mode 100644 index 000000000..e19d118fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_f16_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f16_tied1, svfloat16_t, + z0 = svuzp2q_f16 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_f16_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f16_tied2, svfloat16_t, + z0 = svuzp2q_f16 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_f16_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f16_untied, svfloat16_t, + z0 = svuzp2q_f16 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f32.c new file mode 100644 index 000000000..af7112b15 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_f32_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f32_tied1, svfloat32_t, + z0 = svuzp2q_f32 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_f32_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f32_tied2, svfloat32_t, + z0 = svuzp2q_f32 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_f32_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f32_untied, svfloat32_t, + z0 = svuzp2q_f32 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f64.c new file mode 100644 index 000000000..4109b843c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_f64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include 
"test_sve_acle.h" + +/* +** uzp2q_f64_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f64_tied1, svfloat64_t, + z0 = svuzp2q_f64 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_f64_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f64_tied2, svfloat64_t, + z0 = svuzp2q_f64 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_f64_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_f64_untied, svfloat64_t, + z0 = svuzp2q_f64 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s16.c new file mode 100644 index 000000000..0c6ab25cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_s16_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s16_tied1, svint16_t, + z0 = svuzp2q_s16 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_s16_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s16_tied2, svint16_t, + z0 = svuzp2q_s16 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_s16_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s16_untied, svint16_t, + z0 = svuzp2q_s16 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s32.c new file mode 100644 index 000000000..9b914e704 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_s32_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s32_tied1, svint32_t, + z0 = svuzp2q_s32 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_s32_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s32_tied2, svint32_t, + z0 = svuzp2q_s32 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_s32_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s32_untied, svint32_t, + z0 = svuzp2q_s32 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s64.c new file mode 100644 index 000000000..697e37d78 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_s64_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s64_tied1, svint64_t, + z0 = svuzp2q_s64 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_s64_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s64_tied2, svint64_t, + z0 = svuzp2q_s64 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_s64_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s64_untied, svint64_t, + z0 = svuzp2q_s64 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s8.c new file mode 100644 index 000000000..576262c5d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_s8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_s8_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s8_tied1, svint8_t, + z0 = svuzp2q_s8 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_s8_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s8_tied2, svint8_t, + z0 = svuzp2q_s8 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_s8_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_s8_untied, svint8_t, + z0 = svuzp2q_s8 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u16.c new file mode 100644 index 000000000..f2debc28f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_u16_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u16_tied1, svuint16_t, + z0 = svuzp2q_u16 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_u16_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u16_tied2, svuint16_t, + z0 = svuzp2q_u16 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_u16_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u16_untied, svuint16_t, + z0 = svuzp2q_u16 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u32.c new file mode 100644 index 000000000..ad6a4bcc0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_u32_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u32_tied1, svuint32_t, + z0 = svuzp2q_u32 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_u32_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u32_tied2, svuint32_t, + z0 = svuzp2q_u32 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_u32_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u32_untied, svuint32_t, + z0 = svuzp2q_u32 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u64.c new file mode 100644 index 000000000..a846aa295 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_u64_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u64_tied1, 
svuint64_t, + z0 = svuzp2q_u64 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_u64_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u64_tied2, svuint64_t, + z0 = svuzp2q_u64 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_u64_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u64_untied, svuint64_t, + z0 = svuzp2q_u64 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u8.c new file mode 100644 index 000000000..163c22659 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/uzp2q_u8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** uzp2q_u8_tied1: +** uzp2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u8_tied1, svuint8_t, + z0 = svuzp2q_u8 (z0, z1), + z0 = svuzp2q (z0, z1)) + +/* +** uzp2q_u8_tied2: +** uzp2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u8_tied2, svuint8_t, + z0 = svuzp2q_u8 (z1, z0), + z0 = svuzp2q (z1, z0)) + +/* +** uzp2q_u8_untied: +** uzp2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (uzp2q_u8_untied, svuint8_t, + z0 = svuzp2q_u8 (z1, z2), + z0 = svuzp2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b16.c new file mode 100644 index 000000000..c285a7a73 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b16.c @@ -0,0 +1,173 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** whilele_rr_b16_s32: +** whilele p0\.h, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b16_s32, int32_t, + p0 = svwhilele_b16_s32 (x0, x1), + p0 = svwhilele_b16 (x0, x1)) + +/* +** whilele_0r_b16_s32: +** whilele p0\.h, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b16_s32, int32_t, + p0 = svwhilele_b16_s32 (0, x1), + p0 = svwhilele_b16 (0, x1)) + +/* +** whilele_5r_b16_s32: +** mov (w[0-9]+), #?5 +** whilele p0\.h, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b16_s32, int32_t, + p0 = svwhilele_b16_s32 (5, x1), + p0 = svwhilele_b16 (5, x1)) + +/* +** whilele_r0_b16_s32: +** whilele p0\.h, w0, wzr +** ret +*/ +TEST_COMPARE_S (whilele_r0_b16_s32, int32_t, + p0 = svwhilele_b16_s32 (x0, 0), + p0 = svwhilele_b16 (x0, 0)) + +/* +** whilele_r5_b16_s32: +** mov (w[0-9]+), #?5 +** whilele p0\.h, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b16_s32, int32_t, + p0 = svwhilele_b16_s32 (x0, 5), + p0 = svwhilele_b16 (x0, 5)) + +/* +** whilele_rr_b16_s64: +** whilele p0\.h, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b16_s64, int64_t, + p0 = svwhilele_b16_s64 (x0, x1), + p0 = svwhilele_b16 (x0, x1)) + +/* +** whilele_0r_b16_s64: +** whilele p0\.h, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b16_s64, int64_t, + p0 = svwhilele_b16_s64 (0, x1), + p0 = svwhilele_b16 ((int64_t) 0, x1)) + +/* +** whilele_5r_b16_s64: +** mov (x[0-9]+), #?5 +** whilele p0\.h, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b16_s64, int64_t, + p0 = svwhilele_b16_s64 (5, x1), + p0 = svwhilele_b16 ((int64_t) 5, x1)) + +/* +** whilele_r0_b16_s64: +** whilele p0\.h, x0, xzr +** ret +*/ +TEST_COMPARE_S (whilele_r0_b16_s64, int64_t, + p0 = svwhilele_b16_s64 (x0, 0), + p0 = svwhilele_b16 (x0, (int64_t) 0)) + +/* +** whilele_r5_b16_s64: +** mov (x[0-9]+), #?5 +** whilele p0\.h, x0, \1 
+** ret +*/ +TEST_COMPARE_S (whilele_r5_b16_s64, int64_t, + p0 = svwhilele_b16_s64 (x0, 5), + p0 = svwhilele_b16 (x0, (int64_t) 5)) + +/* +** whilele_rr_b16_u32: +** whilels p0\.h, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b16_u32, uint32_t, + p0 = svwhilele_b16_u32 (x0, x1), + p0 = svwhilele_b16 (x0, x1)) + +/* +** whilele_0r_b16_u32: +** whilels p0\.h, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b16_u32, uint32_t, + p0 = svwhilele_b16_u32 (0, x1), + p0 = svwhilele_b16 ((uint32_t) 0, x1)) + +/* +** whilele_5r_b16_u32: +** mov (w[0-9]+), #?5 +** whilels p0\.h, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b16_u32, uint32_t, + p0 = svwhilele_b16_u32 (5, x1), + p0 = svwhilele_b16 ((uint32_t) 5, x1)) + +/* +** whilele_r5_b16_u32: +** mov (w[0-9]+), #?5 +** whilels p0\.h, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b16_u32, uint32_t, + p0 = svwhilele_b16_u32 (x0, 5), + p0 = svwhilele_b16 (x0, (uint32_t) 5)) + +/* +** whilele_rr_b16_u64: +** whilels p0\.h, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b16_u64, uint64_t, + p0 = svwhilele_b16_u64 (x0, x1), + p0 = svwhilele_b16 (x0, x1)) + +/* +** whilele_0r_b16_u64: +** whilels p0\.h, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b16_u64, uint64_t, + p0 = svwhilele_b16_u64 (0, x1), + p0 = svwhilele_b16 ((uint64_t) 0, x1)) + +/* +** whilele_5r_b16_u64: +** mov (x[0-9]+), #?5 +** whilels p0\.h, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b16_u64, uint64_t, + p0 = svwhilele_b16_u64 (5, x1), + p0 = svwhilele_b16 ((uint64_t) 5, x1)) + +/* +** whilele_r5_b16_u64: +** mov (x[0-9]+), #?5 +** whilels p0\.h, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b16_u64, uint64_t, + p0 = svwhilele_b16_u64 (x0, 5), + p0 = svwhilele_b16 (x0, (uint64_t) 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b32.c new file mode 100644 index 000000000..d369ccfa3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b32.c @@ -0,0 +1,173 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** whilele_rr_b32_s32: +** whilele p0\.s, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b32_s32, int32_t, + p0 = svwhilele_b32_s32 (x0, x1), + p0 = svwhilele_b32 (x0, x1)) + +/* +** whilele_0r_b32_s32: +** whilele p0\.s, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b32_s32, int32_t, + p0 = svwhilele_b32_s32 (0, x1), + p0 = svwhilele_b32 (0, x1)) + +/* +** whilele_5r_b32_s32: +** mov (w[0-9]+), #?5 +** whilele p0\.s, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b32_s32, int32_t, + p0 = svwhilele_b32_s32 (5, x1), + p0 = svwhilele_b32 (5, x1)) + +/* +** whilele_r0_b32_s32: +** whilele p0\.s, w0, wzr +** ret +*/ +TEST_COMPARE_S (whilele_r0_b32_s32, int32_t, + p0 = svwhilele_b32_s32 (x0, 0), + p0 = svwhilele_b32 (x0, 0)) + +/* +** whilele_r5_b32_s32: +** mov (w[0-9]+), #?5 +** whilele p0\.s, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b32_s32, int32_t, + p0 = svwhilele_b32_s32 (x0, 5), + p0 = svwhilele_b32 (x0, 5)) + +/* +** whilele_rr_b32_s64: +** whilele p0\.s, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b32_s64, int64_t, + p0 = svwhilele_b32_s64 (x0, x1), + p0 = svwhilele_b32 (x0, x1)) + +/* +** whilele_0r_b32_s64: +** whilele p0\.s, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b32_s64, int64_t, + p0 = svwhilele_b32_s64 (0, x1), + p0 = svwhilele_b32 ((int64_t) 0, x1)) + +/* +** whilele_5r_b32_s64: +** mov (x[0-9]+), #?5 +** whilele p0\.s, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b32_s64, int64_t, + 
p0 = svwhilele_b32_s64 (5, x1), + p0 = svwhilele_b32 ((int64_t) 5, x1)) + +/* +** whilele_r0_b32_s64: +** whilele p0\.s, x0, xzr +** ret +*/ +TEST_COMPARE_S (whilele_r0_b32_s64, int64_t, + p0 = svwhilele_b32_s64 (x0, 0), + p0 = svwhilele_b32 (x0, (int64_t) 0)) + +/* +** whilele_r5_b32_s64: +** mov (x[0-9]+), #?5 +** whilele p0\.s, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b32_s64, int64_t, + p0 = svwhilele_b32_s64 (x0, 5), + p0 = svwhilele_b32 (x0, (int64_t) 5)) + +/* +** whilele_rr_b32_u32: +** whilels p0\.s, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b32_u32, uint32_t, + p0 = svwhilele_b32_u32 (x0, x1), + p0 = svwhilele_b32 (x0, x1)) + +/* +** whilele_0r_b32_u32: +** whilels p0\.s, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b32_u32, uint32_t, + p0 = svwhilele_b32_u32 (0, x1), + p0 = svwhilele_b32 ((uint32_t) 0, x1)) + +/* +** whilele_5r_b32_u32: +** mov (w[0-9]+), #?5 +** whilels p0\.s, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b32_u32, uint32_t, + p0 = svwhilele_b32_u32 (5, x1), + p0 = svwhilele_b32 ((uint32_t) 5, x1)) + +/* +** whilele_r5_b32_u32: +** mov (w[0-9]+), #?5 +** whilels p0\.s, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b32_u32, uint32_t, + p0 = svwhilele_b32_u32 (x0, 5), + p0 = svwhilele_b32 (x0, (uint32_t) 5)) + +/* +** whilele_rr_b32_u64: +** whilels p0\.s, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b32_u64, uint64_t, + p0 = svwhilele_b32_u64 (x0, x1), + p0 = svwhilele_b32 (x0, x1)) + +/* +** whilele_0r_b32_u64: +** whilels p0\.s, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b32_u64, uint64_t, + p0 = svwhilele_b32_u64 (0, x1), + p0 = svwhilele_b32 ((uint64_t) 0, x1)) + +/* +** whilele_5r_b32_u64: +** mov (x[0-9]+), #?5 +** whilels p0\.s, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b32_u64, uint64_t, + p0 = svwhilele_b32_u64 (5, x1), + p0 = svwhilele_b32 ((uint64_t) 5, x1)) + +/* +** whilele_r5_b32_u64: +** mov (x[0-9]+), #?5 +** whilels p0\.s, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b32_u64, uint64_t, + p0 = svwhilele_b32_u64 (x0, 5), + p0 = svwhilele_b32 (x0, (uint64_t) 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b64.c new file mode 100644 index 000000000..394f51f44 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b64.c @@ -0,0 +1,173 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** whilele_rr_b64_s32: +** whilele p0\.d, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b64_s32, int32_t, + p0 = svwhilele_b64_s32 (x0, x1), + p0 = svwhilele_b64 (x0, x1)) + +/* +** whilele_0r_b64_s32: +** whilele p0\.d, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b64_s32, int32_t, + p0 = svwhilele_b64_s32 (0, x1), + p0 = svwhilele_b64 (0, x1)) + +/* +** whilele_5r_b64_s32: +** mov (w[0-9]+), #?5 +** whilele p0\.d, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b64_s32, int32_t, + p0 = svwhilele_b64_s32 (5, x1), + p0 = svwhilele_b64 (5, x1)) + +/* +** whilele_r0_b64_s32: +** whilele p0\.d, w0, wzr +** ret +*/ +TEST_COMPARE_S (whilele_r0_b64_s32, int32_t, + p0 = svwhilele_b64_s32 (x0, 0), + p0 = svwhilele_b64 (x0, 0)) + +/* +** whilele_r5_b64_s32: +** mov (w[0-9]+), #?5 +** whilele p0\.d, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b64_s32, int32_t, + p0 = svwhilele_b64_s32 (x0, 5), + p0 = svwhilele_b64 (x0, 5)) + +/* +** whilele_rr_b64_s64: +** whilele p0\.d, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b64_s64, int64_t, + p0 = svwhilele_b64_s64 (x0, x1), + p0 = 
svwhilele_b64 (x0, x1)) + +/* +** whilele_0r_b64_s64: +** whilele p0\.d, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b64_s64, int64_t, + p0 = svwhilele_b64_s64 (0, x1), + p0 = svwhilele_b64 ((int64_t) 0, x1)) + +/* +** whilele_5r_b64_s64: +** mov (x[0-9]+), #?5 +** whilele p0\.d, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b64_s64, int64_t, + p0 = svwhilele_b64_s64 (5, x1), + p0 = svwhilele_b64 ((int64_t) 5, x1)) + +/* +** whilele_r0_b64_s64: +** whilele p0\.d, x0, xzr +** ret +*/ +TEST_COMPARE_S (whilele_r0_b64_s64, int64_t, + p0 = svwhilele_b64_s64 (x0, 0), + p0 = svwhilele_b64 (x0, (int64_t) 0)) + +/* +** whilele_r5_b64_s64: +** mov (x[0-9]+), #?5 +** whilele p0\.d, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b64_s64, int64_t, + p0 = svwhilele_b64_s64 (x0, 5), + p0 = svwhilele_b64 (x0, (int64_t) 5)) + +/* +** whilele_rr_b64_u32: +** whilels p0\.d, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b64_u32, uint32_t, + p0 = svwhilele_b64_u32 (x0, x1), + p0 = svwhilele_b64 (x0, x1)) + +/* +** whilele_0r_b64_u32: +** whilels p0\.d, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b64_u32, uint32_t, + p0 = svwhilele_b64_u32 (0, x1), + p0 = svwhilele_b64 ((uint32_t) 0, x1)) + +/* +** whilele_5r_b64_u32: +** mov (w[0-9]+), #?5 +** whilels p0\.d, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b64_u32, uint32_t, + p0 = svwhilele_b64_u32 (5, x1), + p0 = svwhilele_b64 ((uint32_t) 5, x1)) + +/* +** whilele_r5_b64_u32: +** mov (w[0-9]+), #?5 +** whilels p0\.d, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b64_u32, uint32_t, + p0 = svwhilele_b64_u32 (x0, 5), + p0 = svwhilele_b64 (x0, (uint32_t) 5)) + +/* +** whilele_rr_b64_u64: +** whilels p0\.d, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b64_u64, uint64_t, + p0 = svwhilele_b64_u64 (x0, x1), + p0 = svwhilele_b64 (x0, x1)) + +/* +** whilele_0r_b64_u64: +** whilels p0\.d, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b64_u64, uint64_t, + p0 = svwhilele_b64_u64 (0, x1), + p0 = svwhilele_b64 ((uint64_t) 0, x1)) + +/* +** whilele_5r_b64_u64: +** mov (x[0-9]+), #?5 +** whilels p0\.d, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b64_u64, uint64_t, + p0 = svwhilele_b64_u64 (5, x1), + p0 = svwhilele_b64 ((uint64_t) 5, x1)) + +/* +** whilele_r5_b64_u64: +** mov (x[0-9]+), #?5 +** whilels p0\.d, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b64_u64, uint64_t, + p0 = svwhilele_b64_u64 (x0, 5), + p0 = svwhilele_b64 (x0, (uint64_t) 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b8.c new file mode 100644 index 000000000..2ec101473 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilele_b8.c @@ -0,0 +1,173 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** whilele_rr_b8_s32: +** whilele p0\.b, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b8_s32, int32_t, + p0 = svwhilele_b8_s32 (x0, x1), + p0 = svwhilele_b8 (x0, x1)) + +/* +** whilele_0r_b8_s32: +** whilele p0\.b, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b8_s32, int32_t, + p0 = svwhilele_b8_s32 (0, x1), + p0 = svwhilele_b8 (0, x1)) + +/* +** whilele_5r_b8_s32: +** mov (w[0-9]+), #?5 +** whilele p0\.b, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b8_s32, int32_t, + p0 = svwhilele_b8_s32 (5, x1), + p0 = svwhilele_b8 (5, x1)) + +/* +** whilele_r0_b8_s32: +** whilele p0\.b, w0, wzr +** ret +*/ +TEST_COMPARE_S (whilele_r0_b8_s32, int32_t, + p0 = svwhilele_b8_s32 (x0, 0), + p0 = svwhilele_b8 (x0, 0)) + +/* +** whilele_r5_b8_s32: +** 
mov (w[0-9]+), #?5 +** whilele p0\.b, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b8_s32, int32_t, + p0 = svwhilele_b8_s32 (x0, 5), + p0 = svwhilele_b8 (x0, 5)) + +/* +** whilele_rr_b8_s64: +** whilele p0\.b, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b8_s64, int64_t, + p0 = svwhilele_b8_s64 (x0, x1), + p0 = svwhilele_b8 (x0, x1)) + +/* +** whilele_0r_b8_s64: +** whilele p0\.b, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b8_s64, int64_t, + p0 = svwhilele_b8_s64 (0, x1), + p0 = svwhilele_b8 ((int64_t) 0, x1)) + +/* +** whilele_5r_b8_s64: +** mov (x[0-9]+), #?5 +** whilele p0\.b, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b8_s64, int64_t, + p0 = svwhilele_b8_s64 (5, x1), + p0 = svwhilele_b8 ((int64_t) 5, x1)) + +/* +** whilele_r0_b8_s64: +** whilele p0\.b, x0, xzr +** ret +*/ +TEST_COMPARE_S (whilele_r0_b8_s64, int64_t, + p0 = svwhilele_b8_s64 (x0, 0), + p0 = svwhilele_b8 (x0, (int64_t) 0)) + +/* +** whilele_r5_b8_s64: +** mov (x[0-9]+), #?5 +** whilele p0\.b, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b8_s64, int64_t, + p0 = svwhilele_b8_s64 (x0, 5), + p0 = svwhilele_b8 (x0, (int64_t) 5)) + +/* +** whilele_rr_b8_u32: +** whilels p0\.b, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b8_u32, uint32_t, + p0 = svwhilele_b8_u32 (x0, x1), + p0 = svwhilele_b8 (x0, x1)) + +/* +** whilele_0r_b8_u32: +** whilels p0\.b, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b8_u32, uint32_t, + p0 = svwhilele_b8_u32 (0, x1), + p0 = svwhilele_b8 ((uint32_t) 0, x1)) + +/* +** whilele_5r_b8_u32: +** mov (w[0-9]+), #?5 +** whilels p0\.b, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b8_u32, uint32_t, + p0 = svwhilele_b8_u32 (5, x1), + p0 = svwhilele_b8 ((uint32_t) 5, x1)) + +/* +** whilele_r5_b8_u32: +** mov (w[0-9]+), #?5 +** whilels p0\.b, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b8_u32, uint32_t, + p0 = svwhilele_b8_u32 (x0, 5), + p0 = svwhilele_b8 (x0, (uint32_t) 5)) + +/* +** whilele_rr_b8_u64: +** whilels p0\.b, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilele_rr_b8_u64, uint64_t, + p0 = svwhilele_b8_u64 (x0, x1), + p0 = svwhilele_b8 (x0, x1)) + +/* +** whilele_0r_b8_u64: +** whilels p0\.b, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilele_0r_b8_u64, uint64_t, + p0 = svwhilele_b8_u64 (0, x1), + p0 = svwhilele_b8 ((uint64_t) 0, x1)) + +/* +** whilele_5r_b8_u64: +** mov (x[0-9]+), #?5 +** whilels p0\.b, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilele_5r_b8_u64, uint64_t, + p0 = svwhilele_b8_u64 (5, x1), + p0 = svwhilele_b8 ((uint64_t) 5, x1)) + +/* +** whilele_r5_b8_u64: +** mov (x[0-9]+), #?5 +** whilels p0\.b, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilele_r5_b8_u64, uint64_t, + p0 = svwhilele_b8_u64 (x0, 5), + p0 = svwhilele_b8 (x0, (uint64_t) 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b16.c new file mode 100644 index 000000000..14a60432b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b16.c @@ -0,0 +1,173 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** whilelt_rr_b16_s32: +** whilelt p0\.h, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b16_s32, int32_t, + p0 = svwhilelt_b16_s32 (x0, x1), + p0 = svwhilelt_b16 (x0, x1)) + +/* +** whilelt_0r_b16_s32: +** whilelt p0\.h, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b16_s32, int32_t, + p0 = svwhilelt_b16_s32 (0, x1), + p0 = svwhilelt_b16 (0, x1)) + +/* +** whilelt_5r_b16_s32: +** mov (w[0-9]+), #?5 +** whilelt p0\.h, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b16_s32, 
int32_t, + p0 = svwhilelt_b16_s32 (5, x1), + p0 = svwhilelt_b16 (5, x1)) + +/* +** whilelt_r0_b16_s32: +** whilelt p0\.h, w0, wzr +** ret +*/ +TEST_COMPARE_S (whilelt_r0_b16_s32, int32_t, + p0 = svwhilelt_b16_s32 (x0, 0), + p0 = svwhilelt_b16 (x0, 0)) + +/* +** whilelt_r5_b16_s32: +** mov (w[0-9]+), #?5 +** whilelt p0\.h, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b16_s32, int32_t, + p0 = svwhilelt_b16_s32 (x0, 5), + p0 = svwhilelt_b16 (x0, 5)) + +/* +** whilelt_rr_b16_s64: +** whilelt p0\.h, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b16_s64, int64_t, + p0 = svwhilelt_b16_s64 (x0, x1), + p0 = svwhilelt_b16 (x0, x1)) + +/* +** whilelt_0r_b16_s64: +** whilelt p0\.h, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b16_s64, int64_t, + p0 = svwhilelt_b16_s64 (0, x1), + p0 = svwhilelt_b16 ((int64_t) 0, x1)) + +/* +** whilelt_5r_b16_s64: +** mov (x[0-9]+), #?5 +** whilelt p0\.h, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b16_s64, int64_t, + p0 = svwhilelt_b16_s64 (5, x1), + p0 = svwhilelt_b16 ((int64_t) 5, x1)) + +/* +** whilelt_r0_b16_s64: +** whilelt p0\.h, x0, xzr +** ret +*/ +TEST_COMPARE_S (whilelt_r0_b16_s64, int64_t, + p0 = svwhilelt_b16_s64 (x0, 0), + p0 = svwhilelt_b16 (x0, (int64_t) 0)) + +/* +** whilelt_r5_b16_s64: +** mov (x[0-9]+), #?5 +** whilelt p0\.h, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b16_s64, int64_t, + p0 = svwhilelt_b16_s64 (x0, 5), + p0 = svwhilelt_b16 (x0, (int64_t) 5)) + +/* +** whilelt_rr_b16_u32: +** whilelo p0\.h, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b16_u32, uint32_t, + p0 = svwhilelt_b16_u32 (x0, x1), + p0 = svwhilelt_b16 (x0, x1)) + +/* +** whilelt_0r_b16_u32: +** whilelo p0\.h, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b16_u32, uint32_t, + p0 = svwhilelt_b16_u32 (0, x1), + p0 = svwhilelt_b16 ((uint32_t) 0, x1)) + +/* +** whilelt_5r_b16_u32: +** mov (w[0-9]+), #?5 +** whilelo p0\.h, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b16_u32, uint32_t, + p0 = svwhilelt_b16_u32 (5, x1), + p0 = svwhilelt_b16 ((uint32_t) 5, x1)) + +/* +** whilelt_r5_b16_u32: +** mov (w[0-9]+), #?5 +** whilelo p0\.h, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b16_u32, uint32_t, + p0 = svwhilelt_b16_u32 (x0, 5), + p0 = svwhilelt_b16 (x0, (uint32_t) 5)) + +/* +** whilelt_rr_b16_u64: +** whilelo p0\.h, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b16_u64, uint64_t, + p0 = svwhilelt_b16_u64 (x0, x1), + p0 = svwhilelt_b16 (x0, x1)) + +/* +** whilelt_0r_b16_u64: +** whilelo p0\.h, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b16_u64, uint64_t, + p0 = svwhilelt_b16_u64 (0, x1), + p0 = svwhilelt_b16 ((uint64_t) 0, x1)) + +/* +** whilelt_5r_b16_u64: +** mov (x[0-9]+), #?5 +** whilelo p0\.h, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b16_u64, uint64_t, + p0 = svwhilelt_b16_u64 (5, x1), + p0 = svwhilelt_b16 ((uint64_t) 5, x1)) + +/* +** whilelt_r5_b16_u64: +** mov (x[0-9]+), #?5 +** whilelo p0\.h, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b16_u64, uint64_t, + p0 = svwhilelt_b16_u64 (x0, 5), + p0 = svwhilelt_b16 (x0, (uint64_t) 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b32.c new file mode 100644 index 000000000..0e50bb07a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b32.c @@ -0,0 +1,173 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** whilelt_rr_b32_s32: +** whilelt p0\.s, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b32_s32, int32_t, + p0 = svwhilelt_b32_s32 (x0, 
x1), + p0 = svwhilelt_b32 (x0, x1)) + +/* +** whilelt_0r_b32_s32: +** whilelt p0\.s, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b32_s32, int32_t, + p0 = svwhilelt_b32_s32 (0, x1), + p0 = svwhilelt_b32 (0, x1)) + +/* +** whilelt_5r_b32_s32: +** mov (w[0-9]+), #?5 +** whilelt p0\.s, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b32_s32, int32_t, + p0 = svwhilelt_b32_s32 (5, x1), + p0 = svwhilelt_b32 (5, x1)) + +/* +** whilelt_r0_b32_s32: +** whilelt p0\.s, w0, wzr +** ret +*/ +TEST_COMPARE_S (whilelt_r0_b32_s32, int32_t, + p0 = svwhilelt_b32_s32 (x0, 0), + p0 = svwhilelt_b32 (x0, 0)) + +/* +** whilelt_r5_b32_s32: +** mov (w[0-9]+), #?5 +** whilelt p0\.s, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b32_s32, int32_t, + p0 = svwhilelt_b32_s32 (x0, 5), + p0 = svwhilelt_b32 (x0, 5)) + +/* +** whilelt_rr_b32_s64: +** whilelt p0\.s, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b32_s64, int64_t, + p0 = svwhilelt_b32_s64 (x0, x1), + p0 = svwhilelt_b32 (x0, x1)) + +/* +** whilelt_0r_b32_s64: +** whilelt p0\.s, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b32_s64, int64_t, + p0 = svwhilelt_b32_s64 (0, x1), + p0 = svwhilelt_b32 ((int64_t) 0, x1)) + +/* +** whilelt_5r_b32_s64: +** mov (x[0-9]+), #?5 +** whilelt p0\.s, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b32_s64, int64_t, + p0 = svwhilelt_b32_s64 (5, x1), + p0 = svwhilelt_b32 ((int64_t) 5, x1)) + +/* +** whilelt_r0_b32_s64: +** whilelt p0\.s, x0, xzr +** ret +*/ +TEST_COMPARE_S (whilelt_r0_b32_s64, int64_t, + p0 = svwhilelt_b32_s64 (x0, 0), + p0 = svwhilelt_b32 (x0, (int64_t) 0)) + +/* +** whilelt_r5_b32_s64: +** mov (x[0-9]+), #?5 +** whilelt p0\.s, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b32_s64, int64_t, + p0 = svwhilelt_b32_s64 (x0, 5), + p0 = svwhilelt_b32 (x0, (int64_t) 5)) + +/* +** whilelt_rr_b32_u32: +** whilelo p0\.s, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b32_u32, uint32_t, + p0 = svwhilelt_b32_u32 (x0, x1), + p0 = svwhilelt_b32 (x0, x1)) + +/* +** whilelt_0r_b32_u32: +** whilelo p0\.s, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b32_u32, uint32_t, + p0 = svwhilelt_b32_u32 (0, x1), + p0 = svwhilelt_b32 ((uint32_t) 0, x1)) + +/* +** whilelt_5r_b32_u32: +** mov (w[0-9]+), #?5 +** whilelo p0\.s, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b32_u32, uint32_t, + p0 = svwhilelt_b32_u32 (5, x1), + p0 = svwhilelt_b32 ((uint32_t) 5, x1)) + +/* +** whilelt_r5_b32_u32: +** mov (w[0-9]+), #?5 +** whilelo p0\.s, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b32_u32, uint32_t, + p0 = svwhilelt_b32_u32 (x0, 5), + p0 = svwhilelt_b32 (x0, (uint32_t) 5)) + +/* +** whilelt_rr_b32_u64: +** whilelo p0\.s, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b32_u64, uint64_t, + p0 = svwhilelt_b32_u64 (x0, x1), + p0 = svwhilelt_b32 (x0, x1)) + +/* +** whilelt_0r_b32_u64: +** whilelo p0\.s, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b32_u64, uint64_t, + p0 = svwhilelt_b32_u64 (0, x1), + p0 = svwhilelt_b32 ((uint64_t) 0, x1)) + +/* +** whilelt_5r_b32_u64: +** mov (x[0-9]+), #?5 +** whilelo p0\.s, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b32_u64, uint64_t, + p0 = svwhilelt_b32_u64 (5, x1), + p0 = svwhilelt_b32 ((uint64_t) 5, x1)) + +/* +** whilelt_r5_b32_u64: +** mov (x[0-9]+), #?5 +** whilelo p0\.s, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b32_u64, uint64_t, + p0 = svwhilelt_b32_u64 (x0, 5), + p0 = svwhilelt_b32 (x0, (uint64_t) 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b64.c new file mode 100644 index 000000000..539c93347 
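As an aside (not part of these patches): the whilele_*/whilelt_* test files in this group all follow the same pattern — the signed svwhile*_bN_s32/_s64 forms are expected to assemble to WHILELE/WHILELT and the unsigned _u32/_u64 forms to WHILELS/WHILELO, with constant operands first moved into a scalar register. The sketch below shows how such a predicate-generating intrinsic is typically used to drive a strip-mined SVE loop; it assumes only the standard arm_sve.h API, and the function itself is illustrative rather than taken from the patch.

#include <arm_sve.h>
#include <stdint.h>

/* Illustrative only, not part of the backported patch: scale n floats
   in place, one vector at a time.  svwhilelt_b32 builds the governing
   predicate; with signed 32-bit counters this is the form the tests
   above expect to lower to WHILELT (unsigned counters would give
   WHILELO instead).  */
void scale_f32 (float *x, int32_t n, float s)
{
  for (int32_t i = 0; i < n; i += (int32_t) svcntw ())
    {
      svbool_t pg = svwhilelt_b32 (i, n);      /* whilelt p.s, w, w  */
      svfloat32_t v = svld1 (pg, x + i);       /* predicated load    */
      svst1 (pg, x + i, svmul_x (pg, v, s));   /* scale and store    */
    }
}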
--- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b64.c @@ -0,0 +1,173 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** whilelt_rr_b64_s32: +** whilelt p0\.d, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b64_s32, int32_t, + p0 = svwhilelt_b64_s32 (x0, x1), + p0 = svwhilelt_b64 (x0, x1)) + +/* +** whilelt_0r_b64_s32: +** whilelt p0\.d, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b64_s32, int32_t, + p0 = svwhilelt_b64_s32 (0, x1), + p0 = svwhilelt_b64 (0, x1)) + +/* +** whilelt_5r_b64_s32: +** mov (w[0-9]+), #?5 +** whilelt p0\.d, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b64_s32, int32_t, + p0 = svwhilelt_b64_s32 (5, x1), + p0 = svwhilelt_b64 (5, x1)) + +/* +** whilelt_r0_b64_s32: +** whilelt p0\.d, w0, wzr +** ret +*/ +TEST_COMPARE_S (whilelt_r0_b64_s32, int32_t, + p0 = svwhilelt_b64_s32 (x0, 0), + p0 = svwhilelt_b64 (x0, 0)) + +/* +** whilelt_r5_b64_s32: +** mov (w[0-9]+), #?5 +** whilelt p0\.d, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b64_s32, int32_t, + p0 = svwhilelt_b64_s32 (x0, 5), + p0 = svwhilelt_b64 (x0, 5)) + +/* +** whilelt_rr_b64_s64: +** whilelt p0\.d, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b64_s64, int64_t, + p0 = svwhilelt_b64_s64 (x0, x1), + p0 = svwhilelt_b64 (x0, x1)) + +/* +** whilelt_0r_b64_s64: +** whilelt p0\.d, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b64_s64, int64_t, + p0 = svwhilelt_b64_s64 (0, x1), + p0 = svwhilelt_b64 ((int64_t) 0, x1)) + +/* +** whilelt_5r_b64_s64: +** mov (x[0-9]+), #?5 +** whilelt p0\.d, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b64_s64, int64_t, + p0 = svwhilelt_b64_s64 (5, x1), + p0 = svwhilelt_b64 ((int64_t) 5, x1)) + +/* +** whilelt_r0_b64_s64: +** whilelt p0\.d, x0, xzr +** ret +*/ +TEST_COMPARE_S (whilelt_r0_b64_s64, int64_t, + p0 = svwhilelt_b64_s64 (x0, 0), + p0 = svwhilelt_b64 (x0, (int64_t) 0)) + +/* +** whilelt_r5_b64_s64: +** mov (x[0-9]+), #?5 +** whilelt p0\.d, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b64_s64, int64_t, + p0 = svwhilelt_b64_s64 (x0, 5), + p0 = svwhilelt_b64 (x0, (int64_t) 5)) + +/* +** whilelt_rr_b64_u32: +** whilelo p0\.d, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b64_u32, uint32_t, + p0 = svwhilelt_b64_u32 (x0, x1), + p0 = svwhilelt_b64 (x0, x1)) + +/* +** whilelt_0r_b64_u32: +** whilelo p0\.d, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b64_u32, uint32_t, + p0 = svwhilelt_b64_u32 (0, x1), + p0 = svwhilelt_b64 ((uint32_t) 0, x1)) + +/* +** whilelt_5r_b64_u32: +** mov (w[0-9]+), #?5 +** whilelo p0\.d, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b64_u32, uint32_t, + p0 = svwhilelt_b64_u32 (5, x1), + p0 = svwhilelt_b64 ((uint32_t) 5, x1)) + +/* +** whilelt_r5_b64_u32: +** mov (w[0-9]+), #?5 +** whilelo p0\.d, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b64_u32, uint32_t, + p0 = svwhilelt_b64_u32 (x0, 5), + p0 = svwhilelt_b64 (x0, (uint32_t) 5)) + +/* +** whilelt_rr_b64_u64: +** whilelo p0\.d, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b64_u64, uint64_t, + p0 = svwhilelt_b64_u64 (x0, x1), + p0 = svwhilelt_b64 (x0, x1)) + +/* +** whilelt_0r_b64_u64: +** whilelo p0\.d, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b64_u64, uint64_t, + p0 = svwhilelt_b64_u64 (0, x1), + p0 = svwhilelt_b64 ((uint64_t) 0, x1)) + +/* +** whilelt_5r_b64_u64: +** mov (x[0-9]+), #?5 +** whilelo p0\.d, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b64_u64, uint64_t, + p0 = svwhilelt_b64_u64 (5, x1), + p0 = svwhilelt_b64 ((uint64_t) 5, x1)) + +/* +** whilelt_r5_b64_u64: +** mov (x[0-9]+), #?5 
+** whilelo p0\.d, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b64_u64, uint64_t, + p0 = svwhilelt_b64_u64 (x0, 5), + p0 = svwhilelt_b64 (x0, (uint64_t) 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b8.c new file mode 100644 index 000000000..5b6a5c44d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/whilelt_b8.c @@ -0,0 +1,173 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** whilelt_rr_b8_s32: +** whilelt p0\.b, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b8_s32, int32_t, + p0 = svwhilelt_b8_s32 (x0, x1), + p0 = svwhilelt_b8 (x0, x1)) + +/* +** whilelt_0r_b8_s32: +** whilelt p0\.b, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b8_s32, int32_t, + p0 = svwhilelt_b8_s32 (0, x1), + p0 = svwhilelt_b8 (0, x1)) + +/* +** whilelt_5r_b8_s32: +** mov (w[0-9]+), #?5 +** whilelt p0\.b, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b8_s32, int32_t, + p0 = svwhilelt_b8_s32 (5, x1), + p0 = svwhilelt_b8 (5, x1)) + +/* +** whilelt_r0_b8_s32: +** whilelt p0\.b, w0, wzr +** ret +*/ +TEST_COMPARE_S (whilelt_r0_b8_s32, int32_t, + p0 = svwhilelt_b8_s32 (x0, 0), + p0 = svwhilelt_b8 (x0, 0)) + +/* +** whilelt_r5_b8_s32: +** mov (w[0-9]+), #?5 +** whilelt p0\.b, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b8_s32, int32_t, + p0 = svwhilelt_b8_s32 (x0, 5), + p0 = svwhilelt_b8 (x0, 5)) + +/* +** whilelt_rr_b8_s64: +** whilelt p0\.b, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b8_s64, int64_t, + p0 = svwhilelt_b8_s64 (x0, x1), + p0 = svwhilelt_b8 (x0, x1)) + +/* +** whilelt_0r_b8_s64: +** whilelt p0\.b, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b8_s64, int64_t, + p0 = svwhilelt_b8_s64 (0, x1), + p0 = svwhilelt_b8 ((int64_t) 0, x1)) + +/* +** whilelt_5r_b8_s64: +** mov (x[0-9]+), #?5 +** whilelt p0\.b, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b8_s64, int64_t, + p0 = svwhilelt_b8_s64 (5, x1), + p0 = svwhilelt_b8 ((int64_t) 5, x1)) + +/* +** whilelt_r0_b8_s64: +** whilelt p0\.b, x0, xzr +** ret +*/ +TEST_COMPARE_S (whilelt_r0_b8_s64, int64_t, + p0 = svwhilelt_b8_s64 (x0, 0), + p0 = svwhilelt_b8 (x0, (int64_t) 0)) + +/* +** whilelt_r5_b8_s64: +** mov (x[0-9]+), #?5 +** whilelt p0\.b, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b8_s64, int64_t, + p0 = svwhilelt_b8_s64 (x0, 5), + p0 = svwhilelt_b8 (x0, (int64_t) 5)) + +/* +** whilelt_rr_b8_u32: +** whilelo p0\.b, w0, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b8_u32, uint32_t, + p0 = svwhilelt_b8_u32 (x0, x1), + p0 = svwhilelt_b8 (x0, x1)) + +/* +** whilelt_0r_b8_u32: +** whilelo p0\.b, wzr, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b8_u32, uint32_t, + p0 = svwhilelt_b8_u32 (0, x1), + p0 = svwhilelt_b8 ((uint32_t) 0, x1)) + +/* +** whilelt_5r_b8_u32: +** mov (w[0-9]+), #?5 +** whilelo p0\.b, \1, w1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b8_u32, uint32_t, + p0 = svwhilelt_b8_u32 (5, x1), + p0 = svwhilelt_b8 ((uint32_t) 5, x1)) + +/* +** whilelt_r5_b8_u32: +** mov (w[0-9]+), #?5 +** whilelo p0\.b, w0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b8_u32, uint32_t, + p0 = svwhilelt_b8_u32 (x0, 5), + p0 = svwhilelt_b8 (x0, (uint32_t) 5)) + +/* +** whilelt_rr_b8_u64: +** whilelo p0\.b, x0, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_rr_b8_u64, uint64_t, + p0 = svwhilelt_b8_u64 (x0, x1), + p0 = svwhilelt_b8 (x0, x1)) + +/* +** whilelt_0r_b8_u64: +** whilelo p0\.b, xzr, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_0r_b8_u64, uint64_t, + p0 = svwhilelt_b8_u64 (0, x1), + p0 = svwhilelt_b8 ((uint64_t) 
0, x1)) + +/* +** whilelt_5r_b8_u64: +** mov (x[0-9]+), #?5 +** whilelo p0\.b, \1, x1 +** ret +*/ +TEST_COMPARE_S (whilelt_5r_b8_u64, uint64_t, + p0 = svwhilelt_b8_u64 (5, x1), + p0 = svwhilelt_b8 ((uint64_t) 5, x1)) + +/* +** whilelt_r5_b8_u64: +** mov (x[0-9]+), #?5 +** whilelo p0\.b, x0, \1 +** ret +*/ +TEST_COMPARE_S (whilelt_r5_b8_u64, uint64_t, + p0 = svwhilelt_b8_u64 (x0, 5), + p0 = svwhilelt_b8 (x0, (uint64_t) 5)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b16.c new file mode 100644 index 000000000..269260eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_b16_tied1: +** zip1 p0\.h, p0\.h, p1\.h +** ret +*/ +TEST_UNIFORM_P (zip1_b16_tied1, + p0 = svzip1_b16 (p0, p1), + p0 = svzip1_b16 (p0, p1)) + +/* +** zip1_b16_tied2: +** zip1 p0\.h, p1\.h, p0\.h +** ret +*/ +TEST_UNIFORM_P (zip1_b16_tied2, + p0 = svzip1_b16 (p1, p0), + p0 = svzip1_b16 (p1, p0)) + +/* +** zip1_b16_untied: +** zip1 p0\.h, p1\.h, p2\.h +** ret +*/ +TEST_UNIFORM_P (zip1_b16_untied, + p0 = svzip1_b16 (p1, p2), + p0 = svzip1_b16 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b32.c new file mode 100644 index 000000000..027609a7d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_b32_tied1: +** zip1 p0\.s, p0\.s, p1\.s +** ret +*/ +TEST_UNIFORM_P (zip1_b32_tied1, + p0 = svzip1_b32 (p0, p1), + p0 = svzip1_b32 (p0, p1)) + +/* +** zip1_b32_tied2: +** zip1 p0\.s, p1\.s, p0\.s +** ret +*/ +TEST_UNIFORM_P (zip1_b32_tied2, + p0 = svzip1_b32 (p1, p0), + p0 = svzip1_b32 (p1, p0)) + +/* +** zip1_b32_untied: +** zip1 p0\.s, p1\.s, p2\.s +** ret +*/ +TEST_UNIFORM_P (zip1_b32_untied, + p0 = svzip1_b32 (p1, p2), + p0 = svzip1_b32 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b64.c new file mode 100644 index 000000000..8add16d8e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_b64_tied1: +** zip1 p0\.d, p0\.d, p1\.d +** ret +*/ +TEST_UNIFORM_P (zip1_b64_tied1, + p0 = svzip1_b64 (p0, p1), + p0 = svzip1_b64 (p0, p1)) + +/* +** zip1_b64_tied2: +** zip1 p0\.d, p1\.d, p0\.d +** ret +*/ +TEST_UNIFORM_P (zip1_b64_tied2, + p0 = svzip1_b64 (p1, p0), + p0 = svzip1_b64 (p1, p0)) + +/* +** zip1_b64_untied: +** zip1 p0\.d, p1\.d, p2\.d +** ret +*/ +TEST_UNIFORM_P (zip1_b64_untied, + p0 = svzip1_b64 (p1, p2), + p0 = svzip1_b64 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b8.c new file mode 100644 index 000000000..8648298ac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_b8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_b8_tied1: +** zip1 p0\.b, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (zip1_b8_tied1, + p0 = svzip1_b8 (p0, p1), + p0 = svzip1_b8 (p0, p1)) + +/* +** zip1_b8_tied2: +** zip1 p0\.b, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P 
(zip1_b8_tied2, + p0 = svzip1_b8 (p1, p0), + p0 = svzip1_b8 (p1, p0)) + +/* +** zip1_b8_untied: +** zip1 p0\.b, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (zip1_b8_untied, + p0 = svzip1_b8 (p1, p2), + p0 = svzip1_b8 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_bf16.c new file mode 100644 index 000000000..6017cde41 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_bf16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_bf16_tied1: +** zip1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_bf16_tied1, svbfloat16_t, + z0 = svzip1_bf16 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_bf16_tied2: +** zip1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_bf16_tied2, svbfloat16_t, + z0 = svzip1_bf16 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_bf16_untied: +** zip1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_bf16_untied, svbfloat16_t, + z0 = svzip1_bf16 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f16.c new file mode 100644 index 000000000..1c6ce4e7d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_f16_tied1: +** zip1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_f16_tied1, svfloat16_t, + z0 = svzip1_f16 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_f16_tied2: +** zip1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_f16_tied2, svfloat16_t, + z0 = svzip1_f16 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_f16_untied: +** zip1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_f16_untied, svfloat16_t, + z0 = svzip1_f16 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f32.c new file mode 100644 index 000000000..288ceff3f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_f32_tied1: +** zip1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_f32_tied1, svfloat32_t, + z0 = svzip1_f32 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_f32_tied2: +** zip1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_f32_tied2, svfloat32_t, + z0 = svzip1_f32 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_f32_untied: +** zip1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_f32_untied, svfloat32_t, + z0 = svzip1_f32 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f64.c new file mode 100644 index 000000000..5abbea1cd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_f64_tied1: +** zip1 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_f64_tied1, svfloat64_t, + z0 = svzip1_f64 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_f64_tied2: +** zip1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_f64_tied2, svfloat64_t, + z0 = svzip1_f64 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** 
zip1_f64_untied: +** zip1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_f64_untied, svfloat64_t, + z0 = svzip1_f64 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s16.c new file mode 100644 index 000000000..8ecd20142 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_s16_tied1: +** zip1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_s16_tied1, svint16_t, + z0 = svzip1_s16 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_s16_tied2: +** zip1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_s16_tied2, svint16_t, + z0 = svzip1_s16 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_s16_untied: +** zip1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_s16_untied, svint16_t, + z0 = svzip1_s16 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s32.c new file mode 100644 index 000000000..c523885ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_s32_tied1: +** zip1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_s32_tied1, svint32_t, + z0 = svzip1_s32 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_s32_tied2: +** zip1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_s32_tied2, svint32_t, + z0 = svzip1_s32 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_s32_untied: +** zip1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_s32_untied, svint32_t, + z0 = svzip1_s32 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s64.c new file mode 100644 index 000000000..d1dca7ee9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_s64_tied1: +** zip1 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_s64_tied1, svint64_t, + z0 = svzip1_s64 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_s64_tied2: +** zip1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_s64_tied2, svint64_t, + z0 = svzip1_s64 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_s64_untied: +** zip1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_s64_untied, svint64_t, + z0 = svzip1_s64 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s8.c new file mode 100644 index 000000000..1600ab586 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_s8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_s8_tied1: +** zip1 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (zip1_s8_tied1, svint8_t, + z0 = svzip1_s8 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_s8_tied2: +** zip1 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (zip1_s8_tied2, svint8_t, + z0 = svzip1_s8 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_s8_untied: +** zip1 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (zip1_s8_untied, svint8_t, + z0 = svzip1_s8 (z1, 
z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u16.c new file mode 100644 index 000000000..3773ed22f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_u16_tied1: +** zip1 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_u16_tied1, svuint16_t, + z0 = svzip1_u16 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_u16_tied2: +** zip1 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_u16_tied2, svuint16_t, + z0 = svzip1_u16 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_u16_untied: +** zip1 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (zip1_u16_untied, svuint16_t, + z0 = svzip1_u16 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u32.c new file mode 100644 index 000000000..e67c121e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_u32_tied1: +** zip1 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_u32_tied1, svuint32_t, + z0 = svzip1_u32 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_u32_tied2: +** zip1 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_u32_tied2, svuint32_t, + z0 = svzip1_u32 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_u32_untied: +** zip1 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (zip1_u32_untied, svuint32_t, + z0 = svzip1_u32 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u64.c new file mode 100644 index 000000000..bb6380a6a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_u64_tied1: +** zip1 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_u64_tied1, svuint64_t, + z0 = svzip1_u64 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_u64_tied2: +** zip1 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_u64_tied2, svuint64_t, + z0 = svzip1_u64 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_u64_untied: +** zip1 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (zip1_u64_untied, svuint64_t, + z0 = svzip1_u64 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u8.c new file mode 100644 index 000000000..01d89d4fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1_u8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1_u8_tied1: +** zip1 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (zip1_u8_tied1, svuint8_t, + z0 = svzip1_u8 (z0, z1), + z0 = svzip1 (z0, z1)) + +/* +** zip1_u8_tied2: +** zip1 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (zip1_u8_tied2, svuint8_t, + z0 = svzip1_u8 (z1, z0), + z0 = svzip1 (z1, z0)) + +/* +** zip1_u8_untied: +** zip1 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (zip1_u8_untied, svuint8_t, + z0 = svzip1_u8 (z1, z2), + z0 = svzip1 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_bf16.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_bf16.c new file mode 100644 index 000000000..aabf7c0e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_bf16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_bf16_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_bf16_tied1, svbfloat16_t, + z0 = svzip1q_bf16 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_bf16_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_bf16_tied2, svbfloat16_t, + z0 = svzip1q_bf16 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_bf16_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_bf16_untied, svbfloat16_t, + z0 = svzip1q_bf16 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f16.c new file mode 100644 index 000000000..1170cc5e7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_f16_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f16_tied1, svfloat16_t, + z0 = svzip1q_f16 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_f16_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f16_tied2, svfloat16_t, + z0 = svzip1q_f16 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_f16_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f16_untied, svfloat16_t, + z0 = svzip1q_f16 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f32.c new file mode 100644 index 000000000..09666da1b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_f32_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f32_tied1, svfloat32_t, + z0 = svzip1q_f32 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_f32_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f32_tied2, svfloat32_t, + z0 = svzip1q_f32 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_f32_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f32_untied, svfloat32_t, + z0 = svzip1q_f32 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f64.c new file mode 100644 index 000000000..d77fb1c90 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_f64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_f64_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f64_tied1, svfloat64_t, + z0 = 
svzip1q_f64 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_f64_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f64_tied2, svfloat64_t, + z0 = svzip1q_f64 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_f64_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_f64_untied, svfloat64_t, + z0 = svzip1q_f64 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s16.c new file mode 100644 index 000000000..92a6b5514 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_s16_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s16_tied1, svint16_t, + z0 = svzip1q_s16 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_s16_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s16_tied2, svint16_t, + z0 = svzip1q_s16 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_s16_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s16_untied, svint16_t, + z0 = svzip1q_s16 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s32.c new file mode 100644 index 000000000..a918d2d4c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_s32_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s32_tied1, svint32_t, + z0 = svzip1q_s32 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_s32_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s32_tied2, svint32_t, + z0 = svzip1q_s32 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_s32_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s32_untied, svint32_t, + z0 = svzip1q_s32 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s64.c new file mode 100644 index 000000000..be3524fd5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_s64_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s64_tied1, svint64_t, + z0 = svzip1q_s64 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_s64_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s64_tied2, svint64_t, + z0 = svzip1q_s64 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_s64_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s64_untied, svint64_t, + z0 = svzip1q_s64 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s8.c new file mode 100644 index 000000000..24ea2399c --- 
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_s8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_s8_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s8_tied1, svint8_t, + z0 = svzip1q_s8 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_s8_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s8_tied2, svint8_t, + z0 = svzip1q_s8 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_s8_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_s8_untied, svint8_t, + z0 = svzip1q_s8 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u16.c new file mode 100644 index 000000000..65caf9706 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_u16_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u16_tied1, svuint16_t, + z0 = svzip1q_u16 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_u16_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u16_tied2, svuint16_t, + z0 = svzip1q_u16 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_u16_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u16_untied, svuint16_t, + z0 = svzip1q_u16 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u32.c new file mode 100644 index 000000000..abd76b74f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_u32_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u32_tied1, svuint32_t, + z0 = svzip1q_u32 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_u32_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u32_tied2, svuint32_t, + z0 = svzip1q_u32 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_u32_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u32_untied, svuint32_t, + z0 = svzip1q_u32 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u64.c new file mode 100644 index 000000000..0e91929b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_u64_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u64_tied1, svuint64_t, + z0 = svzip1q_u64 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_u64_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u64_tied2, 
svuint64_t, + z0 = svzip1q_u64 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_u64_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u64_untied, svuint64_t, + z0 = svzip1q_u64 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u8.c new file mode 100644 index 000000000..07d484b0b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip1q_u8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip1q_u8_tied1: +** zip1 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u8_tied1, svuint8_t, + z0 = svzip1q_u8 (z0, z1), + z0 = svzip1q (z0, z1)) + +/* +** zip1q_u8_tied2: +** zip1 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u8_tied2, svuint8_t, + z0 = svzip1q_u8 (z1, z0), + z0 = svzip1q (z1, z0)) + +/* +** zip1q_u8_untied: +** zip1 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip1q_u8_untied, svuint8_t, + z0 = svzip1q_u8 (z1, z2), + z0 = svzip1q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b16.c new file mode 100644 index 000000000..5624c9815 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_b16_tied1: +** zip2 p0\.h, p0\.h, p1\.h +** ret +*/ +TEST_UNIFORM_P (zip2_b16_tied1, + p0 = svzip2_b16 (p0, p1), + p0 = svzip2_b16 (p0, p1)) + +/* +** zip2_b16_tied2: +** zip2 p0\.h, p1\.h, p0\.h +** ret +*/ +TEST_UNIFORM_P (zip2_b16_tied2, + p0 = svzip2_b16 (p1, p0), + p0 = svzip2_b16 (p1, p0)) + +/* +** zip2_b16_untied: +** zip2 p0\.h, p1\.h, p2\.h +** ret +*/ +TEST_UNIFORM_P (zip2_b16_untied, + p0 = svzip2_b16 (p1, p2), + p0 = svzip2_b16 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b32.c new file mode 100644 index 000000000..b73d5b490 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_b32_tied1: +** zip2 p0\.s, p0\.s, p1\.s +** ret +*/ +TEST_UNIFORM_P (zip2_b32_tied1, + p0 = svzip2_b32 (p0, p1), + p0 = svzip2_b32 (p0, p1)) + +/* +** zip2_b32_tied2: +** zip2 p0\.s, p1\.s, p0\.s +** ret +*/ +TEST_UNIFORM_P (zip2_b32_tied2, + p0 = svzip2_b32 (p1, p0), + p0 = svzip2_b32 (p1, p0)) + +/* +** zip2_b32_untied: +** zip2 p0\.s, p1\.s, p2\.s +** ret +*/ +TEST_UNIFORM_P (zip2_b32_untied, + p0 = svzip2_b32 (p1, p2), + p0 = svzip2_b32 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b64.c new file mode 100644 index 000000000..9ebf050b8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_b64_tied1: +** zip2 p0\.d, p0\.d, p1\.d +** ret +*/ +TEST_UNIFORM_P (zip2_b64_tied1, + p0 = svzip2_b64 (p0, p1), + p0 = svzip2_b64 (p0, p1)) + +/* +** zip2_b64_tied2: +** zip2 p0\.d, p1\.d, p0\.d +** ret +*/ +TEST_UNIFORM_P (zip2_b64_tied2, + p0 = svzip2_b64 (p1, p0), + p0 = 
svzip2_b64 (p1, p0)) + +/* +** zip2_b64_untied: +** zip2 p0\.d, p1\.d, p2\.d +** ret +*/ +TEST_UNIFORM_P (zip2_b64_untied, + p0 = svzip2_b64 (p1, p2), + p0 = svzip2_b64 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b8.c new file mode 100644 index 000000000..223a22f99 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_b8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_b8_tied1: +** zip2 p0\.b, p0\.b, p1\.b +** ret +*/ +TEST_UNIFORM_P (zip2_b8_tied1, + p0 = svzip2_b8 (p0, p1), + p0 = svzip2_b8 (p0, p1)) + +/* +** zip2_b8_tied2: +** zip2 p0\.b, p1\.b, p0\.b +** ret +*/ +TEST_UNIFORM_P (zip2_b8_tied2, + p0 = svzip2_b8 (p1, p0), + p0 = svzip2_b8 (p1, p0)) + +/* +** zip2_b8_untied: +** zip2 p0\.b, p1\.b, p2\.b +** ret +*/ +TEST_UNIFORM_P (zip2_b8_untied, + p0 = svzip2_b8 (p1, p2), + p0 = svzip2_b8 (p1, p2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_bf16.c new file mode 100644 index 000000000..a9e0cfc93 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_bf16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_bf16_tied1: +** zip2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_bf16_tied1, svbfloat16_t, + z0 = svzip2_bf16 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_bf16_tied2: +** zip2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_bf16_tied2, svbfloat16_t, + z0 = svzip2_bf16 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_bf16_untied: +** zip2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_bf16_untied, svbfloat16_t, + z0 = svzip2_bf16 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f16.c new file mode 100644 index 000000000..73d4272bc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_f16_tied1: +** zip2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_f16_tied1, svfloat16_t, + z0 = svzip2_f16 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_f16_tied2: +** zip2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_f16_tied2, svfloat16_t, + z0 = svzip2_f16 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_f16_untied: +** zip2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_f16_untied, svfloat16_t, + z0 = svzip2_f16 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f32.c new file mode 100644 index 000000000..2ad8ff81d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_f32_tied1: +** zip2 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_f32_tied1, svfloat32_t, + z0 = svzip2_f32 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_f32_tied2: +** zip2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_f32_tied2, svfloat32_t, + z0 = svzip2_f32 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_f32_untied: +** zip2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_f32_untied, 
svfloat32_t, + z0 = svzip2_f32 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f64.c new file mode 100644 index 000000000..de5c2646f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_f64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_f64_tied1: +** zip2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_f64_tied1, svfloat64_t, + z0 = svzip2_f64 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_f64_tied2: +** zip2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_f64_tied2, svfloat64_t, + z0 = svzip2_f64 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_f64_untied: +** zip2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_f64_untied, svfloat64_t, + z0 = svzip2_f64 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s16.c new file mode 100644 index 000000000..fc366c991 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_s16_tied1: +** zip2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_s16_tied1, svint16_t, + z0 = svzip2_s16 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_s16_tied2: +** zip2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_s16_tied2, svint16_t, + z0 = svzip2_s16 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_s16_untied: +** zip2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_s16_untied, svint16_t, + z0 = svzip2_s16 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s32.c new file mode 100644 index 000000000..e56934d26 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_s32_tied1: +** zip2 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_s32_tied1, svint32_t, + z0 = svzip2_s32 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_s32_tied2: +** zip2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_s32_tied2, svint32_t, + z0 = svzip2_s32 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_s32_untied: +** zip2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_s32_untied, svint32_t, + z0 = svzip2_s32 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s64.c new file mode 100644 index 000000000..cefc73b72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_s64_tied1: +** zip2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_s64_tied1, svint64_t, + z0 = svzip2_s64 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_s64_tied2: +** zip2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_s64_tied2, svint64_t, + z0 = svzip2_s64 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_s64_untied: +** zip2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_s64_untied, svint64_t, + z0 = svzip2_s64 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s8.c new file mode 100644 index 000000000..452bbce26 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_s8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_s8_tied1: +** zip2 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (zip2_s8_tied1, svint8_t, + z0 = svzip2_s8 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_s8_tied2: +** zip2 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (zip2_s8_tied2, svint8_t, + z0 = svzip2_s8 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_s8_untied: +** zip2 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (zip2_s8_untied, svint8_t, + z0 = svzip2_s8 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u16.c new file mode 100644 index 000000000..9a20b4ed1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u16.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_u16_tied1: +** zip2 z0\.h, z0\.h, z1\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_u16_tied1, svuint16_t, + z0 = svzip2_u16 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_u16_tied2: +** zip2 z0\.h, z1\.h, z0\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_u16_tied2, svuint16_t, + z0 = svzip2_u16 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_u16_untied: +** zip2 z0\.h, z1\.h, z2\.h +** ret +*/ +TEST_UNIFORM_Z (zip2_u16_untied, svuint16_t, + z0 = svzip2_u16 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u32.c new file mode 100644 index 000000000..70626c66e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u32.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_u32_tied1: +** zip2 z0\.s, z0\.s, z1\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_u32_tied1, svuint32_t, + z0 = svzip2_u32 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_u32_tied2: +** zip2 z0\.s, z1\.s, z0\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_u32_tied2, svuint32_t, + z0 = svzip2_u32 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_u32_untied: +** zip2 z0\.s, z1\.s, z2\.s +** ret +*/ +TEST_UNIFORM_Z (zip2_u32_untied, svuint32_t, + z0 = svzip2_u32 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u64.c new file mode 100644 index 000000000..43a43ff7c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u64.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_u64_tied1: +** zip2 z0\.d, z0\.d, z1\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_u64_tied1, svuint64_t, + z0 = svzip2_u64 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_u64_tied2: +** zip2 z0\.d, z1\.d, z0\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_u64_tied2, svuint64_t, + z0 = svzip2_u64 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_u64_untied: +** zip2 z0\.d, z1\.d, z2\.d +** ret +*/ +TEST_UNIFORM_Z (zip2_u64_untied, svuint64_t, + z0 = svzip2_u64 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u8.c new 
file mode 100644 index 000000000..015f1844b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2_u8.c @@ -0,0 +1,30 @@ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2_u8_tied1: +** zip2 z0\.b, z0\.b, z1\.b +** ret +*/ +TEST_UNIFORM_Z (zip2_u8_tied1, svuint8_t, + z0 = svzip2_u8 (z0, z1), + z0 = svzip2 (z0, z1)) + +/* +** zip2_u8_tied2: +** zip2 z0\.b, z1\.b, z0\.b +** ret +*/ +TEST_UNIFORM_Z (zip2_u8_tied2, svuint8_t, + z0 = svzip2_u8 (z1, z0), + z0 = svzip2 (z1, z0)) + +/* +** zip2_u8_untied: +** zip2 z0\.b, z1\.b, z2\.b +** ret +*/ +TEST_UNIFORM_Z (zip2_u8_untied, svuint8_t, + z0 = svzip2_u8 (z1, z2), + z0 = svzip2 (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_bf16.c new file mode 100644 index 000000000..6d79136cf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_bf16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_bf16_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_bf16_tied1, svbfloat16_t, + z0 = svzip2q_bf16 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_bf16_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_bf16_tied2, svbfloat16_t, + z0 = svzip2q_bf16 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_bf16_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_bf16_untied, svbfloat16_t, + z0 = svzip2q_bf16 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f16.c new file mode 100644 index 000000000..984240e19 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_f16_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f16_tied1, svfloat16_t, + z0 = svzip2q_f16 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_f16_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f16_tied2, svfloat16_t, + z0 = svzip2q_f16 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_f16_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f16_untied, svfloat16_t, + z0 = svzip2q_f16 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f32.c new file mode 100644 index 000000000..0f8ccd804 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_f32_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f32_tied1, svfloat32_t, + z0 = svzip2q_f32 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_f32_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f32_tied2, svfloat32_t, + z0 = svzip2q_f32 (z1, z0), + z0 = svzip2q (z1, 
z0)) + +/* +** zip2q_f32_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f32_untied, svfloat32_t, + z0 = svzip2q_f32 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f64.c new file mode 100644 index 000000000..b5411cff7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_f64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_f64_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f64_tied1, svfloat64_t, + z0 = svzip2q_f64 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_f64_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f64_tied2, svfloat64_t, + z0 = svzip2q_f64 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_f64_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_f64_untied, svfloat64_t, + z0 = svzip2q_f64 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s16.c new file mode 100644 index 000000000..66751fc7f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_s16_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s16_tied1, svint16_t, + z0 = svzip2q_s16 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_s16_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s16_tied2, svint16_t, + z0 = svzip2q_s16 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_s16_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s16_untied, svint16_t, + z0 = svzip2q_s16 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s32.c new file mode 100644 index 000000000..830de3311 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_s32_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s32_tied1, svint32_t, + z0 = svzip2q_s32 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_s32_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s32_tied2, svint32_t, + z0 = svzip2q_s32 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_s32_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s32_untied, svint32_t, + z0 = svzip2q_s32 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s64.c new file mode 100644 index 000000000..917be4f40 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ 
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_s64_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s64_tied1, svint64_t, + z0 = svzip2q_s64 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_s64_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s64_tied2, svint64_t, + z0 = svzip2q_s64 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_s64_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s64_untied, svint64_t, + z0 = svzip2q_s64 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s8.c new file mode 100644 index 000000000..dff6e2d7b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_s8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_s8_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s8_tied1, svint8_t, + z0 = svzip2q_s8 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_s8_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s8_tied2, svint8_t, + z0 = svzip2q_s8 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_s8_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_s8_untied, svint8_t, + z0 = svzip2q_s8 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u16.c new file mode 100644 index 000000000..9e194425c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u16.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_u16_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u16_tied1, svuint16_t, + z0 = svzip2q_u16 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_u16_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u16_tied2, svuint16_t, + z0 = svzip2q_u16 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_u16_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u16_untied, svuint16_t, + z0 = svzip2q_u16 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u32.c new file mode 100644 index 000000000..89de27f6b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u32.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_u32_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u32_tied1, svuint32_t, + z0 = svzip2q_u32 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_u32_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u32_tied2, svuint32_t, + z0 = svzip2q_u32 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_u32_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u32_untied, svuint32_t, + z0 = svzip2q_u32 (z1, z2), + z0 = 
svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u64.c new file mode 100644 index 000000000..f2c9852ac --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u64.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_u64_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u64_tied1, svuint64_t, + z0 = svzip2q_u64 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_u64_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u64_tied2, svuint64_t, + z0 = svzip2q_u64 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_u64_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u64_untied, svuint64_t, + z0 = svzip2q_u64 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u8.c new file mode 100644 index 000000000..a12905586 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/zip2q_u8.c @@ -0,0 +1,32 @@ +/* { dg-require-effective-target aarch64_asm_f64mm_ok } */ +/* { dg-additional-options "-march=armv8.2-a+f64mm" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "test_sve_acle.h" + +/* +** zip2q_u8_tied1: +** zip2 z0\.q, z0\.q, z1\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u8_tied1, svuint8_t, + z0 = svzip2q_u8 (z0, z1), + z0 = svzip2q (z0, z1)) + +/* +** zip2q_u8_tied2: +** zip2 z0\.q, z1\.q, z0\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u8_tied2, svuint8_t, + z0 = svzip2q_u8 (z1, z0), + z0 = svzip2q (z1, z0)) + +/* +** zip2q_u8_untied: +** zip2 z0\.q, z1\.q, z2\.q +** ret +*/ +TEST_UNIFORM_Z (zip2q_u8_untied, svuint8_t, + z0 = svzip2q_u8 (z1, z2), + z0 = svzip2q (z1, z2)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_index_1.c new file mode 100644 index 000000000..714265ed1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_index_1.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +void +f1 (svbool_t pg, uint32_t *u32_ptr, svuint8_t u8, svuint16_t u16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64) +{ + svadrh_index (u32); /* { dg-error {too few arguments to function 'svadrh_index'} } */ + svadrh_index (u32, u32, u32); /* { dg-error {too many arguments to function 'svadrh_index'} } */ + svadrh_index (u32_ptr, s32); /* { dg-error {passing '[^']*\*'[^\n]* to argument 1 of 'svadrh_index', which expects an SVE vector type} } */ + svadrh_index (0, s32); /* { dg-error {passing 'int' to argument 1 of 'svadrh_index', which expects an SVE vector type} } */ + svadrh_index (u16, u16); /* { dg-error {passing 'svuint16_t' to argument 1 of 'svadrh_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + svadrh_index (s32, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svadrh_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + svadrh_index (f32, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svadrh_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + svadrh_index (pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svadrh_index', which expects 'svuint32_t' or 
'svuint64_t'} } */ + + svadrh_index (u32, 0); /* { dg-error {passing 'int' to argument 2 of 'svadrh_index', which expects an SVE vector type} } */ + svadrh_index (u32, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svadrh_index', which expects a vector of 32-bit or 64-bit integers} } */ + svadrh_index (u32, u16); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svadrh_index', which expects a vector of 32-bit or 64-bit integers} } */ + svadrh_index (u32, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ + + svadrh_index (u32, s32); + svadrh_index (u32, u32); + svadrh_index (u32, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ + svadrh_index (u32, s64); /* { dg-error {cannot combine a base of type 'svuint32_t' with an index of type 'svint64_t'} } */ + svadrh_index (u32, u64); /* { dg-error {cannot combine a base of type 'svuint32_t' with an index of type 'svuint64_t'} } */ + svadrh_index (u32, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ + + svadrh_index (u64, s32); /* { dg-error {cannot combine a base of type 'svuint64_t' with an index of type 'svint32_t'} } */ + svadrh_index (u64, u32); /* { dg-error {cannot combine a base of type 'svuint64_t' with an index of type 'svuint32_t'} } */ + svadrh_index (u64, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ + svadrh_index (u64, s64); + svadrh_index (u64, u64); + svadrh_index (u64, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svadrh_index', which expects a vector of integers} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_offset_1.c new file mode 100644 index 000000000..528d7ac51 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/adr_offset_1.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +void +f1 (svbool_t pg, uint32_t *u32_ptr, svuint8_t u8, svuint16_t u16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64) +{ + svadrb_offset (u32); /* { dg-error {too few arguments to function 'svadrb_offset'} } */ + svadrb_offset (u32, u32, u32); /* { dg-error {too many arguments to function 'svadrb_offset'} } */ + svadrb_offset (u32_ptr, s32); /* { dg-error {passing '[^']*\*'[^\n]* to argument 1 of 'svadrb_offset', which expects an SVE vector type} } */ + svadrb_offset (0, s32); /* { dg-error {passing 'int' to argument 1 of 'svadrb_offset', which expects an SVE vector type} } */ + svadrb_offset (u16, u16); /* { dg-error {passing 'svuint16_t' to argument 1 of 'svadrb_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + svadrb_offset (s32, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svadrb_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + svadrb_offset (f32, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svadrb_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + svadrb_offset (pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svadrb_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + + svadrb_offset (u32, 0); /* { dg-error {passing 'int' to argument 2 of 'svadrb_offset', which expects an SVE vector type} } */ + svadrb_offset (u32, u8); /* { dg-error {passing 'svuint8_t' to 
argument 2 of 'svadrb_offset', which expects a vector of 32-bit or 64-bit integers} } */ + svadrb_offset (u32, u16); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svadrb_offset', which expects a vector of 32-bit or 64-bit integers} } */ + svadrb_offset (u32, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ + + svadrb_offset (u32, s32); + svadrb_offset (u32, u32); + svadrb_offset (u32, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ + svadrb_offset (u32, s64); /* { dg-error {cannot combine a base of type 'svuint32_t' with an offset of type 'svint64_t'} } */ + svadrb_offset (u32, u64); /* { dg-error {cannot combine a base of type 'svuint32_t' with an offset of type 'svuint64_t'} } */ + svadrb_offset (u32, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ + + svadrb_offset (u64, s32); /* { dg-error {cannot combine a base of type 'svuint64_t' with an offset of type 'svint32_t'} } */ + svadrb_offset (u64, u32); /* { dg-error {cannot combine a base of type 'svuint64_t' with an offset of type 'svuint32_t'} } */ + svadrb_offset (u64, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ + svadrb_offset (u64, s64); + svadrb_offset (u64, u64); + svadrb_offset (u64, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svadrb_offset', which expects a vector of integers} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_1.c new file mode 100644 index 000000000..8ce89fa10 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_1.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ + +#include + +svuint8_t +f1 (svbool_t pg, svuint8_t u8, svint16_t s16) +{ + svzip1 (pg); /* { dg-error {too few arguments to function 'svzip1'} } */ + svzip1 (pg, u8, u8); /* { dg-error {too many arguments to function 'svzip1'} } */ + svzip1 (pg, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svzip1', but previous arguments had type 'svbool_t'} } */ + svzip1 (u8, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svzip1', but previous arguments had type 'svuint8_t'} } */ + svzip1 (u8, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svzip1', but previous arguments had type 'svuint8_t'} } */ + svzip1 (u8, 0); /* { dg-error {passing 'int' to argument 2 of 'svzip1', which expects an SVE vector type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_int_opt_n.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_int_opt_n.c new file mode 100644 index 000000000..965e9a13c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_int_opt_n.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svfloat16_t f16, svint16_t s16, svuint16_t u16, + svfloat32_t f32, svint32_t s32, svuint32_t u32) +{ + svscale_x (pg, f16); /* { dg-error {too few arguments to function 'svscale_x'} } */ + svscale_x (pg, f16, s16, s16); /* { dg-error {too many arguments to function 'svscale_x'} } */ + svscale_x (s32, f16, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svscale_x', which expects 'svbool_t'} } */ + svscale_x (1, f16, s32); /* { dg-error {passing 'int' to argument 1 of 'svscale_x', which expects 'svbool_t'} } */ + svscale_x (pg, pg, s16); 
/* { dg-error {'svscale_x' has no form that takes 'svbool_t' arguments} } */ + svscale_x (pg, 1, s16); /* { dg-error {passing 'int' to argument 2 of 'svscale_x', which expects an SVE vector type} } */ + svscale_x (pg, f16, s16); + svscale_x (pg, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ + svscale_x (pg, f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ + svscale_x (pg, f16, s32); /* { dg-error {arguments 2 and 3 of 'svscale_x' must have the same element size, but the values passed here have type 'svfloat16_t' and 'svint32_t' respectively} } */ + svscale_x (pg, f16, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ + svscale_x (pg, f16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ + svscale_x (pg, f16, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ + svscale_x (pg, f16, 0); + svscale_x (pg, s16, s16); /* { dg-error {'svscale_x' has no form that takes 'svint16_t' arguments} } */ + svscale_x (pg, s16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ + svscale_x (pg, s16, s32); /* { dg-error {'svscale_x' has no form that takes 'svint16_t' arguments} } */ + svscale_x (pg, s16, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svscale_x', which expects a vector of signed integers} } */ + svscale_x (pg, u16, s16); /* { dg-error {'svscale_x' has no form that takes 'svuint16_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_lane_1.c new file mode 100644 index 000000000..f1879ca6e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_lane_1.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svfloat16_t f16, svfloat32_t f32, svfloat64_t f64, + svint32_t s32, int i) +{ + svmul_lane (f32, f32); /* { dg-error {too few arguments to function 'svmul_lane'} } */ + svmul_lane (f32, f32, 0, 0); /* { dg-error {too many arguments to function 'svmul_lane'} } */ + svmul_lane (pg, pg, 0); /* { dg-error {'svmul_lane' has no form that takes 'svbool_t' arguments} } */ + svmul_lane (s32, s32, 0); /* { dg-error {'svmul_lane' has no form that takes 'svint32_t' arguments} } */ + svmul_lane (1, f32, 0); /* { dg-error {passing 'int' to argument 1 of 'svmul_lane', which expects an SVE vector type} } */ + svmul_lane (f32, 1, 0); /* { dg-error {passing 'int' to argument 2 of 'svmul_lane', which expects an SVE vector type} } */ + svmul_lane (f32, f64, 0); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svmul_lane', but previous arguments had type 'svfloat32_t'} } */ + svmul_lane (f32, f32, s32); /* { dg-error {argument 3 of 'svmul_lane' must be an integer constant expression} } */ + svmul_lane (f32, f32, i); /* { dg-error {argument 3 of 'svmul_lane' must be an integer constant expression} } */ + + svmul_lane (f16, f16, 0); + svmul_lane (f16, f16, 7); + svmul_lane (f16, f16, 8); /* { dg-error {passing 8 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 7\]} } */ + svmul_lane (f16, f16, -1); /* { dg-error {passing -1 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 
7\]} } */ + + svmul_lane (f32, f32, 0); + svmul_lane (f32, f32, 3); + svmul_lane (f32, f32, 4); /* { dg-error {passing 4 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 3\]} } */ + svmul_lane (f32, f32, -1); /* { dg-error {passing -1 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 3\]} } */ + + svmul_lane (f64, f64, 0); + svmul_lane (f64, f64, 1); + svmul_lane (f64, f64, 2); /* { dg-error {passing 2 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 1\]} } */ + svmul_lane (f64, f64, -1); /* { dg-error {passing -1 to argument 3 of 'svmul_lane', which expects a value in the range \[0, 1\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_n_1.c new file mode 100644 index 000000000..0c69e66a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_n_1.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svuint8_t u8, svfloat16_t f16, int i, float f) +{ + svinsr (u8); /* { dg-error {too few arguments to function 'svinsr'} } */ + svinsr (u8, 0, 0); /* { dg-error {too many arguments to function 'svinsr'} } */ + svinsr (0, 0); /* { dg-error {passing 'int' to argument 1 of 'svinsr', which expects an SVE vector type} } */ + svinsr (u8, 0); + svinsr (u8, -1); + svinsr (u8, i); + svinsr (u8, f); + svinsr (u8, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svinsr', which expects a scalar element} } */ + svinsr (pg, 0); /* { dg-error {'svinsr' has no form that takes 'svbool_t' arguments} } */ + svinsr (f16, f); + svinsr (f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svinsr', which expects a scalar element} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_1.c new file mode 100644 index 000000000..29615e5be --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ + +#include + +svuint8_t +f1 (svbool_t pg, svuint8_t u8, svint8_t s8) +{ + svadd_u8_x (pg, u8, s8); /* { dg-error {incompatible type for argument 3 of 'svadd_u8_x'} } */ + svadd_u8_x (pg, u8); /* { dg-error {too few arguments to function 'svadd_u8_x'} } */ + svadd_u8_x (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svadd_u8_x'} } */ + return svadd_s8_x (pg, s8, s8); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_2.c new file mode 100644 index 000000000..9fa83ca99 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_2.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svfloat16_t f16) +{ + svadd_x (pg, u8); /* { dg-error {too few arguments to function 'svadd_x'} } */ + svadd_x (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svadd_x'} } */ + svadd_x (u8, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svadd_x', which expects 'svbool_t'} } */ + svadd_x (pg, pg, pg); /* { dg-error {'svadd_x' has no form that takes 'svbool_t' arguments} } */ + svadd_x (pg, 1, u8); /* { dg-error {passing 'int' to argument 2 of 'svadd_x', which expects an SVE vector type} } */ + 
svadd_x (pg, u8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ + svadd_x (pg, u8, u8); + svadd_x (pg, u8, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ + svadd_x (pg, u8, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ + svadd_x (pg, u8, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ + svadd_x (pg, u8, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svadd_x', but previous arguments had type 'svuint8_t'} } */ + svadd_x (pg, u8, 0); + + svadd_x (pg, f16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svfloat16_t'} } */ + svadd_x (pg, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svadd_x', but previous arguments had type 'svfloat16_t'} } */ + svadd_x (pg, f16, f16); + svadd_x (pg, f16, 1); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_3.c new file mode 100644 index 000000000..4d0b253e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_opt_n_3.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svfloat16_t f16) +{ + svand_z (pg, u8); /* { dg-error {too few arguments to function 'svand_z'} } */ + svand_z (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svand_z'} } */ + svand_z (u8, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svand_z', which expects 'svbool_t'} } */ + svand_z (pg, pg, pg); + svand_z (pg, 1, u8); /* { dg-error {passing 'int' to argument 2 of 'svand_z', which expects an SVE vector type} } */ + svand_z (pg, u8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ + svand_z (pg, u8, u8); + svand_z (pg, u8, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ + svand_z (pg, u8, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ + svand_z (pg, u8, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ + svand_z (pg, u8, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svand_z', but previous arguments had type 'svuint8_t'} } */ + svand_z (pg, u8, 0); + + svand_z (pg, pg, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svand_z', but previous arguments had type 'svbool_t'} } */ + svand_z (pg, pg, 0); /* { dg-error {passing 'int' to argument 3 of 'svand_z', but its 'svbool_t' form does not accept scalars} } */ + + svand_z (pg, f16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svand_z', but previous arguments had type 'svfloat16_t'} } */ + svand_z (pg, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svand_z', but previous arguments had type 'svfloat16_t'} } */ + svand_z (pg, f16, f16); /* { dg-error {'svand_z' has no form that takes 'svfloat16_t' arguments} } */ + svand_z (pg, f16, 1); /* { dg-error {'svand_z' has no form that takes 'svfloat16_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_rotate_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_rotate_1.c new file mode 100644 index 000000000..8ffe91bce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_rotate_1.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svfloat32_t f32, svfloat64_t f64, svint32_t s32, int i) +{ + svcadd_x (pg, f32, f32); /* { dg-error {too few arguments to function 'svcadd_x'} } */ + svcadd_x (pg, f32, f32, 90, 90); /* { dg-error {too many arguments to function 'svcadd_x'} } */ + svcadd_x (f32, f32, f32, 90); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svcadd_x', which expects 'svbool_t'} } */ + svcadd_x (pg, pg, pg, 90); /* { dg-error {'svcadd_x' has no form that takes 'svbool_t' arguments} } */ + svcadd_x (pg, s32, s32, 90); /* { dg-error {'svcadd_x' has no form that takes 'svint32_t' arguments} } */ + svcadd_x (pg, 1, f32, 90); /* { dg-error {passing 'int' to argument 2 of 'svcadd_x', which expects an SVE vector type} } */ + svcadd_x (pg, f32, 1, 90); /* { dg-error {passing 'int' to argument 3 of 'svcadd_x', which expects an SVE vector type} } */ + svcadd_x (pg, f32, f64, 90); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svcadd_x', but previous arguments had type 'svfloat32_t'} } */ + svcadd_x (pg, f32, f32, s32); /* { dg-error {argument 4 of 'svcadd_x' must be an integer constant expression} } */ + svcadd_x (pg, f32, f32, i); /* { dg-error {argument 4 of 'svcadd_x' must be an integer constant expression} } */ + svcadd_x (pg, f32, f32, -90); /* { dg-error {passing -90 to argument 4 of 'svcadd_x', which expects either 90 or 270} } */ + svcadd_x (pg, f32, f32, 0); /* { dg-error {passing 0 to argument 4 of 'svcadd_x', which expects either 90 or 270} } */ + svcadd_x (pg, f32, f32, 1); /* { dg-error {passing 1 to argument 4 of 'svcadd_x', which expects either 90 or 270} } */ + svcadd_x (pg, f32, f32, 90); + svcadd_x (pg, f32, f32, 180); /* { dg-error {passing 180 to argument 4 of 'svcadd_x', which expects either 90 or 270} } */ + svcadd_x (pg, f32, f32, 270); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_n_1.c new file mode 100644 index 000000000..c8ca5f746 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_n_1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svuint8_t u8, int i, float f) +{ + svdupq_lane (u8); /* { dg-error {too few arguments to function 'svdupq_lane'} } */ + svdupq_lane (u8, 0, 0); /* { dg-error {too many arguments to function 'svdupq_lane'} } */ + svdupq_lane (0, 0); /* { dg-error {passing 'int' to argument 1 of 'svdupq_lane', which expects an SVE vector type} } */ + svdupq_lane (u8, 0); + svdupq_lane (u8, -1); + svdupq_lane (u8, i); + svdupq_lane (u8, f); + svdupq_lane (u8, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svdupq_lane', which expects 'uint64_t'} } */ + svdupq_lane (pg, 0); /* { dg-error {'svdupq_lane' has no form that takes 'svbool_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_1.c new file mode 100644 index 000000000..27726a80f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ + +#include + +svuint8_t +f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svuint64_t u64) +{ + 
svlsl_wide_u8_x (pg, u8, u8); /* { dg-error {incompatible type for argument 3 of 'svlsl_wide_u8_x'} } */ + svlsl_wide_u8_x (pg, u8); /* { dg-error {too few arguments to function 'svlsl_wide_u8_x'} } */ + svlsl_wide_u8_x (pg, u8, u64, u8); /* { dg-error {too many arguments to function 'svlsl_wide_u8_x'} } */ + return svlsl_wide_s8_x (pg, s8, u64); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_2.c new file mode 100644 index 000000000..be217394f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint64_opt_n_2.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svuint8_t u8, svuint64_t u64) +{ + svlsl_wide_x (pg, u8); /* { dg-error {too few arguments to function 'svlsl_wide_x'} } */ + svlsl_wide_x (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svlsl_wide_x'} } */ + svlsl_wide_x (u8, u8, u64); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svlsl_wide_x', which expects 'svbool_t'} } */ + svlsl_wide_x (pg, 1, u64); /* { dg-error {passing 'int' to argument 2 of 'svlsl_wide_x', which expects an SVE vector type} } */ + svlsl_wide_x (pg, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svlsl_wide_x', which expects 'svuint64_t'} } */ + svlsl_wide_x (pg, u64, u64); /* { dg-error {'svlsl_wide_x' has no form that takes 'svuint64_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_1.c new file mode 100644 index 000000000..8f86c50b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_1.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svuint16_t u16, svint16_t s16, + svfloat16_t f16) +{ + svtbl (u8); /* { dg-error {too few arguments to function 'svtbl'} } */ + svtbl (u8, u8, u8); /* { dg-error {too many arguments to function 'svtbl'} } */ + svtbl (pg, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (pg, u8); /* { dg-error {'svtbl' has no form that takes 'svbool_t' arguments} } */ + + svtbl (u8, 0); /* { dg-error {passing 'int' to argument 2 of 'svtbl', which expects an SVE vector type} } */ + svtbl (u8, u8); + svtbl (u8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (u8, u16); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svuint8_t' and 'svuint16_t' respectively} } */ + svtbl (u8, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (u8, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + + svtbl (s8, u8); + svtbl (s8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (s8, u16); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svint8_t' and 'svuint16_t' respectively} } */ + svtbl (s8, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + 
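/* Illustrative sketch, not part of the original patch: the svtbl checks here
   require the index argument to be a vector of unsigned integers with the
   same element size as the data vector.  Assuming <arm_sve.h> and an
   SVE-enabled target (permute_s16 is just an illustrative name), a valid
   overloaded call is:  */
#include <arm_sve.h>

svint16_t
permute_s16 (svint16_t data, svuint16_t indices)
{
  /* Result element i is data[indices[i]]; out-of-range indices yield 0.  */
  return svtbl (data, indices);
}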
svtbl (s8, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + + svtbl (u16, u8); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svuint16_t' and 'svuint8_t' respectively} } */ + svtbl (u16, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (u16, u16); + svtbl (u16, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (u16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + + svtbl (s16, u8); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svint16_t' and 'svuint8_t' respectively} } */ + svtbl (s16, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (s16, u16); + svtbl (s16, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (s16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + + svtbl (f16, u8); /* { dg-error {arguments 1 and 2 of 'svtbl' must have the same element size, but the values passed here have type 'svfloat16_t' and 'svuint8_t' respectively} } */ + svtbl (f16, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (f16, u16); + svtbl (f16, s16); /* { dg-error {passing 'svint16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ + svtbl (f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svtbl', which expects a vector of unsigned integers} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_n_1.c new file mode 100644 index 000000000..36a902e69 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_n_1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svuint8_t u8, int i, float f) +{ + svdup_lane (u8); /* { dg-error {too few arguments to function 'svdup_lane'} } */ + svdup_lane (u8, 0, 0); /* { dg-error {too many arguments to function 'svdup_lane'} } */ + svdup_lane (0, 0); /* { dg-error {passing 'int' to argument 1 of 'svdup_lane', which expects an SVE vector type} } */ + svdup_lane (u8, 0); + svdup_lane (u8, -1); + svdup_lane (u8, i); + svdup_lane (u8, f); + svdup_lane (u8, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svdup_lane', which expects a scalar integer} } */ + svdup_lane (pg, 0); /* { dg-error {'svdup_lane' has no form that takes 'svbool_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_opt_n_1.c new file mode 100644 index 000000000..b162ab405 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/binary_uint_opt_n_1.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svfloat16_t f16, svint16_t s16, svuint16_t u16, + svfloat32_t f32, svint32_t s32, svuint32_t u32) +{ + svlsl_x (pg, s16); /* { dg-error {too few arguments to function 'svlsl_x'} } */ + svlsl_x (pg, s16, u16, u16); /* { dg-error {too 
many arguments to function 'svlsl_x'} } */ + svlsl_x (s32, s32, u32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svlsl_x', which expects 'svbool_t'} } */ + svlsl_x (1, s32, u32); /* { dg-error {passing 'int' to argument 1 of 'svlsl_x', which expects 'svbool_t'} } */ + svlsl_x (pg, pg, u16); /* { dg-error {'svlsl_x' has no form that takes 'svbool_t' arguments} } */ + svlsl_x (pg, 1, s16); /* { dg-error {passing 'int' to argument 2 of 'svlsl_x', which expects an SVE vector type} } */ + svlsl_x (pg, s16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ + svlsl_x (pg, s16, u16); + svlsl_x (pg, s16, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ + svlsl_x (pg, s16, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ + svlsl_x (pg, s16, u32); /* { dg-error {arguments 2 and 3 of 'svlsl_x' must have the same element size, but the values passed here have type 'svint16_t' and 'svuint32_t' respectively} } */ + svlsl_x (pg, s16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ + svlsl_x (pg, s16, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ + svlsl_x (pg, s16, 0); + svlsl_x (pg, f16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ + svlsl_x (pg, f16, u16); /* { dg-error {'svlsl_x' has no form that takes 'svfloat16_t' arguments} } */ + svlsl_x (pg, f16, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svlsl_x', which expects a vector of unsigned integers} } */ + svlsl_x (pg, f16, u32); /* { dg-error {'svlsl_x' has no form that takes 'svfloat16_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/clast_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/clast_1.c new file mode 100644 index 000000000..cb9ac946c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/clast_1.c @@ -0,0 +1,15 @@ +#include + +void +test (svbool_t pg, svint32_t s32, svint64_t s64, int i) +{ + svclasta (pg, 1); /* { dg-error {too few arguments to function 'svclasta'} } */ + svclasta (pg, 1, s32, 1); /* { dg-error {too many arguments to function 'svclasta'} } */ + svclasta (1, 1, s32); /* { dg-error {passing 'int' to argument 1 of 'svclasta', which expects 'svbool_t'} } */ + svclasta (pg, 1, 1); /* { dg-error {passing 'int' to argument 3 of 'svclasta', which expects an SVE vector type} } */ + svclasta (pg, 1, pg); /* { dg-error {'svclasta' has no form that takes 'svbool_t' arguments} } */ + svclasta (pg, i, s32); + svclasta (pg, s32, 1); /* { dg-error {passing 'int' to argument 3 of 'svclasta', which expects an SVE vector type} } */ + svclasta (pg, s32, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svclasta', but previous arguments had type 'svint32_t'} } */ + svclasta (pg, pg, pg); /* { dg-error {'svclasta' has no form that takes 'svbool_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_opt_n_1.c new file mode 100644 index 000000000..71c8e86d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_opt_n_1.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t 
pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svfloat16_t f16) +{ + svcmpeq (pg, u8); /* { dg-error {too few arguments to function 'svcmpeq'} } */ + svcmpeq (pg, u8, u8, u8); /* { dg-error {too many arguments to function 'svcmpeq'} } */ + svcmpeq (u8, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svcmpeq', which expects 'svbool_t'} } */ + svcmpeq (pg, pg, pg); /* { dg-error {'svcmpeq' has no form that takes 'svbool_t' arguments} } */ + svcmpeq (pg, 1, u8); /* { dg-error {passing 'int' to argument 2 of 'svcmpeq', which expects an SVE vector type} } */ + svcmpeq (pg, u8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ + svcmpeq (pg, u8, u8); + svcmpeq (pg, u8, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ + svcmpeq (pg, u8, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ + svcmpeq (pg, u8, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ + svcmpeq (pg, u8, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svuint8_t'} } */ + svcmpeq (pg, u8, 0); + + svcmpeq (pg, f16, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svfloat16_t'} } */ + svcmpeq (pg, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svcmpeq', but previous arguments had type 'svfloat16_t'} } */ + svcmpeq (pg, f16, f16); + svcmpeq (pg, f16, 1); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_scalar_1.c new file mode 100644 index 000000000..d5a60f841 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_scalar_1.c @@ -0,0 +1,85 @@ +/* { dg-do compile } */ + +#include +#include + +enum signed_enum { SA = -1, SB }; +enum unsigned_enum { UA, UB }; + +void +test (int8_t s8, int16_t s16, int32_t s32, int64_t s64, + uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64, + bool b, enum signed_enum se, enum unsigned_enum ue, + int *ptr, float f32, svbool_t pg, svint32_t vec) +{ + svwhilele_b8 (s32); /* { dg-error {too few arguments to function 'svwhilele_b8'} } */ + svwhilele_b8 (s32, s32, s32); /* { dg-error {too many arguments to function 'svwhilele_b8'} } */ + + svwhilele_b8 (b, b); + svwhilele_b8 (se, se); + svwhilele_b8 (ue, ue); + svwhilele_b8 (s8, s8); + svwhilele_b8 (u8, u8); + svwhilele_b8 (s16, s16); + svwhilele_b8 (u16, u16); + svwhilele_b8 (ptr, ptr); /* { dg-error {passing 'int \*' to argument 1 of 'svwhilele_b8', which expects a 32-bit or 64-bit integer type} } */ + svwhilele_b8 (f32, f32); /* { dg-error {passing 'float' to argument 1 of 'svwhilele_b8', which expects a 32-bit or 64-bit integer type} } */ + svwhilele_b8 (pg, pg); /* { dg-error {passing 'svbool_t' to argument 1 of 'svwhilele_b8', which expects a 32-bit or 64-bit integer type} } */ + svwhilele_b8 (vec, vec); /* { dg-error {passing 'svint32_t' to argument 1 of 'svwhilele_b8', which expects a 32-bit or 64-bit integer type} } */ + + svwhilele_b8 (s32, b); + svwhilele_b8 (s32, se); + svwhilele_b8 (s32, ue); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint32_t'} } */ + svwhilele_b8 (s32, s8); + svwhilele_b8 (s32, u8); + svwhilele_b8 (s32, s16); + 
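/* Illustrative sketch, not part of the original patch: the overloaded
   svwhilele_b8 resolves to its _s32, _u32, _s64 or _u64 form from the two
   scalar arguments, so both must promote to the same 32-bit or 64-bit
   integer type; mixing signedness or width is rejected as ambiguous in the
   tests here.  Assuming <arm_sve.h> (first_n_active is just an illustrative
   name):  */
#include <arm_sve.h>
#include <stdint.h>

svbool_t
first_n_active (int32_t i, int32_t n)
{
  /* Byte element k of the result is active while i + k <= n, up to the
     vector length; both arguments are int32_t, so this resolves to
     svwhilele_b8_s32.  */
  return svwhilele_b8 (i, n);
}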
svwhilele_b8 (s32, u16); + + svwhilele_b8 (u32, b); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (u32, se); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (u32, ue); + svwhilele_b8 (u32, s8); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (u32, u8); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (u32, s16); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (u32, u16); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + + svwhilele_b8 (s32, s32); + svwhilele_b8 (s32, u32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint32_t'} } */ + svwhilele_b8 (s32, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'int64_t'} } */ + svwhilele_b8 (s32, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint64_t'} } */ + + svwhilele_b8 (u32, s32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (u32, u32); + svwhilele_b8 (u32, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int64_t'} } */ + svwhilele_b8 (u32, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'uint64_t'} } */ + + svwhilele_b8 (s64, s32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (s64, u32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'uint32_t'} } */ + svwhilele_b8 (s64, s64); + svwhilele_b8 (s64, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'uint64_t'} } */ + + svwhilele_b8 (u64, s32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (u64, u32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'uint32_t'} } */ + svwhilele_b8 (u64, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'int64_t'} } */ + svwhilele_b8 (u64, u64); + + svwhilele_b8 (0, s32); + svwhilele_b8 (0, u32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint32_t'} } */ + svwhilele_b8 (0, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'int64_t'} } */ + svwhilele_b8 (0, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint64_t'} } */ + + svwhilele_b8 (s32, 0); + svwhilele_b8 (u32, 0); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (s64, 0); /* { dg-error {call 
to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (u64, 0); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'int32_t'} } */ + + svwhilele_b8 (0U, s32); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int32_t'} } */ + svwhilele_b8 (0U, u32); + svwhilele_b8 (0U, s64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'int64_t'} } */ + svwhilele_b8 (0U, u64); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint32_t' but argument 2 has type 'uint64_t'} } */ + + svwhilele_b8 (s32, 0U); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int32_t' but argument 2 has type 'uint32_t'} } */ + svwhilele_b8 (u32, 0U); + svwhilele_b8 (s64, 0U); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'int64_t' but argument 2 has type 'uint32_t'} } */ + svwhilele_b8 (u64, 0U); /* { dg-error {call to 'svwhilele_b8' is ambiguous; argument 1 has type 'uint64_t' but argument 2 has type 'uint32_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_wide_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_wide_opt_n_1.c new file mode 100644 index 000000000..fc5e45663 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/compare_wide_opt_n_1.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +#include + +svuint8_t +f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svint64_t s64, svuint64_t u64, + svfloat32_t f32, svfloat64_t f64, unsigned int x) +{ + svcmpeq_wide (pg, s8); /* { dg-error {too few arguments to function 'svcmpeq_wide'} } */ + svcmpeq_wide (pg, s8, s64, s8); /* { dg-error {too many arguments to function 'svcmpeq_wide'} } */ + svcmpeq_wide (s8, s8, s64); /* { dg-error {passing 'svint8_t' to argument 1 of 'svcmpeq_wide', which expects 'svbool_t'} } */ + svcmpeq_wide (pg, 0, s64); /* { dg-error {passing 'int' to argument 2 of 'svcmpeq_wide', which expects an SVE vector type} } */ + svcmpeq_wide (pg, s8, 0); + svcmpeq_wide (pg, s8, x); + svcmpeq_wide (pg, s8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ + svcmpeq_wide (pg, s8, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ + svcmpeq_wide (pg, s8, s64); + svcmpeq_wide (pg, s8, u64); /* { dg-error {arguments 2 and 3 of 'svcmpeq_wide' must have the same signedness, but the values passed here have type 'svint8_t' and 'svuint64_t' respectively} } */ + svcmpeq_wide (pg, u8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ + svcmpeq_wide (pg, u8, u64); /* { dg-error {'svcmpeq_wide' has no form that takes 'svuint8_t' arguments} } */ + svcmpeq_wide (pg, s64, s64); /* { dg-error {'svcmpeq_wide' has no form that takes 'svint64_t' arguments} } */ + svcmpeq_wide (pg, f32, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ + svcmpeq_wide (pg, f32, f64); /* { dg-error {'svcmpeq_wide' has no form that takes 'svfloat32_t' arguments} } */ + svcmpeq_wide (pg, f64, f64); /* { dg-error {'svcmpeq_wide' has no form that takes 'svfloat64_t' arguments} } */ + svcmpeq_wide (pg, pg, pg); /* { dg-error {passing 'svbool_t' to argument 3 
of 'svcmpeq_wide', which expects a vector of 64-bit elements} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_pat_1.c new file mode 100644 index 000000000..8dd76a553 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_pat_1.c @@ -0,0 +1,42 @@ +#include + +void +test (enum svpattern pat, int i) +{ + svcntb_pat (pat); /* { dg-error {argument 1 of 'svcntb_pat' must be an integer constant expression} } */ + svcntb_pat (i); /* { dg-error {argument 1 of 'svcntb_pat' must be an integer constant expression} } */ + svcntb_pat ((enum svpattern) -1); /* { dg-error {passing 4294967295 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 0); + svcntb_pat ((enum svpattern) 1); + svcntb_pat ((enum svpattern) 2); + svcntb_pat ((enum svpattern) 3); + svcntb_pat ((enum svpattern) 4); + svcntb_pat ((enum svpattern) 5); + svcntb_pat ((enum svpattern) 6); + svcntb_pat ((enum svpattern) 7); + svcntb_pat ((enum svpattern) 8); + svcntb_pat ((enum svpattern) 9); + svcntb_pat ((enum svpattern) 10); + svcntb_pat ((enum svpattern) 11); + svcntb_pat ((enum svpattern) 12); + svcntb_pat ((enum svpattern) 13); + svcntb_pat ((enum svpattern) 14); /* { dg-error {passing 14 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 15); /* { dg-error {passing 15 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 16); /* { dg-error {passing 16 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 17); /* { dg-error {passing 17 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 18); /* { dg-error {passing 18 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 19); /* { dg-error {passing 19 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 20); /* { dg-error {passing 20 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 21); /* { dg-error {passing 21 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 22); /* { dg-error {passing 22 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 23); /* { dg-error {passing 23 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 24); /* { dg-error {passing 24 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 25); /* { dg-error {passing 25 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 26); /* { dg-error {passing 26 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 27); /* { dg-error {passing 27 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 28); /* { dg-error {passing 28 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ + svcntb_pat ((enum svpattern) 29); + svcntb_pat ((enum svpattern) 30); + svcntb_pat ((enum svpattern) 31); + 
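/* Illustrative sketch, not part of the original patch: the casts in
   count_pat_1.c probe which raw integer values are accepted as a valid
   'enum svpattern'; ordinary code would use the named enumerators instead.
   Assuming <arm_sve.h> (bytes_per_vector is just an illustrative name):  */
#include <arm_sve.h>
#include <stdint.h>

uint64_t
bytes_per_vector (void)
{
  /* svcntb_pat counts the 8-bit elements selected by the pattern;
     SV_ALL selects every element, i.e. the vector length in bytes.  */
  return svcntb_pat (SV_ALL);
}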
svcntb_pat ((enum svpattern) 32); /* { dg-error {passing 32 to argument 1 of 'svcntb_pat', which expects a valid 'enum svpattern' value} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_vector_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_vector_1.c new file mode 100644 index 000000000..daf9e0d5b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/count_vector_1.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svuint32_t u32, svuint32x2_t u32x2) +{ + svlen (); /* { dg-error {too few arguments to function 'svlen'} } */ + svlen (u32, u32); /* { dg-error {too many arguments to function 'svlen'} } */ + svlen (0); /* { dg-error {passing 'int' to argument 1 of 'svlen', which expects an SVE vector type} } */ + svlen (pg); /* { dg-error {'svlen' has no form that takes 'svbool_t' arguments} } */ + svlen (u32x2); /* { dg-error {passing 'svuint32x2_t' to argument 1 of 'svlen', which expects a single SVE vector rather than a tuple} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_1.c new file mode 100644 index 000000000..31321a046 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_1.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +void +f1 (svuint8x2_t *ptr, svbool_t pg, svuint8_t u8, svfloat64_t f64, + svuint8x2_t u8x2, int x) +{ + *ptr = svcreate2 (u8); /* { dg-error {too few arguments to function 'svcreate2'} } */ + *ptr = svcreate2 (u8, u8, u8); /* { dg-error {too many arguments to function 'svcreate2'} } */ + *ptr = svcreate2 (u8x2, u8x2); /* { dg-error {passing 'svuint8x2_t' to argument 1 of 'svcreate2', which expects a single SVE vector rather than a tuple} } */ + *ptr = svcreate2 (u8, f64); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svcreate2', but previous arguments had type 'svuint8_t'} } */ + *ptr = svcreate2 (u8, pg); /* { dg-error {passing 'svbool_t' to argument 2 of 'svcreate2', but previous arguments had type 'svuint8_t'} } */ + *ptr = svcreate2 (u8, x); /* { dg-error {passing 'int' to argument 2 of 'svcreate2', which expects an SVE vector type} } */ + *ptr = svcreate2 (x, u8); /* { dg-error {passing 'int' to argument 1 of 'svcreate2', which expects an SVE vector type} } */ + *ptr = svcreate2 (pg, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svcreate2', but previous arguments had type 'svbool_t'} } */ + *ptr = svcreate2 (pg, pg); /* { dg-error {'svcreate2' has no form that takes 'svbool_t' arguments} } */ + *ptr = svcreate2 (u8, u8); + *ptr = svcreate2 (f64, f64); /* { dg-error {incompatible types when assigning to type 'svuint8x2_t' from type 'svfloat64x2_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_2.c new file mode 100644 index 000000000..28ad16c2d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_2.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +void +f1 (svuint8x2_t *ptr, svbool_t pg, svuint8_t u8, svfloat64_t f64, + svuint8x2_t u8x2, int x) +{ + *ptr = svcreate2_u8 (u8); /* { dg-error {too few arguments to function 'svcreate2_u8'} } */ + *ptr = svcreate2_u8 (u8, u8, u8); /* { dg-error {too many arguments to function 'svcreate2_u8'} } */ + *ptr = svcreate2_u8 (u8x2, 
u8x2); /* { dg-error {incompatible type for argument 1 of 'svcreate2_u8'} } */ + /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} "" { target *-*-* } .-1 } */ + *ptr = svcreate2_u8 (u8, f64); /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} } */ + *ptr = svcreate2_u8 (u8, pg); /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} } */ + *ptr = svcreate2_u8 (u8, x); /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} } */ + *ptr = svcreate2_u8 (x, u8); /* { dg-error {incompatible type for argument 1 of 'svcreate2_u8'} } */ + *ptr = svcreate2_u8 (pg, u8); /* { dg-error {incompatible type for argument 1 of 'svcreate2_u8'} } */ + *ptr = svcreate2_u8 (pg, pg); /* { dg-error {incompatible type for argument 1 of 'svcreate2_u8'} } */ + /* { dg-error {incompatible type for argument 2 of 'svcreate2_u8'} "" { target *-*-* } .-1 } */ + *ptr = svcreate2_u8 (u8, u8); + *ptr = svcreate2_f64 (f64, f64); /* { dg-error {incompatible types when assigning to type 'svuint8x2_t' from type 'svfloat64x2_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_3.c new file mode 100644 index 000000000..a88e56b31 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_3.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +void +f1 (svfloat16x3_t *ptr, svbool_t pg, svfloat16_t f16, svfloat64_t f64, + svfloat16x3_t f16x3, int x) +{ + *ptr = svcreate3 (f16); /* { dg-error {too few arguments to function 'svcreate3'} } */ + *ptr = svcreate3 (f16, f16); /* { dg-error {too few arguments to function 'svcreate3'} } */ + *ptr = svcreate3 (f16, f16, f16, f16); /* { dg-error {too many arguments to function 'svcreate3'} } */ + *ptr = svcreate3 (f16x3, f16x3, f16x3); /* { dg-error {passing 'svfloat16x3_t' to argument 1 of 'svcreate3', which expects a single SVE vector rather than a tuple} } */ + *ptr = svcreate3 (f16, f16, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svcreate3', but previous arguments had type 'svfloat16_t'} } */ + *ptr = svcreate3 (f16, pg, f16); /* { dg-error {passing 'svbool_t' to argument 2 of 'svcreate3', but previous arguments had type 'svfloat16_t'} } */ + *ptr = svcreate3 (f16, x, f16); /* { dg-error {passing 'int' to argument 2 of 'svcreate3', which expects an SVE vector type} } */ + *ptr = svcreate3 (x, f16, f16); /* { dg-error {passing 'int' to argument 1 of 'svcreate3', which expects an SVE vector type} } */ + *ptr = svcreate3 (pg, f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svcreate3', but previous arguments had type 'svbool_t'} } */ + *ptr = svcreate3 (pg, pg, pg); /* { dg-error {'svcreate3' has no form that takes 'svbool_t' arguments} } */ + *ptr = svcreate3 (f16, f16, f16); + *ptr = svcreate3 (f64, f64, f64); /* { dg-error {incompatible types when assigning to type 'svfloat16x3_t' from type 'svfloat64x3_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_4.c new file mode 100644 index 000000000..c111e9f29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_4.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +void +f1 (svfloat16x3_t *ptr, svbool_t pg, svfloat16_t f16, svfloat64_t f64, + svfloat16x3_t f16x3, int x) +{ + *ptr = 
svcreate3_f16 (f16); /* { dg-error {too few arguments to function 'svcreate3_f16'} } */ + *ptr = svcreate3_f16 (f16, f16); /* { dg-error {too few arguments to function 'svcreate3_f16'} } */ + *ptr = svcreate3_f16 (f16, f16, f16, f16); /* { dg-error {too many arguments to function 'svcreate3_f16'} } */ + *ptr = svcreate3_f16 (f16x3, f16x3, f16x3); /* { dg-error {incompatible type for argument 1 of 'svcreate3_f16'} } */ + /* { dg-error {incompatible type for argument 2 of 'svcreate3_f16'} "" { target *-*-* } .-1 } */ + /* { dg-error {incompatible type for argument 3 of 'svcreate3_f16'} "" { target *-*-* } .-2 } */ + *ptr = svcreate3_f16 (f16, f16, f64); /* { dg-error {incompatible type for argument 3 of 'svcreate3_f16'} } */ + *ptr = svcreate3_f16 (f16, pg, f16); /* { dg-error {incompatible type for argument 2 of 'svcreate3_f16'} } */ + *ptr = svcreate3_f16 (f16, x, f16); /* { dg-error {incompatible type for argument 2 of 'svcreate3_f16'} } */ + *ptr = svcreate3_f16 (x, f16, f16); /* { dg-error {incompatible type for argument 1 of 'svcreate3_f16'} } */ + *ptr = svcreate3_f16 (pg, f16, f16); /* { dg-error {incompatible type for argument 1 of 'svcreate3_f16'} } */ + *ptr = svcreate3_f16 (pg, pg, pg); /* { dg-error {incompatible type for argument 1 of 'svcreate3_f16'} } */ + /* { dg-error {incompatible type for argument 2 of 'svcreate3_f16'} "" { target *-*-* } .-1 } */ + /* { dg-error {incompatible type for argument 3 of 'svcreate3_f16'} "" { target *-*-* } .-2 } */ + *ptr = svcreate3_f16 (f16, f16, f16); + *ptr = svcreate3_f64 (f64, f64, f64); /* { dg-error {incompatible types when assigning to type 'svfloat16x3_t' from type 'svfloat64x3_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_5.c new file mode 100644 index 000000000..fed124506 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_5.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +void +f1 (svint32x4_t *ptr, svbool_t pg, svint32_t s32, svfloat64_t f64, + svint32x4_t s32x4, int x) +{ + *ptr = svcreate4 (s32); /* { dg-error {too few arguments to function 'svcreate4'} } */ + *ptr = svcreate4 (s32, s32); /* { dg-error {too few arguments to function 'svcreate4'} } */ + *ptr = svcreate4 (s32, s32, s32); /* { dg-error {too few arguments to function 'svcreate4'} } */ + *ptr = svcreate4 (s32, s32, s32, s32, s32); /* { dg-error {too many arguments to function 'svcreate4'} } */ + *ptr = svcreate4 (s32x4, s32x4, s32x4, s32x4); /* { dg-error {passing 'svint32x4_t' to argument 1 of 'svcreate4', which expects a single SVE vector rather than a tuple} } */ + *ptr = svcreate4 (s32, s32, s32, f64); /* { dg-error {passing 'svfloat64_t' to argument 4 of 'svcreate4', but previous arguments had type 'svint32_t'} } */ + *ptr = svcreate4 (s32, s32, pg, s32); /* { dg-error {passing 'svbool_t' to argument 3 of 'svcreate4', but previous arguments had type 'svint32_t'} } */ + *ptr = svcreate4 (s32, x, s32, s32); /* { dg-error {passing 'int' to argument 2 of 'svcreate4', which expects an SVE vector type} } */ + *ptr = svcreate4 (x, s32, s32, s32); /* { dg-error {passing 'int' to argument 1 of 'svcreate4', which expects an SVE vector type} } */ + *ptr = svcreate4 (pg, s32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svcreate4', but previous arguments had type 'svbool_t'} } */ + *ptr = svcreate4 (pg, pg, pg, pg); /* { dg-error {'svcreate4' has no form that 
takes 'svbool_t' arguments} } */ + *ptr = svcreate4 (s32, s32, s32, s32); + *ptr = svcreate4 (f64, f64, f64, f64); /* { dg-error {incompatible types when assigning to type 'svint32x4_t' from type 'svfloat64x4_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_6.c new file mode 100644 index 000000000..b9e298acf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/create_6.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +void +f1 (svint32x4_t *ptr, svbool_t pg, svint32_t s32, svfloat64_t f64, + svint32x4_t s32x4, int x) +{ + *ptr = svcreate4_s32 (s32); /* { dg-error {too few arguments to function 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (s32, s32); /* { dg-error {too few arguments to function 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (s32, s32, s32); /* { dg-error {too few arguments to function 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (s32, s32, s32, s32, s32); /* { dg-error {too many arguments to function 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (s32x4, s32x4, s32x4, s32x4); /* { dg-error {incompatible type for argument 1 of 'svcreate4_s32'} } */ + /* { dg-error {incompatible type for argument 2 of 'svcreate4_s32'} "" { target *-*-* } .-1 } */ + /* { dg-error {incompatible type for argument 3 of 'svcreate4_s32'} "" { target *-*-* } .-2 } */ + /* { dg-error {incompatible type for argument 4 of 'svcreate4_s32'} "" { target *-*-* } .-3 } */ + *ptr = svcreate4_s32 (s32, s32, s32, f64); /* { dg-error {incompatible type for argument 4 of 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (s32, s32, pg, s32); /* { dg-error {incompatible type for argument 3 of 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (s32, x, s32, s32); /* { dg-error {incompatible type for argument 2 of 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (x, s32, s32, s32); /* { dg-error {incompatible type for argument 1 of 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (pg, s32, s32, s32); /* { dg-error {incompatible type for argument 1 of 'svcreate4_s32'} } */ + *ptr = svcreate4_s32 (pg, pg, pg, pg); /* { dg-error {incompatible type for argument 1 of 'svcreate4_s32'} } */ + /* { dg-error {incompatible type for argument 2 of 'svcreate4_s32'} "" { target *-*-* } .-1 } */ + /* { dg-error {incompatible type for argument 3 of 'svcreate4_s32'} "" { target *-*-* } .-2 } */ + /* { dg-error {incompatible type for argument 4 of 'svcreate4_s32'} "" { target *-*-* } .-3 } */ + *ptr = svcreate4_s32 (s32, s32, s32, s32); + *ptr = svcreate4_f64 (f64, f64, f64, f64); /* { dg-error {incompatible types when assigning to type 'svint32x4_t' from type 'svfloat64x4_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ext_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ext_1.c new file mode 100644 index 000000000..bdce3926d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ext_1.c @@ -0,0 +1,67 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, + svfloat16_t f16, svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, int i) +{ + svext (pg, pg, 0); /* { dg-error {'svext' has no form that takes 'svbool_t' arguments} } */ + svext (s8, s8, i); /* { dg-error {argument 3 of 'svext' must be an integer constant expression} } */ + + svext (s8, s8, -1); /* { dg-error {passing -1 to argument 3 of 'svext', 
which expects a value in the range \[0, 255\]} } */ + svext (s8, s8, 0); + svext (s8, s8, 255); + svext (s8, s8, 256); /* { dg-error {passing 256 to argument 3 of 'svext', which expects a value in the range \[0, 255\]} } */ + + svext (u8, u8, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 255\]} } */ + svext (u8, u8, 0); + svext (u8, u8, 255); + svext (u8, u8, 256); /* { dg-error {passing 256 to argument 3 of 'svext', which expects a value in the range \[0, 255\]} } */ + + svext (s16, s16, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */ + svext (s16, s16, 0); + svext (s16, s16, 127); + svext (s16, s16, 128); /* { dg-error {passing 128 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */ + + svext (u16, u16, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */ + svext (u16, u16, 0); + svext (u16, u16, 127); + svext (u16, u16, 128); /* { dg-error {passing 128 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */ + + svext (f16, f16, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */ + svext (f16, f16, 0); + svext (f16, f16, 127); + svext (f16, f16, 128); /* { dg-error {passing 128 to argument 3 of 'svext', which expects a value in the range \[0, 127\]} } */ + + svext (s32, s32, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */ + svext (s32, s32, 0); + svext (s32, s32, 63); + svext (s32, s32, 64); /* { dg-error {passing 64 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */ + + svext (u32, u32, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */ + svext (u32, u32, 0); + svext (u32, u32, 63); + svext (u32, u32, 64); /* { dg-error {passing 64 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */ + + svext (f32, f32, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */ + svext (f32, f32, 0); + svext (f32, f32, 63); + svext (f32, f32, 64); /* { dg-error {passing 64 to argument 3 of 'svext', which expects a value in the range \[0, 63\]} } */ + + svext (s64, s64, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */ + svext (s64, s64, 0); + svext (s64, s64, 31); + svext (s64, s64, 32); /* { dg-error {passing 32 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */ + + svext (u64, u64, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */ + svext (u64, u64, 0); + svext (u64, u64, 31); + svext (u64, u64, 32); /* { dg-error {passing 32 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */ + + svext (f64, f64, -1); /* { dg-error {passing -1 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */ + svext (f64, f64, 0); + svext (f64, f64, 31); + svext (f64, f64, 32); /* { dg-error {passing 32 to argument 3 of 'svext', which expects a value in the range \[0, 31\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/fold_left_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/fold_left_1.c new file mode 100644 index 000000000..1d292786d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/fold_left_1.c @@ 
-0,0 +1,21 @@ +/* { dg-do compile } */ + +#include + +svuint8_t +f1 (svbool_t pg, int i, float f, double d, void *ptr, svfloat32_t f32, + svint32_t i32) +{ + svadda (pg, f); /* { dg-error {too few arguments to function 'svadda'} } */ + svadda (pg, f, f32, f32); /* { dg-error {too many arguments to function 'svadda'} } */ + svadda (f32, f, f32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svadda', which expects 'svbool_t'} } */ + svadda (pg, i, f32); + svadda (pg, f, f32); + svadda (pg, d, f32); + svadda (pg, ptr, f32); /* { dg-error {incompatible type for argument 2 of 'svadda_f32'} } */ + svadda (pg, pg, f32); /* { dg-error {passing 'svbool_t' to argument 2 of 'svadda', which expects a scalar element} } */ + svadda (pg, f32, f32); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svadda', which expects a scalar element} } */ + svadda (pg, f, f); /* { dg-error {passing 'float' to argument 3 of 'svadda', which expects an SVE vector type} } */ + svadda (pg, i, i32); /* { dg-error {'svadda' has no form that takes 'svint32_t' arguments} } */ + svadda (pg, i, i); /* { dg-error {passing 'int' to argument 3 of 'svadda', which expects an SVE vector type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_1.c new file mode 100644 index 000000000..e1b99fa36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_1.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svadd_n_u8_x; /* { dg-message "note: previous declaration of 'svadd_n_u8_x' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svadd_n_u8_x' redeclared} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_2.c new file mode 100644 index 000000000..7f653f117 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_2.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svadd_n_u8_x = 1; /* { dg-message "note: previous definition of 'svadd_n_u8_x' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svadd_n_u8_x' redeclared} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_3.c new file mode 100644 index 000000000..d9ff15a6c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_3.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +extern __SVInt8_t svadd_u8_x (__SVBool_t, __SVInt8_t, __SVInt8_t); /* { dg-message "note: previous declaration of 'svadd_u8_x' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting types for 'svadd_u8_x'} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_4.c new file mode 100644 index 000000000..9591e3d01 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_4.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ + +/* Although somewhat suspect, this isn't actively wrong, and doesn't need + to be diagnosed. Any attempt to call the function before including + arm_sve.h will lead to a link failure. (Same for taking its address, + etc.) 
*/ +extern __SVUint8_t svadd_u8_x (__SVBool_t, __SVUint8_t, __SVUint8_t); + +#pragma GCC aarch64 "arm_sve.h" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_5.c new file mode 100644 index 000000000..85923611d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_5.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ + +/* There's no requirement to diagnose this. In particular, arm_sve.h + is allowed to use macros to implement the functions, and defining + a macro that matches an existing symbol would not be diagnosed. + + At the moment this works like other built-ins in the sense that the + explicit definition "wins". This isn't supported behavior though. */ +__SVUint8_t +svadd_u8_x (__SVBool_t pg, __SVUint8_t x, __SVUint8_t y) +{ + return x; +} + +#pragma GCC aarch64 "arm_sve.h" + +svuint8_t +f (svbool_t pg, svuint8_t x, svuint8_t y) +{ + return svadd_u8_x (pg, x, y); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_6.c new file mode 100644 index 000000000..1f04e4644 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/func_redef_6.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +typedef int svadd_u8_x; /* { dg-message "note: previous declaration of 'svadd_u8_x' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svadd_u8_x' redeclared} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_1.c new file mode 100644 index 000000000..a3ac08fa8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_1.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svuint8x3_t u8x3, int x) +{ + const int one = 1; + svfloat64_t f64; + + u8 = svget2 (u8x2); /* { dg-error {too few arguments to function 'svget2'} } */ + u8 = svget2 (u8x2, 1, 2); /* { dg-error {too many arguments to function 'svget2'} } */ + u8 = svget2 (u8, 0); /* { dg-error {passing single vector 'svuint8_t' to argument 1 of 'svget2', which expects a tuple of 2 vectors} } */ + u8 = svget2 (u8x3, 0); /* { dg-error {passing 'svuint8x3_t' to argument 1 of 'svget2', which expects a tuple of 2 vectors} } */ + u8 = svget2 (pg, 0); /* { dg-error {passing 'svbool_t' to argument 1 of 'svget2', which expects a tuple of 2 vectors} } */ + u8 = svget2 (u8x2, x); /* { dg-error {argument 2 of 'svget2' must be an integer constant expression} } */ + u8 = svget2 (u8x2, 0); + f64 = svget2 (u8x2, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svuint8_t'} } */ + u8 = svget2 (u8x2, 1); + u8 = svget2 (u8x2, 2); /* { dg-error {passing 2 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, 3); /* { dg-error {passing 3 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, 4); /* { dg-error {passing 4 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, 5); /* { dg-error {passing 5 to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget2', which expects a value in the range \[0, 1\]} } */ + u8 = svget2 (u8x2, one); /* { dg-error 
{argument 2 of 'svget2' must be an integer constant expression} } */ + u8 = svget2 (u8x2, 3 - 2); + u8 = svget2 (u8x2, 1.0); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_2.c new file mode 100644 index 000000000..4eee2439e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svint8x2_t s8x2, + svuint8x3_t u8x3, int x) +{ + const int one = 1; + svfloat64_t f64; + + u8 = svget2_u8 (u8x2); /* { dg-error {too few arguments to function 'svget2_u8'} } */ + u8 = svget2_u8 (u8x2, 1, 2); /* { dg-error {too many arguments to function 'svget2_u8'} } */ + u8 = svget2_u8 (u8, 0); /* { dg-error {incompatible type for argument 1 of 'svget2_u8'} } */ + u8 = svget2_u8 (s8x2, 0); /* { dg-error {incompatible type for argument 1 of 'svget2_u8'} } */ + u8 = svget2_u8 (u8x3, 0); /* { dg-error {incompatible type for argument 1 of 'svget2_u8'} } */ + u8 = svget2_u8 (pg, 0); /* { dg-error {incompatible type for argument 1 of 'svget2_u8'} } */ + u8 = svget2_u8 (u8x2, x); /* { dg-error {argument 2 of 'svget2_u8' must be an integer constant expression} } */ + u8 = svget2_u8 (u8x2, 0); + f64 = svget2_u8 (u8x2, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svuint8_t'} } */ + u8 = svget2_u8 (u8x2, 1); + u8 = svget2_u8 (u8x2, 2); /* { dg-error {passing 2 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, 3); /* { dg-error {passing 3 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, 4); /* { dg-error {passing 4 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, 5); /* { dg-error {passing 5 to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget2_u8', which expects a value in the range \[0, 1\]} } */ + u8 = svget2_u8 (u8x2, one); /* { dg-error {argument 2 of 'svget2_u8' must be an integer constant expression} } */ + u8 = svget2_u8 (u8x2, 3 - 2); + u8 = svget2_u8 (u8x2, 1.0); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_3.c new file mode 100644 index 000000000..0e7b2e227 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_3.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat16x4_t f16x4, + int x) +{ + const int one = 1; + svfloat64_t f64; + + f16 = svget3 (f16x3); /* { dg-error {too few arguments to function 'svget3'} } */ + f16 = svget3 (f16x3, 1, 2); /* { dg-error {too many arguments to function 'svget3'} } */ + f16 = svget3 (f16, 0); /* { dg-error {passing single vector 'svfloat16_t' to argument 1 of 'svget3', which expects a tuple of 3 vectors} } */ + f16 = svget3 (f16x4, 0); /* { dg-error {passing 'svfloat16x4_t' to argument 1 of 'svget3', which expects a tuple of 3 vectors} } */ + f16 = svget3 (pg, 0); /* { dg-error {passing 'svbool_t' to argument 1 of 'svget3', which expects a tuple of 3 vectors} } */ + f16 = svget3 (f16x3, x); /* { dg-error 
{argument 2 of 'svget3' must be an integer constant expression} } */ + f16 = svget3 (f16x3, 0); + f64 = svget3 (f16x3, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svfloat16_t'} } */ + f16 = svget3 (f16x3, 1); + f16 = svget3 (f16x3, 2); + f16 = svget3 (f16x3, 3); /* { dg-error {passing 3 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, 4); /* { dg-error {passing 4 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, 5); /* { dg-error {passing 5 to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget3', which expects a value in the range \[0, 2\]} } */ + f16 = svget3 (f16x3, one); /* { dg-error {argument 2 of 'svget3' must be an integer constant expression} } */ + f16 = svget3 (f16x3, 3 - 2); + f16 = svget3 (f16x3, 1.0); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_4.c new file mode 100644 index 000000000..72b4f82a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_4.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat32x3_t f32x3, + svfloat16x4_t f16x4, int x) +{ + const int one = 1; + svfloat64_t f64; + + f16 = svget3_f16 (f16x3); /* { dg-error {too few arguments to function 'svget3_f16'} } */ + f16 = svget3_f16 (f16x3, 1, 2); /* { dg-error {too many arguments to function 'svget3_f16'} } */ + f16 = svget3_f16 (f16, 0); /* { dg-error {incompatible type for argument 1 of 'svget3_f16'} } */ + f16 = svget3_f16 (f32x3, 0); /* { dg-error {incompatible type for argument 1 of 'svget3_f16'} } */ + f16 = svget3_f16 (f16x4, 0); /* { dg-error {incompatible type for argument 1 of 'svget3_f16'} } */ + f16 = svget3_f16 (pg, 0); /* { dg-error {incompatible type for argument 1 of 'svget3_f16'} } */ + f16 = svget3_f16 (f16x3, x); /* { dg-error {argument 2 of 'svget3_f16' must be an integer constant expression} } */ + f16 = svget3_f16 (f16x3, 0); + f64 = svget3_f16 (f16x3, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svfloat16_t'} } */ + f16 = svget3_f16 (f16x3, 1); + f16 = svget3_f16 (f16x3, 2); + f16 = svget3_f16 (f16x3, 3); /* { dg-error {passing 3 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, 4); /* { dg-error {passing 4 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, 5); /* { dg-error {passing 5 to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget3_f16', which expects a value in the range \[0, 2\]} } */ + f16 = svget3_f16 (f16x3, one); /* { dg-error {argument 2 of 'svget3_f16' must be an integer constant expression} } */ + f16 = svget3_f16 (f16x3, 3 - 2); + f16 = svget3_f16 (f16x3, 1.0); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_5.c new file mode 100644 index 000000000..b0b69b95e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_5.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { 
dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svint32x2_t s32x2, int x) +{ + const int one = 1; + svfloat64_t f64; + + s32 = svget4 (s32x4); /* { dg-error {too few arguments to function 'svget4'} } */ + s32 = svget4 (s32x4, 1, 2); /* { dg-error {too many arguments to function 'svget4'} } */ + s32 = svget4 (s32, 0); /* { dg-error {passing single vector 'svint32_t' to argument 1 of 'svget4', which expects a tuple of 4 vectors} } */ + s32 = svget4 (s32x2, 0); /* { dg-error {passing 'svint32x2_t' to argument 1 of 'svget4', which expects a tuple of 4 vectors} } */ + s32 = svget4 (pg, 0); /* { dg-error {passing 'svbool_t' to argument 1 of 'svget4', which expects a tuple of 4 vectors} } */ + s32 = svget4 (s32x4, x); /* { dg-error {argument 2 of 'svget4' must be an integer constant expression} } */ + s32 = svget4 (s32x4, 0); + f64 = svget4 (s32x4, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svint32_t'} } */ + s32 = svget4 (s32x4, 1); + s32 = svget4 (s32x4, 2); + s32 = svget4 (s32x4, 3); + s32 = svget4 (s32x4, 4); /* { dg-error {passing 4 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ + s32 = svget4 (s32x4, 5); /* { dg-error {passing 5 to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ + s32 = svget4 (s32x4, ~0U); /* { dg-error {passing [^ ]* to argument 2 of 'svget4', which expects a value in the range \[0, 3\]} } */ + s32 = svget4 (s32x4, one); /* { dg-error {argument 2 of 'svget4' must be an integer constant expression} } */ + s32 = svget4 (s32x4, 3 - 2); + s32 = svget4 (s32x4, 1.0); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_6.c new file mode 100644 index 000000000..3801c0c4e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/get_6.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svfloat32x4_t f32x4, + svint32x2_t s32x2, int x) +{ + const int one = 1; + svfloat64_t f64; + + s32 = svget4_s32 (s32x4); /* { dg-error {too few arguments to function 'svget4_s32'} } */ + s32 = svget4_s32 (s32x4, 1, 2); /* { dg-error {too many arguments to function 'svget4_s32'} } */ + s32 = svget4_s32 (s32, 0); /* { dg-error {incompatible type for argument 1 of 'svget4_s32'} } */ + s32 = svget4_s32 (f32x4, 0); /* { dg-error {incompatible type for argument 1 of 'svget4_s32'} } */ + s32 = svget4_s32 (s32x2, 0); /* { dg-error {incompatible type for argument 1 of 'svget4_s32'} } */ + s32 = svget4_s32 (pg, 0); /* { dg-error {incompatible type for argument 1 of 'svget4_s32'} } */ + s32 = svget4_s32 (s32x4, x); /* { dg-error {argument 2 of 'svget4_s32' must be an integer constant expression} } */ + s32 = svget4_s32 (s32x4, 0); + f64 = svget4_s32 (s32x4, 0); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svint32_t'} } */ + s32 = svget4_s32 (s32x4, 1); + s32 = svget4_s32 (s32x4, 2); + s32 = svget4_s32 (s32x4, 3); + s32 = svget4_s32 (s32x4, 4); /* { dg-error {passing 4 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ + s32 = svget4_s32 (s32x4, 5); /* { dg-error {passing 5 to argument 2 of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ + s32 = svget4_s32 (s32x4, ~0U); /* { dg-error {passing [^ ]* to argument 2 
of 'svget4_s32', which expects a value in the range \[0, 3\]} } */ + s32 = svget4_s32 (s32x4, one); /* { dg-error {argument 2 of 'svget4_s32' must be an integer constant expression} } */ + s32 = svget4_s32 (s32x4, 3 - 2); + s32 = svget4_s32 (s32x4, 1.0); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_1.c new file mode 100644 index 000000000..dcd291da6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_1.c @@ -0,0 +1,37 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud, + float f, int i) +{ + svqincb (sw); /* { dg-error {too few arguments to function 'svqincb'} } */ + svqincb (sw, 1, 1); /* { dg-error {too many arguments to function 'svqincb'} } */ + + svqincb (pg, 1); /* { dg-error {'svqincb' has no form that takes 'svbool_t' arguments} } */ + svqincb (s8, 1); /* { dg-error {'svqincb' has no form that takes 'svint8_t' arguments} } */ + svqincb (u8, 1); /* { dg-error {'svqincb' has no form that takes 'svuint8_t' arguments} } */ + svqincb (s16, 1); /* { dg-error {'svqincb' has no form that takes 'svint16_t' arguments} } */ + svqincb (u16, 1); /* { dg-error {'svqincb' has no form that takes 'svuint16_t' arguments} } */ + svqincb (s32, 1); /* { dg-error {'svqincb' has no form that takes 'svint32_t' arguments} } */ + svqincb (u32, 1); /* { dg-error {'svqincb' has no form that takes 'svuint32_t' arguments} } */ + svqincb (s64, 1); /* { dg-error {'svqincb' has no form that takes 'svint64_t' arguments} } */ + svqincb (u64, 1); /* { dg-error {'svqincb' has no form that takes 'svuint64_t' arguments} } */ + svqincb (sh, 1); + svqincb (sw, 1); + svqincb (sd, 1); + svqincb (uh, 1); + svqincb (uw, 1); + svqincb (ud, 1); + svqincb (f, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincb', which expects a 32-bit or 64-bit integer type} } */ + svqincb (ud, i); /* { dg-error {argument 2 of 'svqincb' must be an integer constant expression} } */ + + svqincb (sw, -1); /* { dg-error {passing -1 to argument 2 of 'svqincb', which expects a value in the range \[1, 16\]} } */ + svqincb (sw, 0); /* { dg-error {passing 0 to argument 2 of 'svqincb', which expects a value in the range \[1, 16\]} } */ + svqincb (sw, 1); + svqincb (sw, 2); + svqincb (sw, 16); + svqincb (sw, 17); /* { dg-error {passing 17 to argument 2 of 'svqincb', which expects a value in the range \[1, 16\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_2.c new file mode 100644 index 000000000..e5acad187 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_2.c @@ -0,0 +1,13 @@ +#include + +void +test (int32_t sw, int i) +{ + svqincb_n_s32 (sw, -1); /* { dg-error {passing -1 to argument 2 of 'svqincb_n_s32', which expects a value in the range \[1, 16\]} } */ + svqincb_n_s32 (sw, 0); /* { dg-error {passing 0 to argument 2 of 'svqincb_n_s32', which expects a value in the range \[1, 16\]} } */ + svqincb_n_s32 (sw, 1); + svqincb_n_s32 (sw, 2); + svqincb_n_s32 (sw, 16); + svqincb_n_s32 (sw, 17); /* { dg-error {passing 17 to argument 2 of 'svqincb_n_s32', which expects a value in the range \[1, 16\]} } */ + svqincb_n_s32 (sw, i); /* { dg-error {argument 2 of 'svqincb_n_s32' must be an integer constant 
expression} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_3.c new file mode 100644 index 000000000..351e7757f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_3.c @@ -0,0 +1,26 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud, + float f) +{ + svqinch (pg, 1); /* { dg-error {'svqinch' has no form that takes 'svbool_t' arguments} } */ + svqinch (s8, 1); /* { dg-error {'svqinch' has no form that takes 'svint8_t' arguments} } */ + svqinch (u8, 1); /* { dg-error {'svqinch' has no form that takes 'svuint8_t' arguments} } */ + svqinch (s16, 1); + svqinch (u16, 1); + svqinch (s32, 1); /* { dg-error {'svqinch' has no form that takes 'svint32_t' arguments} } */ + svqinch (u32, 1); /* { dg-error {'svqinch' has no form that takes 'svuint32_t' arguments} } */ + svqinch (s64, 1); /* { dg-error {'svqinch' has no form that takes 'svint64_t' arguments} } */ + svqinch (u64, 1); /* { dg-error {'svqinch' has no form that takes 'svuint64_t' arguments} } */ + svqinch (sh, 1); + svqinch (sw, 1); + svqinch (sd, 1); + svqinch (uh, 1); + svqinch (uw, 1); + svqinch (ud, 1); + svqinch (f, 1); /* { dg-error {passing 'float' to argument 1 of 'svqinch', which expects a 32-bit or 64-bit integer type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_4.c new file mode 100644 index 000000000..e071c0229 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_4.c @@ -0,0 +1,26 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud, + float f) +{ + svqincw (pg, 1); /* { dg-error {'svqincw' has no form that takes 'svbool_t' arguments} } */ + svqincw (s8, 1); /* { dg-error {'svqincw' has no form that takes 'svint8_t' arguments} } */ + svqincw (u8, 1); /* { dg-error {'svqincw' has no form that takes 'svuint8_t' arguments} } */ + svqincw (s16, 1); /* { dg-error {'svqincw' has no form that takes 'svint16_t' arguments} } */ + svqincw (u16, 1); /* { dg-error {'svqincw' has no form that takes 'svuint16_t' arguments} } */ + svqincw (s32, 1); + svqincw (u32, 1); + svqincw (s64, 1); /* { dg-error {'svqincw' has no form that takes 'svint64_t' arguments} } */ + svqincw (u64, 1); /* { dg-error {'svqincw' has no form that takes 'svuint64_t' arguments} } */ + svqincw (sh, 1); + svqincw (sw, 1); + svqincw (sd, 1); + svqincw (uh, 1); + svqincw (uw, 1); + svqincw (ud, 1); + svqincw (f, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincw', which expects a 32-bit or 64-bit integer type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_5.c new file mode 100644 index 000000000..be9c76928 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_5.c @@ -0,0 +1,26 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud, + 
float f) +{ + svqincd (pg, 1); /* { dg-error {'svqincd' has no form that takes 'svbool_t' arguments} } */ + svqincd (s8, 1); /* { dg-error {'svqincd' has no form that takes 'svint8_t' arguments} } */ + svqincd (u8, 1); /* { dg-error {'svqincd' has no form that takes 'svuint8_t' arguments} } */ + svqincd (s16, 1); /* { dg-error {'svqincd' has no form that takes 'svint16_t' arguments} } */ + svqincd (u16, 1); /* { dg-error {'svqincd' has no form that takes 'svuint16_t' arguments} } */ + svqincd (s32, 1); /* { dg-error {'svqincd' has no form that takes 'svint32_t' arguments} } */ + svqincd (u32, 1); /* { dg-error {'svqincd' has no form that takes 'svuint32_t' arguments} } */ + svqincd (s64, 1); + svqincd (u64, 1); + svqincd (sh, 1); + svqincd (sw, 1); + svqincd (sd, 1); + svqincd (uh, 1); + svqincd (uw, 1); + svqincd (ud, 1); + svqincd (f, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincd', which expects a 32-bit or 64-bit integer type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_1.c new file mode 100644 index 000000000..f2e5841d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_1.c @@ -0,0 +1,47 @@ +#include + +void +test (enum svpattern pat, svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud, + float f, int i) +{ + svqincb_pat (sw, pat); /* { dg-error {too few arguments to function 'svqincb_pat'} } */ + svqincb_pat (sw, pat, 1, 1); /* { dg-error {too many arguments to function 'svqincb_pat'} } */ + + svqincb_pat (pg, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svbool_t' arguments} } */ + svqincb_pat (s8, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svint8_t' arguments} } */ + svqincb_pat (u8, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svuint8_t' arguments} } */ + svqincb_pat (s16, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svint16_t' arguments} } */ + svqincb_pat (u16, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svuint16_t' arguments} } */ + svqincb_pat (s32, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svint32_t' arguments} } */ + svqincb_pat (u32, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svuint32_t' arguments} } */ + svqincb_pat (s64, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svint64_t' arguments} } */ + svqincb_pat (u64, SV_ALL, 1); /* { dg-error {'svqincb_pat' has no form that takes 'svuint64_t' arguments} } */ + svqincb_pat (sh, SV_ALL, 1); + svqincb_pat (sw, SV_ALL, 1); + svqincb_pat (sd, SV_ALL, 1); + svqincb_pat (uh, SV_ALL, 1); + svqincb_pat (uw, SV_ALL, 1); + svqincb_pat (ud, SV_ALL, 1); + svqincb_pat (f, SV_ALL, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincb_pat', which expects a 32-bit or 64-bit integer type} } */ + + svqincb_pat (sw, pat, 1); /* { dg-error {argument 2 of 'svqincb_pat' must be an integer constant expression} } */ + svqincb_pat (sw, i, 1); /* { dg-error {argument 2 of 'svqincb_pat' must be an integer constant expression} } */ + svqincb_pat (sw, (enum svpattern) -1, 1); /* { dg-error {passing 4294967295 to argument 2 of 'svqincb_pat', which expects a valid 'enum svpattern' value} } */ + svqincb_pat (sw, (enum svpattern) 0, 1); + svqincb_pat (sw, (enum svpattern) 13, 
1); + svqincb_pat (sw, (enum svpattern) 14, 1); /* { dg-error {passing 14 to argument 2 of 'svqincb_pat', which expects a valid 'enum svpattern' value} } */ + svqincb_pat (sw, (enum svpattern) 28, 1); /* { dg-error {passing 28 to argument 2 of 'svqincb_pat', which expects a valid 'enum svpattern' value} } */ + svqincb_pat (sw, (enum svpattern) 29, 1); + svqincb_pat (sw, (enum svpattern) 31, 1); + svqincb_pat (sw, (enum svpattern) 32, 1); /* { dg-error {passing 32 to argument 2 of 'svqincb_pat', which expects a valid 'enum svpattern' value} } */ + + svqincb_pat (sw, SV_POW2, -1); /* { dg-error {passing -1 to argument 3 of 'svqincb_pat', which expects a value in the range \[1, 16\]} } */ + svqincb_pat (sw, SV_POW2, 0); /* { dg-error {passing 0 to argument 3 of 'svqincb_pat', which expects a value in the range \[1, 16\]} } */ + svqincb_pat (sw, SV_POW2, 1); + svqincb_pat (sw, SV_POW2, 2); + svqincb_pat (sw, SV_POW2, 16); + svqincb_pat (sw, SV_POW2, 17); /* { dg-error {passing 17 to argument 3 of 'svqincb_pat', which expects a value in the range \[1, 16\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_2.c new file mode 100644 index 000000000..c1c1ab9d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_2.c @@ -0,0 +1,23 @@ +#include + +void +test (int32_t sw, enum svpattern pat, int i) +{ + svqincb_pat_n_s32 (sw, pat, 1); /* { dg-error {argument 2 of 'svqincb_pat_n_s32' must be an integer constant expression} } */ + svqincb_pat_n_s32 (sw, i, 1); /* { dg-error {argument 2 of 'svqincb_pat_n_s32' must be an integer constant expression} } */ + svqincb_pat_n_s32 (sw, (enum svpattern) -1, 1); /* { dg-error {passing 4294967295 to argument 2 of 'svqincb_pat_n_s32', which expects a valid 'enum svpattern' value} } */ + svqincb_pat_n_s32 (sw, (enum svpattern) 0, 1); + svqincb_pat_n_s32 (sw, (enum svpattern) 13, 1); + svqincb_pat_n_s32 (sw, (enum svpattern) 14, 1); /* { dg-error {passing 14 to argument 2 of 'svqincb_pat_n_s32', which expects a valid 'enum svpattern' value} } */ + svqincb_pat_n_s32 (sw, (enum svpattern) 28, 1); /* { dg-error {passing 28 to argument 2 of 'svqincb_pat_n_s32', which expects a valid 'enum svpattern' value} } */ + svqincb_pat_n_s32 (sw, (enum svpattern) 29, 1); + svqincb_pat_n_s32 (sw, (enum svpattern) 31, 1); + svqincb_pat_n_s32 (sw, (enum svpattern) 32, 1); /* { dg-error {passing 32 to argument 2 of 'svqincb_pat_n_s32', which expects a valid 'enum svpattern' value} } */ + + svqincb_pat_n_s32 (sw, SV_POW2, -1); /* { dg-error {passing -1 to argument 3 of 'svqincb_pat_n_s32', which expects a value in the range \[1, 16\]} } */ + svqincb_pat_n_s32 (sw, SV_POW2, 0); /* { dg-error {passing 0 to argument 3 of 'svqincb_pat_n_s32', which expects a value in the range \[1, 16\]} } */ + svqincb_pat_n_s32 (sw, SV_POW2, 1); + svqincb_pat_n_s32 (sw, SV_POW2, 2); + svqincb_pat_n_s32 (sw, SV_POW2, 16); + svqincb_pat_n_s32 (sw, SV_POW2, 17); /* { dg-error {passing 17 to argument 3 of 'svqincb_pat_n_s32', which expects a value in the range \[1, 16\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_3.c new file mode 100644 index 000000000..4126b2461 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_3.c @@ -0,0 +1,26 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, 
svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud, + float f) +{ + svqinch_pat (pg, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svbool_t' arguments} } */ + svqinch_pat (s8, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svint8_t' arguments} } */ + svqinch_pat (u8, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svuint8_t' arguments} } */ + svqinch_pat (s16, SV_ALL, 1); + svqinch_pat (u16, SV_ALL, 1); + svqinch_pat (s32, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svint32_t' arguments} } */ + svqinch_pat (u32, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svuint32_t' arguments} } */ + svqinch_pat (s64, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svint64_t' arguments} } */ + svqinch_pat (u64, SV_ALL, 1); /* { dg-error {'svqinch_pat' has no form that takes 'svuint64_t' arguments} } */ + svqinch_pat (sh, SV_ALL, 1); + svqinch_pat (sw, SV_ALL, 1); + svqinch_pat (sd, SV_ALL, 1); + svqinch_pat (uh, SV_ALL, 1); + svqinch_pat (uw, SV_ALL, 1); + svqinch_pat (ud, SV_ALL, 1); + svqinch_pat (f, SV_ALL, 1); /* { dg-error {passing 'float' to argument 1 of 'svqinch_pat', which expects a 32-bit or 64-bit integer type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_4.c new file mode 100644 index 000000000..9aabbd714 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_4.c @@ -0,0 +1,26 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int16_t sh, uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud, + float f) +{ + svqincw_pat (pg, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svbool_t' arguments} } */ + svqincw_pat (s8, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svint8_t' arguments} } */ + svqincw_pat (u8, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svuint8_t' arguments} } */ + svqincw_pat (s16, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svint16_t' arguments} } */ + svqincw_pat (u16, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svuint16_t' arguments} } */ + svqincw_pat (s32, SV_ALL, 1); + svqincw_pat (u32, SV_ALL, 1); + svqincw_pat (s64, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svint64_t' arguments} } */ + svqincw_pat (u64, SV_ALL, 1); /* { dg-error {'svqincw_pat' has no form that takes 'svuint64_t' arguments} } */ + svqincw_pat (sh, SV_ALL, 1); + svqincw_pat (sw, SV_ALL, 1); + svqincw_pat (sd, SV_ALL, 1); + svqincw_pat (uh, SV_ALL, 1); + svqincw_pat (uw, SV_ALL, 1); + svqincw_pat (ud, SV_ALL, 1); + svqincw_pat (f, SV_ALL, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincw_pat', which expects a 32-bit or 64-bit integer type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_5.c new file mode 100644 index 000000000..5df88c649 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pat_5.c @@ -0,0 +1,26 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int16_t sh, 
uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud, + float f) +{ + svqincd_pat (pg, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svbool_t' arguments} } */ + svqincd_pat (s8, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svint8_t' arguments} } */ + svqincd_pat (u8, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svuint8_t' arguments} } */ + svqincd_pat (s16, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svint16_t' arguments} } */ + svqincd_pat (u16, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svuint16_t' arguments} } */ + svqincd_pat (s32, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svint32_t' arguments} } */ + svqincd_pat (u32, SV_ALL, 1); /* { dg-error {'svqincd_pat' has no form that takes 'svuint32_t' arguments} } */ + svqincd_pat (s64, SV_ALL, 1); + svqincd_pat (u64, SV_ALL, 1); + svqincd_pat (sh, SV_ALL, 1); + svqincd_pat (sw, SV_ALL, 1); + svqincd_pat (sd, SV_ALL, 1); + svqincd_pat (uh, SV_ALL, 1); + svqincd_pat (uw, SV_ALL, 1); + svqincd_pat (ud, SV_ALL, 1); + svqincd_pat (f, SV_ALL, 1); /* { dg-error {passing 'float' to argument 1 of 'svqincd_pat', which expects a 32-bit or 64-bit integer type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_1.c new file mode 100644 index 000000000..a61afcd2d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_1.c @@ -0,0 +1,22 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, int i) +{ + svqincp (s32); /* { dg-error {too few arguments to function 'svqincp'} } */ + svqincp (s32, pg, pg); /* { dg-error {too many arguments to function 'svqincp'} } */ + svqincp (i, pg); /* { dg-error {passing 'int' to argument 1 of 'svqincp', which expects an SVE vector type} } */ + svqincp (pg, pg); /* { dg-error {'svqincp' has no form that takes 'svbool_t' arguments} } */ + svqincp (s8, pg); /* { dg-error {'svqincp' has no form that takes 'svint8_t' arguments} } */ + svqincp (u8, pg); /* { dg-error {'svqincp' has no form that takes 'svuint8_t' arguments} } */ + svqincp (s16, pg); + svqincp (u16, pg); + svqincp (s32, pg); + svqincp (u32, pg); + svqincp (s64, pg); + svqincp (u64, pg); + svqincp (u64, 0); /* { dg-error {passing 'int' to argument 2 of 'svqincp', which expects 'svbool_t'} } */ + svqincp (u64, u64); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svqincp', which expects 'svbool_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_scalar_1.c new file mode 100644 index 000000000..94ebe7e7a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/inc_dec_pred_scalar_1.c @@ -0,0 +1,19 @@ +#include + +void +test (svbool_t pg, svint32_t s32, svuint64_t u64, int16_t sh, uint16_t uh, + int32_t sw, uint32_t uw, int64_t sd, uint64_t ud) +{ + svqincp_b8 (s32); /* { dg-error {too few arguments to function 'svqincp_b8'} } */ + svqincp_b8 (s32, pg, pg); /* { dg-error {too many arguments to function 'svqincp_b8'} } */ + svqincp_b8 (pg, pg); /* { dg-error {passing 'svbool_t' to argument 1 of 'svqincp_b8', which expects a 32-bit or 64-bit integer type} } */ + svqincp_b8 (s32, pg); /* { dg-error {passing 'svint32_t' to argument 1 of 'svqincp_b8', which expects 
a 32-bit or 64-bit integer type} } */ + svqincp_b8 (sh, pg); + svqincp_b8 (uh, pg); + svqincp_b8 (sw, pg); + svqincp_b8 (uw, pg); + svqincp_b8 (sd, pg); + svqincp_b8 (ud, pg); + svqincp_b8 (ud, 0); /* { dg-error {passing 'int' to argument 2 of 'svqincp_b8', which expects 'svbool_t'} } */ + svqincp_b8 (ud, u64); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svqincp_b8', which expects 'svbool_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ld1sh_gather_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ld1sh_gather_1.c new file mode 100644 index 000000000..91f37f6a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ld1sh_gather_1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99 -Wpointer-sign" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, short *s16_ptr, unsigned short *u16_ptr, + svint8_t s8, svint16_t s16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svld1sh_gather_index (pg, s16_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1sh_gather_index'; did you mean 'svld1_gather_index'} } */ + svld1sh_gather_index_u32 (pg, s16_ptr); /* { dg-error {too few arguments to function 'svld1sh_gather_index_u32'} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1sh_gather_index_u32'} } */ + svld1sh_gather_index_u32 (pg, u16_ptr, s32); /* { dg-warning {pointer targets in passing argument 2 of 'svld1sh_gather_s32index_u32' differ in signedness} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s32); + svld1sh_gather_index_u32 (pg, s16_ptr, u32); + svld1sh_gather_index_u32 (pg, s16_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + + svld1sh_gather_index_u32 (pg, 0, s32); + svld1sh_gather_index_u32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sh_gather_index_u32', which expects a vector or pointer base address} } */ + + svld1sh_gather_index_u32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ + svld1sh_gather_index_u32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ + svld1sh_gather_index_u32 (pg, u32, 0); + svld1sh_gather_index_u32 (pg, u64, 0); /* { dg-error {passing 
'svuint64_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c new file mode 100644 index 000000000..34f989bf8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint8_t +f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr, + float *f32_ptr, _Complex float *cf32_ptr, int **ptr_ptr) +{ + svld1 (pg); /* { dg-error {too few arguments to function 'svld1'} } */ + svld1 (pg, s8_ptr, 0); /* { dg-error {too many arguments to function 'svld1'} } */ + svld1 (0, s8_ptr); /* { dg-error {passing 'int' to argument 1 of 'svld1', which expects 'svbool_t'} } */ + svld1 (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svld1', which expects a pointer type} } */ + svld1 (pg, (int *) 0); + svld1 (pg, void_ptr); /* { dg-error {passing 'void \*' to argument 2 of 'svld1', but 'void' is not a valid SVE element type} } */ + svld1 (pg, s_ptr); /* { dg-error {passing 'struct s \*' to argument 2 of 'svld1', but 'struct s' is not a valid SVE element type} } */ + svld1 (pg, f32_ptr); + svld1 (pg, cf32_ptr); /* { dg-error {passing '_Complex float \*' to argument 2 of 'svld1', but 'complex float' is not a valid SVE element type} } */ + svld1 (pg, ptr_ptr); /* { dg-error {passing 'int \*\*' to argument 2 of 'svld1', but 'int \*' is not a valid SVE element type} } */ + return svld1 (pg, s8_ptr); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_2.c new file mode 100644 index 000000000..beb07f138 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_2.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint8_t +f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr, + float *f32_ptr, _Complex float *cf32_ptr) +{ + svld1_s8 (pg); /* { dg-error {too few arguments to function 'svld1_s8'} } */ + svld1_s8 (pg, s8_ptr, 0); /* { dg-error {too many arguments to function 'svld1_s8'} } */ + svld1_s8 (0, 0); /* { dg-error {incompatible type for argument 1 of 'svld1_s8'} } */ + svld1_s8 (pg, 0); + svld1_s32 (pg, (int *) 0); + svld1_s8 (pg, void_ptr); + svld1_s8 (pg, s_ptr); /* { dg-warning {passing argument 2 of 'svld1_s8' from incompatible pointer type} } */ + svld1_f32 (pg, f32_ptr); + svld1_f32 (pg, cf32_ptr); /* { dg-warning {passing argument 2 of 'svld1_f32' from incompatible pointer type} } */ + return svld1_s8 (pg, s8_ptr); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_3.c new file mode 100644 index 000000000..770203f64 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_3.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint8_t +f1 (svbool_t pg, signed char *s8_ptr, svint8_t s8) +{ + svld1_vnum (pg); /* { dg-error {too few arguments to function 'svld1_vnum'} } */ + svld1_vnum (pg, s8_ptr); /* { dg-error {too 
few arguments to function 'svld1_vnum'} } */ + svld1_vnum (pg, s8_ptr, 0, 0); /* { dg-error {too many arguments to function 'svld1_vnum'} } */ + svld1_vnum (0, s8_ptr, 0); /* { dg-error {passing 'int' to argument 1 of 'svld1_vnum', which expects 'svbool_t'} } */ + svld1_vnum (pg, 0, 0); /* { dg-error {passing 'int' to argument 2 of 'svld1_vnum', which expects a pointer type} } */ + svld1_vnum (pg, s8_ptr, s8_ptr); /* { dg-warning "passing argument 3 of 'svld1_vnum_s8' makes integer from pointer without a cast" } */ + svld1_vnum (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1_vnum', which expects 'int64_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_index_1.c new file mode 100644 index 000000000..91f37f6a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_index_1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99 -Wpointer-sign" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, short *s16_ptr, unsigned short *u16_ptr, + svint8_t s8, svint16_t s16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svld1sh_gather_index (pg, s16_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1sh_gather_index'; did you mean 'svld1_gather_index'} } */ + svld1sh_gather_index_u32 (pg, s16_ptr); /* { dg-error {too few arguments to function 'svld1sh_gather_index_u32'} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1sh_gather_index_u32'} } */ + svld1sh_gather_index_u32 (pg, u16_ptr, s32); /* { dg-warning {pointer targets in passing argument 2 of 'svld1sh_gather_s32index_u32' differ in signedness} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s32); + svld1sh_gather_index_u32 (pg, s16_ptr, u32); + svld1sh_gather_index_u32 (pg, s16_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + svld1sh_gather_index_u32 (pg, s16_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sh_gather_index_u32', which expects a vector of 32-bit integers} } */ + + svld1sh_gather_index_u32 (pg, 0, s32); + svld1sh_gather_index_u32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sh_gather_index_u32', which expects a vector or pointer base address} } */ + + svld1sh_gather_index_u32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ 
+ svld1sh_gather_index_u32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ + svld1sh_gather_index_u32 (pg, u32, 0); + svld1sh_gather_index_u32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1sh_gather_index_u32', which expects 'svuint32_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_1.c new file mode 100644 index 000000000..dae4d0ce1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, + svint8_t s8, svint16_t s16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svld1sb_gather_offset (pg, s8_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1sb_gather_offset'; did you mean 'svld1_gather_offset'} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1sb_gather_offset_s32'} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1sb_gather_offset_s32'} } */ + svld1sb_gather_offset_s32 (pg, s16_ptr, s32); /* { dg-warning {passing argument 2 of 'svld1sb_gather_s32offset_s32' from incompatible pointer type} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr, s32); + svld1sb_gather_offset_s32 (pg, s8_ptr, u32); + svld1sb_gather_offset_s32 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_s32 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sb_gather_offset_s32', which expects a vector of 32-bit integers} } */ + + svld1sb_gather_offset_s32 (pg, 0, s32); + svld1sb_gather_offset_s32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sb_gather_offset_s32', which expects a vector or pointer base address} } */ + + svld1sb_gather_offset_s32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sb_gather_offset_s32', which expects 'svuint32_t'} } */ + svld1sb_gather_offset_s32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sb_gather_offset_s32', which expects 'svuint32_t'} } */ + svld1sb_gather_offset_s32 (pg, u32, 0); + svld1sb_gather_offset_s32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 
'svld1sb_gather_offset_s32', which expects 'svuint32_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_2.c new file mode 100644 index 000000000..1bc66977c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_2.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, + svint8_t s8, svint16_t s16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svld1sb_gather_offset (pg, s8_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1sb_gather_offset'; did you mean 'svld1_gather_offset'} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1sb_gather_offset_u32'} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1sb_gather_offset_u32'} } */ + svld1sb_gather_offset_u32 (pg, s16_ptr, s32); /* { dg-warning {passing argument 2 of 'svld1sb_gather_s32offset_u32' from incompatible pointer type} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr, s32); + svld1sb_gather_offset_u32 (pg, s8_ptr, u32); + svld1sb_gather_offset_u32 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ + svld1sb_gather_offset_u32 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sb_gather_offset_u32', which expects a vector of 32-bit integers} } */ + + svld1sb_gather_offset_u32 (pg, 0, s32); + svld1sb_gather_offset_u32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1sb_gather_offset_u32', which expects a vector or pointer base address} } */ + + svld1sb_gather_offset_u32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sb_gather_offset_u32', which expects 'svuint32_t'} } */ + svld1sb_gather_offset_u32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sb_gather_offset_u32', which expects 'svuint32_t'} } */ + svld1sb_gather_offset_u32 (pg, u32, 0); + svld1sb_gather_offset_u32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1sb_gather_offset_u32', which expects 'svuint32_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_3.c new file mode 100644 index 000000000..6522889db --- 
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_3.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, + svint8_t s8, svint16_t s16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svld1sb_gather_offset (pg, s8_ptr, s64); /* { dg-warning {implicit declaration of function 'svld1sb_gather_offset'; did you mean 'svld1_gather_offset'} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1sb_gather_offset_s64'} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr, s64, 0); /* { dg-error {too many arguments to function 'svld1sb_gather_offset_s64'} } */ + svld1sb_gather_offset_s64 (pg, s16_ptr, s64); /* { dg-warning {passing argument 2 of 'svld1sb_gather_s64offset_s64' from incompatible pointer type} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_s64 (pg, s8_ptr, s64); + svld1sb_gather_offset_s64 (pg, s8_ptr, u64); + svld1sb_gather_offset_s64 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sb_gather_offset_s64', which expects a vector of 64-bit integers} } */ + + svld1sb_gather_offset_s64 (pg, 0, s64); + svld1sb_gather_offset_s64 (pg, s, s64); /* { dg-error {'struct s' to argument 2 of 'svld1sb_gather_offset_s64', which expects a vector or pointer base address} } */ + + svld1sb_gather_offset_s64 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sb_gather_offset_s64', which expects 'svuint64_t'} } */ + svld1sb_gather_offset_s64 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sb_gather_offset_s64', which expects 'svuint64_t'} } */ + svld1sb_gather_offset_s64 (pg, u32, 0); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svld1sb_gather_offset_s64', which expects 'svuint64_t'} } */ + svld1sb_gather_offset_s64 (pg, u64, 0); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_4.c new file mode 100644 index 000000000..025621989 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_4.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, + svint8_t s8, svint16_t s16, 
+ svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svld1sb_gather_offset (pg, s8_ptr, s64); /* { dg-warning {implicit declaration of function 'svld1sb_gather_offset'; did you mean 'svld1_gather_offset'} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1sb_gather_offset_u64'} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr, s64, 0); /* { dg-error {too many arguments to function 'svld1sb_gather_offset_u64'} } */ + svld1sb_gather_offset_u64 (pg, s16_ptr, s64); /* { dg-warning {passing argument 2 of 'svld1sb_gather_s64offset_u64' from incompatible pointer type} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ + svld1sb_gather_offset_u64 (pg, s8_ptr, s64); + svld1sb_gather_offset_u64 (pg, s8_ptr, u64); + svld1sb_gather_offset_u64 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1sb_gather_offset_u64', which expects a vector of 64-bit integers} } */ + + svld1sb_gather_offset_u64 (pg, 0, s64); + svld1sb_gather_offset_u64 (pg, s, s64); /* { dg-error {'struct s' to argument 2 of 'svld1sb_gather_offset_u64', which expects a vector or pointer base address} } */ + + svld1sb_gather_offset_u64 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1sb_gather_offset_u64', which expects 'svuint64_t'} } */ + svld1sb_gather_offset_u64 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1sb_gather_offset_u64', which expects 'svuint64_t'} } */ + svld1sb_gather_offset_u64 (pg, u32, 0); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svld1sb_gather_offset_u64', which expects 'svuint64_t'} } */ + svld1sb_gather_offset_u64 (pg, u64, 0); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_5.c new file mode 100644 index 000000000..8d57aa020 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_ext_gather_offset_5.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, unsigned char *s8_ptr, unsigned short *s16_ptr, + svint8_t s8, svint16_t s16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svld1ub_gather_offset (pg, s8_ptr, s32); /* { dg-warning {implicit declaration of function 'svld1ub_gather_offset'; did you mean 'svld1_gather_offset'} } */ + 
svld1ub_gather_offset_s32 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svld1ub_gather_offset_s32'} } */ + svld1ub_gather_offset_s32 (pg, s8_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1ub_gather_offset_s32'} } */ + svld1ub_gather_offset_s32 (pg, s16_ptr, s32); /* { dg-warning {passing argument 2 of 'svld1ub_gather_s32offset_s32' from incompatible pointer type} } */ + svld1ub_gather_offset_s32 (pg, s8_ptr, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1ub_gather_offset_s32 (pg, s8_ptr, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1ub_gather_offset_s32 (pg, s8_ptr, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1ub_gather_offset_s32 (pg, s8_ptr, s32); + svld1ub_gather_offset_s32 (pg, s8_ptr, u32); + svld1ub_gather_offset_s32 (pg, s8_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1ub_gather_offset_s32 (pg, s8_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1ub_gather_offset_s32 (pg, s8_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ + svld1ub_gather_offset_s32 (pg, s8_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1ub_gather_offset_s32', which expects a vector of 32-bit integers} } */ + + svld1ub_gather_offset_s32 (pg, 0, s32); + svld1ub_gather_offset_s32 (pg, s, s32); /* { dg-error {'struct s' to argument 2 of 'svld1ub_gather_offset_s32', which expects a vector or pointer base address} } */ + + svld1ub_gather_offset_s32 (pg, pg, 0); /* { dg-error {passing 'svbool_t' to argument 2 of 'svld1ub_gather_offset_s32', which expects 'svuint32_t'} } */ + svld1ub_gather_offset_s32 (pg, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svld1ub_gather_offset_s32', which expects 'svuint32_t'} } */ + svld1ub_gather_offset_s32 (pg, u32, 0); + svld1ub_gather_offset_s32 (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1ub_gather_offset_s32', which expects 'svuint32_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_1.c new file mode 100644 index 000000000..21566a9d9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_1.c @@ -0,0 +1,80 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint32_t +f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, + int32_t *s32_ptr, uint32_t *u32_ptr, float *f32_ptr, + int64_t *s64_ptr, uint64_t *u64_ptr, double *f64_ptr, + void *void_ptr, struct s *s_ptr, _Complex float *cf32_ptr, int **ptr_ptr, + svint8_t s8, svint16_t s16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64) +{ + svld1_gather_offset (pg, s32_ptr); /* { dg-error {too few arguments to function 'svld1_gather_offset'} } */ + svld1_gather_offset (pg, s32_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1_gather_offset'} } */ + svld1_gather_offset (0, s32_ptr, s32); /* { dg-error {passing 
'int' to argument 1 of 'svld1_gather_offset', which expects 'svbool_t'} } */ + svld1_gather_offset (pg, 0, s32); /* { dg-error {passing 'int' to argument 2 of 'svld1_gather_offset', which expects a pointer type} } */ + svld1_gather_offset (pg, (int *) 0, s32); + svld1_gather_offset (pg, void_ptr, s32); /* { dg-error {passing 'void \*' to argument 2 of 'svld1_gather_offset', but 'void' is not a valid SVE element type} } */ + svld1_gather_offset (pg, s_ptr, s32); /* { dg-error {passing 'struct s \*' to argument 2 of 'svld1_gather_offset', but 'struct s' is not a valid SVE element type} } */ + svld1_gather_offset (pg, f32_ptr, s32); + svld1_gather_offset (pg, cf32_ptr, s32); /* { dg-error {passing '_Complex float \*' to argument 2 of 'svld1_gather_offset', but 'complex float' is not a valid SVE element type} } */ + svld1_gather_offset (pg, ptr_ptr, u64); /* { dg-error {passing 'int \*\*' to argument 2 of 'svld1_gather_offset', but 'int \*' is not a valid SVE element type} } */ + svld1_gather_offset (pg, u32, 0); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svld1_gather_offset', which expects a pointer type} } */ + /* { dg-message {an explicit type suffix is needed when using a vector of base addresses} "" { target *-*-* } .-1 } */ + svld1_gather_offset (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1_gather_offset', which expects a pointer type} } */ + /* { dg-message {an explicit type suffix is needed when using a vector of base addresses} "" { target *-*-* } .-1 } */ + + svld1_gather_offset (pg, s8_ptr, s8); /* { dg-error {passing 'signed char \*' to argument 2 of 'svld1_gather_offset', which expects a pointer to 32-bit or 64-bit elements} } */ + svld1_gather_offset (pg, s8_ptr, s32); /* { dg-error {passing 'signed char \*' to argument 2 of 'svld1_gather_offset', which expects a pointer to 32-bit or 64-bit elements} } */ + svld1_gather_offset (pg, s16_ptr, s16); /* { dg-error {passing 'short( int)? \*' to argument 2 of 'svld1_gather_offset', which expects a pointer to 32-bit or 64-bit elements} } */ + svld1_gather_offset (pg, s16_ptr, s32); /* { dg-error {passing 'short( int)? 
\*' to argument 2 of 'svld1_gather_offset', which expects a pointer to 32-bit or 64-bit elements} } */ + + svld1_gather_offset (pg, s32_ptr, s32); + svld1_gather_offset (pg, s32_ptr, u32); + svld1_gather_offset (pg, s32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, s32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, s32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, s32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ + + svld1_gather_offset (pg, u32_ptr, s32); + svld1_gather_offset (pg, u32_ptr, u32); + svld1_gather_offset (pg, u32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, u32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, u32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, u32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ + + svld1_gather_offset (pg, f32_ptr, s32); + svld1_gather_offset (pg, f32_ptr, u32); + svld1_gather_offset (pg, f32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, f32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, f32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ + svld1_gather_offset (pg, f32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ + + svld1_gather_offset (pg, s64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, s64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, s64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, s64_ptr, s64); + svld1_gather_offset (pg, s64_ptr, u64); + svld1_gather_offset (pg, s64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ + + svld1_gather_offset (pg, 
u64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, u64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, u64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, u64_ptr, s64); + svld1_gather_offset (pg, u64_ptr, u64); + svld1_gather_offset (pg, u64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ + + svld1_gather_offset (pg, f64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, f64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, f64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ + svld1_gather_offset (pg, f64_ptr, s64); + svld1_gather_offset (pg, f64_ptr, u64); + svld1_gather_offset (pg, f64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_offset', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ + + return svld1_gather_offset (pg, s32_ptr, s32); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_2.c new file mode 100644 index 000000000..4c15fc40c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_gather_sv_2.c @@ -0,0 +1,80 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint32_t +f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, + int32_t *s32_ptr, uint32_t *u32_ptr, float *f32_ptr, + int64_t *s64_ptr, uint64_t *u64_ptr, double *f64_ptr, + void *void_ptr, struct s *s_ptr, _Complex float *cf32_ptr, int **ptr_ptr, + svint8_t s8, svint16_t s16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64) +{ + svld1_gather_index (pg, s32_ptr); /* { dg-error {too few arguments to function 'svld1_gather_index'} } */ + svld1_gather_index (pg, s32_ptr, s32, 0); /* { dg-error {too many arguments to function 'svld1_gather_index'} } */ + svld1_gather_index (0, s32_ptr, s32); /* { dg-error {passing 'int' to argument 1 of 'svld1_gather_index', which expects 'svbool_t'} } */ + svld1_gather_index (pg, 0, s32); /* { dg-error {passing 'int' to argument 2 of 'svld1_gather_index', which expects a pointer type} } */ + svld1_gather_index (pg, (int *) 0, s32); + svld1_gather_index (pg, void_ptr, s32); /* { dg-error {passing 'void \*' to argument 2 of 'svld1_gather_index', but 'void' is not a valid SVE element type} } */ + svld1_gather_index (pg, s_ptr, s32); /* { dg-error {passing 'struct s \*' to argument 2 of 'svld1_gather_index', but 'struct s' is not a valid SVE element type} } */ + svld1_gather_index (pg, 
f32_ptr, s32); + svld1_gather_index (pg, cf32_ptr, s32); /* { dg-error {passing '_Complex float \*' to argument 2 of 'svld1_gather_index', but 'complex float' is not a valid SVE element type} } */ + svld1_gather_index (pg, ptr_ptr, u64); /* { dg-error {passing 'int \*\*' to argument 2 of 'svld1_gather_index', but 'int \*' is not a valid SVE element type} } */ + svld1_gather_index (pg, u32, 0); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svld1_gather_index', which expects a pointer type} } */ + /* { dg-message {an explicit type suffix is needed when using a vector of base addresses} "" { target *-*-* } .-1 } */ + svld1_gather_index (pg, u64, 0); /* { dg-error {passing 'svuint64_t' to argument 2 of 'svld1_gather_index', which expects a pointer type} } */ + /* { dg-message {an explicit type suffix is needed when using a vector of base addresses} "" { target *-*-* } .-1 } */ + + svld1_gather_index (pg, s8_ptr, s8); /* { dg-error {passing 'signed char \*' to argument 2 of 'svld1_gather_index', which expects a pointer to 32-bit or 64-bit elements} } */ + svld1_gather_index (pg, s8_ptr, s32); /* { dg-error {passing 'signed char \*' to argument 2 of 'svld1_gather_index', which expects a pointer to 32-bit or 64-bit elements} } */ + svld1_gather_index (pg, s16_ptr, s16); /* { dg-error {passing 'short( int)? \*' to argument 2 of 'svld1_gather_index', which expects a pointer to 32-bit or 64-bit elements} } */ + svld1_gather_index (pg, s16_ptr, s32); /* { dg-error {passing 'short( int)? \*' to argument 2 of 'svld1_gather_index', which expects a pointer to 32-bit or 64-bit elements} } */ + + svld1_gather_index (pg, s32_ptr, s32); + svld1_gather_index (pg, s32_ptr, u32); + svld1_gather_index (pg, s32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_index (pg, s32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_index (pg, s32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_index (pg, s32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svint32_t' expects a vector of 32-bit integers} } */ + + svld1_gather_index (pg, u32_ptr, s32); + svld1_gather_index (pg, u32_ptr, u32); + svld1_gather_index (pg, u32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_index (pg, u32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_index (pg, u32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ + svld1_gather_index (pg, u32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint32_t' expects a vector of 32-bit integers} } */ + + svld1_gather_index (pg, f32_ptr, s32); + svld1_gather_index (pg, f32_ptr, u32); + svld1_gather_index (pg, f32_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat32_t' expects a vector of 
32-bit integers} } */ + svld1_gather_index (pg, f32_ptr, s64); /* { dg-error {passing 'svint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ + svld1_gather_index (pg, f32_ptr, u64); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ + svld1_gather_index (pg, f32_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat32_t' expects a vector of 32-bit integers} } */ + + svld1_gather_index (pg, s64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, s64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, s64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, s64_ptr, s64); + svld1_gather_index (pg, s64_ptr, u64); + svld1_gather_index (pg, s64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svint64_t' expects a vector of 64-bit integers} } */ + + svld1_gather_index (pg, u64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, u64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, u64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, u64_ptr, s64); + svld1_gather_index (pg, u64_ptr, u64); + svld1_gather_index (pg, u64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svuint64_t' expects a vector of 64-bit integers} } */ + + svld1_gather_index (pg, f64_ptr, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, f64_ptr, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, f64_ptr, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ + svld1_gather_index (pg, f64_ptr, s64); + svld1_gather_index (pg, f64_ptr, u64); + svld1_gather_index (pg, f64_ptr, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svld1_gather_index', which when loading 'svfloat64_t' expects a vector of 64-bit integers} } */ + + return svld1_gather_index (pg, s32_ptr, s32); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_replicate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_replicate_1.c new file mode 100644 index 000000000..d4ff76ea8 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/load_replicate_1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint8_t +f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr, + float *f32_ptr, _Complex float *cf32_ptr, int **ptr_ptr) +{ + svld1rq (pg); /* { dg-error {too few arguments to function 'svld1rq'} } */ + svld1rq (pg, s8_ptr, 0); /* { dg-error {too many arguments to function 'svld1rq'} } */ + svld1rq (0, s8_ptr); /* { dg-error {passing 'int' to argument 1 of 'svld1rq', which expects 'svbool_t'} } */ + svld1rq (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svld1rq', which expects a pointer type} } */ + svld1rq (pg, (int *) 0); + svld1rq (pg, void_ptr); /* { dg-error {passing 'void \*' to argument 2 of 'svld1rq', but 'void' is not a valid SVE element type} } */ + svld1rq (pg, s_ptr); /* { dg-error {passing 'struct s \*' to argument 2 of 'svld1rq', but 'struct s' is not a valid SVE element type} } */ + svld1rq (pg, f32_ptr); + svld1rq (pg, cf32_ptr); /* { dg-error {passing '_Complex float \*' to argument 2 of 'svld1rq', but 'complex float' is not a valid SVE element type} } */ + svld1rq (pg, ptr_ptr); /* { dg-error {passing 'int \*\*' to argument 2 of 'svld1rq', but 'int \*' is not a valid SVE element type} } */ + return svld1rq (pg, s8_ptr); /* { dg-error {incompatible types when returning type 'svint8_t' but 'svuint8_t' was expected} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_1.c new file mode 100644 index 000000000..5b0b00e96 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_1.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=armv8.2-a+sve+i8mm+f32mm+f64mm" } */ + +#include + +svuint32_t +f1 (svint32_t s32, svuint8_t u8, svint8_t s8, svuint32_t u32) +{ + svmmla_s32 (s32); /* { dg-error {too few arguments to function 'svmmla_s32'} } */ + svmmla_s32 (s32, s8, s8, u32); /* { dg-error {too many arguments to function 'svmmla_s32'} } */ + svmmla_s32 (s32, u32, s8); /* { dg-error {incompatible type for argument 2 of 'svmmla_s32'} } */ + svmmla_s32 (s32, u8, s8); /* { dg-error {incompatible type for argument 2 of 'svmmla_s32'} } */ + svmmla_s32 (s32, s8, u8); /* { dg-error {incompatible type for argument 3 of 'svmmla_s32'} } */ + svmmla_s32 (s32, s8, s32); /* { dg-error {incompatible type for argument 3 of 'svmmla_s32'} } */ + svmmla_s32 (s32, s8, 0); /* { dg-error {incompatible type for argument 3 of 'svmmla_s32'} } */ + svmmla_s32 (s32, s8, s8); + return svmmla_s32 (s32, s8, s8); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ +} + +void +f2 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, svint32_t s32, + svfloat16_t f16, svfloat32_t f32, svfloat64_t f64) +{ + svmmla (s32, s8); /* { dg-error {too few arguments to function 'svmmla'} } */ + svmmla (s32, s8, s8, s8); /* { dg-error {too many arguments to function 'svmmla'} } */ + svmmla (0, s8, s8); /* { dg-error {passing 'int' to argument 1 of 'svmmla', which expects an SVE vector type} } */ + svmmla (pg, s8, s8); /* { dg-error {'svmmla' has no form that takes 'svbool_t' arguments} } */ + svmmla (u8, s8, s8); /* { dg-error {'svmmla' has no form that takes 'svuint8_t' arguments} } */ + + svmmla (s32, 0, s8); /* { dg-error {passing 'int' to argument 2 of 'svmmla', which expects an SVE vector type} } */ + svmmla (s32, 
u8, s8); /* { dg-error {arguments 1 and 2 of 'svmmla' must have the same signedness, but the values passed here have type 'svint32_t' and 'svuint8_t' respectively} } */ + svmmla (s32, s8, u8); /* { dg-error {arguments 1 and 3 of 'svmmla' must have the same signedness, but the values passed here have type 'svint32_t' and 'svuint8_t' respectively} } */ + svmmla (s32, s8, 0); /* { dg-error {passing 'int' to argument 3 of 'svmmla', which expects an SVE vector type} } */ + svmmla (s32, s8, s8); + svmmla (s32, s32, s32); /* { dg-error {passing 'svint32_t' instead of the expected 'svint8_t' to argument 2 of 'svmmla', after passing 'svint32_t' to argument 1} } */ + svmmla (s32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svint8_t' to argument 2 of 'svmmla', after passing 'svint32_t' to argument 1} } */ + + svmmla (u32, 0, u8); /* { dg-error {passing 'int' to argument 2 of 'svmmla', which expects an SVE vector type} } */ + svmmla (u32, s8, u8); /* { dg-error {arguments 1 and 2 of 'svmmla' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ + svmmla (u32, u8, s8); /* { dg-error {arguments 1 and 3 of 'svmmla' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ + svmmla (u32, u8, 0); /* { dg-error {passing 'int' to argument 3 of 'svmmla', which expects an SVE vector type} } */ + svmmla (u32, u8, u8); + svmmla (u32, s32, s32); /* { dg-error {passing 'svint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svmmla', after passing 'svuint32_t' to argument 1} } */ + svmmla (u32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svmmla', after passing 'svuint32_t' to argument 1} } */ + + svmmla (f16, s8, s8); /* { dg-error {'svmmla' has no form that takes 'svfloat16_t' arguments} } */ + svmmla (f32, s8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat32_t'} } */ + svmmla (f32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat32_t'} } */ + svmmla (f32, f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat32_t'} } */ + svmmla (f64, f16, f16); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat64_t'} } */ + svmmla (f32, f32, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svmmla', but previous arguments had type 'svfloat32_t'} } */ + svmmla (f64, f32, f16); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svmmla', but previous arguments had type 'svfloat64_t'} } */ + svmmla (f64, f64, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svmmla', but previous arguments had type 'svfloat64_t'} } */ + + svmmla (f16, f16, f16); /* { dg-error {'svmmla' has no form that takes 'svfloat16_t' arguments} } */ + svmmla (f32, f32, f32); + svmmla (f64, f64, f64); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_2.c new file mode 100644 index 000000000..b54725736 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=armv8.2-a+sve" } */ + +#include + +void +f1 (svint32_t s32, svint8_t s8) +{ + svmmla_s32 (s32, s8, s8); /* { dg-error {ACLE function 'svmmla_s32' 
requires ISA extension 'i8mm'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_3.c
new file mode 100644
index 000000000..d1c8297cc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_3.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8.2-a+sve" } */
+
+#include <arm_sve.h>
+
+void
+f1 (svint32_t s32, svint8_t s8)
+{
+  svmmla (s32, s8, s8); /* { dg-error {ACLE function 'svmmla_s32' requires ISA extension 'i8mm'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_4.c
new file mode 100644
index 000000000..e6c3f5f94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_4.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8.2-a+sve" } */
+
+#include <arm_sve.h>
+
+void
+f1 (svfloat32_t f32)
+{
+  svmmla_f32 (f32, f32, f32); /* { dg-error {ACLE function 'svmmla_f32' requires ISA extension 'f32mm'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_5.c
new file mode 100644
index 000000000..8f6f42366
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_5.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8.2-a+sve" } */
+
+#include <arm_sve.h>
+
+void
+f1 (svfloat32_t f32)
+{
+  svmmla (f32, f32, f32); /* { dg-error {ACLE function 'svmmla_f32' requires ISA extension 'f32mm'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_6.c
new file mode 100644
index 000000000..7ebeb4981
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_6.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8.2-a+sve" } */
+
+#include <arm_sve.h>
+
+void
+f1 (svfloat64_t f64)
+{
+  svmmla_f64 (f64, f64, f64); /* { dg-error {ACLE function 'svmmla_f64' requires ISA extension 'f64mm'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_7.c
new file mode 100644
index 000000000..e64ec1ea6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/mmla_7.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8.2-a+sve" } */
+
+#include <arm_sve.h>
+
+void
+f1 (svfloat64_t f64)
+{
+  svmmla (f64, f64, f64); /* { dg-error {ACLE function 'svmmla_f64' requires ISA extension 'f64mm'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/pattern_pred_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/pattern_pred_1.c
new file mode 100644
index 000000000..99b61bdf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/pattern_pred_1.c
@@ -0,0 +1,14 @@
+#include <arm_sve.h>
+
+void
+test ()
+{
+  svptrue_pat_b16 ((enum svpattern) -1); /* { dg-error {passing 4294967295 to argument 1 of 'svptrue_pat_b16', which expects a valid 'enum svpattern' value} } */
+  svptrue_pat_b16 ((enum svpattern) 0);
+  svptrue_pat_b16 ((enum svpattern) 13);
+  svptrue_pat_b16 ((enum svpattern) 14); /* { dg-error {passing 14 to argument 1 of 'svptrue_pat_b16', which expects a valid 'enum svpattern' value} } */
+  svptrue_pat_b16 ((enum svpattern) 28); /* { dg-error {passing 28 to argument 1 of 'svptrue_pat_b16', which expects a valid 'enum svpattern' value} } */
+  svptrue_pat_b16 ((enum
svpattern) 29); + svptrue_pat_b16 ((enum svpattern) 31); + svptrue_pat_b16 ((enum svpattern) 32); /* { dg-error {passing 32 to argument 1 of 'svptrue_pat_b16', which expects a valid 'enum svpattern' value} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c new file mode 100644 index 000000000..316f77fc7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +void +f1 (svbool_t pg, int32_t *s32_ptr, enum svprfop op) +{ + svprfb (pg, s32_ptr, op); /* { dg-error {argument 3 of 'svprfb' must be an integer constant expression} } */ + svprfb (pg, s32_ptr, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ + svprfb (pg, s32_ptr, (enum svprfop) 0); + svprfb (pg, s32_ptr, (enum svprfop) 5); + svprfb (pg, s32_ptr, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ + svprfb (pg, s32_ptr, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ + svprfb (pg, s32_ptr, (enum svprfop) 8); + svprfb (pg, s32_ptr, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb', which expects a valid 'enum svprfop' value} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c new file mode 100644 index 000000000..c33c95440 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_1.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, int32_t *s32_ptr, void *void_ptr, void **ptr_ptr, + svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, svfloat16_t f16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, enum svprfop op, + struct s s) +{ + svprfh_gather_index (pg, s32_ptr, s32); /* { dg-error {too few arguments to function 'svprfh_gather_index'} } */ + svprfh_gather_index (pg, s32_ptr, s32, SV_PLDL1KEEP, 0); /* { dg-error {too many arguments to function 'svprfh_gather_index'} } */ + svprfh_gather_index (0, s32_ptr, s32, SV_PLDL1KEEP); /* { dg-error {passing 'int' to argument 1 of 'svprfh_gather_index', which expects 'svbool_t'} } */ + svprfh_gather_index (pg, 0, s32, SV_PLDL1KEEP); + svprfh_gather_index (pg, (int *) 0, s32, SV_PLDL1KEEP); + svprfh_gather_index (pg, void_ptr, s32, SV_PLDL1KEEP); + svprfh_gather_index (pg, ptr_ptr, s32, SV_PLDL1KEEP); + svprfh_gather_index (pg, s, s32, SV_PLDL1KEEP); /* { dg-error {passing 'struct s' to argument 2 of 'svprfh_gather_index', which expects a vector or pointer base address} } */ + + svprfh_gather_index (pg, s32_ptr, s8, SV_PLDL1KEEP); /* { dg-error {passing 'svint8_t' to argument 3 of 'svprfh_gather_index', which expects a vector of 32-bit or 64-bit integers} } */ + svprfh_gather_index (pg, s32_ptr, u8, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svprfh_gather_index', which expects a vector of 32-bit or 64-bit integers} } */ + svprfh_gather_index (pg, s32_ptr, s16, SV_PLDL1KEEP); /* { dg-error {passing 'svint16_t' to argument 3 of 'svprfh_gather_index', which expects a vector of 32-bit or 64-bit integers} } */ + 
svprfh_gather_index (pg, s32_ptr, u16, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svprfh_gather_index', which expects a vector of 32-bit or 64-bit integers} } */ + svprfh_gather_index (pg, s32_ptr, f16, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svprfh_gather_index', which expects a vector of integers} } */ + svprfh_gather_index (pg, s32_ptr, s32, SV_PLDL1KEEP); + svprfh_gather_index (pg, s32_ptr, u32, SV_PLDL1KEEP); + svprfh_gather_index (pg, s32_ptr, f32, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svprfh_gather_index', which expects a vector of integers} } */ + svprfh_gather_index (pg, s32_ptr, s64, SV_PLDL1KEEP); + svprfh_gather_index (pg, s32_ptr, u64, SV_PLDL1KEEP); + svprfh_gather_index (pg, s32_ptr, f64, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svprfh_gather_index', which expects a vector of integers} } */ + + svprfh_gather_index (pg, u8, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfh_gather_index (pg, u16, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfh_gather_index (pg, s32, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svint32_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfh_gather_index (pg, u32, 0, SV_PLDL1KEEP); + svprfh_gather_index (pg, f32, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfh_gather_index (pg, s64, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svint64_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfh_gather_index (pg, u64, 0, SV_PLDL1KEEP); + svprfh_gather_index (pg, f64, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svprfh_gather_index', which expects 'svuint32_t' or 'svuint64_t'} } */ + + svprfh_gather_index (pg, s32_ptr, s32, op); /* { dg-error {argument 4 of 'svprfh_gather_index' must be an integer constant expression} } */ + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 0); + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 5); + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 8); + svprfh_gather_index (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfh_gather_index', which expects a valid 'enum svprfop' value} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c new file mode 100644 index 000000000..3d7797305 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_index_2.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +void +f1 (svbool_t pg, 
int32_t *s32_ptr, svint32_t s32, enum svprfop op) +{ + svprfh_gather_s32index (pg, s32_ptr, s32, op); /* { dg-error {argument 4 of 'svprfh_gather_s32index' must be an integer constant expression} } */ + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 0); + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 5); + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 8); + svprfh_gather_s32index (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfh_gather_s32index', which expects a valid 'enum svprfop' value} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c new file mode 100644 index 000000000..cc61901cb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_1.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { int i; }; + +void +f1 (svbool_t pg, int32_t *s32_ptr, void *void_ptr, void **ptr_ptr, + svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, svfloat16_t f16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, enum svprfop op, + struct s s) +{ + svprfb_gather_offset (pg, s32_ptr, s32); /* { dg-error {too few arguments to function 'svprfb_gather_offset'} } */ + svprfb_gather_offset (pg, s32_ptr, s32, SV_PLDL1KEEP, 0); /* { dg-error {too many arguments to function 'svprfb_gather_offset'} } */ + svprfb_gather_offset (0, s32_ptr, s32, SV_PLDL1KEEP); /* { dg-error {passing 'int' to argument 1 of 'svprfb_gather_offset', which expects 'svbool_t'} } */ + svprfb_gather_offset (pg, 0, s32, SV_PLDL1KEEP); + svprfb_gather_offset (pg, (int *) 0, s32, SV_PLDL1KEEP); + svprfb_gather_offset (pg, void_ptr, s32, SV_PLDL1KEEP); + svprfb_gather_offset (pg, ptr_ptr, s32, SV_PLDL1KEEP); + svprfb_gather_offset (pg, s, s32, SV_PLDL1KEEP); /* { dg-error {passing 'struct s' to argument 2 of 'svprfb_gather_offset', which expects a vector or pointer base address} } */ + + svprfb_gather_offset (pg, s32_ptr, s8, SV_PLDL1KEEP); /* { dg-error {passing 'svint8_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of 32-bit or 64-bit integers} } */ + svprfb_gather_offset (pg, s32_ptr, u8, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of 32-bit or 64-bit integers} } */ + svprfb_gather_offset (pg, s32_ptr, s16, SV_PLDL1KEEP); /* { dg-error {passing 'svint16_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of 32-bit or 64-bit integers} } */ + svprfb_gather_offset (pg, s32_ptr, u16, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of 32-bit or 64-bit integers} } */ + svprfb_gather_offset (pg, s32_ptr, f16, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svprfb_gather_offset', 
which expects a vector of integers} } */ + svprfb_gather_offset (pg, s32_ptr, s32, SV_PLDL1KEEP); + svprfb_gather_offset (pg, s32_ptr, u32, SV_PLDL1KEEP); + svprfb_gather_offset (pg, s32_ptr, f32, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of integers} } */ + svprfb_gather_offset (pg, s32_ptr, s64, SV_PLDL1KEEP); + svprfb_gather_offset (pg, s32_ptr, u64, SV_PLDL1KEEP); + svprfb_gather_offset (pg, s32_ptr, f64, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svprfb_gather_offset', which expects a vector of integers} } */ + + svprfb_gather_offset (pg, u8, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather_offset (pg, u16, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather_offset (pg, s32, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svint32_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather_offset (pg, u32, 0, SV_PLDL1KEEP); + svprfb_gather_offset (pg, f32, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather_offset (pg, s64, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svint64_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather_offset (pg, u64, 0, SV_PLDL1KEEP); + svprfb_gather_offset (pg, f64, 0, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svprfb_gather_offset', which expects 'svuint32_t' or 'svuint64_t'} } */ + + svprfb_gather_offset (pg, s32_ptr, s32, op); /* { dg-error {argument 4 of 'svprfb_gather_offset' must be an integer constant expression} } */ + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 0); + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 5); + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 8); + svprfb_gather_offset (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfb_gather_offset', which expects a valid 'enum svprfop' value} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c new file mode 100644 index 000000000..b74721fad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_2.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svfloat16_t f16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, enum svprfop op) +{ + svprfb_gather (pg, u32); /* { dg-error {too few arguments to function 'svprfb_gather'} } 
*/ + svprfb_gather (pg, u32, SV_PLDL1KEEP, 0); /* { dg-error {too many arguments to function 'svprfb_gather'} } */ + svprfb_gather (0, u32, SV_PLDL1KEEP); /* { dg-error {passing 'int' to argument 1 of 'svprfb_gather', which expects 'svbool_t'} } */ + svprfb_gather (pg, 0, SV_PLDL1KEEP); /* { dg-error {passing 'int' to argument 2 of 'svprfb_gather', which expects an SVE vector type} } */ + + svprfb_gather (pg, s8, SV_PLDL1KEEP); /* { dg-error {passing 'svint8_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather (pg, u8, SV_PLDL1KEEP); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather (pg, s16, SV_PLDL1KEEP); /* { dg-error {passing 'svint16_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather (pg, u16, SV_PLDL1KEEP); /* { dg-error {passing 'svuint16_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather (pg, f16, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat16_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather (pg, s32, SV_PLDL1KEEP); /* { dg-error {passing 'svint32_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather (pg, u32, SV_PLDL1KEEP); + svprfb_gather (pg, f32, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather (pg, s64, SV_PLDL1KEEP); /* { dg-error {passing 'svint64_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + svprfb_gather (pg, u64, SV_PLDL1KEEP); + svprfb_gather (pg, f64, SV_PLDL1KEEP); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svprfb_gather', which expects 'svuint32_t' or 'svuint64_t'} } */ + + svprfb_gather (pg, u32, op); /* { dg-error {argument 3 of 'svprfb_gather' must be an integer constant expression} } */ + svprfb_gather (pg, u32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ + svprfb_gather (pg, u32, (enum svprfop) 0); + svprfb_gather (pg, u32, (enum svprfop) 5); + svprfb_gather (pg, u32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ + svprfb_gather (pg, u32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ + svprfb_gather (pg, u32, (enum svprfop) 8); + svprfb_gather (pg, u32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb_gather', which expects a valid 'enum svprfop' value} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c new file mode 100644 index 000000000..24b4aa190 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_3.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +void +f1 (svbool_t pg, int32_t *s32_ptr, svint32_t s32, enum svprfop op) +{ + svprfb_gather_s32offset (pg, s32_ptr, s32, op); /* { dg-error {argument 4 of 'svprfb_gather_s32offset' must be an integer constant expression} } */ + svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 4 of 
'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */
+  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 0);
+  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 5);
+  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */
+  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */
+  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 8);
+  svprfb_gather_s32offset (pg, s32_ptr, s32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 4 of 'svprfb_gather_s32offset', which expects a valid 'enum svprfop' value} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c
new file mode 100644
index 000000000..63ccdc5a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/prefetch_gather_offset_4.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-std=c99" } */
+
+#include <arm_sve.h>
+
+void
+f1 (svbool_t pg, svuint32_t u32, enum svprfop op)
+{
+  svprfb_gather_u32base (pg, u32, op); /* { dg-error {argument 3 of 'svprfb_gather_u32base' must be an integer constant expression} } */
+  svprfb_gather_u32base (pg, u32, (enum svprfop) -1); /* { dg-error {passing 4294967295 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */
+  svprfb_gather_u32base (pg, u32, (enum svprfop) 0);
+  svprfb_gather_u32base (pg, u32, (enum svprfop) 5);
+  svprfb_gather_u32base (pg, u32, (enum svprfop) 6); /* { dg-error {passing 6 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */
+  svprfb_gather_u32base (pg, u32, (enum svprfop) 7); /* { dg-error {passing 7 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */
+  svprfb_gather_u32base (pg, u32, (enum svprfop) 8);
+  svprfb_gather_u32base (pg, u32, (enum svprfop) 14); /* { dg-error {passing 14 to argument 3 of 'svprfb_gather_u32base', which expects a valid 'enum svprfop' value} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_1.c
new file mode 100644
index 000000000..ab0ef304a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+void
+f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32,
+    svuint32x2_t u32x2)
+{
+  svorv (pg); /* { dg-error {too few arguments to function 'svorv'} } */
+  svorv (pg, u32, u32); /* { dg-error {too many arguments to function 'svorv'} } */
+  svorv (0, u32); /* { dg-error {passing 'int' to argument 1 of 'svorv', which expects 'svbool_t'} } */
+  svorv (u32, u32); /* { dg-error {passing 'svuint32_t' to argument 1 of 'svorv', which expects 'svbool_t'} } */
+  svorv (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svorv', which expects an SVE vector type} } */
+  svorv (pg, pg); /* { dg-error {'svorv' has no form that takes 'svbool_t' arguments} } */
+  svorv (pg, s32);
+  svorv (pg, u32);
+  svorv (pg, f32); /* { dg-error {'svorv' has no form that takes 'svfloat32_t' arguments} } */
+  svorv (pg, u32x2); /* { dg-error {passing 'svuint32x2_t' to argument 2 of 'svorv', which expects a single SVE vector rather than a
tuple} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_wide_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_wide_1.c new file mode 100644 index 000000000..f99a2887b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/reduction_wide_1.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32, + svuint32x2_t u32x2) +{ + svaddv (pg); /* { dg-error {too few arguments to function 'svaddv'} } */ + svaddv (pg, u32, u32); /* { dg-error {too many arguments to function 'svaddv'} } */ + svaddv (0, u32); /* { dg-error {passing 'int' to argument 1 of 'svaddv', which expects 'svbool_t'} } */ + svaddv (u32, u32); /* { dg-error {passing 'svuint32_t' to argument 1 of 'svaddv', which expects 'svbool_t'} } */ + svaddv (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svaddv', which expects an SVE vector type} } */ + svaddv (pg, pg); /* { dg-error {'svaddv' has no form that takes 'svbool_t' arguments} } */ + svaddv (pg, s32); + svaddv (pg, u32); + svaddv (pg, f32); + svaddv (pg, u32x2); /* { dg-error {passing 'svuint32x2_t' to argument 2 of 'svaddv', which expects a single SVE vector rather than a tuple} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_1.c new file mode 100644 index 000000000..f07c76102 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svuint8x3_t u8x3, int x) +{ + const int one = 1; + svfloat64_t f64; + + u8x2 = svset2 (u8x2); /* { dg-error {too few arguments to function 'svset2'} } */ + u8x2 = svset2 (u8x2, 1); /* { dg-error {too few arguments to function 'svset2'} } */ + u8x2 = svset2 (u8x2, 1, u8, 3); /* { dg-error {too many arguments to function 'svset2'} } */ + u8x2 = svset2 (u8, 0, u8); /* { dg-error {passing single vector 'svuint8_t' to argument 1 of 'svset2', which expects a tuple of 2 vectors} } */ + u8x2 = svset2 (u8x3, 0, u8); /* { dg-error {passing 'svuint8x3_t' to argument 1 of 'svset2', which expects a tuple of 2 vectors} } */ + u8x2 = svset2 (pg, 0, u8); /* { dg-error {passing 'svbool_t' to argument 1 of 'svset2', which expects a tuple of 2 vectors} } */ + u8x2 = svset2 (u8x2, 0, u8x2); /* { dg-error {passing 'svuint8x2_t' to argument 3 of 'svset2', which expects a single SVE vector rather than a tuple} } */ + u8x2 = svset2 (u8x2, 0, f64); /* { dg-error {passing 'svfloat64_t' instead of the expected 'svuint8_t' to argument 3 of 'svset2', after passing 'svuint8x2_t' to argument 1} } */ + u8x2 = svset2 (u8x2, 0, pg); /* { dg-error {passing 'svbool_t' instead of the expected 'svuint8_t' to argument 3 of 'svset2', after passing 'svuint8x2_t' to argument 1} } */ + u8x2 = svset2 (u8x2, x, u8); /* { dg-error {argument 2 of 'svset2' must be an integer constant expression} } */ + u8x2 = svset2 (u8x2, 0, u8); + f64 = svset2 (u8x2, 0, u8); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svuint8x2_t'} } */ + u8x2 = svset2 (u8x2, 1, u8); + u8x2 = svset2 (u8x2, 2, u8); /* { dg-error {passing 2 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, 3, u8); /* { dg-error {passing 3 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 
(u8x2, 4, u8); /* { dg-error {passing 4 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, 5, u8); /* { dg-error {passing 5 to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, ~0U, u8); /* { dg-error {passing [^ ]* to argument 2 of 'svset2', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2 (u8x2, one, u8); /* { dg-error {argument 2 of 'svset2' must be an integer constant expression} } */ + u8x2 = svset2 (u8x2, 3 - 2, u8); + u8x2 = svset2 (u8x2, 1.0, u8); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_2.c new file mode 100644 index 000000000..ae277eafd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_2.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svuint8_t u8, svuint8x2_t u8x2, svint8x2_t s8x2, + svuint8x3_t u8x3, int x) +{ + const int one = 1; + svfloat64_t f64; + + u8x2 = svset2_u8 (u8x2); /* { dg-error {too few arguments to function 'svset2_u8'} } */ + u8x2 = svset2_u8 (u8x2, 1); /* { dg-error {too few arguments to function 'svset2_u8'} } */ + u8x2 = svset2_u8 (u8x2, 1, u8, 3); /* { dg-error {too many arguments to function 'svset2_u8'} } */ + u8x2 = svset2_u8 (u8, 0, u8); /* { dg-error {incompatible type for argument 1 of 'svset2_u8'} } */ + u8x2 = svset2_u8 (s8x2, 0, u8); /* { dg-error {incompatible type for argument 1 of 'svset2_u8'} } */ + u8x2 = svset2_u8 (u8x3, 0, u8); /* { dg-error {incompatible type for argument 1 of 'svset2_u8'} } */ + u8x2 = svset2_u8 (pg, 0, u8); /* { dg-error {incompatible type for argument 1 of 'svset2_u8'} } */ + u8x2 = svset2_u8 (u8x2, 0, u8x2); /* { dg-error {incompatible type for argument 3 of 'svset2_u8'} } */ + u8x2 = svset2_u8 (u8x2, 0, f64); /* { dg-error {incompatible type for argument 3 of 'svset2_u8'} } */ + u8x2 = svset2_u8 (u8x2, 0, pg); /* { dg-error {incompatible type for argument 3 of 'svset2_u8'} } */ + u8x2 = svset2_u8 (u8x2, x, u8); /* { dg-error {argument 2 of 'svset2_u8' must be an integer constant expression} } */ + u8x2 = svset2_u8 (u8x2, 0, u8); + f64 = svset2_u8 (u8x2, 0, u8); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svuint8x2_t'} } */ + u8x2 = svset2_u8 (u8x2, 1, u8); + u8x2 = svset2_u8 (u8x2, 2, u8); /* { dg-error {passing 2 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, 3, u8); /* { dg-error {passing 3 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, 4, u8); /* { dg-error {passing 4 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, 5, u8); /* { dg-error {passing 5 to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, ~0U, u8); /* { dg-error {passing [^ ]* to argument 2 of 'svset2_u8', which expects a value in the range \[0, 1\]} } */ + u8x2 = svset2_u8 (u8x2, one, u8); /* { dg-error {argument 2 of 'svset2_u8' must be an integer constant expression} } */ + u8x2 = svset2_u8 (u8x2, 3 - 2, u8); + u8x2 = svset2_u8 (u8x2, 1.0, u8); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_3.c new file mode 100644 index 000000000..543a1bea8 --- 
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_3.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svfloat16x4_t f16x4, + int x) +{ + const int one = 1; + svfloat64_t f64; + + f16x3 = svset3 (f16x3); /* { dg-error {too few arguments to function 'svset3'} } */ + f16x3 = svset3 (f16x3, 1); /* { dg-error {too few arguments to function 'svset3'} } */ + f16x3 = svset3 (f16x3, 1, f16, 3); /* { dg-error {too many arguments to function 'svset3'} } */ + f16x3 = svset3 (f16, 0, f16); /* { dg-error {passing single vector 'svfloat16_t' to argument 1 of 'svset3', which expects a tuple of 3 vectors} } */ + f16x3 = svset3 (f16x4, 0, f16); /* { dg-error {passing 'svfloat16x4_t' to argument 1 of 'svset3', which expects a tuple of 3 vectors} } */ + f16x3 = svset3 (pg, 0, f16); /* { dg-error {passing 'svbool_t' to argument 1 of 'svset3', which expects a tuple of 3 vectors} } */ + f16x3 = svset3 (f16x3, 0, f16x3); /* { dg-error {passing 'svfloat16x3_t' to argument 3 of 'svset3', which expects a single SVE vector rather than a tuple} } */ + f16x3 = svset3 (f16x3, 0, f64); /* { dg-error {passing 'svfloat64_t' instead of the expected 'svfloat16_t' to argument 3 of 'svset3', after passing 'svfloat16x3_t' to argument 1} } */ + f16x3 = svset3 (f16x3, 0, pg); /* { dg-error {passing 'svbool_t' instead of the expected 'svfloat16_t' to argument 3 of 'svset3', after passing 'svfloat16x3_t' to argument 1} } */ + f16x3 = svset3 (f16x3, x, f16); /* { dg-error {argument 2 of 'svset3' must be an integer constant expression} } */ + f16x3 = svset3 (f16x3, 0, f16); + f64 = svset3 (f16x3, 0, f16); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svfloat16x3_t'} } */ + f16x3 = svset3 (f16x3, 1, f16); + f16x3 = svset3 (f16x3, 2, f16); + f16x3 = svset3 (f16x3, 3, f16); /* { dg-error {passing 3 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, 4, f16); /* { dg-error {passing 4 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, 5, f16); /* { dg-error {passing 5 to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, ~0U, f16); /* { dg-error {passing [^ ]* to argument 2 of 'svset3', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3 (f16x3, one, f16); /* { dg-error {argument 2 of 'svset3' must be an integer constant expression} } */ + f16x3 = svset3 (f16x3, 3 - 2, f16); + f16x3 = svset3 (f16x3, 1.0, f16); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_4.c new file mode 100644 index 000000000..198b03407 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_4.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svfloat16_t f16, svfloat16x3_t f16x3, svuint16x3_t u16x3, + svfloat16x4_t f16x4, int x) +{ + const int one = 1; + svfloat64_t f64; + + f16x3 = svset3_f16 (f16x3); /* { dg-error {too few arguments to function 'svset3_f16'} } */ + f16x3 = svset3_f16 (f16x3, 1); /* { dg-error {too few arguments to function 'svset3_f16'} } */ + f16x3 = svset3_f16 (f16x3, 1, f16, 3); /* { dg-error {too many arguments to function 'svset3_f16'} } */ + f16x3 = svset3_f16 (f16, 0, f16); /* 
{ dg-error {incompatible type for argument 1 of 'svset3_f16'} } */ + f16x3 = svset3_f16 (u16x3, 0, f16); /* { dg-error {incompatible type for argument 1 of 'svset3_f16'} } */ + f16x3 = svset3_f16 (f16x4, 0, f16); /* { dg-error {incompatible type for argument 1 of 'svset3_f16'} } */ + f16x3 = svset3_f16 (pg, 0, f16); /* { dg-error {incompatible type for argument 1 of 'svset3_f16'} } */ + f16x3 = svset3_f16 (f16x3, 0, f16x3); /* { dg-error {incompatible type for argument 3 of 'svset3_f16'} } */ + f16x3 = svset3_f16 (f16x3, 0, f64); /* { dg-error {incompatible type for argument 3 of 'svset3_f16'} } */ + f16x3 = svset3_f16 (f16x3, 0, pg); /* { dg-error {incompatible type for argument 3 of 'svset3_f16'} } */ + f16x3 = svset3_f16 (f16x3, x, f16); /* { dg-error {argument 2 of 'svset3_f16' must be an integer constant expression} } */ + f16x3 = svset3_f16 (f16x3, 0, f16); + f64 = svset3_f16 (f16x3, 0, f16); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svfloat16x3_t'} } */ + f16x3 = svset3_f16 (f16x3, 1, f16); + f16x3 = svset3_f16 (f16x3, 2, f16); + f16x3 = svset3_f16 (f16x3, 3, f16); /* { dg-error {passing 3 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, 4, f16); /* { dg-error {passing 4 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, 5, f16); /* { dg-error {passing 5 to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, ~0U, f16); /* { dg-error {passing [^ ]* to argument 2 of 'svset3_f16', which expects a value in the range \[0, 2\]} } */ + f16x3 = svset3_f16 (f16x3, one, f16); /* { dg-error {argument 2 of 'svset3_f16' must be an integer constant expression} } */ + f16x3 = svset3_f16 (f16x3, 3 - 2, f16); + f16x3 = svset3_f16 (f16x3, 1.0, f16); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_5.c new file mode 100644 index 000000000..be911a731 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_5.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svint32x2_t s32x2, int x) +{ + const int one = 1; + svfloat64_t f64; + + s32x4 = svset4 (s32x4); /* { dg-error {too few arguments to function 'svset4'} } */ + s32x4 = svset4 (s32x4, 1); /* { dg-error {too few arguments to function 'svset4'} } */ + s32x4 = svset4 (s32x4, 1, s32, 3); /* { dg-error {too many arguments to function 'svset4'} } */ + s32x4 = svset4 (s32, 0, s32); /* { dg-error {passing single vector 'svint32_t' to argument 1 of 'svset4', which expects a tuple of 4 vectors} } */ + s32x4 = svset4 (s32x2, 0, s32); /* { dg-error {passing 'svint32x2_t' to argument 1 of 'svset4', which expects a tuple of 4 vectors} } */ + s32x4 = svset4 (pg, 0, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svset4', which expects a tuple of 4 vectors} } */ + s32x4 = svset4 (s32x4, 0, s32x4); /* { dg-error {passing 'svint32x4_t' to argument 3 of 'svset4', which expects a single SVE vector rather than a tuple} } */ + s32x4 = svset4 (s32x4, 0, f64); /* { dg-error {passing 'svfloat64_t' instead of the expected 'svint32_t' to argument 3 of 'svset4', after passing 'svint32x4_t' to argument 1} } */ + s32x4 = svset4 (s32x4, 0, pg); /* { dg-error {passing 'svbool_t' instead of the expected 'svint32_t' 
to argument 3 of 'svset4', after passing 'svint32x4_t' to argument 1} } */ + s32x4 = svset4 (s32x4, x, s32); /* { dg-error {argument 2 of 'svset4' must be an integer constant expression} } */ + s32x4 = svset4 (s32x4, 0, s32); + f64 = svset4 (s32x4, 0, s32); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svint32x4_t'} } */ + s32x4 = svset4 (s32x4, 1, s32); + s32x4 = svset4 (s32x4, 2, s32); + s32x4 = svset4 (s32x4, 3, s32); + s32x4 = svset4 (s32x4, 4, s32); /* { dg-error {passing 4 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4 (s32x4, 5, s32); /* { dg-error {passing 5 to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4 (s32x4, ~0U, s32); /* { dg-error {passing [^ ]* to argument 2 of 'svset4', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4 (s32x4, one, s32); /* { dg-error {argument 2 of 'svset4' must be an integer constant expression} } */ + s32x4 = svset4 (s32x4, 3 - 2, s32); + s32x4 = svset4 (s32x4, 1.0, s32); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_6.c new file mode 100644 index 000000000..cec435413 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/set_6.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +svfloat64_t +f1 (svbool_t pg, svint32_t s32, svint32x4_t s32x4, svfloat32x4_t f32x4, + svint32x2_t s32x2, int x) +{ + const int one = 1; + svfloat64_t f64; + + s32x4 = svset4_s32 (s32x4); /* { dg-error {too few arguments to function 'svset4_s32'} } */ + s32x4 = svset4_s32 (s32x4, 1); /* { dg-error {too few arguments to function 'svset4_s32'} } */ + s32x4 = svset4_s32 (s32x4, 1, s32, 3); /* { dg-error {too many arguments to function 'svset4_s32'} } */ + s32x4 = svset4_s32 (s32, 0, s32); /* { dg-error {incompatible type for argument 1 of 'svset4_s32'} } */ + s32x4 = svset4_s32 (f32x4, 0, s32); /* { dg-error {incompatible type for argument 1 of 'svset4_s32'} } */ + s32x4 = svset4_s32 (s32x2, 0, s32); /* { dg-error {incompatible type for argument 1 of 'svset4_s32'} } */ + s32x4 = svset4_s32 (pg, 0, s32); /* { dg-error {incompatible type for argument 1 of 'svset4_s32'} } */ + s32x4 = svset4_s32 (s32x4, 0, s32x4); /* { dg-error {incompatible type for argument 3 of 'svset4_s32'} } */ + s32x4 = svset4_s32 (s32x4, 0, f64); /* { dg-error {incompatible type for argument 3 of 'svset4_s32'} } */ + s32x4 = svset4_s32 (s32x4, 0, pg); /* { dg-error {incompatible type for argument 3 of 'svset4_s32'} } */ + s32x4 = svset4_s32 (s32x4, x, s32); /* { dg-error {argument 2 of 'svset4_s32' must be an integer constant expression} } */ + s32x4 = svset4_s32 (s32x4, 0, s32); + f64 = svset4_s32 (s32x4, 0, s32); /* { dg-error {incompatible types when assigning to type 'svfloat64_t' from type 'svint32x4_t'} } */ + s32x4 = svset4_s32 (s32x4, 1, s32); + s32x4 = svset4_s32 (s32x4, 2, s32); + s32x4 = svset4_s32 (s32x4, 3, s32); + s32x4 = svset4_s32 (s32x4, 4, s32); /* { dg-error {passing 4 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4_s32 (s32x4, 5, s32); /* { dg-error {passing 5 to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4_s32 (s32x4, ~0U, s32); /* { dg-error {passing [^ ]* to argument 2 of 'svset4_s32', which expects a value in the range \[0, 3\]} } */ + s32x4 = svset4_s32 (s32x4, 
one, s32); /* { dg-error {argument 2 of 'svset4_s32' must be an integer constant expression} } */ + s32x4 = svset4_s32 (s32x4, 3 - 2, s32); + s32x4 = svset4_s32 (s32x4, 1.0, s32); + + return f64; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_1.c new file mode 100644 index 000000000..4dd9a9c76 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +void +f1 (svbool_t pg, svuint8_t u8, svint8_t s8, svint16_t s16, + svint32_t s32, svint64_t s64, int x) +{ + const int one = 1; + u8 = svasrd_x (pg, u8, 1); /* { dg-error {'svasrd_x' has no form that takes 'svuint8_t' arguments} } */ + s8 = svasrd_x (pg, s8, x); /* { dg-error {argument 3 of 'svasrd_x' must be an integer constant expression} } */ + s8 = svasrd_x (pg, s8, one); /* { dg-error {argument 3 of 'svasrd_x' must be an integer constant expression} } */ + s8 = svasrd_x (pg, s8, 0.4); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_x (pg, s8, 1.0); + s8 = svasrd_x (pg, s8, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_x (pg, s8, 1); + s8 = svasrd_x (pg, s8, 1 + 1); + s8 = svasrd_x (pg, s8, 8); + s8 = svasrd_x (pg, s8, 9); /* { dg-error {passing 9 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_x (pg, s8, (1ULL << 62) + 1); /* { dg-error {passing [^ ]* to argument 3 of 'svasrd_x', which expects a value in the range \[1, 8\]} } */ + s16 = svasrd_x (pg, s16, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} } */ + s16 = svasrd_x (pg, s16, 1); + s16 = svasrd_x (pg, s16, 16); + s16 = svasrd_x (pg, s16, 17); /* { dg-error {passing 17 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 16\]} } */ + s32 = svasrd_x (pg, s32, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} } */ + s32 = svasrd_x (pg, s32, 1); + s32 = svasrd_x (pg, s32, 32); + s32 = svasrd_x (pg, s32, 33); /* { dg-error {passing 33 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 32\]} } */ + s64 = svasrd_x (pg, s64, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} } */ + s64 = svasrd_x (pg, s64, 1); + s64 = svasrd_x (pg, s64, 64); + s64 = svasrd_x (pg, s64, 65); /* { dg-error {passing 65 to argument 3 of 'svasrd_x', which expects a value in the range \[1, 64\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_2.c new file mode 100644 index 000000000..4970689e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/shift_right_imm_2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-std=c99 -Wall -Wextra" } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svint16_t s16, svint32_t s32, svint64_t s64, + int x) +{ + const int one = 1; + s8 = svasrd_n_s8_x (pg, s8, x); /* { dg-error {argument 3 of 'svasrd_n_s8_x' must be an integer constant expression} } */ + s8 = svasrd_n_s8_x (pg, s8, one); /* { dg-error {argument 3 of 'svasrd_n_s8_x' must be an integer constant expression} } */ + s8 = 
svasrd_n_s8_x (pg, s8, 0.4); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_n_s8_x (pg, s8, 1.0); + s8 = svasrd_n_s8_x (pg, s8, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_n_s8_x (pg, s8, 1); + s8 = svasrd_n_s8_x (pg, s8, 1 + 1); + s8 = svasrd_n_s8_x (pg, s8, 8); + s8 = svasrd_n_s8_x (pg, s8, 9); /* { dg-error {passing 9 to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ + s8 = svasrd_n_s8_x (pg, s8, (1ULL << 62) + 1); /* { dg-error {passing [^ ]* to argument 3 of 'svasrd_n_s8_x', which expects a value in the range \[1, 8\]} } */ + s16 = svasrd_n_s16_x (pg, s16, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s16_x', which expects a value in the range \[1, 16\]} } */ + s16 = svasrd_n_s16_x (pg, s16, 1); + s16 = svasrd_n_s16_x (pg, s16, 16); + s16 = svasrd_n_s16_x (pg, s16, 17); /* { dg-error {passing 17 to argument 3 of 'svasrd_n_s16_x', which expects a value in the range \[1, 16\]} } */ + s32 = svasrd_n_s32_x (pg, s32, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s32_x', which expects a value in the range \[1, 32\]} } */ + s32 = svasrd_n_s32_x (pg, s32, 1); + s32 = svasrd_n_s32_x (pg, s32, 32); + s32 = svasrd_n_s32_x (pg, s32, 33); /* { dg-error {passing 33 to argument 3 of 'svasrd_n_s32_x', which expects a value in the range \[1, 32\]} } */ + s64 = svasrd_n_s64_x (pg, s64, 0); /* { dg-error {passing 0 to argument 3 of 'svasrd_n_s64_x', which expects a value in the range \[1, 64\]} } */ + s64 = svasrd_n_s64_x (pg, s64, 1); + s64 = svasrd_n_s64_x (pg, s64, 64); + s64 = svasrd_n_s64_x (pg, s64, 65); /* { dg-error {passing 65 to argument 3 of 'svasrd_n_s64_x', which expects a value in the range \[1, 64\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_1.c new file mode 100644 index 000000000..267db83f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_1.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint8_t +f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr, + float *f32_ptr, _Complex float *cf32_ptr, svint8_t s8, svfloat32_t f32, + struct s s) +{ + svst1 (pg, s8_ptr); /* { dg-error {too few arguments to function 'svst1'} } */ + svst1 (pg, s8_ptr, s8, 0); /* { dg-error {too many arguments to function 'svst1'} } */ + svst1 (0, s8_ptr, s8); /* { dg-error {passing 'int' to argument 1 of 'svst1', which expects 'svbool_t'} } */ + svst1 (pg, void_ptr, 0); /* { dg-error {passing 'int' to argument 3 of 'svst1', which expects an SVE vector type} } */ + svst1 (pg, void_ptr, pg); /* { dg-error {'svst1' has no form that takes 'svbool_t' arguments} } */ + svst1 (pg, 0, s8); + svst1 (pg, (int *) 0, s8); /* { dg-warning "passing argument 2 of 'svst1_s8' from incompatible pointer type" } */ + svst1 (pg, void_ptr, s8); + svst1 (pg, s_ptr, s8); /* { dg-warning "passing argument 2 of 'svst1_s8' from incompatible pointer type" } */ + svst1 (pg, f32_ptr, s8); /* { dg-warning "passing argument 2 of 'svst1_s8' from incompatible pointer type" } */ + svst1 (pg, f32_ptr, f32); + svst1 (pg, cf32_ptr, f32); /* { dg-warning "passing argument 2 of 'svst1_f32' from incompatible pointer type" } */ + svst1 (pg, s, s8); /* { dg-error {passing 'struct s' to argument 2 of 'svst1', which expects a 
scalar pointer} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_2.c new file mode 100644 index 000000000..4e4fb3c6d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_2.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint8_t +f1 (svbool_t pg, signed char *s8_ptr, void *void_ptr, struct s *s_ptr, + float *f32_ptr, _Complex float *cf32_ptr, svint8_t s8, svfloat32_t f32) +{ + svst1_vnum (pg, s8_ptr, 0); /* { dg-error {too few arguments to function 'svst1_vnum'} } */ + svst1_vnum (pg, s8_ptr, 0, s8, 0); /* { dg-error {too many arguments to function 'svst1_vnum'} } */ + svst1_vnum (0, s8_ptr, 0, s8); /* { dg-error {passing 'int' to argument 1 of 'svst1_vnum', which expects 'svbool_t'} } */ + svst1_vnum (pg, s8_ptr, pg, s8); /* { dg-error {passing 'svbool_t' to argument 3 of 'svst1_vnum', which expects 'int64_t'} } */ + svst1_vnum (pg, s8_ptr, s8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svst1_vnum', which expects 'int64_t'} } */ + svst1_vnum (pg, s8_ptr, void_ptr, s8); /* { dg-warning "passing argument 3 of 'svst1_vnum_s8' makes integer from pointer without a cast" } */ + svst1_vnum (pg, void_ptr, 0, 0); /* { dg-error {passing 'int' to argument 4 of 'svst1_vnum', which expects an SVE vector type} } */ + svst1_vnum (pg, void_ptr, 0, pg); /* { dg-error {'svst1_vnum' has no form that takes 'svbool_t' arguments} } */ + svst1_vnum (pg, 0, 0, s8); + svst1_vnum (pg, (int *) 0, 0, s8); /* { dg-warning "passing argument 2 of 'svst1_vnum_s8' from incompatible pointer type" } */ + svst1_vnum (pg, void_ptr, 0, s8); + svst1_vnum (pg, s_ptr, 0, s8); /* { dg-warning "passing argument 2 of 'svst1_vnum_s8' from incompatible pointer type" } */ + svst1_vnum (pg, f32_ptr, 0, s8); /* { dg-warning "passing argument 2 of 'svst1_vnum_s8' from incompatible pointer type" } */ + svst1_vnum (pg, f32_ptr, 0, f32); + svst1_vnum (pg, cf32_ptr, 0, f32); /* { dg-warning "passing argument 2 of 'svst1_vnum_f32' from incompatible pointer type" } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_index_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_index_1.c new file mode 100644 index 000000000..3209149b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_index_1.c @@ -0,0 +1,101 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint32_t +f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, + int32_t *s32_ptr, uint32_t *u32_ptr, float *f32_ptr, + int64_t *s64_ptr, uint64_t *u64_ptr, double *f64_ptr, + void *void_ptr, struct s *s_ptr, _Complex float *cf32_ptr, + svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, svfloat16_t f16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svst1_scatter_index (pg, s32_ptr, s32); /* { dg-error {too few arguments to function 'svst1_scatter_index'} } */ + svst1_scatter_index (pg, s32_ptr, s32, s32, 0); /* { dg-error {too many arguments to function 'svst1_scatter_index'} } */ + svst1_scatter_index (0, s32_ptr, s32, s32); /* { dg-error {passing 'int' to argument 1 of 'svst1_scatter_index', which expects 'svbool_t'} } */ + svst1_scatter_index (pg, 0, s32, s32); + svst1_scatter_index (pg, (int *) 0, s32, s32); + svst1_scatter_index (pg, void_ptr, s32, s32); + 
svst1_scatter_index (pg, s_ptr, s32, s32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32index_s32' from incompatible pointer type" } */ + svst1_scatter_index (pg, f32_ptr, s32, s32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32index_s32' from incompatible pointer type" } */ + svst1_scatter_index (pg, f32_ptr, s32, f32); + svst1_scatter_index (pg, cf32_ptr, s32, f32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32index_f32' from incompatible pointer type" } */ + svst1_scatter_index (pg, s, s32, s32); /* { dg-error {passing 'struct s' to argument 2 of 'svst1_scatter_index', which expects a vector or pointer base address} } */ + + svst1_scatter_index (pg, u32, void_ptr, s32); /* { dg-warning "passing argument 3 of 'svst1_scatter_u32base_index_s32' makes integer from pointer without a cast" } */ + svst1_scatter_index (pg, u32, pg, s32); /* { dg-error {passing 'svbool_t' to argument 3 of 'svst1_scatter_index', which expects 'int64_t'} } */ + svst1_scatter_index (pg, u32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_index', which expects 'int64_t'} } */ + + svst1_scatter_index (pg, void_ptr, u32, pg); /* { dg-error {passing 'svbool_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter_index (pg, s8_ptr, u32, s8); /* { dg-error {passing 'svint8_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter_index (pg, s8_ptr, u32, u8); /* { dg-error {passing 'svuint8_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter_index (pg, s16_ptr, u32, s16); /* { dg-error {passing 'svint16_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter_index (pg, s16_ptr, u32, u16); /* { dg-error {passing 'svuint16_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter_index (pg, s16_ptr, u32, f16); /* { dg-error {passing 'svfloat16_t' to argument 4 of 'svst1_scatter_index', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter_index (pg, u32, 0, s32); + svst1_scatter_index (pg, s32, 0, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint32_t'} } */ + + svst1_scatter_index (pg, u32, 0, u32); + svst1_scatter_index (pg, s32, 0, u32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint32_t'} } */ + + svst1_scatter_index (pg, u32, 0, f32); + svst1_scatter_index (pg, s32, 0, f32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint32_t'} } */ + + svst1_scatter_index (pg, u64, 0, s64); + svst1_scatter_index (pg, s64, 0, s64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint64_t'} } */ + + svst1_scatter_index (pg, u64, 0, u64); + svst1_scatter_index (pg, s64, 0, u64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint64_t'} } */ + + svst1_scatter_index (pg, u64, 0, f64); + svst1_scatter_index (pg, s64, 0, f64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_index', which expects 'svuint64_t'} } */ + + svst1_scatter_index (pg, s32_ptr, s32, s32); + svst1_scatter_index (pg, s32_ptr, u32, s32); + svst1_scatter_index (pg, s32_ptr, f32, s32); /* { dg-error {passing 'svfloat32_t' to 
argument 3 of 'svst1_scatter_index', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, s32_ptr, s64, s32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, s32_ptr, u64, s32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, s32_ptr, f64, s32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ + + svst1_scatter_index (pg, u32_ptr, s32, u32); + svst1_scatter_index (pg, u32_ptr, u32, u32); + svst1_scatter_index (pg, u32_ptr, f32, u32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, u32_ptr, s64, u32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, u32_ptr, u64, u32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, u32_ptr, f64, u32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ + + svst1_scatter_index (pg, f32_ptr, s32, f32); + svst1_scatter_index (pg, f32_ptr, u32, f32); + svst1_scatter_index (pg, f32_ptr, f32, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, f32_ptr, s64, f32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, f32_ptr, u64, f32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_index (pg, f32_ptr, f64, f32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ + + svst1_scatter_index (pg, s64_ptr, s32, s64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_index (pg, s64_ptr, u32, s64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_index (pg, s64_ptr, f32, s64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_index (pg, s64_ptr, s64, s64); + svst1_scatter_index (pg, s64_ptr, u64, s64); + svst1_scatter_index (pg, s64_ptr, f64, s64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ + + svst1_scatter_index (pg, u64_ptr, s32, u64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ + 
svst1_scatter_index (pg, u64_ptr, u32, u64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_index (pg, u64_ptr, f32, u64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_index (pg, u64_ptr, s64, u64); + svst1_scatter_index (pg, u64_ptr, u64, u64); + svst1_scatter_index (pg, u64_ptr, f64, u64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ + + svst1_scatter_index (pg, f64_ptr, s32, f64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_index (pg, f64_ptr, u32, f64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_index (pg, f64_ptr, f32, f64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_index (pg, f64_ptr, s64, f64); + svst1_scatter_index (pg, f64_ptr, u64, f64); + svst1_scatter_index (pg, f64_ptr, f64, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_index', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_1.c new file mode 100644 index 000000000..10abf758c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_1.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint32_t +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, + svfloat16_t f16, svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64) +{ + svst1_scatter (pg, u32); /* { dg-error {too few arguments to function 'svst1_scatter'} } */ + svst1_scatter (pg, u32, u32, 0); /* { dg-error {too many arguments to function 'svst1_scatter'} } */ + svst1_scatter (0, u32, u32); /* { dg-error {passing 'int' to argument 1 of 'svst1_scatter', which expects 'svbool_t'} } */ + svst1_scatter (pg, 0, u32); /* { dg-error {passing 'int' to argument 2 of 'svst1_scatter', which expects an SVE vector type} } */ + svst1_scatter (pg, u32, 0); /* { dg-error {passing 'int' to argument 3 of 'svst1_scatter', which expects an SVE vector type} } */ + + svst1_scatter (pg, u32, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter (pg, u32, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter (pg, u32, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter (pg, u32, s16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter (pg, u32, u16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svst1_scatter', which 
expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter (pg, u32, f16); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svst1_scatter', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter (pg, u32, s32); + svst1_scatter (pg, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter', which expects 'svuint32_t'} } */ + + svst1_scatter (pg, u32, u32); + svst1_scatter (pg, s32, u32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter', which expects 'svuint32_t'} } */ + + svst1_scatter (pg, u32, f32); + svst1_scatter (pg, s32, f32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter', which expects 'svuint32_t'} } */ + + svst1_scatter (pg, u64, s64); + svst1_scatter (pg, s64, s64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter', which expects 'svuint64_t'} } */ + + svst1_scatter (pg, u64, u64); + svst1_scatter (pg, s64, u64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter', which expects 'svuint64_t'} } */ + + svst1_scatter (pg, u64, f64); + svst1_scatter (pg, s64, f64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter', which expects 'svuint64_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_2.c new file mode 100644 index 000000000..8ee8129fa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/store_scatter_offset_2.c @@ -0,0 +1,101 @@ +/* { dg-do compile } */ +/* { dg-options "-std=c99" } */ + +#include + +struct s { signed char x; }; + +svuint32_t +f1 (svbool_t pg, signed char *s8_ptr, short *s16_ptr, + int32_t *s32_ptr, uint32_t *u32_ptr, float *f32_ptr, + int64_t *s64_ptr, uint64_t *u64_ptr, double *f64_ptr, + void *void_ptr, struct s *s_ptr, _Complex float *cf32_ptr, + svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, svfloat16_t f16, + svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64, svfloat64_t f64, struct s s) +{ + svst1_scatter_offset (pg, s32_ptr, s32); /* { dg-error {too few arguments to function 'svst1_scatter_offset'} } */ + svst1_scatter_offset (pg, s32_ptr, s32, s32, 0); /* { dg-error {too many arguments to function 'svst1_scatter_offset'} } */ + svst1_scatter_offset (0, s32_ptr, s32, s32); /* { dg-error {passing 'int' to argument 1 of 'svst1_scatter_offset', which expects 'svbool_t'} } */ + svst1_scatter_offset (pg, 0, s32, s32); + svst1_scatter_offset (pg, (int *) 0, s32, s32); + svst1_scatter_offset (pg, void_ptr, s32, s32); + svst1_scatter_offset (pg, s_ptr, s32, s32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32offset_s32' from incompatible pointer type" } */ + svst1_scatter_offset (pg, f32_ptr, s32, s32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32offset_s32' from incompatible pointer type" } */ + svst1_scatter_offset (pg, f32_ptr, s32, f32); + svst1_scatter_offset (pg, cf32_ptr, s32, f32); /* { dg-warning "passing argument 2 of 'svst1_scatter_s32offset_f32' from incompatible pointer type" } */ + svst1_scatter_offset (pg, s, s32, s32); /* { dg-error {passing 'struct s' to argument 2 of 'svst1_scatter_offset', which expects a vector or pointer base address} } */ + + svst1_scatter_offset (pg, u32, void_ptr, s32); /* { dg-warning "passing argument 3 of 'svst1_scatter_u32base_offset_s32' makes integer from pointer without a cast" } */ + svst1_scatter_offset (pg, u32, pg, s32); /* { dg-error {passing 'svbool_t' to 
argument 3 of 'svst1_scatter_offset', which expects 'int64_t'} } */ + svst1_scatter_offset (pg, u32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_offset', which expects 'int64_t'} } */ + + svst1_scatter_offset (pg, void_ptr, u32, pg); /* { dg-error {passing 'svbool_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter_offset (pg, s8_ptr, u32, s8); /* { dg-error {passing 'svint8_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter_offset (pg, s8_ptr, u32, u8); /* { dg-error {passing 'svuint8_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter_offset (pg, s16_ptr, u32, s16); /* { dg-error {passing 'svint16_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter_offset (pg, s16_ptr, u32, u16); /* { dg-error {passing 'svuint16_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ + svst1_scatter_offset (pg, s16_ptr, u32, f16); /* { dg-error {passing 'svfloat16_t' to argument 4 of 'svst1_scatter_offset', which expects a vector of 32-bit or 64-bit elements} } */ + + svst1_scatter_offset (pg, u32, 0, s32); + svst1_scatter_offset (pg, s32, 0, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint32_t'} } */ + + svst1_scatter_offset (pg, u32, 0, u32); + svst1_scatter_offset (pg, s32, 0, u32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint32_t'} } */ + + svst1_scatter_offset (pg, u32, 0, f32); + svst1_scatter_offset (pg, s32, 0, f32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint32_t'} } */ + + svst1_scatter_offset (pg, u64, 0, s64); + svst1_scatter_offset (pg, s64, 0, s64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint64_t'} } */ + + svst1_scatter_offset (pg, u64, 0, u64); + svst1_scatter_offset (pg, s64, 0, u64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint64_t'} } */ + + svst1_scatter_offset (pg, u64, 0, f64); + svst1_scatter_offset (pg, s64, 0, f64); /* { dg-error {passing 'svint64_t' to argument 2 of 'svst1_scatter_offset', which expects 'svuint64_t'} } */ + + svst1_scatter_offset (pg, s32_ptr, s32, s32); + svst1_scatter_offset (pg, s32_ptr, u32, s32); + svst1_scatter_offset (pg, s32_ptr, f32, s32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, s32_ptr, s64, s32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, s32_ptr, u64, s32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, s32_ptr, f64, s32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint32_t' expects a vector of 32-bit integers} } */ + + svst1_scatter_offset (pg, u32_ptr, s32, u32); + svst1_scatter_offset (pg, u32_ptr, u32, u32); + svst1_scatter_offset (pg, u32_ptr, f32, u32); /* { dg-error {passing 
'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, u32_ptr, s64, u32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, u32_ptr, u64, u32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, u32_ptr, f64, u32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint32_t' expects a vector of 32-bit integers} } */ + + svst1_scatter_offset (pg, f32_ptr, s32, f32); + svst1_scatter_offset (pg, f32_ptr, u32, f32); + svst1_scatter_offset (pg, f32_ptr, f32, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, f32_ptr, s64, f32); /* { dg-error {passing 'svint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, f32_ptr, u64, f32); /* { dg-error {passing 'svuint64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ + svst1_scatter_offset (pg, f32_ptr, f64, f32); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat32_t' expects a vector of 32-bit integers} } */ + + svst1_scatter_offset (pg, s64_ptr, s32, s64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, s64_ptr, u32, s64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, s64_ptr, f32, s64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, s64_ptr, s64, s64); + svst1_scatter_offset (pg, s64_ptr, u64, s64); + svst1_scatter_offset (pg, s64_ptr, f64, s64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svint64_t' expects a vector of 64-bit integers} } */ + + svst1_scatter_offset (pg, u64_ptr, s32, u64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, u64_ptr, u32, u64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, u64_ptr, f32, u64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, u64_ptr, s64, u64); + svst1_scatter_offset (pg, u64_ptr, u64, u64); + svst1_scatter_offset (pg, u64_ptr, f64, u64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svuint64_t' expects a vector of 64-bit integers} } */ + + svst1_scatter_offset (pg, f64_ptr, s32, f64); /* { dg-error {passing 'svint32_t' to argument 3 of 'svst1_scatter_offset', which when 
storing 'svfloat64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, f64_ptr, u32, f64); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, f64_ptr, f32, f64); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ + svst1_scatter_offset (pg, f64_ptr, s64, f64); + svst1_scatter_offset (pg, f64_ptr, u64, f64); + svst1_scatter_offset (pg, f64_ptr, f64, f64); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svst1_scatter_offset', which when storing 'svfloat64_t' expects a vector of 64-bit integers} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_1.c new file mode 100644 index 000000000..a9233324c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_1.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ + +#include + +#pragma GCC target ("arch=armv8.2-a+sve+bf16") + +void +f1 (svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32, + svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, bfloat16_t bf) +{ + svbfmmla (f32, bf16); /* { dg-error {too few arguments to function 'svbfmmla'} } */ + svbfmmla (f32, bf16, bf16, 0); /* { dg-error {too many arguments to function 'svbfmmla'} } */ + svbfmmla (0, bf16, bf16); /* { dg-error {passing 'int' to argument 1 of 'svbfmmla', which expects an SVE vector type} } */ + svbfmmla (pg, bf16, bf16); /* { dg-error {'svbfmmla' has no form that takes 'svbool_t' arguments} } */ + svbfmmla (u8, bf16, bf16); /* { dg-error {'svbfmmla' has no form that takes 'svuint8_t' arguments} } */ + svbfmmla (u16, bf16, bf16); /* { dg-error {'svbfmmla' has no form that takes 'svuint16_t' arguments} } */ + svbfmmla (f64, bf16, bf16); /* { dg-error {'svbfmmla' has no form that takes 'svfloat64_t' arguments} } */ + svbfmmla (f32, bf16, bf16); + svbfmmla (f32, 0, bf16); /* { dg-error {passing 'int' to argument 2 of 'svbfmmla', which expects 'svbfloat16_t'} } */ + svbfmmla (f32, f32, bf16); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svbfmmla', which expects 'svbfloat16_t'} } */ + svbfmmla (f32, bf16, 0); /* { dg-error {passing 'int' to argument 3 of 'svbfmmla', which expects 'svbfloat16_t'} } */ + svbfmmla (f32, bf16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svbfmmla', which expects 'svbfloat16_t'} } */ + svbfmmla (f32, bf16, bf); /* { dg-error {passing 'bfloat16_t'[^\n]* to argument 3 of 'svbfmmla', which expects 'svbfloat16_t'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lane_1.c new file mode 100644 index 000000000..23f027f2d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lane_1.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ + +#include + +#pragma GCC target ("arch=armv8.2-a+sve+bf16") + +void +f1 (svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32, + svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, int i) +{ + svbfmlalb_lane (f32, bf16, bf16); /* { dg-error {too few arguments to function 'svbfmlalb_lane'} } */ + svbfmlalb_lane (f32, bf16, bf16, 0, 0); /* { dg-error {too many arguments to function 'svbfmlalb_lane'} } */ + svbfmlalb_lane (0, bf16, bf16, 0); /* { dg-error {passing 'int' 
to argument 1 of 'svbfmlalb_lane', which expects an SVE vector type} } */ + svbfmlalb_lane (pg, bf16, bf16, 0); /* { dg-error {'svbfmlalb_lane' has no form that takes 'svbool_t' arguments} } */ + svbfmlalb_lane (u8, bf16, bf16, 0); /* { dg-error {'svbfmlalb_lane' has no form that takes 'svuint8_t' arguments} } */ + svbfmlalb_lane (u16, bf16, bf16, 0); /* { dg-error {'svbfmlalb_lane' has no form that takes 'svuint16_t' arguments} } */ + svbfmlalb_lane (f64, bf16, bf16, 0); /* { dg-error {'svbfmlalb_lane' has no form that takes 'svfloat64_t' arguments} } */ + svbfmlalb_lane (f32, bf16, bf16, 0); + svbfmlalb_lane (f32, 0, bf16, 0); /* { dg-error {passing 'int' to argument 2 of 'svbfmlalb_lane', which expects 'svbfloat16_t'} } */ + svbfmlalb_lane (f32, f32, bf16, 0); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svbfmlalb_lane', which expects 'svbfloat16_t'} } */ + svbfmlalb_lane (f32, bf16, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svbfmlalb_lane', which expects 'svbfloat16_t'} } */ + svbfmlalb_lane (f32, bf16, f32, 0); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svbfmlalb_lane', which expects 'svbfloat16_t'} } */ + svbfmlalb_lane (f32, bf16, bf16, s32); /* { dg-error {argument 4 of 'svbfmlalb_lane' must be an integer constant expression} } */ + svbfmlalb_lane (f32, bf16, bf16, i); /* { dg-error {argument 4 of 'svbfmlalb_lane' must be an integer constant expression} } */ + + svbfmlalb_lane (f32, bf16, bf16, 0); + svbfmlalb_lane (f32, bf16, bf16, 7); + svbfmlalb_lane (f32, bf16, bf16, 8); /* { dg-error {passing 8 to argument 4 of 'svbfmlalb_lane', which expects a value in the range \[0, 7\]} } */ + svbfmlalb_lane (f32, bf16, bf16, -1); /* { dg-error {passing -1 to argument 4 of 'svbfmlalb_lane', which expects a value in the range \[0, 7\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lanex2_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lanex2_1.c new file mode 100644 index 000000000..4755ca79a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_lanex2_1.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ + +#include + +#pragma GCC target ("arch=armv8.2-a+sve+bf16") + +void +f1 (svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32, + svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, int i) +{ + svbfdot_lane (f32, bf16, bf16); /* { dg-error {too few arguments to function 'svbfdot_lane'} } */ + svbfdot_lane (f32, bf16, bf16, 0, 0); /* { dg-error {too many arguments to function 'svbfdot_lane'} } */ + svbfdot_lane (0, bf16, bf16, 0); /* { dg-error {passing 'int' to argument 1 of 'svbfdot_lane', which expects an SVE vector type} } */ + svbfdot_lane (pg, bf16, bf16, 0); /* { dg-error {'svbfdot_lane' has no form that takes 'svbool_t' arguments} } */ + svbfdot_lane (u8, bf16, bf16, 0); /* { dg-error {'svbfdot_lane' has no form that takes 'svuint8_t' arguments} } */ + svbfdot_lane (u16, bf16, bf16, 0); /* { dg-error {'svbfdot_lane' has no form that takes 'svuint16_t' arguments} } */ + svbfdot_lane (f64, bf16, bf16, 0); /* { dg-error {'svbfdot_lane' has no form that takes 'svfloat64_t' arguments} } */ + svbfdot_lane (f32, bf16, bf16, 0); + svbfdot_lane (f32, 0, bf16, 0); /* { dg-error {passing 'int' to argument 2 of 'svbfdot_lane', which expects 'svbfloat16_t'} } */ + svbfdot_lane (f32, f32, bf16, 0); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svbfdot_lane', which expects 'svbfloat16_t'} } */ + svbfdot_lane (f32, bf16, 0, 0); /* { dg-error {passing 
'int' to argument 3 of 'svbfdot_lane', which expects 'svbfloat16_t'} } */ + svbfdot_lane (f32, bf16, f32, 0); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svbfdot_lane', which expects 'svbfloat16_t'} } */ + svbfdot_lane (f32, bf16, bf16, s32); /* { dg-error {argument 4 of 'svbfdot_lane' must be an integer constant expression} } */ + svbfdot_lane (f32, bf16, bf16, i); /* { dg-error {argument 4 of 'svbfdot_lane' must be an integer constant expression} } */ + + svbfdot_lane (f32, bf16, bf16, 0); + svbfdot_lane (f32, bf16, bf16, 3); + svbfdot_lane (f32, bf16, bf16, 4); /* { dg-error {passing 4 to argument 4 of 'svbfdot_lane', which expects a value in the range \[0, 3\]} } */ + svbfdot_lane (f32, bf16, bf16, -1); /* { dg-error {passing -1 to argument 4 of 'svbfdot_lane', which expects a value in the range \[0, 3\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_opt_n_1.c new file mode 100644 index 000000000..2d09a8eeb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_bfloat16_opt_n_1.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ + +#include + +#pragma GCC target ("arch=armv8.2-a+sve+bf16") + +void +f1 (svbool_t pg, svuint8_t u8, svuint16_t u16, svint32_t s32, + svbfloat16_t bf16, svfloat32_t f32, svfloat64_t f64, bfloat16_t bf) +{ + svbfdot (f32, bf16); /* { dg-error {too few arguments to function 'svbfdot'} } */ + svbfdot (f32, bf16, bf16, 0); /* { dg-error {too many arguments to function 'svbfdot'} } */ + svbfdot (0, bf16, bf16); /* { dg-error {passing 'int' to argument 1 of 'svbfdot', which expects an SVE vector type} } */ + svbfdot (pg, bf16, bf16); /* { dg-error {'svbfdot' has no form that takes 'svbool_t' arguments} } */ + svbfdot (u8, bf16, bf16); /* { dg-error {'svbfdot' has no form that takes 'svuint8_t' arguments} } */ + svbfdot (u16, bf16, bf16); /* { dg-error {'svbfdot' has no form that takes 'svuint16_t' arguments} } */ + svbfdot (f64, bf16, bf16); /* { dg-error {'svbfdot' has no form that takes 'svfloat64_t' arguments} } */ + svbfdot (f32, bf16, bf16); + svbfdot (f32, 0, bf16); /* { dg-error {passing 'int' to argument 2 of 'svbfdot', which expects 'svbfloat16_t'} } */ + svbfdot (f32, f32, bf16); /* { dg-error {passing 'svfloat32_t' to argument 2 of 'svbfdot', which expects 'svbfloat16_t'} } */ + svbfdot (f32, bf16, 0); /* { dg-error {invalid conversion to type 'bfloat16_t'} } */ + svbfdot (f32, bf16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svbfdot', which expects 'svbfloat16_t'} } */ + svbfdot (f32, bf16, bf); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_lane_1.c new file mode 100644 index 000000000..600be05a8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_lane_1.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, + svint32_t s32, svuint32_t u32, svint64_t s64, svuint64_t u64, + svfloat32_t f32, int i) +{ + svsudot_lane (s32, s8, u8); /* { dg-error {too few arguments to function 'svsudot_lane'} } */ + svsudot_lane (s32, s8, u8, 0, 0); /* { dg-error {too many arguments to function 'svsudot_lane'} } */ + svsudot_lane (0, s8, u8, 0); /* { dg-error {passing 'int' to argument 1 of 
'svsudot_lane', which expects an SVE vector type} } */ + svsudot_lane (pg, s8, u8, 0); /* { dg-error {'svsudot_lane' has no form that takes 'svbool_t' arguments} } */ + svsudot_lane (u8, s8, u8, 0); /* { dg-error {'svsudot_lane' has no form that takes 'svuint8_t' arguments} } */ + svsudot_lane (f32, s8, u8, 0); /* { dg-error {'svsudot_lane' has no form that takes 'svfloat32_t' arguments} } */ + svsudot_lane (u32, s8, u8, 0); /* { dg-error {'svsudot_lane' has no form that takes 'svuint32_t' arguments} } */ + svsudot_lane (s32, s8, u8, 0); + svsudot_lane (s32, 0, u8, 0); /* { dg-error {passing 'int' to argument 2 of 'svsudot_lane', which expects an SVE vector type} } */ + svsudot_lane (s32, s8, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svsudot_lane', which expects an SVE vector type} } */ + + svsudot_lane (s32, s8, u8, 0); + svsudot_lane (s32, u8, u8, 0); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svsudot_lane', which expects a vector of signed integers} } */ + svsudot_lane (s32, s8, s8, 0); /* { dg-error {passing 'svint8_t' to argument 3 of 'svsudot_lane', which expects a vector of unsigned integers} } */ + svsudot_lane (s32, s32, s32, 0); /* { dg-error {passing 'svint32_t' instead of the expected 'svint8_t' to argument 2 of 'svsudot_lane', after passing 'svint32_t' to argument 1} } */ + + svsudot_lane (s32, s8, u8, i); /* { dg-error {argument 4 of 'svsudot_lane' must be an integer constant expression} } */ + svsudot_lane (s32, s8, u8, 0); + svsudot_lane (s32, s8, u8, 3); + svsudot_lane (s32, s8, u8, 4); /* { dg-error {passing 4 to argument 4 of 'svsudot_lane', which expects a value in the range \[0, 3\]} } */ + svsudot_lane (s32, s8, u8, -1); /* { dg-error {passing -1 to argument 4 of 'svsudot_lane', which expects a value in the range \[0, 3\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_opt_n_1.c new file mode 100644 index 000000000..f95ac582f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_intq_uintq_opt_n_1.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ + +#include + +svuint32_t +f1 (svint32_t s32, svuint8_t u8, svint8_t s8, svuint32_t u32) +{ + svsudot_s32 (s32); /* { dg-error {too few arguments to function 'svsudot_s32'} } */ + svsudot_s32 (s32, s8, u8, u32); /* { dg-error {too many arguments to function 'svsudot_s32'} } */ + svsudot_s32 (s32, s32, u8); /* { dg-error {incompatible type for argument 2 of 'svsudot_s32'} } */ + svsudot_s32 (s32, u8, u8); /* { dg-error {incompatible type for argument 2 of 'svsudot_s32'} } */ + svsudot_s32 (s32, s8, u32); /* { dg-error {incompatible type for argument 3 of 'svsudot_s32'} } */ + svsudot_s32 (s32, s8, s8); /* { dg-error {incompatible type for argument 3 of 'svsudot_s32'} } */ + svsudot_s32 (s32, s8, 0); /* { dg-error {incompatible type for argument 3 of 'svsudot_s32'} } */ + svsudot_s32 (s32, s8, u8); + return svsudot_s32 (s32, s8, u8); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ +} + +void +f2 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, + svint32_t s32, svfloat32_t f32) +{ + svsudot (s32, s8); /* { dg-error {too few arguments to function 'svsudot'} } */ + svsudot (s32, s8, u8, u8); /* { dg-error {too many arguments to function 'svsudot'} } */ + svsudot (0, s8, u8); /* { dg-error {passing 'int' to argument 1 of 'svsudot', which 
expects an SVE vector type} } */ + svsudot (pg, s8, u8); /* { dg-error {'svsudot' has no form that takes 'svbool_t' arguments} } */ + svsudot (u8, s8, u8); /* { dg-error {'svsudot' has no form that takes 'svuint8_t' arguments} } */ + svsudot (f32, s8, u8); /* { dg-error {'svsudot' has no form that takes 'svfloat32_t' arguments} } */ + svsudot (s32, s8, u8); + svsudot (s32, 0, u8); /* { dg-error {passing 'int' to argument 2 of 'svsudot', which expects an SVE vector type} } */ + svsudot (s32, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 2 of 'svsudot', which expects a vector of signed integers} } */ + svsudot (s32, s8, s8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svsudot', which expects a vector of unsigned integers} } */ + svsudot (s32, s8, 0); + svsudot (s32, s8, u8); + svsudot (s32, u32, u32); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svsudot', which expects a vector of signed integers} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_1.c new file mode 100644 index 000000000..bbd1f91be --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svfloat16_t f16, svfloat32_t f32, svfloat64_t f64, + svint32_t s32, int i) +{ + svmla_lane (f32, f32, f32); /* { dg-error {too few arguments to function 'svmla_lane'} } */ + svmla_lane (f32, f32, f32, 0, 0); /* { dg-error {too many arguments to function 'svmla_lane'} } */ + svmla_lane (pg, pg, pg, 0); /* { dg-error {'svmla_lane' has no form that takes 'svbool_t' arguments} } */ + svmla_lane (s32, s32, s32, 0); /* { dg-error {'svmla_lane' has no form that takes 'svint32_t' arguments} } */ + svmla_lane (1, f32, f32, 0); /* { dg-error {passing 'int' to argument 1 of 'svmla_lane', which expects an SVE vector type} } */ + svmla_lane (f32, 1, f32, 0); /* { dg-error {passing 'int' to argument 2 of 'svmla_lane', which expects an SVE vector type} } */ + svmla_lane (f32, f32, 1, 0); /* { dg-error {passing 'int' to argument 3 of 'svmla_lane', which expects an SVE vector type} } */ + svmla_lane (f32, f64, f32, 0); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svmla_lane', but previous arguments had type 'svfloat32_t'} } */ + svmla_lane (f32, f32, f64, 0); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svmla_lane', but previous arguments had type 'svfloat32_t'} } */ + svmla_lane (f32, f32, f32, s32); /* { dg-error {argument 4 of 'svmla_lane' must be an integer constant expression} } */ + svmla_lane (f32, f32, f32, i); /* { dg-error {argument 4 of 'svmla_lane' must be an integer constant expression} } */ + + svmla_lane (f16, f16, f16, 0); + svmla_lane (f16, f16, f16, 7); + svmla_lane (f16, f16, f16, 8); /* { dg-error {passing 8 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 7\]} } */ + svmla_lane (f16, f16, f16, -1); /* { dg-error {passing -1 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 7\]} } */ + + svmla_lane (f32, f32, f32, 0); + svmla_lane (f32, f32, f32, 3); + svmla_lane (f32, f32, f32, 4); /* { dg-error {passing 4 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 3\]} } */ + svmla_lane (f32, f32, f32, -1); /* { dg-error {passing -1 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 3\]} } */ + + svmla_lane (f64, f64, f64, 0); + svmla_lane (f64, f64, f64, 1); + svmla_lane (f64, f64, f64, 2); /* { 
dg-error {passing 2 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 1\]} } */ + svmla_lane (f64, f64, f64, -1); /* { dg-error {passing -1 to argument 4 of 'svmla_lane', which expects a value in the range \[0, 1\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_rotate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_rotate_1.c new file mode 100644 index 000000000..bccc6c7e2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_lane_rotate_1.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svfloat16_t f16, svfloat32_t f32, svfloat64_t f64, + svint32_t s32, int i) +{ + svcmla_lane (f32, f32, f32, 0); /* { dg-error {too few arguments to function 'svcmla_lane'} } */ + svcmla_lane (f32, f32, f32, 0, 90, 90); /* { dg-error {too many arguments to function 'svcmla_lane'} } */ + svcmla_lane (pg, pg, pg, 0, 90); /* { dg-error {'svcmla_lane' has no form that takes 'svbool_t' arguments} } */ + svcmla_lane (s32, s32, s32, 0, 90); /* { dg-error {'svcmla_lane' has no form that takes 'svint32_t' arguments} } */ + svcmla_lane (f64, f64, f64, 0, 90); /* { dg-error {'svcmla_lane' has no form that takes 'svfloat64_t' arguments} } */ + svcmla_lane (1, f32, f32, 0, 90); /* { dg-error {passing 'int' to argument 1 of 'svcmla_lane', which expects an SVE vector type} } */ + svcmla_lane (f32, 1, f32, 0, 90); /* { dg-error {passing 'int' to argument 2 of 'svcmla_lane', which expects an SVE vector type} } */ + svcmla_lane (f32, f32, 1, 0, 90); /* { dg-error {passing 'int' to argument 3 of 'svcmla_lane', which expects an SVE vector type} } */ + svcmla_lane (f32, f64, f32, 0, 90); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svcmla_lane', but previous arguments had type 'svfloat32_t'} } */ + svcmla_lane (f32, f32, f64, 0, 90); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svcmla_lane', but previous arguments had type 'svfloat32_t'} } */ + svcmla_lane (f32, f32, f32, s32, 0); /* { dg-error {argument 4 of 'svcmla_lane' must be an integer constant expression} } */ + svcmla_lane (f32, f32, f32, i, 0); /* { dg-error {argument 4 of 'svcmla_lane' must be an integer constant expression} } */ + + svcmla_lane (f16, f16, f16, 0, 0); + svcmla_lane (f16, f16, f16, 3, 0); + svcmla_lane (f16, f16, f16, 4, 0); /* { dg-error {passing 4 to argument 4 of 'svcmla_lane', which expects a value in the range \[0, 3\]} } */ + svcmla_lane (f16, f16, f16, -1, 0); /* { dg-error {passing -1 to argument 4 of 'svcmla_lane', which expects a value in the range \[0, 3\]} } */ + + svcmla_lane (f32, f32, f32, 0, 0); + svcmla_lane (f32, f32, f32, 1, 0); + svcmla_lane (f32, f32, f32, 2, 0); /* { dg-error {passing 2 to argument 4 of 'svcmla_lane', which expects a value in the range \[0, 1\]} } */ + svcmla_lane (f32, f32, f32, -1, 0); /* { dg-error {passing -1 to argument 4 of 'svcmla_lane', which expects a value in the range \[0, 1\]} } */ + + svcmla_lane (f32, f32, f32, 0, -90); /* { dg-error {passing -90 to argument 5 of 'svcmla_lane', which expects 0, 90, 180 or 270} } */ + svcmla_lane (f32, f32, f32, 0, 0); + svcmla_lane (f32, f32, f32, 0, 1); /* { dg-error {passing 1 to argument 5 of 'svcmla_lane', which expects 0, 90, 180 or 270} } */ + svcmla_lane (f32, f32, f32, 0, 90); + svcmla_lane (f32, f32, f32, 0, 180); + svcmla_lane (f32, f32, f32, 0, 270); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_opt_n_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_opt_n_1.c new file mode 100644 index 000000000..c4a80e9da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_opt_n_1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svfloat16_t f16) +{ + svmla_x (pg, u8, u8); /* { dg-error {too few arguments to function 'svmla_x'} } */ + svmla_x (pg, u8, u8, u8, u8); /* { dg-error {too many arguments to function 'svmla_x'} } */ + svmla_x (u8, u8, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svmla_x', which expects 'svbool_t'} } */ + svmla_x (pg, pg, pg, pg); /* { dg-error {'svmla_x' has no form that takes 'svbool_t' arguments} } */ + svmla_x (pg, 1, u8, u8); /* { dg-error {passing 'int' to argument 2 of 'svmla_x', which expects an SVE vector type} } */ + svmla_x (pg, u8, s8, u8); /* { dg-error {passing 'svint8_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, u8, u8); + svmla_x (pg, u8, s16, u8); /* { dg-error {passing 'svint16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, u16, u8); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, f16, u8); /* { dg-error {passing 'svfloat16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, pg, u8); /* { dg-error {passing 'svbool_t' to argument 3 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, 0, u8); /* { dg-error {passing 'int' to argument 3 of 'svmla_x', which expects an SVE vector type} } */ + svmla_x (pg, u8, u8, s8); /* { dg-error {passing 'svint8_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, u8, s16); /* { dg-error {passing 'svint16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, u8, u16); /* { dg-error {passing 'svuint16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, u8, f16); /* { dg-error {passing 'svfloat16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, u8, pg); /* { dg-error {passing 'svbool_t' to argument 4 of 'svmla_x', but previous arguments had type 'svuint8_t'} } */ + svmla_x (pg, u8, u8, 0); + + svmla_x (pg, f16, s16, f16); /* { dg-error {passing 'svint16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svfloat16_t'} } */ + svmla_x (pg, f16, u16, f16); /* { dg-error {passing 'svuint16_t' to argument 3 of 'svmla_x', but previous arguments had type 'svfloat16_t'} } */ + svmla_x (pg, f16, f16, s16); /* { dg-error {passing 'svint16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svfloat16_t'} } */ + svmla_x (pg, f16, f16, u16); /* { dg-error {passing 'svuint16_t' to argument 4 of 'svmla_x', but previous arguments had type 'svfloat16_t'} } */ + svmla_x (pg, f16, f16, f16); + svmla_x (pg, f16, f16, 1); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_lane_1.c new file mode 100644 index 000000000..e81552b64 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_lane_1.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, 
svint16_t s16, svuint16_t u16, + svint32_t s32, svuint32_t u32, svint64_t s64, svuint64_t u64, + svfloat32_t f32, int i) +{ + svdot_lane (u32, u8, u8); /* { dg-error {too few arguments to function 'svdot_lane'} } */ + svdot_lane (u32, u8, u8, 0, 0); /* { dg-error {too many arguments to function 'svdot_lane'} } */ + svdot_lane (0, u8, u8, 0); /* { dg-error {passing 'int' to argument 1 of 'svdot_lane', which expects an SVE vector type} } */ + svdot_lane (pg, u8, u8, 0); /* { dg-error {'svdot_lane' has no form that takes 'svbool_t' arguments} } */ + svdot_lane (u8, u8, u8, 0); /* { dg-error {'svdot_lane' has no form that takes 'svuint8_t' arguments} } */ + svdot_lane (f32, u8, u8, 0); /* { dg-error {'svdot_lane' has no form that takes 'svfloat32_t' arguments} } */ + svdot_lane (u32, u8, u8, 0); + svdot_lane (u32, 0, u8, 0); /* { dg-error {passing 'int' to argument 2 of 'svdot_lane', which expects an SVE vector type} } */ + svdot_lane (u32, u8, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svdot_lane', which expects an SVE vector type} } */ + + svdot_lane (s32, s8, s8, 0); + svdot_lane (s32, u8, s8, 0); /* { dg-error {arguments 1 and 2 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svint32_t' and 'svuint8_t' respectively} } */ + svdot_lane (s32, s8, u8, 0); /* { dg-error {arguments 1 and 3 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svint32_t' and 'svuint8_t' respectively} } */ + svdot_lane (s32, s32, s32, 0); /* { dg-error {passing 'svint32_t' instead of the expected 'svint8_t' to argument 2 of 'svdot_lane', after passing 'svint32_t' to argument 1} } */ + + svdot_lane (u32, u8, u8, 0); + svdot_lane (u32, s8, u8, 0); /* { dg-error {arguments 1 and 2 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ + svdot_lane (u32, u8, s8, 0); /* { dg-error {arguments 1 and 3 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ + svdot_lane (u32, u32, u32, 0); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svdot_lane', after passing 'svuint32_t' to argument 1} } */ + + svdot_lane (s64, s16, s16, 0); + svdot_lane (s64, u16, s16, 0); /* { dg-error {arguments 1 and 2 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svint64_t' and 'svuint16_t' respectively} } */ + svdot_lane (s64, s16, u16, 0); /* { dg-error {arguments 1 and 3 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svint64_t' and 'svuint16_t' respectively} } */ + svdot_lane (s64, s64, s64, 0); /* { dg-error {passing 'svint64_t' instead of the expected 'svint16_t' to argument 2 of 'svdot_lane', after passing 'svint64_t' to argument 1} } */ + + svdot_lane (u64, u16, u16, 0); + svdot_lane (u64, s16, u16, 0); /* { dg-error {arguments 1 and 2 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svuint64_t' and 'svint16_t' respectively} } */ + svdot_lane (u64, u16, s16, 0); /* { dg-error {arguments 1 and 3 of 'svdot_lane' must have the same signedness, but the values passed here have type 'svuint64_t' and 'svint16_t' respectively} } */ + svdot_lane (u64, u64, u64, 0); /* { dg-error {passing 'svuint64_t' instead of the expected 'svuint16_t' to argument 2 of 'svdot_lane', after passing 'svuint64_t' to argument 1} } */ + + svdot_lane (s32, s8, s8, i); /* { dg-error {argument 
4 of 'svdot_lane' must be an integer constant expression} } */ + svdot_lane (s32, s8, s8, 0); + svdot_lane (s32, s8, s8, 3); + svdot_lane (s32, s8, s8, 4); /* { dg-error {passing 4 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 3\]} } */ + svdot_lane (s32, s8, s8, -1); /* { dg-error {passing -1 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 3\]} } */ + + svdot_lane (u32, u8, u8, i); /* { dg-error {argument 4 of 'svdot_lane' must be an integer constant expression} } */ + svdot_lane (u32, u8, u8, 0); + svdot_lane (u32, u8, u8, 3); + svdot_lane (u32, u8, u8, 4); /* { dg-error {passing 4 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 3\]} } */ + svdot_lane (u32, u8, u8, -1); /* { dg-error {passing -1 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 3\]} } */ + + svdot_lane (s64, s16, s16, i); /* { dg-error {argument 4 of 'svdot_lane' must be an integer constant expression} } */ + svdot_lane (s64, s16, s16, 0); + svdot_lane (s64, s16, s16, 1); + svdot_lane (s64, s16, s16, 2); /* { dg-error {passing 2 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 1\]} } */ + svdot_lane (s64, s16, s16, -1); /* { dg-error {passing -1 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 1\]} } */ + + svdot_lane (u64, u16, u16, i); /* { dg-error {argument 4 of 'svdot_lane' must be an integer constant expression} } */ + svdot_lane (u64, u16, u16, 0); + svdot_lane (u64, u16, u16, 1); + svdot_lane (u64, u16, u16, 2); /* { dg-error {passing 2 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 1\]} } */ + svdot_lane (u64, u16, u16, -1); /* { dg-error {passing -1 to argument 4 of 'svdot_lane', which expects a value in the range \[0, 1\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_1.c new file mode 100644 index 000000000..b41e6fcce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ + +#include + +svint32_t +f1 (svuint32_t u32, svuint8_t u8, svint8_t s8) +{ + svdot_u32 (u32); /* { dg-error {too few arguments to function 'svdot_u32'} } */ + svdot_u32 (u32, u8, u8, u32); /* { dg-error {too many arguments to function 'svdot_u32'} } */ + svdot_u32 (u32, u32, u8); /* { dg-error {incompatible type for argument 2 of 'svdot_u32'} } */ + svdot_u32 (u32, s8, u8); /* { dg-error {incompatible type for argument 2 of 'svdot_u32'} } */ + svdot_u32 (u32, u8, u32); /* { dg-error {incompatible type for argument 3 of 'svdot_u32'} } */ + svdot_u32 (u32, u8, s8); /* { dg-error {incompatible type for argument 3 of 'svdot_u32'} } */ + return svdot_u32 (u32, u8, u8); /* { dg-error {incompatible types when returning type 'svuint32_t' but 'svint32_t' was expected} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_2.c new file mode 100644 index 000000000..fee4096fe --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_qq_opt_n_2.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, + svfloat32_t f32) +{ + svdot (u32, u8); /* { dg-error {too few arguments to function 'svdot'} } */ + svdot (u32, u8, u8, u8); /* { dg-error {too many arguments to function 'svdot'} } */ + svdot (0, u8, u8); 
/* { dg-error {passing 'int' to argument 1 of 'svdot', which expects an SVE vector type} } */ + svdot (pg, u8, u8); /* { dg-error {'svdot' has no form that takes 'svbool_t' arguments} } */ + svdot (u8, u8, u8); /* { dg-error {'svdot' has no form that takes 'svuint8_t' arguments} } */ + svdot (f32, u8, u8); /* { dg-error {'svdot' has no form that takes 'svfloat32_t' arguments} } */ + svdot (u32, u8, u8); + svdot (u32, 0, u8); /* { dg-error {passing 'int' to argument 2 of 'svdot', which expects an SVE vector type} } */ + svdot (u32, s8, u8); /* { dg-error {arguments 1 and 2 of 'svdot' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ + svdot (u32, u8, 0); + svdot (u32, u8, s8); /* { dg-error {arguments 1 and 3 of 'svdot' must have the same signedness, but the values passed here have type 'svuint32_t' and 'svint8_t' respectively} } */ + svdot (u32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svdot', after passing 'svuint32_t' to argument 1} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_rotate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_rotate_1.c new file mode 100644 index 000000000..f340e3d1e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_rotate_1.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svfloat32_t f32, svfloat64_t f64, svint32_t s32, int i) +{ + svcmla_x (pg, f32, f32, f32); /* { dg-error {too few arguments to function 'svcmla_x'} } */ + svcmla_x (pg, f32, f32, f32, 90, 90); /* { dg-error {too many arguments to function 'svcmla_x'} } */ + svcmla_x (f32, f32, f32, f32, 90); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svcmla_x', which expects 'svbool_t'} } */ + svcmla_x (pg, pg, pg, pg, 90); /* { dg-error {'svcmla_x' has no form that takes 'svbool_t' arguments} } */ + svcmla_x (pg, s32, s32, s32, 90); /* { dg-error {'svcmla_x' has no form that takes 'svint32_t' arguments} } */ + svcmla_x (pg, 1, f32, f32, 90); /* { dg-error {passing 'int' to argument 2 of 'svcmla_x', which expects an SVE vector type} } */ + svcmla_x (pg, f32, 1, f32, 90); /* { dg-error {passing 'int' to argument 3 of 'svcmla_x', which expects an SVE vector type} } */ + svcmla_x (pg, f32, f32, 1, 90); /* { dg-error {passing 'int' to argument 4 of 'svcmla_x', which expects an SVE vector type} } */ + svcmla_x (pg, f32, f64, f32, 90); /* { dg-error {passing 'svfloat64_t' to argument 3 of 'svcmla_x', but previous arguments had type 'svfloat32_t'} } */ + svcmla_x (pg, f32, f32, f64, 90); /* { dg-error {passing 'svfloat64_t' to argument 4 of 'svcmla_x', but previous arguments had type 'svfloat32_t'} } */ + svcmla_x (pg, f32, f32, f32, s32); /* { dg-error {argument 5 of 'svcmla_x' must be an integer constant expression} } */ + svcmla_x (pg, f32, f32, f32, i); /* { dg-error {argument 5 of 'svcmla_x' must be an integer constant expression} } */ + svcmla_x (pg, f32, f32, f32, -90); /* { dg-error {passing -90 to argument 5 of 'svcmla_x', which expects 0, 90, 180 or 270} } */ + svcmla_x (pg, f32, f32, f32, 0); + svcmla_x (pg, f32, f32, f32, 1); /* { dg-error {passing 1 to argument 5 of 'svcmla_x', which expects 0, 90, 180 or 270} } */ + svcmla_x (pg, f32, f32, f32, 90); + svcmla_x (pg, f32, f32, f32, 180); + svcmla_x (pg, f32, f32, f32, 270); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_1.c new file mode 100644 index 000000000..f52fb39bf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_1.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ + +#include + +svuint32_t +f1 (svint32_t s32, svuint8_t u8, svint8_t s8, svuint32_t u32) +{ + svusmmla_s32 (s32); /* { dg-error {too few arguments to function 'svusmmla_s32'} } */ + svusmmla_s32 (s32, u8, s8, u32); /* { dg-error {too many arguments to function 'svusmmla_s32'} } */ + svusmmla_s32 (s32, u32, s8); /* { dg-error {incompatible type for argument 2 of 'svusmmla_s32'} } */ + svusmmla_s32 (s32, s8, s8); /* { dg-error {incompatible type for argument 2 of 'svusmmla_s32'} } */ + svusmmla_s32 (s32, u8, u8); /* { dg-error {incompatible type for argument 3 of 'svusmmla_s32'} } */ + svusmmla_s32 (s32, u8, s32); /* { dg-error {incompatible type for argument 3 of 'svusmmla_s32'} } */ + svusmmla_s32 (s32, u8, 0); /* { dg-error {incompatible type for argument 3 of 'svusmmla_s32'} } */ + svusmmla_s32 (s32, u8, s8); + return svusmmla_s32 (s32, u8, s8); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ +} + +void +f2 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, + svint32_t s32, svfloat32_t f32) +{ + svusmmla (s32, u8); /* { dg-error {too few arguments to function 'svusmmla'} } */ + svusmmla (s32, u8, s8, u8); /* { dg-error {too many arguments to function 'svusmmla'} } */ + svusmmla (0, u8, s8); /* { dg-error {passing 'int' to argument 1 of 'svusmmla', which expects an SVE vector type} } */ + svusmmla (pg, u8, s8); /* { dg-error {'svusmmla' has no form that takes 'svbool_t' arguments} } */ + svusmmla (u8, u8, s8); /* { dg-error {'svusmmla' has no form that takes 'svuint8_t' arguments} } */ + svusmmla (f32, u8, s8); /* { dg-error {'svusmmla' has no form that takes 'svfloat32_t' arguments} } */ + svusmmla (s32, u8, s8); + svusmmla (s32, 0, s8); /* { dg-error {passing 'int' to argument 2 of 'svusmmla', which expects an SVE vector type} } */ + svusmmla (s32, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svusmmla', which expects a vector of signed integers} } */ + svusmmla (s32, s8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svusmmla', which expects a vector of unsigned integers} } */ + svusmmla (s32, u8, 0); /* { dg-error {passing 'int' to argument 3 of 'svusmmla', which expects an SVE vector type} } */ + svusmmla (s32, u8, s8); + svusmmla (s32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svusmmla', after passing 'svint32_t' to argument 1} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_lane_1.c new file mode 100644 index 000000000..b40cfe9e8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_lane_1.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8, svint16_t s16, svuint16_t u16, + svint32_t s32, svuint32_t u32, svint64_t s64, svuint64_t u64, + svfloat32_t f32, int i) +{ + svusdot_lane (s32, u8, s8); /* { dg-error {too few arguments to function 'svusdot_lane'} } */ + svusdot_lane (s32, u8, s8, 0, 0); /* { dg-error {too many arguments to function 
'svusdot_lane'} } */ + svusdot_lane (0, u8, s8, 0); /* { dg-error {passing 'int' to argument 1 of 'svusdot_lane', which expects an SVE vector type} } */ + svusdot_lane (pg, u8, s8, 0); /* { dg-error {'svusdot_lane' has no form that takes 'svbool_t' arguments} } */ + svusdot_lane (u8, u8, s8, 0); /* { dg-error {'svusdot_lane' has no form that takes 'svuint8_t' arguments} } */ + svusdot_lane (f32, u8, s8, 0); /* { dg-error {'svusdot_lane' has no form that takes 'svfloat32_t' arguments} } */ + svusdot_lane (u32, u8, s8, 0); /* { dg-error {'svusdot_lane' has no form that takes 'svuint32_t' arguments} } */ + svusdot_lane (s32, u8, s8, 0); + svusdot_lane (s32, 0, s8, 0); /* { dg-error {passing 'int' to argument 2 of 'svusdot_lane', which expects an SVE vector type} } */ + svusdot_lane (s32, u8, 0, 0); /* { dg-error {passing 'int' to argument 3 of 'svusdot_lane', which expects an SVE vector type} } */ + + svusdot_lane (s32, u8, s8, 0); + svusdot_lane (s32, s8, s8, 0); /* { dg-error {passing 'svint8_t' to argument 2 of 'svusdot_lane', which expects a vector of unsigned integers} } */ + svusdot_lane (s32, u8, u8, 0); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svusdot_lane', which expects a vector of signed integers} } */ + svusdot_lane (s32, s32, s32, 0); /* { dg-error {passing 'svint32_t' to argument 2 of 'svusdot_lane', which expects a vector of unsigned integers} } */ + + svusdot_lane (s32, u8, s8, i); /* { dg-error {argument 4 of 'svusdot_lane' must be an integer constant expression} } */ + svusdot_lane (s32, u8, s8, 0); + svusdot_lane (s32, u8, s8, 3); + svusdot_lane (s32, u8, s8, 4); /* { dg-error {passing 4 to argument 4 of 'svusdot_lane', which expects a value in the range \[0, 3\]} } */ + svusdot_lane (s32, u8, s8, -1); /* { dg-error {passing -1 to argument 4 of 'svusdot_lane', which expects a value in the range \[0, 3\]} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_opt_n_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_opt_n_1.c new file mode 100644 index 000000000..896b80390 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/ternary_uintq_intq_opt_n_1.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=armv8.6-a+sve+i8mm" } */ + +#include + +svuint32_t +f1 (svint32_t s32, svuint8_t u8, svint8_t s8, svuint32_t u32) +{ + svusdot_s32 (s32); /* { dg-error {too few arguments to function 'svusdot_s32'} } */ + svusdot_s32 (s32, u8, s8, u32); /* { dg-error {too many arguments to function 'svusdot_s32'} } */ + svusdot_s32 (s32, u32, s8); /* { dg-error {incompatible type for argument 2 of 'svusdot_s32'} } */ + svusdot_s32 (s32, s8, s8); /* { dg-error {incompatible type for argument 2 of 'svusdot_s32'} } */ + svusdot_s32 (s32, u8, u8); /* { dg-error {incompatible type for argument 3 of 'svusdot_s32'} } */ + svusdot_s32 (s32, u8, s32); /* { dg-error {incompatible type for argument 3 of 'svusdot_s32'} } */ + svusdot_s32 (s32, u8, 0); /* { dg-error {incompatible type for argument 3 of 'svusdot_s32'} } */ + svusdot_s32 (s32, u8, s8); + return svusdot_s32 (s32, u8, s8); /* { dg-error {incompatible types when returning type 'svint32_t' but 'svuint32_t' was expected} } */ +} + +void +f2 (svbool_t pg, svint8_t s8, svuint8_t u8, svuint32_t u32, + svint32_t s32, svfloat32_t f32) +{ + svusdot (s32, u8); /* { dg-error {too few arguments to function 'svusdot'} } */ + svusdot (s32, u8, s8, u8); /* { dg-error {too many arguments to function 'svusdot'} } */ + svusdot (0, u8, s8); /* 
{ dg-error {passing 'int' to argument 1 of 'svusdot', which expects an SVE vector type} } */
+  svusdot (pg, u8, s8); /* { dg-error {'svusdot' has no form that takes 'svbool_t' arguments} } */
+  svusdot (u8, u8, s8); /* { dg-error {'svusdot' has no form that takes 'svuint8_t' arguments} } */
+  svusdot (f32, u8, s8); /* { dg-error {'svusdot' has no form that takes 'svfloat32_t' arguments} } */
+  svusdot (s32, u8, s8);
+  svusdot (s32, 0, s8); /* { dg-error {passing 'int' to argument 2 of 'svusdot', which expects an SVE vector type} } */
+  svusdot (s32, u8, u8); /* { dg-error {passing 'svuint8_t' to argument 3 of 'svusdot', which expects a vector of signed integers} } */
+  svusdot (s32, s8, s8); /* { dg-error {passing 'svint8_t' to argument 2 of 'svusdot', which expects a vector of unsigned integers} } */
+  svusdot (s32, u8, 0);
+  svusdot (s32, u8, s8);
+  svusdot (s32, u32, u32); /* { dg-error {passing 'svuint32_t' instead of the expected 'svuint8_t' to argument 2 of 'svusdot', after passing 'svint32_t' to argument 1} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/tmad_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/tmad_1.c
new file mode 100644
index 000000000..8b98fc24d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/tmad_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+void
+f1 (svbool_t pg, svfloat32_t f32, svfloat64_t f64, svint32_t s32, int i)
+{
+  svtmad (f32, f32); /* { dg-error {too few arguments to function 'svtmad'} } */
+  svtmad (f32, f32, 0, 0); /* { dg-error {too many arguments to function 'svtmad'} } */
+  svtmad (pg, pg, 0); /* { dg-error {'svtmad' has no form that takes 'svbool_t' arguments} } */
+  svtmad (s32, s32, 0); /* { dg-error {'svtmad' has no form that takes 'svint32_t' arguments} } */
+  svtmad (1, f32, 0); /* { dg-error {passing 'int' to argument 1 of 'svtmad', which expects an SVE vector type} } */
+  svtmad (f32, 1, 0); /* { dg-error {passing 'int' to argument 2 of 'svtmad', which expects an SVE vector type} } */
+  svtmad (f32, f64, 0); /* { dg-error {passing 'svfloat64_t' to argument 2 of 'svtmad', but previous arguments had type 'svfloat32_t'} } */
+  svtmad (f32, f32, s32); /* { dg-error {argument 3 of 'svtmad' must be an integer constant expression} } */
+  svtmad (f32, f32, i); /* { dg-error {argument 3 of 'svtmad' must be an integer constant expression} } */
+  svtmad (f32, f32, -1); /* { dg-error {passing -1 to argument 3 of 'svtmad', which expects a value in the range \[0, 7\]} } */
+  svtmad (f32, f32, 0);
+  svtmad (f32, f32, 1);
+  svtmad (f32, f32, 7);
+  svtmad (f32, f32, 8); /* { dg-error {passing 8 to argument 3 of 'svtmad', which expects a value in the range \[0, 7\]} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_1.c
new file mode 100644
index 000000000..70b2d9dd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_1.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+
+int svbool_t; /* { dg-message "note: previous declaration of 'svbool_t' was here" } */
+
+#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svbool_t' redeclared} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_10.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_10.c
new file mode 100644
index 000000000..8278c1cad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_10.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+
+typedef struct svint8x2_t svint8x2_t; /* { dg-message "note: previous declaration of 'svint8x2_t' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting types for 'svint8x2_t'} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_11.c new file mode 100644 index 000000000..2147df72c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_11.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ + +/* This isn't explicitly allowed or disallowed, but mustn't ICE. */ +struct svint8x2_t; + +#pragma GCC aarch64 "arm_sve.h" + +void +f (svint8x2_t *a, struct svint8x2_t *b) +{ + *a = *b; /* { dg-error {dereferencing pointer to incomplete type} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_12.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_12.c new file mode 100644 index 000000000..1a6ccbd05 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_12.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ + +/* This isn't explicitly allowed or disallowed, but mustn't ICE. */ +struct svint8x2_t { int x; }; + +#pragma GCC aarch64 "arm_sve.h" + +void +f (svint8x2_t *a, struct svint8x2_t *b) +{ + *a = *b; /* { dg-error {incompatible types} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_13.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_13.c new file mode 100644 index 000000000..62bab1f84 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_13.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-message "note: previous declaration of 'svint8x2_t' was here" } */ + +int svint8x2_t; /* { dg-error {'svint8x2_t' redeclared} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_14.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_14.c new file mode 100644 index 000000000..0f00db1fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_14.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +enum svpattern { FOO }; /* { dg-message "note: originally defined here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {redeclaration of 'enum svpattern'} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_15.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_15.c new file mode 100644 index 000000000..ea9721749 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_15.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-message "note: originally defined here" } */ + +enum svpattern { FOO }; /* { dg-error {redeclaration of 'enum svpattern'} } */ +enum foo { SV_ALL }; /* { dg-error {redeclaration of enumerator 'SV_ALL'} } */ +typedef int SV_POW2; /* { dg-error {'SV_POW2' redeclared as different kind of symbol} } */ +int SV_VL3; /* { dg-error {'SV_VL3' redeclared as different kind of symbol} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_16.c new file mode 100644 index 000000000..a59dabc6c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_16.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +struct svpattern { int x; }; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svpattern' defined as wrong 
kind of tag} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_17.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_17.c new file mode 100644 index 000000000..027fdb2b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_17.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" + +struct svpattern { int x; }; /* { dg-error {'svpattern' defined as wrong kind of tag} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_18.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_18.c new file mode 100644 index 000000000..b6706150b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_18.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svpattern; /* OK in C. */ + +#pragma GCC aarch64 "arm_sve.h" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_19.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_19.c new file mode 100644 index 000000000..c6379f762 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_19.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +#pragma GCC aarch64 "arm_sve.h" + +int svpattern; /* OK in C. */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_2.c new file mode 100644 index 000000000..ffd86ae7b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_2.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svint8_t; /* { dg-message "note: previous declaration of 'svint8_t' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svint8_t' redeclared} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_20.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_20.c new file mode 100644 index 000000000..3d770a956 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_20.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ + +enum foo { SV_VL4 }; +typedef int SV_POW2; +int SV_ALL; + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {redeclaration of enumerator 'SV_VL4'} } */ +/* { dg-error {'SV_POW2' redeclared as different kind of symbol} "" { target *-*-* } .-1 } */ +/* { dg-error {'SV_ALL' redeclared as different kind of symbol} "" { target *-*-* } .-2 } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_3.c new file mode 100644 index 000000000..f42dd9680 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_3.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svuint16_t; /* { dg-message "note: previous declaration of 'svuint16_t' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svuint16_t' redeclared} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_4.c new file mode 100644 index 000000000..91c95a1f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_4.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svfloat32_t; /* { dg-message "note: previous declaration of 'svfloat32_t' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svfloat32_t' redeclared} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_5.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_5.c new file mode 100644 index 000000000..3cb6b8a1c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_5.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +typedef int svbool_t; /* { dg-message "note: previous declaration of 'svbool_t' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting types for 'svbool_t'} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_6.c new file mode 100644 index 000000000..c051897b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_6.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ + +typedef __SVBool_t svbool_t; /* { dg-message "note: previous declaration of 'svbool_t' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {redefinition of typedef 'svbool_t'} } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_7.c new file mode 100644 index 000000000..fd4063154 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_7.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-std=gnu90" } */ + +typedef __SVBool_t svbool_t; + +/* Without -pedantic-errors this should compile. */ +#pragma GCC aarch64 "arm_sve.h" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_8.c new file mode 100644 index 000000000..41614a304 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_8.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +int svint8x2_t; /* { dg-message "note: previous declaration of 'svint8x2_t' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {'svint8x2_t' redeclared} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_9.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_9.c new file mode 100644 index 000000000..83b6855df --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/type_redef_9.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ + +typedef int svint8x2_t; /* { dg-message "note: previous declaration of 'svint8x2_t' was here" } */ + +#pragma GCC aarch64 "arm_sve.h" /* { dg-error {conflicting types for 'svint8x2_t'} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_1.c new file mode 100644 index 000000000..eef85a01d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_1.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32) +{ + svabs_m (s32, pg); /* { dg-error {too few arguments to function 'svabs_m'} } */ + svabs_m (s32, pg, s32, s32); /* { dg-error {too many arguments to function 'svabs_m'} } */ + svabs_m (0, pg, s32); /* { dg-error {passing 'int' to argument 1 of 'svabs_m', which expects an SVE vector type} } */ + svabs_m (s32, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svabs_m', which expects 'svbool_t'} } */ + svabs_m (s32, 0, s32); /* { dg-error {passing 'int' to argument 2 of 'svabs_m', which expects 'svbool_t'} } */ + svabs_m (s32, pg, s32); + svabs_m (u32, pg, u32); /* { dg-error {'svabs_m' has no form that takes 'svuint32_t' arguments} } */ + svabs_m (f32, pg, f32); + svabs_m (s32, 
pg, u32); /* { dg-error {passing 'svuint32_t' to argument 3 of 'svabs_m', but previous arguments had type 'svint32_t'} } */ + svabs_m (s32, pg, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svabs_m', but previous arguments had type 'svint32_t'} } */ + svabs_m (s32, pg, pg); /* { dg-error {passing 'svbool_t' to argument 3 of 'svabs_m', but previous arguments had type 'svint32_t'} } */ + svabs_m (pg, pg, s32); /* { dg-error {passing 'svint32_t' to argument 3 of 'svabs_m', but previous arguments had type 'svbool_t'} } */ + svabs_m (pg, pg, pg); /* { dg-error {'svabs_m' has no form that takes 'svbool_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_2.c new file mode 100644 index 000000000..e94673a66 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_2.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint8_t s8, svuint8_t u8) +{ + svabs_x (pg); /* { dg-error {too few arguments to function 'svabs_x'} } */ + svabs_x (pg, s8, s8); /* { dg-error {too many arguments to function 'svabs_x'} } */ + svabs_x (s8, s8); /* { dg-error {passing 'svint8_t' to argument 1 of 'svabs_x', which expects 'svbool_t'} } */ + svabs_x (pg, pg); /* { dg-error {'svabs_x' has no form that takes 'svbool_t' arguments} } */ + svabs_x (pg, 1); /* { dg-error {passing 'int' to argument 2 of 'svabs_x', which expects an SVE vector type} } */ + svabs_x (pg, s8); + svabs_x (pg, u8); /* { dg-error {'svabs_x' has no form that takes 'svuint8_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c new file mode 100644 index 000000000..caa4e623d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_1.c @@ -0,0 +1,73 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, svfloat16_t f16, svfloat32_t f32, + svfloat64_t f64) +{ + svcvt_f64_x (pg); /* { dg-error {too few arguments to function 'svcvt_f64_x'} } */ + svcvt_f64_x (pg, s32, 0); /* { dg-error {too many arguments to function 'svcvt_f64_x'} } */ + svcvt_f64_x (s32, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svcvt_f64_x', which expects 'svbool_t'} } */ + svcvt_f64_x (pg, 0); /* { dg-error {passing 'int' to argument 2 of 'svcvt_f64_x', which expects an SVE vector type} } */ + + svcvt_f64_x (pg, s8); /* { dg-error {'svcvt_f64_x' has no form that takes 'svint8_t' arguments} } */ + svcvt_f64_x (pg, s16); /* { dg-error {'svcvt_f64_x' has no form that takes 'svint16_t' arguments} } */ + svcvt_f64_x (pg, s32); + svcvt_f64_x (pg, s64); + svcvt_f64_x (pg, u8); /* { dg-error {'svcvt_f64_x' has no form that takes 'svuint8_t' arguments} } */ + svcvt_f64_x (pg, u16); /* { dg-error {'svcvt_f64_x' has no form that takes 'svuint16_t' arguments} } */ + svcvt_f64_x (pg, u32); + svcvt_f64_x (pg, u64); + svcvt_f64_x (pg, f16); + svcvt_f64_x (pg, f32); + svcvt_f64_x (pg, f64); /* { dg-error {'svcvt_f64_x' has no form that takes 'svfloat64_t' arguments} } */ + + svcvt_f32_x (pg, s8); /* { dg-error {'svcvt_f32_x' has no form that takes 'svint8_t' arguments} } */ + svcvt_f32_x (pg, s16); /* { dg-error {'svcvt_f32_x' has no form that takes 'svint16_t' arguments} } */ + svcvt_f32_x (pg, s32); + svcvt_f32_x (pg, s64); + svcvt_f32_x (pg, u8); /* { dg-error 
{'svcvt_f32_x' has no form that takes 'svuint8_t' arguments} } */ + svcvt_f32_x (pg, u16); /* { dg-error {'svcvt_f32_x' has no form that takes 'svuint16_t' arguments} } */ + svcvt_f32_x (pg, u32); + svcvt_f32_x (pg, u64); + svcvt_f32_x (pg, f16); + svcvt_f32_x (pg, f32); /* { dg-error {'svcvt_f32_x' has no form that takes 'svfloat32_t' arguments} } */ + svcvt_f32_x (pg, f64); + + svcvt_f16_x (pg, s8); /* { dg-error {'svcvt_f16_x' has no form that takes 'svint8_t' arguments} } */ + svcvt_f16_x (pg, s16); + svcvt_f16_x (pg, s32); + svcvt_f16_x (pg, s64); + svcvt_f16_x (pg, u8); /* { dg-error {'svcvt_f16_x' has no form that takes 'svuint8_t' arguments} } */ + svcvt_f16_x (pg, u16); + svcvt_f16_x (pg, u32); + svcvt_f16_x (pg, u64); + svcvt_f16_x (pg, f16); /* { dg-error {'svcvt_f16_x' has no form that takes 'svfloat16_t' arguments} } */ + svcvt_f16_x (pg, f32); + svcvt_f16_x (pg, f64); + + svcvt_s64_x (pg, f16); + svcvt_s64_x (pg, f32); + svcvt_s64_x (pg, f64); + + svcvt_s32_x (pg, f16); + svcvt_s32_x (pg, f32); + svcvt_s32_x (pg, f64); + + svcvt_s16_x (pg, f16); + svcvt_s16_x (pg, f32); /* { dg-error {'svcvt_s16_x' has no form that takes 'svfloat32_t' arguments} } */ + svcvt_s16_x (pg, f64); /* { dg-error {'svcvt_s16_x' has no form that takes 'svfloat64_t' arguments} } */ + + svcvt_u64_x (pg, f16); + svcvt_u64_x (pg, f32); + svcvt_u64_x (pg, f64); + + svcvt_u32_x (pg, f16); + svcvt_u32_x (pg, f32); + svcvt_u32_x (pg, f64); + + svcvt_u16_x (pg, f16); + svcvt_u16_x (pg, f32); /* { dg-error {'svcvt_u16_x' has no form that takes 'svfloat32_t' arguments} } */ + svcvt_u16_x (pg, f64); /* { dg-error {'svcvt_u16_x' has no form that takes 'svfloat64_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_2.c new file mode 100644 index 000000000..ddbd93b69 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_convert_2.c @@ -0,0 +1,76 @@ +#include + +void +test (svbool_t pg, svint8_t s8, svuint8_t u8, + svint16_t s16, svuint16_t u16, svint32_t s32, svuint32_t u32, + svint64_t s64, svuint64_t u64, svfloat16_t f16, svfloat32_t f32, + svfloat64_t f64) +{ + svcvt_f64_m (f64, pg); /* { dg-error {too few arguments to function 'svcvt_f64_m'} } */ + svcvt_f64_m (f64, pg, s32, 0); /* { dg-error {too many arguments to function 'svcvt_f64_m'} } */ + svcvt_f64_m (f32, pg, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svcvt_f64_m', which expects 'svfloat64_t'} } */ + svcvt_f64_m (0, pg, s32); /* { dg-error {passing 'int' to argument 1 of 'svcvt_f64_m', which expects 'svfloat64_t'} } */ + svcvt_f64_m (pg, pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svcvt_f64_m', which expects 'svfloat64_t'} } */ + svcvt_f64_m (f64, s32, s32); /* { dg-error {passing 'svint32_t' to argument 2 of 'svcvt_f64_m', which expects 'svbool_t'} } */ + svcvt_f64_m (f64, pg, 0); /* { dg-error {passing 'int' to argument 3 of 'svcvt_f64_m', which expects an SVE vector type} } */ + + svcvt_f64_m (f64, pg, s8); /* { dg-error {'svcvt_f64_m' has no form that takes 'svint8_t' arguments} } */ + svcvt_f64_m (f64, pg, s16); /* { dg-error {'svcvt_f64_m' has no form that takes 'svint16_t' arguments} } */ + svcvt_f64_m (f64, pg, s32); + svcvt_f64_m (f64, pg, s64); + svcvt_f64_m (f64, pg, u8); /* { dg-error {'svcvt_f64_m' has no form that takes 'svuint8_t' arguments} } */ + svcvt_f64_m (f64, pg, u16); /* { dg-error {'svcvt_f64_m' has no form that takes 'svuint16_t' arguments} } */ + 
svcvt_f64_m (f64, pg, u32); + svcvt_f64_m (f64, pg, u64); + svcvt_f64_m (f64, pg, f16); + svcvt_f64_m (f64, pg, f32); + svcvt_f64_m (f64, pg, f64); /* { dg-error {'svcvt_f64_m' has no form that takes 'svfloat64_t' arguments} } */ + + svcvt_f32_m (f32, pg, s8); /* { dg-error {'svcvt_f32_m' has no form that takes 'svint8_t' arguments} } */ + svcvt_f32_m (f32, pg, s16); /* { dg-error {'svcvt_f32_m' has no form that takes 'svint16_t' arguments} } */ + svcvt_f32_m (f32, pg, s32); + svcvt_f32_m (f32, pg, s64); + svcvt_f32_m (f32, pg, u8); /* { dg-error {'svcvt_f32_m' has no form that takes 'svuint8_t' arguments} } */ + svcvt_f32_m (f32, pg, u16); /* { dg-error {'svcvt_f32_m' has no form that takes 'svuint16_t' arguments} } */ + svcvt_f32_m (f32, pg, u32); + svcvt_f32_m (f32, pg, u64); + svcvt_f32_m (f32, pg, f16); + svcvt_f32_m (f32, pg, f32); /* { dg-error {'svcvt_f32_m' has no form that takes 'svfloat32_t' arguments} } */ + svcvt_f32_m (f32, pg, f64); + + svcvt_f16_m (f16, pg, s8); /* { dg-error {'svcvt_f16_m' has no form that takes 'svint8_t' arguments} } */ + svcvt_f16_m (f16, pg, s16); + svcvt_f16_m (f16, pg, s32); + svcvt_f16_m (f16, pg, s64); + svcvt_f16_m (f16, pg, u8); /* { dg-error {'svcvt_f16_m' has no form that takes 'svuint8_t' arguments} } */ + svcvt_f16_m (f16, pg, u16); + svcvt_f16_m (f16, pg, u32); + svcvt_f16_m (f16, pg, u64); + svcvt_f16_m (f16, pg, f16); /* { dg-error {'svcvt_f16_m' has no form that takes 'svfloat16_t' arguments} } */ + svcvt_f16_m (f16, pg, f32); + svcvt_f16_m (f16, pg, f64); + + svcvt_s64_m (s64, pg, f16); + svcvt_s64_m (s64, pg, f32); + svcvt_s64_m (s64, pg, f64); + + svcvt_s32_m (s32, pg, f16); + svcvt_s32_m (s32, pg, f32); + svcvt_s32_m (s32, pg, f64); + + svcvt_s16_m (s16, pg, f16); + svcvt_s16_m (s16, pg, f32); /* { dg-error {'svcvt_s16_m' has no form that takes 'svfloat32_t' arguments} } */ + svcvt_s16_m (s16, pg, f64); /* { dg-error {'svcvt_s16_m' has no form that takes 'svfloat64_t' arguments} } */ + + svcvt_u64_m (u64, pg, f16); + svcvt_u64_m (u64, pg, f32); + svcvt_u64_m (u64, pg, f64); + + svcvt_u32_m (u32, pg, f16); + svcvt_u32_m (u32, pg, f32); + svcvt_u32_m (u32, pg, f64); + + svcvt_u16_m (u16, pg, f16); + svcvt_u16_m (u16, pg, f32); /* { dg-error {'svcvt_u16_m' has no form that takes 'svfloat32_t' arguments} } */ + svcvt_u16_m (u16, pg, f64); /* { dg-error {'svcvt_u16_m' has no form that takes 'svfloat64_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_1.c new file mode 100644 index 000000000..888b52513 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_1.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64) +{ + svclz_m (u32, pg); /* { dg-error {too few arguments to function 'svclz_m'} } */ + svclz_m (u32, pg, s32, s32); /* { dg-error {too many arguments to function 'svclz_m'} } */ + svclz_m (0, pg, f32); /* { dg-error {passing 'int' to argument 1 of 'svclz_m', which expects an SVE vector type} } */ + svclz_m (u32, u32, f32); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svclz_m', which expects 'svbool_t'} } */ + svclz_m (u32, 0, f32); /* { dg-error {passing 'int' to argument 2 of 'svclz_m', which expects 'svbool_t'} } */ + svclz_m (u32, pg, s32); + svclz_m (u32, pg, u32); + svclz_m (u32, pg, f32); /* { dg-error {'svclz_m' has no form that takes 'svfloat32_t' arguments} 
} */ + svclz_m (u32, pg, pg); /* { dg-error {'svclz_m' has no form that takes 'svbool_t' arguments} } */ + + svclz_m (pg, pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ + svclz_m (s32, pg, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ + svclz_m (f32, pg, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ + svclz_m (s64, pg, s32); /* { dg-error {passing 'svint64_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ + svclz_m (u64, pg, s32); /* { dg-error {arguments 1 and 3 of 'svclz_m' must have the same element size, but the values passed here have type 'svuint64_t' and 'svint32_t' respectively} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_2.c new file mode 100644 index 000000000..233e847e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_2.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-flax-vector-conversions" } */ + +#include + +void +f1 (svbool_t pg, svint32_t s32, svuint32_t u32, svfloat32_t f32, + svint64_t s64, svuint64_t u64) +{ + svclz_m (u32, pg); /* { dg-error {too few arguments to function 'svclz_m'} } */ + svclz_m (u32, pg, s32, s32); /* { dg-error {too many arguments to function 'svclz_m'} } */ + svclz_m (0, pg, f32); /* { dg-error {passing 'int' to argument 1 of 'svclz_m', which expects an SVE vector type} } */ + svclz_m (u32, u32, f32); /* { dg-error {passing 'svuint32_t' to argument 2 of 'svclz_m', which expects 'svbool_t'} } */ + svclz_m (u32, 0, f32); /* { dg-error {passing 'int' to argument 2 of 'svclz_m', which expects 'svbool_t'} } */ + svclz_m (u32, pg, s32); + svclz_m (u32, pg, u32); + svclz_m (u32, pg, f32); /* { dg-error {'svclz_m' has no form that takes 'svfloat32_t' arguments} } */ + svclz_m (u32, pg, pg); /* { dg-error {'svclz_m' has no form that takes 'svbool_t' arguments} } */ + + svclz_m (pg, pg, s32); /* { dg-error {passing 'svbool_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ + svclz_m (s32, pg, s32); /* { dg-error {passing 'svint32_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ + svclz_m (f32, pg, s32); /* { dg-error {passing 'svfloat32_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ + svclz_m (s64, pg, s32); /* { dg-error {passing 'svint64_t' to argument 1 of 'svclz_m', which expects a vector of unsigned integers} } */ + svclz_m (u64, pg, s32); /* { dg-error {arguments 1 and 3 of 'svclz_m' must have the same element size, but the values passed here have type 'svuint64_t' and 'svint32_t' respectively} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_3.c new file mode 100644 index 000000000..da57b07ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_to_uint_3.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ + +#include + +void +f1 (svbool_t pg, svuint8_t u8) +{ + svcnt_x (pg); /* { dg-error {too few arguments to function 'svcnt_x'} } */ + svcnt_x (pg, u8, u8); /* { dg-error {too many arguments to function 'svcnt_x'} } */ + svcnt_x (u8, u8); /* { dg-error {passing 'svuint8_t' to argument 1 of 'svcnt_x', which expects 
'svbool_t'} } */
+  svcnt_x (pg, pg); /* { dg-error {'svcnt_x' has no form that takes 'svbool_t' arguments} } */
+  svcnt_x (pg, 1); /* { dg-error {passing 'int' to argument 2 of 'svcnt_x', which expects an SVE vector type} } */
+  svcnt_x (pg, u8);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_uint_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_uint_1.c
new file mode 100644
index 000000000..9c8acdf2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_uint_1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+
+#include <arm_sve.h>
+
+void
+f1 (svbool_t pg, svint8_t s8, svuint8_t u8,
+    svint16_t s16, svuint16_t u16, svfloat16_t f16)
+{
+  svexpa (); /* { dg-error {too few arguments to function 'svexpa'} } */
+  svexpa (u16, u16); /* { dg-error {too many arguments to function 'svexpa'} } */
+  svexpa (1); /* { dg-error {passing 'int' to argument 1 of 'svexpa', which expects an SVE vector type} } */
+  svexpa (pg); /* { dg-error {passing 'svbool_t' to argument 1 of 'svexpa', which expects a vector of unsigned integers} } */
+  svexpa (s8); /* { dg-error {passing 'svint8_t' to argument 1 of 'svexpa', which expects a vector of unsigned integers} } */
+  svexpa (s16); /* { dg-error {passing 'svint16_t' to argument 1 of 'svexpa', which expects a vector of unsigned integers} } */
+  svexpa (f16); /* { dg-error {passing 'svfloat16_t' to argument 1 of 'svexpa', which expects a vector of unsigned integers} } */
+
+  svexpa (u8); /* { dg-error {'svexpa' has no form that takes 'svuint8_t' arguments} } */
+  svexpa (u16);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_widen_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_widen_1.c
new file mode 100644
index 000000000..95a97a72e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/unary_widen_1.c
@@ -0,0 +1,25 @@
+#include <arm_sve.h>
+
+void
+test (svbool_t pg, svint8_t s8, svuint8_t u8,
+      svint16_t s16, svuint16_t u16, svfloat16_t f16,
+      svint32_t s32, svuint32_t u32, svfloat32_t f32,
+      svint64_t s64, svuint64_t u64, svfloat64_t f64, float f, int i)
+{
+  svunpklo (); /* { dg-error {too few arguments to function 'svunpklo'} } */
+  svunpklo (pg, s8); /* { dg-error {too many arguments to function 'svunpklo'} } */
+  svunpklo (i); /* { dg-error {passing 'int' to argument 1 of 'svunpklo', which expects an SVE vector type} } */
+  svunpklo (f); /* { dg-error {passing 'float' to argument 1 of 'svunpklo', which expects an SVE vector type} } */
+  svunpklo (pg);
+  svunpklo (s8);
+  svunpklo (s16);
+  svunpklo (s32);
+  svunpklo (s64); /* { dg-error {'svunpklo' has no form that takes 'svint64_t' arguments} } */
+  svunpklo (u8);
+  svunpklo (u16);
+  svunpklo (u32);
+  svunpklo (u64); /* { dg-error {'svunpklo' has no form that takes 'svuint64_t' arguments} } */
+  svunpklo (f16); /* { dg-error {'svunpklo' has no form that takes 'svfloat16_t' arguments} } */
+  svunpklo (f32); /* { dg-error {'svunpklo' has no form that takes 'svfloat32_t' arguments} } */
+  svunpklo (f64); /* { dg-error {'svunpklo' has no form that takes 'svfloat64_t' arguments} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_1.c
new file mode 100644
index 000000000..37524c2ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_1.c
@@ -0,0 +1,17 @@
+#include <arm_sve.h>
+
+void
+f (svint8_t s8, svuint16_t u16, svfloat32_t f32,
+   svint16x2_t s16x2, svuint32x3_t u32x3, svfloat64x4_t f64x4,
+
svbool_t pg) +{ + s8 = no_ret_s8 (); /* { dg-error {incompatible types when assigning to type 'svint8_t' from type 'int'} } */ + u16 = no_ret_u16 (); /* { dg-error {incompatible types when assigning to type 'svuint16_t' from type 'int'} } */ + f32 = no_ret_f32 (); /* { dg-error {incompatible types when assigning to type 'svfloat32_t' from type 'int'} } */ + s16x2 = no_ret_s16x2 (); /* { dg-error {incompatible types when assigning to type 'svint16x2_t' from type 'int'} } */ + u32x3 = no_ret_u32x3 (); /* { dg-error {incompatible types when assigning to type 'svuint32x3_t' from type 'int'} } */ + f64x4 = no_ret_f64x4 (); /* { dg-error {incompatible types when assigning to type 'svfloat64x4_t' from type 'int'} } */ + pg = no_ret_pg (); /* { dg-error {incompatible types when assigning to type 'svbool_t' from type 'int'} } */ + + no_pass_args (pg, u16, f32, s16x2, u32x3, f64x4, pg); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_2.c new file mode 100644 index 000000000..7e869bda8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/undeclared_2.c @@ -0,0 +1,15 @@ +#include + +void +f (svint8_t s8, svuint16_t u16, svfloat32_t f32, + svint16x2_t s16x2, svuint32x3_t u32x3, svfloat64x4_t f64x4, + svbool_t pg) +{ + s8 = svlsr_x (pg, s8, 1); /* { dg-error {'svlsr_x' has no form that takes 'svint8_t' arguments} } */ + u16 = svneg_x (pg, u16); /* { dg-error {'svneg_x' has no form that takes 'svuint16_t' arguments} } */ + f32 = svclz_x (pg, f32); /* { dg-error {'svclz_x' has no form that takes 'svfloat32_t' arguments} } */ + s16x2 = svcreate2 (s8); /* { dg-error {too few arguments to function 'svcreate2'} } */ + u32x3 = svcreate3 (u16, u16, f32); /* { dg-error {passing 'svfloat32_t' to argument 3 of 'svcreate3', but previous arguments had type 'svuint16_t'} } */ + f64x4 = svcreate4 (f32, f32, f32, f32, f32); /* { dg-error {too many arguments to function 'svcreate4'} } */ + pg = svadd_x (pg, pg, pg); /* { dg-error {'svadd_x' has no form that takes 'svbool_t' arguments} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/add_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/add_1.c new file mode 100644 index 000000000..f5c6285f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/add_1.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +#include + +void +foo (svint8_t *res1, svint8_t *res2, svbool_t pg, svint8_t a, svint8_t b) +{ + *res1 = svadd_m (pg, a, b); + *res2 = svadd_m (pg, a, b); +} + +/* { dg-final { scan-tree-dump-times {svadd_s8_m|svadd_m} 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/and_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/and_1.c new file mode 100644 index 000000000..59348cece --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/and_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svand_z (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) +{ + svbool_t res = svand_z (pg, x, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tands\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tand\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/bic_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/bic_1.c new file mode 100644 index 000000000..e1c484995 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/bic_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svbic_z (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) +{ + svbool_t res = svbic_z (pg, x, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tbics\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tbic\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_1.c new file mode 100644 index 000000000..24aa8f317 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svbrka_m (x, pg, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) +{ + svbool_t res = svbrka_m (x, pg, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tbrkas\tp[0-9]+\.b, p[0-9]+/m,} 2 } } */ +/* { dg-final { scan-assembler-not {\tbrka\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_2.c new file mode 100644 index 000000000..8aa338867 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brka_2.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, int *any, svbool_t *ptr) +{ + svbool_t res = svbrka_z (pg, x); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, int *any) +{ + svbool_t res = svbrka_z (pg, x); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tbrkas\tp[0-9]+\.b, p[0-9]+/z,} 2 } } */ +/* { dg-final { scan-assembler-not {\tbrka\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_1.c new file mode 100644 index 000000000..07e3622ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svbrkb_m (x, pg, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) +{ + svbool_t res = svbrkb_m (x, pg, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tbrkbs\tp[0-9]+\.b, p[0-9]+/m,} 2 } } */ +/* { dg-final { scan-assembler-not {\tbrkb\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_2.c new file mode 100644 index 000000000..ee677cedd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkb_2.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, int *any, svbool_t *ptr) +{ + svbool_t res = svbrkb_z (pg, x); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, int *any) 
+{ + svbool_t res = svbrkb_z (pg, x); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tbrkbs\tp[0-9]+\.b, p[0-9]+/z,} 2 } } */ +/* { dg-final { scan-assembler-not {\tbrkb\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkn_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkn_1.c new file mode 100644 index 000000000..7fd9318c1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkn_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svbrkn_z (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) +{ + svbool_t res = svbrkn_z (pg, x, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tbrkns\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tbrkn\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpa_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpa_1.c new file mode 100644 index 000000000..18cca370c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpa_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svbrkpa_z (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) +{ + svbool_t res = svbrkpa_z (pg, x, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tbrkpas\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tbrkpa\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpb_1.c new file mode 100644 index 000000000..73eb7094d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/brkpb_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svbrkpb_z (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) +{ + svbool_t res = svbrkpb_z (pg, x, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tbrkpbs\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tbrkpb\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_1.c new file mode 100644 index 000000000..dd8f6c494 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svint8_t x, svint64_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svcmpeq_wide (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svint8_t x, svint64_t y, int *any) +{ + svbool_t res = svcmpeq_wide (pg, x, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tcmpeq\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_2.c new file mode 100644 index 000000000..028d37516 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_2.c 
@@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svint8_t x, svint8_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svcmpeq (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svint8_t x, svint8_t y, int *any) +{ + svbool_t res = svcmpeq (pg, x, y); + return svptest_any (pg, res); +} + +void +test3 (svbool_t pg, svint8_t x, int *any, svbool_t *ptr) +{ + svbool_t res = svcmpeq (pg, x, 10); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test4 (svbool_t pg, svint8_t x, int *any) +{ + svbool_t res = svcmpeq (pg, x, 10); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tcmpeq\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tcmpeq\t[^\n]*, #10} 2 } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_3.c new file mode 100644 index 000000000..115b26c8e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cmpeq_3.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svfloat32_t x, svfloat32_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svcmpeq (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svfloat32_t x, svfloat32_t y, int *any) +{ + svbool_t res = svcmpeq (pg, x, y); + return svptest_any (pg, res); +} + +void +test3 (svbool_t pg, svfloat32_t x, int *any, svbool_t *ptr) +{ + svbool_t res = svcmpeq (pg, x, 0.0); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test4 (svbool_t pg, svfloat32_t x, int *any) +{ + svbool_t res = svcmpeq (pg, x, 0.0); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\tfcmeq\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\t[^\n]*, #0\.0} 2 } } */ +/* { dg-final { scan-assembler-times {\tptest\t} 4 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntb_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntb_pat_1.c new file mode 100644 index 000000000..d57a75c20 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntb_pat_1.c @@ -0,0 +1,132 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O -msve-vector-bits=256" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +** cntb_pow2: +** mov x0, #?32 +** ret +*/ +uint64_t cntb_pow2 () { return svcntb_pat (SV_POW2); } + +/* +** cntb_vl1: +** mov x0, #?1 +** ret +*/ +uint64_t cntb_vl1 () { return svcntb_pat (SV_VL1); } + +/* +** cntb_vl2: +** mov x0, #?2 +** ret +*/ +uint64_t cntb_vl2 () { return svcntb_pat (SV_VL2); } + +/* +** cntb_vl3: +** mov x0, #?3 +** ret +*/ +uint64_t cntb_vl3 () { return svcntb_pat (SV_VL3); } + +/* +** cntb_vl4: +** mov x0, #?4 +** ret +*/ +uint64_t cntb_vl4 () { return svcntb_pat (SV_VL4); } + +/* +** cntb_vl5: +** mov x0, #?5 +** ret +*/ +uint64_t cntb_vl5 () { return svcntb_pat (SV_VL5); } + +/* +** cntb_vl6: +** mov x0, #?6 +** ret +*/ +uint64_t cntb_vl6 () { return svcntb_pat (SV_VL6); } + +/* +** cntb_vl7: +** mov x0, #?7 +** ret +*/ +uint64_t cntb_vl7 () { return svcntb_pat (SV_VL7); } + +/* +** cntb_vl8: +** mov x0, #?8 +** ret +*/ +uint64_t cntb_vl8 () { return svcntb_pat (SV_VL8); } + +/* +** cntb_vl16: +** mov x0, #?16 +** ret +*/ +uint64_t cntb_vl16 () { return svcntb_pat (SV_VL16); } + +/* +** cntb_vl32: +** mov x0, #?32 +** 
ret +*/ +uint64_t cntb_vl32 () { return svcntb_pat (SV_VL32); } + +/* +** cntb_vl64: +** mov x0, #?0 +** ret +*/ +uint64_t cntb_vl64 () { return svcntb_pat (SV_VL64); } + +/* +** cntb_vl128: +** mov x0, #?0 +** ret +*/ +uint64_t cntb_vl128 () { return svcntb_pat (SV_VL128); } + +/* +** cntb_vl256: +** mov x0, #?0 +** ret +*/ +uint64_t cntb_vl256 () { return svcntb_pat (SV_VL256); } + +/* +** cntb_mul3: +** mov x0, #?30 +** ret +*/ +uint64_t cntb_mul3 () { return svcntb_pat (SV_MUL3); } + +/* +** cntb_mul4: +** mov x0, #?32 +** ret +*/ +uint64_t cntb_mul4 () { return svcntb_pat (SV_MUL4); } + +/* +** cntb_all: +** mov x0, #?32 +** ret +*/ +uint64_t cntb_all () { return svcntb_pat (SV_ALL); } + +#ifdef __cplusplus +} +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntd_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntd_pat_1.c new file mode 100644 index 000000000..d93a32054 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntd_pat_1.c @@ -0,0 +1,132 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O -msve-vector-bits=256" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +** cntd_pow2: +** mov x0, #?4 +** ret +*/ +uint64_t cntd_pow2 () { return svcntd_pat (SV_POW2); } + +/* +** cntd_vl1: +** mov x0, #?1 +** ret +*/ +uint64_t cntd_vl1 () { return svcntd_pat (SV_VL1); } + +/* +** cntd_vl2: +** mov x0, #?2 +** ret +*/ +uint64_t cntd_vl2 () { return svcntd_pat (SV_VL2); } + +/* +** cntd_vl3: +** mov x0, #?3 +** ret +*/ +uint64_t cntd_vl3 () { return svcntd_pat (SV_VL3); } + +/* +** cntd_vl4: +** mov x0, #?4 +** ret +*/ +uint64_t cntd_vl4 () { return svcntd_pat (SV_VL4); } + +/* +** cntd_vl5: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl5 () { return svcntd_pat (SV_VL5); } + +/* +** cntd_vl6: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl6 () { return svcntd_pat (SV_VL6); } + +/* +** cntd_vl7: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl7 () { return svcntd_pat (SV_VL7); } + +/* +** cntd_vl8: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl8 () { return svcntd_pat (SV_VL8); } + +/* +** cntd_vl16: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl16 () { return svcntd_pat (SV_VL16); } + +/* +** cntd_vl32: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl32 () { return svcntd_pat (SV_VL32); } + +/* +** cntd_vl64: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl64 () { return svcntd_pat (SV_VL64); } + +/* +** cntd_vl128: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl128 () { return svcntd_pat (SV_VL128); } + +/* +** cntd_vl256: +** mov x0, #?0 +** ret +*/ +uint64_t cntd_vl256 () { return svcntd_pat (SV_VL256); } + +/* +** cntd_mul3: +** mov x0, #?3 +** ret +*/ +uint64_t cntd_mul3 () { return svcntd_pat (SV_MUL3); } + +/* +** cntd_mul4: +** mov x0, #?4 +** ret +*/ +uint64_t cntd_mul4 () { return svcntd_pat (SV_MUL4); } + +/* +** cntd_all: +** mov x0, #?4 +** ret +*/ +uint64_t cntd_all () { return svcntd_pat (SV_ALL); } + +#ifdef __cplusplus +} +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnth_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnth_pat_1.c new file mode 100644 index 000000000..bd988f53d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cnth_pat_1.c @@ -0,0 +1,132 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O -msve-vector-bits=256" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +** cnth_pow2: +** mov x0, #?16 +** ret +*/ 
+uint64_t cnth_pow2 () { return svcnth_pat (SV_POW2); } + +/* +** cnth_vl1: +** mov x0, #?1 +** ret +*/ +uint64_t cnth_vl1 () { return svcnth_pat (SV_VL1); } + +/* +** cnth_vl2: +** mov x0, #?2 +** ret +*/ +uint64_t cnth_vl2 () { return svcnth_pat (SV_VL2); } + +/* +** cnth_vl3: +** mov x0, #?3 +** ret +*/ +uint64_t cnth_vl3 () { return svcnth_pat (SV_VL3); } + +/* +** cnth_vl4: +** mov x0, #?4 +** ret +*/ +uint64_t cnth_vl4 () { return svcnth_pat (SV_VL4); } + +/* +** cnth_vl5: +** mov x0, #?5 +** ret +*/ +uint64_t cnth_vl5 () { return svcnth_pat (SV_VL5); } + +/* +** cnth_vl6: +** mov x0, #?6 +** ret +*/ +uint64_t cnth_vl6 () { return svcnth_pat (SV_VL6); } + +/* +** cnth_vl7: +** mov x0, #?7 +** ret +*/ +uint64_t cnth_vl7 () { return svcnth_pat (SV_VL7); } + +/* +** cnth_vl8: +** mov x0, #?8 +** ret +*/ +uint64_t cnth_vl8 () { return svcnth_pat (SV_VL8); } + +/* +** cnth_vl16: +** mov x0, #?16 +** ret +*/ +uint64_t cnth_vl16 () { return svcnth_pat (SV_VL16); } + +/* +** cnth_vl32: +** mov x0, #?0 +** ret +*/ +uint64_t cnth_vl32 () { return svcnth_pat (SV_VL32); } + +/* +** cnth_vl64: +** mov x0, #?0 +** ret +*/ +uint64_t cnth_vl64 () { return svcnth_pat (SV_VL64); } + +/* +** cnth_vl128: +** mov x0, #?0 +** ret +*/ +uint64_t cnth_vl128 () { return svcnth_pat (SV_VL128); } + +/* +** cnth_vl256: +** mov x0, #?0 +** ret +*/ +uint64_t cnth_vl256 () { return svcnth_pat (SV_VL256); } + +/* +** cnth_mul3: +** mov x0, #?15 +** ret +*/ +uint64_t cnth_mul3 () { return svcnth_pat (SV_MUL3); } + +/* +** cnth_mul4: +** mov x0, #?16 +** ret +*/ +uint64_t cnth_mul4 () { return svcnth_pat (SV_MUL4); } + +/* +** cnth_all: +** mov x0, #?16 +** ret +*/ +uint64_t cnth_all () { return svcnth_pat (SV_ALL); } + +#ifdef __cplusplus +} +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntw_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntw_pat_1.c new file mode 100644 index 000000000..53c8435b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/cntw_pat_1.c @@ -0,0 +1,132 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O -msve-vector-bits=256" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +** cntw_pow2: +** mov x0, #?8 +** ret +*/ +uint64_t cntw_pow2 () { return svcntw_pat (SV_POW2); } + +/* +** cntw_vl1: +** mov x0, #?1 +** ret +*/ +uint64_t cntw_vl1 () { return svcntw_pat (SV_VL1); } + +/* +** cntw_vl2: +** mov x0, #?2 +** ret +*/ +uint64_t cntw_vl2 () { return svcntw_pat (SV_VL2); } + +/* +** cntw_vl3: +** mov x0, #?3 +** ret +*/ +uint64_t cntw_vl3 () { return svcntw_pat (SV_VL3); } + +/* +** cntw_vl4: +** mov x0, #?4 +** ret +*/ +uint64_t cntw_vl4 () { return svcntw_pat (SV_VL4); } + +/* +** cntw_vl5: +** mov x0, #?5 +** ret +*/ +uint64_t cntw_vl5 () { return svcntw_pat (SV_VL5); } + +/* +** cntw_vl6: +** mov x0, #?6 +** ret +*/ +uint64_t cntw_vl6 () { return svcntw_pat (SV_VL6); } + +/* +** cntw_vl7: +** mov x0, #?7 +** ret +*/ +uint64_t cntw_vl7 () { return svcntw_pat (SV_VL7); } + +/* +** cntw_vl8: +** mov x0, #?8 +** ret +*/ +uint64_t cntw_vl8 () { return svcntw_pat (SV_VL8); } + +/* +** cntw_vl16: +** mov x0, #?0 +** ret +*/ +uint64_t cntw_vl16 () { return svcntw_pat (SV_VL16); } + +/* +** cntw_vl32: +** mov x0, #?0 +** ret +*/ +uint64_t cntw_vl32 () { return svcntw_pat (SV_VL32); } + +/* +** cntw_vl64: +** mov x0, #?0 +** ret +*/ +uint64_t cntw_vl64 () { return svcntw_pat (SV_VL64); } + +/* +** cntw_vl128: +** mov x0, #?0 +** ret +*/ +uint64_t cntw_vl128 () { 
return svcntw_pat (SV_VL128); } + +/* +** cntw_vl256: +** mov x0, #?0 +** ret +*/ +uint64_t cntw_vl256 () { return svcntw_pat (SV_VL256); } + +/* +** cntw_mul3: +** mov x0, #?6 +** ret +*/ +uint64_t cntw_mul3 () { return svcntw_pat (SV_MUL3); } + +/* +** cntw_mul4: +** mov x0, #?8 +** ret +*/ +uint64_t cntw_mul4 () { return svcntw_pat (SV_MUL4); } + +/* +** cntw_all: +** mov x0, #?8 +** ret +*/ +uint64_t cntw_all () { return svcntw_pat (SV_ALL); } + +#ifdef __cplusplus +} +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_1.c new file mode 100644 index 000000000..0442efef3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_1.c @@ -0,0 +1,16 @@ +/* { dg-options "-g" } */ + +#include + +svbool_t f_b (svbool_t x) { return x; } +svint8_t f_s8 (svint8_t x) { return x; } +svuint8_t f_u8 (svuint8_t x) { return x; } +svint16_t f_s16 (svint16_t x) { return x; } +svuint16_t f_u16 (svuint16_t x) { return x; } +svfloat16_t f_f16 (svfloat16_t x) { return x; } +svint32_t f_s32 (svint32_t x) { return x; } +svuint32_t f_u32 (svuint32_t x) { return x; } +svfloat32_t f_f32 (svfloat32_t x) { return x; } +svint64_t f_s64 (svint64_t x) { return x; } +svuint64_t f_u64 (svuint64_t x) { return x; } +svfloat64_t f_f64 (svfloat64_t x) { return x; } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_2.c new file mode 100644 index 000000000..63a26d2e9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_2.c @@ -0,0 +1,16 @@ +/* { dg-options "-g" } */ + +#include + +svbool_t f_b (svbool_t x) { return svptrue_b32 (); } +svint8_t f_s8 (svint8_t x) { return svdup_s8 (0); } +svuint8_t f_u8 (svuint8_t x) { return svdup_u8 (1); } +svint16_t f_s16 (svint16_t x) { return svdup_s16 (2); } +svuint16_t f_u16 (svuint16_t x) { return svdup_u16 (3); } +svfloat16_t f_f16 (svfloat16_t x) { return svdup_f16 (4); } +svint32_t f_s32 (svint32_t x) { return svdup_s32 (5); } +svuint32_t f_u32 (svuint32_t x) { return svdup_u32 (6); } +svfloat32_t f_f32 (svfloat32_t x) { return svdup_f32 (7); } +svint64_t f_s64 (svint64_t x) { return svdup_s64 (8); } +svuint64_t f_u64 (svuint64_t x) { return svdup_u64 (9); } +svfloat64_t f_f64 (svfloat64_t x) { return svdup_f64 (10); } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_3.c new file mode 100644 index 000000000..ac151e465 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/debug_3.c @@ -0,0 +1,39 @@ +/* { dg-options "-g" } */ + +#include + +svint8x2_t f2_s8 (svint8x2_t x) { return x; } +svuint8x2_t f2_u8 (svuint8x2_t x) { return x; } +svint16x2_t f2_s16 (svint16x2_t x) { return x; } +svuint16x2_t f2_u16 (svuint16x2_t x) { return x; } +svfloat16x2_t f2_f16 (svfloat16x2_t x) { return x; } +svint32x2_t f2_s32 (svint32x2_t x) { return x; } +svuint32x2_t f2_u32 (svuint32x2_t x) { return x; } +svfloat32x2_t f2_f32 (svfloat32x2_t x) { return x; } +svint64x2_t f2_s64 (svint64x2_t x) { return x; } +svuint64x2_t f2_u64 (svuint64x2_t x) { return x; } +svfloat64x2_t f2_f64 (svfloat64x2_t x) { return x; } + +svint8x3_t f3_s8 (svint8x3_t x) { return x; } +svuint8x3_t f3_u8 (svuint8x3_t x) { return x; } +svint16x3_t f3_s16 (svint16x3_t x) { return x; } +svuint16x3_t f3_u16 (svuint16x3_t x) { return x; } +svfloat16x3_t f3_f16 (svfloat16x3_t x) { return x; } +svint32x3_t f3_s32 (svint32x3_t x) { return 
x; } +svuint32x3_t f3_u32 (svuint32x3_t x) { return x; } +svfloat32x3_t f3_f32 (svfloat32x3_t x) { return x; } +svint64x3_t f3_s64 (svint64x3_t x) { return x; } +svuint64x3_t f3_u64 (svuint64x3_t x) { return x; } +svfloat64x3_t f3_f64 (svfloat64x3_t x) { return x; } + +svint8x4_t f4_s8 (svint8x4_t x) { return x; } +svuint8x4_t f4_u8 (svuint8x4_t x) { return x; } +svint16x4_t f4_s16 (svint16x4_t x) { return x; } +svuint16x4_t f4_u16 (svuint16x4_t x) { return x; } +svfloat16x4_t f4_f16 (svfloat16x4_t x) { return x; } +svint32x4_t f4_s32 (svint32x4_t x) { return x; } +svuint32x4_t f4_u32 (svuint32x4_t x) { return x; } +svfloat32x4_t f4_f42 (svfloat32x4_t x) { return x; } +svint64x4_t f4_s64 (svint64x4_t x) { return x; } +svuint64x4_t f4_u64 (svuint64x4_t x) { return x; } +svfloat64x4_t f4_f64 (svfloat64x4_t x) { return x; } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/double_pragma_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/double_pragma_1.c new file mode 100644 index 000000000..9b3c3697c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/double_pragma_1.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "" } */ + +/* It doesn't really matter if this produces errors about redefinitions, + but it mustn't trigger an ICE. */ +#pragma GCC aarch64 "arm_sve.h" +#pragma GCC aarch64 "arm_sve.h" /* { dg-error "duplicate definition of 'arm_sve.h'" } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c new file mode 100644 index 000000000..d71507baa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mlittle-endian" } */ + +#include + +svint32_t +dupq (int x) +{ + return svdupq_s32 (x, 1, 2, 3); +} + +/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */ +/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ +/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_10.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_10.c new file mode 100644 index 000000000..f8f797c97 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_10.c @@ -0,0 +1,66 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#include + +svbool_t __attribute__ ((noipa)) +make_b8 (int8_t x0, int8_t x1, int8_t x2, int8_t x3, + int8_t x4, int8_t x5, int8_t x6, int8_t x7, + int8_t x8, int8_t x9, int8_t xa, int8_t xb, + int8_t xc, int8_t xd, int8_t xe, int8_t xf) +{ + return svdupq_b8 (x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, xa, xb, xc, xd, xe, xf); +} + +svbool_t __attribute__ ((noipa)) +make_b16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3, + int16_t x4, int16_t x5, int16_t x6, int16_t x7) +{ + return svdupq_b16 (x0, x1, x2, x3, x4, x5, x6, x7); +} + +svbool_t __attribute__ ((noipa)) +make_b32 (int32_t x0, int32_t x1, int32_t x2, int32_t x3) +{ + return svdupq_b32 (x0, x1, x2, x3); +} + +svbool_t __attribute__ ((noipa)) +make_b64 (int64_t x0, int64_t x1) +{ + return svdupq_b64 (x0, x1); +} + +int8_t a[16] = { 1, 0, 0, -3, 0, 9, 11, 0, 0, 1, 0, -4, 9, 9, 0, 0 }; + +int +main () +{ + svbool_t pg = svptrue_pat_b8 (SV_VL16); + svbool_t b8 = make_b8 (a[0], a[1], a[2], a[3], + a[4], a[5], a[6], a[7], + a[8], a[9], a[10], a[11], + a[12], a[13], a[14], a[15]); + if 
(svptest_any (svptrue_b8 (), + sveor_z (pg, b8, svcmpne (pg, svld1 (pg, a), 0)))) + __builtin_abort (); + + svbool_t b16 = make_b16 (a[0], a[1], a[2], a[3], + a[4], a[5], a[6], a[7]); + if (svptest_any (svptrue_b16 (), + sveor_z (pg, b16, svcmpne (pg, svld1sb_u16 (pg, a), 0)))) + __builtin_abort (); + + svbool_t b32 = make_b32 (a[0], a[1], a[2], a[3]); + if (svptest_any (svptrue_b32 (), + sveor_z (pg, b32, svcmpne (pg, svld1sb_u32 (pg, a), 0)))) + __builtin_abort (); + + svbool_t b64 = make_b64 (a[0], a[1]); + if (svptest_any (svptrue_b64 (), + sveor_z (pg, b64, svcmpne (pg, svld1sb_u64 (pg, a), 0)))) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c new file mode 100644 index 000000000..d494943a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mbig-endian" } */ + +/* To avoid needing big-endian header files. */ +#pragma GCC aarch64 "arm_sve.h" + +svint32_t +dupq (int x) +{ + return svdupq_s32 (x, 1, 2, 3); +} + +/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */ +/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ +/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t2\n\t\.word\t1\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c new file mode 100644 index 000000000..4bc8259df --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mlittle-endian" } */ + +/* To avoid needing big-endian header files. */ +#pragma GCC aarch64 "arm_sve.h" + +svint32_t +dupq (int x) +{ + return svdupq_s32 (0, 1, x, 3); +} + +/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */ +/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ +/* { dg-final { scan-assembler {\t\.word\t0\n\t\.word\t1\n\t\.word\t[^\n]*\n\t\.word\t3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c new file mode 100644 index 000000000..6f9f9f2f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mbig-endian" } */ + +/* To avoid needing big-endian header files. 
*/ +#pragma GCC aarch64 "arm_sve.h" + +svint32_t +dupq (int x) +{ + return svdupq_s32 (0, 1, x, 3); +} + +/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */ +/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ +/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t[^\n]*\n\t\.word\t1\n\t\.word\t0\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c new file mode 100644 index 000000000..53426c9af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mlittle-endian" } */ + +#include + +svint32_t +dupq (int x1, int x2, int x3, int x4) +{ + return svdupq_s32 (x1, x2, x3, x4); +} + +/* { dg-final { scan-assembler-not {\tldr\t} } } */ +/* { dg-final { scan-assembler {, [wx]0\n} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */ +/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c new file mode 100644 index 000000000..dfce5e7a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mbig-endian" } */ + +/* To avoid needing big-endian header files. */ +#pragma GCC aarch64 "arm_sve.h" + +svint32_t +dupq (int x1, int x2, int x3, int x4) +{ + return svdupq_s32 (x1, x2, x3, x4); +} + +/* { dg-final { scan-assembler-not {\tldr\t} } } */ +/* { dg-final { scan-assembler {, [wx]0\n} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */ +/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */ +/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_7.c new file mode 100644 index 000000000..08decb5f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_7.c @@ -0,0 +1,66 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#include + +svint8_t __attribute__ ((noipa)) +make_s8 (int8_t x0, int8_t x1, int8_t x2, int8_t x3, + int8_t x4, int8_t x5, int8_t x6, int8_t x7, + int8_t x8, int8_t x9, int8_t xa, int8_t xb, + int8_t xc, int8_t xd, int8_t xe, int8_t xf) +{ + return svdupq_s8 (x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, xa, xb, xc, xd, xe, xf); +} + +svint16_t __attribute__ ((noipa)) +make_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3, + int16_t x4, int16_t x5, int16_t x6, int16_t x7) +{ + return svdupq_s16 (x0, x1, x2, x3, x4, x5, x6, x7); +} + +svint32_t __attribute__ ((noipa)) +make_s32 (int32_t x0, int32_t x1, int32_t x2, int32_t x3) +{ + return svdupq_s32 (x0, x1, x2, x3); +} + +svint64_t __attribute__ ((noipa)) +make_s64 (int64_t x0, int64_t x1) +{ + return svdupq_s64 (x0, x1); +} + +int8_t a[16] = { 1, -44, 91, -24, 101, -55, 77, 83, + -30, 69, 121, -128, -1, 13, 127, 26 }; +int16_t b[8] = { -716, -10288, 30604, -19258, -9418, -10435, -16001, 7300 }; +int32_t c[4] = { 1268374995, -1023602831, -891830021, -1793452959 
}; +int64_t d[2] = { 0x123456789abcdefLL, -0x123456789abcdefLL }; + +int +main () +{ + svbool_t pg = svptrue_pat_b8 (SV_VL16); + svint8_t s8 = make_s8 (a[0], a[1], a[2], a[3], + a[4], a[5], a[6], a[7], + a[8], a[9], a[10], a[11], + a[12], a[13], a[14], a[15]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, s8, svld1 (pg, a)))) + __builtin_abort (); + + svint16_t s16 = make_s16 (b[0], b[1], b[2], b[3], + b[4], b[5], b[6], b[7]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, s16, svld1 (pg, b)))) + __builtin_abort (); + + svint32_t s32 = make_s32 (c[0], c[1], c[2], c[3]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, s32, svld1 (pg, c)))) + __builtin_abort (); + + svint64_t s64 = make_s64 (d[0], d[1]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, s64, svld1 (pg, d)))) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_8.c new file mode 100644 index 000000000..c20fb7324 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_8.c @@ -0,0 +1,66 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#include + +svuint8_t __attribute__ ((noipa)) +make_u8 (uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, + uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7, + uint8_t x8, uint8_t x9, uint8_t xa, uint8_t xb, + uint8_t xc, uint8_t xd, uint8_t xe, uint8_t xf) +{ + return svdupq_u8 (x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, xa, xb, xc, xd, xe, xf); +} + +svuint16_t __attribute__ ((noipa)) +make_u16 (uint16_t x0, uint16_t x1, uint16_t x2, uint16_t x3, + uint16_t x4, uint16_t x5, uint16_t x6, uint16_t x7) +{ + return svdupq_u16 (x0, x1, x2, x3, x4, x5, x6, x7); +} + +svuint32_t __attribute__ ((noipa)) +make_u32 (uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3) +{ + return svdupq_u32 (x0, x1, x2, x3); +} + +svuint64_t __attribute__ ((noipa)) +make_u64 (uint64_t x0, uint64_t x1) +{ + return svdupq_u64 (x0, x1); +} + +uint8_t a[16] = { 1, 212, 91, 232, 101, 201, 77, 83, + 226, 69, 121, 128, 255, 13, 127, 26 }; +uint16_t b[8] = { 64820, 55248, 30604, 46278, 56118, 55101, 49535, 7300 }; +uint32_t c[4] = { 1268374995, 3271364465, 3403137275, 2501514337 }; +uint64_t d[2] = { 0x123456789abcdefULL, 0xfedcba9876543210ULL }; + +int +main () +{ + svbool_t pg = svptrue_pat_b8 (SV_VL16); + svuint8_t u8 = make_u8 (a[0], a[1], a[2], a[3], + a[4], a[5], a[6], a[7], + a[8], a[9], a[10], a[11], + a[12], a[13], a[14], a[15]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, u8, svld1 (pg, a)))) + __builtin_abort (); + + svuint16_t u16 = make_u16 (b[0], b[1], b[2], b[3], + b[4], b[5], b[6], b[7]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, u16, svld1 (pg, b)))) + __builtin_abort (); + + svuint32_t u32 = make_u32 (c[0], c[1], c[2], c[3]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, u32, svld1 (pg, c)))) + __builtin_abort (); + + svuint64_t u64 = make_u64 (d[0], d[1]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, u64, svld1 (pg, d)))) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_9.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_9.c new file mode 100644 index 000000000..b29aa9474 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_9.c @@ -0,0 +1,47 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#include + +svfloat16_t __attribute__ ((noipa)) +make_f16 (float16_t x0, float16_t x1, float16_t x2, float16_t x3, + float16_t x4, float16_t 
x5, float16_t x6, float16_t x7) +{ + return svdupq_f16 (x0, x1, x2, x3, x4, x5, x6, x7); +} + +svfloat32_t __attribute__ ((noipa)) +make_f32 (float32_t x0, float32_t x1, float32_t x2, float32_t x3) +{ + return svdupq_f32 (x0, x1, x2, x3); +} + +svfloat64_t __attribute__ ((noipa)) +make_f64 (float64_t x0, float64_t x1) +{ + return svdupq_f64 (x0, x1); +} + +float16_t a[8] = { 1.0, -4.25, 9.75, 6.5, -2.125, 5.5, -3.75, 7.625 }; +float32_t b[4] = { 1.0, -90.25, -11.75, 141.5 }; +float64_t c[2] = { 9221.5, -4491.25 }; + +int +main () +{ + svbool_t pg = svptrue_pat_b8 (SV_VL16); + svfloat16_t f16 = make_f16 (a[0], a[1], a[2], a[3], + a[4], a[5], a[6], a[7]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, f16, svld1 (pg, a)))) + __builtin_abort (); + + svfloat32_t f32 = make_f32 (b[0], b[1], b[2], b[3]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, f32, svld1 (pg, b)))) + __builtin_abort (); + + svfloat64_t f64 = make_f64 (c[0], c[1]); + if (svptest_any (svptrue_b8 (), svcmpne (pg, f64, svld1 (pg, c)))) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_1.c new file mode 100644 index 000000000..32ccb08d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_1.c @@ -0,0 +1,87 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#include + +#ifndef TYPE +#define TYPE svint8_t +#define DUPQ svdupq_lane_s8 +#define INDEX svindex_s8 +#define COUNT 16 +#endif + +#define BASE 42 + +TYPE __attribute__ ((noipa)) +dupq_var (TYPE x, uint64_t y) +{ + return DUPQ (x, y); +} + +TYPE __attribute__ ((noipa)) +dupq_0 (TYPE x) +{ + return DUPQ (x, 0); +} + +TYPE __attribute__ ((noipa)) +dupq_1 (TYPE x) +{ + return DUPQ (x, 1); +} + +TYPE __attribute__ ((noipa)) +dupq_2 (TYPE x) +{ + return DUPQ (x, 2); +} + +TYPE __attribute__ ((noipa)) +dupq_3 (TYPE x) +{ + return DUPQ (x, 3); +} + +TYPE __attribute__ ((noipa)) +dupq_4 (TYPE x) +{ + return DUPQ (x, 4); +} + +void __attribute__ ((noipa)) +check (TYPE x, uint64_t y) +{ + svbool_t pg = svptrue_b8 (); + if (y * 2 >= svcntd ()) + { + if (svptest_any (pg, svcmpne (pg, x, 0))) + __builtin_abort (); + } + else + { + TYPE repeat = svand_x (pg, INDEX (0, 1), COUNT - 1); + TYPE expected = svadd_x (pg, repeat, BASE + y * COUNT); + if (svptest_any (pg, svcmpne (pg, x, expected))) + __builtin_abort (); + } +} + +int +main () +{ + TYPE x = INDEX (BASE, 1); + + check (dupq_0 (x), 0); + check (dupq_1 (x), 1); + check (dupq_2 (x), 2); + check (dupq_3 (x), 3); + check (dupq_4 (x), 4); + + for (int i = 0; i < 63; ++i) + { + check (dupq_var (x, i), i); + check (dupq_var (x, (uint64_t) 1 << i), (uint64_t) 1 << i); + } + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_2.c new file mode 100644 index 000000000..40de1c7dc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_2.c @@ -0,0 +1,9 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#define TYPE svuint8_t +#define DUPQ svdupq_lane_u8 +#define INDEX svindex_u8 +#define COUNT 16 + +#include "dupq_lane_1.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_3.c new file mode 100644 index 000000000..4ebe89545 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_3.c @@ -0,0 +1,9 @@ 
+/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#define TYPE svint16_t +#define DUPQ svdupq_lane_s16 +#define INDEX svindex_s16 +#define COUNT 8 + +#include "dupq_lane_1.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_4.c new file mode 100644 index 000000000..1be20c8e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_4.c @@ -0,0 +1,9 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#define TYPE svuint16_t +#define DUPQ svdupq_lane_u16 +#define INDEX svindex_u16 +#define COUNT 8 + +#include "dupq_lane_1.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_5.c new file mode 100644 index 000000000..67554d06a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_5.c @@ -0,0 +1,9 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#define TYPE svint32_t +#define DUPQ svdupq_lane_s32 +#define INDEX svindex_s32 +#define COUNT 4 + +#include "dupq_lane_1.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_6.c new file mode 100644 index 000000000..1914d2368 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_6.c @@ -0,0 +1,9 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#define TYPE svuint32_t +#define DUPQ svdupq_lane_u32 +#define INDEX svindex_u32 +#define COUNT 4 + +#include "dupq_lane_1.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_7.c new file mode 100644 index 000000000..d7a8e52f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_7.c @@ -0,0 +1,9 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#define TYPE svint64_t +#define DUPQ svdupq_lane_s64 +#define INDEX svindex_s64 +#define COUNT 2 + +#include "dupq_lane_1.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_8.c new file mode 100644 index 000000000..68655fefa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_lane_8.c @@ -0,0 +1,9 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2" } */ + +#define TYPE svuint64_t +#define DUPQ svdupq_lane_u64 +#define INDEX svindex_u64 +#define COUNT 2 + +#include "dupq_lane_1.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/eor_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/eor_1.c new file mode 100644 index 000000000..357b0bfb8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/eor_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr) +{ + svbool_t res = sveor_z (pg, x, y); + *any = svptest_any (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t x, svbool_t y, int *any) +{ + svbool_t res = sveor_z (pg, x, y); + return svptest_any (pg, res); +} + +/* { dg-final { scan-assembler-times {\teors\t} 2 } } */ +/* { dg-final { scan-assembler-not {\teor\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1_1.c
new file mode 100644
index 000000000..c68a9ed99
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ld1_1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_sve.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** nop1:
+** ret
+*/
+void nop1 (int8_t *s) { svld1 (svptrue_b8 (), s); }
+
+/*
+** nop2:
+** ret
+*/
+void nop2 (svbool_t pg, int16_t *s) { svld1 (pg, s); }
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_1.c
new file mode 100644
index 000000000..79f8bee1f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+/* Make sure that SETFFR comes first, however high the priority of the
+   LDFF1 is. */
+svint8_t
+foo (svbool_t pg, int8_t *ptr)
+{
+  svsetffr ();
+  svint8_t x = svldff1 (pg, ptr);
+  x = svadd_x (pg, x, x);
+  x = svmul_x (pg, x, x);
+  return x;
+}
+
+/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_2.c
new file mode 100644
index 000000000..7c3c8d8b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+/* Make sure that RDFFR comes after the LDFF1 and that the RDFFRs can
+   be CSEd. */
+svint8_t
+foo (svbool_t pg, int8_t *__restrict ptr,
+     svbool_t *__restrict *__restrict preds)
+{
+  svsetffr ();
+  svint8_t x = svldff1 (pg, ptr);
+  *preds[0] = svrdffr ();
+  *preds[1] = svrdffr ();
+  return x;
+}
+
+/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffr\t} } } */
+/* { dg-final { scan-assembler-times {\trdffr\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
new file mode 100644
index 000000000..41ad0bcea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_3.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+/* Make sure that LDFF1s can be reordered. The load of x should come due
+   to its longer dependence chain. */
+svint8_t
+foo (int8_t *ptr1, int8_t *ptr2)
+{
+  svsetffr ();
+  svbool_t pg = svptrue_b8 ();
+  svint8_t y = svldff1 (pg, ptr2);
+  svint8_t x = svldff1 (pg, ptr1);
+  x = svadd_x (pg, x, x);
+  x = svmul_x (pg, x, x);
+  x = svadd_x (pg, x, y);
+  return x;
+}
+
+/* { dg-final { scan-assembler {\tldff1b\tz[0-9]+\.b, p[0-7]/z, \[x0\]\n.*\tldff1b\tz[0-9]+\.b, p[0-7]/z, \[x1\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_4.c
new file mode 100644
index 000000000..c27302139
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_4.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+/* Make sure that we can use RDFFRS to test for a fault. */
+svint8_t
+foo (svbool_t pg, int8_t *ptr, int *fault)
+{
+  svsetffr ();
+  svint8_t x = svldff1 (pg, ptr);
+  *fault = svptest_any (pg, svrdffr_z (pg));
+  return x;
+}
+
+/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffrs\t} } } */
+/* { dg-final { scan-assembler-not {\trdffr\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_5.c
new file mode 100644
index 000000000..76e7ab8ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_5.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+/* Make sure that we can use RDFFRS to read the FFR while testing for a
+   fault. */
+svint8_t
+foo (svbool_t pg, int8_t *ptr, svbool_t *pred, int *fault)
+{
+  svsetffr ();
+  svint8_t x = svldff1 (pg, ptr);
+  svbool_t ffr = svrdffr_z (pg);
+  *fault = svptest_any (pg, ffr);
+  *pred = ffr;
+  return x;
+}
+
+/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffrs\t} } } */
+/* { dg-final { scan-assembler-not {\trdffr\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_6.c
new file mode 100644
index 000000000..7110e5f1a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_6.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+/* Make sure that we can use RDFFRS to test for a fault. */
+svint8_t
+foo (svbool_t pg, int8_t *ptr, int *fault)
+{
+  svsetffr ();
+  svint8_t x = svldff1 (pg, ptr);
+  *fault = svptest_any (svptrue_b8 (), svrdffr ());
+  return x;
+}
+
+/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffrs\t} } } */
+/* { dg-final { scan-assembler-not {\trdffr\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_7.c
new file mode 100644
index 000000000..355fe91f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_7.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+/* Make sure that we can use RDFFRS to read the FFR while testing for a
+   fault. */
+svint8_t
+foo (svbool_t pg, int8_t *ptr, svbool_t *pred, int *fault)
+{
+  svsetffr ();
+  svint8_t x = svldff1 (pg, ptr);
+  svbool_t ffr = svrdffr ();
+  *fault = svptest_any (svptrue_b8 (), ffr);
+  *pred = ffr;
+  return x;
+}
+
+/* { dg-final { scan-assembler {\tsetffr\n.*\tldff1b\t.*\trdffrs\t} } } */
+/* { dg-final { scan-assembler-not {\trdffr\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nand_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nand_1.c
new file mode 100644
index 000000000..0bc54c049
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nand_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+void
+test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr)
+{
+  svbool_t res = svnand_z (pg, x, y);
+  *any = svptest_any (pg, res);
+  *ptr = res;
+}
+
+int
+test2 (svbool_t pg, svbool_t x, svbool_t y, int *any)
+{
+  svbool_t res = svnand_z (pg, x, y);
+  return svptest_any (pg, res);
+}
+
+/* { dg-final { scan-assembler-times {\tnands\t} 2 } } */
+/* { dg-final { scan-assembler-not {\tnand\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nor_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nor_1.c
new file mode 100644
index 000000000..7973294d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nor_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+void
+test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr)
+{
+  svbool_t res = svnor_z (pg, x, y);
+  *any = svptest_any (pg, res);
+  *ptr = res;
+}
+
+int
+test2 (svbool_t pg, svbool_t x, svbool_t y, int *any)
+{
+  svbool_t res = svnor_z (pg, x, y);
+  return svptest_any (pg, res);
+}
+
+/* { dg-final { scan-assembler-times {\tnors\t} 2 } } */
+/* { dg-final { scan-assembler-not {\tnor\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_1.c
new file mode 100644
index 000000000..09dfacd22
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_1.c
@@ -0,0 +1,17 @@
+/* { dg-options "-march=armv8-a" } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+void
+f (svbool_t *x, svint8_t *y)
+{
+  *x = svptrue_b8 (); /* { dg-error {ACLE function '(svbool_t svptrue_b8\(\)|svptrue_b8)' requires ISA extension 'sve'} } */
+  /* { dg-message {note: you can enable 'sve' using the command-line option '-march', or by using the 'target' attribute or pragma} "" { target *-*-* } .-1 } */
+  *x = svptrue_b8 ();
+  *x = svptrue_b8 ();
+  *x = svptrue_b8 ();
+  *x = svptrue_b8 ();
+  *x = svptrue_b8 ();
+  *x = svptrue_b8 ();
+  *y = svadd_m (*x, *y, 1);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_2.c
new file mode 100644
index 000000000..594be1cf4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_2.c
@@ -0,0 +1,14 @@
+/* { dg-options "-march=armv8-a" } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+#pragma GCC target "+sve"
+
+void
+f (svbool_t *x, svint8_t *y)
+{
+  *x = svptrue_b8 ();
+  *y = svadd_m (*x, *y, 1);
+}
+
+/* { dg-final { scan-assembler {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_3.c
new file mode 100644
index 000000000..85f4eb3c0
--- /dev/null
+++
b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/nosve_3.c
@@ -0,0 +1,12 @@
+/* { dg-options "-march=armv8-a" } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+void __attribute__ ((target("+sve")))
+f (svbool_t *x, svint8_t *y)
+{
+  *x = svptrue_b8 ();
+  *y = svadd_m (*x, *y, 1);
+}
+
+/* { dg-final { scan-assembler {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orn_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orn_1.c
new file mode 100644
index 000000000..c3ed1eb61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orn_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+void
+test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr)
+{
+  svbool_t res = svorn_z (pg, x, y);
+  *any = svptest_any (pg, res);
+  *ptr = res;
+}
+
+int
+test2 (svbool_t pg, svbool_t x, svbool_t y, int *any)
+{
+  svbool_t res = svorn_z (pg, x, y);
+  return svptest_any (pg, res);
+}
+
+/* { dg-final { scan-assembler-times {\torns\t} 2 } } */
+/* { dg-final { scan-assembler-not {\torn\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orr_1.c
new file mode 100644
index 000000000..4456fa630
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/orr_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+void
+test1 (svbool_t pg, svbool_t x, svbool_t y, int *any, svbool_t *ptr)
+{
+  svbool_t res = svorr_z (pg, x, y);
+  *any = svptest_any (pg, res);
+  *ptr = res;
+}
+
+int
+test2 (svbool_t pg, svbool_t x, svbool_t y, int *any)
+{
+  svbool_t res = svorr_z (pg, x, y);
+  return svptest_any (pg, res);
+}
+
+/* { dg-final { scan-assembler-times {\torrs\t} 2 } } */
+/* { dg-final { scan-assembler-not {\torr\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pfirst_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pfirst_1.c
new file mode 100644
index 000000000..de1ff691a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pfirst_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+void
+test1 (svbool_t pg, int *last, svbool_t *ptr)
+{
+  svbool_t res = svpfirst (pg, svpfalse ());
+  *last = svptest_last (pg, res);
+  *ptr = res;
+}
+
+int
+test2 (svbool_t pg)
+{
+  svbool_t res = svpfirst (pg, svpfalse ());
+  return svptest_last (pg, res);
+}
+
+/* { dg-final { scan-assembler-times {\tpfirst\t} 2 } } */
+/* { dg-final { scan-assembler-not {\tptest\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_1.c
new file mode 100644
index 000000000..bf59cb963
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+void
+test1 (svbool_t pg, svbool_t prev, int *last, svbool_t *ptr)
+{
+  svbool_t res = svpnext_b8 (pg, prev);
+  *last = svptest_last (pg, res);
+  *ptr = res;
+}
+
+int
+test2 (svbool_t pg, svbool_t prev)
+{
+  svbool_t res = svpnext_b8 (pg, prev);
+  return svptest_last (pg, res);
+}
+
+/* { dg-final { scan-assembler-times {\tpnext\t} 2 } } */
+/* { dg-final { scan-assembler-not {\tptest\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_2.c
new file mode 100644
index 000000000..9926a2bee
---
/dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pnext_2.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (svbool_t pg, svbool_t prev, int *last, svbool_t *ptr) +{ + svbool_t res = svpnext_b16 (pg, prev); + *last = svptest_last (pg, res); + *ptr = res; +} + +int +test2 (svbool_t pg, svbool_t prev) +{ + svbool_t res = svpnext_b16 (pg, prev); + return svptest_last (pg, res); +} + +void +test3 (svbool_t pg, svbool_t prev, int *last, svbool_t *ptr) +{ + svbool_t res = svpnext_b32 (pg, prev); + *last = svptest_last (pg, res); + *ptr = res; +} + +int +test4 (svbool_t pg, svbool_t prev) +{ + svbool_t res = svpnext_b32 (pg, prev); + return svptest_last (pg, res); +} + +void +test5 (svbool_t pg, svbool_t prev, int *last, svbool_t *ptr) +{ + svbool_t res = svpnext_b64 (pg, prev); + *last = svptest_last (pg, res); + *ptr = res; +} + +int +test6 (svbool_t pg, svbool_t prev) +{ + svbool_t res = svpnext_b64 (pg, prev); + return svptest_last (pg, res); +} + +/* { dg-final { scan-assembler-times {\tpnext\t} 6 } } */ +/* { dg-final { scan-assembler-times {\tptest\t} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_1.c new file mode 100644 index 000000000..69bbb1ed0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (int *last, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b8 (SV_VL32); + *last = svptest_last (svptrue_b8 (), res); + *ptr = res; +} + +int +test2 () +{ + svbool_t res = svptrue_pat_b8 (SV_VL32); + return svptest_last (svptrue_b8 (), res); +} + +/* { dg-final { scan-assembler-times {\tptrues\tp[0-9]+\.b, vl32\n} 2 } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_2.c new file mode 100644 index 000000000..ede83405e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_2.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (int *last, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b16 (SV_VL16); + *last = svptest_last (svptrue_b16 (), res); + *ptr = res; +} + +int +test2 () +{ + svbool_t res = svptrue_pat_b16 (SV_VL16); + return svptest_last (svptrue_b16 (), res); +} + +/* { dg-final { scan-assembler-times {\tptrues\tp[0-9]+\.h, vl16\n} 2 } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_3.c new file mode 100644 index 000000000..d2eb3fc30 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_3.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (int *last, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b32 (SV_VL16); + *last = svptest_last (svptrue_b32 (), res); + *ptr = res; +} + +int +test2 () +{ + svbool_t res = svptrue_pat_b32 (SV_VL16); + return svptest_last (svptrue_b32 (), res); +} + +/* { dg-final { scan-assembler-times {\tptrues\tp[0-9]+\.s, vl16\n} 2 } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} { xfail *-*-* 
} } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_4.c new file mode 100644 index 000000000..59a21da9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_4.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b64 (SV_VL7); + *any = svptest_any (svptrue_b64 (), res); + *ptr = res; +} + +int +test2 () +{ + svbool_t res = svptrue_pat_b64 (SV_VL7); + return svptest_any (svptrue_b64 (), res); +} + +/* { dg-final { scan-assembler-times {\tptrues\tp[0-9]+\.d, vl7\n} 2 } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_5.c new file mode 100644 index 000000000..c8f6d8aca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ptrue_pat_5.c @@ -0,0 +1,188 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +b8_b16_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b8 (SV_VL64); + *any = svptest_any (svptrue_b16 (), res); + *ptr = res; +} + +int +b8_b16_2 () +{ + svbool_t res = svptrue_pat_b8 (SV_VL64); + return svptest_any (svptrue_b16 (), res); +} + +void +b8_b32_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b8 (SV_VL32); + *any = svptest_any (svptrue_b32 (), res); + *ptr = res; +} + +int +b8_b32_2 () +{ + svbool_t res = svptrue_pat_b8 (SV_VL32); + return svptest_any (svptrue_b32 (), res); +} + +void +b8_b64_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b8 (SV_VL128); + *any = svptest_any (svptrue_b64 (), res); + *ptr = res; +} + +int +b8_b64_2 () +{ + svbool_t res = svptrue_pat_b8 (SV_VL128); + return svptest_any (svptrue_b64 (), res); +} + +void +b16_b8_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b16 (SV_VL32); + *any = svptest_any (svptrue_b8 (), res); + *ptr = res; +} + +int +b16_b8_2 () +{ + svbool_t res = svptrue_pat_b16 (SV_VL32); + return svptest_any (svptrue_b8 (), res); +} + +void +b16_b32_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b16 (SV_VL16); + *any = svptest_any (svptrue_b32 (), res); + *ptr = res; +} + +int +b16_b32_2 () +{ + svbool_t res = svptrue_pat_b16 (SV_VL16); + return svptest_any (svptrue_b32 (), res); +} + +void +b16_b64_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b16 (SV_VL64); + *any = svptest_any (svptrue_b64 (), res); + *ptr = res; +} + +int +b16_b64_2 () +{ + svbool_t res = svptrue_pat_b16 (SV_VL64); + return svptest_any (svptrue_b64 (), res); +} + +void +b32_b8_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b32 (SV_VL16); + *any = svptest_any (svptrue_b8 (), res); + *ptr = res; +} + +int +b32_b8_2 () +{ + svbool_t res = svptrue_pat_b32 (SV_VL16); + return svptest_any (svptrue_b8 (), res); +} + +void +b32_b16_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b32 (SV_VL6); + *any = svptest_any (svptrue_b16 (), res); + *ptr = res; +} + +int +b32_b16_2 () +{ + svbool_t res = svptrue_pat_b32 (SV_VL6); + return svptest_any (svptrue_b16 (), res); +} + +void +b32_b64_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b32 (SV_VL32); + *any = svptest_any (svptrue_b64 (), res); + *ptr = res; +} + +int +b32_b64_2 () +{ + svbool_t 
res = svptrue_pat_b32 (SV_VL32); + return svptest_any (svptrue_b64 (), res); +} + +void +b64_b8_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b64 (SV_VL7); + *any = svptest_any (svptrue_b8 (), res); + *ptr = res; +} + +int +b64_b8_2 () +{ + svbool_t res = svptrue_pat_b64 (SV_VL7); + return svptest_any (svptrue_b8 (), res); +} + +void +b64_b16_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b64 (SV_VL16); + *any = svptest_any (svptrue_b16 (), res); + *ptr = res; +} + +int +b64_b16_2 () +{ + svbool_t res = svptrue_pat_b64 (SV_VL16); + return svptest_any (svptrue_b16 (), res); +} + +void +b64_b32_1 (int *any, svbool_t *ptr) +{ + svbool_t res = svptrue_pat_b64 (SV_VL32); + *any = svptest_any (svptrue_b32 (), res); + *ptr = res; +} + +int +b64_b32_2 () +{ + svbool_t res = svptrue_pat_b64 (SV_VL32); + return svptest_any (svptrue_b32 (), res); +} + +/* { dg-final { scan-assembler-not {\tptrues\n} } } */ +/* { dg-final { scan-assembler-times {\tptrue\t} 48 } } */ +/* { dg-final { scan-assembler-times {\tptest\t} 24 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/qincb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/qincb_1.c new file mode 100644 index 000000000..ba512f406 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/qincb_1.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +** qincb_s32_s: +** sqincb x0, w0, all, mul #15 +** ret +*/ +uint64_t qincb_s32_s (int32_t x) { return svqincb (x, 15); } + +/* +** qincb_s32_z: +** sqincb x([0-9]+), w0, all, mul #15 +** uxtw x0, w\1 +** ret +*/ +uint64_t qincb_s32_z (int32_t x) { return (uint32_t) svqincb (x, 15); } + +/* +** qincb_u32_s: +** uqincb (w[0-9]+), all, mul #15 +** sxtw x0, \1 +** ret +*/ +uint64_t qincb_u32_s (uint32_t x) { return (int32_t) svqincb (x, 15); } + +/* +** qincb_u32_z: +** uqincb w0, all, mul #15 +** ret +*/ +uint64_t qincb_u32_z (uint32_t x) { return svqincb (x, 15); } + +#ifdef __cplusplus +} +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/struct_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/struct_1.c new file mode 100644 index 000000000..50892c85a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/struct_1.c @@ -0,0 +1,16 @@ +#include + +void +f (svint8x2_t *a, svint8x2_t *b) +{ + svint8_t *ptr; + svint8x2_t x = *a; + *a = *b; + a = &x; + (void) (a == b); + (void) (a != b); + (void) (a < b); + (void) (a > b); + (void) (a <= b); + (void) (a >= b); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/temporaries_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/temporaries_1.c new file mode 100644 index 000000000..2543e1e62 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/temporaries_1.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { ! 
ilp32 } } } } */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* +** test_s8: +** ptrue (p[0-7])\.b, all +** ld1b (z[0-9]+\.b), \1/z, \[x0\] +** add \2, \2, #1 +** st1b \2, \1, \[x1\] +** ret +*/ +void +test_s8 (int8_t *x, int8_t *y) +{ + int8_t tmp1[32], tmp2[32]; + + svbool_t pg = svptrue_b8 (); + svst1 (pg, tmp1, svld1 (pg, x)); + svst1 (pg, tmp2, svadd_x (pg, svld1 (pg, tmp1), 1)); + svst1 (pg, y, svld1 (pg, tmp2)); +} + +/* +** test_s32_b8: +** ptrue (p[0-7])\.b, all +** ld1w (z[0-9]+\.s), \1/z, \[x0\] +** add \2, \2, #1 +** st1w \2, \1, \[x1\] +** ret +*/ +void +test_s32_b8 (int32_t *x, int32_t *y) +{ + int32_t tmp1[8], tmp2[8]; + + svbool_t pg = svptrue_b8 (); + svst1 (pg, tmp1, svld1 (pg, x)); + svst1 (pg, tmp2, svadd_x (pg, svld1 (pg, tmp1), 1)); + svst1 (pg, y, svld1 (pg, tmp2)); +} + +/* +** test_s32_b32: +** ptrue (p[0-7])\.b, all +** ld1w (z[0-9]+\.s), \1/z, \[x0\] +** add \2, \2, #1 +** st1w \2, \1, \[x1\] +** ret +*/ +void +test_s32_b32 (int32_t *x, int32_t *y) +{ + int32_t tmp1[8], tmp2[8]; + + svbool_t pg = svptrue_b32 (); + svst1 (pg, tmp1, svld1 (pg, x)); + svst1 (pg, tmp2, svadd_x (pg, svld1 (pg, tmp1), 1)); + svst1 (pg, y, svld1 (pg, tmp2)); +} + +#ifdef __cplusplus +} +#endif diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_1.c new file mode 100644 index 000000000..1d5523e31 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (int32_t x, int32_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svwhilele_b8 (x, y); + *any = svptest_last (svptrue_b8 (), res); + *ptr = res; +} + +int +test2 (int32_t x, int32_t y) +{ + svbool_t res = svwhilele_b8 (x, y); + return svptest_last (svptrue_b8 (), res); +} + +/* { dg-final { scan-assembler-times {\twhilele\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_10.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_10.c new file mode 100644 index 000000000..ca339c41c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_10.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +/* { dg-final { scan-assembler-not {\twhilele\t} } } */ +/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ + +void +test1 (svbool_t *ptr) +{ + *ptr = svwhilele_b32_u32 (-1, 0); +} + +void +test2 (svbool_t *ptr) +{ + *ptr = svwhilele_b16_u64 (0x80000000, 0); +} + +void +test3 (svbool_t *ptr) +{ + *ptr = svwhilele_b8_u64 (0x8000000000000001ULL, 0x7ffffffffffffffeULL); +} + +/* { dg-final { scan-assembler-times {\tpfalse\tp[0-7]\.b\n} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_2.c new file mode 100644 index 000000000..020846007 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_2.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (int32_t x, int32_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svwhilele_b16 (x, y); + *any = svptest_last (svptrue_b16 (), res); + *ptr = res; +} + +int +test2 (int32_t x, int32_t y) +{ + svbool_t res = svwhilele_b16 (x, y); + return svptest_last (svptrue_b16 (), res); +} + 
+/* { dg-final { scan-assembler-times {\twhilele\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_3.c new file mode 100644 index 000000000..4a1045cf6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_3.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (int32_t x, int32_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svwhilele_b32 (x, y); + *any = svptest_last (svptrue_b32 (), res); + *ptr = res; +} + +int +test2 (int32_t x, int32_t y) +{ + svbool_t res = svwhilele_b32 (x, y); + return svptest_last (svptrue_b32 (), res); +} + +/* { dg-final { scan-assembler-times {\twhilele\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_4.c new file mode 100644 index 000000000..f6fb0d099 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_4.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +test1 (int32_t x, int32_t y, int *any, svbool_t *ptr) +{ + svbool_t res = svwhilele_b64 (x, y); + *any = svptest_last (svptrue_b64 (), res); + *ptr = res; +} + +int +test2 (int32_t x, int32_t y) +{ + svbool_t res = svwhilele_b64 (x, y); + return svptest_last (svptrue_b64 (), res); +} + +/* { dg-final { scan-assembler-times {\twhilele\t} 2 } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_5.c new file mode 100644 index 000000000..ada958b29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_5.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +/* { dg-final { scan-assembler-not {\twhilele\t} } } */ +/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ + +void +test1 (svbool_t *ptr) +{ + *ptr = svwhilele_b32_s32 (-8, -8); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.[bhsd], vl1\n} } } */ + +void +test2 (svbool_t *ptr) +{ + *ptr = svwhilele_b16_s64 (-1, 1); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl3\n} } } */ + +void +test3 (svbool_t *ptr) +{ + *ptr = svwhilele_b16_s32 (0x7ffffffb, 0x7fffffff); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl5\n} } } */ + +void +test4 (svbool_t *ptr) +{ + *ptr = svwhilele_b8_s64 (svcntb (), svcntb () + 6); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, vl7\n} } } */ + +void +test5 (svbool_t *ptr) +{ + *ptr = svwhilele_b64_s64 (0, 1); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.d, vl2\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_6.c new file mode 100644 index 000000000..00d92ba8a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_6.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +/* { dg-final { scan-assembler-not {\twhilele\t} } } */ +/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ + +void 
+test1 (svbool_t *ptr) +{ + *ptr = svwhilele_b32_s32 (-8, -9); +} + +void +test2 (svbool_t *ptr) +{ + *ptr = svwhilele_b16_s64 (50, -1); +} + +void +test3 (svbool_t *ptr) +{ + *ptr = svwhilele_b16_s32 (0x7ffffffb, 0x80000000); +} + +void +test4 (svbool_t *ptr) +{ + *ptr = svwhilele_b8_s64 (svcntb (), 15); +} + +void +test5 (svbool_t *ptr) +{ + *ptr = svwhilele_b8_s64 (svcntb (), svcntw ()); +} + +/* { dg-final { scan-assembler-times {\tpfalse\tp[0-7]\.b\n} 5 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_7.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_7.c new file mode 100644 index 000000000..92488f597 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_7.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include <arm_sve.h> + +/* { dg-final { scan-assembler-not {\twhilel[et]\t} } } */ +/* { dg-final { scan-assembler-not {\tpfalse\t} } } */ + +void +test1 (svbool_t *ptr) +{ + *ptr = svwhilele_b8_s32 (-svcnth (), svcnth () - 1); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, all\n} } } */ + +void +test2 (svbool_t *ptr) +{ + *ptr = svwhilele_b16_s64 (1, svcntw () * 2); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, all\n} } } */ + +void +test3 (svbool_t *ptr) +{ + *ptr = svwhilele_b32_s32 (svcntd (), svcntw () + svcntd () - 1); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.s, all\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_9.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_9.c new file mode 100644 index 000000000..e7f81a86f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilele_9.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include <arm_sve.h> + +/* { dg-final { scan-assembler-not {\twhilele\t} } } */ +/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ + +void +test1 (svbool_t *ptr) +{ + *ptr = svwhilele_b32_u32 (1, 3); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.s, vl3\n} } } */ + +void +test2 (svbool_t *ptr) +{ + *ptr = svwhilele_b16_u64 (svcntd (), svcntd () + 5); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl6\n} } } */ + +void +test3 (svbool_t *ptr) +{ + *ptr = svwhilele_b8_u32 (0x7ffffffb, 0x80000002); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, vl8\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_1.c new file mode 100644 index 000000000..5c8f97e2f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include <arm_sve.h> + +/* { dg-final { scan-assembler-not {\twhilele\t} } } */ +/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ + +void +test1 (svbool_t *ptr) +{ + *ptr = svwhilelt_b32_s32 (-8, -7); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.[bhsd], vl1\n} } } */ + +void +test2 (svbool_t *ptr) +{ + *ptr = svwhilelt_b16_s64 (-1, 2); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl3\n} } } */ + +void +test3 (svbool_t *ptr) +{ + *ptr = svwhilelt_b16_s32 (0x7ffffffa, 0x7fffffff); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl5\n} } } */ + +void +test4 (svbool_t *ptr) +{ + *ptr = svwhilelt_b8_s64 (svcntb (), svcntb () + 7); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, vl7\n} } } */ + +void +test5 (svbool_t *ptr) +{ + *ptr = svwhilelt_b64_s64 (0, 2); +} + +/* { dg-final { scan-assembler
{\tptrue\tp[0-7]\.d, vl2\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_2.c new file mode 100644 index 000000000..2be3a5b0c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_2.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include <arm_sve.h> + +/* { dg-final { scan-assembler-not {\twhilele\t} } } */ +/* { dg-final { scan-assembler-not {\twhilelt\t} } } */ +/* { dg-final { scan-assembler-not {\tptrue\t} } } */ + +void +test1 (svbool_t *ptr) +{ + *ptr = svwhilelt_b32_s32 (0, 0); +} + +void +test2 (svbool_t *ptr) +{ + *ptr = svwhilelt_b16_s64 (50, -1); +} + +void +test3 (svbool_t *ptr) +{ + *ptr = svwhilelt_b16_s32 (0x7ffffffb, 0x80000000); +} + +void +test4 (svbool_t *ptr) +{ + *ptr = svwhilelt_b8_s64 (svcntb (), svcntb ()); +} + +void +test5 (svbool_t *ptr) +{ + *ptr = svwhilelt_b8_s64 (svcntb (), svcntw ()); +} + +/* { dg-final { scan-assembler-times {\tpfalse\tp[0-7]\.b\n} 5 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_3.c new file mode 100644 index 000000000..650b2652f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/whilelt_3.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include <arm_sve.h> + +/* { dg-final { scan-assembler-not {\twhilel[et]\t} } } */ +/* { dg-final { scan-assembler-not {\tpfalse\t} } } */ + +void +test1 (svbool_t *ptr) +{ + *ptr = svwhilelt_b8_s32 (-svcnth (), svcnth ()); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.b, all\n} } } */ + +void +test2 (svbool_t *ptr) +{ + *ptr = svwhilelt_b16_s64 (0, svcntw () * 2); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, all\n} } } */ + +void +test3 (svbool_t *ptr) +{ + *ptr = svwhilelt_b32_s32 (svcntd (), svcntw () + svcntd ()); +} + +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.s, all\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_1.c new file mode 100644 index 000000000..223351c2f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_1.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#ifndef FACTOR +#define FACTOR 2 +#endif + +#define LOOP(TYPE) \ + __attribute__ ((noipa)) \ + void \ + test_##TYPE (TYPE *restrict dst, TYPE *restrict src, \ + int count) \ + { \ + for (int i = 0; i < count; ++i) \ + dst[i] += src[i] * FACTOR; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (int16_t) \ + T (int32_t) \ + T (int64_t) \ + T (uint8_t) \ + T (uint16_t) \ + T (uint32_t) \ + T (uint64_t) + +TEST_ALL (LOOP) + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */ +/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */ + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */ +/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */ + +/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */ +/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 1\]} 2 } } */ + +/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d,
z[0-9]\.d, lsl 1\]} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c new file mode 100644 index 000000000..383a90c24 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c @@ -0,0 +1,31 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "adr_1.c" + +#define N 131 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (TYPE) i * i + i % 5; \ + b[i] = (TYPE) i * 3 + i % 7; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (a, b, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = ((TYPE) (i * i + i % 5) \ + + ((TYPE) i * 3 + i % 7) * FACTOR); \ + if (a[i] != expected) \ + __builtin_abort (); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST_ALL (TEST_LOOP) +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_2.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_2.c new file mode 100644 index 000000000..dc20ddbad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_2.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define FACTOR 4 +#include "adr_1.c" + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */ +/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */ + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */ +/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */ + +/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */ +/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 2\]} 2 } } */ + +/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, lsl 2\]} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c new file mode 100644 index 000000000..e823d3d0a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c @@ -0,0 +1,5 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define FACTOR 4 +#include "adr_1_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_3.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_3.c new file mode 100644 index 000000000..b0cb180dd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_3.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define FACTOR 8 +#include "adr_1.c" + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */ +/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */ + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */ +/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */ + +/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */ +/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 3\]} 2 } } */ + +/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-not 
{\tlsl\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, lsl 3\]} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c new file mode 100644 index 000000000..721dd68ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c @@ -0,0 +1,5 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define FACTOR 8 +#include "adr_1_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_4.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_4.c new file mode 100644 index 000000000..7c039ba13 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_4.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define FACTOR 16 +#include "adr_1.c" + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.[bhsd],} 8 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.[bhsd],} 8 } } */ +/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.[bhsd],} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c new file mode 100644 index 000000000..3fb9099e1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c @@ -0,0 +1,5 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define FACTOR 16 +#include "adr_1_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_5.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_5.c new file mode 100644 index 000000000..ce3991cb2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_5.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define LOOP(FACTOR) \ + __attribute__ ((noipa)) \ + void \ + test_##FACTOR (uint64_t *restrict dst, \ + uint64_t *restrict src, int count) \ + { \ + for (int i = 0; i < count; ++i) \ + dst[i] += (src[i] & 0xffffffff) * FACTOR; \ + } + +#define TEST_ALL(T) T (1) T (2) T (4) T (8) + +TEST_ALL (LOOP) + +/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-not {\tand\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-not {\tuxtw\tz[0-9]\.d,} } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw\]} 1 } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 1\]} 1 } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 2\]} 1 } } */ +/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 3\]} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c new file mode 100644 index 000000000..025c38d23 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "adr_5.c" + +#define N 131 + +#define TEST_LOOP(FACTOR) \ + { \ + uint64_t a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (uint64_t) i * i + i % 5; \ + b[i] = (uint64_t) (i * 3) << ((i & 7) * 8); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##FACTOR (a, b, N); \ + for (int i = 0; i < N; ++i) \ + { \ + uint64_t expected = ((uint64_t) (i * i + i % 5) \ + + (((uint64_t) (i * 3) << ((i & 7) * 8)) \ + & 0xffffffff) * FACTOR); \ + if (a[i] != expected) \ + 
__builtin_abort (); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST_ALL (TEST_LOOP) +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c new file mode 100644 index 000000000..615d8b885 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */ + +#include <stdint.h> + +#define SIGNED(S) int##S##_t + +#define DIV(x,y) ((x)/(y)) +#define MOD(x,y) ((x)%(y)) + +#define TEMPLATE(OP,SIZE) \ +void __attribute__ ((noinline, noclone)) \ +f_##OP##_##SIZE (SIGNED(SIZE) *restrict a, SIGNED(SIZE) *restrict b, \ + __INTPTR_TYPE__ n) \ +{ \ + for (__INTPTR_TYPE__ i = 0; i < n; ++i) \ + a[i] = OP (b[i], ((SIGNED(SIZE))1 << ((SIZE)/2+1))); \ +} +#define DIVMOD(SIZE) \ +TEMPLATE (DIV,SIZE); \ +TEMPLATE (MOD,SIZE); + +DIVMOD (8); +DIVMOD (16); +DIVMOD (32); +DIVMOD (64); + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 8 "vect" } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.b, p[0-9]+/m, z[0-9]+\.b, #5\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.h, p[0-9]+/m, z[0-9]+\.h, #9\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #9\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.s, p[0-9]+/m, z[0-9]+\.s, #17\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #17\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tasrd\tz[0-9]+\.d, p[0-9]+/m, z[0-9]+\.d, #33\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #33\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tasr\t%} } } */ +/* { dg-final { scan-assembler-not {\tlsr\t%} } } */ +/* { dg-final { scan-assembler-not {\tcmplt\t%} } } */ +/* { dg-final { scan-assembler-not {\tand\t%} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c new file mode 100644 index 000000000..d86a428a7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/clastb_8.c @@ -0,0 +1,25 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include <stdint.h> + +#define TEST_TYPE(TYPE) \ + void \ + test_##TYPE (TYPE *ptr, TYPE *a, TYPE *b, TYPE min_v) \ + { \ + TYPE last = *ptr; \ + for (int i = 0; i < 1024; i++) \ + if (a[i] < min_v) \ + last = b[i]; \ + *ptr = last; \ + } + +TEST_TYPE (uint8_t); +TEST_TYPE (uint16_t); +TEST_TYPE (uint32_t); +TEST_TYPE (uint64_t); + +/* { dg-final { scan-assembler {\tclastb\t(b[0-9]+), p[0-7], \1, z[0-9]+\.b\n} } } */ +/* { dg-final { scan-assembler {\tclastb\t(h[0-9]+), p[0-7], \1, z[0-9]+\.h\n} } } */ +/* { dg-final { scan-assembler {\tclastb\t(s[0-9]+), p[0-7], \1, z[0-9]+\.s\n} } } */ +/* { dg-final { scan-assembler {\tclastb\t(d[0-9]+), p[0-7], \1, z[0-9]+\.d\n} } } */ diff --git
a/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1.c new file mode 100644 index 000000000..bdc9856fa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1.c @@ -0,0 +1,22 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include <stdint.h> + +void __attribute__ ((noinline, noclone)) +clrsb_32 (unsigned int *restrict dst, uint32_t *restrict src, int size) +{ + for (int i = 0; i < size; ++i) + dst[i] = __builtin_clrsb (src[i]); +} + +void __attribute__ ((noinline, noclone)) +clrsb_64 (unsigned int *restrict dst, uint64_t *restrict src, int size) +{ + for (int i = 0; i < size; ++i) + dst[i] = __builtin_clrsbll (src[i]); +} + +/* { dg-final { scan-assembler-times {\tcls\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tcls\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1_run.c new file mode 100644 index 000000000..287630d7f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/clrsb_1_run.c @@ -0,0 +1,50 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "clrsb_1.c" + +extern void abort (void) __attribute__ ((noreturn)); + +unsigned int data[] = { + 0xffffff80, 24, + 0xffffffff, 31, + 0x00000000, 31, + 0x80000000, 0, + 0x7fffffff, 0, + 0x000003ff, 21, + 0x1fffffff, 2, + 0x0000ffff, 15, + 0xffff0000, 15 +}; + +int __attribute__ ((optimize (1))) +main (void) +{ + unsigned int count = sizeof (data) / sizeof (data[0]) / 2; + + uint32_t in32[count]; + unsigned int out32[count]; + for (unsigned int i = 0; i < count; ++i) + { + in32[i] = data[i * 2]; + asm volatile ("" ::: "memory"); + } + clrsb_32 (out32, in32, count); + for (unsigned int i = 0; i < count; ++i) + if (out32[i] != data[i * 2 + 1]) + abort (); + + uint64_t in64[count]; + unsigned int out64[count]; + for (unsigned int i = 0; i < count; ++i) + { + in64[i] = (uint64_t) data[i * 2] << 32; + asm volatile ("" ::: "memory"); + } + clrsb_64 (out64, in64, count); + for (unsigned int i = 0; i < count; ++i) + if (out64[i] != (data[i * 2] ?
data[i * 2 + 1] : 63)) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clz_1.c b/gcc/testsuite/gcc.target/aarch64/sve/clz_1.c new file mode 100644 index 000000000..0c7a4e6d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/clz_1.c @@ -0,0 +1,22 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include + +void __attribute__ ((noinline, noclone)) +clz_32 (unsigned int *restrict dst, uint32_t *restrict src, int size) +{ + for (int i = 0; i < size; ++i) + dst[i] = __builtin_clz (src[i]); +} + +void __attribute__ ((noinline, noclone)) +clz_64 (unsigned int *restrict dst, uint64_t *restrict src, int size) +{ + for (int i = 0; i < size; ++i) + dst[i] = __builtin_clzll (src[i]); +} + +/* { dg-final { scan-assembler-times {\tclz\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tclz\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tuzp1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/clz_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/clz_1_run.c new file mode 100644 index 000000000..12d9cf276 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/clz_1_run.c @@ -0,0 +1,50 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "clz_1.c" + +extern void abort (void) __attribute__ ((noreturn)); + +unsigned int data[] = { + 0xffffff80, 0, + 0xffffffff, 0, + 0x00000000, 32, + 0x80000000, 0, + 0x7fffffff, 1, + 0x000003ff, 22, + 0x1fffffff, 3, + 0x0000ffff, 16, + 0xffff0000, 0 +}; + +int __attribute__ ((optimize (1))) +main (void) +{ + unsigned int count = sizeof (data) / sizeof (data[0]) / 2; + + uint32_t in32[count]; + unsigned int out32[count]; + for (unsigned int i = 0; i < count; ++i) + { + in32[i] = data[i * 2]; + asm volatile ("" ::: "memory"); + } + clz_32 (out32, in32, count); + for (unsigned int i = 0; i < count; ++i) + if (out32[i] != data[i * 2 + 1]) + abort (); + + uint64_t in64[count]; + unsigned int out64[count]; + for (unsigned int i = 0; i < count; ++i) + { + in64[i] = (uint64_t) data[i * 2] << 10; + asm volatile ("" ::: "memory"); + } + clz_64 (out64, in64, count); + for (unsigned int i = 0; i < count; ++i) + if (out64[i] != (data[i * 2] ? 
data[i * 2 + 1] + 22 : 64)) + abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cnot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cnot_1.c new file mode 100644 index 000000000..5fa33461c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cnot_1.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noipa)) \ + test_##TYPE (TYPE *restrict r, TYPE *restrict a, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = !a[i]; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (int16_t) \ + T (int32_t) \ + T (int64_t) \ + T (uint8_t) \ + T (uint16_t) \ + T (uint32_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1.c new file mode 100644 index 000000000..c02e8ae8e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? abd (b[i], c[i]) : b[i]; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ + T (int64_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1_run.c new file mode 100644 index 000000000..a45beefc2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_1_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_abd_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? 
i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : b[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2.c new file mode 100644 index 000000000..97901b6f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? abd (b[i], c[i]) : c[i]; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ + T (int64_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2_run.c new file mode 100644 index 000000000..474bc0f9a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_2_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_abd_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : c[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3.c new file mode 100644 index 000000000..dc8bc3cee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? 
(A) : (B))) + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? abd (b[i], c[i]) : a[i]; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ + T (int64_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3_run.c new file mode 100644 index 000000000..9f1ac2df8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_3_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_abd_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : a[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4.c new file mode 100644 index 000000000..5c65e59ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? 
abd (b[i], c[i]) : 79; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ + T (int64_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-times {\tsel\t} 8 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4_run.c new file mode 100644 index 000000000..47fd9e09f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_4_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_abd_4.c" + +#define N 99 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : 79; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5.c new file mode 100644 index 000000000..f2c013158 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abd(A, B) (((A) < (B) ? (B) : (A)) - ((A) < (B) ? (A) : (B))) + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? 
abd (b[i], c[i]) : 0; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ + T (int64_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z, z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5_run.c new file mode 100644 index 000000000..7cd44be38 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_abd_5_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_abd_5.c" + +#define N 99 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? abd (b[i], c[i]) : 0; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1.c new file mode 100644 index 000000000..bd8776637 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noipa)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] == 0 ? !b[i] : b[i]; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ + T (int64_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* Currently we canonicalize the ?: so that !b[i] is the "false" value. 
*/ +/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1_run.c new file mode 100644 index 000000000..802bcbb2e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_1_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE r[N], a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i % 3) < (i % 5); \ + b[i] = i % 7 < 3; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] == 0 ? !b[i] : b[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c new file mode 100644 index 000000000..3df2431be --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noipa)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] == 0 ? !b[i] : a[i]; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ + T (int64_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* Currently we canonicalize the ?: so that !b[i] is the "false" value. */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2_run.c new file mode 100644 index 000000000..6db8bf14e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE r[N], a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i % 3) < (i % 5); \ + b[i] = i % 7 < 3; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] == 0 ? 
!b[i] : a[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3.c new file mode 100644 index 000000000..806e51788 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE) \ + void __attribute__ ((noipa)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] == 0 ? !b[i] : 127; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (uint8_t) \ + T (int16_t) \ + T (uint16_t) \ + T (int32_t) \ + T (uint32_t) \ + T (int64_t) \ + T (uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 8 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3_run.c new file mode 100644 index 000000000..6e025e489 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_3_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE) \ + { \ + TYPE r[N], a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i % 3) < (i % 5); \ + b[i] = i % 7 < 3; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] == 0 ? !b[i] : 127; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c new file mode 100644 index 000000000..86064ebfc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ + void __attribute__ ((noipa)) \ + test_##INT_TYPE (FLOAT_TYPE *__restrict r, \ + INT_TYPE *__restrict a, \ + FLOAT_TYPE *__restrict b, \ + INT_TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + FLOAT_TYPE bi = b[i]; \ + r[i] = pred[i] ? 
(FLOAT_TYPE) a[i] : bi; \ + } \ + } + +#define TEST_ALL(T) \ + T (_Float16, int16_t) \ + T (_Float16, uint16_t) \ + T (float, int32_t) \ + T (float, uint32_t) \ + T (double, int64_t) \ + T (double, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1_run.c new file mode 100644 index 000000000..1f712b485 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1_run.c @@ -0,0 +1,29 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ + +#include "cond_convert_1.c" + +#define N 99 + +#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ + { \ + FLOAT_TYPE r[N], b[N]; \ + INT_TYPE a[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + b[i] = (i % 9) * (i % 7 + 1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##INT_TYPE (r, a, b, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? (FLOAT_TYPE) a[i] : b[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2.c new file mode 100644 index 000000000..0e60b4381 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ + void __attribute__ ((noipa)) \ + test_##INT_TYPE (FLOAT_TYPE *__restrict r, \ + INT_TYPE *__restrict a, \ + INT_TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? 
(FLOAT_TYPE) a[i] : 1.0; \ + } + +#define TEST_ALL(T) \ + T (_Float16, int16_t) \ + T (_Float16, uint16_t) \ + T (float, int32_t) \ + T (float, uint32_t) \ + T (double, int64_t) \ + T (double, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 6 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2_run.c new file mode 100644 index 000000000..9a4834921 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_2_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ + +#include "cond_convert_2.c" + +#define N 99 + +#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ + { \ + FLOAT_TYPE r[N]; \ + INT_TYPE a[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##INT_TYPE (r, a, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? (FLOAT_TYPE) a[i] : 1.0)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3.c new file mode 100644 index 000000000..a294effd4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ + void __attribute__ ((noipa)) \ + test_##INT_TYPE (FLOAT_TYPE *__restrict r, \ + INT_TYPE *__restrict a, \ + INT_TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? (FLOAT_TYPE) a[i] : 0.0; \ + } + +#define TEST_ALL(T) \ + T (_Float16, int16_t) \ + T (_Float16, uint16_t) \ + T (float, int32_t) \ + T (float, uint32_t) \ + T (double, int64_t) \ + T (double, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* Really we should be able to use MOVPRFX /z here, but at the moment + we're relying on combine to merge a SEL and an arithmetic operation, + and the SEL doesn't allow the "false" value to be zero when the "true" + value is a register. 
*/ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 6 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3_run.c new file mode 100644 index 000000000..90021097c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_3_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ + +#include "cond_convert_3.c" + +#define N 99 + +#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ + { \ + FLOAT_TYPE r[N]; \ + INT_TYPE a[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##INT_TYPE (r, a, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? (FLOAT_TYPE) a[i] : 0.0)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c new file mode 100644 index 000000000..e3a947b26 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ + void __attribute__ ((noipa)) \ + test_##INT_TYPE (INT_TYPE *__restrict r, \ + FLOAT_TYPE *__restrict a, \ + INT_TYPE *__restrict b, \ + INT_TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + INT_TYPE bi = b[i]; \ + r[i] = pred[i] ? (INT_TYPE) a[i] : bi; \ + } \ + } + +#define TEST_ALL(T) \ + T (_Float16, int16_t) \ + T (_Float16, uint16_t) \ + T (float, int32_t) \ + T (float, uint32_t) \ + T (double, int64_t) \ + T (double, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4_run.c new file mode 100644 index 000000000..eaadcb7d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4_run.c @@ -0,0 +1,29 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ + +#include "cond_convert_4.c" + +#define N 99 + +#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ + { \ + INT_TYPE r[N], b[N], pred[N]; \ + FLOAT_TYPE a[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + b[i] = (i % 9) * (i % 7 + 1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##INT_TYPE (r, a, b, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? 
(INT_TYPE) a[i] : b[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5.c new file mode 100644 index 000000000..5f3da83e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ + void __attribute__ ((noipa)) \ + test_##INT_TYPE (INT_TYPE *__restrict r, \ + FLOAT_TYPE *__restrict a, \ + INT_TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? (INT_TYPE) a[i] : 72; \ + } + +#define TEST_ALL(T) \ + T (_Float16, int16_t) \ + T (_Float16, uint16_t) \ + T (float, int32_t) \ + T (float, uint32_t) \ + T (double, int64_t) \ + T (double, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 6 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5_run.c new file mode 100644 index 000000000..a1f2d4977 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_5_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ + +#include "cond_convert_5.c" + +#define N 99 + +#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ + { \ + INT_TYPE r[N], pred[N]; \ + FLOAT_TYPE a[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##INT_TYPE (r, a, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? (INT_TYPE) a[i] : 72)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6.c new file mode 100644 index 000000000..6541a2ea4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(FLOAT_TYPE, INT_TYPE) \ + void __attribute__ ((noipa)) \ + test_##INT_TYPE (INT_TYPE *__restrict r, \ + FLOAT_TYPE *__restrict a, \ + INT_TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? 
(INT_TYPE) a[i] : 0; \ + } + +#define TEST_ALL(T) \ + T (_Float16, int16_t) \ + T (_Float16, uint16_t) \ + T (float, int32_t) \ + T (float, uint32_t) \ + T (double, int64_t) \ + T (double, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* Really we should be able to use MOVPRFX /z here, but at the moment + we're relying on combine to merge a SEL and an arithmetic operation, + and the SEL doesn't allow the "false" value to be zero when the "true" + value is a register. */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 6 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6_run.c new file mode 100644 index 000000000..49a64b4fc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_6_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize -ftrapping-math" } */ + +#include "cond_convert_6.c" + +#define N 99 + +#define TEST_LOOP(FLOAT_TYPE, INT_TYPE) \ + { \ + INT_TYPE r[N], pred[N]; \ + FLOAT_TYPE a[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##INT_TYPE (r, a, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? (INT_TYPE) a[i] : 0)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1.c new file mode 100644 index 000000000..c1f54e391 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(TYPE, ABS) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? 
ABS (b[i] - c[i]) : b[i]; \ + } + +#define TEST_ALL(T) \ + T (_Float16, __builtin_fabsf16) \ + T (float, __builtin_fabsf) \ + T (double, __builtin_fabs) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1_run.c new file mode 100644 index 000000000..a4d6972b9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_1_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include "cond_fabd_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE, ABS) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : b[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2.c new file mode 100644 index 000000000..dd6eecc17 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(TYPE, ABS) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? ABS (b[i] - c[i]) : c[i]; \ + } + +#define TEST_ALL(T) \ + T (_Float16, __builtin_fabsf16) \ + T (float, __builtin_fabsf) \ + T (double, __builtin_fabs) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2_run.c new file mode 100644 index 000000000..28dc7d011 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_2_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include "cond_fabd_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE, ABS) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? 
ABS (b[i] - c[i]) : c[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3.c new file mode 100644 index 000000000..26fd7b265 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(TYPE, ABS) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? ABS (b[i] - c[i]) : a[i]; \ + } + +#define TEST_ALL(T) \ + T (_Float16, __builtin_fabsf16) \ + T (float, __builtin_fabsf) \ + T (double, __builtin_fabs) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3_run.c new file mode 100644 index 000000000..be21b7f99 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_3_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include "cond_fabd_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE, ABS) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : a[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4.c new file mode 100644 index 000000000..78f1fd914 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(TYPE, ABS) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? 
ABS (b[i] - c[i]) : 8.0; \ + } + +#define TEST_ALL(T) \ + T (_Float16, __builtin_fabsf16) \ + T (float, __builtin_fabsf) \ + T (double, __builtin_fabs) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-times {\tsel\t} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4_run.c new file mode 100644 index 000000000..86bdab415 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_4_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include "cond_fabd_4.c" + +#define N 99 + +#define TEST_LOOP(TYPE, ABS) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : 8; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5.c new file mode 100644 index 000000000..e66477b3b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include + +#define DEF_LOOP(TYPE, ABS) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? ABS (b[i] - c[i]) : 0.0; \ + } + +#define TEST_ALL(T) \ + T (_Float16, __builtin_fabsf16) \ + T (float, __builtin_fabsf) \ + T (double, __builtin_fabs) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* Really we should be able to use MOVPRFX /Z here, but at the moment + we're relying on combine to merge a SEL and an arithmetic operation, + and the SEL doesn't allow zero operands. 
*/ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 1 { xfail *-*-* } } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5_run.c new file mode 100644 index 000000000..9fb5fbb81 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fabd_5_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -fno-trapping-math" } */ + +#include "cond_fabd_5.c" + +#define N 99 + +#define TEST_LOOP(TYPE, ABS) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? ABS (b[i] - c[i]) : 0; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1.c new file mode 100644 index 000000000..d103e1f38 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
y[i] + (TYPE) CONST : y[i]; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, one, 1.0) \ + T (TYPE, PRED_TYPE, two, 2.0) \ + T (TYPE, PRED_TYPE, minus_half, -0.5) \ + T (TYPE, PRED_TYPE, minus_one, -1.0) \ + T (TYPE, PRED_TYPE, minus_two, -2.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #-2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #-2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #-2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1_run.c new file mode 100644 index 000000000..956ae1435 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_1_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fadd_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? 
y[i] + (TYPE) CONST : y[i]; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2.c new file mode 100644 index 000000000..b7d02f4ad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + TYPE *__restrict z, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = y[i] < 8 ? z[i] + (TYPE) CONST : y[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, half, 0.5) \ + T (TYPE, one, 1.0) \ + T (TYPE, two, 2.0) \ + T (TYPE, minus_half, -0.5) \ + T (TYPE, minus_one, -1.0) \ + T (TYPE, minus_two, -2.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #-2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #-2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 6 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2_run.c new file mode 100644 index 000000000..debf395cc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_2_run.c @@ -0,0 +1,31 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fadd_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N], z[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i % 13; \ + z[i] = i * i; \ + } \ + test_##TYPE##_##NAME (x, y, z, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = y[i] < 8 ? 
z[i] + (TYPE) CONST : y[i]; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3.c new file mode 100644 index 000000000..aec0e5aca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3.c @@ -0,0 +1,65 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? y[i] + (TYPE) CONST : 4; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, one, 1.0) \ + T (TYPE, PRED_TYPE, two, 2.0) \ + T (TYPE, PRED_TYPE, minus_half, -0.5) \ + T (TYPE, PRED_TYPE, minus_one, -1.0) \ + T (TYPE, PRED_TYPE, minus_two, -2.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #-2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #-2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #-2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ + +/* { dg-final { scan-assembler-not 
{\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3_run.c new file mode 100644 index 000000000..d5268c5ca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_3_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fadd_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? y[i] + (TYPE) CONST : 4; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4.c new file mode 100644 index 000000000..bb276c140 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? y[i] + (TYPE) CONST : 0; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, one, 1.0) \ + T (TYPE, PRED_TYPE, two, 2.0) \ + T (TYPE, PRED_TYPE, minus_half, -0.5) \ + T (TYPE, PRED_TYPE, minus_one, -1.0) \ + T (TYPE, PRED_TYPE, minus_two, -2.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #-2\.0} 1 } } */ +/* { dg-final { scan-assembler-times 
{\tfmov\tz[0-9]+\.s, #-2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #-2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 6 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4_run.c new file mode 100644 index 000000000..4ea8be661 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fadd_4_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fadd_4.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? y[i] + (TYPE) CONST : 0; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1.c new file mode 100644 index 000000000..d0db0900e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1.c @@ -0,0 +1,55 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include + +#ifndef FN +#define FN(X) __builtin_fmax##X +#endif + +#define DEF_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
FN (y[i], CONST) : y[i]; \ + } + +#define TEST_TYPE(T, FN, TYPE, PRED_TYPE) \ + T (FN, TYPE, PRED_TYPE, zero, 0) \ + T (FN, TYPE, PRED_TYPE, one, 1) \ + T (FN, TYPE, PRED_TYPE, two, 2) + +#define TEST_ALL(T) \ + TEST_TYPE (T, FN (f16), _Float16, int16_t) \ + TEST_TYPE (T, FN (f32), float, int32_t) \ + TEST_TYPE (T, FN (f64), double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1_run.c new file mode 100644 index 000000000..00a3c41f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_1_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "cond_fmaxnm_1.c" + +#define N 99 + +#define TEST_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? FN (y[i], CONST) : y[i]; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2.c new file mode 100644 index 000000000..0b535d15f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include + +#ifndef FN +#define FN(X) __builtin_fmax##X +#endif + +#define DEF_LOOP(FN, TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + TYPE *__restrict z, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = y[i] < 8 ? 
FN (z[i], CONST) : y[i]; \ + } + +#define TEST_TYPE(T, FN, TYPE) \ + T (FN, TYPE, zero, 0) \ + T (FN, TYPE, one, 1) \ + T (FN, TYPE, two, 2) + +#define TEST_ALL(T) \ + TEST_TYPE (T, FN (f32), float) \ + TEST_TYPE (T, FN (f64), double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2_run.c new file mode 100644 index 000000000..9eb4d80fc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_2_run.c @@ -0,0 +1,31 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "cond_fmaxnm_2.c" + +#define N 99 + +#define TEST_LOOP(FN, TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N], z[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i % 13; \ + z[i] = i * i; \ + } \ + test_##TYPE##_##NAME (x, y, z, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = y[i] < 8 ? FN (z[i], CONST) : y[i]; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3.c new file mode 100644 index 000000000..741f8f6d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include + +#ifndef FN +#define FN(X) __builtin_fmax##X +#endif + +#define DEF_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
FN (y[i], CONST) : 4; \ + } + +#define TEST_TYPE(T, FN, TYPE, PRED_TYPE) \ + T (FN, TYPE, PRED_TYPE, zero, 0) \ + T (FN, TYPE, PRED_TYPE, one, 1) \ + T (FN, TYPE, PRED_TYPE, two, 2) + +#define TEST_ALL(T) \ + TEST_TYPE (T, FN (f16), _Float16, int16_t) \ + TEST_TYPE (T, FN (f32), float, int32_t) \ + TEST_TYPE (T, FN (f64), double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3_run.c new file mode 100644 index 000000000..4aac75f0e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_3_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "cond_fmaxnm_3.c" + +#define N 99 + +#define TEST_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? FN (y[i], CONST) : 4; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4.c new file mode 100644 index 000000000..83a53c7d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include + +#ifndef FN +#define FN(X) __builtin_fmax##X +#endif + +#define DEF_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
FN (y[i], CONST) : 0; \ + } + +#define TEST_TYPE(T, FN, TYPE, PRED_TYPE) \ + T (FN, TYPE, PRED_TYPE, zero, 0) \ + T (FN, TYPE, PRED_TYPE, one, 1) \ + T (FN, TYPE, PRED_TYPE, two, 2) + +#define TEST_ALL(T) \ + TEST_TYPE (T, FN (f16), _Float16, int16_t) \ + TEST_TYPE (T, FN (f32), float, int32_t) \ + TEST_TYPE (T, FN (f64), double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4_run.c new file mode 100644 index 000000000..e1d904338 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmaxnm_4_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#include "cond_fmaxnm_4.c" + +#define N 99 + +#define TEST_LOOP(FN, TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? 
FN (y[i], CONST) : 0; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1.c new file mode 100644 index 000000000..d667b2088 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define FN(X) __builtin_fmin##X +#include "cond_fmaxnm_1.c" + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1_run.c new file mode 100644 index 000000000..5df2ff84b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_1_run.c @@ -0,0 +1,5 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define FN(X) __builtin_fmin##X +#include "cond_fmaxnm_1_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2.c new file mode 100644 index 000000000..d66a84b01 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define FN(X) __builtin_fmin##X +#include "cond_fmaxnm_2.c" + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { 
dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2_run.c new file mode 100644 index 000000000..79a98bb77 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_2_run.c @@ -0,0 +1,5 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define FN(X) __builtin_fmin##X +#include "cond_fmaxnm_2_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3.c new file mode 100644 index 000000000..d39dd1825 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define FN(X) __builtin_fmin##X +#include "cond_fmaxnm_3.c" + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3_run.c new file mode 100644 index 000000000..ca1a047da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_3_run.c @@ -0,0 +1,5 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define FN(X) __builtin_fmin##X +#include "cond_fmaxnm_3_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4.c new file mode 100644 index 000000000..fff6fdd37 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define FN(X) __builtin_fmin##X +#include "cond_fmaxnm_4.c" + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4_run.c new file mode 100644 index 000000000..b945d0470 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fminnm_4_run.c @@ -0,0 +1,5 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */ + +#define FN(X) __builtin_fmin##X +#include "cond_fmaxnm_4_run.c" diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1.c new file mode 100644 index 000000000..ce417ed85 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
y[i] * (TYPE) CONST : y[i]; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, two, 2.0) \ + T (TYPE, PRED_TYPE, four, 4.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #4\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #4\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #4\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1_run.c new file mode 100644 index 000000000..9ca5b5080 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_1_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fmul_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? y[i] * (TYPE) CONST : y[i]; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2.c new file mode 100644 index 000000000..cbf9d13a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + TYPE *__restrict z, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = y[i] < 8 ? 
z[i] * (TYPE) CONST : y[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, half, 0.5) \ + T (TYPE, two, 2.0) \ + T (TYPE, four, 4.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #4\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #4\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2_run.c new file mode 100644 index 000000000..44b283ba3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_2_run.c @@ -0,0 +1,31 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fmul_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N], z[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i % 13; \ + z[i] = i * i; \ + } \ + test_##TYPE##_##NAME (x, y, z, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = y[i] < 8 ? z[i] * (TYPE) CONST : y[i]; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3.c new file mode 100644 index 000000000..4da147e15 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3.c @@ -0,0 +1,50 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
y[i] * (TYPE) CONST : 8; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, two, 2.0) \ + T (TYPE, PRED_TYPE, four, 4.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #4\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #4\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #4\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3_run.c new file mode 100644 index 000000000..9b81d43c9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_3_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fmul_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? y[i] * (TYPE) CONST : 8; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4.c new file mode 100644 index 000000000..c4fdb2b2b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
y[i] * (TYPE) CONST : 0; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, two, 2.0) \ + T (TYPE, PRED_TYPE, four, 4.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #2\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #2\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #2\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #4\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #4\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #4\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4_run.c new file mode 100644 index 000000000..b93e031e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fmul_4_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fmul_4.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? y[i] * (TYPE) CONST : 0; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1.c new file mode 100644 index 000000000..8e7172af4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
(TYPE) CONST - y[i] : y[i]; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, one, 1.0) \ + T (TYPE, PRED_TYPE, two, 2.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1_run.c new file mode 100644 index 000000000..61ffac429 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_1_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fsubr_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? (TYPE) CONST - y[i] : y[i]; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2.c new file mode 100644 index 000000000..6d2efde94 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + TYPE *__restrict z, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = y[i] < 8 ? 
(TYPE) CONST - z[i] : y[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, half, 0.5) \ + T (TYPE, one, 1.0) \ + T (TYPE, two, 2.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2_run.c new file mode 100644 index 000000000..1b25392b0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_2_run.c @@ -0,0 +1,31 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fsubr_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N], z[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i % 13; \ + z[i] = i * i; \ + } \ + test_##TYPE##_##NAME (x, y, z, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = y[i] < 8 ? (TYPE) CONST - z[i] : y[i]; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3.c new file mode 100644 index 000000000..328af5741 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3.c @@ -0,0 +1,50 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
(TYPE) CONST - y[i] : 4; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, one, 1.0) \ + T (TYPE, PRED_TYPE, two, 2.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3_run.c new file mode 100644 index 000000000..8978287df --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_3_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fsubr_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? (TYPE) CONST - y[i] : 4; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4.c new file mode 100644 index 000000000..1d420b104 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, \ + PRED_TYPE *__restrict pred, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] != 1 ? 
(TYPE) CONST - y[i] : 0; \ + } + +#define TEST_TYPE(T, TYPE, PRED_TYPE) \ + T (TYPE, PRED_TYPE, half, 0.5) \ + T (TYPE, PRED_TYPE, one, 1.0) \ + T (TYPE, PRED_TYPE, two, 2.0) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, int16_t) \ + TEST_TYPE (T, float, int32_t) \ + TEST_TYPE (T, double, int64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.5\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.5\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 3 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4_run.c new file mode 100644 index 000000000..2cb3409af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_fsubr_4_run.c @@ -0,0 +1,32 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_fsubr_4.c" + +#define N 99 + +#define TEST_LOOP(TYPE, PRED_TYPE, NAME, CONST) \ + { \ + TYPE x[N], y[N]; \ + PRED_TYPE pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + y[i] = i * i; \ + pred[i] = i % 3; \ + } \ + test_##TYPE##_##NAME (x, y, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = i % 3 != 1 ? (TYPE) CONST - y[i] : 0; \ + if (x[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1.c new file mode 100644 index 000000000..a1e80b8a9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define bit_and(A, B) ((A) & (B)) +#define bit_or(A, B) ((A) | (B)) +#define bit_xor(A, B) ((A) ^ (B)) +#define bit_bic(A, B) ((A) & ~(B)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE##_##OP (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, \ + TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? 
OP (b[i], c[i]) : b[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, bit_and) \ + T (TYPE, bit_or) \ + T (TYPE, bit_xor) \ + T (TYPE, bit_bic) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1_run.c new file mode 100644 index 000000000..cb12e5609 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_1_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_logical_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? OP (b[i], c[i]) : b[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2.c new file mode 100644 index 000000000..c476fe2ff --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define bit_and(A, B) ((A) & (B)) +#define bit_or(A, B) ((A) | (B)) +#define bit_xor(A, B) ((A) ^ (B)) +#define bit_bic(A, B) ((A) & ~(B)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE##_##OP (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, \ + TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? 
OP (b[i], c[i]) : c[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, bit_and) \ + T (TYPE, bit_or) \ + T (TYPE, bit_xor) \ + T (TYPE, bit_bic) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* There's no BICR or equivalent, so the BIC functions need a select. */ +/* { dg-final { scan-assembler-times {\tsel\t} 8 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2_run.c new file mode 100644 index 000000000..9b9918cc8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_2_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_logical_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? 
OP (b[i], c[i]) : c[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3.c new file mode 100644 index 000000000..7ad2c4ea3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define bit_and(A, B) ((A) & (B)) +#define bit_or(A, B) ((A) | (B)) +#define bit_xor(A, B) ((A) ^ (B)) +#define bit_bic(A, B) ((A) & ~(B)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE##_##OP (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, \ + TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? OP (b[i], c[i]) : a[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, bit_and) \ + T (TYPE, bit_or) \ + T (TYPE, bit_xor) \ + T (TYPE, bit_bic) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 8 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 8 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 8 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 8 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3_run.c new file mode 100644 index 000000000..05dc78ab3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_3_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_logical_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE 
r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? OP (b[i], c[i]) : a[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4.c new file mode 100644 index 000000000..00217bffa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define bit_and(A, B) ((A) & (B)) +#define bit_or(A, B) ((A) | (B)) +#define bit_xor(A, B) ((A) ^ (B)) +#define bit_bic(A, B) ((A) & ~(B)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE##_##OP (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, \ + TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? OP (b[i], c[i]) : 42; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, bit_and) \ + T (TYPE, bit_or) \ + T (TYPE, bit_xor) \ + T (TYPE, bit_bic) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-times {\tsel\t} 32 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4_run.c new file mode 100644 index 000000000..46fb11594 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_4_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_logical_4.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE r[N], 
a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? OP (b[i], c[i]) : 42; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5.c new file mode 100644 index 000000000..36b541f21 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define bit_and(A, B) ((A) & (B)) +#define bit_or(A, B) ((A) | (B)) +#define bit_xor(A, B) ((A) ^ (B)) +#define bit_bic(A, B) ((A) & ~(B)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noinline, noclone)) \ + test_##TYPE##_##OP (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, \ + TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] < 20 ? OP (b[i], c[i]) : 0; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, bit_and) \ + T (TYPE, bit_or) \ + T (TYPE, bit_xor) \ + T (TYPE, bit_bic) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tand\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\teor\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tbic\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z, z[0-9]+\.b\n} 8 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 8 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 8 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 8 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5_run.c new file mode 100644 index 
000000000..e0da5fe58 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_logical_5_run.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_logical_5.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ((i + 2) % 3) * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = a[i] < 20 ? OP (b[i], c[i]) : 0; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c new file mode 100644 index 000000000..cb01d50f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, TYPE c, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] != 1 ? a[i] OP b[i] * c : b[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, add, +) \ + T (TYPE, sub, -) + +#define TEST_ALL(T) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c new file mode 100644 index 000000000..bcfc62280 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_1_run.c @@ -0,0 +1,35 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_mla_1.c" + +#define FACTOR 17 +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + 
a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + pred[i] = i % 3 < i % 5; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected \ + = pred[i] != 1 ? a[i] OP b[i] * (TYPE) FACTOR : b[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c new file mode 100644 index 000000000..b6ea1a3e2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, TYPE c, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] != 1 ? a[i] OP b[i] * c : c; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, add, +) \ + T (TYPE, sub, -) + +#define TEST_ALL(T) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmad\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmsb\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c new file mode 100644 index 000000000..79998b84e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_2_run.c @@ -0,0 +1,36 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_mla_2.c" + +#define FACTOR 17 +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + pred[i] = i % 3 < i % 5; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = (pred[i] != 1 \ + ? 
a[i] OP b[i] * (TYPE) FACTOR \ + : (TYPE) FACTOR); \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c new file mode 100644 index 000000000..085fccf53 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, TYPE c, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, add, +) \ + T (TYPE, sub, -) + +#define TEST_ALL(T) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c new file mode 100644 index 000000000..cbd1185b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_3_run.c @@ -0,0 +1,35 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_mla_3.c" + +#define FACTOR 17 +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + pred[i] = i % 3 < i % 5; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected \ + = pred[i] != 1 ? 
a[i] OP b[i] * (TYPE) FACTOR : a[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c new file mode 100644 index 000000000..ed9f73e9c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, TYPE c, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] == 1 ? a[i] OP b[i] * c : pred[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, add, +) \ + T (TYPE, sub, -) + +#define TEST_ALL(T) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m,} 4 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c new file mode 100644 index 000000000..5e078594a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_4_run.c @@ -0,0 +1,36 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_mla_4.c" + +#define FACTOR 17 +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + pred[i] = i % 3; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected = (pred[i] == 1 \ + ? 
a[i] OP b[i] * (TYPE) FACTOR \ + : pred[i]); \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c new file mode 100644 index 000000000..97e233579 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, TYPE c, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? a[i] OP b[i] * c : 0; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, add, +) \ + T (TYPE, sub, -) + +#define TEST_ALL(T) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:mla|mad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:mls|msb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\t(?:fmla|fmad)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:fmla|fmad)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:fmla|fmad)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\t(?:fmls|fmsb)\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:fmls|fmsb)\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\t(?:fmls|fmsb)\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z,} 2 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z,} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z,} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z,} 4 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c new file mode 100644 index 000000000..9de46e30f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_5_run.c @@ -0,0 +1,35 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_mla_5.c" + +#define FACTOR 17 +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? 
i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + pred[i] = i % 3 < i % 5; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected \ + = pred[i] ? a[i] OP b[i] * (TYPE) FACTOR : 0; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c new file mode 100644 index 000000000..832bdb3d8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, TYPE c, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? a[i] OP b[i] * c : 5; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, add, +) \ + T (TYPE, sub, -) + +#define TEST_ALL(T) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c new file mode 100644 index 000000000..59f57a2db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_6_run.c @@ -0,0 +1,35 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_mla_6.c" + +#define FACTOR 17 +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + pred[i] = i % 3 < i % 5; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, FACTOR, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected \ + = pred[i] ? 
a[i] OP b[i] * (TYPE) FACTOR : 5; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c new file mode 100644 index 000000000..5561f4219 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \ + } + +#define TEST_COUNT(T, TYPE, CONST) \ + T (TYPE, add, +, CONST) \ + T (TYPE, sub, -, CONST) + +#define TEST_TYPE(T, TYPE, CONST) \ + TEST_COUNT (T, TYPE, 2) \ + TEST_COUNT (T, TYPE, 4) \ + TEST_COUNT (T, TYPE, CONST) + +#define TEST_ALL(T) \ + TEST_TYPE (T, uint8_t, 0x80) \ + TEST_TYPE (T, uint16_t, 0x8000) \ + TEST_TYPE (T, uint32_t, 0x80000000) \ + TEST_TYPE (T, uint64_t, 0x8000000000000000ULL) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c new file mode 100644 index 000000000..b094f40a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_7_run.c @@ -0,0 +1,34 
@@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_mla_7.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP, CONST) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + pred[i] = i % 3 < i % 5; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected \ + = pred[i] != 1 ? a[i] OP b[i] * CONST : a[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c new file mode 100644 index 000000000..d5549272e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME##_##CONST (TYPE *__restrict r, \ + TYPE *__restrict a, \ + TYPE *__restrict b, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \ + } + +#define TEST_COUNT(T, TYPE, CONST) \ + T (TYPE, add, +, CONST) \ + T (TYPE, sub, -, CONST) + +#define TEST_TYPE(T, TYPE, CONST) \ + TEST_COUNT (T, TYPE, 2) \ + TEST_COUNT (T, TYPE, 4) \ + TEST_COUNT (T, TYPE, CONST) + +#define TEST_ALL(T) \ + TEST_TYPE (T, uint8_t, 0x80) \ + TEST_TYPE (T, uint16_t, 0x8000) \ + TEST_TYPE (T, uint32_t, 0x80000000) \ + TEST_TYPE (T, uint64_t, 0x8000000000000000ULL) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #2\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, z[0-9]+\.b, #7\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #2\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, z[0-9]+\.h, #15\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #2\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, z[0-9]+\.s, #31\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #1\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #2\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, z[0-9]+\.d, #63\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ + +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { 
dg-final { scan-assembler-times {\tsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c new file mode 100644 index 000000000..7fb58aa70 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_mla_8_run.c @@ -0,0 +1,34 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_mla_8.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP, CONST) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + pred[i] = i % 3 < i % 5; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME##_##CONST (r, a, b, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + TYPE expected \ + = pred[i] != 1 ? a[i] OP b[i] * -CONST : a[i]; \ + if (r[i] != expected) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1.c new file mode 100644 index 000000000..f2c51b291 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ?
b[i] OP 3 : b[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1_run.c new file mode 100644 index 000000000..acc403ec8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_1_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP 3 : b[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2.c new file mode 100644 index 000000000..c9082c9c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ? 
b[i] OP 3 : a[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2_run.c new file mode 100644 index 000000000..4917d3af6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_2_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP 3 : a[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3.c new file mode 100644 index 000000000..55e0de8aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ? 
b[i] OP 3 : 72; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-times {\tsel\t} 16 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3_run.c new file mode 100644 index 000000000..194c75b8d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_3_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP 3 : 72)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4.c new file mode 100644 index 000000000..32dd68199 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4.c @@ -0,0 +1,52 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ? 
b[i] OP 3 : 0; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int8_t) \ + TEST_TYPE (T, uint8_t) \ + TEST_TYPE (T, int16_t) \ + TEST_TYPE (T, uint16_t) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.b, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.b, p[0-7]/z, z[0-9]+\.b\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/z, z[0-9]+\.h\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4_run.c new file mode 100644 index 000000000..ee263000d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_4_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_4.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP 3 : 0)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5.c new file mode 100644 index 000000000..1d4491531 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ? 
b[i] OP c[i] : b[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5_run.c new file mode 100644 index 000000000..35bf1b871 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_5_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_5.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ~i & 7; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : b[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6.c new file mode 100644 index 000000000..35cb67677 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ? b[i] OP c[i] : c[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlslr\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasrr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsrr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6_run.c new file mode 100644 index 000000000..e601c6156 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_6_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_6.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? 
i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ~i & 7; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : c[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7.c new file mode 100644 index 000000000..80154b25e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ? b[i] OP c[i] : a[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7_run.c new file mode 100644 index 000000000..d23b0093d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_7_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_7.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ~i & 7; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : a[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8.c new file mode 100644 index 000000000..b478c0c4f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ?
b[i] OP c[i] : 91; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-times {\tsel\t} 8 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8_run.c new file mode 100644 index 000000000..72e5a7b59 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_8_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_8.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ~i & 7; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : 91)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9.c new file mode 100644 index 000000000..184e93ab8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define DEF_LOOP(TYPE, NAME, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, TYPE *__restrict c, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = a[i] > 20 ? 
b[i] OP c[i] : 0; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, shl, <<) \ + T (TYPE, shr, >>) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tlslr?\tz[0-9]+\.s, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tlslr?\tz[0-9]+\.d, p[0-7]/m,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tasrr?\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tasrr?\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tlsrr?\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tlsrr?\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z, z[0-9]+\.s\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z, z[0-9]+\.d\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9_run.c new file mode 100644 index 000000000..6e41ac4da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_shift_9_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_shift_9.c" + +#define N 99 + +#define TEST_LOOP(TYPE, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N], c[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + c[i] = ~i & 7; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, c, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE) (a[i] > 20 ? b[i] OP c[i] : 0)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1.c new file mode 100644 index 000000000..2b5f9c345 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1.c @@ -0,0 +1,59 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abs(A) ((A) < 0 ? -(A) : (A)) +#define neg(A) (-(A)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##OP (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? 
OP (a[i]) : a[i]; \ + } + +#define TEST_INT_TYPE(T, TYPE) \ + T (TYPE, abs) \ + T (TYPE, neg) + +#define TEST_FLOAT_TYPE(T, TYPE, SUFFIX) \ + T (TYPE, __builtin_fabs##SUFFIX) \ + T (TYPE, neg) + +#define TEST_ALL(T) \ + TEST_INT_TYPE (T, int8_t) \ + TEST_INT_TYPE (T, int16_t) \ + TEST_INT_TYPE (T, int32_t) \ + TEST_INT_TYPE (T, int64_t) \ + TEST_FLOAT_TYPE (T, _Float16, f16) \ + TEST_FLOAT_TYPE (T, float, f) \ + TEST_FLOAT_TYPE (T, double, ) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* XFAILed because the ?: gets canonicalized so that the operation is in + the false arm. */ +/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1_run.c new file mode 100644 index 000000000..a6c1a49dd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_1_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_unary_1.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE r[N], a[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? OP (a[i]) : a[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c new file mode 100644 index 000000000..97d1b8f5d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abs(A) ((A) < 0 ? -(A) : (A)) +#define neg(A) (-(A)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##OP (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict b, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + TYPE bi = b[i]; \ + r[i] = pred[i] ? 
OP (a[i]) : bi; \ + } \ + } + +#define TEST_INT_TYPE(T, TYPE) \ + T (TYPE, abs) \ + T (TYPE, neg) + +#define TEST_FLOAT_TYPE(T, TYPE, SUFFIX) \ + T (TYPE, __builtin_fabs##SUFFIX) \ + T (TYPE, neg) + +#define TEST_ALL(T) \ + TEST_INT_TYPE (T, int8_t) \ + TEST_INT_TYPE (T, int16_t) \ + TEST_INT_TYPE (T, int32_t) \ + TEST_INT_TYPE (T, int64_t) \ + TEST_FLOAT_TYPE (T, _Float16, f16) \ + TEST_FLOAT_TYPE (T, float, f) \ + TEST_FLOAT_TYPE (T, double, ) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2_run.c new file mode 100644 index 000000000..1a385c323 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2_run.c @@ -0,0 +1,28 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_unary_2.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE r[N], a[N], b[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + b[i] = (i % 9) * (i % 7 + 1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, b, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? OP (a[i]) : b[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3.c new file mode 100644 index 000000000..dde0fdd92 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abs(A) ((A) < 0 ? -(A) : (A)) +#define neg(A) (-(A)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##OP (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? 
OP (a[i]) : 5; \ + } + +#define TEST_INT_TYPE(T, TYPE) \ + T (TYPE, abs) \ + T (TYPE, neg) + +#define TEST_FLOAT_TYPE(T, TYPE, SUFFIX) \ + T (TYPE, __builtin_fabs##SUFFIX) \ + T (TYPE, neg) + +#define TEST_ALL(T) \ + TEST_INT_TYPE (T, int8_t) \ + TEST_INT_TYPE (T, int16_t) \ + TEST_INT_TYPE (T, int32_t) \ + TEST_INT_TYPE (T, int64_t) \ + TEST_FLOAT_TYPE (T, _Float16, f16) \ + TEST_FLOAT_TYPE (T, float, f) \ + TEST_FLOAT_TYPE (T, double, ) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3_run.c new file mode 100644 index 000000000..3c72b239a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_3_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_unary_3.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE r[N], a[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? OP (a[i]) : 5)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4.c new file mode 100644 index 000000000..4604365fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define abs(A) ((A) < 0 ? -(A) : (A)) +#define neg(A) (-(A)) + +#define DEF_LOOP(TYPE, OP) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##OP (TYPE *__restrict r, TYPE *__restrict a, \ + TYPE *__restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = pred[i] ? 
OP (a[i]) : 0; \ + } + +#define TEST_INT_TYPE(T, TYPE) \ + T (TYPE, abs) \ + T (TYPE, neg) + +#define TEST_FLOAT_TYPE(T, TYPE, SUFFIX) \ + T (TYPE, __builtin_fabs##SUFFIX) \ + T (TYPE, neg) + +#define TEST_ALL(T) \ + TEST_INT_TYPE (T, int8_t) \ + TEST_INT_TYPE (T, int16_t) \ + TEST_INT_TYPE (T, int32_t) \ + TEST_INT_TYPE (T, int64_t) \ + TEST_FLOAT_TYPE (T, _Float16, f16) \ + TEST_FLOAT_TYPE (T, float, f) \ + TEST_FLOAT_TYPE (T, double, ) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.b, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabs\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.h, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ +/* { dg-final { scan-assembler-times {\tfneg\tz[0-9]+\.d, p[0-7]/m,} 1 } } */ + +/* Really we should be able to use MOVPRFX /z here, but at the moment + we're relying on combine to merge a SEL and an arithmetic operation, + and the SEL doesn't allow the "false" value to be zero when the "true" + value is a register. */ +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 14 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4_run.c new file mode 100644 index 000000000..48d254150 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_4_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_unary_4.c" + +#define N 99 + +#define TEST_LOOP(TYPE, OP) \ + { \ + TYPE r[N], a[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i) * (i % 3 == 0 ? 1 : -1); \ + pred[i] = (i % 7 < 4); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##OP (r, a, pred, N); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (pred[i] ? OP (a[i]) : 0)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1.c new file mode 100644 index 000000000..05641199e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define NUM_ELEMS(TYPE) (320 / sizeof (TYPE)) + +#define DEF_LOOP(TYPE, CONST) \ + void __attribute__ ((noipa)) \ + test_##CONST##_##TYPE (TYPE *restrict r, TYPE *restrict a, \ + TYPE *restrict b) \ + { \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + r[i] = a[i] > 20 ? 
b[i] & CONST : b[i]; \ + } + +#define TEST_ALL(T) \ + T (uint16_t, 0xff) \ + \ + T (uint32_t, 0xff) \ + T (uint32_t, 0xffff) \ + \ + T (uint64_t, 0xff) \ + T (uint64_t, 0xffff) \ + T (uint64_t, 0xffffffff) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \1\n} } } */ + +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \1\n} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x2,[^L]*\tuxth\t\1, p[0-7]/m, \1\n} } } */ + +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \1\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxth\t\1, p[0-7]/m, \1\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxtw\t\1, p[0-7]/m, \1\n} } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1_run.c new file mode 100644 index 000000000..685f39478 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_1_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_uxt_1.c" + +#define TEST_LOOP(TYPE, CONST) \ + { \ + TYPE r[NUM_ELEMS (TYPE)]; \ + TYPE a[NUM_ELEMS (TYPE)]; \ + TYPE b[NUM_ELEMS (TYPE)]; \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##CONST##_##TYPE (r, a, b); \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + if (r[i] != (a[i] > 20 ? b[i] & CONST : b[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2.c new file mode 100644 index 000000000..c900498a0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define NUM_ELEMS(TYPE) (320 / sizeof (TYPE)) + +#define DEF_LOOP(TYPE, CONST) \ + void __attribute__ ((noipa)) \ + test_##CONST##_##TYPE (TYPE *restrict r, TYPE *restrict a, \ + TYPE *restrict b) \ + { \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + r[i] = a[i] > 20 ? 
b[i] & CONST : a[i]; \ + } + +#define TEST_ALL(T) \ + T (uint16_t, 0xff) \ + \ + T (uint32_t, 0xff) \ + T (uint32_t, 0xffff) \ + \ + T (uint64_t, 0xff) \ + T (uint64_t, 0xffff) \ + T (uint64_t, 0xffffffff) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x1,[^L]*\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \2\n} } } */ + +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x1,[^L]*\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \2\n} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x1,[^L]*\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x2,[^L]*\tuxth\t\1, p[0-7]/m, \2\n} } } */ + +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x1,[^L]*\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxtb\t\1, p[0-7]/m, \2\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x1,[^L]*\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxth\t\1, p[0-7]/m, \2\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x1,[^L]*\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x2,[^L]*\tuxtw\t\1, p[0-7]/m, \2\n} } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2_run.c new file mode 100644 index 000000000..75679cdf9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_2_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_uxt_2.c" + +#define TEST_LOOP(TYPE, CONST) \ + { \ + TYPE r[NUM_ELEMS (TYPE)]; \ + TYPE a[NUM_ELEMS (TYPE)]; \ + TYPE b[NUM_ELEMS (TYPE)]; \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##CONST##_##TYPE (r, a, b); \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + if (r[i] != (a[i] > 20 ? b[i] & CONST : a[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3.c new file mode 100644 index 000000000..cf1fd0029 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define NUM_ELEMS(TYPE) (320 / sizeof (TYPE)) + +#define DEF_LOOP(TYPE, CONST) \ + void __attribute__ ((noipa)) \ + test_##CONST##_##TYPE (TYPE *restrict r, TYPE *restrict a, \ + TYPE *restrict b) \ + { \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + r[i] = a[i] > 20 ? 
b[i] & CONST : 127; \ + } + +#define TEST_ALL(T) \ + T (uint16_t, 0xff) \ + \ + T (uint32_t, 0xff) \ + T (uint32_t, 0xffff) \ + \ + T (uint64_t, 0xff) \ + T (uint64_t, 0xffff) \ + T (uint64_t, 0xffffffff) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxtb\t\1\.h, p[0-7]/m, z[0-9]+\.h\n} } } */ + +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxtb\t\1\.s, p[0-7]/m, z[0-9]+\.s\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxth\t\1\.s, p[0-7]/m, z[0-9]+\.s\n} } } */ + +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxtb\t\1\.d, p[0-7]/m, z[0-9]+\.d\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxth\t\1\.d, p[0-7]/m, z[0-9]+\.d\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+), z[0-9]+\n\tuxtw\t\1\.d, p[0-7]/m, z[0-9]+\.d\n} } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3_run.c new file mode 100644 index 000000000..3d33d3a39 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_3_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_uxt_3.c" + +#define TEST_LOOP(TYPE, CONST) \ + { \ + TYPE r[NUM_ELEMS (TYPE)]; \ + TYPE a[NUM_ELEMS (TYPE)]; \ + TYPE b[NUM_ELEMS (TYPE)]; \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##CONST##_##TYPE (r, a, b); \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + if (r[i] != (a[i] > 20 ? b[i] & CONST : 127)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4.c new file mode 100644 index 000000000..25c664780 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define NUM_ELEMS(TYPE) (320 / sizeof (TYPE)) + +#define DEF_LOOP(TYPE, CONST) \ + void __attribute__ ((noipa)) \ + test_##CONST##_##TYPE (TYPE *restrict r, TYPE *restrict a, \ + TYPE *restrict b) \ + { \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + r[i] = a[i] > 20 ? 
b[i] & CONST : 0; \ + } + +#define TEST_ALL(T) \ + T (uint16_t, 0xff) \ + \ + T (uint32_t, 0xff) \ + T (uint32_t, 0xffff) \ + \ + T (uint64_t, 0xff) \ + T (uint64_t, 0xffff) \ + T (uint64_t, 0xffffffff) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.h), (p[0-7])/z, z[0-9]+\.h\n\tuxtb\t\1, \2/m, z[0-9]+\.h\n} } } */ + +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, z[0-9]+\.s\n\tuxtb\t\1, \2/m, z[0-9]+\.s\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, z[0-9]+\.s\n\tuxth\t\1, \2/m, z[0-9]+\.s\n} } } */ + +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, z[0-9]+\.d\n\tuxtb\t\1, \2/m, z[0-9]+\.d\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, z[0-9]+\.d\n\tuxth\t\1, \2/m, z[0-9]+\.d\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, z[0-9]+\.d\n\tuxtw\t\1, \2/m, z[0-9]+\.d\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4_run.c new file mode 100644 index 000000000..f3c4374ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_4_run.c @@ -0,0 +1,27 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_uxt_4.c" + +#define TEST_LOOP(TYPE, CONST) \ + { \ + TYPE r[NUM_ELEMS (TYPE)]; \ + TYPE a[NUM_ELEMS (TYPE)]; \ + TYPE b[NUM_ELEMS (TYPE)]; \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + { \ + a[i] = (i & 1 ? i : 3 * i); \ + b[i] = (i >> 4) << (i & 15); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##CONST##_##TYPE (r, a, b); \ + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \ + if (r[i] != (a[i] > 20 ? b[i] & CONST : 0)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_1.c b/gcc/testsuite/gcc.target/aarch64/sve/const_1.c new file mode 100644 index 000000000..ae25dcb73 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/const_1.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <stdint.h> + +void +set (uint64_t *dst, int count) +{ + for (int i = 0; i < count; ++i) + dst[i] = 0xffff00ff00ffff00ULL; +} + +/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.2d, 0xffff00ff00ffff00\n.*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_2.c b/gcc/testsuite/gcc.target/aarch64/sve/const_2.c new file mode 100644 index 000000000..7b2b5c2a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/const_2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <stdint.h> + +#define TEST(TYPE, CONST) \ + void \ + set_##TYPE (TYPE *dst, int count) \ + { \ + for (int i = 0; i < count; ++i) \ + dst[i] = CONST; \ + } + +TEST (uint16_t, 129) +TEST (uint32_t, 129) +TEST (uint64_t, 129) + +/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.8h, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */ +/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.4s, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 129\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_3.c b/gcc/testsuite/gcc.target/aarch64/sve/const_3.c new file mode 100644 index 000000000..c18ceaedc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/const_3.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <stdint.h> + +#define TEST(TYPE,
CONST) \ + void \ + set_##TYPE (TYPE *dst, int count) \ + { \ + for (int i = 0; i < count; ++i) \ + dst[i] = CONST; \ + } + +TEST (uint16_t, 0x1234) +TEST (uint32_t, 0x1234) +TEST (uint64_t, 0x1234) + +/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.h, \1\n} } } */ +/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.s, \1\n} } } */ +/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ext_2.c b/gcc/testsuite/gcc.target/aarch64/sve/ext_2.c index 0fe7e4c28..5593b070c 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/ext_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/ext_2.c @@ -14,5 +14,4 @@ foo (void) asm volatile ("" :: "w" (x)); } -/* { dg-final { scan-assembler {\tmov\tz0\.d, z1\.d\n} } } */ -/* { dg-final { scan-assembler {\text\tz0\.b, z0\.b, z[01]\.b, #4\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\tz0, z1\n\text\tz0\.b, z0\.b, z1\.b, #4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ext_3.c b/gcc/testsuite/gcc.target/aarch64/sve/ext_3.c new file mode 100644 index 000000000..83c04c856 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/ext_3.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=1024" } */ + +typedef int vnx4si __attribute__((vector_size (128))); + +void +foo (void) +{ + register int x asm ("z0"); + register vnx4si y asm ("z1"); + + asm volatile ("" : "=w" (y)); + x = y[21]; + asm volatile ("" :: "w" (x)); +} + +/* { dg-final { scan-assembler {\tmovprfx\tz0, z1\n\text\tz0\.b, z0\.b, z1\.b, #84\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c new file mode 100644 index 000000000..13ad83be2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c @@ -0,0 +1,35 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O3 --save-temps" } */ + +#define N 16 + +typedef float *__restrict__ vnx4sf; +typedef double *__restrict__ vnx2df; +typedef _Float16 *__restrict__ vnx8hf_a; +typedef __fp16 *__restrict__ vnx8hf_b; + +extern float fabsf (float); +extern double fabs (double); + +#define FABD(type, abs, n) \ + void fabd_##type (type res, type a, type b) \ + { \ + int i; \ + for (i = 0; i < n; i++) \ + res[i] = abs (a[i] - b[i]); \ + } + +#define TEST_SVE_F_MODES(FUNC) \ + FUNC (vnx2df, fabs, N) \ + FUNC (vnx4sf, fabsf, N) \ + FUNC (vnx8hf_a, fabsf, N) \ + FUNC (vnx8hf_b, fabsf, N) \ + +TEST_SVE_F_MODES (FABD) + +/* { dg-final { scan-assembler "fabd" } } */ +/* { dg-final { scan-assembler-not "fsub" } } */ +/* { dg-final { scan-assembler-not "fabs" } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c new file mode 100644 index 000000000..158cd6c84 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c @@ -0,0 +1,20 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#define DO_OPS(TYPE) \ +TYPE fold_##TYPE (TYPE *src, int count) \ +{ \ + TYPE res = 0; \ + for (int i = 0; i < count; ++i) \ + res += src[i]; \ + return res; \ +} + +DO_OPS 
(_Float16) +DO_OPS (float) +DO_OPS (double) + +/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-not "sel" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmaxnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fmaxnm_1.c new file mode 100644 index 000000000..2f0d64bd4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/fmaxnm_1.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#ifndef FN +#define FN(X) __builtin_fmax##X +#endif + +#define DEF_LOOP(FN, TYPE, NAME, CONST) \ + void __attribute__ ((noipa)) \ + test_##TYPE##_##NAME (TYPE *__restrict x, \ + TYPE *__restrict y, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = FN (y[i], CONST); \ + } + +#define TEST_TYPE(T, FN, TYPE) \ + T (FN, TYPE, zero, 0) \ + T (FN, TYPE, one, 1) \ + T (FN, TYPE, two, 2) + +#define TEST_ALL(T) \ + TEST_TYPE (T, FN (f16), _Float16) \ + TEST_TYPE (T, FN (f32), float) \ + TEST_TYPE (T, FN (f64), double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fminnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fminnm_1.c new file mode 100644 index 000000000..547772e29 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/fminnm_1.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define FN(X) __builtin_fmin##X +#include "fmaxnm_1.c" + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #0\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1\.0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, #1\.0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #2\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0} 1 } } */ +/* { 
dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #2\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c new file mode 100644 index 000000000..8e6004337 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1.c @@ -0,0 +1,22 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 1.1: Trailing constants with stepped sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** index (z[0-9]+\.s), #1, #1 +** insr \1, w1 +** insr \1, w0 +** ... +*/ +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, 1, 2, 3, 4, 5, 6 }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c new file mode 100644 index 000000000..bee039415 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10.c @@ -0,0 +1,24 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 5.4: Interleaved repeating elements and non-repeating elements. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** mov (z[0-9]+\.s), w3 +** mov (z[0-9]+\.s), w2 +** insr \2, w1 +** insr \2, w0 +** zip1 \2, \2, \1 +** ... +*/ +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int f) +{ + return (vnx4si) { a, f, b, f, c, f, c, f }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c new file mode 100644 index 000000000..9a6d8650e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_10_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_10.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int f = 13; + + vnx4si v = foo (a, b, c, f); + int expected[] = { a, f, b, f, c, f, c, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c new file mode 100644 index 000000000..8a9496f34 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11.c @@ -0,0 +1,23 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 5.5: Interleaved repeating elements and trailing same elements. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** mov (z[0-9]+\.s), w1 +** insr \1, w0 +** mov (z[0-9]+\.s), w2 +** zip1 \1, \1, \2 +** ...
+*/ +__attribute__((noipa)) +vnx4si foo(int a, int b, int f) +{ + return (vnx4si) { a, f, b, f, b, f, b, f }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c new file mode 100644 index 000000000..437155581 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_11_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_11.c" + +int main() +{ + int a = 10; + int b = 11; + int f = 12; + + vnx4si v = foo (a, b, f); + int expected[] = { a, f, b, f, b, f, b, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c new file mode 100644 index 000000000..bc698ddd3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12.c @@ -0,0 +1,26 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 5.5: Interleaved repeating elements and trailing same elements. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** fmov (s[0-9]+), w1 +** mov (z[0-9]+\.s), w2 +** mov (z[0-9]+\.s), w0 +** insr \3, \1 +** insr \3, \1 +** insr \3, \1 +** zip1 \3, \3, \2 +** ... +*/ +__attribute__((noipa)) +vnx4si foo(int a, int b, int f) +{ + return (vnx4si) { b, f, b, f, b, f, a, f }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c new file mode 100644 index 000000000..5ce7edb1e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_12_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_12.c" + +int main() +{ + int a = 10; + int b = 11; + int f = 12; + + vnx4si v = foo (a, b, f); + int expected[] = { b, f, b, f, b, f, a, f }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_13.c b/gcc/testsuite/gcc.target/aarch64/sve/init_13.c new file mode 100644 index 000000000..eea417063 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_13.c @@ -0,0 +1,17 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef float vnx4sf __attribute__((vector_size (32))); + +/* +** foo: +** mov (z[0-9]+\.s), s0 +** insr \1, wzr +** ...
+*/ +vnx4sf +foo (float a) +{ + return (vnx4sf) { 0.0f, a, a, a, a, a, a, a }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c new file mode 100644 index 000000000..824a5cbea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_1_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_1.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, 1, 2, 3, 4, 5, 6 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c new file mode 100644 index 000000000..0a8aa8dec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2.c @@ -0,0 +1,23 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 1.2: Trailing constants with repeating sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** ... +** ld1rd (z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\] +** insr \1\.s, w1 +** insr \1\.s, w0 +** ... +*/ +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, 2, 3, 2, 3, 2, 3 }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c new file mode 100644 index 000000000..86c191c77 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_2_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_2.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, 2, 3, 2, 3, 2, 3 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c new file mode 100644 index 000000000..4a418b633 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3.c @@ -0,0 +1,24 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 2.1: Leading constants with stepped sequence. */ + +#include <stdint.h> + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** index (z[0-9]+\.s), #6, #-1 +** insr \1, w0 +** insr \1, w1 +** rev \1, \1 +** ...
+*/ +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { 1, 2, 3, 4, 5, 6, a, b }; +} + diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c new file mode 100644 index 000000000..ce4de6950 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_3_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_3.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { 1, 2, 3, 4, 5, 6, a, b }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c new file mode 100644 index 000000000..0fa99c151 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4.c @@ -0,0 +1,24 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 2.2: Leading constants with stepped sequence. */ + +#include + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** ... +** ld1rd (z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\] +** insr \1\.s, w1 +** insr \1\.s, w0 +** rev \1\.s, \1\.s +** ... +*/ +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { 3, 2, 3, 2, 3, 2, b, a }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c new file mode 100644 index 000000000..defee421f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_4_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_4.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { 3, 2, 3, 2, 3, 2, b, a }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c new file mode 100644 index 000000000..794e265c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5.c @@ -0,0 +1,22 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 3: Trailing same element. */ + +#include + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** mov (z[0-9]+\.s), w2 +** insr \1, w1 +** insr \1, w0 +** ... 
+*/ +__attribute__((noipa)) +vnx4si foo(int a, int b, int c) +{ + return (vnx4si) { a, b, c, c, c, c, c, c }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c new file mode 100644 index 000000000..ba91d6fec --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_5_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_5.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + + vnx4si v = foo (a, b, c); + int expected[] = { a, b, c, c, c, c, c, c }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c new file mode 100644 index 000000000..8443fc000 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6.c @@ -0,0 +1,23 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 3: Trailing same element. */ + +#include + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** mov (z[0-9]+\.s), w2 +** insr \1, w1 +** insr \1, w0 +** rev \1, \1 +** ... +*/ +__attribute__((noipa)) +vnx4si foo(int a, int b, int c) +{ + return (vnx4si) { c, c, c, c, c, c, b, a }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c new file mode 100644 index 000000000..802b28f98 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_6_run.c @@ -0,0 +1,20 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_6.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + + vnx4si v = foo (a, b, c); + int expected[] = { c, c, c, c, c, c, b, a }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c new file mode 100644 index 000000000..63dbbbe61 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7.c @@ -0,0 +1,27 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 5.1: All elements. */ + +#include + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** mov (z[0-9]+\.s), w7 +** insr \1, w6 +** insr \1, w5 +** insr \1, w4 +** insr \1, w3 +** insr \1, w2 +** insr \1, w1 +** insr \1, w0 +** ... 
+*/ +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int d, int e, int f, int g, int h) +{ + return (vnx4si) { a, b, c, d, e, f, g, h }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c new file mode 100644 index 000000000..61fe28508 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_7_run.c @@ -0,0 +1,25 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_7.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int d = 13; + int e = 14; + int f = 15; + int g = 16; + int h = 17; + + vnx4si v = foo (a, b, c, d, e, f, g, h); + int expected[] = { a, b, c, d, e, f, g, h }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c new file mode 100644 index 000000000..9c2456785 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8.c @@ -0,0 +1,26 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 5.2: Interleaved elements and constants. */ + +#include + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** ... +** ld1w (z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\] +** mov (z[0-9]+\.s), w3 +** insr \2, w2 +** insr \2, w1 +** insr \2, w0 +** zip1 \2, \2, \1 +** ... +*/ +__attribute__((noipa)) +vnx4si foo(int a, int b, int c, int d) +{ + return (vnx4si) { a, 1, b, 2, c, 3, d, 4 }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c new file mode 100644 index 000000000..24a0a6e06 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_8_run.c @@ -0,0 +1,21 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_8.c" + +int main() +{ + int a = 10; + int b = 11; + int c = 12; + int d = 13; + + vnx4si v = foo (a, b, c, d); + int expected[] = { a, 1, b, 2, c, 3, d, 4 }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c new file mode 100644 index 000000000..d22ab71e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9.c @@ -0,0 +1,22 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +/* Case 5.3: Repeated elements. */ + +#include + +typedef int32_t vnx4si __attribute__((vector_size (32))); + +/* +** foo: +** mov (z[0-9]+\.s), w0 +** mov (z[0-9]+\.s), w1 +** zip1 \1, \1, \2 +** ... 
+*/ +__attribute__((noipa)) +vnx4si foo(int a, int b) +{ + return (vnx4si) { a, b, a, b, a, b, a, b }; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c new file mode 100644 index 000000000..636ae3b8b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/init_9_run.c @@ -0,0 +1,19 @@ +/* { dg-do run { target aarch64_sve256_hw } } */ +/* { dg-options "-O2 -msve-vector-bits=256" } */ + +#include "init_9.c" + +int main() +{ + int a = 10; + int b = 11; + + vnx4si v = foo (a, b); + int expected[] = { a, b, a, b, a, b, a, b }; + + for (int i = 0; i < 8; i++) + if (v[i] != expected[i]) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c b/gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c index 2e6b59ab4..e0e0f4ee6 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c @@ -28,22 +28,6 @@ T (int64_t) #define FOR_EACH_LOAD_BROADCAST_IMM(T) \ - T (int16_t, 129, imm_129) \ - T (int32_t, 129, imm_129) \ - T (int64_t, 129, imm_129) \ - \ - T (int16_t, -130, imm_m130) \ - T (int32_t, -130, imm_m130) \ - T (int64_t, -130, imm_m130) \ - \ - T (int16_t, 0x1234, imm_0x1234) \ - T (int32_t, 0x1234, imm_0x1234) \ - T (int64_t, 0x1234, imm_0x1234) \ - \ - T (int16_t, 0xFEDC, imm_0xFEDC) \ - T (int32_t, 0xFEDC, imm_0xFEDC) \ - T (int64_t, 0xFEDC, imm_0xFEDC) \ - \ T (int32_t, 0x12345678, imm_0x12345678) \ T (int64_t, 0x12345678, imm_0x12345678) \ \ @@ -56,6 +40,6 @@ FOR_EACH_LOAD_BROADCAST (DEF_LOAD_BROADCAST) FOR_EACH_LOAD_BROADCAST_IMM (DEF_LOAD_BROADCAST_IMM) /* { dg-final { scan-assembler-times {\tld1rb\tz[0-9]+\.b, p[0-7]/z, } 1 } } */ -/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 5 } } */ -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 7 } } */ -/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 8 } } */ +/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 1 } } */ +/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 3 } } */ +/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 4 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c index 7f02497e8..9ead9c21b 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c @@ -68,7 +68,8 @@ TEST_ALL (LOOP) /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */ /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */ /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */ -/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */ +/* 2 for the calculations of -17 and 17. 
*/ +/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */ /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */ /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */ @@ -85,7 +86,8 @@ TEST_ALL (LOOP) /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */ /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */ -/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */ +/* 2 for the calculations of -17 and 17. */ +/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */ /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */ /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/aarch64-sve-pcs.exp b/gcc/testsuite/gcc.target/aarch64/sve/pcs/aarch64-sve-pcs.exp new file mode 100644 index 000000000..745887593 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/aarch64-sve-pcs.exp @@ -0,0 +1,52 @@ +# Specific regression driver for AArch64 SVE. +# Copyright (C) 2009-2019 Free Software Foundation, Inc. +# Contributed by ARM Ltd. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# . */ + +# GCC testsuite that uses the `dg.exp' driver. + +# Exit immediately if this isn't an AArch64 target. +if {![istarget aarch64*-*-*] } then { + return +} + +# Load support procs. +load_lib gcc-dg.exp + +# If a testcase doesn't have special options, use these. +global DEFAULT_CFLAGS +if ![info exists DEFAULT_CFLAGS] then { + set DEFAULT_CFLAGS " -ansi -pedantic-errors" +} + +# Initialize `dg'. +dg-init + +# Force SVE if we're not testing it already. +if { [check_effective_target_aarch64_sve] } { + set sve_flags "" +} else { + set sve_flags "-march=armv8.2-a+sve" +} + +# Main loop. +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ + $sve_flags $DEFAULT_CFLAGS + +# All done. 
+dg-finish diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c new file mode 100644 index 000000000..12ae76789 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_1.c @@ -0,0 +1,112 @@ +/* { dg-do compile } */ + +#include + +svbool_t ret_b (void) { return svptrue_b8 (); } + +svint8_t ret_s8 (void) { return svdup_s8 (0); } +svint16_t ret_s16 (void) { return svdup_s16 (0); } +svint32_t ret_s32 (void) { return svdup_s32 (0); } +svint64_t ret_s64 (void) { return svdup_s64 (0); } +svuint8_t ret_u8 (void) { return svdup_u8 (0); } +svuint16_t ret_u16 (void) { return svdup_u16 (0); } +svuint32_t ret_u32 (void) { return svdup_u32 (0); } +svuint64_t ret_u64 (void) { return svdup_u64 (0); } +svbfloat16_t ret_bf16 (void) { return svundef_bf16 (); } +svfloat16_t ret_f16 (void) { return svdup_f16 (0); } +svfloat32_t ret_f32 (void) { return svdup_f32 (0); } +svfloat64_t ret_f64 (void) { return svdup_f64 (0); } + +svint8x2_t ret_s8x2 (void) { return svundef2_s8 (); } +svint16x2_t ret_s16x2 (void) { return svundef2_s16 (); } +svint32x2_t ret_s32x2 (void) { return svundef2_s32 (); } +svint64x2_t ret_s64x2 (void) { return svundef2_s64 (); } +svuint8x2_t ret_u8x2 (void) { return svundef2_u8 (); } +svuint16x2_t ret_u16x2 (void) { return svundef2_u16 (); } +svuint32x2_t ret_u32x2 (void) { return svundef2_u32 (); } +svuint64x2_t ret_u64x2 (void) { return svundef2_u64 (); } +svbfloat16x2_t ret_bf16x2 (void) { return svundef2_bf16 (); } +svfloat16x2_t ret_f16x2 (void) { return svundef2_f16 (); } +svfloat32x2_t ret_f32x2 (void) { return svundef2_f32 (); } +svfloat64x2_t ret_f64x2 (void) { return svundef2_f64 (); } + +svint8x3_t ret_s8x3 (void) { return svundef3_s8 (); } +svint16x3_t ret_s16x3 (void) { return svundef3_s16 (); } +svint32x3_t ret_s32x3 (void) { return svundef3_s32 (); } +svint64x3_t ret_s64x3 (void) { return svundef3_s64 (); } +svuint8x3_t ret_u8x3 (void) { return svundef3_u8 (); } +svuint16x3_t ret_u16x3 (void) { return svundef3_u16 (); } +svuint32x3_t ret_u32x3 (void) { return svundef3_u32 (); } +svuint64x3_t ret_u64x3 (void) { return svundef3_u64 (); } +svbfloat16x3_t ret_bf16x3 (void) { return svundef3_bf16 (); } +svfloat16x3_t ret_f16x3 (void) { return svundef3_f16 (); } +svfloat32x3_t ret_f32x3 (void) { return svundef3_f32 (); } +svfloat64x3_t ret_f64x3 (void) { return svundef3_f64 (); } + +svint8x4_t ret_s8x4 (void) { return svundef4_s8 (); } +svint16x4_t ret_s16x4 (void) { return svundef4_s16 (); } +svint32x4_t ret_s32x4 (void) { return svundef4_s32 (); } +svint64x4_t ret_s64x4 (void) { return svundef4_s64 (); } +svuint8x4_t ret_u8x4 (void) { return svundef4_u8 (); } +svuint16x4_t ret_u16x4 (void) { return svundef4_u16 (); } +svuint32x4_t ret_u32x4 (void) { return svundef4_u32 (); } +svuint64x4_t ret_u64x4 (void) { return svundef4_u64 (); } +svbfloat16x4_t ret_bf16x4 (void) { return svundef4_bf16 (); } +svfloat16x4_t ret_f16x4 (void) { return svundef4_f16 (); } +svfloat32x4_t ret_f32x4 (void) { return svundef4_f32 (); } +svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); } + +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_b\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u8\n} } } */ +/* { dg-final { 
scan-assembler {\t\.variant_pcs\tret_u16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x2\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s64x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u8x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x3\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s8x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_s64x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u8x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c new file mode 100644 index 000000000..9f0741e3c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_2.c @@ -0,0 +1,111 @@ +/* { dg-do compile } */ + +#include + +void fn_b (svbool_t x) {} + +void fn_s8 (svint8_t x) {} +void fn_s16 (svint16_t x) {} +void fn_s32 (svint32_t x) {} +void fn_s64 (svint64_t x) {} +void fn_u8 (svuint8_t x) {} +void fn_u16 
(svuint16_t x) {} +void fn_u32 (svuint32_t x) {} +void fn_u64 (svuint64_t x) {} +void fn_bf16 (svbfloat16_t x) {} +void fn_f16 (svfloat16_t x) {} +void fn_f32 (svfloat32_t x) {} +void fn_f64 (svfloat64_t x) {} + +void fn_s8x2 (svint8x2_t x) {} +void fn_s16x2 (svint16x2_t x) {} +void fn_s32x2 (svint32x2_t x) {} +void fn_s64x2 (svint64x2_t x) {} +void fn_u8x2 (svuint8x2_t x) {} +void fn_u16x2 (svuint16x2_t x) {} +void fn_u32x2 (svuint32x2_t x) {} +void fn_u64x2 (svuint64x2_t x) {} +void fn_bf16x2 (svbfloat16x2_t x) {} +void fn_f16x2 (svfloat16x2_t x) {} +void fn_f32x2 (svfloat32x2_t x) {} +void fn_f64x2 (svfloat64x2_t x) {} + +void fn_s8x3 (svint8x3_t x) {} +void fn_s16x3 (svint16x3_t x) {} +void fn_s32x3 (svint32x3_t x) {} +void fn_s64x3 (svint64x3_t x) {} +void fn_u8x3 (svuint8x3_t x) {} +void fn_u16x3 (svuint16x3_t x) {} +void fn_u32x3 (svuint32x3_t x) {} +void fn_u64x3 (svuint64x3_t x) {} +void fn_bf16x3 (svbfloat16x3_t x) {} +void fn_f16x3 (svfloat16x3_t x) {} +void fn_f32x3 (svfloat32x3_t x) {} +void fn_f64x3 (svfloat64x3_t x) {} + +void fn_s8x4 (svint8x4_t x) {} +void fn_s16x4 (svint16x4_t x) {} +void fn_s32x4 (svint32x4_t x) {} +void fn_s64x4 (svint64x4_t x) {} +void fn_u8x4 (svuint8x4_t x) {} +void fn_u16x4 (svuint16x4_t x) {} +void fn_u32x4 (svuint32x4_t x) {} +void fn_u64x4 (svuint64x4_t x) {} +void fn_bf16x4 (svbfloat16x4_t x) {} +void fn_f16x4 (svfloat16x4_t x) {} +void fn_f32x4 (svfloat32x4_t x) {} +void fn_f64x4 (svfloat64x4_t x) {} + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_b\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x3\n} } } */ +/* { dg-final { scan-assembler 
{\t\.variant_pcs\tfn_u16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_3.c new file mode 100644 index 000000000..42e7860ff --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_3.c @@ -0,0 +1,107 @@ +/* { dg-do compile } */ + +#include + +void fn_s8 (float d0, float d1, float d2, float d3, svint8_t x) {} +void fn_s16 (float d0, float d1, float d2, float d3, svint16_t x) {} +void fn_s32 (float d0, float d1, float d2, float d3, svint32_t x) {} +void fn_s64 (float d0, float d1, float d2, float d3, svint64_t x) {} +void fn_u8 (float d0, float d1, float d2, float d3, svuint8_t x) {} +void fn_u16 (float d0, float d1, float d2, float d3, svuint16_t x) {} +void fn_u32 (float d0, float d1, float d2, float d3, svuint32_t x) {} +void fn_u64 (float d0, float d1, float d2, float d3, svuint64_t x) {} +void fn_bf16 (float d0, float d1, float d2, float d3, svbfloat16_t x) {} +void fn_f16 (float d0, float d1, float d2, float d3, svfloat16_t x) {} +void fn_f32 (float d0, float d1, float d2, float d3, svfloat32_t x) {} +void fn_f64 (float d0, float d1, float d2, float d3, svfloat64_t x) {} + +void fn_s8x2 (float d0, float d1, float d2, float d3, svint8x2_t x) {} +void fn_s16x2 (float d0, float d1, float d2, float d3, svint16x2_t x) {} +void fn_s32x2 (float d0, float d1, float d2, float d3, svint32x2_t x) {} +void fn_s64x2 (float d0, float d1, float d2, float d3, svint64x2_t x) {} +void fn_u8x2 (float d0, float d1, float d2, float d3, svuint8x2_t x) {} +void fn_u16x2 (float d0, float d1, float d2, float d3, svuint16x2_t x) {} +void fn_u32x2 (float d0, float d1, float d2, float d3, svuint32x2_t x) {} +void fn_u64x2 (float d0, float d1, float d2, float d3, svuint64x2_t x) {} +void fn_bf16x2 (float d0, float d1, float d2, float d3, svbfloat16x2_t x) {} +void fn_f16x2 (float d0, float d1, float d2, float d3, svfloat16x2_t x) {} +void fn_f32x2 (float d0, float d1, float d2, float d3, svfloat32x2_t x) {} +void fn_f64x2 (float d0, float d1, float d2, float d3, svfloat64x2_t x) {} + +void fn_s8x3 (float d0, float d1, float d2, float d3, svint8x3_t x) {} +void fn_s16x3 (float d0, float d1, float d2, float d3, svint16x3_t x) {} +void fn_s32x3 (float d0, float d1, float d2, float d3, svint32x3_t x) {} 
+void fn_s64x3 (float d0, float d1, float d2, float d3, svint64x3_t x) {} +void fn_u8x3 (float d0, float d1, float d2, float d3, svuint8x3_t x) {} +void fn_u16x3 (float d0, float d1, float d2, float d3, svuint16x3_t x) {} +void fn_u32x3 (float d0, float d1, float d2, float d3, svuint32x3_t x) {} +void fn_u64x3 (float d0, float d1, float d2, float d3, svuint64x3_t x) {} +void fn_bf16x3 (float d0, float d1, float d2, float d3, svbfloat16x3_t x) {} +void fn_f16x3 (float d0, float d1, float d2, float d3, svfloat16x3_t x) {} +void fn_f32x3 (float d0, float d1, float d2, float d3, svfloat32x3_t x) {} +void fn_f64x3 (float d0, float d1, float d2, float d3, svfloat64x3_t x) {} + +void fn_s8x4 (float d0, float d1, float d2, float d3, svint8x4_t x) {} +void fn_s16x4 (float d0, float d1, float d2, float d3, svint16x4_t x) {} +void fn_s32x4 (float d0, float d1, float d2, float d3, svint32x4_t x) {} +void fn_s64x4 (float d0, float d1, float d2, float d3, svint64x4_t x) {} +void fn_u8x4 (float d0, float d1, float d2, float d3, svuint8x4_t x) {} +void fn_u16x4 (float d0, float d1, float d2, float d3, svuint16x4_t x) {} +void fn_u32x4 (float d0, float d1, float d2, float d3, svuint32x4_t x) {} +void fn_u64x4 (float d0, float d1, float d2, float d3, svuint64x4_t x) {} +void fn_bf16x4 (float d0, float d1, float d2, float d3, svbfloat16x4_t x) {} +void fn_f16x4 (float d0, float d1, float d2, float d3, svfloat16x4_t x) {} +void fn_f32x4 (float d0, float d1, float d2, float d3, svfloat32x4_t x) {} +void fn_f64x4 (float d0, float d1, float d2, float d3, svfloat64x4_t x) {} + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x3\n} } } 
*/ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x4\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_4.c new file mode 100644 index 000000000..7e4438ed4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_4.c @@ -0,0 +1,155 @@ +/* { dg-do compile } */ + +#include + +void fn_s8 (float d0, float d1, float d2, float d3, + float d4, svint8_t x) {} +void fn_s16 (float d0, float d1, float d2, float d3, + float d4, svint16_t x) {} +void fn_s32 (float d0, float d1, float d2, float d3, + float d4, svint32_t x) {} +void fn_s64 (float d0, float d1, float d2, float d3, + float d4, svint64_t x) {} +void fn_u8 (float d0, float d1, float d2, float d3, + float d4, svuint8_t x) {} +void fn_u16 (float d0, float d1, float d2, float d3, + float d4, svuint16_t x) {} +void fn_u32 (float d0, float d1, float d2, float d3, + float d4, svuint32_t x) {} +void fn_u64 (float d0, float d1, float d2, float d3, + float d4, svuint64_t x) {} +void fn_bf16 (float d0, float d1, float d2, float d3, + float d4, svbfloat16_t x) {} +void fn_f16 (float d0, float d1, float d2, float d3, + float d4, svfloat16_t x) {} +void fn_f32 (float d0, float d1, float d2, float d3, + float d4, svfloat32_t x) {} +void fn_f64 (float d0, float d1, float d2, float d3, + float d4, svfloat64_t x) {} + +void fn_s8x2 (float d0, float d1, float d2, float d3, + float d4, svint8x2_t x) {} +void fn_s16x2 (float d0, float d1, float d2, float d3, + float d4, svint16x2_t x) {} +void fn_s32x2 (float d0, float d1, float d2, float d3, + float d4, svint32x2_t x) {} +void fn_s64x2 (float d0, float d1, float d2, float d3, + float d4, svint64x2_t x) {} +void fn_u8x2 (float d0, float d1, float d2, float d3, + float d4, svuint8x2_t x) {} +void fn_u16x2 (float d0, float d1, float d2, float d3, + float d4, svuint16x2_t x) {} +void fn_u32x2 (float d0, float d1, float d2, float d3, + float d4, svuint32x2_t x) {} +void fn_u64x2 (float d0, float d1, float d2, float d3, + float d4, svuint64x2_t x) {} +void fn_bf16x2 (float d0, float d1, float d2, float d3, + float d4, svbfloat16x2_t x) {} +void fn_f16x2 (float d0, float d1, float d2, float d3, + float d4, svfloat16x2_t x) {} +void fn_f32x2 (float d0, float d1, float d2, float d3, + float d4, 
svfloat32x2_t x) {} +void fn_f64x2 (float d0, float d1, float d2, float d3, + float d4, svfloat64x2_t x) {} + +void fn_s8x3 (float d0, float d1, float d2, float d3, + float d4, svint8x3_t x) {} +void fn_s16x3 (float d0, float d1, float d2, float d3, + float d4, svint16x3_t x) {} +void fn_s32x3 (float d0, float d1, float d2, float d3, + float d4, svint32x3_t x) {} +void fn_s64x3 (float d0, float d1, float d2, float d3, + float d4, svint64x3_t x) {} +void fn_u8x3 (float d0, float d1, float d2, float d3, + float d4, svuint8x3_t x) {} +void fn_u16x3 (float d0, float d1, float d2, float d3, + float d4, svuint16x3_t x) {} +void fn_u32x3 (float d0, float d1, float d2, float d3, + float d4, svuint32x3_t x) {} +void fn_u64x3 (float d0, float d1, float d2, float d3, + float d4, svuint64x3_t x) {} +void fn_bf16x3 (float d0, float d1, float d2, float d3, + float d4, svbfloat16x3_t x) {} +void fn_f16x3 (float d0, float d1, float d2, float d3, + float d4, svfloat16x3_t x) {} +void fn_f32x3 (float d0, float d1, float d2, float d3, + float d4, svfloat32x3_t x) {} +void fn_f64x3 (float d0, float d1, float d2, float d3, + float d4, svfloat64x3_t x) {} + +void fn_s8x4 (float d0, float d1, float d2, float d3, + float d4, svint8x4_t x) {} +void fn_s16x4 (float d0, float d1, float d2, float d3, + float d4, svint16x4_t x) {} +void fn_s32x4 (float d0, float d1, float d2, float d3, + float d4, svint32x4_t x) {} +void fn_s64x4 (float d0, float d1, float d2, float d3, + float d4, svint64x4_t x) {} +void fn_u8x4 (float d0, float d1, float d2, float d3, + float d4, svuint8x4_t x) {} +void fn_u16x4 (float d0, float d1, float d2, float d3, + float d4, svuint16x4_t x) {} +void fn_u32x4 (float d0, float d1, float d2, float d3, + float d4, svuint32x4_t x) {} +void fn_u64x4 (float d0, float d1, float d2, float d3, + float d4, svuint64x4_t x) {} +void fn_bf16x4 (float d0, float d1, float d2, float d3, + float d4, svbfloat16x4_t x) {} +void fn_f16x4 (float d0, float d1, float d2, float d3, + float d4, svfloat16x4_t x) {} +void fn_f32x4 (float d0, float d1, float d2, float d3, + float d4, svfloat32x4_t x) {} +void fn_f64x4 (float d0, float d1, float d2, float d3, + float d4, svfloat64x4_t x) {} + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */ 
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */ + +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_5.c new file mode 100644 index 000000000..6dadc0492 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_5.c @@ -0,0 +1,155 @@ +/* { dg-do compile } */ + +#include + +void fn_s8 (float d0, float d1, float d2, float d3, + float d4, float d5, svint8_t x) {} +void fn_s16 (float d0, float d1, float d2, float d3, + float d4, float d5, svint16_t x) {} +void fn_s32 (float d0, float d1, float d2, float d3, + float d4, float d5, svint32_t x) {} +void fn_s64 (float d0, float d1, float d2, float d3, + float d4, float d5, svint64_t x) {} +void fn_u8 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint8_t x) {} +void fn_u16 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint16_t x) {} +void fn_u32 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint32_t x) {} +void fn_u64 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint64_t x) {} +void fn_bf16 (float d0, float d1, float d2, float d3, + float d4, float d5, svbfloat16_t x) {} +void fn_f16 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat16_t x) {} +void fn_f32 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat32_t x) {} +void fn_f64 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat64_t x) {} + +void fn_s8x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svint8x2_t x) {} +void fn_s16x2 (float d0, float d1, float d2, float d3, + 
float d4, float d5, svint16x2_t x) {} +void fn_s32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svint32x2_t x) {} +void fn_s64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svint64x2_t x) {} +void fn_u8x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint8x2_t x) {} +void fn_u16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint16x2_t x) {} +void fn_u32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint32x2_t x) {} +void fn_u64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint64x2_t x) {} +void fn_bf16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svbfloat16x2_t x) {} +void fn_f16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat16x2_t x) {} +void fn_f32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat32x2_t x) {} +void fn_f64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat64x2_t x) {} + +void fn_s8x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svint8x3_t x) {} +void fn_s16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svint16x3_t x) {} +void fn_s32x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svint32x3_t x) {} +void fn_s64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svint64x3_t x) {} +void fn_u8x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint8x3_t x) {} +void fn_u16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint16x3_t x) {} +void fn_u32x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint32x3_t x) {} +void fn_u64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint64x3_t x) {} +void fn_bf16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svbfloat16x3_t x) {} +void fn_f16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat16x3_t x) {} +void fn_f32x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat32x3_t x) {} +void fn_f64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat64x3_t x) {} + +void fn_s8x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svint8x4_t x) {} +void fn_s16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svint16x4_t x) {} +void fn_s32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svint32x4_t x) {} +void fn_s64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svint64x4_t x) {} +void fn_u8x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint8x4_t x) {} +void fn_u16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint16x4_t x) {} +void fn_u32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint32x4_t x) {} +void fn_u64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svuint64x4_t x) {} +void fn_bf16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svbfloat16x4_t x) {} +void fn_f16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat16x4_t x) {} +void fn_f32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat32x4_t x) {} +void fn_f64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, svfloat64x4_t x) {} + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ +/* { dg-final 
{ scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */ + +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x3\n} } } */ + +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_6.c new file mode 100644 index 000000000..0ff73e259 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_6.c @@ -0,0 +1,155 @@ +/* { dg-do compile } */ + +#include <arm_sve.h> + +void fn_s8 (float d0, float d1, float d2, float d3, + float d4, float d5, float 
d6, svint8_t x) {} +void fn_s16 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint16_t x) {} +void fn_s32 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint32_t x) {} +void fn_s64 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint64_t x) {} +void fn_u8 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint8_t x) {} +void fn_u16 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint16_t x) {} +void fn_u32 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint32_t x) {} +void fn_u64 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint64_t x) {} +void fn_bf16 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svbfloat16_t x) {} +void fn_f16 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat16_t x) {} +void fn_f32 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat32_t x) {} +void fn_f64 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat64_t x) {} + +void fn_s8x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint8x2_t x) {} +void fn_s16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint16x2_t x) {} +void fn_s32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint32x2_t x) {} +void fn_s64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint64x2_t x) {} +void fn_u8x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint8x2_t x) {} +void fn_u16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint16x2_t x) {} +void fn_u32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint32x2_t x) {} +void fn_u64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint64x2_t x) {} +void fn_bf16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svbfloat16x2_t x) {} +void fn_f16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat16x2_t x) {} +void fn_f32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat32x2_t x) {} +void fn_f64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat64x2_t x) {} + +void fn_s8x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint8x3_t x) {} +void fn_s16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint16x3_t x) {} +void fn_s32x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint32x3_t x) {} +void fn_s64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint64x3_t x) {} +void fn_u8x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint8x3_t x) {} +void fn_u16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint16x3_t x) {} +void fn_u32x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint32x3_t x) {} +void fn_u64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint64x3_t x) {} +void fn_bf16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svbfloat16x3_t x) {} +void fn_f16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat16x3_t x) {} +void fn_f32x3 (float d0, float d1, float d2, 
float d3, + float d4, float d5, float d6, svfloat32x3_t x) {} +void fn_f64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat64x3_t x) {} + +void fn_s8x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint8x4_t x) {} +void fn_s16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint16x4_t x) {} +void fn_s32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint32x4_t x) {} +void fn_s64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svint64x4_t x) {} +void fn_u8x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint8x4_t x) {} +void fn_u16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint16x4_t x) {} +void fn_u32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint32x4_t x) {} +void fn_u64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svuint64x4_t x) {} +void fn_bf16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svbfloat16x4_t x) {} +void fn_f16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat16x4_t x) {} +void fn_f32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat32x4_t x) {} +void fn_f64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, svfloat64x4_t x) {} + +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_s64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u8\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */ +/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */ + +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x2\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x2\n} } } */ + +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x3\n} } } */ +/* { dg-final { scan-assembler-not 
{\t\.variant_pcs\tfn_u16x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x3\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x3\n} } } */ + +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s8x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_s64x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u8x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */ +/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_7.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_7.c new file mode 100644 index 000000000..4f3ff8107 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/annotate_7.c @@ -0,0 +1,105 @@ +/* { dg-do compile } */ + +#include <arm_sve.h> + +void fn_s8 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint8_t x) {} +void fn_s16 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint16_t x) {} +void fn_s32 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint32_t x) {} +void fn_s64 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint64_t x) {} +void fn_u8 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint8_t x) {} +void fn_u16 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint16_t x) {} +void fn_u32 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint32_t x) {} +void fn_u64 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint64_t x) {} +void fn_bf16 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svbfloat16_t x) {} +void fn_f16 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat16_t x) {} +void fn_f32 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat32_t x) {} +void fn_f64 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat64_t x) {} + +void fn_s8x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint8x2_t x) {} +void fn_s16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint16x2_t x) {} +void fn_s32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint32x2_t x) {} +void fn_s64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint64x2_t x) {} +void fn_u8x2 (float d0, float d1, float d2, float d3, + float d4, float d5, 
float d6, float d7, svuint8x2_t x) {} +void fn_u16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint16x2_t x) {} +void fn_u32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint32x2_t x) {} +void fn_u64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint64x2_t x) {} +void fn_bf16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svbfloat16x2_t x) {} +void fn_f16x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat16x2_t x) {} +void fn_f32x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat32x2_t x) {} +void fn_f64x2 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat64x2_t x) {} + +void fn_s8x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint8x3_t x) {} +void fn_s16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint16x3_t x) {} +void fn_s32x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint32x3_t x) {} +void fn_s64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint64x3_t x) {} +void fn_u8x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint8x3_t x) {} +void fn_u16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint16x3_t x) {} +void fn_u32x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint32x3_t x) {} +void fn_u64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint64x3_t x) {} +void fn_bf16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svbfloat16x3_t x) {} +void fn_f16x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat16x3_t x) {} +void fn_f32x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat32x3_t x) {} +void fn_f64x3 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat64x3_t x) {} + +void fn_s8x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint8x4_t x) {} +void fn_s16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint16x4_t x) {} +void fn_s32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint32x4_t x) {} +void fn_s64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svint64x4_t x) {} +void fn_u8x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint8x4_t x) {} +void fn_u16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint16x4_t x) {} +void fn_u32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint32x4_t x) {} +void fn_u64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svuint64x4_t x) {} +void fn_bf16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svbfloat16x4_t x) {} +void fn_f16x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat16x4_t x) {} +void fn_f32x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat32x4_t x) {} 
+void fn_f64x4 (float d0, float d1, float d2, float d3, + float d4, float d5, float d6, float d7, svfloat64x4_t x) {} + +/* { dg-final { scan-assembler-not {\t\.variant_pcs\t\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c new file mode 100644 index 000000000..fd9932e2e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include <arm_sve.h> + +/* +** callee_pred: +** ldr (p[0-9]+), \[x0\] +** ldr (p[0-9]+), \[x1\] +** brkpa (p[0-7])\.b, p0/z, p1\.b, p2\.b +** brkpb (p[0-7])\.b, \3/z, p3\.b, \1\.b +** brka p0\.b, \4/z, \2\.b +** ret +*/ +__SVBool_t __attribute__((noipa)) +callee_pred (__SVBool_t p0, __SVBool_t p1, __SVBool_t p2, __SVBool_t p3, + __SVBool_t mem0, __SVBool_t mem1) +{ + p0 = svbrkpa_z (p0, p1, p2); + p0 = svbrkpb_z (p0, p3, mem0); + return svbrka_z (p0, mem1); +} + +/* +** caller_pred: +** ... +** ptrue (p[0-9]+)\.b, vl5 +** str \1, \[x0\] +** ... +** ptrue (p[0-9]+)\.h, vl6 +** str \2, \[x1\] +** ptrue p3\.d, vl4 +** ptrue p2\.s, vl3 +** ptrue p1\.h, vl2 +** ptrue p0\.b, vl1 +** bl callee_pred +** ... +*/ +__SVBool_t __attribute__((noipa)) +caller_pred (void) +{ + return callee_pred (svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4), + svptrue_pat_b8 (SV_VL5), + svptrue_pat_b16 (SV_VL6)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_10.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_10.c new file mode 100644 index 000000000..1bbcb770d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_10.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include <arm_sve.h> + +/* +** callee: +** fadd s0, (s0, s6|s6, s0) +** ret +*/ +float __attribute__((noipa)) +callee (float s0, double d1, svfloat32x4_t z2, svfloat64x4_t stack1, + float s6, double d7) +{ + return s0 + s6; +} + +float __attribute__((noipa)) +caller (float32_t *x0, float64_t *x1) +{ + return callee (0.0f, 1.0, + svld4 (svptrue_b8 (), x0), + svld4 (svptrue_b8 (), x1), + 6.0f, 7.0); +} + +/* { dg-final { scan-assembler {\tld4w\t{z2\.s - z5\.s}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - z[0-9]+\.d}, p[0-7]/z, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tmovi\tv0\.[24]s, #0\n} } } */ +/* { dg-final { scan-assembler {\tfmov\td1, #?1\.0} } } */ +/* { dg-final { scan-assembler {\tfmov\ts6, #?6\.0} } } */ +/* { dg-final { scan-assembler {\tfmov\td7, #?7\.0} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_nosc.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_nosc.c new file mode 100644 index 000000000..0f62e0b08 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_nosc.c @@ -0,0 +1,61 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O0 -g" } */ + +#include <arm_sve.h> + +void __attribute__((noipa)) +callee (svbool_t p, svint8_t s8, svuint16x4_t u16, svfloat32x3_t f32, + svint64x2_t s64) +{ + svbool_t pg; + pg = svptrue_b8 (); + + if (svptest_any (pg, sveor_z (pg, p, svptrue_pat_b8 (SV_VL7)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, s8, svindex_s8 (1, 2)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 0), svindex_u16 (2, 3)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, 
svget4 (u16, 1), svindex_u16 (3, 4)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 2), svindex_u16 (4, 5)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 3), svindex_u16 (5, 6)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 0), svdup_f32 (1.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 1), svdup_f32 (2.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 2), svdup_f32 (3.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget2 (s64, 0), svindex_s64 (6, 7)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget2 (s64, 1), svindex_s64 (7, 8)))) + __builtin_abort (); +} + +int __attribute__((noipa)) +main (void) +{ + callee (svptrue_pat_b8 (SV_VL7), + svindex_s8 (1, 2), + svcreate4 (svindex_u16 (2, 3), + svindex_u16 (3, 4), + svindex_u16 (4, 5), + svindex_u16 (5, 6)), + svcreate3 (svdup_f32 (1.0), + svdup_f32 (2.0), + svdup_f32 (3.0)), + svcreate2 (svindex_s64 (6, 7), + svindex_s64 (7, 8))); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_sc.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_sc.c new file mode 100644 index 000000000..8a98d58ce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_11_sc.c @@ -0,0 +1,61 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O0 -fstack-clash-protection -g" } */ + +#include <arm_sve.h> + +void __attribute__((noipa)) +callee (svbool_t p, svint8_t s8, svuint16x4_t u16, svfloat32x3_t f32, + svint64x2_t s64) +{ + svbool_t pg; + pg = svptrue_b8 (); + + if (svptest_any (pg, sveor_z (pg, p, svptrue_pat_b8 (SV_VL7)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, s8, svindex_s8 (1, 2)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 0), svindex_u16 (2, 3)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 1), svindex_u16 (3, 4)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 2), svindex_u16 (4, 5)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 3), svindex_u16 (5, 6)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 0), svdup_f32 (1.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 1), svdup_f32 (2.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 2), svdup_f32 (3.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget2 (s64, 0), svindex_s64 (6, 7)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget2 (s64, 1), svindex_s64 (7, 8)))) + __builtin_abort (); +} + +int __attribute__((noipa)) +main (void) +{ + callee (svptrue_pat_b8 (SV_VL7), + svindex_s8 (1, 2), + svcreate4 (svindex_u16 (2, 3), + svindex_u16 (3, 4), + svindex_u16 (4, 5), + svindex_u16 (5, 6)), + svcreate3 (svdup_f32 (1.0), + svdup_f32 (2.0), + svdup_f32 (3.0)), + svcreate2 (svindex_s64 (6, 7), + svindex_s64 (7, 8))); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_2.c new file mode 100644 index 000000000..43a50887d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_2.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include <arm_sve.h> + +/* +** callee_int: +** ptrue p3\.b, all +** ld1b (z(?:2[4-9]|3[0-1]).b), p3/z, \[x4\] +** st1b \1, p2, \[x0\] +** st1b z4\.b, 
p1, \[x0\] +** st1h z5\.h, p1, \[x1\] +** st1w z6\.s, p1, \[x2\] +** st1d z7\.d, p1, \[x3\] +** st1b z0\.b, p0, \[x0\] +** st1h z1\.h, p0, \[x1\] +** st1w z2\.s, p0, \[x2\] +** st1d z3\.d, p0, \[x3\] +** ret +*/ +void __attribute__((noipa)) +callee_int (int8_t *x0, int16_t *x1, int32_t *x2, int64_t *x3, + svint8_t z0, svint16_t z1, svint32_t z2, svint64_t z3, + svint8_t z4, svint16_t z5, svint32_t z6, svint64_t z7, + svint8_t z8, + svbool_t p0, svbool_t p1, svbool_t p2) +{ + svst1 (p2, x0, z8); + svst1 (p1, x0, z4); + svst1 (p1, x1, z5); + svst1 (p1, x2, z6); + svst1 (p1, x3, z7); + svst1 (p0, x0, z0); + svst1 (p0, x1, z1); + svst1 (p0, x2, z2); + svst1 (p0, x3, z3); +} + +void __attribute__((noipa)) +caller_int (int8_t *x0, int16_t *x1, int32_t *x2, int64_t *x3) +{ + callee_int (x0, x1, x2, x3, + svdup_s8 (0), + svdup_s16 (1), + svdup_s32 (2), + svdup_s64 (3), + svdup_s8 (4), + svdup_s16 (5), + svdup_s32 (6), + svdup_s64 (7), + svdup_s8 (8), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tmov\tz0\.b, #0\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz1\.h, #1\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz2\.s, #2\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz3\.d, #3\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz4\.b, #4\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz5\.h, #5\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz6\.s, #6\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz7\.d, #7\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx4, sp\n} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.b), #8\n.*\tst1b\t\1, p[0-7], \[x4\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_3.c new file mode 100644 index 000000000..49fdfc984 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_3.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include <arm_sve.h> + +/* +** callee_uint: +** ptrue p3\.b, all +** ld1b (z(?:2[4-9]|3[0-1]).b), p3/z, \[x4\] +** st1b \1, p2, \[x0\] +** st1b z4\.b, p1, \[x0\] +** st1h z5\.h, p1, \[x1\] +** st1w z6\.s, p1, \[x2\] +** st1d z7\.d, p1, \[x3\] +** st1b z0\.b, p0, \[x0\] +** st1h z1\.h, p0, \[x1\] +** st1w z2\.s, p0, \[x2\] +** st1d z3\.d, p0, \[x3\] +** ret +*/ +void __attribute__((noipa)) +callee_uint (uint8_t *x0, uint16_t *x1, uint32_t *x2, uint64_t *x3, + svuint8_t z0, svuint16_t z1, svuint32_t z2, svuint64_t z3, + svuint8_t z4, svuint16_t z5, svuint32_t z6, svuint64_t z7, + svuint8_t z8, + svbool_t p0, svbool_t p1, svbool_t p2) +{ + svst1 (p2, x0, z8); + svst1 (p1, x0, z4); + svst1 (p1, x1, z5); + svst1 (p1, x2, z6); + svst1 (p1, x3, z7); + svst1 (p0, x0, z0); + svst1 (p0, x1, z1); + svst1 (p0, x2, z2); + svst1 (p0, x3, z3); +} + +void __attribute__((noipa)) +caller_uint (uint8_t *x0, uint16_t *x1, uint32_t *x2, uint64_t *x3) +{ + callee_uint (x0, x1, x2, x3, + svdup_u8 (0), + svdup_u16 (1), + svdup_u32 (2), + svdup_u64 (3), + svdup_u8 (4), + svdup_u16 (5), + svdup_u32 (6), + svdup_u64 (7), + svdup_u8 (8), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tmov\tz0\.b, #0\n} } } */ +/* { dg-final 
{ scan-assembler {\tmov\tz1\.h, #1\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz2\.s, #2\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz3\.d, #3\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz4\.b, #4\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz5\.h, #5\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz6\.s, #6\n} } } */ +/* { dg-final { scan-assembler {\tmov\tz7\.d, #7\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx4, sp\n} } } */ +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.b), #8\n.*\tst1b\t\1, p[0-7], \[x4\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_4.c new file mode 100644 index 000000000..4f15fdd50 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_4.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include <arm_sve.h> + +/* +** callee_float: +** ptrue p3\.b, all +** ld1h (z(?:2[4-9]|3[0-1]).h), p3/z, \[x4\] +** st1h \1, p2, \[x0\] +** st1h z4\.h, p1, \[x0\] +** st1h z5\.h, p1, \[x1\] +** st1w z6\.s, p1, \[x2\] +** st1d z7\.d, p1, \[x3\] +** st1h z0\.h, p0, \[x0\] +** st1h z1\.h, p0, \[x1\] +** st1w z2\.s, p0, \[x2\] +** st1d z3\.d, p0, \[x3\] +** ret +*/ +void __attribute__((noipa)) +callee_float (float16_t *x0, float16_t *x1, float32_t *x2, float64_t *x3, + svfloat16_t z0, svfloat16_t z1, svfloat32_t z2, svfloat64_t z3, + svfloat16_t z4, svfloat16_t z5, svfloat32_t z6, svfloat64_t z7, + svfloat16_t z8, + svbool_t p0, svbool_t p1, svbool_t p2) +{ + svst1 (p2, x0, z8); + svst1 (p1, x0, z4); + svst1 (p1, x1, z5); + svst1 (p1, x2, z6); + svst1 (p1, x3, z7); + svst1 (p0, x0, z0); + svst1 (p0, x1, z1); + svst1 (p0, x2, z2); + svst1 (p0, x3, z3); +} + +void __attribute__((noipa)) +caller_float (float16_t *x0, float16_t *x1, float32_t *x2, float64_t *x3) +{ + callee_float (x0, x1, x2, x3, + svdup_f16 (0), + svdup_f16 (1), + svdup_f32 (2), + svdup_f64 (3), + svdup_f16 (4), + svdup_f16 (5), + svdup_f32 (6), + svdup_f64 (7), + svdup_f16 (8), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tmov\tz0\.[bhsd], #0\n} } } */ +/* { dg-final { scan-assembler {\tfmov\tz1\.h, #1\.0} } } */ +/* { dg-final { scan-assembler {\tfmov\tz2\.s, #2\.0} } } */ +/* { dg-final { scan-assembler {\tfmov\tz3\.d, #3\.0} } } */ +/* { dg-final { scan-assembler {\tfmov\tz4\.h, #4\.0} } } */ +/* { dg-final { scan-assembler {\tfmov\tz5\.h, #5\.0} } } */ +/* { dg-final { scan-assembler {\tfmov\tz6\.s, #6\.0} } } */ +/* { dg-final { scan-assembler {\tfmov\tz7\.d, #7\.0} } } */ +/* { dg-final { scan-assembler {\tmov\tx4, sp\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.h), #8\.0.*\tst1h\t\1, p[0-7], \[x4\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_bf16.c new file mode 100644 index 000000000..e9b63a45d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_bf16.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { 
dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] +** ld1h (z[0-9]+\.h), p4/z, \[x1\] +** st2h {\2 - \1}, p0, \[x0\] +** | +** ld1h (z[0-9]+\.h), p4/z, \[x1\] +** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] +** st2h {\3 - \4}, p0, \[x0\] +** ) +** st4h {z0\.h - z3\.h}, p1, \[x0\] +** st3h {z4\.h - z6\.h}, p2, \[x0\] +** st1h z7\.h, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svbfloat16x4_t z0, svbfloat16x3_t z4, svbfloat16x2_t stack, + svbfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_bf16 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_bf16 (pg, x0, -8), + svld3_vnum_bf16 (pg, x0, -3), + svld2_vnum_bf16 (pg, x0, 0), + svld1_vnum_bf16 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f16.c new file mode 100644 index 000000000..4152f9125 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f16.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] +** ld1h (z[0-9]+\.h), p4/z, \[x1\] +** st2h {\2 - \1}, p0, \[x0\] +** | +** ld1h (z[0-9]+\.h), p4/z, \[x1\] +** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] +** st2h {\3 - \4}, p0, \[x0\] +** ) +** st4h {z0\.h - z3\.h}, p1, \[x0\] +** st3h {z4\.h - z6\.h}, p2, \[x0\] +** st1h z7\.h, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svfloat16x4_t z0, svfloat16x3_t z4, svfloat16x2_t stack, + svfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_f16 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_f16 (pg, x0, -8), + svld3_vnum_f16 (pg, x0, -3), + svld2_vnum_f16 (pg, x0, 0), + svld1_vnum_f16 (pg, x0, 2), + svptrue_pat_b8 
(SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f32.c new file mode 100644 index 000000000..0f78fac79 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f32.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] +** ld1w (z[0-9]+\.s), p4/z, \[x1\] +** st2w {\2 - \1}, p0, \[x0\] +** | +** ld1w (z[0-9]+\.s), p4/z, \[x1\] +** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] +** st2w {\3 - \4}, p0, \[x0\] +** ) +** st4w {z0\.s - z3\.s}, p1, \[x0\] +** st3w {z4\.s - z6\.s}, p2, \[x0\] +** st1w z7\.s, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svfloat32x4_t z0, svfloat32x3_t z4, svfloat32x2_t stack, + svfloat32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_f32 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_f32 (pg, x0, -8), + svld3_vnum_f32 (pg, x0, -3), + svld2_vnum_f32 (pg, x0, 0), + svld1_vnum_f32 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f64.c new file mode 100644 index 000000000..fe832d0d0 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_f64.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] +** ld1d (z[0-9]+\.d), p4/z, \[x1\] +** st2d {\2 - \1}, p0, \[x0\] +** | +** ld1d (z[0-9]+\.d), p4/z, \[x1\] +** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] +** st2d {\3 - \4}, p0, \[x0\] +** ) +** st4d {z0\.d - z3\.d}, p1, \[x0\] +** st3d {z4\.d - z6\.d}, p2, \[x0\] +** st1d z7\.d, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svfloat64x4_t z0, svfloat64x3_t z4, svfloat64x2_t stack, + svfloat64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_f64 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_f64 (pg, x0, -8), + svld3_vnum_f64 (pg, x0, -3), + svld2_vnum_f64 (pg, x0, 0), + svld1_vnum_f64 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s16.c new file mode 100644 index 000000000..3f708e0f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s16.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] +** ld1h (z[0-9]+\.h), p4/z, \[x1\] +** st2h {\2 - \1}, p0, \[x0\] +** | +** ld1h (z[0-9]+\.h), p4/z, \[x1\] +** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] +** st2h {\3 - \4}, p0, \[x0\] +** ) +** st4h {z0\.h - z3\.h}, p1, \[x0\] +** st3h {z4\.h - z6\.h}, p2, \[x0\] +** st1h z7\.h, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svint16x4_t z0, svint16x3_t z4, svint16x2_t stack, + svint16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_s16 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_s16 (pg, x0, -8), + 
svld3_vnum_s16 (pg, x0, -3), + svld2_vnum_s16 (pg, x0, 0), + svld1_vnum_s16 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s32.c new file mode 100644 index 000000000..8c57190ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s32.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] +** ld1w (z[0-9]+\.s), p4/z, \[x1\] +** st2w {\2 - \1}, p0, \[x0\] +** | +** ld1w (z[0-9]+\.s), p4/z, \[x1\] +** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] +** st2w {\3 - \4}, p0, \[x0\] +** ) +** st4w {z0\.s - z3\.s}, p1, \[x0\] +** st3w {z4\.s - z6\.s}, p2, \[x0\] +** st1w z7\.s, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svint32x4_t z0, svint32x3_t z4, svint32x2_t stack, + svint32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_s32 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_s32 (pg, x0, -8), + svld3_vnum_s32 (pg, x0, -3), + svld2_vnum_s32 (pg, x0, 0), + svld1_vnum_s32 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s64.c new file mode 
100644 index 000000000..e60d049fb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s64.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] +** ld1d (z[0-9]+\.d), p4/z, \[x1\] +** st2d {\2 - \1}, p0, \[x0\] +** | +** ld1d (z[0-9]+\.d), p4/z, \[x1\] +** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] +** st2d {\3 - \4}, p0, \[x0\] +** ) +** st4d {z0\.d - z3\.d}, p1, \[x0\] +** st3d {z4\.d - z6\.d}, p2, \[x0\] +** st1d z7\.d, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svint64x4_t z0, svint64x3_t z4, svint64x2_t stack, + svint64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_s64 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_s64 (pg, x0, -8), + svld3_vnum_s64 (pg, x0, -3), + svld2_vnum_s64 (pg, x0, 0), + svld1_vnum_s64 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s8.c new file mode 100644 index 000000000..bc0058372 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_s8.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1b (z[0-9]+\.b), p4/z, \[x1, #1, mul vl\] +** ld1b (z[0-9]+\.b), p4/z, \[x1\] +** st2b {\2 - \1}, p0, \[x0\] +** | +** ld1b (z[0-9]+\.b), p4/z, \[x1\] +** ld1b (z[0-9]+\.b), p4/z, \[x1, #1, mul vl\] +** st2b {\3 - \4}, p0, \[x0\] +** ) +** st4b {z0\.b - z3\.b}, p1, \[x0\] +** st3b {z4\.b - z6\.b}, p2, \[x0\] +** st1b z7\.b, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svint8x4_t z0, svint8x3_t z4, svint8x2_t stack, + svint8_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_s8 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee 
(x0, + svld4_vnum_s8 (pg, x0, -8), + svld3_vnum_s8 (pg, x0, -3), + svld2_vnum_s8 (pg, x0, 0), + svld1_vnum_s8 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4b\t{z0\.b - z3\.b}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3b\t{z4\.b - z6\.b}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1b\tz7\.b, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{(z[0-9]+\.b) - z[0-9]+\.b}.*\tst1b\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+\.b - (z[0-9]+\.b)}.*\tst1b\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u16.c new file mode 100644 index 000000000..8aa651a41 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u16.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] +** ld1h (z[0-9]+\.h), p4/z, \[x1\] +** st2h {\2 - \1}, p0, \[x0\] +** | +** ld1h (z[0-9]+\.h), p4/z, \[x1\] +** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\] +** st2h {\3 - \4}, p0, \[x0\] +** ) +** st4h {z0\.h - z3\.h}, p1, \[x0\] +** st3h {z4\.h - z6\.h}, p2, \[x0\] +** st1h z7\.h, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svuint16x4_t z0, svuint16x3_t z4, svuint16x2_t stack, + svuint16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_u16 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_u16 (pg, x0, -8), + svld3_vnum_u16 (pg, x0, -3), + svld2_vnum_u16 (pg, x0, 0), + svld1_vnum_u16 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u32.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u32.c new file mode 100644 index 000000000..9ea3066ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u32.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] +** ld1w (z[0-9]+\.s), p4/z, \[x1\] +** st2w {\2 - \1}, p0, \[x0\] +** | +** ld1w (z[0-9]+\.s), p4/z, \[x1\] +** ld1w (z[0-9]+\.s), p4/z, \[x1, #1, mul vl\] +** st2w {\3 - \4}, p0, \[x0\] +** ) +** st4w {z0\.s - z3\.s}, p1, \[x0\] +** st3w {z4\.s - z6\.s}, p2, \[x0\] +** st1w z7\.s, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svuint32x4_t z0, svuint32x3_t z4, svuint32x2_t stack, + svuint32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_u32 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_u32 (pg, x0, -8), + svld3_vnum_u32 (pg, x0, -3), + svld2_vnum_u32 (pg, x0, 0), + svld1_vnum_u32 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u64.c new file mode 100644 index 000000000..b64f3b6d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u64.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] +** ld1d (z[0-9]+\.d), p4/z, \[x1\] +** st2d {\2 - \1}, p0, \[x0\] +** | +** ld1d (z[0-9]+\.d), p4/z, \[x1\] +** ld1d (z[0-9]+\.d), p4/z, \[x1, #1, mul vl\] +** st2d {\3 - \4}, p0, \[x0\] +** ) +** st4d {z0\.d - z3\.d}, p1, \[x0\] +** st3d {z4\.d - z6\.d}, p2, \[x0\] +** st1d z7\.d, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svuint64x4_t z0, svuint64x3_t z4, svuint64x2_t stack, + svuint64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_u64 (p3, x0, z7); +} + +void 
__attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_u64 (pg, x0, -8), + svld3_vnum_u64 (pg, x0, -3), + svld2_vnum_u64 (pg, x0, 0), + svld1_vnum_u64 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u8.c new file mode 100644 index 000000000..5575673ae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_be_u8.c @@ -0,0 +1,63 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p4\.b, all +** ( +** ld1b (z[0-9]+\.b), p4/z, \[x1, #1, mul vl\] +** ld1b (z[0-9]+\.b), p4/z, \[x1\] +** st2b {\2 - \1}, p0, \[x0\] +** | +** ld1b (z[0-9]+\.b), p4/z, \[x1\] +** ld1b (z[0-9]+\.b), p4/z, \[x1, #1, mul vl\] +** st2b {\3 - \4}, p0, \[x0\] +** ) +** st4b {z0\.b - z3\.b}, p1, \[x0\] +** st3b {z4\.b - z6\.b}, p2, \[x0\] +** st1b z7\.b, p3, \[x0\] +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svuint8x4_t z0, svuint8x3_t z4, svuint8x2_t stack, + svuint8_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_u8 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_u8 (pg, x0, -8), + svld3_vnum_u8 (pg, x0, -3), + svld2_vnum_u8 (pg, x0, 0), + svld1_vnum_u8 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4b\t{z0\.b - z3\.b}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3b\t{z4\.b - z6\.b}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1b\tz7\.b, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{(z[0-9]+\.b) - z[0-9]+\.b}.*\tst1b\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+\.b - (z[0-9]+\.b)}.*\tst1b\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_bf16.c new file mode 100644 index 000000000..94d84df4a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_bf16.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2h {\2\.h - \1\.h}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2h {\3\.h - \4\.h}, p0, \[x0\] +** ) +** st4h {z0\.h - z3\.h}, p1, \[x0\] +** st3h {z4\.h - z6\.h}, p2, \[x0\] +** st1h z7\.h, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svbfloat16x4_t z0, svbfloat16x3_t z4, svbfloat16x2_t stack, + svbfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_bf16 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_bf16 (pg, x0, -8), + svld3_vnum_bf16 (pg, x0, -3), + svld2_vnum_bf16 (pg, x0, 0), + svld1_vnum_bf16 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f16.c new file mode 100644 index 000000000..6271365c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f16.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2h {\2\.h - \1\.h}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2h {\3\.h - \4\.h}, p0, \[x0\] +** ) +** st4h {z0\.h - z3\.h}, p1, \[x0\] +** st3h {z4\.h - z6\.h}, p2, \[x0\] +** st1h z7\.h, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svfloat16x4_t z0, svfloat16x3_t z4, svfloat16x2_t stack, + svfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_f16 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_f16 (pg, x0, -8), + svld3_vnum_f16 (pg, x0, -3), + svld2_vnum_f16 (pg, x0, 0), + 
svld1_vnum_f16 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f32.c new file mode 100644 index 000000000..ef89de216 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f32.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2w {\2\.s - \1\.s}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2w {\3\.s - \4\.s}, p0, \[x0\] +** ) +** st4w {z0\.s - z3\.s}, p1, \[x0\] +** st3w {z4\.s - z6\.s}, p2, \[x0\] +** st1w z7\.s, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svfloat32x4_t z0, svfloat32x3_t z4, svfloat32x2_t stack, + svfloat32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_f32 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_f32 (pg, x0, -8), + svld3_vnum_f32 (pg, x0, -3), + svld2_vnum_f32 (pg, x0, 0), + svld1_vnum_f32 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f64.c new file mode 100644 index 000000000..4eddf2d1f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_f64.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian 
-fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2d {\2\.d - \1\.d}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2d {\3\.d - \4\.d}, p0, \[x0\] +** ) +** st4d {z0\.d - z3\.d}, p1, \[x0\] +** st3d {z4\.d - z6\.d}, p2, \[x0\] +** st1d z7\.d, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svfloat64x4_t z0, svfloat64x3_t z4, svfloat64x2_t stack, + svfloat64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_f64 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_f64 (pg, x0, -8), + svld3_vnum_f64 (pg, x0, -3), + svld2_vnum_f64 (pg, x0, 0), + svld1_vnum_f64 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s16.c new file mode 100644 index 000000000..a4b6af071 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s16.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2h {\2\.h - \1\.h}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2h {\3\.h - \4\.h}, p0, \[x0\] +** ) +** st4h {z0\.h - z3\.h}, p1, \[x0\] +** st3h {z4\.h - z6\.h}, p2, \[x0\] +** st1h z7\.h, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svint16x4_t z0, svint16x3_t z4, svint16x2_t stack, + svint16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_s16 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_s16 (pg, x0, -8), + svld3_vnum_s16 (pg, x0, -3), + svld2_vnum_s16 (pg, x0, 0), + svld1_vnum_s16 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { 
scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s32.c new file mode 100644 index 000000000..60b58d6fc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s32.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2w {\2\.s - \1\.s}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2w {\3\.s - \4\.s}, p0, \[x0\] +** ) +** st4w {z0\.s - z3\.s}, p1, \[x0\] +** st3w {z4\.s - z6\.s}, p2, \[x0\] +** st1w z7\.s, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svint32x4_t z0, svint32x3_t z4, svint32x2_t stack, + svint32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_s32 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_s32 (pg, x0, -8), + svld3_vnum_s32 (pg, x0, -3), + svld2_vnum_s32 (pg, x0, 0), + svld1_vnum_s32 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s64.c new file mode 100644 index 000000000..b6126aa4c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s64.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2d {\2\.d - \1\.d}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2d {\3\.d - \4\.d}, p0, \[x0\] +** ) +** st4d {z0\.d - 
z3\.d}, p1, \[x0\] +** st3d {z4\.d - z6\.d}, p2, \[x0\] +** st1d z7\.d, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svint64x4_t z0, svint64x3_t z4, svint64x2_t stack, + svint64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_s64 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_s64 (pg, x0, -8), + svld3_vnum_s64 (pg, x0, -3), + svld2_vnum_s64 (pg, x0, 0), + svld1_vnum_s64 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s8.c new file mode 100644 index 000000000..5c16c3c8f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_s8.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2b {\2\.b - \1\.b}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2b {\3\.b - \4\.b}, p0, \[x0\] +** ) +** st4b {z0\.b - z3\.b}, p1, \[x0\] +** st3b {z4\.b - z6\.b}, p2, \[x0\] +** st1b z7\.b, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svint8x4_t z0, svint8x3_t z4, svint8x2_t stack, + svint8_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_s8 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_s8 (pg, x0, -8), + svld3_vnum_s8 (pg, x0, -3), + svld2_vnum_s8 (pg, x0, 0), + svld1_vnum_s8 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4b\t{z0\.b - z3\.b}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3b\t{z4\.b - z6\.b}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1b\tz7\.b, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{(z[0-9]+)\.b - z[0-9]+\.b}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+\.b - (z[0-9]+)\.b}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { 
dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u16.c new file mode 100644 index 000000000..2b9a90025 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u16.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2h {\2\.h - \1\.h}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2h {\3\.h - \4\.h}, p0, \[x0\] +** ) +** st4h {z0\.h - z3\.h}, p1, \[x0\] +** st3h {z4\.h - z6\.h}, p2, \[x0\] +** st1h z7\.h, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svuint16x4_t z0, svuint16x3_t z4, svuint16x2_t stack, + svuint16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_u16 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_u16 (pg, x0, -8), + svld3_vnum_u16 (pg, x0, -3), + svld2_vnum_u16 (pg, x0, 0), + svld1_vnum_u16 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u32.c new file mode 100644 index 000000000..2902f59b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u32.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2w {\2\.s - \1\.s}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2w {\3\.s - \4\.s}, p0, \[x0\] +** ) +** st4w {z0\.s - z3\.s}, p1, \[x0\] +** st3w {z4\.s - z6\.s}, p2, \[x0\] +** st1w z7\.s, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svuint32x4_t z0, svuint32x3_t z4, svuint32x2_t stack, + svuint32_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_u32 (p3, x0, z7); +} + +void __attribute__((noipa)) 
+caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_u32 (pg, x0, -8), + svld3_vnum_u32 (pg, x0, -3), + svld2_vnum_u32 (pg, x0, 0), + svld1_vnum_u32 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4w\t{z0\.s - z3\.s}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z4\.s - z6\.s}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\tz7\.s, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u64.c new file mode 100644 index 000000000..85b3cfdad --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u64.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2d {\2\.d - \1\.d}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2d {\3\.d - \4\.d}, p0, \[x0\] +** ) +** st4d {z0\.d - z3\.d}, p1, \[x0\] +** st3d {z4\.d - z6\.d}, p2, \[x0\] +** st1d z7\.d, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svuint64x4_t z0, svuint64x3_t z4, svuint64x2_t stack, + svuint64_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_u64 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_u64 (pg, x0, -8), + svld3_vnum_u64 (pg, x0, -3), + svld2_vnum_u64 (pg, x0, 0), + svld1_vnum_u64 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4d\t{z0\.d - z3\.d}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z4\.d - z6\.d}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\tz7\.d, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u8.c new file mode 100644 index 000000000..f56acb693 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_5_le_u8.c @@ -0,0 +1,58 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee: +** ( +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** ldr (z[0-9]+), \[x1\] +** st2b {\2\.b - \1\.b}, p0, \[x0\] +** | +** ldr (z[0-9]+), \[x1\] +** ldr (z[0-9]+), \[x1, #1, mul vl\] +** st2b {\3\.b - \4\.b}, p0, \[x0\] +** ) +** st4b {z0\.b - z3\.b}, p1, \[x0\] +** st3b {z4\.b - z6\.b}, p2, \[x0\] +** st1b z7\.b, p3, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee (void *x0, svuint8x4_t z0, svuint8x3_t z4, svuint8x2_t stack, + svuint8_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + svst2 (p0, x0, stack); + svst4 (p1, x0, z0); + svst3 (p2, x0, z4); + svst1_u8 (p3, x0, z7); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee (x0, + svld4_vnum_u8 (pg, x0, -8), + svld3_vnum_u8 (pg, x0, -3), + svld2_vnum_u8 (pg, x0, 0), + svld1_vnum_u8 (pg, x0, 2), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3), + svptrue_pat_b64 (SV_VL4)); +} + +/* { dg-final { scan-assembler {\tld4b\t{z0\.b - z3\.b}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3b\t{z4\.b - z6\.b}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1b\tz7\.b, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{(z[0-9]+)\.b - z[0-9]+\.b}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+\.b - (z[0-9]+)\.b}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_bf16.c new file mode 100644 index 000000000..84d2c406c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_bf16.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4h {z[0-9]+\.h - \1}, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z5\.h - z7\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5, + svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_bf16 (p0, x0, stack1); + svst2_bf16 (p1, x0, z3); + svst3_bf16 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1h (z[0-9]+\.h), p3/z, \[x2\] +** st1h \1, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z0\.h - z2\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5, + svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_bf16 (p0, x0, stack2); + svst2_bf16 (p1, x0, z3); + svst3_bf16 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_bf16 (pg, x0, -9), + svld2_vnum_bf16 (pg, x0, -2), + svld3_vnum_bf16 (pg, x0, 0), + svld4_vnum_bf16 (pg, x0, 8), + svld1_vnum_bf16 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f16.c new file mode 100644 index 000000000..dd4ccc3b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f16.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4h {z[0-9]+\.h - \1}, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z5\.h - z7\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svfloat16x3_t z0, svfloat16x2_t z3, svfloat16x3_t z5, + svfloat16x4_t stack1, svfloat16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_f16 (p0, x0, stack1); + svst2_f16 (p1, x0, z3); + svst3_f16 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1h (z[0-9]+\.h), p3/z, \[x2\] +** st1h \1, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z0\.h - z2\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svfloat16x3_t z0, svfloat16x2_t z3, svfloat16x3_t z5, + svfloat16x4_t stack1, svfloat16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_f16 (p0, x0, stack2); + svst2_f16 (p1, x0, z3); + svst3_f16 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_f16 (pg, x0, -9), + svld2_vnum_f16 (pg, x0, -2), + svld3_vnum_f16 (pg, x0, 0), + svld4_vnum_f16 (pg, x0, 8), + svld1_vnum_f16 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f32.c new file mode 100644 index 000000000..26ea2a308 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f32.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1w (z[0-9]+\.s), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4w {z[0-9]+\.s - \1}, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z5\.s - z7\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svfloat32x3_t z0, svfloat32x2_t z3, svfloat32x3_t z5, + svfloat32x4_t stack1, svfloat32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_f32 (p0, x0, stack1); + svst2_f32 (p1, x0, z3); + svst3_f32 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1w (z[0-9]+\.s), p3/z, \[x2\] +** st1w \1, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z0\.s - z2\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svfloat32x3_t z0, svfloat32x2_t z3, svfloat32x3_t z5, + svfloat32x4_t stack1, svfloat32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_f32 (p0, x0, stack2); + svst2_f32 (p1, x0, z3); + svst3_f32 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_f32 (pg, x0, -9), + svld2_vnum_f32 (pg, x0, -2), + svld3_vnum_f32 (pg, x0, 0), + svld4_vnum_f32 (pg, x0, 8), + svld1_vnum_f32 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f64.c new file mode 100644 index 000000000..62aded51c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_f64.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1d (z[0-9]+\.d), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4d {z[0-9]+\.d - \1}, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z5\.d - z7\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svfloat64x3_t z0, svfloat64x2_t z3, svfloat64x3_t z5, + svfloat64x4_t stack1, svfloat64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_f64 (p0, x0, stack1); + svst2_f64 (p1, x0, z3); + svst3_f64 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1d (z[0-9]+\.d), p3/z, \[x2\] +** st1d \1, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z0\.d - z2\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svfloat64x3_t z0, svfloat64x2_t z3, svfloat64x3_t z5, + svfloat64x4_t stack1, svfloat64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_f64 (p0, x0, stack2); + svst2_f64 (p1, x0, z3); + svst3_f64 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_f64 (pg, x0, -9), + svld2_vnum_f64 (pg, x0, -2), + svld3_vnum_f64 (pg, x0, 0), + svld4_vnum_f64 (pg, x0, 8), + svld1_vnum_f64 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s16.c new file mode 100644 index 000000000..204ef9a92 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s16.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4h {z[0-9]+\.h - \1}, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z5\.h - z7\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svint16x3_t z0, svint16x2_t z3, svint16x3_t z5, + svint16x4_t stack1, svint16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_s16 (p0, x0, stack1); + svst2_s16 (p1, x0, z3); + svst3_s16 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1h (z[0-9]+\.h), p3/z, \[x2\] +** st1h \1, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z0\.h - z2\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svint16x3_t z0, svint16x2_t z3, svint16x3_t z5, + svint16x4_t stack1, svint16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_s16 (p0, x0, stack2); + svst2_s16 (p1, x0, z3); + svst3_s16 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_s16 (pg, x0, -9), + svld2_vnum_s16 (pg, x0, -2), + svld3_vnum_s16 (pg, x0, 0), + svld4_vnum_s16 (pg, x0, 8), + svld1_vnum_s16 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s32.c new file mode 100644 index 000000000..9ae4567a4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s32.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1w (z[0-9]+\.s), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4w {z[0-9]+\.s - \1\}, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z5\.s - z7\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svint32x3_t z0, svint32x2_t z3, svint32x3_t z5, + svint32x4_t stack1, svint32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_s32 (p0, x0, stack1); + svst2_s32 (p1, x0, z3); + svst3_s32 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1w (z[0-9]+\.s), p3/z, \[x2\] +** st1w \1, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z0\.s - z2\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svint32x3_t z0, svint32x2_t z3, svint32x3_t z5, + svint32x4_t stack1, svint32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_s32 (p0, x0, stack2); + svst2_s32 (p1, x0, z3); + svst3_s32 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_s32 (pg, x0, -9), + svld2_vnum_s32 (pg, x0, -2), + svld3_vnum_s32 (pg, x0, 0), + svld4_vnum_s32 (pg, x0, 8), + svld1_vnum_s32 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s64.c new file mode 100644 index 000000000..0b8a2e213 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s64.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1d (z[0-9]+\.d), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4d {z[0-9]+\.d - \1}, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z5\.d - z7\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svint64x3_t z0, svint64x2_t z3, svint64x3_t z5, + svint64x4_t stack1, svint64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_s64 (p0, x0, stack1); + svst2_s64 (p1, x0, z3); + svst3_s64 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1d (z[0-9]+\.d), p3/z, \[x2\] +** st1d \1, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z0\.d - z2\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svint64x3_t z0, svint64x2_t z3, svint64x3_t z5, + svint64x4_t stack1, svint64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_s64 (p0, x0, stack2); + svst2_s64 (p1, x0, z3); + svst3_s64 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_s64 (pg, x0, -9), + svld2_vnum_s64 (pg, x0, -2), + svld3_vnum_s64 (pg, x0, 0), + svld4_vnum_s64 (pg, x0, 8), + svld1_vnum_s64 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s8.c new file mode 100644 index 000000000..0afbe71aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_s8.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1b (z[0-9]+\.b), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4b {z[0-9]+\.b - \1}, p0, \[x0\] +** st2b {z3\.b - z4\.b}, p1, \[x0\] +** st3b {z5\.b - z7\.b}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svint8x3_t z0, svint8x2_t z3, svint8x3_t z5, + svint8x4_t stack1, svint8_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_s8 (p0, x0, stack1); + svst2_s8 (p1, x0, z3); + svst3_s8 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1b (z[0-9]+\.b), p3/z, \[x2\] +** st1b \1, p0, \[x0\] +** st2b {z3\.b - z4\.b}, p1, \[x0\] +** st3b {z0\.b - z2\.b}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svint8x3_t z0, svint8x2_t z3, svint8x3_t z5, + svint8x4_t stack1, svint8_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_s8 (p0, x0, stack2); + svst2_s8 (p1, x0, z3); + svst3_s8 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_s8 (pg, x0, -9), + svld2_vnum_s8 (pg, x0, -2), + svld3_vnum_s8 (pg, x0, 0), + svld4_vnum_s8 (pg, x0, 8), + svld1_vnum_s8 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3b\t{z0\.b - z2\.b}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z3\.b - z4\.b}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3b\t{z5\.b - z7\.b}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{(z[0-9]+\.b) - z[0-9]+\.b}.*\tst1b\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+\.b - (z[0-9]+\.b)}.*\tst1b\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1b\t(z[0-9]+\.b), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1b\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u16.c new file mode 100644 index 000000000..f010f5ebb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u16.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4h {z[0-9]+\.h - \1}, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z5\.h - z7\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svuint16x3_t z0, svuint16x2_t z3, svuint16x3_t z5, + svuint16x4_t stack1, svuint16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_u16 (p0, x0, stack1); + svst2_u16 (p1, x0, z3); + svst3_u16 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1h (z[0-9]+\.h), p3/z, \[x2\] +** st1h \1, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z0\.h - z2\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svuint16x3_t z0, svuint16x2_t z3, svuint16x3_t z5, + svuint16x4_t stack1, svuint16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_u16 (p0, x0, stack2); + svst2_u16 (p1, x0, z3); + svst3_u16 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_u16 (pg, x0, -9), + svld2_vnum_u16 (pg, x0, -2), + svld3_vnum_u16 (pg, x0, 0), + svld4_vnum_u16 (pg, x0, 8), + svld1_vnum_u16 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u32.c new file mode 100644 index 000000000..60d903a31 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u32.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1w (z[0-9]+\.s), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4w {z[0-9]+\.s - \1}, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z5\.s - z7\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svuint32x3_t z0, svuint32x2_t z3, svuint32x3_t z5, + svuint32x4_t stack1, svuint32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_u32 (p0, x0, stack1); + svst2_u32 (p1, x0, z3); + svst3_u32 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1w (z[0-9]+\.s), p3/z, \[x2\] +** st1w \1, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z0\.s - z2\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svuint32x3_t z0, svuint32x2_t z3, svuint32x3_t z5, + svuint32x4_t stack1, svuint32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_u32 (p0, x0, stack2); + svst2_u32 (p1, x0, z3); + svst3_u32 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_u32 (pg, x0, -9), + svld2_vnum_u32 (pg, x0, -2), + svld3_vnum_u32 (pg, x0, 0), + svld4_vnum_u32 (pg, x0, 8), + svld1_vnum_u32 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+\.s) - z[0-9]+\.s}.*\tst1w\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+\.s)}.*\tst1w\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u64.c new file mode 100644 index 000000000..948f426f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u64.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1d (z[0-9]+\.d), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4d {z[0-9]+\.d - \1}, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z5\.d - z7\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svuint64x3_t z0, svuint64x2_t z3, svuint64x3_t z5, + svuint64x4_t stack1, svuint64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_u64 (p0, x0, stack1); + svst2_u64 (p1, x0, z3); + svst3_u64 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1d (z[0-9]+\.d), p3/z, \[x2\] +** st1d \1, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z0\.d - z2\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svuint64x3_t z0, svuint64x2_t z3, svuint64x3_t z5, + svuint64x4_t stack1, svuint64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_u64 (p0, x0, stack2); + svst2_u64 (p1, x0, z3); + svst3_u64 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_u64 (pg, x0, -9), + svld2_vnum_u64 (pg, x0, -2), + svld3_vnum_u64 (pg, x0, 0), + svld4_vnum_u64 (pg, x0, 8), + svld1_vnum_u64 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+\.d) - z[0-9]+\.d}.*\tst1d\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+\.d)}.*\tst1d\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u8.c new file mode 100644 index 000000000..8049ec078 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_be_u8.c @@ -0,0 +1,71 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ptrue p3\.b, all +** ... +** ld1b (z[0-9]+\.b), p3/z, \[x1, #3, mul vl\] +** ... 
+** st4b {z[0-9]+\.b - \1}, p0, \[x0\] +** st2b {z3\.b - z4\.b}, p1, \[x0\] +** st3b {z5\.b - z7\.b}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svuint8x3_t z0, svuint8x2_t z3, svuint8x3_t z5, + svuint8x4_t stack1, svuint8_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_u8 (p0, x0, stack1); + svst2_u8 (p1, x0, z3); + svst3_u8 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1b (z[0-9]+\.b), p3/z, \[x2\] +** st1b \1, p0, \[x0\] +** st2b {z3\.b - z4\.b}, p1, \[x0\] +** st3b {z0\.b - z2\.b}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svuint8x3_t z0, svuint8x2_t z3, svuint8x3_t z5, + svuint8x4_t stack1, svuint8_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_u8 (p0, x0, stack2); + svst2_u8 (p1, x0, z3); + svst3_u8 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_u8 (pg, x0, -9), + svld2_vnum_u8 (pg, x0, -2), + svld3_vnum_u8 (pg, x0, 0), + svld4_vnum_u8 (pg, x0, 8), + svld1_vnum_u8 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3b\t{z0\.b - z2\.b}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z3\.b - z4\.b}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3b\t{z5\.b - z7\.b}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{(z[0-9]+\.b) - z[0-9]+\.b}.*\tst1b\t\1, p[0-7], \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+\.b - (z[0-9]+\.b)}.*\tst1b\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1b\t(z[0-9]+\.b), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1b\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_bf16.c new file mode 100644 index 000000000..3dc9e42ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_bf16.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z5\.h - z7\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5, + svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_bf16 (p0, x0, stack1); + svst2_bf16 (p1, x0, z3); + svst3_bf16 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1h (z[0-9]+\.h), p3/z, \[x2\] +** st1h \1, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z0\.h - z2\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5, + svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_bf16 (p0, x0, stack2); + svst2_bf16 (p1, x0, z3); + svst3_bf16 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_bf16 (pg, x0, -9), + svld2_vnum_bf16 (pg, x0, -2), + svld3_vnum_bf16 (pg, x0, 0), + svld4_vnum_bf16 (pg, x0, 8), + svld1_vnum_bf16 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f16.c new file mode 100644 index 000000000..80a2e3aae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f16.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z5\.h - z7\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svfloat16x3_t z0, svfloat16x2_t z3, svfloat16x3_t z5, + svfloat16x4_t stack1, svfloat16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_f16 (p0, x0, stack1); + svst2_f16 (p1, x0, z3); + svst3_f16 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1h (z[0-9]+\.h), p3/z, \[x2\] +** st1h \1, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z0\.h - z2\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svfloat16x3_t z0, svfloat16x2_t z3, svfloat16x3_t z5, + svfloat16x4_t stack1, svfloat16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_f16 (p0, x0, stack2); + svst2_f16 (p1, x0, z3); + svst3_f16 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_f16 (pg, x0, -9), + svld2_vnum_f16 (pg, x0, -2), + svld3_vnum_f16 (pg, x0, 0), + svld4_vnum_f16 (pg, x0, 8), + svld1_vnum_f16 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f32.c new file mode 100644 index 000000000..40ff42128 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f32.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4w {z[0-9]+\.s - \1\.s}, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z5\.s - z7\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svfloat32x3_t z0, svfloat32x2_t z3, svfloat32x3_t z5, + svfloat32x4_t stack1, svfloat32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_f32 (p0, x0, stack1); + svst2_f32 (p1, x0, z3); + svst3_f32 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1w (z[0-9]+\.s), p3/z, \[x2\] +** st1w \1, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z0\.s - z2\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svfloat32x3_t z0, svfloat32x2_t z3, svfloat32x3_t z5, + svfloat32x4_t stack1, svfloat32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_f32 (p0, x0, stack2); + svst2_f32 (p1, x0, z3); + svst3_f32 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_f32 (pg, x0, -9), + svld2_vnum_f32 (pg, x0, -2), + svld3_vnum_f32 (pg, x0, 0), + svld4_vnum_f32 (pg, x0, 8), + svld1_vnum_f32 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f64.c new file mode 100644 index 000000000..ee219ccdc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_f64.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4d {z[0-9]+\.d - \1\.d}, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z5\.d - z7\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svfloat64x3_t z0, svfloat64x2_t z3, svfloat64x3_t z5, + svfloat64x4_t stack1, svfloat64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_f64 (p0, x0, stack1); + svst2_f64 (p1, x0, z3); + svst3_f64 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1d (z[0-9]+\.d), p3/z, \[x2\] +** st1d \1, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z0\.d - z2\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svfloat64x3_t z0, svfloat64x2_t z3, svfloat64x3_t z5, + svfloat64x4_t stack1, svfloat64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_f64 (p0, x0, stack2); + svst2_f64 (p1, x0, z3); + svst3_f64 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_f64 (pg, x0, -9), + svld2_vnum_f64 (pg, x0, -2), + svld3_vnum_f64 (pg, x0, 0), + svld4_vnum_f64 (pg, x0, 8), + svld1_vnum_f64 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s16.c new file mode 100644 index 000000000..ade75cb34 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s16.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z5\.h - z7\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svint16x3_t z0, svint16x2_t z3, svint16x3_t z5, + svint16x4_t stack1, svint16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_s16 (p0, x0, stack1); + svst2_s16 (p1, x0, z3); + svst3_s16 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1h (z[0-9]+\.h), p3/z, \[x2\] +** st1h \1, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z0\.h - z2\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svint16x3_t z0, svint16x2_t z3, svint16x3_t z5, + svint16x4_t stack1, svint16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_s16 (p0, x0, stack2); + svst2_s16 (p1, x0, z3); + svst3_s16 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_s16 (pg, x0, -9), + svld2_vnum_s16 (pg, x0, -2), + svld3_vnum_s16 (pg, x0, 0), + svld4_vnum_s16 (pg, x0, 8), + svld1_vnum_s16 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s32.c new file mode 100644 index 000000000..a6c06e235 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s32.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4w {z[0-9]+\.s - \1\.s}, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z5\.s - z7\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svint32x3_t z0, svint32x2_t z3, svint32x3_t z5, + svint32x4_t stack1, svint32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_s32 (p0, x0, stack1); + svst2_s32 (p1, x0, z3); + svst3_s32 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1w (z[0-9]+\.s), p3/z, \[x2\] +** st1w \1, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z0\.s - z2\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svint32x3_t z0, svint32x2_t z3, svint32x3_t z5, + svint32x4_t stack1, svint32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_s32 (p0, x0, stack2); + svst2_s32 (p1, x0, z3); + svst3_s32 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_s32 (pg, x0, -9), + svld2_vnum_s32 (pg, x0, -2), + svld3_vnum_s32 (pg, x0, 0), + svld4_vnum_s32 (pg, x0, 8), + svld1_vnum_s32 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s64.c new file mode 100644 index 000000000..219c71d82 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s64.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4d {z[0-9]+\.d - \1\.d}, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z5\.d - z7\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svint64x3_t z0, svint64x2_t z3, svint64x3_t z5, + svint64x4_t stack1, svint64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_s64 (p0, x0, stack1); + svst2_s64 (p1, x0, z3); + svst3_s64 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1d (z[0-9]+\.d), p3/z, \[x2\] +** st1d \1, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z0\.d - z2\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svint64x3_t z0, svint64x2_t z3, svint64x3_t z5, + svint64x4_t stack1, svint64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_s64 (p0, x0, stack2); + svst2_s64 (p1, x0, z3); + svst3_s64 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_s64 (pg, x0, -9), + svld2_vnum_s64 (pg, x0, -2), + svld3_vnum_s64 (pg, x0, 0), + svld4_vnum_s64 (pg, x0, 8), + svld1_vnum_s64 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s8.c new file mode 100644 index 000000000..c48d391ca --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_s8.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4b {z[0-9]+\.b - \1\.b}, p0, \[x0\] +** st2b {z3\.b - z4\.b}, p1, \[x0\] +** st3b {z5\.b - z7\.b}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svint8x3_t z0, svint8x2_t z3, svint8x3_t z5, + svint8x4_t stack1, svint8_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_s8 (p0, x0, stack1); + svst2_s8 (p1, x0, z3); + svst3_s8 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1b (z[0-9]+\.b), p3/z, \[x2\] +** st1b \1, p0, \[x0\] +** st2b {z3\.b - z4\.b}, p1, \[x0\] +** st3b {z0\.b - z2\.b}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svint8x3_t z0, svint8x2_t z3, svint8x3_t z5, + svint8x4_t stack1, svint8_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_s8 (p0, x0, stack2); + svst2_s8 (p1, x0, z3); + svst3_s8 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_s8 (pg, x0, -9), + svld2_vnum_s8 (pg, x0, -2), + svld3_vnum_s8 (pg, x0, 0), + svld4_vnum_s8 (pg, x0, 8), + svld1_vnum_s8 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3b\t{z0\.b - z2\.b}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z3\.b - z4\.b}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3b\t{z5\.b - z7\.b}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{(z[0-9]+)\.b - z[0-9]+\.b}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+\.b - (z[0-9]+)\.b}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1b\t(z[0-9]+\.b), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1b\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u16.c new file mode 100644 index 000000000..6c635fd94 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u16.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z5\.h - z7\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svuint16x3_t z0, svuint16x2_t z3, svuint16x3_t z5, + svuint16x4_t stack1, svuint16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_u16 (p0, x0, stack1); + svst2_u16 (p1, x0, z3); + svst3_u16 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1h (z[0-9]+\.h), p3/z, \[x2\] +** st1h \1, p0, \[x0\] +** st2h {z3\.h - z4\.h}, p1, \[x0\] +** st3h {z0\.h - z2\.h}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svuint16x3_t z0, svuint16x2_t z3, svuint16x3_t z5, + svuint16x4_t stack1, svuint16_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_u16 (p0, x0, stack2); + svst2_u16 (p1, x0, z3); + svst3_u16 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_u16 (pg, x0, -9), + svld2_vnum_u16 (pg, x0, -2), + svld3_vnum_u16 (pg, x0, 0), + svld4_vnum_u16 (pg, x0, 8), + svld1_vnum_u16 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u32.c new file mode 100644 index 000000000..c31d45426 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u32.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4w {z[0-9]+\.s - \1\.s}, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z5\.s - z7\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svuint32x3_t z0, svuint32x2_t z3, svuint32x3_t z5, + svuint32x4_t stack1, svuint32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_u32 (p0, x0, stack1); + svst2_u32 (p1, x0, z3); + svst3_u32 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1w (z[0-9]+\.s), p3/z, \[x2\] +** st1w \1, p0, \[x0\] +** st2w {z3\.s - z4\.s}, p1, \[x0\] +** st3w {z0\.s - z2\.s}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svuint32x3_t z0, svuint32x2_t z3, svuint32x3_t z5, + svuint32x4_t stack1, svuint32_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_u32 (p0, x0, stack2); + svst2_u32 (p1, x0, z3); + svst3_u32 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_u32 (pg, x0, -9), + svld2_vnum_u32 (pg, x0, -2), + svld3_vnum_u32 (pg, x0, 0), + svld4_vnum_u32 (pg, x0, 8), + svld1_vnum_u32 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3w\t{z0\.s - z2\.s}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2w\t{z3\.s - z4\.s}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3w\t{z5\.s - z7\.s}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{(z[0-9]+)\.s - z[0-9]+\.s}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4w\t{z[0-9]+\.s - (z[0-9]+)\.s}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1w\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u64.c new file mode 100644 index 000000000..969b258b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u64.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4d {z[0-9]+\.d - \1\.d}, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z5\.d - z7\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svuint64x3_t z0, svuint64x2_t z3, svuint64x3_t z5, + svuint64x4_t stack1, svuint64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_u64 (p0, x0, stack1); + svst2_u64 (p1, x0, z3); + svst3_u64 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1d (z[0-9]+\.d), p3/z, \[x2\] +** st1d \1, p0, \[x0\] +** st2d {z3\.d - z4\.d}, p1, \[x0\] +** st3d {z0\.d - z2\.d}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svuint64x3_t z0, svuint64x2_t z3, svuint64x3_t z5, + svuint64x4_t stack1, svuint64_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_u64 (p0, x0, stack2); + svst2_u64 (p1, x0, z3); + svst3_u64 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_u64 (pg, x0, -9), + svld2_vnum_u64 (pg, x0, -2), + svld3_vnum_u64 (pg, x0, 0), + svld4_vnum_u64 (pg, x0, 8), + svld1_vnum_u64 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3d\t{z0\.d - z2\.d}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2d\t{z3\.d - z4\.d}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3d\t{z5\.d - z7\.d}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{(z[0-9]+)\.d - z[0-9]+\.d}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4d\t{z[0-9]+\.d - (z[0-9]+)\.d}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1d\t(z[0-9]+\.d), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1d\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u8.c new file mode 100644 index 000000000..d18604784 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_6_le_u8.c @@ -0,0 +1,70 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** callee1: +** ... +** ldr (z[0-9]+), \[x1, #3, mul vl\] +** ... 
+** st4b {z[0-9]+\.b - \1\.b}, p0, \[x0\] +** st2b {z3\.b - z4\.b}, p1, \[x0\] +** st3b {z5\.b - z7\.b}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee1 (void *x0, svuint8x3_t z0, svuint8x2_t z3, svuint8x3_t z5, + svuint8x4_t stack1, svuint8_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst4_u8 (p0, x0, stack1); + svst2_u8 (p1, x0, z3); + svst3_u8 (p2, x0, z5); +} + +/* +** callee2: +** ptrue p3\.b, all +** ld1b (z[0-9]+\.b), p3/z, \[x2\] +** st1b \1, p0, \[x0\] +** st2b {z3\.b - z4\.b}, p1, \[x0\] +** st3b {z0\.b - z2\.b}, p2, \[x0\] +** ret +*/ +void __attribute__((noipa)) +callee2 (void *x0, svuint8x3_t z0, svuint8x2_t z3, svuint8x3_t z5, + svuint8x4_t stack1, svuint8_t stack2, svbool_t p0, + svbool_t p1, svbool_t p2) +{ + svst1_u8 (p0, x0, stack2); + svst2_u8 (p1, x0, z3); + svst3_u8 (p2, x0, z0); +} + +void __attribute__((noipa)) +caller (void *x0) +{ + svbool_t pg; + pg = svptrue_b8 (); + callee1 (x0, + svld3_vnum_u8 (pg, x0, -9), + svld2_vnum_u8 (pg, x0, -2), + svld3_vnum_u8 (pg, x0, 0), + svld4_vnum_u8 (pg, x0, 8), + svld1_vnum_u8 (pg, x0, 5), + svptrue_pat_b8 (SV_VL1), + svptrue_pat_b16 (SV_VL2), + svptrue_pat_b32 (SV_VL3)); +} + +/* { dg-final { scan-assembler {\tld3b\t{z0\.b - z2\.b}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z3\.b - z4\.b}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld3b\t{z5\.b - z7\.b}, p[0-7]/z, \[x0\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{(z[0-9]+)\.b - z[0-9]+\.b}.*\tstr\t\1, \[x1\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+\.b - (z[0-9]+)\.b}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */ +/* { dg-final { scan-assembler {\tld1b\t(z[0-9]+\.b), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1b\t\1, p[0-7], \[x2\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */ +/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_7.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_7.c new file mode 100644 index 000000000..15c022486 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_7.c @@ -0,0 +1,30 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +** callee: +** ... +** ldr (x[0-9]+), \[sp\] +** ... 
+** ld1b (z[0-9]+\.b), p[1-3]/z, \[\1\] +** st1b \2, p0, \[x0, x7\] +** ret +*/ +void __attribute__((noipa)) +callee (int8_t *x0, int x1, int x2, int x3, + int x4, int x5, svbool_t p0, int x6, int64_t x7, + svint32x4_t z0, svint32x4_t z4, svint8_t stack) +{ + svst1 (p0, x0 + x7, stack); +} + +void __attribute__((noipa)) +caller (int8_t *x0, svbool_t p0, svint32x4_t z0, svint32x4_t z4) +{ + callee (x0, 1, 2, 3, 4, 5, p0, 6, 7, z0, z4, svdup_s8 (42)); +} + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.b), #42\n.*\tst1b\t\1, p[0-7], \[(x[0-9]+)\]\n.*\tstr\t\2, \[sp\]\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_8.c new file mode 100644 index 000000000..93ace26f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_8.c @@ -0,0 +1,28 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +** callee: +** ptrue (p[1-3])\.b, all +** ld1b (z[0-9]+\.b), \1/z, \[x4\] +** st1b \2, p0, \[x0, x7\] +** ret +*/ +void __attribute__((noipa)) +callee (int8_t *x0, int x1, int x2, int x3, + svint32x4_t z0, svint32x4_t z4, svint8_t stack, + int x5, svbool_t p0, int x6, int64_t x7) +{ + svst1 (p0, x0 + x7, stack); +} + +void __attribute__((noipa)) +caller (int8_t *x0, svbool_t p0, svint32x4_t z0, svint32x4_t z4) +{ + callee (x0, 1, 2, 3, z0, z4, svdup_s8 (42), 5, p0, 6, 7); +} + +/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.b), #42\n.*\tst1b\t\1, p[0-7], \[x4\]\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c new file mode 100644 index 000000000..ad9affadf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_9.c @@ -0,0 +1,49 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +** callee: +** ldr (x[0-9]+), \[sp, 8\] +** ldr p0, \[\1\] +** ret +*/ +svbool_t __attribute__((noipa)) +callee (svint64x4_t z0, svint16x4_t z4, + svint64_t stack1, svint32_t stack2, + svint16_t stack3, svint8_t stack4, + svuint64_t stack5, svuint32_t stack6, + svuint16_t stack7, svuint8_t stack8, + svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3, + svbool_t stack9, svbool_t stack10) +{ + return stack10; +} + +uint64_t __attribute__((noipa)) +caller (int64_t *x0, int16_t *x1, svbool_t p0) +{ + svbool_t res; + res = callee (svld4 (p0, x0), + svld4 (p0, x1), + svdup_s64 (1), + svdup_s32 (2), + svdup_s16 (3), + svdup_s8 (4), + svdup_u64 (5), + svdup_u32 (6), + svdup_u16 (7), + svdup_u8 (8), + svptrue_pat_b8 (SV_VL5), + svptrue_pat_b16 (SV_VL6), + svptrue_pat_b32 (SV_VL7), + svptrue_pat_b64 (SV_VL8), + svptrue_pat_b8 (SV_MUL3), + svptrue_pat_b16 (SV_MUL3)); + return svcntp_b8 (res, res); +} + +/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.b, mul3\n\tstr\t\1, \[(x[0-9]+)\]\n.*\tstr\t\2, \[sp\]\n} } } */ +/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.h, mul3\n\tstr\t\1, \[(x[0-9]+)\]\n.*\tstr\t\2, \[sp, 8\]\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_1.c new file mode 100644 index 000000000..e5fceb14b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_1.c @@ -0,0 +1,107 @@ +/* { dg-options "-O -msve-vector-bits=256 -fomit-frame-pointer" } */ + +#include + +typedef bfloat16_t bfloat16x16_t __attribute__((vector_size (32))); +typedef float16_t float16x16_t 
__attribute__((vector_size (32))); +typedef float32_t float32x8_t __attribute__((vector_size (32))); +typedef float64_t float64x4_t __attribute__((vector_size (32))); +typedef int8_t int8x32_t __attribute__((vector_size (32))); +typedef int16_t int16x16_t __attribute__((vector_size (32))); +typedef int32_t int32x8_t __attribute__((vector_size (32))); +typedef int64_t int64x4_t __attribute__((vector_size (32))); +typedef uint8_t uint8x32_t __attribute__((vector_size (32))); +typedef uint16_t uint16x16_t __attribute__((vector_size (32))); +typedef uint32_t uint32x8_t __attribute__((vector_size (32))); +typedef uint64_t uint64x4_t __attribute__((vector_size (32))); + +void bfloat16_callee (bfloat16x16_t); +void float16_callee (float16x16_t); +void float32_callee (float32x8_t); +void float64_callee (float64x4_t); +void int8_callee (int8x32_t); +void int16_callee (int16x16_t); +void int32_callee (int32x8_t); +void int64_callee (int64x4_t); +void uint8_callee (uint8x32_t); +void uint16_callee (uint16x16_t); +void uint32_callee (uint32x8_t); +void uint64_callee (uint64x4_t); + +void +bfloat16_caller (bfloat16_t val) +{ + bfloat16_callee (svdup_bf16 (val)); +} + +void +float16_caller (void) +{ + float16_callee (svdup_f16 (1.0)); +} + +void +float32_caller (void) +{ + float32_callee (svdup_f32 (2.0)); +} + +void +float64_caller (void) +{ + float64_callee (svdup_f64 (3.0)); +} + +void +int8_caller (void) +{ + int8_callee (svindex_s8 (0, 1)); +} + +void +int16_caller (void) +{ + int16_callee (svindex_s16 (0, 2)); +} + +void +int32_caller (void) +{ + int32_callee (svindex_s32 (0, 3)); +} + +void +int64_caller (void) +{ + int64_callee (svindex_s64 (0, 4)); +} + +void +uint8_caller (void) +{ + uint8_callee (svindex_u8 (1, 1)); +} + +void +uint16_caller (void) +{ + uint16_callee (svindex_u16 (1, 2)); +} + +void +uint32_caller (void) +{ + uint32_callee (svindex_u32 (1, 3)); +} + +void +uint64_caller (void) +{ + uint64_callee (svindex_u64 (1, 4)); +} + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0\]} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x0\]} 4 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x0\]} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x0\]} 3 } } */ +/* { dg-final { scan-assembler-times {\tadd\tx0, sp, #?16\n} 12 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_2.c new file mode 100644 index 000000000..875567f01 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/gnu_vectors_2.c @@ -0,0 +1,107 @@ +/* { dg-options "-O -msve-vector-bits=256 -fomit-frame-pointer" } */ + +#include + +typedef bfloat16_t bfloat16x16_t __attribute__((vector_size (32))); +typedef float16_t float16x16_t __attribute__((vector_size (32))); +typedef float32_t float32x8_t __attribute__((vector_size (32))); +typedef float64_t float64x4_t __attribute__((vector_size (32))); +typedef int8_t int8x32_t __attribute__((vector_size (32))); +typedef int16_t int16x16_t __attribute__((vector_size (32))); +typedef int32_t int32x8_t __attribute__((vector_size (32))); +typedef int64_t int64x4_t __attribute__((vector_size (32))); +typedef uint8_t uint8x32_t __attribute__((vector_size (32))); +typedef uint16_t uint16x16_t __attribute__((vector_size (32))); +typedef uint32_t uint32x8_t __attribute__((vector_size (32))); +typedef uint64_t uint64x4_t __attribute__((vector_size (32))); + +void bfloat16_callee (svbfloat16_t); +void 
float16_callee (svfloat16_t); +void float32_callee (svfloat32_t); +void float64_callee (svfloat64_t); +void int8_callee (svint8_t); +void int16_callee (svint16_t); +void int32_callee (svint32_t); +void int64_callee (svint64_t); +void uint8_callee (svuint8_t); +void uint16_callee (svuint16_t); +void uint32_callee (svuint32_t); +void uint64_callee (svuint64_t); + +void +bfloat16_caller (bfloat16x16_t arg) +{ + bfloat16_callee (arg); +} + +void +float16_caller (float16x16_t arg) +{ + float16_callee (arg); +} + +void +float32_caller (float32x8_t arg) +{ + float32_callee (arg); +} + +void +float64_caller (float64x4_t arg) +{ + float64_callee (arg); +} + +void +int8_caller (int8x32_t arg) +{ + int8_callee (arg); +} + +void +int16_caller (int16x16_t arg) +{ + int16_callee (arg); +} + +void +int32_caller (int32x8_t arg) +{ + int32_callee (arg); +} + +void +int64_caller (int64x4_t arg) +{ + int64_callee (arg); +} + +void +uint8_caller (uint8x32_t arg) +{ + uint8_callee (arg); +} + +void +uint16_caller (uint16x16_t arg) +{ + uint16_callee (arg); +} + +void +uint32_caller (uint32x8_t arg) +{ + uint32_callee (arg); +} + +void +uint64_caller (uint64x4_t arg) +{ + uint64_callee (arg); +} + +/* { dg-final { scan-assembler-times {\tld1b\tz0\.b, p[0-7]/z, \[x0\]} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz0\.h, p[0-7]/z, \[x0\]} 4 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz0\.s, p[0-7]/z, \[x0\]} 3 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz0\.d, p[0-7]/z, \[x0\]} 3 } } */ +/* { dg-final { scan-assembler-not {\tst1[bhwd]\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_1.c new file mode 100644 index 000000000..26802c87f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_1.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-prune-output "compilation terminated" } */ + +#include + +#pragma GCC target "+nosve" + +svbool_t return_bool (); + +void +f (void) +{ + return_bool (); /* { dg-error {'return_bool' requires the SVE ISA extension} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_2.c new file mode 100644 index 000000000..663165f89 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_2.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-prune-output "compilation terminated" } */ + +#include + +#pragma GCC target "+nosve" + +svbool_t return_bool (); + +void +f (svbool_t *ptr) +{ + *ptr = return_bool (); /* { dg-error {'return_bool' requires the SVE ISA extension} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_3.c new file mode 100644 index 000000000..6d5823cfd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_3.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-prune-output "compilation terminated" } */ + +#include + +#pragma GCC target "+nosve" + +svbool_t (*return_bool) (); + +void +f (svbool_t *ptr) +{ + *ptr = return_bool (); /* { dg-error {calls to functions of type 'svbool_t\(\)' require the SVE ISA extension} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_4.c new file mode 100644 index 000000000..81e31cf4f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_4.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-prune-output "compilation terminated" } */ + +#include + +#pragma GCC target "+nosve" + +void 
take_svuint8 (svuint8_t); + +void +f (svuint8_t *ptr) +{ + take_svuint8 (*ptr); /* { dg-error {'take_svuint8' requires the SVE ISA extension} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_5.c new file mode 100644 index 000000000..300ed00a0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_5.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-prune-output "compilation terminated" } */ + +#include + +#pragma GCC target "+nosve" + +void take_svuint8_eventually (float, float, float, float, + float, float, float, float, svuint8_t); + +void +f (svuint8_t *ptr) +{ + take_svuint8_eventually (0, 0, 0, 0, 0, 0, 0, 0, *ptr); /* { dg-error {arguments of type '(svuint8_t|__SVUint8_t)' require the SVE ISA extension} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_6.c new file mode 100644 index 000000000..4bddf76f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_6.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-prune-output "compilation terminated" } */ + +#include + +#pragma GCC target "+nosve" + +void unprototyped (); + +void +f (svuint8_t *ptr) +{ + unprototyped (*ptr); /* { dg-error {arguments of type '(svuint8_t|__SVUint8_t)' require the SVE ISA extension} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_7.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_7.c new file mode 100644 index 000000000..ef742711d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_7.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-prune-output "compilation terminated" } */ + +#include + +#pragma GCC target "+nosve" + +void f (svuint8_t x) {} /* { dg-error {'f' requires the SVE ISA extension} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_8.c new file mode 100644 index 000000000..45b549f12 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/nosve_8.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-prune-output "compilation terminated" } */ + +#include + +#pragma GCC target "+nosve" + +void +f (float a, float b, float c, float d, float e, float f, float g, float h, svuint8_t x) /* { dg-error {arguments of type '(svuint8_t|__SVUint8_t)' require the SVE ISA extension} } */ +{ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1.c new file mode 100644 index 000000000..f6328c901 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +/* +** callee_pred: +** ldr p0, \[x0\] +** ret +*/ +__SVBool_t __attribute__((noipa)) +callee_pred (__SVBool_t *ptr) +{ + return *ptr; +} + +#include + +/* +** caller_pred: +** ... 
+** bl callee_pred +** cntp x0, p0, p0.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +uint64_t __attribute__((noipa)) +caller_pred (__SVBool_t *ptr1) +{ + __SVBool_t p; + p = callee_pred (ptr1); + return svcntp_b8 (p, p); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_1024.c new file mode 100644 index 000000000..450a3f029 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_1024.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=1024 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +/* +** callee_pred: +** ldr p0, \[x0\] +** ret +*/ +__SVBool_t __attribute__((noipa)) +callee_pred (__SVBool_t *ptr) +{ + return *ptr; +} + +#include + +/* +** caller_pred: +** ... +** bl callee_pred +** cntp x0, p0, p0.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +uint64_t __attribute__((noipa)) +caller_pred (__SVBool_t *ptr1) +{ + __SVBool_t p = callee_pred (ptr1); + return svcntp_b8 (p, p); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_2048.c new file mode 100644 index 000000000..c9ea26899 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_2048.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=2048 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +/* +** callee_pred: +** ldr p0, \[x0\] +** ret +*/ +__SVBool_t __attribute__((noipa)) +callee_pred (__SVBool_t *ptr) +{ + return *ptr; +} + +#include + +/* +** caller_pred: +** ... +** bl callee_pred +** cntp x0, p0, p0.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +uint64_t __attribute__((noipa)) +caller_pred (__SVBool_t *ptr1) +{ + __SVBool_t p = callee_pred (ptr1); + return svcntp_b8 (p, p); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_256.c new file mode 100644 index 000000000..62bc695d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_256.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=256 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +/* +** callee_pred: +** ldr p0, \[x0\] +** ret +*/ +__SVBool_t __attribute__((noipa)) +callee_pred (__SVBool_t *ptr) +{ + return *ptr; +} + +#include + +/* +** caller_pred: +** ... +** bl callee_pred +** cntp x0, p0, p0.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +uint64_t __attribute__((noipa)) +caller_pred (__SVBool_t *ptr1) +{ + __SVBool_t p = callee_pred (ptr1); + return svcntp_b8 (p, p); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_512.c new file mode 100644 index 000000000..f687689ce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_1_512.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=512 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +/* +** callee_pred: +** ldr p0, \[x0\] +** ret +*/ +__SVBool_t __attribute__((noipa)) +callee_pred (__SVBool_t *ptr) +{ + return *ptr; +} + +#include + +/* +** caller_pred: +** ... 
+** bl callee_pred +** cntp x0, p0, p0.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +uint64_t __attribute__((noipa)) +caller_pred (__SVBool_t *ptr1) +{ + __SVBool_t p = callee_pred (ptr1); + return svcntp_b8 (p, p); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_2.c new file mode 100644 index 000000000..efaa81394 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_2.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +/* +** callee_pred: +** ldr p0, \[x0\] +** ret +*/ +svbool_t __attribute__((noipa)) +callee_pred (svbool_t *ptr) +{ + return *ptr; +} + +/* +** caller_pred: +** ... +** bl callee_pred +** cntp x0, p0, p0.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +uint64_t __attribute__((noipa)) +caller_pred (svbool_t *ptr1) +{ + svbool_t p; + p = callee_pred (ptr1); + return svcntp_b8 (p, p); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_3.c new file mode 100644 index 000000000..71046447d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_3.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +typedef svbool_t my_pred; + +/* +** callee_pred: +** ldr p0, \[x0\] +** ret +*/ +my_pred __attribute__((noipa)) +callee_pred (my_pred *ptr) +{ + return *ptr; +} + +/* +** caller_pred: +** ... +** bl callee_pred +** cntp x0, p0, p0.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +uint64_t __attribute__((noipa)) +caller_pred (my_pred *ptr1) +{ + my_pred p; + p = callee_pred (ptr1); + return svcntp_b8 (p, p); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4.c new file mode 100644 index 000000000..00eb2cbda --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, all +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, __SVInt8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, all +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, __SVUint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, all +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, __SVInt16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, all +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, __SVUint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, all +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, __SVFloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, all +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, __SVBfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, all +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, __SVInt32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, all +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, __SVUint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, all +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, __SVFloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, all +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, __SVInt64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, all +** ld1d z0\.d, \1/z, 
\[x0\] +** ret +*/ +CALLEE (u64, __SVUint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, all +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, __SVFloat64_t) + +#include + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, all +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, __SVInt8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, __SVUint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, all +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, __SVInt16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, __SVUint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, all +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, __SVFloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ptrue (p[0-7])\.b, all +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, __SVBfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, all +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, __SVInt32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, __SVUint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, all +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, __SVFloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, __SVInt64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, __SVUint64_t) + +/* +** caller_f64: +** ... 
+** bl callee_f64 +** ptrue (p[0-7])\.b, all +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, __SVFloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_1024.c new file mode 100644 index 000000000..43519634c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_1024.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=1024 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl128 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, __SVInt8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl128 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, __SVUint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, __SVInt16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, __SVUint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, __SVFloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, __SVBfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, __SVInt32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, __SVUint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, __SVFloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, __SVInt64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, __SVUint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, __SVFloat64_t) + +#include + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, vl128 +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, __SVInt8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, __SVUint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, vl128 +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, __SVInt16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, __SVUint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, vl128 +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, __SVFloat16_t) + +/* +** caller_bf16: +** ... 
+** bl callee_bf16 +** ptrue (p[0-7])\.b, vl128 +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, __SVBfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, vl128 +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, __SVInt32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, __SVUint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, vl128 +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, __SVFloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, __SVInt64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, __SVUint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ptrue (p[0-7])\.b, vl128 +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, __SVFloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_2048.c new file mode 100644 index 000000000..8256645f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_2048.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=2048 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl256 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, __SVInt8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl256 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, __SVUint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, __SVInt16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, __SVUint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, __SVFloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, __SVBfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, __SVInt32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, __SVUint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, __SVFloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, __SVInt64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, __SVUint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, __SVFloat64_t) + +#include + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) 
\ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, vl256 +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, __SVInt8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, __SVUint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, vl256 +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, __SVInt16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, __SVUint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, vl256 +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, __SVFloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ptrue (p[0-7])\.b, vl256 +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, __SVBfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, vl256 +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, __SVInt32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, __SVUint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, vl256 +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, __SVFloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, __SVInt64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, __SVUint64_t) + +/* +** caller_f64: +** ... 
+** bl callee_f64 +** ptrue (p[0-7])\.b, vl256 +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, __SVFloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_256.c new file mode 100644 index 000000000..1e0f6bb96 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_256.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=256 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl32 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, __SVInt8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl32 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, __SVUint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, __SVInt16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, __SVUint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, __SVFloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, __SVBfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, __SVInt32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, __SVUint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, __SVFloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, __SVInt64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, __SVUint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, __SVFloat64_t) + +#include + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, vl32 +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, __SVInt8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, __SVUint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, vl32 +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, __SVInt16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, __SVUint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, vl32 +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, __SVFloat16_t) + +/* +** caller_bf16: +** ... 
+** bl callee_bf16 +** ptrue (p[0-7])\.b, vl32 +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, __SVBfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, vl32 +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, __SVInt32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, __SVUint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, vl32 +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, __SVFloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, __SVInt64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, __SVUint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ptrue (p[0-7])\.b, vl32 +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, __SVFloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_512.c new file mode 100644 index 000000000..5b58ed734 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_4_512.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=512 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl64 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, __SVInt8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl64 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, __SVUint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, __SVInt16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, __SVUint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, __SVFloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, __SVBfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, __SVInt32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, __SVUint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, __SVFloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, __SVInt64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, __SVUint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, __SVFloat64_t) + +#include + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + 
__attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, vl64 +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, __SVInt8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, __SVUint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, vl64 +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, __SVInt16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, __SVUint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, vl64 +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, __SVFloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ptrue (p[0-7])\.b, vl64 +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, __SVBfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, vl64 +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, __SVInt32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, __SVUint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, vl64 +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, __SVFloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, __SVInt64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, __SVUint64_t) + +/* +** caller_f64: +** ... 
+** bl callee_f64 +** ptrue (p[0-7])\.b, vl64 +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, __SVFloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5.c new file mode 100644 index 000000000..55c78e16f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, all +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, all +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, all +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, all +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, all +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, all +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, all +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, all +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, all +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, all +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, all +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, all +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, all +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, all +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, all +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... 
+** bl callee_bf16 +** ptrue (p[0-7])\.b, all +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, all +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, all +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, all +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ptrue (p[0-7])\.b, all +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_1024.c new file mode 100644 index 000000000..52e9916d8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_1024.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=1024 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl128 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl128 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE 
*ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, vl128 +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, vl128 +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, vl128 +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ptrue (p[0-7])\.b, vl128 +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, vl128 +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, vl128 +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, vl128 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... 
+** bl callee_f64 +** ptrue (p[0-7])\.b, vl128 +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_2048.c new file mode 100644 index 000000000..6f37d9d6c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_2048.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=2048 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl256 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl256 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, vl256 +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, vl256 +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, vl256 +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... 
+** bl callee_bf16 +** ptrue (p[0-7])\.b, vl256 +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, vl256 +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, vl256 +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, vl256 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ptrue (p[0-7])\.b, vl256 +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_256.c new file mode 100644 index 000000000..7ba094e16 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_256.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=256 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl32 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl32 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE 
*ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, vl32 +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, vl32 +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, vl32 +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ptrue (p[0-7])\.b, vl32 +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, vl32 +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, vl32 +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, vl32 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... 
+** bl callee_f64 +** ptrue (p[0-7])\.b, vl32 +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_512.c new file mode 100644 index 000000000..36b14d420 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_5_512.c @@ -0,0 +1,264 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=512 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl64 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl64 +** ld1b z0\.b, \1/z, \[x0\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + typeof (svaddv (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +#define CALLER_BF16(SUFFIX, TYPE) \ + typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ptrue (p[0-7])\.b, vl64 +** saddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.b +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, vl64 +** saddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.h +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, vl64 +** faddv h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... 
+** bl callee_bf16 +** ptrue (p[0-7])\.b, vl64 +** lasta h0, \1, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER_BF16 (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ptrue (p[0-7])\.b, vl64 +** saddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.s +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, vl64 +** faddv s0, \1, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, vl64 +** uaddv (d[0-9]+), \1, z0\.d +** fmov x0, \2 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ptrue (p[0-7])\.b, vl64 +** faddv d0, \1, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6.c new file mode 100644 index 000000000..72468eab1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6.c @@ -0,0 +1,272 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +typedef int8_t svint8_t __attribute__ ((vector_size (32))); +typedef uint8_t svuint8_t __attribute__ ((vector_size (32))); + +typedef int16_t svint16_t __attribute__ ((vector_size (32))); +typedef uint16_t svuint16_t __attribute__ ((vector_size (32))); +typedef __fp16 svfloat16_t __attribute__ ((vector_size (32))); +typedef __bf16 svbfloat16_t __attribute__ ((vector_size (32))); + +typedef int32_t svint32_t __attribute__ ((vector_size (32))); +typedef uint32_t svuint32_t __attribute__ ((vector_size (32))); +typedef float svfloat32_t __attribute__ ((vector_size (32))); + +typedef int64_t svint64_t __attribute__ ((vector_size (32))); +typedef uint64_t svuint64_t __attribute__ ((vector_size (32))); +typedef double svfloat64_t __attribute__ ((vector_size (32))); + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ( +** ld1 ({v.*}), \[x0\] +** st1 \1, \[x8\] +** | +** ldp (q[0-9]+, q[0-9]+), \[x0\] +** stp \2, \[x8\] +** ) +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ( +** ld1 ({v.*}), \[x0\] +** st1 \1, \[x8\] +** | +** ldp (q[0-9]+, q[0-9]+), \[x0\] +** stp \2, \[x8\] +** ) +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ( +** ld1 ({v.*}), \[x0\] +** st1 \1, \[x8\] +** | +** ldp (q[0-9]+, q[0-9]+), \[x0\] +** stp \2, \[x8\] +** ) +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ( +** ld1 ({v.*}), \[x0\] +** st1 \1, \[x8\] +** | +** ldp (q[0-9]+, q[0-9]+), \[x0\] +** stp \2, \[x8\] +** ) +** ret +*/ +CALLEE (u16, svuint16_t) + +/* Currently we scalarize this. */ +CALLEE (f16, svfloat16_t) + +/* Currently we scalarize this. 
*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ( +** ld1 ({v.*}), \[x0\] +** st1 \1, \[x8\] +** | +** ldp (q[0-9]+, q[0-9]+), \[x0\] +** stp \2, \[x8\] +** ) +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ( +** ld1 ({v.*}), \[x0\] +** st1 \1, \[x8\] +** | +** ldp (q[0-9]+, q[0-9]+), \[x0\] +** stp \2, \[x8\] +** ) +** ret +*/ +CALLEE (u32, svuint32_t) + +/* Currently we scalarize this. */ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ( +** ld1 ({v.*}), \[x0\] +** st1 \1, \[x8\] +** | +** ldp (q[0-9]+, q[0-9]+), \[x0\] +** stp \2, \[x8\] +** ) +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ( +** ld1 ({v.*}), \[x0\] +** st1 \1, \[x8\] +** | +** ldp (q[0-9]+, q[0-9]+), \[x0\] +** stp \2, \[x8\] +** ) +** ret +*/ +CALLEE (u64, svuint64_t) + +/* Currently we scalarize this. */ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + typeof ((*(TYPE *) 0)[0]) \ + __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1) \ + { \ + return callee_##SUFFIX (ptr1)[0]; \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ldrb w0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ldrb w0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ldrh w0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ldrh w0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ldr h0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ldr h0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ldr w0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ldr w0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ldr s0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ldr x0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ldr x0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... 
+** bl callee_f64 +** ldr d0, \[sp, 16\] +** ldp x29, x30, \[sp\], 48 +** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_1024.c new file mode 100644 index 000000000..b6f267e76 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_1024.c @@ -0,0 +1,287 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=1024 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +typedef int8_t svint8_t __attribute__ ((vector_size (128))); +typedef uint8_t svuint8_t __attribute__ ((vector_size (128))); + +typedef int16_t svint16_t __attribute__ ((vector_size (128))); +typedef uint16_t svuint16_t __attribute__ ((vector_size (128))); +typedef __fp16 svfloat16_t __attribute__ ((vector_size (128))); +typedef __bf16 svbfloat16_t __attribute__ ((vector_size (128))); + +typedef int32_t svint32_t __attribute__ ((vector_size (128))); +typedef uint32_t svuint32_t __attribute__ ((vector_size (128))); +typedef float svfloat32_t __attribute__ ((vector_size (128))); + +typedef int64_t svint64_t __attribute__ ((vector_size (128))); +typedef uint64_t svuint64_t __attribute__ ((vector_size (128))); +typedef double svfloat64_t __attribute__ ((vector_size (128))); + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl128 +** ld1b z0\.b, \1/z, \[x0\] +** st1b z0\.b, \1, \[x8\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl128 +** ld1b z0\.b, \1/z, \[x0\] +** st1b z0\.b, \1, \[x8\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl128 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl128 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl128 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + void __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1, TYPE *ptr2) \ + { \ + *ptr2 = callee_##SUFFIX (ptr1); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] +** st1b \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... 
+** bl callee_u8 +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] +** st1b \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... 
+** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_2048.c new file mode 100644 index 000000000..46b7d683e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_2048.c @@ -0,0 +1,287 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=2048 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +typedef int8_t svint8_t __attribute__ ((vector_size (256))); +typedef uint8_t svuint8_t __attribute__ ((vector_size (256))); + +typedef int16_t svint16_t __attribute__ ((vector_size (256))); +typedef uint16_t svuint16_t __attribute__ ((vector_size (256))); +typedef __fp16 svfloat16_t __attribute__ ((vector_size (256))); +typedef __bf16 svbfloat16_t __attribute__ ((vector_size (256))); + +typedef int32_t svint32_t __attribute__ ((vector_size (256))); +typedef uint32_t svuint32_t __attribute__ ((vector_size (256))); +typedef float svfloat32_t __attribute__ ((vector_size (256))); + +typedef int64_t svint64_t __attribute__ ((vector_size (256))); +typedef uint64_t svuint64_t __attribute__ ((vector_size (256))); +typedef double svfloat64_t __attribute__ ((vector_size (256))); + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl256 +** ld1b z0\.b, \1/z, \[x0\] +** st1b z0\.b, \1, \[x8\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl256 +** ld1b z0\.b, \1/z, \[x0\] +** st1b z0\.b, \1, \[x8\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl256 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl256 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl256 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + void __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1, TYPE *ptr2) \ + { \ + *ptr2 = callee_##SUFFIX (ptr1); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] +** st1b \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ... 
+** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] +** st1b \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... 
+** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_256.c new file mode 100644 index 000000000..04872493c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_256.c @@ -0,0 +1,287 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=256 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +typedef int8_t svint8_t __attribute__ ((vector_size (32))); +typedef uint8_t svuint8_t __attribute__ ((vector_size (32))); + +typedef int16_t svint16_t __attribute__ ((vector_size (32))); +typedef uint16_t svuint16_t __attribute__ ((vector_size (32))); +typedef __fp16 svfloat16_t __attribute__ ((vector_size (32))); +typedef __bf16 svbfloat16_t __attribute__ ((vector_size (32))); + +typedef int32_t svint32_t __attribute__ ((vector_size (32))); +typedef uint32_t svuint32_t __attribute__ ((vector_size (32))); +typedef float svfloat32_t __attribute__ ((vector_size (32))); + +typedef int64_t svint64_t __attribute__ ((vector_size (32))); +typedef uint64_t svuint64_t __attribute__ ((vector_size (32))); +typedef double svfloat64_t __attribute__ ((vector_size (32))); + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl32 +** ld1b z0\.b, \1/z, \[x0\] +** st1b z0\.b, \1, \[x8\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl32 +** ld1b z0\.b, \1/z, \[x0\] +** st1b z0\.b, \1, \[x8\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl32 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl32 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl32 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + void __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1, TYPE *ptr2) \ + { \ + *ptr2 = callee_##SUFFIX (ptr1); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] +** st1b \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] +** st1b \1, \2, \[[^]]*\] +** ... 
+** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_512.c new file mode 100644 index 000000000..9817d856a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_6_512.c @@ -0,0 +1,287 @@ +/* { dg-do compile } */ +/* { dg-options "-O -msve-vector-bits=512 -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include + +typedef int8_t svint8_t __attribute__ ((vector_size (64))); +typedef uint8_t svuint8_t __attribute__ ((vector_size (64))); + +typedef int16_t svint16_t __attribute__ ((vector_size (64))); +typedef uint16_t svuint16_t __attribute__ ((vector_size (64))); +typedef __fp16 svfloat16_t __attribute__ ((vector_size (64))); +typedef __bf16 svbfloat16_t __attribute__ ((vector_size (64))); + +typedef int32_t svint32_t __attribute__ ((vector_size (64))); +typedef uint32_t svuint32_t __attribute__ ((vector_size (64))); +typedef float svfloat32_t __attribute__ ((vector_size (64))); + +typedef int64_t svint64_t __attribute__ ((vector_size (64))); +typedef uint64_t svuint64_t __attribute__ ((vector_size (64))); +typedef double svfloat64_t __attribute__ ((vector_size (64))); + +#define CALLEE(SUFFIX, TYPE) \ + TYPE __attribute__((noipa)) \ + callee_##SUFFIX (TYPE *ptr) \ + { \ + return *ptr; \ + } + +/* +** callee_s8: +** ptrue (p[0-7])\.b, vl64 +** ld1b z0\.b, \1/z, \[x0\] +** st1b z0\.b, \1, \[x8\] +** ret +*/ +CALLEE (s8, svint8_t) + +/* +** callee_u8: +** ptrue (p[0-7])\.b, vl64 +** ld1b z0\.b, \1/z, \[x0\] +** st1b z0\.b, \1, \[x8\] +** ret +*/ +CALLEE (u8, svuint8_t) + +/* +** callee_s16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (s16, svint16_t) + +/* +** callee_u16: 
+** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (u16, svuint16_t) + +/* +** callee_f16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (f16, svfloat16_t) + +/* +** callee_bf16: +** ptrue (p[0-7])\.b, vl64 +** ld1h z0\.h, \1/z, \[x0\] +** st1h z0\.h, \1, \[x8\] +** ret +*/ +CALLEE (bf16, svbfloat16_t) + +/* +** callee_s32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (s32, svint32_t) + +/* +** callee_u32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (u32, svuint32_t) + +/* +** callee_f32: +** ptrue (p[0-7])\.b, vl64 +** ld1w z0\.s, \1/z, \[x0\] +** st1w z0\.s, \1, \[x8\] +** ret +*/ +CALLEE (f32, svfloat32_t) + +/* +** callee_s64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (s64, svint64_t) + +/* +** callee_u64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (u64, svuint64_t) + +/* +** callee_f64: +** ptrue (p[0-7])\.b, vl64 +** ld1d z0\.d, \1/z, \[x0\] +** st1d z0\.d, \1, \[x8\] +** ret +*/ +CALLEE (f64, svfloat64_t) + +#define CALLER(SUFFIX, TYPE) \ + void __attribute__((noipa)) \ + caller_##SUFFIX (TYPE *ptr1, TYPE *ptr2) \ + { \ + *ptr2 = callee_##SUFFIX (ptr1); \ + } + +/* +** caller_s8: +** ... +** bl callee_s8 +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] +** st1b \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s8, svint8_t) + +/* +** caller_u8: +** ... +** bl callee_u8 +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[[^]]*\] +** st1b \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u8, svuint8_t) + +/* +** caller_s16: +** ... +** bl callee_s16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s16, svint16_t) + +/* +** caller_u16: +** ... +** bl callee_u16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u16, svuint16_t) + +/* +** caller_f16: +** ... +** bl callee_f16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f16, svfloat16_t) + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\] +** st1h \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (bf16, svbfloat16_t) + +/* +** caller_s32: +** ... +** bl callee_s32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s32, svint32_t) + +/* +** caller_u32: +** ... +** bl callee_u32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u32, svuint32_t) + +/* +** caller_f32: +** ... +** bl callee_f32 +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[[^]]*\] +** st1w \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (f32, svfloat32_t) + +/* +** caller_s64: +** ... +** bl callee_s64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (s64, svint64_t) + +/* +** caller_u64: +** ... +** bl callee_u64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... +** ret +*/ +CALLER (u64, svuint64_t) + +/* +** caller_f64: +** ... +** bl callee_f64 +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[[^]]*\] +** st1d \1, \2, \[[^]]*\] +** ... 
+** ret +*/ +CALLER (f64, svfloat64_t) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_7.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_7.c new file mode 100644 index 000000000..55456a3b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_7.c @@ -0,0 +1,341 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +** callee_s8: +** mov z0\.b, #1 +** mov z1\.b, #2 +** ret +*/ +svint8x2_t __attribute__((noipa)) +callee_s8 (void) +{ + return svcreate2 (svdup_s8 (1), svdup_s8 (2)); +} + +/* +** caller_s8: +** ... +** bl callee_s8 +** trn1 z0\.b, z0\.b, z1\.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint8_t __attribute__((noipa)) +caller_s8 (void) +{ + svint8x2_t res; + res = callee_s8 (); + return svtrn1 (svget2 (res, 0), svget2 (res, 1)); +} + +/* +** callee_u8: +** mov z0\.b, #3 +** mov z1\.b, #4 +** ret +*/ +svuint8x2_t __attribute__((noipa)) +callee_u8 (void) +{ + return svcreate2 (svdup_u8 (3), svdup_u8 (4)); +} + +/* +** caller_u8: +** ... +** bl callee_u8 +** trn2 z0\.b, z1\.b, z0\.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint8_t __attribute__((noipa)) +caller_u8 (void) +{ + svuint8x2_t res; + res = callee_u8 (); + return svtrn2 (svget2 (res, 1), svget2 (res, 0)); +} + +/* +** callee_s16: +** mov z0\.h, #1 +** mov z1\.h, #2 +** ret +*/ +svint16x2_t __attribute__((noipa)) +callee_s16 (void) +{ + return svcreate2 (svdup_s16 (1), svdup_s16 (2)); +} + +/* +** caller_s16: +** ... +** bl callee_s16 +** trn1 z0\.h, z0\.h, z1\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint16_t __attribute__((noipa)) +caller_s16 (void) +{ + svint16x2_t res; + res = callee_s16 (); + return svtrn1 (svget2 (res, 0), svget2 (res, 1)); +} + +/* +** callee_u16: +** mov z0\.h, #3 +** mov z1\.h, #4 +** ret +*/ +svuint16x2_t __attribute__((noipa)) +callee_u16 (void) +{ + return svcreate2 (svdup_u16 (3), svdup_u16 (4)); +} + +/* +** caller_u16: +** ... +** bl callee_u16 +** trn2 z0\.h, z1\.h, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint16_t __attribute__((noipa)) +caller_u16 (void) +{ + svuint16x2_t res; + res = callee_u16 (); + return svtrn2 (svget2 (res, 1), svget2 (res, 0)); +} + +/* +** callee_f16: +** fmov z0\.h, #5\.0(?:e\+0)? +** fmov z1\.h, #6\.0(?:e\+0)? +** ret +*/ +svfloat16x2_t __attribute__((noipa)) +callee_f16 (void) +{ + return svcreate2 (svdup_f16 (5), svdup_f16 (6)); +} + +/* +** caller_f16: +** ... +** bl callee_f16 +** zip1 z0\.h, z1\.h, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat16_t __attribute__((noipa)) +caller_f16 (void) +{ + svfloat16x2_t res; + res = callee_f16 (); + return svzip1 (svget2 (res, 1), svget2 (res, 0)); +} + +/* +** callee_bf16: +** mov z0\.h, h2 +** mov z1\.h, h3 +** ret +*/ +svbfloat16x2_t __attribute__((noipa)) +callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3) +{ + return svcreate2 (svdup_bf16 (h2), svdup_bf16 (h3)); +} + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** zip2 z0\.h, z1\.h, z0\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svbfloat16_t __attribute__((noipa)) +caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3) +{ + svbfloat16x2_t res; + res = callee_bf16 (h0, h1, h2, h3); + return svzip2 (svget2 (res, 1), svget2 (res, 0)); +} + +/* +** callee_s32: +** mov z0\.s, #1 +** mov z1\.s, #2 +** ret +*/ +svint32x2_t __attribute__((noipa)) +callee_s32 (void) +{ + return svcreate2 (svdup_s32 (1), svdup_s32 (2)); +} + +/* +** caller_s32: +** ... 
+** bl callee_s32 +** trn1 z0\.s, z0\.s, z1\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint32_t __attribute__((noipa)) +caller_s32 (void) +{ + svint32x2_t res; + res = callee_s32 (); + return svtrn1 (svget2 (res, 0), svget2 (res, 1)); +} + +/* +** callee_u32: +** mov z0\.s, #3 +** mov z1\.s, #4 +** ret +*/ +svuint32x2_t __attribute__((noipa)) +callee_u32 (void) +{ + return svcreate2 (svdup_u32 (3), svdup_u32 (4)); +} + +/* +** caller_u32: +** ... +** bl callee_u32 +** trn2 z0\.s, z1\.s, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint32_t __attribute__((noipa)) +caller_u32 (void) +{ + svuint32x2_t res; + res = callee_u32 (); + return svtrn2 (svget2 (res, 1), svget2 (res, 0)); +} + +/* +** callee_f32: +** fmov z0\.s, #5\.0(?:e\+0)? +** fmov z1\.s, #6\.0(?:e\+0)? +** ret +*/ +svfloat32x2_t __attribute__((noipa)) +callee_f32 (void) +{ + return svcreate2 (svdup_f32 (5), svdup_f32 (6)); +} + +/* +** caller_f32: +** ... +** bl callee_f32 +** zip1 z0\.s, z1\.s, z0\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat32_t __attribute__((noipa)) +caller_f32 (void) +{ + svfloat32x2_t res; + res = callee_f32 (); + return svzip1 (svget2 (res, 1), svget2 (res, 0)); +} + +/* +** callee_s64: +** mov z0\.d, #1 +** mov z1\.d, #2 +** ret +*/ +svint64x2_t __attribute__((noipa)) +callee_s64 (void) +{ + return svcreate2 (svdup_s64 (1), svdup_s64 (2)); +} + +/* +** caller_s64: +** ... +** bl callee_s64 +** trn1 z0\.d, z0\.d, z1\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint64_t __attribute__((noipa)) +caller_s64 (void) +{ + svint64x2_t res; + res = callee_s64 (); + return svtrn1 (svget2 (res, 0), svget2 (res, 1)); +} + +/* +** callee_u64: +** mov z0\.d, #3 +** mov z1\.d, #4 +** ret +*/ +svuint64x2_t __attribute__((noipa)) +callee_u64 (void) +{ + return svcreate2 (svdup_u64 (3), svdup_u64 (4)); +} + +/* +** caller_u64: +** ... +** bl callee_u64 +** trn2 z0\.d, z1\.d, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint64_t __attribute__((noipa)) +caller_u64 (void) +{ + svuint64x2_t res; + res = callee_u64 (); + return svtrn2 (svget2 (res, 1), svget2 (res, 0)); +} + +/* +** callee_f64: +** fmov z0\.d, #5\.0(?:e\+0)? +** fmov z1\.d, #6\.0(?:e\+0)? +** ret +*/ +svfloat64x2_t __attribute__((noipa)) +callee_f64 (void) +{ + return svcreate2 (svdup_f64 (5), svdup_f64 (6)); +} + +/* +** caller_f64: +** ... +** bl callee_f64 +** zip1 z0\.d, z1\.d, z0\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat64_t __attribute__((noipa)) +caller_f64 (void) +{ + svfloat64x2_t res; + res = callee_f64 (); + return svzip1 (svget2 (res, 1), svget2 (res, 0)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_8.c new file mode 100644 index 000000000..9581811e7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_8.c @@ -0,0 +1,375 @@ +/* { dg-do compile } */ +/* { dg-options "-O -frename-registers -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +** callee_s8: +** mov z0\.b, #1 +** mov z1\.b, #2 +** mov z2\.b, #3 +** ret +*/ +svint8x3_t __attribute__((noipa)) +callee_s8 (void) +{ + return svcreate3 (svdup_s8 (1), svdup_s8 (2), svdup_s8 (3)); +} + +/* +** caller_s8: +** ... 
+** bl callee_s8 +** ptrue (p[0-7])\.b, all +** mad z0\.b, \1/m, z1\.b, z2\.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint8_t __attribute__((noipa)) +caller_s8 (void) +{ + svint8x3_t res; + res = callee_s8 (); + return svmad_x (svptrue_b8 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_u8: +** mov z0\.b, #4 +** mov z1\.b, #5 +** mov z2\.b, #6 +** ret +*/ +svuint8x3_t __attribute__((noipa)) +callee_u8 (void) +{ + return svcreate3 (svdup_u8 (4), svdup_u8 (5), svdup_u8 (6)); +} + +/* +** caller_u8: +** ... +** bl callee_u8 +** ptrue (p[0-7])\.b, all +** msb z0\.b, \1/m, z1\.b, z2\.b +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint8_t __attribute__((noipa)) +caller_u8 (void) +{ + svuint8x3_t res; + res = callee_u8 (); + return svmsb_x (svptrue_b8 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_s16: +** mov z0\.h, #1 +** mov z1\.h, #2 +** mov z2\.h, #3 +** ret +*/ +svint16x3_t __attribute__((noipa)) +callee_s16 (void) +{ + return svcreate3 (svdup_s16 (1), svdup_s16 (2), svdup_s16 (3)); +} + +/* +** caller_s16: +** ... +** bl callee_s16 +** ptrue (p[0-7])\.b, all +** mls z0\.h, \1/m, z1\.h, z2\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint16_t __attribute__((noipa)) +caller_s16 (void) +{ + svint16x3_t res; + res = callee_s16 (); + return svmls_x (svptrue_b16 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_u16: +** mov z0\.h, #4 +** mov z1\.h, #5 +** mov z2\.h, #6 +** ret +*/ +svuint16x3_t __attribute__((noipa)) +callee_u16 (void) +{ + return svcreate3 (svdup_u16 (4), svdup_u16 (5), svdup_u16 (6)); +} + +/* +** caller_u16: +** ... +** bl callee_u16 +** ptrue (p[0-7])\.b, all +** mla z0\.h, \1/m, z1\.h, z2\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint16_t __attribute__((noipa)) +caller_u16 (void) +{ + svuint16x3_t res; + res = callee_u16 (); + return svmla_x (svptrue_b16 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_f16: +** fmov z0\.h, #1\.0(?:e\+0)? +** fmov z1\.h, #2\.0(?:e\+0)? +** fmov z2\.h, #3\.0(?:e\+0)? +** ret +*/ +svfloat16x3_t __attribute__((noipa)) +callee_f16 (void) +{ + return svcreate3 (svdup_f16 (1), svdup_f16 (2), svdup_f16 (3)); +} + +/* +** caller_f16: +** ... +** bl callee_f16 +** ptrue (p[0-7])\.b, all +** fmla z0\.h, \1/m, z1\.h, z2\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat16_t __attribute__((noipa)) +caller_f16 (void) +{ + svfloat16x3_t res; + res = callee_f16 (); + return svmla_x (svptrue_b16 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_bf16: +** mov z0\.h, h0 +** mov z1\.h, h1 +** mov z2\.h, h2 +** ret +*/ +svbfloat16x3_t __attribute__((noipa)) +callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2) +{ + return svcreate3 (svdup_bf16 (h0), svdup_bf16 (h1), svdup_bf16 (h2)); +} + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** trn2 z0\.h, z0\.h, z2\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svbfloat16_t __attribute__((noipa)) +caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2) +{ + svbfloat16x3_t res; + res = callee_bf16 (h0, h1, h2); + return svtrn2 (svget3 (res, 0), svget3 (res, 2)); +} + +/* +** callee_s32: +** mov z0\.s, #1 +** mov z1\.s, #2 +** mov z2\.s, #3 +** ret +*/ +svint32x3_t __attribute__((noipa)) +callee_s32 (void) +{ + return svcreate3 (svdup_s32 (1), svdup_s32 (2), svdup_s32 (3)); +} + +/* +** caller_s32: +** ... 
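return_7.c, return_8.c and return_9.c extend this to the two-, three- and four-element ACLE tuple types; their expected bodies check that the tuple members come back in consecutive Z registers starting at z0, so the caller can feed svget2/svget3/svget4 results straight into the next SVE operation without extra moves. The shape being compiled is essentially the following reduced sketch, using only arm_sve.h intrinsics that the tests themselves use:

/* Reduced sketch: an svint32x2_t result is expected in z0/z1, so the
   caller's trn1 below can consume the returned registers directly,
   mirroring the caller_* patterns in return_7.c.  */
#include <arm_sve.h>

svint32x2_t __attribute__ ((noipa))
make_pair (void)
{
  return svcreate2 (svdup_s32 (1), svdup_s32 (2));
}

svint32_t __attribute__ ((noipa))
use_pair (void)
{
  svint32x2_t res = make_pair ();
  return svtrn1 (svget2 (res, 0), svget2 (res, 1));
}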
+** bl callee_s32 +** ptrue (p[0-7])\.b, all +** mad z0\.s, \1/m, z1\.s, z2\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint32_t __attribute__((noipa)) +caller_s32 (void) +{ + svint32x3_t res; + res = callee_s32 (); + return svmad_x (svptrue_b32 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_u32: +** mov z0\.s, #4 +** mov z1\.s, #5 +** mov z2\.s, #6 +** ret +*/ +svuint32x3_t __attribute__((noipa)) +callee_u32 (void) +{ + return svcreate3 (svdup_u32 (4), svdup_u32 (5), svdup_u32 (6)); +} + +/* +** caller_u32: +** ... +** bl callee_u32 +** ptrue (p[0-7])\.b, all +** msb z0\.s, \1/m, z1\.s, z2\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint32_t __attribute__((noipa)) +caller_u32 (void) +{ + svuint32x3_t res; + res = callee_u32 (); + return svmsb_x (svptrue_b32 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_f32: +** fmov z0\.s, #1\.0(?:e\+0)? +** fmov z1\.s, #2\.0(?:e\+0)? +** fmov z2\.s, #3\.0(?:e\+0)? +** ret +*/ +svfloat32x3_t __attribute__((noipa)) +callee_f32 (void) +{ + return svcreate3 (svdup_f32 (1), svdup_f32 (2), svdup_f32 (3)); +} + +/* +** caller_f32: +** ... +** bl callee_f32 +** ptrue (p[0-7])\.b, all +** fmla z0\.s, \1/m, z1\.s, z2\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat32_t __attribute__((noipa)) +caller_f32 (void) +{ + svfloat32x3_t res; + res = callee_f32 (); + return svmla_x (svptrue_b32 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_s64: +** mov z0\.d, #1 +** mov z1\.d, #2 +** mov z2\.d, #3 +** ret +*/ +svint64x3_t __attribute__((noipa)) +callee_s64 (void) +{ + return svcreate3 (svdup_s64 (1), svdup_s64 (2), svdup_s64 (3)); +} + +/* +** caller_s64: +** ... +** bl callee_s64 +** ptrue (p[0-7])\.b, all +** mls z0\.d, \1/m, z1\.d, z2\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint64_t __attribute__((noipa)) +caller_s64 (void) +{ + svint64x3_t res; + res = callee_s64 (); + return svmls_x (svptrue_b64 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_u64: +** mov z0\.d, #4 +** mov z1\.d, #5 +** mov z2\.d, #6 +** ret +*/ +svuint64x3_t __attribute__((noipa)) +callee_u64 (void) +{ + return svcreate3 (svdup_u64 (4), svdup_u64 (5), svdup_u64 (6)); +} + +/* +** caller_u64: +** ... +** bl callee_u64 +** ptrue (p[0-7])\.b, all +** mla z0\.d, \1/m, z1\.d, z2\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint64_t __attribute__((noipa)) +caller_u64 (void) +{ + svuint64x3_t res; + res = callee_u64 (); + return svmla_x (svptrue_b64 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} + +/* +** callee_f64: +** fmov z0\.d, #1\.0(?:e\+0)? +** fmov z1\.d, #2\.0(?:e\+0)? +** fmov z2\.d, #3\.0(?:e\+0)? +** ret +*/ +svfloat64x3_t __attribute__((noipa)) +callee_f64 (void) +{ + return svcreate3 (svdup_f64 (1), svdup_f64 (2), svdup_f64 (3)); +} + +/* +** caller_f64: +** ... 
+** bl callee_f64 +** ptrue (p[0-7])\.b, all +** fmla z0\.d, \1/m, z1\.d, z2\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat64_t __attribute__((noipa)) +caller_f64 (void) +{ + svfloat64x3_t res; + res = callee_f64 (); + return svmla_x (svptrue_b64 (), + svget3 (res, 0), svget3 (res, 1), svget3 (res, 2)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_9.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_9.c new file mode 100644 index 000000000..ad32e1fe5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/return_9.c @@ -0,0 +1,438 @@ +/* { dg-do compile } */ +/* { dg-options "-O -frename-registers -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +/* +** callee_s8: +** mov z0\.b, #1 +** mov z1\.b, #2 +** mov z2\.b, #3 +** mov z3\.b, #4 +** ret +*/ +svint8x4_t __attribute__((noipa)) +callee_s8 (void) +{ + return svcreate4 (svdup_s8 (1), svdup_s8 (2), svdup_s8 (3), svdup_s8 (4)); +} + +/* +** caller_s8: +** ... +** bl callee_s8 +** add (z[2-7]\.b), z2\.b, z3\.b +** ptrue (p[0-7])\.b, all +** mla z0\.b, \2/m, (z1\.b, \1|\1, z1\.b) +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint8_t __attribute__((noipa)) +caller_s8 (void) +{ + svint8x4_t res; + res = callee_s8 (); + return svmla_x (svptrue_b8 (), svget4 (res, 0), svget4 (res, 1), + svadd_x (svptrue_b8 (), + svget4 (res, 2), + svget4 (res, 3))); +} + +/* +** callee_u8: +** mov z0\.b, #4 +** mov z1\.b, #5 +** mov z2\.b, #6 +** mov z3\.b, #7 +** ret +*/ +svuint8x4_t __attribute__((noipa)) +callee_u8 (void) +{ + return svcreate4 (svdup_u8 (4), svdup_u8 (5), svdup_u8 (6), svdup_u8 (7)); +} + +/* +** caller_u8: +** ... +** bl callee_u8 +** sub (z[2-7]\.b), z2\.b, z3\.b +** ptrue (p[0-7])\.b, all +** mla z0\.b, \2/m, (z1\.b, \1|\1, z1\.b) +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint8_t __attribute__((noipa)) +caller_u8 (void) +{ + svuint8x4_t res; + res = callee_u8 (); + return svmla_x (svptrue_b8 (), svget4 (res, 0), svget4 (res, 1), + svsub_x (svptrue_b8 (), + svget4 (res, 2), + svget4 (res, 3))); +} + +/* +** callee_s16: +** mov z0\.h, #1 +** mov z1\.h, #2 +** mov z2\.h, #3 +** mov z3\.h, #4 +** ret +*/ +svint16x4_t __attribute__((noipa)) +callee_s16 (void) +{ + return svcreate4 (svdup_s16 (1), svdup_s16 (2), + svdup_s16 (3), svdup_s16 (4)); +} + +/* +** caller_s16: +** ... +** bl callee_s16 +** add (z[2-7]\.h), z2\.h, z3\.h +** ptrue (p[0-7])\.b, all +** mad z0\.h, \2/m, (z1\.h, \1|\1, z1\.h) +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint16_t __attribute__((noipa)) +caller_s16 (void) +{ + svint16x4_t res; + res = callee_s16 (); + return svmad_x (svptrue_b16 (), svget4 (res, 0), svget4 (res, 1), + svadd_x (svptrue_b16 (), + svget4 (res, 2), + svget4 (res, 3))); +} + +/* +** callee_u16: +** mov z0\.h, #4 +** mov z1\.h, #5 +** mov z2\.h, #6 +** mov z3\.h, #7 +** ret +*/ +svuint16x4_t __attribute__((noipa)) +callee_u16 (void) +{ + return svcreate4 (svdup_u16 (4), svdup_u16 (5), + svdup_u16 (6), svdup_u16 (7)); +} + +/* +** caller_u16: +** ... +** bl callee_u16 +** sub (z[2-7]\.h), z2\.h, z3\.h +** ptrue (p[0-7])\.b, all +** mad z0\.h, \2/m, (z1\.h, \1|\1, z1\.h) +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint16_t __attribute__((noipa)) +caller_u16 (void) +{ + svuint16x4_t res; + res = callee_u16 (); + return svmad_x (svptrue_b16 (), svget4 (res, 0), svget4 (res, 1), + svsub_x (svptrue_b16 (), + svget4 (res, 2), + svget4 (res, 3))); +} + +/* +** callee_f16: +** fmov z0\.h, #1\.0(?:e\+0)? +** fmov z1\.h, #2\.0(?:e\+0)? +** fmov z2\.h, #3\.0(?:e\+0)? +** fmov z3\.h, #4\.0(?:e\+0)? 
+** ret +*/ +svfloat16x4_t __attribute__((noipa)) +callee_f16 (void) +{ + return svcreate4 (svdup_f16 (1), svdup_f16 (2), + svdup_f16 (3), svdup_f16 (4)); +} + +/* +** caller_f16: +** ... +** bl callee_f16 +** fadd (z[0-9]+\.h), z0\.h, z1\.h +** fmul (z[0-9]+\.h), \1, z2\.h +** fadd z0\.h, \2, z3\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat16_t __attribute__((noipa)) +caller_f16 (void) +{ + svfloat16x4_t res; + res = callee_f16 (); + return svadd_x (svptrue_b16 (), + svmul_x (svptrue_b16 (), + svadd_x (svptrue_b16 (), svget4 (res, 0), + svget4 (res, 1)), + svget4 (res, 2)), + svget4 (res, 3)); +} + +/* +** callee_bf16: +** mov z0\.h, h4 +** mov z1\.h, h5 +** mov z2\.h, h6 +** mov z3\.h, h7 +** ret +*/ +svbfloat16x4_t __attribute__((noipa)) +callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3, + bfloat16_t h4, bfloat16_t h5, bfloat16_t h6, bfloat16_t h7) +{ + return svcreate4 (svdup_bf16 (h4), svdup_bf16 (h5), + svdup_bf16 (h6), svdup_bf16 (h7)); +} + +/* +** caller_bf16: +** ... +** bl callee_bf16 +** trn2 z0\.h, z0\.h, z3\.h +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svbfloat16_t __attribute__((noipa)) +caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3, + bfloat16_t h4, bfloat16_t h5, bfloat16_t h6, bfloat16_t h7) +{ + svbfloat16x4_t res; + res = callee_bf16 (h0, h1, h2, h3, h4, h5, h6, h7); + return svtrn2 (svget4 (res, 0), svget4 (res, 3)); +} + +/* +** callee_s32: +** mov z0\.s, #1 +** mov z1\.s, #2 +** mov z2\.s, #3 +** mov z3\.s, #4 +** ret +*/ +svint32x4_t __attribute__((noipa)) +callee_s32 (void) +{ + return svcreate4 (svdup_s32 (1), svdup_s32 (2), + svdup_s32 (3), svdup_s32 (4)); +} + +/* +** caller_s32: +** ... +** bl callee_s32 +** add (z[2-7]\.s), z2\.s, z3\.s +** ptrue (p[0-7])\.b, all +** msb z0\.s, \2/m, (z1\.s, \1|\1, z1\.s) +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint32_t __attribute__((noipa)) +caller_s32 (void) +{ + svint32x4_t res; + res = callee_s32 (); + return svmsb_x (svptrue_b32 (), svget4 (res, 0), svget4 (res, 1), + svadd_x (svptrue_b32 (), + svget4 (res, 2), + svget4 (res, 3))); +} + +/* +** callee_u32: +** mov z0\.s, #4 +** mov z1\.s, #5 +** mov z2\.s, #6 +** mov z3\.s, #7 +** ret +*/ +svuint32x4_t __attribute__((noipa)) +callee_u32 (void) +{ + return svcreate4 (svdup_u32 (4), svdup_u32 (5), + svdup_u32 (6), svdup_u32 (7)); +} + +/* +** caller_u32: +** ... +** bl callee_u32 +** sub (z[2-7]\.s), z2\.s, z3\.s +** ptrue (p[0-7])\.b, all +** msb z0\.s, \2/m, (z1\.s, \1|\1, z1\.s) +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint32_t __attribute__((noipa)) +caller_u32 (void) +{ + svuint32x4_t res; + res = callee_u32 (); + return svmsb_x (svptrue_b32 (), svget4 (res, 0), svget4 (res, 1), + svsub_x (svptrue_b32 (), + svget4 (res, 2), + svget4 (res, 3))); +} + +/* +** callee_f32: +** fmov z0\.s, #1\.0(?:e\+0)? +** fmov z1\.s, #2\.0(?:e\+0)? +** fmov z2\.s, #3\.0(?:e\+0)? +** fmov z3\.s, #4\.0(?:e\+0)? +** ret +*/ +svfloat32x4_t __attribute__((noipa)) +callee_f32 (void) +{ + return svcreate4 (svdup_f32 (1), svdup_f32 (2), + svdup_f32 (3), svdup_f32 (4)); +} + +/* +** caller_f32: +** ... 
+** bl callee_f32 +** fadd (z[0-9]+\.s), z0\.s, z1\.s +** fmul (z[0-9]+\.s), \1, z2\.s +** fadd z0\.s, \2, z3\.s +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat32_t __attribute__((noipa)) +caller_f32 (void) +{ + svfloat32x4_t res; + res = callee_f32 (); + return svadd_x (svptrue_b32 (), + svmul_x (svptrue_b32 (), + svadd_x (svptrue_b32 (), svget4 (res, 0), + svget4 (res, 1)), + svget4 (res, 2)), + svget4 (res, 3)); +} + +/* +** callee_s64: +** mov z0\.d, #1 +** mov z1\.d, #2 +** mov z2\.d, #3 +** mov z3\.d, #4 +** ret +*/ +svint64x4_t __attribute__((noipa)) +callee_s64 (void) +{ + return svcreate4 (svdup_s64 (1), svdup_s64 (2), + svdup_s64 (3), svdup_s64 (4)); +} + +/* +** caller_s64: +** ... +** bl callee_s64 +** add (z[2-7]\.d), z2\.d, z3\.d +** ptrue (p[0-7])\.b, all +** mls z0\.d, \2/m, (z1\.d, \1|\1, z1\.d) +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svint64_t __attribute__((noipa)) +caller_s64 (void) +{ + svint64x4_t res; + res = callee_s64 (); + return svmls_x (svptrue_b64 (), svget4 (res, 0), svget4 (res, 1), + svadd_x (svptrue_b64 (), + svget4 (res, 2), + svget4 (res, 3))); +} + +/* +** callee_u64: +** mov z0\.d, #4 +** mov z1\.d, #5 +** mov z2\.d, #6 +** mov z3\.d, #7 +** ret +*/ +svuint64x4_t __attribute__((noipa)) +callee_u64 (void) +{ + return svcreate4 (svdup_u64 (4), svdup_u64 (5), + svdup_u64 (6), svdup_u64 (7)); +} + +/* +** caller_u64: +** ... +** bl callee_u64 +** sub (z[2-7]\.d), z2\.d, z3\.d +** ptrue (p[0-7])\.b, all +** mls z0\.d, \2/m, (z1\.d, \1|\1, z1\.d) +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svuint64_t __attribute__((noipa)) +caller_u64 (void) +{ + svuint64x4_t res; + res = callee_u64 (); + return svmls_x (svptrue_b64 (), svget4 (res, 0), svget4 (res, 1), + svsub_x (svptrue_b64 (), + svget4 (res, 2), + svget4 (res, 3))); +} + +/* +** callee_f64: +** fmov z0\.d, #1\.0(?:e\+0)? +** fmov z1\.d, #2\.0(?:e\+0)? +** fmov z2\.d, #3\.0(?:e\+0)? +** fmov z3\.d, #4\.0(?:e\+0)? +** ret +*/ +svfloat64x4_t __attribute__((noipa)) +callee_f64 (void) +{ + return svcreate4 (svdup_f64 (1), svdup_f64 (2), + svdup_f64 (3), svdup_f64 (4)); +} + +/* +** caller_f64: +** ... 
+** bl callee_f64 +** fadd (z[0-9]+\.d), z0\.d, z1\.d +** fmul (z[0-9]+\.d), \1, z2\.d +** fadd z0\.d, \2, z3\.d +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svfloat64_t __attribute__((noipa)) +caller_f64 (void) +{ + svfloat64x4_t res; + res = callee_f64 (); + return svadd_x (svptrue_b64 (), + svmul_x (svptrue_b64 (), + svadd_x (svptrue_b64 (), svget4 (res, 0), + svget4 (res, 1)), + svget4 (res, 2)), + svget4 (res, 3)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_nowrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_nowrap.c new file mode 100644 index 000000000..4eee04226 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_nowrap.c @@ -0,0 +1,196 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mbig-endian -fno-shrink-wrap -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p1\.b, all +** st1d z8\.d, p1, \[sp, #1, mul vl\] +** st1d z9\.d, p1, \[sp, #2, mul vl\] +** st1d z10\.d, p1, \[sp, #3, mul vl\] +** st1d z11\.d, p1, \[sp, #4, mul vl\] +** st1d z12\.d, p1, \[sp, #5, mul vl\] +** st1d z13\.d, p1, \[sp, #6, mul vl\] +** st1d z14\.d, p1, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p1, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, all +** ptrue p1\.b, all +** ld1d z8\.d, p1/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p1/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p1/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p1/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p1/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p1/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p1/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p1/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, all +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** addvl sp, sp, #-6 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, 
\[sp, #2, mul vl\] +** ptrue p1\.b, all +** st1d z8\.d, p1, \[sp, #1, mul vl\] +** st1d z13\.d, p1, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, all +** ptrue p1\.b, all +** ld1d z8\.d, p1/z, \[sp, #1, mul vl\] +** ld1d z13\.d, p1/z, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** addvl sp, sp, #6 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", "z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** addvl sp, sp, #-1 +** ptrue p1\.b, all +** st1d z15\.d, p1, \[sp\] +** ptrue p0\.b, all +** ptrue p1\.b, all +** ld1d z15\.d, p1/z, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** addvl sp, sp, #-2 +** str p4, \[sp\] +** ptrue p4\.b, all +** st1d z15\.d, p4, \[sp, #1, mul vl\] +** mov z0\.b, #1 +** ptrue p4\.b, all +** ld1d z15\.d, p4/z, \[sp, #1, mul vl\] +** ldr p4, \[sp\] +** addvl sp, sp, #2 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** addvl sp, sp, #-1 +** str z16, \[sp\] +** ptrue p0\.b, all +** ldr z16, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_wrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_wrap.c new file mode 100644 index 000000000..e88a3dd1d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_be_wrap.c @@ -0,0 +1,196 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mbig-endian -fshrink-wrap -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p1\.b, all +** st1d z8\.d, p1, \[sp, #1, mul vl\] +** st1d z9\.d, p1, \[sp, #2, mul vl\] +** st1d z10\.d, p1, \[sp, #3, mul vl\] +** st1d z11\.d, p1, \[sp, #4, mul vl\] +** st1d z12\.d, p1, \[sp, #5, mul vl\] +** st1d z13\.d, p1, \[sp, #6, mul vl\] +** st1d z14\.d, p1, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p1, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, all +** ptrue p1\.b, all +** ld1d z8\.d, p1/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p1/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p1/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p1/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p1/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p1/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p1/z, \[sp, #7, mul vl\] 
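The saves_1_* variants differ only in endianness and in -fshrink-wrap versus -fno-shrink-wrap; in each case the expected body allocates the save area with addvl, stores P4-P11 and Z8-Z23, and restores them before the ret. The trigger in these tests is simply an asm statement that clobbers registers the SVE PCS treats as callee-saved, along the lines of this reduced sketch (function name illustrative):

/* Reduced sketch: clobbering z8 and p5, which the tests expect to be
   preserved across the function, forces the addvl/str ... ldr/addvl
   save and restore sequence checked by test_3 and test_4 above.  */
#include <arm_sve.h>

svbool_t
clobber_some (void)
{
  asm volatile ("" ::: "z8", "p5");
  return svptrue_b8 ();
}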
+** addvl x11, sp, #16 +** ld1d z15\.d, p1/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, all +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** addvl sp, sp, #-6 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, \[sp, #2, mul vl\] +** ptrue p1\.b, all +** st1d z8\.d, p1, \[sp, #1, mul vl\] +** st1d z13\.d, p1, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, all +** ptrue p1\.b, all +** ld1d z8\.d, p1/z, \[sp, #1, mul vl\] +** ld1d z13\.d, p1/z, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** addvl sp, sp, #6 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", "z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** addvl sp, sp, #-1 +** ptrue p1\.b, all +** st1d z15\.d, p1, \[sp\] +** ptrue p0\.b, all +** ptrue p1\.b, all +** ld1d z15\.d, p1/z, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** addvl sp, sp, #-2 +** str p4, \[sp\] +** ptrue p4\.b, all +** st1d z15\.d, p4, \[sp, #1, mul vl\] +** mov z0\.b, #1 +** ptrue p4\.b, all +** ld1d z15\.d, p4/z, \[sp, #1, mul vl\] +** ldr p4, \[sp\] +** addvl sp, sp, #2 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** addvl sp, sp, #-1 +** str z16, \[sp\] +** ptrue p0\.b, all +** ldr z16, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_nowrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_nowrap.c new file mode 100644 index 000000000..d14cd79b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_nowrap.c @@ -0,0 +1,184 @@ +/* { dg-do compile } */ +/* { 
dg-options "-O -mlittle-endian -fno-shrink-wrap -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, all +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, all +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** addvl sp, sp, #-6 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, \[sp, #2, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z13, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, all +** ldr z8, \[sp, #1, mul vl\] +** ldr z13, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** addvl sp, sp, #6 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", "z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** addvl sp, sp, #-1 +** str z15, \[sp\] +** ptrue p0\.b, all +** ldr z15, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t 
+test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** addvl sp, sp, #-1 +** str z15, \[sp\] +** mov z0\.b, #1 +** ldr z15, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** addvl sp, sp, #-1 +** str z16, \[sp\] +** ptrue p0\.b, all +** ldr z16, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_wrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_wrap.c new file mode 100644 index 000000000..d81dd8e6b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_1_le_wrap.c @@ -0,0 +1,184 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fshrink-wrap -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, all +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, all +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** addvl sp, sp, #-6 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, \[sp, #2, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str 
z13, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, all +** ldr z8, \[sp, #1, mul vl\] +** ldr z13, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** addvl sp, sp, #6 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", "z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** addvl sp, sp, #-1 +** str p4, \[sp\] +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** addvl sp, sp, #-1 +** str z15, \[sp\] +** ptrue p0\.b, all +** ldr z15, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** addvl sp, sp, #-1 +** str z15, \[sp\] +** mov z0\.b, #1 +** ldr z15, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** addvl sp, sp, #-1 +** str z16, \[sp\] +** ptrue p0\.b, all +** ldr z16, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_nowrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_nowrap.c new file mode 100644 index 000000000..05aa18b3c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_nowrap.c @@ -0,0 +1,271 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mbig-endian -fno-shrink-wrap -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +void standard_callee (void); +__attribute__((aarch64_vector_pcs)) void vpcs_callee (void); + +/* +** calls_standard: +** stp x29, x30, \[sp, -16\]! 
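saves_2_* then looks at calls made from a function that takes an SVE argument: whether the callee uses the standard PCS or aarch64_vector_pcs, and whether it is called directly or through a pointer, the expected bodies spill P4-P11 and Z8-Z23 before the bl/blr and reload them afterwards, presumably because the callers of such a function may rely on those registers being preserved while the callees here are not assumed to preserve full SVE state. A reduced sketch of the code shape (the callee name is illustrative):

/* Reduced sketch: the __SVInt8_t parameter brings SVE state into the
   function's interface, so the call below is expected to be wrapped in
   the same save/restore sequence that the calls_standard pattern above
   describes.  */
void ordinary_callee (void);

void
calls_out (__SVInt8_t x)
{
  ordinary_callee ();
}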
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** bl standard_callee +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void calls_standard (__SVInt8_t x) { standard_callee (); } + +/* +** calls_vpcs: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** bl vpcs_callee +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void calls_vpcs (__SVInt8_t x) { vpcs_callee (); } + +/* +** calls_standard_ptr: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** blr x0 +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void +calls_standard_ptr (__SVInt8_t x, void (*fn) (void)) +{ + fn (); +} + +/* +** calls_vpcs_ptr: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** blr x0 +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void +calls_vpcs_ptr (__SVInt8_t x, + void (*__attribute__((aarch64_vector_pcs)) fn) (void)) +{ + fn (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_wrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_wrap.c new file mode 100644 index 000000000..85b7794d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_be_wrap.c @@ -0,0 +1,271 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mbig-endian -fshrink-wrap -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +void standard_callee (void); +__attribute__((aarch64_vector_pcs)) void vpcs_callee (void); + +/* +** calls_standard: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** bl standard_callee +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void calls_standard (__SVInt8_t x) { standard_callee (); } + +/* +** calls_vpcs: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** bl vpcs_callee +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void calls_vpcs (__SVInt8_t x) { vpcs_callee (); } + +/* +** calls_standard_ptr: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** blr x0 +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void +calls_standard_ptr (__SVInt8_t x, void (*fn) (void)) +{ + fn (); +} + +/* +** calls_vpcs_ptr: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** blr x0 +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void +calls_vpcs_ptr (__SVInt8_t x, + void (*__attribute__((aarch64_vector_pcs)) fn) (void)) +{ + fn (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_nowrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_nowrap.c new file mode 100644 index 000000000..0fcd357a0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_nowrap.c @@ -0,0 +1,255 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fno-shrink-wrap -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +void standard_callee (void); +__attribute__((aarch64_vector_pcs)) void vpcs_callee (void); + +/* +** calls_standard: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** bl standard_callee +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void calls_standard (__SVInt8_t x) { standard_callee (); } + +/* +** calls_vpcs: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** bl vpcs_callee +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void calls_vpcs (__SVInt8_t x) { vpcs_callee (); } + +/* +** calls_standard_ptr: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** blr x0 +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void +calls_standard_ptr (__SVInt8_t x, void (*fn) (void)) +{ + fn (); +} + +/* +** calls_vpcs_ptr: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** blr x0 +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void +calls_vpcs_ptr (__SVInt8_t x, + void (*__attribute__((aarch64_vector_pcs)) fn) (void)) +{ + fn (); +} diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_wrap.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_wrap.c new file mode 100644 index 000000000..e81194c74 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_2_le_wrap.c @@ -0,0 +1,255 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fshrink-wrap -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +void standard_callee (void); +__attribute__((aarch64_vector_pcs)) void vpcs_callee (void); + +/* +** calls_standard: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** bl standard_callee +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void calls_standard (__SVInt8_t x) { standard_callee (); } + +/* +** calls_vpcs: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** bl vpcs_callee +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void calls_vpcs (__SVInt8_t x) { vpcs_callee (); } + +/* +** calls_standard_ptr: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** blr x0 +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void +calls_standard_ptr (__SVInt8_t x, void (*fn) (void)) +{ + fn (); +} + +/* +** calls_vpcs_ptr: +** stp x29, x30, \[sp, -16\]! 
+** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** blr x0 +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +void +calls_vpcs_ptr (__SVInt8_t x, + void (*__attribute__((aarch64_vector_pcs)) fn) (void)) +{ + fn (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_3.c new file mode 100644 index 000000000..1fe86b0ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_3.c @@ -0,0 +1,92 @@ +/* { dg-do compile } */ +/* { dg-options "-O -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include <arm_sve.h> + +int sve_callee (svint8_t); + +/* +** standard_caller: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** mov z0\.b, #1 +** bl sve_callee +** add w0, w0, #?1 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +int standard_caller (void) { return sve_callee (svdup_s8 (1)) + 1; } + +/* +** vpcs_caller: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** mov z0\.b, #1 +** bl sve_callee +** add w0, w0, #?1 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +__attribute__((aarch64_vector_pcs)) +int vpcs_caller (void) { return sve_callee (svdup_s8 (1)) + 1; } + +/* +** sve_caller: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** mov z0\.b, #1 +** bl sve_callee +** add w0, w0, #?1 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +int sve_caller (svbool_t p0) { return sve_callee (svdup_s8 (1)) + 1; } + +/* +** standard_caller_ptr: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** mov z0\.h, #1 +** blr x0 +** add w0, w0, #?1 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +int +standard_caller_ptr (int (*fn) (__SVInt16_t)) +{ + return fn (svdup_s16 (1)) + 1; +} + +/* +** vpcs_caller_ptr: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** mov z0\.h, #1 +** blr x0 +** add w0, w0, #?1 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +int __attribute__((aarch64_vector_pcs)) +vpcs_caller_ptr (int (*fn) (__SVInt16_t)) +{ + return fn (svdup_s16 (1)) + 1; +} + +/* +** sve_caller_ptr: +** stp x29, x30, \[sp, -16\]!
+** mov x29, sp +** mov z0\.h, #1 +** blr x0 +** add w0, w0, #?1 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +int +sve_caller_ptr (svbool_t pg, int (*fn) (svint16_t)) +{ + return fn (svdup_s16 (1)) + 1; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_be.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_be.c new file mode 100644 index 000000000..c42699dc7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_be.c @@ -0,0 +1,84 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +void standard_callee (__SVInt8_t *); + +/* +** calls_standard: +** addvl sp, sp, #-1 +** ( +** stp x29, x30, \[sp, -16\]! +** | +** sub sp, sp, #?16 +** stp x29, x30, \[sp\] +** ) +** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** addvl x0, sp, #17 +** add x0, x0, #?16 +** bl standard_callee +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ( +** ldp x29, x30, \[sp\], 16 +** addvl sp, sp, #1 +** | +** ldp x29, x30, \[sp\] +** addvl sp, sp, #1 +** add sp, sp, #?16 +** ) +** ret +*/ +void calls_standard (__SVInt8_t x) { __SVInt8_t tmp; standard_callee (&tmp); } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_le.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_le.c new file mode 100644 index 000000000..49fe96800 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_4_le.c @@ -0,0 +1,80 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +void standard_callee (__SVInt8_t *); + +/* +** calls_standard: +** addvl sp, sp, #-1 +** ( +** stp x29, x30, \[sp, -16\]! 
+** | +** sub sp, sp, #?16 +** stp x29, x30, \[sp\] +** ) +** mov x29, sp +** addvl sp, sp, #-17 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** addvl x0, sp, #17 +** add x0, x0, #?16 +** bl standard_callee +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ( +** ldp x29, x30, \[sp\], 16 +** addvl sp, sp, #1 +** | +** ldp x29, x30, \[sp\] +** addvl sp, sp, #1 +** add sp, sp, #?16 +** ) +** ret +*/ +void calls_standard (__SVInt8_t x) { __SVInt8_t tmp; standard_callee (&tmp); } diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_be.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_be.c new file mode 100644 index 000000000..dc3282eee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_be.c @@ -0,0 +1,78 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +void standard_callee (void); + +/* +** calls_standard: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** addvl sp, sp, #-17 +** ptrue p0\.b, all +** st1d z8\.d, p0, \[sp, #1, mul vl\] +** st1d z9\.d, p0, \[sp, #2, mul vl\] +** st1d z10\.d, p0, \[sp, #3, mul vl\] +** st1d z11\.d, p0, \[sp, #4, mul vl\] +** st1d z12\.d, p0, \[sp, #5, mul vl\] +** st1d z13\.d, p0, \[sp, #6, mul vl\] +** st1d z14\.d, p0, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** st1d z15\.d, p0, \[x11, #-8, mul vl\] +** cbnz w0, \.L[0-9]+ +** ptrue p0\.b, all +** ld1d z8\.d, p0/z, \[sp, #1, mul vl\] +** ld1d z9\.d, p0/z, \[sp, #2, mul vl\] +** ld1d z10\.d, p0/z, \[sp, #3, mul vl\] +** ld1d z11\.d, p0/z, \[sp, #4, mul vl\] +** ld1d z12\.d, p0/z, \[sp, #5, mul vl\] +** ld1d z13\.d, p0/z, \[sp, #6, mul vl\] +** ld1d z14\.d, p0/z, \[sp, #7, mul vl\] +** addvl x11, sp, #16 +** ld1d z15\.d, p0/z, \[x11, #-8, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +** ... 
+** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** bl standard_callee +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** b \.L[0-9]+ +*/ +void +calls_standard (__SVInt8_t x, int y) +{ + asm volatile ("" ::: "z8"); + if (__builtin_expect (y, 0)) + standard_callee (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_le.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_le.c new file mode 100644 index 000000000..0d29ff2fd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/saves_5_le.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +void standard_callee (void); + +/* +** calls_standard: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** addvl sp, sp, #-17 +** str z8, \[sp, #1, mul vl\] +** cbnz w0, \.L[0-9]+ +** ldr z8, \[sp, #1, mul vl\] +** addvl sp, sp, #17 +** ldp x29, x30, \[sp\], 16 +** ret +** ... 
+** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** bl standard_callee +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** b \.L[0-9]+ +*/ +void +calls_standard (__SVInt8_t x, int y) +{ + asm volatile ("" ::: "z8"); + if (__builtin_expect (y, 0)) + standard_callee (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c new file mode 100644 index 000000000..485d01875 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c @@ -0,0 +1,204 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** cntb x12 +** mov x13, #?17 +** mul x12, x12, x13 +** mov x11, sp +** ... 
+** sub sp, sp, x12 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, all +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** addvl sp, sp, #17 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, all +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** cntb x12, all, mul #6 +** mov x11, sp +** ... +** sub sp, sp, x12 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, \[sp, #2, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z13, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, all +** ldr z8, \[sp, #1, mul vl\] +** ldr z13, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** addvl sp, sp, #6 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", "z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** cntb x12 +** mov x11, sp +** ... +** sub sp, sp, x12 +** str p4, \[sp\] +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** cntb x12 +** mov x11, sp +** ... 
+** sub sp, sp, x12 +** str z15, \[sp\] +** ptrue p0\.b, all +** ldr z15, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** cntb x12 +** mov x11, sp +** ... +** sub sp, sp, x12 +** str z15, \[sp\] +** mov z0\.b, #1 +** ldr z15, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** cntb x12 +** mov x11, sp +** ... +** sub sp, sp, x12 +** str z16, \[sp\] +** ptrue p0\.b, all +** ldr z16, \[sp\] +** addvl sp, sp, #1 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_1024.c new file mode 100644 index 000000000..087e8db9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_1024.c @@ -0,0 +1,184 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -msve-vector-bits=1024 -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** sub sp, sp, #2176 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, vl128 +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** add sp, sp, #?2176 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, vl128 +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", 
"p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** sub sp, sp, #768 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, \[sp, #2, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z13, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, vl128 +** ldr z8, \[sp, #1, mul vl\] +** ldr z13, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** add sp, sp, #?768 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", "z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** sub sp, sp, #128 +** str p4, \[sp\] +** ptrue p0\.b, vl128 +** ldr p4, \[sp\] +** add sp, sp, #?128 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** sub sp, sp, #128 +** str z15, \[sp\] +** ptrue p0\.b, vl128 +** ldr z15, \[sp\] +** add sp, sp, #?128 +** ret +*/ +svbool_t +test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** sub sp, sp, #128 +** str z15, \[sp\] +** mov z0\.b, #1 +** ldr z15, \[sp\] +** add sp, sp, #?128 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** sub sp, sp, #128 +** str z16, \[sp\] +** ptrue p0\.b, vl128 +** ldr z16, \[sp\] +** add sp, sp, #?128 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_2048.c new file mode 100644 index 000000000..e8dc5d5e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_2048.c @@ -0,0 +1,185 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -msve-vector-bits=2048 -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** mov x12, #?4352 +** sub sp, sp, x12 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, vl256 +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** 
ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** add sp, sp, x12 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, vl256 +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** sub sp, sp, #1536 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, \[sp, #2, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z13, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, vl256 +** ldr z8, \[sp, #1, mul vl\] +** ldr z13, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** add sp, sp, #?1536 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", "z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** sub sp, sp, #256 +** str p4, \[sp\] +** ptrue p0\.b, vl256 +** ldr p4, \[sp\] +** add sp, sp, #?256 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** sub sp, sp, #256 +** str z15, \[sp\] +** ptrue p0\.b, vl256 +** ldr z15, \[sp\] +** add sp, sp, #?256 +** ret +*/ +svbool_t +test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** sub sp, sp, #256 +** str z15, \[sp\] +** mov z0\.b, #1 +** ldr z15, \[sp\] +** add sp, sp, #?256 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** sub sp, sp, #256 +** str z16, \[sp\] +** ptrue p0\.b, vl256 +** ldr z16, \[sp\] +** add sp, sp, #?256 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_256.c new file mode 100644 index 000000000..73c49e4d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_256.c @@ -0,0 +1,184 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -msve-vector-bits=256 -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** sub sp, sp, #544 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, 
#2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, vl32 +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** add sp, sp, #?544 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, vl32 +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** sub sp, sp, #192 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, \[sp, #2, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z13, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, vl32 +** ldr z8, \[sp, #1, mul vl\] +** ldr z13, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** add sp, sp, #?192 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", "z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** sub sp, sp, #32 +** str p4, \[sp\] +** ptrue p0\.b, vl32 +** ldr p4, \[sp\] +** add sp, sp, #?32 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** sub sp, sp, #32 +** str z15, \[sp\] +** ptrue p0\.b, vl32 +** ldr z15, \[sp\] +** add sp, sp, #?32 +** ret +*/ +svbool_t +test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** sub sp, sp, #32 +** str z15, \[sp\] +** mov z0\.b, #1 +** ldr z15, \[sp\] +** add sp, sp, #?32 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** sub sp, sp, #32 +** str z16, \[sp\] +** ptrue p0\.b, vl32 +** ldr z16, \[sp\] +** add sp, sp, 
#?32 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_512.c new file mode 100644 index 000000000..d4b524147 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1_512.c @@ -0,0 +1,184 @@ +/* { dg-do compile } */ +/* { dg-options "-O -mlittle-endian -fshrink-wrap -fstack-clash-protection -msve-vector-bits=512 -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** sub sp, sp, #1088 +** str p4, \[sp\] +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** str p7, \[sp, #3, mul vl\] +** str p8, \[sp, #4, mul vl\] +** str p9, \[sp, #5, mul vl\] +** str p10, \[sp, #6, mul vl\] +** str p11, \[sp, #7, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z9, \[sp, #2, mul vl\] +** str z10, \[sp, #3, mul vl\] +** str z11, \[sp, #4, mul vl\] +** str z12, \[sp, #5, mul vl\] +** str z13, \[sp, #6, mul vl\] +** str z14, \[sp, #7, mul vl\] +** str z15, \[sp, #8, mul vl\] +** str z16, \[sp, #9, mul vl\] +** str z17, \[sp, #10, mul vl\] +** str z18, \[sp, #11, mul vl\] +** str z19, \[sp, #12, mul vl\] +** str z20, \[sp, #13, mul vl\] +** str z21, \[sp, #14, mul vl\] +** str z22, \[sp, #15, mul vl\] +** str z23, \[sp, #16, mul vl\] +** ptrue p0\.b, vl64 +** ldr z8, \[sp, #1, mul vl\] +** ldr z9, \[sp, #2, mul vl\] +** ldr z10, \[sp, #3, mul vl\] +** ldr z11, \[sp, #4, mul vl\] +** ldr z12, \[sp, #5, mul vl\] +** ldr z13, \[sp, #6, mul vl\] +** ldr z14, \[sp, #7, mul vl\] +** ldr z15, \[sp, #8, mul vl\] +** ldr z16, \[sp, #9, mul vl\] +** ldr z17, \[sp, #10, mul vl\] +** ldr z18, \[sp, #11, mul vl\] +** ldr z19, \[sp, #12, mul vl\] +** ldr z20, \[sp, #13, mul vl\] +** ldr z21, \[sp, #14, mul vl\] +** ldr z22, \[sp, #15, mul vl\] +** ldr z23, \[sp, #16, mul vl\] +** ldr p4, \[sp\] +** ldr p5, \[sp, #1, mul vl\] +** ldr p6, \[sp, #2, mul vl\] +** ldr p7, \[sp, #3, mul vl\] +** ldr p8, \[sp, #4, mul vl\] +** ldr p9, \[sp, #5, mul vl\] +** ldr p10, \[sp, #6, mul vl\] +** ldr p11, \[sp, #7, mul vl\] +** add sp, sp, #?1088 +** ret +*/ +svbool_t +test_1 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_2: +** ptrue p0\.b, vl64 +** ret +*/ +svbool_t +test_2 (void) +{ + asm volatile ("" ::: + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", + "p0", "p1", "p2", "p3", "p12", "p13", "p14", "p15"); + return svptrue_b8 (); +} + +/* +** test_3: +** sub sp, sp, #384 +** str p5, \[sp\] +** str p6, \[sp, #1, mul vl\] +** str p11, \[sp, #2, mul vl\] +** str z8, \[sp, #1, mul vl\] +** str z13, \[sp, #2, mul vl\] +** str z19, \[sp, #3, mul vl\] +** str z20, \[sp, #4, mul vl\] +** str z22, \[sp, #5, mul vl\] +** ptrue p0\.b, vl64 +** ldr z8, \[sp, #1, mul vl\] +** ldr z13, \[sp, #2, mul vl\] +** ldr z19, \[sp, #3, mul vl\] +** ldr z20, \[sp, #4, mul vl\] +** ldr z22, \[sp, #5, mul vl\] +** ldr p5, \[sp\] +** ldr p6, \[sp, #1, mul vl\] +** ldr p11, \[sp, #2, mul vl\] +** add sp, sp, #?384 +** ret +*/ +svbool_t +test_3 (void) +{ + asm volatile ("" ::: + "z8", 
"z13", "z19", "z20", "z22", + "p5", "p6", "p11"); + return svptrue_b8 (); +} + +/* +** test_4: +** sub sp, sp, #64 +** str p4, \[sp\] +** ptrue p0\.b, vl64 +** ldr p4, \[sp\] +** add sp, sp, #?64 +** ret +*/ +svbool_t +test_4 (void) +{ + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_5: +** sub sp, sp, #64 +** str z15, \[sp\] +** ptrue p0\.b, vl64 +** ldr z15, \[sp\] +** add sp, sp, #?64 +** ret +*/ +svbool_t +test_5 (void) +{ + asm volatile ("" ::: "z15"); + return svptrue_b8 (); +} + +/* +** test_6: +** sub sp, sp, #64 +** str z15, \[sp\] +** mov z0\.b, #1 +** ldr z15, \[sp\] +** add sp, sp, #?64 +** ret +*/ +svint8_t +test_6 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3) +{ + asm volatile ("" :: "Upa" (p0), "Upa" (p1), "Upa" (p2), "Upa" (p3) : "z15"); + return svdup_s8 (1); +} + +/* +** test_7: +** sub sp, sp, #64 +** str z16, \[sp\] +** ptrue p0\.b, vl64 +** ldr z16, \[sp\] +** add sp, sp, #?64 +** ret +*/ +svbool_t +test_7 (void) +{ + asm volatile ("" ::: "z16"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c new file mode 100644 index 000000000..4622a1eed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c @@ -0,0 +1,336 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +svbool_t take_stack_args (volatile void *, void *, int, int, int, + int, int, int, int); + +/* +** test_1: +** cntb x12 +** add x12, x12, #?16 +** mov x11, sp +** ... +** sub sp, sp, x12 +** str p4, \[sp\] +** ... +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** add sp, sp, #?16 +** ret +*/ +svbool_t +test_1 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_2: +** stp x24, x25, \[sp, -48\]! +** str x26, \[sp, 16\] +** cntb x13 +** mov x11, sp +** ... +** sub sp, sp, x13 +** str p4, \[sp\] +** ... +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldr x26, \[sp, 16\] +** ldp x24, x25, \[sp\], 48 +** ret +*/ +svbool_t +test_2 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_3: +** cntb x12 +** mov x13, #?4128 +** add x12, x12, x13 +** mov x11, sp +** ... +** sub sp, sp, x12 +** addvl x11, sp, #1 +** stp x24, x25, \[x11\] +** str x26, \[x11, 16\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldp x24, x25, \[sp\] +** ldr x26, \[sp, 16\] +** mov x12, #?4128 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_3 (void) +{ + volatile int x[1024]; + asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_4: +** cntb x12, all, mul #2 +** mov x11, sp +** ... +** sub sp, sp, x12 +** str p4, \[sp\] +** ... +** ptrue p0\.h, all +** ldr p4, \[sp\] +** addvl sp, sp, #2 +** ret +*/ +svbool_t +test_4 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4"); + return svptrue_b16 (); +} + +/* +** test_5: +** cntb x12, all, mul #2 +** add x12, x12, #?32 +** mov x11, sp +** ... +** sub sp, sp, x12 +** addvl x11, sp, #1 +** stp x24, x25, \[x11\] +** str x26, \[x11, 16\] +** str p4, \[sp\] +** ... 
+** ptrue p0\.h, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldp x24, x25, \[sp\] +** ldr x26, \[sp, 16\] +** addvl sp, sp, #1 +** add sp, sp, #?32 +** ret +*/ +svbool_t +test_5 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b16 (); +} + +/* +** test_6: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** cntb x13 +** mov x11, sp +** ... +** sub sp, sp, x13 +** str p4, \[sp\] +** sub sp, sp, #?16 +** ... +** ptrue p0\.b, all +** add sp, sp, #?16 +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svbool_t +test_6 (void) +{ + take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_7: +** cntb x12 +** mov x13, #?4112 +** add x12, x12, x13 +** mov x11, sp +** ... +** sub sp, sp, x12 +** addvl x11, sp, #1 +** stp x29, x30, \[x11\] +** addvl x29, sp, #1 +** str p4, \[sp\] +** sub sp, sp, #?16 +** ... +** ptrue p0\.b, all +** add sp, sp, #?16 +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_7 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_8: +** cntb x12 +** mov x13, #?4144 +** add x12, x12, x13 +** mov x11, sp +** ... +** sub sp, sp, x12 +** addvl x11, sp, #1 +** stp x29, x30, \[x11\] +** addvl x29, sp, #1 +** stp x24, x25, \[x29, 16\] +** str x26, \[x29, 32\] +** str p4, \[sp\] +** sub sp, sp, #?16 +** ... +** ptrue p0\.b, all +** add sp, sp, #?16 +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_8 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_9: +** cntb x12 +** mov x13, #?4112 +** add x12, x12, x13 +** mov x11, sp +** ... +** sub sp, sp, x12 +** addvl x11, sp, #1 +** stp x29, x30, \[x11\] +** addvl x29, sp, #1 +** str p4, \[sp\] +** sub sp, sp, #?16 +** ... +** ptrue p0\.b, all +** addvl sp, x29, #-1 +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_9 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_10: +** cntb x12 +** mov x13, #?4144 +** add x12, x12, x13 +** mov x11, sp +** ... +** sub sp, sp, x12 +** addvl x11, sp, #1 +** stp x29, x30, \[x11\] +** addvl x29, sp, #1 +** stp x24, x25, \[x29, 16\] +** str x26, \[x29, 32\] +** str p4, \[sp\] +** sub sp, sp, #?16 +** ... +** ptrue p0\.b, all +** addvl sp, x29, #-1 +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_10 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_11: +** cntb x12 +** add x12, x12, #?3008 +** add x12, x12, #?126976 +** mov x11, sp +** ... +** sub sp, sp, x12 +** addvl x11, sp, #1 +** stp x29, x30, \[x11\] +** addvl x29, sp, #1 +** stp x24, x25, \[x29, 16\] +** str x26, \[x29, 32\] +** str p4, \[sp\] +** sub sp, sp, #?16 +** ... 
+** ptrue p0\.b, all +** addvl sp, x29, #-1 +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** add sp, sp, #?3008 +** add sp, sp, #?126976 +** ret +*/ +svbool_t +test_11 (int n) +{ + volatile int x[0x7ee4]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c new file mode 100644 index 000000000..d5a9d4444 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c @@ -0,0 +1,285 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -msve-vector-bits=1024 -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +svbool_t take_stack_args (volatile void *, void *, int, int, int, + int, int, int, int); + +/* +** test_1: +** sub sp, sp, #144 +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl128 +** ldr p4, \[sp\] +** add sp, sp, #?144 +** ret +*/ +svbool_t +test_1 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_2: +** sub sp, sp, #176 +** stp x24, x25, \[sp, 128\] +** str x26, \[sp, 144\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl128 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 128\] +** ldr x26, \[sp, 144\] +** add sp, sp, #?176 +** ret +*/ +svbool_t +test_2 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_3: +** mov x12, #?4256 +** sub sp, sp, x12 +** stp x24, x25, \[sp, 128\] +** str x26, \[sp, 144\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl128 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 128\] +** ldr x26, \[sp, 144\] +** add sp, sp, x12 +** ret +*/ +svbool_t +test_3 (void) +{ + volatile int x[1024]; + asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_4: +** sub sp, sp, #256 +** str p4, \[sp\] +** ... +** ptrue p0\.h, vl64 +** ldr p4, \[sp\] +** add sp, sp, #?256 +** ret +*/ +svbool_t +test_4 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4"); + return svptrue_b16 (); +} + +/* +** test_5: +** sub sp, sp, #288 +** stp x24, x25, \[sp, 128\] +** str x26, \[sp, 144\] +** str p4, \[sp\] +** ... +** ptrue p0\.h, vl64 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 128\] +** ldr x26, \[sp, 144\] +** add sp, sp, #?288 +** ret +*/ +svbool_t +test_5 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b16 (); +} + +/* +** test_6: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** sub sp, sp, #128 +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl128 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?128 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svbool_t +test_6 (void) +{ + take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_7: +** mov x12, #?4240 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 128\] +** add x29, sp, #?128 +** str p4, \[sp\] +** sub sp, sp, #16 +** ... 
+** ptrue p0\.b, vl128 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?128 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_7 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_8: +** mov x12, #?4272 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 128\] +** add x29, sp, #?128 +** stp x24, x25, \[sp, 144\] +** str x26, \[sp, 160\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl128 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?128 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_8 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_9: +** mov x12, #?4240 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 128\] +** add x29, sp, #?128 +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl128 +** sub sp, x29, #128 +** ldr p4, \[sp\] +** add sp, sp, #?128 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_9 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_10: +** mov x12, #?4272 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 128\] +** add x29, sp, #?128 +** stp x24, x25, \[sp, 144\] +** str x26, \[sp, 160\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl128 +** sub sp, x29, #128 +** ldr p4, \[sp\] +** add sp, sp, #?128 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_10 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_11: +** sub sp, sp, #65536 +** str xzr, \[sp, 1024\] +** mov x12, #?64576 +** sub sp, sp, x12 +** str xzr, \[sp, 1024\] +** stp x29, x30, \[sp, 128\] +** add x29, sp, #?128 +** stp x24, x25, \[sp, 144\] +** str x26, \[sp, 160\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl128 +** sub sp, x29, #128 +** ldr p4, \[sp\] +** add sp, sp, #?128 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** add sp, sp, #?3008 +** add sp, sp, #?126976 +** ret +*/ +svbool_t +test_11 (int n) +{ + volatile int x[0x7ee4]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c new file mode 100644 index 000000000..c185e2e36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c @@ -0,0 +1,285 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -msve-vector-bits=2048 -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +svbool_t take_stack_args (volatile void *, void *, int, int, int, + int, int, int, int); + +/* +** test_1: +** sub sp, sp, #272 +** str p4, \[sp\] +** ... 
+** ptrue p0\.b, vl256 +** ldr p4, \[sp\] +** add sp, sp, #?272 +** ret +*/ +svbool_t +test_1 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_2: +** sub sp, sp, #304 +** stp x24, x25, \[sp, 256\] +** str x26, \[sp, 272\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl256 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 256\] +** ldr x26, \[sp, 272\] +** add sp, sp, #?304 +** ret +*/ +svbool_t +test_2 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_3: +** mov x12, #?4384 +** sub sp, sp, x12 +** stp x24, x25, \[sp, 256\] +** str x26, \[sp, 272\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl256 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 256\] +** ldr x26, \[sp, 272\] +** add sp, sp, x12 +** ret +*/ +svbool_t +test_3 (void) +{ + volatile int x[1024]; + asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_4: +** sub sp, sp, #512 +** str p4, \[sp\] +** ... +** ptrue p0\.h, vl128 +** ldr p4, \[sp\] +** add sp, sp, #?512 +** ret +*/ +svbool_t +test_4 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4"); + return svptrue_b16 (); +} + +/* +** test_5: +** sub sp, sp, #544 +** stp x24, x25, \[sp, 256\] +** str x26, \[sp, 272\] +** str p4, \[sp\] +** ... +** ptrue p0\.h, vl128 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 256\] +** ldr x26, \[sp, 272\] +** add sp, sp, #?544 +** ret +*/ +svbool_t +test_5 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b16 (); +} + +/* +** test_6: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** sub sp, sp, #256 +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl256 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?256 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svbool_t +test_6 (void) +{ + take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_7: +** mov x12, #?4368 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 256\] +** add x29, sp, #?256 +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl256 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?256 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_7 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_8: +** mov x12, #?4400 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 256\] +** add x29, sp, #?256 +** stp x24, x25, \[sp, 272\] +** str x26, \[sp, 288\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl256 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?256 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_8 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_9: +** mov x12, #?4368 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 256\] +** add x29, sp, #?256 +** str p4, \[sp\] +** sub sp, sp, #16 +** ... 
+** ptrue p0\.b, vl256 +** sub sp, x29, #256 +** ldr p4, \[sp\] +** add sp, sp, #?256 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_9 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_10: +** mov x12, #?4400 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 256\] +** add x29, sp, #?256 +** stp x24, x25, \[sp, 272\] +** str x26, \[sp, 288\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl256 +** sub sp, x29, #256 +** ldr p4, \[sp\] +** add sp, sp, #?256 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_10 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_11: +** sub sp, sp, #65536 +** str xzr, \[sp, 1024\] +** mov x12, #?64704 +** sub sp, sp, x12 +** str xzr, \[sp, 1024\] +** stp x29, x30, \[sp, 256\] +** add x29, sp, #?256 +** stp x24, x25, \[sp, 272\] +** str x26, \[sp, 288\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl256 +** sub sp, x29, #256 +** ldr p4, \[sp\] +** add sp, sp, #?256 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** add sp, sp, #?3008 +** add sp, sp, #?126976 +** ret +*/ +svbool_t +test_11 (int n) +{ + volatile int x[0x7ee4]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c new file mode 100644 index 000000000..f8318b354 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c @@ -0,0 +1,284 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -msve-vector-bits=256 -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +svbool_t take_stack_args (volatile void *, void *, int, int, int, + int, int, int, int); + +/* +** test_1: +** sub sp, sp, #48 +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl32 +** ldr p4, \[sp\] +** add sp, sp, #?48 +** ret +*/ +svbool_t +test_1 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_2: +** sub sp, sp, #80 +** stp x24, x25, \[sp, 32\] +** str x26, \[sp, 48\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl32 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 32\] +** ldr x26, \[sp, 48\] +** add sp, sp, #?80 +** ret +*/ +svbool_t +test_2 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_3: +** mov x12, #?4160 +** sub sp, sp, x12 +** stp x24, x25, \[sp, 32\] +** str x26, \[sp, 48\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl32 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 32\] +** ldr x26, \[sp, 48\] +** add sp, sp, x12 +** ret +*/ +svbool_t +test_3 (void) +{ + volatile int x[1024]; + asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_4: +** sub sp, sp, #64 +** str p4, \[sp\] +** ... 
+** ptrue p0\.h, vl16 +** ldr p4, \[sp\] +** add sp, sp, #?64 +** ret +*/ +svbool_t +test_4 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4"); + return svptrue_b16 (); +} + +/* +** test_5: +** sub sp, sp, #96 +** stp x24, x25, \[sp, 32\] +** str x26, \[sp, 48\] +** str p4, \[sp\] +** ... +** ptrue p0\.h, vl16 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 32\] +** ldr x26, \[sp, 48\] +** add sp, sp, #?96 +** ret +*/ +svbool_t +test_5 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b16 (); +} + +/* +** test_6: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** sub sp, sp, #32 +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl32 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?32 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svbool_t +test_6 (void) +{ + take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_7: +** mov x12, #?4144 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 32\] +** add x29, sp, #?32 +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl32 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?32 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_7 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_8: +** mov x12, #?4176 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 32\] +** add x29, sp, #?32 +** stp x24, x25, \[sp, 48\] +** str x26, \[sp, 64\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl32 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?32 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_8 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_9: +** mov x12, #?4144 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 32\] +** add x29, sp, #?32 +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl32 +** sub sp, x29, #32 +** ldr p4, \[sp\] +** add sp, sp, #?32 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_9 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_10: +** mov x12, #?4176 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 32\] +** add x29, sp, #?32 +** stp x24, x25, \[sp, 48\] +** str x26, \[sp, 64\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl32 +** sub sp, x29, #32 +** ldr p4, \[sp\] +** add sp, sp, #?32 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_10 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_11: +** sub sp, sp, #65536 +** str xzr, \[sp, 1024\] +** mov x12, #?64480 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 32\] +** add x29, sp, #?32 +** stp x24, x25, \[sp, 48\] +** str x26, \[sp, 64\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... 
+** ptrue p0\.b, vl32 +** sub sp, x29, #32 +** ldr p4, \[sp\] +** add sp, sp, #?32 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** add sp, sp, #?3008 +** add sp, sp, #?126976 +** ret +*/ +svbool_t +test_11 (int n) +{ + volatile int x[0x7ee4]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c new file mode 100644 index 000000000..45a23ad49 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c @@ -0,0 +1,285 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -msve-vector-bits=512 -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +svbool_t take_stack_args (volatile void *, void *, int, int, int, + int, int, int, int); + +/* +** test_1: +** sub sp, sp, #80 +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl64 +** ldr p4, \[sp\] +** add sp, sp, #?80 +** ret +*/ +svbool_t +test_1 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_2: +** sub sp, sp, #112 +** stp x24, x25, \[sp, 64\] +** str x26, \[sp, 80\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl64 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 64\] +** ldr x26, \[sp, 80\] +** add sp, sp, #?112 +** ret +*/ +svbool_t +test_2 (void) +{ + volatile int x = 1; + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_3: +** mov x12, #?4192 +** sub sp, sp, x12 +** stp x24, x25, \[sp, 64\] +** str x26, \[sp, 80\] +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl64 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 64\] +** ldr x26, \[sp, 80\] +** add sp, sp, x12 +** ret +*/ +svbool_t +test_3 (void) +{ + volatile int x[1024]; + asm volatile ("" :: "r" (x) : "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_4: +** sub sp, sp, #128 +** str p4, \[sp\] +** ... +** ptrue p0\.h, vl32 +** ldr p4, \[sp\] +** add sp, sp, #?128 +** ret +*/ +svbool_t +test_4 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4"); + return svptrue_b16 (); +} + +/* +** test_5: +** sub sp, sp, #160 +** stp x24, x25, \[sp, 64\] +** str x26, \[sp, 80\] +** str p4, \[sp\] +** ... +** ptrue p0\.h, vl32 +** ldr p4, \[sp\] +** ldp x24, x25, \[sp, 64\] +** ldr x26, \[sp, 80\] +** add sp, sp, #?160 +** ret +*/ +svbool_t +test_5 (void) +{ + volatile svint32_t b; + b = svdup_s32 (1); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b16 (); +} + +/* +** test_6: +** stp x29, x30, \[sp, -16\]! +** mov x29, sp +** sub sp, sp, #64 +** str p4, \[sp\] +** ... +** ptrue p0\.b, vl64 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?64 +** ldp x29, x30, \[sp\], 16 +** ret +*/ +svbool_t +test_6 (void) +{ + take_stack_args (0, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_7: +** mov x12, #?4176 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 64\] +** add x29, sp, #?64 +** str p4, \[sp\] +** sub sp, sp, #16 +** ... 
+** ptrue p0\.b, vl64 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?64 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_7 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_8: +** mov x12, #?4208 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 64\] +** add x29, sp, #?64 +** stp x24, x25, \[sp, 80\] +** str x26, \[sp, 96\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl64 +** add sp, sp, #?16 +** ldr p4, \[sp\] +** add sp, sp, #?64 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_8 (void) +{ + volatile int x[1024]; + take_stack_args (x, 0, 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_9: +** mov x12, #?4176 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 64\] +** add x29, sp, #?64 +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl64 +** sub sp, x29, #64 +** ldr p4, \[sp\] +** add sp, sp, #?64 +** ldp x29, x30, \[sp\] +** mov x12, #?4112 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_9 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4"); + return svptrue_b8 (); +} + +/* +** test_10: +** mov x12, #?4208 +** sub sp, sp, x12 +** stp x29, x30, \[sp, 64\] +** add x29, sp, #?64 +** stp x24, x25, \[sp, 80\] +** str x26, \[sp, 96\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl64 +** sub sp, x29, #64 +** ldr p4, \[sp\] +** add sp, sp, #?64 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** mov x12, #?4144 +** add sp, sp, x12 +** ret +*/ +svbool_t +test_10 (int n) +{ + volatile int x[1024]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} + +/* +** test_11: +** sub sp, sp, #65536 +** str xzr, \[sp, 1024\] +** mov x12, #?64512 +** sub sp, sp, x12 +** str xzr, \[sp, 1024\] +** stp x29, x30, \[sp, 64\] +** add x29, sp, #?64 +** stp x24, x25, \[sp, 80\] +** str x26, \[sp, 96\] +** str p4, \[sp\] +** sub sp, sp, #16 +** ... +** ptrue p0\.b, vl64 +** sub sp, x29, #64 +** ldr p4, \[sp\] +** add sp, sp, #?64 +** ldp x24, x25, \[sp, 16\] +** ldr x26, \[sp, 32\] +** ldp x29, x30, \[sp\] +** add sp, sp, #?3008 +** add sp, sp, #?126976 +** ret +*/ +svbool_t +test_11 (int n) +{ + volatile int x[0x7ee4]; + take_stack_args (x, __builtin_alloca (n), 1, 2, 3, 4, 5, 6, 7); + asm volatile ("" ::: "p4", "x24", "x25", "x26"); + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c new file mode 100644 index 000000000..3e01ec36c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-O -fshrink-wrap -fstack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC aarch64 "arm_sve.h" + +/* +** test_1: +** str x24, \[sp, -32\]! +** cntb x13 +** mov x11, sp +** ... +** sub sp, sp, x13 +** str p4, \[sp\] +** cbz w0, [^\n]* +** ... 
+** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldr x24, \[sp\], 32 +** ret +*/ +svbool_t +test_1 (int n) +{ + asm volatile ("" ::: "x24"); + if (n) + { + volatile int x = 1; + asm volatile ("" ::: "p4"); + } + return svptrue_b8 (); +} + +/* +** test_2: +** str x24, \[sp, -32\]! +** cntb x13 +** mov x11, sp +** ... +** sub sp, sp, x13 +** str p4, \[sp\] +** cbz w0, [^\n]* +** str p5, \[sp, #1, mul vl\] +** str p6, \[sp, #2, mul vl\] +** ... +** ptrue p0\.b, all +** ldr p4, \[sp\] +** addvl sp, sp, #1 +** ldr x24, \[sp\], 32 +** ret +*/ +svbool_t +test_2 (int n) +{ + asm volatile ("" ::: "x24"); + if (n) + { + volatile int x = 1; + asm volatile ("" ::: "p4", "p5", "p6"); + } + return svptrue_b8 (); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/unprototyped_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/unprototyped_1.c new file mode 100644 index 000000000..5c7ed5167 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/unprototyped_1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ + +#include + +void unprototyped (); + +void +f (svuint8_t *ptr) +{ + unprototyped (*ptr); /* { dg-error {SVE type '(svuint8_t|__SVUint8_t)' cannot be passed to an unprototyped function} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_1.c new file mode 100644 index 000000000..6987245a6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_1.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ldr (p[0-7]), \[x1\] +** ... +** cntp x0, \1, \1\.b +** ... +** ret +*/ +uint64_t __attribute__((noipa)) +callee_0 (int64_t *ptr, ...) +{ + va_list va; + svbool_t pg; + + va_start (va, ptr); + pg = va_arg (va, svbool_t); + va_end (va); + return svcntp_b8 (pg, pg); +} + +/* +** caller_0: +** ... +** ptrue (p[0-7])\.d, vl7 +** ... +** str \1, \[x1\] +** ... +** ret +*/ +uint64_t __attribute__((noipa)) +caller_0 (int64_t *ptr) +{ + return callee_0 (ptr, svptrue_pat_b64 (SV_VL7)); +} + +/* +** callee_1: +** ... +** ldr (p[0-7]), \[x2\] +** ... +** cntp x0, \1, \1\.b +** ... +** ret +*/ +uint64_t __attribute__((noipa)) +callee_1 (int64_t *ptr, ...) +{ + va_list va; + svbool_t pg; + + va_start (va, ptr); + va_arg (va, int); + pg = va_arg (va, svbool_t); + va_end (va); + return svcntp_b8 (pg, pg); +} + +/* +** caller_1: +** ... +** ptrue (p[0-7])\.d, vl7 +** ... +** str \1, \[x2\] +** ... +** ret +*/ +uint64_t __attribute__((noipa)) +caller_1 (int64_t *ptr) +{ + return callee_1 (ptr, 1, svptrue_pat_b64 (SV_VL7)); +} + +/* +** callee_7: +** ... +** ldr (p[0-7]), \[x7\] +** ... +** cntp x0, \1, \1\.b +** ... +** ret +*/ +uint64_t __attribute__((noipa)) +callee_7 (int64_t *ptr, ...) +{ + va_list va; + svbool_t pg; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + pg = va_arg (va, svbool_t); + va_end (va); + return svcntp_b8 (pg, pg); +} + +/* +** caller_7: +** ... +** ptrue (p[0-7])\.d, vl7 +** ... +** str \1, \[x7\] +** ... +** ret +*/ +uint64_t __attribute__((noipa)) +caller_7 (int64_t *ptr) +{ + return callee_7 (ptr, 1, 2, 3, 4, 5, 6, svptrue_pat_b64 (SV_VL7)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... 
+** ldr (p[0-7]), \[\2\] +** ... +** cntp x0, \3, \3\.b +** ... +** ret +*/ +uint64_t __attribute__((noipa)) +callee_8 (int64_t *ptr, ...) +{ + va_list va; + svbool_t pg; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + pg = va_arg (va, svbool_t); + va_end (va); + return svcntp_b8 (pg, pg); +} + +/* +** caller_8: +** ... +** ptrue (p[0-7])\.d, vl7 +** ... +** str \1, \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... +** ret +*/ +uint64_t __attribute__((noipa)) +caller_8 (int64_t *ptr) +{ + return callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svptrue_pat_b64 (SV_VL7)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f16.c new file mode 100644 index 000000000..79098851c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f16.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x1\] +** ... +** st1h \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** fmov (z[0-9]+\.h), #9\.0[^\n]* +** ... +** st1h \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int16_t *ptr) +{ + callee_0 (ptr, svdup_f16 (9)); +} + +/* +** callee_1: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x2\] +** ... +** st1h \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** fmov (z[0-9]+\.h), #9\.0[^\n]* +** ... +** st1h \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int16_t *ptr) +{ + callee_1 (ptr, 1, svdup_f16 (9)); +} + +/* +** callee_7: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x7\] +** ... +** st1h \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** fmov (z[0-9]+\.h), #9\.0[^\n]* +** ... +** st1h \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int16_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_f16 (9)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[\2\] +** ... +** st1h \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... 
+** fmov (z[0-9]+\.h), #9\.0[^\n]* +** ... +** st1h \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_8 (int16_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_f16 (9)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f32.c new file mode 100644 index 000000000..325b0b2aa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f32.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x1\] +** ... +** st1w \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** fmov (z[0-9]+\.s), #9\.0[^\n]* +** ... +** st1w \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int32_t *ptr) +{ + callee_0 (ptr, svdup_f32 (9)); +} + +/* +** callee_1: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x2\] +** ... +** st1w \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** fmov (z[0-9]+\.s), #9\.0[^\n]* +** ... +** st1w \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int32_t *ptr) +{ + callee_1 (ptr, 1, svdup_f32 (9)); +} + +/* +** callee_7: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x7\] +** ... +** st1w \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** fmov (z[0-9]+\.s), #9\.0[^\n]* +** ... +** st1w \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int32_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_f32 (9)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[\2\] +** ... +** st1w \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** fmov (z[0-9]+\.s), #9\.0[^\n]* +** ... +** st1w \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int32_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_f32 (9)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f64.c new file mode 100644 index 000000000..07a6c707e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_f64.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x1\] +** ... +** st1d \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** fmov (z[0-9]+\.d), #9\.0[^\n]* +** ... +** st1d \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int64_t *ptr) +{ + callee_0 (ptr, svdup_f64 (9)); +} + +/* +** callee_1: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x2\] +** ... +** st1d \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** fmov (z[0-9]+\.d), #9\.0[^\n]* +** ... +** st1d \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int64_t *ptr) +{ + callee_1 (ptr, 1, svdup_f64 (9)); +} + +/* +** callee_7: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x7\] +** ... +** st1d \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** fmov (z[0-9]+\.d), #9\.0[^\n]* +** ... +** st1d \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int64_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_f64 (9)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[\2\] +** ... +** st1d \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** fmov (z[0-9]+\.d), #9\.0[^\n]* +** ... +** st1d \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int64_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_f64 (9)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s16.c new file mode 100644 index 000000000..173063833 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s16.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x1\] +** ... +** st1h \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** mov (z[0-9]+\.h), #42 +** ... +** st1h \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int16_t *ptr) +{ + callee_0 (ptr, svdup_s16 (42)); +} + +/* +** callee_1: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x2\] +** ... +** st1h \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** mov (z[0-9]+\.h), #42 +** ... +** st1h \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int16_t *ptr) +{ + callee_1 (ptr, 1, svdup_s16 (42)); +} + +/* +** callee_7: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x7\] +** ... +** st1h \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** mov (z[0-9]+\.h), #42 +** ... +** st1h \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int16_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_s16 (42)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[\2\] +** ... +** st1h \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** mov (z[0-9]+\.h), #42 +** ... +** st1h \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int16_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_s16 (42)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s32.c new file mode 100644 index 000000000..d93db8fc8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s32.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x1\] +** ... +** st1w \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** mov (z[0-9]+\.s), #42 +** ... +** st1w \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int32_t *ptr) +{ + callee_0 (ptr, svdup_s32 (42)); +} + +/* +** callee_1: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x2\] +** ... +** st1w \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** mov (z[0-9]+\.s), #42 +** ... +** st1w \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int32_t *ptr) +{ + callee_1 (ptr, 1, svdup_s32 (42)); +} + +/* +** callee_7: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x7\] +** ... +** st1w \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** mov (z[0-9]+\.s), #42 +** ... +** st1w \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int32_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_s32 (42)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[\2\] +** ... +** st1w \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** mov (z[0-9]+\.s), #42 +** ... +** st1w \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int32_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_s32 (42)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s64.c new file mode 100644 index 000000000..b8c77455d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s64.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x1\] +** ... +** st1d \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** mov (z[0-9]+\.d), #42 +** ... +** st1d \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int64_t *ptr) +{ + callee_0 (ptr, svdup_s64 (42)); +} + +/* +** callee_1: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x2\] +** ... +** st1d \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** mov (z[0-9]+\.d), #42 +** ... +** st1d \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int64_t *ptr) +{ + callee_1 (ptr, 1, svdup_s64 (42)); +} + +/* +** callee_7: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x7\] +** ... +** st1d \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** mov (z[0-9]+\.d), #42 +** ... +** st1d \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int64_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_s64 (42)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[\2\] +** ... +** st1d \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** mov (z[0-9]+\.d), #42 +** ... +** st1d \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int64_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_s64 (42)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c new file mode 100644 index 000000000..de7cbe37d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x1\] +** ... +** st1b \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int8_t *ptr, ...) +{ + va_list va; + svint8_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint8_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** mov (z[0-9]+\.b), #42 +** ... +** st1b \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int8_t *ptr) +{ + callee_0 (ptr, svdup_s8 (42)); +} + +/* +** callee_1: +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x2\] +** ... +** st1b \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int8_t *ptr, ...) +{ + va_list va; + svint8_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint8_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** mov (z[0-9]+\.b), #42 +** ... +** st1b \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int8_t *ptr) +{ + callee_1 (ptr, 1, svdup_s8 (42)); +} + +/* +** callee_7: +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x7\] +** ... +** st1b \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int8_t *ptr, ...) +{ + va_list va; + svint8_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint8_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** mov (z[0-9]+\.b), #42 +** ... +** st1b \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int8_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_s8 (42)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[\2\] +** ... +** st1b \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int8_t *ptr, ...) +{ + va_list va; + svint8_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint8_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** mov (z[0-9]+\.b), #42 +** ... +** st1b \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int8_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_s8 (42)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u16.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u16.c new file mode 100644 index 000000000..59c9ca7db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u16.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x1\] +** ... +** st1h \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** mov (z[0-9]+\.h), #42 +** ... +** st1h \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int16_t *ptr) +{ + callee_0 (ptr, svdup_u16 (42)); +} + +/* +** callee_1: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x2\] +** ... +** st1h \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** mov (z[0-9]+\.h), #42 +** ... +** st1h \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int16_t *ptr) +{ + callee_1 (ptr, 1, svdup_u16 (42)); +} + +/* +** callee_7: +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[x7\] +** ... +** st1h \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** mov (z[0-9]+\.h), #42 +** ... +** st1h \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int16_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_u16 (42)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1h (z[0-9]+\.h), (p[0-7])/z, \[\2\] +** ... +** st1h \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int16_t *ptr, ...) +{ + va_list va; + svint16_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint16_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** mov (z[0-9]+\.h), #42 +** ... +** st1h \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int16_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_u16 (42)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u32.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u32.c new file mode 100644 index 000000000..3050ad5f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u32.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x1\] +** ... +** st1w \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** mov (z[0-9]+\.s), #42 +** ... +** st1w \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int32_t *ptr) +{ + callee_0 (ptr, svdup_u32 (42)); +} + +/* +** callee_1: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x2\] +** ... +** st1w \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** mov (z[0-9]+\.s), #42 +** ... +** st1w \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int32_t *ptr) +{ + callee_1 (ptr, 1, svdup_u32 (42)); +} + +/* +** callee_7: +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[x7\] +** ... +** st1w \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** mov (z[0-9]+\.s), #42 +** ... +** st1w \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int32_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_u32 (42)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1w (z[0-9]+\.s), (p[0-7])/z, \[\2\] +** ... +** st1w \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int32_t *ptr, ...) +{ + va_list va; + svint32_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint32_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** mov (z[0-9]+\.s), #42 +** ... +** st1w \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int32_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_u32 (42)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u64.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u64.c new file mode 100644 index 000000000..94322a34c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u64.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x1\] +** ... +** st1d \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** mov (z[0-9]+\.d), #42 +** ... +** st1d \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int64_t *ptr) +{ + callee_0 (ptr, svdup_u64 (42)); +} + +/* +** callee_1: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x2\] +** ... +** st1d \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** mov (z[0-9]+\.d), #42 +** ... +** st1d \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int64_t *ptr) +{ + callee_1 (ptr, 1, svdup_u64 (42)); +} + +/* +** callee_7: +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[x7\] +** ... +** st1d \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** mov (z[0-9]+\.d), #42 +** ... +** st1d \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int64_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_u64 (42)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1d (z[0-9]+\.d), (p[0-7])/z, \[\2\] +** ... +** st1d \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int64_t *ptr, ...) +{ + va_list va; + svint64_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint64_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** mov (z[0-9]+\.d), #42 +** ... +** st1d \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int64_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_u64 (42)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u8.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u8.c new file mode 100644 index 000000000..cf8ac2171 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_u8.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-stack-clash-protection -g" } */ +/* { dg-final { check-function-bodies "**" "" { target lp64 } } } */ + +#include +#include + +/* +** callee_0: +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x1\] +** ... +** st1b \1, \2, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_0 (int8_t *ptr, ...) +{ + va_list va; + svint8_t vec; + + va_start (va, ptr); + vec = va_arg (va, svint8_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_0: +** ... +** mov (z[0-9]+\.b), #42 +** ... +** st1b \1, p[0-7], \[x1\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_0 (int8_t *ptr) +{ + callee_0 (ptr, svdup_u8 (42)); +} + +/* +** callee_1: +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x2\] +** ... +** st1b \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_1 (int8_t *ptr, ...) +{ + va_list va; + svint8_t vec; + + va_start (va, ptr); + va_arg (va, int); + vec = va_arg (va, svint8_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_1: +** ... +** mov (z[0-9]+\.b), #42 +** ... +** st1b \1, p[0-7], \[x2\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_1 (int8_t *ptr) +{ + callee_1 (ptr, 1, svdup_u8 (42)); +} + +/* +** callee_7: +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x7\] +** ... +** st1b \1, p[0-7], \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_7 (int8_t *ptr, ...) +{ + va_list va; + svint8_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint8_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_7: +** ... +** mov (z[0-9]+\.b), #42 +** ... +** st1b \1, p[0-7], \[x7\] +** ... +** ret +*/ +void __attribute__((noipa)) +caller_7 (int8_t *ptr) +{ + callee_7 (ptr, 1, 2, 3, 4, 5, 6, svdup_u8 (42)); +} + +/* FIXME: We should be able to get rid of the va_list object. */ +/* +** callee_8: +** sub sp, sp, #([0-9]+) +** ... +** ldr (x[0-9]+), \[sp, \1\] +** ... +** ld1b (z[0-9]+\.b), (p[0-7])/z, \[\2\] +** ... +** st1b \3, \4, \[x0\] +** ... +** ret +*/ +void __attribute__((noipa)) +callee_8 (int8_t *ptr, ...) +{ + va_list va; + svint8_t vec; + + va_start (va, ptr); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + va_arg (va, int); + vec = va_arg (va, svint8_t); + va_end (va); + svst1 (svptrue_b8 (), ptr, vec); +} + +/* +** caller_8: +** ... +** mov (z[0-9]+\.b), #42 +** ... +** st1b \1, p[0-7], \[(x[0-9]+)\] +** ... +** str \2, \[sp\] +** ... 
+** ret +*/ +void __attribute__((noipa)) +caller_8 (int8_t *ptr) +{ + callee_8 (ptr, 1, 2, 3, 4, 5, 6, 7, svdup_u8 (42)); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_nosc.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_nosc.c new file mode 100644 index 000000000..cea69cc88 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_nosc.c @@ -0,0 +1,75 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O0 -g" } */ + +#include +#include + +void __attribute__((noipa)) +callee (int foo, ...) +{ + va_list va; + svbool_t pg, p; + svint8_t s8; + svuint16x4_t u16; + svfloat32x3_t f32; + svint64x2_t s64; + + va_start (va, foo); + p = va_arg (va, svbool_t); + s8 = va_arg (va, svint8_t); + u16 = va_arg (va, svuint16x4_t); + f32 = va_arg (va, svfloat32x3_t); + s64 = va_arg (va, svint64x2_t); + + pg = svptrue_b8 (); + + if (svptest_any (pg, sveor_z (pg, p, svptrue_pat_b8 (SV_VL7)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, s8, svindex_s8 (1, 2)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 0), svindex_u16 (2, 3)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 1), svindex_u16 (3, 4)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 2), svindex_u16 (4, 5)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 3), svindex_u16 (5, 6)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 0), svdup_f32 (1.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 1), svdup_f32 (2.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 2), svdup_f32 (3.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget2 (s64, 0), svindex_s64 (6, 7)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget2 (s64, 1), svindex_s64 (7, 8)))) + __builtin_abort (); +} + +int __attribute__((noipa)) +main (void) +{ + callee (100, + svptrue_pat_b8 (SV_VL7), + svindex_s8 (1, 2), + svcreate4 (svindex_u16 (2, 3), + svindex_u16 (3, 4), + svindex_u16 (4, 5), + svindex_u16 (5, 6)), + svcreate3 (svdup_f32 (1.0), + svdup_f32 (2.0), + svdup_f32 (3.0)), + svcreate2 (svindex_s64 (6, 7), + svindex_s64 (7, 8))); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_sc.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_sc.c new file mode 100644 index 000000000..b939aa5ea --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_3_sc.c @@ -0,0 +1,75 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O0 -fstack-clash-protection -g" } */ + +#include +#include + +void __attribute__((noipa)) +callee (int foo, ...) 
+{ + va_list va; + svbool_t pg, p; + svint8_t s8; + svuint16x4_t u16; + svfloat32x3_t f32; + svint64x2_t s64; + + va_start (va, foo); + p = va_arg (va, svbool_t); + s8 = va_arg (va, svint8_t); + u16 = va_arg (va, svuint16x4_t); + f32 = va_arg (va, svfloat32x3_t); + s64 = va_arg (va, svint64x2_t); + + pg = svptrue_b8 (); + + if (svptest_any (pg, sveor_z (pg, p, svptrue_pat_b8 (SV_VL7)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, s8, svindex_s8 (1, 2)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 0), svindex_u16 (2, 3)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 1), svindex_u16 (3, 4)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 2), svindex_u16 (4, 5)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget4 (u16, 3), svindex_u16 (5, 6)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 0), svdup_f32 (1.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 1), svdup_f32 (2.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget3 (f32, 2), svdup_f32 (3.0)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget2 (s64, 0), svindex_s64 (6, 7)))) + __builtin_abort (); + + if (svptest_any (pg, svcmpne (pg, svget2 (s64, 1), svindex_s64 (7, 8)))) + __builtin_abort (); +} + +int __attribute__((noipa)) +main (void) +{ + callee (100, + svptrue_pat_b8 (SV_VL7), + svindex_s8 (1, 2), + svcreate4 (svindex_u16 (2, 3), + svindex_u16 (3, 4), + svindex_u16 (4, 5), + svindex_u16 (5, 6)), + svcreate3 (svdup_f32 (1.0), + svdup_f32 (2.0), + svdup_f32 (3.0)), + svcreate2 (svindex_s64 (6, 7), + svindex_s64 (7, 8))); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/vpcs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/vpcs_1.c new file mode 100644 index 000000000..d9f4e6c41 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/vpcs_1.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ + +__attribute__ ((aarch64_vector_pcs)) void f1 (__SVBool_t); /* { dg-error {the 'aarch64_vector_pcs' attribute cannot be applied to an SVE function type} } */ +__attribute__ ((aarch64_vector_pcs)) void f2 (__SVInt8_t s8) {} /* { dg-error {the 'aarch64_vector_pcs' attribute cannot be applied to an SVE function type} } */ +__attribute__ ((aarch64_vector_pcs)) void (*f3) (__SVInt16_t); /* { dg-error {the 'aarch64_vector_pcs' attribute cannot be applied to an SVE function type} } */ +typedef __attribute__ ((aarch64_vector_pcs)) void (*f4) (__SVInt32_t); /* { dg-error {the 'aarch64_vector_pcs' attribute cannot be applied to an SVE function type} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c index a064c337b..156d04ae5 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_1.c @@ -25,3 +25,4 @@ foo (void) /* We should use an induction that starts at -5, with only the last 7 elements of the first iteration being active. 
 */
 /* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #-5, #5\n} } } */
+/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+\.b), vl1\n.*\tnot\tp[0-7]\.b, p[0-7]/z, \1\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
index f2113be90..e792cdf2c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
@@ -20,3 +20,4 @@ foo (void)
 /* { dg-final { scan-assembler {\t(adrp|adr)\tx[0-9]+, x\n} } } */
 /* We should unroll the loop three times.  */
 /* { dg-final { scan-assembler-times "\tst1w\t" 3 } } */
+/* { dg-final { scan-assembler {\tptrue\t(p[0-9]+)\.s, vl7\n.*\teor\tp[0-7]\.b, (p[0-7])/z, (\1\.b, \2\.b|\2\.b, \1\.b)\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revb_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revb_1.c
index 1a3d9b4ea..9cf2f27c8 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/revb_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/revb_1.c
@@ -1,9 +1,7 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */
+/* { dg-options "-O -msve-vector-bits=256 --save-temps -mlittle-endian" } */
 
-#include <stdint.h>
-
-typedef int8_t vnx16qi __attribute__((vector_size (32)));
+typedef __INT8_TYPE__ vnx16qi __attribute__((vector_size (32)));
 
 #define MASK_2(X, Y) (X) ^ (Y), (X + 1) ^ (Y)
 #define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revb_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revb_2.c
new file mode 100644
index 000000000..389739cc8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/revb_2.c
@@ -0,0 +1,10 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=256 --save-temps -mbig-endian" } */
+
+#include "revb_1.c"
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
+
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d} 1 } } */
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revh_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revh_1.c
index 76145812b..28a0399b9 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/revh_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/revh_1.c
@@ -1,9 +1,7 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */
+/* { dg-options "-O -msve-vector-bits=256 --save-temps -mlittle-endian" } */
 
-#include <stdint.h>
-
-typedef uint16_t vnx8hi __attribute__((vector_size (32)));
+typedef __UINT16_TYPE__ vnx8hi __attribute__((vector_size (32)));
 typedef _Float16 vnx8hf __attribute__((vector_size (32)));
 
 #define MASK_2(X, Y) (X) ^ (Y), (X + 1) ^ (Y)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revh_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revh_2.c
new file mode 100644
index 000000000..e821b6402
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/revh_2.c
@@ -0,0 +1,9 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=256 --save-temps -mbig-endian" } */
+
+#include "revh_1.c"
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
+
+/* { dg-final { scan-assembler-times {\trevh\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d} 2 } } */
+/* { dg-final { scan-assembler-times {\trevh\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revw_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revw_1.c
index 8ac68b782..de926753c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/revw_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/revw_1.c
@@ -1,9 +1,7 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O -msve-vector-bits=256 --save-temps" } */
+/* { dg-options "-O -msve-vector-bits=256 --save-temps -mlittle-endian" } */
 
-#include <stdint.h>
-
-typedef uint32_t vnx4si __attribute__((vector_size (32)));
+typedef __UINT32_TYPE__ vnx4si __attribute__((vector_size (32)));
 typedef float vnx4sf __attribute__((vector_size (32)));
 
 #define MASK_2(X, Y) (X) ^ (Y), (X + 1) ^ (Y)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revw_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revw_2.c
new file mode 100644
index 000000000..17243c05c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/revw_2.c
@@ -0,0 +1,8 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=256 --save-temps -mbig-endian" } */
+
+#include "revw_1.c"
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
+
+/* { dg-final { scan-assembler-times {\trevw\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c
new file mode 100644
index 000000000..e7bf64a57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_SAD(TYPE1, TYPE2) \
+TYPE1 __attribute__ ((noinline, noclone)) \
+sum_abs_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) \
+{ \
+  TYPE1 sum = 0; \
+  for (int i = 0; i < n; i++) \
+    { \
+      sum += __builtin_abs (x[i] - y[i]); \
+    } \
+  return sum; \
+}
+
+DEF_SAD(int32_t, uint8_t)
+DEF_SAD(int32_t, int8_t)
+DEF_SAD(int64_t, uint16_t)
+DEF_SAD(int64_t, int16_t)
+
+/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_1.c
new file mode 100644
index 000000000..e651e5b93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_1.c
@@ -0,0 +1,27 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+typedef int8_t vnx16qi __attribute__((vector_size (32)));
+
+/* Predicate vector: 1 0 1 0 ...  */
+
+#define MASK_32 { 0, 33, 2, 35, 4, 37, 6, 39, 8, 41, \
+		  10, 43, 12, 45, 14, 47, 16, 49, 18, 51, \
+		  20, 53, 22, 55, 24, 57, 26, 59, 28, 61, 30, 63 }
+
+#define INDEX_32 vnx16qi
+
+#define PERMUTE(type, nunits) \
+type permute_##type (type x, type y) \
+{ \
+  return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \
+}
+
+PERMUTE(vnx16qi, 32)
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
+
+/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.h, vl16\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_2.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_2.c
new file mode 100644
index 000000000..05391474a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_2.c
@@ -0,0 +1,41 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+typedef int8_t vnx16qi __attribute__((vector_size (32)));
+typedef int16_t vnx8hi __attribute__((vector_size (32)));
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+
+typedef _Float16 vnx8hf __attribute__((vector_size (32)));
+typedef float vnx4sf __attribute__((vector_size (32)));
+
+/* Predicate vector: 1 0 0 0 ... */
+
+#define MASK_32 { 0, 33, 34, 35, 4, 37, 38, 39, 8, 41, 42, 43, 12, \
+		  45, 46, 47, 16, 49, 50, 51, 20, 53, 54, 55, 24, \
+		  57, 58, 59, 28, 61, 62, 63 }
+
+/* Predicate vector: 1 0 1 0 ... */
+
+#define MASK_16 {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31}
+
+#define INDEX_32 vnx16qi
+#define INDEX_16 vnx8hi
+
+#define PERMUTE(type, nunits) \
+type permute_##type (type x, type y) \
+{ \
+  return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \
+}
+
+PERMUTE(vnx16qi, 32)
+PERMUTE(vnx8hi, 16)
+PERMUTE(vnx8hf, 16)
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
+
+/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-9]+, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s, vl8\n} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_3.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_3.c
new file mode 100644
index 000000000..a87492d9d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_3.c
@@ -0,0 +1,50 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+typedef int8_t vnx16qi __attribute__((vector_size (32)));
+typedef int16_t vnx8hi __attribute__((vector_size (32)));
+typedef int32_t vnx4si __attribute__((vector_size (32)));
+typedef _Float16 vnx8hf __attribute__((vector_size (32)));
+typedef float vnx4sf __attribute__((vector_size (32)));
+
+/* Predicate vector: 1 0 0 0 0 0 0 0 ... */
+
+#define MASK_32 { 0, 33, 34, 35, 36, 37, 38, 39, \
+		   8, 41, 42, 43, 44, 45, 46, 47, \
+		  16, 49, 50, 51, 52, 53, 54, 55, \
+		  24, 57, 58, 59, 60, 61, 62, 63 }
+
+/* Predicate vector: 1 0 0 0 ... */
+
+#define MASK_16 { 0, 17, 18, 19, 4, 21, 22, 23, \
+		   8, 25, 26, 27, 12, 29, 30, 31 }
+
+/* Predicate vector: 1 0 ...
*/ + +#define MASK_8 { 0, 9, 2, 11, 4, 13, 6, 15 } + +#define INDEX_32 vnx16qi +#define INDEX_16 vnx8hi +#define INDEX_8 vnx4si + +#define PERMUTE(type, nunits) \ +type permute_##type (type x, type y) \ +{ \ + return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ +} + +PERMUTE(vnx16qi, 32) +PERMUTE(vnx8hi, 16) +PERMUTE(vnx4si, 8) +PERMUTE(vnx8hf, 16) +PERMUTE(vnx4sf, 8) + +/* { dg-final { scan-assembler-not {\ttbl\t} } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-9]+, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-9]+, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d, vl4\n} 5 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_4.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_4.c new file mode 100644 index 000000000..e9bbc5527 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_4.c @@ -0,0 +1,50 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ + +#include + +typedef int8_t vnx16qi __attribute__((vector_size (32))); +typedef int16_t vnx8hi __attribute__((vector_size (32))); +typedef int32_t vnx4si __attribute__((vector_size (32))); +typedef int64_t vnx2di __attribute__((vector_size (32))); + +typedef _Float16 vnx8hf __attribute__((vector_size (32))); +typedef float vnx4sf __attribute__((vector_size (32))); +typedef double vnx2df __attribute__((vector_size (32))); + +/* Predicate vector: 1 1 0 0 ... */ + +#define MASK_32 { 0, 1, 34, 35, 4, 5, 38, 39, 8, 9, 42, 43, 12, 13, \ + 46, 47, 16, 17, 50, 51, 20, 21, 54, 55, 24, 25, \ + 58, 59, 28, 29, 62, 63 } + +#define MASK_16 {0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31} +#define MASK_8 {0, 1, 10, 11, 4, 5, 14, 15} +#define MASK_4 {0, 1, 6, 7} + +#define INDEX_32 vnx16qi +#define INDEX_16 vnx8hi +#define INDEX_8 vnx4si +#define INDEX_4 vnx2di + +#define PERMUTE(type, nunits) \ +type permute_##type (type x, type y) \ +{ \ + return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ +} + +PERMUTE(vnx16qi, 32) +PERMUTE(vnx8hi, 16) +PERMUTE(vnx4si, 8) +PERMUTE(vnx2di, 4) + +PERMUTE(vnx8hf, 16) +PERMUTE(vnx4sf, 8) +PERMUTE(vnx2df, 4) + +/* { dg-final { scan-assembler-not {\ttbl\t} } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-9]+, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-9]+, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-9]+, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_5.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_5.c new file mode 100644 index 000000000..935abb54d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_5.c @@ -0,0 +1,50 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ + +#include + +typedef int8_t vnx16qi __attribute__((vector_size (32))); +typedef int16_t vnx8hi __attribute__((vector_size (32))); +typedef int32_t vnx4si __attribute__((vector_size (32))); +typedef int64_t vnx2di __attribute__((vector_size (32))); + +typedef _Float16 vnx8hf __attribute__((vector_size (32))); +typedef float vnx4sf __attribute__((vector_size (32))); +typedef 
double vnx2df __attribute__((vector_size (32))); + +/* Predicate vector: 1 0 0 1 ... */ + +#define MASK_32 { 0, 33, 34, 3, 4, 37, 38, 7, 8, 41, 42, 11, 12, 45, 46, \ + 15, 16, 49, 50, 19, 20, 53, 54, 23, 24, 57, 58, 27, 28, \ + 61, 62, 31 } + +#define MASK_16 {0, 17, 18, 3, 4, 21, 22, 7, 8, 25, 26, 11, 12, 29, 30, 15} +#define MASK_8 {0, 9, 10, 3, 4, 13, 14, 7} +#define MASK_4 {0, 5, 6, 3} + +#define INDEX_32 vnx16qi +#define INDEX_16 vnx8hi +#define INDEX_8 vnx4si +#define INDEX_4 vnx2di + +#define PERMUTE(type, nunits) \ +type permute_##type (type x, type y) \ +{ \ + return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ +} + +PERMUTE(vnx16qi, 32) +PERMUTE(vnx8hi, 16) +PERMUTE(vnx4si, 8) +PERMUTE(vnx2di, 4) + +PERMUTE(vnx8hf, 16) +PERMUTE(vnx4sf, 8) +PERMUTE(vnx2df, 4) + +/* { dg-final { scan-assembler-not {\ttbl\t} } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.b, p[0-9]+, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-9]+, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-9]+, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-9]+, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sel_6.c b/gcc/testsuite/gcc.target/aarch64/sve/sel_6.c new file mode 100644 index 000000000..772938f68 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/sel_6.c @@ -0,0 +1,42 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -msve-vector-bits=256 --save-temps" } */ + +#include + +typedef int32_t vnx4si __attribute__((vector_size (32))); +typedef int64_t vnx2di __attribute__((vector_size (32))); + +typedef float vnx4sf __attribute__((vector_size (32))); +typedef double vnx2df __attribute__((vector_size (32))); + +/* Predicate vector: 1 0 0 0 ... 
*/ + +#define MASK_32 { 0, 33, 34, 35, 4, 37, 38, 39, 8, 41, 42, 43, 12, \ + 45, 46, 47, 16, 49, 50, 51, 20, 53, 54, 55, 24, \ + 57, 58, 59, 28, 61, 62, 63 } + +#define MASK_16 {0, 17, 18, 19, 4, 21, 22, 23, 8, 25, 26, 27, 12, 29, 30, 31} +#define MASK_8 {0, 9, 10, 11, 4, 13, 14, 15} +#define MASK_4 {0, 5, 6, 7} + +#define INDEX_8 vnx4si +#define INDEX_4 vnx2di + +#define PERMUTE(type, nunits) \ +type permute_##type (type x, type y) \ +{ \ + return __builtin_shuffle (x, y, (INDEX_##nunits) MASK_##nunits); \ +} + +PERMUTE(vnx4si, 8) +PERMUTE(vnx2di, 4) + +PERMUTE(vnx4sf, 8) +PERMUTE(vnx2df, 4) + +/* { dg-final { scan-assembler-not {\ttbl\t} } } */ + +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-9]+, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-9]+, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d, vl4\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_1.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_1.c index f4c5ebd46..5ee66da15 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/shift_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_1.c @@ -75,9 +75,9 @@ DO_IMMEDIATE_OPS (63, int64_t, 63); /* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ /* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tasrr?\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlsrr?\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tlslr?\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ /* { dg-final { scan-assembler-times {\tasr\tz[0-9]+\.b, z[0-9]+\.b, #5\n} 1 } } */ /* { dg-final { scan-assembler-times {\tlsr\tz[0-9]+\.b, z[0-9]+\.b, #5\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_1.c b/gcc/testsuite/gcc.target/aarch64/sve/single_1.c index 11b88aef7..7764a1b0f 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/single_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/single_1.c @@ -40,10 +40,7 @@ TEST_LOOP (double, 3.0) /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */ /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl16\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl8\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl4\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 11 } } */ /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_2.c b/gcc/testsuite/gcc.target/aarch64/sve/single_2.c index 1fbf4892c..42fc17b73 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/single_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/single_2.c @@ -16,10 +16,7 @@ /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 
} } */ /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl64\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl32\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl16\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl8\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl64\n} 11 } } */ /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_3.c b/gcc/testsuite/gcc.target/aarch64/sve/single_3.c index a3688b692..338ca1e3d 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/single_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/single_3.c @@ -16,10 +16,7 @@ /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */ /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl128\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl64\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl32\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl16\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl128\n} 11 } } */ /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_4.c b/gcc/testsuite/gcc.target/aarch64/sve/single_4.c index 08965d39f..37c78a659 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/single_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/single_4.c @@ -16,10 +16,7 @@ /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */ /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl256\n} 2 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl128\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl64\n} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl32\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl256\n} 11 } } */ /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c index 413532c07..d4b9776fe 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c @@ -29,12 +29,9 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \ TEST_ALL (VEC_PERM) -/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 { target aarch64_little_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 { target aarch64_little_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 { target aarch64_little_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 { target aarch64_big_endian } } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 2 } } */ +/* { dg-final { 
scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */ +/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */ /* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */ /* { dg-final { scan-assembler-not {\tzip1\t} } } */ /* { dg-final { scan-assembler-not {\tzip2\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c index 0f9f01a00..82dd43a4d 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c @@ -32,18 +32,17 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \ TEST_ALL (VEC_PERM) /* 1 for each 8-bit type. */ -/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 { target aarch64_little_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */ -/* 1 for each 16-bit type and 4 for double. */ -/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 { target aarch64_little_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 { target aarch64_big_endian } } } */ +/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */ +/* 1 for each 16-bit type plus 1 for double. */ +/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */ /* 1 for each 32-bit type. */ /* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */ +/* 3 for double. */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */ /* The 64-bit types need: ZIP1 ZIP1 (2 ZIP2s optimized away) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c index 8d9d5ab58..49fb828e8 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c @@ -35,10 +35,8 @@ vec_slp_##TYPE (TYPE *restrict a, int n) \ TEST_ALL (VEC_PERM) -/* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double. */ -/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 { target aarch64_little_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */ -/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 20 { target aarch64_big_endian } } } */ +/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double. */ +/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */ /* 1 for each 16-bit type. */ /* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]\.h, } 3 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */ @@ -49,6 +47,8 @@ TEST_ALL (VEC_PERM) /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */ /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */ +/* 4 for double. 
*/ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */ /* The 32-bit types need: ZIP1 ZIP1 (2 ZIP2s optimized away) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/smax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/smax_1.c new file mode 100644 index 000000000..050248c81 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/smax_1.c @@ -0,0 +1,71 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +#define DO_REGREG_OPS(TYPE) \ +void varith_##TYPE##_reg (TYPE *dst, TYPE *src, int count) \ +{ \ + for (int i = 0; i < count; ++i) \ + dst[i] = dst[i] > src[i] ? dst[i] : src[i]; \ +} + +#define DO_IMMEDIATE_OPS(VALUE, TYPE, NAME) \ +void varithimm_##NAME##_##TYPE (TYPE *dst, int count) \ +{ \ + for (int i = 0; i < count; ++i) \ + dst[i] = dst[i] > (TYPE) VALUE ? dst[i] : (TYPE) VALUE; \ +} + +#define DO_ARITH_OPS(TYPE) \ + DO_REGREG_OPS (TYPE); \ + DO_IMMEDIATE_OPS (0, TYPE, 0); \ + DO_IMMEDIATE_OPS (86, TYPE, 86); \ + DO_IMMEDIATE_OPS (109, TYPE, 109); \ + DO_IMMEDIATE_OPS (141, TYPE, 141); \ + DO_IMMEDIATE_OPS (-1, TYPE, minus1); \ + DO_IMMEDIATE_OPS (-110, TYPE, minus110); \ + DO_IMMEDIATE_OPS (-141, TYPE, minus141); + +DO_ARITH_OPS (int8_t) +DO_ARITH_OPS (int16_t) +DO_ARITH_OPS (int32_t) +DO_ARITH_OPS (int64_t) + +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #115\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #141\n} } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #-1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #-110\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #-115\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.b, z[0-9]+\.b, #-141\n} } } */ + +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #109\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #141\n} } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #-1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #-110\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.h, z[0-9]+\.h, #-141\n} } } */ + +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #109\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #141\n} } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #-1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #-110\n} 1 } } */ +/* { 
dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.s, z[0-9]+\.s, #-141\n} } } */ + +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #109\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #141\n} } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #-1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #-110\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmax\tz[0-9]+\.d, z[0-9]+\.d, #-141\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/smin_1.c b/gcc/testsuite/gcc.target/aarch64/sve/smin_1.c new file mode 100644 index 000000000..d6a9e9467 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/smin_1.c @@ -0,0 +1,71 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +#define DO_REGREG_OPS(TYPE) \ +void varith_##TYPE##_reg (TYPE *dst, TYPE *src, int count) \ +{ \ + for (int i = 0; i < count; ++i) \ + dst[i] = dst[i] < src[i] ? dst[i] : src[i]; \ +} + +#define DO_IMMEDIATE_OPS(VALUE, TYPE, NAME) \ +void varithimm_##NAME##_##TYPE (TYPE *dst, int count) \ +{ \ + for (int i = 0; i < count; ++i) \ + dst[i] = dst[i] < (TYPE) VALUE ? dst[i] : (TYPE) VALUE; \ +} + +#define DO_ARITH_OPS(TYPE) \ + DO_REGREG_OPS (TYPE); \ + DO_IMMEDIATE_OPS (0, TYPE, 0); \ + DO_IMMEDIATE_OPS (86, TYPE, 86); \ + DO_IMMEDIATE_OPS (109, TYPE, 109); \ + DO_IMMEDIATE_OPS (141, TYPE, 141); \ + DO_IMMEDIATE_OPS (-1, TYPE, minus1); \ + DO_IMMEDIATE_OPS (-110, TYPE, minus110); \ + DO_IMMEDIATE_OPS (-141, TYPE, minus141); + +DO_ARITH_OPS (int8_t) +DO_ARITH_OPS (int16_t) +DO_ARITH_OPS (int32_t) +DO_ARITH_OPS (int64_t) + +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #115\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #141\n} } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #-1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #-110\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #-115\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.b, z[0-9]+\.b, #-141\n} } } */ + +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #109\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #141\n} } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #-1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, #-110\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.h, z[0-9]+\.h, 
#-141\n} } } */ + +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #109\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #141\n} } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #-1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #-110\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.s, z[0-9]+\.s, #-141\n} } } */ + +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #109\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #141\n} } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #-1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #-110\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tsmin\tz[0-9]+\.d, z[0-9]+\.d, #-141\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c index 28fcc4429..fcd481611 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c @@ -9,29 +9,30 @@ void consumer (void *); void \ multi_loop_##TYPE (TYPE *x, TYPE val) \ { \ - for (int i = 0; i < 7; ++i) \ + for (int i = 0; i < 9; ++i) \ x[i] += val; \ consumer (x); \ - for (int i = 0; i < 7; ++i) \ + for (int i = 0; i < 9; ++i) \ x[i] += val; \ consumer (x); \ - for (int i = 0; i < 7; ++i) \ + for (int i = 0; i < 9; ++i) \ x[i] += val; \ consumer (x); \ } /* One iteration is enough. */ TEST_LOOP (uint8_t); +/* Two iterations are enough. We specialize the second two loops based + on whether the first executes once or twice. */ TEST_LOOP (uint16_t); -/* Two iterations are enough. Complete unrolling makes sense - even at -O2. */ +/* Three iterations are needed; ought to stay a loop. */ TEST_LOOP (uint32_t); -/* Four iterations are needed; ought to stay a loop. */ +/* Five iterations are needed; ought to stay a loop. 
*/ TEST_LOOP (uint64_t); /* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.b} 3 } } */ -/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 3 } } */ -/* { dg-final { scan-assembler {\twhilelo\tp[0-9]\.s} } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 8 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.s} 6 } } */ /* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.d} 6 } } */ /* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */ /* { dg-final { scan-assembler-not {\tstr\tz[0-9]} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c index 29e1a49dc..81b3f6452 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_4.c @@ -24,10 +24,11 @@ TEST_LOOP (uint16_t, 0x1234); TEST_LOOP (uint32_t, 0x12345); TEST_LOOP (uint64_t, 0x123456); -/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.h,} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s,} 3 } } */ -/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d,} 3 } } */ -/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h,} 3 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.b,} 6 } } */ +/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.h,} } } */ +/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.s,} } } */ +/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.d,} } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 3 } } */ /* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 3 } } */ /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 3 } } */ /* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_1.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_1.c index 6e3c8898a..918a58138 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_1.c @@ -83,9 +83,9 @@ NAME(g4) (TYPE *__restrict a, TYPE *__restrict b, TYPE *__restrict c, } } -/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} } } */ /* { dg-final { scan-assembler {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ -/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ -/* { dg-final { scan-assembler {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} } } */ /* { dg-final { scan-assembler {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ -/* { dg-final { scan-assembler {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c index 45644b67b..a16a79e51 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_14.c @@ -43,12 +43,12 @@ #undef NAME #undef TYPE -/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { 
scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c index 814dbb3ae..bc00267c8 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_15.c @@ -3,12 +3,12 @@ #include "struct_vect_14.c" -/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c index 6ecf89b54..9e2a549f5 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_16.c @@ -3,12 +3,12 @@ #include "struct_vect_14.c" -/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ -/* { 
dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c index 571c6d0d3..e791e2e12 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_17.c @@ -3,12 +3,12 @@ #include "struct_vect_14.c" -/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ -/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} 1 } } */ /* { dg-final { scan-assembler-times {\tld2h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ /* { dg-final { scan-assembler-times {\tld3h\t{z[0-9]+.h - z[0-9]+.h}, p[0-7]/z, \[x[0-9]+\]\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c index dc912e63c..3bc53b69d 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c @@ -46,4 +46,4 @@ TEST (test) /* { dg-final { scan-assembler-times {\tstr\td} 1 } } */ /* The only branches should be in the vectorized loop. */ -/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 4 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c index 6568dc71c..833bf0669 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c @@ -46,4 +46,4 @@ TEST (test) /* Each function should have three branches: one directly to the exit (n <= 0), one to the single scalar epilogue iteration (n == 1), and one branch-back for the vectorized loop. 
*/ -/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */ +/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 12 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c index 6c3520c2f..858ca74f8 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_20.c @@ -46,4 +46,4 @@ TEST (test) /* { dg-final { scan-assembler-times {\tstr\td} 1 } } */ /* The only branches should be in the vectorized loop. */ -/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 4 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c index 4b2a5e463..95691fe9e 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_21.c @@ -46,4 +46,4 @@ TEST (test) /* Each function should have three branches: one directly to the exit (n <= 0), one to the single scalar epilogue iteration (n == 1), and one branch-back for the vectorized loop. */ -/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */ +/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 12 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c index b61536053..8eb072505 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_22.c @@ -46,4 +46,4 @@ TEST (test) /* { dg-final { scan-assembler-times {\tstr\td} 1 } } */ /* The only branches should be in the vectorized loop. */ -/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 4 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c index b529e0386..705b2350a 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_23.c @@ -46,4 +46,4 @@ TEST (test) /* Each function should have three branches: one directly to the exit (n <= 0), one to the single scalar epilogue iteration (n == 1), and one branch-back for the vectorized loop. 
*/ -/* { dg-final { scan-assembler-times {\tb[a-z]+\t} 12 } } */ +/* { dg-final { scan-assembler-times {\tb[.a-z]+\t} 12 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_7.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_7.c index b74190149..3d3070e77 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_7.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_7.c @@ -78,9 +78,9 @@ g4 (TYPE *__restrict a, TYPE *__restrict b, TYPE *__restrict c, } } -/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {\tld2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} } } */ /* { dg-final { scan-assembler {\tld3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ -/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+\]\n} } } */ -/* { dg-final { scan-assembler {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {\tld4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7]/z, \[x[0-9]+, x[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {\tst2b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} } } */ /* { dg-final { scan-assembler {\tst3b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ -/* { dg-final { scan-assembler {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+\]\n} } } */ +/* { dg-final { scan-assembler {\tst4b\t{z[0-9]+.b - z[0-9]+.b}, p[0-7], \[x[0-9]+, x[0-9]+\]\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/umax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/umax_1.c new file mode 100644 index 000000000..fffedb9c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/umax_1.c @@ -0,0 +1,65 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +#define DO_REGREG_OPS(TYPE) \ +void varith_##TYPE##_reg (TYPE *dst, TYPE *src, int count) \ +{ \ + for (int i = 0; i < count; ++i) \ + dst[i] = dst[i] > src[i] ? dst[i] : src[i]; \ +} + +#define DO_IMMEDIATE_OPS(VALUE, TYPE) \ +void varithimm_##VALUE##_##TYPE (TYPE *dst, int count) \ +{ \ + for (int i = 0; i < count; ++i) \ + dst[i] = dst[i] > (TYPE) VALUE ? 
dst[i] : (TYPE) VALUE; \ +} + +#define DO_ARITH_OPS(TYPE) \ + DO_REGREG_OPS (TYPE); \ + DO_IMMEDIATE_OPS (2, TYPE); \ + DO_IMMEDIATE_OPS (86, TYPE); \ + DO_IMMEDIATE_OPS (109, TYPE); \ + DO_IMMEDIATE_OPS (141, TYPE); \ + DO_IMMEDIATE_OPS (229, TYPE); \ + DO_IMMEDIATE_OPS (255, TYPE); \ + DO_IMMEDIATE_OPS (256, TYPE); + +DO_ARITH_OPS (uint8_t) +DO_ARITH_OPS (uint16_t) +DO_ARITH_OPS (uint32_t) +DO_ARITH_OPS (uint64_t) + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #141\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #229\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #255\n} } } */ +/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.b, z[0-9]+\.b, #256\n} } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #141\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #229\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #255\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.h, z[0-9]+\.h, #256\n} } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #141\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #229\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #255\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.s, z[0-9]+\.s, #256\n} } } */ + +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #141\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #229\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #255\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tumax\tz[0-9]+\.d, z[0-9]+\.d, #256\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/umin_1.c b/gcc/testsuite/gcc.target/aarch64/sve/umin_1.c new file mode 100644 index 000000000..f7cdba3b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/umin_1.c @@ -0,0 +1,65 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +#define DO_REGREG_OPS(TYPE) \ +void varith_##TYPE##_reg (TYPE *dst, TYPE *src, int count) \ +{ \ + for (int i = 0; i < count; ++i) \ + dst[i] = dst[i] < src[i] ? 
dst[i] : src[i]; \ +} + +#define DO_IMMEDIATE_OPS(VALUE, TYPE) \ +void varithimm_##VALUE##_##TYPE (TYPE *dst, int count) \ +{ \ + for (int i = 0; i < count; ++i) \ + dst[i] = dst[i] < (TYPE) VALUE ? dst[i] : (TYPE) VALUE; \ +} + +#define DO_ARITH_OPS(TYPE) \ + DO_REGREG_OPS (TYPE); \ + DO_IMMEDIATE_OPS (2, TYPE); \ + DO_IMMEDIATE_OPS (86, TYPE); \ + DO_IMMEDIATE_OPS (109, TYPE); \ + DO_IMMEDIATE_OPS (141, TYPE); \ + DO_IMMEDIATE_OPS (229, TYPE); \ + DO_IMMEDIATE_OPS (255, TYPE); \ + DO_IMMEDIATE_OPS (256, TYPE); + +DO_ARITH_OPS (uint8_t) +DO_ARITH_OPS (uint16_t) +DO_ARITH_OPS (uint32_t) +DO_ARITH_OPS (uint64_t) + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #141\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #229\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #255\n} } } */ +/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.b, z[0-9]+\.b, #256\n} } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #141\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #229\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #255\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.h, z[0-9]+\.h, #256\n} } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #141\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #229\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #255\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.s, z[0-9]+\.s, #256\n} } } */ + +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #86\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #109\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #141\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #229\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #255\n} 1 } } */ +/* { dg-final { scan-assembler-not {\tumin\tz[0-9]+\.d, z[0-9]+\.d, #256\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unroll-1.c b/gcc/testsuite/gcc.target/aarch64/sve/unroll-1.c index d4353009e..e33777fc3 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/unroll-1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/unroll-1.c @@ -10,4 +10,4 @@ fully_peel_me (double *x) x[i] = x[i] * 2; } -/* { dg-final { scan-assembler-times {b..\t\.L.\n} 1 } } */ +/* { dg-final { scan-assembler-times 
{\tb[.a-z]+\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_17.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_17.c new file mode 100644 index 000000000..cabcfa73e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_17.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define eq(A, B) ((A) == (B)) +#define ne(A, B) ((A) != (B)) +#define olt(A, B) ((A) < (B)) +#define ole(A, B) ((A) <= (B)) +#define oge(A, B) ((A) >= (B)) +#define ogt(A, B) ((A) > (B)) +#define ordered(A, B) (!__builtin_isunordered (A, B)) +#define unordered(A, B) (__builtin_isunordered (A, B)) +#define ueq(A, B) (!__builtin_islessgreater (A, B)) +#define ult(A, B) (__builtin_isless (A, B)) +#define ule(A, B) (__builtin_islessequal (A, B)) +#define uge(A, B) (__builtin_isgreaterequal (A, B)) +#define ugt(A, B) (__builtin_isgreater (A, B)) +#define nueq(A, B) (__builtin_islessgreater (A, B)) +#define nult(A, B) (!__builtin_isless (A, B)) +#define nule(A, B) (!__builtin_islessequal (A, B)) +#define nuge(A, B) (!__builtin_isgreaterequal (A, B)) +#define nugt(A, B) (!__builtin_isgreater (A, B)) + +#define DEF_LOOP(CMP, EXPECT_INVALID) \ + void __attribute__ ((noinline, noclone)) \ + test_##CMP##_var (__fp16 *restrict dest, __fp16 *restrict src, \ + __fp16 fallback, __fp16 *restrict a, \ + __fp16 *restrict b, int count) \ + { \ + for (int i = 0; i < count; ++i) \ + dest[i] = CMP (a[i], b[i]) ? src[i] : fallback; \ + } \ + \ + void __attribute__ ((noinline, noclone)) \ + test_##CMP##_zero (__fp16 *restrict dest, __fp16 *restrict src, \ + __fp16 fallback, __fp16 *restrict a, \ + int count) \ + { \ + for (int i = 0; i < count; ++i) \ + dest[i] = CMP (a[i], (__fp16) 0) ? src[i] : fallback; \ + } \ + \ + void __attribute__ ((noinline, noclone)) \ + test_##CMP##_sel (__fp16 *restrict dest, __fp16 if_true, \ + __fp16 if_false, __fp16 *restrict a, \ + __fp16 b, int count) \ + { \ + for (int i = 0; i < count; ++i) \ + dest[i] = CMP (a[i], b) ? 
if_true : if_false; \ + } + +#define TEST_ALL(T) \ + T (eq, 0) \ + T (ne, 0) \ + T (olt, 1) \ + T (ole, 1) \ + T (oge, 1) \ + T (ogt, 1) \ + T (ordered, 0) \ + T (unordered, 0) \ + T (ueq, 0) \ + T (ult, 0) \ + T (ule, 0) \ + T (uge, 0) \ + T (ugt, 0) \ + T (nueq, 0) \ + T (nult, 0) \ + T (nule, 0) \ + T (nuge, 0) \ + T (nugt, 0) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} { xfail *-*-* } } } */ +/* { dg-final { scan-assembler {\tfcmeq\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ + +/* { dg-final { scan-assembler {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ +/* { dg-final { scan-assembler {\tfcmne\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ + +/* { dg-final { scan-assembler {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ +/* { dg-final { scan-assembler {\tfcmlt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ + +/* { dg-final { scan-assembler {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ +/* { dg-final { scan-assembler {\tfcmle\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ + +/* { dg-final { scan-assembler {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ +/* { dg-final { scan-assembler {\tfcmgt\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ + +/* { dg-final { scan-assembler {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ +/* { dg-final { scan-assembler {\tfcmge\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ + +/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, #0\.0\n} } } */ +/* { dg-final { scan-assembler {\tfcmuo\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_17_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_17_run.c new file mode 100644 index 000000000..4a228c8c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_17_run.c @@ -0,0 +1,54 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ +/* { dg-require-effective-target fenv_exceptions } */ + +#include + +#include "vcond_17.c" + +#define N 401 + +#define TEST_LOOP(CMP, EXPECT_INVALID) \ + { \ + __fp16 dest1[N], dest2[N], dest3[N], src[N]; \ + __fp16 a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + src[i] = i * i; \ + if (i % 5 == 0) \ + a[i] = 0; \ + else if (i % 3) \ + a[i] = i * 0.1; \ + else \ + a[i] = i; \ + if (i % 7 == 0) \ + b[i] = __builtin_nan (""); \ + else if (i % 6) \ + b[i] = i * 0.1; \ + else \ + b[i] = i; \ + asm volatile ("" ::: "memory"); \ + } \ + feclearexcept (FE_ALL_EXCEPT); \ + test_##CMP##_var (dest1, src, 11, a, b, N); \ + test_##CMP##_zero (dest2, src, 22, a, N); \ + test_##CMP##_sel (dest3, 33, 44, a, 9, N); \ + if (!fetestexcept (FE_INVALID) != !(EXPECT_INVALID)) \ + __builtin_abort (); \ + for (int i = 0; i < N; ++i) \ + { \ + if (dest1[i] != (CMP (a[i], b[i]) ? src[i] : 11)) \ + __builtin_abort (); \ + if (dest2[i] != (CMP (a[i], 0) ? src[i] : 22)) \ + __builtin_abort (); \ + if (dest3[i] != (CMP (a[i], 9) ? 
33 : 44)) \ + __builtin_abort (); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (void) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_18.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_18.c new file mode 100644 index 000000000..a2590b9ee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_18.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define DEF_LOOP(TYPE, NAME, CONST) \ + void \ + test_##TYPE##_##NAME (TYPE *restrict x, \ + TYPE *restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] > 0 ? CONST : 0; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, 2, 2.0) \ + T (TYPE, 1p25, 1.25) \ + T (TYPE, 32p25, 32.25) \ + T (TYPE, m4, -4.0) \ + T (TYPE, m2p5, -2.5) \ + T (TYPE, m64p5, -64.5) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/z, #16384\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/z, #15616\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/z, #-15360\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/z, #-16128\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #2\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #1\.25(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #-4\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.s), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #-2\.5(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-7], z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #2\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #1\.25(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #-4\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tmovprfx\t(z[0-9]+\.d), (p[0-7])/z, \1\n\tfmov\t\1, \2/m, #-2\.5(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-7], z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_18_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_18_run.c new file mode 100644 index 000000000..279b0a3ba --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_18_run.c @@ -0,0 +1,30 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "vcond_18.c" + +#define N 97 + +#define TEST_LOOP(TYPE, NAME, CONST) \ + { \ + TYPE x[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + pred[i] = i % 5 <= i % 6; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (x, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + if (x[i] != (TYPE) (pred[i] > 0 ? 
CONST : 0)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (int argc, char **argv) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_19.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_19.c new file mode 100644 index 000000000..2347b7f28 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_19.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define DEF_LOOP(TYPE, NAME, CONST) \ + void \ + test_##TYPE##_##NAME (TYPE *restrict x, \ + TYPE *restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] > 0 ? CONST : pred[i]; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, 2, 2.0) \ + T (TYPE, 1p25, 1.25) \ + T (TYPE, 32p25, 32.25) \ + T (TYPE, m4, -4.0) \ + T (TYPE, m2p5, -2.5) \ + T (TYPE, m64p5, -64.5) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #16384\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #15616\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #-15360\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #-16128\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #2\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #1\.25(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #-4\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #-2\.5(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-7], z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #2\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #1\.25(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #-4\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #-2\.5(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-7], z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_19_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_19_run.c new file mode 100644 index 000000000..d93d8aa45 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_19_run.c @@ -0,0 +1,30 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "vcond_19.c" + +#define N 97 + +#define TEST_LOOP(TYPE, NAME, CONST) \ + { \ + TYPE x[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + pred[i] = i % 5 <= i % 6 ? i : 0; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (x, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + if (x[i] != (TYPE) (pred[i] > 0 ? 
CONST : pred[i])) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (int argc, char **argv) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_20.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_20.c new file mode 100644 index 000000000..bf2af1c62 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_20.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define DEF_LOOP(TYPE, NAME, CONST) \ + void \ + test_##TYPE##_##NAME (TYPE *restrict x, \ + TYPE *restrict pred, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + x[i] = pred[i] > 0 ? CONST : 12.0; \ + } + +#define TEST_TYPE(T, TYPE) \ + T (TYPE, 2, 2.0) \ + T (TYPE, 1p25, 1.25) \ + T (TYPE, 32p25, 32.25) \ + T (TYPE, m4, -4.0) \ + T (TYPE, m2p5, -2.5) \ + T (TYPE, m64p5, -64.5) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, double) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #16384\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #15616\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #-15360\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, p[0-7]/m, #-16128\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.h, p[0-7], z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #2\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #1\.25(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #-4\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.s), p[0-7]/m, #-2\.5(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.s, p[0-7], z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #2\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #1\.25(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #-4\.0(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler {\tfmov\t(z[0-9]+\.d), p[0-7]/m, #-2\.5(?:e[+]0)?\n} } } */ +/* { dg-final { scan-assembler-times {\tsel\tz[0-9]+\.d, p[0-7], z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+, z[0-9]+\n} 12 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_20_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_20_run.c new file mode 100644 index 000000000..33c81deaa --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_20_run.c @@ -0,0 +1,30 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "vcond_20.c" + +#define N 97 + +#define TEST_LOOP(TYPE, NAME, CONST) \ + { \ + TYPE x[N], pred[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + pred[i] = i % 5 <= i % 6; \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (x, pred, N); \ + for (int i = 0; i < N; ++i) \ + { \ + if (x[i] != (TYPE) (pred[i] > 0 ? 
CONST : 12.0)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (int argc, char **argv) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_21.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_21.c new file mode 100644 index 000000000..d5df2e199 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_21.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define DEF_LOOP(TYPE, ABS, NAME, OP) \ + void \ + test_##TYPE##_##NAME (TYPE *restrict r, \ + TYPE *restrict a, \ + TYPE *restrict b, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + r[i] = ABS (a[i]) OP ABS (b[i]) ? 1.0 : 0.0; \ + } + +#define TEST_TYPE(T, TYPE, ABS) \ + T (TYPE, ABS, lt, <) \ + T (TYPE, ABS, le, <=) \ + T (TYPE, ABS, ge, >=) \ + T (TYPE, ABS, gt, >) + +#define TEST_ALL(T) \ + TEST_TYPE (T, _Float16, __builtin_fabsf16) \ + TEST_TYPE (T, float, __builtin_fabsf) \ + TEST_TYPE (T, double, __builtin_fabs) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tfac[lg]t\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfac[lg]e\tp[0-9]+\.h, p[0-7]/z, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfac[lg]t\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfac[lg]e\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tfac[lg]t\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfac[lg]e\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_21_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_21_run.c new file mode 100644 index 000000000..15c551324 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_21_run.c @@ -0,0 +1,31 @@ +/* { dg-do run { target aarch64_sve_hw } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "vcond_21.c" + +#define N 97 + +#define TEST_LOOP(TYPE, ABS, NAME, OP) \ + { \ + TYPE r[N], a[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = i % 5 * (i & 1 ? -1 : 1); \ + b[i] = i % 9 * (i & 2 ? -1 : 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE##_##NAME (r, a, b, N); \ + for (int i = 0; i < N; ++i) \ + { \ + if (r[i] != (ABS (a[i]) OP ABS (b[i]) ? 
1.0 : 0.0)) \ + __builtin_abort (); \ + asm volatile ("" ::: "memory"); \ + } \ + } + +int __attribute__ ((optimize (1))) +main (int argc, char **argv) +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c index a93a04baa..2655c4242 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/while_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/while_1.c @@ -42,3 +42,4 @@ TEST_ALL (ADD_LOOP) /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x0, x[0-9]+, lsl 2\]\n} 3 } } */ /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x0, x[0-9]+, lsl 3\]\n} 3 } } */ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x0, x[0-9]+, lsl 3\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tb\.any\t} 10 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_10.c b/gcc/testsuite/gcc.target/aarch64/sve/while_10.c new file mode 100644 index 000000000..eaed326f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/while_10.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=512" } */ + +#include + +#define ADD_LOOP(TYPE, COUNT) \ + TYPE __attribute__ ((noinline, noclone)) \ + vec_while_##TYPE (TYPE *restrict a) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + a[i] += 1; \ + } + +#define TEST_ALL(T) \ + T (int8_t, 63) \ + T (int16_t, 30) \ + T (int32_t, 15) \ + T (int64_t, 6) + +TEST_ALL (ADD_LOOP) + +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, mul3\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, mul3\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, mul3\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl6\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_6.c b/gcc/testsuite/gcc.target/aarch64/sve/while_6.c new file mode 100644 index 000000000..b4cc596ef --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/while_6.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */ + +#include + +#define ADD_LOOP(TYPE) \ + TYPE __attribute__ ((noinline, noclone)) \ + vec_while_##TYPE (TYPE *restrict a) \ + { \ + for (int i = 0; i < 7; ++i) \ + a[i] += 1; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (int16_t) \ + T (int32_t) \ + T (int64_t) + +TEST_ALL (ADD_LOOP) + +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl7\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl7\n} 1 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_7.c b/gcc/testsuite/gcc.target/aarch64/sve/while_7.c new file mode 100644 index 000000000..d5ffb66a1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/while_7.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */ + +#include + +#define ADD_LOOP(TYPE) \ + TYPE __attribute__ ((noinline, noclone)) \ + vec_while_##TYPE (TYPE *restrict a) \ + { \ + for (int i = 0; i < 8; ++i) \ + a[i] += 1; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (int16_t) \ + T (int32_t) \ + T (int64_t) + +TEST_ALL (ADD_LOOP) + +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl8\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl8\n} 1 } } */ +/* { dg-final { scan-assembler-times 
{\twhilelo\tp[0-7]\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_8.c b/gcc/testsuite/gcc.target/aarch64/sve/while_8.c new file mode 100644 index 000000000..1c11aa849 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/while_8.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */ + +#include + +#define ADD_LOOP(TYPE) \ + TYPE __attribute__ ((noinline, noclone)) \ + vec_while_##TYPE (TYPE *restrict a) \ + { \ + for (int i = 0; i < 9; ++i) \ + a[i] += 1; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (int16_t) \ + T (int32_t) \ + T (int64_t) + +TEST_ALL (ADD_LOOP) + +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 1 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_9.c b/gcc/testsuite/gcc.target/aarch64/sve/while_9.c new file mode 100644 index 000000000..9a8e5fe12 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/while_9.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */ + +#include + +#define ADD_LOOP(TYPE) \ + TYPE __attribute__ ((noinline, noclone)) \ + vec_while_##TYPE (TYPE *restrict a) \ + { \ + for (int i = 0; i < 16; ++i) \ + a[i] += 1; \ + } + +#define TEST_ALL(T) \ + T (int8_t) \ + T (int16_t) \ + T (int32_t) \ + T (int64_t) + +TEST_ALL (ADD_LOOP) + +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl16\n} 1 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c b/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c index d7e46b059..fc6a4f3ec 100644 --- a/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c +++ b/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c @@ -1,12 +1,12 @@ -/* { dg-do compile } */ +/* { dg-do link } */ /* { dg-options "-O3 -save-temps -mcmodel=tiny" } */ -int fixed_regs[0x00200000]; +char fixed_regs[0x00080000]; int -foo() +main () { - return fixed_regs[0x00080000]; + return fixed_regs[0x000ff000]; } /* { dg-final { scan-assembler-not "adr\tx\[0-9\]+, fixed_regs\\\+" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/symbol-range.c b/gcc/testsuite/gcc.target/aarch64/symbol-range.c index 6574cf431..d8e82fa1b 100644 --- a/gcc/testsuite/gcc.target/aarch64/symbol-range.c +++ b/gcc/testsuite/gcc.target/aarch64/symbol-range.c @@ -1,12 +1,12 @@ -/* { dg-do compile } */ +/* { dg-do link } */ /* { dg-options "-O3 -save-temps -mcmodel=small" } */ -int fixed_regs[0x200000000ULL]; +char fixed_regs[0x80000000]; int -foo() +main () { - return fixed_regs[0x100000000ULL]; + return fixed_regs[0xfffff000]; } /* { dg-final { scan-assembler-not "adrp\tx\[0-9\]+, fixed_regs\\\+" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c index e571b2f13..f56415f33 100644 --- a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c +++ b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf" } */ +/* { dg-options 
"-march=armv8-a+nolse -O2 -fno-ipa-icf -mno-outline-atomics" } */ #include "sync-comp-swap.x" diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c index 357bf1be3..39b3144aa 100644 --- a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c +++ b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "sync-op-acquire.x" diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-full.c b/gcc/testsuite/gcc.target/aarch64/sync-op-full.c index c6ba16299..6b8b2043f 100644 --- a/gcc/testsuite/gcc.target/aarch64/sync-op-full.c +++ b/gcc/testsuite/gcc.target/aarch64/sync-op-full.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8-a+nolse -O2" } */ +/* { dg-options "-march=armv8-a+nolse -O2 -mno-outline-atomics" } */ #include "sync-op-full.x" diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-10.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-10.c new file mode 100644 index 000000000..3d6893ee0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-10.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ + +int __attribute__((aarch64_vector_pcs)) (*callee) (void); + +int __attribute__ ((aarch64_vector_pcs)) +caller (int *x) +{ + return callee () + 1; +} + +/* { dg-final { scan-assembler-not {\tstp\tq} } } */ +/* { dg-final { scan-assembler-not {\tldp\tq} } } */ +/* { dg-final { scan-assembler-not {\tstr\tq} } } */ +/* { dg-final { scan-assembler-not {\tldr\tq} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-11.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-11.c new file mode 100644 index 000000000..de99bd701 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-11.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ + +int (*callee) (void); + +int __attribute__ ((aarch64_vector_pcs)) +caller (int *x) +{ + return callee () + 1; +} + +/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sstp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sstp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sstp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sstp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sstp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sstp\tq22, q23} } } */ +/* { dg-final { scan-assembler {\sldp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sldp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sldp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sldp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sldp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sldp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sldp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sldp\tq22, q23} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-8.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-8.c new file mode 100644 index 000000000..6463f6c50 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-8.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-std=gnu99" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ + +#include + +void __attribute__ ((aarch64_vector_pcs)) f (void); + +void +g (int64x2x4_t *ptr) +{ + register int64x2x4_t copy asm ("v8") = *ptr; + int64x2x4_t save; + asm volatile ("" : "=w" (save) : "0" (copy)); + f 
(); + *ptr = save; +} + +/* { dg-final { scan-assembler-times {\tld1\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-9.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-9.c new file mode 100644 index 000000000..aaa0316d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-9.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-fshrink-wrap -ffat-lto-objects" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +int callee (void); + +/* +** caller: +** ldr (w[0-9]+), \[x0\] +** cbn?z \1, [^\n]* +** ... +** ret +*/ +int __attribute__ ((aarch64_vector_pcs)) +caller (int *x) +{ + if (*x) + return callee () + 1; + else + return 0; +} + +/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sstp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sstp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sstp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sstp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sstp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sstp\tq22, q23} } } */ +/* { dg-final { scan-assembler {\sldp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sldp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sldp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sldp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sldp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sldp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sldp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sldp\tq22, q23} } } */ + +/* { dg-final { scan-assembler-not {\tstp\tq[0-7],} } } */ +/* { dg-final { scan-assembler-not {\tldp\tq[0-7],} } } */ +/* { dg-final { scan-assembler-not {\tstp\tq2[4-9],} } } */ +/* { dg-final { scan-assembler-not {\tldp\tq2[4-9],} } } */ +/* { dg-final { scan-assembler-not {\tstp\td} } } */ +/* { dg-final { scan-assembler-not {\tldp\td} } } */ +/* { dg-final { scan-assembler-not {\tstr\tq} } } */ +/* { dg-final { scan-assembler-not {\tldr\tq} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c new file mode 100644 index 000000000..ea8de4d69 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi-dotprod.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_dotprod_neon_ok } */ +/* { dg-add-options arm_v8_2a_dotprod_neon } */ +/* { dg-additional-options "-O3" } */ + +#pragma GCC target "+nosve" + +#define N 1024 + +unsigned char pix1[N], pix2[N]; + +int foo (void) +{ + int i_sum = 0; + int i; + + for (i = 0; i < N; i++) + i_sum += __builtin_abs (pix1[i] - pix2[i]); + + return i_sum; +} + +/* { dg-final { scan-assembler-not {\tushll\t} } } */ +/* { dg-final { scan-assembler-not {\tushll2\t} } } */ +/* { dg-final { scan-assembler-not {\tusubl\t} } } */ +/* { dg-final { scan-assembler-not {\tusubl2\t} } } */ +/* { dg-final { scan-assembler-not {\tabs\t} } } */ + +/* { dg-final { scan-assembler {\tuabd\t} } } */ +/* { dg-final { scan-assembler {\tudot\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c index 69ceaf425..a66e12096 100644 --- a/gcc/testsuite/gcc.target/aarch64/usadv16qi.c +++ b/gcc/testsuite/gcc.target/aarch64/usadv16qi.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-options "-O3" } */ -#pragma GCC target 
"+nosve" +#pragma GCC target "+nosve+nodotprod" #define N 1024 diff --git a/gcc/testsuite/gcc.target/aarch64/vect-clz.c b/gcc/testsuite/gcc.target/aarch64/vect-clz.c index 044fa9e99..cd181c346 100644 --- a/gcc/testsuite/gcc.target/aarch64/vect-clz.c +++ b/gcc/testsuite/gcc.target/aarch64/vect-clz.c @@ -1,6 +1,8 @@ /* { dg-do run } */ /* { dg-options "-O3 -save-temps -fno-inline -fno-vect-cost-model" } */ +#pragma GCC target "+nosve" + extern void abort (); void diff --git a/gcc/testsuite/gcc.target/i386/asm-1.c b/gcc/testsuite/gcc.target/i386/asm-1.c index cd60a09bd..5e516d882 100644 --- a/gcc/testsuite/gcc.target/i386/asm-1.c +++ b/gcc/testsuite/gcc.target/i386/asm-1.c @@ -2,7 +2,7 @@ /* { dg-require-effective-target ia32 } */ /* { dg-options "" } */ -register unsigned int EAX asm ("r14"); /* { dg-error "register name" } */ +register unsigned int EAX asm ("r14"); /* { dg-error "cannot be accessed" } */ void foo () { diff --git a/gcc/testsuite/gcc.target/i386/asm-7.c b/gcc/testsuite/gcc.target/i386/asm-7.c new file mode 100644 index 000000000..d2d113626 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/asm-7.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ia32 } */ +/* { dg-options "" } */ + +void foo (void) +{ + asm volatile ("" : : : "%r12"); /* { dg-error "cannot be clobbered" } */ +} diff --git a/gcc/testsuite/gcc.target/i386/asm-flag-0.c b/gcc/testsuite/gcc.target/i386/asm-flag-0.c index b0c05239b..e7bd1a585 100644 --- a/gcc/testsuite/gcc.target/i386/asm-flag-0.c +++ b/gcc/testsuite/gcc.target/i386/asm-flag-0.c @@ -11,5 +11,5 @@ void a(void) void b(void) { char x; - asm("" : "=@ccbad"(x)); /* { dg-error "unknown asm flag output" } */ + asm("" : "=@ccbad"(x)); /* { dg-error "unknown 'asm' flag output" } */ } diff --git a/gcc/testsuite/gcc.target/i386/funcspec-4.c b/gcc/testsuite/gcc.target/i386/funcspec-4.c index 025b97dff..e345acdef 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-4.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-4.c @@ -5,7 +5,7 @@ extern void error1 (void) __attribute__((__target__("fma400"))); /* { dg-error "unknown" } */ /* Multiple arch switches */ -extern void error2 (void) __attribute__((__target__("arch=core2,arch=k8"))); /* { dg-error "already specified" } */ +extern void error2 (void) __attribute__((__target__("arch=core2,arch=k8"))); /* { dg-error "attribute value 'arch=k8' was already specified in 'target' attribute" } */ /* Unknown tune target */ extern void error3 (void) __attribute__((__target__("tune=foobar"))); /* { dg-error "bad value" } */ diff --git a/gcc/testsuite/gcc.target/i386/inline_error.c b/gcc/testsuite/gcc.target/i386/inline_error.c index 18e506631..57e60fbad 100644 --- a/gcc/testsuite/gcc.target/i386/inline_error.c +++ b/gcc/testsuite/gcc.target/i386/inline_error.c @@ -2,7 +2,7 @@ /* { dg-options "-O0 -mno-popcnt" } */ inline int __attribute__ ((__gnu_inline__, __always_inline__, target("popcnt"))) -foo () /* { dg-error "inlining failed in call to always_inline .* target specific option mismatch" } */ +foo () /* { dg-error "inlining failed in call to 'always_inline' .* target specific option mismatch" } */ { return 0; } diff --git a/gcc/testsuite/gcc.target/i386/interrupt-6.c b/gcc/testsuite/gcc.target/i386/interrupt-6.c index bcbcc97c6..138b98fe1 100644 --- a/gcc/testsuite/gcc.target/i386/interrupt-6.c +++ b/gcc/testsuite/gcc.target/i386/interrupt-6.c @@ -31,7 +31,7 @@ fn4 (uword_t error_code, void *frame) error = error_code; } -extern int fn5 (void *) __attribute__ ((interrupt)); /* { dg-error "interrupt 
service routine can't have non-void return value" } */ +extern int fn5 (void *) __attribute__ ((interrupt)); /* { dg-error "interrupt service routine must return 'void'" } */ int fn5 (void *frame) diff --git a/gcc/testsuite/gcc.target/i386/interrupt-7.c b/gcc/testsuite/gcc.target/i386/interrupt-7.c index 506f61afa..3e2f6a0eb 100644 --- a/gcc/testsuite/gcc.target/i386/interrupt-7.c +++ b/gcc/testsuite/gcc.target/i386/interrupt-7.c @@ -8,5 +8,5 @@ extern void fn (void *) __attribute__((interrupt)); void foo (void) { - fn (&error); /* { dg-error "interrupt service routine can't be called directly" } */ + fn (&error); /* { dg-error "interrupt service routine cannot be called directly" } */ } diff --git a/gcc/testsuite/gcc.target/i386/pr30848.c b/gcc/testsuite/gcc.target/i386/pr30848.c index 2a9285151..9c4e22ac7 100644 --- a/gcc/testsuite/gcc.target/i386/pr30848.c +++ b/gcc/testsuite/gcc.target/i386/pr30848.c @@ -2,5 +2,5 @@ void foo(double d) { - __asm__ ("" : "=u" (d)); /* { dg-error "output regs" } */ + __asm__ ("" : "=u" (d)); /* { dg-error "output registers" } */ } diff --git a/gcc/testsuite/gcc.target/i386/pr39082-1.c b/gcc/testsuite/gcc.target/i386/pr39082-1.c index 2af2264c3..85b5671e9 100644 --- a/gcc/testsuite/gcc.target/i386/pr39082-1.c +++ b/gcc/testsuite/gcc.target/i386/pr39082-1.c @@ -13,7 +13,7 @@ extern int bar1 (union un); extern union un bar2 (int); int -foo1 (union un u) /* { dg-message "note: the ABI of passing union with long double has changed in GCC 4.4" } */ +foo1 (union un u) /* { dg-message "note: the ABI of passing union with 'long double' has changed in GCC 4.4" } */ { bar1 (u); return u.i; diff --git a/gcc/testsuite/gcc.target/i386/pr39678.c b/gcc/testsuite/gcc.target/i386/pr39678.c index 0548466d6..c94c002f1 100644 --- a/gcc/testsuite/gcc.target/i386/pr39678.c +++ b/gcc/testsuite/gcc.target/i386/pr39678.c @@ -10,7 +10,7 @@ struct X { struct X foo (float *p) -{ /* { dg-message "note: the ABI of passing structure with complex float member has changed in GCC 4.4" } */ +{ /* { dg-message "note: the ABI of passing structure with 'complex float' member has changed in GCC 4.4" } */ struct X x; x.c = -3; __real x.val = p[0]; diff --git a/gcc/testsuite/gcc.target/i386/pr57756.c b/gcc/testsuite/gcc.target/i386/pr57756.c index 25c565c87..9a78f62c9 100644 --- a/gcc/testsuite/gcc.target/i386/pr57756.c +++ b/gcc/testsuite/gcc.target/i386/pr57756.c @@ -3,7 +3,7 @@ /* callee cannot be inlined into caller because it has a higher target ISA. 
*/ __attribute__((always_inline,target("sse4.2"))) -__inline int callee () /* { dg-error "inlining failed in call to always_inline" } */ +__inline int callee () /* { dg-error "inlining failed in call to 'always_inline'" } */ { return 0; } diff --git a/gcc/testsuite/gcc.target/i386/pr62120.c b/gcc/testsuite/gcc.target/i386/pr62120.c index bfb8c4703..28d85d377 100644 --- a/gcc/testsuite/gcc.target/i386/pr62120.c +++ b/gcc/testsuite/gcc.target/i386/pr62120.c @@ -3,6 +3,6 @@ void foo () { - register int zmm_var asm ("ymm9");/* { dg-error "invalid register name" } */ - register int zmm_var2 asm ("23");/* { dg-error "invalid register name" } */ + register int zmm_var asm ("ymm9");/* { dg-error "cannot be accessed" } */ + register int zmm_var2 asm ("23");/* { dg-error "cannot be accessed" } */ } diff --git a/gcc/testsuite/gcc.target/i386/pr68843-1.c b/gcc/testsuite/gcc.target/i386/pr68843-1.c index da0676aa6..6198ea9af 100644 --- a/gcc/testsuite/gcc.target/i386/pr68843-1.c +++ b/gcc/testsuite/gcc.target/i386/pr68843-1.c @@ -5,7 +5,7 @@ double test () { double x = 1.0; - asm ("fld %1" /* { dg-error "explicitly used regs must be grouped at top of stack" } */ + asm ("fld %1" /* { dg-error "explicitly used registers must be grouped at top of stack" } */ : "=&t" (x) : "u" (x)); return x; diff --git a/gcc/testsuite/gcc.target/i386/pr79804.c b/gcc/testsuite/gcc.target/i386/pr79804.c index 10adb4466..08d1a3ea1 100644 --- a/gcc/testsuite/gcc.target/i386/pr79804.c +++ b/gcc/testsuite/gcc.target/i386/pr79804.c @@ -7,4 +7,4 @@ void foo (void) register int r19 asm ("19"); asm volatile ("# %0" : "=r"(r19)); /* { dg-error "invalid use of register" } */ -} /* { dg-error "cannot be used in asm here" } */ +} /* { dg-error "cannot be used in 'asm' here" } */ diff --git a/gcc/testsuite/gcc.target/i386/pr82673.c b/gcc/testsuite/gcc.target/i386/pr82673.c index 50eb5a3bc..161ec88e3 100644 --- a/gcc/testsuite/gcc.target/i386/pr82673.c +++ b/gcc/testsuite/gcc.target/i386/pr82673.c @@ -9,4 +9,4 @@ void bar (void) /* { dg-error "frame pointer required, but reserved" } */ { B = &y; -} /* { dg-error "bp cannot be used in asm here" } */ +} /* { dg-error "bp cannot be used in 'asm' here" } */ diff --git a/gcc/testsuite/gcc.target/i386/pr88809-2.c b/gcc/testsuite/gcc.target/i386/pr88809-2.c new file mode 100644 index 000000000..b8ef51dab --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88809-2.c @@ -0,0 +1,9 @@ +/* PR target/88809 */ +/* { dg-options "-Os" } */ + +unsigned int foo (const char *ptr) +{ + return __builtin_strlen (ptr); +} + +/* { dg-final { scan-assembler "call\[ \t\]strlen" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr88809.c b/gcc/testsuite/gcc.target/i386/pr88809.c new file mode 100644 index 000000000..20844ddb9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88809.c @@ -0,0 +1,9 @@ +/* PR target/88809 */ +/* { dg-options "-O" } */ + +unsigned int foo (const char *ptr) +{ + return __builtin_strlen (ptr); +} + +/* { dg-final { scan-assembler "call\[ \t\]strlen" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1.c b/gcc/testsuite/gcc.target/i386/pr88828-1.c new file mode 100644 index 000000000..a15d1fea3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1.c @@ -0,0 +1,49 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-1a.c" +#include "pr88828-1b.c" +#include "pr88828-1c.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (y[i] != z) + 
abort (); + } + else + { + if (y[i] != f[i]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 134567; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x, z); + do_check (y, f, z); + y = foo2 (x, z); + do_check (y, f, z); + y = foo3 (x, z); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1a.c b/gcc/testsuite/gcc.target/i386/pr88828-1a.c new file mode 100644 index 000000000..d37b24c66 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1a.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo1 (__v4sf x, float f) +{ + __v4sf y = { f, x[1], x[2], x[3] }; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1b.c b/gcc/testsuite/gcc.target/i386/pr88828-1b.c new file mode 100644 index 000000000..af4aced65 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1b.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__attribute__((noinline, noclone)) +__v4sf +foo2 (__v4sf x, float f) +{ + return vector_init (f, x[1], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-1c.c b/gcc/testsuite/gcc.target/i386/pr88828-1c.c new file mode 100644 index 000000000..a117f3ec7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-1c.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo3 (__v4sf x, float f) +{ + __v4sf y = x; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-4a.c b/gcc/testsuite/gcc.target/i386/pr88828-4a.c new file mode 100644 index 000000000..64043b985 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4a.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 1 } } */ +/* { dg-final { scan-assembler-not "movaps" } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git 
a/gcc/testsuite/gcc.target/i386/pr88828-4b.c b/gcc/testsuite/gcc.target/i386/pr88828-4b.c new file mode 100644 index 000000000..ad8d2b985 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-4b.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vmovss" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpinsrd" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovss" { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[1] }; + y[0] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5a.c b/gcc/testsuite/gcc.target/i386/pr88828-5a.c new file mode 100644 index 000000000..5e908faef --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5a.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler "movss" } } */ +/* { dg-final { scan-assembler-times "shufps" 2 } } */ +/* { dg-final { scan-assembler-times "movaps" 1 } } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-5b.c b/gcc/testsuite/gcc.target/i386/pr88828-5b.c new file mode 100644 index 000000000..988a48823 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-5b.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vpermilps" 1 } } */ +/* { dg-final { scan-assembler-times "vinsertps" 1 } } */ +/* { dg-final { scan-assembler-not "vshufps" } } */ +/* { dg-final { scan-assembler-not "vmovss" } } */ +/* { dg-final { scan-assembler-not "vmovaps" } } */ +/* { dg-final { scan-assembler-not "vmovlhps" } } */ +/* { dg-final { scan-assembler-not "vunpcklps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__attribute__((noinline, noclone)) +__v4sf +foo (__v4sf x, float f) +{ + __v4sf y = { x[0], x[2], x[3], x[0] }; + y[3] = f; + return y; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7.c b/gcc/testsuite/gcc.target/i386/pr88828-7.c new file mode 100644 index 000000000..4302c2664 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-7.c @@ -0,0 +1,53 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2 -fexcess-precision=standard" } */ + +#include "pr88828-7a.c" +#include "pr88828-7b.c" + +extern void abort (); + +float +bar (float x, float y) +{ + return x / y - y * x; +} + +void +do_check (__v4sf x, float f1[4], float f2[4]) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (x[i] != bar (f1[i], f2[i])) + abort (); + } + else + { + if (x[i] != f1[i]) + abort (); + } +} + +int +main (void) +{ + float f1[4] = { -11, 2, 55553, -4 }; + float f2[4] = { 111, 3.3, -55.553, 4.8 }; + __v4sf x = { f1[0], f1[1], f1[2], f1[3] }; + __v4sf y = { f2[0], f2[1], f2[2], f2[3] }; + __v4sf z; + int i; + + for (i = 0; i < 4; 
i++) + if (x[i] != f1[i] || y[i] != f2[i] ) + abort (); + + z = foo1 (x, y); + do_check (z, f1, f2); + x = foo2 (x, y); + do_check (z, f1, f2); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7a.c b/gcc/testsuite/gcc.target/i386/pr88828-7a.c new file mode 100644 index 000000000..f1ae57422 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-7a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); +extern float bar (float, float); + +__v4sf +foo1 (__v4sf x, __v4sf y) +{ + __v4sf z = { bar (x[0], y[0]), x[1], x[2], x[3] }; + return z; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-7b.c b/gcc/testsuite/gcc.target/i386/pr88828-7b.c new file mode 100644 index 000000000..c027c5694 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-7b.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); +extern float bar (float, float); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__v4sf +foo2 (__v4sf x, __v4sf y) +{ + return vector_init (bar (x[0], y[0]), x[1], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8.c b/gcc/testsuite/gcc.target/i386/pr88828-8.c new file mode 100644 index 000000000..3b8eabd22 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-8.c @@ -0,0 +1,46 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-8a.c" +#include "pr88828-8b.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (y[i] != z) + abort (); + } + else + { + if (y[i] != f[i]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 11.4; + __v4sf x = { f[0], f[1], f[2], f[3] }; + __v4sf y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x); + do_check (y, f, z); + y = foo2 (x); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8a.c b/gcc/testsuite/gcc.target/i386/pr88828-8a.c new file mode 100644 index 000000000..5d383dfd0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-8a.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +__v4sf +foo1 (__v4sf x) +{ + __v4sf z = { 11.4, x[1], x[2], x[3] }; + return z; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-8b.c b/gcc/testsuite/gcc.target/i386/pr88828-8b.c new file mode 100644 index 000000000..5ffbc9c31 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-8b.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final 
{ scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +static __v4sf +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return y; +} + +__v4sf +foo2 (__v4sf x) +{ + return vector_init (11.4, x[1], x[2], x[3]) ; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9.c b/gcc/testsuite/gcc.target/i386/pr88828-9.c new file mode 100644 index 000000000..c33907b4a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-9.c @@ -0,0 +1,46 @@ +/* { dg-do run { target sse2_runtime } } */ +/* { dg-options "-O2 -msse2" } */ + +#include "pr88828-9a.c" +#include "pr88828-9b.c" + +extern void abort (); + +void +do_check (__v4sf y, float f[4], float z) +{ + int i; + + for (i = 0; i < 4; i++) + if (i == 0) + { + if (y[i] != z) + abort (); + } + else + { + if (y[i] != f[i]) + abort (); + } +} + +int +main (void) +{ + float f[4] = { -11, 2, 55553, -4 }; + float z = 11.4; + __m128 x = (__m128) (__v4sf) { f[0], f[1], f[2], f[3] }; + __m128 y; + int i; + + for (i = 0; i < 4; i++) + if (x[i] != f[i]) + abort (); + + y = foo1 (x); + do_check (y, f, z); + y = foo2 (x); + do_check (y, f, z); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9a.c b/gcc/testsuite/gcc.target/i386/pr88828-9a.c new file mode 100644 index 000000000..7f8306577 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-9a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); + +__m128 +foo1 (__m128 x) +{ + __v4sf z = { 11.4, ((__v4sf) x)[1], ((__v4sf) x)[2], ((__v4sf) x) [3] }; + return (__m128) z; +} diff --git a/gcc/testsuite/gcc.target/i386/pr88828-9b.c b/gcc/testsuite/gcc.target/i386/pr88828-9b.c new file mode 100644 index 000000000..6588ad15a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88828-9b.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse -mno-sse4" } */ +/* { dg-final { scan-assembler-not "movlhps" } } */ +/* { dg-final { scan-assembler-not "unpckhps" } } */ +/* { dg-final { scan-assembler-not "unpcklps" } } */ +/* { dg-final { scan-assembler-not "shufps" } } */ + +typedef float __v4sf __attribute__ ((__vector_size__ (16))); +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); + +static __m128 +vector_init (float f0,float f1, float f2,float f3) +{ + __v4sf y = { f0, f1, f2, f3 }; + return (__m128) y; +} + +__m128 +foo2 (__m128 x) +{ + return vector_init (11.4, ((__v4sf) x)[1], ((__v4sf) x)[2], + ((__v4sf) x) [3]); +} diff --git a/gcc/testsuite/gcc.target/i386/pr88963-1.c b/gcc/testsuite/gcc.target/i386/pr88963-1.c new file mode 100644 index 000000000..e6f15259e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88963-1.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -mavx2 -fdump-tree-optimized" } */ + +typedef int VInt __attribute__((vector_size(64))); + +void test(VInt*__restrict a, VInt*__restrict b, + VInt*__restrict c) +{ + *a = *b + *c; +} + +/* Vector loads and stores should be split. 
*/ +/* { dg-final { scan-tree-dump-not "vector\\(16\\)" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr88963-2.c b/gcc/testsuite/gcc.target/i386/pr88963-2.c new file mode 100644 index 000000000..114f1f5c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88963-2.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64 -msse2 -fdump-tree-optimized" } */ + +typedef int VInt __attribute__((vector_size(64))); + +void test(VInt*__restrict a, VInt*__restrict b, + VInt*__restrict c) +{ + *a = *b + *c; +} + +/* Vector loads and stores should be split. */ +/* { dg-final { scan-tree-dump-not "vector\\(16\\)" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "vector\\(8\\)" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr89261.c b/gcc/testsuite/gcc.target/i386/pr89261.c new file mode 100644 index 000000000..63882c099 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr89261.c @@ -0,0 +1,9 @@ +/* PR target/89261 */ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +typedef double __v2df __attribute__ ((vector_size (16), aligned (1 << 28))); + +__v2df foo = { 1.0, 2.0 }; + +/* { dg-final { scan-assembler "\.align\[ \t]+268435456" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92645-2.c b/gcc/testsuite/gcc.target/i386/pr92645-2.c new file mode 100644 index 000000000..d34ed3aa8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92645-2.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -fdump-tree-cddce1" } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef int v2si __attribute__((vector_size(8))); + +void low (v2si *dst, v4si *srcp) +{ + v4si src = *srcp; + *dst = (v2si) { src[0], src[1] }; +} + +void high (v2si *dst, v4si *srcp) +{ + v4si src = *srcp; + *dst = (v2si) { src[2], src[3] }; +} + +void even (v2si *dst, v4si *srcp) +{ + v4si src = *srcp; + *dst = (v2si) { src[0], src[2] }; +} + +void odd (v2si *dst, v4si *srcp) +{ + v4si src = *srcp; + *dst = (v2si) { src[1], src[3] }; +} + +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */ +/* Ideally highpart extraction would elide the permutation as well. */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92645-3.c b/gcc/testsuite/gcc.target/i386/pr92645-3.c new file mode 100644 index 000000000..9c08c9fb6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92645-3.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -fdump-tree-cddce1" } */ + +typedef int v8si __attribute__((vector_size(32))); +typedef float v4sf __attribute__((vector_size(16))); + +void low (v4sf *dst, v8si *srcp) +{ + v8si src = *srcp; + *dst = (v4sf) { src[0], src[1], src[2], src[3] }; +} + +void high (v4sf *dst, v8si *srcp) +{ + v8si src = *srcp; + *dst = (v4sf) { src[4], src[5], src[6], src[7] }; +} + +void even (v4sf *dst, v8si *srcp) +{ + v8si src = *srcp; + *dst = (v4sf) { src[0], src[2], src[4], src[6] }; +} + +void odd (v4sf *dst, v8si *srcp) +{ + v8si src = *srcp; + *dst = (v4sf) { src[1], src[3], src[5], src[7] }; +} + +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */ +/* Four conversions, on the smaller vector type, to not convert excess + elements. 
*/ +/* { dg-final { scan-tree-dump-times " = \\\(vector\\\(4\\\) float\\\)" 4 "cddce1" } } */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */ +/* Ideally highpart extraction would elide the VEC_PERM_EXPR as well. */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c new file mode 100644 index 000000000..788a97ed1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi" } */ + +typedef unsigned int u32v4 __attribute__((vector_size(16))); +typedef unsigned short u16v16 __attribute__((vector_size(32))); +typedef unsigned char u8v16 __attribute__((vector_size(16))); + +union vec128 { + u8v16 u8; + u32v4 u32; +}; + +#define memcpy __builtin_memcpy + +static u16v16 zxt(u8v16 x) +{ + return (u16v16) { + x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + }; +} + +static u8v16 narrow(u16v16 x) +{ + return (u8v16) { + x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + }; +} + +void f(char *dst, char *src, unsigned long n, unsigned c) +{ + unsigned ia = 255 - (c >> 24); + ia += ia >> 7; + + union vec128 c4 = {0}, ia16 = {0}; + c4.u32 += c; + ia16.u8 += (unsigned char)ia; + + u16v16 c16 = (zxt(c4.u8) << 8) + 128; + + for (; n; src += 16, dst += 16, n -= 4) { + union vec128 s; + memcpy(&s, src, sizeof s); + s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8); + memcpy(dst, &s, sizeof s); + } +} + +/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 3 "optimized" } } */ +/* We're missing an opportunity to, after later optimizations, combine + a uniform CTOR with a vec_unpack_lo_expr to a CTOR on a converted + element. */ +/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 2 "optimized" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92803.c b/gcc/testsuite/gcc.target/i386/pr92803.c new file mode 100644 index 000000000..fc8d64efb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92803.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -Wno-psabi -mavx2 -fdump-tree-forwprop1" } */ + +typedef double v4df __attribute__((vector_size (32))); +typedef float v8sf __attribute__((vector_size (32))); +typedef float v4sf __attribute__((vector_size (16))); +typedef int v4si __attribute__((vector_size (16))); +typedef double v2df __attribute__((vector_size (16))); + +v2df +foo (v4df x, double *p, v2df y) +{ + return (v2df) { x[3], *p }; +} + +v4sf +bar (v4si x, float *p) +{ + return (v4sf) { x[0], x[1], x[2], *p }; +} + +v4sf +baz (v4si x) +{ + return (v4sf) { x[0], x[1], 3.0f, 1.0f }; +} + +v4sf +barf (v8sf x) +{ + return (v4sf) { x[4], x[5], 1.0f, 2.0f }; +} + +/* We expect all CTORs to turn into permutes, the FP converting ones + to two each with the one with constants possibly elided in the future + by converting 3.0f and 1.0f "back" to integers. 
*/ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 6 "forwprop1" } } */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 5 "forwprop1" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gfortran.dg/graphite/interchange-3.f90 b/gcc/testsuite/gfortran.dg/graphite/interchange-3.f90 index 8070bbb4a..d827323ac 100644 --- a/gcc/testsuite/gfortran.dg/graphite/interchange-3.f90 +++ b/gcc/testsuite/gfortran.dg/graphite/interchange-3.f90 @@ -23,5 +23,3 @@ Program FOO 366 format(/, ' PC = ',E12.4,/,' UC = ',E12.4,/,' VC = ',E12.4,/) end Program FOO - -! { dg-final { scan-tree-dump "tiled" "graphite" } } diff --git a/gcc/testsuite/gfortran.dg/pr88833.f90 b/gcc/testsuite/gfortran.dg/pr88833.f90 new file mode 100644 index 000000000..224e6ce5f --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr88833.f90 @@ -0,0 +1,9 @@ +! { dg-do assemble { target aarch64_asm_sve_ok } } +! { dg-options "-O3 -march=armv8.2-a+sve --save-temps" } + +subroutine foo(x) + real :: x(100) + x = x + 10 +end subroutine foo + +! { dg-final { scan-assembler {\twhilelo\tp[0-9]+\.s, wzr, (w[0-9]+).*\twhilelo\tp[0-9]+\.s, w[0-9]+, \1} } } diff --git a/gcc/testsuite/gnat.dg/opt39.adb b/gcc/testsuite/gnat.dg/opt39.adb index 3b12cf201..0a5ef67a2 100644 --- a/gcc/testsuite/gnat.dg/opt39.adb +++ b/gcc/testsuite/gnat.dg/opt39.adb @@ -27,4 +27,5 @@ begin end if; end; --- { dg-final { scan-tree-dump-times "MEM" 1 "optimized" } } +-- { dg-final { scan-tree-dump-not "MEM" "optimized" } } +-- { dg-final { scan-tree-dump-not "tmp" "optimized" } } diff --git a/gcc/testsuite/lib/prune.exp b/gcc/testsuite/lib/prune.exp index 812c59e6f..a9beef48e 100644 --- a/gcc/testsuite/lib/prune.exp +++ b/gcc/testsuite/lib/prune.exp @@ -21,7 +21,7 @@ load_lib multiline.exp if ![info exists TEST_ALWAYS_FLAGS] { set TEST_ALWAYS_FLAGS "" } -set TEST_ALWAYS_FLAGS "-fno-diagnostics-show-caret -fno-diagnostics-show-line-numbers -fdiagnostics-color=never $TEST_ALWAYS_FLAGS" +set TEST_ALWAYS_FLAGS "-fno-diagnostics-show-caret -fno-diagnostics-show-line-numbers -fdiagnostics-color=never -fdiagnostics-urls=never $TEST_ALWAYS_FLAGS" proc prune_gcc_output { text } { global srcdir diff --git a/gcc/testsuite/lib/scanasm.exp b/gcc/testsuite/lib/scanasm.exp index 35ccbc86f..4ff39dab3 100644 --- a/gcc/testsuite/lib/scanasm.exp +++ b/gcc/testsuite/lib/scanasm.exp @@ -546,3 +546,179 @@ proc scan-lto-assembler { args } { verbose "output_file: $output_file" dg-scan "scan-lto-assembler" 1 $testcase $output_file $args } + +# Read assembly file FILENAME and store a mapping from function names +# to function bodies in array RESULT. FILENAME has already been uploaded +# locally where necessary and is known to exist. + +proc parse_function_bodies { filename result } { + upvar $result up_result + + # Regexp for the start of a function definition (name in \1). + set label {^([a-zA-Z_]\S+):$} + + # Regexp for the end of a function definition. + set terminator {^\s*\.size} + + # Regexp for lines that aren't interesting. + set fluff {^\s*(?:\.|//)} + + set fd [open $filename r] + set in_function 0 + while { [gets $fd line] >= 0 } { + if { [regexp $label $line dummy function_name] } { + set in_function 1 + set function_body "" + } elseif { $in_function } { + if { [regexp $terminator $line] } { + set up_result($function_name) $function_body + set in_function 0 + } elseif { ![regexp $fluff $line] } { + append function_body $line "\n" + } + } + } + close $fd +} + +# FUNCTIONS is an array that maps function names to function bodies. 
+# Return true if it contains a definition of function NAME and if +# that definition matches BODY_REGEXP. + +proc check_function_body { functions name body_regexp } { + upvar $functions up_functions + + if { ![info exists up_functions($name)] } { + return 0 + } + return [regexp "^$body_regexp\$" $up_functions($name)] +} + +# Check the implementations of functions against expected output. Used as: +# +# { dg-do { check-function-bodies PREFIX TERMINATOR[ OPTION[ SELECTOR]] } } +# +# See sourcebuild.texi for details. + +proc check-function-bodies { args } { + if { [llength $args] < 2 } { + error "too few arguments to check-function-bodies" + } + if { [llength $args] > 4 } { + error "too many arguments to check-function-bodies" + } + + if { [llength $args] >= 3 } { + set required_flag [lindex $args 2] + + upvar 2 dg-extra-tool-flags extra_tool_flags + set flags $extra_tool_flags + + global torture_current_flags + if { [info exists torture_current_flags] } { + append flags " " $torture_current_flags + } + if { ![regexp " $required_flag " $flags] } { + return + } + } + + set xfail_all 0 + if { [llength $args] >= 4 } { + switch [dg-process-target [lindex $args 3]] { + "S" { } + "N" { return } + "F" { set xfail_all 1 } + "P" { } + } + } + + set testcase [testname-for-summary] + # The name might include a list of options; extract the file name. + set filename [lindex $testcase 0] + + global srcdir + set input_filename "$srcdir/$filename" + set output_filename "[file rootname [file tail $filename]].s" + + set prefix [lindex $args 0] + set prefix_len [string length $prefix] + set terminator [lindex $args 1] + if { [string equal $terminator ""] } { + set terminator "*/" + } + set terminator_len [string length $terminator] + + set have_bodies 0 + if { [is_remote host] } { + remote_upload host "$filename" + } + if { [file exists $output_filename] } { + parse_function_bodies $output_filename functions + set have_bodies 1 + } else { + verbose -log "$testcase: output file does not exist" + } + + set count 0 + set function_regexp "" + set label {^(\S+):$} + + set lineno 1 + set fd [open $input_filename r] + set in_function 0 + while { [gets $fd line] >= 0 } { + if { [string equal -length $prefix_len $line $prefix] } { + set line [string trim [string range $line $prefix_len end]] + if { !$in_function } { + if { [regexp "^(.*\\S)\\s+{(.*)}\$" $line dummy \ + line selector] } { + set selector [dg-process-target $selector] + } else { + set selector "P" + } + if { ![regexp $label $line dummy function_name] } { + close $fd + error "check-function-bodies: line $lineno does not have a function label" + } + set in_function 1 + set function_regexp "" + } elseif { [string equal $line "("] } { + append function_regexp "(?:" + } elseif { [string equal $line "|"] } { + append function_regexp "|" + } elseif { [string equal $line ")"] } { + append function_regexp ")" + } elseif { [string equal $line "..."] } { + append function_regexp ".*" + } else { + append function_regexp "\t" $line "\n" + } + } elseif { [string equal -length $terminator_len $line $terminator] } { + if { ![string equal $selector "N"] } { + if { $xfail_all || [string equal $selector "F"] } { + setup_xfail "*-*-*" + } + set testname "$testcase check-function-bodies $function_name" + if { !$have_bodies } { + unresolved $testname + } elseif { [check_function_body functions $function_name \ + $function_regexp] } { + pass $testname + } else { + fail $testname + } + } + set in_function 0 + incr count + } + incr lineno + } + close $fd + if { $in_function } { + 
error "check-function-bodies: missing \"$terminator\"" + } + if { $count == 0 } { + error "check-function-bodies: no matches found" + } +} diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index ea9a50ccb..2eeb6883a 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -3336,6 +3336,24 @@ proc check_effective_target_aarch64_sve { } { }] } +# Return 1 if this is an AArch64 target supporting SVE2. +proc check_effective_target_aarch64_sve2 { } { + if { ![istarget aarch64*-*-*] } { + return 0 + } + return [check_no_compiler_messages aarch64_sve2 assembly { + #if !defined (__ARM_FEATURE_SVE2) + #error FOO + #endif + }] +} + +# Return 1 if this is an AArch64 target only supporting SVE (not SVE2). +proc check_effective_target_aarch64_sve1_only { } { + return [expr { [check_effective_target_aarch64_sve] + && ![check_effective_target_aarch64_sve2] }] +} + # Return the size in bits of an SVE vector, or 0 if the size is variable. proc aarch64_sve_bits { } { return [check_cached_effective_target aarch64_sve_bits { @@ -4356,6 +4374,22 @@ proc check_effective_target_aarch64_sve_hw { } { }] } +# Return true if this is an AArch64 target that can run SVE2 code. + +proc check_effective_target_aarch64_sve2_hw { } { + if { ![istarget aarch64*-*-*] } { + return 0 + } + return [check_runtime aarch64_sve2_hw_available { + int + main (void) + { + asm volatile ("addp z0.b, p0/m, z0.b, z1.b"); + return 0; + } + }] +} + # Return true if this is an AArch64 target that can run SVE code and # if its SVE vectors have exactly BITS bits. @@ -4569,6 +4603,49 @@ proc add_options_for_arm_v8_2a_dotprod_neon { flags } { return "$flags $et_arm_v8_2a_dotprod_neon_flags" } +# Return 1 if the target supports ARMv8.2+i8mm Adv.SIMD Dot Product +# instructions, 0 otherwise. The test is valid for ARM and for AArch64. +# Record the command line options needed. + +proc check_effective_target_arm_v8_2a_i8mm_ok_nocache { } { + global et_arm_v8_2a_i8mm_flags + set et_arm_v8_2a_i8mm_flags "" + + if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } { + return 0; + } + + # Iterate through sets of options to find the compiler flags that + # need to be added to the -march option. + foreach flags {"" "-mfloat-abi=hard -mfpu=neon-fp-armv8" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" } { + if { [check_no_compiler_messages_nocache \ + arm_v8_2a_i8mm_ok object { + #include + #if !defined (__ARM_FEATURE_MATMUL_INT8) + #error "__ARM_FEATURE_MATMUL_INT8 not defined" + #endif + } "$flags -march=armv8.2-a+i8mm"] } { + set et_arm_v8_2a_i8mm_flags "$flags -march=armv8.2-a+i8mm" + return 1 + } + } + + return 0; +} + +proc check_effective_target_arm_v8_2a_i8mm_ok { } { + return [check_cached_effective_target arm_v8_2a_i8mm_ok \ + check_effective_target_arm_v8_2a_i8mm_ok_nocache] +} + +proc add_options_for_arm_v8_2a_i8mm { flags } { + if { ! [check_effective_target_arm_v8_2a_i8mm_ok] } { + return "$flags" + } + global et_arm_v8_2a_i8mm_flags + return "$flags $et_arm_v8_2a_i8mm_flags" +} + # Return 1 if the target supports FP16 VFMAL and VFMSL # instructions, 0 otherwise. # Record the command line options needed. @@ -4614,6 +4691,45 @@ proc add_options_for_arm_fp16fml_neon { flags } { return "$flags $et_arm_fp16fml_neon_flags" } +# Return 1 if the target supports BFloat16 SIMD instructions, 0 otherwise. +# The test is valid for ARM and for AArch64. 
+ +proc check_effective_target_arm_v8_2a_bf16_neon_ok_nocache { } { + global et_arm_v8_2a_bf16_neon_flags + set et_arm_v8_2a_bf16_neon_flags "" + + if { ![istarget arm*-*-*] && ![istarget aarch64*-*-*] } { + return 0; + } + + foreach flags {"" "-mfloat-abi=hard -mfpu=neon-fp-armv8" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" } { + if { [check_no_compiler_messages_nocache arm_v8_2a_bf16_neon_ok object { + #include + #if !defined (__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) + #error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC not defined" + #endif + } "$flags -march=armv8.2-a+bf16"] } { + set et_arm_v8_2a_bf16_neon_flags "$flags -march=armv8.2-a+bf16" + return 1 + } + } + + return 0; +} + +proc check_effective_target_arm_v8_2a_bf16_neon_ok { } { + return [check_cached_effective_target arm_v8_2a_bf16_neon_ok \ + check_effective_target_arm_v8_2a_bf16_neon_ok_nocache] +} + +proc add_options_for_arm_v8_2a_bf16_neon { flags } { + if { ! [check_effective_target_arm_v8_2a_bf16_neon_ok] } { + return "$flags" + } + global et_arm_v8_2a_bf16_neon_flags + return "$flags $et_arm_v8_2a_bf16_neon_flags" +} + # Return 1 if the target supports executing ARMv8 NEON instructions, 0 # otherwise. @@ -6093,7 +6209,24 @@ proc check_effective_target_vect_usad_char { } { proc check_effective_target_vect_avg_qi {} { return [expr { [istarget aarch64*-*-*] - && ![check_effective_target_aarch64_sve] }] + && ![check_effective_target_aarch64_sve1_only] }] +} + +# Return 1 if the target plus current options supports both signed +# and unsigned multiply-high-with-round-and-scale operations +# on vectors of half-words. + +proc check_effective_target_vect_mulhrs_hi {} { + return [expr { [istarget aarch64*-*-*] + && [check_effective_target_aarch64_sve2] }] +} + +# Return 1 if the target plus current options supports signed division +# by power-of-2 operations on vectors of 4-byte integers. + +proc check_effective_target_vect_sdiv_pow2_si {} { + return [expr { [istarget aarch64*-*-*] + && [check_effective_target_aarch64_sve] }] } # Return 1 if the target plus current options supports a vector @@ -8579,7 +8712,8 @@ proc check_effective_target_aarch64_tiny { } { # Create functions to check that the AArch64 assembler supports the # various architecture extensions via the .arch_extension pseudo-op. 
-foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"} { +foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve" + "i8mm" "f32mm" "f64mm" "bf16" } { eval [string map [list FUNC $aarch64_ext] { proc check_effective_target_aarch64_asm_FUNC_ok { } { if { [istarget aarch64*-*-*] } { diff --git a/gcc/testsuite/obj-c++.dg/stubify-1.mm b/gcc/testsuite/obj-c++.dg/stubify-1.mm index e8f21882d..a32e28251 100644 --- a/gcc/testsuite/obj-c++.dg/stubify-1.mm +++ b/gcc/testsuite/obj-c++.dg/stubify-1.mm @@ -4,7 +4,7 @@ /* { dg-do compile { target *-*-darwin* } } */ /* { dg-skip-if "" { *-*-* } { "-fgnu-runtime" } { "" } } */ /* { dg-require-effective-target ilp32 } */ -/* { dg-options "-mdynamic-no-pic -fno-exceptions -mmacosx-version-min=10.4 -msymbol-stubs" } */ +/* { dg-options "-Os -mdynamic-no-pic -fno-exceptions -mmacosx-version-min=10.4 -msymbol-stubs" } */ typedef struct objc_object { } *id ; int x = 41 ; diff --git a/gcc/testsuite/obj-c++.dg/stubify-2.mm b/gcc/testsuite/obj-c++.dg/stubify-2.mm index 1863f986c..69fea8def 100644 --- a/gcc/testsuite/obj-c++.dg/stubify-2.mm +++ b/gcc/testsuite/obj-c++.dg/stubify-2.mm @@ -4,7 +4,7 @@ /* { dg-do compile { target *-*-darwin* } } */ /* { dg-skip-if "" { *-*-* } { "-fgnu-runtime" } { "" } } */ /* { dg-require-effective-target ilp32 } */ -/* { dg-options "-mdynamic-no-pic -mmacosx-version-min=10.4 -msymbol-stubs" } */ +/* { dg-options "-mdynamic-no-pic -fdump-rtl-jump -mmacosx-version-min=10.4 -msymbol-stubs" } */ typedef struct objc_object { } *id ; int x = 41 ; @@ -30,6 +30,7 @@ extern int bogonic (int, int, int) ; /* Any symbol_ref of an un-stubified objc_msgSend is an error; look for "objc_msgSend" in quotes, without the $stub suffix. */ +/* { dg-final { scan-rtl-dump-not {symbol_ref.*"objc_msgSend"} "jump" { target powerpc*-*-darwin* } } } */ /* { dg-final { scan-assembler-not {(bl|call)[ \t]+_objc_msgSend\n} } } */ /* { dg-final { scan-assembler {(bl|call)[ \t]+L_objc_msgSend\$stub\n} } } */ diff --git a/gcc/testsuite/objc.dg/stubify-2.m b/gcc/testsuite/objc.dg/stubify-2.m index 2930e46fc..904ac44b2 100644 --- a/gcc/testsuite/objc.dg/stubify-2.m +++ b/gcc/testsuite/objc.dg/stubify-2.m @@ -4,7 +4,7 @@ /* { dg-do compile { target *-*-darwin* } } */ /* { dg-skip-if "" { *-*-* } { "-fgnu-runtime" } { "" } } */ /* { dg-require-effective-target ilp32 } */ -/* { dg-options "-mdynamic-no-pic -mmacosx-version-min=10.4 -msymbol-stubs" } */ +/* { dg-options "-mdynamic-no-pic -fdump-rtl-jump -mmacosx-version-min=10.4 -msymbol-stubs" } */ typedef struct objc_object { } *id ; int x = 41 ; @@ -30,6 +30,7 @@ extern int bogonic (int, int, int) ; /* Any symbol_ref of an un-stubified objc_msgSend is an error; look for "objc_msgSend" in quotes, without the $stub suffix. 
*/ +/* { dg-final { scan-rtl-dump-not {symbol_ref.*"objc_msgSend"} "jump" { target powerpc*-*-darwin* } } } */ /* { dg-final { scan-assembler-not {(bl|call)[ \t]+_objc_msgSend\n} } } */ /* { dg-final { scan-assembler {(bl|call)[ \t]+L_objc_msgSend\$stub\n} } } */ diff --git a/gcc/trans-mem.c b/gcc/trans-mem.c index 0581aae2d..8fc9f44d8 100644 --- a/gcc/trans-mem.c +++ b/gcc/trans-mem.c @@ -3237,8 +3237,7 @@ expand_block_edges (struct tm_region *const region, basic_block bb) || (gimple_call_flags (call_stmt) & ECF_TM_BUILTIN) == 0) continue; - if (DECL_FUNCTION_CODE (gimple_call_fndecl (call_stmt)) - == BUILT_IN_TM_ABORT) + if (gimple_call_builtin_p (call_stmt, BUILT_IN_TM_ABORT)) { // If we have a ``_transaction_cancel [[outer]]'', there is only // one abnormal edge: to the transaction marked OUTER. diff --git a/gcc/tree-call-cdce.c b/gcc/tree-call-cdce.c index 2e482b37e..43f1ec6ee 100644 --- a/gcc/tree-call-cdce.c +++ b/gcc/tree-call-cdce.c @@ -1074,9 +1074,7 @@ use_internal_fn (gcall *call) { gimple_stmt_iterator gsi = gsi_for_stmt (call); gcall *new_call = gimple_build_call_internal (IFN_SET_EDOM, 0); - gimple_set_vuse (new_call, gimple_vuse (call)); - gimple_set_vdef (new_call, gimple_vdef (call)); - SSA_NAME_DEF_STMT (gimple_vdef (new_call)) = new_call; + gimple_move_vops (new_call, call); gimple_set_location (new_call, gimple_location (call)); gsi_replace (&gsi, new_call, false); call = new_call; diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c index 621c8ea3d..527deffe4 100644 --- a/gcc/tree-cfg.c +++ b/gcc/tree-cfg.c @@ -9547,7 +9547,8 @@ execute_fixup_cfg (void) Keep access when store has side effect, i.e. in case when source is volatile. */ if (gimple_store_p (stmt) - && !gimple_has_side_effects (stmt)) + && !gimple_has_side_effects (stmt) + && !optimize_debug) { tree lhs = get_base_address (gimple_get_lhs (stmt)); diff --git a/gcc/tree-core.h b/gcc/tree-core.h index 41d052949..26b6f46ad 100644 --- a/gcc/tree-core.h +++ b/gcc/tree-core.h @@ -1791,6 +1791,17 @@ struct GTY(()) tree_decl_non_common { tree result; }; +/* Classify a special function declaration type. */ + +enum function_decl_type +{ + NONE, + OPERATOR_NEW, + LAMBDA_FUNCTION + + /* 0 values left */ +}; + /* FUNCTION_DECL inherits from DECL_NON_COMMON because of the use of the arguments/result/saved_tree fields by front ends. It was either inherit FUNCTION_DECL from non_common, or inherit non_common from FUNCTION_DECL, @@ -1815,34 +1826,32 @@ struct GTY(()) tree_function_decl { /* Index within a virtual table. */ tree vindex; - /* In a FUNCTION_DECL for which DECL_BUILT_IN holds, this is - DECL_FUNCTION_CODE. Otherwise unused. - ??? The bitfield needs to be able to hold all target function - codes as well. */ - ENUM_BITFIELD(built_in_function) function_code : 12; - ENUM_BITFIELD(built_in_class) built_in_class : 2; + /* In a FUNCTION_DECL this is DECL_UNCHECKED_FUNCTION_CODE. */ + unsigned int function_code; + ENUM_BITFIELD(built_in_class) built_in_class : 2; unsigned static_ctor_flag : 1; unsigned static_dtor_flag : 1; - unsigned uninlinable : 1; unsigned possibly_inlined : 1; unsigned novops_flag : 1; unsigned returns_twice_flag : 1; + unsigned malloc_flag : 1; - unsigned operator_new_flag : 1; unsigned declared_inline_flag : 1; unsigned no_inline_warning_flag : 1; - unsigned no_instrument_function_entry_exit : 1; unsigned no_limit_stack : 1; unsigned disregard_inline_limits : 1; unsigned pure_flag : 1; unsigned looping_const_or_pure_flag : 1; + + /* Align the bitfield to boundary of a byte. 
*/ + ENUM_BITFIELD(function_decl_type) decl_type: 2; unsigned has_debug_args_flag : 1; unsigned versioned_function : 1; - unsigned lambda_function: 1; - /* No bits left. */ + + /* 12 bits left for future expansion. */ }; struct GTY(()) tree_translation_unit_decl { diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c index ac81e10a3..38ebe4092 100644 --- a/gcc/tree-if-conv.c +++ b/gcc/tree-if-conv.c @@ -2142,9 +2142,7 @@ predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask) new_stmt = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr, mask, rhs); - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); - SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; + gimple_move_vops (new_stmt, stmt); } gimple_call_set_nothrow (new_stmt, true); return new_stmt; diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c index 1110089fa..784ab48c1 100644 --- a/gcc/tree-inline.c +++ b/gcc/tree-inline.c @@ -4585,7 +4585,7 @@ expand_call_inline (basic_block bb, gimple *stmt, copy_body_data *id, /* PR 20090218-1_0.c. Body can be provided by another module. */ && (reason != CIF_BODY_NOT_AVAILABLE || !flag_generate_lto)) { - error ("inlining failed in call to always_inline %q+F: %s", fn, + error ("inlining failed in call to % %q+F: %s", fn, cgraph_inline_failed_string (reason)); if (gimple_location (stmt) != UNKNOWN_LOCATION) inform (gimple_location (stmt), "called from here"); @@ -4834,7 +4834,7 @@ expand_call_inline (basic_block bb, gimple *stmt, copy_body_data *id, we may get confused if the compiler sees that the inlined new function returns a pointer which was just deleted. See bug 33407. */ - if (DECL_IS_OPERATOR_NEW (fn)) + if (DECL_IS_OPERATOR_NEW_P (fn)) { return_slot = NULL; modify_dest = NULL; diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 8741a9a49..1321a92c4 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -584,8 +584,6 @@ extern rtl_opt_pass *make_pass_value_profile_transformations (gcc::context extern rtl_opt_pass *make_pass_postreload_cse (gcc::context *ctxt); extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt); extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt); -extern rtl_opt_pass *make_pass_branch_target_load_optimize1 (gcc::context - *ctxt); extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context *ctxt); extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt); @@ -595,8 +593,6 @@ extern rtl_opt_pass *make_pass_if_after_reload (gcc::context *ctxt); extern rtl_opt_pass *make_pass_regrename (gcc::context *ctxt); extern rtl_opt_pass *make_pass_cprop_hardreg (gcc::context *ctxt); extern rtl_opt_pass *make_pass_reorder_blocks (gcc::context *ctxt); -extern rtl_opt_pass *make_pass_branch_target_load_optimize2 (gcc::context - *ctxt); extern rtl_opt_pass *make_pass_leaf_regs (gcc::context *ctxt); extern rtl_opt_pass *make_pass_split_before_sched2 (gcc::context *ctxt); extern rtl_opt_pass *make_pass_compare_elim_after_reload (gcc::context *ctxt); diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c index 8e4baf013..c36bf96ef 100644 --- a/gcc/tree-sra.c +++ b/gcc/tree-sra.c @@ -106,6 +106,7 @@ along with GCC; see the file COPYING3. If not see #include "ipa-utils.h" #include "builtins.h" + /* Enumeration of all aggregate reductions we can do. */ enum sra_mode { SRA_MODE_EARLY_IPA, /* early call regularization */ SRA_MODE_EARLY_INTRA, /* early intraprocedural SRA */ @@ -220,8 +221,11 @@ struct access is not propagated in the access tree in any direction. 
*/ unsigned grp_scalar_write : 1; - /* Is this access an artificial one created to scalarize some record - entirely? */ + /* In a root of an access tree, true means that the entire tree should be + totally scalarized - that all scalar leafs should be scalarized and + non-root grp_total_scalarization accesses should be honored. Otherwise, + non-root accesses with grp_total_scalarization should never get scalar + replacements. */ unsigned grp_total_scalarization : 1; /* Other passes of the analysis use this bit to make function @@ -242,6 +246,10 @@ struct access access tree. */ unsigned grp_unscalarized_data : 1; + /* Set if all accesses in the group consist of the same chain of + COMPONENT_REFs and ARRAY_REFs. */ + unsigned grp_same_access_path : 1; + /* Does this access and/or group contain a write access through a BIT_FIELD_REF? */ unsigned grp_partial_lhs : 1; @@ -443,16 +451,18 @@ dump_access (FILE *f, struct access *access, bool grp) "grp_scalar_write = %d, grp_total_scalarization = %d, " "grp_hint = %d, grp_covered = %d, " "grp_unscalarizable_region = %d, grp_unscalarized_data = %d, " - "grp_partial_lhs = %d, grp_to_be_replaced = %d, " - "grp_to_be_debug_replaced = %d, grp_maybe_modified = %d, " + "grp_same_access_path = %d, grp_partial_lhs = %d, " + "grp_to_be_replaced = %d, grp_to_be_debug_replaced = %d, " + "grp_maybe_modified = %d, " "grp_not_necessarilly_dereferenced = %d\n", access->grp_read, access->grp_write, access->grp_assignment_read, access->grp_assignment_write, access->grp_scalar_read, access->grp_scalar_write, access->grp_total_scalarization, access->grp_hint, access->grp_covered, access->grp_unscalarizable_region, access->grp_unscalarized_data, - access->grp_partial_lhs, access->grp_to_be_replaced, - access->grp_to_be_debug_replaced, access->grp_maybe_modified, + access->grp_same_access_path, access->grp_partial_lhs, + access->grp_to_be_replaced, access->grp_to_be_debug_replaced, + access->grp_maybe_modified, access->grp_not_necessarilly_dereferenced); else fprintf (f, ", write = %d, grp_total_scalarization = %d, " @@ -540,6 +550,15 @@ find_access_in_subtree (struct access *access, HOST_WIDE_INT offset, access = child; } + /* Total scalarization does not replace single field structures with their + single field but rather creates an access for them underneath. Look for + it. 
*/ + if (access) + while (access->first_child + && access->first_child->offset == offset + && access->first_child->size == size) + access = access->first_child; + return access; } @@ -971,7 +990,8 @@ create_access (tree expr, gimple *stmt, bool write) static bool scalarizable_type_p (tree type, bool const_decl) { - gcc_assert (!is_gimple_reg_type (type)); + if (is_gimple_reg_type (type)) + return true; if (type_contains_placeholder_p (type)) return false; @@ -986,8 +1006,7 @@ scalarizable_type_p (tree type, bool const_decl) if (DECL_BIT_FIELD (fld)) return false; - if (!is_gimple_reg_type (ft) - && !scalarizable_type_p (ft, const_decl)) + if (!scalarizable_type_p (ft, const_decl)) return false; } @@ -1017,8 +1036,7 @@ scalarizable_type_p (tree type, bool const_decl) return false; tree elem = TREE_TYPE (type); - if (!is_gimple_reg_type (elem) - && !scalarizable_type_p (elem, const_decl)) + if (!scalarizable_type_p (elem, const_decl)) return false; return true; } @@ -1027,114 +1045,6 @@ scalarizable_type_p (tree type, bool const_decl) } } -static void scalarize_elem (tree, HOST_WIDE_INT, HOST_WIDE_INT, bool, tree, tree); - -/* Create total_scalarization accesses for all scalar fields of a member - of type DECL_TYPE conforming to scalarizable_type_p. BASE - must be the top-most VAR_DECL representing the variable; within that, - OFFSET locates the member and REF must be the memory reference expression for - the member. */ - -static void -completely_scalarize (tree base, tree decl_type, HOST_WIDE_INT offset, tree ref) -{ - switch (TREE_CODE (decl_type)) - { - case RECORD_TYPE: - for (tree fld = TYPE_FIELDS (decl_type); fld; fld = DECL_CHAIN (fld)) - if (TREE_CODE (fld) == FIELD_DECL) - { - HOST_WIDE_INT pos = offset + int_bit_position (fld); - tree ft = TREE_TYPE (fld); - tree nref = build3 (COMPONENT_REF, ft, ref, fld, NULL_TREE); - - scalarize_elem (base, pos, tree_to_uhwi (DECL_SIZE (fld)), - TYPE_REVERSE_STORAGE_ORDER (decl_type), - nref, ft); - } - break; - case ARRAY_TYPE: - { - tree elemtype = TREE_TYPE (decl_type); - tree elem_size = TYPE_SIZE (elemtype); - gcc_assert (elem_size && tree_fits_shwi_p (elem_size)); - HOST_WIDE_INT el_size = tree_to_shwi (elem_size); - gcc_assert (el_size > 0); - - tree minidx = TYPE_MIN_VALUE (TYPE_DOMAIN (decl_type)); - gcc_assert (TREE_CODE (minidx) == INTEGER_CST); - tree maxidx = TYPE_MAX_VALUE (TYPE_DOMAIN (decl_type)); - /* Skip (some) zero-length arrays; others have MAXIDX == MINIDX - 1. */ - if (maxidx) - { - gcc_assert (TREE_CODE (maxidx) == INTEGER_CST); - tree domain = TYPE_DOMAIN (decl_type); - /* MINIDX and MAXIDX are inclusive, and must be interpreted in - DOMAIN (e.g. signed int, whereas min/max may be size_int). */ - offset_int idx = wi::to_offset (minidx); - offset_int max = wi::to_offset (maxidx); - if (!TYPE_UNSIGNED (domain)) - { - idx = wi::sext (idx, TYPE_PRECISION (domain)); - max = wi::sext (max, TYPE_PRECISION (domain)); - } - for (int el_off = offset; idx <= max; ++idx) - { - tree nref = build4 (ARRAY_REF, elemtype, - ref, - wide_int_to_tree (domain, idx), - NULL_TREE, NULL_TREE); - scalarize_elem (base, el_off, el_size, - TYPE_REVERSE_STORAGE_ORDER (decl_type), - nref, elemtype); - el_off += el_size; - } - } - } - break; - default: - gcc_unreachable (); - } -} - -/* Create total_scalarization accesses for a member of type TYPE, which must - satisfy either is_gimple_reg_type or scalarizable_type_p. 
BASE must be the - top-most VAR_DECL representing the variable; within that, POS and SIZE locate - the member, REVERSE gives its torage order. and REF must be the reference - expression for it. */ - -static void -scalarize_elem (tree base, HOST_WIDE_INT pos, HOST_WIDE_INT size, bool reverse, - tree ref, tree type) -{ - if (is_gimple_reg_type (type)) - { - struct access *access = create_access_1 (base, pos, size); - access->expr = ref; - access->type = type; - access->grp_total_scalarization = 1; - access->reverse = reverse; - /* Accesses for intraprocedural SRA can have their stmt NULL. */ - } - else - completely_scalarize (base, type, pos, ref); -} - -/* Create a total_scalarization access for VAR as a whole. VAR must be of a - RECORD_TYPE or ARRAY_TYPE conforming to scalarizable_type_p. */ - -static void -create_total_scalarization_access (tree var) -{ - HOST_WIDE_INT size = tree_to_uhwi (DECL_SIZE (var)); - struct access *access; - - access = create_access_1 (var, 0, size); - access->expr = var; - access->type = TREE_TYPE (var); - access->grp_total_scalarization = 1; -} - /* Return true if REF has an VIEW_CONVERT_EXPR somewhere in it. */ static inline bool @@ -1795,6 +1705,30 @@ build_ref_for_offset (location_t loc, tree base, poly_int64 offset, return mem_ref; } +/* Construct and return a memory reference that is equal to a portion of + MODEL->expr but is based on BASE. If this cannot be done, return NULL. */ + +static tree +build_reconstructed_reference (location_t, tree base, struct access *model) +{ + tree expr = model->expr, prev_expr = NULL; + while (!types_compatible_p (TREE_TYPE (expr), TREE_TYPE (base))) + { + if (!handled_component_p (expr)) + return NULL; + prev_expr = expr; + expr = TREE_OPERAND (expr, 0); + } + + if (get_object_alignment (base) < get_object_alignment (expr)) + return NULL; + + TREE_OPERAND (prev_expr, 0) = base; + tree ref = unshare_expr (model->expr); + TREE_OPERAND (prev_expr, 0) = expr; + return ref; +} + /* Construct a memory reference to a part of an aggregate BASE at the given OFFSET and of the same type as MODEL. In case this is a reference to a bit-field, the function will replicate the last component_ref of model's @@ -1822,9 +1756,19 @@ build_ref_for_model (location_t loc, tree base, HOST_WIDE_INT offset, NULL_TREE); } else - return - build_ref_for_offset (loc, base, offset, model->reverse, model->type, - gsi, insert_after); + { + tree res; + if (model->grp_same_access_path + && !TREE_THIS_VOLATILE (base) + && offset <= model->offset + /* build_reconstructed_reference can still fail if we have already + massaged BASE because of another type incompatibility. */ + && (res = build_reconstructed_reference (loc, base, model))) + return res; + else + return build_ref_for_offset (loc, base, offset, model->reverse, + model->type, gsi, insert_after); + } } /* Attempt to build a memory reference that we could but into a gimple @@ -2076,6 +2020,69 @@ find_var_candidates (void) return ret; } +/* Return true if EXP is a reference chain of COMPONENT_REFs and AREAY_REFs + ending either with a DECL or a MEM_REF with zero offset. */ + +static bool +path_comparable_for_same_access (tree expr) +{ + while (handled_component_p (expr)) + { + if (TREE_CODE (expr) == ARRAY_REF) + { + /* SSA name indices can occur here too when the array is of sie one. + But we cannot just re-use array_refs with SSA names elsewhere in + the function, so disallow non-constant indices. 
TODO: Remove this + limitation after teaching build_reconstructed_reference to replace + the index with the index type lower bound. */ + if (TREE_CODE (TREE_OPERAND (expr, 1)) != INTEGER_CST) + return false; + } + expr = TREE_OPERAND (expr, 0); + } + + if (TREE_CODE (expr) == MEM_REF) + { + if (!zerop (TREE_OPERAND (expr, 1))) + return false; + } + else + gcc_assert (DECL_P (expr)); + + return true; +} + +/* Assuming that EXP1 consists of only COMPONENT_REFs and ARRAY_REFs, return + true if the chain of these handled components are exactly the same as EXP2 + and the expression under them is the same DECL or an equivalent MEM_REF. + The reference picked by compare_access_positions must go to EXP1. */ + +static bool +same_access_path_p (tree exp1, tree exp2) +{ + if (TREE_CODE (exp1) != TREE_CODE (exp2)) + { + /* Special case single-field structures loaded sometimes as the field + and sometimes as the structure. If the field is of a scalar type, + compare_access_positions will put it into exp1. + + TODO: The gimple register type condition can be removed if teach + compare_access_positions to put inner types first. */ + if (is_gimple_reg_type (TREE_TYPE (exp1)) + && TREE_CODE (exp1) == COMPONENT_REF + && (TYPE_MAIN_VARIANT (TREE_TYPE (TREE_OPERAND (exp1, 0))) + == TYPE_MAIN_VARIANT (TREE_TYPE (exp2)))) + exp1 = TREE_OPERAND (exp1, 0); + else + return false; + } + + if (!operand_equal_p (exp1, exp2, OEP_ADDRESS_OF)) + return false; + + return true; +} + /* Sort all accesses for the given variable, check for partial overlaps and return NULL if there are any. If there are none, pick a representative for each combination of offset and size and create a linked list out of them. @@ -2112,10 +2119,10 @@ sort_and_splice_var_accesses (tree var) bool grp_assignment_read = access->grp_assignment_read; bool grp_assignment_write = access->grp_assignment_write; bool multiple_scalar_reads = false; - bool total_scalarization = access->grp_total_scalarization; bool grp_partial_lhs = access->grp_partial_lhs; bool first_scalar = is_gimple_reg_type (access->type); bool unscalarizable_region = access->grp_unscalarizable_region; + bool grp_same_access_path = true; bool bf_non_full_precision = (INTEGRAL_TYPE_P (access->type) && TYPE_PRECISION (access->type) != access->size @@ -2134,6 +2141,8 @@ sort_and_splice_var_accesses (tree var) gcc_assert (access->offset >= low && access->offset + access->size <= high); + grp_same_access_path = path_comparable_for_same_access (access->expr); + j = i + 1; while (j < access_count) { @@ -2161,7 +2170,6 @@ sort_and_splice_var_accesses (tree var) grp_assignment_write |= ac2->grp_assignment_write; grp_partial_lhs |= ac2->grp_partial_lhs; unscalarizable_region |= ac2->grp_unscalarizable_region; - total_scalarization |= ac2->grp_total_scalarization; relink_to_new_repr (access, ac2); /* If there are both aggregate-type and scalar-type accesses with @@ -2184,6 +2192,11 @@ sort_and_splice_var_accesses (tree var) } unscalarizable_region = true; } + + if (grp_same_access_path + && !same_access_path_p (access->expr, ac2->expr)) + grp_same_access_path = false; + ac2->group_representative = access; j++; } @@ -2197,11 +2210,10 @@ sort_and_splice_var_accesses (tree var) access->grp_scalar_write = grp_scalar_write; access->grp_assignment_read = grp_assignment_read; access->grp_assignment_write = grp_assignment_write; - access->grp_hint = total_scalarization - || (multiple_scalar_reads && !constant_decl_p (var)); - access->grp_total_scalarization = total_scalarization; + access->grp_hint = 
multiple_scalar_reads && !constant_decl_p (var); access->grp_partial_lhs = grp_partial_lhs; access->grp_unscalarizable_region = unscalarizable_region; + access->grp_same_access_path = grp_same_access_path; *prev_acc_ptr = access; prev_acc_ptr = &access->next_grp; @@ -2395,6 +2407,88 @@ build_access_trees (struct access *access) return true; } +/* Traverse the access forest where ROOT is the first root and verify that + various important invariants hold true. */ + +DEBUG_FUNCTION void +verify_sra_access_forest (struct access *root) +{ + struct access *access = root; + tree first_base = root->base; + gcc_assert (DECL_P (first_base)); + do + { + gcc_assert (access->base == first_base); + if (access->parent) + gcc_assert (access->offset >= access->parent->offset + && access->size <= access->parent->size); + if (access->next_sibling) + gcc_assert (access->next_sibling->offset + >= access->offset + access->size); + + poly_int64 poffset, psize, pmax_size; + bool reverse; + tree base = get_ref_base_and_extent (access->expr, &poffset, &psize, + &pmax_size, &reverse); + HOST_WIDE_INT offset, size, max_size; + if (!poffset.is_constant (&offset) + || !psize.is_constant (&size) + || !pmax_size.is_constant (&max_size)) + gcc_unreachable (); + gcc_assert (base == first_base); + gcc_assert (offset == access->offset); + gcc_assert (access->grp_unscalarizable_region + || size == max_size); + gcc_assert (max_size == access->size); + gcc_assert (reverse == access->reverse); + + if (access->first_child) + { + gcc_assert (access->first_child->parent == access); + access = access->first_child; + } + else if (access->next_sibling) + { + gcc_assert (access->next_sibling->parent == access->parent); + access = access->next_sibling; + } + else + { + while (access->parent && !access->next_sibling) + access = access->parent; + if (access->next_sibling) + access = access->next_sibling; + else + { + gcc_assert (access == root); + root = root->next_grp; + access = root; + } + } + } + while (access); +} + +/* Verify access forests of all candidates with accesses by calling + verify_access_forest on each on them. */ + +DEBUG_FUNCTION void +verify_all_sra_access_forests (void) +{ + bitmap_iterator bi; + unsigned i; + EXECUTE_IF_SET_IN_BITMAP (candidate_bitmap, 0, i, bi) + { + tree var = candidate (i); + struct access *access = get_first_repr_for_decl (var); + if (access) + { + gcc_assert (access->base == var); + verify_sra_access_forest (access); + } + } +} + /* Return true if expr contains some ARRAY_REFs into a variable bounded array. */ @@ -2412,15 +2506,16 @@ expr_with_var_bounded_array_refs_p (tree expr) } /* Analyze the subtree of accesses rooted in ROOT, scheduling replacements when - both seeming beneficial and when ALLOW_REPLACEMENTS allows it. Also set all - sorts of access flags appropriately along the way, notably always set - grp_read and grp_assign_read according to MARK_READ and grp_write when - MARK_WRITE is true. + both seeming beneficial and when ALLOW_REPLACEMENTS allows it. If TOTALLY + is set, we are totally scalarizing the aggregate. Also set all sorts of + access flags appropriately along the way, notably always set grp_read and + grp_assign_read according to MARK_READ and grp_write when MARK_WRITE is + true. 
Creating a replacement for a scalar access is considered beneficial if its - grp_hint is set (this means we are either attempting total scalarization or - there is more than one direct read access) or according to the following - table: + grp_hint ot TOTALLY is set (this means either that there is more than one + direct read access or that we are attempting total scalarization) or + according to the following table: Access written to through a scalar type (once or more times) | @@ -2451,7 +2546,7 @@ expr_with_var_bounded_array_refs_p (tree expr) static bool analyze_access_subtree (struct access *root, struct access *parent, - bool allow_replacements) + bool allow_replacements, bool totally) { struct access *child; HOST_WIDE_INT limit = root->offset + root->size; @@ -2469,8 +2564,8 @@ analyze_access_subtree (struct access *root, struct access *parent, root->grp_write = 1; if (parent->grp_assignment_write) root->grp_assignment_write = 1; - if (parent->grp_total_scalarization) - root->grp_total_scalarization = 1; + if (!parent->grp_same_access_path) + root->grp_same_access_path = 0; } if (root->grp_unscalarizable_region) @@ -2483,10 +2578,10 @@ analyze_access_subtree (struct access *root, struct access *parent, { hole |= covered_to < child->offset; sth_created |= analyze_access_subtree (child, root, - allow_replacements && !scalar); + allow_replacements && !scalar, + totally); root->grp_unscalarized_data |= child->grp_unscalarized_data; - root->grp_total_scalarization &= child->grp_total_scalarization; if (child->grp_covered) covered_to += child->size; else @@ -2494,7 +2589,9 @@ analyze_access_subtree (struct access *root, struct access *parent, } if (allow_replacements && scalar && !root->first_child - && (root->grp_hint + && (totally || !root->grp_total_scalarization) + && (totally + || root->grp_hint || ((root->grp_scalar_read || root->grp_assignment_read) && (root->grp_scalar_write || root->grp_assignment_write)))) { @@ -2536,6 +2633,7 @@ analyze_access_subtree (struct access *root, struct access *parent, { if (allow_replacements && scalar && !root->first_child + && !root->grp_total_scalarization && (root->grp_scalar_write || root->grp_assignment_write) && !bitmap_bit_p (cannot_scalarize_away_bitmap, DECL_UID (root->base))) @@ -2556,7 +2654,7 @@ analyze_access_subtree (struct access *root, struct access *parent, root->grp_total_scalarization = 0; } - if (!hole || root->grp_total_scalarization) + if (!hole || totally) root->grp_covered = 1; else if (root->grp_write || comes_initialized_p (root->base)) root->grp_unscalarized_data = 1; /* not covered and written to */ @@ -2572,7 +2670,8 @@ analyze_access_trees (struct access *access) while (access) { - if (analyze_access_subtree (access, NULL, true)) + if (analyze_access_subtree (access, NULL, true, + access->grp_total_scalarization)) ret = true; access = access->next_grp; } @@ -2638,6 +2737,7 @@ create_artificial_child_access (struct access *parent, struct access *model, access->offset = new_offset; access->size = model->size; access->type = model->type; + access->parent = parent; access->grp_write = set_grp_write; access->grp_read = false; access->reverse = model->reverse; @@ -2721,13 +2821,17 @@ propagate_subaccesses_across_link (struct access *lacc, struct access *racc) lacc->type = racc->type; if (build_user_friendly_ref_for_offset (&t, TREE_TYPE (t), lacc->offset, racc->type)) - lacc->expr = t; + { + lacc->expr = t; + lacc->grp_same_access_path = true; + } else { lacc->expr = build_ref_for_model (EXPR_LOCATION (lacc->base), lacc->base, 
lacc->offset, racc, NULL, false); lacc->grp_no_warning = true; + lacc->grp_same_access_path = false; } } return ret; @@ -2840,6 +2944,369 @@ propagate_all_subaccesses (void) } } +/* Return true if the forest beginning with ROOT does not contain + unscalarizable regions or non-byte aligned accesses. */ + +static bool +can_totally_scalarize_forest_p (struct access *root) +{ + struct access *access = root; + do + { + if (access->grp_unscalarizable_region + || (access->offset % BITS_PER_UNIT) != 0 + || (access->size % BITS_PER_UNIT) != 0 + || (is_gimple_reg_type (access->type) + && access->first_child)) + return false; + + if (access->first_child) + access = access->first_child; + else if (access->next_sibling) + access = access->next_sibling; + else + { + while (access->parent && !access->next_sibling) + access = access->parent; + if (access->next_sibling) + access = access->next_sibling; + else + { + gcc_assert (access == root); + root = root->next_grp; + access = root; + } + } + } + while (access); + return true; +} + +/* Create and return an ACCESS in PARENT spanning from POS with SIZE, TYPE and + reference EXPR for total scalarization purposes and mark it as such. Within + the children of PARENT, link it in between PTR and NEXT_SIBLING. */ + +static struct access * +create_total_scalarization_access (struct access *parent, HOST_WIDE_INT pos, + HOST_WIDE_INT size, tree type, tree expr, + struct access **ptr, + struct access *next_sibling) +{ + struct access *access = access_pool.allocate (); + memset (access, 0, sizeof (struct access)); + access->base = parent->base; + access->offset = pos; + access->size = size; + access->expr = expr; + access->type = type; + access->parent = parent; + access->grp_write = parent->grp_write; + access->grp_total_scalarization = 1; + access->grp_hint = 1; + access->grp_same_access_path = path_comparable_for_same_access (expr); + access->reverse = reverse_storage_order_for_component_p (expr); + + access->next_sibling = next_sibling; + *ptr = access; + return access; +} + +/* Create and return an ACCESS in PARENT spanning from POS with SIZE, TYPE and + reference EXPR for total scalarization purposes and mark it as such, link it + at *PTR and reshape the tree so that those elements at *PTR and their + siblings which fall within the part described by POS and SIZE are moved to + be children of the new access. If a partial overlap is detected, return + NULL. */ + +static struct access * +create_total_access_and_reshape (struct access *parent, HOST_WIDE_INT pos, + HOST_WIDE_INT size, tree type, tree expr, + struct access **ptr) +{ + struct access **p = ptr; + + while (*p && (*p)->offset < pos + size) + { + if ((*p)->offset + (*p)->size > pos + size) + return NULL; + p = &(*p)->next_sibling; + } + + struct access *next_child = *ptr; + struct access *new_acc + = create_total_scalarization_access (parent, pos, size, type, expr, + ptr, *p); + if (p != ptr) + { + new_acc->first_child = next_child; + *p = NULL; + for (struct access *a = next_child; a; a = a->next_sibling) + a->parent = new_acc; + } + return new_acc; +} + +static bool totally_scalarize_subtree (struct access *root); + +/* Return true if INNER is either the same type as OUTER or if it is the type + of a record field in OUTER at offset zero, possibly in nested + sub-records. 
*/ + +static bool +access_and_field_type_match_p (tree outer, tree inner) +{ + if (TYPE_MAIN_VARIANT (outer) == TYPE_MAIN_VARIANT (inner)) + return true; + if (TREE_CODE (outer) != RECORD_TYPE) + return false; + tree fld = TYPE_FIELDS (outer); + while (fld) + { + if (TREE_CODE (fld) == FIELD_DECL) + { + if (!zerop (DECL_FIELD_OFFSET (fld))) + return false; + if (TYPE_MAIN_VARIANT (TREE_TYPE (fld)) == inner) + return true; + if (TREE_CODE (TREE_TYPE (fld)) == RECORD_TYPE) + fld = TYPE_FIELDS (TREE_TYPE (fld)); + else + return false; + } + else + fld = DECL_CHAIN (fld); + } + return false; +} + +/* Return type of total_should_skip_creating_access indicating whether a total + scalarization access for a field/element should be created, whether it + already exists or whether the entire total scalarization has to fail. */ + +enum total_sra_field_state {TOTAL_FLD_CREATE, TOTAL_FLD_DONE, TOTAL_FLD_FAILED}; + +/* Do all the necessary steps in total scalarization when the given aggregate + type has a TYPE at POS with the given SIZE should be put into PARENT and + when we have processed all its siblings with smaller offsets up until and + including LAST_SEEN_SIBLING (which can be NULL). + + If some further siblings are to be skipped, set *LAST_SEEN_SIBLING as + appropriate. Return TOTAL_FLD_CREATE id the caller should carry on with + creating a new access, TOTAL_FLD_DONE if access or accesses capable of + representing the described part of the aggregate for the purposes of total + scalarization already exist or TOTAL_FLD_FAILED if there is a problem which + prevents total scalarization from happening at all. */ + +static enum total_sra_field_state +total_should_skip_creating_access (struct access *parent, + struct access **last_seen_sibling, + tree type, HOST_WIDE_INT pos, + HOST_WIDE_INT size) +{ + struct access *next_child; + if (!*last_seen_sibling) + next_child = parent->first_child; + else + next_child = (*last_seen_sibling)->next_sibling; + + /* First, traverse the chain of siblings until it points to an access with + offset at least equal to POS. Check all skipped accesses whether they + span the POS boundary and if so, return with a failure. */ + while (next_child && next_child->offset < pos) + { + if (next_child->offset + next_child->size > pos) + return TOTAL_FLD_FAILED; + *last_seen_sibling = next_child; + next_child = next_child->next_sibling; + } + + /* Now check whether next_child has exactly the right POS and SIZE and if so, + whether it can represent what we need and can be totally scalarized + itself. */ + if (next_child && next_child->offset == pos + && next_child->size == size) + { + if (!is_gimple_reg_type (next_child->type) + && (!access_and_field_type_match_p (type, next_child->type) + || !totally_scalarize_subtree (next_child))) + return TOTAL_FLD_FAILED; + + *last_seen_sibling = next_child; + return TOTAL_FLD_DONE; + } + + /* If the child we're looking at would partially overlap, we just cannot + totally scalarize. */ + if (next_child + && next_child->offset < pos + size + && next_child->offset + next_child->size > pos + size) + return TOTAL_FLD_FAILED; + + if (is_gimple_reg_type (type)) + { + /* We don't scalarize accesses that are children of other scalar type + accesses, so if we go on and create an access for a register type, + there should not be any pre-existing children. There are rare cases + where the requested type is a vector but we already have register + accesses for all its elements which is equally good. Detect that + situation or whether we need to bail out. 
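   For example (sizes are illustrative only): if the requested TYPE is a
   128-bit vector of four ints at [POS, POS + 128) but the tree already
   contains scalar int children at POS, POS + 32, POS + 64 and POS + 96,
   the loop below walks those children, finds that together they cover
   the region exactly and returns TOTAL_FLD_DONE; a gap between them, or
   a child that is not of a register type, leads to TOTAL_FLD_FAILED
   instead.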
*/ + + HOST_WIDE_INT covered = pos; + bool skipping = false; + while (next_child + && next_child->offset + next_child->size <= pos + size) + { + if (next_child->offset != covered + || !is_gimple_reg_type (next_child->type)) + return TOTAL_FLD_FAILED; + + covered += next_child->size; + *last_seen_sibling = next_child; + next_child = next_child->next_sibling; + skipping = true; + } + + if (skipping) + { + if (covered != pos + size) + return TOTAL_FLD_FAILED; + else + return TOTAL_FLD_DONE; + } + } + + return TOTAL_FLD_CREATE; +} + +/* Go over sub-tree rooted in ROOT and attempt to create scalar accesses + spanning all uncovered areas covered by ROOT, return false if the attempt + failed. All created accesses will have grp_unscalarizable_region set (and + should be ignored if the function returns false). */ + +static bool +totally_scalarize_subtree (struct access *root) +{ + gcc_checking_assert (!root->grp_unscalarizable_region); + gcc_checking_assert (!is_gimple_reg_type (root->type)); + + struct access *last_seen_sibling = NULL; + + switch (TREE_CODE (root->type)) + { + case RECORD_TYPE: + for (tree fld = TYPE_FIELDS (root->type); fld; fld = DECL_CHAIN (fld)) + if (TREE_CODE (fld) == FIELD_DECL) + { + tree ft = TREE_TYPE (fld); + HOST_WIDE_INT fsize = tree_to_uhwi (DECL_SIZE (fld)); + if (!fsize) + continue; + + HOST_WIDE_INT pos = root->offset + int_bit_position (fld); + enum total_sra_field_state + state = total_should_skip_creating_access (root, + &last_seen_sibling, + ft, pos, fsize); + switch (state) + { + case TOTAL_FLD_FAILED: + return false; + case TOTAL_FLD_DONE: + continue; + case TOTAL_FLD_CREATE: + break; + default: + gcc_unreachable (); + } + + struct access **p = (last_seen_sibling + ? &last_seen_sibling->next_sibling + : &root->first_child); + tree nref = build3 (COMPONENT_REF, ft, root->expr, fld, NULL_TREE); + struct access *new_child + = create_total_access_and_reshape (root, pos, fsize, ft, nref, p); + if (!new_child) + return false; + + if (!is_gimple_reg_type (ft) + && !totally_scalarize_subtree (new_child)) + return false; + last_seen_sibling = new_child; + } + break; + case ARRAY_TYPE: + { + tree elemtype = TREE_TYPE (root->type); + tree elem_size = TYPE_SIZE (elemtype); + gcc_assert (elem_size && tree_fits_shwi_p (elem_size)); + HOST_WIDE_INT el_size = tree_to_shwi (elem_size); + gcc_assert (el_size > 0); + + tree minidx = TYPE_MIN_VALUE (TYPE_DOMAIN (root->type)); + gcc_assert (TREE_CODE (minidx) == INTEGER_CST); + tree maxidx = TYPE_MAX_VALUE (TYPE_DOMAIN (root->type)); + /* Skip (some) zero-length arrays; others have MAXIDX == MINIDX - 1. */ + if (!maxidx) + goto out; + gcc_assert (TREE_CODE (maxidx) == INTEGER_CST); + tree domain = TYPE_DOMAIN (root->type); + /* MINIDX and MAXIDX are inclusive, and must be interpreted in + DOMAIN (e.g. signed int, whereas min/max may be size_int). */ + offset_int idx = wi::to_offset (minidx); + offset_int max = wi::to_offset (maxidx); + if (!TYPE_UNSIGNED (domain)) + { + idx = wi::sext (idx, TYPE_PRECISION (domain)); + max = wi::sext (max, TYPE_PRECISION (domain)); + } + for (HOST_WIDE_INT pos = root->offset; + idx <= max; + pos += el_size, ++idx) + { + enum total_sra_field_state + state = total_should_skip_creating_access (root, + &last_seen_sibling, + elemtype, pos, + el_size); + switch (state) + { + case TOTAL_FLD_FAILED: + return false; + case TOTAL_FLD_DONE: + continue; + case TOTAL_FLD_CREATE: + break; + default: + gcc_unreachable (); + } + + struct access **p = (last_seen_sibling + ? 
&last_seen_sibling->next_sibling + : &root->first_child); + tree nref = build4 (ARRAY_REF, elemtype, root->expr, + wide_int_to_tree (domain, idx), + NULL_TREE, NULL_TREE); + struct access *new_child + = create_total_access_and_reshape (root, pos, el_size, elemtype, + nref, p); + if (!new_child) + return false; + + if (!is_gimple_reg_type (elemtype) + && !totally_scalarize_subtree (new_child)) + return false; + last_seen_sibling = new_child; + } + } + break; + default: + gcc_unreachable (); + } + + out: + return true; +} + /* Go through all accesses collected throughout the (intraprocedural) analysis stage, exclude overlapping ones, identify representatives and build trees out of them, making decisions about scalarization on the way. Return true @@ -2852,8 +3319,22 @@ analyze_all_variable_accesses (void) bitmap tmp = BITMAP_ALLOC (NULL); bitmap_iterator bi; unsigned i; - bool optimize_speed_p = !optimize_function_for_size_p (cfun); + bitmap_copy (tmp, candidate_bitmap); + EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi) + { + tree var = candidate (i); + struct access *access; + + access = sort_and_splice_var_accesses (var); + if (!access || !build_access_trees (access)) + disqualify_candidate (var, + "No or inhibitingly overlapping accesses."); + } + + propagate_all_subaccesses (); + + bool optimize_speed_p = !optimize_function_for_size_p (cfun); enum compiler_param param = optimize_speed_p ? PARAM_SRA_MAX_SCALARIZATION_SIZE_SPEED : PARAM_SRA_MAX_SCALARIZATION_SIZE_SIZE; @@ -2872,46 +3353,59 @@ analyze_all_variable_accesses (void) && !bitmap_bit_p (cannot_scalarize_away_bitmap, i)) { tree var = candidate (i); + if (!VAR_P (var)) + continue; - if (VAR_P (var) && scalarizable_type_p (TREE_TYPE (var), - constant_decl_p (var))) + if (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (var))) > max_scalarization_size) { - if (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (var))) - <= max_scalarization_size) - { - create_total_scalarization_access (var); - completely_scalarize (var, TREE_TYPE (var), 0, var); - statistics_counter_event (cfun, - "Totally-scalarized aggregates", 1); - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, "Will attempt to totally scalarize "); - print_generic_expr (dump_file, var); - fprintf (dump_file, " (UID: %u): \n", DECL_UID (var)); - } - } - else if (dump_file && (dump_flags & TDF_DETAILS)) + if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "Too big to totally scalarize: "); print_generic_expr (dump_file, var); fprintf (dump_file, " (UID: %u)\n", DECL_UID (var)); } + continue; } - } - bitmap_copy (tmp, candidate_bitmap); - EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi) - { - tree var = candidate (i); - struct access *access; + bool all_types_ok = true; + for (struct access *access = get_first_repr_for_decl (var); + access; + access = access->next_grp) + if (!can_totally_scalarize_forest_p (access) + || !scalarizable_type_p (access->type, constant_decl_p (var))) + { + all_types_ok = false; + break; + } + if (!all_types_ok) + continue; - access = sort_and_splice_var_accesses (var); - if (!access || !build_access_trees (access)) - disqualify_candidate (var, - "No or inhibitingly overlapping accesses."); - } + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Will attempt to totally scalarize "); + print_generic_expr (dump_file, var); + fprintf (dump_file, " (UID: %u): \n", DECL_UID (var)); + } + bool scalarized = true; + for (struct access *access = get_first_repr_for_decl (var); + access; + access = access->next_grp) + if (!is_gimple_reg_type 
(access->type) + && !totally_scalarize_subtree (access)) + { + scalarized = false; + break; + } - propagate_all_subaccesses (); + if (scalarized) + for (struct access *access = get_first_repr_for_decl (var); + access; + access = access->next_grp) + access->grp_total_scalarization = true; + } + + if (flag_checking) + verify_all_sra_access_forests (); bitmap_copy (tmp, candidate_bitmap); EXECUTE_IF_SET_IN_BITMAP (tmp, 0, i, bi) @@ -3775,25 +4269,39 @@ initialize_constant_pool_replacements (void) tree var = candidate (i); if (!constant_decl_p (var)) continue; - vec *access_vec = get_base_access_vector (var); - if (!access_vec) - continue; - for (unsigned i = 0; i < access_vec->length (); i++) + + struct access *access = get_first_repr_for_decl (var); + + while (access) { - struct access *access = (*access_vec)[i]; - if (!access->replacement_decl) - continue; - gassign *stmt - = gimple_build_assign (get_access_replacement (access), - unshare_expr (access->expr)); - if (dump_file && (dump_flags & TDF_DETAILS)) + if (access->replacement_decl) { - fprintf (dump_file, "Generating constant initializer: "); - print_gimple_stmt (dump_file, stmt, 0); - fprintf (dump_file, "\n"); + gassign *stmt + = gimple_build_assign (get_access_replacement (access), + unshare_expr (access->expr)); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Generating constant initializer: "); + print_gimple_stmt (dump_file, stmt, 0); + fprintf (dump_file, "\n"); + } + gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); + update_stmt (stmt); + } + + if (access->first_child) + access = access->first_child; + else if (access->next_sibling) + access = access->next_sibling; + else + { + while (access->parent && !access->next_sibling) + access = access->parent; + if (access->next_sibling) + access = access->next_sibling; + else + access = access->next_grp; } - gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); - update_stmt (stmt); } } diff --git a/gcc/tree-ssa-address.c b/gcc/tree-ssa-address.c index 2e5d87734..3195a21c7 100644 --- a/gcc/tree-ssa-address.c +++ b/gcc/tree-ssa-address.c @@ -1141,6 +1141,35 @@ maybe_fold_tmr (tree ref) return new_ref; } +/* Return the preferred index scale factor for accessing memory of mode + MEM_MODE in the address space of pointer BASE. Assume that we're + optimizing for speed if SPEED is true and for size otherwise. */ +unsigned int +preferred_mem_scale_factor (tree base, machine_mode mem_mode, + bool speed) +{ + struct mem_address parts = {}; + addr_space_t as = TYPE_ADDR_SPACE (TREE_TYPE (base)); + unsigned int fact = GET_MODE_UNIT_SIZE (mem_mode); + + /* Addressing mode "base + index". */ + parts.index = integer_one_node; + parts.base = integer_one_node; + rtx addr = addr_for_mem_ref (&parts, as, false); + unsigned cost = address_cost (addr, mem_mode, as, speed); + + /* Addressing mode "base + index << scale". */ + parts.step = wide_int_to_tree (sizetype, fact); + addr = addr_for_mem_ref (&parts, as, false); + unsigned new_cost = address_cost (addr, mem_mode, as, speed); + + /* Compare the cost of an address with an unscaled index with + a scaled index and return factor if useful. */ + if (new_cost < cost) + return GET_MODE_UNIT_SIZE (mem_mode); + return 1; +} + /* Dump PARTS to FILE. 
*/ extern void dump_mem_address (FILE *, struct mem_address *); diff --git a/gcc/tree-ssa-address.h b/gcc/tree-ssa-address.h index 6fa4eae89..9812f36fb 100644 --- a/gcc/tree-ssa-address.h +++ b/gcc/tree-ssa-address.h @@ -39,4 +39,7 @@ tree create_mem_ref (gimple_stmt_iterator *, tree, extern void copy_ref_info (tree, tree); tree maybe_fold_tmr (tree); +extern unsigned int preferred_mem_scale_factor (tree base, + machine_mode mem_mode, + bool speed); #endif /* GCC_TREE_SSA_ADDRESS_H */ diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c index 8db6a34e0..dbe2fda96 100644 --- a/gcc/tree-ssa-ccp.c +++ b/gcc/tree-ssa-ccp.c @@ -614,9 +614,17 @@ get_value_for_expr (tree expr, bool for_bits_p) val.mask = -1; } if (for_bits_p - && val.lattice_val == CONSTANT - && TREE_CODE (val.value) == ADDR_EXPR) - val = get_value_from_alignment (val.value); + && val.lattice_val == CONSTANT) + { + if (TREE_CODE (val.value) == ADDR_EXPR) + val = get_value_from_alignment (val.value); + else if (TREE_CODE (val.value) != INTEGER_CST) + { + val.lattice_val = VARYING; + val.value = NULL_TREE; + val.mask = -1; + } + } /* Fall back to a copy value. */ if (!for_bits_p && val.lattice_val == VARYING @@ -2566,7 +2574,7 @@ optimize_stack_restore (gimple_stmt_iterator i) || ALLOCA_FUNCTION_CODE_P (DECL_FUNCTION_CODE (callee))) return NULL_TREE; - if (DECL_FUNCTION_CODE (callee) == BUILT_IN_STACK_RESTORE) + if (fndecl_built_in_p (callee, BUILT_IN_STACK_RESTORE)) goto second_stack_restore; } @@ -2625,9 +2633,6 @@ optimize_stdarg_builtin (gimple *call) bool va_list_simple_ptr; location_t loc = gimple_location (call); - if (gimple_code (call) != GIMPLE_CALL) - return NULL_TREE; - callee = gimple_call_fndecl (call); cfun_va_list = targetm.fn_abi_va_list (callee); @@ -2930,12 +2935,10 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip, bit, flag); gimple_call_set_lhs (g, new_lhs); gimple_set_location (g, gimple_location (call)); - gimple_set_vuse (g, gimple_vuse (call)); - gimple_set_vdef (g, gimple_vdef (call)); + gimple_move_vops (g, call); bool throws = stmt_can_throw_internal (cfun, call); gimple_call_set_nothrow (as_a (g), gimple_call_nothrow_p (as_a (call))); - SSA_NAME_DEF_STMT (gimple_vdef (call)) = g; gimple_stmt_iterator gsi = *gsip; gsi_insert_after (&gsi, g, GSI_NEW_STMT); edge e = NULL; diff --git a/gcc/tree-ssa-dce.c b/gcc/tree-ssa-dce.c index a38899edd..be9f501c9 100644 --- a/gcc/tree-ssa-dce.c +++ b/gcc/tree-ssa-dce.c @@ -115,6 +115,14 @@ static bool cfg_altered; static int *bb_postorder; +/* True if we should treat any stmt with a vdef as necessary. */ + +static inline bool +keep_all_vdefs_p () +{ + return optimize_debug; +} + /* If STMT is not already marked necessary, mark it, and add it to the worklist if ADD_TO_WORKLIST is true. */ @@ -311,6 +319,12 @@ mark_stmt_if_obviously_necessary (gimple *stmt, bool aggressive) return; } + if (gimple_vdef (stmt) && keep_all_vdefs_p ()) + { + mark_stmt_necessary (stmt, true); + return; + } + return; } @@ -526,6 +540,9 @@ mark_aliased_reaching_defs_necessary_1 (ao_ref *ref, tree vdef, void *data) static void mark_aliased_reaching_defs_necessary (gimple *stmt, tree ref) { + /* Should have been caught before calling this function. */ + gcc_checking_assert (!keep_all_vdefs_p ()); + unsigned int chain; ao_ref refd; gcc_assert (!chain_ovfl); @@ -599,6 +616,8 @@ mark_all_reaching_defs_necessary_1 (ao_ref *ref ATTRIBUTE_UNUSED, static void mark_all_reaching_defs_necessary (gimple *stmt) { + /* Should have been caught before calling this function. 
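   (With -Og, keep_all_vdefs_p () is true; every statement with a VDEF
   is then marked necessary up front in mark_stmt_if_obviously_necessary
   and propagate_necessity never requests this walk, hence the assert
   below.)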
*/ + gcc_checking_assert (!keep_all_vdefs_p ()); walk_aliased_vdefs (NULL, gimple_vuse (stmt), mark_all_reaching_defs_necessary_1, NULL, &visited); } @@ -798,6 +817,10 @@ propagate_necessity (bool aggressive) if (!use) continue; + /* No need to search for vdefs if we intrinsicly keep them all. */ + if (keep_all_vdefs_p ()) + continue; + /* If we dropped to simple mode make all immediately reachable definitions necessary. */ if (chain_ovfl) diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index e753689a7..0d716062b 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -2011,16 +2011,12 @@ get_bit_field_ref_def (tree val, enum tree_code &conv_code) return NULL_TREE; enum tree_code code = gimple_assign_rhs_code (def_stmt); if (code == FLOAT_EXPR - || code == FIX_TRUNC_EXPR) + || code == FIX_TRUNC_EXPR + || CONVERT_EXPR_CODE_P (code)) { tree op1 = gimple_assign_rhs1 (def_stmt); if (conv_code == ERROR_MARK) - { - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))), - GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) - return NULL_TREE; - conv_code = code; - } + conv_code = code; else if (conv_code != code) return NULL_TREE; if (TREE_CODE (op1) != SSA_NAME) @@ -2041,109 +2037,213 @@ static bool simplify_vector_constructor (gimple_stmt_iterator *gsi) { gimple *stmt = gsi_stmt (*gsi); - tree op, op2, orig[2], type, elem_type; + tree op, orig[2], type, elem_type; unsigned elem_size, i; unsigned HOST_WIDE_INT nelts; + unsigned HOST_WIDE_INT refnelts; enum tree_code conv_code; constructor_elt *elt; bool maybe_ident; - gcc_checking_assert (gimple_assign_rhs_code (stmt) == CONSTRUCTOR); - op = gimple_assign_rhs1 (stmt); type = TREE_TYPE (op); - gcc_checking_assert (TREE_CODE (type) == VECTOR_TYPE); + gcc_checking_assert (TREE_CODE (op) == CONSTRUCTOR + && TREE_CODE (type) == VECTOR_TYPE); if (!TYPE_VECTOR_SUBPARTS (type).is_constant (&nelts)) return false; elem_type = TREE_TYPE (type); elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); - vec_perm_builder sel (nelts, nelts, 1); orig[0] = NULL; orig[1] = NULL; conv_code = ERROR_MARK; maybe_ident = true; tree one_constant = NULL_TREE; + tree one_nonconstant = NULL_TREE; auto_vec constants; constants.safe_grow_cleared (nelts); + auto_vec, 64> elts; FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) { tree ref, op1; + unsigned int elem; if (i >= nelts) return false; + /* Look for elements extracted and possibly converted from + another vector. */ op1 = get_bit_field_ref_def (elt->value, conv_code); - if (op1) + if (op1 + && TREE_CODE ((ref = TREE_OPERAND (op1, 0))) == SSA_NAME + && VECTOR_TYPE_P (TREE_TYPE (ref)) + && useless_type_conversion_p (TREE_TYPE (op1), + TREE_TYPE (TREE_TYPE (ref))) + && constant_multiple_p (bit_field_offset (op1), + bit_field_size (op1), &elem) + && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts)) { - ref = TREE_OPERAND (op1, 0); unsigned int j; for (j = 0; j < 2; ++j) { if (!orig[j]) { - if (TREE_CODE (ref) != SSA_NAME) - return false; - if (! VECTOR_TYPE_P (TREE_TYPE (ref)) - || ! 
useless_type_conversion_p (TREE_TYPE (op1), - TREE_TYPE (TREE_TYPE (ref)))) - return false; - if (j && !useless_type_conversion_p (TREE_TYPE (orig[0]), - TREE_TYPE (ref))) - return false; - orig[j] = ref; - break; + if (j == 0 + || useless_type_conversion_p (TREE_TYPE (orig[0]), + TREE_TYPE (ref))) + break; } else if (ref == orig[j]) break; } - if (j == 2) - return false; - - unsigned int elt; - if (maybe_ne (bit_field_size (op1), elem_size) - || !constant_multiple_p (bit_field_offset (op1), elem_size, &elt)) - return false; - if (j) - elt += nelts; - if (elt != i) - maybe_ident = false; - sel.quick_push (elt); + /* Found a suitable vector element. */ + if (j < 2) + { + orig[j] = ref; + if (elem != i || j != 0) + maybe_ident = false; + elts.safe_push (std::make_pair (j, elem)); + continue; + } + /* Else fallthru. */ } - else if (CONSTANT_CLASS_P (elt->value)) + /* Handle elements not extracted from a vector. + 1. constants by permuting with constant vector + 2. a unique non-constant element by permuting with a splat vector */ + if (orig[1] + && orig[1] != error_mark_node) + return false; + orig[1] = error_mark_node; + if (CONSTANT_CLASS_P (elt->value)) { - if (orig[1] - && orig[1] != error_mark_node) + if (one_nonconstant) return false; - orig[1] = error_mark_node; if (!one_constant) one_constant = elt->value; constants[i] = elt->value; - sel.quick_push (i + nelts); - maybe_ident = false; } else - return false; + { + if (one_constant) + return false; + if (!one_nonconstant) + one_nonconstant = elt->value; + else if (!operand_equal_p (one_nonconstant, elt->value, 0)) + return false; + } + elts.safe_push (std::make_pair (1, i)); + maybe_ident = false; } if (i < nelts) return false; - if (! VECTOR_TYPE_P (TREE_TYPE (orig[0])) - || maybe_ne (TYPE_VECTOR_SUBPARTS (type), - TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0])))) + if (! orig[0] + || ! VECTOR_TYPE_P (TREE_TYPE (orig[0]))) return false; - - tree tem; - if (conv_code != ERROR_MARK - && (! supportable_convert_operation (conv_code, type, - TREE_TYPE (orig[0]), - &tem, &conv_code) - || conv_code == CALL_EXPR)) + refnelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0])).to_constant (); + /* We currently do not handle larger destination vectors. */ + if (refnelts < nelts) return false; if (maybe_ident) { + tree conv_src_type + = (nelts != refnelts + ? (conv_code != ERROR_MARK + ? build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), nelts) + : type) + : TREE_TYPE (orig[0])); + if (conv_code != ERROR_MARK + && !supportable_convert_operation (conv_code, type, conv_src_type, + &conv_code)) + { + /* Only few targets implement direct conversion patterns so try + some simple special cases via VEC_[UN]PACK[_FLOAT]_LO_EXPR. */ + optab optab; + tree halfvectype, dblvectype; + if (CONVERT_EXPR_CODE_P (conv_code) + && (2 * TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0]))) + == TYPE_PRECISION (TREE_TYPE (type))) + && mode_for_vector (as_a + (TYPE_MODE (TREE_TYPE (TREE_TYPE (orig[0])))), + nelts * 2).exists () + && (dblvectype + = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), + nelts * 2)) + && (optab = optab_for_tree_code (FLOAT_TYPE_P (TREE_TYPE (type)) + ? VEC_UNPACK_FLOAT_LO_EXPR + : VEC_UNPACK_LO_EXPR, + dblvectype, + optab_default)) + && (optab_handler (optab, TYPE_MODE (dblvectype)) + != CODE_FOR_nothing)) + { + gimple_seq stmts = NULL; + tree dbl; + if (refnelts == nelts) + { + /* ??? Paradoxical subregs don't exist, so insert into + the lower half of a wider zero vector. 
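   A sketch of the intended shape (the vector modes are only an
   example): to convert a V4HI source into a V4SI result when only the
   V8HI -> V4SI unpack is available,

     tmp_v8hi = BIT_INSERT_EXPR <{ 0, 0, 0, 0, 0, 0, 0, 0 }, src_v4hi, 0>;
     res_v4si = VEC_UNPACK_LO_EXPR <tmp_v8hi>;

   i.e. the narrow source is placed in the low half of a zero vector of
   the double-width type built above.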
*/ + dbl = gimple_build (&stmts, BIT_INSERT_EXPR, dblvectype, + build_zero_cst (dblvectype), orig[0], + bitsize_zero_node); + } + else if (refnelts == 2 * nelts) + dbl = orig[0]; + else + dbl = gimple_build (&stmts, BIT_FIELD_REF, dblvectype, + orig[0], TYPE_SIZE (dblvectype), + bitsize_zero_node); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops (gsi, + FLOAT_TYPE_P (TREE_TYPE (type)) + ? VEC_UNPACK_FLOAT_LO_EXPR + : VEC_UNPACK_LO_EXPR, + dbl); + } + else if (CONVERT_EXPR_CODE_P (conv_code) + && (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0]))) + == 2 * TYPE_PRECISION (TREE_TYPE (type))) + && mode_for_vector (as_a + (TYPE_MODE + (TREE_TYPE (TREE_TYPE (orig[0])))), + nelts / 2).exists () + && (halfvectype + = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), + nelts / 2)) + && (optab = optab_for_tree_code (VEC_PACK_TRUNC_EXPR, + halfvectype, + optab_default)) + && (optab_handler (optab, TYPE_MODE (halfvectype)) + != CODE_FOR_nothing)) + { + gimple_seq stmts = NULL; + tree low = gimple_build (&stmts, BIT_FIELD_REF, halfvectype, + orig[0], TYPE_SIZE (halfvectype), + bitsize_zero_node); + tree hig = gimple_build (&stmts, BIT_FIELD_REF, halfvectype, + orig[0], TYPE_SIZE (halfvectype), + TYPE_SIZE (halfvectype)); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops (gsi, VEC_PACK_TRUNC_EXPR, + low, hig); + } + else + return false; + update_stmt (gsi_stmt (*gsi)); + return true; + } + if (nelts != refnelts) + { + gassign *lowpart + = gimple_build_assign (make_ssa_name (conv_src_type), + build3 (BIT_FIELD_REF, conv_src_type, + orig[0], TYPE_SIZE (conv_src_type), + bitsize_zero_node)); + gsi_insert_before (gsi, lowpart, GSI_SAME_STMT); + orig[0] = gimple_assign_lhs (lowpart); + } if (conv_code == ERROR_MARK) gimple_assign_set_rhs_from_tree (gsi, orig[0]); else @@ -2152,54 +2252,119 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) } else { - tree mask_type; + tree mask_type, perm_type, conv_src_type; + perm_type = TREE_TYPE (orig[0]); + conv_src_type = (nelts == refnelts + ? perm_type + : build_vector_type (TREE_TYPE (perm_type), nelts)); + if (conv_code != ERROR_MARK + && !supportable_convert_operation (conv_code, type, conv_src_type, + &conv_code)) + return false; - vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts); - if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) + /* Now that we know the number of elements of the source build the + permute vector. + ??? When the second vector has constant values we can shuffle + it and its source indexes to make the permutation supported. + For now it mimics a blend. */ + vec_perm_builder sel (refnelts, refnelts, 1); + bool all_same_p = true; + for (i = 0; i < elts.length (); ++i) + { + sel.quick_push (elts[i].second + elts[i].first * refnelts); + all_same_p &= known_eq (sel[i], sel[0]); + } + /* And fill the tail with "something". It's really don't care, + and ideally we'd allow VEC_PERM to have a smaller destination + vector. As a heuristic: + + (a) if what we have so far duplicates a single element, make the + tail do the same + + (b) otherwise preserve a uniform orig[0]. This facilitates + later pattern-matching of VEC_PERM_EXPR to a BIT_INSERT_EXPR. */ + for (; i < refnelts; ++i) + sel.quick_push (all_same_p + ? sel[0] + : (elts[0].second == 0 && elts[0].first == 0 + ? 0 : refnelts) + i); + vec_perm_indices indices (sel, orig[1] ? 
2 : 1, refnelts); + if (!can_vec_perm_const_p (TYPE_MODE (perm_type), indices)) return false; mask_type = build_vector_type (build_nonstandard_integer_type (elem_size, 1), - nelts); + refnelts); if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), - GET_MODE_SIZE (TYPE_MODE (type)))) + GET_MODE_SIZE (TYPE_MODE (perm_type)))) return false; - op2 = vec_perm_indices_to_tree (mask_type, indices); + tree op2 = vec_perm_indices_to_tree (mask_type, indices); + bool converted_orig1 = false; + gimple_seq stmts = NULL; if (!orig[1]) orig[1] = orig[0]; - if (orig[1] == error_mark_node) + else if (orig[1] == error_mark_node + && one_nonconstant) { - tree_vector_builder vec (type, nelts, 1); - for (unsigned i = 0; i < nelts; ++i) - if (constants[i]) + /* ??? We can see if we can safely convert to the original + element type. */ + converted_orig1 = conv_code != ERROR_MARK; + orig[1] = gimple_build_vector_from_val (&stmts, UNKNOWN_LOCATION, + converted_orig1 + ? type : perm_type, + one_nonconstant); + } + else if (orig[1] == error_mark_node) + { + /* ??? See if we can convert the vector to the original type. */ + converted_orig1 = conv_code != ERROR_MARK; + unsigned n = converted_orig1 ? nelts : refnelts; + tree_vector_builder vec (converted_orig1 + ? type : perm_type, n, 1); + for (unsigned i = 0; i < n; ++i) + if (i < nelts && constants[i]) vec.quick_push (constants[i]); else /* ??? Push a don't-care value. */ vec.quick_push (one_constant); orig[1] = vec.build (); } - if (conv_code == ERROR_MARK) - gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0], - orig[1], op2); - else if (TREE_CODE (orig[1]) == VECTOR_CST) + tree blend_op2 = NULL_TREE; + if (converted_orig1) { - gimple *conv - = gimple_build_assign (make_ssa_name (type), conv_code, orig[0]); - orig[0] = gimple_assign_lhs (conv); - gsi_insert_before (gsi, conv, GSI_SAME_STMT); - gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, - orig[0], orig[1], op2); - } - else - { - gimple *perm - = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])), - VEC_PERM_EXPR, orig[0], orig[1], op2); - orig[0] = gimple_assign_lhs (perm); - gsi_insert_before (gsi, perm, GSI_SAME_STMT); - gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0], - NULL_TREE, NULL_TREE); + /* Make sure we can do a blend in the target type. */ + vec_perm_builder sel (nelts, nelts, 1); + for (i = 0; i < elts.length (); ++i) + sel.quick_push (elts[i].first + ? elts[i].second + nelts : i); + vec_perm_indices indices (sel, 2, nelts); + if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) + return false; + mask_type + = build_vector_type (build_nonstandard_integer_type (elem_size, 1), + nelts); + if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT + || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), + GET_MODE_SIZE (TYPE_MODE (type)))) + return false; + blend_op2 = vec_perm_indices_to_tree (mask_type, indices); } + tree orig1_for_perm + = converted_orig1 ? build_zero_cst (perm_type) : orig[1]; + tree res = gimple_build (&stmts, VEC_PERM_EXPR, perm_type, + orig[0], orig1_for_perm, op2); + if (nelts != refnelts) + res = gimple_build (&stmts, BIT_FIELD_REF, + conv_code != ERROR_MARK ? conv_src_type : type, + res, TYPE_SIZE (type), bitsize_zero_node); + if (conv_code != ERROR_MARK) + res = gimple_build (&stmts, conv_code, type, res); + /* Blend in the actual constant. 
*/ + if (converted_orig1) + res = gimple_build (&stmts, VEC_PERM_EXPR, type, + res, orig[1], blend_op2); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops (gsi, SSA_NAME, res); } update_stmt (gsi_stmt (*gsi)); return true; @@ -2449,6 +2614,72 @@ pass_forwprop::execute (function *fun) else gsi_next (&gsi); } + else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE + && TYPE_MODE (TREE_TYPE (lhs)) == BLKmode + && gimple_assign_load_p (stmt) + && !gimple_has_volatile_ops (stmt) + && (TREE_CODE (gimple_assign_rhs1 (stmt)) + != TARGET_MEM_REF) + && !stmt_can_throw_internal (cfun, stmt)) + { + /* Rewrite loads used only in BIT_FIELD_REF extractions to + component-wise loads. */ + use_operand_p use_p; + imm_use_iterator iter; + bool rewrite = true; + FOR_EACH_IMM_USE_FAST (use_p, iter, lhs) + { + gimple *use_stmt = USE_STMT (use_p); + if (is_gimple_debug (use_stmt)) + continue; + if (!is_gimple_assign (use_stmt) + || gimple_assign_rhs_code (use_stmt) != BIT_FIELD_REF) + { + rewrite = false; + break; + } + } + if (rewrite) + { + gimple *use_stmt; + FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs) + { + if (is_gimple_debug (use_stmt)) + { + if (gimple_debug_bind_p (use_stmt)) + { + gimple_debug_bind_reset_value (use_stmt); + update_stmt (use_stmt); + } + continue; + } + + tree bfr = gimple_assign_rhs1 (use_stmt); + tree new_rhs = fold_build3 (BIT_FIELD_REF, + TREE_TYPE (bfr), + unshare_expr (rhs), + TREE_OPERAND (bfr, 1), + TREE_OPERAND (bfr, 2)); + gimple *new_stmt + = gimple_build_assign (gimple_assign_lhs (use_stmt), + new_rhs); + + location_t loc = gimple_location (use_stmt); + gimple_set_location (new_stmt, loc); + gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); + unlink_stmt_vdef (use_stmt); + gsi_remove (&gsi2, true); + + gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT); + } + + release_defs (stmt); + gsi_remove (&gsi, true); + } + else + gsi_next (&gsi); + } + else if (code == COMPLEX_EXPR) { /* Rewrite stores of a single-use complex build expression @@ -2489,6 +2720,66 @@ pass_forwprop::execute (function *fun) else gsi_next (&gsi); } + else if (code == CONSTRUCTOR + && VECTOR_TYPE_P (TREE_TYPE (rhs)) + && TYPE_MODE (TREE_TYPE (rhs)) == BLKmode + && CONSTRUCTOR_NELTS (rhs) > 0 + && (!VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value)) + || (TYPE_MODE (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value)) + != BLKmode))) + { + /* Rewrite stores of a single-use vector constructors + to component-wise stores if the mode isn't supported. 
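   Roughly (illustrative GIMPLE, SSA names invented), for a BLKmode
   V4SI constructor

     x_1 = {a_2, b_3, c_4, d_5};
     MEM[p_6] = x_1;

   the single store is replaced by per-element stores

     BIT_FIELD_REF <MEM[p_6], 32, 0> = a_2;
     BIT_FIELD_REF <MEM[p_6], 32, 32> = b_3;
     BIT_FIELD_REF <MEM[p_6], 32, 64> = c_4;
     BIT_FIELD_REF <MEM[p_6], 32, 96> = d_5;

   with any trailing elements the constructor omits written as zero.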
*/ + use_operand_p use_p; + gimple *use_stmt; + if (single_imm_use (lhs, &use_p, &use_stmt) + && gimple_store_p (use_stmt) + && !gimple_has_volatile_ops (use_stmt) + && !stmt_can_throw_internal (cfun, use_stmt) + && is_gimple_assign (use_stmt) + && (TREE_CODE (gimple_assign_lhs (use_stmt)) + != TARGET_MEM_REF)) + { + tree elt_t = TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value); + unsigned HOST_WIDE_INT elt_w + = tree_to_uhwi (TYPE_SIZE (elt_t)); + unsigned HOST_WIDE_INT n + = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (rhs))); + for (unsigned HOST_WIDE_INT bi = 0; bi < n; bi += elt_w) + { + unsigned HOST_WIDE_INT ci = bi / elt_w; + tree new_rhs; + if (ci < CONSTRUCTOR_NELTS (rhs)) + new_rhs = CONSTRUCTOR_ELT (rhs, ci)->value; + else + new_rhs = build_zero_cst (elt_t); + tree use_lhs = gimple_assign_lhs (use_stmt); + tree new_lhs = build3 (BIT_FIELD_REF, + elt_t, + unshare_expr (use_lhs), + bitsize_int (elt_w), + bitsize_int (bi)); + gimple *new_stmt = gimple_build_assign (new_lhs, new_rhs); + location_t loc = gimple_location (use_stmt); + gimple_set_location (new_stmt, loc); + gimple_set_vuse (new_stmt, gimple_vuse (use_stmt)); + gimple_set_vdef (new_stmt, + make_ssa_name (gimple_vop (cfun))); + SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt; + gimple_set_vuse (use_stmt, gimple_vdef (new_stmt)); + gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); + gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT); + } + gimple_stmt_iterator gsi2 = gsi_for_stmt (use_stmt); + unlink_stmt_vdef (use_stmt); + release_defs (use_stmt); + gsi_remove (&gsi2, true); + release_defs (stmt); + gsi_remove (&gsi, true); + } + else + gsi_next (&gsi); + } else gsi_next (&gsi); } diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c index fec378490..695646764 100644 --- a/gcc/tree-ssa-loop-ivopts.c +++ b/gcc/tree-ssa-loop-ivopts.c @@ -2461,11 +2461,13 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p) switch (gimple_call_internal_fn (call)) { case IFN_MASK_LOAD: + case IFN_MASK_LOAD_LANES: if (op_p == gimple_call_arg_ptr (call, 0)) return TREE_TYPE (gimple_call_lhs (call)); return NULL_TREE; case IFN_MASK_STORE: + case IFN_MASK_STORE_LANES: if (op_p == gimple_call_arg_ptr (call, 0)) return TREE_TYPE (gimple_call_arg (call, 3)); return NULL_TREE; @@ -3510,6 +3512,26 @@ add_iv_candidate_for_use (struct ivopts_data *data, struct iv_use *use) basetype = sizetype; record_common_cand (data, build_int_cst (basetype, 0), iv->step, use); + /* Compare the cost of an address with an unscaled index with the cost of + an address with a scaled index and add candidate if useful. */ + poly_int64 step; + if (use != NULL + && poly_int_tree_p (iv->step, &step) + && address_p (use->type)) + { + poly_int64 new_step; + unsigned int fact = preferred_mem_scale_factor + (use->iv->base, + TYPE_MODE (use->mem_type), + optimize_loop_for_speed_p (data->current_loop)); + + if (fact != 1 + && multiple_p (step, fact, &new_step)) + add_candidate (data, size_int (0), + wide_int_to_tree (sizetype, new_step), + true, NULL); + } + /* Record common candidate with constant offset stripped in base. Like the use itself, we also add candidate directly for it. */ base = strip_offset (iv->base, &offset); @@ -4036,6 +4058,94 @@ get_computation_at (struct loop *loop, gimple *at, return fold_convert (type, aff_combination_to_tree (&aff)); } +/* Like get_computation_at, but try harder, even if the computation + is more expensive. Intended for debug stmts. 
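   The extra case handled here is rewriting the use as
   use = ubase + (var - cbase) / ratio.  A worked example (numbers chosen
   purely for illustration): for a use IV {4, +, 2} and a candidate IV
   {0, +, 8} the ratio is 8 / 2 = 4, and at iteration I the candidate
   value VAR equals 8 * I, so 4 + (VAR - 0) / 4 reproduces the use value
   4 + 2 * I.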
*/ + +static tree +get_debug_computation_at (class loop *loop, gimple *at, + struct iv_use *use, struct iv_cand *cand) +{ + if (tree ret = get_computation_at (loop, at, use, cand)) + return ret; + + tree ubase = use->iv->base, ustep = use->iv->step; + tree cbase = cand->iv->base, cstep = cand->iv->step; + tree var; + tree utype = TREE_TYPE (ubase), ctype = TREE_TYPE (cbase); + widest_int rat; + + /* We must have a precision to express the values of use. */ + if (TYPE_PRECISION (utype) >= TYPE_PRECISION (ctype)) + return NULL_TREE; + + /* Try to handle the case that get_computation_at doesn't, + try to express + use = ubase + (var - cbase) / ratio. */ + if (!constant_multiple_of (cstep, fold_convert (TREE_TYPE (cstep), ustep), + &rat)) + return NULL_TREE; + + bool neg_p = false; + if (wi::neg_p (rat)) + { + if (TYPE_UNSIGNED (ctype)) + return NULL_TREE; + neg_p = true; + rat = wi::neg (rat); + } + + /* If both IVs can wrap around and CAND doesn't have a power of two step, + it is unsafe. Consider uint16_t CAND with step 9, when wrapping around, + the values will be ... 0xfff0, 0xfff9, 2, 11 ... and when use is say + uint8_t with step 3, those values divided by 3 cast to uint8_t will be + ... 0x50, 0x53, 0, 3 ... rather than expected 0x50, 0x53, 0x56, 0x59. */ + if (!use->iv->no_overflow + && !cand->iv->no_overflow + && !integer_pow2p (cstep)) + return NULL_TREE; + + int bits = wi::exact_log2 (rat); + if (bits == -1) + bits = wi::floor_log2 (rat) + 1; + if (!cand->iv->no_overflow + && TYPE_PRECISION (utype) + bits > TYPE_PRECISION (ctype)) + return NULL_TREE; + + var = var_at_stmt (loop, cand, at); + + if (POINTER_TYPE_P (ctype)) + { + ctype = unsigned_type_for (ctype); + cbase = fold_convert (ctype, cbase); + cstep = fold_convert (ctype, cstep); + var = fold_convert (ctype, var); + } + + ubase = unshare_expr (ubase); + cbase = unshare_expr (cbase); + if (stmt_after_increment (loop, cand, at)) + var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var, + unshare_expr (cstep)); + + var = fold_build2 (MINUS_EXPR, TREE_TYPE (var), var, cbase); + var = fold_build2 (EXACT_DIV_EXPR, TREE_TYPE (var), var, + wide_int_to_tree (TREE_TYPE (var), rat)); + if (POINTER_TYPE_P (utype)) + { + var = fold_convert (sizetype, var); + if (neg_p) + var = fold_build1 (NEGATE_EXPR, sizetype, var); + var = fold_build2 (POINTER_PLUS_EXPR, utype, ubase, var); + } + else + { + var = fold_convert (utype, var); + var = fold_build2 (neg_p ? MINUS_EXPR : PLUS_EXPR, utype, + ubase, var); + } + return var; +} + /* Adjust the cost COST for being in loop setup rather than loop body. If we're optimizing for space, the loop setup overhead is constant; if we're optimizing for speed, amortize it over the per-iteration cost. @@ -7122,6 +7232,8 @@ get_alias_ptr_type_for_ptr_address (iv_use *use) { case IFN_MASK_LOAD: case IFN_MASK_STORE: + case IFN_MASK_LOAD_LANES: + case IFN_MASK_STORE_LANES: /* The second argument contains the correct alias type. */ gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0)); return TREE_TYPE (gimple_call_arg (call, 1)); @@ -7339,6 +7451,7 @@ remove_unused_ivs (struct ivopts_data *data, bitmap toremove) struct iv_use dummy_use; struct iv_cand *best_cand = NULL, *cand; unsigned i, best_pref = 0, cand_pref; + tree comp = NULL_TREE; memset (&dummy_use, 0, sizeof (dummy_use)); dummy_use.iv = info->iv; @@ -7359,20 +7472,22 @@ remove_unused_ivs (struct ivopts_data *data, bitmap toremove) ? 
1 : 0; if (best_cand == NULL || best_pref < cand_pref) { - best_cand = cand; - best_pref = cand_pref; + tree this_comp + = get_debug_computation_at (data->current_loop, + SSA_NAME_DEF_STMT (def), + &dummy_use, cand); + if (this_comp) + { + best_cand = cand; + best_pref = cand_pref; + comp = this_comp; + } } } if (!best_cand) continue; - tree comp = get_computation_at (data->current_loop, - SSA_NAME_DEF_STMT (def), - &dummy_use, best_cand); - if (!comp) - continue; - if (count > 1) { tree vexpr = make_node (DEBUG_EXPR_DECL); diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c index 3dfda7a4f..8607a59d4 100644 --- a/gcc/tree-ssa-math-opts.c +++ b/gcc/tree-ssa-math-opts.c @@ -1040,14 +1040,9 @@ pass_cse_reciprocals::execute (function *fun) else stmt2 = gimple_build_call_internal_vec (ifn, args); gimple_call_set_lhs (stmt2, arg1); - if (gimple_vdef (call)) - { - gimple_set_vdef (stmt2, gimple_vdef (call)); - SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2; - } + gimple_move_vops (stmt2, call); gimple_call_set_nothrow (stmt2, gimple_call_nothrow_p (call)); - gimple_set_vuse (stmt2, gimple_vuse (call)); gimple_stmt_iterator gsi2 = gsi_for_stmt (call); gsi_replace (&gsi2, stmt2, true); } @@ -3048,6 +3043,8 @@ last_fma_candidate_feeds_initial_phi (fma_deferring_state *state, /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2 with uses in additions and subtractions to form fused multiply-add operations. Returns true if successful and MUL_STMT should be removed. + If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional + on MUL_COND, otherwise it is unconditional. If STATE indicates that we are deferring FMA transformation, that means that we do not produce FMAs for basic blocks which look like: @@ -3064,7 +3061,7 @@ last_fma_candidate_feeds_initial_phi (fma_deferring_state *state, static bool convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2, - fma_deferring_state *state) + fma_deferring_state *state, tree mul_cond = NULL_TREE) { tree mul_result = gimple_get_lhs (mul_stmt); tree type = TREE_TYPE (mul_result); @@ -3178,6 +3175,9 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2, return false; } + if (mul_cond && cond != mul_cond) + return false; + if (cond) { if (cond == result || else_value == result) @@ -3789,38 +3789,48 @@ math_opts_dom_walker::after_dom_children (basic_block bb) } else if (is_gimple_call (stmt)) { - tree fndecl = gimple_call_fndecl (stmt); - if (fndecl && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL)) + switch (gimple_call_combined_fn (stmt)) { - switch (DECL_FUNCTION_CODE (fndecl)) + CASE_CFN_POW: + if (gimple_call_lhs (stmt) + && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST + && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)), + &dconst2) + && convert_mult_to_fma (stmt, + gimple_call_arg (stmt, 0), + gimple_call_arg (stmt, 0), + &fma_state)) { - case BUILT_IN_POWF: - case BUILT_IN_POW: - case BUILT_IN_POWL: - if (gimple_call_lhs (stmt) - && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST - && real_equal - (&TREE_REAL_CST (gimple_call_arg (stmt, 1)), - &dconst2) - && convert_mult_to_fma (stmt, - gimple_call_arg (stmt, 0), - gimple_call_arg (stmt, 0), - &fma_state)) - { - unlink_stmt_vdef (stmt); - if (gsi_remove (&gsi, true) - && gimple_purge_dead_eh_edges (bb)) - *m_cfg_changed_p = true; - release_defs (stmt); - continue; - } - break; + unlink_stmt_vdef (stmt); + if (gsi_remove (&gsi, true) + && gimple_purge_dead_eh_edges (bb)) + *m_cfg_changed_p = true; + release_defs (stmt); + continue; + } + 
break; - default:; + case CFN_COND_MUL: + if (convert_mult_to_fma (stmt, + gimple_call_arg (stmt, 1), + gimple_call_arg (stmt, 2), + &fma_state, + gimple_call_arg (stmt, 0))) + + { + gsi_remove (&gsi, true); + release_defs (stmt); + continue; } + break; + + case CFN_LAST: + cancel_fma_deferring (&fma_state); + break; + + default: + break; } - else - cancel_fma_deferring (&fma_state); } gsi_next (&gsi); } diff --git a/gcc/tree-ssa-propagate.c b/gcc/tree-ssa-propagate.c index 6b78dc1c0..0862f83e9 100644 --- a/gcc/tree-ssa-propagate.c +++ b/gcc/tree-ssa-propagate.c @@ -625,8 +625,7 @@ finish_update_gimple_call (gimple_stmt_iterator *si_p, gimple *new_stmt, { gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); move_ssa_defining_stmt_for_defs (new_stmt, stmt); - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); + gimple_move_vops (new_stmt, stmt); gimple_set_location (new_stmt, gimple_location (stmt)); if (gimple_block (new_stmt) == NULL_TREE) gimple_set_block (new_stmt, gimple_block (stmt)); @@ -706,8 +705,7 @@ update_call_from_tree (gimple_stmt_iterator *si_p, tree expr) STRIP_USELESS_TYPE_CONVERSION (expr); new_stmt = gimple_build_assign (lhs, expr); move_ssa_defining_stmt_for_defs (new_stmt, stmt); - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); + gimple_move_vops (new_stmt, stmt); } else if (!TREE_SIDE_EFFECTS (expr)) { @@ -732,8 +730,7 @@ update_call_from_tree (gimple_stmt_iterator *si_p, tree expr) else lhs = create_tmp_var (TREE_TYPE (expr)); new_stmt = gimple_build_assign (lhs, expr); - gimple_set_vuse (new_stmt, gimple_vuse (stmt)); - gimple_set_vdef (new_stmt, gimple_vdef (stmt)); + gimple_move_vops (new_stmt, stmt); move_ssa_defining_stmt_for_defs (new_stmt, stmt); } gimple_set_location (new_stmt, gimple_location (stmt)); diff --git a/gcc/tree-ssa-threadedge.c b/gcc/tree-ssa-threadedge.c index 91494d761..096584062 100644 --- a/gcc/tree-ssa-threadedge.c +++ b/gcc/tree-ssa-threadedge.c @@ -331,6 +331,7 @@ record_temporary_equivalences_from_stmts_at_dest (edge e, { tree fndecl = gimple_call_fndecl (stmt); if (fndecl + && fndecl_built_in_p (fndecl, BUILT_IN_NORMAL) && (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_OBJECT_SIZE || DECL_FUNCTION_CODE (fndecl) == BUILT_IN_CONSTANT_P)) continue; diff --git a/gcc/tree-streamer-in.c b/gcc/tree-streamer-in.c index f6d137316..eb3e174fc 100644 --- a/gcc/tree-streamer-in.c +++ b/gcc/tree-streamer-in.c @@ -324,8 +324,7 @@ unpack_ts_decl_with_vis_value_fields (struct bitpack_d *bp, tree expr) static void unpack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) { - DECL_BUILT_IN_CLASS (expr) = bp_unpack_enum (bp, built_in_class, - BUILT_IN_LAST); + built_in_class cl = bp_unpack_enum (bp, built_in_class, BUILT_IN_LAST); DECL_STATIC_CONSTRUCTOR (expr) = (unsigned) bp_unpack_value (bp, 1); DECL_STATIC_DESTRUCTOR (expr) = (unsigned) bp_unpack_value (bp, 1); DECL_UNINLINABLE (expr) = (unsigned) bp_unpack_value (bp, 1); @@ -333,7 +332,7 @@ unpack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) DECL_IS_NOVOPS (expr) = (unsigned) bp_unpack_value (bp, 1); DECL_IS_RETURNS_TWICE (expr) = (unsigned) bp_unpack_value (bp, 1); DECL_IS_MALLOC (expr) = (unsigned) bp_unpack_value (bp, 1); - DECL_IS_OPERATOR_NEW (expr) = (unsigned) bp_unpack_value (bp, 1); + DECL_SET_IS_OPERATOR_NEW (expr, (unsigned) bp_unpack_value (bp, 1)); DECL_DECLARED_INLINE_P (expr) = (unsigned) bp_unpack_value (bp, 1); DECL_STATIC_CHAIN (expr) = (unsigned) 
bp_unpack_value (bp, 1); DECL_NO_INLINE_WARNING_P (expr) = (unsigned) bp_unpack_value (bp, 1); @@ -343,22 +342,22 @@ unpack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) DECL_DISREGARD_INLINE_LIMITS (expr) = (unsigned) bp_unpack_value (bp, 1); DECL_PURE_P (expr) = (unsigned) bp_unpack_value (bp, 1); DECL_LOOPING_CONST_OR_PURE_P (expr) = (unsigned) bp_unpack_value (bp, 1); - if (DECL_BUILT_IN_CLASS (expr) != NOT_BUILT_IN) + unsigned int fcode = 0; + if (cl != NOT_BUILT_IN) { - DECL_FUNCTION_CODE (expr) = (enum built_in_function) bp_unpack_value (bp, - 12); - if (DECL_BUILT_IN_CLASS (expr) == BUILT_IN_NORMAL - && DECL_FUNCTION_CODE (expr) >= END_BUILTINS) + fcode = bp_unpack_value (bp, 32); + if (cl == BUILT_IN_NORMAL && fcode >= END_BUILTINS) fatal_error (input_location, "machine independent builtin code out of range"); - else if (DECL_BUILT_IN_CLASS (expr) == BUILT_IN_MD) + else if (cl == BUILT_IN_MD) { - tree result = targetm.builtin_decl (DECL_FUNCTION_CODE (expr), true); + tree result = targetm.builtin_decl (fcode, true); if (!result || result == error_mark_node) fatal_error (input_location, "target specific builtin not available"); } } + set_decl_built_in_function (expr, cl, fcode); } diff --git a/gcc/tree-streamer-out.c b/gcc/tree-streamer-out.c index 3f619e830..12693f6f4 100644 --- a/gcc/tree-streamer-out.c +++ b/gcc/tree-streamer-out.c @@ -295,7 +295,7 @@ pack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) bp_pack_value (bp, DECL_IS_NOVOPS (expr), 1); bp_pack_value (bp, DECL_IS_RETURNS_TWICE (expr), 1); bp_pack_value (bp, DECL_IS_MALLOC (expr), 1); - bp_pack_value (bp, DECL_IS_OPERATOR_NEW (expr), 1); + bp_pack_value (bp, DECL_IS_OPERATOR_NEW_P (expr), 1); bp_pack_value (bp, DECL_DECLARED_INLINE_P (expr), 1); bp_pack_value (bp, DECL_STATIC_CHAIN (expr), 1); bp_pack_value (bp, DECL_NO_INLINE_WARNING_P (expr), 1); @@ -305,7 +305,7 @@ pack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) bp_pack_value (bp, DECL_PURE_P (expr), 1); bp_pack_value (bp, DECL_LOOPING_CONST_OR_PURE_P (expr), 1); if (DECL_BUILT_IN_CLASS (expr) != NOT_BUILT_IN) - bp_pack_value (bp, DECL_FUNCTION_CODE (expr), 12); + bp_pack_value (bp, DECL_UNCHECKED_FUNCTION_CODE (expr), 32); } diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c index 39bc2a82b..8d97deaf2 100644 --- a/gcc/tree-vect-generic.c +++ b/gcc/tree-vect-generic.c @@ -1671,7 +1671,6 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) gimple *g; tree lhs = gimple_call_lhs (stmt); tree arg = gimple_call_arg (stmt, 0); - tree decl = NULL_TREE; tree ret_type = TREE_TYPE (lhs); tree arg_type = TREE_TYPE (arg); tree new_rhs, compute_type = TREE_TYPE (arg_type); @@ -1698,16 +1697,9 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) if (modifier == NONE && (code == FIX_TRUNC_EXPR || code == FLOAT_EXPR)) { - if (supportable_convert_operation (code, ret_type, arg_type, &decl, - &code1)) + if (supportable_convert_operation (code, ret_type, arg_type, &code1)) { - if (code1 == CALL_EXPR) - { - g = gimple_build_call (decl, 1, arg); - gimple_call_set_lhs (g, lhs); - } - else - g = gimple_build_assign (lhs, code1, arg); + g = gimple_build_assign (lhs, code1, arg); gsi_replace (gsi, g, false); return; } @@ -1726,11 +1718,11 @@ expand_vector_conversion (gimple_stmt_iterator *gsi) tree ret1_type = build_vector_type (TREE_TYPE (ret_type), nelts); tree arg1_type = build_vector_type (TREE_TYPE (arg_type), nelts); if (supportable_convert_operation (code, ret1_type, arg1_type, - &decl, &code1)) + &code1)) { 
new_rhs = expand_vector_piecewise (gsi, do_vec_conversion, ret_type, arg1_type, arg, - decl, code1); + NULL_TREE, code1); g = gimple_build_assign (lhs, new_rhs); gsi_replace (gsi, g, false); return; diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 85be01748..b76728452 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -5581,6 +5581,30 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, return lhs; } +/* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the + type of the vector input. */ + +static internal_fn +get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in) +{ + internal_fn mask_reduc_fn; + + switch (reduc_fn) + { + case IFN_FOLD_LEFT_PLUS: + mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS; + break; + + default: + return IFN_LAST; + } + + if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in, + OPTIMIZE_FOR_SPEED)) + return mask_reduc_fn; + return IFN_LAST; +} + /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the statement that sets the live-out value. REDUC_DEF_STMT is the phi statement. CODE is the operation performed by STMT_INFO and OPS are @@ -5603,6 +5627,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); stmt_vec_info new_stmt_info = NULL; + internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in); int ncopies; if (slp_node) @@ -5673,16 +5698,21 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, def0 = negated; } - if (mask) + if (mask && mask_reduc_fn == IFN_LAST) def0 = merge_with_identity (gsi, mask, vectype_out, def0, vector_identity); /* On the first iteration the input is simply the scalar phi result, and for subsequent iterations it is the output of the preceding operation. */ - if (reduc_fn != IFN_LAST) + if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST)) { - new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); + if (mask && mask_reduc_fn != IFN_LAST) + new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var, + def0, mask); + else + new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, + def0); /* For chained SLP reductions the output of the previous reduction operation serves as the input of the next. 
For the final statement the output cannot be a temporary - we reuse the original @@ -5782,6 +5812,7 @@ use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn, switch (code) { case DOT_PROD_EXPR: + case SAD_EXPR: return true; default: @@ -5811,6 +5842,17 @@ build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask, break; } + case SAD_EXPR: + { + tree vectype = TREE_TYPE (vop[1]); + tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); + gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, + mask, vop[1], vop[0]); + gsi_insert_before (gsi, select, GSI_SAME_STMT); + vop[1] = masked_op1; + break; + } + default: gcc_unreachable (); } diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index 026148cc4..99df38711 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -1302,7 +1302,7 @@ vect_recog_pow_pattern (stmt_vec_info stmt_vinfo, tree *type_out) { if (flag_unsafe_math_optimizations && TREE_CODE (base) == REAL_CST - && !gimple_call_internal_p (last_stmt)) + && gimple_call_builtin_p (last_stmt, BUILT_IN_NORMAL)) { combined_fn log_cfn; built_in_function exp_bfn; @@ -1728,6 +1728,175 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out) return pattern_stmt; } +/* Recognize the following patterns: + + ATYPE a; // narrower than TYPE + BTYPE b; // narrower than TYPE + + 1) Multiply high with scaling + TYPE res = ((TYPE) a * (TYPE) b) >> c; + 2) ... or also with rounding + TYPE res = (((TYPE) a * (TYPE) b) >> d + 1) >> 1; + + where only the bottom half of res is used. */ + +static gimple * +vect_recog_mulhs_pattern (stmt_vec_info last_stmt_info, tree *type_out) +{ + /* Check for a right shift. */ + gassign *last_stmt = dyn_cast (last_stmt_info->stmt); + if (!last_stmt + || gimple_assign_rhs_code (last_stmt) != RSHIFT_EXPR) + return NULL; + vec_info *vinfo = last_stmt_info->vinfo; + + /* Check that the shift result is wider than the users of the + result need (i.e. that narrowing would be a natural choice). */ + tree lhs_type = TREE_TYPE (gimple_assign_lhs (last_stmt)); + unsigned int target_precision + = vect_element_precision (last_stmt_info->min_output_precision); + if (!INTEGRAL_TYPE_P (lhs_type) + || target_precision >= TYPE_PRECISION (lhs_type)) + return NULL; + + /* Look through any change in sign on the outer shift input. */ + vect_unpromoted_value unprom_rshift_input; + tree rshift_input = vect_look_through_possible_promotion + (vinfo, gimple_assign_rhs1 (last_stmt), &unprom_rshift_input); + if (!rshift_input + || TYPE_PRECISION (TREE_TYPE (rshift_input)) + != TYPE_PRECISION (lhs_type)) + return NULL; + + /* Get the definition of the shift input. */ + stmt_vec_info rshift_input_stmt_info + = vect_get_internal_def (vinfo, rshift_input); + if (!rshift_input_stmt_info) + return NULL; + gassign *rshift_input_stmt + = dyn_cast (rshift_input_stmt_info->stmt); + if (!rshift_input_stmt) + return NULL; + + stmt_vec_info mulh_stmt_info; + tree scale_term; + internal_fn ifn; + unsigned int expect_offset; + + /* Check for the presence of the rounding term. */ + if (gimple_assign_rhs_code (rshift_input_stmt) == PLUS_EXPR) + { + /* Check that the outer shift was by 1. */ + if (!integer_onep (gimple_assign_rhs2 (last_stmt))) + return NULL; + + /* Check that the second operand of the PLUS_EXPR is 1. */ + if (!integer_onep (gimple_assign_rhs2 (rshift_input_stmt))) + return NULL; + + /* Look through any change in sign on the addition input. 
*/ + vect_unpromoted_value unprom_plus_input; + tree plus_input = vect_look_through_possible_promotion + (vinfo, gimple_assign_rhs1 (rshift_input_stmt), &unprom_plus_input); + if (!plus_input + || TYPE_PRECISION (TREE_TYPE (plus_input)) + != TYPE_PRECISION (TREE_TYPE (rshift_input))) + return NULL; + + /* Get the definition of the multiply-high-scale part. */ + stmt_vec_info plus_input_stmt_info + = vect_get_internal_def (vinfo, plus_input); + if (!plus_input_stmt_info) + return NULL; + gassign *plus_input_stmt + = dyn_cast (plus_input_stmt_info->stmt); + if (!plus_input_stmt + || gimple_assign_rhs_code (plus_input_stmt) != RSHIFT_EXPR) + return NULL; + + /* Look through any change in sign on the scaling input. */ + vect_unpromoted_value unprom_scale_input; + tree scale_input = vect_look_through_possible_promotion + (vinfo, gimple_assign_rhs1 (plus_input_stmt), &unprom_scale_input); + if (!scale_input + || TYPE_PRECISION (TREE_TYPE (scale_input)) + != TYPE_PRECISION (TREE_TYPE (plus_input))) + return NULL; + + /* Get the definition of the multiply-high part. */ + mulh_stmt_info = vect_get_internal_def (vinfo, scale_input); + if (!mulh_stmt_info) + return NULL; + + /* Get the scaling term. */ + scale_term = gimple_assign_rhs2 (plus_input_stmt); + + expect_offset = target_precision + 2; + ifn = IFN_MULHRS; + } + else + { + mulh_stmt_info = rshift_input_stmt_info; + scale_term = gimple_assign_rhs2 (last_stmt); + + expect_offset = target_precision + 1; + ifn = IFN_MULHS; + } + + /* Check that the scaling factor is correct. */ + if (TREE_CODE (scale_term) != INTEGER_CST + || wi::to_widest (scale_term) + expect_offset + != TYPE_PRECISION (lhs_type)) + return NULL; + + /* Check whether the scaling input term can be seen as two widened + inputs multiplied together. */ + vect_unpromoted_value unprom_mult[2]; + tree new_type; + unsigned int nops + = vect_widened_op_tree (mulh_stmt_info, MULT_EXPR, WIDEN_MULT_EXPR, + false, 2, unprom_mult, &new_type); + if (nops != 2) + return NULL; + + vect_pattern_detected ("vect_recog_mulhs_pattern", last_stmt); + + /* Adjust output precision. */ + if (TYPE_PRECISION (new_type) < target_precision) + new_type = build_nonstandard_integer_type + (target_precision, TYPE_UNSIGNED (new_type)); + + /* Check for target support. */ + tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type); + if (!new_vectype + || !direct_internal_fn_supported_p + (ifn, new_vectype, OPTIMIZE_FOR_SPEED)) + return NULL; + + /* The IR requires a valid vector type for the cast result, even though + it's likely to be discarded. */ + *type_out = get_vectype_for_scalar_type (vinfo, lhs_type); + if (!*type_out) + return NULL; + + /* Generate the IFN_MULHRS call. */ + tree new_var = vect_recog_temp_ssa_var (new_type, NULL); + tree new_ops[2]; + vect_convert_inputs (last_stmt_info, 2, new_ops, new_type, + unprom_mult, new_vectype); + gcall *mulhrs_stmt + = gimple_build_call_internal (ifn, 2, new_ops[0], new_ops[1]); + gimple_call_set_lhs (mulhrs_stmt, new_var); + gimple_set_location (mulhrs_stmt, gimple_location (last_stmt)); + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "created pattern stmt: %G", mulhrs_stmt); + + return vect_convert_output (last_stmt_info, lhs_type, + mulhrs_stmt, new_vectype); +} + /* Recognize the patterns: ATYPE a; // narrower than TYPE @@ -2872,6 +3041,37 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) /* Pattern detected. 
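   When the divisor is a power of two and the target implements
   IFN_DIV_POW2, the code added below emits (a sketch only, SSA names
   invented) for signed x / 8

     d_1 = .DIV_POW2 (x_2, 3);

   and for x % 8 additionally

     t_3 = d_1 << 3;
     r_4 = x_2 - t_3;

   instead of the generic shift-based expansion further down.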
*/ vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt); + *type_out = vectype; + + /* Check if the target supports this internal function. */ + internal_fn ifn = IFN_DIV_POW2; + if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED)) + { + tree shift = build_int_cst (itype, tree_log2 (oprnd1)); + + tree var_div = vect_recog_temp_ssa_var (itype, NULL); + gimple *div_stmt = gimple_build_call_internal (ifn, 2, oprnd0, shift); + gimple_call_set_lhs (div_stmt, var_div); + + if (rhs_code == TRUNC_MOD_EXPR) + { + append_pattern_def_seq (stmt_vinfo, div_stmt); + def_stmt + = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), + LSHIFT_EXPR, var_div, shift); + append_pattern_def_seq (stmt_vinfo, def_stmt); + pattern_stmt + = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL), + MINUS_EXPR, oprnd0, + gimple_assign_lhs (def_stmt)); + } + else + pattern_stmt = div_stmt; + gimple_set_location (pattern_stmt, gimple_location (last_stmt)); + + return pattern_stmt; + } + cond = build2 (LT_EXPR, boolean_type_node, oprnd0, build_int_cst (itype, 0)); if (rhs_code == TRUNC_DIV_EXPR @@ -2948,7 +3148,6 @@ vect_recog_divmod_pattern (stmt_vec_info stmt_vinfo, tree *type_out) signmask); } - *type_out = vectype; return pattern_stmt; } @@ -4875,6 +5074,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = { /* Must come after over_widening, which narrows the shift as much as possible beforehand. */ { vect_recog_average_pattern, "average" }, + { vect_recog_mulhs_pattern, "mult_high" }, { vect_recog_cast_forwprop_pattern, "cast_forwprop" }, { vect_recog_widen_mult_pattern, "widen_mult" }, { vect_recog_dot_prod_pattern, "dot_prod" }, diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 82b868926..68a9f7574 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -4497,7 +4497,6 @@ vectorizable_simd_clone_call (stmt_vec_info stmt_info, static gimple * vect_gen_widened_results_half (enum tree_code code, - tree decl, tree vec_oprnd0, tree vec_oprnd1, int op_type, tree vec_dest, gimple_stmt_iterator *gsi, stmt_vec_info stmt_info) @@ -4506,26 +4505,12 @@ vect_gen_widened_results_half (enum tree_code code, tree new_temp; /* Generate half of the widened result: */ - if (code == CALL_EXPR) - { - /* Target specific support */ - if (op_type == binary_op) - new_stmt = gimple_build_call (decl, 2, vec_oprnd0, vec_oprnd1); - else - new_stmt = gimple_build_call (decl, 1, vec_oprnd0); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_call_set_lhs (new_stmt, new_temp); - } - else - { - /* Generic support */ - gcc_assert (op_type == TREE_CODE_LENGTH (code)); - if (op_type != binary_op) - vec_oprnd1 = NULL; - new_stmt = gimple_build_assign (vec_dest, code, vec_oprnd0, vec_oprnd1); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - } + gcc_assert (op_type == TREE_CODE_LENGTH (code)); + if (op_type != binary_op) + vec_oprnd1 = NULL; + new_stmt = gimple_build_assign (vec_dest, code, vec_oprnd0, vec_oprnd1); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); vect_finish_stmt_generation (stmt_info, new_stmt, gsi); return new_stmt; @@ -4651,8 +4636,7 @@ vect_create_vectorized_promotion_stmts (vec *vec_oprnds0, stmt_vec_info stmt_info, tree vec_dest, gimple_stmt_iterator *gsi, enum tree_code code1, - enum tree_code code2, tree decl1, - tree decl2, int op_type) + enum tree_code code2, int op_type) { int i; tree vop0, vop1, new_tmp1, new_tmp2; @@ -4668,10 +4652,10 @@ 
vect_create_vectorized_promotion_stmts (vec *vec_oprnds0, vop1 = NULL_TREE; /* Generate the two halves of promotion operation. */ - new_stmt1 = vect_gen_widened_results_half (code1, decl1, vop0, vop1, + new_stmt1 = vect_gen_widened_results_half (code1, vop0, vop1, op_type, vec_dest, gsi, stmt_info); - new_stmt2 = vect_gen_widened_results_half (code2, decl2, vop0, vop1, + new_stmt2 = vect_gen_widened_results_half (code2, vop0, vop1, op_type, vec_dest, gsi, stmt_info); if (is_gimple_call (new_stmt1)) @@ -4712,7 +4696,6 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; enum tree_code codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK; - tree decl1 = NULL_TREE, decl2 = NULL_TREE; tree new_temp; enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; int ndts = 2; @@ -4883,8 +4866,7 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, && code != FLOAT_EXPR && !CONVERT_EXPR_CODE_P (code)) return false; - if (supportable_convert_operation (code, vectype_out, vectype_in, - &decl1, &code1)) + if (supportable_convert_operation (code, vectype_out, vectype_in, &code1)) break; /* FALLTHRU */ unsupported: @@ -4924,7 +4906,7 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (GET_MODE_SIZE (rhs_mode) == fltsz) { if (!supportable_convert_operation (code, vectype_out, - cvt_type, &decl1, &codecvt1)) + cvt_type, &codecvt1)) goto unsupported; } else if (!supportable_widening_operation (code, stmt_info, @@ -4975,7 +4957,7 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (cvt_type == NULL_TREE) goto unsupported; if (!supportable_convert_operation (code, cvt_type, vectype_in, - &decl1, &codecvt1)) + &codecvt1)) goto unsupported; if (supportable_narrowing_operation (NOP_EXPR, vectype_out, cvt_type, &code1, &multi_step_cvt, @@ -5084,24 +5066,12 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, { stmt_vec_info new_stmt_info; /* Arguments are ready, create the new vector stmt. 
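   (Editorial note, not part of the upstream patch: with the decl output
   dropped from supportable_convert_operation, code1 can only be a tree
   code here, never CALL_EXPR, so the call-building branch is deleted and
   each copy reduces to a single assignment of the rough form

     vec_dest_N = (vectype_out) vop0;

   where vec_dest_N is a fresh SSA name.  This is a sketch of the shape
   of the generated statement, not literal compiler output.)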
*/ - if (code1 == CALL_EXPR) - { - gcall *new_stmt = gimple_build_call (decl1, 1, vop0); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_call_set_lhs (new_stmt, new_temp); - new_stmt_info - = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); - } - else - { - gcc_assert (TREE_CODE_LENGTH (code1) == unary_op); - gassign *new_stmt - = gimple_build_assign (vec_dest, code1, vop0); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_assign_set_lhs (new_stmt, new_temp); - new_stmt_info - = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); - } + gcc_assert (TREE_CODE_LENGTH (code1) == unary_op); + gassign *new_stmt = gimple_build_assign (vec_dest, code1, vop0); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + new_stmt_info + = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); if (slp_node) SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info); @@ -5193,8 +5163,7 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vect_create_vectorized_promotion_stmts (&vec_oprnds0, &vec_oprnds1, stmt_info, this_dest, gsi, - c1, c2, decl1, decl2, - op_type); + c1, c2, op_type); } FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) @@ -5202,25 +5171,12 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, stmt_vec_info new_stmt_info; if (cvt_type) { - if (codecvt1 == CALL_EXPR) - { - gcall *new_stmt = gimple_build_call (decl1, 1, vop0); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_call_set_lhs (new_stmt, new_temp); - new_stmt_info - = vect_finish_stmt_generation (stmt_info, new_stmt, - gsi); - } - else - { - gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op); - new_temp = make_ssa_name (vec_dest); - gassign *new_stmt - = gimple_build_assign (new_temp, codecvt1, vop0); - new_stmt_info - = vect_finish_stmt_generation (stmt_info, new_stmt, - gsi); - } + gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op); + new_temp = make_ssa_name (vec_dest); + gassign *new_stmt + = gimple_build_assign (new_temp, codecvt1, vop0); + new_stmt_info + = vect_finish_stmt_generation (stmt_info, new_stmt, gsi); } else new_stmt_info = vinfo->lookup_def (vop0); @@ -5263,22 +5219,11 @@ vectorizable_conversion (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (cvt_type) FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) { - if (codecvt1 == CALL_EXPR) - { - gcall *new_stmt = gimple_build_call (decl1, 1, vop0); - new_temp = make_ssa_name (vec_dest, new_stmt); - gimple_call_set_lhs (new_stmt, new_temp); - vect_finish_stmt_generation (stmt_info, new_stmt, gsi); - } - else - { - gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op); - new_temp = make_ssa_name (vec_dest); - gassign *new_stmt - = gimple_build_assign (new_temp, codecvt1, vop0); - vect_finish_stmt_generation (stmt_info, new_stmt, gsi); - } - + gcc_assert (TREE_CODE_LENGTH (codecvt1) == unary_op); + new_temp = make_ssa_name (vec_dest); + gassign *new_stmt + = gimple_build_assign (new_temp, codecvt1, vop0); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); vec_oprnds0[i] = new_temp; } @@ -8774,8 +8719,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, new_stmt = gimple_build_assign (vec_dest, data_ref); new_temp = make_ssa_name (vec_dest, new_stmt); gimple_assign_set_lhs (new_stmt, new_temp); - gimple_set_vdef (new_stmt, gimple_vdef (stmt_info->stmt)); - gimple_set_vuse (new_stmt, gimple_vuse (stmt_info->stmt)); + gimple_move_vops (new_stmt, stmt_info->stmt); vect_finish_stmt_generation (stmt_info, new_stmt, gsi); msq = 
new_temp; diff --git a/gcc/tree-vector-builder.c b/gcc/tree-vector-builder.c index f31dc13b4..d02fb950c 100644 --- a/gcc/tree-vector-builder.c +++ b/gcc/tree-vector-builder.c @@ -24,103 +24,6 @@ along with GCC; see the file COPYING3. If not see #include "fold-const.h" #include "tree-vector-builder.h" -/* Try to start building a new vector of type TYPE that holds the result of - a unary operation on VECTOR_CST T. ALLOW_STEPPED_P is true if the - operation can handle stepped encodings directly, without having to - expand the full sequence. - - Return true if the operation is possible, which it always is when - ALLOW_STEPPED_P is true. Leave the builder unchanged otherwise. */ - -bool -tree_vector_builder::new_unary_operation (tree type, tree t, - bool allow_stepped_p) -{ - poly_uint64 full_nelts = TYPE_VECTOR_SUBPARTS (type); - gcc_assert (known_eq (full_nelts, TYPE_VECTOR_SUBPARTS (TREE_TYPE (t)))); - unsigned int npatterns = VECTOR_CST_NPATTERNS (t); - unsigned int nelts_per_pattern = VECTOR_CST_NELTS_PER_PATTERN (t); - if (!allow_stepped_p && nelts_per_pattern > 2) - { - if (!full_nelts.is_constant ()) - return false; - npatterns = full_nelts.to_constant (); - nelts_per_pattern = 1; - } - new_vector (type, npatterns, nelts_per_pattern); - return true; -} - -/* Try to start building a new vector of type TYPE that holds the result of - a binary operation on VECTOR_CSTs T1 and T2. ALLOW_STEPPED_P is true if - the operation can handle stepped encodings directly, without having to - expand the full sequence. - - Return true if the operation is possible. Leave the builder unchanged - otherwise. */ - -bool -tree_vector_builder::new_binary_operation (tree type, tree t1, tree t2, - bool allow_stepped_p) -{ - poly_uint64 full_nelts = TYPE_VECTOR_SUBPARTS (type); - gcc_assert (known_eq (full_nelts, TYPE_VECTOR_SUBPARTS (TREE_TYPE (t1))) - && known_eq (full_nelts, TYPE_VECTOR_SUBPARTS (TREE_TYPE (t2)))); - /* Conceptually we split the patterns in T1 and T2 until we have - an equal number for both. Each split pattern requires the same - number of elements per pattern as the original. E.g. splitting: - - { 1, 2, 3, ... } - - into two gives: - - { 1, 3, 5, ... } - { 2, 4, 6, ... } - - while splitting: - - { 1, 0, ... } - - into two gives: - - { 1, 0, ... } - { 0, 0, ... }. */ - unsigned int npatterns = least_common_multiple (VECTOR_CST_NPATTERNS (t1), - VECTOR_CST_NPATTERNS (t2)); - unsigned int nelts_per_pattern = MAX (VECTOR_CST_NELTS_PER_PATTERN (t1), - VECTOR_CST_NELTS_PER_PATTERN (t2)); - if (!allow_stepped_p && nelts_per_pattern > 2) - { - if (!full_nelts.is_constant ()) - return false; - npatterns = full_nelts.to_constant (); - nelts_per_pattern = 1; - } - new_vector (type, npatterns, nelts_per_pattern); - return true; -} - -/* Return the number of elements that the caller needs to operate on in - order to handle a binary operation on VECTOR_CSTs T1 and T2. This static - function is used instead of new_binary_operation if the result of the - operation is not a VECTOR_CST. */ - -unsigned int -tree_vector_builder::binary_encoded_nelts (tree t1, tree t2) -{ - poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (t1)); - gcc_assert (known_eq (nelts, TYPE_VECTOR_SUBPARTS (TREE_TYPE (t2)))); - /* See new_binary_operation for details. 
*/ - unsigned int npatterns = least_common_multiple (VECTOR_CST_NPATTERNS (t1), - VECTOR_CST_NPATTERNS (t2)); - unsigned int nelts_per_pattern = MAX (VECTOR_CST_NELTS_PER_PATTERN (t1), - VECTOR_CST_NELTS_PER_PATTERN (t2)); - unsigned HOST_WIDE_INT const_nelts; - if (nelts.is_constant (&const_nelts)) - return MIN (npatterns * nelts_per_pattern, const_nelts); - return npatterns * nelts_per_pattern; -} - /* Return a vector element with the value BASE + FACTOR * STEP. */ tree diff --git a/gcc/tree-vector-builder.h b/gcc/tree-vector-builder.h index 13af74ad8..add79e476 100644 --- a/gcc/tree-vector-builder.h +++ b/gcc/tree-vector-builder.h @@ -24,10 +24,11 @@ along with GCC; see the file COPYING3. If not see /* This class is used to build VECTOR_CSTs from a sequence of elements. See vector_builder for more details. */ -class tree_vector_builder : public vector_builder +class tree_vector_builder : public vector_builder { - typedef vector_builder parent; - friend class vector_builder; + typedef vector_builder parent; + friend class vector_builder; public: tree_vector_builder () : m_type (0) {} @@ -37,10 +38,6 @@ public: tree type () const { return m_type; } void new_vector (tree, unsigned int, unsigned int); - bool new_unary_operation (tree, tree, bool); - bool new_binary_operation (tree, tree, tree, bool); - - static unsigned int binary_encoded_nelts (tree, tree); private: bool equal_p (const_tree, const_tree) const; @@ -51,6 +48,15 @@ private: bool can_elide_p (const_tree) const; void note_representative (tree *, tree); + static poly_uint64 shape_nelts (const_tree t) + { return TYPE_VECTOR_SUBPARTS (t); } + static poly_uint64 nelts_of (const_tree t) + { return VECTOR_CST_NELTS (t); } + static unsigned int npatterns_of (const_tree t) + { return VECTOR_CST_NPATTERNS (t); } + static unsigned int nelts_per_pattern_of (const_tree t) + { return VECTOR_CST_NELTS_PER_PATTERN (t); } + tree m_type; }; diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index c2c6377d3..71ca80937 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -288,10 +288,7 @@ adjust_simduid_builtins (hash_table *htab) : BUILT_IN_GOMP_ORDERED_END); gimple *g = gimple_build_call (builtin_decl_explicit (bcode), 0); - tree vdef = gimple_vdef (stmt); - gimple_set_vdef (g, vdef); - SSA_NAME_DEF_STMT (vdef) = g; - gimple_set_vuse (g, gimple_vuse (stmt)); + gimple_move_vops (g, stmt); gsi_replace (&i, g, true); continue; } diff --git a/gcc/tree.c b/gcc/tree.c index c4b8eea67..62607c63a 100644 --- a/gcc/tree.c +++ b/gcc/tree.c @@ -1965,6 +1965,23 @@ build_index_vector (tree vec_type, poly_uint64 base, poly_uint64 step) return v.build (); } +/* Return a VECTOR_CST of type VEC_TYPE in which the first NUM_A + elements are A and the rest are B. */ + +tree +build_vector_a_then_b (tree vec_type, unsigned int num_a, tree a, tree b) +{ + gcc_assert (known_le (num_a, TYPE_VECTOR_SUBPARTS (vec_type))); + unsigned int count = constant_lower_bound (TYPE_VECTOR_SUBPARTS (vec_type)); + /* Optimize the constant case. */ + if ((count & 1) == 0 && TYPE_VECTOR_SUBPARTS (vec_type).is_constant ()) + count /= 2; + tree_vector_builder builder (vec_type, count, 2); + for (unsigned int i = 0; i < count * 2; ++i) + builder.quick_push (i < num_a ? a : b); + return builder.build (); +} + /* Something has messed with the elements of CONSTRUCTOR C after it was built; calculate TREE_CONSTANT and TREE_SIDE_EFFECTS. 
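   (Editorial aside on build_vector_a_then_b above, not part of the
   upstream patch: the builder uses two elements per pattern so that the
   trailing B elements stay compactly encoded even for variable-length
   vectors.  A hedged usage sketch, assuming vec_type has int elements:

     tree v = build_vector_a_then_b (vec_type, 2,
                                     integer_one_node, integer_zero_node);

   which would describe { 1, 1, 0, 0, ... } with every remaining lane
   equal to zero.)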
*/ diff --git a/gcc/tree.h b/gcc/tree.h index 6f73593fa..356a9f544 100644 --- a/gcc/tree.h +++ b/gcc/tree.h @@ -2475,10 +2475,10 @@ extern machine_mode vector_type_mode (const_tree); (DECL_COMMON_CHECK (NODE)->decl_common.mode = (MODE)) /* For FUNCTION_DECL, if it is built-in, this identifies which built-in - operation it is. Note, however, that this field is overloaded, with - DECL_BUILT_IN_CLASS as the discriminant, so the latter must always be - checked before any access to the former. */ -#define DECL_FUNCTION_CODE(NODE) \ + operation it is. This is only intended for low-level accesses; + normally DECL_FUNCTION_CODE, DECL_FE_FUNCTION_CODE or DECL_MD_FUNCTION + should be used instead. */ +#define DECL_UNCHECKED_FUNCTION_CODE(NODE) \ (FUNCTION_DECL_CHECK (NODE)->function_decl.function_code) /* Test if FCODE is a function code for an alloca operation. */ @@ -2955,11 +2955,34 @@ extern void decl_fini_priority_insert (tree, priority_type); #define DECL_IS_MALLOC(NODE) \ (FUNCTION_DECL_CHECK (NODE)->function_decl.malloc_flag) +/* Macro for direct set and get of function_decl.decl_type. */ +#define FUNCTION_DECL_DECL_TYPE(NODE) \ + (NODE->function_decl.decl_type) + +/* Set decl_type of a DECL. Set it to T when SET is true, or reset + it to NONE. */ + +static inline void +set_function_decl_type (tree decl, function_decl_type t, bool set) +{ + if (set) + { + gcc_assert (FUNCTION_DECL_DECL_TYPE (decl) == NONE + || FUNCTION_DECL_DECL_TYPE (decl) == t); + decl->function_decl.decl_type = t; + } + else if (FUNCTION_DECL_DECL_TYPE (decl) == t) + FUNCTION_DECL_DECL_TYPE (decl) = NONE; +} + /* Nonzero in a FUNCTION_DECL means this function should be treated as C++ operator new, meaning that it returns a pointer for which we should not use type based aliasing. */ -#define DECL_IS_OPERATOR_NEW(NODE) \ - (FUNCTION_DECL_CHECK (NODE)->function_decl.operator_new_flag) +#define DECL_IS_OPERATOR_NEW_P(NODE) \ + (FUNCTION_DECL_CHECK (NODE)->function_decl.decl_type == OPERATOR_NEW) + +#define DECL_SET_IS_OPERATOR_NEW(NODE, VAL) \ + set_function_decl_type (FUNCTION_DECL_CHECK (NODE), OPERATOR_NEW, VAL) /* Nonzero in a FUNCTION_DECL means this function may return more than once. */ @@ -3066,10 +3089,9 @@ extern vec **decl_debug_args_insert (tree); #define DECL_STRUCT_FUNCTION(NODE) \ (FUNCTION_DECL_CHECK (NODE)->function_decl.f) - /* For a builtin function, identify which part of the compiler defined it. */ #define DECL_BUILT_IN_CLASS(NODE) \ - (FUNCTION_DECL_CHECK (NODE)->function_decl.built_in_class) + ((built_in_class) FUNCTION_DECL_CHECK (NODE)->function_decl.built_in_class) /* In FUNCTION_DECL, a chain of ..._DECL nodes. */ #define DECL_ARGUMENTS(NODE) \ @@ -3104,8 +3126,11 @@ extern vec **decl_debug_args_insert (tree); (FUNCTION_DECL_CHECK (NODE)->decl_with_vis.cxx_destructor) /* In FUNCTION_DECL, this is set if this function is a lambda function. */ -#define DECL_LAMBDA_FUNCTION(NODE) \ - (FUNCTION_DECL_CHECK (NODE)->function_decl.lambda_function) +#define DECL_LAMBDA_FUNCTION_P(NODE) \ + (FUNCTION_DECL_CHECK (NODE)->function_decl.decl_type == LAMBDA_FUNCTION) + +#define DECL_SET_LAMBDA_FUNCTION(NODE, VAL) \ + set_function_decl_type (FUNCTION_DECL_CHECK (NODE), LAMBDA_FUNCTION, VAL) /* In FUNCTION_DECL that represent an virtual method this is set when the method is final. */ @@ -3788,6 +3813,61 @@ valid_vector_subparts_p (poly_uint64 subparts) return true; } +/* Return the built-in function that DECL represents, given that it is known + to be a FUNCTION_DECL with built-in class BUILT_IN_NORMAL. 
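   (Editorial note, not part of the upstream patch: after this change the
   accessor asserts the built-in class instead of relying on callers to
   have checked it.  A hedged usage sketch:

     if (fndecl_built_in_p (fn, BUILT_IN_NORMAL)
         && DECL_FUNCTION_CODE (fn) == BUILT_IN_MEMCPY)
       ...;

   Callers that really need the raw field regardless of class are
   expected to use DECL_UNCHECKED_FUNCTION_CODE instead.)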
*/ +inline built_in_function +DECL_FUNCTION_CODE (const_tree decl) +{ + const tree_function_decl &fndecl = FUNCTION_DECL_CHECK (decl)->function_decl; + gcc_checking_assert (fndecl.built_in_class == BUILT_IN_NORMAL); + return (built_in_function) fndecl.function_code; +} + +/* Return the target-specific built-in function that DECL represents, + given that it is known to be a FUNCTION_DECL with built-in class + BUILT_IN_MD. */ +inline int +DECL_MD_FUNCTION_CODE (const_tree decl) +{ + const tree_function_decl &fndecl = FUNCTION_DECL_CHECK (decl)->function_decl; + gcc_checking_assert (fndecl.built_in_class == BUILT_IN_MD); + return fndecl.function_code; +} + +/* Return the frontend-specific built-in function that DECL represents, + given that it is known to be a FUNCTION_DECL with built-in class + BUILT_IN_FRONTEND. */ +inline int +DECL_FE_FUNCTION_CODE (const_tree decl) +{ + const tree_function_decl &fndecl = FUNCTION_DECL_CHECK (decl)->function_decl; + gcc_checking_assert (fndecl.built_in_class == BUILT_IN_FRONTEND); + return fndecl.function_code; +} + +/* Record that FUNCTION_DECL DECL represents built-in function FCODE of + class FCLASS. */ +inline void +set_decl_built_in_function (tree decl, built_in_class fclass, + unsigned int fcode) +{ + tree_function_decl &fndecl = FUNCTION_DECL_CHECK (decl)->function_decl; + fndecl.built_in_class = fclass; + fndecl.function_code = fcode; +} + +/* Record that FUNCTION_DECL NEWDECL represents the same built-in function + as OLDDECL (or none, if OLDDECL doesn't represent a built-in function). */ +inline void +copy_decl_built_in_function (tree newdecl, const_tree olddecl) +{ + tree_function_decl &newfndecl = FUNCTION_DECL_CHECK (newdecl)->function_decl; + const tree_function_decl &oldfndecl + = FUNCTION_DECL_CHECK (olddecl)->function_decl; + newfndecl.built_in_class = oldfndecl.built_in_class; + newfndecl.function_code = oldfndecl.function_code; +} + /* In NON_LVALUE_EXPR and VIEW_CONVERT_EXPR, set when this node is merely a wrapper added to express a location_t on behalf of the node's child (e.g. by maybe_wrap_with_location). */ @@ -4212,6 +4292,7 @@ extern tree build_vector_from_val (tree, tree); extern tree build_uniform_cst (tree, tree); extern tree build_vec_series (tree, tree, tree); extern tree build_index_vector (tree, poly_uint64, poly_uint64); +extern tree build_vector_a_then_b (tree, unsigned int, tree, tree); extern void recompute_constructor_flags (tree); extern void verify_constructor_flags (tree); extern tree build_constructor (tree, vec *); @@ -5967,9 +6048,10 @@ fndecl_built_in_p (const_tree node, built_in_class klass) of class KLASS with name equal to NAME. 
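   (Editorial note, not part of the upstream patch: NAME is now an
   unsigned int so that the same overload can compare normal, md and
   frontend function codes, which is why the comparison below uses the
   unchecked accessor.  A hedged sketch of registering a built-in with
   the new helpers, where decl is assumed to be some FUNCTION_DECL:

     set_decl_built_in_function (decl, BUILT_IN_NORMAL, BUILT_IN_MEMSET);
     gcc_assert (DECL_FUNCTION_CODE (decl) == BUILT_IN_MEMSET);)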
*/ inline bool -fndecl_built_in_p (const_tree node, int name, built_in_class klass) +fndecl_built_in_p (const_tree node, unsigned int name, built_in_class klass) { - return (fndecl_built_in_p (node, klass) && DECL_FUNCTION_CODE (node) == name); + return (fndecl_built_in_p (node, klass) + && DECL_UNCHECKED_FUNCTION_CODE (node) == name); } /* Return true if a FUNCTION_DECL NODE is a GCC built-in function diff --git a/gcc/var-tracking.c b/gcc/var-tracking.c index 96e0c93a6..982ef13d1 100644 --- a/gcc/var-tracking.c +++ b/gcc/var-tracking.c @@ -116,6 +116,7 @@ #include "rtl-iter.h" #include "fibonacci_heap.h" #include "print-rtl.h" +#include "function-abi.h" typedef fibonacci_heap bb_heap_t; typedef fibonacci_node bb_heap_node_t; @@ -1238,7 +1239,7 @@ adjust_insn (basic_block bb, rtx_insn *insn) amd.stack_adjust = -VTI (bb)->out.stack_adjust; amd.store = true; - note_stores (PATTERN (insn), adjust_mem_stores, &amd); + note_stores (insn, adjust_mem_stores, &amd); amd.store = false; if (GET_CODE (PATTERN (insn)) == PARALLEL @@ -4899,12 +4900,11 @@ dataflow_set_clear_at_call (dataflow_set *set, rtx_insn *call_insn) { unsigned int r; hard_reg_set_iterator hrsi; - HARD_REG_SET invalidated_regs; - get_call_reg_set_usage (call_insn, &invalidated_regs, - regs_invalidated_by_call); + HARD_REG_SET callee_clobbers + = insn_callee_abi (call_insn).full_reg_clobbers (); - EXECUTE_IF_SET_IN_HARD_REG_SET (invalidated_regs, 0, r, hrsi) + EXECUTE_IF_SET_IN_HARD_REG_SET (callee_clobbers, 0, r, hrsi) var_regno_delete (set, r); if (MAY_HAVE_DEBUG_BIND_INSNS) @@ -6292,14 +6292,12 @@ prepare_call_arguments (basic_block bb, rtx_insn *insn) && targetm.calls.struct_value_rtx (type, 0) == 0) { tree struct_addr = build_pointer_type (TREE_TYPE (type)); - machine_mode mode = TYPE_MODE (struct_addr); + function_arg_info arg (struct_addr, /*named=*/true); rtx reg; INIT_CUMULATIVE_ARGS (args_so_far_v, type, NULL_RTX, fndecl, nargs + 1); - reg = targetm.calls.function_arg (args_so_far, mode, - struct_addr, true); - targetm.calls.function_arg_advance (args_so_far, mode, - struct_addr, true); + reg = targetm.calls.function_arg (args_so_far, arg); + targetm.calls.function_arg_advance (args_so_far, arg); if (reg == NULL_RTX) { for (; link; link = XEXP (link, 1)) @@ -6317,11 +6315,9 @@ prepare_call_arguments (basic_block bb, rtx_insn *insn) nargs); if (obj_type_ref && TYPE_ARG_TYPES (type) != void_list_node) { - machine_mode mode; t = TYPE_ARG_TYPES (type); - mode = TYPE_MODE (TREE_VALUE (t)); - this_arg = targetm.calls.function_arg (args_so_far, mode, - TREE_VALUE (t), true); + function_arg_info arg (TREE_VALUE (t), /*named=*/true); + this_arg = targetm.calls.function_arg (args_so_far, arg); if (this_arg && !REG_P (this_arg)) this_arg = NULL_RTX; else if (this_arg == NULL_RTX) @@ -6429,30 +6425,24 @@ prepare_call_arguments (basic_block bb, rtx_insn *insn) } if (t && t != void_list_node) { - tree argtype = TREE_VALUE (t); - machine_mode mode = TYPE_MODE (argtype); rtx reg; - if (pass_by_reference (&args_so_far_v, mode, argtype, true)) - { - argtype = build_pointer_type (argtype); - mode = TYPE_MODE (argtype); - } - reg = targetm.calls.function_arg (args_so_far, mode, - argtype, true); - if (TREE_CODE (argtype) == REFERENCE_TYPE - && INTEGRAL_TYPE_P (TREE_TYPE (argtype)) + function_arg_info arg (TREE_VALUE (t), /*named=*/true); + apply_pass_by_reference_rules (&args_so_far_v, arg); + reg = targetm.calls.function_arg (args_so_far, arg); + if (TREE_CODE (arg.type) == REFERENCE_TYPE + && INTEGRAL_TYPE_P (TREE_TYPE (arg.type)) && reg && 
REG_P (reg) - && GET_MODE (reg) == mode - && (GET_MODE_CLASS (mode) == MODE_INT - || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) + && GET_MODE (reg) == arg.mode + && (GET_MODE_CLASS (arg.mode) == MODE_INT + || GET_MODE_CLASS (arg.mode) == MODE_PARTIAL_INT) && REG_P (x) && REGNO (x) == REGNO (reg) - && GET_MODE (x) == mode + && GET_MODE (x) == arg.mode && item) { machine_mode indmode - = TYPE_MODE (TREE_TYPE (argtype)); + = TYPE_MODE (TREE_TYPE (arg.type)); rtx mem = gen_rtx_MEM (indmode, x); cselib_val *val = cselib_lookup (mem, indmode, 0, VOIDmode); if (val && cselib_preserved_value_p (val)) @@ -6492,8 +6482,7 @@ prepare_call_arguments (basic_block bb, rtx_insn *insn) } } } - targetm.calls.function_arg_advance (args_so_far, mode, - argtype, true); + targetm.calls.function_arg_advance (args_so_far, arg); t = TREE_CHAIN (t); } } @@ -6642,7 +6631,7 @@ add_with_sets (rtx_insn *insn, struct cselib_set *sets, int n_sets) insert notes before it without worrying about any notes that MO_USEs might emit after the insn. */ cui.store_p = true; - note_stores (PATTERN (insn), add_stores, &cui); + note_stores (insn, add_stores, &cui); n2 = VTI (bb)->mos.length () - 1; mos = VTI (bb)->mos.address (); diff --git a/gcc/vector-builder.h b/gcc/vector-builder.h index 9967daa6e..37911ac69 100644 --- a/gcc/vector-builder.h +++ b/gcc/vector-builder.h @@ -45,8 +45,11 @@ along with GCC; see the file COPYING3. If not see variable-length vectors. finalize () then canonicalizes the encoding to a simpler form if possible. - The derived class Derived provides this functionality for specific Ts. - Derived needs to provide the following interface: + Shape is the type that specifies the number of elements in the vector + and (where relevant) the type of each element. + + The derived class Derived provides the functionality of this class + for specific Ts. Derived needs to provide the following interface: bool equal_p (T elt1, T elt2) const; @@ -82,9 +85,30 @@ along with GCC; see the file COPYING3. If not see Record that ELT2 is being elided, given that ELT1_PTR points to the last encoded element for the containing pattern. This is - again provided for TREE_OVERFLOW handling. */ + again provided for TREE_OVERFLOW handling. + + static poly_uint64 shape_nelts (Shape shape); + + Return the number of elements in SHAPE. + + The class provides additional functionality for the case in which + T can describe a vector constant as well as an individual element. + This functionality requires: + + static poly_uint64 nelts_of (T x); + + Return the number of elements in vector constant X. + + static unsigned int npatterns_of (T x); -template + Return the number of patterns used to encode vector constant X. + + static unsigned int nelts_per_pattern_of (T x); + + Return the number of elements used to encode each pattern + in vector constant X. 
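   (Editorial note, not part of the upstream patch: as a reminder of the
   encoding these hooks expose, a constant such as

     { 0, 8, 1, 9, 2, 10, 3, 11, ... }

   interleaves two linear series and so can be encoded with
   npatterns == 2 and nelts_per_pattern == 3, storing only
   { 0, 8, 1, 9, 2, 10 } explicitly.)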
*/ + +template class vector_builder : public auto_vec { public: @@ -96,12 +120,18 @@ public: unsigned int encoded_nelts () const; bool encoded_full_vector_p () const; T elt (unsigned int) const; + unsigned int count_dups (int, int, int) const; bool operator == (const Derived &) const; bool operator != (const Derived &x) const { return !operator == (x); } + bool new_unary_operation (Shape, T, bool); + bool new_binary_operation (Shape, T, T, bool); + void finalize (); + static unsigned int binary_encoded_nelts (T, T); + protected: void new_vector (poly_uint64, unsigned int, unsigned int); void reshape (unsigned int, unsigned int); @@ -120,16 +150,16 @@ private: unsigned int m_nelts_per_pattern; }; -template +template inline const Derived * -vector_builder::derived () const +vector_builder::derived () const { return static_cast (this); } -template +template inline -vector_builder::vector_builder () +vector_builder::vector_builder () : m_full_nelts (0), m_npatterns (0), m_nelts_per_pattern (0) @@ -139,18 +169,18 @@ vector_builder::vector_builder () starts with these explicitly-encoded elements and may contain additional elided elements. */ -template +template inline unsigned int -vector_builder::encoded_nelts () const +vector_builder::encoded_nelts () const { return m_npatterns * m_nelts_per_pattern; } /* Return true if every element of the vector is explicitly encoded. */ -template +template inline bool -vector_builder::encoded_full_vector_p () const +vector_builder::encoded_full_vector_p () const { return known_eq (m_npatterns * m_nelts_per_pattern, m_full_nelts); } @@ -158,11 +188,11 @@ vector_builder::encoded_full_vector_p () const /* Start building a vector that has FULL_NELTS elements. Initially encode it using NPATTERNS patterns with NELTS_PER_PATTERN each. */ -template +template void -vector_builder::new_vector (poly_uint64 full_nelts, - unsigned int npatterns, - unsigned int nelts_per_pattern) +vector_builder::new_vector (poly_uint64 full_nelts, + unsigned int npatterns, + unsigned int nelts_per_pattern) { m_full_nelts = full_nelts; m_npatterns = npatterns; @@ -174,9 +204,9 @@ vector_builder::new_vector (poly_uint64 full_nelts, /* Return true if this vector and OTHER have the same elements and are encoded in the same way. */ -template +template bool -vector_builder::operator == (const Derived &other) const +vector_builder::operator == (const Derived &other) const { if (maybe_ne (m_full_nelts, other.m_full_nelts) || m_npatterns != other.m_npatterns @@ -194,18 +224,19 @@ vector_builder::operator == (const Derived &other) const /* Return the value of vector element I, which might or might not be encoded explicitly. */ -template +template T -vector_builder::elt (unsigned int i) const +vector_builder::elt (unsigned int i) const { - /* This only makes sense if the encoding has been fully populated. */ - gcc_checking_assert (encoded_nelts () <= this->length ()); - /* First handle elements that are already present in the underlying vector, regardless of whether they're part of the encoding or not. */ if (i < this->length ()) return (*this)[i]; + /* Extrapolation is only possible if the encoding has been fully + populated. */ + gcc_checking_assert (encoded_nelts () <= this->length ()); + /* Identify the pattern that contains element I and work out the index of the last encoded element for that pattern. 
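   (Editorial note, not part of the upstream patch, a worked example of
   the extrapolation below: with npatterns == 2, nelts_per_pattern == 3
   and encoded elements { 0, 8, 1, 9, 2, 10 }, a request for elt (9)
   selects pattern 9 % 2 == 1, whose last two encoded elements are 9 and
   10, giving a step of 1 and a result of 10 + (4 - 2) * 1 == 12.)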
*/ unsigned int pattern = i % m_npatterns; @@ -223,13 +254,136 @@ vector_builder::elt (unsigned int i) const derived ()->step (prev, final)); } +/* Try to start building a new vector of shape SHAPE that holds the result of + a unary operation on vector constant VEC. ALLOW_STEPPED_P is true if the + operation can handle stepped encodings directly, without having to expand + the full sequence. + + Return true if the operation is possible, which it always is when + ALLOW_STEPPED_P is true. Leave the builder unchanged otherwise. */ + +template +bool +vector_builder::new_unary_operation (Shape shape, T vec, + bool allow_stepped_p) +{ + poly_uint64 full_nelts = Derived::shape_nelts (shape); + gcc_assert (known_eq (full_nelts, Derived::nelts_of (vec))); + unsigned int npatterns = Derived::npatterns_of (vec); + unsigned int nelts_per_pattern = Derived::nelts_per_pattern_of (vec); + if (!allow_stepped_p && nelts_per_pattern > 2) + { + if (!full_nelts.is_constant ()) + return false; + npatterns = full_nelts.to_constant (); + nelts_per_pattern = 1; + } + derived ()->new_vector (shape, npatterns, nelts_per_pattern); + return true; +} + +/* Try to start building a new vector of shape SHAPE that holds the result of + a binary operation on vector constants VEC1 and VEC2. ALLOW_STEPPED_P is + true if the operation can handle stepped encodings directly, without + having to expand the full sequence. + + Return true if the operation is possible. Leave the builder unchanged + otherwise. */ + +template +bool +vector_builder::new_binary_operation (Shape shape, + T vec1, T vec2, + bool allow_stepped_p) +{ + poly_uint64 full_nelts = Derived::shape_nelts (shape); + gcc_assert (known_eq (full_nelts, Derived::nelts_of (vec1)) + && known_eq (full_nelts, Derived::nelts_of (vec2))); + /* Conceptually we split the patterns in VEC1 and VEC2 until we have + an equal number for both. Each split pattern requires the same + number of elements per pattern as the original. E.g. splitting: + + { 1, 2, 3, ... } + + into two gives: + + { 1, 3, 5, ... } + { 2, 4, 6, ... } + + while splitting: + + { 1, 0, ... } + + into two gives: + + { 1, 0, ... } + { 0, 0, ... }. */ + unsigned int npatterns + = least_common_multiple (Derived::npatterns_of (vec1), + Derived::npatterns_of (vec2)); + unsigned int nelts_per_pattern + = MAX (Derived::nelts_per_pattern_of (vec1), + Derived::nelts_per_pattern_of (vec2)); + if (!allow_stepped_p && nelts_per_pattern > 2) + { + if (!full_nelts.is_constant ()) + return false; + npatterns = full_nelts.to_constant (); + nelts_per_pattern = 1; + } + derived ()->new_vector (shape, npatterns, nelts_per_pattern); + return true; +} + +/* Return the number of elements that the caller needs to operate on in + order to handle a binary operation on vector constants VEC1 and VEC2. + This static function is used instead of new_binary_operation if the + result of the operation is not a constant vector. */ + +template +unsigned int +vector_builder::binary_encoded_nelts (T vec1, T vec2) +{ + poly_uint64 nelts = Derived::nelts_of (vec1); + gcc_assert (known_eq (nelts, Derived::nelts_of (vec2))); + /* See new_binary_operation for details. 
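   (Editorial note, not part of the upstream patch, a hedged worked
   example: for operands encoded with 2 and 3 patterns respectively, and
   with 2 and 1 elements per pattern, the code below returns
   least_common_multiple (2, 3) * MAX (2, 1) == 12 encoded elements,
   capped at the number of elements in the vector when that number is
   constant.)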
*/ + unsigned int npatterns + = least_common_multiple (Derived::npatterns_of (vec1), + Derived::npatterns_of (vec2)); + unsigned int nelts_per_pattern + = MAX (Derived::nelts_per_pattern_of (vec1), + Derived::nelts_per_pattern_of (vec2)); + unsigned HOST_WIDE_INT const_nelts; + if (nelts.is_constant (&const_nelts)) + return MIN (npatterns * nelts_per_pattern, const_nelts); + return npatterns * nelts_per_pattern; +} + +/* Return the number of leading duplicate elements in the range + [START:END:STEP]. The value is always at least 1. */ + +template +unsigned int +vector_builder::count_dups (int start, int end, + int step) const +{ + gcc_assert ((end - start) % step == 0); + + unsigned int ndups = 1; + for (int i = start + step; + i != end && derived ()->equal_p (elt (i), elt (start)); + i += step) + ndups++; + return ndups; +} + /* Change the encoding to NPATTERNS patterns of NELTS_PER_PATTERN each, but without changing the underlying vector. */ -template +template void -vector_builder::reshape (unsigned int npatterns, - unsigned int nelts_per_pattern) +vector_builder::reshape (unsigned int npatterns, + unsigned int nelts_per_pattern) { unsigned int old_encoded_nelts = encoded_nelts (); unsigned int new_encoded_nelts = npatterns * nelts_per_pattern; @@ -249,11 +403,11 @@ vector_builder::reshape (unsigned int npatterns, /* Return true if elements [START, END) contain a repeating sequence of STEP elements. */ -template +template bool -vector_builder::repeating_sequence_p (unsigned int start, - unsigned int end, - unsigned int step) +vector_builder::repeating_sequence_p (unsigned int start, + unsigned int end, + unsigned int step) { for (unsigned int i = start; i < end - step; ++i) if (!derived ()->equal_p ((*this)[i], (*this)[i + step])) @@ -264,11 +418,11 @@ vector_builder::repeating_sequence_p (unsigned int start, /* Return true if elements [START, END) contain STEP interleaved linear series. */ -template +template bool -vector_builder::stepped_sequence_p (unsigned int start, - unsigned int end, - unsigned int step) +vector_builder::stepped_sequence_p (unsigned int start, + unsigned int end, + unsigned int step) { if (!derived ()->allow_steps_p ()) return false; @@ -297,9 +451,9 @@ vector_builder::stepped_sequence_p (unsigned int start, /* Try to change the number of encoded patterns to NPATTERNS, returning true on success. */ -template +template bool -vector_builder::try_npatterns (unsigned int npatterns) +vector_builder::try_npatterns (unsigned int npatterns) { if (m_nelts_per_pattern == 1) { @@ -350,9 +504,9 @@ vector_builder::try_npatterns (unsigned int npatterns) /* Replace the current encoding with the canonical form. */ -template +template void -vector_builder::finalize () +vector_builder::finalize () { /* The encoding requires the same number of elements to come from each pattern. 
*/ diff --git a/libgcc/config.host b/libgcc/config.host index 0f15fda36..9500ec2ee 100644 --- a/libgcc/config.host +++ b/libgcc/config.host @@ -356,6 +356,12 @@ aarch64*-*-freebsd*) tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm" md_unwind_header=aarch64/freebsd-unwind.h ;; +aarch64*-*-netbsd*) + extra_parts="$extra_parts crtfastmath.o" + tmake_file="${tmake_file} ${cpu_type}/t-aarch64" + tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm" + md_unwind_header=aarch64/aarch64-unwind.h + ;; aarch64*-*-fuchsia*) tmake_file="${tmake_file} ${cpu_type}/t-aarch64" tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp" diff --git a/libgcc/config/aarch64/aarch64-unwind.h b/libgcc/config/aarch64/aarch64-unwind.h index 223ac9157..13e6e4a6a 100644 --- a/libgcc/config/aarch64/aarch64-unwind.h +++ b/libgcc/config/aarch64/aarch64-unwind.h @@ -35,6 +35,23 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define MD_FROB_UPDATE_CONTEXT(context, fs) \ aarch64_frob_update_context (context, fs) +static inline int +aarch64_cie_signed_with_b_key (struct _Unwind_Context *context) +{ + const struct dwarf_fde *fde = _Unwind_Find_FDE (context->bases.func, + &context->bases); + if (fde != NULL) + { + const struct dwarf_cie *cie = get_cie (fde); + if (cie != NULL) + { + char *aug_str = cie->augmentation; + return strchr (aug_str, 'B') == NULL ? 0 : 1; + } + } + return 0; +} + /* Do AArch64 private extraction on ADDR based on context info CONTEXT and unwind frame info FS. If ADDR is signed, we do address authentication on it using CFA of current frame. */ @@ -43,9 +60,11 @@ static inline void * aarch64_post_extract_frame_addr (struct _Unwind_Context *context, _Unwind_FrameState *fs, void *addr) { - if (fs->regs.reg[DWARF_REGNUM_AARCH64_RA_STATE].loc.offset & 0x1) + if (context->flags & RA_SIGNED_BIT) { _Unwind_Word salt = (_Unwind_Word) context->cfa; + if (aarch64_cie_signed_with_b_key (context) != 0) + return __builtin_aarch64_autib1716 (addr, salt); return __builtin_aarch64_autia1716 (addr, salt); } else @@ -62,9 +81,14 @@ aarch64_post_frob_eh_handler_addr (struct _Unwind_Context *current, ATTRIBUTE_UNUSED, void *handler_addr) { - if (current->flags & RA_A_SIGNED_BIT) - return __builtin_aarch64_pacia1716 (handler_addr, + if (current->flags & RA_SIGNED_BIT) + { + if (aarch64_cie_signed_with_b_key (current)) + return __builtin_aarch64_pacib1716 (handler_addr, + (_Unwind_Word) current->cfa); + return __builtin_aarch64_pacia1716 (handler_addr, (_Unwind_Word) current->cfa); + } else return handler_addr; } @@ -79,7 +103,7 @@ aarch64_frob_update_context (struct _Unwind_Context *context, { if (fs->regs.reg[DWARF_REGNUM_AARCH64_RA_STATE].loc.offset & 0x1) /* The flag is used for re-authenticating EH handler's address. */ - context->flags |= RA_A_SIGNED_BIT; + context->flags |= RA_SIGNED_BIT; return; } diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c index 24b4ecee6..40ebf85a9 100644 --- a/libgcc/unwind-dw2-fde.c +++ b/libgcc/unwind-dw2-fde.c @@ -334,6 +334,9 @@ get_cie_encoding (const struct dwarf_cie *cie) /* LSDA encoding. */ else if (*aug == 'L') p++; + /* aarch64 b-key pointer authentication. */ + else if (*aug == 'B') + p++; /* Otherwise end of string, or unknown augmentation. 
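   (Editorial note, not part of the upstream patch: the 'B' augmentation
   handled just above is what the AArch64 compiler is expected to emit in
   a CIE when return addresses are signed with the B key, e.g. under
   something like -mbranch-protection=pac-ret+b-key; the unwinder then
   picks the AUTIB/PACIB variants in aarch64-unwind.h.  The option
   spelling is given as an illustration rather than taken from this
   patch.)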
*/ else return DW_EH_PE_absptr; diff --git a/libgcc/unwind-dw2.c b/libgcc/unwind-dw2.c index e6130af2f..e76a1cbc4 100644 --- a/libgcc/unwind-dw2.c +++ b/libgcc/unwind-dw2.c @@ -136,8 +136,9 @@ struct _Unwind_Context #define SIGNAL_FRAME_BIT ((~(_Unwind_Word) 0 >> 1) + 1) /* Context which has version/args_size/by_value fields. */ #define EXTENDED_CONTEXT_BIT ((~(_Unwind_Word) 0 >> 2) + 1) - /* Bit reserved on AArch64, return address has been signed with A key. */ -#define RA_A_SIGNED_BIT ((~(_Unwind_Word) 0 >> 3) + 1) + /* Bit reserved on AArch64, return address has been signed with A or B + key. */ +#define RA_SIGNED_BIT ((~(_Unwind_Word) 0 >> 3) + 1) _Unwind_Word flags; /* 0 for now, can be increased when further fields are added to struct _Unwind_Context. */ @@ -502,6 +503,11 @@ extract_cie_info (const struct dwarf_cie *cie, struct _Unwind_Context *context, fs->signal_frame = 1; aug += 1; } + /* aarch64 B-key pointer authentication. */ + else if (aug[0] == 'B') + { + aug += 1; + } /* Otherwise we have an unknown augmentation string. Bail unless we saw a 'z' prefix. */